/*
 * Copyright 2016 Advanced Micro Devices, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

#include "ac_exp_param.h"
#include "ac_nir_to_llvm.h"
#include "ac_rtld.h"
#include "si_pipe.h"
#include "si_shader_internal.h"
#include "sid.h"
#include "tgsi/tgsi_from_mesa.h"
#include "util/u_memory.h"

struct si_llvm_diagnostics {
   struct pipe_debug_callback *debug;
   unsigned retval;
};

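/* LLVM diagnostic callback: forward warnings and errors to the pipe debug
 * callback and record errors in retval so the caller can fail the compile.
 * Remarks and notes are dropped.
 */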
static void si_diagnostic_handler(LLVMDiagnosticInfoRef di, void *context)
{
   struct si_llvm_diagnostics *diag = (struct si_llvm_diagnostics *)context;
   LLVMDiagnosticSeverity severity = LLVMGetDiagInfoSeverity(di);
   const char *severity_str = NULL;

   switch (severity) {
   case LLVMDSError:
      severity_str = "error";
      break;
   case LLVMDSWarning:
      severity_str = "warning";
      break;
   case LLVMDSRemark:
   case LLVMDSNote:
   default:
      return;
   }

   char *description = LLVMGetDiagInfoDescription(di);

   pipe_debug_message(diag->debug, SHADER_INFO, "LLVM diagnostic (%s): %s", severity_str,
                      description);

   if (severity == LLVMDSError) {
      diag->retval = 1;
      fprintf(stderr, "LLVM triggered Diagnostic Handler: %s\n", description);
   }

   LLVMDisposeMessage(description);
}

bool si_compile_llvm(struct si_screen *sscreen, struct si_shader_binary *binary,
                     struct ac_shader_config *conf, struct ac_llvm_compiler *compiler,
                     struct ac_llvm_context *ac, struct pipe_debug_callback *debug,
                     gl_shader_stage stage, const char *name, bool less_optimized)
{
   unsigned count = p_atomic_inc_return(&sscreen->num_compilations);

   if (si_can_dump_shader(sscreen, stage)) {
      fprintf(stderr, "radeonsi: Compiling shader %d\n", count);

      if (!(sscreen->debug_flags & (DBG(NO_IR) | DBG(PREOPT_IR)))) {
         fprintf(stderr, "%s LLVM IR:\n\n", name);
         ac_dump_module(ac->module);
         fprintf(stderr, "\n");
      }
   }

   if (sscreen->record_llvm_ir) {
      char *ir = LLVMPrintModuleToString(ac->module);
      binary->llvm_ir_string = strdup(ir);
      LLVMDisposeMessage(ir);
   }

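   /* si_replace_shader may substitute a shader binary loaded from disk
    * (a debug feature); LLVM compilation is skipped when a replacement
    * was provided.
    */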
   if (!si_replace_shader(count, binary)) {
      struct ac_compiler_passes *passes = compiler->passes;

      if (less_optimized && compiler->low_opt_passes)
         passes = compiler->low_opt_passes;

      struct si_llvm_diagnostics diag = {debug};
      LLVMContextSetDiagnosticHandler(ac->context, si_diagnostic_handler, &diag);

      if (!ac_compile_module_to_elf(passes, ac->module, (char **)&binary->elf_buffer,
                                    &binary->elf_size))
         diag.retval = 1;

      if (diag.retval != 0) {
         pipe_debug_message(debug, SHADER_INFO, "LLVM compilation failed");
         return false;
      }
   }

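   /* Run the ELF through the runtime linker just to read back the shader
    * config (register setup, scratch usage, etc.).
    */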
   struct ac_rtld_binary rtld;
   if (!ac_rtld_open(&rtld, (struct ac_rtld_open_info){
                               .info = &sscreen->info,
                               .shader_type = stage,
                               .wave_size = ac->wave_size,
                               .num_parts = 1,
                               .elf_ptrs = &binary->elf_buffer,
                               .elf_sizes = &binary->elf_size}))
      return false;

   bool ok = ac_rtld_read_config(&sscreen->info, &rtld, conf);
   ac_rtld_close(&rtld);
   return ok;
}

void si_llvm_context_init(struct si_shader_context *ctx, struct si_screen *sscreen,
                          struct ac_llvm_compiler *compiler, unsigned wave_size)
{
   memset(ctx, 0, sizeof(*ctx));
   ctx->screen = sscreen;
   ctx->compiler = compiler;

   ac_llvm_context_init(&ctx->ac, compiler, sscreen->info.chip_class, sscreen->info.family,
                        &sscreen->info, AC_FLOAT_MODE_DEFAULT_OPENGL, wave_size, 64);
}

void si_llvm_create_func(struct si_shader_context *ctx, const char *name, LLVMTypeRef *return_types,
                         unsigned num_return_elems, unsigned max_workgroup_size)
{
   LLVMTypeRef ret_type;
   enum ac_llvm_calling_convention call_conv;

   if (num_return_elems)
      ret_type = LLVMStructTypeInContext(ctx->ac.context, return_types, num_return_elems, true);
   else
      ret_type = ctx->ac.voidt;

   gl_shader_stage real_stage = ctx->stage;

   /* LS is merged into HS (TCS), and ES is merged into GS. */
   if (ctx->screen->info.chip_class >= GFX9) {
      if (ctx->shader->key.as_ls)
         real_stage = MESA_SHADER_TESS_CTRL;
      else if (ctx->shader->key.as_es || ctx->shader->key.as_ngg)
         real_stage = MESA_SHADER_GEOMETRY;
   }

   switch (real_stage) {
   case MESA_SHADER_VERTEX:
   case MESA_SHADER_TESS_EVAL:
      call_conv = AC_LLVM_AMDGPU_VS;
      break;
   case MESA_SHADER_TESS_CTRL:
      call_conv = AC_LLVM_AMDGPU_HS;
      break;
   case MESA_SHADER_GEOMETRY:
      call_conv = AC_LLVM_AMDGPU_GS;
      break;
   case MESA_SHADER_FRAGMENT:
      call_conv = AC_LLVM_AMDGPU_PS;
      break;
   case MESA_SHADER_COMPUTE:
      call_conv = AC_LLVM_AMDGPU_CS;
      break;
   default:
      unreachable("Unhandled shader type");
   }

   /* Set up the function. */
   ctx->return_type = ret_type;
   ctx->main_fn = ac_build_main(&ctx->args, &ctx->ac, call_conv, name, ret_type, ctx->ac.module);
   ctx->return_value = LLVMGetUndef(ctx->return_type);

   if (ctx->screen->info.address32_hi) {
      ac_llvm_add_target_dep_function_attr(ctx->main_fn, "amdgpu-32bit-address-high-bits",
                                           ctx->screen->info.address32_hi);
   }

   ac_llvm_set_workgroup_size(ctx->main_fn, max_workgroup_size);
   ac_llvm_set_target_features(ctx->main_fn, &ctx->ac);
}

void si_llvm_create_main_func(struct si_shader_context *ctx, bool ngg_cull_shader)
{
   struct si_shader *shader = ctx->shader;
   LLVMTypeRef returns[AC_MAX_ARGS];
   unsigned i;

   si_init_shader_args(ctx, ngg_cull_shader);

   for (i = 0; i < ctx->args.num_sgprs_returned; i++)
      returns[i] = ctx->ac.i32; /* SGPR */
   for (; i < ctx->args.return_count; i++)
      returns[i] = ctx->ac.f32; /* VGPR */

   si_llvm_create_func(ctx, ngg_cull_shader ? "ngg_cull_main" : "main", returns,
                       ctx->args.return_count, si_get_max_workgroup_size(shader));

   /* Reserve register locations for VGPR inputs the PS prolog may need. */
   if (ctx->stage == MESA_SHADER_FRAGMENT && !ctx->shader->is_monolithic) {
      ac_llvm_add_target_dep_function_attr(
         ctx->main_fn, "InitialPSInputAddr",
         S_0286D0_PERSP_SAMPLE_ENA(1) | S_0286D0_PERSP_CENTER_ENA(1) |
            S_0286D0_PERSP_CENTROID_ENA(1) | S_0286D0_LINEAR_SAMPLE_ENA(1) |
            S_0286D0_LINEAR_CENTER_ENA(1) | S_0286D0_LINEAR_CENTROID_ENA(1) |
            S_0286D0_FRONT_FACE_ENA(1) | S_0286D0_ANCILLARY_ENA(1) | S_0286D0_POS_FIXED_PT_ENA(1));
   }

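   /* VS-as-LS and TCS exchange data through LDS (the LS writes vertex
    * outputs that the TCS reads), so both declare their LDS usage here.
    */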
   if (shader->key.as_ls || ctx->stage == MESA_SHADER_TESS_CTRL) {
      if (USE_LDS_SYMBOLS) {
         /* The LSHS size is not known until draw time, so we append it
          * at the end of whatever LDS use there may be in the rest of
          * the shader (currently none, unless LLVM decides to do its
          * own LDS-based lowering).
          */
         ctx->ac.lds = LLVMAddGlobalInAddressSpace(ctx->ac.module, LLVMArrayType(ctx->ac.i32, 0),
                                                   "__lds_end", AC_ADDR_SPACE_LDS);
         LLVMSetAlignment(ctx->ac.lds, 256);
      } else {
         ac_declare_lds_as_pointer(&ctx->ac);
      }
   }

   /* Unlike radv, we override these arguments in the prolog, so to the
    * API shader they appear as normal arguments.
    */
   if (ctx->stage == MESA_SHADER_VERTEX) {
      ctx->abi.vertex_id = ac_get_arg(&ctx->ac, ctx->args.vertex_id);
      ctx->abi.instance_id = ac_get_arg(&ctx->ac, ctx->args.instance_id);
   } else if (ctx->stage == MESA_SHADER_FRAGMENT) {
      ctx->abi.persp_centroid = ac_get_arg(&ctx->ac, ctx->args.persp_centroid);
      ctx->abi.linear_centroid = ac_get_arg(&ctx->ac, ctx->args.linear_centroid);
   }
}

void si_llvm_optimize_module(struct si_shader_context *ctx)
{
   /* Dump LLVM IR before any optimization passes */
   if (ctx->screen->debug_flags & DBG(PREOPT_IR) && si_can_dump_shader(ctx->screen, ctx->stage))
      LLVMDumpModule(ctx->ac.module);

   /* Run the pass */
   LLVMRunPassManager(ctx->compiler->passmgr, ctx->ac.module);
   LLVMDisposeBuilder(ctx->ac.builder);
}

void si_llvm_dispose(struct si_shader_context *ctx)
{
   LLVMDisposeModule(ctx->ac.module);
   LLVMContextDispose(ctx->ac.context);
   ac_llvm_context_dispose(&ctx->ac);
}

/**
 * Load a dword from a constant buffer.
 */
LLVMValueRef si_buffer_load_const(struct si_shader_context *ctx, LLVMValueRef resource,
                                  LLVMValueRef offset)
{
   return ac_build_buffer_load(&ctx->ac, resource, 1, NULL, offset, NULL, 0, ctx->ac.f32,
                               0, true, true);
}

void si_llvm_build_ret(struct si_shader_context *ctx, LLVMValueRef ret)
{
   if (LLVMGetTypeKind(LLVMTypeOf(ret)) == LLVMVoidTypeKind)
      LLVMBuildRetVoid(ctx->ac.builder);
   else
      LLVMBuildRet(ctx->ac.builder, ret);
}

LLVMValueRef si_insert_input_ret(struct si_shader_context *ctx, LLVMValueRef ret,
                                 struct ac_arg param, unsigned return_index)
{
   return LLVMBuildInsertValue(ctx->ac.builder, ret, ac_get_arg(&ctx->ac, param), return_index, "");
}

LLVMValueRef si_insert_input_ret_float(struct si_shader_context *ctx, LLVMValueRef ret,
                                       struct ac_arg param, unsigned return_index)
{
   LLVMBuilderRef builder = ctx->ac.builder;
   LLVMValueRef p = ac_get_arg(&ctx->ac, param);

   return LLVMBuildInsertValue(builder, ret, ac_to_float(&ctx->ac, p), return_index, "");
}

LLVMValueRef si_insert_input_ptr(struct si_shader_context *ctx, LLVMValueRef ret,
                                 struct ac_arg param, unsigned return_index)
{
   LLVMBuilderRef builder = ctx->ac.builder;
   LLVMValueRef ptr = ac_get_arg(&ctx->ac, param);
   ptr = LLVMBuildPtrToInt(builder, ptr, ctx->ac.i32, "");
   return LLVMBuildInsertValue(builder, ret, ptr, return_index, "");
}

LLVMValueRef si_prolog_get_internal_bindings(struct si_shader_context *ctx)
{
   LLVMValueRef ptr[2], list;
   bool merged_shader = si_is_merged_shader(ctx->shader);

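   /* On GFX9+ merged shaders, the first 8 input SGPRs are system values,
    * so user SGPRs such as the internal-bindings pointer start at offset 8.
    */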
   ptr[0] = LLVMGetParam(ctx->main_fn, (merged_shader ? 8 : 0) + SI_SGPR_INTERNAL_BINDINGS);
   list =
      LLVMBuildIntToPtr(ctx->ac.builder, ptr[0], ac_array_in_const32_addr_space(ctx->ac.v4i32), "");
   return list;
}

void si_llvm_emit_barrier(struct si_shader_context *ctx)
{
   /* GFX6 only (thanks to a hw bug workaround):
    * The real barrier instruction isn't needed, because an entire patch
    * always fits into a single wave.
    */
   if (ctx->screen->info.chip_class == GFX6 && ctx->stage == MESA_SHADER_TESS_CTRL) {
      ac_build_waitcnt(&ctx->ac, AC_WAIT_LGKM | AC_WAIT_VLOAD | AC_WAIT_VSTORE);
      return;
   }

   ac_build_s_barrier(&ctx->ac);
}

/* Ensure that the esgs ring is declared.
 *
 * We declare it with 64KB alignment as a hint that the
 * pointer value will always be 0.
 */
void si_llvm_declare_esgs_ring(struct si_shader_context *ctx)
{
   if (ctx->esgs_ring)
      return;

   assert(!LLVMGetNamedGlobal(ctx->ac.module, "esgs_ring"));

   ctx->esgs_ring = LLVMAddGlobalInAddressSpace(ctx->ac.module, LLVMArrayType(ctx->ac.i32, 0),
                                                "esgs_ring", AC_ADDR_SPACE_LDS);
   LLVMSetLinkage(ctx->esgs_ring, LLVMExternalLinkage);
   LLVMSetAlignment(ctx->esgs_ring, 64 * 1024);
}

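/* Enable only the leading threads of the wave: the
 * llvm.amdgcn.init.exec.from.input intrinsic reads the thread count from a
 * 7-bit field at the given bit offset of the input SGPR (matching the 0x7f
 * masks used by the wrapper function below) and sets EXEC accordingly.
 */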
static void si_init_exec_from_input(struct si_shader_context *ctx, struct ac_arg param,
                                    unsigned bitoffset)
{
   LLVMValueRef args[] = {
      ac_get_arg(&ctx->ac, param),
      LLVMConstInt(ctx->ac.i32, bitoffset, 0),
   };
   ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.init.exec.from.input", ctx->ac.voidt, args, 2,
                      AC_FUNC_ATTR_CONVERGENT);
}

/**
 * Get the value of a shader input parameter and extract a bitfield.
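 *
 * For example, rshift=20 and bitwidth=10 yield (value >> 20) & 0x3ff.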
 */
static LLVMValueRef unpack_llvm_param(struct si_shader_context *ctx, LLVMValueRef value,
                                      unsigned rshift, unsigned bitwidth)
{
   if (LLVMGetTypeKind(LLVMTypeOf(value)) == LLVMFloatTypeKind)
      value = ac_to_integer(&ctx->ac, value);

   if (rshift)
      value = LLVMBuildLShr(ctx->ac.builder, value, LLVMConstInt(ctx->ac.i32, rshift, 0), "");

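   /* When the field extends through bit 31, the right shift alone already
    * isolates it, so no mask is needed.
    */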
   if (rshift + bitwidth < 32) {
      unsigned mask = (1 << bitwidth) - 1;
      value = LLVMBuildAnd(ctx->ac.builder, value, LLVMConstInt(ctx->ac.i32, mask, 0), "");
   }

   return value;
}

LLVMValueRef si_unpack_param(struct si_shader_context *ctx, struct ac_arg param, unsigned rshift,
                             unsigned bitwidth)
{
   LLVMValueRef value = ac_get_arg(&ctx->ac, param);

   return unpack_llvm_param(ctx, value, rshift, bitwidth);
}

LLVMValueRef si_get_primitive_id(struct si_shader_context *ctx, unsigned swizzle)
{
   if (swizzle > 0)
      return ctx->ac.i32_0;

   switch (ctx->stage) {
   case MESA_SHADER_VERTEX:
      return ac_get_arg(&ctx->ac, ctx->args.vs_prim_id);
   case MESA_SHADER_TESS_CTRL:
      return ac_get_arg(&ctx->ac, ctx->args.tcs_patch_id);
   case MESA_SHADER_TESS_EVAL:
      return ac_get_arg(&ctx->ac, ctx->args.tes_patch_id);
   case MESA_SHADER_GEOMETRY:
      return ac_get_arg(&ctx->ac, ctx->args.gs_prim_id);
   default:
      assert(0);
      return ctx->ac.i32_0;
   }
}

static LLVMValueRef si_llvm_get_block_size(struct ac_shader_abi *abi)
{
   struct si_shader_context *ctx = si_shader_context_from_abi(abi);

   assert(ctx->shader->selector->info.base.workgroup_size_variable &&
          ctx->shader->selector->info.uses_variable_block_size);

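   /* The variable block size is packed into one SGPR as three 10-bit
    * components: x | (y << 10) | (z << 20).
    */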
   LLVMValueRef chan[3] = {
      si_unpack_param(ctx, ctx->block_size, 0, 10),
      si_unpack_param(ctx, ctx->block_size, 10, 10),
      si_unpack_param(ctx, ctx->block_size, 20, 10),
   };
   return ac_build_gather_values(&ctx->ac, chan, 3);
}

static void si_llvm_declare_compute_memory(struct si_shader_context *ctx)
{
   struct si_shader_selector *sel = ctx->shader->selector;
   unsigned lds_size = sel->info.base.shared_size;

   LLVMTypeRef i8p = LLVMPointerType(ctx->ac.i8, AC_ADDR_SPACE_LDS);
   LLVMValueRef var;

   assert(!ctx->ac.lds);

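   /* As with the ESGS ring above, the 64KB alignment hints to LLVM that the
    * base pointer is 0, letting address computations fold away.
    */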
   var = LLVMAddGlobalInAddressSpace(ctx->ac.module, LLVMArrayType(ctx->ac.i8, lds_size),
                                     "compute_lds", AC_ADDR_SPACE_LDS);
   LLVMSetAlignment(var, 64 * 1024);

   ctx->ac.lds = LLVMBuildBitCast(ctx->ac.builder, var, i8p, "");
}

static bool si_nir_build_llvm(struct si_shader_context *ctx, struct nir_shader *nir)
{
   if (nir->info.stage == MESA_SHADER_FRAGMENT) {
      unsigned colors_read = ctx->shader->selector->info.colors_read;
      LLVMValueRef main_fn = ctx->main_fn;

      LLVMValueRef undef = LLVMGetUndef(ctx->ac.f32);

      unsigned offset = SI_PARAM_POS_FIXED_PT + 1;

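      /* The COL0/COL1 input VGPRs follow the fixed-point position; gather
       * only the components the shader actually reads and pad the rest
       * with undef.
       */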
      if (colors_read & 0x0f) {
         unsigned mask = colors_read & 0x0f;
         LLVMValueRef values[4];
         values[0] = mask & 0x1 ? LLVMGetParam(main_fn, offset++) : undef;
         values[1] = mask & 0x2 ? LLVMGetParam(main_fn, offset++) : undef;
         values[2] = mask & 0x4 ? LLVMGetParam(main_fn, offset++) : undef;
         values[3] = mask & 0x8 ? LLVMGetParam(main_fn, offset++) : undef;
         ctx->abi.color0 = ac_to_integer(&ctx->ac, ac_build_gather_values(&ctx->ac, values, 4));
      }
      if (colors_read & 0xf0) {
         unsigned mask = (colors_read & 0xf0) >> 4;
         LLVMValueRef values[4];
         values[0] = mask & 0x1 ? LLVMGetParam(main_fn, offset++) : undef;
         values[1] = mask & 0x2 ? LLVMGetParam(main_fn, offset++) : undef;
         values[2] = mask & 0x4 ? LLVMGetParam(main_fn, offset++) : undef;
         values[3] = mask & 0x8 ? LLVMGetParam(main_fn, offset++) : undef;
         ctx->abi.color1 = ac_to_integer(&ctx->ac, ac_build_gather_values(&ctx->ac, values, 4));
      }

      ctx->abi.interp_at_sample_force_center =
         ctx->shader->key.mono.u.ps.interpolate_at_sample_force_center;

      ctx->abi.kill_ps_if_inf_interp =
         ctx->screen->options.no_infinite_interp &&
         (ctx->shader->selector->info.uses_persp_center ||
          ctx->shader->selector->info.uses_persp_centroid ||
          ctx->shader->selector->info.uses_persp_sample);

   } else if (nir->info.stage == MESA_SHADER_COMPUTE) {
      if (nir->info.cs.user_data_components_amd) {
         ctx->abi.user_data = ac_get_arg(&ctx->ac, ctx->cs_user_data);
         ctx->abi.user_data = ac_build_expand_to_vec4(&ctx->ac, ctx->abi.user_data,
                                                      nir->info.cs.user_data_components_amd);
      }

      if (ctx->shader->selector->info.base.shared_size)
         si_llvm_declare_compute_memory(ctx);
   }

   ctx->abi.clamp_shadow_reference = true;
   ctx->abi.robust_buffer_access = true;
   ctx->abi.convert_undef_to_zero = true;
   ctx->abi.clamp_div_by_zero = ctx->screen->options.clamp_div_by_zero;
   ctx->abi.adjust_frag_coord_z = false;

   const struct si_shader_info *info = &ctx->shader->selector->info;
   for (unsigned i = 0; i < info->num_outputs; i++) {
      LLVMTypeRef type = ctx->ac.f32;

      /* Only FS uses unpacked f16. Other stages pack 16-bit outputs into low and high bits of f32. */
      if (nir->info.stage == MESA_SHADER_FRAGMENT &&
          nir_alu_type_get_type_size(ctx->shader->selector->info.output_type[i]) == 16)
         type = ctx->ac.f16;

      for (unsigned j = 0; j < 4; j++)
         ctx->abi.outputs[i * 4 + j] = ac_build_alloca_undef(&ctx->ac, type, "");
   }

   ac_nir_translate(&ctx->ac, &ctx->abi, &ctx->args, nir);

   return true;
}

/**
 * Given a list of shader part functions, build a wrapper function that
 * runs them in sequence to form a monolithic shader.
 */
void si_build_wrapper_function(struct si_shader_context *ctx, LLVMValueRef *parts,
                               unsigned num_parts, unsigned main_part,
                               unsigned next_shader_first_part, bool same_thread_count)
{
   LLVMBuilderRef builder = ctx->ac.builder;
   /* PS epilog has one arg per color component; gfx9 merged shader
    * prologs need to forward 40 SGPRs.
    */
   LLVMValueRef initial[AC_MAX_ARGS], out[AC_MAX_ARGS];
   LLVMTypeRef function_type;
   unsigned num_first_params;
   unsigned num_out, initial_num_out;
   ASSERTED unsigned num_out_sgpr;         /* used in debug checks */
   ASSERTED unsigned initial_num_out_sgpr; /* used in debug checks */
   unsigned num_sgprs, num_vgprs;
   unsigned gprs;

   memset(&ctx->args, 0, sizeof(ctx->args));

   for (unsigned i = 0; i < num_parts; ++i) {
      ac_add_function_attr(ctx->ac.context, parts[i], -1, AC_FUNC_ATTR_ALWAYSINLINE);
      LLVMSetLinkage(parts[i], LLVMPrivateLinkage);
   }

   /* The parameters of the wrapper function correspond to those of the
    * first part in terms of SGPRs and VGPRs, but we use the types of the
    * main part to get the right types. This is relevant for the
    * dereferenceable attribute on descriptor table pointers.
    */
   num_sgprs = 0;
   num_vgprs = 0;

   function_type = LLVMGetElementType(LLVMTypeOf(parts[0]));
   num_first_params = LLVMCountParamTypes(function_type);

   for (unsigned i = 0; i < num_first_params; ++i) {
      LLVMValueRef param = LLVMGetParam(parts[0], i);

      if (ac_is_sgpr_param(param)) {
         assert(num_vgprs == 0);
         num_sgprs += ac_get_type_size(LLVMTypeOf(param)) / 4;
      } else {
         num_vgprs += ac_get_type_size(LLVMTypeOf(param)) / 4;
      }
   }

   gprs = 0;
   while (gprs < num_sgprs + num_vgprs) {
      LLVMValueRef param = LLVMGetParam(parts[main_part], ctx->args.arg_count);
      LLVMTypeRef type = LLVMTypeOf(param);
      unsigned size = ac_get_type_size(type) / 4;

      /* This is going to get cast anyway, so we don't have to
       * have the exact same type. But we do have to preserve the
       * pointer-ness so that LLVM knows about it.
       */
      enum ac_arg_type arg_type = AC_ARG_INT;
      if (LLVMGetTypeKind(type) == LLVMPointerTypeKind) {
         type = LLVMGetElementType(type);

         if (LLVMGetTypeKind(type) == LLVMVectorTypeKind) {
            if (LLVMGetVectorSize(type) == 4)
               arg_type = AC_ARG_CONST_DESC_PTR;
            else if (LLVMGetVectorSize(type) == 8)
               arg_type = AC_ARG_CONST_IMAGE_PTR;
            else
               assert(0);
         } else if (type == ctx->ac.f32) {
            arg_type = AC_ARG_CONST_FLOAT_PTR;
         } else {
            assert(0);
         }
      }

      ac_add_arg(&ctx->args, gprs < num_sgprs ? AC_ARG_SGPR : AC_ARG_VGPR, size, arg_type, NULL);

      assert(ac_is_sgpr_param(param) == (gprs < num_sgprs));
      assert(gprs + size <= num_sgprs + num_vgprs &&
             (gprs >= num_sgprs || gprs + size <= num_sgprs));

      gprs += size;
   }

   /* Prepare the return type. */
   unsigned num_returns = 0;
   LLVMTypeRef returns[AC_MAX_ARGS], last_func_type, return_type;

   last_func_type = LLVMGetElementType(LLVMTypeOf(parts[num_parts - 1]));
   return_type = LLVMGetReturnType(last_func_type);

   switch (LLVMGetTypeKind(return_type)) {
   case LLVMStructTypeKind:
      num_returns = LLVMCountStructElementTypes(return_type);
      assert(num_returns <= ARRAY_SIZE(returns));
      LLVMGetStructElementTypes(return_type, returns);
      break;
   case LLVMVoidTypeKind:
      break;
   default:
      unreachable("unexpected type");
   }

   si_llvm_create_func(ctx, "wrapper", returns, num_returns,
                       si_get_max_workgroup_size(ctx->shader));

   if (si_is_merged_shader(ctx->shader) && !same_thread_count)
      ac_init_exec_full_mask(&ctx->ac);

   /* Record the arguments of the function as if they were an output of
    * a previous part.
    */
   num_out = 0;
   num_out_sgpr = 0;

   for (unsigned i = 0; i < ctx->args.arg_count; ++i) {
      LLVMValueRef param = LLVMGetParam(ctx->main_fn, i);
      LLVMTypeRef param_type = LLVMTypeOf(param);
      LLVMTypeRef out_type = ctx->args.args[i].file == AC_ARG_SGPR ? ctx->ac.i32 : ctx->ac.f32;
      unsigned size = ac_get_type_size(param_type) / 4;

      if (size == 1) {
         if (LLVMGetTypeKind(param_type) == LLVMPointerTypeKind) {
            param = LLVMBuildPtrToInt(builder, param, ctx->ac.i32, "");
            param_type = ctx->ac.i32;
         }

         if (param_type != out_type)
            param = LLVMBuildBitCast(builder, param, out_type, "");
         out[num_out++] = param;
      } else {
         LLVMTypeRef vector_type = LLVMVectorType(out_type, size);

         if (LLVMGetTypeKind(param_type) == LLVMPointerTypeKind) {
            param = LLVMBuildPtrToInt(builder, param, ctx->ac.i64, "");
            param_type = ctx->ac.i64;
         }

         if (param_type != vector_type)
            param = LLVMBuildBitCast(builder, param, vector_type, "");

         for (unsigned j = 0; j < size; ++j)
            out[num_out++] =
               LLVMBuildExtractElement(builder, param, LLVMConstInt(ctx->ac.i32, j, 0), "");
      }

      if (ctx->args.args[i].file == AC_ARG_SGPR)
         num_out_sgpr = num_out;
   }

   memcpy(initial, out, sizeof(out));
   initial_num_out = num_out;
   initial_num_out_sgpr = num_out_sgpr;

   /* Now chain the parts. */
   LLVMValueRef ret = NULL;
   for (unsigned part = 0; part < num_parts; ++part) {
      LLVMValueRef in[AC_MAX_ARGS];
      LLVMTypeRef ret_type;
      unsigned out_idx = 0;
      unsigned num_params = LLVMCountParams(parts[part]);

      /* Merged shaders are executed conditionally depending
       * on the number of enabled threads passed in the input SGPRs. */
      if (si_is_multi_part_shader(ctx->shader) && part == 0) {
         if (same_thread_count) {
            struct ac_arg arg;
            arg.arg_index = 3;
            arg.used = true;

            si_init_exec_from_input(ctx, arg, 0);
         } else {
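            /* SGPR 3 of a merged shader carries the wave info; the low 7
             * bits hold the thread count of the first shader part.
             */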
            LLVMValueRef ena, count = initial[3];

            count = LLVMBuildAnd(builder, count, LLVMConstInt(ctx->ac.i32, 0x7f, 0), "");
            ena = LLVMBuildICmp(builder, LLVMIntULT, ac_get_thread_id(&ctx->ac), count, "");
            ac_build_ifcc(&ctx->ac, ena, 6506);
         }
      }

      /* Derive arguments for the next part from outputs of the
       * previous one.
       */
      for (unsigned param_idx = 0; param_idx < num_params; ++param_idx) {
         LLVMValueRef param;
         LLVMTypeRef param_type;
         bool is_sgpr;
         unsigned param_size;
         LLVMValueRef arg = NULL;

         param = LLVMGetParam(parts[part], param_idx);
         param_type = LLVMTypeOf(param);
         param_size = ac_get_type_size(param_type) / 4;
         is_sgpr = ac_is_sgpr_param(param);

         if (is_sgpr) {
            ac_add_function_attr(ctx->ac.context, parts[part], param_idx + 1, AC_FUNC_ATTR_INREG);
         } else if (out_idx < num_out_sgpr) {
            /* Skip returned SGPRs the current part doesn't
             * declare on the input. */
            out_idx = num_out_sgpr;
         }

         assert(out_idx + param_size <= (is_sgpr ? num_out_sgpr : num_out));

         if (param_size == 1)
            arg = out[out_idx];
         else
            arg = ac_build_gather_values(&ctx->ac, &out[out_idx], param_size);

         if (LLVMTypeOf(arg) != param_type) {
            if (LLVMGetTypeKind(param_type) == LLVMPointerTypeKind) {
               if (LLVMGetPointerAddressSpace(param_type) == AC_ADDR_SPACE_CONST_32BIT) {
                  arg = LLVMBuildBitCast(builder, arg, ctx->ac.i32, "");
                  arg = LLVMBuildIntToPtr(builder, arg, param_type, "");
               } else {
                  arg = LLVMBuildBitCast(builder, arg, ctx->ac.i64, "");
                  arg = LLVMBuildIntToPtr(builder, arg, param_type, "");
               }
            } else {
               arg = LLVMBuildBitCast(builder, arg, param_type, "");
            }
         }

         in[param_idx] = arg;
         out_idx += param_size;
      }

      ret = ac_build_call(&ctx->ac, parts[part], in, num_params);

      if (!same_thread_count &&
          si_is_multi_part_shader(ctx->shader) && part + 1 == next_shader_first_part) {
         ac_build_endif(&ctx->ac, 6506);

         /* The second half of the merged shader should use
          * the inputs from the toplevel (wrapper) function,
          * not the return value from the last call.
          *
          * That's because the last call was executed
          * conditionally, so we can't consume it in the
          * main block.
          */
         memcpy(out, initial, sizeof(initial));
         num_out = initial_num_out;
         num_out_sgpr = initial_num_out_sgpr;

         /* Execute the second shader conditionally based on the number of
          * enabled threads there.
          */
         if (ctx->stage == MESA_SHADER_TESS_CTRL) {
            LLVMValueRef ena, count = initial[3];

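            /* The thread count of the second (HS) part lives at bits [14:8]
             * of the same wave info SGPR.
             */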
            count = LLVMBuildLShr(builder, count, LLVMConstInt(ctx->ac.i32, 8, 0), "");
            count = LLVMBuildAnd(builder, count, LLVMConstInt(ctx->ac.i32, 0x7f, 0), "");
            ena = LLVMBuildICmp(builder, LLVMIntULT, ac_get_thread_id(&ctx->ac), count, "");
            ac_build_ifcc(&ctx->ac, ena, 6507);
         }
         continue;
      }

      /* Extract the returned GPRs. */
      ret_type = LLVMTypeOf(ret);
      num_out = 0;
      num_out_sgpr = 0;

      if (LLVMGetTypeKind(ret_type) != LLVMVoidTypeKind) {
         assert(LLVMGetTypeKind(ret_type) == LLVMStructTypeKind);

         unsigned ret_size = LLVMCountStructElementTypes(ret_type);

         for (unsigned i = 0; i < ret_size; ++i) {
            LLVMValueRef val = LLVMBuildExtractValue(builder, ret, i, "");

            assert(num_out < ARRAY_SIZE(out));
            out[num_out++] = val;

            if (LLVMTypeOf(val) == ctx->ac.i32) {
               assert(num_out_sgpr + 1 == num_out);
               num_out_sgpr = num_out;
            }
         }
      }
   }

   /* Close the conditional wrapping the second shader. */
   if (ctx->stage == MESA_SHADER_TESS_CTRL &&
       !same_thread_count && si_is_multi_part_shader(ctx->shader))
      ac_build_endif(&ctx->ac, 6507);

   if (LLVMGetTypeKind(LLVMTypeOf(ret)) == LLVMVoidTypeKind)
      LLVMBuildRetVoid(builder);
   else
      LLVMBuildRet(builder, ret);
}

bool si_llvm_translate_nir(struct si_shader_context *ctx, struct si_shader *shader,
                           struct nir_shader *nir, bool free_nir, bool ngg_cull_shader)
{
   struct si_shader_selector *sel = shader->selector;
   const struct si_shader_info *info = &sel->info;

   ctx->shader = shader;
   ctx->stage = sel->info.stage;

   ctx->num_const_buffers = info->base.num_ubos;
   ctx->num_shader_buffers = info->base.num_ssbos;

   ctx->num_samplers = BITSET_LAST_BIT(info->base.textures_used);
   ctx->num_images = info->base.num_images;

   si_llvm_init_resource_callbacks(ctx);

   switch (ctx->stage) {
   case MESA_SHADER_VERTEX:
      si_llvm_init_vs_callbacks(ctx, ngg_cull_shader);
      break;
   case MESA_SHADER_TESS_CTRL:
      si_llvm_init_tcs_callbacks(ctx);
      break;
   case MESA_SHADER_TESS_EVAL:
      si_llvm_init_tes_callbacks(ctx, ngg_cull_shader);
      break;
   case MESA_SHADER_GEOMETRY:
      si_llvm_init_gs_callbacks(ctx);
      break;
   case MESA_SHADER_FRAGMENT:
      si_llvm_init_ps_callbacks(ctx);
      break;
   case MESA_SHADER_COMPUTE:
      ctx->abi.load_local_group_size = si_llvm_get_block_size;
      break;
   default:
      assert(!"Unsupported shader type");
      return false;
   }

   si_llvm_create_main_func(ctx, ngg_cull_shader);

   if (ctx->shader->key.as_es || ctx->stage == MESA_SHADER_GEOMETRY)
      si_preload_esgs_ring(ctx);

   if (ctx->stage == MESA_SHADER_GEOMETRY)
      si_preload_gs_rings(ctx);
   else if (ctx->stage == MESA_SHADER_TESS_EVAL)
      si_llvm_preload_tes_rings(ctx);

   if (ctx->stage == MESA_SHADER_TESS_CTRL && sel->info.tessfactors_are_def_in_all_invocs) {
      for (unsigned i = 0; i < 6; i++) {
         ctx->invoc0_tess_factors[i] = ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, "");
      }
   }

   if (ctx->stage == MESA_SHADER_GEOMETRY) {
      for (unsigned i = 0; i < 4; i++) {
         ctx->gs_next_vertex[i] = ac_build_alloca(&ctx->ac, ctx->ac.i32, "");
      }
      if (shader->key.as_ngg) {
         for (unsigned i = 0; i < 4; ++i) {
            ctx->gs_curprim_verts[i] = ac_build_alloca(&ctx->ac, ctx->ac.i32, "");
            ctx->gs_generated_prims[i] = ac_build_alloca(&ctx->ac, ctx->ac.i32, "");
         }

         assert(!ctx->gs_ngg_scratch);
         LLVMTypeRef ai32 = LLVMArrayType(ctx->ac.i32, gfx10_ngg_get_scratch_dw_size(shader));
         ctx->gs_ngg_scratch =
            LLVMAddGlobalInAddressSpace(ctx->ac.module, ai32, "ngg_scratch", AC_ADDR_SPACE_LDS);
         LLVMSetInitializer(ctx->gs_ngg_scratch, LLVMGetUndef(ai32));
         LLVMSetAlignment(ctx->gs_ngg_scratch, 4);

         ctx->gs_ngg_emit = LLVMAddGlobalInAddressSpace(
            ctx->ac.module, LLVMArrayType(ctx->ac.i32, 0), "ngg_emit", AC_ADDR_SPACE_LDS);
         LLVMSetLinkage(ctx->gs_ngg_emit, LLVMExternalLinkage);
         LLVMSetAlignment(ctx->gs_ngg_emit, 4);
      }
   }

   if (ctx->stage != MESA_SHADER_GEOMETRY && (shader->key.as_ngg && !shader->key.as_es)) {
      /* Unconditionally declare scratch space base for streamout and
       * vertex compaction. Whether space is actually allocated is
       * determined during linking / PM4 creation.
       */
      si_llvm_declare_esgs_ring(ctx);

      /* This is really only needed when streamout and/or vertex
       * compaction is enabled.
       */
      if (!ctx->gs_ngg_scratch && (sel->so.num_outputs || shader->key.opt.ngg_culling)) {
         LLVMTypeRef asi32 = LLVMArrayType(ctx->ac.i32, gfx10_ngg_get_scratch_dw_size(shader));
         ctx->gs_ngg_scratch =
            LLVMAddGlobalInAddressSpace(ctx->ac.module, asi32, "ngg_scratch", AC_ADDR_SPACE_LDS);
         LLVMSetInitializer(ctx->gs_ngg_scratch, LLVMGetUndef(asi32));
         LLVMSetAlignment(ctx->gs_ngg_scratch, 4);
      }
   }

   /* For merged shaders (VS-TCS, VS-GS, TES-GS): */
   if (ctx->screen->info.chip_class >= GFX9 && si_is_merged_shader(shader)) {
      LLVMValueRef thread_enabled = NULL;

      /* TES is special because it has only 1 shader part if NGG shader culling is disabled,
       * and therefore it doesn't use the wrapper function.
       */
      bool no_wrapper_func = ctx->stage == MESA_SHADER_TESS_EVAL && !shader->key.as_es &&
                             !shader->key.opt.ngg_culling;

      /* Set EXEC = ~0 before the first shader. If the prolog is present, EXEC is set there
       * instead. For monolithic shaders, the wrapper function does this.
       */
      if ((!shader->is_monolithic || no_wrapper_func) &&
          (ctx->stage == MESA_SHADER_TESS_EVAL ||
           (ctx->stage == MESA_SHADER_VERTEX &&
            !si_vs_needs_prolog(sel, &shader->key.part.vs.prolog, &shader->key, ngg_cull_shader))))
         ac_init_exec_full_mask(&ctx->ac);

      /* NGG VS and NGG TES: Send gs_alloc_req and the prim export at the beginning to decrease
       * register usage.
       */
      if ((ctx->stage == MESA_SHADER_VERTEX || ctx->stage == MESA_SHADER_TESS_EVAL) &&
          shader->key.as_ngg && !shader->key.as_es && !shader->key.opt.ngg_culling) {
         /* GFX10 requires a barrier before gs_alloc_req due to a hw bug. */
         if (ctx->screen->info.chip_class == GFX10)
            ac_build_s_barrier(&ctx->ac);

         gfx10_ngg_build_sendmsg_gs_alloc_req(ctx);

         /* Build the primitive export at the beginning
          * of the shader if possible.
          */
         if (gfx10_ngg_export_prim_early(shader))
            gfx10_ngg_build_export_prim(ctx, NULL, NULL);
      }

      /* NGG GS: Initialize LDS and insert s_barrier, which must not be inside the if statement. */
      if (ctx->stage == MESA_SHADER_GEOMETRY && shader->key.as_ngg)
         gfx10_ngg_gs_emit_prologue(ctx);

      if (ctx->stage == MESA_SHADER_GEOMETRY ||
          (ctx->stage == MESA_SHADER_TESS_CTRL && !shader->is_monolithic)) {
         /* Wrap both shaders in an if statement according to the number of enabled threads
          * there. For monolithic TCS, the if statement is inserted by the wrapper function,
          * not here.
          */
         thread_enabled = si_is_gs_thread(ctx); /* 2nd shader: thread enabled bool */
      } else if (((shader->key.as_ls || shader->key.as_es) && !shader->is_monolithic) ||
                 (shader->key.as_ngg && !shader->key.as_es)) {
         /* This is NGG VS or NGG TES or VS before GS or TES before GS or VS before TCS.
          * For monolithic LS (VS before TCS) and ES (VS before GS and TES before GS),
          * the if statement is inserted by the wrapper function.
          */
         thread_enabled = si_is_es_thread(ctx); /* 1st shader: thread enabled bool */
      }

      if (thread_enabled) {
         ctx->merged_wrap_if_entry_block = LLVMGetInsertBlock(ctx->ac.builder);
         ctx->merged_wrap_if_label = 11500;
         ac_build_ifcc(&ctx->ac, thread_enabled, ctx->merged_wrap_if_label);
      }

      /* Execute a barrier before the second shader in
       * a merged shader.
       *
       * Execute the barrier inside the conditional block,
       * so that empty waves can jump directly to s_endpgm,
       * which will also signal the barrier.
       *
       * This is possible in gfx9, because an empty wave
       * for the second shader does not participate in
       * the epilogue. With NGG, empty waves may still
       * be required to export data (e.g. GS output vertices),
       * so we cannot let them exit early.
       *
       * If the shader is TCS and the TCS epilog is present
       * and contains a barrier, it will wait there and then
       * reach s_endpgm.
       */
      if (ctx->stage == MESA_SHADER_TESS_CTRL) {
         /* We need the barrier only if TCS inputs are read from LDS. */
         if (!shader->key.opt.same_patch_vertices ||
             shader->selector->info.base.inputs_read &
             ~shader->selector->tcs_vgpr_only_inputs)
            ac_build_s_barrier(&ctx->ac);
      } else if (ctx->stage == MESA_SHADER_GEOMETRY && !shader->key.as_ngg) {
         /* gfx10_ngg_gs_emit_prologue inserts the barrier for NGG. */
         ac_build_s_barrier(&ctx->ac);
      }
   }

   bool success = si_nir_build_llvm(ctx, nir);
   if (free_nir)
      ralloc_free(nir);
   if (!success) {
      fprintf(stderr, "Failed to translate shader from NIR to LLVM\n");
      return false;
   }

   si_llvm_build_ret(ctx, ctx->return_value);
   return true;
}

static bool si_should_optimize_less(struct ac_llvm_compiler *compiler,
                                    struct si_shader_selector *sel)
{
   if (!compiler->low_opt_passes)
      return false;

   /* Assume a slow CPU. */
   assert(!sel->screen->info.has_dedicated_vram && sel->screen->info.chip_class <= GFX8);

   /* For a crazy dEQP test containing 2597 memory opcodes, mostly
    * buffer stores. */
   return sel->info.stage == MESA_SHADER_COMPUTE && sel->info.num_memory_stores > 1000;
}

static void si_optimize_vs_outputs(struct si_shader_context *ctx)
{
   struct si_shader *shader = ctx->shader;
   struct si_shader_info *info = &shader->selector->info;
   unsigned skip_vs_optim_mask = 0;

   if ((ctx->stage != MESA_SHADER_VERTEX && ctx->stage != MESA_SHADER_TESS_EVAL) ||
       shader->key.as_ls || shader->key.as_es)
      return;

   /* Optimizing these outputs is not possible, since they might be overridden
    * at runtime with S_028644_PT_SPRITE_TEX. */
   for (int i = 0; i < info->num_outputs; i++) {
      if (info->output_semantic[i] == VARYING_SLOT_PNTC ||
          (info->output_semantic[i] >= VARYING_SLOT_TEX0 &&
           info->output_semantic[i] <= VARYING_SLOT_TEX7)) {
         skip_vs_optim_mask |= 1u << shader->info.vs_output_param_offset[i];
      }
   }

   ac_optimize_vs_outputs(&ctx->ac, ctx->main_fn, shader->info.vs_output_param_offset,
                          info->num_outputs, skip_vs_optim_mask,
                          &shader->info.nr_param_exports);
}

bool si_llvm_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
                            struct si_shader *shader, struct pipe_debug_callback *debug,
                            struct nir_shader *nir, bool free_nir)
{
   struct si_shader_selector *sel = shader->selector;
   struct si_shader_context ctx;

   si_llvm_context_init(&ctx, sscreen, compiler, si_get_shader_wave_size(shader));

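   /* With NGG culling, the shader is translated twice: first the culling
    * (position-only) variant, then the full variant; the wrapper functions
    * built below chain the two together.
    */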
   LLVMValueRef ngg_cull_main_fn = NULL;
   if (shader->key.opt.ngg_culling) {
      if (!si_llvm_translate_nir(&ctx, shader, nir, false, true)) {
         si_llvm_dispose(&ctx);
         return false;
      }
      ngg_cull_main_fn = ctx.main_fn;
      ctx.main_fn = NULL;
   }

   if (!si_llvm_translate_nir(&ctx, shader, nir, free_nir, false)) {
      si_llvm_dispose(&ctx);
      return false;
   }

   if (shader->is_monolithic && ctx.stage == MESA_SHADER_VERTEX) {
      LLVMValueRef parts[4];
      unsigned num_parts = 0;
      bool first_is_prolog = false;
      LLVMValueRef main_fn = ctx.main_fn;

      if (ngg_cull_main_fn) {
         if (si_vs_needs_prolog(sel, &shader->key.part.vs.prolog, &shader->key, true)) {
            union si_shader_part_key prolog_key;
            si_get_vs_prolog_key(&sel->info, shader->info.num_input_sgprs, true,
                                 &shader->key.part.vs.prolog, shader, &prolog_key);
            prolog_key.vs_prolog.is_monolithic = true;
            si_llvm_build_vs_prolog(&ctx, &prolog_key);
            parts[num_parts++] = ctx.main_fn;
            first_is_prolog = true;
         }
         parts[num_parts++] = ngg_cull_main_fn;
      }

      if (si_vs_needs_prolog(sel, &shader->key.part.vs.prolog, &shader->key, false)) {
         union si_shader_part_key prolog_key;
         si_get_vs_prolog_key(&sel->info, shader->info.num_input_sgprs, false,
                              &shader->key.part.vs.prolog, shader, &prolog_key);
         prolog_key.vs_prolog.is_monolithic = true;
         si_llvm_build_vs_prolog(&ctx, &prolog_key);
         parts[num_parts++] = ctx.main_fn;
         if (num_parts == 1)
            first_is_prolog = true;
      }
      parts[num_parts++] = main_fn;

      si_build_wrapper_function(&ctx, parts, num_parts, first_is_prolog ? 1 : 0, 0, false);
   } else if (shader->is_monolithic && ctx.stage == MESA_SHADER_TESS_EVAL && ngg_cull_main_fn) {
      LLVMValueRef parts[3], prolog, main_fn = ctx.main_fn;

      /* We reuse the VS prolog code for TES just to load the input VGPRs from LDS. */
      union si_shader_part_key prolog_key;
      memset(&prolog_key, 0, sizeof(prolog_key));
      prolog_key.vs_prolog.num_input_sgprs = shader->info.num_input_sgprs;
      prolog_key.vs_prolog.num_merged_next_stage_vgprs = 5;
      prolog_key.vs_prolog.as_ngg = 1;
      prolog_key.vs_prolog.load_vgprs_after_culling = 1;
      prolog_key.vs_prolog.is_monolithic = true;
      si_llvm_build_vs_prolog(&ctx, &prolog_key);
      prolog = ctx.main_fn;

      parts[0] = ngg_cull_main_fn;
      parts[1] = prolog;
      parts[2] = main_fn;

      si_build_wrapper_function(&ctx, parts, 3, 0, 0, false);
   } else if (shader->is_monolithic && ctx.stage == MESA_SHADER_TESS_CTRL) {
      if (sscreen->info.chip_class >= GFX9) {
         struct si_shader_selector *ls = shader->key.part.tcs.ls;
         LLVMValueRef parts[4];
         bool vs_needs_prolog =
            si_vs_needs_prolog(ls, &shader->key.part.tcs.ls_prolog, &shader->key, false);

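         /* The four parts are assembled out of order: [0] optional LS
          * prolog, [1] VS-as-LS main, [2] TCS main, [3] TCS epilog.
          */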
         /* TCS main part */
         parts[2] = ctx.main_fn;

         /* TCS epilog */
         union si_shader_part_key tcs_epilog_key;
         memset(&tcs_epilog_key, 0, sizeof(tcs_epilog_key));
         tcs_epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog;
         si_llvm_build_tcs_epilog(&ctx, &tcs_epilog_key);
         parts[3] = ctx.main_fn;

         /* VS as LS main part */
         ctx.next_shader_sel = ctx.shader->selector;
         nir = si_get_nir_shader(ls, NULL, &free_nir);
         struct si_shader shader_ls = {};
         shader_ls.selector = ls;
         shader_ls.key.as_ls = 1;
         shader_ls.key.mono = shader->key.mono;
         shader_ls.key.opt = shader->key.opt;
         shader_ls.is_monolithic = true;

         if (!si_llvm_translate_nir(&ctx, &shader_ls, nir, free_nir, false)) {
            si_llvm_dispose(&ctx);
            return false;
         }
         shader->info.uses_instanceid |= ls->info.uses_instanceid;
         parts[1] = ctx.main_fn;

         /* LS prolog */
         if (vs_needs_prolog) {
            union si_shader_part_key vs_prolog_key;
            si_get_vs_prolog_key(&ls->info, shader_ls.info.num_input_sgprs, false,
                                 &shader->key.part.tcs.ls_prolog, shader, &vs_prolog_key);
            vs_prolog_key.vs_prolog.is_monolithic = true;
            si_llvm_build_vs_prolog(&ctx, &vs_prolog_key);
            parts[0] = ctx.main_fn;
         }

         /* Reset the shader context. */
         ctx.shader = shader;
         ctx.stage = MESA_SHADER_TESS_CTRL;

         si_build_wrapper_function(&ctx, parts + !vs_needs_prolog, 4 - !vs_needs_prolog,
                                   vs_needs_prolog, vs_needs_prolog ? 2 : 1,
                                   shader->key.opt.same_patch_vertices);
      } else {
         LLVMValueRef parts[2];
         union si_shader_part_key epilog_key;

         parts[0] = ctx.main_fn;

         memset(&epilog_key, 0, sizeof(epilog_key));
         epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog;
         si_llvm_build_tcs_epilog(&ctx, &epilog_key);
         parts[1] = ctx.main_fn;

         si_build_wrapper_function(&ctx, parts, 2, 0, 0, false);
      }
   } else if (shader->is_monolithic && ctx.stage == MESA_SHADER_GEOMETRY) {
      if (ctx.screen->info.chip_class >= GFX9) {
         struct si_shader_selector *es = shader->key.part.gs.es;
         LLVMValueRef es_prolog = NULL;
         LLVMValueRef es_main = NULL;
         LLVMValueRef gs_prolog = NULL;
         LLVMValueRef gs_main = ctx.main_fn;

         /* GS prolog */
         union si_shader_part_key gs_prolog_key;
         memset(&gs_prolog_key, 0, sizeof(gs_prolog_key));
         gs_prolog_key.gs_prolog.states = shader->key.part.gs.prolog;
         gs_prolog_key.gs_prolog.as_ngg = shader->key.as_ngg;
         si_llvm_build_gs_prolog(&ctx, &gs_prolog_key);
         gs_prolog = ctx.main_fn;

         /* ES main part */
         nir = si_get_nir_shader(es, NULL, &free_nir);
         struct si_shader shader_es = {};
         shader_es.selector = es;
         shader_es.key.as_es = 1;
         shader_es.key.as_ngg = shader->key.as_ngg;
         shader_es.key.mono = shader->key.mono;
         shader_es.key.opt = shader->key.opt;
         shader_es.is_monolithic = true;

         if (!si_llvm_translate_nir(&ctx, &shader_es, nir, free_nir, false)) {
            si_llvm_dispose(&ctx);
            return false;
         }
         shader->info.uses_instanceid |= es->info.uses_instanceid;
         es_main = ctx.main_fn;

         /* ES prolog */
         if (es->info.stage == MESA_SHADER_VERTEX &&
             si_vs_needs_prolog(es, &shader->key.part.gs.vs_prolog, &shader->key, false)) {
            union si_shader_part_key vs_prolog_key;
            si_get_vs_prolog_key(&es->info, shader_es.info.num_input_sgprs, false,
                                 &shader->key.part.gs.vs_prolog, shader, &vs_prolog_key);
            vs_prolog_key.vs_prolog.is_monolithic = true;
            si_llvm_build_vs_prolog(&ctx, &vs_prolog_key);
            es_prolog = ctx.main_fn;
         }

         /* Reset the shader context. */
         ctx.shader = shader;
         ctx.stage = MESA_SHADER_GEOMETRY;

         /* Prepare the array of shader parts. */
         LLVMValueRef parts[4];
         unsigned num_parts = 0, main_part, next_first_part;

         if (es_prolog)
            parts[num_parts++] = es_prolog;

         parts[main_part = num_parts++] = es_main;
         parts[next_first_part = num_parts++] = gs_prolog;
         parts[num_parts++] = gs_main;

         si_build_wrapper_function(&ctx, parts, num_parts, main_part, next_first_part, false);
      } else {
         LLVMValueRef parts[2];
         union si_shader_part_key prolog_key;

         parts[1] = ctx.main_fn;

         memset(&prolog_key, 0, sizeof(prolog_key));
         prolog_key.gs_prolog.states = shader->key.part.gs.prolog;
         si_llvm_build_gs_prolog(&ctx, &prolog_key);
         parts[0] = ctx.main_fn;

         si_build_wrapper_function(&ctx, parts, 2, 1, 0, false);
      }
   } else if (shader->is_monolithic && ctx.stage == MESA_SHADER_FRAGMENT) {
      si_llvm_build_monolithic_ps(&ctx, shader);
   }

   si_llvm_optimize_module(&ctx);

   /* Post-optimization transformations and analysis. */
   si_optimize_vs_outputs(&ctx);

   if ((debug && debug->debug_message) || si_can_dump_shader(sscreen, ctx.stage)) {
      ctx.shader->info.private_mem_vgprs = ac_count_scratch_private_memory(ctx.main_fn);
   }

   /* Make sure the input is a pointer and not integer followed by inttoptr. */
   assert(LLVMGetTypeKind(LLVMTypeOf(LLVMGetParam(ctx.main_fn, 0))) == LLVMPointerTypeKind);

   /* Compile to bytecode. */
   if (!si_compile_llvm(sscreen, &shader->binary, &shader->config, compiler, &ctx.ac, debug,
                        ctx.stage, si_get_shader_name(shader),
                        si_should_optimize_less(compiler, shader->selector))) {
      si_llvm_dispose(&ctx);
      fprintf(stderr, "LLVM failed to compile shader\n");
      return false;
   }

   si_llvm_dispose(&ctx);
   return true;
}