1 /*
2  * Copyright © 2016 Red Hat.
3  * Copyright © 2016 Bas Nieuwenhuizen
4  *
5  * based in part on anv driver which is:
6  * Copyright © 2015 Intel Corporation
7  *
8  * Permission is hereby granted, free of charge, to any person obtaining a
9  * copy of this software and associated documentation files (the "Software"),
10  * to deal in the Software without restriction, including without limitation
11  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
12  * and/or sell copies of the Software, and to permit persons to whom the
13  * Software is furnished to do so, subject to the following conditions:
14  *
15  * The above copyright notice and this permission notice (including the next
16  * paragraph) shall be included in all copies or substantial portions of the
17  * Software.
18  *
19  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
22  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
25  * IN THE SOFTWARE.
26  */
27 
28 #include "nir/nir.h"
29 #include "radv_debug.h"
30 #include "radv_llvm_helper.h"
31 #include "radv_private.h"
32 #include "radv_shader.h"
33 #include "radv_shader_args.h"
34 
35 #include "ac_binary.h"
36 #include "ac_exp_param.h"
37 #include "ac_llvm_build.h"
38 #include "ac_nir_to_llvm.h"
39 #include "ac_shader_abi.h"
40 #include "ac_shader_util.h"
41 #include "sid.h"
42 
43 struct radv_shader_context {
44    struct ac_llvm_context ac;
45    const struct nir_shader *shader;
46    struct ac_shader_abi abi;
47    const struct radv_shader_args *args;
48 
49    gl_shader_stage stage;
50 
51    unsigned max_workgroup_size;
52    LLVMContextRef context;
53    LLVMValueRef main_function;
54 
55    LLVMValueRef descriptor_sets[MAX_SETS];
56 
57    LLVMValueRef ring_offsets;
58 
59    LLVMValueRef vs_rel_patch_id;
60 
61    LLVMValueRef gs_wave_id;
62    LLVMValueRef gs_vtx_offset[6];
63 
64    LLVMValueRef esgs_ring;
65    LLVMValueRef gsvs_ring[4];
66    LLVMValueRef hs_ring_tess_offchip;
67    LLVMValueRef hs_ring_tess_factor;
68 
69    uint64_t output_mask;
70 
71    LLVMValueRef gs_next_vertex[4];
72    LLVMValueRef gs_curprim_verts[4];
73    LLVMValueRef gs_generated_prims[4];
74    LLVMValueRef gs_ngg_emit;
75    LLVMValueRef gs_ngg_scratch;
76 
77    LLVMValueRef vertexptr; /* GFX10 only */
78 };
79 
80 struct radv_shader_output_values {
81    LLVMValueRef values[4];
82    unsigned slot_name;
83    unsigned slot_index;
84    unsigned usage_mask;
85 };
86 
87 static inline struct radv_shader_context *
radv_shader_context_from_abi(struct ac_shader_abi * abi)88 radv_shader_context_from_abi(struct ac_shader_abi *abi)
89 {
90    return container_of(abi, struct radv_shader_context, abi);
91 }
92 
93 static LLVMValueRef
create_llvm_function(struct ac_llvm_context * ctx,LLVMModuleRef module,LLVMBuilderRef builder,const struct ac_shader_args * args,enum ac_llvm_calling_convention convention,unsigned max_workgroup_size,const struct radv_nir_compiler_options * options)94 create_llvm_function(struct ac_llvm_context *ctx, LLVMModuleRef module, LLVMBuilderRef builder,
95                      const struct ac_shader_args *args, enum ac_llvm_calling_convention convention,
96                      unsigned max_workgroup_size, const struct radv_nir_compiler_options *options)
97 {
98    LLVMValueRef main_function = ac_build_main(args, ctx, convention, "main", ctx->voidt, module);
99 
100    if (options->address32_hi) {
101       ac_llvm_add_target_dep_function_attr(main_function, "amdgpu-32bit-address-high-bits",
102                                            options->address32_hi);
103    }
104 
105    ac_llvm_set_workgroup_size(main_function, max_workgroup_size);
106    ac_llvm_set_target_features(main_function, ctx);
107 
108    return main_function;
109 }
110 
111 static void
load_descriptor_sets(struct radv_shader_context * ctx)112 load_descriptor_sets(struct radv_shader_context *ctx)
113 {
114    struct radv_userdata_locations *user_sgprs_locs = &ctx->args->shader_info->user_sgprs_locs;
115    uint32_t mask = ctx->args->shader_info->desc_set_used_mask;
116 
117    if (user_sgprs_locs->shader_data[AC_UD_INDIRECT_DESCRIPTOR_SETS].sgpr_idx != -1) {
118       LLVMValueRef desc_sets = ac_get_arg(&ctx->ac, ctx->args->descriptor_sets[0]);
119       while (mask) {
120          int i = u_bit_scan(&mask);
121 
122          ctx->descriptor_sets[i] =
123             ac_build_load_to_sgpr(&ctx->ac, desc_sets, LLVMConstInt(ctx->ac.i32, i, false));
124          LLVMSetAlignment(ctx->descriptor_sets[i], 4);
125       }
126    } else {
127       while (mask) {
128          int i = u_bit_scan(&mask);
129 
130          ctx->descriptor_sets[i] = ac_get_arg(&ctx->ac, ctx->args->descriptor_sets[i]);
131       }
132    }
133 }
134 
135 static enum ac_llvm_calling_convention
get_llvm_calling_convention(LLVMValueRef func,gl_shader_stage stage)136 get_llvm_calling_convention(LLVMValueRef func, gl_shader_stage stage)
137 {
138    switch (stage) {
139    case MESA_SHADER_VERTEX:
140    case MESA_SHADER_TESS_EVAL:
141       return AC_LLVM_AMDGPU_VS;
142       break;
143    case MESA_SHADER_GEOMETRY:
144       return AC_LLVM_AMDGPU_GS;
145       break;
146    case MESA_SHADER_TESS_CTRL:
147       return AC_LLVM_AMDGPU_HS;
148       break;
149    case MESA_SHADER_FRAGMENT:
150       return AC_LLVM_AMDGPU_PS;
151       break;
152    case MESA_SHADER_COMPUTE:
153       return AC_LLVM_AMDGPU_CS;
154       break;
155    default:
156       unreachable("Unhandle shader type");
157    }
158 }
159 
160 /* Returns whether the stage is a stage that can be directly before the GS */
161 static bool
is_pre_gs_stage(gl_shader_stage stage)162 is_pre_gs_stage(gl_shader_stage stage)
163 {
164    return stage == MESA_SHADER_VERTEX || stage == MESA_SHADER_TESS_EVAL;
165 }
166 
167 static void
create_function(struct radv_shader_context * ctx,gl_shader_stage stage,bool has_previous_stage)168 create_function(struct radv_shader_context *ctx, gl_shader_stage stage, bool has_previous_stage)
169 {
170    if (ctx->ac.chip_class >= GFX10) {
171       if (is_pre_gs_stage(stage) && ctx->args->shader_info->is_ngg) {
172          /* On GFX10, VS is merged into GS for NGG. */
173          stage = MESA_SHADER_GEOMETRY;
174          has_previous_stage = true;
175       }
176    }
177 
178    ctx->main_function =
179       create_llvm_function(&ctx->ac, ctx->ac.module, ctx->ac.builder, &ctx->args->ac,
180                            get_llvm_calling_convention(ctx->main_function, stage),
181                            ctx->max_workgroup_size, ctx->args->options);
182 
183    ctx->ring_offsets = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.implicit.buffer.ptr",
184                                           LLVMPointerType(ctx->ac.i8, AC_ADDR_SPACE_CONST), NULL, 0,
185                                           AC_FUNC_ATTR_READNONE);
186    ctx->ring_offsets = LLVMBuildBitCast(ctx->ac.builder, ctx->ring_offsets,
187                                         ac_array_in_const_addr_space(ctx->ac.v4i32), "");
188 
189    load_descriptor_sets(ctx);
190 
191    if (stage == MESA_SHADER_TESS_CTRL ||
192        (stage == MESA_SHADER_VERTEX && ctx->args->shader_info->vs.as_ls) ||
193        /* GFX9 has the ESGS ring buffer in LDS. */
194        (stage == MESA_SHADER_GEOMETRY && has_previous_stage)) {
195       ac_declare_lds_as_pointer(&ctx->ac);
196    }
197 }
198 
199 static LLVMValueRef
radv_load_resource(struct ac_shader_abi * abi,LLVMValueRef index,unsigned desc_set,unsigned binding)200 radv_load_resource(struct ac_shader_abi *abi, LLVMValueRef index, unsigned desc_set,
201                    unsigned binding)
202 {
203    struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);
204    LLVMValueRef desc_ptr = ctx->descriptor_sets[desc_set];
205    struct radv_pipeline_layout *pipeline_layout = ctx->args->options->layout;
206    struct radv_descriptor_set_layout *layout = pipeline_layout->set[desc_set].layout;
207    unsigned base_offset = layout->binding[binding].offset;
208    LLVMValueRef offset, stride;
209 
210    if (layout->binding[binding].type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC ||
211        layout->binding[binding].type == VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC) {
212       unsigned idx = pipeline_layout->set[desc_set].dynamic_offset_start +
213                      layout->binding[binding].dynamic_offset_offset;
214       desc_ptr = ac_get_arg(&ctx->ac, ctx->args->ac.push_constants);
215       base_offset = pipeline_layout->push_constant_size + 16 * idx;
216       stride = LLVMConstInt(ctx->ac.i32, 16, false);
217    } else
218       stride = LLVMConstInt(ctx->ac.i32, layout->binding[binding].size, false);
219 
220    offset = LLVMConstInt(ctx->ac.i32, base_offset, false);
221 
222    if (layout->binding[binding].type != VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT) {
223       offset = ac_build_imad(&ctx->ac, index, stride, offset);
224    }
225 
226    desc_ptr = LLVMBuildPtrToInt(ctx->ac.builder, desc_ptr, ctx->ac.i32, "");
227 
228    LLVMValueRef res[] = {desc_ptr, offset, ctx->ac.i32_0};
229    return ac_build_gather_values(&ctx->ac, res, 3);
230 }
231 
232 static uint32_t
radv_get_sample_pos_offset(uint32_t num_samples)233 radv_get_sample_pos_offset(uint32_t num_samples)
234 {
235    uint32_t sample_pos_offset = 0;
236 
237    switch (num_samples) {
238    case 2:
239       sample_pos_offset = 1;
240       break;
241    case 4:
242       sample_pos_offset = 3;
243       break;
244    case 8:
245       sample_pos_offset = 7;
246       break;
247    default:
248       break;
249    }
250    return sample_pos_offset;
251 }
252 
253 static LLVMValueRef
load_sample_position(struct ac_shader_abi * abi,LLVMValueRef sample_id)254 load_sample_position(struct ac_shader_abi *abi, LLVMValueRef sample_id)
255 {
256    struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);
257 
258    LLVMValueRef result;
259    LLVMValueRef index = LLVMConstInt(ctx->ac.i32, RING_PS_SAMPLE_POSITIONS, false);
260    LLVMValueRef ptr = LLVMBuildGEP(ctx->ac.builder, ctx->ring_offsets, &index, 1, "");
261 
262    ptr = LLVMBuildBitCast(ctx->ac.builder, ptr, ac_array_in_const_addr_space(ctx->ac.v2f32), "");
263 
264    uint32_t sample_pos_offset = radv_get_sample_pos_offset(ctx->args->options->key.ps.num_samples);
265 
266    sample_id = LLVMBuildAdd(ctx->ac.builder, sample_id,
267                             LLVMConstInt(ctx->ac.i32, sample_pos_offset, false), "");
268    result = ac_build_load_invariant(&ctx->ac, ptr, sample_id);
269 
270    return result;
271 }
272 
273 static LLVMValueRef
load_sample_mask_in(struct ac_shader_abi * abi)274 load_sample_mask_in(struct ac_shader_abi *abi)
275 {
276    struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);
277    uint8_t log2_ps_iter_samples;
278 
279    if (ctx->args->shader_info->ps.uses_sample_shading) {
280       log2_ps_iter_samples = util_logbase2(ctx->args->options->key.ps.num_samples);
281    } else {
282       log2_ps_iter_samples = ctx->args->options->key.ps.log2_ps_iter_samples;
283    }
284 
285    LLVMValueRef result, sample_id;
286    if (log2_ps_iter_samples) {
287       /* gl_SampleMaskIn[0] = (SampleCoverage & (1 << gl_SampleID)). */
288       sample_id = ac_unpack_param(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->ac.ancillary), 8, 4);
289       sample_id = LLVMBuildShl(ctx->ac.builder, LLVMConstInt(ctx->ac.i32, 1, false), sample_id, "");
290       result = LLVMBuildAnd(ctx->ac.builder, sample_id,
291                             ac_get_arg(&ctx->ac, ctx->args->ac.sample_coverage), "");
292    } else {
293       result = ac_get_arg(&ctx->ac, ctx->args->ac.sample_coverage);
294    }
295 
296    return result;
297 }
298 
299 static void gfx10_ngg_gs_emit_vertex(struct radv_shader_context *ctx, unsigned stream,
300                                      LLVMValueRef vertexidx, LLVMValueRef *addrs);
301 
302 static void
visit_emit_vertex_with_counter(struct ac_shader_abi * abi,unsigned stream,LLVMValueRef vertexidx,LLVMValueRef * addrs)303 visit_emit_vertex_with_counter(struct ac_shader_abi *abi, unsigned stream, LLVMValueRef vertexidx,
304                                LLVMValueRef *addrs)
305 {
306    unsigned offset = 0;
307    struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);
308 
309    if (ctx->args->shader_info->is_ngg) {
310       gfx10_ngg_gs_emit_vertex(ctx, stream, vertexidx, addrs);
311       return;
312    }
313 
314    for (unsigned i = 0; i < AC_LLVM_MAX_OUTPUTS; ++i) {
315       unsigned output_usage_mask = ctx->args->shader_info->gs.output_usage_mask[i];
316       uint8_t output_stream = ctx->args->shader_info->gs.output_streams[i];
317       LLVMValueRef *out_ptr = &addrs[i * 4];
318       int length = util_last_bit(output_usage_mask);
319 
320       if (!(ctx->output_mask & (1ull << i)) || output_stream != stream)
321          continue;
322 
323       for (unsigned j = 0; j < length; j++) {
324          if (!(output_usage_mask & (1 << j)))
325             continue;
326 
327          LLVMValueRef out_val = LLVMBuildLoad(ctx->ac.builder, out_ptr[j], "");
328          LLVMValueRef voffset =
329             LLVMConstInt(ctx->ac.i32, offset * ctx->shader->info.gs.vertices_out, false);
330 
331          offset++;
332 
333          voffset = LLVMBuildAdd(ctx->ac.builder, voffset, vertexidx, "");
334          voffset = LLVMBuildMul(ctx->ac.builder, voffset, LLVMConstInt(ctx->ac.i32, 4, false), "");
335 
336          out_val = ac_to_integer(&ctx->ac, out_val);
337          out_val = LLVMBuildZExtOrBitCast(ctx->ac.builder, out_val, ctx->ac.i32, "");
338 
339          ac_build_buffer_store_dword(&ctx->ac, ctx->gsvs_ring[stream], out_val, 1, voffset,
340                                      ac_get_arg(&ctx->ac, ctx->args->ac.gs2vs_offset), 0,
341                                      ac_glc | ac_slc | ac_swizzled);
342       }
343    }
344 
345    ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_EMIT | AC_SENDMSG_GS | (stream << 8),
346                     ctx->gs_wave_id);
347 }
348 
349 static void
visit_end_primitive(struct ac_shader_abi * abi,unsigned stream)350 visit_end_primitive(struct ac_shader_abi *abi, unsigned stream)
351 {
352    struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);
353 
354    if (ctx->args->shader_info->is_ngg) {
355       LLVMBuildStore(ctx->ac.builder, ctx->ac.i32_0, ctx->gs_curprim_verts[stream]);
356       return;
357    }
358 
359    ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_CUT | AC_SENDMSG_GS | (stream << 8),
360                     ctx->gs_wave_id);
361 }
362 
363 static LLVMValueRef
load_ring_tess_factors(struct ac_shader_abi * abi)364 load_ring_tess_factors(struct ac_shader_abi *abi)
365 {
366    struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);
367    assert(ctx->stage == MESA_SHADER_TESS_CTRL);
368 
369    return ctx->hs_ring_tess_factor;
370 }
371 
372 static LLVMValueRef
load_ring_tess_offchip(struct ac_shader_abi * abi)373 load_ring_tess_offchip(struct ac_shader_abi *abi)
374 {
375    struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);
376    assert(ctx->stage == MESA_SHADER_TESS_CTRL || ctx->stage == MESA_SHADER_TESS_EVAL);
377 
378    return ctx->hs_ring_tess_offchip;
379 }
380 
381 static LLVMValueRef
load_ring_esgs(struct ac_shader_abi * abi)382 load_ring_esgs(struct ac_shader_abi *abi)
383 {
384    struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);
385    assert(ctx->stage == MESA_SHADER_VERTEX || ctx->stage == MESA_SHADER_TESS_EVAL ||
386           ctx->stage == MESA_SHADER_GEOMETRY);
387 
388    return ctx->esgs_ring;
389 }
390 
391 static LLVMValueRef
radv_load_base_vertex(struct ac_shader_abi * abi,bool non_indexed_is_zero)392 radv_load_base_vertex(struct ac_shader_abi *abi, bool non_indexed_is_zero)
393 {
394    struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);
395    return ac_get_arg(&ctx->ac, ctx->args->ac.base_vertex);
396 }
397 
398 static LLVMValueRef
get_desc_ptr(struct radv_shader_context * ctx,LLVMValueRef ptr,bool non_uniform)399 get_desc_ptr(struct radv_shader_context *ctx, LLVMValueRef ptr, bool non_uniform)
400 {
401    LLVMValueRef set_ptr = ac_llvm_extract_elem(&ctx->ac, ptr, 0);
402    LLVMValueRef offset = ac_llvm_extract_elem(&ctx->ac, ptr, 1);
403    ptr = LLVMBuildNUWAdd(ctx->ac.builder, set_ptr, offset, "");
404 
405    unsigned addr_space = AC_ADDR_SPACE_CONST_32BIT;
406    if (non_uniform) {
407       /* 32-bit seems to always use SMEM. addrspacecast from 32-bit -> 64-bit is broken. */
408       LLVMValueRef dwords[] = {ptr,
409                                LLVMConstInt(ctx->ac.i32, ctx->args->options->address32_hi, false)};
410       ptr = ac_build_gather_values(&ctx->ac, dwords, 2);
411       ptr = LLVMBuildBitCast(ctx->ac.builder, ptr, ctx->ac.i64, "");
412       addr_space = AC_ADDR_SPACE_CONST;
413    }
414    return LLVMBuildIntToPtr(ctx->ac.builder, ptr, LLVMPointerType(ctx->ac.v4i32, addr_space), "");
415 }
416 
417 static LLVMValueRef
radv_load_ssbo(struct ac_shader_abi * abi,LLVMValueRef buffer_ptr,bool write,bool non_uniform)418 radv_load_ssbo(struct ac_shader_abi *abi, LLVMValueRef buffer_ptr, bool write, bool non_uniform)
419 {
420    struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);
421    LLVMValueRef result;
422 
423    buffer_ptr = get_desc_ptr(ctx, buffer_ptr, non_uniform);
424    if (!non_uniform)
425       LLVMSetMetadata(buffer_ptr, ctx->ac.uniform_md_kind, ctx->ac.empty_md);
426 
427    result = LLVMBuildLoad(ctx->ac.builder, buffer_ptr, "");
428    LLVMSetMetadata(result, ctx->ac.invariant_load_md_kind, ctx->ac.empty_md);
429    LLVMSetAlignment(result, 4);
430 
431    return result;
432 }
433 
434 static LLVMValueRef
radv_load_ubo(struct ac_shader_abi * abi,unsigned desc_set,unsigned binding,bool valid_binding,LLVMValueRef buffer_ptr)435 radv_load_ubo(struct ac_shader_abi *abi, unsigned desc_set, unsigned binding, bool valid_binding,
436               LLVMValueRef buffer_ptr)
437 {
438    struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);
439    LLVMValueRef result;
440 
441    if (valid_binding) {
442       struct radv_pipeline_layout *pipeline_layout = ctx->args->options->layout;
443       struct radv_descriptor_set_layout *layout = pipeline_layout->set[desc_set].layout;
444 
445       if (layout->binding[binding].type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT) {
446          LLVMValueRef set_ptr = ac_llvm_extract_elem(&ctx->ac, buffer_ptr, 0);
447          LLVMValueRef offset = ac_llvm_extract_elem(&ctx->ac, buffer_ptr, 1);
448          buffer_ptr = LLVMBuildNUWAdd(ctx->ac.builder, set_ptr, offset, "");
449 
450          uint32_t desc_type =
451             S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
452             S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);
453 
454          if (ctx->ac.chip_class >= GFX10) {
455             desc_type |= S_008F0C_FORMAT(V_008F0C_GFX10_FORMAT_32_FLOAT) |
456                          S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) | S_008F0C_RESOURCE_LEVEL(1);
457          } else {
458             desc_type |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
459                          S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
460          }
461 
462          LLVMValueRef desc_components[4] = {
463             LLVMBuildPtrToInt(ctx->ac.builder, buffer_ptr, ctx->ac.intptr, ""),
464             LLVMConstInt(ctx->ac.i32, S_008F04_BASE_ADDRESS_HI(ctx->args->options->address32_hi),
465                          false),
466             LLVMConstInt(ctx->ac.i32, 0xffffffff, false),
467             LLVMConstInt(ctx->ac.i32, desc_type, false),
468          };
469 
470          return ac_build_gather_values(&ctx->ac, desc_components, 4);
471       }
472    }
473 
474    buffer_ptr = get_desc_ptr(ctx, buffer_ptr, false);
475    LLVMSetMetadata(buffer_ptr, ctx->ac.uniform_md_kind, ctx->ac.empty_md);
476 
477    result = LLVMBuildLoad(ctx->ac.builder, buffer_ptr, "");
478    LLVMSetMetadata(result, ctx->ac.invariant_load_md_kind, ctx->ac.empty_md);
479    LLVMSetAlignment(result, 4);
480 
481    return result;
482 }
483 
484 static LLVMValueRef
radv_get_sampler_desc(struct ac_shader_abi * abi,unsigned descriptor_set,unsigned base_index,unsigned constant_index,LLVMValueRef index,enum ac_descriptor_type desc_type,bool image,bool write,bool bindless)485 radv_get_sampler_desc(struct ac_shader_abi *abi, unsigned descriptor_set, unsigned base_index,
486                       unsigned constant_index, LLVMValueRef index,
487                       enum ac_descriptor_type desc_type, bool image, bool write, bool bindless)
488 {
489    struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);
490    LLVMValueRef list = ctx->descriptor_sets[descriptor_set];
491    struct radv_descriptor_set_layout *layout =
492       ctx->args->options->layout->set[descriptor_set].layout;
493    struct radv_descriptor_set_binding_layout *binding = layout->binding + base_index;
494    unsigned offset = binding->offset;
495    unsigned stride = binding->size;
496    unsigned type_size;
497    LLVMBuilderRef builder = ctx->ac.builder;
498    LLVMTypeRef type;
499 
500    assert(base_index < layout->binding_count);
501 
502    if (binding->type == VK_DESCRIPTOR_TYPE_STORAGE_IMAGE && desc_type == AC_DESC_FMASK)
503       return NULL;
504 
505    switch (desc_type) {
506    case AC_DESC_IMAGE:
507       type = ctx->ac.v8i32;
508       type_size = 32;
509       break;
510    case AC_DESC_FMASK:
511       type = ctx->ac.v8i32;
512       offset += 32;
513       type_size = 32;
514       break;
515    case AC_DESC_SAMPLER:
516       type = ctx->ac.v4i32;
517       if (binding->type == VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER) {
518          offset += radv_combined_image_descriptor_sampler_offset(binding);
519       }
520 
521       type_size = 16;
522       break;
523    case AC_DESC_BUFFER:
524       type = ctx->ac.v4i32;
525       type_size = 16;
526       break;
527    case AC_DESC_PLANE_0:
528    case AC_DESC_PLANE_1:
529    case AC_DESC_PLANE_2:
530       type = ctx->ac.v8i32;
531       type_size = 32;
532       offset += 32 * (desc_type - AC_DESC_PLANE_0);
533       break;
534    default:
535       unreachable("invalid desc_type\n");
536    }
537 
538    offset += constant_index * stride;
539 
540    if (desc_type == AC_DESC_SAMPLER && binding->immutable_samplers_offset &&
541        (!index || binding->immutable_samplers_equal)) {
542       if (binding->immutable_samplers_equal)
543          constant_index = 0;
544 
545       const uint32_t *samplers = radv_immutable_samplers(layout, binding);
546 
547       LLVMValueRef constants[] = {
548          LLVMConstInt(ctx->ac.i32, samplers[constant_index * 4 + 0], 0),
549          LLVMConstInt(ctx->ac.i32, samplers[constant_index * 4 + 1], 0),
550          LLVMConstInt(ctx->ac.i32, samplers[constant_index * 4 + 2], 0),
551          LLVMConstInt(ctx->ac.i32, samplers[constant_index * 4 + 3], 0),
552       };
553       return ac_build_gather_values(&ctx->ac, constants, 4);
554    }
555 
556    assert(stride % type_size == 0);
557 
558    LLVMValueRef adjusted_index = index;
559    if (!adjusted_index)
560       adjusted_index = ctx->ac.i32_0;
561 
562    adjusted_index =
563       LLVMBuildMul(builder, adjusted_index, LLVMConstInt(ctx->ac.i32, stride / type_size, 0), "");
564 
565    LLVMValueRef val_offset = LLVMConstInt(ctx->ac.i32, offset, 0);
566    list = LLVMBuildGEP(builder, list, &val_offset, 1, "");
567    list = LLVMBuildPointerCast(builder, list, ac_array_in_const32_addr_space(type), "");
568 
569    LLVMValueRef descriptor = ac_build_load_to_sgpr(&ctx->ac, list, adjusted_index);
570 
571    /* 3 plane formats always have same size and format for plane 1 & 2, so
572     * use the tail from plane 1 so that we can store only the first 16 bytes
573     * of the last plane. */
574    if (desc_type == AC_DESC_PLANE_2) {
575       LLVMValueRef descriptor2 =
576          radv_get_sampler_desc(abi, descriptor_set, base_index, constant_index, index,
577                                AC_DESC_PLANE_1, image, write, bindless);
578 
579       LLVMValueRef components[8];
580       for (unsigned i = 0; i < 4; ++i)
581          components[i] = ac_llvm_extract_elem(&ctx->ac, descriptor, i);
582 
583       for (unsigned i = 4; i < 8; ++i)
584          components[i] = ac_llvm_extract_elem(&ctx->ac, descriptor2, i);
585       descriptor = ac_build_gather_values(&ctx->ac, components, 8);
586    } else if (desc_type == AC_DESC_IMAGE &&
587               ctx->args->options->has_image_load_dcc_bug &&
588               image && !write) {
589       LLVMValueRef components[8];
590 
591       for (unsigned i = 0; i < 8; i++)
592          components[i] = ac_llvm_extract_elem(&ctx->ac, descriptor, i);
593 
594       /* WRITE_COMPRESS_ENABLE must be 0 for all image loads to workaround a hardware bug. */
595       components[6] = LLVMBuildAnd(ctx->ac.builder, components[6],
596                                    LLVMConstInt(ctx->ac.i32, C_00A018_WRITE_COMPRESS_ENABLE, false), "");
597 
598       descriptor = ac_build_gather_values(&ctx->ac, components, 8);
599    }
600 
601    return descriptor;
602 }
603 
604 /* For 2_10_10_10 formats the alpha is handled as unsigned by pre-vega HW.
605  * so we may need to fix it up. */
606 static LLVMValueRef
adjust_vertex_fetch_alpha(struct radv_shader_context * ctx,unsigned adjustment,LLVMValueRef alpha)607 adjust_vertex_fetch_alpha(struct radv_shader_context *ctx, unsigned adjustment, LLVMValueRef alpha)
608 {
609    if (adjustment == ALPHA_ADJUST_NONE)
610       return alpha;
611 
612    LLVMValueRef c30 = LLVMConstInt(ctx->ac.i32, 30, 0);
613 
614    alpha = LLVMBuildBitCast(ctx->ac.builder, alpha, ctx->ac.f32, "");
615 
616    if (adjustment == ALPHA_ADJUST_SSCALED)
617       alpha = LLVMBuildFPToUI(ctx->ac.builder, alpha, ctx->ac.i32, "");
618    else
619       alpha = ac_to_integer(&ctx->ac, alpha);
620 
621    /* For the integer-like cases, do a natural sign extension.
622     *
623     * For the SNORM case, the values are 0.0, 0.333, 0.666, 1.0
624     * and happen to contain 0, 1, 2, 3 as the two LSBs of the
625     * exponent.
626     */
627    alpha =
628       LLVMBuildShl(ctx->ac.builder, alpha,
629                    adjustment == ALPHA_ADJUST_SNORM ? LLVMConstInt(ctx->ac.i32, 7, 0) : c30, "");
630    alpha = LLVMBuildAShr(ctx->ac.builder, alpha, c30, "");
631 
632    /* Convert back to the right type. */
633    if (adjustment == ALPHA_ADJUST_SNORM) {
634       LLVMValueRef clamp;
635       LLVMValueRef neg_one = LLVMConstReal(ctx->ac.f32, -1.0);
636       alpha = LLVMBuildSIToFP(ctx->ac.builder, alpha, ctx->ac.f32, "");
637       clamp = LLVMBuildFCmp(ctx->ac.builder, LLVMRealULT, alpha, neg_one, "");
638       alpha = LLVMBuildSelect(ctx->ac.builder, clamp, neg_one, alpha, "");
639    } else if (adjustment == ALPHA_ADJUST_SSCALED) {
640       alpha = LLVMBuildSIToFP(ctx->ac.builder, alpha, ctx->ac.f32, "");
641    }
642 
643    return LLVMBuildBitCast(ctx->ac.builder, alpha, ctx->ac.i32, "");
644 }
645 
646 static LLVMValueRef
radv_fixup_vertex_input_fetches(struct radv_shader_context * ctx,LLVMValueRef value,unsigned num_channels,bool is_float)647 radv_fixup_vertex_input_fetches(struct radv_shader_context *ctx, LLVMValueRef value,
648                                 unsigned num_channels, bool is_float)
649 {
650    LLVMValueRef zero = is_float ? ctx->ac.f32_0 : ctx->ac.i32_0;
651    LLVMValueRef one = is_float ? ctx->ac.f32_1 : ctx->ac.i32_1;
652    LLVMValueRef chan[4];
653 
654    if (LLVMGetTypeKind(LLVMTypeOf(value)) == LLVMVectorTypeKind) {
655       unsigned vec_size = LLVMGetVectorSize(LLVMTypeOf(value));
656 
657       if (num_channels == 4 && num_channels == vec_size)
658          return value;
659 
660       num_channels = MIN2(num_channels, vec_size);
661 
662       for (unsigned i = 0; i < num_channels; i++)
663          chan[i] = ac_llvm_extract_elem(&ctx->ac, value, i);
664    } else {
665       assert(num_channels == 1);
666       chan[0] = value;
667    }
668 
669    for (unsigned i = num_channels; i < 4; i++) {
670       chan[i] = i == 3 ? one : zero;
671       chan[i] = ac_to_integer(&ctx->ac, chan[i]);
672    }
673 
674    return ac_build_gather_values(&ctx->ac, chan, 4);
675 }
676 
677 static void
load_vs_input(struct radv_shader_context * ctx,unsigned driver_location,LLVMTypeRef dest_type,LLVMValueRef out[4])678 load_vs_input(struct radv_shader_context *ctx, unsigned driver_location, LLVMTypeRef dest_type,
679               LLVMValueRef out[4])
680 {
681    LLVMValueRef t_list_ptr = ac_get_arg(&ctx->ac, ctx->args->ac.vertex_buffers);
682    LLVMValueRef t_offset;
683    LLVMValueRef t_list;
684    LLVMValueRef input;
685    LLVMValueRef buffer_index;
686    unsigned attrib_index = driver_location - VERT_ATTRIB_GENERIC0;
687    unsigned attrib_format = ctx->args->options->key.vs.vertex_attribute_formats[attrib_index];
688    unsigned data_format = attrib_format & 0x0f;
689    unsigned num_format = (attrib_format >> 4) & 0x07;
690    bool is_float =
691       num_format != V_008F0C_BUF_NUM_FORMAT_UINT && num_format != V_008F0C_BUF_NUM_FORMAT_SINT;
692    uint8_t input_usage_mask =
693       ctx->args->shader_info->vs.input_usage_mask[driver_location];
694    unsigned num_input_channels = util_last_bit(input_usage_mask);
695 
696    if (ctx->args->options->key.vs.instance_rate_inputs & (1u << attrib_index)) {
697       uint32_t divisor = ctx->args->options->key.vs.instance_rate_divisors[attrib_index];
698 
699       if (divisor) {
700          buffer_index = ctx->abi.instance_id;
701 
702          if (divisor != 1) {
703             buffer_index = LLVMBuildUDiv(ctx->ac.builder, buffer_index,
704                                          LLVMConstInt(ctx->ac.i32, divisor, 0), "");
705          }
706       } else {
707          buffer_index = ctx->ac.i32_0;
708       }
709 
710       buffer_index = LLVMBuildAdd(
711          ctx->ac.builder, ac_get_arg(&ctx->ac, ctx->args->ac.start_instance), buffer_index, "");
712    } else {
713       buffer_index = LLVMBuildAdd(ctx->ac.builder, ctx->abi.vertex_id,
714                                   ac_get_arg(&ctx->ac, ctx->args->ac.base_vertex), "");
715    }
716 
717    const struct ac_data_format_info *vtx_info = ac_get_data_format_info(data_format);
718 
719    /* Adjust the number of channels to load based on the vertex attribute format. */
720    unsigned num_channels = MIN2(num_input_channels, vtx_info->num_channels);
721    unsigned attrib_binding = ctx->args->options->key.vs.vertex_attribute_bindings[attrib_index];
722    unsigned attrib_offset = ctx->args->options->key.vs.vertex_attribute_offsets[attrib_index];
723    unsigned attrib_stride = ctx->args->options->key.vs.vertex_attribute_strides[attrib_index];
724    unsigned alpha_adjust = ctx->args->options->key.vs.vertex_alpha_adjust[attrib_index];
725 
726    if (ctx->args->options->key.vs.vertex_post_shuffle & (1 << attrib_index)) {
727       /* Always load, at least, 3 channels for formats that need to be shuffled because X<->Z. */
728       num_channels = MAX2(num_channels, 3);
729    }
730 
731    unsigned desc_index =
732       ctx->args->shader_info->vs.use_per_attribute_vb_descs ? attrib_index : attrib_binding;
733    desc_index = util_bitcount(ctx->args->shader_info->vs.vb_desc_usage_mask &
734                               u_bit_consecutive(0, desc_index));
735    t_offset = LLVMConstInt(ctx->ac.i32, desc_index, false);
736    t_list = ac_build_load_to_sgpr(&ctx->ac, t_list_ptr, t_offset);
737 
738    /* Always split typed vertex buffer loads on GFX6 and GFX10+ to avoid any alignment issues that
739     * triggers memory violations and eventually a GPU hang. This can happen if the stride (static or
740     * dynamic) is unaligned and also if the VBO offset is aligned to a scalar (eg. stride is 8 and
741     * VBO offset is 2 for R16G16B16A16_SNORM).
742     */
743    if (ctx->ac.chip_class == GFX6 || ctx->ac.chip_class >= GFX10) {
744       unsigned chan_format = vtx_info->chan_format;
745       LLVMValueRef values[4];
746 
747       assert(ctx->ac.chip_class == GFX6 || ctx->ac.chip_class >= GFX10);
748 
749       for (unsigned chan = 0; chan < num_channels; chan++) {
750          unsigned chan_offset = attrib_offset + chan * vtx_info->chan_byte_size;
751          LLVMValueRef chan_index = buffer_index;
752 
753          if (attrib_stride != 0 && chan_offset > attrib_stride) {
754             LLVMValueRef buffer_offset =
755                LLVMConstInt(ctx->ac.i32, chan_offset / attrib_stride, false);
756 
757             chan_index = LLVMBuildAdd(ctx->ac.builder, buffer_index, buffer_offset, "");
758 
759             chan_offset = chan_offset % attrib_stride;
760          }
761 
762          values[chan] = ac_build_struct_tbuffer_load(
763             &ctx->ac, t_list, chan_index, LLVMConstInt(ctx->ac.i32, chan_offset, false),
764             ctx->ac.i32_0, ctx->ac.i32_0, 1, chan_format, num_format, 0, true);
765       }
766 
767       input = ac_build_gather_values(&ctx->ac, values, num_channels);
768    } else {
769       if (attrib_stride != 0 && attrib_offset > attrib_stride) {
770          LLVMValueRef buffer_offset =
771             LLVMConstInt(ctx->ac.i32, attrib_offset / attrib_stride, false);
772 
773          buffer_index = LLVMBuildAdd(ctx->ac.builder, buffer_index, buffer_offset, "");
774 
775          attrib_offset = attrib_offset % attrib_stride;
776       }
777 
778       input = ac_build_struct_tbuffer_load(
779          &ctx->ac, t_list, buffer_index, LLVMConstInt(ctx->ac.i32, attrib_offset, false),
780          ctx->ac.i32_0, ctx->ac.i32_0, num_channels, data_format, num_format, 0, true);
781    }
782 
783    if (ctx->args->options->key.vs.vertex_post_shuffle & (1 << attrib_index)) {
784       LLVMValueRef c[4];
785       c[0] = ac_llvm_extract_elem(&ctx->ac, input, 2);
786       c[1] = ac_llvm_extract_elem(&ctx->ac, input, 1);
787       c[2] = ac_llvm_extract_elem(&ctx->ac, input, 0);
788       c[3] = ac_llvm_extract_elem(&ctx->ac, input, 3);
789 
790       input = ac_build_gather_values(&ctx->ac, c, 4);
791    }
792 
793    input = radv_fixup_vertex_input_fetches(ctx, input, num_channels, is_float);
794 
795    for (unsigned chan = 0; chan < 4; chan++) {
796       LLVMValueRef llvm_chan = LLVMConstInt(ctx->ac.i32, chan, false);
797       out[chan] = LLVMBuildExtractElement(ctx->ac.builder, input, llvm_chan, "");
798       if (dest_type == ctx->ac.i16 && is_float) {
799          out[chan] = LLVMBuildBitCast(ctx->ac.builder, out[chan], ctx->ac.f32, "");
800          out[chan] = LLVMBuildFPTrunc(ctx->ac.builder, out[chan], ctx->ac.f16, "");
801       }
802    }
803 
804    out[3] = adjust_vertex_fetch_alpha(ctx, alpha_adjust, out[3]);
805 
806    for (unsigned chan = 0; chan < 4; chan++) {
807       out[chan] = ac_to_integer(&ctx->ac, out[chan]);
808       if (dest_type == ctx->ac.i16 && !is_float)
809          out[chan] = LLVMBuildTrunc(ctx->ac.builder, out[chan], ctx->ac.i16, "");
810    }
811 }
812 
813 static LLVMValueRef
radv_load_vs_inputs(struct ac_shader_abi * abi,unsigned driver_location,unsigned component,unsigned num_components,unsigned vertex_index,LLVMTypeRef type)814 radv_load_vs_inputs(struct ac_shader_abi *abi, unsigned driver_location, unsigned component,
815                     unsigned num_components, unsigned vertex_index, LLVMTypeRef type)
816 {
817    struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);
818    LLVMValueRef values[4];
819 
820    load_vs_input(ctx, driver_location, type, values);
821 
822    for (unsigned i = 0; i < 4; i++)
823       values[i] = LLVMBuildBitCast(ctx->ac.builder, values[i], type, "");
824 
825    return ac_build_varying_gather_values(&ctx->ac, values, num_components, component);
826 }
827 
828 static void
prepare_interp_optimize(struct radv_shader_context * ctx,struct nir_shader * nir)829 prepare_interp_optimize(struct radv_shader_context *ctx, struct nir_shader *nir)
830 {
831    bool uses_center = false;
832    bool uses_centroid = false;
833    nir_foreach_shader_in_variable (variable, nir) {
834       if (glsl_get_base_type(glsl_without_array(variable->type)) != GLSL_TYPE_FLOAT ||
835           variable->data.sample)
836          continue;
837 
838       if (variable->data.centroid)
839          uses_centroid = true;
840       else
841          uses_center = true;
842    }
843 
844    ctx->abi.persp_centroid = ac_get_arg(&ctx->ac, ctx->args->ac.persp_centroid);
845    ctx->abi.linear_centroid = ac_get_arg(&ctx->ac, ctx->args->ac.linear_centroid);
846 
847    if (uses_center && uses_centroid) {
848       LLVMValueRef sel =
849          LLVMBuildICmp(ctx->ac.builder, LLVMIntSLT, ac_get_arg(&ctx->ac, ctx->args->ac.prim_mask),
850                        ctx->ac.i32_0, "");
851       ctx->abi.persp_centroid =
852          LLVMBuildSelect(ctx->ac.builder, sel, ac_get_arg(&ctx->ac, ctx->args->ac.persp_center),
853                          ctx->abi.persp_centroid, "");
854       ctx->abi.linear_centroid =
855          LLVMBuildSelect(ctx->ac.builder, sel, ac_get_arg(&ctx->ac, ctx->args->ac.linear_center),
856                          ctx->abi.linear_centroid, "");
857    }
858 }
859 
860 static void
scan_shader_output_decl(struct radv_shader_context * ctx,struct nir_variable * variable,struct nir_shader * shader,gl_shader_stage stage)861 scan_shader_output_decl(struct radv_shader_context *ctx, struct nir_variable *variable,
862                         struct nir_shader *shader, gl_shader_stage stage)
863 {
864    int idx = variable->data.driver_location;
865    unsigned attrib_count = glsl_count_attribute_slots(variable->type, false);
866    uint64_t mask_attribs;
867 
868    if (variable->data.compact) {
869       unsigned component_count = variable->data.location_frac + glsl_get_length(variable->type);
870       attrib_count = (component_count + 3) / 4;
871    }
872 
873    mask_attribs = ((1ull << attrib_count) - 1) << idx;
874 
875    ctx->output_mask |= mask_attribs;
876 }
877 
878 /* Initialize arguments for the shader export intrinsic */
879 static void
si_llvm_init_export_args(struct radv_shader_context * ctx,LLVMValueRef * values,unsigned enabled_channels,unsigned target,struct ac_export_args * args)880 si_llvm_init_export_args(struct radv_shader_context *ctx, LLVMValueRef *values,
881                          unsigned enabled_channels, unsigned target, struct ac_export_args *args)
882 {
883    /* Specify the channels that are enabled. */
884    args->enabled_channels = enabled_channels;
885 
886    /* Specify whether the EXEC mask represents the valid mask */
887    args->valid_mask = 0;
888 
889    /* Specify whether this is the last export */
890    args->done = 0;
891 
892    /* Specify the target we are exporting */
893    args->target = target;
894 
895    args->compr = false;
896    args->out[0] = LLVMGetUndef(ctx->ac.f32);
897    args->out[1] = LLVMGetUndef(ctx->ac.f32);
898    args->out[2] = LLVMGetUndef(ctx->ac.f32);
899    args->out[3] = LLVMGetUndef(ctx->ac.f32);
900 
901    if (!values)
902       return;
903 
904    bool is_16bit = ac_get_type_size(LLVMTypeOf(values[0])) == 2;
905    if (ctx->stage == MESA_SHADER_FRAGMENT) {
906       unsigned index = target - V_008DFC_SQ_EXP_MRT;
907       unsigned col_format = (ctx->args->options->key.ps.col_format >> (4 * index)) & 0xf;
908       bool is_int8 = (ctx->args->options->key.ps.is_int8 >> index) & 1;
909       bool is_int10 = (ctx->args->options->key.ps.is_int10 >> index) & 1;
910 
911       LLVMValueRef (*packf)(struct ac_llvm_context * ctx, LLVMValueRef args[2]) = NULL;
912       LLVMValueRef (*packi)(struct ac_llvm_context * ctx, LLVMValueRef args[2], unsigned bits,
913                             bool hi) = NULL;
914 
915       switch (col_format) {
916       case V_028714_SPI_SHADER_ZERO:
917          args->enabled_channels = 0; /* writemask */
918          args->target = V_008DFC_SQ_EXP_NULL;
919          break;
920 
921       case V_028714_SPI_SHADER_32_R:
922          args->enabled_channels = 1;
923          args->out[0] = values[0];
924          break;
925 
926       case V_028714_SPI_SHADER_32_GR:
927          args->enabled_channels = 0x3;
928          args->out[0] = values[0];
929          args->out[1] = values[1];
930          break;
931 
932       case V_028714_SPI_SHADER_32_AR:
933          if (ctx->ac.chip_class >= GFX10) {
934             args->enabled_channels = 0x3;
935             args->out[0] = values[0];
936             args->out[1] = values[3];
937          } else {
938             args->enabled_channels = 0x9;
939             args->out[0] = values[0];
940             args->out[3] = values[3];
941          }
942          break;
943 
944       case V_028714_SPI_SHADER_FP16_ABGR:
945          args->enabled_channels = 0xf;
946          packf = ac_build_cvt_pkrtz_f16;
947          if (is_16bit) {
948             for (unsigned chan = 0; chan < 4; chan++)
949                values[chan] = LLVMBuildFPExt(ctx->ac.builder, values[chan], ctx->ac.f32, "");
950          }
951          break;
952 
953       case V_028714_SPI_SHADER_UNORM16_ABGR:
954          args->enabled_channels = 0xf;
955          packf = ac_build_cvt_pknorm_u16;
956          break;
957 
958       case V_028714_SPI_SHADER_SNORM16_ABGR:
959          args->enabled_channels = 0xf;
960          packf = ac_build_cvt_pknorm_i16;
961          break;
962 
963       case V_028714_SPI_SHADER_UINT16_ABGR:
964          args->enabled_channels = 0xf;
965          packi = ac_build_cvt_pk_u16;
966          if (is_16bit) {
967             for (unsigned chan = 0; chan < 4; chan++)
968                values[chan] = LLVMBuildZExt(ctx->ac.builder, ac_to_integer(&ctx->ac, values[chan]),
969                                             ctx->ac.i32, "");
970          }
971          break;
972 
973       case V_028714_SPI_SHADER_SINT16_ABGR:
974          args->enabled_channels = 0xf;
975          packi = ac_build_cvt_pk_i16;
976          if (is_16bit) {
977             for (unsigned chan = 0; chan < 4; chan++)
978                values[chan] = LLVMBuildSExt(ctx->ac.builder, ac_to_integer(&ctx->ac, values[chan]),
979                                             ctx->ac.i32, "");
980          }
981          break;
982 
983       default:
984       case V_028714_SPI_SHADER_32_ABGR:
985          memcpy(&args->out[0], values, sizeof(values[0]) * 4);
986          break;
987       }
988 
989       /* Replace NaN by zero (only 32-bit) to fix game bugs if
990        * requested.
991        */
992       if (ctx->args->options->enable_mrt_output_nan_fixup && !is_16bit &&
993           (col_format == V_028714_SPI_SHADER_32_R || col_format == V_028714_SPI_SHADER_32_GR ||
994            col_format == V_028714_SPI_SHADER_32_AR || col_format == V_028714_SPI_SHADER_32_ABGR ||
995            col_format == V_028714_SPI_SHADER_FP16_ABGR)) {
996          for (unsigned i = 0; i < 4; i++) {
997             LLVMValueRef class_args[2] = {values[i],
998                                           LLVMConstInt(ctx->ac.i32, S_NAN | Q_NAN, false)};
999             LLVMValueRef isnan = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.class.f32", ctx->ac.i1,
1000                                                     class_args, 2, AC_FUNC_ATTR_READNONE);
1001             values[i] = LLVMBuildSelect(ctx->ac.builder, isnan, ctx->ac.f32_0, values[i], "");
1002          }
1003       }
1004 
1005       /* Pack f16 or norm_i16/u16. */
1006       if (packf) {
1007          for (unsigned chan = 0; chan < 2; chan++) {
1008             LLVMValueRef pack_args[2] = {values[2 * chan], values[2 * chan + 1]};
1009             LLVMValueRef packed;
1010 
1011             packed = packf(&ctx->ac, pack_args);
1012             args->out[chan] = ac_to_float(&ctx->ac, packed);
1013          }
1014          args->compr = 1; /* COMPR flag */
1015       }
1016 
1017       /* Pack i16/u16. */
1018       if (packi) {
1019          for (unsigned chan = 0; chan < 2; chan++) {
1020             LLVMValueRef pack_args[2] = {ac_to_integer(&ctx->ac, values[2 * chan]),
1021                                          ac_to_integer(&ctx->ac, values[2 * chan + 1])};
1022             LLVMValueRef packed;
1023 
1024             packed = packi(&ctx->ac, pack_args, is_int8 ? 8 : is_int10 ? 10 : 16, chan == 1);
1025             args->out[chan] = ac_to_float(&ctx->ac, packed);
1026          }
1027          args->compr = 1; /* COMPR flag */
1028       }
1029       return;
1030    }
1031 
1032    if (is_16bit) {
1033       for (unsigned chan = 0; chan < 4; chan++) {
1034          values[chan] = LLVMBuildBitCast(ctx->ac.builder, values[chan], ctx->ac.i16, "");
1035          args->out[chan] = LLVMBuildZExt(ctx->ac.builder, values[chan], ctx->ac.i32, "");
1036       }
1037    } else
1038       memcpy(&args->out[0], values, sizeof(values[0]) * 4);
1039 
1040    for (unsigned i = 0; i < 4; ++i)
1041       args->out[i] = ac_to_float(&ctx->ac, args->out[i]);
1042 }
1043 
1044 static void
radv_export_param(struct radv_shader_context * ctx,unsigned index,LLVMValueRef * values,unsigned enabled_channels)1045 radv_export_param(struct radv_shader_context *ctx, unsigned index, LLVMValueRef *values,
1046                   unsigned enabled_channels)
1047 {
1048    struct ac_export_args args;
1049 
1050    si_llvm_init_export_args(ctx, values, enabled_channels, V_008DFC_SQ_EXP_PARAM + index, &args);
1051    ac_build_export(&ctx->ac, &args);
1052 }
1053 
1054 static LLVMValueRef
radv_load_output(struct radv_shader_context * ctx,unsigned index,unsigned chan)1055 radv_load_output(struct radv_shader_context *ctx, unsigned index, unsigned chan)
1056 {
1057    LLVMValueRef output = ctx->abi.outputs[ac_llvm_reg_index_soa(index, chan)];
1058    return LLVMBuildLoad(ctx->ac.builder, output, "");
1059 }
1060 
1061 static void
radv_emit_stream_output(struct radv_shader_context * ctx,LLVMValueRef const * so_buffers,LLVMValueRef const * so_write_offsets,const struct radv_stream_output * output,struct radv_shader_output_values * shader_out)1062 radv_emit_stream_output(struct radv_shader_context *ctx, LLVMValueRef const *so_buffers,
1063                         LLVMValueRef const *so_write_offsets,
1064                         const struct radv_stream_output *output,
1065                         struct radv_shader_output_values *shader_out)
1066 {
1067    unsigned num_comps = util_bitcount(output->component_mask);
1068    unsigned buf = output->buffer;
1069    unsigned offset = output->offset;
1070    unsigned start;
1071    LLVMValueRef out[4];
1072 
1073    assert(num_comps && num_comps <= 4);
1074    if (!num_comps || num_comps > 4)
1075       return;
1076 
1077    /* Get the first component. */
1078    start = ffs(output->component_mask) - 1;
1079 
1080    /* Load the output as int. */
1081    for (int i = 0; i < num_comps; i++) {
1082       out[i] = ac_to_integer(&ctx->ac, shader_out->values[start + i]);
1083    }
1084 
1085    /* Pack the output. */
1086    LLVMValueRef vdata = NULL;
1087 
1088    switch (num_comps) {
1089    case 1: /* as i32 */
1090       vdata = out[0];
1091       break;
1092    case 2: /* as v2i32 */
1093    case 3: /* as v4i32 (aligned to 4) */
1094       out[3] = LLVMGetUndef(ctx->ac.i32);
1095       FALLTHROUGH;
1096    case 4: /* as v4i32 */
1097       vdata = ac_build_gather_values(&ctx->ac, out,
1098                                      !ac_has_vec3_support(ctx->ac.chip_class, false)
1099                                         ? util_next_power_of_two(num_comps)
1100                                         : num_comps);
1101       break;
1102    }
1103 
1104    ac_build_buffer_store_dword(&ctx->ac, so_buffers[buf], vdata, num_comps, so_write_offsets[buf],
1105                                ctx->ac.i32_0, offset, ac_glc | ac_slc);
1106 }
1107 
1108 static void
radv_emit_streamout(struct radv_shader_context * ctx,unsigned stream)1109 radv_emit_streamout(struct radv_shader_context *ctx, unsigned stream)
1110 {
1111    int i;
1112 
1113    /* Get bits [22:16], i.e. (so_param >> 16) & 127; */
1114    assert(ctx->args->ac.streamout_config.used);
1115    LLVMValueRef so_vtx_count = ac_build_bfe(
1116       &ctx->ac, ac_get_arg(&ctx->ac, ctx->args->ac.streamout_config),
1117       LLVMConstInt(ctx->ac.i32, 16, false), LLVMConstInt(ctx->ac.i32, 7, false), false);
1118 
1119    LLVMValueRef tid = ac_get_thread_id(&ctx->ac);
1120 
1121    /* can_emit = tid < so_vtx_count; */
1122    LLVMValueRef can_emit = LLVMBuildICmp(ctx->ac.builder, LLVMIntULT, tid, so_vtx_count, "");
1123 
1124    /* Emit the streamout code conditionally. This actually avoids
1125     * out-of-bounds buffer access. The hw tells us via the SGPR
1126     * (so_vtx_count) which threads are allowed to emit streamout data.
1127     */
1128    ac_build_ifcc(&ctx->ac, can_emit, 6501);
1129    {
1130       /* The buffer offset is computed as follows:
1131        *   ByteOffset = streamout_offset[buffer_id]*4 +
1132        *                (streamout_write_index + thread_id)*stride[buffer_id] +
1133        *                attrib_offset
1134        */
1135       LLVMValueRef so_write_index = ac_get_arg(&ctx->ac, ctx->args->ac.streamout_write_index);
1136 
1137       /* Compute (streamout_write_index + thread_id). */
1138       so_write_index = LLVMBuildAdd(ctx->ac.builder, so_write_index, tid, "");
1139 
1140       /* Load the descriptor and compute the write offset for each
1141        * enabled buffer.
1142        */
1143       LLVMValueRef so_write_offset[4] = {0};
1144       LLVMValueRef so_buffers[4] = {0};
1145       LLVMValueRef buf_ptr = ac_get_arg(&ctx->ac, ctx->args->streamout_buffers);
1146 
1147       for (i = 0; i < 4; i++) {
1148          uint16_t stride = ctx->args->shader_info->so.strides[i];
1149 
1150          if (!stride)
1151             continue;
1152 
1153          LLVMValueRef offset = LLVMConstInt(ctx->ac.i32, i, false);
1154 
1155          so_buffers[i] = ac_build_load_to_sgpr(&ctx->ac, buf_ptr, offset);
1156 
1157          LLVMValueRef so_offset = ac_get_arg(&ctx->ac, ctx->args->ac.streamout_offset[i]);
1158 
1159          so_offset =
1160             LLVMBuildMul(ctx->ac.builder, so_offset, LLVMConstInt(ctx->ac.i32, 4, false), "");
1161 
1162          so_write_offset[i] = ac_build_imad(
1163             &ctx->ac, so_write_index, LLVMConstInt(ctx->ac.i32, stride * 4, false), so_offset);
1164       }
1165 
1166       /* Write streamout data. */
1167       for (i = 0; i < ctx->args->shader_info->so.num_outputs; i++) {
1168          struct radv_shader_output_values shader_out = {0};
1169          struct radv_stream_output *output = &ctx->args->shader_info->so.outputs[i];
1170 
1171          if (stream != output->stream)
1172             continue;
1173 
1174          for (int j = 0; j < 4; j++) {
1175             shader_out.values[j] = radv_load_output(ctx, output->location, j);
1176          }
1177 
1178          radv_emit_stream_output(ctx, so_buffers, so_write_offset, output, &shader_out);
1179       }
1180    }
1181    ac_build_endif(&ctx->ac, 6501);
1182 }
1183 
1184 static void
radv_build_param_exports(struct radv_shader_context * ctx,struct radv_shader_output_values * outputs,unsigned noutput,struct radv_vs_output_info * outinfo,bool export_clip_dists)1185 radv_build_param_exports(struct radv_shader_context *ctx, struct radv_shader_output_values *outputs,
1186                          unsigned noutput, struct radv_vs_output_info *outinfo,
1187                          bool export_clip_dists)
1188 {
1189    for (unsigned i = 0; i < noutput; i++) {
1190       unsigned slot_name = outputs[i].slot_name;
1191       unsigned usage_mask = outputs[i].usage_mask;
1192 
1193       if (slot_name != VARYING_SLOT_LAYER && slot_name != VARYING_SLOT_PRIMITIVE_ID &&
1194           slot_name != VARYING_SLOT_VIEWPORT && slot_name != VARYING_SLOT_CLIP_DIST0 &&
1195           slot_name != VARYING_SLOT_CLIP_DIST1 && slot_name < VARYING_SLOT_VAR0)
1196          continue;
1197 
1198       if ((slot_name == VARYING_SLOT_CLIP_DIST0 || slot_name == VARYING_SLOT_CLIP_DIST1) &&
1199           !export_clip_dists)
1200          continue;
1201 
1202       radv_export_param(ctx, outinfo->vs_output_param_offset[slot_name], outputs[i].values,
1203                         usage_mask);
1204    }
1205 }
1206 
1207 /* Generate export instructions for hardware VS shader stage or NGG GS stage
1208  * (position and parameter data only).
1209  */
1210 static void
radv_llvm_export_vs(struct radv_shader_context * ctx,struct radv_shader_output_values * outputs,unsigned noutput,struct radv_vs_output_info * outinfo,bool export_clip_dists)1211 radv_llvm_export_vs(struct radv_shader_context *ctx, struct radv_shader_output_values *outputs,
1212                     unsigned noutput, struct radv_vs_output_info *outinfo, bool export_clip_dists)
1213 {
1214    LLVMValueRef psize_value = NULL, layer_value = NULL, viewport_value = NULL;
1215    LLVMValueRef primitive_shading_rate = NULL;
1216    struct ac_export_args pos_args[4] = {0};
1217    unsigned pos_idx, index;
1218    int i;
1219 
1220    /* Build position exports */
1221    for (i = 0; i < noutput; i++) {
1222       switch (outputs[i].slot_name) {
1223       case VARYING_SLOT_POS:
1224          si_llvm_init_export_args(ctx, outputs[i].values, 0xf, V_008DFC_SQ_EXP_POS, &pos_args[0]);
1225          break;
1226       case VARYING_SLOT_PSIZ:
1227          psize_value = outputs[i].values[0];
1228          break;
1229       case VARYING_SLOT_LAYER:
1230          layer_value = outputs[i].values[0];
1231          break;
1232       case VARYING_SLOT_VIEWPORT:
1233          viewport_value = outputs[i].values[0];
1234          break;
1235       case VARYING_SLOT_PRIMITIVE_SHADING_RATE:
1236          primitive_shading_rate = outputs[i].values[0];
1237          break;
1238       case VARYING_SLOT_CLIP_DIST0:
1239       case VARYING_SLOT_CLIP_DIST1:
1240          index = 2 + outputs[i].slot_index;
1241          si_llvm_init_export_args(ctx, outputs[i].values, 0xf, V_008DFC_SQ_EXP_POS + index,
1242                                   &pos_args[index]);
1243          break;
1244       default:
1245          break;
1246       }
1247    }
1248 
1249    /* We need to add the position output manually if it's missing. */
1250    if (!pos_args[0].out[0]) {
1251       pos_args[0].enabled_channels = 0xf; /* writemask */
1252       pos_args[0].valid_mask = 0;         /* EXEC mask */
1253       pos_args[0].done = 0;               /* last export? */
1254       pos_args[0].target = V_008DFC_SQ_EXP_POS;
1255       pos_args[0].compr = 0;              /* COMPR flag */
1256       pos_args[0].out[0] = ctx->ac.f32_0; /* X */
1257       pos_args[0].out[1] = ctx->ac.f32_0; /* Y */
1258       pos_args[0].out[2] = ctx->ac.f32_0; /* Z */
1259       pos_args[0].out[3] = ctx->ac.f32_1; /* W */
1260    }
1261 
1262    bool writes_primitive_shading_rate = outinfo->writes_primitive_shading_rate ||
1263                                         ctx->args->options->force_vrs_rates;
1264 
1265    if (outinfo->writes_pointsize || outinfo->writes_layer || outinfo->writes_layer ||
1266        outinfo->writes_viewport_index || writes_primitive_shading_rate) {
1267       pos_args[1].enabled_channels = ((outinfo->writes_pointsize == true ? 1 : 0) |
1268                                       (writes_primitive_shading_rate == true ? 2 : 0) |
1269                                       (outinfo->writes_layer == true ? 4 : 0));
1270       pos_args[1].valid_mask = 0;
1271       pos_args[1].done = 0;
1272       pos_args[1].target = V_008DFC_SQ_EXP_POS + 1;
1273       pos_args[1].compr = 0;
1274       pos_args[1].out[0] = ctx->ac.f32_0; /* X */
1275       pos_args[1].out[1] = ctx->ac.f32_0; /* Y */
1276       pos_args[1].out[2] = ctx->ac.f32_0; /* Z */
1277       pos_args[1].out[3] = ctx->ac.f32_0; /* W */
1278 
1279       if (outinfo->writes_pointsize == true)
1280          pos_args[1].out[0] = psize_value;
1281       if (outinfo->writes_layer == true)
1282          pos_args[1].out[2] = layer_value;
1283       if (outinfo->writes_viewport_index == true) {
1284          if (ctx->args->options->chip_class >= GFX9) {
1285             /* GFX9 has the layer in out.z[10:0] and the viewport
1286              * index in out.z[19:16].
1287              */
1288             LLVMValueRef v = viewport_value;
1289             v = ac_to_integer(&ctx->ac, v);
1290             v = LLVMBuildShl(ctx->ac.builder, v, LLVMConstInt(ctx->ac.i32, 16, false), "");
1291             v = LLVMBuildOr(ctx->ac.builder, v, ac_to_integer(&ctx->ac, pos_args[1].out[2]), "");
1292 
1293             pos_args[1].out[2] = ac_to_float(&ctx->ac, v);
1294             pos_args[1].enabled_channels |= 1 << 2;
1295          } else {
1296             pos_args[1].out[3] = viewport_value;
1297             pos_args[1].enabled_channels |= 1 << 3;
1298          }
1299       }
1300 
1301       if (outinfo->writes_primitive_shading_rate) {
1302          pos_args[1].out[1] = primitive_shading_rate;
1303       } else if (ctx->args->options->force_vrs_rates) {
1304          /* Bits [2:3] = VRS rate X
1305           * Bits [4:5] = VRS rate Y
1306           *
1307           * The range is [-2, 1]. Values:
1308           *   1: 2x coarser shading rate in that direction.
1309           *   0: normal shading rate
1310           *  -1: 2x finer shading rate (sample shading, not directional)
1311           *  -2: 4x finer shading rate (sample shading, not directional)
1312           *
1313           * Sample shading can't go above 8 samples, so both numbers can't be -2 at the same time.
1314           */
1315          LLVMValueRef rates = LLVMConstInt(ctx->ac.i32, ctx->args->options->force_vrs_rates, false);
1316          LLVMValueRef cond;
1317          LLVMValueRef v;
1318 
1319          /* If Pos.W != 1 (typical for non-GUI elements), use 2x2 coarse shading. */
1320          cond = LLVMBuildFCmp(ctx->ac.builder, LLVMRealUNE, pos_args[0].out[3], ctx->ac.f32_1, "");
1321          v = LLVMBuildSelect(ctx->ac.builder, cond, rates, ctx->ac.i32_0, "");
1322 
1323          pos_args[1].out[1] = ac_to_float(&ctx->ac, v);
1324       }
1325    }
1326 
1327    /* GFX10 skip POS0 exports if EXEC=0 and DONE=0, causing a hang.
1328     * Setting valid_mask=1 prevents it and has no other effect.
1329     */
1330    if (ctx->ac.chip_class == GFX10)
1331       pos_args[0].valid_mask = 1;
1332 
1333    pos_idx = 0;
1334    for (i = 0; i < 4; i++) {
1335       if (!pos_args[i].out[0])
1336          continue;
1337 
1338       /* Specify the target we are exporting */
1339       pos_args[i].target = V_008DFC_SQ_EXP_POS + pos_idx++;
1340 
1341       if (pos_idx == outinfo->pos_exports)
1342          /* Specify that this is the last export */
1343          pos_args[i].done = 1;
1344 
1345       ac_build_export(&ctx->ac, &pos_args[i]);
1346    }
1347 
1348    /* Build parameter exports */
1349    radv_build_param_exports(ctx, outputs, noutput, outinfo, export_clip_dists);
1350 }
1351 
1352 static void
handle_vs_outputs_post(struct radv_shader_context * ctx,bool export_prim_id,bool export_clip_dists,struct radv_vs_output_info * outinfo)1353 handle_vs_outputs_post(struct radv_shader_context *ctx, bool export_prim_id, bool export_clip_dists,
1354                        struct radv_vs_output_info *outinfo)
1355 {
1356    struct radv_shader_output_values *outputs;
1357    unsigned noutput = 0;
1358 
1359    if (ctx->args->options->key.has_multiview_view_index) {
1360       LLVMValueRef *tmp_out = &ctx->abi.outputs[ac_llvm_reg_index_soa(VARYING_SLOT_LAYER, 0)];
1361       if (!*tmp_out) {
1362          for (unsigned i = 0; i < 4; ++i)
1363             ctx->abi.outputs[ac_llvm_reg_index_soa(VARYING_SLOT_LAYER, i)] =
1364                ac_build_alloca_undef(&ctx->ac, ctx->ac.f32, "");
1365       }
1366 
1367       LLVMValueRef view_index = ac_get_arg(&ctx->ac, ctx->args->ac.view_index);
1368       LLVMBuildStore(ctx->ac.builder, ac_to_float(&ctx->ac, view_index), *tmp_out);
1369       ctx->output_mask |= 1ull << VARYING_SLOT_LAYER;
1370    }
1371 
1372    if (ctx->args->shader_info->so.num_outputs && !ctx->args->is_gs_copy_shader) {
1373       /* The GS copy shader emission already emits streamout. */
1374       radv_emit_streamout(ctx, 0);
1375    }
1376 
1377    /* Allocate a temporary array for the output values. */
1378    unsigned num_outputs = util_bitcount64(ctx->output_mask) + export_prim_id;
1379    outputs = malloc(num_outputs * sizeof(outputs[0]));
1380 
1381    for (unsigned i = 0; i < AC_LLVM_MAX_OUTPUTS; ++i) {
1382       if (!(ctx->output_mask & (1ull << i)))
1383          continue;
1384 
1385       outputs[noutput].slot_name = i;
1386       outputs[noutput].slot_index = i == VARYING_SLOT_CLIP_DIST1;
1387 
1388       if (ctx->stage == MESA_SHADER_VERTEX && !ctx->args->is_gs_copy_shader) {
1389          outputs[noutput].usage_mask = ctx->args->shader_info->vs.output_usage_mask[i];
1390       } else if (ctx->stage == MESA_SHADER_TESS_EVAL) {
1391          outputs[noutput].usage_mask = ctx->args->shader_info->tes.output_usage_mask[i];
1392       } else {
1393          assert(ctx->args->is_gs_copy_shader);
1394          outputs[noutput].usage_mask = ctx->args->shader_info->gs.output_usage_mask[i];
1395       }
1396 
1397       for (unsigned j = 0; j < 4; j++) {
1398          outputs[noutput].values[j] = ac_to_float(&ctx->ac, radv_load_output(ctx, i, j));
1399       }
1400 
1401       noutput++;
1402    }
1403 
1404    /* Export PrimitiveID. */
1405    if (export_prim_id) {
1406       outputs[noutput].slot_name = VARYING_SLOT_PRIMITIVE_ID;
1407       outputs[noutput].slot_index = 0;
1408       outputs[noutput].usage_mask = 0x1;
1409       if (ctx->stage == MESA_SHADER_TESS_EVAL)
1410          outputs[noutput].values[0] = ac_get_arg(&ctx->ac, ctx->args->ac.tes_patch_id);
1411       else
1412          outputs[noutput].values[0] = ac_get_arg(&ctx->ac, ctx->args->ac.vs_prim_id);
1413       for (unsigned j = 1; j < 4; j++)
1414          outputs[noutput].values[j] = ctx->ac.f32_0;
1415       noutput++;
1416    }
1417 
1418    radv_llvm_export_vs(ctx, outputs, noutput, outinfo, export_clip_dists);
1419 
1420    free(outputs);
1421 }
1422 
1423 static LLVMValueRef
get_wave_id_in_tg(struct radv_shader_context * ctx)1424 get_wave_id_in_tg(struct radv_shader_context *ctx)
1425 {
1426    return ac_unpack_param(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->ac.merged_wave_info), 24, 4);
1427 }
1428 
1429 static LLVMValueRef
get_tgsize(struct radv_shader_context * ctx)1430 get_tgsize(struct radv_shader_context *ctx)
1431 {
1432    return ac_unpack_param(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->ac.merged_wave_info), 28, 4);
1433 }
1434 
1435 static LLVMValueRef
get_thread_id_in_tg(struct radv_shader_context * ctx)1436 get_thread_id_in_tg(struct radv_shader_context *ctx)
1437 {
1438    LLVMBuilderRef builder = ctx->ac.builder;
1439    LLVMValueRef tmp;
1440    tmp = LLVMBuildMul(builder, get_wave_id_in_tg(ctx),
1441                       LLVMConstInt(ctx->ac.i32, ctx->ac.wave_size, false), "");
1442    return LLVMBuildAdd(builder, tmp, ac_get_thread_id(&ctx->ac), "");
1443 }
1444 
1445 static LLVMValueRef
ngg_get_vtx_cnt(struct radv_shader_context * ctx)1446 ngg_get_vtx_cnt(struct radv_shader_context *ctx)
1447 {
1448    return ac_build_bfe(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->ac.gs_tg_info),
1449                        LLVMConstInt(ctx->ac.i32, 12, false), LLVMConstInt(ctx->ac.i32, 9, false),
1450                        false);
1451 }
1452 
1453 static LLVMValueRef
ngg_get_prim_cnt(struct radv_shader_context * ctx)1454 ngg_get_prim_cnt(struct radv_shader_context *ctx)
1455 {
1456    return ac_build_bfe(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->ac.gs_tg_info),
1457                        LLVMConstInt(ctx->ac.i32, 22, false), LLVMConstInt(ctx->ac.i32, 9, false),
1458                        false);
1459 }
1460 
1461 static LLVMValueRef
ngg_gs_get_vertex_storage(struct radv_shader_context * ctx)1462 ngg_gs_get_vertex_storage(struct radv_shader_context *ctx)
1463 {
1464    unsigned num_outputs = util_bitcount64(ctx->output_mask);
1465 
1466    if (ctx->args->options->key.has_multiview_view_index)
1467       num_outputs++;
1468 
1469    LLVMTypeRef elements[2] = {
1470       LLVMArrayType(ctx->ac.i32, 4 * num_outputs),
1471       LLVMArrayType(ctx->ac.i8, 4),
1472    };
1473    LLVMTypeRef type = LLVMStructTypeInContext(ctx->ac.context, elements, 2, false);
1474    type = LLVMPointerType(LLVMArrayType(type, 0), AC_ADDR_SPACE_LDS);
1475    return LLVMBuildBitCast(ctx->ac.builder, ctx->gs_ngg_emit, type, "");
1476 }
1477 
1478 /**
1479  * Return a pointer to the LDS storage reserved for the N'th vertex, where N
1480  * is in emit order; that is:
1481  * - during the epilogue, N is the threadidx (relative to the entire threadgroup)
1482  * - during vertex emit, i.e. while the API GS shader invocation is running,
1483  *   N = threadidx * gs_max_out_vertices + emitidx
1484  *
1485  * Goals of the LDS memory layout:
1486  * 1. Eliminate bank conflicts on write for geometry shaders that have all emits
1487  *    in uniform control flow
1488  * 2. Eliminate bank conflicts on read for export if, additionally, there is no
1489  *    culling
1490  * 3. Agnostic to the number of waves (since we don't know it before compiling)
1491  * 4. Allow coalescing of LDS instructions (ds_write_b128 etc.)
1492  * 5. Avoid wasting memory.
1493  *
1494  * We use an AoS layout due to point 4 (this also helps point 3). In an AoS
1495  * layout, elimination of bank conflicts requires that each vertex occupy an
1496  * odd number of dwords. We use the additional dword to store the output stream
1497  * index as well as a flag to indicate whether this vertex ends a primitive
1498  * for rasterization.
1499  *
1500  * Swizzling is required to satisfy points 1 and 2 simultaneously.
1501  *
1502  * Vertices are stored in export order (gsthread * gs_max_out_vertices + emitidx).
1503  * Indices are swizzled in groups of 32, which ensures point 1 without
1504  * disturbing point 2.
1505  *
1506  * \return an LDS pointer to type {[N x i32], [4 x i8]}
1507  */
1508 static LLVMValueRef
ngg_gs_vertex_ptr(struct radv_shader_context * ctx,LLVMValueRef vertexidx)1509 ngg_gs_vertex_ptr(struct radv_shader_context *ctx, LLVMValueRef vertexidx)
1510 {
1511    LLVMBuilderRef builder = ctx->ac.builder;
1512    LLVMValueRef storage = ngg_gs_get_vertex_storage(ctx);
1513 
1514    /* gs_max_out_vertices = 2^(write_stride_2exp) * some odd number */
1515    unsigned write_stride_2exp = ffs(MAX2(ctx->shader->info.gs.vertices_out, 1)) - 1;
1516    if (write_stride_2exp) {
1517       LLVMValueRef row = LLVMBuildLShr(builder, vertexidx, LLVMConstInt(ctx->ac.i32, 5, false), "");
1518       LLVMValueRef swizzle = LLVMBuildAnd(
1519          builder, row, LLVMConstInt(ctx->ac.i32, (1u << write_stride_2exp) - 1, false), "");
1520       vertexidx = LLVMBuildXor(builder, vertexidx, swizzle, "");
1521    }
1522 
1523    return ac_build_gep0(&ctx->ac, storage, vertexidx);
1524 }
1525 
1526 static LLVMValueRef
ngg_gs_emit_vertex_ptr(struct radv_shader_context * ctx,LLVMValueRef gsthread,LLVMValueRef emitidx)1527 ngg_gs_emit_vertex_ptr(struct radv_shader_context *ctx, LLVMValueRef gsthread, LLVMValueRef emitidx)
1528 {
1529    LLVMBuilderRef builder = ctx->ac.builder;
1530    LLVMValueRef tmp;
1531 
1532    tmp = LLVMConstInt(ctx->ac.i32, ctx->shader->info.gs.vertices_out, false);
1533    tmp = LLVMBuildMul(builder, tmp, gsthread, "");
1534    const LLVMValueRef vertexidx = LLVMBuildAdd(builder, tmp, emitidx, "");
1535    return ngg_gs_vertex_ptr(ctx, vertexidx);
1536 }
1537 
1538 static LLVMValueRef
ngg_gs_get_emit_output_ptr(struct radv_shader_context * ctx,LLVMValueRef vertexptr,unsigned out_idx)1539 ngg_gs_get_emit_output_ptr(struct radv_shader_context *ctx, LLVMValueRef vertexptr,
1540                            unsigned out_idx)
1541 {
1542    LLVMValueRef gep_idx[3] = {
1543       ctx->ac.i32_0, /* implied C-style array */
1544       ctx->ac.i32_0, /* first struct entry */
1545       LLVMConstInt(ctx->ac.i32, out_idx, false),
1546    };
1547    return LLVMBuildGEP(ctx->ac.builder, vertexptr, gep_idx, 3, "");
1548 }
1549 
1550 static LLVMValueRef
ngg_gs_get_emit_primflag_ptr(struct radv_shader_context * ctx,LLVMValueRef vertexptr,unsigned stream)1551 ngg_gs_get_emit_primflag_ptr(struct radv_shader_context *ctx, LLVMValueRef vertexptr,
1552                              unsigned stream)
1553 {
1554    LLVMValueRef gep_idx[3] = {
1555       ctx->ac.i32_0, /* implied C-style array */
1556       ctx->ac.i32_1, /* second struct entry */
1557       LLVMConstInt(ctx->ac.i32, stream, false),
1558    };
1559    return LLVMBuildGEP(ctx->ac.builder, vertexptr, gep_idx, 3, "");
1560 }
1561 
1562 static void
handle_ngg_outputs_post_2(struct radv_shader_context * ctx)1563 handle_ngg_outputs_post_2(struct radv_shader_context *ctx)
1564 {
1565    LLVMBuilderRef builder = ctx->ac.builder;
1566    LLVMValueRef tmp;
1567 
1568    assert((ctx->stage == MESA_SHADER_VERTEX || ctx->stage == MESA_SHADER_TESS_EVAL) &&
1569           !ctx->args->is_gs_copy_shader);
1570 
1571    LLVMValueRef prims_in_wave =
1572       ac_unpack_param(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->ac.merged_wave_info), 8, 8);
1573    LLVMValueRef vtx_in_wave =
1574       ac_unpack_param(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->ac.merged_wave_info), 0, 8);
1575    LLVMValueRef is_gs_thread =
1576       LLVMBuildICmp(builder, LLVMIntULT, ac_get_thread_id(&ctx->ac), prims_in_wave, "");
1577    LLVMValueRef is_es_thread =
1578       LLVMBuildICmp(builder, LLVMIntULT, ac_get_thread_id(&ctx->ac), vtx_in_wave, "");
1579    LLVMValueRef vtxindex[] = {
1580       ac_unpack_param(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->ac.gs_vtx_offset[0]), 0, 16),
1581       ac_unpack_param(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->ac.gs_vtx_offset[0]), 16, 16),
1582       ac_unpack_param(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->ac.gs_vtx_offset[1]), 0, 16),
1583    };
1584 
1585    /* Determine the number of vertices per primitive. */
1586    unsigned num_vertices;
1587 
1588    if (ctx->stage == MESA_SHADER_VERTEX) {
1589       num_vertices = 3; /* TODO: optimize for points & lines */
1590    } else {
1591       assert(ctx->stage == MESA_SHADER_TESS_EVAL);
1592 
1593       if (ctx->shader->info.tess.point_mode)
1594          num_vertices = 1;
1595       else if (ctx->shader->info.tess.primitive_mode == GL_ISOLINES)
1596          num_vertices = 2;
1597       else
1598          num_vertices = 3;
1599    }
1600 
1601    /* Copy Primitive IDs from GS threads to the LDS address corresponding
1602     * to the ES thread of the provoking vertex.
1603     */
1604    if (ctx->stage == MESA_SHADER_VERTEX && ctx->args->shader_info->vs.outinfo.export_prim_id) {
1605       ac_build_ifcc(&ctx->ac, is_gs_thread, 5400);
1606 
1607       LLVMValueRef provoking_vtx_in_prim = LLVMConstInt(ctx->ac.i32, 0, false);
1608 
1609       /* For provoking vertex last mode, use num_vtx_in_prim - 1. */
1610       if (ctx->args->options->key.vs.provoking_vtx_last) {
1611          uint8_t outprim = si_conv_prim_to_gs_out(ctx->args->options->key.vs.topology);
1612          provoking_vtx_in_prim = LLVMConstInt(ctx->ac.i32, outprim, false);
1613       }
1614 
1615       /* provoking_vtx_index = vtxindex[provoking_vtx_in_prim]; */
1616       LLVMValueRef indices = ac_build_gather_values(&ctx->ac, vtxindex, 3);
1617       LLVMValueRef provoking_vtx_index =
1618          LLVMBuildExtractElement(builder, indices, provoking_vtx_in_prim, "");
1619 
1620       LLVMBuildStore(builder, ac_get_arg(&ctx->ac, ctx->args->ac.gs_prim_id),
1621                      ac_build_gep0(&ctx->ac, ctx->esgs_ring, provoking_vtx_index));
1622       ac_build_endif(&ctx->ac, 5400);
1623    }
1624 
1625    /* TODO: primitive culling */
1626 
1627    ac_build_sendmsg_gs_alloc_req(&ctx->ac, get_wave_id_in_tg(ctx), ngg_get_vtx_cnt(ctx),
1628                                  ngg_get_prim_cnt(ctx));
1629 
1630    /* TODO: streamout queries */
1631    /* Export primitive data to the index buffer.
1632     *
1633     * For the first version, we will always build up all three indices
1634     * independent of the primitive type. The additional garbage data
1635     * shouldn't hurt.
1636     *
1637     * TODO: culling depends on the primitive type, so can have some
1638     * interaction here.
1639     */
1640    ac_build_ifcc(&ctx->ac, is_gs_thread, 6001);
1641    {
1642       struct ac_ngg_prim prim = {0};
1643 
1644       if (ctx->args->shader_info->is_ngg_passthrough) {
1645          prim.passthrough = ac_get_arg(&ctx->ac, ctx->args->ac.gs_vtx_offset[0]);
1646       } else {
1647          prim.num_vertices = num_vertices;
1648          prim.isnull = ctx->ac.i1false;
1649          prim.edgeflags = ctx->ac.i32_0;
1650          memcpy(prim.index, vtxindex, sizeof(vtxindex[0]) * 3);
1651       }
1652 
1653       ac_build_export_prim(&ctx->ac, &prim);
1654    }
1655    ac_build_endif(&ctx->ac, 6001);
1656 
1657    /* Export per-vertex data (positions and parameters). */
1658    ac_build_ifcc(&ctx->ac, is_es_thread, 6002);
1659    {
1660       struct radv_vs_output_info *outinfo = ctx->stage == MESA_SHADER_TESS_EVAL
1661                                                ? &ctx->args->shader_info->tes.outinfo
1662                                                : &ctx->args->shader_info->vs.outinfo;
1663 
1664       /* Exporting the primitive ID is handled below. */
1665       /* TODO: use the new VS export path */
1666       handle_vs_outputs_post(ctx, false, outinfo->export_clip_dists, outinfo);
1667 
1668       if (outinfo->export_prim_id) {
1669          LLVMValueRef values[4];
1670 
1671          if (ctx->stage == MESA_SHADER_VERTEX) {
1672             /* Wait for GS stores to finish. */
1673             ac_build_s_barrier(&ctx->ac);
1674 
1675             tmp = ac_build_gep0(&ctx->ac, ctx->esgs_ring, get_thread_id_in_tg(ctx));
1676             values[0] = LLVMBuildLoad(builder, tmp, "");
1677          } else {
1678             assert(ctx->stage == MESA_SHADER_TESS_EVAL);
1679             values[0] = ac_get_arg(&ctx->ac, ctx->args->ac.tes_patch_id);
1680          }
1681 
1682          values[0] = ac_to_float(&ctx->ac, values[0]);
1683          for (unsigned j = 1; j < 4; j++)
1684             values[j] = ctx->ac.f32_0;
1685 
1686          radv_export_param(ctx, outinfo->vs_output_param_offset[VARYING_SLOT_PRIMITIVE_ID], values,
1687                            0x1);
1688       }
1689    }
1690    ac_build_endif(&ctx->ac, 6002);
1691 }
1692 
1693 static void
gfx10_ngg_gs_emit_prologue(struct radv_shader_context * ctx)1694 gfx10_ngg_gs_emit_prologue(struct radv_shader_context *ctx)
1695 {
1696    /* Zero out the part of LDS scratch that is used to accumulate the
1697     * per-stream generated primitive count.
1698     */
1699    LLVMBuilderRef builder = ctx->ac.builder;
1700    LLVMValueRef scratchptr = ctx->gs_ngg_scratch;
1701    LLVMValueRef tid = get_thread_id_in_tg(ctx);
1702    LLVMBasicBlockRef merge_block;
1703    LLVMValueRef cond;
1704 
1705    LLVMValueRef fn = LLVMGetBasicBlockParent(LLVMGetInsertBlock(ctx->ac.builder));
1706    LLVMBasicBlockRef then_block = LLVMAppendBasicBlockInContext(ctx->ac.context, fn, "");
1707    merge_block = LLVMAppendBasicBlockInContext(ctx->ac.context, fn, "");
1708 
1709    cond = LLVMBuildICmp(builder, LLVMIntULT, tid, LLVMConstInt(ctx->ac.i32, 4, false), "");
1710    LLVMBuildCondBr(ctx->ac.builder, cond, then_block, merge_block);
1711    LLVMPositionBuilderAtEnd(ctx->ac.builder, then_block);
1712 
1713    LLVMValueRef ptr = ac_build_gep0(&ctx->ac, scratchptr, tid);
1714    LLVMBuildStore(builder, ctx->ac.i32_0, ptr);
1715 
1716    LLVMBuildBr(ctx->ac.builder, merge_block);
1717    LLVMPositionBuilderAtEnd(ctx->ac.builder, merge_block);
1718 
1719    ac_build_s_barrier(&ctx->ac);
1720 }
1721 
1722 static void
gfx10_ngg_gs_emit_epilogue_1(struct radv_shader_context * ctx)1723 gfx10_ngg_gs_emit_epilogue_1(struct radv_shader_context *ctx)
1724 {
1725    LLVMBuilderRef builder = ctx->ac.builder;
1726    LLVMValueRef i8_0 = LLVMConstInt(ctx->ac.i8, 0, false);
1727    LLVMValueRef tmp;
1728 
1729    /* Zero out remaining (non-emitted) primitive flags.
1730     *
1731     * Note: Alternatively, we could pass the relevant gs_next_vertex to
1732     *       the emit threads via LDS. This is likely worse in the expected
1733     *       typical case where each GS thread emits the full set of
1734     *       vertices.
1735     */
1736    for (unsigned stream = 0; stream < 4; ++stream) {
1737       unsigned num_components;
1738 
1739       num_components = ctx->args->shader_info->gs.num_stream_output_components[stream];
1740       if (!num_components)
1741          continue;
1742 
1743       const LLVMValueRef gsthread = get_thread_id_in_tg(ctx);
1744 
1745       ac_build_bgnloop(&ctx->ac, 5100);
1746 
1747       const LLVMValueRef vertexidx = LLVMBuildLoad(builder, ctx->gs_next_vertex[stream], "");
1748       tmp = LLVMBuildICmp(builder, LLVMIntUGE, vertexidx,
1749                           LLVMConstInt(ctx->ac.i32, ctx->shader->info.gs.vertices_out, false), "");
1750       ac_build_ifcc(&ctx->ac, tmp, 5101);
1751       ac_build_break(&ctx->ac);
1752       ac_build_endif(&ctx->ac, 5101);
1753 
1754       tmp = LLVMBuildAdd(builder, vertexidx, ctx->ac.i32_1, "");
1755       LLVMBuildStore(builder, tmp, ctx->gs_next_vertex[stream]);
1756 
1757       tmp = ngg_gs_emit_vertex_ptr(ctx, gsthread, vertexidx);
1758       LLVMBuildStore(builder, i8_0, ngg_gs_get_emit_primflag_ptr(ctx, tmp, stream));
1759 
1760       ac_build_endloop(&ctx->ac, 5100);
1761    }
1762 
1763    /* Accumulate generated primitives counts across the entire threadgroup. */
1764    for (unsigned stream = 0; stream < 4; ++stream) {
1765       unsigned num_components;
1766 
1767       num_components = ctx->args->shader_info->gs.num_stream_output_components[stream];
1768       if (!num_components)
1769          continue;
1770 
1771       LLVMValueRef numprims = LLVMBuildLoad(builder, ctx->gs_generated_prims[stream], "");
1772       numprims = ac_build_reduce(&ctx->ac, numprims, nir_op_iadd, ctx->ac.wave_size);
1773 
1774       tmp = LLVMBuildICmp(builder, LLVMIntEQ, ac_get_thread_id(&ctx->ac), ctx->ac.i32_0, "");
1775       ac_build_ifcc(&ctx->ac, tmp, 5105);
1776       {
1777          LLVMBuildAtomicRMW(
1778             builder, LLVMAtomicRMWBinOpAdd,
1779             ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, LLVMConstInt(ctx->ac.i32, stream, false)),
1780             numprims, LLVMAtomicOrderingMonotonic, false);
1781       }
1782       ac_build_endif(&ctx->ac, 5105);
1783    }
1784 }
1785 
1786 static void
gfx10_ngg_gs_emit_epilogue_2(struct radv_shader_context * ctx)1787 gfx10_ngg_gs_emit_epilogue_2(struct radv_shader_context *ctx)
1788 {
1789    const unsigned verts_per_prim =
1790       si_conv_gl_prim_to_vertices(ctx->shader->info.gs.output_primitive);
1791    LLVMBuilderRef builder = ctx->ac.builder;
1792    LLVMValueRef tmp, tmp2;
1793 
1794    ac_build_s_barrier(&ctx->ac);
1795 
1796    const LLVMValueRef tid = get_thread_id_in_tg(ctx);
1797    LLVMValueRef num_emit_threads = ngg_get_prim_cnt(ctx);
1798 
1799    /* Write shader query data. */
1800    tmp = ac_get_arg(&ctx->ac, ctx->args->ngg_gs_state);
1801    tmp = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, "");
1802    ac_build_ifcc(&ctx->ac, tmp, 5109);
1803    tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, LLVMConstInt(ctx->ac.i32, 4, false), "");
1804    ac_build_ifcc(&ctx->ac, tmp, 5110);
1805    {
1806       tmp = LLVMBuildLoad(builder, ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, tid), "");
1807 
1808       ac_llvm_add_target_dep_function_attr(ctx->main_function, "amdgpu-gds-size", 256);
1809 
1810       LLVMTypeRef gdsptr = LLVMPointerType(ctx->ac.i32, AC_ADDR_SPACE_GDS);
1811       LLVMValueRef gdsbase = LLVMBuildIntToPtr(builder, ctx->ac.i32_0, gdsptr, "");
1812 
1813       const char *sync_scope = "workgroup-one-as";
1814 
1815       /* Use a plain GDS atomic to accumulate the number of generated
1816        * primitives.
1817        */
1818       ac_build_atomic_rmw(&ctx->ac, LLVMAtomicRMWBinOpAdd, gdsbase, tmp, sync_scope);
1819    }
1820    ac_build_endif(&ctx->ac, 5110);
1821    ac_build_endif(&ctx->ac, 5109);
1822 
1823    /* TODO: culling */
1824 
1825    /* Determine vertex liveness. */
1826    LLVMValueRef vertliveptr = ac_build_alloca(&ctx->ac, ctx->ac.i1, "vertexlive");
1827 
1828    tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, num_emit_threads, "");
1829    ac_build_ifcc(&ctx->ac, tmp, 5120);
1830    {
1831       for (unsigned i = 0; i < verts_per_prim; ++i) {
1832          const LLVMValueRef primidx =
1833             LLVMBuildAdd(builder, tid, LLVMConstInt(ctx->ac.i32, i, false), "");
1834 
1835          if (i > 0) {
1836             tmp = LLVMBuildICmp(builder, LLVMIntULT, primidx, num_emit_threads, "");
1837             ac_build_ifcc(&ctx->ac, tmp, 5121 + i);
1838          }
1839 
1840          /* Load primitive liveness */
1841          tmp = ngg_gs_vertex_ptr(ctx, primidx);
1842          tmp = LLVMBuildLoad(builder, ngg_gs_get_emit_primflag_ptr(ctx, tmp, 0), "");
1843          const LLVMValueRef primlive = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, "");
1844 
1845          tmp = LLVMBuildLoad(builder, vertliveptr, "");
1846          tmp = LLVMBuildOr(builder, tmp, primlive, ""), LLVMBuildStore(builder, tmp, vertliveptr);
1847 
1848          if (i > 0)
1849             ac_build_endif(&ctx->ac, 5121 + i);
1850       }
1851    }
1852    ac_build_endif(&ctx->ac, 5120);
1853 
1854    /* Inclusive scan addition across the current wave. */
1855    LLVMValueRef vertlive = LLVMBuildLoad(builder, vertliveptr, "");
1856    struct ac_wg_scan vertlive_scan = {0};
1857    vertlive_scan.op = nir_op_iadd;
1858    vertlive_scan.enable_reduce = true;
1859    vertlive_scan.enable_exclusive = true;
1860    vertlive_scan.src = vertlive;
1861    vertlive_scan.scratch = ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, ctx->ac.i32_0);
1862    vertlive_scan.waveidx = get_wave_id_in_tg(ctx);
1863    vertlive_scan.numwaves = get_tgsize(ctx);
1864    vertlive_scan.maxwaves = 8;
1865 
1866    ac_build_wg_scan(&ctx->ac, &vertlive_scan);
1867 
1868    /* Skip all exports (including index exports) when possible. At least on
1869     * early gfx10 revisions this is also to avoid hangs.
1870     */
1871    LLVMValueRef have_exports =
1872       LLVMBuildICmp(builder, LLVMIntNE, vertlive_scan.result_reduce, ctx->ac.i32_0, "");
1873    num_emit_threads = LLVMBuildSelect(builder, have_exports, num_emit_threads, ctx->ac.i32_0, "");
1874 
1875    /* Allocate export space. Send this message as early as possible, to
1876     * hide the latency of the SQ <-> SPI roundtrip.
1877     *
1878     * Note: We could consider compacting primitives for export as well.
1879     *       PA processes 1 non-null prim / clock, but it fetches 4 DW of
1880     *       prim data per clock and skips null primitives at no additional
1881     *       cost. So compacting primitives can only be beneficial when
1882     *       there are 4 or more contiguous null primitives in the export
1883     *       (in the common case of single-dword prim exports).
1884     */
1885    ac_build_sendmsg_gs_alloc_req(&ctx->ac, get_wave_id_in_tg(ctx), vertlive_scan.result_reduce,
1886                                  num_emit_threads);
1887 
1888    /* Setup the reverse vertex compaction permutation. We re-use stream 1
1889     * of the primitive liveness flags, relying on the fact that each
1890     * threadgroup can have at most 256 threads. */
1891    ac_build_ifcc(&ctx->ac, vertlive, 5130);
1892    {
1893       tmp = ngg_gs_vertex_ptr(ctx, vertlive_scan.result_exclusive);
1894       tmp2 = LLVMBuildTrunc(builder, tid, ctx->ac.i8, "");
1895       LLVMBuildStore(builder, tmp2, ngg_gs_get_emit_primflag_ptr(ctx, tmp, 1));
1896    }
1897    ac_build_endif(&ctx->ac, 5130);
1898 
1899    ac_build_s_barrier(&ctx->ac);
1900 
1901    /* Export primitive data */
1902    tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, num_emit_threads, "");
1903    ac_build_ifcc(&ctx->ac, tmp, 5140);
1904    {
1905       LLVMValueRef flags;
1906       struct ac_ngg_prim prim = {0};
1907       prim.num_vertices = verts_per_prim;
1908 
1909       tmp = ngg_gs_vertex_ptr(ctx, tid);
1910       flags = LLVMBuildLoad(builder, ngg_gs_get_emit_primflag_ptr(ctx, tmp, 0), "");
1911       prim.isnull = LLVMBuildNot(builder, LLVMBuildTrunc(builder, flags, ctx->ac.i1, ""), "");
1912       prim.edgeflags = ctx->ac.i32_0;
1913 
1914       for (unsigned i = 0; i < verts_per_prim; ++i) {
1915          prim.index[i] = LLVMBuildSub(builder, vertlive_scan.result_exclusive,
1916                                       LLVMConstInt(ctx->ac.i32, verts_per_prim - i - 1, false), "");
1917       }
1918 
1919       /* Geometry shaders output triangle strips, but NGG expects triangles. */
1920       if (verts_per_prim == 3) {
1921          LLVMValueRef is_odd = LLVMBuildLShr(builder, flags, ctx->ac.i8_1, "");
1922          is_odd = LLVMBuildTrunc(builder, is_odd, ctx->ac.i1, "");
1923 
1924          LLVMValueRef flatshade_first =
1925             LLVMConstInt(ctx->ac.i1, !ctx->args->options->key.vs.provoking_vtx_last, false);
1926 
1927          ac_build_triangle_strip_indices_to_triangle(&ctx->ac, is_odd, flatshade_first, prim.index);
1928       }
1929 
1930       ac_build_export_prim(&ctx->ac, &prim);
1931    }
1932    ac_build_endif(&ctx->ac, 5140);
1933 
1934    /* Export position and parameter data */
1935    tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, vertlive_scan.result_reduce, "");
1936    ac_build_ifcc(&ctx->ac, tmp, 5145);
1937    {
1938       struct radv_vs_output_info *outinfo = &ctx->args->shader_info->vs.outinfo;
1939       bool export_view_index = ctx->args->options->key.has_multiview_view_index;
1940       struct radv_shader_output_values *outputs;
1941       unsigned noutput = 0;
1942 
1943       /* Allocate a temporary array for the output values. */
1944       unsigned num_outputs = util_bitcount64(ctx->output_mask) + export_view_index;
1945       outputs = calloc(num_outputs, sizeof(outputs[0]));
1946 
1947       tmp = ngg_gs_vertex_ptr(ctx, tid);
1948       tmp = LLVMBuildLoad(builder, ngg_gs_get_emit_primflag_ptr(ctx, tmp, 1), "");
1949       tmp = LLVMBuildZExt(builder, tmp, ctx->ac.i32, "");
1950       const LLVMValueRef vertexptr = ngg_gs_vertex_ptr(ctx, tmp);
1951 
1952       unsigned out_idx = 0;
1953       for (unsigned i = 0; i < AC_LLVM_MAX_OUTPUTS; ++i) {
1954          unsigned output_usage_mask = ctx->args->shader_info->gs.output_usage_mask[i];
1955          int length = util_last_bit(output_usage_mask);
1956 
1957          if (!(ctx->output_mask & (1ull << i)))
1958             continue;
1959 
1960          outputs[noutput].slot_name = i;
1961          outputs[noutput].slot_index = i == VARYING_SLOT_CLIP_DIST1;
1962          outputs[noutput].usage_mask = output_usage_mask;
1963 
1964          for (unsigned j = 0; j < length; j++, out_idx++) {
1965             if (!(output_usage_mask & (1 << j)))
1966                continue;
1967 
1968             tmp = ngg_gs_get_emit_output_ptr(ctx, vertexptr, out_idx);
1969             tmp = LLVMBuildLoad(builder, tmp, "");
1970 
1971             LLVMTypeRef type = LLVMGetAllocatedType(ctx->abi.outputs[ac_llvm_reg_index_soa(i, j)]);
1972             if (ac_get_type_size(type) == 2) {
1973                tmp = ac_to_integer(&ctx->ac, tmp);
1974                tmp = LLVMBuildTrunc(ctx->ac.builder, tmp, ctx->ac.i16, "");
1975             }
1976 
1977             outputs[noutput].values[j] = ac_to_float(&ctx->ac, tmp);
1978          }
1979 
1980          for (unsigned j = length; j < 4; j++)
1981             outputs[noutput].values[j] = LLVMGetUndef(ctx->ac.f32);
1982 
1983          noutput++;
1984       }
1985 
1986       /* Export ViewIndex. */
1987       if (export_view_index) {
1988          outputs[noutput].slot_name = VARYING_SLOT_LAYER;
1989          outputs[noutput].slot_index = 0;
1990          outputs[noutput].usage_mask = 0x1;
1991          outputs[noutput].values[0] =
1992             ac_to_float(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->ac.view_index));
1993          for (unsigned j = 1; j < 4; j++)
1994             outputs[noutput].values[j] = ctx->ac.f32_0;
1995          noutput++;
1996       }
1997 
1998       radv_llvm_export_vs(ctx, outputs, noutput, outinfo, outinfo->export_clip_dists);
1999       FREE(outputs);
2000    }
2001    ac_build_endif(&ctx->ac, 5145);
2002 }
2003 
2004 static void
gfx10_ngg_gs_emit_vertex(struct radv_shader_context * ctx,unsigned stream,LLVMValueRef vertexidx,LLVMValueRef * addrs)2005 gfx10_ngg_gs_emit_vertex(struct radv_shader_context *ctx, unsigned stream, LLVMValueRef vertexidx,
2006                          LLVMValueRef *addrs)
2007 {
2008    LLVMBuilderRef builder = ctx->ac.builder;
2009    LLVMValueRef tmp;
2010 
2011    const LLVMValueRef vertexptr = ngg_gs_emit_vertex_ptr(ctx, get_thread_id_in_tg(ctx), vertexidx);
2012    unsigned out_idx = 0;
2013    for (unsigned i = 0; i < AC_LLVM_MAX_OUTPUTS; ++i) {
2014       unsigned output_usage_mask = ctx->args->shader_info->gs.output_usage_mask[i];
2015       uint8_t output_stream = ctx->args->shader_info->gs.output_streams[i];
2016       LLVMValueRef *out_ptr = &addrs[i * 4];
2017       int length = util_last_bit(output_usage_mask);
2018 
2019       if (!(ctx->output_mask & (1ull << i)) || output_stream != stream)
2020          continue;
2021 
2022       for (unsigned j = 0; j < length; j++, out_idx++) {
2023          if (!(output_usage_mask & (1 << j)))
2024             continue;
2025 
2026          LLVMValueRef out_val = LLVMBuildLoad(ctx->ac.builder, out_ptr[j], "");
2027          out_val = ac_to_integer(&ctx->ac, out_val);
2028          out_val = LLVMBuildZExtOrBitCast(ctx->ac.builder, out_val, ctx->ac.i32, "");
2029 
2030          LLVMBuildStore(builder, out_val, ngg_gs_get_emit_output_ptr(ctx, vertexptr, out_idx));
2031       }
2032    }
2033    assert(out_idx * 4 <= ctx->args->shader_info->gs.gsvs_vertex_size);
2034 
2035    /* Store the current number of emitted vertices to zero out remaining
2036     * primitive flags in case the geometry shader doesn't emit the maximum
2037     * number of vertices.
2038     */
2039    tmp = LLVMBuildAdd(builder, vertexidx, ctx->ac.i32_1, "");
2040    LLVMBuildStore(builder, tmp, ctx->gs_next_vertex[stream]);
2041 
2042    /* Determine and store whether this vertex completed a primitive. */
2043    const LLVMValueRef curverts = LLVMBuildLoad(builder, ctx->gs_curprim_verts[stream], "");
2044 
2045    tmp = LLVMConstInt(
2046       ctx->ac.i32, si_conv_gl_prim_to_vertices(ctx->shader->info.gs.output_primitive) - 1, false);
2047    const LLVMValueRef iscompleteprim = LLVMBuildICmp(builder, LLVMIntUGE, curverts, tmp, "");
2048 
2049    /* Since the geometry shader emits triangle strips, we need to
2050     * track which primitive is odd and swap vertex indices to get
2051     * the correct vertex order.
2052     */
2053    LLVMValueRef is_odd = ctx->ac.i1false;
2054    if (stream == 0 && si_conv_gl_prim_to_vertices(ctx->shader->info.gs.output_primitive) == 3) {
2055       tmp = LLVMBuildAnd(builder, curverts, ctx->ac.i32_1, "");
2056       is_odd = LLVMBuildICmp(builder, LLVMIntEQ, tmp, ctx->ac.i32_1, "");
2057    }
2058 
2059    tmp = LLVMBuildAdd(builder, curverts, ctx->ac.i32_1, "");
2060    LLVMBuildStore(builder, tmp, ctx->gs_curprim_verts[stream]);
2061 
2062    /* The per-vertex primitive flag encoding:
2063     *   bit 0: whether this vertex finishes a primitive
2064     *   bit 1: whether the primitive is odd (if we are emitting triangle strips)
2065     */
2066    tmp = LLVMBuildZExt(builder, iscompleteprim, ctx->ac.i8, "");
2067    tmp = LLVMBuildOr(
2068       builder, tmp,
2069       LLVMBuildShl(builder, LLVMBuildZExt(builder, is_odd, ctx->ac.i8, ""), ctx->ac.i8_1, ""), "");
2070    LLVMBuildStore(builder, tmp, ngg_gs_get_emit_primflag_ptr(ctx, vertexptr, stream));
2071 
2072    tmp = LLVMBuildLoad(builder, ctx->gs_generated_prims[stream], "");
2073    tmp = LLVMBuildAdd(builder, tmp, LLVMBuildZExt(builder, iscompleteprim, ctx->ac.i32, ""), "");
2074    LLVMBuildStore(builder, tmp, ctx->gs_generated_prims[stream]);
2075 }
2076 
2077 static bool
si_export_mrt_color(struct radv_shader_context * ctx,LLVMValueRef * color,unsigned index,struct ac_export_args * args)2078 si_export_mrt_color(struct radv_shader_context *ctx, LLVMValueRef *color, unsigned index,
2079                     struct ac_export_args *args)
2080 {
2081    /* Export */
2082    si_llvm_init_export_args(ctx, color, 0xf, V_008DFC_SQ_EXP_MRT + index, args);
2083    if (!args->enabled_channels)
2084       return false; /* unnecessary NULL export */
2085 
2086    return true;
2087 }
2088 
2089 static void
radv_export_mrt_z(struct radv_shader_context * ctx,LLVMValueRef depth,LLVMValueRef stencil,LLVMValueRef samplemask)2090 radv_export_mrt_z(struct radv_shader_context *ctx, LLVMValueRef depth, LLVMValueRef stencil,
2091                   LLVMValueRef samplemask)
2092 {
2093    struct ac_export_args args;
2094 
2095    ac_export_mrt_z(&ctx->ac, depth, stencil, samplemask, &args);
2096 
2097    ac_build_export(&ctx->ac, &args);
2098 }
2099 
2100 static void
handle_fs_outputs_post(struct radv_shader_context * ctx)2101 handle_fs_outputs_post(struct radv_shader_context *ctx)
2102 {
2103    unsigned index = 0;
2104    LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL;
2105    struct ac_export_args color_args[8];
2106 
2107    for (unsigned i = 0; i < AC_LLVM_MAX_OUTPUTS; ++i) {
2108       LLVMValueRef values[4];
2109 
2110       if (!(ctx->output_mask & (1ull << i)))
2111          continue;
2112 
2113       if (i < FRAG_RESULT_DATA0)
2114          continue;
2115 
2116       for (unsigned j = 0; j < 4; j++)
2117          values[j] = ac_to_float(&ctx->ac, radv_load_output(ctx, i, j));
2118 
2119       bool ret = si_export_mrt_color(ctx, values, i - FRAG_RESULT_DATA0, &color_args[index]);
2120       if (ret)
2121          index++;
2122    }
2123 
2124    /* Process depth, stencil, samplemask. */
2125    if (ctx->args->shader_info->ps.writes_z) {
2126       depth = ac_to_float(&ctx->ac, radv_load_output(ctx, FRAG_RESULT_DEPTH, 0));
2127    }
2128    if (ctx->args->shader_info->ps.writes_stencil) {
2129       stencil = ac_to_float(&ctx->ac, radv_load_output(ctx, FRAG_RESULT_STENCIL, 0));
2130    }
2131    if (ctx->args->shader_info->ps.writes_sample_mask) {
2132       samplemask = ac_to_float(&ctx->ac, radv_load_output(ctx, FRAG_RESULT_SAMPLE_MASK, 0));
2133    }
2134 
2135    /* Set the DONE bit on last non-null color export only if Z isn't
2136     * exported.
2137     */
2138    if (index > 0 && !ctx->args->shader_info->ps.writes_z &&
2139        !ctx->args->shader_info->ps.writes_stencil &&
2140        !ctx->args->shader_info->ps.writes_sample_mask) {
2141       unsigned last = index - 1;
2142 
2143       color_args[last].valid_mask = 1; /* whether the EXEC mask is valid */
2144       color_args[last].done = 1;       /* DONE bit */
2145    }
2146 
2147    /* Export PS outputs. */
2148    for (unsigned i = 0; i < index; i++)
2149       ac_build_export(&ctx->ac, &color_args[i]);
2150 
2151    if (depth || stencil || samplemask)
2152       radv_export_mrt_z(ctx, depth, stencil, samplemask);
2153    else if (!index)
2154       ac_build_export_null(&ctx->ac);
2155 }
2156 
2157 static void
emit_gs_epilogue(struct radv_shader_context * ctx)2158 emit_gs_epilogue(struct radv_shader_context *ctx)
2159 {
2160    if (ctx->args->shader_info->is_ngg) {
2161       gfx10_ngg_gs_emit_epilogue_1(ctx);
2162       return;
2163    }
2164 
2165    if (ctx->ac.chip_class >= GFX10)
2166       LLVMBuildFence(ctx->ac.builder, LLVMAtomicOrderingRelease, false, "");
2167 
2168    ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_NOP | AC_SENDMSG_GS_DONE, ctx->gs_wave_id);
2169 }
2170 
2171 static void
handle_shader_outputs_post(struct ac_shader_abi * abi)2172 handle_shader_outputs_post(struct ac_shader_abi *abi)
2173 {
2174    struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);
2175 
2176    switch (ctx->stage) {
2177    case MESA_SHADER_VERTEX:
2178       if (ctx->args->shader_info->vs.as_ls)
2179          break; /* Lowered in NIR */
2180       else if (ctx->args->shader_info->vs.as_es)
2181          break; /* Lowered in NIR */
2182       else if (ctx->args->shader_info->is_ngg)
2183          break;
2184       else
2185          handle_vs_outputs_post(ctx, ctx->args->shader_info->vs.outinfo.export_prim_id,
2186                                 ctx->args->shader_info->vs.outinfo.export_clip_dists,
2187                                 &ctx->args->shader_info->vs.outinfo);
2188       break;
2189    case MESA_SHADER_FRAGMENT:
2190       handle_fs_outputs_post(ctx);
2191       break;
2192    case MESA_SHADER_GEOMETRY:
2193       emit_gs_epilogue(ctx);
2194       break;
2195    case MESA_SHADER_TESS_CTRL:
2196       break; /* Lowered in NIR */
2197    case MESA_SHADER_TESS_EVAL:
2198       if (ctx->args->shader_info->tes.as_es)
2199          break; /* Lowered in NIR */
2200       else if (ctx->args->shader_info->is_ngg)
2201          break;
2202       else
2203          handle_vs_outputs_post(ctx, ctx->args->shader_info->tes.outinfo.export_prim_id,
2204                                 ctx->args->shader_info->tes.outinfo.export_clip_dists,
2205                                 &ctx->args->shader_info->tes.outinfo);
2206       break;
2207    default:
2208       break;
2209    }
2210 }
2211 
2212 static void
ac_llvm_finalize_module(struct radv_shader_context * ctx,LLVMPassManagerRef passmgr,const struct radv_nir_compiler_options * options)2213 ac_llvm_finalize_module(struct radv_shader_context *ctx, LLVMPassManagerRef passmgr,
2214                         const struct radv_nir_compiler_options *options)
2215 {
2216    LLVMRunPassManager(passmgr, ctx->ac.module);
2217    LLVMDisposeBuilder(ctx->ac.builder);
2218 
2219    ac_llvm_context_dispose(&ctx->ac);
2220 }
2221 
2222 static void
ac_nir_eliminate_const_vs_outputs(struct radv_shader_context * ctx)2223 ac_nir_eliminate_const_vs_outputs(struct radv_shader_context *ctx)
2224 {
2225    struct radv_vs_output_info *outinfo;
2226 
2227    switch (ctx->stage) {
2228    case MESA_SHADER_FRAGMENT:
2229    case MESA_SHADER_COMPUTE:
2230    case MESA_SHADER_TESS_CTRL:
2231    case MESA_SHADER_GEOMETRY:
2232       return;
2233    case MESA_SHADER_VERTEX:
2234       if (ctx->args->shader_info->vs.as_ls ||
2235           ctx->args->shader_info->vs.as_es)
2236          return;
2237       outinfo = &ctx->args->shader_info->vs.outinfo;
2238       break;
2239    case MESA_SHADER_TESS_EVAL:
2240       if (ctx->args->shader_info->tes.as_es)
2241          return;
2242       outinfo = &ctx->args->shader_info->tes.outinfo;
2243       break;
2244    default:
2245       unreachable("Unhandled shader type");
2246    }
2247 
2248    ac_optimize_vs_outputs(&ctx->ac, ctx->main_function, outinfo->vs_output_param_offset,
2249                           VARYING_SLOT_MAX, 0, &outinfo->param_exports);
2250 }
2251 
2252 static void
ac_setup_rings(struct radv_shader_context * ctx)2253 ac_setup_rings(struct radv_shader_context *ctx)
2254 {
2255    if (ctx->args->options->chip_class <= GFX8 &&
2256        (ctx->stage == MESA_SHADER_GEOMETRY ||
2257         (ctx->stage == MESA_SHADER_VERTEX && ctx->args->shader_info->vs.as_es) ||
2258         (ctx->stage == MESA_SHADER_TESS_EVAL && ctx->args->shader_info->tes.as_es))) {
2259       unsigned ring = ctx->stage == MESA_SHADER_GEOMETRY ? RING_ESGS_GS : RING_ESGS_VS;
2260       LLVMValueRef offset = LLVMConstInt(ctx->ac.i32, ring, false);
2261 
2262       ctx->esgs_ring = ac_build_load_to_sgpr(&ctx->ac, ctx->ring_offsets, offset);
2263    }
2264 
2265    if (ctx->args->is_gs_copy_shader) {
2266       ctx->gsvs_ring[0] = ac_build_load_to_sgpr(&ctx->ac, ctx->ring_offsets,
2267                                                 LLVMConstInt(ctx->ac.i32, RING_GSVS_VS, false));
2268    }
2269 
2270    if (ctx->stage == MESA_SHADER_GEOMETRY) {
2271       /* The conceptual layout of the GSVS ring is
2272        *   v0c0 .. vLv0 v0c1 .. vLc1 ..
2273        * but the real memory layout is swizzled across
2274        * threads:
2275        *   t0v0c0 .. t15v0c0 t0v1c0 .. t15v1c0 ... t15vLcL
2276        *   t16v0c0 ..
2277        * Override the buffer descriptor accordingly.
2278        */
2279       LLVMTypeRef v2i64 = LLVMVectorType(ctx->ac.i64, 2);
2280       uint64_t stream_offset = 0;
2281       unsigned num_records = ctx->ac.wave_size;
2282       LLVMValueRef base_ring;
2283 
2284       base_ring = ac_build_load_to_sgpr(&ctx->ac, ctx->ring_offsets,
2285                                         LLVMConstInt(ctx->ac.i32, RING_GSVS_GS, false));
2286 
2287       for (unsigned stream = 0; stream < 4; stream++) {
2288          unsigned num_components, stride;
2289          LLVMValueRef ring, tmp;
2290 
2291          num_components = ctx->args->shader_info->gs.num_stream_output_components[stream];
2292 
2293          if (!num_components)
2294             continue;
2295 
2296          stride = 4 * num_components * ctx->shader->info.gs.vertices_out;
2297 
2298          /* Limit on the stride field for <= GFX7. */
2299          assert(stride < (1 << 14));
2300 
2301          ring = LLVMBuildBitCast(ctx->ac.builder, base_ring, v2i64, "");
2302          tmp = LLVMBuildExtractElement(ctx->ac.builder, ring, ctx->ac.i32_0, "");
2303          tmp = LLVMBuildAdd(ctx->ac.builder, tmp, LLVMConstInt(ctx->ac.i64, stream_offset, 0), "");
2304          ring = LLVMBuildInsertElement(ctx->ac.builder, ring, tmp, ctx->ac.i32_0, "");
2305 
2306          stream_offset += stride * ctx->ac.wave_size;
2307 
2308          ring = LLVMBuildBitCast(ctx->ac.builder, ring, ctx->ac.v4i32, "");
2309 
2310          tmp = LLVMBuildExtractElement(ctx->ac.builder, ring, ctx->ac.i32_1, "");
2311          tmp = LLVMBuildOr(ctx->ac.builder, tmp,
2312                            LLVMConstInt(ctx->ac.i32, S_008F04_STRIDE(stride), false), "");
2313          ring = LLVMBuildInsertElement(ctx->ac.builder, ring, tmp, ctx->ac.i32_1, "");
2314 
2315          ring = LLVMBuildInsertElement(ctx->ac.builder, ring,
2316                                        LLVMConstInt(ctx->ac.i32, num_records, false),
2317                                        LLVMConstInt(ctx->ac.i32, 2, false), "");
2318 
2319          ctx->gsvs_ring[stream] = ring;
2320       }
2321    }
2322 
2323    if (ctx->stage == MESA_SHADER_TESS_CTRL || ctx->stage == MESA_SHADER_TESS_EVAL) {
2324       ctx->hs_ring_tess_offchip = ac_build_load_to_sgpr(
2325          &ctx->ac, ctx->ring_offsets, LLVMConstInt(ctx->ac.i32, RING_HS_TESS_OFFCHIP, false));
2326       ctx->hs_ring_tess_factor = ac_build_load_to_sgpr(
2327          &ctx->ac, ctx->ring_offsets, LLVMConstInt(ctx->ac.i32, RING_HS_TESS_FACTOR, false));
2328    }
2329 }
2330 
2331 /* Fixup the HW not emitting the TCS regs if there are no HS threads. */
2332 static void
ac_nir_fixup_ls_hs_input_vgprs(struct radv_shader_context * ctx)2333 ac_nir_fixup_ls_hs_input_vgprs(struct radv_shader_context *ctx)
2334 {
2335    LLVMValueRef count =
2336       ac_unpack_param(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->ac.merged_wave_info), 8, 8);
2337    LLVMValueRef hs_empty = LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ, count, ctx->ac.i32_0, "");
2338    ctx->abi.instance_id =
2339       LLVMBuildSelect(ctx->ac.builder, hs_empty, ac_get_arg(&ctx->ac, ctx->args->ac.vertex_id),
2340                       ctx->abi.instance_id, "");
2341    ctx->vs_rel_patch_id =
2342       LLVMBuildSelect(ctx->ac.builder, hs_empty, ac_get_arg(&ctx->ac, ctx->args->ac.tcs_rel_ids),
2343                       ctx->vs_rel_patch_id, "");
2344    ctx->abi.vertex_id =
2345       LLVMBuildSelect(ctx->ac.builder, hs_empty, ac_get_arg(&ctx->ac, ctx->args->ac.tcs_patch_id),
2346                       ctx->abi.vertex_id, "");
2347 }
2348 
2349 static void
prepare_gs_input_vgprs(struct radv_shader_context * ctx,bool merged)2350 prepare_gs_input_vgprs(struct radv_shader_context *ctx, bool merged)
2351 {
2352    if (merged) {
2353       for (int i = 5; i >= 0; --i) {
2354          ctx->gs_vtx_offset[i] = ac_unpack_param(
2355             &ctx->ac, ac_get_arg(&ctx->ac, ctx->args->ac.gs_vtx_offset[i / 2]), (i & 1) * 16, 16);
2356       }
2357 
2358       ctx->gs_wave_id =
2359          ac_unpack_param(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->ac.merged_wave_info), 16, 8);
2360    } else {
2361       for (int i = 0; i < 6; i++)
2362          ctx->gs_vtx_offset[i] = ac_get_arg(&ctx->ac, ctx->args->ac.gs_vtx_offset[i]);
2363       ctx->gs_wave_id = ac_get_arg(&ctx->ac, ctx->args->ac.gs_wave_id);
2364    }
2365 }
2366 
2367 /* Ensure that the esgs ring is declared.
2368  *
2369  * We declare it with 64KB alignment as a hint that the
2370  * pointer value will always be 0.
2371  */
2372 static void
declare_esgs_ring(struct radv_shader_context * ctx)2373 declare_esgs_ring(struct radv_shader_context *ctx)
2374 {
2375    if (ctx->esgs_ring)
2376       return;
2377 
2378    assert(!LLVMGetNamedGlobal(ctx->ac.module, "esgs_ring"));
2379 
2380    ctx->esgs_ring = LLVMAddGlobalInAddressSpace(ctx->ac.module, LLVMArrayType(ctx->ac.i32, 0),
2381                                                 "esgs_ring", AC_ADDR_SPACE_LDS);
2382    LLVMSetLinkage(ctx->esgs_ring, LLVMExternalLinkage);
2383    LLVMSetAlignment(ctx->esgs_ring, 64 * 1024);
2384 }
2385 
2386 static LLVMModuleRef
ac_translate_nir_to_llvm(struct ac_llvm_compiler * ac_llvm,struct nir_shader * const * shaders,int shader_count,const struct radv_shader_args * args)2387 ac_translate_nir_to_llvm(struct ac_llvm_compiler *ac_llvm, struct nir_shader *const *shaders,
2388                          int shader_count, const struct radv_shader_args *args)
2389 {
2390    struct radv_shader_context ctx = {0};
2391    ctx.args = args;
2392 
2393    enum ac_float_mode float_mode = AC_FLOAT_MODE_DEFAULT;
2394 
2395    if (shaders[0]->info.float_controls_execution_mode & FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP32) {
2396       float_mode = AC_FLOAT_MODE_DENORM_FLUSH_TO_ZERO;
2397    }
2398 
2399    ac_llvm_context_init(&ctx.ac, ac_llvm, args->options->chip_class, args->options->family,
2400                         args->options->info, float_mode, args->shader_info->wave_size,
2401                         args->shader_info->ballot_bit_size);
2402    ctx.context = ctx.ac.context;
2403 
2404    ctx.max_workgroup_size = args->shader_info->workgroup_size;
2405 
2406    if (ctx.ac.chip_class >= GFX10) {
2407       if (is_pre_gs_stage(shaders[0]->info.stage) && args->shader_info->is_ngg) {
2408          ctx.max_workgroup_size = 128;
2409       }
2410    }
2411 
2412    create_function(&ctx, shaders[shader_count - 1]->info.stage, shader_count >= 2);
2413 
2414    ctx.abi.emit_outputs = handle_shader_outputs_post;
2415    ctx.abi.emit_vertex_with_counter = visit_emit_vertex_with_counter;
2416    ctx.abi.load_ubo = radv_load_ubo;
2417    ctx.abi.load_ssbo = radv_load_ssbo;
2418    ctx.abi.load_sampler_desc = radv_get_sampler_desc;
2419    ctx.abi.load_resource = radv_load_resource;
2420    ctx.abi.load_ring_tess_factors = load_ring_tess_factors;
2421    ctx.abi.load_ring_tess_offchip = load_ring_tess_offchip;
2422    ctx.abi.load_ring_esgs = load_ring_esgs;
2423    ctx.abi.clamp_shadow_reference = false;
2424    ctx.abi.adjust_frag_coord_z = args->options->adjust_frag_coord_z;
2425    ctx.abi.robust_buffer_access = args->options->robust_buffer_access;
2426 
2427    bool is_ngg = is_pre_gs_stage(shaders[0]->info.stage) && args->shader_info->is_ngg;
2428    if (shader_count >= 2 || is_ngg)
2429       ac_init_exec_full_mask(&ctx.ac);
2430 
2431    if (args->ac.vertex_id.used)
2432       ctx.abi.vertex_id = ac_get_arg(&ctx.ac, args->ac.vertex_id);
2433    if (args->ac.vs_rel_patch_id.used)
2434       ctx.vs_rel_patch_id = ac_get_arg(&ctx.ac, args->ac.vs_rel_patch_id);
2435    if (args->ac.instance_id.used)
2436       ctx.abi.instance_id = ac_get_arg(&ctx.ac, args->ac.instance_id);
2437 
2438    if (args->options->has_ls_vgpr_init_bug &&
2439        shaders[shader_count - 1]->info.stage == MESA_SHADER_TESS_CTRL)
2440       ac_nir_fixup_ls_hs_input_vgprs(&ctx);
2441 
2442    if (is_ngg) {
2443       /* Declare scratch space base for streamout and vertex
2444        * compaction. Whether space is actually allocated is
2445        * determined during linking / PM4 creation.
2446        *
2447        * Add an extra dword per vertex to ensure an odd stride, which
2448        * avoids bank conflicts for SoA accesses.
2449        */
2450       if (!args->shader_info->is_ngg_passthrough)
2451          declare_esgs_ring(&ctx);
2452 
2453       /* GFX10 hang workaround - there needs to be an s_barrier before gs_alloc_req always */
2454       if (ctx.ac.chip_class == GFX10 && shader_count == 1)
2455          ac_build_s_barrier(&ctx.ac);
2456    }
2457 
2458    for (int shader_idx = 0; shader_idx < shader_count; ++shader_idx) {
2459       ctx.stage = shaders[shader_idx]->info.stage;
2460       ctx.shader = shaders[shader_idx];
2461       ctx.output_mask = 0;
2462 
2463       if (shaders[shader_idx]->info.stage == MESA_SHADER_GEOMETRY) {
2464          for (int i = 0; i < 4; i++) {
2465             ctx.gs_next_vertex[i] = ac_build_alloca(&ctx.ac, ctx.ac.i32, "");
2466          }
2467          if (args->shader_info->is_ngg) {
2468             for (unsigned i = 0; i < 4; ++i) {
2469                ctx.gs_curprim_verts[i] = ac_build_alloca(&ctx.ac, ctx.ac.i32, "");
2470                ctx.gs_generated_prims[i] = ac_build_alloca(&ctx.ac, ctx.ac.i32, "");
2471             }
2472 
2473             LLVMTypeRef ai32 = LLVMArrayType(ctx.ac.i32, 8);
2474             ctx.gs_ngg_scratch =
2475                LLVMAddGlobalInAddressSpace(ctx.ac.module, ai32, "ngg_scratch", AC_ADDR_SPACE_LDS);
2476             LLVMSetInitializer(ctx.gs_ngg_scratch, LLVMGetUndef(ai32));
2477             LLVMSetAlignment(ctx.gs_ngg_scratch, 4);
2478 
2479             ctx.gs_ngg_emit = LLVMAddGlobalInAddressSpace(
2480                ctx.ac.module, LLVMArrayType(ctx.ac.i32, 0), "ngg_emit", AC_ADDR_SPACE_LDS);
2481             LLVMSetLinkage(ctx.gs_ngg_emit, LLVMExternalLinkage);
2482             LLVMSetAlignment(ctx.gs_ngg_emit, 4);
2483          }
2484 
2485          ctx.abi.emit_primitive = visit_end_primitive;
2486       } else if (shaders[shader_idx]->info.stage == MESA_SHADER_TESS_EVAL) {
2487       } else if (shaders[shader_idx]->info.stage == MESA_SHADER_VERTEX) {
2488          ctx.abi.load_base_vertex = radv_load_base_vertex;
2489          ctx.abi.load_inputs = radv_load_vs_inputs;
2490       } else if (shaders[shader_idx]->info.stage == MESA_SHADER_FRAGMENT) {
2491          ctx.abi.load_sample_position = load_sample_position;
2492          ctx.abi.load_sample_mask_in = load_sample_mask_in;
2493       }
2494 
2495       if (shaders[shader_idx]->info.stage == MESA_SHADER_VERTEX &&
2496           args->shader_info->is_ngg &&
2497           args->shader_info->vs.outinfo.export_prim_id) {
2498          declare_esgs_ring(&ctx);
2499       }
2500 
2501       bool nested_barrier = false;
2502 
2503       if (shader_idx) {
2504          if (shaders[shader_idx]->info.stage == MESA_SHADER_GEOMETRY &&
2505              args->shader_info->is_ngg) {
2506             gfx10_ngg_gs_emit_prologue(&ctx);
2507             nested_barrier = false;
2508          } else {
2509             nested_barrier = true;
2510          }
2511       }
2512 
2513       if (nested_barrier) {
2514          /* Execute a barrier before the second shader in
2515           * a merged shader.
2516           *
2517           * Execute the barrier inside the conditional block,
2518           * so that empty waves can jump directly to s_endpgm,
2519           * which will also signal the barrier.
2520           *
2521           * This is possible in gfx9, because an empty wave
2522           * for the second shader does not participate in
2523           * the epilogue. With NGG, empty waves may still
2524           * be required to export data (e.g. GS output vertices),
2525           * so we cannot let them exit early.
2526           *
2527           * If the shader is TCS and the TCS epilog is present
2528           * and contains a barrier, it will wait there and then
2529           * reach s_endpgm.
2530           */
2531          ac_emit_barrier(&ctx.ac, ctx.stage);
2532       }
2533 
2534       nir_foreach_shader_out_variable(variable, shaders[shader_idx]) scan_shader_output_decl(
2535          &ctx, variable, shaders[shader_idx], shaders[shader_idx]->info.stage);
2536 
2537       ac_setup_rings(&ctx);
2538 
2539       LLVMBasicBlockRef merge_block = NULL;
2540       if (shader_count >= 2 || is_ngg) {
2541          LLVMValueRef fn = LLVMGetBasicBlockParent(LLVMGetInsertBlock(ctx.ac.builder));
2542          LLVMBasicBlockRef then_block = LLVMAppendBasicBlockInContext(ctx.ac.context, fn, "");
2543          merge_block = LLVMAppendBasicBlockInContext(ctx.ac.context, fn, "");
2544 
2545          LLVMValueRef count = ac_unpack_param(
2546             &ctx.ac, ac_get_arg(&ctx.ac, args->ac.merged_wave_info), 8 * shader_idx, 8);
2547          LLVMValueRef thread_id = ac_get_thread_id(&ctx.ac);
2548          LLVMValueRef cond = LLVMBuildICmp(ctx.ac.builder, LLVMIntULT, thread_id, count, "");
2549          LLVMBuildCondBr(ctx.ac.builder, cond, then_block, merge_block);
2550 
2551          LLVMPositionBuilderAtEnd(ctx.ac.builder, then_block);
2552       }
2553 
2554       if (shaders[shader_idx]->info.stage == MESA_SHADER_FRAGMENT)
2555          prepare_interp_optimize(&ctx, shaders[shader_idx]);
2556       else if (shaders[shader_idx]->info.stage == MESA_SHADER_GEOMETRY)
2557          prepare_gs_input_vgprs(&ctx, shader_count >= 2);
2558 
2559       ac_nir_translate(&ctx.ac, &ctx.abi, &args->ac, shaders[shader_idx]);
2560 
2561       if (shader_count >= 2 || is_ngg) {
2562          LLVMBuildBr(ctx.ac.builder, merge_block);
2563          LLVMPositionBuilderAtEnd(ctx.ac.builder, merge_block);
2564       }
2565 
2566       /* This needs to be outside the if wrapping the shader body, as sometimes
2567        * the HW generates waves with 0 es/vs threads. */
2568       if (is_pre_gs_stage(shaders[shader_idx]->info.stage) &&
2569           args->shader_info->is_ngg && shader_idx == shader_count - 1) {
2570          handle_ngg_outputs_post_2(&ctx);
2571       } else if (shaders[shader_idx]->info.stage == MESA_SHADER_GEOMETRY &&
2572                  args->shader_info->is_ngg) {
2573          gfx10_ngg_gs_emit_epilogue_2(&ctx);
2574       }
2575    }
2576 
2577    LLVMBuildRetVoid(ctx.ac.builder);
2578 
2579    if (args->options->dump_preoptir) {
2580       fprintf(stderr, "%s LLVM IR:\n\n",
2581               radv_get_shader_name(args->shader_info, shaders[shader_count - 1]->info.stage));
2582       ac_dump_module(ctx.ac.module);
2583       fprintf(stderr, "\n");
2584    }
2585 
2586    ac_llvm_finalize_module(&ctx, ac_llvm->passmgr, args->options);
2587 
2588    if (shader_count == 1)
2589       ac_nir_eliminate_const_vs_outputs(&ctx);
2590 
2591    return ctx.ac.module;
2592 }
2593 
2594 static void
ac_diagnostic_handler(LLVMDiagnosticInfoRef di,void * context)2595 ac_diagnostic_handler(LLVMDiagnosticInfoRef di, void *context)
2596 {
2597    unsigned *retval = (unsigned *)context;
2598    LLVMDiagnosticSeverity severity = LLVMGetDiagInfoSeverity(di);
2599    char *description = LLVMGetDiagInfoDescription(di);
2600 
2601    if (severity == LLVMDSError) {
2602       *retval = 1;
2603       fprintf(stderr, "LLVM triggered Diagnostic Handler: %s\n", description);
2604    }
2605 
2606    LLVMDisposeMessage(description);
2607 }
2608 
2609 static unsigned
radv_llvm_compile(LLVMModuleRef M,char ** pelf_buffer,size_t * pelf_size,struct ac_llvm_compiler * ac_llvm)2610 radv_llvm_compile(LLVMModuleRef M, char **pelf_buffer, size_t *pelf_size,
2611                   struct ac_llvm_compiler *ac_llvm)
2612 {
2613    unsigned retval = 0;
2614    LLVMContextRef llvm_ctx;
2615 
2616    /* Setup Diagnostic Handler*/
2617    llvm_ctx = LLVMGetModuleContext(M);
2618 
2619    LLVMContextSetDiagnosticHandler(llvm_ctx, ac_diagnostic_handler, &retval);
2620 
2621    /* Compile IR*/
2622    if (!radv_compile_to_elf(ac_llvm, M, pelf_buffer, pelf_size))
2623       retval = 1;
2624    return retval;
2625 }
2626 
2627 static void
ac_compile_llvm_module(struct ac_llvm_compiler * ac_llvm,LLVMModuleRef llvm_module,struct radv_shader_binary ** rbinary,gl_shader_stage stage,const char * name,const struct radv_nir_compiler_options * options)2628 ac_compile_llvm_module(struct ac_llvm_compiler *ac_llvm, LLVMModuleRef llvm_module,
2629                        struct radv_shader_binary **rbinary, gl_shader_stage stage, const char *name,
2630                        const struct radv_nir_compiler_options *options)
2631 {
2632    char *elf_buffer = NULL;
2633    size_t elf_size = 0;
2634    char *llvm_ir_string = NULL;
2635 
2636    if (options->dump_shader) {
2637       fprintf(stderr, "%s LLVM IR:\n\n", name);
2638       ac_dump_module(llvm_module);
2639       fprintf(stderr, "\n");
2640    }
2641 
2642    if (options->record_ir) {
2643       char *llvm_ir = LLVMPrintModuleToString(llvm_module);
2644       llvm_ir_string = strdup(llvm_ir);
2645       LLVMDisposeMessage(llvm_ir);
2646    }
2647 
2648    int v = radv_llvm_compile(llvm_module, &elf_buffer, &elf_size, ac_llvm);
2649    if (v) {
2650       fprintf(stderr, "compile failed\n");
2651    }
2652 
2653    LLVMContextRef ctx = LLVMGetModuleContext(llvm_module);
2654    LLVMDisposeModule(llvm_module);
2655    LLVMContextDispose(ctx);
2656 
2657    size_t llvm_ir_size = llvm_ir_string ? strlen(llvm_ir_string) : 0;
2658    size_t alloc_size = sizeof(struct radv_shader_binary_rtld) + elf_size + llvm_ir_size + 1;
2659    struct radv_shader_binary_rtld *rbin = calloc(1, alloc_size);
2660    memcpy(rbin->data, elf_buffer, elf_size);
2661    if (llvm_ir_string)
2662       memcpy(rbin->data + elf_size, llvm_ir_string, llvm_ir_size + 1);
2663 
2664    rbin->base.type = RADV_BINARY_TYPE_RTLD;
2665    rbin->base.stage = stage;
2666    rbin->base.total_size = alloc_size;
2667    rbin->elf_size = elf_size;
2668    rbin->llvm_ir_size = llvm_ir_size;
2669    *rbinary = &rbin->base;
2670 
2671    free(llvm_ir_string);
2672    free(elf_buffer);
2673 }
2674 
2675 static void
radv_compile_nir_shader(struct ac_llvm_compiler * ac_llvm,struct radv_shader_binary ** rbinary,const struct radv_shader_args * args,struct nir_shader * const * nir,int nir_count)2676 radv_compile_nir_shader(struct ac_llvm_compiler *ac_llvm, struct radv_shader_binary **rbinary,
2677                         const struct radv_shader_args *args, struct nir_shader *const *nir,
2678                         int nir_count)
2679 {
2680 
2681    LLVMModuleRef llvm_module;
2682 
2683    llvm_module = ac_translate_nir_to_llvm(ac_llvm, nir, nir_count, args);
2684 
2685    ac_compile_llvm_module(ac_llvm, llvm_module, rbinary, nir[nir_count - 1]->info.stage,
2686                           radv_get_shader_name(args->shader_info, nir[nir_count - 1]->info.stage),
2687                           args->options);
2688 }
2689 
2690 static void
ac_gs_copy_shader_emit(struct radv_shader_context * ctx)2691 ac_gs_copy_shader_emit(struct radv_shader_context *ctx)
2692 {
2693    LLVMValueRef vtx_offset =
2694       LLVMBuildMul(ctx->ac.builder, ac_get_arg(&ctx->ac, ctx->args->ac.vertex_id),
2695                    LLVMConstInt(ctx->ac.i32, 4, false), "");
2696    LLVMValueRef stream_id;
2697 
2698    /* Fetch the vertex stream ID. */
2699    if (ctx->args->shader_info->so.num_outputs) {
2700       stream_id =
2701          ac_unpack_param(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->ac.streamout_config), 24, 2);
2702    } else {
2703       stream_id = ctx->ac.i32_0;
2704    }
2705 
2706    LLVMBasicBlockRef end_bb;
2707    LLVMValueRef switch_inst;
2708 
2709    end_bb = LLVMAppendBasicBlockInContext(ctx->ac.context, ctx->main_function, "end");
2710    switch_inst = LLVMBuildSwitch(ctx->ac.builder, stream_id, end_bb, 4);
2711 
2712    for (unsigned stream = 0; stream < 4; stream++) {
2713       unsigned num_components = ctx->args->shader_info->gs.num_stream_output_components[stream];
2714       LLVMBasicBlockRef bb;
2715       unsigned offset;
2716 
2717       if (stream > 0 && !num_components)
2718          continue;
2719 
2720       if (stream > 0 && !ctx->args->shader_info->so.num_outputs)
2721          continue;
2722 
2723       bb = LLVMInsertBasicBlockInContext(ctx->ac.context, end_bb, "out");
2724       LLVMAddCase(switch_inst, LLVMConstInt(ctx->ac.i32, stream, 0), bb);
2725       LLVMPositionBuilderAtEnd(ctx->ac.builder, bb);
2726 
2727       offset = 0;
2728       for (unsigned i = 0; i < AC_LLVM_MAX_OUTPUTS; ++i) {
2729          unsigned output_usage_mask = ctx->args->shader_info->gs.output_usage_mask[i];
2730          unsigned output_stream = ctx->args->shader_info->gs.output_streams[i];
2731          int length = util_last_bit(output_usage_mask);
2732 
2733          if (!(ctx->output_mask & (1ull << i)) || output_stream != stream)
2734             continue;
2735 
2736          for (unsigned j = 0; j < length; j++) {
2737             LLVMValueRef value, soffset;
2738 
2739             if (!(output_usage_mask & (1 << j)))
2740                continue;
2741 
2742             soffset = LLVMConstInt(ctx->ac.i32, offset * ctx->shader->info.gs.vertices_out * 16 * 4,
2743                                    false);
2744 
2745             offset++;
2746 
2747             value = ac_build_buffer_load(&ctx->ac, ctx->gsvs_ring[0], 1, ctx->ac.i32_0, vtx_offset,
2748                                          soffset, 0, ctx->ac.f32, ac_glc | ac_slc, true, false);
2749 
2750             LLVMTypeRef type = LLVMGetAllocatedType(ctx->abi.outputs[ac_llvm_reg_index_soa(i, j)]);
2751             if (ac_get_type_size(type) == 2) {
2752                value = LLVMBuildBitCast(ctx->ac.builder, value, ctx->ac.i32, "");
2753                value = LLVMBuildTrunc(ctx->ac.builder, value, ctx->ac.i16, "");
2754             }
2755 
2756             LLVMBuildStore(ctx->ac.builder, ac_to_float(&ctx->ac, value),
2757                            ctx->abi.outputs[ac_llvm_reg_index_soa(i, j)]);
2758          }
2759       }
2760 
2761       if (ctx->args->shader_info->so.num_outputs)
2762          radv_emit_streamout(ctx, stream);
2763 
2764       if (stream == 0) {
2765          handle_vs_outputs_post(ctx, false, ctx->args->shader_info->vs.outinfo.export_clip_dists,
2766                                 &ctx->args->shader_info->vs.outinfo);
2767       }
2768 
2769       LLVMBuildBr(ctx->ac.builder, end_bb);
2770    }
2771 
2772    LLVMPositionBuilderAtEnd(ctx->ac.builder, end_bb);
2773 }
2774 
2775 static void
radv_compile_gs_copy_shader(struct ac_llvm_compiler * ac_llvm,struct nir_shader * geom_shader,struct radv_shader_binary ** rbinary,const struct radv_shader_args * args)2776 radv_compile_gs_copy_shader(struct ac_llvm_compiler *ac_llvm, struct nir_shader *geom_shader,
2777                             struct radv_shader_binary **rbinary,
2778                             const struct radv_shader_args *args)
2779 {
2780    struct radv_shader_context ctx = {0};
2781    ctx.args = args;
2782 
2783    assert(args->is_gs_copy_shader);
2784 
2785    ac_llvm_context_init(&ctx.ac, ac_llvm, args->options->chip_class, args->options->family,
2786                         args->options->info, AC_FLOAT_MODE_DEFAULT, 64, 64);
2787    ctx.context = ctx.ac.context;
2788 
2789    ctx.stage = MESA_SHADER_VERTEX;
2790    ctx.shader = geom_shader;
2791 
2792    create_function(&ctx, MESA_SHADER_VERTEX, false);
2793 
2794    ac_setup_rings(&ctx);
2795 
2796    nir_foreach_shader_out_variable(variable, geom_shader)
2797    {
2798       scan_shader_output_decl(&ctx, variable, geom_shader, MESA_SHADER_VERTEX);
2799       ac_handle_shader_output_decl(&ctx.ac, &ctx.abi, geom_shader, variable, MESA_SHADER_VERTEX);
2800    }
2801 
2802    ac_gs_copy_shader_emit(&ctx);
2803 
2804    LLVMBuildRetVoid(ctx.ac.builder);
2805 
2806    ac_llvm_finalize_module(&ctx, ac_llvm->passmgr, args->options);
2807 
2808    ac_compile_llvm_module(ac_llvm, ctx.ac.module, rbinary, MESA_SHADER_VERTEX, "GS Copy Shader",
2809                           args->options);
2810    (*rbinary)->is_gs_copy_shader = true;
2811 }
2812 
2813 void
llvm_compile_shader(struct radv_device * device,unsigned shader_count,struct nir_shader * const * shaders,struct radv_shader_binary ** binary,struct radv_shader_args * args)2814 llvm_compile_shader(struct radv_device *device, unsigned shader_count,
2815                     struct nir_shader *const *shaders, struct radv_shader_binary **binary,
2816                     struct radv_shader_args *args)
2817 {
2818    enum ac_target_machine_options tm_options = 0;
2819    struct ac_llvm_compiler ac_llvm;
2820 
2821    tm_options |= AC_TM_SUPPORTS_SPILL;
2822    if (args->options->check_ir)
2823       tm_options |= AC_TM_CHECK_IR;
2824 
2825    radv_init_llvm_compiler(&ac_llvm, args->options->family, tm_options,
2826                            args->shader_info->wave_size);
2827 
2828    if (args->is_gs_copy_shader) {
2829       radv_compile_gs_copy_shader(&ac_llvm, *shaders, binary, args);
2830    } else {
2831       radv_compile_nir_shader(&ac_llvm, binary, args, shaders, shader_count);
2832    }
2833 }
2834