1 /*
2 * Copyright © 2016 Red Hat.
3 * Copyright © 2016 Bas Nieuwenhuizen
4 *
5 * based in part on anv driver which is:
6 * Copyright © 2015 Intel Corporation
7 *
8 * Permission is hereby granted, free of charge, to any person obtaining a
9 * copy of this software and associated documentation files (the "Software"),
10 * to deal in the Software without restriction, including without limitation
11 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
12 * and/or sell copies of the Software, and to permit persons to whom the
13 * Software is furnished to do so, subject to the following conditions:
14 *
15 * The above copyright notice and this permission notice (including the next
16 * paragraph) shall be included in all copies or substantial portions of the
17 * Software.
18 *
19 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
22 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
25 * IN THE SOFTWARE.
26 */
27
28 #include "nir/nir.h"
29 #include "radv_debug.h"
30 #include "radv_llvm_helper.h"
31 #include "radv_private.h"
32 #include "radv_shader.h"
33 #include "radv_shader_args.h"
34
35 #include "ac_binary.h"
36 #include "ac_exp_param.h"
37 #include "ac_llvm_build.h"
38 #include "ac_nir_to_llvm.h"
39 #include "ac_shader_abi.h"
40 #include "ac_shader_util.h"
41 #include "sid.h"
42
43 struct radv_shader_context {
44 struct ac_llvm_context ac;
45 const struct nir_shader *shader;
46 struct ac_shader_abi abi;
47 const struct radv_shader_args *args;
48
49 gl_shader_stage stage;
50
51 unsigned max_workgroup_size;
52 LLVMContextRef context;
53 LLVMValueRef main_function;
54
55 LLVMValueRef descriptor_sets[MAX_SETS];
56
57 LLVMValueRef ring_offsets;
58
59 LLVMValueRef vs_rel_patch_id;
60
61 LLVMValueRef gs_wave_id;
62 LLVMValueRef gs_vtx_offset[6];
63
64 LLVMValueRef esgs_ring;
65 LLVMValueRef gsvs_ring[4];
66 LLVMValueRef hs_ring_tess_offchip;
67 LLVMValueRef hs_ring_tess_factor;
68
69 uint64_t output_mask;
70
71 LLVMValueRef gs_next_vertex[4];
72 LLVMValueRef gs_curprim_verts[4];
73 LLVMValueRef gs_generated_prims[4];
74 LLVMValueRef gs_ngg_emit;
75 LLVMValueRef gs_ngg_scratch;
76
77 LLVMValueRef vertexptr; /* GFX10 only */
78 };
79
80 struct radv_shader_output_values {
81 LLVMValueRef values[4];
82 unsigned slot_name;
83 unsigned slot_index;
84 unsigned usage_mask;
85 };
86
87 static inline struct radv_shader_context *
88 radv_shader_context_from_abi(struct ac_shader_abi *abi)
89 {
90 return container_of(abi, struct radv_shader_context, abi);
91 }
92
93 static LLVMValueRef
94 create_llvm_function(struct ac_llvm_context *ctx, LLVMModuleRef module, LLVMBuilderRef builder,
95 const struct ac_shader_args *args, enum ac_llvm_calling_convention convention,
96 unsigned max_workgroup_size, const struct radv_nir_compiler_options *options)
97 {
98 LLVMValueRef main_function = ac_build_main(args, ctx, convention, "main", ctx->voidt, module);
99
100 if (options->address32_hi) {
101 ac_llvm_add_target_dep_function_attr(main_function, "amdgpu-32bit-address-high-bits",
102 options->address32_hi);
103 }
104
105 ac_llvm_set_workgroup_size(main_function, max_workgroup_size);
106 ac_llvm_set_target_features(main_function, ctx);
107
108 return main_function;
109 }
110
111 static void
112 load_descriptor_sets(struct radv_shader_context *ctx)
113 {
114 struct radv_userdata_locations *user_sgprs_locs = &ctx->args->shader_info->user_sgprs_locs;
115 uint32_t mask = ctx->args->shader_info->desc_set_used_mask;
116
117 if (user_sgprs_locs->shader_data[AC_UD_INDIRECT_DESCRIPTOR_SETS].sgpr_idx != -1) {
118 LLVMValueRef desc_sets = ac_get_arg(&ctx->ac, ctx->args->descriptor_sets[0]);
119 while (mask) {
120 int i = u_bit_scan(&mask);
121
122 ctx->descriptor_sets[i] =
123 ac_build_load_to_sgpr(&ctx->ac, desc_sets, LLVMConstInt(ctx->ac.i32, i, false));
124 LLVMSetAlignment(ctx->descriptor_sets[i], 4);
125 }
126 } else {
127 while (mask) {
128 int i = u_bit_scan(&mask);
129
130 ctx->descriptor_sets[i] = ac_get_arg(&ctx->ac, ctx->args->descriptor_sets[i]);
131 }
132 }
133 }
134
135 static enum ac_llvm_calling_convention
136 get_llvm_calling_convention(LLVMValueRef func, gl_shader_stage stage)
137 {
138 switch (stage) {
139 case MESA_SHADER_VERTEX:
140 case MESA_SHADER_TESS_EVAL:
141 return AC_LLVM_AMDGPU_VS;
142 break;
143 case MESA_SHADER_GEOMETRY:
144 return AC_LLVM_AMDGPU_GS;
145 break;
146 case MESA_SHADER_TESS_CTRL:
147 return AC_LLVM_AMDGPU_HS;
148 break;
149 case MESA_SHADER_FRAGMENT:
150 return AC_LLVM_AMDGPU_PS;
151 break;
152 case MESA_SHADER_COMPUTE:
153 return AC_LLVM_AMDGPU_CS;
154 break;
155 default:
156 unreachable("Unhandled shader type");
157 }
158 }
159
160 /* Returns whether the stage can come directly before the GS. */
161 static bool
162 is_pre_gs_stage(gl_shader_stage stage)
163 {
164 return stage == MESA_SHADER_VERTEX || stage == MESA_SHADER_TESS_EVAL;
165 }
166
167 static void
168 create_function(struct radv_shader_context *ctx, gl_shader_stage stage, bool has_previous_stage)
169 {
170 if (ctx->ac.chip_class >= GFX10) {
171 if (is_pre_gs_stage(stage) && ctx->args->shader_info->is_ngg) {
172 /* On GFX10, VS or TES is merged into GS for NGG. */
173 stage = MESA_SHADER_GEOMETRY;
174 has_previous_stage = true;
175 }
176 }
177
178 ctx->main_function =
179 create_llvm_function(&ctx->ac, ctx->ac.module, ctx->ac.builder, &ctx->args->ac,
180 get_llvm_calling_convention(ctx->main_function, stage),
181 ctx->max_workgroup_size, ctx->args->options);
182
183 ctx->ring_offsets = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.implicit.buffer.ptr",
184 LLVMPointerType(ctx->ac.i8, AC_ADDR_SPACE_CONST), NULL, 0,
185 AC_FUNC_ATTR_READNONE);
186 ctx->ring_offsets = LLVMBuildBitCast(ctx->ac.builder, ctx->ring_offsets,
187 ac_array_in_const_addr_space(ctx->ac.v4i32), "");
188
189 load_descriptor_sets(ctx);
190
191 if (stage == MESA_SHADER_TESS_CTRL ||
192 (stage == MESA_SHADER_VERTEX && ctx->args->shader_info->vs.as_ls) ||
193 /* GFX9 has the ESGS ring buffer in LDS. */
194 (stage == MESA_SHADER_GEOMETRY && has_previous_stage)) {
195 ac_declare_lds_as_pointer(&ctx->ac);
196 }
197 }
198
199 static LLVMValueRef
200 radv_load_resource(struct ac_shader_abi *abi, LLVMValueRef index, unsigned desc_set,
201 unsigned binding)
202 {
203 struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);
204 LLVMValueRef desc_ptr = ctx->descriptor_sets[desc_set];
205 struct radv_pipeline_layout *pipeline_layout = ctx->args->options->layout;
206 struct radv_descriptor_set_layout *layout = pipeline_layout->set[desc_set].layout;
207 unsigned base_offset = layout->binding[binding].offset;
208 LLVMValueRef offset, stride;
209
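/* Dynamic UBO/SSBO descriptors live right after the push constants in the
 * push-constant user data, 16 bytes per descriptor, hence the base offset
 * and stride computed below.
 */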
210 if (layout->binding[binding].type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC ||
211 layout->binding[binding].type == VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC) {
212 unsigned idx = pipeline_layout->set[desc_set].dynamic_offset_start +
213 layout->binding[binding].dynamic_offset_offset;
214 desc_ptr = ac_get_arg(&ctx->ac, ctx->args->ac.push_constants);
215 base_offset = pipeline_layout->push_constant_size + 16 * idx;
216 stride = LLVMConstInt(ctx->ac.i32, 16, false);
217 } else
218 stride = LLVMConstInt(ctx->ac.i32, layout->binding[binding].size, false);
219
220 offset = LLVMConstInt(ctx->ac.i32, base_offset, false);
221
222 if (layout->binding[binding].type != VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT) {
223 offset = ac_build_imad(&ctx->ac, index, stride, offset);
224 }
225
226 desc_ptr = LLVMBuildPtrToInt(ctx->ac.builder, desc_ptr, ctx->ac.i32, "");
227
228 LLVMValueRef res[] = {desc_ptr, offset, ctx->ac.i32_0};
229 return ac_build_gather_values(&ctx->ac, res, 3);
230 }
231
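/* Illustrative note (assumed ring layout, not defined in this file): the PS
 * sample-positions ring is expected to pack the 1x, 2x, 4x and 8x position
 * tables back to back, which is why 2/4/8 samples map to offsets 1/3/7.
 */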
232 static uint32_t
233 radv_get_sample_pos_offset(uint32_t num_samples)
234 {
235 uint32_t sample_pos_offset = 0;
236
237 switch (num_samples) {
238 case 2:
239 sample_pos_offset = 1;
240 break;
241 case 4:
242 sample_pos_offset = 3;
243 break;
244 case 8:
245 sample_pos_offset = 7;
246 break;
247 default:
248 break;
249 }
250 return sample_pos_offset;
251 }
252
253 static LLVMValueRef
254 load_sample_position(struct ac_shader_abi *abi, LLVMValueRef sample_id)
255 {
256 struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);
257
258 LLVMValueRef result;
259 LLVMValueRef index = LLVMConstInt(ctx->ac.i32, RING_PS_SAMPLE_POSITIONS, false);
260 LLVMValueRef ptr = LLVMBuildGEP(ctx->ac.builder, ctx->ring_offsets, &index, 1, "");
261
262 ptr = LLVMBuildBitCast(ctx->ac.builder, ptr, ac_array_in_const_addr_space(ctx->ac.v2f32), "");
263
264 uint32_t sample_pos_offset = radv_get_sample_pos_offset(ctx->args->options->key.ps.num_samples);
265
266 sample_id = LLVMBuildAdd(ctx->ac.builder, sample_id,
267 LLVMConstInt(ctx->ac.i32, sample_pos_offset, false), "");
268 result = ac_build_load_invariant(&ctx->ac, ptr, sample_id);
269
270 return result;
271 }
272
273 static LLVMValueRef
274 load_sample_mask_in(struct ac_shader_abi *abi)
275 {
276 struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);
277 uint8_t log2_ps_iter_samples;
278
279 if (ctx->args->shader_info->ps.uses_sample_shading) {
280 log2_ps_iter_samples = util_logbase2(ctx->args->options->key.ps.num_samples);
281 } else {
282 log2_ps_iter_samples = ctx->args->options->key.ps.log2_ps_iter_samples;
283 }
284
285 LLVMValueRef result, sample_id;
286 if (log2_ps_iter_samples) {
287 /* gl_SampleMaskIn[0] = (SampleCoverage & (1 << gl_SampleID)). */
288 sample_id = ac_unpack_param(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->ac.ancillary), 8, 4);
289 sample_id = LLVMBuildShl(ctx->ac.builder, LLVMConstInt(ctx->ac.i32, 1, false), sample_id, "");
290 result = LLVMBuildAnd(ctx->ac.builder, sample_id,
291 ac_get_arg(&ctx->ac, ctx->args->ac.sample_coverage), "");
292 } else {
293 result = ac_get_arg(&ctx->ac, ctx->args->ac.sample_coverage);
294 }
295
296 return result;
297 }
298
299 static void gfx10_ngg_gs_emit_vertex(struct radv_shader_context *ctx, unsigned stream,
300 LLVMValueRef vertexidx, LLVMValueRef *addrs);
301
302 static void
303 visit_emit_vertex_with_counter(struct ac_shader_abi *abi, unsigned stream, LLVMValueRef vertexidx,
304 LLVMValueRef *addrs)
305 {
306 unsigned offset = 0;
307 struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);
308
309 if (ctx->args->shader_info->is_ngg) {
310 gfx10_ngg_gs_emit_vertex(ctx, stream, vertexidx, addrs);
311 return;
312 }
313
314 for (unsigned i = 0; i < AC_LLVM_MAX_OUTPUTS; ++i) {
315 unsigned output_usage_mask = ctx->args->shader_info->gs.output_usage_mask[i];
316 uint8_t output_stream = ctx->args->shader_info->gs.output_streams[i];
317 LLVMValueRef *out_ptr = &addrs[i * 4];
318 int length = util_last_bit(output_usage_mask);
319
320 if (!(ctx->output_mask & (1ull << i)) || output_stream != stream)
321 continue;
322
323 for (unsigned j = 0; j < length; j++) {
324 if (!(output_usage_mask & (1 << j)))
325 continue;
326
327 LLVMValueRef out_val = LLVMBuildLoad(ctx->ac.builder, out_ptr[j], "");
328 LLVMValueRef voffset =
329 LLVMConstInt(ctx->ac.i32, offset * ctx->shader->info.gs.vertices_out, false);
330
331 offset++;
332
333 voffset = LLVMBuildAdd(ctx->ac.builder, voffset, vertexidx, "");
334 voffset = LLVMBuildMul(ctx->ac.builder, voffset, LLVMConstInt(ctx->ac.i32, 4, false), "");
335
336 out_val = ac_to_integer(&ctx->ac, out_val);
337 out_val = LLVMBuildZExtOrBitCast(ctx->ac.builder, out_val, ctx->ac.i32, "");
338
339 ac_build_buffer_store_dword(&ctx->ac, ctx->gsvs_ring[stream], out_val, 1, voffset,
340 ac_get_arg(&ctx->ac, ctx->args->ac.gs2vs_offset), 0,
341 ac_glc | ac_slc | ac_swizzled);
342 }
343 }
344
345 ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_EMIT | AC_SENDMSG_GS | (stream << 8),
346 ctx->gs_wave_id);
347 }
348
349 static void
350 visit_end_primitive(struct ac_shader_abi *abi, unsigned stream)
351 {
352 struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);
353
354 if (ctx->args->shader_info->is_ngg) {
355 LLVMBuildStore(ctx->ac.builder, ctx->ac.i32_0, ctx->gs_curprim_verts[stream]);
356 return;
357 }
358
359 ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_CUT | AC_SENDMSG_GS | (stream << 8),
360 ctx->gs_wave_id);
361 }
362
363 static LLVMValueRef
364 load_ring_tess_factors(struct ac_shader_abi *abi)
365 {
366 struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);
367 assert(ctx->stage == MESA_SHADER_TESS_CTRL);
368
369 return ctx->hs_ring_tess_factor;
370 }
371
372 static LLVMValueRef
373 load_ring_tess_offchip(struct ac_shader_abi *abi)
374 {
375 struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);
376 assert(ctx->stage == MESA_SHADER_TESS_CTRL || ctx->stage == MESA_SHADER_TESS_EVAL);
377
378 return ctx->hs_ring_tess_offchip;
379 }
380
381 static LLVMValueRef
382 load_ring_esgs(struct ac_shader_abi *abi)
383 {
384 struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);
385 assert(ctx->stage == MESA_SHADER_VERTEX || ctx->stage == MESA_SHADER_TESS_EVAL ||
386 ctx->stage == MESA_SHADER_GEOMETRY);
387
388 return ctx->esgs_ring;
389 }
390
391 static LLVMValueRef
392 radv_load_base_vertex(struct ac_shader_abi *abi, bool non_indexed_is_zero)
393 {
394 struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);
395 return ac_get_arg(&ctx->ac, ctx->args->ac.base_vertex);
396 }
397
398 static LLVMValueRef
399 get_desc_ptr(struct radv_shader_context *ctx, LLVMValueRef ptr, bool non_uniform)
400 {
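/* The incoming "ptr" is the { set base, offset, 0 } vector built by
 * radv_load_resource(); fold the base and offset into a single 32-bit
 * constant address before converting it into a real pointer.
 */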
401 LLVMValueRef set_ptr = ac_llvm_extract_elem(&ctx->ac, ptr, 0);
402 LLVMValueRef offset = ac_llvm_extract_elem(&ctx->ac, ptr, 1);
403 ptr = LLVMBuildNUWAdd(ctx->ac.builder, set_ptr, offset, "");
404
405 unsigned addr_space = AC_ADDR_SPACE_CONST_32BIT;
406 if (non_uniform) {
407 /* 32-bit seems to always use SMEM. addrspacecast from 32-bit -> 64-bit is broken. */
408 LLVMValueRef dwords[] = {ptr,
409 LLVMConstInt(ctx->ac.i32, ctx->args->options->address32_hi, false)};
410 ptr = ac_build_gather_values(&ctx->ac, dwords, 2);
411 ptr = LLVMBuildBitCast(ctx->ac.builder, ptr, ctx->ac.i64, "");
412 addr_space = AC_ADDR_SPACE_CONST;
413 }
414 return LLVMBuildIntToPtr(ctx->ac.builder, ptr, LLVMPointerType(ctx->ac.v4i32, addr_space), "");
415 }
416
417 static LLVMValueRef
418 radv_load_ssbo(struct ac_shader_abi *abi, LLVMValueRef buffer_ptr, bool write, bool non_uniform)
419 {
420 struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);
421 LLVMValueRef result;
422
423 buffer_ptr = get_desc_ptr(ctx, buffer_ptr, non_uniform);
424 if (!non_uniform)
425 LLVMSetMetadata(buffer_ptr, ctx->ac.uniform_md_kind, ctx->ac.empty_md);
426
427 result = LLVMBuildLoad(ctx->ac.builder, buffer_ptr, "");
428 LLVMSetMetadata(result, ctx->ac.invariant_load_md_kind, ctx->ac.empty_md);
429 LLVMSetAlignment(result, 4);
430
431 return result;
432 }
433
434 static LLVMValueRef
435 radv_load_ubo(struct ac_shader_abi *abi, unsigned desc_set, unsigned binding, bool valid_binding,
436 LLVMValueRef buffer_ptr)
437 {
438 struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);
439 LLVMValueRef result;
440
441 if (valid_binding) {
442 struct radv_pipeline_layout *pipeline_layout = ctx->args->options->layout;
443 struct radv_descriptor_set_layout *layout = pipeline_layout->set[desc_set].layout;
444
445 if (layout->binding[binding].type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT) {
446 LLVMValueRef set_ptr = ac_llvm_extract_elem(&ctx->ac, buffer_ptr, 0);
447 LLVMValueRef offset = ac_llvm_extract_elem(&ctx->ac, buffer_ptr, 1);
448 buffer_ptr = LLVMBuildNUWAdd(ctx->ac.builder, set_ptr, offset, "");
449
450 uint32_t desc_type =
451 S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
452 S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);
453
454 if (ctx->ac.chip_class >= GFX10) {
455 desc_type |= S_008F0C_FORMAT(V_008F0C_GFX10_FORMAT_32_FLOAT) |
456 S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) | S_008F0C_RESOURCE_LEVEL(1);
457 } else {
458 desc_type |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
459 S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
460 }
461
462 LLVMValueRef desc_components[4] = {
463 LLVMBuildPtrToInt(ctx->ac.builder, buffer_ptr, ctx->ac.intptr, ""),
464 LLVMConstInt(ctx->ac.i32, S_008F04_BASE_ADDRESS_HI(ctx->args->options->address32_hi),
465 false),
466 LLVMConstInt(ctx->ac.i32, 0xffffffff, false),
467 LLVMConstInt(ctx->ac.i32, desc_type, false),
468 };
469
470 return ac_build_gather_values(&ctx->ac, desc_components, 4);
471 }
472 }
473
474 buffer_ptr = get_desc_ptr(ctx, buffer_ptr, false);
475 LLVMSetMetadata(buffer_ptr, ctx->ac.uniform_md_kind, ctx->ac.empty_md);
476
477 result = LLVMBuildLoad(ctx->ac.builder, buffer_ptr, "");
478 LLVMSetMetadata(result, ctx->ac.invariant_load_md_kind, ctx->ac.empty_md);
479 LLVMSetAlignment(result, 4);
480
481 return result;
482 }
483
484 static LLVMValueRef
485 radv_get_sampler_desc(struct ac_shader_abi *abi, unsigned descriptor_set, unsigned base_index,
486 unsigned constant_index, LLVMValueRef index,
487 enum ac_descriptor_type desc_type, bool image, bool write, bool bindless)
488 {
489 struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);
490 LLVMValueRef list = ctx->descriptor_sets[descriptor_set];
491 struct radv_descriptor_set_layout *layout =
492 ctx->args->options->layout->set[descriptor_set].layout;
493 struct radv_descriptor_set_binding_layout *binding = layout->binding + base_index;
494 unsigned offset = binding->offset;
495 unsigned stride = binding->size;
496 unsigned type_size;
497 LLVMBuilderRef builder = ctx->ac.builder;
498 LLVMTypeRef type;
499
500 assert(base_index < layout->binding_count);
501
502 if (binding->type == VK_DESCRIPTOR_TYPE_STORAGE_IMAGE && desc_type == AC_DESC_FMASK)
503 return NULL;
504
505 switch (desc_type) {
506 case AC_DESC_IMAGE:
507 type = ctx->ac.v8i32;
508 type_size = 32;
509 break;
510 case AC_DESC_FMASK:
511 type = ctx->ac.v8i32;
512 offset += 32;
513 type_size = 32;
514 break;
515 case AC_DESC_SAMPLER:
516 type = ctx->ac.v4i32;
517 if (binding->type == VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER) {
518 offset += radv_combined_image_descriptor_sampler_offset(binding);
519 }
520
521 type_size = 16;
522 break;
523 case AC_DESC_BUFFER:
524 type = ctx->ac.v4i32;
525 type_size = 16;
526 break;
527 case AC_DESC_PLANE_0:
528 case AC_DESC_PLANE_1:
529 case AC_DESC_PLANE_2:
530 type = ctx->ac.v8i32;
531 type_size = 32;
532 offset += 32 * (desc_type - AC_DESC_PLANE_0);
533 break;
534 default:
535 unreachable("invalid desc_type\n");
536 }
537
538 offset += constant_index * stride;
539
540 if (desc_type == AC_DESC_SAMPLER && binding->immutable_samplers_offset &&
541 (!index || binding->immutable_samplers_equal)) {
542 if (binding->immutable_samplers_equal)
543 constant_index = 0;
544
545 const uint32_t *samplers = radv_immutable_samplers(layout, binding);
546
547 LLVMValueRef constants[] = {
548 LLVMConstInt(ctx->ac.i32, samplers[constant_index * 4 + 0], 0),
549 LLVMConstInt(ctx->ac.i32, samplers[constant_index * 4 + 1], 0),
550 LLVMConstInt(ctx->ac.i32, samplers[constant_index * 4 + 2], 0),
551 LLVMConstInt(ctx->ac.i32, samplers[constant_index * 4 + 3], 0),
552 };
553 return ac_build_gather_values(&ctx->ac, constants, 4);
554 }
555
556 assert(stride % type_size == 0);
557
558 LLVMValueRef adjusted_index = index;
559 if (!adjusted_index)
560 adjusted_index = ctx->ac.i32_0;
561
562 adjusted_index =
563 LLVMBuildMul(builder, adjusted_index, LLVMConstInt(ctx->ac.i32, stride / type_size, 0), "");
564
565 LLVMValueRef val_offset = LLVMConstInt(ctx->ac.i32, offset, 0);
566 list = LLVMBuildGEP(builder, list, &val_offset, 1, "");
567 list = LLVMBuildPointerCast(builder, list, ac_array_in_const32_addr_space(type), "");
568
569 LLVMValueRef descriptor = ac_build_load_to_sgpr(&ctx->ac, list, adjusted_index);
570
571 /* 3-plane formats always have the same size and format for planes 1 & 2, so
572 * use the tail from plane 1 so that we can store only the first 16 bytes
573 * of the last plane. */
574 if (desc_type == AC_DESC_PLANE_2) {
575 LLVMValueRef descriptor2 =
576 radv_get_sampler_desc(abi, descriptor_set, base_index, constant_index, index,
577 AC_DESC_PLANE_1, image, write, bindless);
578
579 LLVMValueRef components[8];
580 for (unsigned i = 0; i < 4; ++i)
581 components[i] = ac_llvm_extract_elem(&ctx->ac, descriptor, i);
582
583 for (unsigned i = 4; i < 8; ++i)
584 components[i] = ac_llvm_extract_elem(&ctx->ac, descriptor2, i);
585 descriptor = ac_build_gather_values(&ctx->ac, components, 8);
586 } else if (desc_type == AC_DESC_IMAGE &&
587 ctx->args->options->has_image_load_dcc_bug &&
588 image && !write) {
589 LLVMValueRef components[8];
590
591 for (unsigned i = 0; i < 8; i++)
592 components[i] = ac_llvm_extract_elem(&ctx->ac, descriptor, i);
593
594 /* WRITE_COMPRESS_ENABLE must be 0 for all image loads to workaround a hardware bug. */
595 components[6] = LLVMBuildAnd(ctx->ac.builder, components[6],
596 LLVMConstInt(ctx->ac.i32, C_00A018_WRITE_COMPRESS_ENABLE, false), "");
597
598 descriptor = ac_build_gather_values(&ctx->ac, components, 8);
599 }
600
601 return descriptor;
602 }
603
604 /* For 2_10_10_10 formats the alpha is handled as unsigned by pre-Vega HW,
605 * so we may need to fix it up. */
606 static LLVMValueRef
607 adjust_vertex_fetch_alpha(struct radv_shader_context *ctx, unsigned adjustment, LLVMValueRef alpha)
608 {
609 if (adjustment == ALPHA_ADJUST_NONE)
610 return alpha;
611
612 LLVMValueRef c30 = LLVMConstInt(ctx->ac.i32, 30, 0);
613
614 alpha = LLVMBuildBitCast(ctx->ac.builder, alpha, ctx->ac.f32, "");
615
616 if (adjustment == ALPHA_ADJUST_SSCALED)
617 alpha = LLVMBuildFPToUI(ctx->ac.builder, alpha, ctx->ac.i32, "");
618 else
619 alpha = ac_to_integer(&ctx->ac, alpha);
620
621 /* For the integer-like cases, do a natural sign extension.
622 *
623 * For the SNORM case, the values are 0.0, 0.333, 0.666, 1.0
624 * and happen to contain 0, 1, 2, 3 as the two LSBs of the
625 * exponent.
626 */
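/* Worked example (illustrative values only): for ALPHA_ADJUST_SSCALED an
 * alpha of 3u becomes 3 << 30 = 0xC0000000, and the arithmetic shift right
 * by 30 sign-extends it back to -1, the intended signed value.
 */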
627 alpha =
628 LLVMBuildShl(ctx->ac.builder, alpha,
629 adjustment == ALPHA_ADJUST_SNORM ? LLVMConstInt(ctx->ac.i32, 7, 0) : c30, "");
630 alpha = LLVMBuildAShr(ctx->ac.builder, alpha, c30, "");
631
632 /* Convert back to the right type. */
633 if (adjustment == ALPHA_ADJUST_SNORM) {
634 LLVMValueRef clamp;
635 LLVMValueRef neg_one = LLVMConstReal(ctx->ac.f32, -1.0);
636 alpha = LLVMBuildSIToFP(ctx->ac.builder, alpha, ctx->ac.f32, "");
637 clamp = LLVMBuildFCmp(ctx->ac.builder, LLVMRealULT, alpha, neg_one, "");
638 alpha = LLVMBuildSelect(ctx->ac.builder, clamp, neg_one, alpha, "");
639 } else if (adjustment == ALPHA_ADJUST_SSCALED) {
640 alpha = LLVMBuildSIToFP(ctx->ac.builder, alpha, ctx->ac.f32, "");
641 }
642
643 return LLVMBuildBitCast(ctx->ac.builder, alpha, ctx->ac.i32, "");
644 }
645
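/* Expand a vertex fetch result to 4 channels, filling missing components
 * with 0 and a missing w with 1, matching the usual (0, 0, 0, 1) default
 * expansion for partially specified attributes.
 */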
646 static LLVMValueRef
647 radv_fixup_vertex_input_fetches(struct radv_shader_context *ctx, LLVMValueRef value,
648 unsigned num_channels, bool is_float)
649 {
650 LLVMValueRef zero = is_float ? ctx->ac.f32_0 : ctx->ac.i32_0;
651 LLVMValueRef one = is_float ? ctx->ac.f32_1 : ctx->ac.i32_1;
652 LLVMValueRef chan[4];
653
654 if (LLVMGetTypeKind(LLVMTypeOf(value)) == LLVMVectorTypeKind) {
655 unsigned vec_size = LLVMGetVectorSize(LLVMTypeOf(value));
656
657 if (num_channels == 4 && num_channels == vec_size)
658 return value;
659
660 num_channels = MIN2(num_channels, vec_size);
661
662 for (unsigned i = 0; i < num_channels; i++)
663 chan[i] = ac_llvm_extract_elem(&ctx->ac, value, i);
664 } else {
665 assert(num_channels == 1);
666 chan[0] = value;
667 }
668
669 for (unsigned i = num_channels; i < 4; i++) {
670 chan[i] = i == 3 ? one : zero;
671 chan[i] = ac_to_integer(&ctx->ac, chan[i]);
672 }
673
674 return ac_build_gather_values(&ctx->ac, chan, 4);
675 }
676
677 static void
678 load_vs_input(struct radv_shader_context *ctx, unsigned driver_location, LLVMTypeRef dest_type,
679 LLVMValueRef out[4])
680 {
681 LLVMValueRef t_list_ptr = ac_get_arg(&ctx->ac, ctx->args->ac.vertex_buffers);
682 LLVMValueRef t_offset;
683 LLVMValueRef t_list;
684 LLVMValueRef input;
685 LLVMValueRef buffer_index;
686 unsigned attrib_index = driver_location - VERT_ATTRIB_GENERIC0;
687 unsigned attrib_format = ctx->args->options->key.vs.vertex_attribute_formats[attrib_index];
688 unsigned data_format = attrib_format & 0x0f;
689 unsigned num_format = (attrib_format >> 4) & 0x07;
690 bool is_float =
691 num_format != V_008F0C_BUF_NUM_FORMAT_UINT && num_format != V_008F0C_BUF_NUM_FORMAT_SINT;
692 uint8_t input_usage_mask =
693 ctx->args->shader_info->vs.input_usage_mask[driver_location];
694 unsigned num_input_channels = util_last_bit(input_usage_mask);
695
696 if (ctx->args->options->key.vs.instance_rate_inputs & (1u << attrib_index)) {
697 uint32_t divisor = ctx->args->options->key.vs.instance_rate_divisors[attrib_index];
698
699 if (divisor) {
700 buffer_index = ctx->abi.instance_id;
701
702 if (divisor != 1) {
703 buffer_index = LLVMBuildUDiv(ctx->ac.builder, buffer_index,
704 LLVMConstInt(ctx->ac.i32, divisor, 0), "");
705 }
706 } else {
707 buffer_index = ctx->ac.i32_0;
708 }
709
710 buffer_index = LLVMBuildAdd(
711 ctx->ac.builder, ac_get_arg(&ctx->ac, ctx->args->ac.start_instance), buffer_index, "");
712 } else {
713 buffer_index = LLVMBuildAdd(ctx->ac.builder, ctx->abi.vertex_id,
714 ac_get_arg(&ctx->ac, ctx->args->ac.base_vertex), "");
715 }
716
717 const struct ac_data_format_info *vtx_info = ac_get_data_format_info(data_format);
718
719 /* Adjust the number of channels to load based on the vertex attribute format. */
720 unsigned num_channels = MIN2(num_input_channels, vtx_info->num_channels);
721 unsigned attrib_binding = ctx->args->options->key.vs.vertex_attribute_bindings[attrib_index];
722 unsigned attrib_offset = ctx->args->options->key.vs.vertex_attribute_offsets[attrib_index];
723 unsigned attrib_stride = ctx->args->options->key.vs.vertex_attribute_strides[attrib_index];
724 unsigned alpha_adjust = ctx->args->options->key.vs.vertex_alpha_adjust[attrib_index];
725
726 if (ctx->args->options->key.vs.vertex_post_shuffle & (1 << attrib_index)) {
727 /* Always load at least 3 channels for formats that need to be shuffled because X<->Z. */
728 num_channels = MAX2(num_channels, 3);
729 }
730
731 unsigned desc_index =
732 ctx->args->shader_info->vs.use_per_attribute_vb_descs ? attrib_index : attrib_binding;
733 desc_index = util_bitcount(ctx->args->shader_info->vs.vb_desc_usage_mask &
734 u_bit_consecutive(0, desc_index));
735 t_offset = LLVMConstInt(ctx->ac.i32, desc_index, false);
736 t_list = ac_build_load_to_sgpr(&ctx->ac, t_list_ptr, t_offset);
737
738 /* Always split typed vertex buffer loads on GFX6 and GFX10+ to avoid any alignment issues that
739 * trigger memory violations and eventually a GPU hang. This can happen if the stride (static or
740 * dynamic) is unaligned and also if the VBO offset is aligned to a scalar (eg. stride is 8 and
741 * VBO offset is 2 for R16G16B16A16_SNORM).
742 */
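/* Example of the split below (illustrative values): with a stride of 8, an
 * attribute offset of 6 and 32-bit channels, channel 1 has chan_offset = 10;
 * since that exceeds the stride, one element (10 / 8) is folded into
 * chan_index and chan_offset becomes 10 % 8 = 2.
 */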
743 if (ctx->ac.chip_class == GFX6 || ctx->ac.chip_class >= GFX10) {
744 unsigned chan_format = vtx_info->chan_format;
745 LLVMValueRef values[4];
746
747 assert(ctx->ac.chip_class == GFX6 || ctx->ac.chip_class >= GFX10);
748
749 for (unsigned chan = 0; chan < num_channels; chan++) {
750 unsigned chan_offset = attrib_offset + chan * vtx_info->chan_byte_size;
751 LLVMValueRef chan_index = buffer_index;
752
753 if (attrib_stride != 0 && chan_offset > attrib_stride) {
754 LLVMValueRef buffer_offset =
755 LLVMConstInt(ctx->ac.i32, chan_offset / attrib_stride, false);
756
757 chan_index = LLVMBuildAdd(ctx->ac.builder, buffer_index, buffer_offset, "");
758
759 chan_offset = chan_offset % attrib_stride;
760 }
761
762 values[chan] = ac_build_struct_tbuffer_load(
763 &ctx->ac, t_list, chan_index, LLVMConstInt(ctx->ac.i32, chan_offset, false),
764 ctx->ac.i32_0, ctx->ac.i32_0, 1, chan_format, num_format, 0, true);
765 }
766
767 input = ac_build_gather_values(&ctx->ac, values, num_channels);
768 } else {
769 if (attrib_stride != 0 && attrib_offset > attrib_stride) {
770 LLVMValueRef buffer_offset =
771 LLVMConstInt(ctx->ac.i32, attrib_offset / attrib_stride, false);
772
773 buffer_index = LLVMBuildAdd(ctx->ac.builder, buffer_index, buffer_offset, "");
774
775 attrib_offset = attrib_offset % attrib_stride;
776 }
777
778 input = ac_build_struct_tbuffer_load(
779 &ctx->ac, t_list, buffer_index, LLVMConstInt(ctx->ac.i32, attrib_offset, false),
780 ctx->ac.i32_0, ctx->ac.i32_0, num_channels, data_format, num_format, 0, true);
781 }
782
783 if (ctx->args->options->key.vs.vertex_post_shuffle & (1 << attrib_index)) {
784 LLVMValueRef c[4];
785 c[0] = ac_llvm_extract_elem(&ctx->ac, input, 2);
786 c[1] = ac_llvm_extract_elem(&ctx->ac, input, 1);
787 c[2] = ac_llvm_extract_elem(&ctx->ac, input, 0);
788 c[3] = ac_llvm_extract_elem(&ctx->ac, input, 3);
789
790 input = ac_build_gather_values(&ctx->ac, c, 4);
791 }
792
793 input = radv_fixup_vertex_input_fetches(ctx, input, num_channels, is_float);
794
795 for (unsigned chan = 0; chan < 4; chan++) {
796 LLVMValueRef llvm_chan = LLVMConstInt(ctx->ac.i32, chan, false);
797 out[chan] = LLVMBuildExtractElement(ctx->ac.builder, input, llvm_chan, "");
798 if (dest_type == ctx->ac.i16 && is_float) {
799 out[chan] = LLVMBuildBitCast(ctx->ac.builder, out[chan], ctx->ac.f32, "");
800 out[chan] = LLVMBuildFPTrunc(ctx->ac.builder, out[chan], ctx->ac.f16, "");
801 }
802 }
803
804 out[3] = adjust_vertex_fetch_alpha(ctx, alpha_adjust, out[3]);
805
806 for (unsigned chan = 0; chan < 4; chan++) {
807 out[chan] = ac_to_integer(&ctx->ac, out[chan]);
808 if (dest_type == ctx->ac.i16 && !is_float)
809 out[chan] = LLVMBuildTrunc(ctx->ac.builder, out[chan], ctx->ac.i16, "");
810 }
811 }
812
813 static LLVMValueRef
814 radv_load_vs_inputs(struct ac_shader_abi *abi, unsigned driver_location, unsigned component,
815 unsigned num_components, unsigned vertex_index, LLVMTypeRef type)
816 {
817 struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);
818 LLVMValueRef values[4];
819
820 load_vs_input(ctx, driver_location, type, values);
821
822 for (unsigned i = 0; i < 4; i++)
823 values[i] = LLVMBuildBitCast(ctx->ac.builder, values[i], type, "");
824
825 return ac_build_varying_gather_values(&ctx->ac, values, num_components, component);
826 }
827
828 static void
829 prepare_interp_optimize(struct radv_shader_context *ctx, struct nir_shader *nir)
830 {
831 bool uses_center = false;
832 bool uses_centroid = false;
833 nir_foreach_shader_in_variable (variable, nir) {
834 if (glsl_get_base_type(glsl_without_array(variable->type)) != GLSL_TYPE_FLOAT ||
835 variable->data.sample)
836 continue;
837
838 if (variable->data.centroid)
839 uses_centroid = true;
840 else
841 uses_center = true;
842 }
843
844 ctx->abi.persp_centroid = ac_get_arg(&ctx->ac, ctx->args->ac.persp_centroid);
845 ctx->abi.linear_centroid = ac_get_arg(&ctx->ac, ctx->args->ac.linear_centroid);
846
847 if (uses_center && uses_centroid) {
848 LLVMValueRef sel =
849 LLVMBuildICmp(ctx->ac.builder, LLVMIntSLT, ac_get_arg(&ctx->ac, ctx->args->ac.prim_mask),
850 ctx->ac.i32_0, "");
851 ctx->abi.persp_centroid =
852 LLVMBuildSelect(ctx->ac.builder, sel, ac_get_arg(&ctx->ac, ctx->args->ac.persp_center),
853 ctx->abi.persp_centroid, "");
854 ctx->abi.linear_centroid =
855 LLVMBuildSelect(ctx->ac.builder, sel, ac_get_arg(&ctx->ac, ctx->args->ac.linear_center),
856 ctx->abi.linear_centroid, "");
857 }
858 }
859
860 static void
861 scan_shader_output_decl(struct radv_shader_context *ctx, struct nir_variable *variable,
862 struct nir_shader *shader, gl_shader_stage stage)
863 {
864 int idx = variable->data.driver_location;
865 unsigned attrib_count = glsl_count_attribute_slots(variable->type, false);
866 uint64_t mask_attribs;
867
868 if (variable->data.compact) {
869 unsigned component_count = variable->data.location_frac + glsl_get_length(variable->type);
870 attrib_count = (component_count + 3) / 4;
871 }
872
873 mask_attribs = ((1ull << attrib_count) - 1) << idx;
874
875 ctx->output_mask |= mask_attribs;
876 }
877
878 /* Initialize arguments for the shader export intrinsic */
879 static void
880 si_llvm_init_export_args(struct radv_shader_context *ctx, LLVMValueRef *values,
881 unsigned enabled_channels, unsigned target, struct ac_export_args *args)
882 {
883 /* Specify the channels that are enabled. */
884 args->enabled_channels = enabled_channels;
885
886 /* Specify whether the EXEC mask represents the valid mask */
887 args->valid_mask = 0;
888
889 /* Specify whether this is the last export */
890 args->done = 0;
891
892 /* Specify the target we are exporting */
893 args->target = target;
894
895 args->compr = false;
896 args->out[0] = LLVMGetUndef(ctx->ac.f32);
897 args->out[1] = LLVMGetUndef(ctx->ac.f32);
898 args->out[2] = LLVMGetUndef(ctx->ac.f32);
899 args->out[3] = LLVMGetUndef(ctx->ac.f32);
900
901 if (!values)
902 return;
903
904 bool is_16bit = ac_get_type_size(LLVMTypeOf(values[0])) == 2;
905 if (ctx->stage == MESA_SHADER_FRAGMENT) {
906 unsigned index = target - V_008DFC_SQ_EXP_MRT;
907 unsigned col_format = (ctx->args->options->key.ps.col_format >> (4 * index)) & 0xf;
908 bool is_int8 = (ctx->args->options->key.ps.is_int8 >> index) & 1;
909 bool is_int10 = (ctx->args->options->key.ps.is_int10 >> index) & 1;
910
911 LLVMValueRef (*packf)(struct ac_llvm_context * ctx, LLVMValueRef args[2]) = NULL;
912 LLVMValueRef (*packi)(struct ac_llvm_context * ctx, LLVMValueRef args[2], unsigned bits,
913 bool hi) = NULL;
914
915 switch (col_format) {
916 case V_028714_SPI_SHADER_ZERO:
917 args->enabled_channels = 0; /* writemask */
918 args->target = V_008DFC_SQ_EXP_NULL;
919 break;
920
921 case V_028714_SPI_SHADER_32_R:
922 args->enabled_channels = 1;
923 args->out[0] = values[0];
924 break;
925
926 case V_028714_SPI_SHADER_32_GR:
927 args->enabled_channels = 0x3;
928 args->out[0] = values[0];
929 args->out[1] = values[1];
930 break;
931
932 case V_028714_SPI_SHADER_32_AR:
933 if (ctx->ac.chip_class >= GFX10) {
934 args->enabled_channels = 0x3;
935 args->out[0] = values[0];
936 args->out[1] = values[3];
937 } else {
938 args->enabled_channels = 0x9;
939 args->out[0] = values[0];
940 args->out[3] = values[3];
941 }
942 break;
943
944 case V_028714_SPI_SHADER_FP16_ABGR:
945 args->enabled_channels = 0xf;
946 packf = ac_build_cvt_pkrtz_f16;
947 if (is_16bit) {
948 for (unsigned chan = 0; chan < 4; chan++)
949 values[chan] = LLVMBuildFPExt(ctx->ac.builder, values[chan], ctx->ac.f32, "");
950 }
951 break;
952
953 case V_028714_SPI_SHADER_UNORM16_ABGR:
954 args->enabled_channels = 0xf;
955 packf = ac_build_cvt_pknorm_u16;
956 break;
957
958 case V_028714_SPI_SHADER_SNORM16_ABGR:
959 args->enabled_channels = 0xf;
960 packf = ac_build_cvt_pknorm_i16;
961 break;
962
963 case V_028714_SPI_SHADER_UINT16_ABGR:
964 args->enabled_channels = 0xf;
965 packi = ac_build_cvt_pk_u16;
966 if (is_16bit) {
967 for (unsigned chan = 0; chan < 4; chan++)
968 values[chan] = LLVMBuildZExt(ctx->ac.builder, ac_to_integer(&ctx->ac, values[chan]),
969 ctx->ac.i32, "");
970 }
971 break;
972
973 case V_028714_SPI_SHADER_SINT16_ABGR:
974 args->enabled_channels = 0xf;
975 packi = ac_build_cvt_pk_i16;
976 if (is_16bit) {
977 for (unsigned chan = 0; chan < 4; chan++)
978 values[chan] = LLVMBuildSExt(ctx->ac.builder, ac_to_integer(&ctx->ac, values[chan]),
979 ctx->ac.i32, "");
980 }
981 break;
982
983 default:
984 case V_028714_SPI_SHADER_32_ABGR:
985 memcpy(&args->out[0], values, sizeof(values[0]) * 4);
986 break;
987 }
988
989 /* Replace NaN by zero (only 32-bit) to fix game bugs if
990 * requested.
991 */
992 if (ctx->args->options->enable_mrt_output_nan_fixup && !is_16bit &&
993 (col_format == V_028714_SPI_SHADER_32_R || col_format == V_028714_SPI_SHADER_32_GR ||
994 col_format == V_028714_SPI_SHADER_32_AR || col_format == V_028714_SPI_SHADER_32_ABGR ||
995 col_format == V_028714_SPI_SHADER_FP16_ABGR)) {
996 for (unsigned i = 0; i < 4; i++) {
997 LLVMValueRef class_args[2] = {values[i],
998 LLVMConstInt(ctx->ac.i32, S_NAN | Q_NAN, false)};
999 LLVMValueRef isnan = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.class.f32", ctx->ac.i1,
1000 class_args, 2, AC_FUNC_ATTR_READNONE);
1001 values[i] = LLVMBuildSelect(ctx->ac.builder, isnan, ctx->ac.f32_0, values[i], "");
1002 }
1003 }
1004
1005 /* Pack f16 or norm_i16/u16. */
1006 if (packf) {
1007 for (unsigned chan = 0; chan < 2; chan++) {
1008 LLVMValueRef pack_args[2] = {values[2 * chan], values[2 * chan + 1]};
1009 LLVMValueRef packed;
1010
1011 packed = packf(&ctx->ac, pack_args);
1012 args->out[chan] = ac_to_float(&ctx->ac, packed);
1013 }
1014 args->compr = 1; /* COMPR flag */
1015 }
1016
1017 /* Pack i16/u16. */
1018 if (packi) {
1019 for (unsigned chan = 0; chan < 2; chan++) {
1020 LLVMValueRef pack_args[2] = {ac_to_integer(&ctx->ac, values[2 * chan]),
1021 ac_to_integer(&ctx->ac, values[2 * chan + 1])};
1022 LLVMValueRef packed;
1023
1024 packed = packi(&ctx->ac, pack_args, is_int8 ? 8 : is_int10 ? 10 : 16, chan == 1);
1025 args->out[chan] = ac_to_float(&ctx->ac, packed);
1026 }
1027 args->compr = 1; /* COMPR flag */
1028 }
1029 return;
1030 }
1031
1032 if (is_16bit) {
1033 for (unsigned chan = 0; chan < 4; chan++) {
1034 values[chan] = LLVMBuildBitCast(ctx->ac.builder, values[chan], ctx->ac.i16, "");
1035 args->out[chan] = LLVMBuildZExt(ctx->ac.builder, values[chan], ctx->ac.i32, "");
1036 }
1037 } else
1038 memcpy(&args->out[0], values, sizeof(values[0]) * 4);
1039
1040 for (unsigned i = 0; i < 4; ++i)
1041 args->out[i] = ac_to_float(&ctx->ac, args->out[i]);
1042 }
1043
1044 static void
1045 radv_export_param(struct radv_shader_context *ctx, unsigned index, LLVMValueRef *values,
1046 unsigned enabled_channels)
1047 {
1048 struct ac_export_args args;
1049
1050 si_llvm_init_export_args(ctx, values, enabled_channels, V_008DFC_SQ_EXP_PARAM + index, &args);
1051 ac_build_export(&ctx->ac, &args);
1052 }
1053
1054 static LLVMValueRef
1055 radv_load_output(struct radv_shader_context *ctx, unsigned index, unsigned chan)
1056 {
1057 LLVMValueRef output = ctx->abi.outputs[ac_llvm_reg_index_soa(index, chan)];
1058 return LLVMBuildLoad(ctx->ac.builder, output, "");
1059 }
1060
1061 static void
1062 radv_emit_stream_output(struct radv_shader_context *ctx, LLVMValueRef const *so_buffers,
1063 LLVMValueRef const *so_write_offsets,
1064 const struct radv_stream_output *output,
1065 struct radv_shader_output_values *shader_out)
1066 {
1067 unsigned num_comps = util_bitcount(output->component_mask);
1068 unsigned buf = output->buffer;
1069 unsigned offset = output->offset;
1070 unsigned start;
1071 LLVMValueRef out[4];
1072
1073 assert(num_comps && num_comps <= 4);
1074 if (!num_comps || num_comps > 4)
1075 return;
1076
1077 /* Get the first component. */
1078 start = ffs(output->component_mask) - 1;
1079
1080 /* Load the output as int. */
1081 for (int i = 0; i < num_comps; i++) {
1082 out[i] = ac_to_integer(&ctx->ac, shader_out->values[start + i]);
1083 }
1084
1085 /* Pack the output. */
1086 LLVMValueRef vdata = NULL;
1087
1088 switch (num_comps) {
1089 case 1: /* as i32 */
1090 vdata = out[0];
1091 break;
1092 case 2: /* as v2i32 */
1093 case 3: /* as v4i32 (aligned to 4) */
1094 out[3] = LLVMGetUndef(ctx->ac.i32);
1095 FALLTHROUGH;
1096 case 4: /* as v4i32 */
1097 vdata = ac_build_gather_values(&ctx->ac, out,
1098 !ac_has_vec3_support(ctx->ac.chip_class, false)
1099 ? util_next_power_of_two(num_comps)
1100 : num_comps);
1101 break;
1102 }
1103
1104 ac_build_buffer_store_dword(&ctx->ac, so_buffers[buf], vdata, num_comps, so_write_offsets[buf],
1105 ctx->ac.i32_0, offset, ac_glc | ac_slc);
1106 }
1107
1108 static void
1109 radv_emit_streamout(struct radv_shader_context *ctx, unsigned stream)
1110 {
1111 int i;
1112
1113 /* Get bits [22:16], i.e. (so_param >> 16) & 127; */
1114 assert(ctx->args->ac.streamout_config.used);
1115 LLVMValueRef so_vtx_count = ac_build_bfe(
1116 &ctx->ac, ac_get_arg(&ctx->ac, ctx->args->ac.streamout_config),
1117 LLVMConstInt(ctx->ac.i32, 16, false), LLVMConstInt(ctx->ac.i32, 7, false), false);
1118
1119 LLVMValueRef tid = ac_get_thread_id(&ctx->ac);
1120
1121 /* can_emit = tid < so_vtx_count; */
1122 LLVMValueRef can_emit = LLVMBuildICmp(ctx->ac.builder, LLVMIntULT, tid, so_vtx_count, "");
1123
1124 /* Emit the streamout code conditionally. This actually avoids
1125 * out-of-bounds buffer access. The hw tells us via the SGPR
1126 * (so_vtx_count) which threads are allowed to emit streamout data.
1127 */
1128 ac_build_ifcc(&ctx->ac, can_emit, 6501);
1129 {
1130 /* The buffer offset is computed as follows:
1131 * ByteOffset = streamout_offset[buffer_id]*4 +
1132 * (streamout_write_index + thread_id)*stride[buffer_id] +
1133 * attrib_offset
1134 */
1135 LLVMValueRef so_write_index = ac_get_arg(&ctx->ac, ctx->args->ac.streamout_write_index);
1136
1137 /* Compute (streamout_write_index + thread_id). */
1138 so_write_index = LLVMBuildAdd(ctx->ac.builder, so_write_index, tid, "");
1139
1140 /* Load the descriptor and compute the write offset for each
1141 * enabled buffer.
1142 */
1143 LLVMValueRef so_write_offset[4] = {0};
1144 LLVMValueRef so_buffers[4] = {0};
1145 LLVMValueRef buf_ptr = ac_get_arg(&ctx->ac, ctx->args->streamout_buffers);
1146
1147 for (i = 0; i < 4; i++) {
1148 uint16_t stride = ctx->args->shader_info->so.strides[i];
1149
1150 if (!stride)
1151 continue;
1152
1153 LLVMValueRef offset = LLVMConstInt(ctx->ac.i32, i, false);
1154
1155 so_buffers[i] = ac_build_load_to_sgpr(&ctx->ac, buf_ptr, offset);
1156
1157 LLVMValueRef so_offset = ac_get_arg(&ctx->ac, ctx->args->ac.streamout_offset[i]);
1158
1159 so_offset =
1160 LLVMBuildMul(ctx->ac.builder, so_offset, LLVMConstInt(ctx->ac.i32, 4, false), "");
1161
1162 so_write_offset[i] = ac_build_imad(
1163 &ctx->ac, so_write_index, LLVMConstInt(ctx->ac.i32, stride * 4, false), so_offset);
1164 }
1165
1166 /* Write streamout data. */
1167 for (i = 0; i < ctx->args->shader_info->so.num_outputs; i++) {
1168 struct radv_shader_output_values shader_out = {0};
1169 struct radv_stream_output *output = &ctx->args->shader_info->so.outputs[i];
1170
1171 if (stream != output->stream)
1172 continue;
1173
1174 for (int j = 0; j < 4; j++) {
1175 shader_out.values[j] = radv_load_output(ctx, output->location, j);
1176 }
1177
1178 radv_emit_stream_output(ctx, so_buffers, so_write_offset, output, &shader_out);
1179 }
1180 }
1181 ac_build_endif(&ctx->ac, 6501);
1182 }
1183
1184 static void
1185 radv_build_param_exports(struct radv_shader_context *ctx, struct radv_shader_output_values *outputs,
1186 unsigned noutput, struct radv_vs_output_info *outinfo,
1187 bool export_clip_dists)
1188 {
1189 for (unsigned i = 0; i < noutput; i++) {
1190 unsigned slot_name = outputs[i].slot_name;
1191 unsigned usage_mask = outputs[i].usage_mask;
1192
1193 if (slot_name != VARYING_SLOT_LAYER && slot_name != VARYING_SLOT_PRIMITIVE_ID &&
1194 slot_name != VARYING_SLOT_VIEWPORT && slot_name != VARYING_SLOT_CLIP_DIST0 &&
1195 slot_name != VARYING_SLOT_CLIP_DIST1 && slot_name < VARYING_SLOT_VAR0)
1196 continue;
1197
1198 if ((slot_name == VARYING_SLOT_CLIP_DIST0 || slot_name == VARYING_SLOT_CLIP_DIST1) &&
1199 !export_clip_dists)
1200 continue;
1201
1202 radv_export_param(ctx, outinfo->vs_output_param_offset[slot_name], outputs[i].values,
1203 usage_mask);
1204 }
1205 }
1206
1207 /* Generate export instructions for hardware VS shader stage or NGG GS stage
1208 * (position and parameter data only).
1209 */
1210 static void
1211 radv_llvm_export_vs(struct radv_shader_context *ctx, struct radv_shader_output_values *outputs,
1212 unsigned noutput, struct radv_vs_output_info *outinfo, bool export_clip_dists)
1213 {
1214 LLVMValueRef psize_value = NULL, layer_value = NULL, viewport_value = NULL;
1215 LLVMValueRef primitive_shading_rate = NULL;
1216 struct ac_export_args pos_args[4] = {0};
1217 unsigned pos_idx, index;
1218 int i;
1219
1220 /* Build position exports */
1221 for (i = 0; i < noutput; i++) {
1222 switch (outputs[i].slot_name) {
1223 case VARYING_SLOT_POS:
1224 si_llvm_init_export_args(ctx, outputs[i].values, 0xf, V_008DFC_SQ_EXP_POS, &pos_args[0]);
1225 break;
1226 case VARYING_SLOT_PSIZ:
1227 psize_value = outputs[i].values[0];
1228 break;
1229 case VARYING_SLOT_LAYER:
1230 layer_value = outputs[i].values[0];
1231 break;
1232 case VARYING_SLOT_VIEWPORT:
1233 viewport_value = outputs[i].values[0];
1234 break;
1235 case VARYING_SLOT_PRIMITIVE_SHADING_RATE:
1236 primitive_shading_rate = outputs[i].values[0];
1237 break;
1238 case VARYING_SLOT_CLIP_DIST0:
1239 case VARYING_SLOT_CLIP_DIST1:
1240 index = 2 + outputs[i].slot_index;
1241 si_llvm_init_export_args(ctx, outputs[i].values, 0xf, V_008DFC_SQ_EXP_POS + index,
1242 &pos_args[index]);
1243 break;
1244 default:
1245 break;
1246 }
1247 }
1248
1249 /* We need to add the position output manually if it's missing. */
1250 if (!pos_args[0].out[0]) {
1251 pos_args[0].enabled_channels = 0xf; /* writemask */
1252 pos_args[0].valid_mask = 0; /* EXEC mask */
1253 pos_args[0].done = 0; /* last export? */
1254 pos_args[0].target = V_008DFC_SQ_EXP_POS;
1255 pos_args[0].compr = 0; /* COMPR flag */
1256 pos_args[0].out[0] = ctx->ac.f32_0; /* X */
1257 pos_args[0].out[1] = ctx->ac.f32_0; /* Y */
1258 pos_args[0].out[2] = ctx->ac.f32_0; /* Z */
1259 pos_args[0].out[3] = ctx->ac.f32_1; /* W */
1260 }
1261
1262 bool writes_primitive_shading_rate = outinfo->writes_primitive_shading_rate ||
1263 ctx->args->options->force_vrs_rates;
1264
1265 if (outinfo->writes_pointsize || outinfo->writes_layer ||
1266 outinfo->writes_viewport_index || writes_primitive_shading_rate) {
1267 pos_args[1].enabled_channels = ((outinfo->writes_pointsize == true ? 1 : 0) |
1268 (writes_primitive_shading_rate == true ? 2 : 0) |
1269 (outinfo->writes_layer == true ? 4 : 0));
1270 pos_args[1].valid_mask = 0;
1271 pos_args[1].done = 0;
1272 pos_args[1].target = V_008DFC_SQ_EXP_POS + 1;
1273 pos_args[1].compr = 0;
1274 pos_args[1].out[0] = ctx->ac.f32_0; /* X */
1275 pos_args[1].out[1] = ctx->ac.f32_0; /* Y */
1276 pos_args[1].out[2] = ctx->ac.f32_0; /* Z */
1277 pos_args[1].out[3] = ctx->ac.f32_0; /* W */
1278
1279 if (outinfo->writes_pointsize == true)
1280 pos_args[1].out[0] = psize_value;
1281 if (outinfo->writes_layer == true)
1282 pos_args[1].out[2] = layer_value;
1283 if (outinfo->writes_viewport_index == true) {
1284 if (ctx->args->options->chip_class >= GFX9) {
1285 /* GFX9 has the layer in out.z[10:0] and the viewport
1286 * index in out.z[19:16].
1287 */
1288 LLVMValueRef v = viewport_value;
1289 v = ac_to_integer(&ctx->ac, v);
1290 v = LLVMBuildShl(ctx->ac.builder, v, LLVMConstInt(ctx->ac.i32, 16, false), "");
1291 v = LLVMBuildOr(ctx->ac.builder, v, ac_to_integer(&ctx->ac, pos_args[1].out[2]), "");
1292
1293 pos_args[1].out[2] = ac_to_float(&ctx->ac, v);
1294 pos_args[1].enabled_channels |= 1 << 2;
1295 } else {
1296 pos_args[1].out[3] = viewport_value;
1297 pos_args[1].enabled_channels |= 1 << 3;
1298 }
1299 }
1300
1301 if (outinfo->writes_primitive_shading_rate) {
1302 pos_args[1].out[1] = primitive_shading_rate;
1303 } else if (ctx->args->options->force_vrs_rates) {
1304 /* Bits [2:3] = VRS rate X
1305 * Bits [4:5] = VRS rate Y
1306 *
1307 * The range is [-2, 1]. Values:
1308 * 1: 2x coarser shading rate in that direction.
1309 * 0: normal shading rate
1310 * -1: 2x finer shading rate (sample shading, not directional)
1311 * -2: 4x finer shading rate (sample shading, not directional)
1312 *
1313 * Sample shading can't go above 8 samples, so both numbers can't be -2 at the same time.
1314 */
1315 LLVMValueRef rates = LLVMConstInt(ctx->ac.i32, ctx->args->options->force_vrs_rates, false);
1316 LLVMValueRef cond;
1317 LLVMValueRef v;
1318
1319 /* If Pos.W != 1 (typical for non-GUI elements), use 2x2 coarse shading. */
1320 cond = LLVMBuildFCmp(ctx->ac.builder, LLVMRealUNE, pos_args[0].out[3], ctx->ac.f32_1, "");
1321 v = LLVMBuildSelect(ctx->ac.builder, cond, rates, ctx->ac.i32_0, "");
1322
1323 pos_args[1].out[1] = ac_to_float(&ctx->ac, v);
1324 }
1325 }
1326
1327 /* GFX10 skips POS0 exports if EXEC=0 and DONE=0, causing a hang.
1328 * Setting valid_mask=1 prevents it and has no other effect.
1329 */
1330 if (ctx->ac.chip_class == GFX10)
1331 pos_args[0].valid_mask = 1;
1332
1333 pos_idx = 0;
1334 for (i = 0; i < 4; i++) {
1335 if (!pos_args[i].out[0])
1336 continue;
1337
1338 /* Specify the target we are exporting */
1339 pos_args[i].target = V_008DFC_SQ_EXP_POS + pos_idx++;
1340
1341 if (pos_idx == outinfo->pos_exports)
1342 /* Specify that this is the last export */
1343 pos_args[i].done = 1;
1344
1345 ac_build_export(&ctx->ac, &pos_args[i]);
1346 }
1347
1348 /* Build parameter exports */
1349 radv_build_param_exports(ctx, outputs, noutput, outinfo, export_clip_dists);
1350 }
1351
1352 static void
1353 handle_vs_outputs_post(struct radv_shader_context *ctx, bool export_prim_id, bool export_clip_dists,
1354 struct radv_vs_output_info *outinfo)
1355 {
1356 struct radv_shader_output_values *outputs;
1357 unsigned noutput = 0;
1358
1359 if (ctx->args->options->key.has_multiview_view_index) {
1360 LLVMValueRef *tmp_out = &ctx->abi.outputs[ac_llvm_reg_index_soa(VARYING_SLOT_LAYER, 0)];
1361 if (!*tmp_out) {
1362 for (unsigned i = 0; i < 4; ++i)
1363 ctx->abi.outputs[ac_llvm_reg_index_soa(VARYING_SLOT_LAYER, i)] =
1364 ac_build_alloca_undef(&ctx->ac, ctx->ac.f32, "");
1365 }
1366
1367 LLVMValueRef view_index = ac_get_arg(&ctx->ac, ctx->args->ac.view_index);
1368 LLVMBuildStore(ctx->ac.builder, ac_to_float(&ctx->ac, view_index), *tmp_out);
1369 ctx->output_mask |= 1ull << VARYING_SLOT_LAYER;
1370 }
1371
1372 if (ctx->args->shader_info->so.num_outputs && !ctx->args->is_gs_copy_shader) {
1373 /* The GS copy shader emission already emits streamout. */
1374 radv_emit_streamout(ctx, 0);
1375 }
1376
1377 /* Allocate a temporary array for the output values. */
1378 unsigned num_outputs = util_bitcount64(ctx->output_mask) + export_prim_id;
1379 outputs = malloc(num_outputs * sizeof(outputs[0]));
1380
1381 for (unsigned i = 0; i < AC_LLVM_MAX_OUTPUTS; ++i) {
1382 if (!(ctx->output_mask & (1ull << i)))
1383 continue;
1384
1385 outputs[noutput].slot_name = i;
1386 outputs[noutput].slot_index = i == VARYING_SLOT_CLIP_DIST1;
1387
1388 if (ctx->stage == MESA_SHADER_VERTEX && !ctx->args->is_gs_copy_shader) {
1389 outputs[noutput].usage_mask = ctx->args->shader_info->vs.output_usage_mask[i];
1390 } else if (ctx->stage == MESA_SHADER_TESS_EVAL) {
1391 outputs[noutput].usage_mask = ctx->args->shader_info->tes.output_usage_mask[i];
1392 } else {
1393 assert(ctx->args->is_gs_copy_shader);
1394 outputs[noutput].usage_mask = ctx->args->shader_info->gs.output_usage_mask[i];
1395 }
1396
1397 for (unsigned j = 0; j < 4; j++) {
1398 outputs[noutput].values[j] = ac_to_float(&ctx->ac, radv_load_output(ctx, i, j));
1399 }
1400
1401 noutput++;
1402 }
1403
1404 /* Export PrimitiveID. */
1405 if (export_prim_id) {
1406 outputs[noutput].slot_name = VARYING_SLOT_PRIMITIVE_ID;
1407 outputs[noutput].slot_index = 0;
1408 outputs[noutput].usage_mask = 0x1;
1409 if (ctx->stage == MESA_SHADER_TESS_EVAL)
1410 outputs[noutput].values[0] = ac_get_arg(&ctx->ac, ctx->args->ac.tes_patch_id);
1411 else
1412 outputs[noutput].values[0] = ac_get_arg(&ctx->ac, ctx->args->ac.vs_prim_id);
1413 for (unsigned j = 1; j < 4; j++)
1414 outputs[noutput].values[j] = ctx->ac.f32_0;
1415 noutput++;
1416 }
1417
1418 radv_llvm_export_vs(ctx, outputs, noutput, outinfo, export_clip_dists);
1419
1420 free(outputs);
1421 }
1422
1423 static LLVMValueRef
1424 get_wave_id_in_tg(struct radv_shader_context *ctx)
1425 {
1426 return ac_unpack_param(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->ac.merged_wave_info), 24, 4);
1427 }
1428
1429 static LLVMValueRef
1430 get_tgsize(struct radv_shader_context *ctx)
1431 {
1432 return ac_unpack_param(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->ac.merged_wave_info), 28, 4);
1433 }
1434
1435 static LLVMValueRef
1436 get_thread_id_in_tg(struct radv_shader_context *ctx)
1437 {
1438 LLVMBuilderRef builder = ctx->ac.builder;
1439 LLVMValueRef tmp;
1440 tmp = LLVMBuildMul(builder, get_wave_id_in_tg(ctx),
1441 LLVMConstInt(ctx->ac.i32, ctx->ac.wave_size, false), "");
1442 return LLVMBuildAdd(builder, tmp, ac_get_thread_id(&ctx->ac), "");
1443 }
1444
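/* gs_tg_info is assumed to pack the workgroup's vertex count in bits [20:12]
 * and its primitive count in bits [30:22]; the two helpers below just
 * extract those bitfields.
 */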
1445 static LLVMValueRef
1446 ngg_get_vtx_cnt(struct radv_shader_context *ctx)
1447 {
1448 return ac_build_bfe(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->ac.gs_tg_info),
1449 LLVMConstInt(ctx->ac.i32, 12, false), LLVMConstInt(ctx->ac.i32, 9, false),
1450 false);
1451 }
1452
1453 static LLVMValueRef
1454 ngg_get_prim_cnt(struct radv_shader_context *ctx)
1455 {
1456 return ac_build_bfe(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->ac.gs_tg_info),
1457 LLVMConstInt(ctx->ac.i32, 22, false), LLVMConstInt(ctx->ac.i32, 9, false),
1458 false);
1459 }
1460
1461 static LLVMValueRef
1462 ngg_gs_get_vertex_storage(struct radv_shader_context *ctx)
1463 {
1464 unsigned num_outputs = util_bitcount64(ctx->output_mask);
1465
1466 if (ctx->args->options->key.has_multiview_view_index)
1467 num_outputs++;
1468
1469 LLVMTypeRef elements[2] = {
1470 LLVMArrayType(ctx->ac.i32, 4 * num_outputs),
1471 LLVMArrayType(ctx->ac.i8, 4),
1472 };
1473 LLVMTypeRef type = LLVMStructTypeInContext(ctx->ac.context, elements, 2, false);
1474 type = LLVMPointerType(LLVMArrayType(type, 0), AC_ADDR_SPACE_LDS);
1475 return LLVMBuildBitCast(ctx->ac.builder, ctx->gs_ngg_emit, type, "");
1476 }
1477
1478 /**
1479 * Return a pointer to the LDS storage reserved for the N'th vertex, where N
1480 * is in emit order; that is:
1481 * - during the epilogue, N is the threadidx (relative to the entire threadgroup)
1482 * - during vertex emit, i.e. while the API GS shader invocation is running,
1483 * N = threadidx * gs_max_out_vertices + emitidx
1484 *
1485 * Goals of the LDS memory layout:
1486 * 1. Eliminate bank conflicts on write for geometry shaders that have all emits
1487 * in uniform control flow
1488 * 2. Eliminate bank conflicts on read for export if, additionally, there is no
1489 * culling
1490 * 3. Agnostic to the number of waves (since we don't know it before compiling)
1491 * 4. Allow coalescing of LDS instructions (ds_write_b128 etc.)
1492 * 5. Avoid wasting memory.
1493 *
1494 * We use an AoS layout due to point 4 (this also helps point 3). In an AoS
1495 * layout, elimination of bank conflicts requires that each vertex occupy an
1496 * odd number of dwords. We use the additional dword to store the output stream
1497 * index as well as a flag to indicate whether this vertex ends a primitive
1498 * for rasterization.
1499 *
1500 * Swizzling is required to satisfy points 1 and 2 simultaneously.
1501 *
1502 * Vertices are stored in export order (gsthread * gs_max_out_vertices + emitidx).
1503 * Indices are swizzled in groups of 32, which ensures point 1 without
1504 * disturbing point 2.
1505 *
1506 * \return an LDS pointer to type {[N x i32], [4 x i8]}
1507 */
1508 static LLVMValueRef
1509 ngg_gs_vertex_ptr(struct radv_shader_context *ctx, LLVMValueRef vertexidx)
1510 {
1511 LLVMBuilderRef builder = ctx->ac.builder;
1512 LLVMValueRef storage = ngg_gs_get_vertex_storage(ctx);
1513
1514 /* gs_max_out_vertices = 2^(write_stride_2exp) * some odd number */
1515 unsigned write_stride_2exp = ffs(MAX2(ctx->shader->info.gs.vertices_out, 1)) - 1;
1516 if (write_stride_2exp) {
1517 LLVMValueRef row = LLVMBuildLShr(builder, vertexidx, LLVMConstInt(ctx->ac.i32, 5, false), "");
1518 LLVMValueRef swizzle = LLVMBuildAnd(
1519 builder, row, LLVMConstInt(ctx->ac.i32, (1u << write_stride_2exp) - 1, false), "");
1520 vertexidx = LLVMBuildXor(builder, vertexidx, swizzle, "");
1521 }
1522
1523 return ac_build_gep0(&ctx->ac, storage, vertexidx);
1524 }
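/* Worked example of the swizzle above (illustrative numbers only): with
 * gs.vertices_out == 4, write_stride_2exp == 2, so the low two bits of
 * row = vertexidx >> 5 are XORed into vertexidx. Vertex index 32 (row 1)
 * therefore lands in slot 33, index 33 lands in slot 32, and index 64 (row 2)
 * lands in slot 66, staggering the rows as described in the layout comment
 * above.
 */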
1525
1526 static LLVMValueRef
1527 ngg_gs_emit_vertex_ptr(struct radv_shader_context *ctx, LLVMValueRef gsthread, LLVMValueRef emitidx)
1528 {
1529 LLVMBuilderRef builder = ctx->ac.builder;
1530 LLVMValueRef tmp;
1531
1532 tmp = LLVMConstInt(ctx->ac.i32, ctx->shader->info.gs.vertices_out, false);
1533 tmp = LLVMBuildMul(builder, tmp, gsthread, "");
1534 const LLVMValueRef vertexidx = LLVMBuildAdd(builder, tmp, emitidx, "");
1535 return ngg_gs_vertex_ptr(ctx, vertexidx);
1536 }
1537
1538 static LLVMValueRef
1539 ngg_gs_get_emit_output_ptr(struct radv_shader_context *ctx, LLVMValueRef vertexptr,
1540 unsigned out_idx)
1541 {
1542 LLVMValueRef gep_idx[3] = {
1543 ctx->ac.i32_0, /* implied C-style array */
1544 ctx->ac.i32_0, /* first struct entry */
1545 LLVMConstInt(ctx->ac.i32, out_idx, false),
1546 };
1547 return LLVMBuildGEP(ctx->ac.builder, vertexptr, gep_idx, 3, "");
1548 }
1549
1550 static LLVMValueRef
1551 ngg_gs_get_emit_primflag_ptr(struct radv_shader_context *ctx, LLVMValueRef vertexptr,
1552 unsigned stream)
1553 {
1554 LLVMValueRef gep_idx[3] = {
1555 ctx->ac.i32_0, /* implied C-style array */
1556 ctx->ac.i32_1, /* second struct entry */
1557 LLVMConstInt(ctx->ac.i32, stream, false),
1558 };
1559 return LLVMBuildGEP(ctx->ac.builder, vertexptr, gep_idx, 3, "");
1560 }
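/* For reference: each per-vertex LDS record built by ngg_gs_get_vertex_storage()
 * is { [4 * num_outputs x i32], [4 x i8] }. ngg_gs_get_emit_output_ptr() indexes
 * the dword array (one slot per enabled output component), while
 * ngg_gs_get_emit_primflag_ptr() indexes the byte array (one primitive-flag byte
 * per stream). The extra dword of flag bytes also keeps the record size an odd
 * number of dwords, as required by the layout comment above.
 */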
1561
1562 static void
1563 handle_ngg_outputs_post_2(struct radv_shader_context *ctx)
1564 {
1565 LLVMBuilderRef builder = ctx->ac.builder;
1566 LLVMValueRef tmp;
1567
1568 assert((ctx->stage == MESA_SHADER_VERTEX || ctx->stage == MESA_SHADER_TESS_EVAL) &&
1569 !ctx->args->is_gs_copy_shader);
1570
1571 LLVMValueRef prims_in_wave =
1572 ac_unpack_param(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->ac.merged_wave_info), 8, 8);
1573 LLVMValueRef vtx_in_wave =
1574 ac_unpack_param(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->ac.merged_wave_info), 0, 8);
1575 LLVMValueRef is_gs_thread =
1576 LLVMBuildICmp(builder, LLVMIntULT, ac_get_thread_id(&ctx->ac), prims_in_wave, "");
1577 LLVMValueRef is_es_thread =
1578 LLVMBuildICmp(builder, LLVMIntULT, ac_get_thread_id(&ctx->ac), vtx_in_wave, "");
1579 LLVMValueRef vtxindex[] = {
1580 ac_unpack_param(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->ac.gs_vtx_offset[0]), 0, 16),
1581 ac_unpack_param(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->ac.gs_vtx_offset[0]), 16, 16),
1582 ac_unpack_param(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->ac.gs_vtx_offset[1]), 0, 16),
1583 };
1584
1585 /* Determine the number of vertices per primitive. */
1586 unsigned num_vertices;
1587
1588 if (ctx->stage == MESA_SHADER_VERTEX) {
1589 num_vertices = 3; /* TODO: optimize for points & lines */
1590 } else {
1591 assert(ctx->stage == MESA_SHADER_TESS_EVAL);
1592
1593 if (ctx->shader->info.tess.point_mode)
1594 num_vertices = 1;
1595 else if (ctx->shader->info.tess.primitive_mode == GL_ISOLINES)
1596 num_vertices = 2;
1597 else
1598 num_vertices = 3;
1599 }
1600
1601 /* Copy Primitive IDs from GS threads to the LDS address corresponding
1602 * to the ES thread of the provoking vertex.
1603 */
1604 if (ctx->stage == MESA_SHADER_VERTEX && ctx->args->shader_info->vs.outinfo.export_prim_id) {
1605 ac_build_ifcc(&ctx->ac, is_gs_thread, 5400);
1606
1607 LLVMValueRef provoking_vtx_in_prim = LLVMConstInt(ctx->ac.i32, 0, false);
1608
1609 /* For provoking vertex last mode, use num_vtx_in_prim - 1 (si_conv_prim_to_gs_out returns 0 for points, 1 for lines and 2 for triangles, which is exactly that value). */
1610 if (ctx->args->options->key.vs.provoking_vtx_last) {
1611 uint8_t outprim = si_conv_prim_to_gs_out(ctx->args->options->key.vs.topology);
1612 provoking_vtx_in_prim = LLVMConstInt(ctx->ac.i32, outprim, false);
1613 }
1614
1615 /* provoking_vtx_index = vtxindex[provoking_vtx_in_prim]; */
1616 LLVMValueRef indices = ac_build_gather_values(&ctx->ac, vtxindex, 3);
1617 LLVMValueRef provoking_vtx_index =
1618 LLVMBuildExtractElement(builder, indices, provoking_vtx_in_prim, "");
1619
1620 LLVMBuildStore(builder, ac_get_arg(&ctx->ac, ctx->args->ac.gs_prim_id),
1621 ac_build_gep0(&ctx->ac, ctx->esgs_ring, provoking_vtx_index));
1622 ac_build_endif(&ctx->ac, 5400);
1623 }
1624
1625 /* TODO: primitive culling */
1626
1627 ac_build_sendmsg_gs_alloc_req(&ctx->ac, get_wave_id_in_tg(ctx), ngg_get_vtx_cnt(ctx),
1628 ngg_get_prim_cnt(ctx));
1629
1630 /* TODO: streamout queries */
1631 /* Export primitive data to the index buffer.
1632 *
1633 * For the first version, we will always build up all three indices
1634 * independent of the primitive type. The additional garbage data
1635 * shouldn't hurt.
1636 *
1637 * TODO: culling depends on the primitive type, so it can have some
1638 * interaction here.
1639 */
1640 ac_build_ifcc(&ctx->ac, is_gs_thread, 6001);
1641 {
1642 struct ac_ngg_prim prim = {0};
1643
1644 if (ctx->args->shader_info->is_ngg_passthrough) {
1645 prim.passthrough = ac_get_arg(&ctx->ac, ctx->args->ac.gs_vtx_offset[0]);
1646 } else {
1647 prim.num_vertices = num_vertices;
1648 prim.isnull = ctx->ac.i1false;
1649 prim.edgeflags = ctx->ac.i32_0;
1650 memcpy(prim.index, vtxindex, sizeof(vtxindex[0]) * 3);
1651 }
1652
1653 ac_build_export_prim(&ctx->ac, &prim);
1654 }
1655 ac_build_endif(&ctx->ac, 6001);
1656
1657 /* Export per-vertex data (positions and parameters). */
1658 ac_build_ifcc(&ctx->ac, is_es_thread, 6002);
1659 {
1660 struct radv_vs_output_info *outinfo = ctx->stage == MESA_SHADER_TESS_EVAL
1661 ? &ctx->args->shader_info->tes.outinfo
1662 : &ctx->args->shader_info->vs.outinfo;
1663
1664 /* Exporting the primitive ID is handled below. */
1665 /* TODO: use the new VS export path */
1666 handle_vs_outputs_post(ctx, false, outinfo->export_clip_dists, outinfo);
1667
1668 if (outinfo->export_prim_id) {
1669 LLVMValueRef values[4];
1670
1671 if (ctx->stage == MESA_SHADER_VERTEX) {
1672 /* Wait for GS stores to finish. */
1673 ac_build_s_barrier(&ctx->ac);
1674
1675 tmp = ac_build_gep0(&ctx->ac, ctx->esgs_ring, get_thread_id_in_tg(ctx));
1676 values[0] = LLVMBuildLoad(builder, tmp, "");
1677 } else {
1678 assert(ctx->stage == MESA_SHADER_TESS_EVAL);
1679 values[0] = ac_get_arg(&ctx->ac, ctx->args->ac.tes_patch_id);
1680 }
1681
1682 values[0] = ac_to_float(&ctx->ac, values[0]);
1683 for (unsigned j = 1; j < 4; j++)
1684 values[j] = ctx->ac.f32_0;
1685
1686 radv_export_param(ctx, outinfo->vs_output_param_offset[VARYING_SLOT_PRIMITIVE_ID], values,
1687 0x1);
1688 }
1689 }
1690 ac_build_endif(&ctx->ac, 6002);
1691 }
1692
1693 static void
1694 gfx10_ngg_gs_emit_prologue(struct radv_shader_context *ctx)
1695 {
1696 /* Zero out the part of LDS scratch that is used to accumulate the
1697 * per-stream generated primitive count.
1698 */
1699 LLVMBuilderRef builder = ctx->ac.builder;
1700 LLVMValueRef scratchptr = ctx->gs_ngg_scratch;
1701 LLVMValueRef tid = get_thread_id_in_tg(ctx);
1702 LLVMBasicBlockRef merge_block;
1703 LLVMValueRef cond;
1704
1705 LLVMValueRef fn = LLVMGetBasicBlockParent(LLVMGetInsertBlock(ctx->ac.builder));
1706 LLVMBasicBlockRef then_block = LLVMAppendBasicBlockInContext(ctx->ac.context, fn, "");
1707 merge_block = LLVMAppendBasicBlockInContext(ctx->ac.context, fn, "");
1708
1709 cond = LLVMBuildICmp(builder, LLVMIntULT, tid, LLVMConstInt(ctx->ac.i32, 4, false), "");
1710 LLVMBuildCondBr(ctx->ac.builder, cond, then_block, merge_block);
1711 LLVMPositionBuilderAtEnd(ctx->ac.builder, then_block);
1712
1713 LLVMValueRef ptr = ac_build_gep0(&ctx->ac, scratchptr, tid);
1714 LLVMBuildStore(builder, ctx->ac.i32_0, ptr);
1715
1716 LLVMBuildBr(ctx->ac.builder, merge_block);
1717 LLVMPositionBuilderAtEnd(ctx->ac.builder, merge_block);
1718
1719 ac_build_s_barrier(&ctx->ac);
1720 }
1721
1722 static void
1723 gfx10_ngg_gs_emit_epilogue_1(struct radv_shader_context *ctx)
1724 {
1725 LLVMBuilderRef builder = ctx->ac.builder;
1726 LLVMValueRef i8_0 = LLVMConstInt(ctx->ac.i8, 0, false);
1727 LLVMValueRef tmp;
1728
1729 /* Zero out remaining (non-emitted) primitive flags.
1730 *
1731 * Note: Alternatively, we could pass the relevant gs_next_vertex to
1732 * the emit threads via LDS. This is likely worse in the expected
1733 * typical case where each GS thread emits the full set of
1734 * vertices.
1735 */
1736 for (unsigned stream = 0; stream < 4; ++stream) {
1737 unsigned num_components;
1738
1739 num_components = ctx->args->shader_info->gs.num_stream_output_components[stream];
1740 if (!num_components)
1741 continue;
1742
1743 const LLVMValueRef gsthread = get_thread_id_in_tg(ctx);
1744
1745 ac_build_bgnloop(&ctx->ac, 5100);
1746
1747 const LLVMValueRef vertexidx = LLVMBuildLoad(builder, ctx->gs_next_vertex[stream], "");
1748 tmp = LLVMBuildICmp(builder, LLVMIntUGE, vertexidx,
1749 LLVMConstInt(ctx->ac.i32, ctx->shader->info.gs.vertices_out, false), "");
1750 ac_build_ifcc(&ctx->ac, tmp, 5101);
1751 ac_build_break(&ctx->ac);
1752 ac_build_endif(&ctx->ac, 5101);
1753
1754 tmp = LLVMBuildAdd(builder, vertexidx, ctx->ac.i32_1, "");
1755 LLVMBuildStore(builder, tmp, ctx->gs_next_vertex[stream]);
1756
1757 tmp = ngg_gs_emit_vertex_ptr(ctx, gsthread, vertexidx);
1758 LLVMBuildStore(builder, i8_0, ngg_gs_get_emit_primflag_ptr(ctx, tmp, stream));
1759
1760 ac_build_endloop(&ctx->ac, 5100);
1761 }
1762
1763 /* Accumulate generated primitives counts across the entire threadgroup. */
1764 for (unsigned stream = 0; stream < 4; ++stream) {
1765 unsigned num_components;
1766
1767 num_components = ctx->args->shader_info->gs.num_stream_output_components[stream];
1768 if (!num_components)
1769 continue;
1770
1771 LLVMValueRef numprims = LLVMBuildLoad(builder, ctx->gs_generated_prims[stream], "");
1772 numprims = ac_build_reduce(&ctx->ac, numprims, nir_op_iadd, ctx->ac.wave_size);
1773
1774 tmp = LLVMBuildICmp(builder, LLVMIntEQ, ac_get_thread_id(&ctx->ac), ctx->ac.i32_0, "");
1775 ac_build_ifcc(&ctx->ac, tmp, 5105);
1776 {
1777 LLVMBuildAtomicRMW(
1778 builder, LLVMAtomicRMWBinOpAdd,
1779 ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, LLVMConstInt(ctx->ac.i32, stream, false)),
1780 numprims, LLVMAtomicOrderingMonotonic, false);
1781 }
1782 ac_build_endif(&ctx->ac, 5105);
1783 }
1784 }
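/* Summary of how the 8-dword gs_ngg_scratch LDS array is used, as far as this
 * file shows: the prologue zeroes dwords 0..3, the loop above atomically
 * accumulates the per-stream generated-primitive counts into them, epilogue_2
 * reads them back for the shader-query GDS atomics, and the same array is then
 * reused as scratch for the vertex-liveness workgroup scan.
 */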
1785
1786 static void
1787 gfx10_ngg_gs_emit_epilogue_2(struct radv_shader_context *ctx)
1788 {
1789 const unsigned verts_per_prim =
1790 si_conv_gl_prim_to_vertices(ctx->shader->info.gs.output_primitive);
1791 LLVMBuilderRef builder = ctx->ac.builder;
1792 LLVMValueRef tmp, tmp2;
1793
1794 ac_build_s_barrier(&ctx->ac);
1795
1796 const LLVMValueRef tid = get_thread_id_in_tg(ctx);
1797 LLVMValueRef num_emit_threads = ngg_get_prim_cnt(ctx);
1798
1799 /* Write shader query data. */
1800 tmp = ac_get_arg(&ctx->ac, ctx->args->ngg_gs_state);
1801 tmp = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, "");
1802 ac_build_ifcc(&ctx->ac, tmp, 5109);
1803 tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, LLVMConstInt(ctx->ac.i32, 4, false), "");
1804 ac_build_ifcc(&ctx->ac, tmp, 5110);
1805 {
1806 tmp = LLVMBuildLoad(builder, ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, tid), "");
1807
1808 ac_llvm_add_target_dep_function_attr(ctx->main_function, "amdgpu-gds-size", 256);
1809
1810 LLVMTypeRef gdsptr = LLVMPointerType(ctx->ac.i32, AC_ADDR_SPACE_GDS);
1811 LLVMValueRef gdsbase = LLVMBuildIntToPtr(builder, ctx->ac.i32_0, gdsptr, "");
1812
1813 const char *sync_scope = "workgroup-one-as";
1814
1815 /* Use a plain GDS atomic to accumulate the number of generated
1816 * primitives.
1817 */
1818 ac_build_atomic_rmw(&ctx->ac, LLVMAtomicRMWBinOpAdd, gdsbase, tmp, sync_scope);
1819 }
1820 ac_build_endif(&ctx->ac, 5110);
1821 ac_build_endif(&ctx->ac, 5109);
1822
1823 /* TODO: culling */
1824
1825 /* Determine vertex liveness. */
1826 LLVMValueRef vertliveptr = ac_build_alloca(&ctx->ac, ctx->ac.i1, "vertexlive");
1827
1828 tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, num_emit_threads, "");
1829 ac_build_ifcc(&ctx->ac, tmp, 5120);
1830 {
1831 for (unsigned i = 0; i < verts_per_prim; ++i) {
1832 const LLVMValueRef primidx =
1833 LLVMBuildAdd(builder, tid, LLVMConstInt(ctx->ac.i32, i, false), "");
1834
1835 if (i > 0) {
1836 tmp = LLVMBuildICmp(builder, LLVMIntULT, primidx, num_emit_threads, "");
1837 ac_build_ifcc(&ctx->ac, tmp, 5121 + i);
1838 }
1839
1840 /* Load primitive liveness */
1841 tmp = ngg_gs_vertex_ptr(ctx, primidx);
1842 tmp = LLVMBuildLoad(builder, ngg_gs_get_emit_primflag_ptr(ctx, tmp, 0), "");
1843 const LLVMValueRef primlive = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, "");
1844
1845 tmp = LLVMBuildLoad(builder, vertliveptr, "");
1846 tmp = LLVMBuildOr(builder, tmp, primlive, ""); LLVMBuildStore(builder, tmp, vertliveptr);
1847
1848 if (i > 0)
1849 ac_build_endif(&ctx->ac, 5121 + i);
1850 }
1851 }
1852 ac_build_endif(&ctx->ac, 5120);
1853
1854 /* Inclusive scan addition across the current wave. */
1855 LLVMValueRef vertlive = LLVMBuildLoad(builder, vertliveptr, "");
1856 struct ac_wg_scan vertlive_scan = {0};
1857 vertlive_scan.op = nir_op_iadd;
1858 vertlive_scan.enable_reduce = true;
1859 vertlive_scan.enable_exclusive = true;
1860 vertlive_scan.src = vertlive;
1861 vertlive_scan.scratch = ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, ctx->ac.i32_0);
1862 vertlive_scan.waveidx = get_wave_id_in_tg(ctx);
1863 vertlive_scan.numwaves = get_tgsize(ctx);
1864 vertlive_scan.maxwaves = 8;
1865
1866 ac_build_wg_scan(&ctx->ac, &vertlive_scan);
1867
1868 /* Skip all exports (including index exports) when possible. At least on
1869 * early gfx10 revisions this is also to avoid hangs.
1870 */
1871 LLVMValueRef have_exports =
1872 LLVMBuildICmp(builder, LLVMIntNE, vertlive_scan.result_reduce, ctx->ac.i32_0, "");
1873 num_emit_threads = LLVMBuildSelect(builder, have_exports, num_emit_threads, ctx->ac.i32_0, "");
1874
1875 /* Allocate export space. Send this message as early as possible, to
1876 * hide the latency of the SQ <-> SPI roundtrip.
1877 *
1878 * Note: We could consider compacting primitives for export as well.
1879 * PA processes 1 non-null prim / clock, but it fetches 4 DW of
1880 * prim data per clock and skips null primitives at no additional
1881 * cost. So compacting primitives can only be beneficial when
1882 * there are 4 or more contiguous null primitives in the export
1883 * (in the common case of single-dword prim exports).
1884 */
1885 ac_build_sendmsg_gs_alloc_req(&ctx->ac, get_wave_id_in_tg(ctx), vertlive_scan.result_reduce,
1886 num_emit_threads);
1887
1888 /* Setup the reverse vertex compaction permutation. We re-use stream 1
1889 * of the primitive liveness flags, relying on the fact that each
1890 * threadgroup can have at most 256 threads. */
1891 ac_build_ifcc(&ctx->ac, vertlive, 5130);
1892 {
1893 tmp = ngg_gs_vertex_ptr(ctx, vertlive_scan.result_exclusive);
1894 tmp2 = LLVMBuildTrunc(builder, tid, ctx->ac.i8, "");
1895 LLVMBuildStore(builder, tmp2, ngg_gs_get_emit_primflag_ptr(ctx, tmp, 1));
1896 }
1897 ac_build_endif(&ctx->ac, 5130);
1898
1899 ac_build_s_barrier(&ctx->ac);
1900
1901 /* Export primitive data */
1902 tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, num_emit_threads, "");
1903 ac_build_ifcc(&ctx->ac, tmp, 5140);
1904 {
1905 LLVMValueRef flags;
1906 struct ac_ngg_prim prim = {0};
1907 prim.num_vertices = verts_per_prim;
1908
1909 tmp = ngg_gs_vertex_ptr(ctx, tid);
1910 flags = LLVMBuildLoad(builder, ngg_gs_get_emit_primflag_ptr(ctx, tmp, 0), "");
1911 prim.isnull = LLVMBuildNot(builder, LLVMBuildTrunc(builder, flags, ctx->ac.i1, ""), "");
1912 prim.edgeflags = ctx->ac.i32_0;
1913
1914 for (unsigned i = 0; i < verts_per_prim; ++i) {
1915 prim.index[i] = LLVMBuildSub(builder, vertlive_scan.result_exclusive,
1916 LLVMConstInt(ctx->ac.i32, verts_per_prim - i - 1, false), "");
1917 }
1918
1919 /* Geometry shaders output triangle strips, but NGG expects triangles. */
1920 if (verts_per_prim == 3) {
1921 LLVMValueRef is_odd = LLVMBuildLShr(builder, flags, ctx->ac.i8_1, "");
1922 is_odd = LLVMBuildTrunc(builder, is_odd, ctx->ac.i1, "");
1923
1924 LLVMValueRef flatshade_first =
1925 LLVMConstInt(ctx->ac.i1, !ctx->args->options->key.vs.provoking_vtx_last, false);
1926
1927 ac_build_triangle_strip_indices_to_triangle(&ctx->ac, is_odd, flatshade_first, prim.index);
1928 }
1929
1930 ac_build_export_prim(&ctx->ac, &prim);
1931 }
1932 ac_build_endif(&ctx->ac, 5140);
1933
1934 /* Export position and parameter data */
1935 tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, vertlive_scan.result_reduce, "");
1936 ac_build_ifcc(&ctx->ac, tmp, 5145);
1937 {
1938 struct radv_vs_output_info *outinfo = &ctx->args->shader_info->vs.outinfo;
1939 bool export_view_index = ctx->args->options->key.has_multiview_view_index;
1940 struct radv_shader_output_values *outputs;
1941 unsigned noutput = 0;
1942
1943 /* Allocate a temporary array for the output values. */
1944 unsigned num_outputs = util_bitcount64(ctx->output_mask) + export_view_index;
1945 outputs = calloc(num_outputs, sizeof(outputs[0]));
1946
1947 tmp = ngg_gs_vertex_ptr(ctx, tid);
1948 tmp = LLVMBuildLoad(builder, ngg_gs_get_emit_primflag_ptr(ctx, tmp, 1), "");
1949 tmp = LLVMBuildZExt(builder, tmp, ctx->ac.i32, "");
1950 const LLVMValueRef vertexptr = ngg_gs_vertex_ptr(ctx, tmp);
1951
1952 unsigned out_idx = 0;
1953 for (unsigned i = 0; i < AC_LLVM_MAX_OUTPUTS; ++i) {
1954 unsigned output_usage_mask = ctx->args->shader_info->gs.output_usage_mask[i];
1955 int length = util_last_bit(output_usage_mask);
1956
1957 if (!(ctx->output_mask & (1ull << i)))
1958 continue;
1959
1960 outputs[noutput].slot_name = i;
1961 outputs[noutput].slot_index = i == VARYING_SLOT_CLIP_DIST1;
1962 outputs[noutput].usage_mask = output_usage_mask;
1963
1964 for (unsigned j = 0; j < length; j++, out_idx++) {
1965 if (!(output_usage_mask & (1 << j)))
1966 continue;
1967
1968 tmp = ngg_gs_get_emit_output_ptr(ctx, vertexptr, out_idx);
1969 tmp = LLVMBuildLoad(builder, tmp, "");
1970
1971 LLVMTypeRef type = LLVMGetAllocatedType(ctx->abi.outputs[ac_llvm_reg_index_soa(i, j)]);
1972 if (ac_get_type_size(type) == 2) {
1973 tmp = ac_to_integer(&ctx->ac, tmp);
1974 tmp = LLVMBuildTrunc(ctx->ac.builder, tmp, ctx->ac.i16, "");
1975 }
1976
1977 outputs[noutput].values[j] = ac_to_float(&ctx->ac, tmp);
1978 }
1979
1980 for (unsigned j = length; j < 4; j++)
1981 outputs[noutput].values[j] = LLVMGetUndef(ctx->ac.f32);
1982
1983 noutput++;
1984 }
1985
1986 /* Export ViewIndex. */
1987 if (export_view_index) {
1988 outputs[noutput].slot_name = VARYING_SLOT_LAYER;
1989 outputs[noutput].slot_index = 0;
1990 outputs[noutput].usage_mask = 0x1;
1991 outputs[noutput].values[0] =
1992 ac_to_float(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->ac.view_index));
1993 for (unsigned j = 1; j < 4; j++)
1994 outputs[noutput].values[j] = ctx->ac.f32_0;
1995 noutput++;
1996 }
1997
1998 radv_llvm_export_vs(ctx, outputs, noutput, outinfo, outinfo->export_clip_dists);
1999 free(outputs);
2000 }
2001 ac_build_endif(&ctx->ac, 5145);
2002 }
2003
2004 static void
2005 gfx10_ngg_gs_emit_vertex(struct radv_shader_context *ctx, unsigned stream, LLVMValueRef vertexidx,
2006 LLVMValueRef *addrs)
2007 {
2008 LLVMBuilderRef builder = ctx->ac.builder;
2009 LLVMValueRef tmp;
2010
2011 const LLVMValueRef vertexptr = ngg_gs_emit_vertex_ptr(ctx, get_thread_id_in_tg(ctx), vertexidx);
2012 unsigned out_idx = 0;
2013 for (unsigned i = 0; i < AC_LLVM_MAX_OUTPUTS; ++i) {
2014 unsigned output_usage_mask = ctx->args->shader_info->gs.output_usage_mask[i];
2015 uint8_t output_stream = ctx->args->shader_info->gs.output_streams[i];
2016 LLVMValueRef *out_ptr = &addrs[i * 4];
2017 int length = util_last_bit(output_usage_mask);
2018
2019 if (!(ctx->output_mask & (1ull << i)) || output_stream != stream)
2020 continue;
2021
2022 for (unsigned j = 0; j < length; j++, out_idx++) {
2023 if (!(output_usage_mask & (1 << j)))
2024 continue;
2025
2026 LLVMValueRef out_val = LLVMBuildLoad(ctx->ac.builder, out_ptr[j], "");
2027 out_val = ac_to_integer(&ctx->ac, out_val);
2028 out_val = LLVMBuildZExtOrBitCast(ctx->ac.builder, out_val, ctx->ac.i32, "");
2029
2030 LLVMBuildStore(builder, out_val, ngg_gs_get_emit_output_ptr(ctx, vertexptr, out_idx));
2031 }
2032 }
2033 assert(out_idx * 4 <= ctx->args->shader_info->gs.gsvs_vertex_size);
2034
2035 /* Store the current number of emitted vertices to zero out remaining
2036 * primitive flags in case the geometry shader doesn't emit the maximum
2037 * number of vertices.
2038 */
2039 tmp = LLVMBuildAdd(builder, vertexidx, ctx->ac.i32_1, "");
2040 LLVMBuildStore(builder, tmp, ctx->gs_next_vertex[stream]);
2041
2042 /* Determine and store whether this vertex completed a primitive. */
2043 const LLVMValueRef curverts = LLVMBuildLoad(builder, ctx->gs_curprim_verts[stream], "");
2044
2045 tmp = LLVMConstInt(
2046 ctx->ac.i32, si_conv_gl_prim_to_vertices(ctx->shader->info.gs.output_primitive) - 1, false);
2047 const LLVMValueRef iscompleteprim = LLVMBuildICmp(builder, LLVMIntUGE, curverts, tmp, "");
2048
2049 /* Since the geometry shader emits triangle strips, we need to
2050 * track which primitive is odd and swap vertex indices to get
2051 * the correct vertex order.
2052 */
2053 LLVMValueRef is_odd = ctx->ac.i1false;
2054 if (stream == 0 && si_conv_gl_prim_to_vertices(ctx->shader->info.gs.output_primitive) == 3) {
2055 tmp = LLVMBuildAnd(builder, curverts, ctx->ac.i32_1, "");
2056 is_odd = LLVMBuildICmp(builder, LLVMIntEQ, tmp, ctx->ac.i32_1, "");
2057 }
2058
2059 tmp = LLVMBuildAdd(builder, curverts, ctx->ac.i32_1, "");
2060 LLVMBuildStore(builder, tmp, ctx->gs_curprim_verts[stream]);
2061
2062 /* The per-vertex primitive flag encoding:
2063 * bit 0: whether this vertex finishes a primitive
2064 * bit 1: whether the primitive is odd (if we are emitting triangle strips)
2065 */
2066 tmp = LLVMBuildZExt(builder, iscompleteprim, ctx->ac.i8, "");
2067 tmp = LLVMBuildOr(
2068 builder, tmp,
2069 LLVMBuildShl(builder, LLVMBuildZExt(builder, is_odd, ctx->ac.i8, ""), ctx->ac.i8_1, ""), "");
2070 LLVMBuildStore(builder, tmp, ngg_gs_get_emit_primflag_ptr(ctx, vertexptr, stream));
2071
2072 tmp = LLVMBuildLoad(builder, ctx->gs_generated_prims[stream], "");
2073 tmp = LLVMBuildAdd(builder, tmp, LLVMBuildZExt(builder, iscompleteprim, ctx->ac.i32, ""), "");
2074 LLVMBuildStore(builder, tmp, ctx->gs_generated_prims[stream]);
2075 }
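/* How the flag byte stored above is consumed in gfx10_ngg_gs_emit_epilogue_2():
 * bit 0 of the stream-0 flag decides primitive liveness, bit 1 fixes the
 * triangle-strip winding, and the stream-1 flag byte is later overwritten with
 * the pre-compaction thread id to form the reverse vertex-compaction
 * permutation.
 */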
2076
2077 static bool
2078 si_export_mrt_color(struct radv_shader_context *ctx, LLVMValueRef *color, unsigned index,
2079 struct ac_export_args *args)
2080 {
2081 /* Export */
2082 si_llvm_init_export_args(ctx, color, 0xf, V_008DFC_SQ_EXP_MRT + index, args);
2083 if (!args->enabled_channels)
2084 return false; /* unnecessary NULL export */
2085
2086 return true;
2087 }
2088
2089 static void
2090 radv_export_mrt_z(struct radv_shader_context *ctx, LLVMValueRef depth, LLVMValueRef stencil,
2091 LLVMValueRef samplemask)
2092 {
2093 struct ac_export_args args;
2094
2095 ac_export_mrt_z(&ctx->ac, depth, stencil, samplemask, &args);
2096
2097 ac_build_export(&ctx->ac, &args);
2098 }
2099
2100 static void
2101 handle_fs_outputs_post(struct radv_shader_context *ctx)
2102 {
2103 unsigned index = 0;
2104 LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL;
2105 struct ac_export_args color_args[8];
2106
2107 for (unsigned i = 0; i < AC_LLVM_MAX_OUTPUTS; ++i) {
2108 LLVMValueRef values[4];
2109
2110 if (!(ctx->output_mask & (1ull << i)))
2111 continue;
2112
2113 if (i < FRAG_RESULT_DATA0)
2114 continue;
2115
2116 for (unsigned j = 0; j < 4; j++)
2117 values[j] = ac_to_float(&ctx->ac, radv_load_output(ctx, i, j));
2118
2119 bool ret = si_export_mrt_color(ctx, values, i - FRAG_RESULT_DATA0, &color_args[index]);
2120 if (ret)
2121 index++;
2122 }
2123
2124 /* Process depth, stencil, samplemask. */
2125 if (ctx->args->shader_info->ps.writes_z) {
2126 depth = ac_to_float(&ctx->ac, radv_load_output(ctx, FRAG_RESULT_DEPTH, 0));
2127 }
2128 if (ctx->args->shader_info->ps.writes_stencil) {
2129 stencil = ac_to_float(&ctx->ac, radv_load_output(ctx, FRAG_RESULT_STENCIL, 0));
2130 }
2131 if (ctx->args->shader_info->ps.writes_sample_mask) {
2132 samplemask = ac_to_float(&ctx->ac, radv_load_output(ctx, FRAG_RESULT_SAMPLE_MASK, 0));
2133 }
2134
2135 /* Set the DONE bit on last non-null color export only if Z isn't
2136 * exported.
2137 */
2138 if (index > 0 && !ctx->args->shader_info->ps.writes_z &&
2139 !ctx->args->shader_info->ps.writes_stencil &&
2140 !ctx->args->shader_info->ps.writes_sample_mask) {
2141 unsigned last = index - 1;
2142
2143 color_args[last].valid_mask = 1; /* whether the EXEC mask is valid */
2144 color_args[last].done = 1; /* DONE bit */
2145 }
2146
2147 /* Export PS outputs. */
2148 for (unsigned i = 0; i < index; i++)
2149 ac_build_export(&ctx->ac, &color_args[i]);
2150
2151 if (depth || stencil || samplemask)
2152 radv_export_mrt_z(ctx, depth, stencil, samplemask);
2153 else if (!index)
2154 ac_build_export_null(&ctx->ac);
2155 }
2156
2157 static void
2158 emit_gs_epilogue(struct radv_shader_context *ctx)
2159 {
2160 if (ctx->args->shader_info->is_ngg) {
2161 gfx10_ngg_gs_emit_epilogue_1(ctx);
2162 return;
2163 }
2164
2165 if (ctx->ac.chip_class >= GFX10)
2166 LLVMBuildFence(ctx->ac.builder, LLVMAtomicOrderingRelease, false, "");
2167
2168 ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_NOP | AC_SENDMSG_GS_DONE, ctx->gs_wave_id);
2169 }
2170
2171 static void
2172 handle_shader_outputs_post(struct ac_shader_abi *abi)
2173 {
2174 struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);
2175
2176 switch (ctx->stage) {
2177 case MESA_SHADER_VERTEX:
2178 if (ctx->args->shader_info->vs.as_ls)
2179 break; /* Lowered in NIR */
2180 else if (ctx->args->shader_info->vs.as_es)
2181 break; /* Lowered in NIR */
2182 else if (ctx->args->shader_info->is_ngg)
2183 break;
2184 else
2185 handle_vs_outputs_post(ctx, ctx->args->shader_info->vs.outinfo.export_prim_id,
2186 ctx->args->shader_info->vs.outinfo.export_clip_dists,
2187 &ctx->args->shader_info->vs.outinfo);
2188 break;
2189 case MESA_SHADER_FRAGMENT:
2190 handle_fs_outputs_post(ctx);
2191 break;
2192 case MESA_SHADER_GEOMETRY:
2193 emit_gs_epilogue(ctx);
2194 break;
2195 case MESA_SHADER_TESS_CTRL:
2196 break; /* Lowered in NIR */
2197 case MESA_SHADER_TESS_EVAL:
2198 if (ctx->args->shader_info->tes.as_es)
2199 break; /* Lowered in NIR */
2200 else if (ctx->args->shader_info->is_ngg)
2201 break;
2202 else
2203 handle_vs_outputs_post(ctx, ctx->args->shader_info->tes.outinfo.export_prim_id,
2204 ctx->args->shader_info->tes.outinfo.export_clip_dists,
2205 &ctx->args->shader_info->tes.outinfo);
2206 break;
2207 default:
2208 break;
2209 }
2210 }
2211
2212 static void
2213 ac_llvm_finalize_module(struct radv_shader_context *ctx, LLVMPassManagerRef passmgr,
2214 const struct radv_nir_compiler_options *options)
2215 {
2216 LLVMRunPassManager(passmgr, ctx->ac.module);
2217 LLVMDisposeBuilder(ctx->ac.builder);
2218
2219 ac_llvm_context_dispose(&ctx->ac);
2220 }
2221
2222 static void
2223 ac_nir_eliminate_const_vs_outputs(struct radv_shader_context *ctx)
2224 {
2225 struct radv_vs_output_info *outinfo;
2226
2227 switch (ctx->stage) {
2228 case MESA_SHADER_FRAGMENT:
2229 case MESA_SHADER_COMPUTE:
2230 case MESA_SHADER_TESS_CTRL:
2231 case MESA_SHADER_GEOMETRY:
2232 return;
2233 case MESA_SHADER_VERTEX:
2234 if (ctx->args->shader_info->vs.as_ls ||
2235 ctx->args->shader_info->vs.as_es)
2236 return;
2237 outinfo = &ctx->args->shader_info->vs.outinfo;
2238 break;
2239 case MESA_SHADER_TESS_EVAL:
2240 if (ctx->args->shader_info->tes.as_es)
2241 return;
2242 outinfo = &ctx->args->shader_info->tes.outinfo;
2243 break;
2244 default:
2245 unreachable("Unhandled shader type");
2246 }
2247
2248 ac_optimize_vs_outputs(&ctx->ac, ctx->main_function, outinfo->vs_output_param_offset,
2249 VARYING_SLOT_MAX, 0, &outinfo->param_exports);
2250 }
2251
2252 static void
2253 ac_setup_rings(struct radv_shader_context *ctx)
2254 {
2255 if (ctx->args->options->chip_class <= GFX8 &&
2256 (ctx->stage == MESA_SHADER_GEOMETRY ||
2257 (ctx->stage == MESA_SHADER_VERTEX && ctx->args->shader_info->vs.as_es) ||
2258 (ctx->stage == MESA_SHADER_TESS_EVAL && ctx->args->shader_info->tes.as_es))) {
2259 unsigned ring = ctx->stage == MESA_SHADER_GEOMETRY ? RING_ESGS_GS : RING_ESGS_VS;
2260 LLVMValueRef offset = LLVMConstInt(ctx->ac.i32, ring, false);
2261
2262 ctx->esgs_ring = ac_build_load_to_sgpr(&ctx->ac, ctx->ring_offsets, offset);
2263 }
2264
2265 if (ctx->args->is_gs_copy_shader) {
2266 ctx->gsvs_ring[0] = ac_build_load_to_sgpr(&ctx->ac, ctx->ring_offsets,
2267 LLVMConstInt(ctx->ac.i32, RING_GSVS_VS, false));
2268 }
2269
2270 if (ctx->stage == MESA_SHADER_GEOMETRY) {
2271 /* The conceptual layout of the GSVS ring is
2272 * v0c0 .. vLc0 v0c1 .. vLc1 ..
2273 * but the real memory layout is swizzled across
2274 * threads:
2275 * t0v0c0 .. t15v0c0 t0v1c0 .. t15v1c0 ... t15vLcL
2276 * t16v0c0 ..
2277 * Override the buffer descriptor accordingly.
2278 */
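/* Illustrative numbers (assumed, not taken from hardware documentation): a
 * stream with num_components == 4 and gs.vertices_out == 3 gets
 * stride = 4 * 4 * 3 = 48 bytes per thread in the loop below, and the base
 * address of the next stream is advanced by stride * wave_size bytes,
 * matching the per-thread swizzled layout described above.
 */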
2279 LLVMTypeRef v2i64 = LLVMVectorType(ctx->ac.i64, 2);
2280 uint64_t stream_offset = 0;
2281 unsigned num_records = ctx->ac.wave_size;
2282 LLVMValueRef base_ring;
2283
2284 base_ring = ac_build_load_to_sgpr(&ctx->ac, ctx->ring_offsets,
2285 LLVMConstInt(ctx->ac.i32, RING_GSVS_GS, false));
2286
2287 for (unsigned stream = 0; stream < 4; stream++) {
2288 unsigned num_components, stride;
2289 LLVMValueRef ring, tmp;
2290
2291 num_components = ctx->args->shader_info->gs.num_stream_output_components[stream];
2292
2293 if (!num_components)
2294 continue;
2295
2296 stride = 4 * num_components * ctx->shader->info.gs.vertices_out;
2297
2298 /* Limit on the stride field for <= GFX7. */
2299 assert(stride < (1 << 14));
2300
2301 ring = LLVMBuildBitCast(ctx->ac.builder, base_ring, v2i64, "");
2302 tmp = LLVMBuildExtractElement(ctx->ac.builder, ring, ctx->ac.i32_0, "");
2303 tmp = LLVMBuildAdd(ctx->ac.builder, tmp, LLVMConstInt(ctx->ac.i64, stream_offset, 0), "");
2304 ring = LLVMBuildInsertElement(ctx->ac.builder, ring, tmp, ctx->ac.i32_0, "");
2305
2306 stream_offset += stride * ctx->ac.wave_size;
2307
2308 ring = LLVMBuildBitCast(ctx->ac.builder, ring, ctx->ac.v4i32, "");
2309
2310 tmp = LLVMBuildExtractElement(ctx->ac.builder, ring, ctx->ac.i32_1, "");
2311 tmp = LLVMBuildOr(ctx->ac.builder, tmp,
2312 LLVMConstInt(ctx->ac.i32, S_008F04_STRIDE(stride), false), "");
2313 ring = LLVMBuildInsertElement(ctx->ac.builder, ring, tmp, ctx->ac.i32_1, "");
2314
2315 ring = LLVMBuildInsertElement(ctx->ac.builder, ring,
2316 LLVMConstInt(ctx->ac.i32, num_records, false),
2317 LLVMConstInt(ctx->ac.i32, 2, false), "");
2318
2319 ctx->gsvs_ring[stream] = ring;
2320 }
2321 }
2322
2323 if (ctx->stage == MESA_SHADER_TESS_CTRL || ctx->stage == MESA_SHADER_TESS_EVAL) {
2324 ctx->hs_ring_tess_offchip = ac_build_load_to_sgpr(
2325 &ctx->ac, ctx->ring_offsets, LLVMConstInt(ctx->ac.i32, RING_HS_TESS_OFFCHIP, false));
2326 ctx->hs_ring_tess_factor = ac_build_load_to_sgpr(
2327 &ctx->ac, ctx->ring_offsets, LLVMConstInt(ctx->ac.i32, RING_HS_TESS_FACTOR, false));
2328 }
2329 }
2330
2331 /* Fix up the LS input VGPRs because the HW doesn't emit the TCS regs if there are no HS threads. */
2332 static void
2333 ac_nir_fixup_ls_hs_input_vgprs(struct radv_shader_context *ctx)
2334 {
2335 LLVMValueRef count =
2336 ac_unpack_param(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->ac.merged_wave_info), 8, 8);
2337 LLVMValueRef hs_empty = LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ, count, ctx->ac.i32_0, "");
2338 ctx->abi.instance_id =
2339 LLVMBuildSelect(ctx->ac.builder, hs_empty, ac_get_arg(&ctx->ac, ctx->args->ac.vertex_id),
2340 ctx->abi.instance_id, "");
2341 ctx->vs_rel_patch_id =
2342 LLVMBuildSelect(ctx->ac.builder, hs_empty, ac_get_arg(&ctx->ac, ctx->args->ac.tcs_rel_ids),
2343 ctx->vs_rel_patch_id, "");
2344 ctx->abi.vertex_id =
2345 LLVMBuildSelect(ctx->ac.builder, hs_empty, ac_get_arg(&ctx->ac, ctx->args->ac.tcs_patch_id),
2346 ctx->abi.vertex_id, "");
2347 }
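/* What the selects above compensate for (inferred from this code and the
 * has_ls_vgpr_init_bug flag, not from a hardware spec): when a wave contains
 * no HS threads, the HS system VGPRs are not provided, so the LS inputs
 * arrive shifted down: vertex_id shows up in the tcs_patch_id slot, the
 * relative patch id in the tcs_rel_ids slot, and instance_id in the
 * vertex_id slot.
 */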
2348
2349 static void
2350 prepare_gs_input_vgprs(struct radv_shader_context *ctx, bool merged)
2351 {
2352 if (merged) {
2353 for (int i = 5; i >= 0; --i) {
2354 ctx->gs_vtx_offset[i] = ac_unpack_param(
2355 &ctx->ac, ac_get_arg(&ctx->ac, ctx->args->ac.gs_vtx_offset[i / 2]), (i & 1) * 16, 16);
2356 }
2357
2358 ctx->gs_wave_id =
2359 ac_unpack_param(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->ac.merged_wave_info), 16, 8);
2360 } else {
2361 for (int i = 0; i < 6; i++)
2362 ctx->gs_vtx_offset[i] = ac_get_arg(&ctx->ac, ctx->args->ac.gs_vtx_offset[i]);
2363 ctx->gs_wave_id = ac_get_arg(&ctx->ac, ctx->args->ac.gs_wave_id);
2364 }
2365 }
2366
2367 /* Ensure that the esgs ring is declared.
2368 *
2369 * We declare it with 64KB alignment as a hint that the
2370 * pointer value will always be 0.
2371 */
2372 static void
2373 declare_esgs_ring(struct radv_shader_context *ctx)
2374 {
2375 if (ctx->esgs_ring)
2376 return;
2377
2378 assert(!LLVMGetNamedGlobal(ctx->ac.module, "esgs_ring"));
2379
2380 ctx->esgs_ring = LLVMAddGlobalInAddressSpace(ctx->ac.module, LLVMArrayType(ctx->ac.i32, 0),
2381 "esgs_ring", AC_ADDR_SPACE_LDS);
2382 LLVMSetLinkage(ctx->esgs_ring, LLVMExternalLinkage);
2383 LLVMSetAlignment(ctx->esgs_ring, 64 * 1024);
2384 }
2385
2386 static LLVMModuleRef
2387 ac_translate_nir_to_llvm(struct ac_llvm_compiler *ac_llvm, struct nir_shader *const *shaders,
2388 int shader_count, const struct radv_shader_args *args)
2389 {
2390 struct radv_shader_context ctx = {0};
2391 ctx.args = args;
2392
2393 enum ac_float_mode float_mode = AC_FLOAT_MODE_DEFAULT;
2394
2395 if (shaders[0]->info.float_controls_execution_mode & FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP32) {
2396 float_mode = AC_FLOAT_MODE_DENORM_FLUSH_TO_ZERO;
2397 }
2398
2399 ac_llvm_context_init(&ctx.ac, ac_llvm, args->options->chip_class, args->options->family,
2400 args->options->info, float_mode, args->shader_info->wave_size,
2401 args->shader_info->ballot_bit_size);
2402 ctx.context = ctx.ac.context;
2403
2404 ctx.max_workgroup_size = args->shader_info->workgroup_size;
2405
2406 if (ctx.ac.chip_class >= GFX10) {
2407 if (is_pre_gs_stage(shaders[0]->info.stage) && args->shader_info->is_ngg) {
2408 ctx.max_workgroup_size = 128;
2409 }
2410 }
2411
2412 create_function(&ctx, shaders[shader_count - 1]->info.stage, shader_count >= 2);
2413
2414 ctx.abi.emit_outputs = handle_shader_outputs_post;
2415 ctx.abi.emit_vertex_with_counter = visit_emit_vertex_with_counter;
2416 ctx.abi.load_ubo = radv_load_ubo;
2417 ctx.abi.load_ssbo = radv_load_ssbo;
2418 ctx.abi.load_sampler_desc = radv_get_sampler_desc;
2419 ctx.abi.load_resource = radv_load_resource;
2420 ctx.abi.load_ring_tess_factors = load_ring_tess_factors;
2421 ctx.abi.load_ring_tess_offchip = load_ring_tess_offchip;
2422 ctx.abi.load_ring_esgs = load_ring_esgs;
2423 ctx.abi.clamp_shadow_reference = false;
2424 ctx.abi.adjust_frag_coord_z = args->options->adjust_frag_coord_z;
2425 ctx.abi.robust_buffer_access = args->options->robust_buffer_access;
2426
2427 bool is_ngg = is_pre_gs_stage(shaders[0]->info.stage) && args->shader_info->is_ngg;
2428 if (shader_count >= 2 || is_ngg)
2429 ac_init_exec_full_mask(&ctx.ac);
2430
2431 if (args->ac.vertex_id.used)
2432 ctx.abi.vertex_id = ac_get_arg(&ctx.ac, args->ac.vertex_id);
2433 if (args->ac.vs_rel_patch_id.used)
2434 ctx.vs_rel_patch_id = ac_get_arg(&ctx.ac, args->ac.vs_rel_patch_id);
2435 if (args->ac.instance_id.used)
2436 ctx.abi.instance_id = ac_get_arg(&ctx.ac, args->ac.instance_id);
2437
2438 if (args->options->has_ls_vgpr_init_bug &&
2439 shaders[shader_count - 1]->info.stage == MESA_SHADER_TESS_CTRL)
2440 ac_nir_fixup_ls_hs_input_vgprs(&ctx);
2441
2442 if (is_ngg) {
2443 /* Declare scratch space base for streamout and vertex
2444 * compaction. Whether space is actually allocated is
2445 * determined during linking / PM4 creation.
2446 *
2447 * Add an extra dword per vertex to ensure an odd stride, which
2448 * avoids bank conflicts for SoA accesses.
2449 */
2450 if (!args->shader_info->is_ngg_passthrough)
2451 declare_esgs_ring(&ctx);
2452
2453 /* GFX10 hang workaround - an s_barrier is always required before gs_alloc_req */
2454 if (ctx.ac.chip_class == GFX10 && shader_count == 1)
2455 ac_build_s_barrier(&ctx.ac);
2456 }
2457
2458 for (int shader_idx = 0; shader_idx < shader_count; ++shader_idx) {
2459 ctx.stage = shaders[shader_idx]->info.stage;
2460 ctx.shader = shaders[shader_idx];
2461 ctx.output_mask = 0;
2462
2463 if (shaders[shader_idx]->info.stage == MESA_SHADER_GEOMETRY) {
2464 for (int i = 0; i < 4; i++) {
2465 ctx.gs_next_vertex[i] = ac_build_alloca(&ctx.ac, ctx.ac.i32, "");
2466 }
2467 if (args->shader_info->is_ngg) {
2468 for (unsigned i = 0; i < 4; ++i) {
2469 ctx.gs_curprim_verts[i] = ac_build_alloca(&ctx.ac, ctx.ac.i32, "");
2470 ctx.gs_generated_prims[i] = ac_build_alloca(&ctx.ac, ctx.ac.i32, "");
2471 }
2472
2473 LLVMTypeRef ai32 = LLVMArrayType(ctx.ac.i32, 8);
2474 ctx.gs_ngg_scratch =
2475 LLVMAddGlobalInAddressSpace(ctx.ac.module, ai32, "ngg_scratch", AC_ADDR_SPACE_LDS);
2476 LLVMSetInitializer(ctx.gs_ngg_scratch, LLVMGetUndef(ai32));
2477 LLVMSetAlignment(ctx.gs_ngg_scratch, 4);
2478
2479 ctx.gs_ngg_emit = LLVMAddGlobalInAddressSpace(
2480 ctx.ac.module, LLVMArrayType(ctx.ac.i32, 0), "ngg_emit", AC_ADDR_SPACE_LDS);
2481 LLVMSetLinkage(ctx.gs_ngg_emit, LLVMExternalLinkage);
2482 LLVMSetAlignment(ctx.gs_ngg_emit, 4);
2483 }
2484
2485 ctx.abi.emit_primitive = visit_end_primitive;
2486 } else if (shaders[shader_idx]->info.stage == MESA_SHADER_TESS_EVAL) {
2487 } else if (shaders[shader_idx]->info.stage == MESA_SHADER_VERTEX) {
2488 ctx.abi.load_base_vertex = radv_load_base_vertex;
2489 ctx.abi.load_inputs = radv_load_vs_inputs;
2490 } else if (shaders[shader_idx]->info.stage == MESA_SHADER_FRAGMENT) {
2491 ctx.abi.load_sample_position = load_sample_position;
2492 ctx.abi.load_sample_mask_in = load_sample_mask_in;
2493 }
2494
2495 if (shaders[shader_idx]->info.stage == MESA_SHADER_VERTEX &&
2496 args->shader_info->is_ngg &&
2497 args->shader_info->vs.outinfo.export_prim_id) {
2498 declare_esgs_ring(&ctx);
2499 }
2500
2501 bool nested_barrier = false;
2502
2503 if (shader_idx) {
2504 if (shaders[shader_idx]->info.stage == MESA_SHADER_GEOMETRY &&
2505 args->shader_info->is_ngg) {
2506 gfx10_ngg_gs_emit_prologue(&ctx);
2507 nested_barrier = false;
2508 } else {
2509 nested_barrier = true;
2510 }
2511 }
2512
2513 if (nested_barrier) {
2514 /* Execute a barrier before the second shader in
2515 * a merged shader.
2516 *
2517 * Execute the barrier inside the conditional block,
2518 * so that empty waves can jump directly to s_endpgm,
2519 * which will also signal the barrier.
2520 *
2521 * This is possible in gfx9, because an empty wave
2522 * for the second shader does not participate in
2523 * the epilogue. With NGG, empty waves may still
2524 * be required to export data (e.g. GS output vertices),
2525 * so we cannot let them exit early.
2526 *
2527 * If the shader is TCS and the TCS epilog is present
2528 * and contains a barrier, it will wait there and then
2529 * reach s_endpgm.
2530 */
2531 ac_emit_barrier(&ctx.ac, ctx.stage);
2532 }
2533
2534 nir_foreach_shader_out_variable(variable, shaders[shader_idx]) scan_shader_output_decl(
2535 &ctx, variable, shaders[shader_idx], shaders[shader_idx]->info.stage);
2536
2537 ac_setup_rings(&ctx);
2538
2539 LLVMBasicBlockRef merge_block = NULL;
2540 if (shader_count >= 2 || is_ngg) {
2541 LLVMValueRef fn = LLVMGetBasicBlockParent(LLVMGetInsertBlock(ctx.ac.builder));
2542 LLVMBasicBlockRef then_block = LLVMAppendBasicBlockInContext(ctx.ac.context, fn, "");
2543 merge_block = LLVMAppendBasicBlockInContext(ctx.ac.context, fn, "");
2544
2545 LLVMValueRef count = ac_unpack_param(
2546 &ctx.ac, ac_get_arg(&ctx.ac, args->ac.merged_wave_info), 8 * shader_idx, 8);
2547 LLVMValueRef thread_id = ac_get_thread_id(&ctx.ac);
2548 LLVMValueRef cond = LLVMBuildICmp(ctx.ac.builder, LLVMIntULT, thread_id, count, "");
2549 LLVMBuildCondBr(ctx.ac.builder, cond, then_block, merge_block);
2550
2551 LLVMPositionBuilderAtEnd(ctx.ac.builder, then_block);
2552 }
2553
2554 if (shaders[shader_idx]->info.stage == MESA_SHADER_FRAGMENT)
2555 prepare_interp_optimize(&ctx, shaders[shader_idx]);
2556 else if (shaders[shader_idx]->info.stage == MESA_SHADER_GEOMETRY)
2557 prepare_gs_input_vgprs(&ctx, shader_count >= 2);
2558
2559 ac_nir_translate(&ctx.ac, &ctx.abi, &args->ac, shaders[shader_idx]);
2560
2561 if (shader_count >= 2 || is_ngg) {
2562 LLVMBuildBr(ctx.ac.builder, merge_block);
2563 LLVMPositionBuilderAtEnd(ctx.ac.builder, merge_block);
2564 }
2565
2566 /* This needs to be outside the if wrapping the shader body, as sometimes
2567 * the HW generates waves with 0 es/vs threads. */
2568 if (is_pre_gs_stage(shaders[shader_idx]->info.stage) &&
2569 args->shader_info->is_ngg && shader_idx == shader_count - 1) {
2570 handle_ngg_outputs_post_2(&ctx);
2571 } else if (shaders[shader_idx]->info.stage == MESA_SHADER_GEOMETRY &&
2572 args->shader_info->is_ngg) {
2573 gfx10_ngg_gs_emit_epilogue_2(&ctx);
2574 }
2575 }
2576
2577 LLVMBuildRetVoid(ctx.ac.builder);
2578
2579 if (args->options->dump_preoptir) {
2580 fprintf(stderr, "%s LLVM IR:\n\n",
2581 radv_get_shader_name(args->shader_info, shaders[shader_count - 1]->info.stage));
2582 ac_dump_module(ctx.ac.module);
2583 fprintf(stderr, "\n");
2584 }
2585
2586 ac_llvm_finalize_module(&ctx, ac_llvm->passmgr, args->options);
2587
2588 if (shader_count == 1)
2589 ac_nir_eliminate_const_vs_outputs(&ctx);
2590
2591 return ctx.ac.module;
2592 }
2593
2594 static void
2595 ac_diagnostic_handler(LLVMDiagnosticInfoRef di, void *context)
2596 {
2597 unsigned *retval = (unsigned *)context;
2598 LLVMDiagnosticSeverity severity = LLVMGetDiagInfoSeverity(di);
2599 char *description = LLVMGetDiagInfoDescription(di);
2600
2601 if (severity == LLVMDSError) {
2602 *retval = 1;
2603 fprintf(stderr, "LLVM triggered Diagnostic Handler: %s\n", description);
2604 }
2605
2606 LLVMDisposeMessage(description);
2607 }
2608
2609 static unsigned
2610 radv_llvm_compile(LLVMModuleRef M, char **pelf_buffer, size_t *pelf_size,
2611 struct ac_llvm_compiler *ac_llvm)
2612 {
2613 unsigned retval = 0;
2614 LLVMContextRef llvm_ctx;
2615
2616 /* Setup Diagnostic Handler */
2617 llvm_ctx = LLVMGetModuleContext(M);
2618
2619 LLVMContextSetDiagnosticHandler(llvm_ctx, ac_diagnostic_handler, &retval);
2620
2621 /* Compile IR */
2622 if (!radv_compile_to_elf(ac_llvm, M, pelf_buffer, pelf_size))
2623 retval = 1;
2624 return retval;
2625 }
2626
2627 static void
2628 ac_compile_llvm_module(struct ac_llvm_compiler *ac_llvm, LLVMModuleRef llvm_module,
2629 struct radv_shader_binary **rbinary, gl_shader_stage stage, const char *name,
2630 const struct radv_nir_compiler_options *options)
2631 {
2632 char *elf_buffer = NULL;
2633 size_t elf_size = 0;
2634 char *llvm_ir_string = NULL;
2635
2636 if (options->dump_shader) {
2637 fprintf(stderr, "%s LLVM IR:\n\n", name);
2638 ac_dump_module(llvm_module);
2639 fprintf(stderr, "\n");
2640 }
2641
2642 if (options->record_ir) {
2643 char *llvm_ir = LLVMPrintModuleToString(llvm_module);
2644 llvm_ir_string = strdup(llvm_ir);
2645 LLVMDisposeMessage(llvm_ir);
2646 }
2647
2648 int v = radv_llvm_compile(llvm_module, &elf_buffer, &elf_size, ac_llvm);
2649 if (v) {
2650 fprintf(stderr, "compile failed\n");
2651 }
2652
2653 LLVMContextRef ctx = LLVMGetModuleContext(llvm_module);
2654 LLVMDisposeModule(llvm_module);
2655 LLVMContextDispose(ctx);
2656
2657 size_t llvm_ir_size = llvm_ir_string ? strlen(llvm_ir_string) : 0;
2658 size_t alloc_size = sizeof(struct radv_shader_binary_rtld) + elf_size + llvm_ir_size + 1;
2659 struct radv_shader_binary_rtld *rbin = calloc(1, alloc_size);
2660 memcpy(rbin->data, elf_buffer, elf_size);
2661 if (llvm_ir_string)
2662 memcpy(rbin->data + elf_size, llvm_ir_string, llvm_ir_size + 1);
2663
2664 rbin->base.type = RADV_BINARY_TYPE_RTLD;
2665 rbin->base.stage = stage;
2666 rbin->base.total_size = alloc_size;
2667 rbin->elf_size = elf_size;
2668 rbin->llvm_ir_size = llvm_ir_size;
2669 *rbinary = &rbin->base;
2670
2671 free(llvm_ir_string);
2672 free(elf_buffer);
2673 }
2674
2675 static void
2676 radv_compile_nir_shader(struct ac_llvm_compiler *ac_llvm, struct radv_shader_binary **rbinary,
2677 const struct radv_shader_args *args, struct nir_shader *const *nir,
2678 int nir_count)
2679 {
2680
2681 LLVMModuleRef llvm_module;
2682
2683 llvm_module = ac_translate_nir_to_llvm(ac_llvm, nir, nir_count, args);
2684
2685 ac_compile_llvm_module(ac_llvm, llvm_module, rbinary, nir[nir_count - 1]->info.stage,
2686 radv_get_shader_name(args->shader_info, nir[nir_count - 1]->info.stage),
2687 args->options);
2688 }
2689
2690 static void
2691 ac_gs_copy_shader_emit(struct radv_shader_context *ctx)
2692 {
2693 LLVMValueRef vtx_offset =
2694 LLVMBuildMul(ctx->ac.builder, ac_get_arg(&ctx->ac, ctx->args->ac.vertex_id),
2695 LLVMConstInt(ctx->ac.i32, 4, false), "");
2696 LLVMValueRef stream_id;
2697
2698 /* Fetch the vertex stream ID. */
2699 if (ctx->args->shader_info->so.num_outputs) {
2700 stream_id =
2701 ac_unpack_param(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->ac.streamout_config), 24, 2);
2702 } else {
2703 stream_id = ctx->ac.i32_0;
2704 }
2705
2706 LLVMBasicBlockRef end_bb;
2707 LLVMValueRef switch_inst;
2708
2709 end_bb = LLVMAppendBasicBlockInContext(ctx->ac.context, ctx->main_function, "end");
2710 switch_inst = LLVMBuildSwitch(ctx->ac.builder, stream_id, end_bb, 4);
2711
2712 for (unsigned stream = 0; stream < 4; stream++) {
2713 unsigned num_components = ctx->args->shader_info->gs.num_stream_output_components[stream];
2714 LLVMBasicBlockRef bb;
2715 unsigned offset;
2716
2717 if (stream > 0 && !num_components)
2718 continue;
2719
2720 if (stream > 0 && !ctx->args->shader_info->so.num_outputs)
2721 continue;
2722
2723 bb = LLVMInsertBasicBlockInContext(ctx->ac.context, end_bb, "out");
2724 LLVMAddCase(switch_inst, LLVMConstInt(ctx->ac.i32, stream, 0), bb);
2725 LLVMPositionBuilderAtEnd(ctx->ac.builder, bb);
2726
2727 offset = 0;
2728 for (unsigned i = 0; i < AC_LLVM_MAX_OUTPUTS; ++i) {
2729 unsigned output_usage_mask = ctx->args->shader_info->gs.output_usage_mask[i];
2730 unsigned output_stream = ctx->args->shader_info->gs.output_streams[i];
2731 int length = util_last_bit(output_usage_mask);
2732
2733 if (!(ctx->output_mask & (1ull << i)) || output_stream != stream)
2734 continue;
2735
2736 for (unsigned j = 0; j < length; j++) {
2737 LLVMValueRef value, soffset;
2738
2739 if (!(output_usage_mask & (1 << j)))
2740 continue;
2741
2742 soffset = LLVMConstInt(ctx->ac.i32, offset * ctx->shader->info.gs.vertices_out * 16 * 4,
2743 false);
2744
2745 offset++;
2746
2747 value = ac_build_buffer_load(&ctx->ac, ctx->gsvs_ring[0], 1, ctx->ac.i32_0, vtx_offset,
2748 soffset, 0, ctx->ac.f32, ac_glc | ac_slc, true, false);
2749
2750 LLVMTypeRef type = LLVMGetAllocatedType(ctx->abi.outputs[ac_llvm_reg_index_soa(i, j)]);
2751 if (ac_get_type_size(type) == 2) {
2752 value = LLVMBuildBitCast(ctx->ac.builder, value, ctx->ac.i32, "");
2753 value = LLVMBuildTrunc(ctx->ac.builder, value, ctx->ac.i16, "");
2754 }
2755
2756 LLVMBuildStore(ctx->ac.builder, ac_to_float(&ctx->ac, value),
2757 ctx->abi.outputs[ac_llvm_reg_index_soa(i, j)]);
2758 }
2759 }
2760
2761 if (ctx->args->shader_info->so.num_outputs)
2762 radv_emit_streamout(ctx, stream);
2763
2764 if (stream == 0) {
2765 handle_vs_outputs_post(ctx, false, ctx->args->shader_info->vs.outinfo.export_clip_dists,
2766 &ctx->args->shader_info->vs.outinfo);
2767 }
2768
2769 LLVMBuildBr(ctx->ac.builder, end_bb);
2770 }
2771
2772 LLVMPositionBuilderAtEnd(ctx->ac.builder, end_bb);
2773 }
2774
2775 static void
2776 radv_compile_gs_copy_shader(struct ac_llvm_compiler *ac_llvm, struct nir_shader *geom_shader,
2777 struct radv_shader_binary **rbinary,
2778 const struct radv_shader_args *args)
2779 {
2780 struct radv_shader_context ctx = {0};
2781 ctx.args = args;
2782
2783 assert(args->is_gs_copy_shader);
2784
2785 ac_llvm_context_init(&ctx.ac, ac_llvm, args->options->chip_class, args->options->family,
2786 args->options->info, AC_FLOAT_MODE_DEFAULT, 64, 64);
2787 ctx.context = ctx.ac.context;
2788
2789 ctx.stage = MESA_SHADER_VERTEX;
2790 ctx.shader = geom_shader;
2791
2792 create_function(&ctx, MESA_SHADER_VERTEX, false);
2793
2794 ac_setup_rings(&ctx);
2795
2796 nir_foreach_shader_out_variable(variable, geom_shader)
2797 {
2798 scan_shader_output_decl(&ctx, variable, geom_shader, MESA_SHADER_VERTEX);
2799 ac_handle_shader_output_decl(&ctx.ac, &ctx.abi, geom_shader, variable, MESA_SHADER_VERTEX);
2800 }
2801
2802 ac_gs_copy_shader_emit(&ctx);
2803
2804 LLVMBuildRetVoid(ctx.ac.builder);
2805
2806 ac_llvm_finalize_module(&ctx, ac_llvm->passmgr, args->options);
2807
2808 ac_compile_llvm_module(ac_llvm, ctx.ac.module, rbinary, MESA_SHADER_VERTEX, "GS Copy Shader",
2809 args->options);
2810 (*rbinary)->is_gs_copy_shader = true;
2811 }
2812
2813 void
2814 llvm_compile_shader(struct radv_device *device, unsigned shader_count,
2815 struct nir_shader *const *shaders, struct radv_shader_binary **binary,
2816 struct radv_shader_args *args)
2817 {
2818 enum ac_target_machine_options tm_options = 0;
2819 struct ac_llvm_compiler ac_llvm;
2820
2821 tm_options |= AC_TM_SUPPORTS_SPILL;
2822 if (args->options->check_ir)
2823 tm_options |= AC_TM_CHECK_IR;
2824
2825 radv_init_llvm_compiler(&ac_llvm, args->options->family, tm_options,
2826 args->shader_info->wave_size);
2827
2828 if (args->is_gs_copy_shader) {
2829 radv_compile_gs_copy_shader(&ac_llvm, *shaders, binary, args);
2830 } else {
2831 radv_compile_nir_shader(&ac_llvm, binary, args, shaders, shader_count);
2832 }
2833 }
2834