1 /*
2 * Copyright 2020 Advanced Micro Devices, Inc.
3 * All Rights Reserved.
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * on the rights to use, copy, modify, merge, publish, distribute, sub
9 * license, and/or sell copies of the Software, and to permit persons to whom
10 * the Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
20 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
21 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
22 * USE OR OTHER DEALINGS IN THE SOFTWARE.
23 */
24
25 #include "si_pipe.h"
26 #include "si_shader_internal.h"
27 #include "sid.h"
28 #include "util/u_memory.h"
29 #include "ac_nir.h"
30
/* Sign-extend one 16-bit half of a 32-bit value into a full i32.
 * index 0 selects the low half, index 1 the high half.
 */
static LLVMValueRef unpack_sint16(struct si_shader_context *ctx, LLVMValueRef i32, unsigned index)
{
   LLVMBuilderRef builder = ctx->ac.builder;

   assert(index <= 1);

   if (index == 0) {
      /* Truncate to i16, then sign-extend back to i32. */
      LLVMValueRef lo = LLVMBuildTrunc(builder, i32, ctx->ac.i16, "");
      return LLVMBuildSExt(builder, lo, ctx->ac.i32, "");
   }

   /* An arithmetic shift right by 16 sign-extends the high half. */
   return LLVMBuildAShr(builder, i32, LLVMConstInt(ctx->ac.i32, 16, 0), "");
}
41
/* Fetch one VS vertex input and write its 4 components to out[0..3].
 *
 * Two paths exist:
 *  - Blit shaders (vs_blit_property != 0): values come from user SGPRs
 *    (packed position, then color or texcoords).
 *  - Regular shaders: values are loaded from the vertex buffer descriptor,
 *    with optional format fixups described by key.ge.mono.vs_fix_fetch.
 */
static void load_input_vs(struct si_shader_context *ctx, unsigned input_index, LLVMValueRef out[4])
{
   const struct si_shader_info *info = &ctx->shader->selector->info;
   unsigned vs_blit_property = info->base.vs.blit_sgprs_amd;

   if (vs_blit_property) {
      /* Blit path: inputs are taken from SGPRs instead of vertex buffers. */
      LLVMValueRef vertex_id = ctx->abi.vertex_id;
      /* Vertices 0 and 1 use x1; vertex 2 uses x2. */
      LLVMValueRef sel_x1 =
         LLVMBuildICmp(ctx->ac.builder, LLVMIntULE, vertex_id, ctx->ac.i32_1, "");
      /* Use LLVMIntNE, because we have 3 vertices and only
       * the middle one should use y2.
       */
      LLVMValueRef sel_y1 = LLVMBuildICmp(ctx->ac.builder, LLVMIntNE, vertex_id, ctx->ac.i32_1, "");

      unsigned param_vs_blit_inputs = ctx->vs_blit_inputs.arg_index;
      if (input_index == 0) {
         /* Position: two SGPRs hold (x1,y1) and (x2,y2) as packed sint16 pairs;
          * a third SGPR holds the depth.
          */
         LLVMValueRef x1y1 = LLVMGetParam(ctx->main_fn, param_vs_blit_inputs);
         LLVMValueRef x2y2 = LLVMGetParam(ctx->main_fn, param_vs_blit_inputs + 1);

         LLVMValueRef x1 = unpack_sint16(ctx, x1y1, 0);
         LLVMValueRef y1 = unpack_sint16(ctx, x1y1, 1);
         LLVMValueRef x2 = unpack_sint16(ctx, x2y2, 0);
         LLVMValueRef y2 = unpack_sint16(ctx, x2y2, 1);

         LLVMValueRef x = LLVMBuildSelect(ctx->ac.builder, sel_x1, x1, x2, "");
         LLVMValueRef y = LLVMBuildSelect(ctx->ac.builder, sel_y1, y1, y2, "");

         out[0] = LLVMBuildSIToFP(ctx->ac.builder, x, ctx->ac.f32, "");
         out[1] = LLVMBuildSIToFP(ctx->ac.builder, y, ctx->ac.f32, "");
         out[2] = LLVMGetParam(ctx->main_fn, param_vs_blit_inputs + 2);
         out[3] = ctx->ac.f32_1;
         return;
      }

      /* Color or texture coordinates: */
      assert(input_index == 1);

      if (vs_blit_property == SI_VS_BLIT_SGPRS_POS_COLOR) {
         /* One SGPR per color channel. */
         for (int i = 0; i < 4; i++) {
            out[i] = LLVMGetParam(ctx->main_fn, param_vs_blit_inputs + 3 + i);
         }
      } else {
         assert(vs_blit_property == SI_VS_BLIT_SGPRS_POS_TEXCOORD);
         /* Texcoords: corner coordinates selected like the position above,
          * plus two extra SGPRs for out[2] and out[3].
          */
         LLVMValueRef x1 = LLVMGetParam(ctx->main_fn, param_vs_blit_inputs + 3);
         LLVMValueRef y1 = LLVMGetParam(ctx->main_fn, param_vs_blit_inputs + 4);
         LLVMValueRef x2 = LLVMGetParam(ctx->main_fn, param_vs_blit_inputs + 5);
         LLVMValueRef y2 = LLVMGetParam(ctx->main_fn, param_vs_blit_inputs + 6);

         out[0] = LLVMBuildSelect(ctx->ac.builder, sel_x1, x1, x2, "");
         out[1] = LLVMBuildSelect(ctx->ac.builder, sel_y1, y1, y2, "");
         out[2] = LLVMGetParam(ctx->main_fn, param_vs_blit_inputs + 7);
         out[3] = LLVMGetParam(ctx->main_fn, param_vs_blit_inputs + 8);
      }
      return;
   }

   /* Set can_speculate=false to help keep all loads grouped together
    * for better latency hiding. If it was true, LLVM could move the loads forward
    * and accidentally double memory latency by doing:
    *
    *    buffer_load_dword_xyzw
    *    s_waitcnt vmcnt(0)
    *    buffer_load_dword_xyzw
    *    s_waitcnt vmcnt(0)
    *
    * ... which is what we must prevent at all cost.
    */
   const bool can_speculate = false;
   unsigned bit_size = info->input[input_index].fp16_lo_hi_valid & 0x1 ? 16 : 32;
   LLVMTypeRef int_type = bit_size == 16 ? ctx->ac.i16 : ctx->ac.i32;
   LLVMTypeRef float_type = bit_size == 16 ? ctx->ac.f16 : ctx->ac.f32;
   unsigned num_vbos_in_user_sgprs = ctx->shader->selector->num_vbos_in_user_sgprs;
   union si_vs_fix_fetch fix_fetch;
   LLVMValueRef vb_desc;
   LLVMValueRef vertex_index;
   LLVMValueRef tmp;

   /* The first N vertex buffer descriptors are preloaded into user SGPRs;
    * the rest are fetched from the descriptor array in memory.
    */
   if (input_index < num_vbos_in_user_sgprs) {
      vb_desc = ac_get_arg(&ctx->ac, ctx->vb_descriptors[input_index]);
   } else {
      unsigned index = input_index - num_vbos_in_user_sgprs;
      vb_desc = ac_build_load_to_sgpr(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args.vertex_buffers),
                                      LLVMConstInt(ctx->ac.i32, index, 0));
   }

   vertex_index = LLVMGetParam(ctx->main_fn, ctx->vertex_index0.arg_index + input_index);

   /* Use the open-coded implementation for all loads of doubles and
    * of dword-sized data that needs fixups. We need to insert conversion
    * code anyway, and the amd/common code does it for us.
    */
   bool opencode = ctx->shader->key.ge.mono.vs_fetch_opencode & (1 << input_index);
   fix_fetch.bits = ctx->shader->key.ge.mono.vs_fix_fetch[input_index].bits;
   if (opencode || (fix_fetch.u.log_size == 3 && fix_fetch.u.format == AC_FETCH_FORMAT_FLOAT) ||
       (fix_fetch.u.log_size == 2)) {
      tmp = ac_build_opencoded_load_format(&ctx->ac, fix_fetch.u.log_size,
                                           fix_fetch.u.num_channels_m1 + 1, fix_fetch.u.format,
                                           fix_fetch.u.reverse, !opencode, vb_desc, vertex_index,
                                           ctx->ac.i32_0, ctx->ac.i32_0, 0, can_speculate);
      for (unsigned i = 0; i < 4; ++i)
         out[i] =
            LLVMBuildExtractElement(ctx->ac.builder, tmp, LLVMConstInt(ctx->ac.i32, i, false), "");

      /* The open-coded path produces 32-bit channels; narrow to 16 bits
       * when the shader consumes the input as fp16/int16.
       */
      if (bit_size == 16) {
         if (fix_fetch.u.format == AC_FETCH_FORMAT_UINT ||
             fix_fetch.u.format == AC_FETCH_FORMAT_SINT) {
            for (unsigned i = 0; i < 4; i++)
               out[i] = LLVMBuildTrunc(ctx->ac.builder, out[i], ctx->ac.i16, "");
         } else {
            for (unsigned i = 0; i < 4; i++) {
               out[i] = ac_to_float(&ctx->ac, out[i]);
               out[i] = LLVMBuildFPTrunc(ctx->ac.builder, out[i], ctx->ac.f16, "");
            }
         }
      }
      return;
   }

   unsigned required_channels = util_last_bit(info->input[input_index].usage_mask);
   if (required_channels == 0) {
      /* Nothing reads this input; any value will do. */
      for (unsigned i = 0; i < 4; ++i)
         out[i] = LLVMGetUndef(ctx->ac.f32);
      return;
   }

   /* Do multiple loads for special formats. */
   LLVMValueRef fetches[4];
   unsigned num_fetches;
   unsigned fetch_stride;
   unsigned channels_per_fetch;

   /* 3-channel formats with 8- or 16-bit channels (log_size <= 1) are
    * loaded one channel at a time; everything else uses one vector fetch.
    */
   if (fix_fetch.u.log_size <= 1 && fix_fetch.u.num_channels_m1 == 2) {
      num_fetches = MIN2(required_channels, 3);
      fetch_stride = 1 << fix_fetch.u.log_size;
      channels_per_fetch = 1;
   } else {
      num_fetches = 1;
      fetch_stride = 0;
      channels_per_fetch = required_channels;
   }

   for (unsigned i = 0; i < num_fetches; ++i) {
      LLVMValueRef voffset = LLVMConstInt(ctx->ac.i32, fetch_stride * i, 0);
      fetches[i] = ac_build_buffer_load_format(&ctx->ac, vb_desc, vertex_index, voffset,
                                               channels_per_fetch, 0, can_speculate,
                                               bit_size == 16, false);
   }

   /* Split a single vector fetch into scalar channels. */
   if (num_fetches == 1 && channels_per_fetch > 1) {
      LLVMValueRef fetch = fetches[0];
      for (unsigned i = 0; i < channels_per_fetch; ++i) {
         tmp = LLVMConstInt(ctx->ac.i32, i, false);
         fetches[i] = LLVMBuildExtractElement(ctx->ac.builder, fetch, tmp, "");
      }
      num_fetches = channels_per_fetch;
      channels_per_fetch = 1;
   }

   for (unsigned i = num_fetches; i < 4; ++i)
      fetches[i] = LLVMGetUndef(float_type);

   /* The 3-channel path above never fetches W; supply the default W = 1. */
   if (fix_fetch.u.log_size <= 1 && fix_fetch.u.num_channels_m1 == 2 && required_channels == 4) {
      if (fix_fetch.u.format == AC_FETCH_FORMAT_UINT || fix_fetch.u.format == AC_FETCH_FORMAT_SINT)
         fetches[3] = LLVMConstInt(int_type, 1, 0);
      else
         fetches[3] = LLVMConstReal(float_type, 1);
   } else if (fix_fetch.u.log_size == 3 &&
              (fix_fetch.u.format == AC_FETCH_FORMAT_SNORM ||
               fix_fetch.u.format == AC_FETCH_FORMAT_SSCALED ||
               fix_fetch.u.format == AC_FETCH_FORMAT_SINT) &&
              required_channels == 4) {

      /* For 2_10_10_10, the hardware returns an unsigned value;
       * convert it to a signed one.
       */
      LLVMValueRef tmp = fetches[3];
      LLVMValueRef c30 = LLVMConstInt(int_type, 30, 0);

      /* First, recover the sign-extended signed integer value. */
      if (fix_fetch.u.format == AC_FETCH_FORMAT_SSCALED)
         tmp = LLVMBuildFPToUI(ctx->ac.builder, tmp, int_type, "");
      else
         tmp = ac_to_integer(&ctx->ac, tmp);

      /* For the integer-like cases, do a natural sign extension.
       *
       * For the SNORM case, the values are 0.0, 0.333, 0.666, 1.0
       * and happen to contain 0, 1, 2, 3 as the two LSBs of the
       * exponent.
       */
      tmp = LLVMBuildShl(
         ctx->ac.builder, tmp,
         fix_fetch.u.format == AC_FETCH_FORMAT_SNORM ? LLVMConstInt(int_type, 7, 0) : c30, "");
      tmp = LLVMBuildAShr(ctx->ac.builder, tmp, c30, "");

      /* Convert back to the right type. */
      if (fix_fetch.u.format == AC_FETCH_FORMAT_SNORM) {
         LLVMValueRef clamp;
         LLVMValueRef neg_one = LLVMConstReal(float_type, -1.0);
         tmp = LLVMBuildSIToFP(ctx->ac.builder, tmp, float_type, "");
         /* The shift trick can produce a value below -1; clamp it. */
         clamp = LLVMBuildFCmp(ctx->ac.builder, LLVMRealULT, tmp, neg_one, "");
         tmp = LLVMBuildSelect(ctx->ac.builder, clamp, neg_one, tmp, "");
      } else if (fix_fetch.u.format == AC_FETCH_FORMAT_SSCALED) {
         tmp = LLVMBuildSIToFP(ctx->ac.builder, tmp, float_type, "");
      }

      fetches[3] = tmp;
   }

   for (unsigned i = 0; i < 4; ++i)
      out[i] = ac_to_float(&ctx->ac, fetches[i]);
}
255
si_load_vs_input(struct ac_shader_abi * abi,unsigned driver_location,unsigned component,unsigned num_components,unsigned vertex_index,LLVMTypeRef type)256 static LLVMValueRef si_load_vs_input(struct ac_shader_abi *abi, unsigned driver_location,
257 unsigned component, unsigned num_components,
258 unsigned vertex_index, LLVMTypeRef type)
259 {
260 struct si_shader_context *ctx = si_shader_context_from_abi(abi);
261 LLVMValueRef values[4];
262
263 load_input_vs(ctx, driver_location, values);
264
265 for (unsigned i = 0; i < 4; i++)
266 values[i] = LLVMBuildBitCast(ctx->ac.builder, values[i], type, "");
267
268 return ac_build_varying_gather_values(&ctx->ac, values, num_components, component);
269 }
270
/* Store one shader output (up to 4 components) to its streamout buffer
 * at the precomputed per-thread write offset.
 */
void si_llvm_streamout_store_output(struct si_shader_context *ctx, LLVMValueRef const *so_buffers,
                                    LLVMValueRef const *so_write_offsets,
                                    struct pipe_stream_output *stream_out,
                                    struct si_shader_output_values *shader_out)
{
   const unsigned buf_idx = stream_out->output_buffer;
   const unsigned start = stream_out->start_component;
   const unsigned num_comps = stream_out->num_components;
   LLVMValueRef comps[4];

   assert(num_comps && num_comps <= 4);
   if (!num_comps || num_comps > 4)
      return;

   /* Reinterpret the selected components as integers for the store. */
   for (unsigned j = 0; j < num_comps; j++) {
      assert(stream_out->stream == ((shader_out->vertex_streams >> ((start + j) * 2)) & 0x3));
      comps[j] = ac_to_integer(&ctx->ac, shader_out->values[start + j]);
   }

   /* A single component is stored as i32; 2-4 components as an i32 vector. */
   LLVMValueRef vdata =
      num_comps == 1 ? comps[0] : ac_build_gather_values(&ctx->ac, comps, num_comps);

   ac_build_buffer_store_dword(&ctx->ac, so_buffers[buf_idx], vdata, NULL,
                               so_write_offsets[buf_idx], ctx->ac.i32_0,
                               stream_out->dst_offset * 4, ac_glc | ac_slc);
}
310
311 /**
312 * Write streamout data to buffers for vertex stream @p stream (different
313 * vertex streams can occur for GS copy shaders).
314 */
void si_llvm_emit_streamout(struct si_shader_context *ctx, struct si_shader_output_values *outputs,
                            unsigned noutput, unsigned stream)
{
   struct si_shader_selector *sel = ctx->shader->selector;
   struct pipe_stream_output_info *so = &sel->so;
   LLVMBuilderRef builder = ctx->ac.builder;
   int i;

   /* Get bits [22:16], i.e. (so_param >> 16) & 127; */
   LLVMValueRef so_vtx_count = si_unpack_param(ctx, ctx->args.streamout_config, 16, 7);

   LLVMValueRef tid = ac_get_thread_id(&ctx->ac);

   /* can_emit = tid < so_vtx_count; */
   LLVMValueRef can_emit = LLVMBuildICmp(builder, LLVMIntULT, tid, so_vtx_count, "");

   /* Emit the streamout code conditionally. This actually avoids
    * out-of-bounds buffer access. The hw tells us via the SGPR
    * (so_vtx_count) which threads are allowed to emit streamout data. */
   ac_build_ifcc(&ctx->ac, can_emit, 6501);
   {
      /* The buffer offset is computed as follows:
       *   ByteOffset = streamout_offset[buffer_id]*4 +
       *                (streamout_write_index + thread_id)*stride[buffer_id] +
       *                attrib_offset
       */

      LLVMValueRef so_write_index = ac_get_arg(&ctx->ac, ctx->args.streamout_write_index);

      /* Compute (streamout_write_index + thread_id). */
      so_write_index = LLVMBuildAdd(builder, so_write_index, tid, "");

      /* Load the descriptor and compute the write offset for each
       * enabled buffer. */
      LLVMValueRef so_write_offset[4] = {};
      LLVMValueRef so_buffers[4];
      LLVMValueRef buf_ptr = ac_get_arg(&ctx->ac, ctx->internal_bindings);

      for (i = 0; i < 4; i++) {
         /* A stride of 0 means this streamout buffer is unused. */
         if (!so->stride[i])
            continue;

         LLVMValueRef offset = LLVMConstInt(ctx->ac.i32, SI_VS_STREAMOUT_BUF0 + i, 0);

         so_buffers[i] = ac_build_load_to_sgpr(&ctx->ac, buf_ptr, offset);

         LLVMValueRef so_offset = ac_get_arg(&ctx->ac, ctx->args.streamout_offset[i]);
         /* Multiply by 4 per the ByteOffset formula above. */
         so_offset = LLVMBuildMul(builder, so_offset, LLVMConstInt(ctx->ac.i32, 4, 0), "");

         /* write_offset = write_index * (stride in bytes) + buffer offset */
         so_write_offset[i] = ac_build_imad(
            &ctx->ac, so_write_index, LLVMConstInt(ctx->ac.i32, so->stride[i] * 4, 0), so_offset);
      }

      /* Write streamout data. */
      for (i = 0; i < so->num_outputs; i++) {
         unsigned reg = so->output[i].register_index;

         /* Skip outputs the shader doesn't actually write. */
         if (reg >= noutput)
            continue;

         /* Only emit outputs belonging to the requested vertex stream. */
         if (stream != so->output[i].stream)
            continue;

         si_llvm_streamout_store_output(ctx, so_buffers, so_write_offset, &so->output[i],
                                        &outputs[reg]);
      }
   }
   ac_build_endif(&ctx->ac, 6501);
}
384
/* Compute user clip distances from a clip vertex.
 *
 * Loads the clip planes from the SI_VS_CONST_CLIP_PLANES constant buffer,
 * computes clipdist = dot(plane, clipvertex) for every enabled clip
 * distance, and fills the two clip-distance position export slots.
 */
void si_llvm_clipvertex_to_clipdist(struct si_shader_context *ctx,
                                    struct ac_export_args clipdist[2], LLVMValueRef clipvertex[4])
{
   unsigned reg_index;
   unsigned chan;
   unsigned const_chan;
   LLVMValueRef base_elt;
   LLVMValueRef ptr = ac_get_arg(&ctx->ac, ctx->internal_bindings);
   LLVMValueRef constbuf_index = LLVMConstInt(ctx->ac.i32, SI_VS_CONST_CLIP_PLANES, 0);
   LLVMValueRef const_resource = ac_build_load_to_sgpr(&ctx->ac, ptr, constbuf_index);
   /* Distances that are enabled and not killed by the shader key. */
   unsigned clipdist_mask = ctx->shader->selector->clipdist_mask &
                            ~ctx->shader->key.ge.opt.kill_clip_distances;

   for (reg_index = 0; reg_index < 2; reg_index++) {
      struct ac_export_args *args = &clipdist[reg_index];

      /* Skip this export entirely if none of its 4 distances are enabled. */
      if (!(clipdist_mask & BITFIELD_RANGE(reg_index * 4, 4)))
         continue;

      args->out[0] = args->out[1] = args->out[2] = args->out[3] = LLVMGetUndef(ctx->ac.f32);

      /* Compute dot products of position and user clip plane vectors */
      for (chan = 0; chan < 4; chan++) {
         if (!(clipdist_mask & BITFIELD_BIT(reg_index * 4 + chan)))
            continue;

         for (const_chan = 0; const_chan < 4; const_chan++) {
            /* Each plane is 4 floats; the address is in bytes (hence * 4). */
            LLVMValueRef addr =
               LLVMConstInt(ctx->ac.i32, ((reg_index * 4 + chan) * 4 + const_chan) * 4, 0);
            base_elt = si_buffer_load_const(ctx, const_resource, addr);
            /* Accumulate the dot product; the first term starts from 0. */
            args->out[chan] =
               ac_build_fmad(&ctx->ac, base_elt, clipvertex[const_chan],
                             const_chan == 0 ? ctx->ac.f32_0 : args->out[chan]);
         }
      }

      args->enabled_channels = 0xf;
      args->valid_mask = 0;
      args->done = 0;
      /* Clip distances occupy POS+2 and POS+3. */
      args->target = V_008DFC_SQ_EXP_POS + 2 + reg_index;
      args->compr = 0;
   }
}
428
429 /* Initialize arguments for the shader export intrinsic */
si_llvm_init_vs_export_args(struct si_shader_context * ctx,const LLVMValueRef * values,unsigned target,struct ac_export_args * args)430 static void si_llvm_init_vs_export_args(struct si_shader_context *ctx, const LLVMValueRef *values,
431 unsigned target, struct ac_export_args *args)
432 {
433 args->enabled_channels = 0xf; /* writemask - default is 0xf */
434 args->valid_mask = 0; /* Specify whether the EXEC mask represents the valid mask */
435 args->done = 0; /* Specify whether this is the last export */
436 args->target = target; /* Specify the target we are exporting */
437 args->compr = false;
438
439 memcpy(&args->out[0], values, sizeof(values[0]) * 4);
440 }
441
/* Collect the parameter (non-position) exports for every output that can
 * reach the pixel shader, fill exports[], and record each output's export
 * slot in shader->info.vs_output_param_offset.
 */
static void si_prepare_param_exports(struct si_shader_context *ctx,
                                     const struct si_shader_output_values *outputs, unsigned noutput,
                                     struct ac_export_args exports[32])
{
   struct si_shader *shader = ctx->shader;
   unsigned param_count = 0;

   /* Default: outputs without a param export read back as (0, 0, 0, 0). */
   memset(shader->info.vs_output_param_offset, AC_EXP_PARAM_DEFAULT_VAL_0000,
          sizeof(shader->info.vs_output_param_offset));

   for (unsigned i = 0; i < noutput; i++) {
      unsigned semantic = outputs[i].semantic;

      /* Skip if no channel writes to stream 0.
       * (vertex_streams packs a 2-bit stream index per channel; a channel
       * writes stream 0 exactly when its 2-bit field is 0, so the output is
       * skipped only when all four fields are nonzero.)
       */
      if (outputs[i].vertex_streams & 0x03 &&
          outputs[i].vertex_streams & 0x0c &&
          outputs[i].vertex_streams & 0x30 &&
          outputs[i].vertex_streams & 0xc0)
         continue;

      /* Only semantics that the pixel shader can interpolate get a
       * parameter export; everything else (e.g. POS, PSIZ) is handled
       * via position exports.
       */
      switch (semantic) {
      case VARYING_SLOT_LAYER:
      case VARYING_SLOT_VIEWPORT:
      case VARYING_SLOT_CLIP_DIST0:
      case VARYING_SLOT_CLIP_DIST1:
      case VARYING_SLOT_COL0:
      case VARYING_SLOT_COL1:
      case VARYING_SLOT_BFC0:
      case VARYING_SLOT_BFC1:
      case VARYING_SLOT_PRIMITIVE_ID:
      case VARYING_SLOT_FOGC:
         break;
      default:
         if ((semantic >= VARYING_SLOT_TEX0 && semantic <= VARYING_SLOT_TEX7) ||
             semantic >= VARYING_SLOT_VAR0)
            break;
         else
            continue;
      }

      /* Honor the shader key's dead-output elimination. */
      if ((semantic <= VARYING_SLOT_VAR31 || semantic >= VARYING_SLOT_VAR0_16BIT) &&
          shader->key.ge.opt.kill_outputs &
          (1ull << si_shader_io_get_unique_index(semantic, true)))
         continue;

      si_llvm_init_vs_export_args(ctx, outputs[i].values, V_008DFC_SQ_EXP_PARAM + param_count,
                                  &exports[param_count]);

      assert(i < ARRAY_SIZE(shader->info.vs_output_param_offset));
      shader->info.vs_output_param_offset[i] = param_count++;
   }

   shader->info.nr_param_exports = param_count;
}
496
497 /**
498 * Vertex color clamping.
499 *
500 * This uses a state constant loaded in a user data SGPR and
501 * an IF statement is added that clamps all colors if the constant
502 * is true.
503 */
si_vertex_color_clamping(struct si_shader_context * ctx,struct si_shader_output_values * outputs,unsigned noutput)504 static void si_vertex_color_clamping(struct si_shader_context *ctx,
505 struct si_shader_output_values *outputs, unsigned noutput)
506 {
507 LLVMValueRef addr[SI_MAX_VS_OUTPUTS][4];
508 bool has_colors = false;
509
510 /* Store original colors to alloca variables. */
511 for (unsigned i = 0; i < noutput; i++) {
512 if (outputs[i].semantic != VARYING_SLOT_COL0 &&
513 outputs[i].semantic != VARYING_SLOT_COL1 &&
514 outputs[i].semantic != VARYING_SLOT_BFC0 &&
515 outputs[i].semantic != VARYING_SLOT_BFC1)
516 continue;
517
518 for (unsigned j = 0; j < 4; j++)
519 addr[i][j] = ac_build_alloca_init(&ctx->ac, outputs[i].values[j], "");
520
521 has_colors = true;
522 }
523
524 if (!has_colors)
525 return;
526
527 /* The state is in the first bit of the user SGPR. */
528 LLVMValueRef cond = ac_get_arg(&ctx->ac, ctx->vs_state_bits);
529 cond = LLVMBuildTrunc(ctx->ac.builder, cond, ctx->ac.i1, "");
530
531 ac_build_ifcc(&ctx->ac, cond, 6502);
532
533 /* Store clamped colors to alloca variables within the conditional block. */
534 for (unsigned i = 0; i < noutput; i++) {
535 if (outputs[i].semantic != VARYING_SLOT_COL0 &&
536 outputs[i].semantic != VARYING_SLOT_COL1 &&
537 outputs[i].semantic != VARYING_SLOT_BFC0 &&
538 outputs[i].semantic != VARYING_SLOT_BFC1)
539 continue;
540
541 for (unsigned j = 0; j < 4; j++) {
542 LLVMBuildStore(ctx->ac.builder, ac_build_clamp(&ctx->ac, outputs[i].values[j]),
543 addr[i][j]);
544 }
545 }
546 ac_build_endif(&ctx->ac, 6502);
547
548 /* Load clamped colors */
549 for (unsigned i = 0; i < noutput; i++) {
550 if (outputs[i].semantic != VARYING_SLOT_COL0 &&
551 outputs[i].semantic != VARYING_SLOT_COL1 &&
552 outputs[i].semantic != VARYING_SLOT_BFC0 &&
553 outputs[i].semantic != VARYING_SLOT_BFC1)
554 continue;
555
556 for (unsigned j = 0; j < 4; j++) {
557 outputs[i].values[j] = LLVMBuildLoad(ctx->ac.builder, addr[i][j], "");
558 }
559 }
560 }
561
562 /* Generate export instructions for hardware VS shader stage or NGG GS stage
563 * (position and parameter data only).
564 */
void si_llvm_build_vs_exports(struct si_shader_context *ctx,
                              struct si_shader_output_values *outputs, unsigned noutput)
{
   struct si_shader *shader = ctx->shader;
   /* pos_args[0] = position, [1] = misc vector, [2..3] = clip distances. */
   struct ac_export_args pos_args[4] = {};
   LLVMValueRef psize_value = NULL, edgeflag_value = NULL, layer_value = NULL,
                viewport_index_value = NULL;
   unsigned pos_idx, index;
   /* Enabled clip distances minus the ones killed by the key, plus cull
    * distances.
    */
   unsigned clipdist_mask = (shader->selector->clipdist_mask &
                             ~shader->key.ge.opt.kill_clip_distances) |
                            shader->selector->culldist_mask;
   int i;

   si_vertex_color_clamping(ctx, outputs, noutput);

   struct ac_export_args param_exports[32];
   si_prepare_param_exports(ctx, outputs, noutput, param_exports);

   /* Build position exports. */
   for (i = 0; i < noutput; i++) {
      switch (outputs[i].semantic) {
      case VARYING_SLOT_POS:
         si_llvm_init_vs_export_args(ctx, outputs[i].values, V_008DFC_SQ_EXP_POS, &pos_args[0]);
         break;
      case VARYING_SLOT_PSIZ:
         psize_value = outputs[i].values[0];
         break;
      case VARYING_SLOT_LAYER:
         layer_value = outputs[i].values[0];
         break;
      case VARYING_SLOT_VIEWPORT:
         viewport_index_value = outputs[i].values[0];
         break;
      case VARYING_SLOT_EDGE:
         edgeflag_value = outputs[i].values[0];
         break;
      case VARYING_SLOT_CLIP_DIST0:
      case VARYING_SLOT_CLIP_DIST1:
         index = outputs[i].semantic - VARYING_SLOT_CLIP_DIST0;
         if (clipdist_mask & BITFIELD_RANGE(index * 4, 4)) {
            si_llvm_init_vs_export_args(ctx, outputs[i].values, V_008DFC_SQ_EXP_POS + 2 + index,
                                        &pos_args[2 + index]);
         }
         break;
      case VARYING_SLOT_CLIP_VERTEX:
         si_llvm_clipvertex_to_clipdist(ctx, pos_args + 2, outputs[i].values);
         break;
      }
   }

   /* We need to add the position output manually if it's missing. */
   if (!pos_args[0].out[0]) {
      pos_args[0].enabled_channels = 0xf; /* writemask */
      pos_args[0].valid_mask = 0;         /* EXEC mask */
      pos_args[0].done = 0;               /* last export? */
      pos_args[0].target = V_008DFC_SQ_EXP_POS;
      pos_args[0].compr = 0;              /* COMPR flag */
      pos_args[0].out[0] = ctx->ac.f32_0; /* X */
      pos_args[0].out[1] = ctx->ac.f32_0; /* Y */
      pos_args[0].out[2] = ctx->ac.f32_0; /* Z */
      pos_args[0].out[3] = ctx->ac.f32_1; /* W */
   }

   bool writes_psize = shader->selector->info.writes_psize && !shader->key.ge.opt.kill_pointsize;
   bool pos_writes_edgeflag = shader->selector->info.writes_edgeflag && !shader->key.ge.as_ngg;
   bool writes_vrs = ctx->screen->options.vrs2x2;

   /* Write the misc vector (point size, edgeflag, layer, viewport). */
   if (writes_psize || pos_writes_edgeflag || writes_vrs ||
       shader->selector->info.writes_viewport_index || shader->selector->info.writes_layer) {
      /* Channel 0 = psize, 1 = edgeflag/VRS, 2 = layer. */
      pos_args[1].enabled_channels = writes_psize |
                                     ((pos_writes_edgeflag | writes_vrs) << 1) |
                                     (shader->selector->info.writes_layer << 2);

      pos_args[1].valid_mask = 0;         /* EXEC mask */
      pos_args[1].done = 0;               /* last export? */
      pos_args[1].target = V_008DFC_SQ_EXP_POS + 1;
      pos_args[1].compr = 0;              /* COMPR flag */
      pos_args[1].out[0] = ctx->ac.f32_0; /* X */
      pos_args[1].out[1] = ctx->ac.f32_0; /* Y */
      pos_args[1].out[2] = ctx->ac.f32_0; /* Z */
      pos_args[1].out[3] = ctx->ac.f32_0; /* W */

      if (writes_psize)
         pos_args[1].out[0] = psize_value;

      if (pos_writes_edgeflag) {
         /* The output is a float, but the hw expects an integer
          * with the first bit containing the edge flag. */
         edgeflag_value = LLVMBuildFPToUI(ctx->ac.builder, edgeflag_value, ctx->ac.i32, "");
         edgeflag_value = ac_build_umin(&ctx->ac, edgeflag_value, ctx->ac.i32_1);

         /* The LLVM intrinsic expects a float. */
         pos_args[1].out[1] = ac_to_float(&ctx->ac, edgeflag_value);
      }

      if (writes_vrs) {
         /* Bits [2:3] = VRS rate X
          * Bits [4:5] = VRS rate Y
          *
          * The range is [-2, 1]. Values:
          *   1: 2x coarser shading rate in that direction.
          *   0: normal shading rate
          *  -1: 2x finer shading rate (sample shading, not directional)
          *  -2: 4x finer shading rate (sample shading, not directional)
          *
          * Sample shading can't go above 8 samples, so both numbers can't be -2
          * at the same time.
          */
         LLVMValueRef rates = LLVMConstInt(ctx->ac.i32, (1 << 2) | (1 << 4), 0);

         /* If Pos.W != 1 (typical for non-GUI elements), use 2x2 coarse shading. */
         rates = LLVMBuildSelect(ctx->ac.builder,
                                 LLVMBuildFCmp(ctx->ac.builder, LLVMRealUNE,
                                               pos_args[0].out[3], ctx->ac.f32_1, ""),
                                 rates, ctx->ac.i32_0, "");

         /* OR the VRS bits into the edgeflag channel. */
         LLVMValueRef v = ac_to_integer(&ctx->ac, pos_args[1].out[1]);
         v = LLVMBuildOr(ctx->ac.builder, v, rates, "");
         pos_args[1].out[1] = ac_to_float(&ctx->ac, v);
      }

      if (ctx->screen->info.chip_class >= GFX9) {
         /* GFX9 has the layer in out.z[10:0] and the viewport
          * index in out.z[19:16].
          */
         if (shader->selector->info.writes_layer)
            pos_args[1].out[2] = layer_value;

         if (shader->selector->info.writes_viewport_index) {
            LLVMValueRef v = viewport_index_value;

            /* Merge the viewport index into bits [19:16] of channel Z. */
            v = ac_to_integer(&ctx->ac, v);
            v = LLVMBuildShl(ctx->ac.builder, v, LLVMConstInt(ctx->ac.i32, 16, 0), "");
            v = LLVMBuildOr(ctx->ac.builder, v, ac_to_integer(&ctx->ac, pos_args[1].out[2]), "");
            pos_args[1].out[2] = ac_to_float(&ctx->ac, v);
            pos_args[1].enabled_channels |= 1 << 2;
         }
      } else {
         /* Pre-GFX9: layer in Z, viewport index in W. */
         if (shader->selector->info.writes_layer)
            pos_args[1].out[2] = layer_value;

         if (shader->selector->info.writes_viewport_index) {
            pos_args[1].out[3] = viewport_index_value;
            pos_args[1].enabled_channels |= 1 << 3;
         }
      }
   }

   /* Count the used position exports (out[0] set marks a used slot). */
   for (i = 0; i < 4; i++)
      if (pos_args[i].out[0])
         shader->info.nr_pos_exports++;

   /* GFX10 (Navi1x) skip POS0 exports if EXEC=0 and DONE=0, causing a hang.
    * Setting valid_mask=1 prevents it and has no other effect.
    */
   if (ctx->screen->info.chip_class == GFX10)
      pos_args[0].valid_mask = 1;

   pos_idx = 0;
   for (i = 0; i < 4; i++) {
      if (!pos_args[i].out[0])
         continue;

      /* Specify the target we are exporting */
      pos_args[i].target = V_008DFC_SQ_EXP_POS + pos_idx++;

      if (pos_idx == shader->info.nr_pos_exports) {
         /* Specify that this is the last export */
         pos_args[i].done = 1;

         /* If a shader has no param exports, rasterization can start before
          * the shader finishes and thus memory stores might not finish before
          * the pixel shader starts.
          *
          * VLOAD is for atomics with return.
          */
         if (ctx->screen->info.chip_class >= GFX10 &&
             !shader->info.nr_param_exports &&
             shader->selector->info.base.writes_memory)
            ac_build_waitcnt(&ctx->ac, AC_WAIT_VLOAD | AC_WAIT_VSTORE);
      }

      ac_build_export(&ctx->ac, &pos_args[i]);
   }

   /* Build parameter exports. */
   for (unsigned i = 0; i < shader->info.nr_param_exports; i++)
      ac_build_export(&ctx->ac, &param_exports[i]);
}
755
si_llvm_emit_vs_epilogue(struct ac_shader_abi * abi)756 void si_llvm_emit_vs_epilogue(struct ac_shader_abi *abi)
757 {
758 struct si_shader_context *ctx = si_shader_context_from_abi(abi);
759 struct si_shader_info *info = &ctx->shader->selector->info;
760 struct si_shader_output_values *outputs = NULL;
761 LLVMValueRef *addrs = abi->outputs;
762 int i, j;
763
764 assert(!ctx->shader->is_gs_copy_shader);
765 assert(info->num_outputs <= AC_LLVM_MAX_OUTPUTS);
766
767 outputs = MALLOC((info->num_outputs + 1) * sizeof(outputs[0]));
768
769 for (i = 0; i < info->num_outputs; i++) {
770 outputs[i].semantic = info->output_semantic[i];
771
772 for (j = 0; j < 4; j++) {
773 outputs[i].values[j] = LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + j], "");
774 outputs[i].vertex_streams = info->output_streams[i];
775 }
776 }
777
778 if (!ctx->screen->use_ngg_streamout && ctx->shader->selector->so.num_outputs)
779 si_llvm_emit_streamout(ctx, outputs, i, 0);
780
781 /* Export PrimitiveID. */
782 if (ctx->shader->key.ge.mono.u.vs_export_prim_id) {
783 outputs[i].semantic = VARYING_SLOT_PRIMITIVE_ID;
784 outputs[i].vertex_streams = 0;
785 outputs[i].values[0] = ac_to_float(&ctx->ac, si_get_primitive_id(ctx, 0));
786 for (j = 1; j < 4; j++)
787 outputs[i].values[j] = LLVMConstReal(ctx->ac.f32, 0);
788 i++;
789 }
790
791 si_llvm_build_vs_exports(ctx, outputs, i);
792 FREE(outputs);
793 }
794
795 /**
796 * Build the vertex shader prolog function.
797 *
798 * The inputs are the same as VS (a lot of SGPRs and 4 VGPR system values).
799 * All inputs are returned unmodified. The vertex load indices are
800 * stored after them, which will be used by the API VS for fetching inputs.
801 *
802 * For example, the expected outputs for instance_divisors[] = {0, 1, 2} are:
803 * input_v0,
804 * input_v1,
805 * input_v2,
806 * input_v3,
807 * (VertexID + BaseVertex),
808 * (InstanceID + StartInstance),
809 * (InstanceID / 2 + StartInstance)
810 */
void si_llvm_build_vs_prolog(struct si_shader_context *ctx, union si_shader_part_key *key)
{
   LLVMTypeRef *returns;
   LLVMValueRef ret, func;
   int num_returns, i;
   unsigned first_vs_vgpr = key->vs_prolog.num_merged_next_stage_vgprs;
   /* The 4 VS system-value VGPRs come after any VGPRs that belong to the
    * merged next stage (0 when the VS runs standalone). */
   unsigned num_input_vgprs =
      key->vs_prolog.num_merged_next_stage_vgprs + 4;
   struct ac_arg input_sgpr_param[key->vs_prolog.num_input_sgprs];
   struct ac_arg input_vgpr_param[10];
   LLVMValueRef input_vgprs[10];
   unsigned num_all_input_regs = key->vs_prolog.num_input_sgprs + num_input_vgprs;
   /* Merged-shader configurations reserve the first 8 SGPRs for system
    * values, so user SGPRs start at 8 there; otherwise at 0. */
   unsigned user_sgpr_base = key->vs_prolog.num_merged_next_stage_vgprs ? 8 : 0;

   memset(&ctx->args, 0, sizeof(ctx->args));

   /* 4 preloaded VGPRs + vertex load indices as prolog outputs */
   returns = alloca((num_all_input_regs + key->vs_prolog.num_inputs) * sizeof(LLVMTypeRef));
   num_returns = 0;

   /* Declare input and output SGPRs. Inputs are passed through unmodified,
    * so each input SGPR also gets a matching i32 return slot. */
   for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) {
      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &input_sgpr_param[i]);
      returns[num_returns++] = ctx->ac.i32;
   }

   /* Preloaded VGPRs (outputs must be floats) */
   for (i = 0; i < num_input_vgprs; i++) {
      ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &input_vgpr_param[i]);
      returns[num_returns++] = ctx->ac.f32;
   }

   /* Vertex load indices. */
   for (i = 0; i < key->vs_prolog.num_inputs; i++)
      returns[num_returns++] = ctx->ac.f32;

   /* Create the function. */
   si_llvm_create_func(ctx, "vs_prolog", returns, num_returns, 0);
   func = ctx->main_fn;

   for (i = 0; i < num_input_vgprs; i++) {
      input_vgprs[i] = ac_get_arg(&ctx->ac, input_vgpr_param[i]);
   }

   if (key->vs_prolog.num_merged_next_stage_vgprs) {
      /* A multi-part merged shader enters the prolog with a partial exec
       * mask; enable all lanes here. Monolithic shaders do it elsewhere. */
      if (!key->vs_prolog.is_monolithic)
         ac_init_exec_full_mask(&ctx->ac);

      if (key->vs_prolog.as_ls && ctx->screen->info.has_ls_vgpr_init_bug) {
         /* If there are no HS threads, SPI loads the LS VGPRs
          * starting at VGPR 0. Shift them back to where they
          * belong.
          */
         /* NOTE(review): bits [15:8] of input SGPR 3 presumably hold the
          * HS thread count (merged wave info) — confirm against the ABI. */
         LLVMValueRef has_hs_threads =
            LLVMBuildICmp(ctx->ac.builder, LLVMIntNE,
                          si_unpack_param(ctx, input_sgpr_param[3], 8, 8), ctx->ac.i32_0, "");

         /* Shift VGPRs 0..3 up by 2 (into slots 2..5), iterating downward
          * so a slot is read before it is overwritten. */
         for (i = 4; i > 0; --i) {
            input_vgprs[i + 1] = LLVMBuildSelect(ctx->ac.builder, has_hs_threads,
                                                 input_vgprs[i + 1], input_vgprs[i - 1], "");
         }
      }
   }

   /* The culling code stored the LDS addresses of the VGPRs into those VGPRs. Load them. */
   if (key->vs_prolog.load_vgprs_after_culling) {
      for (i = 5; i <= 8; i++) {
         bool is_tes_rel_patch_id = i == 7;
         /* The TES rel-patch ID is stored as a byte in LDS; everything
          * else is a full dword. */
         input_vgprs[i] = LLVMBuildIntToPtr(ctx->ac.builder, input_vgprs[i],
                                            LLVMPointerType(is_tes_rel_patch_id ? ctx->ac.i8 : ctx->ac.i32,
                                                            AC_ADDR_SPACE_LDS), "");
         input_vgprs[i] = LLVMBuildLoad(ctx->ac.builder, input_vgprs[i], "");
         if (is_tes_rel_patch_id)
            input_vgprs[i] = LLVMBuildZExt(ctx->ac.builder, input_vgprs[i], ctx->ac.i32, "");
      }
   }

   /* Locate VertexID and InstanceID among the VS system-value VGPRs; the
    * layout differs between GFX10+ and older chips (and LS vs. VS). */
   unsigned vertex_id_vgpr = first_vs_vgpr;
   unsigned instance_id_vgpr = ctx->screen->info.chip_class >= GFX10
                                  ? first_vs_vgpr + 3
                                  : first_vs_vgpr + (key->vs_prolog.as_ls ? 2 : 1);

   ctx->abi.vertex_id = input_vgprs[vertex_id_vgpr];
   ctx->abi.instance_id = input_vgprs[instance_id_vgpr];

   /* Copy inputs to outputs. This should be no-op, as the registers match,
    * but it will prevent the compiler from overwriting them unintentionally.
    */
   ret = ctx->return_value;
   for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) {
      LLVMValueRef p = LLVMGetParam(func, i);
      ret = LLVMBuildInsertValue(ctx->ac.builder, ret, p, i, "");
   }
   for (i = 0; i < num_input_vgprs; i++) {
      LLVMValueRef p = input_vgprs[i];

      /* Use the (possibly shifted/reloaded) system values for these slots. */
      if (i == vertex_id_vgpr)
         p = ctx->abi.vertex_id;
      else if (i == instance_id_vgpr)
         p = ctx->abi.instance_id;

      p = ac_to_float(&ctx->ac, p);
      ret = LLVMBuildInsertValue(ctx->ac.builder, ret, p, key->vs_prolog.num_input_sgprs + i, "");
   }

   /* Compute vertex load indices from instance divisors. */
   LLVMValueRef instance_divisor_constbuf = NULL;

   if (key->vs_prolog.states.instance_divisor_is_fetched) {
      /* Non-trivial divisors are fetched from an internal constant buffer. */
      LLVMValueRef list = si_prolog_get_internal_bindings(ctx);
      LLVMValueRef buf_index = LLVMConstInt(ctx->ac.i32, SI_VS_CONST_INSTANCE_DIVISORS, 0);
      instance_divisor_constbuf = ac_build_load_to_sgpr(&ctx->ac, list, buf_index);
   }

   for (i = 0; i < key->vs_prolog.num_inputs; i++) {
      bool divisor_is_one = key->vs_prolog.states.instance_divisor_is_one & (1u << i);
      bool divisor_is_fetched = key->vs_prolog.states.instance_divisor_is_fetched & (1u << i);
      LLVMValueRef index = NULL;

      if (divisor_is_one) {
         index = ctx->abi.instance_id;
      } else if (divisor_is_fetched) {
         /* Each input has 4 dwords of precomputed division factors
          * (multiplier, pre-shift, post-shift, increment). */
         LLVMValueRef udiv_factors[4];

         for (unsigned j = 0; j < 4; j++) {
            udiv_factors[j] = si_buffer_load_const(ctx, instance_divisor_constbuf,
                                                   LLVMConstInt(ctx->ac.i32, i * 16 + j * 4, 0));
            udiv_factors[j] = ac_to_integer(&ctx->ac, udiv_factors[j]);
         }
         /* The faster NUW version doesn't work when InstanceID == UINT_MAX.
          * Such InstanceID might not be achievable in a reasonable time though.
          */
         index = ac_build_fast_udiv_nuw(&ctx->ac, ctx->abi.instance_id, udiv_factors[0],
                                        udiv_factors[1], udiv_factors[2], udiv_factors[3]);
      }

      if (divisor_is_one || divisor_is_fetched) {
         /* Add StartInstance. */
         index =
            LLVMBuildAdd(ctx->ac.builder, index,
                         LLVMGetParam(ctx->main_fn, user_sgpr_base + SI_SGPR_START_INSTANCE), "");
      } else {
         /* VertexID + BaseVertex */
         index = LLVMBuildAdd(ctx->ac.builder, ctx->abi.vertex_id,
                              LLVMGetParam(func, user_sgpr_base + SI_SGPR_BASE_VERTEX), "");
      }

      /* Vertex load indices are returned in the f32 slots after all inputs. */
      index = ac_to_float(&ctx->ac, index);
      ret = LLVMBuildInsertValue(ctx->ac.builder, ret, index, ctx->args.arg_count + i, "");
   }

   si_llvm_build_ret(ctx, ret);
}
964
get_base_vertex(struct ac_shader_abi * abi,bool non_indexed_is_zero)965 static LLVMValueRef get_base_vertex(struct ac_shader_abi *abi, bool non_indexed_is_zero)
966 {
967 struct si_shader_context *ctx = si_shader_context_from_abi(abi);
968
969 /* This doesn't happen with GL: */
970 if (!non_indexed_is_zero)
971 return ac_get_arg(&ctx->ac, ctx->args.base_vertex);
972
973 /* For non-indexed draws, the base vertex set by the driver
974 * (for direct draws) or the CP (for indirect draws) is the
975 * first vertex ID, but GLSL expects 0 to be returned.
976 */
977 LLVMValueRef indexed = si_unpack_param(ctx, ctx->vs_state_bits, 1, 1);
978 indexed = LLVMBuildTrunc(ctx->ac.builder, indexed, ctx->ac.i1, "");
979
980 return LLVMBuildSelect(ctx->ac.builder, indexed, ac_get_arg(&ctx->ac, ctx->args.base_vertex),
981 ctx->ac.i32_0, "");
982 }
983
/* Install the ABI callbacks used while translating a vertex shader. */
void si_llvm_init_vs_callbacks(struct si_shader_context *ctx, bool ngg_cull_shader)
{
   struct si_shader *shader = ctx->shader;

   ctx->abi.load_base_vertex = get_base_vertex;
   ctx->abi.load_inputs = si_load_vs_input;

   /* Pick the epilogue by pipeline role; the branch order encodes priority
    * (LS before TCS, ES before GS, NGG culling part, NGG, plain hw VS).
    */
   if (shader->key.ge.as_ls) {
      ctx->abi.emit_outputs = si_llvm_emit_ls_epilogue;
   } else if (shader->key.ge.as_es) {
      ctx->abi.emit_outputs = si_llvm_emit_es_epilogue;
   } else if (ngg_cull_shader) {
      ctx->abi.emit_outputs = gfx10_emit_ngg_culling_epilogue;
   } else if (shader->key.ge.as_ngg) {
      ctx->abi.emit_outputs = gfx10_emit_ngg_epilogue;
   } else {
      ctx->abi.emit_outputs = si_llvm_emit_vs_epilogue;
   }
}
1002