1 /*
2  * Copyright 2020 Advanced Micro Devices, Inc.
3  * All Rights Reserved.
4  *
5  * Permission is hereby granted, free of charge, to any person obtaining a
6  * copy of this software and associated documentation files (the "Software"),
7  * to deal in the Software without restriction, including without limitation
8  * on the rights to use, copy, modify, merge, publish, distribute, sub
9  * license, and/or sell copies of the Software, and to permit persons to whom
10  * the Software is furnished to do so, subject to the following conditions:
11  *
12  * The above copyright notice and this permission notice (including the next
13  * paragraph) shall be included in all copies or substantial portions of the
14  * Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
19  * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
20  * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
21  * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
22  * USE OR OTHER DEALINGS IN THE SOFTWARE.
23  */
24 
25 #include "si_pipe.h"
26 #include "si_shader_internal.h"
27 #include "sid.h"
28 #include "util/u_memory.h"
29 #include "ac_exp_param.h"
30 
/* Sign-extend one 16-bit half of a packed 32-bit value.
 *
 * index 0 selects the low 16 bits, index 1 the high 16 bits; the result is
 * returned as a sign-extended i32.
 */
static LLVMValueRef unpack_sint16(struct si_shader_context *ctx, LLVMValueRef i32, unsigned index)
{
   LLVMBuilderRef builder = ctx->ac.builder;

   assert(index <= 1);

   if (index == 0) {
      /* Low half: truncate to i16, then sign-extend back up to i32. */
      LLVMValueRef lo = LLVMBuildTrunc(builder, i32, ctx->ac.i16, "");
      return LLVMBuildSExt(builder, lo, ctx->ac.i32, "");
   }

   /* High half: an arithmetic shift right by 16 sign-extends in place. */
   return LLVMBuildAShr(builder, i32, LLVMConstInt(ctx->ac.i32, 16, 0), "");
}
41 
/* Load one vertex attribute (4 components written to out[]) for a VS.
 *
 * Two paths:
 *  - vs_blit shaders: inputs come from user SGPRs (positions packed as two
 *    i16 pairs, plus color or texcoord data), selected per-vertex based on
 *    the vertex id of the blit triangle.
 *  - normal shaders: load from the vertex buffer descriptor, applying any
 *    format fixups requested via the shader key (vs_fix_fetch /
 *    vs_fetch_opencode).
 */
static void load_input_vs(struct si_shader_context *ctx, unsigned input_index, LLVMValueRef out[4])
{
   const struct si_shader_info *info = &ctx->shader->selector->info;
   unsigned vs_blit_property = info->base.vs.blit_sgprs_amd;

   if (vs_blit_property) {
      /* Blit path: the blit draws one large triangle covering the target
       * rectangle; vertices 0/1 use (x1,y1)-derived coords, vertex 2 uses
       * the opposite corner. */
      LLVMValueRef vertex_id = ctx->abi.vertex_id;
      LLVMValueRef sel_x1 =
         LLVMBuildICmp(ctx->ac.builder, LLVMIntULE, vertex_id, ctx->ac.i32_1, "");
      /* Use LLVMIntNE, because we have 3 vertices and only
       * the middle one should use y2.
       */
      LLVMValueRef sel_y1 = LLVMBuildICmp(ctx->ac.builder, LLVMIntNE, vertex_id, ctx->ac.i32_1, "");

      unsigned param_vs_blit_inputs = ctx->vs_blit_inputs.arg_index;
      if (input_index == 0) {
         /* Position: two SGPRs hold (x1,y1) and (x2,y2) as packed i16. */
         LLVMValueRef x1y1 = LLVMGetParam(ctx->main_fn, param_vs_blit_inputs);
         LLVMValueRef x2y2 = LLVMGetParam(ctx->main_fn, param_vs_blit_inputs + 1);

         LLVMValueRef x1 = unpack_sint16(ctx, x1y1, 0);
         LLVMValueRef y1 = unpack_sint16(ctx, x1y1, 1);
         LLVMValueRef x2 = unpack_sint16(ctx, x2y2, 0);
         LLVMValueRef y2 = unpack_sint16(ctx, x2y2, 1);

         LLVMValueRef x = LLVMBuildSelect(ctx->ac.builder, sel_x1, x1, x2, "");
         LLVMValueRef y = LLVMBuildSelect(ctx->ac.builder, sel_y1, y1, y2, "");

         out[0] = LLVMBuildSIToFP(ctx->ac.builder, x, ctx->ac.f32, "");
         out[1] = LLVMBuildSIToFP(ctx->ac.builder, y, ctx->ac.f32, "");
         /* Depth comes directly from a float SGPR; W is always 1. */
         out[2] = LLVMGetParam(ctx->main_fn, param_vs_blit_inputs + 2);
         out[3] = ctx->ac.f32_1;
         return;
      }

      /* Color or texture coordinates: */
      assert(input_index == 1);

      if (vs_blit_property == SI_VS_BLIT_SGPRS_POS_COLOR) {
         /* A flat color: the same 4 SGPR floats for every vertex. */
         for (int i = 0; i < 4; i++) {
            out[i] = LLVMGetParam(ctx->main_fn, param_vs_blit_inputs + 3 + i);
         }
      } else {
         assert(vs_blit_property == SI_VS_BLIT_SGPRS_POS_TEXCOORD);
         /* Texcoords: select per-vertex between the two rectangle corners,
          * like the position above; Z/W come straight from SGPRs. */
         LLVMValueRef x1 = LLVMGetParam(ctx->main_fn, param_vs_blit_inputs + 3);
         LLVMValueRef y1 = LLVMGetParam(ctx->main_fn, param_vs_blit_inputs + 4);
         LLVMValueRef x2 = LLVMGetParam(ctx->main_fn, param_vs_blit_inputs + 5);
         LLVMValueRef y2 = LLVMGetParam(ctx->main_fn, param_vs_blit_inputs + 6);

         out[0] = LLVMBuildSelect(ctx->ac.builder, sel_x1, x1, x2, "");
         out[1] = LLVMBuildSelect(ctx->ac.builder, sel_y1, y1, y2, "");
         out[2] = LLVMGetParam(ctx->main_fn, param_vs_blit_inputs + 7);
         out[3] = LLVMGetParam(ctx->main_fn, param_vs_blit_inputs + 8);
      }
      return;
   }

   /* Set can_speculate=false to help keep all loads grouped together
    * for better latency hiding. If it was true, LLVM could move the loads forward
    * and accidentally double memory latency by doing:
    *
    *    buffer_load_dword_xyzw
    *    s_waitcnt vmcnt(0)
    *    buffer_load_dword_xyzw
    *    s_waitcnt vmcnt(0)
    *
    * ... which is what we must prevent at all cost.
    */
   const bool can_speculate = false;
   /* 16-bit inputs use i16/f16 channel types; everything else is 32-bit. */
   unsigned bit_size = info->input[input_index].fp16_lo_hi_valid & 0x1 ? 16 : 32;
   LLVMTypeRef int_type = bit_size == 16 ? ctx->ac.i16 : ctx->ac.i32;
   LLVMTypeRef float_type = bit_size == 16 ? ctx->ac.f16 : ctx->ac.f32;
   unsigned num_vbos_in_user_sgprs = ctx->shader->selector->num_vbos_in_user_sgprs;
   union si_vs_fix_fetch fix_fetch;
   LLVMValueRef vb_desc;
   LLVMValueRef vertex_index;
   LLVMValueRef tmp;

   /* Fetch the vertex buffer descriptor: either inlined in user SGPRs or
    * loaded from the descriptor array in memory. */
   if (input_index < num_vbos_in_user_sgprs) {
      vb_desc = ac_get_arg(&ctx->ac, ctx->vb_descriptors[input_index]);
   } else {
      unsigned index = input_index - num_vbos_in_user_sgprs;
      vb_desc = ac_build_load_to_sgpr(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args.vertex_buffers),
                                      LLVMConstInt(ctx->ac.i32, index, 0));
   }

   vertex_index = LLVMGetParam(ctx->main_fn, ctx->vertex_index0.arg_index + input_index);

   /* Use the open-coded implementation for all loads of doubles and
    * of dword-sized data that needs fixups. We need to insert conversion
    * code anyway, and the amd/common code does it for us.
    */
   bool opencode = ctx->shader->key.mono.vs_fetch_opencode & (1 << input_index);
   fix_fetch.bits = ctx->shader->key.mono.vs_fix_fetch[input_index].bits;
   if (opencode || (fix_fetch.u.log_size == 3 && fix_fetch.u.format == AC_FETCH_FORMAT_FLOAT) ||
       (fix_fetch.u.log_size == 2)) {
      tmp = ac_build_opencoded_load_format(&ctx->ac, fix_fetch.u.log_size,
                                           fix_fetch.u.num_channels_m1 + 1, fix_fetch.u.format,
                                           fix_fetch.u.reverse, !opencode, vb_desc, vertex_index,
                                           ctx->ac.i32_0, ctx->ac.i32_0, 0, can_speculate);
      for (unsigned i = 0; i < 4; ++i)
         out[i] =
            LLVMBuildExtractElement(ctx->ac.builder, tmp, LLVMConstInt(ctx->ac.i32, i, false), "");

      /* The open-coded path returns 32-bit values; narrow them for
       * 16-bit inputs. */
      if (bit_size == 16) {
         if (fix_fetch.u.format == AC_FETCH_FORMAT_UINT ||
             fix_fetch.u.format == AC_FETCH_FORMAT_SINT) {
            for (unsigned i = 0; i < 4; i++)
               out[i] = LLVMBuildTrunc(ctx->ac.builder, out[i], ctx->ac.i16, "");
         } else {
            for (unsigned i = 0; i < 4; i++) {
               out[i] = ac_to_float(&ctx->ac, out[i]);
               out[i] = LLVMBuildFPTrunc(ctx->ac.builder, out[i], ctx->ac.f16, "");
            }
         }
      }
      return;
   }

   /* If no channel is actually used, emit undefs instead of loading. */
   unsigned required_channels = util_last_bit(info->input[input_index].usage_mask);
   if (required_channels == 0) {
      for (unsigned i = 0; i < 4; ++i)
         out[i] = LLVMGetUndef(ctx->ac.f32);
      return;
   }

   /* Do multiple loads for special formats. */
   LLVMValueRef fetches[4];
   unsigned num_fetches;
   unsigned fetch_stride;
   unsigned channels_per_fetch;

   if (fix_fetch.u.log_size <= 1 && fix_fetch.u.num_channels_m1 == 2) {
      /* 3-channel 8- or 16-bit formats: fetch each channel separately. */
      num_fetches = MIN2(required_channels, 3);
      fetch_stride = 1 << fix_fetch.u.log_size;
      channels_per_fetch = 1;
   } else {
      /* Everything else: one vector fetch covers all needed channels. */
      num_fetches = 1;
      fetch_stride = 0;
      channels_per_fetch = required_channels;
   }

   for (unsigned i = 0; i < num_fetches; ++i) {
      LLVMValueRef voffset = LLVMConstInt(ctx->ac.i32, fetch_stride * i, 0);
      fetches[i] = ac_build_buffer_load_format(&ctx->ac, vb_desc, vertex_index, voffset,
                                               channels_per_fetch, 0, can_speculate,
                                               bit_size == 16, false);
   }

   /* Scalarize a single multi-channel fetch into per-channel values. */
   if (num_fetches == 1 && channels_per_fetch > 1) {
      LLVMValueRef fetch = fetches[0];
      for (unsigned i = 0; i < channels_per_fetch; ++i) {
         tmp = LLVMConstInt(ctx->ac.i32, i, false);
         fetches[i] = LLVMBuildExtractElement(ctx->ac.builder, fetch, tmp, "");
      }
      num_fetches = channels_per_fetch;
      channels_per_fetch = 1;
   }

   for (unsigned i = num_fetches; i < 4; ++i)
      fetches[i] = LLVMGetUndef(float_type);

   if (fix_fetch.u.log_size <= 1 && fix_fetch.u.num_channels_m1 == 2 && required_channels == 4) {
      /* 3-channel formats read as 4 channels: synthesize W = 1. */
      if (fix_fetch.u.format == AC_FETCH_FORMAT_UINT || fix_fetch.u.format == AC_FETCH_FORMAT_SINT)
         fetches[3] = LLVMConstInt(int_type, 1, 0);
      else
         fetches[3] = LLVMConstReal(float_type, 1);
   } else if (fix_fetch.u.log_size == 3 &&
              (fix_fetch.u.format == AC_FETCH_FORMAT_SNORM ||
               fix_fetch.u.format == AC_FETCH_FORMAT_SSCALED ||
               fix_fetch.u.format == AC_FETCH_FORMAT_SINT) &&
              required_channels == 4) {

      /* For 2_10_10_10, the hardware returns an unsigned value;
       * convert it to a signed one.
       */
      LLVMValueRef tmp = fetches[3];
      LLVMValueRef c30 = LLVMConstInt(int_type, 30, 0);

      /* First, recover the sign-extended signed integer value. */
      if (fix_fetch.u.format == AC_FETCH_FORMAT_SSCALED)
         tmp = LLVMBuildFPToUI(ctx->ac.builder, tmp, int_type, "");
      else
         tmp = ac_to_integer(&ctx->ac, tmp);

      /* For the integer-like cases, do a natural sign extension.
       *
       * For the SNORM case, the values are 0.0, 0.333, 0.666, 1.0
       * and happen to contain 0, 1, 2, 3 as the two LSBs of the
       * exponent.
       */
      tmp = LLVMBuildShl(
         ctx->ac.builder, tmp,
         fix_fetch.u.format == AC_FETCH_FORMAT_SNORM ? LLVMConstInt(int_type, 7, 0) : c30, "");
      tmp = LLVMBuildAShr(ctx->ac.builder, tmp, c30, "");

      /* Convert back to the right type. */
      if (fix_fetch.u.format == AC_FETCH_FORMAT_SNORM) {
         LLVMValueRef clamp;
         LLVMValueRef neg_one = LLVMConstReal(float_type, -1.0);
         tmp = LLVMBuildSIToFP(ctx->ac.builder, tmp, float_type, "");
         /* SNORM: clamp -1.333 (encoded -2) up to -1.0. */
         clamp = LLVMBuildFCmp(ctx->ac.builder, LLVMRealULT, tmp, neg_one, "");
         tmp = LLVMBuildSelect(ctx->ac.builder, clamp, neg_one, tmp, "");
      } else if (fix_fetch.u.format == AC_FETCH_FORMAT_SSCALED) {
         tmp = LLVMBuildSIToFP(ctx->ac.builder, tmp, float_type, "");
      }

      fetches[3] = tmp;
   }

   for (unsigned i = 0; i < 4; ++i)
      out[i] = ac_to_float(&ctx->ac, fetches[i]);
}
255 
si_load_vs_input(struct ac_shader_abi * abi,unsigned driver_location,unsigned component,unsigned num_components,unsigned vertex_index,LLVMTypeRef type)256 static LLVMValueRef si_load_vs_input(struct ac_shader_abi *abi, unsigned driver_location,
257                                      unsigned component, unsigned num_components,
258                                      unsigned vertex_index, LLVMTypeRef type)
259 {
260    struct si_shader_context *ctx = si_shader_context_from_abi(abi);
261    LLVMValueRef values[4];
262 
263    load_input_vs(ctx, driver_location, values);
264 
265    for (unsigned i = 0; i < 4; i++)
266       values[i] = LLVMBuildBitCast(ctx->ac.builder, values[i], type, "");
267 
268    return ac_build_varying_gather_values(&ctx->ac, values, num_components, component);
269 }
270 
/* Store one shader output to its streamout (transform feedback) buffer.
 *
 * The selected components of shader_out are packed into a single value
 * (i32 / v2i32 / v3i32 / v4i32) and written to the target buffer at the
 * precomputed per-thread write offset.
 */
void si_llvm_streamout_store_output(struct si_shader_context *ctx, LLVMValueRef const *so_buffers,
                                    LLVMValueRef const *so_write_offsets,
                                    struct pipe_stream_output *stream_out,
                                    struct si_shader_output_values *shader_out)
{
   unsigned buf_idx = stream_out->output_buffer;
   unsigned start = stream_out->start_component;
   unsigned num_comps = stream_out->num_components;
   LLVMValueRef out[4];

   /* Defensive: bail on an invalid component count in release builds. */
   assert(num_comps && num_comps <= 4);
   if (!num_comps || num_comps > 4)
      return;

   /* Load the output as int. */
   for (int j = 0; j < num_comps; j++) {
      assert(stream_out->stream == shader_out->vertex_stream[start + j]);

      out[j] = ac_to_integer(&ctx->ac, shader_out->values[start + j]);
   }

   /* Pack the output. */
   LLVMValueRef vdata = NULL;

   switch (num_comps) {
   case 1: /* as i32 */
      vdata = out[0];
      break;
   case 2: /* as v2i32 */
   case 3: /* as v3i32 */
      if (ac_has_vec3_support(ctx->screen->info.chip_class, false)) {
         vdata = ac_build_gather_values(&ctx->ac, out, num_comps);
         break;
      }
      /* as v4i32 (aligned to 4) */
      /* No v3i32 support: pad the 3rd (or 2nd) vector out with undef
       * and fall through to the v4i32 store. */
      out[3] = LLVMGetUndef(ctx->ac.i32);
      FALLTHROUGH;
   case 4: /* as v4i32 */
      vdata = ac_build_gather_values(&ctx->ac, out, util_next_power_of_two(num_comps));
      break;
   }

   /* GLC+SLC: bypass caches so other streamout writers/readers see the data. */
   ac_build_buffer_store_dword(&ctx->ac, so_buffers[buf_idx], vdata, num_comps,
                               so_write_offsets[buf_idx], ctx->ac.i32_0, stream_out->dst_offset * 4,
                               ac_glc | ac_slc);
}
317 
318 /**
319  * Write streamout data to buffers for vertex stream @p stream (different
320  * vertex streams can occur for GS copy shaders).
321  */
/**
 * Write streamout data to buffers for vertex stream @p stream (different
 * vertex streams can occur for GS copy shaders).
 */
void si_llvm_emit_streamout(struct si_shader_context *ctx, struct si_shader_output_values *outputs,
                            unsigned noutput, unsigned stream)
{
   struct si_shader_selector *sel = ctx->shader->selector;
   struct pipe_stream_output_info *so = &sel->so;
   LLVMBuilderRef builder = ctx->ac.builder;
   int i;

   /* Get bits [22:16], i.e. (so_param >> 16) & 127; */
   LLVMValueRef so_vtx_count = si_unpack_param(ctx, ctx->args.streamout_config, 16, 7);

   LLVMValueRef tid = ac_get_thread_id(&ctx->ac);

   /* can_emit = tid < so_vtx_count; */
   LLVMValueRef can_emit = LLVMBuildICmp(builder, LLVMIntULT, tid, so_vtx_count, "");

   /* Emit the streamout code conditionally. This actually avoids
    * out-of-bounds buffer access. The hw tells us via the SGPR
    * (so_vtx_count) which threads are allowed to emit streamout data. */
   ac_build_ifcc(&ctx->ac, can_emit, 6501);
   {
      /* The buffer offset is computed as follows:
       *   ByteOffset = streamout_offset[buffer_id]*4 +
       *                (streamout_write_index + thread_id)*stride[buffer_id] +
       *                attrib_offset
       */

      LLVMValueRef so_write_index = ac_get_arg(&ctx->ac, ctx->args.streamout_write_index);

      /* Compute (streamout_write_index + thread_id). */
      so_write_index = LLVMBuildAdd(builder, so_write_index, tid, "");

      /* Load the descriptor and compute the write offset for each
       * enabled buffer. */
      LLVMValueRef so_write_offset[4] = {};
      LLVMValueRef so_buffers[4];
      LLVMValueRef buf_ptr = ac_get_arg(&ctx->ac, ctx->internal_bindings);

      for (i = 0; i < 4; i++) {
         /* A zero stride means the buffer slot is unused. */
         if (!so->stride[i])
            continue;

         LLVMValueRef offset = LLVMConstInt(ctx->ac.i32, SI_VS_STREAMOUT_BUF0 + i, 0);

         so_buffers[i] = ac_build_load_to_sgpr(&ctx->ac, buf_ptr, offset);

         /* streamout_offset is in dwords; convert to bytes. */
         LLVMValueRef so_offset = ac_get_arg(&ctx->ac, ctx->args.streamout_offset[i]);
         so_offset = LLVMBuildMul(builder, so_offset, LLVMConstInt(ctx->ac.i32, 4, 0), "");

         /* offset = write_index * stride_in_bytes + buffer_offset */
         so_write_offset[i] = ac_build_imad(
            &ctx->ac, so_write_index, LLVMConstInt(ctx->ac.i32, so->stride[i] * 4, 0), so_offset);
      }

      /* Write streamout data. */
      for (i = 0; i < so->num_outputs; i++) {
         unsigned reg = so->output[i].register_index;

         if (reg >= noutput)
            continue;

         /* Only emit outputs belonging to the requested vertex stream. */
         if (stream != so->output[i].stream)
            continue;

         si_llvm_streamout_store_output(ctx, so_buffers, so_write_offset, &so->output[i],
                                        &outputs[reg]);
      }
   }
   ac_build_endif(&ctx->ac, 6501);
}
391 
/* Compute clip distances from gl_ClipVertex and the user clip planes, and
 * fill in the two clip-distance position export slots (pos_args[2] and
 * pos_args[3]).
 *
 * Each enabled clip distance is the dot product of the clip-vertex
 * position (out_elts[0..3]) with the corresponding plane loaded from the
 * clip-plane constant buffer.
 */
static void si_llvm_emit_clipvertex(struct si_shader_context *ctx, struct ac_export_args *pos,
                                    LLVMValueRef *out_elts)
{
   unsigned reg_index;
   unsigned chan;
   unsigned const_chan;
   LLVMValueRef base_elt;
   LLVMValueRef ptr = ac_get_arg(&ctx->ac, ctx->internal_bindings);
   LLVMValueRef constbuf_index = LLVMConstInt(ctx->ac.i32, SI_VS_CONST_CLIP_PLANES, 0);
   LLVMValueRef const_resource = ac_build_load_to_sgpr(&ctx->ac, ptr, constbuf_index);
   /* Distances the shader key asked to kill are dropped from the mask. */
   unsigned clipdist_mask = ctx->shader->selector->clipdist_mask &
                            ~ctx->shader->key.opt.kill_clip_distances;

   /* Two export registers hold up to 8 clip distances, 4 per register. */
   for (reg_index = 0; reg_index < 2; reg_index++) {
      struct ac_export_args *args = &pos[2 + reg_index];

      if (!(clipdist_mask & BITFIELD_RANGE(reg_index * 4, 4)))
         continue;

      args->out[0] = args->out[1] = args->out[2] = args->out[3] = LLVMGetUndef(ctx->ac.f32);

      /* Compute dot products of position and user clip plane vectors */
      for (chan = 0; chan < 4; chan++) {
         if (!(clipdist_mask & BITFIELD_BIT(reg_index * 4 + chan)))
            continue;

         /* Accumulate plane[c] * clipvertex[c] with fmad; the first term
          * starts from 0. */
         for (const_chan = 0; const_chan < 4; const_chan++) {
            LLVMValueRef addr =
               LLVMConstInt(ctx->ac.i32, ((reg_index * 4 + chan) * 4 + const_chan) * 4, 0);
            base_elt = si_buffer_load_const(ctx, const_resource, addr);
            args->out[chan] =
               ac_build_fmad(&ctx->ac, base_elt, out_elts[const_chan],
                             const_chan == 0 ? ctx->ac.f32_0 : args->out[chan]);
         }
      }

      args->enabled_channels = 0xf;
      args->valid_mask = 0;
      args->done = 0;
      args->target = V_008DFC_SQ_EXP_POS + 2 + reg_index;
      args->compr = 0;
   }
}
435 
436 /* Initialize arguments for the shader export intrinsic */
si_llvm_init_vs_export_args(struct si_shader_context * ctx,const LLVMValueRef * values,unsigned target,struct ac_export_args * args)437 static void si_llvm_init_vs_export_args(struct si_shader_context *ctx, const LLVMValueRef *values,
438                                         unsigned target, struct ac_export_args *args)
439 {
440    args->enabled_channels = 0xf; /* writemask - default is 0xf */
441    args->valid_mask = 0;         /* Specify whether the EXEC mask represents the valid mask */
442    args->done = 0;               /* Specify whether this is the last export */
443    args->target = target;        /* Specify the target we are exporting */
444    args->compr = false;
445 
446    memcpy(&args->out[0], values, sizeof(values[0]) * 4);
447 }
448 
/* Build the list of parameter (varying) exports for a VS/NGG shader.
 *
 * Fills exports[] with one ac_export_args per exported parameter and
 * records each output's export slot in shader->info.vs_output_param_offset
 * (AC_EXP_PARAM_DEFAULT_VAL_0000 for outputs that are not exported).
 * Also sets shader->info.nr_param_exports.
 */
static void si_prepare_param_exports(struct si_shader_context *ctx,
                                     const struct si_shader_output_values *outputs, unsigned noutput,
                                     struct ac_export_args exports[32])
{
   struct si_shader *shader = ctx->shader;
   unsigned param_count = 0;

   memset(shader->info.vs_output_param_offset, AC_EXP_PARAM_DEFAULT_VAL_0000,
          sizeof(shader->info.vs_output_param_offset));

   for (unsigned i = 0; i < noutput; i++) {
      unsigned semantic = outputs[i].semantic;

      /* Skip outputs where no component belongs to vertex stream 0 --
       * only stream 0 is rasterized. */
      if (outputs[i].vertex_stream[0] != 0 && outputs[i].vertex_stream[1] != 0 &&
          outputs[i].vertex_stream[2] != 0 && outputs[i].vertex_stream[3] != 0)
         continue;

      /* Only semantics that the pixel shader can consume as parameters
       * are exported; everything else (POS, PSIZ, EDGE, ...) is handled
       * by the position exports. */
      switch (semantic) {
      case VARYING_SLOT_LAYER:
      case VARYING_SLOT_VIEWPORT:
      case VARYING_SLOT_CLIP_DIST0:
      case VARYING_SLOT_CLIP_DIST1:
      case VARYING_SLOT_COL0:
      case VARYING_SLOT_COL1:
      case VARYING_SLOT_BFC0:
      case VARYING_SLOT_BFC1:
      case VARYING_SLOT_PRIMITIVE_ID:
      case VARYING_SLOT_FOGC:
         break;
      default:
         if ((semantic >= VARYING_SLOT_TEX0 && semantic <= VARYING_SLOT_TEX7) ||
             semantic >= VARYING_SLOT_VAR0)
            break;
         else
            continue;
      }

      /* Honor the shader key's dead-output elimination mask. */
      if ((semantic <= VARYING_SLOT_VAR31 || semantic >= VARYING_SLOT_VAR0_16BIT) &&
          shader->key.opt.kill_outputs &
             (1ull << si_shader_io_get_unique_index(semantic, true)))
         continue;

      si_llvm_init_vs_export_args(ctx, outputs[i].values, V_008DFC_SQ_EXP_PARAM + param_count,
                                  &exports[param_count]);

      assert(i < ARRAY_SIZE(shader->info.vs_output_param_offset));
      shader->info.vs_output_param_offset[i] = param_count++;
   }

   shader->info.nr_param_exports = param_count;
}
500 
501 /**
502  * Vertex color clamping.
503  *
504  * This uses a state constant loaded in a user data SGPR and
505  * an IF statement is added that clamps all colors if the constant
506  * is true.
507  */
si_vertex_color_clamping(struct si_shader_context * ctx,struct si_shader_output_values * outputs,unsigned noutput)508 static void si_vertex_color_clamping(struct si_shader_context *ctx,
509                                      struct si_shader_output_values *outputs, unsigned noutput)
510 {
511    LLVMValueRef addr[SI_MAX_VS_OUTPUTS][4];
512    bool has_colors = false;
513 
514    /* Store original colors to alloca variables. */
515    for (unsigned i = 0; i < noutput; i++) {
516       if (outputs[i].semantic != VARYING_SLOT_COL0 &&
517           outputs[i].semantic != VARYING_SLOT_COL1 &&
518           outputs[i].semantic != VARYING_SLOT_BFC0 &&
519           outputs[i].semantic != VARYING_SLOT_BFC1)
520          continue;
521 
522       for (unsigned j = 0; j < 4; j++)
523          addr[i][j] = ac_build_alloca_init(&ctx->ac, outputs[i].values[j], "");
524 
525       has_colors = true;
526    }
527 
528    if (!has_colors)
529       return;
530 
531    /* The state is in the first bit of the user SGPR. */
532    LLVMValueRef cond = ac_get_arg(&ctx->ac, ctx->vs_state_bits);
533    cond = LLVMBuildTrunc(ctx->ac.builder, cond, ctx->ac.i1, "");
534 
535    ac_build_ifcc(&ctx->ac, cond, 6502);
536 
537    /* Store clamped colors to alloca variables within the conditional block. */
538    for (unsigned i = 0; i < noutput; i++) {
539       if (outputs[i].semantic != VARYING_SLOT_COL0 &&
540           outputs[i].semantic != VARYING_SLOT_COL1 &&
541           outputs[i].semantic != VARYING_SLOT_BFC0 &&
542           outputs[i].semantic != VARYING_SLOT_BFC1)
543          continue;
544 
545       for (unsigned j = 0; j < 4; j++) {
546          LLVMBuildStore(ctx->ac.builder, ac_build_clamp(&ctx->ac, outputs[i].values[j]),
547                         addr[i][j]);
548       }
549    }
550    ac_build_endif(&ctx->ac, 6502);
551 
552    /* Load clamped colors */
553    for (unsigned i = 0; i < noutput; i++) {
554       if (outputs[i].semantic != VARYING_SLOT_COL0 &&
555           outputs[i].semantic != VARYING_SLOT_COL1 &&
556           outputs[i].semantic != VARYING_SLOT_BFC0 &&
557           outputs[i].semantic != VARYING_SLOT_BFC1)
558          continue;
559 
560       for (unsigned j = 0; j < 4; j++) {
561          outputs[i].values[j] = LLVMBuildLoad(ctx->ac.builder, addr[i][j], "");
562       }
563    }
564 }
565 
566 /* Generate export instructions for hardware VS shader stage or NGG GS stage
567  * (position and parameter data only).
568  */
si_llvm_build_vs_exports(struct si_shader_context * ctx,struct si_shader_output_values * outputs,unsigned noutput)569 void si_llvm_build_vs_exports(struct si_shader_context *ctx,
570                               struct si_shader_output_values *outputs, unsigned noutput)
571 {
572    struct si_shader *shader = ctx->shader;
573    struct ac_export_args pos_args[4] = {};
574    LLVMValueRef psize_value = NULL, edgeflag_value = NULL, layer_value = NULL,
575                 viewport_index_value = NULL;
576    unsigned pos_idx, index;
577    unsigned clipdist_mask = (shader->selector->clipdist_mask &
578                              ~shader->key.opt.kill_clip_distances) |
579                             shader->selector->culldist_mask;
580    int i;
581 
582    si_vertex_color_clamping(ctx, outputs, noutput);
583 
584    struct ac_export_args param_exports[32];
585    si_prepare_param_exports(ctx, outputs, noutput, param_exports);
586 
587    /* Build position exports. */
588    for (i = 0; i < noutput; i++) {
589       switch (outputs[i].semantic) {
590       case VARYING_SLOT_POS:
591          si_llvm_init_vs_export_args(ctx, outputs[i].values, V_008DFC_SQ_EXP_POS, &pos_args[0]);
592          break;
593       case VARYING_SLOT_PSIZ:
594          psize_value = outputs[i].values[0];
595          break;
596       case VARYING_SLOT_LAYER:
597          layer_value = outputs[i].values[0];
598          break;
599       case VARYING_SLOT_VIEWPORT:
600          viewport_index_value = outputs[i].values[0];
601          break;
602       case VARYING_SLOT_EDGE:
603          edgeflag_value = outputs[i].values[0];
604          break;
605       case VARYING_SLOT_CLIP_DIST0:
606       case VARYING_SLOT_CLIP_DIST1:
607          index = outputs[i].semantic - VARYING_SLOT_CLIP_DIST0;
608          if (clipdist_mask & BITFIELD_RANGE(index * 4, 4)) {
609             si_llvm_init_vs_export_args(ctx, outputs[i].values, V_008DFC_SQ_EXP_POS + 2 + index,
610                                         &pos_args[2 + index]);
611          }
612          break;
613       case VARYING_SLOT_CLIP_VERTEX:
614          si_llvm_emit_clipvertex(ctx, pos_args, outputs[i].values);
615          break;
616       }
617    }
618 
619    /* We need to add the position output manually if it's missing. */
620    if (!pos_args[0].out[0]) {
621       pos_args[0].enabled_channels = 0xf; /* writemask */
622       pos_args[0].valid_mask = 0;         /* EXEC mask */
623       pos_args[0].done = 0;               /* last export? */
624       pos_args[0].target = V_008DFC_SQ_EXP_POS;
625       pos_args[0].compr = 0;              /* COMPR flag */
626       pos_args[0].out[0] = ctx->ac.f32_0; /* X */
627       pos_args[0].out[1] = ctx->ac.f32_0; /* Y */
628       pos_args[0].out[2] = ctx->ac.f32_0; /* Z */
629       pos_args[0].out[3] = ctx->ac.f32_1; /* W */
630    }
631 
632    bool writes_psize = shader->selector->info.writes_psize && !shader->key.opt.kill_pointsize;
633    bool pos_writes_edgeflag = shader->selector->info.writes_edgeflag && !shader->key.as_ngg;
634    bool writes_vrs = ctx->screen->options.vrs2x2;
635 
636    /* Write the misc vector (point size, edgeflag, layer, viewport). */
637    if (writes_psize || pos_writes_edgeflag || writes_vrs ||
638        shader->selector->info.writes_viewport_index || shader->selector->info.writes_layer) {
639       pos_args[1].enabled_channels = writes_psize |
640                                      ((pos_writes_edgeflag | writes_vrs) << 1) |
641                                      (shader->selector->info.writes_layer << 2);
642 
643       pos_args[1].valid_mask = 0; /* EXEC mask */
644       pos_args[1].done = 0;       /* last export? */
645       pos_args[1].target = V_008DFC_SQ_EXP_POS + 1;
646       pos_args[1].compr = 0;              /* COMPR flag */
647       pos_args[1].out[0] = ctx->ac.f32_0; /* X */
648       pos_args[1].out[1] = ctx->ac.f32_0; /* Y */
649       pos_args[1].out[2] = ctx->ac.f32_0; /* Z */
650       pos_args[1].out[3] = ctx->ac.f32_0; /* W */
651 
652       if (writes_psize)
653          pos_args[1].out[0] = psize_value;
654 
655       if (pos_writes_edgeflag) {
656          /* The output is a float, but the hw expects an integer
657           * with the first bit containing the edge flag. */
658          edgeflag_value = LLVMBuildFPToUI(ctx->ac.builder, edgeflag_value, ctx->ac.i32, "");
659          edgeflag_value = ac_build_umin(&ctx->ac, edgeflag_value, ctx->ac.i32_1);
660 
661          /* The LLVM intrinsic expects a float. */
662          pos_args[1].out[1] = ac_to_float(&ctx->ac, edgeflag_value);
663       }
664 
665       if (writes_vrs) {
666          /* Bits [2:3] = VRS rate X
667           * Bits [4:5] = VRS rate Y
668           *
669           * The range is [-2, 1]. Values:
670           *   1: 2x coarser shading rate in that direction.
671           *   0: normal shading rate
672           *  -1: 2x finer shading rate (sample shading, not directional)
673           *  -2: 4x finer shading rate (sample shading, not directional)
674           *
675           * Sample shading can't go above 8 samples, so both numbers can't be -2
676           * at the same time.
677           */
678          LLVMValueRef rates = LLVMConstInt(ctx->ac.i32, (1 << 2) | (1 << 4), 0);
679 
680          /* If Pos.W != 1 (typical for non-GUI elements), use 2x2 coarse shading. */
681          rates = LLVMBuildSelect(ctx->ac.builder,
682                                  LLVMBuildFCmp(ctx->ac.builder, LLVMRealUNE,
683                                                pos_args[0].out[3], ctx->ac.f32_1, ""),
684                                  rates, ctx->ac.i32_0, "");
685 
686          LLVMValueRef v = ac_to_integer(&ctx->ac, pos_args[1].out[1]);
687          v = LLVMBuildOr(ctx->ac.builder, v, rates, "");
688          pos_args[1].out[1] = ac_to_float(&ctx->ac, v);
689       }
690 
691       if (ctx->screen->info.chip_class >= GFX9) {
692          /* GFX9 has the layer in out.z[10:0] and the viewport
693           * index in out.z[19:16].
694           */
695          if (shader->selector->info.writes_layer)
696             pos_args[1].out[2] = layer_value;
697 
698          if (shader->selector->info.writes_viewport_index) {
699             LLVMValueRef v = viewport_index_value;
700 
701             v = ac_to_integer(&ctx->ac, v);
702             v = LLVMBuildShl(ctx->ac.builder, v, LLVMConstInt(ctx->ac.i32, 16, 0), "");
703             v = LLVMBuildOr(ctx->ac.builder, v, ac_to_integer(&ctx->ac, pos_args[1].out[2]), "");
704             pos_args[1].out[2] = ac_to_float(&ctx->ac, v);
705             pos_args[1].enabled_channels |= 1 << 2;
706          }
707       } else {
708          if (shader->selector->info.writes_layer)
709             pos_args[1].out[2] = layer_value;
710 
711          if (shader->selector->info.writes_viewport_index) {
712             pos_args[1].out[3] = viewport_index_value;
713             pos_args[1].enabled_channels |= 1 << 3;
714          }
715       }
716    }
717 
718    for (i = 0; i < 4; i++)
719       if (pos_args[i].out[0])
720          shader->info.nr_pos_exports++;
721 
722    /* GFX10 (Navi1x) skip POS0 exports if EXEC=0 and DONE=0, causing a hang.
723     * Setting valid_mask=1 prevents it and has no other effect.
724     */
725    if (ctx->screen->info.chip_class == GFX10)
726       pos_args[0].valid_mask = 1;
727 
728    pos_idx = 0;
729    for (i = 0; i < 4; i++) {
730       if (!pos_args[i].out[0])
731          continue;
732 
733       /* Specify the target we are exporting */
734       pos_args[i].target = V_008DFC_SQ_EXP_POS + pos_idx++;
735 
736       if (pos_idx == shader->info.nr_pos_exports) {
737          /* Specify that this is the last export */
738          pos_args[i].done = 1;
739 
740          /* If a shader has no param exports, rasterization can start before
741           * the shader finishes and thus memory stores might not finish before
742           * the pixel shader starts.
743           *
744           * VLOAD is for atomics with return.
745           */
746          if (ctx->screen->info.chip_class >= GFX10 &&
747              !shader->info.nr_param_exports &&
748              shader->selector->info.base.writes_memory)
749             ac_build_waitcnt(&ctx->ac, AC_WAIT_VLOAD | AC_WAIT_VSTORE);
750       }
751 
752       ac_build_export(&ctx->ac, &pos_args[i]);
753    }
754 
755    /* Build parameter exports. */
756    for (unsigned i = 0; i < shader->info.nr_param_exports; i++)
757       ac_build_export(&ctx->ac, &param_exports[i]);
758 }
759 
si_llvm_emit_vs_epilogue(struct ac_shader_abi * abi)760 void si_llvm_emit_vs_epilogue(struct ac_shader_abi *abi)
761 {
762    struct si_shader_context *ctx = si_shader_context_from_abi(abi);
763    struct si_shader_info *info = &ctx->shader->selector->info;
764    struct si_shader_output_values *outputs = NULL;
765    LLVMValueRef *addrs = abi->outputs;
766    int i, j;
767 
768    assert(!ctx->shader->is_gs_copy_shader);
769    assert(info->num_outputs <= AC_LLVM_MAX_OUTPUTS);
770 
771    outputs = MALLOC((info->num_outputs + 1) * sizeof(outputs[0]));
772 
773    for (i = 0; i < info->num_outputs; i++) {
774       outputs[i].semantic = info->output_semantic[i];
775 
776       for (j = 0; j < 4; j++) {
777          outputs[i].values[j] = LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + j], "");
778          outputs[i].vertex_stream[j] = (info->output_streams[i] >> (2 * j)) & 3;
779       }
780    }
781 
782    if (!ctx->screen->use_ngg_streamout && ctx->shader->selector->so.num_outputs)
783       si_llvm_emit_streamout(ctx, outputs, i, 0);
784 
785    /* Export PrimitiveID. */
786    if (ctx->shader->key.mono.u.vs_export_prim_id) {
787       outputs[i].semantic = VARYING_SLOT_PRIMITIVE_ID;
788       outputs[i].values[0] = ac_to_float(&ctx->ac, si_get_primitive_id(ctx, 0));
789       for (j = 1; j < 4; j++)
790          outputs[i].values[j] = LLVMConstReal(ctx->ac.f32, 0);
791 
792       memset(outputs[i].vertex_stream, 0, sizeof(outputs[i].vertex_stream));
793       i++;
794    }
795 
796    si_llvm_build_vs_exports(ctx, outputs, i);
797    FREE(outputs);
798 }
799 
800 /**
801  * Build the vertex shader prolog function.
802  *
803  * The inputs are the same as VS (a lot of SGPRs and 4 VGPR system values).
804  * All inputs are returned unmodified. The vertex load indices are
805  * stored after them, which will be used by the API VS for fetching inputs.
806  *
807  * For example, the expected outputs for instance_divisors[] = {0, 1, 2} are:
808  *   input_v0,
809  *   input_v1,
810  *   input_v2,
811  *   input_v3,
812  *   (VertexID + BaseVertex),
813  *   (InstanceID + StartInstance),
814  *   (InstanceID / 2 + StartInstance)
815  */
void si_llvm_build_vs_prolog(struct si_shader_context *ctx, union si_shader_part_key *key)
{
   LLVMTypeRef *returns;
   LLVMValueRef ret, func;
   int num_returns, i;
   unsigned first_vs_vgpr = key->vs_prolog.num_merged_next_stage_vgprs;
   /* The 4 VS system-value VGPRs follow any VGPRs belonging to the merged
    * next stage (HS/GS). */
   unsigned num_input_vgprs =
      key->vs_prolog.num_merged_next_stage_vgprs + 4;
   struct ac_arg input_sgpr_param[key->vs_prolog.num_input_sgprs];
   struct ac_arg input_vgpr_param[10];
   LLVMValueRef input_vgprs[10];
   unsigned num_all_input_regs = key->vs_prolog.num_input_sgprs + num_input_vgprs;
   /* Merged shaders (GFX9+) have 8 system SGPRs in front of the user SGPRs;
    * user_sgpr_base skips over them when indexing function parameters. */
   unsigned user_sgpr_base = key->vs_prolog.num_merged_next_stage_vgprs ? 8 : 0;

   memset(&ctx->args, 0, sizeof(ctx->args));

   /* 4 preloaded VGPRs + vertex load indices as prolog outputs */
   returns = alloca((num_all_input_regs + key->vs_prolog.num_inputs) * sizeof(LLVMTypeRef));
   num_returns = 0;

   /* Declare input and output SGPRs. */
   for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) {
      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &input_sgpr_param[i]);
      returns[num_returns++] = ctx->ac.i32;
   }

   /* Preloaded VGPRs (outputs must be floats) */
   for (i = 0; i < num_input_vgprs; i++) {
      ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &input_vgpr_param[i]);
      returns[num_returns++] = ctx->ac.f32;
   }

   /* Vertex load indices. */
   for (i = 0; i < key->vs_prolog.num_inputs; i++)
      returns[num_returns++] = ctx->ac.f32;

   /* Create the function. */
   si_llvm_create_func(ctx, "vs_prolog", returns, num_returns, 0);
   func = ctx->main_fn;

   for (i = 0; i < num_input_vgprs; i++) {
      input_vgprs[i] = ac_get_arg(&ctx->ac, input_vgpr_param[i]);
   }

   if (key->vs_prolog.num_merged_next_stage_vgprs) {
      /* A monolithic merged shader sets EXEC itself; a separately-compiled
       * prolog must enable all lanes here. */
      if (!key->vs_prolog.is_monolithic)
         ac_init_exec_full_mask(&ctx->ac);

      if (key->vs_prolog.as_ls && ctx->screen->info.has_ls_vgpr_init_bug) {
         /* If there are no HS threads, SPI loads the LS VGPRs
          * starting at VGPR 0. Shift them back to where they
          * belong.
          */
         /* input_sgpr_param[3] bits [8:15] hold the HS thread count
          * (merged-wave-info SGPR). */
         LLVMValueRef has_hs_threads =
            LLVMBuildICmp(ctx->ac.builder, LLVMIntNE,
                          si_unpack_param(ctx, input_sgpr_param[3], 8, 8), ctx->ac.i32_0, "");

         /* Shift VGPRs up by 2; iterate high-to-low so a source slot is
          * never overwritten before it's read. */
         for (i = 4; i > 0; --i) {
            input_vgprs[i + 1] = LLVMBuildSelect(ctx->ac.builder, has_hs_threads,
                                                 input_vgprs[i + 1], input_vgprs[i - 1], "");
         }
      }
   }

   /* The culling code stored the LDS addresses of the VGPRs into those VGPRs. Load them. */
   if (key->vs_prolog.load_vgprs_after_culling) {
      for (i = 5; i <= 8; i++) {
         /* VGPR 7 (TES rel patch id) was stored as an 8-bit value in LDS,
          * so load it as i8 and widen back to i32. */
         bool is_tes_rel_patch_id = i == 7;
         input_vgprs[i] = LLVMBuildIntToPtr(ctx->ac.builder, input_vgprs[i],
                                            LLVMPointerType(is_tes_rel_patch_id ? ctx->ac.i8 : ctx->ac.i32,
                                                            AC_ADDR_SPACE_LDS), "");
         input_vgprs[i] = LLVMBuildLoad(ctx->ac.builder, input_vgprs[i], "");
         if (is_tes_rel_patch_id)
            input_vgprs[i] = LLVMBuildZExt(ctx->ac.builder, input_vgprs[i], ctx->ac.i32, "");
      }
   }

   /* GFX10+ always puts InstanceID in the 4th VS VGPR; older chips place it
    * depending on whether the shader runs as LS. */
   unsigned vertex_id_vgpr = first_vs_vgpr;
   unsigned instance_id_vgpr = ctx->screen->info.chip_class >= GFX10
                                  ? first_vs_vgpr + 3
                                  : first_vs_vgpr + (key->vs_prolog.as_ls ? 2 : 1);

   ctx->abi.vertex_id = input_vgprs[vertex_id_vgpr];
   ctx->abi.instance_id = input_vgprs[instance_id_vgpr];

   /* Copy inputs to outputs. This should be no-op, as the registers match,
    * but it will prevent the compiler from overwriting them unintentionally.
    */
   ret = ctx->return_value;
   for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) {
      LLVMValueRef p = LLVMGetParam(func, i);
      ret = LLVMBuildInsertValue(ctx->ac.builder, ret, p, i, "");
   }
   for (i = 0; i < num_input_vgprs; i++) {
      LLVMValueRef p = input_vgprs[i];

      if (i == vertex_id_vgpr)
         p = ctx->abi.vertex_id;
      else if (i == instance_id_vgpr)
         p = ctx->abi.instance_id;

      /* VGPR return values must be floats (see "returns" above). */
      p = ac_to_float(&ctx->ac, p);
      ret = LLVMBuildInsertValue(ctx->ac.builder, ret, p, key->vs_prolog.num_input_sgprs + i, "");
   }

   /* Compute vertex load indices from instance divisors. */
   LLVMValueRef instance_divisor_constbuf = NULL;

   /* Divisors other than 0 and 1 are precomputed by the driver into a
    * constant buffer holding fast-udiv factors; load its descriptor once. */
   if (key->vs_prolog.states.instance_divisor_is_fetched) {
      LLVMValueRef list = si_prolog_get_internal_bindings(ctx);
      LLVMValueRef buf_index = LLVMConstInt(ctx->ac.i32, SI_VS_CONST_INSTANCE_DIVISORS, 0);
      instance_divisor_constbuf = ac_build_load_to_sgpr(&ctx->ac, list, buf_index);
   }

   for (i = 0; i < key->vs_prolog.num_inputs; i++) {
      bool divisor_is_one = key->vs_prolog.states.instance_divisor_is_one & (1u << i);
      bool divisor_is_fetched = key->vs_prolog.states.instance_divisor_is_fetched & (1u << i);
      LLVMValueRef index = NULL;

      if (divisor_is_one) {
         index = ctx->abi.instance_id;
      } else if (divisor_is_fetched) {
         /* Each attribute has 4 dwords of udiv factors at offset i*16. */
         LLVMValueRef udiv_factors[4];

         for (unsigned j = 0; j < 4; j++) {
            udiv_factors[j] = si_buffer_load_const(ctx, instance_divisor_constbuf,
                                                   LLVMConstInt(ctx->ac.i32, i * 16 + j * 4, 0));
            udiv_factors[j] = ac_to_integer(&ctx->ac, udiv_factors[j]);
         }
         /* The faster NUW version doesn't work when InstanceID == UINT_MAX.
          * Such InstanceID might not be achievable in a reasonable time though.
          */
         index = ac_build_fast_udiv_nuw(&ctx->ac, ctx->abi.instance_id, udiv_factors[0],
                                        udiv_factors[1], udiv_factors[2], udiv_factors[3]);
      }

      if (divisor_is_one || divisor_is_fetched) {
         /* Add StartInstance. */
         index =
            LLVMBuildAdd(ctx->ac.builder, index,
                         LLVMGetParam(ctx->main_fn, user_sgpr_base + SI_SGPR_START_INSTANCE), "");
      } else {
         /* VertexID + BaseVertex */
         index = LLVMBuildAdd(ctx->ac.builder, ctx->abi.vertex_id,
                              LLVMGetParam(func, user_sgpr_base + SI_SGPR_BASE_VERTEX), "");
      }

      /* Load indices are returned after all input registers (floats). */
      index = ac_to_float(&ctx->ac, index);
      ret = LLVMBuildInsertValue(ctx->ac.builder, ret, index, ctx->args.arg_count + i, "");
   }

   si_llvm_build_ret(ctx, ret);
}
969 
get_base_vertex(struct ac_shader_abi * abi,bool non_indexed_is_zero)970 static LLVMValueRef get_base_vertex(struct ac_shader_abi *abi, bool non_indexed_is_zero)
971 {
972    struct si_shader_context *ctx = si_shader_context_from_abi(abi);
973 
974    /* This doesn't happen with GL: */
975    if (!non_indexed_is_zero)
976       return ac_get_arg(&ctx->ac, ctx->args.base_vertex);
977 
978    /* For non-indexed draws, the base vertex set by the driver
979     * (for direct draws) or the CP (for indirect draws) is the
980     * first vertex ID, but GLSL expects 0 to be returned.
981     */
982    LLVMValueRef indexed = si_unpack_param(ctx, ctx->vs_state_bits, 1, 1);
983    indexed = LLVMBuildTrunc(ctx->ac.builder, indexed, ctx->ac.i1, "");
984 
985    return LLVMBuildSelect(ctx->ac.builder, indexed, ac_get_arg(&ctx->ac, ctx->args.base_vertex),
986                           ctx->ac.i32_0, "");
987 }
988 
/**
 * Install the ABI callbacks for a vertex shader: pick the epilogue matching
 * how this VS feeds the rest of the pipeline (LS, ES, NGG culling, NGG, or
 * plain hardware VS), plus the input/system-value loaders.
 */
void si_llvm_init_vs_callbacks(struct si_shader_context *ctx, bool ngg_cull_shader)
{
   struct si_shader *shader = ctx->shader;

   ctx->abi.emit_outputs = shader->key.as_ls    ? si_llvm_emit_ls_epilogue
                           : shader->key.as_es  ? si_llvm_emit_es_epilogue
                           : ngg_cull_shader    ? gfx10_emit_ngg_culling_epilogue
                           : shader->key.as_ngg ? gfx10_emit_ngg_epilogue
                                                : si_llvm_emit_vs_epilogue;

   ctx->abi.load_base_vertex = get_base_vertex;
   ctx->abi.load_inputs = si_load_vs_input;
}
1007