1 /*
2  * Copyright 2020 Advanced Micro Devices, Inc.
3  * All Rights Reserved.
4  *
5  * Permission is hereby granted, free of charge, to any person obtaining a
6  * copy of this software and associated documentation files (the "Software"),
7  * to deal in the Software without restriction, including without limitation
8  * on the rights to use, copy, modify, merge, publish, distribute, sub
9  * license, and/or sell copies of the Software, and to permit persons to whom
10  * the Software is furnished to do so, subject to the following conditions:
11  *
12  * The above copyright notice and this permission notice (including the next
13  * paragraph) shall be included in all copies or substantial portions of the
14  * Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
19  * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
20  * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
21  * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
22  * USE OR OTHER DEALINGS IN THE SOFTWARE.
23  */
24 
25 #include "si_pipe.h"
26 #include "si_shader_internal.h"
27 #include "sid.h"
28 
get_rel_patch_id(struct si_shader_context * ctx)29 static LLVMValueRef get_rel_patch_id(struct si_shader_context *ctx)
30 {
31    switch (ctx->stage) {
32    case MESA_SHADER_TESS_CTRL:
33       return si_unpack_param(ctx, ctx->args.tcs_rel_ids, 0, 8);
34 
35    case MESA_SHADER_TESS_EVAL:
36       return ac_get_arg(&ctx->ac, ctx->args.tes_rel_patch_id);
37 
38    default:
39       assert(0);
40       return NULL;
41    }
42 }
43 
44 /* Tessellation shaders pass outputs to the next shader using LDS.
45  *
46  * LS outputs = TCS inputs
47  * TCS outputs = TES inputs
48  *
49  * The LDS layout is:
50  * - TCS inputs for patch 0
51  * - TCS inputs for patch 1
52  * - TCS inputs for patch 2		= get_tcs_in_current_patch_offset (if RelPatchID==2)
53  * - ...
54  * - TCS outputs for patch 0            = get_tcs_out_patch0_offset
55  * - Per-patch TCS outputs for patch 0  = get_tcs_out_patch0_patch_data_offset
56  * - TCS outputs for patch 1
57  * - Per-patch TCS outputs for patch 1
58  * - TCS outputs for patch 2            = get_tcs_out_current_patch_offset (if RelPatchID==2)
59  * - Per-patch TCS outputs for patch 2  = get_tcs_out_current_patch_data_offset (if RelPatchID==2)
60  * - ...
61  *
62  * All three shaders VS(LS), TCS, TES share the same LDS space.
63  */
64 
get_tcs_in_patch_stride(struct si_shader_context * ctx)65 static LLVMValueRef get_tcs_in_patch_stride(struct si_shader_context *ctx)
66 {
67    return si_unpack_param(ctx, ctx->vs_state_bits, 11, 13);
68 }
69 
get_tcs_out_vertex_dw_stride_constant(struct si_shader_context * ctx)70 static unsigned get_tcs_out_vertex_dw_stride_constant(struct si_shader_context *ctx)
71 {
72    assert(ctx->stage == MESA_SHADER_TESS_CTRL);
73 
74    if (ctx->shader->key.ge.mono.u.ff_tcs_inputs_to_copy)
75       return util_last_bit64(ctx->shader->key.ge.mono.u.ff_tcs_inputs_to_copy) * 4;
76 
77    return util_last_bit64(ctx->shader->selector->outputs_written) * 4;
78 }
79 
get_tcs_out_vertex_dw_stride(struct si_shader_context * ctx)80 static LLVMValueRef get_tcs_out_vertex_dw_stride(struct si_shader_context *ctx)
81 {
82    unsigned stride = get_tcs_out_vertex_dw_stride_constant(ctx);
83 
84    return LLVMConstInt(ctx->ac.i32, stride, 0);
85 }
86 
get_tcs_out_patch_stride(struct si_shader_context * ctx)87 static LLVMValueRef get_tcs_out_patch_stride(struct si_shader_context *ctx)
88 {
89    if (ctx->shader->key.ge.mono.u.ff_tcs_inputs_to_copy)
90       return si_unpack_param(ctx, ctx->tcs_out_lds_layout, 0, 13);
91 
92    const struct si_shader_info *info = &ctx->shader->selector->info;
93    unsigned tcs_out_vertices = info->base.tess.tcs_vertices_out;
94    unsigned vertex_dw_stride = get_tcs_out_vertex_dw_stride_constant(ctx);
95    unsigned num_patch_outputs = util_last_bit64(ctx->shader->selector->patch_outputs_written);
96    unsigned patch_dw_stride = tcs_out_vertices * vertex_dw_stride + num_patch_outputs * 4;
97    return LLVMConstInt(ctx->ac.i32, patch_dw_stride, 0);
98 }
99 
get_tcs_out_patch0_offset(struct si_shader_context * ctx)100 static LLVMValueRef get_tcs_out_patch0_offset(struct si_shader_context *ctx)
101 {
102    return LLVMBuildMul(ctx->ac.builder, si_unpack_param(ctx, ctx->tcs_out_lds_offsets, 0, 16),
103                        LLVMConstInt(ctx->ac.i32, 4, 0), "");
104 }
105 
get_tcs_out_patch0_patch_data_offset(struct si_shader_context * ctx)106 static LLVMValueRef get_tcs_out_patch0_patch_data_offset(struct si_shader_context *ctx)
107 {
108    return LLVMBuildMul(ctx->ac.builder, si_unpack_param(ctx, ctx->tcs_out_lds_offsets, 16, 16),
109                        LLVMConstInt(ctx->ac.i32, 4, 0), "");
110 }
111 
get_tcs_in_current_patch_offset(struct si_shader_context * ctx)112 static LLVMValueRef get_tcs_in_current_patch_offset(struct si_shader_context *ctx)
113 {
114    LLVMValueRef patch_stride = get_tcs_in_patch_stride(ctx);
115    LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);
116 
117    return LLVMBuildMul(ctx->ac.builder, patch_stride, rel_patch_id, "");
118 }
119 
get_tcs_out_current_patch_offset(struct si_shader_context * ctx)120 static LLVMValueRef get_tcs_out_current_patch_offset(struct si_shader_context *ctx)
121 {
122    LLVMValueRef patch0_offset = get_tcs_out_patch0_offset(ctx);
123    LLVMValueRef patch_stride = get_tcs_out_patch_stride(ctx);
124    LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);
125 
126    return ac_build_imad(&ctx->ac, patch_stride, rel_patch_id, patch0_offset);
127 }
128 
get_tcs_out_current_patch_data_offset(struct si_shader_context * ctx)129 static LLVMValueRef get_tcs_out_current_patch_data_offset(struct si_shader_context *ctx)
130 {
131    LLVMValueRef patch0_patch_data_offset = get_tcs_out_patch0_patch_data_offset(ctx);
132    LLVMValueRef patch_stride = get_tcs_out_patch_stride(ctx);
133    LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);
134 
135    return ac_build_imad(&ctx->ac, patch_stride, rel_patch_id, patch0_patch_data_offset);
136 }
137 
get_num_tcs_out_vertices(struct si_shader_context * ctx)138 static LLVMValueRef get_num_tcs_out_vertices(struct si_shader_context *ctx)
139 {
140    unsigned tcs_out_vertices =
141       ctx->shader->selector ? ctx->shader->selector->info.base.tess.tcs_vertices_out
142                             : 0;
143 
144    /* If !tcs_out_vertices, it's either the fixed-func TCS or the TCS epilog. */
145    if (ctx->stage == MESA_SHADER_TESS_CTRL && tcs_out_vertices)
146       return LLVMConstInt(ctx->ac.i32, tcs_out_vertices, 0);
147 
148    return LLVMBuildAdd(ctx->ac.builder,
149                        si_unpack_param(ctx, ctx->tcs_offchip_layout, 6, 5), ctx->ac.i32_1, "");
150 }
151 
get_tcs_in_vertex_dw_stride(struct si_shader_context * ctx)152 static LLVMValueRef get_tcs_in_vertex_dw_stride(struct si_shader_context *ctx)
153 {
154    unsigned stride;
155 
156    switch (ctx->stage) {
157    case MESA_SHADER_VERTEX:
158       stride = ctx->shader->selector->lshs_vertex_stride / 4;
159       return LLVMConstInt(ctx->ac.i32, stride, 0);
160 
161    case MESA_SHADER_TESS_CTRL:
162       if (ctx->screen->info.chip_class >= GFX9 && ctx->shader->is_monolithic) {
163          stride = ctx->shader->key.ge.part.tcs.ls->lshs_vertex_stride / 4;
164          return LLVMConstInt(ctx->ac.i32, stride, 0);
165       }
166       return si_unpack_param(ctx, ctx->vs_state_bits, 24, 8);
167 
168    default:
169       assert(0);
170       return NULL;
171    }
172 }
173 
174 static LLVMValueRef
get_dw_address_from_generic_indices(struct si_shader_context * ctx,LLVMValueRef vertex_dw_stride,LLVMValueRef base_addr,LLVMValueRef vertex_index,LLVMValueRef param_index,ubyte name)175 get_dw_address_from_generic_indices(struct si_shader_context *ctx, LLVMValueRef vertex_dw_stride,
176                                     LLVMValueRef base_addr, LLVMValueRef vertex_index,
177                                     LLVMValueRef param_index, ubyte name)
178 {
179    if (vertex_dw_stride) {
180       base_addr = ac_build_imad(&ctx->ac, vertex_index, vertex_dw_stride, base_addr);
181    }
182 
183    if (param_index) {
184       base_addr = ac_build_imad(&ctx->ac, param_index, LLVMConstInt(ctx->ac.i32, 4, 0), base_addr);
185    }
186 
187    int param = name >= VARYING_SLOT_PATCH0 ||
188                name == VARYING_SLOT_TESS_LEVEL_INNER ||
189                name == VARYING_SLOT_TESS_LEVEL_OUTER
190                   ? si_shader_io_get_unique_index_patch(name)
191                   : si_shader_io_get_unique_index(name, false);
192 
193    /* Add the base address of the element. */
194    return LLVMBuildAdd(ctx->ac.builder, base_addr, LLVMConstInt(ctx->ac.i32, param * 4, 0), "");
195 }
196 
197 /* The offchip buffer layout for TCS->TES is
198  *
199  * - attribute 0 of patch 0 vertex 0
200  * - attribute 0 of patch 0 vertex 1
201  * - attribute 0 of patch 0 vertex 2
202  *   ...
203  * - attribute 0 of patch 1 vertex 0
204  * - attribute 0 of patch 1 vertex 1
205  *   ...
206  * - attribute 1 of patch 0 vertex 0
207  * - attribute 1 of patch 0 vertex 1
208  *   ...
209  * - per patch attribute 0 of patch 0
210  * - per patch attribute 0 of patch 1
211  *   ...
212  *
213  * Note that every attribute has 4 components.
214  */
/**
 * Compute the address of an attribute in the TCS->TES offchip buffer.
 *
 * \param rel_patch_id  relative patch ID
 * \param vertex_index  vertex within the patch, or NULL for a per-patch attribute
 * \param param_index   attribute slot index
 *
 * Per-vertex:  (rel_patch_id * vertices_per_patch + vertex_index +
 *               param_index * total_vertices) * 16
 * Per-patch:   (rel_patch_id + param_index * num_patches) * 16 +
 *              the patch data offset unpacked from tcs_offchip_layout
 *
 * num_patches is the tcs_offchip_layout field plus 1.
 */
static LLVMValueRef get_tcs_tes_buffer_address(struct si_shader_context *ctx,
                                               LLVMValueRef rel_patch_id, LLVMValueRef vertex_index,
                                               LLVMValueRef param_index)
{
   LLVMBuilderRef builder = ctx->ac.builder;
   const bool per_vertex = vertex_index != NULL;

   LLVMValueRef verts_per_patch = get_num_tcs_out_vertices(ctx);
   LLVMValueRef patch_count = si_unpack_param(ctx, ctx->tcs_offchip_layout, 0, 6);
   patch_count = LLVMBuildAdd(builder, patch_count, ctx->ac.i32_1, "");

   /* Emitted for both paths; only the per-vertex path consumes it. */
   LLVMValueRef vert_count = LLVMBuildMul(builder, verts_per_patch, patch_count, "");

   LLVMValueRef elem_index = rel_patch_id;
   LLVMValueRef slot_stride = patch_count;

   if (per_vertex) {
      elem_index = ac_build_imad(&ctx->ac, rel_patch_id, verts_per_patch, vertex_index);
      slot_stride = vert_count;
   }

   elem_index = ac_build_imad(&ctx->ac, param_index, slot_stride, elem_index);

   LLVMValueRef scaled =
      LLVMBuildMul(builder, elem_index, LLVMConstInt(ctx->ac.i32, 16, 0), "");

   if (per_vertex)
      return scaled;

   /* Per-patch attributes live after the per-vertex ones. */
   LLVMValueRef patch_data_offset = si_unpack_param(ctx, ctx->tcs_offchip_layout, 11, 21);
   return LLVMBuildAdd(builder, scaled, patch_data_offset, "");
}
246 
/**
 * Compute the offchip buffer address of a varying given its slot name.
 *
 * The unique index of \p name (per-patch index for slots >=
 * VARYING_SLOT_PATCH0 and for the tess level slots) is added to the optional
 * indirect \p param_index, and the sum is passed to
 * get_tcs_tes_buffer_address() together with the current relative patch ID.
 */
static LLVMValueRef get_tcs_tes_buffer_address_from_generic_indices(struct si_shader_context *ctx,
                                                                    LLVMValueRef vertex_index,
                                                                    LLVMValueRef param_index,
                                                                    ubyte name)
{
   const bool is_patch_slot = name >= VARYING_SLOT_PATCH0 ||
                              name == VARYING_SLOT_TESS_LEVEL_INNER ||
                              name == VARYING_SLOT_TESS_LEVEL_OUTER;
   const unsigned slot = is_patch_slot ? si_shader_io_get_unique_index_patch(name)
                                       : si_shader_io_get_unique_index(name, false);
   LLVMValueRef slot_const = LLVMConstInt(ctx->ac.i32, slot, 0);
   LLVMValueRef full_index = slot_const;

   if (param_index)
      full_index = LLVMBuildAdd(ctx->ac.builder, param_index, slot_const, "");

   LLVMValueRef patch = get_rel_patch_id(ctx);
   return get_tcs_tes_buffer_address(ctx, patch, vertex_index, full_index);
}
269 
/**
 * Load 4 channels from a buffer and return all of them or a single one.
 *
 * \param type           channel type; the load result is bitcast to a
 *                       4-element vector of this type
 * \param swizzle        channel to extract, or ~0 to return the whole vector
 * \param buffer         buffer resource descriptor
 * \param offset, base   offsets forwarded to ac_build_buffer_load (base
 *                       first, then offset)
 * \param can_speculate  forwarded to ac_build_buffer_load
 */
static LLVMValueRef buffer_load(struct si_shader_context *ctx, LLVMTypeRef type, unsigned swizzle,
                                LLVMValueRef buffer, LLVMValueRef offset, LLVMValueRef base,
                                bool can_speculate)
{
   LLVMTypeRef vec4_type = LLVMVectorType(type, 4);
   LLVMValueRef loaded = ac_build_buffer_load(&ctx->ac, buffer, 4, NULL, base, offset, 0, type,
                                              ac_glc, can_speculate, false);
   LLVMValueRef vec4 = LLVMBuildBitCast(ctx->ac.builder, loaded, vec4_type, "");

   if (swizzle == ~0u)
      return vec4;

   LLVMValueRef chan = LLVMConstInt(ctx->ac.i32, swizzle, 0);
   return LLVMBuildExtractElement(ctx->ac.builder, vec4, chan, "");
}
291 
292 /**
293  * Load from LSHS LDS storage.
294  *
295  * \param type		output value type
296  * \param swizzle	offset (typically 0..3); it can be ~0, which loads a vec4
297  * \param dw_addr	address in dwords
298  */
lshs_lds_load(struct si_shader_context * ctx,LLVMTypeRef type,unsigned swizzle,LLVMValueRef dw_addr)299 static LLVMValueRef lshs_lds_load(struct si_shader_context *ctx, LLVMTypeRef type, unsigned swizzle,
300                                   LLVMValueRef dw_addr)
301 {
302    LLVMValueRef value;
303 
304    if (swizzle == ~0) {
305       LLVMValueRef values[4];
306 
307       for (unsigned chan = 0; chan < 4; chan++)
308          values[chan] = lshs_lds_load(ctx, type, chan, dw_addr);
309 
310       return ac_build_gather_values(&ctx->ac, values, 4);
311    }
312 
313    dw_addr = LLVMBuildAdd(ctx->ac.builder, dw_addr, LLVMConstInt(ctx->ac.i32, swizzle, 0), "");
314    value = ac_lds_load(&ctx->ac, dw_addr);
315    return LLVMBuildBitCast(ctx->ac.builder, value, type, "");
316 }
317 
318 /**
319  * Store to LSHS LDS storage.
320  *
321  * \param swizzle	offset (typically 0..3)
322  * \param dw_addr	address in dwords
323  * \param value		value to store
324  */
lshs_lds_store(struct si_shader_context * ctx,unsigned dw_offset_imm,LLVMValueRef dw_addr,LLVMValueRef value)325 static void lshs_lds_store(struct si_shader_context *ctx, unsigned dw_offset_imm,
326                            LLVMValueRef dw_addr, LLVMValueRef value)
327 {
328    dw_addr =
329       LLVMBuildAdd(ctx->ac.builder, dw_addr, LLVMConstInt(ctx->ac.i32, dw_offset_imm, 0), "");
330 
331    ac_lds_store(&ctx->ac, dw_addr, value);
332 }
333 
/* Selects which tessellation ring get_tess_ring_descriptor() describes. */
enum si_tess_ring {
   /* TCS-side address, additionally offset by tess_offchip_ring_size. */
   TCS_FACTOR_RING,
   /* Offchip ring as seen by TCS: address taken from tcs_out_lds_layout. */
   TESS_OFFCHIP_RING_TCS,
   /* Offchip ring as seen by TES: address taken from tes_offchip_addr. */
   TESS_OFFCHIP_RING_TES,
};
340 
get_tess_ring_descriptor(struct si_shader_context * ctx,enum si_tess_ring ring)341 static LLVMValueRef get_tess_ring_descriptor(struct si_shader_context *ctx, enum si_tess_ring ring)
342 {
343    LLVMBuilderRef builder = ctx->ac.builder;
344    LLVMValueRef addr = ac_get_arg(
345       &ctx->ac, ring == TESS_OFFCHIP_RING_TES ? ctx->tes_offchip_addr : ctx->tcs_out_lds_layout);
346 
347    /* TCS only receives high 13 bits of the address. */
348    if (ring == TESS_OFFCHIP_RING_TCS || ring == TCS_FACTOR_RING) {
349       addr = LLVMBuildAnd(builder, addr, LLVMConstInt(ctx->ac.i32, 0xfff80000, 0), "");
350    }
351 
352    if (ring == TCS_FACTOR_RING) {
353       unsigned tf_offset = ctx->screen->tess_offchip_ring_size;
354       addr = LLVMBuildAdd(builder, addr, LLVMConstInt(ctx->ac.i32, tf_offset, 0), "");
355    }
356 
357    uint32_t rsrc3 = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
358                     S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);
359 
360    if (ctx->screen->info.chip_class >= GFX10)
361       rsrc3 |= S_008F0C_FORMAT(V_008F0C_GFX10_FORMAT_32_FLOAT) |
362                S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) | S_008F0C_RESOURCE_LEVEL(1);
363    else
364       rsrc3 |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
365                S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
366 
367    LLVMValueRef desc[4];
368    desc[0] = addr;
369    desc[1] = LLVMConstInt(ctx->ac.i32, S_008F04_BASE_ADDRESS_HI(ctx->screen->info.address32_hi), 0);
370    desc[2] = LLVMConstInt(ctx->ac.i32, 0xffffffff, 0);
371    desc[3] = LLVMConstInt(ctx->ac.i32, rsrc3, false);
372 
373    return ac_build_gather_values(&ctx->ac, desc, 4);
374 }
375 
si_llvm_preload_tes_rings(struct si_shader_context * ctx)376 void si_llvm_preload_tes_rings(struct si_shader_context *ctx)
377 {
378    ctx->tess_offchip_ring = get_tess_ring_descriptor(ctx, TESS_OFFCHIP_RING_TES);
379 }
380 
/**
 * Load a TCS input or read back a TCS output.
 *
 * \param type                      type each loaded channel is bitcast to
 * \param vertex_index              vertex to read, or NULL for a per-patch varying
 * \param param_index               optional indirect slot index (NULL if none)
 * \param driver_location           index into the shader info input/output tables
 * \param component                 first channel to load
 * \param num_components            number of consecutive channels to load
 * \param load_input                true to read a TCS input, false for a TCS output
 * \param vertex_index_is_invoc_id  one of the conditions for the VGPR fast path
 *                                  (presumably: vertex_index is the invocation ID)
 * \return channels [component, component + num_components) gathered with
 *         ac_build_varying_gather_values
 */
static LLVMValueRef si_nir_load_tcs_varyings(struct ac_shader_abi *abi, LLVMTypeRef type,
                                             LLVMValueRef vertex_index, LLVMValueRef param_index,
                                             unsigned driver_location, unsigned component,
                                             unsigned num_components, bool load_input,
                                             bool vertex_index_is_invoc_id)
{
   struct si_shader_context *ctx = si_shader_context_from_abi(abi);
   struct si_shader_info *info = &ctx->shader->selector->info;
   const unsigned chan_end = component + num_components;
   const ubyte semantic = load_input ? info->input[driver_location].semantic
                                     : info->output_semantic[driver_location];
   LLVMValueRef chans[4];

   /* Load the TCS input from a VGPR if possible: the inputs are function
    * parameters that follow tcs_rel_ids, 4 per unique slot index.
    */
   if (load_input && vertex_index_is_invoc_id && !param_index &&
       ctx->shader->key.ge.opt.same_patch_vertices) {
      const unsigned first_param = ctx->args.tcs_rel_ids.arg_index + 1 +
                                   si_shader_io_get_unique_index(semantic, false) * 4;

      for (unsigned chan = component; chan < chan_end; chan++) {
         LLVMValueRef arg = LLVMGetParam(ctx->main_fn, first_param + chan);

         chans[chan] = LLVMBuildBitCast(ctx->ac.builder, arg, type, "");
      }

      return ac_build_varying_gather_values(&ctx->ac, chans, num_components, component);
   }

   const bool is_patch = vertex_index == NULL;
   assert((semantic >= VARYING_SLOT_PATCH0 ||
           semantic == VARYING_SLOT_TESS_LEVEL_INNER ||
           semantic == VARYING_SLOT_TESS_LEVEL_OUTER) == is_patch);

   /* Pick the LDS region and the per-vertex stride to address it with. */
   LLVMValueRef vertex_stride, lds_addr;

   if (load_input) {
      vertex_stride = get_tcs_in_vertex_dw_stride(ctx);
      lds_addr = get_tcs_in_current_patch_offset(ctx);
   } else if (is_patch) {
      vertex_stride = NULL;
      lds_addr = get_tcs_out_current_patch_data_offset(ctx);
   } else {
      vertex_stride = get_tcs_out_vertex_dw_stride(ctx);
      lds_addr = get_tcs_out_current_patch_offset(ctx);
   }

   lds_addr = get_dw_address_from_generic_indices(ctx, vertex_stride, lds_addr, vertex_index,
                                                  param_index, semantic);

   for (unsigned chan = component; chan < chan_end; chan++)
      chans[chan] = lshs_lds_load(ctx, type, chan, lds_addr);

   return ac_build_varying_gather_values(&ctx->ac, chans, num_components, component);
}
440 
/**
 * Load a TES input from the offchip ring cached in ctx->tess_offchip_ring.
 *
 * \param type             channel type passed to buffer_load
 * \param vertex_index     vertex to read, or NULL for a per-patch input
 * \param param_index      optional indirect slot index (NULL if none)
 * \param driver_location  index into the shader info input table
 * \param component        first channel to load
 * \param num_components   number of consecutive channels to load
 *
 * load_input and vertex_index_is_invoc_id are not used by this function.
 */
static LLVMValueRef si_nir_load_input_tes(struct ac_shader_abi *abi, LLVMTypeRef type,
                                          LLVMValueRef vertex_index, LLVMValueRef param_index,
                                          unsigned driver_location, unsigned component,
                                          unsigned num_components,
                                          bool load_input, bool vertex_index_is_invoc_id)
{
   struct si_shader_context *ctx = si_shader_context_from_abi(abi);
   struct si_shader_info *info = &ctx->shader->selector->info;
   const ubyte semantic = info->input[driver_location].semantic;
   const unsigned chan_end = component + num_components;

   assert((semantic >= VARYING_SLOT_PATCH0 ||
           semantic == VARYING_SLOT_TESS_LEVEL_INNER ||
           semantic == VARYING_SLOT_TESS_LEVEL_OUTER) == (vertex_index == NULL));

   LLVMValueRef ring_offset = ac_get_arg(&ctx->ac, ctx->args.tess_offchip_offset);
   LLVMValueRef attr_addr =
      get_tcs_tes_buffer_address_from_generic_indices(ctx, vertex_index, param_index, semantic);

   /* TODO: This will generate rather ordinary llvm code, although it
    * should be easy for the optimizer to fix up. In future we might want
    * to refactor buffer_load().
    */
   LLVMValueRef chans[4];

   for (unsigned chan = component; chan < chan_end; chan++) {
      chans[chan] =
         buffer_load(ctx, type, chan, ctx->tess_offchip_ring, ring_offset, attr_addr, true);
   }

   return ac_build_varying_gather_values(&ctx->ac, chans, num_components, component);
}
472 
/**
 * Store a TCS output.
 *
 * Each channel enabled in \p writemask may be written to up to three places:
 * - LDS, when the shader info says the output is read back, or when it is a
 *   directly-indexed tess factor and tessfactors_are_def_in_all_invocs is
 *   false (the epilog then reads it from LDS);
 * - the offchip ring (TESS_OFFCHIP_RING_TCS), for everything except
 *   directly-indexed tess factors; a full 0xF writemask is emitted as one
 *   vec4 store after the loop instead of per-channel stores;
 * - ctx->invoc0_tess_factors, for directly-indexed tess factors when
 *   tessfactors_are_def_in_all_invocs is true (outer in [0..3], inner in
 *   [4..5]).
 *
 * \param vertex_index     vertex to write, or NULL for a per-patch output
 * \param param_index      optional indirect slot index (NULL if none)
 * \param src              value to store; channel chan is element
 *                         (chan - component) of it
 * \param writemask        mask of channels to store, indexed by absolute channel
 * \param component        first channel covered by \p src
 * \param location         unused here
 * \param driver_location  index into the shader info output tables
 */
static void si_nir_store_output_tcs(struct ac_shader_abi *abi,
                                    LLVMValueRef vertex_index, LLVMValueRef param_index,
                                    LLVMValueRef src, unsigned writemask,
                                    unsigned component, unsigned location, unsigned driver_location)
{
   struct si_shader_context *ctx = si_shader_context_from_abi(abi);
   struct si_shader_info *info = &ctx->shader->selector->info;
   LLVMValueRef dw_addr, stride;
   LLVMValueRef buffer, base, addr;
   /* Only channels 0..3 are ever written or gathered. */
   LLVMValueRef values[4];
   bool is_tess_factor = false, is_tess_inner = false;

   const ubyte semantic = info->output_semantic[driver_location];

   const bool is_const = !param_index;
   const bool is_patch = vertex_index == NULL;

   /* Invalid SPIR-V can cause this. */
   if ((semantic >= VARYING_SLOT_PATCH0 || semantic == VARYING_SLOT_TESS_LEVEL_INNER ||
        semantic == VARYING_SLOT_TESS_LEVEL_OUTER) != is_patch)
      return;

   if (!is_patch) {
      stride = get_tcs_out_vertex_dw_stride(ctx);
      dw_addr = get_tcs_out_current_patch_offset(ctx);
      dw_addr = get_dw_address_from_generic_indices(ctx, stride, dw_addr, vertex_index, param_index,
                                                    semantic);
   } else {
      dw_addr = get_tcs_out_current_patch_data_offset(ctx);
      dw_addr = get_dw_address_from_generic_indices(ctx, NULL, dw_addr, vertex_index, param_index,
                                                    semantic);

      /* Tess factors get special treatment only when directly indexed.
       * Always write tess factors into LDS for the TCS epilog.
       */
      if (is_const && (semantic == VARYING_SLOT_TESS_LEVEL_INNER ||
                       semantic == VARYING_SLOT_TESS_LEVEL_OUTER)) {
         is_tess_factor = true;
         is_tess_inner = semantic == VARYING_SLOT_TESS_LEVEL_INNER;
      }
   }

   buffer = get_tess_ring_descriptor(ctx, TESS_OFFCHIP_RING_TCS);

   base = ac_get_arg(&ctx->ac, ctx->args.tess_offchip_offset);

   addr =
      get_tcs_tes_buffer_address_from_generic_indices(ctx, vertex_index, param_index, semantic);

   for (unsigned chan = component; chan < 4; chan++) {
      if (!(writemask & (1 << chan)))
         continue;
      LLVMValueRef value = ac_llvm_extract_elem(&ctx->ac, src, chan - component);

      /* Skip LDS stores if there is no LDS read of this output. */
      if (info->output_readmask[driver_location] & (1 << chan) ||
          /* The epilog reads LDS if invocation 0 doesn't define tess factors. */
          (is_tess_factor &&
           !ctx->shader->selector->info.tessfactors_are_def_in_all_invocs))
         lshs_lds_store(ctx, chan, dw_addr, value);

      value = ac_to_integer(&ctx->ac, value);
      values[chan] = value;

      /* Partial writes go out one dword at a time. */
      if (writemask != 0xF && !is_tess_factor) {
         ac_build_buffer_store_dword(&ctx->ac, buffer, value, NULL, addr, base,
                                     4 * chan, ac_glc);
      }

      /* Write tess factors into VGPRs for the epilog. */
      if (is_tess_factor && ctx->shader->selector->info.tessfactors_are_def_in_all_invocs) {
         if (!is_tess_inner) {
            LLVMBuildStore(ctx->ac.builder, value, /* outer */
                           ctx->invoc0_tess_factors[chan]);
         } else if (chan < 2) {
            LLVMBuildStore(ctx->ac.builder, value, /* inner */
                           ctx->invoc0_tess_factors[4 + chan]);
         }
      }
   }

   /* A full write is emitted as a single vec4 store. */
   if (writemask == 0xF && !is_tess_factor) {
      LLVMValueRef value = ac_build_gather_values(&ctx->ac, values, 4);
      ac_build_buffer_store_dword(&ctx->ac, buffer, value, NULL, addr, base, 0, ac_glc);
   }
}
561 
load_tess_level(struct si_shader_context * ctx,unsigned semantic)562 static LLVMValueRef load_tess_level(struct si_shader_context *ctx, unsigned semantic)
563 {
564    LLVMValueRef base, addr;
565 
566    int param = si_shader_io_get_unique_index_patch(semantic);
567 
568    base = ac_get_arg(&ctx->ac, ctx->args.tess_offchip_offset);
569    addr = get_tcs_tes_buffer_address(ctx, get_rel_patch_id(ctx), NULL,
570                                      LLVMConstInt(ctx->ac.i32, param, 0));
571 
572    return buffer_load(ctx, ctx->ac.f32, ~0, ctx->tess_offchip_ring, base, addr, true);
573 }
574 
load_tess_level_default(struct si_shader_context * ctx,unsigned sysval)575 static LLVMValueRef load_tess_level_default(struct si_shader_context *ctx, unsigned sysval)
576 {
577    LLVMValueRef buf, slot, val[4];
578    int i, offset;
579 
580    slot = LLVMConstInt(ctx->ac.i32, SI_HS_CONST_DEFAULT_TESS_LEVELS, 0);
581    buf = ac_get_arg(&ctx->ac, ctx->internal_bindings);
582    buf = ac_build_load_to_sgpr(&ctx->ac, buf, slot);
583    offset = sysval == SYSTEM_VALUE_TESS_LEVEL_INNER_DEFAULT ? 4 : 0;
584 
585    for (i = 0; i < 4; i++)
586       val[i] = si_buffer_load_const(ctx, buf, LLVMConstInt(ctx->ac.i32, (offset + i) * 4, 0));
587    return ac_build_gather_values(&ctx->ac, val, 4);
588 }
589 
si_load_tess_level(struct ac_shader_abi * abi,unsigned varying_id,bool load_default_state)590 static LLVMValueRef si_load_tess_level(struct ac_shader_abi *abi, unsigned varying_id,
591                                        bool load_default_state)
592 {
593    struct si_shader_context *ctx = si_shader_context_from_abi(abi);
594    unsigned semantic;
595 
596    if (load_default_state) {
597       switch (varying_id) {
598       case VARYING_SLOT_TESS_LEVEL_INNER:
599          semantic = SYSTEM_VALUE_TESS_LEVEL_INNER_DEFAULT;
600          break;
601       case VARYING_SLOT_TESS_LEVEL_OUTER:
602          semantic = SYSTEM_VALUE_TESS_LEVEL_OUTER_DEFAULT;
603          break;
604       default:
605          unreachable("unknown tess level");
606       }
607       return load_tess_level_default(ctx, semantic);
608    }
609 
610    switch (varying_id) {
611    case VARYING_SLOT_TESS_LEVEL_INNER:
612       semantic = VARYING_SLOT_TESS_LEVEL_INNER;
613       break;
614    case VARYING_SLOT_TESS_LEVEL_OUTER:
615       semantic = VARYING_SLOT_TESS_LEVEL_OUTER;
616       break;
617    default:
618       unreachable("unknown tess level");
619    }
620 
621    return load_tess_level(ctx, semantic);
622 }
623 
si_load_patch_vertices_in(struct ac_shader_abi * abi)624 static LLVMValueRef si_load_patch_vertices_in(struct ac_shader_abi *abi)
625 {
626    struct si_shader_context *ctx = si_shader_context_from_abi(abi);
627    if (ctx->stage == MESA_SHADER_TESS_CTRL)
628       return si_unpack_param(ctx, ctx->tcs_out_lds_layout, 13, 6);
629    else if (ctx->stage == MESA_SHADER_TESS_EVAL)
630       return get_num_tcs_out_vertices(ctx);
631    else
632       unreachable("invalid shader stage for VERTICESIN");
633 }
634 
635 /**
636  * Forward all outputs from the vertex shader to the TES. This is only used
637  * for the fixed function TCS.
638  */
si_copy_tcs_inputs(struct si_shader_context * ctx)639 static void si_copy_tcs_inputs(struct si_shader_context *ctx)
640 {
641    LLVMValueRef invocation_id, buffer, buffer_offset;
642    LLVMValueRef lds_vertex_stride, lds_base;
643    uint64_t inputs;
644 
645    invocation_id = si_unpack_param(ctx, ctx->args.tcs_rel_ids, 8, 5);
646    buffer = get_tess_ring_descriptor(ctx, TESS_OFFCHIP_RING_TCS);
647    buffer_offset = ac_get_arg(&ctx->ac, ctx->args.tess_offchip_offset);
648 
649    lds_vertex_stride = get_tcs_in_vertex_dw_stride(ctx);
650    lds_base = get_tcs_in_current_patch_offset(ctx);
651    lds_base = ac_build_imad(&ctx->ac, invocation_id, lds_vertex_stride, lds_base);
652 
653    inputs = ctx->shader->key.ge.mono.u.ff_tcs_inputs_to_copy;
654    while (inputs) {
655       unsigned i = u_bit_scan64(&inputs);
656 
657       LLVMValueRef lds_ptr =
658          LLVMBuildAdd(ctx->ac.builder, lds_base, LLVMConstInt(ctx->ac.i32, 4 * i, 0), "");
659 
660       LLVMValueRef buffer_addr = get_tcs_tes_buffer_address(
661          ctx, get_rel_patch_id(ctx), invocation_id, LLVMConstInt(ctx->ac.i32, i, 0));
662 
663       LLVMValueRef value = lshs_lds_load(ctx, ctx->ac.i32, ~0, lds_ptr);
664 
665       ac_build_buffer_store_dword(&ctx->ac, buffer, value, NULL, buffer_addr, buffer_offset, 0,
666                                   ac_glc);
667    }
668 }
669 
/**
 * Emit the code that writes the tessellation factors of a patch to memory.
 *
 * All loads and stores are wrapped in a conditional block (label 6503) that is
 * entered only where invocation_id == 0, because tess levels are per-patch.
 *
 * The factors always go to the tess factor ring (TCS_FACTOR_RING), laid out
 * according to the epilog key's prim_mode. If the key has
 * tes_reads_tess_factors set, they are additionally written to the off-chip
 * ring (TESS_OFFCHIP_RING_TCS).
 *
 * \param ctx            shader context
 * \param rel_patch_id   patch index used to compute the ring addresses
 * \param invocation_id  invocation index; only invocation 0 performs the stores
 * \param tcs_out_current_patch_data_offset
 *                       LDS base the factors are loaded from when they are not
 *                       supplied through invoc0_tf_outer / invoc0_tf_inner
 * \param invoc0_tf_outer  outer factors; read only if the key has
 *                         invoc0_tess_factors_are_def set
 * \param invoc0_tf_inner  inner factors; read under the same condition
 */
static void si_write_tess_factors(struct si_shader_context *ctx, LLVMValueRef rel_patch_id,
                                  LLVMValueRef invocation_id,
                                  LLVMValueRef tcs_out_current_patch_data_offset,
                                  LLVMValueRef invoc0_tf_outer[4], LLVMValueRef invoc0_tf_inner[2])
{
   struct si_shader *shader = ctx->shader;
   unsigned tess_inner_index, tess_outer_index;
   LLVMValueRef lds_base, lds_inner, lds_outer, byteoffset, buffer;
   LLVMValueRef out[6], vec0, vec1, tf_base, inner[4], outer[4];
   unsigned stride, outer_comps, inner_comps, i, offset;

   /* Determine the layout of one tess factor element in the buffer.
    *
    * This is done before anything is emitted, so that bailing out on an
    * unexpected primitive mode can't leave the conditional block opened
    * below without its matching ac_build_endif.
    */
   switch (shader->key.ge.part.tcs.epilog.prim_mode) {
   case TESS_PRIMITIVE_ISOLINES:
      stride = 2; /* 2 dwords, 1 vec2 store */
      outer_comps = 2;
      inner_comps = 0;
      break;
   case TESS_PRIMITIVE_TRIANGLES:
      stride = 4; /* 4 dwords, 1 vec4 store */
      outer_comps = 3;
      inner_comps = 1;
      break;
   case TESS_PRIMITIVE_QUADS:
      stride = 6; /* 6 dwords, 2 stores (vec4 + vec2) */
      outer_comps = 4;
      inner_comps = 2;
      break;
   default:
      assert(0);
      return;
   }

   /* Add a barrier before loading tess factors from LDS. */
   if (!shader->key.ge.part.tcs.epilog.invoc0_tess_factors_are_def)
      si_llvm_emit_barrier(ctx);

   /* Do this only for invocation 0, because the tess levels are per-patch,
    * not per-vertex.
    *
    * This can't jump, because invocation 0 executes this. It should
    * at least mask out the loads and stores for other invocations.
    */
   ac_build_ifcc(&ctx->ac,
                 LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ, invocation_id, ctx->ac.i32_0, ""), 6503);

   /* Components that the current primitive mode doesn't use stay undefined. */
   for (i = 0; i < 4; i++) {
      inner[i] = LLVMGetUndef(ctx->ac.i32);
      outer[i] = LLVMGetUndef(ctx->ac.i32);
   }

   /* Fill "out" (outer factors first, then inner) for the tess factor ring,
    * and "outer"/"inner" separately for the optional off-chip stores.
    */
   if (shader->key.ge.part.tcs.epilog.invoc0_tess_factors_are_def) {
      /* Tess factors are in VGPRs. */
      for (i = 0; i < outer_comps; i++)
         outer[i] = out[i] = invoc0_tf_outer[i];
      for (i = 0; i < inner_comps; i++)
         inner[i] = out[outer_comps + i] = invoc0_tf_inner[i];
   } else {
      /* Load tess_inner and tess_outer from LDS.
       * Any invocation can write them, so we can't get them from a temporary.
       */
      tess_inner_index = si_shader_io_get_unique_index_patch(VARYING_SLOT_TESS_LEVEL_INNER);
      tess_outer_index = si_shader_io_get_unique_index_patch(VARYING_SLOT_TESS_LEVEL_OUTER);

      lds_base = tcs_out_current_patch_data_offset;
      lds_inner = LLVMBuildAdd(ctx->ac.builder, lds_base,
                               LLVMConstInt(ctx->ac.i32, tess_inner_index * 4, 0), "");
      lds_outer = LLVMBuildAdd(ctx->ac.builder, lds_base,
                               LLVMConstInt(ctx->ac.i32, tess_outer_index * 4, 0), "");

      for (i = 0; i < outer_comps; i++) {
         outer[i] = out[i] = lshs_lds_load(ctx, ctx->ac.i32, i, lds_outer);
      }
      for (i = 0; i < inner_comps; i++) {
         inner[i] = out[outer_comps + i] = lshs_lds_load(ctx, ctx->ac.i32, i, lds_inner);
      }
   }

   if (shader->key.ge.part.tcs.epilog.prim_mode == TESS_PRIMITIVE_ISOLINES) {
      /* For isolines, the hardware expects tess factors in the
       * reverse order from what NIR specifies.
       */
      LLVMValueRef tmp = out[0];
      out[0] = out[1];
      out[1] = tmp;
   }

   /* Convert the outputs to vectors for stores. */
   vec0 = ac_build_gather_values(&ctx->ac, out, MIN2(stride, 4));
   vec1 = NULL;

   /* Only a stride above 4 dwords (quads) needs the second vector. */
   if (stride > 4)
      vec1 = ac_build_gather_values(&ctx->ac, out + 4, stride - 4);

   /* Get the buffer. */
   buffer = get_tess_ring_descriptor(ctx, TCS_FACTOR_RING);

   /* Get the offset. */
   tf_base = ac_get_arg(&ctx->ac, ctx->args.tcs_factor_offset);
   byteoffset =
      LLVMBuildMul(ctx->ac.builder, rel_patch_id, LLVMConstInt(ctx->ac.i32, 4 * stride, 0), "");
   offset = 0;

   /* Store the dynamic HS control word.
    *
    * Only the patch with rel_patch_id == 0 writes it, and the tess factors
    * that follow are shifted by the 4 bytes it occupies.
    */
   if (ctx->screen->info.chip_class <= GFX8) {
      ac_build_ifcc(&ctx->ac,
                    LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ, rel_patch_id, ctx->ac.i32_0, ""), 6504);
      ac_build_buffer_store_dword(&ctx->ac, buffer, LLVMConstInt(ctx->ac.i32, 0x80000000, 0),
                                  NULL, ctx->ac.i32_0, tf_base, offset, ac_glc);
      ac_build_endif(&ctx->ac, 6504);
      offset += 4;
   }

   /* Store the tessellation factors. */
   ac_build_buffer_store_dword(&ctx->ac, buffer, vec0, NULL, byteoffset,
                               tf_base, offset, ac_glc);
   offset += 16;
   if (vec1)
      ac_build_buffer_store_dword(&ctx->ac, buffer, vec1, NULL, byteoffset,
                                  tf_base, offset, ac_glc);

   /* Store the tess factors into the offchip buffer if TES reads them. */
   if (shader->key.ge.part.tcs.epilog.tes_reads_tess_factors) {
      LLVMValueRef buf, base, inner_vec, outer_vec, tf_outer_offset;
      LLVMValueRef tf_inner_offset;
      unsigned param_outer, param_inner;

      buf = get_tess_ring_descriptor(ctx, TESS_OFFCHIP_RING_TCS);
      base = ac_get_arg(&ctx->ac, ctx->args.tess_offchip_offset);

      param_outer = si_shader_io_get_unique_index_patch(VARYING_SLOT_TESS_LEVEL_OUTER);
      tf_outer_offset = get_tcs_tes_buffer_address(ctx, rel_patch_id, NULL,
                                                   LLVMConstInt(ctx->ac.i32, param_outer, 0));

      outer_vec = ac_build_gather_values(&ctx->ac, outer, outer_comps);

      ac_build_buffer_store_dword(&ctx->ac, buf, outer_vec, NULL, tf_outer_offset,
                                  base, 0, ac_glc);

      /* Isolines have no inner factors, so there is nothing more to store. */
      if (inner_comps) {
         param_inner = si_shader_io_get_unique_index_patch(VARYING_SLOT_TESS_LEVEL_INNER);
         tf_inner_offset = get_tcs_tes_buffer_address(ctx, rel_patch_id, NULL,
                                                      LLVMConstInt(ctx->ac.i32, param_inner, 0));

         inner_vec = ac_build_gather_values(&ctx->ac, inner, inner_comps);
         ac_build_buffer_store_dword(&ctx->ac, buf, inner_vec, NULL,
                                     tf_inner_offset, base, 0, ac_glc);
      }
   }

   ac_build_endif(&ctx->ac, 6503);
}
821 
822 /* This only writes the tessellation factor levels. */
si_llvm_emit_tcs_epilogue(struct ac_shader_abi * abi)823 static void si_llvm_emit_tcs_epilogue(struct ac_shader_abi *abi)
824 {
825    struct si_shader_context *ctx = si_shader_context_from_abi(abi);
826    LLVMBuilderRef builder = ctx->ac.builder;
827    LLVMValueRef rel_patch_id, invocation_id, tf_lds_offset;
828 
829    si_copy_tcs_inputs(ctx);
830 
831    rel_patch_id = get_rel_patch_id(ctx);
832    invocation_id = si_unpack_param(ctx, ctx->args.tcs_rel_ids, 8, 5);
833    tf_lds_offset = get_tcs_out_current_patch_data_offset(ctx);
834 
835    if (ctx->screen->info.chip_class >= GFX9 && !ctx->shader->is_monolithic) {
836       LLVMBasicBlockRef blocks[2] = {LLVMGetInsertBlock(builder), ctx->merged_wrap_if_entry_block};
837       LLVMValueRef values[2];
838 
839       ac_build_endif(&ctx->ac, ctx->merged_wrap_if_label);
840 
841       values[0] = rel_patch_id;
842       values[1] = LLVMGetUndef(ctx->ac.i32);
843       rel_patch_id = ac_build_phi(&ctx->ac, ctx->ac.i32, 2, values, blocks);
844 
845       values[0] = tf_lds_offset;
846       values[1] = LLVMGetUndef(ctx->ac.i32);
847       tf_lds_offset = ac_build_phi(&ctx->ac, ctx->ac.i32, 2, values, blocks);
848 
849       values[0] = invocation_id;
850       values[1] = ctx->ac.i32_1; /* cause the epilog to skip threads */
851       invocation_id = ac_build_phi(&ctx->ac, ctx->ac.i32, 2, values, blocks);
852    }
853 
854    /* Return epilog parameters from this function. */
855    LLVMValueRef ret = ctx->return_value;
856    unsigned vgpr;
857 
858    if (ctx->screen->info.chip_class >= GFX9) {
859       ret =
860          si_insert_input_ret(ctx, ret, ctx->tcs_offchip_layout, 8 + GFX9_SGPR_TCS_OFFCHIP_LAYOUT);
861       ret = si_insert_input_ret(ctx, ret, ctx->tcs_out_lds_layout, 8 + GFX9_SGPR_TCS_OUT_LAYOUT);
862       /* Tess offchip and tess factor offsets are at the beginning. */
863       ret = si_insert_input_ret(ctx, ret, ctx->args.tess_offchip_offset, 2);
864       ret = si_insert_input_ret(ctx, ret, ctx->args.tcs_factor_offset, 4);
865       vgpr = 8 + GFX9_SGPR_TCS_OUT_LAYOUT + 1;
866    } else {
867       ret = si_insert_input_ret(ctx, ret, ctx->tcs_offchip_layout, GFX6_SGPR_TCS_OFFCHIP_LAYOUT);
868       ret = si_insert_input_ret(ctx, ret, ctx->tcs_out_lds_layout, GFX6_SGPR_TCS_OUT_LAYOUT);
869       /* Tess offchip and tess factor offsets are after user SGPRs. */
870       ret = si_insert_input_ret(ctx, ret, ctx->args.tess_offchip_offset, GFX6_TCS_NUM_USER_SGPR);
871       ret = si_insert_input_ret(ctx, ret, ctx->args.tcs_factor_offset, GFX6_TCS_NUM_USER_SGPR + 1);
872       vgpr = GFX6_TCS_NUM_USER_SGPR + 2;
873    }
874 
875    /* VGPRs */
876    rel_patch_id = ac_to_float(&ctx->ac, rel_patch_id);
877    invocation_id = ac_to_float(&ctx->ac, invocation_id);
878    tf_lds_offset = ac_to_float(&ctx->ac, tf_lds_offset);
879 
880    /* Leave a hole corresponding to the two input VGPRs. This ensures that
881     * the invocation_id output does not alias the tcs_rel_ids input,
882     * which saves a V_MOV on gfx9.
883     */
884    vgpr += 2;
885 
886    ret = LLVMBuildInsertValue(builder, ret, rel_patch_id, vgpr++, "");
887    ret = LLVMBuildInsertValue(builder, ret, invocation_id, vgpr++, "");
888 
889    if (ctx->shader->selector->info.tessfactors_are_def_in_all_invocs) {
890       vgpr++; /* skip the tess factor LDS offset */
891       for (unsigned i = 0; i < 6; i++) {
892          LLVMValueRef value = LLVMBuildLoad(builder, ctx->invoc0_tess_factors[i], "");
893          value = ac_to_float(&ctx->ac, value);
894          ret = LLVMBuildInsertValue(builder, ret, value, vgpr++, "");
895       }
896    } else {
897       ret = LLVMBuildInsertValue(builder, ret, tf_lds_offset, vgpr++, "");
898    }
899    ctx->return_value = ret;
900 }
901 
902 /* Pass TCS inputs from LS to TCS on GFX9. */
si_set_ls_return_value_for_tcs(struct si_shader_context * ctx)903 static void si_set_ls_return_value_for_tcs(struct si_shader_context *ctx)
904 {
905    if (!ctx->shader->is_monolithic)
906       ac_build_endif(&ctx->ac, ctx->merged_wrap_if_label);
907 
908    LLVMValueRef ret = ctx->return_value;
909 
910    ret = si_insert_input_ptr(ctx, ret, ctx->other_const_and_shader_buffers, 0);
911    ret = si_insert_input_ptr(ctx, ret, ctx->other_samplers_and_images, 1);
912    ret = si_insert_input_ret(ctx, ret, ctx->args.tess_offchip_offset, 2);
913    ret = si_insert_input_ret(ctx, ret, ctx->args.merged_wave_info, 3);
914    ret = si_insert_input_ret(ctx, ret, ctx->args.tcs_factor_offset, 4);
915    ret = si_insert_input_ret(ctx, ret, ctx->args.scratch_offset, 5);
916 
917    ret = si_insert_input_ptr(ctx, ret, ctx->internal_bindings, 8 + SI_SGPR_INTERNAL_BINDINGS);
918    ret = si_insert_input_ptr(ctx, ret, ctx->bindless_samplers_and_images,
919                              8 + SI_SGPR_BINDLESS_SAMPLERS_AND_IMAGES);
920 
921    ret = si_insert_input_ret(ctx, ret, ctx->vs_state_bits, 8 + SI_SGPR_VS_STATE_BITS);
922 
923    ret = si_insert_input_ret(ctx, ret, ctx->tcs_offchip_layout, 8 + GFX9_SGPR_TCS_OFFCHIP_LAYOUT);
924    ret = si_insert_input_ret(ctx, ret, ctx->tcs_out_lds_offsets, 8 + GFX9_SGPR_TCS_OUT_OFFSETS);
925    ret = si_insert_input_ret(ctx, ret, ctx->tcs_out_lds_layout, 8 + GFX9_SGPR_TCS_OUT_LAYOUT);
926 
927    unsigned vgpr = 8 + GFX9_TCS_NUM_USER_SGPR;
928    ret = LLVMBuildInsertValue(ctx->ac.builder, ret,
929                               ac_to_float(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args.tcs_patch_id)),
930                               vgpr++, "");
931    ret = LLVMBuildInsertValue(ctx->ac.builder, ret,
932                               ac_to_float(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args.tcs_rel_ids)),
933                               vgpr++, "");
934    ctx->return_value = ret;
935 }
936 
si_llvm_emit_ls_epilogue(struct ac_shader_abi * abi)937 void si_llvm_emit_ls_epilogue(struct ac_shader_abi *abi)
938 {
939    struct si_shader_context *ctx = si_shader_context_from_abi(abi);
940    struct si_shader *shader = ctx->shader;
941    struct si_shader_info *info = &shader->selector->info;
942    unsigned i, chan;
943    LLVMValueRef vertex_id = ac_get_arg(&ctx->ac, ctx->args.vs_rel_patch_id);
944    LLVMValueRef vertex_dw_stride = get_tcs_in_vertex_dw_stride(ctx);
945    LLVMValueRef base_dw_addr = LLVMBuildMul(ctx->ac.builder, vertex_id, vertex_dw_stride, "");
946    LLVMValueRef *addrs = abi->outputs;
947    unsigned ret_offset = 8 + GFX9_TCS_NUM_USER_SGPR + 2;
948 
949    /* Write outputs to LDS. The next shader (TCS aka HS) will read
950     * its inputs from it. */
951    for (i = 0; i < info->num_outputs; i++) {
952       unsigned semantic = info->output_semantic[i];
953 
954       /* The ARB_shader_viewport_layer_array spec contains the
955        * following issue:
956        *
957        *    2) What happens if gl_ViewportIndex or gl_Layer is
958        *    written in the vertex shader and a geometry shader is
959        *    present?
960        *
961        *    RESOLVED: The value written by the last vertex processing
962        *    stage is used. If the last vertex processing stage
963        *    (vertex, tessellation evaluation or geometry) does not
964        *    statically assign to gl_ViewportIndex or gl_Layer, index
965        *    or layer zero is assumed.
966        *
967        * So writes to those outputs in VS-as-LS are simply ignored.
968        */
969       if (semantic == VARYING_SLOT_LAYER || semantic == VARYING_SLOT_VIEWPORT)
970          continue;
971 
972       int param = si_shader_io_get_unique_index(semantic, false);
973       LLVMValueRef dw_addr =
974          LLVMBuildAdd(ctx->ac.builder, base_dw_addr, LLVMConstInt(ctx->ac.i32, param * 4, 0), "");
975 
976       for (chan = 0; chan < 4; chan++) {
977          if (!(info->output_usagemask[i] & (1 << chan)))
978             continue;
979 
980          LLVMValueRef value = LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + chan], "");
981 
982          if (!shader->key.ge.opt.same_patch_vertices ||
983              !(ctx->next_shader_sel->tcs_vgpr_only_inputs & (1ull << semantic)))
984             lshs_lds_store(ctx, chan, dw_addr, value);
985 
986          if (shader->key.ge.opt.same_patch_vertices) {
987             ctx->return_value = LLVMBuildInsertValue(ctx->ac.builder, ctx->return_value,
988                                                      value, ret_offset + param * 4 + chan, "");
989          }
990       }
991    }
992 
993    if (ctx->screen->info.chip_class >= GFX9)
994       si_set_ls_return_value_for_tcs(ctx);
995 }
996 
997 /**
998  * Compile the TCS epilog function. This writes tesselation factors to memory
999  * based on the output primitive type of the tesselator (determined by TES).
1000  */
si_llvm_build_tcs_epilog(struct si_shader_context * ctx,union si_shader_part_key * key)1001 void si_llvm_build_tcs_epilog(struct si_shader_context *ctx, union si_shader_part_key *key)
1002 {
1003    memset(&ctx->args, 0, sizeof(ctx->args));
1004 
1005    if (ctx->screen->info.chip_class >= GFX9) {
1006       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
1007       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
1008       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.tess_offchip_offset);
1009       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* wave info */
1010       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.tcs_factor_offset);
1011       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
1012       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
1013       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
1014       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
1015       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
1016       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
1017       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
1018       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
1019       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
1020       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
1021       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
1022       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_layout);
1023       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
1024       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_out_lds_layout);
1025    } else {
1026       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
1027       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
1028       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
1029       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
1030       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_layout);
1031       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
1032       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_out_lds_layout);
1033       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
1034       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.tess_offchip_offset);
1035       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.tcs_factor_offset);
1036    }
1037 
1038    ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* VGPR gap */
1039    ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* VGPR gap */
1040    struct ac_arg rel_patch_id; /* patch index within the wave (REL_PATCH_ID) */
1041    ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &rel_patch_id);
1042    struct ac_arg invocation_id; /* invocation ID within the patch */
1043    ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &invocation_id);
1044    struct ac_arg
1045       tcs_out_current_patch_data_offset; /* LDS offset where tess factors should be loaded from */
1046    ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &tcs_out_current_patch_data_offset);
1047 
1048    struct ac_arg tess_factors[6];
1049    for (unsigned i = 0; i < 6; i++)
1050       ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &tess_factors[i]);
1051 
1052    /* Create the function. */
1053    si_llvm_create_func(ctx, "tcs_epilog", NULL, 0, ctx->screen->info.chip_class >= GFX7 ? 128 : 0);
1054    ac_declare_lds_as_pointer(&ctx->ac);
1055 
1056    LLVMValueRef invoc0_tess_factors[6];
1057    for (unsigned i = 0; i < 6; i++)
1058       invoc0_tess_factors[i] = ac_get_arg(&ctx->ac, tess_factors[i]);
1059 
1060    si_write_tess_factors(ctx, ac_get_arg(&ctx->ac, rel_patch_id),
1061                          ac_get_arg(&ctx->ac, invocation_id),
1062                          ac_get_arg(&ctx->ac, tcs_out_current_patch_data_offset),
1063                          invoc0_tess_factors, invoc0_tess_factors + 4);
1064 
1065    LLVMBuildRetVoid(ctx->ac.builder);
1066 }
1067 
si_llvm_init_tcs_callbacks(struct si_shader_context * ctx)1068 void si_llvm_init_tcs_callbacks(struct si_shader_context *ctx)
1069 {
1070    ctx->abi.load_tess_varyings = si_nir_load_tcs_varyings;
1071    ctx->abi.load_tess_level = si_load_tess_level;
1072    ctx->abi.store_tcs_outputs = si_nir_store_output_tcs;
1073    ctx->abi.emit_outputs = si_llvm_emit_tcs_epilogue;
1074    ctx->abi.load_patch_vertices_in = si_load_patch_vertices_in;
1075 }
1076 
si_llvm_init_tes_callbacks(struct si_shader_context * ctx,bool ngg_cull_shader)1077 void si_llvm_init_tes_callbacks(struct si_shader_context *ctx, bool ngg_cull_shader)
1078 {
1079    ctx->abi.load_tess_varyings = si_nir_load_input_tes;
1080    ctx->abi.load_tess_level = si_load_tess_level;
1081    ctx->abi.load_patch_vertices_in = si_load_patch_vertices_in;
1082 
1083    if (ctx->shader->key.ge.as_es)
1084       ctx->abi.emit_outputs = si_llvm_emit_es_epilogue;
1085    else if (ngg_cull_shader)
1086       ctx->abi.emit_outputs = gfx10_emit_ngg_culling_epilogue;
1087    else if (ctx->shader->key.ge.as_ngg)
1088       ctx->abi.emit_outputs = gfx10_emit_ngg_epilogue;
1089    else
1090       ctx->abi.emit_outputs = si_llvm_emit_vs_epilogue;
1091 }
1092