1 /*
2  * Copyright 2020 Advanced Micro Devices, Inc.
3  * All Rights Reserved.
4  *
5  * Permission is hereby granted, free of charge, to any person obtaining a
6  * copy of this software and associated documentation files (the "Software"),
7  * to deal in the Software without restriction, including without limitation
8  * on the rights to use, copy, modify, merge, publish, distribute, sub
9  * license, and/or sell copies of the Software, and to permit persons to whom
10  * the Software is furnished to do so, subject to the following conditions:
11  *
12  * The above copyright notice and this permission notice (including the next
13  * paragraph) shall be included in all copies or substantial portions of the
14  * Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
19  * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
20  * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
21  * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
22  * USE OR OTHER DEALINGS IN THE SOFTWARE.
23  */
24 
25 #include "si_pipe.h"
26 #include "si_shader_internal.h"
27 #include "sid.h"
28 #include "util/u_memory.h"
29 
si_is_es_thread(struct si_shader_context * ctx)30 LLVMValueRef si_is_es_thread(struct si_shader_context *ctx)
31 {
32    /* Return true if the current thread should execute an ES thread. */
33    return LLVMBuildICmp(ctx->ac.builder, LLVMIntULT, ac_get_thread_id(&ctx->ac),
34                         si_unpack_param(ctx, ctx->args.merged_wave_info, 0, 8), "");
35 }
36 
si_is_gs_thread(struct si_shader_context * ctx)37 LLVMValueRef si_is_gs_thread(struct si_shader_context *ctx)
38 {
39    /* Return true if the current thread should execute a GS thread. */
40    return LLVMBuildICmp(ctx->ac.builder, LLVMIntULT, ac_get_thread_id(&ctx->ac),
41                         si_unpack_param(ctx, ctx->args.merged_wave_info, 8, 8), "");
42 }
43 
si_llvm_load_input_gs(struct ac_shader_abi * abi,unsigned input_index,unsigned vtx_offset_param,LLVMTypeRef type,unsigned swizzle)44 static LLVMValueRef si_llvm_load_input_gs(struct ac_shader_abi *abi, unsigned input_index,
45                                           unsigned vtx_offset_param, LLVMTypeRef type,
46                                           unsigned swizzle)
47 {
48    struct si_shader_context *ctx = si_shader_context_from_abi(abi);
49    struct si_shader *shader = ctx->shader;
50    LLVMValueRef vtx_offset, soffset;
51    struct si_shader_info *info = &shader->selector->info;
52    unsigned param;
53    LLVMValueRef value;
54 
55    param = si_shader_io_get_unique_index(info->input[input_index].semantic, false);
56 
57    /* GFX9 has the ESGS ring in LDS. */
58    if (ctx->screen->info.chip_class >= GFX9) {
59       unsigned index = vtx_offset_param;
60       vtx_offset =
61          si_unpack_param(ctx, ctx->args.gs_vtx_offset[index / 2], (index & 1) * 16, 16);
62 
63       unsigned offset = param * 4 + swizzle;
64       vtx_offset =
65          LLVMBuildAdd(ctx->ac.builder, vtx_offset, LLVMConstInt(ctx->ac.i32, offset, false), "");
66 
67       LLVMValueRef ptr = ac_build_gep0(&ctx->ac, ctx->esgs_ring, vtx_offset);
68       LLVMValueRef value = LLVMBuildLoad(ctx->ac.builder, ptr, "");
69       return LLVMBuildBitCast(ctx->ac.builder, value, type, "");
70    }
71 
72    /* GFX6: input load from the ESGS ring in memory. */
73    /* Get the vertex offset parameter on GFX6. */
74    LLVMValueRef gs_vtx_offset = ac_get_arg(&ctx->ac, ctx->args.gs_vtx_offset[vtx_offset_param]);
75 
76    vtx_offset = LLVMBuildMul(ctx->ac.builder, gs_vtx_offset, LLVMConstInt(ctx->ac.i32, 4, 0), "");
77 
78    soffset = LLVMConstInt(ctx->ac.i32, (param * 4 + swizzle) * 256, 0);
79 
80    value = ac_build_buffer_load(&ctx->ac, ctx->esgs_ring, 1, ctx->ac.i32_0, vtx_offset, soffset, 0,
81                                 ctx->ac.f32, ac_glc, true, false);
82    return LLVMBuildBitCast(ctx->ac.builder, value, type, "");
83 }
84 
si_nir_load_input_gs(struct ac_shader_abi * abi,unsigned driver_location,unsigned component,unsigned num_components,unsigned vertex_index,LLVMTypeRef type)85 static LLVMValueRef si_nir_load_input_gs(struct ac_shader_abi *abi,
86                                          unsigned driver_location, unsigned component,
87                                          unsigned num_components, unsigned vertex_index,
88                                          LLVMTypeRef type)
89 {
90    struct si_shader_context *ctx = si_shader_context_from_abi(abi);
91 
92    LLVMValueRef value[4];
93    for (unsigned i = component; i < component + num_components; i++) {
94       value[i] = si_llvm_load_input_gs(&ctx->abi, driver_location,
95                                        vertex_index, type, i);
96    }
97 
98    return ac_build_varying_gather_values(&ctx->ac, value, num_components, component);
99 }
100 
101 /* Pass GS inputs from ES to GS on GFX9. */
si_set_es_return_value_for_gs(struct si_shader_context * ctx)102 static void si_set_es_return_value_for_gs(struct si_shader_context *ctx)
103 {
104    if (!ctx->shader->is_monolithic)
105       ac_build_endif(&ctx->ac, ctx->merged_wrap_if_label);
106 
107    LLVMValueRef ret = ctx->return_value;
108 
109    ret = si_insert_input_ptr(ctx, ret, ctx->other_const_and_shader_buffers, 0);
110    ret = si_insert_input_ptr(ctx, ret, ctx->other_samplers_and_images, 1);
111    if (ctx->shader->key.as_ngg)
112       ret = si_insert_input_ptr(ctx, ret, ctx->args.gs_tg_info, 2);
113    else
114       ret = si_insert_input_ret(ctx, ret, ctx->args.gs2vs_offset, 2);
115    ret = si_insert_input_ret(ctx, ret, ctx->args.merged_wave_info, 3);
116    ret = si_insert_input_ret(ctx, ret, ctx->args.scratch_offset, 5);
117 
118    ret = si_insert_input_ptr(ctx, ret, ctx->internal_bindings, 8 + SI_SGPR_INTERNAL_BINDINGS);
119    ret = si_insert_input_ptr(ctx, ret, ctx->bindless_samplers_and_images,
120                              8 + SI_SGPR_BINDLESS_SAMPLERS_AND_IMAGES);
121    if (ctx->screen->use_ngg) {
122       ret = si_insert_input_ptr(ctx, ret, ctx->vs_state_bits, 8 + SI_SGPR_VS_STATE_BITS);
123    }
124 
125    unsigned vgpr = 8 + SI_NUM_VS_STATE_RESOURCE_SGPRS;
126 
127    ret = si_insert_input_ret_float(ctx, ret, ctx->args.gs_vtx_offset[0], vgpr++);
128    ret = si_insert_input_ret_float(ctx, ret, ctx->args.gs_vtx_offset[1], vgpr++);
129    ret = si_insert_input_ret_float(ctx, ret, ctx->args.gs_prim_id, vgpr++);
130    ret = si_insert_input_ret_float(ctx, ret, ctx->args.gs_invocation_id, vgpr++);
131    ret = si_insert_input_ret_float(ctx, ret, ctx->args.gs_vtx_offset[2], vgpr++);
132    ctx->return_value = ret;
133 }
134 
si_llvm_emit_es_epilogue(struct ac_shader_abi * abi)135 void si_llvm_emit_es_epilogue(struct ac_shader_abi *abi)
136 {
137    struct si_shader_context *ctx = si_shader_context_from_abi(abi);
138    struct si_shader *es = ctx->shader;
139    struct si_shader_info *info = &es->selector->info;
140    LLVMValueRef *addrs = abi->outputs;
141    LLVMValueRef lds_base = NULL;
142    unsigned chan;
143    int i;
144 
145    if (ctx->screen->info.chip_class >= GFX9 && info->num_outputs) {
146       unsigned itemsize_dw = es->selector->esgs_itemsize / 4;
147       LLVMValueRef vertex_idx = ac_get_thread_id(&ctx->ac);
148       LLVMValueRef wave_idx = si_unpack_param(ctx, ctx->args.merged_wave_info, 24, 4);
149       vertex_idx =
150          LLVMBuildOr(ctx->ac.builder, vertex_idx,
151                      LLVMBuildMul(ctx->ac.builder, wave_idx,
152                                   LLVMConstInt(ctx->ac.i32, ctx->ac.wave_size, false), ""),
153                      "");
154       lds_base =
155          LLVMBuildMul(ctx->ac.builder, vertex_idx, LLVMConstInt(ctx->ac.i32, itemsize_dw, 0), "");
156    }
157 
158    for (i = 0; i < info->num_outputs; i++) {
159       int param;
160 
161       if (info->output_semantic[i] == VARYING_SLOT_VIEWPORT ||
162           info->output_semantic[i] == VARYING_SLOT_LAYER)
163          continue;
164 
165       param = si_shader_io_get_unique_index(info->output_semantic[i], false);
166 
167       for (chan = 0; chan < 4; chan++) {
168          if (!(info->output_usagemask[i] & (1 << chan)))
169             continue;
170 
171          LLVMValueRef out_val = LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + chan], "");
172          out_val = ac_to_integer(&ctx->ac, out_val);
173 
174          /* GFX9 has the ESGS ring in LDS. */
175          if (ctx->screen->info.chip_class >= GFX9) {
176             LLVMValueRef idx = LLVMConstInt(ctx->ac.i32, param * 4 + chan, false);
177             idx = LLVMBuildAdd(ctx->ac.builder, lds_base, idx, "");
178             ac_build_indexed_store(&ctx->ac, ctx->esgs_ring, idx, out_val);
179             continue;
180          }
181 
182          ac_build_buffer_store_dword(&ctx->ac, ctx->esgs_ring, out_val, 1, NULL,
183                                      ac_get_arg(&ctx->ac, ctx->args.es2gs_offset),
184                                      (4 * param + chan) * 4, ac_glc | ac_slc | ac_swizzled);
185       }
186    }
187 
188    if (ctx->screen->info.chip_class >= GFX9)
189       si_set_es_return_value_for_gs(ctx);
190 }
191 
si_get_gs_wave_id(struct si_shader_context * ctx)192 static LLVMValueRef si_get_gs_wave_id(struct si_shader_context *ctx)
193 {
194    if (ctx->screen->info.chip_class >= GFX9)
195       return si_unpack_param(ctx, ctx->args.merged_wave_info, 16, 8);
196    else
197       return ac_get_arg(&ctx->ac, ctx->args.gs_wave_id);
198 }
199 
emit_gs_epilogue(struct si_shader_context * ctx)200 static void emit_gs_epilogue(struct si_shader_context *ctx)
201 {
202    if (ctx->shader->key.as_ngg) {
203       gfx10_ngg_gs_emit_epilogue(ctx);
204       return;
205    }
206 
207    if (ctx->screen->info.chip_class >= GFX10)
208       LLVMBuildFence(ctx->ac.builder, LLVMAtomicOrderingRelease, false, "");
209 
210    ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_NOP | AC_SENDMSG_GS_DONE, si_get_gs_wave_id(ctx));
211 
212    if (ctx->screen->info.chip_class >= GFX9)
213       ac_build_endif(&ctx->ac, ctx->merged_wrap_if_label);
214 }
215 
si_llvm_emit_gs_epilogue(struct ac_shader_abi * abi)216 static void si_llvm_emit_gs_epilogue(struct ac_shader_abi *abi)
217 {
218    struct si_shader_context *ctx = si_shader_context_from_abi(abi);
219    struct si_shader_info UNUSED *info = &ctx->shader->selector->info;
220 
221    assert(info->num_outputs <= AC_LLVM_MAX_OUTPUTS);
222 
223    emit_gs_epilogue(ctx);
224 }
225 
226 /* Emit one vertex from the geometry shader */
si_llvm_emit_vertex(struct ac_shader_abi * abi,unsigned stream,LLVMValueRef * addrs)227 static void si_llvm_emit_vertex(struct ac_shader_abi *abi, unsigned stream, LLVMValueRef *addrs)
228 {
229    struct si_shader_context *ctx = si_shader_context_from_abi(abi);
230 
231    if (ctx->shader->key.as_ngg) {
232       gfx10_ngg_gs_emit_vertex(ctx, stream, addrs);
233       return;
234    }
235 
236    struct si_shader_info *info = &ctx->shader->selector->info;
237    struct si_shader *shader = ctx->shader;
238    LLVMValueRef soffset = ac_get_arg(&ctx->ac, ctx->args.gs2vs_offset);
239    LLVMValueRef gs_next_vertex;
240    LLVMValueRef can_emit;
241    unsigned chan, offset;
242    int i;
243 
244    /* Write vertex attribute values to GSVS ring */
245    gs_next_vertex = LLVMBuildLoad(ctx->ac.builder, ctx->gs_next_vertex[stream], "");
246 
247    /* If this thread has already emitted the declared maximum number of
248     * vertices, skip the write: excessive vertex emissions are not
249     * supposed to have any effect.
250     *
251     * If the shader has no writes to memory, kill it instead. This skips
252     * further memory loads and may allow LLVM to skip to the end
253     * altogether.
254     */
255    can_emit =
256       LLVMBuildICmp(ctx->ac.builder, LLVMIntULT, gs_next_vertex,
257                     LLVMConstInt(ctx->ac.i32, shader->selector->info.base.gs.vertices_out, 0), "");
258 
259    bool use_kill = !info->base.writes_memory;
260    if (use_kill) {
261       ac_build_kill_if_false(&ctx->ac, can_emit);
262    } else {
263       ac_build_ifcc(&ctx->ac, can_emit, 6505);
264    }
265 
266    offset = 0;
267    for (i = 0; i < info->num_outputs; i++) {
268       for (chan = 0; chan < 4; chan++) {
269          if (!(info->output_usagemask[i] & (1 << chan)) ||
270              ((info->output_streams[i] >> (2 * chan)) & 3) != stream)
271             continue;
272 
273          LLVMValueRef out_val = LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + chan], "");
274          LLVMValueRef voffset =
275             LLVMConstInt(ctx->ac.i32, offset * shader->selector->info.base.gs.vertices_out, 0);
276          offset++;
277 
278          voffset = LLVMBuildAdd(ctx->ac.builder, voffset, gs_next_vertex, "");
279          voffset = LLVMBuildMul(ctx->ac.builder, voffset, LLVMConstInt(ctx->ac.i32, 4, 0), "");
280 
281          out_val = ac_to_integer(&ctx->ac, out_val);
282 
283          ac_build_buffer_store_dword(&ctx->ac, ctx->gsvs_ring[stream], out_val, 1, voffset, soffset,
284                                      0, ac_glc | ac_slc | ac_swizzled);
285       }
286    }
287 
288    gs_next_vertex = LLVMBuildAdd(ctx->ac.builder, gs_next_vertex, ctx->ac.i32_1, "");
289    LLVMBuildStore(ctx->ac.builder, gs_next_vertex, ctx->gs_next_vertex[stream]);
290 
291    /* Signal vertex emission if vertex data was written. */
292    if (offset) {
293       ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_EMIT | AC_SENDMSG_GS | (stream << 8),
294                        si_get_gs_wave_id(ctx));
295    }
296 
297    if (!use_kill)
298       ac_build_endif(&ctx->ac, 6505);
299 }
300 
301 /* Cut one primitive from the geometry shader */
si_llvm_emit_primitive(struct ac_shader_abi * abi,unsigned stream)302 static void si_llvm_emit_primitive(struct ac_shader_abi *abi, unsigned stream)
303 {
304    struct si_shader_context *ctx = si_shader_context_from_abi(abi);
305 
306    if (ctx->shader->key.as_ngg) {
307       LLVMBuildStore(ctx->ac.builder, ctx->ac.i32_0, ctx->gs_curprim_verts[stream]);
308       return;
309    }
310 
311    /* Signal primitive cut */
312    ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_CUT | AC_SENDMSG_GS | (stream << 8),
313                     si_get_gs_wave_id(ctx));
314 }
315 
si_preload_esgs_ring(struct si_shader_context * ctx)316 void si_preload_esgs_ring(struct si_shader_context *ctx)
317 {
318    if (ctx->screen->info.chip_class <= GFX8) {
319       unsigned ring = ctx->stage == MESA_SHADER_GEOMETRY ? SI_GS_RING_ESGS : SI_ES_RING_ESGS;
320       LLVMValueRef offset = LLVMConstInt(ctx->ac.i32, ring, 0);
321       LLVMValueRef buf_ptr = ac_get_arg(&ctx->ac, ctx->internal_bindings);
322 
323       ctx->esgs_ring = ac_build_load_to_sgpr(&ctx->ac, buf_ptr, offset);
324    } else {
325       if (USE_LDS_SYMBOLS) {
326          /* Declare the ESGS ring as an explicit LDS symbol. */
327          si_llvm_declare_esgs_ring(ctx);
328       } else {
329          ac_declare_lds_as_pointer(&ctx->ac);
330          ctx->esgs_ring = ctx->ac.lds;
331       }
332    }
333 }
334 
si_preload_gs_rings(struct si_shader_context * ctx)335 void si_preload_gs_rings(struct si_shader_context *ctx)
336 {
337    const struct si_shader_selector *sel = ctx->shader->selector;
338    LLVMBuilderRef builder = ctx->ac.builder;
339    LLVMValueRef offset = LLVMConstInt(ctx->ac.i32, SI_RING_GSVS, 0);
340    LLVMValueRef buf_ptr = ac_get_arg(&ctx->ac, ctx->internal_bindings);
341    LLVMValueRef base_ring = ac_build_load_to_sgpr(&ctx->ac, buf_ptr, offset);
342 
343    /* The conceptual layout of the GSVS ring is
344     *   v0c0 .. vLv0 v0c1 .. vLc1 ..
345     * but the real memory layout is swizzled across
346     * threads:
347     *   t0v0c0 .. t15v0c0 t0v1c0 .. t15v1c0 ... t15vLcL
348     *   t16v0c0 ..
349     * Override the buffer descriptor accordingly.
350     */
351    LLVMTypeRef v2i64 = LLVMVectorType(ctx->ac.i64, 2);
352    uint64_t stream_offset = 0;
353 
354    for (unsigned stream = 0; stream < 4; ++stream) {
355       unsigned num_components;
356       unsigned stride;
357       unsigned num_records;
358       LLVMValueRef ring, tmp;
359 
360       num_components = sel->info.num_stream_output_components[stream];
361       if (!num_components)
362          continue;
363 
364       stride = 4 * num_components * sel->info.base.gs.vertices_out;
365 
366       /* Limit on the stride field for <= GFX7. */
367       assert(stride < (1 << 14));
368 
369       num_records = ctx->ac.wave_size;
370 
371       ring = LLVMBuildBitCast(builder, base_ring, v2i64, "");
372       tmp = LLVMBuildExtractElement(builder, ring, ctx->ac.i32_0, "");
373       tmp = LLVMBuildAdd(builder, tmp, LLVMConstInt(ctx->ac.i64, stream_offset, 0), "");
374       stream_offset += stride * ctx->ac.wave_size;
375 
376       ring = LLVMBuildInsertElement(builder, ring, tmp, ctx->ac.i32_0, "");
377       ring = LLVMBuildBitCast(builder, ring, ctx->ac.v4i32, "");
378       tmp = LLVMBuildExtractElement(builder, ring, ctx->ac.i32_1, "");
379       tmp = LLVMBuildOr(
380          builder, tmp,
381          LLVMConstInt(ctx->ac.i32, S_008F04_STRIDE(stride) | S_008F04_SWIZZLE_ENABLE(1), 0), "");
382       ring = LLVMBuildInsertElement(builder, ring, tmp, ctx->ac.i32_1, "");
383       ring = LLVMBuildInsertElement(builder, ring, LLVMConstInt(ctx->ac.i32, num_records, 0),
384                                     LLVMConstInt(ctx->ac.i32, 2, 0), "");
385 
386       uint32_t rsrc3 =
387          S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
388          S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
389          S_008F0C_INDEX_STRIDE(1) | /* index_stride = 16 (elements) */
390          S_008F0C_ADD_TID_ENABLE(1);
391 
392       if (ctx->ac.chip_class >= GFX10) {
393          rsrc3 |= S_008F0C_FORMAT(V_008F0C_GFX10_FORMAT_32_FLOAT) |
394                   S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_DISABLED) | S_008F0C_RESOURCE_LEVEL(1);
395       } else {
396          rsrc3 |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
397                   S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32) |
398                   S_008F0C_ELEMENT_SIZE(1); /* element_size = 4 (bytes) */
399       }
400 
401       ring = LLVMBuildInsertElement(builder, ring, LLVMConstInt(ctx->ac.i32, rsrc3, false),
402                                     LLVMConstInt(ctx->ac.i32, 3, 0), "");
403 
404       ctx->gsvs_ring[stream] = ring;
405    }
406 }
407 
408 /* Generate code for the hardware VS shader stage to go with a geometry shader */
si_generate_gs_copy_shader(struct si_screen * sscreen,struct ac_llvm_compiler * compiler,struct si_shader_selector * gs_selector,struct pipe_debug_callback * debug)409 struct si_shader *si_generate_gs_copy_shader(struct si_screen *sscreen,
410                                              struct ac_llvm_compiler *compiler,
411                                              struct si_shader_selector *gs_selector,
412                                              struct pipe_debug_callback *debug)
413 {
414    struct si_shader_context ctx;
415    struct si_shader *shader;
416    LLVMBuilderRef builder;
417    struct si_shader_output_values outputs[SI_MAX_VS_OUTPUTS];
418    struct si_shader_info *gsinfo = &gs_selector->info;
419    int i;
420 
421    shader = CALLOC_STRUCT(si_shader);
422    if (!shader)
423       return NULL;
424 
425    /* We can leave the fence as permanently signaled because the GS copy
426     * shader only becomes visible globally after it has been compiled. */
427    util_queue_fence_init(&shader->ready);
428 
429    shader->selector = gs_selector;
430    shader->is_gs_copy_shader = true;
431 
432    si_llvm_context_init(&ctx, sscreen, compiler,
433                         si_get_wave_size(sscreen, MESA_SHADER_VERTEX,
434                                          false, false));
435    ctx.shader = shader;
436    ctx.stage = MESA_SHADER_VERTEX;
437 
438    builder = ctx.ac.builder;
439 
440    si_llvm_create_main_func(&ctx, false);
441 
442    LLVMValueRef buf_ptr = ac_get_arg(&ctx.ac, ctx.internal_bindings);
443    ctx.gsvs_ring[0] =
444       ac_build_load_to_sgpr(&ctx.ac, buf_ptr, LLVMConstInt(ctx.ac.i32, SI_RING_GSVS, 0));
445 
446    LLVMValueRef voffset =
447       LLVMBuildMul(ctx.ac.builder, ctx.abi.vertex_id, LLVMConstInt(ctx.ac.i32, 4, 0), "");
448 
449    /* Fetch the vertex stream ID.*/
450    LLVMValueRef stream_id;
451 
452    if (!sscreen->use_ngg_streamout && gs_selector->so.num_outputs)
453       stream_id = si_unpack_param(&ctx, ctx.args.streamout_config, 24, 2);
454    else
455       stream_id = ctx.ac.i32_0;
456 
457    /* Fill in output information. */
458    for (i = 0; i < gsinfo->num_outputs; ++i) {
459       outputs[i].semantic = gsinfo->output_semantic[i];
460 
461       for (int chan = 0; chan < 4; chan++) {
462          outputs[i].vertex_stream[chan] = (gsinfo->output_streams[i] >> (2 * chan)) & 3;
463       }
464    }
465 
466    LLVMBasicBlockRef end_bb;
467    LLVMValueRef switch_inst;
468 
469    end_bb = LLVMAppendBasicBlockInContext(ctx.ac.context, ctx.main_fn, "end");
470    switch_inst = LLVMBuildSwitch(builder, stream_id, end_bb, 4);
471 
472    for (int stream = 0; stream < 4; stream++) {
473       LLVMBasicBlockRef bb;
474       unsigned offset;
475 
476       if (!gsinfo->num_stream_output_components[stream])
477          continue;
478 
479       if (stream > 0 && !gs_selector->so.num_outputs)
480          continue;
481 
482       bb = LLVMInsertBasicBlockInContext(ctx.ac.context, end_bb, "out");
483       LLVMAddCase(switch_inst, LLVMConstInt(ctx.ac.i32, stream, 0), bb);
484       LLVMPositionBuilderAtEnd(builder, bb);
485 
486       /* Fetch vertex data from GSVS ring */
487       offset = 0;
488       for (i = 0; i < gsinfo->num_outputs; ++i) {
489          for (unsigned chan = 0; chan < 4; chan++) {
490             if (!(gsinfo->output_usagemask[i] & (1 << chan)) ||
491                 outputs[i].vertex_stream[chan] != stream) {
492                outputs[i].values[chan] = LLVMGetUndef(ctx.ac.f32);
493                continue;
494             }
495 
496             LLVMValueRef soffset =
497                LLVMConstInt(ctx.ac.i32, offset * gs_selector->info.base.gs.vertices_out * 16 * 4, 0);
498             offset++;
499 
500             outputs[i].values[chan] =
501                ac_build_buffer_load(&ctx.ac, ctx.gsvs_ring[0], 1, ctx.ac.i32_0, voffset, soffset, 0,
502                                     ctx.ac.f32, ac_glc | ac_slc, true, false);
503          }
504       }
505 
506       /* Streamout and exports. */
507       if (!sscreen->use_ngg_streamout && gs_selector->so.num_outputs) {
508          si_llvm_emit_streamout(&ctx, outputs, gsinfo->num_outputs, stream);
509       }
510 
511       if (stream == 0)
512          si_llvm_build_vs_exports(&ctx, outputs, gsinfo->num_outputs);
513 
514       LLVMBuildBr(builder, end_bb);
515    }
516 
517    LLVMPositionBuilderAtEnd(builder, end_bb);
518 
519    LLVMBuildRetVoid(ctx.ac.builder);
520 
521    ctx.stage = MESA_SHADER_GEOMETRY; /* override for shader dumping */
522    si_llvm_optimize_module(&ctx);
523 
524    bool ok = false;
525    if (si_compile_llvm(sscreen, &ctx.shader->binary, &ctx.shader->config, ctx.compiler, &ctx.ac,
526                        debug, MESA_SHADER_GEOMETRY, "GS Copy Shader", false)) {
527       if (si_can_dump_shader(sscreen, MESA_SHADER_GEOMETRY))
528          fprintf(stderr, "GS Copy Shader:\n");
529       si_shader_dump(sscreen, ctx.shader, debug, stderr, true);
530 
531       if (!ctx.shader->config.scratch_bytes_per_wave)
532          ok = si_shader_binary_upload(sscreen, ctx.shader, 0);
533       else
534          ok = true;
535    }
536 
537    si_llvm_dispose(&ctx);
538 
539    if (!ok) {
540       FREE(shader);
541       shader = NULL;
542    } else {
543       si_fix_resource_usage(sscreen, shader);
544    }
545    return shader;
546 }
547 
548 /**
549  * Build the GS prolog function. Rotate the input vertices for triangle strips
550  * with adjacency.
551  */
si_llvm_build_gs_prolog(struct si_shader_context * ctx,union si_shader_part_key * key)552 void si_llvm_build_gs_prolog(struct si_shader_context *ctx, union si_shader_part_key *key)
553 {
554    unsigned num_sgprs, num_vgprs;
555    LLVMBuilderRef builder = ctx->ac.builder;
556    LLVMTypeRef returns[AC_MAX_ARGS];
557    LLVMValueRef func, ret;
558 
559    memset(&ctx->args, 0, sizeof(ctx->args));
560 
561    if (ctx->screen->info.chip_class >= GFX9) {
562       /* Other user SGPRs are not needed by GS. */
563       num_sgprs = 8 + SI_NUM_VS_STATE_RESOURCE_SGPRS;
564       num_vgprs = 5; /* ES inputs are not needed by GS */
565    } else {
566       num_sgprs = GFX6_GS_NUM_USER_SGPR + 2;
567       num_vgprs = 8;
568    }
569 
570    for (unsigned i = 0; i < num_sgprs; ++i) {
571       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
572       returns[i] = ctx->ac.i32;
573    }
574 
575    for (unsigned i = 0; i < num_vgprs; ++i) {
576       ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL);
577       returns[num_sgprs + i] = ctx->ac.f32;
578    }
579 
580    /* Create the function. */
581    si_llvm_create_func(ctx, "gs_prolog", returns, num_sgprs + num_vgprs, 0);
582    func = ctx->main_fn;
583 
584    /* Copy inputs to outputs. This should be no-op, as the registers match,
585     * but it will prevent the compiler from overwriting them unintentionally.
586     */
587    ret = ctx->return_value;
588    for (unsigned i = 0; i < num_sgprs; i++) {
589       LLVMValueRef p = LLVMGetParam(func, i);
590       ret = LLVMBuildInsertValue(builder, ret, p, i, "");
591    }
592    for (unsigned i = 0; i < num_vgprs; i++) {
593       LLVMValueRef p = LLVMGetParam(func, num_sgprs + i);
594       p = ac_to_float(&ctx->ac, p);
595       ret = LLVMBuildInsertValue(builder, ret, p, num_sgprs + i, "");
596    }
597 
598    if (key->gs_prolog.states.tri_strip_adj_fix) {
599       /* Remap the input vertices for every other primitive. */
600       const struct ac_arg gfx6_vtx_params[6] = {
601          {.used = true, .arg_index = num_sgprs},     {.used = true, .arg_index = num_sgprs + 1},
602          {.used = true, .arg_index = num_sgprs + 3}, {.used = true, .arg_index = num_sgprs + 4},
603          {.used = true, .arg_index = num_sgprs + 5}, {.used = true, .arg_index = num_sgprs + 6},
604       };
605       const struct ac_arg gfx9_vtx_params[3] = {
606          {.used = true, .arg_index = num_sgprs},
607          {.used = true, .arg_index = num_sgprs + 1},
608          {.used = true, .arg_index = num_sgprs + 4},
609       };
610       LLVMValueRef vtx_in[6], vtx_out[6];
611       LLVMValueRef prim_id, rotate;
612 
613       if (ctx->screen->info.chip_class >= GFX9) {
614          for (unsigned i = 0; i < 3; i++) {
615             vtx_in[i * 2] = si_unpack_param(ctx, gfx9_vtx_params[i], 0, 16);
616             vtx_in[i * 2 + 1] = si_unpack_param(ctx, gfx9_vtx_params[i], 16, 16);
617          }
618       } else {
619          for (unsigned i = 0; i < 6; i++)
620             vtx_in[i] = ac_get_arg(&ctx->ac, gfx6_vtx_params[i]);
621       }
622 
623       prim_id = LLVMGetParam(func, num_sgprs + 2);
624       rotate = LLVMBuildTrunc(builder, prim_id, ctx->ac.i1, "");
625 
626       for (unsigned i = 0; i < 6; ++i) {
627          LLVMValueRef base, rotated;
628          base = vtx_in[i];
629          rotated = vtx_in[(i + 4) % 6];
630          vtx_out[i] = LLVMBuildSelect(builder, rotate, rotated, base, "");
631       }
632 
633       if (ctx->screen->info.chip_class >= GFX9) {
634          for (unsigned i = 0; i < 3; i++) {
635             LLVMValueRef hi, out;
636 
637             hi = LLVMBuildShl(builder, vtx_out[i * 2 + 1], LLVMConstInt(ctx->ac.i32, 16, 0), "");
638             out = LLVMBuildOr(builder, vtx_out[i * 2], hi, "");
639             out = ac_to_float(&ctx->ac, out);
640             ret = LLVMBuildInsertValue(builder, ret, out, gfx9_vtx_params[i].arg_index, "");
641          }
642       } else {
643          for (unsigned i = 0; i < 6; i++) {
644             LLVMValueRef out;
645 
646             out = ac_to_float(&ctx->ac, vtx_out[i]);
647             ret = LLVMBuildInsertValue(builder, ret, out, gfx6_vtx_params[i].arg_index, "");
648          }
649       }
650    }
651 
652    LLVMBuildRet(builder, ret);
653 }
654 
si_llvm_init_gs_callbacks(struct si_shader_context * ctx)655 void si_llvm_init_gs_callbacks(struct si_shader_context *ctx)
656 {
657    ctx->abi.load_inputs = si_nir_load_input_gs;
658    ctx->abi.emit_vertex = si_llvm_emit_vertex;
659    ctx->abi.emit_primitive = si_llvm_emit_primitive;
660    ctx->abi.emit_outputs = si_llvm_emit_gs_epilogue;
661 }
662