1 /*
2  * Copyright 2020 Advanced Micro Devices, Inc.
3  * All Rights Reserved.
4  *
5  * Permission is hereby granted, free of charge, to any person obtaining a
6  * copy of this software and associated documentation files (the "Software"),
7  * to deal in the Software without restriction, including without limitation
8  * on the rights to use, copy, modify, merge, publish, distribute, sub
9  * license, and/or sell copies of the Software, and to permit persons to whom
10  * the Software is furnished to do so, subject to the following conditions:
11  *
12  * The above copyright notice and this permission notice (including the next
13  * paragraph) shall be included in all copies or substantial portions of the
14  * Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
19  * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
20  * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
21  * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
22  * USE OR OTHER DEALINGS IN THE SOFTWARE.
23  */
24 
25 #include "si_pipe.h"
26 #include "si_shader_internal.h"
27 #include "sid.h"
28 #include "util/u_memory.h"
29 
si_is_es_thread(struct si_shader_context * ctx)30 LLVMValueRef si_is_es_thread(struct si_shader_context *ctx)
31 {
32    /* Return true if the current thread should execute an ES thread. */
33    return LLVMBuildICmp(ctx->ac.builder, LLVMIntULT, ac_get_thread_id(&ctx->ac),
34                         si_unpack_param(ctx, ctx->args.merged_wave_info, 0, 8), "");
35 }
36 
si_is_gs_thread(struct si_shader_context * ctx)37 LLVMValueRef si_is_gs_thread(struct si_shader_context *ctx)
38 {
39    /* Return true if the current thread should execute a GS thread. */
40    return LLVMBuildICmp(ctx->ac.builder, LLVMIntULT, ac_get_thread_id(&ctx->ac),
41                         si_unpack_param(ctx, ctx->args.merged_wave_info, 8, 8), "");
42 }
43 
si_llvm_load_input_gs(struct ac_shader_abi * abi,unsigned input_index,unsigned vtx_offset_param,LLVMTypeRef type,unsigned swizzle)44 static LLVMValueRef si_llvm_load_input_gs(struct ac_shader_abi *abi, unsigned input_index,
45                                           unsigned vtx_offset_param, LLVMTypeRef type,
46                                           unsigned swizzle)
47 {
48    struct si_shader_context *ctx = si_shader_context_from_abi(abi);
49    struct si_shader *shader = ctx->shader;
50    LLVMValueRef vtx_offset, soffset;
51    struct si_shader_info *info = &shader->selector->info;
52    unsigned param;
53    LLVMValueRef value;
54 
55    param = si_shader_io_get_unique_index(info->input[input_index].semantic, false);
56 
57    /* GFX9 has the ESGS ring in LDS. */
58    if (ctx->screen->info.chip_class >= GFX9) {
59       unsigned offset = param * 4 + swizzle;
60 
61       vtx_offset = LLVMBuildAdd(ctx->ac.builder, ctx->gs_vtx_offset[vtx_offset_param],
62                                 LLVMConstInt(ctx->ac.i32, offset, false), "");
63 
64       LLVMValueRef ptr = ac_build_gep0(&ctx->ac, ctx->esgs_ring, vtx_offset);
65       LLVMValueRef value = LLVMBuildLoad(ctx->ac.builder, ptr, "");
66       return LLVMBuildBitCast(ctx->ac.builder, value, type, "");
67    }
68 
69    /* GFX6: input load from the ESGS ring in memory. */
70    /* Get the vertex offset parameter on GFX6. */
71    vtx_offset = LLVMBuildMul(ctx->ac.builder, ctx->gs_vtx_offset[vtx_offset_param],
72                              LLVMConstInt(ctx->ac.i32, 4, 0), "");
73 
74    soffset = LLVMConstInt(ctx->ac.i32, (param * 4 + swizzle) * 256, 0);
75 
76    value = ac_build_buffer_load(&ctx->ac, ctx->esgs_ring, 1, ctx->ac.i32_0, vtx_offset, soffset, 0,
77                                 ctx->ac.f32, ac_glc, true, false);
78    return LLVMBuildBitCast(ctx->ac.builder, value, type, "");
79 }
80 
si_nir_load_input_gs(struct ac_shader_abi * abi,unsigned driver_location,unsigned component,unsigned num_components,unsigned vertex_index,LLVMTypeRef type)81 static LLVMValueRef si_nir_load_input_gs(struct ac_shader_abi *abi,
82                                          unsigned driver_location, unsigned component,
83                                          unsigned num_components, unsigned vertex_index,
84                                          LLVMTypeRef type)
85 {
86    struct si_shader_context *ctx = si_shader_context_from_abi(abi);
87 
88    LLVMValueRef value[4];
89    for (unsigned i = component; i < component + num_components; i++) {
90       value[i] = si_llvm_load_input_gs(&ctx->abi, driver_location,
91                                        vertex_index, type, i);
92    }
93 
94    return ac_build_varying_gather_values(&ctx->ac, value, num_components, component);
95 }
96 
97 /* Pass GS inputs from ES to GS on GFX9. */
si_set_es_return_value_for_gs(struct si_shader_context * ctx)98 static void si_set_es_return_value_for_gs(struct si_shader_context *ctx)
99 {
100    if (!ctx->shader->is_monolithic)
101       ac_build_endif(&ctx->ac, ctx->merged_wrap_if_label);
102 
103    LLVMValueRef ret = ctx->return_value;
104 
105    ret = si_insert_input_ptr(ctx, ret, ctx->other_const_and_shader_buffers, 0);
106    ret = si_insert_input_ptr(ctx, ret, ctx->other_samplers_and_images, 1);
107    if (ctx->shader->key.ge.as_ngg)
108       ret = si_insert_input_ptr(ctx, ret, ctx->args.gs_tg_info, 2);
109    else
110       ret = si_insert_input_ret(ctx, ret, ctx->args.gs2vs_offset, 2);
111    ret = si_insert_input_ret(ctx, ret, ctx->args.merged_wave_info, 3);
112    ret = si_insert_input_ret(ctx, ret, ctx->args.scratch_offset, 5);
113 
114    ret = si_insert_input_ptr(ctx, ret, ctx->internal_bindings, 8 + SI_SGPR_INTERNAL_BINDINGS);
115    ret = si_insert_input_ptr(ctx, ret, ctx->bindless_samplers_and_images,
116                              8 + SI_SGPR_BINDLESS_SAMPLERS_AND_IMAGES);
117    if (ctx->screen->use_ngg) {
118       ret = si_insert_input_ptr(ctx, ret, ctx->vs_state_bits, 8 + SI_SGPR_VS_STATE_BITS);
119       ret = si_insert_input_ptr(ctx, ret, ctx->small_prim_cull_info, 8 + GFX9_SGPR_SMALL_PRIM_CULL_INFO);
120    }
121 
122    unsigned vgpr = 8 + GFX9_GS_NUM_USER_SGPR;
123 
124    ret = si_insert_input_ret_float(ctx, ret, ctx->args.gs_vtx_offset[0], vgpr++);
125    ret = si_insert_input_ret_float(ctx, ret, ctx->args.gs_vtx_offset[1], vgpr++);
126    ret = si_insert_input_ret_float(ctx, ret, ctx->args.gs_prim_id, vgpr++);
127    ret = si_insert_input_ret_float(ctx, ret, ctx->args.gs_invocation_id, vgpr++);
128    ret = si_insert_input_ret_float(ctx, ret, ctx->args.gs_vtx_offset[2], vgpr++);
129    ctx->return_value = ret;
130 }
131 
si_llvm_emit_es_epilogue(struct ac_shader_abi * abi)132 void si_llvm_emit_es_epilogue(struct ac_shader_abi *abi)
133 {
134    struct si_shader_context *ctx = si_shader_context_from_abi(abi);
135    struct si_shader *es = ctx->shader;
136    struct si_shader_info *info = &es->selector->info;
137    LLVMValueRef *addrs = abi->outputs;
138    LLVMValueRef lds_base = NULL;
139    unsigned chan;
140    int i;
141 
142    if (ctx->screen->info.chip_class >= GFX9 && info->num_outputs) {
143       unsigned itemsize_dw = es->selector->esgs_itemsize / 4;
144       LLVMValueRef vertex_idx = ac_get_thread_id(&ctx->ac);
145       LLVMValueRef wave_idx = si_unpack_param(ctx, ctx->args.merged_wave_info, 24, 4);
146       vertex_idx =
147          LLVMBuildOr(ctx->ac.builder, vertex_idx,
148                      LLVMBuildMul(ctx->ac.builder, wave_idx,
149                                   LLVMConstInt(ctx->ac.i32, ctx->ac.wave_size, false), ""),
150                      "");
151       lds_base =
152          LLVMBuildMul(ctx->ac.builder, vertex_idx, LLVMConstInt(ctx->ac.i32, itemsize_dw, 0), "");
153    }
154 
155    for (i = 0; i < info->num_outputs; i++) {
156       int param;
157 
158       if (info->output_semantic[i] == VARYING_SLOT_VIEWPORT ||
159           info->output_semantic[i] == VARYING_SLOT_LAYER)
160          continue;
161 
162       param = si_shader_io_get_unique_index(info->output_semantic[i], false);
163 
164       for (chan = 0; chan < 4; chan++) {
165          if (!(info->output_usagemask[i] & (1 << chan)))
166             continue;
167 
168          LLVMValueRef out_val = LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + chan], "");
169          out_val = ac_to_integer(&ctx->ac, out_val);
170 
171          /* GFX9 has the ESGS ring in LDS. */
172          if (ctx->screen->info.chip_class >= GFX9) {
173             LLVMValueRef idx = LLVMConstInt(ctx->ac.i32, param * 4 + chan, false);
174             idx = LLVMBuildAdd(ctx->ac.builder, lds_base, idx, "");
175             ac_build_indexed_store(&ctx->ac, ctx->esgs_ring, idx, out_val);
176             continue;
177          }
178 
179          ac_build_buffer_store_dword(&ctx->ac, ctx->esgs_ring, out_val, NULL, NULL,
180                                      ac_get_arg(&ctx->ac, ctx->args.es2gs_offset),
181                                      (4 * param + chan) * 4, ac_glc | ac_slc | ac_swizzled);
182       }
183    }
184 
185    if (ctx->screen->info.chip_class >= GFX9)
186       si_set_es_return_value_for_gs(ctx);
187 }
188 
si_get_gs_wave_id(struct si_shader_context * ctx)189 static LLVMValueRef si_get_gs_wave_id(struct si_shader_context *ctx)
190 {
191    if (ctx->screen->info.chip_class >= GFX9)
192       return si_unpack_param(ctx, ctx->args.merged_wave_info, 16, 8);
193    else
194       return ac_get_arg(&ctx->ac, ctx->args.gs_wave_id);
195 }
196 
emit_gs_epilogue(struct si_shader_context * ctx)197 static void emit_gs_epilogue(struct si_shader_context *ctx)
198 {
199    if (ctx->shader->key.ge.as_ngg) {
200       gfx10_ngg_gs_emit_epilogue(ctx);
201       return;
202    }
203 
204    if (ctx->screen->info.chip_class >= GFX10)
205       LLVMBuildFence(ctx->ac.builder, LLVMAtomicOrderingRelease, false, "");
206 
207    ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_NOP | AC_SENDMSG_GS_DONE, si_get_gs_wave_id(ctx));
208 
209    if (ctx->screen->info.chip_class >= GFX9)
210       ac_build_endif(&ctx->ac, ctx->merged_wrap_if_label);
211 }
212 
si_llvm_emit_gs_epilogue(struct ac_shader_abi * abi)213 static void si_llvm_emit_gs_epilogue(struct ac_shader_abi *abi)
214 {
215    struct si_shader_context *ctx = si_shader_context_from_abi(abi);
216    struct si_shader_info UNUSED *info = &ctx->shader->selector->info;
217 
218    assert(info->num_outputs <= AC_LLVM_MAX_OUTPUTS);
219 
220    emit_gs_epilogue(ctx);
221 }
222 
223 /* Emit one vertex from the geometry shader */
si_llvm_emit_vertex(struct ac_shader_abi * abi,unsigned stream,LLVMValueRef * addrs)224 static void si_llvm_emit_vertex(struct ac_shader_abi *abi, unsigned stream, LLVMValueRef *addrs)
225 {
226    struct si_shader_context *ctx = si_shader_context_from_abi(abi);
227 
228    if (ctx->shader->key.ge.as_ngg) {
229       gfx10_ngg_gs_emit_vertex(ctx, stream, addrs);
230       return;
231    }
232 
233    struct si_shader_info *info = &ctx->shader->selector->info;
234    struct si_shader *shader = ctx->shader;
235    LLVMValueRef soffset = ac_get_arg(&ctx->ac, ctx->args.gs2vs_offset);
236    LLVMValueRef gs_next_vertex;
237    LLVMValueRef can_emit;
238    unsigned chan, offset;
239    int i;
240 
241    /* Write vertex attribute values to GSVS ring */
242    gs_next_vertex = LLVMBuildLoad(ctx->ac.builder, ctx->gs_next_vertex[stream], "");
243 
244    /* If this thread has already emitted the declared maximum number of
245     * vertices, skip the write: excessive vertex emissions are not
246     * supposed to have any effect.
247     *
248     * If the shader has no writes to memory, kill it instead. This skips
249     * further memory loads and may allow LLVM to skip to the end
250     * altogether.
251     */
252    can_emit =
253       LLVMBuildICmp(ctx->ac.builder, LLVMIntULT, gs_next_vertex,
254                     LLVMConstInt(ctx->ac.i32, shader->selector->info.base.gs.vertices_out, 0), "");
255 
256    bool use_kill = !info->base.writes_memory;
257    if (use_kill) {
258       ac_build_kill_if_false(&ctx->ac, can_emit);
259    } else {
260       ac_build_ifcc(&ctx->ac, can_emit, 6505);
261    }
262 
263    offset = 0;
264    for (i = 0; i < info->num_outputs; i++) {
265       for (chan = 0; chan < 4; chan++) {
266          if (!(info->output_usagemask[i] & (1 << chan)) ||
267              ((info->output_streams[i] >> (2 * chan)) & 3) != stream)
268             continue;
269 
270          LLVMValueRef out_val = LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + chan], "");
271          LLVMValueRef voffset =
272             LLVMConstInt(ctx->ac.i32, offset * shader->selector->info.base.gs.vertices_out, 0);
273          offset++;
274 
275          voffset = LLVMBuildAdd(ctx->ac.builder, voffset, gs_next_vertex, "");
276          voffset = LLVMBuildMul(ctx->ac.builder, voffset, LLVMConstInt(ctx->ac.i32, 4, 0), "");
277 
278          out_val = ac_to_integer(&ctx->ac, out_val);
279 
280          ac_build_buffer_store_dword(&ctx->ac, ctx->gsvs_ring[stream], out_val, NULL,
281                                      voffset, soffset, 0, ac_glc | ac_slc | ac_swizzled);
282       }
283    }
284 
285    gs_next_vertex = LLVMBuildAdd(ctx->ac.builder, gs_next_vertex, ctx->ac.i32_1, "");
286    LLVMBuildStore(ctx->ac.builder, gs_next_vertex, ctx->gs_next_vertex[stream]);
287 
288    /* Signal vertex emission if vertex data was written. */
289    if (offset) {
290       ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_EMIT | AC_SENDMSG_GS | (stream << 8),
291                        si_get_gs_wave_id(ctx));
292    }
293 
294    if (!use_kill)
295       ac_build_endif(&ctx->ac, 6505);
296 }
297 
298 /* Cut one primitive from the geometry shader */
si_llvm_emit_primitive(struct ac_shader_abi * abi,unsigned stream)299 static void si_llvm_emit_primitive(struct ac_shader_abi *abi, unsigned stream)
300 {
301    struct si_shader_context *ctx = si_shader_context_from_abi(abi);
302 
303    if (ctx->shader->key.ge.as_ngg) {
304       LLVMBuildStore(ctx->ac.builder, ctx->ac.i32_0, ctx->gs_curprim_verts[stream]);
305       return;
306    }
307 
308    /* Signal primitive cut */
309    ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_CUT | AC_SENDMSG_GS | (stream << 8),
310                     si_get_gs_wave_id(ctx));
311 }
312 
si_preload_esgs_ring(struct si_shader_context * ctx)313 void si_preload_esgs_ring(struct si_shader_context *ctx)
314 {
315    LLVMBuilderRef builder = ctx->ac.builder;
316 
317    if (ctx->screen->info.chip_class <= GFX8) {
318       LLVMValueRef offset = LLVMConstInt(ctx->ac.i32, SI_RING_ESGS, 0);
319       LLVMValueRef buf_ptr = ac_get_arg(&ctx->ac, ctx->internal_bindings);
320 
321       ctx->esgs_ring = ac_build_load_to_sgpr(&ctx->ac, buf_ptr, offset);
322 
323       if (ctx->stage != MESA_SHADER_GEOMETRY) {
324          LLVMValueRef desc1 = LLVMBuildExtractElement(builder, ctx->esgs_ring, ctx->ac.i32_1, "");
325          LLVMValueRef desc3 = LLVMBuildExtractElement(builder, ctx->esgs_ring,
326                                                       LLVMConstInt(ctx->ac.i32, 3, 0), "");
327          desc1 = LLVMBuildOr(builder, desc1, LLVMConstInt(ctx->ac.i32,
328                                                           S_008F04_SWIZZLE_ENABLE(1), 0), "");
329          desc3 = LLVMBuildOr(builder, desc3, LLVMConstInt(ctx->ac.i32,
330                                                           S_008F0C_ELEMENT_SIZE(1) |
331                                                           S_008F0C_INDEX_STRIDE(3) |
332                                                           S_008F0C_ADD_TID_ENABLE(1), 0), "");
333          ctx->esgs_ring = LLVMBuildInsertElement(builder, ctx->esgs_ring, desc1, ctx->ac.i32_1, "");
334          ctx->esgs_ring = LLVMBuildInsertElement(builder, ctx->esgs_ring, desc3,
335                                                  LLVMConstInt(ctx->ac.i32, 3, 0), "");
336       }
337    } else {
338       if (USE_LDS_SYMBOLS) {
339          /* Declare the ESGS ring as an explicit LDS symbol. */
340          si_llvm_declare_esgs_ring(ctx);
341       } else {
342          ac_declare_lds_as_pointer(&ctx->ac);
343          ctx->esgs_ring = ctx->ac.lds;
344       }
345    }
346 }
347 
si_preload_gs_rings(struct si_shader_context * ctx)348 void si_preload_gs_rings(struct si_shader_context *ctx)
349 {
350    const struct si_shader_selector *sel = ctx->shader->selector;
351    LLVMBuilderRef builder = ctx->ac.builder;
352    LLVMValueRef offset = LLVMConstInt(ctx->ac.i32, SI_RING_GSVS, 0);
353    LLVMValueRef buf_ptr = ac_get_arg(&ctx->ac, ctx->internal_bindings);
354    LLVMValueRef base_ring = ac_build_load_to_sgpr(&ctx->ac, buf_ptr, offset);
355 
356    /* The conceptual layout of the GSVS ring is
357     *   v0c0 .. vLv0 v0c1 .. vLc1 ..
358     * but the real memory layout is swizzled across
359     * threads:
360     *   t0v0c0 .. t15v0c0 t0v1c0 .. t15v1c0 ... t15vLcL
361     *   t16v0c0 ..
362     * Override the buffer descriptor accordingly.
363     */
364    LLVMTypeRef v2i64 = LLVMVectorType(ctx->ac.i64, 2);
365    uint64_t stream_offset = 0;
366 
367    for (unsigned stream = 0; stream < 4; ++stream) {
368       unsigned num_components;
369       unsigned stride;
370       unsigned num_records;
371       LLVMValueRef ring, tmp;
372 
373       num_components = sel->info.num_stream_output_components[stream];
374       if (!num_components)
375          continue;
376 
377       stride = 4 * num_components * sel->info.base.gs.vertices_out;
378 
379       /* Limit on the stride field for <= GFX7. */
380       assert(stride < (1 << 14));
381 
382       num_records = ctx->ac.wave_size;
383 
384       ring = LLVMBuildBitCast(builder, base_ring, v2i64, "");
385       tmp = LLVMBuildExtractElement(builder, ring, ctx->ac.i32_0, "");
386       tmp = LLVMBuildAdd(builder, tmp, LLVMConstInt(ctx->ac.i64, stream_offset, 0), "");
387       stream_offset += stride * ctx->ac.wave_size;
388 
389       ring = LLVMBuildInsertElement(builder, ring, tmp, ctx->ac.i32_0, "");
390       ring = LLVMBuildBitCast(builder, ring, ctx->ac.v4i32, "");
391       tmp = LLVMBuildExtractElement(builder, ring, ctx->ac.i32_1, "");
392       tmp = LLVMBuildOr(
393          builder, tmp,
394          LLVMConstInt(ctx->ac.i32, S_008F04_STRIDE(stride) | S_008F04_SWIZZLE_ENABLE(1), 0), "");
395       ring = LLVMBuildInsertElement(builder, ring, tmp, ctx->ac.i32_1, "");
396       ring = LLVMBuildInsertElement(builder, ring, LLVMConstInt(ctx->ac.i32, num_records, 0),
397                                     LLVMConstInt(ctx->ac.i32, 2, 0), "");
398 
399       uint32_t rsrc3 =
400          S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
401          S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
402          S_008F0C_INDEX_STRIDE(1) | /* index_stride = 16 (elements) */
403          S_008F0C_ADD_TID_ENABLE(1);
404 
405       if (ctx->ac.chip_class >= GFX10) {
406          rsrc3 |= S_008F0C_FORMAT(V_008F0C_GFX10_FORMAT_32_FLOAT) |
407                   S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_DISABLED) | S_008F0C_RESOURCE_LEVEL(1);
408       } else {
409          rsrc3 |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
410                   S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32) |
411                   S_008F0C_ELEMENT_SIZE(1); /* element_size = 4 (bytes) */
412       }
413 
414       ring = LLVMBuildInsertElement(builder, ring, LLVMConstInt(ctx->ac.i32, rsrc3, false),
415                                     LLVMConstInt(ctx->ac.i32, 3, 0), "");
416 
417       ctx->gsvs_ring[stream] = ring;
418    }
419 }
420 
421 /* Generate code for the hardware VS shader stage to go with a geometry shader */
si_generate_gs_copy_shader(struct si_screen * sscreen,struct ac_llvm_compiler * compiler,struct si_shader_selector * gs_selector,struct pipe_debug_callback * debug)422 struct si_shader *si_generate_gs_copy_shader(struct si_screen *sscreen,
423                                              struct ac_llvm_compiler *compiler,
424                                              struct si_shader_selector *gs_selector,
425                                              struct pipe_debug_callback *debug)
426 {
427    struct si_shader_context ctx;
428    struct si_shader *shader;
429    LLVMBuilderRef builder;
430    struct si_shader_output_values outputs[SI_MAX_VS_OUTPUTS];
431    struct si_shader_info *gsinfo = &gs_selector->info;
432    int i;
433 
434    shader = CALLOC_STRUCT(si_shader);
435    if (!shader)
436       return NULL;
437 
438    /* We can leave the fence as permanently signaled because the GS copy
439     * shader only becomes visible globally after it has been compiled. */
440    util_queue_fence_init(&shader->ready);
441 
442    shader->selector = gs_selector;
443    shader->is_gs_copy_shader = true;
444    shader->wave_size = si_determine_wave_size(sscreen, shader);
445 
446    si_llvm_context_init(&ctx, sscreen, compiler, shader->wave_size);
447    ctx.shader = shader;
448    ctx.stage = MESA_SHADER_VERTEX;
449 
450    builder = ctx.ac.builder;
451 
452    si_llvm_create_main_func(&ctx, false);
453 
454    LLVMValueRef buf_ptr = ac_get_arg(&ctx.ac, ctx.internal_bindings);
455    ctx.gsvs_ring[0] =
456       ac_build_load_to_sgpr(&ctx.ac, buf_ptr, LLVMConstInt(ctx.ac.i32, SI_RING_GSVS, 0));
457 
458    LLVMValueRef voffset =
459       LLVMBuildMul(ctx.ac.builder, ctx.abi.vertex_id, LLVMConstInt(ctx.ac.i32, 4, 0), "");
460 
461    /* Fetch the vertex stream ID.*/
462    LLVMValueRef stream_id;
463 
464    if (!sscreen->use_ngg_streamout && gs_selector->so.num_outputs)
465       stream_id = si_unpack_param(&ctx, ctx.args.streamout_config, 24, 2);
466    else
467       stream_id = ctx.ac.i32_0;
468 
469    /* Fill in output information. */
470    for (i = 0; i < gsinfo->num_outputs; ++i) {
471       outputs[i].semantic = gsinfo->output_semantic[i];
472       outputs[i].vertex_streams = gsinfo->output_streams[i];
473    }
474 
475    LLVMBasicBlockRef end_bb;
476    LLVMValueRef switch_inst;
477 
478    end_bb = LLVMAppendBasicBlockInContext(ctx.ac.context, ctx.main_fn, "end");
479    switch_inst = LLVMBuildSwitch(builder, stream_id, end_bb, 4);
480 
481    for (int stream = 0; stream < 4; stream++) {
482       LLVMBasicBlockRef bb;
483       unsigned offset;
484 
485       if (!gsinfo->num_stream_output_components[stream])
486          continue;
487 
488       if (stream > 0 && !gs_selector->so.num_outputs)
489          continue;
490 
491       bb = LLVMInsertBasicBlockInContext(ctx.ac.context, end_bb, "out");
492       LLVMAddCase(switch_inst, LLVMConstInt(ctx.ac.i32, stream, 0), bb);
493       LLVMPositionBuilderAtEnd(builder, bb);
494 
495       /* Fetch vertex data from GSVS ring */
496       offset = 0;
497       for (i = 0; i < gsinfo->num_outputs; ++i) {
498          for (unsigned chan = 0; chan < 4; chan++) {
499             if (!(gsinfo->output_usagemask[i] & (1 << chan)) ||
500                 ((outputs[i].vertex_streams >> (chan * 2)) & 0x3) != stream) {
501                outputs[i].values[chan] = LLVMGetUndef(ctx.ac.f32);
502                continue;
503             }
504 
505             LLVMValueRef soffset =
506                LLVMConstInt(ctx.ac.i32, offset * gs_selector->info.base.gs.vertices_out * 16 * 4, 0);
507             offset++;
508 
509             outputs[i].values[chan] =
510                ac_build_buffer_load(&ctx.ac, ctx.gsvs_ring[0], 1, ctx.ac.i32_0, voffset, soffset, 0,
511                                     ctx.ac.f32, ac_glc | ac_slc, true, false);
512          }
513       }
514 
515       /* Streamout and exports. */
516       if (!sscreen->use_ngg_streamout && gs_selector->so.num_outputs) {
517          si_llvm_emit_streamout(&ctx, outputs, gsinfo->num_outputs, stream);
518       }
519 
520       if (stream == 0)
521          si_llvm_build_vs_exports(&ctx, outputs, gsinfo->num_outputs);
522 
523       LLVMBuildBr(builder, end_bb);
524    }
525 
526    LLVMPositionBuilderAtEnd(builder, end_bb);
527 
528    LLVMBuildRetVoid(ctx.ac.builder);
529 
530    ctx.stage = MESA_SHADER_GEOMETRY; /* override for shader dumping */
531    si_llvm_optimize_module(&ctx);
532 
533    bool ok = false;
534    if (si_compile_llvm(sscreen, &ctx.shader->binary, &ctx.shader->config, ctx.compiler, &ctx.ac,
535                        debug, MESA_SHADER_GEOMETRY, "GS Copy Shader", false)) {
536       if (si_can_dump_shader(sscreen, MESA_SHADER_GEOMETRY))
537          fprintf(stderr, "GS Copy Shader:\n");
538       si_shader_dump(sscreen, ctx.shader, debug, stderr, true);
539 
540       if (!ctx.shader->config.scratch_bytes_per_wave)
541          ok = si_shader_binary_upload(sscreen, ctx.shader, 0);
542       else
543          ok = true;
544    }
545 
546    si_llvm_dispose(&ctx);
547 
548    if (!ok) {
549       FREE(shader);
550       shader = NULL;
551    } else {
552       si_fix_resource_usage(sscreen, shader);
553    }
554    return shader;
555 }
556 
si_llvm_init_gs_callbacks(struct si_shader_context * ctx)557 void si_llvm_init_gs_callbacks(struct si_shader_context *ctx)
558 {
559    ctx->abi.load_inputs = si_nir_load_input_gs;
560    ctx->abi.emit_vertex = si_llvm_emit_vertex;
561    ctx->abi.emit_primitive = si_llvm_emit_primitive;
562    ctx->abi.emit_outputs = si_llvm_emit_gs_epilogue;
563 }
564