/*
 * Copyright © 2019 Google, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "compiler/nir/nir_builder.h"
#include "ir3_compiler.h"
#include "ir3_nir.h"

struct state {
   uint32_t topology;

   struct primitive_map {
      unsigned loc[32 + 4]; /* +POSITION +PSIZE +CLIP_DIST0 +CLIP_DIST1 */
      unsigned stride;
   } map;

   nir_ssa_def *header;

   nir_variable *vertex_count_var;
   nir_variable *emitted_vertex_var;
   nir_variable *vertex_flags_out;

   struct exec_list old_outputs;
   struct exec_list new_outputs;
   struct exec_list emit_outputs;

   /* tess ctrl shader on a650 gets the local primitive id at different bits: */
   unsigned local_primitive_id_start;
};

static nir_ssa_def *
bitfield_extract(nir_builder *b, nir_ssa_def *v, uint32_t start, uint32_t mask)
{
   return nir_iand(b, nir_ushr(b, v, nir_imm_int(b, start)),
                   nir_imm_int(b, mask));
}

static nir_ssa_def *
build_invocation_id(nir_builder *b, struct state *state)
{
   return bitfield_extract(b, state->header, 11, 31);
}

static nir_ssa_def *
build_vertex_id(nir_builder *b, struct state *state)
{
   return bitfield_extract(b, state->header, 6, 31);
}

static nir_ssa_def *
build_local_primitive_id(nir_builder *b, struct state *state)
{
   return bitfield_extract(b, state->header, state->local_primitive_id_start,
                           63);
}
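
/* A sketch of the header layout implied by the shift/mask values used in
 * this file (inferred from the extracts above and below, not from HW docs):
 * bits [6:10] hold the vertex id, bits [11:15] the invocation id, and a
 * 6-bit local primitive id starts at a stage/gen-dependent bit (see
 * local_primitive_id_start); the GS header additionally carries a 10-bit
 * local thread id at bit 16 (see local_thread_id() below).
 */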

static bool
is_tess_levels(gl_varying_slot slot)
{
   return (slot == VARYING_SLOT_PRIMITIVE_ID ||
           slot == VARYING_SLOT_TESS_LEVEL_OUTER ||
           slot == VARYING_SLOT_TESS_LEVEL_INNER);
}

/* Return a deterministic index for varyings. We can't rely on driver_location
 * to be correct without linking the different stages first, so we create
 * "primitive maps" where the producer decides on the location of each varying
 * slot and then exports a per-slot array to the consumer. This compacts the
 * gl_varying_slot space down a bit so that the primitive maps aren't too
 * large.
 *
 * Note: per-patch varyings are currently handled separately, without any
 * compacting.
 *
 * TODO: We could probably use the driver_locations directly in the non-SSO
 * (Vulkan) case.
 */

static unsigned
shader_io_get_unique_index(gl_varying_slot slot)
{
   if (slot == VARYING_SLOT_POS)
      return 0;
   if (slot == VARYING_SLOT_PSIZ)
      return 1;
   if (slot == VARYING_SLOT_CLIP_DIST0)
      return 2;
   if (slot == VARYING_SLOT_CLIP_DIST1)
      return 3;
   if (slot >= VARYING_SLOT_VAR0 && slot <= VARYING_SLOT_VAR31)
      return 4 + (slot - VARYING_SLOT_VAR0);
   unreachable("illegal slot in get unique index\n");
}
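
/* Example, for illustration: a shader writing POS, PSIZ, and VAR0..VAR2
 * gets unique indices 0, 1, 4, 5, 6 -- the generic VARn slots always start
 * at index 4, regardless of which built-ins are present.
 */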

static nir_ssa_def *
build_local_offset(nir_builder *b, struct state *state, nir_ssa_def *vertex,
                   uint32_t location, uint32_t comp, nir_ssa_def *offset)
{
   nir_ssa_def *primitive_stride = nir_load_vs_primitive_stride_ir3(b);
   nir_ssa_def *primitive_offset =
      nir_imul24(b, build_local_primitive_id(b, state), primitive_stride);
   nir_ssa_def *attr_offset;
   nir_ssa_def *vertex_stride;
   unsigned index = shader_io_get_unique_index(location);

   switch (b->shader->info.stage) {
   case MESA_SHADER_VERTEX:
   case MESA_SHADER_TESS_EVAL:
      vertex_stride = nir_imm_int(b, state->map.stride * 4);
      attr_offset = nir_imm_int(b, state->map.loc[index] + 4 * comp);
      break;
   case MESA_SHADER_TESS_CTRL:
   case MESA_SHADER_GEOMETRY:
      vertex_stride = nir_load_vs_vertex_stride_ir3(b);
      attr_offset = nir_iadd(b, nir_load_primitive_location_ir3(b, index),
                             nir_imm_int(b, comp * 4));
      break;
   default:
      unreachable("bad shader stage");
   }

   nir_ssa_def *vertex_offset = nir_imul24(b, vertex, vertex_stride);

   return nir_iadd(
      b, nir_iadd(b, primitive_offset, vertex_offset),
      nir_iadd(b, attr_offset, nir_ishl(b, offset, nir_imm_int(b, 4))));
}
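
/* A sketch of the address computed above, with everything in bytes:
 *
 *   local_offset = local_primitive_id * primitive_stride
 *                + vertex * vertex_stride
 *                + loc(location) + comp * 4   // selected component
 *                + offset * 16                // indirect offset, in vec4s
 *
 * where loc() is either the compile-time primitive map (VS/TES producer)
 * or the driver-supplied per-slot location array (TCS/GS consumer).
 */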

static nir_intrinsic_instr *
replace_intrinsic(nir_builder *b, nir_intrinsic_instr *intr,
                  nir_intrinsic_op op, nir_ssa_def *src0, nir_ssa_def *src1,
                  nir_ssa_def *src2)
{
   nir_intrinsic_instr *new_intr = nir_intrinsic_instr_create(b->shader, op);

   new_intr->src[0] = nir_src_for_ssa(src0);
   if (src1)
      new_intr->src[1] = nir_src_for_ssa(src1);
   if (src2)
      new_intr->src[2] = nir_src_for_ssa(src2);

   new_intr->num_components = intr->num_components;

   if (nir_intrinsic_infos[op].has_dest)
      nir_ssa_dest_init(&new_intr->instr, &new_intr->dest, intr->num_components,
                        intr->dest.ssa.bit_size, NULL);

   nir_builder_instr_insert(b, &new_intr->instr);

   if (nir_intrinsic_infos[op].has_dest)
      nir_ssa_def_rewrite_uses(&intr->dest.ssa, &new_intr->dest.ssa);

   nir_instr_remove(&intr->instr);

   return new_intr;
}

static void
build_primitive_map(nir_shader *shader, struct primitive_map *map)
{
   /* All interfaces except the TCS <-> TES interface use ldlw, which takes
    * an offset in bytes, so each vec4 slot is 16 bytes. TCS <-> TES uses
    * ldg, which takes an offset in dwords, but each per-vertex slot has
    * space for every vertex, and there's space at the beginning for
    * per-patch varyings.
    */
   unsigned slot_size = 16, start = 0;
   if (shader->info.stage == MESA_SHADER_TESS_CTRL) {
      slot_size = shader->info.tess.tcs_vertices_out * 4;
      start = util_last_bit(shader->info.patch_outputs_written) * 4;
   }

   uint64_t mask = shader->info.outputs_written;
   unsigned loc = start;
   while (mask) {
      int location = u_bit_scan64(&mask);
      if (is_tess_levels(location))
         continue;

      unsigned index = shader_io_get_unique_index(location);
      map->loc[index] = loc;
      loc += slot_size;
   }

   map->stride = loc;
   /* Use units of dwords for the stride. */
   if (shader->info.stage != MESA_SHADER_TESS_CTRL)
      map->stride /= 4;
}
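
/* Worked example (assuming a VS writing only POS, PSIZ, and VAR0): each
 * slot takes 16 bytes, so loc[0] = 0, loc[1] = 16, loc[4] = 32, and the
 * byte stride of 48 is stored as stride = 12 dwords. For a TCS with e.g.
 * tcs_vertices_out = 3, each slot instead takes 3 * 4 = 12 dwords and the
 * per-vertex slots start after the per-patch varyings.
 */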

/* For shader stages that receive a primitive map, calculate how big it should
 * be.
 */

static unsigned
calc_primitive_map_size(nir_shader *shader)
{
   uint64_t mask = shader->info.inputs_read;
   unsigned max_index = 0;
   while (mask) {
      int location = u_bit_scan64(&mask);

      if (is_tess_levels(location))
         continue;

      unsigned index = shader_io_get_unique_index(location);
      max_index = MAX2(max_index, index + 1);
   }

   return max_index;
}
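
/* E.g. a stage reading POS and VAR0 covers unique indices 0 and 4, so its
 * primitive map needs max_index = 5 entries (the map is indexed densely
 * from 0, even if some entries go unused).
 */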

static void
lower_block_to_explicit_output(nir_block *block, nir_builder *b,
                               struct state *state)
{
   nir_foreach_instr_safe (instr, block) {
      if (instr->type != nir_instr_type_intrinsic)
         continue;

      nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);

      switch (intr->intrinsic) {
      case nir_intrinsic_store_output: {
         // src[] = { value, offset }.

         /* nir_lower_io_to_temporaries replaces all access to output
          * variables with temp variables and then emits a nir_copy_var at
          * the end of the shader.  Thus, we should always get a full wrmask
          * here.
          */
         assert(
            util_is_power_of_two_nonzero(nir_intrinsic_write_mask(intr) + 1));

         b->cursor = nir_instr_remove(&intr->instr);

         nir_ssa_def *vertex_id = build_vertex_id(b, state);
         nir_ssa_def *offset = build_local_offset(
            b, state, vertex_id, nir_intrinsic_io_semantics(intr).location,
            nir_intrinsic_component(intr), intr->src[1].ssa);

         nir_store_shared_ir3(b, intr->src[0].ssa, offset);
         break;
      }

      default:
         break;
      }
   }
}
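
/* Roughly, the lowering above turns
 *
 *    store_output(value, offset)          // at some location/component
 *
 * into
 *
 *    store_shared_ir3(value, local_offset)
 *
 * where local_offset positions the value in shared (local) memory by the
 * header's vertex id, so the consuming stage can read it back with
 * load_shared_ir3 using the same primitive map.
 */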

static nir_ssa_def *
local_thread_id(nir_builder *b)
{
   return bitfield_extract(b, nir_load_gs_header_ir3(b), 16, 1023);
}

void
ir3_nir_lower_to_explicit_output(nir_shader *shader,
                                 struct ir3_shader_variant *v,
                                 unsigned topology)
{
   struct state state = {};

   build_primitive_map(shader, &state.map);
   memcpy(v->output_loc, state.map.loc, sizeof(v->output_loc));

   nir_function_impl *impl = nir_shader_get_entrypoint(shader);
   assert(impl);

   nir_builder b;
   nir_builder_init(&b, impl);
   b.cursor = nir_before_cf_list(&impl->body);

   if (v->type == MESA_SHADER_VERTEX && topology != IR3_TESS_NONE)
      state.header = nir_load_tcs_header_ir3(&b);
   else
      state.header = nir_load_gs_header_ir3(&b);

   nir_foreach_block_safe (block, impl)
      lower_block_to_explicit_output(block, &b, &state);

   nir_metadata_preserve(impl,
                         nir_metadata_block_index | nir_metadata_dominance);

   v->output_size = state.map.stride;
}

static void
lower_block_to_explicit_input(nir_block *block, nir_builder *b,
                              struct state *state)
{
   nir_foreach_instr_safe (instr, block) {
      if (instr->type != nir_instr_type_intrinsic)
         continue;

      nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);

      switch (intr->intrinsic) {
      case nir_intrinsic_load_per_vertex_input: {
         // src[] = { vertex, offset }.

         b->cursor = nir_before_instr(&intr->instr);

         nir_ssa_def *offset = build_local_offset(
            b, state,
            intr->src[0].ssa, // this is typically gl_InvocationID
            nir_intrinsic_io_semantics(intr).location,
            nir_intrinsic_component(intr), intr->src[1].ssa);

         replace_intrinsic(b, intr, nir_intrinsic_load_shared_ir3, offset, NULL,
                           NULL);
         break;
      }

      case nir_intrinsic_load_invocation_id: {
         b->cursor = nir_before_instr(&intr->instr);

         nir_ssa_def *iid = build_invocation_id(b, state);
         nir_ssa_def_rewrite_uses(&intr->dest.ssa, iid);
         nir_instr_remove(&intr->instr);
         break;
      }

      default:
         break;
      }
   }
}

void
ir3_nir_lower_to_explicit_input(nir_shader *shader,
                                struct ir3_shader_variant *v)
{
   struct state state = {};

   /* when using stl/ldl (instead of stlw/ldlw) for linking VS and HS,
    * HS uses a different primitive id, which starts at bit 16 in the header
    */
   if (shader->info.stage == MESA_SHADER_TESS_CTRL &&
       v->shader->compiler->tess_use_shared)
      state.local_primitive_id_start = 16;

   nir_function_impl *impl = nir_shader_get_entrypoint(shader);
   assert(impl);

   nir_builder b;
   nir_builder_init(&b, impl);
   b.cursor = nir_before_cf_list(&impl->body);

   if (shader->info.stage == MESA_SHADER_GEOMETRY)
      state.header = nir_load_gs_header_ir3(&b);
   else
      state.header = nir_load_tcs_header_ir3(&b);

   nir_foreach_block_safe (block, impl)
      lower_block_to_explicit_input(block, &b, &state);

   v->input_size = calc_primitive_map_size(shader);
}

static nir_ssa_def *
build_tcs_out_vertices(nir_builder *b)
{
   if (b->shader->info.stage == MESA_SHADER_TESS_CTRL)
      return nir_imm_int(b, b->shader->info.tess.tcs_vertices_out);
   else
      return nir_load_patch_vertices_in(b);
}

static nir_ssa_def *
build_per_vertex_offset(nir_builder *b, struct state *state,
                        nir_ssa_def *vertex, uint32_t location, uint32_t comp,
                        nir_ssa_def *offset)
{
   nir_ssa_def *patch_id = nir_load_rel_patch_id_ir3(b);
   nir_ssa_def *patch_stride = nir_load_hs_patch_stride_ir3(b);
   nir_ssa_def *patch_offset = nir_imul24(b, patch_id, patch_stride);
   nir_ssa_def *attr_offset;

   if (nir_src_is_const(nir_src_for_ssa(offset))) {
      location += nir_src_as_uint(nir_src_for_ssa(offset));
      offset = nir_imm_int(b, 0);
   } else {
      /* Offset is in vec4's, but we need it in units of components for the
       * load/store_global_ir3 offset.
       */
      offset = nir_ishl(b, offset, nir_imm_int(b, 2));
   }

   nir_ssa_def *vertex_offset;
   if (vertex) {
      unsigned index = shader_io_get_unique_index(location);
      switch (b->shader->info.stage) {
      case MESA_SHADER_TESS_CTRL:
         attr_offset = nir_imm_int(b, state->map.loc[index] + comp);
         break;
      case MESA_SHADER_TESS_EVAL:
         attr_offset = nir_iadd(b, nir_load_primitive_location_ir3(b, index),
                                nir_imm_int(b, comp));
         break;
      default:
         unreachable("bad shader stage");
      }

      attr_offset = nir_iadd(b, attr_offset,
                             nir_imul24(b, offset, build_tcs_out_vertices(b)));
      vertex_offset = nir_ishl(b, vertex, nir_imm_int(b, 2));
   } else {
      assert(location >= VARYING_SLOT_PATCH0 &&
             location <= VARYING_SLOT_TESS_MAX);
      unsigned index = location - VARYING_SLOT_PATCH0;
      attr_offset = nir_iadd(b, nir_imm_int(b, index * 4 + comp), offset);
      vertex_offset = nir_imm_int(b, 0);
   }

   return nir_iadd(b, nir_iadd(b, patch_offset, attr_offset), vertex_offset);
}
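
/* A sketch of the tess param BO layout assumed above, in dwords, within
 * one patch (at rel_patch_id * patch_stride):
 *
 *   [ per-patch varyings ... ]
 *   [ slot N:   vec4 of vertex 0 | vec4 of vertex 1 | ... ]
 *   [ slot N+1: vec4 of vertex 0 | ... ]
 *
 * Each per-vertex slot holds a vec4 (4 dwords) for every vertex, which is
 * why the indirect vec4 offset is scaled by the vertex count and the
 * vertex index is scaled by 4.
 */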

static nir_ssa_def *
build_patch_offset(nir_builder *b, struct state *state, uint32_t base,
                   uint32_t comp, nir_ssa_def *offset)
{
   return build_per_vertex_offset(b, state, NULL, base, comp, offset);
}

static void
tess_level_components(struct state *state, uint32_t *inner, uint32_t *outer)
{
   switch (state->topology) {
   case IR3_TESS_TRIANGLES:
      *inner = 1;
      *outer = 3;
      break;
   case IR3_TESS_QUADS:
      *inner = 2;
      *outer = 4;
      break;
   case IR3_TESS_ISOLINES:
      *inner = 0;
      *outer = 2;
      break;
   default:
      unreachable("bad");
   }
}

static nir_ssa_def *
build_tessfactor_base(nir_builder *b, gl_varying_slot slot, uint32_t comp,
                      struct state *state)
{
   uint32_t inner_levels, outer_levels;
   tess_level_components(state, &inner_levels, &outer_levels);

   const uint32_t patch_stride = 1 + inner_levels + outer_levels;

   nir_ssa_def *patch_id = nir_load_rel_patch_id_ir3(b);

   nir_ssa_def *patch_offset =
      nir_imul24(b, patch_id, nir_imm_int(b, patch_stride));

   uint32_t offset;
   switch (slot) {
   case VARYING_SLOT_PRIMITIVE_ID:
      offset = 0;
      break;
   case VARYING_SLOT_TESS_LEVEL_OUTER:
      offset = 1;
      break;
   case VARYING_SLOT_TESS_LEVEL_INNER:
      offset = 1 + outer_levels;
      break;
   default:
      unreachable("bad");
   }

   return nir_iadd(b, patch_offset, nir_imm_int(b, offset + comp));
}
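
/* Example with IR3_TESS_TRIANGLES: patch_stride = 1 + 1 + 3 = 5 dwords,
 * laid out per patch in the tess factor BO as:
 *
 *   dword 0:     gl_PrimitiveID
 *   dwords 1..3: gl_TessLevelOuter[0..2]
 *   dword 4:     gl_TessLevelInner[0]
 */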

static void
lower_tess_ctrl_block(nir_block *block, nir_builder *b, struct state *state)
{
   nir_foreach_instr_safe (instr, block) {
      if (instr->type != nir_instr_type_intrinsic)
         continue;

      nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);

      switch (intr->intrinsic) {
      case nir_intrinsic_load_per_vertex_output: {
         // src[] = { vertex, offset }.

         b->cursor = nir_before_instr(&intr->instr);

         nir_ssa_def *address = nir_load_tess_param_base_ir3(b);
         nir_ssa_def *offset = build_per_vertex_offset(
            b, state, intr->src[0].ssa,
            nir_intrinsic_io_semantics(intr).location,
            nir_intrinsic_component(intr), intr->src[1].ssa);

         replace_intrinsic(b, intr, nir_intrinsic_load_global_ir3, address,
                           offset, NULL);
         break;
      }

      case nir_intrinsic_store_per_vertex_output: {
         // src[] = { value, vertex, offset }.

         b->cursor = nir_before_instr(&intr->instr);

         /* sparse writemask not supported */
         assert(
            util_is_power_of_two_nonzero(nir_intrinsic_write_mask(intr) + 1));

         nir_ssa_def *value = intr->src[0].ssa;
         nir_ssa_def *address = nir_load_tess_param_base_ir3(b);
         nir_ssa_def *offset = build_per_vertex_offset(
            b, state, intr->src[1].ssa,
            nir_intrinsic_io_semantics(intr).location,
            nir_intrinsic_component(intr), intr->src[2].ssa);

         replace_intrinsic(b, intr, nir_intrinsic_store_global_ir3, value,
                           address, offset);

         break;
      }

      case nir_intrinsic_load_output: {
         // src[] = { offset }.

         b->cursor = nir_before_instr(&intr->instr);

         nir_ssa_def *address, *offset;

         /* note if vectorization of the tess level loads ever happens:
          * "ldg" across 16-byte boundaries can behave incorrectly if results
          * are never used. most likely some issue with (sy) not properly
          * syncing with values coming from a second memory transaction.
          */
         gl_varying_slot location = nir_intrinsic_io_semantics(intr).location;
         if (is_tess_levels(location)) {
            assert(intr->dest.ssa.num_components == 1);
            address = nir_load_tess_factor_base_ir3(b);
            offset = build_tessfactor_base(
               b, location, nir_intrinsic_component(intr), state);
         } else {
            address = nir_load_tess_param_base_ir3(b);
            offset = build_patch_offset(b, state, location,
                                        nir_intrinsic_component(intr),
                                        intr->src[0].ssa);
         }

         replace_intrinsic(b, intr, nir_intrinsic_load_global_ir3, address,
                           offset, NULL);
         break;
      }

      case nir_intrinsic_store_output: {
         // src[] = { value, offset }.

         /* write patch output to bo */

         b->cursor = nir_before_instr(&intr->instr);

         /* sparse writemask not supported */
         assert(
            util_is_power_of_two_nonzero(nir_intrinsic_write_mask(intr) + 1));

         gl_varying_slot location = nir_intrinsic_io_semantics(intr).location;
         if (is_tess_levels(location)) {
            uint32_t inner_levels, outer_levels, levels;
            tess_level_components(state, &inner_levels, &outer_levels);

            assert(intr->src[0].ssa->num_components == 1);

            nir_if *nif = NULL;
            if (location != VARYING_SLOT_PRIMITIVE_ID) {
               /* Tess levels are defined as float[4] and float[2], but the
                * tess factor BO has smaller sizes for tris/isolines, so we
                * have to discard any writes beyond the number of components
                * for inner/outer levels.
                */
               if (location == VARYING_SLOT_TESS_LEVEL_OUTER)
                  levels = outer_levels;
               else
                  levels = inner_levels;

               nir_ssa_def *offset = nir_iadd_imm(
                  b, intr->src[1].ssa, nir_intrinsic_component(intr));
               nif = nir_push_if(b, nir_ult(b, offset, nir_imm_int(b, levels)));
            }

            nir_ssa_def *offset = build_tessfactor_base(
               b, location, nir_intrinsic_component(intr), state);

            replace_intrinsic(b, intr, nir_intrinsic_store_global_ir3,
                              intr->src[0].ssa,
                              nir_load_tess_factor_base_ir3(b),
                              nir_iadd(b, intr->src[1].ssa, offset));

            if (location != VARYING_SLOT_PRIMITIVE_ID) {
               nir_pop_if(b, nif);
            }
         } else {
            nir_ssa_def *address = nir_load_tess_param_base_ir3(b);
            nir_ssa_def *offset = build_patch_offset(
               b, state, location, nir_intrinsic_component(intr),
               intr->src[1].ssa);

            replace_intrinsic(b, intr, nir_intrinsic_store_global_ir3,
                              intr->src[0].ssa, address, offset);
         }
         break;
      }

      default:
         break;
      }
   }
}

static void
emit_tess_epilogue(nir_builder *b, struct state *state)
{
   /* Insert endpatch instruction:
    *
    * TODO we should re-work this to use normal flow control.
    */

   nir_end_patch_ir3(b);
}

void
ir3_nir_lower_tess_ctrl(nir_shader *shader, struct ir3_shader_variant *v,
                        unsigned topology)
{
   struct state state = {.topology = topology};

   if (shader_debug_enabled(shader->info.stage)) {
      mesa_logi("NIR (before tess lowering) for %s shader:",
                _mesa_shader_stage_to_string(shader->info.stage));
      nir_log_shaderi(shader);
   }

   build_primitive_map(shader, &state.map);
   memcpy(v->output_loc, state.map.loc, sizeof(v->output_loc));
   v->output_size = state.map.stride;

   nir_function_impl *impl = nir_shader_get_entrypoint(shader);
   assert(impl);

   nir_builder b;
   nir_builder_init(&b, impl);
   b.cursor = nir_before_cf_list(&impl->body);

   state.header = nir_load_tcs_header_ir3(&b);

   /* If required, store gl_PrimitiveID. */
   if (v->key.tcs_store_primid) {
      b.cursor = nir_after_cf_list(&impl->body);

      nir_store_output(&b, nir_load_primitive_id(&b), nir_imm_int(&b, 0),
                       .io_semantics = {
                           .location = VARYING_SLOT_PRIMITIVE_ID,
                           .num_slots = 1
                        });

      b.cursor = nir_before_cf_list(&impl->body);
   }

   nir_foreach_block_safe (block, impl)
      lower_tess_ctrl_block(block, &b, &state);

   /* Now move the body of the TCS into a conditional:
    *
    *   if (gl_InvocationID < num_vertices)
    *     // body
    *
    */
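
   /* More completely, including the epilogue emitted below, the resulting
    * shape is roughly (a sketch, assuming cond_end_ir3 exits all
    * invocations that fail its condition):
    *
    *   if (gl_InvocationID < num_vertices) {
    *     // body
    *     cond_end(gl_InvocationID == 0);  // only invocation 0 continues
    *     end_patch();
    *   }
    */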

   nir_cf_list body;
   nir_cf_extract(&body, nir_before_cf_list(&impl->body),
                  nir_after_cf_list(&impl->body));

   b.cursor = nir_after_cf_list(&impl->body);

   /* Re-emit the header, since the old one got moved into the if branch */
   state.header = nir_load_tcs_header_ir3(&b);
   nir_ssa_def *iid = build_invocation_id(&b, &state);

   const uint32_t nvertices = shader->info.tess.tcs_vertices_out;
   nir_ssa_def *cond = nir_ult(&b, iid, nir_imm_int(&b, nvertices));

   nir_if *nif = nir_push_if(&b, cond);

   nir_cf_reinsert(&body, b.cursor);

   b.cursor = nir_after_cf_list(&nif->then_list);

   /* Insert conditional exit for threads with invocation id != 0 */
   nir_ssa_def *iid0_cond = nir_ieq_imm(&b, iid, 0);
   nir_cond_end_ir3(&b, iid0_cond);

   emit_tess_epilogue(&b, &state);

   nir_pop_if(&b, nif);

   nir_metadata_preserve(impl, nir_metadata_none);
}

static void
lower_tess_eval_block(nir_block *block, nir_builder *b, struct state *state)
{
   nir_foreach_instr_safe (instr, block) {
      if (instr->type != nir_instr_type_intrinsic)
         continue;

      nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);

      switch (intr->intrinsic) {
      case nir_intrinsic_load_tess_coord: {
         b->cursor = nir_after_instr(&intr->instr);
         nir_ssa_def *x = nir_channel(b, &intr->dest.ssa, 0);
         nir_ssa_def *y = nir_channel(b, &intr->dest.ssa, 1);
         nir_ssa_def *z;

         if (state->topology == IR3_TESS_TRIANGLES)
            z = nir_fsub(b, nir_fsub(b, nir_imm_float(b, 1.0f), y), x);
         else
            z = nir_imm_float(b, 0.0f);

         nir_ssa_def *coord = nir_vec3(b, x, y, z);

         nir_ssa_def_rewrite_uses_after(&intr->dest.ssa, coord,
                                        b->cursor.instr);
         break;
      }

      case nir_intrinsic_load_per_vertex_input: {
         // src[] = { vertex, offset }.

         b->cursor = nir_before_instr(&intr->instr);

         nir_ssa_def *address = nir_load_tess_param_base_ir3(b);
         nir_ssa_def *offset = build_per_vertex_offset(
            b, state, intr->src[0].ssa,
            nir_intrinsic_io_semantics(intr).location,
            nir_intrinsic_component(intr), intr->src[1].ssa);

         replace_intrinsic(b, intr, nir_intrinsic_load_global_ir3, address,
                           offset, NULL);
         break;
      }

      case nir_intrinsic_load_input: {
         // src[] = { offset }.

         b->cursor = nir_before_instr(&intr->instr);

         nir_ssa_def *address, *offset;

         /* note if vectorization of the tess level loads ever happens:
          * "ldg" across 16-byte boundaries can behave incorrectly if results
          * are never used. most likely some issue with (sy) not properly
          * syncing with values coming from a second memory transaction.
          */
         gl_varying_slot location = nir_intrinsic_io_semantics(intr).location;
         if (is_tess_levels(location)) {
            assert(intr->dest.ssa.num_components == 1);
            address = nir_load_tess_factor_base_ir3(b);
            offset = build_tessfactor_base(
               b, location, nir_intrinsic_component(intr), state);
         } else {
            address = nir_load_tess_param_base_ir3(b);
            offset = build_patch_offset(b, state, location,
                                        nir_intrinsic_component(intr),
                                        intr->src[0].ssa);
         }

         replace_intrinsic(b, intr, nir_intrinsic_load_global_ir3, address,
                           offset, NULL);
         break;
      }

      default:
         break;
      }
   }
}

void
ir3_nir_lower_tess_eval(nir_shader *shader, struct ir3_shader_variant *v,
                        unsigned topology)
{
   struct state state = {.topology = topology};

   if (shader_debug_enabled(shader->info.stage)) {
      mesa_logi("NIR (before tess lowering) for %s shader:",
                _mesa_shader_stage_to_string(shader->info.stage));
      nir_log_shaderi(shader);
   }

   nir_function_impl *impl = nir_shader_get_entrypoint(shader);
   assert(impl);

   nir_builder b;
   nir_builder_init(&b, impl);

   nir_foreach_block_safe (block, impl)
      lower_tess_eval_block(block, &b, &state);

   v->input_size = calc_primitive_map_size(shader);

   nir_metadata_preserve(impl, nir_metadata_none);
}

static void
lower_gs_block(nir_block *block, nir_builder *b, struct state *state)
{
   nir_foreach_instr_safe (instr, block) {
      if (instr->type != nir_instr_type_intrinsic)
         continue;

      nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);

      switch (intr->intrinsic) {
      case nir_intrinsic_end_primitive: {
         /* Note: This ignores the stream, which seems to match the blob
          * behavior. I'm guessing the HW ignores any extraneous cut
          * signals from an EndPrimitive() that doesn't correspond to the
          * rasterized stream.
          */
         b->cursor = nir_before_instr(&intr->instr);
         nir_store_var(b, state->vertex_flags_out, nir_imm_int(b, 4), 0x1);
         nir_instr_remove(&intr->instr);
         break;
      }

      case nir_intrinsic_emit_vertex: {
         /* Load the vertex count */
         b->cursor = nir_before_instr(&intr->instr);
         nir_ssa_def *count = nir_load_var(b, state->vertex_count_var);

         nir_push_if(b, nir_ieq(b, count, local_thread_id(b)));

         unsigned stream = nir_intrinsic_stream_id(intr);
         /* vertex_flags_out |= stream */
         nir_store_var(b, state->vertex_flags_out,
                       nir_ior(b, nir_load_var(b, state->vertex_flags_out),
                               nir_imm_int(b, stream)),
                       0x1 /* .x */);

         foreach_two_lists (dest_node, &state->emit_outputs, src_node,
                            &state->old_outputs) {
            nir_variable *dest = exec_node_data(nir_variable, dest_node, node);
            nir_variable *src = exec_node_data(nir_variable, src_node, node);
            nir_copy_var(b, dest, src);
         }

         nir_instr_remove(&intr->instr);

         nir_store_var(b, state->emitted_vertex_var,
                       nir_iadd(b, nir_load_var(b, state->emitted_vertex_var),
                                nir_imm_int(b, 1)),
                       0x1);

         nir_pop_if(b, NULL);

         /* Increment the vertex count by 1 */
         nir_store_var(b, state->vertex_count_var,
                       nir_iadd(b, count, nir_imm_int(b, 1)), 0x1); /* .x */
         nir_store_var(b, state->vertex_flags_out, nir_imm_int(b, 0), 0x1);

         break;
      }

      default:
         break;
      }
   }
}
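
/* Roughly, the lowering above rewrites each EmitVertex() into (a sketch,
 * with the stream handling elided):
 *
 *   if (vertex_count == local_thread_id) {
 *     vertex_flags |= stream;
 *     emit_outputs[...] = old_outputs[...];  // snapshot this vertex
 *     emitted_vertex++;
 *   }
 *   vertex_count++;
 *   vertex_flags = 0;
 *
 * i.e. each HW thread handles a single output vertex, and only the thread
 * whose id matches the running vertex count latches the outputs.
 */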

void
ir3_nir_lower_gs(nir_shader *shader)
{
   struct state state = {};

   if (shader_debug_enabled(shader->info.stage)) {
      mesa_logi("NIR (before gs lowering):");
      nir_log_shaderi(shader);
   }

   /* Create an output var for vertex_flags. This will be shadowed below,
    * the same way regular outputs get shadowed, and this variable will
    * become a temporary.
    */
   state.vertex_flags_out = nir_variable_create(
      shader, nir_var_shader_out, glsl_uint_type(), "vertex_flags");
   state.vertex_flags_out->data.driver_location = shader->num_outputs++;
   state.vertex_flags_out->data.location = VARYING_SLOT_GS_VERTEX_FLAGS_IR3;
   state.vertex_flags_out->data.interpolation = INTERP_MODE_NONE;

   nir_function_impl *impl = nir_shader_get_entrypoint(shader);
   assert(impl);

   nir_builder b;
   nir_builder_init(&b, impl);
   b.cursor = nir_before_cf_list(&impl->body);

   state.header = nir_load_gs_header_ir3(&b);

   /* Generate two sets of shadow vars for the output variables.  The first
    * set replaces the real outputs and the second set (emit_outputs) we'll
    * assign in the emit_vertex conditionals.  Then at the end of the shader
    * we copy the emit_outputs to the real outputs, so that we get
    * store_output in uniform control flow.
    */
   exec_list_make_empty(&state.old_outputs);
   nir_foreach_shader_out_variable_safe (var, shader) {
      exec_node_remove(&var->node);
      exec_list_push_tail(&state.old_outputs, &var->node);
   }
   exec_list_make_empty(&state.new_outputs);
   exec_list_make_empty(&state.emit_outputs);
   nir_foreach_variable_in_list (var, &state.old_outputs) {
      /* Create a new output var by cloning the original output var and
       * stealing the name.
       */
      nir_variable *output = nir_variable_clone(var, shader);
      exec_list_push_tail(&state.new_outputs, &output->node);

      /* Rewrite the original output to be a shadow variable. */
      var->name = ralloc_asprintf(var, "%s@gs-temp", output->name);
      var->data.mode = nir_var_shader_temp;

      /* Clone the shadow variable to create the emit shadow variable that
       * we'll assign in the emit conditionals.
       */
      nir_variable *emit_output = nir_variable_clone(var, shader);
      emit_output->name = ralloc_asprintf(var, "%s@emit-temp", output->name);
      exec_list_push_tail(&state.emit_outputs, &emit_output->node);
   }

   /* During the shader we'll keep track of which vertex we're currently
    * emitting for the EmitVertex test and how many vertices we emitted, so
    * we know to discard if we didn't emit any.  In most simple shaders,
    * this can all be statically determined and gets optimized away.
    */
   state.vertex_count_var =
      nir_local_variable_create(impl, glsl_uint_type(), "vertex_count");
   state.emitted_vertex_var =
      nir_local_variable_create(impl, glsl_uint_type(), "emitted_vertex");

   /* Initialize the counters to 0, and vertex_flags to 4 (the cut bit, so
    * the first emitted vertex starts a new primitive). */
   b.cursor = nir_before_cf_list(&impl->body);
   nir_store_var(&b, state.vertex_count_var, nir_imm_int(&b, 0), 0x1);
   nir_store_var(&b, state.emitted_vertex_var, nir_imm_int(&b, 0), 0x1);
   nir_store_var(&b, state.vertex_flags_out, nir_imm_int(&b, 4), 0x1);

   nir_foreach_block_safe (block, impl)
      lower_gs_block(block, &b, &state);

   set_foreach (impl->end_block->predecessors, block_entry) {
      struct nir_block *block = (void *)block_entry->key;
      b.cursor = nir_after_block_before_jump(block);

      nir_ssa_def *cond =
         nir_ieq_imm(&b, nir_load_var(&b, state.emitted_vertex_var), 0);

      nir_discard_if(&b, cond);

      foreach_two_lists (dest_node, &state.new_outputs, src_node,
                         &state.emit_outputs) {
         nir_variable *dest = exec_node_data(nir_variable, dest_node, node);
         nir_variable *src = exec_node_data(nir_variable, src_node, node);
         nir_copy_var(&b, dest, src);
      }
   }

   exec_list_append(&shader->variables, &state.old_outputs);
   exec_list_append(&shader->variables, &state.emit_outputs);
   exec_list_append(&shader->variables, &state.new_outputs);

   nir_metadata_preserve(impl, nir_metadata_none);

   nir_lower_global_vars_to_local(shader);
   nir_split_var_copies(shader);
   nir_lower_var_copies(shader);

   nir_fixup_deref_modes(shader);

   if (shader_debug_enabled(shader->info.stage)) {
      mesa_logi("NIR (after gs lowering):");
      nir_log_shaderi(shader);
   }
}