1 /*
2 * Copyright © 2019 Google, Inc.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 * SOFTWARE.
22 */
23
24 #include "compiler/nir/nir_builder.h"
25 #include "ir3_compiler.h"
26 #include "ir3_nir.h"
27
/* Per-pass scratch state shared by the lowering helpers below.  A fresh
 * instance is zero-initialized by each entry point; only the fields relevant
 * to that particular lowering are used.
 */
struct state {
   /* IR3_TESS_* value describing the tessellation output topology. */
   uint32_t topology;

   /* Layout of the producer's outputs: loc[] maps the "unique index" (see
    * shader_io_get_unique_index()) of each written slot to its offset, and
    * stride is the total per-vertex footprint.
    */
   struct primitive_map {
      unsigned loc[32 + 4]; /* +POSITION +PSIZE +CLIP_DIST0 +CLIP_DIST1 */
      unsigned stride;
   } map;

   /* Stage header system value (tcs_header_ir3 or gs_header_ir3), loaded
    * once at the top of the shader; invocation/vertex/primitive ids are
    * extracted from its bitfields.
    */
   nir_ssa_def *header;

   /* GS lowering only: running vertex counter, count of vertices actually
    * emitted (used to discard threads that emitted nothing), and the
    * vertex_flags output used for stream/cut signalling.
    */
   nir_variable *vertex_count_var;
   nir_variable *emitted_vertex_var;
   nir_variable *vertex_flags_out;

   /* GS lowering only: the original output variables (demoted to temps),
    * their replacement outputs, and the shadow copies assigned inside the
    * EmitVertex() conditionals.
    */
   struct exec_list old_outputs;
   struct exec_list new_outputs;
   struct exec_list emit_outputs;

   /* tess ctrl shader on a650 gets the local primitive id at different bits: */
   unsigned local_primitive_id_start;
};
49
50 static nir_ssa_def *
bitfield_extract(nir_builder * b,nir_ssa_def * v,uint32_t start,uint32_t mask)51 bitfield_extract(nir_builder *b, nir_ssa_def *v, uint32_t start, uint32_t mask)
52 {
53 return nir_iand(b, nir_ushr(b, v, nir_imm_int(b, start)),
54 nir_imm_int(b, mask));
55 }
56
57 static nir_ssa_def *
build_invocation_id(nir_builder * b,struct state * state)58 build_invocation_id(nir_builder *b, struct state *state)
59 {
60 return bitfield_extract(b, state->header, 11, 31);
61 }
62
63 static nir_ssa_def *
build_vertex_id(nir_builder * b,struct state * state)64 build_vertex_id(nir_builder *b, struct state *state)
65 {
66 return bitfield_extract(b, state->header, 6, 31);
67 }
68
69 static nir_ssa_def *
build_local_primitive_id(nir_builder * b,struct state * state)70 build_local_primitive_id(nir_builder *b, struct state *state)
71 {
72 return bitfield_extract(b, state->header, state->local_primitive_id_start,
73 63);
74 }
75
76 static bool
is_tess_levels(gl_varying_slot slot)77 is_tess_levels(gl_varying_slot slot)
78 {
79 return (slot == VARYING_SLOT_PRIMITIVE_ID ||
80 slot == VARYING_SLOT_TESS_LEVEL_OUTER ||
81 slot == VARYING_SLOT_TESS_LEVEL_INNER);
82 }
83
84 /* Return a deterministic index for varyings. We can't rely on driver_location
85 * to be correct without linking the different stages first, so we create
86 * "primitive maps" where the producer decides on the location of each varying
87 * slot and then exports a per-slot array to the consumer. This compacts the
88 * gl_varying_slot space down a bit so that the primitive maps aren't too
89 * large.
90 *
91 * Note: per-patch varyings are currently handled separately, without any
92 * compacting.
93 *
94 * TODO: We could probably use the driver_location's directly in the non-SSO
95 * (Vulkan) case.
96 */
97
98 static unsigned
shader_io_get_unique_index(gl_varying_slot slot)99 shader_io_get_unique_index(gl_varying_slot slot)
100 {
101 if (slot == VARYING_SLOT_POS)
102 return 0;
103 if (slot == VARYING_SLOT_PSIZ)
104 return 1;
105 if (slot == VARYING_SLOT_CLIP_DIST0)
106 return 2;
107 if (slot == VARYING_SLOT_CLIP_DIST1)
108 return 3;
109 if (slot >= VARYING_SLOT_VAR0 && slot <= VARYING_SLOT_VAR31)
110 return 4 + (slot - VARYING_SLOT_VAR0);
111 unreachable("illegal slot in get unique index\n");
112 }
113
114 static nir_ssa_def *
build_local_offset(nir_builder * b,struct state * state,nir_ssa_def * vertex,uint32_t location,uint32_t comp,nir_ssa_def * offset)115 build_local_offset(nir_builder *b, struct state *state, nir_ssa_def *vertex,
116 uint32_t location, uint32_t comp, nir_ssa_def *offset)
117 {
118 nir_ssa_def *primitive_stride = nir_load_vs_primitive_stride_ir3(b);
119 nir_ssa_def *primitive_offset =
120 nir_imul24(b, build_local_primitive_id(b, state), primitive_stride);
121 nir_ssa_def *attr_offset;
122 nir_ssa_def *vertex_stride;
123 unsigned index = shader_io_get_unique_index(location);
124
125 switch (b->shader->info.stage) {
126 case MESA_SHADER_VERTEX:
127 case MESA_SHADER_TESS_EVAL:
128 vertex_stride = nir_imm_int(b, state->map.stride * 4);
129 attr_offset = nir_imm_int(b, state->map.loc[index] + 4 * comp);
130 break;
131 case MESA_SHADER_TESS_CTRL:
132 case MESA_SHADER_GEOMETRY:
133 vertex_stride = nir_load_vs_vertex_stride_ir3(b);
134 attr_offset = nir_iadd(b, nir_load_primitive_location_ir3(b, index),
135 nir_imm_int(b, comp * 4));
136 break;
137 default:
138 unreachable("bad shader stage");
139 }
140
141 nir_ssa_def *vertex_offset = nir_imul24(b, vertex, vertex_stride);
142
143 return nir_iadd(
144 b, nir_iadd(b, primitive_offset, vertex_offset),
145 nir_iadd(b, attr_offset, nir_ishl(b, offset, nir_imm_int(b, 4))));
146 }
147
148 static nir_intrinsic_instr *
replace_intrinsic(nir_builder * b,nir_intrinsic_instr * intr,nir_intrinsic_op op,nir_ssa_def * src0,nir_ssa_def * src1,nir_ssa_def * src2)149 replace_intrinsic(nir_builder *b, nir_intrinsic_instr *intr,
150 nir_intrinsic_op op, nir_ssa_def *src0, nir_ssa_def *src1,
151 nir_ssa_def *src2)
152 {
153 nir_intrinsic_instr *new_intr = nir_intrinsic_instr_create(b->shader, op);
154
155 new_intr->src[0] = nir_src_for_ssa(src0);
156 if (src1)
157 new_intr->src[1] = nir_src_for_ssa(src1);
158 if (src2)
159 new_intr->src[2] = nir_src_for_ssa(src2);
160
161 new_intr->num_components = intr->num_components;
162
163 if (nir_intrinsic_infos[op].has_dest)
164 nir_ssa_dest_init(&new_intr->instr, &new_intr->dest, intr->num_components,
165 intr->dest.ssa.bit_size, NULL);
166
167 nir_builder_instr_insert(b, &new_intr->instr);
168
169 if (nir_intrinsic_infos[op].has_dest)
170 nir_ssa_def_rewrite_uses(&intr->dest.ssa, &new_intr->dest.ssa);
171
172 nir_instr_remove(&intr->instr);
173
174 return new_intr;
175 }
176
177 static void
build_primitive_map(nir_shader * shader,struct primitive_map * map)178 build_primitive_map(nir_shader *shader, struct primitive_map *map)
179 {
180 /* All interfaces except the TCS <-> TES interface use ldlw, which takes
181 * an offset in bytes, so each vec4 slot is 16 bytes. TCS <-> TES uses
182 * ldg, which takes an offset in dwords, but each per-vertex slot has
183 * space for every vertex, and there's space at the beginning for
184 * per-patch varyings.
185 */
186 unsigned slot_size = 16, start = 0;
187 if (shader->info.stage == MESA_SHADER_TESS_CTRL) {
188 slot_size = shader->info.tess.tcs_vertices_out * 4;
189 start = util_last_bit(shader->info.patch_outputs_written) * 4;
190 }
191
192 uint64_t mask = shader->info.outputs_written;
193 unsigned loc = start;
194 while (mask) {
195 int location = u_bit_scan64(&mask);
196 if (is_tess_levels(location))
197 continue;
198
199 unsigned index = shader_io_get_unique_index(location);
200 map->loc[index] = loc;
201 loc += slot_size;
202 }
203
204 map->stride = loc;
205 /* Use units of dwords for the stride. */
206 if (shader->info.stage != MESA_SHADER_TESS_CTRL)
207 map->stride /= 4;
208 }
209
210 /* For shader stages that receive a primitive map, calculate how big it should
211 * be.
212 */
213
214 static unsigned
calc_primitive_map_size(nir_shader * shader)215 calc_primitive_map_size(nir_shader *shader)
216 {
217 uint64_t mask = shader->info.inputs_read;
218 unsigned max_index = 0;
219 while (mask) {
220 int location = u_bit_scan64(&mask);
221
222 if (is_tess_levels(location))
223 continue;
224
225 unsigned index = shader_io_get_unique_index(location);
226 max_index = MAX2(max_index, index + 1);
227 }
228
229 return max_index;
230 }
231
232 static void
lower_block_to_explicit_output(nir_block * block,nir_builder * b,struct state * state)233 lower_block_to_explicit_output(nir_block *block, nir_builder *b,
234 struct state *state)
235 {
236 nir_foreach_instr_safe (instr, block) {
237 if (instr->type != nir_instr_type_intrinsic)
238 continue;
239
240 nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
241
242 switch (intr->intrinsic) {
243 case nir_intrinsic_store_output: {
244 // src[] = { value, offset }.
245
246 /* nir_lower_io_to_temporaries replaces all access to output
247 * variables with temp variables and then emits a nir_copy_var at
248 * the end of the shader. Thus, we should always get a full wrmask
249 * here.
250 */
251 assert(
252 util_is_power_of_two_nonzero(nir_intrinsic_write_mask(intr) + 1));
253
254 b->cursor = nir_instr_remove(&intr->instr);
255
256 nir_ssa_def *vertex_id = build_vertex_id(b, state);
257 nir_ssa_def *offset = build_local_offset(
258 b, state, vertex_id, nir_intrinsic_io_semantics(intr).location,
259 nir_intrinsic_component(intr), intr->src[1].ssa);
260
261 nir_store_shared_ir3(b, intr->src[0].ssa, offset);
262 break;
263 }
264
265 default:
266 break;
267 }
268 }
269 }
270
271 static nir_ssa_def *
local_thread_id(nir_builder * b)272 local_thread_id(nir_builder *b)
273 {
274 return bitfield_extract(b, nir_load_gs_header_ir3(b), 16, 1023);
275 }
276
/* Lower a producer stage's (VS/TES feeding TCS or GS) store_output
 * intrinsics to explicit LDS stores, and record the output layout and size
 * on the shader variant.
 */
void
ir3_nir_lower_to_explicit_output(nir_shader *shader,
                                 struct ir3_shader_variant *v,
                                 unsigned topology)
{
   struct state state = {};

   /* Decide where each written output lives and publish the layout so the
    * driver can feed it to the consuming stage.
    */
   build_primitive_map(shader, &state.map);
   memcpy(v->output_loc, state.map.loc, sizeof(v->output_loc));

   nir_function_impl *impl = nir_shader_get_entrypoint(shader);
   assert(impl);

   nir_builder b;
   nir_builder_init(&b, impl);
   b.cursor = nir_before_cf_list(&impl->body);

   /* A VS feeding tessellation reads the TCS header; every other producer
    * reads the GS header.
    */
   if (v->type == MESA_SHADER_VERTEX && topology != IR3_TESS_NONE)
      state.header = nir_load_tcs_header_ir3(&b);
   else
      state.header = nir_load_gs_header_ir3(&b);

   nir_foreach_block_safe (block, impl)
      lower_block_to_explicit_output(block, &b, &state);

   /* Only intrinsics were swapped in place, so CFG metadata is intact. */
   nir_metadata_preserve(impl,
                         nir_metadata_block_index | nir_metadata_dominance);

   v->output_size = state.map.stride;
}
307
308 static void
lower_block_to_explicit_input(nir_block * block,nir_builder * b,struct state * state)309 lower_block_to_explicit_input(nir_block *block, nir_builder *b,
310 struct state *state)
311 {
312 nir_foreach_instr_safe (instr, block) {
313 if (instr->type != nir_instr_type_intrinsic)
314 continue;
315
316 nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
317
318 switch (intr->intrinsic) {
319 case nir_intrinsic_load_per_vertex_input: {
320 // src[] = { vertex, offset }.
321
322 b->cursor = nir_before_instr(&intr->instr);
323
324 nir_ssa_def *offset = build_local_offset(
325 b, state,
326 intr->src[0].ssa, // this is typically gl_InvocationID
327 nir_intrinsic_io_semantics(intr).location,
328 nir_intrinsic_component(intr), intr->src[1].ssa);
329
330 replace_intrinsic(b, intr, nir_intrinsic_load_shared_ir3, offset, NULL,
331 NULL);
332 break;
333 }
334
335 case nir_intrinsic_load_invocation_id: {
336 b->cursor = nir_before_instr(&intr->instr);
337
338 nir_ssa_def *iid = build_invocation_id(b, state);
339 nir_ssa_def_rewrite_uses(&intr->dest.ssa, iid);
340 nir_instr_remove(&intr->instr);
341 break;
342 }
343
344 default:
345 break;
346 }
347 }
348 }
349
/* Lower a consumer stage's (TCS/GS) per-vertex input loads to explicit LDS
 * reads, and record how large a primitive map the stage expects.
 */
void
ir3_nir_lower_to_explicit_input(nir_shader *shader,
                                struct ir3_shader_variant *v)
{
   struct state state = {};

   /* when using stl/ldl (instead of stlw/ldlw) for linking VS and HS,
    * HS uses a different primitive id, which starts at bit 16 in the header
    */
   if (shader->info.stage == MESA_SHADER_TESS_CTRL &&
       v->shader->compiler->tess_use_shared)
      state.local_primitive_id_start = 16;

   nir_function_impl *impl = nir_shader_get_entrypoint(shader);
   assert(impl);

   nir_builder b;
   nir_builder_init(&b, impl);
   b.cursor = nir_before_cf_list(&impl->body);

   /* GS reads the GS header; anything else here (TCS) reads the TCS one. */
   if (shader->info.stage == MESA_SHADER_GEOMETRY)
      state.header = nir_load_gs_header_ir3(&b);
   else
      state.header = nir_load_tcs_header_ir3(&b);

   nir_foreach_block_safe (block, impl)
      lower_block_to_explicit_input(block, &b, &state);

   v->input_size = calc_primitive_map_size(shader);
}
380
381 static nir_ssa_def *
build_tcs_out_vertices(nir_builder * b)382 build_tcs_out_vertices(nir_builder *b)
383 {
384 if (b->shader->info.stage == MESA_SHADER_TESS_CTRL)
385 return nir_imm_int(b, b->shader->info.tess.tcs_vertices_out);
386 else
387 return nir_load_patch_vertices_in(b);
388 }
389
/* Compute the dword offset (for load/store_global_ir3) of component `comp`
 * of per-vertex varying `location` in the tess param BO.  When `vertex` is
 * NULL this instead addresses a per-patch varying (see build_patch_offset),
 * in which case `location` must be in the PATCH0..TESS_MAX range.
 */
static nir_ssa_def *
build_per_vertex_offset(nir_builder *b, struct state *state,
                        nir_ssa_def *vertex, uint32_t location, uint32_t comp,
                        nir_ssa_def *offset)
{
   /* Skip past earlier patches' data in the BO. */
   nir_ssa_def *patch_id = nir_load_rel_patch_id_ir3(b);
   nir_ssa_def *patch_stride = nir_load_hs_patch_stride_ir3(b);
   nir_ssa_def *patch_offset = nir_imul24(b, patch_id, patch_stride);
   nir_ssa_def *attr_offset;

   if (nir_src_is_const(nir_src_for_ssa(offset))) {
      /* Fold a constant vec4-unit offset directly into the slot number. */
      location += nir_src_as_uint(nir_src_for_ssa(offset));
      offset = nir_imm_int(b, 0);
   } else {
      /* Offset is in vec4's, but we need it in unit of components for the
       * load/store_global_ir3 offset.
       */
      offset = nir_ishl(b, offset, nir_imm_int(b, 2));
   }

   nir_ssa_def *vertex_offset;
   if (vertex) {
      unsigned index = shader_io_get_unique_index(location);
      switch (b->shader->info.stage) {
      case MESA_SHADER_TESS_CTRL:
         /* Writing our own outputs: layout was computed at compile time. */
         attr_offset = nir_imm_int(b, state->map.loc[index] + comp);
         break;
      case MESA_SHADER_TESS_EVAL:
         /* Reading TCS outputs: layout comes from the driver-supplied
          * primitive map.
          */
         attr_offset = nir_iadd(b, nir_load_primitive_location_ir3(b, index),
                                nir_imm_int(b, comp));
         break;
      default:
         unreachable("bad shader state");
      }

      /* Each slot holds one vec4 per output vertex (see the TCS slot_size
       * in build_primitive_map), so scale the remaining dynamic slot offset
       * by the patch vertex count, and step 4 dwords per vertex.
       */
      attr_offset = nir_iadd(b, attr_offset,
                             nir_imul24(b, offset, build_tcs_out_vertices(b)));
      vertex_offset = nir_ishl(b, vertex, nir_imm_int(b, 2));
   } else {
      /* Per-patch varyings sit at the start of the patch, one vec4 each. */
      assert(location >= VARYING_SLOT_PATCH0 &&
             location <= VARYING_SLOT_TESS_MAX);
      unsigned index = location - VARYING_SLOT_PATCH0;
      attr_offset = nir_iadd(b, nir_imm_int(b, index * 4 + comp), offset);
      vertex_offset = nir_imm_int(b, 0);
   }

   return nir_iadd(b, nir_iadd(b, patch_offset, attr_offset), vertex_offset);
}
438
/* Offset of a per-patch varying in the tess param BO; a per-vertex offset
 * with no vertex index (build_per_vertex_offset handles vertex == NULL).
 */
static nir_ssa_def *
build_patch_offset(nir_builder *b, struct state *state, uint32_t base,
                   uint32_t comp, nir_ssa_def *offset)
{
   return build_per_vertex_offset(b, state, NULL, base, comp, offset);
}
445
446 static void
tess_level_components(struct state * state,uint32_t * inner,uint32_t * outer)447 tess_level_components(struct state *state, uint32_t *inner, uint32_t *outer)
448 {
449 switch (state->topology) {
450 case IR3_TESS_TRIANGLES:
451 *inner = 1;
452 *outer = 3;
453 break;
454 case IR3_TESS_QUADS:
455 *inner = 2;
456 *outer = 4;
457 break;
458 case IR3_TESS_ISOLINES:
459 *inner = 0;
460 *outer = 2;
461 break;
462 default:
463 unreachable("bad");
464 }
465 }
466
467 static nir_ssa_def *
build_tessfactor_base(nir_builder * b,gl_varying_slot slot,uint32_t comp,struct state * state)468 build_tessfactor_base(nir_builder *b, gl_varying_slot slot, uint32_t comp,
469 struct state *state)
470 {
471 uint32_t inner_levels, outer_levels;
472 tess_level_components(state, &inner_levels, &outer_levels);
473
474 const uint32_t patch_stride = 1 + inner_levels + outer_levels;
475
476 nir_ssa_def *patch_id = nir_load_rel_patch_id_ir3(b);
477
478 nir_ssa_def *patch_offset =
479 nir_imul24(b, patch_id, nir_imm_int(b, patch_stride));
480
481 uint32_t offset;
482 switch (slot) {
483 case VARYING_SLOT_PRIMITIVE_ID:
484 offset = 0;
485 break;
486 case VARYING_SLOT_TESS_LEVEL_OUTER:
487 offset = 1;
488 break;
489 case VARYING_SLOT_TESS_LEVEL_INNER:
490 offset = 1 + outer_levels;
491 break;
492 default:
493 unreachable("bad");
494 }
495
496 return nir_iadd(b, patch_offset, nir_imm_int(b, offset + comp));
497 }
498
/* Rewrite TCS output intrinsics into global loads/stores: regular per-vertex
 * and per-patch outputs go to the tess param BO, while gl_TessLevel* and
 * gl_PrimitiveID go to the tess factor BO.
 */
static void
lower_tess_ctrl_block(nir_block *block, nir_builder *b, struct state *state)
{
   nir_foreach_instr_safe (instr, block) {
      if (instr->type != nir_instr_type_intrinsic)
         continue;

      nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);

      switch (intr->intrinsic) {
      case nir_intrinsic_load_per_vertex_output: {
         // src[] = { vertex, offset }.

         /* Reading back (possibly another invocation's) per-vertex output
          * from the tess param BO.
          */
         b->cursor = nir_before_instr(&intr->instr);

         nir_ssa_def *address = nir_load_tess_param_base_ir3(b);
         nir_ssa_def *offset = build_per_vertex_offset(
            b, state, intr->src[0].ssa,
            nir_intrinsic_io_semantics(intr).location,
            nir_intrinsic_component(intr), intr->src[1].ssa);

         replace_intrinsic(b, intr, nir_intrinsic_load_global_ir3, address,
                           offset, NULL);
         break;
      }

      case nir_intrinsic_store_per_vertex_output: {
         // src[] = { value, vertex, offset }.

         b->cursor = nir_before_instr(&intr->instr);

         /* sparse writemask not supported */
         assert(
            util_is_power_of_two_nonzero(nir_intrinsic_write_mask(intr) + 1));

         nir_ssa_def *value = intr->src[0].ssa;
         nir_ssa_def *address = nir_load_tess_param_base_ir3(b);
         nir_ssa_def *offset = build_per_vertex_offset(
            b, state, intr->src[1].ssa,
            nir_intrinsic_io_semantics(intr).location,
            nir_intrinsic_component(intr), intr->src[2].ssa);

         replace_intrinsic(b, intr, nir_intrinsic_store_global_ir3, value,
                           address, offset);

         break;
      }

      case nir_intrinsic_load_output: {
         // src[] = { offset }.

         b->cursor = nir_before_instr(&intr->instr);

         nir_ssa_def *address, *offset;

         /* note if vectorization of the tess level loads ever happens:
          * "ldg" across 16-byte boundaries can behave incorrectly if results
          * are never used. most likely some issue with (sy) not properly
          * syncing with values coming from a second memory transaction.
          */
         gl_varying_slot location = nir_intrinsic_io_semantics(intr).location;
         if (is_tess_levels(location)) {
            /* Tess levels / primitive id live in the tess factor BO. */
            assert(intr->dest.ssa.num_components == 1);
            address = nir_load_tess_factor_base_ir3(b);
            offset = build_tessfactor_base(
               b, location, nir_intrinsic_component(intr), state);
         } else {
            /* Per-patch varyings live in the tess param BO. */
            address = nir_load_tess_param_base_ir3(b);
            offset = build_patch_offset(b, state, location,
                                        nir_intrinsic_component(intr),
                                        intr->src[0].ssa);
         }

         replace_intrinsic(b, intr, nir_intrinsic_load_global_ir3, address,
                           offset, NULL);
         break;
      }

      case nir_intrinsic_store_output: {
         // src[] = { value, offset }.

         /* write patch output to bo */

         b->cursor = nir_before_instr(&intr->instr);

         /* sparse writemask not supported */
         assert(
            util_is_power_of_two_nonzero(nir_intrinsic_write_mask(intr) + 1));

         gl_varying_slot location = nir_intrinsic_io_semantics(intr).location;
         if (is_tess_levels(location)) {
            uint32_t inner_levels, outer_levels, levels;
            tess_level_components(state, &inner_levels, &outer_levels);

            assert(intr->src[0].ssa->num_components == 1);

            /* Tess level writes (but not gl_PrimitiveID) are wrapped in an
             * if so that components beyond the BO's per-topology size are
             * dropped rather than clobbering the next patch's record.
             */
            nir_if *nif = NULL;
            if (location != VARYING_SLOT_PRIMITIVE_ID) {
               /* with tess levels are defined as float[4] and float[2],
                * but tess factor BO has smaller sizes for tris/isolines,
                * so we have to discard any writes beyond the number of
                * components for inner/outer levels
                */
               if (location == VARYING_SLOT_TESS_LEVEL_OUTER)
                  levels = outer_levels;
               else
                  levels = inner_levels;

               nir_ssa_def *offset = nir_iadd_imm(
                  b, intr->src[1].ssa, nir_intrinsic_component(intr));
               nif = nir_push_if(b, nir_ult(b, offset, nir_imm_int(b, levels)));
            }

            nir_ssa_def *offset = build_tessfactor_base(
               b, location, nir_intrinsic_component(intr), state);

            replace_intrinsic(b, intr, nir_intrinsic_store_global_ir3,
                              intr->src[0].ssa,
                              nir_load_tess_factor_base_ir3(b),
                              nir_iadd(b, intr->src[1].ssa, offset));

            if (location != VARYING_SLOT_PRIMITIVE_ID) {
               nir_pop_if(b, nif);
            }
         } else {
            nir_ssa_def *address = nir_load_tess_param_base_ir3(b);
            nir_ssa_def *offset = build_patch_offset(
               b, state, location, nir_intrinsic_component(intr),
               intr->src[1].ssa);

            replace_intrinsic(b, intr, nir_intrinsic_store_global_ir3,
                              intr->src[0].ssa, address, offset);
         }
         break;
      }

      default:
         break;
      }
   }
}
640
/* Emit the TCS epilogue: the end_patch instruction that hands the patch off
 * to the fixed-function tessellator.  `state` is currently unused.
 *
 * NOTE(review): the name is a misspelling of "epilogue"; left as-is since
 * it is referenced elsewhere in this file.
 */
static void
emit_tess_epilouge(nir_builder *b, struct state *state)
{
   /* Insert endpatch instruction:
    *
    * TODO we should re-work this to use normal flow control.
    */

   nir_end_patch_ir3(b);
}
651
/* Lower the TCS for ir3: rewrite output intrinsics to BO accesses, then
 * guard the entire body with `gl_InvocationID < tcs_vertices_out` (the HW
 * may launch more invocations than vertices), and have invocation 0 alone
 * run the end-patch epilogue.
 */
void
ir3_nir_lower_tess_ctrl(nir_shader *shader, struct ir3_shader_variant *v,
                        unsigned topology)
{
   struct state state = {.topology = topology};

   if (shader_debug_enabled(shader->info.stage)) {
      mesa_logi("NIR (before tess lowering) for %s shader:",
                _mesa_shader_stage_to_string(shader->info.stage));
      nir_log_shaderi(shader);
   }

   /* Lay out this shader's outputs and publish layout/size on the variant. */
   build_primitive_map(shader, &state.map);
   memcpy(v->output_loc, state.map.loc, sizeof(v->output_loc));
   v->output_size = state.map.stride;

   nir_function_impl *impl = nir_shader_get_entrypoint(shader);
   assert(impl);

   nir_builder b;
   nir_builder_init(&b, impl);
   b.cursor = nir_before_cf_list(&impl->body);

   state.header = nir_load_tcs_header_ir3(&b);

   /* If required, store gl_PrimitiveID. */
   if (v->key.tcs_store_primid) {
      b.cursor = nir_after_cf_list(&impl->body);

      nir_store_output(&b, nir_load_primitive_id(&b), nir_imm_int(&b, 0),
                       .io_semantics = {
                          .location = VARYING_SLOT_PRIMITIVE_ID,
                          .num_slots = 1
                       });

      /* Restore the cursor so lowering below sees the whole body. */
      b.cursor = nir_before_cf_list(&impl->body);
   }

   nir_foreach_block_safe (block, impl)
      lower_tess_ctrl_block(block, &b, &state);

   /* Now move the body of the TCS into a conditional:
    *
    * if (gl_InvocationID < num_vertices)
    *   // body
    *
    */

   nir_cf_list body;
   nir_cf_extract(&body, nir_before_cf_list(&impl->body),
                  nir_after_cf_list(&impl->body));

   b.cursor = nir_after_cf_list(&impl->body);

   /* Re-emit the header, since the old one got moved into the if branch */
   state.header = nir_load_tcs_header_ir3(&b);
   nir_ssa_def *iid = build_invocation_id(&b, &state);

   const uint32_t nvertices = shader->info.tess.tcs_vertices_out;
   nir_ssa_def *cond = nir_ult(&b, iid, nir_imm_int(&b, nvertices));

   nir_if *nif = nir_push_if(&b, cond);

   nir_cf_reinsert(&body, b.cursor);

   b.cursor = nir_after_cf_list(&nif->then_list);

   /* Insert conditional exit for threads invocation id != 0 */
   nir_ssa_def *iid0_cond = nir_ieq_imm(&b, iid, 0);
   nir_cond_end_ir3(&b, iid0_cond);

   /* Only invocation 0 reaches this point to emit end_patch. */
   emit_tess_epilouge(&b, &state);

   nir_pop_if(&b, nif);

   /* Control flow was restructured wholesale; nothing is preserved. */
   nir_metadata_preserve(impl, nir_metadata_none);
}
729
/* Rewrite TES input intrinsics: derive the third tess coord component, and
 * turn per-vertex/per-patch/tess-level loads into global reads from the
 * tess param and tess factor BOs.
 */
static void
lower_tess_eval_block(nir_block *block, nir_builder *b, struct state *state)
{
   nir_foreach_instr_safe (instr, block) {
      if (instr->type != nir_instr_type_intrinsic)
         continue;

      nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);

      switch (intr->intrinsic) {
      case nir_intrinsic_load_tess_coord: {
         /* Keep the intrinsic's x/y, and compute z here: for triangles the
          * barycentrics sum to 1 (z = 1 - x - y); otherwise z is unused and
          * forced to 0.
          */
         b->cursor = nir_after_instr(&intr->instr);
         nir_ssa_def *x = nir_channel(b, &intr->dest.ssa, 0);
         nir_ssa_def *y = nir_channel(b, &intr->dest.ssa, 1);
         nir_ssa_def *z;

         if (state->topology == IR3_TESS_TRIANGLES)
            z = nir_fsub(b, nir_fsub(b, nir_imm_float(b, 1.0f), y), x);
         else
            z = nir_imm_float(b, 0.0f);

         nir_ssa_def *coord = nir_vec3(b, x, y, z);

         /* Rewrite uses *after* the channel extracts above, which must keep
          * reading the original intrinsic.
          */
         nir_ssa_def_rewrite_uses_after(&intr->dest.ssa, coord,
                                        b->cursor.instr);
         break;
      }

      case nir_intrinsic_load_per_vertex_input: {
         // src[] = { vertex, offset }.

         b->cursor = nir_before_instr(&intr->instr);

         nir_ssa_def *address = nir_load_tess_param_base_ir3(b);
         nir_ssa_def *offset = build_per_vertex_offset(
            b, state, intr->src[0].ssa,
            nir_intrinsic_io_semantics(intr).location,
            nir_intrinsic_component(intr), intr->src[1].ssa);

         replace_intrinsic(b, intr, nir_intrinsic_load_global_ir3, address,
                           offset, NULL);
         break;
      }

      case nir_intrinsic_load_input: {
         // src[] = { offset }.

         b->cursor = nir_before_instr(&intr->instr);

         nir_ssa_def *address, *offset;

         /* note if vectorization of the tess level loads ever happens:
          * "ldg" across 16-byte boundaries can behave incorrectly if results
          * are never used. most likely some issue with (sy) not properly
          * syncing with values coming from a second memory transaction.
          */
         gl_varying_slot location = nir_intrinsic_io_semantics(intr).location;
         if (is_tess_levels(location)) {
            /* Tess levels / primitive id come from the tess factor BO. */
            assert(intr->dest.ssa.num_components == 1);
            address = nir_load_tess_factor_base_ir3(b);
            offset = build_tessfactor_base(
               b, location, nir_intrinsic_component(intr), state);
         } else {
            /* Per-patch varyings come from the tess param BO. */
            address = nir_load_tess_param_base_ir3(b);
            offset = build_patch_offset(b, state, location,
                                        nir_intrinsic_component(intr),
                                        intr->src[0].ssa);
         }

         replace_intrinsic(b, intr, nir_intrinsic_load_global_ir3, address,
                           offset, NULL);
         break;
      }

      default:
         break;
      }
   }
}
809
/* Lower the TES for ir3: rewrite its input intrinsics to BO reads (see
 * lower_tess_eval_block) and record the expected primitive map size.
 */
void
ir3_nir_lower_tess_eval(nir_shader *shader, struct ir3_shader_variant *v,
                        unsigned topology)
{
   struct state state = {.topology = topology};

   if (shader_debug_enabled(shader->info.stage)) {
      mesa_logi("NIR (before tess lowering) for %s shader:",
                _mesa_shader_stage_to_string(shader->info.stage));
      nir_log_shaderi(shader);
   }

   nir_function_impl *impl = nir_shader_get_entrypoint(shader);
   assert(impl);

   nir_builder b;
   nir_builder_init(&b, impl);

   nir_foreach_block_safe (block, impl)
      lower_tess_eval_block(block, &b, &state);

   v->input_size = calc_primitive_map_size(shader);

   nir_metadata_preserve(impl, nir_metadata_none);
}
835
/* Lower EmitVertex()/EndPrimitive() in `block`.  Each HW thread owns one
 * output vertex slot: an EmitVertex() only takes effect for the thread whose
 * local id matches the running vertex count, which then snapshots the shadow
 * outputs into its emit copies.
 */
static void
lower_gs_block(nir_block *block, nir_builder *b, struct state *state)
{
   nir_foreach_instr_safe (instr, block) {
      if (instr->type != nir_instr_type_intrinsic)
         continue;

      nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);

      switch (intr->intrinsic) {
      case nir_intrinsic_end_primitive: {
         /* Note: This ignores the stream, which seems to match the blob
          * behavior. I'm guessing the HW ignores any extraneous cut
          * signals from an EndPrimitive() that doesn't correspond to the
          * rasterized stream.
          */
         b->cursor = nir_before_instr(&intr->instr);
         /* Flag value 4 marks the cut (same value vertex_flags is
          * initialized to in ir3_nir_lower_gs()).
          */
         nir_store_var(b, state->vertex_flags_out, nir_imm_int(b, 4), 0x1);
         nir_instr_remove(&intr->instr);
         break;
      }

      case nir_intrinsic_emit_vertex: {
         /* Load the vertex count */
         b->cursor = nir_before_instr(&intr->instr);
         nir_ssa_def *count = nir_load_var(b, state->vertex_count_var);

         /* Only the thread whose slot this vertex is lands in the then
          * branch; everyone else just advances the count below.
          */
         nir_push_if(b, nir_ieq(b, count, local_thread_id(b)));

         unsigned stream = nir_intrinsic_stream_id(intr);
         /* vertex_flags_out |= stream */
         nir_store_var(b, state->vertex_flags_out,
                       nir_ior(b, nir_load_var(b, state->vertex_flags_out),
                               nir_imm_int(b, stream)),
                       0x1 /* .x */);

         /* Snapshot the current shadow outputs into the emit copies. */
         foreach_two_lists (dest_node, &state->emit_outputs, src_node,
                            &state->old_outputs) {
            nir_variable *dest = exec_node_data(nir_variable, dest_node, node);
            nir_variable *src = exec_node_data(nir_variable, src_node, node);
            nir_copy_var(b, dest, src);
         }

         nir_instr_remove(&intr->instr);

         nir_store_var(b, state->emitted_vertex_var,
                       nir_iadd(b, nir_load_var(b, state->emitted_vertex_var),
                                nir_imm_int(b, 1)),
                       0x1);

         nir_pop_if(b, NULL);

         /* Increment the vertex count by 1 */
         nir_store_var(b, state->vertex_count_var,
                       nir_iadd(b, count, nir_imm_int(b, 1)), 0x1); /* .x */
         /* Clear the flags for the next vertex. */
         nir_store_var(b, state->vertex_flags_out, nir_imm_int(b, 0), 0x1);

         break;
      }

      default:
         break;
      }
   }
}
901
/* Lower the GS for ir3's execution model: outputs are double-shadowed in
 * temporaries so store_output happens in uniform control flow, each HW
 * thread emits at most one vertex (see lower_gs_block), and threads that
 * emitted nothing are discarded at the end.
 */
void
ir3_nir_lower_gs(nir_shader *shader)
{
   struct state state = {};

   if (shader_debug_enabled(shader->info.stage)) {
      mesa_logi("NIR (before gs lowering):");
      nir_log_shaderi(shader);
   }

   /* Create an output var for vertex_flags. This will be shadowed below,
    * same way regular outputs get shadowed, and this variable will become a
    * temporary.
    */
   state.vertex_flags_out = nir_variable_create(
      shader, nir_var_shader_out, glsl_uint_type(), "vertex_flags");
   state.vertex_flags_out->data.driver_location = shader->num_outputs++;
   state.vertex_flags_out->data.location = VARYING_SLOT_GS_VERTEX_FLAGS_IR3;
   state.vertex_flags_out->data.interpolation = INTERP_MODE_NONE;

   nir_function_impl *impl = nir_shader_get_entrypoint(shader);
   assert(impl);

   nir_builder b;
   nir_builder_init(&b, impl);
   b.cursor = nir_before_cf_list(&impl->body);

   state.header = nir_load_gs_header_ir3(&b);

   /* Generate two set of shadow vars for the output variables. The first
    * set replaces the real outputs and the second set (emit_outputs) we'll
    * assign in the emit_vertex conditionals. Then at the end of the shader
    * we copy the emit_outputs to the real outputs, so that we get
    * store_output in uniform control flow.
    */
   exec_list_make_empty(&state.old_outputs);
   nir_foreach_shader_out_variable_safe (var, shader) {
      exec_node_remove(&var->node);
      exec_list_push_tail(&state.old_outputs, &var->node);
   }
   exec_list_make_empty(&state.new_outputs);
   exec_list_make_empty(&state.emit_outputs);
   nir_foreach_variable_in_list (var, &state.old_outputs) {
      /* Create a new output var by cloning the original output var and
       * stealing the name.
       */
      nir_variable *output = nir_variable_clone(var, shader);
      exec_list_push_tail(&state.new_outputs, &output->node);

      /* Rewrite the original output to be a shadow variable. */
      var->name = ralloc_asprintf(var, "%s@gs-temp", output->name);
      var->data.mode = nir_var_shader_temp;

      /* Clone the shadow variable to create the emit shadow variable that
       * we'll assign in the emit conditionals.
       */
      nir_variable *emit_output = nir_variable_clone(var, shader);
      emit_output->name = ralloc_asprintf(var, "%s@emit-temp", output->name);
      exec_list_push_tail(&state.emit_outputs, &emit_output->node);
   }

   /* During the shader we'll keep track of which vertex we're currently
    * emitting for the EmitVertex test and how many vertices we emitted so we
    * know to discard if didn't emit any. In most simple shaders, this can
    * all be statically determined and gets optimized away.
    */
   state.vertex_count_var =
      nir_local_variable_create(impl, glsl_uint_type(), "vertex_count");
   state.emitted_vertex_var =
      nir_local_variable_create(impl, glsl_uint_type(), "emitted_vertex");

   /* Initialize to 0. */
   b.cursor = nir_before_cf_list(&impl->body);
   nir_store_var(&b, state.vertex_count_var, nir_imm_int(&b, 0), 0x1);
   nir_store_var(&b, state.emitted_vertex_var, nir_imm_int(&b, 0), 0x1);
   /* vertex_flags starts at 4 — the same value end_primitive stores (see
    * lower_gs_block), i.e. the first vertex begins a new primitive.
    */
   nir_store_var(&b, state.vertex_flags_out, nir_imm_int(&b, 4), 0x1);

   nir_foreach_block_safe (block, impl)
      lower_gs_block(block, &b, &state);

   /* At every exit of the shader: discard threads that emitted no vertex,
    * then copy the emit shadows to the real outputs in uniform control flow.
    */
   set_foreach (impl->end_block->predecessors, block_entry) {
      struct nir_block *block = (void *)block_entry->key;
      b.cursor = nir_after_block_before_jump(block);

      nir_ssa_def *cond =
         nir_ieq_imm(&b, nir_load_var(&b, state.emitted_vertex_var), 0);

      nir_discard_if(&b, cond);

      foreach_two_lists (dest_node, &state.new_outputs, src_node,
                         &state.emit_outputs) {
         nir_variable *dest = exec_node_data(nir_variable, dest_node, node);
         nir_variable *src = exec_node_data(nir_variable, src_node, node);
         nir_copy_var(&b, dest, src);
      }
   }

   /* Re-attach all the variable lists we detached above. */
   exec_list_append(&shader->variables, &state.old_outputs);
   exec_list_append(&shader->variables, &state.emit_outputs);
   exec_list_append(&shader->variables, &state.new_outputs);

   nir_metadata_preserve(impl, nir_metadata_none);

   /* Clean up the var copies/temps introduced by the shadowing above. */
   nir_lower_global_vars_to_local(shader);
   nir_split_var_copies(shader);
   nir_lower_var_copies(shader);

   nir_fixup_deref_modes(shader);

   if (shader_debug_enabled(shader->info.stage)) {
      mesa_logi("NIR (after gs lowering):");
      nir_log_shaderi(shader);
   }
}
1016