1 /*
2  * Copyright (C) 2021 Alyssa Rosenzweig <alyssa@rosenzweig.io>
3  * Copyright (C) 2020 Collabora Ltd.
4  * Copyright © 2016 Broadcom
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a
7  * copy of this software and associated documentation files (the "Software"),
8  * to deal in the Software without restriction, including without limitation
9  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10  * and/or sell copies of the Software, and to permit persons to whom the
11  * Software is furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice (including the next
14  * paragraph) shall be included in all copies or substantial portions of the
15  * Software.
16  *
17  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
20  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23  * SOFTWARE.
24  */
25 
26 #include "main/mtypes.h"
27 #include "compiler/nir_types.h"
28 #include "compiler/nir/nir_builder.h"
29 #include "util/u_debug.h"
30 #include "util/fast_idiv_by_const.h"
31 #include "agx_compile.h"
32 #include "agx_compiler.h"
33 #include "agx_builder.h"
34 
35 static const struct debug_named_value agx_debug_options[] = {
36    {"msgs",      AGX_DBG_MSGS,		"Print debug messages"},
37    {"shaders",   AGX_DBG_SHADERS,	"Dump shaders in NIR and AIR"},
38    {"shaderdb",  AGX_DBG_SHADERDB,	"Print statistics"},
39    {"verbose",   AGX_DBG_VERBOSE,	"Disassemble verbosely"},
40    {"internal",  AGX_DBG_INTERNAL,	"Dump even internal shaders"},
41    DEBUG_NAMED_VALUE_END
42 };
43 
44 DEBUG_GET_ONCE_FLAGS_OPTION(agx_debug, "AGX_MESA_DEBUG", agx_debug_options, 0)
45 
46 int agx_debug = 0;
47 
48 #define DBG(fmt, ...) \
49    do { if (agx_debug & AGX_DBG_MSGS) \
50       fprintf(stderr, "%s:%d: "fmt, \
51             __FUNCTION__, __LINE__, ##__VA_ARGS__); } while (0)
52 
53 static void
54 agx_block_add_successor(agx_block *block, agx_block *successor)
55 {
56    assert(block != NULL && successor != NULL);
57 
58    /* Cull impossible edges */
59    if (block->unconditional_jumps)
60       return;
61 
62    for (unsigned i = 0; i < ARRAY_SIZE(block->successors); ++i) {
63       if (block->successors[i]) {
64          if (block->successors[i] == successor)
65             return;
66          else
67             continue;
68       }
69 
70       block->successors[i] = successor;
71       _mesa_set_add(successor->predecessors, block);
72       return;
73    }
74 
75    unreachable("Too many successors");
76 }
77 
78 static void
79 agx_emit_load_const(agx_builder *b, nir_load_const_instr *instr)
80 {
81    /* Ensure we've been scalarized and bit size lowered */
82    unsigned bit_size = instr->def.bit_size;
83    assert(instr->def.num_components == 1);
84    assert(bit_size == 1 || bit_size == 16 || bit_size == 32);
85 
86    /* Emit move, later passes can inline/push if useful */
87    agx_mov_imm_to(b,
88                   agx_get_index(instr->def.index, agx_size_for_bits(bit_size)),
89                   nir_const_value_as_uint(instr->value[0], bit_size));
90 }
91 
92 /* Emit code computing the unsigned quotient P / Q, for constant Q */
93 static agx_index
94 agx_udiv_const(agx_builder *b, agx_index P, uint32_t Q)
95 {
96    /* P / 1 = P */
97    if (Q == 1) {
98       return P;
99    }
100 
101    /* P / UINT32_MAX = 0, unless P = UINT32_MAX, in which case the result is 1 */
102    if (Q == UINT32_MAX) {
103       agx_index max = agx_mov_imm(b, 32, UINT32_MAX);
104       agx_index one = agx_mov_imm(b, 32, 1);
105       return agx_icmpsel(b, P, max, one, agx_zero(), AGX_ICOND_UEQ);
106    }
107 
108    /* P / 2^N = P >> N */
109    if (util_is_power_of_two_or_zero(Q)) {
110       return agx_ushr(b, P, agx_mov_imm(b, 32, util_logbase2(Q)));
111    }
112 
113    /* Fall back on multiplication by a magic number */
114    struct util_fast_udiv_info info = util_compute_fast_udiv_info(Q, 32, 32);
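   /* This mirrors the usual fast-udiv-by-constant recipe (see
    * util/fast_idiv_by_const.h), roughly:
    *
    *    n = P >> pre_shift;
    *    n = (uint32_t)((((uint64_t)n + increment) * multiplier) >> 32);
    *    n = n >> post_shift;
    *
    * with each step mapped onto an AGX instruction below. */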
115    agx_index preshift = agx_mov_imm(b, 32, info.pre_shift);
116    agx_index increment = agx_mov_imm(b, 32, info.increment);
117    agx_index postshift = agx_mov_imm(b, 32, info.post_shift);
118    agx_index multiplier = agx_mov_imm(b, 32, info.multiplier);
119    agx_index multiplied = agx_temp(b->shader, AGX_SIZE_64);
120    agx_index n = P;
121 
122    if (info.pre_shift != 0) n = agx_ushr(b, n, preshift);
123    if (info.increment != 0) n = agx_iadd(b, n, increment, 0);
124 
125    /* Zero-extending 32-bit x 32-bit multiplication into 64 bits; take the top word */
126    agx_imad_to(b, multiplied, agx_abs(n), agx_abs(multiplier), agx_zero(), 0);
127    n = agx_temp(b->shader, AGX_SIZE_32);
128    agx_p_extract_to(b, n, multiplied, 1);
129 
130    if (info.post_shift != 0) n = agx_ushr(b, n, postshift);
131 
132    return n;
133 }
134 
135 /* AGX appears to lack support for vertex attributes. Lower to global loads. */
136 static agx_instr *
137 agx_emit_load_attr(agx_builder *b, nir_intrinsic_instr *instr)
138 {
139    nir_src *offset_src = nir_get_io_offset_src(instr);
140    assert(nir_src_is_const(*offset_src) && "no attribute indirects");
141    unsigned index = nir_intrinsic_base(instr) +
142                     nir_src_as_uint(*offset_src);
143 
144    struct agx_shader_key *key = b->shader->key;
145    struct agx_attribute attrib = key->vs.attributes[index];
146 
147    /* address = base + (stride * vertex_id) + src_offset */
148    unsigned buf = attrib.buf;
149    unsigned stride = key->vs.vbuf_strides[buf];
150    unsigned shift = agx_format_shift(attrib.format);
151 
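   /* The element index below is multiplied by the stride expressed in units
    * of the attribute format's element size (hence the shift), presumably
    * because the hardware scales the device_load offset by the element size
    * of the format passed to agx_device_load_to(). */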
152    agx_index shifted_stride = agx_mov_imm(b, 32, stride >> shift);
153    agx_index src_offset = agx_mov_imm(b, 32, attrib.src_offset);
154 
155    agx_index vertex_id = agx_register(10, AGX_SIZE_32);
156    agx_index instance_id = agx_register(12, AGX_SIZE_32);
157 
158    /* A nonzero divisor marks per-instance data and requires dividing the
159     * instance ID by the divisor. A zero divisor means per-vertex data. */
160    agx_index element_id = (attrib.divisor == 0) ? vertex_id :
161                           agx_udiv_const(b, instance_id, attrib.divisor);
162 
163    agx_index offset = agx_imad(b, element_id, shifted_stride, src_offset, 0);
164 
165    /* Each VBO has a 64-bit address (4 x 16-bit units); look up the base address as a sysval */
166    unsigned num_vbos = key->vs.num_vbufs;
167    unsigned base_length = (num_vbos * 4);
168    agx_index base = agx_indexed_sysval(b->shader,
169                                        AGX_PUSH_VBO_BASES, AGX_SIZE_64, buf * 4, base_length);
170 
171    /* Load the data */
172    assert(instr->num_components <= 4);
173 
174    bool pad = ((attrib.nr_comps_minus_1 + 1) < instr->num_components);
175    agx_index real_dest = agx_dest_index(&instr->dest);
176    agx_index dest = pad ? agx_temp(b->shader, AGX_SIZE_32) : real_dest;
177 
178    agx_device_load_to(b, dest, base, offset, attrib.format,
179                       BITFIELD_MASK(attrib.nr_comps_minus_1 + 1), 0);
180 
181    agx_wait(b, 0);
182 
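   /* Components missing from the attribute format read back the (0, 0, 0, 1)
    * defaults; channels beyond what the instruction reads are left unused. */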
183    if (pad) {
184       agx_index one = agx_mov_imm(b, 32, fui(1.0));
185       agx_index zero = agx_mov_imm(b, 32, 0);
186       agx_index channels[4] = { zero, zero, zero, one };
187       for (unsigned i = 0; i < (attrib.nr_comps_minus_1 + 1); ++i)
188          channels[i] = agx_p_extract(b, dest, i);
189       for (unsigned i = instr->num_components; i < 4; ++i)
190          channels[i] = agx_null();
191       agx_p_combine_to(b, real_dest, channels[0], channels[1], channels[2], channels[3]);
192    }
193 
194    return NULL;
195 }
196 
197 static agx_instr *
198 agx_emit_load_vary_flat(agx_builder *b, nir_intrinsic_instr *instr)
199 {
200    unsigned components = instr->num_components;
201    assert(components >= 1 && components <= 4);
202 
203    nir_src *offset = nir_get_io_offset_src(instr);
204    assert(nir_src_is_const(*offset) && "no indirects");
205    unsigned imm_index = b->shader->varyings[nir_intrinsic_base(instr)];
206    imm_index += nir_src_as_uint(*offset);
207 
208    agx_index chan[4] = { agx_null() };
209 
210    for (unsigned i = 0; i < components; ++i) {
211       /* vec3 for each vertex; unknown what the first two channels are for */
212       agx_index values = agx_ld_vary_flat(b, agx_immediate(imm_index + i), 1);
213       chan[i] = agx_p_extract(b, values, 2);
214    }
215 
216    return agx_p_combine_to(b, agx_dest_index(&instr->dest),
217          chan[0], chan[1], chan[2], chan[3]);
218 }
219 
220 static agx_instr *
221 agx_emit_load_vary(agx_builder *b, nir_intrinsic_instr *instr)
222 {
223    ASSERTED unsigned components = instr->num_components;
224    ASSERTED nir_intrinsic_instr *parent = nir_src_as_intrinsic(instr->src[0]);
225 
226    assert(components >= 1 && components <= 4);
227    assert(parent);
228 
229    /* TODO: Interpolation modes */
230    assert(parent->intrinsic == nir_intrinsic_load_barycentric_pixel);
231 
232    nir_src *offset = nir_get_io_offset_src(instr);
233    assert(nir_src_is_const(*offset) && "no indirects");
234    unsigned imm_index = b->shader->varyings[nir_intrinsic_base(instr)];
235    imm_index += nir_src_as_uint(*offset) * 4;
236 
237    return agx_ld_vary_to(b, agx_dest_index(&instr->dest),
238          agx_immediate(imm_index), components, true);
239 }
240 
241 static agx_instr *
242 agx_emit_store_vary(agx_builder *b, nir_intrinsic_instr *instr)
243 {
244    nir_src *offset = nir_get_io_offset_src(instr);
245    assert(nir_src_is_const(*offset) && "todo: indirects");
246    unsigned imm_index = b->shader->varyings[nir_intrinsic_base(instr)];
247    imm_index += nir_intrinsic_component(instr);
248    imm_index += nir_src_as_uint(*offset);
249 
250    /* nir_lower_io_to_scalar */
251    assert(nir_intrinsic_write_mask(instr) == 0x1);
252 
253    return agx_st_vary(b,
254                agx_immediate(imm_index),
255                agx_src_index(&instr->src[0]));
256 }
257 
258 static agx_instr *
259 agx_emit_fragment_out(agx_builder *b, nir_intrinsic_instr *instr)
260 {
261    const nir_variable *var =
262       nir_find_variable_with_driver_location(b->shader->nir,
263             nir_var_shader_out, nir_intrinsic_base(instr));
264    assert(var);
265 
266    unsigned loc = var->data.location;
267    assert(var->data.index == 0 && "todo: dual-source blending");
268    assert(loc == FRAG_RESULT_DATA0 && "todo: MRT");
269    unsigned rt = (loc - FRAG_RESULT_DATA0);
270 
271    /* TODO: Reverse-engineer interactions with MRT */
272    if (b->shader->nir->info.internal) {
273       /* clear */
274    } else if (b->shader->did_writeout) {
275       agx_writeout(b, 0x0004);
276    } else {
277       agx_writeout(b, 0xC200);
278       agx_writeout(b, 0x000C);
279    }
280 
281    b->shader->did_writeout = true;
282    return agx_st_tile(b, agx_src_index(&instr->src[0]),
283              b->shader->key->fs.tib_formats[rt]);
284 }
285 
286 static agx_instr *
287 agx_emit_load_tile(agx_builder *b, nir_intrinsic_instr *instr)
288 {
289    const nir_variable *var =
290       nir_find_variable_with_driver_location(b->shader->nir,
291             nir_var_shader_out, nir_intrinsic_base(instr));
292    assert(var);
293 
294    unsigned loc = var->data.location;
295    assert(var->data.index == 0 && "todo: dual-source blending");
296    assert(loc == FRAG_RESULT_DATA0 && "todo: MRT");
297    unsigned rt = (loc - FRAG_RESULT_DATA0);
298 
299    /* TODO: Reverse-engineer interactions with MRT */
300    agx_writeout(b, 0xC200);
301    agx_writeout(b, 0x0008);
302    b->shader->did_writeout = true;
303    b->shader->out->reads_tib = true;
304 
305    return agx_ld_tile_to(b, agx_dest_index(&instr->dest),
306          b->shader->key->fs.tib_formats[rt]);
307 }
308 
309 static enum agx_format
310 agx_format_for_bits(unsigned bits)
311 {
312    switch (bits) {
313    case 8: return AGX_FORMAT_I8;
314    case 16: return AGX_FORMAT_I16;
315    case 32: return AGX_FORMAT_I32;
316    default: unreachable("Invalid bit size for load/store");
317    }
318 }
319 
320 static agx_instr *
321 agx_emit_load_ubo(agx_builder *b, nir_intrinsic_instr *instr)
322 {
323    bool kernel_input = (instr->intrinsic == nir_intrinsic_load_kernel_input);
324    nir_src *offset = nir_get_io_offset_src(instr);
325 
326    if (!kernel_input && !nir_src_is_const(instr->src[0]))
327       unreachable("todo: indirect UBO access");
328 
329    /* Constant offsets for device_load are 16-bit */
330    bool offset_is_const = nir_src_is_const(*offset);
331    assert(offset_is_const && "todo: indirect UBO access");
332    int32_t const_offset = offset_is_const ? nir_src_as_int(*offset) : 0;
333 
334    /* Offsets are shifted by the type size, so divide that out */
335    unsigned bytes = nir_dest_bit_size(instr->dest) / 8;
336    assert((const_offset & (bytes - 1)) == 0);
337    const_offset = const_offset / bytes;
338    int16_t const_as_16 = const_offset;
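   /* e.g. a 48-byte constant offset with 32-bit loads becomes an element
    * offset of 12 */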
339 
340    /* The UBO block index comes from src[0]; kernel inputs always use block 0 */
341    uint32_t block = kernel_input ? 0 : nir_src_as_uint(instr->src[0]);
342 
343    /* Each UBO has a 64-bit address (4 x 16-bit units) */
344    unsigned num_ubos = b->shader->nir->info.num_ubos;
345    unsigned base_length = (num_ubos * 4);
346    unsigned index = block * 4; /* 16 bit units */
347 
348    /* Lookup the base address (TODO: indirection) */
349    agx_index base = agx_indexed_sysval(b->shader,
350                                        AGX_PUSH_UBO_BASES, AGX_SIZE_64,
351                                        index, base_length);
352 
353    /* Load the data */
354    assert(instr->num_components <= 4);
355 
356    agx_device_load_to(b, agx_dest_index(&instr->dest),
357                       base,
358                       (offset_is_const && (const_offset == const_as_16)) ?
359                       agx_immediate(const_as_16) : agx_mov_imm(b, 32, const_offset),
360                       agx_format_for_bits(nir_dest_bit_size(instr->dest)),
361                       BITFIELD_MASK(instr->num_components), 0);
362 
363    return agx_wait(b, 0);
364 }
365 
366 static agx_instr *
367 agx_emit_load_frag_coord(agx_builder *b, nir_intrinsic_instr *instr)
368 {
369    agx_index xy[2];
370 
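   /* The grid position special registers are integral, so convert to float
    * and add 0.5 to produce the pixel-center fragment coordinate. */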
371    for (unsigned i = 0; i < 2; ++i) {
372       xy[i] = agx_fadd(b, agx_convert(b, agx_immediate(AGX_CONVERT_U32_TO_F),
373                agx_get_sr(b, 32, AGX_SR_THREAD_POSITION_IN_GRID_X + i),
374                AGX_ROUND_RTE), agx_immediate_f(0.5f));
375    }
376 
377    /* Ordered per the varying ABI: W at slot 0, Z at slot 1 */
378    agx_index z = agx_ld_vary(b, agx_immediate(1), 1, false);
379    agx_index w = agx_ld_vary(b, agx_immediate(0), 1, false);
380 
381    return agx_p_combine_to(b, agx_dest_index(&instr->dest),
382          xy[0], xy[1], z, w);
383 }
384 
385 static agx_instr *
386 agx_blend_const(agx_builder *b, agx_index dst, unsigned comp)
387 {
388      agx_index val = agx_indexed_sysval(b->shader,
389            AGX_PUSH_BLEND_CONST, AGX_SIZE_32, comp * 2, 4 * 2);
390 
391      return agx_mov_to(b, dst, val);
392 }
393 
394 static agx_instr *
395 agx_emit_intrinsic(agx_builder *b, nir_intrinsic_instr *instr)
396 {
397   agx_index dst = nir_intrinsic_infos[instr->intrinsic].has_dest ?
398      agx_dest_index(&instr->dest) : agx_null();
399   gl_shader_stage stage = b->shader->stage;
400 
401   switch (instr->intrinsic) {
402   case nir_intrinsic_load_barycentric_pixel:
403   case nir_intrinsic_load_barycentric_centroid:
404   case nir_intrinsic_load_barycentric_sample:
405   case nir_intrinsic_load_barycentric_at_sample:
406   case nir_intrinsic_load_barycentric_at_offset:
407      /* handled later via load_vary */
408      return NULL;
409   case nir_intrinsic_load_interpolated_input:
410      assert(stage == MESA_SHADER_FRAGMENT);
411      return agx_emit_load_vary(b, instr);
412 
413   case nir_intrinsic_load_input:
414      if (stage == MESA_SHADER_FRAGMENT)
415         return agx_emit_load_vary_flat(b, instr);
416      else if (stage == MESA_SHADER_VERTEX)
417         return agx_emit_load_attr(b, instr);
418      else
419         unreachable("Unsupported shader stage");
420 
421   case nir_intrinsic_store_output:
422      if (stage == MESA_SHADER_FRAGMENT)
423         return agx_emit_fragment_out(b, instr);
424      else if (stage == MESA_SHADER_VERTEX)
425         return agx_emit_store_vary(b, instr);
426      else
427         unreachable("Unsupported shader stage");
428 
429   case nir_intrinsic_load_output:
430      assert(stage == MESA_SHADER_FRAGMENT);
431      return agx_emit_load_tile(b, instr);
432 
433   case nir_intrinsic_load_ubo:
434   case nir_intrinsic_load_kernel_input:
435      return agx_emit_load_ubo(b, instr);
436 
437   case nir_intrinsic_load_frag_coord:
438      return agx_emit_load_frag_coord(b, instr);
439 
440   case nir_intrinsic_load_back_face_agx:
441      return agx_get_sr_to(b, dst, AGX_SR_BACKFACING);
442 
443   case nir_intrinsic_load_vertex_id:
444      return agx_mov_to(b, dst, agx_abs(agx_register(10, AGX_SIZE_32)));
445 
446   case nir_intrinsic_load_instance_id:
447      return agx_mov_to(b, dst, agx_abs(agx_register(12, AGX_SIZE_32)));
448 
449   case nir_intrinsic_load_blend_const_color_r_float: return agx_blend_const(b, dst, 0);
450   case nir_intrinsic_load_blend_const_color_g_float: return agx_blend_const(b, dst, 1);
451   case nir_intrinsic_load_blend_const_color_b_float: return agx_blend_const(b, dst, 2);
452   case nir_intrinsic_load_blend_const_color_a_float: return agx_blend_const(b, dst, 3);
453 
454   default:
455        fprintf(stderr, "Unhandled intrinsic %s\n", nir_intrinsic_infos[instr->intrinsic].name);
456        unreachable("Unhandled intrinsic");
457   }
458 }
459 
460 static agx_index
461 agx_alu_src_index(agx_builder *b, nir_alu_src src)
462 {
463    /* Check well-formedness of the input NIR */
464    ASSERTED unsigned bitsize = nir_src_bit_size(src.src);
465    unsigned comps = nir_src_num_components(src.src);
466    unsigned channel = src.swizzle[0];
467 
468    assert(bitsize == 1 || bitsize == 16 || bitsize == 32 || bitsize == 64);
469    assert(!(src.negate || src.abs));
470    assert(channel < comps);
471 
472    agx_index idx = agx_src_index(&src.src);
473 
474    /* We only deal with scalars, emit p_extract if needed */
475    if (comps > 1)
476       return agx_p_extract(b, idx, channel);
477    else
478       return idx;
479 }
480 
481 static agx_instr *
482 agx_emit_alu_bool(agx_builder *b, nir_op op,
483       agx_index dst, agx_index s0, agx_index s1, agx_index s2)
484 {
485    /* Handle 1-bit bools as zero/nonzero rather than specifically 0/1 or 0/~0.
486     * This will give the optimizer flexibility. */
487    agx_index f = agx_immediate(0);
488    agx_index t = agx_immediate(0x1);
489 
490    switch (op) {
491    case nir_op_feq: return agx_fcmpsel_to(b, dst, s0, s1, t, f, AGX_FCOND_EQ);
492    case nir_op_flt: return agx_fcmpsel_to(b, dst, s0, s1, t, f, AGX_FCOND_LT);
493    case nir_op_fge: return agx_fcmpsel_to(b, dst, s0, s1, t, f, AGX_FCOND_GE);
494    case nir_op_fneu: return agx_fcmpsel_to(b, dst, s0, s1, f, t, AGX_FCOND_EQ);
495 
496    case nir_op_ieq: return agx_icmpsel_to(b, dst, s0, s1, t, f, AGX_ICOND_UEQ);
497    case nir_op_ine: return agx_icmpsel_to(b, dst, s0, s1, f, t, AGX_ICOND_UEQ);
498    case nir_op_ilt: return agx_icmpsel_to(b, dst, s0, s1, t, f, AGX_ICOND_SLT);
499    case nir_op_ige: return agx_icmpsel_to(b, dst, s0, s1, f, t, AGX_ICOND_SLT);
500    case nir_op_ult: return agx_icmpsel_to(b, dst, s0, s1, t, f, AGX_ICOND_ULT);
501    case nir_op_uge: return agx_icmpsel_to(b, dst, s0, s1, f, t, AGX_ICOND_ULT);
502 
503    case nir_op_mov: return agx_mov_to(b, dst, s0);
504    case nir_op_iand: return agx_and_to(b, dst, s0, s1);
505    case nir_op_ior: return agx_or_to(b, dst, s0, s1);
506    case nir_op_ixor: return agx_xor_to(b, dst, s0, s1);
507    case nir_op_inot: return agx_xor_to(b, dst, s0, t);
508 
509    case nir_op_f2b1: return agx_fcmpsel_to(b, dst, s0, f, f, t, AGX_FCOND_EQ);
510    case nir_op_i2b1: return agx_icmpsel_to(b, dst, s0, f, f, t, AGX_ICOND_UEQ);
511    case nir_op_b2b1: return agx_icmpsel_to(b, dst, s0, f, f, t, AGX_ICOND_UEQ);
512 
513    case nir_op_bcsel:
514       return agx_icmpsel_to(b, dst, s0, f, s2, s1, AGX_ICOND_UEQ);
515 
516    default:
517       fprintf(stderr, "Unhandled ALU op %s\n", nir_op_infos[op].name);
518       unreachable("Unhandled boolean ALU instruction");
519    }
520 }
521 
522 static agx_instr *
523 agx_emit_alu(agx_builder *b, nir_alu_instr *instr)
524 {
525    unsigned srcs = nir_op_infos[instr->op].num_inputs;
526    unsigned sz = nir_dest_bit_size(instr->dest.dest);
527    unsigned src_sz = srcs ? nir_src_bit_size(instr->src[0].src) : 0;
528    ASSERTED unsigned comps = nir_dest_num_components(instr->dest.dest);
529 
530    assert(comps == 1 || nir_op_is_vec(instr->op));
531    assert(sz == 1 || sz == 16 || sz == 32 || sz == 64);
532 
533    agx_index dst = agx_dest_index(&instr->dest.dest);
534    agx_index s0 = srcs > 0 ? agx_alu_src_index(b, instr->src[0]) : agx_null();
535    agx_index s1 = srcs > 1 ? agx_alu_src_index(b, instr->src[1]) : agx_null();
536    agx_index s2 = srcs > 2 ? agx_alu_src_index(b, instr->src[2]) : agx_null();
537    agx_index s3 = srcs > 3 ? agx_alu_src_index(b, instr->src[3]) : agx_null();
538 
539    /* 1-bit bools are a bit special, only handle with select ops */
540    if (sz == 1)
541       return agx_emit_alu_bool(b, instr->op, dst, s0, s1, s2);
542 
543 #define UNOP(nop, aop) \
544    case nir_op_ ## nop: return agx_ ## aop ## _to(b, dst, s0);
545 #define BINOP(nop, aop) \
546    case nir_op_ ## nop: return agx_ ## aop ## _to(b, dst, s0, s1);
547 #define TRIOP(nop, aop) \
548    case nir_op_ ## nop: return agx_ ## aop ## _to(b, dst, s0, s1, s2);
549 
550    switch (instr->op) {
551    BINOP(fadd, fadd);
552    BINOP(fmul, fmul);
553    TRIOP(ffma, fma);
554 
555    UNOP(f2f16, fmov);
556    UNOP(f2f32, fmov);
557    UNOP(fround_even, roundeven);
558    UNOP(ftrunc, trunc);
559    UNOP(ffloor, floor);
560    UNOP(fceil, ceil);
561    UNOP(frcp, rcp);
562    UNOP(frsq, rsqrt);
563    UNOP(flog2, log2);
564    UNOP(fexp2, exp2);
565 
566    UNOP(fddx, dfdx);
567    UNOP(fddx_coarse, dfdx);
568    UNOP(fddx_fine, dfdx);
569 
570    UNOP(fddy, dfdy);
571    UNOP(fddy_coarse, dfdy);
572    UNOP(fddy_fine, dfdy);
573 
574    UNOP(mov, mov);
575    UNOP(u2u16, mov);
576    UNOP(u2u32, mov);
577    UNOP(inot, not);
578    BINOP(iand, and);
579    BINOP(ior, or);
580    BINOP(ixor, xor);
581 
582    case nir_op_fsqrt: return agx_fmul_to(b, dst, s0, agx_srsqrt(b, s0));
583    case nir_op_fsub: return agx_fadd_to(b, dst, s0, agx_neg(s1));
584    case nir_op_fabs: return agx_fmov_to(b, dst, agx_abs(s0));
585    case nir_op_fneg: return agx_fmov_to(b, dst, agx_neg(s0));
586 
587    case nir_op_fmin: return agx_fcmpsel_to(b, dst, s0, s1, s0, s1, AGX_FCOND_LTN);
588    case nir_op_fmax: return agx_fcmpsel_to(b, dst, s0, s1, s0, s1, AGX_FCOND_GTN);
589    case nir_op_imin: return agx_icmpsel_to(b, dst, s0, s1, s0, s1, AGX_ICOND_SLT);
590    case nir_op_imax: return agx_icmpsel_to(b, dst, s0, s1, s0, s1, AGX_ICOND_SGT);
591    case nir_op_umin: return agx_icmpsel_to(b, dst, s0, s1, s0, s1, AGX_ICOND_ULT);
592    case nir_op_umax: return agx_icmpsel_to(b, dst, s0, s1, s0, s1, AGX_ICOND_UGT);
593 
594    case nir_op_iadd: return agx_iadd_to(b, dst, s0, s1, 0);
595    case nir_op_isub: return agx_iadd_to(b, dst, s0, agx_neg(s1), 0);
596    case nir_op_ineg: return agx_iadd_to(b, dst, agx_zero(), agx_neg(s0), 0);
597    case nir_op_imul: return agx_imad_to(b, dst, s0, s1, agx_zero(), 0);
598 
599    case nir_op_ishl: return agx_bfi_to(b, dst, agx_zero(), s0, s1, 0);
600    case nir_op_ushr: return agx_ushr_to(b, dst, s0, s1);
601    case nir_op_ishr: return agx_asr_to(b, dst, s0, s1);
602 
603    case nir_op_bcsel:
604       return agx_icmpsel_to(b, dst, s0, agx_zero(), s2, s1, AGX_ICOND_UEQ);
605 
606    case nir_op_b2i32:
607    case nir_op_b2i16:
608       return agx_icmpsel_to(b, dst, s0, agx_zero(), agx_zero(), agx_immediate(1), AGX_ICOND_UEQ);
609 
610    case nir_op_b2f16:
611    case nir_op_b2f32:
612    {
613       /* At this point, boolean is just zero/nonzero, so compare with zero */
614       agx_index one = (sz == 16) ?
615          agx_mov_imm(b, 16, _mesa_float_to_half(1.0)) :
616          agx_mov_imm(b, 32, fui(1.0));
617 
618       agx_index zero = agx_zero();
619 
620       return agx_fcmpsel_to(b, dst, s0, zero, zero, one, AGX_FCOND_EQ);
621    }
622 
623    case nir_op_i2i32:
624    {
625       if (s0.size != AGX_SIZE_16)
626          unreachable("todo: more conversions");
627 
628       return agx_iadd_to(b, dst, s0, agx_zero(), 0);
629    }
630 
631    case nir_op_i2i16:
632    {
633       if (s0.size != AGX_SIZE_32)
634          unreachable("todo: more conversions");
635 
636       return agx_iadd_to(b, dst, s0, agx_zero(), 0);
637    }
638 
639    case nir_op_iadd_sat:
640    {
641       agx_instr *I = agx_iadd_to(b, dst, s0, s1, 0);
642       I->saturate = true;
643       return I;
644    }
645 
646    case nir_op_isub_sat:
647    {
648       agx_instr *I = agx_iadd_to(b, dst, s0, agx_neg(s1), 0);
649       I->saturate = true;
650       return I;
651    }
652 
653    case nir_op_uadd_sat:
654    {
655       agx_instr *I = agx_iadd_to(b, dst, agx_abs(s0), agx_abs(s1), 0);
656       I->saturate = true;
657       return I;
658    }
659 
660    case nir_op_usub_sat:
661    {
662       agx_instr *I = agx_iadd_to(b, dst, agx_abs(s0), agx_neg(agx_abs(s1)), 0);
663       I->saturate = true;
664       return I;
665    }
666 
667    case nir_op_fsat:
668    {
669       agx_instr *I = agx_fadd_to(b, dst, s0, agx_negzero());
670       I->saturate = true;
671       return I;
672    }
673 
674    case nir_op_fsin_agx:
675    {
676       agx_index fixup = agx_sin_pt_1(b, s0);
677       agx_index sinc = agx_sin_pt_2(b, fixup);
678       return agx_fmul_to(b, dst, sinc, fixup);
679    }
680 
681    case nir_op_f2i16:
682       return agx_convert_to(b, dst,
683             agx_immediate(AGX_CONVERT_F_TO_S16), s0, AGX_ROUND_RTZ);
684 
685    case nir_op_f2i32:
686       return agx_convert_to(b, dst,
687             agx_immediate(AGX_CONVERT_F_TO_S32), s0, AGX_ROUND_RTZ);
688 
689    case nir_op_f2u16:
690       return agx_convert_to(b, dst,
691             agx_immediate(AGX_CONVERT_F_TO_U16), s0, AGX_ROUND_RTZ);
692 
693    case nir_op_f2u32:
694       return agx_convert_to(b, dst,
695             agx_immediate(AGX_CONVERT_F_TO_U32), s0, AGX_ROUND_RTZ);
696 
697    case nir_op_u2f16:
698    case nir_op_u2f32:
699    {
700       if (src_sz == 64)
701          unreachable("64-bit conversions unimplemented");
702 
703       enum agx_convert mode =
704          (src_sz == 32) ? AGX_CONVERT_U32_TO_F :
705          (src_sz == 16) ? AGX_CONVERT_U16_TO_F :
706                           AGX_CONVERT_U8_TO_F;
707 
708       return agx_convert_to(b, dst, agx_immediate(mode), s0, AGX_ROUND_RTE);
709    }
710 
711    case nir_op_i2f16:
712    case nir_op_i2f32:
713    {
714       if (src_sz == 64)
715          unreachable("64-bit conversions unimplemented");
716 
717       enum agx_convert mode =
718          (src_sz == 32) ? AGX_CONVERT_S32_TO_F :
719          (src_sz == 16) ? AGX_CONVERT_S16_TO_F :
720                           AGX_CONVERT_S8_TO_F;
721 
722       return agx_convert_to(b, dst, agx_immediate(mode), s0, AGX_ROUND_RTE);
723    }
724 
725    case nir_op_vec2:
726    case nir_op_vec3:
727    case nir_op_vec4:
728       return agx_p_combine_to(b, dst, s0, s1, s2, s3);
729 
730    case nir_op_vec8:
731    case nir_op_vec16:
732       unreachable("should've been lowered");
733 
734    default:
735       fprintf(stderr, "Unhandled ALU op %s\n", nir_op_infos[instr->op].name);
736       unreachable("Unhandled ALU instruction");
737    }
738 }
739 
740 static enum agx_dim
741 agx_tex_dim(enum glsl_sampler_dim dim, bool array)
742 {
743    switch (dim) {
744    case GLSL_SAMPLER_DIM_1D:
745    case GLSL_SAMPLER_DIM_BUF:
746       return array ? AGX_DIM_TEX_1D_ARRAY : AGX_DIM_TEX_1D;
747 
748    case GLSL_SAMPLER_DIM_2D:
749    case GLSL_SAMPLER_DIM_RECT:
750    case GLSL_SAMPLER_DIM_EXTERNAL:
751       return array ? AGX_DIM_TEX_2D_ARRAY : AGX_DIM_TEX_2D;
752 
753    case GLSL_SAMPLER_DIM_MS:
754       assert(!array && "multisampled arrays unsupported");
755       return AGX_DIM_TEX_2D_MS;
756 
757    case GLSL_SAMPLER_DIM_3D:
758       assert(!array && "3D arrays unsupported");
759       return AGX_DIM_TEX_3D;
760 
761    case GLSL_SAMPLER_DIM_CUBE:
762       return array ? AGX_DIM_TEX_CUBE_ARRAY : AGX_DIM_TEX_CUBE;
763 
764    default:
765       unreachable("Invalid sampler dim\n");
766    }
767 }
768 
769 static void
770 agx_emit_tex(agx_builder *b, nir_tex_instr *instr)
771 {
772    switch (instr->op) {
773    case nir_texop_tex:
774    case nir_texop_txl:
775       break;
776    default:
777       unreachable("Unhandled texture op");
778    }
779 
780    enum agx_lod_mode lod_mode = (instr->op == nir_texop_tex) ?
781       AGX_LOD_MODE_AUTO_LOD : AGX_LOD_MODE_LOD_MIN;
782 
783    agx_index coords = agx_null(),
784              texture = agx_immediate(instr->texture_index),
785              sampler = agx_immediate(instr->sampler_index),
786              lod = agx_immediate(0),
787              offset = agx_null();
788 
789    for (unsigned i = 0; i < instr->num_srcs; ++i) {
790       agx_index index = agx_src_index(&instr->src[i].src);
791 
792       switch (instr->src[i].src_type) {
793       case nir_tex_src_coord:
794          coords = index;
795          break;
796 
797       case nir_tex_src_lod:
798          lod = index;
799          break;
800 
801       case nir_tex_src_bias:
802       case nir_tex_src_ms_index:
803       case nir_tex_src_offset:
804       case nir_tex_src_comparator:
805       case nir_tex_src_texture_offset:
806       case nir_tex_src_sampler_offset:
807       default:
808          unreachable("todo");
809       }
810    }
811 
812    agx_texture_sample_to(b, agx_dest_index(&instr->dest),
813          coords, lod, texture, sampler, offset,
814          agx_tex_dim(instr->sampler_dim, instr->is_array),
815          lod_mode,
816          0xF, /* TODO: wrmask */
817          0);
818 
819    agx_wait(b, 0);
820 }
821 
822 /* NIR loops are treated as a pair of AGX loops:
823  *
824  *    do {
825  *       do {
826  *          ...
827  *       } while (0);
828  *    } while (cond);
829  *
830  * By manipulating the nesting counter (r0l), we may break out of nested loops,
831  * so under the model, both break and continue may be implemented as breaks,
832  * where break breaks out of the outer loop (2 layers) and continue breaks out
833  * of the inner loop (1 layer).
834  *
835  * After manipulating the nesting counter directly, pop_exec #0 must be used to
836  * flush the update to the execution mask.
837  */
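
/* Concretely, a jump nested loop_nesting ifs deep inside the innermost loop is
 * emitted by agx_emit_jump() below as (a sketch; r0l is the nesting counter):
 *
 *    mov      r0l, #(loop_nesting + 1)   for continue (break 1 layer)
 *    mov      r0l, #(loop_nesting + 2)   for break    (break 2 layers)
 *    pop_exec #0
 */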
838 
839 static void
840 agx_emit_jump(agx_builder *b, nir_jump_instr *instr)
841 {
842    agx_context *ctx = b->shader;
843    assert (instr->type == nir_jump_break || instr->type == nir_jump_continue);
844 
845    /* Break out of either one or two loops */
846    unsigned nestings = b->shader->loop_nesting;
847 
848    if (instr->type == nir_jump_continue) {
849       nestings += 1;
850       agx_block_add_successor(ctx->current_block, ctx->continue_block);
851    } else if (instr->type == nir_jump_break) {
852       nestings += 2;
853       agx_block_add_successor(ctx->current_block, ctx->break_block);
854    }
855 
856    /* Update the counter and flush */
857    agx_index r0l = agx_register(0, false);
858    agx_mov_to(b, r0l, agx_immediate(nestings));
859    agx_pop_exec(b, 0);
860 
861    ctx->current_block->unconditional_jumps = true;
862 }
863 
864 static void
865 agx_emit_instr(agx_builder *b, struct nir_instr *instr)
866 {
867    switch (instr->type) {
868    case nir_instr_type_load_const:
869       agx_emit_load_const(b, nir_instr_as_load_const(instr));
870       break;
871 
872    case nir_instr_type_intrinsic:
873       agx_emit_intrinsic(b, nir_instr_as_intrinsic(instr));
874       break;
875 
876    case nir_instr_type_alu:
877       agx_emit_alu(b, nir_instr_as_alu(instr));
878       break;
879 
880    case nir_instr_type_tex:
881       agx_emit_tex(b, nir_instr_as_tex(instr));
882       break;
883 
884    case nir_instr_type_jump:
885       agx_emit_jump(b, nir_instr_as_jump(instr));
886       break;
887 
888    default:
889       unreachable("should've been lowered");
890    }
891 }
892 
893 static agx_block *
894 agx_create_block(agx_context *ctx)
895 {
896    agx_block *blk = rzalloc(ctx, agx_block);
897 
898    blk->predecessors = _mesa_set_create(blk,
899          _mesa_hash_pointer, _mesa_key_pointer_equal);
900 
901    return blk;
902 }
903 
904 static agx_block *
905 emit_block(agx_context *ctx, nir_block *block)
906 {
907    if (ctx->after_block) {
908       ctx->current_block = ctx->after_block;
909       ctx->after_block = NULL;
910    } else {
911       ctx->current_block = agx_create_block(ctx);
912    }
913 
914    agx_block *blk = ctx->current_block;
915    list_addtail(&blk->link, &ctx->blocks);
916    list_inithead(&blk->instructions);
917 
918    agx_builder _b = agx_init_builder(ctx, agx_after_block(blk));
919 
920    nir_foreach_instr(instr, block) {
921       agx_emit_instr(&_b, instr);
922    }
923 
924    return blk;
925 }
926 
927 static agx_block *
928 emit_cf_list(agx_context *ctx, struct exec_list *list);
929 
930 /* Emit if-else as
931  *
932  *    if_icmp cond != 0
933  *       ...
934  *    else_icmp cond == 0
935  *       ...
936  *    pop_exec
937  *
938  * If the else is empty, we can omit the else_icmp. This is not usually
939  * optimal, but it's a start.
940  */
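
/* The immediate operands on if_icmp/else_icmp/pop_exec below appear to count
 * nesting levels (compare the push_exec/pop_exec of 2 wrapping loops), whereas
 * pop_exec #0 merely flushes a hand-written update of the nesting counter. */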
941 
942 static void
943 emit_if(agx_context *ctx, nir_if *nif)
944 {
945    nir_block *nir_else_block = nir_if_first_else_block(nif);
946    bool empty_else_block =
947       (nir_else_block == nir_if_last_else_block(nif) &&
948        exec_list_is_empty(&nir_else_block->instr_list));
949 
950    agx_block *first_block = ctx->current_block;
951    agx_builder _b = agx_init_builder(ctx, agx_after_block(first_block));
952    agx_index cond = agx_src_index(&nif->condition);
953 
954    agx_if_icmp(&_b, cond, agx_zero(), 1, AGX_ICOND_UEQ, true);
955    ctx->loop_nesting++;
956 
957    /* Emit the two subblocks. */
958    agx_block *if_block = emit_cf_list(ctx, &nif->then_list);
959    agx_block *end_then = ctx->current_block;
960 
961    if (!empty_else_block) {
962       _b.cursor = agx_after_block(ctx->current_block);
963       agx_else_icmp(&_b, cond, agx_zero(), 1, AGX_ICOND_UEQ, false);
964    }
965 
966    agx_block *else_block = emit_cf_list(ctx, &nif->else_list);
967    agx_block *end_else = ctx->current_block;
968 
969    ctx->after_block = agx_create_block(ctx);
970 
971    agx_block_add_successor(first_block, if_block);
972    agx_block_add_successor(first_block, else_block);
973    agx_block_add_successor(end_then, ctx->after_block);
974    agx_block_add_successor(end_else, ctx->after_block);
975 
976    _b.cursor = agx_after_block(ctx->current_block);
977    agx_pop_exec(&_b, 1);
978    ctx->loop_nesting--;
979 }
980 
981 static void
982 emit_loop(agx_context *ctx, nir_loop *nloop)
983 {
984    /* We only track nesting within the innermost loop, so reset */
985    ctx->loop_nesting = 0;
986 
987    agx_block *popped_break = ctx->break_block;
988    agx_block *popped_continue = ctx->continue_block;
989 
990    ctx->break_block = agx_create_block(ctx);
991    ctx->continue_block = agx_create_block(ctx);
992 
993    /* Make room for break/continue nesting (TODO: skip if no divergent CF) */
994    agx_builder _b = agx_init_builder(ctx, agx_after_block(ctx->current_block));
995    agx_push_exec(&_b, 2);
996 
997    /* Fallthrough to body */
998    agx_block_add_successor(ctx->current_block, ctx->continue_block);
999 
1000    /* Emit the body */
1001    ctx->after_block = ctx->continue_block;
1002    agx_block *start_block = emit_cf_list(ctx, &nloop->body);
1003 
1004    /* Fix up the nesting counter via an always true while_icmp, and branch back
1005     * to start of loop if any lanes are active */
1006    _b.cursor = agx_after_block(ctx->current_block);
1007    agx_while_icmp(&_b, agx_zero(), agx_zero(), 2, AGX_ICOND_UEQ, false);
1008    agx_jmp_exec_any(&_b, start_block);
1009    agx_pop_exec(&_b, 2);
1010    agx_block_add_successor(ctx->current_block, ctx->continue_block);
1011 
1012    /* Pop off */
1013    ctx->after_block = ctx->break_block;
1014    ctx->break_block = popped_break;
1015    ctx->continue_block = popped_continue;
1016 
1017    /* Update shader-db stats */
1018    ++ctx->loop_count;
1019 
1020    /* All nested control flow must have finished */
1021    assert(ctx->loop_nesting == 0);
1022 }
1023 
1024 /* Before the first control flow structure, the nesting counter (r0l) needs to
1025  * be zeroed for correct operation. This only happens at most once, since by
1026  * definition this occurs at the end of the first block, which dominates the
1027  * rest of the program. */
1028 
1029 static void
1030 emit_first_cf(agx_context *ctx)
1031 {
1032    if (ctx->any_cf)
1033       return;
1034 
1035    agx_builder _b = agx_init_builder(ctx, agx_after_block(ctx->current_block));
1036    agx_index r0l = agx_register(0, false);
1037 
1038    agx_mov_to(&_b, r0l, agx_immediate(0));
1039    ctx->any_cf = true;
1040 }
1041 
1042 static agx_block *
1043 emit_cf_list(agx_context *ctx, struct exec_list *list)
1044 {
1045    agx_block *start_block = NULL;
1046 
1047    foreach_list_typed(nir_cf_node, node, node, list) {
1048       switch (node->type) {
1049       case nir_cf_node_block: {
1050          agx_block *block = emit_block(ctx, nir_cf_node_as_block(node));
1051 
1052          if (!start_block)
1053             start_block = block;
1054 
1055          break;
1056       }
1057 
1058       case nir_cf_node_if:
1059          emit_first_cf(ctx);
1060          emit_if(ctx, nir_cf_node_as_if(node));
1061          break;
1062 
1063       case nir_cf_node_loop:
1064          emit_first_cf(ctx);
1065          emit_loop(ctx, nir_cf_node_as_loop(node));
1066          break;
1067 
1068       default:
1069          unreachable("Unknown control flow");
1070       }
1071    }
1072 
1073    return start_block;
1074 }
1075 
1076 static void
1077 agx_set_st_vary_final(agx_context *ctx)
1078 {
1079    agx_foreach_instr_global_rev(ctx, I) {
1080       if (I->op == AGX_OPCODE_ST_VARY) {
1081          I->last = true;
1082          return;
1083       }
1084    }
1085 }
1086 
1087 static void
1088 agx_print_stats(agx_context *ctx, unsigned size, FILE *fp)
1089 {
1090    unsigned nr_ins = 0, nr_bytes = 0, nr_threads = 1;
1091 
1092    /* TODO */
1093    fprintf(fp, "%s shader: %u inst, %u bytes, %u threads, %u loops, "
1094            "%u:%u spills:fills\n",
1095            ctx->nir->info.label ?: "",
1096            nr_ins, nr_bytes, nr_threads, ctx->loop_count,
1097            ctx->spills, ctx->fills);
1098 }
1099 
1100 static int
1101 glsl_type_size(const struct glsl_type *type, bool bindless)
1102 {
1103    return glsl_count_attribute_slots(type, false);
1104 }
1105 
1106 static bool
1107 agx_lower_sincos_filter(const nir_instr *instr, UNUSED const void *_)
1108 {
1109    if (instr->type != nir_instr_type_alu)
1110       return false;
1111 
1112    nir_alu_instr *alu = nir_instr_as_alu(instr);
1113    return alu->op == nir_op_fsin || alu->op == nir_op_fcos;
1114 }
1115 
1116 /* Sine and cosine are implemented via the sin_pt_1 and sin_pt_2 opcodes for
1117  * heavy lifting. sin_pt_2 implements sinc in the first quadrant, expressed in
1118  * turns (sin (tau x) / x), while sin_pt_1 implements a piecewise sign/offset
1119  * fixup to transform a quadrant angle [0, 4] to [-1, 1]. The NIR opcode
1120  * fsin_agx models the fixup, sinc, and multiply to obtain sine, so we just
1121  * need to change units from radians to quadrants modulo turns. Cosine is
1122  * implemented by shifting by one quadrant: cos(x) = sin(x + tau/4).
1123  */
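
/* Concretely, sin(x) lowers to (a sketch of the builder code below):
 *
 *    turns     = x * (1 / (2 * pi))
 *    turns    += 0.25                    (cosine only)
 *    quadrants = ffract(turns) * 4.0
 *    result    = fsin_agx(quadrants)
 */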
1124 
1125 static nir_ssa_def *
1126 agx_lower_sincos_impl(struct nir_builder *b, nir_instr *instr, UNUSED void *_)
1127 {
1128    nir_alu_instr *alu = nir_instr_as_alu(instr);
1129    nir_ssa_def *x = nir_mov_alu(b, alu->src[0], 1);
1130    nir_ssa_def *turns = nir_fmul_imm(b, x, M_1_PI * 0.5f);
1131 
1132    if (alu->op == nir_op_fcos)
1133       turns = nir_fadd_imm(b, turns, 0.25f);
1134 
1135    nir_ssa_def *quadrants = nir_fmul_imm(b, nir_ffract(b, turns), 4.0);
1136    return nir_fsin_agx(b, quadrants);
1137 }
1138 
1139 static bool
1140 agx_lower_sincos(nir_shader *shader)
1141 {
1142    return nir_shader_lower_instructions(shader,
1143          agx_lower_sincos_filter, agx_lower_sincos_impl, NULL);
1144 }
1145 
1146 static bool
1147 agx_lower_front_face(struct nir_builder *b,
1148                      nir_instr *instr, UNUSED void *data)
1149 {
1150    if (instr->type != nir_instr_type_intrinsic)
1151       return false;
1152 
1153    nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
1154    if (intr->intrinsic != nir_intrinsic_load_front_face)
1155       return false;
1156 
1157    assert(intr->dest.is_ssa);
1158    nir_ssa_def *def = &intr->dest.ssa;
1159    assert(def->bit_size == 1);
1160 
1161    b->cursor = nir_before_instr(&intr->instr);
1162    nir_ssa_def_rewrite_uses(def, nir_inot(b, nir_load_back_face_agx(b, 1)));
1163    return true;
1164 }
1165 
1166 static bool
1167 agx_lower_point_coord(struct nir_builder *b,
1168                       nir_instr *instr, UNUSED void *data)
1169 {
1170    if (instr->type != nir_instr_type_intrinsic)
1171       return false;
1172 
1173    nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
1174 
1175    if (intr->intrinsic != nir_intrinsic_load_deref)
1176       return false;
1177 
1178    nir_deref_instr *deref = nir_src_as_deref(intr->src[0]);
1179    nir_variable *var = nir_deref_instr_get_variable(deref);
1180 
1181    if (var->data.mode != nir_var_shader_in)
1182       return false;
1183 
1184    if (var->data.location != VARYING_SLOT_PNTC)
1185       return false;
1186 
1187    assert(intr->dest.is_ssa);
1188    assert(intr->dest.ssa.num_components == 2);
1189 
1190    b->cursor = nir_after_instr(&intr->instr);
1191    nir_ssa_def *def = nir_load_deref(b, deref);
1192    nir_ssa_def *y = nir_channel(b, def, 1);
1193    nir_ssa_def *flipped_y = nir_fadd_imm(b, nir_fneg(b, y), 1.0);
1194    nir_ssa_def *flipped = nir_vec2(b, nir_channel(b, def, 0), flipped_y);
1195    nir_ssa_def_rewrite_uses(&intr->dest.ssa, flipped);
1196    return true;
1197 }
1198 
1199 static void
1200 agx_optimize_nir(nir_shader *nir)
1201 {
1202    bool progress;
1203 
1204    nir_lower_idiv_options idiv_options = {
1205       .imprecise_32bit_lowering = true,
1206       .allow_fp16 = true,
1207    };
1208 
1209    NIR_PASS_V(nir, nir_lower_regs_to_ssa);
1210    NIR_PASS_V(nir, nir_lower_int64);
1211    NIR_PASS_V(nir, nir_lower_idiv, &idiv_options);
1212    NIR_PASS_V(nir, nir_lower_alu_to_scalar, NULL, NULL);
1213    NIR_PASS_V(nir, nir_lower_load_const_to_scalar);
1214    NIR_PASS_V(nir, nir_lower_flrp, 16 | 32 | 64, false);
1215    NIR_PASS_V(nir, agx_lower_sincos);
1216    NIR_PASS_V(nir, nir_shader_instructions_pass,
1217          agx_lower_front_face,
1218          nir_metadata_block_index | nir_metadata_dominance, NULL);
1219 
1220    do {
1221       progress = false;
1222 
1223       NIR_PASS(progress, nir, nir_lower_var_copies);
1224       NIR_PASS(progress, nir, nir_lower_vars_to_ssa);
1225 
1226       NIR_PASS(progress, nir, nir_copy_prop);
1227       NIR_PASS(progress, nir, nir_opt_remove_phis);
1228       NIR_PASS(progress, nir, nir_opt_dce);
1229       NIR_PASS(progress, nir, nir_opt_dead_cf);
1230       NIR_PASS(progress, nir, nir_opt_cse);
1231       NIR_PASS(progress, nir, nir_opt_peephole_select, 64, false, true);
1232       NIR_PASS(progress, nir, nir_opt_algebraic);
1233       NIR_PASS(progress, nir, nir_opt_constant_folding);
1234 
1235       NIR_PASS(progress, nir, nir_opt_undef);
1236       NIR_PASS(progress, nir, nir_lower_undef_to_zero);
1237 
1238       NIR_PASS(progress, nir, nir_opt_loop_unroll);
1239    } while (progress);
1240 
1241    NIR_PASS_V(nir, nir_opt_algebraic_late);
1242    NIR_PASS_V(nir, nir_opt_constant_folding);
1243    NIR_PASS_V(nir, nir_copy_prop);
1244    NIR_PASS_V(nir, nir_opt_dce);
1245    NIR_PASS_V(nir, nir_opt_cse);
1246    NIR_PASS_V(nir, nir_lower_alu_to_scalar, NULL, NULL);
1247    NIR_PASS_V(nir, nir_lower_load_const_to_scalar);
1248 
1249    /* Cleanup optimizations */
1250    nir_move_options move_all =
1251       nir_move_const_undef | nir_move_load_ubo | nir_move_load_input |
1252       nir_move_comparisons | nir_move_copies | nir_move_load_ssbo;
1253 
1254    NIR_PASS_V(nir, nir_opt_sink, move_all);
1255    NIR_PASS_V(nir, nir_opt_move, move_all);
1256    NIR_PASS_V(nir, nir_convert_from_ssa, true);
1257 }
1258 
1259 /* ABI: position first, then user, then psiz */
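/* e.g. a vertex shader writing gl_Position, one vec4 user varying, and
 * gl_PointSize is assigned slots 0-3, 4-7, and 8 respectively (nr_slots = 9) */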
1260 static void
1261 agx_remap_varyings_vs(nir_shader *nir, struct agx_varyings *varyings,
1262                       unsigned *remap)
1263 {
1264    unsigned base = 0;
1265 
1266    nir_variable *pos = nir_find_variable_with_location(nir, nir_var_shader_out, VARYING_SLOT_POS);
1267    if (pos) {
1268       assert(pos->data.driver_location < AGX_MAX_VARYINGS);
1269       remap[pos->data.driver_location] = base;
1270       base += 4;
1271    }
1272 
1273    nir_foreach_shader_out_variable(var, nir) {
1274       unsigned loc = var->data.location;
1275 
1276       if (loc == VARYING_SLOT_POS || loc == VARYING_SLOT_PSIZ) {
1277          continue;
1278       }
1279 
1280       assert(var->data.driver_location < AGX_MAX_VARYINGS);
1281       remap[var->data.driver_location] = base;
1282       base += 4;
1283    }
1284 
1285    nir_variable *psiz = nir_find_variable_with_location(nir, nir_var_shader_out, VARYING_SLOT_PSIZ);
1286    if (psiz) {
1287       assert(psiz->data.driver_location < AGX_MAX_VARYINGS);
1288       remap[psiz->data.driver_location] = base;
1289       base += 1;
1290    }
1291 
1292    varyings->nr_slots = base;
1293 }
1294 
1295 static void
1296 agx_remap_varyings_fs(nir_shader *nir, struct agx_varyings *varyings,
1297                       unsigned *remap)
1298 {
1299    struct agx_varying_packed *packed = varyings->packed;
1300    unsigned base = 0;
1301 
1302    agx_pack(packed, VARYING, cfg) {
1303       cfg.type = AGX_VARYING_TYPE_FRAGCOORD_W;
1304       cfg.components = 1;
1305       cfg.triangle_slot = cfg.point_slot = base;
1306    }
1307 
1308    base++;
1309    packed++;
1310 
1311    agx_pack(packed, VARYING, cfg) {
1312       cfg.type = AGX_VARYING_TYPE_FRAGCOORD_Z;
1313       cfg.components = 1;
1314       cfg.triangle_slot = cfg.point_slot = base;
1315    }
1316 
1317    base++;
1318    packed++;
1319 
1320    unsigned comps[MAX_VARYING] = { 0 };
1321 
1322    nir_foreach_shader_in_variable(var, nir) {
1323      unsigned loc = var->data.driver_location;
1324      const struct glsl_type *column =
1325         glsl_without_array_or_matrix(var->type);
1326      unsigned chan = glsl_get_components(column);
1327 
1328      /* If a fractional location is added, increase the size so the value
1329       * still fits, i.e. a vec3 in YZW requires allocating a vec4. We
1330       * could do better, but this is an edge case; normally packed
1331       * varyings will be aligned.
1332       */
1333      chan += var->data.location_frac;
1334      comps[loc] = MAX2(comps[loc], chan);
1335    }
1336 
1337    nir_foreach_shader_in_variable(var, nir) {
1338      unsigned loc = var->data.driver_location;
1339      unsigned sz = glsl_count_attribute_slots(var->type, FALSE);
1340      unsigned channels = comps[loc];
1341 
1342      assert(var->data.driver_location <= AGX_MAX_VARYINGS);
1343      remap[var->data.driver_location] = base;
1344 
1345      for (int c = 0; c < sz; ++c) {
1346         agx_pack(packed, VARYING, cfg) {
1347            cfg.type = (var->data.location == VARYING_SLOT_PNTC) ?
1348               AGX_VARYING_TYPE_POINT_COORDINATES :
1349               (var->data.interpolation == INTERP_MODE_FLAT) ?
1350                  AGX_VARYING_TYPE_FLAT_LAST :
1351                  AGX_VARYING_TYPE_SMOOTH;
1352 
1353            cfg.components = channels;
1354            cfg.triangle_slot = cfg.point_slot = base;
1355         }
1356 
1357         base += channels;
1358         packed++;
1359      }
1360    }
1361 
1362    varyings->nr_descs = (packed - varyings->packed);
1363    varyings->nr_slots = base;
1364 }
1365 
1366 void
1367 agx_compile_shader_nir(nir_shader *nir,
1368       struct agx_shader_key *key,
1369       struct util_dynarray *binary,
1370       struct agx_shader_info *out)
1371 {
1372    agx_debug = debug_get_option_agx_debug();
1373 
1374    agx_context *ctx = rzalloc(NULL, agx_context);
1375    ctx->nir = nir;
1376    ctx->out = out;
1377    ctx->key = key;
1378    ctx->stage = nir->info.stage;
1379    list_inithead(&ctx->blocks);
1380 
1381    if (ctx->stage == MESA_SHADER_VERTEX) {
1382       out->writes_psiz = nir->info.outputs_written &
1383          BITFIELD_BIT(VARYING_SLOT_PSIZ);
1384    }
1385 
1386    NIR_PASS_V(nir, nir_lower_vars_to_ssa);
1387 
1388    /* Lower large arrays to scratch and small arrays to csel */
1389    NIR_PASS_V(nir, nir_lower_vars_to_scratch, nir_var_function_temp, 16,
1390          glsl_get_natural_size_align_bytes);
1391    NIR_PASS_V(nir, nir_lower_indirect_derefs, nir_var_function_temp, ~0);
1392 
1393    if (ctx->stage == MESA_SHADER_VERTEX) {
1394       /* Lower from OpenGL [-1, 1] to [0, 1] if half-z is not set */
1395       if (!key->vs.clip_halfz)
1396          NIR_PASS_V(nir, nir_lower_clip_halfz);
1397    } else if (ctx->stage == MESA_SHADER_FRAGMENT) {
1398       /* Flip point coordinate since OpenGL and Metal disagree */
1399       NIR_PASS_V(nir, nir_shader_instructions_pass,
1400             agx_lower_point_coord,
1401             nir_metadata_block_index | nir_metadata_dominance, NULL);
1402    }
1403 
1404    NIR_PASS_V(nir, nir_split_var_copies);
1405    NIR_PASS_V(nir, nir_lower_global_vars_to_local);
1406    NIR_PASS_V(nir, nir_lower_var_copies);
1407    NIR_PASS_V(nir, nir_lower_vars_to_ssa);
1408    NIR_PASS_V(nir, nir_lower_io, nir_var_shader_in | nir_var_shader_out,
1409          glsl_type_size, 0);
1410    if (ctx->stage == MESA_SHADER_FRAGMENT) {
1411       NIR_PASS_V(nir, nir_lower_mediump_io,
1412             nir_var_shader_in | nir_var_shader_out, ~0, false);
1413    }
1414    NIR_PASS_V(nir, nir_lower_ssbo);
1415 
1416    /* Varying output is scalar, other I/O is vector */
1417    if (ctx->stage == MESA_SHADER_VERTEX) {
1418       NIR_PASS_V(nir, nir_lower_io_to_scalar, nir_var_shader_out);
1419    }
1420 
1421    nir_lower_tex_options lower_tex_options = {
1422       .lower_txs_lod = true,
1423       .lower_txp = ~0,
1424    };
1425 
1426    nir_tex_src_type_constraints tex_constraints = {
1427       [nir_tex_src_lod] = { true, 16 }
1428    };
1429 
1430    NIR_PASS_V(nir, nir_lower_tex, &lower_tex_options);
1431    NIR_PASS_V(nir, nir_legalize_16bit_sampler_srcs, tex_constraints);
1432 
1433    agx_optimize_nir(nir);
1434 
1435    /* Must be last since NIR passes can remap driver_location freely */
1436    if (ctx->stage == MESA_SHADER_VERTEX) {
1437       agx_remap_varyings_vs(nir, &out->varyings, ctx->varyings);
1438    } else if (ctx->stage == MESA_SHADER_FRAGMENT) {
1439       agx_remap_varyings_fs(nir, &out->varyings, ctx->varyings);
1440    }
1441 
1442    bool skip_internal = nir->info.internal;
1443    skip_internal &= !(agx_debug & AGX_DBG_INTERNAL);
1444 
1445    if (agx_debug & AGX_DBG_SHADERS && !skip_internal) {
1446       nir_print_shader(nir, stdout);
1447    }
1448 
1449    nir_foreach_function(func, nir) {
1450       if (!func->impl)
1451          continue;
1452 
1453       /* TODO: Handle phi nodes instead of just convert_from_ssa and yolo'ing
1454        * the mapping of nir_register to hardware registers and guaranteeing bad
1455        * performance and breaking spilling... */
1456       ctx->nir_regalloc = rzalloc_array(ctx, unsigned, func->impl->reg_alloc);
1457 
1458       /* Leave the last 4 registers for hacky p-copy lowering */
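      /* Register indices are counted in 16-bit halves here, hence the factor
       * of 2 when reserving those registers and in the size calculation in
       * the loop below. */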
1459       unsigned nir_regalloc = AGX_NUM_REGS - (4 * 2);
1460 
1461       /* Assign backwards so we don't need to guess a size */
1462       nir_foreach_register(reg, &func->impl->registers) {
1463          /* Ensure alignment */
1464          if (reg->bit_size >= 32 && (nir_regalloc & 1))
1465             nir_regalloc--;
1466 
1467          unsigned size = DIV_ROUND_UP(reg->bit_size * reg->num_components, 16);
1468          nir_regalloc -= size;
1469          ctx->nir_regalloc[reg->index] = nir_regalloc;
1470       }
1471 
1472       ctx->max_register = nir_regalloc;
1473       ctx->alloc += func->impl->ssa_alloc;
1474       emit_cf_list(ctx, &func->impl->body);
1475       break; /* TODO: Multi-function shaders */
1476    }
1477 
1478    /* TODO: Actual RA... this way passes don't need to deal with nir_register */
1479    agx_foreach_instr_global(ctx, I) {
1480       agx_foreach_dest(I, d) {
1481          if (I->dest[d].type == AGX_INDEX_NIR_REGISTER) {
1482             I->dest[d].type = AGX_INDEX_REGISTER;
1483             I->dest[d].value = ctx->nir_regalloc[I->dest[d].value];
1484          }
1485       }
1486 
1487       agx_foreach_src(I, s) {
1488          if (I->src[s].type == AGX_INDEX_NIR_REGISTER) {
1489             I->src[s].type = AGX_INDEX_REGISTER;
1490             I->src[s].value = ctx->nir_regalloc[I->src[s].value];
1491          }
1492       }
1493    }
1494 
1495    /* Terminate the shader after the exit block */
1496    agx_block *last_block = list_last_entry(&ctx->blocks, agx_block, link);
1497    agx_builder _b = agx_init_builder(ctx, agx_after_block(last_block));
1498    agx_stop(&_b);
1499 
1500    /* Also add traps to match the blob, though their purpose is unclear */
1501    for (unsigned i = 0; i < 8; ++i)
1502       agx_trap(&_b);
1503 
1504    unsigned block_source_count = 0;
1505 
1506    /* Name blocks now that we're done emitting so the order is consistent */
1507    agx_foreach_block(ctx, block)
1508       block->name = block_source_count++;
1509 
1510    if (agx_debug & AGX_DBG_SHADERS && !skip_internal)
1511       agx_print_shader(ctx, stdout);
1512 
1513    agx_optimizer(ctx);
1514    agx_dce(ctx);
1515 
1516    if (agx_debug & AGX_DBG_SHADERS && !skip_internal)
1517       agx_print_shader(ctx, stdout);
1518 
1519    agx_ra(ctx);
1520 
1521    if (ctx->stage == MESA_SHADER_VERTEX)
1522       agx_set_st_vary_final(ctx);
1523 
1524    if (agx_debug & AGX_DBG_SHADERS && !skip_internal)
1525       agx_print_shader(ctx, stdout);
1526 
1527    agx_pack_binary(ctx, binary);
1528 
1529    if ((agx_debug & AGX_DBG_SHADERDB) && !skip_internal)
1530       agx_print_stats(ctx, binary->size, stderr);
1531 
1532    ralloc_free(ctx);
1533 }
1534