1 /*
2  * Copyright (C) 2020 Collabora Ltd.
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21  * SOFTWARE.
22  *
23  * Authors (Collabora):
24  *      Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
25  */
26 
27 #include "main/mtypes.h"
28 #include "compiler/glsl/glsl_to_nir.h"
29 #include "compiler/nir_types.h"
30 #include "compiler/nir/nir_builder.h"
31 #include "util/u_debug.h"
32 
33 #include "disassemble.h"
34 #include "bifrost_compile.h"
35 #include "compiler.h"
36 #include "bi_quirks.h"
37 #include "bi_builder.h"
38 #include "bifrost_nir.h"
39 
40 static const struct debug_named_value bifrost_debug_options[] = {
41         {"msgs",      BIFROST_DBG_MSGS,		"Print debug messages"},
42         {"shaders",   BIFROST_DBG_SHADERS,	"Dump shaders in NIR and MIR"},
43         {"shaderdb",  BIFROST_DBG_SHADERDB,	"Print statistics"},
44         {"verbose",   BIFROST_DBG_VERBOSE,	"Disassemble verbosely"},
45         {"internal",  BIFROST_DBG_INTERNAL,	"Dump even internal shaders"},
46         {"nosched",   BIFROST_DBG_NOSCHED, 	"Force trivial bundling"},
47         {"inorder",   BIFROST_DBG_INORDER, 	"Force in-order bundling"},
48         {"novalidate",BIFROST_DBG_NOVALIDATE,   "Skip IR validation"},
49         {"noopt",     BIFROST_DBG_NOOPT,        "Skip optimization passes"},
50         DEBUG_NAMED_VALUE_END
51 };
52 
53 DEBUG_GET_ONCE_FLAGS_OPTION(bifrost_debug, "BIFROST_MESA_DEBUG", bifrost_debug_options, 0)
54 
55 /* How many bytes are prefetched by the Bifrost shader core. From the final
56  * clause of the shader, this range must be valid instructions or zero. */
57 #define BIFROST_SHADER_PREFETCH 128
58 
59 int bifrost_debug = 0;
60 
61 #define DBG(fmt, ...) \
62 		do { if (bifrost_debug & BIFROST_DBG_MSGS) \
63 			fprintf(stderr, "%s:%d: "fmt, \
64 				__FUNCTION__, __LINE__, ##__VA_ARGS__); } while (0)
65 
66 static bi_block *emit_cf_list(bi_context *ctx, struct exec_list *list);
67 
68 static void
69 bi_block_add_successor(bi_block *block, bi_block *successor)
70 {
71         assert(block != NULL && successor != NULL);
72 
73         /* Cull impossible edges */
74         if (block->unconditional_jumps)
75                 return;
76 
77         for (unsigned i = 0; i < ARRAY_SIZE(block->successors); ++i) {
78                 if (block->successors[i]) {
79                        if (block->successors[i] == successor)
80                                return;
81                        else
82                                continue;
83                 }
84 
85                 block->successors[i] = successor;
86                 _mesa_set_add(successor->predecessors, block);
87                 return;
88         }
89 
90         unreachable("Too many successors");
91 }
92 
93 static void
94 bi_emit_jump(bi_builder *b, nir_jump_instr *instr)
95 {
96         bi_instr *branch = bi_jump(b, bi_zero());
97 
98         switch (instr->type) {
99         case nir_jump_break:
100                 branch->branch_target = b->shader->break_block;
101                 break;
102         case nir_jump_continue:
103                 branch->branch_target = b->shader->continue_block;
104                 break;
105         default:
106                 unreachable("Unhandled jump type");
107         }
108 
109         bi_block_add_successor(b->shader->current_block, branch->branch_target);
110         b->shader->current_block->unconditional_jumps = true;
111 }
112 
113 static bi_index
114 bi_varying_src0_for_barycentric(bi_builder *b, nir_intrinsic_instr *intr)
115 {
116         switch (intr->intrinsic) {
117         case nir_intrinsic_load_barycentric_centroid:
118         case nir_intrinsic_load_barycentric_sample:
119                 return bi_register(61);
120 
121         /* Need to put the sample ID in the top 16-bits */
122         case nir_intrinsic_load_barycentric_at_sample:
123                 return bi_mkvec_v2i16(b, bi_half(bi_dontcare(), false),
124                                 bi_half(bi_src_index(&intr->src[0]), false));
125 
126         /* Interpret as 8:8 signed fixed point positions in pixels along X and
127          * Y axes respectively, relative to top-left of pixel. In NIR, (0, 0)
128          * is the center of the pixel so we first fixup and then convert. For
129          * fp16 input:
130          *
131          * f2i16(((x, y) + (0.5, 0.5)) * 2**8) =
132          * f2i16((256 * (x, y)) + (128, 128)) =
133          * V2F16_TO_V2S16(FMA.v2f16((x, y), #256, #128))
134          *
135          * For fp32 input, that lacks enough precision for MSAA 16x, but the
136          * idea is the same. FIXME: still doesn't pass
137          */
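        /* Illustrative check of the fixup above: a NIR offset of +0.25 from
         * the pixel center becomes 0.25 * 256 + 128 = 192, i.e. 0.75 of a
         * pixel from the top-left corner in 8:8 fixed point, as expected. */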
138         case nir_intrinsic_load_barycentric_at_offset: {
139                 bi_index offset = bi_src_index(&intr->src[0]);
140                 bi_index f16 = bi_null();
141                 unsigned sz = nir_src_bit_size(intr->src[0]);
142 
143                 if (sz == 16) {
144                         f16 = bi_fma_v2f16(b, offset, bi_imm_f16(256.0),
145                                         bi_imm_f16(128.0), BI_ROUND_NONE);
146                 } else {
147                         assert(sz == 32);
148                         bi_index f[2];
149                         for (unsigned i = 0; i < 2; ++i) {
150                                 f[i] = bi_fadd_rscale_f32(b,
151                                                 bi_word(offset, i),
152                                                 bi_imm_f32(0.5), bi_imm_u32(8),
153                                                 BI_ROUND_NONE, BI_SPECIAL_NONE);
154                         }
155 
156                         f16 = bi_v2f32_to_v2f16(b, f[0], f[1], BI_ROUND_NONE);
157                 }
158 
159                 return bi_v2f16_to_v2s16(b, f16, BI_ROUND_RTZ);
160         }
161 
162         case nir_intrinsic_load_barycentric_pixel:
163         default:
164                 return bi_dontcare();
165         }
166 }
167 
168 static enum bi_sample
169 bi_interp_for_intrinsic(nir_intrinsic_op op)
170 {
171         switch (op) {
172         case nir_intrinsic_load_barycentric_centroid:
173                 return BI_SAMPLE_CENTROID;
174         case nir_intrinsic_load_barycentric_sample:
175         case nir_intrinsic_load_barycentric_at_sample:
176                 return BI_SAMPLE_SAMPLE;
177         case nir_intrinsic_load_barycentric_at_offset:
178                 return BI_SAMPLE_EXPLICIT;
179         case nir_intrinsic_load_barycentric_pixel:
180         default:
181                 return BI_SAMPLE_CENTER;
182         }
183 }
184 
185 /* auto, 64-bit omitted */
186 static enum bi_register_format
187 bi_reg_fmt_for_nir(nir_alu_type T)
188 {
189         switch (T) {
190         case nir_type_float16: return BI_REGISTER_FORMAT_F16;
191         case nir_type_float32: return BI_REGISTER_FORMAT_F32;
192         case nir_type_int16:   return BI_REGISTER_FORMAT_S16;
193         case nir_type_uint16:  return BI_REGISTER_FORMAT_U16;
194         case nir_type_int32:   return BI_REGISTER_FORMAT_S32;
195         case nir_type_uint32:  return BI_REGISTER_FORMAT_U32;
196         default: unreachable("Invalid type for register format");
197         }
198 }
199 
200 /* Checks if the _IMM variant of an intrinsic can be used, returning in imm the
201  * immediate to be used (which applies even if _IMM can't be used) */
202 
203 static bool
204 bi_is_intr_immediate(nir_intrinsic_instr *instr, unsigned *immediate, unsigned max)
205 {
206         nir_src *offset = nir_get_io_offset_src(instr);
207 
208         if (!nir_src_is_const(*offset))
209                 return false;
210 
211         *immediate = nir_intrinsic_base(instr) + nir_src_as_uint(*offset);
212         return (*immediate) < max;
213 }
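
/* For instance, base 1 with a constant offset of 2 yields an immediate of 3,
 * usable with the _IMM encoding as long as it is below the caller's limit
 * (16 or 20 in the callers below). */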
214 
215 static void
216 bi_make_vec_to(bi_builder *b, bi_index final_dst,
217                 bi_index *src,
218                 unsigned *channel,
219                 unsigned count,
220                 unsigned bitsize);
221 
222 /* Bifrost's load instructions lack a component offset despite operating in
223  * terms of vec4 slots. Usually I/O vectorization avoids nonzero components,
224  * but they may be unavoidable with separate shaders in use. To solve this, we
225  * lower to a larger load and an explicit copy of the desired components. */
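/* For example, a two-component load of components 2-3 (.zw) becomes a load
 * covering components 0-3 into a temporary, followed by a copy of components
 * 2 and 3 into the real destination. */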
226 
227 static void
228 bi_copy_component(bi_builder *b, nir_intrinsic_instr *instr, bi_index tmp)
229 {
230         unsigned component = nir_intrinsic_component(instr);
231 
232         if (component == 0)
233                 return;
234 
235         bi_index srcs[] = { tmp, tmp, tmp, tmp };
236         unsigned channels[] = { component, component + 1, component + 2 };
237 
238         bi_make_vec_to(b,
239                         bi_dest_index(&instr->dest),
240                         srcs, channels, instr->num_components,
241                         nir_dest_bit_size(instr->dest));
242 }
243 
244 static void
245 bi_emit_load_attr(bi_builder *b, nir_intrinsic_instr *instr)
246 {
247         nir_alu_type T = nir_intrinsic_dest_type(instr);
248         enum bi_register_format regfmt = bi_reg_fmt_for_nir(T);
249         nir_src *offset = nir_get_io_offset_src(instr);
250         unsigned component = nir_intrinsic_component(instr);
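        /* The hardware vector size is encoded as (number of components - 1),
         * and the load cannot skip leading components, so it must cover
         * everything up to the highest component read; bi_copy_component then
         * extracts the wanted channels */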
251         enum bi_vecsize vecsize = (instr->num_components + component - 1);
252         unsigned imm_index = 0;
253         unsigned base = nir_intrinsic_base(instr);
254         bool constant = nir_src_is_const(*offset);
255         bool immediate = bi_is_intr_immediate(instr, &imm_index, 16);
256         bi_index dest = (component == 0) ? bi_dest_index(&instr->dest) : bi_temp(b->shader);
257 
258         if (immediate) {
259                 bi_ld_attr_imm_to(b, dest, bi_register(61), bi_register(62),
260                                 regfmt, vecsize, imm_index);
261         } else {
262                 bi_index idx = bi_src_index(&instr->src[0]);
263 
264                 if (constant)
265                         idx = bi_imm_u32(imm_index);
266                 else if (base != 0)
267                         idx = bi_iadd_u32(b, idx, bi_imm_u32(base), false);
268 
269                 bi_ld_attr_to(b, dest, bi_register(61), bi_register(62),
270                                 idx, regfmt, vecsize);
271         }
272 
273         bi_copy_component(b, instr, dest);
274 }
275 
276 static void
277 bi_emit_load_vary(bi_builder *b, nir_intrinsic_instr *instr)
278 {
279         enum bi_sample sample = BI_SAMPLE_CENTER;
280         enum bi_update update = BI_UPDATE_STORE;
281         enum bi_register_format regfmt = BI_REGISTER_FORMAT_AUTO;
282         bool smooth = instr->intrinsic == nir_intrinsic_load_interpolated_input;
283         bi_index src0 = bi_null();
284 
285         unsigned component = nir_intrinsic_component(instr);
286         enum bi_vecsize vecsize = (instr->num_components + component - 1);
287         bi_index dest = (component == 0) ? bi_dest_index(&instr->dest) : bi_temp(b->shader);
288 
289         unsigned sz = nir_dest_bit_size(instr->dest);
290 
291         if (smooth) {
292                 nir_intrinsic_instr *parent = nir_src_as_intrinsic(instr->src[0]);
293                 assert(parent);
294 
295                 sample = bi_interp_for_intrinsic(parent->intrinsic);
296                 src0 = bi_varying_src0_for_barycentric(b, parent);
297 
298                 assert(sz == 16 || sz == 32);
299                 regfmt = (sz == 16) ? BI_REGISTER_FORMAT_F16
300                         : BI_REGISTER_FORMAT_F32;
301         } else {
302                 assert(sz == 32);
303                 regfmt = BI_REGISTER_FORMAT_U32;
304         }
305 
306         nir_src *offset = nir_get_io_offset_src(instr);
307         unsigned imm_index = 0;
308         bool immediate = bi_is_intr_immediate(instr, &imm_index, 20);
309 
310         if (immediate && smooth) {
311                 bi_ld_var_imm_to(b, dest, src0, regfmt, sample, update,
312                                 vecsize, imm_index);
313         } else if (immediate && !smooth) {
314                 bi_ld_var_flat_imm_to(b, dest, BI_FUNCTION_NONE, regfmt,
315                                 vecsize, imm_index);
316         } else {
317                 bi_index idx = bi_src_index(offset);
318                 unsigned base = nir_intrinsic_base(instr);
319 
320                 if (base != 0)
321                         idx = bi_iadd_u32(b, idx, bi_imm_u32(base), false);
322 
323                 if (smooth) {
324                         bi_ld_var_to(b, dest, src0, idx, regfmt, sample,
325                                         update, vecsize);
326                 } else {
327                         bi_ld_var_flat_to(b, dest, idx, BI_FUNCTION_NONE,
328                                         regfmt, vecsize);
329                 }
330         }
331 
332         bi_copy_component(b, instr, dest);
333 }
334 
335 static void
336 bi_make_vec16_to(bi_builder *b, bi_index dst, bi_index *src,
337                 unsigned *channel, unsigned count)
338 {
339         for (unsigned i = 0; i < count; i += 2) {
340                 bool next = (i + 1) < count;
341 
342                 unsigned chan = channel ? channel[i] : 0;
343                 unsigned nextc = next && channel ? channel[i + 1] : 0;
344 
345                 bi_index w0 = bi_word(src[i], chan >> 1);
346                 bi_index w1 = next ? bi_word(src[i + 1], nextc >> 1) : bi_zero();
347 
348                 bi_index h0 = bi_half(w0, chan & 1);
349                 bi_index h1 = bi_half(w1, nextc & 1);
350 
351                 bi_index to = bi_word(dst, i >> 1);
352 
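                /* If both halves already come from the same 32-bit word, a
                 * plain MOV (identity order) or a SWZ is enough; otherwise
                 * build the pair from two halves with MKVEC */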
353                 if (bi_is_word_equiv(w0, w1) && (chan & 1) == 0 && ((nextc & 1) == 1))
354                         bi_mov_i32_to(b, to, w0);
355                 else if (bi_is_word_equiv(w0, w1))
356                         bi_swz_v2i16_to(b, to, bi_swz_16(w0, chan & 1, nextc & 1));
357                 else
358                         bi_mkvec_v2i16_to(b, to, h0, h1);
359         }
360 }
361 
362 static void
363 bi_make_vec_to(bi_builder *b, bi_index final_dst,
364                 bi_index *src,
365                 unsigned *channel,
366                 unsigned count,
367                 unsigned bitsize)
368 {
369         /* If we read our own output, we need a temporary move to allow for
370          * swapping. TODO: Could do a bit better for pairwise swaps of 16-bit
371          * vectors */
372         bool reads_self = false;
373 
374         for (unsigned i = 0; i < count; ++i)
375                 reads_self |= bi_is_equiv(final_dst, src[i]);
376 
377         /* SSA can't read itself */
378         assert(!reads_self || final_dst.reg);
379 
380         bi_index dst = reads_self ? bi_temp(b->shader) : final_dst;
381 
382         if (bitsize == 32) {
383                 for (unsigned i = 0; i < count; ++i) {
384                         bi_mov_i32_to(b, bi_word(dst, i),
385                                         bi_word(src[i], channel ? channel[i] : 0));
386                 }
387         } else if (bitsize == 16) {
388                 bi_make_vec16_to(b, dst, src, channel, count);
389         } else if (bitsize == 8 && count == 1) {
390                 bi_swz_v4i8_to(b, dst, bi_byte(
391                                         bi_word(src[0], channel[0] >> 2),
392                                         channel[0] & 3));
393         } else {
394                 unreachable("8-bit mkvec not yet supported");
395         }
396 
397         /* Emit an explicit copy if needed */
398         if (!bi_is_equiv(dst, final_dst)) {
399                 unsigned shift = (bitsize == 8) ? 2 : (bitsize == 16) ? 1 : 0;
400                 unsigned vec = (1 << shift);
401 
402                 for (unsigned i = 0; i < count; i += vec) {
403                         bi_mov_i32_to(b, bi_word(final_dst, i >> shift),
404                                         bi_word(dst, i >> shift));
405                 }
406         }
407 }
408 
409 static bi_instr *
410 bi_load_sysval_to(bi_builder *b, bi_index dest, int sysval,
411                 unsigned nr_components, unsigned offset)
412 {
413         unsigned sysval_ubo =
414                 MAX2(b->shader->inputs->sysval_ubo, b->shader->nir->info.num_ubos);
415         unsigned uniform =
416                 pan_lookup_sysval(b->shader->sysval_to_id,
417                                   &b->shader->info->sysvals,
418                                   sysval);
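        /* Sysvals are laid out as vec4 slots in a dedicated UBO, so the byte
         * offset is the slot index scaled by 16 plus the offset into the
         * vector */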
419         unsigned idx = (uniform * 16) + offset;
420 
421         return bi_load_to(b, nr_components * 32, dest,
422                         bi_imm_u32(idx),
423                         bi_imm_u32(sysval_ubo), BI_SEG_UBO);
424 }
425 
426 static void
427 bi_load_sysval_nir(bi_builder *b, nir_intrinsic_instr *intr,
428                 unsigned nr_components, unsigned offset)
429 {
430         bi_load_sysval_to(b, bi_dest_index(&intr->dest),
431                         panfrost_sysval_for_instr(&intr->instr, NULL),
432                         nr_components, offset);
433 }
434 
435 static bi_index
436 bi_load_sysval(bi_builder *b, int sysval,
437                 unsigned nr_components, unsigned offset)
438 {
439         bi_index tmp = bi_temp(b->shader);
440         bi_load_sysval_to(b, tmp, sysval, nr_components, offset);
441         return tmp;
442 }
443 
444 static void
445 bi_load_sample_id_to(bi_builder *b, bi_index dst)
446 {
447         /* r61[16:23] contains the sampleID, mask it out. Upper bits
448          * seem to read garbage (despite being architecturally defined
449          * as zero), so use a 5-bit mask instead of 8-bits */
450 
451         bi_rshift_and_i32_to(b, dst, bi_register(61), bi_imm_u32(0x1f),
452                                 bi_imm_u8(16));
453 }
454 
455 static bi_index
456 bi_load_sample_id(bi_builder *b)
457 {
458         bi_index sample_id = bi_temp(b->shader);
459         bi_load_sample_id_to(b, sample_id);
460         return sample_id;
461 }
462 
463 static bi_index
464 bi_pixel_indices(bi_builder *b, unsigned rt)
465 {
466         /* We want to load the current pixel. */
467         struct bifrost_pixel_indices pix = {
468                 .y = BIFROST_CURRENT_PIXEL,
469                 .rt = rt
470         };
471 
472         uint32_t indices_u32 = 0;
473         memcpy(&indices_u32, &pix, sizeof(indices_u32));
474         bi_index indices = bi_imm_u32(indices_u32);
475 
476         /* Sample index above is left as zero. For multisampling, we need to
477          * fill in the actual sample ID in the lower byte */
478 
479         if (b->shader->inputs->blend.nr_samples > 1)
480                 indices = bi_iadd_u32(b, indices, bi_load_sample_id(b), false);
481 
482         return indices;
483 }
484 
485 static void
486 bi_emit_load_blend_input(bi_builder *b, nir_intrinsic_instr *instr)
487 {
488         ASSERTED nir_io_semantics sem = nir_intrinsic_io_semantics(instr);
489 
490         /* Source color is passed through r0-r3, or r4-r7 for the second
491          * source when dual-source blending.  TODO: Precolour instead */
492         bi_index srcs[] = {
493                 bi_register(0), bi_register(1), bi_register(2), bi_register(3)
494         };
495         bi_index srcs2[] = {
496                 bi_register(4), bi_register(5), bi_register(6), bi_register(7)
497         };
498 
499         bool second_source = (sem.location == VARYING_SLOT_VAR0);
500 
501         bi_make_vec_to(b, bi_dest_index(&instr->dest),
502                        second_source ? srcs2 : srcs,
503                        NULL, 4, 32);
504 }
505 
506 static void
507 bi_emit_blend_op(bi_builder *b, bi_index rgba, nir_alu_type T, unsigned rt)
508 {
509         /* Reads 2 or 4 staging registers to cover the input */
510         unsigned size = nir_alu_type_get_type_size(T);
511         unsigned sr_count = (size <= 16) ? 2 : 4;
512         const struct panfrost_compile_inputs *inputs = b->shader->inputs;
513         uint64_t blend_desc = inputs->blend.bifrost_blend_desc;
514 
515         if (inputs->is_blend && inputs->blend.nr_samples > 1) {
516                 /* Conversion descriptor comes from the compile inputs, pixel
517                  * indices derived at run time based on sample ID */
518                 bi_st_tile(b, rgba, bi_pixel_indices(b, rt), bi_register(60),
519                                 bi_imm_u32(blend_desc >> 32), BI_VECSIZE_V4);
520         } else if (b->shader->inputs->is_blend) {
521                 /* Blend descriptor comes from the compile inputs */
522                 /* Put the result in r0 */
523                 bi_blend_to(b, bi_register(0), rgba,
524                                 bi_register(60),
525                                 bi_imm_u32(blend_desc & 0xffffffff),
526                                 bi_imm_u32(blend_desc >> 32), sr_count);
527         } else {
528                 /* Blend descriptor comes from the FAU RAM. By convention, the
529                  * return address is stored in r48 and will be used by the
530                  * blend shader to jump back to the fragment shader after */
531                 bi_blend_to(b, bi_register(48), rgba,
532                                 bi_register(60),
533                                 bi_fau(BIR_FAU_BLEND_0 + rt, false),
534                                 bi_fau(BIR_FAU_BLEND_0 + rt, true), sr_count);
535         }
536 
537         assert(rt < 8);
538         b->shader->info->bifrost.blend[rt].type = T;
539 }
540 
541 /* Blend shaders do not need to run ATEST since they are dependent on a
542  * fragment shader that runs it. Blit shaders may not need to run ATEST, since
543  * ATEST is not needed if early-z is forced, alpha-to-coverage is disabled, and
544  * there are no writes to the coverage mask. The latter two are satisfied for
545  * all blit shaders, so we just care about early-z, which blit shaders force
546  * iff they do not write depth or stencil */
547 
548 static bool
549 bi_skip_atest(bi_context *ctx, bool emit_zs)
550 {
551         return (ctx->inputs->is_blit && !emit_zs) || ctx->inputs->is_blend;
552 }
553 
554 static void
555 bi_emit_atest(bi_builder *b, bi_index alpha)
556 {
557         bi_index coverage = bi_register(60);
558         bi_instr *atest = bi_atest_to(b, coverage, coverage, alpha);
559         b->shader->emitted_atest = true;
560 
561         /* Pseudo-source to encode in the tuple */
562         atest->src[2] = bi_fau(BIR_FAU_ATEST_PARAM, false);
563 }
564 
565 static void
566 bi_emit_fragment_out(bi_builder *b, nir_intrinsic_instr *instr)
567 {
568         bool combined = instr->intrinsic ==
569                 nir_intrinsic_store_combined_output_pan;
570 
571         unsigned writeout = combined ? nir_intrinsic_component(instr) :
572                 PAN_WRITEOUT_C;
573 
574         bool emit_blend = writeout & (PAN_WRITEOUT_C);
575         bool emit_zs = writeout & (PAN_WRITEOUT_Z | PAN_WRITEOUT_S);
576 
577         const nir_variable *var =
578                 nir_find_variable_with_driver_location(b->shader->nir,
579                                 nir_var_shader_out, nir_intrinsic_base(instr));
580         assert(var);
581 
582         unsigned loc = var->data.location;
583         bi_index src0 = bi_src_index(&instr->src[0]);
584 
585         /* By ISA convention, the coverage mask is stored in R60. The store
586          * itself will be handled by a subsequent ATEST instruction */
587         if (loc == FRAG_RESULT_SAMPLE_MASK) {
588                 bi_index orig = bi_register(60);
589                 bi_index msaa = bi_load_sysval(b, PAN_SYSVAL_MULTISAMPLED, 1, 0);
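                /* AND the written mask into the existing coverage, but keep
                 * the original coverage when the MULTISAMPLED sysval indicates
                 * single-sampled rendering */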
590                 bi_index new = bi_lshift_and_i32(b, orig, src0, bi_imm_u8(0));
591                 bi_mux_i32_to(b, orig, orig, new, msaa, BI_MUX_INT_ZERO);
592                 return;
593         }
594 
595 
596         /* Dual-source blending is implemented by putting the color in
597          * registers r4-r7. */
598         if (var->data.index) {
599                 unsigned count = nir_src_num_components(instr->src[0]);
600 
601                 for (unsigned i = 0; i < count; ++i)
602                         bi_mov_i32_to(b, bi_register(4 + i), bi_word(src0, i));
603 
604                 b->shader->info->bifrost.blend_src1_type =
605                         nir_intrinsic_src_type(instr);
606 
607                 return;
608         }
609 
610         /* Emit ATEST if we have to, note ATEST requires a floating-point alpha
611          * value, but render target #0 might not be floating point. However the
612          * alpha value is only used for alpha-to-coverage, a stage which is
613          * skipped for pure integer framebuffers, so the issue is moot. */
614 
615         if (!b->shader->emitted_atest && !bi_skip_atest(b->shader, emit_zs)) {
616                 nir_alu_type T = nir_intrinsic_src_type(instr);
617 
618                 bi_index rgba = bi_src_index(&instr->src[0]);
619                 bi_index alpha =
620                         (T == nir_type_float16) ? bi_half(bi_word(rgba, 1), true) :
621                         (T == nir_type_float32) ? bi_word(rgba, 3) :
622                         bi_dontcare();
623 
624                 /* Don't read out-of-bounds */
625                 if (nir_src_num_components(instr->src[0]) < 4)
626                         alpha = bi_imm_f32(1.0);
627 
628                 bi_emit_atest(b, alpha);
629         }
630 
631         if (emit_zs) {
632                 bi_index z = { 0 }, s = { 0 };
633 
634                 if (writeout & PAN_WRITEOUT_Z)
635                         z = bi_src_index(&instr->src[2]);
636 
637                 if (writeout & PAN_WRITEOUT_S)
638                         s = bi_src_index(&instr->src[3]);
639 
640                 bi_zs_emit_to(b, bi_register(60), z, s, bi_register(60),
641                                 writeout & PAN_WRITEOUT_S,
642                                 writeout & PAN_WRITEOUT_Z);
643         }
644 
645         if (emit_blend) {
646                 assert(loc >= FRAG_RESULT_DATA0);
647 
648                 unsigned rt = (loc - FRAG_RESULT_DATA0);
649                 bi_index color = bi_src_index(&instr->src[0]);
650 
651                 /* Explicit copy since BLEND inputs are precoloured to R0-R3,
652                  * TODO: maybe schedule around this or implement in RA as a
653                  * spill */
654                 bool has_mrt = false;
655 
656                 nir_foreach_shader_out_variable(var, b->shader->nir)
657                         has_mrt |= (var->data.location > FRAG_RESULT_DATA0);
658 
659                 if (has_mrt) {
660                         bi_index srcs[4] = { color, color, color, color };
661                         unsigned channels[4] = { 0, 1, 2, 3 };
662                         color = bi_temp(b->shader);
663                         bi_make_vec_to(b, color, srcs, channels,
664                                        nir_src_num_components(instr->src[0]),
665                                        nir_alu_type_get_type_size(nir_intrinsic_src_type(instr)));
666                 }
667 
668                 bi_emit_blend_op(b, color, nir_intrinsic_src_type(instr), rt);
669         }
670 
671         if (b->shader->inputs->is_blend) {
672                 /* Jump back to the fragment shader, return address is stored
673                  * in r48 (see above).
674                  */
675                 bi_jump(b, bi_register(48));
676         }
677 }
678 
679 static void
680 bi_emit_store_vary(bi_builder *b, nir_intrinsic_instr *instr)
681 {
682         /* In principle we can do better for 16-bit. At the moment we require
683          * 32-bit to permit the use of .auto, in order to force .u32 for flat
684          * varyings, to handle internal TGSI shaders that set flat in the VS
685          * but smooth in the FS */
686 
687         ASSERTED nir_alu_type T = nir_intrinsic_src_type(instr);
688         assert(nir_alu_type_get_type_size(T) == 32);
689         enum bi_register_format regfmt = BI_REGISTER_FORMAT_AUTO;
690 
691         unsigned imm_index = 0;
692         bool immediate = bi_is_intr_immediate(instr, &imm_index, 16);
693 
694         bi_index address;
695         if (immediate) {
696                 address = bi_lea_attr_imm(b,
697                                           bi_register(61), bi_register(62),
698                                           regfmt, imm_index);
699         } else {
700                 bi_index idx =
701                         bi_iadd_u32(b,
702                                     bi_src_index(nir_get_io_offset_src(instr)),
703                                     bi_imm_u32(nir_intrinsic_base(instr)),
704                                     false);
705                 address = bi_lea_attr(b,
706                                       bi_register(61), bi_register(62),
707                                       idx, regfmt);
708         }
709 
710         /* Only look at the total components needed. In effect, we fill in all
711          * the intermediate "holes" in the write mask, since we can't mask off
712          * stores. Since nir_lower_io_to_temporaries ensures each varying is
713          * written at most once, anything that's masked out is undefined, so it
714          * doesn't matter what we write there. So we may as well do the
715          * simplest thing possible. */
716         unsigned nr = util_last_bit(nir_intrinsic_write_mask(instr));
717         assert(nr > 0 && nr <= nir_intrinsic_src_components(instr, 0));
718 
719         bi_st_cvt(b, bi_src_index(&instr->src[0]), address,
720                         bi_word(address, 1), bi_word(address, 2),
721                         regfmt, nr - 1);
722 }
723 
724 static void
725 bi_emit_load_ubo(bi_builder *b, nir_intrinsic_instr *instr)
726 {
727         nir_src *offset = nir_get_io_offset_src(instr);
728 
729         bool offset_is_const = nir_src_is_const(*offset);
730         bi_index dyn_offset = bi_src_index(offset);
731         uint32_t const_offset = offset_is_const ? nir_src_as_uint(*offset) : 0;
732         bool kernel_input = (instr->intrinsic == nir_intrinsic_load_kernel_input);
733 
734         bi_load_to(b, instr->num_components * nir_dest_bit_size(instr->dest),
735                         bi_dest_index(&instr->dest), offset_is_const ?
736                         bi_imm_u32(const_offset) : dyn_offset,
737                         kernel_input ? bi_zero() : bi_src_index(&instr->src[0]),
738                         BI_SEG_UBO);
739 }
740 
741 static bi_index
742 bi_addr_high(nir_src *src)
743 {
744 	return (nir_src_bit_size(*src) == 64) ?
745 		bi_word(bi_src_index(src), 1) : bi_zero();
746 }
747 
748 static void
749 bi_emit_load(bi_builder *b, nir_intrinsic_instr *instr, enum bi_seg seg)
750 {
751         bi_load_to(b, instr->num_components * nir_dest_bit_size(instr->dest),
752                    bi_dest_index(&instr->dest),
753                    bi_src_index(&instr->src[0]), bi_addr_high(&instr->src[0]),
754                    seg);
755 }
756 
757 static void
758 bi_emit_store(bi_builder *b, nir_intrinsic_instr *instr, enum bi_seg seg)
759 {
760         /* Require contiguous masks, guaranteed by nir_lower_wrmasks */
761         assert(nir_intrinsic_write_mask(instr) ==
762                         BITFIELD_MASK(instr->num_components));
763 
764         bi_store(b, instr->num_components * nir_src_bit_size(instr->src[0]),
765                     bi_src_index(&instr->src[0]),
766                     bi_src_index(&instr->src[1]), bi_addr_high(&instr->src[1]),
767                     seg);
768 }
769 
770 /* Exchanges the staging register with memory */
771 
772 static void
773 bi_emit_axchg_to(bi_builder *b, bi_index dst, bi_index addr, nir_src *arg, enum bi_seg seg)
774 {
775         assert(seg == BI_SEG_NONE || seg == BI_SEG_WLS);
776 
777         unsigned sz = nir_src_bit_size(*arg);
778         assert(sz == 32 || sz == 64);
779 
780         bi_index data = bi_src_index(arg);
781 
782         bi_index data_words[] = {
783                 bi_word(data, 0),
784                 bi_word(data, 1),
785         };
786 
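        /* The atomic reads and writes its staging operand in place, so gather
         * the argument into a register-backed temporary instead of using the
         * SSA value directly */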
787         bi_index inout = bi_temp_reg(b->shader);
788         bi_make_vec_to(b, inout, data_words, NULL, sz / 32, 32);
789 
790         bi_axchg_to(b, sz, inout, inout,
791                         bi_word(addr, 0),
792                         (seg == BI_SEG_NONE) ? bi_word(addr, 1) : bi_zero(),
793                         seg);
794 
795         bi_index inout_words[] = {
796                 bi_word(inout, 0),
797                 bi_word(inout, 1),
798         };
799 
800         bi_make_vec_to(b, dst, inout_words, NULL, sz / 32, 32);
801 }
802 
803 /* Exchanges the second staging register with memory if comparison with first
804  * staging register passes */
805 
806 static void
807 bi_emit_acmpxchg_to(bi_builder *b, bi_index dst, bi_index addr, nir_src *arg_1, nir_src *arg_2, enum bi_seg seg)
808 {
809         assert(seg == BI_SEG_NONE || seg == BI_SEG_WLS);
810 
811         /* hardware is swapped from NIR */
812         bi_index src0 = bi_src_index(arg_2);
813         bi_index src1 = bi_src_index(arg_1);
814 
815         unsigned sz = nir_src_bit_size(*arg_1);
816         assert(sz == 32 || sz == 64);
817 
818         bi_index data_words[] = {
819                 bi_word(src0, 0),
820                 sz == 32 ? bi_word(src1, 0) : bi_word(src0, 1),
821 
822                 /* 64-bit */
823                 bi_word(src1, 0),
824                 bi_word(src1, 1),
825         };
826 
827         bi_index inout = bi_temp_reg(b->shader);
828         bi_make_vec_to(b, inout, data_words, NULL, 2 * (sz / 32), 32);
829 
830         bi_acmpxchg_to(b, sz, inout, inout,
831                         bi_word(addr, 0),
832                         (seg == BI_SEG_NONE) ? bi_word(addr, 1) : bi_zero(),
833                         seg);
834 
835         bi_index inout_words[] = {
836                 bi_word(inout, 0),
837                 bi_word(inout, 1),
838         };
839 
840         bi_make_vec_to(b, dst, inout_words, NULL, sz / 32, 32);
841 }
842 
843 /* Extracts an atomic opcode */
844 
845 static enum bi_atom_opc
846 bi_atom_opc_for_nir(nir_intrinsic_op op)
847 {
848         switch (op) {
849         case nir_intrinsic_global_atomic_add:
850         case nir_intrinsic_shared_atomic_add:
851         case nir_intrinsic_image_atomic_add:
852                 return BI_ATOM_OPC_AADD;
853 
854         case nir_intrinsic_global_atomic_imin:
855         case nir_intrinsic_shared_atomic_imin:
856         case nir_intrinsic_image_atomic_imin:
857                 return BI_ATOM_OPC_ASMIN;
858 
859         case nir_intrinsic_global_atomic_umin:
860         case nir_intrinsic_shared_atomic_umin:
861         case nir_intrinsic_image_atomic_umin:
862                 return BI_ATOM_OPC_AUMIN;
863 
864         case nir_intrinsic_global_atomic_imax:
865         case nir_intrinsic_shared_atomic_imax:
866         case nir_intrinsic_image_atomic_imax:
867                 return BI_ATOM_OPC_ASMAX;
868 
869         case nir_intrinsic_global_atomic_umax:
870         case nir_intrinsic_shared_atomic_umax:
871         case nir_intrinsic_image_atomic_umax:
872                 return BI_ATOM_OPC_AUMAX;
873 
874         case nir_intrinsic_global_atomic_and:
875         case nir_intrinsic_shared_atomic_and:
876         case nir_intrinsic_image_atomic_and:
877                 return BI_ATOM_OPC_AAND;
878 
879         case nir_intrinsic_global_atomic_or:
880         case nir_intrinsic_shared_atomic_or:
881         case nir_intrinsic_image_atomic_or:
882                 return BI_ATOM_OPC_AOR;
883 
884         case nir_intrinsic_global_atomic_xor:
885         case nir_intrinsic_shared_atomic_xor:
886         case nir_intrinsic_image_atomic_xor:
887                 return BI_ATOM_OPC_AXOR;
888 
889         default:
890                 unreachable("Unexpected computational atomic");
891         }
892 }
893 
894 /* Optimized unary atomics are available with an implied #1 argument */
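/* For example, an atomic add of constant +1 becomes AINC and an add of -1
 * becomes ADEC, dropping the explicit data argument. */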
895 
896 static bool
897 bi_promote_atom_c1(enum bi_atom_opc op, bi_index arg, enum bi_atom_opc *out)
898 {
899         /* Check we have a compatible constant */
900         if (arg.type != BI_INDEX_CONSTANT)
901                 return false;
902 
903         if (!(arg.value == 1 || (arg.value == -1 && op == BI_ATOM_OPC_AADD)))
904                 return false;
905 
906         /* Check for a compatible operation */
907         switch (op) {
908         case BI_ATOM_OPC_AADD:
909                 *out = (arg.value == 1) ? BI_ATOM_OPC_AINC : BI_ATOM_OPC_ADEC;
910                 return true;
911         case BI_ATOM_OPC_ASMAX:
912                 *out = BI_ATOM_OPC_ASMAX1;
913                 return true;
914         case BI_ATOM_OPC_AUMAX:
915                 *out = BI_ATOM_OPC_AUMAX1;
916                 return true;
917         case BI_ATOM_OPC_AOR:
918                 *out = BI_ATOM_OPC_AOR1;
919                 return true;
920         default:
921                 return false;
922         }
923 }
924 
925 /* Coordinates are 16-bit integers in Bifrost but 32-bit in NIR */
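/* For a 2D array image, for example, the first source packs (x, y) into a
 * v2i16 and the second source carries the array layer; a 1D non-array image
 * passes x through as-is and zero for the second source. */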
926 
927 static bi_index
928 bi_emit_image_coord(bi_builder *b, bi_index coord, unsigned src_idx,
929                     unsigned coord_comps, bool is_array)
930 {
931         assert(coord_comps > 0 && coord_comps <= 3);
932 
933         if (src_idx == 0) {
934                 if (coord_comps == 1 || (coord_comps == 2 && is_array))
935                         return bi_word(coord, 0);
936                 else
937                         return bi_mkvec_v2i16(b,
938                                               bi_half(bi_word(coord, 0), false),
939                                               bi_half(bi_word(coord, 1), false));
940         } else {
941                 if (coord_comps == 3)
942                         return bi_word(coord, 2);
943                 else if (coord_comps == 2 && is_array)
944                         return bi_word(coord, 1);
945                 else
946                         return bi_zero();
947         }
948 }
949 
950 static bi_index
951 bi_emit_image_index(bi_builder *b, nir_intrinsic_instr *instr)
952 {
953         nir_src src = instr->src[0];
954         bi_index index = bi_src_index(&src);
955         bi_context *ctx = b->shader;
956 
957         /* Images come after vertex attributes, so handle an explicit offset */
958         unsigned offset = (ctx->stage == MESA_SHADER_VERTEX) ?
959                 util_bitcount64(ctx->nir->info.inputs_read) : 0;
960 
961         if (offset == 0)
962                 return index;
963         else if (nir_src_is_const(src))
964                 return bi_imm_u32(nir_src_as_uint(src) + offset);
965         else
966                 return bi_iadd_u32(b, index, bi_imm_u32(offset), false);
967 }
968 
969 static void
970 bi_emit_image_load(bi_builder *b, nir_intrinsic_instr *instr)
971 {
972         ASSERTED enum glsl_sampler_dim dim = nir_intrinsic_image_dim(instr);
973         unsigned coord_comps = nir_image_intrinsic_coord_components(instr);
974         bool array = nir_intrinsic_image_array(instr);
976 
977         bi_index coords = bi_src_index(&instr->src[1]);
978         /* TODO: MSAA */
979         assert(dim != GLSL_SAMPLER_DIM_MS && "MSAA'd images not supported");
980 
981         bi_ld_attr_tex_to(b, bi_dest_index(&instr->dest),
982                           bi_emit_image_coord(b, coords, 0, coord_comps, array),
983                           bi_emit_image_coord(b, coords, 1, coord_comps, array),
984                           bi_emit_image_index(b, instr),
985                           bi_reg_fmt_for_nir(nir_intrinsic_dest_type(instr)),
986                           instr->num_components - 1);
987 }
988 
989 static bi_index
990 bi_emit_lea_image(bi_builder *b, nir_intrinsic_instr *instr)
991 {
992         ASSERTED enum glsl_sampler_dim dim = nir_intrinsic_image_dim(instr);
993         bool array = nir_intrinsic_image_array(instr);
995         unsigned coord_comps = nir_image_intrinsic_coord_components(instr);
996 
997         /* TODO: MSAA */
998         assert(dim != GLSL_SAMPLER_DIM_MS && "MSAA'd images not supported");
999 
1000         enum bi_register_format type = (instr->intrinsic == nir_intrinsic_image_store) ?
1001                 bi_reg_fmt_for_nir(nir_intrinsic_src_type(instr)) :
1002                 BI_REGISTER_FORMAT_AUTO;
1003 
1004         bi_index coords = bi_src_index(&instr->src[1]);
1005         bi_index xy = bi_emit_image_coord(b, coords, 0, coord_comps, array);
1006         bi_index zw = bi_emit_image_coord(b, coords, 1, coord_comps, array);
1007 
1008         bi_instr *I = bi_lea_attr_tex_to(b, bi_temp(b->shader), xy, zw,
1009                         bi_emit_image_index(b, instr), type);
1010 
1011         /* LEA_ATTR_TEX defaults to the secondary attribute table, but our ABI
1012          * has all images in the primary attribute table */
1013         I->table = BI_TABLE_ATTRIBUTE_1;
1014 
1015         return I->dest[0];
1016 }
1017 
1018 static void
1019 bi_emit_image_store(bi_builder *b, nir_intrinsic_instr *instr)
1020 {
1021         bi_index addr = bi_emit_lea_image(b, instr);
1022 
1023         bi_st_cvt(b, bi_src_index(&instr->src[3]),
1024                      addr, bi_word(addr, 1), bi_word(addr, 2),
1025                      bi_reg_fmt_for_nir(nir_intrinsic_src_type(instr)),
1026                      instr->num_components - 1);
1027 }
1028 
1029 static void
1030 bi_emit_atomic_i32_to(bi_builder *b, bi_index dst,
1031                 bi_index addr, bi_index arg, nir_intrinsic_op intrinsic)
1032 {
1033         /* ATOM_C.i32 takes a vector with {arg, coalesced}, ATOM_C1.i32 doesn't
1034          * take any vector but can still output in RETURN mode */
1035         bi_index sr = bi_temp_reg(b->shader);
1036 
1037         enum bi_atom_opc opc = bi_atom_opc_for_nir(intrinsic);
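        /* Keep the unpromoted opcode around: ATOM_POST must see the original
         * operation even if the atomic itself is promoted to a unary form
         * below */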
1038         enum bi_atom_opc post_opc = opc;
1039 
1040         /* Generate either ATOM_C or ATOM_C1 as required */
1041         if (bi_promote_atom_c1(opc, arg, &opc)) {
1042                 bi_patom_c1_i32_to(b, sr, bi_word(addr, 0),
1043                                 bi_word(addr, 1), opc, 2);
1044         } else {
1045                 bi_mov_i32_to(b, sr, arg);
1046                 bi_patom_c_i32_to(b, sr, sr, bi_word(addr, 0),
1047                                 bi_word(addr, 1), opc, 2);
1048         }
1049 
1050         /* Post-process it */
1051         bi_atom_post_i32_to(b, dst, bi_word(sr, 0), bi_word(sr, 1), post_opc);
1052 }
1053 
1054 /* gl_FragCoord.xy = u16_to_f32(R59.xy) + 0.5
1055  * gl_FragCoord.z = ld_vary(fragz)
1056  * gl_FragCoord.w = ld_vary(fragw)
1057  */
1058 
1059 static void
1060 bi_emit_load_frag_coord(bi_builder *b, nir_intrinsic_instr *instr)
1061 {
1062         bi_index src[4] = {};
1063 
1064         for (unsigned i = 0; i < 2; ++i) {
1065                 src[i] = bi_fadd_f32(b,
1066                                 bi_u16_to_f32(b, bi_half(bi_register(59), i)),
1067                                 bi_imm_f32(0.5f), BI_ROUND_NONE);
1068         }
1069 
1070         for (unsigned i = 0; i < 2; ++i) {
1071                 src[2 + i] = bi_ld_var_special(b, bi_zero(),
1072                                 BI_REGISTER_FORMAT_F32, BI_SAMPLE_CENTER,
1073                                 BI_UPDATE_CLOBBER,
1074                                 (i == 0) ? BI_VARYING_NAME_FRAG_Z :
1075                                         BI_VARYING_NAME_FRAG_W,
1076                                 BI_VECSIZE_NONE);
1077         }
1078 
1079         bi_make_vec_to(b, bi_dest_index(&instr->dest), src, NULL, 4, 32);
1080 }
1081 
1082 static void
1083 bi_emit_ld_tile(bi_builder *b, nir_intrinsic_instr *instr)
1084 {
1085         unsigned rt = b->shader->inputs->blend.rt;
1086         unsigned size = nir_dest_bit_size(instr->dest);
1087 
1088         /* Get the render target */
1089         if (!b->shader->inputs->is_blend) {
1090                 const nir_variable *var =
1091                         nir_find_variable_with_driver_location(b->shader->nir,
1092                                         nir_var_shader_out, nir_intrinsic_base(instr));
1093                 unsigned loc = var->data.location;
1094                 assert(loc >= FRAG_RESULT_DATA0);
1095                 rt = (loc - FRAG_RESULT_DATA0);
1096         }
1097 
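
        /* The conversion descriptor either comes from the blend shader
         * inputs, is a per-RT constant known at compile time, or must be
         * looked up from a sysval at draw time */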
1098         bi_index desc = b->shader->inputs->is_blend ?
1099                 bi_imm_u32(b->shader->inputs->blend.bifrost_blend_desc >> 32) :
1100                 b->shader->inputs->bifrost.static_rt_conv ?
1101                 bi_imm_u32(b->shader->inputs->bifrost.rt_conv[rt]) :
1102                 bi_load_sysval(b, PAN_SYSVAL(RT_CONVERSION, rt | (size << 4)), 1, 0);
1103 
1104         bi_ld_tile_to(b, bi_dest_index(&instr->dest), bi_pixel_indices(b, rt),
1105                         bi_register(60), desc, (instr->num_components - 1));
1106 }
1107 
1108 static void
1109 bi_emit_intrinsic(bi_builder *b, nir_intrinsic_instr *instr)
1110 {
1111         bi_index dst = nir_intrinsic_infos[instr->intrinsic].has_dest ?
1112                 bi_dest_index(&instr->dest) : bi_null();
1113         gl_shader_stage stage = b->shader->stage;
1114 
1115         switch (instr->intrinsic) {
1116         case nir_intrinsic_load_barycentric_pixel:
1117         case nir_intrinsic_load_barycentric_centroid:
1118         case nir_intrinsic_load_barycentric_sample:
1119         case nir_intrinsic_load_barycentric_at_sample:
1120         case nir_intrinsic_load_barycentric_at_offset:
1121                 /* handled later via load_vary */
1122                 break;
1123         case nir_intrinsic_load_interpolated_input:
1124         case nir_intrinsic_load_input:
1125                 if (b->shader->inputs->is_blend)
1126                         bi_emit_load_blend_input(b, instr);
1127                 else if (stage == MESA_SHADER_FRAGMENT)
1128                         bi_emit_load_vary(b, instr);
1129                 else if (stage == MESA_SHADER_VERTEX)
1130                         bi_emit_load_attr(b, instr);
1131                 else
1132                         unreachable("Unsupported shader stage");
1133                 break;
1134 
1135         case nir_intrinsic_store_output:
1136                 if (stage == MESA_SHADER_FRAGMENT)
1137                         bi_emit_fragment_out(b, instr);
1138                 else if (stage == MESA_SHADER_VERTEX)
1139                         bi_emit_store_vary(b, instr);
1140                 else
1141                         unreachable("Unsupported shader stage");
1142                 break;
1143 
1144         case nir_intrinsic_store_combined_output_pan:
1145                 assert(stage == MESA_SHADER_FRAGMENT);
1146                 bi_emit_fragment_out(b, instr);
1147                 break;
1148 
1149         case nir_intrinsic_load_ubo:
1150         case nir_intrinsic_load_kernel_input:
1151                 bi_emit_load_ubo(b, instr);
1152                 break;
1153 
1154         case nir_intrinsic_load_global:
1155         case nir_intrinsic_load_global_constant:
1156                 bi_emit_load(b, instr, BI_SEG_NONE);
1157                 break;
1158 
1159         case nir_intrinsic_store_global:
1160                 bi_emit_store(b, instr, BI_SEG_NONE);
1161                 break;
1162 
1163         case nir_intrinsic_load_scratch:
1164                 bi_emit_load(b, instr, BI_SEG_TL);
1165                 break;
1166 
1167         case nir_intrinsic_store_scratch:
1168                 bi_emit_store(b, instr, BI_SEG_TL);
1169                 break;
1170 
1171         case nir_intrinsic_load_shared:
1172                 bi_emit_load(b, instr, BI_SEG_WLS);
1173                 break;
1174 
1175         case nir_intrinsic_store_shared:
1176                 bi_emit_store(b, instr, BI_SEG_WLS);
1177                 break;
1178 
1179         /* Blob doesn't seem to do anything for memory barriers, note +BARRIER
1180          * is illegal in fragment shaders */
1181         case nir_intrinsic_memory_barrier:
1182         case nir_intrinsic_memory_barrier_buffer:
1183         case nir_intrinsic_memory_barrier_image:
1184         case nir_intrinsic_memory_barrier_shared:
1185         case nir_intrinsic_group_memory_barrier:
1186                 break;
1187 
1188         case nir_intrinsic_control_barrier:
1189                 assert(b->shader->stage != MESA_SHADER_FRAGMENT);
1190                 bi_barrier(b);
1191                 break;
1192 
1193         case nir_intrinsic_shared_atomic_add:
1194         case nir_intrinsic_shared_atomic_imin:
1195         case nir_intrinsic_shared_atomic_umin:
1196         case nir_intrinsic_shared_atomic_imax:
1197         case nir_intrinsic_shared_atomic_umax:
1198         case nir_intrinsic_shared_atomic_and:
1199         case nir_intrinsic_shared_atomic_or:
1200         case nir_intrinsic_shared_atomic_xor: {
1201                 assert(nir_src_bit_size(instr->src[1]) == 32);
1202 
1203                 bi_index addr = bi_seg_add_i64(b, bi_src_index(&instr->src[0]),
1204                                 bi_zero(), false, BI_SEG_WLS);
1205 
1206                 bi_emit_atomic_i32_to(b, dst, addr, bi_src_index(&instr->src[1]),
1207                                 instr->intrinsic);
1208                 break;
1209         }
1210 
1211         case nir_intrinsic_image_atomic_add:
1212         case nir_intrinsic_image_atomic_imin:
1213         case nir_intrinsic_image_atomic_umin:
1214         case nir_intrinsic_image_atomic_imax:
1215         case nir_intrinsic_image_atomic_umax:
1216         case nir_intrinsic_image_atomic_and:
1217         case nir_intrinsic_image_atomic_or:
1218         case nir_intrinsic_image_atomic_xor:
1219                 assert(nir_src_bit_size(instr->src[3]) == 32);
1220 
1221                 bi_emit_atomic_i32_to(b, dst,
1222                                 bi_emit_lea_image(b, instr),
1223                                 bi_src_index(&instr->src[3]),
1224                                 instr->intrinsic);
1225                 break;
1226 
1227         case nir_intrinsic_global_atomic_add:
1228         case nir_intrinsic_global_atomic_imin:
1229         case nir_intrinsic_global_atomic_umin:
1230         case nir_intrinsic_global_atomic_imax:
1231         case nir_intrinsic_global_atomic_umax:
1232         case nir_intrinsic_global_atomic_and:
1233         case nir_intrinsic_global_atomic_or:
1234         case nir_intrinsic_global_atomic_xor:
1235                 assert(nir_src_bit_size(instr->src[1]) == 32);
1236 
1237                 bi_emit_atomic_i32_to(b, dst,
1238                                 bi_src_index(&instr->src[0]),
1239                                 bi_src_index(&instr->src[1]),
1240                                 instr->intrinsic);
1241                 break;
1242 
1243         case nir_intrinsic_image_load:
1244                 bi_emit_image_load(b, instr);
1245                 break;
1246 
1247         case nir_intrinsic_image_store:
1248                 bi_emit_image_store(b, instr);
1249                 break;
1250 
1251         case nir_intrinsic_global_atomic_exchange:
1252                 bi_emit_axchg_to(b, dst, bi_src_index(&instr->src[0]),
1253                                 &instr->src[1], BI_SEG_NONE);
1254                 break;
1255 
1256         case nir_intrinsic_image_atomic_exchange:
1257                 bi_emit_axchg_to(b, dst, bi_emit_lea_image(b, instr),
1258                                 &instr->src[3], BI_SEG_NONE);
1259                 break;
1260 
1261         case nir_intrinsic_shared_atomic_exchange:
1262                 bi_emit_axchg_to(b, dst, bi_src_index(&instr->src[0]),
1263                                 &instr->src[1], BI_SEG_WLS);
1264                 break;
1265 
1266         case nir_intrinsic_global_atomic_comp_swap:
1267                 bi_emit_acmpxchg_to(b, dst, bi_src_index(&instr->src[0]),
1268                                 &instr->src[1], &instr->src[2], BI_SEG_NONE);
1269                 break;
1270 
1271         case nir_intrinsic_image_atomic_comp_swap:
1272                 bi_emit_acmpxchg_to(b, dst, bi_emit_lea_image(b, instr),
1273                                 &instr->src[3], &instr->src[4], BI_SEG_NONE);
1274                 break;
1275 
1276         case nir_intrinsic_shared_atomic_comp_swap:
1277                 bi_emit_acmpxchg_to(b, dst, bi_src_index(&instr->src[0]),
1278                                 &instr->src[1], &instr->src[2], BI_SEG_WLS);
1279                 break;
1280 
1281         case nir_intrinsic_load_frag_coord:
1282                 bi_emit_load_frag_coord(b, instr);
1283                 break;
1284 
1285         case nir_intrinsic_load_output:
1286                 bi_emit_ld_tile(b, instr);
1287                 break;
1288 
1289         case nir_intrinsic_discard_if: {
1290                 bi_index src = bi_src_index(&instr->src[0]);
1291                 assert(nir_src_bit_size(instr->src[0]) == 1);
1292                 bi_discard_b32(b, bi_half(src, false));
1293                 break;
1294         }
1295 
1296         case nir_intrinsic_discard:
1297                 bi_discard_f32(b, bi_zero(), bi_zero(), BI_CMPF_EQ);
1298                 break;
1299 
1300         case nir_intrinsic_load_ssbo_address:
1301                 bi_load_sysval_nir(b, instr, 2, 0);
1302                 break;
1303 
1304         case nir_intrinsic_load_work_dim:
1305                 bi_load_sysval_nir(b, instr, 1, 0);
1306                 break;
1307 
1308         case nir_intrinsic_load_first_vertex:
1309                 bi_load_sysval_nir(b, instr, 1, 0);
1310                 break;
1311 
1312         case nir_intrinsic_load_base_vertex:
1313                 bi_load_sysval_nir(b, instr, 1, 4);
1314                 break;
1315 
1316         case nir_intrinsic_load_base_instance:
1317                 bi_load_sysval_nir(b, instr, 1, 8);
1318                 break;
1319 
1320         case nir_intrinsic_load_draw_id:
1321                 bi_load_sysval_nir(b, instr, 1, 0);
1322                 break;
1323 
1324         case nir_intrinsic_get_ssbo_size:
1325                 bi_load_sysval_nir(b, instr, 1, 8);
1326                 break;
1327 
1328         case nir_intrinsic_load_viewport_scale:
1329         case nir_intrinsic_load_viewport_offset:
1330         case nir_intrinsic_load_num_workgroups:
1331         case nir_intrinsic_load_workgroup_size:
1332                 bi_load_sysval_nir(b, instr, 3, 0);
1333                 break;
1334 
1335         case nir_intrinsic_image_size:
1336                 bi_load_sysval_nir(b, instr,
1337                                 nir_dest_num_components(instr->dest), 0);
1338                 break;
1339 
1340         case nir_intrinsic_load_blend_const_color_rgba:
1341                 bi_load_sysval_nir(b, instr,
1342                                    nir_dest_num_components(instr->dest), 0);
1343                 break;
1344 
1345         case nir_intrinsic_load_sample_positions_pan:
1346                 bi_mov_i32_to(b, bi_word(dst, 0),
1347                                 bi_fau(BIR_FAU_SAMPLE_POS_ARRAY, false));
1348                 bi_mov_i32_to(b, bi_word(dst, 1),
1349                                 bi_fau(BIR_FAU_SAMPLE_POS_ARRAY, true));
1350                 break;
1351 
1352         case nir_intrinsic_load_sample_mask_in:
1353                 /* r61[0:15] contains the coverage bitmap */
1354                 bi_u16_to_u32_to(b, dst, bi_half(bi_register(61), false));
1355                 break;
1356 
1357         case nir_intrinsic_load_sample_id:
1358                 bi_load_sample_id_to(b, dst);
1359                 break;
1360 
1361         case nir_intrinsic_load_front_face:
1362                 /* r58 == 0 means primitive is front facing */
1363                 bi_icmp_i32_to(b, dst, bi_register(58), bi_zero(), BI_CMPF_EQ,
1364                                 BI_RESULT_TYPE_M1);
1365                 break;
1366 
1367         case nir_intrinsic_load_point_coord:
1368                 bi_ld_var_special_to(b, dst, bi_zero(), BI_REGISTER_FORMAT_F32,
1369                                 BI_SAMPLE_CENTER, BI_UPDATE_CLOBBER,
1370                                 BI_VARYING_NAME_POINT, BI_VECSIZE_V2);
1371                 break;
1372 
1373         case nir_intrinsic_load_vertex_id_zero_base:
1374                 bi_mov_i32_to(b, dst, bi_register(61));
1375                 break;
1376 
1377         case nir_intrinsic_load_instance_id:
1378                 bi_mov_i32_to(b, dst, bi_register(62));
1379                 break;
1380 
1381         case nir_intrinsic_load_subgroup_invocation:
1382                 bi_mov_i32_to(b, dst, bi_fau(BIR_FAU_LANE_ID, false));
1383                 break;
1384 
1385         case nir_intrinsic_load_local_invocation_id:
1386                 for (unsigned i = 0; i < 3; ++i)
1387                         bi_u16_to_u32_to(b, bi_word(dst, i),
1388                                          bi_half(bi_register(55 + i / 2), i % 2));
1389                 break;
1390 
1391         case nir_intrinsic_load_workgroup_id:
1392                 for (unsigned i = 0; i < 3; ++i)
1393                         bi_mov_i32_to(b, bi_word(dst, i), bi_register(57 + i));
1394                 break;
1395 
1396         case nir_intrinsic_load_global_invocation_id:
1397         case nir_intrinsic_load_global_invocation_id_zero_base:
1398                 for (unsigned i = 0; i < 3; ++i)
1399                         bi_mov_i32_to(b, bi_word(dst, i), bi_register(60 + i));
1400                 break;
1401 
1402         case nir_intrinsic_shader_clock:
1403                 bi_ld_gclk_u64_to(b, dst, BI_SOURCE_CYCLE_COUNTER);
1404                 break;
1405 
1406         default:
1407                 fprintf(stderr, "Unhandled intrinsic %s\n", nir_intrinsic_infos[instr->intrinsic].name);
1408                 assert(0);
1409         }
1410 }
1411 
1412 static void
1413 bi_emit_load_const(bi_builder *b, nir_load_const_instr *instr)
1414 {
1415         /* Make sure we've been lowered */
1416         assert(instr->def.num_components <= (32 / instr->def.bit_size));
1417 
1418         /* Accumulate all the channels of the constant, as if we did an
1419          * implicit SEL over them */
1420         uint32_t acc = 0;
1421 
1422         for (unsigned i = 0; i < instr->def.num_components; ++i) {
1423                 uint32_t v = nir_const_value_as_uint(instr->value[i], instr->def.bit_size);
1424 
1425                 v = bi_extend_constant(v, instr->def.bit_size);
1426                 acc |= (v << (i * instr->def.bit_size));
1427         }
1428 
1429         bi_mov_i32_to(b, bi_get_index(instr->def.index, false, 0), bi_imm_u32(acc));
1430 }
1431 
1432 static bi_index
1433 bi_alu_src_index(nir_alu_src src, unsigned comps)
1434 {
1435         /* we don't lower modifiers until the backend */
1436         assert(!(src.negate || src.abs));
1437 
1438         unsigned bitsize = nir_src_bit_size(src.src);
1439 
1440         /* TODO: Do we need to do something more clever with 1-bit bools? */
1441         if (bitsize == 1)
1442                 bitsize = 16;
1443 
1444         /* The bi_index carries the 32-bit (word) offset separately from the
1445          * subword swizzle; handle the offset first */
1446 
1447         unsigned offset = 0;
1448 
1449         assert(bitsize == 8 || bitsize == 16 || bitsize == 32);
1450         unsigned subword_shift = (bitsize == 32) ? 0 : (bitsize == 16) ? 1 : 2;
1451 
1452         for (unsigned i = 0; i < comps; ++i) {
1453                 unsigned new_offset = (src.swizzle[i] >> subword_shift);
1454 
1455                 if (i > 0)
1456                         assert(offset == new_offset && "wrong vectorization");
1457 
1458                 offset = new_offset;
1459         }
1460 
1461         bi_index idx = bi_word(bi_src_index(&src.src), offset);
1462 
1463         /* Compose the subword swizzle with existing (identity) swizzle */
1464         assert(idx.swizzle == BI_SWIZZLE_H01);
1465 
1466         /* Bigger vectors should have been lowered */
1467         assert(comps <= (1 << subword_shift));
1468 
1469         if (bitsize == 16) {
1470                 unsigned c0 = src.swizzle[0] & 1;
1471                 unsigned c1 = (comps > 1) ? src.swizzle[1] & 1 : c0;
1472                 idx.swizzle = BI_SWIZZLE_H00 + c1 + (c0 << 1);
1473         } else if (bitsize == 8) {
1474                 /* 8-bit vectors not yet supported */
1475                 assert(comps == 1 && "8-bit vectors not supported");
1476                 assert(src.swizzle[0] < 4 && "8-bit vectors not supported");
1477                 idx.swizzle = BI_SWIZZLE_B0000 + src.swizzle[0];
1478         }
1479 
1480         return idx;
1481 }
1482 
1483 static enum bi_round
1484 bi_nir_round(nir_op op)
1485 {
1486         switch (op) {
1487         case nir_op_fround_even: return BI_ROUND_NONE;
1488         case nir_op_ftrunc: return BI_ROUND_RTZ;
1489         case nir_op_fceil: return BI_ROUND_RTP;
1490         case nir_op_ffloor: return BI_ROUND_RTN;
1491         default: unreachable("invalid nir round op");
1492         }
1493 }
1494 
1495 /* Convenience for lowered transcendentals */
1496 
1497 static bi_index
1498 bi_fmul_f32(bi_builder *b, bi_index s0, bi_index s1)
1499 {
1500         return bi_fma_f32(b, s0, s1, bi_imm_f32(-0.0f), BI_ROUND_NONE);
1501 }
1502 
1503 /* Approximate with FRCP_APPROX.f32 and apply a single iteration of
1504  * Newton-Raphson to improve precision */
1505 
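/* One Newton-Raphson step for y = 1/a is x' = x * (2 - a * x) =
 * x + x * (1 - a * x). Reading the code below: s0 is split by FREXP into a
 * mantissa m and an exponent, t1 computes (1 - m * x1), and the final FMA
 * forms x1 + t1 * x1 rescaled by the exponent, which corresponds to that
 * refinement step. */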
1506 static void
1507 bi_lower_frcp_32(bi_builder *b, bi_index dst, bi_index s0)
1508 {
1509         bi_index x1 = bi_frcp_approx_f32(b, s0);
1510         bi_index m  = bi_frexpm_f32(b, s0, false, false);
1511         bi_index e  = bi_frexpe_f32(b, bi_neg(s0), false, false);
1512         bi_index t1 = bi_fma_rscale_f32(b, m, bi_neg(x1), bi_imm_f32(1.0),
1513                         bi_zero(), BI_ROUND_NONE, BI_SPECIAL_N);
1514         bi_fma_rscale_f32_to(b, dst, t1, x1, x1, e,
1515                         BI_ROUND_NONE, BI_SPECIAL_NONE);
1516 }
1517 
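/* Similarly for y = 1/sqrt(a), one Newton-Raphson step is
 * x' = x * (3 - a * x^2) / 2 = x + x * (1 - a * x^2) / 2. Below, t2 is
 * (1 - m * x1^2) scaled by 2^-1 via the rscale of -1, and the final FMA
 * forms x1 + t2 * x1, again rescaled by the exponent. */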
1518 static void
1519 bi_lower_frsq_32(bi_builder *b, bi_index dst, bi_index s0)
1520 {
1521         bi_index x1 = bi_frsq_approx_f32(b, s0);
1522         bi_index m  = bi_frexpm_f32(b, s0, false, true);
1523         bi_index e  = bi_frexpe_f32(b, bi_neg(s0), false, true);
1524         bi_index t1 = bi_fmul_f32(b, x1, x1);
1525         bi_index t2 = bi_fma_rscale_f32(b, m, bi_neg(t1), bi_imm_f32(1.0),
1526                         bi_imm_u32(-1), BI_ROUND_NONE, BI_SPECIAL_N);
1527         bi_fma_rscale_f32_to(b, dst, t2, x1, x1, e,
1528                         BI_ROUND_NONE, BI_SPECIAL_N);
1529 }
1530 
1531 /* More complex transcendentals, see
1532  * https://gitlab.freedesktop.org/panfrost/mali-isa-docs/-/blob/master/Bifrost.adoc
1533  * for documentation */
1534 
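/* A rough reading of the fexp2 lowering below: 0x49400000 is 1.5 * 2^19 =
 * 786432.0f, a magnitude at which the float spacing is exactly 1/16. Adding
 * it as a bias rounds the input to a multiple of 1/16 held in the low
 * mantissa bits of t1; the integer subtract and arithmetic shift recover the
 * integer part (a1i) while the low bits feed FEXP_TABLE.u4 (a1t), and a2 is
 * the remaining residual. The constants 0x3f317218, 0x3e75fffa and
 * 0x3d635635 are approximately ln(2), ln(2)^2/2 and ln(2)^3/6, so p3
 * evaluates the Taylor series of 2^a2 - 1, folded back in by the final FMA. */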
1535 static void
1536 bi_lower_fexp2_32(bi_builder *b, bi_index dst, bi_index s0)
1537 {
1538         bi_index t1 = bi_temp(b->shader);
1539         bi_instr *t1_instr = bi_fadd_f32_to(b, t1,
1540                         s0, bi_imm_u32(0x49400000), BI_ROUND_NONE);
1541         t1_instr->clamp = BI_CLAMP_CLAMP_0_INF;
1542 
1543         bi_index t2 = bi_fadd_f32(b, t1, bi_imm_u32(0xc9400000), BI_ROUND_NONE);
1544 
1545         bi_instr *a2 = bi_fadd_f32_to(b, bi_temp(b->shader),
1546                         s0, bi_neg(t2), BI_ROUND_NONE);
1547         a2->clamp = BI_CLAMP_CLAMP_M1_1;
1548 
1549         bi_index a1t = bi_fexp_table_u4(b, t1, BI_ADJ_NONE);
1550         bi_index t3 = bi_isub_u32(b, t1, bi_imm_u32(0x49400000), false);
1551         bi_index a1i = bi_arshift_i32(b, t3, bi_null(), bi_imm_u8(4));
1552         bi_index p1 = bi_fma_f32(b, a2->dest[0], bi_imm_u32(0x3d635635),
1553                         bi_imm_u32(0x3e75fffa), BI_ROUND_NONE);
1554         bi_index p2 = bi_fma_f32(b, p1, a2->dest[0],
1555                         bi_imm_u32(0x3f317218), BI_ROUND_NONE);
1556         bi_index p3 = bi_fmul_f32(b, a2->dest[0], p2);
1557         bi_instr *x = bi_fma_rscale_f32_to(b, bi_temp(b->shader),
1558                         p3, a1t, a1t, a1i, BI_ROUND_NONE, BI_SPECIAL_NONE);
1559         x->clamp = BI_CLAMP_CLAMP_0_INF;
1560 
1561         bi_instr *max = bi_fmax_f32_to(b, dst, x->dest[0], s0);
1562         max->sem = BI_SEM_NAN_PROPAGATE;
1563 }
1564 
1565 static void
1566 bi_fexp_32(bi_builder *b, bi_index dst, bi_index s0, bi_index log2_base)
1567 {
1568         /* Scale by the base, multiply by 2^24 and convert to integer to get
1569          * an 8:24 fixed-point input */
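        /* e.g. s0 = 2.5 with log2_base = 1.0 gives scale = 2.5 * 2^24, so
         * fixed_pt = 0x02800000: integer part 2 in the top 8 bits and the
         * fraction 0.5 as 0x800000 in the low 24 bits */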
1570         bi_index scale = bi_fma_rscale_f32(b, s0, log2_base, bi_negzero(),
1571                         bi_imm_u32(24), BI_ROUND_NONE, BI_SPECIAL_NONE);
1572         bi_index fixed_pt = bi_f32_to_s32(b, scale, BI_ROUND_NONE);
1573 
1574         /* Compute the result for the fixed-point input, but pass along
1575          * the floating-point scale for correct NaN propagation */
1576         bi_fexp_f32_to(b, dst, fixed_pt, scale);
1577 }
1578 
1579 static void
1580 bi_lower_flog2_32(bi_builder *b, bi_index dst, bi_index s0)
1581 {
1582         /* s0 = a1 * 2^e, with a1 in [0.75, 1.5) */
1583         bi_index a1 = bi_frexpm_f32(b, s0, true, false);
1584         bi_index ei = bi_frexpe_f32(b, s0, true, false);
1585         bi_index ef = bi_s32_to_f32(b, ei, BI_ROUND_RTZ);
1586 
1587         /* xt estimates -log(r1), a coarse approximation of log(a1) */
1588         bi_index r1 = bi_flog_table_f32(b, s0, BI_MODE_RED, BI_PRECISION_NONE);
1589         bi_index xt = bi_flog_table_f32(b, s0, BI_MODE_BASE2, BI_PRECISION_NONE);
1590 
1591         /* log(s0) = log(a1 * 2^e) = e + log(a1) = e + log(a1 * r1) -
1592          * log(r1), so let x1 = e - log(r1) ~= e + xt and x2 = log(a1 * r1),
1593          * and then log(s0) = x1 + x2 */
1594         bi_index x1 = bi_fadd_f32(b, ef, xt, BI_ROUND_NONE);
1595 
1596         /* Since a1 * r1 is close to 1, x2 = log(a1 * r1) may be computed by
1597          * polynomial approximation around 1. The series is expressed around
1598          * 1, so set y = (a1 * r1) - 1.0 */
1599         bi_index y = bi_fma_f32(b, a1, r1, bi_imm_f32(-1.0), BI_ROUND_NONE);
1600 
1601         /* x2 = log_2(1 + y) = log_e(1 + y) * (1/log_e(2)), so approximate
1602          * log_e(1 + y) by the Taylor series (lower precision than the blob):
1603          * y - y^2/2 + O(y^3) = y(1 - y/2) + O(y^3) */
1604         bi_index loge = bi_fmul_f32(b, y,
1605                 bi_fma_f32(b, y, bi_imm_f32(-0.5), bi_imm_f32(1.0), BI_ROUND_NONE));
1606 
1607         bi_index x2 = bi_fmul_f32(b, loge, bi_imm_f32(1.0 / logf(2.0)));
1608 
1609         /* log(s0) = x1 + x2 */
1610         bi_fadd_f32_to(b, dst, x1, x2, BI_ROUND_NONE);
1611 }
1612 
1613 static void
1614 bi_flog2_32(bi_builder *b, bi_index dst, bi_index s0)
1615 {
1616         bi_index frexp = bi_frexpe_f32(b, s0, true, false);
1617         bi_index frexpi = bi_s32_to_f32(b, frexp, BI_ROUND_RTZ);
1618         bi_index add = bi_fadd_lscale_f32(b, bi_imm_f32(-1.0f), s0);
1619         bi_fma_f32_to(b, dst, bi_flogd_f32(b, s0), add, frexpi,
1620                         BI_ROUND_NONE);
1621 }
1622 
1623 static void
1624 bi_lower_fpow_32(bi_builder *b, bi_index dst, bi_index base, bi_index exp)
1625 {
1626         bi_index log2_base = bi_null();
1627 
1628         if (base.type == BI_INDEX_CONSTANT) {
1629                 log2_base = bi_imm_f32(log2f(uif(base.value)));
1630         } else {
1631                 log2_base = bi_temp(b->shader);
1632                 bi_lower_flog2_32(b, log2_base, base);
1633         }
1634 
1635         return bi_lower_fexp2_32(b, dst, bi_fmul_f32(b, exp, log2_base));
1636 }
1637 
1638 static void
1639 bi_fpow_32(bi_builder *b, bi_index dst, bi_index base, bi_index exp)
1640 {
1641         bi_index log2_base = bi_null();
1642 
1643         if (base.type == BI_INDEX_CONSTANT) {
1644                 log2_base = bi_imm_f32(log2f(uif(base.value)));
1645         } else {
1646                 log2_base = bi_temp(b->shader);
1647                 bi_flog2_32(b, log2_base, base);
1648         }
1649 
1650         return bi_fexp_32(b, dst, exp, log2_base);
1651 }
1652 
1653 /* Bifrost has extremely coarse tables for approximating sin/cos, accessible as
1654  * FSIN/COS_TABLE.u6, which multiply the bottom 6 bits of the input by pi/32
1655  * and evaluate sin/cos at that angle. We use them to calculate sin/cos via a Taylor
1656  * approximation:
1657  *
1658  * f(x + e) = f(x) + e f'(x) + (e^2)/2 f''(x)
1659  * sin(x + e) = sin(x) + e cos(x) - (e^2)/2 sin(x)
1660  * cos(x + e) = cos(x) - e sin(x) - (e^2)/2 cos(x)
1661  */
1662 
1663 #define TWO_OVER_PI  bi_imm_f32(2.0f / 3.14159f)
1664 #define MPI_OVER_TWO bi_imm_f32(-3.14159f / 2.0)
1665 #define SINCOS_BIAS  bi_imm_u32(0x49400000)
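/* SINCOS_BIAS is the same 1.5 * 2^19 rounding bias used for fexp2: at that
 * magnitude the float spacing is 1/16, so each low-order bit of x_u6 steps
 * the scaled angle s0 * (2/pi) by 1/16, i.e. s0 by pi/32, and 64 such steps
 * span 2*pi as the u6 tables expect. Subtracting the bias back out and
 * multiplying by -pi/2 (MPI_OVER_TWO) recovers the residual e. */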
1666 
1667 static void
1668 bi_lower_fsincos_32(bi_builder *b, bi_index dst, bi_index s0, bool cos)
1669 {
1670         /* The bottom 6 bits of the result, times pi/32, approximate s0 mod 2pi */
1671         bi_index x_u6 = bi_fma_f32(b, s0, TWO_OVER_PI, SINCOS_BIAS, BI_ROUND_NONE);
1672 
1673         /* Approximate domain error (small) */
1674         bi_index e = bi_fma_f32(b, bi_fadd_f32(b, x_u6, bi_neg(SINCOS_BIAS),
1675                                 BI_ROUND_NONE),
1676                         MPI_OVER_TWO, s0, BI_ROUND_NONE);
1677 
1678         /* Lookup sin(x), cos(x) */
1679         bi_index sinx = bi_fsin_table_u6(b, x_u6, false);
1680         bi_index cosx = bi_fcos_table_u6(b, x_u6, false);
1681 
1682         /* e^2 / 2 */
1683         bi_index e2_over_2 = bi_fma_rscale_f32(b, e, e, bi_negzero(),
1684                         bi_imm_u32(-1), BI_ROUND_NONE, BI_SPECIAL_NONE);
1685 
1686         /* (-e^2)/2 f''(x) */
1687         bi_index quadratic = bi_fma_f32(b, bi_neg(e2_over_2),
1688                         cos ? cosx : sinx,
1689                         bi_negzero(),  BI_ROUND_NONE);
1690 
1691         /* e f'(x) - (e^2/2) f''(x) */
1692         bi_instr *I = bi_fma_f32_to(b, bi_temp(b->shader), e,
1693                         cos ? bi_neg(sinx) : cosx,
1694                         quadratic, BI_ROUND_NONE);
1695         I->clamp = BI_CLAMP_CLAMP_M1_1;
1696 
1697         /* f(x) + e f'(x) - (e^2/2) f''(x) */
1698         bi_fadd_f32_to(b, dst, I->dest[0], cos ? cosx : sinx, BI_ROUND_NONE);
1699 }
1700 
1701 /* The XOR lane op is useful for derivative calculation, but was added in v7.
1702  * Add a safe helper that will do the appropriate lowering on v6 */
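/* (Within a 2x2 quad, with lanes numbered so bit 0 is horizontal and bit 1
 * vertical, XOR-ing the lane index with 1 moves to the horizontal neighbour
 * and XOR-ing with 2 to the vertical one, which is what the derivative
 * lowering further down relies on.) */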
1703 
1704 static bi_index
1705 bi_clper_xor(bi_builder *b, bi_index s0, bi_index s1)
1706 {
1707         if (b->shader->arch >= 7) {
1708                 return bi_clper_i32(b, s0, s1,
1709                                 BI_INACTIVE_RESULT_ZERO, BI_LANE_OP_XOR,
1710                                 BI_SUBGROUP_SUBGROUP4);
1711         }
1712 
1713         bi_index lane_id = bi_fau(BIR_FAU_LANE_ID, false);
1714         bi_index lane = bi_lshift_xor_i32(b, lane_id, s1, bi_imm_u8(0));
1715         return bi_clper_v6_i32(b, s0, lane);
1716 }
1717 
1718 static bi_instr *
1719 bi_emit_alu_bool(bi_builder *b, unsigned sz, nir_op op,
1720       bi_index dst, bi_index s0, bi_index s1, bi_index s2)
1721 {
1722         /* Handle 1-bit bools as 0/~0 by default and let the optimizer deal
1723          * with the bit patterns later. 0/~0 has the nice property of being
1724          * independent of replicated vectorization. */
1725         if (sz == 1) sz = 16;
1726         bi_index f = bi_zero();
1727         bi_index t = bi_imm_u16(0xFFFF);
1728 
1729         switch (op) {
1730         case nir_op_feq:
1731                 return bi_fcmp_to(b, sz, dst, s0, s1, BI_CMPF_EQ, BI_RESULT_TYPE_M1);
1732         case nir_op_flt:
1733                 return bi_fcmp_to(b, sz, dst, s0, s1, BI_CMPF_LT, BI_RESULT_TYPE_M1);
1734         case nir_op_fge:
1735                 return bi_fcmp_to(b, sz, dst, s0, s1, BI_CMPF_GE, BI_RESULT_TYPE_M1);
1736         case nir_op_fneu:
1737                 return bi_fcmp_to(b, sz, dst, s0, s1, BI_CMPF_NE, BI_RESULT_TYPE_M1);
1738 
1739         case nir_op_ieq:
1740                 return bi_icmp_to(b, nir_type_int, sz, dst, s0, s1, BI_CMPF_EQ, BI_RESULT_TYPE_M1);
1741         case nir_op_ine:
1742                 return bi_icmp_to(b, nir_type_int, sz, dst, s0, s1, BI_CMPF_NE, BI_RESULT_TYPE_M1);
1743         case nir_op_ilt:
1744                 return bi_icmp_to(b, nir_type_int, sz, dst, s0, s1, BI_CMPF_LT, BI_RESULT_TYPE_M1);
1745         case nir_op_ige:
1746                 return bi_icmp_to(b, nir_type_int, sz, dst, s0, s1, BI_CMPF_GE, BI_RESULT_TYPE_M1);
1747         case nir_op_ult:
1748                 return bi_icmp_to(b, nir_type_uint, sz, dst, s0, s1, BI_CMPF_LT, BI_RESULT_TYPE_M1);
1749         case nir_op_uge:
1750                 return bi_icmp_to(b, nir_type_uint, sz, dst, s0, s1, BI_CMPF_GE, BI_RESULT_TYPE_M1);
1751 
1752         case nir_op_iand:
1753                 return bi_lshift_and_to(b, sz, dst, s0, s1, bi_imm_u8(0));
1754         case nir_op_ior:
1755                 return bi_lshift_or_to(b, sz, dst, s0, s1, bi_imm_u8(0));
1756         case nir_op_ixor:
1757                 return bi_lshift_xor_to(b, sz, dst, s0, s1, bi_imm_u8(0));
1758         case nir_op_inot:
1759                 return bi_lshift_or_to(b, sz, dst, bi_zero(), bi_not(s0), bi_imm_u8(0));
1760 
1761         case nir_op_f2b1:
1762                 return bi_csel_to(b, nir_type_int, sz, dst, s0, f, f, t, BI_CMPF_EQ);
1763         case nir_op_i2b1:
1764                 return bi_csel_to(b, nir_type_int, sz, dst, s0, f, f, t, BI_CMPF_EQ);
1765         case nir_op_b2b1:
1766                 return bi_csel_to(b, nir_type_int, sz, dst, s0, f, f, t, BI_CMPF_EQ);
1767 
1768         case nir_op_bcsel:
1769                 return bi_csel_to(b, nir_type_int, sz, dst, s0, f, s1, s2, BI_CMPF_NE);
1770 
1771         default:
1772                 fprintf(stderr, "Unhandled ALU op %s\n", nir_op_infos[op].name);
1773                 unreachable("Unhandled boolean ALU instruction");
1774         }
1775 }
1776 
1777 static void
1778 bi_emit_alu(bi_builder *b, nir_alu_instr *instr)
1779 {
1780         bi_index dst = bi_dest_index(&instr->dest.dest);
1781         unsigned srcs = nir_op_infos[instr->op].num_inputs;
1782         unsigned sz = nir_dest_bit_size(instr->dest.dest);
1783         unsigned comps = nir_dest_num_components(instr->dest.dest);
1784         unsigned src_sz = srcs > 0 ? nir_src_bit_size(instr->src[0].src) : 0;
1785         unsigned src1_sz = srcs > 1 ? nir_src_bit_size(instr->src[1].src) : 0;
1786         bool is_bool = (sz == 1);
1787 
1788         /* TODO: Anything else? */
1789         if (sz == 1)
1790                 sz = 16;
1791 
1792         /* Indicate scalarness */
1793         if (sz == 16 && comps == 1)
1794                 dst.swizzle = BI_SWIZZLE_H00;
1795 
1796         if (!instr->dest.dest.is_ssa) {
1797                 for (unsigned i = 0; i < comps; ++i)
1798                         assert(instr->dest.write_mask & (1 << i));
1799         }
1800 
1801         /* First, match against the various moves in NIR. These are
1802          * special-cased because they can operate on vectors even after
1803          * lowering ALU to scalar. For Bifrost, bi_alu_src_index assumes the
1804          * instruction is no "bigger" than SIMD-within-a-register. These moves
1805          * are the exceptions that need to handle swizzles specially. */
1806 
1807         switch (instr->op) {
1808         case nir_op_pack_32_2x16:
1809         case nir_op_vec2:
1810         case nir_op_vec3:
1811         case nir_op_vec4: {
1812                 bi_index unoffset_srcs[4] = {
1813                         srcs > 0 ? bi_src_index(&instr->src[0].src) : bi_null(),
1814                         srcs > 1 ? bi_src_index(&instr->src[1].src) : bi_null(),
1815                         srcs > 2 ? bi_src_index(&instr->src[2].src) : bi_null(),
1816                         srcs > 3 ? bi_src_index(&instr->src[3].src) : bi_null(),
1817                 };
1818 
1819                 unsigned channels[4] = {
1820                         instr->src[0].swizzle[0],
1821                         srcs > 1 ? instr->src[1].swizzle[0] : 0,
1822                         srcs > 2 ? instr->src[2].swizzle[0] : 0,
1823                         srcs > 3 ? instr->src[3].swizzle[0] : 0,
1824                 };
1825 
1826                 bi_make_vec_to(b, dst, unoffset_srcs, channels, srcs, sz);
1827                 return;
1828         }
1829 
1830         case nir_op_vec8:
1831         case nir_op_vec16:
1832                 unreachable("should've been lowered");
1833 
1834         case nir_op_unpack_32_2x16:
1835         case nir_op_unpack_64_2x32_split_x:
1836                 bi_mov_i32_to(b, dst, bi_src_index(&instr->src[0].src));
1837                 return;
1838 
1839         case nir_op_unpack_64_2x32_split_y:
1840                 bi_mov_i32_to(b, dst, bi_word(bi_src_index(&instr->src[0].src), 1));
1841                 return;
1842 
1843         case nir_op_pack_64_2x32_split:
1844                 bi_mov_i32_to(b, bi_word(dst, 0), bi_src_index(&instr->src[0].src));
1845                 bi_mov_i32_to(b, bi_word(dst, 1), bi_src_index(&instr->src[1].src));
1846                 return;
1847 
1848         case nir_op_pack_64_2x32:
1849                 bi_mov_i32_to(b, bi_word(dst, 0), bi_word(bi_src_index(&instr->src[0].src), 0));
1850                 bi_mov_i32_to(b, bi_word(dst, 1), bi_word(bi_src_index(&instr->src[0].src), 1));
1851                 return;
1852 
1853         case nir_op_mov: {
1854                 bi_index idx = bi_src_index(&instr->src[0].src);
1855                 bi_index unoffset_srcs[4] = { idx, idx, idx, idx };
1856 
1857                 unsigned channels[4] = {
1858                         comps > 0 ? instr->src[0].swizzle[0] : 0,
1859                         comps > 1 ? instr->src[0].swizzle[1] : 0,
1860                         comps > 2 ? instr->src[0].swizzle[2] : 0,
1861                         comps > 3 ? instr->src[0].swizzle[3] : 0,
1862                 };
1863 
1864                 if (sz == 1) sz = 16;
1865                 bi_make_vec_to(b, dst, unoffset_srcs, channels, comps, sz);
1866                 return;
1867         }
1868 
1869         case nir_op_f2f16:
1870                 assert(src_sz == 32);
1871                 bi_index idx = bi_src_index(&instr->src[0].src);
1872                 bi_index s0 = bi_word(idx, instr->src[0].swizzle[0]);
1873                 bi_index s1 = comps > 1 ?
1874                         bi_word(idx, instr->src[0].swizzle[1]) : s0;
1875 
1876                 bi_v2f32_to_v2f16_to(b, dst, s0, s1, BI_ROUND_NONE);
1877                 return;
1878 
1879         /* Vectorized downcasts */
1880         case nir_op_u2u16:
1881         case nir_op_i2i16: {
1882                 if (!(src_sz == 32 && comps == 2))
1883                         break;
1884 
1885                 bi_index idx = bi_src_index(&instr->src[0].src);
1886                 bi_index s0 = bi_word(idx, instr->src[0].swizzle[0]);
1887                 bi_index s1 = bi_word(idx, instr->src[0].swizzle[1]);
1888 
1889                 bi_mkvec_v2i16_to(b, dst,
1890                                 bi_half(s0, false), bi_half(s1, false));
1891                 return;
1892         }
1893 
1894         case nir_op_i2i8:
1895         case nir_op_u2u8:
1896         {
1897                 /* Acts like an 8-bit swizzle */
1898                 bi_index idx = bi_src_index(&instr->src[0].src);
1899                 unsigned factor = src_sz / 8;
1900                 unsigned chan[4] = { 0 };
1901 
1902                 for (unsigned i = 0; i < comps; ++i)
1903                         chan[i] = instr->src[0].swizzle[i] * factor;
1904 
1905                 bi_make_vec_to(b, dst, &idx, chan, comps, 8);
1906                 return;
1907         }
1908 
1909         default:
1910                 break;
1911         }
1912 
1913         bi_index s0 = srcs > 0 ? bi_alu_src_index(instr->src[0], comps) : bi_null();
1914         bi_index s1 = srcs > 1 ? bi_alu_src_index(instr->src[1], comps) : bi_null();
1915         bi_index s2 = srcs > 2 ? bi_alu_src_index(instr->src[2], comps) : bi_null();
1916 
1917         if (is_bool) {
1918                 bi_emit_alu_bool(b, src_sz, instr->op, dst, s0, s1, s2);
1919                 return;
1920         }
1921 
1922         switch (instr->op) {
1923         case nir_op_ffma:
1924                 bi_fma_to(b, sz, dst, s0, s1, s2, BI_ROUND_NONE);
1925                 break;
1926 
1927         case nir_op_fmul:
1928                 bi_fma_to(b, sz, dst, s0, s1, bi_negzero(), BI_ROUND_NONE);
1929                 break;
1930 
1931         case nir_op_fsub:
1932                 s1 = bi_neg(s1);
1933                 FALLTHROUGH;
1934         case nir_op_fadd:
1935                 bi_fadd_to(b, sz, dst, s0, s1, BI_ROUND_NONE);
1936                 break;
1937 
1938         case nir_op_fsat: {
1939                 bi_instr *I = bi_fclamp_to(b, sz, dst, s0);
1940                 I->clamp = BI_CLAMP_CLAMP_0_1;
1941                 break;
1942         }
1943 
1944         case nir_op_fsat_signed_mali: {
1945                 bi_instr *I = bi_fclamp_to(b, sz, dst, s0);
1946                 I->clamp = BI_CLAMP_CLAMP_M1_1;
1947                 break;
1948         }
1949 
1950         case nir_op_fclamp_pos_mali: {
1951                 bi_instr *I = bi_fclamp_to(b, sz, dst, s0);
1952                 I->clamp = BI_CLAMP_CLAMP_0_INF;
1953                 break;
1954         }
1955 
1956         case nir_op_fneg:
1957                 bi_fabsneg_to(b, sz, dst, bi_neg(s0));
1958                 break;
1959 
1960         case nir_op_fabs:
1961                 bi_fabsneg_to(b, sz, dst, bi_abs(s0));
1962                 break;
1963 
1964         case nir_op_fsin:
1965                 bi_lower_fsincos_32(b, dst, s0, false);
1966                 break;
1967 
1968         case nir_op_fcos:
1969                 bi_lower_fsincos_32(b, dst, s0, true);
1970                 break;
1971 
1972         case nir_op_fexp2:
1973                 assert(sz == 32); /* should've been lowered */
1974 
1975                 if (b->shader->quirks & BIFROST_NO_FP32_TRANSCENDENTALS)
1976                         bi_lower_fexp2_32(b, dst, s0);
1977                 else
1978                         bi_fexp_32(b, dst, s0, bi_imm_f32(1.0f));
1979 
1980                 break;
1981 
1982         case nir_op_flog2:
1983                 assert(sz == 32); /* should've been lowered */
1984 
1985                 if (b->shader->quirks & BIFROST_NO_FP32_TRANSCENDENTALS)
1986                         bi_lower_flog2_32(b, dst, s0);
1987                 else
1988                         bi_flog2_32(b, dst, s0);
1989 
1990                 break;
1991 
1992         case nir_op_fpow:
1993                 assert(sz == 32); /* should've been lowered */
1994 
1995                 if (b->shader->quirks & BIFROST_NO_FP32_TRANSCENDENTALS)
1996                         bi_lower_fpow_32(b, dst, s0, s1);
1997                 else
1998                         bi_fpow_32(b, dst, s0, s1);
1999 
2000                 break;
2001 
2002         case nir_op_bcsel:
2003                 if (src1_sz == 8)
2004                         bi_mux_v4i8_to(b, dst, s2, s1, s0, BI_MUX_INT_ZERO);
2005                 else
2006                         bi_csel_to(b, nir_type_int, src1_sz,
2007                                         dst, s0, bi_zero(), s1, s2, BI_CMPF_NE);
2008                 break;
2009 
2010         case nir_op_ishl:
2011                 bi_lshift_or_to(b, sz, dst, s0, bi_zero(), bi_byte(s1, 0));
2012                 break;
2013         case nir_op_ushr:
2014                 bi_rshift_or_to(b, sz, dst, s0, bi_zero(), bi_byte(s1, 0));
2015                 break;
2016 
2017         case nir_op_ishr:
2018                 bi_arshift_to(b, sz, dst, s0, bi_null(), bi_byte(s1, 0));
2019                 break;
2020 
2021         case nir_op_imin:
2022         case nir_op_umin:
2023                 bi_csel_to(b, nir_op_infos[instr->op].input_types[0], sz, dst,
2024                                 s0, s1, s0, s1, BI_CMPF_LT);
2025                 break;
2026 
2027         case nir_op_imax:
2028         case nir_op_umax:
2029                 bi_csel_to(b, nir_op_infos[instr->op].input_types[0], sz, dst,
2030                                 s0, s1, s0, s1, BI_CMPF_GT);
2031                 break;
2032 
2033         case nir_op_fddx_must_abs_mali:
2034         case nir_op_fddy_must_abs_mali: {
2035                 bi_index bit = bi_imm_u32(instr->op == nir_op_fddx_must_abs_mali ? 1 : 2);
2036                 bi_index adjacent = bi_clper_xor(b, s0, bit);
2037                 bi_fadd_to(b, sz, dst, adjacent, bi_neg(s0), BI_ROUND_NONE);
2038                 break;
2039         }
2040 
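        /* Forward differences within the 2x2 quad, assuming the usual lane
         * layout (bit 0 = horizontal, bit 1 = vertical): mask the lane index
         * down to the start of the row (fddx) or column (fddy), read the
         * value there and at its neighbour with CLPER, and subtract */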
2041         case nir_op_fddx:
2042         case nir_op_fddy: {
2043                 bi_index lane1 = bi_lshift_and_i32(b,
2044                                 bi_fau(BIR_FAU_LANE_ID, false),
2045                                 bi_imm_u32(instr->op == nir_op_fddx ? 2 : 1),
2046                                 bi_imm_u8(0));
2047 
2048                 bi_index lane2 = bi_iadd_u32(b, lane1,
2049                                 bi_imm_u32(instr->op == nir_op_fddx ? 1 : 2),
2050                                 false);
2051 
2052                 bi_index left, right;
2053 
2054                 if (b->shader->quirks & BIFROST_LIMITED_CLPER) {
2055                         left = bi_clper_v6_i32(b, s0, lane1);
2056                         right = bi_clper_v6_i32(b, s0, lane2);
2057                 } else {
2058                         left = bi_clper_i32(b, s0, lane1,
2059                                         BI_INACTIVE_RESULT_ZERO, BI_LANE_OP_NONE,
2060                                         BI_SUBGROUP_SUBGROUP4);
2061 
2062                         right = bi_clper_i32(b, s0, lane2,
2063                                         BI_INACTIVE_RESULT_ZERO, BI_LANE_OP_NONE,
2064                                         BI_SUBGROUP_SUBGROUP4);
2065                 }
2066 
2067                 bi_fadd_to(b, sz, dst, right, bi_neg(left), BI_ROUND_NONE);
2068                 break;
2069         }
2070 
2071         case nir_op_f2f32:
2072                 bi_f16_to_f32_to(b, dst, s0);
2073                 break;
2074 
2075         case nir_op_f2i32:
2076                 if (src_sz == 32)
2077                         bi_f32_to_s32_to(b, dst, s0, BI_ROUND_RTZ);
2078                 else
2079                         bi_f16_to_s32_to(b, dst, s0, BI_ROUND_RTZ);
2080                 break;
2081 
2082         /* Note 32-bit sources => no vectorization, so 32-bit works */
2083         case nir_op_f2u16:
2084                 if (src_sz == 32)
2085                         bi_f32_to_u32_to(b, dst, s0, BI_ROUND_RTZ);
2086                 else
2087                         bi_v2f16_to_v2u16_to(b, dst, s0, BI_ROUND_RTZ);
2088                 break;
2089 
2090         case nir_op_f2i16:
2091                 if (src_sz == 32)
2092                         bi_f32_to_s32_to(b, dst, s0, BI_ROUND_RTZ);
2093                 else
2094                         bi_v2f16_to_v2s16_to(b, dst, s0, BI_ROUND_RTZ);
2095                 break;
2096 
2097         case nir_op_f2u32:
2098                 if (src_sz == 32)
2099                         bi_f32_to_u32_to(b, dst, s0, BI_ROUND_RTZ);
2100                 else
2101                         bi_f16_to_u32_to(b, dst, s0, BI_ROUND_RTZ);
2102                 break;
2103 
2104         case nir_op_u2f16:
2105                 if (src_sz == 32)
2106                         bi_v2u16_to_v2f16_to(b, dst, bi_half(s0, false), BI_ROUND_RTZ);
2107                 else if (src_sz == 16)
2108                         bi_v2u16_to_v2f16_to(b, dst, s0, BI_ROUND_RTZ);
2109                 else if (src_sz == 8)
2110                         bi_v2u8_to_v2f16_to(b, dst, s0);
2111                 break;
2112 
2113         case nir_op_u2f32:
2114                 if (src_sz == 32)
2115                         bi_u32_to_f32_to(b, dst, s0, BI_ROUND_RTZ);
2116                 else if (src_sz == 16)
2117                         bi_u16_to_f32_to(b, dst, s0);
2118                 else
2119                         bi_u8_to_f32_to(b, dst, s0);
2120                 break;
2121 
2122         case nir_op_i2f16:
2123                 if (src_sz == 32)
2124                         bi_v2s16_to_v2f16_to(b, dst, bi_half(s0, false), BI_ROUND_RTZ);
2125                 else if (src_sz == 16)
2126                         bi_v2s16_to_v2f16_to(b, dst, s0, BI_ROUND_RTZ);
2127                 else if (src_sz == 8)
2128                         bi_v2s8_to_v2f16_to(b, dst, s0);
2129                 break;
2130 
2131         case nir_op_i2f32:
2132                 if (src_sz == 32)
2133                         bi_s32_to_f32_to(b, dst, s0, BI_ROUND_RTZ);
2134                 else if (src_sz == 16)
2135                         bi_s16_to_f32_to(b, dst, s0);
2136                 else if (src_sz == 8)
2137                         bi_s8_to_f32_to(b, dst, s0);
2138                 break;
2139 
2140         case nir_op_i2i32:
2141                 if (src_sz == 16)
2142                         bi_s16_to_s32_to(b, dst, s0);
2143                 else
2144                         bi_s8_to_s32_to(b, dst, s0);
2145                 break;
2146 
2147         case nir_op_u2u32:
2148                 if (src_sz == 16)
2149                         bi_u16_to_u32_to(b, dst, s0);
2150                 else
2151                         bi_u8_to_u32_to(b, dst, s0);
2152                 break;
2153 
2154         case nir_op_i2i16:
2155                 assert(src_sz == 8 || src_sz == 32);
2156 
2157                 if (src_sz == 8)
2158                         bi_v2s8_to_v2s16_to(b, dst, s0);
2159                 else
2160                         bi_mov_i32_to(b, dst, s0);
2161                 break;
2162 
2163         case nir_op_u2u16:
2164                 assert(src_sz == 8 || src_sz == 32);
2165 
2166                 if (src_sz == 8)
2167                         bi_v2u8_to_v2u16_to(b, dst, s0);
2168                 else
2169                         bi_mov_i32_to(b, dst, s0);
2170                 break;
2171 
2172         case nir_op_b2f16:
2173         case nir_op_b2f32:
2174                 bi_csel_to(b, nir_type_int, sz, dst, s0, bi_zero(),
2175                                 (sz == 16) ? bi_imm_f16(1.0) : bi_imm_f32(1.0),
2176                                 (sz == 16) ? bi_imm_f16(0.0) : bi_imm_f32(0.0),
2177                                 BI_CMPF_NE);
2178                 break;
2179 
2180         case nir_op_b2b32:
2181                 bi_csel_to(b, nir_type_int, sz, dst, s0, bi_zero(),
2182                                 bi_imm_u32(~0), bi_zero(), BI_CMPF_NE);
2183                 break;
2184 
2185         case nir_op_b2i8:
2186         case nir_op_b2i16:
2187         case nir_op_b2i32:
2188                 bi_lshift_and_to(b, sz, dst, s0, bi_imm_uintN(1, sz), bi_imm_u8(0));
2189                 break;
2190 
2191         case nir_op_fround_even:
2192         case nir_op_fceil:
2193         case nir_op_ffloor:
2194         case nir_op_ftrunc:
2195                 bi_fround_to(b, sz, dst, s0, bi_nir_round(instr->op));
2196                 break;
2197 
2198         case nir_op_fmin:
2199                 bi_fmin_to(b, sz, dst, s0, s1);
2200                 break;
2201 
2202         case nir_op_fmax:
2203                 bi_fmax_to(b, sz, dst, s0, s1);
2204                 break;
2205 
2206         case nir_op_iadd:
2207                 bi_iadd_to(b, nir_type_int, sz, dst, s0, s1, false);
2208                 break;
2209 
2210         case nir_op_iadd_sat:
2211                 bi_iadd_to(b, nir_type_int, sz, dst, s0, s1, true);
2212                 break;
2213 
2214         case nir_op_uadd_sat:
2215                 bi_iadd_to(b, nir_type_uint, sz, dst, s0, s1, true);
2216                 break;
2217 
2218         case nir_op_ihadd:
2219                 bi_hadd_to(b, nir_type_int, sz, dst, s0, s1, BI_ROUND_RTN);
2220                 break;
2221 
2222         case nir_op_irhadd:
2223                 bi_hadd_to(b, nir_type_int, sz, dst, s0, s1, BI_ROUND_RTP);
2224                 break;
2225 
2226         case nir_op_ineg:
2227                 bi_isub_to(b, nir_type_int, sz, dst, bi_zero(), s0, false);
2228                 break;
2229 
2230         case nir_op_isub:
2231                 bi_isub_to(b, nir_type_int, sz, dst, s0, s1, false);
2232                 break;
2233 
2234         case nir_op_isub_sat:
2235                 bi_isub_to(b, nir_type_int, sz, dst, s0, s1, true);
2236                 break;
2237 
2238         case nir_op_usub_sat:
2239                 bi_isub_to(b, nir_type_uint, sz, dst, s0, s1, true);
2240                 break;
2241 
2242         case nir_op_imul:
2243                 bi_imul_to(b, sz, dst, s0, s1);
2244                 break;
2245 
2246         case nir_op_iabs:
2247                 bi_iabs_to(b, sz, dst, s0);
2248                 break;
2249 
2250         case nir_op_iand:
2251                 bi_lshift_and_to(b, sz, dst, s0, s1, bi_imm_u8(0));
2252                 break;
2253 
2254         case nir_op_ior:
2255                 bi_lshift_or_to(b, sz, dst, s0, s1, bi_imm_u8(0));
2256                 break;
2257 
2258         case nir_op_ixor:
2259                 bi_lshift_xor_to(b, sz, dst, s0, s1, bi_imm_u8(0));
2260                 break;
2261 
2262         case nir_op_inot:
2263                 bi_lshift_or_to(b, sz, dst, bi_zero(), bi_not(s0), bi_imm_u8(0));
2264                 break;
2265 
2266         case nir_op_frsq:
2267                 if (sz == 32 && b->shader->quirks & BIFROST_NO_FP32_TRANSCENDENTALS)
2268                         bi_lower_frsq_32(b, dst, s0);
2269                 else
2270                         bi_frsq_to(b, sz, dst, s0);
2271                 break;
2272 
2273         case nir_op_frcp:
2274                 if (sz == 32 && b->shader->quirks & BIFROST_NO_FP32_TRANSCENDENTALS)
2275                         bi_lower_frcp_32(b, dst, s0);
2276                 else
2277                         bi_frcp_to(b, sz, dst, s0);
2278                 break;
2279 
2280         case nir_op_uclz:
2281                 bi_clz_to(b, sz, dst, s0, false);
2282                 break;
2283 
2284         case nir_op_bit_count:
2285                 bi_popcount_i32_to(b, dst, s0);
2286                 break;
2287 
2288         case nir_op_bitfield_reverse:
2289                 bi_bitrev_i32_to(b, dst, s0);
2290                 break;
2291 
2292         case nir_op_ufind_msb: {
2293                 bi_index clz = bi_clz(b, src_sz, s0, false);
2294 
2295                 if (sz == 8)
2296                         clz = bi_byte(clz, 0);
2297                 else if (sz == 16)
2298                         clz = bi_half(clz, false);
2299 
2300                 bi_isub_u32_to(b, dst, bi_imm_u32(src_sz - 1), clz, false);
2301                 break;
2302         }
2303 
2304         default:
2305                 fprintf(stderr, "Unhandled ALU op %s\n", nir_op_infos[instr->op].name);
2306                 unreachable("Unknown ALU op");
2307         }
2308 }
2309 
2310 /* Returns the dimension, with 0 special-casing cubemaps. Shamelessly copied from Midgard */
2311 static unsigned
2312 bifrost_tex_format(enum glsl_sampler_dim dim)
2313 {
2314         switch (dim) {
2315         case GLSL_SAMPLER_DIM_1D:
2316         case GLSL_SAMPLER_DIM_BUF:
2317                 return 1;
2318 
2319         case GLSL_SAMPLER_DIM_2D:
2320         case GLSL_SAMPLER_DIM_MS:
2321         case GLSL_SAMPLER_DIM_EXTERNAL:
2322         case GLSL_SAMPLER_DIM_RECT:
2323                 return 2;
2324 
2325         case GLSL_SAMPLER_DIM_3D:
2326                 return 3;
2327 
2328         case GLSL_SAMPLER_DIM_CUBE:
2329                 return 0;
2330 
2331         default:
2332                 DBG("Unknown sampler dim type\n");
2333                 assert(0);
2334                 return 0;
2335         }
2336 }
2337 
2338 static enum bifrost_texture_format_full
2339 bi_texture_format(nir_alu_type T, enum bi_clamp clamp)
2340 {
2341         switch (T) {
2342         case nir_type_float16: return BIFROST_TEXTURE_FORMAT_F16 + clamp;
2343         case nir_type_float32: return BIFROST_TEXTURE_FORMAT_F32 + clamp;
2344         case nir_type_uint16:  return BIFROST_TEXTURE_FORMAT_U16;
2345         case nir_type_int16:   return BIFROST_TEXTURE_FORMAT_S16;
2346         case nir_type_uint32:  return BIFROST_TEXTURE_FORMAT_U32;
2347         case nir_type_int32:   return BIFROST_TEXTURE_FORMAT_S32;
2348         default:              unreachable("Invalid type for texturing");
2349         }
2350 }
2351 
2352 /* Array indices are specified as 32-bit uints and need converting; NIR passes them in the .z component */
2353 static bi_index
2354 bi_emit_texc_array_index(bi_builder *b, bi_index idx, nir_alu_type T)
2355 {
2356         /* For (u)int we can just passthrough */
2357         nir_alu_type base = nir_alu_type_get_base_type(T);
2358         if (base == nir_type_int || base == nir_type_uint)
2359                 return idx;
2360 
2361         /* Otherwise we convert */
2362         assert(T == nir_type_float32);
2363 
2364         /* OpenGL ES 3.2 specification section 8.14.2 ("Coordinate Wrapping and
2365          * Texel Selection") defines the layer to be taken from clamp(RNE(r),
2366          * 0, dt - 1). So we use round RTE, clamping is handled at the data
2367          * structure level */
2368 
2369         return bi_f32_to_u32(b, idx, BI_ROUND_NONE);
2370 }
2371 
2372 /* TEXC's explicit and bias LOD modes requires the LOD to be transformed to a
2373  * 16-bit 8:8 fixed-point format. We lower as:
2374  *
2375  * F32_TO_S32(clamp(x, -16.0, +16.0) * 256.0) & 0xFFFF =
2376  * MKVEC(F32_TO_S32(clamp(x * 1.0/16.0, -1.0, 1.0) * (16.0 * 256.0)), #0)
2377  */
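/* e.g. lod = 2.5 becomes CLAMP(2.5, -16, 16) * 256 = 640 = 0x0280: integer
 * part 2 in the high byte and the fraction 0.5 as 0x80 in the low byte */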
2378 
2379 static bi_index
2380 bi_emit_texc_lod_88(bi_builder *b, bi_index lod, bool fp16)
2381 {
2382         /* Precompute for constant LODs to avoid general constant folding */
2383         if (lod.type == BI_INDEX_CONSTANT) {
2384                 uint32_t raw = lod.value;
2385                 float x = fp16 ? _mesa_half_to_float(raw) : uif(raw);
2386                 int32_t s32 = CLAMP(x, -16.0f, 16.0f) * 256.0f;
2387                 return bi_imm_u32(s32 & 0xFFFF);
2388         }
2389 
2390         /* Sort of arbitrary. Must be less than 128.0, greater than or equal to
2391          * the max LOD (16 since we cap at 2^16 texture dimensions), and
2392          * preferably small to minimize precision loss */
2393         const float max_lod = 16.0;
2394 
2395         bi_instr *fsat = bi_fma_f32_to(b, bi_temp(b->shader),
2396                         fp16 ? bi_half(lod, false) : lod,
2397                         bi_imm_f32(1.0f / max_lod), bi_negzero(), BI_ROUND_NONE);
2398 
2399         fsat->clamp = BI_CLAMP_CLAMP_M1_1;
2400 
2401         bi_index fmul = bi_fma_f32(b, fsat->dest[0], bi_imm_f32(max_lod * 256.0f),
2402                         bi_negzero(), BI_ROUND_NONE);
2403 
2404         return bi_mkvec_v2i16(b,
2405                         bi_half(bi_f32_to_s32(b, fmul, BI_ROUND_RTZ), false),
2406                         bi_imm_u16(0));
2407 }
2408 
2409 /* FETCH takes a 32-bit staging register containing the LOD as an integer in
2410  * the bottom 16-bits and (if present) the cube face index in the top 16-bits.
2411  * TODO: Cube face.
2412  */
2413 
2414 static bi_index
2415 bi_emit_texc_lod_cube(bi_builder *b, bi_index lod)
2416 {
2417         return bi_lshift_or_i32(b, lod, bi_zero(), bi_imm_u8(8));
2418 }
2419 
2420 /* The hardware specifies texel offsets and multisample indices together as a
2421  * u8vec4 <offset, ms index>. By default all are zero, so if we have either a
2422  * nonzero texel offset or a nonzero multisample index, we build a u8vec4 with
2423  * the bits we need and return that to be passed as a staging register. Else we
2424  * return 0 to avoid allocating a data register when everything is zero. */
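/* e.g. a texel offset of (1, -2) with multisample index 3 would pack as the
 * bytes <0x01, 0xfe, 0x00> via MKVEC.v4i8 below, with the sample index OR'd
 * into the top byte by the shift of 24 */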
2425 
2426 static bi_index
2427 bi_emit_texc_offset_ms_index(bi_builder *b, nir_tex_instr *instr)
2428 {
2429         bi_index dest = bi_zero();
2430 
2431         int offs_idx = nir_tex_instr_src_index(instr, nir_tex_src_offset);
2432         if (offs_idx >= 0 &&
2433             (!nir_src_is_const(instr->src[offs_idx].src) ||
2434              nir_src_as_uint(instr->src[offs_idx].src) != 0)) {
2435                 unsigned nr = nir_src_num_components(instr->src[offs_idx].src);
2436                 bi_index idx = bi_src_index(&instr->src[offs_idx].src);
2437                 dest = bi_mkvec_v4i8(b,
2438                                 (nr > 0) ? bi_byte(bi_word(idx, 0), 0) : bi_imm_u8(0),
2439                                 (nr > 1) ? bi_byte(bi_word(idx, 1), 0) : bi_imm_u8(0),
2440                                 (nr > 2) ? bi_byte(bi_word(idx, 2), 0) : bi_imm_u8(0),
2441                                 bi_imm_u8(0));
2442         }
2443 
2444         int ms_idx = nir_tex_instr_src_index(instr, nir_tex_src_ms_index);
2445         if (ms_idx >= 0 &&
2446             (!nir_src_is_const(instr->src[ms_idx].src) ||
2447              nir_src_as_uint(instr->src[ms_idx].src) != 0)) {
2448                 dest = bi_lshift_or_i32(b,
2449                                 bi_src_index(&instr->src[ms_idx].src), dest,
2450                                 bi_imm_u8(24));
2451         }
2452 
2453         return dest;
2454 }
2455 
2456 static void
2457 bi_emit_cube_coord(bi_builder *b, bi_index coord,
2458                     bi_index *face, bi_index *s, bi_index *t)
2459 {
2460         /* Compute max { |x|, |y|, |z| } */
2461         bi_instr *cubeface = bi_cubeface_to(b, bi_temp(b->shader),
2462                         bi_temp(b->shader), coord,
2463                         bi_word(coord, 1), bi_word(coord, 2));
2464 
2465         /* Select coordinates */
2466 
2467         bi_index ssel = bi_cube_ssel(b, bi_word(coord, 2), coord,
2468                         cubeface->dest[1]);
2469 
2470         bi_index tsel = bi_cube_tsel(b, bi_word(coord, 1), bi_word(coord, 2),
2471                         cubeface->dest[1]);
2472 
2473         /* The OpenGL ES specification requires us to transform an input vector
2474          * (x, y, z) to the coordinate, given the selected S/T:
2475          *
2476          * (1/2 ((s / max{x,y,z}) + 1), 1/2 ((t / max{x, y, z}) + 1))
2477          *
2478          * We implement (s shown, t similar) in a form friendlier to FMA
2479          * instructions, and clamp coordinates at the end for correct
2480          * NaN/infinity handling:
2481          *
2482          * fsat(s * (0.5 * (1 / max{x, y, z})) + 0.5)
2483          *
2484          * Take the reciprocal of max{x, y, z}
2485          */
2486 
2487         bi_index rcp = bi_frcp_f32(b, cubeface->dest[0]);
2488 
2489         /* Calculate 0.5 * (1.0 / max{x, y, z}) */
2490         bi_index fma1 = bi_fma_f32(b, rcp, bi_imm_f32(0.5f), bi_negzero(),
2491                         BI_ROUND_NONE);
2492 
2493         /* Transform the coordinates */
2494         *s = bi_temp(b->shader);
2495         *t = bi_temp(b->shader);
2496 
2497         bi_instr *S = bi_fma_f32_to(b, *s, fma1, ssel, bi_imm_f32(0.5f),
2498                         BI_ROUND_NONE);
2499         bi_instr *T = bi_fma_f32_to(b, *t, fma1, tsel, bi_imm_f32(0.5f),
2500                         BI_ROUND_NONE);
2501 
2502         S->clamp = BI_CLAMP_CLAMP_0_1;
2503         T->clamp = BI_CLAMP_CLAMP_0_1;
2504 
2505         /* Face index at bit[29:31], matching the cube map descriptor */
2506         *face = cubeface->dest[1];
2507 }
2508 
2509 /* Emits a cube map descriptor, returning lower 32-bits and putting upper
2510  * 32-bits in passed pointer t. The packing of the face with the S coordinate
2511  * exploits the redundancy of floating points with the range restriction of
2512  * CUBEFACE output.
2513  *
2514  *     struct cube_map_descriptor {
2515  *         float s : 29;
2516  *         unsigned face : 3;
2517  *         float t : 32;
2518  *     }
2519  *
2520  * Since the cube face index is preshifted, this is easy to pack with a bitwise
2521  * MUX.i32 and a fixed mask, selecting the lower bits 29 from s and the upper 3
2522  * MUX.i32 and a fixed mask, selecting the lower 29 bits from s and the upper 3
2523  */
2524 
2525 static bi_index
2526 bi_emit_texc_cube_coord(bi_builder *b, bi_index coord, bi_index *t)
2527 {
2528         bi_index face, s;
2529         bi_emit_cube_coord(b, coord, &face, &s, t);
2530         bi_index mask = bi_imm_u32(BITFIELD_MASK(29));
2531         return bi_mux_i32(b, s, face, mask, BI_MUX_BIT);
2532 }
2533 
2534 /* Map to the main texture op used. Some of these (txd in particular) will
2535  * lower to multiple texture ops with different opcodes (GRDESC_DER + TEX in
2536  * sequence). We assume that lowering is handled elsewhere.
2537  */
2538 
2539 static enum bifrost_tex_op
2540 bi_tex_op(nir_texop op)
2541 {
2542         switch (op) {
2543         case nir_texop_tex:
2544         case nir_texop_txb:
2545         case nir_texop_txl:
2546         case nir_texop_txd:
2547         case nir_texop_tex_prefetch:
2548                 return BIFROST_TEX_OP_TEX;
2549         case nir_texop_txf:
2550         case nir_texop_txf_ms:
2551         case nir_texop_txf_ms_fb:
2552         case nir_texop_tg4:
2553                 return BIFROST_TEX_OP_FETCH;
2554         case nir_texop_txs:
2555         case nir_texop_lod:
2556         case nir_texop_query_levels:
2557         case nir_texop_texture_samples:
2558         case nir_texop_samples_identical:
2559                 unreachable("should've been lowered");
2560         default:
2561                 unreachable("unsupported tex op");
2562         }
2563 }
2564 
2565 /* Data registers required by texturing in the order they appear. All are
2566  * optional, the texture operation descriptor determines which are present.
2567  * Note since 3D arrays are not permitted at an API level, Z_COORD and
2568  * ARRAY/SHADOW are exclusive, so TEXC in practice reads at most 8 registers */
2569 
2570 enum bifrost_tex_dreg {
2571         BIFROST_TEX_DREG_Z_COORD = 0,
2572         BIFROST_TEX_DREG_Y_DELTAS = 1,
2573         BIFROST_TEX_DREG_LOD = 2,
2574         BIFROST_TEX_DREG_GRDESC_HI = 3,
2575         BIFROST_TEX_DREG_SHADOW = 4,
2576         BIFROST_TEX_DREG_ARRAY = 5,
2577         BIFROST_TEX_DREG_OFFSETMS = 6,
2578         BIFROST_TEX_DREG_SAMPLER = 7,
2579         BIFROST_TEX_DREG_TEXTURE = 8,
2580         BIFROST_TEX_DREG_COUNT,
2581 };
2582 
2583 static void
2584 bi_emit_texc(bi_builder *b, nir_tex_instr *instr)
2585 {
2586         bool computed_lod = false;
2587 
2588         struct bifrost_texture_operation desc = {
2589                 .op = bi_tex_op(instr->op),
2590                 .offset_or_bias_disable = false, /* TODO */
2591                 .shadow_or_clamp_disable = instr->is_shadow,
2592                 .array = instr->is_array,
2593                 .dimension = bifrost_tex_format(instr->sampler_dim),
2594                 .format = bi_texture_format(instr->dest_type | nir_dest_bit_size(instr->dest), BI_CLAMP_NONE), /* TODO */
2595                 .mask = 0xF,
2596         };
2597 
2598         switch (desc.op) {
2599         case BIFROST_TEX_OP_TEX:
2600                 desc.lod_or_fetch = BIFROST_LOD_MODE_COMPUTE;
2601                 computed_lod = true;
2602                 break;
2603         case BIFROST_TEX_OP_FETCH:
2604                 desc.lod_or_fetch = (enum bifrost_lod_mode)
2605                    (instr->op == nir_texop_tg4 ?
2606                         BIFROST_TEXTURE_FETCH_GATHER4_R + instr->component :
2607                         BIFROST_TEXTURE_FETCH_TEXEL);
2608                 break;
2609         default:
2610                 unreachable("texture op unsupported");
2611         }
2612 
2613         /* 32-bit indices to be allocated as consecutive staging registers */
2614         bi_index dregs[BIFROST_TEX_DREG_COUNT] = { };
2615         bi_index cx = bi_null(), cy = bi_null();
2616 
2617         for (unsigned i = 0; i < instr->num_srcs; ++i) {
2618                 bi_index index = bi_src_index(&instr->src[i].src);
2619                 unsigned sz = nir_src_bit_size(instr->src[i].src);
2620                 ASSERTED nir_alu_type base = nir_tex_instr_src_type(instr, i);
2621                 nir_alu_type T = base | sz;
2622 
2623                 switch (instr->src[i].src_type) {
2624                 case nir_tex_src_coord:
2625                         if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE) {
2626                                 cx = bi_emit_texc_cube_coord(b, index, &cy);
2627                         } else {
2628                                 unsigned components = nir_src_num_components(instr->src[i].src);
2629 
2630                                 /* Copy XY (for 2D+) or XX (for 1D) */
2631                                 cx = index;
2632                                 cy = bi_word(index, MIN2(1, components - 1));
2633 
2634                                 assert(components >= 1 && components <= 3);
2635 
2636                                 if (components < 3) {
2637                                         /* nothing to do */
2638                                 } else if (desc.array) {
2639                                         /* 2D array */
2640                                         dregs[BIFROST_TEX_DREG_ARRAY] =
2641                                                 bi_emit_texc_array_index(b,
2642                                                                 bi_word(index, 2), T);
2643                                 } else {
2644                                         /* 3D */
2645                                         dregs[BIFROST_TEX_DREG_Z_COORD] =
2646                                                 bi_word(index, 2);
2647                                 }
2648                         }
2649                         break;
2650 
2651                 case nir_tex_src_lod:
2652                         if (desc.op == BIFROST_TEX_OP_TEX &&
2653                             nir_src_is_const(instr->src[i].src) &&
2654                             nir_src_as_uint(instr->src[i].src) == 0) {
2655                                 desc.lod_or_fetch = BIFROST_LOD_MODE_ZERO;
2656                         } else if (desc.op == BIFROST_TEX_OP_TEX) {
2657                                 assert(base == nir_type_float);
2658 
2659                                 assert(sz == 16 || sz == 32);
2660                                 dregs[BIFROST_TEX_DREG_LOD] =
2661                                         bi_emit_texc_lod_88(b, index, sz == 16);
2662                                 desc.lod_or_fetch = BIFROST_LOD_MODE_EXPLICIT;
2663                         } else {
2664                                 assert(desc.op == BIFROST_TEX_OP_FETCH);
2665                                 assert(base == nir_type_uint || base == nir_type_int);
2666                                 assert(sz == 16 || sz == 32);
2667 
2668                                 dregs[BIFROST_TEX_DREG_LOD] =
2669                                         bi_emit_texc_lod_cube(b, index);
2670                         }
2671 
2672                         break;
2673 
2674                 case nir_tex_src_bias:
2675                         /* Upper 16-bits interpreted as a clamp, leave zero */
2676                         assert(desc.op == BIFROST_TEX_OP_TEX);
2677                         assert(base == nir_type_float);
2678                         assert(sz == 16 || sz == 32);
2679                         dregs[BIFROST_TEX_DREG_LOD] =
2680                                 bi_emit_texc_lod_88(b, index, sz == 16);
2681                         desc.lod_or_fetch = BIFROST_LOD_MODE_BIAS;
2682                         computed_lod = true;
2683                         break;
2684 
2685                 case nir_tex_src_ms_index:
2686                 case nir_tex_src_offset:
2687                         if (desc.offset_or_bias_disable)
2688                                 break;
2689 
2690                         dregs[BIFROST_TEX_DREG_OFFSETMS] =
2691                                 bi_emit_texc_offset_ms_index(b, instr);
2692                         if (!bi_is_equiv(dregs[BIFROST_TEX_DREG_OFFSETMS], bi_zero()))
2693                                 desc.offset_or_bias_disable = true;
2694                         break;
2695 
2696                 case nir_tex_src_comparator:
2697                         dregs[BIFROST_TEX_DREG_SHADOW] = index;
2698                         break;
2699 
2700                 case nir_tex_src_texture_offset:
2701                         assert(instr->texture_index == 0);
2702                         dregs[BIFROST_TEX_DREG_TEXTURE] = index;
2703                         break;
2704 
2705                 case nir_tex_src_sampler_offset:
2706                         assert(instr->sampler_index == 0);
2707                         dregs[BIFROST_TEX_DREG_SAMPLER] = index;
2708                         break;
2709 
2710                 default:
2711                         unreachable("Unhandled src type in texc emit");
2712                 }
2713         }
2714 
2715         if (desc.op == BIFROST_TEX_OP_FETCH && bi_is_null(dregs[BIFROST_TEX_DREG_LOD])) {
2716                 dregs[BIFROST_TEX_DREG_LOD] =
2717                         bi_emit_texc_lod_cube(b, bi_zero());
2718         }
2719 
2720         /* Choose an index mode */
2721 
2722         bool direct_tex = bi_is_null(dregs[BIFROST_TEX_DREG_TEXTURE]);
2723         bool direct_samp = bi_is_null(dregs[BIFROST_TEX_DREG_SAMPLER]);
2724         bool direct = direct_tex && direct_samp;
2725 
2726         desc.immediate_indices = direct && (instr->sampler_index < 16);
2727 
2728         if (desc.immediate_indices) {
2729                 desc.sampler_index_or_mode = instr->sampler_index;
2730                 desc.index = instr->texture_index;
2731         } else {
2732                 enum bifrost_index mode = 0;
2733 
2734                 if (direct && instr->sampler_index == instr->texture_index) {
2735                         mode = BIFROST_INDEX_IMMEDIATE_SHARED;
2736                         desc.index = instr->texture_index;
2737                 } else if (direct) {
2738                         mode = BIFROST_INDEX_IMMEDIATE_SAMPLER;
2739                         desc.index = instr->sampler_index;
2740                         dregs[BIFROST_TEX_DREG_TEXTURE] = bi_mov_i32(b,
2741                                         bi_imm_u32(instr->texture_index));
2742                 } else if (direct_tex) {
2743                         assert(!direct_samp);
2744                         mode = BIFROST_INDEX_IMMEDIATE_TEXTURE;
2745                         desc.index = instr->texture_index;
2746                 } else if (direct_samp) {
2747                         assert(!direct_tex);
2748                         mode = BIFROST_INDEX_IMMEDIATE_SAMPLER;
2749                         desc.index = instr->sampler_index;
2750                 } else {
2751                         mode = BIFROST_INDEX_REGISTER;
2752                 }
2753 
2754                 desc.sampler_index_or_mode = mode | (0x3 << 2);
2755         }
2756 
2757         /* Allocate staging registers contiguously by compacting the array.
2758          * Index is not SSA (tied operands) */
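        /* For example, if only Z_COORD (slot 0) and LOD (slot 2) were
         * populated above, the sparse array { z, -, lod, -, ... } compacts to
         * { z, lod } and sr_count becomes 2, so both values land in
         * consecutive staging registers. */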
2759 
2760         unsigned sr_count = 0;
2761 
2762         for (unsigned i = 0; i < ARRAY_SIZE(dregs); ++i) {
2763                 if (!bi_is_null(dregs[i]))
2764                         dregs[sr_count++] = dregs[i];
2765         }
2766 
2767         bi_index idx = sr_count ? bi_temp_reg(b->shader) : bi_null();
2768 
2769         if (sr_count)
2770                 bi_make_vec_to(b, idx, dregs, NULL, sr_count, 32);
2771 
2772         uint32_t desc_u = 0;
2773         memcpy(&desc_u, &desc, sizeof(desc_u));
2774         bi_texc_to(b, sr_count ? idx : bi_dest_index(&instr->dest),
2775                         idx, cx, cy, bi_imm_u32(desc_u), !computed_lod,
2776                         sr_count);
2777 
2778         /* Explicit copy to facilitate tied operands */
2779         if (sr_count) {
2780                 bi_index srcs[4] = { idx, idx, idx, idx };
2781                 unsigned channels[4] = { 0, 1, 2, 3 };
2782                 bi_make_vec_to(b, bi_dest_index(&instr->dest), srcs, channels, 4, 32);
2783         }
2784 }
2785 
2786 /* Simple texture ops correspond to NIR tex or txl with LOD = 0 on 2D/cube
2787  * textures with sufficiently small immediate indices. Anything else
2788  * needs a complete texture op. */
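
/* Rough GLSL-level sketch, assuming float results and texture/sampler indices
 * small enough for the short encoding:
 *
 *    texture(sampler2D, uv)          -> TEXS_2D, computed LOD
 *    textureLod(sampler2D, uv, 0.0)  -> TEXS_2D, LOD forced to zero
 *    texture(sampler2DShadow, coord) -> TEXC (shadow comparison)
 *    textureOffset(sampler2D, uv, o) -> TEXC (extra offset source)
 */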
2789 
2790 static void
2791 bi_emit_texs(bi_builder *b, nir_tex_instr *instr)
2792 {
2793         int coord_idx = nir_tex_instr_src_index(instr, nir_tex_src_coord);
2794         assert(coord_idx >= 0);
2795         bi_index coords = bi_src_index(&instr->src[coord_idx].src);
2796 
2797         if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE) {
2798                 bi_index face, s, t;
2799                 bi_emit_cube_coord(b, coords, &face, &s, &t);
2800 
2801                 bi_texs_cube_to(b, nir_dest_bit_size(instr->dest),
2802                                 bi_dest_index(&instr->dest),
2803                                 s, t, face,
2804                                 instr->sampler_index, instr->texture_index);
2805         } else {
2806                 bi_texs_2d_to(b, nir_dest_bit_size(instr->dest),
2807                                 bi_dest_index(&instr->dest),
2808                                 coords, bi_word(coords, 1),
2809                                 instr->op != nir_texop_tex, /* zero LOD */
2810                                 instr->sampler_index, instr->texture_index);
2811         }
2812 }
2813 
2814 static bool
2815 bi_is_simple_tex(nir_tex_instr *instr)
2816 {
2817         if (instr->op != nir_texop_tex && instr->op != nir_texop_txl)
2818                 return false;
2819 
2820         if (instr->dest_type != nir_type_float32 &&
2821             instr->dest_type != nir_type_float16)
2822                 return false;
2823 
2824         if (instr->is_shadow || instr->is_array)
2825                 return false;
2826 
2827         switch (instr->sampler_dim) {
2828         case GLSL_SAMPLER_DIM_2D:
2829         case GLSL_SAMPLER_DIM_EXTERNAL:
2830         case GLSL_SAMPLER_DIM_RECT:
2831                 break;
2832 
2833         case GLSL_SAMPLER_DIM_CUBE:
2834                 /* LOD can't be specified with TEXS_CUBE */
2835                 if (instr->op == nir_texop_txl)
2836                         return false;
2837                 break;
2838 
2839         default:
2840                 return false;
2841         }
2842 
2843         for (unsigned i = 0; i < instr->num_srcs; ++i) {
2844                 if (instr->src[i].src_type != nir_tex_src_lod &&
2845                     instr->src[i].src_type != nir_tex_src_coord)
2846                         return false;
2847         }
2848 
2849         /* Indices need to fit in provided bits */
2850         unsigned idx_bits = instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE ? 2 : 3;
2851         if (MAX2(instr->sampler_index, instr->texture_index) >= (1 << idx_bits))
2852                 return false;
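        /* e.g. cube maps get 2 index bits (indices 0-3) and everything else
         * gets 3 (indices 0-7); anything larger falls back to TEXC */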
2853 
2854         int lod_idx = nir_tex_instr_src_index(instr, nir_tex_src_lod);
2855         if (lod_idx < 0)
2856                 return true;
2857 
2858         nir_src lod = instr->src[lod_idx].src;
2859         return nir_src_is_const(lod) && nir_src_as_uint(lod) == 0;
2860 }
2861 
2862 static void
2863 bi_emit_tex(bi_builder *b, nir_tex_instr *instr)
2864 {
2865         switch (instr->op) {
2866         case nir_texop_txs:
2867                 bi_load_sysval_to(b, bi_dest_index(&instr->dest),
2868                                 panfrost_sysval_for_instr(&instr->instr, NULL),
2869                                 4, 0);
2870                 return;
2871         case nir_texop_tex:
2872         case nir_texop_txl:
2873         case nir_texop_txb:
2874         case nir_texop_txf:
2875         case nir_texop_txf_ms:
2876         case nir_texop_tg4:
2877                 break;
2878         default:
2879                 unreachable("Invalid texture operation");
2880         }
2881 
2882         if (bi_is_simple_tex(instr))
2883                 bi_emit_texs(b, instr);
2884         else
2885                 bi_emit_texc(b, instr);
2886 }
2887 
2888 static void
2889 bi_emit_instr(bi_builder *b, struct nir_instr *instr)
2890 {
2891         switch (instr->type) {
2892         case nir_instr_type_load_const:
2893                 bi_emit_load_const(b, nir_instr_as_load_const(instr));
2894                 break;
2895 
2896         case nir_instr_type_intrinsic:
2897                 bi_emit_intrinsic(b, nir_instr_as_intrinsic(instr));
2898                 break;
2899 
2900         case nir_instr_type_alu:
2901                 bi_emit_alu(b, nir_instr_as_alu(instr));
2902                 break;
2903 
2904         case nir_instr_type_tex:
2905                 bi_emit_tex(b, nir_instr_as_tex(instr));
2906                 break;
2907 
2908         case nir_instr_type_jump:
2909                 bi_emit_jump(b, nir_instr_as_jump(instr));
2910                 break;
2911 
2912         default:
2913                 unreachable("should've been lowered");
2914         }
2915 }
2916 
2917 static bi_block *
2918 create_empty_block(bi_context *ctx)
2919 {
2920         bi_block *blk = rzalloc(ctx, bi_block);
2921 
2922         blk->predecessors = _mesa_set_create(blk,
2923                         _mesa_hash_pointer,
2924                         _mesa_key_pointer_equal);
2925 
2926         return blk;
2927 }
2928 
2929 static bi_block *
2930 emit_block(bi_context *ctx, nir_block *block)
2931 {
2932         if (ctx->after_block) {
2933                 ctx->current_block = ctx->after_block;
2934                 ctx->after_block = NULL;
2935         } else {
2936                 ctx->current_block = create_empty_block(ctx);
2937         }
2938 
2939         list_addtail(&ctx->current_block->link, &ctx->blocks);
2940         list_inithead(&ctx->current_block->instructions);
2941 
2942         bi_builder _b = bi_init_builder(ctx, bi_after_block(ctx->current_block));
2943 
2944         nir_foreach_instr(instr, block) {
2945                 bi_emit_instr(&_b, instr);
2946                 ++ctx->instruction_count;
2947         }
2948 
2949         return ctx->current_block;
2950 }
2951 
2952 static void
2953 emit_if(bi_context *ctx, nir_if *nif)
2954 {
2955         bi_block *before_block = ctx->current_block;
2956 
2957         /* Speculatively emit the branch, but we can't fill it in until later */
2958         bi_builder _b = bi_init_builder(ctx, bi_after_block(ctx->current_block));
2959         bi_instr *then_branch = bi_branchz_i16(&_b,
2960                         bi_half(bi_src_index(&nif->condition), false),
2961                         bi_zero(), BI_CMPF_EQ);
2962 
2963         /* Emit the two subblocks. */
2964         bi_block *then_block = emit_cf_list(ctx, &nif->then_list);
2965         bi_block *end_then_block = ctx->current_block;
2966 
2967         /* Emit second block, and check if it's empty */
2968 
2969         int count_in = ctx->instruction_count;
2970         bi_block *else_block = emit_cf_list(ctx, &nif->else_list);
2971         bi_block *end_else_block = ctx->current_block;
2972         ctx->after_block = create_empty_block(ctx);
2973 
2974         /* Now that we have the subblocks emitted, fix up the branches */
2975 
2976         assert(then_block);
2977         assert(else_block);
2978 
2979         if (ctx->instruction_count == count_in) {
2980                 then_branch->branch_target = ctx->after_block;
2981                 bi_block_add_successor(end_then_block, ctx->after_block); /* fallthrough */
2982         } else {
2983                 then_branch->branch_target = else_block;
2984 
2985                 /* Emit a jump from the end of the then block to the end of the else */
2986                 _b.cursor = bi_after_block(end_then_block);
2987                 bi_instr *then_exit = bi_jump(&_b, bi_zero());
2988                 then_exit->branch_target = ctx->after_block;
2989 
2990                 bi_block_add_successor(end_then_block, then_exit->branch_target);
2991                 bi_block_add_successor(end_else_block, ctx->after_block); /* fallthrough */
2992         }
2993 
2994         bi_block_add_successor(before_block, then_branch->branch_target); /* then_branch */
2995         bi_block_add_successor(before_block, then_block); /* fallthrough */
2996 }
2997 
2998 static void
2999 emit_loop(bi_context *ctx, nir_loop *nloop)
3000 {
3001         /* Remember where we are */
3002         bi_block *start_block = ctx->current_block;
3003 
3004         bi_block *saved_break = ctx->break_block;
3005         bi_block *saved_continue = ctx->continue_block;
3006 
3007         ctx->continue_block = create_empty_block(ctx);
3008         ctx->break_block = create_empty_block(ctx);
3009         ctx->after_block = ctx->continue_block;
3010 
3011         /* Emit the body itself */
3012         emit_cf_list(ctx, &nloop->body);
3013 
3014         /* Branch back to the top of the loop (the continue block) */
3015         bi_builder _b = bi_init_builder(ctx, bi_after_block(ctx->current_block));
3016         bi_instr *I = bi_jump(&_b, bi_zero());
3017         I->branch_target = ctx->continue_block;
3018         bi_block_add_successor(start_block, ctx->continue_block);
3019         bi_block_add_successor(ctx->current_block, ctx->continue_block);
3020 
3021         ctx->after_block = ctx->break_block;
3022 
3023         /* Pop off */
3024         ctx->break_block = saved_break;
3025         ctx->continue_block = saved_continue;
3026         ++ctx->loop_count;
3027 }
3028 
3029 static bi_block *
3030 emit_cf_list(bi_context *ctx, struct exec_list *list)
3031 {
3032         bi_block *start_block = NULL;
3033 
3034         foreach_list_typed(nir_cf_node, node, node, list) {
3035                 switch (node->type) {
3036                 case nir_cf_node_block: {
3037                         bi_block *block = emit_block(ctx, nir_cf_node_as_block(node));
3038 
3039                         if (!start_block)
3040                                 start_block = block;
3041 
3042                         break;
3043                 }
3044 
3045                 case nir_cf_node_if:
3046                         emit_if(ctx, nir_cf_node_as_if(node));
3047                         break;
3048 
3049                 case nir_cf_node_loop:
3050                         emit_loop(ctx, nir_cf_node_as_loop(node));
3051                         break;
3052 
3053                 default:
3054                         unreachable("Unknown control flow");
3055                 }
3056         }
3057 
3058         return start_block;
3059 }
3060 
3061 /* shader-db stuff */
3062 
3063 struct bi_stats {
3064         unsigned nr_clauses, nr_tuples, nr_ins;
3065         unsigned nr_arith, nr_texture, nr_varying, nr_ldst;
3066 };
3067 
3068 static void
3069 bi_count_tuple_stats(bi_clause *clause, bi_tuple *tuple, struct bi_stats *stats)
3070 {
3071         /* Count instructions */
3072         stats->nr_ins += (tuple->fma ? 1 : 0) + (tuple->add ? 1 : 0);
3073 
3074         /* Non-message passing tuples are always arithmetic */
3075         if (tuple->add != clause->message) {
3076                 stats->nr_arith++;
3077                 return;
3078         }
3079 
3080         /* Message + FMA we'll count as arithmetic _and_ message */
3081         if (tuple->fma)
3082                 stats->nr_arith++;
3083 
3084         switch (clause->message_type) {
3085         case BIFROST_MESSAGE_VARYING:
3086                 /* Check components interpolated */
3087                 stats->nr_varying += (clause->message->vecsize + 1) *
3088                         (bi_is_regfmt_16(clause->message->register_format) ? 1 : 2);
3089                 break;
3090 
3091         case BIFROST_MESSAGE_VARTEX:
3092                 /* 2 coordinates, fp32 each */
3093                 stats->nr_varying += (2 * 2);
3094                 FALLTHROUGH;
3095         case BIFROST_MESSAGE_TEX:
3096                 stats->nr_texture++;
3097                 break;
3098 
3099         case BIFROST_MESSAGE_ATTRIBUTE:
3100         case BIFROST_MESSAGE_LOAD:
3101         case BIFROST_MESSAGE_STORE:
3102         case BIFROST_MESSAGE_ATOMIC:
3103                 stats->nr_ldst++;
3104                 break;
3105 
3106         case BIFROST_MESSAGE_NONE:
3107         case BIFROST_MESSAGE_BARRIER:
3108         case BIFROST_MESSAGE_BLEND:
3109         case BIFROST_MESSAGE_TILE:
3110         case BIFROST_MESSAGE_Z_STENCIL:
3111         case BIFROST_MESSAGE_ATEST:
3112         case BIFROST_MESSAGE_JOB:
3113         case BIFROST_MESSAGE_64BIT:
3114                 /* Nothing to do */
3115                 break;
3116         }
3117 
3118 }
3119 
3120 static void
3121 bi_print_stats(bi_context *ctx, unsigned size, FILE *fp)
3122 {
3123         struct bi_stats stats = { 0 };
3124 
3125         /* Count instructions, clauses, and tuples. Also attempt to construct
3126          * normalized execution engine cycle counts, using the following ratio:
3127          *
3128          * 24 arith tuples/cycle
3129          * 2 texture messages/cycle
3130          * 16 x 16-bit varying channels interpolated/cycle
3131          * 1 load store message/cycle
3132          *
3133          * These numbers seem to match Arm Mobile Studio's heuristic. The real
3134          * cycle counts are surely more complicated.
3135          */
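        /* Worked example with the ratios above: 48 arith tuples -> 2.0
         * cycles, 2 texture messages -> 1.0, 24 16-bit varying channels ->
         * 1.5, 1 load/store message -> 1.0. The message bound is
         * max(1.0, 1.5, 1.0) = 1.5, so this shader would be reported as
         * arithmetic bound at 2.0 cycles. */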
3136 
3137         bi_foreach_block(ctx, block) {
3138                 bi_foreach_clause_in_block(block, clause) {
3139                         stats.nr_clauses++;
3140                         stats.nr_tuples += clause->tuple_count;
3141 
3142                         for (unsigned i = 0; i < clause->tuple_count; ++i)
3143                                 bi_count_tuple_stats(clause, &clause->tuples[i], &stats);
3144                 }
3145         }
3146 
3147         float cycles_arith = ((float) stats.nr_arith) / 24.0;
3148         float cycles_texture = ((float) stats.nr_texture) / 2.0;
3149         float cycles_varying = ((float) stats.nr_varying) / 16.0;
3150         float cycles_ldst = ((float) stats.nr_ldst) / 1.0;
3151 
3152         float cycles_message = MAX3(cycles_texture, cycles_varying, cycles_ldst);
3153         float cycles_bound = MAX2(cycles_arith, cycles_message);
3154 
3155         /* Thread count and register pressure are traded off only on v7 */
3156         bool full_threads = (ctx->arch == 7 && ctx->info->work_reg_count <= 32);
3157         unsigned nr_threads = full_threads ? 2 : 1;
3158 
3159         /* Dump stats */
3160 
3161         fprintf(stderr, "%s - %s shader: "
3162                         "%u inst, %u tuples, %u clauses, "
3163                         "%f cycles, %f arith, %f texture, %f vary, %f ldst, "
3164                         "%u quadwords, %u threads, %u loops, "
3165                         "%u:%u spills:fills\n",
3166                         ctx->nir->info.label ?: "",
3167                         ctx->inputs->is_blend ? "PAN_SHADER_BLEND" :
3168                         gl_shader_stage_name(ctx->stage),
3169                         stats.nr_ins, stats.nr_tuples, stats.nr_clauses,
3170                         cycles_bound, cycles_arith, cycles_texture,
3171                         cycles_varying, cycles_ldst,
3172                         size / 16, nr_threads,
3173                         ctx->loop_count,
3174                         ctx->spills, ctx->fills);
3175 }
3176 
3177 static int
3178 glsl_type_size(const struct glsl_type *type, bool bindless)
3179 {
3180         return glsl_count_attribute_slots(type, false);
3181 }
3182 
3183 /* Split stores to memory. We don't split stores to vertex outputs, since
3184  * nir_lower_io_to_temporaries will ensure there's only a single write.
3185  */
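
/* As a rough sketch: a store_ssbo with write mask 0b1101 is split by
 * nir_lower_wrmasks (gated on the callback below) into two stores with
 * contiguous masks, one covering component 0 and one covering components 2-3,
 * while a store_output keeps its original mask. */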
3186 
3187 static bool
3188 should_split_wrmask(const nir_instr *instr, UNUSED const void *data)
3189 {
3190         nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
3191 
3192         switch (intr->intrinsic) {
3193         case nir_intrinsic_store_ssbo:
3194         case nir_intrinsic_store_shared:
3195         case nir_intrinsic_store_global:
3196         case nir_intrinsic_store_scratch:
3197                 return true;
3198         default:
3199                 return false;
3200         }
3201 }
3202 
3203 /* Bifrost wants transcendentals as FP32 */
3204 
3205 static unsigned
3206 bi_lower_bit_size(const nir_instr *instr, UNUSED void *data)
3207 {
3208         if (instr->type != nir_instr_type_alu)
3209                 return 0;
3210 
3211         nir_alu_instr *alu = nir_instr_as_alu(instr);
3212 
3213         switch (alu->op) {
3214         case nir_op_fexp2:
3215         case nir_op_flog2:
3216         case nir_op_fpow:
3217         case nir_op_fsin:
3218         case nir_op_fcos:
3219                 return (nir_dest_bit_size(alu->dest.dest) == 32) ? 0 : 32;
3220         default:
3221                 return 0;
3222         }
3223 }
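
/* e.g. a 16-bit fsin is rewritten by nir_lower_bit_size as a widening round
 * trip, roughly f2f32 -> fsin (fp32) -> f2f16. This is a sketch of the
 * generic lowering, not Bifrost-specific codegen. */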
3224 
3225 /* Although Bifrost generally supports packed 16-bit vec2 and 8-bit vec4,
3226  * transcendentals are an exception. Shifts are too, because of the lane size
3227  * mismatch (8-bit lanes on Bifrost versus 32-bit in NIR; TODO: workaround).
3228  * Some conversions also need to be scalarized due to type size. */
3229 
3230 static bool
3231 bi_vectorize_filter(const nir_instr *instr, void *data)
3232 {
3233         /* Defaults work for everything else */
3234         if (instr->type != nir_instr_type_alu)
3235                 return true;
3236 
3237         const nir_alu_instr *alu = nir_instr_as_alu(instr);
3238 
3239         switch (alu->op) {
3240         case nir_op_frcp:
3241         case nir_op_frsq:
3242         case nir_op_ishl:
3243         case nir_op_ishr:
3244         case nir_op_ushr:
3245         case nir_op_f2i16:
3246         case nir_op_f2u16:
3247         case nir_op_i2f16:
3248         case nir_op_u2f16:
3249                 return false;
3250         default:
3251                 return true;
3252         }
3253 }
3254 
3255 /* XXX: This is a kludge to work around NIR's lack of divergence metadata. If we
3256  * keep divergence info around after we consume it for indirect lowering,
3257  * nir_convert_from_ssa will regress code quality since it will avoid
3258  * coalescing divergent with non-divergent nodes. */
3259 
3260 static bool
3261 nir_invalidate_divergence_ssa(nir_ssa_def *ssa, UNUSED void *data)
3262 {
3263         ssa->divergent = false;
3264         return true;
3265 }
3266 
3267 static bool
3268 nir_invalidate_divergence(struct nir_builder *b, nir_instr *instr,
3269                 UNUSED void *data)
3270 {
3271         return nir_foreach_ssa_def(instr, nir_invalidate_divergence_ssa, NULL);
3272 }
3273 
3274 /* Ensure we write exactly 4 components */
3275 static nir_ssa_def *
3276 bifrost_nir_valid_channel(nir_builder *b, nir_ssa_def *in,
3277                           unsigned channel, unsigned first, unsigned mask)
3278 {
3279         if (!(mask & BITFIELD_BIT(channel)))
3280                 channel = first;
3281 
3282         return nir_channel(b, in, channel);
3283 }
3284 
3285 /* Lower fragment store_output instructions to always write 4 components,
3286  * matching the hardware semantic. This may require additional moves. Skipping
3287  * these moves is possible in theory, but invokes undefined behaviour in the
3288  * compiler. The DDK inserts these moves, so we will as well. */
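
/* Sketch: a fragment shader writing only .x (write mask 0x1) has its
 * store_output rewritten to store vec4(x, x, x, x) with mask 0xF, i.e. the
 * first valid channel is replicated into the lanes the original mask did not
 * cover. */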
3289 
3290 static bool
3291 bifrost_nir_lower_blend_components(struct nir_builder *b,
3292                                    nir_instr *instr, void *data)
3293 {
3294         if (instr->type != nir_instr_type_intrinsic)
3295                 return false;
3296 
3297         nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
3298 
3299         if (intr->intrinsic != nir_intrinsic_store_output)
3300                 return false;
3301 
3302         nir_ssa_def *in = intr->src[0].ssa;
3303         unsigned first = nir_intrinsic_component(intr);
3304         unsigned mask = nir_intrinsic_write_mask(intr);
3305 
3306         assert(first == 0 && "shouldn't get nonzero components");
3307 
3308         /* Nothing to do */
3309         if (mask == BITFIELD_MASK(4))
3310                 return false;
3311 
3312         b->cursor = nir_before_instr(&intr->instr);
3313 
3314         /* Replicate the first valid component instead */
3315         nir_ssa_def *replicated =
3316                 nir_vec4(b, bifrost_nir_valid_channel(b, in, 0, first, mask),
3317                             bifrost_nir_valid_channel(b, in, 1, first, mask),
3318                             bifrost_nir_valid_channel(b, in, 2, first, mask),
3319                             bifrost_nir_valid_channel(b, in, 3, first, mask));
3320 
3321         /* Rewrite to use our replicated version */
3322         nir_instr_rewrite_src_ssa(instr, &intr->src[0], replicated);
3323         nir_intrinsic_set_component(intr, 0);
3324         nir_intrinsic_set_write_mask(intr, 0xF);
3325         intr->num_components = 4;
3326 
3327         return true;
3328 }
3329 
3330 static void
3331 bi_optimize_nir(nir_shader *nir, unsigned gpu_id, bool is_blend)
3332 {
3333         bool progress;
3334         unsigned lower_flrp = 16 | 32 | 64;
3335 
3336         NIR_PASS(progress, nir, nir_lower_regs_to_ssa);
3337 
3338         nir_lower_tex_options lower_tex_options = {
3339                 .lower_txs_lod = true,
3340                 .lower_txp = ~0,
3341                 .lower_tg4_broadcom_swizzle = true,
3342                 .lower_txd = true,
3343         };
3344 
3345         NIR_PASS(progress, nir, pan_nir_lower_64bit_intrin);
3346         NIR_PASS(progress, nir, pan_lower_helper_invocation);
3347 
3348         NIR_PASS(progress, nir, nir_lower_int64);
3349 
3350         nir_lower_idiv_options idiv_options = {
3351                 .imprecise_32bit_lowering = true,
3352                 .allow_fp16 = true,
3353         };
3354         NIR_PASS(progress, nir, nir_lower_idiv, &idiv_options);
3355 
3356         NIR_PASS(progress, nir, nir_lower_tex, &lower_tex_options);
3357         NIR_PASS(progress, nir, nir_lower_alu_to_scalar, NULL, NULL);
3358         NIR_PASS(progress, nir, nir_lower_load_const_to_scalar);
3359 
3360         do {
3361                 progress = false;
3362 
3363                 NIR_PASS(progress, nir, nir_lower_var_copies);
3364                 NIR_PASS(progress, nir, nir_lower_vars_to_ssa);
3365                 NIR_PASS(progress, nir, nir_lower_wrmasks, should_split_wrmask, NULL);
3366 
3367                 NIR_PASS(progress, nir, nir_copy_prop);
3368                 NIR_PASS(progress, nir, nir_opt_remove_phis);
3369                 NIR_PASS(progress, nir, nir_opt_dce);
3370                 NIR_PASS(progress, nir, nir_opt_dead_cf);
3371                 NIR_PASS(progress, nir, nir_opt_cse);
3372                 NIR_PASS(progress, nir, nir_opt_peephole_select, 64, false, true);
3373                 NIR_PASS(progress, nir, nir_opt_algebraic);
3374                 NIR_PASS(progress, nir, nir_opt_constant_folding);
3375 
3376                 NIR_PASS(progress, nir, nir_lower_alu);
3377 
3378                 if (lower_flrp != 0) {
3379                         bool lower_flrp_progress = false;
3380                         NIR_PASS(lower_flrp_progress,
3381                                  nir,
3382                                  nir_lower_flrp,
3383                                  lower_flrp,
3384                                  false /* always_precise */);
3385                         if (lower_flrp_progress) {
3386                                 NIR_PASS(progress, nir,
3387                                          nir_opt_constant_folding);
3388                                 progress = true;
3389                         }
3390 
3391                         /* Nothing should rematerialize any flrps, so we only
3392                          * need to do this lowering once.
3393                          */
3394                         lower_flrp = 0;
3395                 }
3396 
3397                 NIR_PASS(progress, nir, nir_opt_undef);
3398                 NIR_PASS(progress, nir, nir_lower_undef_to_zero);
3399 
3400                 NIR_PASS(progress, nir, nir_opt_loop_unroll);
3401         } while (progress);
3402 
3403         /* TODO: Why is 64-bit getting rematerialized?
3404          * KHR-GLES31.core.shader_image_load_store.basic-allTargets-atomicFS */
3405         NIR_PASS(progress, nir, nir_lower_int64);
3406 
3407         /* We need to cleanup after each iteration of late algebraic
3408          * optimizations, since otherwise NIR can produce weird edge cases
3409          * (like fneg of a constant) which we don't handle */
3410         bool late_algebraic = true;
3411         while (late_algebraic) {
3412                 late_algebraic = false;
3413                 NIR_PASS(late_algebraic, nir, nir_opt_algebraic_late);
3414                 NIR_PASS(progress, nir, nir_opt_constant_folding);
3415                 NIR_PASS(progress, nir, nir_copy_prop);
3416                 NIR_PASS(progress, nir, nir_opt_dce);
3417                 NIR_PASS(progress, nir, nir_opt_cse);
3418         }
3419 
3420         NIR_PASS(progress, nir, nir_lower_alu_to_scalar, NULL, NULL);
3421         NIR_PASS(progress, nir, nir_opt_vectorize, bi_vectorize_filter, NULL);
3422         NIR_PASS(progress, nir, nir_lower_load_const_to_scalar);
3423         NIR_PASS(progress, nir, nir_opt_dce);
3424 
3425         /* Prepass to simplify instruction selection */
3426         NIR_PASS(progress, nir, bifrost_nir_lower_algebraic_late);
3427         NIR_PASS(progress, nir, nir_opt_dce);
3428 
3429         if (nir->info.stage == MESA_SHADER_FRAGMENT) {
3430                 NIR_PASS_V(nir, nir_shader_instructions_pass,
3431                            bifrost_nir_lower_blend_components,
3432                            nir_metadata_block_index | nir_metadata_dominance,
3433                            NULL);
3434         }
3435 
3436         /* Backend scheduler is purely local, so do some global optimizations
3437          * to reduce register pressure. */
3438         nir_move_options move_all =
3439                 nir_move_const_undef | nir_move_load_ubo | nir_move_load_input |
3440                 nir_move_comparisons | nir_move_copies | nir_move_load_ssbo;
3441 
3442         NIR_PASS_V(nir, nir_opt_sink, move_all);
3443         NIR_PASS_V(nir, nir_opt_move, move_all);
3444 
3445         /* We might lower attribute, varying, and image indirects. Use the
3446          * gathered info to skip the extra analysis in the happy path. */
3447         bool any_indirects =
3448                 nir->info.inputs_read_indirectly ||
3449                 nir->info.outputs_accessed_indirectly ||
3450                 nir->info.patch_inputs_read_indirectly ||
3451                 nir->info.patch_outputs_accessed_indirectly ||
3452                 nir->info.images_used;
3453 
3454         if (any_indirects) {
3455                 nir_convert_to_lcssa(nir, true, true);
3456                 NIR_PASS_V(nir, nir_divergence_analysis);
3457                 NIR_PASS_V(nir, bi_lower_divergent_indirects,
3458                                 bifrost_lanes_per_warp(gpu_id));
3459                 NIR_PASS_V(nir, nir_shader_instructions_pass,
3460                         nir_invalidate_divergence, nir_metadata_all, NULL);
3461         }
3462 
3463         /* Take us out of SSA */
3464         NIR_PASS(progress, nir, nir_lower_locals_to_regs);
3465         NIR_PASS(progress, nir, nir_move_vec_src_uses_to_dest);
3466         NIR_PASS(progress, nir, nir_convert_from_ssa, true);
3467 }
3468 
3469 /* The cmdstream lowers 8-bit fragment output as 16-bit, so we need to do the
3470  * same lowering here to zero-extend correctly */
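
/* Sketch of the lowering below: an 8-bit integer store_output gets an
 * i2i16/u2u16 conversion inserted in front of it and its src_type bumped to
 * 16 bits, while an 8-bit load_output is re-emitted at 16 bits and converted
 * back down to 8 bits for the original users. */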
3471 
3472 static bool
3473 bifrost_nir_lower_i8_fragout_impl(struct nir_builder *b,
3474                 nir_intrinsic_instr *intr, UNUSED void *data)
3475 {
3476         if (nir_src_bit_size(intr->src[0]) != 8)
3477                 return false;
3478 
3479         nir_alu_type type =
3480                 nir_alu_type_get_base_type(nir_intrinsic_src_type(intr));
3481 
3482         assert(type == nir_type_int || type == nir_type_uint);
3483 
3484         b->cursor = nir_before_instr(&intr->instr);
3485         nir_ssa_def *cast = nir_convert_to_bit_size(b, intr->src[0].ssa, type, 16);
3486 
3487         nir_intrinsic_set_src_type(intr, type | 16);
3488         nir_instr_rewrite_src_ssa(&intr->instr, &intr->src[0], cast);
3489         return true;
3490 }
3491 
3492 static bool
3493 bifrost_nir_lower_i8_fragin_impl(struct nir_builder *b,
3494                 nir_intrinsic_instr *intr, UNUSED void *data)
3495 {
3496         if (nir_dest_bit_size(intr->dest) != 8)
3497                 return false;
3498 
3499         nir_alu_type type =
3500                 nir_alu_type_get_base_type(nir_intrinsic_dest_type(intr));
3501 
3502         assert(type == nir_type_int || type == nir_type_uint);
3503 
3504         b->cursor = nir_before_instr(&intr->instr);
3505         nir_ssa_def *out =
3506                 nir_load_output(b, intr->num_components, 16, intr->src[0].ssa,
3507                         .base = nir_intrinsic_base(intr),
3508                         .component = nir_intrinsic_component(intr),
3509                         .dest_type = type | 16,
3510                         .io_semantics = nir_intrinsic_io_semantics(intr));
3511 
3512         nir_ssa_def *cast = nir_convert_to_bit_size(b, out, type, 8);
3513         nir_ssa_def_rewrite_uses(&intr->dest.ssa, cast);
3514         return true;
3515 }
3516 
3517 static bool
3518 bifrost_nir_lower_i8_frag(struct nir_builder *b,
3519                 nir_instr *instr, UNUSED void *data)
3520 {
3521         if (instr->type != nir_instr_type_intrinsic)
3522                 return false;
3523 
3524         nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
3525         if (intr->intrinsic == nir_intrinsic_load_output)
3526                 return bifrost_nir_lower_i8_fragin_impl(b, intr, data);
3527         else if (intr->intrinsic == nir_intrinsic_store_output)
3528                 return bifrost_nir_lower_i8_fragout_impl(b, intr, data);
3529         else
3530                 return false;
3531 }
3532 
3533 static void
3534 bi_opt_post_ra(bi_context *ctx)
3535 {
3536         bi_foreach_instr_global_safe(ctx, ins) {
3537                 if (ins->op == BI_OPCODE_MOV_I32 && bi_is_equiv(ins->dest[0], ins->src[0]))
3538                         bi_remove_instruction(ins);
3539         }
3540 }
3541 
3542 static bool
3543 bifrost_nir_lower_store_component(struct nir_builder *b,
3544                 nir_instr *instr, void *data)
3545 {
3546         if (instr->type != nir_instr_type_intrinsic)
3547                 return false;
3548 
3549         nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
3550 
3551         if (intr->intrinsic != nir_intrinsic_store_output)
3552                 return false;
3553 
3554         struct hash_table_u64 *slots = data;
3555         unsigned component = nir_intrinsic_component(intr);
3556         nir_src *slot_src = nir_get_io_offset_src(intr);
3557         uint64_t slot = nir_src_as_uint(*slot_src) + nir_intrinsic_base(intr);
3558 
3559         nir_intrinsic_instr *prev = _mesa_hash_table_u64_search(slots, slot);
3560         unsigned mask = (prev ? nir_intrinsic_write_mask(prev) : 0);
3561 
3562         nir_ssa_def *value = intr->src[0].ssa;
3563         b->cursor = nir_before_instr(&intr->instr);
3564 
3565         nir_ssa_def *undef = nir_ssa_undef(b, 1, value->bit_size);
3566         nir_ssa_def *channels[4] = { undef, undef, undef, undef };
3567 
3568         /* Copy old */
3569         u_foreach_bit(i, mask) {
3570                 assert(prev != NULL);
3571                 nir_ssa_def *prev_ssa = prev->src[0].ssa;
3572                 channels[i] = nir_channel(b, prev_ssa, i);
3573         }
3574 
3575         /* Copy new */
3576         unsigned new_mask = nir_intrinsic_write_mask(intr);
3577         mask |= (new_mask << component);
3578 
3579         u_foreach_bit(i, new_mask) {
3580                 assert(component + i < 4);
3581                 channels[component + i] = nir_channel(b, value, i);
3582         }
3583 
3584         intr->num_components = util_last_bit(mask);
3585         nir_instr_rewrite_src_ssa(instr, &intr->src[0],
3586                         nir_vec(b, channels, intr->num_components));
3587 
3588         nir_intrinsic_set_component(intr, 0);
3589         nir_intrinsic_set_write_mask(intr, mask);
3590 
3591         if (prev) {
3592                 _mesa_hash_table_u64_remove(slots, slot);
3593                 nir_instr_remove(&prev->instr);
3594         }
3595 
3596         _mesa_hash_table_u64_insert(slots, slot, intr);
3597         return false;
3598 }
3599 
3600 /* Dead code elimination for branches at the end of a block - only one branch
3601  * per block is legal semantically, but unreachable jumps can be generated.
3602  * Likewise we can generate jumps to the terminal block which need to be
3603  * lowered away to a jump to #0x0, which induces successful termination. */
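
/* For example, a block ending in an unconditional JUMP followed by another
 * (unreachable) JUMP keeps only the first, and any branch whose target is the
 * terminal block has its branch_target cleared so it packs as a branch to
 * #0x0, terminating the shader. */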
3604 
3605 static void
3606 bi_lower_branch(bi_block *block)
3607 {
3608         bool branched = false;
3609         ASSERTED bool was_jump = false;
3610 
3611         bi_foreach_instr_in_block_safe(block, ins) {
3612                 if (!ins->branch_target) continue;
3613 
3614                 if (branched) {
3615                         assert(was_jump && (ins->op == BI_OPCODE_JUMP));
3616                         bi_remove_instruction(ins);
3617                         continue;
3618                 }
3619 
3620                 branched = true;
3621                 was_jump = ins->op == BI_OPCODE_JUMP;
3622 
3623                 if (bi_is_terminal_block(ins->branch_target))
3624                         ins->branch_target = NULL;
3625         }
3626 }
3627 
3628 static void
3629 bi_pack_clauses(bi_context *ctx, struct util_dynarray *binary)
3630 {
3631         unsigned final_clause = bi_pack(ctx, binary);
3632 
3633         /* If we need to wait for ATEST or BLEND in the first clause, pass the
3634          * corresponding bits through to the renderer state descriptor */
3635         bi_block *first_block = list_first_entry(&ctx->blocks, bi_block, link);
3636         bi_clause *first_clause = bi_next_clause(ctx, first_block, NULL);
3637 
3638         unsigned first_deps = first_clause ? first_clause->dependencies : 0;
3639         ctx->info->bifrost.wait_6 = (first_deps & (1 << 6));
3640         ctx->info->bifrost.wait_7 = (first_deps & (1 << 7));
3641 
3642         /* Pad the shader with enough zero bytes to trick the prefetcher,
3643          * unless we're compiling an empty shader (in which case we don't pad
3644          * so the size remains 0) */
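        /* e.g. with a 128 byte prefetch window and a 32 byte final clause,
         * 96 bytes of zeroes are appended so the prefetcher never reads
         * garbage past the end of the binary. */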
3645         unsigned prefetch_size = BIFROST_SHADER_PREFETCH - final_clause;
3646 
3647         if (binary->size) {
3648                 memset(util_dynarray_grow(binary, uint8_t, prefetch_size),
3649                        0, prefetch_size);
3650         }
3651 }
3652 
3653 void
3654 bifrost_compile_shader_nir(nir_shader *nir,
3655                            const struct panfrost_compile_inputs *inputs,
3656                            struct util_dynarray *binary,
3657                            struct pan_shader_info *info)
3658 {
3659         bifrost_debug = debug_get_option_bifrost_debug();
3660 
3661         bi_context *ctx = rzalloc(NULL, bi_context);
3662         ctx->sysval_to_id = panfrost_init_sysvals(&info->sysvals, ctx);
3663 
3664         ctx->inputs = inputs;
3665         ctx->nir = nir;
3666         ctx->info = info;
3667         ctx->stage = nir->info.stage;
3668         ctx->quirks = bifrost_get_quirks(inputs->gpu_id);
3669         ctx->arch = inputs->gpu_id >> 12;
3670 
3671         /* If nothing is pushed, all UBOs need to be uploaded */
3672         ctx->ubo_mask = ~0;
3673 
3674         list_inithead(&ctx->blocks);
3675 
3676         /* Lower gl_Position pre-optimisation, but after lowering vars to ssa
3677          * (so we don't accidentally duplicate the epilogue since mesa/st has
3678          * messed with our I/O quite a bit already) */
3679 
3680         NIR_PASS_V(nir, nir_lower_vars_to_ssa);
3681 
3682         if (ctx->stage == MESA_SHADER_VERTEX) {
3683                 NIR_PASS_V(nir, nir_lower_viewport_transform);
3684                 NIR_PASS_V(nir, nir_lower_point_size, 1.0, 1024.0);
3685         }
3686 
3687         /* Lower large arrays to scratch and small arrays to bcsel (TODO: tune
3688          * threshold, but not until addresses / csel is optimized better) */
3689         NIR_PASS_V(nir, nir_lower_vars_to_scratch, nir_var_function_temp, 16,
3690                         glsl_get_natural_size_align_bytes);
3691         NIR_PASS_V(nir, nir_lower_indirect_derefs, nir_var_function_temp, ~0);
3692 
3693         NIR_PASS_V(nir, nir_split_var_copies);
3694         NIR_PASS_V(nir, nir_lower_global_vars_to_local);
3695         NIR_PASS_V(nir, nir_lower_var_copies);
3696         NIR_PASS_V(nir, nir_lower_vars_to_ssa);
3697         NIR_PASS_V(nir, nir_lower_io, nir_var_shader_in | nir_var_shader_out,
3698                         glsl_type_size, 0);
3699 
3700         if (ctx->stage == MESA_SHADER_FRAGMENT) {
3701                 NIR_PASS_V(nir, nir_lower_mediump_io, nir_var_shader_out,
3702                                 ~0, false);
3703         } else {
3704                 struct hash_table_u64 *stores = _mesa_hash_table_u64_create(ctx);
3705                 NIR_PASS_V(nir, nir_shader_instructions_pass,
3706                                 bifrost_nir_lower_store_component,
3707                                 nir_metadata_block_index |
3708                                 nir_metadata_dominance, stores);
3709                 _mesa_hash_table_u64_destroy(stores);
3710         }
3711 
3712         NIR_PASS_V(nir, nir_lower_ssbo);
3713         NIR_PASS_V(nir, pan_nir_lower_zs_store);
3714         NIR_PASS_V(nir, pan_lower_sample_pos);
3715         NIR_PASS_V(nir, nir_lower_bit_size, bi_lower_bit_size, NULL);
3716 
3717         if (nir->info.stage == MESA_SHADER_FRAGMENT) {
3718                 NIR_PASS_V(nir, nir_shader_instructions_pass,
3719                         bifrost_nir_lower_i8_frag,
3720                         nir_metadata_block_index | nir_metadata_dominance,
3721                         NULL);
3722         }
3723 
3724         bi_optimize_nir(nir, ctx->inputs->gpu_id, ctx->inputs->is_blend);
3725 
3726         NIR_PASS_V(nir, pan_nir_reorder_writeout);
3727 
3728         bool skip_internal = nir->info.internal;
3729         skip_internal &= !(bifrost_debug & BIFROST_DBG_INTERNAL);
3730 
3731         if (bifrost_debug & BIFROST_DBG_SHADERS && !skip_internal) {
3732                 nir_print_shader(nir, stdout);
3733         }
3734 
3735         info->tls_size = nir->scratch_size;
3736 
3737         nir_foreach_function(func, nir) {
3738                 if (!func->impl)
3739                         continue;
3740 
3741                 ctx->ssa_alloc += func->impl->ssa_alloc;
3742                 ctx->reg_alloc += func->impl->reg_alloc;
3743 
3744                 emit_cf_list(ctx, &func->impl->body);
3745                 break; /* TODO: Multi-function shaders */
3746         }
3747 
3748         unsigned block_source_count = 0;
3749 
3750         bi_foreach_block(ctx, block) {
3751                 /* Name blocks now that we're done emitting so the order is
3752                  * consistent */
3753                 block->name = block_source_count++;
3754         }
3755 
3756         bi_validate(ctx, "NIR -> BIR");
3757 
3758         /* If the shader doesn't write any colour or depth outputs, it may
3759          * still need an ATEST at the very end! */
3760         bool need_dummy_atest =
3761                 (ctx->stage == MESA_SHADER_FRAGMENT) &&
3762                 !ctx->emitted_atest &&
3763                 !bi_skip_atest(ctx, false);
3764 
3765         if (need_dummy_atest) {
3766                 bi_block *end = list_last_entry(&ctx->blocks, bi_block, link);
3767                 bi_builder b = bi_init_builder(ctx, bi_after_block(end));
3768                 bi_emit_atest(&b, bi_zero());
3769         }
3770 
3771         bool optimize = !(bifrost_debug & BIFROST_DBG_NOOPT);
3772 
3773         /* Runs before constant folding */
3774         bi_lower_swizzle(ctx);
3775         bi_validate(ctx, "Early lowering");
3776 
3777         /* Runs before copy prop */
3778         if (optimize && !ctx->inputs->no_ubo_to_push) {
3779                 bi_opt_push_ubo(ctx);
3780         }
3781 
3782         if (likely(optimize)) {
3783                 bi_opt_copy_prop(ctx);
3784                 bi_opt_constant_fold(ctx);
3785                 bi_opt_copy_prop(ctx);
3786                 bi_opt_mod_prop_forward(ctx);
3787                 bi_opt_mod_prop_backward(ctx);
3788                 bi_opt_dead_code_eliminate(ctx);
3789                 bi_opt_cse(ctx);
3790                 bi_opt_dead_code_eliminate(ctx);
3791                 bi_validate(ctx, "Optimization passes");
3792         }
3793 
3794         bi_foreach_instr_global(ctx, I) {
3795                 bi_lower_opt_instruction(I);
3796         }
3797 
3798         bi_foreach_block(ctx, block) {
3799                 bi_lower_branch(block);
3800         }
3801 
3802         if (bifrost_debug & BIFROST_DBG_SHADERS && !skip_internal)
3803                 bi_print_shader(ctx, stdout);
3804         bi_lower_fau(ctx);
3805 
3806         /* Analyze before register allocation to avoid false dependencies. The
3807          * skip bit is a function of only the data flow graph and is invariant
3808          * under valid scheduling. */
3809         bi_analyze_helper_requirements(ctx);
3810         bi_validate(ctx, "Late lowering");
3811 
3812         bi_register_allocate(ctx);
3813 
3814         if (likely(optimize))
3815                 bi_opt_post_ra(ctx);
3816 
3817         if (bifrost_debug & BIFROST_DBG_SHADERS && !skip_internal)
3818                 bi_print_shader(ctx, stdout);
3819 
3820         if (ctx->arch <= 8) {
3821                 bi_schedule(ctx);
3822                 bi_assign_scoreboard(ctx);
3823         }
3824 
3825         /* Analyze after scheduling since we depend on instruction order. */
3826         bi_analyze_helper_terminate(ctx);
3827 
3828         if (bifrost_debug & BIFROST_DBG_SHADERS && !skip_internal)
3829                 bi_print_shader(ctx, stdout);
3830 
3831         if (ctx->arch <= 8) {
3832                 bi_pack_clauses(ctx, binary);
3833         } else {
3834                 /* TODO: pack flat */
3835         }
3836 
3837         info->ubo_mask = ctx->ubo_mask & BITSET_MASK(ctx->nir->info.num_ubos);
3838 
3839         if (bifrost_debug & BIFROST_DBG_SHADERS && !skip_internal) {
3840                 disassemble_bifrost(stdout, binary->data, binary->size,
3841                                     bifrost_debug & BIFROST_DBG_VERBOSE);
3842                 fflush(stdout);
3843         }
3844 
3845         if ((bifrost_debug & BIFROST_DBG_SHADERDB || inputs->shaderdb) &&
3846             !skip_internal) {
3847                 bi_print_stats(ctx, binary->size, stderr);
3848         }
3849 
3850         _mesa_hash_table_u64_destroy(ctx->sysval_to_id);
3851         ralloc_free(ctx);
3852 }
3853