1 /*
2  * Copyright (C) 2015 Rob Clark <robclark@freedesktop.org>
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21  * SOFTWARE.
22  *
23  * Authors:
24  *    Rob Clark <robclark@freedesktop.org>
25  */
26 
27 #include <stdarg.h>
28 
29 #include "util/u_math.h"
30 #include "util/u_memory.h"
31 #include "util/u_string.h"
32 
33 #include "ir3_compiler.h"
34 #include "ir3_image.h"
35 #include "ir3_nir.h"
36 #include "ir3_shader.h"
37 
38 #include "instr-a3xx.h"
39 #include "ir3.h"
40 #include "ir3_context.h"
41 
42 void
43 ir3_handle_nonuniform(struct ir3_instruction *instr,
44                       nir_intrinsic_instr *intrin)
45 {
46    if (nir_intrinsic_has_access(intrin) &&
47        (nir_intrinsic_access(intrin) & ACCESS_NON_UNIFORM)) {
48       instr->flags |= IR3_INSTR_NONUNIF;
49    }
50 }
51 
52 void
53 ir3_handle_bindless_cat6(struct ir3_instruction *instr, nir_src rsrc)
54 {
55    nir_intrinsic_instr *intrin = ir3_bindless_resource(rsrc);
56    if (!intrin)
57       return;
58 
59    instr->flags |= IR3_INSTR_B;
60    instr->cat6.base = nir_intrinsic_desc_set(intrin);
61 }
62 
63 static struct ir3_instruction *
64 create_input(struct ir3_context *ctx, unsigned compmask)
65 {
66    struct ir3_instruction *in;
67 
68    in = ir3_instr_create(ctx->in_block, OPC_META_INPUT, 1, 0);
69    in->input.sysval = ~0;
70    __ssa_dst(in)->wrmask = compmask;
71 
72    array_insert(ctx->ir, ctx->ir->inputs, in);
73 
74    return in;
75 }
76 
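/* Create a fragment shader input load.  When an ij coordinate is given the
 * value is interpolated with bary.f; for flat-shaded inputs (no coord) it is
 * either loaded directly with ldlv on flat_bypass GPUs, or read via bary.f
 * using the per-pixel ij otherwise.
 */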
77 static struct ir3_instruction *
78 create_frag_input(struct ir3_context *ctx, struct ir3_instruction *coord,
79                   unsigned n)
80 {
81    struct ir3_block *block = ctx->block;
82    struct ir3_instruction *instr;
83    /* packed inloc is fixed up later: */
84    struct ir3_instruction *inloc = create_immed(block, n);
85 
86    if (coord) {
87       instr = ir3_BARY_F(block, inloc, 0, coord, 0);
88    } else if (ctx->compiler->flat_bypass) {
89       instr = ir3_LDLV(block, inloc, 0, create_immed(block, 1), 0);
90       instr->cat6.type = TYPE_U32;
91       instr->cat6.iim_val = 1;
92    } else {
93       instr = ir3_BARY_F(block, inloc, 0, ctx->ij[IJ_PERSP_PIXEL], 0);
94       instr->srcs[1]->wrmask = 0x3;
95    }
96 
97    return instr;
98 }
99 
100 static struct ir3_instruction *
101 create_driver_param(struct ir3_context *ctx, enum ir3_driver_param dp)
102 {
103    /* first four vec4 sysval's reserved for UBOs: */
104    /* NOTE: dp is a scalar index, but there can be >4 dp components: */
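   /* For example (hypothetical numbers): with driver_param at vec4 offset
    * n == 4 and dp == 6, the value ends up at regid(4 + 6/4, 6 % 4), i.e.
    * component .z of c5.
    */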
105    struct ir3_const_state *const_state = ir3_const_state(ctx->so);
106    unsigned n = const_state->offsets.driver_param;
107    unsigned r = regid(n + dp / 4, dp % 4);
108    return create_uniform(ctx->block, r);
109 }
110 
111 /*
112  * Adreno's comparisons produce a 1 for true and 0 for false, in either 16 or
113  * 32-bit registers.  We use NIR's 1-bit integers to represent bools, and
114  * trust that we will only see and/or/xor on those 1-bit values, so we can
115  * safely store NIR i1s in a 32-bit reg that always contains either a 1 or
116  * a 0.
117  */
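/*
 * E.g. a NIR flt becomes a cmps.f.lt whose full-register result is 1 or 0,
 * and a b2b32 of such a value is just a signed negate (0 -> 0, 1 -> ~0); see
 * the nir_op_flt and nir_op_b2b32 handling in emit_alu() below.
 */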
118 
119 /*
120  * alu/sfu instructions:
121  */
122 
123 static struct ir3_instruction *
124 create_cov(struct ir3_context *ctx, struct ir3_instruction *src,
125            unsigned src_bitsize, nir_op op)
126 {
127    type_t src_type, dst_type;
128 
129    switch (op) {
130    case nir_op_f2f32:
131    case nir_op_f2f16_rtne:
132    case nir_op_f2f16_rtz:
133    case nir_op_f2f16:
134    case nir_op_f2i32:
135    case nir_op_f2i16:
136    case nir_op_f2i8:
137    case nir_op_f2u32:
138    case nir_op_f2u16:
139    case nir_op_f2u8:
140       switch (src_bitsize) {
141       case 32:
142          src_type = TYPE_F32;
143          break;
144       case 16:
145          src_type = TYPE_F16;
146          break;
147       default:
148          ir3_context_error(ctx, "invalid src bit size: %u", src_bitsize);
149       }
150       break;
151 
152    case nir_op_i2f32:
153    case nir_op_i2f16:
154    case nir_op_i2i32:
155    case nir_op_i2i16:
156    case nir_op_i2i8:
157       switch (src_bitsize) {
158       case 32:
159          src_type = TYPE_S32;
160          break;
161       case 16:
162          src_type = TYPE_S16;
163          break;
164       case 8:
165          src_type = TYPE_S8;
166          break;
167       default:
168          ir3_context_error(ctx, "invalid src bit size: %u", src_bitsize);
169       }
170       break;
171 
172    case nir_op_u2f32:
173    case nir_op_u2f16:
174    case nir_op_u2u32:
175    case nir_op_u2u16:
176    case nir_op_u2u8:
177       switch (src_bitsize) {
178       case 32:
179          src_type = TYPE_U32;
180          break;
181       case 16:
182          src_type = TYPE_U16;
183          break;
184       case 8:
185          src_type = TYPE_U8;
186          break;
187       default:
188          ir3_context_error(ctx, "invalid src bit size: %u", src_bitsize);
189       }
190       break;
191 
192    case nir_op_b2f16:
193    case nir_op_b2f32:
194    case nir_op_b2i8:
195    case nir_op_b2i16:
196    case nir_op_b2i32:
197       src_type = TYPE_U32;
198       break;
199 
200    default:
201       ir3_context_error(ctx, "invalid conversion op: %u", op);
202    }
203 
204    switch (op) {
205    case nir_op_f2f32:
206    case nir_op_i2f32:
207    case nir_op_u2f32:
208    case nir_op_b2f32:
209       dst_type = TYPE_F32;
210       break;
211 
212    case nir_op_f2f16_rtne:
213    case nir_op_f2f16_rtz:
214    case nir_op_f2f16:
215    case nir_op_i2f16:
216    case nir_op_u2f16:
217    case nir_op_b2f16:
218       dst_type = TYPE_F16;
219       break;
220 
221    case nir_op_f2i32:
222    case nir_op_i2i32:
223    case nir_op_b2i32:
224       dst_type = TYPE_S32;
225       break;
226 
227    case nir_op_f2i16:
228    case nir_op_i2i16:
229    case nir_op_b2i16:
230       dst_type = TYPE_S16;
231       break;
232 
233    case nir_op_f2i8:
234    case nir_op_i2i8:
235    case nir_op_b2i8:
236       dst_type = TYPE_S8;
237       break;
238 
239    case nir_op_f2u32:
240    case nir_op_u2u32:
241       dst_type = TYPE_U32;
242       break;
243 
244    case nir_op_f2u16:
245    case nir_op_u2u16:
246       dst_type = TYPE_U16;
247       break;
248 
249    case nir_op_f2u8:
250    case nir_op_u2u8:
251       dst_type = TYPE_U8;
252       break;
253 
254    default:
255       ir3_context_error(ctx, "invalid conversion op: %u", op);
256    }
257 
258    if (src_type == dst_type)
259       return src;
260 
261    struct ir3_instruction *cov = ir3_COV(ctx->block, src, src_type, dst_type);
262 
263    if (op == nir_op_f2f16_rtne) {
264       cov->cat1.round = ROUND_EVEN;
265    } else if (op == nir_op_f2f16) {
266       unsigned execution_mode = ctx->s->info.float_controls_execution_mode;
267       nir_rounding_mode rounding_mode =
268          nir_get_rounding_mode_from_float_controls(execution_mode,
269                                                    nir_type_float16);
270       if (rounding_mode == nir_rounding_mode_rtne)
271          cov->cat1.round = ROUND_EVEN;
272    }
273 
274    return cov;
275 }
276 
277 /* For shift instructions NIR always has shift amount as 32 bit integer */
278 static struct ir3_instruction *
279 resize_shift_amount(struct ir3_context *ctx, struct ir3_instruction *src,
280                     unsigned bs)
281 {
282    if (bs != 16)
283       return src;
284 
285    return ir3_COV(ctx->block, src, TYPE_U32, TYPE_U16);
286 }
287 
288 static void
289 emit_alu(struct ir3_context *ctx, nir_alu_instr *alu)
290 {
291    const nir_op_info *info = &nir_op_infos[alu->op];
292    struct ir3_instruction **dst, *src[info->num_inputs];
293    unsigned bs[info->num_inputs]; /* bit size */
294    struct ir3_block *b = ctx->block;
295    unsigned dst_sz, wrmask;
296    type_t dst_type =
297       nir_dest_bit_size(alu->dest.dest) == 16 ? TYPE_U16 : TYPE_U32;
298 
299    if (alu->dest.dest.is_ssa) {
300       dst_sz = alu->dest.dest.ssa.num_components;
301       wrmask = (1 << dst_sz) - 1;
302    } else {
303       dst_sz = alu->dest.dest.reg.reg->num_components;
304       wrmask = alu->dest.write_mask;
305    }
306 
307    dst = ir3_get_dst(ctx, &alu->dest.dest, dst_sz);
308 
309    /* Vectors are special in that they have non-scalarized writemasks,
310     * and just take the first swizzle channel of each argument, in
311     * order, into each writemask channel.
312     */
313    if ((alu->op == nir_op_vec2) || (alu->op == nir_op_vec3) ||
314        (alu->op == nir_op_vec4)) {
315 
316       for (int i = 0; i < info->num_inputs; i++) {
317          nir_alu_src *asrc = &alu->src[i];
318 
319          compile_assert(ctx, !asrc->abs);
320          compile_assert(ctx, !asrc->negate);
321 
322          src[i] = ir3_get_src(ctx, &asrc->src)[asrc->swizzle[0]];
323          if (!src[i])
324             src[i] = create_immed_typed(ctx->block, 0, dst_type);
325          dst[i] = ir3_MOV(b, src[i], dst_type);
326       }
327 
328       ir3_put_dst(ctx, &alu->dest.dest);
329       return;
330    }
331 
332    /* We also get mov's with more than one component, so handle
333     * those specially:
334     */
335    if (alu->op == nir_op_mov) {
336       nir_alu_src *asrc = &alu->src[0];
337       struct ir3_instruction *const *src0 = ir3_get_src(ctx, &asrc->src);
338 
339       for (unsigned i = 0; i < dst_sz; i++) {
340          if (wrmask & (1 << i)) {
341             dst[i] = ir3_MOV(b, src0[asrc->swizzle[i]], dst_type);
342          } else {
343             dst[i] = NULL;
344          }
345       }
346 
347       ir3_put_dst(ctx, &alu->dest.dest);
348       return;
349    }
350 
351    /* General case: We can just grab the one used channel per src. */
352    for (int i = 0; i < info->num_inputs; i++) {
353       unsigned chan = ffs(alu->dest.write_mask) - 1;
354       nir_alu_src *asrc = &alu->src[i];
355 
356       compile_assert(ctx, !asrc->abs);
357       compile_assert(ctx, !asrc->negate);
358 
359       src[i] = ir3_get_src(ctx, &asrc->src)[asrc->swizzle[chan]];
360       bs[i] = nir_src_bit_size(asrc->src);
361 
362       compile_assert(ctx, src[i]);
363    }
364 
365    switch (alu->op) {
366    case nir_op_f2f32:
367    case nir_op_f2f16_rtne:
368    case nir_op_f2f16_rtz:
369    case nir_op_f2f16:
370    case nir_op_f2i32:
371    case nir_op_f2i16:
372    case nir_op_f2i8:
373    case nir_op_f2u32:
374    case nir_op_f2u16:
375    case nir_op_f2u8:
376    case nir_op_i2f32:
377    case nir_op_i2f16:
378    case nir_op_i2i32:
379    case nir_op_i2i16:
380    case nir_op_i2i8:
381    case nir_op_u2f32:
382    case nir_op_u2f16:
383    case nir_op_u2u32:
384    case nir_op_u2u16:
385    case nir_op_u2u8:
386    case nir_op_b2f16:
387    case nir_op_b2f32:
388    case nir_op_b2i8:
389    case nir_op_b2i16:
390    case nir_op_b2i32:
391       dst[0] = create_cov(ctx, src[0], bs[0], alu->op);
392       break;
393 
394    case nir_op_fquantize2f16:
395       dst[0] = create_cov(ctx, create_cov(ctx, src[0], 32, nir_op_f2f16_rtne),
396                           16, nir_op_f2f32);
397       break;
398    case nir_op_f2b1:
399       dst[0] = ir3_CMPS_F(
400          b, src[0], 0,
401          create_immed_typed(b, 0, bs[0] == 16 ? TYPE_F16 : TYPE_F32), 0);
402       dst[0]->cat2.condition = IR3_COND_NE;
403       break;
404 
405    case nir_op_i2b1:
406       /* i2b1 will appear when translating from nir_load_ubo or
407        * nir_intrinsic_load_ssbo, where any non-zero value is true.
408        */
409       dst[0] = ir3_CMPS_S(
410          b, src[0], 0,
411          create_immed_typed(b, 0, bs[0] == 16 ? TYPE_U16 : TYPE_U32), 0);
412       dst[0]->cat2.condition = IR3_COND_NE;
413       break;
414 
415    case nir_op_b2b1:
416       /* b2b1 will appear when translating from
417        *
418        * - nir_intrinsic_load_shared of a 32-bit 0/~0 value.
419        * - nir_intrinsic_load_constant of a 32-bit 0/~0 value
420        *
421        * A negate can turn those into a 1 or 0 for us.
422        */
423       dst[0] = ir3_ABSNEG_S(b, src[0], IR3_REG_SNEG);
424       break;
425 
426    case nir_op_b2b32:
427       /* b2b32 will appear when converting our 1-bit bools to a store_shared
428        * argument.
429        *
430        * A negate can turn those into a ~0 for us.
431        */
432       dst[0] = ir3_ABSNEG_S(b, src[0], IR3_REG_SNEG);
433       break;
434 
435    case nir_op_fneg:
436       dst[0] = ir3_ABSNEG_F(b, src[0], IR3_REG_FNEG);
437       break;
438    case nir_op_fabs:
439       dst[0] = ir3_ABSNEG_F(b, src[0], IR3_REG_FABS);
440       break;
441    case nir_op_fmax:
442       dst[0] = ir3_MAX_F(b, src[0], 0, src[1], 0);
443       break;
444    case nir_op_fmin:
445       dst[0] = ir3_MIN_F(b, src[0], 0, src[1], 0);
446       break;
447    case nir_op_fsat:
448       /* if there is just a single use of the src, and it supports
449        * the (sat) bit, we can just fold the (sat) flag back to the
450        * src instruction and create a mov.  This is easier for cp
451        * to eliminate.
452        */
453       if (alu->src[0].src.is_ssa && is_sat_compatible(src[0]->opc) &&
454           (list_length(&alu->src[0].src.ssa->uses) == 1)) {
455          src[0]->flags |= IR3_INSTR_SAT;
456          dst[0] = ir3_MOV(b, src[0], dst_type);
457       } else {
458          /* otherwise generate a max.f that saturates.. the blob does
459           * similar (generating a cat2 mov using max.f)
460           */
461          dst[0] = ir3_MAX_F(b, src[0], 0, src[0], 0);
462          dst[0]->flags |= IR3_INSTR_SAT;
463       }
464       break;
465    case nir_op_fmul:
466       dst[0] = ir3_MUL_F(b, src[0], 0, src[1], 0);
467       break;
468    case nir_op_fadd:
469       dst[0] = ir3_ADD_F(b, src[0], 0, src[1], 0);
470       break;
471    case nir_op_fsub:
472       dst[0] = ir3_ADD_F(b, src[0], 0, src[1], IR3_REG_FNEG);
473       break;
474    case nir_op_ffma:
475       dst[0] = ir3_MAD_F32(b, src[0], 0, src[1], 0, src[2], 0);
476       break;
477    case nir_op_fddx:
478    case nir_op_fddx_coarse:
479       dst[0] = ir3_DSX(b, src[0], 0);
480       dst[0]->cat5.type = TYPE_F32;
481       break;
482    case nir_op_fddx_fine:
483       dst[0] = ir3_DSXPP_MACRO(b, src[0], 0);
484       dst[0]->cat5.type = TYPE_F32;
485       break;
486    case nir_op_fddy:
487    case nir_op_fddy_coarse:
488       dst[0] = ir3_DSY(b, src[0], 0);
489       dst[0]->cat5.type = TYPE_F32;
490       break;
492    case nir_op_fddy_fine:
493       dst[0] = ir3_DSYPP_MACRO(b, src[0], 0);
494       dst[0]->cat5.type = TYPE_F32;
495       break;
496    case nir_op_flt:
497       dst[0] = ir3_CMPS_F(b, src[0], 0, src[1], 0);
498       dst[0]->cat2.condition = IR3_COND_LT;
499       break;
500    case nir_op_fge:
501       dst[0] = ir3_CMPS_F(b, src[0], 0, src[1], 0);
502       dst[0]->cat2.condition = IR3_COND_GE;
503       break;
504    case nir_op_feq:
505       dst[0] = ir3_CMPS_F(b, src[0], 0, src[1], 0);
506       dst[0]->cat2.condition = IR3_COND_EQ;
507       break;
508    case nir_op_fneu:
509       dst[0] = ir3_CMPS_F(b, src[0], 0, src[1], 0);
510       dst[0]->cat2.condition = IR3_COND_NE;
511       break;
512    case nir_op_fceil:
513       dst[0] = ir3_CEIL_F(b, src[0], 0);
514       break;
515    case nir_op_ffloor:
516       dst[0] = ir3_FLOOR_F(b, src[0], 0);
517       break;
518    case nir_op_ftrunc:
519       dst[0] = ir3_TRUNC_F(b, src[0], 0);
520       break;
521    case nir_op_fround_even:
522       dst[0] = ir3_RNDNE_F(b, src[0], 0);
523       break;
524    case nir_op_fsign:
525       dst[0] = ir3_SIGN_F(b, src[0], 0);
526       break;
527 
528    case nir_op_fsin:
529       dst[0] = ir3_SIN(b, src[0], 0);
530       break;
531    case nir_op_fcos:
532       dst[0] = ir3_COS(b, src[0], 0);
533       break;
534    case nir_op_frsq:
535       dst[0] = ir3_RSQ(b, src[0], 0);
536       break;
537    case nir_op_frcp:
538       dst[0] = ir3_RCP(b, src[0], 0);
539       break;
540    case nir_op_flog2:
541       dst[0] = ir3_LOG2(b, src[0], 0);
542       break;
543    case nir_op_fexp2:
544       dst[0] = ir3_EXP2(b, src[0], 0);
545       break;
546    case nir_op_fsqrt:
547       dst[0] = ir3_SQRT(b, src[0], 0);
548       break;
549 
550    case nir_op_iabs:
551       dst[0] = ir3_ABSNEG_S(b, src[0], IR3_REG_SABS);
552       break;
553    case nir_op_iadd:
554       dst[0] = ir3_ADD_U(b, src[0], 0, src[1], 0);
555       break;
556    case nir_op_iand:
557       dst[0] = ir3_AND_B(b, src[0], 0, src[1], 0);
558       break;
559    case nir_op_imax:
560       dst[0] = ir3_MAX_S(b, src[0], 0, src[1], 0);
561       break;
562    case nir_op_umax:
563       dst[0] = ir3_MAX_U(b, src[0], 0, src[1], 0);
564       break;
565    case nir_op_imin:
566       dst[0] = ir3_MIN_S(b, src[0], 0, src[1], 0);
567       break;
568    case nir_op_umin:
569       dst[0] = ir3_MIN_U(b, src[0], 0, src[1], 0);
570       break;
571    case nir_op_umul_low:
572       dst[0] = ir3_MULL_U(b, src[0], 0, src[1], 0);
573       break;
574    case nir_op_imadsh_mix16:
575       dst[0] = ir3_MADSH_M16(b, src[0], 0, src[1], 0, src[2], 0);
576       break;
577    case nir_op_imad24_ir3:
578       dst[0] = ir3_MAD_S24(b, src[0], 0, src[1], 0, src[2], 0);
579       break;
580    case nir_op_imul:
581       compile_assert(ctx, nir_dest_bit_size(alu->dest.dest) == 16);
582       dst[0] = ir3_MUL_S24(b, src[0], 0, src[1], 0);
583       break;
584    case nir_op_imul24:
585       dst[0] = ir3_MUL_S24(b, src[0], 0, src[1], 0);
586       break;
587    case nir_op_ineg:
588       dst[0] = ir3_ABSNEG_S(b, src[0], IR3_REG_SNEG);
589       break;
590    case nir_op_inot:
591       if (bs[0] == 1) {
592          dst[0] = ir3_SUB_U(b, create_immed(ctx->block, 1), 0, src[0], 0);
593       } else {
594          dst[0] = ir3_NOT_B(b, src[0], 0);
595       }
596       break;
597    case nir_op_ior:
598       dst[0] = ir3_OR_B(b, src[0], 0, src[1], 0);
599       break;
600    case nir_op_ishl:
601       dst[0] =
602          ir3_SHL_B(b, src[0], 0, resize_shift_amount(ctx, src[1], bs[0]), 0);
603       break;
604    case nir_op_ishr:
605       dst[0] =
606          ir3_ASHR_B(b, src[0], 0, resize_shift_amount(ctx, src[1], bs[0]), 0);
607       break;
608    case nir_op_isub:
609       dst[0] = ir3_SUB_U(b, src[0], 0, src[1], 0);
610       break;
611    case nir_op_ixor:
612       dst[0] = ir3_XOR_B(b, src[0], 0, src[1], 0);
613       break;
614    case nir_op_ushr:
615       dst[0] =
616          ir3_SHR_B(b, src[0], 0, resize_shift_amount(ctx, src[1], bs[0]), 0);
617       break;
618    case nir_op_ilt:
619       dst[0] = ir3_CMPS_S(b, src[0], 0, src[1], 0);
620       dst[0]->cat2.condition = IR3_COND_LT;
621       break;
622    case nir_op_ige:
623       dst[0] = ir3_CMPS_S(b, src[0], 0, src[1], 0);
624       dst[0]->cat2.condition = IR3_COND_GE;
625       break;
626    case nir_op_ieq:
627       dst[0] = ir3_CMPS_S(b, src[0], 0, src[1], 0);
628       dst[0]->cat2.condition = IR3_COND_EQ;
629       break;
630    case nir_op_ine:
631       dst[0] = ir3_CMPS_S(b, src[0], 0, src[1], 0);
632       dst[0]->cat2.condition = IR3_COND_NE;
633       break;
634    case nir_op_ult:
635       dst[0] = ir3_CMPS_U(b, src[0], 0, src[1], 0);
636       dst[0]->cat2.condition = IR3_COND_LT;
637       break;
638    case nir_op_uge:
639       dst[0] = ir3_CMPS_U(b, src[0], 0, src[1], 0);
640       dst[0]->cat2.condition = IR3_COND_GE;
641       break;
642 
643    case nir_op_bcsel: {
644       struct ir3_instruction *cond = src[0];
645 
646       /* If src[0] is a negation (likely as a result of an ir3_b2n(cond)),
647        * we can ignore that and use the original cond, since the nonzero-ness of
648        * cond stays the same.
649        */
650       if (cond->opc == OPC_ABSNEG_S && cond->flags == 0 &&
651           (cond->srcs[0]->flags & (IR3_REG_SNEG | IR3_REG_SABS)) ==
652              IR3_REG_SNEG) {
653          cond = cond->srcs[0]->def->instr;
654       }
655 
656       compile_assert(ctx, bs[1] == bs[2]);
657       /* The condition's size has to match the other two arguments' size, so
658        * convert down if necessary.
659        */
660       if (bs[1] == 16) {
661          struct hash_entry *prev_entry =
662             _mesa_hash_table_search(ctx->sel_cond_conversions, src[0]);
663          if (prev_entry) {
664             cond = prev_entry->data;
665          } else {
666             cond = ir3_COV(b, cond, TYPE_U32, TYPE_U16);
667             _mesa_hash_table_insert(ctx->sel_cond_conversions, src[0], cond);
668          }
669       }
670 
671       if (bs[1] != 16)
672          dst[0] = ir3_SEL_B32(b, src[1], 0, cond, 0, src[2], 0);
673       else
674          dst[0] = ir3_SEL_B16(b, src[1], 0, cond, 0, src[2], 0);
675       break;
676    }
677    case nir_op_bit_count: {
678       // TODO, we need to do this 16b at a time on a5xx+a6xx.. need to
679       // double check on earlier gen's.  Once half-precision support is
680       // in place, this should probably move to a NIR lowering pass:
681       struct ir3_instruction *hi, *lo;
682 
683       hi = ir3_COV(b, ir3_SHR_B(b, src[0], 0, create_immed(b, 16), 0), TYPE_U32,
684                    TYPE_U16);
685       lo = ir3_COV(b, src[0], TYPE_U32, TYPE_U16);
686 
687       hi = ir3_CBITS_B(b, hi, 0);
688       lo = ir3_CBITS_B(b, lo, 0);
689 
690       // TODO maybe the builders should default to making dst half-precision
691       // if the src's were half precision, to make this less awkward.. otoh
692       // we should probably just do this lowering in NIR.
693       hi->dsts[0]->flags |= IR3_REG_HALF;
694       lo->dsts[0]->flags |= IR3_REG_HALF;
695 
696       dst[0] = ir3_ADD_S(b, hi, 0, lo, 0);
697       dst[0]->dsts[0]->flags |= IR3_REG_HALF;
698       dst[0] = ir3_COV(b, dst[0], TYPE_U16, TYPE_U32);
699       break;
700    }
701    case nir_op_ifind_msb: {
702       struct ir3_instruction *cmp;
703       dst[0] = ir3_CLZ_S(b, src[0], 0);
704       cmp = ir3_CMPS_S(b, dst[0], 0, create_immed(b, 0), 0);
705       cmp->cat2.condition = IR3_COND_GE;
706       dst[0] = ir3_SEL_B32(b, ir3_SUB_U(b, create_immed(b, 31), 0, dst[0], 0),
707                            0, cmp, 0, dst[0], 0);
708       break;
709    }
710    case nir_op_ufind_msb:
711       dst[0] = ir3_CLZ_B(b, src[0], 0);
712       dst[0] = ir3_SEL_B32(b, ir3_SUB_U(b, create_immed(b, 31), 0, dst[0], 0),
713                            0, src[0], 0, dst[0], 0);
714       break;
715    case nir_op_find_lsb:
716       dst[0] = ir3_BFREV_B(b, src[0], 0);
717       dst[0] = ir3_CLZ_B(b, dst[0], 0);
718       break;
719    case nir_op_bitfield_reverse:
720       dst[0] = ir3_BFREV_B(b, src[0], 0);
721       break;
722 
723    default:
724       ir3_context_error(ctx, "Unhandled ALU op: %s\n",
725                         nir_op_infos[alu->op].name);
726       break;
727    }
728 
729    if (nir_alu_type_get_base_type(info->output_type) == nir_type_bool) {
730       assert(nir_dest_bit_size(alu->dest.dest) == 1 || alu->op == nir_op_b2b32);
731       assert(dst_sz == 1);
732    } else {
733       /* 1-bit values stored in 32-bit registers are only valid for certain
734        * ALU ops.
735        */
736       switch (alu->op) {
737       case nir_op_iand:
738       case nir_op_ior:
739       case nir_op_ixor:
740       case nir_op_inot:
741       case nir_op_bcsel:
742          break;
743       default:
744          compile_assert(ctx, nir_dest_bit_size(alu->dest.dest) != 1);
745       }
746    }
747 
748    ir3_put_dst(ctx, &alu->dest.dest);
749 }
750 
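/* UBO load via the a6xx+ ldc instruction: src[0] is the (possibly bindless)
 * UBO index and src[1] the offset; the result is num_components 32-bit
 * values.
 */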
751 static void
752 emit_intrinsic_load_ubo_ldc(struct ir3_context *ctx, nir_intrinsic_instr *intr,
753                             struct ir3_instruction **dst)
754 {
755    struct ir3_block *b = ctx->block;
756 
757    unsigned ncomp = intr->num_components;
758    struct ir3_instruction *offset = ir3_get_src(ctx, &intr->src[1])[0];
759    struct ir3_instruction *idx = ir3_get_src(ctx, &intr->src[0])[0];
760    struct ir3_instruction *ldc = ir3_LDC(b, idx, 0, offset, 0);
761    ldc->dsts[0]->wrmask = MASK(ncomp);
762    ldc->cat6.iim_val = ncomp;
763    ldc->cat6.d = nir_intrinsic_component(intr);
764    ldc->cat6.type = TYPE_U32;
765 
766    ir3_handle_bindless_cat6(ldc, intr->src[0]);
767    if (ldc->flags & IR3_INSTR_B)
768       ctx->so->bindless_ubo = true;
769    ir3_handle_nonuniform(ldc, intr);
770 
771    ir3_split_dest(b, dst, ldc, 0, ncomp);
772 }
773 
774 /* handles direct/indirect UBO reads: */
775 static void
776 emit_intrinsic_load_ubo(struct ir3_context *ctx, nir_intrinsic_instr *intr,
777                         struct ir3_instruction **dst)
778 {
779    struct ir3_block *b = ctx->block;
780    struct ir3_instruction *base_lo, *base_hi, *addr, *src0, *src1;
781    const struct ir3_const_state *const_state = ir3_const_state(ctx->so);
782    unsigned ubo = regid(const_state->offsets.ubo, 0);
783    const unsigned ptrsz = ir3_pointer_size(ctx->compiler);
784 
785    int off = 0;
786 
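   /* UBO addresses live in the const file: each UBO gets a ptrsz-dword
    * pointer starting at const_state->offsets.ubo, so UBO i's address is at
    * scalar offset ubo + i * ptrsz (lo dword first; the hi dword only
    * matters on 64-bit GPUs).
    */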
787    /* First src is ubo index, which could either be an immed or not: */
788    src0 = ir3_get_src(ctx, &intr->src[0])[0];
789    if (is_same_type_mov(src0) && (src0->srcs[0]->flags & IR3_REG_IMMED)) {
790       base_lo = create_uniform(b, ubo + (src0->srcs[0]->iim_val * ptrsz));
791       base_hi = create_uniform(b, ubo + (src0->srcs[0]->iim_val * ptrsz) + 1);
792    } else {
793       base_lo = create_uniform_indirect(b, ubo, TYPE_U32,
794                                         ir3_get_addr0(ctx, src0, ptrsz));
795       base_hi = create_uniform_indirect(b, ubo + 1, TYPE_U32,
796                                         ir3_get_addr0(ctx, src0, ptrsz));
797 
798       /* NOTE: since relative addressing is used, make sure constlen is
799        * at least big enough to cover all the UBO addresses, since the
800        * assembler won't know what the max address reg is.
801        */
802       ctx->so->constlen =
803          MAX2(ctx->so->constlen,
804               const_state->offsets.ubo + (ctx->s->info.num_ubos * ptrsz));
805    }
806 
807    /* note: on 32-bit GPUs base_hi is ignored and DCE'd */
808    addr = base_lo;
809 
810    if (nir_src_is_const(intr->src[1])) {
811       off += nir_src_as_uint(intr->src[1]);
812    } else {
813       /* For load_ubo_indirect, second src is indirect offset: */
814       src1 = ir3_get_src(ctx, &intr->src[1])[0];
815 
816       /* and add offset to addr: */
817       addr = ir3_ADD_S(b, addr, 0, src1, 0);
818    }
819 
820    /* if offset is too large to encode in the ldg, split it out: */
821    if ((off + (intr->num_components * 4)) > 1024) {
822       /* split out the minimal amount to improve the odds that
823        * cp can fit the immediate in the add.s instruction:
824        */
825       unsigned off2 = off + (intr->num_components * 4) - 1024;
826       addr = ir3_ADD_S(b, addr, 0, create_immed(b, off2), 0);
827       off -= off2;
828    }
829 
830    if (ptrsz == 2) {
831       struct ir3_instruction *carry;
832 
833       /* handle 32b rollover, ie:
834        *   if (addr < base_lo)
835        *      base_hi++
836        */
837       carry = ir3_CMPS_U(b, addr, 0, base_lo, 0);
838       carry->cat2.condition = IR3_COND_LT;
839       base_hi = ir3_ADD_S(b, base_hi, 0, carry, 0);
840 
841       addr = ir3_collect(b, addr, base_hi);
842    }
843 
844    for (int i = 0; i < intr->num_components; i++) {
845       struct ir3_instruction *load =
846          ir3_LDG(b, addr, 0, create_immed(b, off + i * 4), 0,
847                  create_immed(b, 1), 0); /* num components */
848       load->cat6.type = TYPE_U32;
849       dst[i] = load;
850    }
851 }
852 
853 /* src[] = { block_index } */
854 static void
855 emit_intrinsic_ssbo_size(struct ir3_context *ctx, nir_intrinsic_instr *intr,
856                          struct ir3_instruction **dst)
857 {
858    struct ir3_block *b = ctx->block;
859    struct ir3_instruction *ibo = ir3_ssbo_to_ibo(ctx, intr->src[0]);
860    struct ir3_instruction *resinfo = ir3_RESINFO(b, ibo, 0);
861    resinfo->cat6.iim_val = 1;
862    resinfo->cat6.d = ctx->compiler->gen >= 6 ? 1 : 2;
863    resinfo->cat6.type = TYPE_U32;
864    resinfo->cat6.typed = false;
865    /* resinfo has no writemask and always writes out 3 components */
866    resinfo->dsts[0]->wrmask = MASK(3);
867    ir3_handle_bindless_cat6(resinfo, intr->src[0]);
868    ir3_handle_nonuniform(resinfo, intr);
869 
870    if (ctx->compiler->gen >= 6) {
871       ir3_split_dest(b, dst, resinfo, 0, 1);
872    } else {
873       /* On a5xx, resinfo returns the low 16 bits of ssbo size in .x and the high 16 bits in .y */
874       struct ir3_instruction *resinfo_dst[2];
875       ir3_split_dest(b, resinfo_dst, resinfo, 0, 2);
876       *dst = ir3_ADD_U(b, ir3_SHL_B(b, resinfo_dst[1], 0, create_immed(b, 16), 0), 0, resinfo_dst[0], 0);
877    }
878 }
879 
880 /* src[] = { offset }. const_index[] = { base } */
881 static void
882 emit_intrinsic_load_shared(struct ir3_context *ctx, nir_intrinsic_instr *intr,
883                            struct ir3_instruction **dst)
884 {
885    struct ir3_block *b = ctx->block;
886    struct ir3_instruction *ldl, *offset;
887    unsigned base;
888 
889    offset = ir3_get_src(ctx, &intr->src[0])[0];
890    base = nir_intrinsic_base(intr);
891 
892    ldl = ir3_LDL(b, offset, 0, create_immed(b, base), 0,
893                  create_immed(b, intr->num_components), 0);
894 
895    ldl->cat6.type = utype_dst(intr->dest);
896    ldl->dsts[0]->wrmask = MASK(intr->num_components);
897 
898    ldl->barrier_class = IR3_BARRIER_SHARED_R;
899    ldl->barrier_conflict = IR3_BARRIER_SHARED_W;
900 
901    ir3_split_dest(b, dst, ldl, 0, intr->num_components);
902 }
903 
904 /* src[] = { value, offset }. const_index[] = { base, write_mask } */
905 static void
906 emit_intrinsic_store_shared(struct ir3_context *ctx, nir_intrinsic_instr *intr)
907 {
908    struct ir3_block *b = ctx->block;
909    struct ir3_instruction *stl, *offset;
910    struct ir3_instruction *const *value;
911    unsigned base, wrmask, ncomp;
912 
913    value = ir3_get_src(ctx, &intr->src[0]);
914    offset = ir3_get_src(ctx, &intr->src[1])[0];
915 
916    base = nir_intrinsic_base(intr);
917    wrmask = nir_intrinsic_write_mask(intr);
918    ncomp = ffs(~wrmask) - 1;
919 
920    assert(wrmask == BITFIELD_MASK(intr->num_components));
921 
922    stl = ir3_STL(b, offset, 0, ir3_create_collect(b, value, ncomp), 0,
923                  create_immed(b, ncomp), 0);
924    stl->cat6.dst_offset = base;
925    stl->cat6.type = utype_src(intr->src[0]);
926    stl->barrier_class = IR3_BARRIER_SHARED_W;
927    stl->barrier_conflict = IR3_BARRIER_SHARED_R | IR3_BARRIER_SHARED_W;
928 
929    array_insert(b, b->keeps, stl);
930 }
931 
932 /* src[] = { offset }. const_index[] = { base } */
933 static void
934 emit_intrinsic_load_shared_ir3(struct ir3_context *ctx,
935                                nir_intrinsic_instr *intr,
936                                struct ir3_instruction **dst)
937 {
938    struct ir3_block *b = ctx->block;
939    struct ir3_instruction *load, *offset;
940    unsigned base;
941 
942    offset = ir3_get_src(ctx, &intr->src[0])[0];
943    base = nir_intrinsic_base(intr);
944 
945    load = ir3_LDLW(b, offset, 0, create_immed(b, base), 0,
946                    create_immed(b, intr->num_components), 0);
947 
948    /* for a650, use LDL for tess ctrl inputs: */
949    if (ctx->so->type == MESA_SHADER_TESS_CTRL && ctx->compiler->tess_use_shared)
950       load->opc = OPC_LDL;
951 
952    load->cat6.type = utype_dst(intr->dest);
953    load->dsts[0]->wrmask = MASK(intr->num_components);
954 
955    load->barrier_class = IR3_BARRIER_SHARED_R;
956    load->barrier_conflict = IR3_BARRIER_SHARED_W;
957 
958    ir3_split_dest(b, dst, load, 0, intr->num_components);
959 }
960 
961 /* src[] = { value, offset }. const_index[] = { base } */
962 static void
963 emit_intrinsic_store_shared_ir3(struct ir3_context *ctx,
964                                 nir_intrinsic_instr *intr)
965 {
966    struct ir3_block *b = ctx->block;
967    struct ir3_instruction *store, *offset;
968    struct ir3_instruction *const *value;
969 
970    value = ir3_get_src(ctx, &intr->src[0]);
971    offset = ir3_get_src(ctx, &intr->src[1])[0];
972 
973    store = ir3_STLW(b, offset, 0,
974                     ir3_create_collect(b, value, intr->num_components), 0,
975                     create_immed(b, intr->num_components), 0);
976 
977    /* for a650, use STL for vertex outputs used by tess ctrl shader: */
978    if (ctx->so->type == MESA_SHADER_VERTEX && ctx->so->key.tessellation &&
979        ctx->compiler->tess_use_shared)
980       store->opc = OPC_STL;
981 
982    store->cat6.dst_offset = nir_intrinsic_base(intr);
983    store->cat6.type = utype_src(intr->src[0]);
984    store->barrier_class = IR3_BARRIER_SHARED_W;
985    store->barrier_conflict = IR3_BARRIER_SHARED_R | IR3_BARRIER_SHARED_W;
986 
987    array_insert(b, b->keeps, store);
988 }
989 
990 /*
991  * CS shared variable atomic intrinsics
992  *
993  * All of the shared variable atomic memory operations read a value from
994  * memory, compute a new value using one of the operations below, write the
995  * new value to memory, and return the original value read.
996  *
997  * All operations take 2 sources except CompSwap that takes 3. These
998  * sources represent:
999  *
1000  * 0: The offset into the shared variable storage region that the atomic
1001  *    operation will operate on.
1002  * 1: The data parameter to the atomic function (i.e. the value to add
1003  *    in shared_atomic_add, etc).
1004  * 2: For CompSwap only: the second data parameter.
1005  */
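/* For example, a nir_intrinsic_shared_atomic_add with sources (offset, value)
 * becomes a single ir3 atomic.add (ir3_ATOMIC_ADD) whose dst holds the value
 * that was previously stored at that offset in shared memory.
 */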
1006 static struct ir3_instruction *
1007 emit_intrinsic_atomic_shared(struct ir3_context *ctx, nir_intrinsic_instr *intr)
1008 {
1009    struct ir3_block *b = ctx->block;
1010    struct ir3_instruction *atomic, *src0, *src1;
1011    type_t type = TYPE_U32;
1012 
1013    src0 = ir3_get_src(ctx, &intr->src[0])[0]; /* offset */
1014    src1 = ir3_get_src(ctx, &intr->src[1])[0]; /* value */
1015 
1016    switch (intr->intrinsic) {
1017    case nir_intrinsic_shared_atomic_add:
1018       atomic = ir3_ATOMIC_ADD(b, src0, 0, src1, 0);
1019       break;
1020    case nir_intrinsic_shared_atomic_imin:
1021       atomic = ir3_ATOMIC_MIN(b, src0, 0, src1, 0);
1022       type = TYPE_S32;
1023       break;
1024    case nir_intrinsic_shared_atomic_umin:
1025       atomic = ir3_ATOMIC_MIN(b, src0, 0, src1, 0);
1026       break;
1027    case nir_intrinsic_shared_atomic_imax:
1028       atomic = ir3_ATOMIC_MAX(b, src0, 0, src1, 0);
1029       type = TYPE_S32;
1030       break;
1031    case nir_intrinsic_shared_atomic_umax:
1032       atomic = ir3_ATOMIC_MAX(b, src0, 0, src1, 0);
1033       break;
1034    case nir_intrinsic_shared_atomic_and:
1035       atomic = ir3_ATOMIC_AND(b, src0, 0, src1, 0);
1036       break;
1037    case nir_intrinsic_shared_atomic_or:
1038       atomic = ir3_ATOMIC_OR(b, src0, 0, src1, 0);
1039       break;
1040    case nir_intrinsic_shared_atomic_xor:
1041       atomic = ir3_ATOMIC_XOR(b, src0, 0, src1, 0);
1042       break;
1043    case nir_intrinsic_shared_atomic_exchange:
1044       atomic = ir3_ATOMIC_XCHG(b, src0, 0, src1, 0);
1045       break;
1046    case nir_intrinsic_shared_atomic_comp_swap:
1047       /* for cmpxchg, src1 is [ui]vec2(data, compare): */
1048       src1 = ir3_collect(b, ir3_get_src(ctx, &intr->src[2])[0], src1);
1049       atomic = ir3_ATOMIC_CMPXCHG(b, src0, 0, src1, 0);
1050       break;
1051    default:
1052       unreachable("boo");
1053    }
1054 
1055    atomic->cat6.iim_val = 1;
1056    atomic->cat6.d = 1;
1057    atomic->cat6.type = type;
1058    atomic->barrier_class = IR3_BARRIER_SHARED_W;
1059    atomic->barrier_conflict = IR3_BARRIER_SHARED_R | IR3_BARRIER_SHARED_W;
1060 
1061    /* even if nothing consumes the result, we can't DCE the instruction: */
1062    array_insert(b, b->keeps, atomic);
1063 
1064    return atomic;
1065 }
1066 
1067 /* src[] = { offset }. */
1068 static void
1069 emit_intrinsic_load_scratch(struct ir3_context *ctx, nir_intrinsic_instr *intr,
1070                             struct ir3_instruction **dst)
1071 {
1072    struct ir3_block *b = ctx->block;
1073    struct ir3_instruction *ldp, *offset;
1074 
1075    offset = ir3_get_src(ctx, &intr->src[0])[0];
1076 
1077    ldp = ir3_LDP(b, offset, 0, create_immed(b, 0), 0,
1078                  create_immed(b, intr->num_components), 0);
1079 
1080    ldp->cat6.type = utype_dst(intr->dest);
1081    ldp->dsts[0]->wrmask = MASK(intr->num_components);
1082 
1083    ldp->barrier_class = IR3_BARRIER_PRIVATE_R;
1084    ldp->barrier_conflict = IR3_BARRIER_PRIVATE_W;
1085 
1086    ir3_split_dest(b, dst, ldp, 0, intr->num_components);
1087 }
1088 
1089 /* src[] = { value, offset }. const_index[] = { write_mask } */
1090 static void
1091 emit_intrinsic_store_scratch(struct ir3_context *ctx, nir_intrinsic_instr *intr)
1092 {
1093    struct ir3_block *b = ctx->block;
1094    struct ir3_instruction *stp, *offset;
1095    struct ir3_instruction *const *value;
1096    unsigned wrmask, ncomp;
1097 
1098    value = ir3_get_src(ctx, &intr->src[0]);
1099    offset = ir3_get_src(ctx, &intr->src[1])[0];
1100 
1101    wrmask = nir_intrinsic_write_mask(intr);
1102    ncomp = ffs(~wrmask) - 1;
1103 
1104    assert(wrmask == BITFIELD_MASK(intr->num_components));
1105 
1106    stp = ir3_STP(b, offset, 0, ir3_create_collect(b, value, ncomp), 0,
1107                  create_immed(b, ncomp), 0);
1108    stp->cat6.dst_offset = 0;
1109    stp->cat6.type = utype_src(intr->src[0]);
1110    stp->barrier_class = IR3_BARRIER_PRIVATE_W;
1111    stp->barrier_conflict = IR3_BARRIER_PRIVATE_R | IR3_BARRIER_PRIVATE_W;
1112 
1113    array_insert(b, b->keeps, stp);
1114 }
1115 
1116 struct tex_src_info {
1117    /* For prefetch */
1118    unsigned tex_base, samp_base, tex_idx, samp_idx;
1119    /* For normal tex instructions */
1120    unsigned base, combined_idx, a1_val, flags;
1121    struct ir3_instruction *samp_tex;
1122 };
1123 
1124 /* TODO handle actual indirect/dynamic case.. which is going to be weird
1125  * to handle with the image_mapping table..
1126  */
1127 static struct tex_src_info
1128 get_image_samp_tex_src(struct ir3_context *ctx, nir_intrinsic_instr *intr)
1129 {
1130    struct ir3_block *b = ctx->block;
1131    struct tex_src_info info = {0};
1132    nir_intrinsic_instr *bindless_tex = ir3_bindless_resource(intr->src[0]);
1133 
1134    if (bindless_tex) {
1135       /* Bindless case */
1136       ctx->so->bindless_tex = true;
1137       info.flags |= IR3_INSTR_B;
1138 
1139       /* Gather information required to determine which encoding to
1140        * choose as well as for prefetch.
1141        */
1142       info.tex_base = nir_intrinsic_desc_set(bindless_tex);
1143       bool tex_const = nir_src_is_const(bindless_tex->src[0]);
1144       if (tex_const)
1145          info.tex_idx = nir_src_as_uint(bindless_tex->src[0]);
1146       info.samp_idx = 0;
1147 
1148       /* Choose encoding. */
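      /* Roughly: a constant tex_idx < 16 is packed directly into the
       * instruction, a constant tex_idx < 256 goes through a1.x (A1EN), and
       * anything else (including a non-constant index) uses S2EN with the
       * index in a register.
       */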
1149       if (tex_const && info.tex_idx < 256) {
1150          if (info.tex_idx < 16) {
1151             /* Everything fits within the instruction */
1152             info.base = info.tex_base;
1153             info.combined_idx = info.samp_idx | (info.tex_idx << 4);
1154          } else {
1155             info.base = info.tex_base;
1156             info.a1_val = info.tex_idx << 3;
1157             info.combined_idx = 0;
1158             info.flags |= IR3_INSTR_A1EN;
1159          }
1160          info.samp_tex = NULL;
1161       } else {
1162          info.flags |= IR3_INSTR_S2EN;
1163          info.base = info.tex_base;
1164 
1165          /* Note: the indirect source is now a vec2 instead of hvec2 */
1166          struct ir3_instruction *texture, *sampler;
1167 
1168          texture = ir3_get_src(ctx, &intr->src[0])[0];
1169          sampler = create_immed(b, 0);
1170          info.samp_tex = ir3_collect(b, texture, sampler);
1171       }
1172    } else {
1173       info.flags |= IR3_INSTR_S2EN;
1174       unsigned slot = nir_src_as_uint(intr->src[0]);
1175       unsigned tex_idx = ir3_image_to_tex(&ctx->so->image_mapping, slot);
1176       struct ir3_instruction *texture, *sampler;
1177 
1178       texture = create_immed_typed(ctx->block, tex_idx, TYPE_U16);
1179       sampler = create_immed_typed(ctx->block, tex_idx, TYPE_U16);
1180 
1181       info.samp_tex = ir3_collect(b, sampler, texture);
1182    }
1183 
1184    return info;
1185 }
1186 
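/* Emit a cat5 sample instruction for the given texture-source info, setting
 * up the a1.x address when the A1EN encoding was chosen and the bindless
 * base/combined sampler+texture index when the bindless (B) flag is set.
 */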
1187 static struct ir3_instruction *
1188 emit_sam(struct ir3_context *ctx, opc_t opc, struct tex_src_info info,
1189          type_t type, unsigned wrmask, struct ir3_instruction *src0,
1190          struct ir3_instruction *src1)
1191 {
1192    struct ir3_instruction *sam, *addr;
1193    if (info.flags & IR3_INSTR_A1EN) {
1194       addr = ir3_get_addr1(ctx, info.a1_val);
1195    }
1196    sam = ir3_SAM(ctx->block, opc, type, 0b1111, info.flags, info.samp_tex, src0,
1197                  src1);
1198    if (info.flags & IR3_INSTR_A1EN) {
1199       ir3_instr_set_address(sam, addr);
1200    }
1201    if (info.flags & IR3_INSTR_B) {
1202       sam->cat5.tex_base = info.base;
1203       sam->cat5.samp = info.combined_idx;
1204    }
1205    return sam;
1206 }
1207 
1208 /* src[] = { deref, coord, sample_index }. const_index[] = {} */
1209 static void
1210 emit_intrinsic_load_image(struct ir3_context *ctx, nir_intrinsic_instr *intr,
1211                           struct ir3_instruction **dst)
1212 {
1213    /* Coherent accesses have to go directly to memory, rather than through
1214     * ISAM's texture cache (which isn't coherent with image stores).
1215     */
1216    if (nir_intrinsic_access(intr) & ACCESS_COHERENT && ctx->compiler->gen >= 5) {
1217       ctx->funcs->emit_intrinsic_load_image(ctx, intr, dst);
1218       return;
1219    }
1220 
1221    struct ir3_block *b = ctx->block;
1222    struct tex_src_info info = get_image_samp_tex_src(ctx, intr);
1223    struct ir3_instruction *sam;
1224    struct ir3_instruction *const *src0 = ir3_get_src(ctx, &intr->src[1]);
1225    struct ir3_instruction *coords[4];
1226    unsigned flags, ncoords = ir3_get_image_coords(intr, &flags);
1227    type_t type = ir3_get_type_for_image_intrinsic(intr);
1228 
1229    /* hmm, this seems a bit odd, but it is what the blob does and (at
1230     * least a5xx) just faults on bogus addresses otherwise:
1231     */
1232    if (flags & IR3_INSTR_3D) {
1233       flags &= ~IR3_INSTR_3D;
1234       flags |= IR3_INSTR_A;
1235    }
1236    info.flags |= flags;
1237 
1238    for (unsigned i = 0; i < ncoords; i++)
1239       coords[i] = src0[i];
1240 
1241    if (ncoords == 1)
1242       coords[ncoords++] = create_immed(b, 0);
1243 
1244    sam = emit_sam(ctx, OPC_ISAM, info, type, 0b1111,
1245                   ir3_create_collect(b, coords, ncoords), NULL);
1246 
1247    ir3_handle_nonuniform(sam, intr);
1248 
1249    sam->barrier_class = IR3_BARRIER_IMAGE_R;
1250    sam->barrier_conflict = IR3_BARRIER_IMAGE_W;
1251 
1252    ir3_split_dest(b, dst, sam, 0, 4);
1253 }
1254 
1255 /* A4xx version of image_size, see ir3_a6xx.c for newer resinfo version. */
1256 void
1257 emit_intrinsic_image_size_tex(struct ir3_context *ctx,
1258                               nir_intrinsic_instr *intr,
1259                               struct ir3_instruction **dst)
1260 {
1261    struct ir3_block *b = ctx->block;
1262    struct tex_src_info info = get_image_samp_tex_src(ctx, intr);
1263    struct ir3_instruction *sam, *lod;
1264    unsigned flags, ncoords = ir3_get_image_coords(intr, &flags);
1265    type_t dst_type = nir_dest_bit_size(intr->dest) == 16 ? TYPE_U16 : TYPE_U32;
1266 
1267    info.flags |= flags;
1268    assert(nir_src_as_uint(intr->src[1]) == 0);
1269    lod = create_immed(b, 0);
1270    sam = emit_sam(ctx, OPC_GETSIZE, info, dst_type, 0b1111, lod, NULL);
1271 
1272    /* Array size actually ends up in .w rather than .z. This doesn't
1273     * matter for miplevel 0, but for higher mips the value in z is
1274     * minified whereas w stays. Also, the value in TEX_CONST_3_DEPTH is
1275     * returned, which means that we have to add 1 to it for arrays for
1276     * a3xx.
1277     *
1278     * Note: use a temporary dst and then copy, since the size of the dst
1279     * array that is passed in is based on nir's understanding of the
1280     * result size, not the hardware's.
1281     */
1282    struct ir3_instruction *tmp[4];
1283 
1284    ir3_split_dest(b, tmp, sam, 0, 4);
1285 
1286    for (unsigned i = 0; i < ncoords; i++)
1287       dst[i] = tmp[i];
1288 
1289    if (flags & IR3_INSTR_A) {
1290       if (ctx->compiler->levels_add_one) {
1291          dst[ncoords - 1] = ir3_ADD_U(b, tmp[3], 0, create_immed(b, 1), 0);
1292       } else {
1293          dst[ncoords - 1] = ir3_MOV(b, tmp[3], TYPE_U32);
1294       }
1295    }
1296 }
1297 
1298 static void
1299 emit_control_barrier(struct ir3_context *ctx)
1300 {
1301    /* Hull shaders dispatch 32 wide so an entire patch will always
1302     * fit in a single warp and execute in lock-step. Consequently,
1303     * we don't need to do anything for TCS barriers. Emitting a
1304     * barrier instruction would deadlock.
1305     */
1306    if (ctx->so->type == MESA_SHADER_TESS_CTRL)
1307       return;
1308 
1309    struct ir3_block *b = ctx->block;
1310    struct ir3_instruction *barrier = ir3_BAR(b);
1311    barrier->cat7.g = true;
1312    if (ctx->compiler->gen < 6)
1313       barrier->cat7.l = true;
1314    barrier->flags = IR3_INSTR_SS | IR3_INSTR_SY;
1315    barrier->barrier_class = IR3_BARRIER_EVERYTHING;
1316    array_insert(b, b->keeps, barrier);
1317 }
1318 
1319 static void
1320 emit_intrinsic_barrier(struct ir3_context *ctx, nir_intrinsic_instr *intr)
1321 {
1322    struct ir3_block *b = ctx->block;
1323    struct ir3_instruction *barrier;
1324 
1325    /* TODO: find out why there is a major difference in .l usage
1326     * between a5xx and a6xx.
1327     */
1328 
1329    switch (intr->intrinsic) {
1330    case nir_intrinsic_control_barrier:
1331       emit_control_barrier(ctx);
1332       return;
1333    case nir_intrinsic_scoped_barrier: {
1334       nir_scope exec_scope = nir_intrinsic_execution_scope(intr);
1335       nir_variable_mode modes = nir_intrinsic_memory_modes(intr);
1336 
1337       if (ctx->so->type == MESA_SHADER_TESS_CTRL) {
1338          /* Remove mode corresponding to nir_intrinsic_memory_barrier_tcs_patch,
1339           * because hull shaders dispatch 32 wide so an entire patch will
1340           * always fit in a single warp and execute in lock-step.
1341           *
1342           * TODO: memory barrier also tells us not to reorder stores, this
1343           * information is lost here (backend doesn't reorder stores so we
1344           * are safe for now).
1345           */
1346          modes &= ~nir_var_shader_out;
1347       }
1348 
1349       assert(!(modes & nir_var_shader_out));
1350 
1351       if ((modes &
1352            (nir_var_mem_shared | nir_var_mem_ssbo | nir_var_mem_global))) {
1353          barrier = ir3_FENCE(b);
1354          barrier->cat7.r = true;
1355          barrier->cat7.w = true;
1356 
1357          if (modes & (nir_var_mem_ssbo | nir_var_mem_global)) {
1358             barrier->cat7.g = true;
1359          }
1360 
1361          if (ctx->compiler->gen >= 6) {
1362             if (modes & nir_var_mem_ssbo) {
1363                barrier->cat7.l = true;
1364             }
1365          } else {
1366             if (modes & (nir_var_mem_shared | nir_var_mem_ssbo)) {
1367                barrier->cat7.l = true;
1368             }
1369          }
1370 
1371          barrier->barrier_class = 0;
1372          barrier->barrier_conflict = 0;
1373 
1374          if (modes & nir_var_mem_shared) {
1375             barrier->barrier_class |= IR3_BARRIER_SHARED_W;
1376             barrier->barrier_conflict |=
1377                IR3_BARRIER_SHARED_R | IR3_BARRIER_SHARED_W;
1378          }
1379 
1380          if (modes & (nir_var_mem_ssbo | nir_var_mem_global)) {
1381             barrier->barrier_class |= IR3_BARRIER_BUFFER_W;
1382             barrier->barrier_conflict |=
1383                IR3_BARRIER_BUFFER_R | IR3_BARRIER_BUFFER_W;
1384          }
1385 
1386          /* TODO: check for image mode when it has a separate one */
1387          if (modes & nir_var_mem_ssbo) {
1388             barrier->barrier_class |= IR3_BARRIER_IMAGE_W;
1389             barrier->barrier_conflict |=
1390                IR3_BARRIER_IMAGE_W | IR3_BARRIER_IMAGE_R;
1391          }
1392          array_insert(b, b->keeps, barrier);
1393       }
1394 
1395       if (exec_scope >= NIR_SCOPE_WORKGROUP) {
1396          emit_control_barrier(ctx);
1397       }
1398 
1399       return;
1400    }
1401    case nir_intrinsic_memory_barrier_tcs_patch:
1402       /* Not applicable, see explanation for scoped_barrier + shader_out */
1403       return;
1404    case nir_intrinsic_memory_barrier_buffer:
1405       barrier = ir3_FENCE(b);
1406       barrier->cat7.g = true;
1407       if (ctx->compiler->gen >= 6)
1408          barrier->cat7.l = true;
1409       barrier->cat7.r = true;
1410       barrier->cat7.w = true;
1411       barrier->barrier_class = IR3_BARRIER_BUFFER_W;
1412       barrier->barrier_conflict = IR3_BARRIER_BUFFER_R | IR3_BARRIER_BUFFER_W;
1413       break;
1414    case nir_intrinsic_memory_barrier_image:
1415       barrier = ir3_FENCE(b);
1416       barrier->cat7.g = true;
1417       barrier->cat7.l = true;
1418       barrier->cat7.r = true;
1419       barrier->cat7.w = true;
1420       barrier->barrier_class = IR3_BARRIER_IMAGE_W;
1421       barrier->barrier_conflict = IR3_BARRIER_IMAGE_R | IR3_BARRIER_IMAGE_W;
1422       break;
1423    case nir_intrinsic_memory_barrier_shared:
1424       barrier = ir3_FENCE(b);
1425       if (ctx->compiler->gen < 6)
1426          barrier->cat7.l = true;
1427       barrier->cat7.r = true;
1428       barrier->cat7.w = true;
1429       barrier->barrier_class = IR3_BARRIER_SHARED_W;
1430       barrier->barrier_conflict = IR3_BARRIER_SHARED_R | IR3_BARRIER_SHARED_W;
1431       break;
1432    case nir_intrinsic_memory_barrier:
1433    case nir_intrinsic_group_memory_barrier:
1434       barrier = ir3_FENCE(b);
1435       barrier->cat7.g = true;
1436       barrier->cat7.l = true;
1437       barrier->cat7.r = true;
1438       barrier->cat7.w = true;
1439       barrier->barrier_class =
1440          IR3_BARRIER_SHARED_W | IR3_BARRIER_IMAGE_W | IR3_BARRIER_BUFFER_W;
1441       barrier->barrier_conflict = IR3_BARRIER_SHARED_R | IR3_BARRIER_SHARED_W |
1442                                   IR3_BARRIER_IMAGE_R | IR3_BARRIER_IMAGE_W |
1443                                   IR3_BARRIER_BUFFER_R | IR3_BARRIER_BUFFER_W;
1444       break;
1445    default:
1446       unreachable("boo");
1447    }
1448 
1449    /* make sure barrier doesn't get DCE'd */
1450    array_insert(b, b->keeps, barrier);
1451 }
1452 
1453 static void
1454 add_sysval_input_compmask(struct ir3_context *ctx, gl_system_value slot,
1455                           unsigned compmask, struct ir3_instruction *instr)
1456 {
1457    struct ir3_shader_variant *so = ctx->so;
1458    unsigned n = so->inputs_count++;
1459 
1460    assert(instr->opc == OPC_META_INPUT);
1461    instr->input.inidx = n;
1462    instr->input.sysval = slot;
1463 
1464    so->inputs[n].sysval = true;
1465    so->inputs[n].slot = slot;
1466    so->inputs[n].compmask = compmask;
1467    so->total_in++;
1468 
1469    so->sysval_in += util_last_bit(compmask);
1470 }
1471 
1472 static struct ir3_instruction *
1473 create_sysval_input(struct ir3_context *ctx, gl_system_value slot,
1474                     unsigned compmask)
1475 {
1476    assert(compmask);
1477    struct ir3_instruction *sysval = create_input(ctx, compmask);
1478    add_sysval_input_compmask(ctx, slot, compmask, sysval);
1479    return sysval;
1480 }
1481 
1482 static struct ir3_instruction *
1483 get_barycentric(struct ir3_context *ctx, enum ir3_bary bary)
1484 {
1485    static const gl_system_value sysval_base =
1486       SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL;
1487 
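   /* The STATIC_ASSERTs below guarantee that an ir3_bary value can be
    * translated into the matching gl_system_value simply by adding it to
    * sysval_base.
    */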
1488    STATIC_ASSERT(sysval_base + IJ_PERSP_PIXEL ==
1489                  SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL);
1490    STATIC_ASSERT(sysval_base + IJ_PERSP_SAMPLE ==
1491                  SYSTEM_VALUE_BARYCENTRIC_PERSP_SAMPLE);
1492    STATIC_ASSERT(sysval_base + IJ_PERSP_CENTROID ==
1493                  SYSTEM_VALUE_BARYCENTRIC_PERSP_CENTROID);
1494    STATIC_ASSERT(sysval_base + IJ_PERSP_SIZE ==
1495                  SYSTEM_VALUE_BARYCENTRIC_PERSP_SIZE);
1496    STATIC_ASSERT(sysval_base + IJ_LINEAR_PIXEL ==
1497                  SYSTEM_VALUE_BARYCENTRIC_LINEAR_PIXEL);
1498    STATIC_ASSERT(sysval_base + IJ_LINEAR_CENTROID ==
1499                  SYSTEM_VALUE_BARYCENTRIC_LINEAR_CENTROID);
1500    STATIC_ASSERT(sysval_base + IJ_LINEAR_SAMPLE ==
1501                  SYSTEM_VALUE_BARYCENTRIC_LINEAR_SAMPLE);
1502 
1503    if (!ctx->ij[bary]) {
1504       struct ir3_instruction *xy[2];
1505       struct ir3_instruction *ij;
1506 
1507       ij = create_sysval_input(ctx, sysval_base + bary, 0x3);
1508       ir3_split_dest(ctx->in_block, xy, ij, 0, 2);
1509 
1510       ctx->ij[bary] = ir3_create_collect(ctx->in_block, xy, 2);
1511    }
1512 
1513    return ctx->ij[bary];
1514 }
1515 
1516 /* TODO: make this a common NIR helper?
1517  * There is a nir_system_value_from_intrinsic, but it takes nir_intrinsic_op
1518  * so it can't be extended to work with this.
1519  */
1520 static gl_system_value
1521 nir_intrinsic_barycentric_sysval(nir_intrinsic_instr *intr)
1522 {
1523    enum glsl_interp_mode interp_mode = nir_intrinsic_interp_mode(intr);
1524    gl_system_value sysval;
1525 
1526    switch (intr->intrinsic) {
1527    case nir_intrinsic_load_barycentric_pixel:
1528       if (interp_mode == INTERP_MODE_NOPERSPECTIVE)
1529          sysval = SYSTEM_VALUE_BARYCENTRIC_LINEAR_PIXEL;
1530       else
1531          sysval = SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL;
1532       break;
1533    case nir_intrinsic_load_barycentric_centroid:
1534       if (interp_mode == INTERP_MODE_NOPERSPECTIVE)
1535          sysval = SYSTEM_VALUE_BARYCENTRIC_LINEAR_CENTROID;
1536       else
1537          sysval = SYSTEM_VALUE_BARYCENTRIC_PERSP_CENTROID;
1538       break;
1539    case nir_intrinsic_load_barycentric_sample:
1540       if (interp_mode == INTERP_MODE_NOPERSPECTIVE)
1541          sysval = SYSTEM_VALUE_BARYCENTRIC_LINEAR_SAMPLE;
1542       else
1543          sysval = SYSTEM_VALUE_BARYCENTRIC_PERSP_SAMPLE;
1544       break;
1545    default:
1546       unreachable("invalid barycentric intrinsic");
1547    }
1548 
1549    return sysval;
1550 }
1551 
1552 static void
1553 emit_intrinsic_barycentric(struct ir3_context *ctx, nir_intrinsic_instr *intr,
1554                            struct ir3_instruction **dst)
1555 {
1556    gl_system_value sysval = nir_intrinsic_barycentric_sysval(intr);
1557 
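   /* Without MSAA enabled in the shader key, sample-rate interpolation
    * degrades to pixel-rate, and (before a6xx) centroid does as well:
    */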
1558    if (!ctx->so->key.msaa) {
1559       switch (sysval) {
1560       case SYSTEM_VALUE_BARYCENTRIC_PERSP_SAMPLE:
1561          sysval = SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL;
1562          break;
1563       case SYSTEM_VALUE_BARYCENTRIC_PERSP_CENTROID:
1564          if (ctx->compiler->gen < 6)
1565             sysval = SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL;
1566          break;
1567       case SYSTEM_VALUE_BARYCENTRIC_LINEAR_SAMPLE:
1568          sysval = SYSTEM_VALUE_BARYCENTRIC_LINEAR_PIXEL;
1569          break;
1570       case SYSTEM_VALUE_BARYCENTRIC_LINEAR_CENTROID:
1571          if (ctx->compiler->gen < 6)
1572             sysval = SYSTEM_VALUE_BARYCENTRIC_LINEAR_PIXEL;
1573          break;
1574       default:
1575          break;
1576       }
1577    }
1578 
1579    enum ir3_bary bary = sysval - SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL;
1580 
1581    struct ir3_instruction *ij = get_barycentric(ctx, bary);
1582    ir3_split_dest(ctx->block, dst, ij, 0, 2);
1583 }
1584 
1585 static struct ir3_instruction *
1586 get_frag_coord(struct ir3_context *ctx, nir_intrinsic_instr *intr)
1587 {
1588    if (!ctx->frag_coord) {
1589       struct ir3_block *b = ctx->in_block;
1590       struct ir3_instruction *xyzw[4];
1591       struct ir3_instruction *hw_frag_coord;
1592 
1593       hw_frag_coord = create_sysval_input(ctx, SYSTEM_VALUE_FRAG_COORD, 0xf);
1594       ir3_split_dest(b, xyzw, hw_frag_coord, 0, 4);
1595 
1596       /* for frag_coord.xy, we get unsigned fixed-point values in
1597        * 1/16th-of-a-pixel units (i.e. four fractional bits).  Convert
1598        * to float and then scale by 1/16 to recover the pixel
1599        * coordinate:
1600        *
1601        *    cov.u32f32 tmp, src
1602        *    mul.f      dst, tmp, (1.0 / 16.0)
1603        *
1604        */
1605       for (int i = 0; i < 2; i++) {
1606          xyzw[i] = ir3_COV(b, xyzw[i], TYPE_U32, TYPE_F32);
1607          xyzw[i] =
1608             ir3_MUL_F(b, xyzw[i], 0, create_immed(b, fui(1.0 / 16.0)), 0);
1609       }
1610 
1611       ctx->frag_coord = ir3_create_collect(b, xyzw, 4);
1612    }
1613 
1614    ctx->so->fragcoord_compmask |= nir_ssa_def_components_read(&intr->dest.ssa);
1615 
1616    return ctx->frag_coord;
1617 }
1618 
1619 static void setup_input(struct ir3_context *ctx, nir_intrinsic_instr *intr);
1620 static void setup_output(struct ir3_context *ctx, nir_intrinsic_instr *intr);
1621 
1622 static void
1623 emit_intrinsic(struct ir3_context *ctx, nir_intrinsic_instr *intr)
1624 {
1625    const nir_intrinsic_info *info = &nir_intrinsic_infos[intr->intrinsic];
1626    struct ir3_instruction **dst;
1627    struct ir3_instruction *const *src;
1628    struct ir3_block *b = ctx->block;
1629    unsigned dest_components = nir_intrinsic_dest_components(intr);
1630    int idx;
1631 
1632    if (info->has_dest) {
1633       dst = ir3_get_dst(ctx, &intr->dest, dest_components);
1634    } else {
1635       dst = NULL;
1636    }
1637 
1638    const struct ir3_const_state *const_state = ir3_const_state(ctx->so);
1639    const unsigned primitive_param = const_state->offsets.primitive_param * 4;
1640    const unsigned primitive_map = const_state->offsets.primitive_map * 4;
1641 
1642    switch (intr->intrinsic) {
1643    case nir_intrinsic_load_uniform:
1644       idx = nir_intrinsic_base(intr);
1645       if (nir_src_is_const(intr->src[0])) {
1646          idx += nir_src_as_uint(intr->src[0]);
1647          for (int i = 0; i < dest_components; i++) {
1648             dst[i] = create_uniform_typed(
1649                b, idx + i,
1650                nir_dest_bit_size(intr->dest) == 16 ? TYPE_F16 : TYPE_F32);
1651          }
1652       } else {
1653          src = ir3_get_src(ctx, &intr->src[0]);
1654          for (int i = 0; i < dest_components; i++) {
1655             dst[i] = create_uniform_indirect(
1656                b, idx + i,
1657                nir_dest_bit_size(intr->dest) == 16 ? TYPE_F16 : TYPE_F32,
1658                ir3_get_addr0(ctx, src[0], 1));
1659          }
1660          /* NOTE: if relative addressing is used, we set
1661           * constlen in the compiler (to worst-case value)
1662           * since we don't know in the assembler what the max
1663           * addr reg value can be:
1664           */
1665          ctx->so->constlen =
1666             MAX2(ctx->so->constlen, const_state->ubo_state.size / 16);
1667       }
1668       break;
1669 
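   /* The tess/geom related *_ir3 intrinsics below read fixed offsets in
    * the primitive_param const block (strides, patch-vertex count, and
    * tess param/factor buffer bases):
    */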
1670    case nir_intrinsic_load_vs_primitive_stride_ir3:
1671       dst[0] = create_uniform(b, primitive_param + 0);
1672       break;
1673    case nir_intrinsic_load_vs_vertex_stride_ir3:
1674       dst[0] = create_uniform(b, primitive_param + 1);
1675       break;
1676    case nir_intrinsic_load_hs_patch_stride_ir3:
1677       dst[0] = create_uniform(b, primitive_param + 2);
1678       break;
1679    case nir_intrinsic_load_patch_vertices_in:
1680       dst[0] = create_uniform(b, primitive_param + 3);
1681       break;
1682    case nir_intrinsic_load_tess_param_base_ir3:
1683       dst[0] = create_uniform(b, primitive_param + 4);
1684       dst[1] = create_uniform(b, primitive_param + 5);
1685       break;
1686    case nir_intrinsic_load_tess_factor_base_ir3:
1687       dst[0] = create_uniform(b, primitive_param + 6);
1688       dst[1] = create_uniform(b, primitive_param + 7);
1689       break;
1690 
1691    case nir_intrinsic_load_primitive_location_ir3:
1692       idx = nir_intrinsic_driver_location(intr);
1693       dst[0] = create_uniform(b, primitive_map + idx);
1694       break;
1695 
1696    case nir_intrinsic_load_gs_header_ir3:
1697       dst[0] = ctx->gs_header;
1698       break;
1699    case nir_intrinsic_load_tcs_header_ir3:
1700       dst[0] = ctx->tcs_header;
1701       break;
1702 
1703    case nir_intrinsic_load_rel_patch_id_ir3:
1704       dst[0] = ctx->rel_patch_id;
1705       break;
1706 
1707    case nir_intrinsic_load_primitive_id:
1708       if (!ctx->primitive_id) {
1709          ctx->primitive_id =
1710             create_sysval_input(ctx, SYSTEM_VALUE_PRIMITIVE_ID, 0x1);
1711       }
1712       dst[0] = ctx->primitive_id;
1713       break;
1714 
1715    case nir_intrinsic_load_tess_coord:
1716       if (!ctx->tess_coord) {
1717          ctx->tess_coord =
1718             create_sysval_input(ctx, SYSTEM_VALUE_TESS_COORD, 0x3);
1719       }
1720       ir3_split_dest(b, dst, ctx->tess_coord, 0, 2);
1721 
1722       /* Unused, but ir3_put_dst() below wants to free something */
1723       dst[2] = create_immed(b, 0);
1724       break;
1725 
1726    case nir_intrinsic_end_patch_ir3:
1727       assert(ctx->so->type == MESA_SHADER_TESS_CTRL);
1728       struct ir3_instruction *end = ir3_PREDE(b);
1729       array_insert(b, b->keeps, end);
1730 
1731       end->barrier_class = IR3_BARRIER_EVERYTHING;
1732       end->barrier_conflict = IR3_BARRIER_EVERYTHING;
1733       break;
1734 
1735    case nir_intrinsic_store_global_ir3:
1736       ctx->funcs->emit_intrinsic_store_global_ir3(ctx, intr);
1737       break;
1738    case nir_intrinsic_load_global_ir3:
1739       ctx->funcs->emit_intrinsic_load_global_ir3(ctx, intr, dst);
1740       break;
1741 
1742    case nir_intrinsic_load_ubo:
1743       emit_intrinsic_load_ubo(ctx, intr, dst);
1744       break;
1745    case nir_intrinsic_load_ubo_vec4:
1746       emit_intrinsic_load_ubo_ldc(ctx, intr, dst);
1747       break;
1748    case nir_intrinsic_load_frag_coord:
1749       ir3_split_dest(b, dst, get_frag_coord(ctx, intr), 0, 4);
1750       break;
1751    case nir_intrinsic_load_sample_pos_from_id: {
1752       /* NOTE: blob seems to always use TYPE_F16 and then cov.f16f32,
1753        * but that doesn't seem necessary.
1754        */
1755       struct ir3_instruction *offset =
1756          ir3_RGETPOS(b, ir3_get_src(ctx, &intr->src[0])[0], 0);
1757       offset->dsts[0]->wrmask = 0x3;
1758       offset->cat5.type = TYPE_F32;
1759 
1760       ir3_split_dest(b, dst, offset, 0, 2);
1761 
1762       break;
1763    }
1764    case nir_intrinsic_load_size_ir3:
1765       if (!ctx->ij[IJ_PERSP_SIZE]) {
1766          ctx->ij[IJ_PERSP_SIZE] =
1767             create_sysval_input(ctx, SYSTEM_VALUE_BARYCENTRIC_PERSP_SIZE, 0x1);
1768       }
1769       dst[0] = ctx->ij[IJ_PERSP_SIZE];
1770       break;
1771    case nir_intrinsic_load_barycentric_centroid:
1772    case nir_intrinsic_load_barycentric_sample:
1773    case nir_intrinsic_load_barycentric_pixel:
1774       emit_intrinsic_barycentric(ctx, intr, dst);
1775       break;
1776    case nir_intrinsic_load_interpolated_input:
1777    case nir_intrinsic_load_input:
1778       setup_input(ctx, intr);
1779       break;
1780    /* All SSBO intrinsics should have been lowered by the
1781     * 'lower_io_offsets' pass and replaced by an ir3-specific version
1782     * that adds the dword-offset in the last source.
1783     */
1784    case nir_intrinsic_load_ssbo_ir3:
1785       ctx->funcs->emit_intrinsic_load_ssbo(ctx, intr, dst);
1786       break;
1787    case nir_intrinsic_store_ssbo_ir3:
1788       if ((ctx->so->type == MESA_SHADER_FRAGMENT) &&
1789           !ctx->s->info.fs.early_fragment_tests)
1790          ctx->so->no_earlyz = true;
1791       ctx->funcs->emit_intrinsic_store_ssbo(ctx, intr);
1792       break;
1793    case nir_intrinsic_get_ssbo_size:
1794       emit_intrinsic_ssbo_size(ctx, intr, dst);
1795       break;
1796    case nir_intrinsic_ssbo_atomic_add_ir3:
1797    case nir_intrinsic_ssbo_atomic_imin_ir3:
1798    case nir_intrinsic_ssbo_atomic_umin_ir3:
1799    case nir_intrinsic_ssbo_atomic_imax_ir3:
1800    case nir_intrinsic_ssbo_atomic_umax_ir3:
1801    case nir_intrinsic_ssbo_atomic_and_ir3:
1802    case nir_intrinsic_ssbo_atomic_or_ir3:
1803    case nir_intrinsic_ssbo_atomic_xor_ir3:
1804    case nir_intrinsic_ssbo_atomic_exchange_ir3:
1805    case nir_intrinsic_ssbo_atomic_comp_swap_ir3:
1806       if ((ctx->so->type == MESA_SHADER_FRAGMENT) &&
1807           !ctx->s->info.fs.early_fragment_tests)
1808          ctx->so->no_earlyz = true;
1809       dst[0] = ctx->funcs->emit_intrinsic_atomic_ssbo(ctx, intr);
1810       break;
1811    case nir_intrinsic_load_shared:
1812       emit_intrinsic_load_shared(ctx, intr, dst);
1813       break;
1814    case nir_intrinsic_store_shared:
1815       emit_intrinsic_store_shared(ctx, intr);
1816       break;
1817    case nir_intrinsic_shared_atomic_add:
1818    case nir_intrinsic_shared_atomic_imin:
1819    case nir_intrinsic_shared_atomic_umin:
1820    case nir_intrinsic_shared_atomic_imax:
1821    case nir_intrinsic_shared_atomic_umax:
1822    case nir_intrinsic_shared_atomic_and:
1823    case nir_intrinsic_shared_atomic_or:
1824    case nir_intrinsic_shared_atomic_xor:
1825    case nir_intrinsic_shared_atomic_exchange:
1826    case nir_intrinsic_shared_atomic_comp_swap:
1827       dst[0] = emit_intrinsic_atomic_shared(ctx, intr);
1828       break;
1829    case nir_intrinsic_load_scratch:
1830       emit_intrinsic_load_scratch(ctx, intr, dst);
1831       break;
1832    case nir_intrinsic_store_scratch:
1833       emit_intrinsic_store_scratch(ctx, intr);
1834       break;
1835    case nir_intrinsic_image_load:
1836       emit_intrinsic_load_image(ctx, intr, dst);
1837       break;
1838    case nir_intrinsic_bindless_image_load:
1839       /* Bindless uses the IBO state, which doesn't have swizzle filled out,
1840        * so using isam doesn't work.
1841        *
1842        * TODO: can we use isam if we fill out more fields?
1843        */
1844       ctx->funcs->emit_intrinsic_load_image(ctx, intr, dst);
1845       break;
1846    case nir_intrinsic_image_store:
1847    case nir_intrinsic_bindless_image_store:
1848       if ((ctx->so->type == MESA_SHADER_FRAGMENT) &&
1849           !ctx->s->info.fs.early_fragment_tests)
1850          ctx->so->no_earlyz = true;
1851       ctx->funcs->emit_intrinsic_store_image(ctx, intr);
1852       break;
1853    case nir_intrinsic_image_size:
1854    case nir_intrinsic_bindless_image_size:
1855       ctx->funcs->emit_intrinsic_image_size(ctx, intr, dst);
1856       break;
1857    case nir_intrinsic_image_atomic_add:
1858    case nir_intrinsic_bindless_image_atomic_add:
1859    case nir_intrinsic_image_atomic_imin:
1860    case nir_intrinsic_bindless_image_atomic_imin:
1861    case nir_intrinsic_image_atomic_umin:
1862    case nir_intrinsic_bindless_image_atomic_umin:
1863    case nir_intrinsic_image_atomic_imax:
1864    case nir_intrinsic_bindless_image_atomic_imax:
1865    case nir_intrinsic_image_atomic_umax:
1866    case nir_intrinsic_bindless_image_atomic_umax:
1867    case nir_intrinsic_image_atomic_and:
1868    case nir_intrinsic_bindless_image_atomic_and:
1869    case nir_intrinsic_image_atomic_or:
1870    case nir_intrinsic_bindless_image_atomic_or:
1871    case nir_intrinsic_image_atomic_xor:
1872    case nir_intrinsic_bindless_image_atomic_xor:
1873    case nir_intrinsic_image_atomic_exchange:
1874    case nir_intrinsic_bindless_image_atomic_exchange:
1875    case nir_intrinsic_image_atomic_comp_swap:
1876    case nir_intrinsic_bindless_image_atomic_comp_swap:
1877       if ((ctx->so->type == MESA_SHADER_FRAGMENT) &&
1878           !ctx->s->info.fs.early_fragment_tests)
1879          ctx->so->no_earlyz = true;
1880       dst[0] = ctx->funcs->emit_intrinsic_atomic_image(ctx, intr);
1881       break;
1882    case nir_intrinsic_scoped_barrier:
1883    case nir_intrinsic_control_barrier:
1884    case nir_intrinsic_memory_barrier:
1885    case nir_intrinsic_group_memory_barrier:
1886    case nir_intrinsic_memory_barrier_buffer:
1887    case nir_intrinsic_memory_barrier_image:
1888    case nir_intrinsic_memory_barrier_shared:
1889    case nir_intrinsic_memory_barrier_tcs_patch:
1890       emit_intrinsic_barrier(ctx, intr);
1891       /* note that blk ptr no longer valid, make that obvious: */
1892       b = NULL;
1893       break;
1894    case nir_intrinsic_store_output:
1895       setup_output(ctx, intr);
1896       break;
1897    case nir_intrinsic_load_base_vertex:
1898    case nir_intrinsic_load_first_vertex:
1899       if (!ctx->basevertex) {
1900          ctx->basevertex = create_driver_param(ctx, IR3_DP_VTXID_BASE);
1901       }
1902       dst[0] = ctx->basevertex;
1903       break;
1904    case nir_intrinsic_load_draw_id:
1905       if (!ctx->draw_id) {
1906          ctx->draw_id = create_driver_param(ctx, IR3_DP_DRAWID);
1907       }
1908       dst[0] = ctx->draw_id;
1909       break;
1910    case nir_intrinsic_load_base_instance:
1911       if (!ctx->base_instance) {
1912          ctx->base_instance = create_driver_param(ctx, IR3_DP_INSTID_BASE);
1913       }
1914       dst[0] = ctx->base_instance;
1915       break;
1916    case nir_intrinsic_load_view_index:
1917       if (!ctx->view_index) {
1918          ctx->view_index =
1919             create_sysval_input(ctx, SYSTEM_VALUE_VIEW_INDEX, 0x1);
1920       }
1921       dst[0] = ctx->view_index;
1922       break;
1923    case nir_intrinsic_load_vertex_id_zero_base:
1924    case nir_intrinsic_load_vertex_id:
1925       if (!ctx->vertex_id) {
1926          gl_system_value sv = (intr->intrinsic == nir_intrinsic_load_vertex_id)
1927                                  ? SYSTEM_VALUE_VERTEX_ID
1928                                  : SYSTEM_VALUE_VERTEX_ID_ZERO_BASE;
1929          ctx->vertex_id = create_sysval_input(ctx, sv, 0x1);
1930       }
1931       dst[0] = ctx->vertex_id;
1932       break;
1933    case nir_intrinsic_load_instance_id:
1934       if (!ctx->instance_id) {
1935          ctx->instance_id =
1936             create_sysval_input(ctx, SYSTEM_VALUE_INSTANCE_ID, 0x1);
1937       }
1938       dst[0] = ctx->instance_id;
1939       break;
1940    case nir_intrinsic_load_sample_id:
1941       ctx->so->per_samp = true;
1942       FALLTHROUGH;
1943    case nir_intrinsic_load_sample_id_no_per_sample:
1944       if (!ctx->samp_id) {
1945          ctx->samp_id = create_sysval_input(ctx, SYSTEM_VALUE_SAMPLE_ID, 0x1);
1946          ctx->samp_id->dsts[0]->flags |= IR3_REG_HALF;
1947       }
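      /* the sysval arrives in a half register; widen it to 32b: */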
1948       dst[0] = ir3_COV(b, ctx->samp_id, TYPE_U16, TYPE_U32);
1949       break;
1950    case nir_intrinsic_load_sample_mask_in:
1951       if (!ctx->samp_mask_in) {
1952          ctx->samp_mask_in =
1953             create_sysval_input(ctx, SYSTEM_VALUE_SAMPLE_MASK_IN, 0x1);
1954       }
1955       dst[0] = ctx->samp_mask_in;
1956       break;
1957    case nir_intrinsic_load_user_clip_plane:
1958       idx = nir_intrinsic_ucp_id(intr);
1959       for (int i = 0; i < dest_components; i++) {
1960          unsigned n = idx * 4 + i;
1961          dst[i] = create_driver_param(ctx, IR3_DP_UCP0_X + n);
1962       }
1963       break;
1964    case nir_intrinsic_load_front_face:
1965       if (!ctx->frag_face) {
1966          ctx->so->frag_face = true;
1967          ctx->frag_face =
1968             create_sysval_input(ctx, SYSTEM_VALUE_FRONT_FACE, 0x1);
1969          ctx->frag_face->dsts[0]->flags |= IR3_REG_HALF;
1970       }
1971       /* for fragface, we get -1 for back and 0 for front. However this is
1972        * the inverse of what nir expects (where ~0 is true).
1973        */
1974       dst[0] = ir3_CMPS_S(b, ctx->frag_face, 0,
1975                           create_immed_typed(b, 0, TYPE_U16), 0);
1976       dst[0]->cat2.condition = IR3_COND_EQ;
1977       break;
1978    case nir_intrinsic_load_local_invocation_id:
1979       if (!ctx->local_invocation_id) {
1980          ctx->local_invocation_id =
1981             create_sysval_input(ctx, SYSTEM_VALUE_LOCAL_INVOCATION_ID, 0x7);
1982       }
1983       ir3_split_dest(b, dst, ctx->local_invocation_id, 0, 3);
1984       break;
1985    case nir_intrinsic_load_workgroup_id:
1986    case nir_intrinsic_load_workgroup_id_zero_base:
1987       if (!ctx->work_group_id) {
1988          ctx->work_group_id =
1989             create_sysval_input(ctx, SYSTEM_VALUE_WORKGROUP_ID, 0x7);
1990          ctx->work_group_id->dsts[0]->flags |= IR3_REG_SHARED;
1991       }
1992       ir3_split_dest(b, dst, ctx->work_group_id, 0, 3);
1993       break;
1994    case nir_intrinsic_load_base_workgroup_id:
1995       for (int i = 0; i < dest_components; i++) {
1996          dst[i] = create_driver_param(ctx, IR3_DP_BASE_GROUP_X + i);
1997       }
1998       break;
1999    case nir_intrinsic_load_num_workgroups:
2000       for (int i = 0; i < dest_components; i++) {
2001          dst[i] = create_driver_param(ctx, IR3_DP_NUM_WORK_GROUPS_X + i);
2002       }
2003       break;
2004    case nir_intrinsic_load_workgroup_size:
2005       for (int i = 0; i < dest_components; i++) {
2006          dst[i] = create_driver_param(ctx, IR3_DP_LOCAL_GROUP_SIZE_X + i);
2007       }
2008       break;
2009    case nir_intrinsic_load_subgroup_size:
2010       dst[0] = create_driver_param(ctx, IR3_DP_SUBGROUP_SIZE);
2011       break;
2012    case nir_intrinsic_load_subgroup_id_shift_ir3:
2013       dst[0] = create_driver_param(ctx, IR3_DP_SUBGROUP_ID_SHIFT);
2014       break;
2015    case nir_intrinsic_discard_if:
2016    case nir_intrinsic_discard:
2017    case nir_intrinsic_demote:
2018    case nir_intrinsic_demote_if:
2019    case nir_intrinsic_terminate:
2020    case nir_intrinsic_terminate_if: {
2021       struct ir3_instruction *cond, *kill;
2022 
2023       if (intr->intrinsic == nir_intrinsic_discard_if ||
2024           intr->intrinsic == nir_intrinsic_demote_if ||
2025           intr->intrinsic == nir_intrinsic_terminate_if) {
2026          /* conditional discard: */
2027          src = ir3_get_src(ctx, &intr->src[0]);
2028          cond = src[0];
2029       } else {
2030          /* unconditional discard: */
2031          cond = create_immed(b, 1);
2032       }
2033 
2034       /* NOTE: only cmps.*.* can write p0.x: */
2035       cond = ir3_CMPS_S(b, cond, 0, create_immed(b, 0), 0);
2036       cond->cat2.condition = IR3_COND_NE;
2037 
2038       /* condition always goes in predicate register: */
2039       cond->dsts[0]->num = regid(REG_P0, 0);
2040       cond->dsts[0]->flags &= ~IR3_REG_SSA;
2041 
2042       if (intr->intrinsic == nir_intrinsic_demote ||
2043           intr->intrinsic == nir_intrinsic_demote_if) {
2044          kill = ir3_DEMOTE(b, cond, 0);
2045       } else {
2046          kill = ir3_KILL(b, cond, 0);
2047       }
2048 
2049       /* Side-effects should not be moved on a different side of the kill */
2050       kill->barrier_class = IR3_BARRIER_IMAGE_W | IR3_BARRIER_BUFFER_W;
2051       kill->barrier_conflict = IR3_BARRIER_IMAGE_W | IR3_BARRIER_BUFFER_W;
2052       kill->srcs[0]->num = regid(REG_P0, 0);
2053       array_insert(ctx->ir, ctx->ir->predicates, kill);
2054 
2055       array_insert(b, b->keeps, kill);
2056       ctx->so->has_kill = true;
2057 
2058       break;
2059    }
2060 
2061    case nir_intrinsic_cond_end_ir3: {
2062       struct ir3_instruction *cond, *kill;
2063 
2064       src = ir3_get_src(ctx, &intr->src[0]);
2065       cond = src[0];
2066 
2067       /* NOTE: only cmps.*.* can write p0.x: */
2068       cond = ir3_CMPS_S(b, cond, 0, create_immed(b, 0), 0);
2069       cond->cat2.condition = IR3_COND_NE;
2070 
2071       /* condition always goes in predicate register: */
2072       cond->dsts[0]->num = regid(REG_P0, 0);
2073 
2074       kill = ir3_PREDT(b, cond, 0);
2075 
2076       kill->barrier_class = IR3_BARRIER_EVERYTHING;
2077       kill->barrier_conflict = IR3_BARRIER_EVERYTHING;
2078 
2079       array_insert(ctx->ir, ctx->ir->predicates, kill);
2080       array_insert(b, b->keeps, kill);
2081       break;
2082    }
2083 
2084    case nir_intrinsic_vote_any:
2085    case nir_intrinsic_vote_all: {
2086       struct ir3_instruction *src = ir3_get_src(ctx, &intr->src[0])[0];
2087       struct ir3_instruction *pred = ir3_get_predicate(ctx, src);
2088       if (intr->intrinsic == nir_intrinsic_vote_any)
2089          dst[0] = ir3_ANY_MACRO(ctx->block, pred, 0);
2090       else
2091          dst[0] = ir3_ALL_MACRO(ctx->block, pred, 0);
2092       dst[0]->srcs[0]->num = regid(REG_P0, 0);
2093       array_insert(ctx->ir, ctx->ir->predicates, dst[0]);
2094       break;
2095    }
2096    case nir_intrinsic_elect:
2097       dst[0] = ir3_ELECT_MACRO(ctx->block);
2098       /* This may expand to a divergent if/then, so allocate stack space for
2099        * it.
2100        */
2101       ctx->max_stack = MAX2(ctx->max_stack, ctx->stack + 1);
2102       break;
2103 
2104    case nir_intrinsic_read_invocation_cond_ir3: {
2105       struct ir3_instruction *src = ir3_get_src(ctx, &intr->src[0])[0];
2106       struct ir3_instruction *cond = ir3_get_src(ctx, &intr->src[1])[0];
2107       dst[0] = ir3_READ_COND_MACRO(ctx->block, ir3_get_predicate(ctx, cond), 0,
2108                                    src, 0);
2109       dst[0]->dsts[0]->flags |= IR3_REG_SHARED;
2110       dst[0]->srcs[0]->num = regid(REG_P0, 0);
2111       array_insert(ctx->ir, ctx->ir->predicates, dst[0]);
2112       ctx->max_stack = MAX2(ctx->max_stack, ctx->stack + 1);
2113       break;
2114    }
2115 
2116    case nir_intrinsic_read_first_invocation: {
2117       struct ir3_instruction *src = ir3_get_src(ctx, &intr->src[0])[0];
2118       dst[0] = ir3_READ_FIRST_MACRO(ctx->block, src, 0);
2119       dst[0]->dsts[0]->flags |= IR3_REG_SHARED;
2120       ctx->max_stack = MAX2(ctx->max_stack, ctx->stack + 1);
2121       break;
2122    }
2123 
2124    case nir_intrinsic_ballot: {
2125       struct ir3_instruction *ballot;
2126       unsigned components = intr->dest.ssa.num_components;
2127       if (nir_src_is_const(intr->src[0]) && nir_src_as_bool(intr->src[0])) {
2128          /* ballot(true) is just MOVMSK */
2129          ballot = ir3_MOVMSK(ctx->block, components);
2130       } else {
2131          struct ir3_instruction *src = ir3_get_src(ctx, &intr->src[0])[0];
2132          struct ir3_instruction *pred = ir3_get_predicate(ctx, src);
2133          ballot = ir3_BALLOT_MACRO(ctx->block, pred, components);
2134          ballot->srcs[0]->num = regid(REG_P0, 0);
2135          array_insert(ctx->ir, ctx->ir->predicates, ballot);
2136          ctx->max_stack = MAX2(ctx->max_stack, ctx->stack + 1);
2137       }
2138       ir3_split_dest(ctx->block, dst, ballot, 0, components);
2139       break;
2140    }
2141 
2142    case nir_intrinsic_load_shared_ir3:
2143       emit_intrinsic_load_shared_ir3(ctx, intr, dst);
2144       break;
2145    case nir_intrinsic_store_shared_ir3:
2146       emit_intrinsic_store_shared_ir3(ctx, intr);
2147       break;
2148    case nir_intrinsic_bindless_resource_ir3:
2149       dst[0] = ir3_get_src(ctx, &intr->src[0])[0];
2150       break;
2151    default:
2152       ir3_context_error(ctx, "Unhandled intrinsic type: %s\n",
2153                         nir_intrinsic_infos[intr->intrinsic].name);
2154       break;
2155    }
2156 
2157    if (info->has_dest)
2158       ir3_put_dst(ctx, &intr->dest);
2159 }
2160 
2161 static void
2162 emit_load_const(struct ir3_context *ctx, nir_load_const_instr *instr)
2163 {
2164    struct ir3_instruction **dst =
2165       ir3_get_dst_ssa(ctx, &instr->def, instr->def.num_components);
2166 
2167    if (instr->def.bit_size == 16) {
2168       for (int i = 0; i < instr->def.num_components; i++)
2169          dst[i] = create_immed_typed(ctx->block, instr->value[i].u16, TYPE_U16);
2170    } else {
2171       for (int i = 0; i < instr->def.num_components; i++)
2172          dst[i] = create_immed_typed(ctx->block, instr->value[i].u32, TYPE_U32);
2173    }
2174 }
2175 
2176 static void
2177 emit_undef(struct ir3_context *ctx, nir_ssa_undef_instr *undef)
2178 {
2179    struct ir3_instruction **dst =
2180       ir3_get_dst_ssa(ctx, &undef->def, undef->def.num_components);
2181    type_t type = (undef->def.bit_size == 16) ? TYPE_U16 : TYPE_U32;
2182 
2183    /* backend doesn't want undefined instructions, so just plug
2184     * in 0.0..
2185     */
2186    for (int i = 0; i < undef->def.num_components; i++)
2187       dst[i] = create_immed_typed(ctx->block, fui(0.0), type);
2188 }
2189 
2190 /*
2191  * texture fetch/sample instructions:
2192  */
2193 
2194 static type_t
2195 get_tex_dest_type(nir_tex_instr *tex)
2196 {
2197    type_t type;
2198 
2199    switch (tex->dest_type) {
2200    case nir_type_float32:
2201       return TYPE_F32;
2202    case nir_type_float16:
2203       return TYPE_F16;
2204    case nir_type_int32:
2205       return TYPE_S32;
2206    case nir_type_int16:
2207       return TYPE_S16;
2208    case nir_type_bool32:
2209    case nir_type_uint32:
2210       return TYPE_U32;
2211    case nir_type_bool16:
2212    case nir_type_uint16:
2213       return TYPE_U16;
2214    case nir_type_invalid:
2215    default:
2216       unreachable("bad dest_type");
2217    }
2218 
2219    return type;
2220 }
2221 
2222 static void
2223 tex_info(nir_tex_instr *tex, unsigned *flagsp, unsigned *coordsp)
2224 {
2225    unsigned coords =
2226       glsl_get_sampler_dim_coordinate_components(tex->sampler_dim);
2227    unsigned flags = 0;
2228 
2229    /* note: would use tex->coord_components.. except txs.. also,
2230     * since array index goes after shadow ref, we don't want to
2231     * count it:
2232     */
2233    if (coords == 3)
2234       flags |= IR3_INSTR_3D;
2235 
2236    if (tex->is_shadow && tex->op != nir_texop_lod)
2237       flags |= IR3_INSTR_S;
2238 
2239    if (tex->is_array && tex->op != nir_texop_lod)
2240       flags |= IR3_INSTR_A;
2241 
2242    *flagsp = flags;
2243    *coordsp = coords;
2244 }
2245 
2246 /* Gets the sampler/texture idx as a hvec2.  This can be either dynamic
2247  * or immediate (in which case it will get lowered later to a non-.s2en
2248  * version of the tex instruction, which encodes tex/samp as immediates):
2249  */
2250 static struct tex_src_info
2251 get_tex_samp_tex_src(struct ir3_context *ctx, nir_tex_instr *tex)
2252 {
2253    struct ir3_block *b = ctx->block;
2254    struct tex_src_info info = {0};
2255    int texture_idx = nir_tex_instr_src_index(tex, nir_tex_src_texture_handle);
2256    int sampler_idx = nir_tex_instr_src_index(tex, nir_tex_src_sampler_handle);
2257    struct ir3_instruction *texture, *sampler;
2258 
2259    if (texture_idx >= 0 || sampler_idx >= 0) {
2260       /* Bindless case */
2261       info.flags |= IR3_INSTR_B;
2262 
2263       if (tex->texture_non_uniform || tex->sampler_non_uniform)
2264          info.flags |= IR3_INSTR_NONUNIF;
2265 
2266       /* Gather information required to determine which encoding to
2267        * choose as well as for prefetch.
2268        */
2269       nir_intrinsic_instr *bindless_tex = NULL;
2270       bool tex_const;
2271       if (texture_idx >= 0) {
2272          ctx->so->bindless_tex = true;
2273          bindless_tex = ir3_bindless_resource(tex->src[texture_idx].src);
2274          assert(bindless_tex);
2275          info.tex_base = nir_intrinsic_desc_set(bindless_tex);
2276          tex_const = nir_src_is_const(bindless_tex->src[0]);
2277          if (tex_const)
2278             info.tex_idx = nir_src_as_uint(bindless_tex->src[0]);
2279       } else {
2280          /* To simplify some of the logic below, assume the index is
2281           * constant 0 when it's not enabled.
2282           */
2283          tex_const = true;
2284          info.tex_idx = 0;
2285       }
2286       nir_intrinsic_instr *bindless_samp = NULL;
2287       bool samp_const;
2288       if (sampler_idx >= 0) {
2289          ctx->so->bindless_samp = true;
2290          bindless_samp = ir3_bindless_resource(tex->src[sampler_idx].src);
2291          assert(bindless_samp);
2292          info.samp_base = nir_intrinsic_desc_set(bindless_samp);
2293          samp_const = nir_src_is_const(bindless_samp->src[0]);
2294          if (samp_const)
2295             info.samp_idx = nir_src_as_uint(bindless_samp->src[0]);
2296       } else {
2297          samp_const = true;
2298          info.samp_idx = 0;
2299       }
2300 
2301       /* Choose encoding. */
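      /* If both indices are small constants and the bases match, everything
       * fits directly in the instruction encoding; otherwise constant
       * indices up to 255 spill into a1.x via A1EN, and anything else falls
       * back to the fully indirect .s2en form below.
       */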
2302       if (tex_const && samp_const && info.tex_idx < 256 &&
2303           info.samp_idx < 256) {
2304          if (info.tex_idx < 16 && info.samp_idx < 16 &&
2305              (!bindless_tex || !bindless_samp ||
2306               info.tex_base == info.samp_base)) {
2307             /* Everything fits within the instruction */
2308             info.base = info.tex_base;
2309             info.combined_idx = info.samp_idx | (info.tex_idx << 4);
2310          } else {
2311             info.base = info.tex_base;
2312             info.a1_val = info.tex_idx << 3 | info.samp_base;
2313             info.combined_idx = info.samp_idx;
2314             info.flags |= IR3_INSTR_A1EN;
2315          }
2316          info.samp_tex = NULL;
2317       } else {
2318          info.flags |= IR3_INSTR_S2EN;
2319          /* In the indirect case, we only use a1.x to store the sampler
2320           * base if it differs from the texture base.
2321           */
2322          if (!bindless_tex || !bindless_samp ||
2323              info.tex_base == info.samp_base) {
2324             info.base = info.tex_base;
2325          } else {
2326             info.base = info.tex_base;
2327             info.a1_val = info.samp_base;
2328             info.flags |= IR3_INSTR_A1EN;
2329          }
2330 
2331          /* Note: the indirect source is now a vec2 instead of hvec2, and
2332           * for some reason the texture and sampler are swapped.
2333           */
2334          struct ir3_instruction *texture, *sampler;
2335 
2336          if (bindless_tex) {
2337             texture = ir3_get_src(ctx, &tex->src[texture_idx].src)[0];
2338          } else {
2339             texture = create_immed(b, 0);
2340          }
2341 
2342          if (bindless_samp) {
2343             sampler = ir3_get_src(ctx, &tex->src[sampler_idx].src)[0];
2344          } else {
2345             sampler = create_immed(b, 0);
2346          }
2347          info.samp_tex = ir3_collect(b, texture, sampler);
2348       }
2349    } else {
2350       info.flags |= IR3_INSTR_S2EN;
2351       texture_idx = nir_tex_instr_src_index(tex, nir_tex_src_texture_offset);
2352       sampler_idx = nir_tex_instr_src_index(tex, nir_tex_src_sampler_offset);
2353       if (texture_idx >= 0) {
2354          texture = ir3_get_src(ctx, &tex->src[texture_idx].src)[0];
2355          texture = ir3_COV(ctx->block, texture, TYPE_U32, TYPE_U16);
2356       } else {
2357          /* TODO what to do for dynamic case? I guess we only need the
2358           * max index for astc srgb workaround so maybe not a problem
2359           * to worry about if we don't enable indirect samplers for
2360           * a4xx?
2361           */
2362          ctx->max_texture_index =
2363             MAX2(ctx->max_texture_index, tex->texture_index);
2364          texture = create_immed_typed(ctx->block, tex->texture_index, TYPE_U16);
2365          info.tex_idx = tex->texture_index;
2366       }
2367 
2368       if (sampler_idx >= 0) {
2369          sampler = ir3_get_src(ctx, &tex->src[sampler_idx].src)[0];
2370          sampler = ir3_COV(ctx->block, sampler, TYPE_U32, TYPE_U16);
2371       } else {
2372          sampler = create_immed_typed(ctx->block, tex->sampler_index, TYPE_U16);
2373          info.samp_idx = tex->sampler_index;
2374       }
2375 
2376       info.samp_tex = ir3_collect(b, sampler, texture);
2377    }
2378 
2379    return info;
2380 }
2381 
2382 static void
2383 emit_tex(struct ir3_context *ctx, nir_tex_instr *tex)
2384 {
2385    struct ir3_block *b = ctx->block;
2386    struct ir3_instruction **dst, *sam, *src0[12], *src1[4];
2387    struct ir3_instruction *const *coord, *const *off, *const *ddx, *const *ddy;
2388    struct ir3_instruction *lod, *compare, *proj, *sample_index;
2389    struct tex_src_info info = {0};
2390    bool has_bias = false, has_lod = false, has_proj = false, has_off = false;
2391    unsigned i, coords, flags, ncomp;
2392    unsigned nsrc0 = 0, nsrc1 = 0;
2393    type_t type;
2394    opc_t opc = 0;
2395 
2396    ncomp = nir_dest_num_components(tex->dest);
2397 
2398    coord = off = ddx = ddy = NULL;
2399    lod = proj = compare = sample_index = NULL;
2400 
2401    dst = ir3_get_dst(ctx, &tex->dest, ncomp);
2402 
2403    for (unsigned i = 0; i < tex->num_srcs; i++) {
2404       switch (tex->src[i].src_type) {
2405       case nir_tex_src_coord:
2406          coord = ir3_get_src(ctx, &tex->src[i].src);
2407          break;
2408       case nir_tex_src_bias:
2409          lod = ir3_get_src(ctx, &tex->src[i].src)[0];
2410          has_bias = true;
2411          break;
2412       case nir_tex_src_lod:
2413          lod = ir3_get_src(ctx, &tex->src[i].src)[0];
2414          has_lod = true;
2415          break;
2416       case nir_tex_src_comparator: /* shadow comparator */
2417          compare = ir3_get_src(ctx, &tex->src[i].src)[0];
2418          break;
2419       case nir_tex_src_projector:
2420          proj = ir3_get_src(ctx, &tex->src[i].src)[0];
2421          has_proj = true;
2422          break;
2423       case nir_tex_src_offset:
2424          off = ir3_get_src(ctx, &tex->src[i].src);
2425          has_off = true;
2426          break;
2427       case nir_tex_src_ddx:
2428          ddx = ir3_get_src(ctx, &tex->src[i].src);
2429          break;
2430       case nir_tex_src_ddy:
2431          ddy = ir3_get_src(ctx, &tex->src[i].src);
2432          break;
2433       case nir_tex_src_ms_index:
2434          sample_index = ir3_get_src(ctx, &tex->src[i].src)[0];
2435          break;
2436       case nir_tex_src_texture_offset:
2437       case nir_tex_src_sampler_offset:
2438       case nir_tex_src_texture_handle:
2439       case nir_tex_src_sampler_handle:
2440          /* handled in get_tex_samp_tex_src() */
2441          break;
2442       default:
2443          ir3_context_error(ctx, "Unhandled NIR tex src type: %d\n",
2444                            tex->src[i].src_type);
2445          return;
2446       }
2447    }
2448 
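   /* map the NIR texop onto the hw sample opcode (for plain nir_texop_tex
    * the presence of an explicit lod selects SAML over SAM):
    */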
2449    switch (tex->op) {
2450    case nir_texop_tex_prefetch:
2451       compile_assert(ctx, !has_bias);
2452       compile_assert(ctx, !has_lod);
2453       compile_assert(ctx, !compare);
2454       compile_assert(ctx, !has_proj);
2455       compile_assert(ctx, !has_off);
2456       compile_assert(ctx, !ddx);
2457       compile_assert(ctx, !ddy);
2458       compile_assert(ctx, !sample_index);
2459       compile_assert(
2460          ctx, nir_tex_instr_src_index(tex, nir_tex_src_texture_offset) < 0);
2461       compile_assert(
2462          ctx, nir_tex_instr_src_index(tex, nir_tex_src_sampler_offset) < 0);
2463 
2464       if (ctx->so->num_sampler_prefetch < ctx->prefetch_limit) {
2465          opc = OPC_META_TEX_PREFETCH;
2466          ctx->so->num_sampler_prefetch++;
2467          break;
2468       }
2469       FALLTHROUGH;
2470    case nir_texop_tex:
2471       opc = has_lod ? OPC_SAML : OPC_SAM;
2472       break;
2473    case nir_texop_txb:
2474       opc = OPC_SAMB;
2475       break;
2476    case nir_texop_txl:
2477       opc = OPC_SAML;
2478       break;
2479    case nir_texop_txd:
2480       opc = OPC_SAMGQ;
2481       break;
2482    case nir_texop_txf:
2483       opc = OPC_ISAML;
2484       break;
2485    case nir_texop_lod:
2486       opc = OPC_GETLOD;
2487       break;
2488    case nir_texop_tg4:
2489       /* NOTE: a4xx might need to emulate gather w/ txf (this is
2490        * what the blob does; gather seems to be broken?), and a3xx did
2491        * not support it (but probably could also emulate it).
2492        */
2493       switch (tex->component) {
2494       case 0:
2495          opc = OPC_GATHER4R;
2496          break;
2497       case 1:
2498          opc = OPC_GATHER4G;
2499          break;
2500       case 2:
2501          opc = OPC_GATHER4B;
2502          break;
2503       case 3:
2504          opc = OPC_GATHER4A;
2505          break;
2506       }
2507       break;
2508    case nir_texop_txf_ms_fb:
2509    case nir_texop_txf_ms:
2510       opc = OPC_ISAMM;
2511       break;
2512    default:
2513       ir3_context_error(ctx, "Unhandled NIR tex type: %d\n", tex->op);
2514       return;
2515    }
2516 
2517    tex_info(tex, &flags, &coords);
2518 
2519    /*
2520     * lay out the first argument in the proper order:
2521     *  - actual coordinates first
2522     *  - shadow reference
2523     *  - array index
2524     *  - projection w
2525     *  - starting at offset 4, dpdx.xy, dpdy.xy
2526     *
2527     * bias/lod go into the second arg
2528     */
2529 
2530    /* insert tex coords: */
2531    for (i = 0; i < coords; i++)
2532       src0[i] = coord[i];
2533 
2534    nsrc0 = i;
2535 
2536    /* scale up integer coords for TXF based on the LOD */
2537    if (ctx->compiler->unminify_coords && (opc == OPC_ISAML)) {
2538       assert(has_lod);
2539       for (i = 0; i < coords; i++)
2540          src0[i] = ir3_SHL_B(b, src0[i], 0, lod, 0);
2541    }
2542 
2543    if (coords == 1) {
2544       /* hw doesn't do 1d, so we treat it as 2d with
2545        * height of 1, and patch up the y coord.
2546        */
2547       if (is_isam(opc)) {
2548          src0[nsrc0++] = create_immed(b, 0);
2549       } else {
2550          src0[nsrc0++] = create_immed(b, fui(0.5));
2551       }
2552    }
2553 
2554    if (tex->is_shadow && tex->op != nir_texop_lod)
2555       src0[nsrc0++] = compare;
2556 
2557    if (tex->is_array && tex->op != nir_texop_lod) {
2558       struct ir3_instruction *idx = coord[coords];
2559 
2560       /* the array coord for cube arrays needs 0.5 added to it */
2561       if (ctx->compiler->array_index_add_half && !is_isam(opc))
2562          idx = ir3_ADD_F(b, idx, 0, create_immed(b, fui(0.5)), 0);
2563 
2564       src0[nsrc0++] = idx;
2565    }
2566 
2567    if (has_proj) {
2568       src0[nsrc0++] = proj;
2569       flags |= IR3_INSTR_P;
2570    }
2571 
2572    /* pad to 4, then ddx/ddy: */
2573    if (tex->op == nir_texop_txd) {
2574       while (nsrc0 < 4)
2575          src0[nsrc0++] = create_immed(b, fui(0.0));
2576       for (i = 0; i < coords; i++)
2577          src0[nsrc0++] = ddx[i];
2578       if (coords < 2)
2579          src0[nsrc0++] = create_immed(b, fui(0.0));
2580       for (i = 0; i < coords; i++)
2581          src0[nsrc0++] = ddy[i];
2582       if (coords < 2)
2583          src0[nsrc0++] = create_immed(b, fui(0.0));
2584    }
2585 
2586    /* NOTE a3xx (and possibly a4xx?) might be different, using isaml
2587     * with scaled x coord according to requested sample:
2588     */
2589    if (opc == OPC_ISAMM) {
2590       if (ctx->compiler->txf_ms_with_isaml) {
2591          /* the samples are laid out in x dimension as
2592           *     0 1 2 3
2593           * x_ms = (x << ms) + sample_index;
2594           */
2595          struct ir3_instruction *ms;
2596          ms = create_immed(b, (ctx->samples >> (2 * tex->texture_index)) & 3);
2597 
2598          src0[0] = ir3_SHL_B(b, src0[0], 0, ms, 0);
2599          src0[0] = ir3_ADD_U(b, src0[0], 0, sample_index, 0);
2600 
2601          opc = OPC_ISAML;
2602       } else {
2603          src0[nsrc0++] = sample_index;
2604       }
2605    }
2606 
2607    /*
2608     * second argument (if applicable):
2609     *  - offsets
2610     *  - lod
2611     *  - bias
2612     */
2613    if (has_off | has_lod | has_bias) {
2614       if (has_off) {
2615          unsigned off_coords = coords;
2616          if (tex->sampler_dim == GLSL_SAMPLER_DIM_CUBE)
2617             off_coords--;
2618          for (i = 0; i < off_coords; i++)
2619             src1[nsrc1++] = off[i];
2620          if (off_coords < 2)
2621             src1[nsrc1++] = create_immed(b, fui(0.0));
2622          flags |= IR3_INSTR_O;
2623       }
2624 
2625       if (has_lod | has_bias)
2626          src1[nsrc1++] = lod;
2627    }
2628 
2629    type = get_tex_dest_type(tex);
2630 
2631    if (opc == OPC_GETLOD)
2632       type = TYPE_S32;
2633 
2634    if (tex->op == nir_texop_txf_ms_fb) {
2635       /* only expect a single txf_ms_fb per shader: */
2636       compile_assert(ctx, !ctx->so->fb_read);
2637       compile_assert(ctx, ctx->so->type == MESA_SHADER_FRAGMENT);
2638 
2639       ctx->so->fb_read = true;
2640       info.samp_tex = ir3_collect(
2641          b, create_immed_typed(ctx->block, ctx->so->num_samp, TYPE_U16),
2642          create_immed_typed(ctx->block, ctx->so->num_samp, TYPE_U16));
2643       info.flags = IR3_INSTR_S2EN;
2644 
2645       ctx->so->num_samp++;
2646    } else {
2647       info = get_tex_samp_tex_src(ctx, tex);
2648    }
2649 
2650    struct ir3_instruction *col0 = ir3_create_collect(b, src0, nsrc0);
2651    struct ir3_instruction *col1 = ir3_create_collect(b, src1, nsrc1);
2652 
2653    if (opc == OPC_META_TEX_PREFETCH) {
2654       int idx = nir_tex_instr_src_index(tex, nir_tex_src_coord);
2655 
2656       compile_assert(ctx, tex->src[idx].src.is_ssa);
2657 
2658       sam = ir3_SAM(b, opc, type, MASK(ncomp), 0, NULL,
2659                     get_barycentric(ctx, IJ_PERSP_PIXEL), 0);
2660       sam->prefetch.input_offset = ir3_nir_coord_offset(tex->src[idx].src.ssa);
2661       /* make sure not to add irrelevant flags like S2EN */
2662       sam->flags = flags | (info.flags & IR3_INSTR_B);
2663       sam->prefetch.tex = info.tex_idx;
2664       sam->prefetch.samp = info.samp_idx;
2665       sam->prefetch.tex_base = info.tex_base;
2666       sam->prefetch.samp_base = info.samp_base;
2667    } else {
2668       info.flags |= flags;
2669       sam = emit_sam(ctx, opc, info, type, MASK(ncomp), col0, col1);
2670    }
2671 
2672    if ((ctx->astc_srgb & (1 << tex->texture_index)) &&
2673        !nir_tex_instr_is_query(tex)) {
2674       assert(opc != OPC_META_TEX_PREFETCH);
2675 
2676       /* only need first 3 components: */
2677       sam->dsts[0]->wrmask = 0x7;
2678       ir3_split_dest(b, dst, sam, 0, 3);
2679 
2680       /* we need to sample the alpha separately with a non-ASTC
2681        * texture state:
2682        */
2683       sam = ir3_SAM(b, opc, type, 0b1000, flags | info.flags, info.samp_tex,
2684                     col0, col1);
2685 
2686       array_insert(ctx->ir, ctx->ir->astc_srgb, sam);
2687 
2688       /* fixup .w component: */
2689       ir3_split_dest(b, &dst[3], sam, 3, 1);
2690    } else {
2691       /* normal (non-workaround) case: */
2692       ir3_split_dest(b, dst, sam, 0, ncomp);
2693    }
2694 
2695    /* GETLOD returns results in 4.8 fixed point */
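   /* (e.g. a raw value of 0x180 is 384 / 256.0 = lod 1.5, hence the scale) */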
2696    if (opc == OPC_GETLOD) {
2697       struct ir3_instruction *factor = create_immed(b, fui(1.0 / 256));
2698 
2699       compile_assert(ctx, tex->dest_type == nir_type_float32);
2700       for (i = 0; i < 2; i++) {
2701          dst[i] =
2702             ir3_MUL_F(b, ir3_COV(b, dst[i], TYPE_S32, TYPE_F32), 0, factor, 0);
2703       }
2704    }
2705 
2706    ir3_put_dst(ctx, &tex->dest);
2707 }
2708 
2709 static void
2710 emit_tex_info(struct ir3_context *ctx, nir_tex_instr *tex, unsigned idx)
2711 {
2712    struct ir3_block *b = ctx->block;
2713    struct ir3_instruction **dst, *sam;
2714    type_t dst_type = get_tex_dest_type(tex);
2715    struct tex_src_info info = get_tex_samp_tex_src(ctx, tex);
2716 
2717    dst = ir3_get_dst(ctx, &tex->dest, 1);
2718 
2719    sam = emit_sam(ctx, OPC_GETINFO, info, dst_type, 1 << idx, NULL, NULL);
2720 
2721    /* even though there is only one component, since it ends
2722     * up in .y/.z/.w rather than .x, we need a split_dest()
2723     */
2724    ir3_split_dest(b, dst, sam, idx, 1);
2725 
2726    /* The # of levels comes from getinfo.z. We need to add 1 to it, since
2727     * the value in TEX_CONST_0 is zero-based.
2728     */
2729    if (ctx->compiler->levels_add_one)
2730       dst[0] = ir3_ADD_U(b, dst[0], 0, create_immed(b, 1), 0);
2731 
2732    ir3_put_dst(ctx, &tex->dest);
2733 }
2734 
2735 static void
2736 emit_tex_txs(struct ir3_context *ctx, nir_tex_instr *tex)
2737 {
2738    struct ir3_block *b = ctx->block;
2739    struct ir3_instruction **dst, *sam;
2740    struct ir3_instruction *lod;
2741    unsigned flags, coords;
2742    type_t dst_type = get_tex_dest_type(tex);
2743    struct tex_src_info info = get_tex_samp_tex_src(ctx, tex);
2744 
2745    tex_info(tex, &flags, &coords);
2746    info.flags |= flags;
2747 
2748    /* Actually we want the number of dimensions, not coordinates. This
2749     * distinction only matters for cubes.
2750     */
2751    if (tex->sampler_dim == GLSL_SAMPLER_DIM_CUBE)
2752       coords = 2;
2753 
2754    dst = ir3_get_dst(ctx, &tex->dest, 4);
2755 
2756    int lod_idx = nir_tex_instr_src_index(tex, nir_tex_src_lod);
2757    compile_assert(ctx, lod_idx >= 0);
2758 
2759    lod = ir3_get_src(ctx, &tex->src[lod_idx].src)[0];
2760 
2761    if (tex->sampler_dim != GLSL_SAMPLER_DIM_BUF) {
2762       sam = emit_sam(ctx, OPC_GETSIZE, info, dst_type, 0b1111, lod, NULL);
2763    } else {
2764       /*
2765        * The maximum value which OPC_GETSIZE could return for one dimension
2766        * is 0x007ff0, however a sampler buffer could be much bigger.
2767        * The blob uses OPC_GETBUF for those.
2768        */
2769       sam = emit_sam(ctx, OPC_GETBUF, info, dst_type, 0b1111, NULL, NULL);
2770    }
2771 
2772    ir3_split_dest(b, dst, sam, 0, 4);
2773 
2774    /* Array size actually ends up in .w rather than .z. This doesn't
2775     * matter for miplevel 0, but for higher mips the value in z is
2776     * minified whereas w stays. Also, the value in TEX_CONST_3_DEPTH is
2777     * returned, which means that we have to add 1 to it for arrays.
2778     */
2779    if (tex->is_array) {
2780       if (ctx->compiler->levels_add_one) {
2781          dst[coords] = ir3_ADD_U(b, dst[3], 0, create_immed(b, 1), 0);
2782       } else {
2783          dst[coords] = ir3_MOV(b, dst[3], TYPE_U32);
2784       }
2785    }
2786 
2787    ir3_put_dst(ctx, &tex->dest);
2788 }
2789 
2790 /* phi instructions are left partially constructed.  We don't resolve
2791  * their srcs until the end of the shader, since (eg. loops) one of
2792  * the phi's srcs might be defined after the phi due to back edges in
2793  * the CFG.
2794  */
2795 static void
2796 emit_phi(struct ir3_context *ctx, nir_phi_instr *nphi)
2797 {
2798    struct ir3_instruction *phi, **dst;
2799 
2800    /* NOTE: phi's should be lowered to scalar at this point */
2801    compile_assert(ctx, nphi->dest.ssa.num_components == 1);
2802 
2803    dst = ir3_get_dst(ctx, &nphi->dest, 1);
2804 
2805    phi = ir3_instr_create(ctx->block, OPC_META_PHI, 1,
2806                           exec_list_length(&nphi->srcs));
2807    __ssa_dst(phi);
2808    phi->phi.nphi = nphi;
2809 
2810    dst[0] = phi;
2811 
2812    ir3_put_dst(ctx, &nphi->dest);
2813 }
2814 
2815 static struct ir3_block *get_block(struct ir3_context *ctx,
2816                                    const nir_block *nblock);
2817 
2818 static struct ir3_instruction *
2819 read_phi_src(struct ir3_context *ctx, struct ir3_block *blk,
2820              struct ir3_instruction *phi, nir_phi_instr *nphi)
2821 {
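   /* Blocks with no associated nir_block are the synthetic continue blocks
    * created for loops; for those, recurse into each predecessor and merge
    * the incoming values with a new phi:
    */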
2822    if (!blk->nblock) {
2823       struct ir3_instruction *continue_phi =
2824          ir3_instr_create(blk, OPC_META_PHI, 1, blk->predecessors_count);
2825       __ssa_dst(continue_phi)->flags = phi->dsts[0]->flags;
2826 
2827       for (unsigned i = 0; i < blk->predecessors_count; i++) {
2828          struct ir3_instruction *src =
2829             read_phi_src(ctx, blk->predecessors[i], phi, nphi);
2830          if (src)
2831             __ssa_src(continue_phi, src, 0);
2832          else
2833             ir3_src_create(continue_phi, INVALID_REG, phi->dsts[0]->flags);
2834       }
2835 
2836       return continue_phi;
2837    }
2838 
2839    nir_foreach_phi_src (nsrc, nphi) {
2840       if (blk->nblock == nsrc->pred) {
2841          if (nsrc->src.ssa->parent_instr->type == nir_instr_type_ssa_undef) {
2842             /* Create an ir3 undef */
2843             return NULL;
2844          } else {
2845             return ir3_get_src(ctx, &nsrc->src)[0];
2846          }
2847       }
2848    }
2849 
2850    unreachable("couldn't find phi node ir3 block");
2851    return NULL;
2852 }
2853 
2854 static void
2855 resolve_phis(struct ir3_context *ctx, struct ir3_block *block)
2856 {
2857    foreach_instr (phi, &block->instr_list) {
2858       if (phi->opc != OPC_META_PHI)
2859          break;
2860 
2861       nir_phi_instr *nphi = phi->phi.nphi;
2862 
2863       if (!nphi) /* skip continue phis created above */
2864          continue;
2865 
2866       for (unsigned i = 0; i < block->predecessors_count; i++) {
2867          struct ir3_block *pred = block->predecessors[i];
2868          struct ir3_instruction *src = read_phi_src(ctx, pred, phi, nphi);
2869          if (src) {
2870             __ssa_src(phi, src, 0);
2871          } else {
2872             /* Create an ir3 undef */
2873             ir3_src_create(phi, INVALID_REG, phi->dsts[0]->flags);
2874          }
2875       }
2876    }
2877 }
2878 
2879 static void
2880 emit_jump(struct ir3_context *ctx, nir_jump_instr *jump)
2881 {
2882    switch (jump->type) {
2883    case nir_jump_break:
2884    case nir_jump_continue:
2885    case nir_jump_return:
2886       /* I *think* we can simply just ignore this, and use the
2887        * successor block link to figure out where we need to
2888        * jump to for break/continue
2889        */
2890       break;
2891    default:
2892       ir3_context_error(ctx, "Unhandled NIR jump type: %d\n", jump->type);
2893       break;
2894    }
2895 }
2896 
2897 static void
2898 emit_instr(struct ir3_context *ctx, nir_instr *instr)
2899 {
2900    switch (instr->type) {
2901    case nir_instr_type_alu:
2902       emit_alu(ctx, nir_instr_as_alu(instr));
2903       break;
2904    case nir_instr_type_deref:
2905       /* ignored, handled as part of the intrinsic they are src to */
2906       break;
2907    case nir_instr_type_intrinsic:
2908       emit_intrinsic(ctx, nir_instr_as_intrinsic(instr));
2909       break;
2910    case nir_instr_type_load_const:
2911       emit_load_const(ctx, nir_instr_as_load_const(instr));
2912       break;
2913    case nir_instr_type_ssa_undef:
2914       emit_undef(ctx, nir_instr_as_ssa_undef(instr));
2915       break;
2916    case nir_instr_type_tex: {
2917       nir_tex_instr *tex = nir_instr_as_tex(instr);
2918       /* a couple of tex instructions get special-cased:
2919        */
2920       switch (tex->op) {
2921       case nir_texop_txs:
2922          emit_tex_txs(ctx, tex);
2923          break;
2924       case nir_texop_query_levels:
2925          emit_tex_info(ctx, tex, 2);
2926          break;
2927       case nir_texop_texture_samples:
2928          emit_tex_info(ctx, tex, 3);
2929          break;
2930       default:
2931          emit_tex(ctx, tex);
2932          break;
2933       }
2934       break;
2935    }
2936    case nir_instr_type_jump:
2937       emit_jump(ctx, nir_instr_as_jump(instr));
2938       break;
2939    case nir_instr_type_phi:
2940       emit_phi(ctx, nir_instr_as_phi(instr));
2941       break;
2942    case nir_instr_type_call:
2943    case nir_instr_type_parallel_copy:
2944       ir3_context_error(ctx, "Unhandled NIR instruction type: %d\n",
2945                         instr->type);
2946       break;
2947    }
2948 }
2949 
2950 static struct ir3_block *
2951 get_block(struct ir3_context *ctx, const nir_block *nblock)
2952 {
2953    struct ir3_block *block;
2954    struct hash_entry *hentry;
2955 
2956    hentry = _mesa_hash_table_search(ctx->block_ht, nblock);
2957    if (hentry)
2958       return hentry->data;
2959 
2960    block = ir3_block_create(ctx->ir);
2961    block->nblock = nblock;
2962    _mesa_hash_table_insert(ctx->block_ht, nblock, block);
2963 
2964    return block;
2965 }
2966 
2967 static struct ir3_block *
2968 get_block_or_continue(struct ir3_context *ctx, const nir_block *nblock)
2969 {
2970    struct hash_entry *hentry;
2971 
2972    hentry = _mesa_hash_table_search(ctx->continue_block_ht, nblock);
2973    if (hentry)
2974       return hentry->data;
2975 
2976    return get_block(ctx, nblock);
2977 }
2978 
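/* Create a synthetic continue block.  It has no associated nir_block
 * (see the !blk->nblock special case in read_phi_src() above).
 */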
2979 static struct ir3_block *
2980 create_continue_block(struct ir3_context *ctx, const nir_block *nblock)
2981 {
2982    struct ir3_block *block = ir3_block_create(ctx->ir);
2983    block->nblock = NULL;
2984    _mesa_hash_table_insert(ctx->continue_block_ht, nblock, block);
2985    return block;
2986 }
2987 
2988 static void
2989 emit_block(struct ir3_context *ctx, nir_block *nblock)
2990 {
2991    ctx->block = get_block(ctx, nblock);
2992 
2993    list_addtail(&ctx->block->node, &ctx->ir->block_list);
2994 
2995    ctx->block->loop_id = ctx->loop_id;
2996    ctx->block->loop_depth = ctx->loop_depth;
2997 
2998    /* re-emit addr register in each block if needed: */
2999    for (int i = 0; i < ARRAY_SIZE(ctx->addr0_ht); i++) {
3000       _mesa_hash_table_destroy(ctx->addr0_ht[i], NULL);
3001       ctx->addr0_ht[i] = NULL;
3002    }
3003 
3004    _mesa_hash_table_u64_destroy(ctx->addr1_ht);
3005    ctx->addr1_ht = NULL;
3006 
3007    nir_foreach_instr (instr, nblock) {
3008       ctx->cur_instr = instr;
3009       emit_instr(ctx, instr);
3010       ctx->cur_instr = NULL;
3011       if (ctx->error)
3012          return;
3013    }
3014 
3015    for (int i = 0; i < ARRAY_SIZE(ctx->block->successors); i++) {
3016       if (nblock->successors[i]) {
3017          ctx->block->successors[i] =
3018             get_block_or_continue(ctx, nblock->successors[i]);
3019          ctx->block->physical_successors[i] = ctx->block->successors[i];
3020       }
3021    }
3022 
3023    _mesa_hash_table_clear(ctx->sel_cond_conversions, NULL);
3024 }
3025 
3026 static void emit_cf_list(struct ir3_context *ctx, struct exec_list *list);
3027 
3028 static void
3029 emit_if(struct ir3_context *ctx, nir_if *nif)
3030 {
3031    struct ir3_instruction *condition = ir3_get_src(ctx, &nif->condition)[0];
3032 
3033    if (condition->opc == OPC_ANY_MACRO && condition->block == ctx->block) {
3034       ctx->block->condition = ssa(condition->srcs[0]);
3035       ctx->block->brtype = IR3_BRANCH_ANY;
3036    } else if (condition->opc == OPC_ALL_MACRO &&
3037               condition->block == ctx->block) {
3038       ctx->block->condition = ssa(condition->srcs[0]);
3039       ctx->block->brtype = IR3_BRANCH_ALL;
3040    } else if (condition->opc == OPC_ELECT_MACRO &&
3041               condition->block == ctx->block) {
3042       ctx->block->condition = NULL;
3043       ctx->block->brtype = IR3_BRANCH_GETONE;
3044    } else {
3045       ctx->block->condition = ir3_get_predicate(ctx, condition);
3046       ctx->block->brtype = IR3_BRANCH_COND;
3047    }
3048 
3049    emit_cf_list(ctx, &nif->then_list);
3050    emit_cf_list(ctx, &nif->else_list);
3051 
3052    struct ir3_block *last_then = get_block(ctx, nir_if_last_then_block(nif));
3053    struct ir3_block *first_else = get_block(ctx, nir_if_first_else_block(nif));
3054    assert(last_then->physical_successors[0] &&
3055           !last_then->physical_successors[1]);
3056    last_then->physical_successors[1] = first_else;
3057 
3058    struct ir3_block *last_else = get_block(ctx, nir_if_last_else_block(nif));
3059    struct ir3_block *after_if =
3060       get_block(ctx, nir_cf_node_as_block(nir_cf_node_next(&nif->cf_node)));
3061    assert(last_else->physical_successors[0] &&
3062           !last_else->physical_successors[1]);
3063    if (after_if != last_else->physical_successors[0])
3064       last_else->physical_successors[1] = after_if;
3065 }
3066 
3067 static void
3068 emit_loop(struct ir3_context *ctx, nir_loop *nloop)
3069 {
3070    unsigned old_loop_id = ctx->loop_id;
3071    ctx->loop_id = ctx->so->loops + 1;
3072    ctx->loop_depth++;
3073 
3074    struct nir_block *nstart = nir_loop_first_block(nloop);
3075    struct ir3_block *continue_blk = NULL;
3076 
3077    /* There's always one incoming edge from outside the loop, and if there
3078     * is more than one backedge from inside the loop (so more than two total
3079     * edges) then we need to create a continue block after the loop to ensure
3080     * that control reconverges at the end of each loop iteration.
3081     */
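   /* For example (illustrative): a loop body containing a conditional
    * 'continue' has both the normal bottom-of-loop backedge and the continue
    * edge into the header, so the header block ends up with three
    * predecessors and the check below fires.
    */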
3082    if (nstart->predecessors->entries > 2) {
3083       continue_blk = create_continue_block(ctx, nstart);
3084    }
3085 
3086    emit_cf_list(ctx, &nloop->body);
3087 
3088    if (continue_blk) {
3089       struct ir3_block *start = get_block(ctx, nstart);
3090       continue_blk->successors[0] = start;
3091       continue_blk->physical_successors[0] = start;
3092       continue_blk->loop_id = ctx->loop_id;
3093       continue_blk->loop_depth = ctx->loop_depth;
3094       list_addtail(&continue_blk->node, &ctx->ir->block_list);
3095    }
3096 
3097    ctx->so->loops++;
3098    ctx->loop_depth--;
3099    ctx->loop_id = old_loop_id;
3100 }
3101 
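/* Track the if/loop nesting depth while emitting control flow; the high-water
 * mark (ctx->max_stack) is what ends up in so->branchstack at the end of
 * ir3_compile_shader_nir().
 */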
3102 static void
3103 stack_push(struct ir3_context *ctx)
3104 {
3105    ctx->stack++;
3106    ctx->max_stack = MAX2(ctx->max_stack, ctx->stack);
3107 }
3108 
3109 static void
3110 stack_pop(struct ir3_context *ctx)
3111 {
3112    compile_assert(ctx, ctx->stack > 0);
3113    ctx->stack--;
3114 }
3115 
3116 static void
3117 emit_cf_list(struct ir3_context *ctx, struct exec_list *list)
3118 {
3119    foreach_list_typed (nir_cf_node, node, node, list) {
3120       switch (node->type) {
3121       case nir_cf_node_block:
3122          emit_block(ctx, nir_cf_node_as_block(node));
3123          break;
3124       case nir_cf_node_if:
3125          stack_push(ctx);
3126          emit_if(ctx, nir_cf_node_as_if(node));
3127          stack_pop(ctx);
3128          break;
3129       case nir_cf_node_loop:
3130          stack_push(ctx);
3131          emit_loop(ctx, nir_cf_node_as_loop(node));
3132          stack_pop(ctx);
3133          break;
3134       case nir_cf_node_function:
3135          ir3_context_error(ctx, "TODO\n");
3136          break;
3137       }
3138    }
3139 }
3140 
3141 /* emit stream-out code.  At this point, the current block is the original
3142  * (nir) end block, and nir ensures that all flow control paths terminate
3143  * into the end block.  We re-purpose the original end block to generate
3144  * the 'if (vtxcnt < maxvtxcnt)' condition, then append the conditional
3145  * block holding stream-out write instructions, followed by the new end
3146  * block:
3147  *
3148  *   blockOrigEnd {
3149  *      p0.x = (vtxcnt < maxvtxcnt)
3150  *      // succs: blockStreamOut, blockNewEnd
3151  *   }
3152  *   blockStreamOut {
3153  *      // preds: blockOrigEnd
3154  *      ... stream-out instructions ...
3155  *      // succs: blockNewEnd
3156  *   }
3157  *   blockNewEnd {
3158  *      // preds: blockOrigEnd, blockStreamOut
3159  *   }
3160  */
3161 static void
3162 emit_stream_out(struct ir3_context *ctx)
3163 {
3164    struct ir3 *ir = ctx->ir;
3165    struct ir3_stream_output_info *strmout = &ctx->so->shader->stream_output;
3166    struct ir3_block *orig_end_block, *stream_out_block, *new_end_block;
3167    struct ir3_instruction *vtxcnt, *maxvtxcnt, *cond;
3168    struct ir3_instruction *bases[IR3_MAX_SO_BUFFERS];
3169 
3170    /* create vtxcnt input in input block at top of shader,
3171     * so that it is seen as live over the entire duration
3172     * of the shader:
3173     */
3174    vtxcnt = create_sysval_input(ctx, SYSTEM_VALUE_VERTEX_CNT, 0x1);
3175    maxvtxcnt = create_driver_param(ctx, IR3_DP_VTXCNT_MAX);
3176 
3177    /* at this point, we are at the original 'end' block,
3178     * re-purpose this block to stream-out condition, then
3179     * append stream-out block and new-end block
3180     */
3181    orig_end_block = ctx->block;
3182 
3183    // maybe w/ store_global intrinsic, we could do this
3184    // stuff in nir->nir pass
3185 
3186    stream_out_block = ir3_block_create(ir);
3187    list_addtail(&stream_out_block->node, &ir->block_list);
3188 
3189    new_end_block = ir3_block_create(ir);
3190    list_addtail(&new_end_block->node, &ir->block_list);
3191 
3192    orig_end_block->successors[0] = stream_out_block;
3193    orig_end_block->successors[1] = new_end_block;
3194 
3195    orig_end_block->physical_successors[0] = stream_out_block;
3196    orig_end_block->physical_successors[1] = new_end_block;
3197 
3198    stream_out_block->successors[0] = new_end_block;
3199 
3200    stream_out_block->physical_successors[0] = new_end_block;
3201 
3202    /* setup 'if (vtxcnt < maxvtxcnt)' condition: */
3203    cond = ir3_CMPS_S(ctx->block, vtxcnt, 0, maxvtxcnt, 0);
3204    cond->dsts[0]->num = regid(REG_P0, 0);
3205    cond->dsts[0]->flags &= ~IR3_REG_SSA;
3206    cond->cat2.condition = IR3_COND_LT;
3207 
3208    /* condition goes on previous block to the conditional,
3209     * since it is used to pick which of the two successor
3210     * paths to take:
3211     */
3212    orig_end_block->condition = cond;
3213 
3214    /* switch to stream_out_block to generate the stream-out
3215     * instructions:
3216     */
3217    ctx->block = stream_out_block;
3218 
3219    /* Calculate base addresses based on vtxcnt.  Instructions
3220     * generated for bases not used in following loop will be
3221     * stripped out in the backend.
3222     */
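   /* Roughly, for each stream-out buffer i this computes:
    *
    *    bases[i] = consts[tfbo + i] + vtxcnt * stride[i] * 4
    *
    * i.e. the buffer base from the const file (at const_state->offsets.tfbo)
    * plus the byte offset of the current vertex.
    */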
3223    for (unsigned i = 0; i < IR3_MAX_SO_BUFFERS; i++) {
3224       const struct ir3_const_state *const_state = ir3_const_state(ctx->so);
3225       unsigned stride = strmout->stride[i];
3226       struct ir3_instruction *base, *off;
3227 
3228       base = create_uniform(ctx->block, regid(const_state->offsets.tfbo, i));
3229 
3230       /* 24-bit should be enough: */
3231       off = ir3_MUL_U24(ctx->block, vtxcnt, 0,
3232                         create_immed(ctx->block, stride * 4), 0);
3233 
3234       bases[i] = ir3_ADD_S(ctx->block, off, 0, base, 0);
3235    }
3236 
3237    /* Generate the per-output store instructions: */
3238    for (unsigned i = 0; i < strmout->num_outputs; i++) {
3239       for (unsigned j = 0; j < strmout->output[i].num_components; j++) {
3240          unsigned c = j + strmout->output[i].start_component;
3241          struct ir3_instruction *base, *out, *stg;
3242 
3243          base = bases[strmout->output[i].output_buffer];
3244          out = ctx->outputs[regid(strmout->output[i].register_index, c)];
3245 
3246          stg = ir3_STG(
3247             ctx->block, base, 0,
3248             create_immed(ctx->block, (strmout->output[i].dst_offset + j) * 4),
3249             0, out, 0, create_immed(ctx->block, 1), 0);
3250          stg->cat6.type = TYPE_U32;
3251 
3252          array_insert(ctx->block, ctx->block->keeps, stg);
3253       }
3254    }
3255 
3256    /* and finally switch to the new_end_block: */
3257    ctx->block = new_end_block;
3258 }
3259 
3260 static void
3261 setup_predecessors(struct ir3 *ir)
3262 {
3263    foreach_block (block, &ir->block_list) {
3264       for (int i = 0; i < ARRAY_SIZE(block->successors); i++) {
3265          if (block->successors[i])
3266             ir3_block_add_predecessor(block->successors[i], block);
3267          if (block->physical_successors[i])
3268             ir3_block_add_physical_predecessor(block->physical_successors[i],
3269                                                block);
3270       }
3271    }
3272 }
3273 
3274 static void
3275 emit_function(struct ir3_context *ctx, nir_function_impl *impl)
3276 {
3277    nir_metadata_require(impl, nir_metadata_block_index);
3278 
3279    compile_assert(ctx, ctx->stack == 0);
3280 
3281    emit_cf_list(ctx, &impl->body);
3282    emit_block(ctx, impl->end_block);
3283 
3284    compile_assert(ctx, ctx->stack == 0);
3285 
3286    /* at this point, we should have a single empty block,
3287     * into which we emit the 'end' instruction.
3288     */
3289    compile_assert(ctx, list_is_empty(&ctx->block->instr_list));
3290 
3291    /* If stream-out (aka transform-feedback) enabled, emit the
3292     * stream-out instructions, followed by a new empty block (into
3293     * which the 'end' instruction lands).
3294     *
3295     * NOTE: it is done in this order, rather than inserting before
3296     * we emit end_block, because NIR guarantees that all blocks
3297     * flow into end_block, and that end_block has no successors.
3298     * So by re-purposing end_block as the first block of stream-
3299     * out, we guarantee that all exit paths flow into the stream-
3300     * out instructions.
3301     */
3302    if ((ctx->compiler->gen < 5) &&
3303        (ctx->so->shader->stream_output.num_outputs > 0) &&
3304        !ctx->so->binning_pass) {
3305       debug_assert(ctx->so->type == MESA_SHADER_VERTEX);
3306       emit_stream_out(ctx);
3307    }
3308 
3309    setup_predecessors(ctx->ir);
3310    foreach_block (block, &ctx->ir->block_list) {
3311       resolve_phis(ctx, block);
3312    }
3313 }
3314 
3315 static void
3316 setup_input(struct ir3_context *ctx, nir_intrinsic_instr *intr)
3317 {
3318    struct ir3_shader_variant *so = ctx->so;
3319    struct ir3_instruction *coord = NULL;
3320 
3321    if (intr->intrinsic == nir_intrinsic_load_interpolated_input)
3322       coord = ir3_create_collect(ctx->block, ir3_get_src(ctx, &intr->src[0]), 2);
3323 
3324    compile_assert(ctx, nir_src_is_const(intr->src[coord ? 1 : 0]));
3325 
3326    unsigned frac = nir_intrinsic_component(intr);
3327    unsigned offset = nir_src_as_uint(intr->src[coord ? 1 : 0]);
3328    unsigned ncomp = nir_intrinsic_dest_components(intr);
3329    unsigned n = nir_intrinsic_base(intr) + offset;
3330    unsigned slot = nir_intrinsic_io_semantics(intr).location + offset;
3331    unsigned compmask;
3332 
3333    /* Other stages load their inputs with ldlw or ldg; only FS and VS reach here. */
3334    compile_assert(ctx, ctx->so->type == MESA_SHADER_FRAGMENT ||
3335                           ctx->so->type == MESA_SHADER_VERTEX);
3336 
3337    if (ctx->so->type == MESA_SHADER_FRAGMENT)
3338       compmask = BITFIELD_MASK(ncomp) << frac;
3339    else
3340       compmask = BITFIELD_MASK(ncomp + frac);
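   /* e.g. a 3-component load starting at component 1 yields compmask 0xe in
    * the FS case but 0xf in the VS case.
    */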
3341 
3342    /* for a4xx+ rasterflat */
3343    if (so->inputs[n].rasterflat && ctx->so->key.rasterflat)
3344       coord = NULL;
3345 
3346    so->total_in += util_bitcount(compmask & ~so->inputs[n].compmask);
3347 
3348    so->inputs[n].slot = slot;
3349    so->inputs[n].compmask |= compmask;
3350    so->inputs_count = MAX2(so->inputs_count, n + 1);
3351    compile_assert(ctx, so->inputs_count < ARRAY_SIZE(so->inputs));
3352    so->inputs[n].flat = !coord;
3353 
3354    if (ctx->so->type == MESA_SHADER_FRAGMENT) {
3355       compile_assert(ctx, slot != VARYING_SLOT_POS);
3356 
3357       so->inputs[n].bary = true;
3358 
3359       for (int i = 0; i < ncomp; i++) {
3360          unsigned idx = (n * 4) + i + frac;
3361          ctx->last_dst[i] = create_frag_input(ctx, coord, idx);
3362       }
3363    } else {
3364       struct ir3_instruction *input = NULL;
3365 
3366       foreach_input (in, ctx->ir) {
3367          if (in->input.inidx == n) {
3368             input = in;
3369             break;
3370          }
3371       }
3372 
3373       if (!input) {
3374          input = create_input(ctx, compmask);
3375          input->input.inidx = n;
3376       } else {
3377          /* For aliased inputs, just append to the wrmask.. ie. if we
3378           * first see a vec2 index at slot N, and then later a vec4,
3379           * the wrmask of the resulting overlapped vec2 and vec4 is 0xf
3380           */
3381          input->dsts[0]->wrmask |= compmask;
3382       }
3383 
3384       for (int i = 0; i < ncomp + frac; i++) {
3385          unsigned idx = (n * 4) + i;
3386          compile_assert(ctx, idx < ctx->ninputs);
3387 
3388          /* fixup the src wrmask to avoid validation fail */
3389          if (ctx->inputs[idx] && (ctx->inputs[idx] != input)) {
3390             ctx->inputs[idx]->srcs[0]->wrmask = input->dsts[0]->wrmask;
3391             continue;
3392          }
3393 
3394          ir3_split_dest(ctx->block, &ctx->inputs[idx], input, i, 1);
3395       }
3396 
3397       for (int i = 0; i < ncomp; i++) {
3398          unsigned idx = (n * 4) + i + frac;
3399          ctx->last_dst[i] = ctx->inputs[idx];
3400       }
3401    }
3402 }
3403 
3404 /* Initially we assign non-packed inloc's for varyings, as we don't really
3405  * know up-front which components will be unused.  After all the compilation
3406  * stages we scan the shader to see which components are actually used, and
3407  * re-pack the inlocs to eliminate unneeded varyings.
3408  */
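/* For illustration: if the FS ends up reading only .xy of its first varying
 * and .x of its second, the scan below records used_components[] = { 0x3, 0x1 },
 * the second step packs them to inloc 0..1 and inloc 2, and the third step
 * rewrites the bary.f/ldlv immediates from 0,1,4 to 0,1,2.
 */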
3409 static void
3410 pack_inlocs(struct ir3_context *ctx)
3411 {
3412    struct ir3_shader_variant *so = ctx->so;
3413    uint8_t used_components[so->inputs_count];
3414 
3415    memset(used_components, 0, sizeof(used_components));
3416 
3417    /*
3418     * First Step: scan shader to find which bary.f/ldlv remain:
3419     */
3420 
3421    foreach_block (block, &ctx->ir->block_list) {
3422       foreach_instr (instr, &block->instr_list) {
3423          if (is_input(instr)) {
3424             unsigned inloc = instr->srcs[0]->iim_val;
3425             unsigned i = inloc / 4;
3426             unsigned j = inloc % 4;
3427 
3428             compile_assert(ctx, instr->srcs[0]->flags & IR3_REG_IMMED);
3429             compile_assert(ctx, i < so->inputs_count);
3430 
3431             used_components[i] |= 1 << j;
3432          } else if (instr->opc == OPC_META_TEX_PREFETCH) {
3433             for (int n = 0; n < 2; n++) {
3434                unsigned inloc = instr->prefetch.input_offset + n;
3435                unsigned i = inloc / 4;
3436                unsigned j = inloc % 4;
3437 
3438                compile_assert(ctx, i < so->inputs_count);
3439 
3440                used_components[i] |= 1 << j;
3441             }
3442          }
3443       }
3444    }
3445 
3446    /*
3447     * Second Step: reassign varying inloc/slots:
3448     */
3449 
3450    unsigned actual_in = 0;
3451    unsigned inloc = 0;
3452 
3453    /* for clip+cull distances, unused components can't be eliminated because
3454     * they're read by fixed-function, even if there's a hole.  Note that
3455     * clip/cull distance arrays must be declared in the FS, so we can just
3456     * use the NIR clip/cull distances to avoid reading ucp_enables in the
3457     * shader key.
3458     */
3459    unsigned clip_cull_size =
3460       ctx->so->shader->nir->info.clip_distance_array_size +
3461       ctx->so->shader->nir->info.cull_distance_array_size;
3462    unsigned clip_cull_mask = MASK(clip_cull_size);
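   /* e.g. with 4 clip distances and 2 cull distances, clip_cull_mask is 0x3f,
    * so below CLIP_DIST0 keeps .xyzw (0xf) and CLIP_DIST1 keeps .xy (0x3),
    * even if the shader doesn't read all of those components.
    */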
3463 
3464    for (unsigned i = 0; i < so->inputs_count; i++) {
3465       unsigned compmask = 0, maxcomp = 0;
3466 
3467       so->inputs[i].inloc = inloc;
3468       so->inputs[i].bary = false;
3469 
3470       if (so->inputs[i].slot == VARYING_SLOT_CLIP_DIST0 ||
3471           so->inputs[i].slot == VARYING_SLOT_CLIP_DIST1) {
3472          if (so->inputs[i].slot == VARYING_SLOT_CLIP_DIST0)
3473             compmask = clip_cull_mask & 0xf;
3474          else
3475             compmask = clip_cull_mask >> 4;
3476          used_components[i] = compmask;
3477       }
3478 
3479       for (unsigned j = 0; j < 4; j++) {
3480          if (!(used_components[i] & (1 << j)))
3481             continue;
3482 
3483          compmask |= (1 << j);
3484          actual_in++;
3485          maxcomp = j + 1;
3486 
3487          /* at this point, since used_components[i] mask is only
3488           * considering varyings (ie. not sysvals) we know this
3489           * is a varying:
3490           */
3491          so->inputs[i].bary = true;
3492       }
3493 
3494       if (so->inputs[i].bary) {
3495          so->varying_in++;
3496          so->inputs[i].compmask = (1 << maxcomp) - 1;
3497          inloc += maxcomp;
3498       }
3499    }
3500 
3501    /*
3502     * Third Step: reassign packed inloc's:
3503     */
3504 
3505    foreach_block (block, &ctx->ir->block_list) {
3506       foreach_instr (instr, &block->instr_list) {
3507          if (is_input(instr)) {
3508             unsigned inloc = instr->srcs[0]->iim_val;
3509             unsigned i = inloc / 4;
3510             unsigned j = inloc % 4;
3511 
3512             instr->srcs[0]->iim_val = so->inputs[i].inloc + j;
3513          } else if (instr->opc == OPC_META_TEX_PREFETCH) {
3514             unsigned i = instr->prefetch.input_offset / 4;
3515             unsigned j = instr->prefetch.input_offset % 4;
3516             instr->prefetch.input_offset = so->inputs[i].inloc + j;
3517          }
3518       }
3519    }
3520 }
3521 
3522 static void
3523 setup_output(struct ir3_context *ctx, nir_intrinsic_instr *intr)
3524 {
3525    struct ir3_shader_variant *so = ctx->so;
3526    nir_io_semantics io = nir_intrinsic_io_semantics(intr);
3527 
3528    compile_assert(ctx, nir_src_is_const(intr->src[1]));
3529 
3530    unsigned offset = nir_src_as_uint(intr->src[1]);
3531    unsigned n = nir_intrinsic_base(intr) + offset;
3532    unsigned frac = nir_intrinsic_component(intr);
3533    unsigned ncomp = nir_intrinsic_src_components(intr, 0);
3534 
3535    /* For per-view variables, each user-facing slot corresponds to multiple
3536     * views, each with a corresponding driver_location, and the offset is for
3537     * the driver_location. To properly figure out the slot, we'd need to
3538     * plumb through the number of views. However, for now we only use
3539     * per-view with gl_Position, so we assume that the variable is not an
3540     * array or matrix (so there are no indirect accesses to the variable
3541     * itself) and the indirect offset corresponds to the view.
3542     */
3543    unsigned slot = io.location + (io.per_view ? 0 : offset);
3544 
3545    if (ctx->so->type == MESA_SHADER_FRAGMENT) {
3546       switch (slot) {
3547       case FRAG_RESULT_DEPTH:
3548          so->writes_pos = true;
3549          break;
3550       case FRAG_RESULT_COLOR:
3551          if (!ctx->s->info.fs.color_is_dual_source) {
3552             so->color0_mrt = 1;
3553          } else {
3554             slot = FRAG_RESULT_DATA0 + io.dual_source_blend_index;
3555          }
3556          break;
3557       case FRAG_RESULT_SAMPLE_MASK:
3558          so->writes_smask = true;
3559          break;
3560       case FRAG_RESULT_STENCIL:
3561          so->writes_stencilref = true;
3562          break;
3563       default:
3564          slot += io.dual_source_blend_index; /* For dual-src blend */
3565          if (slot >= FRAG_RESULT_DATA0)
3566             break;
3567          ir3_context_error(ctx, "unknown FS output name: %s\n",
3568                            gl_frag_result_name(slot));
3569       }
3570    } else if (ctx->so->type == MESA_SHADER_VERTEX ||
3571               ctx->so->type == MESA_SHADER_TESS_EVAL ||
3572               ctx->so->type == MESA_SHADER_GEOMETRY) {
3573       switch (slot) {
3574       case VARYING_SLOT_POS:
3575          so->writes_pos = true;
3576          break;
3577       case VARYING_SLOT_PSIZ:
3578          so->writes_psize = true;
3579          break;
3580       case VARYING_SLOT_PRIMITIVE_ID:
3581       case VARYING_SLOT_GS_VERTEX_FLAGS_IR3:
3582          debug_assert(ctx->so->type == MESA_SHADER_GEOMETRY);
3583          FALLTHROUGH;
3584       case VARYING_SLOT_COL0:
3585       case VARYING_SLOT_COL1:
3586       case VARYING_SLOT_BFC0:
3587       case VARYING_SLOT_BFC1:
3588       case VARYING_SLOT_FOGC:
3589       case VARYING_SLOT_CLIP_DIST0:
3590       case VARYING_SLOT_CLIP_DIST1:
3591       case VARYING_SLOT_CLIP_VERTEX:
3592       case VARYING_SLOT_LAYER:
3593       case VARYING_SLOT_VIEWPORT:
3594          break;
3595       default:
3596          if (slot >= VARYING_SLOT_VAR0)
3597             break;
3598          if ((VARYING_SLOT_TEX0 <= slot) && (slot <= VARYING_SLOT_TEX7))
3599             break;
3600          ir3_context_error(ctx, "unknown %s shader output name: %s\n",
3601                            _mesa_shader_stage_to_string(ctx->so->type),
3602                            gl_varying_slot_name_for_stage(slot, ctx->so->type));
3603       }
3604    } else {
3605       ir3_context_error(ctx, "unknown shader type: %d\n", ctx->so->type);
3606    }
3607 
3608    so->outputs_count = MAX2(so->outputs_count, n + 1);
3609    compile_assert(ctx, so->outputs_count < ARRAY_SIZE(so->outputs));
3610 
3611    so->outputs[n].slot = slot;
3612    if (io.per_view)
3613       so->outputs[n].view = offset;
3614 
3615    for (int i = 0; i < ncomp; i++) {
3616       unsigned idx = (n * 4) + i + frac;
3617       compile_assert(ctx, idx < ctx->noutputs);
3618       ctx->outputs[idx] = create_immed(ctx->block, fui(0.0));
3619    }
3620 
3621    /* if varying packing doesn't happen, we could end up in a situation
3622     * with "holes" in the output, and since the per-generation code that
3623     * sets up varying linkage registers doesn't expect to have more than
3624     * one varying per vec4 slot, pad the holes.
3625     *
3626     * Note that this should probably generate a performance warning of
3627     * some sort.
3628     */
3629    for (int i = 0; i < frac; i++) {
3630       unsigned idx = (n * 4) + i;
3631       if (!ctx->outputs[idx]) {
3632          ctx->outputs[idx] = create_immed(ctx->block, fui(0.0));
3633       }
3634    }
3635 
3636    struct ir3_instruction *const *src = ir3_get_src(ctx, &intr->src[0]);
3637    for (int i = 0; i < ncomp; i++) {
3638       unsigned idx = (n * 4) + i + frac;
3639       ctx->outputs[idx] = src[i];
3640    }
3641 }
3642 
3643 static bool
3644 uses_load_input(struct ir3_shader_variant *so)
3645 {
3646    return so->type == MESA_SHADER_VERTEX || so->type == MESA_SHADER_FRAGMENT;
3647 }
3648 
3649 static bool
3650 uses_store_output(struct ir3_shader_variant *so)
3651 {
3652    switch (so->type) {
3653    case MESA_SHADER_VERTEX:
3654       return !so->key.has_gs && !so->key.tessellation;
3655    case MESA_SHADER_TESS_EVAL:
3656       return !so->key.has_gs;
3657    case MESA_SHADER_GEOMETRY:
3658    case MESA_SHADER_FRAGMENT:
3659       return true;
3660    case MESA_SHADER_TESS_CTRL:
3661    case MESA_SHADER_COMPUTE:
3662       return false;
3663    default:
3664       unreachable("unknown stage");
3665    }
3666 }
3667 
3668 static void
3669 emit_instructions(struct ir3_context *ctx)
3670 {
3671    nir_function_impl *fxn = nir_shader_get_entrypoint(ctx->s);
3672 
3673    /* some varying setup which can't be done in setup_input(): */
3674    if (ctx->so->type == MESA_SHADER_FRAGMENT) {
3675       nir_foreach_shader_in_variable (var, ctx->s) {
3676          /* if any varyings have the 'sample' qualifier, that triggers us
3677           * to run in per-sample mode:
3678           */
3679          if (var->data.sample)
3680             ctx->so->per_samp = true;
3681 
3682          /* set rasterflat flag for front/back color */
3683          if (var->data.interpolation == INTERP_MODE_NONE) {
3684             switch (var->data.location) {
3685             case VARYING_SLOT_COL0:
3686             case VARYING_SLOT_COL1:
3687             case VARYING_SLOT_BFC0:
3688             case VARYING_SLOT_BFC1:
3689                ctx->so->inputs[var->data.driver_location].rasterflat = true;
3690                break;
3691             default:
3692                break;
3693             }
3694          }
3695       }
3696    }
3697 
3698    if (uses_load_input(ctx->so)) {
3699       ctx->so->inputs_count = ctx->s->num_inputs;
3700       compile_assert(ctx, ctx->so->inputs_count < ARRAY_SIZE(ctx->so->inputs));
3701       ctx->ninputs = ctx->s->num_inputs * 4;
3702       ctx->inputs = rzalloc_array(ctx, struct ir3_instruction *, ctx->ninputs);
3703    } else {
3704       ctx->ninputs = 0;
3705       ctx->so->inputs_count = 0;
3706    }
3707 
3708    if (uses_store_output(ctx->so)) {
3709       ctx->noutputs = ctx->s->num_outputs * 4;
3710       ctx->outputs =
3711          rzalloc_array(ctx, struct ir3_instruction *, ctx->noutputs);
3712    } else {
3713       ctx->noutputs = 0;
3714    }
3715 
3716    ctx->ir = ir3_create(ctx->compiler, ctx->so);
3717 
3718    /* Create inputs in first block: */
3719    ctx->block = get_block(ctx, nir_start_block(fxn));
3720    ctx->in_block = ctx->block;
3721 
3722    /* for fragment shader, the vcoord input register is used as the
3723     * base for bary.f varying fetch instrs:
3724     *
3725     * TODO defer creating ctx->ij_pixel and corresponding sysvals
3726     * until emit_intrinsic when we know they are actually needed.
3727     * For now, we defer creating ctx->ij_centroid, etc, since we
3728     * only need ij_pixel for "old style" varying inputs (ie.
3729     * tgsi_to_nir)
3730     */
3731    if (ctx->so->type == MESA_SHADER_FRAGMENT) {
3732       ctx->ij[IJ_PERSP_PIXEL] = create_input(ctx, 0x3);
3733    }
3734 
3735    /* Defer add_sysval_input() stuff until after setup_inputs(),
3736     * because sysvals need to be appended after varyings:
3737     */
3738    if (ctx->ij[IJ_PERSP_PIXEL]) {
3739       add_sysval_input_compmask(ctx, SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL, 0x3,
3740                                 ctx->ij[IJ_PERSP_PIXEL]);
3741    }
3742 
3743    /* Tessellation shaders always need primitive ID for indexing the
3744     * BO. Geometry shaders don't always need it, but when they do it has to be
3745     * delivered and unclobbered in the VS. To make things easy, we always
3746     * make room for it in VS/DS.
3747     */
3748    bool has_tess = ctx->so->key.tessellation != IR3_TESS_NONE;
3749    bool has_gs = ctx->so->key.has_gs;
3750    switch (ctx->so->type) {
3751    case MESA_SHADER_VERTEX:
3752       if (has_tess) {
3753          ctx->tcs_header =
3754             create_sysval_input(ctx, SYSTEM_VALUE_TCS_HEADER_IR3, 0x1);
3755          ctx->rel_patch_id =
3756             create_sysval_input(ctx, SYSTEM_VALUE_REL_PATCH_ID_IR3, 0x1);
3757          ctx->primitive_id =
3758             create_sysval_input(ctx, SYSTEM_VALUE_PRIMITIVE_ID, 0x1);
3759       } else if (has_gs) {
3760          ctx->gs_header =
3761             create_sysval_input(ctx, SYSTEM_VALUE_GS_HEADER_IR3, 0x1);
3762          ctx->primitive_id =
3763             create_sysval_input(ctx, SYSTEM_VALUE_PRIMITIVE_ID, 0x1);
3764       }
3765       break;
3766    case MESA_SHADER_TESS_CTRL:
3767       ctx->tcs_header =
3768          create_sysval_input(ctx, SYSTEM_VALUE_TCS_HEADER_IR3, 0x1);
3769       ctx->rel_patch_id =
3770          create_sysval_input(ctx, SYSTEM_VALUE_REL_PATCH_ID_IR3, 0x1);
3771       break;
3772    case MESA_SHADER_TESS_EVAL:
3773       if (has_gs) {
3774          ctx->gs_header =
3775             create_sysval_input(ctx, SYSTEM_VALUE_GS_HEADER_IR3, 0x1);
3776          ctx->primitive_id =
3777             create_sysval_input(ctx, SYSTEM_VALUE_PRIMITIVE_ID, 0x1);
3778       }
3779       ctx->rel_patch_id =
3780          create_sysval_input(ctx, SYSTEM_VALUE_REL_PATCH_ID_IR3, 0x1);
3781       break;
3782    case MESA_SHADER_GEOMETRY:
3783       ctx->gs_header =
3784          create_sysval_input(ctx, SYSTEM_VALUE_GS_HEADER_IR3, 0x1);
3785       break;
3786    default:
3787       break;
3788    }
3789 
3790    /* Find # of samplers. Just assume that we'll be reading from images.. if
3791     * it is write-only we don't have to count it, but after lowering derefs
3792     * it is too late to compact indices for that.
3793     */
3794    ctx->so->num_samp =
3795       BITSET_LAST_BIT(ctx->s->info.textures_used) + ctx->s->info.num_images;
3796 
3797    /* Save off clip+cull information. */
3798    ctx->so->clip_mask = MASK(ctx->s->info.clip_distance_array_size);
3799    ctx->so->cull_mask = MASK(ctx->s->info.cull_distance_array_size)
3800                         << ctx->s->info.clip_distance_array_size;
3801 
3802    ctx->so->pvtmem_size = ctx->s->scratch_size;
3803    ctx->so->shared_size = ctx->s->info.shared_size;
3804 
3805    /* NOTE: need to do something more clever when we support >1 fxn */
3806    nir_foreach_register (reg, &fxn->registers) {
3807       ir3_declare_array(ctx, reg);
3808    }
3809 
3810    if (ctx->so->type == MESA_SHADER_TESS_CTRL &&
3811        ctx->compiler->tess_use_shared) {
3812       struct ir3_instruction *barrier = ir3_BAR(ctx->block);
3813       barrier->flags = IR3_INSTR_SS | IR3_INSTR_SY;
3814       barrier->barrier_class = IR3_BARRIER_EVERYTHING;
3815       array_insert(ctx->block, ctx->block->keeps, barrier);
3816    }
3817 
3818    /* And emit the body: */
3819    ctx->impl = fxn;
3820    emit_function(ctx, fxn);
3821 }
3822 
3823 /* Fixup tex sampler state for astc/srgb workaround instructions.  We
3824  * need to assign the tex state indexes for these after we know the
3825  * max tex index.
3826  */
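/* For illustration: with textures 0..3 in use (ctx->max_texture_index == 3)
 * and workaround 'sam' instructions referencing tex 1 and tex 2, the alpha
 * tex states get indices 4 and 5, astc_srgb.base == 4, and
 * astc_srgb.orig_idx[] == { 1, 2 }.
 */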
3827 static void
3828 fixup_astc_srgb(struct ir3_context *ctx)
3829 {
3830    struct ir3_shader_variant *so = ctx->so;
3831    /* indexed by original tex idx, value is newly assigned alpha sampler
3832     * state tex idx.  Zero is invalid since there is at least one sampler
3833     * if we get here.
3834     */
3835    unsigned alt_tex_state[16] = {0};
3836    unsigned tex_idx = ctx->max_texture_index + 1;
3837    unsigned idx = 0;
3838 
3839    so->astc_srgb.base = tex_idx;
3840 
3841    for (unsigned i = 0; i < ctx->ir->astc_srgb_count; i++) {
3842       struct ir3_instruction *sam = ctx->ir->astc_srgb[i];
3843 
3844       compile_assert(ctx, sam->cat5.tex < ARRAY_SIZE(alt_tex_state));
3845 
3846       if (alt_tex_state[sam->cat5.tex] == 0) {
3847          /* assign new alternate/alpha tex state slot: */
3848          alt_tex_state[sam->cat5.tex] = tex_idx++;
3849          so->astc_srgb.orig_idx[idx++] = sam->cat5.tex;
3850          so->astc_srgb.count++;
3851       }
3852 
3853       sam->cat5.tex = alt_tex_state[sam->cat5.tex];
3854    }
3855 }
3856 
3857 static bool
3858 output_slot_used_for_binning(gl_varying_slot slot)
3859 {
3860    return slot == VARYING_SLOT_POS || slot == VARYING_SLOT_PSIZ ||
3861           slot == VARYING_SLOT_CLIP_DIST0 || slot == VARYING_SLOT_CLIP_DIST1 ||
3862           slot == VARYING_SLOT_VIEWPORT;
3863 }
3864 
3865 static struct ir3_instruction *
3866 find_end(struct ir3 *ir)
3867 {
3868    foreach_block_rev (block, &ir->block_list) {
3869       foreach_instr_rev (instr, &block->instr_list) {
3870          if (instr->opc == OPC_END || instr->opc == OPC_CHMASK)
3871             return instr;
3872       }
3873    }
3874    unreachable("couldn't find end instruction");
3875 }
3876 
3877 static void
3878 fixup_binning_pass(struct ir3_context *ctx, struct ir3_instruction *end)
3879 {
3880    struct ir3_shader_variant *so = ctx->so;
3881    unsigned i, j;
3882 
3883    /* first pass, remove unused outputs from the IR level outputs: */
3884    for (i = 0, j = 0; i < end->srcs_count; i++) {
3885       unsigned outidx = end->end.outidxs[i];
3886       unsigned slot = so->outputs[outidx].slot;
3887 
3888       if (output_slot_used_for_binning(slot)) {
3889          end->srcs[j] = end->srcs[i];
3890          end->end.outidxs[j] = end->end.outidxs[i];
3891          j++;
3892       }
3893    }
3894    end->srcs_count = j;
3895 
3896    /* second pass, cleanup the unused slots in ir3_shader_variant::outputs
3897     * table:
3898     */
3899    for (i = 0, j = 0; i < so->outputs_count; i++) {
3900       unsigned slot = so->outputs[i].slot;
3901 
3902       if (output_slot_used_for_binning(slot)) {
3903          so->outputs[j] = so->outputs[i];
3904 
3905          /* fixup outidx to point to new output table entry: */
3906          for (unsigned k = 0; k < end->srcs_count; k++) {
3907             if (end->end.outidxs[k] == i) {
3908                end->end.outidxs[k] = j;
3909                break;
3910             }
3911          }
3912 
3913          j++;
3914       }
3915    }
3916    so->outputs_count = j;
3917 }
3918 
3919 static void
3920 collect_tex_prefetches(struct ir3_context *ctx, struct ir3 *ir)
3921 {
3922    unsigned idx = 0;
3923 
3924    /* Collect sampling instructions eligible for pre-dispatch. */
3925    foreach_block (block, &ir->block_list) {
3926       foreach_instr_safe (instr, &block->instr_list) {
3927          if (instr->opc == OPC_META_TEX_PREFETCH) {
3928             assert(idx < ARRAY_SIZE(ctx->so->sampler_prefetch));
3929             struct ir3_sampler_prefetch *fetch =
3930                &ctx->so->sampler_prefetch[idx];
3931             idx++;
3932 
3933             if (instr->flags & IR3_INSTR_B) {
3934                fetch->cmd = IR3_SAMPLER_BINDLESS_PREFETCH_CMD;
3935                /* In bindless mode, the index is actually the base */
3936                fetch->tex_id = instr->prefetch.tex_base;
3937                fetch->samp_id = instr->prefetch.samp_base;
3938                fetch->tex_bindless_id = instr->prefetch.tex;
3939                fetch->samp_bindless_id = instr->prefetch.samp;
3940             } else {
3941                fetch->cmd = IR3_SAMPLER_PREFETCH_CMD;
3942                fetch->tex_id = instr->prefetch.tex;
3943                fetch->samp_id = instr->prefetch.samp;
3944             }
3945             fetch->wrmask = instr->dsts[0]->wrmask;
3946             fetch->dst = instr->dsts[0]->num;
3947             fetch->src = instr->prefetch.input_offset;
3948 
3949             /* These are the limits on a5xx/a6xx, we might need to
3950              * revisit if SP_FS_PREFETCH[n] changes on later gens:
3951              */
3952             assert(fetch->dst <= 0x3f);
3953             assert(fetch->tex_id <= 0x1f);
3954             assert(fetch->samp_id <= 0xf);
3955 
3956             ctx->so->total_in =
3957                MAX2(ctx->so->total_in, instr->prefetch.input_offset + 2);
3958 
3959             fetch->half_precision = !!(instr->dsts[0]->flags & IR3_REG_HALF);
3960 
3961             /* Remove the prefetch placeholder instruction: */
3962             list_delinit(&instr->node);
3963          }
3964       }
3965    }
3966 }
3967 
3968 int
3969 ir3_compile_shader_nir(struct ir3_compiler *compiler,
3970                        struct ir3_shader_variant *so)
3971 {
3972    struct ir3_context *ctx;
3973    struct ir3 *ir;
3974    int ret = 0, max_bary;
3975    bool progress;
3976 
3977    assert(!so->ir);
3978 
3979    ctx = ir3_context_init(compiler, so);
3980    if (!ctx) {
3981       DBG("INIT failed!");
3982       ret = -1;
3983       goto out;
3984    }
3985 
3986    emit_instructions(ctx);
3987 
3988    if (ctx->error) {
3989       DBG("EMIT failed!");
3990       ret = -1;
3991       goto out;
3992    }
3993 
3994    ir = so->ir = ctx->ir;
3995 
3996    if (so->type == MESA_SHADER_COMPUTE) {
3997       so->local_size[0] = ctx->s->info.workgroup_size[0];
3998       so->local_size[1] = ctx->s->info.workgroup_size[1];
3999       so->local_size[2] = ctx->s->info.workgroup_size[2];
4000       so->local_size_variable = ctx->s->info.workgroup_size_variable;
4001    }
4002 
4003    /* Vertex shaders in a tessellation or geometry pipeline treat END as a
4004     * NOP and have an epilogue that writes the VS outputs to local storage, to
4005     * be read by the HS.  They then reset the execution mask (chmask) and chain
4006     * to the next shader (chsh). There are also a few output values which we
4007     * must send to the next stage via registers, and in order for both stages
4008     * to agree on the register used we must force these to be in specific
4009     * registers.
4010     */
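   /* Register convention used below (and matched by the sysval fixups later
    * in this function): the tcs/gs header goes in r0.x, rel_patch_id in r0.y,
    * and primitive_id in r0.y, or r0.z when a rel_patch_id is also present.
    */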
4011    if ((so->type == MESA_SHADER_VERTEX &&
4012         (so->key.has_gs || so->key.tessellation)) ||
4013        (so->type == MESA_SHADER_TESS_EVAL && so->key.has_gs)) {
4014       struct ir3_instruction *outputs[3];
4015       unsigned outidxs[3];
4016       unsigned regids[3];
4017       unsigned outputs_count = 0;
4018 
4019       if (ctx->primitive_id) {
4020          unsigned n = so->outputs_count++;
4021          so->outputs[n].slot = VARYING_SLOT_PRIMITIVE_ID;
4022 
4023          struct ir3_instruction *out = ir3_collect(ctx->block, ctx->primitive_id);
4024          outputs[outputs_count] = out;
4025          outidxs[outputs_count] = n;
4026          if (so->type == MESA_SHADER_VERTEX && ctx->rel_patch_id)
4027             regids[outputs_count] = regid(0, 2);
4028          else
4029             regids[outputs_count] = regid(0, 1);
4030          outputs_count++;
4031       }
4032 
4033       if (so->type == MESA_SHADER_VERTEX && ctx->rel_patch_id) {
4034          unsigned n = so->outputs_count++;
4035          so->outputs[n].slot = VARYING_SLOT_REL_PATCH_ID_IR3;
4036          struct ir3_instruction *out = ir3_collect(ctx->block, ctx->rel_patch_id);
4037          outputs[outputs_count] = out;
4038          outidxs[outputs_count] = n;
4039          regids[outputs_count] = regid(0, 1);
4040          outputs_count++;
4041       }
4042 
4043       if (ctx->gs_header) {
4044          unsigned n = so->outputs_count++;
4045          so->outputs[n].slot = VARYING_SLOT_GS_HEADER_IR3;
4046          struct ir3_instruction *out = ir3_collect(ctx->block, ctx->gs_header);
4047          outputs[outputs_count] = out;
4048          outidxs[outputs_count] = n;
4049          regids[outputs_count] = regid(0, 0);
4050          outputs_count++;
4051       }
4052 
4053       if (ctx->tcs_header) {
4054          unsigned n = so->outputs_count++;
4055          so->outputs[n].slot = VARYING_SLOT_TCS_HEADER_IR3;
4056          struct ir3_instruction *out = ir3_collect(ctx->block, ctx->tcs_header);
4057          outputs[outputs_count] = out;
4058          outidxs[outputs_count] = n;
4059          regids[outputs_count] = regid(0, 0);
4060          outputs_count++;
4061       }
4062 
4063       struct ir3_instruction *chmask =
4064          ir3_instr_create(ctx->block, OPC_CHMASK, 0, outputs_count);
4065       chmask->barrier_class = IR3_BARRIER_EVERYTHING;
4066       chmask->barrier_conflict = IR3_BARRIER_EVERYTHING;
4067 
4068       for (unsigned i = 0; i < outputs_count; i++)
4069          __ssa_src(chmask, outputs[i], 0)->num = regids[i];
4070 
4071       chmask->end.outidxs = ralloc_array(chmask, unsigned, outputs_count);
4072       memcpy(chmask->end.outidxs, outidxs, sizeof(unsigned) * outputs_count);
4073 
4074       array_insert(ctx->block, ctx->block->keeps, chmask);
4075 
4076       struct ir3_instruction *chsh = ir3_CHSH(ctx->block);
4077       chsh->barrier_class = IR3_BARRIER_EVERYTHING;
4078       chsh->barrier_conflict = IR3_BARRIER_EVERYTHING;
4079    } else {
4080       assert((ctx->noutputs % 4) == 0);
4081       unsigned outidxs[ctx->noutputs / 4];
4082       struct ir3_instruction *outputs[ctx->noutputs / 4];
4083       unsigned outputs_count = 0;
4084 
4085       struct ir3_block *b = ctx->block;
4086       /* Insert these collect's in the block before the end-block if
4087        * possible, so that any moves they generate can be shuffled around to
4088        * reduce nop's:
4089        */
4090       if (ctx->block->predecessors_count == 1)
4091          b = ctx->block->predecessors[0];
4092 
4093       /* Setup IR level outputs, which are "collects" that gather
4094        * the scalar components of outputs.
4095        */
4096       for (unsigned i = 0; i < ctx->noutputs; i += 4) {
4097          unsigned ncomp = 0;
4098          /* figure out the # of components written:
4099           *
4100           * TODO do we need to handle holes, ie. if .x and .z
4101           * components written, but .y component not written?
4102           */
4103          for (unsigned j = 0; j < 4; j++) {
4104             if (!ctx->outputs[i + j])
4105                break;
4106             ncomp++;
4107          }
4108 
4109          /* Note that in some stages, like TCS, store_output is
4110           * lowered to memory writes, so no components of the
4111           * output are "written" from the PoV of traditional store-
4112           * output instructions:
4113           */
4114          if (!ncomp)
4115             continue;
4116 
4117          struct ir3_instruction *out =
4118             ir3_create_collect(b, &ctx->outputs[i], ncomp);
4119 
4120          int outidx = i / 4;
4121          assert(outidx < so->outputs_count);
4122 
4123          outidxs[outputs_count] = outidx;
4124          outputs[outputs_count] = out;
4125          outputs_count++;
4126       }
4127 
4128       /* for a6xx+, binning and draw pass VS use the same VBO state, so we
4129        * need to make sure not to remove any inputs that are used by
4130        * the nonbinning VS.
4131        */
4132       if (ctx->compiler->gen >= 6 && so->binning_pass &&
4133           so->type == MESA_SHADER_VERTEX) {
4134          for (int i = 0; i < ctx->ninputs; i++) {
4135             struct ir3_instruction *in = ctx->inputs[i];
4136 
4137             if (!in)
4138                continue;
4139 
4140             unsigned n = i / 4;
4141             unsigned c = i % 4;
4142 
4143             debug_assert(n < so->nonbinning->inputs_count);
4144 
4145             if (so->nonbinning->inputs[n].sysval)
4146                continue;
4147 
4148             /* be sure to keep inputs, even if only used in VS */
4149             if (so->nonbinning->inputs[n].compmask & (1 << c))
4150                array_insert(in->block, in->block->keeps, in);
4151          }
4152       }
4153 
4154       struct ir3_instruction *end =
4155          ir3_instr_create(ctx->block, OPC_END, 0, outputs_count);
4156 
4157       for (unsigned i = 0; i < outputs_count; i++) {
4158          __ssa_src(end, outputs[i], 0);
4159       }
4160 
4161       end->end.outidxs = ralloc_array(end, unsigned, outputs_count);
4162       memcpy(end->end.outidxs, outidxs, sizeof(unsigned) * outputs_count);
4163 
4164       array_insert(ctx->block, ctx->block->keeps, end);
4165 
4166       /* at this point, for binning pass, throw away unneeded outputs: */
4167       if (so->binning_pass && (ctx->compiler->gen < 6))
4168          fixup_binning_pass(ctx, end);
4169    }
4170 
4171    ir3_debug_print(ir, "AFTER: nir->ir3");
4172    ir3_validate(ir);
4173 
4174    IR3_PASS(ir, ir3_remove_unreachable);
4175 
4176    IR3_PASS(ir, ir3_array_to_ssa);
4177 
4178    do {
4179       progress = false;
4180 
4181       progress |= IR3_PASS(ir, ir3_cf);
4182       progress |= IR3_PASS(ir, ir3_cp, so);
4183       progress |= IR3_PASS(ir, ir3_cse);
4184       progress |= IR3_PASS(ir, ir3_dce, so);
4185    } while (progress);
4186 
4187    /* at this point, for binning pass, throw away unneeded outputs:
4188     * Note that for a6xx and later, we do this after ir3_cp to ensure
4189     * that the uniform/constant layout for BS and VS matches, so that
4190     * we can re-use same VS_CONST state group.
4191     */
4192    if (so->binning_pass && (ctx->compiler->gen >= 6)) {
4193       fixup_binning_pass(ctx, find_end(ctx->so->ir));
4194       /* cleanup the result of removing unneeded outputs: */
4195       while (IR3_PASS(ir, ir3_dce, so)) {
4196       }
4197    }
4198 
4199    IR3_PASS(ir, ir3_sched_add_deps);
4200 
4201    /* At this point, all the dead code should be long gone: */
4202    assert(!IR3_PASS(ir, ir3_dce, so));
4203 
4204    ret = ir3_sched(ir);
4205    if (ret) {
4206       DBG("SCHED failed!");
4207       goto out;
4208    }
4209 
4210    ir3_debug_print(ir, "AFTER: ir3_sched");
4211 
4212    if (IR3_PASS(ir, ir3_cp_postsched)) {
4213       /* cleanup the result of removing unneeded mov's: */
4214       while (IR3_PASS(ir, ir3_dce, so)) {
4215       }
4216    }
4217 
4218    /* Pre-assign VS inputs on a6xx+ binning pass shader, to align
4219     * with draw pass VS, so binning and draw pass can both use the
4220     * same VBO state.
4221     *
4222     * Note that VS inputs are expected to be full precision.
4223     */
4224    bool pre_assign_inputs = (ir->compiler->gen >= 6) &&
4225                             (ir->type == MESA_SHADER_VERTEX) &&
4226                             so->binning_pass;
4227 
4228    if (pre_assign_inputs) {
4229       foreach_input (in, ir) {
4230          assert(in->opc == OPC_META_INPUT);
4231          unsigned inidx = in->input.inidx;
4232 
4233          in->dsts[0]->num = so->nonbinning->inputs[inidx].regid;
4234       }
4235    } else if (ctx->tcs_header) {
4236       /* We need to have these values in the same registers between VS and TCS
4237        * since the VS chains to TCS and doesn't get the sysvals redelivered.
4238        */
4239 
4240       ctx->tcs_header->dsts[0]->num = regid(0, 0);
4241       ctx->rel_patch_id->dsts[0]->num = regid(0, 1);
4242       if (ctx->primitive_id)
4243          ctx->primitive_id->dsts[0]->num = regid(0, 2);
4244    } else if (ctx->gs_header) {
4245       /* We need to have these values in the same registers between producer
4246        * (VS or DS) and GS since the producer chains to GS and doesn't get
4247        * the sysvals redelivered.
4248        */
4249 
4250       ctx->gs_header->dsts[0]->num = regid(0, 0);
4251       if (ctx->primitive_id)
4252          ctx->primitive_id->dsts[0]->num = regid(0, 1);
4253    } else if (so->num_sampler_prefetch) {
4254       assert(so->type == MESA_SHADER_FRAGMENT);
4255       int idx = 0;
4256 
4257       foreach_input (instr, ir) {
4258          if (instr->input.sysval != SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL)
4259             continue;
4260 
4261          assert(idx < 2);
4262          instr->dsts[0]->num = idx;
4263          idx++;
4264       }
4265    }
4266 
4267    ret = ir3_ra(so);
4268 
4269    if (ret) {
4270       mesa_loge("ir3_ra() failed!");
4271       goto out;
4272    }
4273 
4274    IR3_PASS(ir, ir3_postsched, so);
4275 
4276    IR3_PASS(ir, ir3_lower_subgroups);
4277 
4278    if (so->type == MESA_SHADER_FRAGMENT)
4279       pack_inlocs(ctx);
4280 
4281    /*
4282     * Fixup inputs/outputs to point to the actual registers assigned:
4283     *
4284     * 1) initialize to r63.x (invalid/unused)
4285     * 2) iterate IR level inputs/outputs and update the variants
4286     *    inputs/outputs table based on the assigned registers for
4287     *    the remaining inputs/outputs.
4288     */
4289 
4290    for (unsigned i = 0; i < so->inputs_count; i++)
4291       so->inputs[i].regid = INVALID_REG;
4292    for (unsigned i = 0; i < so->outputs_count; i++)
4293       so->outputs[i].regid = INVALID_REG;
4294 
4295    struct ir3_instruction *end = find_end(so->ir);
4296 
4297    for (unsigned i = 0; i < end->srcs_count; i++) {
4298       unsigned outidx = end->end.outidxs[i];
4299       struct ir3_register *reg = end->srcs[i];
4300 
4301       so->outputs[outidx].regid = reg->num;
4302       so->outputs[outidx].half = !!(reg->flags & IR3_REG_HALF);
4303    }
4304 
4305    foreach_input (in, ir) {
4306       assert(in->opc == OPC_META_INPUT);
4307       unsigned inidx = in->input.inidx;
4308 
4309       if (pre_assign_inputs && !so->inputs[inidx].sysval) {
4310          if (VALIDREG(so->nonbinning->inputs[inidx].regid)) {
4311             compile_assert(
4312                ctx, in->dsts[0]->num == so->nonbinning->inputs[inidx].regid);
4313             compile_assert(ctx, !!(in->dsts[0]->flags & IR3_REG_HALF) ==
4314                                    so->nonbinning->inputs[inidx].half);
4315          }
4316          so->inputs[inidx].regid = so->nonbinning->inputs[inidx].regid;
4317          so->inputs[inidx].half = so->nonbinning->inputs[inidx].half;
4318       } else {
4319          so->inputs[inidx].regid = in->dsts[0]->num;
4320          so->inputs[inidx].half = !!(in->dsts[0]->flags & IR3_REG_HALF);
4321       }
4322    }
4323 
4324    if (ctx->astc_srgb)
4325       fixup_astc_srgb(ctx);
4326 
4327    /* We need to do legalize after (for frag shader's) the "bary.f"
4328     * offsets (inloc) have been assigned.
4329     */
4330    IR3_PASS(ir, ir3_legalize, so, &max_bary);
4331 
4332    /* Set (ss)(sy) on first TCS and GEOMETRY instructions, since we don't
4333     * know what we might have to wait on when coming in from VS chsh.
4334     */
4335    if (so->type == MESA_SHADER_TESS_CTRL || so->type == MESA_SHADER_GEOMETRY) {
4336       foreach_block (block, &ir->block_list) {
4337          foreach_instr (instr, &block->instr_list) {
4338             instr->flags |= IR3_INSTR_SS | IR3_INSTR_SY;
4339             break;
4340          }
4341       }
4342    }
4343 
4344    so->branchstack = ctx->max_stack;
4345 
4346    /* Note that actual_in counts inputs that are not bary.f'd for FS: */
4347    if (so->type == MESA_SHADER_FRAGMENT)
4348       so->total_in = max_bary + 1;
4349 
4350    /* Collect sampling instructions eligible for pre-dispatch. */
4351    collect_tex_prefetches(ctx, ir);
4352 
4353    if (so->type == MESA_SHADER_FRAGMENT &&
4354        ctx->s->info.fs.needs_quad_helper_invocations)
4355       so->need_pixlod = true;
4356 
4357 out:
4358    if (ret) {
4359       if (so->ir)
4360          ir3_destroy(so->ir);
4361       so->ir = NULL;
4362    }
4363    ir3_context_free(ctx);
4364 
4365    return ret;
4366 }
4367