1 /*
2 * Copyright (C) 2015 Rob Clark <robclark@freedesktop.org>
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 * SOFTWARE.
22 *
23 * Authors:
24 * Rob Clark <robclark@freedesktop.org>
25 */
26
27 #include <stdarg.h>
28
29 #include "util/u_math.h"
30 #include "util/u_memory.h"
31 #include "util/u_string.h"
32
33 #include "ir3_compiler.h"
34 #include "ir3_image.h"
35 #include "ir3_nir.h"
36 #include "ir3_shader.h"
37
38 #include "instr-a3xx.h"
39 #include "ir3.h"
40 #include "ir3_context.h"
41
42 void
ir3_handle_nonuniform(struct ir3_instruction * instr,nir_intrinsic_instr * intrin)43 ir3_handle_nonuniform(struct ir3_instruction *instr,
44 nir_intrinsic_instr *intrin)
45 {
46 if (nir_intrinsic_has_access(intrin) &&
47 (nir_intrinsic_access(intrin) & ACCESS_NON_UNIFORM)) {
48 instr->flags |= IR3_INSTR_NONUNIF;
49 }
50 }
51
52 void
ir3_handle_bindless_cat6(struct ir3_instruction * instr,nir_src rsrc)53 ir3_handle_bindless_cat6(struct ir3_instruction *instr, nir_src rsrc)
54 {
55 nir_intrinsic_instr *intrin = ir3_bindless_resource(rsrc);
56 if (!intrin)
57 return;
58
59 instr->flags |= IR3_INSTR_B;
60 instr->cat6.base = nir_intrinsic_desc_set(intrin);
61 }
62
63 static struct ir3_instruction *
create_input(struct ir3_context * ctx,unsigned compmask)64 create_input(struct ir3_context *ctx, unsigned compmask)
65 {
66 struct ir3_instruction *in;
67
68 in = ir3_instr_create(ctx->in_block, OPC_META_INPUT, 1, 0);
69 in->input.sysval = ~0;
70 __ssa_dst(in)->wrmask = compmask;
71
72 array_insert(ctx->ir, ctx->ir->inputs, in);
73
74 return in;
75 }
76
77 static struct ir3_instruction *
create_frag_input(struct ir3_context * ctx,struct ir3_instruction * coord,unsigned n)78 create_frag_input(struct ir3_context *ctx, struct ir3_instruction *coord,
79 unsigned n)
80 {
81 struct ir3_block *block = ctx->block;
82 struct ir3_instruction *instr;
83 /* packed inloc is fixed up later: */
84 struct ir3_instruction *inloc = create_immed(block, n);
85
86 if (coord) {
87 instr = ir3_BARY_F(block, inloc, 0, coord, 0);
88 } else if (ctx->compiler->flat_bypass) {
89 instr = ir3_LDLV(block, inloc, 0, create_immed(block, 1), 0);
90 instr->cat6.type = TYPE_U32;
91 instr->cat6.iim_val = 1;
92 } else {
93 instr = ir3_BARY_F(block, inloc, 0, ctx->ij[IJ_PERSP_PIXEL], 0);
94 instr->srcs[1]->wrmask = 0x3;
95 }
96
97 return instr;
98 }
99
100 static struct ir3_instruction *
create_driver_param(struct ir3_context * ctx,enum ir3_driver_param dp)101 create_driver_param(struct ir3_context *ctx, enum ir3_driver_param dp)
102 {
103 /* first four vec4 sysval's reserved for UBOs: */
104 /* NOTE: dp is in scalar, but there can be >4 dp components: */
105 struct ir3_const_state *const_state = ir3_const_state(ctx->so);
106 unsigned n = const_state->offsets.driver_param;
107 unsigned r = regid(n + dp / 4, dp % 4);
108 return create_uniform(ctx->block, r);
109 }
110
111 /*
112 * Adreno's comparisons produce a 1 for true and 0 for false, in either 16 or
113 * 32-bit registers. We use NIR's 1-bit integers to represent bools, and
114 * trust that we will only see and/or/xor on those 1-bit values, so we can
115 * safely store NIR i1s in a 32-bit reg while always containing either a 1 or
116 * 0.
117 */
118
119 /*
120 * alu/sfu instructions:
121 */
122
123 static struct ir3_instruction *
create_cov(struct ir3_context * ctx,struct ir3_instruction * src,unsigned src_bitsize,nir_op op)124 create_cov(struct ir3_context *ctx, struct ir3_instruction *src,
125 unsigned src_bitsize, nir_op op)
126 {
127 type_t src_type, dst_type;
128
129 switch (op) {
130 case nir_op_f2f32:
131 case nir_op_f2f16_rtne:
132 case nir_op_f2f16_rtz:
133 case nir_op_f2f16:
134 case nir_op_f2i32:
135 case nir_op_f2i16:
136 case nir_op_f2i8:
137 case nir_op_f2u32:
138 case nir_op_f2u16:
139 case nir_op_f2u8:
140 switch (src_bitsize) {
141 case 32:
142 src_type = TYPE_F32;
143 break;
144 case 16:
145 src_type = TYPE_F16;
146 break;
147 default:
148 ir3_context_error(ctx, "invalid src bit size: %u", src_bitsize);
149 }
150 break;
151
152 case nir_op_i2f32:
153 case nir_op_i2f16:
154 case nir_op_i2i32:
155 case nir_op_i2i16:
156 case nir_op_i2i8:
157 switch (src_bitsize) {
158 case 32:
159 src_type = TYPE_S32;
160 break;
161 case 16:
162 src_type = TYPE_S16;
163 break;
164 case 8:
165 src_type = TYPE_S8;
166 break;
167 default:
168 ir3_context_error(ctx, "invalid src bit size: %u", src_bitsize);
169 }
170 break;
171
172 case nir_op_u2f32:
173 case nir_op_u2f16:
174 case nir_op_u2u32:
175 case nir_op_u2u16:
176 case nir_op_u2u8:
177 switch (src_bitsize) {
178 case 32:
179 src_type = TYPE_U32;
180 break;
181 case 16:
182 src_type = TYPE_U16;
183 break;
184 case 8:
185 src_type = TYPE_U8;
186 break;
187 default:
188 ir3_context_error(ctx, "invalid src bit size: %u", src_bitsize);
189 }
190 break;
191
192 case nir_op_b2f16:
193 case nir_op_b2f32:
194 case nir_op_b2i8:
195 case nir_op_b2i16:
196 case nir_op_b2i32:
197 src_type = TYPE_U32;
198 break;
199
200 default:
201 ir3_context_error(ctx, "invalid conversion op: %u", op);
202 }
203
204 switch (op) {
205 case nir_op_f2f32:
206 case nir_op_i2f32:
207 case nir_op_u2f32:
208 case nir_op_b2f32:
209 dst_type = TYPE_F32;
210 break;
211
212 case nir_op_f2f16_rtne:
213 case nir_op_f2f16_rtz:
214 case nir_op_f2f16:
215 case nir_op_i2f16:
216 case nir_op_u2f16:
217 case nir_op_b2f16:
218 dst_type = TYPE_F16;
219 break;
220
221 case nir_op_f2i32:
222 case nir_op_i2i32:
223 case nir_op_b2i32:
224 dst_type = TYPE_S32;
225 break;
226
227 case nir_op_f2i16:
228 case nir_op_i2i16:
229 case nir_op_b2i16:
230 dst_type = TYPE_S16;
231 break;
232
233 case nir_op_f2i8:
234 case nir_op_i2i8:
235 case nir_op_b2i8:
236 dst_type = TYPE_S8;
237 break;
238
239 case nir_op_f2u32:
240 case nir_op_u2u32:
241 dst_type = TYPE_U32;
242 break;
243
244 case nir_op_f2u16:
245 case nir_op_u2u16:
246 dst_type = TYPE_U16;
247 break;
248
249 case nir_op_f2u8:
250 case nir_op_u2u8:
251 dst_type = TYPE_U8;
252 break;
253
254 default:
255 ir3_context_error(ctx, "invalid conversion op: %u", op);
256 }
257
258 if (src_type == dst_type)
259 return src;
260
261 struct ir3_instruction *cov = ir3_COV(ctx->block, src, src_type, dst_type);
262
263 if (op == nir_op_f2f16_rtne) {
264 cov->cat1.round = ROUND_EVEN;
265 } else if (op == nir_op_f2f16) {
266 unsigned execution_mode = ctx->s->info.float_controls_execution_mode;
267 nir_rounding_mode rounding_mode =
268 nir_get_rounding_mode_from_float_controls(execution_mode,
269 nir_type_float16);
270 if (rounding_mode == nir_rounding_mode_rtne)
271 cov->cat1.round = ROUND_EVEN;
272 }
273
274 return cov;
275 }
276
277 /* For shift instructions NIR always has shift amount as 32 bit integer */
278 static struct ir3_instruction *
resize_shift_amount(struct ir3_context * ctx,struct ir3_instruction * src,unsigned bs)279 resize_shift_amount(struct ir3_context *ctx, struct ir3_instruction *src,
280 unsigned bs)
281 {
282 if (bs != 16)
283 return src;
284
285 return ir3_COV(ctx->block, src, TYPE_U32, TYPE_U16);
286 }
287
288 static void
emit_alu(struct ir3_context * ctx,nir_alu_instr * alu)289 emit_alu(struct ir3_context *ctx, nir_alu_instr *alu)
290 {
291 const nir_op_info *info = &nir_op_infos[alu->op];
292 struct ir3_instruction **dst, *src[info->num_inputs];
293 unsigned bs[info->num_inputs]; /* bit size */
294 struct ir3_block *b = ctx->block;
295 unsigned dst_sz, wrmask;
296 type_t dst_type =
297 nir_dest_bit_size(alu->dest.dest) == 16 ? TYPE_U16 : TYPE_U32;
298
299 if (alu->dest.dest.is_ssa) {
300 dst_sz = alu->dest.dest.ssa.num_components;
301 wrmask = (1 << dst_sz) - 1;
302 } else {
303 dst_sz = alu->dest.dest.reg.reg->num_components;
304 wrmask = alu->dest.write_mask;
305 }
306
307 dst = ir3_get_dst(ctx, &alu->dest.dest, dst_sz);
308
309 /* Vectors are special in that they have non-scalarized writemasks,
310 * and just take the first swizzle channel for each argument in
311 * order into each writemask channel.
312 */
313 if ((alu->op == nir_op_vec2) || (alu->op == nir_op_vec3) ||
314 (alu->op == nir_op_vec4)) {
315
316 for (int i = 0; i < info->num_inputs; i++) {
317 nir_alu_src *asrc = &alu->src[i];
318
319 compile_assert(ctx, !asrc->abs);
320 compile_assert(ctx, !asrc->negate);
321
322 src[i] = ir3_get_src(ctx, &asrc->src)[asrc->swizzle[0]];
323 if (!src[i])
324 src[i] = create_immed_typed(ctx->block, 0, dst_type);
325 dst[i] = ir3_MOV(b, src[i], dst_type);
326 }
327
328 ir3_put_dst(ctx, &alu->dest.dest);
329 return;
330 }
331
332 /* We also get mov's with more than one component for mov's so
333 * handle those specially:
334 */
335 if (alu->op == nir_op_mov) {
336 nir_alu_src *asrc = &alu->src[0];
337 struct ir3_instruction *const *src0 = ir3_get_src(ctx, &asrc->src);
338
339 for (unsigned i = 0; i < dst_sz; i++) {
340 if (wrmask & (1 << i)) {
341 dst[i] = ir3_MOV(b, src0[asrc->swizzle[i]], dst_type);
342 } else {
343 dst[i] = NULL;
344 }
345 }
346
347 ir3_put_dst(ctx, &alu->dest.dest);
348 return;
349 }
350
351 /* General case: We can just grab the one used channel per src. */
352 for (int i = 0; i < info->num_inputs; i++) {
353 unsigned chan = ffs(alu->dest.write_mask) - 1;
354 nir_alu_src *asrc = &alu->src[i];
355
356 compile_assert(ctx, !asrc->abs);
357 compile_assert(ctx, !asrc->negate);
358
359 src[i] = ir3_get_src(ctx, &asrc->src)[asrc->swizzle[chan]];
360 bs[i] = nir_src_bit_size(asrc->src);
361
362 compile_assert(ctx, src[i]);
363 }
364
365 switch (alu->op) {
366 case nir_op_f2f32:
367 case nir_op_f2f16_rtne:
368 case nir_op_f2f16_rtz:
369 case nir_op_f2f16:
370 case nir_op_f2i32:
371 case nir_op_f2i16:
372 case nir_op_f2i8:
373 case nir_op_f2u32:
374 case nir_op_f2u16:
375 case nir_op_f2u8:
376 case nir_op_i2f32:
377 case nir_op_i2f16:
378 case nir_op_i2i32:
379 case nir_op_i2i16:
380 case nir_op_i2i8:
381 case nir_op_u2f32:
382 case nir_op_u2f16:
383 case nir_op_u2u32:
384 case nir_op_u2u16:
385 case nir_op_u2u8:
386 case nir_op_b2f16:
387 case nir_op_b2f32:
388 case nir_op_b2i8:
389 case nir_op_b2i16:
390 case nir_op_b2i32:
391 dst[0] = create_cov(ctx, src[0], bs[0], alu->op);
392 break;
393
394 case nir_op_fquantize2f16:
395 dst[0] = create_cov(ctx, create_cov(ctx, src[0], 32, nir_op_f2f16_rtne),
396 16, nir_op_f2f32);
397 break;
398 case nir_op_f2b1:
399 dst[0] = ir3_CMPS_F(
400 b, src[0], 0,
401 create_immed_typed(b, 0, bs[0] == 16 ? TYPE_F16 : TYPE_F32), 0);
402 dst[0]->cat2.condition = IR3_COND_NE;
403 break;
404
405 case nir_op_i2b1:
406 /* i2b1 will appear when translating from nir_load_ubo or
407 * nir_intrinsic_load_ssbo, where any non-zero value is true.
408 */
409 dst[0] = ir3_CMPS_S(
410 b, src[0], 0,
411 create_immed_typed(b, 0, bs[0] == 16 ? TYPE_U16 : TYPE_U32), 0);
412 dst[0]->cat2.condition = IR3_COND_NE;
413 break;
414
415 case nir_op_b2b1:
416 /* b2b1 will appear when translating from
417 *
418 * - nir_intrinsic_load_shared of a 32-bit 0/~0 value.
419 * - nir_intrinsic_load_constant of a 32-bit 0/~0 value
420 *
421 * A negate can turn those into a 1 or 0 for us.
422 */
423 dst[0] = ir3_ABSNEG_S(b, src[0], IR3_REG_SNEG);
424 break;
425
426 case nir_op_b2b32:
427 /* b2b32 will appear when converting our 1-bit bools to a store_shared
428 * argument.
429 *
430 * A negate can turn those into a ~0 for us.
431 */
432 dst[0] = ir3_ABSNEG_S(b, src[0], IR3_REG_SNEG);
433 break;
434
435 case nir_op_fneg:
436 dst[0] = ir3_ABSNEG_F(b, src[0], IR3_REG_FNEG);
437 break;
438 case nir_op_fabs:
439 dst[0] = ir3_ABSNEG_F(b, src[0], IR3_REG_FABS);
440 break;
441 case nir_op_fmax:
442 dst[0] = ir3_MAX_F(b, src[0], 0, src[1], 0);
443 break;
444 case nir_op_fmin:
445 dst[0] = ir3_MIN_F(b, src[0], 0, src[1], 0);
446 break;
447 case nir_op_fsat:
448 /* if there is just a single use of the src, and it supports
449 * (sat) bit, we can just fold the (sat) flag back to the
450 * src instruction and create a mov. This is easier for cp
451 * to eliminate.
452 */
453 if (alu->src[0].src.is_ssa && is_sat_compatible(src[0]->opc) &&
454 (list_length(&alu->src[0].src.ssa->uses) == 1)) {
455 src[0]->flags |= IR3_INSTR_SAT;
456 dst[0] = ir3_MOV(b, src[0], dst_type);
457 } else {
458 /* otherwise generate a max.f that saturates.. blob does
459 * similar (generating a cat2 mov using max.f)
460 */
461 dst[0] = ir3_MAX_F(b, src[0], 0, src[0], 0);
462 dst[0]->flags |= IR3_INSTR_SAT;
463 }
464 break;
465 case nir_op_fmul:
466 dst[0] = ir3_MUL_F(b, src[0], 0, src[1], 0);
467 break;
468 case nir_op_fadd:
469 dst[0] = ir3_ADD_F(b, src[0], 0, src[1], 0);
470 break;
471 case nir_op_fsub:
472 dst[0] = ir3_ADD_F(b, src[0], 0, src[1], IR3_REG_FNEG);
473 break;
474 case nir_op_ffma:
475 dst[0] = ir3_MAD_F32(b, src[0], 0, src[1], 0, src[2], 0);
476 break;
477 case nir_op_fddx:
478 case nir_op_fddx_coarse:
479 dst[0] = ir3_DSX(b, src[0], 0);
480 dst[0]->cat5.type = TYPE_F32;
481 break;
482 case nir_op_fddx_fine:
483 dst[0] = ir3_DSXPP_MACRO(b, src[0], 0);
484 dst[0]->cat5.type = TYPE_F32;
485 break;
486 case nir_op_fddy:
487 case nir_op_fddy_coarse:
488 dst[0] = ir3_DSY(b, src[0], 0);
489 dst[0]->cat5.type = TYPE_F32;
490 break;
491 break;
492 case nir_op_fddy_fine:
493 dst[0] = ir3_DSYPP_MACRO(b, src[0], 0);
494 dst[0]->cat5.type = TYPE_F32;
495 break;
496 case nir_op_flt:
497 dst[0] = ir3_CMPS_F(b, src[0], 0, src[1], 0);
498 dst[0]->cat2.condition = IR3_COND_LT;
499 break;
500 case nir_op_fge:
501 dst[0] = ir3_CMPS_F(b, src[0], 0, src[1], 0);
502 dst[0]->cat2.condition = IR3_COND_GE;
503 break;
504 case nir_op_feq:
505 dst[0] = ir3_CMPS_F(b, src[0], 0, src[1], 0);
506 dst[0]->cat2.condition = IR3_COND_EQ;
507 break;
508 case nir_op_fneu:
509 dst[0] = ir3_CMPS_F(b, src[0], 0, src[1], 0);
510 dst[0]->cat2.condition = IR3_COND_NE;
511 break;
512 case nir_op_fceil:
513 dst[0] = ir3_CEIL_F(b, src[0], 0);
514 break;
515 case nir_op_ffloor:
516 dst[0] = ir3_FLOOR_F(b, src[0], 0);
517 break;
518 case nir_op_ftrunc:
519 dst[0] = ir3_TRUNC_F(b, src[0], 0);
520 break;
521 case nir_op_fround_even:
522 dst[0] = ir3_RNDNE_F(b, src[0], 0);
523 break;
524 case nir_op_fsign:
525 dst[0] = ir3_SIGN_F(b, src[0], 0);
526 break;
527
528 case nir_op_fsin:
529 dst[0] = ir3_SIN(b, src[0], 0);
530 break;
531 case nir_op_fcos:
532 dst[0] = ir3_COS(b, src[0], 0);
533 break;
534 case nir_op_frsq:
535 dst[0] = ir3_RSQ(b, src[0], 0);
536 break;
537 case nir_op_frcp:
538 dst[0] = ir3_RCP(b, src[0], 0);
539 break;
540 case nir_op_flog2:
541 dst[0] = ir3_LOG2(b, src[0], 0);
542 break;
543 case nir_op_fexp2:
544 dst[0] = ir3_EXP2(b, src[0], 0);
545 break;
546 case nir_op_fsqrt:
547 dst[0] = ir3_SQRT(b, src[0], 0);
548 break;
549
550 case nir_op_iabs:
551 dst[0] = ir3_ABSNEG_S(b, src[0], IR3_REG_SABS);
552 break;
553 case nir_op_iadd:
554 dst[0] = ir3_ADD_U(b, src[0], 0, src[1], 0);
555 break;
556 case nir_op_iand:
557 dst[0] = ir3_AND_B(b, src[0], 0, src[1], 0);
558 break;
559 case nir_op_imax:
560 dst[0] = ir3_MAX_S(b, src[0], 0, src[1], 0);
561 break;
562 case nir_op_umax:
563 dst[0] = ir3_MAX_U(b, src[0], 0, src[1], 0);
564 break;
565 case nir_op_imin:
566 dst[0] = ir3_MIN_S(b, src[0], 0, src[1], 0);
567 break;
568 case nir_op_umin:
569 dst[0] = ir3_MIN_U(b, src[0], 0, src[1], 0);
570 break;
571 case nir_op_umul_low:
572 dst[0] = ir3_MULL_U(b, src[0], 0, src[1], 0);
573 break;
574 case nir_op_imadsh_mix16:
575 dst[0] = ir3_MADSH_M16(b, src[0], 0, src[1], 0, src[2], 0);
576 break;
577 case nir_op_imad24_ir3:
578 dst[0] = ir3_MAD_S24(b, src[0], 0, src[1], 0, src[2], 0);
579 break;
580 case nir_op_imul:
581 compile_assert(ctx, nir_dest_bit_size(alu->dest.dest) == 16);
582 dst[0] = ir3_MUL_S24(b, src[0], 0, src[1], 0);
583 break;
584 case nir_op_imul24:
585 dst[0] = ir3_MUL_S24(b, src[0], 0, src[1], 0);
586 break;
587 case nir_op_ineg:
588 dst[0] = ir3_ABSNEG_S(b, src[0], IR3_REG_SNEG);
589 break;
590 case nir_op_inot:
591 if (bs[0] == 1) {
592 dst[0] = ir3_SUB_U(b, create_immed(ctx->block, 1), 0, src[0], 0);
593 } else {
594 dst[0] = ir3_NOT_B(b, src[0], 0);
595 }
596 break;
597 case nir_op_ior:
598 dst[0] = ir3_OR_B(b, src[0], 0, src[1], 0);
599 break;
600 case nir_op_ishl:
601 dst[0] =
602 ir3_SHL_B(b, src[0], 0, resize_shift_amount(ctx, src[1], bs[0]), 0);
603 break;
604 case nir_op_ishr:
605 dst[0] =
606 ir3_ASHR_B(b, src[0], 0, resize_shift_amount(ctx, src[1], bs[0]), 0);
607 break;
608 case nir_op_isub:
609 dst[0] = ir3_SUB_U(b, src[0], 0, src[1], 0);
610 break;
611 case nir_op_ixor:
612 dst[0] = ir3_XOR_B(b, src[0], 0, src[1], 0);
613 break;
614 case nir_op_ushr:
615 dst[0] =
616 ir3_SHR_B(b, src[0], 0, resize_shift_amount(ctx, src[1], bs[0]), 0);
617 break;
618 case nir_op_ilt:
619 dst[0] = ir3_CMPS_S(b, src[0], 0, src[1], 0);
620 dst[0]->cat2.condition = IR3_COND_LT;
621 break;
622 case nir_op_ige:
623 dst[0] = ir3_CMPS_S(b, src[0], 0, src[1], 0);
624 dst[0]->cat2.condition = IR3_COND_GE;
625 break;
626 case nir_op_ieq:
627 dst[0] = ir3_CMPS_S(b, src[0], 0, src[1], 0);
628 dst[0]->cat2.condition = IR3_COND_EQ;
629 break;
630 case nir_op_ine:
631 dst[0] = ir3_CMPS_S(b, src[0], 0, src[1], 0);
632 dst[0]->cat2.condition = IR3_COND_NE;
633 break;
634 case nir_op_ult:
635 dst[0] = ir3_CMPS_U(b, src[0], 0, src[1], 0);
636 dst[0]->cat2.condition = IR3_COND_LT;
637 break;
638 case nir_op_uge:
639 dst[0] = ir3_CMPS_U(b, src[0], 0, src[1], 0);
640 dst[0]->cat2.condition = IR3_COND_GE;
641 break;
642
643 case nir_op_bcsel: {
644 struct ir3_instruction *cond = src[0];
645
646 /* If src[0] is a negation (likely as a result of an ir3_b2n(cond)),
647 * we can ignore that and use original cond, since the nonzero-ness of
648 * cond stays the same.
649 */
650 if (cond->opc == OPC_ABSNEG_S && cond->flags == 0 &&
651 (cond->srcs[0]->flags & (IR3_REG_SNEG | IR3_REG_SABS)) ==
652 IR3_REG_SNEG) {
653 cond = cond->srcs[0]->def->instr;
654 }
655
656 compile_assert(ctx, bs[1] == bs[2]);
657 /* The condition's size has to match the other two arguments' size, so
658 * convert down if necessary.
659 */
660 if (bs[1] == 16) {
661 struct hash_entry *prev_entry =
662 _mesa_hash_table_search(ctx->sel_cond_conversions, src[0]);
663 if (prev_entry) {
664 cond = prev_entry->data;
665 } else {
666 cond = ir3_COV(b, cond, TYPE_U32, TYPE_U16);
667 _mesa_hash_table_insert(ctx->sel_cond_conversions, src[0], cond);
668 }
669 }
670
671 if (bs[1] != 16)
672 dst[0] = ir3_SEL_B32(b, src[1], 0, cond, 0, src[2], 0);
673 else
674 dst[0] = ir3_SEL_B16(b, src[1], 0, cond, 0, src[2], 0);
675 break;
676 }
677 case nir_op_bit_count: {
678 // TODO, we need to do this 16b at a time on a5xx+a6xx.. need to
679 // double check on earlier gen's. Once half-precision support is
680 // in place, this should probably move to a NIR lowering pass:
681 struct ir3_instruction *hi, *lo;
682
683 hi = ir3_COV(b, ir3_SHR_B(b, src[0], 0, create_immed(b, 16), 0), TYPE_U32,
684 TYPE_U16);
685 lo = ir3_COV(b, src[0], TYPE_U32, TYPE_U16);
686
687 hi = ir3_CBITS_B(b, hi, 0);
688 lo = ir3_CBITS_B(b, lo, 0);
689
690 // TODO maybe the builders should default to making dst half-precision
691 // if the src's were half precision, to make this less awkward.. otoh
692 // we should probably just do this lowering in NIR.
693 hi->dsts[0]->flags |= IR3_REG_HALF;
694 lo->dsts[0]->flags |= IR3_REG_HALF;
695
696 dst[0] = ir3_ADD_S(b, hi, 0, lo, 0);
697 dst[0]->dsts[0]->flags |= IR3_REG_HALF;
698 dst[0] = ir3_COV(b, dst[0], TYPE_U16, TYPE_U32);
699 break;
700 }
701 case nir_op_ifind_msb: {
702 struct ir3_instruction *cmp;
703 dst[0] = ir3_CLZ_S(b, src[0], 0);
704 cmp = ir3_CMPS_S(b, dst[0], 0, create_immed(b, 0), 0);
705 cmp->cat2.condition = IR3_COND_GE;
706 dst[0] = ir3_SEL_B32(b, ir3_SUB_U(b, create_immed(b, 31), 0, dst[0], 0),
707 0, cmp, 0, dst[0], 0);
708 break;
709 }
710 case nir_op_ufind_msb:
711 dst[0] = ir3_CLZ_B(b, src[0], 0);
712 dst[0] = ir3_SEL_B32(b, ir3_SUB_U(b, create_immed(b, 31), 0, dst[0], 0),
713 0, src[0], 0, dst[0], 0);
714 break;
715 case nir_op_find_lsb:
716 dst[0] = ir3_BFREV_B(b, src[0], 0);
717 dst[0] = ir3_CLZ_B(b, dst[0], 0);
718 break;
719 case nir_op_bitfield_reverse:
720 dst[0] = ir3_BFREV_B(b, src[0], 0);
721 break;
722
723 default:
724 ir3_context_error(ctx, "Unhandled ALU op: %s\n",
725 nir_op_infos[alu->op].name);
726 break;
727 }
728
729 if (nir_alu_type_get_base_type(info->output_type) == nir_type_bool) {
730 assert(nir_dest_bit_size(alu->dest.dest) == 1 || alu->op == nir_op_b2b32);
731 assert(dst_sz == 1);
732 } else {
733 /* 1-bit values stored in 32-bit registers are only valid for certain
734 * ALU ops.
735 */
736 switch (alu->op) {
737 case nir_op_iand:
738 case nir_op_ior:
739 case nir_op_ixor:
740 case nir_op_inot:
741 case nir_op_bcsel:
742 break;
743 default:
744 compile_assert(ctx, nir_dest_bit_size(alu->dest.dest) != 1);
745 }
746 }
747
748 ir3_put_dst(ctx, &alu->dest.dest);
749 }
750
751 static void
emit_intrinsic_load_ubo_ldc(struct ir3_context * ctx,nir_intrinsic_instr * intr,struct ir3_instruction ** dst)752 emit_intrinsic_load_ubo_ldc(struct ir3_context *ctx, nir_intrinsic_instr *intr,
753 struct ir3_instruction **dst)
754 {
755 struct ir3_block *b = ctx->block;
756
757 unsigned ncomp = intr->num_components;
758 struct ir3_instruction *offset = ir3_get_src(ctx, &intr->src[1])[0];
759 struct ir3_instruction *idx = ir3_get_src(ctx, &intr->src[0])[0];
760 struct ir3_instruction *ldc = ir3_LDC(b, idx, 0, offset, 0);
761 ldc->dsts[0]->wrmask = MASK(ncomp);
762 ldc->cat6.iim_val = ncomp;
763 ldc->cat6.d = nir_intrinsic_component(intr);
764 ldc->cat6.type = TYPE_U32;
765
766 ir3_handle_bindless_cat6(ldc, intr->src[0]);
767 if (ldc->flags & IR3_INSTR_B)
768 ctx->so->bindless_ubo = true;
769 ir3_handle_nonuniform(ldc, intr);
770
771 ir3_split_dest(b, dst, ldc, 0, ncomp);
772 }
773
774 /* handles direct/indirect UBO reads: */
775 static void
emit_intrinsic_load_ubo(struct ir3_context * ctx,nir_intrinsic_instr * intr,struct ir3_instruction ** dst)776 emit_intrinsic_load_ubo(struct ir3_context *ctx, nir_intrinsic_instr *intr,
777 struct ir3_instruction **dst)
778 {
779 struct ir3_block *b = ctx->block;
780 struct ir3_instruction *base_lo, *base_hi, *addr, *src0, *src1;
781 const struct ir3_const_state *const_state = ir3_const_state(ctx->so);
782 unsigned ubo = regid(const_state->offsets.ubo, 0);
783 const unsigned ptrsz = ir3_pointer_size(ctx->compiler);
784
785 int off = 0;
786
787 /* First src is ubo index, which could either be an immed or not: */
788 src0 = ir3_get_src(ctx, &intr->src[0])[0];
789 if (is_same_type_mov(src0) && (src0->srcs[0]->flags & IR3_REG_IMMED)) {
790 base_lo = create_uniform(b, ubo + (src0->srcs[0]->iim_val * ptrsz));
791 base_hi = create_uniform(b, ubo + (src0->srcs[0]->iim_val * ptrsz) + 1);
792 } else {
793 base_lo = create_uniform_indirect(b, ubo, TYPE_U32,
794 ir3_get_addr0(ctx, src0, ptrsz));
795 base_hi = create_uniform_indirect(b, ubo + 1, TYPE_U32,
796 ir3_get_addr0(ctx, src0, ptrsz));
797
798 /* NOTE: since relative addressing is used, make sure constlen is
799 * at least big enough to cover all the UBO addresses, since the
800 * assembler won't know what the max address reg is.
801 */
802 ctx->so->constlen =
803 MAX2(ctx->so->constlen,
804 const_state->offsets.ubo + (ctx->s->info.num_ubos * ptrsz));
805 }
806
807 /* note: on 32bit gpu's base_hi is ignored and DCE'd */
808 addr = base_lo;
809
810 if (nir_src_is_const(intr->src[1])) {
811 off += nir_src_as_uint(intr->src[1]);
812 } else {
813 /* For load_ubo_indirect, second src is indirect offset: */
814 src1 = ir3_get_src(ctx, &intr->src[1])[0];
815
816 /* and add offset to addr: */
817 addr = ir3_ADD_S(b, addr, 0, src1, 0);
818 }
819
820 /* if offset is to large to encode in the ldg, split it out: */
821 if ((off + (intr->num_components * 4)) > 1024) {
822 /* split out the minimal amount to improve the odds that
823 * cp can fit the immediate in the add.s instruction:
824 */
825 unsigned off2 = off + (intr->num_components * 4) - 1024;
826 addr = ir3_ADD_S(b, addr, 0, create_immed(b, off2), 0);
827 off -= off2;
828 }
829
830 if (ptrsz == 2) {
831 struct ir3_instruction *carry;
832
833 /* handle 32b rollover, ie:
834 * if (addr < base_lo)
835 * base_hi++
836 */
837 carry = ir3_CMPS_U(b, addr, 0, base_lo, 0);
838 carry->cat2.condition = IR3_COND_LT;
839 base_hi = ir3_ADD_S(b, base_hi, 0, carry, 0);
840
841 addr = ir3_collect(b, addr, base_hi);
842 }
843
844 for (int i = 0; i < intr->num_components; i++) {
845 struct ir3_instruction *load =
846 ir3_LDG(b, addr, 0, create_immed(b, off + i * 4), 0,
847 create_immed(b, 1), 0); /* num components */
848 load->cat6.type = TYPE_U32;
849 dst[i] = load;
850 }
851 }
852
853 /* src[] = { block_index } */
854 static void
emit_intrinsic_ssbo_size(struct ir3_context * ctx,nir_intrinsic_instr * intr,struct ir3_instruction ** dst)855 emit_intrinsic_ssbo_size(struct ir3_context *ctx, nir_intrinsic_instr *intr,
856 struct ir3_instruction **dst)
857 {
858 struct ir3_block *b = ctx->block;
859 struct ir3_instruction *ibo = ir3_ssbo_to_ibo(ctx, intr->src[0]);
860 struct ir3_instruction *resinfo = ir3_RESINFO(b, ibo, 0);
861 resinfo->cat6.iim_val = 1;
862 resinfo->cat6.d = ctx->compiler->gen >= 6 ? 1 : 2;
863 resinfo->cat6.type = TYPE_U32;
864 resinfo->cat6.typed = false;
865 /* resinfo has no writemask and always writes out 3 components */
866 resinfo->dsts[0]->wrmask = MASK(3);
867 ir3_handle_bindless_cat6(resinfo, intr->src[0]);
868 ir3_handle_nonuniform(resinfo, intr);
869
870 if (ctx->compiler->gen >= 6) {
871 ir3_split_dest(b, dst, resinfo, 0, 1);
872 } else {
873 /* On a5xx, resinfo returns the low 16 bits of ssbo size in .x and the high 16 bits in .y */
874 struct ir3_instruction *resinfo_dst[2];
875 ir3_split_dest(b, resinfo_dst, resinfo, 0, 2);
876 *dst = ir3_ADD_U(b, ir3_SHL_B(b, resinfo_dst[1], 0, create_immed(b, 16), 0), 0, resinfo_dst[0], 0);
877 }
878 }
879
880 /* src[] = { offset }. const_index[] = { base } */
881 static void
emit_intrinsic_load_shared(struct ir3_context * ctx,nir_intrinsic_instr * intr,struct ir3_instruction ** dst)882 emit_intrinsic_load_shared(struct ir3_context *ctx, nir_intrinsic_instr *intr,
883 struct ir3_instruction **dst)
884 {
885 struct ir3_block *b = ctx->block;
886 struct ir3_instruction *ldl, *offset;
887 unsigned base;
888
889 offset = ir3_get_src(ctx, &intr->src[0])[0];
890 base = nir_intrinsic_base(intr);
891
892 ldl = ir3_LDL(b, offset, 0, create_immed(b, base), 0,
893 create_immed(b, intr->num_components), 0);
894
895 ldl->cat6.type = utype_dst(intr->dest);
896 ldl->dsts[0]->wrmask = MASK(intr->num_components);
897
898 ldl->barrier_class = IR3_BARRIER_SHARED_R;
899 ldl->barrier_conflict = IR3_BARRIER_SHARED_W;
900
901 ir3_split_dest(b, dst, ldl, 0, intr->num_components);
902 }
903
904 /* src[] = { value, offset }. const_index[] = { base, write_mask } */
905 static void
emit_intrinsic_store_shared(struct ir3_context * ctx,nir_intrinsic_instr * intr)906 emit_intrinsic_store_shared(struct ir3_context *ctx, nir_intrinsic_instr *intr)
907 {
908 struct ir3_block *b = ctx->block;
909 struct ir3_instruction *stl, *offset;
910 struct ir3_instruction *const *value;
911 unsigned base, wrmask, ncomp;
912
913 value = ir3_get_src(ctx, &intr->src[0]);
914 offset = ir3_get_src(ctx, &intr->src[1])[0];
915
916 base = nir_intrinsic_base(intr);
917 wrmask = nir_intrinsic_write_mask(intr);
918 ncomp = ffs(~wrmask) - 1;
919
920 assert(wrmask == BITFIELD_MASK(intr->num_components));
921
922 stl = ir3_STL(b, offset, 0, ir3_create_collect(b, value, ncomp), 0,
923 create_immed(b, ncomp), 0);
924 stl->cat6.dst_offset = base;
925 stl->cat6.type = utype_src(intr->src[0]);
926 stl->barrier_class = IR3_BARRIER_SHARED_W;
927 stl->barrier_conflict = IR3_BARRIER_SHARED_R | IR3_BARRIER_SHARED_W;
928
929 array_insert(b, b->keeps, stl);
930 }
931
932 /* src[] = { offset }. const_index[] = { base } */
933 static void
emit_intrinsic_load_shared_ir3(struct ir3_context * ctx,nir_intrinsic_instr * intr,struct ir3_instruction ** dst)934 emit_intrinsic_load_shared_ir3(struct ir3_context *ctx,
935 nir_intrinsic_instr *intr,
936 struct ir3_instruction **dst)
937 {
938 struct ir3_block *b = ctx->block;
939 struct ir3_instruction *load, *offset;
940 unsigned base;
941
942 offset = ir3_get_src(ctx, &intr->src[0])[0];
943 base = nir_intrinsic_base(intr);
944
945 load = ir3_LDLW(b, offset, 0, create_immed(b, base), 0,
946 create_immed(b, intr->num_components), 0);
947
948 /* for a650, use LDL for tess ctrl inputs: */
949 if (ctx->so->type == MESA_SHADER_TESS_CTRL && ctx->compiler->tess_use_shared)
950 load->opc = OPC_LDL;
951
952 load->cat6.type = utype_dst(intr->dest);
953 load->dsts[0]->wrmask = MASK(intr->num_components);
954
955 load->barrier_class = IR3_BARRIER_SHARED_R;
956 load->barrier_conflict = IR3_BARRIER_SHARED_W;
957
958 ir3_split_dest(b, dst, load, 0, intr->num_components);
959 }
960
961 /* src[] = { value, offset }. const_index[] = { base } */
962 static void
emit_intrinsic_store_shared_ir3(struct ir3_context * ctx,nir_intrinsic_instr * intr)963 emit_intrinsic_store_shared_ir3(struct ir3_context *ctx,
964 nir_intrinsic_instr *intr)
965 {
966 struct ir3_block *b = ctx->block;
967 struct ir3_instruction *store, *offset;
968 struct ir3_instruction *const *value;
969
970 value = ir3_get_src(ctx, &intr->src[0]);
971 offset = ir3_get_src(ctx, &intr->src[1])[0];
972
973 store = ir3_STLW(b, offset, 0,
974 ir3_create_collect(b, value, intr->num_components), 0,
975 create_immed(b, intr->num_components), 0);
976
977 /* for a650, use STL for vertex outputs used by tess ctrl shader: */
978 if (ctx->so->type == MESA_SHADER_VERTEX && ctx->so->key.tessellation &&
979 ctx->compiler->tess_use_shared)
980 store->opc = OPC_STL;
981
982 store->cat6.dst_offset = nir_intrinsic_base(intr);
983 store->cat6.type = utype_src(intr->src[0]);
984 store->barrier_class = IR3_BARRIER_SHARED_W;
985 store->barrier_conflict = IR3_BARRIER_SHARED_R | IR3_BARRIER_SHARED_W;
986
987 array_insert(b, b->keeps, store);
988 }
989
990 /*
991 * CS shared variable atomic intrinsics
992 *
993 * All of the shared variable atomic memory operations read a value from
994 * memory, compute a new value using one of the operations below, write the
995 * new value to memory, and return the original value read.
996 *
997 * All operations take 2 sources except CompSwap that takes 3. These
998 * sources represent:
999 *
1000 * 0: The offset into the shared variable storage region that the atomic
1001 * operation will operate on.
1002 * 1: The data parameter to the atomic function (i.e. the value to add
1003 * in shared_atomic_add, etc).
1004 * 2: For CompSwap only: the second data parameter.
1005 */
1006 static struct ir3_instruction *
emit_intrinsic_atomic_shared(struct ir3_context * ctx,nir_intrinsic_instr * intr)1007 emit_intrinsic_atomic_shared(struct ir3_context *ctx, nir_intrinsic_instr *intr)
1008 {
1009 struct ir3_block *b = ctx->block;
1010 struct ir3_instruction *atomic, *src0, *src1;
1011 type_t type = TYPE_U32;
1012
1013 src0 = ir3_get_src(ctx, &intr->src[0])[0]; /* offset */
1014 src1 = ir3_get_src(ctx, &intr->src[1])[0]; /* value */
1015
1016 switch (intr->intrinsic) {
1017 case nir_intrinsic_shared_atomic_add:
1018 atomic = ir3_ATOMIC_ADD(b, src0, 0, src1, 0);
1019 break;
1020 case nir_intrinsic_shared_atomic_imin:
1021 atomic = ir3_ATOMIC_MIN(b, src0, 0, src1, 0);
1022 type = TYPE_S32;
1023 break;
1024 case nir_intrinsic_shared_atomic_umin:
1025 atomic = ir3_ATOMIC_MIN(b, src0, 0, src1, 0);
1026 break;
1027 case nir_intrinsic_shared_atomic_imax:
1028 atomic = ir3_ATOMIC_MAX(b, src0, 0, src1, 0);
1029 type = TYPE_S32;
1030 break;
1031 case nir_intrinsic_shared_atomic_umax:
1032 atomic = ir3_ATOMIC_MAX(b, src0, 0, src1, 0);
1033 break;
1034 case nir_intrinsic_shared_atomic_and:
1035 atomic = ir3_ATOMIC_AND(b, src0, 0, src1, 0);
1036 break;
1037 case nir_intrinsic_shared_atomic_or:
1038 atomic = ir3_ATOMIC_OR(b, src0, 0, src1, 0);
1039 break;
1040 case nir_intrinsic_shared_atomic_xor:
1041 atomic = ir3_ATOMIC_XOR(b, src0, 0, src1, 0);
1042 break;
1043 case nir_intrinsic_shared_atomic_exchange:
1044 atomic = ir3_ATOMIC_XCHG(b, src0, 0, src1, 0);
1045 break;
1046 case nir_intrinsic_shared_atomic_comp_swap:
1047 /* for cmpxchg, src1 is [ui]vec2(data, compare): */
1048 src1 = ir3_collect(b, ir3_get_src(ctx, &intr->src[2])[0], src1);
1049 atomic = ir3_ATOMIC_CMPXCHG(b, src0, 0, src1, 0);
1050 break;
1051 default:
1052 unreachable("boo");
1053 }
1054
1055 atomic->cat6.iim_val = 1;
1056 atomic->cat6.d = 1;
1057 atomic->cat6.type = type;
1058 atomic->barrier_class = IR3_BARRIER_SHARED_W;
1059 atomic->barrier_conflict = IR3_BARRIER_SHARED_R | IR3_BARRIER_SHARED_W;
1060
1061 /* even if nothing consume the result, we can't DCE the instruction: */
1062 array_insert(b, b->keeps, atomic);
1063
1064 return atomic;
1065 }
1066
1067 /* src[] = { offset }. */
1068 static void
emit_intrinsic_load_scratch(struct ir3_context * ctx,nir_intrinsic_instr * intr,struct ir3_instruction ** dst)1069 emit_intrinsic_load_scratch(struct ir3_context *ctx, nir_intrinsic_instr *intr,
1070 struct ir3_instruction **dst)
1071 {
1072 struct ir3_block *b = ctx->block;
1073 struct ir3_instruction *ldp, *offset;
1074
1075 offset = ir3_get_src(ctx, &intr->src[0])[0];
1076
1077 ldp = ir3_LDP(b, offset, 0, create_immed(b, 0), 0,
1078 create_immed(b, intr->num_components), 0);
1079
1080 ldp->cat6.type = utype_dst(intr->dest);
1081 ldp->dsts[0]->wrmask = MASK(intr->num_components);
1082
1083 ldp->barrier_class = IR3_BARRIER_PRIVATE_R;
1084 ldp->barrier_conflict = IR3_BARRIER_PRIVATE_W;
1085
1086 ir3_split_dest(b, dst, ldp, 0, intr->num_components);
1087 }
1088
1089 /* src[] = { value, offset }. const_index[] = { write_mask } */
1090 static void
emit_intrinsic_store_scratch(struct ir3_context * ctx,nir_intrinsic_instr * intr)1091 emit_intrinsic_store_scratch(struct ir3_context *ctx, nir_intrinsic_instr *intr)
1092 {
1093 struct ir3_block *b = ctx->block;
1094 struct ir3_instruction *stp, *offset;
1095 struct ir3_instruction *const *value;
1096 unsigned wrmask, ncomp;
1097
1098 value = ir3_get_src(ctx, &intr->src[0]);
1099 offset = ir3_get_src(ctx, &intr->src[1])[0];
1100
1101 wrmask = nir_intrinsic_write_mask(intr);
1102 ncomp = ffs(~wrmask) - 1;
1103
1104 assert(wrmask == BITFIELD_MASK(intr->num_components));
1105
1106 stp = ir3_STP(b, offset, 0, ir3_create_collect(b, value, ncomp), 0,
1107 create_immed(b, ncomp), 0);
1108 stp->cat6.dst_offset = 0;
1109 stp->cat6.type = utype_src(intr->src[0]);
1110 stp->barrier_class = IR3_BARRIER_PRIVATE_W;
1111 stp->barrier_conflict = IR3_BARRIER_PRIVATE_R | IR3_BARRIER_PRIVATE_W;
1112
1113 array_insert(b, b->keeps, stp);
1114 }
1115
1116 struct tex_src_info {
1117 /* For prefetch */
1118 unsigned tex_base, samp_base, tex_idx, samp_idx;
1119 /* For normal tex instructions */
1120 unsigned base, combined_idx, a1_val, flags;
1121 struct ir3_instruction *samp_tex;
1122 };
1123
1124 /* TODO handle actual indirect/dynamic case.. which is going to be weird
1125 * to handle with the image_mapping table..
1126 */
1127 static struct tex_src_info
get_image_samp_tex_src(struct ir3_context * ctx,nir_intrinsic_instr * intr)1128 get_image_samp_tex_src(struct ir3_context *ctx, nir_intrinsic_instr *intr)
1129 {
1130 struct ir3_block *b = ctx->block;
1131 struct tex_src_info info = {0};
1132 nir_intrinsic_instr *bindless_tex = ir3_bindless_resource(intr->src[0]);
1133 ctx->so->bindless_tex = true;
1134
1135 if (bindless_tex) {
1136 /* Bindless case */
1137 info.flags |= IR3_INSTR_B;
1138
1139 /* Gather information required to determine which encoding to
1140 * choose as well as for prefetch.
1141 */
1142 info.tex_base = nir_intrinsic_desc_set(bindless_tex);
1143 bool tex_const = nir_src_is_const(bindless_tex->src[0]);
1144 if (tex_const)
1145 info.tex_idx = nir_src_as_uint(bindless_tex->src[0]);
1146 info.samp_idx = 0;
1147
1148 /* Choose encoding. */
1149 if (tex_const && info.tex_idx < 256) {
1150 if (info.tex_idx < 16) {
1151 /* Everything fits within the instruction */
1152 info.base = info.tex_base;
1153 info.combined_idx = info.samp_idx | (info.tex_idx << 4);
1154 } else {
1155 info.base = info.tex_base;
1156 info.a1_val = info.tex_idx << 3;
1157 info.combined_idx = 0;
1158 info.flags |= IR3_INSTR_A1EN;
1159 }
1160 info.samp_tex = NULL;
1161 } else {
1162 info.flags |= IR3_INSTR_S2EN;
1163 info.base = info.tex_base;
1164
1165 /* Note: the indirect source is now a vec2 instead of hvec2 */
1166 struct ir3_instruction *texture, *sampler;
1167
1168 texture = ir3_get_src(ctx, &intr->src[0])[0];
1169 sampler = create_immed(b, 0);
1170 info.samp_tex = ir3_collect(b, texture, sampler);
1171 }
1172 } else {
1173 info.flags |= IR3_INSTR_S2EN;
1174 unsigned slot = nir_src_as_uint(intr->src[0]);
1175 unsigned tex_idx = ir3_image_to_tex(&ctx->so->image_mapping, slot);
1176 struct ir3_instruction *texture, *sampler;
1177
1178 texture = create_immed_typed(ctx->block, tex_idx, TYPE_U16);
1179 sampler = create_immed_typed(ctx->block, tex_idx, TYPE_U16);
1180
1181 info.samp_tex = ir3_collect(b, sampler, texture);
1182 }
1183
1184 return info;
1185 }
1186
1187 static struct ir3_instruction *
emit_sam(struct ir3_context * ctx,opc_t opc,struct tex_src_info info,type_t type,unsigned wrmask,struct ir3_instruction * src0,struct ir3_instruction * src1)1188 emit_sam(struct ir3_context *ctx, opc_t opc, struct tex_src_info info,
1189 type_t type, unsigned wrmask, struct ir3_instruction *src0,
1190 struct ir3_instruction *src1)
1191 {
1192 struct ir3_instruction *sam, *addr;
1193 if (info.flags & IR3_INSTR_A1EN) {
1194 addr = ir3_get_addr1(ctx, info.a1_val);
1195 }
1196 sam = ir3_SAM(ctx->block, opc, type, 0b1111, info.flags, info.samp_tex, src0,
1197 src1);
1198 if (info.flags & IR3_INSTR_A1EN) {
1199 ir3_instr_set_address(sam, addr);
1200 }
1201 if (info.flags & IR3_INSTR_B) {
1202 sam->cat5.tex_base = info.base;
1203 sam->cat5.samp = info.combined_idx;
1204 }
1205 return sam;
1206 }
1207
1208 /* src[] = { deref, coord, sample_index }. const_index[] = {} */
1209 static void
emit_intrinsic_load_image(struct ir3_context * ctx,nir_intrinsic_instr * intr,struct ir3_instruction ** dst)1210 emit_intrinsic_load_image(struct ir3_context *ctx, nir_intrinsic_instr *intr,
1211 struct ir3_instruction **dst)
1212 {
1213 /* Coherent accesses have to go directly to memory, rather than through
1214 * ISAM's texture cache (which isn't coherent with image stores).
1215 */
1216 if (nir_intrinsic_access(intr) & ACCESS_COHERENT && ctx->compiler->gen >= 5) {
1217 ctx->funcs->emit_intrinsic_load_image(ctx, intr, dst);
1218 return;
1219 }
1220
1221 struct ir3_block *b = ctx->block;
1222 struct tex_src_info info = get_image_samp_tex_src(ctx, intr);
1223 struct ir3_instruction *sam;
1224 struct ir3_instruction *const *src0 = ir3_get_src(ctx, &intr->src[1]);
1225 struct ir3_instruction *coords[4];
1226 unsigned flags, ncoords = ir3_get_image_coords(intr, &flags);
1227 type_t type = ir3_get_type_for_image_intrinsic(intr);
1228
1229 /* hmm, this seems a bit odd, but it is what blob does and (at least
1230 * a5xx) just faults on bogus addresses otherwise:
1231 */
1232 if (flags & IR3_INSTR_3D) {
1233 flags &= ~IR3_INSTR_3D;
1234 flags |= IR3_INSTR_A;
1235 }
1236 info.flags |= flags;
1237
1238 for (unsigned i = 0; i < ncoords; i++)
1239 coords[i] = src0[i];
1240
1241 if (ncoords == 1)
1242 coords[ncoords++] = create_immed(b, 0);
1243
1244 sam = emit_sam(ctx, OPC_ISAM, info, type, 0b1111,
1245 ir3_create_collect(b, coords, ncoords), NULL);
1246
1247 ir3_handle_nonuniform(sam, intr);
1248
1249 sam->barrier_class = IR3_BARRIER_IMAGE_R;
1250 sam->barrier_conflict = IR3_BARRIER_IMAGE_W;
1251
1252 ir3_split_dest(b, dst, sam, 0, 4);
1253 }
1254
1255 /* A4xx version of image_size, see ir3_a6xx.c for newer resinfo version. */
1256 void
emit_intrinsic_image_size_tex(struct ir3_context * ctx,nir_intrinsic_instr * intr,struct ir3_instruction ** dst)1257 emit_intrinsic_image_size_tex(struct ir3_context *ctx,
1258 nir_intrinsic_instr *intr,
1259 struct ir3_instruction **dst)
1260 {
1261 struct ir3_block *b = ctx->block;
1262 struct tex_src_info info = get_image_samp_tex_src(ctx, intr);
1263 struct ir3_instruction *sam, *lod;
1264 unsigned flags, ncoords = ir3_get_image_coords(intr, &flags);
1265 type_t dst_type = nir_dest_bit_size(intr->dest) == 16 ? TYPE_U16 : TYPE_U32;
1266
1267 info.flags |= flags;
1268 assert(nir_src_as_uint(intr->src[1]) == 0);
1269 lod = create_immed(b, 0);
1270 sam = emit_sam(ctx, OPC_GETSIZE, info, dst_type, 0b1111, lod, NULL);
1271
1272 /* Array size actually ends up in .w rather than .z. This doesn't
1273 * matter for miplevel 0, but for higher mips the value in z is
1274 * minified whereas w stays. Also, the value in TEX_CONST_3_DEPTH is
1275 * returned, which means that we have to add 1 to it for arrays for
1276 * a3xx.
1277 *
1278 * Note use a temporary dst and then copy, since the size of the dst
1279 * array that is passed in is based on nir's understanding of the
1280 * result size, not the hardware's
1281 */
1282 struct ir3_instruction *tmp[4];
1283
1284 ir3_split_dest(b, tmp, sam, 0, 4);
1285
1286 for (unsigned i = 0; i < ncoords; i++)
1287 dst[i] = tmp[i];
1288
1289 if (flags & IR3_INSTR_A) {
1290 if (ctx->compiler->levels_add_one) {
1291 dst[ncoords - 1] = ir3_ADD_U(b, tmp[3], 0, create_immed(b, 1), 0);
1292 } else {
1293 dst[ncoords - 1] = ir3_MOV(b, tmp[3], TYPE_U32);
1294 }
1295 }
1296 }
1297
1298 static void
emit_control_barrier(struct ir3_context * ctx)1299 emit_control_barrier(struct ir3_context *ctx)
1300 {
1301 /* Hull shaders dispatch 32 wide so an entire patch will always
1302 * fit in a single warp and execute in lock-step. Consequently,
1303 * we don't need to do anything for TCS barriers. Emitting
1304 * barrier instruction will deadlock.
1305 */
1306 if (ctx->so->type == MESA_SHADER_TESS_CTRL)
1307 return;
1308
1309 struct ir3_block *b = ctx->block;
1310 struct ir3_instruction *barrier = ir3_BAR(b);
1311 barrier->cat7.g = true;
1312 if (ctx->compiler->gen < 6)
1313 barrier->cat7.l = true;
1314 barrier->flags = IR3_INSTR_SS | IR3_INSTR_SY;
1315 barrier->barrier_class = IR3_BARRIER_EVERYTHING;
1316 array_insert(b, b->keeps, barrier);
1317 }
1318
1319 static void
emit_intrinsic_barrier(struct ir3_context * ctx,nir_intrinsic_instr * intr)1320 emit_intrinsic_barrier(struct ir3_context *ctx, nir_intrinsic_instr *intr)
1321 {
1322 struct ir3_block *b = ctx->block;
1323 struct ir3_instruction *barrier;
1324
1325 /* TODO: find out why there is a major difference of .l usage
1326 * between a5xx and a6xx,
1327 */
1328
1329 switch (intr->intrinsic) {
1330 case nir_intrinsic_control_barrier:
1331 emit_control_barrier(ctx);
1332 return;
1333 case nir_intrinsic_scoped_barrier: {
1334 nir_scope exec_scope = nir_intrinsic_execution_scope(intr);
1335 nir_variable_mode modes = nir_intrinsic_memory_modes(intr);
1336
1337 if (ctx->so->type == MESA_SHADER_TESS_CTRL) {
1338 /* Remove mode corresponding to nir_intrinsic_memory_barrier_tcs_patch,
1339 * because hull shaders dispatch 32 wide so an entire patch will
1340 * always fit in a single warp and execute in lock-step.
1341 *
1342 * TODO: memory barrier also tells us not to reorder stores, this
1343 * information is lost here (backend doesn't reorder stores so we
1344 * are safe for now).
1345 */
1346 modes &= ~nir_var_shader_out;
1347 }
1348
1349 assert(!(modes & nir_var_shader_out));
1350
1351 if ((modes &
1352 (nir_var_mem_shared | nir_var_mem_ssbo | nir_var_mem_global))) {
1353 barrier = ir3_FENCE(b);
1354 barrier->cat7.r = true;
1355 barrier->cat7.w = true;
1356
1357 if (modes & (nir_var_mem_ssbo | nir_var_mem_global)) {
1358 barrier->cat7.g = true;
1359 }
1360
1361 if (ctx->compiler->gen >= 6) {
1362 if (modes & nir_var_mem_ssbo) {
1363 barrier->cat7.l = true;
1364 }
1365 } else {
1366 if (modes & (nir_var_mem_shared | nir_var_mem_ssbo)) {
1367 barrier->cat7.l = true;
1368 }
1369 }
1370
1371 barrier->barrier_class = 0;
1372 barrier->barrier_conflict = 0;
1373
1374 if (modes & nir_var_mem_shared) {
1375 barrier->barrier_class |= IR3_BARRIER_SHARED_W;
1376 barrier->barrier_conflict |=
1377 IR3_BARRIER_SHARED_R | IR3_BARRIER_SHARED_W;
1378 }
1379
1380 if (modes & (nir_var_mem_ssbo | nir_var_mem_global)) {
1381 barrier->barrier_class |= IR3_BARRIER_BUFFER_W;
1382 barrier->barrier_conflict |=
1383 IR3_BARRIER_BUFFER_R | IR3_BARRIER_BUFFER_W;
1384 }
1385
1386 /* TODO: check for image mode when it has a separate one */
1387 if (modes & nir_var_mem_ssbo) {
1388 barrier->barrier_class |= IR3_BARRIER_IMAGE_W;
1389 barrier->barrier_conflict |=
1390 IR3_BARRIER_IMAGE_W | IR3_BARRIER_IMAGE_R;
1391 }
1392 array_insert(b, b->keeps, barrier);
1393 }
1394
1395 if (exec_scope >= NIR_SCOPE_WORKGROUP) {
1396 emit_control_barrier(ctx);
1397 }
1398
1399 return;
1400 }
1401 case nir_intrinsic_memory_barrier_tcs_patch:
1402 /* Not applicable, see explanation for scoped_barrier + shader_out */
1403 return;
1404 case nir_intrinsic_memory_barrier_buffer:
1405 barrier = ir3_FENCE(b);
1406 barrier->cat7.g = true;
1407 if (ctx->compiler->gen >= 6)
1408 barrier->cat7.l = true;
1409 barrier->cat7.r = true;
1410 barrier->cat7.w = true;
1411 barrier->barrier_class = IR3_BARRIER_BUFFER_W;
1412 barrier->barrier_conflict = IR3_BARRIER_BUFFER_R | IR3_BARRIER_BUFFER_W;
1413 break;
1414 case nir_intrinsic_memory_barrier_image:
1415 barrier = ir3_FENCE(b);
1416 barrier->cat7.g = true;
1417 barrier->cat7.l = true;
1418 barrier->cat7.r = true;
1419 barrier->cat7.w = true;
1420 barrier->barrier_class = IR3_BARRIER_IMAGE_W;
1421 barrier->barrier_conflict = IR3_BARRIER_IMAGE_R | IR3_BARRIER_IMAGE_W;
1422 break;
1423 case nir_intrinsic_memory_barrier_shared:
1424 barrier = ir3_FENCE(b);
1425 if (ctx->compiler->gen < 6)
1426 barrier->cat7.l = true;
1427 barrier->cat7.r = true;
1428 barrier->cat7.w = true;
1429 barrier->barrier_class = IR3_BARRIER_SHARED_W;
1430 barrier->barrier_conflict = IR3_BARRIER_SHARED_R | IR3_BARRIER_SHARED_W;
1431 break;
1432 case nir_intrinsic_memory_barrier:
1433 case nir_intrinsic_group_memory_barrier:
1434 barrier = ir3_FENCE(b);
1435 barrier->cat7.g = true;
1436 barrier->cat7.l = true;
1437 barrier->cat7.r = true;
1438 barrier->cat7.w = true;
1439 barrier->barrier_class =
1440 IR3_BARRIER_SHARED_W | IR3_BARRIER_IMAGE_W | IR3_BARRIER_BUFFER_W;
1441 barrier->barrier_conflict = IR3_BARRIER_SHARED_R | IR3_BARRIER_SHARED_W |
1442 IR3_BARRIER_IMAGE_R | IR3_BARRIER_IMAGE_W |
1443 IR3_BARRIER_BUFFER_R | IR3_BARRIER_BUFFER_W;
1444 break;
1445 default:
1446 unreachable("boo");
1447 }
1448
1449 /* make sure barrier doesn't get DCE'd */
1450 array_insert(b, b->keeps, barrier);
1451 }
1452
1453 static void
add_sysval_input_compmask(struct ir3_context * ctx,gl_system_value slot,unsigned compmask,struct ir3_instruction * instr)1454 add_sysval_input_compmask(struct ir3_context *ctx, gl_system_value slot,
1455 unsigned compmask, struct ir3_instruction *instr)
1456 {
1457 struct ir3_shader_variant *so = ctx->so;
1458 unsigned n = so->inputs_count++;
1459
1460 assert(instr->opc == OPC_META_INPUT);
1461 instr->input.inidx = n;
1462 instr->input.sysval = slot;
1463
1464 so->inputs[n].sysval = true;
1465 so->inputs[n].slot = slot;
1466 so->inputs[n].compmask = compmask;
1467 so->total_in++;
1468
1469 so->sysval_in += util_last_bit(compmask);
1470 }
1471
1472 static struct ir3_instruction *
create_sysval_input(struct ir3_context * ctx,gl_system_value slot,unsigned compmask)1473 create_sysval_input(struct ir3_context *ctx, gl_system_value slot,
1474 unsigned compmask)
1475 {
1476 assert(compmask);
1477 struct ir3_instruction *sysval = create_input(ctx, compmask);
1478 add_sysval_input_compmask(ctx, slot, compmask, sysval);
1479 return sysval;
1480 }
1481
1482 static struct ir3_instruction *
get_barycentric(struct ir3_context * ctx,enum ir3_bary bary)1483 get_barycentric(struct ir3_context *ctx, enum ir3_bary bary)
1484 {
1485 static const gl_system_value sysval_base =
1486 SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL;
1487
1488 STATIC_ASSERT(sysval_base + IJ_PERSP_PIXEL ==
1489 SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL);
1490 STATIC_ASSERT(sysval_base + IJ_PERSP_SAMPLE ==
1491 SYSTEM_VALUE_BARYCENTRIC_PERSP_SAMPLE);
1492 STATIC_ASSERT(sysval_base + IJ_PERSP_CENTROID ==
1493 SYSTEM_VALUE_BARYCENTRIC_PERSP_CENTROID);
1494 STATIC_ASSERT(sysval_base + IJ_PERSP_SIZE ==
1495 SYSTEM_VALUE_BARYCENTRIC_PERSP_SIZE);
1496 STATIC_ASSERT(sysval_base + IJ_LINEAR_PIXEL ==
1497 SYSTEM_VALUE_BARYCENTRIC_LINEAR_PIXEL);
1498 STATIC_ASSERT(sysval_base + IJ_LINEAR_CENTROID ==
1499 SYSTEM_VALUE_BARYCENTRIC_LINEAR_CENTROID);
1500 STATIC_ASSERT(sysval_base + IJ_LINEAR_SAMPLE ==
1501 SYSTEM_VALUE_BARYCENTRIC_LINEAR_SAMPLE);
1502
1503 if (!ctx->ij[bary]) {
1504 struct ir3_instruction *xy[2];
1505 struct ir3_instruction *ij;
1506
1507 ij = create_sysval_input(ctx, sysval_base + bary, 0x3);
1508 ir3_split_dest(ctx->in_block, xy, ij, 0, 2);
1509
1510 ctx->ij[bary] = ir3_create_collect(ctx->in_block, xy, 2);
1511 }
1512
1513 return ctx->ij[bary];
1514 }
1515
1516 /* TODO: make this a common NIR helper?
1517 * there is a nir_system_value_from_intrinsic but it takes nir_intrinsic_op so
1518 * it can't be extended to work with this
1519 */
1520 static gl_system_value
nir_intrinsic_barycentric_sysval(nir_intrinsic_instr * intr)1521 nir_intrinsic_barycentric_sysval(nir_intrinsic_instr *intr)
1522 {
1523 enum glsl_interp_mode interp_mode = nir_intrinsic_interp_mode(intr);
1524 gl_system_value sysval;
1525
1526 switch (intr->intrinsic) {
1527 case nir_intrinsic_load_barycentric_pixel:
1528 if (interp_mode == INTERP_MODE_NOPERSPECTIVE)
1529 sysval = SYSTEM_VALUE_BARYCENTRIC_LINEAR_PIXEL;
1530 else
1531 sysval = SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL;
1532 break;
1533 case nir_intrinsic_load_barycentric_centroid:
1534 if (interp_mode == INTERP_MODE_NOPERSPECTIVE)
1535 sysval = SYSTEM_VALUE_BARYCENTRIC_LINEAR_CENTROID;
1536 else
1537 sysval = SYSTEM_VALUE_BARYCENTRIC_PERSP_CENTROID;
1538 break;
1539 case nir_intrinsic_load_barycentric_sample:
1540 if (interp_mode == INTERP_MODE_NOPERSPECTIVE)
1541 sysval = SYSTEM_VALUE_BARYCENTRIC_LINEAR_SAMPLE;
1542 else
1543 sysval = SYSTEM_VALUE_BARYCENTRIC_PERSP_SAMPLE;
1544 break;
1545 default:
1546 unreachable("invalid barycentric intrinsic");
1547 }
1548
1549 return sysval;
1550 }
1551
1552 static void
emit_intrinsic_barycentric(struct ir3_context * ctx,nir_intrinsic_instr * intr,struct ir3_instruction ** dst)1553 emit_intrinsic_barycentric(struct ir3_context *ctx, nir_intrinsic_instr *intr,
1554 struct ir3_instruction **dst)
1555 {
1556 gl_system_value sysval = nir_intrinsic_barycentric_sysval(intr);
1557
1558 if (!ctx->so->key.msaa) {
1559 switch (sysval) {
1560 case SYSTEM_VALUE_BARYCENTRIC_PERSP_SAMPLE:
1561 sysval = SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL;
1562 break;
1563 case SYSTEM_VALUE_BARYCENTRIC_PERSP_CENTROID:
1564 if (ctx->compiler->gen < 6)
1565 sysval = SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL;
1566 break;
1567 case SYSTEM_VALUE_BARYCENTRIC_LINEAR_SAMPLE:
1568 sysval = SYSTEM_VALUE_BARYCENTRIC_LINEAR_PIXEL;
1569 break;
1570 case SYSTEM_VALUE_BARYCENTRIC_LINEAR_CENTROID:
1571 if (ctx->compiler->gen < 6)
1572 sysval = SYSTEM_VALUE_BARYCENTRIC_LINEAR_PIXEL;
1573 break;
1574 default:
1575 break;
1576 }
1577 }
1578
1579 enum ir3_bary bary = sysval - SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL;
1580
1581 struct ir3_instruction *ij = get_barycentric(ctx, bary);
1582 ir3_split_dest(ctx->block, dst, ij, 0, 2);
1583 }
1584
1585 static struct ir3_instruction *
get_frag_coord(struct ir3_context * ctx,nir_intrinsic_instr * intr)1586 get_frag_coord(struct ir3_context *ctx, nir_intrinsic_instr *intr)
1587 {
1588 if (!ctx->frag_coord) {
1589 struct ir3_block *b = ctx->in_block;
1590 struct ir3_instruction *xyzw[4];
1591 struct ir3_instruction *hw_frag_coord;
1592
1593 hw_frag_coord = create_sysval_input(ctx, SYSTEM_VALUE_FRAG_COORD, 0xf);
1594 ir3_split_dest(b, xyzw, hw_frag_coord, 0, 4);
1595
1596 /* for frag_coord.xy, we get unsigned values.. we need
1597 * to subtract (integer) 8 and divide by 16 (right-
1598 * shift by 4) then convert to float:
1599 *
1600 * sub.s tmp, src, 8
1601 * shr.b tmp, tmp, 4
1602 * mov.u32f32 dst, tmp
1603 *
1604 */
1605 for (int i = 0; i < 2; i++) {
1606 xyzw[i] = ir3_COV(b, xyzw[i], TYPE_U32, TYPE_F32);
1607 xyzw[i] =
1608 ir3_MUL_F(b, xyzw[i], 0, create_immed(b, fui(1.0 / 16.0)), 0);
1609 }
1610
1611 ctx->frag_coord = ir3_create_collect(b, xyzw, 4);
1612 }
1613
1614 ctx->so->fragcoord_compmask |= nir_ssa_def_components_read(&intr->dest.ssa);
1615
1616 return ctx->frag_coord;
1617 }
1618
1619 static void setup_input(struct ir3_context *ctx, nir_intrinsic_instr *intr);
1620 static void setup_output(struct ir3_context *ctx, nir_intrinsic_instr *intr);
1621
1622 static void
emit_intrinsic(struct ir3_context * ctx,nir_intrinsic_instr * intr)1623 emit_intrinsic(struct ir3_context *ctx, nir_intrinsic_instr *intr)
1624 {
1625 const nir_intrinsic_info *info = &nir_intrinsic_infos[intr->intrinsic];
1626 struct ir3_instruction **dst;
1627 struct ir3_instruction *const *src;
1628 struct ir3_block *b = ctx->block;
1629 unsigned dest_components = nir_intrinsic_dest_components(intr);
1630 int idx;
1631
1632 if (info->has_dest) {
1633 dst = ir3_get_dst(ctx, &intr->dest, dest_components);
1634 } else {
1635 dst = NULL;
1636 }
1637
1638 const struct ir3_const_state *const_state = ir3_const_state(ctx->so);
1639 const unsigned primitive_param = const_state->offsets.primitive_param * 4;
1640 const unsigned primitive_map = const_state->offsets.primitive_map * 4;
1641
1642 switch (intr->intrinsic) {
1643 case nir_intrinsic_load_uniform:
1644 idx = nir_intrinsic_base(intr);
1645 if (nir_src_is_const(intr->src[0])) {
1646 idx += nir_src_as_uint(intr->src[0]);
1647 for (int i = 0; i < dest_components; i++) {
1648 dst[i] = create_uniform_typed(
1649 b, idx + i,
1650 nir_dest_bit_size(intr->dest) == 16 ? TYPE_F16 : TYPE_F32);
1651 }
1652 } else {
1653 src = ir3_get_src(ctx, &intr->src[0]);
1654 for (int i = 0; i < dest_components; i++) {
1655 dst[i] = create_uniform_indirect(
1656 b, idx + i,
1657 nir_dest_bit_size(intr->dest) == 16 ? TYPE_F16 : TYPE_F32,
1658 ir3_get_addr0(ctx, src[0], 1));
1659 }
1660 /* NOTE: if relative addressing is used, we set
1661 * constlen in the compiler (to worst-case value)
1662 * since we don't know in the assembler what the max
1663 * addr reg value can be:
1664 */
1665 ctx->so->constlen =
1666 MAX2(ctx->so->constlen, const_state->ubo_state.size / 16);
1667 }
1668 break;
1669
1670 case nir_intrinsic_load_vs_primitive_stride_ir3:
1671 dst[0] = create_uniform(b, primitive_param + 0);
1672 break;
1673 case nir_intrinsic_load_vs_vertex_stride_ir3:
1674 dst[0] = create_uniform(b, primitive_param + 1);
1675 break;
1676 case nir_intrinsic_load_hs_patch_stride_ir3:
1677 dst[0] = create_uniform(b, primitive_param + 2);
1678 break;
1679 case nir_intrinsic_load_patch_vertices_in:
1680 dst[0] = create_uniform(b, primitive_param + 3);
1681 break;
1682 case nir_intrinsic_load_tess_param_base_ir3:
1683 dst[0] = create_uniform(b, primitive_param + 4);
1684 dst[1] = create_uniform(b, primitive_param + 5);
1685 break;
1686 case nir_intrinsic_load_tess_factor_base_ir3:
1687 dst[0] = create_uniform(b, primitive_param + 6);
1688 dst[1] = create_uniform(b, primitive_param + 7);
1689 break;
1690
1691 case nir_intrinsic_load_primitive_location_ir3:
1692 idx = nir_intrinsic_driver_location(intr);
1693 dst[0] = create_uniform(b, primitive_map + idx);
1694 break;
1695
1696 case nir_intrinsic_load_gs_header_ir3:
1697 dst[0] = ctx->gs_header;
1698 break;
1699 case nir_intrinsic_load_tcs_header_ir3:
1700 dst[0] = ctx->tcs_header;
1701 break;
1702
1703 case nir_intrinsic_load_rel_patch_id_ir3:
1704 dst[0] = ctx->rel_patch_id;
1705 break;
1706
1707 case nir_intrinsic_load_primitive_id:
1708 if (!ctx->primitive_id) {
1709 ctx->primitive_id =
1710 create_sysval_input(ctx, SYSTEM_VALUE_PRIMITIVE_ID, 0x1);
1711 }
1712 dst[0] = ctx->primitive_id;
1713 break;
1714
1715 case nir_intrinsic_load_tess_coord:
1716 if (!ctx->tess_coord) {
1717 ctx->tess_coord =
1718 create_sysval_input(ctx, SYSTEM_VALUE_TESS_COORD, 0x3);
1719 }
1720 ir3_split_dest(b, dst, ctx->tess_coord, 0, 2);
1721
1722 /* Unused, but ir3_put_dst() below wants to free something */
1723 dst[2] = create_immed(b, 0);
1724 break;
1725
1726 case nir_intrinsic_end_patch_ir3:
1727 assert(ctx->so->type == MESA_SHADER_TESS_CTRL);
1728 struct ir3_instruction *end = ir3_PREDE(b);
1729 array_insert(b, b->keeps, end);
1730
1731 end->barrier_class = IR3_BARRIER_EVERYTHING;
1732 end->barrier_conflict = IR3_BARRIER_EVERYTHING;
1733 break;
1734
1735 case nir_intrinsic_store_global_ir3:
1736 ctx->funcs->emit_intrinsic_store_global_ir3(ctx, intr);
1737 break;
1738 case nir_intrinsic_load_global_ir3:
1739 ctx->funcs->emit_intrinsic_load_global_ir3(ctx, intr, dst);
1740 break;
1741
1742 case nir_intrinsic_load_ubo:
1743 emit_intrinsic_load_ubo(ctx, intr, dst);
1744 break;
1745 case nir_intrinsic_load_ubo_vec4:
1746 emit_intrinsic_load_ubo_ldc(ctx, intr, dst);
1747 break;
1748 case nir_intrinsic_load_frag_coord:
1749 ir3_split_dest(b, dst, get_frag_coord(ctx, intr), 0, 4);
1750 break;
1751 case nir_intrinsic_load_sample_pos_from_id: {
1752 /* NOTE: blob seems to always use TYPE_F16 and then cov.f16f32,
1753 * but that doesn't seem necessary.
1754 */
1755 struct ir3_instruction *offset =
1756 ir3_RGETPOS(b, ir3_get_src(ctx, &intr->src[0])[0], 0);
1757 offset->dsts[0]->wrmask = 0x3;
1758 offset->cat5.type = TYPE_F32;
1759
1760 ir3_split_dest(b, dst, offset, 0, 2);
1761
1762 break;
1763 }
1764 case nir_intrinsic_load_size_ir3:
1765 if (!ctx->ij[IJ_PERSP_SIZE]) {
1766 ctx->ij[IJ_PERSP_SIZE] =
1767 create_sysval_input(ctx, SYSTEM_VALUE_BARYCENTRIC_PERSP_SIZE, 0x1);
1768 }
1769 dst[0] = ctx->ij[IJ_PERSP_SIZE];
1770 break;
1771 case nir_intrinsic_load_barycentric_centroid:
1772 case nir_intrinsic_load_barycentric_sample:
1773 case nir_intrinsic_load_barycentric_pixel:
1774 emit_intrinsic_barycentric(ctx, intr, dst);
1775 break;
1776 case nir_intrinsic_load_interpolated_input:
1777 case nir_intrinsic_load_input:
1778 setup_input(ctx, intr);
1779 break;
1780 /* All SSBO intrinsics should have been lowered by the 'lower_io_offsets'
1781 * pass and replaced by an ir3-specific version that adds the
1782 * dword-offset in the last source.
1783 */
1784 case nir_intrinsic_load_ssbo_ir3:
1785 ctx->funcs->emit_intrinsic_load_ssbo(ctx, intr, dst);
1786 break;
1787 case nir_intrinsic_store_ssbo_ir3:
1788 if ((ctx->so->type == MESA_SHADER_FRAGMENT) &&
1789 !ctx->s->info.fs.early_fragment_tests)
1790 ctx->so->no_earlyz = true;
1791 ctx->funcs->emit_intrinsic_store_ssbo(ctx, intr);
1792 break;
1793 case nir_intrinsic_get_ssbo_size:
1794 emit_intrinsic_ssbo_size(ctx, intr, dst);
1795 break;
1796 case nir_intrinsic_ssbo_atomic_add_ir3:
1797 case nir_intrinsic_ssbo_atomic_imin_ir3:
1798 case nir_intrinsic_ssbo_atomic_umin_ir3:
1799 case nir_intrinsic_ssbo_atomic_imax_ir3:
1800 case nir_intrinsic_ssbo_atomic_umax_ir3:
1801 case nir_intrinsic_ssbo_atomic_and_ir3:
1802 case nir_intrinsic_ssbo_atomic_or_ir3:
1803 case nir_intrinsic_ssbo_atomic_xor_ir3:
1804 case nir_intrinsic_ssbo_atomic_exchange_ir3:
1805 case nir_intrinsic_ssbo_atomic_comp_swap_ir3:
1806 if ((ctx->so->type == MESA_SHADER_FRAGMENT) &&
1807 !ctx->s->info.fs.early_fragment_tests)
1808 ctx->so->no_earlyz = true;
1809 dst[0] = ctx->funcs->emit_intrinsic_atomic_ssbo(ctx, intr);
1810 break;
1811 case nir_intrinsic_load_shared:
1812 emit_intrinsic_load_shared(ctx, intr, dst);
1813 break;
1814 case nir_intrinsic_store_shared:
1815 emit_intrinsic_store_shared(ctx, intr);
1816 break;
1817 case nir_intrinsic_shared_atomic_add:
1818 case nir_intrinsic_shared_atomic_imin:
1819 case nir_intrinsic_shared_atomic_umin:
1820 case nir_intrinsic_shared_atomic_imax:
1821 case nir_intrinsic_shared_atomic_umax:
1822 case nir_intrinsic_shared_atomic_and:
1823 case nir_intrinsic_shared_atomic_or:
1824 case nir_intrinsic_shared_atomic_xor:
1825 case nir_intrinsic_shared_atomic_exchange:
1826 case nir_intrinsic_shared_atomic_comp_swap:
1827 dst[0] = emit_intrinsic_atomic_shared(ctx, intr);
1828 break;
1829 case nir_intrinsic_load_scratch:
1830 emit_intrinsic_load_scratch(ctx, intr, dst);
1831 break;
1832 case nir_intrinsic_store_scratch:
1833 emit_intrinsic_store_scratch(ctx, intr);
1834 break;
1835 case nir_intrinsic_image_load:
1836 emit_intrinsic_load_image(ctx, intr, dst);
1837 break;
1838 case nir_intrinsic_bindless_image_load:
1839 /* Bindless uses the IBO state, which doesn't have swizzle filled out,
1840 * so using isam doesn't work.
1841 *
1842 * TODO: can we use isam if we fill out more fields?
1843 */
1844 ctx->funcs->emit_intrinsic_load_image(ctx, intr, dst);
1845 break;
1846 case nir_intrinsic_image_store:
1847 case nir_intrinsic_bindless_image_store:
1848 if ((ctx->so->type == MESA_SHADER_FRAGMENT) &&
1849 !ctx->s->info.fs.early_fragment_tests)
1850 ctx->so->no_earlyz = true;
1851 ctx->funcs->emit_intrinsic_store_image(ctx, intr);
1852 break;
1853 case nir_intrinsic_image_size:
1854 case nir_intrinsic_bindless_image_size:
1855 ctx->funcs->emit_intrinsic_image_size(ctx, intr, dst);
1856 break;
1857 case nir_intrinsic_image_atomic_add:
1858 case nir_intrinsic_bindless_image_atomic_add:
1859 case nir_intrinsic_image_atomic_imin:
1860 case nir_intrinsic_bindless_image_atomic_imin:
1861 case nir_intrinsic_image_atomic_umin:
1862 case nir_intrinsic_bindless_image_atomic_umin:
1863 case nir_intrinsic_image_atomic_imax:
1864 case nir_intrinsic_bindless_image_atomic_imax:
1865 case nir_intrinsic_image_atomic_umax:
1866 case nir_intrinsic_bindless_image_atomic_umax:
1867 case nir_intrinsic_image_atomic_and:
1868 case nir_intrinsic_bindless_image_atomic_and:
1869 case nir_intrinsic_image_atomic_or:
1870 case nir_intrinsic_bindless_image_atomic_or:
1871 case nir_intrinsic_image_atomic_xor:
1872 case nir_intrinsic_bindless_image_atomic_xor:
1873 case nir_intrinsic_image_atomic_exchange:
1874 case nir_intrinsic_bindless_image_atomic_exchange:
1875 case nir_intrinsic_image_atomic_comp_swap:
1876 case nir_intrinsic_bindless_image_atomic_comp_swap:
1877 if ((ctx->so->type == MESA_SHADER_FRAGMENT) &&
1878 !ctx->s->info.fs.early_fragment_tests)
1879 ctx->so->no_earlyz = true;
1880 dst[0] = ctx->funcs->emit_intrinsic_atomic_image(ctx, intr);
1881 break;
1882 case nir_intrinsic_scoped_barrier:
1883 case nir_intrinsic_control_barrier:
1884 case nir_intrinsic_memory_barrier:
1885 case nir_intrinsic_group_memory_barrier:
1886 case nir_intrinsic_memory_barrier_buffer:
1887 case nir_intrinsic_memory_barrier_image:
1888 case nir_intrinsic_memory_barrier_shared:
1889 case nir_intrinsic_memory_barrier_tcs_patch:
1890 emit_intrinsic_barrier(ctx, intr);
1891 /* note that blk ptr no longer valid, make that obvious: */
1892 b = NULL;
1893 break;
1894 case nir_intrinsic_store_output:
1895 setup_output(ctx, intr);
1896 break;
1897 case nir_intrinsic_load_base_vertex:
1898 case nir_intrinsic_load_first_vertex:
1899 if (!ctx->basevertex) {
1900 ctx->basevertex = create_driver_param(ctx, IR3_DP_VTXID_BASE);
1901 }
1902 dst[0] = ctx->basevertex;
1903 break;
1904 case nir_intrinsic_load_draw_id:
1905 if (!ctx->draw_id) {
1906 ctx->draw_id = create_driver_param(ctx, IR3_DP_DRAWID);
1907 }
1908 dst[0] = ctx->draw_id;
1909 break;
1910 case nir_intrinsic_load_base_instance:
1911 if (!ctx->base_instance) {
1912 ctx->base_instance = create_driver_param(ctx, IR3_DP_INSTID_BASE);
1913 }
1914 dst[0] = ctx->base_instance;
1915 break;
1916 case nir_intrinsic_load_view_index:
1917 if (!ctx->view_index) {
1918 ctx->view_index =
1919 create_sysval_input(ctx, SYSTEM_VALUE_VIEW_INDEX, 0x1);
1920 }
1921 dst[0] = ctx->view_index;
1922 break;
1923 case nir_intrinsic_load_vertex_id_zero_base:
1924 case nir_intrinsic_load_vertex_id:
1925 if (!ctx->vertex_id) {
1926 gl_system_value sv = (intr->intrinsic == nir_intrinsic_load_vertex_id)
1927 ? SYSTEM_VALUE_VERTEX_ID
1928 : SYSTEM_VALUE_VERTEX_ID_ZERO_BASE;
1929 ctx->vertex_id = create_sysval_input(ctx, sv, 0x1);
1930 }
1931 dst[0] = ctx->vertex_id;
1932 break;
1933 case nir_intrinsic_load_instance_id:
1934 if (!ctx->instance_id) {
1935 ctx->instance_id =
1936 create_sysval_input(ctx, SYSTEM_VALUE_INSTANCE_ID, 0x1);
1937 }
1938 dst[0] = ctx->instance_id;
1939 break;
1940 case nir_intrinsic_load_sample_id:
1941 ctx->so->per_samp = true;
1942 FALLTHROUGH;
1943 case nir_intrinsic_load_sample_id_no_per_sample:
1944 if (!ctx->samp_id) {
1945 ctx->samp_id = create_sysval_input(ctx, SYSTEM_VALUE_SAMPLE_ID, 0x1);
1946 ctx->samp_id->dsts[0]->flags |= IR3_REG_HALF;
1947 }
1948 dst[0] = ir3_COV(b, ctx->samp_id, TYPE_U16, TYPE_U32);
1949 break;
1950 case nir_intrinsic_load_sample_mask_in:
1951 if (!ctx->samp_mask_in) {
1952 ctx->samp_mask_in =
1953 create_sysval_input(ctx, SYSTEM_VALUE_SAMPLE_MASK_IN, 0x1);
1954 }
1955 dst[0] = ctx->samp_mask_in;
1956 break;
1957 case nir_intrinsic_load_user_clip_plane:
1958 idx = nir_intrinsic_ucp_id(intr);
1959 for (int i = 0; i < dest_components; i++) {
1960 unsigned n = idx * 4 + i;
1961 dst[i] = create_driver_param(ctx, IR3_DP_UCP0_X + n);
1962 }
1963 break;
1964 case nir_intrinsic_load_front_face:
1965 if (!ctx->frag_face) {
1966 ctx->so->frag_face = true;
1967 ctx->frag_face =
1968 create_sysval_input(ctx, SYSTEM_VALUE_FRONT_FACE, 0x1);
1969 ctx->frag_face->dsts[0]->flags |= IR3_REG_HALF;
1970 }
1971 /* for fragface, we get -1 for back and 0 for front. However this is
1972 * the inverse of what nir expects (where ~0 is true).
1973 */
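/* so compare for equality with 0: front (0) -> true (~0), back (-1) -> false: */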
1974 dst[0] = ir3_CMPS_S(b, ctx->frag_face, 0,
1975 create_immed_typed(b, 0, TYPE_U16), 0);
1976 dst[0]->cat2.condition = IR3_COND_EQ;
1977 break;
1978 case nir_intrinsic_load_local_invocation_id:
1979 if (!ctx->local_invocation_id) {
1980 ctx->local_invocation_id =
1981 create_sysval_input(ctx, SYSTEM_VALUE_LOCAL_INVOCATION_ID, 0x7);
1982 }
1983 ir3_split_dest(b, dst, ctx->local_invocation_id, 0, 3);
1984 break;
1985 case nir_intrinsic_load_workgroup_id:
1986 case nir_intrinsic_load_workgroup_id_zero_base:
1987 if (!ctx->work_group_id) {
1988 ctx->work_group_id =
1989 create_sysval_input(ctx, SYSTEM_VALUE_WORKGROUP_ID, 0x7);
1990 ctx->work_group_id->dsts[0]->flags |= IR3_REG_SHARED;
1991 }
1992 ir3_split_dest(b, dst, ctx->work_group_id, 0, 3);
1993 break;
1994 case nir_intrinsic_load_base_workgroup_id:
1995 for (int i = 0; i < dest_components; i++) {
1996 dst[i] = create_driver_param(ctx, IR3_DP_BASE_GROUP_X + i);
1997 }
1998 break;
1999 case nir_intrinsic_load_num_workgroups:
2000 for (int i = 0; i < dest_components; i++) {
2001 dst[i] = create_driver_param(ctx, IR3_DP_NUM_WORK_GROUPS_X + i);
2002 }
2003 break;
2004 case nir_intrinsic_load_workgroup_size:
2005 for (int i = 0; i < dest_components; i++) {
2006 dst[i] = create_driver_param(ctx, IR3_DP_LOCAL_GROUP_SIZE_X + i);
2007 }
2008 break;
2009 case nir_intrinsic_load_subgroup_size:
2010 dst[0] = create_driver_param(ctx, IR3_DP_SUBGROUP_SIZE);
2011 break;
2012 case nir_intrinsic_load_subgroup_id_shift_ir3:
2013 dst[0] = create_driver_param(ctx, IR3_DP_SUBGROUP_ID_SHIFT);
2014 break;
2015 case nir_intrinsic_discard_if:
2016 case nir_intrinsic_discard:
2017 case nir_intrinsic_demote:
2018 case nir_intrinsic_demote_if:
2019 case nir_intrinsic_terminate:
2020 case nir_intrinsic_terminate_if: {
2021 struct ir3_instruction *cond, *kill;
2022
2023 if (intr->intrinsic == nir_intrinsic_discard_if ||
2024 intr->intrinsic == nir_intrinsic_demote_if ||
2025 intr->intrinsic == nir_intrinsic_terminate_if) {
2026 /* conditional discard: */
2027 src = ir3_get_src(ctx, &intr->src[0]);
2028 cond = src[0];
2029 } else {
2030 /* unconditional discard: */
2031 cond = create_immed(b, 1);
2032 }
2033
2034 /* NOTE: only cmps.*.* can write p0.x: */
2035 cond = ir3_CMPS_S(b, cond, 0, create_immed(b, 0), 0);
2036 cond->cat2.condition = IR3_COND_NE;
2037
2038 /* condition always goes in predicate register: */
2039 cond->dsts[0]->num = regid(REG_P0, 0);
2040 cond->dsts[0]->flags &= ~IR3_REG_SSA;
2041
2042 if (intr->intrinsic == nir_intrinsic_demote ||
2043 intr->intrinsic == nir_intrinsic_demote_if) {
2044 kill = ir3_DEMOTE(b, cond, 0);
2045 } else {
2046 kill = ir3_KILL(b, cond, 0);
2047 }
2048
2049 /* Side-effects should not be moved to the other side of the kill */
2050 kill->barrier_class = IR3_BARRIER_IMAGE_W | IR3_BARRIER_BUFFER_W;
2051 kill->barrier_conflict = IR3_BARRIER_IMAGE_W | IR3_BARRIER_BUFFER_W;
2052 kill->srcs[0]->num = regid(REG_P0, 0);
2053 array_insert(ctx->ir, ctx->ir->predicates, kill);
2054
2055 array_insert(b, b->keeps, kill);
2056 ctx->so->has_kill = true;
2057
2058 break;
2059 }
2060
2061 case nir_intrinsic_cond_end_ir3: {
2062 struct ir3_instruction *cond, *kill;
2063
2064 src = ir3_get_src(ctx, &intr->src[0]);
2065 cond = src[0];
2066
2067 /* NOTE: only cmps.*.* can write p0.x: */
2068 cond = ir3_CMPS_S(b, cond, 0, create_immed(b, 0), 0);
2069 cond->cat2.condition = IR3_COND_NE;
2070
2071 /* condition always goes in predicate register: */
2072 cond->dsts[0]->num = regid(REG_P0, 0);
2073
2074 kill = ir3_PREDT(b, cond, 0);
2075
2076 kill->barrier_class = IR3_BARRIER_EVERYTHING;
2077 kill->barrier_conflict = IR3_BARRIER_EVERYTHING;
2078
2079 array_insert(ctx->ir, ctx->ir->predicates, kill);
2080 array_insert(b, b->keeps, kill);
2081 break;
2082 }
2083
2084 case nir_intrinsic_vote_any:
2085 case nir_intrinsic_vote_all: {
2086 struct ir3_instruction *src = ir3_get_src(ctx, &intr->src[0])[0];
2087 struct ir3_instruction *pred = ir3_get_predicate(ctx, src);
2088 if (intr->intrinsic == nir_intrinsic_vote_any)
2089 dst[0] = ir3_ANY_MACRO(ctx->block, pred, 0);
2090 else
2091 dst[0] = ir3_ALL_MACRO(ctx->block, pred, 0);
2092 dst[0]->srcs[0]->num = regid(REG_P0, 0);
2093 array_insert(ctx->ir, ctx->ir->predicates, dst[0]);
2094 break;
2095 }
2096 case nir_intrinsic_elect:
2097 dst[0] = ir3_ELECT_MACRO(ctx->block);
2098 /* This may expand to a divergent if/then, so allocate stack space for
2099 * it.
2100 */
2101 ctx->max_stack = MAX2(ctx->max_stack, ctx->stack + 1);
2102 break;
2103
2104 case nir_intrinsic_read_invocation_cond_ir3: {
2105 struct ir3_instruction *src = ir3_get_src(ctx, &intr->src[0])[0];
2106 struct ir3_instruction *cond = ir3_get_src(ctx, &intr->src[1])[0];
2107 dst[0] = ir3_READ_COND_MACRO(ctx->block, ir3_get_predicate(ctx, cond), 0,
2108 src, 0);
2109 dst[0]->dsts[0]->flags |= IR3_REG_SHARED;
2110 dst[0]->srcs[0]->num = regid(REG_P0, 0);
2111 array_insert(ctx->ir, ctx->ir->predicates, dst[0]);
2112 ctx->max_stack = MAX2(ctx->max_stack, ctx->stack + 1);
2113 break;
2114 }
2115
2116 case nir_intrinsic_read_first_invocation: {
2117 struct ir3_instruction *src = ir3_get_src(ctx, &intr->src[0])[0];
2118 dst[0] = ir3_READ_FIRST_MACRO(ctx->block, src, 0);
2119 dst[0]->dsts[0]->flags |= IR3_REG_SHARED;
2120 ctx->max_stack = MAX2(ctx->max_stack, ctx->stack + 1);
2121 break;
2122 }
2123
2124 case nir_intrinsic_ballot: {
2125 struct ir3_instruction *ballot;
2126 unsigned components = intr->dest.ssa.num_components;
2127 if (nir_src_is_const(intr->src[0]) && nir_src_as_bool(intr->src[0])) {
2128 /* ballot(true) is just MOVMSK */
2129 ballot = ir3_MOVMSK(ctx->block, components);
2130 } else {
2131 struct ir3_instruction *src = ir3_get_src(ctx, &intr->src[0])[0];
2132 struct ir3_instruction *pred = ir3_get_predicate(ctx, src);
2133 ballot = ir3_BALLOT_MACRO(ctx->block, pred, components);
2134 ballot->srcs[0]->num = regid(REG_P0, 0);
2135 array_insert(ctx->ir, ctx->ir->predicates, ballot);
2136 ctx->max_stack = MAX2(ctx->max_stack, ctx->stack + 1);
2137 }
2138 ir3_split_dest(ctx->block, dst, ballot, 0, components);
2139 break;
2140 }
2141
2142 case nir_intrinsic_load_shared_ir3:
2143 emit_intrinsic_load_shared_ir3(ctx, intr, dst);
2144 break;
2145 case nir_intrinsic_store_shared_ir3:
2146 emit_intrinsic_store_shared_ir3(ctx, intr);
2147 break;
2148 case nir_intrinsic_bindless_resource_ir3:
2149 dst[0] = ir3_get_src(ctx, &intr->src[0])[0];
2150 break;
2151 default:
2152 ir3_context_error(ctx, "Unhandled intrinsic type: %s\n",
2153 nir_intrinsic_infos[intr->intrinsic].name);
2154 break;
2155 }
2156
2157 if (info->has_dest)
2158 ir3_put_dst(ctx, &intr->dest);
2159 }
2160
2161 static void
2162 emit_load_const(struct ir3_context *ctx, nir_load_const_instr *instr)
2163 {
2164 struct ir3_instruction **dst =
2165 ir3_get_dst_ssa(ctx, &instr->def, instr->def.num_components);
2166
2167 if (instr->def.bit_size == 16) {
2168 for (int i = 0; i < instr->def.num_components; i++)
2169 dst[i] = create_immed_typed(ctx->block, instr->value[i].u16, TYPE_U16);
2170 } else {
2171 for (int i = 0; i < instr->def.num_components; i++)
2172 dst[i] = create_immed_typed(ctx->block, instr->value[i].u32, TYPE_U32);
2173 }
2174 }
2175
2176 static void
2177 emit_undef(struct ir3_context *ctx, nir_ssa_undef_instr *undef)
2178 {
2179 struct ir3_instruction **dst =
2180 ir3_get_dst_ssa(ctx, &undef->def, undef->def.num_components);
2181 type_t type = (undef->def.bit_size == 16) ? TYPE_U16 : TYPE_U32;
2182
2183 /* backend doesn't want undefined instructions, so just plug
2184 * in 0.0..
2185 */
2186 for (int i = 0; i < undef->def.num_components; i++)
2187 dst[i] = create_immed_typed(ctx->block, fui(0.0), type);
2188 }
2189
2190 /*
2191 * texture fetch/sample instructions:
2192 */
2193
2194 static type_t
2195 get_tex_dest_type(nir_tex_instr *tex)
2196 {
2197 type_t type;
2198
2199 switch (tex->dest_type) {
2200 case nir_type_float32:
2201 return TYPE_F32;
2202 case nir_type_float16:
2203 return TYPE_F16;
2204 case nir_type_int32:
2205 return TYPE_S32;
2206 case nir_type_int16:
2207 return TYPE_S16;
2208 case nir_type_bool32:
2209 case nir_type_uint32:
2210 return TYPE_U32;
2211 case nir_type_bool16:
2212 case nir_type_uint16:
2213 return TYPE_U16;
2214 case nir_type_invalid:
2215 default:
2216 unreachable("bad dest_type");
2217 }
2218
2219 return type;
2220 }
2221
2222 static void
2223 tex_info(nir_tex_instr *tex, unsigned *flagsp, unsigned *coordsp)
2224 {
2225 unsigned coords =
2226 glsl_get_sampler_dim_coordinate_components(tex->sampler_dim);
2227 unsigned flags = 0;
2228
2229 /* note: we would use tex->coord_components, except for txs; also,
2230 * since the array index goes after the shadow ref, we don't want to
2231 * count it:
2232 */
2233 if (coords == 3)
2234 flags |= IR3_INSTR_3D;
2235
2236 if (tex->is_shadow && tex->op != nir_texop_lod)
2237 flags |= IR3_INSTR_S;
2238
2239 if (tex->is_array && tex->op != nir_texop_lod)
2240 flags |= IR3_INSTR_A;
2241
2242 *flagsp = flags;
2243 *coordsp = coords;
2244 }
2245
2246 /* Gets the sampler/texture idx as a hvec2, which could either be dynamic
2247 * or immediate (in which case it will get lowered later to a non-.s2en
2248 * version of the tex instruction which encodes tex/samp as immediates):
2249 */
2250 static struct tex_src_info
2251 get_tex_samp_tex_src(struct ir3_context *ctx, nir_tex_instr *tex)
2252 {
2253 struct ir3_block *b = ctx->block;
2254 struct tex_src_info info = {0};
2255 int texture_idx = nir_tex_instr_src_index(tex, nir_tex_src_texture_handle);
2256 int sampler_idx = nir_tex_instr_src_index(tex, nir_tex_src_sampler_handle);
2257 struct ir3_instruction *texture, *sampler;
2258
2259 if (texture_idx >= 0 || sampler_idx >= 0) {
2260 /* Bindless case */
2261 info.flags |= IR3_INSTR_B;
2262
2263 if (tex->texture_non_uniform || tex->sampler_non_uniform)
2264 info.flags |= IR3_INSTR_NONUNIF;
2265
2266 /* Gather information required to determine which encoding to
2267 * choose as well as for prefetch.
2268 */
2269 nir_intrinsic_instr *bindless_tex = NULL;
2270 bool tex_const;
2271 if (texture_idx >= 0) {
2272 ctx->so->bindless_tex = true;
2273 bindless_tex = ir3_bindless_resource(tex->src[texture_idx].src);
2274 assert(bindless_tex);
2275 info.tex_base = nir_intrinsic_desc_set(bindless_tex);
2276 tex_const = nir_src_is_const(bindless_tex->src[0]);
2277 if (tex_const)
2278 info.tex_idx = nir_src_as_uint(bindless_tex->src[0]);
2279 } else {
2280 /* To simplify some of the logic below, assume the index is
2281 * constant 0 when it's not enabled.
2282 */
2283 tex_const = true;
2284 info.tex_idx = 0;
2285 }
2286 nir_intrinsic_instr *bindless_samp = NULL;
2287 bool samp_const;
2288 if (sampler_idx >= 0) {
2289 ctx->so->bindless_samp = true;
2290 bindless_samp = ir3_bindless_resource(tex->src[sampler_idx].src);
2291 assert(bindless_samp);
2292 info.samp_base = nir_intrinsic_desc_set(bindless_samp);
2293 samp_const = nir_src_is_const(bindless_samp->src[0]);
2294 if (samp_const)
2295 info.samp_idx = nir_src_as_uint(bindless_samp->src[0]);
2296 } else {
2297 samp_const = true;
2298 info.samp_idx = 0;
2299 }
2300
2301 /* Choose encoding. */
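/* A summary of the cases below: small constant indices (both < 16 and,
 * if both are bindless, the same desc set) pack straight into the
 * instruction as combined_idx = samp_idx | (tex_idx << 4), e.g.
 * tex_idx=5 / samp_idx=2 gives 0x52; larger constants move the tex index
 * and sampler base into a1.x (IR3_INSTR_A1EN); non-constant indices fall
 * back to the .s2en encoding.
 */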
2302 if (tex_const && samp_const && info.tex_idx < 256 &&
2303 info.samp_idx < 256) {
2304 if (info.tex_idx < 16 && info.samp_idx < 16 &&
2305 (!bindless_tex || !bindless_samp ||
2306 info.tex_base == info.samp_base)) {
2307 /* Everything fits within the instruction */
2308 info.base = info.tex_base;
2309 info.combined_idx = info.samp_idx | (info.tex_idx << 4);
2310 } else {
2311 info.base = info.tex_base;
2312 info.a1_val = info.tex_idx << 3 | info.samp_base;
2313 info.combined_idx = info.samp_idx;
2314 info.flags |= IR3_INSTR_A1EN;
2315 }
2316 info.samp_tex = NULL;
2317 } else {
2318 info.flags |= IR3_INSTR_S2EN;
2319 /* In the indirect case, we only use a1.x to store the sampler
2320 * base if it differs from the texture base.
2321 */
2322 if (!bindless_tex || !bindless_samp ||
2323 info.tex_base == info.samp_base) {
2324 info.base = info.tex_base;
2325 } else {
2326 info.base = info.tex_base;
2327 info.a1_val = info.samp_base;
2328 info.flags |= IR3_INSTR_A1EN;
2329 }
2330
2331 /* Note: the indirect source is now a vec2 instead of hvec2, and
2332 * for some reason the texture and sampler are swapped.
2333 */
2334 struct ir3_instruction *texture, *sampler;
2335
2336 if (bindless_tex) {
2337 texture = ir3_get_src(ctx, &tex->src[texture_idx].src)[0];
2338 } else {
2339 texture = create_immed(b, 0);
2340 }
2341
2342 if (bindless_samp) {
2343 sampler = ir3_get_src(ctx, &tex->src[sampler_idx].src)[0];
2344 } else {
2345 sampler = create_immed(b, 0);
2346 }
2347 info.samp_tex = ir3_collect(b, texture, sampler);
2348 }
2349 } else {
2350 info.flags |= IR3_INSTR_S2EN;
2351 texture_idx = nir_tex_instr_src_index(tex, nir_tex_src_texture_offset);
2352 sampler_idx = nir_tex_instr_src_index(tex, nir_tex_src_sampler_offset);
2353 if (texture_idx >= 0) {
2354 texture = ir3_get_src(ctx, &tex->src[texture_idx].src)[0];
2355 texture = ir3_COV(ctx->block, texture, TYPE_U32, TYPE_U16);
2356 } else {
2357 /* TODO what to do for dynamic case? I guess we only need the
2358 * max index for astc srgb workaround so maybe not a problem
2359 * to worry about if we don't enable indirect samplers for
2360 * a4xx?
2361 */
2362 ctx->max_texture_index =
2363 MAX2(ctx->max_texture_index, tex->texture_index);
2364 texture = create_immed_typed(ctx->block, tex->texture_index, TYPE_U16);
2365 info.tex_idx = tex->texture_index;
2366 }
2367
2368 if (sampler_idx >= 0) {
2369 sampler = ir3_get_src(ctx, &tex->src[sampler_idx].src)[0];
2370 sampler = ir3_COV(ctx->block, sampler, TYPE_U32, TYPE_U16);
2371 } else {
2372 sampler = create_immed_typed(ctx->block, tex->sampler_index, TYPE_U16);
2373 info.samp_idx = tex->texture_index;
2374 }
2375
2376 info.samp_tex = ir3_collect(b, sampler, texture);
2377 }
2378
2379 return info;
2380 }
2381
2382 static void
2383 emit_tex(struct ir3_context *ctx, nir_tex_instr *tex)
2384 {
2385 struct ir3_block *b = ctx->block;
2386 struct ir3_instruction **dst, *sam, *src0[12], *src1[4];
2387 struct ir3_instruction *const *coord, *const *off, *const *ddx, *const *ddy;
2388 struct ir3_instruction *lod, *compare, *proj, *sample_index;
2389 struct tex_src_info info = {0};
2390 bool has_bias = false, has_lod = false, has_proj = false, has_off = false;
2391 unsigned i, coords, flags, ncomp;
2392 unsigned nsrc0 = 0, nsrc1 = 0;
2393 type_t type;
2394 opc_t opc = 0;
2395
2396 ncomp = nir_dest_num_components(tex->dest);
2397
2398 coord = off = ddx = ddy = NULL;
2399 lod = proj = compare = sample_index = NULL;
2400
2401 dst = ir3_get_dst(ctx, &tex->dest, ncomp);
2402
2403 for (unsigned i = 0; i < tex->num_srcs; i++) {
2404 switch (tex->src[i].src_type) {
2405 case nir_tex_src_coord:
2406 coord = ir3_get_src(ctx, &tex->src[i].src);
2407 break;
2408 case nir_tex_src_bias:
2409 lod = ir3_get_src(ctx, &tex->src[i].src)[0];
2410 has_bias = true;
2411 break;
2412 case nir_tex_src_lod:
2413 lod = ir3_get_src(ctx, &tex->src[i].src)[0];
2414 has_lod = true;
2415 break;
2416 case nir_tex_src_comparator: /* shadow comparator */
2417 compare = ir3_get_src(ctx, &tex->src[i].src)[0];
2418 break;
2419 case nir_tex_src_projector:
2420 proj = ir3_get_src(ctx, &tex->src[i].src)[0];
2421 has_proj = true;
2422 break;
2423 case nir_tex_src_offset:
2424 off = ir3_get_src(ctx, &tex->src[i].src);
2425 has_off = true;
2426 break;
2427 case nir_tex_src_ddx:
2428 ddx = ir3_get_src(ctx, &tex->src[i].src);
2429 break;
2430 case nir_tex_src_ddy:
2431 ddy = ir3_get_src(ctx, &tex->src[i].src);
2432 break;
2433 case nir_tex_src_ms_index:
2434 sample_index = ir3_get_src(ctx, &tex->src[i].src)[0];
2435 break;
2436 case nir_tex_src_texture_offset:
2437 case nir_tex_src_sampler_offset:
2438 case nir_tex_src_texture_handle:
2439 case nir_tex_src_sampler_handle:
2440 /* handled in get_tex_samp_tex_src() */
2441 break;
2442 default:
2443 ir3_context_error(ctx, "Unhandled NIR tex src type: %d\n",
2444 tex->src[i].src_type);
2445 return;
2446 }
2447 }
2448
2449 switch (tex->op) {
2450 case nir_texop_tex_prefetch:
2451 compile_assert(ctx, !has_bias);
2452 compile_assert(ctx, !has_lod);
2453 compile_assert(ctx, !compare);
2454 compile_assert(ctx, !has_proj);
2455 compile_assert(ctx, !has_off);
2456 compile_assert(ctx, !ddx);
2457 compile_assert(ctx, !ddy);
2458 compile_assert(ctx, !sample_index);
2459 compile_assert(
2460 ctx, nir_tex_instr_src_index(tex, nir_tex_src_texture_offset) < 0);
2461 compile_assert(
2462 ctx, nir_tex_instr_src_index(tex, nir_tex_src_sampler_offset) < 0);
2463
2464 if (ctx->so->num_sampler_prefetch < ctx->prefetch_limit) {
2465 opc = OPC_META_TEX_PREFETCH;
2466 ctx->so->num_sampler_prefetch++;
2467 break;
2468 }
2469 FALLTHROUGH;
2470 case nir_texop_tex:
2471 opc = has_lod ? OPC_SAML : OPC_SAM;
2472 break;
2473 case nir_texop_txb:
2474 opc = OPC_SAMB;
2475 break;
2476 case nir_texop_txl:
2477 opc = OPC_SAML;
2478 break;
2479 case nir_texop_txd:
2480 opc = OPC_SAMGQ;
2481 break;
2482 case nir_texop_txf:
2483 opc = OPC_ISAML;
2484 break;
2485 case nir_texop_lod:
2486 opc = OPC_GETLOD;
2487 break;
2488 case nir_texop_tg4:
2489 /* NOTE: a4xx might need to emulate gather w/ txf (this is
2490 * what blob does, seems gather is broken?), and a3xx did
2491 * not support it (but probably could also emulate).
2492 */
2493 switch (tex->component) {
2494 case 0:
2495 opc = OPC_GATHER4R;
2496 break;
2497 case 1:
2498 opc = OPC_GATHER4G;
2499 break;
2500 case 2:
2501 opc = OPC_GATHER4B;
2502 break;
2503 case 3:
2504 opc = OPC_GATHER4A;
2505 break;
2506 }
2507 break;
2508 case nir_texop_txf_ms_fb:
2509 case nir_texop_txf_ms:
2510 opc = OPC_ISAMM;
2511 break;
2512 default:
2513 ir3_context_error(ctx, "Unhandled NIR tex type: %d\n", tex->op);
2514 return;
2515 }
2516
2517 tex_info(tex, &flags, &coords);
2518
2519 /*
2520 * lay out the first argument in the proper order:
2521 * - actual coordinates first
2522 * - shadow reference
2523 * - array index
2524 * - projection w
2525 * - starting at offset 4, dpdx.xy, dpdy.xy
2526 *
2527 * bias/lod go into the second arg
2528 */
2529
2530 /* insert tex coords: */
2531 for (i = 0; i < coords; i++)
2532 src0[i] = coord[i];
2533
2534 nsrc0 = i;
2535
2536 /* scale up integer coords for TXF based on the LOD */
2537 if (ctx->compiler->unminify_coords && (opc == OPC_ISAML)) {
2538 assert(has_lod);
2539 for (i = 0; i < coords; i++)
2540 src0[i] = ir3_SHL_B(b, src0[i], 0, lod, 0);
2541 }
2542
2543 if (coords == 1) {
2544 /* hw doesn't do 1d, so we treat it as 2d with
2545 * height of 1, and patch up the y coord.
2546 */
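/* (0 for the integer-coord isam variants, the texel center 0.5 for
 * normalized coords)
 */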
2547 if (is_isam(opc)) {
2548 src0[nsrc0++] = create_immed(b, 0);
2549 } else {
2550 src0[nsrc0++] = create_immed(b, fui(0.5));
2551 }
2552 }
2553
2554 if (tex->is_shadow && tex->op != nir_texop_lod)
2555 src0[nsrc0++] = compare;
2556
2557 if (tex->is_array && tex->op != nir_texop_lod) {
2558 struct ir3_instruction *idx = coord[coords];
2559
2560 /* the array coord for cube arrays needs 0.5 added to it */
2561 if (ctx->compiler->array_index_add_half && !is_isam(opc))
2562 idx = ir3_ADD_F(b, idx, 0, create_immed(b, fui(0.5)), 0);
2563
2564 src0[nsrc0++] = idx;
2565 }
2566
2567 if (has_proj) {
2568 src0[nsrc0++] = proj;
2569 flags |= IR3_INSTR_P;
2570 }
2571
2572 /* pad to 4, then ddx/ddy: */
2573 if (tex->op == nir_texop_txd) {
2574 while (nsrc0 < 4)
2575 src0[nsrc0++] = create_immed(b, fui(0.0));
2576 for (i = 0; i < coords; i++)
2577 src0[nsrc0++] = ddx[i];
2578 if (coords < 2)
2579 src0[nsrc0++] = create_immed(b, fui(0.0));
2580 for (i = 0; i < coords; i++)
2581 src0[nsrc0++] = ddy[i];
2582 if (coords < 2)
2583 src0[nsrc0++] = create_immed(b, fui(0.0));
2584 }
2585
2586 /* NOTE a3xx (and possibly a4xx?) might be different, using isaml
2587 * with scaled x coord according to requested sample:
2588 */
2589 if (opc == OPC_ISAMM) {
2590 if (ctx->compiler->txf_ms_with_isaml) {
2591 /* the samples are laid out in x dimension as
2592 * 0 1 2 3
2593 * x_ms = (x << ms) + sample_index;
2594 */
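/* (ctx->samples packs a 2-bit shift amount, presumably log2(nr_samples),
 * per texture index, hence the extraction below)
 */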
2595 struct ir3_instruction *ms;
2596 ms = create_immed(b, (ctx->samples >> (2 * tex->texture_index)) & 3);
2597
2598 src0[0] = ir3_SHL_B(b, src0[0], 0, ms, 0);
2599 src0[0] = ir3_ADD_U(b, src0[0], 0, sample_index, 0);
2600
2601 opc = OPC_ISAML;
2602 } else {
2603 src0[nsrc0++] = sample_index;
2604 }
2605 }
2606
2607 /*
2608 * second argument (if applicable):
2609 * - offsets
2610 * - lod
2611 * - bias
2612 */
2613 if (has_off | has_lod | has_bias) {
2614 if (has_off) {
2615 unsigned off_coords = coords;
2616 if (tex->sampler_dim == GLSL_SAMPLER_DIM_CUBE)
2617 off_coords--;
2618 for (i = 0; i < off_coords; i++)
2619 src1[nsrc1++] = off[i];
2620 if (off_coords < 2)
2621 src1[nsrc1++] = create_immed(b, fui(0.0));
2622 flags |= IR3_INSTR_O;
2623 }
2624
2625 if (has_lod | has_bias)
2626 src1[nsrc1++] = lod;
2627 }
2628
2629 type = get_tex_dest_type(tex);
2630
2631 if (opc == OPC_GETLOD)
2632 type = TYPE_S32;
2633
2634 if (tex->op == nir_texop_txf_ms_fb) {
2635 /* only expect a single txf_ms_fb per shader: */
2636 compile_assert(ctx, !ctx->so->fb_read);
2637 compile_assert(ctx, ctx->so->type == MESA_SHADER_FRAGMENT);
2638
2639 ctx->so->fb_read = true;
2640 info.samp_tex = ir3_collect(
2641 b, create_immed_typed(ctx->block, ctx->so->num_samp, TYPE_U16),
2642 create_immed_typed(ctx->block, ctx->so->num_samp, TYPE_U16));
2643 info.flags = IR3_INSTR_S2EN;
2644
2645 ctx->so->num_samp++;
2646 } else {
2647 info = get_tex_samp_tex_src(ctx, tex);
2648 }
2649
2650 struct ir3_instruction *col0 = ir3_create_collect(b, src0, nsrc0);
2651 struct ir3_instruction *col1 = ir3_create_collect(b, src1, nsrc1);
2652
2653 if (opc == OPC_META_TEX_PREFETCH) {
2654 int idx = nir_tex_instr_src_index(tex, nir_tex_src_coord);
2655
2656 compile_assert(ctx, tex->src[idx].src.is_ssa);
2657
2658 sam = ir3_SAM(b, opc, type, MASK(ncomp), 0, NULL,
2659 get_barycentric(ctx, IJ_PERSP_PIXEL), 0);
2660 sam->prefetch.input_offset = ir3_nir_coord_offset(tex->src[idx].src.ssa);
2661 /* make sure not to add irrelevant flags like S2EN */
2662 sam->flags = flags | (info.flags & IR3_INSTR_B);
2663 sam->prefetch.tex = info.tex_idx;
2664 sam->prefetch.samp = info.samp_idx;
2665 sam->prefetch.tex_base = info.tex_base;
2666 sam->prefetch.samp_base = info.samp_base;
2667 } else {
2668 info.flags |= flags;
2669 sam = emit_sam(ctx, opc, info, type, MASK(ncomp), col0, col1);
2670 }
2671
2672 if ((ctx->astc_srgb & (1 << tex->texture_index)) &&
2673 !nir_tex_instr_is_query(tex)) {
2674 assert(opc != OPC_META_TEX_PREFETCH);
2675
2676 /* only need first 3 components: */
2677 sam->dsts[0]->wrmask = 0x7;
2678 ir3_split_dest(b, dst, sam, 0, 3);
2679
2680 /* we need to sample the alpha separately with a non-ASTC
2681 * texture state:
2682 */
2683 sam = ir3_SAM(b, opc, type, 0b1000, flags | info.flags, info.samp_tex,
2684 col0, col1);
2685
2686 array_insert(ctx->ir, ctx->ir->astc_srgb, sam);
2687
2688 /* fixup .w component: */
2689 ir3_split_dest(b, &dst[3], sam, 3, 1);
2690 } else {
2691 /* normal (non-workaround) case: */
2692 ir3_split_dest(b, dst, sam, 0, ncomp);
2693 }
2694
2695 /* GETLOD returns results in 4.8 fixed point */
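/* ie. 8 fractional bits, so scaling the s32 result by 1/2^8 (1.0/256)
 * recovers the float LOD values:
 */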
2696 if (opc == OPC_GETLOD) {
2697 struct ir3_instruction *factor = create_immed(b, fui(1.0 / 256));
2698
2699 compile_assert(ctx, tex->dest_type == nir_type_float32);
2700 for (i = 0; i < 2; i++) {
2701 dst[i] =
2702 ir3_MUL_F(b, ir3_COV(b, dst[i], TYPE_S32, TYPE_F32), 0, factor, 0);
2703 }
2704 }
2705
2706 ir3_put_dst(ctx, &tex->dest);
2707 }
2708
2709 static void
2710 emit_tex_info(struct ir3_context *ctx, nir_tex_instr *tex, unsigned idx)
2711 {
2712 struct ir3_block *b = ctx->block;
2713 struct ir3_instruction **dst, *sam;
2714 type_t dst_type = get_tex_dest_type(tex);
2715 struct tex_src_info info = get_tex_samp_tex_src(ctx, tex);
2716
2717 dst = ir3_get_dst(ctx, &tex->dest, 1);
2718
2719 sam = emit_sam(ctx, OPC_GETINFO, info, dst_type, 1 << idx, NULL, NULL);
2720
2721 /* even though there is only one component, since it ends
2722 * up in .y/.z/.w rather than .x, we need a split_dest()
2723 */
2724 ir3_split_dest(b, dst, sam, idx, 1);
2725
2726 /* The # of levels comes from getinfo.z. We need to add 1 to it, since
2727 * the value in TEX_CONST_0 is zero-based.
2728 */
2729 if (ctx->compiler->levels_add_one)
2730 dst[0] = ir3_ADD_U(b, dst[0], 0, create_immed(b, 1), 0);
2731
2732 ir3_put_dst(ctx, &tex->dest);
2733 }
2734
2735 static void
2736 emit_tex_txs(struct ir3_context *ctx, nir_tex_instr *tex)
2737 {
2738 struct ir3_block *b = ctx->block;
2739 struct ir3_instruction **dst, *sam;
2740 struct ir3_instruction *lod;
2741 unsigned flags, coords;
2742 type_t dst_type = get_tex_dest_type(tex);
2743 struct tex_src_info info = get_tex_samp_tex_src(ctx, tex);
2744
2745 tex_info(tex, &flags, &coords);
2746 info.flags |= flags;
2747
2748 /* Actually we want the number of dimensions, not coordinates. This
2749 * distinction only matters for cubes.
2750 */
2751 if (tex->sampler_dim == GLSL_SAMPLER_DIM_CUBE)
2752 coords = 2;
2753
2754 dst = ir3_get_dst(ctx, &tex->dest, 4);
2755
2756 int lod_idx = nir_tex_instr_src_index(tex, nir_tex_src_lod);
2757 compile_assert(ctx, lod_idx >= 0);
2758
2759 lod = ir3_get_src(ctx, &tex->src[lod_idx].src)[0];
2760
2761 if (tex->sampler_dim != GLSL_SAMPLER_DIM_BUF) {
2762 sam = emit_sam(ctx, OPC_GETSIZE, info, dst_type, 0b1111, lod, NULL);
2763 } else {
2764 /*
2765 * The maximum value which OPC_GETSIZE could return for one dimension
2766 * is 0x007ff0, however a sampler buffer could be much bigger.
2767 * The blob uses OPC_GETBUF for them.
2768 */
2769 sam = emit_sam(ctx, OPC_GETBUF, info, dst_type, 0b1111, NULL, NULL);
2770 }
2771
2772 ir3_split_dest(b, dst, sam, 0, 4);
2773
2774 /* Array size actually ends up in .w rather than .z. This doesn't
2775 * matter for miplevel 0, but for higher mips the value in z is
2776 * minified whereas w stays. Also, the value in TEX_CONST_3_DEPTH is
2777 * returned, which means that we have to add 1 to it for arrays.
2778 */
2779 if (tex->is_array) {
2780 if (ctx->compiler->levels_add_one) {
2781 dst[coords] = ir3_ADD_U(b, dst[3], 0, create_immed(b, 1), 0);
2782 } else {
2783 dst[coords] = ir3_MOV(b, dst[3], TYPE_U32);
2784 }
2785 }
2786
2787 ir3_put_dst(ctx, &tex->dest);
2788 }
2789
2790 /* phi instructions are left partially constructed. We don't resolve
2791 * their srcs until the end of the shader, since (eg. loops) one of
2792 * the phi's srcs might be defined after the phi due to back edges in
2793 * the CFG.
2794 */
2795 static void
2796 emit_phi(struct ir3_context *ctx, nir_phi_instr *nphi)
2797 {
2798 struct ir3_instruction *phi, **dst;
2799
2800 /* NOTE: phi's should be lowered to scalar at this point */
2801 compile_assert(ctx, nphi->dest.ssa.num_components == 1);
2802
2803 dst = ir3_get_dst(ctx, &nphi->dest, 1);
2804
2805 phi = ir3_instr_create(ctx->block, OPC_META_PHI, 1,
2806 exec_list_length(&nphi->srcs));
2807 __ssa_dst(phi);
2808 phi->phi.nphi = nphi;
2809
2810 dst[0] = phi;
2811
2812 ir3_put_dst(ctx, &nphi->dest);
2813 }
2814
2815 static struct ir3_block *get_block(struct ir3_context *ctx,
2816 const nir_block *nblock);
2817
2818 static struct ir3_instruction *
2819 read_phi_src(struct ir3_context *ctx, struct ir3_block *blk,
2820 struct ir3_instruction *phi, nir_phi_instr *nphi)
2821 {
2822 if (!blk->nblock) {
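/* blk has no corresponding NIR block (e.g. a continue block created by
 * emit_loop()), so build a phi here that merges the value coming in
 * from each of blk's own predecessors instead.
 */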
2823 struct ir3_instruction *continue_phi =
2824 ir3_instr_create(blk, OPC_META_PHI, 1, blk->predecessors_count);
2825 __ssa_dst(continue_phi)->flags = phi->dsts[0]->flags;
2826
2827 for (unsigned i = 0; i < blk->predecessors_count; i++) {
2828 struct ir3_instruction *src =
2829 read_phi_src(ctx, blk->predecessors[i], phi, nphi);
2830 if (src)
2831 __ssa_src(continue_phi, src, 0);
2832 else
2833 ir3_src_create(continue_phi, INVALID_REG, phi->dsts[0]->flags);
2834 }
2835
2836 return continue_phi;
2837 }
2838
2839 nir_foreach_phi_src (nsrc, nphi) {
2840 if (blk->nblock == nsrc->pred) {
2841 if (nsrc->src.ssa->parent_instr->type == nir_instr_type_ssa_undef) {
2842 /* Create an ir3 undef */
2843 return NULL;
2844 } else {
2845 return ir3_get_src(ctx, &nsrc->src)[0];
2846 }
2847 }
2848 }
2849
2850 unreachable("couldn't find phi node ir3 block");
2851 return NULL;
2852 }
2853
2854 static void
2855 resolve_phis(struct ir3_context *ctx, struct ir3_block *block)
2856 {
2857 foreach_instr (phi, &block->instr_list) {
2858 if (phi->opc != OPC_META_PHI)
2859 break;
2860
2861 nir_phi_instr *nphi = phi->phi.nphi;
2862
2863 if (!nphi) /* skip continue phis created above */
2864 continue;
2865
2866 for (unsigned i = 0; i < block->predecessors_count; i++) {
2867 struct ir3_block *pred = block->predecessors[i];
2868 struct ir3_instruction *src = read_phi_src(ctx, pred, phi, nphi);
2869 if (src) {
2870 __ssa_src(phi, src, 0);
2871 } else {
2872 /* Create an ir3 undef */
2873 ir3_src_create(phi, INVALID_REG, phi->dsts[0]->flags);
2874 }
2875 }
2876 }
2877 }
2878
2879 static void
2880 emit_jump(struct ir3_context *ctx, nir_jump_instr *jump)
2881 {
2882 switch (jump->type) {
2883 case nir_jump_break:
2884 case nir_jump_continue:
2885 case nir_jump_return:
2886 /* I *think* we can simply just ignore this, and use the
2887 * successor block link to figure out where we need to
2888 * jump to for break/continue
2889 */
2890 break;
2891 default:
2892 ir3_context_error(ctx, "Unhandled NIR jump type: %d\n", jump->type);
2893 break;
2894 }
2895 }
2896
2897 static void
2898 emit_instr(struct ir3_context *ctx, nir_instr *instr)
2899 {
2900 switch (instr->type) {
2901 case nir_instr_type_alu:
2902 emit_alu(ctx, nir_instr_as_alu(instr));
2903 break;
2904 case nir_instr_type_deref:
2905 /* ignored, handled as part of the intrinsic they are src to */
2906 break;
2907 case nir_instr_type_intrinsic:
2908 emit_intrinsic(ctx, nir_instr_as_intrinsic(instr));
2909 break;
2910 case nir_instr_type_load_const:
2911 emit_load_const(ctx, nir_instr_as_load_const(instr));
2912 break;
2913 case nir_instr_type_ssa_undef:
2914 emit_undef(ctx, nir_instr_as_ssa_undef(instr));
2915 break;
2916 case nir_instr_type_tex: {
2917 nir_tex_instr *tex = nir_instr_as_tex(instr);
2918 /* couple tex instructions get special-cased:
2919 */
2920 switch (tex->op) {
2921 case nir_texop_txs:
2922 emit_tex_txs(ctx, tex);
2923 break;
2924 case nir_texop_query_levels:
2925 emit_tex_info(ctx, tex, 2);
2926 break;
2927 case nir_texop_texture_samples:
2928 emit_tex_info(ctx, tex, 3);
2929 break;
2930 default:
2931 emit_tex(ctx, tex);
2932 break;
2933 }
2934 break;
2935 }
2936 case nir_instr_type_jump:
2937 emit_jump(ctx, nir_instr_as_jump(instr));
2938 break;
2939 case nir_instr_type_phi:
2940 emit_phi(ctx, nir_instr_as_phi(instr));
2941 break;
2942 case nir_instr_type_call:
2943 case nir_instr_type_parallel_copy:
2944 ir3_context_error(ctx, "Unhandled NIR instruction type: %d\n",
2945 instr->type);
2946 break;
2947 }
2948 }
2949
2950 static struct ir3_block *
2951 get_block(struct ir3_context *ctx, const nir_block *nblock)
2952 {
2953 struct ir3_block *block;
2954 struct hash_entry *hentry;
2955
2956 hentry = _mesa_hash_table_search(ctx->block_ht, nblock);
2957 if (hentry)
2958 return hentry->data;
2959
2960 block = ir3_block_create(ctx->ir);
2961 block->nblock = nblock;
2962 _mesa_hash_table_insert(ctx->block_ht, nblock, block);
2963
2964 return block;
2965 }
2966
2967 static struct ir3_block *
2968 get_block_or_continue(struct ir3_context *ctx, const nir_block *nblock)
2969 {
2970 struct hash_entry *hentry;
2971
2972 hentry = _mesa_hash_table_search(ctx->continue_block_ht, nblock);
2973 if (hentry)
2974 return hentry->data;
2975
2976 return get_block(ctx, nblock);
2977 }
2978
2979 static struct ir3_block *
2980 create_continue_block(struct ir3_context *ctx, const nir_block *nblock)
2981 {
2982 struct ir3_block *block = ir3_block_create(ctx->ir);
2983 block->nblock = NULL;
2984 _mesa_hash_table_insert(ctx->continue_block_ht, nblock, block);
2985 return block;
2986 }
2987
2988 static void
2989 emit_block(struct ir3_context *ctx, nir_block *nblock)
2990 {
2991 ctx->block = get_block(ctx, nblock);
2992
2993 list_addtail(&ctx->block->node, &ctx->ir->block_list);
2994
2995 ctx->block->loop_id = ctx->loop_id;
2996 ctx->block->loop_depth = ctx->loop_depth;
2997
2998 /* re-emit addr register in each block if needed: */
2999 for (int i = 0; i < ARRAY_SIZE(ctx->addr0_ht); i++) {
3000 _mesa_hash_table_destroy(ctx->addr0_ht[i], NULL);
3001 ctx->addr0_ht[i] = NULL;
3002 }
3003
3004 _mesa_hash_table_u64_destroy(ctx->addr1_ht);
3005 ctx->addr1_ht = NULL;
3006
3007 nir_foreach_instr (instr, nblock) {
3008 ctx->cur_instr = instr;
3009 emit_instr(ctx, instr);
3010 ctx->cur_instr = NULL;
3011 if (ctx->error)
3012 return;
3013 }
3014
3015 for (int i = 0; i < ARRAY_SIZE(ctx->block->successors); i++) {
3016 if (nblock->successors[i]) {
3017 ctx->block->successors[i] =
3018 get_block_or_continue(ctx, nblock->successors[i]);
3019 ctx->block->physical_successors[i] = ctx->block->successors[i];
3020 }
3021 }
3022
3023 _mesa_hash_table_clear(ctx->sel_cond_conversions, NULL);
3024 }
3025
3026 static void emit_cf_list(struct ir3_context *ctx, struct exec_list *list);
3027
3028 static void
3029 emit_if(struct ir3_context *ctx, nir_if *nif)
3030 {
3031 struct ir3_instruction *condition = ir3_get_src(ctx, &nif->condition)[0];
3032
3033 if (condition->opc == OPC_ANY_MACRO && condition->block == ctx->block) {
3034 ctx->block->condition = ssa(condition->srcs[0]);
3035 ctx->block->brtype = IR3_BRANCH_ANY;
3036 } else if (condition->opc == OPC_ALL_MACRO &&
3037 condition->block == ctx->block) {
3038 ctx->block->condition = ssa(condition->srcs[0]);
3039 ctx->block->brtype = IR3_BRANCH_ALL;
3040 } else if (condition->opc == OPC_ELECT_MACRO &&
3041 condition->block == ctx->block) {
3042 ctx->block->condition = NULL;
3043 ctx->block->brtype = IR3_BRANCH_GETONE;
3044 } else {
3045 ctx->block->condition = ir3_get_predicate(ctx, condition);
3046 ctx->block->brtype = IR3_BRANCH_COND;
3047 }
3048
3049 emit_cf_list(ctx, &nif->then_list);
3050 emit_cf_list(ctx, &nif->else_list);
3051
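/* With divergent control flow the then-block can physically fall through
 * into the else-block, and the else-block into the block after the if,
 * which is presumably why these extra physical edges are recorded in
 * addition to the logical successors:
 */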
3052 struct ir3_block *last_then = get_block(ctx, nir_if_last_then_block(nif));
3053 struct ir3_block *first_else = get_block(ctx, nir_if_first_else_block(nif));
3054 assert(last_then->physical_successors[0] &&
3055 !last_then->physical_successors[1]);
3056 last_then->physical_successors[1] = first_else;
3057
3058 struct ir3_block *last_else = get_block(ctx, nir_if_last_else_block(nif));
3059 struct ir3_block *after_if =
3060 get_block(ctx, nir_cf_node_as_block(nir_cf_node_next(&nif->cf_node)));
3061 assert(last_else->physical_successors[0] &&
3062 !last_else->physical_successors[1]);
3063 if (after_if != last_else->physical_successors[0])
3064 last_else->physical_successors[1] = after_if;
3065 }
3066
3067 static void
3068 emit_loop(struct ir3_context *ctx, nir_loop *nloop)
3069 {
3070 unsigned old_loop_id = ctx->loop_id;
3071 ctx->loop_id = ctx->so->loops + 1;
3072 ctx->loop_depth++;
3073
3074 struct nir_block *nstart = nir_loop_first_block(nloop);
3075 struct ir3_block *continue_blk = NULL;
3076
3077 /* There's always one incoming edge from outside the loop, and if there
3078 * is more than one backedge from inside the loop (so more than 2 total
3079 * edges) then we need to create a continue block after the loop to ensure
3080 * that control reconverges at the end of each loop iteration.
3081 */
3082 if (nstart->predecessors->entries > 2) {
3083 continue_blk = create_continue_block(ctx, nstart);
3084 }
3085
3086 emit_cf_list(ctx, &nloop->body);
3087
3088 if (continue_blk) {
3089 struct ir3_block *start = get_block(ctx, nstart);
3090 continue_blk->successors[0] = start;
3091 continue_blk->physical_successors[0] = start;
3092 continue_blk->loop_id = ctx->loop_id;
3093 continue_blk->loop_depth = ctx->loop_depth;
3094 list_addtail(&continue_blk->node, &ctx->ir->block_list);
3095 }
3096
3097 ctx->so->loops++;
3098 ctx->loop_depth--;
3099 ctx->loop_id = old_loop_id;
3100 }
3101
3102 static void
3103 stack_push(struct ir3_context *ctx)
3104 {
3105 ctx->stack++;
3106 ctx->max_stack = MAX2(ctx->max_stack, ctx->stack);
3107 }
3108
3109 static void
3110 stack_pop(struct ir3_context *ctx)
3111 {
3112 compile_assert(ctx, ctx->stack > 0);
3113 ctx->stack--;
3114 }
3115
3116 static void
3117 emit_cf_list(struct ir3_context *ctx, struct exec_list *list)
3118 {
3119 foreach_list_typed (nir_cf_node, node, node, list) {
3120 switch (node->type) {
3121 case nir_cf_node_block:
3122 emit_block(ctx, nir_cf_node_as_block(node));
3123 break;
3124 case nir_cf_node_if:
3125 stack_push(ctx);
3126 emit_if(ctx, nir_cf_node_as_if(node));
3127 stack_pop(ctx);
3128 break;
3129 case nir_cf_node_loop:
3130 stack_push(ctx);
3131 emit_loop(ctx, nir_cf_node_as_loop(node));
3132 stack_pop(ctx);
3133 break;
3134 case nir_cf_node_function:
3135 ir3_context_error(ctx, "TODO\n");
3136 break;
3137 }
3138 }
3139 }
3140
3141 /* emit stream-out code. At this point, the current block is the original
3142 * (nir) end block, and nir ensures that all flow control paths terminate
3143 * into the end block. We re-purpose the original end block to generate
3144 * the 'if (vtxcnt < maxvtxcnt)' condition, then append the conditional
3145 * block holding stream-out write instructions, followed by the new end
3146 * block:
3147 *
3148 * blockOrigEnd {
3149 * p0.x = (vtxcnt < maxvtxcnt)
3150 * // succs: blockStreamOut, blockNewEnd
3151 * }
3152 * blockStreamOut {
3153 * // preds: blockOrigEnd
3154 * ... stream-out instructions ...
3155 * // succs: blockNewEnd
3156 * }
3157 * blockNewEnd {
3158 * // preds: blockOrigEnd, blockStreamOut
3159 * }
3160 */
3161 static void
3162 emit_stream_out(struct ir3_context *ctx)
3163 {
3164 struct ir3 *ir = ctx->ir;
3165 struct ir3_stream_output_info *strmout = &ctx->so->shader->stream_output;
3166 struct ir3_block *orig_end_block, *stream_out_block, *new_end_block;
3167 struct ir3_instruction *vtxcnt, *maxvtxcnt, *cond;
3168 struct ir3_instruction *bases[IR3_MAX_SO_BUFFERS];
3169
3170 /* create vtxcnt input in input block at top of shader,
3171 * so that it is seen as live over the entire duration
3172 * of the shader:
3173 */
3174 vtxcnt = create_sysval_input(ctx, SYSTEM_VALUE_VERTEX_CNT, 0x1);
3175 maxvtxcnt = create_driver_param(ctx, IR3_DP_VTXCNT_MAX);
3176
3177 /* at this point, we are at the original 'end' block,
3178 * re-purpose this block to stream-out condition, then
3179 * append stream-out block and new-end block
3180 */
3181 orig_end_block = ctx->block;
3182
3183 // maybe w/ store_global intrinsic, we could do this
3184 // stuff in nir->nir pass
3185
3186 stream_out_block = ir3_block_create(ir);
3187 list_addtail(&stream_out_block->node, &ir->block_list);
3188
3189 new_end_block = ir3_block_create(ir);
3190 list_addtail(&new_end_block->node, &ir->block_list);
3191
3192 orig_end_block->successors[0] = stream_out_block;
3193 orig_end_block->successors[1] = new_end_block;
3194
3195 orig_end_block->physical_successors[0] = stream_out_block;
3196 orig_end_block->physical_successors[1] = new_end_block;
3197
3198 stream_out_block->successors[0] = new_end_block;
3199
3200 stream_out_block->physical_successors[0] = new_end_block;
3201
3202 /* setup 'if (vtxcnt < maxvtxcnt)' condition: */
3203 cond = ir3_CMPS_S(ctx->block, vtxcnt, 0, maxvtxcnt, 0);
3204 cond->dsts[0]->num = regid(REG_P0, 0);
3205 cond->dsts[0]->flags &= ~IR3_REG_SSA;
3206 cond->cat2.condition = IR3_COND_LT;
3207
3208 /* condition goes on previous block to the conditional,
3209 * since it is used to pick which of the two successor
3210 * paths to take:
3211 */
3212 orig_end_block->condition = cond;
3213
3214 /* switch to stream_out_block to generate the stream-out
3215 * instructions:
3216 */
3217 ctx->block = stream_out_block;
3218
3219 /* Calculate base addresses based on vtxcnt. Instructions
3220 * generated for bases not used in following loop will be
3221 * stripped out in the backend.
3222 */
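/* ie. bases[i] = TFBO[i] + vtxcnt * stride[i] * 4, the address where
 * this vertex's outputs start in stream-out buffer i (stride[] is
 * presumably in dwords, hence the * 4).
 */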
3223 for (unsigned i = 0; i < IR3_MAX_SO_BUFFERS; i++) {
3224 const struct ir3_const_state *const_state = ir3_const_state(ctx->so);
3225 unsigned stride = strmout->stride[i];
3226 struct ir3_instruction *base, *off;
3227
3228 base = create_uniform(ctx->block, regid(const_state->offsets.tfbo, i));
3229
3230 /* 24-bit should be enough: */
3231 off = ir3_MUL_U24(ctx->block, vtxcnt, 0,
3232 create_immed(ctx->block, stride * 4), 0);
3233
3234 bases[i] = ir3_ADD_S(ctx->block, off, 0, base, 0);
3235 }
3236
3237 /* Generate the per-output store instructions: */
3238 for (unsigned i = 0; i < strmout->num_outputs; i++) {
3239 for (unsigned j = 0; j < strmout->output[i].num_components; j++) {
3240 unsigned c = j + strmout->output[i].start_component;
3241 struct ir3_instruction *base, *out, *stg;
3242
3243 base = bases[strmout->output[i].output_buffer];
3244 out = ctx->outputs[regid(strmout->output[i].register_index, c)];
3245
3246 stg = ir3_STG(
3247 ctx->block, base, 0,
3248 create_immed(ctx->block, (strmout->output[i].dst_offset + j) * 4),
3249 0, out, 0, create_immed(ctx->block, 1), 0);
3250 stg->cat6.type = TYPE_U32;
3251
3252 array_insert(ctx->block, ctx->block->keeps, stg);
3253 }
3254 }
3255
3256 /* and finally switch to the new_end_block: */
3257 ctx->block = new_end_block;
3258 }
3259
3260 static void
3261 setup_predecessors(struct ir3 *ir)
3262 {
3263 foreach_block (block, &ir->block_list) {
3264 for (int i = 0; i < ARRAY_SIZE(block->successors); i++) {
3265 if (block->successors[i])
3266 ir3_block_add_predecessor(block->successors[i], block);
3267 if (block->physical_successors[i])
3268 ir3_block_add_physical_predecessor(block->physical_successors[i],
3269 block);
3270 }
3271 }
3272 }
3273
3274 static void
3275 emit_function(struct ir3_context *ctx, nir_function_impl *impl)
3276 {
3277 nir_metadata_require(impl, nir_metadata_block_index);
3278
3279 compile_assert(ctx, ctx->stack == 0);
3280
3281 emit_cf_list(ctx, &impl->body);
3282 emit_block(ctx, impl->end_block);
3283
3284 compile_assert(ctx, ctx->stack == 0);
3285
3286 /* at this point, we should have a single empty block,
3287 * into which we emit the 'end' instruction.
3288 */
3289 compile_assert(ctx, list_is_empty(&ctx->block->instr_list));
3290
3291 /* If stream-out (aka transform-feedback) enabled, emit the
3292 * stream-out instructions, followed by a new empty block (into
3293 * which the 'end' instruction lands).
3294 *
3295 * NOTE: it is done in this order, rather than inserting before
3296 * we emit end_block, because NIR guarantees that all blocks
3297 * flow into end_block, and that end_block has no successors.
3298 * So by re-purposing end_block as the first block of stream-
3299 * out, we guarantee that all exit paths flow into the stream-
3300 * out instructions.
3301 */
3302 if ((ctx->compiler->gen < 5) &&
3303 (ctx->so->shader->stream_output.num_outputs > 0) &&
3304 !ctx->so->binning_pass) {
3305 debug_assert(ctx->so->type == MESA_SHADER_VERTEX);
3306 emit_stream_out(ctx);
3307 }
3308
3309 setup_predecessors(ctx->ir);
3310 foreach_block (block, &ctx->ir->block_list) {
3311 resolve_phis(ctx, block);
3312 }
3313 }
3314
3315 static void
3316 setup_input(struct ir3_context *ctx, nir_intrinsic_instr *intr)
3317 {
3318 struct ir3_shader_variant *so = ctx->so;
3319 struct ir3_instruction *coord = NULL;
3320
3321 if (intr->intrinsic == nir_intrinsic_load_interpolated_input)
3322 coord = ir3_create_collect(ctx->block, ir3_get_src(ctx, &intr->src[0]), 2);
3323
3324 compile_assert(ctx, nir_src_is_const(intr->src[coord ? 1 : 0]));
3325
3326 unsigned frac = nir_intrinsic_component(intr);
3327 unsigned offset = nir_src_as_uint(intr->src[coord ? 1 : 0]);
3328 unsigned ncomp = nir_intrinsic_dest_components(intr);
3329 unsigned n = nir_intrinsic_base(intr) + offset;
3330 unsigned slot = nir_intrinsic_io_semantics(intr).location + offset;
3331 unsigned compmask;
3332
3333 /* Inputs for other stages are loaded using ldlw or ldg, so only FS and VS inputs reach here. */
3334 compile_assert(ctx, ctx->so->type == MESA_SHADER_FRAGMENT ||
3335 ctx->so->type == MESA_SHADER_VERTEX);
3336
3337 if (ctx->so->type == MESA_SHADER_FRAGMENT)
3338 compmask = BITFIELD_MASK(ncomp) << frac;
3339 else
3340 compmask = BITFIELD_MASK(ncomp + frac);
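/* e.g. ncomp=2, frac=1: FS gets compmask 0b0110 (only the components
 * actually read), VS gets 0b0111 (counted from .x).
 */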
3341
3342 /* for a4xx+ rasterflat */
3343 if (so->inputs[n].rasterflat && ctx->so->key.rasterflat)
3344 coord = NULL;
3345
3346 so->total_in += util_bitcount(compmask & ~so->inputs[n].compmask);
3347
3348 so->inputs[n].slot = slot;
3349 so->inputs[n].compmask |= compmask;
3350 so->inputs_count = MAX2(so->inputs_count, n + 1);
3351 compile_assert(ctx, so->inputs_count < ARRAY_SIZE(so->inputs));
3352 so->inputs[n].flat = !coord;
3353
3354 if (ctx->so->type == MESA_SHADER_FRAGMENT) {
3355 compile_assert(ctx, slot != VARYING_SLOT_POS);
3356
3357 so->inputs[n].bary = true;
3358
3359 for (int i = 0; i < ncomp; i++) {
3360 unsigned idx = (n * 4) + i + frac;
3361 ctx->last_dst[i] = create_frag_input(ctx, coord, idx);
3362 }
3363 } else {
3364 struct ir3_instruction *input = NULL;
3365
3366 foreach_input (in, ctx->ir) {
3367 if (in->input.inidx == n) {
3368 input = in;
3369 break;
3370 }
3371 }
3372
3373 if (!input) {
3374 input = create_input(ctx, compmask);
3375 input->input.inidx = n;
3376 } else {
3377 /* For aliased inputs, just append to the wrmask.. ie. if we
3378 * first see a vec2 index at slot N, and then later a vec4,
3379 * the wrmask of the resulting overlapped vec2 and vec4 is 0xf
3380 */
3381 input->dsts[0]->wrmask |= compmask;
3382 }
3383
3384 for (int i = 0; i < ncomp + frac; i++) {
3385 unsigned idx = (n * 4) + i;
3386 compile_assert(ctx, idx < ctx->ninputs);
3387
3388 /* fixup the src wrmask to avoid validation fail */
3389 if (ctx->inputs[idx] && (ctx->inputs[idx] != input)) {
3390 ctx->inputs[idx]->srcs[0]->wrmask = input->dsts[0]->wrmask;
3391 continue;
3392 }
3393
3394 ir3_split_dest(ctx->block, &ctx->inputs[idx], input, i, 1);
3395 }
3396
3397 for (int i = 0; i < ncomp; i++) {
3398 unsigned idx = (n * 4) + i + frac;
3399 ctx->last_dst[i] = ctx->inputs[idx];
3400 }
3401 }
3402 }

/* Initially we assign non-packed inloc's for varyings, as we don't really
 * know up-front which components will be unused.  After all the compilation
 * stages we scan the shader to see which components are actually used, and
 * re-pack the inlocs to eliminate unneeded varyings.
 */
static void
pack_inlocs(struct ir3_context *ctx)
{
   struct ir3_shader_variant *so = ctx->so;
   uint8_t used_components[so->inputs_count];

   memset(used_components, 0, sizeof(used_components));

   /*
    * First Step: scan shader to find which bary.f/ldlv remain:
    */

   foreach_block (block, &ctx->ir->block_list) {
      foreach_instr (instr, &block->instr_list) {
         if (is_input(instr)) {
            unsigned inloc = instr->srcs[0]->iim_val;
            unsigned i = inloc / 4;
            unsigned j = inloc % 4;

            compile_assert(ctx, instr->srcs[0]->flags & IR3_REG_IMMED);
            compile_assert(ctx, i < so->inputs_count);

            used_components[i] |= 1 << j;
         } else if (instr->opc == OPC_META_TEX_PREFETCH) {
            for (int n = 0; n < 2; n++) {
               unsigned inloc = instr->prefetch.input_offset + n;
               unsigned i = inloc / 4;
               unsigned j = inloc % 4;

               compile_assert(ctx, i < so->inputs_count);

               used_components[i] |= 1 << j;
            }
         }
      }
   }

   /*
    * Second Step: reassign varying inloc/slots:
    */

   unsigned actual_in = 0;
   unsigned inloc = 0;

   /* for clip+cull distances, unused components can't be eliminated because
    * they're read by fixed-function, even if there's a hole.  Note that
    * clip/cull distance arrays must be declared in the FS, so we can just
    * use the NIR clip/cull distances to avoid reading ucp_enables in the
    * shader key.
    */
   unsigned clip_cull_size =
      ctx->so->shader->nir->info.clip_distance_array_size +
      ctx->so->shader->nir->info.cull_distance_array_size;
   unsigned clip_cull_mask = MASK(clip_cull_size);

   for (unsigned i = 0; i < so->inputs_count; i++) {
      unsigned compmask = 0, maxcomp = 0;

      so->inputs[i].inloc = inloc;
      so->inputs[i].bary = false;

      if (so->inputs[i].slot == VARYING_SLOT_CLIP_DIST0 ||
          so->inputs[i].slot == VARYING_SLOT_CLIP_DIST1) {
         if (so->inputs[i].slot == VARYING_SLOT_CLIP_DIST0)
            compmask = clip_cull_mask & 0xf;
         else
            compmask = clip_cull_mask >> 4;
         used_components[i] = compmask;
      }

      for (unsigned j = 0; j < 4; j++) {
         if (!(used_components[i] & (1 << j)))
            continue;

         compmask |= (1 << j);
         actual_in++;
         maxcomp = j + 1;

         /* at this point, since used_components[i] mask is only
          * considering varyings (ie. not sysvals) we know this
          * is a varying:
          */
         so->inputs[i].bary = true;
      }

      if (so->inputs[i].bary) {
         so->varying_in++;
         so->inputs[i].compmask = (1 << maxcomp) - 1;
         inloc += maxcomp;
      }
   }

   /*
    * Third Step: reassign packed inloc's:
    */

   foreach_block (block, &ctx->ir->block_list) {
      foreach_instr (instr, &block->instr_list) {
         if (is_input(instr)) {
            unsigned inloc = instr->srcs[0]->iim_val;
            unsigned i = inloc / 4;
            unsigned j = inloc % 4;

            instr->srcs[0]->iim_val = so->inputs[i].inloc + j;
         } else if (instr->opc == OPC_META_TEX_PREFETCH) {
            unsigned i = instr->prefetch.input_offset / 4;
            unsigned j = instr->prefetch.input_offset % 4;
            instr->prefetch.input_offset = so->inputs[i].inloc + j;
         }
      }
   }
}

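/* Handle a store_output intrinsic: record the output slot in the variant's
 * output table and stash the SSA sources in ctx->outputs, to be gathered
 * into the end/chmask instruction later.
 */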
static void
setup_output(struct ir3_context *ctx, nir_intrinsic_instr *intr)
{
   struct ir3_shader_variant *so = ctx->so;
   nir_io_semantics io = nir_intrinsic_io_semantics(intr);

   compile_assert(ctx, nir_src_is_const(intr->src[1]));

   unsigned offset = nir_src_as_uint(intr->src[1]);
   unsigned n = nir_intrinsic_base(intr) + offset;
   unsigned frac = nir_intrinsic_component(intr);
   unsigned ncomp = nir_intrinsic_src_components(intr, 0);

   /* For per-view variables, each user-facing slot corresponds to multiple
    * views, each with a corresponding driver_location, and the offset is for
    * the driver_location.  To properly figure out the slot, we'd need to
    * plumb through the number of views.  However, for now we only use
    * per-view with gl_Position, so we assume that the variable is not an
    * array or matrix (so there are no indirect accesses to the variable
    * itself) and the indirect offset corresponds to the view.
    */
   unsigned slot = io.location + (io.per_view ? 0 : offset);

   if (ctx->so->type == MESA_SHADER_FRAGMENT) {
      switch (slot) {
      case FRAG_RESULT_DEPTH:
         so->writes_pos = true;
         break;
      case FRAG_RESULT_COLOR:
         if (!ctx->s->info.fs.color_is_dual_source) {
            so->color0_mrt = 1;
         } else {
            slot = FRAG_RESULT_DATA0 + io.dual_source_blend_index;
         }
         break;
      case FRAG_RESULT_SAMPLE_MASK:
         so->writes_smask = true;
         break;
      case FRAG_RESULT_STENCIL:
         so->writes_stencilref = true;
         break;
      default:
         slot += io.dual_source_blend_index; /* For dual-src blend */
         if (slot >= FRAG_RESULT_DATA0)
            break;
         ir3_context_error(ctx, "unknown FS output name: %s\n",
                           gl_frag_result_name(slot));
      }
   } else if (ctx->so->type == MESA_SHADER_VERTEX ||
              ctx->so->type == MESA_SHADER_TESS_EVAL ||
              ctx->so->type == MESA_SHADER_GEOMETRY) {
      switch (slot) {
      case VARYING_SLOT_POS:
         so->writes_pos = true;
         break;
      case VARYING_SLOT_PSIZ:
         so->writes_psize = true;
         break;
      case VARYING_SLOT_PRIMITIVE_ID:
      case VARYING_SLOT_GS_VERTEX_FLAGS_IR3:
         debug_assert(ctx->so->type == MESA_SHADER_GEOMETRY);
         FALLTHROUGH;
      case VARYING_SLOT_COL0:
      case VARYING_SLOT_COL1:
      case VARYING_SLOT_BFC0:
      case VARYING_SLOT_BFC1:
      case VARYING_SLOT_FOGC:
      case VARYING_SLOT_CLIP_DIST0:
      case VARYING_SLOT_CLIP_DIST1:
      case VARYING_SLOT_CLIP_VERTEX:
      case VARYING_SLOT_LAYER:
      case VARYING_SLOT_VIEWPORT:
         break;
      default:
         if (slot >= VARYING_SLOT_VAR0)
            break;
         if ((VARYING_SLOT_TEX0 <= slot) && (slot <= VARYING_SLOT_TEX7))
            break;
         ir3_context_error(ctx, "unknown %s shader output name: %s\n",
                           _mesa_shader_stage_to_string(ctx->so->type),
                           gl_varying_slot_name_for_stage(slot, ctx->so->type));
      }
   } else {
      ir3_context_error(ctx, "unknown shader type: %d\n", ctx->so->type);
   }

   so->outputs_count = MAX2(so->outputs_count, n + 1);
   compile_assert(ctx, so->outputs_count < ARRAY_SIZE(so->outputs));

   so->outputs[n].slot = slot;
   if (io.per_view)
      so->outputs[n].view = offset;

   for (int i = 0; i < ncomp; i++) {
      unsigned idx = (n * 4) + i + frac;
      compile_assert(ctx, idx < ctx->noutputs);
      ctx->outputs[idx] = create_immed(ctx->block, fui(0.0));
   }

   /* if varying packing doesn't happen, we could end up in a situation
    * with "holes" in the output, and since the per-generation code that
    * sets up varying linkage registers doesn't expect to have more than
    * one varying per vec4 slot, pad the holes.
    *
    * Note that this should probably generate a performance warning of
    * some sort.
    */
   for (int i = 0; i < frac; i++) {
      unsigned idx = (n * 4) + i;
      if (!ctx->outputs[idx]) {
         ctx->outputs[idx] = create_immed(ctx->block, fui(0.0));
      }
   }

   struct ir3_instruction *const *src = ir3_get_src(ctx, &intr->src[0]);
   for (int i = 0; i < ncomp; i++) {
      unsigned idx = (n * 4) + i + frac;
      ctx->outputs[idx] = src[i];
   }
}

static bool
uses_load_input(struct ir3_shader_variant *so)
{
   return so->type == MESA_SHADER_VERTEX || so->type == MESA_SHADER_FRAGMENT;
}

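/* Stages whose outputs are instead written to memory by earlier lowering
 * (VS/DS feeding tessellation or GS, TCS, compute) don't use store_output,
 * so no ctx->outputs array is needed for them.
 */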
static bool
uses_store_output(struct ir3_shader_variant *so)
{
   switch (so->type) {
   case MESA_SHADER_VERTEX:
      return !so->key.has_gs && !so->key.tessellation;
   case MESA_SHADER_TESS_EVAL:
      return !so->key.has_gs;
   case MESA_SHADER_GEOMETRY:
   case MESA_SHADER_FRAGMENT:
      return true;
   case MESA_SHADER_TESS_CTRL:
   case MESA_SHADER_COMPUTE:
      return false;
   default:
      unreachable("unknown stage");
   }
}

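/* Top-level conversion of the NIR entrypoint: allocate the input/output
 * tables, create the sysval inputs that are needed up-front, declare any
 * arrays, and then emit the function body.
 */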
static void
emit_instructions(struct ir3_context *ctx)
{
   nir_function_impl *fxn = nir_shader_get_entrypoint(ctx->s);

   /* some varying setup which can't be done in setup_input(): */
   if (ctx->so->type == MESA_SHADER_FRAGMENT) {
      nir_foreach_shader_in_variable (var, ctx->s) {
         /* if any varyings have 'sample' qualifier, that triggers us
          * to run in per-sample mode:
          */
         if (var->data.sample)
            ctx->so->per_samp = true;

         /* set rasterflat flag for front/back color */
         if (var->data.interpolation == INTERP_MODE_NONE) {
            switch (var->data.location) {
            case VARYING_SLOT_COL0:
            case VARYING_SLOT_COL1:
            case VARYING_SLOT_BFC0:
            case VARYING_SLOT_BFC1:
               ctx->so->inputs[var->data.driver_location].rasterflat = true;
               break;
            default:
               break;
            }
         }
      }
   }

   if (uses_load_input(ctx->so)) {
      ctx->so->inputs_count = ctx->s->num_inputs;
      compile_assert(ctx, ctx->so->inputs_count < ARRAY_SIZE(ctx->so->inputs));
      ctx->ninputs = ctx->s->num_inputs * 4;
      ctx->inputs = rzalloc_array(ctx, struct ir3_instruction *, ctx->ninputs);
   } else {
      ctx->ninputs = 0;
      ctx->so->inputs_count = 0;
   }

   if (uses_store_output(ctx->so)) {
      ctx->noutputs = ctx->s->num_outputs * 4;
      ctx->outputs =
         rzalloc_array(ctx, struct ir3_instruction *, ctx->noutputs);
   } else {
      ctx->noutputs = 0;
   }

   ctx->ir = ir3_create(ctx->compiler, ctx->so);

   /* Create inputs in first block: */
   ctx->block = get_block(ctx, nir_start_block(fxn));
   ctx->in_block = ctx->block;

   /* for fragment shader, the vcoord input register is used as the
    * base for bary.f varying fetch instrs:
    *
    * TODO defer creating ctx->ij_pixel and corresponding sysvals
    * until emit_intrinsic when we know they are actually needed.
    * For now, we defer creating ctx->ij_centroid, etc, since we
    * only need ij_pixel for "old style" varying inputs (ie.
    * tgsi_to_nir)
    */
   if (ctx->so->type == MESA_SHADER_FRAGMENT) {
      ctx->ij[IJ_PERSP_PIXEL] = create_input(ctx, 0x3);
   }

   /* Defer add_sysval_input() stuff until after setup_inputs(),
    * because sysvals need to be appended after varyings:
    */
   if (ctx->ij[IJ_PERSP_PIXEL]) {
      add_sysval_input_compmask(ctx, SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL, 0x3,
                                ctx->ij[IJ_PERSP_PIXEL]);
   }

   /* Tessellation shaders always need primitive ID for indexing the
    * BO.  Geometry shaders don't always need it but when they do it has to
    * be delivered and unclobbered in the VS.  To make things easy, we always
    * make room for it in VS/DS.
    */
   bool has_tess = ctx->so->key.tessellation != IR3_TESS_NONE;
   bool has_gs = ctx->so->key.has_gs;
   switch (ctx->so->type) {
   case MESA_SHADER_VERTEX:
      if (has_tess) {
         ctx->tcs_header =
            create_sysval_input(ctx, SYSTEM_VALUE_TCS_HEADER_IR3, 0x1);
         ctx->rel_patch_id =
            create_sysval_input(ctx, SYSTEM_VALUE_REL_PATCH_ID_IR3, 0x1);
         ctx->primitive_id =
            create_sysval_input(ctx, SYSTEM_VALUE_PRIMITIVE_ID, 0x1);
      } else if (has_gs) {
         ctx->gs_header =
            create_sysval_input(ctx, SYSTEM_VALUE_GS_HEADER_IR3, 0x1);
         ctx->primitive_id =
            create_sysval_input(ctx, SYSTEM_VALUE_PRIMITIVE_ID, 0x1);
      }
      break;
   case MESA_SHADER_TESS_CTRL:
      ctx->tcs_header =
         create_sysval_input(ctx, SYSTEM_VALUE_TCS_HEADER_IR3, 0x1);
      ctx->rel_patch_id =
         create_sysval_input(ctx, SYSTEM_VALUE_REL_PATCH_ID_IR3, 0x1);
      break;
   case MESA_SHADER_TESS_EVAL:
      if (has_gs) {
         ctx->gs_header =
            create_sysval_input(ctx, SYSTEM_VALUE_GS_HEADER_IR3, 0x1);
         ctx->primitive_id =
            create_sysval_input(ctx, SYSTEM_VALUE_PRIMITIVE_ID, 0x1);
      }
      ctx->rel_patch_id =
         create_sysval_input(ctx, SYSTEM_VALUE_REL_PATCH_ID_IR3, 0x1);
      break;
   case MESA_SHADER_GEOMETRY:
      ctx->gs_header =
         create_sysval_input(ctx, SYSTEM_VALUE_GS_HEADER_IR3, 0x1);
      break;
   default:
      break;
   }

   /* Find # of samplers.  Just assume that we'll be reading from images.. if
    * it is write-only we don't have to count it, but after lowering derefs it
    * is too late to compact indices for that.
    */
   ctx->so->num_samp =
      BITSET_LAST_BIT(ctx->s->info.textures_used) + ctx->s->info.num_images;
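   /* (BITSET_LAST_BIT() is the highest used texture index plus one, so any
    * unused lower texture slots still count towards num_samp.)
    */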

   /* Save off clip+cull information. */
   ctx->so->clip_mask = MASK(ctx->s->info.clip_distance_array_size);
   ctx->so->cull_mask = MASK(ctx->s->info.cull_distance_array_size)
                        << ctx->s->info.clip_distance_array_size;

   ctx->so->pvtmem_size = ctx->s->scratch_size;
   ctx->so->shared_size = ctx->s->info.shared_size;

   /* NOTE: need to do something more clever when we support >1 fxn */
   nir_foreach_register (reg, &fxn->registers) {
      ir3_declare_array(ctx, reg);
   }

   if (ctx->so->type == MESA_SHADER_TESS_CTRL &&
       ctx->compiler->tess_use_shared) {
      struct ir3_instruction *barrier = ir3_BAR(ctx->block);
      barrier->flags = IR3_INSTR_SS | IR3_INSTR_SY;
      barrier->barrier_class = IR3_BARRIER_EVERYTHING;
      array_insert(ctx->block, ctx->block->keeps, barrier);
   }

   /* And emit the body: */
   ctx->impl = fxn;
   emit_function(ctx, fxn);
}

/* Fixup tex sampler state for astc/srgb workaround instructions.  We
 * need to assign the tex state indexes for these after we know the
 * max tex index.
 */
static void
fixup_astc_srgb(struct ir3_context *ctx)
{
   struct ir3_shader_variant *so = ctx->so;
   /* indexed by original tex idx, value is newly assigned alpha sampler
    * state tex idx.  Zero is invalid since there is at least one sampler
    * if we get here.
    */
   unsigned alt_tex_state[16] = {0};
   unsigned tex_idx = ctx->max_texture_index + 1;
   unsigned idx = 0;

   so->astc_srgb.base = tex_idx;

   for (unsigned i = 0; i < ctx->ir->astc_srgb_count; i++) {
      struct ir3_instruction *sam = ctx->ir->astc_srgb[i];

      compile_assert(ctx, sam->cat5.tex < ARRAY_SIZE(alt_tex_state));

      if (alt_tex_state[sam->cat5.tex] == 0) {
         /* assign new alternate/alpha tex state slot: */
         alt_tex_state[sam->cat5.tex] = tex_idx++;
         so->astc_srgb.orig_idx[idx++] = sam->cat5.tex;
         so->astc_srgb.count++;
      }

      sam->cat5.tex = alt_tex_state[sam->cat5.tex];
   }
}

static bool
output_slot_used_for_binning(gl_varying_slot slot)
{
   return slot == VARYING_SLOT_POS || slot == VARYING_SLOT_PSIZ ||
          slot == VARYING_SLOT_CLIP_DIST0 || slot == VARYING_SLOT_CLIP_DIST1 ||
          slot == VARYING_SLOT_VIEWPORT;
}

static struct ir3_instruction *
find_end(struct ir3 *ir)
{
   foreach_block_rev (block, &ir->block_list) {
      foreach_instr_rev (instr, &block->instr_list) {
         if (instr->opc == OPC_END || instr->opc == OPC_CHMASK)
            return instr;
      }
   }
   unreachable("couldn't find end instruction");
}

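/* The binning pass variant only needs the outputs that the binner consumes
 * (position, point size, clip/cull distances, viewport); strip everything
 * else from both the end instruction and the variant's output table.
 */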
static void
fixup_binning_pass(struct ir3_context *ctx, struct ir3_instruction *end)
{
   struct ir3_shader_variant *so = ctx->so;
   unsigned i, j;

   /* first pass, remove unused outputs from the IR level outputs: */
   for (i = 0, j = 0; i < end->srcs_count; i++) {
      unsigned outidx = end->end.outidxs[i];
      unsigned slot = so->outputs[outidx].slot;

      if (output_slot_used_for_binning(slot)) {
         end->srcs[j] = end->srcs[i];
         end->end.outidxs[j] = end->end.outidxs[i];
         j++;
      }
   }
   end->srcs_count = j;

   /* second pass, cleanup the unused slots in ir3_shader_variant::outputs
    * table:
    */
   for (i = 0, j = 0; i < so->outputs_count; i++) {
      unsigned slot = so->outputs[i].slot;

      if (output_slot_used_for_binning(slot)) {
         so->outputs[j] = so->outputs[i];

         /* fixup outidx to point to new output table entry: */
         for (unsigned k = 0; k < end->srcs_count; k++) {
            if (end->end.outidxs[k] == i) {
               end->end.outidxs[k] = j;
               break;
            }
         }

         j++;
      }
   }
   so->outputs_count = j;
}

static void
collect_tex_prefetches(struct ir3_context *ctx, struct ir3 *ir)
{
   unsigned idx = 0;

   /* Collect sampling instructions eligible for pre-dispatch. */
   foreach_block (block, &ir->block_list) {
      foreach_instr_safe (instr, &block->instr_list) {
         if (instr->opc == OPC_META_TEX_PREFETCH) {
            assert(idx < ARRAY_SIZE(ctx->so->sampler_prefetch));
            struct ir3_sampler_prefetch *fetch =
               &ctx->so->sampler_prefetch[idx];
            idx++;

            if (instr->flags & IR3_INSTR_B) {
               fetch->cmd = IR3_SAMPLER_BINDLESS_PREFETCH_CMD;
               /* In bindless mode, the index is actually the base */
               fetch->tex_id = instr->prefetch.tex_base;
               fetch->samp_id = instr->prefetch.samp_base;
               fetch->tex_bindless_id = instr->prefetch.tex;
               fetch->samp_bindless_id = instr->prefetch.samp;
            } else {
               fetch->cmd = IR3_SAMPLER_PREFETCH_CMD;
               fetch->tex_id = instr->prefetch.tex;
               fetch->samp_id = instr->prefetch.samp;
            }
            fetch->wrmask = instr->dsts[0]->wrmask;
            fetch->dst = instr->dsts[0]->num;
            fetch->src = instr->prefetch.input_offset;

            /* These are the limits on a5xx/a6xx, we might need to
             * revisit if SP_FS_PREFETCH[n] changes on later gens:
             */
            assert(fetch->dst <= 0x3f);
            assert(fetch->tex_id <= 0x1f);
            assert(fetch->samp_id <= 0xf);

            ctx->so->total_in =
               MAX2(ctx->so->total_in, instr->prefetch.input_offset + 2);

            fetch->half_precision = !!(instr->dsts[0]->flags & IR3_REG_HALF);

            /* Remove the prefetch placeholder instruction: */
            list_delinit(&instr->node);
         }
      }
   }
}

int
ir3_compile_shader_nir(struct ir3_compiler *compiler,
                       struct ir3_shader_variant *so)
{
   struct ir3_context *ctx;
   struct ir3 *ir;
   int ret = 0, max_bary;
   bool progress;

   assert(!so->ir);

   ctx = ir3_context_init(compiler, so);
   if (!ctx) {
      DBG("INIT failed!");
      ret = -1;
      goto out;
   }

   emit_instructions(ctx);

   if (ctx->error) {
      DBG("EMIT failed!");
      ret = -1;
      goto out;
   }

   ir = so->ir = ctx->ir;

   if (so->type == MESA_SHADER_COMPUTE) {
      so->local_size[0] = ctx->s->info.workgroup_size[0];
      so->local_size[1] = ctx->s->info.workgroup_size[1];
      so->local_size[2] = ctx->s->info.workgroup_size[2];
      so->local_size_variable = ctx->s->info.workgroup_size_variable;
   }

   /* Vertex shaders in a tessellation or geometry pipeline treat END as a
    * NOP and have an epilogue that writes the VS outputs to local storage,
    * to be read by the HS.  Then the shader resets the execution mask
    * (chmask) and chains to the next shader (chsh).  There are also a few
    * output values which we must send to the next stage via registers, and
    * in order for both stages to agree on the register used we must force
    * these to be in specific registers.
    */
   if ((so->type == MESA_SHADER_VERTEX &&
        (so->key.has_gs || so->key.tessellation)) ||
       (so->type == MESA_SHADER_TESS_EVAL && so->key.has_gs)) {
      struct ir3_instruction *outputs[3];
      unsigned outidxs[3];
      unsigned regids[3];
      unsigned outputs_count = 0;

      if (ctx->primitive_id) {
         unsigned n = so->outputs_count++;
         so->outputs[n].slot = VARYING_SLOT_PRIMITIVE_ID;

         struct ir3_instruction *out = ir3_collect(ctx->block, ctx->primitive_id);
         outputs[outputs_count] = out;
         outidxs[outputs_count] = n;
         if (so->type == MESA_SHADER_VERTEX && ctx->rel_patch_id)
            regids[outputs_count] = regid(0, 2);
         else
            regids[outputs_count] = regid(0, 1);
         outputs_count++;
      }

      if (so->type == MESA_SHADER_VERTEX && ctx->rel_patch_id) {
         unsigned n = so->outputs_count++;
         so->outputs[n].slot = VARYING_SLOT_REL_PATCH_ID_IR3;
         struct ir3_instruction *out = ir3_collect(ctx->block, ctx->rel_patch_id);
         outputs[outputs_count] = out;
         outidxs[outputs_count] = n;
         regids[outputs_count] = regid(0, 1);
         outputs_count++;
      }

      if (ctx->gs_header) {
         unsigned n = so->outputs_count++;
         so->outputs[n].slot = VARYING_SLOT_GS_HEADER_IR3;
         struct ir3_instruction *out = ir3_collect(ctx->block, ctx->gs_header);
         outputs[outputs_count] = out;
         outidxs[outputs_count] = n;
         regids[outputs_count] = regid(0, 0);
         outputs_count++;
      }

      if (ctx->tcs_header) {
         unsigned n = so->outputs_count++;
         so->outputs[n].slot = VARYING_SLOT_TCS_HEADER_IR3;
         struct ir3_instruction *out = ir3_collect(ctx->block, ctx->tcs_header);
         outputs[outputs_count] = out;
         outidxs[outputs_count] = n;
         regids[outputs_count] = regid(0, 0);
         outputs_count++;
      }

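      /* The values that must be handed off in specific registers become
       * sources of the chmask instruction, with their register numbers
       * forced below, so they stay live until the chain to the next stage:
       */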
      struct ir3_instruction *chmask =
         ir3_instr_create(ctx->block, OPC_CHMASK, 0, outputs_count);
      chmask->barrier_class = IR3_BARRIER_EVERYTHING;
      chmask->barrier_conflict = IR3_BARRIER_EVERYTHING;

      for (unsigned i = 0; i < outputs_count; i++)
         __ssa_src(chmask, outputs[i], 0)->num = regids[i];

      chmask->end.outidxs = ralloc_array(chmask, unsigned, outputs_count);
      memcpy(chmask->end.outidxs, outidxs, sizeof(unsigned) * outputs_count);

      array_insert(ctx->block, ctx->block->keeps, chmask);

      struct ir3_instruction *chsh = ir3_CHSH(ctx->block);
      chsh->barrier_class = IR3_BARRIER_EVERYTHING;
      chsh->barrier_conflict = IR3_BARRIER_EVERYTHING;
   } else {
      assert((ctx->noutputs % 4) == 0);
      unsigned outidxs[ctx->noutputs / 4];
      struct ir3_instruction *outputs[ctx->noutputs / 4];
      unsigned outputs_count = 0;

      struct ir3_block *b = ctx->block;
      /* Insert these collect's in the block before the end-block if
       * possible, so that any moves they generate can be shuffled around to
       * reduce nop's:
       */
      if (ctx->block->predecessors_count == 1)
         b = ctx->block->predecessors[0];

      /* Setup IR level outputs, which are "collects" that gather
       * the scalar components of outputs.
       */
      for (unsigned i = 0; i < ctx->noutputs; i += 4) {
         unsigned ncomp = 0;
         /* figure out the # of components written:
          *
          * TODO do we need to handle holes, ie. if .x and .z
          * components written, but .y component not written?
          */
         for (unsigned j = 0; j < 4; j++) {
            if (!ctx->outputs[i + j])
               break;
            ncomp++;
         }

         /* Note that in some stages, like TCS, store_output is
          * lowered to memory writes, so no components of the
          * output are "written" from the PoV of traditional store-
          * output instructions:
          */
         if (!ncomp)
            continue;

         struct ir3_instruction *out =
            ir3_create_collect(b, &ctx->outputs[i], ncomp);

         int outidx = i / 4;
         assert(outidx < so->outputs_count);

         outidxs[outputs_count] = outidx;
         outputs[outputs_count] = out;
         outputs_count++;
      }

      /* for a6xx+, binning and draw pass VS use same VBO state, so we
       * need to make sure not to remove any inputs that are used by
       * the nonbinning VS.
       */
      if (ctx->compiler->gen >= 6 && so->binning_pass &&
          so->type == MESA_SHADER_VERTEX) {
         for (int i = 0; i < ctx->ninputs; i++) {
            struct ir3_instruction *in = ctx->inputs[i];

            if (!in)
               continue;

            unsigned n = i / 4;
            unsigned c = i % 4;

            debug_assert(n < so->nonbinning->inputs_count);

            if (so->nonbinning->inputs[n].sysval)
               continue;

            /* be sure to keep inputs, even if only used in VS */
            if (so->nonbinning->inputs[n].compmask & (1 << c))
               array_insert(in->block, in->block->keeps, in);
         }
      }

      struct ir3_instruction *end =
         ir3_instr_create(ctx->block, OPC_END, 0, outputs_count);

      for (unsigned i = 0; i < outputs_count; i++) {
         __ssa_src(end, outputs[i], 0);
      }

      end->end.outidxs = ralloc_array(end, unsigned, outputs_count);
      memcpy(end->end.outidxs, outidxs, sizeof(unsigned) * outputs_count);

      array_insert(ctx->block, ctx->block->keeps, end);

      /* at this point, for binning pass, throw away unneeded outputs: */
      if (so->binning_pass && (ctx->compiler->gen < 6))
         fixup_binning_pass(ctx, end);
   }

   ir3_debug_print(ir, "AFTER: nir->ir3");
   ir3_validate(ir);

   IR3_PASS(ir, ir3_remove_unreachable);

   IR3_PASS(ir, ir3_array_to_ssa);

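   /* Iterate the main IR-level cleanup passes until they stop making
    * progress:
    */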
   do {
      progress = false;

      progress |= IR3_PASS(ir, ir3_cf);
      progress |= IR3_PASS(ir, ir3_cp, so);
      progress |= IR3_PASS(ir, ir3_cse);
      progress |= IR3_PASS(ir, ir3_dce, so);
   } while (progress);

   /* at this point, for binning pass, throw away unneeded outputs:
    * Note that for a6xx and later, we do this after ir3_cp to ensure
    * that the uniform/constant layout for BS and VS matches, so that
    * we can re-use the same VS_CONST state group.
    */
   if (so->binning_pass && (ctx->compiler->gen >= 6)) {
      fixup_binning_pass(ctx, find_end(ctx->so->ir));
      /* cleanup the result of removing unneeded outputs: */
      while (IR3_PASS(ir, ir3_dce, so)) {
      }
   }

   IR3_PASS(ir, ir3_sched_add_deps);

   /* At this point, all the dead code should be long gone: */
   assert(!IR3_PASS(ir, ir3_dce, so));

   ret = ir3_sched(ir);
   if (ret) {
      DBG("SCHED failed!");
      goto out;
   }

   ir3_debug_print(ir, "AFTER: ir3_sched");

   if (IR3_PASS(ir, ir3_cp_postsched)) {
      /* cleanup the result of removing unneeded mov's: */
      while (IR3_PASS(ir, ir3_dce, so)) {
      }
   }

   /* Pre-assign VS inputs on a6xx+ binning pass shader, to align
    * with draw pass VS, so binning and draw pass can both use the
    * same VBO state.
    *
    * Note that VS inputs are expected to be full precision.
    */
   bool pre_assign_inputs = (ir->compiler->gen >= 6) &&
                            (ir->type == MESA_SHADER_VERTEX) &&
                            so->binning_pass;

   if (pre_assign_inputs) {
      foreach_input (in, ir) {
         assert(in->opc == OPC_META_INPUT);
         unsigned inidx = in->input.inidx;

         in->dsts[0]->num = so->nonbinning->inputs[inidx].regid;
      }
   } else if (ctx->tcs_header) {
      /* We need to have these values in the same registers between VS and TCS
       * since the VS chains to TCS and doesn't get the sysvals redelivered.
       */

      ctx->tcs_header->dsts[0]->num = regid(0, 0);
      ctx->rel_patch_id->dsts[0]->num = regid(0, 1);
      if (ctx->primitive_id)
         ctx->primitive_id->dsts[0]->num = regid(0, 2);
   } else if (ctx->gs_header) {
      /* We need to have these values in the same registers between producer
       * (VS or DS) and GS since the producer chains to GS and doesn't get
       * the sysvals redelivered.
       */

      ctx->gs_header->dsts[0]->num = regid(0, 0);
      if (ctx->primitive_id)
         ctx->primitive_id->dsts[0]->num = regid(0, 1);
   } else if (so->num_sampler_prefetch) {
      assert(so->type == MESA_SHADER_FRAGMENT);
      int idx = 0;

      foreach_input (instr, ir) {
         if (instr->input.sysval != SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL)
            continue;

         assert(idx < 2);
         instr->dsts[0]->num = idx;
         idx++;
      }
   }

   ret = ir3_ra(so);

   if (ret) {
      mesa_loge("ir3_ra() failed!");
      goto out;
   }

   IR3_PASS(ir, ir3_postsched, so);

   IR3_PASS(ir, ir3_lower_subgroups);

   if (so->type == MESA_SHADER_FRAGMENT)
      pack_inlocs(ctx);

   /*
    * Fixup inputs/outputs to point to the actual registers assigned:
    *
    * 1) initialize to r63.x (invalid/unused)
    * 2) iterate IR level inputs/outputs and update the variant's
    *    inputs/outputs table based on the assigned registers for
    *    the remaining inputs/outputs.
    */

   for (unsigned i = 0; i < so->inputs_count; i++)
      so->inputs[i].regid = INVALID_REG;
   for (unsigned i = 0; i < so->outputs_count; i++)
      so->outputs[i].regid = INVALID_REG;

   struct ir3_instruction *end = find_end(so->ir);

   for (unsigned i = 0; i < end->srcs_count; i++) {
      unsigned outidx = end->end.outidxs[i];
      struct ir3_register *reg = end->srcs[i];

      so->outputs[outidx].regid = reg->num;
      so->outputs[outidx].half = !!(reg->flags & IR3_REG_HALF);
   }

   foreach_input (in, ir) {
      assert(in->opc == OPC_META_INPUT);
      unsigned inidx = in->input.inidx;

      if (pre_assign_inputs && !so->inputs[inidx].sysval) {
         if (VALIDREG(so->nonbinning->inputs[inidx].regid)) {
            compile_assert(
               ctx, in->dsts[0]->num == so->nonbinning->inputs[inidx].regid);
            compile_assert(ctx, !!(in->dsts[0]->flags & IR3_REG_HALF) ==
                                   so->nonbinning->inputs[inidx].half);
         }
         so->inputs[inidx].regid = so->nonbinning->inputs[inidx].regid;
         so->inputs[inidx].half = so->nonbinning->inputs[inidx].half;
      } else {
         so->inputs[inidx].regid = in->dsts[0]->num;
         so->inputs[inidx].half = !!(in->dsts[0]->flags & IR3_REG_HALF);
      }
   }

   if (ctx->astc_srgb)
      fixup_astc_srgb(ctx);

   /* We need to do legalize after (for frag shaders) the "bary.f"
    * offsets (inloc) have been assigned.
    */
   IR3_PASS(ir, ir3_legalize, so, &max_bary);

   /* Set (ss)(sy) on first TCS and GEOMETRY instructions, since we don't
    * know what we might have to wait on when coming in from VS chsh.
    */
   if (so->type == MESA_SHADER_TESS_CTRL || so->type == MESA_SHADER_GEOMETRY) {
      foreach_block (block, &ir->block_list) {
         foreach_instr (instr, &block->instr_list) {
            instr->flags |= IR3_INSTR_SS | IR3_INSTR_SY;
            break;
         }
      }
   }

   so->branchstack = ctx->max_stack;

   /* Note that actual_in counts inputs that are not bary.f'd for FS: */
   if (so->type == MESA_SHADER_FRAGMENT)
      so->total_in = max_bary + 1;

   /* Collect sampling instructions eligible for pre-dispatch. */
   collect_tex_prefetches(ctx, ir);

   if (so->type == MESA_SHADER_FRAGMENT &&
       ctx->s->info.fs.needs_quad_helper_invocations)
      so->need_pixlod = true;

out:
   if (ret) {
      if (so->ir)
         ir3_destroy(so->ir);
      so->ir = NULL;
   }
   ir3_context_free(ctx);

   return ret;
}