1 /*
2  * Copyright © 2016-2017 Broadcom
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  */
23 
24 #include "broadcom/common/v3d_device_info.h"
25 #include "v3d_compiler.h"
26 #include "util/u_prim.h"
27 #include "compiler/nir/nir_schedule.h"
28 #include "compiler/nir/nir_builder.h"
29 
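/* Returns how many sources the instruction reads: branches take none, and
 * for ALU instructions the count comes from whichever of the add/mul ops
 * is not a NOP.
 */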
30 int
31 vir_get_nsrc(struct qinst *inst)
32 {
33         switch (inst->qpu.type) {
34         case V3D_QPU_INSTR_TYPE_BRANCH:
35                 return 0;
36         case V3D_QPU_INSTR_TYPE_ALU:
37                 if (inst->qpu.alu.add.op != V3D_QPU_A_NOP)
38                         return v3d_qpu_add_op_num_src(inst->qpu.alu.add.op);
39                 else
40                         return v3d_qpu_mul_op_num_src(inst->qpu.alu.mul.op);
41         }
42 
43         return 0;
44 }
45 
46 /**
47  * Returns whether the instruction has any side effects that must be
48  * preserved.
49  */
50 bool
51 vir_has_side_effects(struct v3d_compile *c, struct qinst *inst)
52 {
53         switch (inst->qpu.type) {
54         case V3D_QPU_INSTR_TYPE_BRANCH:
55                 return true;
56         case V3D_QPU_INSTR_TYPE_ALU:
57                 switch (inst->qpu.alu.add.op) {
58                 case V3D_QPU_A_SETREVF:
59                 case V3D_QPU_A_SETMSF:
60                 case V3D_QPU_A_VPMSETUP:
61                 case V3D_QPU_A_STVPMV:
62                 case V3D_QPU_A_STVPMD:
63                 case V3D_QPU_A_STVPMP:
64                 case V3D_QPU_A_VPMWT:
65                 case V3D_QPU_A_TMUWT:
66                         return true;
67                 default:
68                         break;
69                 }
70 
71                 switch (inst->qpu.alu.mul.op) {
72                 case V3D_QPU_M_MULTOP:
73                         return true;
74                 default:
75                         break;
76                 }
77         }
78 
79         if (inst->qpu.sig.ldtmu ||
80             inst->qpu.sig.ldvary ||
81             inst->qpu.sig.ldtlbu ||
82             inst->qpu.sig.ldtlb ||
83             inst->qpu.sig.wrtmuc ||
84             inst->qpu.sig.thrsw) {
85                 return true;
86         }
87 
88         /* ldunifa works like ldunif: it reads an element and advances the
89          * pointer, so each read has a side effect (we don't care for ldunif
90          * because we reconstruct the uniform stream buffer after compiling
91          * with the surviving uniforms), so allowing DCE to remove
92          * one would break follow-up loads. We could fix this by emitting a
93          * unifa for each ldunifa, but each unifa requires 3 delay slots
94          * before a ldunifa, so that would be quite expensive.
95          */
96         if (inst->qpu.sig.ldunifa || inst->qpu.sig.ldunifarf)
97                 return true;
98 
99         return false;
100 }
101 
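/* Returns whether the instruction is a plain MOV/FMOV with no pack/unpack
 * modifiers and no condition flags, i.e. one that passes its source value
 * through unmodified.
 */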
102 bool
103 vir_is_raw_mov(struct qinst *inst)
104 {
105         if (inst->qpu.type != V3D_QPU_INSTR_TYPE_ALU ||
106             (inst->qpu.alu.mul.op != V3D_QPU_M_FMOV &&
107              inst->qpu.alu.mul.op != V3D_QPU_M_MOV)) {
108                 return false;
109         }
110 
111         if (inst->qpu.alu.add.output_pack != V3D_QPU_PACK_NONE ||
112             inst->qpu.alu.mul.output_pack != V3D_QPU_PACK_NONE) {
113                 return false;
114         }
115 
116         if (inst->qpu.alu.add.a_unpack != V3D_QPU_UNPACK_NONE ||
117             inst->qpu.alu.add.b_unpack != V3D_QPU_UNPACK_NONE ||
118             inst->qpu.alu.mul.a_unpack != V3D_QPU_UNPACK_NONE ||
119             inst->qpu.alu.mul.b_unpack != V3D_QPU_UNPACK_NONE) {
120                 return false;
121         }
122 
123         if (inst->qpu.flags.ac != V3D_QPU_COND_NONE ||
124             inst->qpu.flags.mc != V3D_QPU_COND_NONE)
125                 return false;
126 
127         return true;
128 }
129 
130 bool
131 vir_is_add(struct qinst *inst)
132 {
133         return (inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU &&
134                 inst->qpu.alu.add.op != V3D_QPU_A_NOP);
135 }
136 
137 bool
138 vir_is_mul(struct qinst *inst)
139 {
140         return (inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU &&
141                 inst->qpu.alu.mul.op != V3D_QPU_M_NOP);
142 }
143 
144 bool
145 vir_is_tex(const struct v3d_device_info *devinfo, struct qinst *inst)
146 {
147         if (inst->dst.file == QFILE_MAGIC)
148                 return v3d_qpu_magic_waddr_is_tmu(devinfo, inst->dst.index);
149 
150         if (inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU &&
151             inst->qpu.alu.add.op == V3D_QPU_A_TMUWT) {
152                 return true;
153         }
154 
155         return false;
156 }
157 
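/* Returns whether the instruction implicitly writes the r3 accumulator:
 * reads from the VPM, or (before V3D 4.1) signals such as ldvary/ldtlb
 * that return their result in r3.
 */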
158 bool
159 vir_writes_r3(const struct v3d_device_info *devinfo, struct qinst *inst)
160 {
161         for (int i = 0; i < vir_get_nsrc(inst); i++) {
162                 switch (inst->src[i].file) {
163                 case QFILE_VPM:
164                         return true;
165                 default:
166                         break;
167                 }
168         }
169 
170         if (devinfo->ver < 41 && (inst->qpu.sig.ldvary ||
171                                   inst->qpu.sig.ldtlb ||
172                                   inst->qpu.sig.ldtlbu ||
173                                   inst->qpu.sig.ldvpm)) {
174                 return true;
175         }
176 
177         return false;
178 }
179 
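/* Same idea for the r4 accumulator: SFU operations (RECIP, RSQRT, EXP, LOG,
 * SIN) return their results in r4, as does ldtmu before V3D 4.1.
 */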
180 bool
181 vir_writes_r4(const struct v3d_device_info *devinfo, struct qinst *inst)
182 {
183         switch (inst->dst.file) {
184         case QFILE_MAGIC:
185                 switch (inst->dst.index) {
186                 case V3D_QPU_WADDR_RECIP:
187                 case V3D_QPU_WADDR_RSQRT:
188                 case V3D_QPU_WADDR_EXP:
189                 case V3D_QPU_WADDR_LOG:
190                 case V3D_QPU_WADDR_SIN:
191                         return true;
192                 }
193                 break;
194         default:
195                 break;
196         }
197 
198         if (devinfo->ver < 41 && inst->qpu.sig.ldtmu)
199                 return true;
200 
201         return false;
202 }
203 
204 void
205 vir_set_unpack(struct qinst *inst, int src,
206                enum v3d_qpu_input_unpack unpack)
207 {
208         assert(src == 0 || src == 1);
209 
210         if (vir_is_add(inst)) {
211                 if (src == 0)
212                         inst->qpu.alu.add.a_unpack = unpack;
213                 else
214                         inst->qpu.alu.add.b_unpack = unpack;
215         } else {
216                 assert(vir_is_mul(inst));
217                 if (src == 0)
218                         inst->qpu.alu.mul.a_unpack = unpack;
219                 else
220                         inst->qpu.alu.mul.b_unpack = unpack;
221         }
222 }
223 
224 void
225 vir_set_pack(struct qinst *inst, enum v3d_qpu_output_pack pack)
226 {
227         if (vir_is_add(inst)) {
228                 inst->qpu.alu.add.output_pack = pack;
229         } else {
230                 assert(vir_is_mul(inst));
231                 inst->qpu.alu.mul.output_pack = pack;
232         }
233 }
234 
235 void
236 vir_set_cond(struct qinst *inst, enum v3d_qpu_cond cond)
237 {
238         if (vir_is_add(inst)) {
239                 inst->qpu.flags.ac = cond;
240         } else {
241                 assert(vir_is_mul(inst));
242                 inst->qpu.flags.mc = cond;
243         }
244 }
245 
246 enum v3d_qpu_cond
247 vir_get_cond(struct qinst *inst)
248 {
249         assert(inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU);
250 
251         if (vir_is_add(inst))
252                 return inst->qpu.flags.ac;
253         else if (vir_is_mul(inst))
254                 return inst->qpu.flags.mc;
255         else /* NOP */
256                 return V3D_QPU_COND_NONE;
257 }
258 
259 void
260 vir_set_pf(struct v3d_compile *c, struct qinst *inst, enum v3d_qpu_pf pf)
261 {
262         c->flags_temp = -1;
263         if (vir_is_add(inst)) {
264                 inst->qpu.flags.apf = pf;
265         } else {
266                 assert(vir_is_mul(inst));
267                 inst->qpu.flags.mpf = pf;
268         }
269 }
270 
271 void
272 vir_set_uf(struct v3d_compile *c, struct qinst *inst, enum v3d_qpu_uf uf)
273 {
274         c->flags_temp = -1;
275         if (vir_is_add(inst)) {
276                 inst->qpu.flags.auf = uf;
277         } else {
278                 assert(vir_is_mul(inst));
279                 inst->qpu.flags.muf = uf;
280         }
281 }
282 
283 #if 0
284 uint8_t
285 vir_channels_written(struct qinst *inst)
286 {
287         if (vir_is_mul(inst)) {
288                 switch (inst->dst.pack) {
289                 case QPU_PACK_MUL_NOP:
290                 case QPU_PACK_MUL_8888:
291                         return 0xf;
292                 case QPU_PACK_MUL_8A:
293                         return 0x1;
294                 case QPU_PACK_MUL_8B:
295                         return 0x2;
296                 case QPU_PACK_MUL_8C:
297                         return 0x4;
298                 case QPU_PACK_MUL_8D:
299                         return 0x8;
300                 }
301         } else {
302                 switch (inst->dst.pack) {
303                 case QPU_PACK_A_NOP:
304                 case QPU_PACK_A_8888:
305                 case QPU_PACK_A_8888_SAT:
306                 case QPU_PACK_A_32_SAT:
307                         return 0xf;
308                 case QPU_PACK_A_8A:
309                 case QPU_PACK_A_8A_SAT:
310                         return 0x1;
311                 case QPU_PACK_A_8B:
312                 case QPU_PACK_A_8B_SAT:
313                         return 0x2;
314                 case QPU_PACK_A_8C:
315                 case QPU_PACK_A_8C_SAT:
316                         return 0x4;
317                 case QPU_PACK_A_8D:
318                 case QPU_PACK_A_8D_SAT:
319                         return 0x8;
320                 case QPU_PACK_A_16A:
321                 case QPU_PACK_A_16A_SAT:
322                         return 0x3;
323                 case QPU_PACK_A_16B:
324                 case QPU_PACK_A_16B_SAT:
325                         return 0xc;
326                 }
327         }
328         unreachable("Bad pack field");
329 }
330 #endif
331 
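/* Allocates a fresh temporary register, growing the defs[] array and the
 * spillable bitset as needed.  New temps start out marked as spillable.
 */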
332 struct qreg
333 vir_get_temp(struct v3d_compile *c)
334 {
335         struct qreg reg;
336 
337         reg.file = QFILE_TEMP;
338         reg.index = c->num_temps++;
339 
340         if (c->num_temps > c->defs_array_size) {
341                 uint32_t old_size = c->defs_array_size;
342                 c->defs_array_size = MAX2(old_size * 2, 16);
343 
344                 c->defs = reralloc(c, c->defs, struct qinst *,
345                                    c->defs_array_size);
346                 memset(&c->defs[old_size], 0,
347                        sizeof(c->defs[0]) * (c->defs_array_size - old_size));
348 
349                 c->spillable = reralloc(c, c->spillable,
350                                         BITSET_WORD,
351                                         BITSET_WORDS(c->defs_array_size));
352                 for (int i = old_size; i < c->defs_array_size; i++)
353                         BITSET_SET(c->spillable, i);
354         }
355 
356         return reg;
357 }
358 
359 struct qinst *
360 vir_add_inst(enum v3d_qpu_add_op op, struct qreg dst, struct qreg src0, struct qreg src1)
361 {
362         struct qinst *inst = calloc(1, sizeof(*inst));
363 
364         inst->qpu = v3d_qpu_nop();
365         inst->qpu.alu.add.op = op;
366 
367         inst->dst = dst;
368         inst->src[0] = src0;
369         inst->src[1] = src1;
370         inst->uniform = ~0;
371 
372         inst->ip = -1;
373 
374         return inst;
375 }
376 
377 struct qinst *
378 vir_mul_inst(enum v3d_qpu_mul_op op, struct qreg dst, struct qreg src0, struct qreg src1)
379 {
380         struct qinst *inst = calloc(1, sizeof(*inst));
381 
382         inst->qpu = v3d_qpu_nop();
383         inst->qpu.alu.mul.op = op;
384 
385         inst->dst = dst;
386         inst->src[0] = src0;
387         inst->src[1] = src1;
388         inst->uniform = ~0;
389 
390         inst->ip = -1;
391 
392         return inst;
393 }
394 
395 struct qinst *
396 vir_branch_inst(struct v3d_compile *c, enum v3d_qpu_branch_cond cond)
397 {
398         struct qinst *inst = calloc(1, sizeof(*inst));
399 
400         inst->qpu = v3d_qpu_nop();
401         inst->qpu.type = V3D_QPU_INSTR_TYPE_BRANCH;
402         inst->qpu.branch.cond = cond;
403         inst->qpu.branch.msfign = V3D_QPU_MSFIGN_NONE;
404         inst->qpu.branch.bdi = V3D_QPU_BRANCH_DEST_REL;
405         inst->qpu.branch.ub = true;
406         inst->qpu.branch.bdu = V3D_QPU_BRANCH_DEST_REL;
407 
408         inst->dst = vir_nop_reg();
409         inst->uniform = vir_get_uniform_index(c, QUNIFORM_CONSTANT, 0);
410 
411         inst->ip = -1;
412 
413         return inst;
414 }
415 
416 static void
417 vir_emit(struct v3d_compile *c, struct qinst *inst)
418 {
419         inst->ip = -1;
420 
421         switch (c->cursor.mode) {
422         case vir_cursor_add:
423                 list_add(&inst->link, c->cursor.link);
424                 break;
425         case vir_cursor_addtail:
426                 list_addtail(&inst->link, c->cursor.link);
427                 break;
428         }
429 
430         c->cursor = vir_after_inst(inst);
431         c->live_intervals_valid = false;
432 }
433 
434 /* Updates inst to write to a new temporary, emits it, and notes the def. */
435 struct qreg
436 vir_emit_def(struct v3d_compile *c, struct qinst *inst)
437 {
438         assert(inst->dst.file == QFILE_NULL);
439 
440         /* If we're emitting an instruction that's a def, it had better be
441          * writing a register.
442          */
443         if (inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU) {
444                 assert(inst->qpu.alu.add.op == V3D_QPU_A_NOP ||
445                        v3d_qpu_add_op_has_dst(inst->qpu.alu.add.op));
446                 assert(inst->qpu.alu.mul.op == V3D_QPU_M_NOP ||
447                        v3d_qpu_mul_op_has_dst(inst->qpu.alu.mul.op));
448         }
449 
450         inst->dst = vir_get_temp(c);
451 
452         if (inst->dst.file == QFILE_TEMP)
453                 c->defs[inst->dst.index] = inst;
454 
455         vir_emit(c, inst);
456 
457         return inst->dst;
458 }
459 
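/* Emits an instruction whose destination is not a fresh def (e.g. it writes
 * a magic register or conditionally updates an existing temp), clearing any
 * def record for the written temp.
 */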
460 struct qinst *
461 vir_emit_nondef(struct v3d_compile *c, struct qinst *inst)
462 {
463         if (inst->dst.file == QFILE_TEMP)
464                 c->defs[inst->dst.index] = NULL;
465 
466         vir_emit(c, inst);
467 
468         return inst;
469 }
470 
471 struct qblock *
472 vir_new_block(struct v3d_compile *c)
473 {
474         struct qblock *block = rzalloc(c, struct qblock);
475 
476         list_inithead(&block->instructions);
477 
478         block->predecessors = _mesa_set_create(block,
479                                                _mesa_hash_pointer,
480                                                _mesa_key_pointer_equal);
481 
482         block->index = c->next_block_index++;
483 
484         return block;
485 }
486 
487 void
488 vir_set_emit_block(struct v3d_compile *c, struct qblock *block)
489 {
490         c->cur_block = block;
491         c->cursor = vir_after_block(block);
492         list_addtail(&block->link, &c->blocks);
493 }
494 
495 struct qblock *
496 vir_entry_block(struct v3d_compile *c)
497 {
498         return list_first_entry(&c->blocks, struct qblock, link);
499 }
500 
501 struct qblock *
502 vir_exit_block(struct v3d_compile *c)
503 {
504         return list_last_entry(&c->blocks, struct qblock, link);
505 }
506 
507 void
508 vir_link_blocks(struct qblock *predecessor, struct qblock *successor)
509 {
510         _mesa_set_add(successor->predecessors, predecessor);
511         if (predecessor->successors[0]) {
512                 assert(!predecessor->successors[1]);
513                 predecessor->successors[1] = successor;
514         } else {
515                 predecessor->successors[0] = successor;
516         }
517 }
518 
519 const struct v3d_compiler *
520 v3d_compiler_init(const struct v3d_device_info *devinfo,
521                   uint32_t max_inline_uniform_buffers)
522 {
523         struct v3d_compiler *compiler = rzalloc(NULL, struct v3d_compiler);
524         if (!compiler)
525                 return NULL;
526 
527         compiler->devinfo = devinfo;
528         compiler->max_inline_uniform_buffers = max_inline_uniform_buffers;
529 
530         if (!vir_init_reg_sets(compiler)) {
531                 ralloc_free(compiler);
532                 return NULL;
533         }
534 
535         return compiler;
536 }
537 
538 void
539 v3d_compiler_free(const struct v3d_compiler *compiler)
540 {
541         ralloc_free((void *)compiler);
542 }
543 
544 static struct v3d_compile *
545 vir_compile_init(const struct v3d_compiler *compiler,
546                  struct v3d_key *key,
547                  nir_shader *s,
548                  void (*debug_output)(const char *msg,
549                                       void *debug_output_data),
550                  void *debug_output_data,
551                  int program_id, int variant_id,
552                  uint32_t max_threads,
553                  uint32_t min_threads_for_reg_alloc,
554                  uint32_t max_tmu_spills,
555                  bool disable_general_tmu_sched,
556                  bool disable_loop_unrolling,
557                  bool disable_constant_ubo_load_sorting,
558                  bool disable_tmu_pipelining,
559                  bool fallback_scheduler)
560 {
561         struct v3d_compile *c = rzalloc(NULL, struct v3d_compile);
562 
563         c->compiler = compiler;
564         c->devinfo = compiler->devinfo;
565         c->key = key;
566         c->program_id = program_id;
567         c->variant_id = variant_id;
568         c->threads = max_threads;
569         c->debug_output = debug_output;
570         c->debug_output_data = debug_output_data;
571         c->compilation_result = V3D_COMPILATION_SUCCEEDED;
572         c->min_threads_for_reg_alloc = min_threads_for_reg_alloc;
573         c->max_tmu_spills = max_tmu_spills;
574         c->fallback_scheduler = fallback_scheduler;
575         c->disable_general_tmu_sched = disable_general_tmu_sched;
576         c->disable_tmu_pipelining = disable_tmu_pipelining;
577         c->disable_constant_ubo_load_sorting = disable_constant_ubo_load_sorting;
578         c->disable_loop_unrolling = V3D_DEBUG & V3D_DEBUG_NO_LOOP_UNROLL
579                 ? true : disable_loop_unrolling;
580 
581         s = nir_shader_clone(c, s);
582         c->s = s;
583 
584         list_inithead(&c->blocks);
585         vir_set_emit_block(c, vir_new_block(c));
586 
587         c->output_position_index = -1;
588         c->output_sample_mask_index = -1;
589 
590         c->def_ht = _mesa_hash_table_create(c, _mesa_hash_pointer,
591                                             _mesa_key_pointer_equal);
592 
593         c->tmu.outstanding_regs = _mesa_pointer_set_create(c);
594         c->flags_temp = -1;
595 
596         return c;
597 }
598 
599 static int
600 type_size_vec4(const struct glsl_type *type, bool bindless)
601 {
602         return glsl_count_attribute_slots(type, false);
603 }
604 
605 static void
606 v3d_lower_nir(struct v3d_compile *c)
607 {
608         struct nir_lower_tex_options tex_options = {
609                 .lower_txd = true,
610                 .lower_tg4_broadcom_swizzle = true,
611 
612                 .lower_rect = false, /* XXX: Use this on V3D 3.x */
613                 .lower_txp = ~0,
614                 /* Apply swizzles to all samplers. */
615                 .swizzle_result = ~0,
616         };
617 
618         /* Lower the format swizzle and (for 32-bit returns)
619          * ARB_texture_swizzle-style swizzle.
620          */
621         assert(c->key->num_tex_used <= ARRAY_SIZE(c->key->tex));
622         for (int i = 0; i < c->key->num_tex_used; i++) {
623                 for (int j = 0; j < 4; j++)
624                         tex_options.swizzles[i][j] = c->key->tex[i].swizzle[j];
625         }
626 
627         assert(c->key->num_samplers_used <= ARRAY_SIZE(c->key->sampler));
628         for (int i = 0; i < c->key->num_samplers_used; i++) {
629                 if (c->key->sampler[i].return_size == 16) {
630                         tex_options.lower_tex_packing[i] =
631                                 nir_lower_tex_packing_16;
632                 }
633         }
634 
635         /* CS textures may not have return_size reflecting the shadow state. */
636         nir_foreach_uniform_variable(var, c->s) {
637                 const struct glsl_type *type = glsl_without_array(var->type);
638                 unsigned array_len = MAX2(glsl_get_length(var->type), 1);
639 
640                 if (!glsl_type_is_sampler(type) ||
641                     !glsl_sampler_type_is_shadow(type))
642                         continue;
643 
644                 for (int i = 0; i < array_len; i++) {
645                         tex_options.lower_tex_packing[var->data.binding + i] =
646                                 nir_lower_tex_packing_16;
647                 }
648         }
649 
650         NIR_PASS_V(c->s, nir_lower_tex, &tex_options);
651         NIR_PASS_V(c->s, nir_lower_system_values);
652         NIR_PASS_V(c->s, nir_lower_compute_system_values, NULL);
653 
654         NIR_PASS_V(c->s, nir_lower_vars_to_scratch,
655                    nir_var_function_temp,
656                    0,
657                    glsl_get_natural_size_align_bytes);
658         NIR_PASS_V(c->s, v3d_nir_lower_scratch);
659 }
660 
661 static void
662 v3d_set_prog_data_uniforms(struct v3d_compile *c,
663                            struct v3d_prog_data *prog_data)
664 {
665         int count = c->num_uniforms;
666         struct v3d_uniform_list *ulist = &prog_data->uniforms;
667 
668         ulist->count = count;
669         ulist->data = ralloc_array(prog_data, uint32_t, count);
670         memcpy(ulist->data, c->uniform_data,
671                count * sizeof(*ulist->data));
672         ulist->contents = ralloc_array(prog_data, enum quniform_contents, count);
673         memcpy(ulist->contents, c->uniform_contents,
674                count * sizeof(*ulist->contents));
675 }
676 
677 static void
678 v3d_vs_set_prog_data(struct v3d_compile *c,
679                      struct v3d_vs_prog_data *prog_data)
680 {
681         /* The vertex data gets format converted by the VPM so that
682          * each attribute channel takes up a VPM column.  Precompute
683          * the sizes for the shader record.
684          */
685         for (int i = 0; i < ARRAY_SIZE(prog_data->vattr_sizes); i++) {
686                 prog_data->vattr_sizes[i] = c->vattr_sizes[i];
687                 prog_data->vpm_input_size += c->vattr_sizes[i];
688         }
689 
690         memset(prog_data->driver_location_map, -1,
691                sizeof(prog_data->driver_location_map));
692 
693         nir_foreach_shader_in_variable(var, c->s) {
694                 prog_data->driver_location_map[var->data.location] =
695                         var->data.driver_location;
696         }
697 
698         prog_data->uses_vid = BITSET_TEST(c->s->info.system_values_read,
699                                           SYSTEM_VALUE_VERTEX_ID) ||
700                               BITSET_TEST(c->s->info.system_values_read,
701                                           SYSTEM_VALUE_VERTEX_ID_ZERO_BASE);
702 
703         prog_data->uses_biid = BITSET_TEST(c->s->info.system_values_read,
704                                            SYSTEM_VALUE_BASE_INSTANCE);
705 
706         prog_data->uses_iid = BITSET_TEST(c->s->info.system_values_read,
707                                           SYSTEM_VALUE_INSTANCE_ID) ||
708                               BITSET_TEST(c->s->info.system_values_read,
709                                           SYSTEM_VALUE_INSTANCE_INDEX);
710 
711         if (prog_data->uses_vid)
712                 prog_data->vpm_input_size++;
713         if (prog_data->uses_biid)
714                 prog_data->vpm_input_size++;
715         if (prog_data->uses_iid)
716                 prog_data->vpm_input_size++;
717 
718         /* Input/output segment size are in sectors (8 rows of 32 bits per
719          * channel).
720          */
721         prog_data->vpm_input_size = align(prog_data->vpm_input_size, 8) / 8;
722         prog_data->vpm_output_size = align(c->vpm_output_size, 8) / 8;
723 
724         /* Set us up for shared input/output segments.  This is apparently
725          * necessary for our VCM setup to avoid varying corruption.
726          */
727         prog_data->separate_segments = false;
728         prog_data->vpm_output_size = MAX2(prog_data->vpm_output_size,
729                                           prog_data->vpm_input_size);
730         prog_data->vpm_input_size = 0;
731 
732         /* Compute VCM cache size.  We set up our program to take up less than
733          * half of the VPM, so that any set of bin and render programs won't
734          * run out of space.  We need space for at least one input segment,
735          * and then allocate the rest to output segments (one for the current
736          * program, the rest to VCM).  The valid range of the VCM cache size
737          * field is 1-4 16-vertex batches, but GFXH-1744 limits us to 2-4
738          * batches.
739          */
740         assert(c->devinfo->vpm_size);
741         int sector_size = V3D_CHANNELS * sizeof(uint32_t) * 8;
742         int vpm_size_in_sectors = c->devinfo->vpm_size / sector_size;
743         int half_vpm = vpm_size_in_sectors / 2;
744         int vpm_output_sectors = half_vpm - prog_data->vpm_input_size;
745         int vpm_output_batches = vpm_output_sectors / prog_data->vpm_output_size;
746         assert(vpm_output_batches >= 2);
747         prog_data->vcm_cache_size = CLAMP(vpm_output_batches - 1, 2, 4);
748 }
749 
750 static void
751 v3d_gs_set_prog_data(struct v3d_compile *c,
752                      struct v3d_gs_prog_data *prog_data)
753 {
754         prog_data->num_inputs = c->num_inputs;
755         memcpy(prog_data->input_slots, c->input_slots,
756                c->num_inputs * sizeof(*c->input_slots));
757 
758         /* gl_PrimitiveIdIn is written by the GBG into the first word of the
759          * VPM output header automatically and the shader will overwrite
760          * it after reading it if necessary, so it doesn't add to the VPM
761          * size requirements.
762          */
763         prog_data->uses_pid = BITSET_TEST(c->s->info.system_values_read,
764                                           SYSTEM_VALUE_PRIMITIVE_ID);
765 
766         /* Output segment size is in sectors (8 rows of 32 bits per channel) */
767         prog_data->vpm_output_size = align(c->vpm_output_size, 8) / 8;
768 
769         /* Compute SIMD dispatch width and update VPM output size accordingly
770          * to ensure we can fit our program in memory. Available widths are
771          * 16, 8, 4, 1.
772          *
773          * Notice that at draw time we will have to consider VPM memory
774          * requirements from other stages and choose a smaller dispatch
775          * width if needed to fit the program in VPM memory.
776          */
777         prog_data->simd_width = 16;
778         while ((prog_data->simd_width > 1 && prog_data->vpm_output_size > 16) ||
779                prog_data->simd_width == 2) {
780                 prog_data->simd_width >>= 1;
781                 prog_data->vpm_output_size =
782                         align(prog_data->vpm_output_size, 2) / 2;
783         }
784         assert(prog_data->vpm_output_size <= 16);
785         assert(prog_data->simd_width != 2);
786 
787         prog_data->out_prim_type = c->s->info.gs.output_primitive;
788         prog_data->num_invocations = c->s->info.gs.invocations;
789 
790         prog_data->writes_psiz =
791             c->s->info.outputs_written & (1 << VARYING_SLOT_PSIZ);
792 }
793 
794 static void
795 v3d_set_fs_prog_data_inputs(struct v3d_compile *c,
796                             struct v3d_fs_prog_data *prog_data)
797 {
798         prog_data->num_inputs = c->num_inputs;
799         memcpy(prog_data->input_slots, c->input_slots,
800                c->num_inputs * sizeof(*c->input_slots));
801 
802         STATIC_ASSERT(ARRAY_SIZE(prog_data->flat_shade_flags) >
803                       (V3D_MAX_FS_INPUTS - 1) / 24);
804         for (int i = 0; i < V3D_MAX_FS_INPUTS; i++) {
805                 if (BITSET_TEST(c->flat_shade_flags, i))
806                         prog_data->flat_shade_flags[i / 24] |= 1 << (i % 24);
807 
808                 if (BITSET_TEST(c->noperspective_flags, i))
809                         prog_data->noperspective_flags[i / 24] |= 1 << (i % 24);
810 
811                 if (BITSET_TEST(c->centroid_flags, i))
812                         prog_data->centroid_flags[i / 24] |= 1 << (i % 24);
813         }
814 }
815 
816 static void
817 v3d_fs_set_prog_data(struct v3d_compile *c,
818                      struct v3d_fs_prog_data *prog_data)
819 {
820         v3d_set_fs_prog_data_inputs(c, prog_data);
821         prog_data->writes_z = c->writes_z;
822         prog_data->writes_z_from_fep = c->writes_z_from_fep;
823         prog_data->disable_ez = !c->s->info.fs.early_fragment_tests;
824         prog_data->uses_center_w = c->uses_center_w;
825         prog_data->uses_implicit_point_line_varyings =
826                 c->uses_implicit_point_line_varyings;
827         prog_data->lock_scoreboard_on_first_thrsw =
828                 c->lock_scoreboard_on_first_thrsw;
829         prog_data->force_per_sample_msaa = c->force_per_sample_msaa;
830         prog_data->uses_pid = c->fs_uses_primitive_id;
831 }
832 
833 static void
834 v3d_cs_set_prog_data(struct v3d_compile *c,
835                      struct v3d_compute_prog_data *prog_data)
836 {
837         prog_data->shared_size = c->s->info.shared_size;
838 
839         prog_data->local_size[0] = c->s->info.workgroup_size[0];
840         prog_data->local_size[1] = c->s->info.workgroup_size[1];
841         prog_data->local_size[2] = c->s->info.workgroup_size[2];
842 
843         prog_data->has_subgroups = c->has_subgroups;
844 }
845 
846 static void
847 v3d_set_prog_data(struct v3d_compile *c,
848                   struct v3d_prog_data *prog_data)
849 {
850         prog_data->threads = c->threads;
851         prog_data->single_seg = !c->last_thrsw;
852         prog_data->spill_size = c->spill_size;
853         prog_data->tmu_dirty_rcl = c->tmu_dirty_rcl;
854         prog_data->has_control_barrier = c->s->info.uses_control_barrier;
855 
856         v3d_set_prog_data_uniforms(c, prog_data);
857 
858         switch (c->s->info.stage) {
859         case MESA_SHADER_VERTEX:
860                 v3d_vs_set_prog_data(c, (struct v3d_vs_prog_data *)prog_data);
861                 break;
862         case MESA_SHADER_GEOMETRY:
863                 v3d_gs_set_prog_data(c, (struct v3d_gs_prog_data *)prog_data);
864                 break;
865         case MESA_SHADER_FRAGMENT:
866                 v3d_fs_set_prog_data(c, (struct v3d_fs_prog_data *)prog_data);
867                 break;
868         case MESA_SHADER_COMPUTE:
869                 v3d_cs_set_prog_data(c, (struct v3d_compute_prog_data *)prog_data);
870                 break;
871         default:
872                 unreachable("unsupported shader stage");
873         }
874 }
875 
876 static uint64_t *
877 v3d_return_qpu_insts(struct v3d_compile *c, uint32_t *final_assembly_size)
878 {
879         *final_assembly_size = c->qpu_inst_count * sizeof(uint64_t);
880 
881         uint64_t *qpu_insts = malloc(*final_assembly_size);
882         if (!qpu_insts)
883                 return NULL;
884 
885         memcpy(qpu_insts, c->qpu_insts, *final_assembly_size);
886 
887         vir_compile_destroy(c);
888 
889         return qpu_insts;
890 }
891 
892 static void
893 v3d_nir_lower_vs_early(struct v3d_compile *c)
894 {
895         /* Split our I/O vars and dead code eliminate the unused
896          * components.
897          */
898         NIR_PASS_V(c->s, nir_lower_io_to_scalar_early,
899                    nir_var_shader_in | nir_var_shader_out);
900         uint64_t used_outputs[4] = {0};
901         for (int i = 0; i < c->vs_key->num_used_outputs; i++) {
902                 int slot = v3d_slot_get_slot(c->vs_key->used_outputs[i]);
903                 int comp = v3d_slot_get_component(c->vs_key->used_outputs[i]);
904                 used_outputs[comp] |= 1ull << slot;
905         }
906         NIR_PASS_V(c->s, nir_remove_unused_io_vars,
907                    nir_var_shader_out, used_outputs, NULL); /* demotes to globals */
908         NIR_PASS_V(c->s, nir_lower_global_vars_to_local);
909         v3d_optimize_nir(c, c->s);
910         NIR_PASS_V(c->s, nir_remove_dead_variables, nir_var_shader_in, NULL);
911 
912         /* This must go before nir_lower_io */
913         if (c->vs_key->per_vertex_point_size)
914                 NIR_PASS_V(c->s, nir_lower_point_size, 1.0f, 0.0f);
915 
916         NIR_PASS_V(c->s, nir_lower_io, nir_var_shader_in | nir_var_shader_out,
917                    type_size_vec4,
918                    (nir_lower_io_options)0);
919         /* clean up nir_lower_io's deref_var remains and do a constant folding pass
920          * on the code it generated.
921          */
922         NIR_PASS_V(c->s, nir_opt_dce);
923         NIR_PASS_V(c->s, nir_opt_constant_folding);
924 }
925 
926 static void
927 v3d_nir_lower_gs_early(struct v3d_compile *c)
928 {
929         /* Split our I/O vars and dead code eliminate the unused
930          * components.
931          */
932         NIR_PASS_V(c->s, nir_lower_io_to_scalar_early,
933                    nir_var_shader_in | nir_var_shader_out);
934         uint64_t used_outputs[4] = {0};
935         for (int i = 0; i < c->gs_key->num_used_outputs; i++) {
936                 int slot = v3d_slot_get_slot(c->gs_key->used_outputs[i]);
937                 int comp = v3d_slot_get_component(c->gs_key->used_outputs[i]);
938                 used_outputs[comp] |= 1ull << slot;
939         }
940         NIR_PASS_V(c->s, nir_remove_unused_io_vars,
941                    nir_var_shader_out, used_outputs, NULL); /* demotes to globals */
942         NIR_PASS_V(c->s, nir_lower_global_vars_to_local);
943         v3d_optimize_nir(c, c->s);
944         NIR_PASS_V(c->s, nir_remove_dead_variables, nir_var_shader_in, NULL);
945 
946         /* This must go before nir_lower_io */
947         if (c->gs_key->per_vertex_point_size)
948                 NIR_PASS_V(c->s, nir_lower_point_size, 1.0f, 0.0f);
949 
950         NIR_PASS_V(c->s, nir_lower_io, nir_var_shader_in | nir_var_shader_out,
951                    type_size_vec4,
952                    (nir_lower_io_options)0);
953         /* clean up nir_lower_io's deref_var remains and do a constant folding pass
954          * on the code it generated.
955          */
956         NIR_PASS_V(c->s, nir_opt_dce);
957         NIR_PASS_V(c->s, nir_opt_constant_folding);
958 }
959 
960 static void
961 v3d_fixup_fs_output_types(struct v3d_compile *c)
962 {
963         nir_foreach_shader_out_variable(var, c->s) {
964                 uint32_t mask = 0;
965 
966                 switch (var->data.location) {
967                 case FRAG_RESULT_COLOR:
968                         mask = ~0;
969                         break;
970                 case FRAG_RESULT_DATA0:
971                 case FRAG_RESULT_DATA1:
972                 case FRAG_RESULT_DATA2:
973                 case FRAG_RESULT_DATA3:
974                         mask = 1 << (var->data.location - FRAG_RESULT_DATA0);
975                         break;
976                 }
977 
978                 if (c->fs_key->int_color_rb & mask) {
979                         var->type =
980                                 glsl_vector_type(GLSL_TYPE_INT,
981                                                  glsl_get_components(var->type));
982                 } else if (c->fs_key->uint_color_rb & mask) {
983                         var->type =
984                                 glsl_vector_type(GLSL_TYPE_UINT,
985                                                  glsl_get_components(var->type));
986                 }
987         }
988 }
989 
990 static void
991 v3d_nir_lower_fs_early(struct v3d_compile *c)
992 {
993         if (c->fs_key->int_color_rb || c->fs_key->uint_color_rb)
994                 v3d_fixup_fs_output_types(c);
995 
996         NIR_PASS_V(c->s, v3d_nir_lower_logic_ops, c);
997 
998         if (c->fs_key->line_smoothing) {
999                 v3d_nir_lower_line_smooth(c->s);
1000                 NIR_PASS_V(c->s, nir_lower_global_vars_to_local);
1001                 /* The lowering pass can introduce new sysval reads */
1002                 nir_shader_gather_info(c->s, nir_shader_get_entrypoint(c->s));
1003         }
1004 }
1005 
1006 static void
1007 v3d_nir_lower_gs_late(struct v3d_compile *c)
1008 {
1009         if (c->key->ucp_enables) {
1010                 NIR_PASS_V(c->s, nir_lower_clip_gs, c->key->ucp_enables,
1011                            false, NULL);
1012         }
1013 
1014         /* Note: GS output scalarizing must happen after nir_lower_clip_gs. */
1015         NIR_PASS_V(c->s, nir_lower_io_to_scalar, nir_var_shader_out);
1016 }
1017 
1018 static void
1019 v3d_nir_lower_vs_late(struct v3d_compile *c)
1020 {
1021         if (c->key->ucp_enables) {
1022                 NIR_PASS_V(c->s, nir_lower_clip_vs, c->key->ucp_enables,
1023                            false, false, NULL);
1024                 NIR_PASS_V(c->s, nir_lower_io_to_scalar,
1025                            nir_var_shader_out);
1026         }
1027 
1028         /* Note: VS output scalarizing must happen after nir_lower_clip_vs. */
1029         NIR_PASS_V(c->s, nir_lower_io_to_scalar, nir_var_shader_out);
1030 }
1031 
1032 static void
1033 v3d_nir_lower_fs_late(struct v3d_compile *c)
1034 {
1035         /* In OpenGL the fragment shader can't read gl_ClipDistance[], but
1036          * Vulkan allows it, in which case the SPIR-V compiler will declare
1037          * VARYING_SLOT_CLIP_DIST0 as a compact array variable. Pass true as
1038          * the last parameter to always operate with a compact array in both
1039          * OpenGL and Vulkan so we don't have to care about the API we
1040          * are using.
1041          */
1042         if (c->key->ucp_enables)
1043                 NIR_PASS_V(c->s, nir_lower_clip_fs, c->key->ucp_enables, true);
1044 
1045         NIR_PASS_V(c->s, nir_lower_io_to_scalar, nir_var_shader_in);
1046 }
1047 
1048 static uint32_t
1049 vir_get_max_temps(struct v3d_compile *c)
1050 {
1051         int max_ip = 0;
1052         vir_for_each_inst_inorder(inst, c)
1053                 max_ip++;
1054 
1055         uint32_t *pressure = rzalloc_array(NULL, uint32_t, max_ip);
1056 
1057         for (int t = 0; t < c->num_temps; t++) {
1058                 for (int i = c->temp_start[t]; (i < c->temp_end[t] &&
1059                                                 i < max_ip); i++) {
1060                         if (i > max_ip)
1061                                 break;
1062                         pressure[i]++;
1063                 }
1064         }
1065 
1066         uint32_t max_temps = 0;
1067         for (int i = 0; i < max_ip; i++)
1068                 max_temps = MAX2(max_temps, pressure[i]);
1069 
1070         ralloc_free(pressure);
1071 
1072         return max_temps;
1073 }
1074 
1075 enum v3d_dependency_class {
1076         V3D_DEPENDENCY_CLASS_GS_VPM_OUTPUT_0
1077 };
1078 
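/* nir_schedule dependency callback: in geometry shaders, stores to the first
 * VPM output word and reads of gl_PrimitiveIDIn alias the same location, so
 * they are put in a common dependency class to keep them ordered.
 */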
1079 static bool
1080 v3d_intrinsic_dependency_cb(nir_intrinsic_instr *intr,
1081                             nir_schedule_dependency *dep,
1082                             void *user_data)
1083 {
1084         struct v3d_compile *c = user_data;
1085 
1086         switch (intr->intrinsic) {
1087         case nir_intrinsic_store_output:
1088                 /* Writing to location 0 overwrites the value passed in for
1089                  * gl_PrimitiveID on geometry shaders
1090                  */
1091                 if (c->s->info.stage != MESA_SHADER_GEOMETRY ||
1092                     nir_intrinsic_base(intr) != 0)
1093                         break;
1094 
1095                 nir_const_value *const_value =
1096                         nir_src_as_const_value(intr->src[1]);
1097 
1098                 if (const_value == NULL)
1099                         break;
1100 
1101                 uint64_t offset =
1102                         nir_const_value_as_uint(*const_value,
1103                                                 nir_src_bit_size(intr->src[1]));
1104                 if (offset != 0)
1105                         break;
1106 
1107                 dep->klass = V3D_DEPENDENCY_CLASS_GS_VPM_OUTPUT_0;
1108                 dep->type = NIR_SCHEDULE_WRITE_DEPENDENCY;
1109                 return true;
1110 
1111         case nir_intrinsic_load_primitive_id:
1112                 if (c->s->info.stage != MESA_SHADER_GEOMETRY)
1113                         break;
1114 
1115                 dep->klass = V3D_DEPENDENCY_CLASS_GS_VPM_OUTPUT_0;
1116                 dep->type = NIR_SCHEDULE_READ_DEPENDENCY;
1117                 return true;
1118 
1119         default:
1120                 break;
1121         }
1122 
1123         return false;
1124 }
1125 
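/* Rough instruction latencies for nir_schedule: general TMU accesses (SSBO,
 * scratch, shared, image and divergent UBO loads) are modeled as 30 cycles
 * (unless general TMU scheduling is disabled), texture operations as 50, and
 * everything else as 1.
 */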
1126 static unsigned
1127 v3d_instr_delay_cb(nir_instr *instr, void *data)
1128 {
1129    struct v3d_compile *c = (struct v3d_compile *) data;
1130 
1131    switch (instr->type) {
1132    case nir_instr_type_ssa_undef:
1133    case nir_instr_type_load_const:
1134    case nir_instr_type_alu:
1135    case nir_instr_type_deref:
1136    case nir_instr_type_jump:
1137    case nir_instr_type_parallel_copy:
1138    case nir_instr_type_call:
1139    case nir_instr_type_phi:
1140       return 1;
1141 
1142    case nir_instr_type_intrinsic: {
1143       if (!c->disable_general_tmu_sched) {
1144          nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
1145          switch (intr->intrinsic) {
1146          case nir_intrinsic_load_ssbo:
1147          case nir_intrinsic_load_scratch:
1148          case nir_intrinsic_load_shared:
1149          case nir_intrinsic_image_load:
1150             return 30;
1151          case nir_intrinsic_load_ubo:
1152             if (nir_src_is_divergent(intr->src[1]))
1153                return 30;
1154             FALLTHROUGH;
1155          default:
1156             return 1;
1157          }
1158       } else {
1159          return 1;
1160       }
1161       break;
1162    }
1163 
1164    case nir_instr_type_tex:
1165       return 50;
1166    }
1167 
1168    return 0;
1169 }
1170 
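/* Predicate used when lowering write masks: only the TMU general stores
 * (SSBO, shared, global, scratch) need their non-contiguous write masks
 * split.
 */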
1171 static bool
1172 should_split_wrmask(const nir_instr *instr, const void *data)
1173 {
1174         nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
1175         switch (intr->intrinsic) {
1176         case nir_intrinsic_store_ssbo:
1177         case nir_intrinsic_store_shared:
1178         case nir_intrinsic_store_global:
1179         case nir_intrinsic_store_scratch:
1180                 return true;
1181         default:
1182                 return false;
1183         }
1184 }
1185 
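/* Returns the intrinsic if the instruction is a load_ubo with a constant
 * offset (the UBO index is expected to be constant already), NULL otherwise.
 */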
1186 static nir_intrinsic_instr *
1187 nir_instr_as_constant_ubo_load(nir_instr *inst)
1188 {
1189         if (inst->type != nir_instr_type_intrinsic)
1190                 return NULL;
1191 
1192         nir_intrinsic_instr *intr = nir_instr_as_intrinsic(inst);
1193         if (intr->intrinsic != nir_intrinsic_load_ubo)
1194                 return NULL;
1195 
1196         assert(nir_src_is_const(intr->src[0]));
1197         if (!nir_src_is_const(intr->src[1]))
1198                 return NULL;
1199 
1200         return intr;
1201 }
1202 
1203 static bool
1204 v3d_nir_sort_constant_ubo_load(nir_block *block, nir_intrinsic_instr *ref)
1205 {
1206         bool progress = false;
1207 
1208         nir_instr *ref_inst = &ref->instr;
1209         uint32_t ref_offset = nir_src_as_uint(ref->src[1]);
1210         uint32_t ref_index = nir_src_as_uint(ref->src[0]);
1211 
1212         /* Go through all instructions after ref searching for constant UBO
1213          * loads for the same UBO index.
1214          */
1215         bool seq_break = false;
1216         nir_instr *inst = &ref->instr;
1217         nir_instr *next_inst = NULL;
1218         while (true) {
1219                 inst = next_inst ? next_inst : nir_instr_next(inst);
1220                 if (!inst)
1221                         break;
1222 
1223                 next_inst = NULL;
1224 
1225                 if (inst->type != nir_instr_type_intrinsic)
1226                         continue;
1227 
1228                 nir_intrinsic_instr *intr = nir_instr_as_intrinsic(inst);
1229                 if (intr->intrinsic != nir_intrinsic_load_ubo)
1230                         continue;
1231 
1232                 /* We only produce unifa sequences for non-divergent loads */
1233                 if (nir_src_is_divergent(intr->src[1]))
1234                         continue;
1235 
1236                 /* If there are any UBO loads that are not constant or that
1237                  * use a different UBO index in between the reference load and
1238                  * any other constant load for the same index, they would break
1239                  * the unifa sequence. We will flag that so we can then move
1240                  * all constant UBO loads for the reference index before these
1241                  * and not just the ones that are not ordered to avoid breaking
1242                  * the sequence and reduce unifa writes.
1243                  */
1244                 if (!nir_src_is_const(intr->src[1])) {
1245                         seq_break = true;
1246                         continue;
1247                 }
1248                 uint32_t offset = nir_src_as_uint(intr->src[1]);
1249 
1250                 assert(nir_src_is_const(intr->src[0]));
1251                 uint32_t index = nir_src_as_uint(intr->src[0]);
1252                 if (index != ref_index) {
1253                        seq_break = true;
1254                        continue;
1255                 }
1256 
1257                 /* Only move loads with an offset that is close enough to the
1258                  * reference offset, since otherwise we would not be able to
1259                  * skip the unifa write for them. See ntq_emit_load_ubo_unifa.
1260                  */
1261                 if (abs((int)(ref_offset - offset)) > MAX_UNIFA_SKIP_DISTANCE)
1262                         continue;
1263 
1264                 /* We will move this load if its offset is smaller than ref's
1265                  * (in which case we will move it before ref) or if the offset
1266                  * is larger than ref's but there are sequence breakers
1267                  * in between (in which case we will move it after ref and
1268                  * before the sequence breakers).
1269                  */
1270                 if (!seq_break && offset >= ref_offset)
1271                         continue;
1272 
1273                 /* Find where exactly we want to move this load:
1274                  *
1275                  * If we are moving it before ref, we want to check any other
1276                  * UBO loads we placed before ref and make sure we insert this
1277                  * one properly ordered with them. Likewise, if we are moving
1278                  * it after ref.
1279                  */
1280                 nir_instr *pos = ref_inst;
1281                 nir_instr *tmp = pos;
1282                 do {
1283                         if (offset < ref_offset)
1284                                 tmp = nir_instr_prev(tmp);
1285                         else
1286                                 tmp = nir_instr_next(tmp);
1287 
1288                         if (!tmp || tmp == inst)
1289                                 break;
1290 
1291                         /* Ignore non-unifa UBO loads */
1292                         if (tmp->type != nir_instr_type_intrinsic)
1293                                 continue;
1294 
1295                         nir_intrinsic_instr *tmp_intr =
1296                                 nir_instr_as_intrinsic(tmp);
1297                         if (tmp_intr->intrinsic != nir_intrinsic_load_ubo)
1298                                 continue;
1299 
1300                         if (nir_src_is_divergent(tmp_intr->src[1]))
1301                                 continue;
1302 
1303                         /* Stop if we find a unifa UBO load that breaks the
1304                          * sequence.
1305                          */
1306                         if (!nir_src_is_const(tmp_intr->src[1]))
1307                                 break;
1308 
1309                         if (nir_src_as_uint(tmp_intr->src[0]) != index)
1310                                 break;
1311 
1312                         uint32_t tmp_offset = nir_src_as_uint(tmp_intr->src[1]);
1313                         if (offset < ref_offset) {
1314                                 if (tmp_offset < offset ||
1315                                     tmp_offset >= ref_offset) {
1316                                         break;
1317                                 } else {
1318                                         pos = tmp;
1319                                 }
1320                         } else {
1321                                 if (tmp_offset > offset ||
1322                                     tmp_offset <= ref_offset) {
1323                                         break;
1324                                 } else {
1325                                         pos = tmp;
1326                                 }
1327                         }
1328                 } while (true);
1329 
1330                 /* We can't move the UBO load before the instruction that
1331                  * defines its constant offset. If that instruction is placed
1332                  * in between the new location (pos) and the current location
1333                  * of this load, we will have to move that instruction too.
1334                  *
1335                  * We don't care about the UBO index definition because that
1336                  * is optimized to be reused by all UBO loads for the same
1337                  * index and therefore is certain to be defined before the
1338                  * first UBO load that uses it.
1339                  */
1340                 nir_instr *offset_inst = NULL;
1341                 tmp = inst;
1342                 while ((tmp = nir_instr_prev(tmp)) != NULL) {
1343                         if (pos == tmp) {
1344                                 /* We reached the target location without
1345                                  * finding the instruction that defines the
1346                                  * offset, so that instruction must be before
1347                                  * the new position and we don't have to fix it.
1348                                  */
1349                                 break;
1350                         }
1351                         if (intr->src[1].ssa->parent_instr == tmp) {
1352                                 offset_inst = tmp;
1353                                 break;
1354                         }
1355                 }
1356 
1357                 if (offset_inst) {
1358                         exec_node_remove(&offset_inst->node);
1359                         exec_node_insert_node_before(&pos->node,
1360                                                      &offset_inst->node);
1361                 }
1362 
1363                 /* Since we are moving the instruction before its current
1364                  * location, grab its successor before the move so that
1365                  * we can continue the next iteration of the main loop from
1366                  * that instruction.
1367                  */
1368                 next_inst = nir_instr_next(inst);
1369 
1370                 /* Move this load to the selected location */
1371                 exec_node_remove(&inst->node);
1372                 if (offset < ref_offset)
1373                         exec_node_insert_node_before(&pos->node, &inst->node);
1374                 else
1375                         exec_node_insert_after(&pos->node, &inst->node);
1376 
1377                 progress = true;
1378         }
1379 
1380         return progress;
1381 }
1382 
1383 static bool
v3d_nir_sort_constant_ubo_loads_block(struct v3d_compile * c,nir_block * block)1384 v3d_nir_sort_constant_ubo_loads_block(struct v3d_compile *c,
1385                                       nir_block *block)
1386 {
1387         bool progress = false;
1388         bool local_progress;
1389         do {
1390                 local_progress = false;
1391                 nir_foreach_instr_safe(inst, block) {
1392                         nir_intrinsic_instr *intr =
1393                                 nir_instr_as_constant_ubo_load(inst);
1394                         if (intr) {
1395                                 local_progress |=
1396                                         v3d_nir_sort_constant_ubo_load(block, intr);
1397                         }
1398                 }
1399                 progress |= local_progress;
1400         } while (local_progress);
1401 
1402         return progress;
1403 }
1404 
1405 /**
1406  * Sorts constant UBO loads in each block by offset to maximize chances of
1407  * skipping unifa writes when converting to VIR. This can increase register
1408  * pressure.
1409  */
1410 static bool
v3d_nir_sort_constant_ubo_loads(nir_shader * s,struct v3d_compile * c)1411 v3d_nir_sort_constant_ubo_loads(nir_shader *s, struct v3d_compile *c)
1412 {
1413         nir_foreach_function(function, s) {
1414                 if (function->impl) {
1415                         nir_foreach_block(block, function->impl) {
1416                                 c->sorted_any_ubo_loads |=
1417                                         v3d_nir_sort_constant_ubo_loads_block(c, block);
1418                         }
1419                         nir_metadata_preserve(function->impl,
1420                                               nir_metadata_block_index |
1421                                               nir_metadata_dominance);
1422                 }
1423         }
1424         return c->sorted_any_ubo_loads;
1425 }
1426 
1427 static void
lower_load_num_subgroups(struct v3d_compile * c,nir_builder * b,nir_intrinsic_instr * intr)1428 lower_load_num_subgroups(struct v3d_compile *c,
1429                          nir_builder *b,
1430                          nir_intrinsic_instr *intr)
1431 {
1432         assert(c->s->info.stage == MESA_SHADER_COMPUTE);
1433         assert(intr->intrinsic == nir_intrinsic_load_num_subgroups);
1434 
1435         b->cursor = nir_after_instr(&intr->instr);
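        /* Each subgroup covers V3D_CHANNELS invocations, so the subgroup
         * count is the total workgroup size divided by V3D_CHANNELS,
         * rounded up.
         */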
1436         uint32_t num_subgroups =
1437                 DIV_ROUND_UP(c->s->info.workgroup_size[0] *
1438                              c->s->info.workgroup_size[1] *
1439                              c->s->info.workgroup_size[2], V3D_CHANNELS);
1440         nir_ssa_def *result = nir_imm_int(b, num_subgroups);
1441         nir_ssa_def_rewrite_uses(&intr->dest.ssa, result);
1442         nir_instr_remove(&intr->instr);
1443 }
1444 
1445 static bool
lower_subgroup_intrinsics(struct v3d_compile * c,nir_block * block,nir_builder * b)1446 lower_subgroup_intrinsics(struct v3d_compile *c,
1447                           nir_block *block, nir_builder *b)
1448 {
1449         bool progress = false;
1450         nir_foreach_instr_safe(inst, block) {
1451                 if (inst->type != nir_instr_type_intrinsic)
1452                         continue;
1453 
1454                 nir_intrinsic_instr *intr =
1455                         nir_instr_as_intrinsic(inst);
1456                 if (!intr)
1457                         continue;
1458 
1459                 switch (intr->intrinsic) {
1460                 case nir_intrinsic_load_num_subgroups:
1461                         lower_load_num_subgroups(c, b, intr);
1462                         progress = true;
1463                         FALLTHROUGH;
1464                 case nir_intrinsic_load_subgroup_id:
1465                 case nir_intrinsic_load_subgroup_size:
1466                 case nir_intrinsic_load_subgroup_invocation:
1467                 case nir_intrinsic_elect:
1468                         c->has_subgroups = true;
1469                         break;
1470                 default:
1471                         break;
1472                 }
1473         }
1474 
1475         return progress;
1476 }
1477 
1478 static bool
v3d_nir_lower_subgroup_intrinsics(nir_shader * s,struct v3d_compile * c)1479 v3d_nir_lower_subgroup_intrinsics(nir_shader *s, struct v3d_compile *c)
1480 {
1481         bool progress = false;
1482         nir_foreach_function(function, s) {
1483                 if (function->impl) {
1484                         nir_builder b;
1485                         nir_builder_init(&b, function->impl);
1486 
1487                         nir_foreach_block(block, function->impl)
1488                                 progress |= lower_subgroup_intrinsics(c, block, &b);
1489 
1490                         nir_metadata_preserve(function->impl,
1491                                               nir_metadata_block_index |
1492                                               nir_metadata_dominance);
1493                 }
1494         }
1495         return progress;
1496 }
1497 
1498 static void
v3d_attempt_compile(struct v3d_compile * c)1499 v3d_attempt_compile(struct v3d_compile *c)
1500 {
1501         switch (c->s->info.stage) {
1502         case MESA_SHADER_VERTEX:
1503                 c->vs_key = (struct v3d_vs_key *) c->key;
1504                 break;
1505         case MESA_SHADER_GEOMETRY:
1506                 c->gs_key = (struct v3d_gs_key *) c->key;
1507                 break;
1508         case MESA_SHADER_FRAGMENT:
1509                 c->fs_key = (struct v3d_fs_key *) c->key;
1510                 break;
1511         case MESA_SHADER_COMPUTE:
1512                 break;
1513         default:
1514                 unreachable("unsupported shader stage");
1515         }
1516 
1517         switch (c->s->info.stage) {
1518         case MESA_SHADER_VERTEX:
1519                 v3d_nir_lower_vs_early(c);
1520                 break;
1521         case MESA_SHADER_GEOMETRY:
1522                 v3d_nir_lower_gs_early(c);
1523                 break;
1524         case MESA_SHADER_FRAGMENT:
1525                 v3d_nir_lower_fs_early(c);
1526                 break;
1527         default:
1528                 break;
1529         }
1530 
1531         v3d_lower_nir(c);
1532 
1533         switch (c->s->info.stage) {
1534         case MESA_SHADER_VERTEX:
1535                 v3d_nir_lower_vs_late(c);
1536                 break;
1537         case MESA_SHADER_GEOMETRY:
1538                 v3d_nir_lower_gs_late(c);
1539                 break;
1540         case MESA_SHADER_FRAGMENT:
1541                 v3d_nir_lower_fs_late(c);
1542                 break;
1543         default:
1544                 break;
1545         }
1546 
1547         NIR_PASS_V(c->s, v3d_nir_lower_io, c);
1548         NIR_PASS_V(c->s, v3d_nir_lower_txf_ms, c);
1549         NIR_PASS_V(c->s, v3d_nir_lower_image_load_store);
1550         nir_lower_idiv_options idiv_options = {
1551                 .imprecise_32bit_lowering = true,
1552                 .allow_fp16 = true,
1553         };
1554         NIR_PASS_V(c->s, nir_lower_idiv, &idiv_options);
1555 
1556         if (c->key->robust_buffer_access) {
1557            /* v3d_nir_lower_robust_buffer_access assumes constant buffer
1558             * indices on ubo/ssbo intrinsics, so run copy propagation and
1559             * constant folding passes before we run the lowering to guarantee
1560             * this. We also want to run the lowering before v3d_optimize to
1561             * clean up redundant get_buffer_size calls produced in the pass.
1562             */
1563            NIR_PASS_V(c->s, nir_copy_prop);
1564            NIR_PASS_V(c->s, nir_opt_constant_folding);
1565            NIR_PASS_V(c->s, v3d_nir_lower_robust_buffer_access, c);
1566         }
1567 
1568         NIR_PASS_V(c->s, nir_lower_wrmasks, should_split_wrmask, c->s);
1569 
1570         NIR_PASS_V(c->s, v3d_nir_lower_load_store_bitsize, c);
1571 
1572         NIR_PASS_V(c->s, v3d_nir_lower_subgroup_intrinsics, c);
1573 
1574         v3d_optimize_nir(c, c->s);
1575 
1576         /* Do late algebraic optimization to turn add(a, neg(b)) back into
1577          * subs, then the mandatory cleanup after algebraic.  Note that it may
1578          * produce fnegs, and if so then we need to keep running to squash
1579          * fneg(fneg(a)).
1580          */
1581         bool more_late_algebraic = true;
1582         while (more_late_algebraic) {
1583                 more_late_algebraic = false;
1584                 NIR_PASS(more_late_algebraic, c->s, nir_opt_algebraic_late);
1585                 NIR_PASS_V(c->s, nir_opt_constant_folding);
1586                 NIR_PASS_V(c->s, nir_copy_prop);
1587                 NIR_PASS_V(c->s, nir_opt_dce);
1588                 NIR_PASS_V(c->s, nir_opt_cse);
1589         }
1590 
1591         NIR_PASS_V(c->s, nir_lower_bool_to_int32);
1592         nir_convert_to_lcssa(c->s, true, true);
1593         NIR_PASS_V(c->s, nir_divergence_analysis);
1594         NIR_PASS_V(c->s, nir_convert_from_ssa, true);
1595 
1596         struct nir_schedule_options schedule_options = {
1597                 /* Schedule for about half our register space, to enable more
1598                  * shaders to hit 4 threads.
1599                  */
1600                 .threshold = c->threads == 4 ? 24 : 48,
1601 
1602                 /* Vertex shaders share the same memory for inputs and outputs,
1603                  * fragment and geometry shaders do not.
1604                  */
1605                 .stages_with_shared_io_memory =
1606                 (((1 << MESA_ALL_SHADER_STAGES) - 1) &
1607                  ~((1 << MESA_SHADER_FRAGMENT) |
1608                    (1 << MESA_SHADER_GEOMETRY))),
1609 
1610                 .fallback = c->fallback_scheduler,
1611 
1612                 .intrinsic_cb = v3d_intrinsic_dependency_cb,
1613                 .intrinsic_cb_data = c,
1614 
1615                 .instr_delay_cb = v3d_instr_delay_cb,
1616                 .instr_delay_cb_data = c,
1617         };
1618         NIR_PASS_V(c->s, nir_schedule, &schedule_options);
1619 
1620         if (!c->disable_constant_ubo_load_sorting)
1621                 NIR_PASS_V(c->s, v3d_nir_sort_constant_ubo_loads, c);
1622 
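        /* Sink uniform loads and constant/undef values down to just before
         * their first use to shorten their live ranges.
         */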
1623         NIR_PASS_V(c->s, nir_opt_move, nir_move_load_uniform |
1624                                        nir_move_const_undef);
1625 
1626         v3d_nir_to_vir(c);
1627 }
1628 
1629 uint32_t
v3d_prog_data_size(gl_shader_stage stage)1630 v3d_prog_data_size(gl_shader_stage stage)
1631 {
1632         static const int prog_data_size[] = {
1633                 [MESA_SHADER_VERTEX] = sizeof(struct v3d_vs_prog_data),
1634                 [MESA_SHADER_GEOMETRY] = sizeof(struct v3d_gs_prog_data),
1635                 [MESA_SHADER_FRAGMENT] = sizeof(struct v3d_fs_prog_data),
1636                 [MESA_SHADER_COMPUTE] = sizeof(struct v3d_compute_prog_data),
1637         };
1638 
1639         assert(stage >= 0 &&
1640                stage < ARRAY_SIZE(prog_data_size) &&
1641                prog_data_size[stage]);
1642 
1643         return prog_data_size[stage];
1644 }
1645 
v3d_shaderdb_dump(struct v3d_compile * c,char ** shaderdb_str)1646 int v3d_shaderdb_dump(struct v3d_compile *c,
1647 		      char **shaderdb_str)
1648 {
1649         if (c == NULL || c->compilation_result != V3D_COMPILATION_SUCCEEDED)
1650                 return -1;
1651 
1652         return asprintf(shaderdb_str,
1653                         "%s shader: %d inst, %d threads, %d loops, "
1654                         "%d uniforms, %d max-temps, %d:%d spills:fills, "
1655                         "%d sfu-stalls, %d inst-and-stalls, %d nops",
1656                         vir_get_stage_name(c),
1657                         c->qpu_inst_count,
1658                         c->threads,
1659                         c->loops,
1660                         c->num_uniforms,
1661                         vir_get_max_temps(c),
1662                         c->spills,
1663                         c->fills,
1664                         c->qpu_inst_stalled_count,
1665                         c->qpu_inst_count + c->qpu_inst_stalled_count,
1666                         c->nop_count);
1667 }
1668 
1669 /* This is a list of incremental changes to the compilation strategy
1670  * that will be used to try to compile the shader successfully. The
1671  * default strategy enables all optimizations; this has the
1672  * highest register pressure but is expected to produce the most
1673  * optimal code. Subsequent strategies incrementally disable specific
1674  * optimizations that are known to contribute to register pressure
1675  * in order to be able to compile the shader successfully while meeting
1676  * thread count requirements.
1677  *
1678  * V3D 4.1+ has a min thread count of 2, but we can use 1 here to also
1679  * cover previous hardware (meaning that we are not limiting
1680  * register allocation to any particular thread count). This is fine
1681  * because v3d_nir_to_vir will cap this to the actual minimum.
1682  */
1683 struct v3d_compiler_strategy {
1684         const char *name;
1685         uint32_t max_threads;
1686         uint32_t min_threads;
1687         bool disable_general_tmu_sched;
1688         bool disable_loop_unrolling;
1689         bool disable_ubo_load_sorting;
1690         bool disable_tmu_pipelining;
1691         uint32_t max_tmu_spills;
1692 } static const strategies[] = {
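  /* Columns match the struct fields above: name, max_threads, min_threads,
   * disable_general_tmu_sched, disable_loop_unrolling,
   * disable_ubo_load_sorting, disable_tmu_pipelining and max_tmu_spills
   * (-1 wraps to UINT32_MAX, effectively no cap on TMU spills).
   */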
1693   /*0*/  { "default",                        4, 4, false, false, false, false,  0 },
1694   /*1*/  { "disable general TMU sched",      4, 4, true,  false, false, false,  0 },
1695   /*2*/  { "disable loop unrolling",         4, 4, true,  true,  false, false,  0 },
1696   /*3*/  { "disable UBO load sorting",       4, 4, true,  true,  true,  false,  0 },
1697   /*4*/  { "disable TMU pipelining",         4, 4, true,  true,  true,  true,   0 },
1698   /*5*/  { "lower thread count",             2, 1, false, false, false, false, -1 },
1699   /*6*/  { "disable general TMU sched (2t)", 2, 1, true,  false, false, false, -1 },
1700   /*7*/  { "disable loop unrolling (2t)",    2, 1, true,  true,  false, false, -1 },
1701   /*8*/  { "disable UBO load sorting (2t)",  2, 1, true,  true,  true,  false, -1 },
1702   /*9*/  { "disable TMU pipelining (2t)",    2, 1, true,  true,  true,  true,  -1 },
1703   /*10*/ { "fallback scheduler",             2, 1, true,  true,  true,  true,  -1 }
1704 };
1705 
1706 /**
1707  * If a particular optimization didn't make any progress during a compile
1708  * attempt, disabling it alone won't allow us to compile the shader
1709  * successfully, since we'll end up with the same code. Detect these
1710  * scenarios so we can avoid wasting time with useless compiles. We should
1711  * also consider whether the strategy changes other aspects of the
1712  * compilation process, like spilling, and not skip it in that case.
1713  */
1714 static bool
skip_compile_strategy(struct v3d_compile * c,uint32_t idx)1715 skip_compile_strategy(struct v3d_compile *c, uint32_t idx)
1716 {
1717    /* We decide if we can skip a strategy based on the optimizations that
1718     * were active in the previous strategy, so we should only be calling this
1719     * for strategies after the first.
1720     */
1721    assert(idx > 0);
1722 
1723    /* Don't skip a strategy that changes spilling behavior */
1724    if (strategies[idx].max_tmu_spills !=
1725        strategies[idx - 1].max_tmu_spills) {
1726            return false;
1727    }
1728 
1729    switch (idx) {
1730    /* General TMU sched.: skip if we didn't emit any TMU loads */
1731    case 1:
1732    case 6:
1733            return !c->has_general_tmu_load;
1734    /* Loop unrolling: skip if we didn't unroll any loops */
1735    case 2:
1736    case 7:
1737            return !c->unrolled_any_loops;
1738    /* UBO load sorting: skip if we didn't sort any loads */
1739    case 3:
1740    case 8:
1741            return !c->sorted_any_ubo_loads;
1742    /* TMU pipelining: skip if we didn't pipeline any TMU ops */
1743    case 4:
1744    case 9:
1745            return !c->pipelined_any_tmu;
1746    /* Lower thread count: skip if we already tried fewer than 4 threads */
1747    case 5:
1748            return c->threads < 4;
1749    default:
1750            return false;
1751    };
1752 }
v3d_compile(const struct v3d_compiler * compiler,struct v3d_key * key,struct v3d_prog_data ** out_prog_data,nir_shader * s,void (* debug_output)(const char * msg,void * debug_output_data),void * debug_output_data,int program_id,int variant_id,uint32_t * final_assembly_size)1753 uint64_t *v3d_compile(const struct v3d_compiler *compiler,
1754                       struct v3d_key *key,
1755                       struct v3d_prog_data **out_prog_data,
1756                       nir_shader *s,
1757                       void (*debug_output)(const char *msg,
1758                                            void *debug_output_data),
1759                       void *debug_output_data,
1760                       int program_id, int variant_id,
1761                       uint32_t *final_assembly_size)
1762 {
1763         struct v3d_compile *c = NULL;
1764 
1765         uint32_t best_spill_fill_count = UINT32_MAX;
1766         struct v3d_compile *best_c = NULL;
1767         for (int32_t strat = 0; strat < ARRAY_SIZE(strategies); strat++) {
1768                 /* Fallback strategy */
1769                 if (strat > 0) {
1770                         assert(c);
1771                         if (skip_compile_strategy(c, strat))
1772                                 continue;
1773 
1774                         char *debug_msg;
1775                         int ret = asprintf(&debug_msg,
1776                                            "Falling back to strategy '%s' "
1777                                            "for %s prog %d/%d",
1778                                            strategies[strat].name,
1779                                            vir_get_stage_name(c),
1780                                            c->program_id, c->variant_id);
1781 
1782                         if (ret >= 0) {
1783                                 if (unlikely(V3D_DEBUG & V3D_DEBUG_PERF))
1784                                         fprintf(stderr, "%s\n", debug_msg);
1785 
1786                                 c->debug_output(debug_msg, c->debug_output_data);
1787                                 free(debug_msg);
1788                         }
1789 
1790                         if (c != best_c)
1791                                 vir_compile_destroy(c);
1792                 }
1793 
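                /* The last entry in the strategies table ("fallback
                 * scheduler") is the only one that compiles with the
                 * fallback NIR scheduler, hence the
                 * strat == ARRAY_SIZE(strategies) - 1 argument below.
                 */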
1794                 c = vir_compile_init(compiler, key, s,
1795                                      debug_output, debug_output_data,
1796                                      program_id, variant_id,
1797                                      strategies[strat].max_threads,
1798                                      strategies[strat].min_threads,
1799                                      strategies[strat].max_tmu_spills,
1800                                      strategies[strat].disable_general_tmu_sched,
1801                                      strategies[strat].disable_loop_unrolling,
1802                                      strategies[strat].disable_ubo_load_sorting,
1803                                      strategies[strat].disable_tmu_pipelining,
1804                                      strat == ARRAY_SIZE(strategies) - 1);
1805 
1806                 v3d_attempt_compile(c);
1807 
1808                 /* Broken shader or driver bug */
1809                 if (c->compilation_result == V3D_COMPILATION_FAILED)
1810                         break;
1811 
1812                 /* If we compiled without spills, choose this.
1813                  * Otherwise if this is a 4-thread compile, choose this (these
1814                  * have a very low cap on the allowed TMU spills so we assume
1815                  * it will be better than a 2-thread compile without spills).
1816                  * Otherwise, keep going while tracking the strategy with the
1817                  * lowest spill count.
1818                  */
1819                 if (c->compilation_result == V3D_COMPILATION_SUCCEEDED) {
1820                         if (c->spills == 0 ||
1821                             strategies[strat].min_threads == 4) {
1822                                 best_c = c;
1823                                 break;
1824                         } else if (c->spills + c->fills <
1825                                    best_spill_fill_count) {
1826                                 best_c = c;
1827                                 best_spill_fill_count = c->spills + c->fills;
1828                         }
1829 
1830                         if (unlikely(V3D_DEBUG & V3D_DEBUG_PERF)) {
1831                                 char *debug_msg;
1832                                 int ret = asprintf(&debug_msg,
1833                                                    "Compiled %s prog %d/%d with %d "
1834                                                    "spills and %d fills. Will try "
1835                                                    "more strategies.",
1836                                                    vir_get_stage_name(c),
1837                                                    c->program_id, c->variant_id,
1838                                                    c->spills, c->fills);
1839                                 if (ret >= 0) {
1840                                         fprintf(stderr, "%s\n", debug_msg);
1841                                         c->debug_output(debug_msg, c->debug_output_data);
1842                                         free(debug_msg);
1843                                 }
1844                         }
1845                 }
1846 
1847                 /* Only try the next strategy if we failed to register allocate
1848                  * or we had to spill.
1849                  */
1850                 assert(c->compilation_result ==
1851                        V3D_COMPILATION_FAILED_REGISTER_ALLOCATION ||
1852                        c->spills > 0);
1853         }
1854 
1855         /* If the best strategy was not the last, choose that */
1856         if (best_c && c != best_c) {
1857                 vir_compile_destroy(c);
1858                 c = best_c;
1859         }
1860 
1861         if (unlikely(V3D_DEBUG & V3D_DEBUG_PERF) &&
1862             c->compilation_result !=
1863             V3D_COMPILATION_FAILED_REGISTER_ALLOCATION &&
1864             c->spills > 0) {
1865                 char *debug_msg;
1866                 int ret = asprintf(&debug_msg,
1867                                    "Compiled %s prog %d/%d with %d "
1868                                    "spills and %d fills",
1869                                    vir_get_stage_name(c),
1870                                    c->program_id, c->variant_id,
1871                                    c->spills, c->fills);
1872                 /* Only use debug_msg if asprintf succeeded */
1873                 if (ret >= 0) {
1874                         fprintf(stderr, "%s\n", debug_msg);
1875                         c->debug_output(debug_msg, c->debug_output_data);
1876                         free(debug_msg);
1877                 }
1878         }
1879 
1880         if (c->compilation_result != V3D_COMPILATION_SUCCEEDED) {
1881                 fprintf(stderr, "Failed to compile %s prog %d/%d "
1882                         "with any strategy.\n",
1883                         vir_get_stage_name(c), c->program_id, c->variant_id);
1884         }
1885 
1886         struct v3d_prog_data *prog_data;
1887 
1888         prog_data = rzalloc_size(NULL, v3d_prog_data_size(c->s->info.stage));
1889 
1890         v3d_set_prog_data(c, prog_data);
1891 
1892         *out_prog_data = prog_data;
1893 
1894         char *shaderdb;
1895         int ret = v3d_shaderdb_dump(c, &shaderdb);
1896         if (ret >= 0) {
1897                 if (V3D_DEBUG & V3D_DEBUG_SHADERDB)
1898                         fprintf(stderr, "SHADER-DB-%s - %s\n", s->info.name, shaderdb);
1899 
1900                 c->debug_output(shaderdb, c->debug_output_data);
1901                 free(shaderdb);
1902         }
1903 
1904         return v3d_return_qpu_insts(c, final_assembly_size);
1905 }
1906 
1907 void
vir_remove_instruction(struct v3d_compile * c,struct qinst * qinst)1908 vir_remove_instruction(struct v3d_compile *c, struct qinst *qinst)
1909 {
1910         if (qinst->dst.file == QFILE_TEMP)
1911                 c->defs[qinst->dst.index] = NULL;
1912 
1913         assert(&qinst->link != c->cursor.link);
1914 
1915         list_del(&qinst->link);
1916         free(qinst);
1917 
1918         c->live_intervals_valid = false;
1919 }
1920 
1921 struct qreg
vir_follow_movs(struct v3d_compile * c,struct qreg reg)1922 vir_follow_movs(struct v3d_compile *c, struct qreg reg)
1923 {
1924         /* XXX
1925         int pack = reg.pack;
1926 
1927         while (reg.file == QFILE_TEMP &&
1928                c->defs[reg.index] &&
1929                (c->defs[reg.index]->op == QOP_MOV ||
1930                 c->defs[reg.index]->op == QOP_FMOV) &&
1931                !c->defs[reg.index]->dst.pack &&
1932                !c->defs[reg.index]->src[0].pack) {
1933                 reg = c->defs[reg.index]->src[0];
1934         }
1935 
1936         reg.pack = pack;
1937         */
1938         return reg;
1939 }
1940 
1941 void
vir_compile_destroy(struct v3d_compile * c)1942 vir_compile_destroy(struct v3d_compile *c)
1943 {
1944         /* Defuse the assert that we aren't removing the cursor's instruction.
1945          */
1946         c->cursor.link = NULL;
1947 
1948         vir_for_each_block(block, c) {
1949                 while (!list_is_empty(&block->instructions)) {
1950                         struct qinst *qinst =
1951                                 list_first_entry(&block->instructions,
1952                                                  struct qinst, link);
1953                         vir_remove_instruction(c, qinst);
1954                 }
1955         }
1956 
1957         ralloc_free(c);
1958 }
1959 
1960 uint32_t
vir_get_uniform_index(struct v3d_compile * c,enum quniform_contents contents,uint32_t data)1961 vir_get_uniform_index(struct v3d_compile *c,
1962                       enum quniform_contents contents,
1963                       uint32_t data)
1964 {
1965         for (int i = 0; i < c->num_uniforms; i++) {
1966                 if (c->uniform_contents[i] == contents &&
1967                     c->uniform_data[i] == data) {
1968                         return i;
1969                 }
1970         }
1971 
1972         uint32_t uniform = c->num_uniforms++;
1973 
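        /* Grow the uniform arrays to at least 16 entries, doubling the
         * current size when that is larger.
         */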
1974         if (uniform >= c->uniform_array_size) {
1975                 c->uniform_array_size = MAX2(MAX2(16, uniform + 1),
1976                                              c->uniform_array_size * 2);
1977 
1978                 c->uniform_data = reralloc(c, c->uniform_data,
1979                                            uint32_t,
1980                                            c->uniform_array_size);
1981                 c->uniform_contents = reralloc(c, c->uniform_contents,
1982                                                enum quniform_contents,
1983                                                c->uniform_array_size);
1984         }
1985 
1986         c->uniform_contents[uniform] = contents;
1987         c->uniform_data[uniform] = data;
1988 
1989         return uniform;
1990 }
1991 
1992 /* Looks back into the current block to find the ldunif that wrote the uniform
1993  * at the requested index. If it finds it, it returns true and writes the
1994  * destination register of the ldunif instruction to 'unif'.
1995  *
1996  * This can impact register pressure and end up leading to worse code, so we
1997  * limit the number of instructions we are willing to look back through to
1998  * strike a good balance.
1999  */
2000 static bool
try_opt_ldunif(struct v3d_compile * c,uint32_t index,struct qreg * unif)2001 try_opt_ldunif(struct v3d_compile *c, uint32_t index, struct qreg *unif)
2002 {
2003         uint32_t count = 20;
2004         struct qinst *prev_inst = NULL;
2005         assert(c->cur_block);
2006 
2007 #ifdef DEBUG
2008         /* We can only reuse a uniform if it was emitted in the same block,
2009          * so callers must make sure the current instruction is being emitted
2010          * in the current block.
2011          */
2012         bool found = false;
2013         vir_for_each_inst(inst, c->cur_block) {
2014                 if (&inst->link == c->cursor.link) {
2015                         found = true;
2016                         break;
2017                 }
2018         }
2019 
2020         assert(found || &c->cur_block->instructions == c->cursor.link);
2021 #endif
2022 
2023         list_for_each_entry_from_rev(struct qinst, inst, c->cursor.link->prev,
2024                                      &c->cur_block->instructions, link) {
2025                 if ((inst->qpu.sig.ldunif || inst->qpu.sig.ldunifrf) &&
2026                     inst->uniform == index) {
2027                         prev_inst = inst;
2028                         break;
2029                 }
2030 
2031                 if (--count == 0)
2032                         break;
2033         }
2034 
2035         if (!prev_inst)
2036                 return false;
2037 
2038 
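        /* Only reuse the previous ldunif result if no later instruction in
         * the block overwrites its destination register.
         */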
2039         list_for_each_entry_from(struct qinst, inst, prev_inst->link.next,
2040                                  &c->cur_block->instructions, link) {
2041                 if (inst->dst.file == prev_inst->dst.file &&
2042                     inst->dst.index == prev_inst->dst.index) {
2043                         return false;
2044                 }
2045         }
2046 
2047         *unif = prev_inst->dst;
2048         return true;
2049 }
2050 
2051 struct qreg
vir_uniform(struct v3d_compile * c,enum quniform_contents contents,uint32_t data)2052 vir_uniform(struct v3d_compile *c,
2053             enum quniform_contents contents,
2054             uint32_t data)
2055 {
2056         const int num_uniforms = c->num_uniforms;
2057         const int index = vir_get_uniform_index(c, contents, data);
2058 
2059         /* If this is not the first time we see this uniform try to reuse the
2060          * result of the last ldunif that loaded it.
2061          */
2062         const bool is_new_uniform = num_uniforms != c->num_uniforms;
2063         if (!is_new_uniform && !c->disable_ldunif_opt) {
2064                 struct qreg ldunif_dst;
2065                 if (try_opt_ldunif(c, index, &ldunif_dst))
2066                         return ldunif_dst;
2067         }
2068 
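        /* Otherwise emit a fresh ldunif: a NOP with the ldunif signal set,
         * writing the uniform stream value into a new temporary.
         */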
2069         struct qinst *inst = vir_NOP(c);
2070         inst->qpu.sig.ldunif = true;
2071         inst->uniform = index;
2072         inst->dst = vir_get_temp(c);
2073         c->defs[inst->dst.index] = inst;
2074         return inst->dst;
2075 }
2076 
2077 #define OPTPASS(func)                                                   \
2078         do {                                                            \
2079                 bool stage_progress = func(c);                          \
2080                 if (stage_progress) {                                   \
2081                         progress = true;                                \
2082                         if (print_opt_debug) {                          \
2083                                 fprintf(stderr,                         \
2084                                         "VIR opt pass %2d: %s progress\n", \
2085                                         pass, #func);                   \
2086                         }                                               \
2087                         /*XXX vir_validate(c);*/                        \
2088                 }                                                       \
2089         } while (0)
2090 
2091 void
vir_optimize(struct v3d_compile * c)2092 vir_optimize(struct v3d_compile *c)
2093 {
2094         bool print_opt_debug = false;
2095         int pass = 1;
2096 
2097         while (true) {
2098                 bool progress = false;
2099 
2100                 OPTPASS(vir_opt_copy_propagate);
2101                 OPTPASS(vir_opt_redundant_flags);
2102                 OPTPASS(vir_opt_dead_code);
2103                 OPTPASS(vir_opt_small_immediates);
2104                 OPTPASS(vir_opt_constant_alu);
2105 
2106                 if (!progress)
2107                         break;
2108 
2109                 pass++;
2110         }
2111 }
2112 
2113 const char *
vir_get_stage_name(struct v3d_compile * c)2114 vir_get_stage_name(struct v3d_compile *c)
2115 {
2116         if (c->vs_key && c->vs_key->is_coord)
2117                 return "MESA_SHADER_VERTEX_BIN";
2118         else if (c->gs_key && c->gs_key->is_coord)
2119                 return "MESA_SHADER_GEOMETRY_BIN";
2120         else
2121                 return gl_shader_stage_name(c->s->info.stage);
2122 }
2123 
2124 static inline uint32_t
compute_vpm_size_in_sectors(const struct v3d_device_info * devinfo)2125 compute_vpm_size_in_sectors(const struct v3d_device_info *devinfo)
2126 {
2127    assert(devinfo->vpm_size > 0);
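   /* A VPM sector holds 8 * V3D_CHANNELS 32-bit words. */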
2128    const uint32_t sector_size = V3D_CHANNELS * sizeof(uint32_t) * 8;
2129    return devinfo->vpm_size / sector_size;
2130 }
2131 
2132 /* Computes various parameters affecting VPM memory configuration for programs
2133  * involving geometry shaders to ensure the program fits in memory and honors
2134  * requirements described in section "VPM usage" of the programming manual.
2135  */
2136 static bool
compute_vpm_config_gs(struct v3d_device_info * devinfo,struct v3d_vs_prog_data * vs,struct v3d_gs_prog_data * gs,struct vpm_config * vpm_cfg_out)2137 compute_vpm_config_gs(struct v3d_device_info *devinfo,
2138                       struct v3d_vs_prog_data *vs,
2139                       struct v3d_gs_prog_data *gs,
2140                       struct vpm_config *vpm_cfg_out)
2141 {
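   /* These follow the naming in the "VPM usage" section of the programming
    * manual: A is whether the VS uses a separate input segment, and Ad/Vd
    * are the VS VPM input/output segment sizes.
    */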
2142    const uint32_t A = vs->separate_segments ? 1 : 0;
2143    const uint32_t Ad = vs->vpm_input_size;
2144    const uint32_t Vd = vs->vpm_output_size;
2145 
2146    const uint32_t vpm_size = compute_vpm_size_in_sectors(devinfo);
2147 
2148    /* Try to fit program into our VPM memory budget by adjusting
2149     * configurable parameters iteratively. We do this in two phases:
2150     * the first phase tries to fit the program into the total available
2151     * VPM memory. If we succeed at that, then the second phase attempts
2152     * to fit the program into half of that budget so we can run bin and
2153     * render programs in parallel.
2154     */
2155    struct vpm_config vpm_cfg[2];
2156    struct vpm_config *final_vpm_cfg = NULL;
2157    uint32_t phase = 0;
2158 
2159    vpm_cfg[phase].As = 1;
2160    vpm_cfg[phase].Gs = 1;
2161    vpm_cfg[phase].Gd = gs->vpm_output_size;
2162    vpm_cfg[phase].gs_width = gs->simd_width;
2163 
2164    /* While there is a requirement that Vc >= [Vn / 16], this is
2165     * always the case when tessellation is not present because in that
2166     * case Vn can only be 6 at most (when input primitive is triangles
2167     * with adjacency).
2168     *
2169     * We always choose Vc=2. We can't go lower than this due to GFXH-1744,
2170     * and Broadcom has not found it worth it to increase it beyond this
2171     * in general. Increasing Vc also increases VPM memory pressure which
2172     * can turn up being detrimental for performance in some scenarios.
2173     */
2174    vpm_cfg[phase].Vc = 2;
2175 
2176    /* Gv is a constraint on the hardware to not exceed the
2177     * specified number of vertex segments per GS batch. If adding a
2178     * new primitive to a GS batch would result in a range of more
2179     * than Gv vertex segments being referenced by the batch, then
2180     * the hardware will flush the batch and start a new one. This
2181     * means that we can choose any value we want, we just need to
2182     * be aware that larger values improve GS batch utilization
2183     * at the expense of more VPM memory pressure (which can affect
2184     * other performance aspects, such as GS dispatch width).
2185     * We start with the largest value, and will reduce it if we
2186     * find that total memory pressure is too high.
2187     */
2188    vpm_cfg[phase].Gv = 3;
2189    do {
2190       /* When GS is present in the absence of TES, we need to satisfy
2191        * that Ve >= Gv. We go with the smallest value of Ve to avoid
2192        * increasing memory pressure.
2193        */
2194       vpm_cfg[phase].Ve = vpm_cfg[phase].Gv;
2195 
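      /* Total estimated VPM usage in sectors: the VS input term
       * (A * As * Ad), the VS output term ((Vc + Ve) * Vd) and the GS
       * output term (Gs * Gd).
       */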
2196       uint32_t vpm_sectors =
2197          A * vpm_cfg[phase].As * Ad +
2198          (vpm_cfg[phase].Vc + vpm_cfg[phase].Ve) * Vd +
2199          vpm_cfg[phase].Gs * vpm_cfg[phase].Gd;
2200 
2201       /* Ideally we want to use no more than half of the available
2202        * memory so we can execute a bin and render program in parallel
2203        * without stalls. If we achieved that then we are done.
2204        */
2205       if (vpm_sectors <= vpm_size / 2) {
2206          final_vpm_cfg = &vpm_cfg[phase];
2207          break;
2208       }
2209 
2210       /* At the very least, we should not allocate more than the
2211        * total available VPM memory. If we have a configuration that
2212        * succeeds at this we save it and continue to see if we can
2213        * meet the half-memory-use criteria too.
2214        */
2215       if (phase == 0 && vpm_sectors <= vpm_size) {
2216          vpm_cfg[1] = vpm_cfg[0];
2217          phase = 1;
2218       }
2219 
2220       /* Try lowering Gv */
2221       if (vpm_cfg[phase].Gv > 0) {
2222          vpm_cfg[phase].Gv--;
2223          continue;
2224       }
2225 
2226       /* Try lowering GS dispatch width */
2227       if (vpm_cfg[phase].gs_width > 1) {
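         /* A GS dispatch width of 2 is not a valid configuration (see the
          * asserts at the end of this function), so halving from 4 skips
          * straight to 1.
          */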
2228          do {
2229             vpm_cfg[phase].gs_width >>= 1;
2230             vpm_cfg[phase].Gd = align(vpm_cfg[phase].Gd, 2) / 2;
2231          } while (vpm_cfg[phase].gs_width == 2);
2232 
2233          /* Reset Gv to max after dropping dispatch width */
2234          vpm_cfg[phase].Gv = 3;
2235          continue;
2236       }
2237 
2238       /* We ran out of options to reduce memory pressure. If we
2239        * are at phase 1 we have at least a valid configuration, so
2240        * we use that.
2241        */
2242       if (phase == 1)
2243          final_vpm_cfg = &vpm_cfg[0];
2244       break;
2245    } while (true);
2246 
2247    if (!final_vpm_cfg)
2248       return false;
2249 
2250    assert(final_vpm_cfg);
2251    assert(final_vpm_cfg->Gd <= 16);
2252    assert(final_vpm_cfg->Gv < 4);
2253    assert(final_vpm_cfg->Ve < 4);
2254    assert(final_vpm_cfg->Vc >= 2 && final_vpm_cfg->Vc <= 4);
2255    assert(final_vpm_cfg->gs_width == 1 ||
2256           final_vpm_cfg->gs_width == 4 ||
2257           final_vpm_cfg->gs_width == 8 ||
2258           final_vpm_cfg->gs_width == 16);
2259 
2260    *vpm_cfg_out = *final_vpm_cfg;
2261    return true;
2262 }
2263 
2264 bool
v3d_compute_vpm_config(struct v3d_device_info * devinfo,struct v3d_vs_prog_data * vs_bin,struct v3d_vs_prog_data * vs,struct v3d_gs_prog_data * gs_bin,struct v3d_gs_prog_data * gs,struct vpm_config * vpm_cfg_bin,struct vpm_config * vpm_cfg)2265 v3d_compute_vpm_config(struct v3d_device_info *devinfo,
2266                        struct v3d_vs_prog_data *vs_bin,
2267                        struct v3d_vs_prog_data *vs,
2268                        struct v3d_gs_prog_data *gs_bin,
2269                        struct v3d_gs_prog_data *gs,
2270                        struct vpm_config *vpm_cfg_bin,
2271                        struct vpm_config *vpm_cfg)
2272 {
2273    assert(vs && vs_bin);
2274    assert((gs != NULL) == (gs_bin != NULL));
2275 
2276    if (!gs) {
2277       vpm_cfg_bin->As = 1;
2278       vpm_cfg_bin->Ve = 0;
2279       vpm_cfg_bin->Vc = vs_bin->vcm_cache_size;
2280 
2281       vpm_cfg->As = 1;
2282       vpm_cfg->Ve = 0;
2283       vpm_cfg->Vc = vs->vcm_cache_size;
2284    } else {
2285       if (!compute_vpm_config_gs(devinfo, vs_bin, gs_bin, vpm_cfg_bin))
2286          return false;
2287 
2288       if (!compute_vpm_config_gs(devinfo, vs, gs, vpm_cfg))
2289          return false;
2290    }
2291 
2292    return true;
2293 }
2294