1 /*
2 * Copyright (c) 2013 Rob Clark <robdclark@gmail.com>
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 * SOFTWARE.
22 */
23
24 #ifndef IR3_H_
25 #define IR3_H_
26
27 #include <stdbool.h>
28 #include <stdint.h>
29
30 #include "compiler/shader_enums.h"
31
32 #include "util/bitscan.h"
33 #include "util/list.h"
34 #include "util/set.h"
35 #include "util/u_debug.h"
36
37 #include "instr-a3xx.h"
38
39 /* low level intermediate representation of an adreno shader program */
40
41 struct ir3_compiler;
42 struct ir3;
43 struct ir3_instruction;
44 struct ir3_block;
45
/* Summary information / statistics describing an assembled shader
 * binary, filled in when the shader is assembled.
 */
struct ir3_info {
   void *data; /* used internally in ir3 assembler */
   /* Size in bytes of the shader binary, including NIR constants and
    * padding
    */
   uint32_t size;
   /* byte offset from start of the shader to the NIR constant data. */
   uint32_t constant_data_offset;
   /* Size in dwords of the instructions. */
   uint16_t sizedwords;
   uint16_t instrs_count; /* expanded to account for rpt's */
   uint16_t nops_count;   /* # of nop instructions, including nopN */
   uint16_t mov_count;
   uint16_t cov_count;
   uint16_t stp_count;
   uint16_t ldp_count;
   /* NOTE: max_reg, etc, does not include registers not touched
    * by the shader (ie. vertex fetched via VFD_DECODE but not
    * touched by shader)
    */
   int8_t max_reg; /* highest GPR # used by shader */
   int8_t max_half_reg;
   int16_t max_const;
   /* This is the maximum # of waves that can executed at once in one core,
    * assuming that they are all executing this shader.
    */
   int8_t max_waves;
   bool double_threadsize;
   bool multi_dword_ldp_stp;

   /* number of sync bits: */
   uint16_t ss, sy;

   /* estimate of number of cycles stalled on (ss) */
   uint16_t sstall;
   /* estimate of number of cycles stalled on (sy) */
   uint16_t systall;

   uint16_t last_baryf; /* instruction # of last varying fetch */

   /* Number of instructions of a given category: */
   uint16_t instrs_per_cat[8];
};
89
/* A set of registers that should be assigned to one contiguous span of
 * physical registers (NOTE(review): inferred from field names; confirm
 * against the register-allocation pass that populates this).
 */
struct ir3_merge_set {
   uint16_t preferred_reg;
   uint16_t size;
   uint16_t alignment;

   unsigned interval_start;
   unsigned spill_slot;

   /* the registers belonging to this set: */
   unsigned regs_count;
   struct ir3_register **regs;
};
101
/* A single src or dst operand of an instruction.  The flags distinguish
 * register file (GPR/const/immediate/shared), precision (half vs full),
 * source modifiers (abs/neg), and SSA-level metadata.
 */
struct ir3_register {
   enum {
      IR3_REG_CONST = 0x001,
      IR3_REG_IMMED = 0x002,
      IR3_REG_HALF = 0x004,
      /* Shared registers have the same value for all threads when read.
       * They can only be written when one thread is active (that is, inside
       * a "getone" block).
       */
      IR3_REG_SHARED = 0x008,
      IR3_REG_RELATIV = 0x010,
      IR3_REG_R = 0x020,
      /* Most instructions, it seems, can do float abs/neg but not
       * integer. The CP pass needs to know what is intended (int or
       * float) in order to do the right thing. For this reason the
       * abs/neg flags are split out into float and int variants. In
       * addition, .b (bitwise) operations, the negate is actually a
       * bitwise not, so split that out into a new flag to make it
       * more clear.
       */
      IR3_REG_FNEG = 0x040,
      IR3_REG_FABS = 0x080,
      IR3_REG_SNEG = 0x100,
      IR3_REG_SABS = 0x200,
      IR3_REG_BNOT = 0x400,
      /* (ei) flag, end-input? Set on last bary, presumably to signal
       * that the shader needs no more input:
       *
       * Note: Has different meaning on other instructions like add.s/u
       */
      IR3_REG_EI = 0x2000,
      /* meta-flags, for intermediate stages of IR, ie.
       * before register assignment is done:
       */
      IR3_REG_SSA = 0x4000, /* 'def' is ptr to assigning destination */
      IR3_REG_ARRAY = 0x8000,

      /* Set on a use whenever the SSA value becomes dead after the current
       * instruction.
       */
      IR3_REG_KILL = 0x10000,

      /* Similar to IR3_REG_KILL, except that if there are multiple uses of the
       * same SSA value in a single instruction, this is only set on the first
       * use.
       */
      IR3_REG_FIRST_KILL = 0x20000,

      /* Set when a destination doesn't have any uses and is dead immediately
       * after the instruction. This can happen even after optimizations for
       * corner cases such as destinations of atomic instructions.
       */
      IR3_REG_UNUSED = 0x40000,

      /* "Early-clobber" on a destination means that the destination is
       * (potentially) written before any sources are read and therefore
       * interferes with the sources of the instruction.
       */
      IR3_REG_EARLY_CLOBBER = 0x80000,
   } flags;

   /* NOTE(review): virtual register name used before/during RA --
    * confirm against the RA pass.
    */
   unsigned name;

   /* used for cat5 instructions, but also for internal/IR level
    * tracking of what registers are read/written by an instruction.
    * wrmask may be a bad name since it is used to represent both
    * src and dst that touch multiple adjacent registers.
    */
   unsigned wrmask : 16; /* up to vec16 */

   /* for relative addressing, 32bits for array size is too small,
    * but otoh we don't need to deal with disjoint sets, so instead
    * use a simple size field (number of scalar components).
    *
    * Note the size field isn't important for relative const (since
    * we don't have to do register allocation for constants).
    */
   unsigned size : 16;

   /* normal registers:
    * the component is in the low two bits of the reg #, so
    * rN.x becomes: (N << 2) | x
    */
   uint16_t num;
   union {
      /* immediate: */
      int32_t iim_val;
      uint32_t uim_val;
      float fim_val;
      /* relative: */
      struct {
         uint16_t id;
         int16_t offset;
         uint16_t base;
      } array;
   };

   /* For IR3_REG_DEST, pointer back to the instruction containing this
    * register.
    */
   struct ir3_instruction *instr;

   /* For IR3_REG_SSA, src registers contain ptr back to assigning
    * instruction.
    *
    * For IR3_REG_ARRAY, the pointer is back to the last dependent
    * array access (although the net effect is the same, it points
    * back to a previous instruction that we depend on).
    */
   struct ir3_register *def;

   /* Pointer to another register in the instruction that must share the same
    * physical register. Each destination can be tied with one source, and
    * they must have "tied" pointing to each other.
    */
   struct ir3_register *tied;

   /* NOTE(review): spill/liveness bookkeeping -- presumably maintained by
    * the RA/spiller passes; confirm.
    */
   unsigned spill_slot, next_use;

   unsigned merge_set_offset;
   struct ir3_merge_set *merge_set;
   unsigned interval_start, interval_end;
};
225
226 /*
227 * Stupid/simple growable array implementation:
228 */
229 #define DECLARE_ARRAY(type, name) \
230 unsigned name##_count, name##_sz; \
231 type *name;
232
233 #define array_insert(ctx, arr, ...) \
234 do { \
235 if (arr##_count == arr##_sz) { \
236 arr##_sz = MAX2(2 * arr##_sz, 16); \
237 arr = reralloc_size(ctx, arr, arr##_sz * sizeof(arr[0])); \
238 } \
239 arr[arr##_count++] = __VA_ARGS__; \
240 } while (0)
241
/* Reduction operation, stored in ir3_instruction::cat1.reduce_op.
 * The _U/_S/_F/_B suffixes select unsigned/signed/float/bitwise
 * variants of the operation.
 */
typedef enum {
   REDUCE_OP_ADD_U,
   REDUCE_OP_ADD_F,
   REDUCE_OP_MUL_U,
   REDUCE_OP_MUL_F,
   REDUCE_OP_MIN_U,
   REDUCE_OP_MIN_S,
   REDUCE_OP_MIN_F,
   REDUCE_OP_MAX_U,
   REDUCE_OP_MAX_S,
   REDUCE_OP_MAX_F,
   REDUCE_OP_AND_B,
   REDUCE_OP_OR_B,
   REDUCE_OP_XOR_B,
} reduce_op_t;
257
/* A single IR instruction (native hw instruction or meta-instruction):
 * opcode, instruction flags, src/dst registers, plus per-category extra
 * encoding fields kept in the anonymous union.
 */
struct ir3_instruction {
   struct ir3_block *block;
   opc_t opc;
   enum {
      /* (sy) flag is set on first instruction, and after sample
       * instructions (probably just on RAW hazard).
       */
      IR3_INSTR_SY = 0x001,
      /* (ss) flag is set on first instruction, and first instruction
       * to depend on the result of "long" instructions (RAW hazard):
       *
       *   rcp, rsq, log2, exp2, sin, cos, sqrt
       *
       * It seems to synchronize until all in-flight instructions are
       * completed, for example:
       *
       *   rsq hr1.w, hr1.w
       *   add.f hr2.z, (neg)hr2.z, hc0.y
       *   mul.f hr2.w, (neg)hr2.y, (neg)hr2.y
       *   rsq hr2.x, hr2.x
       *   (rpt1)nop
       *   mad.f16 hr2.w, hr2.z, hr2.z, hr2.w
       *   nop
       *   mad.f16 hr2.w, (neg)hr0.w, (neg)hr0.w, hr2.w
       *   (ss)(rpt2)mul.f hr1.x, (r)hr1.x, hr1.w
       *   (rpt2)mul.f hr0.x, (neg)(r)hr0.x, hr2.x
       *
       * The last mul.f does not have (ss) set, presumably because the
       * (ss) on the previous instruction does the job.
       *
       * The blob driver also seems to set it on WAR hazards, although
       * not really clear if this is needed or just blob compiler being
       * sloppy. So far I haven't found a case where removing the (ss)
       * causes problems for WAR hazard, but I could just be getting
       * lucky:
       *
       *   rcp r1.y, r3.y
       *   (ss)(rpt2)mad.f32 r3.y, (r)c9.x, r1.x, (r)r3.z
       *
       */
      IR3_INSTR_SS = 0x002,
      /* (jp) flag is set on jump targets:
       */
      IR3_INSTR_JP = 0x004,
      IR3_INSTR_UL = 0x008,
      IR3_INSTR_3D = 0x010,
      IR3_INSTR_A = 0x020,
      IR3_INSTR_O = 0x040,
      IR3_INSTR_P = 0x080,
      IR3_INSTR_S = 0x100,
      IR3_INSTR_S2EN = 0x200,
      IR3_INSTR_SAT = 0x400,
      /* (cat5/cat6) Bindless */
      IR3_INSTR_B = 0x800,
      /* (cat5/cat6) nonuniform */
      IR3_INSTR_NONUNIF = 0x1000,
      /* (cat5-only) Get some parts of the encoding from a1.x */
      IR3_INSTR_A1EN = 0x02000,
      /* meta-flags, for intermediate stages of IR, ie.
       * before register assignment is done:
       */
      IR3_INSTR_MARK = 0x04000,
      IR3_INSTR_UNUSED = 0x08000,
   } flags;
   uint8_t repeat; /* (rptN) repeat count */
   uint8_t nop;    /* (nopN) -- # of nops folded into this instruction */
#ifdef DEBUG
   /* allocated capacity of srcs[]/dsts[], to assert against overflow: */
   unsigned srcs_max, dsts_max;
#endif
   unsigned srcs_count, dsts_count;
   struct ir3_register **dsts;
   struct ir3_register **srcs;
   /* Per-category extra encoding fields; which member is valid depends
    * on opc_cat(opc) (or on the meta opcode for split/end/phi/etc).
    */
   union {
      struct {
         char inv1, inv2;
         char comp1, comp2;
         int immed;
         struct ir3_block *target;
         const char *target_label;
         brtype_t brtype;
         unsigned idx; /* for brac.N */
      } cat0;
      struct {
         type_t src_type, dst_type;
         round_t round;
         reduce_op_t reduce_op;
      } cat1;
      struct {
         enum {
            IR3_COND_LT = 0,
            IR3_COND_LE = 1,
            IR3_COND_GT = 2,
            IR3_COND_GE = 3,
            IR3_COND_EQ = 4,
            IR3_COND_NE = 5,
         } condition;
      } cat2;
      struct {
         enum {
            IR3_SRC_UNSIGNED = 0,
            IR3_SRC_MIXED = 1,
         } signedness;
         enum {
            IR3_SRC_PACKED_LOW = 0,
            IR3_SRC_PACKED_HIGH = 1,
         } packed;
         bool swapped;
      } cat3;
      struct {
         unsigned samp, tex;
         unsigned tex_base : 3;
         unsigned cluster_size : 4;
         type_t type;
      } cat5;
      struct {
         type_t type;
         /* TODO remove dst_offset and handle as a ir3_register
          * which might be IMMED, similar to how src_offset is
          * handled.
          */
         int dst_offset;
         int iim_val;       /* for ldgb/stgb, # of components */
         unsigned d    : 3; /* for ldc, component offset */
         bool typed    : 1;
         unsigned base : 3;
      } cat6;
      struct {
         unsigned w : 1; /* write */
         unsigned r : 1; /* read */
         unsigned l : 1; /* local */
         unsigned g : 1; /* global */
      } cat7;
      /* for meta-instructions, just used to hold extra data
       * before instruction scheduling, etc
       */
      struct {
         int off; /* component/offset */
      } split;
      struct {
         /* Per-source index back to the entry in the
          * ir3_shader_variant::outputs table.
          */
         unsigned *outidxs;
      } end;
      struct {
         /* used to temporarily hold reference to nir_phi_instr
          * until we resolve the phi srcs
          */
         void *nphi;
      } phi;
      struct {
         unsigned samp, tex;
         unsigned input_offset;
         unsigned samp_base : 3;
         unsigned tex_base  : 3;
      } prefetch;
      struct {
         /* maps back to entry in ir3_shader_variant::inputs table: */
         int inidx;
         /* for sysvals, identifies the sysval type.  Mostly so we can
          * identify the special cases where a sysval should not be DCE'd
          * (currently, just pre-fs texture fetch)
          */
         gl_system_value sysval;
      } input;
   };

   /* For assigning jump offsets, we need instruction's position: */
   uint32_t ip;

   /* used for per-pass extra instruction data.
    *
    * TODO we should remove the per-pass data like this and 'use_count'
    * and do something similar to what RA does w/ ir3_ra_instr_data..
    * ie. use the ir3_count_instructions pass, and then use instr->ip
    * to index into a table of pass-private data.
    */
   void *data;

   /**
    * Valid if pass calls ir3_find_ssa_uses().. see foreach_ssa_use()
    */
   struct set *uses;

   int use_count; /* currently just updated/used by cp */

   /* an instruction can reference at most one address register amongst
    * it's src/dst registers.  Beyond that, you need to insert mov's.
    *
    * NOTE: do not write this directly, use ir3_instr_set_address()
    */
   struct ir3_register *address;

   /* Tracking for additional dependent instructions.  Used to handle
    * barriers, WAR hazards for arrays/SSBOs/etc.
    */
   DECLARE_ARRAY(struct ir3_instruction *, deps);

   /*
    * From PoV of instruction scheduling, not execution (ie. ignores global/
    * local distinction):
    *                            shared  image  atomic  SSBO  everything
    *   barrier()/            -   R/W     R/W    R/W     R/W       X
    *     groupMemoryBarrier()
    *     memoryBarrier()
    *     (but only images declared coherent?)
    *   memoryBarrierAtomic() -                  R/W
    *   memoryBarrierBuffer() -                          R/W
    *   memoryBarrierImage()  -           R/W
    *   memoryBarrierShared() -   R/W
    *
    * TODO I think for SSBO/image/shared, in cases where we can determine
    * which variable is accessed, we don't need to care about accesses to
    * different variables (unless declared coherent??)
    */
   enum {
      IR3_BARRIER_EVERYTHING = 1 << 0,
      IR3_BARRIER_SHARED_R = 1 << 1,
      IR3_BARRIER_SHARED_W = 1 << 2,
      IR3_BARRIER_IMAGE_R = 1 << 3,
      IR3_BARRIER_IMAGE_W = 1 << 4,
      IR3_BARRIER_BUFFER_R = 1 << 5,
      IR3_BARRIER_BUFFER_W = 1 << 6,
      IR3_BARRIER_ARRAY_R = 1 << 7,
      IR3_BARRIER_ARRAY_W = 1 << 8,
      IR3_BARRIER_PRIVATE_R = 1 << 9,
      IR3_BARRIER_PRIVATE_W = 1 << 10,
      IR3_BARRIER_CONST_W = 1 << 11,
   } barrier_class,
      barrier_conflict;

   /* Entry in ir3_block's instruction list: */
   struct list_head node;

   uint32_t serialno;

   // TODO only computerator/assembler:
   int line;
};
497
/* Top-level IR container for one shader: the list of basic blocks plus
 * various convenience lists of instructions that later passes need to
 * find quickly.
 */
struct ir3 {
   struct ir3_compiler *compiler;
   gl_shader_stage type;

   DECLARE_ARRAY(struct ir3_instruction *, inputs);

   /* Track bary.f (and ldlv) instructions.. this is needed in
    * scheduling to ensure that all varying fetches happen before
    * any potential kill instructions.  The hw gets grumpy if all
    * threads in a group are killed before the last bary.f gets
    * a chance to signal end of input (ei).
    */
   DECLARE_ARRAY(struct ir3_instruction *, baryfs);

   /* Track all indirect instructions (read and write).  To avoid
    * deadlock scenario where an address register gets scheduled,
    * but other dependent src instructions cannot be scheduled due
    * to dependency on a *different* address register value, the
    * scheduler needs to ensure that all dependencies other than
    * the instruction other than the address register are scheduled
    * before the one that writes the address register.  Having a
    * convenient list of instructions that reference some address
    * register simplifies this.
    */
   DECLARE_ARRAY(struct ir3_instruction *, a0_users);

   /* same for a1.x: */
   DECLARE_ARRAY(struct ir3_instruction *, a1_users);

   /* and same for instructions that consume predicate register: */
   DECLARE_ARRAY(struct ir3_instruction *, predicates);

   /* Track texture sample instructions which need texture state
    * patched in (for astc-srgb workaround):
    */
   DECLARE_ARRAY(struct ir3_instruction *, astc_srgb);

   /* Track tg4 instructions which need texture state patched in (for tg4
    * swizzling workaround):
    */
   DECLARE_ARRAY(struct ir3_instruction *, tg4);

   /* List of blocks: */
   struct list_head block_list;

   /* List of ir3_array's: */
   struct list_head array_list;

#ifdef DEBUG
   unsigned block_count;
#endif
   unsigned instr_count;
};
551
/* A register array, ie. storage accessed with relative addressing
 * (a0.x-indexed).  Kept in ir3::array_list; looked up by 'id' via
 * ir3_lookup_array().
 */
struct ir3_array {
   struct list_head node;
   unsigned length;
   unsigned id;

   /* NOTE(review): presumably the originating NIR register -- confirm
    * against the nir->ir3 frontend.
    */
   struct nir_register *r;

   /* To avoid array write's from getting DCE'd, keep track of the
    * most recent write.  Any array access depends on the most
    * recent write.  This way, nothing depends on writes after the
    * last read.  But all the writes that happen before that have
    * something depending on them
    */
   struct ir3_register *last_write;

   /* extra stuff used in RA pass: */
   unsigned base; /* base vreg name */
   unsigned reg;  /* base physical reg */
   uint16_t start_ip, end_ip;

   /* Indicates if half-precision */
   bool half;

   bool unused;
};
577
578 struct ir3_array *ir3_lookup_array(struct ir3 *ir, unsigned id);
579
/* How a block with two successors decides which successor to take
 * (see ir3_block::brtype / ir3_block::condition):
 */
enum ir3_branch_type {
   IR3_BRANCH_COND,   /* condition */
   IR3_BRANCH_ANY,    /* subgroupAny(condition) */
   IR3_BRANCH_ALL,    /* subgroupAll(condition) */
   IR3_BRANCH_GETONE, /* subgroupElect() */
   IR3_BRANCH_SHPS,   /* preamble start */
};
587
/* A basic block: a list of instructions plus CFG edges (both the
 * per-thread view and the "physical" machine view -- see comment on
 * 'successors' below).
 */
struct ir3_block {
   struct list_head node;
   struct ir3 *shader;

   const struct nir_block *nblock;

   struct list_head instr_list; /* list of ir3_instruction */

   /* The actual branch condition, if there are two successors */
   enum ir3_branch_type brtype;

   /* each block has either one or two successors.. in case of two
    * successors, 'condition' decides which one to follow.  A block preceding
    * an if/else has two successors.
    *
    * In some cases the path that the machine actually takes through the
    * program may not match the per-thread view of the CFG. In particular
    * this is the case for if/else, where the machine jumps from the end of
    * the if to the beginning of the else and switches active lanes. While
    * most things only care about the per-thread view, we need to use the
    * "physical" view when allocating shared registers. "successors" contains
    * the per-thread successors, and "physical_successors" contains the
    * physical successors which includes the fallthrough edge from the if to
    * the else.
    */
   struct ir3_instruction *condition;
   struct ir3_block *successors[2];
   struct ir3_block *physical_successors[2];

   DECLARE_ARRAY(struct ir3_block *, predecessors);
   DECLARE_ARRAY(struct ir3_block *, physical_predecessors);

   uint16_t start_ip, end_ip;

   /* Track instructions which do not write a register but other-
    * wise must not be discarded (such as kill, stg, etc)
    */
   DECLARE_ARRAY(struct ir3_instruction *, keeps);

   /* used for per-pass extra block data.  Mainly used right
    * now in RA step to track livein/liveout.
    */
   void *data;

   uint32_t index;

   /* immediate dominator, plus children in the dominator tree
    * (computed by ir3_calc_dominance()):
    */
   struct ir3_block *imm_dom;
   DECLARE_ARRAY(struct ir3_block *, dom_children);

   /* pre/post-order numbering of the dominator tree: */
   uint32_t dom_pre_index;
   uint32_t dom_post_index;

   uint32_t loop_id;
   uint32_t loop_depth;

#ifdef DEBUG
   uint32_t serialno;
#endif
};
647
/* A small identifier for a block, for debug output. */
static inline uint32_t
block_id(struct ir3_block *block)
{
#ifdef DEBUG
   /* debug builds assign a stable serial number at creation: */
   return block->serialno;
#else
   /* release builds have no serialno; derive a (truncated) id from the
    * pointer -- only needs to be unique-ish for prints:
    */
   return (uint32_t)(unsigned long)block;
#endif
}
657
/* The entry block, ie. the first block in the shader's block list. */
static inline struct ir3_block *
ir3_start_block(struct ir3 *ir)
{
   return list_first_entry(&ir->block_list, struct ir3_block, node);
}
663
/* Returns the block where the main shader body starts, skipping past
 * the preamble (identified by a shps branch in the start block) if
 * there is one.
 */
static inline struct ir3_block *
ir3_after_preamble(struct ir3 *ir)
{
   struct ir3_block *block = ir3_start_block(ir);
   /* The preamble will have a usually-empty else branch, and we want to skip
    * that to get to the block after the preamble.
    */
   if (block->brtype == IR3_BRANCH_SHPS)
      return block->successors[1]->successors[0];
   else
      return block;
}
676
677 void ir3_block_add_predecessor(struct ir3_block *block, struct ir3_block *pred);
678 void ir3_block_add_physical_predecessor(struct ir3_block *block,
679 struct ir3_block *pred);
680 void ir3_block_remove_predecessor(struct ir3_block *block,
681 struct ir3_block *pred);
682 void ir3_block_remove_physical_predecessor(struct ir3_block *block,
683 struct ir3_block *pred);
684 unsigned ir3_block_get_pred_index(struct ir3_block *block,
685 struct ir3_block *pred);
686
687 void ir3_calc_dominance(struct ir3 *ir);
688 bool ir3_block_dominates(struct ir3_block *a, struct ir3_block *b);
689
690 struct ir3_shader_variant;
691
692 struct ir3 *ir3_create(struct ir3_compiler *compiler,
693 struct ir3_shader_variant *v);
694 void ir3_destroy(struct ir3 *shader);
695
696 void ir3_collect_info(struct ir3_shader_variant *v);
697 void *ir3_alloc(struct ir3 *shader, int sz);
698
699 unsigned ir3_get_reg_dependent_max_waves(const struct ir3_compiler *compiler,
700 unsigned reg_count,
701 bool double_threadsize);
702
703 unsigned ir3_get_reg_independent_max_waves(struct ir3_shader_variant *v,
704 bool double_threadsize);
705
706 bool ir3_should_double_threadsize(struct ir3_shader_variant *v,
707 unsigned regs_count);
708
709 struct ir3_block *ir3_block_create(struct ir3 *shader);
710
711 struct ir3_instruction *ir3_instr_create(struct ir3_block *block, opc_t opc,
712 int ndst, int nsrc);
713 struct ir3_instruction *ir3_instr_clone(struct ir3_instruction *instr);
714 void ir3_instr_add_dep(struct ir3_instruction *instr,
715 struct ir3_instruction *dep);
716 const char *ir3_instr_name(struct ir3_instruction *instr);
717
718 struct ir3_register *ir3_src_create(struct ir3_instruction *instr, int num,
719 int flags);
720 struct ir3_register *ir3_dst_create(struct ir3_instruction *instr, int num,
721 int flags);
722 struct ir3_register *ir3_reg_clone(struct ir3 *shader,
723 struct ir3_register *reg);
724
725 static inline void
ir3_reg_tie(struct ir3_register * dst,struct ir3_register * src)726 ir3_reg_tie(struct ir3_register *dst, struct ir3_register *src)
727 {
728 assert(!dst->tied && !src->tied);
729 dst->tied = src;
730 src->tied = dst;
731 }
732
733 void ir3_reg_set_last_array(struct ir3_instruction *instr,
734 struct ir3_register *reg,
735 struct ir3_register *last_write);
736
737 void ir3_instr_set_address(struct ir3_instruction *instr,
738 struct ir3_instruction *addr);
739
740 static inline bool
ir3_instr_check_mark(struct ir3_instruction * instr)741 ir3_instr_check_mark(struct ir3_instruction *instr)
742 {
743 if (instr->flags & IR3_INSTR_MARK)
744 return true; /* already visited */
745 instr->flags |= IR3_INSTR_MARK;
746 return false;
747 }
748
749 void ir3_block_clear_mark(struct ir3_block *block);
750 void ir3_clear_mark(struct ir3 *shader);
751
752 unsigned ir3_count_instructions(struct ir3 *ir);
753 unsigned ir3_count_instructions_ra(struct ir3 *ir);
754
755 /**
756 * Move 'instr' to just before 'after'
757 */
758 static inline void
ir3_instr_move_before(struct ir3_instruction * instr,struct ir3_instruction * after)759 ir3_instr_move_before(struct ir3_instruction *instr,
760 struct ir3_instruction *after)
761 {
762 list_delinit(&instr->node);
763 list_addtail(&instr->node, &after->node);
764 }
765
766 /**
767 * Move 'instr' to just after 'before':
768 */
769 static inline void
ir3_instr_move_after(struct ir3_instruction * instr,struct ir3_instruction * before)770 ir3_instr_move_after(struct ir3_instruction *instr,
771 struct ir3_instruction *before)
772 {
773 list_delinit(&instr->node);
774 list_add(&instr->node, &before->node);
775 }
776
777 /**
778 * Move 'instr' to the beginning of the block:
779 */
780 static inline void
ir3_instr_move_before_block(struct ir3_instruction * instr,struct ir3_block * block)781 ir3_instr_move_before_block(struct ir3_instruction *instr,
782 struct ir3_block *block)
783 {
784 list_delinit(&instr->node);
785 list_add(&instr->node, &block->instr_list);
786 }
787
788 void ir3_find_ssa_uses(struct ir3 *ir, void *mem_ctx, bool falsedeps);
789
790 void ir3_set_dst_type(struct ir3_instruction *instr, bool half);
791 void ir3_fixup_src_type(struct ir3_instruction *instr);
792
793 int ir3_flut(struct ir3_register *src_reg);
794
795 bool ir3_valid_flags(struct ir3_instruction *instr, unsigned n, unsigned flags);
796
797 bool ir3_valid_immediate(struct ir3_instruction *instr, int32_t immed);
798
799 #include "util/set.h"
/* Iterate the instructions that use __instr's SSA value.  Only valid
 * when a pass has populated instr->uses via ir3_find_ssa_uses().
 * The outer for exists to declare __use and to skip the loop body
 * entirely when 'uses' is NULL; the sentinel (void *)~0 just makes the
 * outer loop run exactly once.
 */
#define foreach_ssa_use(__use, __instr)                                        \
   for (struct ir3_instruction *__use = (void *)~0; __use && (__instr)->uses;  \
        __use = NULL)                                                          \
      set_foreach ((__instr)->uses, __entry)                                   \
         if ((__use = (void *)__entry->key))
805
806 static inline uint32_t
reg_num(const struct ir3_register * reg)807 reg_num(const struct ir3_register *reg)
808 {
809 return reg->num >> 2;
810 }
811
812 static inline uint32_t
reg_comp(const struct ir3_register * reg)813 reg_comp(const struct ir3_register *reg)
814 {
815 return reg->num & 0x3;
816 }
817
818 static inline bool
is_flow(struct ir3_instruction * instr)819 is_flow(struct ir3_instruction *instr)
820 {
821 return (opc_cat(instr->opc) == 0);
822 }
823
824 static inline bool
is_kill_or_demote(struct ir3_instruction * instr)825 is_kill_or_demote(struct ir3_instruction *instr)
826 {
827 return instr->opc == OPC_KILL || instr->opc == OPC_DEMOTE;
828 }
829
830 static inline bool
is_nop(struct ir3_instruction * instr)831 is_nop(struct ir3_instruction *instr)
832 {
833 return instr->opc == OPC_NOP;
834 }
835
836 static inline bool
is_same_type_reg(struct ir3_register * dst,struct ir3_register * src)837 is_same_type_reg(struct ir3_register *dst, struct ir3_register *src)
838 {
839 unsigned dst_type = (dst->flags & IR3_REG_HALF);
840 unsigned src_type = (src->flags & IR3_REG_HALF);
841
842 /* Treat shared->normal copies as same-type, because they can generally be
843 * folded, but not normal->shared copies.
844 */
845 if (dst_type != src_type ||
846 ((dst->flags & IR3_REG_SHARED) && !(src->flags & IR3_REG_SHARED)))
847 return false;
848 else
849 return true;
850 }
851
852 /* Is it a non-transformative (ie. not type changing) mov? This can
853 * also include absneg.s/absneg.f, which for the most part can be
854 * treated as a mov (single src argument).
855 */
856 static inline bool
is_same_type_mov(struct ir3_instruction * instr)857 is_same_type_mov(struct ir3_instruction *instr)
858 {
859 struct ir3_register *dst;
860
861 switch (instr->opc) {
862 case OPC_MOV:
863 if (instr->cat1.src_type != instr->cat1.dst_type)
864 return false;
865 /* If the type of dest reg and src reg are different,
866 * it shouldn't be considered as same type mov
867 */
868 if (!is_same_type_reg(instr->dsts[0], instr->srcs[0]))
869 return false;
870 break;
871 case OPC_ABSNEG_F:
872 case OPC_ABSNEG_S:
873 if (instr->flags & IR3_INSTR_SAT)
874 return false;
875 /* If the type of dest reg and src reg are different,
876 * it shouldn't be considered as same type mov
877 */
878 if (!is_same_type_reg(instr->dsts[0], instr->srcs[0]))
879 return false;
880 break;
881 case OPC_META_PHI:
882 return instr->srcs_count == 1;
883 default:
884 return false;
885 }
886
887 dst = instr->dsts[0];
888
889 /* mov's that write to a0 or p0.x are special: */
890 if (dst->num == regid(REG_P0, 0))
891 return false;
892 if (reg_num(dst) == REG_A0)
893 return false;
894
895 if (dst->flags & (IR3_REG_RELATIV | IR3_REG_ARRAY))
896 return false;
897
898 return true;
899 }
900
901 /* A move from const, which changes size but not type, can also be
902 * folded into dest instruction in some cases.
903 */
904 static inline bool
is_const_mov(struct ir3_instruction * instr)905 is_const_mov(struct ir3_instruction *instr)
906 {
907 if (instr->opc != OPC_MOV)
908 return false;
909
910 if (!(instr->srcs[0]->flags & IR3_REG_CONST))
911 return false;
912
913 type_t src_type = instr->cat1.src_type;
914 type_t dst_type = instr->cat1.dst_type;
915
916 return (type_float(src_type) && type_float(dst_type)) ||
917 (type_uint(src_type) && type_uint(dst_type)) ||
918 (type_sint(src_type) && type_sint(dst_type));
919 }
920
921 static inline bool
is_subgroup_cond_mov_macro(struct ir3_instruction * instr)922 is_subgroup_cond_mov_macro(struct ir3_instruction *instr)
923 {
924 switch (instr->opc) {
925 case OPC_BALLOT_MACRO:
926 case OPC_ANY_MACRO:
927 case OPC_ALL_MACRO:
928 case OPC_ELECT_MACRO:
929 case OPC_READ_COND_MACRO:
930 case OPC_READ_FIRST_MACRO:
931 case OPC_SWZ_SHARED_MACRO:
932 case OPC_SCAN_MACRO:
933 return true;
934 default:
935 return false;
936 }
937 }
938
939 static inline bool
is_alu(struct ir3_instruction * instr)940 is_alu(struct ir3_instruction *instr)
941 {
942 return (1 <= opc_cat(instr->opc)) && (opc_cat(instr->opc) <= 3);
943 }
944
945 static inline bool
is_sfu(struct ir3_instruction * instr)946 is_sfu(struct ir3_instruction *instr)
947 {
948 return (opc_cat(instr->opc) == 4) || instr->opc == OPC_GETFIBERID;
949 }
950
951 static inline bool
is_tex(struct ir3_instruction * instr)952 is_tex(struct ir3_instruction *instr)
953 {
954 return (opc_cat(instr->opc) == 5);
955 }
956
957 static inline bool
is_tex_or_prefetch(struct ir3_instruction * instr)958 is_tex_or_prefetch(struct ir3_instruction *instr)
959 {
960 return is_tex(instr) || (instr->opc == OPC_META_TEX_PREFETCH);
961 }
962
963 static inline bool
is_mem(struct ir3_instruction * instr)964 is_mem(struct ir3_instruction *instr)
965 {
966 return (opc_cat(instr->opc) == 6) && instr->opc != OPC_GETFIBERID;
967 }
968
969 static inline bool
is_barrier(struct ir3_instruction * instr)970 is_barrier(struct ir3_instruction *instr)
971 {
972 return (opc_cat(instr->opc) == 7);
973 }
974
975 static inline bool
is_half(struct ir3_instruction * instr)976 is_half(struct ir3_instruction *instr)
977 {
978 return !!(instr->dsts[0]->flags & IR3_REG_HALF);
979 }
980
981 static inline bool
is_shared(struct ir3_instruction * instr)982 is_shared(struct ir3_instruction *instr)
983 {
984 return !!(instr->dsts[0]->flags & IR3_REG_SHARED);
985 }
986
987 static inline bool
is_store(struct ir3_instruction * instr)988 is_store(struct ir3_instruction *instr)
989 {
990 /* these instructions, the "destination" register is
991 * actually a source, the address to store to.
992 */
993 switch (instr->opc) {
994 case OPC_STG:
995 case OPC_STG_A:
996 case OPC_STGB:
997 case OPC_STIB:
998 case OPC_STP:
999 case OPC_STL:
1000 case OPC_STLW:
1001 case OPC_L2G:
1002 case OPC_G2L:
1003 return true;
1004 default:
1005 return false;
1006 }
1007 }
1008
1009 static inline bool
is_load(struct ir3_instruction * instr)1010 is_load(struct ir3_instruction *instr)
1011 {
1012 switch (instr->opc) {
1013 case OPC_LDG:
1014 case OPC_LDG_A:
1015 case OPC_LDGB:
1016 case OPC_LDIB:
1017 case OPC_LDL:
1018 case OPC_LDP:
1019 case OPC_L2G:
1020 case OPC_LDLW:
1021 case OPC_LDC:
1022 case OPC_LDLV:
1023 /* probably some others too.. */
1024 return true;
1025 default:
1026 return false;
1027 }
1028 }
1029
1030 static inline bool
is_input(struct ir3_instruction * instr)1031 is_input(struct ir3_instruction *instr)
1032 {
1033 /* in some cases, ldlv is used to fetch varying without
1034 * interpolation.. fortunately inloc is the first src
1035 * register in either case
1036 */
1037 switch (instr->opc) {
1038 case OPC_LDLV:
1039 case OPC_BARY_F:
1040 case OPC_FLAT_B:
1041 return true;
1042 default:
1043 return false;
1044 }
1045 }
1046
1047 static inline bool
is_bool(struct ir3_instruction * instr)1048 is_bool(struct ir3_instruction *instr)
1049 {
1050 switch (instr->opc) {
1051 case OPC_CMPS_F:
1052 case OPC_CMPS_S:
1053 case OPC_CMPS_U:
1054 return true;
1055 default:
1056 return false;
1057 }
1058 }
1059
1060 static inline opc_t
cat3_half_opc(opc_t opc)1061 cat3_half_opc(opc_t opc)
1062 {
1063 switch (opc) {
1064 case OPC_MAD_F32:
1065 return OPC_MAD_F16;
1066 case OPC_SEL_B32:
1067 return OPC_SEL_B16;
1068 case OPC_SEL_S32:
1069 return OPC_SEL_S16;
1070 case OPC_SEL_F32:
1071 return OPC_SEL_F16;
1072 case OPC_SAD_S32:
1073 return OPC_SAD_S16;
1074 default:
1075 return opc;
1076 }
1077 }
1078
1079 static inline opc_t
cat3_full_opc(opc_t opc)1080 cat3_full_opc(opc_t opc)
1081 {
1082 switch (opc) {
1083 case OPC_MAD_F16:
1084 return OPC_MAD_F32;
1085 case OPC_SEL_B16:
1086 return OPC_SEL_B32;
1087 case OPC_SEL_S16:
1088 return OPC_SEL_S32;
1089 case OPC_SEL_F16:
1090 return OPC_SEL_F32;
1091 case OPC_SAD_S16:
1092 return OPC_SAD_S32;
1093 default:
1094 return opc;
1095 }
1096 }
1097
1098 static inline opc_t
cat4_half_opc(opc_t opc)1099 cat4_half_opc(opc_t opc)
1100 {
1101 switch (opc) {
1102 case OPC_RSQ:
1103 return OPC_HRSQ;
1104 case OPC_LOG2:
1105 return OPC_HLOG2;
1106 case OPC_EXP2:
1107 return OPC_HEXP2;
1108 default:
1109 return opc;
1110 }
1111 }
1112
1113 static inline opc_t
cat4_full_opc(opc_t opc)1114 cat4_full_opc(opc_t opc)
1115 {
1116 switch (opc) {
1117 case OPC_HRSQ:
1118 return OPC_RSQ;
1119 case OPC_HLOG2:
1120 return OPC_LOG2;
1121 case OPC_HEXP2:
1122 return OPC_EXP2;
1123 default:
1124 return opc;
1125 }
1126 }
1127
1128 static inline bool
is_meta(struct ir3_instruction * instr)1129 is_meta(struct ir3_instruction *instr)
1130 {
1131 return (opc_cat(instr->opc) == -1);
1132 }
1133
1134 static inline unsigned
reg_elems(const struct ir3_register * reg)1135 reg_elems(const struct ir3_register *reg)
1136 {
1137 if (reg->flags & IR3_REG_ARRAY)
1138 return reg->size;
1139 else
1140 return util_last_bit(reg->wrmask);
1141 }
1142
1143 static inline unsigned
reg_elem_size(const struct ir3_register * reg)1144 reg_elem_size(const struct ir3_register *reg)
1145 {
1146 return (reg->flags & IR3_REG_HALF) ? 1 : 2;
1147 }
1148
static inline unsigned
reg_size(const struct ir3_register *reg)
{
   /* Total register footprint, in half-register units. */
   unsigned elems = reg_elems(reg);
   return elems * reg_elem_size(reg);
}
1154
1155 static inline unsigned
dest_regs(struct ir3_instruction * instr)1156 dest_regs(struct ir3_instruction *instr)
1157 {
1158 if (instr->dsts_count == 0)
1159 return 0;
1160
1161 debug_assert(instr->dsts_count == 1);
1162 return util_last_bit(instr->dsts[0]->wrmask);
1163 }
1164
1165 /* is dst a normal temp register: */
1166 static inline bool
is_dest_gpr(struct ir3_register * dst)1167 is_dest_gpr(struct ir3_register *dst)
1168 {
1169 if (dst->wrmask == 0)
1170 return false;
1171 if ((reg_num(dst) == REG_A0) || (dst->num == regid(REG_P0, 0)))
1172 return false;
1173 return true;
1174 }
1175
1176 static inline bool
writes_gpr(struct ir3_instruction * instr)1177 writes_gpr(struct ir3_instruction *instr)
1178 {
1179 if (dest_regs(instr) == 0)
1180 return false;
1181 return is_dest_gpr(instr->dsts[0]);
1182 }
1183
1184 static inline bool
writes_addr0(struct ir3_instruction * instr)1185 writes_addr0(struct ir3_instruction *instr)
1186 {
1187 /* Note: only the first dest can write to a0.x */
1188 if (instr->dsts_count > 0) {
1189 struct ir3_register *dst = instr->dsts[0];
1190 return dst->num == regid(REG_A0, 0);
1191 }
1192 return false;
1193 }
1194
1195 static inline bool
writes_addr1(struct ir3_instruction * instr)1196 writes_addr1(struct ir3_instruction *instr)
1197 {
1198 /* Note: only the first dest can write to a1.x */
1199 if (instr->dsts_count > 0) {
1200 struct ir3_register *dst = instr->dsts[0];
1201 return dst->num == regid(REG_A0, 1);
1202 }
1203 return false;
1204 }
1205
1206 static inline bool
writes_pred(struct ir3_instruction * instr)1207 writes_pred(struct ir3_instruction *instr)
1208 {
1209 /* Note: only the first dest can write to p0.x */
1210 if (instr->dsts_count > 0) {
1211 struct ir3_register *dst = instr->dsts[0];
1212 return reg_num(dst) == REG_P0;
1213 }
1214 return false;
1215 }
1216
1217 /* Is it something other than a normal register. Shared regs, p0, and a0/a1
1218 * are considered special here. Special registers are always accessed with one
1219 * size and never alias normal registers, even though a naive calculation
1220 * would sometimes make it seem like e.g. r30.z aliases a0.x.
1221 */
1222 static inline bool
is_reg_special(const struct ir3_register * reg)1223 is_reg_special(const struct ir3_register *reg)
1224 {
1225 return (reg->flags & IR3_REG_SHARED) || (reg_num(reg) == REG_A0) ||
1226 (reg_num(reg) == REG_P0);
1227 }
1228
1229 /* Same as above but in cases where we don't have a register. r48.x and above
1230 * are shared/special.
1231 */
1232 static inline bool
is_reg_num_special(unsigned num)1233 is_reg_num_special(unsigned num)
1234 {
1235 return num >= 48 * 4;
1236 }
1237
1238 /* returns defining instruction for reg */
1239 /* TODO better name */
1240 static inline struct ir3_instruction *
ssa(struct ir3_register * reg)1241 ssa(struct ir3_register *reg)
1242 {
1243 if ((reg->flags & (IR3_REG_SSA | IR3_REG_ARRAY)) && reg->def)
1244 return reg->def->instr;
1245 return NULL;
1246 }
1247
1248 static inline bool
conflicts(struct ir3_register * a,struct ir3_register * b)1249 conflicts(struct ir3_register *a, struct ir3_register *b)
1250 {
1251 return (a && b) && (a->def != b->def);
1252 }
1253
1254 static inline bool
reg_gpr(struct ir3_register * r)1255 reg_gpr(struct ir3_register *r)
1256 {
1257 if (r->flags & (IR3_REG_CONST | IR3_REG_IMMED))
1258 return false;
1259 if ((reg_num(r) == REG_A0) || (reg_num(r) == REG_P0))
1260 return false;
1261 return true;
1262 }
1263
1264 static inline type_t
half_type(type_t type)1265 half_type(type_t type)
1266 {
1267 switch (type) {
1268 case TYPE_F32:
1269 return TYPE_F16;
1270 case TYPE_U32:
1271 return TYPE_U16;
1272 case TYPE_S32:
1273 return TYPE_S16;
1274 case TYPE_F16:
1275 case TYPE_U16:
1276 case TYPE_S16:
1277 return type;
1278 case TYPE_U8:
1279 case TYPE_S8:
1280 return type;
1281 default:
1282 assert(0);
1283 return ~0;
1284 }
1285 }
1286
1287 static inline type_t
full_type(type_t type)1288 full_type(type_t type)
1289 {
1290 switch (type) {
1291 case TYPE_F16:
1292 return TYPE_F32;
1293 case TYPE_U8:
1294 case TYPE_U16:
1295 return TYPE_U32;
1296 case TYPE_S8:
1297 case TYPE_S16:
1298 return TYPE_S32;
1299 case TYPE_F32:
1300 case TYPE_U32:
1301 case TYPE_S32:
1302 return type;
1303 default:
1304 assert(0);
1305 return ~0;
1306 }
1307 }
1308
/* some cat2 instructions (ie. those which are not float) can embed an
 * immediate:
 */
static inline bool
ir3_cat2_int(opc_t opc)
{
   /* Pure membership table: returns true for the integer/bitwise cat2
    * opcodes (and bary.f/flat.b) which accept an embedded immediate src.
    */
   switch (opc) {
   /* integer arithmetic / compares: */
   case OPC_ADD_U:
   case OPC_ADD_S:
   case OPC_SUB_U:
   case OPC_SUB_S:
   case OPC_CMPS_U:
   case OPC_CMPS_S:
   case OPC_MIN_U:
   case OPC_MIN_S:
   case OPC_MAX_U:
   case OPC_MAX_S:
   case OPC_CMPV_U:
   case OPC_CMPV_S:
   case OPC_MUL_U24:
   case OPC_MUL_S24:
   case OPC_MULL_U:
   case OPC_CLZ_S:
   case OPC_ABSNEG_S:
   /* bitwise ops: */
   case OPC_AND_B:
   case OPC_OR_B:
   case OPC_NOT_B:
   case OPC_XOR_B:
   case OPC_BFREV_B:
   case OPC_CLZ_B:
   case OPC_SHL_B:
   case OPC_SHR_B:
   case OPC_ASHR_B:
   case OPC_MGEN_B:
   case OPC_GETBIT_B:
   case OPC_CBITS_B:
   /* varying fetch: */
   case OPC_BARY_F:
   case OPC_FLAT_B:
      return true;

   default:
      return false;
   }
}
1353
/* map cat2 instruction to valid abs/neg flags: */
static inline unsigned
ir3_cat2_absneg(opc_t opc)
{
   /* Returns the set of src-modifier flags (FABS/FNEG, SABS/SNEG, BNOT)
    * that the given cat2 opcode supports; 0 if none are valid.
    */
   switch (opc) {
   /* float ops support float abs/neg modifiers: */
   case OPC_ADD_F:
   case OPC_MIN_F:
   case OPC_MAX_F:
   case OPC_MUL_F:
   case OPC_SIGN_F:
   case OPC_CMPS_F:
   case OPC_ABSNEG_F:
   case OPC_CMPV_F:
   case OPC_FLOOR_F:
   case OPC_CEIL_F:
   case OPC_RNDNE_F:
   case OPC_RNDAZ_F:
   case OPC_TRUNC_F:
   case OPC_BARY_F:
      return IR3_REG_FABS | IR3_REG_FNEG;

   /* integer arithmetic takes no src modifiers: */
   case OPC_ADD_U:
   case OPC_ADD_S:
   case OPC_SUB_U:
   case OPC_SUB_S:
   case OPC_CMPS_U:
   case OPC_CMPS_S:
   case OPC_MIN_U:
   case OPC_MIN_S:
   case OPC_MAX_U:
   case OPC_MAX_S:
   case OPC_CMPV_U:
   case OPC_CMPV_S:
   case OPC_MUL_U24:
   case OPC_MUL_S24:
   case OPC_MULL_U:
   case OPC_CLZ_S:
      return 0;

   case OPC_ABSNEG_S:
      return IR3_REG_SABS | IR3_REG_SNEG;

   /* bitwise ops support a bitwise-not modifier: */
   case OPC_AND_B:
   case OPC_OR_B:
   case OPC_NOT_B:
   case OPC_XOR_B:
   case OPC_BFREV_B:
   case OPC_CLZ_B:
   case OPC_SHL_B:
   case OPC_SHR_B:
   case OPC_ASHR_B:
   case OPC_MGEN_B:
   case OPC_GETBIT_B:
   case OPC_CBITS_B:
      return IR3_REG_BNOT;

   default:
      return 0;
   }
}
1414
/* map cat3 instructions to valid abs/neg flags: */
static inline unsigned
ir3_cat3_absneg(opc_t opc)
{
   switch (opc) {
   /* float mad/sel support float negate on srcs: */
   case OPC_MAD_F16:
   case OPC_MAD_F32:
   case OPC_SEL_F16:
   case OPC_SEL_F32:
      return IR3_REG_FNEG;

   case OPC_MAD_U16:
   case OPC_MADSH_U16:
   case OPC_MAD_S16:
   case OPC_MADSH_M16:
   case OPC_MAD_U24:
   case OPC_MAD_S24:
   case OPC_SEL_S16:
   case OPC_SEL_S32:
   case OPC_SAD_S16:
   case OPC_SAD_S32:
      /* neg *may* work on 3rd src.. */

   case OPC_SEL_B16:
   case OPC_SEL_B32:

   case OPC_SHRM:
   case OPC_SHLM:
   case OPC_SHRG:
   case OPC_SHLG:
   case OPC_ANDG:
   case OPC_WMM:
   case OPC_WMM_ACCU:

   /* note: all cases above intentionally fall through to default,
    * ie. no abs/neg modifiers are allowed for them:
    */
   default:
      return 0;
   }
}
1453
/* Return the type (float, int, or uint) the op uses when converting from the
 * internal result of the op (which is assumed to be the same size as the
 * sources) to the destination when they are not the same size. If F32 it does
 * a floating-point conversion, if U32 it does a truncation/zero-extension, if
 * S32 it does a truncation/sign-extension. "can_fold" will be false if it
 * doesn't do anything sensible or is unknown.
 */
static inline type_t
ir3_output_conv_type(struct ir3_instruction *instr, bool *can_fold)
{
   *can_fold = true;
   switch (instr->opc) {
   /* float-producing ops: output conversion is a float conversion: */
   case OPC_ADD_F:
   case OPC_MUL_F:
   case OPC_BARY_F:
   case OPC_MAD_F32:
   case OPC_MAD_F16:
   case OPC_WMM:
   case OPC_WMM_ACCU:
      return TYPE_F32;

   /* unsigned/bitwise ops: truncation or zero-extension: */
   case OPC_ADD_U:
   case OPC_SUB_U:
   case OPC_MIN_U:
   case OPC_MAX_U:
   case OPC_AND_B:
   case OPC_OR_B:
   case OPC_NOT_B:
   case OPC_XOR_B:
   case OPC_MUL_U24:
   case OPC_MULL_U:
   case OPC_SHL_B:
   case OPC_SHR_B:
   case OPC_ASHR_B:
   case OPC_MAD_U24:
   case OPC_SHRM:
   case OPC_SHLM:
   case OPC_SHRG:
   case OPC_SHLG:
   case OPC_ANDG:
   /* Comparison ops zero-extend/truncate their results, so consider them as
    * unsigned here.
    */
   case OPC_CMPS_F:
   case OPC_CMPV_F:
   case OPC_CMPS_U:
   case OPC_CMPS_S:
      return TYPE_U32;

   /* signed ops: truncation or sign-extension: */
   case OPC_ADD_S:
   case OPC_SUB_S:
   case OPC_MIN_S:
   case OPC_MAX_S:
   case OPC_ABSNEG_S:
   case OPC_MUL_S24:
   case OPC_MAD_S24:
      return TYPE_S32;

   /* We assume that any move->move folding that could be done was done by
    * NIR.
    */
   case OPC_MOV:
   default:
      *can_fold = false;
      return TYPE_U32;
   }
}
1521
/* Return the src and dst types for the conversion which is already folded
 * into the op. We can assume that instr has folded in a conversion from
 * ir3_output_conv_src_type() to ir3_output_conv_dst_type(). Only makes sense
 * to call if ir3_output_conv_type() returns can_fold = true.
 */
static inline type_t
ir3_output_conv_src_type(struct ir3_instruction *instr, type_t base_type)
{
   switch (instr->opc) {
   case OPC_CMPS_F:
   case OPC_CMPV_F:
   case OPC_CMPS_U:
   case OPC_CMPS_S:
      /* Comparisons only return 0/1 and the size of the comparison sources
       * is irrelevant, never consider them as having an output conversion
       * by returning a type with the dest size here:
       */
      return (instr->dsts[0]->flags & IR3_REG_HALF) ? half_type(base_type)
                                                    : full_type(base_type);

   case OPC_BARY_F:
      /* bary.f doesn't have an explicit source, but we can assume here that
       * the varying data it reads is in fp32.
       *
       * This may be fp16 on older gen's depending on some register
       * settings, but it's probably not worth plumbing that through for a
       * small improvement that NIR would hopefully handle for us anyway.
       */
      return TYPE_F32;

   case OPC_FLAT_B:
      /* Treat the input data as u32 if not interpolating. */
      return TYPE_U32;

   default:
      /* Otherwise the src size is given by the first src register: */
      return (instr->srcs[0]->flags & IR3_REG_HALF) ? half_type(base_type)
                                                    : full_type(base_type);
   }
}
1561
1562 static inline type_t
ir3_output_conv_dst_type(struct ir3_instruction * instr,type_t base_type)1563 ir3_output_conv_dst_type(struct ir3_instruction *instr, type_t base_type)
1564 {
1565 return (instr->dsts[0]->flags & IR3_REG_HALF) ? half_type(base_type)
1566 : full_type(base_type);
1567 }
1568
/* Some instructions have signed/unsigned variants which are identical except
 * for whether the folded conversion sign-extends or zero-extends, and we can
 * fold in a mismatching move by rewriting the opcode. Return the opcode to
 * switch signedness, and whether one exists.
 */
static inline opc_t
ir3_try_swap_signedness(opc_t opc, bool *can_swap)
{
   /* NOTE(review): *can_swap is only written on the failure path — a
    * successful swap leaves it untouched, so callers are expected to
    * initialize it to true before calling.  Confirm against callers.
    */
   switch (opc) {
#define PAIR(u, s)                                                             \
   case OPC_##u:                                                               \
      return OPC_##s;                                                          \
   case OPC_##s:                                                               \
      return OPC_##u;
      PAIR(ADD_U, ADD_S)
      PAIR(SUB_U, SUB_S)
      /* Note: these are only identical when the sources are half, but that's
       * the only case we call this function for anyway.
       */
      PAIR(MUL_U24, MUL_S24)

   default:
      *can_swap = false;
      return opc;
   }
}
1595
/* bitmask with the low n bits set: */
#define MASK(n) ((1 << (n)) - 1)

/* iterator for an instructions's sources (reg), also returns src #: */
/* (skips NULL src slots; the outer single-iteration loop just scopes
 * the __srcreg declaration)
 */
#define foreach_src_n(__srcreg, __n, __instr)                                  \
   if ((__instr)->srcs_count)                                                  \
      for (struct ir3_register *__srcreg = (void *)~0; __srcreg;               \
           __srcreg = NULL)                                                    \
         for (unsigned __cnt = (__instr)->srcs_count, __n = 0; __n < __cnt;    \
              __n++)                                                           \
            if ((__srcreg = (__instr)->srcs[__n]))

/* iterator for an instructions's sources (reg): */
#define foreach_src(__srcreg, __instr) foreach_src_n (__srcreg, __i, __instr)

/* iterator for an instructions's destinations (reg), also returns dst #: */
/* (same structure as foreach_src_n, over the dsts[] array) */
#define foreach_dst_n(__dstreg, __n, __instr)                                  \
   if ((__instr)->dsts_count)                                                  \
      for (struct ir3_register *__dstreg = (void *)~0; __dstreg;               \
           __dstreg = NULL)                                                    \
         for (unsigned __cnt = (__instr)->dsts_count, __n = 0; __n < __cnt;    \
              __n++)                                                           \
            if ((__dstreg = (__instr)->dsts[__n]))

/* iterator for an instructions's destinations (reg): */
#define foreach_dst(__dstreg, __instr) foreach_dst_n (__dstreg, __i, __instr)
1621
1622 static inline unsigned
__ssa_src_cnt(struct ir3_instruction * instr)1623 __ssa_src_cnt(struct ir3_instruction *instr)
1624 {
1625 return instr->srcs_count + instr->deps_count;
1626 }
1627
1628 static inline bool
__is_false_dep(struct ir3_instruction * instr,unsigned n)1629 __is_false_dep(struct ir3_instruction *instr, unsigned n)
1630 {
1631 if (n >= instr->srcs_count)
1632 return true;
1633 return false;
1634 }
1635
/* Returns a pointer to the n'th SSA source instruction slot, where n
 * indexes the combined srcs + deps space (see __ssa_src_cnt()), or NULL
 * if the n'th src is not an SSA value.
 */
static inline struct ir3_instruction **
__ssa_srcp_n(struct ir3_instruction *instr, unsigned n)
{
   /* indices past srcs_count index into the deps[] array: */
   if (__is_false_dep(instr, n))
      return &instr->deps[n - instr->srcs_count];
   if (ssa(instr->srcs[n]))
      return &instr->srcs[n]->def->instr;
   return NULL;
}
1645
/* iterator over pointers to an instruction's SSA sources (srcs + false
 * deps), also returns src #:
 */
#define foreach_ssa_srcp_n(__srcp, __n, __instr)                               \
   for (struct ir3_instruction **__srcp = (void *)~0; __srcp; __srcp = NULL)   \
      for (unsigned __cnt = __ssa_src_cnt(__instr), __n = 0; __n < __cnt;      \
           __n++)                                                              \
         if ((__srcp = __ssa_srcp_n(__instr, __n)))

#define foreach_ssa_srcp(__srcp, __instr)                                      \
   foreach_ssa_srcp_n (__srcp, __i, __instr)

/* iterator for an instruction's SSA sources (instr), also returns src #: */
#define foreach_ssa_src_n(__srcinst, __n, __instr)                             \
   for (struct ir3_instruction *__srcinst = (void *)~0; __srcinst;             \
        __srcinst = NULL)                                                      \
      foreach_ssa_srcp_n (__srcp, __n, __instr)                                \
         if ((__srcinst = *__srcp))

/* iterator for an instruction's SSA sources (instr): */
#define foreach_ssa_src(__srcinst, __instr)                                    \
   foreach_ssa_src_n (__srcinst, __i, __instr)

/* iterators for shader inputs (skips NULL input slots): */
#define foreach_input_n(__ininstr, __cnt, __ir)                                \
   for (struct ir3_instruction *__ininstr = (void *)~0; __ininstr;             \
        __ininstr = NULL)                                                      \
      for (unsigned __cnt = 0; __cnt < (__ir)->inputs_count; __cnt++)          \
         if ((__ininstr = (__ir)->inputs[__cnt]))
#define foreach_input(__ininstr, __ir) foreach_input_n (__ininstr, __i, __ir)

/* iterators for instructions: */
#define foreach_instr(__instr, __list)                                         \
   list_for_each_entry (struct ir3_instruction, __instr, __list, node)
#define foreach_instr_rev(__instr, __list)                                     \
   list_for_each_entry_rev (struct ir3_instruction, __instr, __list, node)
#define foreach_instr_safe(__instr, __list)                                    \
   list_for_each_entry_safe (struct ir3_instruction, __instr, __list, node)
#define foreach_instr_from_safe(__instr, __start, __list)                      \
   list_for_each_entry_from_safe(struct ir3_instruction, __instr, __start,     \
                                 __list, node)

/* iterators for blocks: */
#define foreach_block(__block, __list)                                         \
   list_for_each_entry (struct ir3_block, __block, __list, node)
#define foreach_block_safe(__block, __list)                                    \
   list_for_each_entry_safe (struct ir3_block, __block, __list, node)
#define foreach_block_rev(__block, __list)                                     \
   list_for_each_entry_rev (struct ir3_block, __block, __list, node)

/* iterators for arrays: */
#define foreach_array(__array, __list)                                         \
   list_for_each_entry (struct ir3_array, __array, __list, node)
#define foreach_array_safe(__array, __list)                                    \
   list_for_each_entry_safe (struct ir3_array, __array, __list, node)

/* Run an ir3 pass, and on progress dump the result and re-validate.
 * Evaluates to whether the pass made progress.
 */
#define IR3_PASS(ir, pass, ...)                                                \
   ({                                                                          \
      bool progress = pass(ir, ##__VA_ARGS__);                                 \
      if (progress) {                                                          \
         ir3_debug_print(ir, "AFTER: " #pass);                                 \
         ir3_validate(ir);                                                     \
      }                                                                        \
      progress;                                                                \
   })
1708
/* validate: */
void ir3_validate(struct ir3 *ir);

/* dump: */
void ir3_print(struct ir3 *ir);
void ir3_print_instr(struct ir3_instruction *instr);

struct log_stream;
void ir3_print_instr_stream(struct log_stream *stream, struct ir3_instruction *instr);

/* delay calculation: */
int ir3_delayslots(struct ir3_instruction *assigner,
                   struct ir3_instruction *consumer, unsigned n, bool soft);
unsigned ir3_delayslots_with_repeat(struct ir3_instruction *assigner,
                                    struct ir3_instruction *consumer,
                                    unsigned assigner_n, unsigned consumer_n);
unsigned ir3_delay_calc(struct ir3_block *block,
                        struct ir3_instruction *instr, bool mergedregs);
1727
1728 /* estimated (ss)/(sy) delay calculation */
1729
1730 static inline bool
is_local_mem_load(struct ir3_instruction * instr)1731 is_local_mem_load(struct ir3_instruction *instr)
1732 {
1733 return instr->opc == OPC_LDL || instr->opc == OPC_LDLV ||
1734 instr->opc == OPC_LDLW;
1735 }
1736
1737 /* Does this instruction need (ss) to wait for its result? */
1738 static inline bool
is_ss_producer(struct ir3_instruction * instr)1739 is_ss_producer(struct ir3_instruction *instr)
1740 {
1741 foreach_dst (dst, instr) {
1742 if (dst->flags & IR3_REG_SHARED)
1743 return true;
1744 }
1745 return is_sfu(instr) || is_local_mem_load(instr);
1746 }
1747
1748 /* The soft delay for approximating the cost of (ss). */
1749 static inline unsigned
soft_ss_delay(struct ir3_instruction * instr)1750 soft_ss_delay(struct ir3_instruction *instr)
1751 {
1752 /* On a6xx, it takes the number of delay slots to get a SFU result back (ie.
1753 * using nop's instead of (ss) is:
1754 *
1755 * 8 - single warp
1756 * 9 - two warps
1757 * 10 - four warps
1758 *
1759 * and so on. Not quite sure where it tapers out (ie. how many warps share an
1760 * SFU unit). But 10 seems like a reasonable # to choose:
1761 */
1762 if (is_sfu(instr) || is_local_mem_load(instr))
1763 return 10;
1764
1765 /* The blob adds 6 nops between shared producers and consumers, and before we
1766 * used (ss) this was sufficient in most cases.
1767 */
1768 return 6;
1769 }
1770
1771 static inline bool
is_sy_producer(struct ir3_instruction * instr)1772 is_sy_producer(struct ir3_instruction *instr)
1773 {
1774 return is_tex_or_prefetch(instr) ||
1775 (is_load(instr) && !is_local_mem_load(instr)) ||
1776 is_atomic(instr->opc);
1777 }
1778
/* The soft delay for approximating the cost of (sy), based on the
 * instruction kind and number of result components.
 */
static inline unsigned
soft_sy_delay(struct ir3_instruction *instr, struct ir3 *shader)
{
   /* TODO: this is just an optimistic guess, we can do better post-RA.
    */
   bool double_wavesize =
      shader->type == MESA_SHADER_FRAGMENT ||
      shader->type == MESA_SHADER_COMPUTE;

   unsigned components = reg_elems(instr->dsts[0]);

   /* These numbers come from counting the number of delay slots to get
    * cat5/cat6 results back using nops instead of (sy). Note that these numbers
    * are with the result preloaded to cache by loading it before in the same
    * shader - uncached results are much larger.
    *
    * Note: most ALU instructions can't complete at the full doubled rate, so
    * they take 2 cycles. The only exception is fp16 instructions with no
    * built-in conversions. Therefore divide the latency by 2.
    *
    * TODO: Handle this properly in the scheduler and remove this.
    */
   if (instr->opc == OPC_LDC) {
      if (double_wavesize)
         return (21 + 8 * components) / 2;
      else
         return 18 + 4 * components;
   } else if (is_tex_or_prefetch(instr)) {
      if (double_wavesize) {
         switch (components) {
         case 1: return 58 / 2;
         case 2: return 60 / 2;
         case 3: return 77 / 2;
         case 4: return 79 / 2;
         default: unreachable("bad number of components");
         }
      } else {
         switch (components) {
         case 1: return 51;
         case 2: return 53;
         case 3: return 62;
         case 4: return 64;
         default: unreachable("bad number of components");
         }
      }
   } else {
      /* TODO: measure other cat6 opcodes like ldg */
      if (double_wavesize)
         return (172 + components) / 2;
      else
         return 109 + components;
   }
}
1832
1833
/* unreachable block elimination: */
bool ir3_remove_unreachable(struct ir3 *ir);

/* dead code elimination: */
struct ir3_shader_variant;
bool ir3_dce(struct ir3 *ir, struct ir3_shader_variant *so);

/* fp16 conversion folding */
bool ir3_cf(struct ir3 *ir);

/* copy-propagate: */
bool ir3_cp(struct ir3 *ir, struct ir3_shader_variant *so);
bool ir3_cp_postsched(struct ir3 *ir);

/* common subexpression elimination: */
bool ir3_cse(struct ir3 *ir);

/* Make arrays SSA */
bool ir3_array_to_ssa(struct ir3 *ir);

/* scheduling: */
bool ir3_sched_add_deps(struct ir3 *ir);
int ir3_sched(struct ir3 *ir);

struct ir3_context;
bool ir3_postsched(struct ir3 *ir, struct ir3_shader_variant *v);

/* register assignment: */
int ir3_ra(struct ir3_shader_variant *v);

/* lower subgroup ops: */
bool ir3_lower_subgroups(struct ir3 *ir);

/* legalize: */
bool ir3_legalize(struct ir3 *ir, struct ir3_shader_variant *so, int *max_bary);
1869
1870 static inline bool
ir3_has_latency_to_hide(struct ir3 * ir)1871 ir3_has_latency_to_hide(struct ir3 *ir)
1872 {
1873 /* VS/GS/TCS/TESS co-exist with frag shader invocations, but we don't
1874 * know the nature of the fragment shader. Just assume it will have
1875 * latency to hide:
1876 */
1877 if (ir->type != MESA_SHADER_FRAGMENT)
1878 return true;
1879
1880 foreach_block (block, &ir->block_list) {
1881 foreach_instr (instr, &block->instr_list) {
1882 if (is_tex_or_prefetch(instr))
1883 return true;
1884
1885 if (is_load(instr)) {
1886 switch (instr->opc) {
1887 case OPC_LDLV:
1888 case OPC_LDL:
1889 case OPC_LDLW:
1890 break;
1891 default:
1892 return true;
1893 }
1894 }
1895 }
1896 }
1897
1898 return false;
1899 }
1900
1901 /* ************************************************************************* */
1902 /* instruction helpers */
1903
/* creates SSA src of correct type (ie. half vs full precision) */
static inline struct ir3_register *
__ssa_src(struct ir3_instruction *instr, struct ir3_instruction *src,
          unsigned flags)
{
   struct ir3_register *reg;
   /* half-ness is inherited from the src instruction's dst: */
   if (src->dsts[0]->flags & IR3_REG_HALF)
      flags |= IR3_REG_HALF;
   reg = ir3_src_create(instr, INVALID_REG, IR3_REG_SSA | flags);
   /* link the new src back to its defining dst, and mirror the
    * writemask so the src covers the same components:
    */
   reg->def = src->dsts[0];
   reg->wrmask = src->dsts[0]->wrmask;
   return reg;
}
1917
/* creates a new SSA dst register, wired back to its instruction */
static inline struct ir3_register *
__ssa_dst(struct ir3_instruction *instr)
{
   struct ir3_register *reg = ir3_dst_create(instr, INVALID_REG, IR3_REG_SSA);
   reg->instr = instr;
   return reg;
}
1925
1926 static inline struct ir3_instruction *
create_immed_typed(struct ir3_block * block,uint32_t val,type_t type)1927 create_immed_typed(struct ir3_block *block, uint32_t val, type_t type)
1928 {
1929 struct ir3_instruction *mov;
1930 unsigned flags = (type_size(type) < 32) ? IR3_REG_HALF : 0;
1931
1932 mov = ir3_instr_create(block, OPC_MOV, 1, 1);
1933 mov->cat1.src_type = type;
1934 mov->cat1.dst_type = type;
1935 __ssa_dst(mov)->flags |= flags;
1936 ir3_src_create(mov, 0, IR3_REG_IMMED | flags)->uim_val = val;
1937
1938 return mov;
1939 }
1940
/* Convenience wrapper: untyped immediates default to u32. */
static inline struct ir3_instruction *
create_immed(struct ir3_block *block, uint32_t val)
{
   return create_immed_typed(block, val, TYPE_U32);
}
1946
1947 static inline struct ir3_instruction *
create_uniform_typed(struct ir3_block * block,unsigned n,type_t type)1948 create_uniform_typed(struct ir3_block *block, unsigned n, type_t type)
1949 {
1950 struct ir3_instruction *mov;
1951 unsigned flags = (type_size(type) < 32) ? IR3_REG_HALF : 0;
1952
1953 mov = ir3_instr_create(block, OPC_MOV, 1, 1);
1954 mov->cat1.src_type = type;
1955 mov->cat1.dst_type = type;
1956 __ssa_dst(mov)->flags |= flags;
1957 ir3_src_create(mov, n, IR3_REG_CONST | flags);
1958
1959 return mov;
1960 }
1961
/* Convenience wrapper: untyped uniform reads default to f32. */
static inline struct ir3_instruction *
create_uniform(struct ir3_block *block, unsigned n)
{
   return create_uniform_typed(block, n, TYPE_F32);
}
1967
1968 static inline struct ir3_instruction *
create_uniform_indirect(struct ir3_block * block,int n,type_t type,struct ir3_instruction * address)1969 create_uniform_indirect(struct ir3_block *block, int n, type_t type,
1970 struct ir3_instruction *address)
1971 {
1972 struct ir3_instruction *mov;
1973
1974 mov = ir3_instr_create(block, OPC_MOV, 1, 1);
1975 mov->cat1.src_type = type;
1976 mov->cat1.dst_type = type;
1977 __ssa_dst(mov);
1978 ir3_src_create(mov, 0, IR3_REG_CONST | IR3_REG_RELATIV)->array.offset = n;
1979
1980 ir3_instr_set_address(mov, address);
1981
1982 return mov;
1983 }
1984
/* Emit a typed mov of an SSA value, propagating array-ness (or
 * shared-ness) from the src.
 */
static inline struct ir3_instruction *
ir3_MOV(struct ir3_block *block, struct ir3_instruction *src, type_t type)
{
   struct ir3_instruction *instr = ir3_instr_create(block, OPC_MOV, 1, 1);
   /* sub-32-bit types use half registers: */
   unsigned flags = (type_size(type) < 32) ? IR3_REG_HALF : 0;

   __ssa_dst(instr)->flags |= flags;
   if (src->dsts[0]->flags & IR3_REG_ARRAY) {
      /* array srcs also need the array info copied over: */
      struct ir3_register *src_reg = __ssa_src(instr, src, IR3_REG_ARRAY);
      src_reg->array = src->dsts[0]->array;
   } else {
      __ssa_src(instr, src, src->dsts[0]->flags & IR3_REG_SHARED);
   }
   /* relative srcs are not supported here: */
   debug_assert(!(src->dsts[0]->flags & IR3_REG_RELATIV));
   instr->cat1.src_type = type;
   instr->cat1.dst_type = type;
   return instr;
}
2003
/* Emit a converting mov (cov) from src_type to dst_type. */
static inline struct ir3_instruction *
ir3_COV(struct ir3_block *block, struct ir3_instruction *src, type_t src_type,
        type_t dst_type)
{
   struct ir3_instruction *instr = ir3_instr_create(block, OPC_MOV, 1, 1);
   /* sub-32-bit types use half registers: */
   unsigned dst_flags = (type_size(dst_type) < 32) ? IR3_REG_HALF : 0;
   unsigned src_flags = (type_size(src_type) < 32) ? IR3_REG_HALF : 0;

   /* the src value's half-ness must match the requested src type: */
   debug_assert((src->dsts[0]->flags & IR3_REG_HALF) == src_flags);

   __ssa_dst(instr)->flags |= dst_flags;
   __ssa_src(instr, src, 0);
   instr->cat1.src_type = src_type;
   instr->cat1.dst_type = dst_type;
   /* array srcs are not supported here (use ir3_MOV): */
   debug_assert(!(src->dsts[0]->flags & IR3_REG_ARRAY));
   return instr;
}
2021
2022 static inline struct ir3_instruction *
ir3_MOVMSK(struct ir3_block * block,unsigned components)2023 ir3_MOVMSK(struct ir3_block *block, unsigned components)
2024 {
2025 struct ir3_instruction *instr = ir3_instr_create(block, OPC_MOVMSK, 1, 0);
2026
2027 struct ir3_register *dst = __ssa_dst(instr);
2028 dst->flags |= IR3_REG_SHARED;
2029 dst->wrmask = (1 << components) - 1;
2030 instr->repeat = components - 1;
2031 return instr;
2032 }
2033
2034 static inline struct ir3_instruction *
ir3_BALLOT_MACRO(struct ir3_block * block,struct ir3_instruction * src,unsigned components)2035 ir3_BALLOT_MACRO(struct ir3_block *block, struct ir3_instruction *src,
2036 unsigned components)
2037 {
2038 struct ir3_instruction *instr =
2039 ir3_instr_create(block, OPC_BALLOT_MACRO, 1, 1);
2040
2041 struct ir3_register *dst = __ssa_dst(instr);
2042 dst->flags |= IR3_REG_SHARED;
2043 dst->wrmask = (1 << components) - 1;
2044
2045 __ssa_src(instr, src, 0);
2046
2047 return instr;
2048 }
2049
/* Emit a nop (no dsts, no srcs). */
static inline struct ir3_instruction *
ir3_NOP(struct ir3_block *block)
{
   return ir3_instr_create(block, OPC_NOP, 0, 0);
}
2055
2056 /* clang-format off */
2057 #define __INSTR0(flag, name, opc) \
2058 static inline struct ir3_instruction *ir3_##name(struct ir3_block *block) \
2059 { \
2060 struct ir3_instruction *instr = ir3_instr_create(block, opc, 1, 0); \
2061 instr->flags |= flag; \
2062 return instr; \
2063 }
2064 /* clang-format on */
2065 #define INSTR0F(f, name) __INSTR0(IR3_INSTR_##f, name##_##f, OPC_##name)
2066 #define INSTR0(name) __INSTR0(0, name, OPC_##name)
2067
/* clang-format off */
/* One-src builder: defines ir3_<name>(block, a, aflags).
 * dst_count of 0 (the *NODST variants) emits no destination register.
 */
#define __INSTR1(flag, dst_count, name, opc)                                 \
static inline struct ir3_instruction *ir3_##name(                           \
   struct ir3_block *block, struct ir3_instruction *a, unsigned aflags)     \
{                                                                            \
   struct ir3_instruction *instr =                                           \
      ir3_instr_create(block, opc, dst_count, 1);                            \
   for (unsigned i = 0; i < dst_count; i++)                                  \
      __ssa_dst(instr);                                                      \
   __ssa_src(instr, a, aflags);                                              \
   instr->flags |= flag;                                                     \
   return instr;                                                             \
}
/* clang-format on */
#define INSTR1F(f, name) __INSTR1(IR3_INSTR_##f, 1, name##_##f, OPC_##name)
#define INSTR1(name) __INSTR1(0, 1, name, OPC_##name)
#define INSTR1NODST(name) __INSTR1(0, 0, name, OPC_##name)
2085
/* clang-format off */
/* Two-src builder: defines ir3_<name>(block, a, aflags, b, bflags). */
#define __INSTR2(flag, dst_count, name, opc)                                 \
static inline struct ir3_instruction *ir3_##name(                           \
   struct ir3_block *block, struct ir3_instruction *a, unsigned aflags,     \
   struct ir3_instruction *b, unsigned bflags)                               \
{                                                                            \
   struct ir3_instruction *instr = ir3_instr_create(block, opc, dst_count, 2); \
   for (unsigned i = 0; i < dst_count; i++)                                  \
      __ssa_dst(instr);                                                      \
   __ssa_src(instr, a, aflags);                                              \
   __ssa_src(instr, b, bflags);                                              \
   instr->flags |= flag;                                                     \
   return instr;                                                             \
}
/* clang-format on */
#define INSTR2F(f, name) __INSTR2(IR3_INSTR_##f, 1, name##_##f, OPC_##name)
#define INSTR2(name) __INSTR2(0, 1, name, OPC_##name)
#define INSTR2NODST(name) __INSTR2(0, 0, name, OPC_##name)
2104
/* clang-format off */
/* Three-src builder: defines ir3_<name>(block, a.., b.., c..). */
#define __INSTR3(flag, dst_count, name, opc)                                 \
static inline struct ir3_instruction *ir3_##name(                           \
   struct ir3_block *block, struct ir3_instruction *a, unsigned aflags,     \
   struct ir3_instruction *b, unsigned bflags, struct ir3_instruction *c,   \
   unsigned cflags)                                                          \
{                                                                            \
   struct ir3_instruction *instr =                                           \
      ir3_instr_create(block, opc, dst_count, 3);                            \
   for (unsigned i = 0; i < dst_count; i++)                                  \
      __ssa_dst(instr);                                                      \
   __ssa_src(instr, a, aflags);                                              \
   __ssa_src(instr, b, bflags);                                              \
   __ssa_src(instr, c, cflags);                                              \
   instr->flags |= flag;                                                     \
   return instr;                                                             \
}
/* clang-format on */
#define INSTR3F(f, name) __INSTR3(IR3_INSTR_##f, 1, name##_##f, OPC_##name)
#define INSTR3(name) __INSTR3(0, 1, name, OPC_##name)
#define INSTR3NODST(name) __INSTR3(0, 0, name, OPC_##name)
2126
/* clang-format off */
/* Four-src builder: defines ir3_<name>(block, a.., b.., c.., d..). */
#define __INSTR4(flag, dst_count, name, opc)                                 \
static inline struct ir3_instruction *ir3_##name(                           \
   struct ir3_block *block, struct ir3_instruction *a, unsigned aflags,     \
   struct ir3_instruction *b, unsigned bflags, struct ir3_instruction *c,   \
   unsigned cflags, struct ir3_instruction *d, unsigned dflags)              \
{                                                                            \
   struct ir3_instruction *instr =                                           \
      ir3_instr_create(block, opc, dst_count, 4);                            \
   for (unsigned i = 0; i < dst_count; i++)                                  \
      __ssa_dst(instr);                                                      \
   __ssa_src(instr, a, aflags);                                              \
   __ssa_src(instr, b, bflags);                                              \
   __ssa_src(instr, c, cflags);                                              \
   __ssa_src(instr, d, dflags);                                              \
   instr->flags |= flag;                                                     \
   return instr;                                                             \
}
/* clang-format on */
#define INSTR4F(f, name) __INSTR4(IR3_INSTR_##f, 1, name##_##f, OPC_##name)
#define INSTR4(name) __INSTR4(0, 1, name, OPC_##name)
#define INSTR4NODST(name) __INSTR4(0, 0, name, OPC_##name)
2149
/* clang-format off */
/* Five-src builder: always exactly one dst (no dst_count parameter,
 * unlike __INSTR1..4/6 — there is no INSTR5NODST user).
 */
#define __INSTR5(flag, name, opc)                                            \
static inline struct ir3_instruction *ir3_##name(                           \
   struct ir3_block *block, struct ir3_instruction *a, unsigned aflags,     \
   struct ir3_instruction *b, unsigned bflags, struct ir3_instruction *c,   \
   unsigned cflags, struct ir3_instruction *d, unsigned dflags,             \
   struct ir3_instruction *e, unsigned eflags)                               \
{                                                                            \
   struct ir3_instruction *instr = ir3_instr_create(block, opc, 1, 5);      \
   __ssa_dst(instr);                                                         \
   __ssa_src(instr, a, aflags);                                              \
   __ssa_src(instr, b, bflags);                                              \
   __ssa_src(instr, c, cflags);                                              \
   __ssa_src(instr, d, dflags);                                              \
   __ssa_src(instr, e, eflags);                                              \
   instr->flags |= flag;                                                     \
   return instr;                                                             \
}
/* clang-format on */
#define INSTR5F(f, name) __INSTR5(IR3_INSTR_##f, name##_##f, OPC_##name)
#define INSTR5(name) __INSTR5(0, name, OPC_##name)
2171
/* clang-format off */
/* Six-src builder: defines ir3_<name>(block, a..f with per-src flags).
 * dst_count selects how many ssa dsts are emitted (0 for the *NODST
 * variant). Pass dst_count through to ir3_instr_create() so the dst
 * allocation matches the number of __ssa_dst() calls, consistent with
 * __INSTR1..__INSTR4 (previously this hardcoded 1, over-allocating an
 * unused dst slot for INSTR6NODST).
 */
#define __INSTR6(flag, dst_count, name, opc)                                 \
static inline struct ir3_instruction *ir3_##name(                           \
   struct ir3_block *block, struct ir3_instruction *a, unsigned aflags,     \
   struct ir3_instruction *b, unsigned bflags, struct ir3_instruction *c,   \
   unsigned cflags, struct ir3_instruction *d, unsigned dflags,             \
   struct ir3_instruction *e, unsigned eflags, struct ir3_instruction *f,   \
   unsigned fflags)                                                          \
{                                                                            \
   struct ir3_instruction *instr =                                           \
      ir3_instr_create(block, opc, dst_count, 6);                            \
   for (unsigned i = 0; i < dst_count; i++)                                  \
      __ssa_dst(instr);                                                      \
   __ssa_src(instr, a, aflags);                                              \
   __ssa_src(instr, b, bflags);                                              \
   __ssa_src(instr, c, cflags);                                              \
   __ssa_src(instr, d, dflags);                                              \
   __ssa_src(instr, e, eflags);                                              \
   __ssa_src(instr, f, fflags);                                              \
   instr->flags |= flag;                                                     \
   return instr;                                                             \
}
/* clang-format on */
#define INSTR6F(f, name) __INSTR6(IR3_INSTR_##f, 1, name##_##f, OPC_##name)
#define INSTR6(name) __INSTR6(0, 1, name, OPC_##name)
#define INSTR6NODST(name) __INSTR6(0, 0, name, OPC_##name)
2197
/* cat0 instructions (flow control / sync — each line expands, via the
 * INSTRn macros above, into an ir3_<NAME>() builder):
 */
INSTR1NODST(B)
INSTR0(JUMP)
INSTR1NODST(KILL)
INSTR1NODST(DEMOTE)
INSTR0(END)
INSTR0(CHSH)
INSTR0(CHMASK)
INSTR1NODST(PREDT)
INSTR0(PREDF)
INSTR0(PREDE)
INSTR0(GETONE)
INSTR0(SHPS)
INSTR0(SHPE)

/* cat1 macros (lowered later by the compiler): */
INSTR1(ANY_MACRO)
INSTR1(ALL_MACRO)
INSTR1(READ_FIRST_MACRO)
INSTR2(READ_COND_MACRO)
2218
2219 static inline struct ir3_instruction *
2220 ir3_ELECT_MACRO(struct ir3_block *block)
2221 {
2222 struct ir3_instruction *instr =
2223 ir3_instr_create(block, OPC_ELECT_MACRO, 1, 0);
2224 __ssa_dst(instr);
2225 return instr;
2226 }
2227
2228 static inline struct ir3_instruction *
ir3_SHPS_MACRO(struct ir3_block * block)2229 ir3_SHPS_MACRO(struct ir3_block *block)
2230 {
2231 struct ir3_instruction *instr =
2232 ir3_instr_create(block, OPC_SHPS_MACRO, 1, 0);
2233 __ssa_dst(instr);
2234 return instr;
2235 }
2236
/* cat2 instructions (ALU), most 2 src but some 1 src: */
INSTR2(ADD_F)
INSTR2(MIN_F)
INSTR2(MAX_F)
INSTR2(MUL_F)
INSTR1(SIGN_F)
INSTR2(CMPS_F)
INSTR1(ABSNEG_F)
INSTR2(CMPV_F)
INSTR1(FLOOR_F)
INSTR1(CEIL_F)
INSTR1(RNDNE_F)
INSTR1(RNDAZ_F)
INSTR1(TRUNC_F)
INSTR2(ADD_U)
INSTR2(ADD_S)
INSTR2(SUB_U)
INSTR2(SUB_S)
INSTR2(CMPS_U)
INSTR2(CMPS_S)
INSTR2(MIN_U)
INSTR2(MIN_S)
INSTR2(MAX_U)
INSTR2(MAX_S)
INSTR1(ABSNEG_S)
INSTR2(AND_B)
INSTR2(OR_B)
INSTR1(NOT_B)
INSTR2(XOR_B)
INSTR2(CMPV_U)
INSTR2(CMPV_S)
INSTR2(MUL_U24)
INSTR2(MUL_S24)
INSTR2(MULL_U)
INSTR1(BFREV_B)
INSTR1(CLZ_S)
INSTR1(CLZ_B)
INSTR2(SHL_B)
INSTR2(SHR_B)
INSTR2(ASHR_B)
INSTR2(BARY_F)
INSTR2(FLAT_B)
INSTR2(MGEN_B)
INSTR2(GETBIT_B)
INSTR1(SETRM)
INSTR1(CBITS_B)
INSTR2(SHB)
INSTR2(MSAD)

/* cat3 instructions (three-src ALU, mostly multiply-add variants): */
INSTR3(MAD_U16)
INSTR3(MADSH_U16)
INSTR3(MAD_S16)
INSTR3(MADSH_M16)
INSTR3(MAD_U24)
INSTR3(MAD_S24)
INSTR3(MAD_F16)
INSTR3(MAD_F32)
INSTR3(DP2ACC)
INSTR3(DP4ACC)
/* NOTE: SEL_B32 checks for zero vs nonzero */
INSTR3(SEL_B16)
INSTR3(SEL_B32)
INSTR3(SEL_S16)
INSTR3(SEL_S32)
INSTR3(SEL_F16)
INSTR3(SEL_F32)
INSTR3(SAD_S16)
INSTR3(SAD_S32)

/* cat4 instructions (transcendental): */
INSTR1(RCP)
INSTR1(RSQ)
INSTR1(HRSQ)
INSTR1(LOG2)
INSTR1(HLOG2)
INSTR1(EXP2)
INSTR1(HEXP2)
INSTR1(SIN)
INSTR1(COS)
INSTR1(SQRT)

/* cat5 instructions (derivatives etc. — ir3_SAM below covers texturing): */
INSTR1(DSX)
INSTR1(DSXPP_MACRO)
INSTR1(DSY)
INSTR1(DSYPP_MACRO)
INSTR1F(3D, DSX)
INSTR1F(3D, DSY)
INSTR1(RGETPOS)
2327
2328 static inline struct ir3_instruction *
2329 ir3_SAM(struct ir3_block *block, opc_t opc, type_t type, unsigned wrmask,
2330 unsigned flags, struct ir3_instruction *samp_tex,
2331 struct ir3_instruction *src0, struct ir3_instruction *src1)
2332 {
2333 struct ir3_instruction *sam;
2334 unsigned nreg = 0;
2335
2336 if (flags & IR3_INSTR_S2EN) {
2337 nreg++;
2338 }
2339 if (src0) {
2340 nreg++;
2341 }
2342 if (src1) {
2343 nreg++;
2344 }
2345
2346 sam = ir3_instr_create(block, opc, 1, nreg);
2347 sam->flags |= flags;
2348 __ssa_dst(sam)->wrmask = wrmask;
2349 if (flags & IR3_INSTR_S2EN) {
2350 __ssa_src(sam, samp_tex, (flags & IR3_INSTR_B) ? 0 : IR3_REG_HALF);
2351 }
2352 if (src0) {
2353 __ssa_src(sam, src0, 0);
2354 }
2355 if (src1) {
2356 __ssa_src(sam, src1, 0);
2357 }
2358 sam->cat5.type = type;
2359
2360 return sam;
2361 }
2362
/* cat6 instructions (memory loads/stores/atomics).
 * The GPU-generation-specific variants are selected at compile time via
 * the GPU macro (a6xx+ global/bindless atomics vs a4xx/a5xx SSBO forms).
 */
INSTR0(GETFIBERID)
INSTR2(LDLV)
INSTR3(LDG)
INSTR3(LDL)
INSTR3(LDLW)
INSTR3(LDP)
INSTR4NODST(STG)
INSTR3NODST(STL)
INSTR3NODST(STLW)
INSTR3NODST(STP)
INSTR1(RESINFO)
INSTR1(RESFMT)
INSTR2(ATOMIC_ADD)
INSTR2(ATOMIC_SUB)
INSTR2(ATOMIC_XCHG)
INSTR2(ATOMIC_INC)
INSTR2(ATOMIC_DEC)
INSTR2(ATOMIC_CMPXCHG)
INSTR2(ATOMIC_MIN)
INSTR2(ATOMIC_MAX)
INSTR2(ATOMIC_AND)
INSTR2(ATOMIC_OR)
INSTR2(ATOMIC_XOR)
INSTR2(LDC)
INSTR2(QUAD_SHUFFLE_BRCST)
INSTR1(QUAD_SHUFFLE_HORIZ)
INSTR1(QUAD_SHUFFLE_VERT)
INSTR1(QUAD_SHUFFLE_DIAG)
INSTR2NODST(LDC_K)
INSTR2NODST(STC)
#if GPU >= 600
INSTR3NODST(STIB);
INSTR2(LDIB);
INSTR5(LDG_A);
INSTR6NODST(STG_A);
INSTR2(ATOMIC_G_ADD)
INSTR2(ATOMIC_G_SUB)
INSTR2(ATOMIC_G_XCHG)
INSTR2(ATOMIC_G_INC)
INSTR2(ATOMIC_G_DEC)
INSTR2(ATOMIC_G_CMPXCHG)
INSTR2(ATOMIC_G_MIN)
INSTR2(ATOMIC_G_MAX)
INSTR2(ATOMIC_G_AND)
INSTR2(ATOMIC_G_OR)
INSTR2(ATOMIC_G_XOR)
INSTR3(ATOMIC_B_ADD)
INSTR3(ATOMIC_B_SUB)
INSTR3(ATOMIC_B_XCHG)
INSTR3(ATOMIC_B_INC)
INSTR3(ATOMIC_B_DEC)
INSTR3(ATOMIC_B_CMPXCHG)
INSTR3(ATOMIC_B_MIN)
INSTR3(ATOMIC_B_MAX)
INSTR3(ATOMIC_B_AND)
INSTR3(ATOMIC_B_OR)
INSTR3(ATOMIC_B_XOR)
#elif GPU >= 400
INSTR3(LDGB)
#if GPU >= 500
INSTR3(LDIB)
#endif
INSTR4NODST(STGB)
INSTR4NODST(STIB)
INSTR4(ATOMIC_S_ADD)
INSTR4(ATOMIC_S_SUB)
INSTR4(ATOMIC_S_XCHG)
INSTR4(ATOMIC_S_INC)
INSTR4(ATOMIC_S_DEC)
INSTR4(ATOMIC_S_CMPXCHG)
INSTR4(ATOMIC_S_MIN)
INSTR4(ATOMIC_S_MAX)
INSTR4(ATOMIC_S_AND)
INSTR4(ATOMIC_S_OR)
INSTR4(ATOMIC_S_XOR)
#endif

/* cat7 instructions (barriers/fences): */
INSTR0(BAR)
INSTR0(FENCE)
2444
/* ************************************************************************* */
#include "bitset.h"

/* Number of registers tracked per register-file window. */
#define MAX_REG 256

/* 2 * MAX_REG bits: either two separate windows (full regs then half regs,
 * pre-a6xx) or one merged file tracked at half-reg granularity (a6xx+).
 */
typedef BITSET_DECLARE(regmaskstate_t, 2 * MAX_REG);

typedef struct {
   bool mergedregs; /* a6xx+ merged half/full register file? */
   regmaskstate_t mask;
} regmask_t;
2456
2457 static inline bool
__regmask_get(regmask_t * regmask,bool half,unsigned n)2458 __regmask_get(regmask_t *regmask, bool half, unsigned n)
2459 {
2460 if (regmask->mergedregs) {
2461 /* a6xx+ case, with merged register file, we track things in terms
2462 * of half-precision registers, with a full precisions register
2463 * using two half-precision slots.
2464 *
2465 * Pretend that special regs (a0.x, a1.x, etc.) are full registers to
2466 * avoid having them alias normal full regs.
2467 */
2468 if (half && !is_reg_num_special(n)) {
2469 return BITSET_TEST(regmask->mask, n);
2470 } else {
2471 n *= 2;
2472 return BITSET_TEST(regmask->mask, n) ||
2473 BITSET_TEST(regmask->mask, n + 1);
2474 }
2475 } else {
2476 /* pre a6xx case, with separate register file for half and full
2477 * precision:
2478 */
2479 if (half)
2480 n += MAX_REG;
2481 return BITSET_TEST(regmask->mask, n);
2482 }
2483 }
2484
2485 static inline void
__regmask_set(regmask_t * regmask,bool half,unsigned n)2486 __regmask_set(regmask_t *regmask, bool half, unsigned n)
2487 {
2488 if (regmask->mergedregs) {
2489 /* a6xx+ case, with merged register file, we track things in terms
2490 * of half-precision registers, with a full precisions register
2491 * using two half-precision slots:
2492 */
2493 if (half && !is_reg_num_special(n)) {
2494 BITSET_SET(regmask->mask, n);
2495 } else {
2496 n *= 2;
2497 BITSET_SET(regmask->mask, n);
2498 BITSET_SET(regmask->mask, n + 1);
2499 }
2500 } else {
2501 /* pre a6xx case, with separate register file for half and full
2502 * precision:
2503 */
2504 if (half)
2505 n += MAX_REG;
2506 BITSET_SET(regmask->mask, n);
2507 }
2508 }
2509
2510 static inline void
__regmask_clear(regmask_t * regmask,bool half,unsigned n)2511 __regmask_clear(regmask_t *regmask, bool half, unsigned n)
2512 {
2513 if (regmask->mergedregs) {
2514 /* a6xx+ case, with merged register file, we track things in terms
2515 * of half-precision registers, with a full precisions register
2516 * using two half-precision slots:
2517 */
2518 if (half && !is_reg_num_special(n)) {
2519 BITSET_CLEAR(regmask->mask, n);
2520 } else {
2521 n *= 2;
2522 BITSET_CLEAR(regmask->mask, n);
2523 BITSET_CLEAR(regmask->mask, n + 1);
2524 }
2525 } else {
2526 /* pre a6xx case, with separate register file for half and full
2527 * precision:
2528 */
2529 if (half)
2530 n += MAX_REG;
2531 BITSET_CLEAR(regmask->mask, n);
2532 }
2533 }
2534
2535 static inline void
regmask_init(regmask_t * regmask,bool mergedregs)2536 regmask_init(regmask_t *regmask, bool mergedregs)
2537 {
2538 memset(®mask->mask, 0, sizeof(regmask->mask));
2539 regmask->mergedregs = mergedregs;
2540 }
2541
2542 static inline void
regmask_or(regmask_t * dst,regmask_t * a,regmask_t * b)2543 regmask_or(regmask_t *dst, regmask_t *a, regmask_t *b)
2544 {
2545 assert(dst->mergedregs == a->mergedregs);
2546 assert(dst->mergedregs == b->mergedregs);
2547
2548 for (unsigned i = 0; i < ARRAY_SIZE(dst->mask); i++)
2549 dst->mask[i] = a->mask[i] | b->mask[i];
2550 }
2551
/* dst = a | (b restricted to the shared registers).
 *
 * NOTE(review): the ranges below select bits for regs 48..55 (4 components
 * each, doubled when 'b' uses merged half-reg granularity) — presumably
 * the shared-register window r48.x..r55.w; confirm against the ISA docs.
 */
static inline void
regmask_or_shared(regmask_t *dst, regmask_t *a, regmask_t *b)
{
   regmaskstate_t shared_mask;
   BITSET_ZERO(shared_mask);

   if (b->mergedregs) {
      /* merged file: two half-reg slots per full register */
      BITSET_SET_RANGE(shared_mask, 2 * 4 * 48, 2 * 4 * 56 - 1);
   } else {
      BITSET_SET_RANGE(shared_mask, 4 * 48, 4 * 56 - 1);
   }

   for (unsigned i = 0; i < ARRAY_SIZE(dst->mask); i++)
      dst->mask[i] = a->mask[i] | (b->mask[i] & shared_mask[i]);
}
2567
2568 static inline void
regmask_set(regmask_t * regmask,struct ir3_register * reg)2569 regmask_set(regmask_t *regmask, struct ir3_register *reg)
2570 {
2571 bool half = reg->flags & IR3_REG_HALF;
2572 if (reg->flags & IR3_REG_RELATIV) {
2573 for (unsigned i = 0; i < reg->size; i++)
2574 __regmask_set(regmask, half, reg->array.base + i);
2575 } else {
2576 for (unsigned mask = reg->wrmask, n = reg->num; mask; mask >>= 1, n++)
2577 if (mask & 1)
2578 __regmask_set(regmask, half, n);
2579 }
2580 }
2581
2582 static inline void
regmask_clear(regmask_t * regmask,struct ir3_register * reg)2583 regmask_clear(regmask_t *regmask, struct ir3_register *reg)
2584 {
2585 bool half = reg->flags & IR3_REG_HALF;
2586 if (reg->flags & IR3_REG_RELATIV) {
2587 for (unsigned i = 0; i < reg->size; i++)
2588 __regmask_clear(regmask, half, reg->array.base + i);
2589 } else {
2590 for (unsigned mask = reg->wrmask, n = reg->num; mask; mask >>= 1, n++)
2591 if (mask & 1)
2592 __regmask_clear(regmask, half, n);
2593 }
2594 }
2595
2596 static inline bool
regmask_get(regmask_t * regmask,struct ir3_register * reg)2597 regmask_get(regmask_t *regmask, struct ir3_register *reg)
2598 {
2599 bool half = reg->flags & IR3_REG_HALF;
2600 if (reg->flags & IR3_REG_RELATIV) {
2601 for (unsigned i = 0; i < reg->size; i++)
2602 if (__regmask_get(regmask, half, reg->array.base + i))
2603 return true;
2604 } else {
2605 for (unsigned mask = reg->wrmask, n = reg->num; mask; mask >>= 1, n++)
2606 if (mask & 1)
2607 if (__regmask_get(regmask, half, n))
2608 return true;
2609 }
2610 return false;
2611 }
2612 /* ************************************************************************* */
2613
2614 #endif /* IR3_H_ */
2615