1 /*
2  * Copyright (c) 2013 Rob Clark <robdclark@gmail.com>
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21  * SOFTWARE.
22  */
23 
24 #ifndef IR3_H_
25 #define IR3_H_
26 
27 #include <stdbool.h>
28 #include <stdint.h>
29 
30 #include "compiler/shader_enums.h"
31 
32 #include "util/bitscan.h"
33 #include "util/list.h"
34 #include "util/set.h"
35 #include "util/u_debug.h"
36 
37 #include "instr-a3xx.h"
38 
39 /* low level intermediate representation of an adreno shader program */
40 
41 struct ir3_compiler;
42 struct ir3;
43 struct ir3_instruction;
44 struct ir3_block;
45 
/* Statistics / info collected about a compiled shader binary
 * (populated by ir3_collect_info()):
 */
struct ir3_info {
   void *data; /* used internally in ir3 assembler */
   /* Size in bytes of the shader binary, including NIR constants and
    * padding
    */
   uint32_t size;
   /* byte offset from start of the shader to the NIR constant data. */
   uint32_t constant_data_offset;
   /* Size in dwords of the instructions. */
   uint16_t sizedwords;
   uint16_t instrs_count; /* expanded to account for rpt's */
   uint16_t nops_count;   /* # of nop instructions, including nopN */
   uint16_t mov_count;    /* # of mov instructions */
   uint16_t cov_count;    /* # of cov (conversion) instructions */
   uint16_t stp_count;    /* # of stp instructions */
   uint16_t ldp_count;    /* # of ldp instructions */
   /* NOTE: max_reg, etc, does not include registers not touched
    * by the shader (ie. vertex fetched via VFD_DECODE but not
    * touched by shader)
    */
   int8_t max_reg; /* highest GPR # used by shader */
   int8_t max_half_reg;
   int16_t max_const;
   /* This is the maximum # of waves that can executed at once in one core,
    * assuming that they are all executing this shader.
    */
   int8_t max_waves;
   bool double_threadsize;
   bool multi_dword_ldp_stp;

   /* number of sync bits: */
   uint16_t ss, sy;

   /* estimate of number of cycles stalled on (ss) */
   uint16_t sstall;
   /* estimate of number of cycles stalled on (sy) */
   uint16_t systall;

   uint16_t last_baryf; /* instruction # of last varying fetch */

   /* Number of instructions of a given category: */
   uint16_t instrs_per_cat[8];
};
89 
/* A set of registers that should be merged (allocated to one contiguous
 * physical register interval) during register allocation -- NOTE(review):
 * inferred from the RA-related fields below; confirm against the RA pass.
 */
struct ir3_merge_set {
   uint16_t preferred_reg;
   uint16_t size;      /* size of the merged interval */
   uint16_t alignment;

   unsigned interval_start;
   unsigned spill_slot;

   /* the registers belonging to this set: */
   unsigned regs_count;
   struct ir3_register **regs;
};
101 
/* A single source or destination operand of an instruction. */
struct ir3_register {
   enum {
      IR3_REG_CONST = 0x001,
      IR3_REG_IMMED = 0x002,
      IR3_REG_HALF = 0x004,
      /* Shared registers have the same value for all threads when read.
       * They can only be written when one thread is active (that is, inside
       * a "getone" block).
       */
      IR3_REG_SHARED = 0x008,
      IR3_REG_RELATIV = 0x010,
      IR3_REG_R = 0x020,
      /* Most instructions, it seems, can do float abs/neg but not
       * integer.  The CP pass needs to know what is intended (int or
       * float) in order to do the right thing.  For this reason the
       * abs/neg flags are split out into float and int variants.  In
       * addition, .b (bitwise) operations, the negate is actually a
       * bitwise not, so split that out into a new flag to make it
       * more clear.
       */
      IR3_REG_FNEG = 0x040,
      IR3_REG_FABS = 0x080,
      IR3_REG_SNEG = 0x100,
      IR3_REG_SABS = 0x200,
      IR3_REG_BNOT = 0x400,
      /* (ei) flag, end-input?  Set on last bary, presumably to signal
       * that the shader needs no more input:
       *
       * Note: Has different meaning on other instructions like add.s/u
       */
      IR3_REG_EI = 0x2000,
      /* meta-flags, for intermediate stages of IR, ie.
       * before register assignment is done:
       */
      IR3_REG_SSA = 0x4000, /* 'def' is ptr to assigning destination */
      IR3_REG_ARRAY = 0x8000,

      /* Set on a use whenever the SSA value becomes dead after the current
       * instruction.
       */
      IR3_REG_KILL = 0x10000,

      /* Similar to IR3_REG_KILL, except that if there are multiple uses of the
       * same SSA value in a single instruction, this is only set on the first
       * use.
       */
      IR3_REG_FIRST_KILL = 0x20000,

      /* Set when a destination doesn't have any uses and is dead immediately
       * after the instruction. This can happen even after optimizations for
       * corner cases such as destinations of atomic instructions.
       */
      IR3_REG_UNUSED = 0x40000,

      /* "Early-clobber" on a destination means that the destination is
       * (potentially) written before any sources are read and therefore
       * interferes with the sources of the instruction.
       */
      IR3_REG_EARLY_CLOBBER = 0x80000,
   } flags;

   /* SSA name -- NOTE(review): presumably assigned before RA; confirm
    * against the passes that consume it.
    */
   unsigned name;

   /* used for cat5 instructions, but also for internal/IR level
    * tracking of what registers are read/written by an instruction.
    * wrmask may be a bad name since it is used to represent both
    * src and dst that touch multiple adjacent registers.
    */
   unsigned wrmask : 16; /* up to vec16 */

   /* for relative addressing, 32bits for array size is too small,
    * but otoh we don't need to deal with disjoint sets, so instead
    * use a simple size field (number of scalar components).
    *
    * Note the size field isn't important for relative const (since
    * we don't have to do register allocation for constants).
    */
   unsigned size : 16;

   /* normal registers:
    * the component is in the low two bits of the reg #, so
    * rN.x becomes: (N << 2) | x
    */
   uint16_t num;
   union {
      /* immediate: */
      int32_t iim_val;
      uint32_t uim_val;
      float fim_val;
      /* relative: */
      struct {
         uint16_t id;
         int16_t offset;
         uint16_t base;
      } array;
   };

   /* For IR3_REG_DEST, pointer back to the instruction containing this
    * register.
    */
   struct ir3_instruction *instr;

   /* For IR3_REG_SSA, src registers contain ptr back to assigning
    * instruction.
    *
    * For IR3_REG_ARRAY, the pointer is back to the last dependent
    * array access (although the net effect is the same, it points
    * back to a previous instruction that we depend on).
    */
   struct ir3_register *def;

   /* Pointer to another register in the instruction that must share the same
    * physical register. Each destination can be tied with one source, and
    * they must have "tied" pointing to each other.
    */
   struct ir3_register *tied;

   /* RA/spilling bookkeeping: */
   unsigned spill_slot, next_use;

   unsigned merge_set_offset;
   struct ir3_merge_set *merge_set;
   unsigned interval_start, interval_end;
};
225 
/*
 * Stupid/simple growable array implementation:
 */
/* Declares the count/capacity fields and the backing pointer for a growable
 * array named 'name'; grow/append with array_insert() below.
 */
#define DECLARE_ARRAY(type, name)                                              \
   unsigned name##_count, name##_sz;                                           \
   type *name;
232 
/* Append a value to an array declared with DECLARE_ARRAY(), doubling the
 * (ralloc'd, hence 'ctx') backing store when capacity is exhausted.
 */
#define array_insert(ctx, arr, ...)                                            \
   do {                                                                        \
      if (arr##_count == arr##_sz) {                                           \
         arr##_sz = MAX2(2 * arr##_sz, 16);                                    \
         arr = reralloc_size(ctx, arr, arr##_sz * sizeof(arr[0]));             \
      }                                                                        \
      arr[arr##_count++] = __VA_ARGS__;                                        \
   } while (0)
241 
/* Reduction operation for cat1 instructions (see ir3_instruction::cat1).
 * _U/_S/_F/_B suffixes denote unsigned/signed/float/bitwise variants.
 */
typedef enum {
   REDUCE_OP_ADD_U,
   REDUCE_OP_ADD_F,
   REDUCE_OP_MUL_U,
   REDUCE_OP_MUL_F,
   REDUCE_OP_MIN_U,
   REDUCE_OP_MIN_S,
   REDUCE_OP_MIN_F,
   REDUCE_OP_MAX_U,
   REDUCE_OP_MAX_S,
   REDUCE_OP_MAX_F,
   REDUCE_OP_AND_B,
   REDUCE_OP_OR_B,
   REDUCE_OP_XOR_B,
} reduce_op_t;
257 
/* A single IR instruction: opcode, flags, src/dst registers, plus
 * per-category extra encoding fields and scheduling metadata.
 */
struct ir3_instruction {
   struct ir3_block *block;
   opc_t opc;
   enum {
      /* (sy) flag is set on first instruction, and after sample
       * instructions (probably just on RAW hazard).
       */
      IR3_INSTR_SY = 0x001,
      /* (ss) flag is set on first instruction, and first instruction
       * to depend on the result of "long" instructions (RAW hazard):
       *
       *   rcp, rsq, log2, exp2, sin, cos, sqrt
       *
       * It seems to synchronize until all in-flight instructions are
       * completed, for example:
       *
       *   rsq hr1.w, hr1.w
       *   add.f hr2.z, (neg)hr2.z, hc0.y
       *   mul.f hr2.w, (neg)hr2.y, (neg)hr2.y
       *   rsq hr2.x, hr2.x
       *   (rpt1)nop
       *   mad.f16 hr2.w, hr2.z, hr2.z, hr2.w
       *   nop
       *   mad.f16 hr2.w, (neg)hr0.w, (neg)hr0.w, hr2.w
       *   (ss)(rpt2)mul.f hr1.x, (r)hr1.x, hr1.w
       *   (rpt2)mul.f hr0.x, (neg)(r)hr0.x, hr2.x
       *
       * The last mul.f does not have (ss) set, presumably because the
       * (ss) on the previous instruction does the job.
       *
       * The blob driver also seems to set it on WAR hazards, although
       * not really clear if this is needed or just blob compiler being
       * sloppy.  So far I haven't found a case where removing the (ss)
       * causes problems for WAR hazard, but I could just be getting
       * lucky:
       *
       *   rcp r1.y, r3.y
       *   (ss)(rpt2)mad.f32 r3.y, (r)c9.x, r1.x, (r)r3.z
       *
       */
      IR3_INSTR_SS = 0x002,
      /* (jp) flag is set on jump targets:
       */
      IR3_INSTR_JP = 0x004,
      IR3_INSTR_UL = 0x008,
      IR3_INSTR_3D = 0x010,
      IR3_INSTR_A = 0x020,
      IR3_INSTR_O = 0x040,
      IR3_INSTR_P = 0x080,
      IR3_INSTR_S = 0x100,
      IR3_INSTR_S2EN = 0x200,
      IR3_INSTR_SAT = 0x400,
      /* (cat5/cat6) Bindless */
      IR3_INSTR_B = 0x800,
      /* (cat5/cat6) nonuniform */
      IR3_INSTR_NONUNIF = 0x1000,
      /* (cat5-only) Get some parts of the encoding from a1.x */
      IR3_INSTR_A1EN = 0x02000,
      /* meta-flags, for intermediate stages of IR, ie.
       * before register assignment is done:
       */
      IR3_INSTR_MARK = 0x04000,
      IR3_INSTR_UNUSED = 0x08000,
   } flags;
   uint8_t repeat;
   uint8_t nop;
#ifdef DEBUG
   unsigned srcs_max, dsts_max;
#endif
   unsigned srcs_count, dsts_count;
   struct ir3_register **dsts;
   struct ir3_register **srcs;
   /* Extra per-category encoding fields: */
   union {
      struct {
         char inv1, inv2;
         char comp1, comp2;
         int immed;
         struct ir3_block *target;
         const char *target_label;
         brtype_t brtype;
         unsigned idx; /* for brac.N */
      } cat0;
      struct {
         type_t src_type, dst_type;
         round_t round;
         reduce_op_t reduce_op;
      } cat1;
      struct {
         enum {
            IR3_COND_LT = 0,
            IR3_COND_LE = 1,
            IR3_COND_GT = 2,
            IR3_COND_GE = 3,
            IR3_COND_EQ = 4,
            IR3_COND_NE = 5,
         } condition;
      } cat2;
      struct {
         enum {
            IR3_SRC_UNSIGNED = 0,
            IR3_SRC_MIXED = 1,
         } signedness;
         enum {
            IR3_SRC_PACKED_LOW = 0,
            IR3_SRC_PACKED_HIGH = 1,
         } packed;
         bool swapped;
      } cat3;
      struct {
         unsigned samp, tex;
         unsigned tex_base : 3;
         unsigned cluster_size : 4;
         type_t type;
      } cat5;
      struct {
         type_t type;
         /* TODO remove dst_offset and handle as a ir3_register
          * which might be IMMED, similar to how src_offset is
          * handled.
          */
         int dst_offset;
         int iim_val;       /* for ldgb/stgb, # of components */
         unsigned d    : 3; /* for ldc, component offset */
         bool typed    : 1;
         unsigned base : 3;
      } cat6;
      struct {
         unsigned w : 1; /* write */
         unsigned r : 1; /* read */
         unsigned l : 1; /* local */
         unsigned g : 1; /* global */
      } cat7;
      /* for meta-instructions, just used to hold extra data
       * before instruction scheduling, etc
       */
      struct {
         int off; /* component/offset */
      } split;
      struct {
         /* Per-source index back to the entry in the
          * ir3_shader_variant::outputs table.
          */
         unsigned *outidxs;
      } end;
      struct {
         /* used to temporarily hold reference to nir_phi_instr
          * until we resolve the phi srcs
          */
         void *nphi;
      } phi;
      struct {
         unsigned samp, tex;
         unsigned input_offset;
         unsigned samp_base : 3;
         unsigned tex_base  : 3;
      } prefetch;
      struct {
         /* maps back to entry in ir3_shader_variant::inputs table: */
         int inidx;
         /* for sysvals, identifies the sysval type.  Mostly so we can
          * identify the special cases where a sysval should not be DCE'd
          * (currently, just pre-fs texture fetch)
          */
         gl_system_value sysval;
      } input;
   };

   /* For assigning jump offsets, we need instruction's position: */
   uint32_t ip;

   /* used for per-pass extra instruction data.
    *
    * TODO we should remove the per-pass data like this and 'use_count'
    * and do something similar to what RA does w/ ir3_ra_instr_data..
    * ie. use the ir3_count_instructions pass, and then use instr->ip
    * to index into a table of pass-private data.
    */
   void *data;

   /**
    * Valid if pass calls ir3_find_ssa_uses().. see foreach_ssa_use()
    */
   struct set *uses;

   int use_count; /* currently just updated/used by cp */

   /* an instruction can reference at most one address register amongst
    * it's src/dst registers.  Beyond that, you need to insert mov's.
    *
    * NOTE: do not write this directly, use ir3_instr_set_address()
    */
   struct ir3_register *address;

   /* Tracking for additional dependent instructions.  Used to handle
    * barriers, WAR hazards for arrays/SSBOs/etc.
    */
   DECLARE_ARRAY(struct ir3_instruction *, deps);

   /*
    * From PoV of instruction scheduling, not execution (ie. ignores global/
    * local distinction):
    *                            shared  image  atomic  SSBO  everything
    *   barrier()/            -   R/W     R/W    R/W     R/W       X
    *     groupMemoryBarrier()
    *     memoryBarrier()
    *     (but only images declared coherent?)
    *   memoryBarrierAtomic() -                  R/W
    *   memoryBarrierBuffer() -                          R/W
    *   memoryBarrierImage()  -           R/W
    *   memoryBarrierShared() -   R/W
    *
    * TODO I think for SSBO/image/shared, in cases where we can determine
    * which variable is accessed, we don't need to care about accesses to
    * different variables (unless declared coherent??)
    */
   enum {
      IR3_BARRIER_EVERYTHING = 1 << 0,
      IR3_BARRIER_SHARED_R = 1 << 1,
      IR3_BARRIER_SHARED_W = 1 << 2,
      IR3_BARRIER_IMAGE_R = 1 << 3,
      IR3_BARRIER_IMAGE_W = 1 << 4,
      IR3_BARRIER_BUFFER_R = 1 << 5,
      IR3_BARRIER_BUFFER_W = 1 << 6,
      IR3_BARRIER_ARRAY_R = 1 << 7,
      IR3_BARRIER_ARRAY_W = 1 << 8,
      IR3_BARRIER_PRIVATE_R = 1 << 9,
      IR3_BARRIER_PRIVATE_W = 1 << 10,
      IR3_BARRIER_CONST_W = 1 << 11,
   } barrier_class,
      barrier_conflict;

   /* Entry in ir3_block's instruction list: */
   struct list_head node;

   uint32_t serialno;

   // TODO only computerator/assembler:
   int line;
};
497 
/* A shader program: a list of basic blocks plus various convenience lists
 * of instructions that later passes need quick access to.
 */
struct ir3 {
   struct ir3_compiler *compiler;
   gl_shader_stage type;

   DECLARE_ARRAY(struct ir3_instruction *, inputs);

   /* Track bary.f (and ldlv) instructions.. this is needed in
    * scheduling to ensure that all varying fetches happen before
    * any potential kill instructions.  The hw gets grumpy if all
    * threads in a group are killed before the last bary.f gets
    * a chance to signal end of input (ei).
    */
   DECLARE_ARRAY(struct ir3_instruction *, baryfs);

   /* Track all indirect instructions (read and write).  To avoid
    * deadlock scenario where an address register gets scheduled,
    * but other dependent src instructions cannot be scheduled due
    * to dependency on a *different* address register value, the
    * scheduler needs to ensure that all dependencies other than
    * the instruction other than the address register are scheduled
    * before the one that writes the address register.  Having a
    * convenient list of instructions that reference some address
    * register simplifies this.
    */
   DECLARE_ARRAY(struct ir3_instruction *, a0_users);

   /* same for a1.x: */
   DECLARE_ARRAY(struct ir3_instruction *, a1_users);

   /* and same for instructions that consume predicate register: */
   DECLARE_ARRAY(struct ir3_instruction *, predicates);

   /* Track texture sample instructions which need texture state
    * patched in (for astc-srgb workaround):
    */
   DECLARE_ARRAY(struct ir3_instruction *, astc_srgb);

   /* Track tg4 instructions which need texture state patched in (for tg4
    * swizzling workaround):
    */
   DECLARE_ARRAY(struct ir3_instruction *, tg4);

   /* List of blocks: */
   struct list_head block_list;

   /* List of ir3_array's: */
   struct list_head array_list;

#ifdef DEBUG
   unsigned block_count;
#endif
   unsigned instr_count;
};
551 
/* A register array accessed via relative addressing (see IR3_REG_ARRAY /
 * IR3_REG_RELATIV); looked up by id via ir3_lookup_array().
 */
struct ir3_array {
   struct list_head node; /* entry in ir3::array_list */
   unsigned length;
   unsigned id;

   struct nir_register *r;

   /* To avoid array write's from getting DCE'd, keep track of the
    * most recent write.  Any array access depends on the most
    * recent write.  This way, nothing depends on writes after the
    * last read.  But all the writes that happen before that have
    * something depending on them
    */
   struct ir3_register *last_write;

   /* extra stuff used in RA pass: */
   unsigned base; /* base vreg name */
   unsigned reg;  /* base physical reg */
   uint16_t start_ip, end_ip;

   /* Indicates if half-precision */
   bool half;

   bool unused;
};
577 
578 struct ir3_array *ir3_lookup_array(struct ir3 *ir, unsigned id);
579 
/* Semantics of the branch at the end of a block with two successors
 * (see ir3_block::brtype / ir3_block::condition):
 */
enum ir3_branch_type {
   IR3_BRANCH_COND,   /* condition */
   IR3_BRANCH_ANY,    /* subgroupAny(condition) */
   IR3_BRANCH_ALL,    /* subgroupAll(condition) */
   IR3_BRANCH_GETONE, /* subgroupElect() */
   IR3_BRANCH_SHPS,   /* preamble start */
};
587 
/* A basic block: a list of instructions plus CFG edges (successors /
 * predecessors, both per-thread and "physical") and dominance info.
 */
struct ir3_block {
   struct list_head node; /* entry in ir3::block_list */
   struct ir3 *shader;

   const struct nir_block *nblock;

   struct list_head instr_list; /* list of ir3_instruction */

   /* The actual branch condition, if there are two successors */
   enum ir3_branch_type brtype;

   /* each block has either one or two successors.. in case of two
    * successors, 'condition' decides which one to follow.  A block preceding
    * an if/else has two successors.
    *
    * In some cases the path that the machine actually takes through the
    * program may not match the per-thread view of the CFG. In particular
    * this is the case for if/else, where the machine jumps from the end of
    * the if to the beginning of the else and switches active lanes. While
    * most things only care about the per-thread view, we need to use the
    * "physical" view when allocating shared registers. "successors" contains
    * the per-thread successors, and "physical_successors" contains the
    * physical successors which includes the fallthrough edge from the if to
    * the else.
    */
   struct ir3_instruction *condition;
   struct ir3_block *successors[2];
   struct ir3_block *physical_successors[2];

   DECLARE_ARRAY(struct ir3_block *, predecessors);
   DECLARE_ARRAY(struct ir3_block *, physical_predecessors);

   uint16_t start_ip, end_ip;

   /* Track instructions which do not write a register but other-
    * wise must not be discarded (such as kill, stg, etc)
    */
   DECLARE_ARRAY(struct ir3_instruction *, keeps);

   /* used for per-pass extra block data.  Mainly used right
    * now in RA step to track livein/liveout.
    */
   void *data;

   uint32_t index;

   /* Dominance info (see ir3_calc_dominance()): */
   struct ir3_block *imm_dom;
   DECLARE_ARRAY(struct ir3_block *, dom_children);

   uint32_t dom_pre_index;
   uint32_t dom_post_index;

   uint32_t loop_id;
   uint32_t loop_depth;

#ifdef DEBUG
   uint32_t serialno;
#endif
};
647 
/* Returns a (debug-printable) id for the block: the stable serial number in
 * debug builds, otherwise an id derived from the pointer value.
 */
static inline uint32_t
block_id(struct ir3_block *block)
{
#ifdef DEBUG
   return block->serialno;
#else
   /* Go through uintptr_t to avoid an implicit pointer truncation on
    * LLP64 targets where 'unsigned long' is narrower than a pointer;
    * the final value is truncated to 32 bits either way.
    */
   return (uint32_t)(uintptr_t)block;
#endif
}
657 
/* The entry block is the first block in the shader's block list. */
static inline struct ir3_block *
ir3_start_block(struct ir3 *ir)
{
   return list_first_entry(&ir->block_list, struct ir3_block, node);
}
663 
664 static inline struct ir3_block *
ir3_after_preamble(struct ir3 * ir)665 ir3_after_preamble(struct ir3 *ir)
666 {
667    struct ir3_block *block = ir3_start_block(ir);
668    /* The preamble will have a usually-empty else branch, and we want to skip
669     * that to get to the block after the preamble.
670     */
671    if (block->brtype == IR3_BRANCH_SHPS)
672       return block->successors[1]->successors[0];
673    else
674       return block;
675 }
676 
677 void ir3_block_add_predecessor(struct ir3_block *block, struct ir3_block *pred);
678 void ir3_block_add_physical_predecessor(struct ir3_block *block,
679                                         struct ir3_block *pred);
680 void ir3_block_remove_predecessor(struct ir3_block *block,
681                                   struct ir3_block *pred);
682 void ir3_block_remove_physical_predecessor(struct ir3_block *block,
683                                            struct ir3_block *pred);
684 unsigned ir3_block_get_pred_index(struct ir3_block *block,
685                                   struct ir3_block *pred);
686 
687 void ir3_calc_dominance(struct ir3 *ir);
688 bool ir3_block_dominates(struct ir3_block *a, struct ir3_block *b);
689 
690 struct ir3_shader_variant;
691 
692 struct ir3 *ir3_create(struct ir3_compiler *compiler,
693                        struct ir3_shader_variant *v);
694 void ir3_destroy(struct ir3 *shader);
695 
696 void ir3_collect_info(struct ir3_shader_variant *v);
697 void *ir3_alloc(struct ir3 *shader, int sz);
698 
699 unsigned ir3_get_reg_dependent_max_waves(const struct ir3_compiler *compiler,
700                                          unsigned reg_count,
701                                          bool double_threadsize);
702 
703 unsigned ir3_get_reg_independent_max_waves(struct ir3_shader_variant *v,
704                                            bool double_threadsize);
705 
706 bool ir3_should_double_threadsize(struct ir3_shader_variant *v,
707                                   unsigned regs_count);
708 
709 struct ir3_block *ir3_block_create(struct ir3 *shader);
710 
711 struct ir3_instruction *ir3_instr_create(struct ir3_block *block, opc_t opc,
712                                          int ndst, int nsrc);
713 struct ir3_instruction *ir3_instr_clone(struct ir3_instruction *instr);
714 void ir3_instr_add_dep(struct ir3_instruction *instr,
715                        struct ir3_instruction *dep);
716 const char *ir3_instr_name(struct ir3_instruction *instr);
717 
718 struct ir3_register *ir3_src_create(struct ir3_instruction *instr, int num,
719                                     int flags);
720 struct ir3_register *ir3_dst_create(struct ir3_instruction *instr, int num,
721                                     int flags);
722 struct ir3_register *ir3_reg_clone(struct ir3 *shader,
723                                    struct ir3_register *reg);
724 
725 static inline void
ir3_reg_tie(struct ir3_register * dst,struct ir3_register * src)726 ir3_reg_tie(struct ir3_register *dst, struct ir3_register *src)
727 {
728    assert(!dst->tied && !src->tied);
729    dst->tied = src;
730    src->tied = dst;
731 }
732 
733 void ir3_reg_set_last_array(struct ir3_instruction *instr,
734                             struct ir3_register *reg,
735                             struct ir3_register *last_write);
736 
737 void ir3_instr_set_address(struct ir3_instruction *instr,
738                            struct ir3_instruction *addr);
739 
740 static inline bool
ir3_instr_check_mark(struct ir3_instruction * instr)741 ir3_instr_check_mark(struct ir3_instruction *instr)
742 {
743    if (instr->flags & IR3_INSTR_MARK)
744       return true; /* already visited */
745    instr->flags |= IR3_INSTR_MARK;
746    return false;
747 }
748 
749 void ir3_block_clear_mark(struct ir3_block *block);
750 void ir3_clear_mark(struct ir3 *shader);
751 
752 unsigned ir3_count_instructions(struct ir3 *ir);
753 unsigned ir3_count_instructions_ra(struct ir3 *ir);
754 
/**
 * Move 'instr' to just before 'after'
 */
static inline void
ir3_instr_move_before(struct ir3_instruction *instr,
                      struct ir3_instruction *after)
{
   /* unlink from current position, then insert immediately before 'after': */
   list_delinit(&instr->node);
   list_addtail(&instr->node, &after->node);
}
765 
/**
 * Move 'instr' to just after 'before':
 */
static inline void
ir3_instr_move_after(struct ir3_instruction *instr,
                     struct ir3_instruction *before)
{
   /* unlink from current position, then insert immediately after 'before': */
   list_delinit(&instr->node);
   list_add(&instr->node, &before->node);
}
776 
/**
 * Move 'instr' to the beginning of the block:
 */
static inline void
ir3_instr_move_before_block(struct ir3_instruction *instr,
                            struct ir3_block *block)
{
   /* unlink from current position, then insert at the list head: */
   list_delinit(&instr->node);
   list_add(&instr->node, &block->instr_list);
}
787 
788 void ir3_find_ssa_uses(struct ir3 *ir, void *mem_ctx, bool falsedeps);
789 
790 void ir3_set_dst_type(struct ir3_instruction *instr, bool half);
791 void ir3_fixup_src_type(struct ir3_instruction *instr);
792 
793 int ir3_flut(struct ir3_register *src_reg);
794 
795 bool ir3_valid_flags(struct ir3_instruction *instr, unsigned n, unsigned flags);
796 
797 bool ir3_valid_immediate(struct ir3_instruction *instr, int32_t immed);
798 
#include "util/set.h" /* for set_foreach() */
/* Iterate the SSA uses of __instr.  Only valid if a pass has previously
 * called ir3_find_ssa_uses() to populate instr->uses; the outer for() runs
 * at most once and just guards against a missing uses table.
 */
#define foreach_ssa_use(__use, __instr)                                        \
   for (struct ir3_instruction *__use = (void *)~0; __use && (__instr)->uses;  \
        __use = NULL)                                                          \
      set_foreach ((__instr)->uses, __entry)                                   \
         if ((__use = (void *)__entry->key))
805 
806 static inline uint32_t
reg_num(const struct ir3_register * reg)807 reg_num(const struct ir3_register *reg)
808 {
809    return reg->num >> 2;
810 }
811 
812 static inline uint32_t
reg_comp(const struct ir3_register * reg)813 reg_comp(const struct ir3_register *reg)
814 {
815    return reg->num & 0x3;
816 }
817 
/* Is it a flow-control (category 0) instruction? */
static inline bool
is_flow(struct ir3_instruction *instr)
{
   return (opc_cat(instr->opc) == 0);
}
823 
824 static inline bool
is_kill_or_demote(struct ir3_instruction * instr)825 is_kill_or_demote(struct ir3_instruction *instr)
826 {
827    return instr->opc == OPC_KILL || instr->opc == OPC_DEMOTE;
828 }
829 
/* Is it a nop instruction? */
static inline bool
is_nop(struct ir3_instruction *instr)
{
   return instr->opc == OPC_NOP;
}
835 
836 static inline bool
is_same_type_reg(struct ir3_register * dst,struct ir3_register * src)837 is_same_type_reg(struct ir3_register *dst, struct ir3_register *src)
838 {
839    unsigned dst_type = (dst->flags & IR3_REG_HALF);
840    unsigned src_type = (src->flags & IR3_REG_HALF);
841 
842    /* Treat shared->normal copies as same-type, because they can generally be
843     * folded, but not normal->shared copies.
844     */
845    if (dst_type != src_type ||
846        ((dst->flags & IR3_REG_SHARED) && !(src->flags & IR3_REG_SHARED)))
847       return false;
848    else
849       return true;
850 }
851 
/* Is it a non-transformative (ie. not type changing) mov?  This can
 * also include absneg.s/absneg.f, which for the most part can be
 * treated as a mov (single src argument).
 */
static inline bool
is_same_type_mov(struct ir3_instruction *instr)
{
   struct ir3_register *dst;

   switch (instr->opc) {
   case OPC_MOV:
      if (instr->cat1.src_type != instr->cat1.dst_type)
         return false;
      /* If the type of dest reg and src reg are different,
       * it shouldn't be considered as same type mov
       */
      if (!is_same_type_reg(instr->dsts[0], instr->srcs[0]))
         return false;
      break;
   case OPC_ABSNEG_F:
   case OPC_ABSNEG_S:
      /* saturating absneg is not a plain copy of the value: */
      if (instr->flags & IR3_INSTR_SAT)
         return false;
      /* If the type of dest reg and src reg are different,
       * it shouldn't be considered as same type mov
       */
      if (!is_same_type_reg(instr->dsts[0], instr->srcs[0]))
         return false;
      break;
   case OPC_META_PHI:
      /* a single-src phi is effectively a mov of its one source: */
      return instr->srcs_count == 1;
   default:
      return false;
   }

   dst = instr->dsts[0];

   /* mov's that write to a0 or p0.x are special: */
   if (dst->num == regid(REG_P0, 0))
      return false;
   if (reg_num(dst) == REG_A0)
      return false;

   if (dst->flags & (IR3_REG_RELATIV | IR3_REG_ARRAY))
      return false;

   return true;
}
900 
901 /* A move from const, which changes size but not type, can also be
902  * folded into dest instruction in some cases.
903  */
904 static inline bool
is_const_mov(struct ir3_instruction * instr)905 is_const_mov(struct ir3_instruction *instr)
906 {
907    if (instr->opc != OPC_MOV)
908       return false;
909 
910    if (!(instr->srcs[0]->flags & IR3_REG_CONST))
911       return false;
912 
913    type_t src_type = instr->cat1.src_type;
914    type_t dst_type = instr->cat1.dst_type;
915 
916    return (type_float(src_type) && type_float(dst_type)) ||
917           (type_uint(src_type) && type_uint(dst_type)) ||
918           (type_sint(src_type) && type_sint(dst_type));
919 }
920 
/* True for the pseudo-opcode macros that get expanded into conditional
 * moves during subgroup-op lowering (see ir3_lower_subgroups):
 */
static inline bool
is_subgroup_cond_mov_macro(struct ir3_instruction *instr)
{
   switch (instr->opc) {
   case OPC_BALLOT_MACRO:
   case OPC_ANY_MACRO:
   case OPC_ALL_MACRO:
   case OPC_ELECT_MACRO:
   case OPC_READ_COND_MACRO:
   case OPC_READ_FIRST_MACRO:
   case OPC_SWZ_SHARED_MACRO:
   case OPC_SCAN_MACRO:
      return true;
   default:
      return false;
   }
}
938 
939 static inline bool
is_alu(struct ir3_instruction * instr)940 is_alu(struct ir3_instruction *instr)
941 {
942    return (1 <= opc_cat(instr->opc)) && (opc_cat(instr->opc) <= 3);
943 }
944 
945 static inline bool
is_sfu(struct ir3_instruction * instr)946 is_sfu(struct ir3_instruction *instr)
947 {
948    return (opc_cat(instr->opc) == 4) || instr->opc == OPC_GETFIBERID;
949 }
950 
951 static inline bool
is_tex(struct ir3_instruction * instr)952 is_tex(struct ir3_instruction *instr)
953 {
954    return (opc_cat(instr->opc) == 5);
955 }
956 
957 static inline bool
is_tex_or_prefetch(struct ir3_instruction * instr)958 is_tex_or_prefetch(struct ir3_instruction *instr)
959 {
960    return is_tex(instr) || (instr->opc == OPC_META_TEX_PREFETCH);
961 }
962 
963 static inline bool
is_mem(struct ir3_instruction * instr)964 is_mem(struct ir3_instruction *instr)
965 {
966    return (opc_cat(instr->opc) == 6) && instr->opc != OPC_GETFIBERID;
967 }
968 
969 static inline bool
is_barrier(struct ir3_instruction * instr)970 is_barrier(struct ir3_instruction *instr)
971 {
972    return (opc_cat(instr->opc) == 7);
973 }
974 
975 static inline bool
is_half(struct ir3_instruction * instr)976 is_half(struct ir3_instruction *instr)
977 {
978    return !!(instr->dsts[0]->flags & IR3_REG_HALF);
979 }
980 
981 static inline bool
is_shared(struct ir3_instruction * instr)982 is_shared(struct ir3_instruction *instr)
983 {
984    return !!(instr->dsts[0]->flags & IR3_REG_SHARED);
985 }
986 
/* True for store-style instructions: */
static inline bool
is_store(struct ir3_instruction *instr)
{
   /* these instructions, the "destination" register is
    * actually a source, the address to store to.
    */
   switch (instr->opc) {
   case OPC_STG:
   case OPC_STG_A:
   case OPC_STGB:
   case OPC_STIB:
   case OPC_STP:
   case OPC_STL:
   case OPC_STLW:
   case OPC_L2G:
   case OPC_G2L:
      return true;
   default:
      return false;
   }
}
1008 
/* True for load-style instructions (note that l2g counts as both a
 * load and a store, see is_store()):
 */
static inline bool
is_load(struct ir3_instruction *instr)
{
   switch (instr->opc) {
   case OPC_LDG:
   case OPC_LDG_A:
   case OPC_LDGB:
   case OPC_LDIB:
   case OPC_LDL:
   case OPC_LDP:
   case OPC_L2G:
   case OPC_LDLW:
   case OPC_LDC:
   case OPC_LDLV:
      /* probably some others too.. */
      return true;
   default:
      return false;
   }
}
1029 
/* True for instructions that read shader input/varying data: */
static inline bool
is_input(struct ir3_instruction *instr)
{
   /* in some cases, ldlv is used to fetch varying without
    * interpolation.. fortunately inloc is the first src
    * register in either case
    */
   switch (instr->opc) {
   case OPC_LDLV:
   case OPC_BARY_F:
   case OPC_FLAT_B:
      return true;
   default:
      return false;
   }
}
1046 
1047 static inline bool
is_bool(struct ir3_instruction * instr)1048 is_bool(struct ir3_instruction *instr)
1049 {
1050    switch (instr->opc) {
1051    case OPC_CMPS_F:
1052    case OPC_CMPS_S:
1053    case OPC_CMPS_U:
1054       return true;
1055    default:
1056       return false;
1057    }
1058 }
1059 
/* Map a full-precision cat3 opcode to its half-precision variant
 * (identity for opcodes with no half variant):
 */
static inline opc_t
cat3_half_opc(opc_t opc)
{
   switch (opc) {
   case OPC_MAD_F32:
      return OPC_MAD_F16;
   case OPC_SEL_B32:
      return OPC_SEL_B16;
   case OPC_SEL_S32:
      return OPC_SEL_S16;
   case OPC_SEL_F32:
      return OPC_SEL_F16;
   case OPC_SAD_S32:
      return OPC_SAD_S16;
   default:
      return opc;
   }
}
1078 
/* Inverse of cat3_half_opc(): map a half-precision cat3 opcode to its
 * full-precision variant (identity for opcodes with no full variant):
 */
static inline opc_t
cat3_full_opc(opc_t opc)
{
   switch (opc) {
   case OPC_MAD_F16:
      return OPC_MAD_F32;
   case OPC_SEL_B16:
      return OPC_SEL_B32;
   case OPC_SEL_S16:
      return OPC_SEL_S32;
   case OPC_SEL_F16:
      return OPC_SEL_F32;
   case OPC_SAD_S16:
      return OPC_SAD_S32;
   default:
      return opc;
   }
}
1097 
1098 static inline opc_t
cat4_half_opc(opc_t opc)1099 cat4_half_opc(opc_t opc)
1100 {
1101    switch (opc) {
1102    case OPC_RSQ:
1103       return OPC_HRSQ;
1104    case OPC_LOG2:
1105       return OPC_HLOG2;
1106    case OPC_EXP2:
1107       return OPC_HEXP2;
1108    default:
1109       return opc;
1110    }
1111 }
1112 
1113 static inline opc_t
cat4_full_opc(opc_t opc)1114 cat4_full_opc(opc_t opc)
1115 {
1116    switch (opc) {
1117    case OPC_HRSQ:
1118       return OPC_RSQ;
1119    case OPC_HLOG2:
1120       return OPC_LOG2;
1121    case OPC_HEXP2:
1122       return OPC_EXP2;
1123    default:
1124       return opc;
1125    }
1126 }
1127 
1128 static inline bool
is_meta(struct ir3_instruction * instr)1129 is_meta(struct ir3_instruction *instr)
1130 {
1131    return (opc_cat(instr->opc) == -1);
1132 }
1133 
1134 static inline unsigned
reg_elems(const struct ir3_register * reg)1135 reg_elems(const struct ir3_register *reg)
1136 {
1137    if (reg->flags & IR3_REG_ARRAY)
1138       return reg->size;
1139    else
1140       return util_last_bit(reg->wrmask);
1141 }
1142 
1143 static inline unsigned
reg_elem_size(const struct ir3_register * reg)1144 reg_elem_size(const struct ir3_register *reg)
1145 {
1146    return (reg->flags & IR3_REG_HALF) ? 1 : 2;
1147 }
1148 
/* Total register footprint: element count times per-element size. */
static inline unsigned
reg_size(const struct ir3_register *reg)
{
   return reg_elem_size(reg) * reg_elems(reg);
}
1154 
/* Number of components written by the (single) destination, or 0 for
 * instructions with no dest:
 */
static inline unsigned
dest_regs(struct ir3_instruction *instr)
{
   if (instr->dsts_count == 0)
      return 0;

   /* this helper only makes sense for single-dest instructions: */
   debug_assert(instr->dsts_count == 1);
   return util_last_bit(instr->dsts[0]->wrmask);
}
1164 
1165 /* is dst a normal temp register: */
1166 static inline bool
is_dest_gpr(struct ir3_register * dst)1167 is_dest_gpr(struct ir3_register *dst)
1168 {
1169    if (dst->wrmask == 0)
1170       return false;
1171    if ((reg_num(dst) == REG_A0) || (dst->num == regid(REG_P0, 0)))
1172       return false;
1173    return true;
1174 }
1175 
1176 static inline bool
writes_gpr(struct ir3_instruction * instr)1177 writes_gpr(struct ir3_instruction *instr)
1178 {
1179    if (dest_regs(instr) == 0)
1180       return false;
1181    return is_dest_gpr(instr->dsts[0]);
1182 }
1183 
1184 static inline bool
writes_addr0(struct ir3_instruction * instr)1185 writes_addr0(struct ir3_instruction *instr)
1186 {
1187    /* Note: only the first dest can write to a0.x */
1188    if (instr->dsts_count > 0) {
1189       struct ir3_register *dst = instr->dsts[0];
1190       return dst->num == regid(REG_A0, 0);
1191    }
1192    return false;
1193 }
1194 
1195 static inline bool
writes_addr1(struct ir3_instruction * instr)1196 writes_addr1(struct ir3_instruction *instr)
1197 {
1198    /* Note: only the first dest can write to a1.x */
1199    if (instr->dsts_count > 0) {
1200       struct ir3_register *dst = instr->dsts[0];
1201       return dst->num == regid(REG_A0, 1);
1202    }
1203    return false;
1204 }
1205 
1206 static inline bool
writes_pred(struct ir3_instruction * instr)1207 writes_pred(struct ir3_instruction *instr)
1208 {
1209    /* Note: only the first dest can write to p0.x */
1210    if (instr->dsts_count > 0) {
1211       struct ir3_register *dst = instr->dsts[0];
1212       return reg_num(dst) == REG_P0;
1213    }
1214    return false;
1215 }
1216 
1217 /* Is it something other than a normal register. Shared regs, p0, and a0/a1
1218  * are considered special here. Special registers are always accessed with one
1219  * size and never alias normal registers, even though a naive calculation
1220  * would sometimes make it seem like e.g. r30.z aliases a0.x.
1221  */
1222 static inline bool
is_reg_special(const struct ir3_register * reg)1223 is_reg_special(const struct ir3_register *reg)
1224 {
1225    return (reg->flags & IR3_REG_SHARED) || (reg_num(reg) == REG_A0) ||
1226           (reg_num(reg) == REG_P0);
1227 }
1228 
1229 /* Same as above but in cases where we don't have a register. r48.x and above
1230  * are shared/special.
1231  */
1232 static inline bool
is_reg_num_special(unsigned num)1233 is_reg_num_special(unsigned num)
1234 {
1235    return num >= 48 * 4;
1236 }
1237 
1238 /* returns defining instruction for reg */
1239 /* TODO better name */
1240 static inline struct ir3_instruction *
ssa(struct ir3_register * reg)1241 ssa(struct ir3_register *reg)
1242 {
1243    if ((reg->flags & (IR3_REG_SSA | IR3_REG_ARRAY)) && reg->def)
1244       return reg->def->instr;
1245    return NULL;
1246 }
1247 
1248 static inline bool
conflicts(struct ir3_register * a,struct ir3_register * b)1249 conflicts(struct ir3_register *a, struct ir3_register *b)
1250 {
1251    return (a && b) && (a->def != b->def);
1252 }
1253 
1254 static inline bool
reg_gpr(struct ir3_register * r)1255 reg_gpr(struct ir3_register *r)
1256 {
1257    if (r->flags & (IR3_REG_CONST | IR3_REG_IMMED))
1258       return false;
1259    if ((reg_num(r) == REG_A0) || (reg_num(r) == REG_P0))
1260       return false;
1261    return true;
1262 }
1263 
/* Narrow a type to its 16b variant; 16b types pass through unchanged
 * and 8b types (which have no narrower variant) are left as-is:
 */
static inline type_t
half_type(type_t type)
{
   switch (type) {
   case TYPE_F32:
      return TYPE_F16;
   case TYPE_U32:
      return TYPE_U16;
   case TYPE_S32:
      return TYPE_S16;
   case TYPE_F16:
   case TYPE_U16:
   case TYPE_S16:
      return type;
   case TYPE_U8:
   case TYPE_S8:
      return type;
   default:
      assert(0);
      return ~0;
   }
}
1286 
/* Widen a type to its 32b variant; 8b types widen all the way to 32b
 * and 32b types pass through unchanged:
 */
static inline type_t
full_type(type_t type)
{
   switch (type) {
   case TYPE_F16:
      return TYPE_F32;
   case TYPE_U8:
   case TYPE_U16:
      return TYPE_U32;
   case TYPE_S8:
   case TYPE_S16:
      return TYPE_S32;
   case TYPE_F32:
   case TYPE_U32:
   case TYPE_S32:
      return type;
   default:
      assert(0);
      return ~0;
   }
}
1308 
/* some cat2 instructions (ie. those which are not float) can embed an
 * immediate:
 */
static inline bool
ir3_cat2_int(opc_t opc)
{
   switch (opc) {
   case OPC_ADD_U:
   case OPC_ADD_S:
   case OPC_SUB_U:
   case OPC_SUB_S:
   case OPC_CMPS_U:
   case OPC_CMPS_S:
   case OPC_MIN_U:
   case OPC_MIN_S:
   case OPC_MAX_U:
   case OPC_MAX_S:
   case OPC_CMPV_U:
   case OPC_CMPV_S:
   case OPC_MUL_U24:
   case OPC_MUL_S24:
   case OPC_MULL_U:
   case OPC_CLZ_S:
   case OPC_ABSNEG_S:
   case OPC_AND_B:
   case OPC_OR_B:
   case OPC_NOT_B:
   case OPC_XOR_B:
   case OPC_BFREV_B:
   case OPC_CLZ_B:
   case OPC_SHL_B:
   case OPC_SHR_B:
   case OPC_ASHR_B:
   case OPC_MGEN_B:
   case OPC_GETBIT_B:
   case OPC_CBITS_B:
   case OPC_BARY_F:
   case OPC_FLAT_B:
      return true;

   default:
      return false;
   }
}
1353 
/* map cat2 instruction to valid abs/neg flags: float opcodes accept
 * (abs)/(neg), absneg.s accepts (sabs)/(sneg), bitwise opcodes accept
 * (bnot), and the remaining integer opcodes accept none:
 */
static inline unsigned
ir3_cat2_absneg(opc_t opc)
{
   switch (opc) {
   case OPC_ADD_F:
   case OPC_MIN_F:
   case OPC_MAX_F:
   case OPC_MUL_F:
   case OPC_SIGN_F:
   case OPC_CMPS_F:
   case OPC_ABSNEG_F:
   case OPC_CMPV_F:
   case OPC_FLOOR_F:
   case OPC_CEIL_F:
   case OPC_RNDNE_F:
   case OPC_RNDAZ_F:
   case OPC_TRUNC_F:
   case OPC_BARY_F:
      return IR3_REG_FABS | IR3_REG_FNEG;

   case OPC_ADD_U:
   case OPC_ADD_S:
   case OPC_SUB_U:
   case OPC_SUB_S:
   case OPC_CMPS_U:
   case OPC_CMPS_S:
   case OPC_MIN_U:
   case OPC_MIN_S:
   case OPC_MAX_U:
   case OPC_MAX_S:
   case OPC_CMPV_U:
   case OPC_CMPV_S:
   case OPC_MUL_U24:
   case OPC_MUL_S24:
   case OPC_MULL_U:
   case OPC_CLZ_S:
      return 0;

   case OPC_ABSNEG_S:
      return IR3_REG_SABS | IR3_REG_SNEG;

   case OPC_AND_B:
   case OPC_OR_B:
   case OPC_NOT_B:
   case OPC_XOR_B:
   case OPC_BFREV_B:
   case OPC_CLZ_B:
   case OPC_SHL_B:
   case OPC_SHR_B:
   case OPC_ASHR_B:
   case OPC_MGEN_B:
   case OPC_GETBIT_B:
   case OPC_CBITS_B:
      return IR3_REG_BNOT;

   default:
      return 0;
   }
}
1414 
/* map cat3 instructions to valid abs/neg flags: */
static inline unsigned
ir3_cat3_absneg(opc_t opc)
{
   switch (opc) {
   case OPC_MAD_F16:
   case OPC_MAD_F32:
   case OPC_SEL_F16:
   case OPC_SEL_F32:
      return IR3_REG_FNEG;

   case OPC_MAD_U16:
   case OPC_MADSH_U16:
   case OPC_MAD_S16:
   case OPC_MADSH_M16:
   case OPC_MAD_U24:
   case OPC_MAD_S24:
   case OPC_SEL_S16:
   case OPC_SEL_S32:
   case OPC_SAD_S16:
   case OPC_SAD_S32:
      /* neg *may* work on 3rd src.. */
      /* fallthrough */

   case OPC_SEL_B16:
   case OPC_SEL_B32:
      /* fallthrough */

   case OPC_SHRM:
   case OPC_SHLM:
   case OPC_SHRG:
   case OPC_SHLG:
   case OPC_ANDG:
   case OPC_WMM:
   case OPC_WMM_ACCU:
      /* fallthrough: none of the above support abs/neg modifiers */

   default:
      return 0;
   }
}
1453 
/* Return the type (float, int, or uint) the op uses when converting from the
 * internal result of the op (which is assumed to be the same size as the
 * sources) to the destination when they are not the same size. If F32 it does
 * a floating-point conversion, if U32 it does a truncation/zero-extension, if
 * S32 it does a truncation/sign-extension. "can_fold" will be false if it
 * doesn't do anything sensible or is unknown.
 */
static inline type_t
ir3_output_conv_type(struct ir3_instruction *instr, bool *can_fold)
{
   *can_fold = true;
   switch (instr->opc) {
   /* float ops convert with a floating-point conversion: */
   case OPC_ADD_F:
   case OPC_MUL_F:
   case OPC_BARY_F:
   case OPC_MAD_F32:
   case OPC_MAD_F16:
   case OPC_WMM:
   case OPC_WMM_ACCU:
      return TYPE_F32;

   /* unsigned/bitwise ops truncate or zero-extend: */
   case OPC_ADD_U:
   case OPC_SUB_U:
   case OPC_MIN_U:
   case OPC_MAX_U:
   case OPC_AND_B:
   case OPC_OR_B:
   case OPC_NOT_B:
   case OPC_XOR_B:
   case OPC_MUL_U24:
   case OPC_MULL_U:
   case OPC_SHL_B:
   case OPC_SHR_B:
   case OPC_ASHR_B:
   case OPC_MAD_U24:
   case OPC_SHRM:
   case OPC_SHLM:
   case OPC_SHRG:
   case OPC_SHLG:
   case OPC_ANDG:
   /* Comparison ops zero-extend/truncate their results, so consider them as
    * unsigned here.
    */
   case OPC_CMPS_F:
   case OPC_CMPV_F:
   case OPC_CMPS_U:
   case OPC_CMPS_S:
      return TYPE_U32;

   /* signed ops truncate or sign-extend: */
   case OPC_ADD_S:
   case OPC_SUB_S:
   case OPC_MIN_S:
   case OPC_MAX_S:
   case OPC_ABSNEG_S:
   case OPC_MUL_S24:
   case OPC_MAD_S24:
      return TYPE_S32;

   /* We assume that any move->move folding that could be done was done by
    * NIR.
    */
   case OPC_MOV:
   default:
      *can_fold = false;
      return TYPE_U32;
   }
}
1521 
/* Return the src and dst types for the conversion which is already folded
 * into the op. We can assume that instr has folded in a conversion from
 * ir3_output_conv_src_type() to ir3_output_conv_dst_type(). Only makes sense
 * to call if ir3_output_conv_type() returns can_fold = true.
 */
static inline type_t
ir3_output_conv_src_type(struct ir3_instruction *instr, type_t base_type)
{
   switch (instr->opc) {
   case OPC_CMPS_F:
   case OPC_CMPV_F:
   case OPC_CMPS_U:
   case OPC_CMPS_S:
      /* Comparisons only return 0/1 and the size of the comparison sources
       * is irrelevant, never consider them as having an output conversion
       * by returning a type with the dest size here:
       */
      return (instr->dsts[0]->flags & IR3_REG_HALF) ? half_type(base_type)
                                                    : full_type(base_type);

   case OPC_BARY_F:
      /* bary.f doesn't have an explicit source, but we can assume here that
       * the varying data it reads is in fp32.
       *
       * This may be fp16 on older gen's depending on some register
       * settings, but it's probably not worth plumbing that through for a
       * small improvement that NIR would hopefully handle for us anyway.
       */
      return TYPE_F32;

   case OPC_FLAT_B:
      /* Treat the input data as u32 if not interpolating. */
      return TYPE_U32;

   default:
      /* otherwise the src size (half vs full) picks the src type: */
      return (instr->srcs[0]->flags & IR3_REG_HALF) ? half_type(base_type)
                                                    : full_type(base_type);
   }
}
1561 
1562 static inline type_t
ir3_output_conv_dst_type(struct ir3_instruction * instr,type_t base_type)1563 ir3_output_conv_dst_type(struct ir3_instruction *instr, type_t base_type)
1564 {
1565    return (instr->dsts[0]->flags & IR3_REG_HALF) ? half_type(base_type)
1566                                                  : full_type(base_type);
1567 }
1568 
/* Some instructions have signed/unsigned variants which are identical except
 * for whether the folded conversion sign-extends or zero-extends, and we can
 * fold in a mismatching move by rewriting the opcode. Return the opcode to
 * switch signedness, and whether one exists.
 *
 * Note: *can_swap is only ever written on failure (set to false); callers
 * must initialize it to true before calling.
 */
static inline opc_t
ir3_try_swap_signedness(opc_t opc, bool *can_swap)
{
   switch (opc) {
/* expands to the two cross-mapping cases for a signed/unsigned pair: */
#define PAIR(u, s)                                                             \
   case OPC_##u:                                                               \
      return OPC_##s;                                                          \
   case OPC_##s:                                                               \
      return OPC_##u;
      PAIR(ADD_U, ADD_S)
      PAIR(SUB_U, SUB_S)
      /* Note: these are only identical when the sources are half, but that's
       * the only case we call this function for anyway.
       */
      PAIR(MUL_U24, MUL_S24)

   default:
      *can_swap = false;
      return opc;
   }
}
1595 
1596 #define MASK(n) ((1 << (n)) - 1)
1597 
1598 /* iterator for an instructions's sources (reg), also returns src #: */
1599 #define foreach_src_n(__srcreg, __n, __instr)                                  \
1600    if ((__instr)->srcs_count)                                                  \
1601       for (struct ir3_register *__srcreg = (void *)~0; __srcreg;               \
1602            __srcreg = NULL)                                                    \
1603          for (unsigned __cnt = (__instr)->srcs_count, __n = 0; __n < __cnt;    \
1604               __n++)                                                           \
1605             if ((__srcreg = (__instr)->srcs[__n]))
1606 
1607 /* iterator for an instructions's sources (reg): */
1608 #define foreach_src(__srcreg, __instr) foreach_src_n (__srcreg, __i, __instr)
1609 
1610 /* iterator for an instructions's destinations (reg), also returns dst #: */
1611 #define foreach_dst_n(__dstreg, __n, __instr)                                  \
1612    if ((__instr)->dsts_count)                                                  \
1613       for (struct ir3_register *__dstreg = (void *)~0; __dstreg;               \
1614            __dstreg = NULL)                                                    \
1615          for (unsigned __cnt = (__instr)->dsts_count, __n = 0; __n < __cnt;    \
1616               __n++)                                                           \
1617             if ((__dstreg = (__instr)->dsts[__n]))
1618 
1619 /* iterator for an instructions's destinations (reg): */
1620 #define foreach_dst(__dstreg, __instr) foreach_dst_n (__dstreg, __i, __instr)
1621 
1622 static inline unsigned
__ssa_src_cnt(struct ir3_instruction * instr)1623 __ssa_src_cnt(struct ir3_instruction *instr)
1624 {
1625    return instr->srcs_count + instr->deps_count;
1626 }
1627 
1628 static inline bool
__is_false_dep(struct ir3_instruction * instr,unsigned n)1629 __is_false_dep(struct ir3_instruction *instr, unsigned n)
1630 {
1631    if (n >= instr->srcs_count)
1632       return true;
1633    return false;
1634 }
1635 
1636 static inline struct ir3_instruction **
__ssa_srcp_n(struct ir3_instruction * instr,unsigned n)1637 __ssa_srcp_n(struct ir3_instruction *instr, unsigned n)
1638 {
1639    if (__is_false_dep(instr, n))
1640       return &instr->deps[n - instr->srcs_count];
1641    if (ssa(instr->srcs[n]))
1642       return &instr->srcs[n]->def->instr;
1643    return NULL;
1644 }
1645 
1646 #define foreach_ssa_srcp_n(__srcp, __n, __instr)                               \
1647    for (struct ir3_instruction **__srcp = (void *)~0; __srcp; __srcp = NULL)   \
1648       for (unsigned __cnt = __ssa_src_cnt(__instr), __n = 0; __n < __cnt;      \
1649            __n++)                                                              \
1650          if ((__srcp = __ssa_srcp_n(__instr, __n)))
1651 
1652 #define foreach_ssa_srcp(__srcp, __instr)                                      \
1653    foreach_ssa_srcp_n (__srcp, __i, __instr)
1654 
1655 /* iterator for an instruction's SSA sources (instr), also returns src #: */
1656 #define foreach_ssa_src_n(__srcinst, __n, __instr)                             \
1657    for (struct ir3_instruction *__srcinst = (void *)~0; __srcinst;             \
1658         __srcinst = NULL)                                                      \
1659       foreach_ssa_srcp_n (__srcp, __n, __instr)                                \
1660          if ((__srcinst = *__srcp))
1661 
1662 /* iterator for an instruction's SSA sources (instr): */
1663 #define foreach_ssa_src(__srcinst, __instr)                                    \
1664    foreach_ssa_src_n (__srcinst, __i, __instr)
1665 
1666 /* iterators for shader inputs: */
1667 #define foreach_input_n(__ininstr, __cnt, __ir)                                \
1668    for (struct ir3_instruction *__ininstr = (void *)~0; __ininstr;             \
1669         __ininstr = NULL)                                                      \
1670       for (unsigned __cnt = 0; __cnt < (__ir)->inputs_count; __cnt++)          \
1671          if ((__ininstr = (__ir)->inputs[__cnt]))
1672 #define foreach_input(__ininstr, __ir) foreach_input_n (__ininstr, __i, __ir)
1673 
1674 /* iterators for instructions: */
1675 #define foreach_instr(__instr, __list)                                         \
1676    list_for_each_entry (struct ir3_instruction, __instr, __list, node)
1677 #define foreach_instr_rev(__instr, __list)                                     \
1678    list_for_each_entry_rev (struct ir3_instruction, __instr, __list, node)
1679 #define foreach_instr_safe(__instr, __list)                                    \
1680    list_for_each_entry_safe (struct ir3_instruction, __instr, __list, node)
1681 #define foreach_instr_from_safe(__instr, __start, __list)                      \
1682    list_for_each_entry_from_safe(struct ir3_instruction, __instr, __start,     \
1683                                  __list, node)
1684 
1685 /* iterators for blocks: */
1686 #define foreach_block(__block, __list)                                         \
1687    list_for_each_entry (struct ir3_block, __block, __list, node)
1688 #define foreach_block_safe(__block, __list)                                    \
1689    list_for_each_entry_safe (struct ir3_block, __block, __list, node)
1690 #define foreach_block_rev(__block, __list)                                     \
1691    list_for_each_entry_rev (struct ir3_block, __block, __list, node)
1692 
1693 /* iterators for arrays: */
1694 #define foreach_array(__array, __list)                                         \
1695    list_for_each_entry (struct ir3_array, __array, __list, node)
1696 #define foreach_array_safe(__array, __list)                                    \
1697    list_for_each_entry_safe (struct ir3_array, __array, __list, node)
1698 
1699 #define IR3_PASS(ir, pass, ...)                                                \
1700    ({                                                                          \
1701       bool progress = pass(ir, ##__VA_ARGS__);                                 \
1702       if (progress) {                                                          \
1703          ir3_debug_print(ir, "AFTER: " #pass);                                 \
1704          ir3_validate(ir);                                                     \
1705       }                                                                        \
1706       progress;                                                                \
1707    })
1708 
1709 /* validate: */
1710 void ir3_validate(struct ir3 *ir);
1711 
1712 /* dump: */
1713 void ir3_print(struct ir3 *ir);
1714 void ir3_print_instr(struct ir3_instruction *instr);
1715 
1716 struct log_stream;
1717 void ir3_print_instr_stream(struct log_stream *stream, struct ir3_instruction *instr);
1718 
1719 /* delay calculation: */
1720 int ir3_delayslots(struct ir3_instruction *assigner,
1721                    struct ir3_instruction *consumer, unsigned n, bool soft);
1722 unsigned ir3_delayslots_with_repeat(struct ir3_instruction *assigner,
1723                                     struct ir3_instruction *consumer,
1724                                     unsigned assigner_n, unsigned consumer_n);
1725 unsigned ir3_delay_calc(struct ir3_block *block,
1726                         struct ir3_instruction *instr, bool mergedregs);
1727 
1728 /* estimated (ss)/(sy) delay calculation */
1729 
1730 static inline bool
is_local_mem_load(struct ir3_instruction * instr)1731 is_local_mem_load(struct ir3_instruction *instr)
1732 {
1733    return instr->opc == OPC_LDL || instr->opc == OPC_LDLV ||
1734       instr->opc == OPC_LDLW;
1735 }
1736 
1737 /* Does this instruction need (ss) to wait for its result? */
1738 static inline bool
is_ss_producer(struct ir3_instruction * instr)1739 is_ss_producer(struct ir3_instruction *instr)
1740 {
1741    foreach_dst (dst, instr) {
1742       if (dst->flags & IR3_REG_SHARED)
1743          return true;
1744    }
1745    return is_sfu(instr) || is_local_mem_load(instr);
1746 }
1747 
/* The soft delay for approximating the cost of (ss). */
static inline unsigned
soft_ss_delay(struct ir3_instruction *instr)
{
   /* On a6xx, it takes the number of delay slots to get a SFU result back (ie.
    * using nop's instead of (ss) is:
    *
    *     8 - single warp
    *     9 - two warps
    *    10 - four warps
    *
    * and so on. Not quite sure where it tapers out (ie. how many warps share an
    * SFU unit). But 10 seems like a reasonable # to choose.
    *
    * For everything else (shared-reg producers), the blob adds 6 nops between
    * producers and consumers, and before we used (ss) this was sufficient in
    * most cases.
    */
   return (is_sfu(instr) || is_local_mem_load(instr)) ? 10 : 6;
}
1770 
1771 static inline bool
is_sy_producer(struct ir3_instruction * instr)1772 is_sy_producer(struct ir3_instruction *instr)
1773 {
1774    return is_tex_or_prefetch(instr) ||
1775       (is_load(instr) && !is_local_mem_load(instr)) ||
1776       is_atomic(instr->opc);
1777 }
1778 
/* The soft delay for approximating the cost of (sy), based on the
 * producing instruction and the number of result components:
 */
static inline unsigned
soft_sy_delay(struct ir3_instruction *instr, struct ir3 *shader)
{
   /* TODO: this is just an optimistic guess, we can do better post-RA.
    */
   bool double_wavesize =
      shader->type == MESA_SHADER_FRAGMENT ||
      shader->type == MESA_SHADER_COMPUTE;

   unsigned components = reg_elems(instr->dsts[0]);

   /* These numbers come from counting the number of delay slots to get
    * cat5/cat6 results back using nops instead of (sy). Note that these numbers
    * are with the result preloaded to cache by loading it before in the same
    * shader - uncached results are much larger.
    *
    * Note: most ALU instructions can't complete at the full doubled rate, so
    * they take 2 cycles. The only exception is fp16 instructions with no
    * built-in conversions. Therefore divide the latency by 2.
    *
    * TODO: Handle this properly in the scheduler and remove this.
    */
   if (instr->opc == OPC_LDC) {
      if (double_wavesize)
         return (21 + 8 * components) / 2;
      else
         return 18 + 4 * components;
   } else if (is_tex_or_prefetch(instr)) {
      if (double_wavesize) {
         switch (components) {
         case 1: return 58 / 2;
         case 2: return 60 / 2;
         case 3: return 77 / 2;
         case 4: return 79 / 2;
         default: unreachable("bad number of components");
         }
      } else {
         switch (components) {
         case 1: return 51;
         case 2: return 53;
         case 3: return 62;
         case 4: return 64;
         default: unreachable("bad number of components");
         }
      }
   } else {
      /* TODO: measure other cat6 opcodes like ldg */
      if (double_wavesize)
         return (172 + components) / 2;
      else
         return 109 + components;
   }
}
1832 
1833 
/* IR transformation passes.  The bool-returning passes appear to follow the
 * usual "return true if any progress was made" convention — NOTE(review):
 * inferred from the signatures, confirm against each pass's implementation.
 */

/* unreachable block elimination: */
bool ir3_remove_unreachable(struct ir3 *ir);

/* dead code elimination: */
struct ir3_shader_variant;
bool ir3_dce(struct ir3 *ir, struct ir3_shader_variant *so);

/* fp16 conversion folding */
bool ir3_cf(struct ir3 *ir);

/* copy-propagate: */
bool ir3_cp(struct ir3 *ir, struct ir3_shader_variant *so);
bool ir3_cp_postsched(struct ir3 *ir);

/* common subexpression elimination: */
bool ir3_cse(struct ir3 *ir);

/* Make arrays SSA */
bool ir3_array_to_ssa(struct ir3 *ir);

/* scheduling: */
bool ir3_sched_add_deps(struct ir3 *ir);
int ir3_sched(struct ir3 *ir);

struct ir3_context;
bool ir3_postsched(struct ir3 *ir, struct ir3_shader_variant *v);

/* register assignment: */
int ir3_ra(struct ir3_shader_variant *v);

/* lower subgroup ops: */
bool ir3_lower_subgroups(struct ir3 *ir);

/* legalize: */
bool ir3_legalize(struct ir3 *ir, struct ir3_shader_variant *so, int *max_bary);
1869 
1870 static inline bool
ir3_has_latency_to_hide(struct ir3 * ir)1871 ir3_has_latency_to_hide(struct ir3 *ir)
1872 {
1873    /* VS/GS/TCS/TESS  co-exist with frag shader invocations, but we don't
1874     * know the nature of the fragment shader.  Just assume it will have
1875     * latency to hide:
1876     */
1877    if (ir->type != MESA_SHADER_FRAGMENT)
1878       return true;
1879 
1880    foreach_block (block, &ir->block_list) {
1881       foreach_instr (instr, &block->instr_list) {
1882          if (is_tex_or_prefetch(instr))
1883             return true;
1884 
1885          if (is_load(instr)) {
1886             switch (instr->opc) {
1887             case OPC_LDLV:
1888             case OPC_LDL:
1889             case OPC_LDLW:
1890                break;
1891             default:
1892                return true;
1893             }
1894          }
1895       }
1896    }
1897 
1898    return false;
1899 }
1900 
1901 /* ************************************************************************* */
1902 /* instruction helpers */
1903 
1904 /* creates SSA src of correct type (ie. half vs full precision) */
1905 static inline struct ir3_register *
__ssa_src(struct ir3_instruction * instr,struct ir3_instruction * src,unsigned flags)1906 __ssa_src(struct ir3_instruction *instr, struct ir3_instruction *src,
1907           unsigned flags)
1908 {
1909    struct ir3_register *reg;
1910    if (src->dsts[0]->flags & IR3_REG_HALF)
1911       flags |= IR3_REG_HALF;
1912    reg = ir3_src_create(instr, INVALID_REG, IR3_REG_SSA | flags);
1913    reg->def = src->dsts[0];
1914    reg->wrmask = src->dsts[0]->wrmask;
1915    return reg;
1916 }
1917 
1918 static inline struct ir3_register *
__ssa_dst(struct ir3_instruction * instr)1919 __ssa_dst(struct ir3_instruction *instr)
1920 {
1921    struct ir3_register *reg = ir3_dst_create(instr, INVALID_REG, IR3_REG_SSA);
1922    reg->instr = instr;
1923    return reg;
1924 }
1925 
1926 static inline struct ir3_instruction *
create_immed_typed(struct ir3_block * block,uint32_t val,type_t type)1927 create_immed_typed(struct ir3_block *block, uint32_t val, type_t type)
1928 {
1929    struct ir3_instruction *mov;
1930    unsigned flags = (type_size(type) < 32) ? IR3_REG_HALF : 0;
1931 
1932    mov = ir3_instr_create(block, OPC_MOV, 1, 1);
1933    mov->cat1.src_type = type;
1934    mov->cat1.dst_type = type;
1935    __ssa_dst(mov)->flags |= flags;
1936    ir3_src_create(mov, 0, IR3_REG_IMMED | flags)->uim_val = val;
1937 
1938    return mov;
1939 }
1940 
/* convenience wrapper: create an untyped (u32) immediate mov */
static inline struct ir3_instruction *
create_immed(struct ir3_block *block, uint32_t val)
{
   return create_immed_typed(block, val, TYPE_U32);
}
1946 
1947 static inline struct ir3_instruction *
create_uniform_typed(struct ir3_block * block,unsigned n,type_t type)1948 create_uniform_typed(struct ir3_block *block, unsigned n, type_t type)
1949 {
1950    struct ir3_instruction *mov;
1951    unsigned flags = (type_size(type) < 32) ? IR3_REG_HALF : 0;
1952 
1953    mov = ir3_instr_create(block, OPC_MOV, 1, 1);
1954    mov->cat1.src_type = type;
1955    mov->cat1.dst_type = type;
1956    __ssa_dst(mov)->flags |= flags;
1957    ir3_src_create(mov, n, IR3_REG_CONST | flags);
1958 
1959    return mov;
1960 }
1961 
/* convenience wrapper: create an f32 uniform mov */
static inline struct ir3_instruction *
create_uniform(struct ir3_block *block, unsigned n)
{
   return create_uniform_typed(block, n, TYPE_F32);
}
1967 
1968 static inline struct ir3_instruction *
create_uniform_indirect(struct ir3_block * block,int n,type_t type,struct ir3_instruction * address)1969 create_uniform_indirect(struct ir3_block *block, int n, type_t type,
1970                         struct ir3_instruction *address)
1971 {
1972    struct ir3_instruction *mov;
1973 
1974    mov = ir3_instr_create(block, OPC_MOV, 1, 1);
1975    mov->cat1.src_type = type;
1976    mov->cat1.dst_type = type;
1977    __ssa_dst(mov);
1978    ir3_src_create(mov, 0, IR3_REG_CONST | IR3_REG_RELATIV)->array.offset = n;
1979 
1980    ir3_instr_set_address(mov, address);
1981 
1982    return mov;
1983 }
1984 
1985 static inline struct ir3_instruction *
ir3_MOV(struct ir3_block * block,struct ir3_instruction * src,type_t type)1986 ir3_MOV(struct ir3_block *block, struct ir3_instruction *src, type_t type)
1987 {
1988    struct ir3_instruction *instr = ir3_instr_create(block, OPC_MOV, 1, 1);
1989    unsigned flags = (type_size(type) < 32) ? IR3_REG_HALF : 0;
1990 
1991    __ssa_dst(instr)->flags |= flags;
1992    if (src->dsts[0]->flags & IR3_REG_ARRAY) {
1993       struct ir3_register *src_reg = __ssa_src(instr, src, IR3_REG_ARRAY);
1994       src_reg->array = src->dsts[0]->array;
1995    } else {
1996       __ssa_src(instr, src, src->dsts[0]->flags & IR3_REG_SHARED);
1997    }
1998    debug_assert(!(src->dsts[0]->flags & IR3_REG_RELATIV));
1999    instr->cat1.src_type = type;
2000    instr->cat1.dst_type = type;
2001    return instr;
2002 }
2003 
2004 static inline struct ir3_instruction *
ir3_COV(struct ir3_block * block,struct ir3_instruction * src,type_t src_type,type_t dst_type)2005 ir3_COV(struct ir3_block *block, struct ir3_instruction *src, type_t src_type,
2006         type_t dst_type)
2007 {
2008    struct ir3_instruction *instr = ir3_instr_create(block, OPC_MOV, 1, 1);
2009    unsigned dst_flags = (type_size(dst_type) < 32) ? IR3_REG_HALF : 0;
2010    unsigned src_flags = (type_size(src_type) < 32) ? IR3_REG_HALF : 0;
2011 
2012    debug_assert((src->dsts[0]->flags & IR3_REG_HALF) == src_flags);
2013 
2014    __ssa_dst(instr)->flags |= dst_flags;
2015    __ssa_src(instr, src, 0);
2016    instr->cat1.src_type = src_type;
2017    instr->cat1.dst_type = dst_type;
2018    debug_assert(!(src->dsts[0]->flags & IR3_REG_ARRAY));
2019    return instr;
2020 }
2021 
2022 static inline struct ir3_instruction *
ir3_MOVMSK(struct ir3_block * block,unsigned components)2023 ir3_MOVMSK(struct ir3_block *block, unsigned components)
2024 {
2025    struct ir3_instruction *instr = ir3_instr_create(block, OPC_MOVMSK, 1, 0);
2026 
2027    struct ir3_register *dst = __ssa_dst(instr);
2028    dst->flags |= IR3_REG_SHARED;
2029    dst->wrmask = (1 << components) - 1;
2030    instr->repeat = components - 1;
2031    return instr;
2032 }
2033 
2034 static inline struct ir3_instruction *
ir3_BALLOT_MACRO(struct ir3_block * block,struct ir3_instruction * src,unsigned components)2035 ir3_BALLOT_MACRO(struct ir3_block *block, struct ir3_instruction *src,
2036                  unsigned components)
2037 {
2038    struct ir3_instruction *instr =
2039       ir3_instr_create(block, OPC_BALLOT_MACRO, 1, 1);
2040 
2041    struct ir3_register *dst = __ssa_dst(instr);
2042    dst->flags |= IR3_REG_SHARED;
2043    dst->wrmask = (1 << components) - 1;
2044 
2045    __ssa_src(instr, src, 0);
2046 
2047    return instr;
2048 }
2049 
/* creates a nop (no dsts, no srcs) */
static inline struct ir3_instruction *
ir3_NOP(struct ir3_block *block)
{
   return ir3_instr_create(block, OPC_NOP, 0, 0);
}
2055 
/*
 * __INSTRn macros stamp out ir3_<NAME>() builder functions for instructions
 * with n SSA sources:
 *   flag      - extra ir3_instruction::flags to set (0 for the plain form)
 *   dst_count - number of SSA dsts created (0 for the *NODST variants)
 *   name      - suffix of the generated ir3_<name>() function
 *   opc       - the OPC_* opcode
 */
/* clang-format off */
#define __INSTR0(flag, name, opc)                                              \
static inline struct ir3_instruction *ir3_##name(struct ir3_block *block)      \
{                                                                              \
   struct ir3_instruction *instr = ir3_instr_create(block, opc, 1, 0);         \
   instr->flags |= flag;                                                       \
   return instr;                                                               \
}
/* clang-format on */
#define INSTR0F(f, name) __INSTR0(IR3_INSTR_##f, name##_##f, OPC_##name)
#define INSTR0(name)     __INSTR0(0, name, OPC_##name)

/* clang-format off */
#define __INSTR1(flag, dst_count, name, opc)                                   \
static inline struct ir3_instruction *ir3_##name(                              \
   struct ir3_block *block, struct ir3_instruction *a, unsigned aflags)        \
{                                                                              \
   struct ir3_instruction *instr =                                             \
      ir3_instr_create(block, opc, dst_count, 1);                              \
   for (unsigned i = 0; i < dst_count; i++)                                    \
      __ssa_dst(instr);                                                        \
   __ssa_src(instr, a, aflags);                                                \
   instr->flags |= flag;                                                       \
   return instr;                                                               \
}
/* clang-format on */
#define INSTR1F(f, name)  __INSTR1(IR3_INSTR_##f, 1, name##_##f, OPC_##name)
#define INSTR1(name)      __INSTR1(0, 1, name, OPC_##name)
#define INSTR1NODST(name) __INSTR1(0, 0, name, OPC_##name)

/* clang-format off */
#define __INSTR2(flag, dst_count, name, opc)                                   \
static inline struct ir3_instruction *ir3_##name(                              \
   struct ir3_block *block, struct ir3_instruction *a, unsigned aflags,        \
   struct ir3_instruction *b, unsigned bflags)                                 \
{                                                                              \
   struct ir3_instruction *instr = ir3_instr_create(block, opc, dst_count, 2); \
   for (unsigned i = 0; i < dst_count; i++)                                    \
      __ssa_dst(instr);                                                        \
   __ssa_src(instr, a, aflags);                                                \
   __ssa_src(instr, b, bflags);                                                \
   instr->flags |= flag;                                                       \
   return instr;                                                               \
}
/* clang-format on */
#define INSTR2F(f, name)   __INSTR2(IR3_INSTR_##f, 1, name##_##f, OPC_##name)
#define INSTR2(name)       __INSTR2(0, 1, name, OPC_##name)
#define INSTR2NODST(name)  __INSTR2(0, 0, name, OPC_##name)

/* clang-format off */
#define __INSTR3(flag, dst_count, name, opc)                                   \
static inline struct ir3_instruction *ir3_##name(                              \
   struct ir3_block *block, struct ir3_instruction *a, unsigned aflags,        \
   struct ir3_instruction *b, unsigned bflags, struct ir3_instruction *c,      \
   unsigned cflags)                                                            \
{                                                                              \
   struct ir3_instruction *instr =                                             \
      ir3_instr_create(block, opc, dst_count, 3);                              \
   for (unsigned i = 0; i < dst_count; i++)                                    \
      __ssa_dst(instr);                                                        \
   __ssa_src(instr, a, aflags);                                                \
   __ssa_src(instr, b, bflags);                                                \
   __ssa_src(instr, c, cflags);                                                \
   instr->flags |= flag;                                                       \
   return instr;                                                               \
}
/* clang-format on */
#define INSTR3F(f, name)  __INSTR3(IR3_INSTR_##f, 1, name##_##f, OPC_##name)
#define INSTR3(name)      __INSTR3(0, 1, name, OPC_##name)
#define INSTR3NODST(name) __INSTR3(0, 0, name, OPC_##name)

/* clang-format off */
#define __INSTR4(flag, dst_count, name, opc)                                   \
static inline struct ir3_instruction *ir3_##name(                              \
   struct ir3_block *block, struct ir3_instruction *a, unsigned aflags,        \
   struct ir3_instruction *b, unsigned bflags, struct ir3_instruction *c,      \
   unsigned cflags, struct ir3_instruction *d, unsigned dflags)                \
{                                                                              \
   struct ir3_instruction *instr =                                             \
      ir3_instr_create(block, opc, dst_count, 4);                              \
   for (unsigned i = 0; i < dst_count; i++)                                    \
      __ssa_dst(instr);                                                        \
   __ssa_src(instr, a, aflags);                                                \
   __ssa_src(instr, b, bflags);                                                \
   __ssa_src(instr, c, cflags);                                                \
   __ssa_src(instr, d, dflags);                                                \
   instr->flags |= flag;                                                       \
   return instr;                                                               \
}
/* clang-format on */
#define INSTR4F(f, name)  __INSTR4(IR3_INSTR_##f, 1, name##_##f, OPC_##name)
#define INSTR4(name)      __INSTR4(0, 1, name, OPC_##name)
#define INSTR4NODST(name) __INSTR4(0, 0, name, OPC_##name)

/* NOTE: unlike the others, __INSTR5 takes no dst_count (always one dst;
 * there is no INSTR5NODST user).
 */
/* clang-format off */
#define __INSTR5(flag, name, opc)                                              \
static inline struct ir3_instruction *ir3_##name(                              \
   struct ir3_block *block, struct ir3_instruction *a, unsigned aflags,        \
   struct ir3_instruction *b, unsigned bflags, struct ir3_instruction *c,      \
   unsigned cflags, struct ir3_instruction *d, unsigned dflags,                \
   struct ir3_instruction *e, unsigned eflags)                                 \
{                                                                              \
   struct ir3_instruction *instr = ir3_instr_create(block, opc, 1, 5);         \
   __ssa_dst(instr);                                                           \
   __ssa_src(instr, a, aflags);                                                \
   __ssa_src(instr, b, bflags);                                                \
   __ssa_src(instr, c, cflags);                                                \
   __ssa_src(instr, d, dflags);                                                \
   __ssa_src(instr, e, eflags);                                                \
   instr->flags |= flag;                                                       \
   return instr;                                                               \
}
/* clang-format on */
#define INSTR5F(f, name) __INSTR5(IR3_INSTR_##f, name##_##f, OPC_##name)
#define INSTR5(name)     __INSTR5(0, name, OPC_##name)
2171 
/* Six-src builder.  Fixed to pass dst_count through to ir3_instr_create()
 * like __INSTR1..4 do, instead of always reserving one dst slot; previously
 * the NODST variant (STG_A) over-allocated an unused dst register.
 */
/* clang-format off */
#define __INSTR6(flag, dst_count, name, opc)                                   \
static inline struct ir3_instruction *ir3_##name(                              \
   struct ir3_block *block, struct ir3_instruction *a, unsigned aflags,        \
   struct ir3_instruction *b, unsigned bflags, struct ir3_instruction *c,      \
   unsigned cflags, struct ir3_instruction *d, unsigned dflags,                \
   struct ir3_instruction *e, unsigned eflags, struct ir3_instruction *f,      \
   unsigned fflags)                                                            \
{                                                                              \
   struct ir3_instruction *instr =                                             \
      ir3_instr_create(block, opc, dst_count, 6);                              \
   for (unsigned i = 0; i < dst_count; i++)                                    \
      __ssa_dst(instr);                                                        \
   __ssa_src(instr, a, aflags);                                                \
   __ssa_src(instr, b, bflags);                                                \
   __ssa_src(instr, c, cflags);                                                \
   __ssa_src(instr, d, dflags);                                                \
   __ssa_src(instr, e, eflags);                                                \
   __ssa_src(instr, f, fflags);                                                \
   instr->flags |= flag;                                                       \
   return instr;                                                               \
}
/* clang-format on */
#define INSTR6F(f, name)  __INSTR6(IR3_INSTR_##f, 1, name##_##f, OPC_##name)
#define INSTR6(name)      __INSTR6(0, 1, name, OPC_##name)
#define INSTR6NODST(name) __INSTR6(0, 0, name, OPC_##name)
2197 
/* cat0 instructions (flow control): */
INSTR1NODST(B)
INSTR0(JUMP)
INSTR1NODST(KILL)
INSTR1NODST(DEMOTE)
INSTR0(END)
INSTR0(CHSH)
INSTR0(CHMASK)
INSTR1NODST(PREDT)
INSTR0(PREDF)
INSTR0(PREDE)
INSTR0(GETONE)
INSTR0(SHPS)
INSTR0(SHPE)

/* cat1 macros (lowered later by ir3_lower_subgroups): */
INSTR1(ANY_MACRO)
INSTR1(ALL_MACRO)
INSTR1(READ_FIRST_MACRO)
INSTR2(READ_COND_MACRO)
2218 
2219 static inline struct ir3_instruction *
2220 ir3_ELECT_MACRO(struct ir3_block *block)
2221 {
2222    struct ir3_instruction *instr =
2223       ir3_instr_create(block, OPC_ELECT_MACRO, 1, 0);
2224    __ssa_dst(instr);
2225    return instr;
2226 }
2227 
2228 static inline struct ir3_instruction *
ir3_SHPS_MACRO(struct ir3_block * block)2229 ir3_SHPS_MACRO(struct ir3_block *block)
2230 {
2231    struct ir3_instruction *instr =
2232       ir3_instr_create(block, OPC_SHPS_MACRO, 1, 0);
2233    __ssa_dst(instr);
2234    return instr;
2235 }
2236 
/* cat2 instructions (ALU), most 2 src but some 1 src: */
INSTR2(ADD_F)
INSTR2(MIN_F)
INSTR2(MAX_F)
INSTR2(MUL_F)
INSTR1(SIGN_F)
INSTR2(CMPS_F)
INSTR1(ABSNEG_F)
INSTR2(CMPV_F)
INSTR1(FLOOR_F)
INSTR1(CEIL_F)
INSTR1(RNDNE_F)
INSTR1(RNDAZ_F)
INSTR1(TRUNC_F)
INSTR2(ADD_U)
INSTR2(ADD_S)
INSTR2(SUB_U)
INSTR2(SUB_S)
INSTR2(CMPS_U)
INSTR2(CMPS_S)
INSTR2(MIN_U)
INSTR2(MIN_S)
INSTR2(MAX_U)
INSTR2(MAX_S)
INSTR1(ABSNEG_S)
INSTR2(AND_B)
INSTR2(OR_B)
INSTR1(NOT_B)
INSTR2(XOR_B)
INSTR2(CMPV_U)
INSTR2(CMPV_S)
INSTR2(MUL_U24)
INSTR2(MUL_S24)
INSTR2(MULL_U)
INSTR1(BFREV_B)
INSTR1(CLZ_S)
INSTR1(CLZ_B)
INSTR2(SHL_B)
INSTR2(SHR_B)
INSTR2(ASHR_B)
INSTR2(BARY_F)
INSTR2(FLAT_B)
INSTR2(MGEN_B)
INSTR2(GETBIT_B)
INSTR1(SETRM)
INSTR1(CBITS_B)
INSTR2(SHB)
INSTR2(MSAD)

/* cat3 instructions (three-src ALU / mad / select): */
INSTR3(MAD_U16)
INSTR3(MADSH_U16)
INSTR3(MAD_S16)
INSTR3(MADSH_M16)
INSTR3(MAD_U24)
INSTR3(MAD_S24)
INSTR3(MAD_F16)
INSTR3(MAD_F32)
INSTR3(DP2ACC)
INSTR3(DP4ACC)
/* NOTE: SEL_B32 checks for zero vs nonzero */
INSTR3(SEL_B16)
INSTR3(SEL_B32)
INSTR3(SEL_S16)
INSTR3(SEL_S32)
INSTR3(SEL_F16)
INSTR3(SEL_F32)
INSTR3(SAD_S16)
INSTR3(SAD_S32)

/* cat4 instructions (transcendental): */
INSTR1(RCP)
INSTR1(RSQ)
INSTR1(HRSQ)
INSTR1(LOG2)
INSTR1(HLOG2)
INSTR1(EXP2)
INSTR1(HEXP2)
INSTR1(SIN)
INSTR1(COS)
INSTR1(SQRT)

/* cat5 instructions (texture / derivatives): */
INSTR1(DSX)
INSTR1(DSXPP_MACRO)
INSTR1(DSY)
INSTR1(DSYPP_MACRO)
INSTR1F(3D, DSX)
INSTR1F(3D, DSY)
INSTR1(RGETPOS)
2327 
2328 static inline struct ir3_instruction *
2329 ir3_SAM(struct ir3_block *block, opc_t opc, type_t type, unsigned wrmask,
2330         unsigned flags, struct ir3_instruction *samp_tex,
2331         struct ir3_instruction *src0, struct ir3_instruction *src1)
2332 {
2333    struct ir3_instruction *sam;
2334    unsigned nreg = 0;
2335 
2336    if (flags & IR3_INSTR_S2EN) {
2337       nreg++;
2338    }
2339    if (src0) {
2340       nreg++;
2341    }
2342    if (src1) {
2343       nreg++;
2344    }
2345 
2346    sam = ir3_instr_create(block, opc, 1, nreg);
2347    sam->flags |= flags;
2348    __ssa_dst(sam)->wrmask = wrmask;
2349    if (flags & IR3_INSTR_S2EN) {
2350       __ssa_src(sam, samp_tex, (flags & IR3_INSTR_B) ? 0 : IR3_REG_HALF);
2351    }
2352    if (src0) {
2353       __ssa_src(sam, src0, 0);
2354    }
2355    if (src1) {
2356       __ssa_src(sam, src1, 0);
2357    }
2358    sam->cat5.type = type;
2359 
2360    return sam;
2361 }
2362 
/* cat6 instructions (load/store/atomic): */
INSTR0(GETFIBERID)
INSTR2(LDLV)
INSTR3(LDG)
INSTR3(LDL)
INSTR3(LDLW)
INSTR3(LDP)
INSTR4NODST(STG)
INSTR3NODST(STL)
INSTR3NODST(STLW)
INSTR3NODST(STP)
INSTR1(RESINFO)
INSTR1(RESFMT)
INSTR2(ATOMIC_ADD)
INSTR2(ATOMIC_SUB)
INSTR2(ATOMIC_XCHG)
INSTR2(ATOMIC_INC)
INSTR2(ATOMIC_DEC)
INSTR2(ATOMIC_CMPXCHG)
INSTR2(ATOMIC_MIN)
INSTR2(ATOMIC_MAX)
INSTR2(ATOMIC_AND)
INSTR2(ATOMIC_OR)
INSTR2(ATOMIC_XOR)
INSTR2(LDC)
INSTR2(QUAD_SHUFFLE_BRCST)
INSTR1(QUAD_SHUFFLE_HORIZ)
INSTR1(QUAD_SHUFFLE_VERT)
INSTR1(QUAD_SHUFFLE_DIAG)
INSTR2NODST(LDC_K)
INSTR2NODST(STC)
/* Generation-specific cat6 builders.  Stray trailing semicolons removed:
 * each INSTR* invocation already expands to a complete function definition,
 * so a following ';' is an empty file-scope declaration (rejected by ISO C
 * pre-C23, warned by -Wpedantic/-Wextra-semi) and inconsistent with every
 * other invocation in this header.
 */
#if GPU >= 600
INSTR3NODST(STIB)
INSTR2(LDIB)
INSTR5(LDG_A)
INSTR6NODST(STG_A)
INSTR2(ATOMIC_G_ADD)
INSTR2(ATOMIC_G_SUB)
INSTR2(ATOMIC_G_XCHG)
INSTR2(ATOMIC_G_INC)
INSTR2(ATOMIC_G_DEC)
INSTR2(ATOMIC_G_CMPXCHG)
INSTR2(ATOMIC_G_MIN)
INSTR2(ATOMIC_G_MAX)
INSTR2(ATOMIC_G_AND)
INSTR2(ATOMIC_G_OR)
INSTR2(ATOMIC_G_XOR)
INSTR3(ATOMIC_B_ADD)
INSTR3(ATOMIC_B_SUB)
INSTR3(ATOMIC_B_XCHG)
INSTR3(ATOMIC_B_INC)
INSTR3(ATOMIC_B_DEC)
INSTR3(ATOMIC_B_CMPXCHG)
INSTR3(ATOMIC_B_MIN)
INSTR3(ATOMIC_B_MAX)
INSTR3(ATOMIC_B_AND)
INSTR3(ATOMIC_B_OR)
INSTR3(ATOMIC_B_XOR)
#elif GPU >= 400
INSTR3(LDGB)
#if GPU >= 500
INSTR3(LDIB)
#endif
INSTR4NODST(STGB)
INSTR4NODST(STIB)
INSTR4(ATOMIC_S_ADD)
INSTR4(ATOMIC_S_SUB)
INSTR4(ATOMIC_S_XCHG)
INSTR4(ATOMIC_S_INC)
INSTR4(ATOMIC_S_DEC)
INSTR4(ATOMIC_S_CMPXCHG)
INSTR4(ATOMIC_S_MIN)
INSTR4(ATOMIC_S_MAX)
INSTR4(ATOMIC_S_AND)
INSTR4(ATOMIC_S_OR)
INSTR4(ATOMIC_S_XOR)
#endif
2440 
/* cat7 instructions (barriers/fences): */
INSTR0(BAR)
INSTR0(FENCE)
2444 
2445 /* ************************************************************************* */
2446 #include "bitset.h"
2447 
2448 #define MAX_REG 256
2449 
2450 typedef BITSET_DECLARE(regmaskstate_t, 2 * MAX_REG);
2451 
2452 typedef struct {
2453    bool mergedregs;
2454    regmaskstate_t mask;
2455 } regmask_t;
2456 
2457 static inline bool
__regmask_get(regmask_t * regmask,bool half,unsigned n)2458 __regmask_get(regmask_t *regmask, bool half, unsigned n)
2459 {
2460    if (regmask->mergedregs) {
2461       /* a6xx+ case, with merged register file, we track things in terms
2462        * of half-precision registers, with a full precisions register
2463        * using two half-precision slots.
2464        *
2465        * Pretend that special regs (a0.x, a1.x, etc.) are full registers to
2466        * avoid having them alias normal full regs.
2467        */
2468       if (half && !is_reg_num_special(n)) {
2469          return BITSET_TEST(regmask->mask, n);
2470       } else {
2471          n *= 2;
2472          return BITSET_TEST(regmask->mask, n) ||
2473                 BITSET_TEST(regmask->mask, n + 1);
2474       }
2475    } else {
2476       /* pre a6xx case, with separate register file for half and full
2477        * precision:
2478        */
2479       if (half)
2480          n += MAX_REG;
2481       return BITSET_TEST(regmask->mask, n);
2482    }
2483 }
2484 
2485 static inline void
__regmask_set(regmask_t * regmask,bool half,unsigned n)2486 __regmask_set(regmask_t *regmask, bool half, unsigned n)
2487 {
2488    if (regmask->mergedregs) {
2489       /* a6xx+ case, with merged register file, we track things in terms
2490        * of half-precision registers, with a full precisions register
2491        * using two half-precision slots:
2492        */
2493       if (half && !is_reg_num_special(n)) {
2494          BITSET_SET(regmask->mask, n);
2495       } else {
2496          n *= 2;
2497          BITSET_SET(regmask->mask, n);
2498          BITSET_SET(regmask->mask, n + 1);
2499       }
2500    } else {
2501       /* pre a6xx case, with separate register file for half and full
2502        * precision:
2503        */
2504       if (half)
2505          n += MAX_REG;
2506       BITSET_SET(regmask->mask, n);
2507    }
2508 }
2509 
2510 static inline void
__regmask_clear(regmask_t * regmask,bool half,unsigned n)2511 __regmask_clear(regmask_t *regmask, bool half, unsigned n)
2512 {
2513    if (regmask->mergedregs) {
2514       /* a6xx+ case, with merged register file, we track things in terms
2515        * of half-precision registers, with a full precisions register
2516        * using two half-precision slots:
2517        */
2518       if (half && !is_reg_num_special(n)) {
2519          BITSET_CLEAR(regmask->mask, n);
2520       } else {
2521          n *= 2;
2522          BITSET_CLEAR(regmask->mask, n);
2523          BITSET_CLEAR(regmask->mask, n + 1);
2524       }
2525    } else {
2526       /* pre a6xx case, with separate register file for half and full
2527        * precision:
2528        */
2529       if (half)
2530          n += MAX_REG;
2531       BITSET_CLEAR(regmask->mask, n);
2532    }
2533 }
2534 
2535 static inline void
regmask_init(regmask_t * regmask,bool mergedregs)2536 regmask_init(regmask_t *regmask, bool mergedregs)
2537 {
2538    memset(&regmask->mask, 0, sizeof(regmask->mask));
2539    regmask->mergedregs = mergedregs;
2540 }
2541 
2542 static inline void
regmask_or(regmask_t * dst,regmask_t * a,regmask_t * b)2543 regmask_or(regmask_t *dst, regmask_t *a, regmask_t *b)
2544 {
2545    assert(dst->mergedregs == a->mergedregs);
2546    assert(dst->mergedregs == b->mergedregs);
2547 
2548    for (unsigned i = 0; i < ARRAY_SIZE(dst->mask); i++)
2549       dst->mask[i] = a->mask[i] | b->mask[i];
2550 }
2551 
/* dst = a | (b restricted to the shared registers).
 *
 * The bit ranges correspond to 4*48 .. 4*56-1, i.e. register components of
 * r48.x through r55.w, doubled in merged-register mode where each component
 * occupies two half-reg slots.  NOTE(review): shared-reg range inferred from
 * the constants — confirm against the a6xx ISA docs.
 */
static inline void
regmask_or_shared(regmask_t *dst, regmask_t *a, regmask_t *b)
{
   regmaskstate_t shared_mask;
   BITSET_ZERO(shared_mask);

   if (b->mergedregs) {
      BITSET_SET_RANGE(shared_mask, 2 * 4 * 48, 2 * 4 * 56 - 1);
   } else {
      BITSET_SET_RANGE(shared_mask, 4 * 48, 4 * 56 - 1);
   }

   for (unsigned i = 0; i < ARRAY_SIZE(dst->mask); i++)
      dst->mask[i] = a->mask[i] | (b->mask[i] & shared_mask[i]);
}
2567 
2568 static inline void
regmask_set(regmask_t * regmask,struct ir3_register * reg)2569 regmask_set(regmask_t *regmask, struct ir3_register *reg)
2570 {
2571    bool half = reg->flags & IR3_REG_HALF;
2572    if (reg->flags & IR3_REG_RELATIV) {
2573       for (unsigned i = 0; i < reg->size; i++)
2574          __regmask_set(regmask, half, reg->array.base + i);
2575    } else {
2576       for (unsigned mask = reg->wrmask, n = reg->num; mask; mask >>= 1, n++)
2577          if (mask & 1)
2578             __regmask_set(regmask, half, n);
2579    }
2580 }
2581 
2582 static inline void
regmask_clear(regmask_t * regmask,struct ir3_register * reg)2583 regmask_clear(regmask_t *regmask, struct ir3_register *reg)
2584 {
2585    bool half = reg->flags & IR3_REG_HALF;
2586    if (reg->flags & IR3_REG_RELATIV) {
2587       for (unsigned i = 0; i < reg->size; i++)
2588          __regmask_clear(regmask, half, reg->array.base + i);
2589    } else {
2590       for (unsigned mask = reg->wrmask, n = reg->num; mask; mask >>= 1, n++)
2591          if (mask & 1)
2592             __regmask_clear(regmask, half, n);
2593    }
2594 }
2595 
2596 static inline bool
regmask_get(regmask_t * regmask,struct ir3_register * reg)2597 regmask_get(regmask_t *regmask, struct ir3_register *reg)
2598 {
2599    bool half = reg->flags & IR3_REG_HALF;
2600    if (reg->flags & IR3_REG_RELATIV) {
2601       for (unsigned i = 0; i < reg->size; i++)
2602          if (__regmask_get(regmask, half, reg->array.base + i))
2603             return true;
2604    } else {
2605       for (unsigned mask = reg->wrmask, n = reg->num; mask; mask >>= 1, n++)
2606          if (mask & 1)
2607             if (__regmask_get(regmask, half, n))
2608                return true;
2609    }
2610    return false;
2611 }
2612 /* ************************************************************************* */
2613 
2614 #endif /* IR3_H_ */
2615