1 /*
2  * Copyright (c) 2013 Rob Clark <robdclark@gmail.com>
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21  * SOFTWARE.
22  */
23 
24 #ifndef IR3_H_
25 #define IR3_H_
26 
27 #include <stdbool.h>
28 #include <stdint.h>
29 
30 #include "compiler/shader_enums.h"
31 
32 #include "util/bitscan.h"
33 #include "util/list.h"
34 #include "util/set.h"
35 #include "util/u_debug.h"
36 
37 #include "instr-a3xx.h"
38 
39 /* low level intermediate representation of an adreno shader program */
40 
41 struct ir3_compiler;
42 struct ir3;
43 struct ir3_instruction;
44 struct ir3_block;
45 
/* Statistics and sizing info collected about a shader at assembly time: */
struct ir3_info {
   void *data; /* used internally in ir3 assembler */
   /* Size in bytes of the shader binary, including NIR constants and
    * padding
    */
   uint32_t size;
   /* byte offset from start of the shader to the NIR constant data. */
   uint32_t constant_data_offset;
   /* Size in dwords of the instructions. */
   uint16_t sizedwords;
   uint16_t instrs_count; /* expanded to account for rpt's */
   uint16_t nops_count;   /* # of nop instructions, including nopN */
   uint16_t mov_count;
   uint16_t cov_count;
   uint16_t stp_count;
   uint16_t ldp_count;
   /* NOTE: max_reg, etc, does not include registers not touched
    * by the shader (ie. vertex fetched via VFD_DECODE but not
    * touched by shader)
    */
   int8_t max_reg; /* highest GPR # used by shader */
   int8_t max_half_reg;
   int16_t max_const; /* highest const-file address used (NOTE(review): confirm units) */
   /* This is the maximum # of waves that can executed at once in one core,
    * assuming that they are all executing this shader.
    */
   int8_t max_waves;
   bool double_threadsize;
   bool multi_dword_ldp_stp;

   /* number of sync bits: */
   uint16_t ss, sy;

   /* estimate of number of cycles stalled on (ss) */
   uint16_t sstall;

   uint16_t last_baryf; /* instruction # of last varying fetch */

   /* Number of instructions of a given category: */
   uint16_t instrs_per_cat[8];
};
87 
/* A group of registers tracked together as one interval, with a spill slot
 * and preferred/aligned placement (NOTE(review): maintained by the RA
 * merge-set machinery -- confirm against ir3_ra sources).
 */
struct ir3_merge_set {
   uint16_t preferred_reg;
   uint16_t size;
   uint16_t alignment;

   unsigned interval_start;
   unsigned spill_slot;

   unsigned regs_count;        /* number of entries in 'regs' */
   struct ir3_register **regs; /* the registers belonging to this set */
};
99 
/* A single source or destination operand of an instruction: */
struct ir3_register {
   enum {
      IR3_REG_CONST = 0x001,
      IR3_REG_IMMED = 0x002,
      IR3_REG_HALF = 0x004,
      /* Shared registers have the same value for all threads when read.
       * They can only be written when one thread is active (that is, inside
       * a "getone" block).
       */
      IR3_REG_SHARED = 0x008,
      IR3_REG_RELATIV = 0x010,
      IR3_REG_R = 0x020,
      /* Most instructions, it seems, can do float abs/neg but not
       * integer.  The CP pass needs to know what is intended (int or
       * float) in order to do the right thing.  For this reason the
       * abs/neg flags are split out into float and int variants.  In
       * addition, .b (bitwise) operations, the negate is actually a
       * bitwise not, so split that out into a new flag to make it
       * more clear.
       */
      IR3_REG_FNEG = 0x040,
      IR3_REG_FABS = 0x080,
      IR3_REG_SNEG = 0x100,
      IR3_REG_SABS = 0x200,
      IR3_REG_BNOT = 0x400,
      /* note: 0x800 and 0x1000 are currently unused in this enum */
      /* (ei) flag, end-input?  Set on last bary, presumably to signal
       * that the shader needs no more input:
       */
      IR3_REG_EI = 0x2000,
      /* meta-flags, for intermediate stages of IR, ie.
       * before register assignment is done:
       */
      IR3_REG_SSA = 0x4000, /* 'def' is ptr to assigning destination */
      IR3_REG_ARRAY = 0x8000,

      /* Set on a use whenever the SSA value becomes dead after the current
       * instruction.
       */
      IR3_REG_KILL = 0x10000,

      /* Similar to IR3_REG_KILL, except that if there are multiple uses of the
       * same SSA value in a single instruction, this is only set on the first
       * use.
       */
      IR3_REG_FIRST_KILL = 0x20000,

      /* Set when a destination doesn't have any uses and is dead immediately
       * after the instruction. This can happen even after optimizations for
       * corner cases such as destinations of atomic instructions.
       */
      IR3_REG_UNUSED = 0x40000,
   } flags;

   /* NOTE(review): value name used during RA -- confirm which pass assigns it */
   unsigned name;

   /* used for cat5 instructions, but also for internal/IR level
    * tracking of what registers are read/written by an instruction.
    * wrmask may be a bad name since it is used to represent both
    * src and dst that touch multiple adjacent registers.
    */
   unsigned wrmask : 16; /* up to vec16 */

   /* for relative addressing, 32bits for array size is too small,
    * but otoh we don't need to deal with disjoint sets, so instead
    * use a simple size field (number of scalar components).
    *
    * Note the size field isn't important for relative const (since
    * we don't have to do register allocation for constants).
    */
   unsigned size : 16;

   /* normal registers:
    * the component is in the low two bits of the reg #, so
    * rN.x becomes: (N << 2) | x
    */
   uint16_t num;
   union {
      /* immediate: */
      int32_t iim_val;
      uint32_t uim_val;
      float fim_val;
      /* relative: */
      struct {
         uint16_t id;
         int16_t offset;
         uint16_t base;
      } array;
   };

   /* For IR3_REG_DEST, pointer back to the instruction containing this
    * register.
    */
   struct ir3_instruction *instr;

   /* For IR3_REG_SSA, src registers contain ptr back to assigning
    * instruction.
    *
    * For IR3_REG_ARRAY, the pointer is back to the last dependent
    * array access (although the net effect is the same, it points
    * back to a previous instruction that we depend on).
    */
   struct ir3_register *def;

   /* Pointer to another register in the instruction that must share the same
    * physical register. Each destination can be tied with one source, and
    * they must have "tied" pointing to each other.
    */
   struct ir3_register *tied;

   /* NOTE(review): used by the spiller / liveness analysis -- confirm */
   unsigned spill_slot, next_use;

   /* merge-set (register coalescing) bookkeeping, see struct ir3_merge_set: */
   unsigned merge_set_offset;
   struct ir3_merge_set *merge_set;
   unsigned interval_start, interval_end;
};
215 
216 /*
217  * Stupid/simple growable array implementation:
218  */
/* Declares a growable array member 'name' along with its element count
 * ('name##_count') and allocated capacity ('name##_sz') bookkeeping fields.
 */
#define DECLARE_ARRAY(type, name)                                              \
   unsigned name##_count, name##_sz;                                           \
   type *name;

/* Appends __VA_ARGS__ to 'arr' (declared via DECLARE_ARRAY), doubling the
 * ralloc'd (against 'ctx') storage when full.  NOTE: 'arr' is expanded
 * multiple times -- only pass a plain identifier, never an expression with
 * side effects.
 */
#define array_insert(ctx, arr, ...)                                            \
   do {                                                                        \
      if (arr##_count == arr##_sz) {                                           \
         arr##_sz = MAX2(2 * arr##_sz, 16);                                    \
         arr = reralloc_size(ctx, arr, arr##_sz * sizeof(arr[0]));             \
      }                                                                        \
      arr[arr##_count++] = __VA_ARGS__;                                        \
   } while (0)
231 
/* A single IR instruction: opcode, flags, src/dst registers, plus
 * per-category encoding fields and scheduling/dependency metadata:
 */
struct ir3_instruction {
   struct ir3_block *block;
   opc_t opc;
   enum {
      /* (sy) flag is set on first instruction, and after sample
       * instructions (probably just on RAW hazard).
       */
      IR3_INSTR_SY = 0x001,
      /* (ss) flag is set on first instruction, and first instruction
       * to depend on the result of "long" instructions (RAW hazard):
       *
       *   rcp, rsq, log2, exp2, sin, cos, sqrt
       *
       * It seems to synchronize until all in-flight instructions are
       * completed, for example:
       *
       *   rsq hr1.w, hr1.w
       *   add.f hr2.z, (neg)hr2.z, hc0.y
       *   mul.f hr2.w, (neg)hr2.y, (neg)hr2.y
       *   rsq hr2.x, hr2.x
       *   (rpt1)nop
       *   mad.f16 hr2.w, hr2.z, hr2.z, hr2.w
       *   nop
       *   mad.f16 hr2.w, (neg)hr0.w, (neg)hr0.w, hr2.w
       *   (ss)(rpt2)mul.f hr1.x, (r)hr1.x, hr1.w
       *   (rpt2)mul.f hr0.x, (neg)(r)hr0.x, hr2.x
       *
       * The last mul.f does not have (ss) set, presumably because the
       * (ss) on the previous instruction does the job.
       *
       * The blob driver also seems to set it on WAR hazards, although
       * not really clear if this is needed or just blob compiler being
       * sloppy.  So far I haven't found a case where removing the (ss)
       * causes problems for WAR hazard, but I could just be getting
       * lucky:
       *
       *   rcp r1.y, r3.y
       *   (ss)(rpt2)mad.f32 r3.y, (r)c9.x, r1.x, (r)r3.z
       *
       */
      IR3_INSTR_SS = 0x002,
      /* (jp) flag is set on jump targets:
       */
      IR3_INSTR_JP = 0x004,
      IR3_INSTR_UL = 0x008,
      IR3_INSTR_3D = 0x010,
      IR3_INSTR_A = 0x020,
      IR3_INSTR_O = 0x040,
      IR3_INSTR_P = 0x080,
      IR3_INSTR_S = 0x100,
      IR3_INSTR_S2EN = 0x200,
      IR3_INSTR_G = 0x400,
      IR3_INSTR_SAT = 0x800,
      /* (cat5/cat6) Bindless */
      IR3_INSTR_B = 0x1000,
      /* (cat5/cat6) nonuniform */
      IR3_INSTR_NONUNIF = 0x02000,
      /* (cat5-only) Get some parts of the encoding from a1.x */
      IR3_INSTR_A1EN = 0x04000,
      /* meta-flags, for intermediate stages of IR, ie.
       * before register assignment is done:
       */
      IR3_INSTR_MARK = 0x08000,
      IR3_INSTR_UNUSED = 0x10000,
   } flags;
   uint8_t repeat; /* (rptN): # of times the instruction repeats */
   uint8_t nop;    /* (nopN): NOTE(review): nops folded into this instr -- confirm */
#ifdef DEBUG
   unsigned srcs_max, dsts_max;
#endif
   unsigned srcs_count, dsts_count;
   struct ir3_register **dsts;
   struct ir3_register **srcs;
   /* Per-category extra encoding fields, selected by opc_cat(opc): */
   union {
      struct {
         char inv1, inv2;
         char comp1, comp2;
         int immed;
         struct ir3_block *target;
         const char *target_label;
         brtype_t brtype;
         unsigned idx; /* for brac.N */
      } cat0;
      struct {
         type_t src_type, dst_type;
         round_t round;
      } cat1;
      struct {
         enum {
            IR3_COND_LT = 0,
            IR3_COND_LE = 1,
            IR3_COND_GT = 2,
            IR3_COND_GE = 3,
            IR3_COND_EQ = 4,
            IR3_COND_NE = 5,
         } condition;
      } cat2;
      struct {
         unsigned samp, tex;
         unsigned tex_base : 3;
         type_t type;
      } cat5;
      struct {
         type_t type;
         /* TODO remove dst_offset and handle as a ir3_register
          * which might be IMMED, similar to how src_offset is
          * handled.
          */
         int dst_offset;
         int iim_val   : 3; /* for ldgb/stgb, # of components */
         unsigned d    : 3; /* for ldc, component offset */
         bool typed    : 1;
         unsigned base : 3;
      } cat6;
      struct {
         unsigned w : 1; /* write */
         unsigned r : 1; /* read */
         unsigned l : 1; /* local */
         unsigned g : 1; /* global */
      } cat7;
      /* for meta-instructions, just used to hold extra data
       * before instruction scheduling, etc
       */
      struct {
         int off; /* component/offset */
      } split;
      struct {
         /* Per-source index back to the entry in the
          * ir3_shader_variant::outputs table.
          */
         unsigned *outidxs;
      } end;
      struct {
         /* used to temporarily hold reference to nir_phi_instr
          * until we resolve the phi srcs
          */
         void *nphi;
      } phi;
      struct {
         unsigned samp, tex;
         unsigned input_offset;
         unsigned samp_base : 3;
         unsigned tex_base  : 3;
      } prefetch;
      struct {
         /* maps back to entry in ir3_shader_variant::inputs table: */
         int inidx;
         /* for sysvals, identifies the sysval type.  Mostly so we can
          * identify the special cases where a sysval should not be DCE'd
          * (currently, just pre-fs texture fetch)
          */
         gl_system_value sysval;
      } input;
   };

   /* For assigning jump offsets, we need instruction's position: */
   uint32_t ip;

   /* used for per-pass extra instruction data.
    *
    * TODO we should remove the per-pass data like this and 'use_count'
    * and do something similar to what RA does w/ ir3_ra_instr_data..
    * ie. use the ir3_count_instructions pass, and then use instr->ip
    * to index into a table of pass-private data.
    */
   void *data;

   /**
    * Valid if pass calls ir3_find_ssa_uses().. see foreach_ssa_use()
    */
   struct set *uses;

   int use_count; /* currently just updated/used by cp */

   /* an instruction can reference at most one address register amongst
    * it's src/dst registers.  Beyond that, you need to insert mov's.
    *
    * NOTE: do not write this directly, use ir3_instr_set_address()
    */
   struct ir3_register *address;

   /* Tracking for additional dependent instructions.  Used to handle
    * barriers, WAR hazards for arrays/SSBOs/etc.
    */
   DECLARE_ARRAY(struct ir3_instruction *, deps);

   /*
    * From PoV of instruction scheduling, not execution (ie. ignores global/
    * local distinction):
    *                            shared  image  atomic  SSBO  everything
    *   barrier()/            -   R/W     R/W    R/W     R/W       X
    *     groupMemoryBarrier()
    *     memoryBarrier()
    *     (but only images declared coherent?)
    *   memoryBarrierAtomic() -                  R/W
    *   memoryBarrierBuffer() -                          R/W
    *   memoryBarrierImage()  -           R/W
    *   memoryBarrierShared() -   R/W
    *
    * TODO I think for SSBO/image/shared, in cases where we can determine
    * which variable is accessed, we don't need to care about accesses to
    * different variables (unless declared coherent??)
    */
   enum {
      IR3_BARRIER_EVERYTHING = 1 << 0,
      IR3_BARRIER_SHARED_R = 1 << 1,
      IR3_BARRIER_SHARED_W = 1 << 2,
      IR3_BARRIER_IMAGE_R = 1 << 3,
      IR3_BARRIER_IMAGE_W = 1 << 4,
      IR3_BARRIER_BUFFER_R = 1 << 5,
      IR3_BARRIER_BUFFER_W = 1 << 6,
      IR3_BARRIER_ARRAY_R = 1 << 7,
      IR3_BARRIER_ARRAY_W = 1 << 8,
      IR3_BARRIER_PRIVATE_R = 1 << 9,
      IR3_BARRIER_PRIVATE_W = 1 << 10,
   } barrier_class,
      barrier_conflict;

   /* Entry in ir3_block's instruction list: */
   struct list_head node;

   uint32_t serialno;

   // TODO only computerator/assembler:
   int line;
};
458 
/* Top-level IR container for one shader: the list of blocks/arrays plus
 * various instruction lists tracked for scheduling and workarounds:
 */
struct ir3 {
   struct ir3_compiler *compiler;
   gl_shader_stage type;

   DECLARE_ARRAY(struct ir3_instruction *, inputs);

   /* Track bary.f (and ldlv) instructions.. this is needed in
    * scheduling to ensure that all varying fetches happen before
    * any potential kill instructions.  The hw gets grumpy if all
    * threads in a group are killed before the last bary.f gets
    * a chance to signal end of input (ei).
    */
   DECLARE_ARRAY(struct ir3_instruction *, baryfs);

   /* Track all indirect instructions (read and write).  To avoid
    * deadlock scenario where an address register gets scheduled,
    * but other dependent src instructions cannot be scheduled due
    * to dependency on a *different* address register value, the
    * scheduler needs to ensure that all dependencies other than
    * the instruction other than the address register are scheduled
    * before the one that writes the address register.  Having a
    * convenient list of instructions that reference some address
    * register simplifies this.
    */
   DECLARE_ARRAY(struct ir3_instruction *, a0_users);

   /* same for a1.x: */
   DECLARE_ARRAY(struct ir3_instruction *, a1_users);

   /* and same for instructions that consume predicate register: */
   DECLARE_ARRAY(struct ir3_instruction *, predicates);

   /* Track texture sample instructions which need texture state
    * patched in (for astc-srgb workaround):
    */
   DECLARE_ARRAY(struct ir3_instruction *, astc_srgb);

   /* List of blocks: */
   struct list_head block_list;

   /* List of ir3_array's: */
   struct list_head array_list;

#ifdef DEBUG
   unsigned block_count;
#endif
   unsigned instr_count;
};
507 
/* State for an array, corresponding to a nir_register (relative-addressed
 * accesses go through these rather than plain SSA values):
 */
struct ir3_array {
   struct list_head node; /* entry in ir3::array_list */
   unsigned length;       /* NOTE(review): in scalar components -- confirm */
   unsigned id;           /* looked up via ir3_lookup_array() */

   struct nir_register *r;

   /* To avoid array write's from getting DCE'd, keep track of the
    * most recent write.  Any array access depends on the most
    * recent write.  This way, nothing depends on writes after the
    * last read.  But all the writes that happen before that have
    * something depending on them
    */
   struct ir3_register *last_write;

   /* extra stuff used in RA pass: */
   unsigned base; /* base vreg name */
   unsigned reg;  /* base physical reg */
   uint16_t start_ip, end_ip;

   /* Indicates if half-precision */
   bool half;

   bool unused;
};
533 
534 struct ir3_array *ir3_lookup_array(struct ir3 *ir, unsigned id);
535 
/* How the branch condition of a two-successor block is interpreted
 * (see ir3_block::brtype):
 */
enum ir3_branch_type {
   IR3_BRANCH_COND,   /* condition */
   IR3_BRANCH_ANY,    /* subgroupAny(condition) */
   IR3_BRANCH_ALL,    /* subgroupAll(condition) */
   IR3_BRANCH_GETONE, /* subgroupElect() */
};
542 
/* A basic block in the shader's control-flow graph: */
struct ir3_block {
   struct list_head node; /* entry in ir3::block_list */
   struct ir3 *shader;

   const struct nir_block *nblock;

   struct list_head instr_list; /* list of ir3_instruction */

   /* The actual branch condition, if there are two successors */
   enum ir3_branch_type brtype;

   /* each block has either one or two successors.. in case of two
    * successors, 'condition' decides which one to follow.  A block preceding
    * an if/else has two successors.
    *
    * In some cases the path that the machine actually takes through the
    * program may not match the per-thread view of the CFG. In particular
    * this is the case for if/else, where the machine jumps from the end of
    * the if to the beginning of the else and switches active lanes. While
    * most things only care about the per-thread view, we need to use the
    * "physical" view when allocating shared registers. "successors" contains
    * the per-thread successors, and "physical_successors" contains the
    * physical successors which includes the fallthrough edge from the if to
    * the else.
    */
   struct ir3_instruction *condition;
   struct ir3_block *successors[2];
   struct ir3_block *physical_successors[2];

   DECLARE_ARRAY(struct ir3_block *, predecessors);
   DECLARE_ARRAY(struct ir3_block *, physical_predecessors);

   uint16_t start_ip, end_ip;

   /* Track instructions which do not write a register but other-
    * wise must not be discarded (such as kill, stg, etc)
    */
   DECLARE_ARRAY(struct ir3_instruction *, keeps);

   /* used for per-pass extra block data.  Mainly used right
    * now in RA step to track livein/liveout.
    */
   void *data;

   uint32_t index;

   /* Dominator-tree state, computed by ir3_calc_dominance(): */
   struct ir3_block *imm_dom;
   DECLARE_ARRAY(struct ir3_block *, dom_children);

   /* pre/post-order numbering of the dominator tree, used by
    * ir3_block_dominates():
    */
   uint32_t dom_pre_index;
   uint32_t dom_post_index;

   uint32_t loop_id;
   uint32_t loop_depth;

#ifdef DEBUG
   uint32_t serialno; /* unique id, for debug output */
#endif
};
602 
/* Returns a (debug-friendly) identifier for the block.  In non-DEBUG
 * builds this is just derived from the pointer value.
 */
static inline uint32_t
block_id(struct ir3_block *block)
{
#ifdef DEBUG
   return block->serialno;
#else
   /* Cast via uintptr_t rather than 'unsigned long': on LLP64 targets
    * (e.g. 64-bit Windows) long is only 32 bits, so the pointer->long
    * conversion is not value-preserving.  We only want the low 32 bits
    * as an id, but take them portably.
    */
   return (uint32_t)(uintptr_t)block;
#endif
}
612 
613 static inline struct ir3_block *
ir3_start_block(struct ir3 * ir)614 ir3_start_block(struct ir3 *ir)
615 {
616    return list_first_entry(&ir->block_list, struct ir3_block, node);
617 }
618 
619 void ir3_block_add_predecessor(struct ir3_block *block, struct ir3_block *pred);
620 void ir3_block_add_physical_predecessor(struct ir3_block *block,
621                                         struct ir3_block *pred);
622 void ir3_block_remove_predecessor(struct ir3_block *block,
623                                   struct ir3_block *pred);
624 void ir3_block_remove_physical_predecessor(struct ir3_block *block,
625                                            struct ir3_block *pred);
626 unsigned ir3_block_get_pred_index(struct ir3_block *block,
627                                   struct ir3_block *pred);
628 
629 void ir3_calc_dominance(struct ir3 *ir);
630 bool ir3_block_dominates(struct ir3_block *a, struct ir3_block *b);
631 
632 struct ir3_shader_variant;
633 
634 struct ir3 *ir3_create(struct ir3_compiler *compiler,
635                        struct ir3_shader_variant *v);
636 void ir3_destroy(struct ir3 *shader);
637 
638 void ir3_collect_info(struct ir3_shader_variant *v);
639 void *ir3_alloc(struct ir3 *shader, int sz);
640 
641 unsigned ir3_get_reg_dependent_max_waves(const struct ir3_compiler *compiler,
642                                          unsigned reg_count,
643                                          bool double_threadsize);
644 
645 unsigned ir3_get_reg_independent_max_waves(struct ir3_shader_variant *v,
646                                            bool double_threadsize);
647 
648 bool ir3_should_double_threadsize(struct ir3_shader_variant *v,
649                                   unsigned regs_count);
650 
651 struct ir3_block *ir3_block_create(struct ir3 *shader);
652 
653 struct ir3_instruction *ir3_instr_create(struct ir3_block *block, opc_t opc,
654                                          int ndst, int nsrc);
655 struct ir3_instruction *ir3_instr_clone(struct ir3_instruction *instr);
656 void ir3_instr_add_dep(struct ir3_instruction *instr,
657                        struct ir3_instruction *dep);
658 const char *ir3_instr_name(struct ir3_instruction *instr);
659 
660 struct ir3_register *ir3_src_create(struct ir3_instruction *instr, int num,
661                                     int flags);
662 struct ir3_register *ir3_dst_create(struct ir3_instruction *instr, int num,
663                                     int flags);
664 struct ir3_register *ir3_reg_clone(struct ir3 *shader,
665                                    struct ir3_register *reg);
666 
667 static inline void
ir3_reg_tie(struct ir3_register * dst,struct ir3_register * src)668 ir3_reg_tie(struct ir3_register *dst, struct ir3_register *src)
669 {
670    assert(!dst->tied && !src->tied);
671    dst->tied = src;
672    src->tied = dst;
673 }
674 
675 void ir3_reg_set_last_array(struct ir3_instruction *instr,
676                             struct ir3_register *reg,
677                             struct ir3_register *last_write);
678 
679 void ir3_instr_set_address(struct ir3_instruction *instr,
680                            struct ir3_instruction *addr);
681 
682 static inline bool
ir3_instr_check_mark(struct ir3_instruction * instr)683 ir3_instr_check_mark(struct ir3_instruction *instr)
684 {
685    if (instr->flags & IR3_INSTR_MARK)
686       return true; /* already visited */
687    instr->flags |= IR3_INSTR_MARK;
688    return false;
689 }
690 
691 void ir3_block_clear_mark(struct ir3_block *block);
692 void ir3_clear_mark(struct ir3 *shader);
693 
694 unsigned ir3_count_instructions(struct ir3 *ir);
695 unsigned ir3_count_instructions_ra(struct ir3 *ir);
696 
697 /**
698  * Move 'instr' to just before 'after'
699  */
static inline void
ir3_instr_move_before(struct ir3_instruction *instr,
                      struct ir3_instruction *after)
{
   /* unlink from the current list, then insert ahead of 'after': */
   list_delinit(&instr->node);
   list_addtail(&instr->node, &after->node);
}
707 
708 /**
709  * Move 'instr' to just after 'before':
710  */
static inline void
ir3_instr_move_after(struct ir3_instruction *instr,
                     struct ir3_instruction *before)
{
   /* unlink from the current list, then insert right after 'before': */
   list_delinit(&instr->node);
   list_add(&instr->node, &before->node);
}
718 
719 /**
720  * Move 'instr' to the beginning of the block:
721  */
static inline void
ir3_instr_move_before_block(struct ir3_instruction *instr,
                            struct ir3_block *block)
{
   /* unlink, then push onto the head of the block's instruction list: */
   list_delinit(&instr->node);
   list_add(&instr->node, &block->instr_list);
}
729 
730 void ir3_find_ssa_uses(struct ir3 *ir, void *mem_ctx, bool falsedeps);
731 
732 void ir3_set_dst_type(struct ir3_instruction *instr, bool half);
733 void ir3_fixup_src_type(struct ir3_instruction *instr);
734 
735 int ir3_flut(struct ir3_register *src_reg);
736 
737 bool ir3_valid_flags(struct ir3_instruction *instr, unsigned n, unsigned flags);
738 
739 bool ir3_valid_immediate(struct ir3_instruction *instr, int32_t immed);
740 
741 #include "util/set.h"
/* Iterates __use over the instructions recorded as users of __instr.
 * Only valid after a pass has populated the uses sets via
 * ir3_find_ssa_uses(); if (__instr)->uses is NULL the loop body is
 * skipped entirely (the outer for scopes __use and guards the walk).
 */
#define foreach_ssa_use(__use, __instr)                                        \
   for (struct ir3_instruction *__use = (void *)~0; __use && (__instr)->uses;  \
        __use = NULL)                                                          \
      set_foreach ((__instr)->uses, __entry)                                   \
         if ((__use = (void *)__entry->key))
747 
748 static inline uint32_t
reg_num(const struct ir3_register * reg)749 reg_num(const struct ir3_register *reg)
750 {
751    return reg->num >> 2;
752 }
753 
754 static inline uint32_t
reg_comp(const struct ir3_register * reg)755 reg_comp(const struct ir3_register *reg)
756 {
757    return reg->num & 0x3;
758 }
759 
760 static inline bool
is_flow(struct ir3_instruction * instr)761 is_flow(struct ir3_instruction *instr)
762 {
763    return (opc_cat(instr->opc) == 0);
764 }
765 
766 static inline bool
is_kill_or_demote(struct ir3_instruction * instr)767 is_kill_or_demote(struct ir3_instruction *instr)
768 {
769    return instr->opc == OPC_KILL || instr->opc == OPC_DEMOTE;
770 }
771 
772 static inline bool
is_nop(struct ir3_instruction * instr)773 is_nop(struct ir3_instruction *instr)
774 {
775    return instr->opc == OPC_NOP;
776 }
777 
778 static inline bool
is_same_type_reg(struct ir3_register * dst,struct ir3_register * src)779 is_same_type_reg(struct ir3_register *dst, struct ir3_register *src)
780 {
781    unsigned dst_type = (dst->flags & IR3_REG_HALF);
782    unsigned src_type = (src->flags & IR3_REG_HALF);
783 
784    /* Treat shared->normal copies as same-type, because they can generally be
785     * folded, but not normal->shared copies.
786     */
787    if (dst_type != src_type ||
788        ((dst->flags & IR3_REG_SHARED) && !(src->flags & IR3_REG_SHARED)))
789       return false;
790    else
791       return true;
792 }
793 
794 /* Is it a non-transformative (ie. not type changing) mov?  This can
795  * also include absneg.s/absneg.f, which for the most part can be
796  * treated as a mov (single src argument).
797  */
798 static inline bool
is_same_type_mov(struct ir3_instruction * instr)799 is_same_type_mov(struct ir3_instruction *instr)
800 {
801    struct ir3_register *dst;
802 
803    switch (instr->opc) {
804    case OPC_MOV:
805       if (instr->cat1.src_type != instr->cat1.dst_type)
806          return false;
807       /* If the type of dest reg and src reg are different,
808        * it shouldn't be considered as same type mov
809        */
810       if (!is_same_type_reg(instr->dsts[0], instr->srcs[0]))
811          return false;
812       break;
813    case OPC_ABSNEG_F:
814    case OPC_ABSNEG_S:
815       if (instr->flags & IR3_INSTR_SAT)
816          return false;
817       /* If the type of dest reg and src reg are different,
818        * it shouldn't be considered as same type mov
819        */
820       if (!is_same_type_reg(instr->dsts[0], instr->srcs[0]))
821          return false;
822       break;
823    case OPC_META_PHI:
824       return instr->srcs_count == 1;
825    default:
826       return false;
827    }
828 
829    dst = instr->dsts[0];
830 
831    /* mov's that write to a0 or p0.x are special: */
832    if (dst->num == regid(REG_P0, 0))
833       return false;
834    if (reg_num(dst) == REG_A0)
835       return false;
836 
837    if (dst->flags & (IR3_REG_RELATIV | IR3_REG_ARRAY))
838       return false;
839 
840    return true;
841 }
842 
843 /* A move from const, which changes size but not type, can also be
844  * folded into dest instruction in some cases.
845  */
846 static inline bool
is_const_mov(struct ir3_instruction * instr)847 is_const_mov(struct ir3_instruction *instr)
848 {
849    if (instr->opc != OPC_MOV)
850       return false;
851 
852    if (!(instr->srcs[0]->flags & IR3_REG_CONST))
853       return false;
854 
855    type_t src_type = instr->cat1.src_type;
856    type_t dst_type = instr->cat1.dst_type;
857 
858    return (type_float(src_type) && type_float(dst_type)) ||
859           (type_uint(src_type) && type_uint(dst_type)) ||
860           (type_sint(src_type) && type_sint(dst_type));
861 }
862 
863 static inline bool
is_alu(struct ir3_instruction * instr)864 is_alu(struct ir3_instruction *instr)
865 {
866    return (1 <= opc_cat(instr->opc)) && (opc_cat(instr->opc) <= 3);
867 }
868 
869 static inline bool
is_sfu(struct ir3_instruction * instr)870 is_sfu(struct ir3_instruction *instr)
871 {
872    return (opc_cat(instr->opc) == 4);
873 }
874 
875 static inline bool
is_tex(struct ir3_instruction * instr)876 is_tex(struct ir3_instruction *instr)
877 {
878    return (opc_cat(instr->opc) == 5);
879 }
880 
881 static inline bool
is_tex_or_prefetch(struct ir3_instruction * instr)882 is_tex_or_prefetch(struct ir3_instruction *instr)
883 {
884    return is_tex(instr) || (instr->opc == OPC_META_TEX_PREFETCH);
885 }
886 
887 static inline bool
is_mem(struct ir3_instruction * instr)888 is_mem(struct ir3_instruction *instr)
889 {
890    return (opc_cat(instr->opc) == 6);
891 }
892 
893 static inline bool
is_barrier(struct ir3_instruction * instr)894 is_barrier(struct ir3_instruction *instr)
895 {
896    return (opc_cat(instr->opc) == 7);
897 }
898 
899 static inline bool
is_half(struct ir3_instruction * instr)900 is_half(struct ir3_instruction *instr)
901 {
902    return !!(instr->dsts[0]->flags & IR3_REG_HALF);
903 }
904 
905 static inline bool
is_shared(struct ir3_instruction * instr)906 is_shared(struct ir3_instruction *instr)
907 {
908    return !!(instr->dsts[0]->flags & IR3_REG_SHARED);
909 }
910 
/* True for instructions which store to memory.  Note that L2G/G2L copy
 * between local and global memory, so they show up in both is_store()
 * and is_load().
 */
static inline bool
is_store(struct ir3_instruction *instr)
{
   /* these instructions, the "destination" register is
    * actually a source, the address to store to.
    */
   switch (instr->opc) {
   case OPC_STG:
   case OPC_STG_A:
   case OPC_STGB:
   case OPC_STIB:
   case OPC_STP:
   case OPC_STL:
   case OPC_STLW:
   case OPC_L2G:
   case OPC_G2L:
      return true;
   default:
      return false;
   }
}
932 
/* True for instructions which load from memory.  L2G appears here as
 * well as in is_store(), since it both reads and writes memory.
 */
static inline bool
is_load(struct ir3_instruction *instr)
{
   switch (instr->opc) {
   case OPC_LDG:
   case OPC_LDG_A:
   case OPC_LDGB:
   case OPC_LDIB:
   case OPC_LDL:
   case OPC_LDP:
   case OPC_L2G:
   case OPC_LDLW:
   case OPC_LDC:
   case OPC_LDLV:
      /* probably some others too.. */
      return true;
   default:
      return false;
   }
}
953 
954 static inline bool
is_input(struct ir3_instruction * instr)955 is_input(struct ir3_instruction *instr)
956 {
957    /* in some cases, ldlv is used to fetch varying without
958     * interpolation.. fortunately inloc is the first src
959     * register in either case
960     */
961    switch (instr->opc) {
962    case OPC_LDLV:
963    case OPC_BARY_F:
964       return true;
965    default:
966       return false;
967    }
968 }
969 
970 static inline bool
is_bool(struct ir3_instruction * instr)971 is_bool(struct ir3_instruction *instr)
972 {
973    switch (instr->opc) {
974    case OPC_CMPS_F:
975    case OPC_CMPS_S:
976    case OPC_CMPS_U:
977       return true;
978    default:
979       return false;
980    }
981 }
982 
/* Map a full-precision cat3 opcode to its half-precision variant.
 * Opcodes with no half variant map to themselves.
 */
static inline opc_t
cat3_half_opc(opc_t opc)
{
   switch (opc) {
   case OPC_MAD_F32:
      return OPC_MAD_F16;
   case OPC_SEL_B32:
      return OPC_SEL_B16;
   case OPC_SEL_S32:
      return OPC_SEL_S16;
   case OPC_SEL_F32:
      return OPC_SEL_F16;
   case OPC_SAD_S32:
      return OPC_SAD_S16;
   default:
      return opc;
   }
}
1001 
/* Inverse of cat3_half_opc(): map a half-precision cat3 opcode to its
 * full-precision variant (identity for everything else).
 */
static inline opc_t
cat3_full_opc(opc_t opc)
{
   switch (opc) {
   case OPC_MAD_F16:
      return OPC_MAD_F32;
   case OPC_SEL_B16:
      return OPC_SEL_B32;
   case OPC_SEL_S16:
      return OPC_SEL_S32;
   case OPC_SEL_F16:
      return OPC_SEL_F32;
   case OPC_SAD_S16:
      return OPC_SAD_S32;
   default:
      return opc;
   }
}
1020 
/* Map a full-precision cat4 opcode to its half-precision variant
 * (identity for opcodes with no half form).
 */
static inline opc_t
cat4_half_opc(opc_t opc)
{
   switch (opc) {
   case OPC_RSQ:
      return OPC_HRSQ;
   case OPC_LOG2:
      return OPC_HLOG2;
   case OPC_EXP2:
      return OPC_HEXP2;
   default:
      return opc;
   }
}
1035 
/* Inverse of cat4_half_opc(). */
static inline opc_t
cat4_full_opc(opc_t opc)
{
   switch (opc) {
   case OPC_HRSQ:
      return OPC_RSQ;
   case OPC_HLOG2:
      return OPC_LOG2;
   case OPC_HEXP2:
      return OPC_EXP2;
   default:
      return opc;
   }
}
1050 
1051 static inline bool
is_meta(struct ir3_instruction * instr)1052 is_meta(struct ir3_instruction *instr)
1053 {
1054    return (opc_cat(instr->opc) == -1);
1055 }
1056 
1057 static inline unsigned
reg_elems(const struct ir3_register * reg)1058 reg_elems(const struct ir3_register *reg)
1059 {
1060    if (reg->flags & IR3_REG_ARRAY)
1061       return reg->size;
1062    else
1063       return util_last_bit(reg->wrmask);
1064 }
1065 
1066 static inline unsigned
reg_elem_size(const struct ir3_register * reg)1067 reg_elem_size(const struct ir3_register *reg)
1068 {
1069    return (reg->flags & IR3_REG_HALF) ? 1 : 2;
1070 }
1071 
/* Total register footprint, in half-reg units. */
static inline unsigned
reg_size(const struct ir3_register *reg)
{
   const unsigned elems = reg_elems(reg);
   return elems * reg_elem_size(reg);
}
1077 
1078 static inline unsigned
dest_regs(struct ir3_instruction * instr)1079 dest_regs(struct ir3_instruction *instr)
1080 {
1081    if (instr->dsts_count == 0)
1082       return 0;
1083 
1084    debug_assert(instr->dsts_count == 1);
1085    return util_last_bit(instr->dsts[0]->wrmask);
1086 }
1087 
1088 /* is dst a normal temp register: */
1089 static inline bool
is_dest_gpr(struct ir3_register * dst)1090 is_dest_gpr(struct ir3_register *dst)
1091 {
1092    if (dst->wrmask == 0)
1093       return false;
1094    if ((reg_num(dst) == REG_A0) || (dst->num == regid(REG_P0, 0)))
1095       return false;
1096    return true;
1097 }
1098 
1099 static inline bool
writes_gpr(struct ir3_instruction * instr)1100 writes_gpr(struct ir3_instruction *instr)
1101 {
1102    if (dest_regs(instr) == 0)
1103       return false;
1104    return is_dest_gpr(instr->dsts[0]);
1105 }
1106 
1107 static inline bool
writes_addr0(struct ir3_instruction * instr)1108 writes_addr0(struct ir3_instruction *instr)
1109 {
1110    /* Note: only the first dest can write to a0.x */
1111    if (instr->dsts_count > 0) {
1112       struct ir3_register *dst = instr->dsts[0];
1113       return dst->num == regid(REG_A0, 0);
1114    }
1115    return false;
1116 }
1117 
1118 static inline bool
writes_addr1(struct ir3_instruction * instr)1119 writes_addr1(struct ir3_instruction *instr)
1120 {
1121    /* Note: only the first dest can write to a1.x */
1122    if (instr->dsts_count > 0) {
1123       struct ir3_register *dst = instr->dsts[0];
1124       return dst->num == regid(REG_A0, 1);
1125    }
1126    return false;
1127 }
1128 
1129 static inline bool
writes_pred(struct ir3_instruction * instr)1130 writes_pred(struct ir3_instruction *instr)
1131 {
1132    /* Note: only the first dest can write to p0.x */
1133    if (instr->dsts_count > 0) {
1134       struct ir3_register *dst = instr->dsts[0];
1135       return reg_num(dst) == REG_P0;
1136    }
1137    return false;
1138 }
1139 
1140 /* Is it something other than a normal register. Shared regs, p0, and a0/a1
1141  * are considered special here. Special registers are always accessed with one
1142  * size and never alias normal registers, even though a naive calculation
1143  * would sometimes make it seem like e.g. r30.z aliases a0.x.
1144  */
1145 static inline bool
is_reg_special(const struct ir3_register * reg)1146 is_reg_special(const struct ir3_register *reg)
1147 {
1148    return (reg->flags & IR3_REG_SHARED) || (reg_num(reg) == REG_A0) ||
1149           (reg_num(reg) == REG_P0);
1150 }
1151 
/* Same as above but in cases where we don't have a register. r48.x and above
 * are shared/special.
 */
static inline bool
is_reg_num_special(unsigned num)
{
   const unsigned first_special = 48 * 4; /* scalar index of r48.x */
   return num >= first_special;
}
1160 
1161 /* returns defining instruction for reg */
1162 /* TODO better name */
1163 static inline struct ir3_instruction *
ssa(struct ir3_register * reg)1164 ssa(struct ir3_register *reg)
1165 {
1166    if ((reg->flags & (IR3_REG_SSA | IR3_REG_ARRAY)) && reg->def)
1167       return reg->def->instr;
1168    return NULL;
1169 }
1170 
1171 static inline bool
conflicts(struct ir3_register * a,struct ir3_register * b)1172 conflicts(struct ir3_register *a, struct ir3_register *b)
1173 {
1174    return (a && b) && (a->def != b->def);
1175 }
1176 
1177 static inline bool
reg_gpr(struct ir3_register * r)1178 reg_gpr(struct ir3_register *r)
1179 {
1180    if (r->flags & (IR3_REG_CONST | IR3_REG_IMMED))
1181       return false;
1182    if ((reg_num(r) == REG_A0) || (reg_num(r) == REG_P0))
1183       return false;
1184    return true;
1185 }
1186 
/* Narrow a 32-bit type to its 16-bit counterpart; identity for types
 * that are already 16-bit.  Asserts on anything else.
 */
static inline type_t
half_type(type_t type)
{
   switch (type) {
   case TYPE_F32:
      return TYPE_F16;
   case TYPE_U32:
      return TYPE_U16;
   case TYPE_S32:
      return TYPE_S16;
   case TYPE_F16:
   case TYPE_U16:
   case TYPE_S16:
      /* already half: */
      return type;
   default:
      assert(0);
      return ~0;
   }
}
1206 
/* Widen a 16-bit type to its 32-bit counterpart; identity for types
 * that are already 32-bit.  Asserts on anything else.
 */
static inline type_t
full_type(type_t type)
{
   switch (type) {
   case TYPE_F16:
      return TYPE_F32;
   case TYPE_U16:
      return TYPE_U32;
   case TYPE_S16:
      return TYPE_S32;
   case TYPE_F32:
   case TYPE_U32:
   case TYPE_S32:
      /* already full: */
      return type;
   default:
      assert(0);
      return ~0;
   }
}
1226 
/* some cat2 instructions (ie. those which are not float) can embed an
 * immediate:
 */
static inline bool
ir3_cat2_int(opc_t opc)
{
   switch (opc) {
   /* integer arithmetic/compare: */
   case OPC_ADD_U:
   case OPC_ADD_S:
   case OPC_SUB_U:
   case OPC_SUB_S:
   case OPC_CMPS_U:
   case OPC_CMPS_S:
   case OPC_MIN_U:
   case OPC_MIN_S:
   case OPC_MAX_U:
   case OPC_MAX_S:
   case OPC_CMPV_U:
   case OPC_CMPV_S:
   case OPC_MUL_U24:
   case OPC_MUL_S24:
   case OPC_MULL_U:
   case OPC_CLZ_S:
   case OPC_ABSNEG_S:
   /* bitwise: */
   case OPC_AND_B:
   case OPC_OR_B:
   case OPC_NOT_B:
   case OPC_XOR_B:
   case OPC_BFREV_B:
   case OPC_CLZ_B:
   case OPC_SHL_B:
   case OPC_SHR_B:
   case OPC_ASHR_B:
   case OPC_MGEN_B:
   case OPC_GETBIT_B:
   case OPC_CBITS_B:
   case OPC_BARY_F:
      return true;

   default:
      return false;
   }
}
1270 
/* map cat2 instruction to valid abs/neg flags: */
static inline unsigned
ir3_cat2_absneg(opc_t opc)
{
   switch (opc) {
   /* float ops accept float abs/neg src modifiers: */
   case OPC_ADD_F:
   case OPC_MIN_F:
   case OPC_MAX_F:
   case OPC_MUL_F:
   case OPC_SIGN_F:
   case OPC_CMPS_F:
   case OPC_ABSNEG_F:
   case OPC_CMPV_F:
   case OPC_FLOOR_F:
   case OPC_CEIL_F:
   case OPC_RNDNE_F:
   case OPC_RNDAZ_F:
   case OPC_TRUNC_F:
   case OPC_BARY_F:
      return IR3_REG_FABS | IR3_REG_FNEG;

   /* integer ops take no src modifiers: */
   case OPC_ADD_U:
   case OPC_ADD_S:
   case OPC_SUB_U:
   case OPC_SUB_S:
   case OPC_CMPS_U:
   case OPC_CMPS_S:
   case OPC_MIN_U:
   case OPC_MIN_S:
   case OPC_MAX_U:
   case OPC_MAX_S:
   case OPC_CMPV_U:
   case OPC_CMPV_S:
   case OPC_MUL_U24:
   case OPC_MUL_S24:
   case OPC_MULL_U:
   case OPC_CLZ_S:
      return 0;

   case OPC_ABSNEG_S:
      return IR3_REG_SABS | IR3_REG_SNEG;

   /* bitwise ops accept bitwise-not src modifier: */
   case OPC_AND_B:
   case OPC_OR_B:
   case OPC_NOT_B:
   case OPC_XOR_B:
   case OPC_BFREV_B:
   case OPC_CLZ_B:
   case OPC_SHL_B:
   case OPC_SHR_B:
   case OPC_ASHR_B:
   case OPC_MGEN_B:
   case OPC_GETBIT_B:
   case OPC_CBITS_B:
      return IR3_REG_BNOT;

   default:
      return 0;
   }
}
1331 
/* map cat3 instructions to valid abs/neg flags: */
static inline unsigned
ir3_cat3_absneg(opc_t opc)
{
   switch (opc) {
   case OPC_MAD_F16:
   case OPC_MAD_F32:
   case OPC_SEL_F16:
   case OPC_SEL_F32:
      return IR3_REG_FNEG;

   case OPC_MAD_U16:
   case OPC_MADSH_U16:
   case OPC_MAD_S16:
   case OPC_MADSH_M16:
   case OPC_MAD_U24:
   case OPC_MAD_S24:
   case OPC_SEL_S16:
   case OPC_SEL_S32:
   case OPC_SAD_S16:
   case OPC_SAD_S32:
      /* neg *may* work on 3rd src.. */
      /* fallthrough */

   case OPC_SEL_B16:
   case OPC_SEL_B32:
      /* fallthrough */

   case OPC_SHLG_B16:
      /* fallthrough -- none of these support src modifiers: */

   default:
      return 0;
   }
}
1364 
/* Return the type (float, int, or uint) the op uses when converting from the
 * internal result of the op (which is assumed to be the same size as the
 * sources) to the destination when they are not the same size. If F32 it does
 * a floating-point conversion, if U32 it does a truncation/zero-extension, if
 * S32 it does a truncation/sign-extension. "can_fold" will be false if it
 * doesn't do anything sensible or is unknown.
 */
static inline type_t
ir3_output_conv_type(struct ir3_instruction *instr, bool *can_fold)
{
   *can_fold = true;
   switch (instr->opc) {
   case OPC_ADD_F:
   case OPC_MUL_F:
   case OPC_BARY_F:
   case OPC_MAD_F32:
   case OPC_MAD_F16:
      return TYPE_F32;

   case OPC_ADD_U:
   case OPC_SUB_U:
   case OPC_MIN_U:
   case OPC_MAX_U:
   case OPC_AND_B:
   case OPC_OR_B:
   case OPC_NOT_B:
   case OPC_XOR_B:
   case OPC_MUL_U24:
   case OPC_MULL_U:
   case OPC_SHL_B:
   case OPC_SHR_B:
   case OPC_ASHR_B:
   case OPC_MAD_U24:
   /* Comparison ops zero-extend/truncate their results, so consider them as
    * unsigned here.
    */
   case OPC_CMPS_F:
   case OPC_CMPV_F:
   case OPC_CMPS_U:
   case OPC_CMPS_S:
      return TYPE_U32;

   case OPC_ADD_S:
   case OPC_SUB_S:
   case OPC_MIN_S:
   case OPC_MAX_S:
   case OPC_ABSNEG_S:
   case OPC_MUL_S24:
   case OPC_MAD_S24:
      return TYPE_S32;

   /* We assume that any move->move folding that could be done was done by
    * NIR.
    */
   case OPC_MOV:
   /* any opcode not listed above is unknown -- don't fold: */
   default:
      *can_fold = false;
      return TYPE_U32;
   }
}
1425 
/* Return the src and dst types for the conversion which is already folded
 * into the op. We can assume that instr has folded in a conversion from
 * ir3_output_conv_src_type() to ir3_output_conv_dst_type(). Only makes sense
 * to call if ir3_output_conv_type() returns can_fold = true.
 */
static inline type_t
ir3_output_conv_src_type(struct ir3_instruction *instr, type_t base_type)
{
   switch (instr->opc) {
   case OPC_CMPS_F:
   case OPC_CMPV_F:
   case OPC_CMPS_U:
   case OPC_CMPS_S:
      /* Comparisons only return 0/1 and the size of the comparison sources
       * is irrelevant, never consider them as having an output conversion
       * by returning a type with the dest size here:
       */
      return (instr->dsts[0]->flags & IR3_REG_HALF) ? half_type(base_type)
                                                    : full_type(base_type);

   case OPC_BARY_F:
      /* bary.f doesn't have an explicit source, but we can assume here that
       * the varying data it reads is in fp32.
       *
       * This may be fp16 on older gen's depending on some register
       * settings, but it's probably not worth plumbing that through for a
       * small improvement that NIR would hopefully handle for us anyway.
       */
      return TYPE_F32;

   default:
      /* normal case: the size of src[0] determines the conversion src type: */
      return (instr->srcs[0]->flags & IR3_REG_HALF) ? half_type(base_type)
                                                    : full_type(base_type);
   }
}
1461 
1462 static inline type_t
ir3_output_conv_dst_type(struct ir3_instruction * instr,type_t base_type)1463 ir3_output_conv_dst_type(struct ir3_instruction *instr, type_t base_type)
1464 {
1465    return (instr->dsts[0]->flags & IR3_REG_HALF) ? half_type(base_type)
1466                                                  : full_type(base_type);
1467 }
1468 
/* Some instructions have signed/unsigned variants which are identical except
 * for whether the folded conversion sign-extends or zero-extends, and we can
 * fold in a mismatching move by rewriting the opcode. Return the opcode to
 * switch signedness, and whether one exists.
 *
 * NOTE: *can_swap is only ever cleared here (on the default path); the
 * caller is expected to initialize it to true.
 */
static inline opc_t
ir3_try_swap_signedness(opc_t opc, bool *can_swap)
{
   switch (opc) {
#define PAIR(u, s)                                                             \
   case OPC_##u:                                                               \
      return OPC_##s;                                                          \
   case OPC_##s:                                                               \
      return OPC_##u;
      PAIR(ADD_U, ADD_S)
      PAIR(SUB_U, SUB_S)
      /* Note: these are only identical when the sources are half, but that's
       * the only case we call this function for anyway.
       */
      PAIR(MUL_U24, MUL_S24)

   default:
      *can_swap = false;
      return opc;
   }
}
1495 
/* bitmask with the low n bits set: */
#define MASK(n) ((1 << (n)) - 1)

/* iterator for an instruction's sources (reg), also returns src #.
 *
 * The single-iteration outer for-loop just scopes __srcreg: it starts at
 * a non-NULL sentinel so the body runs once, then is cleared.
 */
#define foreach_src_n(__srcreg, __n, __instr)                                  \
   if ((__instr)->srcs_count)                                                  \
      for (struct ir3_register *__srcreg = (void *)~0; __srcreg;               \
           __srcreg = NULL)                                                    \
         for (unsigned __cnt = (__instr)->srcs_count, __n = 0; __n < __cnt;    \
              __n++)                                                           \
            if ((__srcreg = (__instr)->srcs[__n]))

/* iterator for an instruction's sources (reg): */
#define foreach_src(__srcreg, __instr) foreach_src_n (__srcreg, __i, __instr)

/* iterator for an instruction's destinations (reg), also returns dst #: */
#define foreach_dst_n(__dstreg, __n, __instr)                                  \
   if ((__instr)->dsts_count)                                                  \
      for (struct ir3_register *__dstreg = (void *)~0; __dstreg;               \
           __dstreg = NULL)                                                    \
         for (unsigned __cnt = (__instr)->dsts_count, __n = 0; __n < __cnt;    \
              __n++)                                                           \
            if ((__dstreg = (__instr)->dsts[__n]))

/* iterator for an instruction's destinations (reg): */
#define foreach_dst(__dstreg, __instr) foreach_dst_n (__dstreg, __i, __instr)
1521 
1522 static inline unsigned
__ssa_src_cnt(struct ir3_instruction * instr)1523 __ssa_src_cnt(struct ir3_instruction *instr)
1524 {
1525    return instr->srcs_count + instr->deps_count;
1526 }
1527 
1528 static inline bool
__is_false_dep(struct ir3_instruction * instr,unsigned n)1529 __is_false_dep(struct ir3_instruction *instr, unsigned n)
1530 {
1531    if (n >= instr->srcs_count)
1532       return true;
1533    return false;
1534 }
1535 
1536 static inline struct ir3_instruction **
__ssa_srcp_n(struct ir3_instruction * instr,unsigned n)1537 __ssa_srcp_n(struct ir3_instruction *instr, unsigned n)
1538 {
1539    if (__is_false_dep(instr, n))
1540       return &instr->deps[n - instr->srcs_count];
1541    if (ssa(instr->srcs[n]))
1542       return &instr->srcs[n]->def->instr;
1543    return NULL;
1544 }
1545 
/* iterator over pointers to an instruction's SSA sources (including false
 * deps -- see __ssa_srcp_n()), also returns src #:
 */
#define foreach_ssa_srcp_n(__srcp, __n, __instr)                               \
   for (struct ir3_instruction **__srcp = (void *)~0; __srcp; __srcp = NULL)   \
      for (unsigned __cnt = __ssa_src_cnt(__instr), __n = 0; __n < __cnt;      \
           __n++)                                                              \
         if ((__srcp = __ssa_srcp_n(__instr, __n)))

#define foreach_ssa_srcp(__srcp, __instr)                                      \
   foreach_ssa_srcp_n (__srcp, __i, __instr)

/* iterator for an instruction's SSA sources (instr), also returns src #: */
#define foreach_ssa_src_n(__srcinst, __n, __instr)                             \
   for (struct ir3_instruction *__srcinst = (void *)~0; __srcinst;             \
        __srcinst = NULL)                                                      \
      foreach_ssa_srcp_n (__srcp, __n, __instr)                                \
         if ((__srcinst = *__srcp))

/* iterator for an instruction's SSA sources (instr): */
#define foreach_ssa_src(__srcinst, __instr)                                    \
   foreach_ssa_src_n (__srcinst, __i, __instr)

/* iterators for shader inputs: */
#define foreach_input_n(__ininstr, __cnt, __ir)                                \
   for (struct ir3_instruction *__ininstr = (void *)~0; __ininstr;             \
        __ininstr = NULL)                                                      \
      for (unsigned __cnt = 0; __cnt < (__ir)->inputs_count; __cnt++)          \
         if ((__ininstr = (__ir)->inputs[__cnt]))
#define foreach_input(__ininstr, __ir) foreach_input_n (__ininstr, __i, __ir)

/* iterators for instructions (in a block's instr_list): */
#define foreach_instr(__instr, __list)                                         \
   list_for_each_entry (struct ir3_instruction, __instr, __list, node)
#define foreach_instr_rev(__instr, __list)                                     \
   list_for_each_entry_rev (struct ir3_instruction, __instr, __list, node)
#define foreach_instr_safe(__instr, __list)                                    \
   list_for_each_entry_safe (struct ir3_instruction, __instr, __list, node)
#define foreach_instr_from_safe(__instr, __start, __list)                      \
   list_for_each_entry_from_safe(struct ir3_instruction, __instr, __start,     \
                                 __list, node)

/* iterators for blocks: */
#define foreach_block(__block, __list)                                         \
   list_for_each_entry (struct ir3_block, __block, __list, node)
#define foreach_block_safe(__block, __list)                                    \
   list_for_each_entry_safe (struct ir3_block, __block, __list, node)
#define foreach_block_rev(__block, __list)                                     \
   list_for_each_entry_rev (struct ir3_block, __block, __list, node)

/* iterators for arrays: */
#define foreach_array(__array, __list)                                         \
   list_for_each_entry (struct ir3_array, __array, __list, node)
#define foreach_array_safe(__array, __list)                                    \
   list_for_each_entry_safe (struct ir3_array, __array, __list, node)
1598 
/* Run an ir3 pass; if it made progress, dump the resulting ir (debug) and
 * re-validate it.  Evaluates to whether the pass made progress.
 */
#define IR3_PASS(ir, pass, ...)                                                \
   ({                                                                          \
      bool progress = pass(ir, ##__VA_ARGS__);                                 \
      if (progress) {                                                          \
         ir3_debug_print(ir, "AFTER: " #pass);                                 \
         ir3_validate(ir);                                                     \
      }                                                                        \
      progress;                                                                \
   })
1608 
1609 /* validate: */
1610 void ir3_validate(struct ir3 *ir);
1611 
1612 /* dump: */
1613 void ir3_print(struct ir3 *ir);
1614 void ir3_print_instr(struct ir3_instruction *instr);
1615 
1616 struct log_stream;
1617 void ir3_print_instr_stream(struct log_stream *stream, struct ir3_instruction *instr);
1618 
1619 /* delay calculation: */
1620 int ir3_delayslots(struct ir3_instruction *assigner,
1621                    struct ir3_instruction *consumer, unsigned n, bool soft);
1622 unsigned ir3_delay_calc_prera(struct ir3_block *block,
1623                               struct ir3_instruction *instr);
1624 unsigned ir3_delay_calc_postra(struct ir3_block *block,
1625                                struct ir3_instruction *instr, bool soft,
1626                                bool mergedregs);
1627 unsigned ir3_delay_calc_exact(struct ir3_block *block,
1628                               struct ir3_instruction *instr, bool mergedregs);
1629 void ir3_remove_nops(struct ir3 *ir);
1630 
1631 /* unreachable block elimination: */
1632 bool ir3_remove_unreachable(struct ir3 *ir);
1633 
1634 /* dead code elimination: */
1635 struct ir3_shader_variant;
1636 bool ir3_dce(struct ir3 *ir, struct ir3_shader_variant *so);
1637 
1638 /* fp16 conversion folding */
1639 bool ir3_cf(struct ir3 *ir);
1640 
1641 /* copy-propagate: */
1642 bool ir3_cp(struct ir3 *ir, struct ir3_shader_variant *so);
1643 bool ir3_cp_postsched(struct ir3 *ir);
1644 
1645 /* common subexpression elimination: */
1646 bool ir3_cse(struct ir3 *ir);
1647 
1648 /* Make arrays SSA */
1649 bool ir3_array_to_ssa(struct ir3 *ir);
1650 
1651 /* scheduling: */
1652 bool ir3_sched_add_deps(struct ir3 *ir);
1653 int ir3_sched(struct ir3 *ir);
1654 
1655 struct ir3_context;
1656 bool ir3_postsched(struct ir3 *ir, struct ir3_shader_variant *v);
1657 
1658 /* register assignment: */
1659 int ir3_ra(struct ir3_shader_variant *v);
1660 
1661 /* lower subgroup ops: */
1662 bool ir3_lower_subgroups(struct ir3 *ir);
1663 
1664 /* legalize: */
1665 bool ir3_legalize(struct ir3 *ir, struct ir3_shader_variant *so, int *max_bary);
1666 
/* Does the shader contain instructions with enough latency that it is
 * worth scheduling/other passes trying to hide it?
 */
static inline bool
ir3_has_latency_to_hide(struct ir3 *ir)
{
   /* VS/GS/TCS/TESS  co-exist with frag shader invocations, but we don't
    * know the nature of the fragment shader.  Just assume it will have
    * latency to hide:
    */
   if (ir->type != MESA_SHADER_FRAGMENT)
      return true;

   foreach_block (block, &ir->block_list) {
      foreach_instr (instr, &block->instr_list) {
         if (is_tex_or_prefetch(instr))
            return true;

         if (is_load(instr)) {
            switch (instr->opc) {
            /* NOTE(review): local/varying loads are excluded here,
             * presumably because they are low-latency -- confirm:
             */
            case OPC_LDLV:
            case OPC_LDL:
            case OPC_LDLW:
               break;
            default:
               return true;
            }
         }
      }
   }

   return false;
}
1697 
1698 /* ************************************************************************* */
1699 /* instruction helpers */
1700 
1701 /* creates SSA src of correct type (ie. half vs full precision) */
1702 static inline struct ir3_register *
__ssa_src(struct ir3_instruction * instr,struct ir3_instruction * src,unsigned flags)1703 __ssa_src(struct ir3_instruction *instr, struct ir3_instruction *src,
1704           unsigned flags)
1705 {
1706    struct ir3_register *reg;
1707    if (src->dsts[0]->flags & IR3_REG_HALF)
1708       flags |= IR3_REG_HALF;
1709    reg = ir3_src_create(instr, INVALID_REG, IR3_REG_SSA | flags);
1710    reg->def = src->dsts[0];
1711    reg->wrmask = src->dsts[0]->wrmask;
1712    return reg;
1713 }
1714 
1715 static inline struct ir3_register *
__ssa_dst(struct ir3_instruction * instr)1716 __ssa_dst(struct ir3_instruction *instr)
1717 {
1718    struct ir3_register *reg = ir3_dst_create(instr, INVALID_REG, IR3_REG_SSA);
1719    reg->instr = instr;
1720    return reg;
1721 }
1722 
1723 static inline struct ir3_instruction *
create_immed_typed(struct ir3_block * block,uint32_t val,type_t type)1724 create_immed_typed(struct ir3_block *block, uint32_t val, type_t type)
1725 {
1726    struct ir3_instruction *mov;
1727    unsigned flags = (type_size(type) < 32) ? IR3_REG_HALF : 0;
1728 
1729    mov = ir3_instr_create(block, OPC_MOV, 1, 1);
1730    mov->cat1.src_type = type;
1731    mov->cat1.dst_type = type;
1732    __ssa_dst(mov)->flags |= flags;
1733    ir3_src_create(mov, 0, IR3_REG_IMMED | flags)->uim_val = val;
1734 
1735    return mov;
1736 }
1737 
/* Convenience wrapper: create a 32-bit unsigned immediate mov. */
static inline struct ir3_instruction *
create_immed(struct ir3_block *block, uint32_t val)
{
   return create_immed_typed(block, val, TYPE_U32);
}
1743 
1744 static inline struct ir3_instruction *
create_uniform_typed(struct ir3_block * block,unsigned n,type_t type)1745 create_uniform_typed(struct ir3_block *block, unsigned n, type_t type)
1746 {
1747    struct ir3_instruction *mov;
1748    unsigned flags = (type_size(type) < 32) ? IR3_REG_HALF : 0;
1749 
1750    mov = ir3_instr_create(block, OPC_MOV, 1, 1);
1751    mov->cat1.src_type = type;
1752    mov->cat1.dst_type = type;
1753    __ssa_dst(mov)->flags |= flags;
1754    ir3_src_create(mov, n, IR3_REG_CONST | flags);
1755 
1756    return mov;
1757 }
1758 
/* Convenience wrapper: create an f32 uniform load. */
static inline struct ir3_instruction *
create_uniform(struct ir3_block *block, unsigned n)
{
   return create_uniform_typed(block, n, TYPE_F32);
}
1764 
1765 static inline struct ir3_instruction *
create_uniform_indirect(struct ir3_block * block,int n,type_t type,struct ir3_instruction * address)1766 create_uniform_indirect(struct ir3_block *block, int n, type_t type,
1767                         struct ir3_instruction *address)
1768 {
1769    struct ir3_instruction *mov;
1770 
1771    mov = ir3_instr_create(block, OPC_MOV, 1, 1);
1772    mov->cat1.src_type = type;
1773    mov->cat1.dst_type = type;
1774    __ssa_dst(mov);
1775    ir3_src_create(mov, 0, IR3_REG_CONST | IR3_REG_RELATIV)->array.offset = n;
1776 
1777    ir3_instr_set_address(mov, address);
1778 
1779    return mov;
1780 }
1781 
/* Create a same-type mov of src (handles array vs plain SSA sources;
 * relative sources are not supported).
 */
static inline struct ir3_instruction *
ir3_MOV(struct ir3_block *block, struct ir3_instruction *src, type_t type)
{
   struct ir3_instruction *instr = ir3_instr_create(block, OPC_MOV, 1, 1);
   unsigned flags = (type_size(type) < 32) ? IR3_REG_HALF : 0;

   __ssa_dst(instr)->flags |= flags;
   if (src->dsts[0]->flags & IR3_REG_ARRAY) {
      /* array srcs carry over the array id/offset from the def: */
      struct ir3_register *src_reg = __ssa_src(instr, src, IR3_REG_ARRAY);
      src_reg->array = src->dsts[0]->array;
   } else {
      /* propagate sharedness of the src: */
      __ssa_src(instr, src, src->dsts[0]->flags & IR3_REG_SHARED);
   }
   debug_assert(!(src->dsts[0]->flags & IR3_REG_RELATIV));
   instr->cat1.src_type = type;
   instr->cat1.dst_type = type;
   return instr;
}
1800 
/* Create a type-converting mov (cov) from src_type to dst_type.
 * Array sources are not supported; src must already match src_type's
 * precision.
 */
static inline struct ir3_instruction *
ir3_COV(struct ir3_block *block, struct ir3_instruction *src, type_t src_type,
        type_t dst_type)
{
   struct ir3_instruction *instr = ir3_instr_create(block, OPC_MOV, 1, 1);
   unsigned dst_flags = (type_size(dst_type) < 32) ? IR3_REG_HALF : 0;
   unsigned src_flags = (type_size(src_type) < 32) ? IR3_REG_HALF : 0;

   debug_assert((src->dsts[0]->flags & IR3_REG_HALF) == src_flags);

   __ssa_dst(instr)->flags |= dst_flags;
   __ssa_src(instr, src, 0);
   instr->cat1.src_type = src_type;
   instr->cat1.dst_type = dst_type;
   debug_assert(!(src->dsts[0]->flags & IR3_REG_ARRAY));
   return instr;
}
1818 
1819 static inline struct ir3_instruction *
ir3_MOVMSK(struct ir3_block * block,unsigned components)1820 ir3_MOVMSK(struct ir3_block *block, unsigned components)
1821 {
1822    struct ir3_instruction *instr = ir3_instr_create(block, OPC_MOVMSK, 1, 0);
1823 
1824    struct ir3_register *dst = __ssa_dst(instr);
1825    dst->flags |= IR3_REG_SHARED;
1826    dst->wrmask = (1 << components) - 1;
1827    instr->repeat = components - 1;
1828    return instr;
1829 }
1830 
1831 static inline struct ir3_instruction *
ir3_BALLOT_MACRO(struct ir3_block * block,struct ir3_instruction * src,unsigned components)1832 ir3_BALLOT_MACRO(struct ir3_block *block, struct ir3_instruction *src,
1833                  unsigned components)
1834 {
1835    struct ir3_instruction *instr =
1836       ir3_instr_create(block, OPC_BALLOT_MACRO, 1, 1);
1837 
1838    struct ir3_register *dst = __ssa_dst(instr);
1839    dst->flags |= IR3_REG_SHARED;
1840    dst->wrmask = (1 << components) - 1;
1841 
1842    __ssa_src(instr, src, 0);
1843 
1844    return instr;
1845 }
1846 
1847 static inline struct ir3_instruction *
ir3_NOP(struct ir3_block * block)1848 ir3_NOP(struct ir3_block *block)
1849 {
1850    return ir3_instr_create(block, OPC_NOP, 0, 0);
1851 }
1852 
#define IR3_INSTR_0 0

/* Generator for zero-src builders: defines ir3_<name>() returning a new
 * instruction with one ssa dst and no srcs, with 'flag' baked into
 * instr->flags.
 */
/* clang-format off */
#define __INSTR0(flag, name, opc)                                              \
static inline struct ir3_instruction *ir3_##name(struct ir3_block *block)      \
{                                                                              \
   struct ir3_instruction *instr = ir3_instr_create(block, opc, 1, 0);         \
   instr->flags |= flag;                                                       \
   return instr;                                                               \
}
/* clang-format on */
/* INSTR0F: flagged variant (name gets an _<f> suffix); INSTR0: plain */
#define INSTR0F(f, name) __INSTR0(IR3_INSTR_##f, name##_##f, OPC_##name)
#define INSTR0(name)     __INSTR0(0, name, OPC_##name)
1866 
/* Generator for one-src builders; dst_count is 0 or 1 (INSTR1NODST for
 * side-effect-only instructions with no dst).
 */
/* clang-format off */
#define __INSTR1(flag, dst_count, name, opc)                                   \
static inline struct ir3_instruction *ir3_##name(                              \
   struct ir3_block *block, struct ir3_instruction *a, unsigned aflags)        \
{                                                                              \
   struct ir3_instruction *instr =                                             \
      ir3_instr_create(block, opc, dst_count, 1);                              \
   for (unsigned i = 0; i < dst_count; i++)                                    \
      __ssa_dst(instr);                                                        \
   __ssa_src(instr, a, aflags);                                                \
   instr->flags |= flag;                                                       \
   return instr;                                                               \
}
/* clang-format on */
#define INSTR1F(f, name)  __INSTR1(IR3_INSTR_##f, 1, name##_##f, OPC_##name)
#define INSTR1(name)      __INSTR1(0, 1, name, OPC_##name)
#define INSTR1NODST(name) __INSTR1(0, 0, name, OPC_##name)
1884 
/* Generator for two-src builders (always one dst). */
/* clang-format off */
#define __INSTR2(flag, name, opc)                                              \
static inline struct ir3_instruction *ir3_##name(                              \
   struct ir3_block *block, struct ir3_instruction *a, unsigned aflags,        \
   struct ir3_instruction *b, unsigned bflags)                                 \
{                                                                              \
   struct ir3_instruction *instr = ir3_instr_create(block, opc, 1, 2);         \
   __ssa_dst(instr);                                                           \
   __ssa_src(instr, a, aflags);                                                \
   __ssa_src(instr, b, bflags);                                                \
   instr->flags |= flag;                                                       \
   return instr;                                                               \
}
/* clang-format on */
#define INSTR2F(f, name) __INSTR2(IR3_INSTR_##f, name##_##f, OPC_##name)
#define INSTR2(name)     __INSTR2(0, name, OPC_##name)
1901 
/* Generator for three-src builders; dst_count is 0 or 1. */
/* clang-format off */
#define __INSTR3(flag, dst_count, name, opc)                                   \
static inline struct ir3_instruction *ir3_##name(                              \
   struct ir3_block *block, struct ir3_instruction *a, unsigned aflags,        \
   struct ir3_instruction *b, unsigned bflags, struct ir3_instruction *c,      \
   unsigned cflags)                                                            \
{                                                                              \
   struct ir3_instruction *instr =                                             \
      ir3_instr_create(block, opc, dst_count, 3);                              \
   for (unsigned i = 0; i < dst_count; i++)                                    \
      __ssa_dst(instr);                                                        \
   __ssa_src(instr, a, aflags);                                                \
   __ssa_src(instr, b, bflags);                                                \
   __ssa_src(instr, c, cflags);                                                \
   instr->flags |= flag;                                                       \
   return instr;                                                               \
}
/* clang-format on */
#define INSTR3F(f, name)  __INSTR3(IR3_INSTR_##f, 1, name##_##f, OPC_##name)
#define INSTR3(name)      __INSTR3(0, 1, name, OPC_##name)
#define INSTR3NODST(name) __INSTR3(0, 0, name, OPC_##name)
1923 
/* Generator for four-src builders; dst_count is 0 or 1. */
/* clang-format off */
#define __INSTR4(flag, dst_count, name, opc)                                   \
static inline struct ir3_instruction *ir3_##name(                              \
   struct ir3_block *block, struct ir3_instruction *a, unsigned aflags,        \
   struct ir3_instruction *b, unsigned bflags, struct ir3_instruction *c,      \
   unsigned cflags, struct ir3_instruction *d, unsigned dflags)                \
{                                                                              \
   struct ir3_instruction *instr =                                             \
      ir3_instr_create(block, opc, dst_count, 4);                              \
   for (unsigned i = 0; i < dst_count; i++)                                    \
      __ssa_dst(instr);                                                        \
   __ssa_src(instr, a, aflags);                                                \
   __ssa_src(instr, b, bflags);                                                \
   __ssa_src(instr, c, cflags);                                                \
   __ssa_src(instr, d, dflags);                                                \
   instr->flags |= flag;                                                       \
   return instr;                                                               \
}
/* clang-format on */
#define INSTR4F(f, name)  __INSTR4(IR3_INSTR_##f, 1, name##_##f, OPC_##name)
#define INSTR4(name)      __INSTR4(0, 1, name, OPC_##name)
#define INSTR4NODST(name) __INSTR4(0, 0, name, OPC_##name)
1946 
/* Generator for five-src builders (always one dst; no NODST variant). */
/* clang-format off */
#define __INSTR5(flag, name, opc)                                              \
static inline struct ir3_instruction *ir3_##name(                              \
   struct ir3_block *block, struct ir3_instruction *a, unsigned aflags,        \
   struct ir3_instruction *b, unsigned bflags, struct ir3_instruction *c,      \
   unsigned cflags, struct ir3_instruction *d, unsigned dflags,                \
   struct ir3_instruction *e, unsigned eflags)                                 \
{                                                                              \
   struct ir3_instruction *instr = ir3_instr_create(block, opc, 1, 5);         \
   __ssa_dst(instr);                                                           \
   __ssa_src(instr, a, aflags);                                                \
   __ssa_src(instr, b, bflags);                                                \
   __ssa_src(instr, c, cflags);                                                \
   __ssa_src(instr, d, dflags);                                                \
   __ssa_src(instr, e, eflags);                                                \
   instr->flags |= flag;                                                       \
   return instr;                                                               \
}
/* clang-format on */
#define INSTR5F(f, name) __INSTR5(IR3_INSTR_##f, name##_##f, OPC_##name)
#define INSTR5(name)     __INSTR5(0, name, OPC_##name)
1968 
/* Generator for six-src builders; dst_count is 0 or 1.
 *
 * Fix: the instruction was previously created with a hard-coded dst count
 * of 1 (ir3_instr_create(block, opc, 1, 6)) while the loop below populates
 * only 'dst_count' dsts.  For INSTR6NODST (dst_count == 0, e.g. STG_A)
 * that allocated a dst slot that was never initialized.  Use 'dst_count'
 * like the other __INSTRn generators do.
 */
/* clang-format off */
#define __INSTR6(flag, dst_count, name, opc)                                   \
static inline struct ir3_instruction *ir3_##name(                              \
   struct ir3_block *block, struct ir3_instruction *a, unsigned aflags,        \
   struct ir3_instruction *b, unsigned bflags, struct ir3_instruction *c,      \
   unsigned cflags, struct ir3_instruction *d, unsigned dflags,                \
   struct ir3_instruction *e, unsigned eflags, struct ir3_instruction *f,      \
   unsigned fflags)                                                            \
{                                                                              \
   struct ir3_instruction *instr =                                             \
      ir3_instr_create(block, opc, dst_count, 6);                              \
   for (unsigned i = 0; i < dst_count; i++)                                    \
      __ssa_dst(instr);                                                        \
   __ssa_src(instr, a, aflags);                                                \
   __ssa_src(instr, b, bflags);                                                \
   __ssa_src(instr, c, cflags);                                                \
   __ssa_src(instr, d, dflags);                                                \
   __ssa_src(instr, e, eflags);                                                \
   __ssa_src(instr, f, fflags);                                                \
   instr->flags |= flag;                                                       \
   return instr;                                                               \
}
/* clang-format on */
#define INSTR6F(f, name)  __INSTR6(IR3_INSTR_##f, 1, name##_##f, OPC_##name)
#define INSTR6(name)      __INSTR6(0, 1, name, OPC_##name)
#define INSTR6NODST(name) __INSTR6(0, 0, name, OPC_##name)
1994 
/* cat0 instructions: */
INSTR1NODST(B)
INSTR0(JUMP)
INSTR1NODST(KILL)
INSTR1NODST(DEMOTE)
INSTR0(END)
INSTR0(CHSH)
INSTR0(CHMASK)
INSTR1NODST(PREDT)
INSTR0(PREDF)
INSTR0(PREDE)
INSTR0(GETONE)

/* cat1 macros (lowered to real instructions later in compilation): */
INSTR1(ANY_MACRO)
INSTR1(ALL_MACRO)
INSTR1(READ_FIRST_MACRO)
INSTR2(READ_COND_MACRO)
2013 
2014 static inline struct ir3_instruction *
2015 ir3_ELECT_MACRO(struct ir3_block *block)
2016 {
2017    struct ir3_instruction *instr =
2018       ir3_instr_create(block, OPC_ELECT_MACRO, 1, 0);
2019    __ssa_dst(instr);
2020    return instr;
2021 }
2022 
/* cat2 instructions, most 2 src but some 1 src: */
INSTR2(ADD_F)
INSTR2(MIN_F)
INSTR2(MAX_F)
INSTR2(MUL_F)
INSTR1(SIGN_F)
INSTR2(CMPS_F)
INSTR1(ABSNEG_F)
INSTR2(CMPV_F)
INSTR1(FLOOR_F)
INSTR1(CEIL_F)
INSTR1(RNDNE_F)
INSTR1(RNDAZ_F)
INSTR1(TRUNC_F)
INSTR2(ADD_U)
INSTR2(ADD_S)
INSTR2(SUB_U)
INSTR2(SUB_S)
INSTR2(CMPS_U)
INSTR2(CMPS_S)
INSTR2(MIN_U)
INSTR2(MIN_S)
INSTR2(MAX_U)
INSTR2(MAX_S)
INSTR1(ABSNEG_S)
INSTR2(AND_B)
INSTR2(OR_B)
INSTR1(NOT_B)
INSTR2(XOR_B)
INSTR2(CMPV_U)
INSTR2(CMPV_S)
INSTR2(MUL_U24)
INSTR2(MUL_S24)
INSTR2(MULL_U)
INSTR1(BFREV_B)
INSTR1(CLZ_S)
INSTR1(CLZ_B)
INSTR2(SHL_B)
INSTR2(SHR_B)
INSTR2(ASHR_B)
INSTR2(BARY_F)
INSTR2(MGEN_B)
INSTR2(GETBIT_B)
INSTR1(SETRM)
INSTR1(CBITS_B)
INSTR2(SHB)
INSTR2(MSAD)

/* cat3 instructions (three-src multiply-add / select family): */
INSTR3(MAD_U16)
INSTR3(MADSH_U16)
INSTR3(MAD_S16)
INSTR3(MADSH_M16)
INSTR3(MAD_U24)
INSTR3(MAD_S24)
INSTR3(MAD_F16)
INSTR3(MAD_F32)
/* NOTE: SEL_B32 checks for zero vs nonzero */
INSTR3(SEL_B16)
INSTR3(SEL_B32)
INSTR3(SEL_S16)
INSTR3(SEL_S32)
INSTR3(SEL_F16)
INSTR3(SEL_F32)
INSTR3(SAD_S16)
INSTR3(SAD_S32)

/* cat4 instructions (transcendental/special-function unit): */
INSTR1(RCP)
INSTR1(RSQ)
INSTR1(HRSQ)
INSTR1(LOG2)
INSTR1(HLOG2)
INSTR1(EXP2)
INSTR1(HEXP2)
INSTR1(SIN)
INSTR1(COS)
INSTR1(SQRT)

/* cat5 instructions (derivatives; texture sampling is built by ir3_SAM): */
INSTR1(DSX)
INSTR1(DSXPP_MACRO)
INSTR1(DSY)
INSTR1(DSYPP_MACRO)
INSTR1F(3D, DSX)
INSTR1F(3D, DSY)
INSTR1(RGETPOS)
2110 
2111 static inline struct ir3_instruction *
2112 ir3_SAM(struct ir3_block *block, opc_t opc, type_t type, unsigned wrmask,
2113         unsigned flags, struct ir3_instruction *samp_tex,
2114         struct ir3_instruction *src0, struct ir3_instruction *src1)
2115 {
2116    struct ir3_instruction *sam;
2117    unsigned nreg = 0;
2118 
2119    if (flags & IR3_INSTR_S2EN) {
2120       nreg++;
2121    }
2122    if (src0) {
2123       nreg++;
2124    }
2125    if (src1) {
2126       nreg++;
2127    }
2128 
2129    sam = ir3_instr_create(block, opc, 1, nreg);
2130    sam->flags |= flags;
2131    __ssa_dst(sam)->wrmask = wrmask;
2132    if (flags & IR3_INSTR_S2EN) {
2133       __ssa_src(sam, samp_tex, (flags & IR3_INSTR_B) ? 0 : IR3_REG_HALF);
2134    }
2135    if (src0) {
2136       __ssa_src(sam, src0, 0);
2137    }
2138    if (src1) {
2139       __ssa_src(sam, src1, 0);
2140    }
2141    sam->cat5.type = type;
2142 
2143    return sam;
2144 }
2145 
/* cat6 instructions (memory loads/stores/atomics): */
INSTR2(LDLV)
INSTR3(LDG)
INSTR3(LDL)
INSTR3(LDLW)
INSTR3(LDP)
INSTR4NODST(STG)
INSTR3NODST(STL)
INSTR3NODST(STLW)
INSTR3NODST(STP)
INSTR1(RESINFO)
INSTR1(RESFMT)
INSTR2(ATOMIC_ADD)
INSTR2(ATOMIC_SUB)
INSTR2(ATOMIC_XCHG)
INSTR2(ATOMIC_INC)
INSTR2(ATOMIC_DEC)
INSTR2(ATOMIC_CMPXCHG)
INSTR2(ATOMIC_MIN)
INSTR2(ATOMIC_MAX)
INSTR2(ATOMIC_AND)
INSTR2(ATOMIC_OR)
INSTR2(ATOMIC_XOR)
INSTR2(LDC)
/* Global/image variants differ per generation in src count and layout: */
#if GPU >= 600
INSTR3NODST(STIB);
INSTR2(LDIB);
INSTR5(LDG_A);
INSTR6NODST(STG_A);
INSTR3F(G, ATOMIC_ADD)
INSTR3F(G, ATOMIC_SUB)
INSTR3F(G, ATOMIC_XCHG)
INSTR3F(G, ATOMIC_INC)
INSTR3F(G, ATOMIC_DEC)
INSTR3F(G, ATOMIC_CMPXCHG)
INSTR3F(G, ATOMIC_MIN)
INSTR3F(G, ATOMIC_MAX)
INSTR3F(G, ATOMIC_AND)
INSTR3F(G, ATOMIC_OR)
INSTR3F(G, ATOMIC_XOR)
#elif GPU >= 400
INSTR3(LDGB)
#if GPU >= 500
INSTR3(LDIB)
#endif
INSTR4NODST(STGB)
INSTR4NODST(STIB)
INSTR4F(G, ATOMIC_ADD)
INSTR4F(G, ATOMIC_SUB)
INSTR4F(G, ATOMIC_XCHG)
INSTR4F(G, ATOMIC_INC)
INSTR4F(G, ATOMIC_DEC)
INSTR4F(G, ATOMIC_CMPXCHG)
INSTR4F(G, ATOMIC_MIN)
INSTR4F(G, ATOMIC_MAX)
INSTR4F(G, ATOMIC_AND)
INSTR4F(G, ATOMIC_OR)
INSTR4F(G, ATOMIC_XOR)
#endif

/* cat7 instructions (barriers/fences): */
INSTR0(BAR)
INSTR0(FENCE)
2209 
2210 /* ************************************************************************* */
2211 #include "bitset.h"
2212 
#define MAX_REG 256

/* One bit per half-register slot; 2 * MAX_REG bits covers either a merged
 * register file (two half-reg slots per full reg) or two separate
 * half/full files of MAX_REG each.
 */
typedef BITSET_DECLARE(regmaskstate_t, 2 * MAX_REG);

typedef struct {
   bool mergedregs; /* a6xx+ merged half/full register file? */
   regmaskstate_t mask;
} regmask_t;
2221 
2222 static inline bool
__regmask_get(regmask_t * regmask,bool half,unsigned n)2223 __regmask_get(regmask_t *regmask, bool half, unsigned n)
2224 {
2225    if (regmask->mergedregs) {
2226       /* a6xx+ case, with merged register file, we track things in terms
2227        * of half-precision registers, with a full precisions register
2228        * using two half-precision slots.
2229        *
2230        * Pretend that special regs (a0.x, a1.x, etc.) are full registers to
2231        * avoid having them alias normal full regs.
2232        */
2233       if (half && !is_reg_num_special(n)) {
2234          return BITSET_TEST(regmask->mask, n);
2235       } else {
2236          n *= 2;
2237          return BITSET_TEST(regmask->mask, n) ||
2238                 BITSET_TEST(regmask->mask, n + 1);
2239       }
2240    } else {
2241       /* pre a6xx case, with separate register file for half and full
2242        * precision:
2243        */
2244       if (half)
2245          n += MAX_REG;
2246       return BITSET_TEST(regmask->mask, n);
2247    }
2248 }
2249 
2250 static inline void
__regmask_set(regmask_t * regmask,bool half,unsigned n)2251 __regmask_set(regmask_t *regmask, bool half, unsigned n)
2252 {
2253    if (regmask->mergedregs) {
2254       /* a6xx+ case, with merged register file, we track things in terms
2255        * of half-precision registers, with a full precisions register
2256        * using two half-precision slots:
2257        */
2258       if (half && !is_reg_num_special(n)) {
2259          BITSET_SET(regmask->mask, n);
2260       } else {
2261          n *= 2;
2262          BITSET_SET(regmask->mask, n);
2263          BITSET_SET(regmask->mask, n + 1);
2264       }
2265    } else {
2266       /* pre a6xx case, with separate register file for half and full
2267        * precision:
2268        */
2269       if (half)
2270          n += MAX_REG;
2271       BITSET_SET(regmask->mask, n);
2272    }
2273 }
2274 
/* Clear register 'n' (half or full precision) from the mask; inverse of
 * __regmask_set(), with identical slot-numbering rules.
 */
static inline void
__regmask_clear(regmask_t *regmask, bool half, unsigned n)
{
   if (regmask->mergedregs) {
      /* a6xx+ case, with merged register file, we track things in terms
       * of half-precision registers, with a full precisions register
       * using two half-precision slots:
       */
      if (half && !is_reg_num_special(n)) {
         BITSET_CLEAR(regmask->mask, n);
      } else {
         /* full (or special) reg occupies two consecutive half slots: */
         n *= 2;
         BITSET_CLEAR(regmask->mask, n);
         BITSET_CLEAR(regmask->mask, n + 1);
      }
   } else {
      /* pre a6xx case, with separate register file for half and full
       * precision:
       */
      if (half)
         n += MAX_REG;
      BITSET_CLEAR(regmask->mask, n);
   }
}
2299 
2300 static inline void
regmask_init(regmask_t * regmask,bool mergedregs)2301 regmask_init(regmask_t *regmask, bool mergedregs)
2302 {
2303    memset(&regmask->mask, 0, sizeof(regmask->mask));
2304    regmask->mergedregs = mergedregs;
2305 }
2306 
2307 static inline void
regmask_or(regmask_t * dst,regmask_t * a,regmask_t * b)2308 regmask_or(regmask_t *dst, regmask_t *a, regmask_t *b)
2309 {
2310    assert(dst->mergedregs == a->mergedregs);
2311    assert(dst->mergedregs == b->mergedregs);
2312 
2313    for (unsigned i = 0; i < ARRAY_SIZE(dst->mask); i++)
2314       dst->mask[i] = a->mask[i] | b->mask[i];
2315 }
2316 
2317 
2318 static inline void
regmask_set(regmask_t * regmask,struct ir3_register * reg)2319 regmask_set(regmask_t *regmask, struct ir3_register *reg)
2320 {
2321    bool half = reg->flags & IR3_REG_HALF;
2322    if (reg->flags & IR3_REG_RELATIV) {
2323       for (unsigned i = 0; i < reg->size; i++)
2324          __regmask_set(regmask, half, reg->array.base + i);
2325    } else {
2326       for (unsigned mask = reg->wrmask, n = reg->num; mask; mask >>= 1, n++)
2327          if (mask & 1)
2328             __regmask_set(regmask, half, n);
2329    }
2330 }
2331 
2332 static inline bool
regmask_get(regmask_t * regmask,struct ir3_register * reg)2333 regmask_get(regmask_t *regmask, struct ir3_register *reg)
2334 {
2335    bool half = reg->flags & IR3_REG_HALF;
2336    if (reg->flags & IR3_REG_RELATIV) {
2337       for (unsigned i = 0; i < reg->size; i++)
2338          if (__regmask_get(regmask, half, reg->array.base + i))
2339             return true;
2340    } else {
2341       for (unsigned mask = reg->wrmask, n = reg->num; mask; mask >>= 1, n++)
2342          if (mask & 1)
2343             if (__regmask_get(regmask, half, n))
2344                return true;
2345    }
2346    return false;
2347 }
2348 /* ************************************************************************* */
2349 
2350 #endif /* IR3_H_ */
2351