1 /*
2  * Copyright © 2018 Valve Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  *
23  */
24 
25 #ifndef ACO_IR_H
26 #define ACO_IR_H
27 
28 #include "aco_opcodes.h"
29 #include "aco_util.h"
30 
31 #include "vulkan/radv_shader.h"
32 
33 #include "nir.h"
34 
35 #include <bitset>
36 #include <memory>
37 #include <vector>
38 
39 struct radv_shader_args;
40 struct radv_shader_info;
41 struct radv_vs_prolog_key;
42 
43 namespace aco {
44 
45 extern uint64_t debug_flags;
46 
47 enum {
48    DEBUG_VALIDATE_IR = 0x1,
49    DEBUG_VALIDATE_RA = 0x2,
50    DEBUG_PERFWARN = 0x4,
51    DEBUG_FORCE_WAITCNT = 0x8,
52    DEBUG_NO_VN = 0x10,
53    DEBUG_NO_OPT = 0x20,
54    DEBUG_NO_SCHED = 0x40,
55    DEBUG_PERF_INFO = 0x80,
56    DEBUG_LIVE_INFO = 0x100,
57 };
58 
59 /**
60  * Representation of the instruction's microcode encoding format
61  * Note: Some Vector ALU Formats can be combined, such that:
62  * - VOP2* | VOP3 represents a VOP2 instruction in VOP3 encoding
63  * - VOP2* | DPP represents a VOP2 instruction with data parallel primitive.
64  * - VOP2* | SDWA represents a VOP2 instruction with sub-dword addressing.
65  *
66  * (*) The same is applicable for VOP1 and VOPC instructions.
67  */
68 enum class Format : std::uint16_t {
69    /* Pseudo Instruction Format */
70    PSEUDO = 0,
71    /* Scalar ALU & Control Formats */
72    SOP1 = 1,
73    SOP2 = 2,
74    SOPK = 3,
75    SOPP = 4,
76    SOPC = 5,
77    /* Scalar Memory Format */
78    SMEM = 6,
79    /* LDS/GDS Format */
80    DS = 8,
81    /* Vector Memory Buffer Formats */
82    MTBUF = 9,
83    MUBUF = 10,
84    /* Vector Memory Image Format */
85    MIMG = 11,
86    /* Export Format */
87    EXP = 12,
88    /* Flat Formats */
89    FLAT = 13,
90    GLOBAL = 14,
91    SCRATCH = 15,
92 
93    PSEUDO_BRANCH = 16,
94    PSEUDO_BARRIER = 17,
95    PSEUDO_REDUCTION = 18,
96 
97    /* Vector ALU Formats */
98    VOP3P = 19,
99    VOP1 = 1 << 8,
100    VOP2 = 1 << 9,
101    VOPC = 1 << 10,
102    VOP3 = 1 << 11,
103    /* Vector Parameter Interpolation Format */
104    VINTRP = 1 << 12,
105    DPP = 1 << 13,
106    SDWA = 1 << 14,
107 };
108 
109 enum class instr_class : uint8_t {
110    valu32 = 0,
111    valu_convert32 = 1,
112    valu64 = 2,
113    valu_quarter_rate32 = 3,
114    valu_fma = 4,
115    valu_transcendental32 = 5,
116    valu_double = 6,
117    valu_double_add = 7,
118    valu_double_convert = 8,
119    valu_double_transcendental = 9,
120    salu = 10,
121    smem = 11,
122    barrier = 12,
123    branch = 13,
124    sendmsg = 14,
125    ds = 15,
126    exp = 16,
127    vmem = 17,
128    waitcnt = 18,
129    other = 19,
130    count,
131 };
132 
133 enum storage_class : uint8_t {
134    storage_none = 0x0,   /* no synchronization and can be reordered around aliasing stores */
135    storage_buffer = 0x1, /* SSBOs and global memory */
136    storage_atomic_counter = 0x2, /* not used for Vulkan */
137    storage_image = 0x4,
138    storage_shared = 0x8,       /* or TCS output */
139    storage_vmem_output = 0x10, /* GS or TCS output stores using VMEM */
140    storage_scratch = 0x20,
141    storage_vgpr_spill = 0x40,
142    storage_count = 8,
143 };
144 
145 enum memory_semantics : uint8_t {
146    semantic_none = 0x0,
147    /* for loads: don't move any access after this load to before this load (even other loads)
148     * for barriers: don't move any access after the barrier to before any
149     * atomics/control_barriers/sendmsg_gs_done before the barrier */
150    semantic_acquire = 0x1,
151    /* for stores: don't move any access before this store to after this store
152     * for barriers: don't move any access before the barrier to after any
153     * atomics/control_barriers/sendmsg_gs_done after the barrier */
154    semantic_release = 0x2,
155 
156    /* the rest are for load/stores/atomics only */
157    /* cannot be DCE'd or CSE'd */
158    semantic_volatile = 0x4,
159    /* does not interact with barriers and assumes this lane is the only lane
160     * accessing this memory */
161    semantic_private = 0x8,
162    /* this operation can be reordered around operations of the same storage.
163     * says nothing about barriers */
164    semantic_can_reorder = 0x10,
165    /* this is an atomic instruction (it may only read or only write memory) */
166    semantic_atomic = 0x20,
167    /* this instruction both reads and writes memory */
168    semantic_rmw = 0x40,
169 
170    semantic_acqrel = semantic_acquire | semantic_release,
171    semantic_atomicrmw = semantic_volatile | semantic_atomic | semantic_rmw,
172 };
173 
174 enum sync_scope : uint8_t {
175    scope_invocation = 0,
176    scope_subgroup = 1,
177    scope_workgroup = 2,
178    scope_queuefamily = 3,
179    scope_device = 4,
180 };
181 
182 struct memory_sync_info {
183    memory_sync_info() : storage(storage_none), semantics(semantic_none), scope(scope_invocation) {}
184    memory_sync_info(int storage_, int semantics_ = 0, sync_scope scope_ = scope_invocation)
185        : storage((storage_class)storage_), semantics((memory_semantics)semantics_), scope(scope_)
186    {}
187 
188    storage_class storage : 8;
189    memory_semantics semantics : 8;
190    sync_scope scope : 8;
191 
192    bool operator==(const memory_sync_info& rhs) const
193    {
194       return storage == rhs.storage && semantics == rhs.semantics && scope == rhs.scope;
195    }
196 
197    bool can_reorder() const
198    {
199       if (semantics & semantic_acqrel)
200          return false;
201       /* Also check storage so that zero-initialized memory_sync_info can be
202        * reordered. */
203       return (!storage || (semantics & semantic_can_reorder)) && !(semantics & semantic_volatile);
204    }
205 };
206 static_assert(sizeof(memory_sync_info) == 3, "Unexpected padding");
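/* Illustrative usage sketch (not part of the original header): how a pass
 * might describe the synchronization of a few memory operations. The exact
 * instructions these would be attached to are assumptions.
 *
 *    // plain reorderable SSBO load
 *    memory_sync_info(storage_buffer, semantic_can_reorder);
 *    // SSBO atomic: volatile + atomic + rmw, never reordered
 *    memory_sync_info(storage_buffer, semantic_atomicrmw);
 *    // a zero-initialized memory_sync_info is fully reorderable
 *    memory_sync_info();
 */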
207 
208 enum fp_round {
209    fp_round_ne = 0,
210    fp_round_pi = 1,
211    fp_round_ni = 2,
212    fp_round_tz = 3,
213 };
214 
215 enum fp_denorm {
216    /* Note that v_rcp_f32, v_exp_f32, v_log_f32, v_sqrt_f32, v_rsq_f32 and
217     * v_mad_f32/v_madak_f32/v_madmk_f32/v_mac_f32 always flush denormals. */
218    fp_denorm_flush = 0x0,
219    fp_denorm_keep_in = 0x1,
220    fp_denorm_keep_out = 0x2,
221    fp_denorm_keep = 0x3,
222 };
223 
224 struct float_mode {
225    /* matches encoding of the MODE register */
226    union {
227       struct {
228          fp_round round32 : 2;
229          fp_round round16_64 : 2;
230          unsigned denorm32 : 2;
231          unsigned denorm16_64 : 2;
232       };
233       struct {
234          uint8_t round : 4;
235          uint8_t denorm : 4;
236       };
237       uint8_t val = 0;
238    };
239    /* if false, optimizations which may remove infs/nan/-0.0 can be done */
240    bool preserve_signed_zero_inf_nan32 : 1;
241    bool preserve_signed_zero_inf_nan16_64 : 1;
242    /* if false, optimizations which may remove denormal flushing can be done */
243    bool must_flush_denorms32 : 1;
244    bool must_flush_denorms16_64 : 1;
245    bool care_about_round32 : 1;
246    bool care_about_round16_64 : 1;
247 
248    /* Returns true if instructions using the mode "other" can safely use the
249     * current one instead. */
250    bool canReplace(float_mode other) const noexcept
251    {
252       return val == other.val &&
253              (preserve_signed_zero_inf_nan32 || !other.preserve_signed_zero_inf_nan32) &&
254              (preserve_signed_zero_inf_nan16_64 || !other.preserve_signed_zero_inf_nan16_64) &&
255              (must_flush_denorms32 || !other.must_flush_denorms32) &&
256              (must_flush_denorms16_64 || !other.must_flush_denorms16_64) &&
257              (care_about_round32 || !other.care_about_round32) &&
258              (care_about_round16_64 || !other.care_about_round16_64);
259    }
260 };
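/* Example of the asymmetry above (illustrative): with identical MODE bits
 * (val), a float_mode that sets must_flush_denorms32 canReplace() one that
 * does not, because the replacement only adds guarantees; the reverse
 * direction returns false. */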
261 
262 struct wait_imm {
263    static const uint8_t unset_counter = 0xff;
264 
265    uint8_t vm;
266    uint8_t exp;
267    uint8_t lgkm;
268    uint8_t vs;
269 
270    wait_imm();
271    wait_imm(uint16_t vm_, uint16_t exp_, uint16_t lgkm_, uint16_t vs_);
272    wait_imm(enum chip_class chip, uint16_t packed);
273 
274    uint16_t pack(enum chip_class chip) const;
275 
276    bool combine(const wait_imm& other);
277 
278    bool empty() const;
279 };
280 
281 constexpr Format
282 asVOP3(Format format)
283 {
284    return (Format)((uint32_t)Format::VOP3 | (uint32_t)format);
285 };
286 
287 constexpr Format
288 asSDWA(Format format)
289 {
290    assert(format == Format::VOP1 || format == Format::VOP2 || format == Format::VOPC);
291    return (Format)((uint32_t)Format::SDWA | (uint32_t)format);
292 }
293 
294 constexpr Format
295 withoutDPP(Format format)
296 {
297    return (Format)((uint32_t)format & ~(uint32_t)Format::DPP);
298 }
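/* Small compile-time sketch (added for illustration, not part of the original
 * interface) of the combined encodings described above: asVOP3() just ORs the
 * VOP3 bit into the base format, so a VOP2 instruction in the VOP3 encoding
 * keeps both format bits set. */
static_assert(((uint16_t)asVOP3(Format::VOP2) & (uint16_t)Format::VOP2) != 0 &&
                 ((uint16_t)asVOP3(Format::VOP2) & (uint16_t)Format::VOP3) != 0,
              "VOP2 | VOP3 describes a VOP2 instruction in the VOP3 encoding");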
299 
300 enum class RegType {
301    none = 0,
302    sgpr,
303    vgpr,
304    linear_vgpr,
305 };
306 
307 struct RegClass {
308 
309    enum RC : uint8_t {
310       s1 = 1,
311       s2 = 2,
312       s3 = 3,
313       s4 = 4,
314       s6 = 6,
315       s8 = 8,
316       s16 = 16,
317       v1 = s1 | (1 << 5),
318       v2 = s2 | (1 << 5),
319       v3 = s3 | (1 << 5),
320       v4 = s4 | (1 << 5),
321       v5 = 5 | (1 << 5),
322       v6 = 6 | (1 << 5),
323       v7 = 7 | (1 << 5),
324       v8 = 8 | (1 << 5),
325       /* byte-sized register class */
326       v1b = v1 | (1 << 7),
327       v2b = v2 | (1 << 7),
328       v3b = v3 | (1 << 7),
329       v4b = v4 | (1 << 7),
330       v6b = v6 | (1 << 7),
331       v8b = v8 | (1 << 7),
332       /* these are used for WWM and spills to vgpr */
333       v1_linear = v1 | (1 << 6),
334       v2_linear = v2 | (1 << 6),
335    };
336 
337    RegClass() = default;
338    constexpr RegClass(RC rc_) : rc(rc_) {}
339    constexpr RegClass(RegType type, unsigned size)
340        : rc((RC)((type == RegType::vgpr ? 1 << 5 : 0) | size))
341    {}
342 
343    constexpr operator RC() const { return rc; }
344    explicit operator bool() = delete;
345 
346    constexpr RegType type() const { return rc <= RC::s16 ? RegType::sgpr : RegType::vgpr; }
347    constexpr bool is_linear_vgpr() const { return rc & (1 << 6); };
348    constexpr bool is_subdword() const { return rc & (1 << 7); }
349    constexpr unsigned bytes() const { return ((unsigned)rc & 0x1F) * (is_subdword() ? 1 : 4); }
350    // TODO: use size() less in favor of bytes()
351    constexpr unsigned size() const { return (bytes() + 3) >> 2; }
352    constexpr bool is_linear() const { return rc <= RC::s16 || is_linear_vgpr(); }
353    constexpr RegClass as_linear() const { return RegClass((RC)(rc | (1 << 6))); }
354    constexpr RegClass as_subdword() const { return RegClass((RC)(rc | 1 << 7)); }
355 
356    static constexpr RegClass get(RegType type, unsigned bytes)
357    {
358       if (type == RegType::sgpr) {
359          return RegClass(type, DIV_ROUND_UP(bytes, 4u));
360       } else {
361          return bytes % 4u ? RegClass(type, bytes).as_subdword() : RegClass(type, bytes / 4u);
362       }
363    }
364 
365    constexpr RegClass resize(unsigned bytes) const
366    {
367       if (is_linear_vgpr()) {
368          assert(bytes % 4u == 0);
369          return get(RegType::vgpr, bytes).as_linear();
370       }
371       return get(type(), bytes);
372    }
373 
374 private:
375    RC rc;
376 };
377 
378 /* transitional helper expressions */
379 static constexpr RegClass s1{RegClass::s1};
380 static constexpr RegClass s2{RegClass::s2};
381 static constexpr RegClass s3{RegClass::s3};
382 static constexpr RegClass s4{RegClass::s4};
383 static constexpr RegClass s8{RegClass::s8};
384 static constexpr RegClass s16{RegClass::s16};
385 static constexpr RegClass v1{RegClass::v1};
386 static constexpr RegClass v2{RegClass::v2};
387 static constexpr RegClass v3{RegClass::v3};
388 static constexpr RegClass v4{RegClass::v4};
389 static constexpr RegClass v5{RegClass::v5};
390 static constexpr RegClass v6{RegClass::v6};
391 static constexpr RegClass v7{RegClass::v7};
392 static constexpr RegClass v8{RegClass::v8};
393 static constexpr RegClass v1b{RegClass::v1b};
394 static constexpr RegClass v2b{RegClass::v2b};
395 static constexpr RegClass v3b{RegClass::v3b};
396 static constexpr RegClass v4b{RegClass::v4b};
397 static constexpr RegClass v6b{RegClass::v6b};
398 static constexpr RegClass v8b{RegClass::v8b};
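/* Compile-time sketch (added for illustration, not from the original header)
 * of how RegClass::get() rounds sizes: SGPR classes are whole dwords, while a
 * VGPR request with a non-dword size becomes a sub-dword class. */
static_assert(RegClass::get(RegType::sgpr, 6) == s2, "6 bytes round up to 2 SGPRs");
static_assert(RegClass::get(RegType::vgpr, 6) == v6b, "6 bytes of VGPR is a sub-dword class");
static_assert(v6b.bytes() == 6 && v6b.size() == 2, "sub-dword classes still occupy whole registers");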
399 
400 /**
401  * Temp Class
402  * Each temporary virtual register has a
403  * register class (i.e. size and type)
404  * and SSA id.
405  */
406 struct Temp {
407    Temp() noexcept : id_(0), reg_class(0) {}
408    constexpr Temp(uint32_t id, RegClass cls) noexcept : id_(id), reg_class(uint8_t(cls)) {}
409 
410    constexpr uint32_t id() const noexcept { return id_; }
411    constexpr RegClass regClass() const noexcept { return (RegClass::RC)reg_class; }
412 
413    constexpr unsigned bytes() const noexcept { return regClass().bytes(); }
414    constexpr unsigned size() const noexcept { return regClass().size(); }
415    constexpr RegType type() const noexcept { return regClass().type(); }
416    constexpr bool is_linear() const noexcept { return regClass().is_linear(); }
417 
418    constexpr bool operator<(Temp other) const noexcept { return id() < other.id(); }
419    constexpr bool operator==(Temp other) const noexcept { return id() == other.id(); }
420    constexpr bool operator!=(Temp other) const noexcept { return id() != other.id(); }
421 
422 private:
423    uint32_t id_ : 24;
424    uint32_t reg_class : 8;
425 };
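/* Illustrative compile-time checks (added here, not in the original header):
 * a Temp is just a 24-bit SSA id packed with an 8-bit register class. */
static_assert(sizeof(Temp) == 4, "Temp should stay a single dword");
static_assert(Temp(1, v2b).bytes() == 2 && Temp(1, v2b).type() == RegType::vgpr,
              "the register class fully determines size and type");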
426 
427 /**
428  * PhysReg
429  * Represents the physical register for each
430  * Operand and Definition.
431  */
432 struct PhysReg {
433    constexpr PhysReg() = default;
434    explicit constexpr PhysReg(unsigned r) : reg_b(r << 2) {}
435    constexpr unsigned reg() const { return reg_b >> 2; }
436    constexpr unsigned byte() const { return reg_b & 0x3; }
437    constexpr operator unsigned() const { return reg(); }
438    constexpr bool operator==(PhysReg other) const { return reg_b == other.reg_b; }
439    constexpr bool operator!=(PhysReg other) const { return reg_b != other.reg_b; }
440    constexpr bool operator<(PhysReg other) const { return reg_b < other.reg_b; }
441    constexpr PhysReg advance(int bytes) const
442    {
443       PhysReg res = *this;
444       res.reg_b += bytes;
445       return res;
446    }
447 
448    uint16_t reg_b = 0;
449 };
450 
451 /* helper expressions for special registers */
452 static constexpr PhysReg m0{124};
453 static constexpr PhysReg vcc{106};
454 static constexpr PhysReg vcc_hi{107};
455 static constexpr PhysReg tba{108}; /* GFX6-GFX8 */
456 static constexpr PhysReg tma{110}; /* GFX6-GFX8 */
457 static constexpr PhysReg ttmp0{112};
458 static constexpr PhysReg ttmp1{113};
459 static constexpr PhysReg ttmp2{114};
460 static constexpr PhysReg ttmp3{115};
461 static constexpr PhysReg ttmp4{116};
462 static constexpr PhysReg ttmp5{117};
463 static constexpr PhysReg ttmp6{118};
464 static constexpr PhysReg ttmp7{119};
465 static constexpr PhysReg ttmp8{120};
466 static constexpr PhysReg ttmp9{121};
467 static constexpr PhysReg ttmp10{122};
468 static constexpr PhysReg ttmp11{123};
469 static constexpr PhysReg sgpr_null{125}; /* GFX10+ */
470 static constexpr PhysReg exec{126};
471 static constexpr PhysReg exec_lo{126};
472 static constexpr PhysReg exec_hi{127};
473 static constexpr PhysReg vccz{251};
474 static constexpr PhysReg execz{252};
475 static constexpr PhysReg scc{253};
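/* Byte-addressing sketch (illustrative, not from the original header): reg_b
 * stores a byte address, so adjacent dword registers differ by 4 bytes and
 * advance() can step into the middle of a register. */
static_assert(vcc_hi == vcc.advance(4), "the next SGPR starts 4 bytes later");
static_assert(PhysReg(3).advance(2).reg() == 3 && PhysReg(3).advance(2).byte() == 2,
              "sub-dword offsets stay within the same register");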
476 
477 /**
478  * Operand Class
479  * Initially, each Operand refers to either
480  * a temporary virtual register
481  * or to a constant value.
482  * Temporary registers get mapped to physical registers during RA.
483  * Constant values are inlined into the instruction sequence.
484  */
485 class Operand final {
486 public:
487    constexpr Operand()
488        : reg_(PhysReg{128}), isTemp_(false), isFixed_(true), isConstant_(false), isKill_(false),
489          isUndef_(true), isFirstKill_(false), constSize(0), isLateKill_(false), is16bit_(false),
490          is24bit_(false), signext(false)
491    {}
492 
493    explicit Operand(Temp r) noexcept
494    {
495       data_.temp = r;
496       if (r.id()) {
497          isTemp_ = true;
498       } else {
499          isUndef_ = true;
500          setFixed(PhysReg{128});
501       }
502    };
503    explicit Operand(Temp r, PhysReg reg) noexcept
504    {
505       assert(r.id()); /* Don't allow fixing an undef to a register */
506       data_.temp = r;
507       isTemp_ = true;
508       setFixed(reg);
509    };
510 
511    /* 8-bit constant */
512    static Operand c8(uint8_t v) noexcept
513    {
514       /* 8-bit constants are only used for copies, and a copy from any 8-bit
515        * constant can be implemented with an SDWA v_mul_u32_u24, so consider
516        * all of them to be inline constants. */
517       Operand op;
518       op.control_ = 0;
519       op.data_.i = v;
520       op.isConstant_ = true;
521       op.constSize = 0;
522       op.setFixed(PhysReg{0u});
523       return op;
524    };
525 
526    /* 16-bit constant */
527    static Operand c16(uint16_t v) noexcept
528    {
529       Operand op;
530       op.control_ = 0;
531       op.data_.i = v;
532       op.isConstant_ = true;
533       op.constSize = 1;
534       if (v <= 64)
535          op.setFixed(PhysReg{128u + v});
536       else if (v >= 0xFFF0) /* [-16 .. -1] */
537          op.setFixed(PhysReg{(unsigned)(192 - (int16_t)v)});
538       else if (v == 0x3800) /* 0.5 */
539          op.setFixed(PhysReg{240});
540       else if (v == 0xB800) /* -0.5 */
541          op.setFixed(PhysReg{241});
542       else if (v == 0x3C00) /* 1.0 */
543          op.setFixed(PhysReg{242});
544       else if (v == 0xBC00) /* -1.0 */
545          op.setFixed(PhysReg{243});
546       else if (v == 0x4000) /* 2.0 */
547          op.setFixed(PhysReg{244});
548       else if (v == 0xC000) /* -2.0 */
549          op.setFixed(PhysReg{245});
550       else if (v == 0x4400) /* 4.0 */
551          op.setFixed(PhysReg{246});
552       else if (v == 0xC400) /* -4.0 */
553          op.setFixed(PhysReg{247});
554       else if (v == 0x3118) /* 1/2 PI */
555          op.setFixed(PhysReg{248});
556       else /* Literal Constant */
557          op.setFixed(PhysReg{255});
558       return op;
559    }
560 
561    /* 32-bit constant */
562    static Operand c32(uint32_t v) noexcept { return c32_or_c64(v, false); }
563 
564    /* 64-bit constant */
565    static Operand c64(uint64_t v) noexcept
566    {
567       Operand op;
568       op.control_ = 0;
569       op.isConstant_ = true;
570       op.constSize = 3;
571       if (v <= 64) {
572          op.data_.i = (uint32_t)v;
573          op.setFixed(PhysReg{128 + (uint32_t)v});
574       } else if (v >= 0xFFFFFFFFFFFFFFF0) { /* [-16 .. -1] */
575          op.data_.i = (uint32_t)v;
576          op.setFixed(PhysReg{192 - (uint32_t)v});
577       } else if (v == 0x3FE0000000000000) { /* 0.5 */
578          op.data_.i = 0x3f000000;
579          op.setFixed(PhysReg{240});
580       } else if (v == 0xBFE0000000000000) { /* -0.5 */
581          op.data_.i = 0xbf000000;
582          op.setFixed(PhysReg{241});
583       } else if (v == 0x3FF0000000000000) { /* 1.0 */
584          op.data_.i = 0x3f800000;
585          op.setFixed(PhysReg{242});
586       } else if (v == 0xBFF0000000000000) { /* -1.0 */
587          op.data_.i = 0xbf800000;
588          op.setFixed(PhysReg{243});
589       } else if (v == 0x4000000000000000) { /* 2.0 */
590          op.data_.i = 0x40000000;
591          op.setFixed(PhysReg{244});
592       } else if (v == 0xC000000000000000) { /* -2.0 */
593          op.data_.i = 0xc0000000;
594          op.setFixed(PhysReg{245});
595       } else if (v == 0x4010000000000000) { /* 4.0 */
596          op.data_.i = 0x40800000;
597          op.setFixed(PhysReg{246});
598       } else if (v == 0xC010000000000000) { /* -4.0 */
599          op.data_.i = 0xc0800000;
600          op.setFixed(PhysReg{247});
601       } else { /* Literal Constant: we don't know if it is a long or double.*/
602          op.signext = v >> 63;
603          op.data_.i = v & 0xffffffffu;
604          op.setFixed(PhysReg{255});
605          assert(op.constantValue64() == v &&
606                 "attempt to create an unrepresentable 64-bit literal constant");
607       }
608       return op;
609    }
610 
611    /* 32-bit constant stored as a 32-bit or 64-bit operand */
612    static Operand c32_or_c64(uint32_t v, bool is64bit) noexcept
613    {
614       Operand op;
615       op.control_ = 0;
616       op.data_.i = v;
617       op.isConstant_ = true;
618       op.constSize = is64bit ? 3 : 2;
619       if (v <= 64)
620          op.setFixed(PhysReg{128 + v});
621       else if (v >= 0xFFFFFFF0) /* [-16 .. -1] */
622          op.setFixed(PhysReg{192 - v});
623       else if (v == 0x3f000000) /* 0.5 */
624          op.setFixed(PhysReg{240});
625       else if (v == 0xbf000000) /* -0.5 */
626          op.setFixed(PhysReg{241});
627       else if (v == 0x3f800000) /* 1.0 */
628          op.setFixed(PhysReg{242});
629       else if (v == 0xbf800000) /* -1.0 */
630          op.setFixed(PhysReg{243});
631       else if (v == 0x40000000) /* 2.0 */
632          op.setFixed(PhysReg{244});
633       else if (v == 0xc0000000) /* -2.0 */
634          op.setFixed(PhysReg{245});
635       else if (v == 0x40800000) /* 4.0 */
636          op.setFixed(PhysReg{246});
637       else if (v == 0xc0800000) /* -4.0 */
638          op.setFixed(PhysReg{247});
639       else { /* Literal Constant */
640          assert(!is64bit && "attempt to create a 64-bit literal constant");
641          op.setFixed(PhysReg{255});
642       }
643       return op;
644    }
645 
646    explicit Operand(RegClass type) noexcept
647    {
648       isUndef_ = true;
649       data_.temp = Temp(0, type);
650       setFixed(PhysReg{128});
651    };
652    explicit Operand(PhysReg reg, RegClass type) noexcept
653    {
654       data_.temp = Temp(0, type);
655       setFixed(reg);
656    }
657 
658    static Operand zero(unsigned bytes = 4) noexcept
659    {
660       if (bytes == 8)
661          return Operand::c64(0);
662       else if (bytes == 4)
663          return Operand::c32(0);
664       else if (bytes == 2)
665          return Operand::c16(0);
666       assert(bytes == 1);
667       return Operand::c8(0);
668    }
669 
670    /* This is useful over the constructors when the operand size is not known
671     * statically, or when the chip class is needed so 1/2 PI can be inlined.
672     */
673    static Operand get_const(enum chip_class chip, uint64_t val, unsigned bytes)
674    {
675       if (val == 0x3e22f983 && bytes == 4 && chip >= GFX8) {
676          /* 1/2 PI can be an inline constant on GFX8+ */
677          Operand op = Operand::c32(val);
678          op.setFixed(PhysReg{248});
679          return op;
680       }
681 
682       if (bytes == 8)
683          return Operand::c64(val);
684       else if (bytes == 4)
685          return Operand::c32(val);
686       else if (bytes == 2)
687          return Operand::c16(val);
688       assert(bytes == 1);
689       return Operand::c8(val);
690    }
691 
692    static bool is_constant_representable(uint64_t val, unsigned bytes, bool zext = false,
693                                          bool sext = false)
694    {
695       if (bytes <= 4)
696          return true;
697 
698       if (zext && (val & 0xFFFFFFFF00000000) == 0x0000000000000000)
699          return true;
700       uint64_t upper33 = val & 0xFFFFFFFF80000000;
701       if (sext && (upper33 == 0xFFFFFFFF80000000 || upper33 == 0))
702          return true;
703 
704       return val >= 0xFFFFFFFFFFFFFFF0 || val <= 64 || /* [-16 .. 64] */
705              val == 0x3FE0000000000000 ||              /* 0.5 */
706              val == 0xBFE0000000000000 ||              /* -0.5 */
707              val == 0x3FF0000000000000 ||              /* 1.0 */
708              val == 0xBFF0000000000000 ||              /* -1.0 */
709              val == 0x4000000000000000 ||              /* 2.0 */
710              val == 0xC000000000000000 ||              /* -2.0 */
711              val == 0x4010000000000000 ||              /* 4.0 */
712              val == 0xC010000000000000;                /* -4.0 */
713    }
714 
715    constexpr bool isTemp() const noexcept { return isTemp_; }
716
717    constexpr void setTemp(Temp t) noexcept
718    {
719       assert(!isConstant_);
720       isTemp_ = true;
721       data_.temp = t;
722    }
723 
724    constexpr Temp getTemp() const noexcept { return data_.temp; }
725
726    constexpr uint32_t tempId() const noexcept { return data_.temp.id(); }
727
728    constexpr bool hasRegClass() const noexcept { return isTemp() || isUndefined(); }
729
730    constexpr RegClass regClass() const noexcept { return data_.temp.regClass(); }
731
732    constexpr unsigned bytes() const noexcept
733    {
734       if (isConstant())
735          return 1 << constSize;
736       else
737          return data_.temp.bytes();
738    }
739 
740    constexpr unsigned size() const noexcept
741    {
742       if (isConstant())
743          return constSize > 2 ? 2 : 1;
744       else
745          return data_.temp.size();
746    }
747 
748    constexpr bool isFixed() const noexcept { return isFixed_; }
749
750    constexpr PhysReg physReg() const noexcept { return reg_; }
751
752    constexpr void setFixed(PhysReg reg) noexcept
753    {
754       isFixed_ = reg != unsigned(-1);
755       reg_ = reg;
756    }
757 
758    constexpr bool isConstant() const noexcept { return isConstant_; }
759
760    constexpr bool isLiteral() const noexcept { return isConstant() && reg_ == 255; }
761
762    constexpr bool isUndefined() const noexcept { return isUndef_; }
763
764    constexpr uint32_t constantValue() const noexcept { return data_.i; }
765
766    constexpr bool constantEquals(uint32_t cmp) const noexcept
767    {
768       return isConstant() && constantValue() == cmp;
769    }
770 
771    constexpr uint64_t constantValue64() const noexcept
772    {
773       if (constSize == 3) {
774          if (reg_ <= 192)
775             return reg_ - 128;
776          else if (reg_ <= 208)
777             return 0xFFFFFFFFFFFFFFFF - (reg_ - 193);
778 
779          switch (reg_) {
780          case 240: return 0x3FE0000000000000;
781          case 241: return 0xBFE0000000000000;
782          case 242: return 0x3FF0000000000000;
783          case 243: return 0xBFF0000000000000;
784          case 244: return 0x4000000000000000;
785          case 245: return 0xC000000000000000;
786          case 246: return 0x4010000000000000;
787          case 247: return 0xC010000000000000;
788          case 255:
789             return (signext && (data_.i & 0x80000000u) ? 0xffffffff00000000ull : 0ull) | data_.i;
790          }
791          unreachable("invalid register for 64-bit constant");
792       } else {
793          return data_.i;
794       }
795    }
796 
797    constexpr bool isOfType(RegType type) const noexcept
798    {
799       return hasRegClass() && regClass().type() == type;
800    }
801 
802    /* Indicates that the killed operand's live range intersects with the
803     * instruction's definitions. Unlike isKill() and isFirstKill(), this is
804     * not set by liveness analysis. */
805    constexpr void setLateKill(bool flag) noexcept { isLateKill_ = flag; }
806
807    constexpr bool isLateKill() const noexcept { return isLateKill_; }
808
809    constexpr void setKill(bool flag) noexcept
810    {
811       isKill_ = flag;
812       if (!flag)
813          setFirstKill(false);
814    }
815 
816    constexpr bool isKill() const noexcept { return isKill_ || isFirstKill(); }
817
818    constexpr void setFirstKill(bool flag) noexcept
819    {
820       isFirstKill_ = flag;
821       if (flag)
822          setKill(flag);
823    }
824 
825    /* When there are multiple operands killing the same temporary,
826     * isFirstKill() only returns true for the first one. */
827    constexpr bool isFirstKill() const noexcept { return isFirstKill_; }
828
829    constexpr bool isKillBeforeDef() const noexcept { return isKill() && !isLateKill(); }
830
831    constexpr bool isFirstKillBeforeDef() const noexcept { return isFirstKill() && !isLateKill(); }
832 
833    constexpr bool operator==(Operand other) const noexcept
834    {
835       if (other.size() != size())
836          return false;
837       if (isFixed() != other.isFixed() || isKillBeforeDef() != other.isKillBeforeDef())
838          return false;
839       if (isFixed() && other.isFixed() && physReg() != other.physReg())
840          return false;
841       if (isLiteral())
842          return other.isLiteral() && other.constantValue() == constantValue();
843       else if (isConstant())
844          return other.isConstant() && other.physReg() == physReg();
845       else if (isUndefined())
846          return other.isUndefined() && other.regClass() == regClass();
847       else
848          return other.isTemp() && other.getTemp() == getTemp();
849    }
850 
851    constexpr bool operator!=(Operand other) const noexcept { return !operator==(other); }
852 
853    constexpr void set16bit(bool flag) noexcept { is16bit_ = flag; }
854
855    constexpr bool is16bit() const noexcept { return is16bit_; }
856
857    constexpr void set24bit(bool flag) noexcept { is24bit_ = flag; }
858
859    constexpr bool is24bit() const noexcept { return is24bit_; }
860 
861 private:
862    union {
863       Temp temp;
864       uint32_t i;
865       float f;
866    } data_ = {Temp(0, s1)};
867    PhysReg reg_;
868    union {
869       struct {
870          uint8_t isTemp_ : 1;
871          uint8_t isFixed_ : 1;
872          uint8_t isConstant_ : 1;
873          uint8_t isKill_ : 1;
874          uint8_t isUndef_ : 1;
875          uint8_t isFirstKill_ : 1;
876          uint8_t constSize : 2;
877          uint8_t isLateKill_ : 1;
878          uint8_t is16bit_ : 1;
879          uint8_t is24bit_ : 1;
880          uint8_t signext : 1;
881       };
882       /* can't initialize bit-fields in c++11, so work around using a union */
883       uint16_t control_ = 0;
884    };
885 };
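/* Quick reference (illustrative, mirrors the constructors above): inline
 * constants are encoded by fixing the operand to the hardware's constant
 * "registers", while anything else becomes a literal in register 255.
 *
 *    Operand::c32(0)          -> fixed to PhysReg{128}, not a literal
 *    Operand::c32(42)         -> fixed to PhysReg{170} (128 + 42)
 *    Operand::c32(0x3f800000) -> fixed to PhysReg{242} (1.0f)
 *    Operand::c32(0x12345678) -> fixed to PhysReg{255}, isLiteral() == true
 */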
886 
887 /**
888  * Definition Class
889  * Definitions are the results of Instructions
890  * and refer to temporary virtual registers
891  * which are later mapped to physical registers
892  */
893 class Definition final {
894 public:
895    constexpr Definition()
896        : temp(Temp(0, s1)), reg_(0), isFixed_(0), hasHint_(0), isKill_(0), isPrecise_(0), isNUW_(0),
897          isNoCSE_(0)
898    {}
899    Definition(uint32_t index, RegClass type) noexcept : temp(index, type) {}
900    explicit Definition(Temp tmp) noexcept : temp(tmp) {}
901    Definition(PhysReg reg, RegClass type) noexcept : temp(Temp(0, type)) { setFixed(reg); }
902    Definition(uint32_t tmpId, PhysReg reg, RegClass type) noexcept : temp(Temp(tmpId, type))
903    {
904       setFixed(reg);
905    }
906 
907    constexpr bool isTemp() const noexcept { return tempId() > 0; }
908
909    constexpr Temp getTemp() const noexcept { return temp; }
910
911    constexpr uint32_t tempId() const noexcept { return temp.id(); }
912
913    constexpr void setTemp(Temp t) noexcept { temp = t; }
914
915    void swapTemp(Definition& other) noexcept { std::swap(temp, other.temp); }
916
917    constexpr RegClass regClass() const noexcept { return temp.regClass(); }
918
919    constexpr unsigned bytes() const noexcept { return temp.bytes(); }
920
921    constexpr unsigned size() const noexcept { return temp.size(); }
922
923    constexpr bool isFixed() const noexcept { return isFixed_; }
924
925    constexpr PhysReg physReg() const noexcept { return reg_; }
926
927    constexpr void setFixed(PhysReg reg) noexcept
928    {
929       isFixed_ = 1;
930       reg_ = reg;
931    }
932 
933    constexpr void setHint(PhysReg reg) noexcept
934    {
935       hasHint_ = 1;
936       reg_ = reg;
937    }
938 
939    constexpr bool hasHint() const noexcept { return hasHint_; }
940
941    constexpr void setKill(bool flag) noexcept { isKill_ = flag; }
942
943    constexpr bool isKill() const noexcept { return isKill_; }
944
945    constexpr void setPrecise(bool precise) noexcept { isPrecise_ = precise; }
946
947    constexpr bool isPrecise() const noexcept { return isPrecise_; }
948
949    /* No Unsigned Wrap */
950    constexpr void setNUW(bool nuw) noexcept { isNUW_ = nuw; }
951
952    constexpr bool isNUW() const noexcept { return isNUW_; }
953
954    constexpr void setNoCSE(bool noCSE) noexcept { isNoCSE_ = noCSE; }
955
956    constexpr bool isNoCSE() const noexcept { return isNoCSE_; }
957 
958 private:
959    Temp temp = Temp(0, s1);
960    PhysReg reg_;
961    union {
962       struct {
963          uint8_t isFixed_ : 1;
964          uint8_t hasHint_ : 1;
965          uint8_t isKill_ : 1;
966          uint8_t isPrecise_ : 1;
967          uint8_t isNUW_ : 1;
968          uint8_t isNoCSE_ : 1;
969       };
970       /* can't initialize bit-fields in c++11, so work around using a union */
971       uint8_t control_ = 0;
972    };
973 };
974 
975 struct Block;
976 struct Instruction;
977 struct Pseudo_instruction;
978 struct SOP1_instruction;
979 struct SOP2_instruction;
980 struct SOPK_instruction;
981 struct SOPP_instruction;
982 struct SOPC_instruction;
983 struct SMEM_instruction;
984 struct DS_instruction;
985 struct MTBUF_instruction;
986 struct MUBUF_instruction;
987 struct MIMG_instruction;
988 struct Export_instruction;
989 struct FLAT_instruction;
990 struct Pseudo_branch_instruction;
991 struct Pseudo_barrier_instruction;
992 struct Pseudo_reduction_instruction;
993 struct VOP3P_instruction;
994 struct VOP1_instruction;
995 struct VOP2_instruction;
996 struct VOPC_instruction;
997 struct VOP3_instruction;
998 struct Interp_instruction;
999 struct DPP_instruction;
1000 struct SDWA_instruction;
1001 
1002 struct Instruction {
1003    aco_opcode opcode;
1004    Format format;
1005    uint32_t pass_flags;
1006 
1007    aco::span<Operand> operands;
1008    aco::span<Definition> definitions;
1009 
1010    constexpr bool usesModifiers() const noexcept;
1011 
1012    constexpr bool reads_exec() const noexcept
1013    {
1014       for (const Operand& op : operands) {
1015          if (op.isFixed() && op.physReg() == exec)
1016             return true;
1017       }
1018       return false;
1019    }
1020 
1021    Pseudo_instruction& pseudo() noexcept
1022    {
1023       assert(isPseudo());
1024       return *(Pseudo_instruction*)this;
1025    }
1026    const Pseudo_instruction& pseudo() const noexcept
1027    {
1028       assert(isPseudo());
1029       return *(Pseudo_instruction*)this;
1030    }
1031    constexpr bool isPseudo() const noexcept { return format == Format::PSEUDO; }
1032    SOP1_instruction& sop1() noexcept
1033    {
1034       assert(isSOP1());
1035       return *(SOP1_instruction*)this;
1036    }
1037    const SOP1_instruction& sop1() const noexcept
1038    {
1039       assert(isSOP1());
1040       return *(SOP1_instruction*)this;
1041    }
1042    constexpr bool isSOP1() const noexcept { return format == Format::SOP1; }
1043    SOP2_instruction& sop2() noexcept
1044    {
1045       assert(isSOP2());
1046       return *(SOP2_instruction*)this;
1047    }
1048    const SOP2_instruction& sop2() const noexcept
1049    {
1050       assert(isSOP2());
1051       return *(SOP2_instruction*)this;
1052    }
1053    constexpr bool isSOP2() const noexcept { return format == Format::SOP2; }
1054    SOPK_instruction& sopk() noexcept
1055    {
1056       assert(isSOPK());
1057       return *(SOPK_instruction*)this;
1058    }
1059    const SOPK_instruction& sopk() const noexcept
1060    {
1061       assert(isSOPK());
1062       return *(SOPK_instruction*)this;
1063    }
1064    constexpr bool isSOPK() const noexcept { return format == Format::SOPK; }
1065    SOPP_instruction& sopp() noexcept
1066    {
1067       assert(isSOPP());
1068       return *(SOPP_instruction*)this;
1069    }
1070    const SOPP_instruction& sopp() const noexcept
1071    {
1072       assert(isSOPP());
1073       return *(SOPP_instruction*)this;
1074    }
1075    constexpr bool isSOPP() const noexcept { return format == Format::SOPP; }
1076    SOPC_instruction& sopc() noexcept
1077    {
1078       assert(isSOPC());
1079       return *(SOPC_instruction*)this;
1080    }
1081    const SOPC_instruction& sopc() const noexcept
1082    {
1083       assert(isSOPC());
1084       return *(SOPC_instruction*)this;
1085    }
1086    constexpr bool isSOPC() const noexcept { return format == Format::SOPC; }
1087    SMEM_instruction& smem() noexcept
1088    {
1089       assert(isSMEM());
1090       return *(SMEM_instruction*)this;
1091    }
1092    const SMEM_instruction& smem() const noexcept
1093    {
1094       assert(isSMEM());
1095       return *(SMEM_instruction*)this;
1096    }
1097    constexpr bool isSMEM() const noexcept { return format == Format::SMEM; }
1098    DS_instruction& ds() noexcept
1099    {
1100       assert(isDS());
1101       return *(DS_instruction*)this;
1102    }
1103    const DS_instruction& ds() const noexcept
1104    {
1105       assert(isDS());
1106       return *(DS_instruction*)this;
1107    }
1108    constexpr bool isDS() const noexcept { return format == Format::DS; }
1109    MTBUF_instruction& mtbuf() noexcept
1110    {
1111       assert(isMTBUF());
1112       return *(MTBUF_instruction*)this;
1113    }
1114    const MTBUF_instruction& mtbuf() const noexcept
1115    {
1116       assert(isMTBUF());
1117       return *(MTBUF_instruction*)this;
1118    }
1119    constexpr bool isMTBUF() const noexcept { return format == Format::MTBUF; }
1120    MUBUF_instruction& mubuf() noexcept
1121    {
1122       assert(isMUBUF());
1123       return *(MUBUF_instruction*)this;
1124    }
1125    const MUBUF_instruction& mubuf() const noexcept
1126    {
1127       assert(isMUBUF());
1128       return *(MUBUF_instruction*)this;
1129    }
1130    constexpr bool isMUBUF() const noexcept { return format == Format::MUBUF; }
1131    MIMG_instruction& mimg() noexcept
1132    {
1133       assert(isMIMG());
1134       return *(MIMG_instruction*)this;
1135    }
1136    const MIMG_instruction& mimg() const noexcept
1137    {
1138       assert(isMIMG());
1139       return *(MIMG_instruction*)this;
1140    }
1141    constexpr bool isMIMG() const noexcept { return format == Format::MIMG; }
1142    Export_instruction& exp() noexcept
1143    {
1144       assert(isEXP());
1145       return *(Export_instruction*)this;
1146    }
1147    const Export_instruction& exp() const noexcept
1148    {
1149       assert(isEXP());
1150       return *(Export_instruction*)this;
1151    }
1152    constexpr bool isEXP() const noexcept { return format == Format::EXP; }
1153    FLAT_instruction& flat() noexcept
1154    {
1155       assert(isFlat());
1156       return *(FLAT_instruction*)this;
1157    }
1158    const FLAT_instruction& flat() const noexcept
1159    {
1160       assert(isFlat());
1161       return *(FLAT_instruction*)this;
1162    }
1163    constexpr bool isFlat() const noexcept { return format == Format::FLAT; }
1164    FLAT_instruction& global() noexcept
1165    {
1166       assert(isGlobal());
1167       return *(FLAT_instruction*)this;
1168    }
1169    const FLAT_instruction& global() const noexcept
1170    {
1171       assert(isGlobal());
1172       return *(FLAT_instruction*)this;
1173    }
1174    constexpr bool isGlobal() const noexcept { return format == Format::GLOBAL; }
1175    FLAT_instruction& scratch() noexcept
1176    {
1177       assert(isScratch());
1178       return *(FLAT_instruction*)this;
1179    }
1180    const FLAT_instruction& scratch() const noexcept
1181    {
1182       assert(isScratch());
1183       return *(FLAT_instruction*)this;
1184    }
1185    constexpr bool isScratch() const noexcept { return format == Format::SCRATCH; }
1186    Pseudo_branch_instruction& branch() noexcept
1187    {
1188       assert(isBranch());
1189       return *(Pseudo_branch_instruction*)this;
1190    }
1191    const Pseudo_branch_instruction& branch() const noexcept
1192    {
1193       assert(isBranch());
1194       return *(Pseudo_branch_instruction*)this;
1195    }
1196    constexpr bool isBranch() const noexcept { return format == Format::PSEUDO_BRANCH; }
1197    Pseudo_barrier_instruction& barrier() noexcept
1198    {
1199       assert(isBarrier());
1200       return *(Pseudo_barrier_instruction*)this;
1201    }
1202    const Pseudo_barrier_instruction& barrier() const noexcept
1203    {
1204       assert(isBarrier());
1205       return *(Pseudo_barrier_instruction*)this;
1206    }
1207    constexpr bool isBarrier() const noexcept { return format == Format::PSEUDO_BARRIER; }
1208    Pseudo_reduction_instruction& reduction() noexcept
1209    {
1210       assert(isReduction());
1211       return *(Pseudo_reduction_instruction*)this;
1212    }
1213    const Pseudo_reduction_instruction& reduction() const noexcept
1214    {
1215       assert(isReduction());
1216       return *(Pseudo_reduction_instruction*)this;
1217    }
1218    constexpr bool isReduction() const noexcept { return format == Format::PSEUDO_REDUCTION; }
1219    VOP3P_instruction& vop3p() noexcept
1220    {
1221       assert(isVOP3P());
1222       return *(VOP3P_instruction*)this;
1223    }
1224    const VOP3P_instruction& vop3p() const noexcept
1225    {
1226       assert(isVOP3P());
1227       return *(VOP3P_instruction*)this;
1228    }
1229    constexpr bool isVOP3P() const noexcept { return format == Format::VOP3P; }
1230    VOP1_instruction& vop1() noexcept
1231    {
1232       assert(isVOP1());
1233       return *(VOP1_instruction*)this;
1234    }
1235    const VOP1_instruction& vop1() const noexcept
1236    {
1237       assert(isVOP1());
1238       return *(VOP1_instruction*)this;
1239    }
1240    constexpr bool isVOP1() const noexcept { return (uint16_t)format & (uint16_t)Format::VOP1; }
1241    VOP2_instruction& vop2() noexcept
1242    {
1243       assert(isVOP2());
1244       return *(VOP2_instruction*)this;
1245    }
1246    const VOP2_instruction& vop2() const noexcept
1247    {
1248       assert(isVOP2());
1249       return *(VOP2_instruction*)this;
1250    }
1251    constexpr bool isVOP2() const noexcept { return (uint16_t)format & (uint16_t)Format::VOP2; }
1252    VOPC_instruction& vopc() noexcept
1253    {
1254       assert(isVOPC());
1255       return *(VOPC_instruction*)this;
1256    }
1257    const VOPC_instruction& vopc() const noexcept
1258    {
1259       assert(isVOPC());
1260       return *(VOPC_instruction*)this;
1261    }
1262    constexpr bool isVOPC() const noexcept { return (uint16_t)format & (uint16_t)Format::VOPC; }
1263    VOP3_instruction& vop3() noexcept
1264    {
1265       assert(isVOP3());
1266       return *(VOP3_instruction*)this;
1267    }
1268    const VOP3_instruction& vop3() const noexcept
1269    {
1270       assert(isVOP3());
1271       return *(VOP3_instruction*)this;
1272    }
1273    constexpr bool isVOP3() const noexcept { return (uint16_t)format & (uint16_t)Format::VOP3; }
1274    Interp_instruction& vintrp() noexcept
1275    {
1276       assert(isVINTRP());
1277       return *(Interp_instruction*)this;
1278    }
1279    const Interp_instruction& vintrp() const noexcept
1280    {
1281       assert(isVINTRP());
1282       return *(Interp_instruction*)this;
1283    }
1284    constexpr bool isVINTRP() const noexcept { return (uint16_t)format & (uint16_t)Format::VINTRP; }
1285    DPP_instruction& dpp() noexcept
1286    {
1287       assert(isDPP());
1288       return *(DPP_instruction*)this;
1289    }
1290    const DPP_instruction& dpp() const noexcept
1291    {
1292       assert(isDPP());
1293       return *(DPP_instruction*)this;
1294    }
1295    constexpr bool isDPP() const noexcept { return (uint16_t)format & (uint16_t)Format::DPP; }
1296    SDWA_instruction& sdwa() noexcept
1297    {
1298       assert(isSDWA());
1299       return *(SDWA_instruction*)this;
1300    }
1301    const SDWA_instruction& sdwa() const noexcept
1302    {
1303       assert(isSDWA());
1304       return *(SDWA_instruction*)this;
1305    }
1306    constexpr bool isSDWA() const noexcept { return (uint16_t)format & (uint16_t)Format::SDWA; }
1307 
1308    FLAT_instruction& flatlike() { return *(FLAT_instruction*)this; }
1309
1310    const FLAT_instruction& flatlike() const { return *(FLAT_instruction*)this; }
1311
1312    constexpr bool isFlatLike() const noexcept { return isFlat() || isGlobal() || isScratch(); }
1313
1314    constexpr bool isVALU() const noexcept
1315    {
1316       return isVOP1() || isVOP2() || isVOPC() || isVOP3() || isVOP3P();
1317    }
1318 
1319    constexpr bool isSALU() const noexcept
1320    {
1321       return isSOP1() || isSOP2() || isSOPC() || isSOPK() || isSOPP();
1322    }
1323 
1324    constexpr bool isVMEM() const noexcept { return isMTBUF() || isMUBUF() || isMIMG(); }
1325 };
1326 static_assert(sizeof(Instruction) == 16, "Unexpected padding");
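/* Usage note (illustrative): because the VALU format bits combine, the
 * accessors above compose as described for Format. For a VOP2 instruction
 * promoted to the VOP3 encoding, instr->isVOP2() and instr->isVOP3() are both
 * true, and instr->isVALU() covers every such case. */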
1327 
1328 struct SOPK_instruction : public Instruction {
1329    uint16_t imm;
1330    uint16_t padding;
1331 };
1332 static_assert(sizeof(SOPK_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1333 
1334 struct SOPP_instruction : public Instruction {
1335    uint32_t imm;
1336    int block;
1337 };
1338 static_assert(sizeof(SOPP_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1339 
1340 struct SOPC_instruction : public Instruction {};
1341 static_assert(sizeof(SOPC_instruction) == sizeof(Instruction) + 0, "Unexpected padding");
1342 
1343 struct SOP1_instruction : public Instruction {};
1344 static_assert(sizeof(SOP1_instruction) == sizeof(Instruction) + 0, "Unexpected padding");
1345 
1346 struct SOP2_instruction : public Instruction {};
1347 static_assert(sizeof(SOP2_instruction) == sizeof(Instruction) + 0, "Unexpected padding");
1348 
1349 /**
1350  * Scalar Memory Format:
1351  * For s_(buffer_)load_dword*:
1352  * Operand(0): SBASE - SGPR-pair which provides base address
1353  * Operand(1): Offset - immediate (un)signed offset or SGPR
1354  * Operand(2) / Definition(0): SDATA - SGPR for read / write result
1355  * Operand(n-1): SOffset - SGPR offset (Vega only)
1356  *
1357  * Having no operands is also valid for instructions such as s_dcache_inv.
1358  *
1359  */
1360 struct SMEM_instruction : public Instruction {
1361    memory_sync_info sync;
1362    bool glc : 1; /* VI+: globally coherent */
1363    bool dlc : 1; /* NAVI: device level coherent */
1364    bool nv : 1;  /* VEGA only: Non-volatile */
1365    bool disable_wqm : 1;
1366    bool prevent_overflow : 1; /* avoid overflow when combining additions */
1367    uint8_t padding : 3;
1368 };
1369 static_assert(sizeof(SMEM_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
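/* Layout sketch (hypothetical example, following the comment above) for an
 * s_load_dwordx2:
 *    operands[0]    = SBASE  (s2 base address)
 *    operands[1]    = Offset (immediate or SGPR byte offset)
 *    definitions[0] = SDATA  (s2 result)
 */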
1370 
1371 struct VOP1_instruction : public Instruction {};
1372 static_assert(sizeof(VOP1_instruction) == sizeof(Instruction) + 0, "Unexpected padding");
1373 
1374 struct VOP2_instruction : public Instruction {};
1375 static_assert(sizeof(VOP2_instruction) == sizeof(Instruction) + 0, "Unexpected padding");
1376 
1377 struct VOPC_instruction : public Instruction {};
1378 static_assert(sizeof(VOPC_instruction) == sizeof(Instruction) + 0, "Unexpected padding");
1379 
1380 struct VOP3_instruction : public Instruction {
1381    bool abs[3];
1382    bool neg[3];
1383    uint8_t opsel : 4;
1384    uint8_t omod : 2;
1385    bool clamp : 1;
1386    uint8_t padding0 : 1;
1387    uint8_t padding1;
1388 };
1389 static_assert(sizeof(VOP3_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1390 
1391 struct VOP3P_instruction : public Instruction {
1392    bool neg_lo[3];
1393    bool neg_hi[3];
1394    uint8_t opsel_lo : 3;
1395    uint8_t opsel_hi : 3;
1396    bool clamp : 1;
1397    uint8_t padding0 : 1;
1398    uint8_t padding1;
1399 };
1400 static_assert(sizeof(VOP3P_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1401 
1402 /**
1403  * Data Parallel Primitives Format:
1404  * This format can be used for VOP1, VOP2 or VOPC instructions.
1405  * The swizzle applies to the src0 operand.
1406  *
1407  */
1408 struct DPP_instruction : public Instruction {
1409    bool abs[2];
1410    bool neg[2];
1411    uint16_t dpp_ctrl;
1412    uint8_t row_mask : 4;
1413    uint8_t bank_mask : 4;
1414    bool bound_ctrl : 1;
1415    uint8_t padding : 7;
1416 };
1417 static_assert(sizeof(DPP_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1418 
1419 struct SubdwordSel {
1420    enum sdwa_sel : uint8_t {
1421       ubyte = 0x4,
1422       uword = 0x8,
1423       dword = 0x10,
1424       sext = 0x20,
1425       sbyte = ubyte | sext,
1426       sword = uword | sext,
1427 
1428       ubyte0 = ubyte,
1429       ubyte1 = ubyte | 1,
1430       ubyte2 = ubyte | 2,
1431       ubyte3 = ubyte | 3,
1432       sbyte0 = sbyte,
1433       sbyte1 = sbyte | 1,
1434       sbyte2 = sbyte | 2,
1435       sbyte3 = sbyte | 3,
1436       uword0 = uword,
1437       uword1 = uword | 2,
1438       sword0 = sword,
1439       sword1 = sword | 2,
1440    };
1441 
1442    SubdwordSel() : sel((sdwa_sel)0) {}
1443    constexpr SubdwordSel(sdwa_sel sel_) : sel(sel_) {}
1444    constexpr SubdwordSel(unsigned size, unsigned offset, bool sign_extend)
1445        : sel((sdwa_sel)((sign_extend ? sext : 0) | size << 2 | offset))
1446    {}
1447    constexpr operator sdwa_sel() const { return sel; }
1448    explicit operator bool() const { return sel != 0; }
1449 
1450    constexpr unsigned size() const { return (sel >> 2) & 0x7; }
1451    constexpr unsigned offset() const { return sel & 0x3; }
1452    constexpr bool sign_extend() const { return sel & sext; }
1453    constexpr unsigned to_sdwa_sel(unsigned reg_byte_offset) const
1454    {
1455       reg_byte_offset += offset();
1456       if (size() == 1)
1457          return reg_byte_offset;
1458       else if (size() == 2)
1459          return 4 + (reg_byte_offset >> 1);
1460       else
1461          return 6;
1462    }
1463 
1464 private:
1465    sdwa_sel sel;
1466 };
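/* Compile-time sketch (added for illustration, not part of the original
 * interface): the (size, offset, sign_extend) constructor produces the same
 * encoding as the named selectors above. */
static_assert(SubdwordSel(2, 2, false) == SubdwordSel::uword1, "upper word, zero-extended");
static_assert(SubdwordSel(1, 3, true) == SubdwordSel::sbyte3, "byte 3, sign-extended");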
1467 
/**
 * Sub-Dword Addressing Format:
 * This format can be used for VOP1, VOP2 or VOPC instructions.
 *
 * omod and SGPR/constant operands are only available on GFX9+. For VOPC,
 * the definition doesn't have to be VCC on GFX9+.
 *
 */
struct SDWA_instruction : public Instruction {
   /* these destination modifiers aren't available with VOPC except for
    * clamp on GFX8 */
   SubdwordSel sel[2];
   SubdwordSel dst_sel;
   bool neg[2];
   bool abs[2];
   bool clamp : 1;
   uint8_t omod : 2; /* GFX9+ */
   uint8_t padding : 5;
};
static_assert(sizeof(SDWA_instruction) == sizeof(Instruction) + 8, "Unexpected padding");

struct Interp_instruction : public Instruction {
   uint8_t attribute;
   uint8_t component;
   uint16_t padding;
};
static_assert(sizeof(Interp_instruction) == sizeof(Instruction) + 4, "Unexpected padding");

/**
 * Local and Global Data Sharing instructions
 * Operand(0): ADDR - VGPR which supplies the address.
 * Operand(1): DATA0 - First data VGPR.
 * Operand(2): DATA1 - Second data VGPR.
 * Operand(n-1): M0 - LDS size.
 * Definition(0): VDST - Destination VGPR when results returned to VGPRs.
 *
 */
struct DS_instruction : public Instruction {
   memory_sync_info sync;
   bool gds;
   int16_t offset0;
   int8_t offset1;
   uint8_t padding;
};
static_assert(sizeof(DS_instruction) == sizeof(Instruction) + 8, "Unexpected padding");

/**
 * Vector Memory Untyped-buffer Instructions
 * Operand(0): SRSRC - Specifies which SGPR supplies T# (resource constant)
 * Operand(1): VADDR - Address source. Can carry an index and/or offset
 * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant)
 * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data
 *
 */
struct MUBUF_instruction : public Instruction {
   memory_sync_info sync;
   bool offen : 1;           /* Supply an offset from VGPR (VADDR) */
   bool idxen : 1;           /* Supply an index from VGPR (VADDR) */
   bool addr64 : 1;          /* SI, CIK: Address size is 64-bit */
   bool glc : 1;             /* globally coherent */
   bool dlc : 1;             /* NAVI: device level coherent */
   bool slc : 1;             /* system level coherent */
   bool tfe : 1;             /* texture fail enable */
   bool lds : 1;             /* Return read-data to LDS instead of VGPRs */
   uint16_t disable_wqm : 1; /* Require an exec mask without helper invocations */
   uint16_t offset : 12;     /* Unsigned byte offset - 12 bit */
   uint16_t swizzled : 1;
   uint16_t padding0 : 2;
   uint16_t vtx_binding : 6; /* 0 if this is not a vertex attribute load */
   uint16_t padding1 : 10;
};
static_assert(sizeof(MUBUF_instruction) == sizeof(Instruction) + 8, "Unexpected padding");

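/* Illustrative sketch of the operand layout documented above, built with the
 * create_instruction<> helper declared further below (the opcode and the
 * temporaries here are examples only, not taken from real selection code):
 *
 *    aco_ptr<MUBUF_instruction> load{create_instruction<MUBUF_instruction>(
 *       aco_opcode::buffer_load_dword, Format::MUBUF, 3, 1)};
 *    load->operands[0] = Operand(rsrc);      // SRSRC: buffer resource constant (s4)
 *    load->operands[1] = Operand(vaddr);     // VADDR: index and/or offset (v1)
 *    load->operands[2] = Operand(soffset);   // SOFFSET: scalar byte offset
 *    load->definitions[0] = Definition(dst); // VDATA: loaded value
 *    load->offen = true;                     // offset comes from VADDR
 */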
/**
 * Vector Memory Typed-buffer Instructions
 * Operand(0): SRSRC - Specifies which SGPR supplies T# (resource constant)
 * Operand(1): VADDR - Address source. Can carry an index and/or offset
 * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant)
 * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data
 *
 */
struct MTBUF_instruction : public Instruction {
   memory_sync_info sync;
   uint8_t dfmt : 4;         /* Data Format of data in memory buffer */
   uint8_t nfmt : 3;         /* Numeric format of data in memory */
   bool offen : 1;           /* Supply an offset from VGPR (VADDR) */
   uint16_t idxen : 1;       /* Supply an index from VGPR (VADDR) */
   uint16_t glc : 1;         /* globally coherent */
   uint16_t dlc : 1;         /* NAVI: device level coherent */
   uint16_t slc : 1;         /* system level coherent */
   uint16_t tfe : 1;         /* texture fail enable */
   uint16_t disable_wqm : 1; /* Require an exec mask without helper invocations */
   uint16_t vtx_binding : 6; /* 0 if this is not a vertex attribute load */
   uint16_t padding : 4;
   uint16_t offset; /* Unsigned byte offset - 12 bit */
};
static_assert(sizeof(MTBUF_instruction) == sizeof(Instruction) + 8, "Unexpected padding");

/**
 * Vector Memory Image Instructions
 * Operand(0): SRSRC - Scalar GPR that specifies the resource constant.
 * Operand(1): SSAMP - Scalar GPR that specifies sampler constant.
 * Operand(2): VDATA - Vector GPR for write data or zero if TFE/LWE=1.
 * Operand(3): VADDR - Address source. Can carry an offset or an index.
 * Definition(0): VDATA - Vector GPR for read result.
 *
 */
struct MIMG_instruction : public Instruction {
   memory_sync_info sync;
   uint8_t dmask;        /* Data VGPR enable mask */
   uint8_t dim : 3;      /* NAVI: dimensionality */
   bool unrm : 1;        /* Force address to be un-normalized */
   bool dlc : 1;         /* NAVI: device level coherent */
   bool glc : 1;         /* globally coherent */
   bool slc : 1;         /* system level coherent */
   bool tfe : 1;         /* texture fail enable */
   bool da : 1;          /* declare an array */
   bool lwe : 1;         /* LOD warning enable */
   bool r128 : 1;        /* NAVI: Texture resource size */
   bool a16 : 1;         /* VEGA, NAVI: Address components are 16-bits */
   bool d16 : 1;         /* Convert 32-bit data to 16-bit data */
   bool disable_wqm : 1; /* Require an exec mask without helper invocations */
   uint8_t padding0 : 2;
   uint8_t padding1;
   uint8_t padding2;
};
static_assert(sizeof(MIMG_instruction) == sizeof(Instruction) + 8, "Unexpected padding");

/**
 * Flat/Scratch/Global Instructions
 * Operand(0): ADDR
 * Operand(1): SADDR
 * Operand(2) / Definition(0): DATA/VDST
 *
 */
struct FLAT_instruction : public Instruction {
   memory_sync_info sync;
   bool slc : 1; /* system level coherent */
   bool glc : 1; /* globally coherent */
   bool dlc : 1; /* NAVI: device level coherent */
   bool lds : 1;
   bool nv : 1;
   bool disable_wqm : 1; /* Require an exec mask without helper invocations */
   uint8_t padding0 : 2;
   uint16_t offset; /* Vega/Navi only */
   uint16_t padding1;
};
static_assert(sizeof(FLAT_instruction) == sizeof(Instruction) + 8, "Unexpected padding");

struct Export_instruction : public Instruction {
   uint8_t enabled_mask;
   uint8_t dest;
   bool compressed : 1;
   bool done : 1;
   bool valid_mask : 1;
   uint8_t padding0 : 5;
   uint8_t padding1;
};
static_assert(sizeof(Export_instruction) == sizeof(Instruction) + 4, "Unexpected padding");

struct Pseudo_instruction : public Instruction {
   PhysReg scratch_sgpr; /* might not be valid if it's not needed */
   bool tmp_in_scc;
   uint8_t padding;
};
static_assert(sizeof(Pseudo_instruction) == sizeof(Instruction) + 4, "Unexpected padding");

struct Pseudo_branch_instruction : public Instruction {
   /* target[0] is the block index of the branch target.
    * For conditional branches, target[1] contains the fall-through alternative.
    * A value of 0 means the target has not been initialized (BB0 cannot be a branch target).
    */
   uint32_t target[2];
};
static_assert(sizeof(Pseudo_branch_instruction) == sizeof(Instruction) + 8, "Unexpected padding");

struct Pseudo_barrier_instruction : public Instruction {
   memory_sync_info sync;
   sync_scope exec_scope;
};
static_assert(sizeof(Pseudo_barrier_instruction) == sizeof(Instruction) + 4, "Unexpected padding");

enum ReduceOp : uint16_t {
   // clang-format off
   iadd8, iadd16, iadd32, iadd64,
   imul8, imul16, imul32, imul64,
          fadd16, fadd32, fadd64,
          fmul16, fmul32, fmul64,
   imin8, imin16, imin32, imin64,
   imax8, imax16, imax32, imax64,
   umin8, umin16, umin32, umin64,
   umax8, umax16, umax32, umax64,
          fmin16, fmin32, fmin64,
          fmax16, fmax32, fmax64,
   iand8, iand16, iand32, iand64,
   ior8, ior16, ior32, ior64,
   ixor8, ixor16, ixor32, ixor64,
   num_reduce_ops,
   // clang-format on
};

/**
 * Subgroup Reduction Instructions, everything except for the data to be
 * reduced and the result as inserted by setup_reduce_temp().
 * Operand(0): data to be reduced
 * Operand(1): reduce temporary
 * Operand(2): vector temporary
 * Definition(0): result
 * Definition(1): scalar temporary
 * Definition(2): scalar identity temporary (not used to store identity on GFX10)
 * Definition(3): scc clobber
 * Definition(4): vcc clobber
 *
 */
struct Pseudo_reduction_instruction : public Instruction {
   ReduceOp reduce_op;
   uint16_t cluster_size; // must be 0 for scans
};
static_assert(sizeof(Pseudo_reduction_instruction) == sizeof(Instruction) + 4,
              "Unexpected padding");

struct instr_deleter_functor {
   void operator()(void* p) { free(p); }
};

template <typename T> using aco_ptr = std::unique_ptr<T, instr_deleter_functor>;

template <typename T>
T*
create_instruction(aco_opcode opcode, Format format, uint32_t num_operands,
                   uint32_t num_definitions)
{
   std::size_t size =
      sizeof(T) + num_operands * sizeof(Operand) + num_definitions * sizeof(Definition);
   char* data = (char*)calloc(1, size);
   T* inst = (T*)data;

   inst->opcode = opcode;
   inst->format = format;

   uint16_t operands_offset = data + sizeof(T) - (char*)&inst->operands;
   inst->operands = aco::span<Operand>(operands_offset, num_operands);
   uint16_t definitions_offset = (char*)inst->operands.end() - (char*)&inst->definitions;
   inst->definitions = aco::span<Definition>(definitions_offset, num_definitions);

   return inst;
}

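/* The Operand/Definition arrays live in the same calloc'd allocation, directly
 * after the instruction struct itself; the spans store 16-bit byte offsets
 * relative to their own address instead of raw pointers. A minimal usage
 * sketch (the opcode and the temporaries are examples only):
 *
 *    aco_ptr<SOP2_instruction> add{create_instruction<SOP2_instruction>(
 *       aco_opcode::s_add_u32, Format::SOP2, 2, 2)};
 *    add->operands[0] = Operand(src0);
 *    add->operands[1] = Operand(src1);
 *    add->definitions[0] = Definition(dst);
 *    add->definitions[1] = Definition(scc, s1); // SCC clobber
 */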
constexpr bool
Instruction::usesModifiers() const noexcept
{
   if (isDPP() || isSDWA())
      return true;

   if (isVOP3P()) {
      const VOP3P_instruction& vop3p = this->vop3p();
      for (unsigned i = 0; i < operands.size(); i++) {
         if (vop3p.neg_lo[i] || vop3p.neg_hi[i])
            return true;

         /* opsel_hi must be 1 to not be considered a modifier - even for constants */
         if (!(vop3p.opsel_hi & (1 << i)))
            return true;
      }
      return vop3p.opsel_lo || vop3p.clamp;
   } else if (isVOP3()) {
      const VOP3_instruction& vop3 = this->vop3();
      for (unsigned i = 0; i < operands.size(); i++) {
         if (vop3.abs[i] || vop3.neg[i])
            return true;
      }
      return vop3.opsel || vop3.clamp || vop3.omod;
   }
   return false;
}

constexpr bool
is_phi(Instruction* instr)
{
   return instr->opcode == aco_opcode::p_phi || instr->opcode == aco_opcode::p_linear_phi;
}

static inline bool
is_phi(aco_ptr<Instruction>& instr)
{
   return is_phi(instr.get());
}

memory_sync_info get_sync_info(const Instruction* instr);

bool is_dead(const std::vector<uint16_t>& uses, Instruction* instr);

bool can_use_opsel(chip_class chip, aco_opcode op, int idx, bool high);
bool instr_is_16bit(chip_class chip, aco_opcode op);
bool can_use_SDWA(chip_class chip, const aco_ptr<Instruction>& instr, bool pre_ra);
bool can_use_DPP(const aco_ptr<Instruction>& instr, bool pre_ra);
/* updates "instr" and returns the old instruction (or NULL if no update was needed) */
aco_ptr<Instruction> convert_to_SDWA(chip_class chip, aco_ptr<Instruction>& instr);
aco_ptr<Instruction> convert_to_DPP(aco_ptr<Instruction>& instr);
bool needs_exec_mask(const Instruction* instr);

aco_opcode get_ordered(aco_opcode op);
aco_opcode get_unordered(aco_opcode op);
aco_opcode get_inverse(aco_opcode op);
aco_opcode get_f32_cmp(aco_opcode op);
unsigned get_cmp_bitsize(aco_opcode op);
bool is_cmp(aco_opcode op);

bool can_swap_operands(aco_ptr<Instruction>& instr, aco_opcode* new_op);

uint32_t get_reduction_identity(ReduceOp op, unsigned idx);

unsigned get_mimg_nsa_dwords(const Instruction* instr);

bool should_form_clause(const Instruction* a, const Instruction* b);

enum block_kind {
   /* uniform indicates that, when leaving this block,
    * all active lanes stay active */
   block_kind_uniform = 1 << 0,
   block_kind_top_level = 1 << 1,
   block_kind_loop_preheader = 1 << 2,
   block_kind_loop_header = 1 << 3,
   block_kind_loop_exit = 1 << 4,
   block_kind_continue = 1 << 5,
   block_kind_break = 1 << 6,
   block_kind_continue_or_break = 1 << 7,
   block_kind_discard = 1 << 8,
   block_kind_branch = 1 << 9,
   block_kind_merge = 1 << 10,
   block_kind_invert = 1 << 11,
   block_kind_uses_discard_if = 1 << 12,
   block_kind_needs_lowering = 1 << 13,
   block_kind_uses_demote = 1 << 14,
   block_kind_export_end = 1 << 15,
};

struct RegisterDemand {
   constexpr RegisterDemand() = default;
   constexpr RegisterDemand(const int16_t v, const int16_t s) noexcept : vgpr{v}, sgpr{s} {}
   int16_t vgpr = 0;
   int16_t sgpr = 0;

   constexpr friend bool operator==(const RegisterDemand a, const RegisterDemand b) noexcept
   {
      return a.vgpr == b.vgpr && a.sgpr == b.sgpr;
   }

   constexpr bool exceeds(const RegisterDemand other) const noexcept
   {
      return vgpr > other.vgpr || sgpr > other.sgpr;
   }

   constexpr RegisterDemand operator+(const Temp t) const noexcept
   {
      if (t.type() == RegType::sgpr)
         return RegisterDemand(vgpr, sgpr + t.size());
      else
         return RegisterDemand(vgpr + t.size(), sgpr);
   }

   constexpr RegisterDemand operator+(const RegisterDemand other) const noexcept
   {
      return RegisterDemand(vgpr + other.vgpr, sgpr + other.sgpr);
   }

   constexpr RegisterDemand operator-(const RegisterDemand other) const noexcept
   {
      return RegisterDemand(vgpr - other.vgpr, sgpr - other.sgpr);
   }

   constexpr RegisterDemand& operator+=(const RegisterDemand other) noexcept
   {
      vgpr += other.vgpr;
      sgpr += other.sgpr;
      return *this;
   }

   constexpr RegisterDemand& operator-=(const RegisterDemand other) noexcept
   {
      vgpr -= other.vgpr;
      sgpr -= other.sgpr;
      return *this;
   }

   constexpr RegisterDemand& operator+=(const Temp t) noexcept
   {
      if (t.type() == RegType::sgpr)
         sgpr += t.size();
      else
         vgpr += t.size();
      return *this;
   }

   constexpr RegisterDemand& operator-=(const Temp t) noexcept
   {
      if (t.type() == RegType::sgpr)
         sgpr -= t.size();
      else
         vgpr -= t.size();
      return *this;
   }

   constexpr void update(const RegisterDemand other) noexcept
   {
      vgpr = std::max(vgpr, other.vgpr);
      sgpr = std::max(sgpr, other.sgpr);
   }
};

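/* Illustrative sketch of how liveness code accumulates demand (the real
 * computation lives in live_var_analysis(); live_temps is a stand-in name):
 *
 *    RegisterDemand demand;
 *    for (Temp t : live_temps)
 *       demand += t;              // adds t.size() to the vgpr or sgpr count
 *    block_demand.update(demand); // component-wise maximum
 */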
/* CFG */
struct Block {
   float_mode fp_mode;
   unsigned index;
   unsigned offset = 0;
   std::vector<aco_ptr<Instruction>> instructions;
   std::vector<unsigned> logical_preds;
   std::vector<unsigned> linear_preds;
   std::vector<unsigned> logical_succs;
   std::vector<unsigned> linear_succs;
   RegisterDemand register_demand = RegisterDemand();
   uint16_t loop_nest_depth = 0;
   uint16_t divergent_if_logical_depth = 0;
   uint16_t uniform_if_depth = 0;
   uint16_t kind = 0;
   int logical_idom = -1;
   int linear_idom = -1;

   /* this information is needed for predecessors to blocks with phis when
    * moving out of ssa */
   bool scc_live_out = false;
   PhysReg scratch_sgpr = PhysReg(); /* only needs to be valid if scc_live_out != false */

   Block() : index(0) {}
};

/*
 * Shader stages as provided in Vulkan by the application. Contrast this to HWStage.
 */
enum class SWStage : uint8_t {
   None = 0,
   VS = 1 << 0,     /* Vertex Shader */
   GS = 1 << 1,     /* Geometry Shader */
   TCS = 1 << 2,    /* Tessellation Control aka Hull Shader */
   TES = 1 << 3,    /* Tessellation Evaluation aka Domain Shader */
   FS = 1 << 4,     /* Fragment aka Pixel Shader */
   CS = 1 << 5,     /* Compute Shader */
   GSCopy = 1 << 6, /* GS Copy Shader (internal) */

   /* Stage combinations merged to run on a single HWStage */
   VS_GS = VS | GS,
   VS_TCS = VS | TCS,
   TES_GS = TES | GS,
};

constexpr SWStage
operator|(SWStage a, SWStage b)
{
   return static_cast<SWStage>(static_cast<uint8_t>(a) | static_cast<uint8_t>(b));
}

/*
 * Shader stages as running on the AMD GPU.
 *
 * The relation between HWStages and SWStages is not a one-to-one mapping:
 * Some SWStages are merged by ACO to run on a single HWStage.
 * See README.md for details.
 */
enum class HWStage : uint8_t {
   VS,
   ES, /* Export shader: pre-GS (VS or TES) on GFX6-8. Combined into GS on GFX9 (and GFX10/legacy). */
   GS,  /* Geometry shader on GFX10/legacy and GFX6-9. */
   NGG, /* Primitive shader, used to implement VS, TES, GS. */
   LS,  /* Local shader: pre-TCS (VS) on GFX6-8. Combined into HS on GFX9 (and GFX10/legacy). */
   HS,  /* Hull shader: TCS on GFX6-8. Merged VS and TCS on GFX9-10. */
   FS,
   CS,
};

/*
 * Set of SWStages to be merged into a single shader paired with the
 * HWStage it will run on.
 */
struct Stage {
   constexpr Stage() = default;

   explicit constexpr Stage(HWStage hw_, SWStage sw_) : sw(sw_), hw(hw_) {}

   /* Check if the given SWStage is included */
   constexpr bool has(SWStage stage) const
   {
      return (static_cast<uint8_t>(sw) & static_cast<uint8_t>(stage));
   }

   unsigned num_sw_stages() const { return util_bitcount(static_cast<uint8_t>(sw)); }

   constexpr bool operator==(const Stage& other) const { return sw == other.sw && hw == other.hw; }

   constexpr bool operator!=(const Stage& other) const { return sw != other.sw || hw != other.hw; }

   /* Mask of merged software stages */
   SWStage sw = SWStage::None;

   /* Active hardware stage */
   HWStage hw{};
};

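/* Illustrative usage: instruction selection checks which SWStages were merged
 * into the current Stage before emitting stage-specific code, e.g.
 *
 *    Stage stage(HWStage::GS, SWStage::VS_GS);
 *    assert(stage.has(SWStage::GS) && stage.has(SWStage::VS));
 *    assert(stage.num_sw_stages() == 2);
 */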
/* possible settings of Program::stage */
static constexpr Stage vertex_vs(HWStage::VS, SWStage::VS);
static constexpr Stage fragment_fs(HWStage::FS, SWStage::FS);
static constexpr Stage compute_cs(HWStage::CS, SWStage::CS);
static constexpr Stage tess_eval_vs(HWStage::VS, SWStage::TES);
static constexpr Stage gs_copy_vs(HWStage::VS, SWStage::GSCopy);
/* GFX10/NGG */
static constexpr Stage vertex_ngg(HWStage::NGG, SWStage::VS);
static constexpr Stage vertex_geometry_ngg(HWStage::NGG, SWStage::VS_GS);
static constexpr Stage tess_eval_ngg(HWStage::NGG, SWStage::TES);
static constexpr Stage tess_eval_geometry_ngg(HWStage::NGG, SWStage::TES_GS);
/* GFX9 (and GFX10 if NGG isn't used) */
static constexpr Stage vertex_geometry_gs(HWStage::GS, SWStage::VS_GS);
static constexpr Stage vertex_tess_control_hs(HWStage::HS, SWStage::VS_TCS);
static constexpr Stage tess_eval_geometry_gs(HWStage::GS, SWStage::TES_GS);
/* pre-GFX9 */
static constexpr Stage vertex_ls(HWStage::LS, SWStage::VS); /* vertex before tessellation control */
static constexpr Stage vertex_es(HWStage::ES, SWStage::VS); /* vertex before geometry */
static constexpr Stage tess_control_hs(HWStage::HS, SWStage::TCS);
static constexpr Stage tess_eval_es(HWStage::ES,
                                    SWStage::TES); /* tessellation evaluation before geometry */
static constexpr Stage geometry_gs(HWStage::GS, SWStage::GS);

enum statistic {
   statistic_hash,
   statistic_instructions,
   statistic_copies,
   statistic_branches,
   statistic_latency,
   statistic_inv_throughput,
   statistic_vmem_clauses,
   statistic_smem_clauses,
   statistic_sgpr_presched,
   statistic_vgpr_presched,
   num_statistics
};

struct DeviceInfo {
   uint16_t lds_encoding_granule;
   uint16_t lds_alloc_granule;
   uint32_t lds_limit; /* in bytes */
   bool has_16bank_lds;
   uint16_t physical_sgprs;
   uint16_t physical_vgprs;
   uint16_t vgpr_limit;
   uint16_t sgpr_limit;
   uint16_t sgpr_alloc_granule;
   uint16_t vgpr_alloc_granule; /* must be power of two */
   unsigned max_wave64_per_simd;
   unsigned simd_per_cu;
   bool has_fast_fma32 = false;
   bool xnack_enabled = false;
   bool sram_ecc_enabled = false;
};

enum class CompilationProgress {
   after_isel,
   after_spilling,
   after_ra,
};

class Program final {
public:
   std::vector<Block> blocks;
   std::vector<RegClass> temp_rc = {s1};
   RegisterDemand max_reg_demand = RegisterDemand();
   uint16_t num_waves = 0;
   uint16_t max_waves = 0; /* maximum number of waves, regardless of register usage */
   ac_shader_config* config;
   const struct radv_shader_info* info;
   enum chip_class chip_class;
   enum radeon_family family;
   DeviceInfo dev;
   unsigned wave_size;
   RegClass lane_mask;
   Stage stage;
   bool needs_exact = false; /* there exists an instruction with disable_wqm = true */
   bool needs_wqm = false;   /* there exists a p_wqm instruction */

   std::vector<uint8_t> constant_data;
   Temp private_segment_buffer;
   Temp scratch_offset;

   uint16_t min_waves = 0;
   unsigned workgroup_size; /* if known; otherwise UINT_MAX */
   bool wgp_mode;
   bool early_rast = false; /* whether rasterization can start as soon as the 1st DONE pos export */

   bool needs_vcc = false;
   bool needs_flat_scr = false;

   CompilationProgress progress;

   bool collect_statistics = false;
   uint32_t statistics[num_statistics];

   float_mode next_fp_mode;
   unsigned next_loop_depth = 0;
   unsigned next_divergent_if_logical_depth = 0;
   unsigned next_uniform_if_depth = 0;

   std::vector<Definition> vs_inputs;

   struct {
      FILE* output = stderr;
      bool shorten_messages = false;
      void (*func)(void* private_data, enum radv_compiler_debug_level level, const char* message);
      void* private_data;
   } debug;

   uint32_t allocateId(RegClass rc)
   {
      assert(allocationID <= 16777215);
      temp_rc.push_back(rc);
      return allocationID++;
   }

   void allocateRange(unsigned amount)
   {
      assert(allocationID + amount <= 16777216);
      temp_rc.resize(temp_rc.size() + amount);
      allocationID += amount;
   }

   Temp allocateTmp(RegClass rc) { return Temp(allocateId(rc), rc); }

   uint32_t peekAllocationId() { return allocationID; }

   friend void reindex_ssa(Program* program);
   friend void reindex_ssa(Program* program, std::vector<IDSet>& live_out);

   Block* create_and_insert_block()
   {
      Block block;
      return insert_block(std::move(block));
   }

   Block* insert_block(Block&& block)
   {
      block.index = blocks.size();
      block.fp_mode = next_fp_mode;
      block.loop_nest_depth = next_loop_depth;
      block.divergent_if_logical_depth = next_divergent_if_logical_depth;
      block.uniform_if_depth = next_uniform_if_depth;
      blocks.emplace_back(std::move(block));
      return &blocks.back();
   }

private:
   uint32_t allocationID = 1;
};
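
/* Illustrative sketch of how passes obtain fresh temporaries and blocks from a
 * Program (the local names are examples only):
 *
 *    Block* block = program->create_and_insert_block();
 *    Temp tmp = program->allocateTmp(v1); // new SSA id with register class v1
 *    assert(tmp.id() < program->peekAllocationId());
 *    block->instructions.emplace_back(...); // append aco_ptr<Instruction>s
 */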

struct live {
   /* live temps out per block */
   std::vector<IDSet> live_out;
   /* register demand (sgpr/vgpr) per instruction per block */
   std::vector<std::vector<RegisterDemand>> register_demand;
};

struct ra_test_policy {
   /* Force RA to always use its pessimistic fallback algorithm */
   bool skip_optimistic_path = false;
};

void init();

void init_program(Program* program, Stage stage, const struct radv_shader_info* info,
                  enum chip_class chip_class, enum radeon_family family, bool wgp_mode,
                  ac_shader_config* config);

void select_program(Program* program, unsigned shader_count, struct nir_shader* const* shaders,
                    ac_shader_config* config, const struct radv_shader_args* args);
void select_gs_copy_shader(Program* program, struct nir_shader* gs_shader, ac_shader_config* config,
                           const struct radv_shader_args* args);
void select_trap_handler_shader(Program* program, struct nir_shader* shader,
                                ac_shader_config* config, const struct radv_shader_args* args);
void select_vs_prolog(Program* program, const struct radv_vs_prolog_key* key,
                      ac_shader_config* config, const struct radv_shader_args* args,
                      unsigned* num_preserved_sgprs);

void lower_phis(Program* program);
void calc_min_waves(Program* program);
void update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand);
live live_var_analysis(Program* program);
std::vector<uint16_t> dead_code_analysis(Program* program);
void dominator_tree(Program* program);
void insert_exec_mask(Program* program);
void value_numbering(Program* program);
void optimize(Program* program);
void optimize_postRA(Program* program);
void setup_reduce_temp(Program* program);
void lower_to_cssa(Program* program, live& live_vars);
void register_allocation(Program* program, std::vector<IDSet>& live_out_per_block,
                         ra_test_policy = {});
void ssa_elimination(Program* program);
void lower_to_hw_instr(Program* program);
void schedule_program(Program* program, live& live_vars);
void spill(Program* program, live& live_vars);
void insert_wait_states(Program* program);
void insert_NOPs(Program* program);
void form_hard_clauses(Program* program);
unsigned emit_program(Program* program, std::vector<uint32_t>& code);
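
/* Rough sketch of how a driver strings these passes together. This is
 * simplified and not authoritative; the actual ordering and the optional
 * passes are chosen by the caller (e.g. the RADV/ACO interface code):
 *
 *    init_program(&program, stage, info, chip_class, family, wgp_mode, config);
 *    select_program(&program, shader_count, shaders, config, args);
 *    // ... value numbering, optimization, exec mask insertion, scheduling ...
 *    live live_vars = live_var_analysis(&program);
 *    spill(&program, live_vars);
 *    register_allocation(&program, live_vars.live_out);
 *    ssa_elimination(&program);
 *    lower_to_hw_instr(&program);
 *    insert_wait_states(&program);
 *    insert_NOPs(&program);
 *    std::vector<uint32_t> code;
 *    unsigned exec_size = emit_program(&program, code);
 */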
/**
 * Returns true if print_asm can disassemble the given program for the current build/runtime
 * configuration
 */
bool check_print_asm_support(Program* program);
bool print_asm(Program* program, std::vector<uint32_t>& binary, unsigned exec_size, FILE* output);
bool validate_ir(Program* program);
bool validate_ra(Program* program);
#ifndef NDEBUG
void perfwarn(Program* program, bool cond, const char* msg, Instruction* instr = NULL);
#else
#define perfwarn(program, cond, msg, ...)                                                          \
   do {                                                                                            \
   } while (0)
#endif

void collect_presched_stats(Program* program);
void collect_preasm_stats(Program* program);
void collect_postasm_stats(Program* program, const std::vector<uint32_t>& code);

enum print_flags {
   print_no_ssa = 0x1,
   print_perf_info = 0x2,
   print_kill = 0x4,
   print_live_vars = 0x8,
};

void aco_print_operand(const Operand* operand, FILE* output, unsigned flags = 0);
void aco_print_instr(const Instruction* instr, FILE* output, unsigned flags = 0);
void aco_print_program(const Program* program, FILE* output, unsigned flags = 0);
void aco_print_program(const Program* program, FILE* output, const live& live_vars,
                       unsigned flags = 0);

void _aco_perfwarn(Program* program, const char* file, unsigned line, const char* fmt, ...);
void _aco_err(Program* program, const char* file, unsigned line, const char* fmt, ...);

#define aco_perfwarn(program, ...) _aco_perfwarn(program, __FILE__, __LINE__, __VA_ARGS__)
#define aco_err(program, ...)      _aco_err(program, __FILE__, __LINE__, __VA_ARGS__)

/* utilities for dealing with register demand */
RegisterDemand get_live_changes(aco_ptr<Instruction>& instr);
RegisterDemand get_temp_registers(aco_ptr<Instruction>& instr);
RegisterDemand get_demand_before(RegisterDemand demand, aco_ptr<Instruction>& instr,
                                 aco_ptr<Instruction>& instr_before);

/* number of sgprs that need to be allocated but might not be addressable as s0-s105 */
uint16_t get_extra_sgprs(Program* program);

/* get the number of sgprs/vgprs that must be allocated to address a given number of sgprs/vgprs */
uint16_t get_sgpr_alloc(Program* program, uint16_t addressable_sgprs);
uint16_t get_vgpr_alloc(Program* program, uint16_t addressable_vgprs);

/* return number of addressable sgprs/vgprs for max_waves */
uint16_t get_addr_sgpr_from_waves(Program* program, uint16_t max_waves);
uint16_t get_addr_vgpr_from_waves(Program* program, uint16_t max_waves);

typedef struct {
   const int16_t opcode_gfx7[static_cast<int>(aco_opcode::num_opcodes)];
   const int16_t opcode_gfx9[static_cast<int>(aco_opcode::num_opcodes)];
   const int16_t opcode_gfx10[static_cast<int>(aco_opcode::num_opcodes)];
   const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> can_use_input_modifiers;
   const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> can_use_output_modifiers;
   const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> is_atomic;
   const char* name[static_cast<int>(aco_opcode::num_opcodes)];
   const aco::Format format[static_cast<int>(aco_opcode::num_opcodes)];
   /* sizes used for input/output modifiers and constants */
   const unsigned operand_size[static_cast<int>(aco_opcode::num_opcodes)];
   const instr_class classes[static_cast<int>(aco_opcode::num_opcodes)];
} Info;

extern const Info instr_info;

} // namespace aco

#endif /* ACO_IR_H */