1 /*
2  * Copyright © 2018 Valve Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  *
23  */
24 
25 #ifndef ACO_IR_H
26 #define ACO_IR_H
27 
28 #include "aco_opcodes.h"
29 #include "aco_util.h"
30 
31 #include "vulkan/radv_shader.h"
32 
33 #include "nir.h"
34 
35 #include <bitset>
36 #include <memory>
37 #include <vector>
38 
39 struct radv_shader_args;
40 struct radv_shader_info;
41 struct radv_vs_prolog_key;
42 
43 namespace aco {
44 
45 extern uint64_t debug_flags;
46 
/* Bit values for the global debug_flags bitmask declared above (presumably
 * parsed from a debug environment variable — confirm in the implementation). */
enum {
   DEBUG_VALIDATE_IR = 0x1,     /* validate the IR */
   DEBUG_VALIDATE_RA = 0x2,     /* validate the register assignment */
   DEBUG_PERFWARN = 0x4,
   DEBUG_FORCE_WAITCNT = 0x8,
   DEBUG_NO_VN = 0x10,          /* disable value numbering */
   DEBUG_NO_OPT = 0x20,         /* disable optimizations */
   DEBUG_NO_SCHED = 0x40,       /* disable scheduling */
   DEBUG_PERF_INFO = 0x80,
   DEBUG_LIVE_INFO = 0x100,
};
58 
59 /**
60  * Representation of the instruction's microcode encoding format
61  * Note: Some Vector ALU Formats can be combined, such that:
62  * - VOP2* | VOP3 represents a VOP2 instruction in VOP3 encoding
63  * - VOP2* | DPP represents a VOP2 instruction with data parallel primitive.
64  * - VOP2* | SDWA represents a VOP2 instruction with sub-dword addressing.
65  *
66  * (*) The same is applicable for VOP1 and VOPC instructions.
67  */
enum class Format : std::uint16_t {
   /* Pseudo Instruction Format */
   PSEUDO = 0,
   /* Scalar ALU & Control Formats */
   SOP1 = 1,
   SOP2 = 2,
   SOPK = 3,
   SOPP = 4,
   SOPC = 5,
   /* Scalar Memory Format */
   SMEM = 6,
   /* LDS/GDS Format */
   DS = 8,
   /* Vector Memory Buffer Formats */
   MTBUF = 9,
   MUBUF = 10,
   /* Vector Memory Image Format */
   MIMG = 11,
   /* Export Format */
   EXP = 12,
   /* Flat Formats */
   FLAT = 13,
   GLOBAL = 14,
   SCRATCH = 15,

   PSEUDO_BRANCH = 16,
   PSEUDO_BARRIER = 17,
   PSEUDO_REDUCTION = 18,

   /* Vector ALU Formats */
   VOP3P = 19,
   /* The formats below are single bits so they can be OR'd onto the base
    * VOP1/VOP2/VOPC encodings, as described in the comment above (see also
    * asVOP3(), asSDWA() and withoutDPP()). */
   VOP1 = 1 << 8,
   VOP2 = 1 << 9,
   VOPC = 1 << 10,
   VOP3 = 1 << 11,
   /* Vector Parameter Interpolation Format */
   VINTRP = 1 << 12,
   DPP16 = 1 << 13,
   SDWA = 1 << 14,
   DPP8 = 1 << 15,
};
109 
/* Coarse classification of instructions (presumably used for latency and
 * performance modelling — confirm at the use sites of instr_class). */
enum class instr_class : uint8_t {
   valu32 = 0,
   valu_convert32 = 1,
   valu64 = 2,
   valu_quarter_rate32 = 3,
   valu_fma = 4,
   valu_transcendental32 = 5,
   valu_double = 6,
   valu_double_add = 7,
   valu_double_convert = 8,
   valu_double_transcendental = 9,
   salu = 10,
   smem = 11,
   barrier = 12,
   branch = 13,
   sendmsg = 14,
   ds = 15,
   exp = 16,
   vmem = 17,
   waitcnt = 18,
   other = 19,
   count, /* number of classes; keep last */
};
133 
/* Bitmask describing the kinds of memory an access may touch; values are
 * combined in memory_sync_info::storage. */
enum storage_class : uint8_t {
   storage_none = 0x0,   /* no synchronization and can be reordered around aliasing stores */
   storage_buffer = 0x1, /* SSBOs and global memory */
   storage_atomic_counter = 0x2, /* not used for Vulkan */
   storage_image = 0x4,
   storage_shared = 0x8,       /* or TCS output */
   storage_vmem_output = 0x10, /* GS or TCS output stores using VMEM */
   storage_task_payload = 0x20,/* Task-Mesh payload */
   storage_scratch = 0x40,
   storage_vgpr_spill = 0x80,
   storage_count = 8, /* number of storage bits above, not counting storage_none */
};
146 
enum memory_semantics : uint8_t {
   semantic_none = 0x0,
   /* for loads: don't move any access after this load to before this load (even other loads)
    * for barriers: don't move any access after the barrier to before any
    * atomics/control_barriers/sendmsg_gs_done before the barrier */
   semantic_acquire = 0x1,
   /* for stores: don't move any access before this store to after this store
    * for barriers: don't move any access before the barrier to after any
    * atomics/control_barriers/sendmsg_gs_done after the barrier */
   semantic_release = 0x2,

   /* the rest are for load/stores/atomics only */
   /* cannot be DCE'd or CSE'd */
   semantic_volatile = 0x4,
   /* does not interact with barriers and assumes this lane is the only lane
    * accessing this memory */
   semantic_private = 0x8,
   /* this operation can be reordered around operations of the same storage.
    * says nothing about barriers */
   semantic_can_reorder = 0x10,
   /* this is an atomic instruction (may only read or write memory) */
   semantic_atomic = 0x20,
   /* this instruction both reads and writes memory */
   semantic_rmw = 0x40,

   /* combined acquire + release ordering */
   semantic_acqrel = semantic_acquire | semantic_release,
   /* atomic read-modify-write; includes volatile so it is never DCE'd/CSE'd */
   semantic_atomicrmw = semantic_volatile | semantic_atomic | semantic_rmw,
};
175 
/* Synchronization scopes, ordered from narrowest (one invocation) to widest
 * (the whole device). */
enum sync_scope : uint8_t {
   scope_invocation = 0,
   scope_subgroup = 1,
   scope_workgroup = 2,
   scope_queuefamily = 3,
   scope_device = 4,
};
183 
184 struct memory_sync_info {
memory_sync_infomemory_sync_info185    memory_sync_info() : storage(storage_none), semantics(semantic_none), scope(scope_invocation) {}
186    memory_sync_info(int storage_, int semantics_ = 0, sync_scope scope_ = scope_invocation)
187        : storage((storage_class)storage_), semantics((memory_semantics)semantics_), scope(scope_)
188    {}
189 
190    storage_class storage : 8;
191    memory_semantics semantics : 8;
192    sync_scope scope : 8;
193 
194    bool operator==(const memory_sync_info& rhs) const
195    {
196       return storage == rhs.storage && semantics == rhs.semantics && scope == rhs.scope;
197    }
198 
can_reordermemory_sync_info199    bool can_reorder() const
200    {
201       if (semantics & semantic_acqrel)
202          return false;
203       /* Also check storage so that zero-initialized memory_sync_info can be
204        * reordered. */
205       return (!storage || (semantics & semantic_can_reorder)) && !(semantics & semantic_volatile);
206    }
207 };
208 static_assert(sizeof(memory_sync_info) == 3, "Unexpected padding");
209 
/* Float rounding modes; the values match the hardware MODE register encoding
 * (see float_mode below). */
enum fp_round {
   fp_round_ne = 0, /* round to nearest even */
   fp_round_pi = 1, /* round towards positive infinity */
   fp_round_ni = 2, /* round towards negative infinity */
   fp_round_tz = 3, /* round towards zero (truncate) */
};
216 
/* Denormal handling; the values match the MODE register encoding (see
 * float_mode). keep_in/keep_out presumably distinguish input vs. output
 * denormals — confirm against the ISA documentation. */
enum fp_denorm {
   /* Note that v_rcp_f32, v_exp_f32, v_log_f32, v_sqrt_f32, v_rsq_f32 and
    * v_mad_f32/v_madak_f32/v_madmk_f32/v_mac_f32 always flush denormals. */
   fp_denorm_flush = 0x0,
   fp_denorm_keep_in = 0x1,
   fp_denorm_keep_out = 0x2,
   fp_denorm_keep = 0x3,
};
225 
226 struct float_mode {
227    /* matches encoding of the MODE register */
228    union {
229       struct {
230          fp_round round32 : 2;
231          fp_round round16_64 : 2;
232          unsigned denorm32 : 2;
233          unsigned denorm16_64 : 2;
234       };
235       struct {
236          uint8_t round : 4;
237          uint8_t denorm : 4;
238       };
239       uint8_t val = 0;
240    };
241    /* if false, optimizations which may remove infs/nan/-0.0 can be done */
242    bool preserve_signed_zero_inf_nan32 : 1;
243    bool preserve_signed_zero_inf_nan16_64 : 1;
244    /* if false, optimizations which may remove denormal flushing can be done */
245    bool must_flush_denorms32 : 1;
246    bool must_flush_denorms16_64 : 1;
247    bool care_about_round32 : 1;
248    bool care_about_round16_64 : 1;
249 
250    /* Returns true if instructions using the mode "other" can safely use the
251     * current one instead. */
canReplacefloat_mode252    bool canReplace(float_mode other) const noexcept
253    {
254       return val == other.val &&
255              (preserve_signed_zero_inf_nan32 || !other.preserve_signed_zero_inf_nan32) &&
256              (preserve_signed_zero_inf_nan16_64 || !other.preserve_signed_zero_inf_nan16_64) &&
257              (must_flush_denorms32 || !other.must_flush_denorms32) &&
258              (must_flush_denorms16_64 || !other.must_flush_denorms16_64) &&
259              (care_about_round32 || !other.care_about_round32) &&
260              (care_about_round16_64 || !other.care_about_round16_64);
261    }
262 };
263 
/* Immediate of an s_waitcnt-style instruction: one value per hardware wait
 * counter. Member functions are defined out-of-line. */
struct wait_imm {
   /* marks a counter that is not waited on */
   static const uint8_t unset_counter = 0xff;

   uint8_t vm;   /* vmcnt */
   uint8_t exp;  /* expcnt */
   uint8_t lgkm; /* lgkmcnt */
   uint8_t vs;   /* vscnt */

   wait_imm();
   wait_imm(uint16_t vm_, uint16_t exp_, uint16_t lgkm_, uint16_t vs_);
   /* decode from the packed hardware immediate (layout is chip-dependent) */
   wait_imm(enum chip_class chip, uint16_t packed);

   /* encode into the packed hardware immediate */
   uint16_t pack(enum chip_class chip) const;

   /* merge with "other" — presumably keeps the stricter (smaller) count per
    * counter and returns whether this was changed; confirm in the .cpp */
   bool combine(const wait_imm& other);

   /* true if no counter is set — confirm in the .cpp */
   bool empty() const;
};
282 
283 constexpr Format
asVOP3(Format format)284 asVOP3(Format format)
285 {
286    return (Format)((uint32_t)Format::VOP3 | (uint32_t)format);
287 };
288 
289 constexpr Format
asSDWA(Format format)290 asSDWA(Format format)
291 {
292    assert(format == Format::VOP1 || format == Format::VOP2 || format == Format::VOPC);
293    return (Format)((uint32_t)Format::SDWA | (uint32_t)format);
294 }
295 
296 constexpr Format
withoutDPP(Format format)297 withoutDPP(Format format)
298 {
299    return (Format)((uint32_t)format & ~((uint32_t)Format::DPP16 | (uint32_t)Format::DPP8));
300 }
301 
/* Kind of register file a temporary lives in. */
enum class RegType {
   none = 0,
   sgpr,        /* scalar registers */
   vgpr,        /* vector (per-lane) registers */
   linear_vgpr, /* VGPRs with a linear lifetime (WWM/spills, see RegClass) */
};
308 
struct RegClass {

   /* The RC value packs the whole register class into one byte:
    *   bits 0-4: size (in dwords, or in bytes for sub-dword classes)
    *   bit 5:    set for VGPR classes, clear for SGPR
    *   bit 6:    linear VGPR (WWM and VGPR spills)
    *   bit 7:    sub-dword class (size counted in bytes)
    */
   enum RC : uint8_t {
      s1 = 1,
      s2 = 2,
      s3 = 3,
      s4 = 4,
      s6 = 6,
      s8 = 8,
      s16 = 16,
      v1 = s1 | (1 << 5),
      v2 = s2 | (1 << 5),
      v3 = s3 | (1 << 5),
      v4 = s4 | (1 << 5),
      v5 = 5 | (1 << 5),
      v6 = 6 | (1 << 5),
      v7 = 7 | (1 << 5),
      v8 = 8 | (1 << 5),
      /* byte-sized register class */
      v1b = v1 | (1 << 7),
      v2b = v2 | (1 << 7),
      v3b = v3 | (1 << 7),
      v4b = v4 | (1 << 7),
      v6b = v6 | (1 << 7),
      v8b = v8 | (1 << 7),
      /* these are used for WWM and spills to vgpr */
      v1_linear = v1 | (1 << 6),
      v2_linear = v2 | (1 << 6),
   };

   RegClass() = default;
   constexpr RegClass(RC rc_) : rc(rc_) {}
   /* size is in dwords here; use get() for byte-based construction */
   constexpr RegClass(RegType type, unsigned size)
       : rc((RC)((type == RegType::vgpr ? 1 << 5 : 0) | size))
   {}

   constexpr operator RC() const { return rc; }
   /* deleted to avoid accidental truthiness checks via operator RC() */
   explicit operator bool() = delete;

   /* Note: linear VGPR classes report RegType::vgpr, not linear_vgpr. */
   constexpr RegType type() const { return rc <= RC::s16 ? RegType::sgpr : RegType::vgpr; }
   constexpr bool is_linear_vgpr() const { return rc & (1 << 6); };
   constexpr bool is_subdword() const { return rc & (1 << 7); }
   /* low 5 bits are bytes for sub-dword classes, dwords otherwise */
   constexpr unsigned bytes() const { return ((unsigned)rc & 0x1F) * (is_subdword() ? 1 : 4); }
   // TODO: use size() less in favor of bytes()
   /* size in dwords, rounded up for sub-dword classes */
   constexpr unsigned size() const { return (bytes() + 3) >> 2; }
   /* SGPRs are always linear; VGPRs only with the linear bit set */
   constexpr bool is_linear() const { return rc <= RC::s16 || is_linear_vgpr(); }
   constexpr RegClass as_linear() const { return RegClass((RC)(rc | (1 << 6))); }
   constexpr RegClass as_subdword() const { return RegClass((RC)(rc | 1 << 7)); }

   /* Class of the given type sized in bytes: SGPR sizes round up to whole
    * dwords; VGPR sizes that are not a dword multiple become sub-dword. */
   static constexpr RegClass get(RegType type, unsigned bytes)
   {
      if (type == RegType::sgpr) {
         return RegClass(type, DIV_ROUND_UP(bytes, 4u));
      } else {
         return bytes % 4u ? RegClass(type, bytes).as_subdword() : RegClass(type, bytes / 4u);
      }
   }

   /* Same class resized to the given byte count, preserving linearity. */
   constexpr RegClass resize(unsigned bytes) const
   {
      if (is_linear_vgpr()) {
         assert(bytes % 4u == 0);
         return get(RegType::vgpr, bytes).as_linear();
      }
      return get(type(), bytes);
   }

private:
   RC rc;
};
379 
/* transitional helper expressions: shorthand constants for common register
 * classes (sN/vN = N dwords of SGPRs/VGPRs, vNb = N bytes of VGPRs) */
static constexpr RegClass s1{RegClass::s1};
static constexpr RegClass s2{RegClass::s2};
static constexpr RegClass s3{RegClass::s3};
static constexpr RegClass s4{RegClass::s4};
static constexpr RegClass s8{RegClass::s8};
static constexpr RegClass s16{RegClass::s16};
static constexpr RegClass v1{RegClass::v1};
static constexpr RegClass v2{RegClass::v2};
static constexpr RegClass v3{RegClass::v3};
static constexpr RegClass v4{RegClass::v4};
static constexpr RegClass v5{RegClass::v5};
static constexpr RegClass v6{RegClass::v6};
static constexpr RegClass v7{RegClass::v7};
static constexpr RegClass v8{RegClass::v8};
static constexpr RegClass v1b{RegClass::v1b};
static constexpr RegClass v2b{RegClass::v2b};
static constexpr RegClass v3b{RegClass::v3b};
static constexpr RegClass v4b{RegClass::v4b};
static constexpr RegClass v6b{RegClass::v6b};
static constexpr RegClass v8b{RegClass::v8b};
401 
402 /**
403  * Temp Class
404  * Each temporary virtual register has a
405  * register class (i.e. size and type)
406  * and SSA id.
407  */
struct Temp {
   Temp() noexcept : id_(0), reg_class(0) {}
   constexpr Temp(uint32_t id, RegClass cls) noexcept : id_(id), reg_class(uint8_t(cls)) {}

   /* SSA id; 0 is used as the "no temporary" sentinel (see Operand/Definition) */
   constexpr uint32_t id() const noexcept { return id_; }
   constexpr RegClass regClass() const noexcept { return (RegClass::RC)reg_class; }

   constexpr unsigned bytes() const noexcept { return regClass().bytes(); }
   constexpr unsigned size() const noexcept { return regClass().size(); }
   constexpr RegType type() const noexcept { return regClass().type(); }
   constexpr bool is_linear() const noexcept { return regClass().is_linear(); }

   /* comparisons consider only the SSA id, not the register class */
   constexpr bool operator<(Temp other) const noexcept { return id() < other.id(); }
   constexpr bool operator==(Temp other) const noexcept { return id() == other.id(); }
   constexpr bool operator!=(Temp other) const noexcept { return id() != other.id(); }

private:
   /* packed into one 32-bit word: at most 2^24 - 1 temporaries per program */
   uint32_t id_ : 24;
   uint32_t reg_class : 8;
};
428 
429 /**
430  * PhysReg
431  * Represents the physical register for each
432  * Operand and Definition.
433  */
struct PhysReg {
   constexpr PhysReg() = default;
   /* r is a dword register index; stored byte-addressed (reg_b = r * 4) so
    * that sub-dword locations can be represented */
   explicit constexpr PhysReg(unsigned r) : reg_b(r << 2) {}
   constexpr unsigned reg() const { return reg_b >> 2; }   /* register index */
   constexpr unsigned byte() const { return reg_b & 0x3; } /* byte offset within the register */
   constexpr operator unsigned() const { return reg(); }
   /* equality/ordering are at byte granularity (include the byte offset) */
   constexpr bool operator==(PhysReg other) const { return reg_b == other.reg_b; }
   constexpr bool operator!=(PhysReg other) const { return reg_b != other.reg_b; }
   constexpr bool operator<(PhysReg other) const { return reg_b < other.reg_b; }
   /* register advanced by a (possibly negative) number of bytes; may cross
    * register boundaries */
   constexpr PhysReg advance(int bytes) const
   {
      PhysReg res = *this;
      res.reg_b += bytes;
      return res;
   }

   /* byte address: register index * 4 + byte offset */
   uint16_t reg_b = 0;
};
452 
/* helper expressions for special registers; the numeric values follow the
 * scalar operand encoding of the AMDGPU ISA */
static constexpr PhysReg m0{124};
static constexpr PhysReg vcc{106};
static constexpr PhysReg vcc_hi{107};
static constexpr PhysReg tba{108}; /* GFX6-GFX8 */
static constexpr PhysReg tma{110}; /* GFX6-GFX8 */
static constexpr PhysReg ttmp0{112};
static constexpr PhysReg ttmp1{113};
static constexpr PhysReg ttmp2{114};
static constexpr PhysReg ttmp3{115};
static constexpr PhysReg ttmp4{116};
static constexpr PhysReg ttmp5{117};
static constexpr PhysReg ttmp6{118};
static constexpr PhysReg ttmp7{119};
static constexpr PhysReg ttmp8{120};
static constexpr PhysReg ttmp9{121};
static constexpr PhysReg ttmp10{122};
static constexpr PhysReg ttmp11{123};
static constexpr PhysReg sgpr_null{125}; /* GFX10+ */
static constexpr PhysReg exec{126};
static constexpr PhysReg exec_lo{126};
static constexpr PhysReg exec_hi{127};
static constexpr PhysReg vccz{251};
static constexpr PhysReg execz{252};
static constexpr PhysReg scc{253};
478 
479 /**
480  * Operand Class
481  * Initially, each Operand refers to either
482  * a temporary virtual register
483  * or to a constant value
484  * Temporary registers get mapped to physical register during RA
485  * Constant values are inlined into the instruction sequence.
486  */
class Operand final {
public:
   /* Default: an undefined operand, fixed to PhysReg{128} (the same sentinel
    * used by the undef constructors below). */
   constexpr Operand()
       : reg_(PhysReg{128}), isTemp_(false), isFixed_(true), isConstant_(false), isKill_(false),
         isUndef_(true), isFirstKill_(false), constSize(0), isLateKill_(false), is16bit_(false),
         is24bit_(false), signext(false)
   {}

   /* Operand referring to the temporary r; a zero id yields an undefined
    * operand instead. */
   explicit Operand(Temp r) noexcept
   {
      data_.temp = r;
      if (r.id()) {
         isTemp_ = true;
      } else {
         isUndef_ = true;
         setFixed(PhysReg{128});
      }
   };
   /* Temporary operand pre-assigned to a physical register. */
   explicit Operand(Temp r, PhysReg reg) noexcept
   {
      assert(r.id()); /* Don't allow fixing an undef to a register */
      data_.temp = r;
      isTemp_ = true;
      setFixed(reg);
   };

   /* 8-bit constant */
   static Operand c8(uint8_t v) noexcept
   {
      /* 8-bit constants are only used for copies and copies from any 8-bit
       * constant can be implemented with a SDWA v_mul_u32_u24. So consider all
       * to be inline constants. */
      Operand op;
      op.control_ = 0;
      op.data_.i = v;
      op.isConstant_ = true;
      op.constSize = 0;
      op.setFixed(PhysReg{0u});
      return op;
   };

   /* 16-bit constant: picks an inline-constant register encoding where one
    * exists (integers in [-16, 64] and common half-float values), otherwise
    * the literal encoding 255. */
   static Operand c16(uint16_t v) noexcept
   {
      Operand op;
      op.control_ = 0;
      op.data_.i = v;
      op.isConstant_ = true;
      op.constSize = 1;
      if (v <= 64)
         op.setFixed(PhysReg{128u + v});
      else if (v >= 0xFFF0) /* [-16 .. -1] */
         op.setFixed(PhysReg{(unsigned)(192 - (int16_t)v)});
      else if (v == 0x3800) /* 0.5 */
         op.setFixed(PhysReg{240});
      else if (v == 0xB800) /* -0.5 */
         op.setFixed(PhysReg{241});
      else if (v == 0x3C00) /* 1.0 */
         op.setFixed(PhysReg{242});
      else if (v == 0xBC00) /* -1.0 */
         op.setFixed(PhysReg{243});
      else if (v == 0x4000) /* 2.0 */
         op.setFixed(PhysReg{244});
      else if (v == 0xC000) /* -2.0 */
         op.setFixed(PhysReg{245});
      else if (v == 0x4400) /* 4.0 */
         op.setFixed(PhysReg{246});
      else if (v == 0xC400) /* -4.0 */
         op.setFixed(PhysReg{247});
      else if (v == 0x3118) /* 1/2 PI */
         op.setFixed(PhysReg{248});
      else /* Literal Constant */
         op.setFixed(PhysReg{255});
      return op;
   }

   /* 32-bit constant */
   static Operand c32(uint32_t v) noexcept { return c32_or_c64(v, false); }

   /* 64-bit constant: same encoding scheme as c16/c32, but the inline floats
    * are double-precision patterns. Only the low 32 bits plus a sign-extension
    * flag are stored for literals (see constantValue64()). */
   static Operand c64(uint64_t v) noexcept
   {
      Operand op;
      op.control_ = 0;
      op.isConstant_ = true;
      op.constSize = 3;
      if (v <= 64) {
         op.data_.i = (uint32_t)v;
         op.setFixed(PhysReg{128 + (uint32_t)v});
      } else if (v >= 0xFFFFFFFFFFFFFFF0) { /* [-16 .. -1] */
         op.data_.i = (uint32_t)v;
         op.setFixed(PhysReg{192 - (uint32_t)v});
      } else if (v == 0x3FE0000000000000) { /* 0.5 */
         op.data_.i = 0x3f000000;
         op.setFixed(PhysReg{240});
      } else if (v == 0xBFE0000000000000) { /* -0.5 */
         op.data_.i = 0xbf000000;
         op.setFixed(PhysReg{241});
      } else if (v == 0x3FF0000000000000) { /* 1.0 */
         op.data_.i = 0x3f800000;
         op.setFixed(PhysReg{242});
      } else if (v == 0xBFF0000000000000) { /* -1.0 */
         op.data_.i = 0xbf800000;
         op.setFixed(PhysReg{243});
      } else if (v == 0x4000000000000000) { /* 2.0 */
         op.data_.i = 0x40000000;
         op.setFixed(PhysReg{244});
      } else if (v == 0xC000000000000000) { /* -2.0 */
         op.data_.i = 0xc0000000;
         op.setFixed(PhysReg{245});
      } else if (v == 0x4010000000000000) { /* 4.0 */
         op.data_.i = 0x40800000;
         op.setFixed(PhysReg{246});
      } else if (v == 0xC010000000000000) { /* -4.0 */
         op.data_.i = 0xc0800000;
         op.setFixed(PhysReg{247});
      } else { /* Literal Constant: we don't know if it is a long or double.*/
         op.signext = v >> 63;
         op.data_.i = v & 0xffffffffu;
         op.setFixed(PhysReg{255});
         assert(op.constantValue64() == v &&
                "attempt to create a unrepresentable 64-bit literal constant");
      }
      return op;
   }

   /* 32-bit constant stored as a 32-bit or 64-bit operand */
   static Operand c32_or_c64(uint32_t v, bool is64bit) noexcept
   {
      Operand op;
      op.control_ = 0;
      op.data_.i = v;
      op.isConstant_ = true;
      op.constSize = is64bit ? 3 : 2;
      if (v <= 64)
         op.setFixed(PhysReg{128 + v});
      else if (v >= 0xFFFFFFF0) /* [-16 .. -1] */
         op.setFixed(PhysReg{192 - v});
      else if (v == 0x3f000000) /* 0.5 */
         op.setFixed(PhysReg{240});
      else if (v == 0xbf000000) /* -0.5 */
         op.setFixed(PhysReg{241});
      else if (v == 0x3f800000) /* 1.0 */
         op.setFixed(PhysReg{242});
      else if (v == 0xbf800000) /* -1.0 */
         op.setFixed(PhysReg{243});
      else if (v == 0x40000000) /* 2.0 */
         op.setFixed(PhysReg{244});
      else if (v == 0xc0000000) /* -2.0 */
         op.setFixed(PhysReg{245});
      else if (v == 0x40800000) /* 4.0 */
         op.setFixed(PhysReg{246});
      else if (v == 0xc0800000) /* -4.0 */
         op.setFixed(PhysReg{247});
      else { /* Literal Constant */
         assert(!is64bit && "attempt to create a 64-bit literal constant");
         op.setFixed(PhysReg{255});
      }
      return op;
   }

   /* Undefined operand of the given register class. */
   explicit Operand(RegClass type) noexcept
   {
      isUndef_ = true;
      data_.temp = Temp(0, type);
      setFixed(PhysReg{128});
   };
   /* Non-temporary operand fixed to a physical register (no SSA id). */
   explicit Operand(PhysReg reg, RegClass type) noexcept
   {
      data_.temp = Temp(0, type);
      setFixed(reg);
   }

   /* Zero constant of the given size in bytes (1, 2, 4 or 8). */
   static Operand zero(unsigned bytes = 4) noexcept
   {
      if (bytes == 8)
         return Operand::c64(0);
      else if (bytes == 4)
         return Operand::c32(0);
      else if (bytes == 2)
         return Operand::c16(0);
      assert(bytes == 1);
      return Operand::c8(0);
   }

   /* This is useful over the constructors when you want to take a chip class
    * for 1/2 PI or an unknown operand size.
    */
   static Operand get_const(enum chip_class chip, uint64_t val, unsigned bytes)
   {
      if (val == 0x3e22f983 && bytes == 4 && chip >= GFX8) {
         /* 1/2 PI can be an inline constant on GFX8+ */
         Operand op = Operand::c32(val);
         op.setFixed(PhysReg{248});
         return op;
      }

      if (bytes == 8)
         return Operand::c64(val);
      else if (bytes == 4)
         return Operand::c32(val);
      else if (bytes == 2)
         return Operand::c16(val);
      assert(bytes == 1);
      return Operand::c8(val);
   }

   /* Whether val can be encoded as a constant operand of the given size.
    * Constants of at most 4 bytes are always representable (possibly as a
    * literal); 64-bit values must either match an inline constant, fit the
    * sign-extended literal encoding, or — with zext/sext — be recoverable from
    * their low 32 bits. */
   static bool is_constant_representable(uint64_t val, unsigned bytes, bool zext = false,
                                         bool sext = false)
   {
      if (bytes <= 4)
         return true;

      if (zext && (val & 0xFFFFFFFF00000000) == 0x0000000000000000)
         return true;
      uint64_t upper33 = val & 0xFFFFFFFF80000000;
      if (sext && (upper33 == 0xFFFFFFFF80000000 || upper33 == 0))
         return true;

      return val >= 0xFFFFFFFFFFFFFFF0 || val <= 64 || /* [-16 .. 64] */
             val == 0x3FE0000000000000 ||              /* 0.5 */
             val == 0xBFE0000000000000 ||              /* -0.5 */
             val == 0x3FF0000000000000 ||              /* 1.0 */
             val == 0xBFF0000000000000 ||              /* -1.0 */
             val == 0x4000000000000000 ||              /* 2.0 */
             val == 0xC000000000000000 ||              /* -2.0 */
             val == 0x4010000000000000 ||              /* 4.0 */
             val == 0xC010000000000000;                /* -4.0 */
   }

   constexpr bool isTemp() const noexcept { return isTemp_; }

   constexpr void setTemp(Temp t) noexcept
   {
      assert(!isConstant_);
      isTemp_ = true;
      data_.temp = t;
   }

   constexpr Temp getTemp() const noexcept { return data_.temp; }

   constexpr uint32_t tempId() const noexcept { return data_.temp.id(); }

   /* only temporaries and undefined operands carry a register class */
   constexpr bool hasRegClass() const noexcept { return isTemp() || isUndefined(); }

   constexpr RegClass regClass() const noexcept { return data_.temp.regClass(); }

   /* operand size in bytes: 1 << constSize for constants (1/2/4/8) */
   constexpr unsigned bytes() const noexcept
   {
      if (isConstant())
         return 1 << constSize;
      else
         return data_.temp.bytes();
   }

   /* operand size in dwords; 1- and 2-byte constants report 1 */
   constexpr unsigned size() const noexcept
   {
      if (isConstant())
         return constSize > 2 ? 2 : 1;
      else
         return data_.temp.size();
   }

   constexpr bool isFixed() const noexcept { return isFixed_; }

   constexpr PhysReg physReg() const noexcept { return reg_; }

   constexpr void setFixed(PhysReg reg) noexcept
   {
      /* NOTE(review): with a 16-bit reg_b, reg() can never equal unsigned(-1),
       * so this condition always sets isFixed_ — presumably a leftover
       * sentinel check; confirm. */
      isFixed_ = reg != unsigned(-1);
      reg_ = reg;
   }

   constexpr bool isConstant() const noexcept { return isConstant_; }

   /* 255 is the literal-constant operand encoding */
   constexpr bool isLiteral() const noexcept { return isConstant() && reg_ == 255; }

   constexpr bool isUndefined() const noexcept { return isUndef_; }

   /* low 32 bits of the constant (see constantValue64() for 64-bit values) */
   constexpr uint32_t constantValue() const noexcept { return data_.i; }

   constexpr bool constantEquals(uint32_t cmp) const noexcept
   {
      return isConstant() && constantValue() == cmp;
   }

   /* Full constant value. For 64-bit constants the value is reconstructed
    * from the register encoding (128-192: 0..64, 193-208: -1..-16, 240-247:
    * inline doubles, 255: sign-extended 32-bit literal). */
   constexpr uint64_t constantValue64() const noexcept
   {
      if (constSize == 3) {
         if (reg_ <= 192)
            return reg_ - 128;
         else if (reg_ <= 208)
            return 0xFFFFFFFFFFFFFFFF - (reg_ - 193);

         switch (reg_) {
         case 240: return 0x3FE0000000000000;
         case 241: return 0xBFE0000000000000;
         case 242: return 0x3FF0000000000000;
         case 243: return 0xBFF0000000000000;
         case 244: return 0x4000000000000000;
         case 245: return 0xC000000000000000;
         case 246: return 0x4010000000000000;
         case 247: return 0xC010000000000000;
         case 255:
            return (signext && (data_.i & 0x80000000u) ? 0xffffffff00000000ull : 0ull) | data_.i;
         }
         unreachable("invalid register for 64-bit constant");
      } else {
         return data_.i;
      }
   }

   constexpr bool isOfType(RegType type) const noexcept
   {
      return hasRegClass() && regClass().type() == type;
   }

   /* Indicates that the killed operand's live range intersects with the
    * instruction's definitions. Unlike isKill() and isFirstKill(), this is
    * not set by liveness analysis. */
   constexpr void setLateKill(bool flag) noexcept { isLateKill_ = flag; }

   constexpr bool isLateKill() const noexcept { return isLateKill_; }

   /* clearing the kill flag also clears first-kill */
   constexpr void setKill(bool flag) noexcept
   {
      isKill_ = flag;
      if (!flag)
         setFirstKill(false);
   }

   constexpr bool isKill() const noexcept { return isKill_ || isFirstKill(); }

   /* setting first-kill implies kill */
   constexpr void setFirstKill(bool flag) noexcept
   {
      isFirstKill_ = flag;
      if (flag)
         setKill(flag);
   }

   /* When there are multiple operands killing the same temporary,
    * isFirstKill() is only returns true for the first one. */
   constexpr bool isFirstKill() const noexcept { return isFirstKill_; }

   constexpr bool isKillBeforeDef() const noexcept { return isKill() && !isLateKill(); }

   constexpr bool isFirstKillBeforeDef() const noexcept { return isFirstKill() && !isLateKill(); }

   /* Equality considers size, fixedness, kill-before-def state, the assigned
    * register, and then the value (literal/constant/undef/temp). */
   constexpr bool operator==(Operand other) const noexcept
   {
      if (other.size() != size())
         return false;
      if (isFixed() != other.isFixed() || isKillBeforeDef() != other.isKillBeforeDef())
         return false;
      if (isFixed() && other.isFixed() && physReg() != other.physReg())
         return false;
      if (isLiteral())
         return other.isLiteral() && other.constantValue() == constantValue();
      else if (isConstant())
         return other.isConstant() && other.physReg() == physReg();
      else if (isUndefined())
         return other.isUndefined() && other.regClass() == regClass();
      else
         return other.isTemp() && other.getTemp() == getTemp();
   }

   constexpr bool operator!=(Operand other) const noexcept { return !operator==(other); }

   constexpr void set16bit(bool flag) noexcept { is16bit_ = flag; }

   constexpr bool is16bit() const noexcept { return is16bit_; }

   constexpr void set24bit(bool flag) noexcept { is24bit_ = flag; }

   constexpr bool is24bit() const noexcept { return is24bit_; }

private:
   union {
      Temp temp;
      uint32_t i;
      float f;
   } data_ = {Temp(0, s1)};
   PhysReg reg_;
   union {
      struct {
         uint8_t isTemp_ : 1;
         uint8_t isFixed_ : 1;
         uint8_t isConstant_ : 1;
         uint8_t isKill_ : 1;
         uint8_t isUndef_ : 1;
         uint8_t isFirstKill_ : 1;
         uint8_t constSize : 2;
         uint8_t isLateKill_ : 1;
         uint8_t is16bit_ : 1;
         uint8_t is24bit_ : 1;
         uint8_t signext : 1;
      };
      /* can't initialize bit-fields in c++11, so work around using a union */
      uint16_t control_ = 0;
   };
};
888 
889 /**
890  * Definition Class
891  * Definitions are the results of Instructions
892  * and refer to temporary virtual registers
893  * which are later mapped to physical registers
894  */
class Definition final {
public:
   /* Default: temporary id 0 (i.e. no temp) of class s1, all flags cleared. */
   constexpr Definition()
       : temp(Temp(0, s1)), reg_(0), isFixed_(0), hasHint_(0), isKill_(0), isPrecise_(0), isNUW_(0),
         isNoCSE_(0)
   {}
   /* Definition of the temporary with the given id and register class. */
   Definition(uint32_t index, RegClass type) noexcept : temp(index, type) {}
   explicit Definition(Temp tmp) noexcept : temp(tmp) {}
   /* Definition fixed to a physical register, with no temporary (id 0). */
   Definition(PhysReg reg, RegClass type) noexcept : temp(Temp(0, type)) { setFixed(reg); }
   /* Definition of an existing temporary, fixed to a physical register. */
   Definition(uint32_t tmpId, PhysReg reg, RegClass type) noexcept : temp(Temp(tmpId, type))
   {
      setFixed(reg);
   }

   /* A definition refers to a temporary iff its temp id is non-zero. */
   constexpr bool isTemp() const noexcept { return tempId() > 0; }

   constexpr Temp getTemp() const noexcept { return temp; }

   constexpr uint32_t tempId() const noexcept { return temp.id(); }

   constexpr void setTemp(Temp t) noexcept { temp = t; }

   void swapTemp(Definition& other) noexcept { std::swap(temp, other.temp); }

   constexpr RegClass regClass() const noexcept { return temp.regClass(); }

   constexpr unsigned bytes() const noexcept { return temp.bytes(); }

   constexpr unsigned size() const noexcept { return temp.size(); }

   constexpr bool isFixed() const noexcept { return isFixed_; }

   constexpr PhysReg physReg() const noexcept { return reg_; }

   /* Pin this definition to a specific physical register. */
   constexpr void setFixed(PhysReg reg) noexcept
   {
      isFixed_ = 1;
      reg_ = reg;
   }

   /* Record a preferred physical register. NOTE(review): reg_ is shared with
    * setFixed(), so a hint and a fixed register cannot coexist — confirm
    * callers never rely on both. */
   constexpr void setHint(PhysReg reg) noexcept
   {
      hasHint_ = 1;
      reg_ = reg;
   }

   constexpr bool hasHint() const noexcept { return hasHint_; }

   constexpr void setKill(bool flag) noexcept { isKill_ = flag; }

   constexpr bool isKill() const noexcept { return isKill_; }

   /* NOTE(review): "precise" presumably forbids value-changing optimizations
    * on this result — confirm with the optimizer passes. */
   constexpr void setPrecise(bool precise) noexcept { isPrecise_ = precise; }

   constexpr bool isPrecise() const noexcept { return isPrecise_; }

   /* No Unsigned Wrap */
   constexpr void setNUW(bool nuw) noexcept { isNUW_ = nuw; }

   constexpr bool isNUW() const noexcept { return isNUW_; }

   /* NOTE(review): presumably excludes the producing instruction from CSE —
    * confirm with the value-numbering pass. */
   constexpr void setNoCSE(bool noCSE) noexcept { isNoCSE_ = noCSE; }

   constexpr bool isNoCSE() const noexcept { return isNoCSE_; }

private:
   Temp temp = Temp(0, s1);
   PhysReg reg_;
   union {
      struct {
         uint8_t isFixed_ : 1;
         uint8_t hasHint_ : 1;
         uint8_t isKill_ : 1;
         uint8_t isPrecise_ : 1;
         uint8_t isNUW_ : 1;
         uint8_t isNoCSE_ : 1;
      };
      /* can't initialize bit-fields in c++11, so work around using a union */
      uint8_t control_ = 0;
   };
};
976 
977 struct Block;
978 struct Instruction;
979 struct Pseudo_instruction;
980 struct SOP1_instruction;
981 struct SOP2_instruction;
982 struct SOPK_instruction;
983 struct SOPP_instruction;
984 struct SOPC_instruction;
985 struct SMEM_instruction;
986 struct DS_instruction;
987 struct MTBUF_instruction;
988 struct MUBUF_instruction;
989 struct MIMG_instruction;
990 struct Export_instruction;
991 struct FLAT_instruction;
992 struct Pseudo_branch_instruction;
993 struct Pseudo_barrier_instruction;
994 struct Pseudo_reduction_instruction;
995 struct VOP3P_instruction;
996 struct VOP1_instruction;
997 struct VOP2_instruction;
998 struct VOPC_instruction;
999 struct VOP3_instruction;
1000 struct Interp_instruction;
1001 struct DPP16_instruction;
1002 struct DPP8_instruction;
1003 struct SDWA_instruction;
1004 
/* Common base of all instructions. The encoding-specific structs below derive
 * from this; they are reached through the cast helpers (pseudo(), sop1(), ...),
 * each of which asserts that the format matches before casting. */
struct Instruction {
   aco_opcode opcode;
   Format format;
   uint32_t pass_flags;

   aco::span<Operand> operands;
   aco::span<Definition> definitions;

   constexpr bool usesModifiers() const noexcept;

   /* True if any operand is fixed to the exec register. */
   constexpr bool reads_exec() const noexcept
   {
      for (const Operand& op : operands) {
         if (op.isFixed() && op.physReg() == exec)
            return true;
      }
      return false;
   }

   Pseudo_instruction& pseudo() noexcept
   {
      assert(isPseudo());
      return *(Pseudo_instruction*)this;
   }
   const Pseudo_instruction& pseudo() const noexcept
   {
      assert(isPseudo());
      return *(Pseudo_instruction*)this;
   }
   constexpr bool isPseudo() const noexcept { return format == Format::PSEUDO; }
   SOP1_instruction& sop1() noexcept
   {
      assert(isSOP1());
      return *(SOP1_instruction*)this;
   }
   const SOP1_instruction& sop1() const noexcept
   {
      assert(isSOP1());
      return *(SOP1_instruction*)this;
   }
   constexpr bool isSOP1() const noexcept { return format == Format::SOP1; }
   SOP2_instruction& sop2() noexcept
   {
      assert(isSOP2());
      return *(SOP2_instruction*)this;
   }
   const SOP2_instruction& sop2() const noexcept
   {
      assert(isSOP2());
      return *(SOP2_instruction*)this;
   }
   constexpr bool isSOP2() const noexcept { return format == Format::SOP2; }
   SOPK_instruction& sopk() noexcept
   {
      assert(isSOPK());
      return *(SOPK_instruction*)this;
   }
   const SOPK_instruction& sopk() const noexcept
   {
      assert(isSOPK());
      return *(SOPK_instruction*)this;
   }
   constexpr bool isSOPK() const noexcept { return format == Format::SOPK; }
   SOPP_instruction& sopp() noexcept
   {
      assert(isSOPP());
      return *(SOPP_instruction*)this;
   }
   const SOPP_instruction& sopp() const noexcept
   {
      assert(isSOPP());
      return *(SOPP_instruction*)this;
   }
   constexpr bool isSOPP() const noexcept { return format == Format::SOPP; }
   SOPC_instruction& sopc() noexcept
   {
      assert(isSOPC());
      return *(SOPC_instruction*)this;
   }
   const SOPC_instruction& sopc() const noexcept
   {
      assert(isSOPC());
      return *(SOPC_instruction*)this;
   }
   constexpr bool isSOPC() const noexcept { return format == Format::SOPC; }
   SMEM_instruction& smem() noexcept
   {
      assert(isSMEM());
      return *(SMEM_instruction*)this;
   }
   const SMEM_instruction& smem() const noexcept
   {
      assert(isSMEM());
      return *(SMEM_instruction*)this;
   }
   constexpr bool isSMEM() const noexcept { return format == Format::SMEM; }
   DS_instruction& ds() noexcept
   {
      assert(isDS());
      return *(DS_instruction*)this;
   }
   const DS_instruction& ds() const noexcept
   {
      assert(isDS());
      return *(DS_instruction*)this;
   }
   constexpr bool isDS() const noexcept { return format == Format::DS; }
   MTBUF_instruction& mtbuf() noexcept
   {
      assert(isMTBUF());
      return *(MTBUF_instruction*)this;
   }
   const MTBUF_instruction& mtbuf() const noexcept
   {
      assert(isMTBUF());
      return *(MTBUF_instruction*)this;
   }
   constexpr bool isMTBUF() const noexcept { return format == Format::MTBUF; }
   MUBUF_instruction& mubuf() noexcept
   {
      assert(isMUBUF());
      return *(MUBUF_instruction*)this;
   }
   const MUBUF_instruction& mubuf() const noexcept
   {
      assert(isMUBUF());
      return *(MUBUF_instruction*)this;
   }
   constexpr bool isMUBUF() const noexcept { return format == Format::MUBUF; }
   MIMG_instruction& mimg() noexcept
   {
      assert(isMIMG());
      return *(MIMG_instruction*)this;
   }
   const MIMG_instruction& mimg() const noexcept
   {
      assert(isMIMG());
      return *(MIMG_instruction*)this;
   }
   constexpr bool isMIMG() const noexcept { return format == Format::MIMG; }
   Export_instruction& exp() noexcept
   {
      assert(isEXP());
      return *(Export_instruction*)this;
   }
   const Export_instruction& exp() const noexcept
   {
      assert(isEXP());
      return *(Export_instruction*)this;
   }
   constexpr bool isEXP() const noexcept { return format == Format::EXP; }
   /* FLAT, GLOBAL and SCRATCH all share the FLAT_instruction layout. */
   FLAT_instruction& flat() noexcept
   {
      assert(isFlat());
      return *(FLAT_instruction*)this;
   }
   const FLAT_instruction& flat() const noexcept
   {
      assert(isFlat());
      return *(FLAT_instruction*)this;
   }
   constexpr bool isFlat() const noexcept { return format == Format::FLAT; }
   FLAT_instruction& global() noexcept
   {
      assert(isGlobal());
      return *(FLAT_instruction*)this;
   }
   const FLAT_instruction& global() const noexcept
   {
      assert(isGlobal());
      return *(FLAT_instruction*)this;
   }
   constexpr bool isGlobal() const noexcept { return format == Format::GLOBAL; }
   FLAT_instruction& scratch() noexcept
   {
      assert(isScratch());
      return *(FLAT_instruction*)this;
   }
   const FLAT_instruction& scratch() const noexcept
   {
      assert(isScratch());
      return *(FLAT_instruction*)this;
   }
   constexpr bool isScratch() const noexcept { return format == Format::SCRATCH; }
   Pseudo_branch_instruction& branch() noexcept
   {
      assert(isBranch());
      return *(Pseudo_branch_instruction*)this;
   }
   const Pseudo_branch_instruction& branch() const noexcept
   {
      assert(isBranch());
      return *(Pseudo_branch_instruction*)this;
   }
   constexpr bool isBranch() const noexcept { return format == Format::PSEUDO_BRANCH; }
   Pseudo_barrier_instruction& barrier() noexcept
   {
      assert(isBarrier());
      return *(Pseudo_barrier_instruction*)this;
   }
   const Pseudo_barrier_instruction& barrier() const noexcept
   {
      assert(isBarrier());
      return *(Pseudo_barrier_instruction*)this;
   }
   constexpr bool isBarrier() const noexcept { return format == Format::PSEUDO_BARRIER; }
   Pseudo_reduction_instruction& reduction() noexcept
   {
      assert(isReduction());
      return *(Pseudo_reduction_instruction*)this;
   }
   const Pseudo_reduction_instruction& reduction() const noexcept
   {
      assert(isReduction());
      return *(Pseudo_reduction_instruction*)this;
   }
   constexpr bool isReduction() const noexcept { return format == Format::PSEUDO_REDUCTION; }
   VOP3P_instruction& vop3p() noexcept
   {
      assert(isVOP3P());
      return *(VOP3P_instruction*)this;
   }
   const VOP3P_instruction& vop3p() const noexcept
   {
      assert(isVOP3P());
      return *(VOP3P_instruction*)this;
   }
   constexpr bool isVOP3P() const noexcept { return format == Format::VOP3P; }
   /* The VALU/VINTRP/DPP/SDWA checks below use a bitwise AND instead of an
    * equality test because these formats can be combined (see the Format
    * comment at the top of the file). */
   VOP1_instruction& vop1() noexcept
   {
      assert(isVOP1());
      return *(VOP1_instruction*)this;
   }
   const VOP1_instruction& vop1() const noexcept
   {
      assert(isVOP1());
      return *(VOP1_instruction*)this;
   }
   constexpr bool isVOP1() const noexcept { return (uint16_t)format & (uint16_t)Format::VOP1; }
   VOP2_instruction& vop2() noexcept
   {
      assert(isVOP2());
      return *(VOP2_instruction*)this;
   }
   const VOP2_instruction& vop2() const noexcept
   {
      assert(isVOP2());
      return *(VOP2_instruction*)this;
   }
   constexpr bool isVOP2() const noexcept { return (uint16_t)format & (uint16_t)Format::VOP2; }
   VOPC_instruction& vopc() noexcept
   {
      assert(isVOPC());
      return *(VOPC_instruction*)this;
   }
   const VOPC_instruction& vopc() const noexcept
   {
      assert(isVOPC());
      return *(VOPC_instruction*)this;
   }
   constexpr bool isVOPC() const noexcept { return (uint16_t)format & (uint16_t)Format::VOPC; }
   VOP3_instruction& vop3() noexcept
   {
      assert(isVOP3());
      return *(VOP3_instruction*)this;
   }
   const VOP3_instruction& vop3() const noexcept
   {
      assert(isVOP3());
      return *(VOP3_instruction*)this;
   }
   constexpr bool isVOP3() const noexcept { return (uint16_t)format & (uint16_t)Format::VOP3; }
   Interp_instruction& vintrp() noexcept
   {
      assert(isVINTRP());
      return *(Interp_instruction*)this;
   }
   const Interp_instruction& vintrp() const noexcept
   {
      assert(isVINTRP());
      return *(Interp_instruction*)this;
   }
   constexpr bool isVINTRP() const noexcept { return (uint16_t)format & (uint16_t)Format::VINTRP; }
   DPP16_instruction& dpp16() noexcept
   {
      assert(isDPP16());
      return *(DPP16_instruction*)this;
   }
   const DPP16_instruction& dpp16() const noexcept
   {
      assert(isDPP16());
      return *(DPP16_instruction*)this;
   }
   constexpr bool isDPP16() const noexcept { return (uint16_t)format & (uint16_t)Format::DPP16; }
   DPP8_instruction& dpp8() noexcept
   {
      assert(isDPP8());
      return *(DPP8_instruction*)this;
   }
   const DPP8_instruction& dpp8() const noexcept
   {
      assert(isDPP8());
      return *(DPP8_instruction*)this;
   }
   constexpr bool isDPP8() const noexcept { return (uint16_t)format & (uint16_t)Format::DPP8; }
   constexpr bool isDPP() const noexcept { return isDPP16() || isDPP8(); }
   SDWA_instruction& sdwa() noexcept
   {
      assert(isSDWA());
      return *(SDWA_instruction*)this;
   }
   const SDWA_instruction& sdwa() const noexcept
   {
      assert(isSDWA());
      return *(SDWA_instruction*)this;
   }
   constexpr bool isSDWA() const noexcept { return (uint16_t)format & (uint16_t)Format::SDWA; }

   /* Unchecked cast: valid for FLAT, GLOBAL and SCRATCH, which all use the
    * FLAT_instruction layout. */
   FLAT_instruction& flatlike() { return *(FLAT_instruction*)this; }

   const FLAT_instruction& flatlike() const { return *(FLAT_instruction*)this; }

   constexpr bool isFlatLike() const noexcept { return isFlat() || isGlobal() || isScratch(); }

   constexpr bool isVALU() const noexcept
   {
      return isVOP1() || isVOP2() || isVOPC() || isVOP3() || isVOP3P();
   }

   constexpr bool isSALU() const noexcept
   {
      return isSOP1() || isSOP2() || isSOPC() || isSOPK() || isSOPP();
   }

   constexpr bool isVMEM() const noexcept { return isMTBUF() || isMUBUF() || isMIMG(); }
};
static_assert(sizeof(Instruction) == 16, "Unexpected padding");
1342 
/* SOPK: scalar ALU instruction carrying a 16-bit inline immediate. */
struct SOPK_instruction : public Instruction {
   uint16_t imm;
   uint16_t padding;
};
static_assert(sizeof(SOPK_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1348 
/* SOPP: scalar program-control instruction with a 32-bit immediate.
 * NOTE(review): `block` presumably holds a branch target block index —
 * confirm against the users of sopp(). */
struct SOPP_instruction : public Instruction {
   uint32_t imm;
   int block;
};
static_assert(sizeof(SOPP_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1354 
/* SOPC: no fields beyond the common Instruction data. */
struct SOPC_instruction : public Instruction {};
static_assert(sizeof(SOPC_instruction) == sizeof(Instruction) + 0, "Unexpected padding");
1357 
/* SOP1: no fields beyond the common Instruction data. */
struct SOP1_instruction : public Instruction {};
static_assert(sizeof(SOP1_instruction) == sizeof(Instruction) + 0, "Unexpected padding");
1360 
/* SOP2: no fields beyond the common Instruction data. */
struct SOP2_instruction : public Instruction {};
static_assert(sizeof(SOP2_instruction) == sizeof(Instruction) + 0, "Unexpected padding");
1363 
1364 /**
1365  * Scalar Memory Format:
1366  * For s_(buffer_)load_dword*:
1367  * Operand(0): SBASE - SGPR-pair which provides base address
1368  * Operand(1): Offset - immediate (un)signed offset or SGPR
1369  * Operand(2) / Definition(0): SDATA - SGPR for read / write result
1370  * Operand(n-1): SOffset - SGPR offset (Vega only)
1371  *
1372  * Having no operands is also valid for instructions such as s_dcache_inv.
1373  *
1374  */
/* Scalar memory instruction; see the operand layout described in the comment
 * block preceding this struct. */
struct SMEM_instruction : public Instruction {
   memory_sync_info sync;
   bool glc : 1; /* VI+: globally coherent */
   bool dlc : 1; /* NAVI: device level coherent */
   bool nv : 1;  /* VEGA only: Non-volatile */
   bool disable_wqm : 1;
   bool prevent_overflow : 1; /* avoid overflow when combining additions */
   uint8_t padding : 3;
};
static_assert(sizeof(SMEM_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1385 
/* VOP1: no fields beyond the common Instruction data. */
struct VOP1_instruction : public Instruction {};
static_assert(sizeof(VOP1_instruction) == sizeof(Instruction) + 0, "Unexpected padding");
1388 
/* VOP2: no fields beyond the common Instruction data. */
struct VOP2_instruction : public Instruction {};
static_assert(sizeof(VOP2_instruction) == sizeof(Instruction) + 0, "Unexpected padding");
1391 
/* VOPC: no fields beyond the common Instruction data. */
struct VOPC_instruction : public Instruction {};
static_assert(sizeof(VOPC_instruction) == sizeof(Instruction) + 0, "Unexpected padding");
1394 
/* VOP3: vector ALU with per-source abs/neg input modifiers, operand select,
 * output modifier and clamp. */
struct VOP3_instruction : public Instruction {
   bool abs[3];
   bool neg[3];
   uint8_t opsel : 4;
   uint8_t omod : 2;
   bool clamp : 1;
   uint8_t padding0 : 1;
   uint8_t padding1;
};
static_assert(sizeof(VOP3_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1405 
/* VOP3P: packed-math vector ALU with separate modifiers/selects for the low
 * and high halves of each source. */
struct VOP3P_instruction : public Instruction {
   bool neg_lo[3];
   bool neg_hi[3]; /* abs modifier, for v_mad_mix/v_fma_mix */
   uint8_t opsel_lo : 3;
   uint8_t opsel_hi : 3;
   bool clamp : 1;
   uint8_t padding0 : 1;
   uint8_t padding1;
};
static_assert(sizeof(VOP3P_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1416 
1417 /**
1418  * Data Parallel Primitives Format:
1419  * This format can be used for VOP1, VOP2 or VOPC instructions.
1420  * The swizzle applies to the src0 operand.
1421  *
1422  */
/* DPP across 16 lanes; the swizzle applies to src0 (see the comment block
 * preceding this struct). */
struct DPP16_instruction : public Instruction {
   bool abs[2];
   bool neg[2];
   uint16_t dpp_ctrl;
   uint8_t row_mask : 4;
   uint8_t bank_mask : 4;
   bool bound_ctrl : 1;
   uint8_t padding : 7;
};
static_assert(sizeof(DPP16_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1433 
/* DPP8: one source-lane selector per lane in a group of eight.
 * NOTE(review): presumably each entry is a lane index 0-7 — confirm with
 * the emitter. */
struct DPP8_instruction : public Instruction {
   uint8_t lane_sel[8];
};
static_assert(sizeof(DPP8_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1438 
/* Describes which bytes of a dword are read or written, and whether the
 * value is sign-extended. Encoding: bits [1:0] = byte offset, bits [4:2] =
 * size in bytes (1, 2 or 4), bit 5 = sign extension. */
struct SubdwordSel {
   enum sdwa_sel : uint8_t {
      ubyte = 0x4,
      uword = 0x8,
      dword = 0x10,
      sext = 0x20,
      sbyte = ubyte | sext,
      sword = uword | sext,

      ubyte0 = ubyte,
      ubyte1 = ubyte | 1,
      ubyte2 = ubyte | 2,
      ubyte3 = ubyte | 3,
      sbyte0 = sbyte,
      sbyte1 = sbyte | 1,
      sbyte2 = sbyte | 2,
      sbyte3 = sbyte | 3,
      uword0 = uword,
      uword1 = uword | 2,
      sword0 = sword,
      sword1 = sword | 2,
   };

   SubdwordSel() : sel_((sdwa_sel)0) {}
   constexpr SubdwordSel(sdwa_sel sel) : sel_(sel) {}
   /* Build a selection from its components. */
   constexpr SubdwordSel(unsigned size, unsigned offset, bool sign_extend)
       : sel_((sdwa_sel)((sign_extend ? sext : 0) | size << 2 | offset))
   {}
   constexpr operator sdwa_sel() const { return sel_; }
   /* A zero selection means "unset". */
   explicit operator bool() const { return sel_ != 0; }

   /* Size in bytes of the selected sub-dword (1, 2 or 4). */
   constexpr unsigned size() const { return (sel_ >> 2) & 0x7; }
   /* Byte offset of the selection within the dword. */
   constexpr unsigned offset() const { return sel_ & 0x3; }
   constexpr bool sign_extend() const { return sel_ & sext; }
   /* Translate to the hardware SDWA sel encoding for the selection at the
    * given register byte offset: 0-3 select a byte, 4-5 a word, 6 the whole
    * dword. */
   constexpr unsigned to_sdwa_sel(unsigned reg_byte_offset) const
   {
      reg_byte_offset += offset();
      switch (size()) {
      case 1: return reg_byte_offset;
      case 2: return 4 + (reg_byte_offset >> 1);
      default: return 6;
      }
   }

private:
   sdwa_sel sel_;
};
1487 
1488 /**
1489  * Sub-Dword Addressing Format:
1490  * This format can be used for VOP1, VOP2 or VOPC instructions.
1491  *
1492  * omod and SGPR/constant operands are only available on GFX9+. For VOPC,
1493  * the definition doesn't have to be VCC on GFX9+.
1494  *
1495  */
/* Sub-dword addressing: per-source and destination byte selections plus the
 * usual VALU modifiers (see the comment block preceding this struct). */
struct SDWA_instruction : public Instruction {
   /* these destination modifiers aren't available with VOPC except for
    * clamp on GFX8 */
   SubdwordSel sel[2];
   SubdwordSel dst_sel;
   bool neg[2];
   bool abs[2];
   bool clamp : 1;
   uint8_t omod : 2; /* GFX9+ */
   uint8_t padding : 5;
};
static_assert(sizeof(SDWA_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1508 
/* VINTRP: attribute interpolation. NOTE(review): attribute/component
 * presumably select which input attribute channel is interpolated — confirm
 * with the instruction selection code. */
struct Interp_instruction : public Instruction {
   uint8_t attribute;
   uint8_t component;
   uint16_t padding;
};
static_assert(sizeof(Interp_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1515 
1516 /**
1517  * Local and Global Data Sharing instructions
1518  * Operand(0): ADDR - VGPR which supplies the address.
1519  * Operand(1): DATA0 - First data VGPR.
1520  * Operand(2): DATA1 - Second data VGPR.
1521  * Operand(n-1): M0 - LDS size.
1522  * Definition(0): VDST - Destination VGPR when results returned to VGPRs.
1523  *
1524  */
/* LDS/GDS instruction; operand layout is described in the comment block
 * preceding this struct. Two separate offsets are encoded, matching the
 * hardware's offset0/offset1 fields. */
struct DS_instruction : public Instruction {
   memory_sync_info sync;
   bool gds;
   int16_t offset0;
   int8_t offset1;
   uint8_t padding;
};
static_assert(sizeof(DS_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1533 
1534 /**
1535  * Vector Memory Untyped-buffer Instructions
1536  * Operand(0): SRSRC - Specifies which SGPR supplies T# (resource constant)
1537  * Operand(1): VADDR - Address source. Can carry an index and/or offset
1538  * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant)
1539  * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data
1540  *
1541  */
/* Untyped buffer access; operand layout is described in the comment block
 * preceding this struct. The bit-fields are packed into fixed allocation
 * units — do not reorder, the size is locked by the static_assert below. */
struct MUBUF_instruction : public Instruction {
   memory_sync_info sync;
   bool offen : 1;           /* Supply an offset from VGPR (VADDR) */
   bool idxen : 1;           /* Supply an index from VGPR (VADDR) */
   bool addr64 : 1;          /* SI, CIK: Address size is 64-bit */
   bool glc : 1;             /* globally coherent */
   bool dlc : 1;             /* NAVI: device level coherent */
   bool slc : 1;             /* system level coherent */
   bool tfe : 1;             /* texture fail enable */
   bool lds : 1;             /* Return read-data to LDS instead of VGPRs */
   uint16_t disable_wqm : 1; /* Require an exec mask without helper invocations */
   uint16_t offset : 12;     /* Unsigned byte offset - 12 bit */
   uint16_t swizzled : 1;
   uint16_t padding0 : 2;
   uint16_t vtx_binding : 6; /* 0 if this is not a vertex attribute load */
   uint16_t padding1 : 10;
};
static_assert(sizeof(MUBUF_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1560 
1561 /**
1562  * Vector Memory Typed-buffer Instructions
1563  * Operand(0): SRSRC - Specifies which SGPR supplies T# (resource constant)
1564  * Operand(1): VADDR - Address source. Can carry an index and/or offset
1565  * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant)
1566  * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data
1567  *
1568  */
/* Typed buffer access; operand layout is described in the comment block
 * preceding this struct. Field packing is size-locked by the static_assert. */
struct MTBUF_instruction : public Instruction {
   memory_sync_info sync;
   uint8_t dfmt : 4;         /* Data Format of data in memory buffer */
   uint8_t nfmt : 3;         /* Numeric format of data in memory */
   bool offen : 1;           /* Supply an offset from VGPR (VADDR) */
   uint16_t idxen : 1;       /* Supply an index from VGPR (VADDR) */
   uint16_t glc : 1;         /* globally coherent */
   uint16_t dlc : 1;         /* NAVI: device level coherent */
   uint16_t slc : 1;         /* system level coherent */
   uint16_t tfe : 1;         /* texture fail enable */
   uint16_t disable_wqm : 1; /* Require an exec mask without helper invocations */
   uint16_t vtx_binding : 6; /* 0 if this is not a vertex attribute load */
   uint16_t padding : 4;
   uint16_t offset; /* Unsigned byte offset - 12 bit */
};
static_assert(sizeof(MTBUF_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1585 
1586 /**
1587  * Vector Memory Image Instructions
1588  * Operand(0) SRSRC - Scalar GPR that specifies the resource constant.
1589  * Operand(1): SSAMP - Scalar GPR that specifies sampler constant.
1590  * Operand(2): VDATA - Vector GPR for write data or zero if TFE/LWE=1.
1591  * Operand(3): VADDR - Address source. Can carry an offset or an index.
1592  * Definition(0): VDATA - Vector GPR for read result.
1593  *
1594  */
/* Image memory access; operand layout is described in the comment block
 * preceding this struct. */
struct MIMG_instruction : public Instruction {
   memory_sync_info sync;
   uint8_t dmask;        /* Data VGPR enable mask */
   uint8_t dim : 3;      /* NAVI: dimensionality */
   bool unrm : 1;        /* Force address to be un-normalized */
   bool dlc : 1;         /* NAVI: device level coherent */
   bool glc : 1;         /* globally coherent */
   bool slc : 1;         /* system level coherent */
   bool tfe : 1;         /* texture fail enable */
   bool da : 1;          /* declare an array */
   bool lwe : 1;         /* LOD warning enable */
   bool r128 : 1;        /* NAVI: Texture resource size */
   bool a16 : 1;         /* VEGA, NAVI: Address components are 16-bits */
   bool d16 : 1;         /* Convert 32-bit data to 16-bit data */
   bool disable_wqm : 1; /* Require an exec mask without helper invocations */
   uint8_t padding0 : 2;
   uint8_t padding1;
   uint8_t padding2;
};
static_assert(sizeof(MIMG_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1615 
1616 /**
1617  * Flat/Scratch/Global Instructions
1618  * Operand(0): ADDR
1619  * Operand(1): SADDR
1620  * Operand(2) / Definition(0): DATA/VDST
1621  *
1622  */
/* Shared layout for FLAT, GLOBAL and SCRATCH instructions (see the cast
 * helpers flat()/global()/scratch() on Instruction). */
struct FLAT_instruction : public Instruction {
   memory_sync_info sync;
   bool slc : 1; /* system level coherent */
   bool glc : 1; /* globally coherent */
   bool dlc : 1; /* NAVI: device level coherent */
   bool lds : 1;
   bool nv : 1;
   bool disable_wqm : 1; /* Require an exec mask without helper invocations */
   uint8_t padding0 : 2;
   uint16_t offset; /* Vega/Navi only */
   uint16_t padding1;
};
static_assert(sizeof(FLAT_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1636 
/* Export (exp) instruction. NOTE(review): `dest` presumably selects the
 * export target (MRT/position/parameter) and `enabled_mask` the written
 * components — confirm with the export emission code. */
struct Export_instruction : public Instruction {
   uint8_t enabled_mask;
   uint8_t dest;
   bool compressed : 1;
   bool done : 1;
   bool valid_mask : 1;
   uint8_t padding0 : 5;
   uint8_t padding1;
};
static_assert(sizeof(Export_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1647 
/* Generic pseudo instruction (lowered before encoding). */
struct Pseudo_instruction : public Instruction {
   PhysReg scratch_sgpr; /* might not be valid if it's not needed */
   bool tmp_in_scc;
   uint8_t padding;
};
static_assert(sizeof(Pseudo_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1654 
struct Pseudo_branch_instruction : public Instruction {
   /* target[0] is the block index of the branch target.
    * For conditional branches, target[1] contains the fall-through alternative.
    * A value of 0 means the target has not been initialized (BB0 cannot be a branch target).
    */
   uint32_t target[2];
};
static_assert(sizeof(Pseudo_branch_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1663 
/* Barrier pseudo instruction: carries the memory synchronization info and
 * the execution scope the barrier applies to. */
struct Pseudo_barrier_instruction : public Instruction {
   memory_sync_info sync;
   sync_scope exec_scope;
};
static_assert(sizeof(Pseudo_barrier_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1669 
/* Reduction operations for Pseudo_reduction_instruction. Naming scheme:
 * <type-prefix><operation><bit width>, e.g. iadd32, fmin16, umax64. */
enum ReduceOp : uint16_t {
   // clang-format off
   iadd8, iadd16, iadd32, iadd64,
   imul8, imul16, imul32, imul64,
          fadd16, fadd32, fadd64,
          fmul16, fmul32, fmul64,
   imin8, imin16, imin32, imin64,
   imax8, imax16, imax32, imax64,
   umin8, umin16, umin32, umin64,
   umax8, umax16, umax32, umax64,
          fmin16, fmin32, fmin64,
          fmax16, fmax32, fmax64,
   iand8, iand16, iand32, iand64,
   ior8, ior16, ior32, ior64,
   ixor8, ixor16, ixor32, ixor64,
   num_reduce_ops,
   // clang-format on
};
1688 
/**
 * Subgroup Reduction Instructions, everything except for the data to be
 * reduced and the result as inserted by setup_reduce_temp().
 * Operand(0): data to be reduced
 * Operand(1): reduce temporary
 * Operand(2): vector temporary
 * Definition(0): result
 * Definition(1): scalar temporary
 * Definition(2): scalar identity temporary (not used to store identity on GFX10)
 * Definition(3): scc clobber
 * Definition(4): vcc clobber
 */
struct Pseudo_reduction_instruction : public Instruction {
   ReduceOp reduce_op;
   uint16_t cluster_size; // must be 0 for scans
};
static_assert(sizeof(Pseudo_reduction_instruction) == sizeof(Instruction) + 4,
              "Unexpected padding");
1708 
/* Deleter used by aco_ptr: instructions are allocated with calloc() in
 * create_instruction(), so they must be released with free(), not delete. */
struct instr_deleter_functor {
   void operator()(void* ptr) { free(ptr); }
};

/* Owning pointer type for calloc-allocated ACO instructions. */
template <typename T> using aco_ptr = std::unique_ptr<T, instr_deleter_functor>;
1714 
1715 template <typename T>
1716 T*
create_instruction(aco_opcode opcode,Format format,uint32_t num_operands,uint32_t num_definitions)1717 create_instruction(aco_opcode opcode, Format format, uint32_t num_operands,
1718                    uint32_t num_definitions)
1719 {
1720    std::size_t size =
1721       sizeof(T) + num_operands * sizeof(Operand) + num_definitions * sizeof(Definition);
1722    char* data = (char*)calloc(1, size);
1723    T* inst = (T*)data;
1724 
1725    inst->opcode = opcode;
1726    inst->format = format;
1727 
1728    uint16_t operands_offset = data + sizeof(T) - (char*)&inst->operands;
1729    inst->operands = aco::span<Operand>(operands_offset, num_operands);
1730    uint16_t definitions_offset = (char*)inst->operands.end() - (char*)&inst->definitions;
1731    inst->definitions = aco::span<Definition>(definitions_offset, num_definitions);
1732 
1733    return inst;
1734 }
1735 
1736 constexpr bool
usesModifiers()1737 Instruction::usesModifiers() const noexcept
1738 {
1739    if (isDPP() || isSDWA())
1740       return true;
1741 
1742    if (isVOP3P()) {
1743       const VOP3P_instruction& vop3p = this->vop3p();
1744       for (unsigned i = 0; i < operands.size(); i++) {
1745          if (vop3p.neg_lo[i] || vop3p.neg_hi[i])
1746             return true;
1747 
1748          /* opsel_hi must be 1 to not be considered a modifier - even for constants */
1749          if (!(vop3p.opsel_hi & (1 << i)))
1750             return true;
1751       }
1752       return vop3p.opsel_lo || vop3p.clamp;
1753    } else if (isVOP3()) {
1754       const VOP3_instruction& vop3 = this->vop3();
1755       for (unsigned i = 0; i < operands.size(); i++) {
1756          if (vop3.abs[i] || vop3.neg[i])
1757             return true;
1758       }
1759       return vop3.opsel || vop3.clamp || vop3.omod;
1760    }
1761    return false;
1762 }
1763 
1764 constexpr bool
is_phi(Instruction * instr)1765 is_phi(Instruction* instr)
1766 {
1767    return instr->opcode == aco_opcode::p_phi || instr->opcode == aco_opcode::p_linear_phi;
1768 }
1769 
1770 static inline bool
is_phi(aco_ptr<Instruction> & instr)1771 is_phi(aco_ptr<Instruction>& instr)
1772 {
1773    return is_phi(instr.get());
1774 }
1775 
/* Memory synchronization info attached to the instruction, if any. */
memory_sync_info get_sync_info(const Instruction* instr);

/* Whether the instruction can be removed given per-temp use counts. */
bool is_dead(const std::vector<uint16_t>& uses, Instruction* instr);

/* Encoding capability queries (opsel/16-bit/SDWA/DPP). */
bool can_use_opsel(chip_class chip, aco_opcode op, int idx);
bool instr_is_16bit(chip_class chip, aco_opcode op);
bool can_use_SDWA(chip_class chip, const aco_ptr<Instruction>& instr, bool pre_ra);
bool can_use_DPP(const aco_ptr<Instruction>& instr, bool pre_ra, bool dpp8);
/* updates "instr" and returns the old instruction (or NULL if no update was needed) */
aco_ptr<Instruction> convert_to_SDWA(chip_class chip, aco_ptr<Instruction>& instr);
aco_ptr<Instruction> convert_to_DPP(aco_ptr<Instruction>& instr, bool dpp8);
bool needs_exec_mask(const Instruction* instr);

/* Mappings between related comparison opcodes. */
aco_opcode get_ordered(aco_opcode op);
aco_opcode get_unordered(aco_opcode op);
aco_opcode get_inverse(aco_opcode op);
aco_opcode get_f32_cmp(aco_opcode op);
unsigned get_cmp_bitsize(aco_opcode op);
bool is_cmp(aco_opcode op);

bool can_swap_operands(aco_ptr<Instruction>& instr, aco_opcode* new_op);

/* Identity value for the given reduction op; idx presumably selects the dword
 * for 64-bit identities — confirm in the implementation. */
uint32_t get_reduction_identity(ReduceOp op, unsigned idx);

unsigned get_mimg_nsa_dwords(const Instruction* instr);

/* Whether two memory instructions should be placed in the same clause. */
bool should_form_clause(const Instruction* a, const Instruction* b);
1803 
/* Role flags for a basic block (bitmask, stored in Block::kind).
 * Bits 11 and 14 are currently unassigned. */
enum block_kind {
   /* uniform indicates that leaving this block,
    * all actives lanes stay active */
   block_kind_uniform = 1 << 0,
   block_kind_top_level = 1 << 1,
   block_kind_loop_preheader = 1 << 2,
   block_kind_loop_header = 1 << 3,
   block_kind_loop_exit = 1 << 4,
   block_kind_continue = 1 << 5,
   block_kind_break = 1 << 6,
   block_kind_continue_or_break = 1 << 7,
   block_kind_branch = 1 << 8,
   block_kind_merge = 1 << 9,
   block_kind_invert = 1 << 10,
   block_kind_uses_discard = 1 << 12,
   block_kind_needs_lowering = 1 << 13,
   block_kind_export_end = 1 << 15,
};
1822 
1823 struct RegisterDemand {
1824    constexpr RegisterDemand() = default;
RegisterDemandRegisterDemand1825    constexpr RegisterDemand(const int16_t v, const int16_t s) noexcept : vgpr{v}, sgpr{s} {}
1826    int16_t vgpr = 0;
1827    int16_t sgpr = 0;
1828 
1829    constexpr friend bool operator==(const RegisterDemand a, const RegisterDemand b) noexcept
1830    {
1831       return a.vgpr == b.vgpr && a.sgpr == b.sgpr;
1832    }
1833 
exceedsRegisterDemand1834    constexpr bool exceeds(const RegisterDemand other) const noexcept
1835    {
1836       return vgpr > other.vgpr || sgpr > other.sgpr;
1837    }
1838 
1839    constexpr RegisterDemand operator+(const Temp t) const noexcept
1840    {
1841       if (t.type() == RegType::sgpr)
1842          return RegisterDemand(vgpr, sgpr + t.size());
1843       else
1844          return RegisterDemand(vgpr + t.size(), sgpr);
1845    }
1846 
1847    constexpr RegisterDemand operator+(const RegisterDemand other) const noexcept
1848    {
1849       return RegisterDemand(vgpr + other.vgpr, sgpr + other.sgpr);
1850    }
1851 
1852    constexpr RegisterDemand operator-(const RegisterDemand other) const noexcept
1853    {
1854       return RegisterDemand(vgpr - other.vgpr, sgpr - other.sgpr);
1855    }
1856 
1857    constexpr RegisterDemand& operator+=(const RegisterDemand other) noexcept
1858    {
1859       vgpr += other.vgpr;
1860       sgpr += other.sgpr;
1861       return *this;
1862    }
1863 
1864    constexpr RegisterDemand& operator-=(const RegisterDemand other) noexcept
1865    {
1866       vgpr -= other.vgpr;
1867       sgpr -= other.sgpr;
1868       return *this;
1869    }
1870 
1871    constexpr RegisterDemand& operator+=(const Temp t) noexcept
1872    {
1873       if (t.type() == RegType::sgpr)
1874          sgpr += t.size();
1875       else
1876          vgpr += t.size();
1877       return *this;
1878    }
1879 
1880    constexpr RegisterDemand& operator-=(const Temp t) noexcept
1881    {
1882       if (t.type() == RegType::sgpr)
1883          sgpr -= t.size();
1884       else
1885          vgpr -= t.size();
1886       return *this;
1887    }
1888 
updateRegisterDemand1889    constexpr void update(const RegisterDemand other) noexcept
1890    {
1891       vgpr = std::max(vgpr, other.vgpr);
1892       sgpr = std::max(sgpr, other.sgpr);
1893    }
1894 };
1895 
/* CFG: one basic block of the program. */
struct Block {
   float_mode fp_mode;  /* copied from Program::next_fp_mode on insertion */
   unsigned index;      /* position inside Program::blocks */
   unsigned offset = 0; /* NOTE(review): presumably the code offset after assembly — confirm */
   std::vector<aco_ptr<Instruction>> instructions;
   /* predecessor/successor block indices in the logical and linear CFG */
   std::vector<unsigned> logical_preds;
   std::vector<unsigned> linear_preds;
   std::vector<unsigned> logical_succs;
   std::vector<unsigned> linear_succs;
   RegisterDemand register_demand = RegisterDemand();
   uint16_t loop_nest_depth = 0;
   uint16_t divergent_if_logical_depth = 0;
   uint16_t uniform_if_depth = 0;
   uint16_t kind = 0; /* bitmask of block_kind flags */
   /* immediate dominators (block indices); -1 if not computed/none */
   int logical_idom = -1;
   int linear_idom = -1;

   /* this information is needed for predecessors to blocks with phis when
    * moving out of ssa */
   bool scc_live_out = false;

   Block() : index(0) {}
};
1920 
/*
 * Shader stages as provided in Vulkan by the application. Contrast this to HWStage.
 * Each stage is a distinct bit so that merged stages can be expressed as a mask
 * (see operator| below).
 */
enum class SWStage : uint16_t {
   None = 0,
   VS = 1 << 0,     /* Vertex Shader */
   GS = 1 << 1,     /* Geometry Shader */
   TCS = 1 << 2,    /* Tessellation Control aka Hull Shader */
   TES = 1 << 3,    /* Tessellation Evaluation aka Domain Shader */
   FS = 1 << 4,     /* Fragment aka Pixel Shader */
   CS = 1 << 5,     /* Compute Shader */
   TS = 1 << 6,     /* Task Shader */
   MS = 1 << 7,     /* Mesh Shader */
   GSCopy = 1 << 8, /* GS Copy Shader (internal) */

   /* Stage combinations merged to run on a single HWStage */
   VS_GS = VS | GS,
   VS_TCS = VS | TCS,
   TES_GS = TES | GS,
};
1941 
1942 constexpr SWStage
1943 operator|(SWStage a, SWStage b)
1944 {
1945    return static_cast<SWStage>(static_cast<uint16_t>(a) | static_cast<uint16_t>(b));
1946 }
1947 
/*
 * Shader stages as running on the AMD GPU.
 *
 * The relation between HWStages and SWStages is not a one-to-one mapping:
 * Some SWStages are merged by ACO to run on a single HWStage.
 * See README.md for details.
 */
enum class HWStage : uint8_t {
   VS,
   ES, /* Export shader: pre-GS (VS or TES) on GFX6-8. Combined into GS on GFX9 (and GFX10/legacy). */
   GS,  /* Geometry shader on GFX10/legacy and GFX6-9. */
   NGG, /* Primitive shader, used to implement VS, TES, GS. */
   LS,  /* Local shader: pre-TCS (VS) on GFX6-8. Combined into HS on GFX9 (and GFX10/legacy). */
   HS,  /* Hull shader: TCS on GFX6-8. Merged VS and TCS on GFX9-10. */
   FS,  /* Fragment/pixel shader. */
   CS,  /* Compute shader. */
};
1965 
1966 /*
1967  * Set of SWStages to be merged into a single shader paired with the
1968  * HWStage it will run on.
1969  */
1970 struct Stage {
1971    constexpr Stage() = default;
1972 
StageStage1973    explicit constexpr Stage(HWStage hw_, SWStage sw_) : sw(sw_), hw(hw_) {}
1974 
1975    /* Check if the given SWStage is included */
hasStage1976    constexpr bool has(SWStage stage) const
1977    {
1978       return (static_cast<uint16_t>(sw) & static_cast<uint16_t>(stage));
1979    }
1980 
num_sw_stagesStage1981    unsigned num_sw_stages() const { return util_bitcount(static_cast<uint16_t>(sw)); }
1982 
1983    constexpr bool operator==(const Stage& other) const { return sw == other.sw && hw == other.hw; }
1984 
1985    constexpr bool operator!=(const Stage& other) const { return sw != other.sw || hw != other.hw; }
1986 
1987    /* Mask of merged software stages */
1988    SWStage sw = SWStage::None;
1989 
1990    /* Active hardware stage */
1991    HWStage hw{};
1992 };
1993 
/* possible settings of Program::stage */
static constexpr Stage vertex_vs(HWStage::VS, SWStage::VS);
static constexpr Stage fragment_fs(HWStage::FS, SWStage::FS);
static constexpr Stage compute_cs(HWStage::CS, SWStage::CS);
static constexpr Stage tess_eval_vs(HWStage::VS, SWStage::TES);
static constexpr Stage gs_copy_vs(HWStage::VS, SWStage::GSCopy);
/* Mesh shading pipeline */
static constexpr Stage task_cs(HWStage::CS, SWStage::TS);
static constexpr Stage mesh_ngg(HWStage::NGG, SWStage::MS);
/* GFX10/NGG */
static constexpr Stage vertex_ngg(HWStage::NGG, SWStage::VS);
static constexpr Stage vertex_geometry_ngg(HWStage::NGG, SWStage::VS_GS);
static constexpr Stage tess_eval_ngg(HWStage::NGG, SWStage::TES);
static constexpr Stage tess_eval_geometry_ngg(HWStage::NGG, SWStage::TES_GS);
/* GFX9 (and GFX10 if NGG isn't used) */
static constexpr Stage vertex_geometry_gs(HWStage::GS, SWStage::VS_GS);
static constexpr Stage vertex_tess_control_hs(HWStage::HS, SWStage::VS_TCS);
static constexpr Stage tess_eval_geometry_gs(HWStage::GS, SWStage::TES_GS);
/* pre-GFX9 */
static constexpr Stage vertex_ls(HWStage::LS, SWStage::VS); /* vertex before tessellation control */
static constexpr Stage vertex_es(HWStage::ES, SWStage::VS); /* vertex before geometry */
static constexpr Stage tess_control_hs(HWStage::HS, SWStage::TCS);
static constexpr Stage tess_eval_es(HWStage::ES,
                                    SWStage::TES); /* tessellation evaluation before geometry */
static constexpr Stage geometry_gs(HWStage::GS, SWStage::GS);
2019 
/* Indices into Program::statistics (filled when Program::collect_statistics is set). */
enum statistic {
   statistic_hash,
   statistic_instructions,
   statistic_copies,
   statistic_branches,
   statistic_latency,
   statistic_inv_throughput,
   statistic_vmem_clauses,
   statistic_smem_clauses,
   statistic_sgpr_presched,
   statistic_vgpr_presched,
   num_statistics
};
2033 
/* Static hardware properties of the target device. */
struct DeviceInfo {
   uint16_t lds_encoding_granule;
   uint16_t lds_alloc_granule;
   uint32_t lds_limit; /* in bytes */
   bool has_16bank_lds;
   /* register counts physically present vs. addressable limits */
   uint16_t physical_sgprs;
   uint16_t physical_vgprs;
   uint16_t vgpr_limit;
   uint16_t sgpr_limit;
   uint16_t sgpr_alloc_granule;
   uint16_t vgpr_alloc_granule; /* must be power of two */
   unsigned max_wave64_per_simd;
   unsigned simd_per_cu;
   bool has_fast_fma32 = false;
   bool has_mac_legacy32 = false;
   bool fused_mad_mix = false;
   bool xnack_enabled = false;
   bool sram_ecc_enabled = false;
};
2053 
/* Coarse compilation phase, recorded in Program::progress. */
enum class CompilationProgress {
   after_isel,
   after_spilling,
   after_ra,
};
2059 
/* The complete shader program being compiled: CFG, per-temp register classes,
 * target information and compilation state. */
class Program final {
public:
   std::vector<Block> blocks;
   /* RegClass of each temporary, indexed by temp id. allocationID starts at 1
    * and temp_rc starts with one placeholder entry, so indices stay in sync. */
   std::vector<RegClass> temp_rc = {s1};
   RegisterDemand max_reg_demand = RegisterDemand();
   uint16_t num_waves = 0;
   uint16_t max_waves = 0; /* maximum number of waves, regardless of register usage */
   ac_shader_config* config;
   const struct radv_shader_info* info;
   enum chip_class chip_class;
   enum radeon_family family;
   DeviceInfo dev;
   unsigned wave_size;
   RegClass lane_mask;
   Stage stage;
   bool needs_exact = false; /* there exists an instruction with disable_wqm = true */
   bool needs_wqm = false;   /* there exists a p_wqm instruction */

   std::vector<uint8_t> constant_data;
   Temp private_segment_buffer;
   Temp scratch_offset;

   uint16_t min_waves = 0;
   unsigned workgroup_size; /* if known; otherwise UINT_MAX */
   bool wgp_mode;
   bool early_rast = false; /* whether rasterization can start as soon as the 1st DONE pos export */

   bool needs_vcc = false;
   bool needs_flat_scr = false;

   CompilationProgress progress;

   bool collect_statistics = false;
   uint32_t statistics[num_statistics];

   /* state inherited by newly inserted blocks (see insert_block()) */
   float_mode next_fp_mode;
   unsigned next_loop_depth = 0;
   unsigned next_divergent_if_logical_depth = 0;
   unsigned next_uniform_if_depth = 0;

   std::vector<Definition> vs_inputs;

   /* Compiler debug output sink: messages go to "output" and, if set, the
    * "func" callback with "private_data". */
   struct {
      FILE* output = stderr;
      bool shorten_messages = false;
      void (*func)(void* private_data, enum radv_compiler_debug_level level, const char* message);
      void* private_data;
   } debug;

   /* Allocates a fresh temp id with the given RegClass.
    * Ids are limited to 24 bits (16777215 == 2^24 - 1). */
   uint32_t allocateId(RegClass rc)
   {
      assert(allocationID <= 16777215);
      temp_rc.push_back(rc);
      return allocationID++;
   }

   /* Reserves "amount" consecutive temp ids (RegClasses default-initialized). */
   void allocateRange(unsigned amount)
   {
      assert(allocationID + amount <= 16777216);
      temp_rc.resize(temp_rc.size() + amount);
      allocationID += amount;
   }

   Temp allocateTmp(RegClass rc) { return Temp(allocateId(rc), rc); }

   /* Next id that allocateId() would return, without allocating it. */
   uint32_t peekAllocationId() { return allocationID; }

   friend void reindex_ssa(Program* program);
   friend void reindex_ssa(Program* program, std::vector<IDSet>& live_out);

   Block* create_and_insert_block()
   {
      Block block;
      return insert_block(std::move(block));
   }

   /* Appends the block, assigning its index and the current next_* state. */
   Block* insert_block(Block&& block)
   {
      block.index = blocks.size();
      block.fp_mode = next_fp_mode;
      block.loop_nest_depth = next_loop_depth;
      block.divergent_if_logical_depth = next_divergent_if_logical_depth;
      block.uniform_if_depth = next_uniform_if_depth;
      blocks.emplace_back(std::move(block));
      return &blocks.back();
   }

private:
   uint32_t allocationID = 1;
};
2150 
/* Result of live_var_analysis(). */
struct live {
   /* live temps out per block */
   std::vector<IDSet> live_out;
   /* register demand (sgpr/vgpr) per instruction per block */
   std::vector<std::vector<RegisterDemand>> register_demand;
};
2157 
/* Testing knobs for register_allocation(). */
struct ra_test_policy {
   /* Force RA to always use its pessimistic fallback algorithm */
   bool skip_optimistic_path = false;
};
2162 
void init();

void init_program(Program* program, Stage stage, const struct radv_shader_info* info,
                  enum chip_class chip_class, enum radeon_family family, bool wgp_mode,
                  ac_shader_config* config);

/* Instruction selection entry points (NIR -> ACO IR). */
void select_program(Program* program, unsigned shader_count, struct nir_shader* const* shaders,
                    ac_shader_config* config, const struct radv_nir_compiler_options* options,
                    const struct radv_shader_info* info,
                    const struct radv_shader_args* args);
void select_gs_copy_shader(Program* program, struct nir_shader* gs_shader, ac_shader_config* config,
                           const struct radv_nir_compiler_options* options,
                           const struct radv_shader_info* info,
                           const struct radv_shader_args* args);
void select_trap_handler_shader(Program* program, struct nir_shader* shader,
                                ac_shader_config* config,
                                const struct radv_nir_compiler_options* options,
                                const struct radv_shader_info* info,
                                const struct radv_shader_args* args);
void select_vs_prolog(Program* program, const struct radv_vs_prolog_key* key,
                      ac_shader_config* config,
                      const struct radv_nir_compiler_options* options,
                      const struct radv_shader_info* info,
                      const struct radv_shader_args* args,
                      unsigned* num_preserved_sgprs);

/* Compilation passes over the IR. */
void lower_phis(Program* program);
void calc_min_waves(Program* program);
void update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand);
live live_var_analysis(Program* program);
std::vector<uint16_t> dead_code_analysis(Program* program);
void dominator_tree(Program* program);
void insert_exec_mask(Program* program);
void value_numbering(Program* program);
void optimize(Program* program);
void optimize_postRA(Program* program);
void setup_reduce_temp(Program* program);
void lower_to_cssa(Program* program, live& live_vars);
void register_allocation(Program* program, std::vector<IDSet>& live_out_per_block,
                         ra_test_policy = {});
void ssa_elimination(Program* program);
void lower_to_hw_instr(Program* program);
void schedule_program(Program* program, live& live_vars);
void spill(Program* program, live& live_vars);
void insert_wait_states(Program* program);
void insert_NOPs(Program* program);
void form_hard_clauses(Program* program);
/* Emits machine code into "code"; returns the size of the emitted code (unit:
 * presumably dwords, given the uint32_t vector — confirm in the implementation). */
unsigned emit_program(Program* program, std::vector<uint32_t>& code);
/**
 * Returns true if print_asm can disassemble the given program for the current build/runtime
 * configuration
 */
bool check_print_asm_support(Program* program);
bool print_asm(Program* program, std::vector<uint32_t>& binary, unsigned exec_size, FILE* output);
bool validate_ir(Program* program);
bool validate_ra(Program* program);
#ifndef NDEBUG
void perfwarn(Program* program, bool cond, const char* msg, Instruction* instr = NULL);
#else
/* no-op in release builds */
#define perfwarn(program, cond, msg, ...)                                                          \
   do {                                                                                            \
   } while (0)
#endif

void collect_presched_stats(Program* program);
void collect_preasm_stats(Program* program);
void collect_postasm_stats(Program* program, const std::vector<uint32_t>& code);

/* Flags for the IR pretty-printers below. */
enum print_flags {
   print_no_ssa = 0x1,
   print_perf_info = 0x2,
   print_kill = 0x4,
   print_live_vars = 0x8,
};

void aco_print_operand(const Operand* operand, FILE* output, unsigned flags = 0);
void aco_print_instr(const Instruction* instr, FILE* output, unsigned flags = 0);
void aco_print_program(const Program* program, FILE* output, unsigned flags = 0);
void aco_print_program(const Program* program, FILE* output, const live& live_vars,
                       unsigned flags = 0);

void _aco_perfwarn(Program* program, const char* file, unsigned line, const char* fmt, ...);
void _aco_err(Program* program, const char* file, unsigned line, const char* fmt, ...);

#define aco_perfwarn(program, ...) _aco_perfwarn(program, __FILE__, __LINE__, __VA_ARGS__)
#define aco_err(program, ...)      _aco_err(program, __FILE__, __LINE__, __VA_ARGS__)

/* utilities for dealing with register demand */
RegisterDemand get_live_changes(aco_ptr<Instruction>& instr);
RegisterDemand get_temp_registers(aco_ptr<Instruction>& instr);
RegisterDemand get_demand_before(RegisterDemand demand, aco_ptr<Instruction>& instr,
                                 aco_ptr<Instruction>& instr_before);

/* number of sgprs that need to be allocated but might not be addressable as s0-s105 */
uint16_t get_extra_sgprs(Program* program);

/* get number of sgprs/vgprs allocated required to address a number of sgprs/vgprs */
uint16_t get_sgpr_alloc(Program* program, uint16_t addressable_sgprs);
uint16_t get_vgpr_alloc(Program* program, uint16_t addressable_vgprs);

/* return number of addressable sgprs/vgprs for max_waves */
uint16_t get_addr_sgpr_from_waves(Program* program, uint16_t max_waves);
uint16_t get_addr_vgpr_from_waves(Program* program, uint16_t max_waves);
2266 
/* Static per-opcode information tables, indexed by aco_opcode. */
typedef struct {
   /* hardware opcode encodings per generation */
   const int16_t opcode_gfx7[static_cast<int>(aco_opcode::num_opcodes)];
   const int16_t opcode_gfx9[static_cast<int>(aco_opcode::num_opcodes)];
   const int16_t opcode_gfx10[static_cast<int>(aco_opcode::num_opcodes)];
   const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> can_use_input_modifiers;
   const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> can_use_output_modifiers;
   const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> is_atomic;
   const char* name[static_cast<int>(aco_opcode::num_opcodes)];
   const aco::Format format[static_cast<int>(aco_opcode::num_opcodes)];
   /* sizes used for input/output modifiers and constants */
   const unsigned operand_size[static_cast<int>(aco_opcode::num_opcodes)];
   const instr_class classes[static_cast<int>(aco_opcode::num_opcodes)];
} Info;

/* Singleton opcode-info table (defined in generated code). */
extern const Info instr_info;
2282 
2283 } // namespace aco
2284 
2285 #endif /* ACO_IR_H */
2286