1 /*
2  * Copyright © 2018 Valve Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  *
23  */
24 
25 #ifndef ACO_IR_H
26 #define ACO_IR_H
27 
28 #include "aco_opcodes.h"
29 #include "aco_util.h"
30 
31 #include "vulkan/radv_shader.h"
32 
33 #include "nir.h"
34 
35 #include <bitset>
36 #include <memory>
37 #include <vector>
38 
39 struct radv_shader_args;
40 struct radv_shader_info;
41 struct radv_vs_prolog_key;
42 
43 namespace aco {
44 
45 extern uint64_t debug_flags;
46 
/* Bit values for the global debug_flags bitmask declared above (presumably
 * parsed from a debug environment variable — confirm in the implementation). */
enum {
   DEBUG_VALIDATE_IR = 0x1,     /* validate the IR */
   DEBUG_VALIDATE_RA = 0x2,     /* validate the register assignment */
   DEBUG_PERFWARN = 0x4,
   DEBUG_FORCE_WAITCNT = 0x8,
   DEBUG_NO_VN = 0x10,          /* disable value numbering */
   DEBUG_NO_OPT = 0x20,         /* disable optimizations */
   DEBUG_NO_SCHED = 0x40,       /* disable scheduling */
   DEBUG_PERF_INFO = 0x80,
   DEBUG_LIVE_INFO = 0x100,
};
58 
59 /**
60  * Representation of the instruction's microcode encoding format
61  * Note: Some Vector ALU Formats can be combined, such that:
62  * - VOP2* | VOP3 represents a VOP2 instruction in VOP3 encoding
63  * - VOP2* | DPP represents a VOP2 instruction with data parallel primitive.
64  * - VOP2* | SDWA represents a VOP2 instruction with sub-dword addressing.
65  *
66  * (*) The same is applicable for VOP1 and VOPC instructions.
67  */
enum class Format : std::uint16_t {
   /* Pseudo Instruction Format */
   PSEUDO = 0,
   /* Scalar ALU & Control Formats */
   SOP1 = 1,
   SOP2 = 2,
   SOPK = 3,
   SOPP = 4,
   SOPC = 5,
   /* Scalar Memory Format */
   SMEM = 6,
   /* LDS/GDS Format */
   DS = 8,
   /* Vector Memory Buffer Formats */
   MTBUF = 9,
   MUBUF = 10,
   /* Vector Memory Image Format */
   MIMG = 11,
   /* Export Format */
   EXP = 12,
   /* Flat Formats */
   FLAT = 13,
   GLOBAL = 14,
   SCRATCH = 15,

   PSEUDO_BRANCH = 16,
   PSEUDO_BARRIER = 17,
   PSEUDO_REDUCTION = 18,

   /* Vector ALU Formats */
   VOP3P = 19,
   /* The formats below are single bits so they can be OR'd onto the base
    * VOP1/VOP2/VOPC encodings, as described in the comment above (see also
    * asVOP3(), asSDWA() and withoutDPP()). */
   VOP1 = 1 << 8,
   VOP2 = 1 << 9,
   VOPC = 1 << 10,
   VOP3 = 1 << 11,
   /* Vector Parameter Interpolation Format */
   VINTRP = 1 << 12,
   DPP16 = 1 << 13,
   SDWA = 1 << 14,
   DPP8 = 1 << 15,
};
109 
/* Coarse classification of instructions (presumably used for latency and
 * performance modelling — confirm at the use sites of instr_class). */
enum class instr_class : uint8_t {
   valu32 = 0,
   valu_convert32 = 1,
   valu64 = 2,
   valu_quarter_rate32 = 3,
   valu_fma = 4,
   valu_transcendental32 = 5,
   valu_double = 6,
   valu_double_add = 7,
   valu_double_convert = 8,
   valu_double_transcendental = 9,
   salu = 10,
   smem = 11,
   barrier = 12,
   branch = 13,
   sendmsg = 14,
   ds = 15,
   exp = 16,
   vmem = 17,
   waitcnt = 18,
   other = 19,
   count, /* number of classes; keep last */
};
133 
/* Bitmask describing the kinds of memory an access may touch; values are
 * combined in memory_sync_info::storage. */
enum storage_class : uint8_t {
   storage_none = 0x0,   /* no synchronization and can be reordered around aliasing stores */
   storage_buffer = 0x1, /* SSBOs and global memory */
   storage_atomic_counter = 0x2, /* not used for Vulkan */
   storage_image = 0x4,
   storage_shared = 0x8,       /* or TCS output */
   storage_vmem_output = 0x10, /* GS or TCS output stores using VMEM */
   storage_task_payload = 0x20,/* Task-Mesh payload */
   storage_scratch = 0x40,
   storage_vgpr_spill = 0x80,
   storage_count = 8, /* number of storage bits above, not counting storage_none */
};
146 
enum memory_semantics : uint8_t {
   semantic_none = 0x0,
   /* for loads: don't move any access after this load to before this load (even other loads)
    * for barriers: don't move any access after the barrier to before any
    * atomics/control_barriers/sendmsg_gs_done before the barrier */
   semantic_acquire = 0x1,
   /* for stores: don't move any access before this store to after this store
    * for barriers: don't move any access before the barrier to after any
    * atomics/control_barriers/sendmsg_gs_done after the barrier */
   semantic_release = 0x2,

   /* the rest are for load/stores/atomics only */
   /* cannot be DCE'd or CSE'd */
   semantic_volatile = 0x4,
   /* does not interact with barriers and assumes this lane is the only lane
    * accessing this memory */
   semantic_private = 0x8,
   /* this operation can be reordered around operations of the same storage.
    * says nothing about barriers */
   semantic_can_reorder = 0x10,
   /* this is an atomic instruction (may only read or write memory) */
   semantic_atomic = 0x20,
   /* this instruction both reads and writes memory */
   semantic_rmw = 0x40,

   /* combined acquire + release ordering */
   semantic_acqrel = semantic_acquire | semantic_release,
   /* atomic read-modify-write; includes volatile so it is never DCE'd/CSE'd */
   semantic_atomicrmw = semantic_volatile | semantic_atomic | semantic_rmw,
};
175 
/* Synchronization scopes, ordered from narrowest (one invocation) to widest
 * (the whole device). */
enum sync_scope : uint8_t {
   scope_invocation = 0,
   scope_subgroup = 1,
   scope_workgroup = 2,
   scope_queuefamily = 3,
   scope_device = 4,
};
183 
184 struct memory_sync_info {
memory_sync_infomemory_sync_info185    memory_sync_info() : storage(storage_none), semantics(semantic_none), scope(scope_invocation) {}
186    memory_sync_info(int storage_, int semantics_ = 0, sync_scope scope_ = scope_invocation)
187        : storage((storage_class)storage_), semantics((memory_semantics)semantics_), scope(scope_)
188    {}
189 
190    storage_class storage : 8;
191    memory_semantics semantics : 8;
192    sync_scope scope : 8;
193 
194    bool operator==(const memory_sync_info& rhs) const
195    {
196       return storage == rhs.storage && semantics == rhs.semantics && scope == rhs.scope;
197    }
198 
can_reordermemory_sync_info199    bool can_reorder() const
200    {
201       if (semantics & semantic_acqrel)
202          return false;
203       /* Also check storage so that zero-initialized memory_sync_info can be
204        * reordered. */
205       return (!storage || (semantics & semantic_can_reorder)) && !(semantics & semantic_volatile);
206    }
207 };
208 static_assert(sizeof(memory_sync_info) == 3, "Unexpected padding");
209 
/* Float rounding modes; the values match the hardware MODE register encoding
 * (see float_mode below). */
enum fp_round {
   fp_round_ne = 0, /* round to nearest even */
   fp_round_pi = 1, /* round towards positive infinity */
   fp_round_ni = 2, /* round towards negative infinity */
   fp_round_tz = 3, /* round towards zero (truncate) */
};
216 
/* Denormal handling; the values match the MODE register encoding (see
 * float_mode). keep_in/keep_out presumably distinguish input vs. output
 * denormals — confirm against the ISA documentation. */
enum fp_denorm {
   /* Note that v_rcp_f32, v_exp_f32, v_log_f32, v_sqrt_f32, v_rsq_f32 and
    * v_mad_f32/v_madak_f32/v_madmk_f32/v_mac_f32 always flush denormals. */
   fp_denorm_flush = 0x0,
   fp_denorm_keep_in = 0x1,
   fp_denorm_keep_out = 0x2,
   fp_denorm_keep = 0x3,
};
225 
226 struct float_mode {
227    /* matches encoding of the MODE register */
228    union {
229       struct {
230          fp_round round32 : 2;
231          fp_round round16_64 : 2;
232          unsigned denorm32 : 2;
233          unsigned denorm16_64 : 2;
234       };
235       struct {
236          uint8_t round : 4;
237          uint8_t denorm : 4;
238       };
239       uint8_t val = 0;
240    };
241    /* if false, optimizations which may remove infs/nan/-0.0 can be done */
242    bool preserve_signed_zero_inf_nan32 : 1;
243    bool preserve_signed_zero_inf_nan16_64 : 1;
244    /* if false, optimizations which may remove denormal flushing can be done */
245    bool must_flush_denorms32 : 1;
246    bool must_flush_denorms16_64 : 1;
247    bool care_about_round32 : 1;
248    bool care_about_round16_64 : 1;
249 
250    /* Returns true if instructions using the mode "other" can safely use the
251     * current one instead. */
canReplacefloat_mode252    bool canReplace(float_mode other) const noexcept
253    {
254       return val == other.val &&
255              (preserve_signed_zero_inf_nan32 || !other.preserve_signed_zero_inf_nan32) &&
256              (preserve_signed_zero_inf_nan16_64 || !other.preserve_signed_zero_inf_nan16_64) &&
257              (must_flush_denorms32 || !other.must_flush_denorms32) &&
258              (must_flush_denorms16_64 || !other.must_flush_denorms16_64) &&
259              (care_about_round32 || !other.care_about_round32) &&
260              (care_about_round16_64 || !other.care_about_round16_64);
261    }
262 };
263 
/* Immediate of an s_waitcnt-style instruction: one value per hardware wait
 * counter. Member functions are defined out-of-line. */
struct wait_imm {
   /* marks a counter that is not waited on */
   static const uint8_t unset_counter = 0xff;

   uint8_t vm;   /* vmcnt */
   uint8_t exp;  /* expcnt */
   uint8_t lgkm; /* lgkmcnt */
   uint8_t vs;   /* vscnt */

   wait_imm();
   wait_imm(uint16_t vm_, uint16_t exp_, uint16_t lgkm_, uint16_t vs_);
   /* decode from the packed hardware immediate (layout is chip-dependent) */
   wait_imm(enum chip_class chip, uint16_t packed);

   /* encode into the packed hardware immediate */
   uint16_t pack(enum chip_class chip) const;

   /* merge with "other" — presumably keeps the stricter (smaller) count per
    * counter and returns whether this was changed; confirm in the .cpp */
   bool combine(const wait_imm& other);

   /* true if no counter is set — confirm in the .cpp */
   bool empty() const;
};
282 
283 constexpr Format
asVOP3(Format format)284 asVOP3(Format format)
285 {
286    return (Format)((uint32_t)Format::VOP3 | (uint32_t)format);
287 };
288 
289 constexpr Format
asSDWA(Format format)290 asSDWA(Format format)
291 {
292    assert(format == Format::VOP1 || format == Format::VOP2 || format == Format::VOPC);
293    return (Format)((uint32_t)Format::SDWA | (uint32_t)format);
294 }
295 
296 constexpr Format
withoutDPP(Format format)297 withoutDPP(Format format)
298 {
299    return (Format)((uint32_t)format & ~((uint32_t)Format::DPP16 | (uint32_t)Format::DPP8));
300 }
301 
/* Kind of register file a temporary lives in. */
enum class RegType {
   none = 0,
   sgpr,        /* scalar registers */
   vgpr,        /* vector (per-lane) registers */
   linear_vgpr, /* VGPRs with a linear lifetime (WWM/spills, see RegClass) */
};
308 
struct RegClass {

   /* The RC value packs the whole register class into one byte:
    *   bits 0-4: size (in dwords, or in bytes for sub-dword classes)
    *   bit 5:    set for VGPR classes, clear for SGPR
    *   bit 6:    linear VGPR (WWM and VGPR spills)
    *   bit 7:    sub-dword class (size counted in bytes)
    */
   enum RC : uint8_t {
      s1 = 1,
      s2 = 2,
      s3 = 3,
      s4 = 4,
      s6 = 6,
      s8 = 8,
      s16 = 16,
      v1 = s1 | (1 << 5),
      v2 = s2 | (1 << 5),
      v3 = s3 | (1 << 5),
      v4 = s4 | (1 << 5),
      v5 = 5 | (1 << 5),
      v6 = 6 | (1 << 5),
      v7 = 7 | (1 << 5),
      v8 = 8 | (1 << 5),
      /* byte-sized register class */
      v1b = v1 | (1 << 7),
      v2b = v2 | (1 << 7),
      v3b = v3 | (1 << 7),
      v4b = v4 | (1 << 7),
      v6b = v6 | (1 << 7),
      v8b = v8 | (1 << 7),
      /* these are used for WWM and spills to vgpr */
      v1_linear = v1 | (1 << 6),
      v2_linear = v2 | (1 << 6),
   };

   RegClass() = default;
   constexpr RegClass(RC rc_) : rc(rc_) {}
   /* size is in dwords here; use get() for byte-based construction */
   constexpr RegClass(RegType type, unsigned size)
       : rc((RC)((type == RegType::vgpr ? 1 << 5 : 0) | size))
   {}

   constexpr operator RC() const { return rc; }
   /* deleted to avoid accidental truthiness checks via operator RC() */
   explicit operator bool() = delete;

   /* Note: linear VGPR classes report RegType::vgpr, not linear_vgpr. */
   constexpr RegType type() const { return rc <= RC::s16 ? RegType::sgpr : RegType::vgpr; }
   constexpr bool is_linear_vgpr() const { return rc & (1 << 6); };
   constexpr bool is_subdword() const { return rc & (1 << 7); }
   /* low 5 bits are bytes for sub-dword classes, dwords otherwise */
   constexpr unsigned bytes() const { return ((unsigned)rc & 0x1F) * (is_subdword() ? 1 : 4); }
   // TODO: use size() less in favor of bytes()
   /* size in dwords, rounded up for sub-dword classes */
   constexpr unsigned size() const { return (bytes() + 3) >> 2; }
   /* SGPRs are always linear; VGPRs only with the linear bit set */
   constexpr bool is_linear() const { return rc <= RC::s16 || is_linear_vgpr(); }
   constexpr RegClass as_linear() const { return RegClass((RC)(rc | (1 << 6))); }
   constexpr RegClass as_subdword() const { return RegClass((RC)(rc | 1 << 7)); }

   /* Class of the given type sized in bytes: SGPR sizes round up to whole
    * dwords; VGPR sizes that are not a dword multiple become sub-dword. */
   static constexpr RegClass get(RegType type, unsigned bytes)
   {
      if (type == RegType::sgpr) {
         return RegClass(type, DIV_ROUND_UP(bytes, 4u));
      } else {
         return bytes % 4u ? RegClass(type, bytes).as_subdword() : RegClass(type, bytes / 4u);
      }
   }

   /* Same class resized to the given byte count, preserving linearity. */
   constexpr RegClass resize(unsigned bytes) const
   {
      if (is_linear_vgpr()) {
         assert(bytes % 4u == 0);
         return get(RegType::vgpr, bytes).as_linear();
      }
      return get(type(), bytes);
   }

private:
   RC rc;
};
379 
/* transitional helper expressions: shorthand constants for common register
 * classes (sN/vN = N dwords of SGPRs/VGPRs, vNb = N bytes of VGPRs) */
static constexpr RegClass s1{RegClass::s1};
static constexpr RegClass s2{RegClass::s2};
static constexpr RegClass s3{RegClass::s3};
static constexpr RegClass s4{RegClass::s4};
static constexpr RegClass s8{RegClass::s8};
static constexpr RegClass s16{RegClass::s16};
static constexpr RegClass v1{RegClass::v1};
static constexpr RegClass v2{RegClass::v2};
static constexpr RegClass v3{RegClass::v3};
static constexpr RegClass v4{RegClass::v4};
static constexpr RegClass v5{RegClass::v5};
static constexpr RegClass v6{RegClass::v6};
static constexpr RegClass v7{RegClass::v7};
static constexpr RegClass v8{RegClass::v8};
static constexpr RegClass v1b{RegClass::v1b};
static constexpr RegClass v2b{RegClass::v2b};
static constexpr RegClass v3b{RegClass::v3b};
static constexpr RegClass v4b{RegClass::v4b};
static constexpr RegClass v6b{RegClass::v6b};
static constexpr RegClass v8b{RegClass::v8b};
401 
402 /**
403  * Temp Class
404  * Each temporary virtual register has a
405  * register class (i.e. size and type)
406  * and SSA id.
407  */
struct Temp {
   Temp() noexcept : id_(0), reg_class(0) {}
   constexpr Temp(uint32_t id, RegClass cls) noexcept : id_(id), reg_class(uint8_t(cls)) {}

   /* SSA id; 0 is used as the "no temporary" sentinel (see Operand/Definition) */
   constexpr uint32_t id() const noexcept { return id_; }
   constexpr RegClass regClass() const noexcept { return (RegClass::RC)reg_class; }

   constexpr unsigned bytes() const noexcept { return regClass().bytes(); }
   constexpr unsigned size() const noexcept { return regClass().size(); }
   constexpr RegType type() const noexcept { return regClass().type(); }
   constexpr bool is_linear() const noexcept { return regClass().is_linear(); }

   /* comparisons consider only the SSA id, not the register class */
   constexpr bool operator<(Temp other) const noexcept { return id() < other.id(); }
   constexpr bool operator==(Temp other) const noexcept { return id() == other.id(); }
   constexpr bool operator!=(Temp other) const noexcept { return id() != other.id(); }

private:
   /* packed into one 32-bit word: at most 2^24 - 1 temporaries per program */
   uint32_t id_ : 24;
   uint32_t reg_class : 8;
};
428 
429 /**
430  * PhysReg
431  * Represents the physical register for each
432  * Operand and Definition.
433  */
struct PhysReg {
   constexpr PhysReg() = default;
   /* r is a dword register index; stored byte-addressed (reg_b = r * 4) so
    * that sub-dword locations can be represented */
   explicit constexpr PhysReg(unsigned r) : reg_b(r << 2) {}
   constexpr unsigned reg() const { return reg_b >> 2; }   /* register index */
   constexpr unsigned byte() const { return reg_b & 0x3; } /* byte offset within the register */
   constexpr operator unsigned() const { return reg(); }
   /* equality/ordering are at byte granularity (include the byte offset) */
   constexpr bool operator==(PhysReg other) const { return reg_b == other.reg_b; }
   constexpr bool operator!=(PhysReg other) const { return reg_b != other.reg_b; }
   constexpr bool operator<(PhysReg other) const { return reg_b < other.reg_b; }
   /* register advanced by a (possibly negative) number of bytes; may cross
    * register boundaries */
   constexpr PhysReg advance(int bytes) const
   {
      PhysReg res = *this;
      res.reg_b += bytes;
      return res;
   }

   /* byte address: register index * 4 + byte offset */
   uint16_t reg_b = 0;
};
452 
/* helper expressions for special registers; the numeric values follow the
 * scalar operand encoding of the AMDGPU ISA */
static constexpr PhysReg m0{124};
static constexpr PhysReg vcc{106};
static constexpr PhysReg vcc_hi{107};
static constexpr PhysReg tba{108}; /* GFX6-GFX8 */
static constexpr PhysReg tma{110}; /* GFX6-GFX8 */
static constexpr PhysReg ttmp0{112};
static constexpr PhysReg ttmp1{113};
static constexpr PhysReg ttmp2{114};
static constexpr PhysReg ttmp3{115};
static constexpr PhysReg ttmp4{116};
static constexpr PhysReg ttmp5{117};
static constexpr PhysReg ttmp6{118};
static constexpr PhysReg ttmp7{119};
static constexpr PhysReg ttmp8{120};
static constexpr PhysReg ttmp9{121};
static constexpr PhysReg ttmp10{122};
static constexpr PhysReg ttmp11{123};
static constexpr PhysReg sgpr_null{125}; /* GFX10+ */
static constexpr PhysReg exec{126};
static constexpr PhysReg exec_lo{126};
static constexpr PhysReg exec_hi{127};
static constexpr PhysReg vccz{251};
static constexpr PhysReg execz{252};
static constexpr PhysReg scc{253};
478 
479 /**
480  * Operand Class
481  * Initially, each Operand refers to either
482  * a temporary virtual register
483  * or to a constant value
484  * Temporary registers get mapped to physical register during RA
485  * Constant values are inlined into the instruction sequence.
486  */
class Operand final {
public:
   /* Default: an undefined operand, fixed to PhysReg{128} (the same sentinel
    * used by the undef constructors below). */
   constexpr Operand()
       : reg_(PhysReg{128}), isTemp_(false), isFixed_(true), isConstant_(false), isKill_(false),
         isUndef_(true), isFirstKill_(false), constSize(0), isLateKill_(false), is16bit_(false),
         is24bit_(false), signext(false)
   {}

   /* Operand referring to the temporary r; a zero id yields an undefined
    * operand instead. */
   explicit Operand(Temp r) noexcept
   {
      data_.temp = r;
      if (r.id()) {
         isTemp_ = true;
      } else {
         isUndef_ = true;
         setFixed(PhysReg{128});
      }
   };
   /* Temporary operand pre-assigned to a physical register. */
   explicit Operand(Temp r, PhysReg reg) noexcept
   {
      assert(r.id()); /* Don't allow fixing an undef to a register */
      data_.temp = r;
      isTemp_ = true;
      setFixed(reg);
   };

   /* 8-bit constant */
   static Operand c8(uint8_t v) noexcept
   {
      /* 8-bit constants are only used for copies and copies from any 8-bit
       * constant can be implemented with a SDWA v_mul_u32_u24. So consider all
       * to be inline constants. */
      Operand op;
      op.control_ = 0;
      op.data_.i = v;
      op.isConstant_ = true;
      op.constSize = 0;
      op.setFixed(PhysReg{0u});
      return op;
   };

   /* 16-bit constant: picks an inline-constant register encoding where one
    * exists (integers in [-16, 64] and common half-float values), otherwise
    * the literal encoding 255. */
   static Operand c16(uint16_t v) noexcept
   {
      Operand op;
      op.control_ = 0;
      op.data_.i = v;
      op.isConstant_ = true;
      op.constSize = 1;
      if (v <= 64)
         op.setFixed(PhysReg{128u + v});
      else if (v >= 0xFFF0) /* [-16 .. -1] */
         op.setFixed(PhysReg{(unsigned)(192 - (int16_t)v)});
      else if (v == 0x3800) /* 0.5 */
         op.setFixed(PhysReg{240});
      else if (v == 0xB800) /* -0.5 */
         op.setFixed(PhysReg{241});
      else if (v == 0x3C00) /* 1.0 */
         op.setFixed(PhysReg{242});
      else if (v == 0xBC00) /* -1.0 */
         op.setFixed(PhysReg{243});
      else if (v == 0x4000) /* 2.0 */
         op.setFixed(PhysReg{244});
      else if (v == 0xC000) /* -2.0 */
         op.setFixed(PhysReg{245});
      else if (v == 0x4400) /* 4.0 */
         op.setFixed(PhysReg{246});
      else if (v == 0xC400) /* -4.0 */
         op.setFixed(PhysReg{247});
      else if (v == 0x3118) /* 1/2 PI */
         op.setFixed(PhysReg{248});
      else /* Literal Constant */
         op.setFixed(PhysReg{255});
      return op;
   }

   /* 32-bit constant */
   static Operand c32(uint32_t v) noexcept { return c32_or_c64(v, false); }

   /* 64-bit constant: same encoding scheme as c16/c32, but the inline floats
    * are double-precision patterns. Only the low 32 bits plus a sign-extension
    * flag are stored for literals (see constantValue64()). */
   static Operand c64(uint64_t v) noexcept
   {
      Operand op;
      op.control_ = 0;
      op.isConstant_ = true;
      op.constSize = 3;
      if (v <= 64) {
         op.data_.i = (uint32_t)v;
         op.setFixed(PhysReg{128 + (uint32_t)v});
      } else if (v >= 0xFFFFFFFFFFFFFFF0) { /* [-16 .. -1] */
         op.data_.i = (uint32_t)v;
         op.setFixed(PhysReg{192 - (uint32_t)v});
      } else if (v == 0x3FE0000000000000) { /* 0.5 */
         op.data_.i = 0x3f000000;
         op.setFixed(PhysReg{240});
      } else if (v == 0xBFE0000000000000) { /* -0.5 */
         op.data_.i = 0xbf000000;
         op.setFixed(PhysReg{241});
      } else if (v == 0x3FF0000000000000) { /* 1.0 */
         op.data_.i = 0x3f800000;
         op.setFixed(PhysReg{242});
      } else if (v == 0xBFF0000000000000) { /* -1.0 */
         op.data_.i = 0xbf800000;
         op.setFixed(PhysReg{243});
      } else if (v == 0x4000000000000000) { /* 2.0 */
         op.data_.i = 0x40000000;
         op.setFixed(PhysReg{244});
      } else if (v == 0xC000000000000000) { /* -2.0 */
         op.data_.i = 0xc0000000;
         op.setFixed(PhysReg{245});
      } else if (v == 0x4010000000000000) { /* 4.0 */
         op.data_.i = 0x40800000;
         op.setFixed(PhysReg{246});
      } else if (v == 0xC010000000000000) { /* -4.0 */
         op.data_.i = 0xc0800000;
         op.setFixed(PhysReg{247});
      } else { /* Literal Constant: we don't know if it is a long or double.*/
         op.signext = v >> 63;
         op.data_.i = v & 0xffffffffu;
         op.setFixed(PhysReg{255});
         assert(op.constantValue64() == v &&
                "attempt to create a unrepresentable 64-bit literal constant");
      }
      return op;
   }

   /* 32-bit constant stored as a 32-bit or 64-bit operand */
   static Operand c32_or_c64(uint32_t v, bool is64bit) noexcept
   {
      Operand op;
      op.control_ = 0;
      op.data_.i = v;
      op.isConstant_ = true;
      op.constSize = is64bit ? 3 : 2;
      if (v <= 64)
         op.setFixed(PhysReg{128 + v});
      else if (v >= 0xFFFFFFF0) /* [-16 .. -1] */
         op.setFixed(PhysReg{192 - v});
      else if (v == 0x3f000000) /* 0.5 */
         op.setFixed(PhysReg{240});
      else if (v == 0xbf000000) /* -0.5 */
         op.setFixed(PhysReg{241});
      else if (v == 0x3f800000) /* 1.0 */
         op.setFixed(PhysReg{242});
      else if (v == 0xbf800000) /* -1.0 */
         op.setFixed(PhysReg{243});
      else if (v == 0x40000000) /* 2.0 */
         op.setFixed(PhysReg{244});
      else if (v == 0xc0000000) /* -2.0 */
         op.setFixed(PhysReg{245});
      else if (v == 0x40800000) /* 4.0 */
         op.setFixed(PhysReg{246});
      else if (v == 0xc0800000) /* -4.0 */
         op.setFixed(PhysReg{247});
      else { /* Literal Constant */
         assert(!is64bit && "attempt to create a 64-bit literal constant");
         op.setFixed(PhysReg{255});
      }
      return op;
   }

   /* Undefined operand of the given register class. */
   explicit Operand(RegClass type) noexcept
   {
      isUndef_ = true;
      data_.temp = Temp(0, type);
      setFixed(PhysReg{128});
   };
   /* Non-temporary operand fixed to a physical register (no SSA id). */
   explicit Operand(PhysReg reg, RegClass type) noexcept
   {
      data_.temp = Temp(0, type);
      setFixed(reg);
   }

   /* Zero constant of the given size in bytes (1, 2, 4 or 8). */
   static Operand zero(unsigned bytes = 4) noexcept
   {
      if (bytes == 8)
         return Operand::c64(0);
      else if (bytes == 4)
         return Operand::c32(0);
      else if (bytes == 2)
         return Operand::c16(0);
      assert(bytes == 1);
      return Operand::c8(0);
   }

   /* This is useful over the constructors when you want to take a chip class
    * for 1/2 PI or an unknown operand size.
    */
   static Operand get_const(enum chip_class chip, uint64_t val, unsigned bytes)
   {
      if (val == 0x3e22f983 && bytes == 4 && chip >= GFX8) {
         /* 1/2 PI can be an inline constant on GFX8+ */
         Operand op = Operand::c32(val);
         op.setFixed(PhysReg{248});
         return op;
      }

      if (bytes == 8)
         return Operand::c64(val);
      else if (bytes == 4)
         return Operand::c32(val);
      else if (bytes == 2)
         return Operand::c16(val);
      assert(bytes == 1);
      return Operand::c8(val);
   }

   /* Whether val can be encoded as a constant operand of the given size.
    * Constants of at most 4 bytes are always representable (possibly as a
    * literal); 64-bit values must either match an inline constant, fit the
    * sign-extended literal encoding, or — with zext/sext — be recoverable from
    * their low 32 bits. */
   static bool is_constant_representable(uint64_t val, unsigned bytes, bool zext = false,
                                         bool sext = false)
   {
      if (bytes <= 4)
         return true;

      if (zext && (val & 0xFFFFFFFF00000000) == 0x0000000000000000)
         return true;
      uint64_t upper33 = val & 0xFFFFFFFF80000000;
      if (sext && (upper33 == 0xFFFFFFFF80000000 || upper33 == 0))
         return true;

      return val >= 0xFFFFFFFFFFFFFFF0 || val <= 64 || /* [-16 .. 64] */
             val == 0x3FE0000000000000 ||              /* 0.5 */
             val == 0xBFE0000000000000 ||              /* -0.5 */
             val == 0x3FF0000000000000 ||              /* 1.0 */
             val == 0xBFF0000000000000 ||              /* -1.0 */
             val == 0x4000000000000000 ||              /* 2.0 */
             val == 0xC000000000000000 ||              /* -2.0 */
             val == 0x4010000000000000 ||              /* 4.0 */
             val == 0xC010000000000000;                /* -4.0 */
   }

   constexpr bool isTemp() const noexcept { return isTemp_; }

   constexpr void setTemp(Temp t) noexcept
   {
      assert(!isConstant_);
      isTemp_ = true;
      data_.temp = t;
   }

   constexpr Temp getTemp() const noexcept { return data_.temp; }

   constexpr uint32_t tempId() const noexcept { return data_.temp.id(); }

   /* only temporaries and undefined operands carry a register class */
   constexpr bool hasRegClass() const noexcept { return isTemp() || isUndefined(); }

   constexpr RegClass regClass() const noexcept { return data_.temp.regClass(); }

   /* operand size in bytes: 1 << constSize for constants (1/2/4/8) */
   constexpr unsigned bytes() const noexcept
   {
      if (isConstant())
         return 1 << constSize;
      else
         return data_.temp.bytes();
   }

   /* operand size in dwords; 1- and 2-byte constants report 1 */
   constexpr unsigned size() const noexcept
   {
      if (isConstant())
         return constSize > 2 ? 2 : 1;
      else
         return data_.temp.size();
   }

   constexpr bool isFixed() const noexcept { return isFixed_; }

   constexpr PhysReg physReg() const noexcept { return reg_; }

   constexpr void setFixed(PhysReg reg) noexcept
   {
      /* NOTE(review): with a 16-bit reg_b, reg() can never equal unsigned(-1),
       * so this condition always sets isFixed_ — presumably a leftover
       * sentinel check; confirm. */
      isFixed_ = reg != unsigned(-1);
      reg_ = reg;
   }

   constexpr bool isConstant() const noexcept { return isConstant_; }

   /* 255 is the literal-constant operand encoding */
   constexpr bool isLiteral() const noexcept { return isConstant() && reg_ == 255; }

   constexpr bool isUndefined() const noexcept { return isUndef_; }

   /* low 32 bits of the constant (see constantValue64() for 64-bit values) */
   constexpr uint32_t constantValue() const noexcept { return data_.i; }

   constexpr bool constantEquals(uint32_t cmp) const noexcept
   {
      return isConstant() && constantValue() == cmp;
   }

   /* Full constant value. For 64-bit constants the value is reconstructed
    * from the register encoding (128-192: 0..64, 193-208: -1..-16, 240-247:
    * inline doubles, 255: sign-extended 32-bit literal). */
   constexpr uint64_t constantValue64() const noexcept
   {
      if (constSize == 3) {
         if (reg_ <= 192)
            return reg_ - 128;
         else if (reg_ <= 208)
            return 0xFFFFFFFFFFFFFFFF - (reg_ - 193);

         switch (reg_) {
         case 240: return 0x3FE0000000000000;
         case 241: return 0xBFE0000000000000;
         case 242: return 0x3FF0000000000000;
         case 243: return 0xBFF0000000000000;
         case 244: return 0x4000000000000000;
         case 245: return 0xC000000000000000;
         case 246: return 0x4010000000000000;
         case 247: return 0xC010000000000000;
         case 255:
            return (signext && (data_.i & 0x80000000u) ? 0xffffffff00000000ull : 0ull) | data_.i;
         }
         unreachable("invalid register for 64-bit constant");
      } else {
         return data_.i;
      }
   }

   constexpr bool isOfType(RegType type) const noexcept
   {
      return hasRegClass() && regClass().type() == type;
   }

   /* Indicates that the killed operand's live range intersects with the
    * instruction's definitions. Unlike isKill() and isFirstKill(), this is
    * not set by liveness analysis. */
   constexpr void setLateKill(bool flag) noexcept { isLateKill_ = flag; }

   constexpr bool isLateKill() const noexcept { return isLateKill_; }

   /* clearing the kill flag also clears first-kill */
   constexpr void setKill(bool flag) noexcept
   {
      isKill_ = flag;
      if (!flag)
         setFirstKill(false);
   }

   constexpr bool isKill() const noexcept { return isKill_ || isFirstKill(); }

   /* setting first-kill implies kill */
   constexpr void setFirstKill(bool flag) noexcept
   {
      isFirstKill_ = flag;
      if (flag)
         setKill(flag);
   }

   /* When there are multiple operands killing the same temporary,
    * isFirstKill() is only returns true for the first one. */
   constexpr bool isFirstKill() const noexcept { return isFirstKill_; }

   constexpr bool isKillBeforeDef() const noexcept { return isKill() && !isLateKill(); }

   constexpr bool isFirstKillBeforeDef() const noexcept { return isFirstKill() && !isLateKill(); }

   /* Equality considers size, fixedness, kill-before-def state, the assigned
    * register, and then the value (literal/constant/undef/temp). */
   constexpr bool operator==(Operand other) const noexcept
   {
      if (other.size() != size())
         return false;
      if (isFixed() != other.isFixed() || isKillBeforeDef() != other.isKillBeforeDef())
         return false;
      if (isFixed() && other.isFixed() && physReg() != other.physReg())
         return false;
      if (isLiteral())
         return other.isLiteral() && other.constantValue() == constantValue();
      else if (isConstant())
         return other.isConstant() && other.physReg() == physReg();
      else if (isUndefined())
         return other.isUndefined() && other.regClass() == regClass();
      else
         return other.isTemp() && other.getTemp() == getTemp();
   }

   constexpr bool operator!=(Operand other) const noexcept { return !operator==(other); }

   constexpr void set16bit(bool flag) noexcept { is16bit_ = flag; }

   constexpr bool is16bit() const noexcept { return is16bit_; }

   constexpr void set24bit(bool flag) noexcept { is24bit_ = flag; }

   constexpr bool is24bit() const noexcept { return is24bit_; }

private:
   union {
      Temp temp;
      uint32_t i;
      float f;
   } data_ = {Temp(0, s1)};
   PhysReg reg_;
   union {
      struct {
         uint8_t isTemp_ : 1;
         uint8_t isFixed_ : 1;
         uint8_t isConstant_ : 1;
         uint8_t isKill_ : 1;
         uint8_t isUndef_ : 1;
         uint8_t isFirstKill_ : 1;
         uint8_t constSize : 2;
         uint8_t isLateKill_ : 1;
         uint8_t is16bit_ : 1;
         uint8_t is24bit_ : 1;
         uint8_t signext : 1;
      };
      /* can't initialize bit-fields in c++11, so work around using a union */
      uint16_t control_ = 0;
   };
};
888 
889 /**
890  * Definition Class
891  * Definitions are the results of Instructions
892  * and refer to temporary virtual registers
893  * which are later mapped to physical registers
894  */
class Definition final {
public:
   /* Default: temporary id 0 (i.e. no temp) of class s1, all flags cleared. */
   constexpr Definition()
       : temp(Temp(0, s1)), reg_(0), isFixed_(0), hasHint_(0), isKill_(0), isPrecise_(0), isNUW_(0),
         isNoCSE_(0)
   {}
   /* Definition of the temporary with the given id and register class. */
   Definition(uint32_t index, RegClass type) noexcept : temp(index, type) {}
   explicit Definition(Temp tmp) noexcept : temp(tmp) {}
   /* Definition fixed to a physical register, with no temporary (id 0). */
   Definition(PhysReg reg, RegClass type) noexcept : temp(Temp(0, type)) { setFixed(reg); }
   /* Definition of an existing temporary, fixed to a physical register. */
   Definition(uint32_t tmpId, PhysReg reg, RegClass type) noexcept : temp(Temp(tmpId, type))
   {
      setFixed(reg);
   }

   /* A definition refers to a temporary iff its temp id is non-zero. */
   constexpr bool isTemp() const noexcept { return tempId() > 0; }

   constexpr Temp getTemp() const noexcept { return temp; }

   constexpr uint32_t tempId() const noexcept { return temp.id(); }

   constexpr void setTemp(Temp t) noexcept { temp = t; }

   void swapTemp(Definition& other) noexcept { std::swap(temp, other.temp); }

   constexpr RegClass regClass() const noexcept { return temp.regClass(); }

   constexpr unsigned bytes() const noexcept { return temp.bytes(); }

   constexpr unsigned size() const noexcept { return temp.size(); }

   constexpr bool isFixed() const noexcept { return isFixed_; }

   constexpr PhysReg physReg() const noexcept { return reg_; }

   /* Pin this definition to a specific physical register. */
   constexpr void setFixed(PhysReg reg) noexcept
   {
      isFixed_ = 1;
      reg_ = reg;
   }

   /* Record a preferred physical register. NOTE(review): reg_ is shared with
    * setFixed(), so a hint and a fixed register cannot coexist — confirm
    * callers never rely on both. */
   constexpr void setHint(PhysReg reg) noexcept
   {
      hasHint_ = 1;
      reg_ = reg;
   }

   constexpr bool hasHint() const noexcept { return hasHint_; }

   constexpr void setKill(bool flag) noexcept { isKill_ = flag; }

   constexpr bool isKill() const noexcept { return isKill_; }

   /* NOTE(review): "precise" presumably forbids value-changing optimizations
    * on this result — confirm with the optimizer passes. */
   constexpr void setPrecise(bool precise) noexcept { isPrecise_ = precise; }

   constexpr bool isPrecise() const noexcept { return isPrecise_; }

   /* No Unsigned Wrap */
   constexpr void setNUW(bool nuw) noexcept { isNUW_ = nuw; }

   constexpr bool isNUW() const noexcept { return isNUW_; }

   /* NOTE(review): presumably excludes the producing instruction from CSE —
    * confirm with the value-numbering pass. */
   constexpr void setNoCSE(bool noCSE) noexcept { isNoCSE_ = noCSE; }

   constexpr bool isNoCSE() const noexcept { return isNoCSE_; }

private:
   Temp temp = Temp(0, s1);
   PhysReg reg_;
   union {
      struct {
         uint8_t isFixed_ : 1;
         uint8_t hasHint_ : 1;
         uint8_t isKill_ : 1;
         uint8_t isPrecise_ : 1;
         uint8_t isNUW_ : 1;
         uint8_t isNoCSE_ : 1;
      };
      /* can't initialize bit-fields in c++11, so work around using a union */
      uint8_t control_ = 0;
   };
};
976 
977 struct Block;
978 struct Instruction;
979 struct Pseudo_instruction;
980 struct SOP1_instruction;
981 struct SOP2_instruction;
982 struct SOPK_instruction;
983 struct SOPP_instruction;
984 struct SOPC_instruction;
985 struct SMEM_instruction;
986 struct DS_instruction;
987 struct MTBUF_instruction;
988 struct MUBUF_instruction;
989 struct MIMG_instruction;
990 struct Export_instruction;
991 struct FLAT_instruction;
992 struct Pseudo_branch_instruction;
993 struct Pseudo_barrier_instruction;
994 struct Pseudo_reduction_instruction;
995 struct VOP3P_instruction;
996 struct VOP1_instruction;
997 struct VOP2_instruction;
998 struct VOPC_instruction;
999 struct VOP3_instruction;
1000 struct Interp_instruction;
1001 struct DPP16_instruction;
1002 struct DPP8_instruction;
1003 struct SDWA_instruction;
1004 
/* Common base of all instructions. The encoding-specific structs below derive
 * from this; they are reached through the cast helpers (pseudo(), sop1(), ...),
 * each of which asserts that the format matches before casting. */
struct Instruction {
   aco_opcode opcode;
   Format format;
   uint32_t pass_flags;

   aco::span<Operand> operands;
   aco::span<Definition> definitions;

   constexpr bool usesModifiers() const noexcept;

   /* True if any operand is fixed to the exec register. */
   constexpr bool reads_exec() const noexcept
   {
      for (const Operand& op : operands) {
         if (op.isFixed() && op.physReg() == exec)
            return true;
      }
      return false;
   }

   Pseudo_instruction& pseudo() noexcept
   {
      assert(isPseudo());
      return *(Pseudo_instruction*)this;
   }
   const Pseudo_instruction& pseudo() const noexcept
   {
      assert(isPseudo());
      return *(Pseudo_instruction*)this;
   }
   constexpr bool isPseudo() const noexcept { return format == Format::PSEUDO; }
   SOP1_instruction& sop1() noexcept
   {
      assert(isSOP1());
      return *(SOP1_instruction*)this;
   }
   const SOP1_instruction& sop1() const noexcept
   {
      assert(isSOP1());
      return *(SOP1_instruction*)this;
   }
   constexpr bool isSOP1() const noexcept { return format == Format::SOP1; }
   SOP2_instruction& sop2() noexcept
   {
      assert(isSOP2());
      return *(SOP2_instruction*)this;
   }
   const SOP2_instruction& sop2() const noexcept
   {
      assert(isSOP2());
      return *(SOP2_instruction*)this;
   }
   constexpr bool isSOP2() const noexcept { return format == Format::SOP2; }
   SOPK_instruction& sopk() noexcept
   {
      assert(isSOPK());
      return *(SOPK_instruction*)this;
   }
   const SOPK_instruction& sopk() const noexcept
   {
      assert(isSOPK());
      return *(SOPK_instruction*)this;
   }
   constexpr bool isSOPK() const noexcept { return format == Format::SOPK; }
   SOPP_instruction& sopp() noexcept
   {
      assert(isSOPP());
      return *(SOPP_instruction*)this;
   }
   const SOPP_instruction& sopp() const noexcept
   {
      assert(isSOPP());
      return *(SOPP_instruction*)this;
   }
   constexpr bool isSOPP() const noexcept { return format == Format::SOPP; }
   SOPC_instruction& sopc() noexcept
   {
      assert(isSOPC());
      return *(SOPC_instruction*)this;
   }
   const SOPC_instruction& sopc() const noexcept
   {
      assert(isSOPC());
      return *(SOPC_instruction*)this;
   }
   constexpr bool isSOPC() const noexcept { return format == Format::SOPC; }
   SMEM_instruction& smem() noexcept
   {
      assert(isSMEM());
      return *(SMEM_instruction*)this;
   }
   const SMEM_instruction& smem() const noexcept
   {
      assert(isSMEM());
      return *(SMEM_instruction*)this;
   }
   constexpr bool isSMEM() const noexcept { return format == Format::SMEM; }
   DS_instruction& ds() noexcept
   {
      assert(isDS());
      return *(DS_instruction*)this;
   }
   const DS_instruction& ds() const noexcept
   {
      assert(isDS());
      return *(DS_instruction*)this;
   }
   constexpr bool isDS() const noexcept { return format == Format::DS; }
   MTBUF_instruction& mtbuf() noexcept
   {
      assert(isMTBUF());
      return *(MTBUF_instruction*)this;
   }
   const MTBUF_instruction& mtbuf() const noexcept
   {
      assert(isMTBUF());
      return *(MTBUF_instruction*)this;
   }
   constexpr bool isMTBUF() const noexcept { return format == Format::MTBUF; }
   MUBUF_instruction& mubuf() noexcept
   {
      assert(isMUBUF());
      return *(MUBUF_instruction*)this;
   }
   const MUBUF_instruction& mubuf() const noexcept
   {
      assert(isMUBUF());
      return *(MUBUF_instruction*)this;
   }
   constexpr bool isMUBUF() const noexcept { return format == Format::MUBUF; }
   MIMG_instruction& mimg() noexcept
   {
      assert(isMIMG());
      return *(MIMG_instruction*)this;
   }
   const MIMG_instruction& mimg() const noexcept
   {
      assert(isMIMG());
      return *(MIMG_instruction*)this;
   }
   constexpr bool isMIMG() const noexcept { return format == Format::MIMG; }
   Export_instruction& exp() noexcept
   {
      assert(isEXP());
      return *(Export_instruction*)this;
   }
   const Export_instruction& exp() const noexcept
   {
      assert(isEXP());
      return *(Export_instruction*)this;
   }
   constexpr bool isEXP() const noexcept { return format == Format::EXP; }
   /* FLAT, GLOBAL and SCRATCH all share the FLAT_instruction layout. */
   FLAT_instruction& flat() noexcept
   {
      assert(isFlat());
      return *(FLAT_instruction*)this;
   }
   const FLAT_instruction& flat() const noexcept
   {
      assert(isFlat());
      return *(FLAT_instruction*)this;
   }
   constexpr bool isFlat() const noexcept { return format == Format::FLAT; }
   FLAT_instruction& global() noexcept
   {
      assert(isGlobal());
      return *(FLAT_instruction*)this;
   }
   const FLAT_instruction& global() const noexcept
   {
      assert(isGlobal());
      return *(FLAT_instruction*)this;
   }
   constexpr bool isGlobal() const noexcept { return format == Format::GLOBAL; }
   FLAT_instruction& scratch() noexcept
   {
      assert(isScratch());
      return *(FLAT_instruction*)this;
   }
   const FLAT_instruction& scratch() const noexcept
   {
      assert(isScratch());
      return *(FLAT_instruction*)this;
   }
   constexpr bool isScratch() const noexcept { return format == Format::SCRATCH; }
   Pseudo_branch_instruction& branch() noexcept
   {
      assert(isBranch());
      return *(Pseudo_branch_instruction*)this;
   }
   const Pseudo_branch_instruction& branch() const noexcept
   {
      assert(isBranch());
      return *(Pseudo_branch_instruction*)this;
   }
   constexpr bool isBranch() const noexcept { return format == Format::PSEUDO_BRANCH; }
   Pseudo_barrier_instruction& barrier() noexcept
   {
      assert(isBarrier());
      return *(Pseudo_barrier_instruction*)this;
   }
   const Pseudo_barrier_instruction& barrier() const noexcept
   {
      assert(isBarrier());
      return *(Pseudo_barrier_instruction*)this;
   }
   constexpr bool isBarrier() const noexcept { return format == Format::PSEUDO_BARRIER; }
   Pseudo_reduction_instruction& reduction() noexcept
   {
      assert(isReduction());
      return *(Pseudo_reduction_instruction*)this;
   }
   const Pseudo_reduction_instruction& reduction() const noexcept
   {
      assert(isReduction());
      return *(Pseudo_reduction_instruction*)this;
   }
   constexpr bool isReduction() const noexcept { return format == Format::PSEUDO_REDUCTION; }
   VOP3P_instruction& vop3p() noexcept
   {
      assert(isVOP3P());
      return *(VOP3P_instruction*)this;
   }
   const VOP3P_instruction& vop3p() const noexcept
   {
      assert(isVOP3P());
      return *(VOP3P_instruction*)this;
   }
   constexpr bool isVOP3P() const noexcept { return format == Format::VOP3P; }
   /* The VALU/VINTRP/DPP/SDWA checks below use a bitwise AND instead of an
    * equality test because these formats can be combined (see the Format
    * comment at the top of the file). */
   VOP1_instruction& vop1() noexcept
   {
      assert(isVOP1());
      return *(VOP1_instruction*)this;
   }
   const VOP1_instruction& vop1() const noexcept
   {
      assert(isVOP1());
      return *(VOP1_instruction*)this;
   }
   constexpr bool isVOP1() const noexcept { return (uint16_t)format & (uint16_t)Format::VOP1; }
   VOP2_instruction& vop2() noexcept
   {
      assert(isVOP2());
      return *(VOP2_instruction*)this;
   }
   const VOP2_instruction& vop2() const noexcept
   {
      assert(isVOP2());
      return *(VOP2_instruction*)this;
   }
   constexpr bool isVOP2() const noexcept { return (uint16_t)format & (uint16_t)Format::VOP2; }
   VOPC_instruction& vopc() noexcept
   {
      assert(isVOPC());
      return *(VOPC_instruction*)this;
   }
   const VOPC_instruction& vopc() const noexcept
   {
      assert(isVOPC());
      return *(VOPC_instruction*)this;
   }
   constexpr bool isVOPC() const noexcept { return (uint16_t)format & (uint16_t)Format::VOPC; }
   VOP3_instruction& vop3() noexcept
   {
      assert(isVOP3());
      return *(VOP3_instruction*)this;
   }
   const VOP3_instruction& vop3() const noexcept
   {
      assert(isVOP3());
      return *(VOP3_instruction*)this;
   }
   constexpr bool isVOP3() const noexcept { return (uint16_t)format & (uint16_t)Format::VOP3; }
   Interp_instruction& vintrp() noexcept
   {
      assert(isVINTRP());
      return *(Interp_instruction*)this;
   }
   const Interp_instruction& vintrp() const noexcept
   {
      assert(isVINTRP());
      return *(Interp_instruction*)this;
   }
   constexpr bool isVINTRP() const noexcept { return (uint16_t)format & (uint16_t)Format::VINTRP; }
   DPP16_instruction& dpp16() noexcept
   {
      assert(isDPP16());
      return *(DPP16_instruction*)this;
   }
   const DPP16_instruction& dpp16() const noexcept
   {
      assert(isDPP16());
      return *(DPP16_instruction*)this;
   }
   constexpr bool isDPP16() const noexcept { return (uint16_t)format & (uint16_t)Format::DPP16; }
   DPP8_instruction& dpp8() noexcept
   {
      assert(isDPP8());
      return *(DPP8_instruction*)this;
   }
   const DPP8_instruction& dpp8() const noexcept
   {
      assert(isDPP8());
      return *(DPP8_instruction*)this;
   }
   constexpr bool isDPP8() const noexcept { return (uint16_t)format & (uint16_t)Format::DPP8; }
   constexpr bool isDPP() const noexcept { return isDPP16() || isDPP8(); }
   SDWA_instruction& sdwa() noexcept
   {
      assert(isSDWA());
      return *(SDWA_instruction*)this;
   }
   const SDWA_instruction& sdwa() const noexcept
   {
      assert(isSDWA());
      return *(SDWA_instruction*)this;
   }
   constexpr bool isSDWA() const noexcept { return (uint16_t)format & (uint16_t)Format::SDWA; }

   /* Unchecked cast: valid for FLAT, GLOBAL and SCRATCH, which all use the
    * FLAT_instruction layout. */
   FLAT_instruction& flatlike() { return *(FLAT_instruction*)this; }

   const FLAT_instruction& flatlike() const { return *(FLAT_instruction*)this; }

   constexpr bool isFlatLike() const noexcept { return isFlat() || isGlobal() || isScratch(); }

   constexpr bool isVALU() const noexcept
   {
      return isVOP1() || isVOP2() || isVOPC() || isVOP3() || isVOP3P();
   }

   constexpr bool isSALU() const noexcept
   {
      return isSOP1() || isSOP2() || isSOPC() || isSOPK() || isSOPP();
   }

   constexpr bool isVMEM() const noexcept { return isMTBUF() || isMUBUF() || isMIMG(); }
};
static_assert(sizeof(Instruction) == 16, "Unexpected padding");
1342 
/* SOPK: scalar ALU instruction carrying a 16-bit inline immediate. */
struct SOPK_instruction : public Instruction {
   uint16_t imm;
   uint16_t padding;
};
static_assert(sizeof(SOPK_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1348 
/* SOPP: scalar program-control instruction with a 32-bit immediate.
 * NOTE(review): `block` presumably holds a branch target block index —
 * confirm against the users of sopp(). */
struct SOPP_instruction : public Instruction {
   uint32_t imm;
   int block;
};
static_assert(sizeof(SOPP_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1354 
/* SOPC: no fields beyond the common Instruction data. */
struct SOPC_instruction : public Instruction {};
static_assert(sizeof(SOPC_instruction) == sizeof(Instruction) + 0, "Unexpected padding");
1357 
/* SOP1: no fields beyond the common Instruction data. */
struct SOP1_instruction : public Instruction {};
static_assert(sizeof(SOP1_instruction) == sizeof(Instruction) + 0, "Unexpected padding");
1360 
/* SOP2: no fields beyond the common Instruction data. */
struct SOP2_instruction : public Instruction {};
static_assert(sizeof(SOP2_instruction) == sizeof(Instruction) + 0, "Unexpected padding");
1363 
1364 /**
1365  * Scalar Memory Format:
1366  * For s_(buffer_)load_dword*:
1367  * Operand(0): SBASE - SGPR-pair which provides base address
1368  * Operand(1): Offset - immediate (un)signed offset or SGPR
1369  * Operand(2) / Definition(0): SDATA - SGPR for read / write result
1370  * Operand(n-1): SOffset - SGPR offset (Vega only)
1371  *
1372  * Having no operands is also valid for instructions such as s_dcache_inv.
1373  *
1374  */
/* Scalar memory instruction; see the operand layout described in the comment
 * block preceding this struct. */
struct SMEM_instruction : public Instruction {
   memory_sync_info sync;
   bool glc : 1; /* VI+: globally coherent */
   bool dlc : 1; /* NAVI: device level coherent */
   bool nv : 1;  /* VEGA only: Non-volatile */
   bool disable_wqm : 1;
   bool prevent_overflow : 1; /* avoid overflow when combining additions */
   uint8_t padding : 3;
};
static_assert(sizeof(SMEM_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1385 
/* VOP1: no fields beyond the common Instruction data. */
struct VOP1_instruction : public Instruction {};
static_assert(sizeof(VOP1_instruction) == sizeof(Instruction) + 0, "Unexpected padding");
1388 
/* VOP2: no fields beyond the common Instruction data. */
struct VOP2_instruction : public Instruction {};
static_assert(sizeof(VOP2_instruction) == sizeof(Instruction) + 0, "Unexpected padding");
1391 
/* VOPC: no fields beyond the common Instruction data. */
struct VOPC_instruction : public Instruction {};
static_assert(sizeof(VOPC_instruction) == sizeof(Instruction) + 0, "Unexpected padding");
1394 
/* VOP3: vector ALU with per-source abs/neg input modifiers, operand select,
 * output modifier and clamp. */
struct VOP3_instruction : public Instruction {
   bool abs[3];
   bool neg[3];
   uint8_t opsel : 4;
   uint8_t omod : 2;
   bool clamp : 1;
   uint8_t padding0 : 1;
   uint8_t padding1;
};
static_assert(sizeof(VOP3_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1405 
/* VOP3P: packed-math vector ALU with separate modifiers/selects for the low
 * and high halves of each source. */
struct VOP3P_instruction : public Instruction {
   bool neg_lo[3];
   bool neg_hi[3]; /* abs modifier, for v_mad_mix/v_fma_mix */
   uint8_t opsel_lo : 3;
   uint8_t opsel_hi : 3;
   bool clamp : 1;
   uint8_t padding0 : 1;
   uint8_t padding1;
};
static_assert(sizeof(VOP3P_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1416 
1417 /**
1418  * Data Parallel Primitives Format:
1419  * This format can be used for VOP1, VOP2 or VOPC instructions.
1420  * The swizzle applies to the src0 operand.
1421  *
1422  */
/* DPP across 16 lanes; the swizzle applies to src0 (see the comment block
 * preceding this struct). */
struct DPP16_instruction : public Instruction {
   bool abs[2];
   bool neg[2];
   uint16_t dpp_ctrl;
   uint8_t row_mask : 4;
   uint8_t bank_mask : 4;
   bool bound_ctrl : 1;
   uint8_t padding : 7;
};
static_assert(sizeof(DPP16_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1433 
/* DPP8: one source-lane selector per lane in a group of eight.
 * NOTE(review): presumably each entry is a lane index 0-7 — confirm with
 * the emitter. */
struct DPP8_instruction : public Instruction {
   uint8_t lane_sel[8];
};
static_assert(sizeof(DPP8_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1438 
/* Describes which bytes of a dword are read or written, and whether the
 * value is sign-extended. Encoding: bits [1:0] = byte offset, bits [4:2] =
 * size in bytes (1, 2 or 4), bit 5 = sign extension. */
struct SubdwordSel {
   enum sdwa_sel : uint8_t {
      ubyte = 0x4,
      uword = 0x8,
      dword = 0x10,
      sext = 0x20,
      sbyte = ubyte | sext,
      sword = uword | sext,

      ubyte0 = ubyte,
      ubyte1 = ubyte | 1,
      ubyte2 = ubyte | 2,
      ubyte3 = ubyte | 3,
      sbyte0 = sbyte,
      sbyte1 = sbyte | 1,
      sbyte2 = sbyte | 2,
      sbyte3 = sbyte | 3,
      uword0 = uword,
      uword1 = uword | 2,
      sword0 = sword,
      sword1 = sword | 2,
   };

   SubdwordSel() : sel_((sdwa_sel)0) {}
   constexpr SubdwordSel(sdwa_sel sel) : sel_(sel) {}
   /* Build a selection from its components. */
   constexpr SubdwordSel(unsigned size, unsigned offset, bool sign_extend)
       : sel_((sdwa_sel)((sign_extend ? sext : 0) | size << 2 | offset))
   {}
   constexpr operator sdwa_sel() const { return sel_; }
   /* A zero selection means "unset". */
   explicit operator bool() const { return sel_ != 0; }

   /* Size in bytes of the selected sub-dword (1, 2 or 4). */
   constexpr unsigned size() const { return (sel_ >> 2) & 0x7; }
   /* Byte offset of the selection within the dword. */
   constexpr unsigned offset() const { return sel_ & 0x3; }
   constexpr bool sign_extend() const { return sel_ & sext; }
   /* Translate to the hardware SDWA sel encoding for the selection at the
    * given register byte offset: 0-3 select a byte, 4-5 a word, 6 the whole
    * dword. */
   constexpr unsigned to_sdwa_sel(unsigned reg_byte_offset) const
   {
      reg_byte_offset += offset();
      switch (size()) {
      case 1: return reg_byte_offset;
      case 2: return 4 + (reg_byte_offset >> 1);
      default: return 6;
      }
   }

private:
   sdwa_sel sel_;
};
1487 
1488 /**
1489  * Sub-Dword Addressing Format:
1490  * This format can be used for VOP1, VOP2 or VOPC instructions.
1491  *
1492  * omod and SGPR/constant operands are only available on GFX9+. For VOPC,
1493  * the definition doesn't have to be VCC on GFX9+.
1494  *
1495  */
/* Sub-dword addressing: per-source and destination byte selections plus the
 * usual VALU modifiers (see the comment block preceding this struct). */
struct SDWA_instruction : public Instruction {
   /* these destination modifiers aren't available with VOPC except for
    * clamp on GFX8 */
   SubdwordSel sel[2];
   SubdwordSel dst_sel;
   bool neg[2];
   bool abs[2];
   bool clamp : 1;
   uint8_t omod : 2; /* GFX9+ */
   uint8_t padding : 5;
};
static_assert(sizeof(SDWA_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1508 
/* VINTRP: attribute interpolation. NOTE(review): attribute/component
 * presumably select which input attribute channel is interpolated — confirm
 * with the instruction selection code. */
struct Interp_instruction : public Instruction {
   uint8_t attribute;
   uint8_t component;
   uint16_t padding;
};
static_assert(sizeof(Interp_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1515 
1516 /**
1517  * Local and Global Data Sharing instructions
1518  * Operand(0): ADDR - VGPR which supplies the address.
1519  * Operand(1): DATA0 - First data VGPR.
1520  * Operand(2): DATA1 - Second data VGPR.
1521  * Operand(n-1): M0 - LDS size.
1522  * Definition(0): VDST - Destination VGPR when results returned to VGPRs.
1523  *
1524  */
/* LDS/GDS instruction; operand layout is described in the comment block
 * preceding this struct. Two separate offsets are encoded, matching the
 * hardware's offset0/offset1 fields. */
struct DS_instruction : public Instruction {
   memory_sync_info sync;
   bool gds;
   int16_t offset0;
   int8_t offset1;
   uint8_t padding;
};
static_assert(sizeof(DS_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1533 
1534 /**
1535  * Vector Memory Untyped-buffer Instructions
1536  * Operand(0): SRSRC - Specifies which SGPR supplies T# (resource constant)
1537  * Operand(1): VADDR - Address source. Can carry an index and/or offset
1538  * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant)
1539  * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data
1540  *
1541  */
/* Untyped buffer access; operand layout is described in the comment block
 * preceding this struct. The bit-fields are packed into fixed allocation
 * units — do not reorder, the size is locked by the static_assert below. */
struct MUBUF_instruction : public Instruction {
   memory_sync_info sync;
   bool offen : 1;           /* Supply an offset from VGPR (VADDR) */
   bool idxen : 1;           /* Supply an index from VGPR (VADDR) */
   bool addr64 : 1;          /* SI, CIK: Address size is 64-bit */
   bool glc : 1;             /* globally coherent */
   bool dlc : 1;             /* NAVI: device level coherent */
   bool slc : 1;             /* system level coherent */
   bool tfe : 1;             /* texture fail enable */
   bool lds : 1;             /* Return read-data to LDS instead of VGPRs */
   uint16_t disable_wqm : 1; /* Require an exec mask without helper invocations */
   uint16_t offset : 12;     /* Unsigned byte offset - 12 bit */
   uint16_t swizzled : 1;
   uint16_t padding0 : 2;
   uint16_t vtx_binding : 6; /* 0 if this is not a vertex attribute load */
   uint16_t padding1 : 10;
};
static_assert(sizeof(MUBUF_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1560 
1561 /**
1562  * Vector Memory Typed-buffer Instructions
1563  * Operand(0): SRSRC - Specifies which SGPR supplies T# (resource constant)
1564  * Operand(1): VADDR - Address source. Can carry an index and/or offset
1565  * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant)
1566  * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data
1567  *
1568  */
/* Typed buffer access; operand layout is described in the comment block
 * preceding this struct. Field packing is size-locked by the static_assert. */
struct MTBUF_instruction : public Instruction {
   memory_sync_info sync;
   uint8_t dfmt : 4;         /* Data Format of data in memory buffer */
   uint8_t nfmt : 3;         /* Numeric format of data in memory */
   bool offen : 1;           /* Supply an offset from VGPR (VADDR) */
   uint16_t idxen : 1;       /* Supply an index from VGPR (VADDR) */
   uint16_t glc : 1;         /* globally coherent */
   uint16_t dlc : 1;         /* NAVI: device level coherent */
   uint16_t slc : 1;         /* system level coherent */
   uint16_t tfe : 1;         /* texture fail enable */
   uint16_t disable_wqm : 1; /* Require an exec mask without helper invocations */
   uint16_t vtx_binding : 6; /* 0 if this is not a vertex attribute load */
   uint16_t padding : 4;
   uint16_t offset; /* Unsigned byte offset - 12 bit */
};
static_assert(sizeof(MTBUF_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1585 
1586 /**
1587  * Vector Memory Image Instructions
1588  * Operand(0) SRSRC - Scalar GPR that specifies the resource constant.
1589  * Operand(1): SSAMP - Scalar GPR that specifies sampler constant.
1590  * Operand(2): VDATA - Vector GPR for write data or zero if TFE/LWE=1.
1591  * Operand(3): VADDR - Address source. Can carry an offset or an index.
1592  * Definition(0): VDATA - Vector GPR for read result.
1593  *
1594  */
/* Image memory access; operand layout is described in the comment block
 * preceding this struct. */
struct MIMG_instruction : public Instruction {
   memory_sync_info sync;
   uint8_t dmask;        /* Data VGPR enable mask */
   uint8_t dim : 3;      /* NAVI: dimensionality */
   bool unrm : 1;        /* Force address to be un-normalized */
   bool dlc : 1;         /* NAVI: device level coherent */
   bool glc : 1;         /* globally coherent */
   bool slc : 1;         /* system level coherent */
   bool tfe : 1;         /* texture fail enable */
   bool da : 1;          /* declare an array */
   bool lwe : 1;         /* LOD warning enable */
   bool r128 : 1;        /* NAVI: Texture resource size */
   bool a16 : 1;         /* VEGA, NAVI: Address components are 16-bits */
   bool d16 : 1;         /* Convert 32-bit data to 16-bit data */
   bool disable_wqm : 1; /* Require an exec mask without helper invocations */
   uint8_t padding0 : 2;
   uint8_t padding1;
   uint8_t padding2;
};
static_assert(sizeof(MIMG_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1615 
1616 /**
1617  * Flat/Scratch/Global Instructions
1618  * Operand(0): ADDR
1619  * Operand(1): SADDR
1620  * Operand(2) / Definition(0): DATA/VDST
1621  *
1622  */
/* Shared layout for FLAT, GLOBAL and SCRATCH instructions (see the cast
 * helpers flat()/global()/scratch() on Instruction). */
struct FLAT_instruction : public Instruction {
   memory_sync_info sync;
   bool slc : 1; /* system level coherent */
   bool glc : 1; /* globally coherent */
   bool dlc : 1; /* NAVI: device level coherent */
   bool lds : 1;
   bool nv : 1;
   bool disable_wqm : 1; /* Require an exec mask without helper invocations */
   uint8_t padding0 : 2;
   uint16_t offset; /* Vega/Navi only */
   uint16_t padding1;
};
static_assert(sizeof(FLAT_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1636 
/* Export (exp) instruction. NOTE(review): `dest` presumably selects the
 * export target (MRT/position/parameter) and `enabled_mask` the written
 * components — confirm with the export emission code. */
struct Export_instruction : public Instruction {
   uint8_t enabled_mask;
   uint8_t dest;
   bool compressed : 1;
   bool done : 1;
   bool valid_mask : 1;
   uint8_t padding0 : 5;
   uint8_t padding1;
};
static_assert(sizeof(Export_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1647 
/* Generic pseudo instruction (lowered before encoding). */
struct Pseudo_instruction : public Instruction {
   PhysReg scratch_sgpr; /* might not be valid if it's not needed */
   bool tmp_in_scc;
   uint8_t padding;
};
static_assert(sizeof(Pseudo_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1654 
struct Pseudo_branch_instruction : public Instruction {
   /* target[0] is the block index of the branch target.
    * For conditional branches, target[1] contains the fall-through alternative.
    * A value of 0 means the target has not been initialized (BB0 cannot be a branch target).
    */
   uint32_t target[2];
};
static_assert(sizeof(Pseudo_branch_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1663 
/* Barrier pseudo instruction: carries the memory synchronization info and
 * the execution scope the barrier applies to. */
struct Pseudo_barrier_instruction : public Instruction {
   memory_sync_info sync;
   sync_scope exec_scope;
};
static_assert(sizeof(Pseudo_barrier_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1669 
/* Reduction operations for Pseudo_reduction_instruction. Naming scheme:
 * <type-prefix><operation><bit width>, e.g. iadd32, fmin16, umax64. */
enum ReduceOp : uint16_t {
   // clang-format off
   iadd8, iadd16, iadd32, iadd64,
   imul8, imul16, imul32, imul64,
          fadd16, fadd32, fadd64,
          fmul16, fmul32, fmul64,
   imin8, imin16, imin32, imin64,
   imax8, imax16, imax32, imax64,
   umin8, umin16, umin32, umin64,
   umax8, umax16, umax32, umax64,
          fmin16, fmin32, fmin64,
          fmax16, fmax32, fmax64,
   iand8, iand16, iand32, iand64,
   ior8, ior16, ior32, ior64,
   ixor8, ixor16, ixor32, ixor64,
   num_reduce_ops,
   // clang-format on
};
1688 
/**
 * Subgroup Reduction Instructions, everything except for the data to be
 * reduced and the result as inserted by setup_reduce_temp().
 * Operand(0): data to be reduced
 * Operand(1): reduce temporary
 * Operand(2): vector temporary
 * Definition(0): result
 * Definition(1): scalar temporary
 * Definition(2): scalar identity temporary (not used to store identity on GFX10)
 * Definition(3): scc clobber
 * Definition(4): vcc clobber
 */
struct Pseudo_reduction_instruction : public Instruction {
   ReduceOp reduce_op;
   uint16_t cluster_size; // must be 0 for scans
};
static_assert(sizeof(Pseudo_reduction_instruction) == sizeof(Instruction) + 4,
              "Unexpected padding");
1708 
/* Deleter used by aco_ptr: instructions are allocated with calloc() in
 * create_instruction(), so they must be released with free(), not delete. */
struct instr_deleter_functor {
   void operator()(void* ptr) { free(ptr); }
};

/* Owning pointer type for calloc-allocated ACO instructions. */
template <typename T> using aco_ptr = std::unique_ptr<T, instr_deleter_functor>;
1714 
1715 template <typename T>
1716 T*
create_instruction(aco_opcode opcode,Format format,uint32_t num_operands,uint32_t num_definitions)1717 create_instruction(aco_opcode opcode, Format format, uint32_t num_operands,
1718                    uint32_t num_definitions)
1719 {
1720    std::size_t size =
1721       sizeof(T) + num_operands * sizeof(Operand) + num_definitions * sizeof(Definition);
1722    char* data = (char*)calloc(1, size);
1723    T* inst = (T*)data;
1724 
1725    inst->opcode = opcode;
1726    inst->format = format;
1727 
1728    uint16_t operands_offset = data + sizeof(T) - (char*)&inst->operands;
1729    inst->operands = aco::span<Operand>(operands_offset, num_operands);
1730    uint16_t definitions_offset = (char*)inst->operands.end() - (char*)&inst->definitions;
1731    inst->definitions = aco::span<Definition>(definitions_offset, num_definitions);
1732 
1733    return inst;
1734 }
1735 
1736 constexpr bool
usesModifiers()1737 Instruction::usesModifiers() const noexcept
1738 {
1739    if (isDPP() || isSDWA())
1740       return true;
1741 
1742    if (isVOP3P()) {
1743       const VOP3P_instruction& vop3p = this->vop3p();
1744       for (unsigned i = 0; i < operands.size(); i++) {
1745          if (vop3p.neg_lo[i] || vop3p.neg_hi[i])
1746             return true;
1747 
1748          /* opsel_hi must be 1 to not be considered a modifier - even for constants */
1749          if (!(vop3p.opsel_hi & (1 << i)))
1750             return true;
1751       }
1752       return vop3p.opsel_lo || vop3p.clamp;
1753    } else if (isVOP3()) {
1754       const VOP3_instruction& vop3 = this->vop3();
1755       for (unsigned i = 0; i < operands.size(); i++) {
1756          if (vop3.abs[i] || vop3.neg[i])
1757             return true;
1758       }
1759       return vop3.opsel || vop3.clamp || vop3.omod;
1760    }
1761    return false;
1762 }
1763 
1764 constexpr bool
is_phi(Instruction * instr)1765 is_phi(Instruction* instr)
1766 {
1767    return instr->opcode == aco_opcode::p_phi || instr->opcode == aco_opcode::p_linear_phi;
1768 }
1769 
1770 static inline bool
is_phi(aco_ptr<Instruction> & instr)1771 is_phi(aco_ptr<Instruction>& instr)
1772 {
1773    return is_phi(instr.get());
1774 }
1775 
/* Memory synchronization info attached to the instruction, if any. */
memory_sync_info get_sync_info(const Instruction* instr);

/* Whether the instruction can be removed given per-temp use counts. */
bool is_dead(const std::vector<uint16_t>& uses, Instruction* instr);

/* Encoding capability queries (opsel/16-bit/SDWA/DPP). */
bool can_use_opsel(chip_class chip, aco_opcode op, int idx);
bool instr_is_16bit(chip_class chip, aco_opcode op);
bool can_use_SDWA(chip_class chip, const aco_ptr<Instruction>& instr, bool pre_ra);
bool can_use_DPP(const aco_ptr<Instruction>& instr, bool pre_ra, bool dpp8);
/* updates "instr" and returns the old instruction (or NULL if no update was needed) */
aco_ptr<Instruction> convert_to_SDWA(chip_class chip, aco_ptr<Instruction>& instr);
aco_ptr<Instruction> convert_to_DPP(aco_ptr<Instruction>& instr, bool dpp8);
bool needs_exec_mask(const Instruction* instr);

/* Mappings between related comparison opcodes. */
aco_opcode get_ordered(aco_opcode op);
aco_opcode get_unordered(aco_opcode op);
aco_opcode get_inverse(aco_opcode op);
aco_opcode get_f32_cmp(aco_opcode op);
unsigned get_cmp_bitsize(aco_opcode op);
bool is_cmp(aco_opcode op);

bool can_swap_operands(aco_ptr<Instruction>& instr, aco_opcode* new_op);

/* Identity value for the given reduction op; idx presumably selects the dword
 * for 64-bit identities — confirm in the implementation. */
uint32_t get_reduction_identity(ReduceOp op, unsigned idx);

unsigned get_mimg_nsa_dwords(const Instruction* instr);

/* Whether two memory instructions should be placed in the same clause. */
bool should_form_clause(const Instruction* a, const Instruction* b);
1803 
/* Role flags for a basic block (bitmask, stored in Block::kind).
 * Bits 11 and 14 are currently unassigned. */
enum block_kind {
   /* uniform indicates that leaving this block,
    * all actives lanes stay active */
   block_kind_uniform = 1 << 0,
   block_kind_top_level = 1 << 1,
   block_kind_loop_preheader = 1 << 2,
   block_kind_loop_header = 1 << 3,
   block_kind_loop_exit = 1 << 4,
   block_kind_continue = 1 << 5,
   block_kind_break = 1 << 6,
   block_kind_continue_or_break = 1 << 7,
   block_kind_branch = 1 << 8,
   block_kind_merge = 1 << 9,
   block_kind_invert = 1 << 10,
   block_kind_uses_discard = 1 << 12,
   block_kind_needs_lowering = 1 << 13,
   block_kind_export_end = 1 << 15,
};
1822 
1823 struct RegisterDemand {
1824    constexpr RegisterDemand() = default;
RegisterDemandRegisterDemand1825    constexpr RegisterDemand(const int16_t v, const int16_t s) noexcept : vgpr{v}, sgpr{s} {}
1826    int16_t vgpr = 0;
1827    int16_t sgpr = 0;
1828 
1829    constexpr friend bool operator==(const RegisterDemand a, const RegisterDemand b) noexcept
1830    {
1831       return a.vgpr == b.vgpr && a.sgpr == b.sgpr;
1832    }
1833 
exceedsRegisterDemand1834    constexpr bool exceeds(const RegisterDemand other) const noexcept
1835    {
1836       return vgpr > other.vgpr || sgpr > other.sgpr;
1837    }
1838 
1839    constexpr RegisterDemand operator+(const Temp t) const noexcept
1840    {
1841       if (t.type() == RegType::sgpr)
1842          return RegisterDemand(vgpr, sgpr + t.size());
1843       else
1844          return RegisterDemand(vgpr + t.size(), sgpr);
1845    }
1846 
1847    constexpr RegisterDemand operator+(const RegisterDemand other) const noexcept
1848    {
1849       return RegisterDemand(vgpr + other.vgpr, sgpr + other.sgpr);
1850    }
1851 
1852    constexpr RegisterDemand operator-(const RegisterDemand other) const noexcept
1853    {
1854       return RegisterDemand(vgpr - other.vgpr, sgpr - other.sgpr);
1855    }
1856 
1857    constexpr RegisterDemand& operator+=(const RegisterDemand other) noexcept
1858    {
1859       vgpr += other.vgpr;
1860       sgpr += other.sgpr;
1861       return *this;
1862    }
1863 
1864    constexpr RegisterDemand& operator-=(const RegisterDemand other) noexcept
1865    {
1866       vgpr -= other.vgpr;
1867       sgpr -= other.sgpr;
1868       return *this;
1869    }
1870 
1871    constexpr RegisterDemand& operator+=(const Temp t) noexcept
1872    {
1873       if (t.type() == RegType::sgpr)
1874          sgpr += t.size();
1875       else
1876          vgpr += t.size();
1877       return *this;
1878    }
1879 
1880    constexpr RegisterDemand& operator-=(const Temp t) noexcept
1881    {
1882       if (t.type() == RegType::sgpr)
1883          sgpr -= t.size();
1884       else
1885          vgpr -= t.size();
1886       return *this;
1887    }
1888 
updateRegisterDemand1889    constexpr void update(const RegisterDemand other) noexcept
1890    {
1891       vgpr = std::max(vgpr, other.vgpr);
1892       sgpr = std::max(sgpr, other.sgpr);
1893    }
1894 };
1895 
/* CFG: one basic block of the program. */
struct Block {
   float_mode fp_mode;  /* copied from Program::next_fp_mode on insertion */
   unsigned index;      /* position inside Program::blocks */
   unsigned offset = 0; /* NOTE(review): presumably the code offset after assembly — confirm */
   std::vector<aco_ptr<Instruction>> instructions;
   /* predecessor/successor block indices in the logical and linear CFG */
   std::vector<unsigned> logical_preds;
   std::vector<unsigned> linear_preds;
   std::vector<unsigned> logical_succs;
   std::vector<unsigned> linear_succs;
   RegisterDemand register_demand = RegisterDemand();
   uint16_t loop_nest_depth = 0;
   uint16_t divergent_if_logical_depth = 0;
   uint16_t uniform_if_depth = 0;
   uint16_t kind = 0; /* bitmask of block_kind flags */
   /* immediate dominators (block indices); -1 if not computed/none */
   int logical_idom = -1;
   int linear_idom = -1;

   /* this information is needed for predecessors to blocks with phis when
    * moving out of ssa */
   bool scc_live_out = false;

   Block() : index(0) {}
};
1920 
/*
 * Shader stages as provided in Vulkan by the application. Contrast this to HWStage.
 * Each stage is a distinct bit so that merged stages can be expressed as a mask
 * (see operator| below).
 */
enum class SWStage : uint16_t {
   None = 0,
   VS = 1 << 0,     /* Vertex Shader */
   GS = 1 << 1,     /* Geometry Shader */
   TCS = 1 << 2,    /* Tessellation Control aka Hull Shader */
   TES = 1 << 3,    /* Tessellation Evaluation aka Domain Shader */
   FS = 1 << 4,     /* Fragment aka Pixel Shader */
   CS = 1 << 5,     /* Compute Shader */
   TS = 1 << 6,     /* Task Shader */
   MS = 1 << 7,     /* Mesh Shader */
   GSCopy = 1 << 8, /* GS Copy Shader (internal) */

   /* Stage combinations merged to run on a single HWStage */
   VS_GS = VS | GS,
   VS_TCS = VS | TCS,
   TES_GS = TES | GS,
};
1941 
1942 constexpr SWStage
1943 operator|(SWStage a, SWStage b)
1944 {
1945    return static_cast<SWStage>(static_cast<uint16_t>(a) | static_cast<uint16_t>(b));
1946 }
1947 
/*
 * Shader stages as running on the AMD GPU.
 *
 * The relation between HWStages and SWStages is not a one-to-one mapping:
 * Some SWStages are merged by ACO to run on a single HWStage.
 * See README.md for details.
 */
enum class HWStage : uint8_t {
   VS,
   ES, /* Export shader: pre-GS (VS or TES) on GFX6-8. Combined into GS on GFX9 (and GFX10/legacy). */
   GS,  /* Geometry shader on GFX10/legacy and GFX6-9. */
   NGG, /* Primitive shader, used to implement VS, TES, GS. */
   LS,  /* Local shader: pre-TCS (VS) on GFX6-8. Combined into HS on GFX9 (and GFX10/legacy). */
   HS,  /* Hull shader: TCS on GFX6-8. Merged VS and TCS on GFX9-10. */
   FS,  /* Fragment/pixel shader. */
   CS,  /* Compute shader. */
};
1965 
1966 /*
1967  * Set of SWStages to be merged into a single shader paired with the
1968  * HWStage it will run on.
1969  */
1970 struct Stage {
1971    constexpr Stage() = default;
1972 
StageStage1973    explicit constexpr Stage(HWStage hw_, SWStage sw_) : sw(sw_), hw(hw_) {}
1974 
1975    /* Check if the given SWStage is included */
hasStage1976    constexpr bool has(SWStage stage) const
1977    {
1978       return (static_cast<uint16_t>(sw) & static_cast<uint16_t>(stage));
1979    }
1980 
num_sw_stagesStage1981    unsigned num_sw_stages() const { return util_bitcount(static_cast<uint16_t>(sw)); }
1982 
1983    constexpr bool operator==(const Stage& other) const { return sw == other.sw && hw == other.hw; }
1984 
1985    constexpr bool operator!=(const Stage& other) const { return sw != other.sw || hw != other.hw; }
1986 
1987    /* Mask of merged software stages */
1988    SWStage sw = SWStage::None;
1989 
1990    /* Active hardware stage */
1991    HWStage hw{};
1992 };
1993 
/* possible settings of Program::stage */
static constexpr Stage vertex_vs(HWStage::VS, SWStage::VS);
static constexpr Stage fragment_fs(HWStage::FS, SWStage::FS);
static constexpr Stage compute_cs(HWStage::CS, SWStage::CS);
static constexpr Stage tess_eval_vs(HWStage::VS, SWStage::TES);
static constexpr Stage gs_copy_vs(HWStage::VS, SWStage::GSCopy);
/* Mesh shading pipeline */
static constexpr Stage task_cs(HWStage::CS, SWStage::TS);
static constexpr Stage mesh_ngg(HWStage::NGG, SWStage::MS);
/* GFX10/NGG */
static constexpr Stage vertex_ngg(HWStage::NGG, SWStage::VS);
static constexpr Stage vertex_geometry_ngg(HWStage::NGG, SWStage::VS_GS);
static constexpr Stage tess_eval_ngg(HWStage::NGG, SWStage::TES);
static constexpr Stage tess_eval_geometry_ngg(HWStage::NGG, SWStage::TES_GS);
/* GFX9 (and GFX10 if NGG isn't used) */
static constexpr Stage vertex_geometry_gs(HWStage::GS, SWStage::VS_GS);
static constexpr Stage vertex_tess_control_hs(HWStage::HS, SWStage::VS_TCS);
static constexpr Stage tess_eval_geometry_gs(HWStage::GS, SWStage::TES_GS);
/* pre-GFX9 */
static constexpr Stage vertex_ls(HWStage::LS, SWStage::VS); /* vertex before tessellation control */
static constexpr Stage vertex_es(HWStage::ES, SWStage::VS); /* vertex before geometry */
static constexpr Stage tess_control_hs(HWStage::HS, SWStage::TCS);
static constexpr Stage tess_eval_es(HWStage::ES,
                                    SWStage::TES); /* tessellation evaluation before geometry */
static constexpr Stage geometry_gs(HWStage::GS, SWStage::GS);
2019 
/* Indices into Program::statistics (filled when Program::collect_statistics is set). */
enum statistic {
   statistic_hash,
   statistic_instructions,
   statistic_copies,
   statistic_branches,
   statistic_latency,
   statistic_inv_throughput,
   statistic_vmem_clauses,
   statistic_smem_clauses,
   statistic_sgpr_presched,
   statistic_vgpr_presched,
   num_statistics
};
2033 
/* Static hardware properties of the target device. */
struct DeviceInfo {
   uint16_t lds_encoding_granule;
   uint16_t lds_alloc_granule;
   uint32_t lds_limit; /* in bytes */
   bool has_16bank_lds;
   /* register counts physically present vs. addressable limits */
   uint16_t physical_sgprs;
   uint16_t physical_vgprs;
   uint16_t vgpr_limit;
   uint16_t sgpr_limit;
   uint16_t sgpr_alloc_granule;
   uint16_t vgpr_alloc_granule; /* must be power of two */
   unsigned max_wave64_per_simd;
   unsigned simd_per_cu;
   bool has_fast_fma32 = false;
   bool has_mac_legacy32 = false;
   bool fused_mad_mix = false;
   bool xnack_enabled = false;
   bool sram_ecc_enabled = false;
};
2053 
/* Coarse compilation phase, recorded in Program::progress. */
enum class CompilationProgress {
   after_isel,
   after_spilling,
   after_ra,
};
2059 
/* The complete shader program being compiled: CFG, per-temp register classes,
 * target information and compilation state. */
class Program final {
public:
   std::vector<Block> blocks;
   /* RegClass of each temporary, indexed by temp id. allocationID starts at 1
    * and temp_rc starts with one placeholder entry, so indices stay in sync. */
   std::vector<RegClass> temp_rc = {s1};
   RegisterDemand max_reg_demand = RegisterDemand();
   uint16_t num_waves = 0;
   uint16_t max_waves = 0; /* maximum number of waves, regardless of register usage */
   ac_shader_config* config;
   const struct radv_shader_info* info;
   enum chip_class chip_class;
   enum radeon_family family;
   DeviceInfo dev;
   unsigned wave_size;
   RegClass lane_mask;
   Stage stage;
   bool needs_exact = false; /* there exists an instruction with disable_wqm = true */
   bool needs_wqm = false;   /* there exists a p_wqm instruction */

   std::vector<uint8_t> constant_data;
   Temp private_segment_buffer;
   Temp scratch_offset;

   uint16_t min_waves = 0;
   unsigned workgroup_size; /* if known; otherwise UINT_MAX */
   bool wgp_mode;
   bool early_rast = false; /* whether rasterization can start as soon as the 1st DONE pos export */

   bool needs_vcc = false;
   bool needs_flat_scr = false;

   CompilationProgress progress;

   bool collect_statistics = false;
   uint32_t statistics[num_statistics];

   /* state inherited by newly inserted blocks (see insert_block()) */
   float_mode next_fp_mode;
   unsigned next_loop_depth = 0;
   unsigned next_divergent_if_logical_depth = 0;
   unsigned next_uniform_if_depth = 0;

   std::vector<Definition> vs_inputs;

   /* Compiler debug output sink: messages go to "output" and, if set, the
    * "func" callback with "private_data". */
   struct {
      FILE* output = stderr;
      bool shorten_messages = false;
      void (*func)(void* private_data, enum radv_compiler_debug_level level, const char* message);
      void* private_data;
   } debug;

   /* Allocates a fresh temp id with the given RegClass.
    * Ids are limited to 24 bits (16777215 == 2^24 - 1). */
   uint32_t allocateId(RegClass rc)
   {
      assert(allocationID <= 16777215);
      temp_rc.push_back(rc);
      return allocationID++;
   }

   /* Reserves "amount" consecutive temp ids (RegClasses default-initialized). */
   void allocateRange(unsigned amount)
   {
      assert(allocationID + amount <= 16777216);
      temp_rc.resize(temp_rc.size() + amount);
      allocationID += amount;
   }

   Temp allocateTmp(RegClass rc) { return Temp(allocateId(rc), rc); }

   /* Next id that allocateId() would return, without allocating it. */
   uint32_t peekAllocationId() { return allocationID; }

   friend void reindex_ssa(Program* program);
   friend void reindex_ssa(Program* program, std::vector<IDSet>& live_out);

   Block* create_and_insert_block()
   {
      Block block;
      return insert_block(std::move(block));
   }

   /* Appends the block, assigning its index and the current next_* state. */
   Block* insert_block(Block&& block)
   {
      block.index = blocks.size();
      block.fp_mode = next_fp_mode;
      block.loop_nest_depth = next_loop_depth;
      block.divergent_if_logical_depth = next_divergent_if_logical_depth;
      block.uniform_if_depth = next_uniform_if_depth;
      blocks.emplace_back(std::move(block));
      return &blocks.back();
   }

private:
   uint32_t allocationID = 1;
};
2150 
/* Result of live_var_analysis(). */
struct live {
   /* live temps out per block */
   std::vector<IDSet> live_out;
   /* register demand (sgpr/vgpr) per instruction per block */
   std::vector<std::vector<RegisterDemand>> register_demand;
};
2157 
/* Testing knobs for register_allocation(). */
struct ra_test_policy {
   /* Force RA to always use its pessimistic fallback algorithm */
   bool skip_optimistic_path = false;
};
2162 
void init();

void init_program(Program* program, Stage stage, const struct radv_shader_info* info,
                  enum chip_class chip_class, enum radeon_family family, bool wgp_mode,
                  ac_shader_config* config);

/* Instruction selection entry points (NIR -> ACO IR). */
void select_program(Program* program, unsigned shader_count, struct nir_shader* const* shaders,
                    ac_shader_config* config, const struct radv_nir_compiler_options* options,
                    const struct radv_shader_info* info,
                    const struct radv_shader_args* args);
void select_gs_copy_shader(Program* program, struct nir_shader* gs_shader, ac_shader_config* config,
                           const struct radv_nir_compiler_options* options,
                           const struct radv_shader_info* info,
                           const struct radv_shader_args* args);
void select_trap_handler_shader(Program* program, struct nir_shader* shader,
                                ac_shader_config* config,
                                const struct radv_nir_compiler_options* options,
                                const struct radv_shader_info* info,
                                const struct radv_shader_args* args);
void select_vs_prolog(Program* program, const struct radv_vs_prolog_key* key,
                      ac_shader_config* config,
                      const struct radv_nir_compiler_options* options,
                      const struct radv_shader_info* info,
                      const struct radv_shader_args* args,
                      unsigned* num_preserved_sgprs);

/* Compilation passes over the IR. */
void lower_phis(Program* program);
void calc_min_waves(Program* program);
void update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand);
live live_var_analysis(Program* program);
std::vector<uint16_t> dead_code_analysis(Program* program);
void dominator_tree(Program* program);
void insert_exec_mask(Program* program);
void value_numbering(Program* program);
void optimize(Program* program);
void optimize_postRA(Program* program);
void setup_reduce_temp(Program* program);
void lower_to_cssa(Program* program, live& live_vars);
void register_allocation(Program* program, std::vector<IDSet>& live_out_per_block,
                         ra_test_policy = {});
void ssa_elimination(Program* program);
void lower_to_hw_instr(Program* program);
void schedule_program(Program* program, live& live_vars);
void spill(Program* program, live& live_vars);
void insert_wait_states(Program* program);
void insert_NOPs(Program* program);
void form_hard_clauses(Program* program);
/* Emits machine code into "code"; returns the size of the emitted code (unit:
 * presumably dwords, given the uint32_t vector — confirm in the implementation). */
unsigned emit_program(Program* program, std::vector<uint32_t>& code);
/**
 * Returns true if print_asm can disassemble the given program for the current build/runtime
 * configuration
 */
bool check_print_asm_support(Program* program);
bool print_asm(Program* program, std::vector<uint32_t>& binary, unsigned exec_size, FILE* output);
bool validate_ir(Program* program);
bool validate_ra(Program* program);
#ifndef NDEBUG
void perfwarn(Program* program, bool cond, const char* msg, Instruction* instr = NULL);
#else
/* no-op in release builds */
#define perfwarn(program, cond, msg, ...)                                                          \
   do {                                                                                            \
   } while (0)
#endif

void collect_presched_stats(Program* program);
void collect_preasm_stats(Program* program);
void collect_postasm_stats(Program* program, const std::vector<uint32_t>& code);

/* Flags for the IR pretty-printers below. */
enum print_flags {
   print_no_ssa = 0x1,
   print_perf_info = 0x2,
   print_kill = 0x4,
   print_live_vars = 0x8,
};

void aco_print_operand(const Operand* operand, FILE* output, unsigned flags = 0);
void aco_print_instr(const Instruction* instr, FILE* output, unsigned flags = 0);
void aco_print_program(const Program* program, FILE* output, unsigned flags = 0);
void aco_print_program(const Program* program, FILE* output, const live& live_vars,
                       unsigned flags = 0);

void _aco_perfwarn(Program* program, const char* file, unsigned line, const char* fmt, ...);
void _aco_err(Program* program, const char* file, unsigned line, const char* fmt, ...);

#define aco_perfwarn(program, ...) _aco_perfwarn(program, __FILE__, __LINE__, __VA_ARGS__)
#define aco_err(program, ...)      _aco_err(program, __FILE__, __LINE__, __VA_ARGS__)

/* utilities for dealing with register demand */
RegisterDemand get_live_changes(aco_ptr<Instruction>& instr);
RegisterDemand get_temp_registers(aco_ptr<Instruction>& instr);
RegisterDemand get_demand_before(RegisterDemand demand, aco_ptr<Instruction>& instr,
                                 aco_ptr<Instruction>& instr_before);

/* number of sgprs that need to be allocated but might not be addressable as s0-s105 */
uint16_t get_extra_sgprs(Program* program);

/* get number of sgprs/vgprs allocated required to address a number of sgprs/vgprs */
uint16_t get_sgpr_alloc(Program* program, uint16_t addressable_sgprs);
uint16_t get_vgpr_alloc(Program* program, uint16_t addressable_vgprs);

/* return number of addressable sgprs/vgprs for max_waves */
uint16_t get_addr_sgpr_from_waves(Program* program, uint16_t max_waves);
uint16_t get_addr_vgpr_from_waves(Program* program, uint16_t max_waves);
2266 
/* Static per-opcode information tables, indexed by aco_opcode. */
typedef struct {
   /* hardware opcode encodings per generation */
   const int16_t opcode_gfx7[static_cast<int>(aco_opcode::num_opcodes)];
   const int16_t opcode_gfx9[static_cast<int>(aco_opcode::num_opcodes)];
   const int16_t opcode_gfx10[static_cast<int>(aco_opcode::num_opcodes)];
   const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> can_use_input_modifiers;
   const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> can_use_output_modifiers;
   const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> is_atomic;
   const char* name[static_cast<int>(aco_opcode::num_opcodes)];
   const aco::Format format[static_cast<int>(aco_opcode::num_opcodes)];
   /* sizes used for input/output modifiers and constants */
   const unsigned operand_size[static_cast<int>(aco_opcode::num_opcodes)];
   const instr_class classes[static_cast<int>(aco_opcode::num_opcodes)];
} Info;

/* Singleton opcode-info table (defined in generated code). */
extern const Info instr_info;
2282 
2283 } // namespace aco
2284 
2285 #endif /* ACO_IR_H */
2286