1 /*
2 * Copyright © 2018 Valve Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 */
24
25 #ifndef ACO_IR_H
26 #define ACO_IR_H
27
28 #include "aco_opcodes.h"
29 #include "aco_util.h"
30
31 #include "vulkan/radv_shader.h"
32
33 #include "nir.h"
34
35 #include <bitset>
36 #include <memory>
37 #include <vector>
38
39 struct radv_shader_args;
40 struct radv_shader_info;
41 struct radv_vs_prolog_key;
42
43 namespace aco {
44
45 extern uint64_t debug_flags;
46
/* Debug option bits tested against aco::debug_flags; each option is a
 * single bit so options can be OR'd together. */
enum {
   DEBUG_VALIDATE_IR = 1 << 0,
   DEBUG_VALIDATE_RA = 1 << 1,
   DEBUG_PERFWARN = 1 << 2,
   DEBUG_FORCE_WAITCNT = 1 << 3,
   DEBUG_NO_VN = 1 << 4,
   DEBUG_NO_OPT = 1 << 5,
   DEBUG_NO_SCHED = 1 << 6,
   DEBUG_PERF_INFO = 1 << 7,
   DEBUG_LIVE_INFO = 1 << 8,
};
58
59 /**
60 * Representation of the instruction's microcode encoding format
61 * Note: Some Vector ALU Formats can be combined, such that:
62 * - VOP2* | VOP3 represents a VOP2 instruction in VOP3 encoding
63 * - VOP2* | DPP represents a VOP2 instruction with data parallel primitive.
64 * - VOP2* | SDWA represents a VOP2 instruction with sub-dword addressing.
65 *
66 * (*) The same is applicable for VOP1 and VOPC instructions.
67 */
enum class Format : std::uint16_t {
   /* Pseudo Instruction Format */
   PSEUDO = 0,
   /* Scalar ALU & Control Formats */
   SOP1 = 1,
   SOP2 = 2,
   SOPK = 3,
   SOPP = 4,
   SOPC = 5,
   /* Scalar Memory Format */
   SMEM = 6,
   /* LDS/GDS Format */
   DS = 8,
   /* Vector Memory Buffer Formats */
   MTBUF = 9,
   MUBUF = 10,
   /* Vector Memory Image Format */
   MIMG = 11,
   /* Export Format */
   EXP = 12,
   /* Flat Formats */
   FLAT = 13,
   GLOBAL = 14,
   SCRATCH = 15,

   PSEUDO_BRANCH = 16,
   PSEUDO_BARRIER = 17,
   PSEUDO_REDUCTION = 18,

   /* Vector ALU Formats */
   VOP3P = 19,
   /* VOP1/VOP2/VOPC occupy individual bits so that, as described in the
    * comment above, they can be combined with VOP3, DPP16/DPP8 or SDWA. */
   VOP1 = 1 << 8,
   VOP2 = 1 << 9,
   VOPC = 1 << 10,
   VOP3 = 1 << 11,
   /* Vector Parameter Interpolation Format */
   VINTRP = 1 << 12,
   DPP16 = 1 << 13,
   SDWA = 1 << 14,
   DPP8 = 1 << 15,
};
109
/* Coarse classification of instructions; `count` is the number of classes.
 * NOTE(review): the consumers of this classification are not visible in
 * this header chunk. */
enum class instr_class : uint8_t {
   valu32 = 0,
   valu_convert32 = 1,
   valu64 = 2,
   valu_quarter_rate32 = 3,
   valu_fma = 4,
   valu_transcendental32 = 5,
   valu_double = 6,
   valu_double_add = 7,
   valu_double_convert = 8,
   valu_double_transcendental = 9,
   salu = 10,
   smem = 11,
   barrier = 12,
   branch = 13,
   sendmsg = 14,
   ds = 15,
   exp = 16,
   vmem = 17,
   waitcnt = 18,
   other = 19,
   count,
};
133
/* Bitmask of the kinds of memory an instruction accesses; combined with
 * memory_semantics and sync_scope in memory_sync_info below. */
enum storage_class : uint8_t {
   storage_none = 0x0, /* no synchronization and can be reordered around aliasing stores */
   storage_buffer = 0x1, /* SSBOs and global memory */
   storage_atomic_counter = 0x2, /* not used for Vulkan */
   storage_image = 0x4,
   storage_shared = 0x8, /* or TCS output */
   storage_vmem_output = 0x10, /* GS or TCS output stores using VMEM */
   storage_task_payload = 0x20, /* Task-Mesh payload */
   storage_scratch = 0x40,
   storage_vgpr_spill = 0x80,
   storage_count = 8, /* number of storage bits above, not counting storage_none */
};
146
enum memory_semantics : uint8_t {
   semantic_none = 0x0,
   /* for loads: don't move any access after this load to before this load (even other loads)
    * for barriers: don't move any access after the barrier to before any
    * atomics/control_barriers/sendmsg_gs_done before the barrier */
   semantic_acquire = 0x1,
   /* for stores: don't move any access before this store to after this store
    * for barriers: don't move any access before the barrier to after any
    * atomics/control_barriers/sendmsg_gs_done after the barrier */
   semantic_release = 0x2,

   /* the rest are for load/stores/atomics only */
   /* cannot be DCE'd or CSE'd */
   semantic_volatile = 0x4,
   /* does not interact with barriers and assumes this lane is the only lane
    * accessing this memory */
   semantic_private = 0x8,
   /* this operation can be reordered around operations of the same storage.
    * says nothing about barriers */
   semantic_can_reorder = 0x10,
   /* this is an atomic instruction (may only read or write memory) */
   semantic_atomic = 0x20,
   /* this instruction both reads and writes memory */
   semantic_rmw = 0x40,

   semantic_acqrel = semantic_acquire | semantic_release,
   semantic_atomicrmw = semantic_volatile | semantic_atomic | semantic_rmw,
};
175
/* Synchronization scope for memory operations and barriers, ordered from
 * narrowest (a single invocation) to widest (the whole device). */
enum sync_scope : uint8_t {
   scope_invocation = 0,
   scope_subgroup = 1,
   scope_workgroup = 2,
   scope_queuefamily = 3,
   scope_device = 4,
};
183
184 struct memory_sync_info {
memory_sync_infomemory_sync_info185 memory_sync_info() : storage(storage_none), semantics(semantic_none), scope(scope_invocation) {}
186 memory_sync_info(int storage_, int semantics_ = 0, sync_scope scope_ = scope_invocation)
187 : storage((storage_class)storage_), semantics((memory_semantics)semantics_), scope(scope_)
188 {}
189
190 storage_class storage : 8;
191 memory_semantics semantics : 8;
192 sync_scope scope : 8;
193
194 bool operator==(const memory_sync_info& rhs) const
195 {
196 return storage == rhs.storage && semantics == rhs.semantics && scope == rhs.scope;
197 }
198
can_reordermemory_sync_info199 bool can_reorder() const
200 {
201 if (semantics & semantic_acqrel)
202 return false;
203 /* Also check storage so that zero-initialized memory_sync_info can be
204 * reordered. */
205 return (!storage || (semantics & semantic_can_reorder)) && !(semantics & semantic_volatile);
206 }
207 };
208 static_assert(sizeof(memory_sync_info) == 3, "Unexpected padding");
209
/* Float rounding modes; the 2-bit values match the encoding used in the
 * hardware MODE register (see float_mode below). */
enum fp_round {
   fp_round_ne = 0, /* round to nearest even */
   fp_round_pi = 1, /* round towards +infinity */
   fp_round_ni = 2, /* round towards -infinity */
   fp_round_tz = 3, /* round towards zero */
};
216
/* Denormal handling; the 2-bit values match the encoding used in the
 * hardware MODE register (see float_mode below). */
enum fp_denorm {
   /* Note that v_rcp_f32, v_exp_f32, v_log_f32, v_sqrt_f32, v_rsq_f32 and
    * v_mad_f32/v_madak_f32/v_madmk_f32/v_mac_f32 always flush denormals. */
   fp_denorm_flush = 0x0,   /* flush denormal inputs and outputs */
   fp_denorm_keep_in = 0x1, /* preserve denormal inputs */
   fp_denorm_keep_out = 0x2, /* preserve denormal outputs */
   fp_denorm_keep = 0x3,    /* preserve denormal inputs and outputs */
};
225
struct float_mode {
   /* matches encoding of the MODE register */
   union {
      struct {
         /* rounding modes for 32-bit and for 16/64-bit float operations */
         fp_round round32 : 2;
         fp_round round16_64 : 2;
         /* denormal modes (fp_denorm values) */
         unsigned denorm32 : 2;
         unsigned denorm16_64 : 2;
      };
      struct {
         /* aliases viewing both rounding fields / both denorm fields at once */
         uint8_t round : 4;
         uint8_t denorm : 4;
      };
      /* the whole mode byte */
      uint8_t val = 0;
   };
   /* if false, optimizations which may remove infs/nan/-0.0 can be done */
   bool preserve_signed_zero_inf_nan32 : 1;
   bool preserve_signed_zero_inf_nan16_64 : 1;
   /* if false, optimizations which may remove denormal flushing can be done */
   bool must_flush_denorms32 : 1;
   bool must_flush_denorms16_64 : 1;
   bool care_about_round32 : 1;
   bool care_about_round16_64 : 1;

   /* Returns true if instructions using the mode "other" can safely use the
    * current one instead: the hardware mode bits must match exactly and each
    * of the boolean constraints may only be stricter here than in "other". */
   bool canReplace(float_mode other) const noexcept
   {
      return val == other.val &&
             (preserve_signed_zero_inf_nan32 || !other.preserve_signed_zero_inf_nan32) &&
             (preserve_signed_zero_inf_nan16_64 || !other.preserve_signed_zero_inf_nan16_64) &&
             (must_flush_denorms32 || !other.must_flush_denorms32) &&
             (must_flush_denorms16_64 || !other.must_flush_denorms16_64) &&
             (care_about_round32 || !other.care_about_round32) &&
             (care_about_round16_64 || !other.care_about_round16_64);
   }
};
263
/* Decoded immediate of a wait instruction, split into the individual
 * hardware counters; unset_counter marks a counter that is not waited on.
 * All member functions are implemented out of line. */
struct wait_imm {
   static const uint8_t unset_counter = 0xff;

   /* presumably vmcnt/expcnt/lgkmcnt/vscnt — confirm against the ISA docs */
   uint8_t vm;
   uint8_t exp;
   uint8_t lgkm;
   uint8_t vs;

   wait_imm();
   wait_imm(uint16_t vm_, uint16_t exp_, uint16_t lgkm_, uint16_t vs_);
   /* decode from the chip-specific packed immediate */
   wait_imm(enum chip_class chip, uint16_t packed);

   /* encode into the chip-specific packed immediate */
   uint16_t pack(enum chip_class chip) const;

   /* merge with `other`; NOTE(review): defined elsewhere — presumably
    * tightens each counter and returns whether anything changed */
   bool combine(const wait_imm& other);

   /* true if no counter is set */
   bool empty() const;
};
282
283 constexpr Format
asVOP3(Format format)284 asVOP3(Format format)
285 {
286 return (Format)((uint32_t)Format::VOP3 | (uint32_t)format);
287 };
288
289 constexpr Format
asSDWA(Format format)290 asSDWA(Format format)
291 {
292 assert(format == Format::VOP1 || format == Format::VOP2 || format == Format::VOPC);
293 return (Format)((uint32_t)Format::SDWA | (uint32_t)format);
294 }
295
296 constexpr Format
withoutDPP(Format format)297 withoutDPP(Format format)
298 {
299 return (Format)((uint32_t)format & ~((uint32_t)Format::DPP16 | (uint32_t)Format::DPP8));
300 }
301
/* Register file a value lives in; linear_vgpr denotes VGPRs used for WWM
 * and spills to vgpr (see the RegClass::v1_linear comment below). */
enum class RegType {
   none = 0,
   sgpr,
   vgpr,
   linear_vgpr,
};
308
struct RegClass {

   /* Encoding of RC (one byte):
    *   bits [4:0]: size, in dwords — or in bytes when bit 7 is set
    *   bit 5:      VGPR (otherwise SGPR)
    *   bit 6:      linear VGPR
    *   bit 7:      sub-dword (byte-sized) class */
   enum RC : uint8_t {
      s1 = 1,
      s2 = 2,
      s3 = 3,
      s4 = 4,
      s6 = 6,
      s8 = 8,
      s16 = 16,
      v1 = s1 | (1 << 5),
      v2 = s2 | (1 << 5),
      v3 = s3 | (1 << 5),
      v4 = s4 | (1 << 5),
      v5 = 5 | (1 << 5),
      v6 = 6 | (1 << 5),
      v7 = 7 | (1 << 5),
      v8 = 8 | (1 << 5),
      /* byte-sized register class */
      v1b = v1 | (1 << 7),
      v2b = v2 | (1 << 7),
      v3b = v3 | (1 << 7),
      v4b = v4 | (1 << 7),
      v6b = v6 | (1 << 7),
      v8b = v8 | (1 << 7),
      /* these are used for WWM and spills to vgpr */
      v1_linear = v1 | (1 << 6),
      v2_linear = v2 | (1 << 6),
   };

   RegClass() = default;
   constexpr RegClass(RC rc_) : rc(rc_) {}
   /* size is in dwords here; use get() for a size in bytes */
   constexpr RegClass(RegType type, unsigned size)
       : rc((RC)((type == RegType::vgpr ? 1 << 5 : 0) | size))
   {}

   constexpr operator RC() const { return rc; }
   explicit operator bool() = delete;

   constexpr RegType type() const { return rc <= RC::s16 ? RegType::sgpr : RegType::vgpr; }
   constexpr bool is_linear_vgpr() const { return rc & (1 << 6); };
   constexpr bool is_subdword() const { return rc & (1 << 7); }
   constexpr unsigned bytes() const { return ((unsigned)rc & 0x1F) * (is_subdword() ? 1 : 4); }
   // TODO: use size() less in favor of bytes()
   /* size in dwords, rounded up for sub-dword classes */
   constexpr unsigned size() const { return (bytes() + 3) >> 2; }
   /* SGPR classes are always linear; VGPRs only with the linear bit set */
   constexpr bool is_linear() const { return rc <= RC::s16 || is_linear_vgpr(); }
   constexpr RegClass as_linear() const { return RegClass((RC)(rc | (1 << 6))); }
   constexpr RegClass as_subdword() const { return RegClass((RC)(rc | 1 << 7)); }

   /* Builds a class from a size in bytes; VGPR sizes that are not a whole
    * number of dwords produce a sub-dword class. */
   static constexpr RegClass get(RegType type, unsigned bytes)
   {
      if (type == RegType::sgpr) {
         return RegClass(type, DIV_ROUND_UP(bytes, 4u));
      } else {
         return bytes % 4u ? RegClass(type, bytes).as_subdword() : RegClass(type, bytes / 4u);
      }
   }

   /* Same type with a different size in bytes; preserves the linear flag. */
   constexpr RegClass resize(unsigned bytes) const
   {
      if (is_linear_vgpr()) {
         assert(bytes % 4u == 0);
         return get(RegType::vgpr, bytes).as_linear();
      }
      return get(type(), bytes);
   }

private:
   RC rc;
};
379
/* transitional helper expressions: shorthand constants so code can write
 * e.g. `v1` instead of RegClass(RegClass::v1) */
static constexpr RegClass s1{RegClass::s1};
static constexpr RegClass s2{RegClass::s2};
static constexpr RegClass s3{RegClass::s3};
static constexpr RegClass s4{RegClass::s4};
static constexpr RegClass s8{RegClass::s8};
static constexpr RegClass s16{RegClass::s16};
static constexpr RegClass v1{RegClass::v1};
static constexpr RegClass v2{RegClass::v2};
static constexpr RegClass v3{RegClass::v3};
static constexpr RegClass v4{RegClass::v4};
static constexpr RegClass v5{RegClass::v5};
static constexpr RegClass v6{RegClass::v6};
static constexpr RegClass v7{RegClass::v7};
static constexpr RegClass v8{RegClass::v8};
static constexpr RegClass v1b{RegClass::v1b};
static constexpr RegClass v2b{RegClass::v2b};
static constexpr RegClass v3b{RegClass::v3b};
static constexpr RegClass v4b{RegClass::v4b};
static constexpr RegClass v6b{RegClass::v6b};
static constexpr RegClass v8b{RegClass::v8b};
401
402 /**
403 * Temp Class
404 * Each temporary virtual register has a
405 * register class (i.e. size and type)
406 * and SSA id.
407 */
408 struct Temp {
TempTemp409 Temp() noexcept : id_(0), reg_class(0) {}
TempTemp410 constexpr Temp(uint32_t id, RegClass cls) noexcept : id_(id), reg_class(uint8_t(cls)) {}
411
idTemp412 constexpr uint32_t id() const noexcept { return id_; }
regClassTemp413 constexpr RegClass regClass() const noexcept { return (RegClass::RC)reg_class; }
414
bytesTemp415 constexpr unsigned bytes() const noexcept { return regClass().bytes(); }
sizeTemp416 constexpr unsigned size() const noexcept { return regClass().size(); }
typeTemp417 constexpr RegType type() const noexcept { return regClass().type(); }
is_linearTemp418 constexpr bool is_linear() const noexcept { return regClass().is_linear(); }
419
420 constexpr bool operator<(Temp other) const noexcept { return id() < other.id(); }
421 constexpr bool operator==(Temp other) const noexcept { return id() == other.id(); }
422 constexpr bool operator!=(Temp other) const noexcept { return id() != other.id(); }
423
424 private:
425 uint32_t id_ : 24;
426 uint32_t reg_class : 8;
427 };
428
429 /**
430 * PhysReg
431 * Represents the physical register for each
432 * Operand and Definition.
433 */
struct PhysReg {
   constexpr PhysReg() = default;
   explicit constexpr PhysReg(unsigned r) : reg_b(r << 2) {}
   /* register index; the low two bits of reg_b select a byte within it */
   constexpr unsigned reg() const { return reg_b >> 2; }
   /* byte offset (0-3) within the register */
   constexpr unsigned byte() const { return reg_b & 0x3; }
   constexpr operator unsigned() const { return reg(); }
   constexpr bool operator==(PhysReg other) const { return reg_b == other.reg_b; }
   constexpr bool operator!=(PhysReg other) const { return reg_b != other.reg_b; }
   constexpr bool operator<(PhysReg other) const { return reg_b < other.reg_b; }
   /* returns a copy advanced by the given (possibly negative) byte count */
   constexpr PhysReg advance(int bytes) const
   {
      PhysReg moved = *this;
      moved.reg_b += bytes;
      return moved;
   }

   /* byte-granular register address: register index * 4 + byte offset */
   uint16_t reg_b = 0;
};
452
/* helper expressions for special registers; the numeric values are
 * presumably the hardware operand encodings for these registers —
 * confirm against the AMD ISA documentation */
static constexpr PhysReg m0{124};
static constexpr PhysReg vcc{106};
static constexpr PhysReg vcc_hi{107};
static constexpr PhysReg tba{108}; /* GFX6-GFX8 */
static constexpr PhysReg tma{110}; /* GFX6-GFX8 */
static constexpr PhysReg ttmp0{112};
static constexpr PhysReg ttmp1{113};
static constexpr PhysReg ttmp2{114};
static constexpr PhysReg ttmp3{115};
static constexpr PhysReg ttmp4{116};
static constexpr PhysReg ttmp5{117};
static constexpr PhysReg ttmp6{118};
static constexpr PhysReg ttmp7{119};
static constexpr PhysReg ttmp8{120};
static constexpr PhysReg ttmp9{121};
static constexpr PhysReg ttmp10{122};
static constexpr PhysReg ttmp11{123};
static constexpr PhysReg sgpr_null{125}; /* GFX10+ */
static constexpr PhysReg exec{126};
static constexpr PhysReg exec_lo{126};
static constexpr PhysReg exec_hi{127};
static constexpr PhysReg vccz{251};
static constexpr PhysReg execz{252};
static constexpr PhysReg scc{253};
478
479 /**
480 * Operand Class
481 * Initially, each Operand refers to either
482 * a temporary virtual register
483 * or to a constant value
484 * Temporary registers get mapped to physical register during RA
485 * Constant values are inlined into the instruction sequence.
486 */
class Operand final {
public:
   /* default: an undefined operand, fixed to the encoding of constant 0
    * (PhysReg 128, see the c*() factories below) */
   constexpr Operand()
       : reg_(PhysReg{128}), isTemp_(false), isFixed_(true), isConstant_(false), isKill_(false),
         isUndef_(true), isFirstKill_(false), constSize(0), isLateKill_(false), is16bit_(false),
         is24bit_(false), signext(false)
   {}

   /* operand referring to a temporary; a zero id yields an undefined operand */
   explicit Operand(Temp r) noexcept
   {
      data_.temp = r;
      if (r.id()) {
         isTemp_ = true;
      } else {
         isUndef_ = true;
         setFixed(PhysReg{128});
      }
   };
   /* temporary operand pre-assigned to a physical register */
   explicit Operand(Temp r, PhysReg reg) noexcept
   {
      assert(r.id()); /* Don't allow fixing an undef to a register */
      data_.temp = r;
      isTemp_ = true;
      setFixed(reg);
   };

   /* 8-bit constant */
   static Operand c8(uint8_t v) noexcept
   {
      /* 8-bit constants are only used for copies and copies from any 8-bit
       * constant can be implemented with a SDWA v_mul_u32_u24. So consider all
       * to be inline constants. */
      Operand op;
      op.control_ = 0;
      op.data_.i = v;
      op.isConstant_ = true;
      op.constSize = 0;
      op.setFixed(PhysReg{0u});
      return op;
   };

   /* 16-bit constant. Inline-representable values are encoded in the fixed
    * register: 128+v for [0..64], 193..208 for [-1..-16], 240..248 for the
    * common FP16 values; everything else becomes a literal (255). */
   static Operand c16(uint16_t v) noexcept
   {
      Operand op;
      op.control_ = 0;
      op.data_.i = v;
      op.isConstant_ = true;
      op.constSize = 1;
      if (v <= 64)
         op.setFixed(PhysReg{128u + v});
      else if (v >= 0xFFF0) /* [-16 .. -1] */
         op.setFixed(PhysReg{(unsigned)(192 - (int16_t)v)});
      else if (v == 0x3800) /* 0.5 */
         op.setFixed(PhysReg{240});
      else if (v == 0xB800) /* -0.5 */
         op.setFixed(PhysReg{241});
      else if (v == 0x3C00) /* 1.0 */
         op.setFixed(PhysReg{242});
      else if (v == 0xBC00) /* -1.0 */
         op.setFixed(PhysReg{243});
      else if (v == 0x4000) /* 2.0 */
         op.setFixed(PhysReg{244});
      else if (v == 0xC000) /* -2.0 */
         op.setFixed(PhysReg{245});
      else if (v == 0x4400) /* 4.0 */
         op.setFixed(PhysReg{246});
      else if (v == 0xC400) /* -4.0 */
         op.setFixed(PhysReg{247});
      else if (v == 0x3118) /* 1/2 PI */
         op.setFixed(PhysReg{248});
      else /* Literal Constant */
         op.setFixed(PhysReg{255});
      return op;
   }

   /* 32-bit constant */
   static Operand c32(uint32_t v) noexcept { return c32_or_c64(v, false); }

   /* 64-bit constant; data_.i keeps the (truncated) 32-bit value, and for
    * FP inline constants the equivalent 32-bit float encoding */
   static Operand c64(uint64_t v) noexcept
   {
      Operand op;
      op.control_ = 0;
      op.isConstant_ = true;
      op.constSize = 3;
      if (v <= 64) {
         op.data_.i = (uint32_t)v;
         op.setFixed(PhysReg{128 + (uint32_t)v});
      } else if (v >= 0xFFFFFFFFFFFFFFF0) { /* [-16 .. -1] */
         op.data_.i = (uint32_t)v;
         op.setFixed(PhysReg{192 - (uint32_t)v});
      } else if (v == 0x3FE0000000000000) { /* 0.5 */
         op.data_.i = 0x3f000000;
         op.setFixed(PhysReg{240});
      } else if (v == 0xBFE0000000000000) { /* -0.5 */
         op.data_.i = 0xbf000000;
         op.setFixed(PhysReg{241});
      } else if (v == 0x3FF0000000000000) { /* 1.0 */
         op.data_.i = 0x3f800000;
         op.setFixed(PhysReg{242});
      } else if (v == 0xBFF0000000000000) { /* -1.0 */
         op.data_.i = 0xbf800000;
         op.setFixed(PhysReg{243});
      } else if (v == 0x4000000000000000) { /* 2.0 */
         op.data_.i = 0x40000000;
         op.setFixed(PhysReg{244});
      } else if (v == 0xC000000000000000) { /* -2.0 */
         op.data_.i = 0xc0000000;
         op.setFixed(PhysReg{245});
      } else if (v == 0x4010000000000000) { /* 4.0 */
         op.data_.i = 0x40800000;
         op.setFixed(PhysReg{246});
      } else if (v == 0xC010000000000000) { /* -4.0 */
         op.data_.i = 0xc0800000;
         op.setFixed(PhysReg{247});
      } else { /* Literal Constant: we don't know if it is a long or double.*/
         op.signext = v >> 63;
         op.data_.i = v & 0xffffffffu;
         op.setFixed(PhysReg{255});
         assert(op.constantValue64() == v &&
                "attempt to create a unrepresentable 64-bit literal constant");
      }
      return op;
   }

   /* 32-bit constant stored as a 32-bit or 64-bit operand */
   static Operand c32_or_c64(uint32_t v, bool is64bit) noexcept
   {
      Operand op;
      op.control_ = 0;
      op.data_.i = v;
      op.isConstant_ = true;
      op.constSize = is64bit ? 3 : 2;
      if (v <= 64)
         op.setFixed(PhysReg{128 + v});
      else if (v >= 0xFFFFFFF0) /* [-16 .. -1] */
         op.setFixed(PhysReg{192 - v});
      else if (v == 0x3f000000) /* 0.5 */
         op.setFixed(PhysReg{240});
      else if (v == 0xbf000000) /* -0.5 */
         op.setFixed(PhysReg{241});
      else if (v == 0x3f800000) /* 1.0 */
         op.setFixed(PhysReg{242});
      else if (v == 0xbf800000) /* -1.0 */
         op.setFixed(PhysReg{243});
      else if (v == 0x40000000) /* 2.0 */
         op.setFixed(PhysReg{244});
      else if (v == 0xc0000000) /* -2.0 */
         op.setFixed(PhysReg{245});
      else if (v == 0x40800000) /* 4.0 */
         op.setFixed(PhysReg{246});
      else if (v == 0xc0800000) /* -4.0 */
         op.setFixed(PhysReg{247});
      else { /* Literal Constant */
         assert(!is64bit && "attempt to create a 64-bit literal constant");
         op.setFixed(PhysReg{255});
      }
      return op;
   }

   /* undefined operand of the given register class */
   explicit Operand(RegClass type) noexcept
   {
      isUndef_ = true;
      data_.temp = Temp(0, type);
      setFixed(PhysReg{128});
   };
   /* fixed-register operand without a temporary (temp id 0) */
   explicit Operand(PhysReg reg, RegClass type) noexcept
   {
      data_.temp = Temp(0, type);
      setFixed(reg);
   }

   /* zero constant of the given size in bytes (1, 2, 4 or 8) */
   static Operand zero(unsigned bytes = 4) noexcept
   {
      if (bytes == 8)
         return Operand::c64(0);
      else if (bytes == 4)
         return Operand::c32(0);
      else if (bytes == 2)
         return Operand::c16(0);
      assert(bytes == 1);
      return Operand::c8(0);
   }

   /* This is useful over the constructors when you want to take a chip class
    * for 1/2 PI or an unknown operand size.
    */
   static Operand get_const(enum chip_class chip, uint64_t val, unsigned bytes)
   {
      if (val == 0x3e22f983 && bytes == 4 && chip >= GFX8) {
         /* 1/2 PI can be an inline constant on GFX8+ */
         Operand op = Operand::c32(val);
         op.setFixed(PhysReg{248});
         return op;
      }

      if (bytes == 8)
         return Operand::c64(val);
      else if (bytes == 4)
         return Operand::c32(val);
      else if (bytes == 2)
         return Operand::c16(val);
      assert(bytes == 1);
      return Operand::c8(val);
   }

   /* Whether c64() (or the smaller factories) can represent the value;
    * zext/sext additionally allow values whose upper half can be recreated
    * by zero- or sign-extending the low 32 bits. */
   static bool is_constant_representable(uint64_t val, unsigned bytes, bool zext = false,
                                         bool sext = false)
   {
      if (bytes <= 4)
         return true;

      if (zext && (val & 0xFFFFFFFF00000000) == 0x0000000000000000)
         return true;
      uint64_t upper33 = val & 0xFFFFFFFF80000000;
      if (sext && (upper33 == 0xFFFFFFFF80000000 || upper33 == 0))
         return true;

      return val >= 0xFFFFFFFFFFFFFFF0 || val <= 64 || /* [-16 .. 64] */
             val == 0x3FE0000000000000 || /* 0.5 */
             val == 0xBFE0000000000000 || /* -0.5 */
             val == 0x3FF0000000000000 || /* 1.0 */
             val == 0xBFF0000000000000 || /* -1.0 */
             val == 0x4000000000000000 || /* 2.0 */
             val == 0xC000000000000000 || /* -2.0 */
             val == 0x4010000000000000 || /* 4.0 */
             val == 0xC010000000000000; /* -4.0 */
   }

   constexpr bool isTemp() const noexcept { return isTemp_; }

   constexpr void setTemp(Temp t) noexcept
   {
      assert(!isConstant_);
      isTemp_ = true;
      data_.temp = t;
   }

   constexpr Temp getTemp() const noexcept { return data_.temp; }

   constexpr uint32_t tempId() const noexcept { return data_.temp.id(); }

   constexpr bool hasRegClass() const noexcept { return isTemp() || isUndefined(); }

   constexpr RegClass regClass() const noexcept { return data_.temp.regClass(); }

   /* constants store their size in constSize as log2(bytes) */
   constexpr unsigned bytes() const noexcept
   {
      if (isConstant())
         return 1 << constSize;
      else
         return data_.temp.bytes();
   }

   /* size in dwords */
   constexpr unsigned size() const noexcept
   {
      if (isConstant())
         return constSize > 2 ? 2 : 1;
      else
         return data_.temp.size();
   }

   constexpr bool isFixed() const noexcept { return isFixed_; }

   constexpr PhysReg physReg() const noexcept { return reg_; }

   constexpr void setFixed(PhysReg reg) noexcept
   {
      isFixed_ = reg != unsigned(-1);
      reg_ = reg;
   }

   constexpr bool isConstant() const noexcept { return isConstant_; }

   /* literal constants are fixed to the literal encoding 255 */
   constexpr bool isLiteral() const noexcept { return isConstant() && reg_ == 255; }

   constexpr bool isUndefined() const noexcept { return isUndef_; }

   constexpr uint32_t constantValue() const noexcept { return data_.i; }

   constexpr bool constantEquals(uint32_t cmp) const noexcept
   {
      return isConstant() && constantValue() == cmp;
   }

   /* Recovers the full 64-bit value of a constant: for 64-bit constants the
    * inline encoding in reg_ is decoded (data_.i only holds 32 bits), for
    * literals the stored sign-extension flag restores the upper half. */
   constexpr uint64_t constantValue64() const noexcept
   {
      if (constSize == 3) {
         if (reg_ <= 192)
            return reg_ - 128;
         else if (reg_ <= 208)
            return 0xFFFFFFFFFFFFFFFF - (reg_ - 193);

         switch (reg_) {
         case 240: return 0x3FE0000000000000;
         case 241: return 0xBFE0000000000000;
         case 242: return 0x3FF0000000000000;
         case 243: return 0xBFF0000000000000;
         case 244: return 0x4000000000000000;
         case 245: return 0xC000000000000000;
         case 246: return 0x4010000000000000;
         case 247: return 0xC010000000000000;
         case 255:
            return (signext && (data_.i & 0x80000000u) ? 0xffffffff00000000ull : 0ull) | data_.i;
         }
         unreachable("invalid register for 64-bit constant");
      } else {
         return data_.i;
      }
   }

   constexpr bool isOfType(RegType type) const noexcept
   {
      return hasRegClass() && regClass().type() == type;
   }

   /* Indicates that the killed operand's live range intersects with the
    * instruction's definitions. Unlike isKill() and isFirstKill(), this is
    * not set by liveness analysis. */
   constexpr void setLateKill(bool flag) noexcept { isLateKill_ = flag; }

   constexpr bool isLateKill() const noexcept { return isLateKill_; }

   constexpr void setKill(bool flag) noexcept
   {
      isKill_ = flag;
      if (!flag)
         setFirstKill(false);
   }

   constexpr bool isKill() const noexcept { return isKill_ || isFirstKill(); }

   /* firstKill implies kill (see isKill()) */
   constexpr void setFirstKill(bool flag) noexcept
   {
      isFirstKill_ = flag;
      if (flag)
         setKill(flag);
   }

   /* When there are multiple operands killing the same temporary,
    * isFirstKill() is only returns true for the first one. */
   constexpr bool isFirstKill() const noexcept { return isFirstKill_; }

   constexpr bool isKillBeforeDef() const noexcept { return isKill() && !isLateKill(); }

   constexpr bool isFirstKillBeforeDef() const noexcept { return isFirstKill() && !isLateKill(); }

   /* equality on kind (temp/constant/undef), value and fixed assignment;
    * note that non-literal constants compare by their fixed register */
   constexpr bool operator==(Operand other) const noexcept
   {
      if (other.size() != size())
         return false;
      if (isFixed() != other.isFixed() || isKillBeforeDef() != other.isKillBeforeDef())
         return false;
      if (isFixed() && other.isFixed() && physReg() != other.physReg())
         return false;
      if (isLiteral())
         return other.isLiteral() && other.constantValue() == constantValue();
      else if (isConstant())
         return other.isConstant() && other.physReg() == physReg();
      else if (isUndefined())
         return other.isUndefined() && other.regClass() == regClass();
      else
         return other.isTemp() && other.getTemp() == getTemp();
   }

   constexpr bool operator!=(Operand other) const noexcept { return !operator==(other); }

   constexpr void set16bit(bool flag) noexcept { is16bit_ = flag; }

   constexpr bool is16bit() const noexcept { return is16bit_; }

   constexpr void set24bit(bool flag) noexcept { is24bit_ = flag; }

   constexpr bool is24bit() const noexcept { return is24bit_; }

private:
   /* either the referenced temporary or the 32-bit constant payload */
   union {
      Temp temp;
      uint32_t i;
      float f;
   } data_ = {Temp(0, s1)};
   PhysReg reg_;
   union {
      struct {
         uint8_t isTemp_ : 1;
         uint8_t isFixed_ : 1;
         uint8_t isConstant_ : 1;
         uint8_t isKill_ : 1;
         uint8_t isUndef_ : 1;
         uint8_t isFirstKill_ : 1;
         uint8_t constSize : 2;
         uint8_t isLateKill_ : 1;
         uint8_t is16bit_ : 1;
         uint8_t is24bit_ : 1;
         uint8_t signext : 1;
      };
      /* can't initialize bit-fields in c++11, so work around using a union */
      uint16_t control_ = 0;
   };
};
888
889 /**
890 * Definition Class
891 * Definitions are the results of Instructions
892 * and refer to temporary virtual registers
893 * which are later mapped to physical registers
894 */
class Definition final {
public:
   constexpr Definition()
       : temp(Temp(0, s1)), reg_(0), isFixed_(0), hasHint_(0), isKill_(0), isPrecise_(0), isNUW_(0),
         isNoCSE_(0)
   {}
   Definition(uint32_t index, RegClass type) noexcept : temp(index, type) {}
   explicit Definition(Temp tmp) noexcept : temp(tmp) {}
   /* fixed-register definition without a temporary (temp id 0) */
   Definition(PhysReg reg, RegClass type) noexcept : temp(Temp(0, type)) { setFixed(reg); }
   Definition(uint32_t tmpId, PhysReg reg, RegClass type) noexcept : temp(Temp(tmpId, type))
   {
      setFixed(reg);
   }

   /* temp id 0 means this definition defines no temporary */
   constexpr bool isTemp() const noexcept { return tempId() > 0; }

   constexpr Temp getTemp() const noexcept { return temp; }

   constexpr uint32_t tempId() const noexcept { return temp.id(); }

   constexpr void setTemp(Temp t) noexcept { temp = t; }

   void swapTemp(Definition& other) noexcept { std::swap(temp, other.temp); }

   constexpr RegClass regClass() const noexcept { return temp.regClass(); }

   constexpr unsigned bytes() const noexcept { return temp.bytes(); }

   constexpr unsigned size() const noexcept { return temp.size(); }

   constexpr bool isFixed() const noexcept { return isFixed_; }

   constexpr PhysReg physReg() const noexcept { return reg_; }

   constexpr void setFixed(PhysReg reg) noexcept
   {
      isFixed_ = 1;
      reg_ = reg;
   }

   /* register-allocation hint; note it shares reg_ with the fixed
    * assignment, so a later setFixed() overwrites the hint */
   constexpr void setHint(PhysReg reg) noexcept
   {
      hasHint_ = 1;
      reg_ = reg;
   }

   constexpr bool hasHint() const noexcept { return hasHint_; }

   /* presumably set by liveness analysis, like Operand::setKill —
    * NOTE(review): the setter's caller is not visible in this header */
   constexpr void setKill(bool flag) noexcept { isKill_ = flag; }

   constexpr bool isKill() const noexcept { return isKill_; }

   constexpr void setPrecise(bool precise) noexcept { isPrecise_ = precise; }

   constexpr bool isPrecise() const noexcept { return isPrecise_; }

   /* No Unsigned Wrap */
   constexpr void setNUW(bool nuw) noexcept { isNUW_ = nuw; }

   constexpr bool isNUW() const noexcept { return isNUW_; }

   /* exclude this definition from common-subexpression elimination */
   constexpr void setNoCSE(bool noCSE) noexcept { isNoCSE_ = noCSE; }

   constexpr bool isNoCSE() const noexcept { return isNoCSE_; }

private:
   Temp temp = Temp(0, s1);
   PhysReg reg_;
   union {
      struct {
         uint8_t isFixed_ : 1;
         uint8_t hasHint_ : 1;
         uint8_t isKill_ : 1;
         uint8_t isPrecise_ : 1;
         uint8_t isNUW_ : 1;
         uint8_t isNoCSE_ : 1;
      };
      /* can't initialize bit-fields in c++11, so work around using a union */
      uint8_t control_ = 0;
   };
};
976
977 struct Block;
978 struct Instruction;
979 struct Pseudo_instruction;
980 struct SOP1_instruction;
981 struct SOP2_instruction;
982 struct SOPK_instruction;
983 struct SOPP_instruction;
984 struct SOPC_instruction;
985 struct SMEM_instruction;
986 struct DS_instruction;
987 struct MTBUF_instruction;
988 struct MUBUF_instruction;
989 struct MIMG_instruction;
990 struct Export_instruction;
991 struct FLAT_instruction;
992 struct Pseudo_branch_instruction;
993 struct Pseudo_barrier_instruction;
994 struct Pseudo_reduction_instruction;
995 struct VOP3P_instruction;
996 struct VOP1_instruction;
997 struct VOP2_instruction;
998 struct VOPC_instruction;
999 struct VOP3_instruction;
1000 struct Interp_instruction;
1001 struct DPP16_instruction;
1002 struct DPP8_instruction;
1003 struct SDWA_instruction;
1004
/* Base class of all ACO instructions.
 *
 * Concrete instructions are allocated by create_instruction() with the
 * Operand and Definition arrays placed directly after the struct, so the
 * spans below reference trailing storage within the same allocation.
 * The typed accessors (sop1(), smem(), ...) are checked downcasts to the
 * per-format subclasses declared above.
 */
struct Instruction {
   aco_opcode opcode;
   Format format;
   /* Scratch flags; meaning depends on the pass currently running. */
   uint32_t pass_flags;

   aco::span<Operand> operands;
   aco::span<Definition> definitions;

   constexpr bool usesModifiers() const noexcept;

   /* Whether any operand is fixed to the exec register. */
   constexpr bool reads_exec() const noexcept
   {
      for (const Operand& op : operands) {
         if (op.isFixed() && op.physReg() == exec)
            return true;
      }
      return false;
   }

   /* Checked downcasts. Non-combinable formats are compared with '=='. */
   Pseudo_instruction& pseudo() noexcept
   {
      assert(isPseudo());
      return *(Pseudo_instruction*)this;
   }
   const Pseudo_instruction& pseudo() const noexcept
   {
      assert(isPseudo());
      return *(Pseudo_instruction*)this;
   }
   constexpr bool isPseudo() const noexcept { return format == Format::PSEUDO; }
   SOP1_instruction& sop1() noexcept
   {
      assert(isSOP1());
      return *(SOP1_instruction*)this;
   }
   const SOP1_instruction& sop1() const noexcept
   {
      assert(isSOP1());
      return *(SOP1_instruction*)this;
   }
   constexpr bool isSOP1() const noexcept { return format == Format::SOP1; }
   SOP2_instruction& sop2() noexcept
   {
      assert(isSOP2());
      return *(SOP2_instruction*)this;
   }
   const SOP2_instruction& sop2() const noexcept
   {
      assert(isSOP2());
      return *(SOP2_instruction*)this;
   }
   constexpr bool isSOP2() const noexcept { return format == Format::SOP2; }
   SOPK_instruction& sopk() noexcept
   {
      assert(isSOPK());
      return *(SOPK_instruction*)this;
   }
   const SOPK_instruction& sopk() const noexcept
   {
      assert(isSOPK());
      return *(SOPK_instruction*)this;
   }
   constexpr bool isSOPK() const noexcept { return format == Format::SOPK; }
   SOPP_instruction& sopp() noexcept
   {
      assert(isSOPP());
      return *(SOPP_instruction*)this;
   }
   const SOPP_instruction& sopp() const noexcept
   {
      assert(isSOPP());
      return *(SOPP_instruction*)this;
   }
   constexpr bool isSOPP() const noexcept { return format == Format::SOPP; }
   SOPC_instruction& sopc() noexcept
   {
      assert(isSOPC());
      return *(SOPC_instruction*)this;
   }
   const SOPC_instruction& sopc() const noexcept
   {
      assert(isSOPC());
      return *(SOPC_instruction*)this;
   }
   constexpr bool isSOPC() const noexcept { return format == Format::SOPC; }
   SMEM_instruction& smem() noexcept
   {
      assert(isSMEM());
      return *(SMEM_instruction*)this;
   }
   const SMEM_instruction& smem() const noexcept
   {
      assert(isSMEM());
      return *(SMEM_instruction*)this;
   }
   constexpr bool isSMEM() const noexcept { return format == Format::SMEM; }
   DS_instruction& ds() noexcept
   {
      assert(isDS());
      return *(DS_instruction*)this;
   }
   const DS_instruction& ds() const noexcept
   {
      assert(isDS());
      return *(DS_instruction*)this;
   }
   constexpr bool isDS() const noexcept { return format == Format::DS; }
   MTBUF_instruction& mtbuf() noexcept
   {
      assert(isMTBUF());
      return *(MTBUF_instruction*)this;
   }
   const MTBUF_instruction& mtbuf() const noexcept
   {
      assert(isMTBUF());
      return *(MTBUF_instruction*)this;
   }
   constexpr bool isMTBUF() const noexcept { return format == Format::MTBUF; }
   MUBUF_instruction& mubuf() noexcept
   {
      assert(isMUBUF());
      return *(MUBUF_instruction*)this;
   }
   const MUBUF_instruction& mubuf() const noexcept
   {
      assert(isMUBUF());
      return *(MUBUF_instruction*)this;
   }
   constexpr bool isMUBUF() const noexcept { return format == Format::MUBUF; }
   MIMG_instruction& mimg() noexcept
   {
      assert(isMIMG());
      return *(MIMG_instruction*)this;
   }
   const MIMG_instruction& mimg() const noexcept
   {
      assert(isMIMG());
      return *(MIMG_instruction*)this;
   }
   constexpr bool isMIMG() const noexcept { return format == Format::MIMG; }
   Export_instruction& exp() noexcept
   {
      assert(isEXP());
      return *(Export_instruction*)this;
   }
   const Export_instruction& exp() const noexcept
   {
      assert(isEXP());
      return *(Export_instruction*)this;
   }
   constexpr bool isEXP() const noexcept { return format == Format::EXP; }
   /* FLAT, GLOBAL and SCRATCH all use the FLAT_instruction layout. */
   FLAT_instruction& flat() noexcept
   {
      assert(isFlat());
      return *(FLAT_instruction*)this;
   }
   const FLAT_instruction& flat() const noexcept
   {
      assert(isFlat());
      return *(FLAT_instruction*)this;
   }
   constexpr bool isFlat() const noexcept { return format == Format::FLAT; }
   FLAT_instruction& global() noexcept
   {
      assert(isGlobal());
      return *(FLAT_instruction*)this;
   }
   const FLAT_instruction& global() const noexcept
   {
      assert(isGlobal());
      return *(FLAT_instruction*)this;
   }
   constexpr bool isGlobal() const noexcept { return format == Format::GLOBAL; }
   FLAT_instruction& scratch() noexcept
   {
      assert(isScratch());
      return *(FLAT_instruction*)this;
   }
   const FLAT_instruction& scratch() const noexcept
   {
      assert(isScratch());
      return *(FLAT_instruction*)this;
   }
   constexpr bool isScratch() const noexcept { return format == Format::SCRATCH; }
   Pseudo_branch_instruction& branch() noexcept
   {
      assert(isBranch());
      return *(Pseudo_branch_instruction*)this;
   }
   const Pseudo_branch_instruction& branch() const noexcept
   {
      assert(isBranch());
      return *(Pseudo_branch_instruction*)this;
   }
   constexpr bool isBranch() const noexcept { return format == Format::PSEUDO_BRANCH; }
   Pseudo_barrier_instruction& barrier() noexcept
   {
      assert(isBarrier());
      return *(Pseudo_barrier_instruction*)this;
   }
   const Pseudo_barrier_instruction& barrier() const noexcept
   {
      assert(isBarrier());
      return *(Pseudo_barrier_instruction*)this;
   }
   constexpr bool isBarrier() const noexcept { return format == Format::PSEUDO_BARRIER; }
   Pseudo_reduction_instruction& reduction() noexcept
   {
      assert(isReduction());
      return *(Pseudo_reduction_instruction*)this;
   }
   const Pseudo_reduction_instruction& reduction() const noexcept
   {
      assert(isReduction());
      return *(Pseudo_reduction_instruction*)this;
   }
   constexpr bool isReduction() const noexcept { return format == Format::PSEUDO_REDUCTION; }
   VOP3P_instruction& vop3p() noexcept
   {
      assert(isVOP3P());
      return *(VOP3P_instruction*)this;
   }
   const VOP3P_instruction& vop3p() const noexcept
   {
      assert(isVOP3P());
      return *(VOP3P_instruction*)this;
   }
   constexpr bool isVOP3P() const noexcept { return format == Format::VOP3P; }
   /* VALU formats can be combined with VOP3/DPP/SDWA encodings (see the
    * Format documentation at the top of this file), so the is*() tests
    * below use a bitwise AND instead of equality. */
   VOP1_instruction& vop1() noexcept
   {
      assert(isVOP1());
      return *(VOP1_instruction*)this;
   }
   const VOP1_instruction& vop1() const noexcept
   {
      assert(isVOP1());
      return *(VOP1_instruction*)this;
   }
   constexpr bool isVOP1() const noexcept { return (uint16_t)format & (uint16_t)Format::VOP1; }
   VOP2_instruction& vop2() noexcept
   {
      assert(isVOP2());
      return *(VOP2_instruction*)this;
   }
   const VOP2_instruction& vop2() const noexcept
   {
      assert(isVOP2());
      return *(VOP2_instruction*)this;
   }
   constexpr bool isVOP2() const noexcept { return (uint16_t)format & (uint16_t)Format::VOP2; }
   VOPC_instruction& vopc() noexcept
   {
      assert(isVOPC());
      return *(VOPC_instruction*)this;
   }
   const VOPC_instruction& vopc() const noexcept
   {
      assert(isVOPC());
      return *(VOPC_instruction*)this;
   }
   constexpr bool isVOPC() const noexcept { return (uint16_t)format & (uint16_t)Format::VOPC; }
   VOP3_instruction& vop3() noexcept
   {
      assert(isVOP3());
      return *(VOP3_instruction*)this;
   }
   const VOP3_instruction& vop3() const noexcept
   {
      assert(isVOP3());
      return *(VOP3_instruction*)this;
   }
   constexpr bool isVOP3() const noexcept { return (uint16_t)format & (uint16_t)Format::VOP3; }
   Interp_instruction& vintrp() noexcept
   {
      assert(isVINTRP());
      return *(Interp_instruction*)this;
   }
   const Interp_instruction& vintrp() const noexcept
   {
      assert(isVINTRP());
      return *(Interp_instruction*)this;
   }
   constexpr bool isVINTRP() const noexcept { return (uint16_t)format & (uint16_t)Format::VINTRP; }
   DPP16_instruction& dpp16() noexcept
   {
      assert(isDPP16());
      return *(DPP16_instruction*)this;
   }
   const DPP16_instruction& dpp16() const noexcept
   {
      assert(isDPP16());
      return *(DPP16_instruction*)this;
   }
   constexpr bool isDPP16() const noexcept { return (uint16_t)format & (uint16_t)Format::DPP16; }
   DPP8_instruction& dpp8() noexcept
   {
      assert(isDPP8());
      return *(DPP8_instruction*)this;
   }
   const DPP8_instruction& dpp8() const noexcept
   {
      assert(isDPP8());
      return *(DPP8_instruction*)this;
   }
   constexpr bool isDPP8() const noexcept { return (uint16_t)format & (uint16_t)Format::DPP8; }
   constexpr bool isDPP() const noexcept { return isDPP16() || isDPP8(); }
   SDWA_instruction& sdwa() noexcept
   {
      assert(isSDWA());
      return *(SDWA_instruction*)this;
   }
   const SDWA_instruction& sdwa() const noexcept
   {
      assert(isSDWA());
      return *(SDWA_instruction*)this;
   }
   constexpr bool isSDWA() const noexcept { return (uint16_t)format & (uint16_t)Format::SDWA; }

   /* Unchecked cast: caller must know the format is FLAT, GLOBAL or SCRATCH
    * (all three share the FLAT_instruction layout). */
   FLAT_instruction& flatlike() { return *(FLAT_instruction*)this; }

   const FLAT_instruction& flatlike() const { return *(FLAT_instruction*)this; }

   constexpr bool isFlatLike() const noexcept { return isFlat() || isGlobal() || isScratch(); }

   /* Vector ALU: any of the combinable VALU encodings. */
   constexpr bool isVALU() const noexcept
   {
      return isVOP1() || isVOP2() || isVOPC() || isVOP3() || isVOP3P();
   }

   /* Scalar ALU. */
   constexpr bool isSALU() const noexcept
   {
      return isSOP1() || isSOP2() || isSOPC() || isSOPK() || isSOPP();
   }

   /* Vector memory (buffer/image); FLAT-like instructions are separate. */
   constexpr bool isVMEM() const noexcept { return isMTBUF() || isMUBUF() || isMIMG(); }
};
static_assert(sizeof(Instruction) == 16, "Unexpected padding");
1342
/* Scalar ALU with a 16-bit immediate encoded in the instruction word. */
struct SOPK_instruction : public Instruction {
   uint16_t imm;
   uint16_t padding; /* explicit padding to keep the struct size fixed */
};
static_assert(sizeof(SOPK_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1348
/* Scalar program-control instructions (branches, waitcnt, ...). */
struct SOPP_instruction : public Instruction {
   uint32_t imm;
   int block; /* branch target block index; see Pseudo_branch_instruction for pseudo branches */
};
static_assert(sizeof(SOPP_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1354
/* Scalar compare; no fields beyond the base instruction. */
struct SOPC_instruction : public Instruction {};
static_assert(sizeof(SOPC_instruction) == sizeof(Instruction) + 0, "Unexpected padding");
1357
/* Scalar ALU with one input; no fields beyond the base instruction. */
struct SOP1_instruction : public Instruction {};
static_assert(sizeof(SOP1_instruction) == sizeof(Instruction) + 0, "Unexpected padding");
1360
/* Scalar ALU with two inputs; no fields beyond the base instruction. */
struct SOP2_instruction : public Instruction {};
static_assert(sizeof(SOP2_instruction) == sizeof(Instruction) + 0, "Unexpected padding");
1363
1364 /**
1365 * Scalar Memory Format:
1366 * For s_(buffer_)load_dword*:
1367 * Operand(0): SBASE - SGPR-pair which provides base address
1368 * Operand(1): Offset - immediate (un)signed offset or SGPR
1369 * Operand(2) / Definition(0): SDATA - SGPR for read / write result
1370 * Operand(n-1): SOffset - SGPR offset (Vega only)
1371 *
1372 * Having no operands is also valid for instructions such as s_dcache_inv.
1373 *
1374 */
/* Scalar memory load/store; operand layout is documented in the comment above. */
struct SMEM_instruction : public Instruction {
   memory_sync_info sync;
   bool glc : 1; /* VI+: globally coherent */
   bool dlc : 1; /* NAVI: device level coherent */
   bool nv : 1;  /* VEGA only: Non-volatile */
   bool disable_wqm : 1;
   bool prevent_overflow : 1; /* avoid overflow when combining additions */
   uint8_t padding : 3;
};
static_assert(sizeof(SMEM_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1385
/* Vector ALU with one input; no fields beyond the base instruction. */
struct VOP1_instruction : public Instruction {};
static_assert(sizeof(VOP1_instruction) == sizeof(Instruction) + 0, "Unexpected padding");
1388
/* Vector ALU with two inputs; no fields beyond the base instruction. */
struct VOP2_instruction : public Instruction {};
static_assert(sizeof(VOP2_instruction) == sizeof(Instruction) + 0, "Unexpected padding");
1391
/* Vector compare; no fields beyond the base instruction. */
struct VOPC_instruction : public Instruction {};
static_assert(sizeof(VOPC_instruction) == sizeof(Instruction) + 0, "Unexpected padding");
1394
/* Vector ALU in the VOP3 encoding, which adds source/destination modifiers. */
struct VOP3_instruction : public Instruction {
   bool abs[3];        /* per-source absolute-value modifier */
   bool neg[3];        /* per-source negate modifier */
   uint8_t opsel : 4;  /* 16-bit operand selection bits */
   uint8_t omod : 2;   /* output modifier */
   bool clamp : 1;
   uint8_t padding0 : 1;
   uint8_t padding1;
};
static_assert(sizeof(VOP3_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1405
/* Packed vector ALU (VOP3P), operating on both 16-bit halves of its sources. */
struct VOP3P_instruction : public Instruction {
   bool neg_lo[3];
   bool neg_hi[3]; /* abs modifier, for v_mad_mix/v_fma_mix */
   uint8_t opsel_lo : 3;
   uint8_t opsel_hi : 3;
   bool clamp : 1;
   uint8_t padding0 : 1;
   uint8_t padding1;
};
static_assert(sizeof(VOP3P_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1416
1417 /**
1418 * Data Parallel Primitives Format:
1419 * This format can be used for VOP1, VOP2 or VOPC instructions.
1420 * The swizzle applies to the src0 operand.
1421 *
1422 */
/* DPP with a 16-lane control pattern; see the format comment above. */
struct DPP16_instruction : public Instruction {
   bool abs[2];
   bool neg[2];
   uint16_t dpp_ctrl; /* hardware swizzle/control pattern for src0 */
   uint8_t row_mask : 4;
   uint8_t bank_mask : 4;
   bool bound_ctrl : 1;
   uint8_t padding : 7;
};
static_assert(sizeof(DPP16_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1433
/* DPP with an explicit per-lane selection over groups of 8 lanes. */
struct DPP8_instruction : public Instruction {
   uint8_t lane_sel[8]; /* source lane for each of the 8 lanes in a group */
};
static_assert(sizeof(DPP8_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1438
/**
 * Compact description of a sub-dword selection: a byte size (1/2/4), a byte
 * offset within the dword and a sign-extension flag, packed into one byte.
 * Used to describe SDWA-style source/destination selections.
 */
struct SubdwordSel {
   enum sdwa_sel : uint8_t {
      ubyte = 0x4,
      uword = 0x8,
      dword = 0x10,
      sext = 0x20,
      sbyte = ubyte | sext,
      sword = uword | sext,

      ubyte0 = ubyte,
      ubyte1 = ubyte | 1,
      ubyte2 = ubyte | 2,
      ubyte3 = ubyte | 3,
      sbyte0 = sbyte,
      sbyte1 = sbyte | 1,
      sbyte2 = sbyte | 2,
      sbyte3 = sbyte | 3,
      uword0 = uword,
      uword1 = uword | 2,
      sword0 = sword,
      sword1 = sword | 2,
   };

   SubdwordSel() : sel(sdwa_sel(0)) {}
   constexpr SubdwordSel(sdwa_sel sel_) : sel(sel_) {}
   /* Assemble a selection from its components. */
   constexpr SubdwordSel(unsigned size, unsigned offset, bool sign_extend)
       : sel(sdwa_sel((sign_extend ? sext : 0) | size << 2 | offset))
   {}
   constexpr operator sdwa_sel() const { return sel; }
   /* False only for the default-constructed (empty) selection. */
   explicit operator bool() const { return sel != 0; }

   constexpr unsigned size() const { return (sel >> 2) & 0x7; }      /* in bytes */
   constexpr unsigned offset() const { return sel & 0x3; }           /* in bytes */
   constexpr bool sign_extend() const { return sel & sext; }
   /* Translate into the hardware src_sel/dst_sel field value for a register
    * whose contents start at reg_byte_offset. */
   constexpr unsigned to_sdwa_sel(unsigned reg_byte_offset) const
   {
      const unsigned byte = reg_byte_offset + offset();
      switch (size()) {
      case 1: return byte;            /* BYTE_0..BYTE_3 */
      case 2: return 4 + (byte >> 1); /* WORD_0..WORD_1 */
      default: return 6;              /* DWORD */
      }
   }

private:
   sdwa_sel sel;
};
1487
1488 /**
1489 * Sub-Dword Addressing Format:
1490 * This format can be used for VOP1, VOP2 or VOPC instructions.
1491 *
1492 * omod and SGPR/constant operands are only available on GFX9+. For VOPC,
1493 * the definition doesn't have to be VCC on GFX9+.
1494 *
1495 */
/* Sub-dword addressing; see the format comment above for GFX-level caveats. */
struct SDWA_instruction : public Instruction {
   /* these destination modifiers aren't available with VOPC except for
    * clamp on GFX8 */
   SubdwordSel sel[2];  /* per-source selection */
   SubdwordSel dst_sel; /* destination selection */
   bool neg[2];
   bool abs[2];
   bool clamp : 1;
   uint8_t omod : 2; /* GFX9+ */
   uint8_t padding : 5;
};
static_assert(sizeof(SDWA_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1508
/* VINTRP attribute interpolation. */
struct Interp_instruction : public Instruction {
   uint8_t attribute; /* which input attribute to interpolate */
   uint8_t component; /* which component of the attribute */
   uint16_t padding;
};
static_assert(sizeof(Interp_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1515
1516 /**
1517 * Local and Global Data Sharing instructions
1518 * Operand(0): ADDR - VGPR which supplies the address.
1519 * Operand(1): DATA0 - First data VGPR.
1520 * Operand(2): DATA1 - Second data VGPR.
1521 * Operand(n-1): M0 - LDS size.
1522 * Definition(0): VDST - Destination VGPR when results returned to VGPRs.
1523 *
1524 */
/* LDS/GDS data-share access; operand layout is documented in the comment above. */
struct DS_instruction : public Instruction {
   memory_sync_info sync;
   bool gds; /* access GDS rather than LDS */
   int16_t offset0;
   int8_t offset1;
   uint8_t padding;
};
static_assert(sizeof(DS_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1533
1534 /**
1535 * Vector Memory Untyped-buffer Instructions
1536 * Operand(0): SRSRC - Specifies which SGPR supplies T# (resource constant)
1537 * Operand(1): VADDR - Address source. Can carry an index and/or offset
1538 * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant)
1539 * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data
1540 *
1541 */
/* Untyped buffer access; operand layout is documented in the comment above. */
struct MUBUF_instruction : public Instruction {
   memory_sync_info sync;
   bool offen : 1;           /* Supply an offset from VGPR (VADDR) */
   bool idxen : 1;           /* Supply an index from VGPR (VADDR) */
   bool addr64 : 1;          /* SI, CIK: Address size is 64-bit */
   bool glc : 1;             /* globally coherent */
   bool dlc : 1;             /* NAVI: device level coherent */
   bool slc : 1;             /* system level coherent */
   bool tfe : 1;             /* texture fail enable */
   bool lds : 1;             /* Return read-data to LDS instead of VGPRs */
   uint16_t disable_wqm : 1; /* Require an exec mask without helper invocations */
   uint16_t offset : 12;     /* Unsigned byte offset - 12 bit */
   uint16_t swizzled : 1;
   uint16_t padding0 : 2;
   uint16_t vtx_binding : 6; /* 0 if this is not a vertex attribute load */
   uint16_t padding1 : 10;
};
static_assert(sizeof(MUBUF_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1560
1561 /**
1562 * Vector Memory Typed-buffer Instructions
1563 * Operand(0): SRSRC - Specifies which SGPR supplies T# (resource constant)
1564 * Operand(1): VADDR - Address source. Can carry an index and/or offset
1565 * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant)
1566 * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data
1567 *
1568 */
/* Typed buffer access; operand layout is documented in the comment above. */
struct MTBUF_instruction : public Instruction {
   memory_sync_info sync;
   uint8_t dfmt : 4;         /* Data Format of data in memory buffer */
   uint8_t nfmt : 3;         /* Numeric format of data in memory */
   bool offen : 1;           /* Supply an offset from VGPR (VADDR) */
   uint16_t idxen : 1;       /* Supply an index from VGPR (VADDR) */
   uint16_t glc : 1;         /* globally coherent */
   uint16_t dlc : 1;         /* NAVI: device level coherent */
   uint16_t slc : 1;         /* system level coherent */
   uint16_t tfe : 1;         /* texture fail enable */
   uint16_t disable_wqm : 1; /* Require an exec mask without helper invocations */
   uint16_t vtx_binding : 6; /* 0 if this is not a vertex attribute load */
   uint16_t padding : 4;
   uint16_t offset;          /* Unsigned byte offset - 12 bit */
};
static_assert(sizeof(MTBUF_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1585
1586 /**
1587 * Vector Memory Image Instructions
 * Operand(0): SRSRC - Scalar GPR that specifies the resource constant.
1589 * Operand(1): SSAMP - Scalar GPR that specifies sampler constant.
1590 * Operand(2): VDATA - Vector GPR for write data or zero if TFE/LWE=1.
1591 * Operand(3): VADDR - Address source. Can carry an offset or an index.
1592 * Definition(0): VDATA - Vector GPR for read result.
1593 *
1594 */
/* Image memory access; operand layout is documented in the comment above. */
struct MIMG_instruction : public Instruction {
   memory_sync_info sync;
   uint8_t dmask;        /* Data VGPR enable mask */
   uint8_t dim : 3;      /* NAVI: dimensionality */
   bool unrm : 1;        /* Force address to be un-normalized */
   bool dlc : 1;         /* NAVI: device level coherent */
   bool glc : 1;         /* globally coherent */
   bool slc : 1;         /* system level coherent */
   bool tfe : 1;         /* texture fail enable */
   bool da : 1;          /* declare an array */
   bool lwe : 1;         /* LOD warning enable */
   bool r128 : 1;        /* NAVI: Texture resource size */
   bool a16 : 1;         /* VEGA, NAVI: Address components are 16-bits */
   bool d16 : 1;         /* Convert 32-bit data to 16-bit data */
   bool disable_wqm : 1; /* Require an exec mask without helper invocations */
   uint8_t padding0 : 2;
   uint8_t padding1;
   uint8_t padding2;
};
static_assert(sizeof(MIMG_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1615
1616 /**
1617 * Flat/Scratch/Global Instructions
1618 * Operand(0): ADDR
1619 * Operand(1): SADDR
1620 * Operand(2) / Definition(0): DATA/VDST
1621 *
1622 */
/* Shared layout for FLAT, GLOBAL and SCRATCH formats (see Instruction::flatlike()). */
struct FLAT_instruction : public Instruction {
   memory_sync_info sync;
   bool slc : 1;         /* system level coherent */
   bool glc : 1;         /* globally coherent */
   bool dlc : 1;         /* NAVI: device level coherent */
   bool lds : 1;
   bool nv : 1;
   bool disable_wqm : 1; /* Require an exec mask without helper invocations */
   uint8_t padding0 : 2;
   uint16_t offset;      /* Vega/Navi only */
   uint16_t padding1;
};
static_assert(sizeof(FLAT_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1636
/* Export of shader results (color, depth, position, parameters). */
struct Export_instruction : public Instruction {
   uint8_t enabled_mask; /* which components of the operands are written */
   uint8_t dest;         /* hardware export target */
   bool compressed : 1;  /* operands are packed as 16-bit pairs */
   bool done : 1;        /* this is the final export of the target */
   bool valid_mask : 1;
   uint8_t padding0 : 5;
   uint8_t padding1;
};
static_assert(sizeof(Export_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1647
/* Generic pseudo instruction (copies, phis, create_vector, ...), lowered
 * to real instructions before assembly. */
struct Pseudo_instruction : public Instruction {
   PhysReg scratch_sgpr; /* might not be valid if it's not needed */
   bool tmp_in_scc;      /* NOTE(review): presumably set when lowering clobbers scc — confirm */
   uint8_t padding;
};
static_assert(sizeof(Pseudo_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1654
/* Branch pseudo instruction, lowered to a real jump late in compilation. */
struct Pseudo_branch_instruction : public Instruction {
   /* target[0] is the block index of the branch target.
    * For conditional branches, target[1] contains the fall-through alternative.
    * A value of 0 means the target has not been initialized (BB0 cannot be a branch target).
    */
   uint32_t target[2];
};
static_assert(sizeof(Pseudo_branch_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1663
/* Memory/execution barrier pseudo instruction. */
struct Pseudo_barrier_instruction : public Instruction {
   memory_sync_info sync;
   sync_scope exec_scope; /* scope over which execution is synchronized */
};
static_assert(sizeof(Pseudo_barrier_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1669
/* Reduction operations for Pseudo_reduction_instruction; the numeric suffix
 * is the operand bit-width (float ops exist only in 16/32/64-bit variants). */
enum ReduceOp : uint16_t {
   // clang-format off
   iadd8, iadd16, iadd32, iadd64,
   imul8, imul16, imul32, imul64,
   fadd16, fadd32, fadd64,
   fmul16, fmul32, fmul64,
   imin8, imin16, imin32, imin64,
   imax8, imax16, imax32, imax64,
   umin8, umin16, umin32, umin64,
   umax8, umax16, umax32, umax64,
   fmin16, fmin32, fmin64,
   fmax16, fmax32, fmax64,
   iand8, iand16, iand32, iand64,
   ior8, ior16, ior32, ior64,
   ixor8, ixor16, ixor32, ixor64,
   num_reduce_ops,
   // clang-format on
};
1688
1689 /**
1690 * Subgroup Reduction Instructions, everything except for the data to be
1691 * reduced and the result as inserted by setup_reduce_temp().
1692 * Operand(0): data to be reduced
1693 * Operand(1): reduce temporary
1694 * Operand(2): vector temporary
1695 * Definition(0): result
1696 * Definition(1): scalar temporary
1697 * Definition(2): scalar identity temporary (not used to store identity on GFX10)
1698 * Definition(3): scc clobber
1699 * Definition(4): vcc clobber
1700 *
1701 */
/* Subgroup reduction pseudo instruction; operand/definition layout is
 * documented in the comment above. */
struct Pseudo_reduction_instruction : public Instruction {
   ReduceOp reduce_op;
   uint16_t cluster_size; // must be 0 for scans
};
static_assert(sizeof(Pseudo_reduction_instruction) == sizeof(Instruction) + 4,
              "Unexpected padding");
1708
/* Deleter for aco_ptr: instructions are allocated with calloc() in
 * create_instruction(), so they must be released with free() rather than
 * operator delete, and no destructor is run. */
struct instr_deleter_functor {
   /* const + noexcept: std::unique_ptr invokes the deleter from its
    * destructor, which must not throw. */
   void operator()(void* p) const noexcept { free(p); }
};

template <typename T> using aco_ptr = std::unique_ptr<T, instr_deleter_functor>;
1714
/**
 * Allocates a zero-initialized instruction of type T together with trailing
 * storage for its operands and definitions in a single calloc() allocation.
 * The result must be freed with free() (see instr_deleter_functor / aco_ptr).
 */
template <typename T>
T*
create_instruction(aco_opcode opcode, Format format, uint32_t num_operands,
                   uint32_t num_definitions)
{
   /* One allocation: the T struct, then the Operand array, then the
    * Definition array. */
   std::size_t size =
      sizeof(T) + num_operands * sizeof(Operand) + num_definitions * sizeof(Definition);
   char* data = (char*)calloc(1, size);
   T* inst = (T*)data;

   inst->opcode = opcode;
   inst->format = format;

   /* The spans are constructed from 16-bit offsets measured from the span
    * member's own address to its array in the trailing storage (see
    * aco::span in aco_util.h); operands start right after the struct and
    * definitions right after the operands. */
   uint16_t operands_offset = data + sizeof(T) - (char*)&inst->operands;
   inst->operands = aco::span<Operand>(operands_offset, num_operands);
   uint16_t definitions_offset = (char*)inst->operands.end() - (char*)&inst->definitions;
   inst->definitions = aco::span<Definition>(definitions_offset, num_definitions);

   return inst;
}
1735
/* Whether the instruction uses any input/output modifiers (neg/abs/opsel/
 * clamp/omod or a DPP/SDWA encoding), which restricts many optimizations. */
constexpr bool
Instruction::usesModifiers() const noexcept
{
   if (isDPP() || isSDWA())
      return true;

   if (isVOP3P()) {
      const VOP3P_instruction& vop3p = this->vop3p();
      for (unsigned i = 0; i < operands.size(); i++) {
         if (vop3p.neg_lo[i] || vop3p.neg_hi[i])
            return true;

         /* opsel_hi must be 1 to not be considered a modifier - even for constants */
         if (!(vop3p.opsel_hi & (1 << i)))
            return true;
      }
      return vop3p.opsel_lo || vop3p.clamp;
   } else if (isVOP3()) {
      const VOP3_instruction& vop3 = this->vop3();
      for (unsigned i = 0; i < operands.size(); i++) {
         if (vop3.abs[i] || vop3.neg[i])
            return true;
      }
      return vop3.opsel || vop3.clamp || vop3.omod;
   }
   return false;
}
1763
/* Whether the instruction is a phi (logical or linear). */
constexpr bool
is_phi(Instruction* instr)
{
   return instr->opcode == aco_opcode::p_phi || instr->opcode == aco_opcode::p_linear_phi;
}
1769
/* Convenience overload of is_phi() for owned instruction pointers. */
static inline bool
is_phi(aco_ptr<Instruction>& instr)
{
   return is_phi(instr.get());
}
1775
/* Helpers implemented in other ACO translation units. */
memory_sync_info get_sync_info(const Instruction* instr);

bool is_dead(const std::vector<uint16_t>& uses, Instruction* instr);

/* Queries about which encodings/modifiers an opcode supports on a given chip. */
bool can_use_opsel(chip_class chip, aco_opcode op, int idx);
bool instr_is_16bit(chip_class chip, aco_opcode op);
bool can_use_SDWA(chip_class chip, const aco_ptr<Instruction>& instr, bool pre_ra);
bool can_use_DPP(const aco_ptr<Instruction>& instr, bool pre_ra, bool dpp8);
/* updates "instr" and returns the old instruction (or NULL if no update was needed) */
aco_ptr<Instruction> convert_to_SDWA(chip_class chip, aco_ptr<Instruction>& instr);
aco_ptr<Instruction> convert_to_DPP(aco_ptr<Instruction>& instr, bool dpp8);
bool needs_exec_mask(const Instruction* instr);

/* Comparison-opcode transformations and queries. */
aco_opcode get_ordered(aco_opcode op);
aco_opcode get_unordered(aco_opcode op);
aco_opcode get_inverse(aco_opcode op);
aco_opcode get_f32_cmp(aco_opcode op);
unsigned get_cmp_bitsize(aco_opcode op);
bool is_cmp(aco_opcode op);

bool can_swap_operands(aco_ptr<Instruction>& instr, aco_opcode* new_op);

/* Identity element for a reduction operation (idx selects the dword). */
uint32_t get_reduction_identity(ReduceOp op, unsigned idx);

unsigned get_mimg_nsa_dwords(const Instruction* instr);

/* Whether two memory instructions should be emitted as one clause. */
bool should_form_clause(const Instruction* a, const Instruction* b);
1803
/* Bitmask of properties/roles of a basic block. Bit positions 11 and 14 are
 * currently unused (note the gaps in the values below). */
enum block_kind {
   /* uniform indicates that leaving this block,
    * all actives lanes stay active */
   block_kind_uniform = 1 << 0,
   block_kind_top_level = 1 << 1,
   block_kind_loop_preheader = 1 << 2,
   block_kind_loop_header = 1 << 3,
   block_kind_loop_exit = 1 << 4,
   block_kind_continue = 1 << 5,
   block_kind_break = 1 << 6,
   block_kind_continue_or_break = 1 << 7,
   block_kind_branch = 1 << 8,
   block_kind_merge = 1 << 9,
   block_kind_invert = 1 << 10,
   block_kind_uses_discard = 1 << 12,
   block_kind_needs_lowering = 1 << 13,
   block_kind_export_end = 1 << 15,
};
1822
1823 struct RegisterDemand {
1824 constexpr RegisterDemand() = default;
RegisterDemandRegisterDemand1825 constexpr RegisterDemand(const int16_t v, const int16_t s) noexcept : vgpr{v}, sgpr{s} {}
1826 int16_t vgpr = 0;
1827 int16_t sgpr = 0;
1828
1829 constexpr friend bool operator==(const RegisterDemand a, const RegisterDemand b) noexcept
1830 {
1831 return a.vgpr == b.vgpr && a.sgpr == b.sgpr;
1832 }
1833
exceedsRegisterDemand1834 constexpr bool exceeds(const RegisterDemand other) const noexcept
1835 {
1836 return vgpr > other.vgpr || sgpr > other.sgpr;
1837 }
1838
1839 constexpr RegisterDemand operator+(const Temp t) const noexcept
1840 {
1841 if (t.type() == RegType::sgpr)
1842 return RegisterDemand(vgpr, sgpr + t.size());
1843 else
1844 return RegisterDemand(vgpr + t.size(), sgpr);
1845 }
1846
1847 constexpr RegisterDemand operator+(const RegisterDemand other) const noexcept
1848 {
1849 return RegisterDemand(vgpr + other.vgpr, sgpr + other.sgpr);
1850 }
1851
1852 constexpr RegisterDemand operator-(const RegisterDemand other) const noexcept
1853 {
1854 return RegisterDemand(vgpr - other.vgpr, sgpr - other.sgpr);
1855 }
1856
1857 constexpr RegisterDemand& operator+=(const RegisterDemand other) noexcept
1858 {
1859 vgpr += other.vgpr;
1860 sgpr += other.sgpr;
1861 return *this;
1862 }
1863
1864 constexpr RegisterDemand& operator-=(const RegisterDemand other) noexcept
1865 {
1866 vgpr -= other.vgpr;
1867 sgpr -= other.sgpr;
1868 return *this;
1869 }
1870
1871 constexpr RegisterDemand& operator+=(const Temp t) noexcept
1872 {
1873 if (t.type() == RegType::sgpr)
1874 sgpr += t.size();
1875 else
1876 vgpr += t.size();
1877 return *this;
1878 }
1879
1880 constexpr RegisterDemand& operator-=(const Temp t) noexcept
1881 {
1882 if (t.type() == RegType::sgpr)
1883 sgpr -= t.size();
1884 else
1885 vgpr -= t.size();
1886 return *this;
1887 }
1888
updateRegisterDemand1889 constexpr void update(const RegisterDemand other) noexcept
1890 {
1891 vgpr = std::max(vgpr, other.vgpr);
1892 sgpr = std::max(sgpr, other.sgpr);
1893 }
1894 };
1895
/* CFG: a basic block of the program. Blocks participate in two CFGs: the
 * "logical" one (per-lane control flow) and the "linear" one (what the wave
 * actually executes). */
struct Block {
   /* Floating-point mode in effect for instructions of this block. */
   float_mode fp_mode;
   /* Index of this block within Program::blocks. */
   unsigned index;
   unsigned offset = 0;
   std::vector<aco_ptr<Instruction>> instructions;
   /* Predecessor/successor block indices in the logical and linear CFGs. */
   std::vector<unsigned> logical_preds;
   std::vector<unsigned> linear_preds;
   std::vector<unsigned> logical_succs;
   std::vector<unsigned> linear_succs;
   /* Register demand of this block — presumably the peak over its
    * instructions, filled by live_var_analysis(); confirm. */
   RegisterDemand register_demand = RegisterDemand();
   uint16_t loop_nest_depth = 0;
   uint16_t divergent_if_logical_depth = 0;
   uint16_t uniform_if_depth = 0;
   /* Bitmask of block_kind flags. */
   uint16_t kind = 0;
   /* Immediate dominators (block indices); -1 when not computed or none. */
   int logical_idom = -1;
   int linear_idom = -1;

   /* this information is needed for predecessors to blocks with phis when
    * moving out of ssa */
   bool scc_live_out = false;

   Block() : index(0) {}
};
1920
1921 /*
1922 * Shader stages as provided in Vulkan by the application. Contrast this to HWStage.
1923 */
/* Bitmask: individual stages can be OR'd together to describe merged shaders. */
enum class SWStage : uint16_t {
   None = 0,
   VS = 1 << 0,     /* Vertex Shader */
   GS = 1 << 1,     /* Geometry Shader */
   TCS = 1 << 2,    /* Tessellation Control aka Hull Shader */
   TES = 1 << 3,    /* Tessellation Evaluation aka Domain Shader */
   FS = 1 << 4,     /* Fragment aka Pixel Shader */
   CS = 1 << 5,     /* Compute Shader */
   TS = 1 << 6,     /* Task Shader */
   MS = 1 << 7,     /* Mesh Shader */
   GSCopy = 1 << 8, /* GS Copy Shader (internal) */

   /* Stage combinations merged to run on a single HWStage */
   VS_GS = VS | GS,
   VS_TCS = VS | TCS,
   TES_GS = TES | GS,
};
1941
1942 constexpr SWStage
1943 operator|(SWStage a, SWStage b)
1944 {
1945 return static_cast<SWStage>(static_cast<uint16_t>(a) | static_cast<uint16_t>(b));
1946 }
1947
1948 /*
1949 * Shader stages as running on the AMD GPU.
1950 *
1951 * The relation between HWStages and SWStages is not a one-to-one mapping:
1952 * Some SWStages are merged by ACO to run on a single HWStage.
1953 * See README.md for details.
1954 */
1955 enum class HWStage : uint8_t {
1956 VS,
1957 ES, /* Export shader: pre-GS (VS or TES) on GFX6-8. Combined into GS on GFX9 (and GFX10/legacy). */
1958 GS, /* Geometry shader on GFX10/legacy and GFX6-9. */
1959 NGG, /* Primitive shader, used to implement VS, TES, GS. */
1960 LS, /* Local shader: pre-TCS (VS) on GFX6-8. Combined into HS on GFX9 (and GFX10/legacy). */
1961 HS, /* Hull shader: TCS on GFX6-8. Merged VS and TCS on GFX9-10. */
1962 FS,
1963 CS,
1964 };
1965
1966 /*
1967 * Set of SWStages to be merged into a single shader paired with the
1968 * HWStage it will run on.
1969 */
1970 struct Stage {
1971 constexpr Stage() = default;
1972
StageStage1973 explicit constexpr Stage(HWStage hw_, SWStage sw_) : sw(sw_), hw(hw_) {}
1974
1975 /* Check if the given SWStage is included */
hasStage1976 constexpr bool has(SWStage stage) const
1977 {
1978 return (static_cast<uint16_t>(sw) & static_cast<uint16_t>(stage));
1979 }
1980
num_sw_stagesStage1981 unsigned num_sw_stages() const { return util_bitcount(static_cast<uint16_t>(sw)); }
1982
1983 constexpr bool operator==(const Stage& other) const { return sw == other.sw && hw == other.hw; }
1984
1985 constexpr bool operator!=(const Stage& other) const { return sw != other.sw || hw != other.hw; }
1986
1987 /* Mask of merged software stages */
1988 SWStage sw = SWStage::None;
1989
1990 /* Active hardware stage */
1991 HWStage hw{};
1992 };
1993
/* possible settings of Program::stage */
static constexpr Stage vertex_vs(HWStage::VS, SWStage::VS);
static constexpr Stage fragment_fs(HWStage::FS, SWStage::FS);
static constexpr Stage compute_cs(HWStage::CS, SWStage::CS);
static constexpr Stage tess_eval_vs(HWStage::VS, SWStage::TES);
static constexpr Stage gs_copy_vs(HWStage::VS, SWStage::GSCopy);
/* Mesh shading pipeline */
static constexpr Stage task_cs(HWStage::CS, SWStage::TS);
static constexpr Stage mesh_ngg(HWStage::NGG, SWStage::MS);
/* GFX10/NGG */
static constexpr Stage vertex_ngg(HWStage::NGG, SWStage::VS);
static constexpr Stage vertex_geometry_ngg(HWStage::NGG, SWStage::VS_GS);
static constexpr Stage tess_eval_ngg(HWStage::NGG, SWStage::TES);
static constexpr Stage tess_eval_geometry_ngg(HWStage::NGG, SWStage::TES_GS);
/* GFX9 (and GFX10 if NGG isn't used) */
static constexpr Stage vertex_geometry_gs(HWStage::GS, SWStage::VS_GS);
static constexpr Stage vertex_tess_control_hs(HWStage::HS, SWStage::VS_TCS);
static constexpr Stage tess_eval_geometry_gs(HWStage::GS, SWStage::TES_GS);
/* pre-GFX9 */
static constexpr Stage vertex_ls(HWStage::LS, SWStage::VS); /* vertex before tessellation control */
static constexpr Stage vertex_es(HWStage::ES, SWStage::VS); /* vertex before geometry */
static constexpr Stage tess_control_hs(HWStage::HS, SWStage::TCS);
static constexpr Stage tess_eval_es(HWStage::ES,
                                    SWStage::TES); /* tessellation evaluation before geometry */
static constexpr Stage geometry_gs(HWStage::GS, SWStage::GS);
2019
/* Indices into Program::statistics; gathered when Program::collect_statistics is set. */
enum statistic {
   statistic_hash,
   statistic_instructions,
   statistic_copies,
   statistic_branches,
   statistic_latency,
   statistic_inv_throughput,
   statistic_vmem_clauses,
   statistic_smem_clauses,
   statistic_sgpr_presched,
   statistic_vgpr_presched,
   num_statistics /* count of statistics, not a statistic itself */
};
2033
/* Static per-GPU hardware limits and feature flags (see init_program()).
 * All members carry default initializers so a default-constructed DeviceInfo
 * is in a well-defined, all-zero state — previously only the trailing bools
 * were initialized, leaving the leading fields indeterminate. */
struct DeviceInfo {
   uint16_t lds_encoding_granule = 0;
   uint16_t lds_alloc_granule = 0;
   uint32_t lds_limit = 0; /* in bytes */
   bool has_16bank_lds = false;
   uint16_t physical_sgprs = 0;
   uint16_t physical_vgprs = 0;
   uint16_t vgpr_limit = 0;
   uint16_t sgpr_limit = 0;
   uint16_t sgpr_alloc_granule = 0;
   uint16_t vgpr_alloc_granule = 0; /* must be power of two */
   unsigned max_wave64_per_simd = 0;
   unsigned simd_per_cu = 0;
   bool has_fast_fma32 = false;
   bool has_mac_legacy32 = false;
   bool fused_mad_mix = false;
   bool xnack_enabled = false;
   bool sram_ecc_enabled = false;
};
2053
/* Coarse compilation phase a Program has reached (stored in Program::progress). */
enum class CompilationProgress {
   after_isel,
   after_spilling,
   after_ra,
};
2059
/* A shader program under compilation: the CFG, temporary bookkeeping and all
 * per-program state shared between compilation passes. */
class Program final {
public:
   std::vector<Block> blocks;
   /* Register class of every temporary, indexed by temp id. Entry 0 (s1) is a
    * placeholder: allocationID starts at 1. */
   std::vector<RegClass> temp_rc = {s1};
   RegisterDemand max_reg_demand = RegisterDemand();
   uint16_t num_waves = 0;
   uint16_t max_waves = 0; /* maximum number of waves, regardless of register usage */
   ac_shader_config* config;
   const struct radv_shader_info* info;
   enum chip_class chip_class;
   enum radeon_family family;
   DeviceInfo dev;
   unsigned wave_size;
   /* Register class of a lane mask (s1 for wave32, s2 for wave64 — confirm). */
   RegClass lane_mask;
   Stage stage;
   bool needs_exact = false; /* there exists an instruction with disable_wqm = true */
   bool needs_wqm = false;   /* there exists a p_wqm instruction */

   std::vector<uint8_t> constant_data;
   Temp private_segment_buffer;
   Temp scratch_offset;

   uint16_t min_waves = 0;
   unsigned workgroup_size; /* if known; otherwise UINT_MAX */
   bool wgp_mode;
   bool early_rast = false; /* whether rasterization can start as soon as the 1st DONE pos export */

   bool needs_vcc = false;
   bool needs_flat_scr = false;

   CompilationProgress progress;

   bool collect_statistics = false;
   uint32_t statistics[num_statistics];

   /* State inherited by the next block created via insert_block(). */
   float_mode next_fp_mode;
   unsigned next_loop_depth = 0;
   unsigned next_divergent_if_logical_depth = 0;
   unsigned next_uniform_if_depth = 0;

   std::vector<Definition> vs_inputs;

   /* Debug-message sink; func (if set) presumably forwards messages to the
    * driver — confirm against users of Program::debug. */
   struct {
      FILE* output = stderr;
      bool shorten_messages = false;
      void (*func)(void* private_data, enum radv_compiler_debug_level level, const char* message);
      void* private_data;
   } debug;

   /* Allocate a fresh temporary id with register class rc.
    * Ids are limited to 24 bits (16777215 == 2^24 - 1). */
   uint32_t allocateId(RegClass rc)
   {
      assert(allocationID <= 16777215);
      temp_rc.push_back(rc);
      return allocationID++;
   }

   /* Reserve `amount` consecutive temporary ids. The register classes of the
    * new entries are default-constructed; callers are expected to fill them. */
   void allocateRange(unsigned amount)
   {
      assert(allocationID + amount <= 16777216);
      temp_rc.resize(temp_rc.size() + amount);
      allocationID += amount;
   }

   /* Allocate a new temporary of register class rc. */
   Temp allocateTmp(RegClass rc) { return Temp(allocateId(rc), rc); }

   /* Id the next allocation will return, without allocating. */
   uint32_t peekAllocationId() { return allocationID; }

   friend void reindex_ssa(Program* program);
   friend void reindex_ssa(Program* program, std::vector<IDSet>& live_out);

   /* Append a new block to the program and return a pointer to it. */
   Block* create_and_insert_block()
   {
      Block block;
      return insert_block(std::move(block));
   }

   /* Append `block`, assigning its index and inheriting the current
    * fp mode / loop / divergence / uniform-if depth state. */
   Block* insert_block(Block&& block)
   {
      block.index = blocks.size();
      block.fp_mode = next_fp_mode;
      block.loop_nest_depth = next_loop_depth;
      block.divergent_if_logical_depth = next_divergent_if_logical_depth;
      block.uniform_if_depth = next_uniform_if_depth;
      blocks.emplace_back(std::move(block));
      return &blocks.back();
   }

private:
   /* Next temporary id; 0 is reserved (temp_rc starts with one entry). */
   uint32_t allocationID = 1;
};
2150
/* Result of live-variable analysis (see live_var_analysis()). */
struct live {
   /* live temps out per block */
   std::vector<IDSet> live_out;
   /* register demand (sgpr/vgpr) per instruction per block */
   std::vector<std::vector<RegisterDemand>> register_demand;
};
2157
/* Knobs used by tests to steer the register allocator. */
struct ra_test_policy {
   /* Force RA to always use its pessimistic fallback algorithm */
   bool skip_optimistic_path = false;
};
2162
/* One-time global initialization of the compiler (e.g. debug flags). */
void init();

/* Set up chip/device information and initial state of `program`. */
void init_program(Program* program, Stage stage, const struct radv_shader_info* info,
                  enum chip_class chip_class, enum radeon_family family, bool wgp_mode,
                  ac_shader_config* config);

/* Instruction selection entry points: build ACO IR from NIR (or from a key
 * for the VS prolog). */
void select_program(Program* program, unsigned shader_count, struct nir_shader* const* shaders,
                    ac_shader_config* config, const struct radv_nir_compiler_options* options,
                    const struct radv_shader_info* info,
                    const struct radv_shader_args* args);
void select_gs_copy_shader(Program* program, struct nir_shader* gs_shader, ac_shader_config* config,
                           const struct radv_nir_compiler_options* options,
                           const struct radv_shader_info* info,
                           const struct radv_shader_args* args);
void select_trap_handler_shader(Program* program, struct nir_shader* shader,
                                ac_shader_config* config,
                                const struct radv_nir_compiler_options* options,
                                const struct radv_shader_info* info,
                                const struct radv_shader_args* args);
void select_vs_prolog(Program* program, const struct radv_vs_prolog_key* key,
                      ac_shader_config* config,
                      const struct radv_nir_compiler_options* options,
                      const struct radv_shader_info* info,
                      const struct radv_shader_args* args,
                      unsigned* num_preserved_sgprs);

/* IR transformation passes, implemented in separate files. The usual pipeline
 * order roughly follows the declaration order below. */
void lower_phis(Program* program);
void calc_min_waves(Program* program);
void update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand);
live live_var_analysis(Program* program);
std::vector<uint16_t> dead_code_analysis(Program* program);
void dominator_tree(Program* program);
void insert_exec_mask(Program* program);
void value_numbering(Program* program);
void optimize(Program* program);
void optimize_postRA(Program* program);
void setup_reduce_temp(Program* program);
void lower_to_cssa(Program* program, live& live_vars);
void register_allocation(Program* program, std::vector<IDSet>& live_out_per_block,
                         ra_test_policy = {});
void ssa_elimination(Program* program);
void lower_to_hw_instr(Program* program);
void schedule_program(Program* program, live& live_vars);
void spill(Program* program, live& live_vars);
void insert_wait_states(Program* program);
void insert_NOPs(Program* program);
void form_hard_clauses(Program* program);
/* Assemble the program into machine-code dwords appended to `code`. */
unsigned emit_program(Program* program, std::vector<uint32_t>& code);
/**
 * Returns true if print_asm can disassemble the given program for the current build/runtime
 * configuration
 */
bool check_print_asm_support(Program* program);
bool print_asm(Program* program, std::vector<uint32_t>& binary, unsigned exec_size, FILE* output);
/* IR/RA validators; enabled via DEBUG_VALIDATE_IR / DEBUG_VALIDATE_RA. */
bool validate_ir(Program* program);
bool validate_ra(Program* program);
#ifndef NDEBUG
void perfwarn(Program* program, bool cond, const char* msg, Instruction* instr = NULL);
#else
/* No-op in release builds. */
#define perfwarn(program, cond, msg, ...)                                                          \
   do {                                                                                            \
   } while (0)
#endif
2230
/* Flags controlling aco_print_* output (bitmask passed as `flags`). */
enum print_flags {
   print_no_ssa = 0x1,
   print_perf_info = 0x2,
   print_kill = 0x4,
   print_live_vars = 0x8,
};
2237
/* Human-readable IR printers; `flags` is a bitmask of print_flags. */
void aco_print_operand(const Operand* operand, FILE* output, unsigned flags = 0);
void aco_print_instr(const Instruction* instr, FILE* output, unsigned flags = 0);
void aco_print_program(const Program* program, FILE* output, unsigned flags = 0);
void aco_print_program(const Program* program, FILE* output, const live& live_vars,
                       unsigned flags = 0);

/* printf-style diagnostics; use the aco_perfwarn/aco_err macros below so that
 * file and line information is filled in automatically. */
void _aco_perfwarn(Program* program, const char* file, unsigned line, const char* fmt, ...);
void _aco_err(Program* program, const char* file, unsigned line, const char* fmt, ...);

#define aco_perfwarn(program, ...) _aco_perfwarn(program, __FILE__, __LINE__, __VA_ARGS__)
#define aco_err(program, ...) _aco_err(program, __FILE__, __LINE__, __VA_ARGS__)
2249
/* utilities for dealing with register demand */
RegisterDemand get_live_changes(aco_ptr<Instruction>& instr);
RegisterDemand get_temp_registers(aco_ptr<Instruction>& instr);
RegisterDemand get_demand_before(RegisterDemand demand, aco_ptr<Instruction>& instr,
                                 aco_ptr<Instruction>& instr_before);

/* number of sgprs that need to be allocated but might not be addressable as s0-s105 */
uint16_t get_extra_sgprs(Program* program);

/* get the sgpr/vgpr allocation size required to address a number of sgprs/vgprs */
uint16_t get_sgpr_alloc(Program* program, uint16_t addressable_sgprs);
uint16_t get_vgpr_alloc(Program* program, uint16_t addressable_vgprs);

/* return number of addressable sgprs/vgprs for max_waves */
uint16_t get_addr_sgpr_from_waves(Program* program, uint16_t max_waves);
uint16_t get_addr_vgpr_from_waves(Program* program, uint16_t max_waves);
2266
/* Per-opcode property tables, indexed by aco_opcode (generated elsewhere). */
typedef struct {
   /* Hardware encoding per chip generation — presumably -1 when the opcode
    * does not exist on that generation; confirm against the generator. */
   const int16_t opcode_gfx7[static_cast<int>(aco_opcode::num_opcodes)];
   const int16_t opcode_gfx9[static_cast<int>(aco_opcode::num_opcodes)];
   const int16_t opcode_gfx10[static_cast<int>(aco_opcode::num_opcodes)];
   const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> can_use_input_modifiers;
   const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> can_use_output_modifiers;
   const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> is_atomic;
   /* Human-readable mnemonic for each opcode. */
   const char* name[static_cast<int>(aco_opcode::num_opcodes)];
   /* Microcode encoding format of each opcode. */
   const aco::Format format[static_cast<int>(aco_opcode::num_opcodes)];
   /* sizes used for input/output modifiers and constants */
   const unsigned operand_size[static_cast<int>(aco_opcode::num_opcodes)];
   const instr_class classes[static_cast<int>(aco_opcode::num_opcodes)];
} Info;

/* The single global instance of the tables above (generated). */
extern const Info instr_info;
2282
2283 } // namespace aco
2284
2285 #endif /* ACO_IR_H */
2286