1 //===- AMDGPURegisterBankInfo.cpp -------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements the targeting of the RegisterBankInfo class for
10 /// AMDGPU.
11 ///
12 /// \par
13 ///
14 /// AMDGPU has unique register bank constraints that require special high level
15 /// strategies to deal with. There are two main true physical register banks
16 /// VGPR (vector), and SGPR (scalar). Additionally the VCC register bank is a
17 /// sort of pseudo-register bank needed to represent SGPRs used in a vector
18 /// boolean context. There is also the AGPR bank, which is a special purpose
19 /// physical register bank present on some subtargets.
20 ///
21 /// Copying from VGPR to SGPR is generally illegal, unless the value is known to
22 /// be uniform. It is generally not valid to legalize operands by inserting
23 /// copies as on other targets. Operations which require uniform, SGPR operands
24 /// generally require scalarization by repeatedly executing the instruction,
25 /// activating each set of lanes using a unique set of input values. This is
26 /// referred to as a waterfall loop.
27 ///
28 /// \par Booleans
29 ///
/// Booleans (s1 values) require special consideration. A vector compare result
31 /// is naturally a bitmask with one bit per lane, in a 32 or 64-bit
32 /// register. These are represented with the VCC bank. During selection, we need
33 /// to be able to unambiguously go back from a register class to a register
34 /// bank. To distinguish whether an SGPR should use the SGPR or VCC register
35 /// bank, we need to know the use context type. An SGPR s1 value always means a
36 /// VCC bank value, otherwise it will be the SGPR bank. A scalar compare sets
37 /// SCC, which is a 1-bit unaddressable register. This will need to be copied to
38 /// a 32-bit virtual register. Taken together, this means we need to adjust the
39 /// type of boolean operations to be regbank legal. All SALU booleans need to be
40 /// widened to 32-bits, and all VALU booleans need to be s1 values.
41 ///
42 /// A noteworthy exception to the s1-means-vcc rule is for legalization artifact
43 /// casts. G_TRUNC s1 results, and G_SEXT/G_ZEXT/G_ANYEXT sources are never vcc
44 /// bank. A non-boolean source (such as a truncate from a 1-bit load from
45 /// memory) will require a copy to the VCC bank which will require clearing the
46 /// high bits and inserting a compare.
47 ///
48 /// \par Constant bus restriction
49 ///
50 /// VALU instructions have a limitation known as the constant bus
51 /// restriction. Most VALU instructions can use SGPR operands, but may read at
/// most 1 SGPR or constant literal value (this is raised to 2 in gfx10 for most
53 /// instructions). This is one unique SGPR, so the same SGPR may be used for
54 /// multiple operands. From a register bank perspective, any combination of
55 /// operands should be legal as an SGPR, but this is contextually dependent on
/// the SGPR operands all being the same register. It is therefore optimal to
57 /// choose the SGPR with the most uses to minimize the number of copies.
58 ///
59 /// We avoid trying to solve this problem in RegBankSelect. Any VALU G_*
60 /// operation should have its source operands all mapped to VGPRs (except for
/// VCC), inserting copies from any SGPR operands. This is the most trivial
/// legal
62 /// mapping. Anything beyond the simplest 1:1 instruction selection would be too
63 /// complicated to solve here. Every optimization pattern or instruction
64 /// selected to multiple outputs would have to enforce this rule, and there
65 /// would be additional complexity in tracking this rule for every G_*
66 /// operation. By forcing all inputs to VGPRs, it also simplifies the task of
67 /// picking the optimal operand combination from a post-isel optimization pass.
68 ///
69 //===----------------------------------------------------------------------===//
70
71 #include "AMDGPURegisterBankInfo.h"
72
73 #include "AMDGPU.h"
74 #include "AMDGPUGlobalISelUtils.h"
75 #include "AMDGPUInstrInfo.h"
76 #include "GCNSubtarget.h"
77 #include "SIMachineFunctionInfo.h"
78 #include "SIRegisterInfo.h"
79 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
80 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
81 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
82 #include "llvm/CodeGen/GlobalISel/RegisterBank.h"
83 #include "llvm/IR/IntrinsicsAMDGPU.h"
84
85 #define GET_TARGET_REGBANK_IMPL
86 #include "AMDGPUGenRegisterBank.inc"
87
88 // This file will be TableGen'ed at some point.
89 #include "AMDGPUGenRegisterBankInfo.def"
90
91 using namespace llvm;
92 using namespace MIPatternMatch;
93
94 namespace {
95
96 // Observer to apply a register bank to new registers created by LegalizerHelper.
97 class ApplyRegBankMapping final : public GISelChangeObserver {
98 private:
99 const AMDGPURegisterBankInfo &RBI;
100 MachineRegisterInfo &MRI;
101 const RegisterBank *NewBank;
102 SmallVector<MachineInstr *, 4> NewInsts;
103
104 public:
ApplyRegBankMapping(const AMDGPURegisterBankInfo & RBI_,MachineRegisterInfo & MRI_,const RegisterBank * RB)105 ApplyRegBankMapping(const AMDGPURegisterBankInfo &RBI_,
106 MachineRegisterInfo &MRI_, const RegisterBank *RB)
107 : RBI(RBI_), MRI(MRI_), NewBank(RB) {}
108
~ApplyRegBankMapping()109 ~ApplyRegBankMapping() {
110 for (MachineInstr *MI : NewInsts)
111 applyBank(*MI);
112 }
113
114 /// Set any registers that don't have a set register class or bank to SALU.
applyBank(MachineInstr & MI)115 void applyBank(MachineInstr &MI) {
116 const unsigned Opc = MI.getOpcode();
117 if (Opc == AMDGPU::G_ANYEXT || Opc == AMDGPU::G_ZEXT ||
118 Opc == AMDGPU::G_SEXT) {
119 // LegalizerHelper wants to use the basic legalization artifacts when
120 // widening etc. We don't handle selection with vcc in artifact sources,
121 // so we need to use a sslect instead to handle these properly.
122 Register DstReg = MI.getOperand(0).getReg();
123 Register SrcReg = MI.getOperand(1).getReg();
124 const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, MRI, *RBI.TRI);
125 if (SrcBank == &AMDGPU::VCCRegBank) {
126 const LLT S32 = LLT::scalar(32);
127 assert(MRI.getType(SrcReg) == LLT::scalar(1));
128 assert(MRI.getType(DstReg) == S32);
129 assert(NewBank == &AMDGPU::VGPRRegBank);
130
131 // Replace the extension with a select, which really uses the boolean
132 // source.
133 MachineIRBuilder B(MI);
134 auto True = B.buildConstant(S32, Opc == AMDGPU::G_SEXT ? -1 : 1);
135 auto False = B.buildConstant(S32, 0);
136 B.buildSelect(DstReg, SrcReg, True, False);
137 MRI.setRegBank(True.getReg(0), *NewBank);
138 MRI.setRegBank(False.getReg(0), *NewBank);
139 MI.eraseFromParent();
140 }
141
142 assert(!MRI.getRegClassOrRegBank(DstReg));
143 MRI.setRegBank(DstReg, *NewBank);
144 return;
145 }
146
147 #ifndef NDEBUG
148 if (Opc == AMDGPU::G_TRUNC) {
149 Register DstReg = MI.getOperand(0).getReg();
150 const RegisterBank *DstBank = RBI.getRegBank(DstReg, MRI, *RBI.TRI);
151 assert(DstBank != &AMDGPU::VCCRegBank);
152 }
153 #endif
154
155 for (MachineOperand &Op : MI.operands()) {
156 if (!Op.isReg())
157 continue;
158
159 // We may see physical registers if building a real MI
160 Register Reg = Op.getReg();
161 if (Reg.isPhysical() || MRI.getRegClassOrRegBank(Reg))
162 continue;
163
164 const RegisterBank *RB = NewBank;
165 if (MRI.getType(Reg) == LLT::scalar(1)) {
166 assert(NewBank == &AMDGPU::VGPRRegBank &&
167 "s1 operands should only be used for vector bools");
168 assert((MI.getOpcode() != AMDGPU::G_TRUNC &&
169 MI.getOpcode() != AMDGPU::G_ANYEXT) &&
170 "not expecting legalization artifacts here");
171 RB = &AMDGPU::VCCRegBank;
172 }
173
174 MRI.setRegBank(Reg, *RB);
175 }
176 }
177
erasingInstr(MachineInstr & MI)178 void erasingInstr(MachineInstr &MI) override {}
179
createdInstr(MachineInstr & MI)180 void createdInstr(MachineInstr &MI) override {
181 // At this point, the instruction was just inserted and has no operands.
182 NewInsts.push_back(&MI);
183 }
184
changingInstr(MachineInstr & MI)185 void changingInstr(MachineInstr &MI) override {}
changedInstr(MachineInstr & MI)186 void changedInstr(MachineInstr &MI) override {
187 // FIXME: In principle we should probably add the instruction to NewInsts,
188 // but the way the LegalizerHelper uses the observer, we will always see the
189 // registers we need to set the regbank on also referenced in a new
190 // instruction.
191 }
192 };
193
194 }
AMDGPURegisterBankInfo(const GCNSubtarget & ST)195 AMDGPURegisterBankInfo::AMDGPURegisterBankInfo(const GCNSubtarget &ST)
196 : AMDGPUGenRegisterBankInfo(),
197 Subtarget(ST),
198 TRI(Subtarget.getRegisterInfo()),
199 TII(Subtarget.getInstrInfo()) {
200
201 // HACK: Until this is fully tablegen'd.
202 static llvm::once_flag InitializeRegisterBankFlag;
203
204 static auto InitializeRegisterBankOnce = [this]() {
205 assert(&getRegBank(AMDGPU::SGPRRegBankID) == &AMDGPU::SGPRRegBank &&
206 &getRegBank(AMDGPU::VGPRRegBankID) == &AMDGPU::VGPRRegBank &&
207 &getRegBank(AMDGPU::AGPRRegBankID) == &AMDGPU::AGPRRegBank);
208 (void)this;
209 };
210
211 llvm::call_once(InitializeRegisterBankFlag, InitializeRegisterBankOnce);
212 }
213
isVectorRegisterBank(const RegisterBank & Bank)214 static bool isVectorRegisterBank(const RegisterBank &Bank) {
215 unsigned BankID = Bank.getID();
216 return BankID == AMDGPU::VGPRRegBankID || BankID == AMDGPU::AGPRRegBankID;
217 }
218
copyCost(const RegisterBank & Dst,const RegisterBank & Src,unsigned Size) const219 unsigned AMDGPURegisterBankInfo::copyCost(const RegisterBank &Dst,
220 const RegisterBank &Src,
221 unsigned Size) const {
222 // TODO: Should there be a UniformVGPRRegBank which can use readfirstlane?
223 if (Dst.getID() == AMDGPU::SGPRRegBankID &&
224 (isVectorRegisterBank(Src) || Src.getID() == AMDGPU::VCCRegBankID)) {
225 return std::numeric_limits<unsigned>::max();
226 }
227
228 // Bool values are tricky, because the meaning is based on context. The SCC
229 // and VCC banks are for the natural scalar and vector conditions produced by
230 // a compare.
231 //
232 // Legalization doesn't know about the necessary context, so an s1 use may
233 // have been a truncate from an arbitrary value, in which case a copy (lowered
234 // as a compare with 0) needs to be inserted.
235 if (Size == 1 &&
236 (Dst.getID() == AMDGPU::SGPRRegBankID) &&
237 (isVectorRegisterBank(Src) ||
238 Src.getID() == AMDGPU::SGPRRegBankID ||
239 Src.getID() == AMDGPU::VCCRegBankID))
240 return std::numeric_limits<unsigned>::max();
241
242 // There is no direct copy between AGPRs.
243 if (Dst.getID() == AMDGPU::AGPRRegBankID &&
244 Src.getID() == AMDGPU::AGPRRegBankID)
245 return 4;
246
247 return RegisterBankInfo::copyCost(Dst, Src, Size);
248 }
249
getBreakDownCost(const ValueMapping & ValMapping,const RegisterBank * CurBank) const250 unsigned AMDGPURegisterBankInfo::getBreakDownCost(
251 const ValueMapping &ValMapping,
252 const RegisterBank *CurBank) const {
253 // Check if this is a breakdown for G_LOAD to move the pointer from SGPR to
254 // VGPR.
255 // FIXME: Is there a better way to do this?
256 if (ValMapping.NumBreakDowns >= 2 || ValMapping.BreakDown[0].Length >= 64)
257 return 10; // This is expensive.
258
259 assert(ValMapping.NumBreakDowns == 2 &&
260 ValMapping.BreakDown[0].Length == 32 &&
261 ValMapping.BreakDown[0].StartIdx == 0 &&
262 ValMapping.BreakDown[1].Length == 32 &&
263 ValMapping.BreakDown[1].StartIdx == 32 &&
264 ValMapping.BreakDown[0].RegBank == ValMapping.BreakDown[1].RegBank);
265
266 // 32-bit extract of a 64-bit value is just access of a subregister, so free.
267 // TODO: Cost of 0 hits assert, though it's not clear it's what we really
268 // want.
269
270 // TODO: 32-bit insert to a 64-bit SGPR may incur a non-free copy due to SGPR
271 // alignment restrictions, but this probably isn't important.
272 return 1;
273 }
274
275 const RegisterBank &
getRegBankFromRegClass(const TargetRegisterClass & RC,LLT Ty) const276 AMDGPURegisterBankInfo::getRegBankFromRegClass(const TargetRegisterClass &RC,
277 LLT Ty) const {
278 if (&RC == &AMDGPU::SReg_1RegClass)
279 return AMDGPU::VCCRegBank;
280
281 // We promote real scalar booleans to SReg_32. Any SGPR using s1 is really a
282 // VCC-like use.
283 if (TRI->isSGPRClass(&RC)) {
284 // FIXME: This probably came from a copy from a physical register, which
285 // should be inferrrable from the copied to-type. We don't have many boolean
286 // physical register constraints so just assume a normal SGPR for now.
287 if (!Ty.isValid())
288 return AMDGPU::SGPRRegBank;
289
290 return Ty == LLT::scalar(1) ? AMDGPU::VCCRegBank : AMDGPU::SGPRRegBank;
291 }
292
293 return TRI->isAGPRClass(&RC) ? AMDGPU::AGPRRegBank : AMDGPU::VGPRRegBank;
294 }
295
296 template <unsigned NumOps>
297 RegisterBankInfo::InstructionMappings
addMappingFromTable(const MachineInstr & MI,const MachineRegisterInfo & MRI,const std::array<unsigned,NumOps> RegSrcOpIdx,ArrayRef<OpRegBankEntry<NumOps>> Table) const298 AMDGPURegisterBankInfo::addMappingFromTable(
299 const MachineInstr &MI, const MachineRegisterInfo &MRI,
300 const std::array<unsigned, NumOps> RegSrcOpIdx,
301 ArrayRef<OpRegBankEntry<NumOps>> Table) const {
302
303 InstructionMappings AltMappings;
304
305 SmallVector<const ValueMapping *, 10> Operands(MI.getNumOperands());
306
307 unsigned Sizes[NumOps];
308 for (unsigned I = 0; I < NumOps; ++I) {
309 Register Reg = MI.getOperand(RegSrcOpIdx[I]).getReg();
310 Sizes[I] = getSizeInBits(Reg, MRI, *TRI);
311 }
312
313 for (unsigned I = 0, E = MI.getNumExplicitDefs(); I != E; ++I) {
314 unsigned SizeI = getSizeInBits(MI.getOperand(I).getReg(), MRI, *TRI);
315 Operands[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SizeI);
316 }
317
318 // getInstrMapping's default mapping uses ID 1, so start at 2.
319 unsigned MappingID = 2;
320 for (const auto &Entry : Table) {
321 for (unsigned I = 0; I < NumOps; ++I) {
322 int OpIdx = RegSrcOpIdx[I];
323 Operands[OpIdx] = AMDGPU::getValueMapping(Entry.RegBanks[I], Sizes[I]);
324 }
325
326 AltMappings.push_back(&getInstructionMapping(MappingID++, Entry.Cost,
327 getOperandsMapping(Operands),
328 Operands.size()));
329 }
330
331 return AltMappings;
332 }
333
334 RegisterBankInfo::InstructionMappings
getInstrAlternativeMappingsIntrinsic(const MachineInstr & MI,const MachineRegisterInfo & MRI) const335 AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsic(
336 const MachineInstr &MI, const MachineRegisterInfo &MRI) const {
337 switch (MI.getIntrinsicID()) {
338 case Intrinsic::amdgcn_readlane: {
339 static const OpRegBankEntry<3> Table[2] = {
340 // Perfectly legal.
341 { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },
342
343 // Need a readfirstlane for the index.
344 { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 }
345 };
346
347 const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 2, 3 } };
348 return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
349 }
350 case Intrinsic::amdgcn_writelane: {
351 static const OpRegBankEntry<4> Table[4] = {
352 // Perfectly legal.
353 { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },
354
355 // Need readfirstlane of first op
356 { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 },
357
358 // Need readfirstlane of second op
359 { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 },
360
361 // Need readfirstlane of both ops
362 { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 3 }
363 };
364
365 // rsrc, voffset, offset
366 const std::array<unsigned, 4> RegSrcOpIdx = { { 0, 2, 3, 4 } };
367 return addMappingFromTable<4>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
368 }
369 default:
370 return RegisterBankInfo::getInstrAlternativeMappings(MI);
371 }
372 }
373
374 RegisterBankInfo::InstructionMappings
getInstrAlternativeMappingsIntrinsicWSideEffects(const MachineInstr & MI,const MachineRegisterInfo & MRI) const375 AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsicWSideEffects(
376 const MachineInstr &MI, const MachineRegisterInfo &MRI) const {
377
378 switch (MI.getIntrinsicID()) {
379 case Intrinsic::amdgcn_s_buffer_load: {
380 static const OpRegBankEntry<2> Table[4] = {
381 // Perfectly legal.
382 { { AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },
383
384 // Only need 1 register in loop
385 { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 300 },
386
387 // Have to waterfall the resource.
388 { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1000 },
389
390 // Have to waterfall the resource, and the offset.
391 { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1500 }
392 };
393
394 // rsrc, offset
395 const std::array<unsigned, 2> RegSrcOpIdx = { { 2, 3 } };
396 return addMappingFromTable<2>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
397 }
398 case Intrinsic::amdgcn_ds_ordered_add:
399 case Intrinsic::amdgcn_ds_ordered_swap: {
400 // VGPR = M0, VGPR
401 static const OpRegBankEntry<3> Table[2] = {
402 // Perfectly legal.
403 { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },
404
405 // Need a readfirstlane for m0
406 { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 }
407 };
408
409 const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 2, 3 } };
410 return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
411 }
412 case Intrinsic::amdgcn_s_sendmsg:
413 case Intrinsic::amdgcn_s_sendmsghalt: {
414 // FIXME: Should have no register for immediate
415 static const OpRegBankEntry<1> Table[2] = {
416 // Perfectly legal.
417 { { AMDGPU::SGPRRegBankID }, 1 },
418
419 // Need readlane
420 { { AMDGPU::VGPRRegBankID }, 3 }
421 };
422
423 const std::array<unsigned, 1> RegSrcOpIdx = { { 2 } };
424 return addMappingFromTable<1>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
425 }
426 default:
427 return RegisterBankInfo::getInstrAlternativeMappings(MI);
428 }
429 }
430
memOpHasNoClobbered(const MachineMemOperand * MMO)431 static bool memOpHasNoClobbered(const MachineMemOperand *MMO) {
432 const Instruction *I = dyn_cast_or_null<Instruction>(MMO->getValue());
433 return I && I->getMetadata("amdgpu.noclobber");
434 }
435
436 // FIXME: Returns uniform if there's no source value information. This is
437 // probably wrong.
isScalarLoadLegal(const MachineInstr & MI)438 static bool isScalarLoadLegal(const MachineInstr &MI) {
439 if (!MI.hasOneMemOperand())
440 return false;
441
442 const MachineMemOperand *MMO = *MI.memoperands_begin();
443 const unsigned AS = MMO->getAddrSpace();
444 const bool IsConst = AS == AMDGPUAS::CONSTANT_ADDRESS ||
445 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT;
446 // Require 4-byte alignment.
447 return MMO->getAlign() >= Align(4) &&
448 // Can't do a scalar atomic load.
449 !MMO->isAtomic() &&
450 // Don't use scalar loads for volatile accesses to non-constant address
451 // spaces.
452 (IsConst || !MMO->isVolatile()) &&
453 // Memory must be known constant, or not written before this load.
454 (IsConst || MMO->isInvariant() || memOpHasNoClobbered(MMO)) &&
455 AMDGPUInstrInfo::isUniformMMO(MMO);
456 }
457
458 RegisterBankInfo::InstructionMappings
getInstrAlternativeMappings(const MachineInstr & MI) const459 AMDGPURegisterBankInfo::getInstrAlternativeMappings(
460 const MachineInstr &MI) const {
461
462 const MachineFunction &MF = *MI.getParent()->getParent();
463 const MachineRegisterInfo &MRI = MF.getRegInfo();
464
465
466 InstructionMappings AltMappings;
467 switch (MI.getOpcode()) {
468 case TargetOpcode::G_CONSTANT: {
469 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
470 if (Size == 1) {
471 static const OpRegBankEntry<1> Table[3] = {
472 { { AMDGPU::VGPRRegBankID }, 1 },
473 { { AMDGPU::SGPRRegBankID }, 1 },
474 { { AMDGPU::VCCRegBankID }, 1 }
475 };
476
477 return addMappingFromTable<1>(MI, MRI, {{ 0 }}, Table);
478 }
479
480 LLVM_FALLTHROUGH;
481 }
482 case TargetOpcode::G_FCONSTANT:
483 case TargetOpcode::G_FRAME_INDEX:
484 case TargetOpcode::G_GLOBAL_VALUE: {
485 static const OpRegBankEntry<1> Table[2] = {
486 { { AMDGPU::VGPRRegBankID }, 1 },
487 { { AMDGPU::SGPRRegBankID }, 1 }
488 };
489
490 return addMappingFromTable<1>(MI, MRI, {{ 0 }}, Table);
491 }
492 case TargetOpcode::G_AND:
493 case TargetOpcode::G_OR:
494 case TargetOpcode::G_XOR: {
495 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
496
497 if (Size == 1) {
498 // s_{and|or|xor}_b32 set scc when the result of the 32-bit op is not 0.
499 const InstructionMapping &SCCMapping = getInstructionMapping(
500 1, 1, getOperandsMapping(
501 {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32),
502 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32),
503 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32)}),
504 3); // Num Operands
505 AltMappings.push_back(&SCCMapping);
506
507 const InstructionMapping &VCCMapping0 = getInstructionMapping(
508 2, 1, getOperandsMapping(
509 {AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size),
510 AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size),
511 AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size)}),
512 3); // Num Operands
513 AltMappings.push_back(&VCCMapping0);
514 return AltMappings;
515 }
516
517 if (Size != 64)
518 break;
519
520 const InstructionMapping &SSMapping = getInstructionMapping(
521 1, 1, getOperandsMapping(
522 {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
523 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
524 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
525 3); // Num Operands
526 AltMappings.push_back(&SSMapping);
527
528 const InstructionMapping &VVMapping = getInstructionMapping(
529 2, 2, getOperandsMapping(
530 {AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
531 AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
532 AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}),
533 3); // Num Operands
534 AltMappings.push_back(&VVMapping);
535 break;
536 }
537 case TargetOpcode::G_LOAD:
538 case TargetOpcode::G_ZEXTLOAD:
539 case TargetOpcode::G_SEXTLOAD: {
540 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
541 LLT PtrTy = MRI.getType(MI.getOperand(1).getReg());
542 unsigned PtrSize = PtrTy.getSizeInBits();
543 unsigned AS = PtrTy.getAddressSpace();
544
545 if ((AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::REGION_ADDRESS &&
546 AS != AMDGPUAS::PRIVATE_ADDRESS) &&
547 isScalarLoadLegal(MI)) {
548 const InstructionMapping &SSMapping = getInstructionMapping(
549 1, 1, getOperandsMapping(
550 {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
551 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize)}),
552 2); // Num Operands
553 AltMappings.push_back(&SSMapping);
554 }
555
556 const InstructionMapping &VVMapping = getInstructionMapping(
557 2, 1,
558 getOperandsMapping(
559 {AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
560 AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize)}),
561 2); // Num Operands
562 AltMappings.push_back(&VVMapping);
563
564 // It may be possible to have a vgpr = load sgpr mapping here, because
565 // the mubuf instructions support this kind of load, but probably for only
566 // gfx7 and older. However, the addressing mode matching in the instruction
567 // selector should be able to do a better job of detecting and selecting
568 // these kinds of loads from the vgpr = load vgpr mapping.
569
570 return AltMappings;
571
572 }
573 case TargetOpcode::G_SELECT: {
574 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
575 const InstructionMapping &SSMapping = getInstructionMapping(1, 1,
576 getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
577 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1),
578 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
579 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
580 4); // Num Operands
581 AltMappings.push_back(&SSMapping);
582
583 const InstructionMapping &VVMapping = getInstructionMapping(2, 1,
584 getOperandsMapping({AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
585 AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
586 AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
587 AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}),
588 4); // Num Operands
589 AltMappings.push_back(&VVMapping);
590
591 return AltMappings;
592 }
593 case TargetOpcode::G_UADDE:
594 case TargetOpcode::G_USUBE:
595 case TargetOpcode::G_SADDE:
596 case TargetOpcode::G_SSUBE: {
597 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
598 const InstructionMapping &SSMapping = getInstructionMapping(1, 1,
599 getOperandsMapping(
600 {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
601 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1),
602 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
603 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
604 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1)}),
605 5); // Num Operands
606 AltMappings.push_back(&SSMapping);
607
608 const InstructionMapping &VVMapping = getInstructionMapping(2, 1,
609 getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
610 AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
611 AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
612 AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
613 AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1)}),
614 5); // Num Operands
615 AltMappings.push_back(&VVMapping);
616 return AltMappings;
617 }
618 case AMDGPU::G_BRCOND: {
619 assert(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1);
620
621 // TODO: Change type to 32 for scalar
622 const InstructionMapping &SMapping = getInstructionMapping(
623 1, 1, getOperandsMapping(
624 {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1), nullptr}),
625 2); // Num Operands
626 AltMappings.push_back(&SMapping);
627
628 const InstructionMapping &VMapping = getInstructionMapping(
629 1, 1, getOperandsMapping(
630 {AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1), nullptr }),
631 2); // Num Operands
632 AltMappings.push_back(&VMapping);
633 return AltMappings;
634 }
635 case AMDGPU::G_INTRINSIC:
636 return getInstrAlternativeMappingsIntrinsic(MI, MRI);
637 case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS:
638 return getInstrAlternativeMappingsIntrinsicWSideEffects(MI, MRI);
639 default:
640 break;
641 }
642 return RegisterBankInfo::getInstrAlternativeMappings(MI);
643 }
644
split64BitValueForMapping(MachineIRBuilder & B,SmallVector<Register,2> & Regs,LLT HalfTy,Register Reg) const645 void AMDGPURegisterBankInfo::split64BitValueForMapping(
646 MachineIRBuilder &B,
647 SmallVector<Register, 2> &Regs,
648 LLT HalfTy,
649 Register Reg) const {
650 assert(HalfTy.getSizeInBits() == 32);
651 MachineRegisterInfo *MRI = B.getMRI();
652 Register LoLHS = MRI->createGenericVirtualRegister(HalfTy);
653 Register HiLHS = MRI->createGenericVirtualRegister(HalfTy);
654 const RegisterBank *Bank = getRegBank(Reg, *MRI, *TRI);
655 MRI->setRegBank(LoLHS, *Bank);
656 MRI->setRegBank(HiLHS, *Bank);
657
658 Regs.push_back(LoLHS);
659 Regs.push_back(HiLHS);
660
661 B.buildInstr(AMDGPU::G_UNMERGE_VALUES)
662 .addDef(LoLHS)
663 .addDef(HiLHS)
664 .addUse(Reg);
665 }
666
667 /// Replace the current type each register in \p Regs has with \p NewTy
setRegsToType(MachineRegisterInfo & MRI,ArrayRef<Register> Regs,LLT NewTy)668 static void setRegsToType(MachineRegisterInfo &MRI, ArrayRef<Register> Regs,
669 LLT NewTy) {
670 for (Register Reg : Regs) {
671 assert(MRI.getType(Reg).getSizeInBits() == NewTy.getSizeInBits());
672 MRI.setType(Reg, NewTy);
673 }
674 }
675
getHalfSizedType(LLT Ty)676 static LLT getHalfSizedType(LLT Ty) {
677 if (Ty.isVector()) {
678 assert(Ty.getElementCount().isKnownMultipleOf(2));
679 return LLT::scalarOrVector(Ty.getElementCount().divideCoefficientBy(2),
680 Ty.getElementType());
681 }
682
683 assert(Ty.getScalarSizeInBits() % 2 == 0);
684 return LLT::scalar(Ty.getScalarSizeInBits() / 2);
685 }
686
687 /// Legalize instruction \p MI where operands in \p OpIndices must be SGPRs. If
688 /// any of the required SGPR operands are VGPRs, perform a waterfall loop to
689 /// execute the instruction for each unique combination of values in all lanes
690 /// in the wave. The block will be split such that rest of the instructions are
691 /// moved to a new block.
692 ///
693 /// Essentially performs this loop:
694 //
695 /// Save Execution Mask
696 /// For (Lane : Wavefront) {
697 /// Enable Lane, Disable all other lanes
698 /// SGPR = read SGPR value for current lane from VGPR
699 /// VGPRResult[Lane] = use_op SGPR
700 /// }
701 /// Restore Execution Mask
702 ///
703 /// There is additional complexity to try for compare values to identify the
704 /// unique values used.
executeInWaterfallLoop(MachineIRBuilder & B,iterator_range<MachineBasicBlock::iterator> Range,SmallSet<Register,4> & SGPROperandRegs,MachineRegisterInfo & MRI) const705 bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
706 MachineIRBuilder &B,
707 iterator_range<MachineBasicBlock::iterator> Range,
708 SmallSet<Register, 4> &SGPROperandRegs,
709 MachineRegisterInfo &MRI) const {
710 SmallVector<Register, 4> ResultRegs;
711 SmallVector<Register, 4> InitResultRegs;
712 SmallVector<Register, 4> PhiRegs;
713
714 // Track use registers which have already been expanded with a readfirstlane
715 // sequence. This may have multiple uses if moving a sequence.
716 DenseMap<Register, Register> WaterfalledRegMap;
717
718 MachineBasicBlock &MBB = B.getMBB();
719 MachineFunction *MF = &B.getMF();
720
721 const TargetRegisterClass *WaveRC = TRI->getWaveMaskRegClass();
722 const unsigned WaveAndOpc = Subtarget.isWave32() ?
723 AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
724 const unsigned MovTermOpc = Subtarget.isWave32() ?
725 AMDGPU::S_MOV_B32_term : AMDGPU::S_MOV_B64_term;
726 const unsigned XorTermOpc = Subtarget.isWave32() ?
727 AMDGPU::S_XOR_B32_term : AMDGPU::S_XOR_B64_term;
728 const unsigned AndSaveExecOpc = Subtarget.isWave32() ?
729 AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64;
730 const unsigned ExecReg = Subtarget.isWave32() ?
731 AMDGPU::EXEC_LO : AMDGPU::EXEC;
732
733 #ifndef NDEBUG
734 const int OrigRangeSize = std::distance(Range.begin(), Range.end());
735 #endif
736
737 for (MachineInstr &MI : Range) {
738 for (MachineOperand &Def : MI.defs()) {
739 if (MRI.use_nodbg_empty(Def.getReg()))
740 continue;
741
742 LLT ResTy = MRI.getType(Def.getReg());
743 const RegisterBank *DefBank = getRegBank(Def.getReg(), MRI, *TRI);
744 ResultRegs.push_back(Def.getReg());
745 Register InitReg = B.buildUndef(ResTy).getReg(0);
746 Register PhiReg = MRI.createGenericVirtualRegister(ResTy);
747 InitResultRegs.push_back(InitReg);
748 PhiRegs.push_back(PhiReg);
749 MRI.setRegBank(PhiReg, *DefBank);
750 MRI.setRegBank(InitReg, *DefBank);
751 }
752 }
753
754 Register SaveExecReg = MRI.createVirtualRegister(WaveRC);
755 Register InitSaveExecReg = MRI.createVirtualRegister(WaveRC);
756
757 // Don't bother using generic instructions/registers for the exec mask.
758 B.buildInstr(TargetOpcode::IMPLICIT_DEF)
759 .addDef(InitSaveExecReg);
760
761 Register PhiExec = MRI.createVirtualRegister(WaveRC);
762 Register NewExec = MRI.createVirtualRegister(WaveRC);
763
764 // To insert the loop we need to split the block. Move everything before this
765 // point to a new block, and insert a new empty block before this instruction.
766 MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
767 MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
768 MachineBasicBlock *RestoreExecBB = MF->CreateMachineBasicBlock();
769 MachineFunction::iterator MBBI(MBB);
770 ++MBBI;
771 MF->insert(MBBI, LoopBB);
772 MF->insert(MBBI, RestoreExecBB);
773 MF->insert(MBBI, RemainderBB);
774
775 LoopBB->addSuccessor(RestoreExecBB);
776 LoopBB->addSuccessor(LoopBB);
777
778 // Move the rest of the block into a new block.
779 RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
780 RemainderBB->splice(RemainderBB->begin(), &MBB, Range.end(), MBB.end());
781
782 MBB.addSuccessor(LoopBB);
783 RestoreExecBB->addSuccessor(RemainderBB);
784
785 B.setInsertPt(*LoopBB, LoopBB->end());
786
787 B.buildInstr(TargetOpcode::PHI)
788 .addDef(PhiExec)
789 .addReg(InitSaveExecReg)
790 .addMBB(&MBB)
791 .addReg(NewExec)
792 .addMBB(LoopBB);
793
794 for (auto Result : zip(InitResultRegs, ResultRegs, PhiRegs)) {
795 B.buildInstr(TargetOpcode::G_PHI)
796 .addDef(std::get<2>(Result))
797 .addReg(std::get<0>(Result)) // Initial value / implicit_def
798 .addMBB(&MBB)
799 .addReg(std::get<1>(Result)) // Mid-loop value.
800 .addMBB(LoopBB);
801 }
802
803 const DebugLoc &DL = B.getDL();
804
805 MachineInstr &FirstInst = *Range.begin();
806
807 // Move the instruction into the loop. Note we moved everything after
808 // Range.end() already into a new block, so Range.end() is no longer valid.
809 LoopBB->splice(LoopBB->end(), &MBB, Range.begin(), MBB.end());
810
811 // Figure out the iterator range after splicing the instructions.
812 MachineBasicBlock::iterator NewBegin = FirstInst.getIterator();
813 auto NewEnd = LoopBB->end();
814
815 MachineBasicBlock::iterator I = Range.begin();
816 B.setInsertPt(*LoopBB, I);
817
818 Register CondReg;
819
820 assert(std::distance(NewBegin, NewEnd) == OrigRangeSize);
821
822 for (MachineInstr &MI : make_range(NewBegin, NewEnd)) {
823 for (MachineOperand &Op : MI.uses()) {
824 if (!Op.isReg() || Op.isDef())
825 continue;
826
827 Register OldReg = Op.getReg();
828 if (!SGPROperandRegs.count(OldReg))
829 continue;
830
831 // See if we already processed this register in another instruction in the
832 // sequence.
833 auto OldVal = WaterfalledRegMap.find(OldReg);
834 if (OldVal != WaterfalledRegMap.end()) {
835 Op.setReg(OldVal->second);
836 continue;
837 }
838
839 Register OpReg = Op.getReg();
840 LLT OpTy = MRI.getType(OpReg);
841
842 const RegisterBank *OpBank = getRegBank(OpReg, MRI, *TRI);
843 if (OpBank != &AMDGPU::VGPRRegBank) {
844 // Insert copy from AGPR to VGPR before the loop.
845 B.setMBB(MBB);
846 OpReg = B.buildCopy(OpTy, OpReg).getReg(0);
847 MRI.setRegBank(OpReg, AMDGPU::VGPRRegBank);
848 B.setInstr(*I);
849 }
850
851 unsigned OpSize = OpTy.getSizeInBits();
852
853 // Can only do a readlane of 32-bit pieces.
854 if (OpSize == 32) {
855 // Avoid extra copies in the simple case of one 32-bit register.
856 Register CurrentLaneOpReg
857 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
858 MRI.setType(CurrentLaneOpReg, OpTy);
859
860 constrainGenericRegister(OpReg, AMDGPU::VGPR_32RegClass, MRI);
861 // Read the next variant <- also loop target.
862 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
863 CurrentLaneOpReg)
864 .addReg(OpReg);
865
866 Register NewCondReg = MRI.createVirtualRegister(WaveRC);
867 bool First = CondReg == AMDGPU::NoRegister;
868 if (First)
869 CondReg = NewCondReg;
870
871 // Compare the just read M0 value to all possible Idx values.
872 B.buildInstr(AMDGPU::V_CMP_EQ_U32_e64)
873 .addDef(NewCondReg)
874 .addReg(CurrentLaneOpReg)
875 .addReg(OpReg);
876 Op.setReg(CurrentLaneOpReg);
877
878 if (!First) {
879 Register AndReg = MRI.createVirtualRegister(WaveRC);
880
881 // If there are multiple operands to consider, and the conditions.
882 B.buildInstr(WaveAndOpc)
883 .addDef(AndReg)
884 .addReg(NewCondReg)
885 .addReg(CondReg);
886 CondReg = AndReg;
887 }
888 } else {
889 LLT S32 = LLT::scalar(32);
890 SmallVector<Register, 8> ReadlanePieces;
891
892 // The compares can be done as 64-bit, but the extract needs to be done
893 // in 32-bit pieces.
894
895 bool Is64 = OpSize % 64 == 0;
896
897 LLT UnmergeTy = OpSize % 64 == 0 ? LLT::scalar(64) : LLT::scalar(32);
898 unsigned CmpOp = OpSize % 64 == 0 ? AMDGPU::V_CMP_EQ_U64_e64
899 : AMDGPU::V_CMP_EQ_U32_e64;
900
901 // The compares can be done as 64-bit, but the extract needs to be done
902 // in 32-bit pieces.
903
904 // Insert the unmerge before the loop.
905
906 B.setMBB(MBB);
907 auto Unmerge = B.buildUnmerge(UnmergeTy, OpReg);
908 B.setInstr(*I);
909
910 unsigned NumPieces = Unmerge->getNumOperands() - 1;
911 for (unsigned PieceIdx = 0; PieceIdx != NumPieces; ++PieceIdx) {
912 Register UnmergePiece = Unmerge.getReg(PieceIdx);
913
914 Register CurrentLaneOpReg;
915 if (Is64) {
916 Register CurrentLaneOpRegLo = MRI.createGenericVirtualRegister(S32);
917 Register CurrentLaneOpRegHi = MRI.createGenericVirtualRegister(S32);
918
919 MRI.setRegClass(UnmergePiece, &AMDGPU::VReg_64RegClass);
920 MRI.setRegClass(CurrentLaneOpRegLo, &AMDGPU::SReg_32_XM0RegClass);
921 MRI.setRegClass(CurrentLaneOpRegHi, &AMDGPU::SReg_32_XM0RegClass);
922
923 // Read the next variant <- also loop target.
924 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
925 CurrentLaneOpRegLo)
926 .addReg(UnmergePiece, 0, AMDGPU::sub0);
927
928 // Read the next variant <- also loop target.
929 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
930 CurrentLaneOpRegHi)
931 .addReg(UnmergePiece, 0, AMDGPU::sub1);
932
933 CurrentLaneOpReg =
934 B.buildMerge(LLT::scalar(64),
935 {CurrentLaneOpRegLo, CurrentLaneOpRegHi})
936 .getReg(0);
937
938 MRI.setRegClass(CurrentLaneOpReg, &AMDGPU::SReg_64_XEXECRegClass);
939
940 if (OpTy.getScalarSizeInBits() == 64) {
941 // If we need to produce a 64-bit element vector, so use the
942 // merged pieces
943 ReadlanePieces.push_back(CurrentLaneOpReg);
944 } else {
945 // 32-bit element type.
946 ReadlanePieces.push_back(CurrentLaneOpRegLo);
947 ReadlanePieces.push_back(CurrentLaneOpRegHi);
948 }
949 } else {
950 CurrentLaneOpReg = MRI.createGenericVirtualRegister(S32);
951 MRI.setRegClass(UnmergePiece, &AMDGPU::VGPR_32RegClass);
952 MRI.setRegClass(CurrentLaneOpReg, &AMDGPU::SReg_32_XM0RegClass);
953
954 // Read the next variant <- also loop target.
955 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
956 CurrentLaneOpReg)
957 .addReg(UnmergePiece);
958 ReadlanePieces.push_back(CurrentLaneOpReg);
959 }
960
961 Register NewCondReg = MRI.createVirtualRegister(WaveRC);
962 bool First = CondReg == AMDGPU::NoRegister;
963 if (First)
964 CondReg = NewCondReg;
965
966 B.buildInstr(CmpOp)
967 .addDef(NewCondReg)
968 .addReg(CurrentLaneOpReg)
969 .addReg(UnmergePiece);
970
971 if (!First) {
972 Register AndReg = MRI.createVirtualRegister(WaveRC);
973
974 // If there are multiple operands to consider, and the conditions.
975 B.buildInstr(WaveAndOpc)
976 .addDef(AndReg)
977 .addReg(NewCondReg)
978 .addReg(CondReg);
979 CondReg = AndReg;
980 }
981 }
982
983 // FIXME: Build merge seems to switch to CONCAT_VECTORS but not
984 // BUILD_VECTOR
985 if (OpTy.isVector()) {
986 auto Merge = B.buildBuildVector(OpTy, ReadlanePieces);
987 Op.setReg(Merge.getReg(0));
988 } else {
989 auto Merge = B.buildMerge(OpTy, ReadlanePieces);
990 Op.setReg(Merge.getReg(0));
991 }
992
993 MRI.setRegBank(Op.getReg(), AMDGPU::SGPRRegBank);
994 }
995
996 // Make sure we don't re-process this register again.
997 WaterfalledRegMap.insert(std::make_pair(OldReg, Op.getReg()));
998 }
999 }
1000
1001 B.setInsertPt(*LoopBB, LoopBB->end());
1002
1003 // Update EXEC, save the original EXEC value to VCC.
1004 B.buildInstr(AndSaveExecOpc)
1005 .addDef(NewExec)
1006 .addReg(CondReg, RegState::Kill);
1007
1008 MRI.setSimpleHint(NewExec, CondReg);
1009
1010 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
1011 B.buildInstr(XorTermOpc)
1012 .addDef(ExecReg)
1013 .addReg(ExecReg)
1014 .addReg(NewExec);
1015
1016 // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
1017 // s_cbranch_scc0?
1018
1019 // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
1020 B.buildInstr(AMDGPU::S_CBRANCH_EXECNZ)
1021 .addMBB(LoopBB);
1022
1023 // Save the EXEC mask before the loop.
1024 BuildMI(MBB, MBB.end(), DL, TII->get(MovTermOpc), SaveExecReg)
1025 .addReg(ExecReg);
1026
1027 // Restore the EXEC mask after the loop.
1028 B.setMBB(*RestoreExecBB);
1029 B.buildInstr(MovTermOpc)
1030 .addDef(ExecReg)
1031 .addReg(SaveExecReg);
1032
1033 // Set the insert point after the original instruction, so any new
1034 // instructions will be in the remainder.
1035 B.setInsertPt(*RemainderBB, RemainderBB->begin());
1036
1037 return true;
1038 }
1039
1040 // Return any unique registers used by \p MI at \p OpIndices that need to be
1041 // handled in a waterfall loop. Returns these registers in \p
1042 // SGPROperandRegs. Returns true if there are any operands to handle and a
1043 // waterfall loop is necessary.
collectWaterfallOperands(SmallSet<Register,4> & SGPROperandRegs,MachineInstr & MI,MachineRegisterInfo & MRI,ArrayRef<unsigned> OpIndices) const1044 bool AMDGPURegisterBankInfo::collectWaterfallOperands(
1045 SmallSet<Register, 4> &SGPROperandRegs, MachineInstr &MI,
1046 MachineRegisterInfo &MRI, ArrayRef<unsigned> OpIndices) const {
1047 for (unsigned Op : OpIndices) {
1048 assert(MI.getOperand(Op).isUse());
1049 Register Reg = MI.getOperand(Op).getReg();
1050 const RegisterBank *OpBank = getRegBank(Reg, MRI, *TRI);
1051 if (OpBank->getID() != AMDGPU::SGPRRegBankID)
1052 SGPROperandRegs.insert(Reg);
1053 }
1054
1055 // No operands need to be replaced, so no need to loop.
1056 return !SGPROperandRegs.empty();
1057 }
1058
executeInWaterfallLoop(MachineIRBuilder & B,MachineInstr & MI,MachineRegisterInfo & MRI,ArrayRef<unsigned> OpIndices) const1059 bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
1060 MachineIRBuilder &B, MachineInstr &MI, MachineRegisterInfo &MRI,
1061 ArrayRef<unsigned> OpIndices) const {
1062 // Use a set to avoid extra readfirstlanes in the case where multiple operands
1063 // are the same register.
1064 SmallSet<Register, 4> SGPROperandRegs;
1065
1066 if (!collectWaterfallOperands(SGPROperandRegs, MI, MRI, OpIndices))
1067 return false;
1068
1069 MachineBasicBlock::iterator I = MI.getIterator();
1070 return executeInWaterfallLoop(B, make_range(I, std::next(I)),
1071 SGPROperandRegs, MRI);
1072 }
1073
executeInWaterfallLoop(MachineInstr & MI,MachineRegisterInfo & MRI,ArrayRef<unsigned> OpIndices) const1074 bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
1075 MachineInstr &MI, MachineRegisterInfo &MRI,
1076 ArrayRef<unsigned> OpIndices) const {
1077 MachineIRBuilder B(MI);
1078 return executeInWaterfallLoop(B, MI, MRI, OpIndices);
1079 }
1080
1081 // Legalize an operand that must be an SGPR by inserting a readfirstlane.
constrainOpWithReadfirstlane(MachineInstr & MI,MachineRegisterInfo & MRI,unsigned OpIdx) const1082 void AMDGPURegisterBankInfo::constrainOpWithReadfirstlane(
1083 MachineInstr &MI, MachineRegisterInfo &MRI, unsigned OpIdx) const {
1084 Register Reg = MI.getOperand(OpIdx).getReg();
1085 const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
1086 if (Bank == &AMDGPU::SGPRRegBank)
1087 return;
1088
1089 LLT Ty = MRI.getType(Reg);
1090 MachineIRBuilder B(MI);
1091
1092 if (Bank != &AMDGPU::VGPRRegBank) {
1093 // We need to copy from AGPR to VGPR
1094 Reg = B.buildCopy(Ty, Reg).getReg(0);
1095 MRI.setRegBank(Reg, AMDGPU::VGPRRegBank);
1096 }
1097
1098 Register SGPR = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1099 B.buildInstr(AMDGPU::V_READFIRSTLANE_B32)
1100 .addDef(SGPR)
1101 .addReg(Reg);
1102
1103 MRI.setType(SGPR, Ty);
1104
1105 const TargetRegisterClass *Constrained =
1106 constrainGenericRegister(Reg, AMDGPU::VGPR_32RegClass, MRI);
1107 (void)Constrained;
1108 assert(Constrained && "Failed to constrain readfirstlane src reg");
1109
1110 MI.getOperand(OpIdx).setReg(SGPR);
1111 }
1112
1113 /// Split \p Ty into 2 pieces. The first will have \p FirstSize bits, and the
1114 /// rest will be in the remainder.
splitUnequalType(LLT Ty,unsigned FirstSize)1115 static std::pair<LLT, LLT> splitUnequalType(LLT Ty, unsigned FirstSize) {
1116 unsigned TotalSize = Ty.getSizeInBits();
1117 if (!Ty.isVector())
1118 return {LLT::scalar(FirstSize), LLT::scalar(TotalSize - FirstSize)};
1119
1120 LLT EltTy = Ty.getElementType();
1121 unsigned EltSize = EltTy.getSizeInBits();
1122 assert(FirstSize % EltSize == 0);
1123
1124 unsigned FirstPartNumElts = FirstSize / EltSize;
1125 unsigned RemainderElts = (TotalSize - FirstSize) / EltSize;
1126
1127 return {LLT::scalarOrVector(ElementCount::getFixed(FirstPartNumElts), EltTy),
1128 LLT::scalarOrVector(ElementCount::getFixed(RemainderElts), EltTy)};
1129 }
1130
widen96To128(LLT Ty)1131 static LLT widen96To128(LLT Ty) {
1132 if (!Ty.isVector())
1133 return LLT::scalar(128);
1134
1135 LLT EltTy = Ty.getElementType();
1136 assert(128 % EltTy.getSizeInBits() == 0);
1137 return LLT::fixed_vector(128 / EltTy.getSizeInBits(), EltTy);
1138 }
1139
// Apply the selected register bank mapping to a load (regular or extending).
// Returns true if the instruction was rewritten here; false when the default
// mapping (plus inserted copies) is sufficient and nothing was changed.
bool AMDGPURegisterBankInfo::applyMappingLoad(MachineInstr &MI,
                        const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
                        MachineRegisterInfo &MRI) const {
  Register DstReg = MI.getOperand(0).getReg();
  const LLT LoadTy = MRI.getType(DstReg);
  unsigned LoadSize = LoadTy.getSizeInBits();
  // VGPR-mapped loads wider than this are split into multiple loads below.
  const unsigned MaxNonSmrdLoadSize = 128;

  const RegisterBank *DstBank =
    OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
  if (DstBank == &AMDGPU::SGPRRegBank) {
    // There are some special cases that we need to look at for 32 bit and 96
    // bit SGPR loads otherwise we have nothing to do.
    if (LoadSize != 32 && LoadSize != 96)
      return false;

    MachineMemOperand *MMO = *MI.memoperands_begin();
    const unsigned MemSize = 8 * MMO->getSize();
    // Scalar loads of size 8 or 16 bit with proper alignment may be widened to
    // 32 bit. Check to see if we need to widen the memory access, 8 or 16 bit
    // scalar loads should have a load size of 32 but memory access size of less
    // than 32.
    if (LoadSize == 32 &&
        (MemSize == 32 || LoadTy.isVector() || !isScalarLoadLegal(MI)))
      return false;

    Register PtrReg = MI.getOperand(1).getReg();

    // Build the replacement under an observer so new registers are assigned
    // to the SGPR bank.
    ApplyRegBankMapping O(*this, MRI, &AMDGPU::SGPRRegBank);
    MachineIRBuilder B(MI, O);

    if (LoadSize == 32) {
      // This is an extending load from a sub-dword size. Widen the memory
      // access size to 4 bytes and clear the extra high bits appropriately
      const LLT S32 = LLT::scalar(32);
      if (MI.getOpcode() == AMDGPU::G_SEXTLOAD) {
        // Must extend the sign bit into higher bits for a G_SEXTLOAD
        auto WideLoad = B.buildLoadFromOffset(S32, PtrReg, *MMO, 0);
        B.buildSExtInReg(MI.getOperand(0), WideLoad, MemSize);
      } else if (MI.getOpcode() == AMDGPU::G_ZEXTLOAD) {
        // Must extend zero into higher bits with an AND for a G_ZEXTLOAD
        auto WideLoad = B.buildLoadFromOffset(S32, PtrReg, *MMO, 0);
        B.buildZExtInReg(MI.getOperand(0), WideLoad, MemSize);
      } else
        // We do not need to touch the higher bits for regular loads.
        B.buildLoadFromOffset(MI.getOperand(0), PtrReg, *MMO, 0);
    } else {
      // 96-bit loads are only available for vector loads. We need to split this
      // into a 64-bit part, and 32 (unless we can widen to a 128-bit load).
      if (MMO->getAlign() < Align(16)) {
        // Under-aligned: load as a 64-bit piece plus a 32-bit piece at byte
        // offset 8, then reassemble into the original 96-bit value.
        LLT Part64, Part32;
        std::tie(Part64, Part32) = splitUnequalType(LoadTy, 64);
        auto Load0 = B.buildLoadFromOffset(Part64, PtrReg, *MMO, 0);
        auto Load1 = B.buildLoadFromOffset(Part32, PtrReg, *MMO, 8);

        auto Undef = B.buildUndef(LoadTy);
        auto Ins0 = B.buildInsert(LoadTy, Undef, Load0, 0);
        B.buildInsert(MI.getOperand(0), Ins0, Load1, 64);
      } else {
        // Sufficiently aligned: do one 128-bit load and extract the low
        // 96 bits.
        LLT WiderTy = widen96To128(LoadTy);
        auto WideLoad = B.buildLoadFromOffset(WiderTy, PtrReg, *MMO, 0);
        B.buildExtract(MI.getOperand(0), WideLoad, 0);
      }
    }

    // The original load was fully replaced above.
    MI.eraseFromParent();
    return true;
  }

  // 128-bit loads are supported for all instruction types.
  if (LoadSize <= MaxNonSmrdLoadSize)
    return false;

  SmallVector<Register, 16> DefRegs(OpdMapper.getVRegs(0));
  SmallVector<Register, 1> SrcRegs(OpdMapper.getVRegs(1));

  if (SrcRegs.empty())
    SrcRegs.push_back(MI.getOperand(1).getReg());

  assert(LoadSize % MaxNonSmrdLoadSize == 0);

  // RegBankSelect only emits scalar types, so we need to reset the pointer
  // operand to a pointer type.
  Register BasePtrReg = SrcRegs[0];
  LLT PtrTy = MRI.getType(MI.getOperand(1).getReg());
  MRI.setType(BasePtrReg, PtrTy);

  // Split the too-wide load into 128-bit pieces using the legalizer helpers,
  // with new registers assigned to the VGPR bank by the observer.
  unsigned NumSplitParts = LoadTy.getSizeInBits() / MaxNonSmrdLoadSize;
  const LLT LoadSplitTy = LoadTy.divide(NumSplitParts);
  ApplyRegBankMapping Observer(*this, MRI, &AMDGPU::VGPRRegBank);
  MachineIRBuilder B(MI, Observer);
  LegalizerHelper Helper(B.getMF(), Observer, B);

  if (LoadTy.isVector()) {
    if (Helper.fewerElementsVector(MI, 0, LoadSplitTy) != LegalizerHelper::Legalized)
      return false;
  } else {
    if (Helper.narrowScalar(MI, 0, LoadSplitTy) != LegalizerHelper::Legalized)
      return false;
  }

  MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
  return true;
}
1244
// Lower a dynamic stack allocation for the chosen register bank mapping.
// Returns true if the instruction was expanded and erased; false to fall back
// to the default handling.
bool AMDGPURegisterBankInfo::applyMappingDynStackAlloc(
  MachineInstr &MI,
  const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
  MachineRegisterInfo &MRI) const {
  const MachineFunction &MF = *MI.getMF();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const auto &TFI = *ST.getFrameLowering();

  // Guard in case the stack growth direction ever changes with scratch
  // instructions.
  if (TFI.getStackGrowthDirection() == TargetFrameLowering::StackGrowsDown)
    return false;

  Register Dst = MI.getOperand(0).getReg();
  Register AllocSize = MI.getOperand(1).getReg();
  // Operand 2 is the requested alignment as an immediate (0 means "assume
  // default"; assumeAligned handles that).
  Align Alignment = assumeAligned(MI.getOperand(2).getImm());

  const RegisterBank *SizeBank = getRegBank(AllocSize, MRI, *TRI);

  // TODO: Need to emit a wave reduction to get the maximum size.
  if (SizeBank != &AMDGPU::SGPRRegBank)
    return false;

  LLT PtrTy = MRI.getType(Dst);
  LLT IntPtrTy = LLT::scalar(PtrTy.getSizeInBits());

  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  Register SPReg = Info->getStackPtrOffsetReg();
  // Build under an observer so new registers are assigned to the SGPR bank.
  ApplyRegBankMapping ApplyBank(*this, MRI, &AMDGPU::SGPRRegBank);
  MachineIRBuilder B(MI, ApplyBank);

  // Scale the per-lane size by the wave size (shift by log2 of the wavefront
  // size) to get the SP increment.
  auto WaveSize = B.buildConstant(LLT::scalar(32), ST.getWavefrontSizeLog2());
  auto ScaledSize = B.buildShl(IntPtrTy, AllocSize, WaveSize);

  auto SPCopy = B.buildCopy(PtrTy, SPReg);
  if (Alignment > TFI.getStackAlign()) {
    // Over-aligned allocation: bump SP, then mask off the low pointer bits.
    // The mask width is also wave-scaled to match the scaled offset.
    auto PtrAdd = B.buildPtrAdd(PtrTy, SPCopy, ScaledSize);
    B.buildMaskLowPtrBits(Dst, PtrAdd,
                          Log2(Alignment) + ST.getWavefrontSizeLog2());
  } else {
    B.buildPtrAdd(Dst, SPCopy, ScaledSize);
  }

  MI.eraseFromParent();
  return true;
}
1291
applyMappingImage(MachineInstr & MI,const AMDGPURegisterBankInfo::OperandsMapper & OpdMapper,MachineRegisterInfo & MRI,int RsrcIdx) const1292 bool AMDGPURegisterBankInfo::applyMappingImage(
1293 MachineInstr &MI, const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
1294 MachineRegisterInfo &MRI, int RsrcIdx) const {
1295 const int NumDefs = MI.getNumExplicitDefs();
1296
1297 // The reported argument index is relative to the IR intrinsic call arguments,
1298 // so we need to shift by the number of defs and the intrinsic ID.
1299 RsrcIdx += NumDefs + 1;
1300
1301 // Insert copies to VGPR arguments.
1302 applyDefaultMapping(OpdMapper);
1303
1304 // Fixup any SGPR arguments.
1305 SmallVector<unsigned, 4> SGPRIndexes;
1306 for (int I = NumDefs, NumOps = MI.getNumOperands(); I != NumOps; ++I) {
1307 if (!MI.getOperand(I).isReg())
1308 continue;
1309
1310 // If this intrinsic has a sampler, it immediately follows rsrc.
1311 if (I == RsrcIdx || I == RsrcIdx + 1)
1312 SGPRIndexes.push_back(I);
1313 }
1314
1315 executeInWaterfallLoop(MI, MRI, SGPRIndexes);
1316 return true;
1317 }
1318
getSrcRegIgnoringCopies(const MachineRegisterInfo & MRI,Register Reg)1319 static Register getSrcRegIgnoringCopies(const MachineRegisterInfo &MRI,
1320 Register Reg) {
1321 MachineInstr *Def = getDefIgnoringCopies(Reg, MRI);
1322 if (!Def)
1323 return Reg;
1324
1325 // TODO: Guard against this being an implicit def
1326 return Def->getOperand(0).getReg();
1327 }
1328
// Analyze a combined offset from an llvm.amdgcn.s.buffer intrinsic and store
// the three offsets (voffset, soffset and instoffset).
//
// Returns the constant byte offset (if any) the caller should fold into the
// memory operand; 0 otherwise. Note \p InstOffsetVal is only written on the
// paths that compute one — the caller pre-initializes it to 0.
static unsigned setBufferOffsets(MachineIRBuilder &B,
                                 const AMDGPURegisterBankInfo &RBI,
                                 Register CombinedOffset, Register &VOffsetReg,
                                 Register &SOffsetReg, int64_t &InstOffsetVal,
                                 Align Alignment) {
  const LLT S32 = LLT::scalar(32);
  MachineRegisterInfo *MRI = B.getMRI();

  // Case 1: the entire offset is a known constant. Split it into an soffset
  // constant and an encodable immediate if the subtarget allows it.
  if (Optional<int64_t> Imm = getConstantVRegSExtVal(CombinedOffset, *MRI)) {
    uint32_t SOffset, ImmOffset;
    if (AMDGPU::splitMUBUFOffset(*Imm, SOffset, ImmOffset, &RBI.Subtarget,
                                 Alignment)) {
      VOffsetReg = B.buildConstant(S32, 0).getReg(0);
      SOffsetReg = B.buildConstant(S32, SOffset).getReg(0);
      InstOffsetVal = ImmOffset;

      B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank);
      B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank);
      return SOffset + ImmOffset;
    }
  }

  // Case 2: base register plus a constant offset.
  Register Base;
  unsigned Offset;

  std::tie(Base, Offset) =
      AMDGPU::getBaseWithConstantOffset(*MRI, CombinedOffset);

  uint32_t SOffset, ImmOffset;
  if ((int)Offset > 0 && AMDGPU::splitMUBUFOffset(Offset, SOffset, ImmOffset,
                                                  &RBI.Subtarget, Alignment)) {
    // A VGPR base can be used directly as the voffset.
    if (RBI.getRegBank(Base, *MRI, *RBI.TRI) == &AMDGPU::VGPRRegBank) {
      VOffsetReg = Base;
      SOffsetReg = B.buildConstant(S32, SOffset).getReg(0);
      B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank);
      InstOffsetVal = ImmOffset;
      return 0; // XXX - Why is this 0?
    }

    // If we have SGPR base, we can use it for soffset.
    if (SOffset == 0) {
      VOffsetReg = B.buildConstant(S32, 0).getReg(0);
      B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank);
      SOffsetReg = Base;
      InstOffsetVal = ImmOffset;
      return 0; // XXX - Why is this 0?
    }
  }

  // Case 3: the offset is (reg + reg) with one SGPR and one VGPR addend;
  // route each addend to the matching offset operand.
  // Handle the variable sgpr + vgpr case.
  MachineInstr *Add = getOpcodeDef(AMDGPU::G_ADD, CombinedOffset, *MRI);
  if (Add && (int)Offset >= 0) {
    Register Src0 = getSrcRegIgnoringCopies(*MRI, Add->getOperand(1).getReg());
    Register Src1 = getSrcRegIgnoringCopies(*MRI, Add->getOperand(2).getReg());

    const RegisterBank *Src0Bank = RBI.getRegBank(Src0, *MRI, *RBI.TRI);
    const RegisterBank *Src1Bank = RBI.getRegBank(Src1, *MRI, *RBI.TRI);

    if (Src0Bank == &AMDGPU::VGPRRegBank && Src1Bank == &AMDGPU::SGPRRegBank) {
      VOffsetReg = Src0;
      SOffsetReg = Src1;
      return 0;
    }

    if (Src0Bank == &AMDGPU::SGPRRegBank && Src1Bank == &AMDGPU::VGPRRegBank) {
      VOffsetReg = Src1;
      SOffsetReg = Src0;
      return 0;
    }
  }

  // Fallback: use the whole combined offset as voffset with a zero soffset.
  // Ensure we have a VGPR for the combined offset. This could be an issue if we
  // have an SGPR offset and a VGPR resource.
  if (RBI.getRegBank(CombinedOffset, *MRI, *RBI.TRI) == &AMDGPU::VGPRRegBank) {
    VOffsetReg = CombinedOffset;
  } else {
    VOffsetReg = B.buildCopy(S32, CombinedOffset).getReg(0);
    B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank);
  }

  SOffsetReg = B.buildConstant(S32, 0).getReg(0);
  B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank);
  return 0;
}
1415
applyMappingSBufferLoad(const OperandsMapper & OpdMapper) const1416 bool AMDGPURegisterBankInfo::applyMappingSBufferLoad(
1417 const OperandsMapper &OpdMapper) const {
1418 MachineInstr &MI = OpdMapper.getMI();
1419 MachineRegisterInfo &MRI = OpdMapper.getMRI();
1420
1421 const LLT S32 = LLT::scalar(32);
1422 Register Dst = MI.getOperand(0).getReg();
1423 LLT Ty = MRI.getType(Dst);
1424
1425 const RegisterBank *RSrcBank =
1426 OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
1427 const RegisterBank *OffsetBank =
1428 OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
1429 if (RSrcBank == &AMDGPU::SGPRRegBank &&
1430 OffsetBank == &AMDGPU::SGPRRegBank)
1431 return true; // Legal mapping
1432
1433 // FIXME: 96-bit case was widened during legalize. We neeed to narrow it back
1434 // here but don't have an MMO.
1435
1436 unsigned LoadSize = Ty.getSizeInBits();
1437 int NumLoads = 1;
1438 if (LoadSize == 256 || LoadSize == 512) {
1439 NumLoads = LoadSize / 128;
1440 Ty = Ty.divide(NumLoads);
1441 }
1442
1443 // Use the alignment to ensure that the required offsets will fit into the
1444 // immediate offsets.
1445 const Align Alignment = NumLoads > 1 ? Align(16 * NumLoads) : Align(1);
1446
1447 MachineIRBuilder B(MI);
1448 MachineFunction &MF = B.getMF();
1449
1450 Register SOffset;
1451 Register VOffset;
1452 int64_t ImmOffset = 0;
1453
1454 unsigned MMOOffset = setBufferOffsets(B, *this, MI.getOperand(2).getReg(),
1455 VOffset, SOffset, ImmOffset, Alignment);
1456
1457 // TODO: 96-bit loads were widened to 128-bit results. Shrink the result if we
1458 // can, but we neeed to track an MMO for that.
1459 const unsigned MemSize = (Ty.getSizeInBits() + 7) / 8;
1460 const Align MemAlign(4); // FIXME: ABI type alignment?
1461 MachineMemOperand *BaseMMO = MF.getMachineMemOperand(
1462 MachinePointerInfo(),
1463 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1464 MachineMemOperand::MOInvariant,
1465 MemSize, MemAlign);
1466 if (MMOOffset != 0)
1467 BaseMMO = MF.getMachineMemOperand(BaseMMO, MMOOffset, MemSize);
1468
1469 // If only the offset is divergent, emit a MUBUF buffer load instead. We can
1470 // assume that the buffer is unswizzled.
1471
1472 Register RSrc = MI.getOperand(1).getReg();
1473 Register VIndex = B.buildConstant(S32, 0).getReg(0);
1474 B.getMRI()->setRegBank(VIndex, AMDGPU::VGPRRegBank);
1475
1476 SmallVector<Register, 4> LoadParts(NumLoads);
1477
1478 MachineBasicBlock::iterator MII = MI.getIterator();
1479 MachineInstrSpan Span(MII, &B.getMBB());
1480
1481 for (int i = 0; i < NumLoads; ++i) {
1482 if (NumLoads == 1) {
1483 LoadParts[i] = Dst;
1484 } else {
1485 LoadParts[i] = MRI.createGenericVirtualRegister(Ty);
1486 MRI.setRegBank(LoadParts[i], AMDGPU::VGPRRegBank);
1487 }
1488
1489 MachineMemOperand *MMO = BaseMMO;
1490 if (i != 0)
1491 BaseMMO = MF.getMachineMemOperand(BaseMMO, MMOOffset + 16 * i, MemSize);
1492
1493 B.buildInstr(AMDGPU::G_AMDGPU_BUFFER_LOAD)
1494 .addDef(LoadParts[i]) // vdata
1495 .addUse(RSrc) // rsrc
1496 .addUse(VIndex) // vindex
1497 .addUse(VOffset) // voffset
1498 .addUse(SOffset) // soffset
1499 .addImm(ImmOffset + 16 * i) // offset(imm)
1500 .addImm(0) // cachepolicy, swizzled buffer(imm)
1501 .addImm(0) // idxen(imm)
1502 .addMemOperand(MMO);
1503 }
1504
1505 // TODO: If only the resource is a VGPR, it may be better to execute the
1506 // scalar load in the waterfall loop if the resource is expected to frequently
1507 // be dynamically uniform.
1508 if (RSrcBank != &AMDGPU::SGPRRegBank) {
1509 // Remove the original instruction to avoid potentially confusing the
1510 // waterfall loop logic.
1511 B.setInstr(*Span.begin());
1512 MI.eraseFromParent();
1513
1514 SmallSet<Register, 4> OpsToWaterfall;
1515
1516 OpsToWaterfall.insert(RSrc);
1517 executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()),
1518 OpsToWaterfall, MRI);
1519 }
1520
1521 if (NumLoads != 1) {
1522 if (Ty.isVector())
1523 B.buildConcatVectors(Dst, LoadParts);
1524 else
1525 B.buildMerge(Dst, LoadParts);
1526 }
1527
1528 // We removed the instruction earlier with a waterfall loop.
1529 if (RSrcBank == &AMDGPU::SGPRRegBank)
1530 MI.eraseFromParent();
1531
1532 return true;
1533 }
1534
// Lower a bitfield extract (generic instruction or intrinsic form) for the
// selected destination bank. VGPR 64-bit extracts are expanded to shift/BFE
// sequences; SGPR extracts are selected to S_BFE with the packed
// offset/width operand. \p Signed selects the sign-extending variants.
bool AMDGPURegisterBankInfo::applyMappingBFE(const OperandsMapper &OpdMapper,
                                             bool Signed) const {
  MachineInstr &MI = OpdMapper.getMI();
  MachineRegisterInfo &MRI = OpdMapper.getMRI();

  // Insert basic copies
  applyDefaultMapping(OpdMapper);

  Register DstReg = MI.getOperand(0).getReg();
  LLT Ty = MRI.getType(DstReg);

  const LLT S32 = LLT::scalar(32);

  // The intrinsic form carries an extra intrinsic-ID operand before the
  // sources.
  unsigned FirstOpnd = MI.getOpcode() == AMDGPU::G_INTRINSIC ? 2 : 1;
  Register SrcReg = MI.getOperand(FirstOpnd).getReg();
  Register OffsetReg = MI.getOperand(FirstOpnd + 1).getReg();
  Register WidthReg = MI.getOperand(FirstOpnd + 2).getReg();

  const RegisterBank *DstBank =
    OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
  if (DstBank == &AMDGPU::VGPRRegBank) {
    // 32-bit VGPR extracts need no further work beyond the default mapping.
    if (Ty == S32)
      return true;

    // There is no 64-bit vgpr bitfield extract instructions so the operation
    // is expanded to a sequence of instructions that implement the operation.
    ApplyRegBankMapping ApplyBank(*this, MRI, &AMDGPU::VGPRRegBank);
    MachineIRBuilder B(MI, ApplyBank);

    const LLT S64 = LLT::scalar(64);
    // Shift the source operand so that extracted bits start at bit 0.
    auto ShiftOffset = Signed ? B.buildAShr(S64, SrcReg, OffsetReg)
                              : B.buildLShr(S64, SrcReg, OffsetReg);
    auto UnmergeSOffset = B.buildUnmerge({S32, S32}, ShiftOffset);

    // A 64-bit bitfield extract uses the 32-bit bitfield extract instructions
    // if the width is a constant.
    if (auto ConstWidth = getConstantVRegValWithLookThrough(WidthReg, MRI)) {
      // Use the 32-bit bitfield extract instruction if the width is a constant.
      // Depending on the width size, use either the low or high 32-bits.
      auto Zero = B.buildConstant(S32, 0);
      auto WidthImm = ConstWidth->Value.getZExtValue();
      if (WidthImm <= 32) {
        // Use bitfield extract on the lower 32-bit source, and then sign-extend
        // or clear the upper 32-bits.
        auto Extract =
            Signed ? B.buildSbfx(S32, UnmergeSOffset.getReg(0), Zero, WidthReg)
                   : B.buildUbfx(S32, UnmergeSOffset.getReg(0), Zero, WidthReg);
        auto Extend =
            Signed ? B.buildAShr(S32, Extract, B.buildConstant(S32, 31)) : Zero;
        B.buildMerge(DstReg, {Extract, Extend});
      } else {
        // Use bitfield extract on upper 32-bit source, and combine with lower
        // 32-bit source.
        auto UpperWidth = B.buildConstant(S32, WidthImm - 32);
        auto Extract =
            Signed
                ? B.buildSbfx(S32, UnmergeSOffset.getReg(1), Zero, UpperWidth)
                : B.buildUbfx(S32, UnmergeSOffset.getReg(1), Zero, UpperWidth);
        B.buildMerge(DstReg, {UnmergeSOffset.getReg(0), Extract});
      }
      MI.eraseFromParent();
      return true;
    }

    // Expand to Src >> Offset << (64 - Width) >> (64 - Width) using 64-bit
    // operations.
    auto ExtShift = B.buildSub(S32, B.buildConstant(S32, 64), WidthReg);
    auto SignBit = B.buildShl(S64, ShiftOffset, ExtShift);
    if (Signed)
      B.buildAShr(S64, SignBit, ExtShift);
    else
      B.buildLShr(S64, SignBit, ExtShift);
    MI.eraseFromParent();
    return true;
  }

  // The scalar form packs the offset and width in a single operand.

  ApplyRegBankMapping ApplyBank(*this, MRI, &AMDGPU::SGPRRegBank);
  MachineIRBuilder B(MI, ApplyBank);

  // Ensure the high bits are clear to insert the offset.
  auto OffsetMask = B.buildConstant(S32, maskTrailingOnes<unsigned>(6));
  auto ClampOffset = B.buildAnd(S32, OffsetReg, OffsetMask);

  // Zeros out the low bits, so don't bother clamping the input value.
  auto ShiftWidth = B.buildShl(S32, WidthReg, B.buildConstant(S32, 16));

  // Transformation function, pack the offset and width of a BFE into
  // the format expected by the S_BFE_I32 / S_BFE_U32. In the second
  // source, bits [5:0] contain the offset and bits [22:16] the width.
  auto MergedInputs = B.buildOr(S32, ClampOffset, ShiftWidth);

  // TODO: It might be worth using a pseudo here to avoid scc clobber and
  // register class constraints.
  unsigned Opc = Ty == S32 ? (Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32) :
                             (Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64);

  auto MIB = B.buildInstr(Opc, {DstReg}, {SrcReg, MergedInputs});
  if (!constrainSelectedInstRegOperands(*MIB, *TII, *TRI, *this))
    llvm_unreachable("failed to constrain BFE");

  MI.eraseFromParent();
  return true;
}
1641
1642 // Return a suitable opcode for extending the operands of Opc when widening.
getExtendOp(unsigned Opc)1643 static unsigned getExtendOp(unsigned Opc) {
1644 switch (Opc) {
1645 case TargetOpcode::G_ASHR:
1646 case TargetOpcode::G_SMIN:
1647 case TargetOpcode::G_SMAX:
1648 return TargetOpcode::G_SEXT;
1649 case TargetOpcode::G_LSHR:
1650 case TargetOpcode::G_UMIN:
1651 case TargetOpcode::G_UMAX:
1652 return TargetOpcode::G_ZEXT;
1653 default:
1654 return TargetOpcode::G_ANYEXT;
1655 }
1656 }
1657
1658 // Emit a legalized extension from <2 x s16> to 2 32-bit components, avoiding
1659 // any illegal vector extend or unmerge operations.
1660 static std::pair<Register, Register>
unpackV2S16ToS32(MachineIRBuilder & B,Register Src,unsigned ExtOpcode)1661 unpackV2S16ToS32(MachineIRBuilder &B, Register Src, unsigned ExtOpcode) {
1662 const LLT S32 = LLT::scalar(32);
1663 auto Bitcast = B.buildBitcast(S32, Src);
1664
1665 if (ExtOpcode == TargetOpcode::G_SEXT) {
1666 auto ExtLo = B.buildSExtInReg(S32, Bitcast, 16);
1667 auto ShiftHi = B.buildAShr(S32, Bitcast, B.buildConstant(S32, 16));
1668 return std::make_pair(ExtLo.getReg(0), ShiftHi.getReg(0));
1669 }
1670
1671 auto ShiftHi = B.buildLShr(S32, Bitcast, B.buildConstant(S32, 16));
1672 if (ExtOpcode == TargetOpcode::G_ZEXT) {
1673 auto ExtLo = B.buildAnd(S32, Bitcast, B.buildConstant(S32, 0xffff));
1674 return std::make_pair(ExtLo.getReg(0), ShiftHi.getReg(0));
1675 }
1676
1677 assert(ExtOpcode == TargetOpcode::G_ANYEXT);
1678 return std::make_pair(Bitcast.getReg(0), ShiftHi.getReg(0));
1679 }
1680
1681 // For cases where only a single copy is inserted for matching register banks.
1682 // Replace the register in the instruction operand
substituteSimpleCopyRegs(const AMDGPURegisterBankInfo::OperandsMapper & OpdMapper,unsigned OpIdx)1683 static bool substituteSimpleCopyRegs(
1684 const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper, unsigned OpIdx) {
1685 SmallVector<unsigned, 1> SrcReg(OpdMapper.getVRegs(OpIdx));
1686 if (!SrcReg.empty()) {
1687 assert(SrcReg.size() == 1);
1688 OpdMapper.getMI().getOperand(OpIdx).setReg(SrcReg[0]);
1689 return true;
1690 }
1691
1692 return false;
1693 }
1694
1695 /// Handle register layout difference for f16 images for some subtargets.
handleD16VData(MachineIRBuilder & B,MachineRegisterInfo & MRI,Register Reg) const1696 Register AMDGPURegisterBankInfo::handleD16VData(MachineIRBuilder &B,
1697 MachineRegisterInfo &MRI,
1698 Register Reg) const {
1699 if (!Subtarget.hasUnpackedD16VMem())
1700 return Reg;
1701
1702 const LLT S16 = LLT::scalar(16);
1703 LLT StoreVT = MRI.getType(Reg);
1704 if (!StoreVT.isVector() || StoreVT.getElementType() != S16)
1705 return Reg;
1706
1707 auto Unmerge = B.buildUnmerge(S16, Reg);
1708
1709
1710 SmallVector<Register, 4> WideRegs;
1711 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
1712 WideRegs.push_back(Unmerge.getReg(I));
1713
1714 const LLT S32 = LLT::scalar(32);
1715 int NumElts = StoreVT.getNumElements();
1716
1717 return B.buildMerge(LLT::fixed_vector(NumElts, S32), WideRegs).getReg(0);
1718 }
1719
1720 static std::pair<Register, unsigned>
getBaseWithConstantOffset(MachineRegisterInfo & MRI,Register Reg)1721 getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg) {
1722 int64_t Const;
1723 if (mi_match(Reg, MRI, m_ICst(Const)))
1724 return std::make_pair(Register(), Const);
1725
1726 Register Base;
1727 if (mi_match(Reg, MRI, m_GAdd(m_Reg(Base), m_ICst(Const))))
1728 return std::make_pair(Base, Const);
1729
1730 // TODO: Handle G_OR used for add case
1731 return std::make_pair(Reg, 0);
1732 }
1733
// Split a buffer offset into a voffset base register plus an immediate that
// fits in the MUBUF immoffset field (at most 4095), returning
// {BaseReg, ImmOffset}.
std::pair<Register, unsigned>
AMDGPURegisterBankInfo::splitBufferOffsets(MachineIRBuilder &B,
                                           Register OrigOffset) const {
  const unsigned MaxImm = 4095;
  Register BaseReg;
  unsigned ImmOffset;
  const LLT S32 = LLT::scalar(32);

  // Peel any constant addend off the original offset first.
  std::tie(BaseReg, ImmOffset) = getBaseWithConstantOffset(*B.getMRI(),
                                                           OrigOffset);

  unsigned C1 = 0;
  if (ImmOffset != 0) {
    // If the immediate value is too big for the immoffset field, put the value
    // and -4096 into the immoffset field so that the value that is copied/added
    // for the voffset field is a multiple of 4096, and it stands more chance
    // of being CSEd with the copy/add for another similar load/store.
    // However, do not do that rounding down to a multiple of 4096 if that is a
    // negative number, as it appears to be illegal to have a negative offset
    // in the vgpr, even if adding the immediate offset makes it positive.
    unsigned Overflow = ImmOffset & ~MaxImm;
    ImmOffset -= Overflow;
    if ((int32_t)Overflow < 0) {
      // Undo the split: keep everything in the register part instead of
      // producing a negative voffset contribution.
      Overflow += ImmOffset;
      ImmOffset = 0;
    }

    C1 = ImmOffset;
    if (Overflow != 0) {
      // Fold the overflow into the base register, materializing a constant if
      // there was no base register to begin with.
      if (!BaseReg)
        BaseReg = B.buildConstant(S32, Overflow).getReg(0);
      else {
        auto OverflowVal = B.buildConstant(S32, Overflow);
        BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
      }
    }
  }

  // Callers expect a usable voffset register, so materialize a zero if the
  // split left nothing in the base.
  if (!BaseReg)
    BaseReg = B.buildConstant(S32, 0).getReg(0);

  return {BaseReg, C1};
}
1777
isZero(Register Reg,MachineRegisterInfo & MRI)1778 static bool isZero(Register Reg, MachineRegisterInfo &MRI) {
1779 int64_t C;
1780 return mi_match(Reg, MRI, m_ICst(C)) && C == 0;
1781 }
1782
// Extract the cache-policy bits (CPol::ALL mask) from the packed cachepolicy
// intrinsic immediate.
static unsigned extractCPol(unsigned CachePolicy) {
  return CachePolicy & AMDGPU::CPol::ALL;
}
1786
// Extract the swizzle bit (bit 3) from the packed cachepolicy intrinsic
// immediate, returning 0 or 1.
static unsigned extractSWZ(unsigned CachePolicy) {
  return (CachePolicy & (1u << 3)) ? 1u : 0u;
}
1790
1791
// Directly select a raw buffer store intrinsic, picking the MUBUF opcode from
// the memory size and splitting the offset into register + immediate parts.
// Returns the selected machine instruction.
MachineInstr *
AMDGPURegisterBankInfo::selectStoreIntrinsic(MachineIRBuilder &B,
                                             MachineInstr &MI) const {
  MachineRegisterInfo &MRI = *B.getMRI();
  // Operands 2 (rsrc) and 4 (soffset) must be uniform; wrap the instruction
  // in a waterfall loop if they are not.
  executeInWaterfallLoop(B, MI, MRI, {2, 4});

  // FIXME: DAG lowering brokenly changes opcode based on FP vs. integer.

  Register VData = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(VData);

  int EltSize = Ty.getScalarSizeInBits();
  int Size = Ty.getSizeInBits();

  // FIXME: Broken integer truncstore.
  if (EltSize != 32)
    report_fatal_error("unhandled intrinsic store");

  // FIXME: Verifier should enforce 1 MMO for these intrinsics.
  const int MemSize = (*MI.memoperands_begin())->getSize();


  Register RSrc = MI.getOperand(2).getReg();
  Register VOffset = MI.getOperand(3).getReg();
  Register SOffset = MI.getOperand(4).getReg();
  unsigned CachePolicy = MI.getOperand(5).getImm();

  // Split the voffset into a register part and an immediate that fits the
  // MUBUF immoffset field.
  unsigned ImmOffset;
  std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);

  // OFFEN variants take a voffset register; a known-zero voffset can use the
  // OFFSET-only form.
  const bool Offen = !isZero(VOffset, MRI);

  // Pick byte/short/dword(xN) store based on the memory access size in bytes.
  unsigned Opc = AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact;
  switch (8 * MemSize) {
  case 8:
    Opc = Offen ? AMDGPU::BUFFER_STORE_BYTE_OFFEN_exact :
                  AMDGPU::BUFFER_STORE_BYTE_OFFSET_exact;
    break;
  case 16:
    Opc = Offen ? AMDGPU::BUFFER_STORE_SHORT_OFFEN_exact :
                  AMDGPU::BUFFER_STORE_SHORT_OFFSET_exact;
    break;
  default:
    Opc = Offen ? AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact :
                  AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact;
    if (Size > 32)
      Opc = AMDGPU::getMUBUFOpcode(Opc, Size / 32);
    break;
  }


  // Set the insertion point back to the instruction in case it was moved into a
  // loop.
  B.setInstr(MI);

  MachineInstrBuilder MIB = B.buildInstr(Opc)
    .addUse(VData);

  // The voffset operand is only present on OFFEN variants.
  if (Offen)
    MIB.addUse(VOffset);

  MIB.addUse(RSrc)
     .addUse(SOffset)
     .addImm(ImmOffset)
     .addImm(extractCPol(CachePolicy))
     .addImm(0) // tfe: FIXME: Remove from inst
     .addImm(extractSWZ(CachePolicy))
     .cloneMemRefs(MI);

  // FIXME: We need a way to report failure from applyMappingImpl.
  // Insert constrain copies before inserting the loop.
  if (!constrainSelectedInstRegOperands(*MIB, *TII, *TRI, *this))
    report_fatal_error("failed to constrain selected store intrinsic");

  return MIB;
}
1868
buildVCopy(MachineIRBuilder & B,Register DstReg,Register SrcReg) const1869 bool AMDGPURegisterBankInfo::buildVCopy(MachineIRBuilder &B, Register DstReg,
1870 Register SrcReg) const {
1871 MachineRegisterInfo &MRI = *B.getMRI();
1872 LLT SrcTy = MRI.getType(SrcReg);
1873 if (SrcTy.getSizeInBits() == 32) {
1874 // Use a v_mov_b32 here to make the exec dependency explicit.
1875 B.buildInstr(AMDGPU::V_MOV_B32_e32)
1876 .addDef(DstReg)
1877 .addUse(SrcReg);
1878 return constrainGenericRegister(DstReg, AMDGPU::VGPR_32RegClass, MRI) &&
1879 constrainGenericRegister(SrcReg, AMDGPU::SReg_32RegClass, MRI);
1880 }
1881
1882 Register TmpReg0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1883 Register TmpReg1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1884
1885 B.buildInstr(AMDGPU::V_MOV_B32_e32)
1886 .addDef(TmpReg0)
1887 .addUse(SrcReg, 0, AMDGPU::sub0);
1888 B.buildInstr(AMDGPU::V_MOV_B32_e32)
1889 .addDef(TmpReg1)
1890 .addUse(SrcReg, 0, AMDGPU::sub1);
1891 B.buildInstr(AMDGPU::REG_SEQUENCE)
1892 .addDef(DstReg)
1893 .addUse(TmpReg0)
1894 .addImm(AMDGPU::sub0)
1895 .addUse(TmpReg1)
1896 .addImm(AMDGPU::sub1);
1897
1898 return constrainGenericRegister(SrcReg, AMDGPU::SReg_64RegClass, MRI) &&
1899 constrainGenericRegister(DstReg, AMDGPU::VReg_64RegClass, MRI);
1900 }
1901
1902 /// Utility function for pushing dynamic vector indexes with a constant offset
1903 /// into waterwall loops.
reinsertVectorIndexAdd(MachineIRBuilder & B,MachineInstr & IdxUseInstr,unsigned OpIdx,unsigned ConstOffset)1904 static void reinsertVectorIndexAdd(MachineIRBuilder &B,
1905 MachineInstr &IdxUseInstr,
1906 unsigned OpIdx,
1907 unsigned ConstOffset) {
1908 MachineRegisterInfo &MRI = *B.getMRI();
1909 const LLT S32 = LLT::scalar(32);
1910 Register WaterfallIdx = IdxUseInstr.getOperand(OpIdx).getReg();
1911 B.setInsertPt(*IdxUseInstr.getParent(), IdxUseInstr.getIterator());
1912
1913 auto MaterializedOffset = B.buildConstant(S32, ConstOffset);
1914
1915 auto Add = B.buildAdd(S32, WaterfallIdx, MaterializedOffset);
1916 MRI.setRegBank(MaterializedOffset.getReg(0), AMDGPU::SGPRRegBank);
1917 MRI.setRegBank(Add.getReg(0), AMDGPU::SGPRRegBank);
1918 IdxUseInstr.getOperand(OpIdx).setReg(Add.getReg(0));
1919 }
1920
1921 /// Implement extending a 32-bit value to a 64-bit value. \p Lo32Reg is the
1922 /// original 32-bit source value (to be inserted in the low part of the combined
1923 /// 64-bit result), and \p Hi32Reg is the high half of the combined 64-bit
1924 /// value.
extendLow32IntoHigh32(MachineIRBuilder & B,Register Hi32Reg,Register Lo32Reg,unsigned ExtOpc,const RegisterBank & RegBank,bool IsBooleanSrc=false)1925 static void extendLow32IntoHigh32(MachineIRBuilder &B,
1926 Register Hi32Reg, Register Lo32Reg,
1927 unsigned ExtOpc,
1928 const RegisterBank &RegBank,
1929 bool IsBooleanSrc = false) {
1930 if (ExtOpc == AMDGPU::G_ZEXT) {
1931 B.buildConstant(Hi32Reg, 0);
1932 } else if (ExtOpc == AMDGPU::G_SEXT) {
1933 if (IsBooleanSrc) {
1934 // If we know the original source was an s1, the high half is the same as
1935 // the low.
1936 B.buildCopy(Hi32Reg, Lo32Reg);
1937 } else {
1938 // Replicate sign bit from 32-bit extended part.
1939 auto ShiftAmt = B.buildConstant(LLT::scalar(32), 31);
1940 B.getMRI()->setRegBank(ShiftAmt.getReg(0), RegBank);
1941 B.buildAShr(Hi32Reg, Lo32Reg, ShiftAmt);
1942 }
1943 } else {
1944 assert(ExtOpc == AMDGPU::G_ANYEXT && "not an integer extension");
1945 B.buildUndef(Hi32Reg);
1946 }
1947 }
1948
// Expand a dynamic G_EXTRACT_VECTOR_ELT into a chain of compare/select pairs
// over all vector elements, avoiding a waterfall loop when the expansion is
// profitable. Returns false if the instruction should be handled elsewhere.
bool AMDGPURegisterBankInfo::foldExtractEltToCmpSelect(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  const OperandsMapper &OpdMapper) const {

  Register VecReg = MI.getOperand(1).getReg();
  Register Idx = MI.getOperand(2).getReg();

  const RegisterBank &IdxBank =
    *OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;

  bool IsDivergentIdx = IdxBank != AMDGPU::SGPRRegBank;

  LLT VecTy = MRI.getType(VecReg);
  unsigned EltSize = VecTy.getScalarSizeInBits();
  unsigned NumElem = VecTy.getNumElements();

  // Let the target decide whether the cmp/select expansion beats the
  // alternative lowering for this element size/count/divergence combination.
  if (!SITargetLowering::shouldExpandVectorDynExt(EltSize, NumElem,
                                                  IsDivergentIdx))
    return false;

  MachineIRBuilder B(MI);
  LLT S32 = LLT::scalar(32);

  const RegisterBank &DstBank =
    *OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
  const RegisterBank &SrcBank =
    *OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;

  // The compare result can only stay on the SGPR bank if everything involved
  // is uniform; otherwise it must be a vector condition (VCC).
  const RegisterBank &CCBank =
    (DstBank == AMDGPU::SGPRRegBank &&
     SrcBank == AMDGPU::SGPRRegBank &&
     IdxBank == AMDGPU::SGPRRegBank) ? AMDGPU::SGPRRegBank
                                     : AMDGPU::VCCRegBank;
  LLT CCTy = (CCBank == AMDGPU::SGPRRegBank) ? S32 : LLT::scalar(1);

  // A uniform index with a VCC condition still needs a VGPR copy of the index
  // for the compares.
  if (CCBank == AMDGPU::VCCRegBank && IdxBank == AMDGPU::SGPRRegBank) {
    Idx = B.buildCopy(S32, Idx)->getOperand(0).getReg();
    MRI.setRegBank(Idx, AMDGPU::VGPRRegBank);
  }

  // If the destination was split into pieces (multiple vregs), select each
  // 32-bit lane independently.
  LLT EltTy = VecTy.getScalarType();
  SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0));
  unsigned NumLanes = DstRegs.size();
  if (!NumLanes)
    NumLanes = 1;
  else
    EltTy = MRI.getType(DstRegs[0]);

  // Start from element 0 and fold in each subsequent element with
  // select(idx == I, elt[I], acc).
  auto UnmergeToEltTy = B.buildUnmerge(EltTy, VecReg);
  SmallVector<Register, 2> Res(NumLanes);
  for (unsigned L = 0; L < NumLanes; ++L)
    Res[L] = UnmergeToEltTy.getReg(L);

  for (unsigned I = 1; I < NumElem; ++I) {
    auto IC = B.buildConstant(S32, I);
    MRI.setRegBank(IC->getOperand(0).getReg(), AMDGPU::SGPRRegBank);
    auto Cmp = B.buildICmp(CmpInst::ICMP_EQ, CCTy, Idx, IC);
    MRI.setRegBank(Cmp->getOperand(0).getReg(), CCBank);

    for (unsigned L = 0; L < NumLanes; ++L) {
      auto S = B.buildSelect(EltTy, Cmp,
                             UnmergeToEltTy.getReg(I * NumLanes + L), Res[L]);

      // Assign the destination bank to the select result and both value
      // operands (operand 1 is the condition, already banked above).
      for (unsigned N : { 0, 2, 3 })
        MRI.setRegBank(S->getOperand(N).getReg(), DstBank);

      Res[L] = S->getOperand(0).getReg();
    }
  }

  // Copy the accumulated lane results into the original destination vregs.
  for (unsigned L = 0; L < NumLanes; ++L) {
    Register DstReg = (NumLanes == 1) ? MI.getOperand(0).getReg() : DstRegs[L];
    B.buildCopy(DstReg, Res[L]);
    MRI.setRegBank(DstReg, DstBank);
  }

  MRI.setRegBank(MI.getOperand(0).getReg(), DstBank);
  MI.eraseFromParent();

  return true;
}
2030
2031 // Insert a cross regbank copy for a register if it already has a bank that
2032 // differs from the one we want to set.
constrainRegToBank(MachineRegisterInfo & MRI,MachineIRBuilder & B,Register & Reg,const RegisterBank & Bank)2033 static Register constrainRegToBank(MachineRegisterInfo &MRI,
2034 MachineIRBuilder &B, Register &Reg,
2035 const RegisterBank &Bank) {
2036 const RegisterBank *CurrBank = MRI.getRegBankOrNull(Reg);
2037 if (CurrBank && *CurrBank != Bank) {
2038 Register Copy = B.buildCopy(MRI.getType(Reg), Reg).getReg(0);
2039 MRI.setRegBank(Copy, Bank);
2040 return Copy;
2041 }
2042
2043 MRI.setRegBank(Reg, Bank);
2044 return Reg;
2045 }
2046
// Expand a dynamic G_INSERT_VECTOR_ELT into per-element compare/select pairs,
// rebuilding the whole vector, to avoid a waterfall loop when profitable.
// Returns false if the instruction should be handled elsewhere.
bool AMDGPURegisterBankInfo::foldInsertEltToCmpSelect(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  const OperandsMapper &OpdMapper) const {

  Register VecReg = MI.getOperand(1).getReg();
  Register Idx = MI.getOperand(3).getReg();

  const RegisterBank &IdxBank =
    *OpdMapper.getInstrMapping().getOperandMapping(3).BreakDown[0].RegBank;

  bool IsDivergentIdx = IdxBank != AMDGPU::SGPRRegBank;

  LLT VecTy = MRI.getType(VecReg);
  unsigned EltSize = VecTy.getScalarSizeInBits();
  unsigned NumElem = VecTy.getNumElements();

  // Let the target decide whether the cmp/select expansion beats the
  // alternative lowering for this element size/count/divergence combination.
  if (!SITargetLowering::shouldExpandVectorDynExt(EltSize, NumElem,
                                                  IsDivergentIdx))
    return false;

  MachineIRBuilder B(MI);
  LLT S32 = LLT::scalar(32);

  const RegisterBank &DstBank =
    *OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
  const RegisterBank &SrcBank =
    *OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
  const RegisterBank &InsBank =
    *OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;

  // The compare result can only stay on the SGPR bank if every input is
  // uniform; otherwise it must be a vector condition (VCC).
  const RegisterBank &CCBank =
    (DstBank == AMDGPU::SGPRRegBank &&
     SrcBank == AMDGPU::SGPRRegBank &&
     InsBank == AMDGPU::SGPRRegBank &&
     IdxBank == AMDGPU::SGPRRegBank) ? AMDGPU::SGPRRegBank
                                     : AMDGPU::VCCRegBank;
  LLT CCTy = (CCBank == AMDGPU::SGPRRegBank) ? S32 : LLT::scalar(1);

  // A uniform index with a VCC condition still needs a VGPR copy of the index
  // for the compares.
  if (CCBank == AMDGPU::VCCRegBank && IdxBank == AMDGPU::SGPRRegBank) {
    Idx = B.buildCopy(S32, Idx)->getOperand(0).getReg();
    MRI.setRegBank(Idx, AMDGPU::VGPRRegBank);
  }

  // If the inserted value was split into pieces (multiple vregs), handle each
  // 32-bit lane independently.
  LLT EltTy = VecTy.getScalarType();
  SmallVector<Register, 2> InsRegs(OpdMapper.getVRegs(2));
  unsigned NumLanes = InsRegs.size();
  if (!NumLanes) {
    NumLanes = 1;
    InsRegs.push_back(MI.getOperand(2).getReg());
  } else {
    EltTy = MRI.getType(InsRegs[0]);
  }

  // For every element position, select between the inserted value and the
  // original element depending on whether the index matches.
  auto UnmergeToEltTy = B.buildUnmerge(EltTy, VecReg);
  SmallVector<Register, 16> Ops(NumElem * NumLanes);

  for (unsigned I = 0; I < NumElem; ++I) {
    auto IC = B.buildConstant(S32, I);
    MRI.setRegBank(IC->getOperand(0).getReg(), AMDGPU::SGPRRegBank);
    auto Cmp = B.buildICmp(CmpInst::ICMP_EQ, CCTy, Idx, IC);
    MRI.setRegBank(Cmp->getOperand(0).getReg(), CCBank);

    for (unsigned L = 0; L < NumLanes; ++L) {
      // Both select inputs must be in the destination bank, inserting copies
      // if they already have a conflicting bank assignment.
      Register Op0 = constrainRegToBank(MRI, B, InsRegs[L], DstBank);
      Register Op1 = UnmergeToEltTy.getReg(I * NumLanes + L);
      Op1 = constrainRegToBank(MRI, B, Op1, DstBank);

      Register Select = B.buildSelect(EltTy, Cmp, Op0, Op1).getReg(0);
      MRI.setRegBank(Select, DstBank);

      Ops[I * NumLanes + L] = Select;
    }
  }

  // Rebuild the vector; bitcast if the lane-split element type produced a
  // different vector type than the destination.
  LLT MergeTy = LLT::fixed_vector(Ops.size(), EltTy);
  if (MergeTy == MRI.getType(MI.getOperand(0).getReg())) {
    B.buildBuildVector(MI.getOperand(0), Ops);
  } else {
    auto Vec = B.buildBuildVector(MergeTy, Ops);
    MRI.setRegBank(Vec->getOperand(0).getReg(), DstBank);
    B.buildBitcast(MI.getOperand(0).getReg(), Vec);
  }

  MRI.setRegBank(MI.getOperand(0).getReg(), DstBank);
  MI.eraseFromParent();

  return true;
}
2135
applyMappingImpl(const OperandsMapper & OpdMapper) const2136 void AMDGPURegisterBankInfo::applyMappingImpl(
2137 const OperandsMapper &OpdMapper) const {
2138 MachineInstr &MI = OpdMapper.getMI();
2139 unsigned Opc = MI.getOpcode();
2140 MachineRegisterInfo &MRI = OpdMapper.getMRI();
2141 switch (Opc) {
2142 case AMDGPU::G_PHI: {
2143 Register DstReg = MI.getOperand(0).getReg();
2144 LLT DstTy = MRI.getType(DstReg);
2145 if (DstTy != LLT::scalar(1))
2146 break;
2147
2148 const LLT S32 = LLT::scalar(32);
2149 const RegisterBank *DstBank =
2150 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2151 if (DstBank == &AMDGPU::VCCRegBank) {
2152 applyDefaultMapping(OpdMapper);
2153 // The standard handling only considers the result register bank for
2154 // phis. For VCC, blindly inserting a copy when the phi is lowered will
2155 // produce an invalid copy. We can only copy with some kind of compare to
2156 // get a vector boolean result. Insert a regitser bank copy that will be
2157 // correctly lowered to a compare.
2158 MachineIRBuilder B(*MI.getParent()->getParent());
2159
2160 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
2161 Register SrcReg = MI.getOperand(I).getReg();
2162 const RegisterBank *SrcBank = getRegBank(SrcReg, MRI, *TRI);
2163
2164 if (SrcBank != &AMDGPU::VCCRegBank) {
2165 MachineBasicBlock *SrcMBB = MI.getOperand(I + 1).getMBB();
2166 B.setInsertPt(*SrcMBB, SrcMBB->getFirstTerminator());
2167
2168 auto Copy = B.buildCopy(LLT::scalar(1), SrcReg);
2169 MRI.setRegBank(Copy.getReg(0), AMDGPU::VCCRegBank);
2170 MI.getOperand(I).setReg(Copy.getReg(0));
2171 }
2172 }
2173
2174 return;
2175 }
2176
2177 // Phi handling is strange and only considers the bank of the destination.
2178 substituteSimpleCopyRegs(OpdMapper, 0);
2179
2180 // Promote SGPR/VGPR booleans to s32
2181 MachineFunction *MF = MI.getParent()->getParent();
2182 ApplyRegBankMapping ApplyBank(*this, MRI, DstBank);
2183 MachineIRBuilder B(MI, ApplyBank);
2184 LegalizerHelper Helper(*MF, ApplyBank, B);
2185
2186 if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized)
2187 llvm_unreachable("widen scalar should have succeeded");
2188
2189 return;
2190 }
2191 case AMDGPU::G_ICMP:
2192 case AMDGPU::G_UADDO:
2193 case AMDGPU::G_USUBO:
2194 case AMDGPU::G_UADDE:
2195 case AMDGPU::G_SADDE:
2196 case AMDGPU::G_USUBE:
2197 case AMDGPU::G_SSUBE: {
2198 unsigned BoolDstOp = Opc == AMDGPU::G_ICMP ? 0 : 1;
2199 Register DstReg = MI.getOperand(BoolDstOp).getReg();
2200
2201 const RegisterBank *DstBank =
2202 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2203 if (DstBank != &AMDGPU::SGPRRegBank)
2204 break;
2205
2206 const bool HasCarryIn = MI.getNumOperands() == 5;
2207
2208 // If this is a scalar compare, promote the result to s32, as the selection
2209 // will end up using a copy to a 32-bit vreg.
2210 const LLT S32 = LLT::scalar(32);
2211 Register NewDstReg = MRI.createGenericVirtualRegister(S32);
2212 MRI.setRegBank(NewDstReg, AMDGPU::SGPRRegBank);
2213 MI.getOperand(BoolDstOp).setReg(NewDstReg);
2214 MachineIRBuilder B(MI);
2215
2216 if (HasCarryIn) {
2217 Register NewSrcReg = MRI.createGenericVirtualRegister(S32);
2218 MRI.setRegBank(NewSrcReg, AMDGPU::SGPRRegBank);
2219 B.buildZExt(NewSrcReg, MI.getOperand(4).getReg());
2220 MI.getOperand(4).setReg(NewSrcReg);
2221 }
2222
2223 MachineBasicBlock *MBB = MI.getParent();
2224 B.setInsertPt(*MBB, std::next(MI.getIterator()));
2225
2226 // If we had a constrained VCC result register, a copy was inserted to VCC
2227 // from SGPR.
2228 SmallVector<Register, 1> DefRegs(OpdMapper.getVRegs(0));
2229 if (DefRegs.empty())
2230 DefRegs.push_back(DstReg);
2231 B.buildTrunc(DefRegs[0], NewDstReg);
2232 return;
2233 }
2234 case AMDGPU::G_SELECT: {
2235 Register DstReg = MI.getOperand(0).getReg();
2236 LLT DstTy = MRI.getType(DstReg);
2237
2238 SmallVector<Register, 1> CondRegs(OpdMapper.getVRegs(1));
2239 if (CondRegs.empty())
2240 CondRegs.push_back(MI.getOperand(1).getReg());
2241 else {
2242 assert(CondRegs.size() == 1);
2243 }
2244
2245 const RegisterBank *CondBank = getRegBank(CondRegs[0], MRI, *TRI);
2246 if (CondBank == &AMDGPU::SGPRRegBank) {
2247 MachineIRBuilder B(MI);
2248 const LLT S32 = LLT::scalar(32);
2249 Register NewCondReg = MRI.createGenericVirtualRegister(S32);
2250 MRI.setRegBank(NewCondReg, AMDGPU::SGPRRegBank);
2251
2252 MI.getOperand(1).setReg(NewCondReg);
2253 B.buildZExt(NewCondReg, CondRegs[0]);
2254 }
2255
2256 if (DstTy.getSizeInBits() != 64)
2257 break;
2258
2259 MachineIRBuilder B(MI);
2260 LLT HalfTy = getHalfSizedType(DstTy);
2261
2262 SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
2263 SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2));
2264 SmallVector<Register, 2> Src2Regs(OpdMapper.getVRegs(3));
2265
2266 // All inputs are SGPRs, nothing special to do.
2267 if (DefRegs.empty()) {
2268 assert(Src1Regs.empty() && Src2Regs.empty());
2269 break;
2270 }
2271
2272 if (Src1Regs.empty())
2273 split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg());
2274 else {
2275 setRegsToType(MRI, Src1Regs, HalfTy);
2276 }
2277
2278 if (Src2Regs.empty())
2279 split64BitValueForMapping(B, Src2Regs, HalfTy, MI.getOperand(3).getReg());
2280 else
2281 setRegsToType(MRI, Src2Regs, HalfTy);
2282
2283 setRegsToType(MRI, DefRegs, HalfTy);
2284
2285 B.buildSelect(DefRegs[0], CondRegs[0], Src1Regs[0], Src2Regs[0]);
2286 B.buildSelect(DefRegs[1], CondRegs[0], Src1Regs[1], Src2Regs[1]);
2287
2288 MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
2289 MI.eraseFromParent();
2290 return;
2291 }
2292 case AMDGPU::G_BRCOND: {
2293 Register CondReg = MI.getOperand(0).getReg();
2294 // FIXME: Should use legalizer helper, but should change bool ext type.
2295 const RegisterBank *CondBank =
2296 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2297
2298 if (CondBank == &AMDGPU::SGPRRegBank) {
2299 MachineIRBuilder B(MI);
2300 const LLT S32 = LLT::scalar(32);
2301 Register NewCondReg = MRI.createGenericVirtualRegister(S32);
2302 MRI.setRegBank(NewCondReg, AMDGPU::SGPRRegBank);
2303
2304 MI.getOperand(0).setReg(NewCondReg);
2305 B.buildZExt(NewCondReg, CondReg);
2306 return;
2307 }
2308
2309 break;
2310 }
2311 case AMDGPU::G_AND:
2312 case AMDGPU::G_OR:
2313 case AMDGPU::G_XOR: {
2314 // 64-bit and is only available on the SALU, so split into 2 32-bit ops if
2315 // there is a VGPR input.
2316 Register DstReg = MI.getOperand(0).getReg();
2317 LLT DstTy = MRI.getType(DstReg);
2318
2319 if (DstTy.getSizeInBits() == 1) {
2320 const RegisterBank *DstBank =
2321 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2322 if (DstBank == &AMDGPU::VCCRegBank)
2323 break;
2324
2325 MachineFunction *MF = MI.getParent()->getParent();
2326 ApplyRegBankMapping ApplyBank(*this, MRI, DstBank);
2327 MachineIRBuilder B(MI, ApplyBank);
2328 LegalizerHelper Helper(*MF, ApplyBank, B);
2329
2330 if (Helper.widenScalar(MI, 0, LLT::scalar(32)) !=
2331 LegalizerHelper::Legalized)
2332 llvm_unreachable("widen scalar should have succeeded");
2333 return;
2334 }
2335
2336 if (DstTy.getSizeInBits() != 64)
2337 break;
2338
2339 LLT HalfTy = getHalfSizedType(DstTy);
2340 SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
2341 SmallVector<Register, 2> Src0Regs(OpdMapper.getVRegs(1));
2342 SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2));
2343
2344 // All inputs are SGPRs, nothing special to do.
2345 if (DefRegs.empty()) {
2346 assert(Src0Regs.empty() && Src1Regs.empty());
2347 break;
2348 }
2349
2350 assert(DefRegs.size() == 2);
2351 assert(Src0Regs.size() == Src1Regs.size() &&
2352 (Src0Regs.empty() || Src0Regs.size() == 2));
2353
2354 // Depending on where the source registers came from, the generic code may
2355 // have decided to split the inputs already or not. If not, we still need to
2356 // extract the values.
2357 MachineIRBuilder B(MI);
2358
2359 if (Src0Regs.empty())
2360 split64BitValueForMapping(B, Src0Regs, HalfTy, MI.getOperand(1).getReg());
2361 else
2362 setRegsToType(MRI, Src0Regs, HalfTy);
2363
2364 if (Src1Regs.empty())
2365 split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg());
2366 else
2367 setRegsToType(MRI, Src1Regs, HalfTy);
2368
2369 setRegsToType(MRI, DefRegs, HalfTy);
2370
2371 B.buildInstr(Opc, {DefRegs[0]}, {Src0Regs[0], Src1Regs[0]});
2372 B.buildInstr(Opc, {DefRegs[1]}, {Src0Regs[1], Src1Regs[1]});
2373
2374 MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
2375 MI.eraseFromParent();
2376 return;
2377 }
2378 case AMDGPU::G_ABS: {
2379 Register SrcReg = MI.getOperand(1).getReg();
2380 const RegisterBank *SrcBank = MRI.getRegBankOrNull(SrcReg);
2381
2382 // There is no VALU abs instruction so we need to replace it with a sub and
2383 // max combination.
2384 if (SrcBank && SrcBank == &AMDGPU::VGPRRegBank) {
2385 MachineFunction *MF = MI.getParent()->getParent();
2386 ApplyRegBankMapping Apply(*this, MRI, &AMDGPU::VGPRRegBank);
2387 MachineIRBuilder B(MI, Apply);
2388 LegalizerHelper Helper(*MF, Apply, B);
2389
2390 if (Helper.lowerAbsToMaxNeg(MI) != LegalizerHelper::Legalized)
2391 llvm_unreachable("lowerAbsToMaxNeg should have succeeded");
2392 return;
2393 }
2394 LLVM_FALLTHROUGH;
2395 }
2396 case AMDGPU::G_ADD:
2397 case AMDGPU::G_SUB:
2398 case AMDGPU::G_MUL:
2399 case AMDGPU::G_SHL:
2400 case AMDGPU::G_LSHR:
2401 case AMDGPU::G_ASHR:
2402 case AMDGPU::G_SMIN:
2403 case AMDGPU::G_SMAX:
2404 case AMDGPU::G_UMIN:
2405 case AMDGPU::G_UMAX: {
2406 Register DstReg = MI.getOperand(0).getReg();
2407 LLT DstTy = MRI.getType(DstReg);
2408
2409 // 16-bit operations are VALU only, but can be promoted to 32-bit SALU.
2410 // Packed 16-bit operations need to be scalarized and promoted.
2411 if (DstTy != LLT::scalar(16) && DstTy != LLT::fixed_vector(2, 16))
2412 break;
2413
2414 const RegisterBank *DstBank =
2415 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2416 if (DstBank == &AMDGPU::VGPRRegBank)
2417 break;
2418
2419 const LLT S32 = LLT::scalar(32);
2420 MachineBasicBlock *MBB = MI.getParent();
2421 MachineFunction *MF = MBB->getParent();
2422 ApplyRegBankMapping ApplySALU(*this, MRI, &AMDGPU::SGPRRegBank);
2423 MachineIRBuilder B(MI, ApplySALU);
2424
2425 if (DstTy.isVector()) {
2426 Register WideSrc0Lo, WideSrc0Hi;
2427 Register WideSrc1Lo, WideSrc1Hi;
2428
2429 unsigned ExtendOp = getExtendOp(MI.getOpcode());
2430 std::tie(WideSrc0Lo, WideSrc0Hi)
2431 = unpackV2S16ToS32(B, MI.getOperand(1).getReg(), ExtendOp);
2432 std::tie(WideSrc1Lo, WideSrc1Hi)
2433 = unpackV2S16ToS32(B, MI.getOperand(2).getReg(), ExtendOp);
2434 auto Lo = B.buildInstr(MI.getOpcode(), {S32}, {WideSrc0Lo, WideSrc1Lo});
2435 auto Hi = B.buildInstr(MI.getOpcode(), {S32}, {WideSrc0Hi, WideSrc1Hi});
2436 B.buildBuildVectorTrunc(DstReg, {Lo.getReg(0), Hi.getReg(0)});
2437 MI.eraseFromParent();
2438 } else {
2439 LegalizerHelper Helper(*MF, ApplySALU, B);
2440
2441 if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized)
2442 llvm_unreachable("widen scalar should have succeeded");
2443
2444 // FIXME: s16 shift amounts should be legal.
2445 if (Opc == AMDGPU::G_SHL || Opc == AMDGPU::G_LSHR ||
2446 Opc == AMDGPU::G_ASHR) {
2447 B.setInsertPt(*MBB, MI.getIterator());
2448 if (Helper.widenScalar(MI, 1, S32) != LegalizerHelper::Legalized)
2449 llvm_unreachable("widen scalar should have succeeded");
2450 }
2451 }
2452
2453 return;
2454 }
2455 case AMDGPU::G_SEXT_INREG: {
2456 SmallVector<Register, 2> SrcRegs(OpdMapper.getVRegs(1));
2457 if (SrcRegs.empty())
2458 break; // Nothing to repair
2459
2460 const LLT S32 = LLT::scalar(32);
2461 MachineIRBuilder B(MI);
2462 ApplyRegBankMapping O(*this, MRI, &AMDGPU::VGPRRegBank);
2463 GISelObserverWrapper Observer(&O);
2464 B.setChangeObserver(Observer);
2465
2466 // Don't use LegalizerHelper's narrowScalar. It produces unwanted G_SEXTs
2467 // we would need to further expand, and doesn't let us directly set the
2468 // result registers.
2469 SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0));
2470
2471 int Amt = MI.getOperand(2).getImm();
2472 if (Amt <= 32) {
2473 if (Amt == 32) {
2474 // The low bits are unchanged.
2475 B.buildCopy(DstRegs[0], SrcRegs[0]);
2476 } else {
2477 // Extend in the low bits and propagate the sign bit to the high half.
2478 B.buildSExtInReg(DstRegs[0], SrcRegs[0], Amt);
2479 }
2480
2481 B.buildAShr(DstRegs[1], DstRegs[0], B.buildConstant(S32, 31));
2482 } else {
2483 // The low bits are unchanged, and extend in the high bits.
2484 B.buildCopy(DstRegs[0], SrcRegs[0]);
2485 B.buildSExtInReg(DstRegs[1], DstRegs[0], Amt - 32);
2486 }
2487
2488 Register DstReg = MI.getOperand(0).getReg();
2489 MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
2490 MI.eraseFromParent();
2491 return;
2492 }
2493 case AMDGPU::G_CTPOP:
2494 case AMDGPU::G_BITREVERSE:
2495 case AMDGPU::G_CTLZ_ZERO_UNDEF:
2496 case AMDGPU::G_CTTZ_ZERO_UNDEF: {
2497 const RegisterBank *DstBank =
2498 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2499 if (DstBank == &AMDGPU::SGPRRegBank)
2500 break;
2501
2502 Register SrcReg = MI.getOperand(1).getReg();
2503 const LLT S32 = LLT::scalar(32);
2504 LLT Ty = MRI.getType(SrcReg);
2505 if (Ty == S32)
2506 break;
2507
2508 ApplyRegBankMapping ApplyVALU(*this, MRI, &AMDGPU::VGPRRegBank);
2509 MachineIRBuilder B(MI, ApplyVALU);
2510
2511 MachineFunction &MF = B.getMF();
2512 LegalizerHelper Helper(MF, ApplyVALU, B);
2513
2514 if (Helper.narrowScalar(MI, 1, S32) != LegalizerHelper::Legalized)
2515 llvm_unreachable("narrowScalar should have succeeded");
2516 return;
2517 }
2518 case AMDGPU::G_SEXT:
2519 case AMDGPU::G_ZEXT:
2520 case AMDGPU::G_ANYEXT: {
2521 Register SrcReg = MI.getOperand(1).getReg();
2522 LLT SrcTy = MRI.getType(SrcReg);
2523 const bool Signed = Opc == AMDGPU::G_SEXT;
2524
2525 assert(empty(OpdMapper.getVRegs(1)));
2526
2527 MachineIRBuilder B(MI);
2528 const RegisterBank *SrcBank =
2529 OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
2530
2531 Register DstReg = MI.getOperand(0).getReg();
2532 LLT DstTy = MRI.getType(DstReg);
2533 if (DstTy.isScalar() &&
2534 SrcBank != &AMDGPU::SGPRRegBank &&
2535 SrcBank != &AMDGPU::VCCRegBank &&
2536 // FIXME: Should handle any type that round to s64 when irregular
2537 // breakdowns supported.
2538 DstTy.getSizeInBits() == 64 &&
2539 SrcTy.getSizeInBits() <= 32) {
2540 SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
2541
2542 // Extend to 32-bit, and then extend the low half.
2543 if (Signed) {
2544 // TODO: Should really be buildSExtOrCopy
2545 B.buildSExtOrTrunc(DefRegs[0], SrcReg);
2546 } else if (Opc == AMDGPU::G_ZEXT) {
2547 B.buildZExtOrTrunc(DefRegs[0], SrcReg);
2548 } else {
2549 B.buildAnyExtOrTrunc(DefRegs[0], SrcReg);
2550 }
2551
2552 extendLow32IntoHigh32(B, DefRegs[1], DefRegs[0], Opc, *SrcBank);
2553 MRI.setRegBank(DstReg, *SrcBank);
2554 MI.eraseFromParent();
2555 return;
2556 }
2557
2558 if (SrcTy != LLT::scalar(1))
2559 return;
2560
2561 // It is not legal to have a legalization artifact with a VCC source. Rather
2562 // than introducing a copy, insert the select we would have to select the
2563 // copy to.
2564 if (SrcBank == &AMDGPU::VCCRegBank) {
2565 SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
2566
2567 const RegisterBank *DstBank = &AMDGPU::VGPRRegBank;
2568
2569 unsigned DstSize = DstTy.getSizeInBits();
2570 // 64-bit select is SGPR only
2571 const bool UseSel64 = DstSize > 32 &&
2572 SrcBank->getID() == AMDGPU::SGPRRegBankID;
2573
2574 // TODO: Should s16 select be legal?
2575 LLT SelType = UseSel64 ? LLT::scalar(64) : LLT::scalar(32);
2576 auto True = B.buildConstant(SelType, Signed ? -1 : 1);
2577 auto False = B.buildConstant(SelType, 0);
2578
2579 MRI.setRegBank(True.getReg(0), *DstBank);
2580 MRI.setRegBank(False.getReg(0), *DstBank);
2581 MRI.setRegBank(DstReg, *DstBank);
2582
2583 if (DstSize > 32) {
2584 B.buildSelect(DefRegs[0], SrcReg, True, False);
2585 extendLow32IntoHigh32(B, DefRegs[1], DefRegs[0], Opc, *SrcBank, true);
2586 } else if (DstSize < 32) {
2587 auto Sel = B.buildSelect(SelType, SrcReg, True, False);
2588 MRI.setRegBank(Sel.getReg(0), *DstBank);
2589 B.buildTrunc(DstReg, Sel);
2590 } else {
2591 B.buildSelect(DstReg, SrcReg, True, False);
2592 }
2593
2594 MI.eraseFromParent();
2595 return;
2596 }
2597
2598 break;
2599 }
2600 case AMDGPU::G_BUILD_VECTOR:
2601 case AMDGPU::G_BUILD_VECTOR_TRUNC: {
2602 Register DstReg = MI.getOperand(0).getReg();
2603 LLT DstTy = MRI.getType(DstReg);
2604 if (DstTy != LLT::fixed_vector(2, 16))
2605 break;
2606
2607 assert(MI.getNumOperands() == 3 && OpdMapper.getVRegs(0).empty());
2608 substituteSimpleCopyRegs(OpdMapper, 1);
2609 substituteSimpleCopyRegs(OpdMapper, 2);
2610
2611 const RegisterBank *DstBank =
2612 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2613 if (DstBank == &AMDGPU::SGPRRegBank)
2614 break; // Can use S_PACK_* instructions.
2615
2616 MachineIRBuilder B(MI);
2617
2618 Register Lo = MI.getOperand(1).getReg();
2619 Register Hi = MI.getOperand(2).getReg();
2620 const LLT S32 = LLT::scalar(32);
2621
2622 const RegisterBank *BankLo =
2623 OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
2624 const RegisterBank *BankHi =
2625 OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
2626
2627 Register ZextLo;
2628 Register ShiftHi;
2629
2630 if (Opc == AMDGPU::G_BUILD_VECTOR) {
2631 ZextLo = B.buildZExt(S32, Lo).getReg(0);
2632 MRI.setRegBank(ZextLo, *BankLo);
2633
2634 Register ZextHi = B.buildZExt(S32, Hi).getReg(0);
2635 MRI.setRegBank(ZextHi, *BankHi);
2636
2637 auto ShiftAmt = B.buildConstant(S32, 16);
2638 MRI.setRegBank(ShiftAmt.getReg(0), *BankHi);
2639
2640 ShiftHi = B.buildShl(S32, ZextHi, ShiftAmt).getReg(0);
2641 MRI.setRegBank(ShiftHi, *BankHi);
2642 } else {
2643 Register MaskLo = B.buildConstant(S32, 0xffff).getReg(0);
2644 MRI.setRegBank(MaskLo, *BankLo);
2645
2646 auto ShiftAmt = B.buildConstant(S32, 16);
2647 MRI.setRegBank(ShiftAmt.getReg(0), *BankHi);
2648
2649 ShiftHi = B.buildShl(S32, Hi, ShiftAmt).getReg(0);
2650 MRI.setRegBank(ShiftHi, *BankHi);
2651
2652 ZextLo = B.buildAnd(S32, Lo, MaskLo).getReg(0);
2653 MRI.setRegBank(ZextLo, *BankLo);
2654 }
2655
2656 auto Or = B.buildOr(S32, ZextLo, ShiftHi);
2657 MRI.setRegBank(Or.getReg(0), *DstBank);
2658
2659 B.buildBitcast(DstReg, Or);
2660 MI.eraseFromParent();
2661 return;
2662 }
2663 case AMDGPU::G_EXTRACT_VECTOR_ELT: {
2664 SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0));
2665
2666 assert(OpdMapper.getVRegs(1).empty() && OpdMapper.getVRegs(2).empty());
2667
2668 Register DstReg = MI.getOperand(0).getReg();
2669 Register SrcReg = MI.getOperand(1).getReg();
2670
2671 const LLT S32 = LLT::scalar(32);
2672 LLT DstTy = MRI.getType(DstReg);
2673 LLT SrcTy = MRI.getType(SrcReg);
2674
2675 if (foldExtractEltToCmpSelect(MI, MRI, OpdMapper))
2676 return;
2677
2678 MachineIRBuilder B(MI);
2679
2680 const ValueMapping &DstMapping
2681 = OpdMapper.getInstrMapping().getOperandMapping(0);
2682 const RegisterBank *DstBank = DstMapping.BreakDown[0].RegBank;
2683 const RegisterBank *SrcBank =
2684 OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
2685 const RegisterBank *IdxBank =
2686 OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
2687
2688 Register BaseIdxReg;
2689 unsigned ConstOffset;
2690 std::tie(BaseIdxReg, ConstOffset) =
2691 AMDGPU::getBaseWithConstantOffset(MRI, MI.getOperand(2).getReg());
2692
2693 // See if the index is an add of a constant which will be foldable by moving
2694 // the base register of the index later if this is going to be executed in a
2695 // waterfall loop. This is essentially to reassociate the add of a constant
2696 // with the readfirstlane.
2697 bool ShouldMoveIndexIntoLoop = IdxBank != &AMDGPU::SGPRRegBank &&
2698 ConstOffset > 0 &&
2699 ConstOffset < SrcTy.getNumElements();
2700
2701 // Move the base register. We'll re-insert the add later.
2702 if (ShouldMoveIndexIntoLoop)
2703 MI.getOperand(2).setReg(BaseIdxReg);
2704
2705 // If this is a VGPR result only because the index was a VGPR result, the
2706 // actual indexing will be done on the SGPR source vector, which will
2707 // produce a scalar result. We need to copy to the VGPR result inside the
2708 // waterfall loop.
2709 const bool NeedCopyToVGPR = DstBank == &AMDGPU::VGPRRegBank &&
2710 SrcBank == &AMDGPU::SGPRRegBank;
2711 if (DstRegs.empty()) {
2712 applyDefaultMapping(OpdMapper);
2713
2714 executeInWaterfallLoop(MI, MRI, { 2 });
2715
2716 if (NeedCopyToVGPR) {
2717 // We don't want a phi for this temporary reg.
2718 Register TmpReg = MRI.createGenericVirtualRegister(DstTy);
2719 MRI.setRegBank(TmpReg, AMDGPU::SGPRRegBank);
2720 MI.getOperand(0).setReg(TmpReg);
2721 B.setInsertPt(*MI.getParent(), ++MI.getIterator());
2722
2723 // Use a v_mov_b32 here to make the exec dependency explicit.
2724 buildVCopy(B, DstReg, TmpReg);
2725 }
2726
2727 // Re-insert the constant offset add inside the waterfall loop.
2728 if (ShouldMoveIndexIntoLoop)
2729 reinsertVectorIndexAdd(B, MI, 2, ConstOffset);
2730
2731 return;
2732 }
2733
2734 assert(DstTy.getSizeInBits() == 64);
2735
2736 LLT Vec32 = LLT::fixed_vector(2 * SrcTy.getNumElements(), 32);
2737
2738 auto CastSrc = B.buildBitcast(Vec32, SrcReg);
2739 auto One = B.buildConstant(S32, 1);
2740
2741 MachineBasicBlock::iterator MII = MI.getIterator();
2742
2743 // Split the vector index into 32-bit pieces. Prepare to move all of the
2744 // new instructions into a waterfall loop if necessary.
2745 //
2746 // Don't put the bitcast or constant in the loop.
2747 MachineInstrSpan Span(MII, &B.getMBB());
2748
2749 // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1).
2750 auto IdxLo = B.buildShl(S32, BaseIdxReg, One);
2751 auto IdxHi = B.buildAdd(S32, IdxLo, One);
2752
2753 auto Extract0 = B.buildExtractVectorElement(DstRegs[0], CastSrc, IdxLo);
2754 auto Extract1 = B.buildExtractVectorElement(DstRegs[1], CastSrc, IdxHi);
2755
2756 MRI.setRegBank(DstReg, *DstBank);
2757 MRI.setRegBank(CastSrc.getReg(0), *SrcBank);
2758 MRI.setRegBank(One.getReg(0), AMDGPU::SGPRRegBank);
2759 MRI.setRegBank(IdxLo.getReg(0), AMDGPU::SGPRRegBank);
2760 MRI.setRegBank(IdxHi.getReg(0), AMDGPU::SGPRRegBank);
2761
2762 SmallSet<Register, 4> OpsToWaterfall;
2763 if (!collectWaterfallOperands(OpsToWaterfall, MI, MRI, { 2 })) {
2764 MI.eraseFromParent();
2765 return;
2766 }
2767
2768 // Remove the original instruction to avoid potentially confusing the
2769 // waterfall loop logic.
2770 B.setInstr(*Span.begin());
2771 MI.eraseFromParent();
2772 executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()),
2773 OpsToWaterfall, MRI);
2774
2775 if (NeedCopyToVGPR) {
2776 MachineBasicBlock *LoopBB = Extract1->getParent();
2777 Register TmpReg0 = MRI.createGenericVirtualRegister(S32);
2778 Register TmpReg1 = MRI.createGenericVirtualRegister(S32);
2779 MRI.setRegBank(TmpReg0, AMDGPU::SGPRRegBank);
2780 MRI.setRegBank(TmpReg1, AMDGPU::SGPRRegBank);
2781
2782 Extract0->getOperand(0).setReg(TmpReg0);
2783 Extract1->getOperand(0).setReg(TmpReg1);
2784
2785 B.setInsertPt(*LoopBB, ++Extract1->getIterator());
2786
2787 buildVCopy(B, DstRegs[0], TmpReg0);
2788 buildVCopy(B, DstRegs[1], TmpReg1);
2789 }
2790
2791 if (ShouldMoveIndexIntoLoop)
2792 reinsertVectorIndexAdd(B, *IdxLo, 1, ConstOffset);
2793
2794 return;
2795 }
2796 case AMDGPU::G_INSERT_VECTOR_ELT: {
2797 SmallVector<Register, 2> InsRegs(OpdMapper.getVRegs(2));
2798
2799 Register DstReg = MI.getOperand(0).getReg();
2800 LLT VecTy = MRI.getType(DstReg);
2801
2802 assert(OpdMapper.getVRegs(0).empty());
2803 assert(OpdMapper.getVRegs(3).empty());
2804
2805 if (substituteSimpleCopyRegs(OpdMapper, 1))
2806 MRI.setType(MI.getOperand(1).getReg(), VecTy);
2807
2808 if (foldInsertEltToCmpSelect(MI, MRI, OpdMapper))
2809 return;
2810
2811 const RegisterBank *IdxBank =
2812 OpdMapper.getInstrMapping().getOperandMapping(3).BreakDown[0].RegBank;
2813
2814 Register SrcReg = MI.getOperand(1).getReg();
2815 Register InsReg = MI.getOperand(2).getReg();
2816 LLT InsTy = MRI.getType(InsReg);
2817 (void)InsTy;
2818
2819 Register BaseIdxReg;
2820 unsigned ConstOffset;
2821 std::tie(BaseIdxReg, ConstOffset) =
2822 AMDGPU::getBaseWithConstantOffset(MRI, MI.getOperand(3).getReg());
2823
2824 // See if the index is an add of a constant which will be foldable by moving
2825 // the base register of the index later if this is going to be executed in a
2826 // waterfall loop. This is essentially to reassociate the add of a constant
2827 // with the readfirstlane.
2828 bool ShouldMoveIndexIntoLoop = IdxBank != &AMDGPU::SGPRRegBank &&
2829 ConstOffset > 0 &&
2830 ConstOffset < VecTy.getNumElements();
2831
2832 // Move the base register. We'll re-insert the add later.
2833 if (ShouldMoveIndexIntoLoop)
2834 MI.getOperand(3).setReg(BaseIdxReg);
2835
2836
2837 if (InsRegs.empty()) {
2838 executeInWaterfallLoop(MI, MRI, { 3 });
2839
2840 // Re-insert the constant offset add inside the waterfall loop.
2841 if (ShouldMoveIndexIntoLoop) {
2842 MachineIRBuilder B(MI);
2843 reinsertVectorIndexAdd(B, MI, 3, ConstOffset);
2844 }
2845
2846 return;
2847 }
2848
2849
2850 assert(InsTy.getSizeInBits() == 64);
2851
2852 const LLT S32 = LLT::scalar(32);
2853 LLT Vec32 = LLT::fixed_vector(2 * VecTy.getNumElements(), 32);
2854
2855 MachineIRBuilder B(MI);
2856 auto CastSrc = B.buildBitcast(Vec32, SrcReg);
2857 auto One = B.buildConstant(S32, 1);
2858
2859 // Split the vector index into 32-bit pieces. Prepare to move all of the
2860 // new instructions into a waterfall loop if necessary.
2861 //
2862 // Don't put the bitcast or constant in the loop.
2863 MachineInstrSpan Span(MachineBasicBlock::iterator(&MI), &B.getMBB());
2864
2865 // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1).
2866 auto IdxLo = B.buildShl(S32, BaseIdxReg, One);
2867 auto IdxHi = B.buildAdd(S32, IdxLo, One);
2868
2869 auto InsLo = B.buildInsertVectorElement(Vec32, CastSrc, InsRegs[0], IdxLo);
2870 auto InsHi = B.buildInsertVectorElement(Vec32, InsLo, InsRegs[1], IdxHi);
2871
2872 const RegisterBank *DstBank =
2873 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2874 const RegisterBank *SrcBank =
2875 OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
2876 const RegisterBank *InsSrcBank =
2877 OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
2878
2879 MRI.setRegBank(InsReg, *InsSrcBank);
2880 MRI.setRegBank(CastSrc.getReg(0), *SrcBank);
2881 MRI.setRegBank(InsLo.getReg(0), *DstBank);
2882 MRI.setRegBank(InsHi.getReg(0), *DstBank);
2883 MRI.setRegBank(One.getReg(0), AMDGPU::SGPRRegBank);
2884 MRI.setRegBank(IdxLo.getReg(0), AMDGPU::SGPRRegBank);
2885 MRI.setRegBank(IdxHi.getReg(0), AMDGPU::SGPRRegBank);
2886
2887
2888 SmallSet<Register, 4> OpsToWaterfall;
2889 if (!collectWaterfallOperands(OpsToWaterfall, MI, MRI, { 3 })) {
2890 B.setInsertPt(B.getMBB(), MI);
2891 B.buildBitcast(DstReg, InsHi);
2892 MI.eraseFromParent();
2893 return;
2894 }
2895
2896 B.setInstr(*Span.begin());
2897 MI.eraseFromParent();
2898
2899 // Figure out the point after the waterfall loop before mangling the control
2900 // flow.
2901 executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()),
2902 OpsToWaterfall, MRI);
2903
2904 // The insertion point is now right after the original instruction.
2905 //
2906 // Keep the bitcast to the original vector type out of the loop. Doing this
2907 // saved an extra phi we don't need inside the loop.
2908 B.buildBitcast(DstReg, InsHi);
2909
2910 // Re-insert the constant offset add inside the waterfall loop.
2911 if (ShouldMoveIndexIntoLoop)
2912 reinsertVectorIndexAdd(B, *IdxLo, 1, ConstOffset);
2913
2914 return;
2915 }
2916 case AMDGPU::G_AMDGPU_BUFFER_LOAD:
2917 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
2918 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT:
2919 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
2920 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE:
2921 case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT:
2922 case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16:
2923 case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT:
2924 case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16:
2925 case AMDGPU::G_AMDGPU_BUFFER_STORE:
2926 case AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE:
2927 case AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT:
2928 case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT:
2929 case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16:
2930 case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT:
2931 case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16: {
2932 applyDefaultMapping(OpdMapper);
2933 executeInWaterfallLoop(MI, MRI, {1, 4});
2934 return;
2935 }
2936 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP:
2937 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD:
2938 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB:
2939 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN:
2940 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN:
2941 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX:
2942 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX:
2943 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND:
2944 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR:
2945 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR:
2946 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC:
2947 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC: {
2948 applyDefaultMapping(OpdMapper);
2949 executeInWaterfallLoop(MI, MRI, {2, 5});
2950 return;
2951 }
2952 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD:
2953 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN:
2954 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX: {
2955 applyDefaultMapping(OpdMapper);
2956 executeInWaterfallLoop(MI, MRI, {2, 5});
2957 return;
2958 }
2959 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP: {
2960 applyDefaultMapping(OpdMapper);
2961 executeInWaterfallLoop(MI, MRI, {3, 6});
2962 return;
2963 }
2964 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD: {
2965 applyMappingSBufferLoad(OpdMapper);
2966 return;
2967 }
2968 case AMDGPU::G_INTRINSIC: {
2969 switch (MI.getIntrinsicID()) {
2970 case Intrinsic::amdgcn_readlane: {
2971 substituteSimpleCopyRegs(OpdMapper, 2);
2972
2973 assert(OpdMapper.getVRegs(0).empty());
2974 assert(OpdMapper.getVRegs(3).empty());
2975
2976 // Make sure the index is an SGPR. It doesn't make sense to run this in a
2977 // waterfall loop, so assume it's a uniform value.
2978 constrainOpWithReadfirstlane(MI, MRI, 3); // Index
2979 return;
2980 }
2981 case Intrinsic::amdgcn_writelane: {
2982 assert(OpdMapper.getVRegs(0).empty());
2983 assert(OpdMapper.getVRegs(2).empty());
2984 assert(OpdMapper.getVRegs(3).empty());
2985
2986 substituteSimpleCopyRegs(OpdMapper, 4); // VGPR input val
2987 constrainOpWithReadfirstlane(MI, MRI, 2); // Source value
2988 constrainOpWithReadfirstlane(MI, MRI, 3); // Index
2989 return;
2990 }
2991 case Intrinsic::amdgcn_interp_p1:
2992 case Intrinsic::amdgcn_interp_p2:
2993 case Intrinsic::amdgcn_interp_mov:
2994 case Intrinsic::amdgcn_interp_p1_f16:
2995 case Intrinsic::amdgcn_interp_p2_f16: {
2996 applyDefaultMapping(OpdMapper);
2997
2998 // Readlane for m0 value, which is always the last operand.
2999 // FIXME: Should this be a waterfall loop instead?
3000 constrainOpWithReadfirstlane(MI, MRI, MI.getNumOperands() - 1); // Index
3001 return;
3002 }
3003 case Intrinsic::amdgcn_permlane16:
3004 case Intrinsic::amdgcn_permlanex16: {
3005 // Doing a waterfall loop over these wouldn't make any sense.
3006 substituteSimpleCopyRegs(OpdMapper, 2);
3007 substituteSimpleCopyRegs(OpdMapper, 3);
3008 constrainOpWithReadfirstlane(MI, MRI, 4);
3009 constrainOpWithReadfirstlane(MI, MRI, 5);
3010 return;
3011 }
3012 case Intrinsic::amdgcn_sbfe:
3013 applyMappingBFE(OpdMapper, true);
3014 return;
3015 case Intrinsic::amdgcn_ubfe:
3016 applyMappingBFE(OpdMapper, false);
3017 return;
3018 case Intrinsic::amdgcn_ballot:
3019 // Use default handling and insert copy to vcc source.
3020 break;
3021 }
3022 break;
3023 }
3024 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
3025 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE: {
3026 const AMDGPU::RsrcIntrinsic *RSrcIntrin
3027 = AMDGPU::lookupRsrcIntrinsic(MI.getIntrinsicID());
3028 assert(RSrcIntrin && RSrcIntrin->IsImage);
3029 // Non-images can have complications from operands that allow both SGPR
3030 // and VGPR. For now it's too complicated to figure out the final opcode
3031 // to derive the register bank from the MCInstrDesc.
3032 applyMappingImage(MI, OpdMapper, MRI, RSrcIntrin->RsrcArg);
3033 return;
3034 }
3035 case AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY: {
3036 unsigned N = MI.getNumExplicitOperands() - 2;
3037 executeInWaterfallLoop(MI, MRI, { N });
3038 return;
3039 }
3040 case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: {
3041 auto IntrID = MI.getIntrinsicID();
3042 switch (IntrID) {
3043 case Intrinsic::amdgcn_ds_ordered_add:
3044 case Intrinsic::amdgcn_ds_ordered_swap: {
3045 // This is only allowed to execute with 1 lane, so readfirstlane is safe.
3046 assert(OpdMapper.getVRegs(0).empty());
3047 substituteSimpleCopyRegs(OpdMapper, 3);
3048 constrainOpWithReadfirstlane(MI, MRI, 2); // M0
3049 return;
3050 }
3051 case Intrinsic::amdgcn_ds_gws_init:
3052 case Intrinsic::amdgcn_ds_gws_barrier:
3053 case Intrinsic::amdgcn_ds_gws_sema_br: {
3054 // Only the first lane is executes, so readfirstlane is safe.
3055 substituteSimpleCopyRegs(OpdMapper, 1);
3056 constrainOpWithReadfirstlane(MI, MRI, 2); // M0
3057 return;
3058 }
3059 case Intrinsic::amdgcn_ds_gws_sema_v:
3060 case Intrinsic::amdgcn_ds_gws_sema_p:
3061 case Intrinsic::amdgcn_ds_gws_sema_release_all: {
3062 // Only the first lane is executes, so readfirstlane is safe.
3063 constrainOpWithReadfirstlane(MI, MRI, 1); // M0
3064 return;
3065 }
3066 case Intrinsic::amdgcn_ds_append:
3067 case Intrinsic::amdgcn_ds_consume: {
3068 constrainOpWithReadfirstlane(MI, MRI, 2); // M0
3069 return;
3070 }
3071 case Intrinsic::amdgcn_s_sendmsg:
3072 case Intrinsic::amdgcn_s_sendmsghalt: {
3073 // FIXME: Should this use a waterfall loop?
3074 constrainOpWithReadfirstlane(MI, MRI, 2); // M0
3075 return;
3076 }
3077 case Intrinsic::amdgcn_s_setreg: {
3078 constrainOpWithReadfirstlane(MI, MRI, 2);
3079 return;
3080 }
3081 default: {
3082 if (const AMDGPU::RsrcIntrinsic *RSrcIntrin =
3083 AMDGPU::lookupRsrcIntrinsic(IntrID)) {
3084 // Non-images can have complications from operands that allow both SGPR
3085 // and VGPR. For now it's too complicated to figure out the final opcode
3086 // to derive the register bank from the MCInstrDesc.
3087 if (RSrcIntrin->IsImage) {
3088 applyMappingImage(MI, OpdMapper, MRI, RSrcIntrin->RsrcArg);
3089 return;
3090 }
3091 }
3092
3093 break;
3094 }
3095 }
3096 break;
3097 }
3098 case AMDGPU::G_LOAD:
3099 case AMDGPU::G_ZEXTLOAD:
3100 case AMDGPU::G_SEXTLOAD: {
3101 if (applyMappingLoad(MI, OpdMapper, MRI))
3102 return;
3103 break;
3104 }
3105 case AMDGPU::G_DYN_STACKALLOC:
3106 applyMappingDynStackAlloc(MI, OpdMapper, MRI);
3107 return;
3108 case AMDGPU::G_SBFX:
3109 applyMappingBFE(OpdMapper, /*Signed*/ true);
3110 return;
3111 case AMDGPU::G_UBFX:
3112 applyMappingBFE(OpdMapper, /*Signed*/ false);
3113 return;
3114 default:
3115 break;
3116 }
3117
3118 return applyDefaultMapping(OpdMapper);
3119 }
3120
3121 // Union of two register banks (symmetric; an invalid bank is the identity):
3122 //   sgpr, sgpr -> sgpr
3123 //   agpr, agpr -> agpr
3124 //   vgpr, sgpr -> vgpr
3124 //   vgpr, agpr -> vgpr
3124 //   agpr, sgpr -> vgpr
regBankUnion(unsigned RB0,unsigned RB1)3125 static unsigned regBankUnion(unsigned RB0, unsigned RB1) {
3126 if (RB0 == AMDGPU::InvalidRegBankID)
3127 return RB1;
3128 if (RB1 == AMDGPU::InvalidRegBankID)
3129 return RB0;
3130
3131 if (RB0 == AMDGPU::SGPRRegBankID && RB1 == AMDGPU::SGPRRegBankID)
3132 return AMDGPU::SGPRRegBankID;
3133
3134 if (RB0 == AMDGPU::AGPRRegBankID && RB1 == AMDGPU::AGPRRegBankID)
3135 return AMDGPU::AGPRRegBankID;
3136
3137 return AMDGPU::VGPRRegBankID;
3138 }
3139
// Union of two register banks for a boolean (s1) value. An invalid bank is
// the identity; VCC is absorbing; otherwise defer to regBankUnion.
static unsigned regBankBoolUnion(unsigned RB0, unsigned RB1) {
  if (RB0 == AMDGPU::InvalidRegBankID)
    return RB1;
  if (RB1 == AMDGPU::InvalidRegBankID)
    return RB0;

  // vcc, vcc -> vcc
  // vcc, sgpr -> vcc
  // vcc, vgpr -> vcc
  if (RB0 == AMDGPU::VCCRegBankID || RB1 == AMDGPU::VCCRegBankID)
    return AMDGPU::VCCRegBankID;

  // Neither side is VCC, so fall back to the ordinary bank-union rules.
  return regBankUnion(RB0, RB1);
}
3155
getMappingType(const MachineRegisterInfo & MRI,const MachineInstr & MI) const3156 unsigned AMDGPURegisterBankInfo::getMappingType(const MachineRegisterInfo &MRI,
3157 const MachineInstr &MI) const {
3158 unsigned RegBank = AMDGPU::InvalidRegBankID;
3159
3160 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
3161 if (!MI.getOperand(i).isReg())
3162 continue;
3163 Register Reg = MI.getOperand(i).getReg();
3164 if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) {
3165 RegBank = regBankUnion(RegBank, Bank->getID());
3166 if (RegBank == AMDGPU::VGPRRegBankID)
3167 break;
3168 }
3169 }
3170
3171 return RegBank;
3172 }
3173
isSALUMapping(const MachineInstr & MI) const3174 bool AMDGPURegisterBankInfo::isSALUMapping(const MachineInstr &MI) const {
3175 const MachineFunction &MF = *MI.getParent()->getParent();
3176 const MachineRegisterInfo &MRI = MF.getRegInfo();
3177 for (unsigned i = 0, e = MI.getNumOperands();i != e; ++i) {
3178 if (!MI.getOperand(i).isReg())
3179 continue;
3180 Register Reg = MI.getOperand(i).getReg();
3181 if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) {
3182 if (Bank->getID() != AMDGPU::SGPRRegBankID)
3183 return false;
3184 }
3185 }
3186 return true;
3187 }
3188
3189 const RegisterBankInfo::InstructionMapping &
getDefaultMappingSOP(const MachineInstr & MI) const3190 AMDGPURegisterBankInfo::getDefaultMappingSOP(const MachineInstr &MI) const {
3191 const MachineFunction &MF = *MI.getParent()->getParent();
3192 const MachineRegisterInfo &MRI = MF.getRegInfo();
3193 SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
3194
3195 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
3196 const MachineOperand &SrcOp = MI.getOperand(i);
3197 if (!SrcOp.isReg())
3198 continue;
3199
3200 unsigned Size = getSizeInBits(SrcOp.getReg(), MRI, *TRI);
3201 OpdsMapping[i] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
3202 }
3203 return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
3204 MI.getNumOperands());
3205 }
3206
3207 const RegisterBankInfo::InstructionMapping &
getDefaultMappingVOP(const MachineInstr & MI) const3208 AMDGPURegisterBankInfo::getDefaultMappingVOP(const MachineInstr &MI) const {
3209 const MachineFunction &MF = *MI.getParent()->getParent();
3210 const MachineRegisterInfo &MRI = MF.getRegInfo();
3211 SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
3212
3213 // Even though we technically could use SGPRs, this would require knowledge of
3214 // the constant bus restriction. Force all sources to VGPR (except for VCC).
3215 //
3216 // TODO: Unary ops are trivially OK, so accept SGPRs?
3217 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
3218 const MachineOperand &Src = MI.getOperand(i);
3219 if (!Src.isReg())
3220 continue;
3221
3222 unsigned Size = getSizeInBits(Src.getReg(), MRI, *TRI);
3223 unsigned BankID = Size == 1 ? AMDGPU::VCCRegBankID : AMDGPU::VGPRRegBankID;
3224 OpdsMapping[i] = AMDGPU::getValueMapping(BankID, Size);
3225 }
3226
3227 return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
3228 MI.getNumOperands());
3229 }
3230
3231 const RegisterBankInfo::InstructionMapping &
getDefaultMappingAllVGPR(const MachineInstr & MI) const3232 AMDGPURegisterBankInfo::getDefaultMappingAllVGPR(const MachineInstr &MI) const {
3233 const MachineFunction &MF = *MI.getParent()->getParent();
3234 const MachineRegisterInfo &MRI = MF.getRegInfo();
3235 SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
3236
3237 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
3238 const MachineOperand &Op = MI.getOperand(I);
3239 if (!Op.isReg())
3240 continue;
3241
3242 unsigned Size = getSizeInBits(Op.getReg(), MRI, *TRI);
3243 OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3244 }
3245
3246 return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
3247 MI.getNumOperands());
3248 }
3249
3250 const RegisterBankInfo::InstructionMapping &
getImageMapping(const MachineRegisterInfo & MRI,const MachineInstr & MI,int RsrcIdx) const3251 AMDGPURegisterBankInfo::getImageMapping(const MachineRegisterInfo &MRI,
3252 const MachineInstr &MI,
3253 int RsrcIdx) const {
3254 // The reported argument index is relative to the IR intrinsic call arguments,
3255 // so we need to shift by the number of defs and the intrinsic ID.
3256 RsrcIdx += MI.getNumExplicitDefs() + 1;
3257
3258 const int NumOps = MI.getNumOperands();
3259 SmallVector<const ValueMapping *, 8> OpdsMapping(NumOps);
3260
3261 // TODO: Should packed/unpacked D16 difference be reported here as part of
3262 // the value mapping?
3263 for (int I = 0; I != NumOps; ++I) {
3264 if (!MI.getOperand(I).isReg())
3265 continue;
3266
3267 Register OpReg = MI.getOperand(I).getReg();
3268 // We replace some dead address operands with $noreg
3269 if (!OpReg)
3270 continue;
3271
3272 unsigned Size = getSizeInBits(OpReg, MRI, *TRI);
3273
3274 // FIXME: Probably need a new intrinsic register bank searchable table to
3275 // handle arbitrary intrinsics easily.
3276 //
3277 // If this has a sampler, it immediately follows rsrc.
3278 const bool MustBeSGPR = I == RsrcIdx || I == RsrcIdx + 1;
3279
3280 if (MustBeSGPR) {
3281 // If this must be an SGPR, so we must report whatever it is as legal.
3282 unsigned NewBank = getRegBankID(OpReg, MRI, AMDGPU::SGPRRegBankID);
3283 OpdsMapping[I] = AMDGPU::getValueMapping(NewBank, Size);
3284 } else {
3285 // Some operands must be VGPR, and these are easy to copy to.
3286 OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3287 }
3288 }
3289
3290 return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping), NumOps);
3291 }
3292
3293 /// Return the mapping for a pointer argument.
3294 const RegisterBankInfo::ValueMapping *
getValueMappingForPtr(const MachineRegisterInfo & MRI,Register PtrReg) const3295 AMDGPURegisterBankInfo::getValueMappingForPtr(const MachineRegisterInfo &MRI,
3296 Register PtrReg) const {
3297 LLT PtrTy = MRI.getType(PtrReg);
3298 unsigned Size = PtrTy.getSizeInBits();
3299 if (Subtarget.useFlatForGlobal() ||
3300 !AMDGPU::isFlatGlobalAddrSpace(PtrTy.getAddressSpace()))
3301 return AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3302
3303 // If we're using MUBUF instructions for global memory, an SGPR base register
3304 // is possible. Otherwise this needs to be a VGPR.
3305 const RegisterBank *PtrBank = getRegBank(PtrReg, MRI, *TRI);
3306 return AMDGPU::getValueMapping(PtrBank->getID(), Size);
3307 }
3308
3309 const RegisterBankInfo::InstructionMapping &
getInstrMappingForLoad(const MachineInstr & MI) const3310 AMDGPURegisterBankInfo::getInstrMappingForLoad(const MachineInstr &MI) const {
3311
3312 const MachineFunction &MF = *MI.getParent()->getParent();
3313 const MachineRegisterInfo &MRI = MF.getRegInfo();
3314 SmallVector<const ValueMapping*, 2> OpdsMapping(2);
3315 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
3316 Register PtrReg = MI.getOperand(1).getReg();
3317 LLT PtrTy = MRI.getType(PtrReg);
3318 unsigned AS = PtrTy.getAddressSpace();
3319 unsigned PtrSize = PtrTy.getSizeInBits();
3320
3321 const ValueMapping *ValMapping;
3322 const ValueMapping *PtrMapping;
3323
3324 const RegisterBank *PtrBank = getRegBank(PtrReg, MRI, *TRI);
3325
3326 if (PtrBank == &AMDGPU::SGPRRegBank && AMDGPU::isFlatGlobalAddrSpace(AS)) {
3327 if (isScalarLoadLegal(MI)) {
3328 // We have a uniform instruction so we want to use an SMRD load
3329 ValMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
3330 PtrMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize);
3331 } else {
3332 ValMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3333
3334 // If we're using MUBUF instructions for global memory, an SGPR base
3335 // register is possible. Otherwise this needs to be a VGPR.
3336 unsigned PtrBankID = Subtarget.useFlatForGlobal() ?
3337 AMDGPU::VGPRRegBankID : AMDGPU::SGPRRegBankID;
3338
3339 PtrMapping = AMDGPU::getValueMapping(PtrBankID, PtrSize);
3340 }
3341 } else {
3342 ValMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3343 PtrMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize);
3344 }
3345
3346 OpdsMapping[0] = ValMapping;
3347 OpdsMapping[1] = PtrMapping;
3348 const RegisterBankInfo::InstructionMapping &Mapping = getInstructionMapping(
3349 1, 1, getOperandsMapping(OpdsMapping), MI.getNumOperands());
3350 return Mapping;
3351
3352 // FIXME: Do we want to add a mapping for FLAT load, or should we just
3353 // handle that during instruction selection?
3354 }
3355
3356 unsigned
getRegBankID(Register Reg,const MachineRegisterInfo & MRI,unsigned Default) const3357 AMDGPURegisterBankInfo::getRegBankID(Register Reg,
3358 const MachineRegisterInfo &MRI,
3359 unsigned Default) const {
3360 const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
3361 return Bank ? Bank->getID() : Default;
3362 }
3363
3364 const RegisterBankInfo::ValueMapping *
getSGPROpMapping(Register Reg,const MachineRegisterInfo & MRI,const TargetRegisterInfo & TRI) const3365 AMDGPURegisterBankInfo::getSGPROpMapping(Register Reg,
3366 const MachineRegisterInfo &MRI,
3367 const TargetRegisterInfo &TRI) const {
3368 // Lie and claim anything is legal, even though this needs to be an SGPR
3369 // applyMapping will have to deal with it as a waterfall loop.
3370 unsigned Bank = getRegBankID(Reg, MRI, AMDGPU::SGPRRegBankID);
3371 unsigned Size = getSizeInBits(Reg, MRI, TRI);
3372 return AMDGPU::getValueMapping(Bank, Size);
3373 }
3374
3375 const RegisterBankInfo::ValueMapping *
getVGPROpMapping(Register Reg,const MachineRegisterInfo & MRI,const TargetRegisterInfo & TRI) const3376 AMDGPURegisterBankInfo::getVGPROpMapping(Register Reg,
3377 const MachineRegisterInfo &MRI,
3378 const TargetRegisterInfo &TRI) const {
3379 unsigned Size = getSizeInBits(Reg, MRI, TRI);
3380 return AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3381 }
3382
3383 const RegisterBankInfo::ValueMapping *
getAGPROpMapping(Register Reg,const MachineRegisterInfo & MRI,const TargetRegisterInfo & TRI) const3384 AMDGPURegisterBankInfo::getAGPROpMapping(Register Reg,
3385 const MachineRegisterInfo &MRI,
3386 const TargetRegisterInfo &TRI) const {
3387 unsigned Size = getSizeInBits(Reg, MRI, TRI);
3388 return AMDGPU::getValueMapping(AMDGPU::AGPRRegBankID, Size);
3389 }
3390
///
/// This function must return a legal mapping, because
/// AMDGPURegisterBankInfo::getInstrAlternativeMappings() is not called
/// in RegBankSelect::Mode::Fast. Any mapping that would cause a
/// VGPR-to-SGPR copy to be generated is illegal.
///
// Operands that must be SGPRs must accept potentially divergent VGPRs as
// legal. These will be dealt with in applyMappingImpl.
//
3400 const RegisterBankInfo::InstructionMapping &
getInstrMapping(const MachineInstr & MI) const3401 AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
3402 const MachineFunction &MF = *MI.getParent()->getParent();
3403 const MachineRegisterInfo &MRI = MF.getRegInfo();
3404
3405 if (MI.isCopy() || MI.getOpcode() == AMDGPU::G_FREEZE) {
3406 // The default logic bothers to analyze impossible alternative mappings. We
3407 // want the most straightforward mapping, so just directly handle this.
3408 const RegisterBank *DstBank = getRegBank(MI.getOperand(0).getReg(), MRI,
3409 *TRI);
3410 const RegisterBank *SrcBank = getRegBank(MI.getOperand(1).getReg(), MRI,
3411 *TRI);
3412 assert(SrcBank && "src bank should have been assigned already");
3413 if (!DstBank)
3414 DstBank = SrcBank;
3415
3416 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
3417 if (cannotCopy(*DstBank, *SrcBank, Size))
3418 return getInvalidInstructionMapping();
3419
3420 const ValueMapping &ValMap = getValueMapping(0, Size, *DstBank);
3421 unsigned OpdsMappingSize = MI.isCopy() ? 1 : 2;
3422 SmallVector<const ValueMapping *, 1> OpdsMapping(OpdsMappingSize);
3423 OpdsMapping[0] = &ValMap;
3424 if (MI.getOpcode() == AMDGPU::G_FREEZE)
3425 OpdsMapping[1] = &ValMap;
3426
3427 return getInstructionMapping(
3428 1, /*Cost*/ 1,
3429 /*OperandsMapping*/ getOperandsMapping(OpdsMapping), OpdsMappingSize);
3430 }
3431
3432 if (MI.isRegSequence()) {
3433 // If any input is a VGPR, the result must be a VGPR. The default handling
3434 // assumes any copy between banks is legal.
3435 unsigned BankID = AMDGPU::SGPRRegBankID;
3436
3437 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
3438 auto OpBank = getRegBankID(MI.getOperand(I).getReg(), MRI);
3439 // It doesn't make sense to use vcc or scc banks here, so just ignore
3440 // them.
3441 if (OpBank != AMDGPU::SGPRRegBankID) {
3442 BankID = AMDGPU::VGPRRegBankID;
3443 break;
3444 }
3445 }
3446 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
3447
3448 const ValueMapping &ValMap = getValueMapping(0, Size, getRegBank(BankID));
3449 return getInstructionMapping(
3450 1, /*Cost*/ 1,
3451 /*OperandsMapping*/ getOperandsMapping({&ValMap}), 1);
3452 }
3453
3454 // The default handling is broken and doesn't handle illegal SGPR->VGPR copies
3455 // properly.
3456 //
3457 // TODO: There are additional exec masking dependencies to analyze.
3458 if (MI.getOpcode() == TargetOpcode::G_PHI) {
3459 unsigned ResultBank = AMDGPU::InvalidRegBankID;
3460 Register DstReg = MI.getOperand(0).getReg();
3461
3462 // Sometimes the result may have already been assigned a bank.
3463 if (const RegisterBank *DstBank = getRegBank(DstReg, MRI, *TRI))
3464 ResultBank = DstBank->getID();
3465
3466 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
3467 Register Reg = MI.getOperand(I).getReg();
3468 const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
3469
3470 // FIXME: Assuming VGPR for any undetermined inputs.
3471 if (!Bank || Bank->getID() == AMDGPU::VGPRRegBankID) {
3472 ResultBank = AMDGPU::VGPRRegBankID;
3473 break;
3474 }
3475
3476 // FIXME: Need to promote SGPR case to s32
3477 unsigned OpBank = Bank->getID();
3478 ResultBank = regBankBoolUnion(ResultBank, OpBank);
3479 }
3480
3481 assert(ResultBank != AMDGPU::InvalidRegBankID);
3482
3483 unsigned Size = MRI.getType(DstReg).getSizeInBits();
3484
3485 const ValueMapping &ValMap =
3486 getValueMapping(0, Size, getRegBank(ResultBank));
3487 return getInstructionMapping(
3488 1, /*Cost*/ 1,
3489 /*OperandsMapping*/ getOperandsMapping({&ValMap}), 1);
3490 }
3491
3492 const RegisterBankInfo::InstructionMapping &Mapping = getInstrMappingImpl(MI);
3493 if (Mapping.isValid())
3494 return Mapping;
3495
3496 SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
3497
3498 switch (MI.getOpcode()) {
3499 default:
3500 return getInvalidInstructionMapping();
3501
3502 case AMDGPU::G_AND:
3503 case AMDGPU::G_OR:
3504 case AMDGPU::G_XOR: {
3505 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3506 if (Size == 1) {
3507 const RegisterBank *DstBank
3508 = getRegBank(MI.getOperand(0).getReg(), MRI, *TRI);
3509
3510 unsigned TargetBankID = AMDGPU::InvalidRegBankID;
3511 unsigned BankLHS = AMDGPU::InvalidRegBankID;
3512 unsigned BankRHS = AMDGPU::InvalidRegBankID;
3513 if (DstBank) {
3514 TargetBankID = DstBank->getID();
3515 if (DstBank == &AMDGPU::VCCRegBank) {
3516 TargetBankID = AMDGPU::VCCRegBankID;
3517 BankLHS = AMDGPU::VCCRegBankID;
3518 BankRHS = AMDGPU::VCCRegBankID;
3519 } else {
3520 BankLHS = getRegBankID(MI.getOperand(1).getReg(), MRI,
3521 AMDGPU::SGPRRegBankID);
3522 BankRHS = getRegBankID(MI.getOperand(2).getReg(), MRI,
3523 AMDGPU::SGPRRegBankID);
3524 }
3525 } else {
3526 BankLHS = getRegBankID(MI.getOperand(1).getReg(), MRI,
3527 AMDGPU::VCCRegBankID);
3528 BankRHS = getRegBankID(MI.getOperand(2).getReg(), MRI,
3529 AMDGPU::VCCRegBankID);
3530
3531 // Both inputs should be true booleans to produce a boolean result.
3532 if (BankLHS == AMDGPU::VGPRRegBankID || BankRHS == AMDGPU::VGPRRegBankID) {
3533 TargetBankID = AMDGPU::VGPRRegBankID;
3534 } else if (BankLHS == AMDGPU::VCCRegBankID || BankRHS == AMDGPU::VCCRegBankID) {
3535 TargetBankID = AMDGPU::VCCRegBankID;
3536 BankLHS = AMDGPU::VCCRegBankID;
3537 BankRHS = AMDGPU::VCCRegBankID;
3538 } else if (BankLHS == AMDGPU::SGPRRegBankID && BankRHS == AMDGPU::SGPRRegBankID) {
3539 TargetBankID = AMDGPU::SGPRRegBankID;
3540 }
3541 }
3542
3543 OpdsMapping[0] = AMDGPU::getValueMapping(TargetBankID, Size);
3544 OpdsMapping[1] = AMDGPU::getValueMapping(BankLHS, Size);
3545 OpdsMapping[2] = AMDGPU::getValueMapping(BankRHS, Size);
3546 break;
3547 }
3548
3549 if (Size == 64) {
3550
3551 if (isSALUMapping(MI)) {
3552 OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::SGPRRegBankID, Size);
3553 OpdsMapping[1] = OpdsMapping[2] = OpdsMapping[0];
3554 } else {
3555 OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size);
3556 unsigned Bank1 = getRegBankID(MI.getOperand(1).getReg(), MRI /*, DefaultBankID*/);
3557 OpdsMapping[1] = AMDGPU::getValueMapping(Bank1, Size);
3558
3559 unsigned Bank2 = getRegBankID(MI.getOperand(2).getReg(), MRI /*, DefaultBankID*/);
3560 OpdsMapping[2] = AMDGPU::getValueMapping(Bank2, Size);
3561 }
3562
3563 break;
3564 }
3565
3566 LLVM_FALLTHROUGH;
3567 }
3568 case AMDGPU::G_PTR_ADD:
3569 case AMDGPU::G_PTRMASK:
3570 case AMDGPU::G_ADD:
3571 case AMDGPU::G_SUB:
3572 case AMDGPU::G_MUL:
3573 case AMDGPU::G_SHL:
3574 case AMDGPU::G_LSHR:
3575 case AMDGPU::G_ASHR:
3576 case AMDGPU::G_UADDO:
3577 case AMDGPU::G_USUBO:
3578 case AMDGPU::G_UADDE:
3579 case AMDGPU::G_SADDE:
3580 case AMDGPU::G_USUBE:
3581 case AMDGPU::G_SSUBE:
3582 case AMDGPU::G_SMIN:
3583 case AMDGPU::G_SMAX:
3584 case AMDGPU::G_UMIN:
3585 case AMDGPU::G_UMAX:
3586 case AMDGPU::G_ABS:
3587 case AMDGPU::G_SHUFFLE_VECTOR:
3588 case AMDGPU::G_SBFX:
3589 case AMDGPU::G_UBFX:
3590 if (isSALUMapping(MI))
3591 return getDefaultMappingSOP(MI);
3592 LLVM_FALLTHROUGH;
3593
3594 case AMDGPU::G_SADDSAT: // FIXME: Could lower sat ops for SALU
3595 case AMDGPU::G_SSUBSAT:
3596 case AMDGPU::G_UADDSAT:
3597 case AMDGPU::G_USUBSAT:
3598 case AMDGPU::G_FADD:
3599 case AMDGPU::G_FSUB:
3600 case AMDGPU::G_FPTOSI:
3601 case AMDGPU::G_FPTOUI:
3602 case AMDGPU::G_FMUL:
3603 case AMDGPU::G_FMA:
3604 case AMDGPU::G_FMAD:
3605 case AMDGPU::G_FSQRT:
3606 case AMDGPU::G_FFLOOR:
3607 case AMDGPU::G_FCEIL:
3608 case AMDGPU::G_FRINT:
3609 case AMDGPU::G_SITOFP:
3610 case AMDGPU::G_UITOFP:
3611 case AMDGPU::G_FPTRUNC:
3612 case AMDGPU::G_FPEXT:
3613 case AMDGPU::G_FEXP2:
3614 case AMDGPU::G_FLOG2:
3615 case AMDGPU::G_FMINNUM:
3616 case AMDGPU::G_FMAXNUM:
3617 case AMDGPU::G_FMINNUM_IEEE:
3618 case AMDGPU::G_FMAXNUM_IEEE:
3619 case AMDGPU::G_FCANONICALIZE:
3620 case AMDGPU::G_INTRINSIC_TRUNC:
3621 case AMDGPU::G_BSWAP: // TODO: Somehow expand for scalar?
3622 case AMDGPU::G_FSHR: // TODO: Expand for scalar
3623 case AMDGPU::G_AMDGPU_FFBH_U32:
3624 case AMDGPU::G_AMDGPU_FMIN_LEGACY:
3625 case AMDGPU::G_AMDGPU_FMAX_LEGACY:
3626 case AMDGPU::G_AMDGPU_RCP_IFLAG:
3627 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0:
3628 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1:
3629 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2:
3630 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
3631 case AMDGPU::G_AMDGPU_CVT_PK_I16_I32:
3632 case AMDGPU::G_AMDGPU_SMED3:
3633 return getDefaultMappingVOP(MI);
3634 case AMDGPU::G_UMULH:
3635 case AMDGPU::G_SMULH: {
3636 if (Subtarget.hasScalarMulHiInsts() && isSALUMapping(MI))
3637 return getDefaultMappingSOP(MI);
3638 return getDefaultMappingVOP(MI);
3639 }
3640 case AMDGPU::G_IMPLICIT_DEF: {
3641 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3642 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
3643 break;
3644 }
3645 case AMDGPU::G_FCONSTANT:
3646 case AMDGPU::G_CONSTANT:
3647 case AMDGPU::G_GLOBAL_VALUE:
3648 case AMDGPU::G_BLOCK_ADDR:
3649 case AMDGPU::G_READCYCLECOUNTER: {
3650 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3651 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
3652 break;
3653 }
3654 case AMDGPU::G_FRAME_INDEX: {
3655 // TODO: This should be the same as other constants, but eliminateFrameIndex
3656 // currently assumes VALU uses.
3657 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3658 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3659 break;
3660 }
3661 case AMDGPU::G_DYN_STACKALLOC: {
3662 // Result is always uniform, and a wave reduction is needed for the source.
3663 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
3664 unsigned SrcBankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
3665 OpdsMapping[1] = AMDGPU::getValueMapping(SrcBankID, 32);
3666 break;
3667 }
3668 case AMDGPU::G_INSERT: {
3669 unsigned BankID = getMappingType(MRI, MI);
3670 unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
3671 unsigned SrcSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
3672 unsigned EltSize = getSizeInBits(MI.getOperand(2).getReg(), MRI, *TRI);
3673 OpdsMapping[0] = AMDGPU::getValueMapping(BankID, DstSize);
3674 OpdsMapping[1] = AMDGPU::getValueMapping(BankID, SrcSize);
3675 OpdsMapping[2] = AMDGPU::getValueMapping(BankID, EltSize);
3676 OpdsMapping[3] = nullptr;
3677 break;
3678 }
3679 case AMDGPU::G_EXTRACT: {
3680 unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
3681 unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
3682 unsigned SrcSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
3683 OpdsMapping[0] = AMDGPU::getValueMapping(BankID, DstSize);
3684 OpdsMapping[1] = AMDGPU::getValueMapping(BankID, SrcSize);
3685 OpdsMapping[2] = nullptr;
3686 break;
3687 }
3688 case AMDGPU::G_BUILD_VECTOR:
3689 case AMDGPU::G_BUILD_VECTOR_TRUNC: {
3690 LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
3691 if (DstTy == LLT::fixed_vector(2, 16)) {
3692 unsigned DstSize = DstTy.getSizeInBits();
3693 unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
3694 unsigned Src0BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
3695 unsigned Src1BankID = getRegBankID(MI.getOperand(2).getReg(), MRI);
3696 unsigned DstBankID = regBankUnion(Src0BankID, Src1BankID);
3697
3698 OpdsMapping[0] = AMDGPU::getValueMapping(DstBankID, DstSize);
3699 OpdsMapping[1] = AMDGPU::getValueMapping(Src0BankID, SrcSize);
3700 OpdsMapping[2] = AMDGPU::getValueMapping(Src1BankID, SrcSize);
3701 break;
3702 }
3703
3704 LLVM_FALLTHROUGH;
3705 }
3706 case AMDGPU::G_MERGE_VALUES:
3707 case AMDGPU::G_CONCAT_VECTORS: {
3708 unsigned Bank = getMappingType(MRI, MI);
3709 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3710 unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
3711
3712 OpdsMapping[0] = AMDGPU::getValueMapping(Bank, DstSize);
3713 // Op1 and Dst should use the same register bank.
3714 for (unsigned i = 1, e = MI.getNumOperands(); i != e; ++i)
3715 OpdsMapping[i] = AMDGPU::getValueMapping(Bank, SrcSize);
3716 break;
3717 }
3718 case AMDGPU::G_BITREVERSE:
3719 case AMDGPU::G_BITCAST:
3720 case AMDGPU::G_INTTOPTR:
3721 case AMDGPU::G_PTRTOINT:
3722 case AMDGPU::G_FABS:
3723 case AMDGPU::G_FNEG: {
3724 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3725 unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
3726 OpdsMapping[0] = OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size);
3727 break;
3728 }
3729 case AMDGPU::G_CTLZ_ZERO_UNDEF:
3730 case AMDGPU::G_CTTZ_ZERO_UNDEF:
3731 case AMDGPU::G_CTPOP: {
3732 unsigned Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
3733 unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
3734 OpdsMapping[0] = AMDGPU::getValueMapping(BankID, 32);
3735
3736 // This should really be getValueMappingSGPR64Only, but allowing the generic
3737 // code to handle the register split just makes using LegalizerHelper more
3738 // difficult.
3739 OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size);
3740 break;
3741 }
3742 case AMDGPU::G_TRUNC: {
3743 Register Dst = MI.getOperand(0).getReg();
3744 Register Src = MI.getOperand(1).getReg();
3745 unsigned Bank = getRegBankID(Src, MRI);
3746 unsigned DstSize = getSizeInBits(Dst, MRI, *TRI);
3747 unsigned SrcSize = getSizeInBits(Src, MRI, *TRI);
3748 OpdsMapping[0] = AMDGPU::getValueMapping(Bank, DstSize);
3749 OpdsMapping[1] = AMDGPU::getValueMapping(Bank, SrcSize);
3750 break;
3751 }
3752 case AMDGPU::G_ZEXT:
3753 case AMDGPU::G_SEXT:
3754 case AMDGPU::G_ANYEXT:
3755 case AMDGPU::G_SEXT_INREG: {
3756 Register Dst = MI.getOperand(0).getReg();
3757 Register Src = MI.getOperand(1).getReg();
3758 unsigned DstSize = getSizeInBits(Dst, MRI, *TRI);
3759 unsigned SrcSize = getSizeInBits(Src, MRI, *TRI);
3760
3761 unsigned DstBank;
3762 const RegisterBank *SrcBank = getRegBank(Src, MRI, *TRI);
3763 assert(SrcBank);
3764 switch (SrcBank->getID()) {
3765 case AMDGPU::SGPRRegBankID:
3766 DstBank = AMDGPU::SGPRRegBankID;
3767 break;
3768 default:
3769 DstBank = AMDGPU::VGPRRegBankID;
3770 break;
3771 }
3772
3773 // Scalar extend can use 64-bit BFE, but VGPRs require extending to
3774 // 32-bits, and then to 64.
3775 OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(DstBank, DstSize);
3776 OpdsMapping[1] = AMDGPU::getValueMappingSGPR64Only(SrcBank->getID(),
3777 SrcSize);
3778 break;
3779 }
3780 case AMDGPU::G_FCMP: {
3781 unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
3782 unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI);
3783 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
3784 OpdsMapping[1] = nullptr; // Predicate Operand.
3785 OpdsMapping[2] = AMDGPU::getValueMapping(Op2Bank, Size);
3786 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3787 break;
3788 }
3789 case AMDGPU::G_STORE: {
3790 assert(MI.getOperand(0).isReg());
3791 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3792
3793 // FIXME: We need to specify a different reg bank once scalar stores are
3794 // supported.
3795 const ValueMapping *ValMapping =
3796 AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3797 OpdsMapping[0] = ValMapping;
3798 OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg());
3799 break;
3800 }
3801 case AMDGPU::G_ICMP: {
3802 auto Pred = static_cast<CmpInst::Predicate>(MI.getOperand(1).getPredicate());
3803 unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
3804
3805 // See if the result register has already been constrained to vcc, which may
3806 // happen due to control flow intrinsic lowering.
3807 unsigned DstBank = getRegBankID(MI.getOperand(0).getReg(), MRI,
3808 AMDGPU::SGPRRegBankID);
3809 unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI);
3810 unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI);
3811
3812 bool CanUseSCC = DstBank == AMDGPU::SGPRRegBankID &&
3813 Op2Bank == AMDGPU::SGPRRegBankID &&
3814 Op3Bank == AMDGPU::SGPRRegBankID &&
3815 (Size == 32 || (Size == 64 &&
3816 (Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE) &&
3817 Subtarget.hasScalarCompareEq64()));
3818
3819 DstBank = CanUseSCC ? AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID;
3820 unsigned SrcBank = CanUseSCC ? AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
3821
3822 // TODO: Use 32-bit for scalar output size.
3823 // SCC results will need to be copied to a 32-bit SGPR virtual register.
3824 const unsigned ResultSize = 1;
3825
3826 OpdsMapping[0] = AMDGPU::getValueMapping(DstBank, ResultSize);
3827 OpdsMapping[2] = AMDGPU::getValueMapping(SrcBank, Size);
3828 OpdsMapping[3] = AMDGPU::getValueMapping(SrcBank, Size);
3829 break;
3830 }
3831 case AMDGPU::G_EXTRACT_VECTOR_ELT: {
3832 // VGPR index can be used for waterfall when indexing a SGPR vector.
3833 unsigned SrcBankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
3834 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3835 unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
3836 unsigned IdxSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
3837 unsigned IdxBank = getRegBankID(MI.getOperand(2).getReg(), MRI);
3838 unsigned OutputBankID = regBankUnion(SrcBankID, IdxBank);
3839
3840 OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(OutputBankID, DstSize);
3841 OpdsMapping[1] = AMDGPU::getValueMapping(SrcBankID, SrcSize);
3842
3843 // The index can be either if the source vector is VGPR.
3844 OpdsMapping[2] = AMDGPU::getValueMapping(IdxBank, IdxSize);
3845 break;
3846 }
3847 case AMDGPU::G_INSERT_VECTOR_ELT: {
3848 unsigned OutputBankID = isSALUMapping(MI) ?
3849 AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
3850
3851 unsigned VecSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3852 unsigned InsertSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
3853 unsigned IdxSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits();
3854 unsigned InsertEltBankID = getRegBankID(MI.getOperand(2).getReg(), MRI);
3855 unsigned IdxBankID = getRegBankID(MI.getOperand(3).getReg(), MRI);
3856
3857 OpdsMapping[0] = AMDGPU::getValueMapping(OutputBankID, VecSize);
3858 OpdsMapping[1] = AMDGPU::getValueMapping(OutputBankID, VecSize);
3859
3860 // This is a weird case, because we need to break down the mapping based on
3861 // the register bank of a different operand.
3862 if (InsertSize == 64 && OutputBankID == AMDGPU::VGPRRegBankID) {
3863 OpdsMapping[2] = AMDGPU::getValueMappingSplit64(InsertEltBankID,
3864 InsertSize);
3865 } else {
3866 assert(InsertSize == 32 || InsertSize == 64);
3867 OpdsMapping[2] = AMDGPU::getValueMapping(InsertEltBankID, InsertSize);
3868 }
3869
3870 // The index can be either if the source vector is VGPR.
3871 OpdsMapping[3] = AMDGPU::getValueMapping(IdxBankID, IdxSize);
3872 break;
3873 }
3874 case AMDGPU::G_UNMERGE_VALUES: {
3875 unsigned Bank = getMappingType(MRI, MI);
3876
3877 // Op1 and Dst should use the same register bank.
3878 // FIXME: Shouldn't this be the default? Why do we need to handle this?
3879 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
3880 unsigned Size = getSizeInBits(MI.getOperand(i).getReg(), MRI, *TRI);
3881 OpdsMapping[i] = AMDGPU::getValueMapping(Bank, Size);
3882 }
3883 break;
3884 }
3885 case AMDGPU::G_AMDGPU_BUFFER_LOAD:
3886 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
3887 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE:
3888 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
3889 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT:
3890 case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT:
3891 case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16:
3892 case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT:
3893 case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16:
3894 case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT:
3895 case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16:
3896 case AMDGPU::G_AMDGPU_BUFFER_STORE:
3897 case AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE:
3898 case AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT:
3899 case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT:
3900 case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16: {
3901 OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
3902
3903 // rsrc
3904 OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
3905
3906 // vindex
3907 OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
3908
3909 // voffset
3910 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
3911
3912 // soffset
3913 OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
3914
3915 // Any remaining operands are immediates and were correctly null
3916 // initialized.
3917 break;
3918 }
3919 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP:
3920 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD:
3921 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB:
3922 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN:
3923 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN:
3924 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX:
3925 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX:
3926 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND:
3927 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR:
3928 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR:
3929 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC:
3930 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC:
3931 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD:
3932 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN:
3933 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX: {
3934 // vdata_out
3935 OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
3936
3937 // vdata_in
3938 OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
3939
3940 // rsrc
3941 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
3942
3943 // vindex
3944 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
3945
3946 // voffset
3947 OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
3948
3949 // soffset
3950 OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
3951
3952 // Any remaining operands are immediates and were correctly null
3953 // initialized.
3954 break;
3955 }
3956 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP: {
3957 // vdata_out
3958 OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
3959
3960 // vdata_in
3961 OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
3962
3963 // cmp
3964 OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
3965
3966 // rsrc
3967 OpdsMapping[3] = getSGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
3968
3969 // vindex
3970 OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
3971
3972 // voffset
3973 OpdsMapping[5] = getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
3974
3975 // soffset
3976 OpdsMapping[6] = getSGPROpMapping(MI.getOperand(6).getReg(), MRI, *TRI);
3977
3978 // Any remaining operands are immediates and were correctly null
3979 // initialized.
3980 break;
3981 }
3982 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD: {
3983 // Lie and claim everything is legal, even though some need to be
3984 // SGPRs. applyMapping will have to deal with it as a waterfall loop.
3985 OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
3986 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
3987
3988 // We need to convert this to a MUBUF if either the resource of offset is
3989 // VGPR.
3990 unsigned RSrcBank = OpdsMapping[1]->BreakDown[0].RegBank->getID();
3991 unsigned OffsetBank = OpdsMapping[2]->BreakDown[0].RegBank->getID();
3992 unsigned ResultBank = regBankUnion(RSrcBank, OffsetBank);
3993
3994 unsigned Size0 = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3995 OpdsMapping[0] = AMDGPU::getValueMapping(ResultBank, Size0);
3996 break;
3997 }
3998 case AMDGPU::G_INTRINSIC: {
3999 switch (MI.getIntrinsicID()) {
4000 default:
4001 return getInvalidInstructionMapping();
4002 case Intrinsic::amdgcn_div_fmas:
4003 case Intrinsic::amdgcn_div_fixup:
4004 case Intrinsic::amdgcn_trig_preop:
4005 case Intrinsic::amdgcn_sin:
4006 case Intrinsic::amdgcn_cos:
4007 case Intrinsic::amdgcn_log_clamp:
4008 case Intrinsic::amdgcn_rcp:
4009 case Intrinsic::amdgcn_rcp_legacy:
4010 case Intrinsic::amdgcn_sqrt:
4011 case Intrinsic::amdgcn_rsq:
4012 case Intrinsic::amdgcn_rsq_legacy:
4013 case Intrinsic::amdgcn_rsq_clamp:
4014 case Intrinsic::amdgcn_fmul_legacy:
4015 case Intrinsic::amdgcn_fma_legacy:
4016 case Intrinsic::amdgcn_ldexp:
4017 case Intrinsic::amdgcn_frexp_mant:
4018 case Intrinsic::amdgcn_frexp_exp:
4019 case Intrinsic::amdgcn_fract:
4020 case Intrinsic::amdgcn_cvt_pkrtz:
4021 case Intrinsic::amdgcn_cvt_pknorm_i16:
4022 case Intrinsic::amdgcn_cvt_pknorm_u16:
4023 case Intrinsic::amdgcn_cvt_pk_i16:
4024 case Intrinsic::amdgcn_cvt_pk_u16:
4025 case Intrinsic::amdgcn_fmed3:
4026 case Intrinsic::amdgcn_cubeid:
4027 case Intrinsic::amdgcn_cubema:
4028 case Intrinsic::amdgcn_cubesc:
4029 case Intrinsic::amdgcn_cubetc:
4030 case Intrinsic::amdgcn_sffbh:
4031 case Intrinsic::amdgcn_fmad_ftz:
4032 case Intrinsic::amdgcn_mbcnt_lo:
4033 case Intrinsic::amdgcn_mbcnt_hi:
4034 case Intrinsic::amdgcn_mul_u24:
4035 case Intrinsic::amdgcn_mul_i24:
4036 case Intrinsic::amdgcn_lerp:
4037 case Intrinsic::amdgcn_sad_u8:
4038 case Intrinsic::amdgcn_msad_u8:
4039 case Intrinsic::amdgcn_sad_hi_u8:
4040 case Intrinsic::amdgcn_sad_u16:
4041 case Intrinsic::amdgcn_qsad_pk_u16_u8:
4042 case Intrinsic::amdgcn_mqsad_pk_u16_u8:
4043 case Intrinsic::amdgcn_mqsad_u32_u8:
4044 case Intrinsic::amdgcn_cvt_pk_u8_f32:
4045 case Intrinsic::amdgcn_alignbit:
4046 case Intrinsic::amdgcn_alignbyte:
4047 case Intrinsic::amdgcn_perm:
4048 case Intrinsic::amdgcn_fdot2:
4049 case Intrinsic::amdgcn_sdot2:
4050 case Intrinsic::amdgcn_udot2:
4051 case Intrinsic::amdgcn_sdot4:
4052 case Intrinsic::amdgcn_udot4:
4053 case Intrinsic::amdgcn_sdot8:
4054 case Intrinsic::amdgcn_udot8:
4055 return getDefaultMappingVOP(MI);
4056 case Intrinsic::amdgcn_sbfe:
4057 case Intrinsic::amdgcn_ubfe:
4058 if (isSALUMapping(MI))
4059 return getDefaultMappingSOP(MI);
4060 return getDefaultMappingVOP(MI);
4061 case Intrinsic::amdgcn_ds_swizzle:
4062 case Intrinsic::amdgcn_ds_permute:
4063 case Intrinsic::amdgcn_ds_bpermute:
4064 case Intrinsic::amdgcn_update_dpp:
4065 case Intrinsic::amdgcn_mov_dpp8:
4066 case Intrinsic::amdgcn_mov_dpp:
4067 case Intrinsic::amdgcn_strict_wwm:
4068 case Intrinsic::amdgcn_wwm:
4069 case Intrinsic::amdgcn_strict_wqm:
4070 case Intrinsic::amdgcn_wqm:
4071 case Intrinsic::amdgcn_softwqm:
4072 case Intrinsic::amdgcn_set_inactive:
4073 return getDefaultMappingAllVGPR(MI);
4074 case Intrinsic::amdgcn_kernarg_segment_ptr:
4075 case Intrinsic::amdgcn_s_getpc:
4076 case Intrinsic::amdgcn_groupstaticsize:
4077 case Intrinsic::amdgcn_reloc_constant:
4078 case Intrinsic::returnaddress: {
4079 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4080 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
4081 break;
4082 }
4083 case Intrinsic::amdgcn_wqm_vote: {
4084 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4085 OpdsMapping[0] = OpdsMapping[2]
4086 = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size);
4087 break;
4088 }
4089 case Intrinsic::amdgcn_ps_live: {
4090 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
4091 break;
4092 }
4093 case Intrinsic::amdgcn_div_scale: {
4094 unsigned Dst0Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4095 unsigned Dst1Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
4096 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Dst0Size);
4097 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Dst1Size);
4098
4099 unsigned SrcSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits();
4100 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
4101 OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
4102 break;
4103 }
4104 case Intrinsic::amdgcn_class: {
4105 Register Src0Reg = MI.getOperand(2).getReg();
4106 Register Src1Reg = MI.getOperand(3).getReg();
4107 unsigned Src0Size = MRI.getType(Src0Reg).getSizeInBits();
4108 unsigned Src1Size = MRI.getType(Src1Reg).getSizeInBits();
4109 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4110 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, DstSize);
4111 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Src0Size);
4112 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Src1Size);
4113 break;
4114 }
4115 case Intrinsic::amdgcn_icmp:
4116 case Intrinsic::amdgcn_fcmp: {
4117 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4118 // This is not VCCRegBank because this is not used in boolean contexts.
4119 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
4120 unsigned OpSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
4121 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, OpSize);
4122 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, OpSize);
4123 break;
4124 }
  case Intrinsic::amdgcn_readlane: {
    // This must be an SGPR, but accept a VGPR.
    Register IdxReg = MI.getOperand(3).getReg();
    unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits();
    unsigned IdxBank = getRegBankID(IdxReg, MRI, AMDGPU::SGPRRegBankID);
    OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize);
    // Intentional fallthrough: the dst/src mappings below are shared with
    // readfirstlane; readlane only adds the lane-index operand above.
    LLVM_FALLTHROUGH;
  }
  case Intrinsic::amdgcn_readfirstlane: {
    // Scalar (SGPR) result produced from a VGPR source.
    unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    unsigned SrcSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
    OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
    OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
    break;
  }
  case Intrinsic::amdgcn_writelane: {
    unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    Register SrcReg = MI.getOperand(2).getReg();
    unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits();
    unsigned SrcBank = getRegBankID(SrcReg, MRI, AMDGPU::SGPRRegBankID);
    Register IdxReg = MI.getOperand(3).getReg();
    unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits();
    unsigned IdxBank = getRegBankID(IdxReg, MRI, AMDGPU::SGPRRegBankID);
    OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);

    // These 2 must be SGPRs, but accept VGPRs. Readfirstlane will be inserted
    // to legalize.
    OpdsMapping[2] = AMDGPU::getValueMapping(SrcBank, SrcSize);
    OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize);
    // Operand 4 is the tied old-value input of the destination VGPR.
    OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
    break;
  }
  case Intrinsic::amdgcn_if_break: {
    // Control-flow intrinsic: result and the incoming wave mask (operand 3)
    // are scalar; the condition (operand 2) is a 1-bit VCC-bank value.
    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
    OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
    OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
    OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
    break;
  }
4164 case Intrinsic::amdgcn_permlane16:
4165 case Intrinsic::amdgcn_permlanex16: {
4166 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
4167 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4168 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4169 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4170 OpdsMapping[4] = getSGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4171 OpdsMapping[5] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4172 break;
4173 }
  case Intrinsic::amdgcn_mfma_f32_4x4x1f32:
  case Intrinsic::amdgcn_mfma_f32_4x4x4f16:
  case Intrinsic::amdgcn_mfma_i32_4x4x4i8:
  case Intrinsic::amdgcn_mfma_f32_4x4x2bf16:
  case Intrinsic::amdgcn_mfma_f32_16x16x1f32:
  case Intrinsic::amdgcn_mfma_f32_16x16x4f32:
  case Intrinsic::amdgcn_mfma_f32_16x16x4f16:
  case Intrinsic::amdgcn_mfma_f32_16x16x16f16:
  case Intrinsic::amdgcn_mfma_i32_16x16x4i8:
  case Intrinsic::amdgcn_mfma_i32_16x16x16i8:
  case Intrinsic::amdgcn_mfma_f32_16x16x2bf16:
  case Intrinsic::amdgcn_mfma_f32_16x16x8bf16:
  case Intrinsic::amdgcn_mfma_f32_32x32x1f32:
  case Intrinsic::amdgcn_mfma_f32_32x32x2f32:
  case Intrinsic::amdgcn_mfma_f32_32x32x4f16:
  case Intrinsic::amdgcn_mfma_f32_32x32x8f16:
  case Intrinsic::amdgcn_mfma_i32_32x32x4i8:
  case Intrinsic::amdgcn_mfma_i32_32x32x8i8:
  case Intrinsic::amdgcn_mfma_f32_32x32x2bf16:
  case Intrinsic::amdgcn_mfma_f32_32x32x4bf16:
  case Intrinsic::amdgcn_mfma_f32_32x32x4bf16_1k:
  case Intrinsic::amdgcn_mfma_f32_16x16x4bf16_1k:
  case Intrinsic::amdgcn_mfma_f32_4x4x4bf16_1k:
  case Intrinsic::amdgcn_mfma_f32_32x32x8bf16_1k:
  case Intrinsic::amdgcn_mfma_f32_16x16x16bf16_1k:
  case Intrinsic::amdgcn_mfma_f64_16x16x4f64:
  case Intrinsic::amdgcn_mfma_f64_4x4x4f64: {
    // Default for MAI intrinsics.
    // srcC can also be an immediate which can be folded later.
    // FIXME: Should we eventually add an alternative mapping with AGPR src
    // for srcA/srcB?
    //
    // vdst, srcA, srcB, srcC
    // The accumulator operands (vdst, srcC) go to the AGPR bank; the
    // multiplicand inputs (srcA, srcB) go to the VGPR bank.
    OpdsMapping[0] = getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
    OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
    OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
    OpdsMapping[4] = getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
    break;
  }
  case Intrinsic::amdgcn_interp_p1:
  case Intrinsic::amdgcn_interp_p2:
  case Intrinsic::amdgcn_interp_mov:
  case Intrinsic::amdgcn_interp_p1_f16:
  case Intrinsic::amdgcn_interp_p2_f16: {
    // The interp intrinsics all take M0 as their trailing operand; every
    // register operand between the intrinsic ID and M0 is a 32-bit VGPR.
    const int M0Idx = MI.getNumOperands() - 1;
    Register M0Reg = MI.getOperand(M0Idx).getReg();
    unsigned M0Bank = getRegBankID(M0Reg, MRI, AMDGPU::SGPRRegBankID);
    unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();

    OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
    for (int I = 2; I != M0Idx && MI.getOperand(I).isReg(); ++I)
      OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);

    // Must be SGPR, but we must take whatever the original bank is and fix it
    // later.
    OpdsMapping[M0Idx] = AMDGPU::getValueMapping(M0Bank, 32);
    break;
  }
  case Intrinsic::amdgcn_ballot: {
    // Condition comes in as a VCC-bank boolean; the result is the lane mask
    // read out as a scalar (SGPR) value.
    unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    unsigned SrcSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
    OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
    OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, SrcSize);
    break;
  }
4239 }
4240 break;
4241 }
  case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
  case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE: {
    // Image intrinsics are mapped by a dedicated helper driven by the
    // RsrcIntrinsic table entry (which identifies the resource argument).
    auto IntrID = MI.getIntrinsicID();
    const AMDGPU::RsrcIntrinsic *RSrcIntrin = AMDGPU::lookupRsrcIntrinsic(IntrID);
    assert(RSrcIntrin && "missing RsrcIntrinsic for image intrinsic");
    // Non-images can have complications from operands that allow both SGPR
    // and VGPR. For now it's too complicated to figure out the final opcode
    // to derive the register bank from the MCInstrDesc.
    assert(RSrcIntrin->IsImage);
    return getImageMapping(MRI, MI, RSrcIntrin->RsrcArg);
  }
  case AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY: {
    // 128-bit VGPR result; operand N (the last explicit operand) gets an
    // SGPR mapping and everything between the intrinsic ID and it is a
    // 32-bit VGPR.
    unsigned N = MI.getNumExplicitOperands() - 2;
    OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 128);
    OpdsMapping[N] = getSGPROpMapping(MI.getOperand(N).getReg(), MRI, *TRI);
    for (unsigned I = 2; I < N; ++I)
      OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
    break;
  }
4261 case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: {
4262 auto IntrID = MI.getIntrinsicID();
4263 switch (IntrID) {
    case Intrinsic::amdgcn_s_getreg:
    case Intrinsic::amdgcn_s_memtime:
    case Intrinsic::amdgcn_s_memrealtime:
    case Intrinsic::amdgcn_s_get_waveid_in_workgroup: {
      // Scalar-only reads: result is always SGPR.
      unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
      break;
    }
    case Intrinsic::amdgcn_global_atomic_fadd:
    case Intrinsic::amdgcn_global_atomic_csub:
    case Intrinsic::amdgcn_global_atomic_fmin:
    case Intrinsic::amdgcn_global_atomic_fmax:
    case Intrinsic::amdgcn_flat_atomic_fadd:
    case Intrinsic::amdgcn_flat_atomic_fmin:
    case Intrinsic::amdgcn_flat_atomic_fmax:
      // Memory atomics: every operand goes to the VGPR bank.
      return getDefaultMappingAllVGPR(MI);
    case Intrinsic::amdgcn_ds_ordered_add:
    case Intrinsic::amdgcn_ds_ordered_swap: {
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
      // Operand 2 feeds M0 and should be scalar, but take the current bank
      // and fix it up later.
      unsigned M0Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
                                 AMDGPU::SGPRRegBankID);
      OpdsMapping[2] = AMDGPU::getValueMapping(M0Bank, 32);
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      break;
    }
    case Intrinsic::amdgcn_ds_append:
    case Intrinsic::amdgcn_ds_consume: {
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_exp_compr:
      // Compressed export: two 32-bit VGPR data operands.
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      break;
    case Intrinsic::amdgcn_exp:
      // FIXME: Could we support packed types here?
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      OpdsMapping[5] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      OpdsMapping[6] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      break;
    case Intrinsic::amdgcn_s_sendmsg:
    case Intrinsic::amdgcn_s_sendmsghalt: {
      // This must be an SGPR, but accept a VGPR.
      unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
                                   AMDGPU::SGPRRegBankID);
      OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
      break;
    }
    case Intrinsic::amdgcn_s_setreg: {
      // This must be an SGPR, but accept a VGPR.
      unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
                                   AMDGPU::SGPRRegBankID);
      OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
      break;
    }
    case Intrinsic::amdgcn_end_cf: {
      // Control-flow mask operand is scalar.
      unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
      break;
    }
    case Intrinsic::amdgcn_else: {
      // Boolean result in VCC; the two wave-sized mask operands are scalar.
      unsigned WaveSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize);
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize);
      break;
    }
    case Intrinsic::amdgcn_live_mask: {
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
      break;
    }
    case Intrinsic::amdgcn_wqm_demote:
    case Intrinsic::amdgcn_kill: {
      // Condition input is a 1-bit VCC-bank value.
      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
      break;
    }
    case Intrinsic::amdgcn_raw_buffer_load:
    case Intrinsic::amdgcn_raw_tbuffer_load: {
      // FIXME: Should make intrinsic ID the last operand of the instruction,
      // then this would be the same as store
      //
      // vdata (VGPR), rsrc (SGPR), voffset (VGPR), soffset (SGPR).
      OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
      OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_raw_buffer_store:
    case Intrinsic::amdgcn_raw_buffer_store_format:
    case Intrinsic::amdgcn_raw_tbuffer_store: {
      // vdata (VGPR), rsrc (SGPR), voffset (VGPR), soffset (SGPR).
      OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
      OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_struct_buffer_load:
    case Intrinsic::amdgcn_struct_tbuffer_load: {
      // vdata (VGPR), rsrc (SGPR), vindex/voffset (VGPR), soffset (SGPR).
      OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
      OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
      OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_struct_buffer_store:
    case Intrinsic::amdgcn_struct_tbuffer_store: {
      // vdata (VGPR), rsrc (SGPR), vindex/voffset (VGPR), soffset (SGPR).
      OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
      OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
      OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_init_exec_from_input: {
      // Source must be scalar to write EXEC.
      unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
      break;
    }
    case Intrinsic::amdgcn_ds_gws_init:
    case Intrinsic::amdgcn_ds_gws_barrier:
    case Intrinsic::amdgcn_ds_gws_sema_br: {
      // Data operand is a 32-bit VGPR.
      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);

      // This must be an SGPR, but accept a VGPR.
      unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
                                   AMDGPU::SGPRRegBankID);
      OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
      break;
    }
    case Intrinsic::amdgcn_ds_gws_sema_v:
    case Intrinsic::amdgcn_ds_gws_sema_p:
    case Intrinsic::amdgcn_ds_gws_sema_release_all: {
      // This must be an SGPR, but accept a VGPR.
      unsigned Bank = getRegBankID(MI.getOperand(1).getReg(), MRI,
                                   AMDGPU::SGPRRegBankID);
      OpdsMapping[1] = AMDGPU::getValueMapping(Bank, 32);
      break;
    }
4406 default:
4407 return getInvalidInstructionMapping();
4408 }
4409 break;
4410 }
  case AMDGPU::G_SELECT: {
    unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    // The select can only stay on the scalar unit if both value inputs are
    // already SGPR; otherwise everything is pushed to VGPR with a VCC
    // condition.
    unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
                                    AMDGPU::SGPRRegBankID);
    unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI,
                                    AMDGPU::SGPRRegBankID);
    bool SGPRSrcs = Op2Bank == AMDGPU::SGPRRegBankID &&
                    Op3Bank == AMDGPU::SGPRRegBankID;

    unsigned CondBankDefault = SGPRSrcs ?
      AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID;
    unsigned CondBank = getRegBankID(MI.getOperand(1).getReg(), MRI,
                                     CondBankDefault);
    // Normalize the condition bank: a scalar condition only survives when
    // the sources are scalar too; a VGPR condition is really a VCC value.
    if (CondBank == AMDGPU::SGPRRegBankID)
      CondBank = SGPRSrcs ? AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID;
    else if (CondBank == AMDGPU::VGPRRegBankID)
      CondBank = AMDGPU::VCCRegBankID;

    unsigned Bank = SGPRSrcs && CondBank == AMDGPU::SGPRRegBankID ?
      AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;

    assert(CondBank == AMDGPU::VCCRegBankID || CondBank == AMDGPU::SGPRRegBankID);

    // TODO: Should report 32-bit for scalar condition type.
    if (Size == 64) {
      // 64-bit scalar selects are split into 32-bit halves.
      OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
      OpdsMapping[1] = AMDGPU::getValueMapping(CondBank, 1);
      OpdsMapping[2] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
      OpdsMapping[3] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
    } else {
      OpdsMapping[0] = AMDGPU::getValueMapping(Bank, Size);
      OpdsMapping[1] = AMDGPU::getValueMapping(CondBank, 1);
      OpdsMapping[2] = AMDGPU::getValueMapping(Bank, Size);
      OpdsMapping[3] = AMDGPU::getValueMapping(Bank, Size);
    }

    break;
  }
4449
  case AMDGPU::G_LOAD:
  case AMDGPU::G_ZEXTLOAD:
  case AMDGPU::G_SEXTLOAD:
    // Loads have a dedicated helper to choose the mapping.
    return getInstrMappingForLoad(MI);

  case AMDGPU::G_ATOMICRMW_XCHG:
  case AMDGPU::G_ATOMICRMW_ADD:
  case AMDGPU::G_ATOMICRMW_SUB:
  case AMDGPU::G_ATOMICRMW_AND:
  case AMDGPU::G_ATOMICRMW_OR:
  case AMDGPU::G_ATOMICRMW_XOR:
  case AMDGPU::G_ATOMICRMW_MAX:
  case AMDGPU::G_ATOMICRMW_MIN:
  case AMDGPU::G_ATOMICRMW_UMAX:
  case AMDGPU::G_ATOMICRMW_UMIN:
  case AMDGPU::G_ATOMICRMW_FADD:
  case AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG:
  case AMDGPU::G_AMDGPU_ATOMIC_INC:
  case AMDGPU::G_AMDGPU_ATOMIC_DEC:
  case AMDGPU::G_AMDGPU_ATOMIC_FMIN:
  case AMDGPU::G_AMDGPU_ATOMIC_FMAX: {
    // Result and data are VGPRs; the pointer mapping depends on its address
    // space (see getValueMappingForPtr).
    OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
    OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg());
    OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
    break;
  }
  case AMDGPU::G_ATOMIC_CMPXCHG: {
    // Same as the RMW atomics plus the compare value (operand 3) in a VGPR.
    OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
    OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg());
    OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
    OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
    break;
  }
  case AMDGPU::G_BRCOND: {
    // A 1-bit branch condition is either a scalar (uniform) SGPR bool or a
    // VCC lane mask; anything non-SGPR is normalized to VCC.
    unsigned Bank = getRegBankID(MI.getOperand(0).getReg(), MRI,
                                 AMDGPU::SGPRRegBankID);
    assert(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1);
    if (Bank != AMDGPU::SGPRRegBankID)
      Bank = AMDGPU::VCCRegBankID;

    OpdsMapping[0] = AMDGPU::getValueMapping(Bank, 1);
    break;
  }
4493 }
4494
4495 return getInstructionMapping(/*ID*/1, /*Cost*/1,
4496 getOperandsMapping(OpdsMapping),
4497 MI.getNumOperands());
4498 }
4499