//===- AMDGPURegisterBankInfo.cpp -------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the RegisterBankInfo class for
/// AMDGPU.
///
/// \par
///
/// AMDGPU has unique register bank constraints that require special high level
/// strategies to deal with. There are two main true physical register banks:
/// VGPR (vector) and SGPR (scalar). Additionally the VCC register bank is a
/// sort of pseudo-register bank needed to represent SGPRs used in a vector
/// boolean context. There is also the AGPR bank, which is a special purpose
/// physical register bank present on some subtargets.
///
/// Copying from VGPR to SGPR is generally illegal, unless the value is known to
/// be uniform. It is generally not valid to legalize operands by inserting
/// copies as on other targets. Operations which require uniform, SGPR operands
/// generally require scalarization by repeatedly executing the instruction,
/// activating each set of lanes using a unique set of input values. This is
/// referred to as a waterfall loop.
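/// (See executeInWaterfallLoop below for how this expansion is emitted.)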
///
/// \par Booleans
///
/// Booleans (s1 values) require special consideration. A vector compare result
/// is naturally a bitmask with one bit per lane, in a 32 or 64-bit
/// register. These are represented with the VCC bank. During selection, we need
/// to be able to unambiguously go back from a register class to a register
/// bank. To distinguish whether an SGPR should use the SGPR or VCC register
/// bank, we need to know the use context type. An SGPR s1 value always means a
/// VCC bank value, otherwise it will be the SGPR bank. A scalar compare sets
/// SCC, which is a 1-bit unaddressable register. This will need to be copied to
/// a 32-bit virtual register. Taken together, this means we need to adjust the
/// type of boolean operations to be regbank legal. All SALU booleans need to be
/// widened to 32-bits, and all VALU booleans need to be s1 values.
///
/// A noteworthy exception to the s1-means-vcc rule is for legalization artifact
/// casts. G_TRUNC s1 results, and G_SEXT/G_ZEXT/G_ANYEXT sources are never vcc
/// bank. A non-boolean source (such as a truncate from a 1-bit load from
/// memory) will require a copy to the VCC bank which will require clearing the
/// high bits and inserting a compare.
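///
/// As an illustrative sketch (not verbatim MIR), a divergent compare keeps its
/// s1 result in the vcc bank, while a uniform compare result is widened to a
/// 32-bit SGPR value:
///
///   %div:vcc(s1)   = G_ICMP intpred(eq), %a:vgpr(s32), %b:vgpr(s32)
///   %uni:sgpr(s32) = G_ICMP intpred(eq), %x:sgpr(s32), %y:sgpr(s32)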
///
/// \par Constant bus restriction
///
/// VALU instructions have a limitation known as the constant bus
/// restriction. Most VALU instructions can use SGPR operands, but may read at
/// most 1 SGPR or constant literal value (this is relaxed to 2 on gfx10 for
/// most instructions). This is one unique SGPR, so the same SGPR may be used
/// for multiple operands. From a register bank perspective, any combination of
/// operands should be legal as an SGPR, but this is contextually dependent on
/// the SGPR operands all being the same register. It is therefore optimal to
/// choose the SGPR with the most uses to minimize the number of copies.
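///
/// For example (illustrative, pre-gfx10 rules), v_add_f32 v0, s0, s0 is legal
/// because both operands read the same unique SGPR, while v_add_f32 v0, s0, s1
/// would read two unique SGPRs and violate the restriction.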
///
/// We avoid trying to solve this problem in RegBankSelect. Any VALU G_*
/// operation should have its source operands all mapped to VGPRs (except for
/// VCC), inserting copies from any SGPR operands. This is the most trivial
/// legal mapping. Anything beyond the simplest 1:1 instruction selection would
/// be too complicated to solve here. Every optimization pattern or instruction
/// selected to multiple outputs would have to enforce this rule, and there
/// would be additional complexity in tracking this rule for every G_*
/// operation. By forcing all inputs to VGPRs, it also simplifies the task of
/// picking the optimal operand combination from a post-isel optimization pass.
///
//===----------------------------------------------------------------------===//

#include "AMDGPURegisterBankInfo.h"

#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPUSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
#include "llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/RegisterBank.h"
#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/Constants.h"

#define GET_TARGET_REGBANK_IMPL
#include "AMDGPUGenRegisterBank.inc"

// This file will be TableGen'ed at some point.
#include "AMDGPUGenRegisterBankInfo.def"

using namespace llvm;
using namespace MIPatternMatch;

namespace {

// Observer to apply a register bank to new registers created by
// LegalizerHelper.
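//
// Typical usage (see applyMappingLoad and friends below): wrap an instance in
// a GISelObserverWrapper and install it on the MachineIRBuilder, so that
// instructions created by LegalizerHelper have the bank applied when this
// object is destroyed:
//
//   ApplyRegBankMapping ApplyBank(*this, MRI, &AMDGPU::VGPRRegBank);
//   GISelObserverWrapper Observer(&ApplyBank);
//   B.setChangeObserver(Observer);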
class ApplyRegBankMapping final : public GISelChangeObserver {
private:
  const AMDGPURegisterBankInfo &RBI;
  MachineRegisterInfo &MRI;
  const RegisterBank *NewBank;
  SmallVector<MachineInstr *, 4> NewInsts;

public:
  ApplyRegBankMapping(const AMDGPURegisterBankInfo &RBI_,
                      MachineRegisterInfo &MRI_, const RegisterBank *RB)
    : RBI(RBI_), MRI(MRI_), NewBank(RB) {}

  ~ApplyRegBankMapping() {
    for (MachineInstr *MI : NewInsts)
      applyBank(*MI);
  }

  /// Set any registers that don't have a set register class or bank to SALU.
  void applyBank(MachineInstr &MI) {
    const unsigned Opc = MI.getOpcode();
    if (Opc == AMDGPU::G_ANYEXT || Opc == AMDGPU::G_ZEXT ||
        Opc == AMDGPU::G_SEXT) {
      // LegalizerHelper wants to use the basic legalization artifacts when
      // widening etc. We don't handle selection with vcc in artifact sources,
      // so we need to use a select instead to handle these properly.
      Register DstReg = MI.getOperand(0).getReg();
      Register SrcReg = MI.getOperand(1).getReg();
      const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, MRI, *RBI.TRI);
      if (SrcBank == &AMDGPU::VCCRegBank) {
        const LLT S32 = LLT::scalar(32);
        assert(MRI.getType(SrcReg) == LLT::scalar(1));
        assert(MRI.getType(DstReg) == S32);
        assert(NewBank == &AMDGPU::VGPRRegBank);

        // Replace the extension with a select, which really uses the boolean
        // source.
        MachineIRBuilder B(MI);
        auto True = B.buildConstant(S32, Opc == AMDGPU::G_SEXT ? -1 : 1);
        auto False = B.buildConstant(S32, 0);
        B.buildSelect(DstReg, SrcReg, True, False);
        MRI.setRegBank(True.getReg(0), *NewBank);
        MRI.setRegBank(False.getReg(0), *NewBank);
        MI.eraseFromParent();
      }

      assert(!MRI.getRegClassOrRegBank(DstReg));
      MRI.setRegBank(DstReg, *NewBank);
      return;
    }

#ifndef NDEBUG
    if (Opc == AMDGPU::G_TRUNC) {
      Register DstReg = MI.getOperand(0).getReg();
      const RegisterBank *DstBank = RBI.getRegBank(DstReg, MRI, *RBI.TRI);
      assert(DstBank != &AMDGPU::VCCRegBank);
    }
#endif

    for (MachineOperand &Op : MI.operands()) {
      if (!Op.isReg())
        continue;

      // We may see physical registers if building a real MI.
      Register Reg = Op.getReg();
      if (Reg.isPhysical() || MRI.getRegClassOrRegBank(Reg))
        continue;

      const RegisterBank *RB = NewBank;
      if (MRI.getType(Reg) == LLT::scalar(1)) {
        assert(NewBank == &AMDGPU::VGPRRegBank &&
               "s1 operands should only be used for vector bools");
        assert((MI.getOpcode() != AMDGPU::G_TRUNC &&
                MI.getOpcode() != AMDGPU::G_ANYEXT) &&
               "not expecting legalization artifacts here");
        RB = &AMDGPU::VCCRegBank;
      }

      MRI.setRegBank(Reg, *RB);
    }
  }

  void erasingInstr(MachineInstr &MI) override {}

  void createdInstr(MachineInstr &MI) override {
    // At this point, the instruction was just inserted and has no operands.
    NewInsts.push_back(&MI);
  }

  void changingInstr(MachineInstr &MI) override {}
  void changedInstr(MachineInstr &MI) override {}
};

}

AMDGPURegisterBankInfo::AMDGPURegisterBankInfo(const GCNSubtarget &ST)
    : AMDGPUGenRegisterBankInfo(),
      Subtarget(ST),
      TRI(Subtarget.getRegisterInfo()),
      TII(Subtarget.getInstrInfo()) {

  // HACK: Until this is fully tablegen'd.
  static llvm::once_flag InitializeRegisterBankFlag;

  static auto InitializeRegisterBankOnce = [this]() {
    assert(&getRegBank(AMDGPU::SGPRRegBankID) == &AMDGPU::SGPRRegBank &&
           &getRegBank(AMDGPU::VGPRRegBankID) == &AMDGPU::VGPRRegBank &&
           &getRegBank(AMDGPU::AGPRRegBankID) == &AMDGPU::AGPRRegBank);
    (void)this;
  };

  llvm::call_once(InitializeRegisterBankFlag, InitializeRegisterBankOnce);
}

static bool isVectorRegisterBank(const RegisterBank &Bank) {
  unsigned BankID = Bank.getID();
  return BankID == AMDGPU::VGPRRegBankID || BankID == AMDGPU::AGPRRegBankID;
}

unsigned AMDGPURegisterBankInfo::copyCost(const RegisterBank &Dst,
                                          const RegisterBank &Src,
                                          unsigned Size) const {
  // TODO: Should there be a UniformVGPRRegBank which can use readfirstlane?
  if (Dst.getID() == AMDGPU::SGPRRegBankID &&
      (isVectorRegisterBank(Src) || Src.getID() == AMDGPU::VCCRegBankID)) {
    return std::numeric_limits<unsigned>::max();
  }

  // Bool values are tricky, because the meaning is based on context. The SCC
  // and VCC banks are for the natural scalar and vector conditions produced by
  // a compare.
  //
  // Legalization doesn't know about the necessary context, so an s1 use may
  // have been a truncate from an arbitrary value, in which case a copy (lowered
  // as a compare with 0) needs to be inserted.
  if (Size == 1 &&
      (Dst.getID() == AMDGPU::SGPRRegBankID) &&
      (isVectorRegisterBank(Src) ||
       Src.getID() == AMDGPU::SGPRRegBankID ||
       Src.getID() == AMDGPU::VCCRegBankID))
    return std::numeric_limits<unsigned>::max();

  // There is no direct copy between AGPRs.
  if (Dst.getID() == AMDGPU::AGPRRegBankID &&
      Src.getID() == AMDGPU::AGPRRegBankID)
    return 4;

  return RegisterBankInfo::copyCost(Dst, Src, Size);
}

unsigned AMDGPURegisterBankInfo::getBreakDownCost(
  const ValueMapping &ValMapping,
  const RegisterBank *CurBank) const {
  // Check if this is a breakdown for G_LOAD to move the pointer from SGPR to
  // VGPR.
  // FIXME: Is there a better way to do this?
  if (ValMapping.NumBreakDowns >= 2 || ValMapping.BreakDown[0].Length >= 64)
    return 10; // This is expensive.

  assert(ValMapping.NumBreakDowns == 2 &&
         ValMapping.BreakDown[0].Length == 32 &&
         ValMapping.BreakDown[0].StartIdx == 0 &&
         ValMapping.BreakDown[1].Length == 32 &&
         ValMapping.BreakDown[1].StartIdx == 32 &&
         ValMapping.BreakDown[0].RegBank == ValMapping.BreakDown[1].RegBank);

  // 32-bit extract of a 64-bit value is just access of a subregister, so free.
  // TODO: Cost of 0 hits assert, though it's not clear it's what we really
  // want.

  // TODO: 32-bit insert to a 64-bit SGPR may incur a non-free copy due to SGPR
  // alignment restrictions, but this probably isn't important.
  return 1;
}

const RegisterBank &
AMDGPURegisterBankInfo::getRegBankFromRegClass(const TargetRegisterClass &RC,
                                               LLT Ty) const {
  if (&RC == &AMDGPU::SReg_1RegClass)
    return AMDGPU::VCCRegBank;

  // We promote real scalar booleans to SReg_32. Any SGPR using s1 is really a
  // VCC-like use.
  if (TRI->isSGPRClass(&RC)) {
    // FIXME: This probably came from a copy from a physical register, which
    // should be inferable from the copied to-type. We don't have many boolean
    // physical register constraints so just assume a normal SGPR for now.
    if (!Ty.isValid())
      return AMDGPU::SGPRRegBank;

    return Ty == LLT::scalar(1) ? AMDGPU::VCCRegBank : AMDGPU::SGPRRegBank;
  }

  return TRI->isAGPRClass(&RC) ? AMDGPU::AGPRRegBank : AMDGPU::VGPRRegBank;
}

template <unsigned NumOps>
RegisterBankInfo::InstructionMappings
AMDGPURegisterBankInfo::addMappingFromTable(
    const MachineInstr &MI, const MachineRegisterInfo &MRI,
    const std::array<unsigned, NumOps> RegSrcOpIdx,
    ArrayRef<OpRegBankEntry<NumOps>> Table) const {

  InstructionMappings AltMappings;

  SmallVector<const ValueMapping *, 10> Operands(MI.getNumOperands());

  unsigned Sizes[NumOps];
  for (unsigned I = 0; I < NumOps; ++I) {
    Register Reg = MI.getOperand(RegSrcOpIdx[I]).getReg();
    Sizes[I] = getSizeInBits(Reg, MRI, *TRI);
  }

  for (unsigned I = 0, E = MI.getNumExplicitDefs(); I != E; ++I) {
    unsigned SizeI = getSizeInBits(MI.getOperand(I).getReg(), MRI, *TRI);
    Operands[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SizeI);
  }

  // getInstrMapping's default mapping uses ID 1, so start at 2.
  unsigned MappingID = 2;
  for (const auto &Entry : Table) {
    for (unsigned I = 0; I < NumOps; ++I) {
      int OpIdx = RegSrcOpIdx[I];
      Operands[OpIdx] = AMDGPU::getValueMapping(Entry.RegBanks[I], Sizes[I]);
    }

    AltMappings.push_back(&getInstructionMapping(MappingID++, Entry.Cost,
                                                 getOperandsMapping(Operands),
                                                 Operands.size()));
  }

  return AltMappings;
}

RegisterBankInfo::InstructionMappings
AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsic(
    const MachineInstr &MI, const MachineRegisterInfo &MRI) const {
  switch (MI.getIntrinsicID()) {
  case Intrinsic::amdgcn_readlane: {
    static const OpRegBankEntry<3> Table[2] = {
      // Perfectly legal.
      { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },

      // Need a readfirstlane for the index.
      { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 }
    };

    const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 2, 3 } };
    return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
  }
  case Intrinsic::amdgcn_writelane: {
    static const OpRegBankEntry<4> Table[4] = {
      // Perfectly legal.
      { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },

      // Need readfirstlane of first op
      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 },

      // Need readfirstlane of second op
      { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 },

      // Need readfirstlane of both ops
      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 3 }
    };

    // dst, value, lane select, old value
    const std::array<unsigned, 4> RegSrcOpIdx = { { 0, 2, 3, 4 } };
    return addMappingFromTable<4>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
  }
  default:
    return RegisterBankInfo::getInstrAlternativeMappings(MI);
  }
}

RegisterBankInfo::InstructionMappings
AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsicWSideEffects(
    const MachineInstr &MI, const MachineRegisterInfo &MRI) const {

  switch (MI.getIntrinsicID()) {
  case Intrinsic::amdgcn_s_buffer_load: {
    static const OpRegBankEntry<2> Table[4] = {
      // Perfectly legal.
      { { AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },

      // Only need 1 register in loop
      { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 300 },

      // Have to waterfall the resource.
      { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1000 },

      // Have to waterfall the resource, and the offset.
      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1500 }
    };

    // rsrc, offset
    const std::array<unsigned, 2> RegSrcOpIdx = { { 2, 3 } };
    return addMappingFromTable<2>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
  }
  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap: {
    // VGPR = M0, VGPR
    static const OpRegBankEntry<3> Table[2] = {
      // Perfectly legal.
      { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },

      // Need a readfirstlane for m0
      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 }
    };

    const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 2, 3 } };
    return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
  }
  case Intrinsic::amdgcn_s_sendmsg:
  case Intrinsic::amdgcn_s_sendmsghalt: {
    // FIXME: Should have no register for immediate
    static const OpRegBankEntry<1> Table[2] = {
      // Perfectly legal.
      { { AMDGPU::SGPRRegBankID }, 1 },

      // Need readlane
      { { AMDGPU::VGPRRegBankID }, 3 }
    };

    const std::array<unsigned, 1> RegSrcOpIdx = { { 2 } };
    return addMappingFromTable<1>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
  }
  default:
    return RegisterBankInfo::getInstrAlternativeMappings(MI);
  }
}

static bool memOpHasNoClobbered(const MachineMemOperand *MMO) {
  const Instruction *I = dyn_cast_or_null<Instruction>(MMO->getValue());
  return I && I->getMetadata("amdgpu.noclobber");
}

// FIXME: Returns uniform if there's no source value information. This is
// probably wrong.
static bool isScalarLoadLegal(const MachineInstr &MI) {
  if (!MI.hasOneMemOperand())
    return false;

  const MachineMemOperand *MMO = *MI.memoperands_begin();
  const unsigned AS = MMO->getAddrSpace();
  const bool IsConst = AS == AMDGPUAS::CONSTANT_ADDRESS ||
                       AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT;

  // There are no extending SMRD/SMEM loads, and they require 4-byte alignment.
  return MMO->getSize() >= 4 && MMO->getAlign() >= Align(4) &&
    // Can't do a scalar atomic load.
    !MMO->isAtomic() &&
    // Don't use scalar loads for volatile accesses to non-constant address
    // spaces.
    (IsConst || !MMO->isVolatile()) &&
    // Memory must be known constant, or not written before this load.
    (IsConst || MMO->isInvariant() || memOpHasNoClobbered(MMO)) &&
    AMDGPUInstrInfo::isUniformMMO(MMO);
}

RegisterBankInfo::InstructionMappings
AMDGPURegisterBankInfo::getInstrAlternativeMappings(
    const MachineInstr &MI) const {

  const MachineFunction &MF = *MI.getParent()->getParent();
  const MachineRegisterInfo &MRI = MF.getRegInfo();

  InstructionMappings AltMappings;
  switch (MI.getOpcode()) {
  case TargetOpcode::G_CONSTANT: {
    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
    if (Size == 1) {
      static const OpRegBankEntry<1> Table[3] = {
        { { AMDGPU::VGPRRegBankID }, 1 },
        { { AMDGPU::SGPRRegBankID }, 1 },
        { { AMDGPU::VCCRegBankID }, 1 }
      };

      return addMappingFromTable<1>(MI, MRI, {{ 0 }}, Table);
    }

    LLVM_FALLTHROUGH;
  }
  case TargetOpcode::G_FCONSTANT:
  case TargetOpcode::G_FRAME_INDEX:
  case TargetOpcode::G_GLOBAL_VALUE: {
    static const OpRegBankEntry<1> Table[2] = {
      { { AMDGPU::VGPRRegBankID }, 1 },
      { { AMDGPU::SGPRRegBankID }, 1 }
    };

    return addMappingFromTable<1>(MI, MRI, {{ 0 }}, Table);
  }
  case TargetOpcode::G_AND:
  case TargetOpcode::G_OR:
  case TargetOpcode::G_XOR: {
    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);

    if (Size == 1) {
      // s_{and|or|xor}_b32 set scc when the result of the 32-bit op is not 0.
      const InstructionMapping &SCCMapping = getInstructionMapping(
        1, 1, getOperandsMapping(
          {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32),
           AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32),
           AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32)}),
        3); // Num Operands
      AltMappings.push_back(&SCCMapping);

      const InstructionMapping &VCCMapping0 = getInstructionMapping(
        2, 1, getOperandsMapping(
          {AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size),
           AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size),
           AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size)}),
        3); // Num Operands
      AltMappings.push_back(&VCCMapping0);
      return AltMappings;
    }

    if (Size != 64)
      break;

    const InstructionMapping &SSMapping = getInstructionMapping(
      1, 1, getOperandsMapping(
        {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
      3); // Num Operands
    AltMappings.push_back(&SSMapping);

    const InstructionMapping &VVMapping = getInstructionMapping(
      2, 2, getOperandsMapping(
        {AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
         AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
         AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}),
      3); // Num Operands
    AltMappings.push_back(&VVMapping);
    break;
  }
  case TargetOpcode::G_LOAD:
  case TargetOpcode::G_ZEXTLOAD:
  case TargetOpcode::G_SEXTLOAD: {
    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
    LLT PtrTy = MRI.getType(MI.getOperand(1).getReg());
    unsigned PtrSize = PtrTy.getSizeInBits();
    unsigned AS = PtrTy.getAddressSpace();

    if ((AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::REGION_ADDRESS &&
         AS != AMDGPUAS::PRIVATE_ADDRESS) &&
        isScalarLoadLegal(MI)) {
      const InstructionMapping &SSMapping = getInstructionMapping(
        1, 1, getOperandsMapping(
          {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
           AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize)}),
        2); // Num Operands
      AltMappings.push_back(&SSMapping);
    }

    const InstructionMapping &VVMapping = getInstructionMapping(
      2, 1,
      getOperandsMapping(
        {AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
         AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize)}),
      2); // Num Operands
    AltMappings.push_back(&VVMapping);

    // It may be possible to have a vgpr = load sgpr mapping here, because
    // the mubuf instructions support this kind of load, but probably for only
    // gfx7 and older. However, the addressing mode matching in the instruction
    // selector should be able to do a better job of detecting and selecting
    // these kinds of loads from the vgpr = load vgpr mapping.

    return AltMappings;
  }
  case TargetOpcode::G_SELECT: {
    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
    const InstructionMapping &SSMapping = getInstructionMapping(1, 1,
      getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1),
                          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
      4); // Num Operands
    AltMappings.push_back(&SSMapping);

    const InstructionMapping &VVMapping = getInstructionMapping(2, 1,
      getOperandsMapping({AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
                          AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}),
      4); // Num Operands
    AltMappings.push_back(&VVMapping);

    return AltMappings;
  }
  case TargetOpcode::G_SMIN:
  case TargetOpcode::G_SMAX:
  case TargetOpcode::G_UMIN:
  case TargetOpcode::G_UMAX: {
    static const OpRegBankEntry<3> Table[2] = {
      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },

      // Scalar requires cmp+select, and extends if 16-bit.
      // FIXME: Should there be separate costs for 32 and 16-bit
      { { AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID }, 3 }
    };

    const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 1, 2 } };
    return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
  }
  case TargetOpcode::G_UADDE:
  case TargetOpcode::G_USUBE:
  case TargetOpcode::G_SADDE:
  case TargetOpcode::G_SSUBE: {
    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
    const InstructionMapping &SSMapping = getInstructionMapping(1, 1,
      getOperandsMapping(
        {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1),
         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1)}),
      5); // Num Operands
    AltMappings.push_back(&SSMapping);

    const InstructionMapping &VVMapping = getInstructionMapping(2, 1,
      getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
                          AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1)}),
      5); // Num Operands
    AltMappings.push_back(&VVMapping);
    return AltMappings;
  }
  case AMDGPU::G_BRCOND: {
    assert(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1);

    // TODO: Change type to 32 for scalar
    const InstructionMapping &SMapping = getInstructionMapping(
      1, 1, getOperandsMapping(
        {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1), nullptr}),
      2); // Num Operands
    AltMappings.push_back(&SMapping);

    const InstructionMapping &VMapping = getInstructionMapping(
      1, 1, getOperandsMapping(
        {AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1), nullptr}),
      2); // Num Operands
    AltMappings.push_back(&VMapping);
    return AltMappings;
  }
  case AMDGPU::G_INTRINSIC:
    return getInstrAlternativeMappingsIntrinsic(MI, MRI);
  case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS:
    return getInstrAlternativeMappingsIntrinsicWSideEffects(MI, MRI);
  default:
    break;
  }
  return RegisterBankInfo::getInstrAlternativeMappings(MI);
}

void AMDGPURegisterBankInfo::split64BitValueForMapping(
  MachineIRBuilder &B,
  SmallVector<Register, 2> &Regs,
  LLT HalfTy,
  Register Reg) const {
  assert(HalfTy.getSizeInBits() == 32);
  MachineRegisterInfo *MRI = B.getMRI();
  Register LoLHS = MRI->createGenericVirtualRegister(HalfTy);
  Register HiLHS = MRI->createGenericVirtualRegister(HalfTy);
  const RegisterBank *Bank = getRegBank(Reg, *MRI, *TRI);
  MRI->setRegBank(LoLHS, *Bank);
  MRI->setRegBank(HiLHS, *Bank);

  Regs.push_back(LoLHS);
  Regs.push_back(HiLHS);

  B.buildInstr(AMDGPU::G_UNMERGE_VALUES)
    .addDef(LoLHS)
    .addDef(HiLHS)
    .addUse(Reg);
}

/// Replace the current type each register in \p Regs has with \p NewTy
static void setRegsToType(MachineRegisterInfo &MRI, ArrayRef<Register> Regs,
                          LLT NewTy) {
  for (Register Reg : Regs) {
    assert(MRI.getType(Reg).getSizeInBits() == NewTy.getSizeInBits());
    MRI.setType(Reg, NewTy);
  }
}

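// e.g. getHalfSizedType(s64) == s32 and getHalfSizedType(<4 x s16>) == <2 x s16>.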
static LLT getHalfSizedType(LLT Ty) {
  if (Ty.isVector()) {
    assert(Ty.getNumElements() % 2 == 0);
    return LLT::scalarOrVector(Ty.getNumElements() / 2, Ty.getElementType());
  }

  assert(Ty.getSizeInBits() % 2 == 0);
  return LLT::scalar(Ty.getSizeInBits() / 2);
}

/// Legalize instruction \p MI where operands in \p OpIndices must be SGPRs. If
/// any of the required SGPR operands are VGPRs, perform a waterfall loop to
/// execute the instruction for each unique combination of values in all lanes
/// in the wave. The block will be split such that the rest of the instructions
/// are moved to a new block.
///
/// Essentially performs this loop:
///
/// Save Execution Mask
/// For (Lane : Wavefront) {
///   Enable Lane, Disable all other lanes
///   SGPR = read SGPR value for current lane from VGPR
///   VGPRResult[Lane] = use_op SGPR
/// }
/// Restore Execution Mask
///
/// There is additional complexity in comparing the values across lanes to
/// identify the unique values actually used.
bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
  MachineIRBuilder &B,
  iterator_range<MachineBasicBlock::iterator> Range,
  SmallSet<Register, 4> &SGPROperandRegs,
  MachineRegisterInfo &MRI) const {
  SmallVector<Register, 4> ResultRegs;
  SmallVector<Register, 4> InitResultRegs;
  SmallVector<Register, 4> PhiRegs;

  // Track use registers which have already been expanded with a readfirstlane
  // sequence. This may have multiple uses if moving a sequence.
  DenseMap<Register, Register> WaterfalledRegMap;

  MachineBasicBlock &MBB = B.getMBB();
  MachineFunction *MF = &B.getMF();

  const TargetRegisterClass *WaveRC = TRI->getWaveMaskRegClass();
  const unsigned WaveAndOpc = Subtarget.isWave32() ?
    AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
  const unsigned MovTermOpc = Subtarget.isWave32() ?
    AMDGPU::S_MOV_B32_term : AMDGPU::S_MOV_B64_term;
  const unsigned XorTermOpc = Subtarget.isWave32() ?
    AMDGPU::S_XOR_B32_term : AMDGPU::S_XOR_B64_term;
  const unsigned AndSaveExecOpc = Subtarget.isWave32() ?
    AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64;
  const unsigned ExecReg = Subtarget.isWave32() ?
    AMDGPU::EXEC_LO : AMDGPU::EXEC;

#ifndef NDEBUG
  const int OrigRangeSize = std::distance(Range.begin(), Range.end());
#endif

  for (MachineInstr &MI : Range) {
    for (MachineOperand &Def : MI.defs()) {
      LLT ResTy = MRI.getType(Def.getReg());
      const RegisterBank *DefBank = getRegBank(Def.getReg(), MRI, *TRI);
      ResultRegs.push_back(Def.getReg());
      Register InitReg = B.buildUndef(ResTy).getReg(0);
      Register PhiReg = MRI.createGenericVirtualRegister(ResTy);
      InitResultRegs.push_back(InitReg);
      PhiRegs.push_back(PhiReg);
      MRI.setRegBank(PhiReg, *DefBank);
      MRI.setRegBank(InitReg, *DefBank);
    }
  }

  Register SaveExecReg = MRI.createVirtualRegister(WaveRC);
  Register InitSaveExecReg = MRI.createVirtualRegister(WaveRC);

  // Don't bother using generic instructions/registers for the exec mask.
  B.buildInstr(TargetOpcode::IMPLICIT_DEF)
    .addDef(InitSaveExecReg);

  Register PhiExec = MRI.createVirtualRegister(WaveRC);
  Register NewExec = MRI.createVirtualRegister(WaveRC);

  // To insert the loop we need to split the block. Move everything before this
  // point to a new block, and insert a new empty block before this instruction.
  MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
  MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
  MachineBasicBlock *RestoreExecBB = MF->CreateMachineBasicBlock();
  MachineFunction::iterator MBBI(MBB);
  ++MBBI;
  MF->insert(MBBI, LoopBB);
  MF->insert(MBBI, RestoreExecBB);
  MF->insert(MBBI, RemainderBB);

  LoopBB->addSuccessor(RestoreExecBB);
  LoopBB->addSuccessor(LoopBB);

  // Move the rest of the block into a new block.
  RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
  RemainderBB->splice(RemainderBB->begin(), &MBB, Range.end(), MBB.end());

  MBB.addSuccessor(LoopBB);
  RestoreExecBB->addSuccessor(RemainderBB);

  B.setInsertPt(*LoopBB, LoopBB->end());

  B.buildInstr(TargetOpcode::PHI)
    .addDef(PhiExec)
    .addReg(InitSaveExecReg)
    .addMBB(&MBB)
    .addReg(NewExec)
    .addMBB(LoopBB);

  for (auto Result : zip(InitResultRegs, ResultRegs, PhiRegs)) {
    B.buildInstr(TargetOpcode::G_PHI)
      .addDef(std::get<2>(Result))
      .addReg(std::get<0>(Result)) // Initial value / implicit_def
      .addMBB(&MBB)
      .addReg(std::get<1>(Result)) // Mid-loop value.
      .addMBB(LoopBB);
  }

  const DebugLoc &DL = B.getDL();

  MachineInstr &FirstInst = *Range.begin();

  // Move the instruction into the loop. Note we moved everything after
  // Range.end() already into a new block, so Range.end() is no longer valid.
  LoopBB->splice(LoopBB->end(), &MBB, Range.begin(), MBB.end());

  // Figure out the iterator range after splicing the instructions.
  MachineBasicBlock::iterator NewBegin = FirstInst.getIterator();
  auto NewEnd = LoopBB->end();

  MachineBasicBlock::iterator I = Range.begin();
  B.setInsertPt(*LoopBB, I);

  Register CondReg;

  assert(std::distance(NewBegin, NewEnd) == OrigRangeSize);

  for (MachineInstr &MI : make_range(NewBegin, NewEnd)) {
    for (MachineOperand &Op : MI.uses()) {
      if (!Op.isReg() || Op.isDef())
        continue;

      Register OldReg = Op.getReg();
      if (!SGPROperandRegs.count(OldReg))
        continue;

      // See if we already processed this register in another instruction in
      // the sequence.
      auto OldVal = WaterfalledRegMap.find(OldReg);
      if (OldVal != WaterfalledRegMap.end()) {
        Op.setReg(OldVal->second);
        continue;
      }

      LLT OpTy = MRI.getType(Op.getReg());
      unsigned OpSize = OpTy.getSizeInBits();

      // Can only do a readlane of 32-bit pieces.
      if (OpSize == 32) {
        // Avoid extra copies in the simple case of one 32-bit register.
        Register CurrentLaneOpReg
          = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
        MRI.setType(CurrentLaneOpReg, OpTy);

        constrainGenericRegister(Op.getReg(), AMDGPU::VGPR_32RegClass, MRI);
        // Read the next variant <- also loop target.
        BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
                CurrentLaneOpReg)
          .addReg(Op.getReg());

        Register NewCondReg = MRI.createVirtualRegister(WaveRC);
        bool First = CondReg == AMDGPU::NoRegister;
        if (First)
          CondReg = NewCondReg;

        // Compare the just read M0 value to all possible Idx values.
        B.buildInstr(AMDGPU::V_CMP_EQ_U32_e64)
          .addDef(NewCondReg)
          .addReg(CurrentLaneOpReg)
          .addReg(Op.getReg());
        Op.setReg(CurrentLaneOpReg);

        if (!First) {
          Register AndReg = MRI.createVirtualRegister(WaveRC);

          // If there are multiple operands to consider, AND the conditions
          // together.
          B.buildInstr(WaveAndOpc)
            .addDef(AndReg)
            .addReg(NewCondReg)
            .addReg(CondReg);
          CondReg = AndReg;
        }
      } else {
        LLT S32 = LLT::scalar(32);
        SmallVector<Register, 8> ReadlanePieces;

        // The compares can be done as 64-bit, but the extract needs to be done
        // in 32-bit pieces.

        bool Is64 = OpSize % 64 == 0;

        LLT UnmergeTy = OpSize % 64 == 0 ? LLT::scalar(64) : LLT::scalar(32);
        unsigned CmpOp = OpSize % 64 == 0 ? AMDGPU::V_CMP_EQ_U64_e64
          : AMDGPU::V_CMP_EQ_U32_e64;

        // Insert the unmerge before the loop.
        B.setMBB(MBB);
        auto Unmerge = B.buildUnmerge(UnmergeTy, Op.getReg());
        B.setInstr(*I);

        unsigned NumPieces = Unmerge->getNumOperands() - 1;
        for (unsigned PieceIdx = 0; PieceIdx != NumPieces; ++PieceIdx) {
          Register UnmergePiece = Unmerge.getReg(PieceIdx);

          Register CurrentLaneOpReg;
          if (Is64) {
            Register CurrentLaneOpRegLo = MRI.createGenericVirtualRegister(S32);
            Register CurrentLaneOpRegHi = MRI.createGenericVirtualRegister(S32);

            MRI.setRegClass(UnmergePiece, &AMDGPU::VReg_64RegClass);
            MRI.setRegClass(CurrentLaneOpRegLo, &AMDGPU::SReg_32_XM0RegClass);
            MRI.setRegClass(CurrentLaneOpRegHi, &AMDGPU::SReg_32_XM0RegClass);

            // Read the next variant <- also loop target.
            BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
                    CurrentLaneOpRegLo)
              .addReg(UnmergePiece, 0, AMDGPU::sub0);

            // Read the next variant <- also loop target.
            BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
                    CurrentLaneOpRegHi)
              .addReg(UnmergePiece, 0, AMDGPU::sub1);

            CurrentLaneOpReg =
              B.buildMerge(LLT::scalar(64),
                           {CurrentLaneOpRegLo, CurrentLaneOpRegHi})
              .getReg(0);

            MRI.setRegClass(CurrentLaneOpReg, &AMDGPU::SReg_64_XEXECRegClass);

            if (OpTy.getScalarSizeInBits() == 64) {
              // If we need to produce a 64-bit element vector, use the
              // merged pieces.
              ReadlanePieces.push_back(CurrentLaneOpReg);
            } else {
              // 32-bit element type.
              ReadlanePieces.push_back(CurrentLaneOpRegLo);
              ReadlanePieces.push_back(CurrentLaneOpRegHi);
            }
          } else {
            CurrentLaneOpReg = MRI.createGenericVirtualRegister(S32);
            MRI.setRegClass(UnmergePiece, &AMDGPU::VGPR_32RegClass);
            MRI.setRegClass(CurrentLaneOpReg, &AMDGPU::SReg_32_XM0RegClass);

            // Read the next variant <- also loop target.
            BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
                    CurrentLaneOpReg)
              .addReg(UnmergePiece);
            ReadlanePieces.push_back(CurrentLaneOpReg);
          }

          Register NewCondReg = MRI.createVirtualRegister(WaveRC);
          bool First = CondReg == AMDGPU::NoRegister;
          if (First)
            CondReg = NewCondReg;

          B.buildInstr(CmpOp)
            .addDef(NewCondReg)
            .addReg(CurrentLaneOpReg)
            .addReg(UnmergePiece);

          if (!First) {
            Register AndReg = MRI.createVirtualRegister(WaveRC);

            // If there are multiple operands to consider, AND the conditions
            // together.
            B.buildInstr(WaveAndOpc)
              .addDef(AndReg)
              .addReg(NewCondReg)
              .addReg(CondReg);
            CondReg = AndReg;
          }
        }

        // FIXME: Build merge seems to switch to CONCAT_VECTORS but not
        // BUILD_VECTOR
        if (OpTy.isVector()) {
          auto Merge = B.buildBuildVector(OpTy, ReadlanePieces);
          Op.setReg(Merge.getReg(0));
        } else {
          auto Merge = B.buildMerge(OpTy, ReadlanePieces);
          Op.setReg(Merge.getReg(0));
        }

        MRI.setRegBank(Op.getReg(), AMDGPU::SGPRRegBank);
      }

      // Make sure we don't re-process this register again.
      WaterfalledRegMap.insert(std::make_pair(OldReg, Op.getReg()));
    }
  }

  B.setInsertPt(*LoopBB, LoopBB->end());

  // Update EXEC, save the original EXEC value to VCC.
  B.buildInstr(AndSaveExecOpc)
    .addDef(NewExec)
    .addReg(CondReg, RegState::Kill);

  MRI.setSimpleHint(NewExec, CondReg);

  // Update EXEC, switch all done bits to 0 and all todo bits to 1.
  B.buildInstr(XorTermOpc)
    .addDef(ExecReg)
    .addReg(ExecReg)
    .addReg(NewExec);

  // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
  // s_cbranch_scc0?

  // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
  B.buildInstr(AMDGPU::S_CBRANCH_EXECNZ)
    .addMBB(LoopBB);

  // Save the EXEC mask before the loop.
  BuildMI(MBB, MBB.end(), DL, TII->get(MovTermOpc), SaveExecReg)
    .addReg(ExecReg);

  // Restore the EXEC mask after the loop.
  B.setMBB(*RestoreExecBB);
  B.buildInstr(MovTermOpc)
    .addDef(ExecReg)
    .addReg(SaveExecReg);

  // Set the insert point after the original instruction, so any new
  // instructions will be in the remainder.
  B.setInsertPt(*RemainderBB, RemainderBB->begin());

  return true;
}

// Return any unique registers used by \p MI at \p OpIndices that need to be
// handled in a waterfall loop. Returns these registers in \p
// SGPROperandRegs. Returns true if there are any operands to handle and a
// waterfall loop is necessary.
bool AMDGPURegisterBankInfo::collectWaterfallOperands(
  SmallSet<Register, 4> &SGPROperandRegs, MachineInstr &MI,
  MachineRegisterInfo &MRI, ArrayRef<unsigned> OpIndices) const {
  for (unsigned Op : OpIndices) {
    assert(MI.getOperand(Op).isUse());
    Register Reg = MI.getOperand(Op).getReg();
    const RegisterBank *OpBank = getRegBank(Reg, MRI, *TRI);
    if (OpBank->getID() == AMDGPU::VGPRRegBankID)
      SGPROperandRegs.insert(Reg);
  }

  // No operands need to be replaced, so no need to loop.
  return !SGPROperandRegs.empty();
}

bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
  MachineIRBuilder &B, MachineInstr &MI, MachineRegisterInfo &MRI,
  ArrayRef<unsigned> OpIndices) const {
  // Use a set to avoid extra readfirstlanes in the case where multiple operands
  // are the same register.
  SmallSet<Register, 4> SGPROperandRegs;

  if (!collectWaterfallOperands(SGPROperandRegs, MI, MRI, OpIndices))
    return false;

  MachineBasicBlock::iterator I = MI.getIterator();
  return executeInWaterfallLoop(B, make_range(I, std::next(I)),
                                SGPROperandRegs, MRI);
}

bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  ArrayRef<unsigned> OpIndices) const {
  MachineIRBuilder B(MI);
  return executeInWaterfallLoop(B, MI, MRI, OpIndices);
}

// Legalize an operand that must be an SGPR by inserting a readfirstlane.
void AMDGPURegisterBankInfo::constrainOpWithReadfirstlane(
    MachineInstr &MI, MachineRegisterInfo &MRI, unsigned OpIdx) const {
  Register Reg = MI.getOperand(OpIdx).getReg();
  const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
  if (Bank != &AMDGPU::VGPRRegBank)
    return;

  MachineIRBuilder B(MI);
  Register SGPR = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
  B.buildInstr(AMDGPU::V_READFIRSTLANE_B32)
    .addDef(SGPR)
    .addReg(Reg);

  MRI.setType(SGPR, MRI.getType(Reg));

  const TargetRegisterClass *Constrained =
      constrainGenericRegister(Reg, AMDGPU::VGPR_32RegClass, MRI);
  (void)Constrained;
  assert(Constrained && "Failed to constrain readfirstlane src reg");

  MI.getOperand(OpIdx).setReg(SGPR);
}

/// Split \p Ty into 2 pieces. The first will have \p FirstSize bits, and the
/// rest will be in the remainder.
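/// e.g. splitUnequalType(<3 x s32>, 64) returns {<2 x s32>, s32}.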
static std::pair<LLT, LLT> splitUnequalType(LLT Ty, unsigned FirstSize) {
  unsigned TotalSize = Ty.getSizeInBits();
  if (!Ty.isVector())
    return {LLT::scalar(FirstSize), LLT::scalar(TotalSize - FirstSize)};

  LLT EltTy = Ty.getElementType();
  unsigned EltSize = EltTy.getSizeInBits();
  assert(FirstSize % EltSize == 0);

  unsigned FirstPartNumElts = FirstSize / EltSize;
  unsigned RemainderElts = (TotalSize - FirstSize) / EltSize;

  return {LLT::scalarOrVector(FirstPartNumElts, EltTy),
          LLT::scalarOrVector(RemainderElts, EltTy)};
}

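// e.g. widen96To128(s96) == s128 and widen96To128(<3 x s32>) == <4 x s32>.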
static LLT widen96To128(LLT Ty) {
  if (!Ty.isVector())
    return LLT::scalar(128);

  LLT EltTy = Ty.getElementType();
  assert(128 % EltTy.getSizeInBits() == 0);
  return LLT::vector(128 / EltTy.getSizeInBits(), EltTy);
}

bool AMDGPURegisterBankInfo::applyMappingLoad(MachineInstr &MI,
                        const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
                              MachineRegisterInfo &MRI) const {
  Register DstReg = MI.getOperand(0).getReg();
  const LLT LoadTy = MRI.getType(DstReg);
  unsigned LoadSize = LoadTy.getSizeInBits();
  const unsigned MaxNonSmrdLoadSize = 128;

  const RegisterBank *PtrBank =
    OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
  if (PtrBank == &AMDGPU::SGPRRegBank) {
    // If the pointer is an SGPR, we ordinarily have nothing to do.
    if (LoadSize != 96)
      return false;

    MachineMemOperand *MMO = *MI.memoperands_begin();
    Register PtrReg = MI.getOperand(1).getReg();
    // 96-bit loads are only available for vector loads. We need to split this
    // into a 64-bit part and a 32-bit part (unless we can widen to a 128-bit
    // load).

    MachineIRBuilder B(MI);
    ApplyRegBankMapping O(*this, MRI, &AMDGPU::SGPRRegBank);
    GISelObserverWrapper Observer(&O);
    B.setChangeObserver(Observer);

    if (MMO->getAlign() < Align(16)) {
      LLT Part64, Part32;
      std::tie(Part64, Part32) = splitUnequalType(LoadTy, 64);
      auto Load0 = B.buildLoadFromOffset(Part64, PtrReg, *MMO, 0);
      auto Load1 = B.buildLoadFromOffset(Part32, PtrReg, *MMO, 8);

      auto Undef = B.buildUndef(LoadTy);
      auto Ins0 = B.buildInsert(LoadTy, Undef, Load0, 0);
      B.buildInsert(MI.getOperand(0), Ins0, Load1, 64);
    } else {
      LLT WiderTy = widen96To128(LoadTy);
      auto WideLoad = B.buildLoadFromOffset(WiderTy, PtrReg, *MMO, 0);
      B.buildExtract(MI.getOperand(0), WideLoad, 0);
    }

    MI.eraseFromParent();
    return true;
  }

  // 128-bit loads are supported for all instruction types.
  if (LoadSize <= MaxNonSmrdLoadSize)
    return false;

  SmallVector<Register, 16> DefRegs(OpdMapper.getVRegs(0));
  SmallVector<Register, 1> SrcRegs(OpdMapper.getVRegs(1));

  if (SrcRegs.empty())
    SrcRegs.push_back(MI.getOperand(1).getReg());

  assert(LoadSize % MaxNonSmrdLoadSize == 0);

  // RegBankSelect only emits scalar types, so we need to reset the pointer
  // operand to a pointer type.
  Register BasePtrReg = SrcRegs[0];
  LLT PtrTy = MRI.getType(MI.getOperand(1).getReg());
  MRI.setType(BasePtrReg, PtrTy);

  MachineIRBuilder B(MI);

  unsigned NumSplitParts = LoadTy.getSizeInBits() / MaxNonSmrdLoadSize;
  const LLT LoadSplitTy = LoadTy.divide(NumSplitParts);
  ApplyRegBankMapping O(*this, MRI, &AMDGPU::VGPRRegBank);
  GISelObserverWrapper Observer(&O);
  B.setChangeObserver(Observer);
  LegalizerHelper Helper(B.getMF(), Observer, B);

  if (LoadTy.isVector()) {
    if (Helper.fewerElementsVector(MI, 0, LoadSplitTy) != LegalizerHelper::Legalized)
      return false;
  } else {
    if (Helper.narrowScalar(MI, 0, LoadSplitTy) != LegalizerHelper::Legalized)
      return false;
  }

  MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
  return true;
}

bool AMDGPURegisterBankInfo::applyMappingDynStackAlloc(
  MachineInstr &MI,
  const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
  MachineRegisterInfo &MRI) const {
  const MachineFunction &MF = *MI.getMF();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const auto &TFI = *ST.getFrameLowering();

  // Guard in case the stack growth direction ever changes with scratch
  // instructions.
  if (TFI.getStackGrowthDirection() == TargetFrameLowering::StackGrowsDown)
    return false;

  Register Dst = MI.getOperand(0).getReg();
  Register AllocSize = MI.getOperand(1).getReg();
  Align Alignment = assumeAligned(MI.getOperand(2).getImm());

  const RegisterBank *SizeBank = getRegBank(AllocSize, MRI, *TRI);

  // TODO: Need to emit a wave reduction to get the maximum size.
  if (SizeBank != &AMDGPU::SGPRRegBank)
    return false;

  LLT PtrTy = MRI.getType(Dst);
  LLT IntPtrTy = LLT::scalar(PtrTy.getSizeInBits());

  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  Register SPReg = Info->getStackPtrOffsetReg();
  ApplyRegBankMapping ApplyBank(*this, MRI, &AMDGPU::SGPRRegBank);
  GISelObserverWrapper Observer(&ApplyBank);

  MachineIRBuilder B(MI);
  B.setChangeObserver(Observer);

  auto WaveSize = B.buildConstant(LLT::scalar(32), ST.getWavefrontSizeLog2());
  auto ScaledSize = B.buildShl(IntPtrTy, AllocSize, WaveSize);

  auto SPCopy = B.buildCopy(PtrTy, SPReg);
  if (Alignment > TFI.getStackAlign()) {
    auto PtrAdd = B.buildPtrAdd(PtrTy, SPCopy, ScaledSize);
    B.buildMaskLowPtrBits(Dst, PtrAdd,
                          Log2(Alignment) + ST.getWavefrontSizeLog2());
  } else {
    B.buildPtrAdd(Dst, SPCopy, ScaledSize);
  }

  MI.eraseFromParent();
  return true;
}

bool AMDGPURegisterBankInfo::applyMappingImage(
    MachineInstr &MI, const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
    MachineRegisterInfo &MRI, int RsrcIdx) const {
  const int NumDefs = MI.getNumExplicitDefs();

  // The reported argument index is relative to the IR intrinsic call arguments,
  // so we need to shift by the number of defs and the intrinsic ID.
  RsrcIdx += NumDefs + 1;

  // Insert copies to VGPR arguments.
  applyDefaultMapping(OpdMapper);

  // Fixup any SGPR arguments.
  SmallVector<unsigned, 4> SGPRIndexes;
  for (int I = NumDefs, NumOps = MI.getNumOperands(); I != NumOps; ++I) {
    if (!MI.getOperand(I).isReg())
      continue;

    // If this intrinsic has a sampler, it immediately follows rsrc.
    if (I == RsrcIdx || I == RsrcIdx + 1)
      SGPRIndexes.push_back(I);
  }

  executeInWaterfallLoop(MI, MRI, SGPRIndexes);
  return true;
}

static Register getSrcRegIgnoringCopies(const MachineRegisterInfo &MRI,
                                        Register Reg) {
  MachineInstr *Def = getDefIgnoringCopies(Reg, MRI);
  if (!Def)
    return Reg;

  // TODO: Guard against this being an implicit def
  return Def->getOperand(0).getReg();
}

// Analyze a combined offset from an llvm.amdgcn.s.buffer.load intrinsic and
// store the three offsets (voffset, soffset and instoffset).
static unsigned setBufferOffsets(MachineIRBuilder &B,
                                 const AMDGPURegisterBankInfo &RBI,
                                 Register CombinedOffset, Register &VOffsetReg,
                                 Register &SOffsetReg, int64_t &InstOffsetVal,
                                 Align Alignment) {
  const LLT S32 = LLT::scalar(32);
  MachineRegisterInfo *MRI = B.getMRI();

  if (Optional<int64_t> Imm = getConstantVRegVal(CombinedOffset, *MRI)) {
    uint32_t SOffset, ImmOffset;
    if (AMDGPU::splitMUBUFOffset(*Imm, SOffset, ImmOffset, &RBI.Subtarget,
                                 Alignment)) {
      VOffsetReg = B.buildConstant(S32, 0).getReg(0);
      SOffsetReg = B.buildConstant(S32, SOffset).getReg(0);
      InstOffsetVal = ImmOffset;

      B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank);
      B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank);
      return SOffset + ImmOffset;
    }
  }

  Register Base;
  unsigned Offset;
  MachineInstr *Unused;

  std::tie(Base, Offset, Unused)
    = AMDGPU::getBaseWithConstantOffset(*MRI, CombinedOffset);

  uint32_t SOffset, ImmOffset;
  if (Offset > 0 && AMDGPU::splitMUBUFOffset(Offset, SOffset, ImmOffset,
                                             &RBI.Subtarget, Alignment)) {
    if (RBI.getRegBank(Base, *MRI, *RBI.TRI) == &AMDGPU::VGPRRegBank) {
      VOffsetReg = Base;
      SOffsetReg = B.buildConstant(S32, SOffset).getReg(0);
      B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank);
      InstOffsetVal = ImmOffset;
      return 0; // XXX - Why is this 0?
    }

    // If we have an SGPR base, we can use it for soffset.
    if (SOffset == 0) {
      VOffsetReg = B.buildConstant(S32, 0).getReg(0);
      B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank);
      SOffsetReg = Base;
      InstOffsetVal = ImmOffset;
      return 0; // XXX - Why is this 0?
    }
  }

  // Handle the variable sgpr + vgpr case.
  if (MachineInstr *Add = getOpcodeDef(AMDGPU::G_ADD, CombinedOffset, *MRI)) {
    Register Src0 = getSrcRegIgnoringCopies(*MRI, Add->getOperand(1).getReg());
    Register Src1 = getSrcRegIgnoringCopies(*MRI, Add->getOperand(2).getReg());

    const RegisterBank *Src0Bank = RBI.getRegBank(Src0, *MRI, *RBI.TRI);
    const RegisterBank *Src1Bank = RBI.getRegBank(Src1, *MRI, *RBI.TRI);

    if (Src0Bank == &AMDGPU::VGPRRegBank && Src1Bank == &AMDGPU::SGPRRegBank) {
      VOffsetReg = Src0;
      SOffsetReg = Src1;
      return 0;
    }

    if (Src0Bank == &AMDGPU::SGPRRegBank && Src1Bank == &AMDGPU::VGPRRegBank) {
      VOffsetReg = Src1;
      SOffsetReg = Src0;
      return 0;
    }
  }

  // Ensure we have a VGPR for the combined offset. This could be an issue if we
  // have an SGPR offset and a VGPR resource.
  if (RBI.getRegBank(CombinedOffset, *MRI, *RBI.TRI) == &AMDGPU::VGPRRegBank) {
    VOffsetReg = CombinedOffset;
  } else {
    VOffsetReg = B.buildCopy(S32, CombinedOffset).getReg(0);
    B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank);
  }

  SOffsetReg = B.buildConstant(S32, 0).getReg(0);
  B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank);
  return 0;
}

bool AMDGPURegisterBankInfo::applyMappingSBufferLoad(
  const OperandsMapper &OpdMapper) const {
  MachineInstr &MI = OpdMapper.getMI();
  MachineRegisterInfo &MRI = OpdMapper.getMRI();

  const LLT S32 = LLT::scalar(32);
  Register Dst = MI.getOperand(0).getReg();
  LLT Ty = MRI.getType(Dst);

  const RegisterBank *RSrcBank =
    OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
  const RegisterBank *OffsetBank =
    OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
  if (RSrcBank == &AMDGPU::SGPRRegBank &&
      OffsetBank == &AMDGPU::SGPRRegBank)
    return true; // Legal mapping

  // FIXME: 96-bit case was widened during legalize. We need to narrow it back
  // here but don't have an MMO.

  unsigned LoadSize = Ty.getSizeInBits();
  int NumLoads = 1;
  if (LoadSize == 256 || LoadSize == 512) {
    NumLoads = LoadSize / 128;
    Ty = Ty.divide(NumLoads);
  }

  // Use the alignment to ensure that the required offsets will fit into the
  // immediate offsets.
  const Align Alignment = NumLoads > 1 ? Align(16 * NumLoads) : Align(1);

  MachineIRBuilder B(MI);
  MachineFunction &MF = B.getMF();

  Register SOffset;
  Register VOffset;
  int64_t ImmOffset = 0;

  unsigned MMOOffset = setBufferOffsets(B, *this, MI.getOperand(2).getReg(),
                                        VOffset, SOffset, ImmOffset, Alignment);

  // TODO: 96-bit loads were widened to 128-bit results. Shrink the result if we
  // can, but we need to track an MMO for that.
  const unsigned MemSize = (Ty.getSizeInBits() + 7) / 8;
  const Align MemAlign(4); // FIXME: ABI type alignment?
  MachineMemOperand *BaseMMO = MF.getMachineMemOperand(
    MachinePointerInfo(),
    MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
    MachineMemOperand::MOInvariant,
    MemSize, MemAlign);
  if (MMOOffset != 0)
    BaseMMO = MF.getMachineMemOperand(BaseMMO, MMOOffset, MemSize);

  // If only the offset is divergent, emit a MUBUF buffer load instead. We can
  // assume that the buffer is unswizzled.

  Register RSrc = MI.getOperand(1).getReg();
  Register VIndex = B.buildConstant(S32, 0).getReg(0);
  B.getMRI()->setRegBank(VIndex, AMDGPU::VGPRRegBank);

  SmallVector<Register, 4> LoadParts(NumLoads);

  MachineBasicBlock::iterator MII = MI.getIterator();
  MachineInstrSpan Span(MII, &B.getMBB());

  for (int i = 0; i < NumLoads; ++i) {
    if (NumLoads == 1) {
      LoadParts[i] = Dst;
    } else {
      LoadParts[i] = MRI.createGenericVirtualRegister(Ty);
      MRI.setRegBank(LoadParts[i], AMDGPU::VGPRRegBank);
    }

    MachineMemOperand *MMO = BaseMMO;
    if (i != 0)
      BaseMMO = MF.getMachineMemOperand(BaseMMO, MMOOffset + 16 * i, MemSize);

    B.buildInstr(AMDGPU::G_AMDGPU_BUFFER_LOAD)
      .addDef(LoadParts[i])       // vdata
      .addUse(RSrc)               // rsrc
      .addUse(VIndex)             // vindex
      .addUse(VOffset)            // voffset
      .addUse(SOffset)            // soffset
      .addImm(ImmOffset + 16 * i) // offset(imm)
      .addImm(0)                  // cachepolicy, swizzled buffer(imm)
      .addImm(0)                  // idxen(imm)
      .addMemOperand(MMO);
  }

  // TODO: If only the resource is a VGPR, it may be better to execute the
  // scalar load in the waterfall loop if the resource is expected to frequently
  // be dynamically uniform.
  if (RSrcBank != &AMDGPU::SGPRRegBank) {
    // Remove the original instruction to avoid potentially confusing the
    // waterfall loop logic.
    B.setInstr(*Span.begin());
    MI.eraseFromParent();

    SmallSet<Register, 4> OpsToWaterfall;

    OpsToWaterfall.insert(RSrc);
    executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()),
                           OpsToWaterfall, MRI);
  }

  if (NumLoads != 1) {
    if (Ty.isVector())
      B.buildConcatVectors(Dst, LoadParts);
    else
      B.buildMerge(Dst, LoadParts);
  }

  // We removed the instruction earlier with a waterfall loop.
  if (RSrcBank == &AMDGPU::SGPRRegBank)
    MI.eraseFromParent();

  return true;
}
1507
applyMappingBFEIntrinsic(const OperandsMapper & OpdMapper,bool Signed) const1508 bool AMDGPURegisterBankInfo::applyMappingBFEIntrinsic(
1509 const OperandsMapper &OpdMapper, bool Signed) const {
1510 MachineInstr &MI = OpdMapper.getMI();
1511 MachineRegisterInfo &MRI = OpdMapper.getMRI();
1512
1513 // Insert basic copies
1514 applyDefaultMapping(OpdMapper);
1515
1516 Register DstReg = MI.getOperand(0).getReg();
1517 LLT Ty = MRI.getType(DstReg);
1518
1519 const LLT S32 = LLT::scalar(32);
1520
1521 const RegisterBank *DstBank =
1522 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
1523 if (DstBank == &AMDGPU::VGPRRegBank) {
1524 if (Ty == S32)
1525 return true;
1526
1527 // TODO: 64-bit version is scalar only, so we need to expand this.
1528 return false;
1529 }
1530
1531 Register SrcReg = MI.getOperand(2).getReg();
1532 Register OffsetReg = MI.getOperand(3).getReg();
1533 Register WidthReg = MI.getOperand(4).getReg();
1534
1535 // The scalar form packs the offset and width in a single operand.
1536
1537 ApplyRegBankMapping ApplyBank(*this, MRI, &AMDGPU::SGPRRegBank);
1538 GISelObserverWrapper Observer(&ApplyBank);
1539 MachineIRBuilder B(MI);
1540 B.setChangeObserver(Observer);
1541
1542 // Ensure the high bits are clear to insert the offset.
1543 auto OffsetMask = B.buildConstant(S32, maskTrailingOnes<unsigned>(6));
1544 auto ClampOffset = B.buildAnd(S32, OffsetReg, OffsetMask);
1545
1546 // Zeros out the low bits, so don't bother clamping the input value.
1547 auto ShiftWidth = B.buildShl(S32, WidthReg, B.buildConstant(S32, 16));
1548
1549 // Transformation function, pack the offset and width of a BFE into
1550 // the format expected by the S_BFE_I32 / S_BFE_U32. In the second
1551 // source, bits [5:0] contain the offset and bits [22:16] the width.
1552 auto MergedInputs = B.buildOr(S32, ClampOffset, ShiftWidth);
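  // e.g. offset = 3, width = 8 is packed as (8 << 16) | 3 = 0x00080003.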

  // TODO: It might be worth using a pseudo here to avoid scc clobber and
  // register class constraints.
  unsigned Opc = Ty == S32 ? (Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32) :
                             (Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64);

  auto MIB = B.buildInstr(Opc, {DstReg}, {SrcReg, MergedInputs});
  if (!constrainSelectedInstRegOperands(*MIB, *TII, *TRI, *this))
    llvm_unreachable("failed to constrain BFE");

  MI.eraseFromParent();
  return true;
}

// FIXME: Duplicated from LegalizerHelper
static CmpInst::Predicate minMaxToCompare(unsigned Opc) {
  switch (Opc) {
  case TargetOpcode::G_SMIN:
    return CmpInst::ICMP_SLT;
  case TargetOpcode::G_SMAX:
    return CmpInst::ICMP_SGT;
  case TargetOpcode::G_UMIN:
    return CmpInst::ICMP_ULT;
  case TargetOpcode::G_UMAX:
    return CmpInst::ICMP_UGT;
  default:
    llvm_unreachable("not in integer min/max");
  }
}

static unsigned minMaxToExtend(unsigned Opc) {
  switch (Opc) {
  case TargetOpcode::G_SMIN:
  case TargetOpcode::G_SMAX:
    return TargetOpcode::G_SEXT;
  case TargetOpcode::G_UMIN:
  case TargetOpcode::G_UMAX:
    return TargetOpcode::G_ZEXT;
  default:
    llvm_unreachable("not in integer min/max");
  }
}

// Emit a legalized extension from <2 x s16> to 2 32-bit components, avoiding
// any illegal vector extend or unmerge operations.
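// e.g. for a zero-extending unpack, a <2 x s16> register R becomes the two
// s32 values (R & 0xffff, R >> 16).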
static std::pair<Register, Register>
unpackV2S16ToS32(MachineIRBuilder &B, Register Src, unsigned ExtOpcode) {
  const LLT S32 = LLT::scalar(32);
  auto Bitcast = B.buildBitcast(S32, Src);

  if (ExtOpcode == TargetOpcode::G_SEXT) {
    auto ExtLo = B.buildSExtInReg(S32, Bitcast, 16);
    auto ShiftHi = B.buildAShr(S32, Bitcast, B.buildConstant(S32, 16));
    return std::make_pair(ExtLo.getReg(0), ShiftHi.getReg(0));
  }

  auto ShiftHi = B.buildLShr(S32, Bitcast, B.buildConstant(S32, 16));
  if (ExtOpcode == TargetOpcode::G_ZEXT) {
    auto ExtLo = B.buildAnd(S32, Bitcast, B.buildConstant(S32, 0xffff));
    return std::make_pair(ExtLo.getReg(0), ShiftHi.getReg(0));
  }

  assert(ExtOpcode == TargetOpcode::G_ANYEXT);
  return std::make_pair(Bitcast.getReg(0), ShiftHi.getReg(0));
}

static MachineInstr *buildExpandedScalarMinMax(MachineIRBuilder &B,
                                               CmpInst::Predicate Pred,
                                               Register Dst, Register Src0,
                                               Register Src1) {
  const LLT CmpType = LLT::scalar(32);
  auto Cmp = B.buildICmp(Pred, CmpType, Src0, Src1);
  return B.buildSelect(Dst, Cmp, Src0, Src1);
}

// FIXME: Duplicated from LegalizerHelper, except changing the boolean type.
void AMDGPURegisterBankInfo::lowerScalarMinMax(MachineIRBuilder &B,
                                               MachineInstr &MI) const {
  Register Dst = MI.getOperand(0).getReg();
  Register Src0 = MI.getOperand(1).getReg();
  Register Src1 = MI.getOperand(2).getReg();

  const CmpInst::Predicate Pred = minMaxToCompare(MI.getOpcode());
  MachineInstr *Sel = buildExpandedScalarMinMax(B, Pred, Dst, Src0, Src1);

  Register CmpReg = Sel->getOperand(1).getReg();
  B.getMRI()->setRegBank(CmpReg, AMDGPU::SGPRRegBank);
  MI.eraseFromParent();
}

// For cases where only a single copy is inserted for matching register banks,
// replace the register in the instruction operand with the copied register.
static bool substituteSimpleCopyRegs(
    const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper, unsigned OpIdx) {
  SmallVector<unsigned, 1> SrcReg(OpdMapper.getVRegs(OpIdx));
  if (!SrcReg.empty()) {
    assert(SrcReg.size() == 1);
    OpdMapper.getMI().getOperand(OpIdx).setReg(SrcReg[0]);
    return true;
  }

  return false;
}

/// Handle register layout difference for f16 images for some subtargets.
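/// e.g. on subtargets with unpacked D16 memory instructions, a <4 x s16>
/// store value is rewritten so each 16-bit element occupies its own 32-bit
/// lane.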
Register AMDGPURegisterBankInfo::handleD16VData(MachineIRBuilder &B,
                                                MachineRegisterInfo &MRI,
                                                Register Reg) const {
  if (!Subtarget.hasUnpackedD16VMem())
    return Reg;

  const LLT S16 = LLT::scalar(16);
  LLT StoreVT = MRI.getType(Reg);
  if (!StoreVT.isVector() || StoreVT.getElementType() != S16)
    return Reg;

  auto Unmerge = B.buildUnmerge(S16, Reg);

  SmallVector<Register, 4> WideRegs;
  for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
    WideRegs.push_back(Unmerge.getReg(I));

  const LLT S32 = LLT::scalar(32);
  int NumElts = StoreVT.getNumElements();

  return B.buildMerge(LLT::vector(NumElts, S32), WideRegs).getReg(0);
}

static std::pair<Register, unsigned>
getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg) {
  int64_t Const;
  if (mi_match(Reg, MRI, m_ICst(Const)))
    return std::make_pair(Register(), Const);

  Register Base;
  if (mi_match(Reg, MRI, m_GAdd(m_Reg(Base), m_ICst(Const))))
    return std::make_pair(Base, Const);

  // TODO: Handle G_OR used for add case
  return std::make_pair(Reg, 0);
}

std::pair<Register, unsigned>
AMDGPURegisterBankInfo::splitBufferOffsets(MachineIRBuilder &B,
                                           Register OrigOffset) const {
  const unsigned MaxImm = 4095;
  Register BaseReg;
  unsigned ImmOffset;
  const LLT S32 = LLT::scalar(32);

  std::tie(BaseReg, ImmOffset) = getBaseWithConstantOffset(*B.getMRI(),
                                                           OrigOffset);

  unsigned C1 = 0;
  if (ImmOffset != 0) {
    // If the immediate value is too big for the immoffset field, put the value
    // and -4096 into the immoffset field so that the value that is copied/added
    // for the voffset field is a multiple of 4096, and it stands more chance
    // of being CSEd with the copy/add for another similar load/store.
    // However, do not do that rounding down to a multiple of 4096 if that is a
    // negative number, as it appears to be illegal to have a negative offset
    // in the vgpr, even if adding the immediate offset makes it positive.
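    // e.g. an original immediate offset of 8200 yields Overflow = 8192, which
    // is folded into the base register, and ImmOffset = 8 for the instruction.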
    unsigned Overflow = ImmOffset & ~MaxImm;
    ImmOffset -= Overflow;
    if ((int32_t)Overflow < 0) {
      Overflow += ImmOffset;
      ImmOffset = 0;
    }

    C1 = ImmOffset;
    if (Overflow != 0) {
      if (!BaseReg)
        BaseReg = B.buildConstant(S32, Overflow).getReg(0);
      else {
        auto OverflowVal = B.buildConstant(S32, Overflow);
        BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
      }
    }
  }

  if (!BaseReg)
    BaseReg = B.buildConstant(S32, 0).getReg(0);

  return {BaseReg, C1};
}

static bool isZero(Register Reg, MachineRegisterInfo &MRI) {
  int64_t C;
  return mi_match(Reg, MRI, m_ICst(C)) && C == 0;
}

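// The buffer intrinsic cachepolicy immediate packs glc in bit 0, slc in
// bit 1 and dlc in bit 2.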
static unsigned extractGLC(unsigned CachePolicy) {
  return CachePolicy & 1;
}

static unsigned extractSLC(unsigned CachePolicy) {
  return (CachePolicy >> 1) & 1;
}

static unsigned extractDLC(unsigned CachePolicy) {
  return (CachePolicy >> 2) & 1;
}

MachineInstr *
AMDGPURegisterBankInfo::selectStoreIntrinsic(MachineIRBuilder &B,
                                             MachineInstr &MI) const {
  MachineRegisterInfo &MRI = *B.getMRI();
  executeInWaterfallLoop(B, MI, MRI, {2, 4});

  // FIXME: DAG lowering brokenly changes opcode based on FP vs. integer.

  Register VData = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(VData);

  int EltSize = Ty.getScalarSizeInBits();
  int Size = Ty.getSizeInBits();

  // FIXME: Broken integer truncstore.
  if (EltSize != 32)
    report_fatal_error("unhandled intrinsic store");

  // FIXME: Verifier should enforce 1 MMO for these intrinsics.
  const int MemSize = (*MI.memoperands_begin())->getSize();

  Register RSrc = MI.getOperand(2).getReg();
  Register VOffset = MI.getOperand(3).getReg();
  Register SOffset = MI.getOperand(4).getReg();
  unsigned CachePolicy = MI.getOperand(5).getImm();

  unsigned ImmOffset;
  std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);

  const bool Offen = !isZero(VOffset, MRI);

  unsigned Opc = AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact;
  switch (8 * MemSize) {
  case 8:
    Opc = Offen ? AMDGPU::BUFFER_STORE_BYTE_OFFEN_exact :
                  AMDGPU::BUFFER_STORE_BYTE_OFFSET_exact;
    break;
  case 16:
    Opc = Offen ? AMDGPU::BUFFER_STORE_SHORT_OFFEN_exact :
                  AMDGPU::BUFFER_STORE_SHORT_OFFSET_exact;
    break;
  default:
    Opc = Offen ? AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact :
                  AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact;
    if (Size > 32)
      Opc = AMDGPU::getMUBUFOpcode(Opc, Size / 32);
    break;
  }

  // Set the insertion point back to the instruction in case it was moved into
  // a loop.
  B.setInstr(MI);

  MachineInstrBuilder MIB = B.buildInstr(Opc)
    .addUse(VData);

  if (Offen)
    MIB.addUse(VOffset);

  MIB.addUse(RSrc)
     .addUse(SOffset)
     .addImm(ImmOffset)
     .addImm(extractGLC(CachePolicy))
     .addImm(extractSLC(CachePolicy))
     .addImm(0) // tfe: FIXME: Remove from inst
     .addImm(extractDLC(CachePolicy))
     .cloneMemRefs(MI);

  // FIXME: We need a way to report failure from applyMappingImpl.
  // Insert constrain copies before inserting the loop.
  if (!constrainSelectedInstRegOperands(*MIB, *TII, *TRI, *this))
    report_fatal_error("failed to constrain selected store intrinsic");

  return MIB;
}

bool AMDGPURegisterBankInfo::buildVCopy(MachineIRBuilder &B, Register DstReg,
                                        Register SrcReg) const {
  MachineRegisterInfo &MRI = *B.getMRI();
  LLT SrcTy = MRI.getType(SrcReg);
  if (SrcTy.getSizeInBits() == 32) {
    // Use a v_mov_b32 here to make the exec dependency explicit.
    B.buildInstr(AMDGPU::V_MOV_B32_e32)
      .addDef(DstReg)
      .addUse(SrcReg);
    return constrainGenericRegister(DstReg, AMDGPU::VGPR_32RegClass, MRI) &&
           constrainGenericRegister(SrcReg, AMDGPU::SReg_32RegClass, MRI);
  }

  Register TmpReg0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  Register TmpReg1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

  B.buildInstr(AMDGPU::V_MOV_B32_e32)
    .addDef(TmpReg0)
    .addUse(SrcReg, 0, AMDGPU::sub0);
  B.buildInstr(AMDGPU::V_MOV_B32_e32)
    .addDef(TmpReg1)
    .addUse(SrcReg, 0, AMDGPU::sub1);
  B.buildInstr(AMDGPU::REG_SEQUENCE)
    .addDef(DstReg)
    .addUse(TmpReg0)
    .addImm(AMDGPU::sub0)
    .addUse(TmpReg1)
    .addImm(AMDGPU::sub1);

  return constrainGenericRegister(SrcReg, AMDGPU::SReg_64RegClass, MRI) &&
         constrainGenericRegister(DstReg, AMDGPU::VReg_64RegClass, MRI);
}

/// Utility function for pushing dynamic vector indexes with a constant offset
/// into waterfall loops.
static void reinsertVectorIndexAdd(MachineIRBuilder &B,
                                   MachineInstr &IdxUseInstr,
                                   unsigned OpIdx,
                                   unsigned ConstOffset) {
  MachineRegisterInfo &MRI = *B.getMRI();
  const LLT S32 = LLT::scalar(32);
  Register WaterfallIdx = IdxUseInstr.getOperand(OpIdx).getReg();
  B.setInsertPt(*IdxUseInstr.getParent(), IdxUseInstr.getIterator());

  auto MaterializedOffset = B.buildConstant(S32, ConstOffset);

  auto Add = B.buildAdd(S32, WaterfallIdx, MaterializedOffset);
  MRI.setRegBank(MaterializedOffset.getReg(0), AMDGPU::SGPRRegBank);
  MRI.setRegBank(Add.getReg(0), AMDGPU::SGPRRegBank);
  IdxUseInstr.getOperand(OpIdx).setReg(Add.getReg(0));
}

/// Implement extending a 32-bit value to a 64-bit value. \p Lo32Reg is the
/// original 32-bit source value (to be inserted in the low part of the
/// combined 64-bit result), and \p Hi32Reg is the high half of the combined
/// 64-bit value.
static void extendLow32IntoHigh32(MachineIRBuilder &B,
                                  Register Hi32Reg, Register Lo32Reg,
                                  unsigned ExtOpc,
                                  const RegisterBank &RegBank,
                                  bool IsBooleanSrc = false) {
  if (ExtOpc == AMDGPU::G_ZEXT) {
    B.buildConstant(Hi32Reg, 0);
  } else if (ExtOpc == AMDGPU::G_SEXT) {
    if (IsBooleanSrc) {
      // If we know the original source was an s1, the high half is the same as
      // the low.
      B.buildCopy(Hi32Reg, Lo32Reg);
    } else {
      // Replicate sign bit from 32-bit extended part.
      auto ShiftAmt = B.buildConstant(LLT::scalar(32), 31);
      B.getMRI()->setRegBank(ShiftAmt.getReg(0), RegBank);
      B.buildAShr(Hi32Reg, Lo32Reg, ShiftAmt);
    }
  } else {
    assert(ExtOpc == AMDGPU::G_ANYEXT && "not an integer extension");
    B.buildUndef(Hi32Reg);
  }
}

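/// Lower a dynamic vector extract by comparing the index against each
/// possible constant element index and chaining selects, avoiding a waterfall
/// loop when the expansion is expected to be cheaper.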
bool AMDGPURegisterBankInfo::foldExtractEltToCmpSelect(
    MachineInstr &MI, MachineRegisterInfo &MRI,
    const OperandsMapper &OpdMapper) const {

  Register VecReg = MI.getOperand(1).getReg();
  Register Idx = MI.getOperand(2).getReg();

  const RegisterBank &IdxBank =
      *OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;

  bool IsDivergentIdx = IdxBank == AMDGPU::VGPRRegBank;

  LLT VecTy = MRI.getType(VecReg);
  unsigned EltSize = VecTy.getScalarSizeInBits();
  unsigned NumElem = VecTy.getNumElements();

  if (!SITargetLowering::shouldExpandVectorDynExt(EltSize, NumElem,
                                                  IsDivergentIdx))
    return false;

  MachineIRBuilder B(MI);
  LLT S32 = LLT::scalar(32);

  const RegisterBank &DstBank =
      *OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
  const RegisterBank &SrcBank =
      *OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;

  const RegisterBank &CCBank =
      (DstBank == AMDGPU::SGPRRegBank &&
       SrcBank == AMDGPU::SGPRRegBank &&
       IdxBank == AMDGPU::SGPRRegBank) ? AMDGPU::SGPRRegBank
                                       : AMDGPU::VCCRegBank;
  LLT CCTy = (CCBank == AMDGPU::SGPRRegBank) ? S32 : LLT::scalar(1);

  if (CCBank == AMDGPU::VCCRegBank && IdxBank == AMDGPU::SGPRRegBank) {
    Idx = B.buildCopy(S32, Idx)->getOperand(0).getReg();
    MRI.setRegBank(Idx, AMDGPU::VGPRRegBank);
  }

  LLT EltTy = VecTy.getScalarType();
  SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0));
  unsigned NumLanes = DstRegs.size();
  if (!NumLanes)
    NumLanes = 1;
  else
    EltTy = MRI.getType(DstRegs[0]);

  auto UnmergeToEltTy = B.buildUnmerge(EltTy, VecReg);
  SmallVector<Register, 2> Res(NumLanes);
  for (unsigned L = 0; L < NumLanes; ++L)
    Res[L] = UnmergeToEltTy.getReg(L);

  for (unsigned I = 1; I < NumElem; ++I) {
    auto IC = B.buildConstant(S32, I);
    MRI.setRegBank(IC->getOperand(0).getReg(), AMDGPU::SGPRRegBank);
    auto Cmp = B.buildICmp(CmpInst::ICMP_EQ, CCTy, Idx, IC);
    MRI.setRegBank(Cmp->getOperand(0).getReg(), CCBank);

    for (unsigned L = 0; L < NumLanes; ++L) {
      auto S = B.buildSelect(EltTy, Cmp,
                             UnmergeToEltTy.getReg(I * NumLanes + L), Res[L]);

      for (unsigned N : { 0, 2, 3 })
        MRI.setRegBank(S->getOperand(N).getReg(), DstBank);

      Res[L] = S->getOperand(0).getReg();
    }
  }

  for (unsigned L = 0; L < NumLanes; ++L) {
    Register DstReg = (NumLanes == 1) ? MI.getOperand(0).getReg() : DstRegs[L];
    B.buildCopy(DstReg, Res[L]);
    MRI.setRegBank(DstReg, DstBank);
  }

  MRI.setRegBank(MI.getOperand(0).getReg(), DstBank);
  MI.eraseFromParent();

  return true;
}

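/// Lower a dynamic vector insert the same way: for each element, select
/// between the original element and the inserted value based on an index
/// comparison, then rebuild the vector.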
bool AMDGPURegisterBankInfo::foldInsertEltToCmpSelect(
    MachineInstr &MI, MachineRegisterInfo &MRI,
    const OperandsMapper &OpdMapper) const {

  Register VecReg = MI.getOperand(1).getReg();
  Register Idx = MI.getOperand(3).getReg();

  const RegisterBank &IdxBank =
      *OpdMapper.getInstrMapping().getOperandMapping(3).BreakDown[0].RegBank;

  bool IsDivergentIdx = IdxBank == AMDGPU::VGPRRegBank;

  LLT VecTy = MRI.getType(VecReg);
  unsigned EltSize = VecTy.getScalarSizeInBits();
  unsigned NumElem = VecTy.getNumElements();

  if (!SITargetLowering::shouldExpandVectorDynExt(EltSize, NumElem,
                                                  IsDivergentIdx))
    return false;

  MachineIRBuilder B(MI);
  LLT S32 = LLT::scalar(32);

  const RegisterBank &DstBank =
      *OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
  const RegisterBank &SrcBank =
      *OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
  const RegisterBank &InsBank =
      *OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;

  const RegisterBank &CCBank =
      (DstBank == AMDGPU::SGPRRegBank &&
       SrcBank == AMDGPU::SGPRRegBank &&
       InsBank == AMDGPU::SGPRRegBank &&
       IdxBank == AMDGPU::SGPRRegBank) ? AMDGPU::SGPRRegBank
                                       : AMDGPU::VCCRegBank;
  LLT CCTy = (CCBank == AMDGPU::SGPRRegBank) ? S32 : LLT::scalar(1);

  if (CCBank == AMDGPU::VCCRegBank && IdxBank == AMDGPU::SGPRRegBank) {
    Idx = B.buildCopy(S32, Idx)->getOperand(0).getReg();
    MRI.setRegBank(Idx, AMDGPU::VGPRRegBank);
  }

  LLT EltTy = VecTy.getScalarType();
  SmallVector<Register, 2> InsRegs(OpdMapper.getVRegs(2));
  unsigned NumLanes = InsRegs.size();
  if (!NumLanes) {
    NumLanes = 1;
    InsRegs.push_back(MI.getOperand(2).getReg());
  } else {
    EltTy = MRI.getType(InsRegs[0]);
  }

  auto UnmergeToEltTy = B.buildUnmerge(EltTy, VecReg);
  SmallVector<Register, 16> Ops(NumElem * NumLanes);

  for (unsigned I = 0; I < NumElem; ++I) {
    auto IC = B.buildConstant(S32, I);
    MRI.setRegBank(IC->getOperand(0).getReg(), AMDGPU::SGPRRegBank);
    auto Cmp = B.buildICmp(CmpInst::ICMP_EQ, CCTy, Idx, IC);
    MRI.setRegBank(Cmp->getOperand(0).getReg(), CCBank);

    for (unsigned L = 0; L < NumLanes; ++L) {
      auto S = B.buildSelect(EltTy, Cmp, InsRegs[L],
                             UnmergeToEltTy.getReg(I * NumLanes + L));

      for (unsigned N : { 0, 2, 3 })
        MRI.setRegBank(S->getOperand(N).getReg(), DstBank);

      Ops[I * NumLanes + L] = S->getOperand(0).getReg();
    }
  }

  LLT MergeTy = LLT::vector(Ops.size(), EltTy);
  if (MergeTy == MRI.getType(MI.getOperand(0).getReg())) {
    B.buildBuildVector(MI.getOperand(0), Ops);
  } else {
    auto Vec = B.buildBuildVector(MergeTy, Ops);
    MRI.setRegBank(Vec->getOperand(0).getReg(), DstBank);
    B.buildBitcast(MI.getOperand(0).getReg(), Vec);
  }

  MRI.setRegBank(MI.getOperand(0).getReg(), DstBank);
  MI.eraseFromParent();

  return true;
}

void AMDGPURegisterBankInfo::applyMappingImpl(
    const OperandsMapper &OpdMapper) const {
  MachineInstr &MI = OpdMapper.getMI();
  unsigned Opc = MI.getOpcode();
  MachineRegisterInfo &MRI = OpdMapper.getMRI();
  switch (Opc) {
  case AMDGPU::G_PHI: {
    Register DstReg = MI.getOperand(0).getReg();
    LLT DstTy = MRI.getType(DstReg);
    if (DstTy != LLT::scalar(1))
      break;

    const LLT S32 = LLT::scalar(32);
    const RegisterBank *DstBank =
        OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
    if (DstBank == &AMDGPU::VCCRegBank) {
      applyDefaultMapping(OpdMapper);
      // The standard handling only considers the result register bank for
      // phis. For VCC, blindly inserting a copy when the phi is lowered will
      // produce an invalid copy. We can only copy with some kind of compare to
      // get a vector boolean result. Insert a register bank copy that will be
      // correctly lowered to a compare.
      MachineIRBuilder B(*MI.getParent()->getParent());

      for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
        Register SrcReg = MI.getOperand(I).getReg();
        const RegisterBank *SrcBank = getRegBank(SrcReg, MRI, *TRI);

        if (SrcBank != &AMDGPU::VCCRegBank) {
          MachineBasicBlock *SrcMBB = MI.getOperand(I + 1).getMBB();
          B.setInsertPt(*SrcMBB, SrcMBB->getFirstTerminator());

          auto Copy = B.buildCopy(LLT::scalar(1), SrcReg);
          MRI.setRegBank(Copy.getReg(0), AMDGPU::VCCRegBank);
          MI.getOperand(I).setReg(Copy.getReg(0));
        }
      }

      return;
    }

    // Phi handling is strange and only considers the bank of the destination.
    substituteSimpleCopyRegs(OpdMapper, 0);

    // Promote SGPR/VGPR booleans to s32
    MachineFunction *MF = MI.getParent()->getParent();
    ApplyRegBankMapping ApplyBank(*this, MRI, DstBank);
    GISelObserverWrapper Observer(&ApplyBank);
    MachineIRBuilder B(MI);
    LegalizerHelper Helper(*MF, Observer, B);

    if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized)
      llvm_unreachable("widen scalar should have succeeded");

    return;
  }
  case AMDGPU::G_ICMP:
  case AMDGPU::G_UADDO:
  case AMDGPU::G_USUBO:
  case AMDGPU::G_UADDE:
  case AMDGPU::G_SADDE:
  case AMDGPU::G_USUBE:
  case AMDGPU::G_SSUBE: {
    unsigned BoolDstOp = Opc == AMDGPU::G_ICMP ? 0 : 1;
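    // G_ICMP defines its boolean result in operand 0; the add/sub with
    // carry-out opcodes define the carry-out boolean in operand 1.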
    Register DstReg = MI.getOperand(BoolDstOp).getReg();

    const RegisterBank *DstBank =
        OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
    if (DstBank != &AMDGPU::SGPRRegBank)
      break;

    const bool HasCarryIn = MI.getNumOperands() == 5;

    // If this is a scalar compare, promote the result to s32, as the selection
    // will end up using a copy to a 32-bit vreg.
    const LLT S32 = LLT::scalar(32);
    Register NewDstReg = MRI.createGenericVirtualRegister(S32);
    MRI.setRegBank(NewDstReg, AMDGPU::SGPRRegBank);
    MI.getOperand(BoolDstOp).setReg(NewDstReg);
    MachineIRBuilder B(MI);

    if (HasCarryIn) {
      Register NewSrcReg = MRI.createGenericVirtualRegister(S32);
      MRI.setRegBank(NewSrcReg, AMDGPU::SGPRRegBank);
      B.buildZExt(NewSrcReg, MI.getOperand(4).getReg());
      MI.getOperand(4).setReg(NewSrcReg);
    }

    MachineBasicBlock *MBB = MI.getParent();
    B.setInsertPt(*MBB, std::next(MI.getIterator()));

    // If we had a constrained VCC result register, a copy was inserted to VCC
    // from SGPR.
    SmallVector<Register, 1> DefRegs(OpdMapper.getVRegs(0));
    if (DefRegs.empty())
      DefRegs.push_back(DstReg);
    B.buildTrunc(DefRegs[0], NewDstReg);
    return;
  }
  case AMDGPU::G_SELECT: {
    Register DstReg = MI.getOperand(0).getReg();
    LLT DstTy = MRI.getType(DstReg);

    SmallVector<Register, 1> CondRegs(OpdMapper.getVRegs(1));
    if (CondRegs.empty())
      CondRegs.push_back(MI.getOperand(1).getReg());
    else {
      assert(CondRegs.size() == 1);
    }

    const RegisterBank *CondBank = getRegBank(CondRegs[0], MRI, *TRI);
    if (CondBank == &AMDGPU::SGPRRegBank) {
      MachineIRBuilder B(MI);
      const LLT S32 = LLT::scalar(32);
      Register NewCondReg = MRI.createGenericVirtualRegister(S32);
      MRI.setRegBank(NewCondReg, AMDGPU::SGPRRegBank);

      MI.getOperand(1).setReg(NewCondReg);
      B.buildZExt(NewCondReg, CondRegs[0]);
    }

    if (DstTy.getSizeInBits() != 64)
      break;

    MachineIRBuilder B(MI);
    LLT HalfTy = getHalfSizedType(DstTy);

    SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
    SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2));
    SmallVector<Register, 2> Src2Regs(OpdMapper.getVRegs(3));

    // All inputs are SGPRs, nothing special to do.
    if (DefRegs.empty()) {
      assert(Src1Regs.empty() && Src2Regs.empty());
      break;
    }

    if (Src1Regs.empty())
      split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg());
    else {
      setRegsToType(MRI, Src1Regs, HalfTy);
    }

    if (Src2Regs.empty())
      split64BitValueForMapping(B, Src2Regs, HalfTy, MI.getOperand(3).getReg());
    else
      setRegsToType(MRI, Src2Regs, HalfTy);

    setRegsToType(MRI, DefRegs, HalfTy);

    B.buildSelect(DefRegs[0], CondRegs[0], Src1Regs[0], Src2Regs[0]);
    B.buildSelect(DefRegs[1], CondRegs[0], Src1Regs[1], Src2Regs[1]);

    MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
    MI.eraseFromParent();
    return;
  }
  case AMDGPU::G_BRCOND: {
    Register CondReg = MI.getOperand(0).getReg();
    // FIXME: Should use legalizer helper, but should change bool ext type.
    const RegisterBank *CondBank =
        OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;

    if (CondBank == &AMDGPU::SGPRRegBank) {
      MachineIRBuilder B(MI);
      const LLT S32 = LLT::scalar(32);
      Register NewCondReg = MRI.createGenericVirtualRegister(S32);
      MRI.setRegBank(NewCondReg, AMDGPU::SGPRRegBank);

      MI.getOperand(0).setReg(NewCondReg);
      B.buildZExt(NewCondReg, CondReg);
      return;
    }

    break;
  }
  case AMDGPU::G_AND:
  case AMDGPU::G_OR:
  case AMDGPU::G_XOR: {
    // 64-bit and is only available on the SALU, so split into 2 32-bit ops if
    // there is a VGPR input.
    Register DstReg = MI.getOperand(0).getReg();
    LLT DstTy = MRI.getType(DstReg);

    if (DstTy.getSizeInBits() == 1) {
      const RegisterBank *DstBank =
          OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
      if (DstBank == &AMDGPU::VCCRegBank)
        break;

      MachineFunction *MF = MI.getParent()->getParent();
      ApplyRegBankMapping ApplyBank(*this, MRI, DstBank);
      GISelObserverWrapper Observer(&ApplyBank);
      MachineIRBuilder B(MI);
      LegalizerHelper Helper(*MF, Observer, B);

      if (Helper.widenScalar(MI, 0, LLT::scalar(32)) !=
          LegalizerHelper::Legalized)
        llvm_unreachable("widen scalar should have succeeded");
      return;
    }

    if (DstTy.getSizeInBits() != 64)
      break;

    LLT HalfTy = getHalfSizedType(DstTy);
    SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
    SmallVector<Register, 2> Src0Regs(OpdMapper.getVRegs(1));
    SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2));

    // All inputs are SGPRs, nothing special to do.
    if (DefRegs.empty()) {
      assert(Src0Regs.empty() && Src1Regs.empty());
      break;
    }

    assert(DefRegs.size() == 2);
    assert(Src0Regs.size() == Src1Regs.size() &&
           (Src0Regs.empty() || Src0Regs.size() == 2));

    // Depending on where the source registers came from, the generic code may
    // have decided to split the inputs already or not. If not, we still need
    // to extract the values.
    MachineIRBuilder B(MI);

    if (Src0Regs.empty())
      split64BitValueForMapping(B, Src0Regs, HalfTy, MI.getOperand(1).getReg());
    else
      setRegsToType(MRI, Src0Regs, HalfTy);

    if (Src1Regs.empty())
      split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg());
    else
      setRegsToType(MRI, Src1Regs, HalfTy);

    setRegsToType(MRI, DefRegs, HalfTy);

    B.buildInstr(Opc)
      .addDef(DefRegs[0])
      .addUse(Src0Regs[0])
      .addUse(Src1Regs[0]);

    B.buildInstr(Opc)
      .addDef(DefRegs[1])
      .addUse(Src0Regs[1])
      .addUse(Src1Regs[1]);

    MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
    MI.eraseFromParent();
    return;
  }
  case AMDGPU::G_ADD:
  case AMDGPU::G_SUB:
  case AMDGPU::G_MUL:
  case AMDGPU::G_SHL:
  case AMDGPU::G_LSHR:
  case AMDGPU::G_ASHR: {
    Register DstReg = MI.getOperand(0).getReg();
    LLT DstTy = MRI.getType(DstReg);

    // 16-bit operations are VALU only, but can be promoted to 32-bit SALU.
    // Packed 16-bit operations need to be scalarized and promoted.
    if (DstTy != LLT::scalar(16) && DstTy != LLT::vector(2, 16))
      break;

    const RegisterBank *DstBank =
        OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
    if (DstBank == &AMDGPU::VGPRRegBank)
      break;

    const LLT S32 = LLT::scalar(32);
    MachineBasicBlock *MBB = MI.getParent();
    MachineFunction *MF = MBB->getParent();
    MachineIRBuilder B(MI);
    ApplyRegBankMapping ApplySALU(*this, MRI, &AMDGPU::SGPRRegBank);
    GISelObserverWrapper Observer(&ApplySALU);

    if (DstTy.isVector()) {
      B.setChangeObserver(Observer);

      Register WideSrc0Lo, WideSrc0Hi;
      Register WideSrc1Lo, WideSrc1Hi;

      std::tie(WideSrc0Lo, WideSrc0Hi)
        = unpackV2S16ToS32(B, MI.getOperand(1).getReg(), AMDGPU::G_ANYEXT);
      std::tie(WideSrc1Lo, WideSrc1Hi)
        = unpackV2S16ToS32(B, MI.getOperand(2).getReg(), AMDGPU::G_ANYEXT);
      auto Lo = B.buildInstr(MI.getOpcode(), {S32}, {WideSrc0Lo, WideSrc1Lo});
      auto Hi = B.buildInstr(MI.getOpcode(), {S32}, {WideSrc0Hi, WideSrc1Hi});
      B.buildBuildVectorTrunc(DstReg, {Lo.getReg(0), Hi.getReg(0)});
      MI.eraseFromParent();
    } else {
      LegalizerHelper Helper(*MF, Observer, B);

      if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized)
        llvm_unreachable("widen scalar should have succeeded");

      // FIXME: s16 shift amounts should be legal.
      if (Opc == AMDGPU::G_SHL || Opc == AMDGPU::G_LSHR ||
          Opc == AMDGPU::G_ASHR) {
        B.setInsertPt(*MBB, MI.getIterator());
        if (Helper.widenScalar(MI, 1, S32) != LegalizerHelper::Legalized)
          llvm_unreachable("widen scalar should have succeeded");
      }
    }

    return;
  }
  case AMDGPU::G_SMIN:
  case AMDGPU::G_SMAX:
  case AMDGPU::G_UMIN:
  case AMDGPU::G_UMAX: {
    Register DstReg = MI.getOperand(0).getReg();
    const RegisterBank *DstBank =
        OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
    if (DstBank == &AMDGPU::VGPRRegBank)
      break;

    MachineFunction *MF = MI.getParent()->getParent();
    MachineIRBuilder B(MI);

    // Turn scalar min/max into a compare and select.
    LLT Ty = MRI.getType(DstReg);
    const LLT S32 = LLT::scalar(32);
    const LLT S16 = LLT::scalar(16);
    const LLT V2S16 = LLT::vector(2, 16);

    if (Ty == V2S16) {
      ApplyRegBankMapping ApplySALU(*this, MRI, &AMDGPU::SGPRRegBank);
      GISelObserverWrapper Observer(&ApplySALU);
      B.setChangeObserver(Observer);

      // Need to widen to s32, and expand as cmp + select, and avoid producing
      // illegal vector extends or unmerges that would need further
      // legalization.
      //
      // TODO: Should we just readfirstlane? That should probably be handled
      // with a UniformVGPR register bank that wouldn't need special
      // consideration here.

      Register Dst = MI.getOperand(0).getReg();
      Register Src0 = MI.getOperand(1).getReg();
      Register Src1 = MI.getOperand(2).getReg();

      Register WideSrc0Lo, WideSrc0Hi;
      Register WideSrc1Lo, WideSrc1Hi;

      unsigned ExtendOp = minMaxToExtend(MI.getOpcode());

      std::tie(WideSrc0Lo, WideSrc0Hi) = unpackV2S16ToS32(B, Src0, ExtendOp);
      std::tie(WideSrc1Lo, WideSrc1Hi) = unpackV2S16ToS32(B, Src1, ExtendOp);

      Register Lo = MRI.createGenericVirtualRegister(S32);
      Register Hi = MRI.createGenericVirtualRegister(S32);
      const CmpInst::Predicate Pred = minMaxToCompare(MI.getOpcode());
      buildExpandedScalarMinMax(B, Pred, Lo, WideSrc0Lo, WideSrc1Lo);
      buildExpandedScalarMinMax(B, Pred, Hi, WideSrc0Hi, WideSrc1Hi);

      B.buildBuildVectorTrunc(Dst, {Lo, Hi});
      MI.eraseFromParent();
    } else if (Ty == S16) {
      ApplyRegBankMapping ApplySALU(*this, MRI, &AMDGPU::SGPRRegBank);
      GISelObserverWrapper Observer(&ApplySALU);
      LegalizerHelper Helper(*MF, Observer, B);

      // Need to widen to s32, and expand as cmp + select.
      if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized)
        llvm_unreachable("widenScalar should have succeeded");

      // FIXME: This is relying on widenScalar leaving MI in place.
      lowerScalarMinMax(B, MI);
    } else
      lowerScalarMinMax(B, MI);

    return;
  }
  case AMDGPU::G_SEXT_INREG: {
    SmallVector<Register, 2> SrcRegs(OpdMapper.getVRegs(1));
    if (SrcRegs.empty())
      break; // Nothing to repair

    const LLT S32 = LLT::scalar(32);
    MachineIRBuilder B(MI);
    ApplyRegBankMapping O(*this, MRI, &AMDGPU::VGPRRegBank);
    GISelObserverWrapper Observer(&O);
    B.setChangeObserver(Observer);

    // Don't use LegalizerHelper's narrowScalar. It produces unwanted G_SEXTs
    // we would need to further expand, and doesn't let us directly set the
    // result registers.
    SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0));

    int Amt = MI.getOperand(2).getImm();
    if (Amt <= 32) {
      if (Amt == 32) {
        // The low bits are unchanged.
        B.buildCopy(DstRegs[0], SrcRegs[0]);
      } else {
        // Extend in the low bits and propagate the sign bit to the high half.
        B.buildSExtInReg(DstRegs[0], SrcRegs[0], Amt);
      }

      B.buildAShr(DstRegs[1], DstRegs[0], B.buildConstant(S32, 31));
    } else {
      // The low bits are unchanged, and extend into the high bits.
      B.buildCopy(DstRegs[0], SrcRegs[0]);
      B.buildSExtInReg(DstRegs[1], SrcRegs[1], Amt - 32);
    }

    Register DstReg = MI.getOperand(0).getReg();
    MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
    MI.eraseFromParent();
    return;
  }
  case AMDGPU::G_CTPOP:
  case AMDGPU::G_CTLZ_ZERO_UNDEF:
  case AMDGPU::G_CTTZ_ZERO_UNDEF: {
    MachineIRBuilder B(MI);
    MachineFunction &MF = B.getMF();

    const RegisterBank *DstBank =
        OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
    if (DstBank == &AMDGPU::SGPRRegBank)
      break;

    Register SrcReg = MI.getOperand(1).getReg();
    const LLT S32 = LLT::scalar(32);
    LLT Ty = MRI.getType(SrcReg);
    if (Ty == S32)
      break;

    ApplyRegBankMapping ApplyVALU(*this, MRI, &AMDGPU::VGPRRegBank);
    GISelObserverWrapper Observer(&ApplyVALU);
    LegalizerHelper Helper(MF, Observer, B);

    if (Helper.narrowScalar(MI, 1, S32) != LegalizerHelper::Legalized)
      llvm_unreachable("narrowScalar should have succeeded");
    return;
  }
  case AMDGPU::G_SEXT:
  case AMDGPU::G_ZEXT:
  case AMDGPU::G_ANYEXT: {
    Register SrcReg = MI.getOperand(1).getReg();
    LLT SrcTy = MRI.getType(SrcReg);
    const bool Signed = Opc == AMDGPU::G_SEXT;

    assert(empty(OpdMapper.getVRegs(1)));

    MachineIRBuilder B(MI);
    const RegisterBank *SrcBank =
        OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;

    Register DstReg = MI.getOperand(0).getReg();
    LLT DstTy = MRI.getType(DstReg);
    if (DstTy.isScalar() &&
        SrcBank != &AMDGPU::SGPRRegBank &&
        SrcBank != &AMDGPU::VCCRegBank &&
        // FIXME: Should handle any type that rounds to s64 when irregular
        // breakdowns are supported.
        DstTy.getSizeInBits() == 64 &&
        SrcTy.getSizeInBits() <= 32) {
      SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));

      // Extend to 32 bits for the low half, then derive the high half from it.
      if (Signed) {
        // TODO: Should really be buildSExtOrCopy
        B.buildSExtOrTrunc(DefRegs[0], SrcReg);
      } else if (Opc == AMDGPU::G_ZEXT) {
        B.buildZExtOrTrunc(DefRegs[0], SrcReg);
      } else {
        B.buildAnyExtOrTrunc(DefRegs[0], SrcReg);
      }

      extendLow32IntoHigh32(B, DefRegs[1], DefRegs[0], Opc, *SrcBank);
      MRI.setRegBank(DstReg, *SrcBank);
      MI.eraseFromParent();
      return;
    }

    if (SrcTy != LLT::scalar(1))
      return;

    // It is not legal to have a legalization artifact with a VCC source.
    // Rather than introducing a copy, insert the select that the copy would
    // have been selected to.
    if (SrcBank == &AMDGPU::VCCRegBank) {
      SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));

      const RegisterBank *DstBank = &AMDGPU::VGPRRegBank;

      unsigned DstSize = DstTy.getSizeInBits();
      // 64-bit select is SGPR only
      const bool UseSel64 = DstSize > 32 &&
          SrcBank->getID() == AMDGPU::SGPRRegBankID;

      // TODO: Should s16 select be legal?
      LLT SelType = UseSel64 ? LLT::scalar(64) : LLT::scalar(32);
      auto True = B.buildConstant(SelType, Signed ? -1 : 1);
      auto False = B.buildConstant(SelType, 0);

      MRI.setRegBank(True.getReg(0), *DstBank);
      MRI.setRegBank(False.getReg(0), *DstBank);
      MRI.setRegBank(DstReg, *DstBank);

      if (DstSize > 32) {
        B.buildSelect(DefRegs[0], SrcReg, True, False);
        extendLow32IntoHigh32(B, DefRegs[1], DefRegs[0], Opc, *SrcBank, true);
      } else if (DstSize < 32) {
        auto Sel = B.buildSelect(SelType, SrcReg, True, False);
        MRI.setRegBank(Sel.getReg(0), *DstBank);
        B.buildTrunc(DstReg, Sel);
      } else {
        B.buildSelect(DstReg, SrcReg, True, False);
      }

      MI.eraseFromParent();
      return;
    }

    break;
  }
  case AMDGPU::G_BUILD_VECTOR:
  case AMDGPU::G_BUILD_VECTOR_TRUNC: {
    Register DstReg = MI.getOperand(0).getReg();
    LLT DstTy = MRI.getType(DstReg);
    if (DstTy != LLT::vector(2, 16))
      break;

    assert(MI.getNumOperands() == 3 && OpdMapper.getVRegs(0).empty());
    substituteSimpleCopyRegs(OpdMapper, 1);
    substituteSimpleCopyRegs(OpdMapper, 2);

    const RegisterBank *DstBank =
        OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
    if (DstBank == &AMDGPU::SGPRRegBank)
      break; // Can use S_PACK_* instructions.

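    // For VGPRs, pack the two halves manually: the result bit pattern is
    // (Hi << 16) | (Lo & 0xffff), built with 32-bit ops and bitcast back.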
    MachineIRBuilder B(MI);

    Register Lo = MI.getOperand(1).getReg();
    Register Hi = MI.getOperand(2).getReg();
    const LLT S32 = LLT::scalar(32);

    const RegisterBank *BankLo =
        OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
    const RegisterBank *BankHi =
        OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;

    Register ZextLo;
    Register ShiftHi;

    if (Opc == AMDGPU::G_BUILD_VECTOR) {
      ZextLo = B.buildZExt(S32, Lo).getReg(0);
      MRI.setRegBank(ZextLo, *BankLo);

      Register ZextHi = B.buildZExt(S32, Hi).getReg(0);
      MRI.setRegBank(ZextHi, *BankHi);

      auto ShiftAmt = B.buildConstant(S32, 16);
      MRI.setRegBank(ShiftAmt.getReg(0), *BankHi);

      ShiftHi = B.buildShl(S32, ZextHi, ShiftAmt).getReg(0);
      MRI.setRegBank(ShiftHi, *BankHi);
    } else {
      Register MaskLo = B.buildConstant(S32, 0xffff).getReg(0);
      MRI.setRegBank(MaskLo, *BankLo);

      auto ShiftAmt = B.buildConstant(S32, 16);
      MRI.setRegBank(ShiftAmt.getReg(0), *BankHi);

      ShiftHi = B.buildShl(S32, Hi, ShiftAmt).getReg(0);
      MRI.setRegBank(ShiftHi, *BankHi);

      ZextLo = B.buildAnd(S32, Lo, MaskLo).getReg(0);
      MRI.setRegBank(ZextLo, *BankLo);
    }

    auto Or = B.buildOr(S32, ZextLo, ShiftHi);
    MRI.setRegBank(Or.getReg(0), *DstBank);

    B.buildBitcast(DstReg, Or);
    MI.eraseFromParent();
    return;
  }
  case AMDGPU::G_EXTRACT_VECTOR_ELT: {
    SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0));

    assert(OpdMapper.getVRegs(1).empty() && OpdMapper.getVRegs(2).empty());

    Register DstReg = MI.getOperand(0).getReg();
    Register SrcReg = MI.getOperand(1).getReg();

    const LLT S32 = LLT::scalar(32);
    LLT DstTy = MRI.getType(DstReg);
    LLT SrcTy = MRI.getType(SrcReg);

    if (foldExtractEltToCmpSelect(MI, MRI, OpdMapper))
      return;

    MachineIRBuilder B(MI);

    const ValueMapping &DstMapping
      = OpdMapper.getInstrMapping().getOperandMapping(0);
    const RegisterBank *DstBank = DstMapping.BreakDown[0].RegBank;
    const RegisterBank *SrcBank =
        OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
    const RegisterBank *IdxBank =
        OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;

    Register BaseIdxReg;
    unsigned ConstOffset;
    MachineInstr *OffsetDef;
    std::tie(BaseIdxReg, ConstOffset, OffsetDef) =
        AMDGPU::getBaseWithConstantOffset(MRI, MI.getOperand(2).getReg());

    // See if the index is an add of a constant which will be foldable by
    // moving the base register of the index later if this is going to be
    // executed in a waterfall loop. This is essentially to reassociate the
    // add of a constant with the readfirstlane.
    bool ShouldMoveIndexIntoLoop = IdxBank != &AMDGPU::SGPRRegBank &&
                                   ConstOffset > 0 &&
                                   ConstOffset < SrcTy.getNumElements();

    // Move the base register. We'll re-insert the add later.
    if (ShouldMoveIndexIntoLoop)
      MI.getOperand(2).setReg(BaseIdxReg);

    // If this is a VGPR result only because the index was a VGPR result, the
    // actual indexing will be done on the SGPR source vector, which will
    // produce a scalar result. We need to copy to the VGPR result inside the
    // waterfall loop.
    const bool NeedCopyToVGPR = DstBank == &AMDGPU::VGPRRegBank &&
                                SrcBank == &AMDGPU::SGPRRegBank;
    if (DstRegs.empty()) {
      applyDefaultMapping(OpdMapper);

      executeInWaterfallLoop(MI, MRI, { 2 });

      if (NeedCopyToVGPR) {
        // We don't want a phi for this temporary reg.
        Register TmpReg = MRI.createGenericVirtualRegister(DstTy);
        MRI.setRegBank(TmpReg, AMDGPU::SGPRRegBank);
        MI.getOperand(0).setReg(TmpReg);
        B.setInsertPt(*MI.getParent(), ++MI.getIterator());

        // Use a v_mov_b32 here to make the exec dependency explicit.
        buildVCopy(B, DstReg, TmpReg);
      }

      // Re-insert the constant offset add inside the waterfall loop.
      if (ShouldMoveIndexIntoLoop)
        reinsertVectorIndexAdd(B, MI, 2, ConstOffset);

      return;
    }

    assert(DstTy.getSizeInBits() == 64);

    LLT Vec32 = LLT::vector(2 * SrcTy.getNumElements(), 32);

    auto CastSrc = B.buildBitcast(Vec32, SrcReg);
    auto One = B.buildConstant(S32, 1);

    MachineBasicBlock::iterator MII = MI.getIterator();

    // Split the vector index into 32-bit pieces. Prepare to move all of the
    // new instructions into a waterfall loop if necessary.
    //
    // Don't put the bitcast or constant in the loop.
    MachineInstrSpan Span(MII, &B.getMBB());

    // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1).
    auto IdxLo = B.buildShl(S32, BaseIdxReg, One);
    auto IdxHi = B.buildAdd(S32, IdxLo, One);

    auto Extract0 = B.buildExtractVectorElement(DstRegs[0], CastSrc, IdxLo);
    auto Extract1 = B.buildExtractVectorElement(DstRegs[1], CastSrc, IdxHi);

    MRI.setRegBank(DstReg, *DstBank);
    MRI.setRegBank(CastSrc.getReg(0), *SrcBank);
    MRI.setRegBank(One.getReg(0), AMDGPU::SGPRRegBank);
    MRI.setRegBank(IdxLo.getReg(0), AMDGPU::SGPRRegBank);
    MRI.setRegBank(IdxHi.getReg(0), AMDGPU::SGPRRegBank);

    SmallSet<Register, 4> OpsToWaterfall;
    if (!collectWaterfallOperands(OpsToWaterfall, MI, MRI, { 2 })) {
      MI.eraseFromParent();
      return;
    }

    // Remove the original instruction to avoid potentially confusing the
    // waterfall loop logic.
    B.setInstr(*Span.begin());
    MI.eraseFromParent();
    executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()),
                           OpsToWaterfall, MRI);

    if (NeedCopyToVGPR) {
      MachineBasicBlock *LoopBB = Extract1->getParent();
      Register TmpReg0 = MRI.createGenericVirtualRegister(S32);
      Register TmpReg1 = MRI.createGenericVirtualRegister(S32);
      MRI.setRegBank(TmpReg0, AMDGPU::SGPRRegBank);
      MRI.setRegBank(TmpReg1, AMDGPU::SGPRRegBank);

      Extract0->getOperand(0).setReg(TmpReg0);
      Extract1->getOperand(0).setReg(TmpReg1);

      B.setInsertPt(*LoopBB, ++Extract1->getIterator());

      buildVCopy(B, DstRegs[0], TmpReg0);
      buildVCopy(B, DstRegs[1], TmpReg1);
    }

    if (ShouldMoveIndexIntoLoop)
      reinsertVectorIndexAdd(B, *IdxLo, 1, ConstOffset);

    return;
  }
  case AMDGPU::G_INSERT_VECTOR_ELT: {
    SmallVector<Register, 2> InsRegs(OpdMapper.getVRegs(2));

    Register DstReg = MI.getOperand(0).getReg();
    LLT VecTy = MRI.getType(DstReg);

    assert(OpdMapper.getVRegs(0).empty());
    assert(OpdMapper.getVRegs(3).empty());

    if (substituteSimpleCopyRegs(OpdMapper, 1))
      MRI.setType(MI.getOperand(1).getReg(), VecTy);

    if (foldInsertEltToCmpSelect(MI, MRI, OpdMapper))
      return;

    const RegisterBank *IdxBank =
        OpdMapper.getInstrMapping().getOperandMapping(3).BreakDown[0].RegBank;

    Register SrcReg = MI.getOperand(1).getReg();
    Register InsReg = MI.getOperand(2).getReg();
    LLT InsTy = MRI.getType(InsReg);
    (void)InsTy;

    Register BaseIdxReg;
    unsigned ConstOffset;
    MachineInstr *OffsetDef;
    std::tie(BaseIdxReg, ConstOffset, OffsetDef) =
        AMDGPU::getBaseWithConstantOffset(MRI, MI.getOperand(3).getReg());

    // See if the index is an add of a constant which will be foldable by
    // moving the base register of the index later if this is going to be
    // executed in a waterfall loop. This is essentially to reassociate the
    // add of a constant with the readfirstlane.
    bool ShouldMoveIndexIntoLoop = IdxBank != &AMDGPU::SGPRRegBank &&
                                   ConstOffset > 0 &&
                                   ConstOffset < VecTy.getNumElements();

    // Move the base register. We'll re-insert the add later.
    if (ShouldMoveIndexIntoLoop)
      MI.getOperand(3).setReg(BaseIdxReg);

    if (InsRegs.empty()) {
      executeInWaterfallLoop(MI, MRI, { 3 });

      // Re-insert the constant offset add inside the waterfall loop.
      if (ShouldMoveIndexIntoLoop) {
        MachineIRBuilder B(MI);
        reinsertVectorIndexAdd(B, MI, 3, ConstOffset);
      }

      return;
    }

    assert(InsTy.getSizeInBits() == 64);

    const LLT S32 = LLT::scalar(32);
    LLT Vec32 = LLT::vector(2 * VecTy.getNumElements(), 32);

    MachineIRBuilder B(MI);
    auto CastSrc = B.buildBitcast(Vec32, SrcReg);
    auto One = B.buildConstant(S32, 1);

    // Split the vector index into 32-bit pieces. Prepare to move all of the
    // new instructions into a waterfall loop if necessary.
    //
    // Don't put the bitcast or constant in the loop.
    MachineInstrSpan Span(MachineBasicBlock::iterator(&MI), &B.getMBB());

    // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1).
    auto IdxLo = B.buildShl(S32, BaseIdxReg, One);
    auto IdxHi = B.buildAdd(S32, IdxLo, One);

    auto InsLo = B.buildInsertVectorElement(Vec32, CastSrc, InsRegs[0], IdxLo);
    auto InsHi = B.buildInsertVectorElement(Vec32, InsLo, InsRegs[1], IdxHi);

    const RegisterBank *DstBank =
        OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
    const RegisterBank *SrcBank =
        OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
    const RegisterBank *InsSrcBank =
        OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;

    MRI.setRegBank(InsReg, *InsSrcBank);
    MRI.setRegBank(CastSrc.getReg(0), *SrcBank);
    MRI.setRegBank(InsLo.getReg(0), *DstBank);
    MRI.setRegBank(InsHi.getReg(0), *DstBank);
    MRI.setRegBank(One.getReg(0), AMDGPU::SGPRRegBank);
    MRI.setRegBank(IdxLo.getReg(0), AMDGPU::SGPRRegBank);
    MRI.setRegBank(IdxHi.getReg(0), AMDGPU::SGPRRegBank);

    SmallSet<Register, 4> OpsToWaterfall;
    if (!collectWaterfallOperands(OpsToWaterfall, MI, MRI, { 3 })) {
      B.setInsertPt(B.getMBB(), MI);
      B.buildBitcast(DstReg, InsHi);
      MI.eraseFromParent();
      return;
    }

    B.setInstr(*Span.begin());
    MI.eraseFromParent();

    // Figure out the point after the waterfall loop before mangling the
    // control flow.
    executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()),
                           OpsToWaterfall, MRI);

    // The insertion point is now right after the original instruction.
    //
    // Keep the bitcast to the original vector type out of the loop. Doing
    // this saves an extra phi we don't need inside the loop.
    B.buildBitcast(DstReg, InsHi);

    // Re-insert the constant offset add inside the waterfall loop.
    if (ShouldMoveIndexIntoLoop)
      reinsertVectorIndexAdd(B, *IdxLo, 1, ConstOffset);

    return;
  }
  case AMDGPU::G_AMDGPU_BUFFER_LOAD:
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT:
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE:
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT:
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16:
  case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT:
  case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16:
  case AMDGPU::G_AMDGPU_BUFFER_STORE:
  case AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE:
  case AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT:
  case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT:
  case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16:
  case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT:
  case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16: {
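    // The rsrc (operand 1) and soffset (operand 4) operands must be uniform;
    // waterfall them if they were assigned VGPRs.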
2940 applyDefaultMapping(OpdMapper);
2941 executeInWaterfallLoop(MI, MRI, {1, 4});
2942 return;
2943 }
2944 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP:
2945 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD:
2946 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB:
2947 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN:
2948 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN:
2949 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX:
2950 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX:
2951 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND:
2952 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR:
2953 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR:
2954 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC:
2955 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC: {
2956 applyDefaultMapping(OpdMapper);
2957 executeInWaterfallLoop(MI, MRI, {2, 5});
2958 return;
2959 }
2960 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP: {
2961 applyDefaultMapping(OpdMapper);
2962 executeInWaterfallLoop(MI, MRI, {3, 6});
2963 return;
2964 }
2965 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD: {
2966 applyMappingSBufferLoad(OpdMapper);
2967 return;
2968 }
2969 case AMDGPU::G_INTRINSIC: {
2970 switch (MI.getIntrinsicID()) {
2971 case Intrinsic::amdgcn_readlane: {
2972 substituteSimpleCopyRegs(OpdMapper, 2);
2973
2974 assert(OpdMapper.getVRegs(0).empty());
2975 assert(OpdMapper.getVRegs(3).empty());
2976
2977 // Make sure the index is an SGPR. It doesn't make sense to run this in a
2978 // waterfall loop, so assume it's a uniform value.
2979 constrainOpWithReadfirstlane(MI, MRI, 3); // Index
2980 return;
2981 }
2982 case Intrinsic::amdgcn_writelane: {
2983 assert(OpdMapper.getVRegs(0).empty());
2984 assert(OpdMapper.getVRegs(2).empty());
2985 assert(OpdMapper.getVRegs(3).empty());
2986
2987 substituteSimpleCopyRegs(OpdMapper, 4); // VGPR input val
2988 constrainOpWithReadfirstlane(MI, MRI, 2); // Source value
2989 constrainOpWithReadfirstlane(MI, MRI, 3); // Index
2990 return;
2991 }
2992 case Intrinsic::amdgcn_ballot:
2993 case Intrinsic::amdgcn_interp_p1:
2994 case Intrinsic::amdgcn_interp_p2:
2995 case Intrinsic::amdgcn_interp_mov:
2996 case Intrinsic::amdgcn_interp_p1_f16:
2997 case Intrinsic::amdgcn_interp_p2_f16: {
2998 applyDefaultMapping(OpdMapper);
2999
3000 // Readlane for m0 value, which is always the last operand.
3001 // FIXME: Should this be a waterfall loop instead?
3002 constrainOpWithReadfirstlane(MI, MRI, MI.getNumOperands() - 1); // Index
3003 return;
3004 }
3005 case Intrinsic::amdgcn_permlane16:
3006 case Intrinsic::amdgcn_permlanex16: {
3007 // Doing a waterfall loop over these wouldn't make any sense.
3008 substituteSimpleCopyRegs(OpdMapper, 2);
3009 substituteSimpleCopyRegs(OpdMapper, 3);
3010 constrainOpWithReadfirstlane(MI, MRI, 4);
3011 constrainOpWithReadfirstlane(MI, MRI, 5);
3012 return;
3013 }
3014 case Intrinsic::amdgcn_sbfe:
3015 applyMappingBFEIntrinsic(OpdMapper, true);
3016 return;
3017 case Intrinsic::amdgcn_ubfe:
3018 applyMappingBFEIntrinsic(OpdMapper, false);
3019 return;
3020 }
3021 break;
3022 }
3023 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
3024 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE: {
3025 const AMDGPU::RsrcIntrinsic *RSrcIntrin
3026 = AMDGPU::lookupRsrcIntrinsic(MI.getIntrinsicID());
3027 assert(RSrcIntrin && RSrcIntrin->IsImage);
3028 // Non-images can have complications from operands that allow both SGPR
3029 // and VGPR. For now it's too complicated to figure out the final opcode
3030 // to derive the register bank from the MCInstrDesc.
3031 applyMappingImage(MI, OpdMapper, MRI, RSrcIntrin->RsrcArg);
3032 return;
3033 }
3034 case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: {
3035 auto IntrID = MI.getIntrinsicID();
3036 switch (IntrID) {
3037 case Intrinsic::amdgcn_ds_ordered_add:
3038 case Intrinsic::amdgcn_ds_ordered_swap: {
3039 // This is only allowed to execute with 1 lane, so readfirstlane is safe.
3040 assert(OpdMapper.getVRegs(0).empty());
3041 substituteSimpleCopyRegs(OpdMapper, 3);
3042 constrainOpWithReadfirstlane(MI, MRI, 2); // M0
3043 return;
3044 }
3045 case Intrinsic::amdgcn_ds_gws_init:
3046 case Intrinsic::amdgcn_ds_gws_barrier:
3047 case Intrinsic::amdgcn_ds_gws_sema_br: {
3048 // Only the first lane executes, so readfirstlane is safe.
3049 substituteSimpleCopyRegs(OpdMapper, 1);
3050 constrainOpWithReadfirstlane(MI, MRI, 2); // M0
3051 return;
3052 }
3053 case Intrinsic::amdgcn_ds_gws_sema_v:
3054 case Intrinsic::amdgcn_ds_gws_sema_p:
3055 case Intrinsic::amdgcn_ds_gws_sema_release_all: {
3056 // Only the first lane executes, so readfirstlane is safe.
3057 constrainOpWithReadfirstlane(MI, MRI, 1); // M0
3058 return;
3059 }
3060 case Intrinsic::amdgcn_ds_append:
3061 case Intrinsic::amdgcn_ds_consume: {
3062 constrainOpWithReadfirstlane(MI, MRI, 2); // M0
3063 return;
3064 }
3065 case Intrinsic::amdgcn_s_sendmsg:
3066 case Intrinsic::amdgcn_s_sendmsghalt: {
3067 // FIXME: Should this use a waterfall loop?
3068 constrainOpWithReadfirstlane(MI, MRI, 2); // M0
3069 return;
3070 }
3071 case Intrinsic::amdgcn_s_setreg: {
3072 constrainOpWithReadfirstlane(MI, MRI, 2);
3073 return;
3074 }
3075 default: {
3076 if (const AMDGPU::RsrcIntrinsic *RSrcIntrin =
3077 AMDGPU::lookupRsrcIntrinsic(IntrID)) {
3078 // Non-images can have complications from operands that allow both SGPR
3079 // and VGPR. For now it's too complicated to figure out the final opcode
3080 // to derive the register bank from the MCInstrDesc.
3081 if (RSrcIntrin->IsImage) {
3082 applyMappingImage(MI, OpdMapper, MRI, RSrcIntrin->RsrcArg);
3083 return;
3084 }
3085 }
3086
3087 break;
3088 }
3089 }
3090 break;
3091 }
3092 case AMDGPU::G_LOAD:
3093 case AMDGPU::G_ZEXTLOAD:
3094 case AMDGPU::G_SEXTLOAD: {
3095 if (applyMappingLoad(MI, OpdMapper, MRI))
3096 return;
3097 break;
3098 }
3099 case AMDGPU::G_DYN_STACKALLOC:
3100 applyMappingDynStackAlloc(MI, OpdMapper, MRI);
3101 return;
3102 default:
3103 break;
3104 }
3105
3106 return applyDefaultMapping(OpdMapper);
3107 }
3108
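/// Return true if every register operand of \p MI is currently mapped to the
/// SGPR bank; operands with no bank assigned yet are treated as SGPR.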
3109 bool AMDGPURegisterBankInfo::isSALUMapping(const MachineInstr &MI) const {
3110 const MachineFunction &MF = *MI.getParent()->getParent();
3111 const MachineRegisterInfo &MRI = MF.getRegInfo();
3112 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
3113 if (!MI.getOperand(i).isReg())
3114 continue;
3115 Register Reg = MI.getOperand(i).getReg();
3116 if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) {
3117 if (Bank->getID() != AMDGPU::SGPRRegBankID)
3118 return false;
3119 }
3120 }
3121 return true;
3122 }
3123
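/// Produce a mapping that places every register operand of \p MI in the SGPR
/// bank at its natural size, for instructions that can be selected as pure
/// SALU operations.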
3124 const RegisterBankInfo::InstructionMapping &
3125 AMDGPURegisterBankInfo::getDefaultMappingSOP(const MachineInstr &MI) const {
3126 const MachineFunction &MF = *MI.getParent()->getParent();
3127 const MachineRegisterInfo &MRI = MF.getRegInfo();
3128 SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
3129
3130 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
3131 const MachineOperand &SrcOp = MI.getOperand(i);
3132 if (!SrcOp.isReg())
3133 continue;
3134
3135 unsigned Size = getSizeInBits(SrcOp.getReg(), MRI, *TRI);
3136 OpdsMapping[i] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
3137 }
3138 return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
3139 MI.getNumOperands());
3140 }
3141
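/// Produce the conservative VALU mapping: every register operand goes to
/// VGPR, except s1 values, which go to VCC since VALU booleans are lane
/// masks. Roughly, for a G_FADD this yields
///   %dst:vgpr = G_FADD %a:vgpr, %b:vgpr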
3142 const RegisterBankInfo::InstructionMapping &
3143 AMDGPURegisterBankInfo::getDefaultMappingVOP(const MachineInstr &MI) const {
3144 const MachineFunction &MF = *MI.getParent()->getParent();
3145 const MachineRegisterInfo &MRI = MF.getRegInfo();
3146 SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
3147
3148 // Even though we technically could use SGPRs, this would require knowledge of
3149 // the constant bus restriction. Force all sources to VGPR (except for VCC).
3150 //
3151 // TODO: Unary ops are trivially OK, so accept SGPRs?
3152 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
3153 const MachineOperand &Src = MI.getOperand(i);
3154 if (!Src.isReg())
3155 continue;
3156
3157 unsigned Size = getSizeInBits(Src.getReg(), MRI, *TRI);
3158 unsigned BankID = Size == 1 ? AMDGPU::VCCRegBankID : AMDGPU::VGPRRegBankID;
3159 OpdsMapping[i] = AMDGPU::getValueMapping(BankID, Size);
3160 }
3161
3162 return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
3163 MI.getNumOperands());
3164 }
3165
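/// Like getDefaultMappingVOP, but s1 operands are also forced to VGPR rather
/// than VCC. Used for operations (e.g. the DPP/permute intrinsics) whose
/// operands must all be lanewise VGPR data.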
3166 const RegisterBankInfo::InstructionMapping &
3167 AMDGPURegisterBankInfo::getDefaultMappingAllVGPR(const MachineInstr &MI) const {
3168 const MachineFunction &MF = *MI.getParent()->getParent();
3169 const MachineRegisterInfo &MRI = MF.getRegInfo();
3170 SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
3171
3172 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
3173 const MachineOperand &Op = MI.getOperand(I);
3174 if (!Op.isReg())
3175 continue;
3176
3177 unsigned Size = getSizeInBits(Op.getReg(), MRI, *TRI);
3178 OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3179 }
3180
3181 return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
3182 MI.getNumOperands());
3183 }
3184
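/// Build the operand mapping for an image intrinsic: the resource descriptor
/// (and the sampler, which immediately follows it when present) is reported
/// with whatever bank it already has, so a waterfall loop can legalize it
/// later if needed; all other register operands are sent to VGPR.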
3185 const RegisterBankInfo::InstructionMapping &
3186 AMDGPURegisterBankInfo::getImageMapping(const MachineRegisterInfo &MRI,
3187 const MachineInstr &MI,
3188 int RsrcIdx) const {
3189 // The reported argument index is relative to the IR intrinsic call arguments,
3190 // so we need to shift by the number of defs and the intrinsic ID.
3191 RsrcIdx += MI.getNumExplicitDefs() + 1;
3192
3193 const int NumOps = MI.getNumOperands();
3194 SmallVector<const ValueMapping *, 8> OpdsMapping(NumOps);
3195
3196 // TODO: Should packed/unpacked D16 difference be reported here as part of
3197 // the value mapping?
3198 for (int I = 0; I != NumOps; ++I) {
3199 if (!MI.getOperand(I).isReg())
3200 continue;
3201
3202 Register OpReg = MI.getOperand(I).getReg();
3203 // We replace some dead address operands with $noreg
3204 if (!OpReg)
3205 continue;
3206
3207 unsigned Size = getSizeInBits(OpReg, MRI, *TRI);
3208
3209 // FIXME: Probably need a new intrinsic register bank searchable table to
3210 // handle arbitrary intrinsics easily.
3211 //
3212 // If this has a sampler, it immediately follows rsrc.
3213 const bool MustBeSGPR = I == RsrcIdx || I == RsrcIdx + 1;
3214
3215 if (MustBeSGPR) {
3216 // If this must be an SGPR, we must report whatever it is as legal.
3217 unsigned NewBank = getRegBankID(OpReg, MRI, *TRI, AMDGPU::SGPRRegBankID);
3218 OpdsMapping[I] = AMDGPU::getValueMapping(NewBank, Size);
3219 } else {
3220 // Some operands must be VGPR, and these are easy to copy to.
3221 OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3222 }
3223 }
3224
3225 return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping), NumOps);
3226 }
3227
3228 /// Return the mapping for a pointer argument.
3229 const RegisterBankInfo::ValueMapping *
3230 AMDGPURegisterBankInfo::getValueMappingForPtr(const MachineRegisterInfo &MRI,
3231 Register PtrReg) const {
3232 LLT PtrTy = MRI.getType(PtrReg);
3233 unsigned Size = PtrTy.getSizeInBits();
3234 if (Subtarget.useFlatForGlobal() ||
3235 !SITargetLowering::isFlatGlobalAddrSpace(PtrTy.getAddressSpace()))
3236 return AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3237
3238 // If we're using MUBUF instructions for global memory, an SGPR base register
3239 // is possible. Otherwise this needs to be a VGPR.
3240 const RegisterBank *PtrBank = getRegBank(PtrReg, MRI, *TRI);
3241 return AMDGPU::getValueMapping(PtrBank->getID(), Size);
3242 }
3243
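/// Choose between the scalar (SMRD) and vector (MUBUF/FLAT) forms of a load.
/// As a sketch of the uniform case, a 32-bit load through an SGPR pointer in
/// a scalar-load-legal context maps as
///   %val:sgpr(s32) = G_LOAD %ptr:sgpr(p4)
/// and can later be selected to an s_load_dword.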
3244 const RegisterBankInfo::InstructionMapping &
3245 AMDGPURegisterBankInfo::getInstrMappingForLoad(const MachineInstr &MI) const {
3246
3247 const MachineFunction &MF = *MI.getParent()->getParent();
3248 const MachineRegisterInfo &MRI = MF.getRegInfo();
3249 SmallVector<const ValueMapping*, 2> OpdsMapping(2);
3250 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
3251 Register PtrReg = MI.getOperand(1).getReg();
3252 LLT PtrTy = MRI.getType(PtrReg);
3253 unsigned AS = PtrTy.getAddressSpace();
3254 unsigned PtrSize = PtrTy.getSizeInBits();
3255
3256 const ValueMapping *ValMapping;
3257 const ValueMapping *PtrMapping;
3258
3259 const RegisterBank *PtrBank = getRegBank(PtrReg, MRI, *TRI);
3260
3261 if (PtrBank == &AMDGPU::SGPRRegBank &&
3262 SITargetLowering::isFlatGlobalAddrSpace(AS)) {
3263 if (isScalarLoadLegal(MI)) {
3264 // We have a uniform instruction so we want to use an SMRD load
3265 ValMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
3266 PtrMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize);
3267 } else {
3268 ValMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3269
3270 // If we're using MUBUF instructions for global memory, an SGPR base
3271 // register is possible. Otherwise this needs to be a VGPR.
3272 unsigned PtrBankID = Subtarget.useFlatForGlobal() ?
3273 AMDGPU::VGPRRegBankID : AMDGPU::SGPRRegBankID;
3274
3275 PtrMapping = AMDGPU::getValueMapping(PtrBankID, PtrSize);
3276 }
3277 } else {
3278 ValMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3279 PtrMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize);
3280 }
3281
3282 OpdsMapping[0] = ValMapping;
3283 OpdsMapping[1] = PtrMapping;
3284 const RegisterBankInfo::InstructionMapping &Mapping = getInstructionMapping(
3285 1, 1, getOperandsMapping(OpdsMapping), MI.getNumOperands());
3286 return Mapping;
3287
3288 // FIXME: Do we want to add a mapping for FLAT load, or should we just
3289 // handle that during instruction selection?
3290 }
3291
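/// Return the ID of the bank \p Reg has already been assigned, or \p Default
/// if no bank has been assigned yet.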
3292 unsigned
3293 AMDGPURegisterBankInfo::getRegBankID(Register Reg,
3294 const MachineRegisterInfo &MRI,
3295 const TargetRegisterInfo &TRI,
3296 unsigned Default) const {
3297 const RegisterBank *Bank = getRegBank(Reg, MRI, TRI);
3298 return Bank ? Bank->getID() : Default;
3299 }
3300
3301
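/// Union of two non-boolean banks: the result is SGPR only when both inputs
/// are SGPR; any divergent input forces VGPR.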
3302 static unsigned regBankUnion(unsigned RB0, unsigned RB1) {
3303 return (RB0 == AMDGPU::SGPRRegBankID && RB1 == AMDGPU::SGPRRegBankID) ?
3304 AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
3305 }
3306
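/// Union for boolean (s1) banks, where -1 means "no bank assigned yet" and a
/// VCC input is contagious.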
3307 static int regBankBoolUnion(int RB0, int RB1) {
3308 if (RB0 == -1)
3309 return RB1;
3310 if (RB1 == -1)
3311 return RB0;
3312
3313 // vcc, vcc -> vcc
3314 // vcc, sgpr -> vcc
3315 // vcc, vgpr -> vcc
3316 if (RB0 == AMDGPU::VCCRegBankID || RB1 == AMDGPU::VCCRegBankID)
3317 return AMDGPU::VCCRegBankID;
3318
3319 // sgpr, vgpr -> vgpr
3320 return regBankUnion(RB0, RB1);
3321 }
3322
3323 const RegisterBankInfo::ValueMapping *
3324 AMDGPURegisterBankInfo::getSGPROpMapping(Register Reg,
3325 const MachineRegisterInfo &MRI,
3326 const TargetRegisterInfo &TRI) const {
3327 // Lie and claim anything is legal, even though this needs to be an SGPR;
3328 // applyMapping will have to deal with it as a waterfall loop.
3329 unsigned Bank = getRegBankID(Reg, MRI, TRI, AMDGPU::SGPRRegBankID);
3330 unsigned Size = getSizeInBits(Reg, MRI, TRI);
3331 return AMDGPU::getValueMapping(Bank, Size);
3332 }
3333
3334 const RegisterBankInfo::ValueMapping *
3335 AMDGPURegisterBankInfo::getVGPROpMapping(Register Reg,
3336 const MachineRegisterInfo &MRI,
3337 const TargetRegisterInfo &TRI) const {
3338 unsigned Size = getSizeInBits(Reg, MRI, TRI);
3339 return AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3340 }
3341
3342 const RegisterBankInfo::ValueMapping *
3343 AMDGPURegisterBankInfo::getAGPROpMapping(Register Reg,
3344 const MachineRegisterInfo &MRI,
3345 const TargetRegisterInfo &TRI) const {
3346 unsigned Size = getSizeInBits(Reg, MRI, TRI);
3347 return AMDGPU::getValueMapping(AMDGPU::AGPRRegBankID, Size);
3348 }
3349
3350 ///
3351 /// This function must return a legal mapping, because
3352 /// AMDGPURegisterBankInfo::getInstrAlternativeMappings() is not called
3353 /// in RegBankSelect::Mode::Fast. Any mapping that would cause a
3354 /// VGPR-to-SGPR copy to be generated is illegal.
3355 ///
3356 // Operands that must be SGPRs must accept potentially divergent VGPRs as
3357 // legal. These will be dealt with in applyMappingImpl.
3358 //
3359 const RegisterBankInfo::InstructionMapping &
3360 AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
3361 const MachineFunction &MF = *MI.getParent()->getParent();
3362 const MachineRegisterInfo &MRI = MF.getRegInfo();
3363
3364 if (MI.isCopy()) {
3365 // The default logic bothers to analyze impossible alternative mappings. We
3366 // want the most straightforward mapping, so just directly handle this.
3367 const RegisterBank *DstBank = getRegBank(MI.getOperand(0).getReg(), MRI,
3368 *TRI);
3369 const RegisterBank *SrcBank = getRegBank(MI.getOperand(1).getReg(), MRI,
3370 *TRI);
3371 assert(SrcBank && "src bank should have been assigned already");
3372 if (!DstBank)
3373 DstBank = SrcBank;
3374
3375 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
3376 if (cannotCopy(*DstBank, *SrcBank, Size))
3377 return getInvalidInstructionMapping();
3378
3379 const ValueMapping &ValMap = getValueMapping(0, Size, *DstBank);
3380 return getInstructionMapping(
3381 1, /*Cost*/ 1,
3382 /*OperandsMapping*/ getOperandsMapping({&ValMap}), 1);
3383 }
3384
3385 if (MI.isRegSequence()) {
3386 // If any input is a VGPR, the result must be a VGPR. The default handling
3387 // assumes any copy between banks is legal.
3388 unsigned BankID = AMDGPU::SGPRRegBankID;
3389
3390 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
3391 auto OpBank = getRegBankID(MI.getOperand(I).getReg(), MRI, *TRI);
3392 // It doesn't make sense to use vcc or scc banks here, so just ignore
3393 // them.
3394 if (OpBank != AMDGPU::SGPRRegBankID) {
3395 BankID = AMDGPU::VGPRRegBankID;
3396 break;
3397 }
3398 }
3399 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
3400
3401 const ValueMapping &ValMap = getValueMapping(0, Size, getRegBank(BankID));
3402 return getInstructionMapping(
3403 1, /*Cost*/ 1,
3404 /*OperandsMapping*/ getOperandsMapping({&ValMap}), 1);
3405 }
3406
3407 // The default handling is broken and doesn't handle illegal VGPR->SGPR copies
3408 // properly.
3409 //
3410 // TODO: There are additional exec masking dependencies to analyze.
3411 if (MI.getOpcode() == TargetOpcode::G_PHI) {
3412 // TODO: Generate proper invalid bank enum.
3413 int ResultBank = -1;
3414 Register DstReg = MI.getOperand(0).getReg();
3415
3416 // Sometimes the result may have already been assigned a bank.
3417 if (const RegisterBank *DstBank = getRegBank(DstReg, MRI, *TRI))
3418 ResultBank = DstBank->getID();
3419
3420 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
3421 Register Reg = MI.getOperand(I).getReg();
3422 const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
3423
3424 // FIXME: Assuming VGPR for any undetermined inputs.
3425 if (!Bank || Bank->getID() == AMDGPU::VGPRRegBankID) {
3426 ResultBank = AMDGPU::VGPRRegBankID;
3427 break;
3428 }
3429
3430 // FIXME: Need to promote SGPR case to s32
3431 unsigned OpBank = Bank->getID();
3432 ResultBank = regBankBoolUnion(ResultBank, OpBank);
3433 }
3434
3435 assert(ResultBank != -1);
3436
3437 unsigned Size = MRI.getType(DstReg).getSizeInBits();
3438
3439 const ValueMapping &ValMap =
3440 getValueMapping(0, Size, getRegBank(ResultBank));
3441 return getInstructionMapping(
3442 1, /*Cost*/ 1,
3443 /*OperandsMapping*/ getOperandsMapping({&ValMap}), 1);
3444 }
3445
3446 const RegisterBankInfo::InstructionMapping &Mapping = getInstrMappingImpl(MI);
3447 if (Mapping.isValid())
3448 return Mapping;
3449
3450 SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
3451
3452 switch (MI.getOpcode()) {
3453 default:
3454 return getInvalidInstructionMapping();
3455
3456 case AMDGPU::G_AND:
3457 case AMDGPU::G_OR:
3458 case AMDGPU::G_XOR: {
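// For s1 logic ops, the operand banks depend on how the booleans were
// produced. As a rough guide: vcc op vcc stays vcc, a logic op on uniform
// SGPR bools stays scalar (later widened to s32), and a plain VGPR data
// bool forces a VGPR mapping.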
3459 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3460 if (Size == 1) {
3461 const RegisterBank *DstBank
3462 = getRegBank(MI.getOperand(0).getReg(), MRI, *TRI);
3463
3464 unsigned TargetBankID = -1;
3465 unsigned BankLHS = -1;
3466 unsigned BankRHS = -1;
3467 if (DstBank) {
3468 TargetBankID = DstBank->getID();
3469 if (DstBank == &AMDGPU::VCCRegBank) {
3470 TargetBankID = AMDGPU::VCCRegBankID;
3471 BankLHS = AMDGPU::VCCRegBankID;
3472 BankRHS = AMDGPU::VCCRegBankID;
3473 } else {
3474 BankLHS = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI,
3475 AMDGPU::SGPRRegBankID);
3476 BankRHS = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI,
3477 AMDGPU::SGPRRegBankID);
3478 }
3479 } else {
3480 BankLHS = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI,
3481 AMDGPU::VCCRegBankID);
3482 BankRHS = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI,
3483 AMDGPU::VCCRegBankID);
3484
3485 // Both inputs should be true booleans to produce a boolean result.
3486 if (BankLHS == AMDGPU::VGPRRegBankID || BankRHS == AMDGPU::VGPRRegBankID) {
3487 TargetBankID = AMDGPU::VGPRRegBankID;
3488 } else if (BankLHS == AMDGPU::VCCRegBankID || BankRHS == AMDGPU::VCCRegBankID) {
3489 TargetBankID = AMDGPU::VCCRegBankID;
3490 BankLHS = AMDGPU::VCCRegBankID;
3491 BankRHS = AMDGPU::VCCRegBankID;
3492 } else if (BankLHS == AMDGPU::SGPRRegBankID && BankRHS == AMDGPU::SGPRRegBankID) {
3493 TargetBankID = AMDGPU::SGPRRegBankID;
3494 }
3495 }
3496
3497 OpdsMapping[0] = AMDGPU::getValueMapping(TargetBankID, Size);
3498 OpdsMapping[1] = AMDGPU::getValueMapping(BankLHS, Size);
3499 OpdsMapping[2] = AMDGPU::getValueMapping(BankRHS, Size);
3500 break;
3501 }
3502
3503 if (Size == 64) {
3505 if (isSALUMapping(MI)) {
3506 OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::SGPRRegBankID, Size);
3507 OpdsMapping[1] = OpdsMapping[2] = OpdsMapping[0];
3508 } else {
3509 OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size);
3510 unsigned Bank1 = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI/*, DefaultBankID*/);
3511 OpdsMapping[1] = AMDGPU::getValueMapping(Bank1, Size);
3512
3513 unsigned Bank2 = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI/*, DefaultBankID*/);
3514 OpdsMapping[2] = AMDGPU::getValueMapping(Bank2, Size);
3515 }
3516
3517 break;
3518 }
3519
3520 LLVM_FALLTHROUGH;
3521 }
3522 case AMDGPU::G_PTR_ADD:
3523 case AMDGPU::G_PTRMASK:
3524 case AMDGPU::G_ADD:
3525 case AMDGPU::G_SUB:
3526 case AMDGPU::G_MUL:
3527 case AMDGPU::G_SHL:
3528 case AMDGPU::G_LSHR:
3529 case AMDGPU::G_ASHR:
3530 case AMDGPU::G_UADDO:
3531 case AMDGPU::G_USUBO:
3532 case AMDGPU::G_UADDE:
3533 case AMDGPU::G_SADDE:
3534 case AMDGPU::G_USUBE:
3535 case AMDGPU::G_SSUBE:
3536 case AMDGPU::G_SMIN:
3537 case AMDGPU::G_SMAX:
3538 case AMDGPU::G_UMIN:
3539 case AMDGPU::G_UMAX:
3540 case AMDGPU::G_SHUFFLE_VECTOR:
3541 if (isSALUMapping(MI))
3542 return getDefaultMappingSOP(MI);
3543 LLVM_FALLTHROUGH;
3544
3545 case AMDGPU::G_FADD:
3546 case AMDGPU::G_FSUB:
3547 case AMDGPU::G_FPTOSI:
3548 case AMDGPU::G_FPTOUI:
3549 case AMDGPU::G_FMUL:
3550 case AMDGPU::G_FMA:
3551 case AMDGPU::G_FMAD:
3552 case AMDGPU::G_FSQRT:
3553 case AMDGPU::G_FFLOOR:
3554 case AMDGPU::G_FCEIL:
3555 case AMDGPU::G_FRINT:
3556 case AMDGPU::G_SITOFP:
3557 case AMDGPU::G_UITOFP:
3558 case AMDGPU::G_FPTRUNC:
3559 case AMDGPU::G_FPEXT:
3560 case AMDGPU::G_FEXP2:
3561 case AMDGPU::G_FLOG2:
3562 case AMDGPU::G_FMINNUM:
3563 case AMDGPU::G_FMAXNUM:
3564 case AMDGPU::G_FMINNUM_IEEE:
3565 case AMDGPU::G_FMAXNUM_IEEE:
3566 case AMDGPU::G_FCANONICALIZE:
3567 case AMDGPU::G_INTRINSIC_TRUNC:
3568 case AMDGPU::G_BSWAP: // TODO: Somehow expand for scalar?
3569 case AMDGPU::G_FSHR: // TODO: Expand for scalar
3570 case AMDGPU::G_AMDGPU_FFBH_U32:
3571 case AMDGPU::G_AMDGPU_FMIN_LEGACY:
3572 case AMDGPU::G_AMDGPU_FMAX_LEGACY:
3573 case AMDGPU::G_AMDGPU_RCP_IFLAG:
3574 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0:
3575 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1:
3576 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2:
3577 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
3578 return getDefaultMappingVOP(MI);
3579 case AMDGPU::G_UMULH:
3580 case AMDGPU::G_SMULH: {
3581 if (Subtarget.hasScalarMulHiInsts() && isSALUMapping(MI))
3582 return getDefaultMappingSOP(MI);
3583 return getDefaultMappingVOP(MI);
3584 }
3585 case AMDGPU::G_IMPLICIT_DEF: {
3586 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3587 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
3588 break;
3589 }
3590 case AMDGPU::G_FCONSTANT:
3591 case AMDGPU::G_CONSTANT:
3592 case AMDGPU::G_GLOBAL_VALUE:
3593 case AMDGPU::G_BLOCK_ADDR:
3594 case AMDGPU::G_READCYCLECOUNTER: {
3595 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3596 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
3597 break;
3598 }
3599 case AMDGPU::G_FRAME_INDEX: {
3600 // TODO: This should be the same as other constants, but eliminateFrameIndex
3601 // currently assumes VALU uses.
3602 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3603 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3604 break;
3605 }
3606 case AMDGPU::G_DYN_STACKALLOC: {
3607 // Result is always uniform, and a wave reduction is needed for the source.
3608 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
3609 unsigned SrcBankID = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI);
3610 OpdsMapping[1] = AMDGPU::getValueMapping(SrcBankID, 32);
3611 break;
3612 }
3613 case AMDGPU::G_INSERT: {
3614 unsigned BankID = isSALUMapping(MI) ? AMDGPU::SGPRRegBankID :
3615 AMDGPU::VGPRRegBankID;
3616 unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
3617 unsigned SrcSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
3618 unsigned EltSize = getSizeInBits(MI.getOperand(2).getReg(), MRI, *TRI);
3619 OpdsMapping[0] = AMDGPU::getValueMapping(BankID, DstSize);
3620 OpdsMapping[1] = AMDGPU::getValueMapping(BankID, SrcSize);
3621 OpdsMapping[2] = AMDGPU::getValueMapping(BankID, EltSize);
3622 OpdsMapping[3] = nullptr;
3623 break;
3624 }
3625 case AMDGPU::G_EXTRACT: {
3626 unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI);
3627 unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
3628 unsigned SrcSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
3629 OpdsMapping[0] = AMDGPU::getValueMapping(BankID, DstSize);
3630 OpdsMapping[1] = AMDGPU::getValueMapping(BankID, SrcSize);
3631 OpdsMapping[2] = nullptr;
3632 break;
3633 }
3634 case AMDGPU::G_BUILD_VECTOR:
3635 case AMDGPU::G_BUILD_VECTOR_TRUNC: {
3636 LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
3637 if (DstTy == LLT::vector(2, 16)) {
3638 unsigned DstSize = DstTy.getSizeInBits();
3639 unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
3640 unsigned Src0BankID = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI);
3641 unsigned Src1BankID = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI);
3642 unsigned DstBankID = regBankUnion(Src0BankID, Src1BankID);
3643
3644 OpdsMapping[0] = AMDGPU::getValueMapping(DstBankID, DstSize);
3645 OpdsMapping[1] = AMDGPU::getValueMapping(Src0BankID, SrcSize);
3646 OpdsMapping[2] = AMDGPU::getValueMapping(Src1BankID, SrcSize);
3647 break;
3648 }
3649
3650 LLVM_FALLTHROUGH;
3651 }
3652 case AMDGPU::G_MERGE_VALUES:
3653 case AMDGPU::G_CONCAT_VECTORS: {
3654 unsigned Bank = isSALUMapping(MI) ?
3655 AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
3656 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3657 unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
3658
3659 OpdsMapping[0] = AMDGPU::getValueMapping(Bank, DstSize);
3660 // Op1 and Dst should use the same register bank.
3661 for (unsigned i = 1, e = MI.getNumOperands(); i != e; ++i)
3662 OpdsMapping[i] = AMDGPU::getValueMapping(Bank, SrcSize);
3663 break;
3664 }
3665 case AMDGPU::G_BITCAST:
3666 case AMDGPU::G_INTTOPTR:
3667 case AMDGPU::G_PTRTOINT:
3668 case AMDGPU::G_BITREVERSE:
3669 case AMDGPU::G_FABS:
3670 case AMDGPU::G_FNEG: {
3671 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3672 unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI);
3673 OpdsMapping[0] = OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size);
3674 break;
3675 }
3676 case AMDGPU::G_CTLZ_ZERO_UNDEF:
3677 case AMDGPU::G_CTTZ_ZERO_UNDEF:
3678 case AMDGPU::G_CTPOP: {
3679 unsigned Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
3680 unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI);
3681 OpdsMapping[0] = AMDGPU::getValueMapping(BankID, 32);
3682
3683 // This should really be getValueMappingSGPR64Only, but allowing the generic
3684 // code to handle the register split just makes using LegalizerHelper more
3685 // difficult.
3686 OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size);
3687 break;
3688 }
3689 case AMDGPU::G_TRUNC: {
3690 Register Dst = MI.getOperand(0).getReg();
3691 Register Src = MI.getOperand(1).getReg();
3692 unsigned Bank = getRegBankID(Src, MRI, *TRI);
3693 unsigned DstSize = getSizeInBits(Dst, MRI, *TRI);
3694 unsigned SrcSize = getSizeInBits(Src, MRI, *TRI);
3695 OpdsMapping[0] = AMDGPU::getValueMapping(Bank, DstSize);
3696 OpdsMapping[1] = AMDGPU::getValueMapping(Bank, SrcSize);
3697 break;
3698 }
3699 case AMDGPU::G_ZEXT:
3700 case AMDGPU::G_SEXT:
3701 case AMDGPU::G_ANYEXT:
3702 case AMDGPU::G_SEXT_INREG: {
3703 Register Dst = MI.getOperand(0).getReg();
3704 Register Src = MI.getOperand(1).getReg();
3705 unsigned DstSize = getSizeInBits(Dst, MRI, *TRI);
3706 unsigned SrcSize = getSizeInBits(Src, MRI, *TRI);
3707
3708 unsigned DstBank;
3709 const RegisterBank *SrcBank = getRegBank(Src, MRI, *TRI);
3710 assert(SrcBank);
3711 switch (SrcBank->getID()) {
3712 case AMDGPU::SGPRRegBankID:
3713 DstBank = AMDGPU::SGPRRegBankID;
3714 break;
3715 default:
3716 DstBank = AMDGPU::VGPRRegBankID;
3717 break;
3718 }
3719
3720 // Scalar extend can use 64-bit BFE, but VGPRs require extending to
3721 // 32-bits, and then to 64.
3722 OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(DstBank, DstSize);
3723 OpdsMapping[1] = AMDGPU::getValueMappingSGPR64Only(SrcBank->getID(),
3724 SrcSize);
3725 break;
3726 }
3727 case AMDGPU::G_FCMP: {
3728 unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
3729 unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI);
3730 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
3731 OpdsMapping[1] = nullptr; // Predicate Operand.
3732 OpdsMapping[2] = AMDGPU::getValueMapping(Op2Bank, Size);
3733 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3734 break;
3735 }
3736 case AMDGPU::G_STORE: {
3737 assert(MI.getOperand(0).isReg());
3738 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3739
3740 // FIXME: We need to specify a different reg bank once scalar stores are
3741 // supported.
3742 const ValueMapping *ValMapping =
3743 AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3744 OpdsMapping[0] = ValMapping;
3745 OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg());
3746 break;
3747 }
3748 case AMDGPU::G_ICMP: {
3749 auto Pred = static_cast<CmpInst::Predicate>(MI.getOperand(1).getPredicate());
3750 unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
3751
3752 // See if the result register has already been constrained to vcc, which may
3753 // happen due to control flow intrinsic lowering.
3754 unsigned DstBank = getRegBankID(MI.getOperand(0).getReg(), MRI, *TRI,
3755 AMDGPU::SGPRRegBankID);
3756 unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI);
3757 unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI);
3758
3759 bool CanUseSCC = DstBank == AMDGPU::SGPRRegBankID &&
3760 Op2Bank == AMDGPU::SGPRRegBankID &&
3761 Op3Bank == AMDGPU::SGPRRegBankID &&
3762 (Size == 32 || (Size == 64 &&
3763 (Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE) &&
3764 Subtarget.hasScalarCompareEq64()));
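// e.g. a uniform 64-bit integer equality can still use SCC on subtargets
// that have s_cmp_eq_u64/s_cmp_lg_u64; other 64-bit predicates must go to
// the VALU even when the inputs are uniform.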
3765
3766 DstBank = CanUseSCC ? AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID;
3767 unsigned SrcBank = CanUseSCC ? AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
3768
3769 // TODO: Use 32-bit for scalar output size.
3770 // SCC results will need to be copied to a 32-bit SGPR virtual register.
3771 const unsigned ResultSize = 1;
3772
3773 OpdsMapping[0] = AMDGPU::getValueMapping(DstBank, ResultSize);
3774 OpdsMapping[2] = AMDGPU::getValueMapping(SrcBank, Size);
3775 OpdsMapping[3] = AMDGPU::getValueMapping(SrcBank, Size);
3776 break;
3777 }
3778 case AMDGPU::G_EXTRACT_VECTOR_ELT: {
3779 // VGPR index can be used for waterfall when indexing an SGPR vector.
3780 unsigned SrcBankID = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI);
3781 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3782 unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
3783 unsigned IdxSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
3784 unsigned IdxBank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI);
3785 unsigned OutputBankID = regBankUnion(SrcBankID, IdxBank);
3786
3787 OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(OutputBankID, DstSize);
3788 OpdsMapping[1] = AMDGPU::getValueMapping(SrcBankID, SrcSize);
3789
3790 // The index can be in either bank if the source vector is VGPR.
3791 OpdsMapping[2] = AMDGPU::getValueMapping(IdxBank, IdxSize);
3792 break;
3793 }
3794 case AMDGPU::G_INSERT_VECTOR_ELT: {
3795 unsigned OutputBankID = isSALUMapping(MI) ?
3796 AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
3797
3798 unsigned VecSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3799 unsigned InsertSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
3800 unsigned IdxSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits();
3801 unsigned InsertEltBankID = getRegBankID(MI.getOperand(2).getReg(),
3802 MRI, *TRI);
3803 unsigned IdxBankID = getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI);
3804
3805 OpdsMapping[0] = AMDGPU::getValueMapping(OutputBankID, VecSize);
3806 OpdsMapping[1] = AMDGPU::getValueMapping(OutputBankID, VecSize);
3807
3808 // This is a weird case, because we need to break down the mapping based on
3809 // the register bank of a different operand.
3810 if (InsertSize == 64 && OutputBankID == AMDGPU::VGPRRegBankID) {
3811 OpdsMapping[2] = AMDGPU::getValueMappingSplit64(InsertEltBankID,
3812 InsertSize);
3813 } else {
3814 assert(InsertSize == 32 || InsertSize == 64);
3815 OpdsMapping[2] = AMDGPU::getValueMapping(InsertEltBankID, InsertSize);
3816 }
3817
3818 // The index can be in either bank if the source vector is VGPR.
3819 OpdsMapping[3] = AMDGPU::getValueMapping(IdxBankID, IdxSize);
3820 break;
3821 }
3822 case AMDGPU::G_UNMERGE_VALUES: {
3823 unsigned Bank = isSALUMapping(MI) ? AMDGPU::SGPRRegBankID :
3824 AMDGPU::VGPRRegBankID;
3825
3826 // Op1 and Dst should use the same register bank.
3827 // FIXME: Shouldn't this be the default? Why do we need to handle this?
3828 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
3829 unsigned Size = getSizeInBits(MI.getOperand(i).getReg(), MRI, *TRI);
3830 OpdsMapping[i] = AMDGPU::getValueMapping(Bank, Size);
3831 }
3832 break;
3833 }
3834 case AMDGPU::G_AMDGPU_BUFFER_LOAD:
3835 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
3836 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE:
3837 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
3838 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT:
3839 case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT:
3840 case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16:
3841 case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT:
3842 case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16:
3843 case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT:
3844 case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16:
3845 case AMDGPU::G_AMDGPU_BUFFER_STORE:
3846 case AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE:
3847 case AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT:
3848 case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT:
3849 case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16: {
3850 OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
3851
3852 // rsrc
3853 OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
3854
3855 // vindex
3856 OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
3857
3858 // voffset
3859 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
3860
3861 // soffset
3862 OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
3863
3864 // Any remaining operands are immediates and were correctly null
3865 // initialized.
3866 break;
3867 }
3868 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP:
3869 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD:
3870 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB:
3871 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN:
3872 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN:
3873 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX:
3874 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX:
3875 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND:
3876 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR:
3877 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR:
3878 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC:
3879 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC: {
3880 // vdata_out
3881 OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
3882
3883 // vdata_in
3884 OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
3885
3886 // rsrc
3887 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
3888
3889 // vindex
3890 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
3891
3892 // voffset
3893 OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
3894
3895 // soffset
3896 OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
3897
3898 // Any remaining operands are immediates and were correctly null
3899 // initialized.
3900 break;
3901 }
3902 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP: {
3903 // vdata_out
3904 OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
3905
3906 // vdata_in
3907 OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
3908
3909 // cmp
3910 OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
3911
3912 // rsrc
3913 OpdsMapping[3] = getSGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
3914
3915 // vindex
3916 OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
3917
3918 // voffset
3919 OpdsMapping[5] = getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
3920
3921 // soffset
3922 OpdsMapping[6] = getSGPROpMapping(MI.getOperand(6).getReg(), MRI, *TRI);
3923
3924 // Any remaining operands are immediates and were correctly null
3925 // initialized.
3926 break;
3927 }
3928 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD: {
3929 // Lie and claim everything is legal, even though some need to be
3930 // SGPRs. applyMapping will have to deal with it as a waterfall loop.
3931 OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
3932 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
3933
3934 // We need to convert this to a MUBUF if either the resource or the offset
3935 // is a VGPR.
3936 unsigned RSrcBank = OpdsMapping[1]->BreakDown[0].RegBank->getID();
3937 unsigned OffsetBank = OpdsMapping[2]->BreakDown[0].RegBank->getID();
3938 unsigned ResultBank = regBankUnion(RSrcBank, OffsetBank);
3939
3940 unsigned Size0 = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3941 OpdsMapping[0] = AMDGPU::getValueMapping(ResultBank, Size0);
3942 break;
3943 }
3944 case AMDGPU::G_INTRINSIC: {
3945 switch (MI.getIntrinsicID()) {
3946 default:
3947 return getInvalidInstructionMapping();
3948 case Intrinsic::amdgcn_div_fmas:
3949 case Intrinsic::amdgcn_div_fixup:
3950 case Intrinsic::amdgcn_trig_preop:
3951 case Intrinsic::amdgcn_sin:
3952 case Intrinsic::amdgcn_cos:
3953 case Intrinsic::amdgcn_log_clamp:
3954 case Intrinsic::amdgcn_rcp:
3955 case Intrinsic::amdgcn_rcp_legacy:
3956 case Intrinsic::amdgcn_sqrt:
3957 case Intrinsic::amdgcn_rsq:
3958 case Intrinsic::amdgcn_rsq_legacy:
3959 case Intrinsic::amdgcn_rsq_clamp:
3960 case Intrinsic::amdgcn_fmul_legacy:
3961 case Intrinsic::amdgcn_ldexp:
3962 case Intrinsic::amdgcn_frexp_mant:
3963 case Intrinsic::amdgcn_frexp_exp:
3964 case Intrinsic::amdgcn_fract:
3965 case Intrinsic::amdgcn_cvt_pkrtz:
3966 case Intrinsic::amdgcn_cvt_pknorm_i16:
3967 case Intrinsic::amdgcn_cvt_pknorm_u16:
3968 case Intrinsic::amdgcn_cvt_pk_i16:
3969 case Intrinsic::amdgcn_cvt_pk_u16:
3970 case Intrinsic::amdgcn_fmed3:
3971 case Intrinsic::amdgcn_cubeid:
3972 case Intrinsic::amdgcn_cubema:
3973 case Intrinsic::amdgcn_cubesc:
3974 case Intrinsic::amdgcn_cubetc:
3975 case Intrinsic::amdgcn_sffbh:
3976 case Intrinsic::amdgcn_fmad_ftz:
3977 case Intrinsic::amdgcn_mbcnt_lo:
3978 case Intrinsic::amdgcn_mbcnt_hi:
3979 case Intrinsic::amdgcn_mul_u24:
3980 case Intrinsic::amdgcn_mul_i24:
3981 case Intrinsic::amdgcn_lerp:
3982 case Intrinsic::amdgcn_sad_u8:
3983 case Intrinsic::amdgcn_msad_u8:
3984 case Intrinsic::amdgcn_sad_hi_u8:
3985 case Intrinsic::amdgcn_sad_u16:
3986 case Intrinsic::amdgcn_qsad_pk_u16_u8:
3987 case Intrinsic::amdgcn_mqsad_pk_u16_u8:
3988 case Intrinsic::amdgcn_mqsad_u32_u8:
3989 case Intrinsic::amdgcn_cvt_pk_u8_f32:
3990 case Intrinsic::amdgcn_alignbit:
3991 case Intrinsic::amdgcn_alignbyte:
3992 case Intrinsic::amdgcn_fdot2:
3993 case Intrinsic::amdgcn_sdot2:
3994 case Intrinsic::amdgcn_udot2:
3995 case Intrinsic::amdgcn_sdot4:
3996 case Intrinsic::amdgcn_udot4:
3997 case Intrinsic::amdgcn_sdot8:
3998 case Intrinsic::amdgcn_udot8:
3999 return getDefaultMappingVOP(MI);
4000 case Intrinsic::amdgcn_sbfe:
4001 case Intrinsic::amdgcn_ubfe:
4002 if (isSALUMapping(MI))
4003 return getDefaultMappingSOP(MI);
4004 return getDefaultMappingVOP(MI);
4005 case Intrinsic::amdgcn_ds_swizzle:
4006 case Intrinsic::amdgcn_ds_permute:
4007 case Intrinsic::amdgcn_ds_bpermute:
4008 case Intrinsic::amdgcn_update_dpp:
4009 case Intrinsic::amdgcn_mov_dpp8:
4010 case Intrinsic::amdgcn_mov_dpp:
4011 case Intrinsic::amdgcn_wwm:
4012 case Intrinsic::amdgcn_wqm:
4013 case Intrinsic::amdgcn_softwqm:
4014 return getDefaultMappingAllVGPR(MI);
4015 case Intrinsic::amdgcn_kernarg_segment_ptr:
4016 case Intrinsic::amdgcn_s_getpc:
4017 case Intrinsic::amdgcn_groupstaticsize: {
4018 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4019 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
4020 break;
4021 }
4022 case Intrinsic::amdgcn_wqm_vote: {
4023 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4024 OpdsMapping[0] = OpdsMapping[2]
4025 = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size);
4026 break;
4027 }
4028 case Intrinsic::amdgcn_ps_live: {
4029 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
4030 break;
4031 }
4032 case Intrinsic::amdgcn_div_scale: {
4033 unsigned Dst0Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4034 unsigned Dst1Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
4035 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Dst0Size);
4036 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Dst1Size);
4037
4038 unsigned SrcSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits();
4039 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
4040 OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
4041 break;
4042 }
4043 case Intrinsic::amdgcn_class: {
4044 Register Src0Reg = MI.getOperand(2).getReg();
4045 Register Src1Reg = MI.getOperand(3).getReg();
4046 unsigned Src0Size = MRI.getType(Src0Reg).getSizeInBits();
4047 unsigned Src1Size = MRI.getType(Src1Reg).getSizeInBits();
4048 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4049 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, DstSize);
4050 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Src0Size);
4051 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Src1Size);
4052 break;
4053 }
4054 case Intrinsic::amdgcn_icmp:
4055 case Intrinsic::amdgcn_fcmp: {
4056 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4057 // This is not VCCRegBank because this is not used in boolean contexts.
4058 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
4059 unsigned OpSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
4060 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, OpSize);
4061 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, OpSize);
4062 break;
4063 }
4064 case Intrinsic::amdgcn_readlane: {
4065 // This must be an SGPR, but accept a VGPR.
4066 Register IdxReg = MI.getOperand(3).getReg();
4067 unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits();
4068 unsigned IdxBank = getRegBankID(IdxReg, MRI, *TRI, AMDGPU::SGPRRegBankID);
4069 OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize);
4070 LLVM_FALLTHROUGH;
4071 }
4072 case Intrinsic::amdgcn_readfirstlane: {
4073 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4074 unsigned SrcSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
4075 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
4076 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
4077 break;
4078 }
4079 case Intrinsic::amdgcn_writelane: {
4080 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4081 Register SrcReg = MI.getOperand(2).getReg();
4082 unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits();
4083 unsigned SrcBank = getRegBankID(SrcReg, MRI, *TRI, AMDGPU::SGPRRegBankID);
4084 Register IdxReg = MI.getOperand(3).getReg();
4085 unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits();
4086 unsigned IdxBank = getRegBankID(IdxReg, MRI, *TRI, AMDGPU::SGPRRegBankID);
4087 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
4088
4089 // These two must be SGPRs, but accept VGPRs; readfirstlane will be inserted
4090 // to legalize them.
4091 OpdsMapping[2] = AMDGPU::getValueMapping(SrcBank, SrcSize);
4092 OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize);
4093 OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
4094 break;
4095 }
4096 case Intrinsic::amdgcn_if_break: {
4097 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
4098 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
4099 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
4100 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
4101 break;
4102 }
4103 case Intrinsic::amdgcn_permlane16:
4104 case Intrinsic::amdgcn_permlanex16: {
4105 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
4106 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4107 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4108 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4109 OpdsMapping[4] = getSGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4110 OpdsMapping[5] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4111 break;
4112 }
4113 case Intrinsic::amdgcn_mfma_f32_4x4x1f32:
4114 case Intrinsic::amdgcn_mfma_f32_4x4x4f16:
4115 case Intrinsic::amdgcn_mfma_i32_4x4x4i8:
4116 case Intrinsic::amdgcn_mfma_f32_4x4x2bf16:
4117 case Intrinsic::amdgcn_mfma_f32_16x16x1f32:
4118 case Intrinsic::amdgcn_mfma_f32_16x16x4f32:
4119 case Intrinsic::amdgcn_mfma_f32_16x16x4f16:
4120 case Intrinsic::amdgcn_mfma_f32_16x16x16f16:
4121 case Intrinsic::amdgcn_mfma_i32_16x16x4i8:
4122 case Intrinsic::amdgcn_mfma_i32_16x16x16i8:
4123 case Intrinsic::amdgcn_mfma_f32_16x16x2bf16:
4124 case Intrinsic::amdgcn_mfma_f32_16x16x8bf16:
4125 case Intrinsic::amdgcn_mfma_f32_32x32x1f32:
4126 case Intrinsic::amdgcn_mfma_f32_32x32x2f32:
4127 case Intrinsic::amdgcn_mfma_f32_32x32x4f16:
4128 case Intrinsic::amdgcn_mfma_f32_32x32x8f16:
4129 case Intrinsic::amdgcn_mfma_i32_32x32x4i8:
4130 case Intrinsic::amdgcn_mfma_i32_32x32x8i8:
4131 case Intrinsic::amdgcn_mfma_f32_32x32x2bf16:
4132 case Intrinsic::amdgcn_mfma_f32_32x32x4bf16: {
4133 // Default for MAI intrinsics.
4134 // srcC can also be an immediate which can be folded later.
4135 // FIXME: Should we eventually add an alternative mapping with AGPR src
4136 // for srcA/srcB?
4137 //
4138 // vdst, srcA, srcB, srcC
4139 OpdsMapping[0] = getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4140 OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4141 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4142 OpdsMapping[4] = getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4143 break;
4144 }
4145 case Intrinsic::amdgcn_interp_p1:
4146 case Intrinsic::amdgcn_interp_p2:
4147 case Intrinsic::amdgcn_interp_mov:
4148 case Intrinsic::amdgcn_interp_p1_f16:
4149 case Intrinsic::amdgcn_interp_p2_f16: {
4150 const int M0Idx = MI.getNumOperands() - 1;
4151 Register M0Reg = MI.getOperand(M0Idx).getReg();
4152 unsigned M0Bank = getRegBankID(M0Reg, MRI, *TRI, AMDGPU::SGPRRegBankID);
4153 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4154
4155 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
4156 for (int I = 2; I != M0Idx && MI.getOperand(I).isReg(); ++I)
4157 OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4158
4159 // Must be an SGPR, but we take whatever the original bank is and fix it
4160 // later.
4161 OpdsMapping[M0Idx] = AMDGPU::getValueMapping(M0Bank, 32);
4162 break;
4163 }
4164 case Intrinsic::amdgcn_ballot: {
4165 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4166 unsigned SrcSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
4167 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
4168 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, SrcSize);
4169 break;
4170 }
4171 }
4172 break;
4173 }
4174 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
4175 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE: {
4176 auto IntrID = MI.getIntrinsicID();
4177 const AMDGPU::RsrcIntrinsic *RSrcIntrin = AMDGPU::lookupRsrcIntrinsic(IntrID);
4178 assert(RSrcIntrin && "missing RsrcIntrinsic for image intrinsic");
4179 // Non-images can have complications from operands that allow both SGPR
4180 // and VGPR. For now it's too complicated to figure out the final opcode
4181 // to derive the register bank from the MCInstrDesc.
4182 assert(RSrcIntrin->IsImage);
4183 return getImageMapping(MRI, MI, RSrcIntrin->RsrcArg);
4184 }
4185 case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: {
4186 auto IntrID = MI.getIntrinsicID();
4187 switch (IntrID) {
4188 case Intrinsic::amdgcn_s_getreg:
4189 case Intrinsic::amdgcn_s_memtime:
4190 case Intrinsic::amdgcn_s_memrealtime:
4191 case Intrinsic::amdgcn_s_get_waveid_in_workgroup: {
4192 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4193 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
4194 break;
4195 }
4196 case Intrinsic::amdgcn_ds_fadd:
4197 case Intrinsic::amdgcn_ds_fmin:
4198 case Intrinsic::amdgcn_ds_fmax:
4199 return getDefaultMappingAllVGPR(MI);
4200 case Intrinsic::amdgcn_ds_ordered_add:
4201 case Intrinsic::amdgcn_ds_ordered_swap: {
4202 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4203 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
4204 unsigned M0Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI,
4205 AMDGPU::SGPRRegBankID);
4206 OpdsMapping[2] = AMDGPU::getValueMapping(M0Bank, 32);
4207 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4208 break;
4209 }
4210 case Intrinsic::amdgcn_ds_append:
4211 case Intrinsic::amdgcn_ds_consume: {
4212 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4213 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
4214 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4215 break;
4216 }
4217 case Intrinsic::amdgcn_exp_compr:
4218 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4219 OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4220 break;
4221 case Intrinsic::amdgcn_exp:
4222 // FIXME: Could we support packed types here?
4223 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4224 OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4225 OpdsMapping[5] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4226 OpdsMapping[6] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4227 break;
4228 case Intrinsic::amdgcn_s_sendmsg:
4229 case Intrinsic::amdgcn_s_sendmsghalt: {
4230 // This must be an SGPR, but accept a VGPR.
4231 unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI,
4232 AMDGPU::SGPRRegBankID);
4233 OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
4234 break;
4235 }
4236 case Intrinsic::amdgcn_s_setreg: {
4237 // This must be an SGPR, but accept a VGPR.
4238 unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI,
4239 AMDGPU::SGPRRegBankID);
4240 OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
4241 break;
4242 }
4243 case Intrinsic::amdgcn_end_cf: {
4244 unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
4245 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
4246 break;
4247 }
4248 case Intrinsic::amdgcn_else: {
4249 unsigned WaveSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
4250 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
4251 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize);
4252 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize);
4253 break;
4254 }
4255 case Intrinsic::amdgcn_kill: {
4256 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
4257 break;
4258 }
4259 case Intrinsic::amdgcn_raw_buffer_load:
4260 case Intrinsic::amdgcn_raw_tbuffer_load: {
4261 // FIXME: Should make intrinsic ID the last operand of the instruction,
4262 // then this would be the same as store
4263 OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4264 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4265 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4266 OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4267 break;
4268 }
4269 case Intrinsic::amdgcn_raw_buffer_store:
4270 case Intrinsic::amdgcn_raw_buffer_store_format:
4271 case Intrinsic::amdgcn_raw_tbuffer_store: {
4272 OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
4273 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4274 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4275 OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4276 break;
4277 }
4278 case Intrinsic::amdgcn_struct_buffer_load:
4279 case Intrinsic::amdgcn_struct_tbuffer_load: {
4280 OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4281 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4282 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4283 OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4284 OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
4285 break;
4286 }
4287 case Intrinsic::amdgcn_struct_buffer_store:
4288 case Intrinsic::amdgcn_struct_tbuffer_store: {
4289 OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
4290 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4291 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4292 OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4293 OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
4294 break;
4295 }
4296 case Intrinsic::amdgcn_init_exec_from_input: {
4297 unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
4298 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
4299 break;
4300 }
4301 case Intrinsic::amdgcn_ds_gws_init:
4302 case Intrinsic::amdgcn_ds_gws_barrier:
4303 case Intrinsic::amdgcn_ds_gws_sema_br: {
4304 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4305
4306 // This must be an SGPR, but accept a VGPR.
4307 unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI,
4308 AMDGPU::SGPRRegBankID);
4309 OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
4310 break;
4311 }
4312 case Intrinsic::amdgcn_ds_gws_sema_v:
4313 case Intrinsic::amdgcn_ds_gws_sema_p:
4314 case Intrinsic::amdgcn_ds_gws_sema_release_all: {
4315 // This must be an SGPR, but accept a VGPR.
4316 unsigned Bank = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI,
4317 AMDGPU::SGPRRegBankID);
4318 OpdsMapping[1] = AMDGPU::getValueMapping(Bank, 32);
4319 break;
4320 }
4321 default:
4322 return getInvalidInstructionMapping();
4323 }
4324 break;
4325 }
4326 case AMDGPU::G_SELECT: {
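// Roughly: if both value inputs are SGPR and the condition is (or can be)
// scalar, keep the whole select scalar with an SGPR condition; otherwise
// fall back to a vector select with a VCC condition.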
4327 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4328 unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI,
4329 AMDGPU::SGPRRegBankID);
4330 unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI,
4331 AMDGPU::SGPRRegBankID);
4332 bool SGPRSrcs = Op2Bank == AMDGPU::SGPRRegBankID &&
4333 Op3Bank == AMDGPU::SGPRRegBankID;
4334
4335 unsigned CondBankDefault = SGPRSrcs ?
4336 AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID;
4337 unsigned CondBank = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI,
4338 CondBankDefault);
4339 if (CondBank == AMDGPU::SGPRRegBankID)
4340 CondBank = SGPRSrcs ? AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID;
4341 else if (CondBank == AMDGPU::VGPRRegBankID)
4342 CondBank = AMDGPU::VCCRegBankID;
4343
4344 unsigned Bank = SGPRSrcs && CondBank == AMDGPU::SGPRRegBankID ?
4345 AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
4346
4347 assert(CondBank == AMDGPU::VCCRegBankID || CondBank == AMDGPU::SGPRRegBankID);
4348
4349 // TODO: Should report 32-bit for scalar condition type.
4350 if (Size == 64) {
4351 OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
4352 OpdsMapping[1] = AMDGPU::getValueMapping(CondBank, 1);
4353 OpdsMapping[2] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
4354 OpdsMapping[3] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
4355 } else {
4356 OpdsMapping[0] = AMDGPU::getValueMapping(Bank, Size);
4357 OpdsMapping[1] = AMDGPU::getValueMapping(CondBank, 1);
4358 OpdsMapping[2] = AMDGPU::getValueMapping(Bank, Size);
4359 OpdsMapping[3] = AMDGPU::getValueMapping(Bank, Size);
4360 }
4361
4362 break;
4363 }
4364
4365 case AMDGPU::G_LOAD:
4366 case AMDGPU::G_ZEXTLOAD:
4367 case AMDGPU::G_SEXTLOAD:
4368 return getInstrMappingForLoad(MI);
4369
4370 case AMDGPU::G_ATOMICRMW_XCHG:
4371 case AMDGPU::G_ATOMICRMW_ADD:
4372 case AMDGPU::G_ATOMICRMW_SUB:
4373 case AMDGPU::G_ATOMICRMW_AND:
4374 case AMDGPU::G_ATOMICRMW_OR:
4375 case AMDGPU::G_ATOMICRMW_XOR:
4376 case AMDGPU::G_ATOMICRMW_MAX:
4377 case AMDGPU::G_ATOMICRMW_MIN:
4378 case AMDGPU::G_ATOMICRMW_UMAX:
4379 case AMDGPU::G_ATOMICRMW_UMIN:
4380 case AMDGPU::G_ATOMICRMW_FADD:
4381 case AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG:
4382 case AMDGPU::G_AMDGPU_ATOMIC_INC:
4383 case AMDGPU::G_AMDGPU_ATOMIC_DEC: {
4384 OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4385 OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg());
4386 OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4387 break;
4388 }
4389 case AMDGPU::G_ATOMIC_CMPXCHG: {
4390 OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4391 OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg());
4392 OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4393 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4394 break;
4395 }
4396 case AMDGPU::G_BRCOND: {
4397 unsigned Bank = getRegBankID(MI.getOperand(0).getReg(), MRI, *TRI,
4398 AMDGPU::SGPRRegBankID);
4399 assert(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1);
4400 if (Bank != AMDGPU::SGPRRegBankID)
4401 Bank = AMDGPU::VCCRegBankID;
4402
4403 OpdsMapping[0] = AMDGPU::getValueMapping(Bank, 1);
4404 break;
4405 }
4406 }
4407
4408 return getInstructionMapping(/*ID*/1, /*Cost*/1,
4409 getOperandsMapping(OpdsMapping),
4410 MI.getNumOperands());
4411 }
4412