//===- AMDGPURegisterBankInfo.cpp -------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the RegisterBankInfo class for
/// AMDGPU.
///
/// \par
///
/// AMDGPU has unique register bank constraints that require special high level
/// strategies to deal with. There are two main true physical register banks:
/// VGPR (vector) and SGPR (scalar). Additionally the VCC register bank is a
/// sort of pseudo-register bank needed to represent SGPRs used in a vector
/// boolean context. There is also the AGPR bank, which is a special purpose
/// physical register bank present on some subtargets.
///
/// Copying from VGPR to SGPR is generally illegal, unless the value is known to
/// be uniform. It is generally not valid to legalize operands by inserting
/// copies as on other targets. Operations which require uniform, SGPR operands
/// generally require scalarization by repeatedly executing the instruction,
/// activating each set of lanes using a unique set of input values. This is
/// referred to as a waterfall loop.
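///
/// For example (illustrative MIR; the exact opcode and register names are for
/// illustration only), a buffer load whose resource descriptor lives in VGPRs
///
///   %rsrc:vgpr(<4 x s32>) = ...
///   %val:vgpr(s32) = G_AMDGPU_BUFFER_LOAD %rsrc, ...
///
/// cannot simply copy %rsrc to SGPRs; it must be wrapped in a waterfall loop
/// that uses v_readfirstlane to process each unique descriptor value present
/// across the active lanes.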
///
/// \par Booleans
///
/// Booleans (s1 values) require special consideration. A vector compare result
/// is naturally a bitmask with one bit per lane, in a 32 or 64-bit
/// register. These are represented with the VCC bank. During selection, we need
/// to be able to unambiguously go back from a register class to a register
/// bank. To distinguish whether an SGPR should use the SGPR or VCC register
/// bank, we need to know the use context type. An SGPR s1 value always means a
/// VCC bank value; any other type means the SGPR bank. A scalar compare sets
/// SCC, which is a 1-bit unaddressable register. This will need to be copied to
/// a 32-bit virtual register. Taken together, this means we need to adjust the
/// type of boolean operations to be regbank legal. All SALU booleans need to be
/// widened to 32-bits, and all VALU booleans need to be s1 values.
///
/// A noteworthy exception to the s1-means-vcc rule is for legalization artifact
/// casts. G_TRUNC s1 results, and G_SEXT/G_ZEXT/G_ANYEXT sources are never vcc
/// bank. A non-boolean source (such as a truncate from a 1-bit load from
/// memory) will require a copy to the VCC bank which will require clearing the
/// high bits and inserting a compare.
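///
/// As an illustration (hypothetical MIR), after regbank legalization a uniform
/// boolean is widened while a divergent one stays s1:
///
///   %ucmp:sgpr(s32) = G_ICMP intpred(eq), %a, %b  ; SALU bool widened to s32
///   %dcmp:vcc(s1)   = G_ICMP intpred(eq), %x, %y  ; VALU bool remains s1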
///
/// \par Constant bus restriction
///
/// VALU instructions have a limitation known as the constant bus
/// restriction. Most VALU instructions can use SGPR operands, but may read at
/// most 1 SGPR or constant literal value (this was relaxed to 2 in gfx10 for
/// most instructions). This is one unique SGPR, so the same SGPR may be used
/// for multiple operands. From a register bank perspective, any combination of
/// operands should be legal as an SGPR, but this is contextually dependent on
/// the SGPR operands all being the same register. It is therefore optimal to
/// choose the SGPR with the most uses to minimize the number of copies.
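///
/// For example, before gfx10:
///
///   v_add_f32 v0, s0, s0   ; legal: one unique SGPR, read twice
///   v_add_f32 v0, s0, s1   ; illegal: two unique SGPRs on the constant bus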
///
/// We avoid trying to solve this problem in RegBankSelect. Any VALU G_*
/// operation should have its source operands all mapped to VGPRs (except for
/// VCC), inserting copies from any SGPR operands. This is the most trivial
/// legal mapping. Anything beyond the simplest 1:1 instruction selection would
/// be too complicated to solve here. Every optimization pattern or instruction
/// selected to multiple outputs would have to enforce this rule, and there
/// would be additional complexity in tracking this rule for every G_*
/// operation. By forcing all inputs to VGPRs, it also simplifies the task of
/// picking the optimal operand combination from a post-isel optimization pass.
///
//===----------------------------------------------------------------------===//

#include "AMDGPURegisterBankInfo.h"

#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUInstrInfo.h"
#include "GCNSubtarget.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/RegisterBank.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"

#define GET_TARGET_REGBANK_IMPL
#include "AMDGPUGenRegisterBank.inc"

// This file will be TableGen'ed at some point.
#include "AMDGPUGenRegisterBankInfo.def"

using namespace llvm;
using namespace MIPatternMatch;

namespace {

// Observer to apply a register bank to new registers created by
// LegalizerHelper.
class ApplyRegBankMapping final : public GISelChangeObserver {
private:
  const AMDGPURegisterBankInfo &RBI;
  MachineRegisterInfo &MRI;
  const RegisterBank *NewBank;
  SmallVector<MachineInstr *, 4> NewInsts;

public:
  ApplyRegBankMapping(const AMDGPURegisterBankInfo &RBI_,
                      MachineRegisterInfo &MRI_, const RegisterBank *RB)
    : RBI(RBI_), MRI(MRI_), NewBank(RB) {}

  ~ApplyRegBankMapping() {
    for (MachineInstr *MI : NewInsts)
      applyBank(*MI);
  }

  /// Set any registers that don't have a set register class or bank to SALU.
  void applyBank(MachineInstr &MI) {
    const unsigned Opc = MI.getOpcode();
    if (Opc == AMDGPU::G_ANYEXT || Opc == AMDGPU::G_ZEXT ||
        Opc == AMDGPU::G_SEXT) {
      // LegalizerHelper wants to use the basic legalization artifacts when
      // widening etc. We don't handle selection with vcc in artifact sources,
      // so we need to use a select instead to handle these properly.
      Register DstReg = MI.getOperand(0).getReg();
      Register SrcReg = MI.getOperand(1).getReg();
      const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, MRI, *RBI.TRI);
      if (SrcBank == &AMDGPU::VCCRegBank) {
        const LLT S32 = LLT::scalar(32);
        assert(MRI.getType(SrcReg) == LLT::scalar(1));
        assert(MRI.getType(DstReg) == S32);
        assert(NewBank == &AMDGPU::VGPRRegBank);

        // Replace the extension with a select, which really uses the boolean
        // source.
        MachineIRBuilder B(MI);
        auto True = B.buildConstant(S32, Opc == AMDGPU::G_SEXT ? -1 : 1);
        auto False = B.buildConstant(S32, 0);
        B.buildSelect(DstReg, SrcReg, True, False);
        MRI.setRegBank(True.getReg(0), *NewBank);
        MRI.setRegBank(False.getReg(0), *NewBank);
        MI.eraseFromParent();
      }

      assert(!MRI.getRegClassOrRegBank(DstReg));
      MRI.setRegBank(DstReg, *NewBank);
      return;
    }

#ifndef NDEBUG
    if (Opc == AMDGPU::G_TRUNC) {
      Register DstReg = MI.getOperand(0).getReg();
      const RegisterBank *DstBank = RBI.getRegBank(DstReg, MRI, *RBI.TRI);
      assert(DstBank != &AMDGPU::VCCRegBank);
    }
#endif

    for (MachineOperand &Op : MI.operands()) {
      if (!Op.isReg())
        continue;

      // We may see physical registers if building a real MI
      Register Reg = Op.getReg();
      if (Reg.isPhysical() || MRI.getRegClassOrRegBank(Reg))
        continue;

      const RegisterBank *RB = NewBank;
      if (MRI.getType(Reg) == LLT::scalar(1)) {
        assert(NewBank == &AMDGPU::VGPRRegBank &&
               "s1 operands should only be used for vector bools");
        assert((MI.getOpcode() != AMDGPU::G_TRUNC &&
                MI.getOpcode() != AMDGPU::G_ANYEXT) &&
               "not expecting legalization artifacts here");
        RB = &AMDGPU::VCCRegBank;
      }

      MRI.setRegBank(Reg, *RB);
    }
  }

  void erasingInstr(MachineInstr &MI) override {}

  void createdInstr(MachineInstr &MI) override {
    // At this point, the instruction was just inserted and has no operands.
    NewInsts.push_back(&MI);
  }

  void changingInstr(MachineInstr &MI) override {}
  void changedInstr(MachineInstr &MI) override {
    // FIXME: In principle we should probably add the instruction to NewInsts,
    // but the way the LegalizerHelper uses the observer, we will always see the
    // registers we need to set the regbank on also referenced in a new
    // instruction.
  }
};

} // anonymous namespace

AMDGPURegisterBankInfo::AMDGPURegisterBankInfo(const GCNSubtarget &ST)
    : AMDGPUGenRegisterBankInfo(),
      Subtarget(ST),
      TRI(Subtarget.getRegisterInfo()),
      TII(Subtarget.getInstrInfo()) {

  // HACK: Until this is fully tablegen'd.
  static llvm::once_flag InitializeRegisterBankFlag;

  static auto InitializeRegisterBankOnce = [this]() {
    assert(&getRegBank(AMDGPU::SGPRRegBankID) == &AMDGPU::SGPRRegBank &&
           &getRegBank(AMDGPU::VGPRRegBankID) == &AMDGPU::VGPRRegBank &&
           &getRegBank(AMDGPU::AGPRRegBankID) == &AMDGPU::AGPRRegBank);
    (void)this;
  };

  llvm::call_once(InitializeRegisterBankFlag, InitializeRegisterBankOnce);
}

static bool isVectorRegisterBank(const RegisterBank &Bank) {
  unsigned BankID = Bank.getID();
  return BankID == AMDGPU::VGPRRegBankID || BankID == AMDGPU::AGPRRegBankID;
}

unsigned AMDGPURegisterBankInfo::copyCost(const RegisterBank &Dst,
                                          const RegisterBank &Src,
                                          unsigned Size) const {
  // TODO: Should there be a UniformVGPRRegBank which can use readfirstlane?
  if (Dst.getID() == AMDGPU::SGPRRegBankID &&
      (isVectorRegisterBank(Src) || Src.getID() == AMDGPU::VCCRegBankID)) {
    return std::numeric_limits<unsigned>::max();
  }

  // Bool values are tricky, because the meaning is based on context. The SCC
  // and VCC banks are for the natural scalar and vector conditions produced by
  // a compare.
  //
  // Legalization doesn't know about the necessary context, so an s1 use may
  // have been a truncate from an arbitrary value, in which case a copy (lowered
  // as a compare with 0) needs to be inserted.
  if (Size == 1 &&
      (Dst.getID() == AMDGPU::SGPRRegBankID) &&
      (isVectorRegisterBank(Src) ||
       Src.getID() == AMDGPU::SGPRRegBankID ||
       Src.getID() == AMDGPU::VCCRegBankID))
    return std::numeric_limits<unsigned>::max();

  // There is no direct copy between AGPRs.
  if (Dst.getID() == AMDGPU::AGPRRegBankID &&
      Src.getID() == AMDGPU::AGPRRegBankID)
    return 4;

  return RegisterBankInfo::copyCost(Dst, Src, Size);
}

unsigned AMDGPURegisterBankInfo::getBreakDownCost(
  const ValueMapping &ValMapping,
  const RegisterBank *CurBank) const {
  // Check if this is a breakdown for G_LOAD to move the pointer from SGPR to
  // VGPR.
  // FIXME: Is there a better way to do this?
  if (ValMapping.NumBreakDowns >= 2 || ValMapping.BreakDown[0].Length >= 64)
    return 10; // This is expensive.

  assert(ValMapping.NumBreakDowns == 2 &&
         ValMapping.BreakDown[0].Length == 32 &&
         ValMapping.BreakDown[0].StartIdx == 0 &&
         ValMapping.BreakDown[1].Length == 32 &&
         ValMapping.BreakDown[1].StartIdx == 32 &&
         ValMapping.BreakDown[0].RegBank == ValMapping.BreakDown[1].RegBank);

  // 32-bit extract of a 64-bit value is just access of a subregister, so free.
  // TODO: Cost of 0 hits assert, though it's not clear it's what we really
  // want.

  // TODO: 32-bit insert to a 64-bit SGPR may incur a non-free copy due to SGPR
  // alignment restrictions, but this probably isn't important.
  return 1;
}

const RegisterBank &
AMDGPURegisterBankInfo::getRegBankFromRegClass(const TargetRegisterClass &RC,
                                               LLT Ty) const {
  if (&RC == &AMDGPU::SReg_1RegClass)
    return AMDGPU::VCCRegBank;

  // We promote real scalar booleans to SReg_32. Any SGPR using s1 is really a
  // VCC-like use.
  if (TRI->isSGPRClass(&RC)) {
    // FIXME: This probably came from a copy from a physical register, which
    // should be inferable from the copied to-type. We don't have many boolean
    // physical register constraints so just assume a normal SGPR for now.
    if (!Ty.isValid())
      return AMDGPU::SGPRRegBank;

    return Ty == LLT::scalar(1) ? AMDGPU::VCCRegBank : AMDGPU::SGPRRegBank;
  }

  return TRI->isAGPRClass(&RC) ? AMDGPU::AGPRRegBank : AMDGPU::VGPRRegBank;
}

template <unsigned NumOps>
RegisterBankInfo::InstructionMappings
AMDGPURegisterBankInfo::addMappingFromTable(
    const MachineInstr &MI, const MachineRegisterInfo &MRI,
    const std::array<unsigned, NumOps> RegSrcOpIdx,
    ArrayRef<OpRegBankEntry<NumOps>> Table) const {

  InstructionMappings AltMappings;

  SmallVector<const ValueMapping *, 10> Operands(MI.getNumOperands());

  unsigned Sizes[NumOps];
  for (unsigned I = 0; I < NumOps; ++I) {
    Register Reg = MI.getOperand(RegSrcOpIdx[I]).getReg();
    Sizes[I] = getSizeInBits(Reg, MRI, *TRI);
  }

  for (unsigned I = 0, E = MI.getNumExplicitDefs(); I != E; ++I) {
    unsigned SizeI = getSizeInBits(MI.getOperand(I).getReg(), MRI, *TRI);
    Operands[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SizeI);
  }

  // getInstrMapping's default mapping uses ID 1, so start at 2.
  unsigned MappingID = 2;
  for (const auto &Entry : Table) {
    for (unsigned I = 0; I < NumOps; ++I) {
      int OpIdx = RegSrcOpIdx[I];
      Operands[OpIdx] = AMDGPU::getValueMapping(Entry.RegBanks[I], Sizes[I]);
    }

    AltMappings.push_back(&getInstructionMapping(MappingID++, Entry.Cost,
                                                 getOperandsMapping(Operands),
                                                 Operands.size()));
  }

  return AltMappings;
}

RegisterBankInfo::InstructionMappings
AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsic(
    const MachineInstr &MI, const MachineRegisterInfo &MRI) const {
  switch (MI.getIntrinsicID()) {
  case Intrinsic::amdgcn_readlane: {
    static const OpRegBankEntry<3> Table[2] = {
      // Perfectly legal.
      { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },

      // Need a readfirstlane for the index.
      { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 }
    };

    const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 2, 3 } };
    return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
  }
  case Intrinsic::amdgcn_writelane: {
    static const OpRegBankEntry<4> Table[4] = {
      // Perfectly legal.
      { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },

      // Need readfirstlane of first op
      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 },

      // Need readfirstlane of second op
      { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 },

      // Need readfirstlane of both ops
      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 3 }
    };

    // dst, value, lane select, old value
    const std::array<unsigned, 4> RegSrcOpIdx = { { 0, 2, 3, 4 } };
    return addMappingFromTable<4>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
  }
  default:
    return RegisterBankInfo::getInstrAlternativeMappings(MI);
  }
}

RegisterBankInfo::InstructionMappings
AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsicWSideEffects(
    const MachineInstr &MI, const MachineRegisterInfo &MRI) const {

  switch (MI.getIntrinsicID()) {
  case Intrinsic::amdgcn_s_buffer_load: {
    static const OpRegBankEntry<2> Table[4] = {
      // Perfectly legal.
      { { AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },

      // Only need 1 register in loop
      { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 300 },

      // Have to waterfall the resource.
      { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1000 },

      // Have to waterfall the resource, and the offset.
      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1500 }
    };

    // rsrc, offset
    const std::array<unsigned, 2> RegSrcOpIdx = { { 2, 3 } };
    return addMappingFromTable<2>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
  }
  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap: {
    // VGPR = M0, VGPR
    static const OpRegBankEntry<3> Table[2] = {
      // Perfectly legal.
      { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },

      // Need a readfirstlane for m0
      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 }
    };

    const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 2, 3 } };
    return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
  }
  case Intrinsic::amdgcn_s_sendmsg:
  case Intrinsic::amdgcn_s_sendmsghalt: {
    // FIXME: Should have no register for immediate
    static const OpRegBankEntry<1> Table[2] = {
      // Perfectly legal.
      { { AMDGPU::SGPRRegBankID }, 1 },

      // Need readlane
      { { AMDGPU::VGPRRegBankID }, 3 }
    };

    const std::array<unsigned, 1> RegSrcOpIdx = { { 2 } };
    return addMappingFromTable<1>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
  }
  default:
    return RegisterBankInfo::getInstrAlternativeMappings(MI);
  }
}

static bool memOpHasNoClobbered(const MachineMemOperand *MMO) {
  const Instruction *I = dyn_cast_or_null<Instruction>(MMO->getValue());
  return I && I->getMetadata("amdgpu.noclobber");
}

// FIXME: Returns uniform if there's no source value information. This is
// probably wrong.
static bool isScalarLoadLegal(const MachineInstr &MI) {
  if (!MI.hasOneMemOperand())
    return false;

  const MachineMemOperand *MMO = *MI.memoperands_begin();
  const unsigned AS = MMO->getAddrSpace();
  const bool IsConst = AS == AMDGPUAS::CONSTANT_ADDRESS ||
                       AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT;
  // Require 4-byte alignment.
  return MMO->getAlign() >= Align(4) &&
         // Can't do a scalar atomic load.
         !MMO->isAtomic() &&
         // Don't use scalar loads for volatile accesses to non-constant address
         // spaces.
         (IsConst || !MMO->isVolatile()) &&
         // Memory must be known constant, or not written before this load.
         (IsConst || MMO->isInvariant() || memOpHasNoClobbered(MMO)) &&
         AMDGPUInstrInfo::isUniformMMO(MMO);
}

RegisterBankInfo::InstructionMappings
AMDGPURegisterBankInfo::getInstrAlternativeMappings(
    const MachineInstr &MI) const {

  const MachineFunction &MF = *MI.getParent()->getParent();
  const MachineRegisterInfo &MRI = MF.getRegInfo();

  InstructionMappings AltMappings;
  switch (MI.getOpcode()) {
  case TargetOpcode::G_CONSTANT: {
    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
    if (Size == 1) {
      static const OpRegBankEntry<1> Table[3] = {
        { { AMDGPU::VGPRRegBankID }, 1 },
        { { AMDGPU::SGPRRegBankID }, 1 },
        { { AMDGPU::VCCRegBankID }, 1 }
      };

      return addMappingFromTable<1>(MI, MRI, {{ 0 }}, Table);
    }

    LLVM_FALLTHROUGH;
  }
  case TargetOpcode::G_FCONSTANT:
  case TargetOpcode::G_FRAME_INDEX:
  case TargetOpcode::G_GLOBAL_VALUE: {
    static const OpRegBankEntry<1> Table[2] = {
      { { AMDGPU::VGPRRegBankID }, 1 },
      { { AMDGPU::SGPRRegBankID }, 1 }
    };

    return addMappingFromTable<1>(MI, MRI, {{ 0 }}, Table);
  }
  case TargetOpcode::G_AND:
  case TargetOpcode::G_OR:
  case TargetOpcode::G_XOR: {
    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);

    if (Size == 1) {
      // s_{and|or|xor}_b32 set scc when the result of the 32-bit op is not 0.
      const InstructionMapping &SCCMapping = getInstructionMapping(
        1, 1, getOperandsMapping(
          {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32),
           AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32),
           AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32)}),
        3); // Num Operands
      AltMappings.push_back(&SCCMapping);

      const InstructionMapping &VCCMapping0 = getInstructionMapping(
        2, 1, getOperandsMapping(
          {AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size),
           AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size),
           AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size)}),
        3); // Num Operands
      AltMappings.push_back(&VCCMapping0);
      return AltMappings;
    }

    if (Size != 64)
      break;

    const InstructionMapping &SSMapping = getInstructionMapping(
      1, 1, getOperandsMapping(
        {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
      3); // Num Operands
    AltMappings.push_back(&SSMapping);

    const InstructionMapping &VVMapping = getInstructionMapping(
      2, 2, getOperandsMapping(
        {AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
         AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
         AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}),
      3); // Num Operands
    AltMappings.push_back(&VVMapping);
    break;
  }
  case TargetOpcode::G_LOAD:
  case TargetOpcode::G_ZEXTLOAD:
  case TargetOpcode::G_SEXTLOAD: {
    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
    LLT PtrTy = MRI.getType(MI.getOperand(1).getReg());
    unsigned PtrSize = PtrTy.getSizeInBits();
    unsigned AS = PtrTy.getAddressSpace();

    if ((AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::REGION_ADDRESS &&
         AS != AMDGPUAS::PRIVATE_ADDRESS) &&
        isScalarLoadLegal(MI)) {
      const InstructionMapping &SSMapping = getInstructionMapping(
          1, 1, getOperandsMapping(
                    {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
                     AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize)}),
          2); // Num Operands
      AltMappings.push_back(&SSMapping);
    }

    const InstructionMapping &VVMapping = getInstructionMapping(
        2, 1,
        getOperandsMapping(
            {AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
             AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize)}),
        2); // Num Operands
    AltMappings.push_back(&VVMapping);

    // It may be possible to have a vgpr = load sgpr mapping here, because
    // the mubuf instructions support this kind of load, but probably for only
    // gfx7 and older.  However, the addressing mode matching in the instruction
    // selector should be able to do a better job of detecting and selecting
    // these kinds of loads from the vgpr = load vgpr mapping.

    return AltMappings;
  }
  case TargetOpcode::G_SELECT: {
    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
    const InstructionMapping &SSMapping = getInstructionMapping(1, 1,
      getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1),
                          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
      4); // Num Operands
    AltMappings.push_back(&SSMapping);

    const InstructionMapping &VVMapping = getInstructionMapping(2, 1,
      getOperandsMapping({AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
                          AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}),
      4); // Num Operands
    AltMappings.push_back(&VVMapping);

    return AltMappings;
  }
  case TargetOpcode::G_UADDE:
  case TargetOpcode::G_USUBE:
  case TargetOpcode::G_SADDE:
  case TargetOpcode::G_SSUBE: {
    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
    const InstructionMapping &SSMapping = getInstructionMapping(1, 1,
      getOperandsMapping(
        {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1),
         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1)}),
      5); // Num Operands
    AltMappings.push_back(&SSMapping);

    const InstructionMapping &VVMapping = getInstructionMapping(2, 1,
      getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
                          AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1)}),
      5); // Num Operands
    AltMappings.push_back(&VVMapping);
    return AltMappings;
  }
  case AMDGPU::G_BRCOND: {
    assert(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1);

    // TODO: Change type to 32 for scalar
    const InstructionMapping &SMapping = getInstructionMapping(
      1, 1, getOperandsMapping(
        {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1), nullptr}),
      2); // Num Operands
    AltMappings.push_back(&SMapping);

    const InstructionMapping &VMapping = getInstructionMapping(
      1, 1, getOperandsMapping(
        {AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1), nullptr}),
      2); // Num Operands
    AltMappings.push_back(&VMapping);
    return AltMappings;
  }
  case AMDGPU::G_INTRINSIC:
    return getInstrAlternativeMappingsIntrinsic(MI, MRI);
  case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS:
    return getInstrAlternativeMappingsIntrinsicWSideEffects(MI, MRI);
  default:
    break;
  }
  return RegisterBankInfo::getInstrAlternativeMappings(MI);
}

void AMDGPURegisterBankInfo::split64BitValueForMapping(
  MachineIRBuilder &B,
  SmallVector<Register, 2> &Regs,
  LLT HalfTy,
  Register Reg) const {
  assert(HalfTy.getSizeInBits() == 32);
  MachineRegisterInfo *MRI = B.getMRI();
  Register LoLHS = MRI->createGenericVirtualRegister(HalfTy);
  Register HiLHS = MRI->createGenericVirtualRegister(HalfTy);
  const RegisterBank *Bank = getRegBank(Reg, *MRI, *TRI);
  MRI->setRegBank(LoLHS, *Bank);
  MRI->setRegBank(HiLHS, *Bank);

  Regs.push_back(LoLHS);
  Regs.push_back(HiLHS);

  B.buildInstr(AMDGPU::G_UNMERGE_VALUES)
    .addDef(LoLHS)
    .addDef(HiLHS)
    .addUse(Reg);
}

/// Replace the current type each register in \p Regs has with \p NewTy.
static void setRegsToType(MachineRegisterInfo &MRI, ArrayRef<Register> Regs,
                          LLT NewTy) {
  for (Register Reg : Regs) {
    assert(MRI.getType(Reg).getSizeInBits() == NewTy.getSizeInBits());
    MRI.setType(Reg, NewTy);
  }
}

static LLT getHalfSizedType(LLT Ty) {
  if (Ty.isVector()) {
    assert(Ty.getElementCount().isKnownMultipleOf(2));
    return LLT::scalarOrVector(Ty.getElementCount().divideCoefficientBy(2),
                               Ty.getElementType());
  }

  assert(Ty.getScalarSizeInBits() % 2 == 0);
  return LLT::scalar(Ty.getScalarSizeInBits() / 2);
}

/// Legalize instruction \p MI where operands in \p OpIndices must be SGPRs. If
/// any of the required SGPR operands are VGPRs, perform a waterfall loop to
/// execute the instruction for each unique combination of values in all lanes
/// in the wave. The block will be split such that the rest of the instructions
/// are moved to a new block.
///
/// Essentially performs this loop:
///
/// Save Execution Mask
/// For (Lane : Wavefront) {
///   Enable Lane, Disable all other lanes
///   SGPR = read SGPR value for current lane from VGPR
///   VGPRResult[Lane] = use_op SGPR
/// }
/// Restore Execution Mask
///
/// There is additional complexity: the values read are compared against all
/// lanes, so each unique value is processed only once.
bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
  MachineIRBuilder &B,
  iterator_range<MachineBasicBlock::iterator> Range,
  SmallSet<Register, 4> &SGPROperandRegs,
  MachineRegisterInfo &MRI) const {
  SmallVector<Register, 4> ResultRegs;
  SmallVector<Register, 4> InitResultRegs;
  SmallVector<Register, 4> PhiRegs;

  // Track use registers which have already been expanded with a readfirstlane
  // sequence. This may have multiple uses if moving a sequence.
  DenseMap<Register, Register> WaterfalledRegMap;

  MachineBasicBlock &MBB = B.getMBB();
  MachineFunction *MF = &B.getMF();

  const TargetRegisterClass *WaveRC = TRI->getWaveMaskRegClass();
  const unsigned WaveAndOpc = Subtarget.isWave32() ?
    AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
  const unsigned MovTermOpc = Subtarget.isWave32() ?
    AMDGPU::S_MOV_B32_term : AMDGPU::S_MOV_B64_term;
  const unsigned XorTermOpc = Subtarget.isWave32() ?
    AMDGPU::S_XOR_B32_term : AMDGPU::S_XOR_B64_term;
  const unsigned AndSaveExecOpc = Subtarget.isWave32() ?
    AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64;
  const unsigned ExecReg = Subtarget.isWave32() ?
    AMDGPU::EXEC_LO : AMDGPU::EXEC;

#ifndef NDEBUG
  const int OrigRangeSize = std::distance(Range.begin(), Range.end());
#endif

  for (MachineInstr &MI : Range) {
    for (MachineOperand &Def : MI.defs()) {
      if (MRI.use_nodbg_empty(Def.getReg()))
        continue;

      LLT ResTy = MRI.getType(Def.getReg());
      const RegisterBank *DefBank = getRegBank(Def.getReg(), MRI, *TRI);
      ResultRegs.push_back(Def.getReg());
      Register InitReg = B.buildUndef(ResTy).getReg(0);
      Register PhiReg = MRI.createGenericVirtualRegister(ResTy);
      InitResultRegs.push_back(InitReg);
      PhiRegs.push_back(PhiReg);
      MRI.setRegBank(PhiReg, *DefBank);
      MRI.setRegBank(InitReg, *DefBank);
    }
  }

  Register SaveExecReg = MRI.createVirtualRegister(WaveRC);
  Register InitSaveExecReg = MRI.createVirtualRegister(WaveRC);

  // Don't bother using generic instructions/registers for the exec mask.
  B.buildInstr(TargetOpcode::IMPLICIT_DEF)
    .addDef(InitSaveExecReg);

  Register PhiExec = MRI.createVirtualRegister(WaveRC);
  Register NewExec = MRI.createVirtualRegister(WaveRC);

  // To insert the loop we need to split the block. Move everything before this
  // point to a new block, and insert a new empty block before this instruction.
  MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
  MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
  MachineBasicBlock *RestoreExecBB = MF->CreateMachineBasicBlock();
  MachineFunction::iterator MBBI(MBB);
  ++MBBI;
  MF->insert(MBBI, LoopBB);
  MF->insert(MBBI, RestoreExecBB);
  MF->insert(MBBI, RemainderBB);

  LoopBB->addSuccessor(RestoreExecBB);
  LoopBB->addSuccessor(LoopBB);

  // Move the rest of the block into a new block.
  RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
  RemainderBB->splice(RemainderBB->begin(), &MBB, Range.end(), MBB.end());

  MBB.addSuccessor(LoopBB);
  RestoreExecBB->addSuccessor(RemainderBB);

  B.setInsertPt(*LoopBB, LoopBB->end());

  B.buildInstr(TargetOpcode::PHI)
    .addDef(PhiExec)
    .addReg(InitSaveExecReg)
    .addMBB(&MBB)
    .addReg(NewExec)
    .addMBB(LoopBB);

  for (auto Result : zip(InitResultRegs, ResultRegs, PhiRegs)) {
    B.buildInstr(TargetOpcode::G_PHI)
      .addDef(std::get<2>(Result))
      .addReg(std::get<0>(Result)) // Initial value / implicit_def
      .addMBB(&MBB)
      .addReg(std::get<1>(Result)) // Mid-loop value.
      .addMBB(LoopBB);
  }

  const DebugLoc &DL = B.getDL();

  MachineInstr &FirstInst = *Range.begin();

  // Move the instruction into the loop. Note we moved everything after
  // Range.end() already into a new block, so Range.end() is no longer valid.
  LoopBB->splice(LoopBB->end(), &MBB, Range.begin(), MBB.end());

  // Figure out the iterator range after splicing the instructions.
  MachineBasicBlock::iterator NewBegin = FirstInst.getIterator();
  auto NewEnd = LoopBB->end();

  MachineBasicBlock::iterator I = Range.begin();
  B.setInsertPt(*LoopBB, I);

  Register CondReg;

  assert(std::distance(NewBegin, NewEnd) == OrigRangeSize);

  for (MachineInstr &MI : make_range(NewBegin, NewEnd)) {
    for (MachineOperand &Op : MI.uses()) {
      if (!Op.isReg() || Op.isDef())
        continue;

      Register OldReg = Op.getReg();
      if (!SGPROperandRegs.count(OldReg))
        continue;

      // See if we already processed this register in another instruction in
      // the sequence.
      auto OldVal = WaterfalledRegMap.find(OldReg);
      if (OldVal != WaterfalledRegMap.end()) {
        Op.setReg(OldVal->second);
        continue;
      }

      Register OpReg = Op.getReg();
      LLT OpTy = MRI.getType(OpReg);

      const RegisterBank *OpBank = getRegBank(OpReg, MRI, *TRI);
      if (OpBank != &AMDGPU::VGPRRegBank) {
        // Insert copy from AGPR to VGPR before the loop.
        B.setMBB(MBB);
        OpReg = B.buildCopy(OpTy, OpReg).getReg(0);
        MRI.setRegBank(OpReg, AMDGPU::VGPRRegBank);
        B.setInstr(*I);
      }

      unsigned OpSize = OpTy.getSizeInBits();

      // Can only do a readlane of 32-bit pieces.
      if (OpSize == 32) {
        // Avoid extra copies in the simple case of one 32-bit register.
        Register CurrentLaneOpReg
          = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
        MRI.setType(CurrentLaneOpReg, OpTy);

        constrainGenericRegister(OpReg, AMDGPU::VGPR_32RegClass, MRI);
        // Read the next variant <- also loop target.
        BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
                CurrentLaneOpReg)
          .addReg(OpReg);

        Register NewCondReg = MRI.createVirtualRegister(WaveRC);
        bool First = CondReg == AMDGPU::NoRegister;
        if (First)
          CondReg = NewCondReg;

        // Compare the just read M0 value to all possible Idx values.
        B.buildInstr(AMDGPU::V_CMP_EQ_U32_e64)
          .addDef(NewCondReg)
          .addReg(CurrentLaneOpReg)
          .addReg(OpReg);
        Op.setReg(CurrentLaneOpReg);

        if (!First) {
          Register AndReg = MRI.createVirtualRegister(WaveRC);

          // If there are multiple operands to consider, AND the conditions
          // together.
          B.buildInstr(WaveAndOpc)
            .addDef(AndReg)
            .addReg(NewCondReg)
            .addReg(CondReg);
          CondReg = AndReg;
        }
      } else {
        LLT S32 = LLT::scalar(32);
        SmallVector<Register, 8> ReadlanePieces;

        // The compares can be done as 64-bit, but the extract needs to be done
        // in 32-bit pieces.

        bool Is64 = OpSize % 64 == 0;

        LLT UnmergeTy = Is64 ? LLT::scalar(64) : LLT::scalar(32);
        unsigned CmpOp =
            Is64 ? AMDGPU::V_CMP_EQ_U64_e64 : AMDGPU::V_CMP_EQ_U32_e64;

        // Insert the unmerge before the loop.
        B.setMBB(MBB);
        auto Unmerge = B.buildUnmerge(UnmergeTy, OpReg);
        B.setInstr(*I);

        unsigned NumPieces = Unmerge->getNumOperands() - 1;
        for (unsigned PieceIdx = 0; PieceIdx != NumPieces; ++PieceIdx) {
          Register UnmergePiece = Unmerge.getReg(PieceIdx);

          Register CurrentLaneOpReg;
          if (Is64) {
            Register CurrentLaneOpRegLo = MRI.createGenericVirtualRegister(S32);
            Register CurrentLaneOpRegHi = MRI.createGenericVirtualRegister(S32);

            MRI.setRegClass(UnmergePiece, &AMDGPU::VReg_64RegClass);
            MRI.setRegClass(CurrentLaneOpRegLo, &AMDGPU::SReg_32_XM0RegClass);
            MRI.setRegClass(CurrentLaneOpRegHi, &AMDGPU::SReg_32_XM0RegClass);

            // Read the next variant <- also loop target.
            BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
                    CurrentLaneOpRegLo)
              .addReg(UnmergePiece, 0, AMDGPU::sub0);

            // Read the next variant <- also loop target.
            BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
                    CurrentLaneOpRegHi)
              .addReg(UnmergePiece, 0, AMDGPU::sub1);

            CurrentLaneOpReg =
              B.buildMerge(LLT::scalar(64),
                           {CurrentLaneOpRegLo, CurrentLaneOpRegHi})
              .getReg(0);

            MRI.setRegClass(CurrentLaneOpReg, &AMDGPU::SReg_64_XEXECRegClass);

            if (OpTy.getScalarSizeInBits() == 64) {
              // If we need to produce a 64-bit element vector, use the merged
              // pieces.
              ReadlanePieces.push_back(CurrentLaneOpReg);
            } else {
              // 32-bit element type.
              ReadlanePieces.push_back(CurrentLaneOpRegLo);
              ReadlanePieces.push_back(CurrentLaneOpRegHi);
            }
          } else {
            CurrentLaneOpReg = MRI.createGenericVirtualRegister(S32);
            MRI.setRegClass(UnmergePiece, &AMDGPU::VGPR_32RegClass);
            MRI.setRegClass(CurrentLaneOpReg, &AMDGPU::SReg_32_XM0RegClass);

            // Read the next variant <- also loop target.
            BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
                    CurrentLaneOpReg)
              .addReg(UnmergePiece);
            ReadlanePieces.push_back(CurrentLaneOpReg);
          }

          Register NewCondReg = MRI.createVirtualRegister(WaveRC);
          bool First = CondReg == AMDGPU::NoRegister;
          if (First)
            CondReg = NewCondReg;

          B.buildInstr(CmpOp)
            .addDef(NewCondReg)
            .addReg(CurrentLaneOpReg)
            .addReg(UnmergePiece);

          if (!First) {
            Register AndReg = MRI.createVirtualRegister(WaveRC);

            // If there are multiple operands to consider, AND the conditions
            // together.
            B.buildInstr(WaveAndOpc)
              .addDef(AndReg)
              .addReg(NewCondReg)
              .addReg(CondReg);
            CondReg = AndReg;
          }
        }

        // FIXME: Build merge seems to switch to CONCAT_VECTORS but not
        // BUILD_VECTOR.
        if (OpTy.isVector()) {
          auto Merge = B.buildBuildVector(OpTy, ReadlanePieces);
          Op.setReg(Merge.getReg(0));
        } else {
          auto Merge = B.buildMerge(OpTy, ReadlanePieces);
          Op.setReg(Merge.getReg(0));
        }

        MRI.setRegBank(Op.getReg(), AMDGPU::SGPRRegBank);
      }

      // Make sure we don't re-process this register again.
      WaterfalledRegMap.insert(std::make_pair(OldReg, Op.getReg()));
    }
  }

  B.setInsertPt(*LoopBB, LoopBB->end());

  // Update EXEC, save the original EXEC value to VCC.
  B.buildInstr(AndSaveExecOpc)
    .addDef(NewExec)
    .addReg(CondReg, RegState::Kill);

  MRI.setSimpleHint(NewExec, CondReg);

  // Update EXEC, switch all done bits to 0 and all todo bits to 1.
  B.buildInstr(XorTermOpc)
    .addDef(ExecReg)
    .addReg(ExecReg)
    .addReg(NewExec);

  // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
  // s_cbranch_scc0?

  // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
  B.buildInstr(AMDGPU::S_CBRANCH_EXECNZ)
    .addMBB(LoopBB);

  // Save the EXEC mask before the loop.
  BuildMI(MBB, MBB.end(), DL, TII->get(MovTermOpc), SaveExecReg)
    .addReg(ExecReg);

  // Restore the EXEC mask after the loop.
  B.setMBB(*RestoreExecBB);
  B.buildInstr(MovTermOpc)
    .addDef(ExecReg)
    .addReg(SaveExecReg);

  // Set the insert point after the original instruction, so any new
  // instructions will be in the remainder.
  B.setInsertPt(*RemainderBB, RemainderBB->begin());

  return true;
}

// Return any unique registers used by \p MI at \p OpIndices that need to be
// handled in a waterfall loop. Returns these registers in \p
// SGPROperandRegs. Returns true if there are any operands to handle and a
// waterfall loop is necessary.
bool AMDGPURegisterBankInfo::collectWaterfallOperands(
  SmallSet<Register, 4> &SGPROperandRegs, MachineInstr &MI,
  MachineRegisterInfo &MRI, ArrayRef<unsigned> OpIndices) const {
  for (unsigned Op : OpIndices) {
    assert(MI.getOperand(Op).isUse());
    Register Reg = MI.getOperand(Op).getReg();
    const RegisterBank *OpBank = getRegBank(Reg, MRI, *TRI);
    if (OpBank->getID() != AMDGPU::SGPRRegBankID)
      SGPROperandRegs.insert(Reg);
  }

  // No operands need to be replaced, so no need to loop.
  return !SGPROperandRegs.empty();
}

bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
  MachineIRBuilder &B, MachineInstr &MI, MachineRegisterInfo &MRI,
  ArrayRef<unsigned> OpIndices) const {
  // Use a set to avoid extra readfirstlanes in the case where multiple operands
  // are the same register.
  SmallSet<Register, 4> SGPROperandRegs;

  if (!collectWaterfallOperands(SGPROperandRegs, MI, MRI, OpIndices))
    return false;

  MachineBasicBlock::iterator I = MI.getIterator();
  return executeInWaterfallLoop(B, make_range(I, std::next(I)),
                                SGPROperandRegs, MRI);
}

bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  ArrayRef<unsigned> OpIndices) const {
  MachineIRBuilder B(MI);
  return executeInWaterfallLoop(B, MI, MRI, OpIndices);
}

// Legalize an operand that must be an SGPR by inserting a readfirstlane.
void AMDGPURegisterBankInfo::constrainOpWithReadfirstlane(
    MachineInstr &MI, MachineRegisterInfo &MRI, unsigned OpIdx) const {
  Register Reg = MI.getOperand(OpIdx).getReg();
  const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
  if (Bank == &AMDGPU::SGPRRegBank)
    return;

  LLT Ty = MRI.getType(Reg);
  MachineIRBuilder B(MI);

  if (Bank != &AMDGPU::VGPRRegBank) {
    // We need to copy from AGPR to VGPR.
    Reg = B.buildCopy(Ty, Reg).getReg(0);
    MRI.setRegBank(Reg, AMDGPU::VGPRRegBank);
  }

  Register SGPR = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
  B.buildInstr(AMDGPU::V_READFIRSTLANE_B32)
    .addDef(SGPR)
    .addReg(Reg);

  MRI.setType(SGPR, Ty);

  const TargetRegisterClass *Constrained =
      constrainGenericRegister(Reg, AMDGPU::VGPR_32RegClass, MRI);
  (void)Constrained;
  assert(Constrained && "Failed to constrain readfirstlane src reg");

  MI.getOperand(OpIdx).setReg(SGPR);
}

/// Split \p Ty into 2 pieces. The first will have \p FirstSize bits; the
/// second will have the remaining bits.
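/// e.g. splitUnequalType(LLT::fixed_vector(3, 32), 64) returns
/// {LLT::fixed_vector(2, 32), LLT::scalar(32)}.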
static std::pair<LLT, LLT> splitUnequalType(LLT Ty, unsigned FirstSize) {
  unsigned TotalSize = Ty.getSizeInBits();
  if (!Ty.isVector())
    return {LLT::scalar(FirstSize), LLT::scalar(TotalSize - FirstSize)};

  LLT EltTy = Ty.getElementType();
  unsigned EltSize = EltTy.getSizeInBits();
  assert(FirstSize % EltSize == 0);

  unsigned FirstPartNumElts = FirstSize / EltSize;
  unsigned RemainderElts = (TotalSize - FirstSize) / EltSize;

  return {LLT::scalarOrVector(ElementCount::getFixed(FirstPartNumElts), EltTy),
          LLT::scalarOrVector(ElementCount::getFixed(RemainderElts), EltTy)};
}

static LLT widen96To128(LLT Ty) {
  if (!Ty.isVector())
    return LLT::scalar(128);

  LLT EltTy = Ty.getElementType();
  assert(128 % EltTy.getSizeInBits() == 0);
  return LLT::fixed_vector(128 / EltTy.getSizeInBits(), EltTy);
}

bool AMDGPURegisterBankInfo::applyMappingLoad(MachineInstr &MI,
                        const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
                                              MachineRegisterInfo &MRI) const {
  Register DstReg = MI.getOperand(0).getReg();
  const LLT LoadTy = MRI.getType(DstReg);
  unsigned LoadSize = LoadTy.getSizeInBits();
  const unsigned MaxNonSmrdLoadSize = 128;

  const RegisterBank *DstBank =
      OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
  if (DstBank == &AMDGPU::SGPRRegBank) {
    // There are some special cases that we need to look at for 32 bit and 96
    // bit SGPR loads; otherwise we have nothing to do.
    if (LoadSize != 32 && LoadSize != 96)
      return false;

    MachineMemOperand *MMO = *MI.memoperands_begin();
    const unsigned MemSize = 8 * MMO->getSize();
    // Scalar loads of size 8 or 16 bit with proper alignment may be widened to
    // 32 bit. Check to see if we need to widen the memory access: 8 or 16 bit
    // scalar loads should have a load size of 32 but a memory access size of
    // less than 32.
    if (LoadSize == 32 &&
        (MemSize == 32 || LoadTy.isVector() || !isScalarLoadLegal(MI)))
      return false;

    Register PtrReg = MI.getOperand(1).getReg();

    ApplyRegBankMapping O(*this, MRI, &AMDGPU::SGPRRegBank);
    MachineIRBuilder B(MI, O);

    if (LoadSize == 32) {
      // This is an extending load from a sub-dword size. Widen the memory
      // access size to 4 bytes and clear the extra high bits appropriately.
      const LLT S32 = LLT::scalar(32);
      if (MI.getOpcode() == AMDGPU::G_SEXTLOAD) {
        // Must extend the sign bit into higher bits for a G_SEXTLOAD.
        auto WideLoad = B.buildLoadFromOffset(S32, PtrReg, *MMO, 0);
        B.buildSExtInReg(MI.getOperand(0), WideLoad, MemSize);
      } else if (MI.getOpcode() == AMDGPU::G_ZEXTLOAD) {
        // Must extend zero into higher bits with an AND for a G_ZEXTLOAD.
        auto WideLoad = B.buildLoadFromOffset(S32, PtrReg, *MMO, 0);
        B.buildZExtInReg(MI.getOperand(0), WideLoad, MemSize);
      } else
        // We do not need to touch the higher bits for regular loads.
        B.buildLoadFromOffset(MI.getOperand(0), PtrReg, *MMO, 0);
    } else {
      // 96-bit loads are only available for vector loads. We need to split
      // this into a 64-bit part and a 32-bit part (unless we can widen it to a
      // 128-bit load).
      if (MMO->getAlign() < Align(16)) {
        LLT Part64, Part32;
        std::tie(Part64, Part32) = splitUnequalType(LoadTy, 64);
        auto Load0 = B.buildLoadFromOffset(Part64, PtrReg, *MMO, 0);
        auto Load1 = B.buildLoadFromOffset(Part32, PtrReg, *MMO, 8);

        auto Undef = B.buildUndef(LoadTy);
        auto Ins0 = B.buildInsert(LoadTy, Undef, Load0, 0);
        B.buildInsert(MI.getOperand(0), Ins0, Load1, 64);
      } else {
        LLT WiderTy = widen96To128(LoadTy);
        auto WideLoad = B.buildLoadFromOffset(WiderTy, PtrReg, *MMO, 0);
        B.buildExtract(MI.getOperand(0), WideLoad, 0);
      }
    }

    MI.eraseFromParent();
    return true;
  }

  // 128-bit loads are supported for all instruction types.
  if (LoadSize <= MaxNonSmrdLoadSize)
    return false;

  SmallVector<Register, 16> DefRegs(OpdMapper.getVRegs(0));
  SmallVector<Register, 1> SrcRegs(OpdMapper.getVRegs(1));

  if (SrcRegs.empty())
    SrcRegs.push_back(MI.getOperand(1).getReg());

  assert(LoadSize % MaxNonSmrdLoadSize == 0);

  // RegBankSelect only emits scalar types, so we need to reset the pointer
  // operand to a pointer type.
  Register BasePtrReg = SrcRegs[0];
  LLT PtrTy = MRI.getType(MI.getOperand(1).getReg());
  MRI.setType(BasePtrReg, PtrTy);

  unsigned NumSplitParts = LoadTy.getSizeInBits() / MaxNonSmrdLoadSize;
  const LLT LoadSplitTy = LoadTy.divide(NumSplitParts);
  ApplyRegBankMapping Observer(*this, MRI, &AMDGPU::VGPRRegBank);
  MachineIRBuilder B(MI, Observer);
  LegalizerHelper Helper(B.getMF(), Observer, B);

  if (LoadTy.isVector()) {
    if (Helper.fewerElementsVector(MI, 0, LoadSplitTy) !=
        LegalizerHelper::Legalized)
      return false;
  } else {
    if (Helper.narrowScalar(MI, 0, LoadSplitTy) != LegalizerHelper::Legalized)
      return false;
  }

  MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
  return true;
}

bool AMDGPURegisterBankInfo::applyMappingDynStackAlloc(
  MachineInstr &MI,
  const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
  MachineRegisterInfo &MRI) const {
  const MachineFunction &MF = *MI.getMF();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const auto &TFI = *ST.getFrameLowering();

  // Guard in case the stack growth direction ever changes with scratch
  // instructions.
  if (TFI.getStackGrowthDirection() == TargetFrameLowering::StackGrowsDown)
    return false;

  Register Dst = MI.getOperand(0).getReg();
  Register AllocSize = MI.getOperand(1).getReg();
  Align Alignment = assumeAligned(MI.getOperand(2).getImm());

  const RegisterBank *SizeBank = getRegBank(AllocSize, MRI, *TRI);

  // TODO: Need to emit a wave reduction to get the maximum size.
  if (SizeBank != &AMDGPU::SGPRRegBank)
    return false;

  LLT PtrTy = MRI.getType(Dst);
  LLT IntPtrTy = LLT::scalar(PtrTy.getSizeInBits());

  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  Register SPReg = Info->getStackPtrOffsetReg();
  ApplyRegBankMapping ApplyBank(*this, MRI, &AMDGPU::SGPRRegBank);
  MachineIRBuilder B(MI, ApplyBank);

  auto WaveSize = B.buildConstant(LLT::scalar(32), ST.getWavefrontSizeLog2());
  auto ScaledSize = B.buildShl(IntPtrTy, AllocSize, WaveSize);

  auto SPCopy = B.buildCopy(PtrTy, SPReg);
  if (Alignment > TFI.getStackAlign()) {
    auto PtrAdd = B.buildPtrAdd(PtrTy, SPCopy, ScaledSize);
    B.buildMaskLowPtrBits(Dst, PtrAdd,
                          Log2(Alignment) + ST.getWavefrontSizeLog2());
  } else {
    B.buildPtrAdd(Dst, SPCopy, ScaledSize);
  }

  MI.eraseFromParent();
  return true;
}

bool AMDGPURegisterBankInfo::applyMappingImage(
    MachineInstr &MI, const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
    MachineRegisterInfo &MRI, int RsrcIdx) const {
  const int NumDefs = MI.getNumExplicitDefs();

  // The reported argument index is relative to the IR intrinsic call arguments,
  // so we need to shift by the number of defs and the intrinsic ID.
  RsrcIdx += NumDefs + 1;

  // Insert copies to VGPR arguments.
  applyDefaultMapping(OpdMapper);

  // Fixup any SGPR arguments.
  SmallVector<unsigned, 4> SGPRIndexes;
  for (int I = NumDefs, NumOps = MI.getNumOperands(); I != NumOps; ++I) {
    if (!MI.getOperand(I).isReg())
      continue;

    // If this intrinsic has a sampler, it immediately follows rsrc.
    if (I == RsrcIdx || I == RsrcIdx + 1)
      SGPRIndexes.push_back(I);
  }

  executeInWaterfallLoop(MI, MRI, SGPRIndexes);
  return true;
}
1318 
1319 static Register getSrcRegIgnoringCopies(const MachineRegisterInfo &MRI,
1320                                         Register Reg) {
1321   MachineInstr *Def = getDefIgnoringCopies(Reg, MRI);
1322   if (!Def)
1323     return Reg;
1324 
1325   // TODO: Guard against this being an implicit def
1326   return Def->getOperand(0).getReg();
1327 }
1328 
1329 // Analyze a combined offset from an llvm.amdgcn.s.buffer intrinsic and store
1330 // the three offsets (voffset, soffset and instoffset).
1331 static unsigned setBufferOffsets(MachineIRBuilder &B,
1332                                  const AMDGPURegisterBankInfo &RBI,
1333                                  Register CombinedOffset, Register &VOffsetReg,
1334                                  Register &SOffsetReg, int64_t &InstOffsetVal,
1335                                  Align Alignment) {
1336   const LLT S32 = LLT::scalar(32);
1337   MachineRegisterInfo *MRI = B.getMRI();
1338 
1339   if (Optional<int64_t> Imm = getIConstantVRegSExtVal(CombinedOffset, *MRI)) {
1340     uint32_t SOffset, ImmOffset;
1341     if (AMDGPU::splitMUBUFOffset(*Imm, SOffset, ImmOffset, &RBI.Subtarget,
1342                                  Alignment)) {
1343       VOffsetReg = B.buildConstant(S32, 0).getReg(0);
1344       SOffsetReg = B.buildConstant(S32, SOffset).getReg(0);
1345       InstOffsetVal = ImmOffset;
1346 
1347       B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank);
1348       B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank);
1349       return SOffset + ImmOffset;
1350     }
1351   }
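  // Illustrative values (the exact split is subtarget-dependent): a constant
  // combined offset of 5000 could split as SOffset = 4096 and
  // InstOffsetVal = 904 (5000 = 4096 + 904), with voffset tied to a zero
  // constant.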
1352 
1353   Register Base;
1354   unsigned Offset;
1355 
1356   std::tie(Base, Offset) =
1357       AMDGPU::getBaseWithConstantOffset(*MRI, CombinedOffset);
1358 
1359   uint32_t SOffset, ImmOffset;
1360   if ((int)Offset > 0 && AMDGPU::splitMUBUFOffset(Offset, SOffset, ImmOffset,
1361                                                   &RBI.Subtarget, Alignment)) {
1362     if (RBI.getRegBank(Base, *MRI, *RBI.TRI) == &AMDGPU::VGPRRegBank) {
1363       VOffsetReg = Base;
1364       SOffsetReg = B.buildConstant(S32, SOffset).getReg(0);
1365       B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank);
1366       InstOffsetVal = ImmOffset;
1367       return 0; // XXX - Why is this 0?
1368     }
1369 
1370     // If we have SGPR base, we can use it for soffset.
1371     if (SOffset == 0) {
1372       VOffsetReg = B.buildConstant(S32, 0).getReg(0);
1373       B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank);
1374       SOffsetReg = Base;
1375       InstOffsetVal = ImmOffset;
1376       return 0; // XXX - Why is this 0?
1377     }
1378   }
1379 
1380   // Handle the variable sgpr + vgpr case.
1381   MachineInstr *Add = getOpcodeDef(AMDGPU::G_ADD, CombinedOffset, *MRI);
1382   if (Add && (int)Offset >= 0) {
1383     Register Src0 = getSrcRegIgnoringCopies(*MRI, Add->getOperand(1).getReg());
1384     Register Src1 = getSrcRegIgnoringCopies(*MRI, Add->getOperand(2).getReg());
1385 
1386     const RegisterBank *Src0Bank = RBI.getRegBank(Src0, *MRI, *RBI.TRI);
1387     const RegisterBank *Src1Bank = RBI.getRegBank(Src1, *MRI, *RBI.TRI);
1388 
1389     if (Src0Bank == &AMDGPU::VGPRRegBank && Src1Bank == &AMDGPU::SGPRRegBank) {
1390       VOffsetReg = Src0;
1391       SOffsetReg = Src1;
1392       return 0;
1393     }
1394 
1395     if (Src0Bank == &AMDGPU::SGPRRegBank && Src1Bank == &AMDGPU::VGPRRegBank) {
1396       VOffsetReg = Src1;
1397       SOffsetReg = Src0;
1398       return 0;
1399     }
1400   }
1401 
1402   // Ensure we have a VGPR for the combined offset. This could be an issue if we
1403   // have an SGPR offset and a VGPR resource.
1404   if (RBI.getRegBank(CombinedOffset, *MRI, *RBI.TRI) == &AMDGPU::VGPRRegBank) {
1405     VOffsetReg = CombinedOffset;
1406   } else {
1407     VOffsetReg = B.buildCopy(S32, CombinedOffset).getReg(0);
1408     B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank);
1409   }
1410 
1411   SOffsetReg = B.buildConstant(S32, 0).getReg(0);
1412   B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank);
1413   return 0;
1414 }
1415 
1416 bool AMDGPURegisterBankInfo::applyMappingSBufferLoad(
1417   const OperandsMapper &OpdMapper) const {
1418   MachineInstr &MI = OpdMapper.getMI();
1419   MachineRegisterInfo &MRI = OpdMapper.getMRI();
1420 
1421   const LLT S32 = LLT::scalar(32);
1422   Register Dst = MI.getOperand(0).getReg();
1423   LLT Ty = MRI.getType(Dst);
1424 
1425   const RegisterBank *RSrcBank =
1426     OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
1427   const RegisterBank *OffsetBank =
1428     OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
1429   if (RSrcBank == &AMDGPU::SGPRRegBank &&
1430       OffsetBank == &AMDGPU::SGPRRegBank)
1431     return true; // Legal mapping
1432 
1433   // FIXME: 96-bit case was widened during legalize. We need to narrow it back
1434   // here but don't have an MMO.
1435 
1436   unsigned LoadSize = Ty.getSizeInBits();
1437   int NumLoads = 1;
1438   if (LoadSize == 256 || LoadSize == 512) {
1439     NumLoads = LoadSize / 128;
1440     Ty = Ty.divide(NumLoads);
1441   }
1442 
1443   // Use the alignment to ensure that the required offsets will fit into the
1444   // immediate offsets.
1445   const Align Alignment = NumLoads > 1 ? Align(16 * NumLoads) : Align(1);
1446 
1447   MachineIRBuilder B(MI);
1448   MachineFunction &MF = B.getMF();
1449 
1450   Register SOffset;
1451   Register VOffset;
1452   int64_t ImmOffset = 0;
1453 
1454   unsigned MMOOffset = setBufferOffsets(B, *this, MI.getOperand(2).getReg(),
1455                                         VOffset, SOffset, ImmOffset, Alignment);
1456 
1457   // TODO: 96-bit loads were widened to 128-bit results. Shrink the result if we
1458   // can, but we need to track an MMO for that.
1459   const unsigned MemSize = (Ty.getSizeInBits() + 7) / 8;
1460   const Align MemAlign(4); // FIXME: ABI type alignment?
1461   MachineMemOperand *BaseMMO = MF.getMachineMemOperand(
1462     MachinePointerInfo(),
1463     MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1464     MachineMemOperand::MOInvariant,
1465     MemSize, MemAlign);
1466   if (MMOOffset != 0)
1467     BaseMMO = MF.getMachineMemOperand(BaseMMO, MMOOffset, MemSize);
1468 
1469   // If only the offset is divergent, emit a MUBUF buffer load instead. We can
1470   // assume that the buffer is unswizzled.
1471 
1472   Register RSrc = MI.getOperand(1).getReg();
1473   Register VIndex = B.buildConstant(S32, 0).getReg(0);
1474   B.getMRI()->setRegBank(VIndex, AMDGPU::VGPRRegBank);
1475 
1476   SmallVector<Register, 4> LoadParts(NumLoads);
1477 
1478   MachineBasicBlock::iterator MII = MI.getIterator();
1479   MachineInstrSpan Span(MII, &B.getMBB());
1480 
1481   for (int i = 0; i < NumLoads; ++i) {
1482     if (NumLoads == 1) {
1483       LoadParts[i] = Dst;
1484     } else {
1485       LoadParts[i] = MRI.createGenericVirtualRegister(Ty);
1486       MRI.setRegBank(LoadParts[i], AMDGPU::VGPRRegBank);
1487     }
1488 
1489     MachineMemOperand *MMO = BaseMMO;
1490     if (i != 0)
1491       MMO = MF.getMachineMemOperand(BaseMMO, MMOOffset + 16 * i, MemSize);
1492 
1493     B.buildInstr(AMDGPU::G_AMDGPU_BUFFER_LOAD)
1494       .addDef(LoadParts[i])       // vdata
1495       .addUse(RSrc)               // rsrc
1496       .addUse(VIndex)             // vindex
1497       .addUse(VOffset)            // voffset
1498       .addUse(SOffset)            // soffset
1499       .addImm(ImmOffset + 16 * i) // offset(imm)
1500       .addImm(0)                  // cachepolicy, swizzled buffer(imm)
1501       .addImm(0)                  // idxen(imm)
1502       .addMemOperand(MMO);
1503   }
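  // Sketch of the result for a 512-bit load (operands elided): four 128-bit
  // pieces at immediate offsets 0, 16, 32 and 48 bytes,
  //   %p0 = G_AMDGPU_BUFFER_LOAD ... offset:0
  //   %p1 = G_AMDGPU_BUFFER_LOAD ... offset:16
  //   %p2 = G_AMDGPU_BUFFER_LOAD ... offset:32
  //   %p3 = G_AMDGPU_BUFFER_LOAD ... offset:48
  // recombined into the original wide result below.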
1504 
1505   // TODO: If only the resource is a VGPR, it may be better to execute the
1506   // scalar load in the waterfall loop if the resource is expected to frequently
1507   // be dynamically uniform.
1508   if (RSrcBank != &AMDGPU::SGPRRegBank) {
1509     // Remove the original instruction to avoid potentially confusing the
1510     // waterfall loop logic.
1511     B.setInstr(*Span.begin());
1512     MI.eraseFromParent();
1513 
1514     SmallSet<Register, 4> OpsToWaterfall;
1515 
1516     OpsToWaterfall.insert(RSrc);
1517     executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()),
1518                            OpsToWaterfall, MRI);
1519   }
1520 
1521   if (NumLoads != 1) {
1522     if (Ty.isVector())
1523       B.buildConcatVectors(Dst, LoadParts);
1524     else
1525       B.buildMerge(Dst, LoadParts);
1526   }
1527 
1528   // We removed the instruction earlier with a waterfall loop.
1529   if (RSrcBank == &AMDGPU::SGPRRegBank)
1530     MI.eraseFromParent();
1531 
1532   return true;
1533 }
1534 
1535 bool AMDGPURegisterBankInfo::applyMappingBFE(const OperandsMapper &OpdMapper,
1536                                              bool Signed) const {
1537   MachineInstr &MI = OpdMapper.getMI();
1538   MachineRegisterInfo &MRI = OpdMapper.getMRI();
1539 
1540   // Insert basic copies
1541   applyDefaultMapping(OpdMapper);
1542 
1543   Register DstReg = MI.getOperand(0).getReg();
1544   LLT Ty = MRI.getType(DstReg);
1545 
1546   const LLT S32 = LLT::scalar(32);
1547 
1548   unsigned FirstOpnd = MI.getOpcode() == AMDGPU::G_INTRINSIC ? 2 : 1;
1549   Register SrcReg = MI.getOperand(FirstOpnd).getReg();
1550   Register OffsetReg = MI.getOperand(FirstOpnd + 1).getReg();
1551   Register WidthReg = MI.getOperand(FirstOpnd + 2).getReg();
1552 
1553   const RegisterBank *DstBank =
1554     OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
1555   if (DstBank == &AMDGPU::VGPRRegBank) {
1556     if (Ty == S32)
1557       return true;
1558 
1559     // There are no 64-bit VGPR bitfield extract instructions, so the
1560     // operation is expanded to a sequence of instructions that implement it.
1561     ApplyRegBankMapping ApplyBank(*this, MRI, &AMDGPU::VGPRRegBank);
1562     MachineIRBuilder B(MI, ApplyBank);
1563 
1564     const LLT S64 = LLT::scalar(64);
1565     // Shift the source operand so that extracted bits start at bit 0.
1566     auto ShiftOffset = Signed ? B.buildAShr(S64, SrcReg, OffsetReg)
1567                               : B.buildLShr(S64, SrcReg, OffsetReg);
1568     auto UnmergeSOffset = B.buildUnmerge({S32, S32}, ShiftOffset);
1569 
1570     // A 64-bit bitfield extract uses the 32-bit bitfield extract instructions
1571     // if the width is a constant.
1572     if (auto ConstWidth = getIConstantVRegValWithLookThrough(WidthReg, MRI)) {
1573       // Depending on the width, the extract reads either the low or the high
1574       // 32 bits of the shifted source.
1575       auto Zero = B.buildConstant(S32, 0);
1576       auto WidthImm = ConstWidth->Value.getZExtValue();
1577       if (WidthImm <= 32) {
1578         // Use bitfield extract on the lower 32-bit source, and then sign-extend
1579         // or clear the upper 32-bits.
1580         auto Extract =
1581             Signed ? B.buildSbfx(S32, UnmergeSOffset.getReg(0), Zero, WidthReg)
1582                    : B.buildUbfx(S32, UnmergeSOffset.getReg(0), Zero, WidthReg);
1583         auto Extend =
1584             Signed ? B.buildAShr(S32, Extract, B.buildConstant(S32, 31)) : Zero;
1585         B.buildMerge(DstReg, {Extract, Extend});
1586       } else {
1587         // Use bitfield extract on upper 32-bit source, and combine with lower
1588         // 32-bit source.
1589         auto UpperWidth = B.buildConstant(S32, WidthImm - 32);
1590         auto Extract =
1591             Signed
1592                 ? B.buildSbfx(S32, UnmergeSOffset.getReg(1), Zero, UpperWidth)
1593                 : B.buildUbfx(S32, UnmergeSOffset.getReg(1), Zero, UpperWidth);
1594         B.buildMerge(DstReg, {UnmergeSOffset.getReg(0), Extract});
1595       }
1596       MI.eraseFromParent();
1597       return true;
1598     }
1599 
1600     // Expand to ((Src >> Offset) << (64 - Width)) >> (64 - Width) using
1601     // 64-bit operations.
1602     auto ExtShift = B.buildSub(S32, B.buildConstant(S32, 64), WidthReg);
1603     auto SignBit = B.buildShl(S64, ShiftOffset, ExtShift);
1604     if (Signed)
1605       B.buildAShr(S64, SignBit, ExtShift);
1606     else
1607       B.buildLShr(S64, SignBit, ExtShift);
1608     MI.eraseFromParent();
1609     return true;
1610   }
1611 
1612   // The scalar form packs the offset and width in a single operand.
1613 
1614   ApplyRegBankMapping ApplyBank(*this, MRI, &AMDGPU::SGPRRegBank);
1615   MachineIRBuilder B(MI, ApplyBank);
1616 
1617   // Ensure the high bits are clear to insert the offset.
1618   auto OffsetMask = B.buildConstant(S32, maskTrailingOnes<unsigned>(6));
1619   auto ClampOffset = B.buildAnd(S32, OffsetReg, OffsetMask);
1620 
1621   // Zeros out the low bits, so don't bother clamping the input value.
1622   auto ShiftWidth = B.buildShl(S32, WidthReg, B.buildConstant(S32, 16));
1623 
1624   // Pack the offset and width of the BFE into the format expected by the
1625   // S_BFE_I32 / S_BFE_U32 instructions: in the second source operand, bits
1626   // [5:0] contain the offset and bits [22:16] the width.
1627   auto MergedInputs = B.buildOr(S32, ClampOffset, ShiftWidth);
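  // Worked example (illustrative): offset = 8 and width = 16 pack to
  // (16 << 16) | 8 = 0x100008 in the second source operand.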
1628 
1629   // TODO: It might be worth using a pseudo here to avoid scc clobber and
1630   // register class constraints.
1631   unsigned Opc = Ty == S32 ? (Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32) :
1632                              (Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64);
1633 
1634   auto MIB = B.buildInstr(Opc, {DstReg}, {SrcReg, MergedInputs});
1635   if (!constrainSelectedInstRegOperands(*MIB, *TII, *TRI, *this))
1636     llvm_unreachable("failed to constrain BFE");
1637 
1638   MI.eraseFromParent();
1639   return true;
1640 }
1641 
1642 // Return a suitable opcode for extending the operands of Opc when widening.
1643 static unsigned getExtendOp(unsigned Opc) {
1644   switch (Opc) {
1645   case TargetOpcode::G_ASHR:
1646   case TargetOpcode::G_SMIN:
1647   case TargetOpcode::G_SMAX:
1648     return TargetOpcode::G_SEXT;
1649   case TargetOpcode::G_LSHR:
1650   case TargetOpcode::G_UMIN:
1651   case TargetOpcode::G_UMAX:
1652     return TargetOpcode::G_ZEXT;
1653   default:
1654     return TargetOpcode::G_ANYEXT;
1655   }
1656 }
1657 
1658 // Emit a legalized extension from <2 x s16> to 2 32-bit components, avoiding
1659 // any illegal vector extend or unmerge operations.
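// A sketch with illustrative values: for Src = 0x80010002, i.e. the <2 x s16>
// pair (2, -32767), signed unpacking yields lo = 2 via sext_inreg of the
// bitcast, and hi = -32767 via a 16-bit arithmetic shift right.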
1660 static std::pair<Register, Register>
1661 unpackV2S16ToS32(MachineIRBuilder &B, Register Src, unsigned ExtOpcode) {
1662   const LLT S32 = LLT::scalar(32);
1663   auto Bitcast = B.buildBitcast(S32, Src);
1664 
1665   if (ExtOpcode == TargetOpcode::G_SEXT) {
1666     auto ExtLo = B.buildSExtInReg(S32, Bitcast, 16);
1667     auto ShiftHi = B.buildAShr(S32, Bitcast, B.buildConstant(S32, 16));
1668     return std::make_pair(ExtLo.getReg(0), ShiftHi.getReg(0));
1669   }
1670 
1671   auto ShiftHi = B.buildLShr(S32, Bitcast, B.buildConstant(S32, 16));
1672   if (ExtOpcode == TargetOpcode::G_ZEXT) {
1673     auto ExtLo = B.buildAnd(S32, Bitcast, B.buildConstant(S32, 0xffff));
1674     return std::make_pair(ExtLo.getReg(0), ShiftHi.getReg(0));
1675   }
1676 
1677   assert(ExtOpcode == TargetOpcode::G_ANYEXT);
1678   return std::make_pair(Bitcast.getReg(0), ShiftHi.getReg(0));
1679 }
1680 
1681 // For cases where only a single copy is inserted for matching register banks,
1682 // replace the register in the instruction operand with that copy.
1683 static bool substituteSimpleCopyRegs(
1684   const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper, unsigned OpIdx) {
1685   SmallVector<unsigned, 1> SrcReg(OpdMapper.getVRegs(OpIdx));
1686   if (!SrcReg.empty()) {
1687     assert(SrcReg.size() == 1);
1688     OpdMapper.getMI().getOperand(OpIdx).setReg(SrcReg[0]);
1689     return true;
1690   }
1691 
1692   return false;
1693 }
1694 
1695 /// Handle register layout difference for f16 images for some subtargets.
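/// A sketch of the layout difference (values illustrative): on subtargets with
/// unpacked D16 VMEM, each s16 element of the store value occupies the low 16
/// bits of its own 32-bit register, so a <2 x s16> value holding (0x1111,
/// 0x2222) is laid out as the pair of 32-bit values (0x00001111, 0x00002222).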
1696 Register AMDGPURegisterBankInfo::handleD16VData(MachineIRBuilder &B,
1697                                                 MachineRegisterInfo &MRI,
1698                                                 Register Reg) const {
1699   if (!Subtarget.hasUnpackedD16VMem())
1700     return Reg;
1701 
1702   const LLT S16 = LLT::scalar(16);
1703   LLT StoreVT = MRI.getType(Reg);
1704   if (!StoreVT.isVector() || StoreVT.getElementType() != S16)
1705     return Reg;
1706 
1707   auto Unmerge = B.buildUnmerge(S16, Reg);
1708
1710   SmallVector<Register, 4> WideRegs;
1711   for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
1712     WideRegs.push_back(Unmerge.getReg(I));
1713 
1714   const LLT S32 = LLT::scalar(32);
1715   int NumElts = StoreVT.getNumElements();
1716 
1717   return B.buildMerge(LLT::fixed_vector(NumElts, S32), WideRegs).getReg(0);
1718 }
1719 
1720 static std::pair<Register, unsigned>
1721 getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg) {
1722   int64_t Const;
1723   if (mi_match(Reg, MRI, m_ICst(Const)))
1724     return std::make_pair(Register(), Const);
1725 
1726   Register Base;
1727   if (mi_match(Reg, MRI, m_GAdd(m_Reg(Base), m_ICst(Const))))
1728     return std::make_pair(Base, Const);
1729 
1730   // TODO: Handle G_OR used for add case
1731   return std::make_pair(Reg, 0);
1732 }
1733 
1734 std::pair<Register, unsigned>
1735 AMDGPURegisterBankInfo::splitBufferOffsets(MachineIRBuilder &B,
1736                                            Register OrigOffset) const {
1737   const unsigned MaxImm = 4095;
1738   Register BaseReg;
1739   unsigned ImmOffset;
1740   const LLT S32 = LLT::scalar(32);
1741 
1742   std::tie(BaseReg, ImmOffset) = getBaseWithConstantOffset(*B.getMRI(),
1743                                                            OrigOffset);
1744 
1745   unsigned C1 = 0;
1746   if (ImmOffset != 0) {
1747     // If the immediate value is too big for the immoffset field, keep only its
1748     // low 12 bits there so that the value that is copied/added for the voffset
1749     // field is a multiple of 4096, and it stands more chance of being CSEd
1750     // with the copy/add for another similar load/store.
1751     // However, do not do that rounding down to a multiple of 4096 if that is a
1752     // negative number, as it appears to be illegal to have a negative offset
1753     // in the vgpr, even if adding the immediate offset makes it positive.
1754     unsigned Overflow = ImmOffset & ~MaxImm;
1755     ImmOffset -= Overflow;
1756     if ((int32_t)Overflow < 0) {
1757       Overflow += ImmOffset;
1758       ImmOffset = 0;
1759     }
1760 
1761     C1 = ImmOffset;
1762     if (Overflow != 0) {
1763       if (!BaseReg)
1764         BaseReg = B.buildConstant(S32, Overflow).getReg(0);
1765       else {
1766         auto OverflowVal = B.buildConstant(S32, Overflow);
1767         BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
1768       }
1769     }
1770   }
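  // Worked example (illustrative): an immediate offset of 8195 splits as
  // Overflow = 8192 and ImmOffset = 3, so C1 = 3 and the 8192 is folded into
  // BaseReg for the voffset.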
1771 
1772   if (!BaseReg)
1773     BaseReg = B.buildConstant(S32, 0).getReg(0);
1774 
1775   return {BaseReg, C1};
1776 }
1777 
1778 static bool isZero(Register Reg, MachineRegisterInfo &MRI) {
1779   int64_t C;
1780   return mi_match(Reg, MRI, m_ICst(C)) && C == 0;
1781 }
1782 
1783 static unsigned extractCPol(unsigned CachePolicy) {
1784   return CachePolicy & AMDGPU::CPol::ALL;
1785 }
1786 
1787 static unsigned extractSWZ(unsigned CachePolicy) {
1788   return (CachePolicy >> 3) & 1;
1789 }
1790
1792 MachineInstr *
1793 AMDGPURegisterBankInfo::selectStoreIntrinsic(MachineIRBuilder &B,
1794                                              MachineInstr &MI) const {
1795   MachineRegisterInfo &MRI = *B.getMRI();
1796   executeInWaterfallLoop(B, MI, MRI, {2, 4});
1797 
1798   // FIXME: DAG lowering brokenly changes opcode based on FP vs. integer.
1799 
1800   Register VData = MI.getOperand(1).getReg();
1801   LLT Ty = MRI.getType(VData);
1802 
1803   int EltSize = Ty.getScalarSizeInBits();
1804   int Size = Ty.getSizeInBits();
1805 
1806   // FIXME: Broken integer truncstore.
1807   if (EltSize != 32)
1808     report_fatal_error("unhandled intrinsic store");
1809 
1810   // FIXME: Verifier should enforce 1 MMO for these intrinsics.
1811   const int MemSize = (*MI.memoperands_begin())->getSize();
1812
1814   Register RSrc = MI.getOperand(2).getReg();
1815   Register VOffset = MI.getOperand(3).getReg();
1816   Register SOffset = MI.getOperand(4).getReg();
1817   unsigned CachePolicy = MI.getOperand(5).getImm();
1818 
1819   unsigned ImmOffset;
1820   std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
1821 
1822   const bool Offen = !isZero(VOffset, MRI);
1823 
1824   unsigned Opc = AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact;
1825   switch (8 * MemSize) {
1826   case 8:
1827     Opc = Offen ? AMDGPU::BUFFER_STORE_BYTE_OFFEN_exact :
1828                   AMDGPU::BUFFER_STORE_BYTE_OFFSET_exact;
1829     break;
1830   case 16:
1831     Opc = Offen ? AMDGPU::BUFFER_STORE_SHORT_OFFEN_exact :
1832                   AMDGPU::BUFFER_STORE_SHORT_OFFSET_exact;
1833     break;
1834   default:
1835     Opc = Offen ? AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact :
1836                   AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact;
1837     if (Size > 32)
1838       Opc = AMDGPU::getMUBUFOpcode(Opc, Size / 32);
1839     break;
1840   }
1841
1843   // Set the insertion point back to the instruction in case it was moved into a
1844   // loop.
1845   B.setInstr(MI);
1846 
1847   MachineInstrBuilder MIB = B.buildInstr(Opc)
1848     .addUse(VData);
1849 
1850   if (Offen)
1851     MIB.addUse(VOffset);
1852 
1853   MIB.addUse(RSrc)
1854      .addUse(SOffset)
1855      .addImm(ImmOffset)
1856      .addImm(extractCPol(CachePolicy))
1857      .addImm(0) // tfe: FIXME: Remove from inst
1858      .addImm(extractSWZ(CachePolicy))
1859      .cloneMemRefs(MI);
1860 
1861   // FIXME: We need a way to report failure from applyMappingImpl.
1862   // Insert constrain copies before inserting the loop.
1863   if (!constrainSelectedInstRegOperands(*MIB, *TII, *TRI, *this))
1864     report_fatal_error("failed to constrain selected store intrinsic");
1865 
1866   return MIB;
1867 }
1868 
1869 bool AMDGPURegisterBankInfo::buildVCopy(MachineIRBuilder &B, Register DstReg,
1870                                         Register SrcReg) const {
1871   MachineRegisterInfo &MRI = *B.getMRI();
1872   LLT SrcTy = MRI.getType(SrcReg);
1873   if (SrcTy.getSizeInBits() == 32) {
1874     // Use a v_mov_b32 here to make the exec dependency explicit.
1875     B.buildInstr(AMDGPU::V_MOV_B32_e32)
1876       .addDef(DstReg)
1877       .addUse(SrcReg);
1878     return constrainGenericRegister(DstReg, AMDGPU::VGPR_32RegClass, MRI) &&
1879            constrainGenericRegister(SrcReg, AMDGPU::SReg_32RegClass, MRI);
1880   }
1881 
1882   Register TmpReg0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1883   Register TmpReg1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1884 
1885   B.buildInstr(AMDGPU::V_MOV_B32_e32)
1886     .addDef(TmpReg0)
1887     .addUse(SrcReg, 0, AMDGPU::sub0);
1888   B.buildInstr(AMDGPU::V_MOV_B32_e32)
1889     .addDef(TmpReg1)
1890     .addUse(SrcReg, 0, AMDGPU::sub1);
1891   B.buildInstr(AMDGPU::REG_SEQUENCE)
1892     .addDef(DstReg)
1893     .addUse(TmpReg0)
1894     .addImm(AMDGPU::sub0)
1895     .addUse(TmpReg1)
1896     .addImm(AMDGPU::sub1);
1897 
1898   return constrainGenericRegister(SrcReg, AMDGPU::SReg_64RegClass, MRI) &&
1899          constrainGenericRegister(DstReg, AMDGPU::VReg_64RegClass, MRI);
1900 }
1901 
1902 /// Utility function for pushing dynamic vector indexes with a constant offset
1903 /// into waterfall loops.
1904 static void reinsertVectorIndexAdd(MachineIRBuilder &B,
1905                                    MachineInstr &IdxUseInstr,
1906                                    unsigned OpIdx,
1907                                    unsigned ConstOffset) {
1908   MachineRegisterInfo &MRI = *B.getMRI();
1909   const LLT S32 = LLT::scalar(32);
1910   Register WaterfallIdx = IdxUseInstr.getOperand(OpIdx).getReg();
1911   B.setInsertPt(*IdxUseInstr.getParent(), IdxUseInstr.getIterator());
1912 
1913   auto MaterializedOffset = B.buildConstant(S32, ConstOffset);
1914 
1915   auto Add = B.buildAdd(S32, WaterfallIdx, MaterializedOffset);
1916   MRI.setRegBank(MaterializedOffset.getReg(0), AMDGPU::SGPRRegBank);
1917   MRI.setRegBank(Add.getReg(0), AMDGPU::SGPRRegBank);
1918   IdxUseInstr.getOperand(OpIdx).setReg(Add.getReg(0));
1919 }
1920 
1921 /// Implement extending a 32-bit value to a 64-bit value. \p Lo32Reg is the
1922 /// original 32-bit source value (to be inserted in the low part of the combined
1923 /// 64-bit result), and \p Hi32Reg is the high half of the combined 64-bit
1924 /// value.
1925 static void extendLow32IntoHigh32(MachineIRBuilder &B,
1926                                   Register Hi32Reg, Register Lo32Reg,
1927                                   unsigned ExtOpc,
1928                                   const RegisterBank &RegBank,
1929                                   bool IsBooleanSrc = false) {
1930   if (ExtOpc == AMDGPU::G_ZEXT) {
1931     B.buildConstant(Hi32Reg, 0);
1932   } else if (ExtOpc == AMDGPU::G_SEXT) {
1933     if (IsBooleanSrc) {
1934       // If we know the original source was an s1, the high half is the same as
1935       // the low.
1936       B.buildCopy(Hi32Reg, Lo32Reg);
1937     } else {
1938       // Replicate sign bit from 32-bit extended part.
1939       auto ShiftAmt = B.buildConstant(LLT::scalar(32), 31);
1940       B.getMRI()->setRegBank(ShiftAmt.getReg(0), RegBank);
1941       B.buildAShr(Hi32Reg, Lo32Reg, ShiftAmt);
1942     }
1943   } else {
1944     assert(ExtOpc == AMDGPU::G_ANYEXT && "not an integer extension");
1945     B.buildUndef(Hi32Reg);
1946   }
1947 }
1948 
1949 bool AMDGPURegisterBankInfo::foldExtractEltToCmpSelect(
1950   MachineInstr &MI, MachineRegisterInfo &MRI,
1951   const OperandsMapper &OpdMapper) const {
1952 
1953   Register VecReg = MI.getOperand(1).getReg();
1954   Register Idx = MI.getOperand(2).getReg();
1955 
1956   const RegisterBank &IdxBank =
1957     *OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
1958 
1959   bool IsDivergentIdx = IdxBank != AMDGPU::SGPRRegBank;
1960 
1961   LLT VecTy = MRI.getType(VecReg);
1962   unsigned EltSize = VecTy.getScalarSizeInBits();
1963   unsigned NumElem = VecTy.getNumElements();
1964 
1965   if (!SITargetLowering::shouldExpandVectorDynExt(EltSize, NumElem,
1966                                                   IsDivergentIdx))
1967     return false;
1968 
1969   MachineIRBuilder B(MI);
1970   LLT S32 = LLT::scalar(32);
1971 
1972   const RegisterBank &DstBank =
1973     *OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
1974   const RegisterBank &SrcBank =
1975     *OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
1976 
1977   const RegisterBank &CCBank =
1978     (DstBank == AMDGPU::SGPRRegBank &&
1979      SrcBank == AMDGPU::SGPRRegBank &&
1980      IdxBank == AMDGPU::SGPRRegBank) ? AMDGPU::SGPRRegBank
1981                                      : AMDGPU::VCCRegBank;
1982   LLT CCTy = (CCBank == AMDGPU::SGPRRegBank) ? S32 : LLT::scalar(1);
1983 
1984   if (CCBank == AMDGPU::VCCRegBank && IdxBank == AMDGPU::SGPRRegBank) {
1985     Idx = B.buildCopy(S32, Idx)->getOperand(0).getReg();
1986     MRI.setRegBank(Idx, AMDGPU::VGPRRegBank);
1987   }
1988 
1989   LLT EltTy = VecTy.getScalarType();
1990   SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0));
1991   unsigned NumLanes = DstRegs.size();
1992   if (!NumLanes)
1993     NumLanes = 1;
1994   else
1995     EltTy = MRI.getType(DstRegs[0]);
1996 
1997   auto UnmergeToEltTy = B.buildUnmerge(EltTy, VecReg);
1998   SmallVector<Register, 2> Res(NumLanes);
1999   for (unsigned L = 0; L < NumLanes; ++L)
2000     Res[L] = UnmergeToEltTy.getReg(L);
2001 
2002   for (unsigned I = 1; I < NumElem; ++I) {
2003     auto IC = B.buildConstant(S32, I);
2004     MRI.setRegBank(IC->getOperand(0).getReg(), AMDGPU::SGPRRegBank);
2005     auto Cmp = B.buildICmp(CmpInst::ICMP_EQ, CCTy, Idx, IC);
2006     MRI.setRegBank(Cmp->getOperand(0).getReg(), CCBank);
2007 
2008     for (unsigned L = 0; L < NumLanes; ++L) {
2009       auto S = B.buildSelect(EltTy, Cmp,
2010                              UnmergeToEltTy.getReg(I * NumLanes + L), Res[L]);
2011 
2012       for (unsigned N : { 0, 2, 3 })
2013         MRI.setRegBank(S->getOperand(N).getReg(), DstBank);
2014 
2015       Res[L] = S->getOperand(0).getReg();
2016     }
2017   }
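  // Sketch of the resulting expansion for a <4 x s32> vector (banks elided):
  //   %res = %elt0
  //   %res = G_SELECT (G_ICMP eq %idx, 1), %elt1, %res
  //   %res = G_SELECT (G_ICMP eq %idx, 2), %elt2, %res
  //   %res = G_SELECT (G_ICMP eq %idx, 3), %elt3, %res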
2018 
2019   for (unsigned L = 0; L < NumLanes; ++L) {
2020     Register DstReg = (NumLanes == 1) ? MI.getOperand(0).getReg() : DstRegs[L];
2021     B.buildCopy(DstReg, Res[L]);
2022     MRI.setRegBank(DstReg, DstBank);
2023   }
2024 
2025   MRI.setRegBank(MI.getOperand(0).getReg(), DstBank);
2026   MI.eraseFromParent();
2027 
2028   return true;
2029 }
2030 
2031 // Insert a cross regbank copy for a register if it already has a bank that
2032 // differs from the one we want to set.
2033 static Register constrainRegToBank(MachineRegisterInfo &MRI,
2034                                    MachineIRBuilder &B, Register &Reg,
2035                                    const RegisterBank &Bank) {
2036   const RegisterBank *CurrBank = MRI.getRegBankOrNull(Reg);
2037   if (CurrBank && *CurrBank != Bank) {
2038     Register Copy = B.buildCopy(MRI.getType(Reg), Reg).getReg(0);
2039     MRI.setRegBank(Copy, Bank);
2040     return Copy;
2041   }
2042 
2043   MRI.setRegBank(Reg, Bank);
2044   return Reg;
2045 }
2046 
2047 bool AMDGPURegisterBankInfo::foldInsertEltToCmpSelect(
2048   MachineInstr &MI, MachineRegisterInfo &MRI,
2049   const OperandsMapper &OpdMapper) const {
2050 
2051   Register VecReg = MI.getOperand(1).getReg();
2052   Register Idx = MI.getOperand(3).getReg();
2053 
2054   const RegisterBank &IdxBank =
2055     *OpdMapper.getInstrMapping().getOperandMapping(3).BreakDown[0].RegBank;
2056 
2057   bool IsDivergentIdx = IdxBank != AMDGPU::SGPRRegBank;
2058 
2059   LLT VecTy = MRI.getType(VecReg);
2060   unsigned EltSize = VecTy.getScalarSizeInBits();
2061   unsigned NumElem = VecTy.getNumElements();
2062 
2063   if (!SITargetLowering::shouldExpandVectorDynExt(EltSize, NumElem,
2064                                                   IsDivergentIdx))
2065     return false;
2066 
2067   MachineIRBuilder B(MI);
2068   LLT S32 = LLT::scalar(32);
2069 
2070   const RegisterBank &DstBank =
2071     *OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2072   const RegisterBank &SrcBank =
2073     *OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
2074   const RegisterBank &InsBank =
2075     *OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
2076 
2077   const RegisterBank &CCBank =
2078     (DstBank == AMDGPU::SGPRRegBank &&
2079      SrcBank == AMDGPU::SGPRRegBank &&
2080      InsBank == AMDGPU::SGPRRegBank &&
2081      IdxBank == AMDGPU::SGPRRegBank) ? AMDGPU::SGPRRegBank
2082                                      : AMDGPU::VCCRegBank;
2083   LLT CCTy = (CCBank == AMDGPU::SGPRRegBank) ? S32 : LLT::scalar(1);
2084 
2085   if (CCBank == AMDGPU::VCCRegBank && IdxBank == AMDGPU::SGPRRegBank) {
2086     Idx = B.buildCopy(S32, Idx)->getOperand(0).getReg();
2087     MRI.setRegBank(Idx, AMDGPU::VGPRRegBank);
2088   }
2089 
2090   LLT EltTy = VecTy.getScalarType();
2091   SmallVector<Register, 2> InsRegs(OpdMapper.getVRegs(2));
2092   unsigned NumLanes = InsRegs.size();
2093   if (!NumLanes) {
2094     NumLanes = 1;
2095     InsRegs.push_back(MI.getOperand(2).getReg());
2096   } else {
2097     EltTy = MRI.getType(InsRegs[0]);
2098   }
2099 
2100   auto UnmergeToEltTy = B.buildUnmerge(EltTy, VecReg);
2101   SmallVector<Register, 16> Ops(NumElem * NumLanes);
2102 
2103   for (unsigned I = 0; I < NumElem; ++I) {
2104     auto IC = B.buildConstant(S32, I);
2105     MRI.setRegBank(IC->getOperand(0).getReg(), AMDGPU::SGPRRegBank);
2106     auto Cmp = B.buildICmp(CmpInst::ICMP_EQ, CCTy, Idx, IC);
2107     MRI.setRegBank(Cmp->getOperand(0).getReg(), CCBank);
2108 
2109     for (unsigned L = 0; L < NumLanes; ++L) {
2110       Register Op0 = constrainRegToBank(MRI, B, InsRegs[L], DstBank);
2111       Register Op1 = UnmergeToEltTy.getReg(I * NumLanes + L);
2112       Op1 = constrainRegToBank(MRI, B, Op1, DstBank);
2113 
2114       Register Select = B.buildSelect(EltTy, Cmp, Op0, Op1).getReg(0);
2115       MRI.setRegBank(Select, DstBank);
2116 
2117       Ops[I * NumLanes + L] = Select;
2118     }
2119   }
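  // Sketch for inserting %val at a dynamic %idx into <4 x s32> (banks
  // elided): every element of the rebuilt vector is
  //   %new.elt[i] = G_SELECT (G_ICMP eq %idx, i), %val, %elt[i]
  // so only the matching lane takes the inserted value.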
2120 
2121   LLT MergeTy = LLT::fixed_vector(Ops.size(), EltTy);
2122   if (MergeTy == MRI.getType(MI.getOperand(0).getReg())) {
2123     B.buildBuildVector(MI.getOperand(0), Ops);
2124   } else {
2125     auto Vec = B.buildBuildVector(MergeTy, Ops);
2126     MRI.setRegBank(Vec->getOperand(0).getReg(), DstBank);
2127     B.buildBitcast(MI.getOperand(0).getReg(), Vec);
2128   }
2129 
2130   MRI.setRegBank(MI.getOperand(0).getReg(), DstBank);
2131   MI.eraseFromParent();
2132 
2133   return true;
2134 }
2135 
2136 void AMDGPURegisterBankInfo::applyMappingImpl(
2137     const OperandsMapper &OpdMapper) const {
2138   MachineInstr &MI = OpdMapper.getMI();
2139   unsigned Opc = MI.getOpcode();
2140   MachineRegisterInfo &MRI = OpdMapper.getMRI();
2141   switch (Opc) {
2142   case AMDGPU::G_PHI: {
2143     Register DstReg = MI.getOperand(0).getReg();
2144     LLT DstTy = MRI.getType(DstReg);
2145     if (DstTy != LLT::scalar(1))
2146       break;
2147 
2148     const LLT S32 = LLT::scalar(32);
2149     const RegisterBank *DstBank =
2150       OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2151     if (DstBank == &AMDGPU::VCCRegBank) {
2152       applyDefaultMapping(OpdMapper);
2153       // The standard handling only considers the result register bank for
2154       // phis. For VCC, blindly inserting a copy when the phi is lowered will
2155       // produce an invalid copy. We can only copy with some kind of compare to
2156       // get a vector boolean result. Insert a register bank copy that will be
2157       // correctly lowered to a compare.
2158       MachineIRBuilder B(*MI.getParent()->getParent());
2159 
2160       for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
2161         Register SrcReg = MI.getOperand(I).getReg();
2162         const RegisterBank *SrcBank = getRegBank(SrcReg, MRI, *TRI);
2163 
2164         if (SrcBank != &AMDGPU::VCCRegBank) {
2165           MachineBasicBlock *SrcMBB = MI.getOperand(I + 1).getMBB();
2166           B.setInsertPt(*SrcMBB, SrcMBB->getFirstTerminator());
2167 
2168           auto Copy = B.buildCopy(LLT::scalar(1), SrcReg);
2169           MRI.setRegBank(Copy.getReg(0), AMDGPU::VCCRegBank);
2170           MI.getOperand(I).setReg(Copy.getReg(0));
2171         }
2172       }
2173 
2174       return;
2175     }
2176 
2177     // Phi handling is strange and only considers the bank of the destination.
2178     substituteSimpleCopyRegs(OpdMapper, 0);
2179 
2180     // Promote SGPR/VGPR booleans to s32
2181     MachineFunction *MF = MI.getParent()->getParent();
2182     ApplyRegBankMapping ApplyBank(*this, MRI, DstBank);
2183     MachineIRBuilder B(MI, ApplyBank);
2184     LegalizerHelper Helper(*MF, ApplyBank, B);
2185 
2186     if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized)
2187       llvm_unreachable("widen scalar should have succeeded");
2188 
2189     return;
2190   }
2191   case AMDGPU::G_ICMP:
2192   case AMDGPU::G_UADDO:
2193   case AMDGPU::G_USUBO:
2194   case AMDGPU::G_UADDE:
2195   case AMDGPU::G_SADDE:
2196   case AMDGPU::G_USUBE:
2197   case AMDGPU::G_SSUBE: {
2198     unsigned BoolDstOp = Opc == AMDGPU::G_ICMP ? 0 : 1;
2199     Register DstReg = MI.getOperand(BoolDstOp).getReg();
2200 
2201     const RegisterBank *DstBank =
2202       OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2203     if (DstBank != &AMDGPU::SGPRRegBank)
2204       break;
2205 
2206     const bool HasCarryIn = MI.getNumOperands() == 5;
2207 
2208     // If this is a scalar compare, promote the result to s32, as the selection
2209     // will end up using a copy to a 32-bit vreg.
2210     const LLT S32 = LLT::scalar(32);
2211     Register NewDstReg = MRI.createGenericVirtualRegister(S32);
2212     MRI.setRegBank(NewDstReg, AMDGPU::SGPRRegBank);
2213     MI.getOperand(BoolDstOp).setReg(NewDstReg);
2214     MachineIRBuilder B(MI);
2215 
2216     if (HasCarryIn) {
2217       Register NewSrcReg = MRI.createGenericVirtualRegister(S32);
2218       MRI.setRegBank(NewSrcReg, AMDGPU::SGPRRegBank);
2219       B.buildZExt(NewSrcReg, MI.getOperand(4).getReg());
2220       MI.getOperand(4).setReg(NewSrcReg);
2221     }
2222 
2223     MachineBasicBlock *MBB = MI.getParent();
2224     B.setInsertPt(*MBB, std::next(MI.getIterator()));
2225 
2226     // If we had a constrained VCC result register, a copy was inserted to VCC
2227     // from SGPR.
2228     SmallVector<Register, 1> DefRegs(OpdMapper.getVRegs(0));
2229     if (DefRegs.empty())
2230       DefRegs.push_back(DstReg);
2231     B.buildTrunc(DefRegs[0], NewDstReg);
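    // Net effect (sketch): a scalar boolean def such as
    //   %c:sgpr(s1) = G_ICMP intpred(eq), %a, %b
    // has been rewritten as
    //   %c32:sgpr(s32) = G_ICMP intpred(eq), %a, %b
    //   %c:sgpr(s1) = G_TRUNC %c32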
2232     return;
2233   }
2234   case AMDGPU::G_SELECT: {
2235     Register DstReg = MI.getOperand(0).getReg();
2236     LLT DstTy = MRI.getType(DstReg);
2237 
2238     SmallVector<Register, 1> CondRegs(OpdMapper.getVRegs(1));
2239     if (CondRegs.empty())
2240       CondRegs.push_back(MI.getOperand(1).getReg());
2241     else {
2242       assert(CondRegs.size() == 1);
2243     }
2244 
2245     const RegisterBank *CondBank = getRegBank(CondRegs[0], MRI, *TRI);
2246     if (CondBank == &AMDGPU::SGPRRegBank) {
2247       MachineIRBuilder B(MI);
2248       const LLT S32 = LLT::scalar(32);
2249       Register NewCondReg = MRI.createGenericVirtualRegister(S32);
2250       MRI.setRegBank(NewCondReg, AMDGPU::SGPRRegBank);
2251 
2252       MI.getOperand(1).setReg(NewCondReg);
2253       B.buildZExt(NewCondReg, CondRegs[0]);
2254     }
2255 
2256     if (DstTy.getSizeInBits() != 64)
2257       break;
2258 
2259     MachineIRBuilder B(MI);
2260     LLT HalfTy = getHalfSizedType(DstTy);
2261 
2262     SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
2263     SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2));
2264     SmallVector<Register, 2> Src2Regs(OpdMapper.getVRegs(3));
2265 
2266     // All inputs are SGPRs, nothing special to do.
2267     if (DefRegs.empty()) {
2268       assert(Src1Regs.empty() && Src2Regs.empty());
2269       break;
2270     }
2271 
2272     if (Src1Regs.empty())
2273       split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg());
2274     else
2275       setRegsToType(MRI, Src1Regs, HalfTy);
2277 
2278     if (Src2Regs.empty())
2279       split64BitValueForMapping(B, Src2Regs, HalfTy, MI.getOperand(3).getReg());
2280     else
2281       setRegsToType(MRI, Src2Regs, HalfTy);
2282 
2283     setRegsToType(MRI, DefRegs, HalfTy);
2284 
2285     B.buildSelect(DefRegs[0], CondRegs[0], Src1Regs[0], Src2Regs[0]);
2286     B.buildSelect(DefRegs[1], CondRegs[0], Src1Regs[1], Src2Regs[1]);
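    // Sketch of the split (register names invented): a 64-bit
    //   %d:vgpr(s64) = G_SELECT %c, %x, %y
    // has become two selects on the unmerged 32-bit halves:
    //   %d.lo:vgpr(s32) = G_SELECT %c, %x.lo, %y.lo
    //   %d.hi:vgpr(s32) = G_SELECT %c, %x.hi, %y.hi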
2287 
2288     MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
2289     MI.eraseFromParent();
2290     return;
2291   }
2292   case AMDGPU::G_BRCOND: {
2293     Register CondReg = MI.getOperand(0).getReg();
2294     // FIXME: Should use legalizer helper, but should change bool ext type.
2295     const RegisterBank *CondBank =
2296       OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2297 
2298     if (CondBank == &AMDGPU::SGPRRegBank) {
2299       MachineIRBuilder B(MI);
2300       const LLT S32 = LLT::scalar(32);
2301       Register NewCondReg = MRI.createGenericVirtualRegister(S32);
2302       MRI.setRegBank(NewCondReg, AMDGPU::SGPRRegBank);
2303 
2304       MI.getOperand(0).setReg(NewCondReg);
2305       B.buildZExt(NewCondReg, CondReg);
2306       return;
2307     }
2308 
2309     break;
2310   }
2311   case AMDGPU::G_AND:
2312   case AMDGPU::G_OR:
2313   case AMDGPU::G_XOR: {
2314     // 64-bit AND/OR/XOR is only available on the SALU, so split into 2 32-bit
2315     // ops if there is a VGPR input.
2316     Register DstReg = MI.getOperand(0).getReg();
2317     LLT DstTy = MRI.getType(DstReg);
2318 
2319     if (DstTy.getSizeInBits() == 1) {
2320       const RegisterBank *DstBank =
2321         OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2322       if (DstBank == &AMDGPU::VCCRegBank)
2323         break;
2324 
2325       MachineFunction *MF = MI.getParent()->getParent();
2326       ApplyRegBankMapping ApplyBank(*this, MRI, DstBank);
2327       MachineIRBuilder B(MI, ApplyBank);
2328       LegalizerHelper Helper(*MF, ApplyBank, B);
2329 
2330       if (Helper.widenScalar(MI, 0, LLT::scalar(32)) !=
2331           LegalizerHelper::Legalized)
2332         llvm_unreachable("widen scalar should have succeeded");
2333       return;
2334     }
2335 
2336     if (DstTy.getSizeInBits() != 64)
2337       break;
2338 
2339     LLT HalfTy = getHalfSizedType(DstTy);
2340     SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
2341     SmallVector<Register, 2> Src0Regs(OpdMapper.getVRegs(1));
2342     SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2));
2343 
2344     // All inputs are SGPRs, nothing special to do.
2345     if (DefRegs.empty()) {
2346       assert(Src0Regs.empty() && Src1Regs.empty());
2347       break;
2348     }
2349 
2350     assert(DefRegs.size() == 2);
2351     assert(Src0Regs.size() == Src1Regs.size() &&
2352            (Src0Regs.empty() || Src0Regs.size() == 2));
2353 
2354     // Depending on where the source registers came from, the generic code may
2355     // have decided to split the inputs already or not. If not, we still need to
2356     // extract the values.
2357     MachineIRBuilder B(MI);
2358 
2359     if (Src0Regs.empty())
2360       split64BitValueForMapping(B, Src0Regs, HalfTy, MI.getOperand(1).getReg());
2361     else
2362       setRegsToType(MRI, Src0Regs, HalfTy);
2363 
2364     if (Src1Regs.empty())
2365       split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg());
2366     else
2367       setRegsToType(MRI, Src1Regs, HalfTy);
2368 
2369     setRegsToType(MRI, DefRegs, HalfTy);
2370 
2371     B.buildInstr(Opc, {DefRegs[0]}, {Src0Regs[0], Src1Regs[0]});
2372     B.buildInstr(Opc, {DefRegs[1]}, {Src0Regs[1], Src1Regs[1]});
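    // Sketch: a 64-bit
    //   %d:vgpr(s64) = G_AND %x, %y
    // has now been replaced by two 32-bit ops on the split halves:
    //   %d.lo:vgpr(s32) = G_AND %x.lo, %y.lo
    //   %d.hi:vgpr(s32) = G_AND %x.hi, %y.hi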
2373 
2374     MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
2375     MI.eraseFromParent();
2376     return;
2377   }
2378   case AMDGPU::G_ABS: {
2379     Register SrcReg = MI.getOperand(1).getReg();
2380     const RegisterBank *SrcBank = MRI.getRegBankOrNull(SrcReg);
2381 
2382     // There is no VALU abs instruction, so we need to replace it with a sub
2383     // and max combination.
2384     if (SrcBank && SrcBank == &AMDGPU::VGPRRegBank) {
2385       MachineFunction *MF = MI.getParent()->getParent();
2386       ApplyRegBankMapping Apply(*this, MRI, &AMDGPU::VGPRRegBank);
2387       MachineIRBuilder B(MI, Apply);
2388       LegalizerHelper Helper(*MF, Apply, B);
2389 
2390       if (Helper.lowerAbsToMaxNeg(MI) != LegalizerHelper::Legalized)
2391         llvm_unreachable("lowerAbsToMaxNeg should have succeeded");
2392       return;
2393     }
2394     LLVM_FALLTHROUGH;
2395   }
2396   case AMDGPU::G_ADD:
2397   case AMDGPU::G_SUB:
2398   case AMDGPU::G_MUL:
2399   case AMDGPU::G_SHL:
2400   case AMDGPU::G_LSHR:
2401   case AMDGPU::G_ASHR:
2402   case AMDGPU::G_SMIN:
2403   case AMDGPU::G_SMAX:
2404   case AMDGPU::G_UMIN:
2405   case AMDGPU::G_UMAX: {
2406     Register DstReg = MI.getOperand(0).getReg();
2407     LLT DstTy = MRI.getType(DstReg);
2408 
2409     // 16-bit operations are VALU only, but can be promoted to 32-bit SALU.
2410     // Packed 16-bit operations need to be scalarized and promoted.
2411     if (DstTy != LLT::scalar(16) && DstTy != LLT::fixed_vector(2, 16))
2412       break;
2413 
2414     const RegisterBank *DstBank =
2415       OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2416     if (DstBank == &AMDGPU::VGPRRegBank)
2417       break;
2418 
2419     const LLT S32 = LLT::scalar(32);
2420     MachineBasicBlock *MBB = MI.getParent();
2421     MachineFunction *MF = MBB->getParent();
2422     ApplyRegBankMapping ApplySALU(*this, MRI, &AMDGPU::SGPRRegBank);
2423     MachineIRBuilder B(MI, ApplySALU);
2424 
2425     if (DstTy.isVector()) {
2426       Register WideSrc0Lo, WideSrc0Hi;
2427       Register WideSrc1Lo, WideSrc1Hi;
2428 
2429       unsigned ExtendOp = getExtendOp(MI.getOpcode());
2430       std::tie(WideSrc0Lo, WideSrc0Hi)
2431         = unpackV2S16ToS32(B, MI.getOperand(1).getReg(), ExtendOp);
2432       std::tie(WideSrc1Lo, WideSrc1Hi)
2433         = unpackV2S16ToS32(B, MI.getOperand(2).getReg(), ExtendOp);
2434       auto Lo = B.buildInstr(MI.getOpcode(), {S32}, {WideSrc0Lo, WideSrc1Lo});
2435       auto Hi = B.buildInstr(MI.getOpcode(), {S32}, {WideSrc0Hi, WideSrc1Hi});
2436       B.buildBuildVectorTrunc(DstReg, {Lo.getReg(0), Hi.getReg(0)});
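      // Sketch for a <2 x s16> add promoted to the SALU: both halves were
      // unpacked to s32, operated on separately, and repacked, roughly:
      //   %lo32 = G_ADD (ext %x.lo), (ext %y.lo)
      //   %hi32 = G_ADD (ext %x.hi), (ext %y.hi)
      //   %d = G_BUILD_VECTOR_TRUNC %lo32, %hi32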
2437       MI.eraseFromParent();
2438     } else {
2439       LegalizerHelper Helper(*MF, ApplySALU, B);
2440 
2441       if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized)
2442         llvm_unreachable("widen scalar should have succeeded");
2443 
2444       // FIXME: s16 shift amounts should be legal.
2445       if (Opc == AMDGPU::G_SHL || Opc == AMDGPU::G_LSHR ||
2446           Opc == AMDGPU::G_ASHR) {
2447         B.setInsertPt(*MBB, MI.getIterator());
2448         if (Helper.widenScalar(MI, 1, S32) != LegalizerHelper::Legalized)
2449           llvm_unreachable("widen scalar should have succeeded");
2450       }
2451     }
2452 
2453     return;
2454   }
2455   case AMDGPU::G_SEXT_INREG: {
2456     SmallVector<Register, 2> SrcRegs(OpdMapper.getVRegs(1));
2457     if (SrcRegs.empty())
2458       break; // Nothing to repair
2459 
2460     const LLT S32 = LLT::scalar(32);
2461     MachineIRBuilder B(MI);
2462     ApplyRegBankMapping O(*this, MRI, &AMDGPU::VGPRRegBank);
2463     GISelObserverWrapper Observer(&O);
2464     B.setChangeObserver(Observer);
2465 
2466     // Don't use LegalizerHelper's narrowScalar. It produces unwanted G_SEXTs
2467     // we would need to further expand, and doesn't let us directly set the
2468     // result registers.
2469     SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0));
2470 
2471     int Amt = MI.getOperand(2).getImm();
2472     if (Amt <= 32) {
2473       if (Amt == 32) {
2474         // The low bits are unchanged.
2475         B.buildCopy(DstRegs[0], SrcRegs[0]);
2476       } else {
2477         // Extend in the low bits and propagate the sign bit to the high half.
2478         B.buildSExtInReg(DstRegs[0], SrcRegs[0], Amt);
2479       }
2480 
2481       B.buildAShr(DstRegs[1], DstRegs[0], B.buildConstant(S32, 31));
2482     } else {
2483       // The low bits are unchanged, and extend in the high bits.
2484       B.buildCopy(DstRegs[0], SrcRegs[0]);
2485       B.buildSExtInReg(DstRegs[1], SrcRegs[1], Amt - 32);
2486     }
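    // Worked example (illustrative): for a 64-bit G_SEXT_INREG with Amt = 8,
    // the low half becomes sext_inreg(%src.lo, 8) and the high half is its
    // sign replicated via ashr 31; for Amt = 40, the low half is copied and
    // the high half becomes sext_inreg(%src.hi, 8).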
2487 
2488     Register DstReg = MI.getOperand(0).getReg();
2489     MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
2490     MI.eraseFromParent();
2491     return;
2492   }
2493   case AMDGPU::G_CTPOP:
2494   case AMDGPU::G_BITREVERSE: {
2495     const RegisterBank *DstBank =
2496       OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2497     if (DstBank == &AMDGPU::SGPRRegBank)
2498       break;
2499 
2500     Register SrcReg = MI.getOperand(1).getReg();
2501     const LLT S32 = LLT::scalar(32);
2502     LLT Ty = MRI.getType(SrcReg);
2503     if (Ty == S32)
2504       break;
2505 
2506     ApplyRegBankMapping ApplyVALU(*this, MRI, &AMDGPU::VGPRRegBank);
2507     MachineIRBuilder B(MI, ApplyVALU);
2508 
2509     MachineFunction &MF = B.getMF();
2510     LegalizerHelper Helper(MF, ApplyVALU, B);
2511 
2512     if (Helper.narrowScalar(MI, 1, S32) != LegalizerHelper::Legalized)
2513       llvm_unreachable("narrowScalar should have succeeded");
2514     return;
2515   }
2516   case AMDGPU::G_AMDGPU_FFBH_U32:
2517   case AMDGPU::G_AMDGPU_FFBL_B32:
2518   case AMDGPU::G_CTLZ_ZERO_UNDEF:
2519   case AMDGPU::G_CTTZ_ZERO_UNDEF: {
2520     const RegisterBank *DstBank =
2521         OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2522     if (DstBank == &AMDGPU::SGPRRegBank)
2523       break;
2524 
2525     Register SrcReg = MI.getOperand(1).getReg();
2526     const LLT S32 = LLT::scalar(32);
2527     LLT Ty = MRI.getType(SrcReg);
2528     if (Ty == S32)
2529       break;
2530 
2531     // We can narrow this more efficiently than Helper can by using ffbh/ffbl
2532     // which return -1 when the input is zero:
2533     // (ctlz_zero_undef hi:lo) -> (umin (ffbh hi), (add (ffbh lo), 32))
2534     // (cttz_zero_undef hi:lo) -> (umin (add (ffbl hi), 32), (ffbl lo))
2535     // (ffbh hi:lo) -> (umin (ffbh hi), (uaddsat (ffbh lo), 32))
2536     // (ffbl hi:lo) -> (umin (uaddsat (ffbh hi), 32), (ffbh lo))
2537     ApplyRegBankMapping ApplyVALU(*this, MRI, &AMDGPU::VGPRRegBank);
2538     MachineIRBuilder B(MI, ApplyVALU);
2539     SmallVector<Register, 2> SrcRegs(OpdMapper.getVRegs(1));
2540     unsigned NewOpc = Opc == AMDGPU::G_CTLZ_ZERO_UNDEF
2541                           ? (unsigned)AMDGPU::G_AMDGPU_FFBH_U32
2542                           : Opc == AMDGPU::G_CTTZ_ZERO_UNDEF
2543                                 ? (unsigned)AMDGPU::G_AMDGPU_FFBL_B32
2544                                 : Opc;
2545     unsigned Idx = NewOpc == AMDGPU::G_AMDGPU_FFBH_U32;
2546     auto X = B.buildInstr(NewOpc, {S32}, {SrcRegs[Idx]});
2547     auto Y = B.buildInstr(NewOpc, {S32}, {SrcRegs[Idx ^ 1]});
2548     unsigned AddOpc =
2549         Opc == AMDGPU::G_CTLZ_ZERO_UNDEF || Opc == AMDGPU::G_CTTZ_ZERO_UNDEF
2550             ? AMDGPU::G_ADD
2551             : AMDGPU::G_UADDSAT;
2552     Y = B.buildInstr(AddOpc, {S32}, {Y, B.buildConstant(S32, 32)});
2553     Register DstReg = MI.getOperand(0).getReg();
2554     B.buildUMin(DstReg, X, Y);
2555     MI.eraseFromParent();
2556     return;
2557   }
2558   case AMDGPU::G_SEXT:
2559   case AMDGPU::G_ZEXT:
2560   case AMDGPU::G_ANYEXT: {
2561     Register SrcReg = MI.getOperand(1).getReg();
2562     LLT SrcTy = MRI.getType(SrcReg);
2563     const bool Signed = Opc == AMDGPU::G_SEXT;
2564 
2565     assert(empty(OpdMapper.getVRegs(1)));
2566 
2567     MachineIRBuilder B(MI);
2568     const RegisterBank *SrcBank =
2569       OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
2570 
2571     Register DstReg = MI.getOperand(0).getReg();
2572     LLT DstTy = MRI.getType(DstReg);
2573     if (DstTy.isScalar() &&
2574         SrcBank != &AMDGPU::SGPRRegBank &&
2575         SrcBank != &AMDGPU::VCCRegBank &&
2576         // FIXME: Should handle any type that rounds to s64 when irregular
2577         // breakdowns are supported.
2578         DstTy.getSizeInBits() == 64 &&
2579         SrcTy.getSizeInBits() <= 32) {
2580       SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
2581 
2582       // Extend to 32-bit, and then extend the low half.
2583       if (Signed) {
2584         // TODO: Should really be buildSExtOrCopy
2585         B.buildSExtOrTrunc(DefRegs[0], SrcReg);
2586       } else if (Opc == AMDGPU::G_ZEXT) {
2587         B.buildZExtOrTrunc(DefRegs[0], SrcReg);
2588       } else {
2589         B.buildAnyExtOrTrunc(DefRegs[0], SrcReg);
2590       }
2591 
2592       extendLow32IntoHigh32(B, DefRegs[1], DefRegs[0], Opc, *SrcBank);
2593       MRI.setRegBank(DstReg, *SrcBank);
2594       MI.eraseFromParent();
2595       return;
2596     }
2597 
2598     if (SrcTy != LLT::scalar(1))
2599       return;
2600 
2601     // It is not legal to have a legalization artifact with a VCC source. Rather
2602     // than introducing a copy, insert the select that such a copy would have
2603     // been lowered to.
2604     if (SrcBank == &AMDGPU::VCCRegBank) {
2605       SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
2606 
2607       const RegisterBank *DstBank = &AMDGPU::VGPRRegBank;
2608 
2609       unsigned DstSize = DstTy.getSizeInBits();
2610       // 64-bit select is SGPR only
2611       const bool UseSel64 = DstSize > 32 &&
2612         SrcBank->getID() == AMDGPU::SGPRRegBankID;
2613 
2614       // TODO: Should s16 select be legal?
2615       LLT SelType = UseSel64 ? LLT::scalar(64) : LLT::scalar(32);
2616       auto True = B.buildConstant(SelType, Signed ? -1 : 1);
2617       auto False = B.buildConstant(SelType, 0);
2618 
2619       MRI.setRegBank(True.getReg(0), *DstBank);
2620       MRI.setRegBank(False.getReg(0), *DstBank);
2621       MRI.setRegBank(DstReg, *DstBank);
2622 
2623       if (DstSize > 32) {
2624         B.buildSelect(DefRegs[0], SrcReg, True, False);
2625         extendLow32IntoHigh32(B, DefRegs[1], DefRegs[0], Opc, *SrcBank, true);
2626       } else if (DstSize < 32) {
2627         auto Sel = B.buildSelect(SelType, SrcReg, True, False);
2628         MRI.setRegBank(Sel.getReg(0), *DstBank);
2629         B.buildTrunc(DstReg, Sel);
2630       } else {
2631         B.buildSelect(DstReg, SrcReg, True, False);
2632       }
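      // Sketch: a zero-extension of a vcc boolean, e.g.
      //   %d:vgpr(s32) = G_ZEXT %c:vcc(s1)
      // has been emitted as
      //   %d:vgpr(s32) = G_SELECT %c, 1, 0
      // with -1 in place of 1 for G_SEXT.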
2633 
2634       MI.eraseFromParent();
2635       return;
2636     }

    break;
  }
  case AMDGPU::G_BUILD_VECTOR:
  case AMDGPU::G_BUILD_VECTOR_TRUNC: {
    Register DstReg = MI.getOperand(0).getReg();
    LLT DstTy = MRI.getType(DstReg);
    if (DstTy != LLT::fixed_vector(2, 16))
      break;

    assert(MI.getNumOperands() == 3 && OpdMapper.getVRegs(0).empty());
    substituteSimpleCopyRegs(OpdMapper, 1);
    substituteSimpleCopyRegs(OpdMapper, 2);

    const RegisterBank *DstBank =
      OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
    if (DstBank == &AMDGPU::SGPRRegBank)
      break; // Can use S_PACK_* instructions.

    MachineIRBuilder B(MI);

    Register Lo = MI.getOperand(1).getReg();
    Register Hi = MI.getOperand(2).getReg();
    const LLT S32 = LLT::scalar(32);

    const RegisterBank *BankLo =
      OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
    const RegisterBank *BankHi =
      OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;

    Register ZextLo;
    Register ShiftHi;

    if (Opc == AMDGPU::G_BUILD_VECTOR) {
      ZextLo = B.buildZExt(S32, Lo).getReg(0);
      MRI.setRegBank(ZextLo, *BankLo);

      Register ZextHi = B.buildZExt(S32, Hi).getReg(0);
      MRI.setRegBank(ZextHi, *BankHi);

      auto ShiftAmt = B.buildConstant(S32, 16);
      MRI.setRegBank(ShiftAmt.getReg(0), *BankHi);

      ShiftHi = B.buildShl(S32, ZextHi, ShiftAmt).getReg(0);
      MRI.setRegBank(ShiftHi, *BankHi);
    } else {
      Register MaskLo = B.buildConstant(S32, 0xffff).getReg(0);
      MRI.setRegBank(MaskLo, *BankLo);

      auto ShiftAmt = B.buildConstant(S32, 16);
      MRI.setRegBank(ShiftAmt.getReg(0), *BankHi);

      ShiftHi = B.buildShl(S32, Hi, ShiftAmt).getReg(0);
      MRI.setRegBank(ShiftHi, *BankHi);

      ZextLo = B.buildAnd(S32, Lo, MaskLo).getReg(0);
      MRI.setRegBank(ZextLo, *BankLo);
    }

    auto Or = B.buildOr(S32, ZextLo, ShiftHi);
    MRI.setRegBank(Or.getReg(0), *DstBank);

    B.buildBitcast(DstReg, Or);
    MI.eraseFromParent();
    return;
  }
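  // Worked example for the packing above: building <2 x s16> from
  // Lo = 0x1234 and Hi = 0xABCD computes
  //   (0xABCD << 16) | (0x1234 & 0xffff) = 0xABCD1234,
  // which is then bitcast back to <2 x s16>. The G_BUILD_VECTOR_TRUNC form
  // masks instead of zero-extending because its sources are already s32.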
  case AMDGPU::G_EXTRACT_VECTOR_ELT: {
    SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0));

    assert(OpdMapper.getVRegs(1).empty() && OpdMapper.getVRegs(2).empty());

    Register DstReg = MI.getOperand(0).getReg();
    Register SrcReg = MI.getOperand(1).getReg();

    const LLT S32 = LLT::scalar(32);
    LLT DstTy = MRI.getType(DstReg);
    LLT SrcTy = MRI.getType(SrcReg);

    if (foldExtractEltToCmpSelect(MI, MRI, OpdMapper))
      return;

    MachineIRBuilder B(MI);

    const ValueMapping &DstMapping
      = OpdMapper.getInstrMapping().getOperandMapping(0);
    const RegisterBank *DstBank = DstMapping.BreakDown[0].RegBank;
    const RegisterBank *SrcBank =
      OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
    const RegisterBank *IdxBank =
        OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;

    Register BaseIdxReg;
    unsigned ConstOffset;
    std::tie(BaseIdxReg, ConstOffset) =
        AMDGPU::getBaseWithConstantOffset(MRI, MI.getOperand(2).getReg());

    // See if the index is an add of a constant, which will be foldable by
    // moving the base register of the index later if this is going to be
    // executed in a waterfall loop. This is essentially to reassociate the
    // add of a constant with the readfirstlane.
    bool ShouldMoveIndexIntoLoop = IdxBank != &AMDGPU::SGPRRegBank &&
                                   ConstOffset > 0 &&
                                   ConstOffset < SrcTy.getNumElements();

    // Move the base register. We'll re-insert the add later.
    if (ShouldMoveIndexIntoLoop)
      MI.getOperand(2).setReg(BaseIdxReg);

    // If this is a VGPR result only because the index was a VGPR result, the
    // actual indexing will be done on the SGPR source vector, which will
    // produce a scalar result. We need to copy to the VGPR result inside the
    // waterfall loop.
    const bool NeedCopyToVGPR = DstBank == &AMDGPU::VGPRRegBank &&
                                SrcBank == &AMDGPU::SGPRRegBank;
    if (DstRegs.empty()) {
      applyDefaultMapping(OpdMapper);

      executeInWaterfallLoop(MI, MRI, { 2 });

      if (NeedCopyToVGPR) {
        // We don't want a phi for this temporary reg.
        Register TmpReg = MRI.createGenericVirtualRegister(DstTy);
        MRI.setRegBank(TmpReg, AMDGPU::SGPRRegBank);
        MI.getOperand(0).setReg(TmpReg);
        B.setInsertPt(*MI.getParent(), ++MI.getIterator());

        // Use a v_mov_b32 here to make the exec dependency explicit.
        buildVCopy(B, DstReg, TmpReg);
      }

      // Re-insert the constant offset add inside the waterfall loop.
      if (ShouldMoveIndexIntoLoop)
        reinsertVectorIndexAdd(B, MI, 2, ConstOffset);

      return;
    }
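
    // From here on we split a 64-bit element extract into two 32-bit
    // extracts. For example (illustrative), extracting s64 element i from
    // <4 x s64> becomes a bitcast to <8 x s32> followed by extracts at
    // 32-bit indices 2*i and 2*i+1.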

    assert(DstTy.getSizeInBits() == 64);

    LLT Vec32 = LLT::fixed_vector(2 * SrcTy.getNumElements(), 32);

    auto CastSrc = B.buildBitcast(Vec32, SrcReg);
    auto One = B.buildConstant(S32, 1);

    MachineBasicBlock::iterator MII = MI.getIterator();

    // Split the vector index into 32-bit pieces. Prepare to move all of the
    // new instructions into a waterfall loop if necessary.
    //
    // Don't put the bitcast or constant in the loop.
    MachineInstrSpan Span(MII, &B.getMBB());

    // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1).
    auto IdxLo = B.buildShl(S32, BaseIdxReg, One);
    auto IdxHi = B.buildAdd(S32, IdxLo, One);

    auto Extract0 = B.buildExtractVectorElement(DstRegs[0], CastSrc, IdxLo);
    auto Extract1 = B.buildExtractVectorElement(DstRegs[1], CastSrc, IdxHi);

    MRI.setRegBank(DstReg, *DstBank);
    MRI.setRegBank(CastSrc.getReg(0), *SrcBank);
    MRI.setRegBank(One.getReg(0), AMDGPU::SGPRRegBank);
    MRI.setRegBank(IdxLo.getReg(0), AMDGPU::SGPRRegBank);
    MRI.setRegBank(IdxHi.getReg(0), AMDGPU::SGPRRegBank);

    SmallSet<Register, 4> OpsToWaterfall;
    if (!collectWaterfallOperands(OpsToWaterfall, MI, MRI, { 2 })) {
      MI.eraseFromParent();
      return;
    }

    // Remove the original instruction to avoid potentially confusing the
    // waterfall loop logic.
    B.setInstr(*Span.begin());
    MI.eraseFromParent();
    executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()),
                           OpsToWaterfall, MRI);

    if (NeedCopyToVGPR) {
      MachineBasicBlock *LoopBB = Extract1->getParent();
      Register TmpReg0 = MRI.createGenericVirtualRegister(S32);
      Register TmpReg1 = MRI.createGenericVirtualRegister(S32);
      MRI.setRegBank(TmpReg0, AMDGPU::SGPRRegBank);
      MRI.setRegBank(TmpReg1, AMDGPU::SGPRRegBank);

      Extract0->getOperand(0).setReg(TmpReg0);
      Extract1->getOperand(0).setReg(TmpReg1);

      B.setInsertPt(*LoopBB, ++Extract1->getIterator());

      buildVCopy(B, DstRegs[0], TmpReg0);
      buildVCopy(B, DstRegs[1], TmpReg1);
    }

    if (ShouldMoveIndexIntoLoop)
      reinsertVectorIndexAdd(B, *IdxLo, 1, ConstOffset);

    return;
  }
  case AMDGPU::G_INSERT_VECTOR_ELT: {
    SmallVector<Register, 2> InsRegs(OpdMapper.getVRegs(2));

    Register DstReg = MI.getOperand(0).getReg();
    LLT VecTy = MRI.getType(DstReg);

    assert(OpdMapper.getVRegs(0).empty());
    assert(OpdMapper.getVRegs(3).empty());

    if (substituteSimpleCopyRegs(OpdMapper, 1))
      MRI.setType(MI.getOperand(1).getReg(), VecTy);

    if (foldInsertEltToCmpSelect(MI, MRI, OpdMapper))
      return;

    const RegisterBank *IdxBank =
      OpdMapper.getInstrMapping().getOperandMapping(3).BreakDown[0].RegBank;

    Register SrcReg = MI.getOperand(1).getReg();
    Register InsReg = MI.getOperand(2).getReg();
    LLT InsTy = MRI.getType(InsReg);
    (void)InsTy;

    Register BaseIdxReg;
    unsigned ConstOffset;
    std::tie(BaseIdxReg, ConstOffset) =
        AMDGPU::getBaseWithConstantOffset(MRI, MI.getOperand(3).getReg());

    // See if the index is an add of a constant, which will be foldable by
    // moving the base register of the index later if this is going to be
    // executed in a waterfall loop. This is essentially to reassociate the
    // add of a constant with the readfirstlane.
    bool ShouldMoveIndexIntoLoop = IdxBank != &AMDGPU::SGPRRegBank &&
      ConstOffset > 0 &&
      ConstOffset < VecTy.getNumElements();

    // Move the base register. We'll re-insert the add later.
    if (ShouldMoveIndexIntoLoop)
      MI.getOperand(3).setReg(BaseIdxReg);

    if (InsRegs.empty()) {
      executeInWaterfallLoop(MI, MRI, { 3 });

      // Re-insert the constant offset add inside the waterfall loop.
      if (ShouldMoveIndexIntoLoop) {
        MachineIRBuilder B(MI);
        reinsertVectorIndexAdd(B, MI, 3, ConstOffset);
      }

      return;
    }
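
    // As with the extract case above, a 64-bit insert is split into two
    // 32-bit inserts on a <2*N x s32> view of the vector: e.g. inserting
    // s64 element i becomes inserts at 32-bit indices 2*i and 2*i+1.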
    assert(InsTy.getSizeInBits() == 64);

    const LLT S32 = LLT::scalar(32);
    LLT Vec32 = LLT::fixed_vector(2 * VecTy.getNumElements(), 32);

    MachineIRBuilder B(MI);
    auto CastSrc = B.buildBitcast(Vec32, SrcReg);
    auto One = B.buildConstant(S32, 1);

    // Split the vector index into 32-bit pieces. Prepare to move all of the
    // new instructions into a waterfall loop if necessary.
    //
    // Don't put the bitcast or constant in the loop.
    MachineInstrSpan Span(MachineBasicBlock::iterator(&MI), &B.getMBB());

    // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1).
    auto IdxLo = B.buildShl(S32, BaseIdxReg, One);
    auto IdxHi = B.buildAdd(S32, IdxLo, One);

    auto InsLo = B.buildInsertVectorElement(Vec32, CastSrc, InsRegs[0], IdxLo);
    auto InsHi = B.buildInsertVectorElement(Vec32, InsLo, InsRegs[1], IdxHi);

    const RegisterBank *DstBank =
      OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
    const RegisterBank *SrcBank =
      OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
    const RegisterBank *InsSrcBank =
      OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;

    MRI.setRegBank(InsReg, *InsSrcBank);
    MRI.setRegBank(CastSrc.getReg(0), *SrcBank);
    MRI.setRegBank(InsLo.getReg(0), *DstBank);
    MRI.setRegBank(InsHi.getReg(0), *DstBank);
    MRI.setRegBank(One.getReg(0), AMDGPU::SGPRRegBank);
    MRI.setRegBank(IdxLo.getReg(0), AMDGPU::SGPRRegBank);
    MRI.setRegBank(IdxHi.getReg(0), AMDGPU::SGPRRegBank);

    SmallSet<Register, 4> OpsToWaterfall;
    if (!collectWaterfallOperands(OpsToWaterfall, MI, MRI, { 3 })) {
      B.setInsertPt(B.getMBB(), MI);
      B.buildBitcast(DstReg, InsHi);
      MI.eraseFromParent();
      return;
    }

    B.setInstr(*Span.begin());
    MI.eraseFromParent();

    // Figure out the point after the waterfall loop before mangling the
    // control flow.
    executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()),
                           OpsToWaterfall, MRI);

    // The insertion point is now right after the original instruction.
    //
    // Keep the bitcast to the original vector type out of the loop. Doing
    // this saves an extra phi we don't need inside the loop.
    B.buildBitcast(DstReg, InsHi);

    // Re-insert the constant offset add inside the waterfall loop.
    if (ShouldMoveIndexIntoLoop)
      reinsertVectorIndexAdd(B, *IdxLo, 1, ConstOffset);

    return;
  }
  case AMDGPU::G_AMDGPU_BUFFER_LOAD:
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT:
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE:
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT:
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16:
  case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT:
  case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16:
  case AMDGPU::G_AMDGPU_BUFFER_STORE:
  case AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE:
  case AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT:
  case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT:
  case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16:
  case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT:
  case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16: {
    applyDefaultMapping(OpdMapper);
    executeInWaterfallLoop(MI, MRI, {1, 4});
    return;
  }
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC: {
    applyDefaultMapping(OpdMapper);
    executeInWaterfallLoop(MI, MRI, {2, 5});
    return;
  }
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX: {
    applyDefaultMapping(OpdMapper);
    executeInWaterfallLoop(MI, MRI, {2, 5});
    return;
  }
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP: {
    applyDefaultMapping(OpdMapper);
    executeInWaterfallLoop(MI, MRI, {3, 6});
    return;
  }
  case AMDGPU::G_AMDGPU_S_BUFFER_LOAD: {
    applyMappingSBufferLoad(OpdMapper);
    return;
  }
  case AMDGPU::G_INTRINSIC: {
    switch (MI.getIntrinsicID()) {
    case Intrinsic::amdgcn_readlane: {
      substituteSimpleCopyRegs(OpdMapper, 2);

      assert(OpdMapper.getVRegs(0).empty());
      assert(OpdMapper.getVRegs(3).empty());

      // Make sure the index is an SGPR. It doesn't make sense to run this in a
      // waterfall loop, so assume it's a uniform value.
      constrainOpWithReadfirstlane(MI, MRI, 3); // Index
      return;
    }
    case Intrinsic::amdgcn_writelane: {
      assert(OpdMapper.getVRegs(0).empty());
      assert(OpdMapper.getVRegs(2).empty());
      assert(OpdMapper.getVRegs(3).empty());

      substituteSimpleCopyRegs(OpdMapper, 4); // VGPR input val
      constrainOpWithReadfirstlane(MI, MRI, 2); // Source value
      constrainOpWithReadfirstlane(MI, MRI, 3); // Index
      return;
    }
    case Intrinsic::amdgcn_interp_p1:
    case Intrinsic::amdgcn_interp_p2:
    case Intrinsic::amdgcn_interp_mov:
    case Intrinsic::amdgcn_interp_p1_f16:
    case Intrinsic::amdgcn_interp_p2_f16: {
      applyDefaultMapping(OpdMapper);

      // Readlane for m0 value, which is always the last operand.
      // FIXME: Should this be a waterfall loop instead?
      constrainOpWithReadfirstlane(MI, MRI, MI.getNumOperands() - 1); // Index
      return;
    }
    case Intrinsic::amdgcn_permlane16:
    case Intrinsic::amdgcn_permlanex16: {
      // Doing a waterfall loop over these wouldn't make any sense.
      substituteSimpleCopyRegs(OpdMapper, 2);
      substituteSimpleCopyRegs(OpdMapper, 3);
      constrainOpWithReadfirstlane(MI, MRI, 4);
      constrainOpWithReadfirstlane(MI, MRI, 5);
      return;
    }
    case Intrinsic::amdgcn_sbfe:
      applyMappingBFE(OpdMapper, true);
      return;
    case Intrinsic::amdgcn_ubfe:
      applyMappingBFE(OpdMapper, false);
      return;
    case Intrinsic::amdgcn_ballot:
      // Use default handling and insert copy to vcc source.
      break;
    }
    break;
  }
  case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
  case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE: {
    const AMDGPU::RsrcIntrinsic *RSrcIntrin
      = AMDGPU::lookupRsrcIntrinsic(MI.getIntrinsicID());
    assert(RSrcIntrin && RSrcIntrin->IsImage);
    // Non-images can have complications from operands that allow both SGPR
    // and VGPR. For now it's too complicated to figure out the final opcode
    // to derive the register bank from the MCInstrDesc.
    applyMappingImage(MI, OpdMapper, MRI, RSrcIntrin->RsrcArg);
    return;
  }
  case AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY: {
    unsigned N = MI.getNumExplicitOperands() - 2;
    applyDefaultMapping(OpdMapper);
    executeInWaterfallLoop(MI, MRI, { N });
    return;
  }
  case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: {
    auto IntrID = MI.getIntrinsicID();
    switch (IntrID) {
    case Intrinsic::amdgcn_ds_ordered_add:
    case Intrinsic::amdgcn_ds_ordered_swap: {
      // This is only allowed to execute with 1 lane, so readfirstlane is safe.
      assert(OpdMapper.getVRegs(0).empty());
      substituteSimpleCopyRegs(OpdMapper, 3);
      constrainOpWithReadfirstlane(MI, MRI, 2); // M0
      return;
    }
    case Intrinsic::amdgcn_ds_gws_init:
    case Intrinsic::amdgcn_ds_gws_barrier:
    case Intrinsic::amdgcn_ds_gws_sema_br: {
      // Only the first lane executes, so readfirstlane is safe.
      substituteSimpleCopyRegs(OpdMapper, 1);
      constrainOpWithReadfirstlane(MI, MRI, 2); // M0
      return;
    }
    case Intrinsic::amdgcn_ds_gws_sema_v:
    case Intrinsic::amdgcn_ds_gws_sema_p:
    case Intrinsic::amdgcn_ds_gws_sema_release_all: {
      // Only the first lane executes, so readfirstlane is safe.
      constrainOpWithReadfirstlane(MI, MRI, 1); // M0
      return;
    }
    case Intrinsic::amdgcn_ds_append:
    case Intrinsic::amdgcn_ds_consume: {
      constrainOpWithReadfirstlane(MI, MRI, 2); // M0
      return;
    }
    case Intrinsic::amdgcn_s_sendmsg:
    case Intrinsic::amdgcn_s_sendmsghalt: {
      // FIXME: Should this use a waterfall loop?
      constrainOpWithReadfirstlane(MI, MRI, 2); // M0
      return;
    }
    case Intrinsic::amdgcn_s_setreg: {
      constrainOpWithReadfirstlane(MI, MRI, 2);
      return;
    }
    default: {
      if (const AMDGPU::RsrcIntrinsic *RSrcIntrin =
              AMDGPU::lookupRsrcIntrinsic(IntrID)) {
        // Non-images can have complications from operands that allow both SGPR
        // and VGPR. For now it's too complicated to figure out the final opcode
        // to derive the register bank from the MCInstrDesc.
        if (RSrcIntrin->IsImage) {
          applyMappingImage(MI, OpdMapper, MRI, RSrcIntrin->RsrcArg);
          return;
        }
      }

      break;
    }
    }
    break;
  }
  case AMDGPU::G_LOAD:
  case AMDGPU::G_ZEXTLOAD:
  case AMDGPU::G_SEXTLOAD: {
    if (applyMappingLoad(MI, OpdMapper, MRI))
      return;
    break;
  }
  case AMDGPU::G_DYN_STACKALLOC:
    applyMappingDynStackAlloc(MI, OpdMapper, MRI);
    return;
  case AMDGPU::G_SBFX:
    applyMappingBFE(OpdMapper, /*Signed*/ true);
    return;
  case AMDGPU::G_UBFX:
    applyMappingBFE(OpdMapper, /*Signed*/ false);
    return;
  default:
    break;
  }

  return applyDefaultMapping(OpdMapper);
}

// vgpr, sgpr -> vgpr
// vgpr, agpr -> vgpr
// agpr, agpr -> agpr
// agpr, sgpr -> vgpr
static unsigned regBankUnion(unsigned RB0, unsigned RB1) {
  if (RB0 == AMDGPU::InvalidRegBankID)
    return RB1;
  if (RB1 == AMDGPU::InvalidRegBankID)
    return RB0;

  if (RB0 == AMDGPU::SGPRRegBankID && RB1 == AMDGPU::SGPRRegBankID)
    return AMDGPU::SGPRRegBankID;

  if (RB0 == AMDGPU::AGPRRegBankID && RB1 == AMDGPU::AGPRRegBankID)
    return AMDGPU::AGPRRegBankID;

  return AMDGPU::VGPRRegBankID;
}

static unsigned regBankBoolUnion(unsigned RB0, unsigned RB1) {
  if (RB0 == AMDGPU::InvalidRegBankID)
    return RB1;
  if (RB1 == AMDGPU::InvalidRegBankID)
    return RB0;

  // vcc, vcc -> vcc
  // vcc, sgpr -> vcc
  // vcc, vgpr -> vcc
  if (RB0 == AMDGPU::VCCRegBankID || RB1 == AMDGPU::VCCRegBankID)
    return AMDGPU::VCCRegBankID;

  // Neither bank is vcc, e.g. sgpr, vgpr -> vgpr.
  return regBankUnion(RB0, RB1);
}

unsigned AMDGPURegisterBankInfo::getMappingType(const MachineRegisterInfo &MRI,
                                                const MachineInstr &MI) const {
  unsigned RegBank = AMDGPU::InvalidRegBankID;

  for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
    if (!MI.getOperand(i).isReg())
      continue;
    Register Reg = MI.getOperand(i).getReg();
    if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) {
      RegBank = regBankUnion(RegBank, Bank->getID());
      if (RegBank == AMDGPU::VGPRRegBankID)
        break;
    }
  }

  return RegBank;
}

bool AMDGPURegisterBankInfo::isSALUMapping(const MachineInstr &MI) const {
  const MachineFunction &MF = *MI.getParent()->getParent();
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
    if (!MI.getOperand(i).isReg())
      continue;
    Register Reg = MI.getOperand(i).getReg();
    if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) {
      if (Bank->getID() != AMDGPU::SGPRRegBankID)
        return false;
    }
  }
  return true;
}

const RegisterBankInfo::InstructionMapping &
AMDGPURegisterBankInfo::getDefaultMappingSOP(const MachineInstr &MI) const {
  const MachineFunction &MF = *MI.getParent()->getParent();
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());

  for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
    const MachineOperand &SrcOp = MI.getOperand(i);
    if (!SrcOp.isReg())
      continue;

    unsigned Size = getSizeInBits(SrcOp.getReg(), MRI, *TRI);
    OpdsMapping[i] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
  }
  return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
                               MI.getNumOperands());
}

const RegisterBankInfo::InstructionMapping &
AMDGPURegisterBankInfo::getDefaultMappingVOP(const MachineInstr &MI) const {
  const MachineFunction &MF = *MI.getParent()->getParent();
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());

  // Even though we technically could use SGPRs, this would require knowledge of
  // the constant bus restriction. Force all sources to VGPR (except for VCC).
  //
  // TODO: Unary ops are trivially OK, so accept SGPRs?
  for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
    const MachineOperand &Src = MI.getOperand(i);
    if (!Src.isReg())
      continue;

    unsigned Size = getSizeInBits(Src.getReg(), MRI, *TRI);
    unsigned BankID = Size == 1 ? AMDGPU::VCCRegBankID : AMDGPU::VGPRRegBankID;
    OpdsMapping[i] = AMDGPU::getValueMapping(BankID, Size);
  }

  return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
                               MI.getNumOperands());
}
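
// E.g. for %dst:_(s32) = G_FADD %a, %b this assigns VGPR mappings to %dst,
// %a, and %b regardless of whether the sources happen to be uniform; only
// s1 operands are assigned to the VCC bank by the Size == 1 case above.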

const RegisterBankInfo::InstructionMapping &
AMDGPURegisterBankInfo::getDefaultMappingAllVGPR(const MachineInstr &MI) const {
  const MachineFunction &MF = *MI.getParent()->getParent();
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());

  for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
    const MachineOperand &Op = MI.getOperand(I);
    if (!Op.isReg())
      continue;

    unsigned Size = getSizeInBits(Op.getReg(), MRI, *TRI);
    OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
  }

  return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
                               MI.getNumOperands());
}

const RegisterBankInfo::InstructionMapping &
AMDGPURegisterBankInfo::getImageMapping(const MachineRegisterInfo &MRI,
                                        const MachineInstr &MI,
                                        int RsrcIdx) const {
  // The reported argument index is relative to the IR intrinsic call arguments,
  // so we need to shift by the number of defs and the intrinsic ID.
  RsrcIdx += MI.getNumExplicitDefs() + 1;
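
  // For example (illustrative), with a single def the machine operands are
  // laid out as [def, intrinsic ID, args...], so an IR-relative rsrc index
  // of N lands at machine operand N + 1 (def) + 1 (intrinsic ID) = N + 2.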

  const int NumOps = MI.getNumOperands();
  SmallVector<const ValueMapping *, 8> OpdsMapping(NumOps);

  // TODO: Should packed/unpacked D16 difference be reported here as part of
  // the value mapping?
  for (int I = 0; I != NumOps; ++I) {
    if (!MI.getOperand(I).isReg())
      continue;

    Register OpReg = MI.getOperand(I).getReg();
    // We replace some dead address operands with $noreg.
    if (!OpReg)
      continue;

    unsigned Size = getSizeInBits(OpReg, MRI, *TRI);

    // FIXME: Probably need a new intrinsic register bank searchable table to
    // handle arbitrary intrinsics easily.
    //
    // If this has a sampler, it immediately follows rsrc.
    const bool MustBeSGPR = I == RsrcIdx || I == RsrcIdx + 1;

    if (MustBeSGPR) {
      // This must be an SGPR, so we must report whatever bank it currently
      // has as legal.
      unsigned NewBank = getRegBankID(OpReg, MRI, AMDGPU::SGPRRegBankID);
      OpdsMapping[I] = AMDGPU::getValueMapping(NewBank, Size);
    } else {
      // Some operands must be VGPR, and these are easy to copy to.
      OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
    }
  }

  return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping), NumOps);
}

/// Return the mapping for a pointer argument.
const RegisterBankInfo::ValueMapping *
AMDGPURegisterBankInfo::getValueMappingForPtr(const MachineRegisterInfo &MRI,
                                              Register PtrReg) const {
  LLT PtrTy = MRI.getType(PtrReg);
  unsigned Size = PtrTy.getSizeInBits();
  if (Subtarget.useFlatForGlobal() ||
      !AMDGPU::isFlatGlobalAddrSpace(PtrTy.getAddressSpace()))
    return AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);

  // If we're using MUBUF instructions for global memory, an SGPR base register
  // is possible. Otherwise this needs to be a VGPR.
  const RegisterBank *PtrBank = getRegBank(PtrReg, MRI, *TRI);
  return AMDGPU::getValueMapping(PtrBank->getID(), Size);
}

const RegisterBankInfo::InstructionMapping &
AMDGPURegisterBankInfo::getInstrMappingForLoad(const MachineInstr &MI) const {
  const MachineFunction &MF = *MI.getParent()->getParent();
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  SmallVector<const ValueMapping*, 2> OpdsMapping(2);
  unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
  Register PtrReg = MI.getOperand(1).getReg();
  LLT PtrTy = MRI.getType(PtrReg);
  unsigned AS = PtrTy.getAddressSpace();
  unsigned PtrSize = PtrTy.getSizeInBits();

  const ValueMapping *ValMapping;
  const ValueMapping *PtrMapping;

  const RegisterBank *PtrBank = getRegBank(PtrReg, MRI, *TRI);

  if (PtrBank == &AMDGPU::SGPRRegBank && AMDGPU::isFlatGlobalAddrSpace(AS)) {
    if (isScalarLoadLegal(MI)) {
      // We have a uniform instruction so we want to use an SMRD load.
      ValMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
      PtrMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize);
    } else {
      ValMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);

      // If we're using MUBUF instructions for global memory, an SGPR base
      // register is possible. Otherwise this needs to be a VGPR.
      unsigned PtrBankID = Subtarget.useFlatForGlobal() ?
        AMDGPU::VGPRRegBankID : AMDGPU::SGPRRegBankID;

      PtrMapping = AMDGPU::getValueMapping(PtrBankID, PtrSize);
    }
  } else {
    ValMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
    PtrMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize);
  }

  OpdsMapping[0] = ValMapping;
  OpdsMapping[1] = PtrMapping;
  const RegisterBankInfo::InstructionMapping &Mapping = getInstructionMapping(
      1, 1, getOperandsMapping(OpdsMapping), MI.getNumOperands());
  return Mapping;

  // FIXME: Do we want to add a mapping for FLAT load, or should we just
  // handle that during instruction selection?
}

unsigned
AMDGPURegisterBankInfo::getRegBankID(Register Reg,
                                     const MachineRegisterInfo &MRI,
                                     unsigned Default) const {
  const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
  return Bank ? Bank->getID() : Default;
}

const RegisterBankInfo::ValueMapping *
AMDGPURegisterBankInfo::getSGPROpMapping(Register Reg,
                                         const MachineRegisterInfo &MRI,
                                         const TargetRegisterInfo &TRI) const {
  // Lie and claim anything is legal, even though this needs to be an SGPR.
  // applyMapping will have to deal with it as a waterfall loop.
  unsigned Bank = getRegBankID(Reg, MRI, AMDGPU::SGPRRegBankID);
  unsigned Size = getSizeInBits(Reg, MRI, TRI);
  return AMDGPU::getValueMapping(Bank, Size);
}

const RegisterBankInfo::ValueMapping *
AMDGPURegisterBankInfo::getVGPROpMapping(Register Reg,
                                         const MachineRegisterInfo &MRI,
                                         const TargetRegisterInfo &TRI) const {
  unsigned Size = getSizeInBits(Reg, MRI, TRI);
  return AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
}

const RegisterBankInfo::ValueMapping *
AMDGPURegisterBankInfo::getAGPROpMapping(Register Reg,
                                         const MachineRegisterInfo &MRI,
                                         const TargetRegisterInfo &TRI) const {
  unsigned Size = getSizeInBits(Reg, MRI, TRI);
  return AMDGPU::getValueMapping(AMDGPU::AGPRRegBankID, Size);
}

///
/// This function must return a legal mapping, because
/// AMDGPURegisterBankInfo::getInstrAlternativeMappings() is not called
/// in RegBankSelect::Mode::Fast. Any mapping that would cause a
/// VGPR to SGPR copy to be generated is illegal.
///
// Operands that must be SGPRs must accept potentially divergent VGPRs as
// legal. These will be dealt with in applyMappingImpl.
//
const RegisterBankInfo::InstructionMapping &
AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
  const MachineFunction &MF = *MI.getParent()->getParent();
  const MachineRegisterInfo &MRI = MF.getRegInfo();

  if (MI.isCopy() || MI.getOpcode() == AMDGPU::G_FREEZE) {
    // The default logic bothers to analyze impossible alternative mappings. We
    // want the most straightforward mapping, so just directly handle this.
    const RegisterBank *DstBank = getRegBank(MI.getOperand(0).getReg(), MRI,
                                             *TRI);
    const RegisterBank *SrcBank = getRegBank(MI.getOperand(1).getReg(), MRI,
                                             *TRI);
    assert(SrcBank && "src bank should have been assigned already");
    if (!DstBank)
      DstBank = SrcBank;

    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
    if (cannotCopy(*DstBank, *SrcBank, Size))
      return getInvalidInstructionMapping();

    const ValueMapping &ValMap = getValueMapping(0, Size, *DstBank);
    unsigned OpdsMappingSize = MI.isCopy() ? 1 : 2;
    SmallVector<const ValueMapping *, 1> OpdsMapping(OpdsMappingSize);
    OpdsMapping[0] = &ValMap;
    if (MI.getOpcode() == AMDGPU::G_FREEZE)
      OpdsMapping[1] = &ValMap;

    return getInstructionMapping(
        1, /*Cost*/ 1,
        /*OperandsMapping*/ getOperandsMapping(OpdsMapping), OpdsMappingSize);
  }

  if (MI.isRegSequence()) {
    // If any input is a VGPR, the result must be a VGPR. The default handling
    // assumes any copy between banks is legal.
    unsigned BankID = AMDGPU::SGPRRegBankID;

    for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
      auto OpBank = getRegBankID(MI.getOperand(I).getReg(), MRI);
      // It doesn't make sense to use vcc or scc banks here, so just ignore
      // them.
      if (OpBank != AMDGPU::SGPRRegBankID) {
        BankID = AMDGPU::VGPRRegBankID;
        break;
      }
    }
    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);

    const ValueMapping &ValMap = getValueMapping(0, Size, getRegBank(BankID));
    return getInstructionMapping(
        1, /*Cost*/ 1,
        /*OperandsMapping*/ getOperandsMapping({&ValMap}), 1);
  }

  // The default handling is broken and doesn't handle illegal SGPR->VGPR copies
  // properly.
  //
  // TODO: There are additional exec masking dependencies to analyze.
  if (MI.getOpcode() == TargetOpcode::G_PHI) {
    unsigned ResultBank = AMDGPU::InvalidRegBankID;
    Register DstReg = MI.getOperand(0).getReg();

    // Sometimes the result may have already been assigned a bank.
    if (const RegisterBank *DstBank = getRegBank(DstReg, MRI, *TRI))
      ResultBank = DstBank->getID();

    for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
      Register Reg = MI.getOperand(I).getReg();
      const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);

      // FIXME: Assuming VGPR for any undetermined inputs.
      if (!Bank || Bank->getID() == AMDGPU::VGPRRegBankID) {
        ResultBank = AMDGPU::VGPRRegBankID;
        break;
      }

      // FIXME: Need to promote SGPR case to s32
      unsigned OpBank = Bank->getID();
      ResultBank = regBankBoolUnion(ResultBank, OpBank);
    }

    assert(ResultBank != AMDGPU::InvalidRegBankID);

    unsigned Size = MRI.getType(DstReg).getSizeInBits();

    const ValueMapping &ValMap =
        getValueMapping(0, Size, getRegBank(ResultBank));
    return getInstructionMapping(
        1, /*Cost*/ 1,
        /*OperandsMapping*/ getOperandsMapping({&ValMap}), 1);
  }
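
  // E.g. a G_PHI of s1 values with one vcc-bank input and one SGPR boolean
  // input resolves to vcc via regBankBoolUnion above, while any VGPR (or
  // still-unassigned) input forces a VGPR result.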

  const RegisterBankInfo::InstructionMapping &Mapping = getInstrMappingImpl(MI);
  if (Mapping.isValid())
    return Mapping;

  SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());

  switch (MI.getOpcode()) {
  default:
    return getInvalidInstructionMapping();

  case AMDGPU::G_AND:
  case AMDGPU::G_OR:
  case AMDGPU::G_XOR: {
    unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    if (Size == 1) {
      const RegisterBank *DstBank
        = getRegBank(MI.getOperand(0).getReg(), MRI, *TRI);

      unsigned TargetBankID = AMDGPU::InvalidRegBankID;
      unsigned BankLHS = AMDGPU::InvalidRegBankID;
      unsigned BankRHS = AMDGPU::InvalidRegBankID;
      if (DstBank) {
        TargetBankID = DstBank->getID();
        if (DstBank == &AMDGPU::VCCRegBank) {
          TargetBankID = AMDGPU::VCCRegBankID;
          BankLHS = AMDGPU::VCCRegBankID;
          BankRHS = AMDGPU::VCCRegBankID;
        } else {
          BankLHS = getRegBankID(MI.getOperand(1).getReg(), MRI,
                                 AMDGPU::SGPRRegBankID);
          BankRHS = getRegBankID(MI.getOperand(2).getReg(), MRI,
                                 AMDGPU::SGPRRegBankID);
        }
      } else {
        BankLHS = getRegBankID(MI.getOperand(1).getReg(), MRI,
                               AMDGPU::VCCRegBankID);
        BankRHS = getRegBankID(MI.getOperand(2).getReg(), MRI,
                               AMDGPU::VCCRegBankID);

        // Both inputs should be true booleans to produce a boolean result.
        if (BankLHS == AMDGPU::VGPRRegBankID || BankRHS == AMDGPU::VGPRRegBankID) {
          TargetBankID = AMDGPU::VGPRRegBankID;
        } else if (BankLHS == AMDGPU::VCCRegBankID || BankRHS == AMDGPU::VCCRegBankID) {
          TargetBankID = AMDGPU::VCCRegBankID;
          BankLHS = AMDGPU::VCCRegBankID;
          BankRHS = AMDGPU::VCCRegBankID;
        } else if (BankLHS == AMDGPU::SGPRRegBankID && BankRHS == AMDGPU::SGPRRegBankID) {
          TargetBankID = AMDGPU::SGPRRegBankID;
        }
      }

      OpdsMapping[0] = AMDGPU::getValueMapping(TargetBankID, Size);
      OpdsMapping[1] = AMDGPU::getValueMapping(BankLHS, Size);
      OpdsMapping[2] = AMDGPU::getValueMapping(BankRHS, Size);
      break;
    }

    if (Size == 64) {
      if (isSALUMapping(MI)) {
        OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::SGPRRegBankID, Size);
        OpdsMapping[1] = OpdsMapping[2] = OpdsMapping[0];
      } else {
        OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size);
        unsigned Bank1 = getRegBankID(MI.getOperand(1).getReg(), MRI /*, DefaultBankID*/);
        OpdsMapping[1] = AMDGPU::getValueMapping(Bank1, Size);

        unsigned Bank2 = getRegBankID(MI.getOperand(2).getReg(), MRI /*, DefaultBankID*/);
        OpdsMapping[2] = AMDGPU::getValueMapping(Bank2, Size);
      }
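
      // For the VGPR case the SGPR64Only mapping describes the 64-bit value
      // as two 32-bit pieces: e.g. a 64-bit G_AND on VGPRs is ultimately
      // selected as two 32-bit v_and_b32 halves, since the VALU has no
      // 64-bit bitwise operations.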

      break;
    }

    LLVM_FALLTHROUGH;
  }
  case AMDGPU::G_PTR_ADD:
  case AMDGPU::G_PTRMASK:
  case AMDGPU::G_ADD:
  case AMDGPU::G_SUB:
  case AMDGPU::G_MUL:
  case AMDGPU::G_SHL:
  case AMDGPU::G_LSHR:
  case AMDGPU::G_ASHR:
  case AMDGPU::G_UADDO:
  case AMDGPU::G_USUBO:
  case AMDGPU::G_UADDE:
  case AMDGPU::G_SADDE:
  case AMDGPU::G_USUBE:
  case AMDGPU::G_SSUBE:
  case AMDGPU::G_SMIN:
  case AMDGPU::G_SMAX:
  case AMDGPU::G_UMIN:
  case AMDGPU::G_UMAX:
  case AMDGPU::G_ABS:
  case AMDGPU::G_SHUFFLE_VECTOR:
  case AMDGPU::G_SBFX:
  case AMDGPU::G_UBFX:
    if (isSALUMapping(MI))
      return getDefaultMappingSOP(MI);
    LLVM_FALLTHROUGH;

  case AMDGPU::G_SADDSAT: // FIXME: Could lower sat ops for SALU
  case AMDGPU::G_SSUBSAT:
  case AMDGPU::G_UADDSAT:
  case AMDGPU::G_USUBSAT:
  case AMDGPU::G_FADD:
  case AMDGPU::G_FSUB:
  case AMDGPU::G_FPTOSI:
  case AMDGPU::G_FPTOUI:
  case AMDGPU::G_FMUL:
  case AMDGPU::G_FMA:
  case AMDGPU::G_FMAD:
  case AMDGPU::G_FSQRT:
  case AMDGPU::G_FFLOOR:
  case AMDGPU::G_FCEIL:
  case AMDGPU::G_FRINT:
  case AMDGPU::G_SITOFP:
  case AMDGPU::G_UITOFP:
  case AMDGPU::G_FPTRUNC:
  case AMDGPU::G_FPEXT:
  case AMDGPU::G_FEXP2:
  case AMDGPU::G_FLOG2:
  case AMDGPU::G_FMINNUM:
  case AMDGPU::G_FMAXNUM:
  case AMDGPU::G_FMINNUM_IEEE:
  case AMDGPU::G_FMAXNUM_IEEE:
  case AMDGPU::G_FCANONICALIZE:
  case AMDGPU::G_INTRINSIC_TRUNC:
  case AMDGPU::G_BSWAP: // TODO: Somehow expand for scalar?
  case AMDGPU::G_FSHR: // TODO: Expand for scalar
  case AMDGPU::G_AMDGPU_FMIN_LEGACY:
  case AMDGPU::G_AMDGPU_FMAX_LEGACY:
  case AMDGPU::G_AMDGPU_RCP_IFLAG:
  case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0:
  case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1:
  case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2:
  case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
  case AMDGPU::G_AMDGPU_CVT_PK_I16_I32:
  case AMDGPU::G_AMDGPU_SMED3:
    return getDefaultMappingVOP(MI);
  case AMDGPU::G_UMULH:
  case AMDGPU::G_SMULH: {
    if (Subtarget.hasScalarMulHiInsts() && isSALUMapping(MI))
      return getDefaultMappingSOP(MI);
    return getDefaultMappingVOP(MI);
  }
  case AMDGPU::G_IMPLICIT_DEF: {
    unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
    break;
  }
  case AMDGPU::G_FCONSTANT:
  case AMDGPU::G_CONSTANT:
  case AMDGPU::G_GLOBAL_VALUE:
  case AMDGPU::G_BLOCK_ADDR:
  case AMDGPU::G_READCYCLECOUNTER: {
    unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
    break;
  }
  case AMDGPU::G_FRAME_INDEX: {
    // TODO: This should be the same as other constants, but eliminateFrameIndex
    // currently assumes VALU uses.
    unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
    break;
  }
  case AMDGPU::G_DYN_STACKALLOC: {
    // Result is always uniform, and a wave reduction is needed for the source.
    OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
    unsigned SrcBankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
    OpdsMapping[1] = AMDGPU::getValueMapping(SrcBankID, 32);
    break;
  }
  case AMDGPU::G_INSERT: {
    unsigned BankID = getMappingType(MRI, MI);
    unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
    unsigned SrcSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
    unsigned EltSize = getSizeInBits(MI.getOperand(2).getReg(), MRI, *TRI);
    OpdsMapping[0] = AMDGPU::getValueMapping(BankID, DstSize);
    OpdsMapping[1] = AMDGPU::getValueMapping(BankID, SrcSize);
    OpdsMapping[2] = AMDGPU::getValueMapping(BankID, EltSize);
    OpdsMapping[3] = nullptr;
    break;
  }
  case AMDGPU::G_EXTRACT: {
    unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
    unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
    unsigned SrcSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
    OpdsMapping[0] = AMDGPU::getValueMapping(BankID, DstSize);
    OpdsMapping[1] = AMDGPU::getValueMapping(BankID, SrcSize);
    OpdsMapping[2] = nullptr;
    break;
  }
  case AMDGPU::G_BUILD_VECTOR:
  case AMDGPU::G_BUILD_VECTOR_TRUNC: {
    LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
    if (DstTy == LLT::fixed_vector(2, 16)) {
      unsigned DstSize = DstTy.getSizeInBits();
      unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
      unsigned Src0BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
      unsigned Src1BankID = getRegBankID(MI.getOperand(2).getReg(), MRI);
      unsigned DstBankID = regBankUnion(Src0BankID, Src1BankID);

      OpdsMapping[0] = AMDGPU::getValueMapping(DstBankID, DstSize);
      OpdsMapping[1] = AMDGPU::getValueMapping(Src0BankID, SrcSize);
      OpdsMapping[2] = AMDGPU::getValueMapping(Src1BankID, SrcSize);
      break;
    }

    LLVM_FALLTHROUGH;
  }
  case AMDGPU::G_MERGE_VALUES:
  case AMDGPU::G_CONCAT_VECTORS: {
    unsigned Bank = getMappingType(MRI, MI);
    unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();

    OpdsMapping[0] = AMDGPU::getValueMapping(Bank, DstSize);
    // Op1 and Dst should use the same register bank.
    for (unsigned i = 1, e = MI.getNumOperands(); i != e; ++i)
      OpdsMapping[i] = AMDGPU::getValueMapping(Bank, SrcSize);
    break;
  }
  case AMDGPU::G_BITREVERSE:
  case AMDGPU::G_BITCAST:
  case AMDGPU::G_INTTOPTR:
  case AMDGPU::G_PTRTOINT:
  case AMDGPU::G_FABS:
  case AMDGPU::G_FNEG: {
    unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
    OpdsMapping[0] = OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size);
    break;
  }
  case AMDGPU::G_AMDGPU_FFBH_U32:
  case AMDGPU::G_AMDGPU_FFBL_B32:
  case AMDGPU::G_CTLZ_ZERO_UNDEF:
  case AMDGPU::G_CTTZ_ZERO_UNDEF: {
    unsigned Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
    unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
    OpdsMapping[0] = AMDGPU::getValueMapping(BankID, 32);
    OpdsMapping[1] = AMDGPU::getValueMappingSGPR64Only(BankID, Size);
    break;
  }
  case AMDGPU::G_CTPOP: {
    unsigned Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
    unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
    OpdsMapping[0] = AMDGPU::getValueMapping(BankID, 32);

    // This should really be getValueMappingSGPR64Only, but allowing the generic
    // code to handle the register split just makes using LegalizerHelper more
    // difficult.
    OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size);
    break;
  }
  case AMDGPU::G_TRUNC: {
    Register Dst = MI.getOperand(0).getReg();
    Register Src = MI.getOperand(1).getReg();
    unsigned Bank = getRegBankID(Src, MRI);
    unsigned DstSize = getSizeInBits(Dst, MRI, *TRI);
    unsigned SrcSize = getSizeInBits(Src, MRI, *TRI);
    OpdsMapping[0] = AMDGPU::getValueMapping(Bank, DstSize);
    OpdsMapping[1] = AMDGPU::getValueMapping(Bank, SrcSize);
    break;
  }
  case AMDGPU::G_ZEXT:
  case AMDGPU::G_SEXT:
  case AMDGPU::G_ANYEXT:
  case AMDGPU::G_SEXT_INREG: {
    Register Dst = MI.getOperand(0).getReg();
    Register Src = MI.getOperand(1).getReg();
    unsigned DstSize = getSizeInBits(Dst, MRI, *TRI);
    unsigned SrcSize = getSizeInBits(Src, MRI, *TRI);

    unsigned DstBank;
    const RegisterBank *SrcBank = getRegBank(Src, MRI, *TRI);
    assert(SrcBank);
    switch (SrcBank->getID()) {
    case AMDGPU::SGPRRegBankID:
      DstBank = AMDGPU::SGPRRegBankID;
      break;
    default:
      DstBank = AMDGPU::VGPRRegBankID;
      break;
    }

    // Scalar extend can use 64-bit BFE, but VGPRs require extending to
    // 32-bits, and then to 64.
    OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(DstBank, DstSize);
    OpdsMapping[1] = AMDGPU::getValueMappingSGPR64Only(SrcBank->getID(),
                                                       SrcSize);
    break;
  }
  case AMDGPU::G_FCMP: {
    unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
    unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI);
    OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
    OpdsMapping[1] = nullptr; // Predicate Operand.
    OpdsMapping[2] = AMDGPU::getValueMapping(Op2Bank, Size);
    OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
    break;
  }
  case AMDGPU::G_STORE: {
    assert(MI.getOperand(0).isReg());
    unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();

    // FIXME: We need to specify a different reg bank once scalar stores are
    // supported.
    const ValueMapping *ValMapping =
        AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
    OpdsMapping[0] = ValMapping;
    OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg());
    break;
  }
  case AMDGPU::G_ICMP: {
    auto Pred = static_cast<CmpInst::Predicate>(MI.getOperand(1).getPredicate());
    unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();

    // See if the result register has already been constrained to vcc, which may
    // happen due to control flow intrinsic lowering.
    unsigned DstBank = getRegBankID(MI.getOperand(0).getReg(), MRI,
                                    AMDGPU::SGPRRegBankID);
    unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI);
    unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI);

    bool CanUseSCC = DstBank == AMDGPU::SGPRRegBankID &&
                     Op2Bank == AMDGPU::SGPRRegBankID &&
                     Op3Bank == AMDGPU::SGPRRegBankID &&
      (Size == 32 || (Size == 64 &&
                      (Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE) &&
                      Subtarget.hasScalarCompareEq64()));

    DstBank = CanUseSCC ? AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID;
    unsigned SrcBank = CanUseSCC ? AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;

    // TODO: Use 32-bit for scalar output size.
    // SCC results will need to be copied to a 32-bit SGPR virtual register.
    const unsigned ResultSize = 1;

    OpdsMapping[0] = AMDGPU::getValueMapping(DstBank, ResultSize);
    OpdsMapping[2] = AMDGPU::getValueMapping(SrcBank, Size);
    OpdsMapping[3] = AMDGPU::getValueMapping(SrcBank, Size);
    break;
  }
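
  // E.g. a 64-bit G_ICMP eq/ne with all-SGPR operands can use s_cmp_eq_u64
  // on subtargets where hasScalarCompareEq64() holds, producing an SCC
  // result; otherwise the compare goes to the VALU and the result lands in
  // the vcc bank.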
  case AMDGPU::G_EXTRACT_VECTOR_ELT: {
    // A VGPR index can be used for a waterfall when indexing an SGPR vector.
    unsigned SrcBankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
    unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
    unsigned IdxSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
    unsigned IdxBank = getRegBankID(MI.getOperand(2).getReg(), MRI);
    unsigned OutputBankID = regBankUnion(SrcBankID, IdxBank);

    OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(OutputBankID, DstSize);
    OpdsMapping[1] = AMDGPU::getValueMapping(SrcBankID, SrcSize);

    // The index can be in either bank if the source vector is a VGPR.
    OpdsMapping[2] = AMDGPU::getValueMapping(IdxBank, IdxSize);
    break;
  }
  case AMDGPU::G_INSERT_VECTOR_ELT: {
    unsigned OutputBankID = isSALUMapping(MI) ?
      AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;

    unsigned VecSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    unsigned InsertSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
    unsigned IdxSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits();
    unsigned InsertEltBankID = getRegBankID(MI.getOperand(2).getReg(), MRI);
    unsigned IdxBankID = getRegBankID(MI.getOperand(3).getReg(), MRI);

    OpdsMapping[0] = AMDGPU::getValueMapping(OutputBankID, VecSize);
    OpdsMapping[1] = AMDGPU::getValueMapping(OutputBankID, VecSize);

    // This is a weird case, because we need to break down the mapping based on
    // the register bank of a different operand.
    if (InsertSize == 64 && OutputBankID == AMDGPU::VGPRRegBankID) {
      OpdsMapping[2] = AMDGPU::getValueMappingSplit64(InsertEltBankID,
                                                      InsertSize);
    } else {
      assert(InsertSize == 32 || InsertSize == 64);
      OpdsMapping[2] = AMDGPU::getValueMapping(InsertEltBankID, InsertSize);
    }

    // The index can be in either bank if the source vector is a VGPR.
    OpdsMapping[3] = AMDGPU::getValueMapping(IdxBankID, IdxSize);
    break;
  }
3922   case AMDGPU::G_UNMERGE_VALUES: {
3923     unsigned Bank = getMappingType(MRI, MI);
3924 
3925     // Op1 and Dst should use the same register bank.
3926     // FIXME: Shouldn't this be the default? Why do we need to handle this?
3927     for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
3928       unsigned Size = getSizeInBits(MI.getOperand(i).getReg(), MRI, *TRI);
3929       OpdsMapping[i] = AMDGPU::getValueMapping(Bank, Size);
3930     }
3931     break;
3932   }
3933   case AMDGPU::G_AMDGPU_BUFFER_LOAD:
3934   case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
3935   case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE:
3936   case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
3937   case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT:
3938   case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT:
3939   case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16:
3940   case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT:
3941   case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16:
3942   case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT:
3943   case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16:
3944   case AMDGPU::G_AMDGPU_BUFFER_STORE:
3945   case AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE:
3946   case AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT:
3947   case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT:
3948   case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16: {
    OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);

    // rsrc
    OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);

    // vindex
    OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);

    // voffset
    OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);

    // soffset
    OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);

    // Any remaining operands are immediates and were correctly null
    // initialized.
    break;
  }
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX: {
    // vdata_out
    OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);

    // vdata_in
    OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);

    // rsrc
    OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);

    // vindex
    OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);

    // voffset
    OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);

    // soffset
    OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);

    // Any remaining operands are immediates and were correctly null
    // initialized.
    break;
  }
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP: {
    // vdata_out
    OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);

    // vdata_in
    OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);

    // cmp
    OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);

    // rsrc
    OpdsMapping[3] = getSGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);

    // vindex
    OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);

    // voffset
    OpdsMapping[5] = getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);

    // soffset
    OpdsMapping[6] = getSGPROpMapping(MI.getOperand(6).getReg(), MRI, *TRI);

    // Any remaining operands are immediates and were correctly null
    // initialized.
    break;
  }
  case AMDGPU::G_AMDGPU_S_BUFFER_LOAD: {
    // Lie and claim everything is legal, even though some need to be
    // SGPRs. applyMapping will have to deal with it as a waterfall loop.
    OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
    OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);

    // We need to convert this to a MUBUF if either the resource or the offset
    // is a VGPR.
    unsigned RSrcBank = OpdsMapping[1]->BreakDown[0].RegBank->getID();
    unsigned OffsetBank = OpdsMapping[2]->BreakDown[0].RegBank->getID();
    unsigned ResultBank = regBankUnion(RSrcBank, OffsetBank);
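    // regBankUnion only returns SGPR when both inputs are SGPRs, so the
    // result is only treated as uniform when both the resource and the offset
    // are uniform.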

    unsigned Size0 = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    OpdsMapping[0] = AMDGPU::getValueMapping(ResultBank, Size0);
    break;
  }
  case AMDGPU::G_INTRINSIC: {
    switch (MI.getIntrinsicID()) {
    default:
      return getInvalidInstructionMapping();
    case Intrinsic::amdgcn_div_fmas:
    case Intrinsic::amdgcn_div_fixup:
    case Intrinsic::amdgcn_trig_preop:
    case Intrinsic::amdgcn_sin:
    case Intrinsic::amdgcn_cos:
    case Intrinsic::amdgcn_log_clamp:
    case Intrinsic::amdgcn_rcp:
    case Intrinsic::amdgcn_rcp_legacy:
    case Intrinsic::amdgcn_sqrt:
    case Intrinsic::amdgcn_rsq:
    case Intrinsic::amdgcn_rsq_legacy:
    case Intrinsic::amdgcn_rsq_clamp:
    case Intrinsic::amdgcn_fmul_legacy:
    case Intrinsic::amdgcn_fma_legacy:
    case Intrinsic::amdgcn_ldexp:
    case Intrinsic::amdgcn_frexp_mant:
    case Intrinsic::amdgcn_frexp_exp:
    case Intrinsic::amdgcn_fract:
    case Intrinsic::amdgcn_cvt_pkrtz:
    case Intrinsic::amdgcn_cvt_pknorm_i16:
    case Intrinsic::amdgcn_cvt_pknorm_u16:
    case Intrinsic::amdgcn_cvt_pk_i16:
    case Intrinsic::amdgcn_cvt_pk_u16:
    case Intrinsic::amdgcn_fmed3:
    case Intrinsic::amdgcn_cubeid:
    case Intrinsic::amdgcn_cubema:
    case Intrinsic::amdgcn_cubesc:
    case Intrinsic::amdgcn_cubetc:
    case Intrinsic::amdgcn_sffbh:
    case Intrinsic::amdgcn_fmad_ftz:
    case Intrinsic::amdgcn_mbcnt_lo:
    case Intrinsic::amdgcn_mbcnt_hi:
    case Intrinsic::amdgcn_mul_u24:
    case Intrinsic::amdgcn_mul_i24:
    case Intrinsic::amdgcn_lerp:
    case Intrinsic::amdgcn_sad_u8:
    case Intrinsic::amdgcn_msad_u8:
    case Intrinsic::amdgcn_sad_hi_u8:
    case Intrinsic::amdgcn_sad_u16:
    case Intrinsic::amdgcn_qsad_pk_u16_u8:
    case Intrinsic::amdgcn_mqsad_pk_u16_u8:
    case Intrinsic::amdgcn_mqsad_u32_u8:
    case Intrinsic::amdgcn_cvt_pk_u8_f32:
    case Intrinsic::amdgcn_alignbit:
    case Intrinsic::amdgcn_alignbyte:
    case Intrinsic::amdgcn_perm:
    case Intrinsic::amdgcn_fdot2:
    case Intrinsic::amdgcn_sdot2:
    case Intrinsic::amdgcn_udot2:
    case Intrinsic::amdgcn_sdot4:
    case Intrinsic::amdgcn_udot4:
    case Intrinsic::amdgcn_sdot8:
    case Intrinsic::amdgcn_udot8:
      return getDefaultMappingVOP(MI);
    case Intrinsic::amdgcn_sbfe:
    case Intrinsic::amdgcn_ubfe:
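      // BFE has both scalar and vector forms, so stay on the SALU when every
      // operand is already uniform.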
      if (isSALUMapping(MI))
        return getDefaultMappingSOP(MI);
      return getDefaultMappingVOP(MI);
    case Intrinsic::amdgcn_ds_swizzle:
    case Intrinsic::amdgcn_ds_permute:
    case Intrinsic::amdgcn_ds_bpermute:
    case Intrinsic::amdgcn_update_dpp:
    case Intrinsic::amdgcn_mov_dpp8:
    case Intrinsic::amdgcn_mov_dpp:
    case Intrinsic::amdgcn_strict_wwm:
    case Intrinsic::amdgcn_wwm:
    case Intrinsic::amdgcn_strict_wqm:
    case Intrinsic::amdgcn_wqm:
    case Intrinsic::amdgcn_softwqm:
    case Intrinsic::amdgcn_set_inactive:
      return getDefaultMappingAllVGPR(MI);
    case Intrinsic::amdgcn_kernarg_segment_ptr:
    case Intrinsic::amdgcn_s_getpc:
    case Intrinsic::amdgcn_groupstaticsize:
    case Intrinsic::amdgcn_reloc_constant:
    case Intrinsic::returnaddress: {
      unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
      break;
    }
    case Intrinsic::amdgcn_wqm_vote: {
      unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      OpdsMapping[0] = OpdsMapping[2]
        = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size);
      break;
    }
    case Intrinsic::amdgcn_ps_live: {
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
      break;
    }
    case Intrinsic::amdgcn_div_scale: {
      unsigned Dst0Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      unsigned Dst1Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Dst0Size);
      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Dst1Size);

      unsigned SrcSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits();
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
      OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
      break;
    }
    case Intrinsic::amdgcn_class: {
      Register Src0Reg = MI.getOperand(2).getReg();
      Register Src1Reg = MI.getOperand(3).getReg();
      unsigned Src0Size = MRI.getType(Src0Reg).getSizeInBits();
      unsigned Src1Size = MRI.getType(Src1Reg).getSizeInBits();
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, DstSize);
      OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Src0Size);
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Src1Size);
      break;
    }
    case Intrinsic::amdgcn_icmp:
    case Intrinsic::amdgcn_fcmp: {
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      // This is not the VCC bank because the result is not used in a boolean
      // context.
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
      unsigned OpSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
      OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, OpSize);
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, OpSize);
      break;
    }
    case Intrinsic::amdgcn_readlane: {
      // This must be an SGPR, but accept a VGPR.
      Register IdxReg = MI.getOperand(3).getReg();
      unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits();
      unsigned IdxBank = getRegBankID(IdxReg, MRI, AMDGPU::SGPRRegBankID);
      OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize);
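      // The result and source value mappings are shared with readfirstlane
      // below.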
      LLVM_FALLTHROUGH;
    }
    case Intrinsic::amdgcn_readfirstlane: {
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      unsigned SrcSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
      OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
      break;
    }
    case Intrinsic::amdgcn_writelane: {
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      Register SrcReg = MI.getOperand(2).getReg();
      unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits();
      unsigned SrcBank = getRegBankID(SrcReg, MRI, AMDGPU::SGPRRegBankID);
      Register IdxReg = MI.getOperand(3).getReg();
      unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits();
      unsigned IdxBank = getRegBankID(IdxReg, MRI, AMDGPU::SGPRRegBankID);
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);

      // These two operands must be SGPRs, but accept VGPRs; a readfirstlane
      // will be inserted to legalize them.
      OpdsMapping[2] = AMDGPU::getValueMapping(SrcBank, SrcSize);
      OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize);
      OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
      break;
    }
    case Intrinsic::amdgcn_if_break: {
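      // The result and the loop mask (operand 3) are wave-size lane masks
      // held in SGPRs; the condition (operand 2) is a per-lane boolean in the
      // VCC bank.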
      unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
      OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
      break;
    }
    case Intrinsic::amdgcn_permlane16:
    case Intrinsic::amdgcn_permlanex16: {
      unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
      OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
      OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
      OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_mfma_f32_4x4x1f32:
    case Intrinsic::amdgcn_mfma_f32_4x4x4f16:
    case Intrinsic::amdgcn_mfma_i32_4x4x4i8:
    case Intrinsic::amdgcn_mfma_f32_4x4x2bf16:
    case Intrinsic::amdgcn_mfma_f32_16x16x1f32:
    case Intrinsic::amdgcn_mfma_f32_16x16x4f32:
    case Intrinsic::amdgcn_mfma_f32_16x16x4f16:
    case Intrinsic::amdgcn_mfma_f32_16x16x16f16:
    case Intrinsic::amdgcn_mfma_i32_16x16x4i8:
    case Intrinsic::amdgcn_mfma_i32_16x16x16i8:
    case Intrinsic::amdgcn_mfma_f32_16x16x2bf16:
    case Intrinsic::amdgcn_mfma_f32_16x16x8bf16:
    case Intrinsic::amdgcn_mfma_f32_32x32x1f32:
    case Intrinsic::amdgcn_mfma_f32_32x32x2f32:
    case Intrinsic::amdgcn_mfma_f32_32x32x4f16:
    case Intrinsic::amdgcn_mfma_f32_32x32x8f16:
    case Intrinsic::amdgcn_mfma_i32_32x32x4i8:
    case Intrinsic::amdgcn_mfma_i32_32x32x8i8:
    case Intrinsic::amdgcn_mfma_f32_32x32x2bf16:
    case Intrinsic::amdgcn_mfma_f32_32x32x4bf16:
    case Intrinsic::amdgcn_mfma_f32_32x32x4bf16_1k:
    case Intrinsic::amdgcn_mfma_f32_16x16x4bf16_1k:
    case Intrinsic::amdgcn_mfma_f32_4x4x4bf16_1k:
    case Intrinsic::amdgcn_mfma_f32_32x32x8bf16_1k:
    case Intrinsic::amdgcn_mfma_f32_16x16x16bf16_1k:
    case Intrinsic::amdgcn_mfma_f64_16x16x4f64:
    case Intrinsic::amdgcn_mfma_f64_4x4x4f64: {
      // Default for MAI intrinsics.
      // srcC can also be an immediate which can be folded later.
      // FIXME: Should we eventually add an alternative mapping with AGPR src
      // for srcA/srcB?
      //
      // vdst, srcA, srcB, srcC
      OpdsMapping[0] = getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
      OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
      OpdsMapping[4] = getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_interp_p1:
    case Intrinsic::amdgcn_interp_p2:
    case Intrinsic::amdgcn_interp_mov:
    case Intrinsic::amdgcn_interp_p1_f16:
    case Intrinsic::amdgcn_interp_p2_f16: {
      const int M0Idx = MI.getNumOperands() - 1;
      Register M0Reg = MI.getOperand(M0Idx).getReg();
      unsigned M0Bank = getRegBankID(M0Reg, MRI, AMDGPU::SGPRRegBankID);
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();

      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
      for (int I = 2; I != M0Idx && MI.getOperand(I).isReg(); ++I)
        OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);

      // This must be an SGPR, but accept whatever the original bank is and
      // fix it later.
      OpdsMapping[M0Idx] = AMDGPU::getValueMapping(M0Bank, 32);
      break;
    }
    case Intrinsic::amdgcn_ballot: {
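      // The ballot result is read back as an ordinary uniform wave mask; only
      // the condition input is a per-lane boolean.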
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      unsigned SrcSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
      OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, SrcSize);
      break;
    }
    }
    break;
  }
  case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
  case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE: {
    auto IntrID = MI.getIntrinsicID();
    const AMDGPU::RsrcIntrinsic *RSrcIntrin = AMDGPU::lookupRsrcIntrinsic(IntrID);
    assert(RSrcIntrin && "missing RsrcIntrinsic for image intrinsic");
    // Non-images can have complications from operands that allow both SGPR
    // and VGPR. For now it's too complicated to figure out the final opcode
    // to derive the register bank from the MCInstrDesc.
    assert(RSrcIntrin->IsImage);
    return getImageMapping(MRI, MI, RSrcIntrin->RsrcArg);
  }
  case AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY: {
    unsigned N = MI.getNumExplicitOperands() - 2;
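    // Operand N is the BVH resource descriptor, which must be uniform; the
    // use operands before it carry the ray data in VGPRs.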
    OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 128);
    OpdsMapping[N] = getSGPROpMapping(MI.getOperand(N).getReg(), MRI, *TRI);
    if (N == 3) {
      // Sequential form: all operands combined into VGPR256/VGPR512
      unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
      if (Size > 256)
        Size = 512;
      OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
    } else {
      // NSA form
      for (unsigned I = 2; I < N; ++I)
        OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
    }
    break;
  }
  case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: {
    auto IntrID = MI.getIntrinsicID();
    switch (IntrID) {
    case Intrinsic::amdgcn_s_getreg:
    case Intrinsic::amdgcn_s_memtime:
    case Intrinsic::amdgcn_s_memrealtime:
    case Intrinsic::amdgcn_s_get_waveid_in_workgroup: {
      unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
      break;
    }
    case Intrinsic::amdgcn_global_atomic_fadd:
    case Intrinsic::amdgcn_global_atomic_csub:
    case Intrinsic::amdgcn_global_atomic_fmin:
    case Intrinsic::amdgcn_global_atomic_fmax:
    case Intrinsic::amdgcn_flat_atomic_fadd:
    case Intrinsic::amdgcn_flat_atomic_fmin:
    case Intrinsic::amdgcn_flat_atomic_fmax:
      return getDefaultMappingAllVGPR(MI);
    case Intrinsic::amdgcn_ds_ordered_add:
    case Intrinsic::amdgcn_ds_ordered_swap: {
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
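      // This operand is copied to M0, so it must ultimately be an SGPR, but
      // accept whatever bank it currently has and fix it later.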
      unsigned M0Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
                                     AMDGPU::SGPRRegBankID);
      OpdsMapping[2] = AMDGPU::getValueMapping(M0Bank, 32);
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      break;
    }
    case Intrinsic::amdgcn_ds_append:
    case Intrinsic::amdgcn_ds_consume: {
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_exp_compr:
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      break;
    case Intrinsic::amdgcn_exp:
      // FIXME: Could we support packed types here?
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      OpdsMapping[5] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      OpdsMapping[6] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      break;
    case Intrinsic::amdgcn_s_sendmsg:
    case Intrinsic::amdgcn_s_sendmsghalt: {
      // This must be an SGPR, but accept a VGPR.
      unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
                                   AMDGPU::SGPRRegBankID);
      OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
      break;
    }
    case Intrinsic::amdgcn_s_setreg: {
      // This must be an SGPR, but accept a VGPR.
      unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
                                   AMDGPU::SGPRRegBankID);
      OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
      break;
    }
    case Intrinsic::amdgcn_end_cf: {
      unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
      break;
    }
    case Intrinsic::amdgcn_else: {
      unsigned WaveSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize);
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize);
      break;
    }
    case Intrinsic::amdgcn_live_mask: {
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
      break;
    }
    case Intrinsic::amdgcn_wqm_demote:
    case Intrinsic::amdgcn_kill: {
      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
      break;
    }
    case Intrinsic::amdgcn_raw_buffer_load:
    case Intrinsic::amdgcn_raw_tbuffer_load: {
      // FIXME: Should make intrinsic ID the last operand of the instruction,
      // then this would be the same as store
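      // The resource (operand 2) and scalar offset (operand 4) must be
      // uniform; divergent values are presumably rewritten with a waterfall
      // loop in applyMapping.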
      OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
      OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_raw_buffer_store:
    case Intrinsic::amdgcn_raw_buffer_store_format:
    case Intrinsic::amdgcn_raw_tbuffer_store: {
      OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
      OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_struct_buffer_load:
    case Intrinsic::amdgcn_struct_tbuffer_load: {
      OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
      OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
      OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_struct_buffer_store:
    case Intrinsic::amdgcn_struct_tbuffer_store: {
      OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
      OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
      OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_init_exec_from_input: {
      unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
      break;
    }
    case Intrinsic::amdgcn_ds_gws_init:
    case Intrinsic::amdgcn_ds_gws_barrier:
    case Intrinsic::amdgcn_ds_gws_sema_br: {
      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);

      // This must be an SGPR, but accept a VGPR.
      unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
                                   AMDGPU::SGPRRegBankID);
      OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
      break;
    }
    case Intrinsic::amdgcn_ds_gws_sema_v:
    case Intrinsic::amdgcn_ds_gws_sema_p:
    case Intrinsic::amdgcn_ds_gws_sema_release_all: {
      // This must be an SGPR, but accept a VGPR.
      unsigned Bank = getRegBankID(MI.getOperand(1).getReg(), MRI,
                                   AMDGPU::SGPRRegBankID);
      OpdsMapping[1] = AMDGPU::getValueMapping(Bank, 32);
      break;
    }
    default:
      return getInvalidInstructionMapping();
    }
    break;
  }
  case AMDGPU::G_SELECT: {
    unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
                                    AMDGPU::SGPRRegBankID);
    unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI,
                                    AMDGPU::SGPRRegBankID);
    bool SGPRSrcs = Op2Bank == AMDGPU::SGPRRegBankID &&
                    Op3Bank == AMDGPU::SGPRRegBankID;

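    // A select can only stay on the SALU when the two values and the
    // condition are all uniform; any divergent input pushes the whole select
    // to the VALU with a VCC condition.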
    unsigned CondBankDefault = SGPRSrcs ?
      AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID;
    unsigned CondBank = getRegBankID(MI.getOperand(1).getReg(), MRI,
                                     CondBankDefault);
    if (CondBank == AMDGPU::SGPRRegBankID)
      CondBank = SGPRSrcs ? AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID;
    else if (CondBank == AMDGPU::VGPRRegBankID)
      CondBank = AMDGPU::VCCRegBankID;

    unsigned Bank = SGPRSrcs && CondBank == AMDGPU::SGPRRegBankID ?
      AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;

    assert(CondBank == AMDGPU::VCCRegBankID ||
           CondBank == AMDGPU::SGPRRegBankID);

    // TODO: Should report 32-bit for scalar condition type.
    if (Size == 64) {
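      // A 64-bit VALU select is presumably lowered as two 32-bit selects, so
      // the VGPR side of this mapping is split into halves; only the SGPR
      // form keeps a whole 64-bit value.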
      OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
      OpdsMapping[1] = AMDGPU::getValueMapping(CondBank, 1);
      OpdsMapping[2] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
      OpdsMapping[3] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
    } else {
      OpdsMapping[0] = AMDGPU::getValueMapping(Bank, Size);
      OpdsMapping[1] = AMDGPU::getValueMapping(CondBank, 1);
      OpdsMapping[2] = AMDGPU::getValueMapping(Bank, Size);
      OpdsMapping[3] = AMDGPU::getValueMapping(Bank, Size);
    }

    break;
  }

  case AMDGPU::G_LOAD:
  case AMDGPU::G_ZEXTLOAD:
  case AMDGPU::G_SEXTLOAD:
    return getInstrMappingForLoad(MI);

  case AMDGPU::G_ATOMICRMW_XCHG:
  case AMDGPU::G_ATOMICRMW_ADD:
  case AMDGPU::G_ATOMICRMW_SUB:
  case AMDGPU::G_ATOMICRMW_AND:
  case AMDGPU::G_ATOMICRMW_OR:
  case AMDGPU::G_ATOMICRMW_XOR:
  case AMDGPU::G_ATOMICRMW_MAX:
  case AMDGPU::G_ATOMICRMW_MIN:
  case AMDGPU::G_ATOMICRMW_UMAX:
  case AMDGPU::G_ATOMICRMW_UMIN:
  case AMDGPU::G_ATOMICRMW_FADD:
  case AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG:
  case AMDGPU::G_AMDGPU_ATOMIC_INC:
  case AMDGPU::G_AMDGPU_ATOMIC_DEC:
  case AMDGPU::G_AMDGPU_ATOMIC_FMIN:
  case AMDGPU::G_AMDGPU_ATOMIC_FMAX: {
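    // Atomics only exist in VALU form, so the result and data operands are
    // always per-lane; the pointer mapping depends on what the address space
    // allows for a uniform base.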
    OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
    OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg());
    OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
    break;
  }
  case AMDGPU::G_ATOMIC_CMPXCHG: {
    OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
    OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg());
    OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
    OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
    break;
  }
  case AMDGPU::G_BRCOND: {
    unsigned Bank = getRegBankID(MI.getOperand(0).getReg(), MRI,
                                 AMDGPU::SGPRRegBankID);
    assert(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1);
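    // A uniform SGPR condition can be selected as a branch on SCC; anything
    // else is a divergent lane mask and must use the VCC bank.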
    if (Bank != AMDGPU::SGPRRegBankID)
      Bank = AMDGPU::VCCRegBankID;

    OpdsMapping[0] = AMDGPU::getValueMapping(Bank, 1);
    break;
  }
  }

  return getInstructionMapping(/*ID*/1, /*Cost*/1,
                               getOperandsMapping(OpdsMapping),
                               MI.getNumOperands());
}