1 //===- AMDGPURegisterBankInfo.cpp -------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements the targeting of the RegisterBankInfo class for
10 /// AMDGPU.
11 ///
12 /// \par
13 ///
14 /// AMDGPU has unique register bank constraints that require special high level
15 /// strategies to deal with. There are two main true physical register banks
16 /// VGPR (vector), and SGPR (scalar). Additionally the VCC register bank is a
17 /// sort of pseudo-register bank needed to represent SGPRs used in a vector
18 /// boolean context. There is also the AGPR bank, which is a special purpose
19 /// physical register bank present on some subtargets.
20 ///
21 /// Copying from VGPR to SGPR is generally illegal, unless the value is known to
22 /// be uniform. It is generally not valid to legalize operands by inserting
23 /// copies as on other targets. Operations which require uniform, SGPR operands
24 /// generally require scalarization by repeatedly executing the instruction,
25 /// activating each set of lanes using a unique set of input values. This is
26 /// referred to as a waterfall loop.
27 ///
28 /// \par Booleans
29 ///
30 /// Booleans (s1 values) require special consideration. A vector compare result
31 /// is naturally a bitmask with one bit per lane, in a 32 or 64-bit
32 /// register. These are represented with the VCC bank. During selection, we need
33 /// to be able to unambiguously go back from a register class to a register
34 /// bank. To distinguish whether an SGPR should use the SGPR or VCC register
35 /// bank, we need to know the use context type. An SGPR with an s1 type always
36 /// means a VCC bank value; other types use the SGPR bank. A scalar compare sets
37 /// SCC, which is a 1-bit unaddressable register. This will need to be copied to
38 /// a 32-bit virtual register. Taken together, this means we need to adjust the
39 /// type of boolean operations to be regbank legal. All SALU booleans need to be
40 /// widened to 32-bits, and all VALU booleans need to be s1 values.
41 ///
42 /// A noteworthy exception to the s1-means-vcc rule is for legalization artifact
43 /// casts. G_TRUNC s1 results, and G_SEXT/G_ZEXT/G_ANYEXT sources are never vcc
44 /// bank. A non-boolean source (such as a truncate from a 1-bit load from
45 /// memory) will require a copy to the VCC bank which will require clearing the
46 /// high bits and inserting a compare.
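///
/// As an illustrative sketch, a copy of such a non-boolean s1 value into the
/// VCC bank is typically selected as a V_AND_B32 with 1 (to clear the high
/// bits) followed by a V_CMP_NE_U32 against 0; the exact sequence is chosen
/// during instruction selection.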
47 ///
48 /// \par Constant bus restriction
49 ///
50 /// VALU instructions have a limitation known as the constant bus
51 /// restriction. Most VALU instructions can use SGPR operands, but may read at
52 /// most 1 SGPR or constant literal value (this increases to 2 in gfx10 for most
53 /// instructions). This is one unique SGPR, so the same SGPR may be used for
54 /// multiple operands. From a register bank perspective, any combination of
55 /// operands should be legal as an SGPR, but this is contextually dependent on
56 /// the SGPR operands all being the same register. It is therefore optimal to
57 /// choose the SGPR with the most uses to minimize the number of copies.
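///
/// As an illustrative example, a VOP3-encoded V_ADD_F32 may use the same SGPR
/// for both sources (a single unique constant bus read), but two different
/// SGPR sources would exceed the pre-gfx10 limit, and one of them would first
/// have to be copied to a VGPR.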
58 ///
59 /// We avoid trying to solve this problem in RegBankSelect. Any VALU G_*
60 /// operation should have its source operands all mapped to VGPRs (except for
61 /// VCC), inserting copies from any SGPR operands. This is the most trivial legal
62 /// mapping. Anything beyond the simplest 1:1 instruction selection would be too
63 /// complicated to solve here. Every optimization pattern or instruction
64 /// selected to multiple outputs would have to enforce this rule, and there
65 /// would be additional complexity in tracking this rule for every G_*
66 /// operation. By forcing all inputs to VGPRs, it also simplifies the task of
67 /// picking the optimal operand combination from a post-isel optimization pass.
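///
/// As a sketch of this strategy, a divergent G_ADD with two SGPR inputs is
/// mapped with all operands on the VGPR bank, so RegBankSelect inserts copies
/// of both inputs into new VGPR registers; a later pass may fold one unique
/// SGPR operand back into the VALU instruction where profitable.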
68 ///
69 //===----------------------------------------------------------------------===//
70 
71 #include "AMDGPURegisterBankInfo.h"
72 
73 #include "AMDGPU.h"
74 #include "AMDGPUGlobalISelUtils.h"
75 #include "AMDGPUInstrInfo.h"
76 #include "GCNSubtarget.h"
77 #include "SIMachineFunctionInfo.h"
78 #include "SIRegisterInfo.h"
79 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
80 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
81 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
82 #include "llvm/CodeGen/GlobalISel/RegisterBank.h"
83 #include "llvm/IR/IntrinsicsAMDGPU.h"
84 
85 #define GET_TARGET_REGBANK_IMPL
86 #include "AMDGPUGenRegisterBank.inc"
87 
88 // This file will be TableGen'ed at some point.
89 #include "AMDGPUGenRegisterBankInfo.def"
90 
91 using namespace llvm;
92 using namespace MIPatternMatch;
93 
94 namespace {
95 
96 // Observer to apply a register bank to new registers created by LegalizerHelper.
97 class ApplyRegBankMapping final : public GISelChangeObserver {
98 private:
99   const AMDGPURegisterBankInfo &RBI;
100   MachineRegisterInfo &MRI;
101   const RegisterBank *NewBank;
102   SmallVector<MachineInstr *, 4> NewInsts;
103 
104 public:
105   ApplyRegBankMapping(const AMDGPURegisterBankInfo &RBI_,
106                       MachineRegisterInfo &MRI_, const RegisterBank *RB)
107     : RBI(RBI_), MRI(MRI_), NewBank(RB) {}
108 
109   ~ApplyRegBankMapping() {
110     for (MachineInstr *MI : NewInsts)
111       applyBank(*MI);
112   }
113 
114   /// Set the new bank on any registers that don't have a register class or bank.
115   void applyBank(MachineInstr &MI) {
116     const unsigned Opc = MI.getOpcode();
117     if (Opc == AMDGPU::G_ANYEXT || Opc == AMDGPU::G_ZEXT ||
118         Opc == AMDGPU::G_SEXT) {
119       // LegalizerHelper wants to use the basic legalization artifacts when
120       // widening etc. We don't handle selection with vcc in artifact sources,
121       // so we need to use a select instead to handle these properly.
122       Register DstReg = MI.getOperand(0).getReg();
123       Register SrcReg = MI.getOperand(1).getReg();
124       const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, MRI, *RBI.TRI);
125       if (SrcBank == &AMDGPU::VCCRegBank) {
126         const LLT S32 = LLT::scalar(32);
127         assert(MRI.getType(SrcReg) == LLT::scalar(1));
128         assert(MRI.getType(DstReg) == S32);
129         assert(NewBank == &AMDGPU::VGPRRegBank);
130 
131         // Replace the extension with a select, which really uses the boolean
132         // source.
133         MachineIRBuilder B(MI);
134         auto True = B.buildConstant(S32, Opc == AMDGPU::G_SEXT ? -1 : 1);
135         auto False = B.buildConstant(S32, 0);
136         B.buildSelect(DstReg, SrcReg, True, False);
137         MRI.setRegBank(True.getReg(0), *NewBank);
138         MRI.setRegBank(False.getReg(0), *NewBank);
139         MI.eraseFromParent();
140       }
141 
142       assert(!MRI.getRegClassOrRegBank(DstReg));
143       MRI.setRegBank(DstReg, *NewBank);
144       return;
145     }
146 
147 #ifndef NDEBUG
148     if (Opc == AMDGPU::G_TRUNC) {
149       Register DstReg = MI.getOperand(0).getReg();
150       const RegisterBank *DstBank = RBI.getRegBank(DstReg, MRI, *RBI.TRI);
151       assert(DstBank != &AMDGPU::VCCRegBank);
152     }
153 #endif
154 
155     for (MachineOperand &Op : MI.operands()) {
156       if (!Op.isReg())
157         continue;
158 
159       // We may see physical registers if building a real MI
160       Register Reg = Op.getReg();
161       if (Reg.isPhysical() || MRI.getRegClassOrRegBank(Reg))
162         continue;
163 
164       const RegisterBank *RB = NewBank;
165       if (MRI.getType(Reg) == LLT::scalar(1)) {
166         assert(NewBank == &AMDGPU::VGPRRegBank &&
167                "s1 operands should only be used for vector bools");
168         assert((MI.getOpcode() != AMDGPU::G_TRUNC &&
169                 MI.getOpcode() != AMDGPU::G_ANYEXT) &&
170                "not expecting legalization artifacts here");
171         RB = &AMDGPU::VCCRegBank;
172       }
173 
174       MRI.setRegBank(Reg, *RB);
175     }
176   }
177 
178   void erasingInstr(MachineInstr &MI) override {}
179 
180   void createdInstr(MachineInstr &MI) override {
181     // At this point, the instruction was just inserted and has no operands.
182     NewInsts.push_back(&MI);
183   }
184 
185   void changingInstr(MachineInstr &MI) override {}
186   void changedInstr(MachineInstr &MI) override {
187     // FIXME: In principle we should probably add the instruction to NewInsts,
188     // but the way the LegalizerHelper uses the observer, we will always see the
189     // registers we need to set the regbank on also referenced in a new
190     // instruction.
191   }
192 };
193 
194 }
195 AMDGPURegisterBankInfo::AMDGPURegisterBankInfo(const GCNSubtarget &ST)
196     : AMDGPUGenRegisterBankInfo(),
197       Subtarget(ST),
198       TRI(Subtarget.getRegisterInfo()),
199       TII(Subtarget.getInstrInfo()) {
200 
201   // HACK: Until this is fully tablegen'd.
202   static llvm::once_flag InitializeRegisterBankFlag;
203 
204   static auto InitializeRegisterBankOnce = [this]() {
205     assert(&getRegBank(AMDGPU::SGPRRegBankID) == &AMDGPU::SGPRRegBank &&
206            &getRegBank(AMDGPU::VGPRRegBankID) == &AMDGPU::VGPRRegBank &&
207            &getRegBank(AMDGPU::AGPRRegBankID) == &AMDGPU::AGPRRegBank);
208     (void)this;
209   };
210 
211   llvm::call_once(InitializeRegisterBankFlag, InitializeRegisterBankOnce);
212 }
213 
214 static bool isVectorRegisterBank(const RegisterBank &Bank) {
215   unsigned BankID = Bank.getID();
216   return BankID == AMDGPU::VGPRRegBankID || BankID == AMDGPU::AGPRRegBankID;
217 }
218 
219 unsigned AMDGPURegisterBankInfo::copyCost(const RegisterBank &Dst,
220                                           const RegisterBank &Src,
221                                           unsigned Size) const {
222   // TODO: Should there be a UniformVGPRRegBank which can use readfirstlane?
223   if (Dst.getID() == AMDGPU::SGPRRegBankID &&
224       (isVectorRegisterBank(Src) || Src.getID() == AMDGPU::VCCRegBankID)) {
225     return std::numeric_limits<unsigned>::max();
226   }
227 
228   // Bool values are tricky, because the meaning is based on context. The SCC
229   // and VCC banks are for the natural scalar and vector conditions produced by
230   // a compare.
231   //
232   // Legalization doesn't know about the necessary context, so an s1 use may
233   // have been a truncate from an arbitrary value, in which case a copy (lowered
234   // as a compare with 0) needs to be inserted.
235   if (Size == 1 &&
236       (Dst.getID() == AMDGPU::SGPRRegBankID) &&
237       (isVectorRegisterBank(Src) ||
238        Src.getID() == AMDGPU::SGPRRegBankID ||
239        Src.getID() == AMDGPU::VCCRegBankID))
240     return std::numeric_limits<unsigned>::max();
241 
242   // There is no direct copy between AGPRs.
243   if (Dst.getID() == AMDGPU::AGPRRegBankID &&
244       Src.getID() == AMDGPU::AGPRRegBankID)
245     return 4;
246 
247   return RegisterBankInfo::copyCost(Dst, Src, Size);
248 }
249 
250 unsigned AMDGPURegisterBankInfo::getBreakDownCost(
251   const ValueMapping &ValMapping,
252   const RegisterBank *CurBank) const {
253   // Check if this is a breakdown for G_LOAD to move the pointer from SGPR to
254   // VGPR.
255   // FIXME: Is there a better way to do this?
256   if (ValMapping.NumBreakDowns >= 2 || ValMapping.BreakDown[0].Length >= 64)
257     return 10; // This is expensive.
258 
259   assert(ValMapping.NumBreakDowns == 2 &&
260          ValMapping.BreakDown[0].Length == 32 &&
261          ValMapping.BreakDown[0].StartIdx == 0 &&
262          ValMapping.BreakDown[1].Length == 32 &&
263          ValMapping.BreakDown[1].StartIdx == 32 &&
264          ValMapping.BreakDown[0].RegBank == ValMapping.BreakDown[1].RegBank);
265 
266   // 32-bit extract of a 64-bit value is just access of a subregister, so free.
267   // TODO: Cost of 0 hits assert, though it's not clear it's what we really
268   // want.
269 
270   // TODO: 32-bit insert to a 64-bit SGPR may incur a non-free copy due to SGPR
271   // alignment restrictions, but this probably isn't important.
272   return 1;
273 }
274 
275 const RegisterBank &
276 AMDGPURegisterBankInfo::getRegBankFromRegClass(const TargetRegisterClass &RC,
277                                                LLT Ty) const {
278   if (&RC == &AMDGPU::SReg_1RegClass)
279     return AMDGPU::VCCRegBank;
280 
281   // We promote real scalar booleans to SReg_32. Any SGPR using s1 is really a
282   // VCC-like use.
283   if (TRI->isSGPRClass(&RC)) {
284     // FIXME: This probably came from a copy from a physical register, which
285     // should be inferable from the copied-to type. We don't have many boolean
286     // physical register constraints so just assume a normal SGPR for now.
287     if (!Ty.isValid())
288       return AMDGPU::SGPRRegBank;
289 
290     return Ty == LLT::scalar(1) ? AMDGPU::VCCRegBank : AMDGPU::SGPRRegBank;
291   }
292 
293   return TRI->isAGPRClass(&RC) ? AMDGPU::AGPRRegBank : AMDGPU::VGPRRegBank;
294 }
295 
296 template <unsigned NumOps>
297 RegisterBankInfo::InstructionMappings
298 AMDGPURegisterBankInfo::addMappingFromTable(
299     const MachineInstr &MI, const MachineRegisterInfo &MRI,
300     const std::array<unsigned, NumOps> RegSrcOpIdx,
301     ArrayRef<OpRegBankEntry<NumOps>> Table) const {
302 
303   InstructionMappings AltMappings;
304 
305   SmallVector<const ValueMapping *, 10> Operands(MI.getNumOperands());
306 
307   unsigned Sizes[NumOps];
308   for (unsigned I = 0; I < NumOps; ++I) {
309     Register Reg = MI.getOperand(RegSrcOpIdx[I]).getReg();
310     Sizes[I] = getSizeInBits(Reg, MRI, *TRI);
311   }
312 
313   for (unsigned I = 0, E = MI.getNumExplicitDefs(); I != E; ++I) {
314     unsigned SizeI = getSizeInBits(MI.getOperand(I).getReg(), MRI, *TRI);
315     Operands[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SizeI);
316   }
317 
318   // getInstrMapping's default mapping uses ID 1, so start at 2.
319   unsigned MappingID = 2;
320   for (const auto &Entry : Table) {
321     for (unsigned I = 0; I < NumOps; ++I) {
322       int OpIdx = RegSrcOpIdx[I];
323       Operands[OpIdx] = AMDGPU::getValueMapping(Entry.RegBanks[I], Sizes[I]);
324     }
325 
326     AltMappings.push_back(&getInstructionMapping(MappingID++, Entry.Cost,
327                                                  getOperandsMapping(Operands),
328                                                  Operands.size()));
329   }
330 
331   return AltMappings;
332 }
333 
334 RegisterBankInfo::InstructionMappings
335 AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsic(
336     const MachineInstr &MI, const MachineRegisterInfo &MRI) const {
337   switch (MI.getIntrinsicID()) {
338   case Intrinsic::amdgcn_readlane: {
339     static const OpRegBankEntry<3> Table[2] = {
340       // Perfectly legal.
341       { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },
342 
343       // Need a readfirstlane for the index.
344       { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 }
345     };
346 
347     const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 2, 3 } };
348     return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
349   }
350   case Intrinsic::amdgcn_writelane: {
351     static const OpRegBankEntry<4> Table[4] = {
352       // Perfectly legal.
353       { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },
354 
355       // Need readfirstlane of first op
356       { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 },
357 
358       // Need readfirstlane of second op
359       { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 },
360 
361       // Need readfirstlane of both ops
362       { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 3 }
363     };
364 
365     // dst, value, lane select, old value
366     const std::array<unsigned, 4> RegSrcOpIdx = { { 0, 2, 3, 4 } };
367     return addMappingFromTable<4>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
368   }
369   default:
370     return RegisterBankInfo::getInstrAlternativeMappings(MI);
371   }
372 }
373 
374 RegisterBankInfo::InstructionMappings
375 AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsicWSideEffects(
376     const MachineInstr &MI, const MachineRegisterInfo &MRI) const {
377 
378   switch (MI.getIntrinsicID()) {
379   case Intrinsic::amdgcn_s_buffer_load: {
380     static const OpRegBankEntry<2> Table[4] = {
381       // Perfectly legal.
382       { { AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },
383 
384       // Only need 1 register in loop
385       { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 300 },
386 
387       // Have to waterfall the resource.
388       { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1000 },
389 
390       // Have to waterfall the resource, and the offset.
391       { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1500 }
392     };
393 
394     // rsrc, offset
395     const std::array<unsigned, 2> RegSrcOpIdx = { { 2, 3 } };
396     return addMappingFromTable<2>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
397   }
398   case Intrinsic::amdgcn_ds_ordered_add:
399   case Intrinsic::amdgcn_ds_ordered_swap: {
400     // VGPR = M0, VGPR
401     static const OpRegBankEntry<3> Table[2] = {
402       // Perfectly legal.
403       { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID  }, 1 },
404 
405       // Need a readfirstlane for m0
406       { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 }
407     };
408 
409     const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 2, 3 } };
410     return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
411   }
412   case Intrinsic::amdgcn_s_sendmsg:
413   case Intrinsic::amdgcn_s_sendmsghalt: {
414     // FIXME: Should have no register for immediate
415     static const OpRegBankEntry<1> Table[2] = {
416       // Perfectly legal.
417       { { AMDGPU::SGPRRegBankID }, 1 },
418 
419       // Need readlane
420       { { AMDGPU::VGPRRegBankID }, 3 }
421     };
422 
423     const std::array<unsigned, 1> RegSrcOpIdx = { { 2 } };
424     return addMappingFromTable<1>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
425   }
426   default:
427     return RegisterBankInfo::getInstrAlternativeMappings(MI);
428   }
429 }
430 
431 static bool memOpHasNoClobbered(const MachineMemOperand *MMO) {
432   const Instruction *I = dyn_cast_or_null<Instruction>(MMO->getValue());
433   return I && I->getMetadata("amdgpu.noclobber");
434 }
435 
436 // FIXME: Returns uniform if there's no source value information. This is
437 // probably wrong.
438 static bool isScalarLoadLegal(const MachineInstr &MI) {
439   if (!MI.hasOneMemOperand())
440     return false;
441 
442   const MachineMemOperand *MMO = *MI.memoperands_begin();
443   const unsigned AS = MMO->getAddrSpace();
444   const bool IsConst = AS == AMDGPUAS::CONSTANT_ADDRESS ||
445                        AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT;
446   // Require 4-byte alignment.
447   return MMO->getAlign() >= Align(4) &&
448          // Can't do a scalar atomic load.
449          !MMO->isAtomic() &&
450          // Don't use scalar loads for volatile accesses to non-constant address
451          // spaces.
452          (IsConst || !MMO->isVolatile()) &&
453          // Memory must be known constant, or not written before this load.
454          (IsConst || MMO->isInvariant() || memOpHasNoClobbered(MMO)) &&
455          AMDGPUInstrInfo::isUniformMMO(MMO);
456 }
457 
458 RegisterBankInfo::InstructionMappings
459 AMDGPURegisterBankInfo::getInstrAlternativeMappings(
460     const MachineInstr &MI) const {
461 
462   const MachineFunction &MF = *MI.getParent()->getParent();
463   const MachineRegisterInfo &MRI = MF.getRegInfo();
464 
465 
466   InstructionMappings AltMappings;
467   switch (MI.getOpcode()) {
468   case TargetOpcode::G_CONSTANT: {
469     unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
470     if (Size == 1) {
471       static const OpRegBankEntry<1> Table[3] = {
472         { { AMDGPU::VGPRRegBankID }, 1 },
473         { { AMDGPU::SGPRRegBankID }, 1 },
474         { { AMDGPU::VCCRegBankID }, 1 }
475       };
476 
477       return addMappingFromTable<1>(MI, MRI, {{ 0 }}, Table);
478     }
479 
480     LLVM_FALLTHROUGH;
481   }
482   case TargetOpcode::G_FCONSTANT:
483   case TargetOpcode::G_FRAME_INDEX:
484   case TargetOpcode::G_GLOBAL_VALUE: {
485     static const OpRegBankEntry<1> Table[2] = {
486       { { AMDGPU::VGPRRegBankID }, 1 },
487       { { AMDGPU::SGPRRegBankID }, 1 }
488     };
489 
490     return addMappingFromTable<1>(MI, MRI, {{ 0 }}, Table);
491   }
492   case TargetOpcode::G_AND:
493   case TargetOpcode::G_OR:
494   case TargetOpcode::G_XOR: {
495     unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
496 
497     if (Size == 1) {
498       // s_{and|or|xor}_b32 set scc when the result of the 32-bit op is not 0.
499       const InstructionMapping &SCCMapping = getInstructionMapping(
500         1, 1, getOperandsMapping(
501           {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32),
502            AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32),
503            AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32)}),
504         3); // Num Operands
505       AltMappings.push_back(&SCCMapping);
506 
507       const InstructionMapping &VCCMapping0 = getInstructionMapping(
508         2, 1, getOperandsMapping(
509           {AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size),
510            AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size),
511            AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size)}),
512         3); // Num Operands
513       AltMappings.push_back(&VCCMapping0);
514       return AltMappings;
515     }
516 
517     if (Size != 64)
518       break;
519 
520     const InstructionMapping &SSMapping = getInstructionMapping(
521       1, 1, getOperandsMapping(
522         {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
523          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
524          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
525       3); // Num Operands
526     AltMappings.push_back(&SSMapping);
527 
528     const InstructionMapping &VVMapping = getInstructionMapping(
529       2, 2, getOperandsMapping(
530         {AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
531          AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
532          AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}),
533       3); // Num Operands
534     AltMappings.push_back(&VVMapping);
535     break;
536   }
537   case TargetOpcode::G_LOAD:
538   case TargetOpcode::G_ZEXTLOAD:
539   case TargetOpcode::G_SEXTLOAD: {
540     unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
541     LLT PtrTy = MRI.getType(MI.getOperand(1).getReg());
542     unsigned PtrSize = PtrTy.getSizeInBits();
543     unsigned AS = PtrTy.getAddressSpace();
544 
545     if ((AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::REGION_ADDRESS &&
546          AS != AMDGPUAS::PRIVATE_ADDRESS) &&
547         isScalarLoadLegal(MI)) {
548       const InstructionMapping &SSMapping = getInstructionMapping(
549           1, 1, getOperandsMapping(
550                     {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
551                      AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize)}),
552           2); // Num Operands
553       AltMappings.push_back(&SSMapping);
554     }
555 
556     const InstructionMapping &VVMapping = getInstructionMapping(
557         2, 1,
558         getOperandsMapping(
559             {AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
560              AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize)}),
561         2); // Num Operands
562     AltMappings.push_back(&VVMapping);
563 
564     // It may be possible to have a vgpr = load sgpr mapping here, because
565     // the mubuf instructions support this kind of load, but probably for only
566     // gfx7 and older.  However, the addressing mode matching in the instruction
567     // selector should be able to do a better job of detecting and selecting
568     // these kinds of loads from the vgpr = load vgpr mapping.
569 
570     return AltMappings;
571 
572   }
573   case TargetOpcode::G_SELECT: {
574     unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
575     const InstructionMapping &SSMapping = getInstructionMapping(1, 1,
576       getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
577                           AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1),
578                           AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
579                           AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
580       4); // Num Operands
581     AltMappings.push_back(&SSMapping);
582 
583     const InstructionMapping &VVMapping = getInstructionMapping(2, 1,
584       getOperandsMapping({AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
585                           AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
586                           AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
587                           AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}),
588       4); // Num Operands
589     AltMappings.push_back(&VVMapping);
590 
591     return AltMappings;
592   }
593   case TargetOpcode::G_UADDE:
594   case TargetOpcode::G_USUBE:
595   case TargetOpcode::G_SADDE:
596   case TargetOpcode::G_SSUBE: {
597     unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
598     const InstructionMapping &SSMapping = getInstructionMapping(1, 1,
599       getOperandsMapping(
600         {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
601          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1),
602          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
603          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
604          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1)}),
605       5); // Num Operands
606     AltMappings.push_back(&SSMapping);
607 
608     const InstructionMapping &VVMapping = getInstructionMapping(2, 1,
609       getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
610                           AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
611                           AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
612                           AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
613                           AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1)}),
614       5); // Num Operands
615     AltMappings.push_back(&VVMapping);
616     return AltMappings;
617   }
618   case AMDGPU::G_BRCOND: {
619     assert(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1);
620 
621     // TODO: Change type to 32 for scalar
622     const InstructionMapping &SMapping = getInstructionMapping(
623       1, 1, getOperandsMapping(
624         {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1), nullptr}),
625       2); // Num Operands
626     AltMappings.push_back(&SMapping);
627 
628     const InstructionMapping &VMapping = getInstructionMapping(
629       1, 1, getOperandsMapping(
630         {AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1), nullptr }),
631       2); // Num Operands
632     AltMappings.push_back(&VMapping);
633     return AltMappings;
634   }
635   case AMDGPU::G_INTRINSIC:
636     return getInstrAlternativeMappingsIntrinsic(MI, MRI);
637   case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS:
638     return getInstrAlternativeMappingsIntrinsicWSideEffects(MI, MRI);
639   default:
640     break;
641   }
642   return RegisterBankInfo::getInstrAlternativeMappings(MI);
643 }
644 
645 void AMDGPURegisterBankInfo::split64BitValueForMapping(
646   MachineIRBuilder &B,
647   SmallVector<Register, 2> &Regs,
648   LLT HalfTy,
649   Register Reg) const {
650   assert(HalfTy.getSizeInBits() == 32);
651   MachineRegisterInfo *MRI = B.getMRI();
652   Register LoLHS = MRI->createGenericVirtualRegister(HalfTy);
653   Register HiLHS = MRI->createGenericVirtualRegister(HalfTy);
654   const RegisterBank *Bank = getRegBank(Reg, *MRI, *TRI);
655   MRI->setRegBank(LoLHS, *Bank);
656   MRI->setRegBank(HiLHS, *Bank);
657 
658   Regs.push_back(LoLHS);
659   Regs.push_back(HiLHS);
660 
661   B.buildInstr(AMDGPU::G_UNMERGE_VALUES)
662     .addDef(LoLHS)
663     .addDef(HiLHS)
664     .addUse(Reg);
665 }
666 
667 /// Replace the current type each register in \p Regs has with \p NewTy
668 static void setRegsToType(MachineRegisterInfo &MRI, ArrayRef<Register> Regs,
669                           LLT NewTy) {
670   for (Register Reg : Regs) {
671     assert(MRI.getType(Reg).getSizeInBits() == NewTy.getSizeInBits());
672     MRI.setType(Reg, NewTy);
673   }
674 }
675 
676 static LLT getHalfSizedType(LLT Ty) {
677   if (Ty.isVector()) {
678     assert(Ty.getNumElements() % 2 == 0);
679     return LLT::scalarOrVector(Ty.getNumElements() / 2, Ty.getElementType());
680   }
681 
682   assert(Ty.getSizeInBits() % 2 == 0);
683   return LLT::scalar(Ty.getSizeInBits() / 2);
684 }
685 
686 /// Legalize instruction \p MI where operands in \p OpIndices must be SGPRs. If
687 /// any of the required SGPR operands are VGPRs, perform a waterfall loop to
688 /// execute the instruction for each unique combination of values in all lanes
689 /// in the wave. The block will be split such that rest of the instructions are
690 /// moved to a new block.
691 ///
692 /// Essentially performs this loop:
693 ///
694 /// Save Execution Mask
695 /// For (Lane : Wavefront) {
696 ///   Enable Lane, Disable all other lanes
697 ///   SGPR = read SGPR value for current lane from VGPR
698 ///   VGPRResult[Lane] = use_op SGPR
699 /// }
700 /// Restore Execution Mask
701 ///
702 /// There is additional complexity in using compares to identify the unique
703 /// values used.
704 bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
705   MachineIRBuilder &B,
706   iterator_range<MachineBasicBlock::iterator> Range,
707   SmallSet<Register, 4> &SGPROperandRegs,
708   MachineRegisterInfo &MRI) const {
709   SmallVector<Register, 4> ResultRegs;
710   SmallVector<Register, 4> InitResultRegs;
711   SmallVector<Register, 4> PhiRegs;
712 
713   // Track use registers which have already been expanded with a readfirstlane
714   // sequence. This may have multiple uses if moving a sequence.
715   DenseMap<Register, Register> WaterfalledRegMap;
716 
717   MachineBasicBlock &MBB = B.getMBB();
718   MachineFunction *MF = &B.getMF();
719 
720   const TargetRegisterClass *WaveRC = TRI->getWaveMaskRegClass();
721   const unsigned WaveAndOpc = Subtarget.isWave32() ?
722     AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
723   const unsigned MovTermOpc = Subtarget.isWave32() ?
724     AMDGPU::S_MOV_B32_term : AMDGPU::S_MOV_B64_term;
725   const unsigned XorTermOpc = Subtarget.isWave32() ?
726     AMDGPU::S_XOR_B32_term : AMDGPU::S_XOR_B64_term;
727   const unsigned AndSaveExecOpc =  Subtarget.isWave32() ?
728     AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64;
729   const unsigned ExecReg =  Subtarget.isWave32() ?
730     AMDGPU::EXEC_LO : AMDGPU::EXEC;
731 
732 #ifndef NDEBUG
733   const int OrigRangeSize = std::distance(Range.begin(), Range.end());
734 #endif
735 
736   for (MachineInstr &MI : Range) {
737     for (MachineOperand &Def : MI.defs()) {
738       if (MRI.use_nodbg_empty(Def.getReg()))
739         continue;
740 
741       LLT ResTy = MRI.getType(Def.getReg());
742       const RegisterBank *DefBank = getRegBank(Def.getReg(), MRI, *TRI);
743       ResultRegs.push_back(Def.getReg());
744       Register InitReg = B.buildUndef(ResTy).getReg(0);
745       Register PhiReg = MRI.createGenericVirtualRegister(ResTy);
746       InitResultRegs.push_back(InitReg);
747       PhiRegs.push_back(PhiReg);
748       MRI.setRegBank(PhiReg, *DefBank);
749       MRI.setRegBank(InitReg, *DefBank);
750     }
751   }
752 
753   Register SaveExecReg = MRI.createVirtualRegister(WaveRC);
754   Register InitSaveExecReg = MRI.createVirtualRegister(WaveRC);
755 
756   // Don't bother using generic instructions/registers for the exec mask.
757   B.buildInstr(TargetOpcode::IMPLICIT_DEF)
758     .addDef(InitSaveExecReg);
759 
760   Register PhiExec = MRI.createVirtualRegister(WaveRC);
761   Register NewExec = MRI.createVirtualRegister(WaveRC);
762 
763   // To insert the loop we need to split the block. Move everything before this
764   // point to a new block, and insert a new empty block before this instruction.
765   MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
766   MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
767   MachineBasicBlock *RestoreExecBB = MF->CreateMachineBasicBlock();
768   MachineFunction::iterator MBBI(MBB);
769   ++MBBI;
770   MF->insert(MBBI, LoopBB);
771   MF->insert(MBBI, RestoreExecBB);
772   MF->insert(MBBI, RemainderBB);
773 
774   LoopBB->addSuccessor(RestoreExecBB);
775   LoopBB->addSuccessor(LoopBB);
776 
777   // Move the rest of the block into a new block.
778   RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
779   RemainderBB->splice(RemainderBB->begin(), &MBB, Range.end(), MBB.end());
780 
781   MBB.addSuccessor(LoopBB);
782   RestoreExecBB->addSuccessor(RemainderBB);
783 
784   B.setInsertPt(*LoopBB, LoopBB->end());
785 
786   B.buildInstr(TargetOpcode::PHI)
787     .addDef(PhiExec)
788     .addReg(InitSaveExecReg)
789     .addMBB(&MBB)
790     .addReg(NewExec)
791     .addMBB(LoopBB);
792 
793   for (auto Result : zip(InitResultRegs, ResultRegs, PhiRegs)) {
794     B.buildInstr(TargetOpcode::G_PHI)
795       .addDef(std::get<2>(Result))
796       .addReg(std::get<0>(Result)) // Initial value / implicit_def
797       .addMBB(&MBB)
798       .addReg(std::get<1>(Result)) // Mid-loop value.
799       .addMBB(LoopBB);
800   }
801 
802   const DebugLoc &DL = B.getDL();
803 
804   MachineInstr &FirstInst = *Range.begin();
805 
806   // Move the instruction into the loop. Note we moved everything after
807   // Range.end() already into a new block, so Range.end() is no longer valid.
808   LoopBB->splice(LoopBB->end(), &MBB, Range.begin(), MBB.end());
809 
810   // Figure out the iterator range after splicing the instructions.
811   MachineBasicBlock::iterator NewBegin = FirstInst.getIterator();
812   auto NewEnd = LoopBB->end();
813 
814   MachineBasicBlock::iterator I = Range.begin();
815   B.setInsertPt(*LoopBB, I);
816 
817   Register CondReg;
818 
819   assert(std::distance(NewBegin, NewEnd) == OrigRangeSize);
820 
821   for (MachineInstr &MI : make_range(NewBegin, NewEnd)) {
822     for (MachineOperand &Op : MI.uses()) {
823       if (!Op.isReg() || Op.isDef())
824         continue;
825 
826       Register OldReg = Op.getReg();
827       if (!SGPROperandRegs.count(OldReg))
828         continue;
829 
830       // See if we already processed this register in another instruction in the
831       // sequence.
832       auto OldVal = WaterfalledRegMap.find(OldReg);
833       if (OldVal != WaterfalledRegMap.end()) {
834         Op.setReg(OldVal->second);
835         continue;
836       }
837 
838       Register OpReg = Op.getReg();
839       LLT OpTy = MRI.getType(OpReg);
840 
841       const RegisterBank *OpBank = getRegBank(OpReg, MRI, *TRI);
842       if (OpBank != &AMDGPU::VGPRRegBank) {
843         // Insert copy from AGPR to VGPR before the loop.
844         B.setMBB(MBB);
845         OpReg = B.buildCopy(OpTy, OpReg).getReg(0);
846         MRI.setRegBank(OpReg, AMDGPU::VGPRRegBank);
847         B.setInstr(*I);
848       }
849 
850       unsigned OpSize = OpTy.getSizeInBits();
851 
852       // Can only do a readlane of 32-bit pieces.
853       if (OpSize == 32) {
854         // Avoid extra copies in the simple case of one 32-bit register.
855         Register CurrentLaneOpReg
856           = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
857         MRI.setType(CurrentLaneOpReg, OpTy);
858 
859         constrainGenericRegister(OpReg, AMDGPU::VGPR_32RegClass, MRI);
860         // Read the next variant <- also loop target.
861         BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
862                 CurrentLaneOpReg)
863           .addReg(OpReg);
864 
865         Register NewCondReg = MRI.createVirtualRegister(WaveRC);
866         bool First = CondReg == AMDGPU::NoRegister;
867         if (First)
868           CondReg = NewCondReg;
869 
870         // Compare the just read M0 value to all possible Idx values.
871         B.buildInstr(AMDGPU::V_CMP_EQ_U32_e64)
872           .addDef(NewCondReg)
873           .addReg(CurrentLaneOpReg)
874           .addReg(OpReg);
875         Op.setReg(CurrentLaneOpReg);
876 
877         if (!First) {
878           Register AndReg = MRI.createVirtualRegister(WaveRC);
879 
880           // If there are multiple operands to consider, AND the conditions together.
881           B.buildInstr(WaveAndOpc)
882             .addDef(AndReg)
883             .addReg(NewCondReg)
884             .addReg(CondReg);
885           CondReg = AndReg;
886         }
887       } else {
888         LLT S32 = LLT::scalar(32);
889         SmallVector<Register, 8> ReadlanePieces;
890 
891         // The compares can be done as 64-bit, but the extract needs to be done
892         // in 32-bit pieces.
893 
894         bool Is64 = OpSize % 64 == 0;
895 
896         LLT UnmergeTy = OpSize % 64 == 0 ? LLT::scalar(64) : LLT::scalar(32);
897         unsigned CmpOp = OpSize % 64 == 0 ? AMDGPU::V_CMP_EQ_U64_e64
898           : AMDGPU::V_CMP_EQ_U32_e64;
899 
903         // Insert the unmerge before the loop.
904 
905         B.setMBB(MBB);
906         auto Unmerge = B.buildUnmerge(UnmergeTy, OpReg);
907         B.setInstr(*I);
908 
909         unsigned NumPieces = Unmerge->getNumOperands() - 1;
910         for (unsigned PieceIdx = 0; PieceIdx != NumPieces; ++PieceIdx) {
911           Register UnmergePiece = Unmerge.getReg(PieceIdx);
912 
913           Register CurrentLaneOpReg;
914           if (Is64) {
915             Register CurrentLaneOpRegLo = MRI.createGenericVirtualRegister(S32);
916             Register CurrentLaneOpRegHi = MRI.createGenericVirtualRegister(S32);
917 
918             MRI.setRegClass(UnmergePiece, &AMDGPU::VReg_64RegClass);
919             MRI.setRegClass(CurrentLaneOpRegLo, &AMDGPU::SReg_32_XM0RegClass);
920             MRI.setRegClass(CurrentLaneOpRegHi, &AMDGPU::SReg_32_XM0RegClass);
921 
922             // Read the next variant <- also loop target.
923             BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
924                     CurrentLaneOpRegLo)
925               .addReg(UnmergePiece, 0, AMDGPU::sub0);
926 
927             // Read the next variant <- also loop target.
928             BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
929                     CurrentLaneOpRegHi)
930               .addReg(UnmergePiece, 0, AMDGPU::sub1);
931 
932             CurrentLaneOpReg =
933               B.buildMerge(LLT::scalar(64),
934                            {CurrentLaneOpRegLo, CurrentLaneOpRegHi})
935               .getReg(0);
936 
937             MRI.setRegClass(CurrentLaneOpReg, &AMDGPU::SReg_64_XEXECRegClass);
938 
939             if (OpTy.getScalarSizeInBits() == 64) {
940               // If we need to produce a 64-bit element vector, use the
941               // merged pieces.
942               ReadlanePieces.push_back(CurrentLaneOpReg);
943             } else {
944               // 32-bit element type.
945               ReadlanePieces.push_back(CurrentLaneOpRegLo);
946               ReadlanePieces.push_back(CurrentLaneOpRegHi);
947             }
948           } else {
949             CurrentLaneOpReg = MRI.createGenericVirtualRegister(S32);
950             MRI.setRegClass(UnmergePiece, &AMDGPU::VGPR_32RegClass);
951             MRI.setRegClass(CurrentLaneOpReg, &AMDGPU::SReg_32_XM0RegClass);
952 
953             // Read the next variant <- also loop target.
954             BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
955                     CurrentLaneOpReg)
956               .addReg(UnmergePiece);
957             ReadlanePieces.push_back(CurrentLaneOpReg);
958           }
959 
960           Register NewCondReg = MRI.createVirtualRegister(WaveRC);
961           bool First = CondReg == AMDGPU::NoRegister;
962           if (First)
963             CondReg = NewCondReg;
964 
965           B.buildInstr(CmpOp)
966             .addDef(NewCondReg)
967             .addReg(CurrentLaneOpReg)
968             .addReg(UnmergePiece);
969 
970           if (!First) {
971             Register AndReg = MRI.createVirtualRegister(WaveRC);
972 
973             // If there are multiple operands to consider, AND the conditions together.
974             B.buildInstr(WaveAndOpc)
975               .addDef(AndReg)
976               .addReg(NewCondReg)
977               .addReg(CondReg);
978             CondReg = AndReg;
979           }
980         }
981 
982         // FIXME: Build merge seems to switch to CONCAT_VECTORS but not
983         // BUILD_VECTOR
984         if (OpTy.isVector()) {
985           auto Merge = B.buildBuildVector(OpTy, ReadlanePieces);
986           Op.setReg(Merge.getReg(0));
987         } else {
988           auto Merge = B.buildMerge(OpTy, ReadlanePieces);
989           Op.setReg(Merge.getReg(0));
990         }
991 
992         MRI.setRegBank(Op.getReg(), AMDGPU::SGPRRegBank);
993       }
994 
995       // Make sure we don't re-process this register again.
996       WaterfalledRegMap.insert(std::make_pair(OldReg, Op.getReg()));
997     }
998   }
999 
1000   B.setInsertPt(*LoopBB, LoopBB->end());
1001 
1002   // Update EXEC, save the original EXEC value to VCC.
1003   B.buildInstr(AndSaveExecOpc)
1004     .addDef(NewExec)
1005     .addReg(CondReg, RegState::Kill);
1006 
1007   MRI.setSimpleHint(NewExec, CondReg);
1008 
1009   // Update EXEC, switch all done bits to 0 and all todo bits to 1.
1010   B.buildInstr(XorTermOpc)
1011     .addDef(ExecReg)
1012     .addReg(ExecReg)
1013     .addReg(NewExec);
1014 
1015   // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
1016   // s_cbranch_scc0?
1017 
1018   // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
1019   B.buildInstr(AMDGPU::S_CBRANCH_EXECNZ)
1020     .addMBB(LoopBB);
1021 
1022   // Save the EXEC mask before the loop.
1023   BuildMI(MBB, MBB.end(), DL, TII->get(MovTermOpc), SaveExecReg)
1024     .addReg(ExecReg);
1025 
1026   // Restore the EXEC mask after the loop.
1027   B.setMBB(*RestoreExecBB);
1028   B.buildInstr(MovTermOpc)
1029     .addDef(ExecReg)
1030     .addReg(SaveExecReg);
1031 
1032   // Set the insert point after the original instruction, so any new
1033   // instructions will be in the remainder.
1034   B.setInsertPt(*RemainderBB, RemainderBB->begin());
1035 
1036   return true;
1037 }
1038 
1039 // Return any unique registers used by \p MI at \p OpIndices that need to be
1040 // handled in a waterfall loop. Returns these registers in \p
1041 // SGPROperandRegs. Returns true if there are any operands to handle and a
1042 // waterfall loop is necessary.
1043 bool AMDGPURegisterBankInfo::collectWaterfallOperands(
1044   SmallSet<Register, 4> &SGPROperandRegs, MachineInstr &MI,
1045   MachineRegisterInfo &MRI, ArrayRef<unsigned> OpIndices) const {
1046   for (unsigned Op : OpIndices) {
1047     assert(MI.getOperand(Op).isUse());
1048     Register Reg = MI.getOperand(Op).getReg();
1049     const RegisterBank *OpBank = getRegBank(Reg, MRI, *TRI);
1050     if (OpBank->getID() != AMDGPU::SGPRRegBankID)
1051       SGPROperandRegs.insert(Reg);
1052   }
1053 
1054   // No operands need to be replaced, so no need to loop.
1055   return !SGPROperandRegs.empty();
1056 }
1057 
1058 bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
1059   MachineIRBuilder &B, MachineInstr &MI, MachineRegisterInfo &MRI,
1060   ArrayRef<unsigned> OpIndices) const {
1061   // Use a set to avoid extra readfirstlanes in the case where multiple operands
1062   // are the same register.
1063   SmallSet<Register, 4> SGPROperandRegs;
1064 
1065   if (!collectWaterfallOperands(SGPROperandRegs, MI, MRI, OpIndices))
1066     return false;
1067 
1068   MachineBasicBlock::iterator I = MI.getIterator();
1069   return executeInWaterfallLoop(B, make_range(I, std::next(I)),
1070                                 SGPROperandRegs, MRI);
1071 }
1072 
1073 bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
1074   MachineInstr &MI, MachineRegisterInfo &MRI,
1075   ArrayRef<unsigned> OpIndices) const {
1076   MachineIRBuilder B(MI);
1077   return executeInWaterfallLoop(B, MI, MRI, OpIndices);
1078 }
1079 
1080 // Legalize an operand that must be an SGPR by inserting a readfirstlane.
1081 void AMDGPURegisterBankInfo::constrainOpWithReadfirstlane(
1082     MachineInstr &MI, MachineRegisterInfo &MRI, unsigned OpIdx) const {
1083   Register Reg = MI.getOperand(OpIdx).getReg();
1084   const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
1085   if (Bank == &AMDGPU::SGPRRegBank)
1086     return;
1087 
1088   LLT Ty = MRI.getType(Reg);
1089   MachineIRBuilder B(MI);
1090 
1091   if (Bank != &AMDGPU::VGPRRegBank) {
1092     // We need to copy from AGPR to VGPR
1093     Reg = B.buildCopy(Ty, Reg).getReg(0);
1094     MRI.setRegBank(Reg, AMDGPU::VGPRRegBank);
1095   }
1096 
1097   Register SGPR = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1098   B.buildInstr(AMDGPU::V_READFIRSTLANE_B32)
1099     .addDef(SGPR)
1100     .addReg(Reg);
1101 
1102   MRI.setType(SGPR, Ty);
1103 
1104   const TargetRegisterClass *Constrained =
1105       constrainGenericRegister(Reg, AMDGPU::VGPR_32RegClass, MRI);
1106   (void)Constrained;
1107   assert(Constrained && "Failed to constrain readfirstlane src reg");
1108 
1109   MI.getOperand(OpIdx).setReg(SGPR);
1110 }
1111 
1112 /// Split \p Ty into 2 pieces. The first will have \p FirstSize bits, and the
1113 /// rest will be in the remainder.
1114 static std::pair<LLT, LLT> splitUnequalType(LLT Ty, unsigned FirstSize) {
1115   unsigned TotalSize = Ty.getSizeInBits();
1116   if (!Ty.isVector())
1117     return {LLT::scalar(FirstSize), LLT::scalar(TotalSize - FirstSize)};
1118 
1119   LLT EltTy = Ty.getElementType();
1120   unsigned EltSize = EltTy.getSizeInBits();
1121   assert(FirstSize % EltSize == 0);
1122 
1123   unsigned FirstPartNumElts = FirstSize / EltSize;
1124   unsigned RemainderElts = (TotalSize - FirstSize) / EltSize;
1125 
1126   return {LLT::scalarOrVector(FirstPartNumElts, EltTy),
1127           LLT::scalarOrVector(RemainderElts, EltTy)};
1128 }
1129 
1130 static LLT widen96To128(LLT Ty) {
1131   if (!Ty.isVector())
1132     return LLT::scalar(128);
1133 
1134   LLT EltTy = Ty.getElementType();
1135   assert(128 % EltTy.getSizeInBits() == 0);
1136   return LLT::vector(128 / EltTy.getSizeInBits(), EltTy);
1137 }
1138 
1139 bool AMDGPURegisterBankInfo::applyMappingLoad(MachineInstr &MI,
1140                         const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
1141                                               MachineRegisterInfo &MRI) const {
1142   Register DstReg = MI.getOperand(0).getReg();
1143   const LLT LoadTy = MRI.getType(DstReg);
1144   unsigned LoadSize = LoadTy.getSizeInBits();
1145   const unsigned MaxNonSmrdLoadSize = 128;
1146 
1147   const RegisterBank *DstBank =
1148       OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
1149   if (DstBank == &AMDGPU::SGPRRegBank) {
1150     // There are some special cases that we need to look at for 32-bit and
1151     // 96-bit SGPR loads; otherwise we have nothing to do.
1152     if (LoadSize != 32 && LoadSize != 96)
1153       return false;
1154 
1155     MachineMemOperand *MMO = *MI.memoperands_begin();
1156     const unsigned MemSize = 8 * MMO->getSize();
1157     // Scalar loads of size 8 or 16 bit with proper alignment may be widened to
1158     // 32 bit. Check to see if we need to widen the memory access: 8 or 16 bit
1159     // scalar loads should have a load size of 32 but a memory access size of
1160     // less than 32.
1161     if (LoadSize == 32 &&
1162         (MemSize == 32 || LoadTy.isVector() || !isScalarLoadLegal(MI)))
1163       return false;
1164 
1165     Register PtrReg = MI.getOperand(1).getReg();
1166 
1167     ApplyRegBankMapping O(*this, MRI, &AMDGPU::SGPRRegBank);
1168     MachineIRBuilder B(MI, O);
1169 
1170     if (LoadSize == 32) {
1171       // This is an extending load from a sub-dword size. Widen the memory
1172       // access size to 4 bytes and clear the extra high bits appropriately
1173       const LLT S32 = LLT::scalar(32);
1174       if (MI.getOpcode() == AMDGPU::G_SEXTLOAD) {
1175         // Must extend the sign bit into higher bits for a G_SEXTLOAD
1176         auto WideLoad = B.buildLoadFromOffset(S32, PtrReg, *MMO, 0);
1177         B.buildSExtInReg(MI.getOperand(0), WideLoad, MemSize);
1178       } else if (MI.getOpcode() == AMDGPU::G_ZEXTLOAD) {
1179         // Must extend zero into higher bits with an AND for a G_ZEXTLOAD
1180         auto WideLoad = B.buildLoadFromOffset(S32, PtrReg, *MMO, 0);
1181         B.buildZExtInReg(MI.getOperand(0), WideLoad, MemSize);
1182       } else
1183         // We do not need to touch the higher bits for regular loads.
1184         B.buildLoadFromOffset(MI.getOperand(0), PtrReg, *MMO, 0);
1185     } else {
1186       // 96-bit loads are only available for vector loads. We need to split this
1187       // into a 64-bit part and a 32-bit part (unless we can widen to a 128-bit load).
1188       if (MMO->getAlign() < Align(16)) {
1189         LLT Part64, Part32;
1190         std::tie(Part64, Part32) = splitUnequalType(LoadTy, 64);
1191         auto Load0 = B.buildLoadFromOffset(Part64, PtrReg, *MMO, 0);
1192         auto Load1 = B.buildLoadFromOffset(Part32, PtrReg, *MMO, 8);
1193 
1194         auto Undef = B.buildUndef(LoadTy);
1195         auto Ins0 = B.buildInsert(LoadTy, Undef, Load0, 0);
1196         B.buildInsert(MI.getOperand(0), Ins0, Load1, 64);
1197       } else {
1198         LLT WiderTy = widen96To128(LoadTy);
1199         auto WideLoad = B.buildLoadFromOffset(WiderTy, PtrReg, *MMO, 0);
1200         B.buildExtract(MI.getOperand(0), WideLoad, 0);
1201       }
1202     }
1203 
1204     MI.eraseFromParent();
1205     return true;
1206   }
1207 
1208   // 128-bit loads are supported for all instruction types.
1209   if (LoadSize <= MaxNonSmrdLoadSize)
1210     return false;
1211 
1212   SmallVector<Register, 16> DefRegs(OpdMapper.getVRegs(0));
1213   SmallVector<Register, 1> SrcRegs(OpdMapper.getVRegs(1));
1214 
1215   if (SrcRegs.empty())
1216     SrcRegs.push_back(MI.getOperand(1).getReg());
1217 
1218   assert(LoadSize % MaxNonSmrdLoadSize == 0);
1219 
1220   // RegBankSelect only emits scalar types, so we need to reset the pointer
1221   // operand to a pointer type.
1222   Register BasePtrReg = SrcRegs[0];
1223   LLT PtrTy = MRI.getType(MI.getOperand(1).getReg());
1224   MRI.setType(BasePtrReg, PtrTy);
1225 
1226   unsigned NumSplitParts = LoadTy.getSizeInBits() / MaxNonSmrdLoadSize;
1227   const LLT LoadSplitTy = LoadTy.divide(NumSplitParts);
1228   ApplyRegBankMapping Observer(*this, MRI, &AMDGPU::VGPRRegBank);
1229   MachineIRBuilder B(MI, Observer);
1230   LegalizerHelper Helper(B.getMF(), Observer, B);
1231 
1232   if (LoadTy.isVector()) {
1233     if (Helper.fewerElementsVector(MI, 0, LoadSplitTy) != LegalizerHelper::Legalized)
1234       return false;
1235   } else {
1236     if (Helper.narrowScalar(MI, 0, LoadSplitTy) != LegalizerHelper::Legalized)
1237       return false;
1238   }
1239 
1240   MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
1241   return true;
1242 }
1243 
1244 bool AMDGPURegisterBankInfo::applyMappingDynStackAlloc(
1245   MachineInstr &MI,
1246   const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
1247   MachineRegisterInfo &MRI) const {
1248   const MachineFunction &MF = *MI.getMF();
1249   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1250   const auto &TFI = *ST.getFrameLowering();
1251 
1252   // Guard in case the stack growth direction ever changes with scratch
1253   // instructions.
1254   if (TFI.getStackGrowthDirection() == TargetFrameLowering::StackGrowsDown)
1255     return false;
1256 
1257   Register Dst = MI.getOperand(0).getReg();
1258   Register AllocSize = MI.getOperand(1).getReg();
1259   Align Alignment = assumeAligned(MI.getOperand(2).getImm());
1260 
1261   const RegisterBank *SizeBank = getRegBank(AllocSize, MRI, *TRI);
1262 
1263   // TODO: Need to emit a wave reduction to get the maximum size.
1264   if (SizeBank != &AMDGPU::SGPRRegBank)
1265     return false;
1266 
1267   LLT PtrTy = MRI.getType(Dst);
1268   LLT IntPtrTy = LLT::scalar(PtrTy.getSizeInBits());
1269 
1270   const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1271   Register SPReg = Info->getStackPtrOffsetReg();
1272   ApplyRegBankMapping ApplyBank(*this, MRI, &AMDGPU::SGPRRegBank);
1273   MachineIRBuilder B(MI, ApplyBank);
1274 
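  // The wave's stack pointer is a byte offset covering every lane's scratch,
  // so the per-lane allocation size is scaled by the wavefront size (a shift
  // left by log2 of the wave size) before being added to SP.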
1275   auto WaveSize = B.buildConstant(LLT::scalar(32), ST.getWavefrontSizeLog2());
1276   auto ScaledSize = B.buildShl(IntPtrTy, AllocSize, WaveSize);
1277 
1278   auto SPCopy = B.buildCopy(PtrTy, SPReg);
1279   if (Alignment > TFI.getStackAlign()) {
1280     auto PtrAdd = B.buildPtrAdd(PtrTy, SPCopy, ScaledSize);
1281     B.buildMaskLowPtrBits(Dst, PtrAdd,
1282                           Log2(Alignment) + ST.getWavefrontSizeLog2());
1283   } else {
1284     B.buildPtrAdd(Dst, SPCopy, ScaledSize);
1285   }
1286 
1287   MI.eraseFromParent();
1288   return true;
1289 }
1290 
1291 bool AMDGPURegisterBankInfo::applyMappingImage(
1292     MachineInstr &MI, const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
1293     MachineRegisterInfo &MRI, int RsrcIdx) const {
1294   const int NumDefs = MI.getNumExplicitDefs();
1295 
1296   // The reported argument index is relative to the IR intrinsic call arguments,
1297   // so we need to shift by the number of defs and the intrinsic ID.
1298   RsrcIdx += NumDefs + 1;
1299 
1300   // Insert copies to VGPR arguments.
1301   applyDefaultMapping(OpdMapper);
1302 
1303   // Fixup any SGPR arguments.
1304   SmallVector<unsigned, 4> SGPRIndexes;
1305   for (int I = NumDefs, NumOps = MI.getNumOperands(); I != NumOps; ++I) {
1306     if (!MI.getOperand(I).isReg())
1307       continue;
1308 
1309     // If this intrinsic has a sampler, it immediately follows rsrc.
1310     if (I == RsrcIdx || I == RsrcIdx + 1)
1311       SGPRIndexes.push_back(I);
1312   }
1313 
1314   executeInWaterfallLoop(MI, MRI, SGPRIndexes);
1315   return true;
1316 }
1317 
1318 static Register getSrcRegIgnoringCopies(const MachineRegisterInfo &MRI,
1319                                         Register Reg) {
1320   MachineInstr *Def = getDefIgnoringCopies(Reg, MRI);
1321   if (!Def)
1322     return Reg;
1323 
1324   // TODO: Guard against this being an implicit def
1325   return Def->getOperand(0).getReg();
1326 }
1327 
1328 // Analyze a combined offset from an llvm.amdgcn.s.buffer intrinsic and store
1329 // the three offsets (voffset, soffset and instoffset)
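// The return value is the byte offset (if any) that should also be applied to
// the load's MachineMemOperand; it is only known when the combined offset is a
// constant that splitMUBUFOffset can fold, and is 0 otherwise.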
1330 static unsigned setBufferOffsets(MachineIRBuilder &B,
1331                                  const AMDGPURegisterBankInfo &RBI,
1332                                  Register CombinedOffset, Register &VOffsetReg,
1333                                  Register &SOffsetReg, int64_t &InstOffsetVal,
1334                                  Align Alignment) {
1335   const LLT S32 = LLT::scalar(32);
1336   MachineRegisterInfo *MRI = B.getMRI();
1337 
1338   if (Optional<int64_t> Imm = getConstantVRegSExtVal(CombinedOffset, *MRI)) {
1339     uint32_t SOffset, ImmOffset;
1340     if (AMDGPU::splitMUBUFOffset(*Imm, SOffset, ImmOffset, &RBI.Subtarget,
1341                                  Alignment)) {
1342       VOffsetReg = B.buildConstant(S32, 0).getReg(0);
1343       SOffsetReg = B.buildConstant(S32, SOffset).getReg(0);
1344       InstOffsetVal = ImmOffset;
1345 
1346       B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank);
1347       B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank);
1348       return SOffset + ImmOffset;
1349     }
1350   }
1351 
1352   Register Base;
1353   unsigned Offset;
1354 
1355   std::tie(Base, Offset) =
1356       AMDGPU::getBaseWithConstantOffset(*MRI, CombinedOffset);
1357 
1358   uint32_t SOffset, ImmOffset;
1359   if ((int)Offset > 0 && AMDGPU::splitMUBUFOffset(Offset, SOffset, ImmOffset,
1360                                                   &RBI.Subtarget, Alignment)) {
1361     if (RBI.getRegBank(Base, *MRI, *RBI.TRI) == &AMDGPU::VGPRRegBank) {
1362       VOffsetReg = Base;
1363       SOffsetReg = B.buildConstant(S32, SOffset).getReg(0);
1364       B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank);
1365       InstOffsetVal = ImmOffset;
1366       return 0; // XXX - Why is this 0?
1367     }
1368 
1369     // If we have an SGPR base, we can use it for soffset.
1370     if (SOffset == 0) {
1371       VOffsetReg = B.buildConstant(S32, 0).getReg(0);
1372       B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank);
1373       SOffsetReg = Base;
1374       InstOffsetVal = ImmOffset;
1375       return 0; // XXX - Why is this 0?
1376     }
1377   }
1378 
1379   // Handle the variable sgpr + vgpr case.
1380   MachineInstr *Add = getOpcodeDef(AMDGPU::G_ADD, CombinedOffset, *MRI);
1381   if (Add && (int)Offset >= 0) {
1382     Register Src0 = getSrcRegIgnoringCopies(*MRI, Add->getOperand(1).getReg());
1383     Register Src1 = getSrcRegIgnoringCopies(*MRI, Add->getOperand(2).getReg());
1384 
1385     const RegisterBank *Src0Bank = RBI.getRegBank(Src0, *MRI, *RBI.TRI);
1386     const RegisterBank *Src1Bank = RBI.getRegBank(Src1, *MRI, *RBI.TRI);
1387 
1388     if (Src0Bank == &AMDGPU::VGPRRegBank && Src1Bank == &AMDGPU::SGPRRegBank) {
1389       VOffsetReg = Src0;
1390       SOffsetReg = Src1;
1391       return 0;
1392     }
1393 
1394     if (Src0Bank == &AMDGPU::SGPRRegBank && Src1Bank == &AMDGPU::VGPRRegBank) {
1395       VOffsetReg = Src1;
1396       SOffsetReg = Src0;
1397       return 0;
1398     }
1399   }
1400 
1401   // Ensure we have a VGPR for the combined offset. This could be an issue if we
1402   // have an SGPR offset and a VGPR resource.
1403   if (RBI.getRegBank(CombinedOffset, *MRI, *RBI.TRI) == &AMDGPU::VGPRRegBank) {
1404     VOffsetReg = CombinedOffset;
1405   } else {
1406     VOffsetReg = B.buildCopy(S32, CombinedOffset).getReg(0);
1407     B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank);
1408   }
1409 
1410   SOffsetReg = B.buildConstant(S32, 0).getReg(0);
1411   B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank);
1412   return 0;
1413 }
1414 
1415 bool AMDGPURegisterBankInfo::applyMappingSBufferLoad(
1416   const OperandsMapper &OpdMapper) const {
1417   MachineInstr &MI = OpdMapper.getMI();
1418   MachineRegisterInfo &MRI = OpdMapper.getMRI();
1419 
1420   const LLT S32 = LLT::scalar(32);
1421   Register Dst = MI.getOperand(0).getReg();
1422   LLT Ty = MRI.getType(Dst);
1423 
1424   const RegisterBank *RSrcBank =
1425     OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
1426   const RegisterBank *OffsetBank =
1427     OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
1428   if (RSrcBank == &AMDGPU::SGPRRegBank &&
1429       OffsetBank == &AMDGPU::SGPRRegBank)
1430     return true; // Legal mapping
1431 
1432   // FIXME: 96-bit case was widened during legalize. We need to narrow it back
1433   // here but don't have an MMO.
1434 
1435   unsigned LoadSize = Ty.getSizeInBits();
1436   int NumLoads = 1;
1437   if (LoadSize == 256 || LoadSize == 512) {
1438     NumLoads = LoadSize / 128;
1439     Ty = Ty.divide(NumLoads);
1440   }
1441 
1442   // Use the alignment to ensure that the required offsets will fit into the
1443   // immediate offsets.
1444   const Align Alignment = NumLoads > 1 ? Align(16 * NumLoads) : Align(1);
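  // For example, a 512-bit result is split into four 128-bit loads whose
  // immediate offsets step by 16 bytes; requiring a 64-byte aligned combined
  // offset keeps each of those split offsets encodable.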
1445 
1446   MachineIRBuilder B(MI);
1447   MachineFunction &MF = B.getMF();
1448 
1449   Register SOffset;
1450   Register VOffset;
1451   int64_t ImmOffset = 0;
1452 
1453   unsigned MMOOffset = setBufferOffsets(B, *this, MI.getOperand(2).getReg(),
1454                                         VOffset, SOffset, ImmOffset, Alignment);
1455 
1456   // TODO: 96-bit loads were widened to 128-bit results. Shrink the result if we
1457   // can, but we need to track an MMO for that.
1458   const unsigned MemSize = (Ty.getSizeInBits() + 7) / 8;
1459   const Align MemAlign(4); // FIXME: ABI type alignment?
1460   MachineMemOperand *BaseMMO = MF.getMachineMemOperand(
1461     MachinePointerInfo(),
1462     MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1463     MachineMemOperand::MOInvariant,
1464     MemSize, MemAlign);
1465   if (MMOOffset != 0)
1466     BaseMMO = MF.getMachineMemOperand(BaseMMO, MMOOffset, MemSize);
1467 
1468   // If only the offset is divergent, emit a MUBUF buffer load instead. We can
1469   // assume that the buffer is unswizzled.
1470 
1471   Register RSrc = MI.getOperand(1).getReg();
1472   Register VIndex = B.buildConstant(S32, 0).getReg(0);
1473   B.getMRI()->setRegBank(VIndex, AMDGPU::VGPRRegBank);
1474 
1475   SmallVector<Register, 4> LoadParts(NumLoads);
1476 
1477   MachineBasicBlock::iterator MII = MI.getIterator();
1478   MachineInstrSpan Span(MII, &B.getMBB());
1479 
1480   for (int i = 0; i < NumLoads; ++i) {
1481     if (NumLoads == 1) {
1482       LoadParts[i] = Dst;
1483     } else {
1484       LoadParts[i] = MRI.createGenericVirtualRegister(Ty);
1485       MRI.setRegBank(LoadParts[i], AMDGPU::VGPRRegBank);
1486     }
1487 
1488     MachineMemOperand *MMO = BaseMMO;
1489     if (i != 0)
1490       BaseMMO = MF.getMachineMemOperand(BaseMMO, MMOOffset + 16 * i, MemSize);
1491 
1492     B.buildInstr(AMDGPU::G_AMDGPU_BUFFER_LOAD)
1493       .addDef(LoadParts[i])       // vdata
1494       .addUse(RSrc)               // rsrc
1495       .addUse(VIndex)             // vindex
1496       .addUse(VOffset)            // voffset
1497       .addUse(SOffset)            // soffset
1498       .addImm(ImmOffset + 16 * i) // offset(imm)
1499       .addImm(0)                  // cachepolicy, swizzled buffer(imm)
1500       .addImm(0)                  // idxen(imm)
1501       .addMemOperand(MMO);
1502   }
1503 
1504   // TODO: If only the resource is a VGPR, it may be better to execute the
1505   // scalar load in the waterfall loop if the resource is expected to frequently
1506   // be dynamically uniform.
1507   if (RSrcBank != &AMDGPU::SGPRRegBank) {
1508     // Remove the original instruction to avoid potentially confusing the
1509     // waterfall loop logic.
1510     B.setInstr(*Span.begin());
1511     MI.eraseFromParent();
1512 
1513     SmallSet<Register, 4> OpsToWaterfall;
1514 
1515     OpsToWaterfall.insert(RSrc);
1516     executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()),
1517                            OpsToWaterfall, MRI);
1518   }
1519 
1520   if (NumLoads != 1) {
1521     if (Ty.isVector())
1522       B.buildConcatVectors(Dst, LoadParts);
1523     else
1524       B.buildMerge(Dst, LoadParts);
1525   }
1526 
1527   // We removed the instruction earlier with a waterfall loop.
1528   if (RSrcBank == &AMDGPU::SGPRRegBank)
1529     MI.eraseFromParent();
1530 
1531   return true;
1532 }
1533 
1534 bool AMDGPURegisterBankInfo::applyMappingBFEIntrinsic(
1535   const OperandsMapper &OpdMapper, bool Signed) const {
1536   MachineInstr &MI = OpdMapper.getMI();
1537   MachineRegisterInfo &MRI = OpdMapper.getMRI();
1538 
1539   // Insert basic copies
1540   applyDefaultMapping(OpdMapper);
1541 
1542   Register DstReg = MI.getOperand(0).getReg();
1543   LLT Ty = MRI.getType(DstReg);
1544 
1545   const LLT S32 = LLT::scalar(32);
1546 
1547   const RegisterBank *DstBank =
1548     OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
1549   if (DstBank == &AMDGPU::VGPRRegBank) {
1550     if (Ty == S32)
1551       return true;
1552 
1553     // TODO: 64-bit version is scalar only, so we need to expand this.
1554     return false;
1555   }
1556 
1557   Register SrcReg = MI.getOperand(2).getReg();
1558   Register OffsetReg = MI.getOperand(3).getReg();
1559   Register WidthReg = MI.getOperand(4).getReg();
1560 
1561   // The scalar form packs the offset and width in a single operand.
1562 
1563   ApplyRegBankMapping ApplyBank(*this, MRI, &AMDGPU::SGPRRegBank);
1564   MachineIRBuilder B(MI, ApplyBank);
1565 
1566   // Ensure the high bits are clear to insert the offset.
1567   auto OffsetMask = B.buildConstant(S32, maskTrailingOnes<unsigned>(6));
1568   auto ClampOffset = B.buildAnd(S32, OffsetReg, OffsetMask);
1569 
1570   // Zeros out the low bits, so don't bother clamping the input value.
1571   auto ShiftWidth = B.buildShl(S32, WidthReg, B.buildConstant(S32, 16));
1572 
1573   // Pack the offset and width of the BFE into the format expected by
1574   // S_BFE_I32 / S_BFE_U32: in the second source operand, bits [5:0]
1575   // contain the offset and bits [22:16] the width.
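  // For example, offset = 8 and width = 16 pack to (16 << 16) | 8 == 0x100008.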
1576   auto MergedInputs = B.buildOr(S32, ClampOffset, ShiftWidth);
1577 
1578   // TODO: It might be worth using a pseudo here to avoid scc clobber and
1579   // register class constraints.
1580   unsigned Opc = Ty == S32 ? (Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32) :
1581                              (Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64);
1582 
1583   auto MIB = B.buildInstr(Opc, {DstReg}, {SrcReg, MergedInputs});
1584   if (!constrainSelectedInstRegOperands(*MIB, *TII, *TRI, *this))
1585     llvm_unreachable("failed to constrain BFE");
1586 
1587   MI.eraseFromParent();
1588   return true;
1589 }
1590 
1591 // Return a suitable opcode for extending the operands of Opc when widening.
1592 static unsigned getExtendOp(unsigned Opc) {
1593   switch (Opc) {
1594   case TargetOpcode::G_ASHR:
1595   case TargetOpcode::G_SMIN:
1596   case TargetOpcode::G_SMAX:
1597     return TargetOpcode::G_SEXT;
1598   case TargetOpcode::G_LSHR:
1599   case TargetOpcode::G_UMIN:
1600   case TargetOpcode::G_UMAX:
1601     return TargetOpcode::G_ZEXT;
1602   default:
1603     return TargetOpcode::G_ANYEXT;
1604   }
1605 }
1606 
1607 // Emit a legalized extension from <2 x s16> to 2 32-bit components, avoiding
1608 // any illegal vector extend or unmerge operations.
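// Roughly, for a signed unpack of %src(<2 x s16>) this builds:
//   %cast:_(s32) = G_BITCAST %src
//   %lo:_(s32)   = G_SEXT_INREG %cast, 16
//   %hi:_(s32)   = G_ASHR %cast, 16
// with G_AND / G_LSHR used instead for the unsigned case.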
1609 static std::pair<Register, Register>
1610 unpackV2S16ToS32(MachineIRBuilder &B, Register Src, unsigned ExtOpcode) {
1611   const LLT S32 = LLT::scalar(32);
1612   auto Bitcast = B.buildBitcast(S32, Src);
1613 
1614   if (ExtOpcode == TargetOpcode::G_SEXT) {
1615     auto ExtLo = B.buildSExtInReg(S32, Bitcast, 16);
1616     auto ShiftHi = B.buildAShr(S32, Bitcast, B.buildConstant(S32, 16));
1617     return std::make_pair(ExtLo.getReg(0), ShiftHi.getReg(0));
1618   }
1619 
1620   auto ShiftHi = B.buildLShr(S32, Bitcast, B.buildConstant(S32, 16));
1621   if (ExtOpcode == TargetOpcode::G_ZEXT) {
1622     auto ExtLo = B.buildAnd(S32, Bitcast, B.buildConstant(S32, 0xffff));
1623     return std::make_pair(ExtLo.getReg(0), ShiftHi.getReg(0));
1624   }
1625 
1626   assert(ExtOpcode == TargetOpcode::G_ANYEXT);
1627   return std::make_pair(Bitcast.getReg(0), ShiftHi.getReg(0));
1628 }
1629 
1630 // For cases where only a single copy is inserted for matching register banks,
1631 // replace the register in the instruction operand.
1632 static bool substituteSimpleCopyRegs(
1633   const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper, unsigned OpIdx) {
1634   SmallVector<unsigned, 1> SrcReg(OpdMapper.getVRegs(OpIdx));
1635   if (!SrcReg.empty()) {
1636     assert(SrcReg.size() == 1);
1637     OpdMapper.getMI().getOperand(OpIdx).setReg(SrcReg[0]);
1638     return true;
1639   }
1640 
1641   return false;
1642 }
1643 
1644 /// Handle register layout difference for f16 images for some subtargets.
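/// On subtargets with unpacked D16 VMem, each 16-bit component is expected in
/// the low half of its own 32-bit register, so a <N x s16> payload is rebuilt
/// below as an <N x s32> value with one component per 32-bit element.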
1645 Register AMDGPURegisterBankInfo::handleD16VData(MachineIRBuilder &B,
1646                                                 MachineRegisterInfo &MRI,
1647                                                 Register Reg) const {
1648   if (!Subtarget.hasUnpackedD16VMem())
1649     return Reg;
1650 
1651   const LLT S16 = LLT::scalar(16);
1652   LLT StoreVT = MRI.getType(Reg);
1653   if (!StoreVT.isVector() || StoreVT.getElementType() != S16)
1654     return Reg;
1655 
1656   auto Unmerge = B.buildUnmerge(S16, Reg);
1657 
1658 
1659   SmallVector<Register, 4> WideRegs;
1660   for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
1661     WideRegs.push_back(Unmerge.getReg(I));
1662 
1663   const LLT S32 = LLT::scalar(32);
1664   int NumElts = StoreVT.getNumElements();
1665 
1666   return B.buildMerge(LLT::vector(NumElts, S32), WideRegs).getReg(0);
1667 }
1668 
1669 static std::pair<Register, unsigned>
1670 getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg) {
1671   int64_t Const;
1672   if (mi_match(Reg, MRI, m_ICst(Const)))
1673     return std::make_pair(Register(), Const);
1674 
1675   Register Base;
1676   if (mi_match(Reg, MRI, m_GAdd(m_Reg(Base), m_ICst(Const))))
1677     return std::make_pair(Base, Const);
1678 
1679   // TODO: Handle G_OR used for add case
1680   return std::make_pair(Reg, 0);
1681 }
1682 
1683 std::pair<Register, unsigned>
1684 AMDGPURegisterBankInfo::splitBufferOffsets(MachineIRBuilder &B,
1685                                            Register OrigOffset) const {
1686   const unsigned MaxImm = 4095;
1687   Register BaseReg;
1688   unsigned ImmOffset;
1689   const LLT S32 = LLT::scalar(32);
1690 
1691   std::tie(BaseReg, ImmOffset) = getBaseWithConstantOffset(*B.getMRI(),
1692                                                            OrigOffset);
1693 
1694   unsigned C1 = 0;
1695   if (ImmOffset != 0) {
1696     // If the immediate value is too big for the immoffset field, put the value
1697     // and -4096 into the immoffset field so that the value that is copied/added
1698     // for the voffset field is a multiple of 4096, and it stands more chance
1699     // of being CSEd with the copy/add for another similar load/store.
1700     // However, do not do that rounding down to a multiple of 4096 if that is a
1701     // negative number, as it appears to be illegal to have a negative offset
1702     // in the vgpr, even if adding the immediate offset makes it positive.
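    // For example, an ImmOffset of 5000 is split into an Overflow of 4096
    // (added to the base register below) and an ImmOffset of 904 that stays in
    // the immediate field.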
1703     unsigned Overflow = ImmOffset & ~MaxImm;
1704     ImmOffset -= Overflow;
1705     if ((int32_t)Overflow < 0) {
1706       Overflow += ImmOffset;
1707       ImmOffset = 0;
1708     }
1709 
1710     C1 = ImmOffset;
1711     if (Overflow != 0) {
1712       if (!BaseReg)
1713         BaseReg = B.buildConstant(S32, Overflow).getReg(0);
1714       else {
1715         auto OverflowVal = B.buildConstant(S32, Overflow);
1716         BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
1717       }
1718     }
1719   }
1720 
1721   if (!BaseReg)
1722     BaseReg = B.buildConstant(S32, 0).getReg(0);
1723 
1724   return {BaseReg, C1};
1725 }
1726 
1727 static bool isZero(Register Reg, MachineRegisterInfo &MRI) {
1728   int64_t C;
1729   return mi_match(Reg, MRI, m_ICst(C)) && C == 0;
1730 }
1731 
1732 static unsigned extractCPol(unsigned CachePolicy) {
1733   return CachePolicy & AMDGPU::CPol::ALL;
1734 }
1735 
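// The swizzle flag for these buffer intrinsics is carried in bit 3 of the
// cachepolicy immediate, which is what the shift below extracts.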
1736 static unsigned extractSWZ(unsigned CachePolicy) {
1737   return (CachePolicy >> 3) & 1;
1738 }
1739 
1740 
1741 MachineInstr *
1742 AMDGPURegisterBankInfo::selectStoreIntrinsic(MachineIRBuilder &B,
1743                                              MachineInstr &MI) const {
1744   MachineRegisterInfo &MRI = *B.getMRI();
1745   executeInWaterfallLoop(B, MI, MRI, {2, 4});
1746 
1747   // FIXME: DAG lowering brokenly changes opcode based on FP vs. integer.
1748 
1749   Register VData = MI.getOperand(1).getReg();
1750   LLT Ty = MRI.getType(VData);
1751 
1752   int EltSize = Ty.getScalarSizeInBits();
1753   int Size = Ty.getSizeInBits();
1754 
1755   // FIXME: Broken integer truncstore.
1756   if (EltSize != 32)
1757     report_fatal_error("unhandled intrinsic store");
1758 
1759   // FIXME: Verifier should enforce 1 MMO for these intrinsics.
1760   const int MemSize = (*MI.memoperands_begin())->getSize();
1761 
1762 
1763   Register RSrc = MI.getOperand(2).getReg();
1764   Register VOffset = MI.getOperand(3).getReg();
1765   Register SOffset = MI.getOperand(4).getReg();
1766   unsigned CachePolicy = MI.getOperand(5).getImm();
1767 
1768   unsigned ImmOffset;
1769   std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
1770 
1771   const bool Offen = !isZero(VOffset, MRI);
1772 
1773   unsigned Opc = AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact;
1774   switch (8 * MemSize) {
1775   case 8:
1776     Opc = Offen ? AMDGPU::BUFFER_STORE_BYTE_OFFEN_exact :
1777                   AMDGPU::BUFFER_STORE_BYTE_OFFSET_exact;
1778     break;
1779   case 16:
1780     Opc = Offen ? AMDGPU::BUFFER_STORE_SHORT_OFFEN_exact :
1781                   AMDGPU::BUFFER_STORE_SHORT_OFFSET_exact;
1782     break;
1783   default:
1784     Opc = Offen ? AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact :
1785                   AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact;
1786     if (Size > 32)
1787       Opc = AMDGPU::getMUBUFOpcode(Opc, Size / 32);
1788     break;
1789   }
1790 
1791 
1792   // Set the insertion point back to the instruction in case it was moved into a
1793   // loop.
1794   B.setInstr(MI);
1795 
1796   MachineInstrBuilder MIB = B.buildInstr(Opc)
1797     .addUse(VData);
1798 
1799   if (Offen)
1800     MIB.addUse(VOffset);
1801 
1802   MIB.addUse(RSrc)
1803      .addUse(SOffset)
1804      .addImm(ImmOffset)
1805      .addImm(extractCPol(CachePolicy))
1806      .addImm(0) // tfe: FIXME: Remove from inst
1807      .addImm(extractSWZ(CachePolicy))
1808      .cloneMemRefs(MI);
1809 
1810   // FIXME: We need a way to report failure from applyMappingImpl.
1811   // Insert constrain copies before inserting the loop.
1812   if (!constrainSelectedInstRegOperands(*MIB, *TII, *TRI, *this))
1813     report_fatal_error("failed to constrain selected store intrinsic");
1814 
1815   return MIB;
1816 }
1817 
1818 bool AMDGPURegisterBankInfo::buildVCopy(MachineIRBuilder &B, Register DstReg,
1819                                         Register SrcReg) const {
1820   MachineRegisterInfo &MRI = *B.getMRI();
1821   LLT SrcTy = MRI.getType(SrcReg);
1822   if (SrcTy.getSizeInBits() == 32) {
1823     // Use a v_mov_b32 here to make the exec dependency explicit.
1824     B.buildInstr(AMDGPU::V_MOV_B32_e32)
1825       .addDef(DstReg)
1826       .addUse(SrcReg);
1827     return constrainGenericRegister(DstReg, AMDGPU::VGPR_32RegClass, MRI) &&
1828            constrainGenericRegister(SrcReg, AMDGPU::SReg_32RegClass, MRI);
1829   }
1830 
1831   Register TmpReg0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1832   Register TmpReg1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1833 
1834   B.buildInstr(AMDGPU::V_MOV_B32_e32)
1835     .addDef(TmpReg0)
1836     .addUse(SrcReg, 0, AMDGPU::sub0);
1837   B.buildInstr(AMDGPU::V_MOV_B32_e32)
1838     .addDef(TmpReg1)
1839     .addUse(SrcReg, 0, AMDGPU::sub1);
1840   B.buildInstr(AMDGPU::REG_SEQUENCE)
1841     .addDef(DstReg)
1842     .addUse(TmpReg0)
1843     .addImm(AMDGPU::sub0)
1844     .addUse(TmpReg1)
1845     .addImm(AMDGPU::sub1);
1846 
1847   return constrainGenericRegister(SrcReg, AMDGPU::SReg_64RegClass, MRI) &&
1848          constrainGenericRegister(DstReg, AMDGPU::VReg_64RegClass, MRI);
1849 }
1850 
1851 /// Utility function for pushing dynamic vector indexes with a constant offset
1852 /// into waterfall loops.
1853 static void reinsertVectorIndexAdd(MachineIRBuilder &B,
1854                                    MachineInstr &IdxUseInstr,
1855                                    unsigned OpIdx,
1856                                    unsigned ConstOffset) {
1857   MachineRegisterInfo &MRI = *B.getMRI();
1858   const LLT S32 = LLT::scalar(32);
1859   Register WaterfallIdx = IdxUseInstr.getOperand(OpIdx).getReg();
1860   B.setInsertPt(*IdxUseInstr.getParent(), IdxUseInstr.getIterator());
1861 
1862   auto MaterializedOffset = B.buildConstant(S32, ConstOffset);
1863 
1864   auto Add = B.buildAdd(S32, WaterfallIdx, MaterializedOffset);
1865   MRI.setRegBank(MaterializedOffset.getReg(0), AMDGPU::SGPRRegBank);
1866   MRI.setRegBank(Add.getReg(0), AMDGPU::SGPRRegBank);
1867   IdxUseInstr.getOperand(OpIdx).setReg(Add.getReg(0));
1868 }
1869 
1870 /// Implement extending a 32-bit value to a 64-bit value. \p Lo32Reg is the
1871 /// original 32-bit source value (to be inserted in the low part of the combined
1872 /// 64-bit result), and \p Hi32Reg is the high half of the combined 64-bit
1873 /// value.
1874 static void extendLow32IntoHigh32(MachineIRBuilder &B,
1875                                   Register Hi32Reg, Register Lo32Reg,
1876                                   unsigned ExtOpc,
1877                                   const RegisterBank &RegBank,
1878                                   bool IsBooleanSrc = false) {
1879   if (ExtOpc == AMDGPU::G_ZEXT) {
1880     B.buildConstant(Hi32Reg, 0);
1881   } else if (ExtOpc == AMDGPU::G_SEXT) {
1882     if (IsBooleanSrc) {
1883       // If we know the original source was an s1, the high half is the same as
1884       // the low.
1885       B.buildCopy(Hi32Reg, Lo32Reg);
1886     } else {
1887       // Replicate sign bit from 32-bit extended part.
1888       auto ShiftAmt = B.buildConstant(LLT::scalar(32), 31);
1889       B.getMRI()->setRegBank(ShiftAmt.getReg(0), RegBank);
1890       B.buildAShr(Hi32Reg, Lo32Reg, ShiftAmt);
1891     }
1892   } else {
1893     assert(ExtOpc == AMDGPU::G_ANYEXT && "not an integer extension");
1894     B.buildUndef(Hi32Reg);
1895   }
1896 }
1897 
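/// Lower a dynamically indexed G_EXTRACT_VECTOR_ELT into a chain of compares
/// and selects: each constant lane index is compared against the real index,
/// and the matching unmerged element is selected into the result.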
1898 bool AMDGPURegisterBankInfo::foldExtractEltToCmpSelect(
1899   MachineInstr &MI, MachineRegisterInfo &MRI,
1900   const OperandsMapper &OpdMapper) const {
1901 
1902   Register VecReg = MI.getOperand(1).getReg();
1903   Register Idx = MI.getOperand(2).getReg();
1904 
1905   const RegisterBank &IdxBank =
1906     *OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
1907 
1908   bool IsDivergentIdx = IdxBank != AMDGPU::SGPRRegBank;
1909 
1910   LLT VecTy = MRI.getType(VecReg);
1911   unsigned EltSize = VecTy.getScalarSizeInBits();
1912   unsigned NumElem = VecTy.getNumElements();
1913 
1914   if (!SITargetLowering::shouldExpandVectorDynExt(EltSize, NumElem,
1915                                                   IsDivergentIdx))
1916     return false;
1917 
1918   MachineIRBuilder B(MI);
1919   LLT S32 = LLT::scalar(32);
1920 
1921   const RegisterBank &DstBank =
1922     *OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
1923   const RegisterBank &SrcBank =
1924     *OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
1925 
1926   const RegisterBank &CCBank =
1927     (DstBank == AMDGPU::SGPRRegBank &&
1928      SrcBank == AMDGPU::SGPRRegBank &&
1929      IdxBank == AMDGPU::SGPRRegBank) ? AMDGPU::SGPRRegBank
1930                                      : AMDGPU::VCCRegBank;
1931   LLT CCTy = (CCBank == AMDGPU::SGPRRegBank) ? S32 : LLT::scalar(1);
1932 
1933   if (CCBank == AMDGPU::VCCRegBank && IdxBank == AMDGPU::SGPRRegBank) {
1934     Idx = B.buildCopy(S32, Idx)->getOperand(0).getReg();
1935     MRI.setRegBank(Idx, AMDGPU::VGPRRegBank);
1936   }
1937 
1938   LLT EltTy = VecTy.getScalarType();
1939   SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0));
1940   unsigned NumLanes = DstRegs.size();
1941   if (!NumLanes)
1942     NumLanes = 1;
1943   else
1944     EltTy = MRI.getType(DstRegs[0]);
1945 
1946   auto UnmergeToEltTy = B.buildUnmerge(EltTy, VecReg);
1947   SmallVector<Register, 2> Res(NumLanes);
1948   for (unsigned L = 0; L < NumLanes; ++L)
1949     Res[L] = UnmergeToEltTy.getReg(L);
1950 
1951   for (unsigned I = 1; I < NumElem; ++I) {
1952     auto IC = B.buildConstant(S32, I);
1953     MRI.setRegBank(IC->getOperand(0).getReg(), AMDGPU::SGPRRegBank);
1954     auto Cmp = B.buildICmp(CmpInst::ICMP_EQ, CCTy, Idx, IC);
1955     MRI.setRegBank(Cmp->getOperand(0).getReg(), CCBank);
1956 
1957     for (unsigned L = 0; L < NumLanes; ++L) {
1958       auto S = B.buildSelect(EltTy, Cmp,
1959                              UnmergeToEltTy.getReg(I * NumLanes + L), Res[L]);
1960 
1961       for (unsigned N : { 0, 2, 3 })
1962         MRI.setRegBank(S->getOperand(N).getReg(), DstBank);
1963 
1964       Res[L] = S->getOperand(0).getReg();
1965     }
1966   }
1967 
1968   for (unsigned L = 0; L < NumLanes; ++L) {
1969     Register DstReg = (NumLanes == 1) ? MI.getOperand(0).getReg() : DstRegs[L];
1970     B.buildCopy(DstReg, Res[L]);
1971     MRI.setRegBank(DstReg, DstBank);
1972   }
1973 
1974   MRI.setRegBank(MI.getOperand(0).getReg(), DstBank);
1975   MI.eraseFromParent();
1976 
1977   return true;
1978 }
1979 
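/// Lower a dynamically indexed G_INSERT_VECTOR_ELT the same way: each constant
/// lane index is compared against the real index, and either the inserted
/// value or the original element is selected before rebuilding the vector.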
1980 bool AMDGPURegisterBankInfo::foldInsertEltToCmpSelect(
1981   MachineInstr &MI, MachineRegisterInfo &MRI,
1982   const OperandsMapper &OpdMapper) const {
1983 
1984   Register VecReg = MI.getOperand(1).getReg();
1985   Register Idx = MI.getOperand(3).getReg();
1986 
1987   const RegisterBank &IdxBank =
1988     *OpdMapper.getInstrMapping().getOperandMapping(3).BreakDown[0].RegBank;
1989 
1990   bool IsDivergentIdx = IdxBank != AMDGPU::SGPRRegBank;
1991 
1992   LLT VecTy = MRI.getType(VecReg);
1993   unsigned EltSize = VecTy.getScalarSizeInBits();
1994   unsigned NumElem = VecTy.getNumElements();
1995 
1996   if (!SITargetLowering::shouldExpandVectorDynExt(EltSize, NumElem,
1997                                                   IsDivergentIdx))
1998     return false;
1999 
2000   MachineIRBuilder B(MI);
2001   LLT S32 = LLT::scalar(32);
2002 
2003   const RegisterBank &DstBank =
2004     *OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2005   const RegisterBank &SrcBank =
2006     *OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
2007   const RegisterBank &InsBank =
2008     *OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
2009 
2010   const RegisterBank &CCBank =
2011     (DstBank == AMDGPU::SGPRRegBank &&
2012      SrcBank == AMDGPU::SGPRRegBank &&
2013      InsBank == AMDGPU::SGPRRegBank &&
2014      IdxBank == AMDGPU::SGPRRegBank) ? AMDGPU::SGPRRegBank
2015                                      : AMDGPU::VCCRegBank;
2016   LLT CCTy = (CCBank == AMDGPU::SGPRRegBank) ? S32 : LLT::scalar(1);
2017 
2018   if (CCBank == AMDGPU::VCCRegBank && IdxBank == AMDGPU::SGPRRegBank) {
2019     Idx = B.buildCopy(S32, Idx)->getOperand(0).getReg();
2020     MRI.setRegBank(Idx, AMDGPU::VGPRRegBank);
2021   }
2022 
2023   LLT EltTy = VecTy.getScalarType();
2024   SmallVector<Register, 2> InsRegs(OpdMapper.getVRegs(2));
2025   unsigned NumLanes = InsRegs.size();
2026   if (!NumLanes) {
2027     NumLanes = 1;
2028     InsRegs.push_back(MI.getOperand(2).getReg());
2029   } else {
2030     EltTy = MRI.getType(InsRegs[0]);
2031   }
2032 
2033   auto UnmergeToEltTy = B.buildUnmerge(EltTy, VecReg);
2034   SmallVector<Register, 16> Ops(NumElem * NumLanes);
2035 
2036   for (unsigned I = 0; I < NumElem; ++I) {
2037     auto IC = B.buildConstant(S32, I);
2038     MRI.setRegBank(IC->getOperand(0).getReg(), AMDGPU::SGPRRegBank);
2039     auto Cmp = B.buildICmp(CmpInst::ICMP_EQ, CCTy, Idx, IC);
2040     MRI.setRegBank(Cmp->getOperand(0).getReg(), CCBank);
2041 
2042     for (unsigned L = 0; L < NumLanes; ++L) {
2043       auto S = B.buildSelect(EltTy, Cmp, InsRegs[L],
2044                              UnmergeToEltTy.getReg(I * NumLanes + L));
2045 
2046       for (unsigned N : { 0, 2, 3 })
2047         MRI.setRegBank(S->getOperand(N).getReg(), DstBank);
2048 
2049       Ops[I * NumLanes + L] = S->getOperand(0).getReg();
2050     }
2051   }
2052 
2053   LLT MergeTy = LLT::vector(Ops.size(), EltTy);
2054   if (MergeTy == MRI.getType(MI.getOperand(0).getReg())) {
2055     B.buildBuildVector(MI.getOperand(0), Ops);
2056   } else {
2057     auto Vec = B.buildBuildVector(MergeTy, Ops);
2058     MRI.setRegBank(Vec->getOperand(0).getReg(), DstBank);
2059     B.buildBitcast(MI.getOperand(0).getReg(), Vec);
2060   }
2061 
2062   MRI.setRegBank(MI.getOperand(0).getReg(), DstBank);
2063   MI.eraseFromParent();
2064 
2065   return true;
2066 }
2067 
2068 void AMDGPURegisterBankInfo::applyMappingImpl(
2069     const OperandsMapper &OpdMapper) const {
2070   MachineInstr &MI = OpdMapper.getMI();
2071   unsigned Opc = MI.getOpcode();
2072   MachineRegisterInfo &MRI = OpdMapper.getMRI();
2073   switch (Opc) {
2074   case AMDGPU::G_PHI: {
2075     Register DstReg = MI.getOperand(0).getReg();
2076     LLT DstTy = MRI.getType(DstReg);
2077     if (DstTy != LLT::scalar(1))
2078       break;
2079 
2080     const LLT S32 = LLT::scalar(32);
2081     const RegisterBank *DstBank =
2082       OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2083     if (DstBank == &AMDGPU::VCCRegBank) {
2084       applyDefaultMapping(OpdMapper);
2085       // The standard handling only considers the result register bank for
2086       // phis. For VCC, blindly inserting a copy when the phi is lowered will
2087       // produce an invalid copy. We can only copy with some kind of compare to
2088       // get a vector boolean result. Insert a register bank copy that will be
2089       // correctly lowered to a compare.
2090       MachineIRBuilder B(*MI.getParent()->getParent());
2091 
2092       for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
2093         Register SrcReg = MI.getOperand(I).getReg();
2094         const RegisterBank *SrcBank = getRegBank(SrcReg, MRI, *TRI);
2095 
2096         if (SrcBank != &AMDGPU::VCCRegBank) {
2097           MachineBasicBlock *SrcMBB = MI.getOperand(I + 1).getMBB();
2098           B.setInsertPt(*SrcMBB, SrcMBB->getFirstTerminator());
2099 
2100           auto Copy = B.buildCopy(LLT::scalar(1), SrcReg);
2101           MRI.setRegBank(Copy.getReg(0), AMDGPU::VCCRegBank);
2102           MI.getOperand(I).setReg(Copy.getReg(0));
2103         }
2104       }
2105 
2106       return;
2107     }
2108 
2109     // Phi handling is strange and only considers the bank of the destination.
2110     substituteSimpleCopyRegs(OpdMapper, 0);
2111 
2112     // Promote SGPR/VGPR booleans to s32
2113     MachineFunction *MF = MI.getParent()->getParent();
2114     ApplyRegBankMapping ApplyBank(*this, MRI, DstBank);
2115     MachineIRBuilder B(MI, ApplyBank);
2116     LegalizerHelper Helper(*MF, ApplyBank, B);
2117 
2118     if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized)
2119       llvm_unreachable("widen scalar should have succeeded");
2120 
2121     return;
2122   }
2123   case AMDGPU::G_ICMP:
2124   case AMDGPU::G_UADDO:
2125   case AMDGPU::G_USUBO:
2126   case AMDGPU::G_UADDE:
2127   case AMDGPU::G_SADDE:
2128   case AMDGPU::G_USUBE:
2129   case AMDGPU::G_SSUBE: {
2130     unsigned BoolDstOp = Opc == AMDGPU::G_ICMP ? 0 : 1;
2131     Register DstReg = MI.getOperand(BoolDstOp).getReg();
2132 
2133     const RegisterBank *DstBank =
2134       OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2135     if (DstBank != &AMDGPU::SGPRRegBank)
2136       break;
2137 
2138     const bool HasCarryIn = MI.getNumOperands() == 5;
2139 
2140     // If this is a scalar compare, promote the result to s32, as the selection
2141     // will end up using a copy to a 32-bit vreg.
2142     const LLT S32 = LLT::scalar(32);
2143     Register NewDstReg = MRI.createGenericVirtualRegister(S32);
2144     MRI.setRegBank(NewDstReg, AMDGPU::SGPRRegBank);
2145     MI.getOperand(BoolDstOp).setReg(NewDstReg);
2146     MachineIRBuilder B(MI);
2147 
2148     if (HasCarryIn) {
2149       Register NewSrcReg = MRI.createGenericVirtualRegister(S32);
2150       MRI.setRegBank(NewSrcReg, AMDGPU::SGPRRegBank);
2151       B.buildZExt(NewSrcReg, MI.getOperand(4).getReg());
2152       MI.getOperand(4).setReg(NewSrcReg);
2153     }
2154 
2155     MachineBasicBlock *MBB = MI.getParent();
2156     B.setInsertPt(*MBB, std::next(MI.getIterator()));
2157 
2158     // If we had a constrained VCC result register, a copy was inserted to VCC
2159     // from SGPR.
2160     SmallVector<Register, 1> DefRegs(OpdMapper.getVRegs(0));
2161     if (DefRegs.empty())
2162       DefRegs.push_back(DstReg);
2163     B.buildTrunc(DefRegs[0], NewDstReg);
2164     return;
2165   }
2166   case AMDGPU::G_SELECT: {
2167     Register DstReg = MI.getOperand(0).getReg();
2168     LLT DstTy = MRI.getType(DstReg);
2169 
2170     SmallVector<Register, 1> CondRegs(OpdMapper.getVRegs(1));
2171     if (CondRegs.empty())
2172       CondRegs.push_back(MI.getOperand(1).getReg());
2173     else {
2174       assert(CondRegs.size() == 1);
2175     }
2176 
2177     const RegisterBank *CondBank = getRegBank(CondRegs[0], MRI, *TRI);
2178     if (CondBank == &AMDGPU::SGPRRegBank) {
2179       MachineIRBuilder B(MI);
2180       const LLT S32 = LLT::scalar(32);
2181       Register NewCondReg = MRI.createGenericVirtualRegister(S32);
2182       MRI.setRegBank(NewCondReg, AMDGPU::SGPRRegBank);
2183 
2184       MI.getOperand(1).setReg(NewCondReg);
2185       B.buildZExt(NewCondReg, CondRegs[0]);
2186     }
2187 
2188     if (DstTy.getSizeInBits() != 64)
2189       break;
2190 
2191     MachineIRBuilder B(MI);
2192     LLT HalfTy = getHalfSizedType(DstTy);
2193 
2194     SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
2195     SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2));
2196     SmallVector<Register, 2> Src2Regs(OpdMapper.getVRegs(3));
2197 
2198     // All inputs are SGPRs, nothing special to do.
2199     if (DefRegs.empty()) {
2200       assert(Src1Regs.empty() && Src2Regs.empty());
2201       break;
2202     }
2203 
2204     if (Src1Regs.empty())
2205       split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg());
2206     else {
2207       setRegsToType(MRI, Src1Regs, HalfTy);
2208     }
2209 
2210     if (Src2Regs.empty())
2211       split64BitValueForMapping(B, Src2Regs, HalfTy, MI.getOperand(3).getReg());
2212     else
2213       setRegsToType(MRI, Src2Regs, HalfTy);
2214 
2215     setRegsToType(MRI, DefRegs, HalfTy);
2216 
2217     B.buildSelect(DefRegs[0], CondRegs[0], Src1Regs[0], Src2Regs[0]);
2218     B.buildSelect(DefRegs[1], CondRegs[0], Src1Regs[1], Src2Regs[1]);
2219 
2220     MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
2221     MI.eraseFromParent();
2222     return;
2223   }
2224   case AMDGPU::G_BRCOND: {
2225     Register CondReg = MI.getOperand(0).getReg();
2226     // FIXME: Should use legalizer helper, but should change bool ext type.
2227     const RegisterBank *CondBank =
2228       OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2229 
2230     if (CondBank == &AMDGPU::SGPRRegBank) {
2231       MachineIRBuilder B(MI);
2232       const LLT S32 = LLT::scalar(32);
2233       Register NewCondReg = MRI.createGenericVirtualRegister(S32);
2234       MRI.setRegBank(NewCondReg, AMDGPU::SGPRRegBank);
2235 
2236       MI.getOperand(0).setReg(NewCondReg);
2237       B.buildZExt(NewCondReg, CondReg);
2238       return;
2239     }
2240 
2241     break;
2242   }
2243   case AMDGPU::G_AND:
2244   case AMDGPU::G_OR:
2245   case AMDGPU::G_XOR: {
2246     // 64-bit and is only available on the SALU, so split into 2 32-bit ops if
2247     // there is a VGPR input.
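    // (For example, a 64-bit G_XOR with a VGPR input is rebuilt below as two
    // 32-bit G_XORs over the split halves of its operands.)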
2248     Register DstReg = MI.getOperand(0).getReg();
2249     LLT DstTy = MRI.getType(DstReg);
2250 
2251     if (DstTy.getSizeInBits() == 1) {
2252       const RegisterBank *DstBank =
2253         OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2254       if (DstBank == &AMDGPU::VCCRegBank)
2255         break;
2256 
2257       MachineFunction *MF = MI.getParent()->getParent();
2258       ApplyRegBankMapping ApplyBank(*this, MRI, DstBank);
2259       MachineIRBuilder B(MI, ApplyBank);
2260       LegalizerHelper Helper(*MF, ApplyBank, B);
2261 
2262       if (Helper.widenScalar(MI, 0, LLT::scalar(32)) !=
2263           LegalizerHelper::Legalized)
2264         llvm_unreachable("widen scalar should have succeeded");
2265       return;
2266     }
2267 
2268     if (DstTy.getSizeInBits() != 64)
2269       break;
2270 
2271     LLT HalfTy = getHalfSizedType(DstTy);
2272     SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
2273     SmallVector<Register, 2> Src0Regs(OpdMapper.getVRegs(1));
2274     SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2));
2275 
2276     // All inputs are SGPRs, nothing special to do.
2277     if (DefRegs.empty()) {
2278       assert(Src0Regs.empty() && Src1Regs.empty());
2279       break;
2280     }
2281 
2282     assert(DefRegs.size() == 2);
2283     assert(Src0Regs.size() == Src1Regs.size() &&
2284            (Src0Regs.empty() || Src0Regs.size() == 2));
2285 
2286     // Depending on where the source registers came from, the generic code may
2287     // have decided to split the inputs already or not. If not, we still need to
2288     // extract the values.
2289     MachineIRBuilder B(MI);
2290 
2291     if (Src0Regs.empty())
2292       split64BitValueForMapping(B, Src0Regs, HalfTy, MI.getOperand(1).getReg());
2293     else
2294       setRegsToType(MRI, Src0Regs, HalfTy);
2295 
2296     if (Src1Regs.empty())
2297       split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg());
2298     else
2299       setRegsToType(MRI, Src1Regs, HalfTy);
2300 
2301     setRegsToType(MRI, DefRegs, HalfTy);
2302 
2303     B.buildInstr(Opc, {DefRegs[0]}, {Src0Regs[0], Src1Regs[0]});
2304     B.buildInstr(Opc, {DefRegs[1]}, {Src0Regs[1], Src1Regs[1]});
2305 
2306     MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
2307     MI.eraseFromParent();
2308     return;
2309   }
2310   case AMDGPU::G_ADD:
2311   case AMDGPU::G_SUB:
2312   case AMDGPU::G_MUL:
2313   case AMDGPU::G_SHL:
2314   case AMDGPU::G_LSHR:
2315   case AMDGPU::G_ASHR:
2316   case AMDGPU::G_SMIN:
2317   case AMDGPU::G_SMAX:
2318   case AMDGPU::G_UMIN:
2319   case AMDGPU::G_UMAX: {
2320     Register DstReg = MI.getOperand(0).getReg();
2321     LLT DstTy = MRI.getType(DstReg);
2322 
2323     // 16-bit operations are VALU only, but can be promoted to 32-bit SALU.
2324     // Packed 16-bit operations need to be scalarized and promoted.
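    // (For example, an SGPR <2 x s16> add is unpacked below into two s32 adds
    // whose results are repacked with a build_vector_trunc.)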
2325     if (DstTy != LLT::scalar(16) && DstTy != LLT::vector(2, 16))
2326       break;
2327 
2328     const RegisterBank *DstBank =
2329       OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2330     if (DstBank == &AMDGPU::VGPRRegBank)
2331       break;
2332 
2333     const LLT S32 = LLT::scalar(32);
2334     MachineBasicBlock *MBB = MI.getParent();
2335     MachineFunction *MF = MBB->getParent();
2336     ApplyRegBankMapping ApplySALU(*this, MRI, &AMDGPU::SGPRRegBank);
2337     MachineIRBuilder B(MI, ApplySALU);
2338 
2339     if (DstTy.isVector()) {
2340       Register WideSrc0Lo, WideSrc0Hi;
2341       Register WideSrc1Lo, WideSrc1Hi;
2342 
2343       unsigned ExtendOp = getExtendOp(MI.getOpcode());
2344       std::tie(WideSrc0Lo, WideSrc0Hi)
2345         = unpackV2S16ToS32(B, MI.getOperand(1).getReg(), ExtendOp);
2346       std::tie(WideSrc1Lo, WideSrc1Hi)
2347         = unpackV2S16ToS32(B, MI.getOperand(2).getReg(), ExtendOp);
2348       auto Lo = B.buildInstr(MI.getOpcode(), {S32}, {WideSrc0Lo, WideSrc1Lo});
2349       auto Hi = B.buildInstr(MI.getOpcode(), {S32}, {WideSrc0Hi, WideSrc1Hi});
2350       B.buildBuildVectorTrunc(DstReg, {Lo.getReg(0), Hi.getReg(0)});
2351       MI.eraseFromParent();
2352     } else {
2353       LegalizerHelper Helper(*MF, ApplySALU, B);
2354 
2355       if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized)
2356         llvm_unreachable("widen scalar should have succeeded");
2357 
2358       // FIXME: s16 shift amounts should be legal.
2359       if (Opc == AMDGPU::G_SHL || Opc == AMDGPU::G_LSHR ||
2360           Opc == AMDGPU::G_ASHR) {
2361         B.setInsertPt(*MBB, MI.getIterator());
2362         if (Helper.widenScalar(MI, 1, S32) != LegalizerHelper::Legalized)
2363           llvm_unreachable("widen scalar should have succeeded");
2364       }
2365     }
2366 
2367     return;
2368   }
2369   case AMDGPU::G_SEXT_INREG: {
2370     SmallVector<Register, 2> SrcRegs(OpdMapper.getVRegs(1));
2371     if (SrcRegs.empty())
2372       break; // Nothing to repair
2373 
2374     const LLT S32 = LLT::scalar(32);
2375     MachineIRBuilder B(MI);
2376     ApplyRegBankMapping O(*this, MRI, &AMDGPU::VGPRRegBank);
2377     GISelObserverWrapper Observer(&O);
2378     B.setChangeObserver(Observer);
2379 
2380     // Don't use LegalizerHelper's narrowScalar. It produces unwanted G_SEXTs
2381     // we would need to further expand, and doesn't let us directly set the
2382     // result registers.
2383     SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0));
2384 
2385     int Amt = MI.getOperand(2).getImm();
2386     if (Amt <= 32) {
2387       if (Amt == 32) {
2388         // The low bits are unchanged.
2389         B.buildCopy(DstRegs[0], SrcRegs[0]);
2390       } else {
2391         // Extend in the low bits and propagate the sign bit to the high half.
2392         B.buildSExtInReg(DstRegs[0], SrcRegs[0], Amt);
2393       }
2394 
2395       B.buildAShr(DstRegs[1], DstRegs[0], B.buildConstant(S32, 31));
2396     } else {
2397       // The low bits are unchanged, and extend in the high bits.
2398       B.buildCopy(DstRegs[0], SrcRegs[0]);
2399       B.buildSExtInReg(DstRegs[1], DstRegs[0], Amt - 32);
2400     }
2401 
2402     Register DstReg = MI.getOperand(0).getReg();
2403     MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
2404     MI.eraseFromParent();
2405     return;
2406   }
2407   case AMDGPU::G_CTPOP:
2408   case AMDGPU::G_BITREVERSE:
2409   case AMDGPU::G_CTLZ_ZERO_UNDEF:
2410   case AMDGPU::G_CTTZ_ZERO_UNDEF: {
2411     const RegisterBank *DstBank =
2412       OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2413     if (DstBank == &AMDGPU::SGPRRegBank)
2414       break;
2415 
2416     Register SrcReg = MI.getOperand(1).getReg();
2417     const LLT S32 = LLT::scalar(32);
2418     LLT Ty = MRI.getType(SrcReg);
2419     if (Ty == S32)
2420       break;
2421 
2422     ApplyRegBankMapping ApplyVALU(*this, MRI, &AMDGPU::VGPRRegBank);
2423     MachineIRBuilder B(MI, ApplyVALU);
2424 
2425     MachineFunction &MF = B.getMF();
2426     LegalizerHelper Helper(MF, ApplyVALU, B);
2427 
2428     if (Helper.narrowScalar(MI, 1, S32) != LegalizerHelper::Legalized)
2429       llvm_unreachable("narrowScalar should have succeeded");
2430     return;
2431   }
2432   case AMDGPU::G_SEXT:
2433   case AMDGPU::G_ZEXT:
2434   case AMDGPU::G_ANYEXT: {
2435     Register SrcReg = MI.getOperand(1).getReg();
2436     LLT SrcTy = MRI.getType(SrcReg);
2437     const bool Signed = Opc == AMDGPU::G_SEXT;
2438 
2439     assert(empty(OpdMapper.getVRegs(1)));
2440 
2441     MachineIRBuilder B(MI);
2442     const RegisterBank *SrcBank =
2443       OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
2444 
2445     Register DstReg = MI.getOperand(0).getReg();
2446     LLT DstTy = MRI.getType(DstReg);
2447     if (DstTy.isScalar() &&
2448         SrcBank != &AMDGPU::SGPRRegBank &&
2449         SrcBank != &AMDGPU::VCCRegBank &&
2450         // FIXME: Should handle any type that rounds to s64 when irregular
2451         // breakdowns are supported.
2452         DstTy.getSizeInBits() == 64 &&
2453         SrcTy.getSizeInBits() <= 32) {
2454       SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
2455 
2456       // Extend to 32-bit, then extend the low half into the high half.
2457       if (Signed) {
2458         // TODO: Should really be buildSExtOrCopy
2459         B.buildSExtOrTrunc(DefRegs[0], SrcReg);
2460       } else if (Opc == AMDGPU::G_ZEXT) {
2461         B.buildZExtOrTrunc(DefRegs[0], SrcReg);
2462       } else {
2463         B.buildAnyExtOrTrunc(DefRegs[0], SrcReg);
2464       }
2465 
2466       extendLow32IntoHigh32(B, DefRegs[1], DefRegs[0], Opc, *SrcBank);
2467       MRI.setRegBank(DstReg, *SrcBank);
2468       MI.eraseFromParent();
2469       return;
2470     }
2471 
2472     if (SrcTy != LLT::scalar(1))
2473       return;
2474 
2475     // It is not legal to have a legalization artifact with a VCC source. Rather
2476     // than introducing a copy, directly insert the select that such a copy would
2477     // have been selected to.
2478     if (SrcBank == &AMDGPU::VCCRegBank) {
2479       SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
2480 
2481       const RegisterBank *DstBank = &AMDGPU::VGPRRegBank;
2482 
2483       unsigned DstSize = DstTy.getSizeInBits();
2484       // 64-bit select is SGPR only
2485       const bool UseSel64 = DstSize > 32 &&
2486         SrcBank->getID() == AMDGPU::SGPRRegBankID;
2487 
2488       // TODO: Should s16 select be legal?
2489       LLT SelType = UseSel64 ? LLT::scalar(64) : LLT::scalar(32);
2490       auto True = B.buildConstant(SelType, Signed ? -1 : 1);
2491       auto False = B.buildConstant(SelType, 0);
2492 
2493       MRI.setRegBank(True.getReg(0), *DstBank);
2494       MRI.setRegBank(False.getReg(0), *DstBank);
2495       MRI.setRegBank(DstReg, *DstBank);
2496 
2497       if (DstSize > 32) {
2498         B.buildSelect(DefRegs[0], SrcReg, True, False);
2499         extendLow32IntoHigh32(B, DefRegs[1], DefRegs[0], Opc, *SrcBank, true);
2500       } else if (DstSize < 32) {
2501         auto Sel = B.buildSelect(SelType, SrcReg, True, False);
2502         MRI.setRegBank(Sel.getReg(0), *DstBank);
2503         B.buildTrunc(DstReg, Sel);
2504       } else {
2505         B.buildSelect(DstReg, SrcReg, True, False);
2506       }
2507 
2508       MI.eraseFromParent();
2509       return;
2510     }
2511 
2512     break;
2513   }
2514   case AMDGPU::G_BUILD_VECTOR:
2515   case AMDGPU::G_BUILD_VECTOR_TRUNC: {
2516     Register DstReg = MI.getOperand(0).getReg();
2517     LLT DstTy = MRI.getType(DstReg);
2518     if (DstTy != LLT::vector(2, 16))
2519       break;
2520 
2521     assert(MI.getNumOperands() == 3 && OpdMapper.getVRegs(0).empty());
2522     substituteSimpleCopyRegs(OpdMapper, 1);
2523     substituteSimpleCopyRegs(OpdMapper, 2);
2524 
2525     const RegisterBank *DstBank =
2526       OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2527     if (DstBank == &AMDGPU::SGPRRegBank)
2528       break; // Can use S_PACK_* instructions.
2529 
2530     MachineIRBuilder B(MI);
2531 
2532     Register Lo = MI.getOperand(1).getReg();
2533     Register Hi = MI.getOperand(2).getReg();
2534     const LLT S32 = LLT::scalar(32);
2535 
2536     const RegisterBank *BankLo =
2537       OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
2538     const RegisterBank *BankHi =
2539       OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
2540 
2541     Register ZextLo;
2542     Register ShiftHi;
2543 
2544     if (Opc == AMDGPU::G_BUILD_VECTOR) {
2545       ZextLo = B.buildZExt(S32, Lo).getReg(0);
2546       MRI.setRegBank(ZextLo, *BankLo);
2547 
2548       Register ZextHi = B.buildZExt(S32, Hi).getReg(0);
2549       MRI.setRegBank(ZextHi, *BankHi);
2550 
2551       auto ShiftAmt = B.buildConstant(S32, 16);
2552       MRI.setRegBank(ShiftAmt.getReg(0), *BankHi);
2553 
2554       ShiftHi = B.buildShl(S32, ZextHi, ShiftAmt).getReg(0);
2555       MRI.setRegBank(ShiftHi, *BankHi);
2556     } else {
2557       Register MaskLo = B.buildConstant(S32, 0xffff).getReg(0);
2558       MRI.setRegBank(MaskLo, *BankLo);
2559 
2560       auto ShiftAmt = B.buildConstant(S32, 16);
2561       MRI.setRegBank(ShiftAmt.getReg(0), *BankHi);
2562 
2563       ShiftHi = B.buildShl(S32, Hi, ShiftAmt).getReg(0);
2564       MRI.setRegBank(ShiftHi, *BankHi);
2565 
2566       ZextLo = B.buildAnd(S32, Lo, MaskLo).getReg(0);
2567       MRI.setRegBank(ZextLo, *BankLo);
2568     }
2569 
2570     auto Or = B.buildOr(S32, ZextLo, ShiftHi);
2571     MRI.setRegBank(Or.getReg(0), *DstBank);
2572 
2573     B.buildBitcast(DstReg, Or);
2574     MI.eraseFromParent();
2575     return;
2576   }
2577   case AMDGPU::G_EXTRACT_VECTOR_ELT: {
2578     SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0));
2579 
2580     assert(OpdMapper.getVRegs(1).empty() && OpdMapper.getVRegs(2).empty());
2581 
2582     Register DstReg = MI.getOperand(0).getReg();
2583     Register SrcReg = MI.getOperand(1).getReg();
2584 
2585     const LLT S32 = LLT::scalar(32);
2586     LLT DstTy = MRI.getType(DstReg);
2587     LLT SrcTy = MRI.getType(SrcReg);
2588 
2589     if (foldExtractEltToCmpSelect(MI, MRI, OpdMapper))
2590       return;
2591 
2592     MachineIRBuilder B(MI);
2593 
2594     const ValueMapping &DstMapping
2595       = OpdMapper.getInstrMapping().getOperandMapping(0);
2596     const RegisterBank *DstBank = DstMapping.BreakDown[0].RegBank;
2597     const RegisterBank *SrcBank =
2598       OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
2599     const RegisterBank *IdxBank =
2600         OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
2601 
2602     Register BaseIdxReg;
2603     unsigned ConstOffset;
2604     std::tie(BaseIdxReg, ConstOffset) =
2605         AMDGPU::getBaseWithConstantOffset(MRI, MI.getOperand(2).getReg());
2606 
2607     // See if the index is an add of a constant, which can be folded by moving
2608     // just the base register of the index into the waterfall loop (if one ends
2609     // up being needed). This essentially reassociates the constant add with the
2610     // readfirstlane.
2611     bool ShouldMoveIndexIntoLoop = IdxBank != &AMDGPU::SGPRRegBank &&
2612                                    ConstOffset > 0 &&
2613                                    ConstOffset < SrcTy.getNumElements();
2614 
2615     // Move the base register. We'll re-insert the add later.
2616     if (ShouldMoveIndexIntoLoop)
2617       MI.getOperand(2).setReg(BaseIdxReg);
2618 
2619     // If this is a VGPR result only because the index was a VGPR result, the
2620     // actual indexing will be done on the SGPR source vector, which will
2621     // produce a scalar result. We need to copy to the VGPR result inside the
2622     // waterfall loop.
2623     const bool NeedCopyToVGPR = DstBank == &AMDGPU::VGPRRegBank &&
2624                                 SrcBank == &AMDGPU::SGPRRegBank;
2625     if (DstRegs.empty()) {
2626       applyDefaultMapping(OpdMapper);
2627 
2628       executeInWaterfallLoop(MI, MRI, { 2 });
2629 
2630       if (NeedCopyToVGPR) {
2631         // We don't want a phi for this temporary reg.
2632         Register TmpReg = MRI.createGenericVirtualRegister(DstTy);
2633         MRI.setRegBank(TmpReg, AMDGPU::SGPRRegBank);
2634         MI.getOperand(0).setReg(TmpReg);
2635         B.setInsertPt(*MI.getParent(), ++MI.getIterator());
2636 
2637         // Use a v_mov_b32 here to make the exec dependency explicit.
2638         buildVCopy(B, DstReg, TmpReg);
2639       }
2640 
2641       // Re-insert the constant offset add inside the waterfall loop.
2642       if (ShouldMoveIndexIntoLoop)
2643         reinsertVectorIndexAdd(B, MI, 2, ConstOffset);
2644 
2645       return;
2646     }
2647 
2648     assert(DstTy.getSizeInBits() == 64);
2649 
2650     LLT Vec32 = LLT::vector(2 * SrcTy.getNumElements(), 32);
2651 
2652     auto CastSrc = B.buildBitcast(Vec32, SrcReg);
2653     auto One = B.buildConstant(S32, 1);
2654 
2655     MachineBasicBlock::iterator MII = MI.getIterator();
2656 
2657     // Split the vector index into 32-bit pieces. Prepare to move all of the
2658     // new instructions into a waterfall loop if necessary.
2659     //
2660     // Don't put the bitcast or constant in the loop.
2661     MachineInstrSpan Span(MII, &B.getMBB());
2662 
2663     // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1).
2664     auto IdxLo = B.buildShl(S32, BaseIdxReg, One);
2665     auto IdxHi = B.buildAdd(S32, IdxLo, One);
2666 
2667     auto Extract0 = B.buildExtractVectorElement(DstRegs[0], CastSrc, IdxLo);
2668     auto Extract1 = B.buildExtractVectorElement(DstRegs[1], CastSrc, IdxHi);
2669 
2670     MRI.setRegBank(DstReg, *DstBank);
2671     MRI.setRegBank(CastSrc.getReg(0), *SrcBank);
2672     MRI.setRegBank(One.getReg(0), AMDGPU::SGPRRegBank);
2673     MRI.setRegBank(IdxLo.getReg(0), AMDGPU::SGPRRegBank);
2674     MRI.setRegBank(IdxHi.getReg(0), AMDGPU::SGPRRegBank);
2675 
2676     SmallSet<Register, 4> OpsToWaterfall;
2677     if (!collectWaterfallOperands(OpsToWaterfall, MI, MRI, { 2 })) {
2678       MI.eraseFromParent();
2679       return;
2680     }
2681 
2682     // Remove the original instruction to avoid potentially confusing the
2683     // waterfall loop logic.
2684     B.setInstr(*Span.begin());
2685     MI.eraseFromParent();
2686     executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()),
2687                            OpsToWaterfall, MRI);
2688 
2689     if (NeedCopyToVGPR) {
2690       MachineBasicBlock *LoopBB = Extract1->getParent();
2691       Register TmpReg0 = MRI.createGenericVirtualRegister(S32);
2692       Register TmpReg1 = MRI.createGenericVirtualRegister(S32);
2693       MRI.setRegBank(TmpReg0, AMDGPU::SGPRRegBank);
2694       MRI.setRegBank(TmpReg1, AMDGPU::SGPRRegBank);
2695 
2696       Extract0->getOperand(0).setReg(TmpReg0);
2697       Extract1->getOperand(0).setReg(TmpReg1);
2698 
2699       B.setInsertPt(*LoopBB, ++Extract1->getIterator());
2700 
2701       buildVCopy(B, DstRegs[0], TmpReg0);
2702       buildVCopy(B, DstRegs[1], TmpReg1);
2703     }
2704 
2705     if (ShouldMoveIndexIntoLoop)
2706       reinsertVectorIndexAdd(B, *IdxLo, 1, ConstOffset);
2707 
2708     return;
2709   }
2710   case AMDGPU::G_INSERT_VECTOR_ELT: {
2711     SmallVector<Register, 2> InsRegs(OpdMapper.getVRegs(2));
2712 
2713     Register DstReg = MI.getOperand(0).getReg();
2714     LLT VecTy = MRI.getType(DstReg);
2715 
2716     assert(OpdMapper.getVRegs(0).empty());
2717     assert(OpdMapper.getVRegs(3).empty());
2718 
2719     if (substituteSimpleCopyRegs(OpdMapper, 1))
2720       MRI.setType(MI.getOperand(1).getReg(), VecTy);
2721 
2722     if (foldInsertEltToCmpSelect(MI, MRI, OpdMapper))
2723       return;
2724 
2725     const RegisterBank *IdxBank =
2726       OpdMapper.getInstrMapping().getOperandMapping(3).BreakDown[0].RegBank;
2727 
2728     Register SrcReg = MI.getOperand(1).getReg();
2729     Register InsReg = MI.getOperand(2).getReg();
2730     LLT InsTy = MRI.getType(InsReg);
2731     (void)InsTy;
2732 
2733     Register BaseIdxReg;
2734     unsigned ConstOffset;
2735     std::tie(BaseIdxReg, ConstOffset) =
2736         AMDGPU::getBaseWithConstantOffset(MRI, MI.getOperand(3).getReg());
2737 
2738     // See if the index is an add of a constant which will be foldable by moving
2739     // the base register of the index later if this is going to be executed in a
2740     // waterfall loop. This is essentially to reassociate the add of a constant
2741     // with the readfirstlane.
2742     bool ShouldMoveIndexIntoLoop = IdxBank != &AMDGPU::SGPRRegBank &&
2743       ConstOffset > 0 &&
2744       ConstOffset < VecTy.getNumElements();
2745 
2746     // Move the base register. We'll re-insert the add later.
2747     if (ShouldMoveIndexIntoLoop)
2748       MI.getOperand(3).setReg(BaseIdxReg);
2749 
2750 
2751     if (InsRegs.empty()) {
2752       executeInWaterfallLoop(MI, MRI, { 3 });
2753 
2754       // Re-insert the constant offset add inside the waterfall loop.
2755       if (ShouldMoveIndexIntoLoop) {
2756         MachineIRBuilder B(MI);
2757         reinsertVectorIndexAdd(B, MI, 3, ConstOffset);
2758       }
2759 
2760       return;
2761     }
2762 
2763 
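    // The 64-bit inserted value was split into two 32-bit halves (InsRegs).
    // Bitcast the vector to 32-bit elements and perform two 32-bit inserts at
    // the adjusted indices.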
2764     assert(InsTy.getSizeInBits() == 64);
2765 
2766     const LLT S32 = LLT::scalar(32);
2767     LLT Vec32 = LLT::vector(2 * VecTy.getNumElements(), 32);
2768 
2769     MachineIRBuilder B(MI);
2770     auto CastSrc = B.buildBitcast(Vec32, SrcReg);
2771     auto One = B.buildConstant(S32, 1);
2772 
2773     // Split the vector index into 32-bit pieces. Prepare to move all of the
2774     // new instructions into a waterfall loop if necessary.
2775     //
2776     // Don't put the bitcast or constant in the loop.
2777     MachineInstrSpan Span(MachineBasicBlock::iterator(&MI), &B.getMBB());
2778 
2779     // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1).
2780     auto IdxLo = B.buildShl(S32, BaseIdxReg, One);
2781     auto IdxHi = B.buildAdd(S32, IdxLo, One);
2782 
2783     auto InsLo = B.buildInsertVectorElement(Vec32, CastSrc, InsRegs[0], IdxLo);
2784     auto InsHi = B.buildInsertVectorElement(Vec32, InsLo, InsRegs[1], IdxHi);
2785 
2786     const RegisterBank *DstBank =
2787       OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2788     const RegisterBank *SrcBank =
2789       OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
2790     const RegisterBank *InsSrcBank =
2791       OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
2792 
2793     MRI.setRegBank(InsReg, *InsSrcBank);
2794     MRI.setRegBank(CastSrc.getReg(0), *SrcBank);
2795     MRI.setRegBank(InsLo.getReg(0), *DstBank);
2796     MRI.setRegBank(InsHi.getReg(0), *DstBank);
2797     MRI.setRegBank(One.getReg(0), AMDGPU::SGPRRegBank);
2798     MRI.setRegBank(IdxLo.getReg(0), AMDGPU::SGPRRegBank);
2799     MRI.setRegBank(IdxHi.getReg(0), AMDGPU::SGPRRegBank);
2800 
2801 
2802     SmallSet<Register, 4> OpsToWaterfall;
2803     if (!collectWaterfallOperands(OpsToWaterfall, MI, MRI, { 3 })) {
2804       B.setInsertPt(B.getMBB(), MI);
2805       B.buildBitcast(DstReg, InsHi);
2806       MI.eraseFromParent();
2807       return;
2808     }
2809 
2810     B.setInstr(*Span.begin());
2811     MI.eraseFromParent();
2812 
2813     // Figure out the point after the waterfall loop before mangling the control
2814     // flow.
2815     executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()),
2816                            OpsToWaterfall, MRI);
2817 
2818     // The insertion point is now right after the original instruction.
2819     //
2820     // Keep the bitcast to the original vector type out of the loop. Doing this
2821     // saves an extra phi we don't need inside the loop.
2822     B.buildBitcast(DstReg, InsHi);
2823 
2824     // Re-insert the constant offset add inside the waterfall loop.
2825     if (ShouldMoveIndexIntoLoop)
2826       reinsertVectorIndexAdd(B, *IdxLo, 1, ConstOffset);
2827 
2828     return;
2829   }
2830   case AMDGPU::G_AMDGPU_BUFFER_LOAD:
2831   case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
2832   case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT:
2833   case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
2834   case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE:
2835   case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT:
2836   case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16:
2837   case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT:
2838   case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16:
2839   case AMDGPU::G_AMDGPU_BUFFER_STORE:
2840   case AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE:
2841   case AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT:
2842   case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT:
2843   case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16:
2844   case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT:
2845   case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16: {
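    // Operand 1 is the resource descriptor and operand 4 is the scalar
    // offset; both must end up uniform, so waterfall over whichever of them
    // was assigned a VGPR.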
2846     applyDefaultMapping(OpdMapper);
2847     executeInWaterfallLoop(MI, MRI, {1, 4});
2848     return;
2849   }
2850   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP:
2851   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD:
2852   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB:
2853   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN:
2854   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN:
2855   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX:
2856   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX:
2857   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND:
2858   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR:
2859   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR:
2860   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC:
2861   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC: {
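    // The vdata input shifts the operands here: 2 is the resource descriptor
    // and 5 is the scalar offset, both of which must be uniform.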
2862     applyDefaultMapping(OpdMapper);
2863     executeInWaterfallLoop(MI, MRI, {2, 5});
2864     return;
2865   }
2866   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD:
2867   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN:
2868   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX: {
2869     applyDefaultMapping(OpdMapper);
2870     executeInWaterfallLoop(MI, MRI, {2, 5});
2871     return;
2872   }
2873   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP: {
2874     applyDefaultMapping(OpdMapper);
2875     executeInWaterfallLoop(MI, MRI, {3, 6});
2876     return;
2877   }
2878   case AMDGPU::G_AMDGPU_S_BUFFER_LOAD: {
2879     applyMappingSBufferLoad(OpdMapper);
2880     return;
2881   }
2882   case AMDGPU::G_INTRINSIC: {
2883     switch (MI.getIntrinsicID()) {
2884     case Intrinsic::amdgcn_readlane: {
2885       substituteSimpleCopyRegs(OpdMapper, 2);
2886 
2887       assert(OpdMapper.getVRegs(0).empty());
2888       assert(OpdMapper.getVRegs(3).empty());
2889 
2890       // Make sure the index is an SGPR. It doesn't make sense to run this in a
2891       // waterfall loop, so assume it's a uniform value.
2892       constrainOpWithReadfirstlane(MI, MRI, 3); // Index
2893       return;
2894     }
2895     case Intrinsic::amdgcn_writelane: {
2896       assert(OpdMapper.getVRegs(0).empty());
2897       assert(OpdMapper.getVRegs(2).empty());
2898       assert(OpdMapper.getVRegs(3).empty());
2899 
2900       substituteSimpleCopyRegs(OpdMapper, 4); // VGPR input val
2901       constrainOpWithReadfirstlane(MI, MRI, 2); // Source value
2902       constrainOpWithReadfirstlane(MI, MRI, 3); // Index
2903       return;
2904     }
2905     case Intrinsic::amdgcn_interp_p1:
2906     case Intrinsic::amdgcn_interp_p2:
2907     case Intrinsic::amdgcn_interp_mov:
2908     case Intrinsic::amdgcn_interp_p1_f16:
2909     case Intrinsic::amdgcn_interp_p2_f16: {
2910       applyDefaultMapping(OpdMapper);
2911 
2912       // Readlane for m0 value, which is always the last operand.
2913       // FIXME: Should this be a waterfall loop instead?
2914       constrainOpWithReadfirstlane(MI, MRI, MI.getNumOperands() - 1); // Index
2915       return;
2916     }
2917     case Intrinsic::amdgcn_permlane16:
2918     case Intrinsic::amdgcn_permlanex16: {
2919       // Doing a waterfall loop over these wouldn't make any sense.
2920       substituteSimpleCopyRegs(OpdMapper, 2);
2921       substituteSimpleCopyRegs(OpdMapper, 3);
2922       constrainOpWithReadfirstlane(MI, MRI, 4);
2923       constrainOpWithReadfirstlane(MI, MRI, 5);
2924       return;
2925     }
2926     case Intrinsic::amdgcn_sbfe:
2927       applyMappingBFEIntrinsic(OpdMapper, true);
2928       return;
2929     case Intrinsic::amdgcn_ubfe:
2930       applyMappingBFEIntrinsic(OpdMapper, false);
2931       return;
2932     case Intrinsic::amdgcn_ballot:
2933       // Use default handling and insert copy to vcc source.
2934       break;
2935     }
2936     break;
2937   }
2938   case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
2939   case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE: {
2940     const AMDGPU::RsrcIntrinsic *RSrcIntrin
2941       = AMDGPU::lookupRsrcIntrinsic(MI.getIntrinsicID());
2942     assert(RSrcIntrin && RSrcIntrin->IsImage);
2943     // Non-images can have complications from operands that allow both SGPR
2944     // and VGPR. For now it's too complicated to figure out the final opcode
2945     // to derive the register bank from the MCInstrDesc.
2946     applyMappingImage(MI, OpdMapper, MRI, RSrcIntrin->RsrcArg);
2947     return;
2948   }
2949   case AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY: {
2950     unsigned N = MI.getNumExplicitOperands() - 2;
2951     executeInWaterfallLoop(MI, MRI, { N });
2952     return;
2953   }
2954   case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: {
2955     auto IntrID = MI.getIntrinsicID();
2956     switch (IntrID) {
2957     case Intrinsic::amdgcn_ds_ordered_add:
2958     case Intrinsic::amdgcn_ds_ordered_swap: {
2959       // This is only allowed to execute with 1 lane, so readfirstlane is safe.
2960       assert(OpdMapper.getVRegs(0).empty());
2961       substituteSimpleCopyRegs(OpdMapper, 3);
2962       constrainOpWithReadfirstlane(MI, MRI, 2); // M0
2963       return;
2964     }
2965     case Intrinsic::amdgcn_ds_gws_init:
2966     case Intrinsic::amdgcn_ds_gws_barrier:
2967     case Intrinsic::amdgcn_ds_gws_sema_br: {
2968       // Only the first lane executes, so readfirstlane is safe.
2969       substituteSimpleCopyRegs(OpdMapper, 1);
2970       constrainOpWithReadfirstlane(MI, MRI, 2); // M0
2971       return;
2972     }
2973     case Intrinsic::amdgcn_ds_gws_sema_v:
2974     case Intrinsic::amdgcn_ds_gws_sema_p:
2975     case Intrinsic::amdgcn_ds_gws_sema_release_all: {
2976       // Only the first lane executes, so readfirstlane is safe.
2977       constrainOpWithReadfirstlane(MI, MRI, 1); // M0
2978       return;
2979     }
2980     case Intrinsic::amdgcn_ds_append:
2981     case Intrinsic::amdgcn_ds_consume: {
2982       constrainOpWithReadfirstlane(MI, MRI, 2); // M0
2983       return;
2984     }
2985     case Intrinsic::amdgcn_s_sendmsg:
2986     case Intrinsic::amdgcn_s_sendmsghalt: {
2987       // FIXME: Should this use a waterfall loop?
2988       constrainOpWithReadfirstlane(MI, MRI, 2); // M0
2989       return;
2990     }
2991     case Intrinsic::amdgcn_s_setreg: {
2992       constrainOpWithReadfirstlane(MI, MRI, 2);
2993       return;
2994     }
2995     default: {
2996       if (const AMDGPU::RsrcIntrinsic *RSrcIntrin =
2997               AMDGPU::lookupRsrcIntrinsic(IntrID)) {
2998         // Non-images can have complications from operands that allow both SGPR
2999         // and VGPR. For now it's too complicated to figure out the final opcode
3000         // to derive the register bank from the MCInstrDesc.
3001         if (RSrcIntrin->IsImage) {
3002           applyMappingImage(MI, OpdMapper, MRI, RSrcIntrin->RsrcArg);
3003           return;
3004         }
3005       }
3006 
3007       break;
3008     }
3009     }
3010     break;
3011   }
3012   case AMDGPU::G_LOAD:
3013   case AMDGPU::G_ZEXTLOAD:
3014   case AMDGPU::G_SEXTLOAD: {
3015     if (applyMappingLoad(MI, OpdMapper, MRI))
3016       return;
3017     break;
3018   }
3019   case AMDGPU::G_DYN_STACKALLOC:
3020     applyMappingDynStackAlloc(MI, OpdMapper, MRI);
3021     return;
3022   default:
3023     break;
3024   }
3025 
3026   return applyDefaultMapping(OpdMapper);
3027 }
3028 
3029 // vgpr, sgpr -> vgpr
3030 // vgpr, agpr -> vgpr
3031 // agpr, agpr -> agpr
3032 // agpr, sgpr -> vgpr
3033 static unsigned regBankUnion(unsigned RB0, unsigned RB1) {
3034   if (RB0 == AMDGPU::InvalidRegBankID)
3035     return RB1;
3036   if (RB1 == AMDGPU::InvalidRegBankID)
3037     return RB0;
3038 
3039   if (RB0 == AMDGPU::SGPRRegBankID && RB1 == AMDGPU::SGPRRegBankID)
3040     return AMDGPU::SGPRRegBankID;
3041 
3042   if (RB0 == AMDGPU::AGPRRegBankID && RB1 == AMDGPU::AGPRRegBankID)
3043     return AMDGPU::AGPRRegBankID;
3044 
3045   return AMDGPU::VGPRRegBankID;
3046 }
3047 
3048 static unsigned regBankBoolUnion(unsigned RB0, unsigned RB1) {
3049   if (RB0 == AMDGPU::InvalidRegBankID)
3050     return RB1;
3051   if (RB1 == AMDGPU::InvalidRegBankID)
3052     return RB0;
3053 
3054   // vcc, vcc -> vcc
3055   // vcc, sgpr -> vcc
3056   // vcc, vgpr -> vcc
3057   if (RB0 == AMDGPU::VCCRegBankID || RB1 == AMDGPU::VCCRegBankID)
3058     return AMDGPU::VCCRegBankID;
3059 
3060   // Any other combination reduces to the plain register bank union.
3061   return regBankUnion(RB0, RB1);
3062 }
3063 
3064 unsigned AMDGPURegisterBankInfo::getMappingType(const MachineRegisterInfo &MRI,
3065                                                 const MachineInstr &MI) const {
3066   unsigned RegBank = AMDGPU::InvalidRegBankID;
3067 
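  // Union the banks of every register operand. Once a VGPR is seen the result
  // cannot change, so stop scanning early.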
3068   for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
3069     if (!MI.getOperand(i).isReg())
3070       continue;
3071     Register Reg = MI.getOperand(i).getReg();
3072     if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) {
3073       RegBank = regBankUnion(RegBank, Bank->getID());
3074       if (RegBank == AMDGPU::VGPRRegBankID)
3075         break;
3076     }
3077   }
3078 
3079   return RegBank;
3080 }
3081 
3082 bool AMDGPURegisterBankInfo::isSALUMapping(const MachineInstr &MI) const {
3083   const MachineFunction &MF = *MI.getParent()->getParent();
3084   const MachineRegisterInfo &MRI = MF.getRegInfo();
3085   for (unsigned i = 0, e = MI.getNumOperands();i != e; ++i) {
3086     if (!MI.getOperand(i).isReg())
3087       continue;
3088     Register Reg = MI.getOperand(i).getReg();
3089     if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) {
3090       if (Bank->getID() != AMDGPU::SGPRRegBankID)
3091         return false;
3092     }
3093   }
3094   return true;
3095 }
3096 
3097 const RegisterBankInfo::InstructionMapping &
3098 AMDGPURegisterBankInfo::getDefaultMappingSOP(const MachineInstr &MI) const {
3099   const MachineFunction &MF = *MI.getParent()->getParent();
3100   const MachineRegisterInfo &MRI = MF.getRegInfo();
3101   SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
3102 
3103   for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
3104     const MachineOperand &SrcOp = MI.getOperand(i);
3105     if (!SrcOp.isReg())
3106       continue;
3107 
3108     unsigned Size = getSizeInBits(SrcOp.getReg(), MRI, *TRI);
3109     OpdsMapping[i] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
3110   }
3111   return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
3112                                MI.getNumOperands());
3113 }
3114 
3115 const RegisterBankInfo::InstructionMapping &
3116 AMDGPURegisterBankInfo::getDefaultMappingVOP(const MachineInstr &MI) const {
3117   const MachineFunction &MF = *MI.getParent()->getParent();
3118   const MachineRegisterInfo &MRI = MF.getRegInfo();
3119   SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
3120 
3121   // Even though we technically could use SGPRs, this would require knowledge of
3122   // the constant bus restriction. Force all sources to VGPR (except for VCC).
3123   //
3124   // TODO: Unary ops are trivially OK, so accept SGPRs?
3125   for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
3126     const MachineOperand &Src = MI.getOperand(i);
3127     if (!Src.isReg())
3128       continue;
3129 
3130     unsigned Size = getSizeInBits(Src.getReg(), MRI, *TRI);
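    // s1 values are vector booleans and belong in VCC; everything else is
    // forced to VGPR.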
3131     unsigned BankID = Size == 1 ? AMDGPU::VCCRegBankID : AMDGPU::VGPRRegBankID;
3132     OpdsMapping[i] = AMDGPU::getValueMapping(BankID, Size);
3133   }
3134 
3135   return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
3136                                MI.getNumOperands());
3137 }
3138 
3139 const RegisterBankInfo::InstructionMapping &
3140 AMDGPURegisterBankInfo::getDefaultMappingAllVGPR(const MachineInstr &MI) const {
3141   const MachineFunction &MF = *MI.getParent()->getParent();
3142   const MachineRegisterInfo &MRI = MF.getRegInfo();
3143   SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
3144 
3145   for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
3146     const MachineOperand &Op = MI.getOperand(I);
3147     if (!Op.isReg())
3148       continue;
3149 
3150     unsigned Size = getSizeInBits(Op.getReg(), MRI, *TRI);
3151     OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3152   }
3153 
3154   return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
3155                                MI.getNumOperands());
3156 }
3157 
3158 const RegisterBankInfo::InstructionMapping &
3159 AMDGPURegisterBankInfo::getImageMapping(const MachineRegisterInfo &MRI,
3160                                         const MachineInstr &MI,
3161                                         int RsrcIdx) const {
3162   // The reported argument index is relative to the IR intrinsic call arguments,
3163   // so we need to shift by the number of defs and the intrinsic ID.
3164   RsrcIdx += MI.getNumExplicitDefs() + 1;
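  // For example, with a single def the IR's argument 0 corresponds to machine
  // operand 2.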
3165 
3166   const int NumOps = MI.getNumOperands();
3167   SmallVector<const ValueMapping *, 8> OpdsMapping(NumOps);
3168 
3169   // TODO: Should packed/unpacked D16 difference be reported here as part of
3170   // the value mapping?
3171   for (int I = 0; I != NumOps; ++I) {
3172     if (!MI.getOperand(I).isReg())
3173       continue;
3174 
3175     Register OpReg = MI.getOperand(I).getReg();
3176     // We replace some dead address operands with $noreg.
3177     if (!OpReg)
3178       continue;
3179 
3180     unsigned Size = getSizeInBits(OpReg, MRI, *TRI);
3181 
3182     // FIXME: Probably need a new intrinsic register bank searchable table to
3183     // handle arbitrary intrinsics easily.
3184     //
3185     // If this has a sampler, it immediately follows rsrc.
3186     const bool MustBeSGPR = I == RsrcIdx || I == RsrcIdx + 1;
3187 
3188     if (MustBeSGPR) {
3189       // This must be an SGPR, so we must report whatever it is as legal.
3190       unsigned NewBank = getRegBankID(OpReg, MRI, AMDGPU::SGPRRegBankID);
3191       OpdsMapping[I] = AMDGPU::getValueMapping(NewBank, Size);
3192     } else {
3193       // Some operands must be VGPR, and these are easy to copy to.
3194       OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3195     }
3196   }
3197 
3198   return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping), NumOps);
3199 }
3200 
3201 /// Return the mapping for a pointer argument.
3202 const RegisterBankInfo::ValueMapping *
3203 AMDGPURegisterBankInfo::getValueMappingForPtr(const MachineRegisterInfo &MRI,
3204                                               Register PtrReg) const {
3205   LLT PtrTy = MRI.getType(PtrReg);
3206   unsigned Size = PtrTy.getSizeInBits();
3207   if (Subtarget.useFlatForGlobal() ||
3208       !AMDGPU::isFlatGlobalAddrSpace(PtrTy.getAddressSpace()))
3209     return AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3210 
3211   // If we're using MUBUF instructions for global memory, an SGPR base register
3212   // is possible. Otherwise this needs to be a VGPR.
3213   const RegisterBank *PtrBank = getRegBank(PtrReg, MRI, *TRI);
3214   return AMDGPU::getValueMapping(PtrBank->getID(), Size);
3215 }
3216 
3217 const RegisterBankInfo::InstructionMapping &
3218 AMDGPURegisterBankInfo::getInstrMappingForLoad(const MachineInstr &MI) const {
3219 
3220   const MachineFunction &MF = *MI.getParent()->getParent();
3221   const MachineRegisterInfo &MRI = MF.getRegInfo();
3222   SmallVector<const ValueMapping*, 2> OpdsMapping(2);
3223   unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
3224   Register PtrReg = MI.getOperand(1).getReg();
3225   LLT PtrTy = MRI.getType(PtrReg);
3226   unsigned AS = PtrTy.getAddressSpace();
3227   unsigned PtrSize = PtrTy.getSizeInBits();
3228 
3229   const ValueMapping *ValMapping;
3230   const ValueMapping *PtrMapping;
3231 
3232   const RegisterBank *PtrBank = getRegBank(PtrReg, MRI, *TRI);
3233 
3234   if (PtrBank == &AMDGPU::SGPRRegBank && AMDGPU::isFlatGlobalAddrSpace(AS)) {
3235     if (isScalarLoadLegal(MI)) {
3236       // We have a uniform instruction so we want to use an SMRD load
3237       ValMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
3238       PtrMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize);
3239     } else {
3240       ValMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3241 
3242       // If we're using MUBUF instructions for global memory, an SGPR base
3243       // register is possible. Otherwise this needs to be a VGPR.
3244       unsigned PtrBankID = Subtarget.useFlatForGlobal() ?
3245         AMDGPU::VGPRRegBankID : AMDGPU::SGPRRegBankID;
3246 
3247       PtrMapping = AMDGPU::getValueMapping(PtrBankID, PtrSize);
3248     }
3249   } else {
3250     ValMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3251     PtrMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize);
3252   }
3253 
3254   OpdsMapping[0] = ValMapping;
3255   OpdsMapping[1] = PtrMapping;
3256   const RegisterBankInfo::InstructionMapping &Mapping = getInstructionMapping(
3257       1, 1, getOperandsMapping(OpdsMapping), MI.getNumOperands());
3258   return Mapping;
3259 
3260   // FIXME: Do we want to add a mapping for FLAT load, or should we just
3261   // handle that during instruction selection?
3262 }
3263 
3264 unsigned
3265 AMDGPURegisterBankInfo::getRegBankID(Register Reg,
3266                                      const MachineRegisterInfo &MRI,
3267                                      unsigned Default) const {
3268   const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
3269   return Bank ? Bank->getID() : Default;
3270 }
3271 
3272 const RegisterBankInfo::ValueMapping *
3273 AMDGPURegisterBankInfo::getSGPROpMapping(Register Reg,
3274                                          const MachineRegisterInfo &MRI,
3275                                          const TargetRegisterInfo &TRI) const {
3276   // Lie and claim anything is legal, even though this needs to be an SGPR.
3277   // applyMapping will have to deal with it as a waterfall loop.
3278   unsigned Bank = getRegBankID(Reg, MRI, AMDGPU::SGPRRegBankID);
3279   unsigned Size = getSizeInBits(Reg, MRI, TRI);
3280   return AMDGPU::getValueMapping(Bank, Size);
3281 }
3282 
3283 const RegisterBankInfo::ValueMapping *
3284 AMDGPURegisterBankInfo::getVGPROpMapping(Register Reg,
3285                                          const MachineRegisterInfo &MRI,
3286                                          const TargetRegisterInfo &TRI) const {
3287   unsigned Size = getSizeInBits(Reg, MRI, TRI);
3288   return AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3289 }
3290 
3291 const RegisterBankInfo::ValueMapping *
3292 AMDGPURegisterBankInfo::getAGPROpMapping(Register Reg,
3293                                          const MachineRegisterInfo &MRI,
3294                                          const TargetRegisterInfo &TRI) const {
3295   unsigned Size = getSizeInBits(Reg, MRI, TRI);
3296   return AMDGPU::getValueMapping(AMDGPU::AGPRRegBankID, Size);
3297 }
3298 
3299 ///
3300 /// This function must return a legal mapping, because
3301 /// AMDGPURegisterBankInfo::getInstrAlternativeMappings() is not called
3302 /// in RegBankSelect::Mode::Fast.  Any mapping that would cause a
3303 /// VGPR to SGPR copy to be generated is illegal.
3304 ///
3305 // Operands that must be SGPRs must accept potentially divergent VGPRs as
3306 // legal. These will be dealt with in applyMappingImpl.
3307 //
3308 const RegisterBankInfo::InstructionMapping &
3309 AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
3310   const MachineFunction &MF = *MI.getParent()->getParent();
3311   const MachineRegisterInfo &MRI = MF.getRegInfo();
3312 
3313   if (MI.isCopy() || MI.getOpcode() == AMDGPU::G_FREEZE) {
3314     // The default logic bothers to analyze impossible alternative mappings. We
3315     // want the most straightforward mapping, so just directly handle this.
3316     const RegisterBank *DstBank = getRegBank(MI.getOperand(0).getReg(), MRI,
3317                                              *TRI);
3318     const RegisterBank *SrcBank = getRegBank(MI.getOperand(1).getReg(), MRI,
3319                                              *TRI);
3320     assert(SrcBank && "src bank should have been assigned already");
3321     if (!DstBank)
3322       DstBank = SrcBank;
3323 
3324     unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
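    // Reject mappings that would require an impossible cross-bank copy, such
    // as VGPR to SGPR.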
3325     if (cannotCopy(*DstBank, *SrcBank, Size))
3326       return getInvalidInstructionMapping();
3327 
3328     const ValueMapping &ValMap = getValueMapping(0, Size, *DstBank);
3329     unsigned OpdsMappingSize = MI.isCopy() ? 1 : 2;
3330     SmallVector<const ValueMapping *, 1> OpdsMapping(OpdsMappingSize);
3331     OpdsMapping[0] = &ValMap;
3332     if (MI.getOpcode() == AMDGPU::G_FREEZE)
3333       OpdsMapping[1] = &ValMap;
3334 
3335     return getInstructionMapping(
3336         1, /*Cost*/ 1,
3337         /*OperandsMapping*/ getOperandsMapping(OpdsMapping), OpdsMappingSize);
3338   }
3339 
3340   if (MI.isRegSequence()) {
3341     // If any input is a VGPR, the result must be a VGPR. The default handling
3342     // assumes any copy between banks is legal.
3343     unsigned BankID = AMDGPU::SGPRRegBankID;
3344 
3345     for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
3346       auto OpBank = getRegBankID(MI.getOperand(I).getReg(), MRI);
3347       // It doesn't make sense to use vcc or scc banks here, so just ignore
3348       // them.
3349       if (OpBank != AMDGPU::SGPRRegBankID) {
3350         BankID = AMDGPU::VGPRRegBankID;
3351         break;
3352       }
3353     }
3354     unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
3355 
3356     const ValueMapping &ValMap = getValueMapping(0, Size, getRegBank(BankID));
3357     return getInstructionMapping(
3358         1, /*Cost*/ 1,
3359         /*OperandsMapping*/ getOperandsMapping({&ValMap}), 1);
3360   }
3361 
3362   // The default handling is broken and doesn't handle illegal SGPR->VGPR copies
3363   // properly.
3364   //
3365   // TODO: There are additional exec masking dependencies to analyze.
3366   if (MI.getOpcode() == TargetOpcode::G_PHI) {
3367     unsigned ResultBank = AMDGPU::InvalidRegBankID;
3368     Register DstReg = MI.getOperand(0).getReg();
3369 
3370     // Sometimes the result may have already been assigned a bank.
3371     if (const RegisterBank *DstBank = getRegBank(DstReg, MRI, *TRI))
3372       ResultBank = DstBank->getID();
3373 
3374     for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
3375       Register Reg = MI.getOperand(I).getReg();
3376       const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
3377 
3378       // FIXME: Assuming VGPR for any undetermined inputs.
3379       if (!Bank || Bank->getID() == AMDGPU::VGPRRegBankID) {
3380         ResultBank = AMDGPU::VGPRRegBankID;
3381         break;
3382       }
3383 
3384       // FIXME: Need to promote SGPR case to s32
3385       unsigned OpBank = Bank->getID();
3386       ResultBank = regBankBoolUnion(ResultBank, OpBank);
3387     }
3388 
3389     assert(ResultBank != AMDGPU::InvalidRegBankID);
3390 
3391     unsigned Size = MRI.getType(DstReg).getSizeInBits();
3392 
3393     const ValueMapping &ValMap =
3394         getValueMapping(0, Size, getRegBank(ResultBank));
3395     return getInstructionMapping(
3396         1, /*Cost*/ 1,
3397         /*OperandsMapping*/ getOperandsMapping({&ValMap}), 1);
3398   }
3399 
3400   const RegisterBankInfo::InstructionMapping &Mapping = getInstrMappingImpl(MI);
3401   if (Mapping.isValid())
3402     return Mapping;
3403 
3404   SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
3405 
3406   switch (MI.getOpcode()) {
3407   default:
3408     return getInvalidInstructionMapping();
3409 
3410   case AMDGPU::G_AND:
3411   case AMDGPU::G_OR:
3412   case AMDGPU::G_XOR: {
3413     unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3414     if (Size == 1) {
3415       const RegisterBank *DstBank
3416         = getRegBank(MI.getOperand(0).getReg(), MRI, *TRI);
3417 
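      // Pick one consistent bank for the result and both sources: a VCC
      // destination forces VCC sources; otherwise the banks are inferred from
      // the sources.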
3418       unsigned TargetBankID = AMDGPU::InvalidRegBankID;
3419       unsigned BankLHS = AMDGPU::InvalidRegBankID;
3420       unsigned BankRHS = AMDGPU::InvalidRegBankID;
3421       if (DstBank) {
3422         TargetBankID = DstBank->getID();
3423         if (DstBank == &AMDGPU::VCCRegBank) {
3424           TargetBankID = AMDGPU::VCCRegBankID;
3425           BankLHS = AMDGPU::VCCRegBankID;
3426           BankRHS = AMDGPU::VCCRegBankID;
3427         } else {
3428           BankLHS = getRegBankID(MI.getOperand(1).getReg(), MRI,
3429                                  AMDGPU::SGPRRegBankID);
3430           BankRHS = getRegBankID(MI.getOperand(2).getReg(), MRI,
3431                                  AMDGPU::SGPRRegBankID);
3432         }
3433       } else {
3434         BankLHS = getRegBankID(MI.getOperand(1).getReg(), MRI,
3435                                AMDGPU::VCCRegBankID);
3436         BankRHS = getRegBankID(MI.getOperand(2).getReg(), MRI,
3437                                AMDGPU::VCCRegBankID);
3438 
3439         // Both inputs should be true booleans to produce a boolean result.
3440         if (BankLHS == AMDGPU::VGPRRegBankID || BankRHS == AMDGPU::VGPRRegBankID) {
3441           TargetBankID = AMDGPU::VGPRRegBankID;
3442         } else if (BankLHS == AMDGPU::VCCRegBankID || BankRHS == AMDGPU::VCCRegBankID) {
3443           TargetBankID = AMDGPU::VCCRegBankID;
3444           BankLHS = AMDGPU::VCCRegBankID;
3445           BankRHS = AMDGPU::VCCRegBankID;
3446         } else if (BankLHS == AMDGPU::SGPRRegBankID && BankRHS == AMDGPU::SGPRRegBankID) {
3447           TargetBankID = AMDGPU::SGPRRegBankID;
3448         }
3449       }
3450 
3451       OpdsMapping[0] = AMDGPU::getValueMapping(TargetBankID, Size);
3452       OpdsMapping[1] = AMDGPU::getValueMapping(BankLHS, Size);
3453       OpdsMapping[2] = AMDGPU::getValueMapping(BankRHS, Size);
3454       break;
3455     }
3456 
3457     if (Size == 64) {
3458 
3459       if (isSALUMapping(MI)) {
3460         OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::SGPRRegBankID, Size);
3461         OpdsMapping[1] = OpdsMapping[2] = OpdsMapping[0];
3462       } else {
3463         OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size);
3464         unsigned Bank1 = getRegBankID(MI.getOperand(1).getReg(), MRI /*, DefaultBankID*/);
3465         OpdsMapping[1] = AMDGPU::getValueMapping(Bank1, Size);
3466 
3467         unsigned Bank2 = getRegBankID(MI.getOperand(2).getReg(), MRI /*, DefaultBankID*/);
3468         OpdsMapping[2] = AMDGPU::getValueMapping(Bank2, Size);
3469       }
3470 
3471       break;
3472     }
3473 
3474     LLVM_FALLTHROUGH;
3475   }
3476   case AMDGPU::G_PTR_ADD:
3477   case AMDGPU::G_PTRMASK:
3478   case AMDGPU::G_ADD:
3479   case AMDGPU::G_SUB:
3480   case AMDGPU::G_MUL:
3481   case AMDGPU::G_SHL:
3482   case AMDGPU::G_LSHR:
3483   case AMDGPU::G_ASHR:
3484   case AMDGPU::G_UADDO:
3485   case AMDGPU::G_USUBO:
3486   case AMDGPU::G_UADDE:
3487   case AMDGPU::G_SADDE:
3488   case AMDGPU::G_USUBE:
3489   case AMDGPU::G_SSUBE:
3490   case AMDGPU::G_SMIN:
3491   case AMDGPU::G_SMAX:
3492   case AMDGPU::G_UMIN:
3493   case AMDGPU::G_UMAX:
3494   case AMDGPU::G_SHUFFLE_VECTOR:
3495     if (isSALUMapping(MI))
3496       return getDefaultMappingSOP(MI);
3497     LLVM_FALLTHROUGH;
3498 
3499   case AMDGPU::G_SADDSAT: // FIXME: Could lower sat ops for SALU
3500   case AMDGPU::G_SSUBSAT:
3501   case AMDGPU::G_UADDSAT:
3502   case AMDGPU::G_USUBSAT:
3503   case AMDGPU::G_FADD:
3504   case AMDGPU::G_FSUB:
3505   case AMDGPU::G_FPTOSI:
3506   case AMDGPU::G_FPTOUI:
3507   case AMDGPU::G_FMUL:
3508   case AMDGPU::G_FMA:
3509   case AMDGPU::G_FMAD:
3510   case AMDGPU::G_FSQRT:
3511   case AMDGPU::G_FFLOOR:
3512   case AMDGPU::G_FCEIL:
3513   case AMDGPU::G_FRINT:
3514   case AMDGPU::G_SITOFP:
3515   case AMDGPU::G_UITOFP:
3516   case AMDGPU::G_FPTRUNC:
3517   case AMDGPU::G_FPEXT:
3518   case AMDGPU::G_FEXP2:
3519   case AMDGPU::G_FLOG2:
3520   case AMDGPU::G_FMINNUM:
3521   case AMDGPU::G_FMAXNUM:
3522   case AMDGPU::G_FMINNUM_IEEE:
3523   case AMDGPU::G_FMAXNUM_IEEE:
3524   case AMDGPU::G_FCANONICALIZE:
3525   case AMDGPU::G_INTRINSIC_TRUNC:
3526   case AMDGPU::G_BSWAP: // TODO: Somehow expand for scalar?
3527   case AMDGPU::G_FSHR: // TODO: Expand for scalar
3528   case AMDGPU::G_AMDGPU_FFBH_U32:
3529   case AMDGPU::G_AMDGPU_FMIN_LEGACY:
3530   case AMDGPU::G_AMDGPU_FMAX_LEGACY:
3531   case AMDGPU::G_AMDGPU_RCP_IFLAG:
3532   case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0:
3533   case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1:
3534   case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2:
3535   case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
3536   case AMDGPU::G_AMDGPU_CVT_PK_I16_I32:
3537   case AMDGPU::G_AMDGPU_SMED3:
3538     return getDefaultMappingVOP(MI);
3539   case AMDGPU::G_UMULH:
3540   case AMDGPU::G_SMULH: {
3541     if (Subtarget.hasScalarMulHiInsts() && isSALUMapping(MI))
3542       return getDefaultMappingSOP(MI);
3543     return getDefaultMappingVOP(MI);
3544   }
3545   case AMDGPU::G_IMPLICIT_DEF: {
3546     unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3547     OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
3548     break;
3549   }
3550   case AMDGPU::G_FCONSTANT:
3551   case AMDGPU::G_CONSTANT:
3552   case AMDGPU::G_GLOBAL_VALUE:
3553   case AMDGPU::G_BLOCK_ADDR:
3554   case AMDGPU::G_READCYCLECOUNTER: {
3555     unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3556     OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
3557     break;
3558   }
3559   case AMDGPU::G_FRAME_INDEX: {
3560     // TODO: This should be the same as other constants, but eliminateFrameIndex
3561     // currently assumes VALU uses.
3562     unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3563     OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3564     break;
3565   }
3566   case AMDGPU::G_DYN_STACKALLOC: {
3567     // Result is always uniform, and a wave reduction is needed for the source.
3568     OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
3569     unsigned SrcBankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
3570     OpdsMapping[1] = AMDGPU::getValueMapping(SrcBankID, 32);
3571     break;
3572   }
3573   case AMDGPU::G_INSERT: {
3574     unsigned BankID = getMappingType(MRI, MI);
3575     unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
3576     unsigned SrcSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
3577     unsigned EltSize = getSizeInBits(MI.getOperand(2).getReg(), MRI, *TRI);
3578     OpdsMapping[0] = AMDGPU::getValueMapping(BankID, DstSize);
3579     OpdsMapping[1] = AMDGPU::getValueMapping(BankID, SrcSize);
3580     OpdsMapping[2] = AMDGPU::getValueMapping(BankID, EltSize);
3581     OpdsMapping[3] = nullptr;
3582     break;
3583   }
3584   case AMDGPU::G_EXTRACT: {
3585     unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
3586     unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
3587     unsigned SrcSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
3588     OpdsMapping[0] = AMDGPU::getValueMapping(BankID, DstSize);
3589     OpdsMapping[1] = AMDGPU::getValueMapping(BankID, SrcSize);
3590     OpdsMapping[2] = nullptr;
3591     break;
3592   }
3593   case AMDGPU::G_BUILD_VECTOR:
3594   case AMDGPU::G_BUILD_VECTOR_TRUNC: {
3595     LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
3596     if (DstTy == LLT::vector(2, 16)) {
3597       unsigned DstSize = DstTy.getSizeInBits();
3598       unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
3599       unsigned Src0BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
3600       unsigned Src1BankID = getRegBankID(MI.getOperand(2).getReg(), MRI);
3601       unsigned DstBankID = regBankUnion(Src0BankID, Src1BankID);
3602 
3603       OpdsMapping[0] = AMDGPU::getValueMapping(DstBankID, DstSize);
3604       OpdsMapping[1] = AMDGPU::getValueMapping(Src0BankID, SrcSize);
3605       OpdsMapping[2] = AMDGPU::getValueMapping(Src1BankID, SrcSize);
3606       break;
3607     }
3608 
3609     LLVM_FALLTHROUGH;
3610   }
3611   case AMDGPU::G_MERGE_VALUES:
3612   case AMDGPU::G_CONCAT_VECTORS: {
3613     unsigned Bank = getMappingType(MRI, MI);
3614     unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3615     unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
3616 
3617     OpdsMapping[0] = AMDGPU::getValueMapping(Bank, DstSize);
3618     // Op1 and Dst should use the same register bank.
3619     for (unsigned i = 1, e = MI.getNumOperands(); i != e; ++i)
3620       OpdsMapping[i] = AMDGPU::getValueMapping(Bank, SrcSize);
3621     break;
3622   }
3623   case AMDGPU::G_BITREVERSE:
3624   case AMDGPU::G_BITCAST:
3625   case AMDGPU::G_INTTOPTR:
3626   case AMDGPU::G_PTRTOINT:
3627   case AMDGPU::G_FABS:
3628   case AMDGPU::G_FNEG: {
3629     unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3630     unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
3631     OpdsMapping[0] = OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size);
3632     break;
3633   }
3634   case AMDGPU::G_CTLZ_ZERO_UNDEF:
3635   case AMDGPU::G_CTTZ_ZERO_UNDEF:
3636   case AMDGPU::G_CTPOP: {
3637     unsigned Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
3638     unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
3639     OpdsMapping[0] = AMDGPU::getValueMapping(BankID, 32);
3640 
3641     // This should really be getValueMappingSGPR64Only, but allowing the generic
3642     // code to handle the register split just makes using LegalizerHelper more
3643     // difficult.
3644     OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size);
3645     break;
3646   }
3647   case AMDGPU::G_TRUNC: {
3648     Register Dst = MI.getOperand(0).getReg();
3649     Register Src = MI.getOperand(1).getReg();
3650     unsigned Bank = getRegBankID(Src, MRI);
3651     unsigned DstSize = getSizeInBits(Dst, MRI, *TRI);
3652     unsigned SrcSize = getSizeInBits(Src, MRI, *TRI);
3653     OpdsMapping[0] = AMDGPU::getValueMapping(Bank, DstSize);
3654     OpdsMapping[1] = AMDGPU::getValueMapping(Bank, SrcSize);
3655     break;
3656   }
3657   case AMDGPU::G_ZEXT:
3658   case AMDGPU::G_SEXT:
3659   case AMDGPU::G_ANYEXT:
3660   case AMDGPU::G_SEXT_INREG: {
3661     Register Dst = MI.getOperand(0).getReg();
3662     Register Src = MI.getOperand(1).getReg();
3663     unsigned DstSize = getSizeInBits(Dst, MRI, *TRI);
3664     unsigned SrcSize = getSizeInBits(Src, MRI, *TRI);
3665 
3666     unsigned DstBank;
3667     const RegisterBank *SrcBank = getRegBank(Src, MRI, *TRI);
3668     assert(SrcBank);
3669     switch (SrcBank->getID()) {
3670     case AMDGPU::SGPRRegBankID:
3671       DstBank = AMDGPU::SGPRRegBankID;
3672       break;
3673     default:
3674       DstBank = AMDGPU::VGPRRegBankID;
3675       break;
3676     }
3677 
3678     // Scalar extend can use 64-bit BFE, but VGPRs require extending to
3679     // 32-bits, and then to 64.
3680     OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(DstBank, DstSize);
3681     OpdsMapping[1] = AMDGPU::getValueMappingSGPR64Only(SrcBank->getID(),
3682                                                        SrcSize);
3683     break;
3684   }
3685   case AMDGPU::G_FCMP: {
3686     unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
3687     unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI);
3688     OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
3689     OpdsMapping[1] = nullptr; // Predicate Operand.
3690     OpdsMapping[2] = AMDGPU::getValueMapping(Op2Bank, Size);
3691     OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3692     break;
3693   }
3694   case AMDGPU::G_STORE: {
3695     assert(MI.getOperand(0).isReg());
3696     unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3697 
3698     // FIXME: We need to specify a different reg bank once scalar stores are
3699     // supported.
3700     const ValueMapping *ValMapping =
3701         AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3702     OpdsMapping[0] = ValMapping;
3703     OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg());
3704     break;
3705   }
3706   case AMDGPU::G_ICMP: {
3707     auto Pred = static_cast<CmpInst::Predicate>(MI.getOperand(1).getPredicate());
3708     unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
3709 
3710     // See if the result register has already been constrained to vcc, which may
3711     // happen due to control flow intrinsic lowering.
3712     unsigned DstBank = getRegBankID(MI.getOperand(0).getReg(), MRI,
3713                                     AMDGPU::SGPRRegBankID);
3714     unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI);
3715     unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI);
3716 
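    // A scalar (SCC-producing) compare is only usable when the result and
    // both sources are SGPRs, and the compared type is 32 bits, or 64-bit
    // eq/ne on subtargets with scalar 64-bit compares.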
3717     bool CanUseSCC = DstBank == AMDGPU::SGPRRegBankID &&
3718                      Op2Bank == AMDGPU::SGPRRegBankID &&
3719                      Op3Bank == AMDGPU::SGPRRegBankID &&
3720       (Size == 32 || (Size == 64 &&
3721                       (Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE) &&
3722                       Subtarget.hasScalarCompareEq64()));
3723 
3724     DstBank = CanUseSCC ? AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID;
3725     unsigned SrcBank = CanUseSCC ? AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
3726 
3727     // TODO: Use 32-bit for scalar output size.
3728     // SCC results will need to be copied to a 32-bit SGPR virtual register.
3729     const unsigned ResultSize = 1;
3730 
3731     OpdsMapping[0] = AMDGPU::getValueMapping(DstBank, ResultSize);
3732     OpdsMapping[2] = AMDGPU::getValueMapping(SrcBank, Size);
3733     OpdsMapping[3] = AMDGPU::getValueMapping(SrcBank, Size);
3734     break;
3735   }
3736   case AMDGPU::G_EXTRACT_VECTOR_ELT: {
3737     // VGPR index can be used for waterfall when indexing an SGPR vector.
3738     unsigned SrcBankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
3739     unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3740     unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
3741     unsigned IdxSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
3742     unsigned IdxBank = getRegBankID(MI.getOperand(2).getReg(), MRI);
3743     unsigned OutputBankID = regBankUnion(SrcBankID, IdxBank);
3744 
3745     OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(OutputBankID, DstSize);
3746     OpdsMapping[1] = AMDGPU::getValueMapping(SrcBankID, SrcSize);
3747 
3748     // The index can be in either bank if the source vector is VGPR.
3749     OpdsMapping[2] = AMDGPU::getValueMapping(IdxBank, IdxSize);
3750     break;
3751   }
3752   case AMDGPU::G_INSERT_VECTOR_ELT: {
3753     unsigned OutputBankID = isSALUMapping(MI) ?
3754       AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
3755 
3756     unsigned VecSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3757     unsigned InsertSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
3758     unsigned IdxSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits();
3759     unsigned InsertEltBankID = getRegBankID(MI.getOperand(2).getReg(), MRI);
3760     unsigned IdxBankID = getRegBankID(MI.getOperand(3).getReg(), MRI);
3761 
3762     OpdsMapping[0] = AMDGPU::getValueMapping(OutputBankID, VecSize);
3763     OpdsMapping[1] = AMDGPU::getValueMapping(OutputBankID, VecSize);
3764 
3765     // This is a weird case, because we need to break down the mapping based on
3766     // the register bank of a different operand.
3767     if (InsertSize == 64 && OutputBankID == AMDGPU::VGPRRegBankID) {
3768       OpdsMapping[2] = AMDGPU::getValueMappingSplit64(InsertEltBankID,
3769                                                       InsertSize);
3770     } else {
3771       assert(InsertSize == 32 || InsertSize == 64);
3772       OpdsMapping[2] = AMDGPU::getValueMapping(InsertEltBankID, InsertSize);
3773     }
3774 
3775     // The index can be in either bank if the source vector is VGPR.
3776     OpdsMapping[3] = AMDGPU::getValueMapping(IdxBankID, IdxSize);
3777     break;
3778   }
3779   case AMDGPU::G_UNMERGE_VALUES: {
3780     unsigned Bank = getMappingType(MRI, MI);
3781 
3782     // Op1 and Dst should use the same register bank.
3783     // FIXME: Shouldn't this be the default? Why do we need to handle this?
3784     for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
3785       unsigned Size = getSizeInBits(MI.getOperand(i).getReg(), MRI, *TRI);
3786       OpdsMapping[i] = AMDGPU::getValueMapping(Bank, Size);
3787     }
3788     break;
3789   }
3790   case AMDGPU::G_AMDGPU_BUFFER_LOAD:
3791   case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
3792   case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE:
3793   case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
3794   case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT:
3795   case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT:
3796   case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16:
3797   case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT:
3798   case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16:
3799   case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT:
3800   case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16:
3801   case AMDGPU::G_AMDGPU_BUFFER_STORE:
3802   case AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE:
3803   case AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT:
3804   case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT:
3805   case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16: {
3806     OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
3807 
3808     // rsrc
3809     OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
3810 
3811     // vindex
3812     OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
3813 
3814     // voffset
3815     OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
3816 
3817     // soffset
3818     OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
3819 
3820     // Any remaining operands are immediates and were correctly null
3821     // initialized.
3822     break;
3823   }
3824   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP:
3825   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD:
3826   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB:
3827   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN:
3828   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN:
3829   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX:
3830   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX:
3831   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND:
3832   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR:
3833   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR:
3834   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC:
3835   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC:
3836   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD:
3837   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN:
3838   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX: {
3839     // vdata_out
3840     OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
3841 
3842     // vdata_in
3843     OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
3844 
3845     // rsrc
3846     OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
3847 
3848     // vindex
3849     OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
3850 
3851     // voffset
3852     OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
3853 
3854     // soffset
3855     OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
3856 
3857     // Any remaining operands are immediates and were correctly null
3858     // initialized.
3859     break;
3860   }
3861   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP: {
3862     // vdata_out
3863     OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
3864 
3865     // vdata_in
3866     OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
3867 
3868     // cmp
3869     OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
3870 
3871     // rsrc
3872     OpdsMapping[3] = getSGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
3873 
3874     // vindex
3875     OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
3876 
3877     // voffset
3878     OpdsMapping[5] = getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
3879 
3880     // soffset
3881     OpdsMapping[6] = getSGPROpMapping(MI.getOperand(6).getReg(), MRI, *TRI);
3882 
3883     // Any remaining operands are immediates and were correctly null
3884     // initialized.
3885     break;
3886   }
3887   case AMDGPU::G_AMDGPU_S_BUFFER_LOAD: {
3888     // Lie and claim everything is legal, even though some need to be
3889     // SGPRs. applyMapping will have to deal with it as a waterfall loop.
3890     OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
3891     OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
3892 
3893     // We need to convert this to a MUBUF if either the resource or offset is
3894     // VGPR.
3895     unsigned RSrcBank = OpdsMapping[1]->BreakDown[0].RegBank->getID();
3896     unsigned OffsetBank = OpdsMapping[2]->BreakDown[0].RegBank->getID();
3897     unsigned ResultBank = regBankUnion(RSrcBank, OffsetBank);
3898 
3899     unsigned Size0 = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3900     OpdsMapping[0] = AMDGPU::getValueMapping(ResultBank, Size0);
3901     break;
3902   }
3903   case AMDGPU::G_INTRINSIC: {
3904     switch (MI.getIntrinsicID()) {
3905     default:
3906       return getInvalidInstructionMapping();
3907     case Intrinsic::amdgcn_div_fmas:
3908     case Intrinsic::amdgcn_div_fixup:
3909     case Intrinsic::amdgcn_trig_preop:
3910     case Intrinsic::amdgcn_sin:
3911     case Intrinsic::amdgcn_cos:
3912     case Intrinsic::amdgcn_log_clamp:
3913     case Intrinsic::amdgcn_rcp:
3914     case Intrinsic::amdgcn_rcp_legacy:
3915     case Intrinsic::amdgcn_sqrt:
3916     case Intrinsic::amdgcn_rsq:
3917     case Intrinsic::amdgcn_rsq_legacy:
3918     case Intrinsic::amdgcn_rsq_clamp:
3919     case Intrinsic::amdgcn_fmul_legacy:
3920     case Intrinsic::amdgcn_fma_legacy:
3921     case Intrinsic::amdgcn_ldexp:
3922     case Intrinsic::amdgcn_frexp_mant:
3923     case Intrinsic::amdgcn_frexp_exp:
3924     case Intrinsic::amdgcn_fract:
3925     case Intrinsic::amdgcn_cvt_pkrtz:
3926     case Intrinsic::amdgcn_cvt_pknorm_i16:
3927     case Intrinsic::amdgcn_cvt_pknorm_u16:
3928     case Intrinsic::amdgcn_cvt_pk_i16:
3929     case Intrinsic::amdgcn_cvt_pk_u16:
3930     case Intrinsic::amdgcn_fmed3:
3931     case Intrinsic::amdgcn_cubeid:
3932     case Intrinsic::amdgcn_cubema:
3933     case Intrinsic::amdgcn_cubesc:
3934     case Intrinsic::amdgcn_cubetc:
3935     case Intrinsic::amdgcn_sffbh:
3936     case Intrinsic::amdgcn_fmad_ftz:
3937     case Intrinsic::amdgcn_mbcnt_lo:
3938     case Intrinsic::amdgcn_mbcnt_hi:
3939     case Intrinsic::amdgcn_mul_u24:
3940     case Intrinsic::amdgcn_mul_i24:
3941     case Intrinsic::amdgcn_lerp:
3942     case Intrinsic::amdgcn_sad_u8:
3943     case Intrinsic::amdgcn_msad_u8:
3944     case Intrinsic::amdgcn_sad_hi_u8:
3945     case Intrinsic::amdgcn_sad_u16:
3946     case Intrinsic::amdgcn_qsad_pk_u16_u8:
3947     case Intrinsic::amdgcn_mqsad_pk_u16_u8:
3948     case Intrinsic::amdgcn_mqsad_u32_u8:
3949     case Intrinsic::amdgcn_cvt_pk_u8_f32:
3950     case Intrinsic::amdgcn_alignbit:
3951     case Intrinsic::amdgcn_alignbyte:
3952     case Intrinsic::amdgcn_perm:
3953     case Intrinsic::amdgcn_fdot2:
3954     case Intrinsic::amdgcn_sdot2:
3955     case Intrinsic::amdgcn_udot2:
3956     case Intrinsic::amdgcn_sdot4:
3957     case Intrinsic::amdgcn_udot4:
3958     case Intrinsic::amdgcn_sdot8:
3959     case Intrinsic::amdgcn_udot8:
3960       return getDefaultMappingVOP(MI);
3961     case Intrinsic::amdgcn_sbfe:
3962     case Intrinsic::amdgcn_ubfe:
3963       if (isSALUMapping(MI))
3964         return getDefaultMappingSOP(MI);
3965       return getDefaultMappingVOP(MI);
3966     case Intrinsic::amdgcn_ds_swizzle:
3967     case Intrinsic::amdgcn_ds_permute:
3968     case Intrinsic::amdgcn_ds_bpermute:
3969     case Intrinsic::amdgcn_update_dpp:
3970     case Intrinsic::amdgcn_mov_dpp8:
3971     case Intrinsic::amdgcn_mov_dpp:
3972     case Intrinsic::amdgcn_strict_wwm:
3973     case Intrinsic::amdgcn_wwm:
3974     case Intrinsic::amdgcn_strict_wqm:
3975     case Intrinsic::amdgcn_wqm:
3976     case Intrinsic::amdgcn_softwqm:
3977     case Intrinsic::amdgcn_set_inactive:
3978       return getDefaultMappingAllVGPR(MI);
3979     case Intrinsic::amdgcn_kernarg_segment_ptr:
3980     case Intrinsic::amdgcn_s_getpc:
3981     case Intrinsic::amdgcn_groupstaticsize:
3982     case Intrinsic::amdgcn_reloc_constant:
3983     case Intrinsic::returnaddress: {
3984       unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3985       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
3986       break;
3987     }
3988     case Intrinsic::amdgcn_wqm_vote: {
3989       unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3990       OpdsMapping[0] = OpdsMapping[2]
3991         = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size);
3992       break;
3993     }
3994     case Intrinsic::amdgcn_ps_live: {
3995       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
3996       break;
3997     }
    case Intrinsic::amdgcn_div_scale: {
      unsigned Dst0Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      unsigned Dst1Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Dst0Size);
      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Dst1Size);

      unsigned SrcSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits();
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
      OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
      break;
    }
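    // The class result is a per-lane boolean (VCC); the value and mask
    // sources are VALU operands.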
    case Intrinsic::amdgcn_class: {
      Register Src0Reg = MI.getOperand(2).getReg();
      Register Src1Reg = MI.getOperand(3).getReg();
      unsigned Src0Size = MRI.getType(Src0Reg).getSizeInBits();
      unsigned Src1Size = MRI.getType(Src1Reg).getSizeInBits();
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, DstSize);
      OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Src0Size);
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Src1Size);
      break;
    }
    case Intrinsic::amdgcn_icmp:
    case Intrinsic::amdgcn_fcmp: {
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      // This is not VCCRegBank because the result is not used in a boolean
      // context.
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
      unsigned OpSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
      OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, OpSize);
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, OpSize);
      break;
    }
    case Intrinsic::amdgcn_readlane: {
      // This must be an SGPR, but accept a VGPR.
      Register IdxReg = MI.getOperand(3).getReg();
      unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits();
      unsigned IdxBank = getRegBankID(IdxReg, MRI, AMDGPU::SGPRRegBankID);
      OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize);
      LLVM_FALLTHROUGH;
    }
    case Intrinsic::amdgcn_readfirstlane: {
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      unsigned SrcSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
      OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
      break;
    }
    case Intrinsic::amdgcn_writelane: {
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      Register SrcReg = MI.getOperand(2).getReg();
      unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits();
      unsigned SrcBank = getRegBankID(SrcReg, MRI, AMDGPU::SGPRRegBankID);
      Register IdxReg = MI.getOperand(3).getReg();
      unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits();
      unsigned IdxBank = getRegBankID(IdxReg, MRI, AMDGPU::SGPRRegBankID);
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);

      // These two must be SGPRs, but accept VGPRs; a readfirstlane will be
      // inserted to legalize them.
      OpdsMapping[2] = AMDGPU::getValueMapping(SrcBank, SrcSize);
      OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize);
      OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
      break;
    }
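    // if_break combines a per-lane VCC condition with the wave-sized loop
    // mask; the resulting mask and the incoming mask are scalar (SGPR) values.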
    case Intrinsic::amdgcn_if_break: {
      unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
      OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
      break;
    }
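    // The permlane data operands are VGPRs; the two lane-select operands must
    // be scalar (SGPR) operands.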
    case Intrinsic::amdgcn_permlane16:
    case Intrinsic::amdgcn_permlanex16: {
      unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
      OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
      OpdsMapping[4] = getSGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
      OpdsMapping[5] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_mfma_f32_4x4x1f32:
    case Intrinsic::amdgcn_mfma_f32_4x4x4f16:
    case Intrinsic::amdgcn_mfma_i32_4x4x4i8:
    case Intrinsic::amdgcn_mfma_f32_4x4x2bf16:
    case Intrinsic::amdgcn_mfma_f32_16x16x1f32:
    case Intrinsic::amdgcn_mfma_f32_16x16x4f32:
    case Intrinsic::amdgcn_mfma_f32_16x16x4f16:
    case Intrinsic::amdgcn_mfma_f32_16x16x16f16:
    case Intrinsic::amdgcn_mfma_i32_16x16x4i8:
    case Intrinsic::amdgcn_mfma_i32_16x16x16i8:
    case Intrinsic::amdgcn_mfma_f32_16x16x2bf16:
    case Intrinsic::amdgcn_mfma_f32_16x16x8bf16:
    case Intrinsic::amdgcn_mfma_f32_32x32x1f32:
    case Intrinsic::amdgcn_mfma_f32_32x32x2f32:
    case Intrinsic::amdgcn_mfma_f32_32x32x4f16:
    case Intrinsic::amdgcn_mfma_f32_32x32x8f16:
    case Intrinsic::amdgcn_mfma_i32_32x32x4i8:
    case Intrinsic::amdgcn_mfma_i32_32x32x8i8:
    case Intrinsic::amdgcn_mfma_f32_32x32x2bf16:
    case Intrinsic::amdgcn_mfma_f32_32x32x4bf16:
    case Intrinsic::amdgcn_mfma_f32_32x32x4bf16_1k:
    case Intrinsic::amdgcn_mfma_f32_16x16x4bf16_1k:
    case Intrinsic::amdgcn_mfma_f32_4x4x4bf16_1k:
    case Intrinsic::amdgcn_mfma_f32_32x32x8bf16_1k:
    case Intrinsic::amdgcn_mfma_f32_16x16x16bf16_1k:
    case Intrinsic::amdgcn_mfma_f64_16x16x4f64:
    case Intrinsic::amdgcn_mfma_f64_4x4x4f64: {
      // Default for MAI intrinsics.
      // srcC can also be an immediate which can be folded later.
      // FIXME: Should we eventually add an alternative mapping with AGPR src
      // for srcA/srcB?
      //
      // vdst, srcA, srcB, srcC
      OpdsMapping[0] = getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
      OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
      OpdsMapping[4] = getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_interp_p1:
    case Intrinsic::amdgcn_interp_p2:
    case Intrinsic::amdgcn_interp_mov:
    case Intrinsic::amdgcn_interp_p1_f16:
    case Intrinsic::amdgcn_interp_p2_f16: {
      const int M0Idx = MI.getNumOperands() - 1;
      Register M0Reg = MI.getOperand(M0Idx).getReg();
      unsigned M0Bank = getRegBankID(M0Reg, MRI, AMDGPU::SGPRRegBankID);
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();

      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
      for (int I = 2; I != M0Idx && MI.getOperand(I).isReg(); ++I)
        OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);

      // M0 must be an SGPR, but take whatever the original bank is and fix it
      // later.
      OpdsMapping[M0Idx] = AMDGPU::getValueMapping(M0Bank, 32);
      break;
    }
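    // ballot reads a per-lane VCC condition and produces the wave-sized mask
    // as a scalar (SGPR) value.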
    case Intrinsic::amdgcn_ballot: {
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      unsigned SrcSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
      OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, SrcSize);
      break;
    }
    }
    break;
  }
  case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
  case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE: {
    auto IntrID = MI.getIntrinsicID();
    const AMDGPU::RsrcIntrinsic *RSrcIntrin = AMDGPU::lookupRsrcIntrinsic(IntrID);
    assert(RSrcIntrin && "missing RsrcIntrinsic for image intrinsic");
    // Non-images can have complications from operands that allow both SGPR
    // and VGPR. For now it's too complicated to figure out the final opcode
    // to derive the register bank from the MCInstrDesc.
    assert(RSrcIntrin->IsImage);
    return getImageMapping(MRI, MI, RSrcIntrin->RsrcArg);
  }
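  // The intersect_ray result is a vector loaded into VGPRs; the ray operands
  // are 32-bit VGPRs, and the final explicit use is the resource descriptor,
  // which must be a scalar (SGPR) operand.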
  case AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY: {
    unsigned N = MI.getNumExplicitOperands() - 2;
    OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 128);
    OpdsMapping[N] = getSGPROpMapping(MI.getOperand(N).getReg(), MRI, *TRI);
    for (unsigned I = 2; I < N; ++I)
      OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
    break;
  }
  case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: {
    auto IntrID = MI.getIntrinsicID();
    switch (IntrID) {
    case Intrinsic::amdgcn_s_getreg:
    case Intrinsic::amdgcn_s_memtime:
    case Intrinsic::amdgcn_s_memrealtime:
    case Intrinsic::amdgcn_s_get_waveid_in_workgroup: {
      unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
      break;
    }
    case Intrinsic::amdgcn_global_atomic_fadd:
    case Intrinsic::amdgcn_global_atomic_csub:
    case Intrinsic::amdgcn_global_atomic_fmin:
    case Intrinsic::amdgcn_global_atomic_fmax:
    case Intrinsic::amdgcn_flat_atomic_fadd:
    case Intrinsic::amdgcn_flat_atomic_fmin:
    case Intrinsic::amdgcn_flat_atomic_fmax:
      return getDefaultMappingAllVGPR(MI);
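    // The first source is the m0 value, which must be an SGPR; accept
    // whatever bank it currently has and fix it later. The data operand is a
    // VGPR.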
    case Intrinsic::amdgcn_ds_ordered_add:
    case Intrinsic::amdgcn_ds_ordered_swap: {
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
      unsigned M0Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
                                     AMDGPU::SGPRRegBankID);
      OpdsMapping[2] = AMDGPU::getValueMapping(M0Bank, 32);
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      break;
    }
    case Intrinsic::amdgcn_ds_append:
    case Intrinsic::amdgcn_ds_consume: {
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_exp_compr:
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      break;
    case Intrinsic::amdgcn_exp:
      // FIXME: Could we support packed types here?
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      OpdsMapping[5] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      OpdsMapping[6] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      break;
    case Intrinsic::amdgcn_s_sendmsg:
    case Intrinsic::amdgcn_s_sendmsghalt: {
      // This must be an SGPR, but accept a VGPR.
      unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
                                   AMDGPU::SGPRRegBankID);
      OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
      break;
    }
    case Intrinsic::amdgcn_s_setreg: {
      // This must be an SGPR, but accept a VGPR.
      unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
                                   AMDGPU::SGPRRegBankID);
      OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
      break;
    }
    case Intrinsic::amdgcn_end_cf: {
      unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
      break;
    }
    case Intrinsic::amdgcn_else: {
      unsigned WaveSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize);
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize);
      break;
    }
    case Intrinsic::amdgcn_live_mask: {
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
      break;
    }
    case Intrinsic::amdgcn_wqm_demote:
    case Intrinsic::amdgcn_kill: {
      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
      break;
    }
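    // Buffer accesses: vdata is a VGPR, the resource descriptor and the
    // scalar offset must be SGPR operands, and the voffset is a VGPR.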
    case Intrinsic::amdgcn_raw_buffer_load:
    case Intrinsic::amdgcn_raw_tbuffer_load: {
      // FIXME: Should make intrinsic ID the last operand of the instruction,
      // then this would be the same as the store case.
      OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
      OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_raw_buffer_store:
    case Intrinsic::amdgcn_raw_buffer_store_format:
    case Intrinsic::amdgcn_raw_tbuffer_store: {
      OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
      OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
      break;
    }
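    // The struct buffer forms additionally take a vindex operand, which is a
    // VGPR.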
    case Intrinsic::amdgcn_struct_buffer_load:
    case Intrinsic::amdgcn_struct_tbuffer_load: {
      OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
      OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
      OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_struct_buffer_store:
    case Intrinsic::amdgcn_struct_tbuffer_store: {
      OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
      OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
      OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_init_exec_from_input: {
      unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
      break;
    }
    case Intrinsic::amdgcn_ds_gws_init:
    case Intrinsic::amdgcn_ds_gws_barrier:
    case Intrinsic::amdgcn_ds_gws_sema_br: {
      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);

      // This must be an SGPR, but accept a VGPR.
      unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
                                   AMDGPU::SGPRRegBankID);
      OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
      break;
    }
    case Intrinsic::amdgcn_ds_gws_sema_v:
    case Intrinsic::amdgcn_ds_gws_sema_p:
    case Intrinsic::amdgcn_ds_gws_sema_release_all: {
      // This must be an SGPR, but accept a VGPR.
      unsigned Bank = getRegBankID(MI.getOperand(1).getReg(), MRI,
                                   AMDGPU::SGPRRegBankID);
      OpdsMapping[1] = AMDGPU::getValueMapping(Bank, 32);
      break;
    }
    default:
      return getInvalidInstructionMapping();
    }
    break;
  }
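  // A select whose source values are both SGPRs and whose condition is a
  // scalar boolean can stay on the SALU; otherwise the values are mapped to
  // VGPRs and the condition becomes a VCC lane mask.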
  case AMDGPU::G_SELECT: {
    unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
                                    AMDGPU::SGPRRegBankID);
    unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI,
                                    AMDGPU::SGPRRegBankID);
    bool SGPRSrcs = Op2Bank == AMDGPU::SGPRRegBankID &&
                    Op3Bank == AMDGPU::SGPRRegBankID;

    unsigned CondBankDefault = SGPRSrcs ?
      AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID;
    unsigned CondBank = getRegBankID(MI.getOperand(1).getReg(), MRI,
                                     CondBankDefault);
    if (CondBank == AMDGPU::SGPRRegBankID)
      CondBank = SGPRSrcs ? AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID;
    else if (CondBank == AMDGPU::VGPRRegBankID)
      CondBank = AMDGPU::VCCRegBankID;

    unsigned Bank = SGPRSrcs && CondBank == AMDGPU::SGPRRegBankID ?
      AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;

    assert(CondBank == AMDGPU::VCCRegBankID || CondBank == AMDGPU::SGPRRegBankID);

    // TODO: Should report 32-bit for scalar condition type.
    if (Size == 64) {
      OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
      OpdsMapping[1] = AMDGPU::getValueMapping(CondBank, 1);
      OpdsMapping[2] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
      OpdsMapping[3] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
    } else {
      OpdsMapping[0] = AMDGPU::getValueMapping(Bank, Size);
      OpdsMapping[1] = AMDGPU::getValueMapping(CondBank, 1);
      OpdsMapping[2] = AMDGPU::getValueMapping(Bank, Size);
      OpdsMapping[3] = AMDGPU::getValueMapping(Bank, Size);
    }

    break;
  }

  case AMDGPU::G_LOAD:
  case AMDGPU::G_ZEXTLOAD:
  case AMDGPU::G_SEXTLOAD:
    return getInstrMappingForLoad(MI);

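  // RMW atomics return and take VGPR data; the pointer mapping is chosen by
  // getValueMappingForPtr based on the pointer's address space and current
  // bank.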
  case AMDGPU::G_ATOMICRMW_XCHG:
  case AMDGPU::G_ATOMICRMW_ADD:
  case AMDGPU::G_ATOMICRMW_SUB:
  case AMDGPU::G_ATOMICRMW_AND:
  case AMDGPU::G_ATOMICRMW_OR:
  case AMDGPU::G_ATOMICRMW_XOR:
  case AMDGPU::G_ATOMICRMW_MAX:
  case AMDGPU::G_ATOMICRMW_MIN:
  case AMDGPU::G_ATOMICRMW_UMAX:
  case AMDGPU::G_ATOMICRMW_UMIN:
  case AMDGPU::G_ATOMICRMW_FADD:
  case AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG:
  case AMDGPU::G_AMDGPU_ATOMIC_INC:
  case AMDGPU::G_AMDGPU_ATOMIC_DEC:
  case AMDGPU::G_AMDGPU_ATOMIC_FMIN:
  case AMDGPU::G_AMDGPU_ATOMIC_FMAX: {
    OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
    OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg());
    OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
    break;
  }
  case AMDGPU::G_ATOMIC_CMPXCHG: {
    OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
    OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg());
    OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
    OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
    break;
  }
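  // The branch condition is either a scalar boolean in an SGPR or a lane
  // mask; a condition in any other bank is treated as a VCC value.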
  case AMDGPU::G_BRCOND: {
    unsigned Bank = getRegBankID(MI.getOperand(0).getReg(), MRI,
                                 AMDGPU::SGPRRegBankID);
    assert(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1);
    if (Bank != AMDGPU::SGPRRegBankID)
      Bank = AMDGPU::VCCRegBankID;

    OpdsMapping[0] = AMDGPU::getValueMapping(Bank, 1);
    break;
  }
  }

  return getInstructionMapping(/*ID*/1, /*Cost*/1,
                               getOperandsMapping(OpdsMapping),
                               MI.getNumOperands());
}