1 //===- AMDGPURegisterBankInfo.cpp -------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements the targeting of the RegisterBankInfo class for
10 /// AMDGPU.
11 ///
12 /// \par
13 ///
14 /// AMDGPU has unique register bank constraints that require special high level
15 /// strategies to deal with. There are two main true physical register banks:
16 /// VGPR (vector) and SGPR (scalar). Additionally, the VCC register bank is a
17 /// sort of pseudo-register bank needed to represent SGPRs used in a vector
18 /// boolean context. There is also the AGPR bank, which is a special purpose
19 /// physical register bank present on some subtargets.
20 ///
21 /// Copying from VGPR to SGPR is generally illegal, unless the value is known to
22 /// be uniform. It is generally not valid to legalize operands by inserting
23 /// copies as on other targets. Operations which require uniform, SGPR operands
24 /// generally require scalarization by repeatedly executing the instruction,
25 /// activating each set of lanes using a unique set of input values. This is
26 /// referred to as a waterfall loop.
27 ///
28 /// \par Booleans
29 ///
30 /// Booleans (s1 values) require special consideration. A vector compare result
31 /// is naturally a bitmask with one bit per lane, in a 32 or 64-bit
32 /// register. These are represented with the VCC bank. During selection, we need
33 /// to be able to unambiguously go back from a register class to a register
34 /// bank. To distinguish whether an SGPR should use the SGPR or VCC register
35 /// bank, we need to know the use context type. An SGPR with an s1 type always
36 /// means the VCC bank; other types mean the SGPR bank. A scalar compare sets
37 /// SCC, which is a 1-bit unaddressable register. This will need to be copied to
38 /// a 32-bit virtual register. Taken together, this means we need to adjust the
39 /// type of boolean operations to be regbank legal. All SALU booleans need to be
40 /// widened to 32-bits, and all VALU booleans need to be s1 values.
41 ///
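/// As a rough sketch in regbank-annotated generic MIR (illustrative only, not
/// verbatim compiler output), a divergent boolean stays an s1 value on the VCC
/// bank:
///
///   %c:vcc(s1) = G_ICMP intpred(eq), %a:vgpr(s32), %b:vgpr(s32)
///   %r:vgpr(s32) = G_SELECT %c:vcc(s1), %x:vgpr(s32), %y:vgpr(s32)
///
/// while the equivalent uniform condition is widened to s32 and kept on the
/// SGPR bank, with the scalar compare result flowing through SCC.
///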
42 /// A noteworthy exception to the s1-means-vcc rule is for legalization artifact
43 /// casts. G_TRUNC s1 results, and G_SEXT/G_ZEXT/G_ANYEXT sources are never vcc
44 /// bank. A non-boolean source (such as a truncate from a 1-bit load from
45 /// memory) will require a copy to the VCC bank which will require clearing the
46 /// high bits and inserting a compare.
47 ///
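/// For example, a hedged sketch of the expected shape (not verbatim output):
///
///   %v:vgpr(s32) = G_LOAD %p:vgpr(p1) :: (load (s8))
///   %b:vgpr(s1) = G_TRUNC %v:vgpr(s32)
///
/// The G_TRUNC result %b is never assigned the VCC bank. If it is later used
/// as a vector condition, a real lane mask has to be materialized,
/// conceptually an AND with 1 followed by a compare with zero.
///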
48 /// \par Constant bus restriction
49 ///
50 /// VALU instructions have a limitation known as the constant bus
51 /// restriction. Most VALU instructions can use SGPR operands, but may read at
52 /// most 1 SGPR or constant literal value (this is relaxed to 2 in gfx10 for most
53 /// instructions). The limit is on unique SGPRs, so the same SGPR may be used for
54 /// multiple operands. From a register bank perspective, any combination of
55 /// operands should be legal as an SGPR, but this is contextually dependent on
56 /// the SGPR operands all being the same register. It is therefore optimal to
57 /// choose the SGPR with the most uses to minimize the number of copies.
58 ///
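/// For example (VOP3-encoded, illustrative): v_add_f32 v0, s0, s0 reads a
/// single unique SGPR and is legal, while v_add_f32 v0, s0, s1 reads two
/// unique SGPRs and violates the restriction on targets with a single constant
/// bus read per instruction.
///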
59 /// We avoid trying to solve this problem in RegBankSelect. Any VALU G_*
60 /// operation should have its source operands all mapped to VGPRs (except for
61 /// VCC), inserting copies from any SGPR operands. This is the most trivial legal
62 /// mapping. Anything beyond the simplest 1:1 instruction selection would be too
63 /// complicated to solve here. Every optimization pattern or instruction
64 /// selected to multiple outputs would have to enforce this rule, and there
65 /// would be additional complexity in tracking this rule for every G_*
66 /// operation. By forcing all inputs to VGPRs, it also simplifies the task of
67 /// picking the optimal operand combination from a post-isel optimization pass.
68 ///
69 //===----------------------------------------------------------------------===//
70 
71 #include "AMDGPURegisterBankInfo.h"
72 
73 #include "AMDGPU.h"
74 #include "AMDGPUGlobalISelUtils.h"
75 #include "AMDGPUInstrInfo.h"
76 #include "GCNSubtarget.h"
77 #include "SIMachineFunctionInfo.h"
78 #include "SIRegisterInfo.h"
79 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
80 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
81 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
82 #include "llvm/CodeGen/GlobalISel/RegisterBank.h"
83 #include "llvm/IR/IntrinsicsAMDGPU.h"
84 
85 #define GET_TARGET_REGBANK_IMPL
86 #include "AMDGPUGenRegisterBank.inc"
87 
88 // This file will be TableGen'ed at some point.
89 #include "AMDGPUGenRegisterBankInfo.def"
90 
91 using namespace llvm;
92 using namespace MIPatternMatch;
93 
94 namespace {
95 
96 // Observer to apply a register bank to new registers created by LegalizerHelper.
97 class ApplyRegBankMapping final : public GISelChangeObserver {
98 private:
99   const AMDGPURegisterBankInfo &RBI;
100   MachineRegisterInfo &MRI;
101   const RegisterBank *NewBank;
102   SmallVector<MachineInstr *, 4> NewInsts;
103 
104 public:
105   ApplyRegBankMapping(const AMDGPURegisterBankInfo &RBI_,
106                       MachineRegisterInfo &MRI_, const RegisterBank *RB)
107     : RBI(RBI_), MRI(MRI_), NewBank(RB) {}
108 
109   ~ApplyRegBankMapping() {
110     for (MachineInstr *MI : NewInsts)
111       applyBank(*MI);
112   }
113 
114   /// Set any registers that don't yet have a register class or bank to the new bank.
115   void applyBank(MachineInstr &MI) {
116     const unsigned Opc = MI.getOpcode();
117     if (Opc == AMDGPU::G_ANYEXT || Opc == AMDGPU::G_ZEXT ||
118         Opc == AMDGPU::G_SEXT) {
119       // LegalizerHelper wants to use the basic legalization artifacts when
120       // widening etc. We don't handle selection with vcc in artifact sources,
121       // so we need to use a select instead to handle these properly.
122       Register DstReg = MI.getOperand(0).getReg();
123       Register SrcReg = MI.getOperand(1).getReg();
124       const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, MRI, *RBI.TRI);
125       if (SrcBank == &AMDGPU::VCCRegBank) {
126         const LLT S32 = LLT::scalar(32);
127         assert(MRI.getType(SrcReg) == LLT::scalar(1));
128         assert(MRI.getType(DstReg) == S32);
129         assert(NewBank == &AMDGPU::VGPRRegBank);
130 
131         // Replace the extension with a select, which really uses the boolean
132         // source.
133         MachineIRBuilder B(MI);
134         auto True = B.buildConstant(S32, Opc == AMDGPU::G_SEXT ? -1 : 1);
135         auto False = B.buildConstant(S32, 0);
136         B.buildSelect(DstReg, SrcReg, True, False);
137         MRI.setRegBank(True.getReg(0), *NewBank);
138         MRI.setRegBank(False.getReg(0), *NewBank);
139         MI.eraseFromParent();
140       }
141 
142       assert(!MRI.getRegClassOrRegBank(DstReg));
143       MRI.setRegBank(DstReg, *NewBank);
144       return;
145     }
146 
147 #ifndef NDEBUG
148     if (Opc == AMDGPU::G_TRUNC) {
149       Register DstReg = MI.getOperand(0).getReg();
150       const RegisterBank *DstBank = RBI.getRegBank(DstReg, MRI, *RBI.TRI);
151       assert(DstBank != &AMDGPU::VCCRegBank);
152     }
153 #endif
154 
155     for (MachineOperand &Op : MI.operands()) {
156       if (!Op.isReg())
157         continue;
158 
159       // We may see physical registers if building a real MI
160       Register Reg = Op.getReg();
161       if (Reg.isPhysical() || MRI.getRegClassOrRegBank(Reg))
162         continue;
163 
164       const RegisterBank *RB = NewBank;
165       if (MRI.getType(Reg) == LLT::scalar(1)) {
166         assert(NewBank == &AMDGPU::VGPRRegBank &&
167                "s1 operands should only be used for vector bools");
168         assert((MI.getOpcode() != AMDGPU::G_TRUNC &&
169                 MI.getOpcode() != AMDGPU::G_ANYEXT) &&
170                "not expecting legalization artifacts here");
171         RB = &AMDGPU::VCCRegBank;
172       }
173 
174       MRI.setRegBank(Reg, *RB);
175     }
176   }
177 
178   void erasingInstr(MachineInstr &MI) override {}
179 
180   void createdInstr(MachineInstr &MI) override {
181     // At this point, the instruction was just inserted and has no operands.
182     NewInsts.push_back(&MI);
183   }
184 
185   void changingInstr(MachineInstr &MI) override {}
186   void changedInstr(MachineInstr &MI) override {
187     // FIXME: In principle we should probably add the instruction to NewInsts,
188     // but the way the LegalizerHelper uses the observer, we will always see the
189     // registers we need to set the regbank on also referenced in a new
190     // instruction.
191   }
192 };
193 
194 } // end anonymous namespace

195 AMDGPURegisterBankInfo::AMDGPURegisterBankInfo(const GCNSubtarget &ST)
196     : AMDGPUGenRegisterBankInfo(),
197       Subtarget(ST),
198       TRI(Subtarget.getRegisterInfo()),
199       TII(Subtarget.getInstrInfo()) {
200 
201   // HACK: Until this is fully tablegen'd.
202   static llvm::once_flag InitializeRegisterBankFlag;
203 
204   static auto InitializeRegisterBankOnce = [this]() {
205     assert(&getRegBank(AMDGPU::SGPRRegBankID) == &AMDGPU::SGPRRegBank &&
206            &getRegBank(AMDGPU::VGPRRegBankID) == &AMDGPU::VGPRRegBank &&
207            &getRegBank(AMDGPU::AGPRRegBankID) == &AMDGPU::AGPRRegBank);
208     (void)this;
209   };
210 
211   llvm::call_once(InitializeRegisterBankFlag, InitializeRegisterBankOnce);
212 }
213 
214 static bool isVectorRegisterBank(const RegisterBank &Bank) {
215   unsigned BankID = Bank.getID();
216   return BankID == AMDGPU::VGPRRegBankID || BankID == AMDGPU::AGPRRegBankID;
217 }
218 
219 unsigned AMDGPURegisterBankInfo::copyCost(const RegisterBank &Dst,
220                                           const RegisterBank &Src,
221                                           unsigned Size) const {
222   // TODO: Should there be a UniformVGPRRegBank which can use readfirstlane?
223   if (Dst.getID() == AMDGPU::SGPRRegBankID &&
224       (isVectorRegisterBank(Src) || Src.getID() == AMDGPU::VCCRegBankID)) {
225     return std::numeric_limits<unsigned>::max();
226   }
227 
228   // Bool values are tricky, because the meaning is based on context. The SCC
229   // and VCC banks are for the natural scalar and vector conditions produced by
230   // a compare.
231   //
232   // Legalization doesn't know about the necessary context, so an s1 use may
233   // have been a truncate from an arbitrary value, in which case a copy (lowered
234   // as a compare with 0) needs to be inserted.
235   if (Size == 1 &&
236       (Dst.getID() == AMDGPU::SGPRRegBankID) &&
237       (isVectorRegisterBank(Src) ||
238        Src.getID() == AMDGPU::SGPRRegBankID ||
239        Src.getID() == AMDGPU::VCCRegBankID))
240     return std::numeric_limits<unsigned>::max();
241 
242   // There is no direct copy between AGPRs.
243   if (Dst.getID() == AMDGPU::AGPRRegBankID &&
244       Src.getID() == AMDGPU::AGPRRegBankID)
245     return 4;
246 
247   return RegisterBankInfo::copyCost(Dst, Src, Size);
248 }
249 
250 unsigned AMDGPURegisterBankInfo::getBreakDownCost(
251   const ValueMapping &ValMapping,
252   const RegisterBank *CurBank) const {
253   // Check if this is a breakdown for G_LOAD to move the pointer from SGPR to
254   // VGPR.
255   // FIXME: Is there a better way to do this?
256   if (ValMapping.NumBreakDowns >= 2 || ValMapping.BreakDown[0].Length >= 64)
257     return 10; // This is expensive.
258 
259   assert(ValMapping.NumBreakDowns == 2 &&
260          ValMapping.BreakDown[0].Length == 32 &&
261          ValMapping.BreakDown[0].StartIdx == 0 &&
262          ValMapping.BreakDown[1].Length == 32 &&
263          ValMapping.BreakDown[1].StartIdx == 32 &&
264          ValMapping.BreakDown[0].RegBank == ValMapping.BreakDown[1].RegBank);
265 
266   // 32-bit extract of a 64-bit value is just access of a subregister, so free.
267   // TODO: Cost of 0 hits assert, though it's not clear it's what we really
268   // want.
269 
270   // TODO: 32-bit insert to a 64-bit SGPR may incur a non-free copy due to SGPR
271   // alignment restrictions, but this probably isn't important.
272   return 1;
273 }
274 
275 const RegisterBank &
276 AMDGPURegisterBankInfo::getRegBankFromRegClass(const TargetRegisterClass &RC,
277                                                LLT Ty) const {
278   if (&RC == &AMDGPU::SReg_1RegClass)
279     return AMDGPU::VCCRegBank;
280 
281   // We promote real scalar booleans to SReg_32. Any SGPR using s1 is really a
282   // VCC-like use.
283   if (TRI->isSGPRClass(&RC)) {
284     // FIXME: This probably came from a copy from a physical register, which
285     // should be inferable from the copied to-type. We don't have many boolean
286     // physical register constraints so just assume a normal SGPR for now.
287     if (!Ty.isValid())
288       return AMDGPU::SGPRRegBank;
289 
290     return Ty == LLT::scalar(1) ? AMDGPU::VCCRegBank : AMDGPU::SGPRRegBank;
291   }
292 
293   return TRI->isAGPRClass(&RC) ? AMDGPU::AGPRRegBank : AMDGPU::VGPRRegBank;
294 }
295 
296 template <unsigned NumOps>
297 RegisterBankInfo::InstructionMappings
298 AMDGPURegisterBankInfo::addMappingFromTable(
299     const MachineInstr &MI, const MachineRegisterInfo &MRI,
300     const std::array<unsigned, NumOps> RegSrcOpIdx,
301     ArrayRef<OpRegBankEntry<NumOps>> Table) const {
302 
303   InstructionMappings AltMappings;
304 
305   SmallVector<const ValueMapping *, 10> Operands(MI.getNumOperands());
306 
307   unsigned Sizes[NumOps];
308   for (unsigned I = 0; I < NumOps; ++I) {
309     Register Reg = MI.getOperand(RegSrcOpIdx[I]).getReg();
310     Sizes[I] = getSizeInBits(Reg, MRI, *TRI);
311   }
312 
313   for (unsigned I = 0, E = MI.getNumExplicitDefs(); I != E; ++I) {
314     unsigned SizeI = getSizeInBits(MI.getOperand(I).getReg(), MRI, *TRI);
315     Operands[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SizeI);
316   }
317 
318   // getInstrMapping's default mapping uses ID 1, so start at 2.
319   unsigned MappingID = 2;
320   for (const auto &Entry : Table) {
321     for (unsigned I = 0; I < NumOps; ++I) {
322       int OpIdx = RegSrcOpIdx[I];
323       Operands[OpIdx] = AMDGPU::getValueMapping(Entry.RegBanks[I], Sizes[I]);
324     }
325 
326     AltMappings.push_back(&getInstructionMapping(MappingID++, Entry.Cost,
327                                                  getOperandsMapping(Operands),
328                                                  Operands.size()));
329   }
330 
331   return AltMappings;
332 }
333 
334 RegisterBankInfo::InstructionMappings
335 AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsic(
336     const MachineInstr &MI, const MachineRegisterInfo &MRI) const {
337   switch (MI.getIntrinsicID()) {
338   case Intrinsic::amdgcn_readlane: {
339     static const OpRegBankEntry<3> Table[2] = {
340       // Perfectly legal.
341       { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },
342 
343       // Need a readfirstlane for the index.
344       { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 }
345     };
346 
347     const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 2, 3 } };
348     return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
349   }
350   case Intrinsic::amdgcn_writelane: {
351     static const OpRegBankEntry<4> Table[4] = {
352       // Perfectly legal.
353       { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },
354 
355       // Need readfirstlane of first op
356       { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 },
357 
358       // Need readfirstlane of second op
359       { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 },
360 
361       // Need readfirstlane of both ops
362       { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 3 }
363     };
364 
365     // dst, value to write, lane select, previous value
366     const std::array<unsigned, 4> RegSrcOpIdx = { { 0, 2, 3, 4 } };
367     return addMappingFromTable<4>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
368   }
369   default:
370     return RegisterBankInfo::getInstrAlternativeMappings(MI);
371   }
372 }
373 
374 RegisterBankInfo::InstructionMappings
375 AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsicWSideEffects(
376     const MachineInstr &MI, const MachineRegisterInfo &MRI) const {
377 
378   switch (MI.getIntrinsicID()) {
379   case Intrinsic::amdgcn_s_buffer_load: {
380     static const OpRegBankEntry<2> Table[4] = {
381       // Perfectly legal.
382       { { AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },
383 
384       // Only need 1 register in loop
385       { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 300 },
386 
387       // Have to waterfall the resource.
388       { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1000 },
389 
390       // Have to waterfall the resource, and the offset.
391       { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1500 }
392     };
393 
394     // rsrc, offset
395     const std::array<unsigned, 2> RegSrcOpIdx = { { 2, 3 } };
396     return addMappingFromTable<2>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
397   }
398   case Intrinsic::amdgcn_ds_ordered_add:
399   case Intrinsic::amdgcn_ds_ordered_swap: {
400     // VGPR = M0, VGPR
401     static const OpRegBankEntry<3> Table[2] = {
402       // Perfectly legal.
403       { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID  }, 1 },
404 
405       // Need a readfirstlane for m0
406       { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 }
407     };
408 
409     const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 2, 3 } };
410     return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
411   }
412   case Intrinsic::amdgcn_s_sendmsg:
413   case Intrinsic::amdgcn_s_sendmsghalt: {
414     // FIXME: Should have no register for immediate
415     static const OpRegBankEntry<1> Table[2] = {
416       // Perfectly legal.
417       { { AMDGPU::SGPRRegBankID }, 1 },
418 
419       // Need readlane
420       { { AMDGPU::VGPRRegBankID }, 3 }
421     };
422 
423     const std::array<unsigned, 1> RegSrcOpIdx = { { 2 } };
424     return addMappingFromTable<1>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
425   }
426   default:
427     return RegisterBankInfo::getInstrAlternativeMappings(MI);
428   }
429 }
430 
431 static bool memOpHasNoClobbered(const MachineMemOperand *MMO) {
432   const Instruction *I = dyn_cast_or_null<Instruction>(MMO->getValue());
433   return I && I->getMetadata("amdgpu.noclobber");
434 }
435 
436 // FIXME: Returns uniform if there's no source value information. This is
437 // probably wrong.
438 static bool isScalarLoadLegal(const MachineInstr &MI) {
439   if (!MI.hasOneMemOperand())
440     return false;
441 
442   const MachineMemOperand *MMO = *MI.memoperands_begin();
443   const unsigned AS = MMO->getAddrSpace();
444   const bool IsConst = AS == AMDGPUAS::CONSTANT_ADDRESS ||
445                        AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT;
446   // Require 4-byte alignment.
447   return MMO->getAlign() >= Align(4) &&
448          // Can't do a scalar atomic load.
449          !MMO->isAtomic() &&
450          // Don't use scalar loads for volatile accesses to non-constant address
451          // spaces.
452          (IsConst || !MMO->isVolatile()) &&
453          // Memory must be known constant, or not written before this load.
454          (IsConst || MMO->isInvariant() || memOpHasNoClobbered(MMO)) &&
455          AMDGPUInstrInfo::isUniformMMO(MMO);
456 }
457 
458 RegisterBankInfo::InstructionMappings
459 AMDGPURegisterBankInfo::getInstrAlternativeMappings(
460     const MachineInstr &MI) const {
461 
462   const MachineFunction &MF = *MI.getParent()->getParent();
463   const MachineRegisterInfo &MRI = MF.getRegInfo();
464 
465 
466   InstructionMappings AltMappings;
467   switch (MI.getOpcode()) {
468   case TargetOpcode::G_CONSTANT: {
469     unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
470     if (Size == 1) {
471       static const OpRegBankEntry<1> Table[3] = {
472         { { AMDGPU::VGPRRegBankID }, 1 },
473         { { AMDGPU::SGPRRegBankID }, 1 },
474         { { AMDGPU::VCCRegBankID }, 1 }
475       };
476 
477       return addMappingFromTable<1>(MI, MRI, {{ 0 }}, Table);
478     }
479 
480     LLVM_FALLTHROUGH;
481   }
482   case TargetOpcode::G_FCONSTANT:
483   case TargetOpcode::G_FRAME_INDEX:
484   case TargetOpcode::G_GLOBAL_VALUE: {
485     static const OpRegBankEntry<1> Table[2] = {
486       { { AMDGPU::VGPRRegBankID }, 1 },
487       { { AMDGPU::SGPRRegBankID }, 1 }
488     };
489 
490     return addMappingFromTable<1>(MI, MRI, {{ 0 }}, Table);
491   }
492   case TargetOpcode::G_AND:
493   case TargetOpcode::G_OR:
494   case TargetOpcode::G_XOR: {
495     unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
496 
497     if (Size == 1) {
498       // s_{and|or|xor}_b32 set scc when the result of the 32-bit op is not 0.
499       const InstructionMapping &SCCMapping = getInstructionMapping(
500         1, 1, getOperandsMapping(
501           {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32),
502            AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32),
503            AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32)}),
504         3); // Num Operands
505       AltMappings.push_back(&SCCMapping);
506 
507       const InstructionMapping &VCCMapping0 = getInstructionMapping(
508         2, 1, getOperandsMapping(
509           {AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size),
510            AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size),
511            AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size)}),
512         3); // Num Operands
513       AltMappings.push_back(&VCCMapping0);
514       return AltMappings;
515     }
516 
517     if (Size != 64)
518       break;
519 
520     const InstructionMapping &SSMapping = getInstructionMapping(
521       1, 1, getOperandsMapping(
522         {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
523          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
524          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
525       3); // Num Operands
526     AltMappings.push_back(&SSMapping);
527 
528     const InstructionMapping &VVMapping = getInstructionMapping(
529       2, 2, getOperandsMapping(
530         {AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
531          AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
532          AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}),
533       3); // Num Operands
534     AltMappings.push_back(&VVMapping);
535     break;
536   }
537   case TargetOpcode::G_LOAD:
538   case TargetOpcode::G_ZEXTLOAD:
539   case TargetOpcode::G_SEXTLOAD: {
540     unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
541     LLT PtrTy = MRI.getType(MI.getOperand(1).getReg());
542     unsigned PtrSize = PtrTy.getSizeInBits();
543     unsigned AS = PtrTy.getAddressSpace();
544 
545     if ((AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::REGION_ADDRESS &&
546          AS != AMDGPUAS::PRIVATE_ADDRESS) &&
547         isScalarLoadLegal(MI)) {
548       const InstructionMapping &SSMapping = getInstructionMapping(
549           1, 1, getOperandsMapping(
550                     {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
551                      AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize)}),
552           2); // Num Operands
553       AltMappings.push_back(&SSMapping);
554     }
555 
556     const InstructionMapping &VVMapping = getInstructionMapping(
557         2, 1,
558         getOperandsMapping(
559             {AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
560              AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize)}),
561         2); // Num Operands
562     AltMappings.push_back(&VVMapping);
563 
564     // It may be possible to have a vgpr = load sgpr mapping here, because
565     // the mubuf instructions support this kind of load, but probably for only
566     // gfx7 and older.  However, the addressing mode matching in the instruction
567     // selector should be able to do a better job of detecting and selecting
568     // these kinds of loads from the vgpr = load vgpr mapping.
569 
570     return AltMappings;
571 
572   }
573   case TargetOpcode::G_SELECT: {
574     unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
575     const InstructionMapping &SSMapping = getInstructionMapping(1, 1,
576       getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
577                           AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1),
578                           AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
579                           AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
580       4); // Num Operands
581     AltMappings.push_back(&SSMapping);
582 
583     const InstructionMapping &VVMapping = getInstructionMapping(2, 1,
584       getOperandsMapping({AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
585                           AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
586                           AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
587                           AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}),
588       4); // Num Operands
589     AltMappings.push_back(&VVMapping);
590 
591     return AltMappings;
592   }
593   case TargetOpcode::G_UADDE:
594   case TargetOpcode::G_USUBE:
595   case TargetOpcode::G_SADDE:
596   case TargetOpcode::G_SSUBE: {
597     unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
598     const InstructionMapping &SSMapping = getInstructionMapping(1, 1,
599       getOperandsMapping(
600         {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
601          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1),
602          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
603          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
604          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1)}),
605       5); // Num Operands
606     AltMappings.push_back(&SSMapping);
607 
608     const InstructionMapping &VVMapping = getInstructionMapping(2, 1,
609       getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
610                           AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
611                           AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
612                           AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
613                           AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1)}),
614       5); // Num Operands
615     AltMappings.push_back(&VVMapping);
616     return AltMappings;
617   }
618   case AMDGPU::G_BRCOND: {
619     assert(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1);
620 
621     // TODO: Change type to 32 for scalar
622     const InstructionMapping &SMapping = getInstructionMapping(
623       1, 1, getOperandsMapping(
624         {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1), nullptr}),
625       2); // Num Operands
626     AltMappings.push_back(&SMapping);
627 
628     const InstructionMapping &VMapping = getInstructionMapping(
629       1, 1, getOperandsMapping(
630         {AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1), nullptr }),
631       2); // Num Operands
632     AltMappings.push_back(&VMapping);
633     return AltMappings;
634   }
635   case AMDGPU::G_INTRINSIC:
636     return getInstrAlternativeMappingsIntrinsic(MI, MRI);
637   case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS:
638     return getInstrAlternativeMappingsIntrinsicWSideEffects(MI, MRI);
639   default:
640     break;
641   }
642   return RegisterBankInfo::getInstrAlternativeMappings(MI);
643 }
644 
645 void AMDGPURegisterBankInfo::split64BitValueForMapping(
646   MachineIRBuilder &B,
647   SmallVector<Register, 2> &Regs,
648   LLT HalfTy,
649   Register Reg) const {
650   assert(HalfTy.getSizeInBits() == 32);
651   MachineRegisterInfo *MRI = B.getMRI();
652   Register LoLHS = MRI->createGenericVirtualRegister(HalfTy);
653   Register HiLHS = MRI->createGenericVirtualRegister(HalfTy);
654   const RegisterBank *Bank = getRegBank(Reg, *MRI, *TRI);
655   MRI->setRegBank(LoLHS, *Bank);
656   MRI->setRegBank(HiLHS, *Bank);
657 
658   Regs.push_back(LoLHS);
659   Regs.push_back(HiLHS);
660 
661   B.buildInstr(AMDGPU::G_UNMERGE_VALUES)
662     .addDef(LoLHS)
663     .addDef(HiLHS)
664     .addUse(Reg);
665 }
666 
667 /// Replace the current type each register in \p Regs has with \p NewTy
668 static void setRegsToType(MachineRegisterInfo &MRI, ArrayRef<Register> Regs,
669                           LLT NewTy) {
670   for (Register Reg : Regs) {
671     assert(MRI.getType(Reg).getSizeInBits() == NewTy.getSizeInBits());
672     MRI.setType(Reg, NewTy);
673   }
674 }
675 
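// A few illustrative cases for the helper below (assumed from its
// implementation):
//   getHalfSizedType(s64)       -> s32
//   getHalfSizedType(<4 x s32>) -> <2 x s32>
//   getHalfSizedType(<2 x s64>) -> s64 (an element count of 1 folds to a scalar)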
676 static LLT getHalfSizedType(LLT Ty) {
677   if (Ty.isVector()) {
678     assert(Ty.getElementCount().isKnownMultipleOf(2));
679     return LLT::scalarOrVector(Ty.getElementCount().divideCoefficientBy(2),
680                                Ty.getElementType());
681   }
682 
683   assert(Ty.getScalarSizeInBits() % 2 == 0);
684   return LLT::scalar(Ty.getScalarSizeInBits() / 2);
685 }
686 
687 /// Legalize instruction \p MI where operands in \p OpIndices must be SGPRs. If
688 /// any of the required SGPR operands are VGPRs, perform a waterfall loop to
689 /// execute the instruction for each unique combination of values in all lanes
690 /// in the wave. The block will be split such that the rest of the instructions
691 /// are moved to a new block.
692 ///
693 /// Essentially performs this loop:
694 ///
695 /// Save Execution Mask
696 /// For (Lane : Wavefront) {
697 ///   Enable Lane, Disable all other lanes
698 ///   SGPR = read SGPR value for current lane from VGPR
699 ///   VGPRResult[Lane] = use_op SGPR
700 /// }
701 /// Restore Execution Mask
702 ///
703 /// There is additional complexity in comparing the operand values to identify
704 /// the unique values used, so all lanes that share a value are handled together.
705 bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
706   MachineIRBuilder &B,
707   iterator_range<MachineBasicBlock::iterator> Range,
708   SmallSet<Register, 4> &SGPROperandRegs,
709   MachineRegisterInfo &MRI) const {
710 
711   // Track use registers which have already been expanded with a readfirstlane
712   // sequence. This may have multiple uses if moving a sequence.
713   DenseMap<Register, Register> WaterfalledRegMap;
714 
715   MachineBasicBlock &MBB = B.getMBB();
716   MachineFunction *MF = &B.getMF();
717 
718   const TargetRegisterClass *WaveRC = TRI->getWaveMaskRegClass();
719   const unsigned WaveAndOpc = Subtarget.isWave32() ?
720     AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
721   const unsigned MovExecOpc =
722       Subtarget.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
723   const unsigned MovExecTermOpc =
724       Subtarget.isWave32() ? AMDGPU::S_MOV_B32_term : AMDGPU::S_MOV_B64_term;
725 
726   const unsigned XorTermOpc = Subtarget.isWave32() ?
727     AMDGPU::S_XOR_B32_term : AMDGPU::S_XOR_B64_term;
728   const unsigned AndSaveExecOpc =  Subtarget.isWave32() ?
729     AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64;
730   const unsigned ExecReg =  Subtarget.isWave32() ?
731     AMDGPU::EXEC_LO : AMDGPU::EXEC;
732 
733 #ifndef NDEBUG
734   const int OrigRangeSize = std::distance(Range.begin(), Range.end());
735 #endif
736 
737   Register SaveExecReg = MRI.createVirtualRegister(WaveRC);
738   Register InitSaveExecReg = MRI.createVirtualRegister(WaveRC);
739 
740   // Don't bother using generic instructions/registers for the exec mask.
741   B.buildInstr(TargetOpcode::IMPLICIT_DEF)
742     .addDef(InitSaveExecReg);
743 
744   Register PhiExec = MRI.createVirtualRegister(WaveRC);
745   Register NewExec = MRI.createVirtualRegister(WaveRC);
746 
747   // To insert the loop we need to split the block. The instructions in the range
748   // move into a new loop block, and everything after them into a remainder block.
749   MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
750   MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
751   MachineBasicBlock *RestoreExecBB = MF->CreateMachineBasicBlock();
752   MachineFunction::iterator MBBI(MBB);
753   ++MBBI;
754   MF->insert(MBBI, LoopBB);
755   MF->insert(MBBI, RestoreExecBB);
756   MF->insert(MBBI, RemainderBB);
757 
758   LoopBB->addSuccessor(RestoreExecBB);
759   LoopBB->addSuccessor(LoopBB);
760 
761   // Move the rest of the block into a new block.
762   RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
763   RemainderBB->splice(RemainderBB->begin(), &MBB, Range.end(), MBB.end());
764 
765   MBB.addSuccessor(LoopBB);
766   RestoreExecBB->addSuccessor(RemainderBB);
767 
768   B.setInsertPt(*LoopBB, LoopBB->end());
769 
770   B.buildInstr(TargetOpcode::PHI)
771     .addDef(PhiExec)
772     .addReg(InitSaveExecReg)
773     .addMBB(&MBB)
774     .addReg(NewExec)
775     .addMBB(LoopBB);
776 
777   const DebugLoc &DL = B.getDL();
778 
779   MachineInstr &FirstInst = *Range.begin();
780 
781   // Move the instruction into the loop. Note we moved everything after
782   // Range.end() already into a new block, so Range.end() is no longer valid.
783   LoopBB->splice(LoopBB->end(), &MBB, Range.begin(), MBB.end());
784 
785   // Figure out the iterator range after splicing the instructions.
786   MachineBasicBlock::iterator NewBegin = FirstInst.getIterator();
787   auto NewEnd = LoopBB->end();
788 
789   MachineBasicBlock::iterator I = Range.begin();
790   B.setInsertPt(*LoopBB, I);
791 
792   Register CondReg;
793 
794   assert(std::distance(NewBegin, NewEnd) == OrigRangeSize);
795 
796   for (MachineInstr &MI : make_range(NewBegin, NewEnd)) {
797     for (MachineOperand &Op : MI.uses()) {
798       if (!Op.isReg() || Op.isDef())
799         continue;
800 
801       Register OldReg = Op.getReg();
802       if (!SGPROperandRegs.count(OldReg))
803         continue;
804 
805       // See if we already processed this register in another instruction in the
806       // sequence.
807       auto OldVal = WaterfalledRegMap.find(OldReg);
808       if (OldVal != WaterfalledRegMap.end()) {
809         Op.setReg(OldVal->second);
810         continue;
811       }
812 
813       Register OpReg = Op.getReg();
814       LLT OpTy = MRI.getType(OpReg);
815 
816       const RegisterBank *OpBank = getRegBank(OpReg, MRI, *TRI);
817       if (OpBank != &AMDGPU::VGPRRegBank) {
818         // Insert copy from AGPR to VGPR before the loop.
819         B.setMBB(MBB);
820         OpReg = B.buildCopy(OpTy, OpReg).getReg(0);
821         MRI.setRegBank(OpReg, AMDGPU::VGPRRegBank);
822         B.setInstr(*I);
823       }
824 
825       unsigned OpSize = OpTy.getSizeInBits();
826 
827       // Can only do a readlane of 32-bit pieces.
828       if (OpSize == 32) {
829         // Avoid extra copies in the simple case of one 32-bit register.
830         Register CurrentLaneOpReg
831           = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
832         MRI.setType(CurrentLaneOpReg, OpTy);
833 
834         constrainGenericRegister(OpReg, AMDGPU::VGPR_32RegClass, MRI);
835         // Read the next variant <- also loop target.
836         BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
837                 CurrentLaneOpReg)
838           .addReg(OpReg);
839 
840         Register NewCondReg = MRI.createVirtualRegister(WaveRC);
841         bool First = CondReg == AMDGPU::NoRegister;
842         if (First)
843           CondReg = NewCondReg;
844 
845         // Compare the SGPR value just read back against the value in each lane.
846         B.buildInstr(AMDGPU::V_CMP_EQ_U32_e64)
847           .addDef(NewCondReg)
848           .addReg(CurrentLaneOpReg)
849           .addReg(OpReg);
850         Op.setReg(CurrentLaneOpReg);
851 
852         if (!First) {
853           Register AndReg = MRI.createVirtualRegister(WaveRC);
854 
855           // If there are multiple operands to consider, AND the conditions together.
856           B.buildInstr(WaveAndOpc)
857             .addDef(AndReg)
858             .addReg(NewCondReg)
859             .addReg(CondReg);
860           CondReg = AndReg;
861         }
862       } else {
863         LLT S32 = LLT::scalar(32);
864         SmallVector<Register, 8> ReadlanePieces;
865 
866         // The compares can be done as 64-bit, but the extract needs to be done
867         // in 32-bit pieces.
868 
869         bool Is64 = OpSize % 64 == 0;
870 
871         unsigned UnmergeTySize = Is64 ? 64 : 32;
872         unsigned CmpOp =
873             Is64 ? AMDGPU::V_CMP_EQ_U64_e64 : AMDGPU::V_CMP_EQ_U32_e64;
874 
875         // Insert the unmerge before the loop.
876 
877         B.setMBB(MBB);
878         unsigned NumPieces = OpSize / UnmergeTySize;
879         SmallVector<Register, 8> UnmergePieces;
880         if (NumPieces == 1) {
881           UnmergePieces.push_back(OpReg);
882         } else {
883           LLT UnmergeTy = LLT::scalar(UnmergeTySize);
884           MachineInstrBuilder Unmerge = B.buildUnmerge(UnmergeTy, OpReg);
885           for (unsigned PieceIdx = 0; PieceIdx != NumPieces; ++PieceIdx)
886             UnmergePieces.push_back(Unmerge.getReg(PieceIdx));
887         }
888         B.setInstr(*I);
889 
890         for (Register UnmergePiece : UnmergePieces) {
891           Register CurrentLaneOpReg;
892           if (Is64) {
893             Register CurrentLaneOpRegLo = MRI.createGenericVirtualRegister(S32);
894             Register CurrentLaneOpRegHi = MRI.createGenericVirtualRegister(S32);
895 
896             MRI.setRegClass(UnmergePiece, &AMDGPU::VReg_64RegClass);
897             MRI.setRegClass(CurrentLaneOpRegLo, &AMDGPU::SReg_32_XM0RegClass);
898             MRI.setRegClass(CurrentLaneOpRegHi, &AMDGPU::SReg_32_XM0RegClass);
899 
900             // Read the next variant <- also loop target.
901             BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
902                     CurrentLaneOpRegLo)
903               .addReg(UnmergePiece, 0, AMDGPU::sub0);
904 
905             // Read the next variant <- also loop target.
906             BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
907                     CurrentLaneOpRegHi)
908               .addReg(UnmergePiece, 0, AMDGPU::sub1);
909 
910             CurrentLaneOpReg =
911               B.buildMerge(LLT::scalar(64),
912                            {CurrentLaneOpRegLo, CurrentLaneOpRegHi})
913               .getReg(0);
914 
915             MRI.setRegClass(CurrentLaneOpReg, &AMDGPU::SReg_64_XEXECRegClass);
916 
917             if (OpTy.getScalarSizeInBits() == 64) {
918               // If we need to produce a 64-bit element vector, use the
919               // merged pieces.
920               ReadlanePieces.push_back(CurrentLaneOpReg);
921             } else {
922               // 32-bit element type.
923               ReadlanePieces.push_back(CurrentLaneOpRegLo);
924               ReadlanePieces.push_back(CurrentLaneOpRegHi);
925             }
926           } else {
927             CurrentLaneOpReg = MRI.createGenericVirtualRegister(S32);
928             MRI.setRegClass(UnmergePiece, &AMDGPU::VGPR_32RegClass);
929             MRI.setRegClass(CurrentLaneOpReg, &AMDGPU::SReg_32_XM0RegClass);
930 
931             // Read the next variant <- also loop target.
932             BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
933                     CurrentLaneOpReg)
934               .addReg(UnmergePiece);
935             ReadlanePieces.push_back(CurrentLaneOpReg);
936           }
937 
938           Register NewCondReg = MRI.createVirtualRegister(WaveRC);
939           bool First = CondReg == AMDGPU::NoRegister;
940           if (First)
941             CondReg = NewCondReg;
942 
943           B.buildInstr(CmpOp)
944             .addDef(NewCondReg)
945             .addReg(CurrentLaneOpReg)
946             .addReg(UnmergePiece);
947 
948           if (!First) {
949             Register AndReg = MRI.createVirtualRegister(WaveRC);
950 
951             // If there are multiple operands to consider, AND the conditions together.
952             B.buildInstr(WaveAndOpc)
953               .addDef(AndReg)
954               .addReg(NewCondReg)
955               .addReg(CondReg);
956             CondReg = AndReg;
957           }
958         }
959 
960         // FIXME: Build merge seems to switch to CONCAT_VECTORS but not
961         // BUILD_VECTOR
962         if (OpTy.isVector()) {
963           auto Merge = B.buildBuildVector(OpTy, ReadlanePieces);
964           Op.setReg(Merge.getReg(0));
965           MRI.setRegBank(Op.getReg(), AMDGPU::SGPRRegBank);
966         } else if (ReadlanePieces.size() > 1) {
967           auto Merge = B.buildMerge(OpTy, ReadlanePieces);
968           Op.setReg(Merge.getReg(0));
969           MRI.setRegBank(Op.getReg(), AMDGPU::SGPRRegBank);
970         } else {
971           Op.setReg(ReadlanePieces[0]);
972         }
973       }
974 
975       // Make sure we don't re-process this register again.
976       WaterfalledRegMap.insert(std::make_pair(OldReg, Op.getReg()));
977     }
978   }
979 
980   // Update EXEC, save the original EXEC value to VCC.
981   B.buildInstr(AndSaveExecOpc)
982     .addDef(NewExec)
983     .addReg(CondReg, RegState::Kill);
984 
985   MRI.setSimpleHint(NewExec, CondReg);
986 
987   B.setInsertPt(*LoopBB, LoopBB->end());
988 
989   // Update EXEC, switch all done bits to 0 and all todo bits to 1.
990   B.buildInstr(XorTermOpc)
991     .addDef(ExecReg)
992     .addReg(ExecReg)
993     .addReg(NewExec);
994 
995   // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
996   // s_cbranch_scc0?
997 
998   // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
999   B.buildInstr(AMDGPU::SI_WATERFALL_LOOP).addMBB(LoopBB);
1000 
1001   // Save the EXEC mask before the loop.
1002   BuildMI(MBB, MBB.end(), DL, TII->get(MovExecOpc), SaveExecReg)
1003     .addReg(ExecReg);
1004 
1005   // Restore the EXEC mask after the loop.
1006   B.setMBB(*RestoreExecBB);
1007   B.buildInstr(MovExecTermOpc)
1008     .addDef(ExecReg)
1009     .addReg(SaveExecReg);
1010 
1011   // Set the insert point after the original instruction, so any new
1012   // instructions will be in the remainder.
1013   B.setInsertPt(*RemainderBB, RemainderBB->begin());
1014 
1015   return true;
1016 }
1017 
1018 // Return any unique registers used by \p MI at \p OpIndices that need to be
1019 // handled in a waterfall loop. Returns these registers in \p
1020 // SGPROperandRegs. Returns true if there are any operands to handle and a
1021 // waterfall loop is necessary.
1022 bool AMDGPURegisterBankInfo::collectWaterfallOperands(
1023   SmallSet<Register, 4> &SGPROperandRegs, MachineInstr &MI,
1024   MachineRegisterInfo &MRI, ArrayRef<unsigned> OpIndices) const {
1025   for (unsigned Op : OpIndices) {
1026     assert(MI.getOperand(Op).isUse());
1027     Register Reg = MI.getOperand(Op).getReg();
1028     const RegisterBank *OpBank = getRegBank(Reg, MRI, *TRI);
1029     if (OpBank->getID() != AMDGPU::SGPRRegBankID)
1030       SGPROperandRegs.insert(Reg);
1031   }
1032 
1033   // No operands need to be replaced, so no need to loop.
1034   return !SGPROperandRegs.empty();
1035 }
1036 
1037 bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
1038   MachineIRBuilder &B, MachineInstr &MI, MachineRegisterInfo &MRI,
1039   ArrayRef<unsigned> OpIndices) const {
1040   // Use a set to avoid extra readfirstlanes in the case where multiple operands
1041   // are the same register.
1042   SmallSet<Register, 4> SGPROperandRegs;
1043 
1044   if (!collectWaterfallOperands(SGPROperandRegs, MI, MRI, OpIndices))
1045     return false;
1046 
1047   MachineBasicBlock::iterator I = MI.getIterator();
1048   return executeInWaterfallLoop(B, make_range(I, std::next(I)),
1049                                 SGPROperandRegs, MRI);
1050 }
1051 
1052 bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
1053   MachineInstr &MI, MachineRegisterInfo &MRI,
1054   ArrayRef<unsigned> OpIndices) const {
1055   MachineIRBuilder B(MI);
1056   return executeInWaterfallLoop(B, MI, MRI, OpIndices);
1057 }
1058 
1059 // Legalize an operand that must be an SGPR by inserting a readfirstlane.
1060 void AMDGPURegisterBankInfo::constrainOpWithReadfirstlane(
1061     MachineInstr &MI, MachineRegisterInfo &MRI, unsigned OpIdx) const {
1062   Register Reg = MI.getOperand(OpIdx).getReg();
1063   const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
1064   if (Bank == &AMDGPU::SGPRRegBank)
1065     return;
1066 
1067   LLT Ty = MRI.getType(Reg);
1068   MachineIRBuilder B(MI);
1069 
1070   if (Bank != &AMDGPU::VGPRRegBank) {
1071     // We need to copy from AGPR to VGPR
1072     Reg = B.buildCopy(Ty, Reg).getReg(0);
1073     MRI.setRegBank(Reg, AMDGPU::VGPRRegBank);
1074   }
1075 
1076   Register SGPR = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1077   B.buildInstr(AMDGPU::V_READFIRSTLANE_B32)
1078     .addDef(SGPR)
1079     .addReg(Reg);
1080 
1081   MRI.setType(SGPR, Ty);
1082 
1083   const TargetRegisterClass *Constrained =
1084       constrainGenericRegister(Reg, AMDGPU::VGPR_32RegClass, MRI);
1085   (void)Constrained;
1086   assert(Constrained && "Failed to constrain readfirstlane src reg");
1087 
1088   MI.getOperand(OpIdx).setReg(SGPR);
1089 }
1090 
1091 /// Split \p Ty into 2 pieces. The first will have \p FirstSize bits, and the
1092 /// rest will be in the remainder.
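///
/// Illustrative examples, assuming the behavior implemented below:
///   splitUnequalType(s96, 64)       -> {s64, s32}
///   splitUnequalType(<3 x s32>, 64) -> {<2 x s32>, s32}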
1093 static std::pair<LLT, LLT> splitUnequalType(LLT Ty, unsigned FirstSize) {
1094   unsigned TotalSize = Ty.getSizeInBits();
1095   if (!Ty.isVector())
1096     return {LLT::scalar(FirstSize), LLT::scalar(TotalSize - FirstSize)};
1097 
1098   LLT EltTy = Ty.getElementType();
1099   unsigned EltSize = EltTy.getSizeInBits();
1100   assert(FirstSize % EltSize == 0);
1101 
1102   unsigned FirstPartNumElts = FirstSize / EltSize;
1103   unsigned RemainderElts = (TotalSize - FirstSize) / EltSize;
1104 
1105   return {LLT::scalarOrVector(ElementCount::getFixed(FirstPartNumElts), EltTy),
1106           LLT::scalarOrVector(ElementCount::getFixed(RemainderElts), EltTy)};
1107 }
1108 
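// Illustrative examples, assumed from the implementation below:
//   widen96To128(s96)       -> s128
//   widen96To128(<3 x s32>) -> <4 x s32>
//   widen96To128(<6 x s16>) -> <8 x s16>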
1109 static LLT widen96To128(LLT Ty) {
1110   if (!Ty.isVector())
1111     return LLT::scalar(128);
1112 
1113   LLT EltTy = Ty.getElementType();
1114   assert(128 % EltTy.getSizeInBits() == 0);
1115   return LLT::fixed_vector(128 / EltTy.getSizeInBits(), EltTy);
1116 }
1117 
1118 bool AMDGPURegisterBankInfo::applyMappingLoad(MachineInstr &MI,
1119                         const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
1120                                               MachineRegisterInfo &MRI) const {
1121   Register DstReg = MI.getOperand(0).getReg();
1122   const LLT LoadTy = MRI.getType(DstReg);
1123   unsigned LoadSize = LoadTy.getSizeInBits();
1124   const unsigned MaxNonSmrdLoadSize = 128;
1125 
1126   const RegisterBank *DstBank =
1127       OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
1128   if (DstBank == &AMDGPU::SGPRRegBank) {
1129     // There are some special cases that we need to look at for 32-bit and 96-bit
1130     // SGPR loads; otherwise we have nothing to do.
1131     if (LoadSize != 32 && LoadSize != 96)
1132       return false;
1133 
1134     MachineMemOperand *MMO = *MI.memoperands_begin();
1135     const unsigned MemSize = 8 * MMO->getSize();
1136     // Scalar loads of size 8 or 16 bit with proper alignment may be widened to
1137     // 32 bit. Check to see if we need to widen the memory access; 8 or 16 bit
1138     // scalar loads should have a load size of 32 but a memory access size of
1139     // less than 32.
1140     if (LoadSize == 32 &&
1141         (MemSize == 32 || LoadTy.isVector() || !isScalarLoadLegal(MI)))
1142       return false;
1143 
1144     Register PtrReg = MI.getOperand(1).getReg();
1145 
1146     ApplyRegBankMapping O(*this, MRI, &AMDGPU::SGPRRegBank);
1147     MachineIRBuilder B(MI, O);
1148 
1149     if (LoadSize == 32) {
1150       // This is an extending load from a sub-dword size. Widen the memory
1151       // access size to 4 bytes and clear the extra high bits appropriately
1152       const LLT S32 = LLT::scalar(32);
1153       if (MI.getOpcode() == AMDGPU::G_SEXTLOAD) {
1154         // Must extend the sign bit into higher bits for a G_SEXTLOAD
1155         auto WideLoad = B.buildLoadFromOffset(S32, PtrReg, *MMO, 0);
1156         B.buildSExtInReg(MI.getOperand(0), WideLoad, MemSize);
1157       } else if (MI.getOpcode() == AMDGPU::G_ZEXTLOAD) {
1158         // Must extend zero into higher bits with an AND for a G_ZEXTLOAD
1159         auto WideLoad = B.buildLoadFromOffset(S32, PtrReg, *MMO, 0);
1160         B.buildZExtInReg(MI.getOperand(0), WideLoad, MemSize);
1161       } else
1162         // We do not need to touch the higher bits for regular loads.
1163         B.buildLoadFromOffset(MI.getOperand(0), PtrReg, *MMO, 0);
1164     } else {
1165       // 96-bit loads are only available for vector loads. We need to split this
1166       // into a 64-bit part and a 32-bit part (unless we can widen to 128 bits).
1167       if (MMO->getAlign() < Align(16)) {
1168         MachineFunction *MF = MI.getParent()->getParent();
1169         ApplyRegBankMapping ApplyBank(*this, MRI, DstBank);
1170         MachineIRBuilder B(MI, ApplyBank);
1171         LegalizerHelper Helper(*MF, ApplyBank, B);
1172         LLT Part64, Part32;
1173         std::tie(Part64, Part32) = splitUnequalType(LoadTy, 64);
1174         if (Helper.reduceLoadStoreWidth(cast<GAnyLoad>(MI), 0, Part64) !=
1175             LegalizerHelper::Legalized)
1176           return false;
1177         return true;
1178       } else {
1179         LLT WiderTy = widen96To128(LoadTy);
1180         auto WideLoad = B.buildLoadFromOffset(WiderTy, PtrReg, *MMO, 0);
1181         if (WiderTy.isScalar())
1182           B.buildTrunc(MI.getOperand(0), WideLoad);
1183         else {
1184           B.buildDeleteTrailingVectorElements(MI.getOperand(0).getReg(),
1185                                               WideLoad);
1186         }
1187       }
1188     }
1189 
1190     MI.eraseFromParent();
1191     return true;
1192   }
1193 
1194   // 128-bit loads are supported for all instruction types.
1195   if (LoadSize <= MaxNonSmrdLoadSize)
1196     return false;
1197 
1198   SmallVector<Register, 16> DefRegs(OpdMapper.getVRegs(0));
1199   SmallVector<Register, 1> SrcRegs(OpdMapper.getVRegs(1));
1200 
1201   if (SrcRegs.empty())
1202     SrcRegs.push_back(MI.getOperand(1).getReg());
1203 
1204   assert(LoadSize % MaxNonSmrdLoadSize == 0);
1205 
1206   // RegBankSelect only emits scalar types, so we need to reset the pointer
1207   // operand to a pointer type.
1208   Register BasePtrReg = SrcRegs[0];
1209   LLT PtrTy = MRI.getType(MI.getOperand(1).getReg());
1210   MRI.setType(BasePtrReg, PtrTy);
1211 
1212   unsigned NumSplitParts = LoadTy.getSizeInBits() / MaxNonSmrdLoadSize;
1213   const LLT LoadSplitTy = LoadTy.divide(NumSplitParts);
1214   ApplyRegBankMapping Observer(*this, MRI, &AMDGPU::VGPRRegBank);
1215   MachineIRBuilder B(MI, Observer);
1216   LegalizerHelper Helper(B.getMF(), Observer, B);
1217 
1218   if (LoadTy.isVector()) {
1219     if (Helper.fewerElementsVector(MI, 0, LoadSplitTy) != LegalizerHelper::Legalized)
1220       return false;
1221   } else {
1222     if (Helper.narrowScalar(MI, 0, LoadSplitTy) != LegalizerHelper::Legalized)
1223       return false;
1224   }
1225 
1226   MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
1227   return true;
1228 }
1229 
1230 bool AMDGPURegisterBankInfo::applyMappingDynStackAlloc(
1231   MachineInstr &MI,
1232   const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
1233   MachineRegisterInfo &MRI) const {
1234   const MachineFunction &MF = *MI.getMF();
1235   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1236   const auto &TFI = *ST.getFrameLowering();
1237 
1238   // Guard in case the stack growth direction ever changes with scratch
1239   // instructions.
1240   if (TFI.getStackGrowthDirection() == TargetFrameLowering::StackGrowsDown)
1241     return false;
1242 
1243   Register Dst = MI.getOperand(0).getReg();
1244   Register AllocSize = MI.getOperand(1).getReg();
1245   Align Alignment = assumeAligned(MI.getOperand(2).getImm());
1246 
1247   const RegisterBank *SizeBank = getRegBank(AllocSize, MRI, *TRI);
1248 
1249   // TODO: Need to emit a wave reduction to get the maximum size.
1250   if (SizeBank != &AMDGPU::SGPRRegBank)
1251     return false;
1252 
1253   LLT PtrTy = MRI.getType(Dst);
1254   LLT IntPtrTy = LLT::scalar(PtrTy.getSizeInBits());
1255 
1256   const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1257   Register SPReg = Info->getStackPtrOffsetReg();
1258   ApplyRegBankMapping ApplyBank(*this, MRI, &AMDGPU::SGPRRegBank);
1259   MachineIRBuilder B(MI, ApplyBank);
1260 
1261   auto WaveSize = B.buildConstant(LLT::scalar(32), ST.getWavefrontSizeLog2());
1262   auto ScaledSize = B.buildShl(IntPtrTy, AllocSize, WaveSize);
1263 
1264   auto SPCopy = B.buildCopy(PtrTy, SPReg);
1265   if (Alignment > TFI.getStackAlign()) {
1266     auto PtrAdd = B.buildPtrAdd(PtrTy, SPCopy, ScaledSize);
1267     B.buildMaskLowPtrBits(Dst, PtrAdd,
1268                           Log2(Alignment) + ST.getWavefrontSizeLog2());
1269   } else {
1270     B.buildPtrAdd(Dst, SPCopy, ScaledSize);
1271   }
1272 
1273   MI.eraseFromParent();
1274   return true;
1275 }
1276 
1277 bool AMDGPURegisterBankInfo::applyMappingImage(
1278     MachineInstr &MI, const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
1279     MachineRegisterInfo &MRI, int RsrcIdx) const {
1280   const int NumDefs = MI.getNumExplicitDefs();
1281 
1282   // The reported argument index is relative to the IR intrinsic call arguments,
1283   // so we need to shift by the number of defs and the intrinsic ID.
1284   RsrcIdx += NumDefs + 1;
1285 
1286   // Insert copies to VGPR arguments.
1287   applyDefaultMapping(OpdMapper);
1288 
1289   // Fixup any SGPR arguments.
1290   SmallVector<unsigned, 4> SGPRIndexes;
1291   for (int I = NumDefs, NumOps = MI.getNumOperands(); I != NumOps; ++I) {
1292     if (!MI.getOperand(I).isReg())
1293       continue;
1294 
1295     // If this intrinsic has a sampler, it immediately follows rsrc.
1296     if (I == RsrcIdx || I == RsrcIdx + 1)
1297       SGPRIndexes.push_back(I);
1298   }
1299 
1300   executeInWaterfallLoop(MI, MRI, SGPRIndexes);
1301   return true;
1302 }
1303 
1304 static Register getSrcRegIgnoringCopies(const MachineRegisterInfo &MRI,
1305                                         Register Reg) {
1306   MachineInstr *Def = getDefIgnoringCopies(Reg, MRI);
1307   if (!Def)
1308     return Reg;
1309 
1310   // TODO: Guard against this being an implicit def
1311   return Def->getOperand(0).getReg();
1312 }
1313 
1314 // Analyze a combined offset from an llvm.amdgcn.s.buffer intrinsic and store
1315 // the three offsets (voffset, soffset and instoffset).
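//
// As a rough sketch (the exact split depends on the subtarget's immediate
// offset limits and the required alignment): a constant combined offset might
// be split into an soffset register holding the high part and an instoffset
// immediate holding the low part (e.g. 4096 + 4), with voffset = 0, while a
// (uniform + divergent) G_ADD is split so the SGPR piece becomes soffset and
// the VGPR piece becomes voffset.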
1316 static unsigned setBufferOffsets(MachineIRBuilder &B,
1317                                  const AMDGPURegisterBankInfo &RBI,
1318                                  Register CombinedOffset, Register &VOffsetReg,
1319                                  Register &SOffsetReg, int64_t &InstOffsetVal,
1320                                  Align Alignment) {
1321   const LLT S32 = LLT::scalar(32);
1322   MachineRegisterInfo *MRI = B.getMRI();
1323 
1324   if (Optional<int64_t> Imm = getIConstantVRegSExtVal(CombinedOffset, *MRI)) {
1325     uint32_t SOffset, ImmOffset;
1326     if (AMDGPU::splitMUBUFOffset(*Imm, SOffset, ImmOffset, &RBI.Subtarget,
1327                                  Alignment)) {
1328       VOffsetReg = B.buildConstant(S32, 0).getReg(0);
1329       SOffsetReg = B.buildConstant(S32, SOffset).getReg(0);
1330       InstOffsetVal = ImmOffset;
1331 
1332       B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank);
1333       B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank);
1334       return SOffset + ImmOffset;
1335     }
1336   }
1337 
1338   Register Base;
1339   unsigned Offset;
1340 
1341   std::tie(Base, Offset) =
1342       AMDGPU::getBaseWithConstantOffset(*MRI, CombinedOffset);
1343 
1344   uint32_t SOffset, ImmOffset;
1345   if ((int)Offset > 0 && AMDGPU::splitMUBUFOffset(Offset, SOffset, ImmOffset,
1346                                                   &RBI.Subtarget, Alignment)) {
1347     if (RBI.getRegBank(Base, *MRI, *RBI.TRI) == &AMDGPU::VGPRRegBank) {
1348       VOffsetReg = Base;
1349       SOffsetReg = B.buildConstant(S32, SOffset).getReg(0);
1350       B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank);
1351       InstOffsetVal = ImmOffset;
1352       return 0; // XXX - Why is this 0?
1353     }
1354 
    // If we have an SGPR base, we can use it for soffset.
1356     if (SOffset == 0) {
1357       VOffsetReg = B.buildConstant(S32, 0).getReg(0);
1358       B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank);
1359       SOffsetReg = Base;
1360       InstOffsetVal = ImmOffset;
1361       return 0; // XXX - Why is this 0?
1362     }
1363   }
1364 
1365   // Handle the variable sgpr + vgpr case.
1366   MachineInstr *Add = getOpcodeDef(AMDGPU::G_ADD, CombinedOffset, *MRI);
1367   if (Add && (int)Offset >= 0) {
1368     Register Src0 = getSrcRegIgnoringCopies(*MRI, Add->getOperand(1).getReg());
1369     Register Src1 = getSrcRegIgnoringCopies(*MRI, Add->getOperand(2).getReg());
1370 
1371     const RegisterBank *Src0Bank = RBI.getRegBank(Src0, *MRI, *RBI.TRI);
1372     const RegisterBank *Src1Bank = RBI.getRegBank(Src1, *MRI, *RBI.TRI);
1373 
1374     if (Src0Bank == &AMDGPU::VGPRRegBank && Src1Bank == &AMDGPU::SGPRRegBank) {
1375       VOffsetReg = Src0;
1376       SOffsetReg = Src1;
1377       return 0;
1378     }
1379 
1380     if (Src0Bank == &AMDGPU::SGPRRegBank && Src1Bank == &AMDGPU::VGPRRegBank) {
1381       VOffsetReg = Src1;
1382       SOffsetReg = Src0;
1383       return 0;
1384     }
1385   }
1386 
1387   // Ensure we have a VGPR for the combined offset. This could be an issue if we
1388   // have an SGPR offset and a VGPR resource.
1389   if (RBI.getRegBank(CombinedOffset, *MRI, *RBI.TRI) == &AMDGPU::VGPRRegBank) {
1390     VOffsetReg = CombinedOffset;
1391   } else {
1392     VOffsetReg = B.buildCopy(S32, CombinedOffset).getReg(0);
1393     B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank);
1394   }
1395 
1396   SOffsetReg = B.buildConstant(S32, 0).getReg(0);
1397   B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank);
1398   return 0;
1399 }
1400 
1401 bool AMDGPURegisterBankInfo::applyMappingSBufferLoad(
1402   const OperandsMapper &OpdMapper) const {
1403   MachineInstr &MI = OpdMapper.getMI();
1404   MachineRegisterInfo &MRI = OpdMapper.getMRI();
1405 
1406   const LLT S32 = LLT::scalar(32);
1407   Register Dst = MI.getOperand(0).getReg();
1408   LLT Ty = MRI.getType(Dst);
1409 
1410   const RegisterBank *RSrcBank =
1411     OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
1412   const RegisterBank *OffsetBank =
1413     OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
1414   if (RSrcBank == &AMDGPU::SGPRRegBank &&
1415       OffsetBank == &AMDGPU::SGPRRegBank)
1416     return true; // Legal mapping
1417 
1418   // FIXME: 96-bit case was widened during legalize. We need to narrow it back
1419   // here but don't have an MMO.
1420 
1421   unsigned LoadSize = Ty.getSizeInBits();
1422   int NumLoads = 1;
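  // The widest MUBUF load is 128 bits, so wider results are split into
  // multiple 128-bit loads and recombined after the loop below.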
1423   if (LoadSize == 256 || LoadSize == 512) {
1424     NumLoads = LoadSize / 128;
1425     Ty = Ty.divide(NumLoads);
1426   }
1427 
1428   // Use the alignment to ensure that the required offsets will fit into the
1429   // immediate offsets.
1430   const Align Alignment = NumLoads > 1 ? Align(16 * NumLoads) : Align(1);
1431 
1432   MachineIRBuilder B(MI);
1433   MachineFunction &MF = B.getMF();
1434 
1435   Register SOffset;
1436   Register VOffset;
1437   int64_t ImmOffset = 0;
1438 
1439   unsigned MMOOffset = setBufferOffsets(B, *this, MI.getOperand(2).getReg(),
1440                                         VOffset, SOffset, ImmOffset, Alignment);
1441 
1442   // TODO: 96-bit loads were widened to 128-bit results. Shrink the result if we
1443   // can, but we need to track an MMO for that.
1444   const unsigned MemSize = (Ty.getSizeInBits() + 7) / 8;
1445   const Align MemAlign(4); // FIXME: ABI type alignment?
1446   MachineMemOperand *BaseMMO = MF.getMachineMemOperand(
1447     MachinePointerInfo(),
1448     MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1449     MachineMemOperand::MOInvariant,
1450     MemSize, MemAlign);
1451   if (MMOOffset != 0)
1452     BaseMMO = MF.getMachineMemOperand(BaseMMO, MMOOffset, MemSize);
1453 
1454   // If only the offset is divergent, emit a MUBUF buffer load instead. We can
1455   // assume that the buffer is unswizzled.
1456 
1457   Register RSrc = MI.getOperand(1).getReg();
1458   Register VIndex = B.buildConstant(S32, 0).getReg(0);
1459   B.getMRI()->setRegBank(VIndex, AMDGPU::VGPRRegBank);
1460 
1461   SmallVector<Register, 4> LoadParts(NumLoads);
1462 
1463   MachineBasicBlock::iterator MII = MI.getIterator();
1464   MachineInstrSpan Span(MII, &B.getMBB());
1465 
1466   for (int i = 0; i < NumLoads; ++i) {
1467     if (NumLoads == 1) {
1468       LoadParts[i] = Dst;
1469     } else {
1470       LoadParts[i] = MRI.createGenericVirtualRegister(Ty);
1471       MRI.setRegBank(LoadParts[i], AMDGPU::VGPRRegBank);
1472     }
1473 
1474     MachineMemOperand *MMO = BaseMMO;
1475     if (i != 0)
1476       BaseMMO = MF.getMachineMemOperand(BaseMMO, MMOOffset + 16 * i, MemSize);
1477 
1478     B.buildInstr(AMDGPU::G_AMDGPU_BUFFER_LOAD)
1479       .addDef(LoadParts[i])       // vdata
1480       .addUse(RSrc)               // rsrc
1481       .addUse(VIndex)             // vindex
1482       .addUse(VOffset)            // voffset
1483       .addUse(SOffset)            // soffset
1484       .addImm(ImmOffset + 16 * i) // offset(imm)
1485       .addImm(0)                  // cachepolicy, swizzled buffer(imm)
1486       .addImm(0)                  // idxen(imm)
1487       .addMemOperand(MMO);
1488   }
1489 
1490   // TODO: If only the resource is a VGPR, it may be better to execute the
1491   // scalar load in the waterfall loop if the resource is expected to frequently
1492   // be dynamically uniform.
1493   if (RSrcBank != &AMDGPU::SGPRRegBank) {
1494     // Remove the original instruction to avoid potentially confusing the
1495     // waterfall loop logic.
1496     B.setInstr(*Span.begin());
1497     MI.eraseFromParent();
1498 
1499     SmallSet<Register, 4> OpsToWaterfall;
1500 
1501     OpsToWaterfall.insert(RSrc);
1502     executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()),
1503                            OpsToWaterfall, MRI);
1504   }
1505 
1506   if (NumLoads != 1) {
1507     if (Ty.isVector())
1508       B.buildConcatVectors(Dst, LoadParts);
1509     else
1510       B.buildMerge(Dst, LoadParts);
1511   }
1512 
1513   // We removed the instruction earlier with a waterfall loop.
1514   if (RSrcBank == &AMDGPU::SGPRRegBank)
1515     MI.eraseFromParent();
1516 
1517   return true;
1518 }
1519 
1520 bool AMDGPURegisterBankInfo::applyMappingBFE(const OperandsMapper &OpdMapper,
1521                                              bool Signed) const {
1522   MachineInstr &MI = OpdMapper.getMI();
1523   MachineRegisterInfo &MRI = OpdMapper.getMRI();
1524 
1525   // Insert basic copies
1526   applyDefaultMapping(OpdMapper);
1527 
1528   Register DstReg = MI.getOperand(0).getReg();
1529   LLT Ty = MRI.getType(DstReg);
1530 
1531   const LLT S32 = LLT::scalar(32);
1532 
1533   unsigned FirstOpnd = MI.getOpcode() == AMDGPU::G_INTRINSIC ? 2 : 1;
1534   Register SrcReg = MI.getOperand(FirstOpnd).getReg();
1535   Register OffsetReg = MI.getOperand(FirstOpnd + 1).getReg();
1536   Register WidthReg = MI.getOperand(FirstOpnd + 2).getReg();
1537 
1538   const RegisterBank *DstBank =
1539     OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
1540   if (DstBank == &AMDGPU::VGPRRegBank) {
1541     if (Ty == S32)
1542       return true;
1543 
    // There are no 64-bit VGPR bitfield extract instructions, so the operation
    // is expanded to a sequence of instructions that implement it.
1546     ApplyRegBankMapping ApplyBank(*this, MRI, &AMDGPU::VGPRRegBank);
1547     MachineIRBuilder B(MI, ApplyBank);
1548 
1549     const LLT S64 = LLT::scalar(64);
1550     // Shift the source operand so that extracted bits start at bit 0.
1551     auto ShiftOffset = Signed ? B.buildAShr(S64, SrcReg, OffsetReg)
1552                               : B.buildLShr(S64, SrcReg, OffsetReg);
1553     auto UnmergeSOffset = B.buildUnmerge({S32, S32}, ShiftOffset);
1554 
1555     // A 64-bit bitfield extract uses the 32-bit bitfield extract instructions
1556     // if the width is a constant.
1557     if (auto ConstWidth = getIConstantVRegValWithLookThrough(WidthReg, MRI)) {
1558       // Use the 32-bit bitfield extract instruction if the width is a constant.
1559       // Depending on the width size, use either the low or high 32-bits.
1560       auto Zero = B.buildConstant(S32, 0);
1561       auto WidthImm = ConstWidth->Value.getZExtValue();
1562       if (WidthImm <= 32) {
1563         // Use bitfield extract on the lower 32-bit source, and then sign-extend
1564         // or clear the upper 32-bits.
1565         auto Extract =
1566             Signed ? B.buildSbfx(S32, UnmergeSOffset.getReg(0), Zero, WidthReg)
1567                    : B.buildUbfx(S32, UnmergeSOffset.getReg(0), Zero, WidthReg);
1568         auto Extend =
1569             Signed ? B.buildAShr(S32, Extract, B.buildConstant(S32, 31)) : Zero;
1570         B.buildMerge(DstReg, {Extract, Extend});
1571       } else {
1572         // Use bitfield extract on upper 32-bit source, and combine with lower
1573         // 32-bit source.
1574         auto UpperWidth = B.buildConstant(S32, WidthImm - 32);
1575         auto Extract =
1576             Signed
1577                 ? B.buildSbfx(S32, UnmergeSOffset.getReg(1), Zero, UpperWidth)
1578                 : B.buildUbfx(S32, UnmergeSOffset.getReg(1), Zero, UpperWidth);
1579         B.buildMerge(DstReg, {UnmergeSOffset.getReg(0), Extract});
1580       }
1581       MI.eraseFromParent();
1582       return true;
1583     }
1584 
1585     // Expand to Src >> Offset << (64 - Width) >> (64 - Width) using 64-bit
1586     // operations.
1587     auto ExtShift = B.buildSub(S32, B.buildConstant(S32, 64), WidthReg);
1588     auto SignBit = B.buildShl(S64, ShiftOffset, ExtShift);
    if (Signed)
      B.buildAShr(DstReg, SignBit, ExtShift);
    else
      B.buildLShr(DstReg, SignBit, ExtShift);
1593     MI.eraseFromParent();
1594     return true;
1595   }
1596 
1597   // The scalar form packs the offset and width in a single operand.
1598 
1599   ApplyRegBankMapping ApplyBank(*this, MRI, &AMDGPU::SGPRRegBank);
1600   MachineIRBuilder B(MI, ApplyBank);
1601 
1602   // Ensure the high bits are clear to insert the offset.
1603   auto OffsetMask = B.buildConstant(S32, maskTrailingOnes<unsigned>(6));
1604   auto ClampOffset = B.buildAnd(S32, OffsetReg, OffsetMask);
1605 
  // The shift zeros out the low bits, so there is no need to clamp the width
  // value first.
1607   auto ShiftWidth = B.buildShl(S32, WidthReg, B.buildConstant(S32, 16));
1608 
  // Pack the offset and width of the BFE into the format expected by
  // S_BFE_I32 / S_BFE_U32: in the second source operand, bits [5:0] contain
  // the offset and bits [22:16] contain the width.
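  // For example, an offset of 8 and a width of 5 pack to 0x00050008.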
1612   auto MergedInputs = B.buildOr(S32, ClampOffset, ShiftWidth);
1613 
1614   // TODO: It might be worth using a pseudo here to avoid scc clobber and
1615   // register class constraints.
1616   unsigned Opc = Ty == S32 ? (Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32) :
1617                              (Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64);
1618 
1619   auto MIB = B.buildInstr(Opc, {DstReg}, {SrcReg, MergedInputs});
1620   if (!constrainSelectedInstRegOperands(*MIB, *TII, *TRI, *this))
1621     llvm_unreachable("failed to constrain BFE");
1622 
1623   MI.eraseFromParent();
1624   return true;
1625 }
1626 
1627 // Return a suitable opcode for extending the operands of Opc when widening.
1628 static unsigned getExtendOp(unsigned Opc) {
1629   switch (Opc) {
1630   case TargetOpcode::G_ASHR:
1631   case TargetOpcode::G_SMIN:
1632   case TargetOpcode::G_SMAX:
1633     return TargetOpcode::G_SEXT;
1634   case TargetOpcode::G_LSHR:
1635   case TargetOpcode::G_UMIN:
1636   case TargetOpcode::G_UMAX:
1637     return TargetOpcode::G_ZEXT;
1638   default:
1639     return TargetOpcode::G_ANYEXT;
1640   }
1641 }
1642 
1643 // Emit a legalized extension from <2 x s16> to 2 32-bit components, avoiding
1644 // any illegal vector extend or unmerge operations.
1645 static std::pair<Register, Register>
1646 unpackV2S16ToS32(MachineIRBuilder &B, Register Src, unsigned ExtOpcode) {
1647   const LLT S32 = LLT::scalar(32);
1648   auto Bitcast = B.buildBitcast(S32, Src);
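  // Work on the 32-bit bit pattern: the low element occupies bits [15:0] and
  // the high element bits [31:16].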
1649 
1650   if (ExtOpcode == TargetOpcode::G_SEXT) {
1651     auto ExtLo = B.buildSExtInReg(S32, Bitcast, 16);
1652     auto ShiftHi = B.buildAShr(S32, Bitcast, B.buildConstant(S32, 16));
1653     return std::make_pair(ExtLo.getReg(0), ShiftHi.getReg(0));
1654   }
1655 
1656   auto ShiftHi = B.buildLShr(S32, Bitcast, B.buildConstant(S32, 16));
1657   if (ExtOpcode == TargetOpcode::G_ZEXT) {
1658     auto ExtLo = B.buildAnd(S32, Bitcast, B.buildConstant(S32, 0xffff));
1659     return std::make_pair(ExtLo.getReg(0), ShiftHi.getReg(0));
1660   }
1661 
1662   assert(ExtOpcode == TargetOpcode::G_ANYEXT);
1663   return std::make_pair(Bitcast.getReg(0), ShiftHi.getReg(0));
1664 }
1665 
// For cases where only a single copy is inserted for matching register banks,
// replace the register in the instruction operand with the single repair
// register.
1668 static bool substituteSimpleCopyRegs(
1669   const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper, unsigned OpIdx) {
1670   SmallVector<unsigned, 1> SrcReg(OpdMapper.getVRegs(OpIdx));
1671   if (!SrcReg.empty()) {
1672     assert(SrcReg.size() == 1);
1673     OpdMapper.getMI().getOperand(OpIdx).setReg(SrcReg[0]);
1674     return true;
1675   }
1676 
1677   return false;
1678 }
1679 
1680 /// Handle register layout difference for f16 images for some subtargets.
1681 Register AMDGPURegisterBankInfo::handleD16VData(MachineIRBuilder &B,
1682                                                 MachineRegisterInfo &MRI,
1683                                                 Register Reg) const {
1684   if (!Subtarget.hasUnpackedD16VMem())
1685     return Reg;
1686 
1687   const LLT S16 = LLT::scalar(16);
1688   LLT StoreVT = MRI.getType(Reg);
1689   if (!StoreVT.isVector() || StoreVT.getElementType() != S16)
1690     return Reg;
1691 
1692   auto Unmerge = B.buildUnmerge(S16, Reg);
1693 
1695   SmallVector<Register, 4> WideRegs;
1696   for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
1697     WideRegs.push_back(Unmerge.getReg(I));
1698 
1699   const LLT S32 = LLT::scalar(32);
1700   int NumElts = StoreVT.getNumElements();
1701 
1702   return B.buildMerge(LLT::fixed_vector(NumElts, S32), WideRegs).getReg(0);
1703 }
1704 
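// Split Reg into a base register and constant offset. A null base register is
// returned when the value is itself a constant.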
1705 static std::pair<Register, unsigned>
1706 getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg) {
1707   int64_t Const;
1708   if (mi_match(Reg, MRI, m_ICst(Const)))
1709     return std::make_pair(Register(), Const);
1710 
1711   Register Base;
1712   if (mi_match(Reg, MRI, m_GAdd(m_Reg(Base), m_ICst(Const))))
1713     return std::make_pair(Base, Const);
1714 
1715   // TODO: Handle G_OR used for add case
1716   return std::make_pair(Reg, 0);
1717 }
1718 
1719 std::pair<Register, unsigned>
1720 AMDGPURegisterBankInfo::splitBufferOffsets(MachineIRBuilder &B,
1721                                            Register OrigOffset) const {
1722   const unsigned MaxImm = 4095;
1723   Register BaseReg;
1724   unsigned ImmOffset;
1725   const LLT S32 = LLT::scalar(32);
1726 
1727   std::tie(BaseReg, ImmOffset) = getBaseWithConstantOffset(*B.getMRI(),
1728                                                            OrigOffset);
1729 
1730   unsigned C1 = 0;
1731   if (ImmOffset != 0) {
    // If the immediate value is too big for the immoffset field, keep only the
    // low bits (the value modulo 4096) in the immoffset field so that the
    // value that is copied/added for the voffset field is a multiple of 4096,
    // and it stands more chance of being CSEd with the copy/add for another
    // similar load/store.
    // However, do not do that rounding down to a multiple of 4096 if that is a
    // negative number, as it appears to be illegal to have a negative offset
    // in the vgpr, even if adding the immediate offset makes it positive.
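    // For example, a combined offset of 4100 is split into a voffset add of
    // 4096 and an immoffset of 4.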
1739     unsigned Overflow = ImmOffset & ~MaxImm;
1740     ImmOffset -= Overflow;
1741     if ((int32_t)Overflow < 0) {
1742       Overflow += ImmOffset;
1743       ImmOffset = 0;
1744     }
1745 
1746     C1 = ImmOffset;
1747     if (Overflow != 0) {
1748       if (!BaseReg)
1749         BaseReg = B.buildConstant(S32, Overflow).getReg(0);
1750       else {
1751         auto OverflowVal = B.buildConstant(S32, Overflow);
1752         BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
1753       }
1754     }
1755   }
1756 
1757   if (!BaseReg)
1758     BaseReg = B.buildConstant(S32, 0).getReg(0);
1759 
1760   return {BaseReg, C1};
1761 }
1762 
1763 bool AMDGPURegisterBankInfo::buildVCopy(MachineIRBuilder &B, Register DstReg,
1764                                         Register SrcReg) const {
1765   MachineRegisterInfo &MRI = *B.getMRI();
1766   LLT SrcTy = MRI.getType(SrcReg);
1767   if (SrcTy.getSizeInBits() == 32) {
1768     // Use a v_mov_b32 here to make the exec dependency explicit.
1769     B.buildInstr(AMDGPU::V_MOV_B32_e32)
1770       .addDef(DstReg)
1771       .addUse(SrcReg);
1772     return constrainGenericRegister(DstReg, AMDGPU::VGPR_32RegClass, MRI) &&
1773            constrainGenericRegister(SrcReg, AMDGPU::SReg_32RegClass, MRI);
1774   }
1775 
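  // For 64-bit values, copy each 32-bit half through a VGPR temporary and
  // recombine the halves with a REG_SEQUENCE.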
1776   Register TmpReg0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1777   Register TmpReg1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1778 
1779   B.buildInstr(AMDGPU::V_MOV_B32_e32)
1780     .addDef(TmpReg0)
1781     .addUse(SrcReg, 0, AMDGPU::sub0);
1782   B.buildInstr(AMDGPU::V_MOV_B32_e32)
1783     .addDef(TmpReg1)
1784     .addUse(SrcReg, 0, AMDGPU::sub1);
1785   B.buildInstr(AMDGPU::REG_SEQUENCE)
1786     .addDef(DstReg)
1787     .addUse(TmpReg0)
1788     .addImm(AMDGPU::sub0)
1789     .addUse(TmpReg1)
1790     .addImm(AMDGPU::sub1);
1791 
1792   return constrainGenericRegister(SrcReg, AMDGPU::SReg_64RegClass, MRI) &&
1793          constrainGenericRegister(DstReg, AMDGPU::VReg_64RegClass, MRI);
1794 }
1795 
/// Utility function for pushing dynamic vector indexes with a constant offset
/// into waterfall loops.
1798 static void reinsertVectorIndexAdd(MachineIRBuilder &B,
1799                                    MachineInstr &IdxUseInstr,
1800                                    unsigned OpIdx,
1801                                    unsigned ConstOffset) {
1802   MachineRegisterInfo &MRI = *B.getMRI();
1803   const LLT S32 = LLT::scalar(32);
1804   Register WaterfallIdx = IdxUseInstr.getOperand(OpIdx).getReg();
1805   B.setInsertPt(*IdxUseInstr.getParent(), IdxUseInstr.getIterator());
1806 
1807   auto MaterializedOffset = B.buildConstant(S32, ConstOffset);
1808 
1809   auto Add = B.buildAdd(S32, WaterfallIdx, MaterializedOffset);
1810   MRI.setRegBank(MaterializedOffset.getReg(0), AMDGPU::SGPRRegBank);
1811   MRI.setRegBank(Add.getReg(0), AMDGPU::SGPRRegBank);
1812   IdxUseInstr.getOperand(OpIdx).setReg(Add.getReg(0));
1813 }
1814 
1815 /// Implement extending a 32-bit value to a 64-bit value. \p Lo32Reg is the
1816 /// original 32-bit source value (to be inserted in the low part of the combined
1817 /// 64-bit result), and \p Hi32Reg is the high half of the combined 64-bit
1818 /// value.
1819 static void extendLow32IntoHigh32(MachineIRBuilder &B,
1820                                   Register Hi32Reg, Register Lo32Reg,
1821                                   unsigned ExtOpc,
1822                                   const RegisterBank &RegBank,
1823                                   bool IsBooleanSrc = false) {
1824   if (ExtOpc == AMDGPU::G_ZEXT) {
1825     B.buildConstant(Hi32Reg, 0);
1826   } else if (ExtOpc == AMDGPU::G_SEXT) {
1827     if (IsBooleanSrc) {
1828       // If we know the original source was an s1, the high half is the same as
1829       // the low.
1830       B.buildCopy(Hi32Reg, Lo32Reg);
1831     } else {
1832       // Replicate sign bit from 32-bit extended part.
1833       auto ShiftAmt = B.buildConstant(LLT::scalar(32), 31);
1834       B.getMRI()->setRegBank(ShiftAmt.getReg(0), RegBank);
1835       B.buildAShr(Hi32Reg, Lo32Reg, ShiftAmt);
1836     }
1837   } else {
1838     assert(ExtOpc == AMDGPU::G_ANYEXT && "not an integer extension");
1839     B.buildUndef(Hi32Reg);
1840   }
1841 }
1842 
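// Lower a dynamically indexed G_EXTRACT_VECTOR_ELT into a chain of
// compare + select on each element, when
// SITargetLowering::shouldExpandVectorDynExt deems the expansion profitable.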
1843 bool AMDGPURegisterBankInfo::foldExtractEltToCmpSelect(
1844   MachineInstr &MI, MachineRegisterInfo &MRI,
1845   const OperandsMapper &OpdMapper) const {
1846 
1847   Register VecReg = MI.getOperand(1).getReg();
1848   Register Idx = MI.getOperand(2).getReg();
1849 
1850   const RegisterBank &IdxBank =
1851     *OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
1852 
1853   bool IsDivergentIdx = IdxBank != AMDGPU::SGPRRegBank;
1854 
1855   LLT VecTy = MRI.getType(VecReg);
1856   unsigned EltSize = VecTy.getScalarSizeInBits();
1857   unsigned NumElem = VecTy.getNumElements();
1858 
1859   if (!SITargetLowering::shouldExpandVectorDynExt(EltSize, NumElem,
1860                                                   IsDivergentIdx))
1861     return false;
1862 
1863   MachineIRBuilder B(MI);
1864   LLT S32 = LLT::scalar(32);
1865 
1866   const RegisterBank &DstBank =
1867     *OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
1868   const RegisterBank &SrcBank =
1869     *OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
1870 
1871   const RegisterBank &CCBank =
1872     (DstBank == AMDGPU::SGPRRegBank &&
1873      SrcBank == AMDGPU::SGPRRegBank &&
1874      IdxBank == AMDGPU::SGPRRegBank) ? AMDGPU::SGPRRegBank
1875                                      : AMDGPU::VCCRegBank;
1876   LLT CCTy = (CCBank == AMDGPU::SGPRRegBank) ? S32 : LLT::scalar(1);
1877 
1878   if (CCBank == AMDGPU::VCCRegBank && IdxBank == AMDGPU::SGPRRegBank) {
1879     Idx = B.buildCopy(S32, Idx)->getOperand(0).getReg();
1880     MRI.setRegBank(Idx, AMDGPU::VGPRRegBank);
1881   }
1882 
1883   LLT EltTy = VecTy.getScalarType();
1884   SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0));
1885   unsigned NumLanes = DstRegs.size();
1886   if (!NumLanes)
1887     NumLanes = 1;
1888   else
1889     EltTy = MRI.getType(DstRegs[0]);
1890 
1891   auto UnmergeToEltTy = B.buildUnmerge(EltTy, VecReg);
1892   SmallVector<Register, 2> Res(NumLanes);
1893   for (unsigned L = 0; L < NumLanes; ++L)
1894     Res[L] = UnmergeToEltTy.getReg(L);
1895 
1896   for (unsigned I = 1; I < NumElem; ++I) {
1897     auto IC = B.buildConstant(S32, I);
1898     MRI.setRegBank(IC->getOperand(0).getReg(), AMDGPU::SGPRRegBank);
1899     auto Cmp = B.buildICmp(CmpInst::ICMP_EQ, CCTy, Idx, IC);
1900     MRI.setRegBank(Cmp->getOperand(0).getReg(), CCBank);
1901 
1902     for (unsigned L = 0; L < NumLanes; ++L) {
1903       auto S = B.buildSelect(EltTy, Cmp,
1904                              UnmergeToEltTy.getReg(I * NumLanes + L), Res[L]);
1905 
1906       for (unsigned N : { 0, 2, 3 })
1907         MRI.setRegBank(S->getOperand(N).getReg(), DstBank);
1908 
1909       Res[L] = S->getOperand(0).getReg();
1910     }
1911   }
1912 
1913   for (unsigned L = 0; L < NumLanes; ++L) {
1914     Register DstReg = (NumLanes == 1) ? MI.getOperand(0).getReg() : DstRegs[L];
1915     B.buildCopy(DstReg, Res[L]);
1916     MRI.setRegBank(DstReg, DstBank);
1917   }
1918 
1919   MRI.setRegBank(MI.getOperand(0).getReg(), DstBank);
1920   MI.eraseFromParent();
1921 
1922   return true;
1923 }
1924 
1925 // Insert a cross regbank copy for a register if it already has a bank that
1926 // differs from the one we want to set.
1927 static Register constrainRegToBank(MachineRegisterInfo &MRI,
1928                                    MachineIRBuilder &B, Register &Reg,
1929                                    const RegisterBank &Bank) {
1930   const RegisterBank *CurrBank = MRI.getRegBankOrNull(Reg);
1931   if (CurrBank && *CurrBank != Bank) {
1932     Register Copy = B.buildCopy(MRI.getType(Reg), Reg).getReg(0);
1933     MRI.setRegBank(Copy, Bank);
1934     return Copy;
1935   }
1936 
1937   MRI.setRegBank(Reg, Bank);
1938   return Reg;
1939 }
1940 
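// Lower a dynamically indexed G_INSERT_VECTOR_ELT into a compare + select per
// element, mirroring foldExtractEltToCmpSelect above.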
1941 bool AMDGPURegisterBankInfo::foldInsertEltToCmpSelect(
1942   MachineInstr &MI, MachineRegisterInfo &MRI,
1943   const OperandsMapper &OpdMapper) const {
1944 
1945   Register VecReg = MI.getOperand(1).getReg();
1946   Register Idx = MI.getOperand(3).getReg();
1947 
1948   const RegisterBank &IdxBank =
1949     *OpdMapper.getInstrMapping().getOperandMapping(3).BreakDown[0].RegBank;
1950 
1951   bool IsDivergentIdx = IdxBank != AMDGPU::SGPRRegBank;
1952 
1953   LLT VecTy = MRI.getType(VecReg);
1954   unsigned EltSize = VecTy.getScalarSizeInBits();
1955   unsigned NumElem = VecTy.getNumElements();
1956 
1957   if (!SITargetLowering::shouldExpandVectorDynExt(EltSize, NumElem,
1958                                                   IsDivergentIdx))
1959     return false;
1960 
1961   MachineIRBuilder B(MI);
1962   LLT S32 = LLT::scalar(32);
1963 
1964   const RegisterBank &DstBank =
1965     *OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
1966   const RegisterBank &SrcBank =
1967     *OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
1968   const RegisterBank &InsBank =
1969     *OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
1970 
1971   const RegisterBank &CCBank =
1972     (DstBank == AMDGPU::SGPRRegBank &&
1973      SrcBank == AMDGPU::SGPRRegBank &&
1974      InsBank == AMDGPU::SGPRRegBank &&
1975      IdxBank == AMDGPU::SGPRRegBank) ? AMDGPU::SGPRRegBank
1976                                      : AMDGPU::VCCRegBank;
1977   LLT CCTy = (CCBank == AMDGPU::SGPRRegBank) ? S32 : LLT::scalar(1);
1978 
1979   if (CCBank == AMDGPU::VCCRegBank && IdxBank == AMDGPU::SGPRRegBank) {
1980     Idx = B.buildCopy(S32, Idx)->getOperand(0).getReg();
1981     MRI.setRegBank(Idx, AMDGPU::VGPRRegBank);
1982   }
1983 
1984   LLT EltTy = VecTy.getScalarType();
1985   SmallVector<Register, 2> InsRegs(OpdMapper.getVRegs(2));
1986   unsigned NumLanes = InsRegs.size();
1987   if (!NumLanes) {
1988     NumLanes = 1;
1989     InsRegs.push_back(MI.getOperand(2).getReg());
1990   } else {
1991     EltTy = MRI.getType(InsRegs[0]);
1992   }
1993 
1994   auto UnmergeToEltTy = B.buildUnmerge(EltTy, VecReg);
1995   SmallVector<Register, 16> Ops(NumElem * NumLanes);
1996 
1997   for (unsigned I = 0; I < NumElem; ++I) {
1998     auto IC = B.buildConstant(S32, I);
1999     MRI.setRegBank(IC->getOperand(0).getReg(), AMDGPU::SGPRRegBank);
2000     auto Cmp = B.buildICmp(CmpInst::ICMP_EQ, CCTy, Idx, IC);
2001     MRI.setRegBank(Cmp->getOperand(0).getReg(), CCBank);
2002 
2003     for (unsigned L = 0; L < NumLanes; ++L) {
2004       Register Op0 = constrainRegToBank(MRI, B, InsRegs[L], DstBank);
2005       Register Op1 = UnmergeToEltTy.getReg(I * NumLanes + L);
2006       Op1 = constrainRegToBank(MRI, B, Op1, DstBank);
2007 
2008       Register Select = B.buildSelect(EltTy, Cmp, Op0, Op1).getReg(0);
2009       MRI.setRegBank(Select, DstBank);
2010 
2011       Ops[I * NumLanes + L] = Select;
2012     }
2013   }
2014 
2015   LLT MergeTy = LLT::fixed_vector(Ops.size(), EltTy);
2016   if (MergeTy == MRI.getType(MI.getOperand(0).getReg())) {
2017     B.buildBuildVector(MI.getOperand(0), Ops);
2018   } else {
2019     auto Vec = B.buildBuildVector(MergeTy, Ops);
2020     MRI.setRegBank(Vec->getOperand(0).getReg(), DstBank);
2021     B.buildBitcast(MI.getOperand(0).getReg(), Vec);
2022   }
2023 
2024   MRI.setRegBank(MI.getOperand(0).getReg(), DstBank);
2025   MI.eraseFromParent();
2026 
2027   return true;
2028 }
2029 
2030 void AMDGPURegisterBankInfo::applyMappingImpl(
2031     const OperandsMapper &OpdMapper) const {
2032   MachineInstr &MI = OpdMapper.getMI();
2033   unsigned Opc = MI.getOpcode();
2034   MachineRegisterInfo &MRI = OpdMapper.getMRI();
2035   switch (Opc) {
2036   case AMDGPU::G_PHI: {
2037     Register DstReg = MI.getOperand(0).getReg();
2038     LLT DstTy = MRI.getType(DstReg);
2039     if (DstTy != LLT::scalar(1))
2040       break;
2041 
2042     const LLT S32 = LLT::scalar(32);
2043     const RegisterBank *DstBank =
2044       OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2045     if (DstBank == &AMDGPU::VCCRegBank) {
2046       applyDefaultMapping(OpdMapper);
2047       // The standard handling only considers the result register bank for
2048       // phis. For VCC, blindly inserting a copy when the phi is lowered will
2049       // produce an invalid copy. We can only copy with some kind of compare to
2050       // get a vector boolean result. Insert a register bank copy that will be
2051       // correctly lowered to a compare.
2052       MachineIRBuilder B(*MI.getParent()->getParent());
2053 
2054       for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
2055         Register SrcReg = MI.getOperand(I).getReg();
2056         const RegisterBank *SrcBank = getRegBank(SrcReg, MRI, *TRI);
2057 
2058         if (SrcBank != &AMDGPU::VCCRegBank) {
2059           MachineBasicBlock *SrcMBB = MI.getOperand(I + 1).getMBB();
2060           B.setInsertPt(*SrcMBB, SrcMBB->getFirstTerminator());
2061 
2062           auto Copy = B.buildCopy(LLT::scalar(1), SrcReg);
2063           MRI.setRegBank(Copy.getReg(0), AMDGPU::VCCRegBank);
2064           MI.getOperand(I).setReg(Copy.getReg(0));
2065         }
2066       }
2067 
2068       return;
2069     }
2070 
2071     // Phi handling is strange and only considers the bank of the destination.
2072     substituteSimpleCopyRegs(OpdMapper, 0);
2073 
2074     // Promote SGPR/VGPR booleans to s32
2075     MachineFunction *MF = MI.getParent()->getParent();
2076     ApplyRegBankMapping ApplyBank(*this, MRI, DstBank);
2077     MachineIRBuilder B(MI, ApplyBank);
2078     LegalizerHelper Helper(*MF, ApplyBank, B);
2079 
2080     if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized)
2081       llvm_unreachable("widen scalar should have succeeded");
2082 
2083     return;
2084   }
2085   case AMDGPU::G_ICMP:
2086   case AMDGPU::G_UADDO:
2087   case AMDGPU::G_USUBO:
2088   case AMDGPU::G_UADDE:
2089   case AMDGPU::G_SADDE:
2090   case AMDGPU::G_USUBE:
2091   case AMDGPU::G_SSUBE: {
2092     unsigned BoolDstOp = Opc == AMDGPU::G_ICMP ? 0 : 1;
2093     Register DstReg = MI.getOperand(BoolDstOp).getReg();
2094 
2095     const RegisterBank *DstBank =
2096       OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2097     if (DstBank != &AMDGPU::SGPRRegBank)
2098       break;
2099 
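    // The G_*ADDE / G_*SUBE forms have a carry-in as the last operand.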
2100     const bool HasCarryIn = MI.getNumOperands() == 5;
2101 
2102     // If this is a scalar compare, promote the result to s32, as the selection
2103     // will end up using a copy to a 32-bit vreg.
2104     const LLT S32 = LLT::scalar(32);
2105     Register NewDstReg = MRI.createGenericVirtualRegister(S32);
2106     MRI.setRegBank(NewDstReg, AMDGPU::SGPRRegBank);
2107     MI.getOperand(BoolDstOp).setReg(NewDstReg);
2108     MachineIRBuilder B(MI);
2109 
2110     if (HasCarryIn) {
2111       Register NewSrcReg = MRI.createGenericVirtualRegister(S32);
2112       MRI.setRegBank(NewSrcReg, AMDGPU::SGPRRegBank);
2113       B.buildZExt(NewSrcReg, MI.getOperand(4).getReg());
2114       MI.getOperand(4).setReg(NewSrcReg);
2115     }
2116 
2117     MachineBasicBlock *MBB = MI.getParent();
2118     B.setInsertPt(*MBB, std::next(MI.getIterator()));
2119 
2120     // If we had a constrained VCC result register, a copy was inserted to VCC
2121     // from SGPR.
2122     SmallVector<Register, 1> DefRegs(OpdMapper.getVRegs(0));
2123     if (DefRegs.empty())
2124       DefRegs.push_back(DstReg);
2125     B.buildTrunc(DefRegs[0], NewDstReg);
2126     return;
2127   }
2128   case AMDGPU::G_SELECT: {
2129     Register DstReg = MI.getOperand(0).getReg();
2130     LLT DstTy = MRI.getType(DstReg);
2131 
2132     SmallVector<Register, 1> CondRegs(OpdMapper.getVRegs(1));
2133     if (CondRegs.empty())
2134       CondRegs.push_back(MI.getOperand(1).getReg());
2135     else {
2136       assert(CondRegs.size() == 1);
2137     }
2138 
2139     const RegisterBank *CondBank = getRegBank(CondRegs[0], MRI, *TRI);
2140     if (CondBank == &AMDGPU::SGPRRegBank) {
2141       MachineIRBuilder B(MI);
2142       const LLT S32 = LLT::scalar(32);
2143       Register NewCondReg = MRI.createGenericVirtualRegister(S32);
2144       MRI.setRegBank(NewCondReg, AMDGPU::SGPRRegBank);
2145 
2146       MI.getOperand(1).setReg(NewCondReg);
2147       B.buildZExt(NewCondReg, CondRegs[0]);
2148     }
2149 
2150     if (DstTy.getSizeInBits() != 64)
2151       break;
2152 
2153     MachineIRBuilder B(MI);
2154     LLT HalfTy = getHalfSizedType(DstTy);
2155 
2156     SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
2157     SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2));
2158     SmallVector<Register, 2> Src2Regs(OpdMapper.getVRegs(3));
2159 
2160     // All inputs are SGPRs, nothing special to do.
2161     if (DefRegs.empty()) {
2162       assert(Src1Regs.empty() && Src2Regs.empty());
2163       break;
2164     }
2165 
2166     if (Src1Regs.empty())
2167       split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg());
2168     else {
2169       setRegsToType(MRI, Src1Regs, HalfTy);
2170     }
2171 
2172     if (Src2Regs.empty())
2173       split64BitValueForMapping(B, Src2Regs, HalfTy, MI.getOperand(3).getReg());
2174     else
2175       setRegsToType(MRI, Src2Regs, HalfTy);
2176 
2177     setRegsToType(MRI, DefRegs, HalfTy);
2178 
2179     B.buildSelect(DefRegs[0], CondRegs[0], Src1Regs[0], Src2Regs[0]);
2180     B.buildSelect(DefRegs[1], CondRegs[0], Src1Regs[1], Src2Regs[1]);
2181 
2182     MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
2183     MI.eraseFromParent();
2184     return;
2185   }
2186   case AMDGPU::G_BRCOND: {
2187     Register CondReg = MI.getOperand(0).getReg();
2188     // FIXME: Should use legalizer helper, but should change bool ext type.
2189     const RegisterBank *CondBank =
2190       OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2191 
2192     if (CondBank == &AMDGPU::SGPRRegBank) {
2193       MachineIRBuilder B(MI);
2194       const LLT S32 = LLT::scalar(32);
2195       Register NewCondReg = MRI.createGenericVirtualRegister(S32);
2196       MRI.setRegBank(NewCondReg, AMDGPU::SGPRRegBank);
2197 
2198       MI.getOperand(0).setReg(NewCondReg);
2199       B.buildZExt(NewCondReg, CondReg);
2200       return;
2201     }
2202 
2203     break;
2204   }
2205   case AMDGPU::G_AND:
2206   case AMDGPU::G_OR:
2207   case AMDGPU::G_XOR: {
    // 64-bit and/or/xor is only available on the SALU, so split into 2 32-bit
    // ops if there is a VGPR input.
2210     Register DstReg = MI.getOperand(0).getReg();
2211     LLT DstTy = MRI.getType(DstReg);
2212 
2213     if (DstTy.getSizeInBits() == 1) {
2214       const RegisterBank *DstBank =
2215         OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2216       if (DstBank == &AMDGPU::VCCRegBank)
2217         break;
2218 
2219       MachineFunction *MF = MI.getParent()->getParent();
2220       ApplyRegBankMapping ApplyBank(*this, MRI, DstBank);
2221       MachineIRBuilder B(MI, ApplyBank);
2222       LegalizerHelper Helper(*MF, ApplyBank, B);
2223 
2224       if (Helper.widenScalar(MI, 0, LLT::scalar(32)) !=
2225           LegalizerHelper::Legalized)
2226         llvm_unreachable("widen scalar should have succeeded");
2227       return;
2228     }
2229 
2230     if (DstTy.getSizeInBits() != 64)
2231       break;
2232 
2233     LLT HalfTy = getHalfSizedType(DstTy);
2234     SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
2235     SmallVector<Register, 2> Src0Regs(OpdMapper.getVRegs(1));
2236     SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2));
2237 
2238     // All inputs are SGPRs, nothing special to do.
2239     if (DefRegs.empty()) {
2240       assert(Src0Regs.empty() && Src1Regs.empty());
2241       break;
2242     }
2243 
2244     assert(DefRegs.size() == 2);
2245     assert(Src0Regs.size() == Src1Regs.size() &&
2246            (Src0Regs.empty() || Src0Regs.size() == 2));
2247 
2248     // Depending on where the source registers came from, the generic code may
2249     // have decided to split the inputs already or not. If not, we still need to
2250     // extract the values.
2251     MachineIRBuilder B(MI);
2252 
2253     if (Src0Regs.empty())
2254       split64BitValueForMapping(B, Src0Regs, HalfTy, MI.getOperand(1).getReg());
2255     else
2256       setRegsToType(MRI, Src0Regs, HalfTy);
2257 
2258     if (Src1Regs.empty())
2259       split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg());
2260     else
2261       setRegsToType(MRI, Src1Regs, HalfTy);
2262 
2263     setRegsToType(MRI, DefRegs, HalfTy);
2264 
2265     B.buildInstr(Opc, {DefRegs[0]}, {Src0Regs[0], Src1Regs[0]});
2266     B.buildInstr(Opc, {DefRegs[1]}, {Src0Regs[1], Src1Regs[1]});
2267 
2268     MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
2269     MI.eraseFromParent();
2270     return;
2271   }
2272   case AMDGPU::G_ABS: {
2273     Register SrcReg = MI.getOperand(1).getReg();
2274     const RegisterBank *SrcBank = MRI.getRegBankOrNull(SrcReg);
2275 
2276     // There is no VALU abs instruction so we need to replace it with a sub and
2277     // max combination.
2278     if (SrcBank && SrcBank == &AMDGPU::VGPRRegBank) {
2279       MachineFunction *MF = MI.getParent()->getParent();
2280       ApplyRegBankMapping Apply(*this, MRI, &AMDGPU::VGPRRegBank);
2281       MachineIRBuilder B(MI, Apply);
2282       LegalizerHelper Helper(*MF, Apply, B);
2283 
2284       if (Helper.lowerAbsToMaxNeg(MI) != LegalizerHelper::Legalized)
2285         llvm_unreachable("lowerAbsToMaxNeg should have succeeded");
2286       return;
2287     }
2288     LLVM_FALLTHROUGH;
2289   }
2290   case AMDGPU::G_ADD:
2291   case AMDGPU::G_SUB:
2292   case AMDGPU::G_MUL:
2293   case AMDGPU::G_SHL:
2294   case AMDGPU::G_LSHR:
2295   case AMDGPU::G_ASHR:
2296   case AMDGPU::G_SMIN:
2297   case AMDGPU::G_SMAX:
2298   case AMDGPU::G_UMIN:
2299   case AMDGPU::G_UMAX: {
2300     Register DstReg = MI.getOperand(0).getReg();
2301     LLT DstTy = MRI.getType(DstReg);
2302 
2303     // 16-bit operations are VALU only, but can be promoted to 32-bit SALU.
2304     // Packed 16-bit operations need to be scalarized and promoted.
2305     if (DstTy != LLT::scalar(16) && DstTy != LLT::fixed_vector(2, 16))
2306       break;
2307 
2308     const RegisterBank *DstBank =
2309       OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2310     if (DstBank == &AMDGPU::VGPRRegBank)
2311       break;
2312 
2313     const LLT S32 = LLT::scalar(32);
2314     MachineBasicBlock *MBB = MI.getParent();
2315     MachineFunction *MF = MBB->getParent();
2316     ApplyRegBankMapping ApplySALU(*this, MRI, &AMDGPU::SGPRRegBank);
2317     MachineIRBuilder B(MI, ApplySALU);
2318 
2319     if (DstTy.isVector()) {
2320       Register WideSrc0Lo, WideSrc0Hi;
2321       Register WideSrc1Lo, WideSrc1Hi;
2322 
2323       unsigned ExtendOp = getExtendOp(MI.getOpcode());
2324       std::tie(WideSrc0Lo, WideSrc0Hi)
2325         = unpackV2S16ToS32(B, MI.getOperand(1).getReg(), ExtendOp);
2326       std::tie(WideSrc1Lo, WideSrc1Hi)
2327         = unpackV2S16ToS32(B, MI.getOperand(2).getReg(), ExtendOp);
2328       auto Lo = B.buildInstr(MI.getOpcode(), {S32}, {WideSrc0Lo, WideSrc1Lo});
2329       auto Hi = B.buildInstr(MI.getOpcode(), {S32}, {WideSrc0Hi, WideSrc1Hi});
2330       B.buildBuildVectorTrunc(DstReg, {Lo.getReg(0), Hi.getReg(0)});
2331       MI.eraseFromParent();
2332     } else {
2333       LegalizerHelper Helper(*MF, ApplySALU, B);
2334 
2335       if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized)
2336         llvm_unreachable("widen scalar should have succeeded");
2337 
2338       // FIXME: s16 shift amounts should be legal.
2339       if (Opc == AMDGPU::G_SHL || Opc == AMDGPU::G_LSHR ||
2340           Opc == AMDGPU::G_ASHR) {
2341         B.setInsertPt(*MBB, MI.getIterator());
2342         if (Helper.widenScalar(MI, 1, S32) != LegalizerHelper::Legalized)
2343           llvm_unreachable("widen scalar should have succeeded");
2344       }
2345     }
2346 
2347     return;
2348   }
2349   case AMDGPU::G_SEXT_INREG: {
2350     SmallVector<Register, 2> SrcRegs(OpdMapper.getVRegs(1));
2351     if (SrcRegs.empty())
2352       break; // Nothing to repair
2353 
2354     const LLT S32 = LLT::scalar(32);
2355     MachineIRBuilder B(MI);
2356     ApplyRegBankMapping O(*this, MRI, &AMDGPU::VGPRRegBank);
2357     GISelObserverWrapper Observer(&O);
2358     B.setChangeObserver(Observer);
2359 
2360     // Don't use LegalizerHelper's narrowScalar. It produces unwanted G_SEXTs
2361     // we would need to further expand, and doesn't let us directly set the
2362     // result registers.
2363     SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0));
2364 
2365     int Amt = MI.getOperand(2).getImm();
2366     if (Amt <= 32) {
2367       if (Amt == 32) {
2368         // The low bits are unchanged.
2369         B.buildCopy(DstRegs[0], SrcRegs[0]);
2370       } else {
2371         // Extend in the low bits and propagate the sign bit to the high half.
2372         B.buildSExtInReg(DstRegs[0], SrcRegs[0], Amt);
2373       }
2374 
2375       B.buildAShr(DstRegs[1], DstRegs[0], B.buildConstant(S32, 31));
2376     } else {
2377       // The low bits are unchanged, and extend in the high bits.
2378       B.buildCopy(DstRegs[0], SrcRegs[0]);
2379       B.buildSExtInReg(DstRegs[1], DstRegs[0], Amt - 32);
2380     }
2381 
2382     Register DstReg = MI.getOperand(0).getReg();
2383     MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
2384     MI.eraseFromParent();
2385     return;
2386   }
2387   case AMDGPU::G_CTPOP:
2388   case AMDGPU::G_BITREVERSE: {
2389     const RegisterBank *DstBank =
2390       OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2391     if (DstBank == &AMDGPU::SGPRRegBank)
2392       break;
2393 
2394     Register SrcReg = MI.getOperand(1).getReg();
2395     const LLT S32 = LLT::scalar(32);
2396     LLT Ty = MRI.getType(SrcReg);
2397     if (Ty == S32)
2398       break;
2399 
2400     ApplyRegBankMapping ApplyVALU(*this, MRI, &AMDGPU::VGPRRegBank);
2401     MachineIRBuilder B(MI, ApplyVALU);
2402 
2403     MachineFunction &MF = B.getMF();
2404     LegalizerHelper Helper(MF, ApplyVALU, B);
2405 
2406     if (Helper.narrowScalar(MI, 1, S32) != LegalizerHelper::Legalized)
2407       llvm_unreachable("narrowScalar should have succeeded");
2408     return;
2409   }
2410   case AMDGPU::G_AMDGPU_FFBH_U32:
2411   case AMDGPU::G_AMDGPU_FFBL_B32:
2412   case AMDGPU::G_CTLZ_ZERO_UNDEF:
2413   case AMDGPU::G_CTTZ_ZERO_UNDEF: {
2414     const RegisterBank *DstBank =
2415         OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2416     if (DstBank == &AMDGPU::SGPRRegBank)
2417       break;
2418 
2419     Register SrcReg = MI.getOperand(1).getReg();
2420     const LLT S32 = LLT::scalar(32);
2421     LLT Ty = MRI.getType(SrcReg);
2422     if (Ty == S32)
2423       break;
2424 
2425     // We can narrow this more efficiently than Helper can by using ffbh/ffbl
2426     // which return -1 when the input is zero:
2427     // (ctlz_zero_undef hi:lo) -> (umin (ffbh hi), (add (ffbh lo), 32))
2428     // (cttz_zero_undef hi:lo) -> (umin (add (ffbl hi), 32), (ffbl lo))
2429     // (ffbh hi:lo) -> (umin (ffbh hi), (uaddsat (ffbh lo), 32))
    // (ffbl hi:lo) -> (umin (uaddsat (ffbl hi), 32), (ffbl lo))
2431     ApplyRegBankMapping ApplyVALU(*this, MRI, &AMDGPU::VGPRRegBank);
2432     MachineIRBuilder B(MI, ApplyVALU);
2433     SmallVector<Register, 2> SrcRegs(OpdMapper.getVRegs(1));
2434     unsigned NewOpc = Opc == AMDGPU::G_CTLZ_ZERO_UNDEF
2435                           ? (unsigned)AMDGPU::G_AMDGPU_FFBH_U32
2436                           : Opc == AMDGPU::G_CTTZ_ZERO_UNDEF
2437                                 ? (unsigned)AMDGPU::G_AMDGPU_FFBL_B32
2438                                 : Opc;
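    // ffbh scans from the MSB, so count the high half directly; ffbl scans
    // from the LSB, so count the low half directly. The other half gets the
    // +32 adjustment.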
2439     unsigned Idx = NewOpc == AMDGPU::G_AMDGPU_FFBH_U32;
2440     auto X = B.buildInstr(NewOpc, {S32}, {SrcRegs[Idx]});
2441     auto Y = B.buildInstr(NewOpc, {S32}, {SrcRegs[Idx ^ 1]});
2442     unsigned AddOpc =
2443         Opc == AMDGPU::G_CTLZ_ZERO_UNDEF || Opc == AMDGPU::G_CTTZ_ZERO_UNDEF
2444             ? AMDGPU::G_ADD
2445             : AMDGPU::G_UADDSAT;
2446     Y = B.buildInstr(AddOpc, {S32}, {Y, B.buildConstant(S32, 32)});
2447     Register DstReg = MI.getOperand(0).getReg();
2448     B.buildUMin(DstReg, X, Y);
2449     MI.eraseFromParent();
2450     return;
2451   }
2452   case AMDGPU::G_SEXT:
2453   case AMDGPU::G_ZEXT:
2454   case AMDGPU::G_ANYEXT: {
2455     Register SrcReg = MI.getOperand(1).getReg();
2456     LLT SrcTy = MRI.getType(SrcReg);
2457     const bool Signed = Opc == AMDGPU::G_SEXT;
2458 
2459     assert(empty(OpdMapper.getVRegs(1)));
2460 
2461     MachineIRBuilder B(MI);
2462     const RegisterBank *SrcBank =
2463       OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
2464 
2465     Register DstReg = MI.getOperand(0).getReg();
2466     LLT DstTy = MRI.getType(DstReg);
2467     if (DstTy.isScalar() &&
2468         SrcBank != &AMDGPU::SGPRRegBank &&
2469         SrcBank != &AMDGPU::VCCRegBank &&
        // FIXME: Should handle any type that rounds to s64 when irregular
        // breakdowns are supported.
2472         DstTy.getSizeInBits() == 64 &&
2473         SrcTy.getSizeInBits() <= 32) {
2474       SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
2475 
2476       // Extend to 32-bit, and then extend the low half.
2477       if (Signed) {
2478         // TODO: Should really be buildSExtOrCopy
2479         B.buildSExtOrTrunc(DefRegs[0], SrcReg);
2480       } else if (Opc == AMDGPU::G_ZEXT) {
2481         B.buildZExtOrTrunc(DefRegs[0], SrcReg);
2482       } else {
2483         B.buildAnyExtOrTrunc(DefRegs[0], SrcReg);
2484       }
2485 
2486       extendLow32IntoHigh32(B, DefRegs[1], DefRegs[0], Opc, *SrcBank);
2487       MRI.setRegBank(DstReg, *SrcBank);
2488       MI.eraseFromParent();
2489       return;
2490     }
2491 
2492     if (SrcTy != LLT::scalar(1))
2493       return;
2494 
    // It is not legal to have a legalization artifact with a VCC source.
    // Rather than introducing a copy, directly insert the select that such a
    // copy would have been lowered to.
2498     if (SrcBank == &AMDGPU::VCCRegBank) {
2499       SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
2500 
2501       const RegisterBank *DstBank = &AMDGPU::VGPRRegBank;
2502 
2503       unsigned DstSize = DstTy.getSizeInBits();
2504       // 64-bit select is SGPR only
2505       const bool UseSel64 = DstSize > 32 &&
2506         SrcBank->getID() == AMDGPU::SGPRRegBankID;
2507 
2508       // TODO: Should s16 select be legal?
2509       LLT SelType = UseSel64 ? LLT::scalar(64) : LLT::scalar(32);
2510       auto True = B.buildConstant(SelType, Signed ? -1 : 1);
2511       auto False = B.buildConstant(SelType, 0);
2512 
2513       MRI.setRegBank(True.getReg(0), *DstBank);
2514       MRI.setRegBank(False.getReg(0), *DstBank);
2515       MRI.setRegBank(DstReg, *DstBank);
2516 
2517       if (DstSize > 32) {
2518         B.buildSelect(DefRegs[0], SrcReg, True, False);
2519         extendLow32IntoHigh32(B, DefRegs[1], DefRegs[0], Opc, *SrcBank, true);
2520       } else if (DstSize < 32) {
2521         auto Sel = B.buildSelect(SelType, SrcReg, True, False);
2522         MRI.setRegBank(Sel.getReg(0), *DstBank);
2523         B.buildTrunc(DstReg, Sel);
2524       } else {
2525         B.buildSelect(DstReg, SrcReg, True, False);
2526       }
2527 
2528       MI.eraseFromParent();
2529       return;
2530     }
2531 
2532     break;
2533   }
2534   case AMDGPU::G_BUILD_VECTOR:
2535   case AMDGPU::G_BUILD_VECTOR_TRUNC: {
2536     Register DstReg = MI.getOperand(0).getReg();
2537     LLT DstTy = MRI.getType(DstReg);
2538     if (DstTy != LLT::fixed_vector(2, 16))
2539       break;
2540 
2541     assert(MI.getNumOperands() == 3 && OpdMapper.getVRegs(0).empty());
2542     substituteSimpleCopyRegs(OpdMapper, 1);
2543     substituteSimpleCopyRegs(OpdMapper, 2);
2544 
2545     const RegisterBank *DstBank =
2546       OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2547     if (DstBank == &AMDGPU::SGPRRegBank)
2548       break; // Can use S_PACK_* instructions.
2549 
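    // For a VGPR result, pack the two halves manually: zero-extend or mask the
    // low half, shift the high half left by 16, or them together, and bitcast
    // to <2 x s16>.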
2550     MachineIRBuilder B(MI);
2551 
2552     Register Lo = MI.getOperand(1).getReg();
2553     Register Hi = MI.getOperand(2).getReg();
2554     const LLT S32 = LLT::scalar(32);
2555 
2556     const RegisterBank *BankLo =
2557       OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
2558     const RegisterBank *BankHi =
2559       OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
2560 
2561     Register ZextLo;
2562     Register ShiftHi;
2563 
2564     if (Opc == AMDGPU::G_BUILD_VECTOR) {
2565       ZextLo = B.buildZExt(S32, Lo).getReg(0);
2566       MRI.setRegBank(ZextLo, *BankLo);
2567 
2568       Register ZextHi = B.buildZExt(S32, Hi).getReg(0);
2569       MRI.setRegBank(ZextHi, *BankHi);
2570 
2571       auto ShiftAmt = B.buildConstant(S32, 16);
2572       MRI.setRegBank(ShiftAmt.getReg(0), *BankHi);
2573 
2574       ShiftHi = B.buildShl(S32, ZextHi, ShiftAmt).getReg(0);
2575       MRI.setRegBank(ShiftHi, *BankHi);
2576     } else {
2577       Register MaskLo = B.buildConstant(S32, 0xffff).getReg(0);
2578       MRI.setRegBank(MaskLo, *BankLo);
2579 
2580       auto ShiftAmt = B.buildConstant(S32, 16);
2581       MRI.setRegBank(ShiftAmt.getReg(0), *BankHi);
2582 
2583       ShiftHi = B.buildShl(S32, Hi, ShiftAmt).getReg(0);
2584       MRI.setRegBank(ShiftHi, *BankHi);
2585 
2586       ZextLo = B.buildAnd(S32, Lo, MaskLo).getReg(0);
2587       MRI.setRegBank(ZextLo, *BankLo);
2588     }
2589 
2590     auto Or = B.buildOr(S32, ZextLo, ShiftHi);
2591     MRI.setRegBank(Or.getReg(0), *DstBank);
2592 
2593     B.buildBitcast(DstReg, Or);
2594     MI.eraseFromParent();
2595     return;
2596   }
2597   case AMDGPU::G_EXTRACT_VECTOR_ELT: {
2598     SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0));
2599 
2600     assert(OpdMapper.getVRegs(1).empty() && OpdMapper.getVRegs(2).empty());
2601 
2602     Register DstReg = MI.getOperand(0).getReg();
2603     Register SrcReg = MI.getOperand(1).getReg();
2604 
2605     const LLT S32 = LLT::scalar(32);
2606     LLT DstTy = MRI.getType(DstReg);
2607     LLT SrcTy = MRI.getType(SrcReg);
2608 
2609     if (foldExtractEltToCmpSelect(MI, MRI, OpdMapper))
2610       return;
2611 
2612     MachineIRBuilder B(MI);
2613 
2614     const ValueMapping &DstMapping
2615       = OpdMapper.getInstrMapping().getOperandMapping(0);
2616     const RegisterBank *DstBank = DstMapping.BreakDown[0].RegBank;
2617     const RegisterBank *SrcBank =
2618       OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
2619     const RegisterBank *IdxBank =
2620         OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
2621 
2622     Register BaseIdxReg;
2623     unsigned ConstOffset;
2624     std::tie(BaseIdxReg, ConstOffset) =
2625         AMDGPU::getBaseWithConstantOffset(MRI, MI.getOperand(2).getReg());
2626 
2627     // See if the index is an add of a constant which will be foldable by moving
2628     // the base register of the index later if this is going to be executed in a
2629     // waterfall loop. This is essentially to reassociate the add of a constant
2630     // with the readfirstlane.
2631     bool ShouldMoveIndexIntoLoop = IdxBank != &AMDGPU::SGPRRegBank &&
2632                                    ConstOffset > 0 &&
2633                                    ConstOffset < SrcTy.getNumElements();
2634 
2635     // Move the base register. We'll re-insert the add later.
2636     if (ShouldMoveIndexIntoLoop)
2637       MI.getOperand(2).setReg(BaseIdxReg);
2638 
2639     // If this is a VGPR result only because the index was a VGPR result, the
2640     // actual indexing will be done on the SGPR source vector, which will
2641     // produce a scalar result. We need to copy to the VGPR result inside the
2642     // waterfall loop.
2643     const bool NeedCopyToVGPR = DstBank == &AMDGPU::VGPRRegBank &&
2644                                 SrcBank == &AMDGPU::SGPRRegBank;
2645     if (DstRegs.empty()) {
2646       applyDefaultMapping(OpdMapper);
2647 
2648       executeInWaterfallLoop(MI, MRI, { 2 });
2649 
2650       if (NeedCopyToVGPR) {
2651         // We don't want a phi for this temporary reg.
2652         Register TmpReg = MRI.createGenericVirtualRegister(DstTy);
2653         MRI.setRegBank(TmpReg, AMDGPU::SGPRRegBank);
2654         MI.getOperand(0).setReg(TmpReg);
2655         B.setInsertPt(*MI.getParent(), ++MI.getIterator());
2656 
2657         // Use a v_mov_b32 here to make the exec dependency explicit.
2658         buildVCopy(B, DstReg, TmpReg);
2659       }
2660 
2661       // Re-insert the constant offset add inside the waterfall loop.
2662       if (ShouldMoveIndexIntoLoop)
2663         reinsertVectorIndexAdd(B, MI, 2, ConstOffset);
2664 
2665       return;
2666     }
2667 
2668     assert(DstTy.getSizeInBits() == 64);
2669 
2670     LLT Vec32 = LLT::fixed_vector(2 * SrcTy.getNumElements(), 32);
2671 
2672     auto CastSrc = B.buildBitcast(Vec32, SrcReg);
2673     auto One = B.buildConstant(S32, 1);
2674 
2675     MachineBasicBlock::iterator MII = MI.getIterator();
2676 
2677     // Split the vector index into 32-bit pieces. Prepare to move all of the
2678     // new instructions into a waterfall loop if necessary.
2679     //
2680     // Don't put the bitcast or constant in the loop.
2681     MachineInstrSpan Span(MII, &B.getMBB());
2682 
2683     // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1).
2684     auto IdxLo = B.buildShl(S32, BaseIdxReg, One);
2685     auto IdxHi = B.buildAdd(S32, IdxLo, One);
2686 
2687     auto Extract0 = B.buildExtractVectorElement(DstRegs[0], CastSrc, IdxLo);
2688     auto Extract1 = B.buildExtractVectorElement(DstRegs[1], CastSrc, IdxHi);
2689 
2690     MRI.setRegBank(DstReg, *DstBank);
2691     MRI.setRegBank(CastSrc.getReg(0), *SrcBank);
2692     MRI.setRegBank(One.getReg(0), AMDGPU::SGPRRegBank);
2693     MRI.setRegBank(IdxLo.getReg(0), AMDGPU::SGPRRegBank);
2694     MRI.setRegBank(IdxHi.getReg(0), AMDGPU::SGPRRegBank);
2695 
2696     SmallSet<Register, 4> OpsToWaterfall;
2697     if (!collectWaterfallOperands(OpsToWaterfall, MI, MRI, { 2 })) {
2698       MI.eraseFromParent();
2699       return;
2700     }
2701 
2702     // Remove the original instruction to avoid potentially confusing the
2703     // waterfall loop logic.
2704     B.setInstr(*Span.begin());
2705     MI.eraseFromParent();
2706     executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()),
2707                            OpsToWaterfall, MRI);
2708 
2709     if (NeedCopyToVGPR) {
2710       MachineBasicBlock *LoopBB = Extract1->getParent();
2711       Register TmpReg0 = MRI.createGenericVirtualRegister(S32);
2712       Register TmpReg1 = MRI.createGenericVirtualRegister(S32);
2713       MRI.setRegBank(TmpReg0, AMDGPU::SGPRRegBank);
2714       MRI.setRegBank(TmpReg1, AMDGPU::SGPRRegBank);
2715 
2716       Extract0->getOperand(0).setReg(TmpReg0);
2717       Extract1->getOperand(0).setReg(TmpReg1);
2718 
2719       B.setInsertPt(*LoopBB, ++Extract1->getIterator());
2720 
2721       buildVCopy(B, DstRegs[0], TmpReg0);
2722       buildVCopy(B, DstRegs[1], TmpReg1);
2723     }
2724 
2725     if (ShouldMoveIndexIntoLoop)
2726       reinsertVectorIndexAdd(B, *IdxLo, 1, ConstOffset);
2727 
2728     return;
2729   }
2730   case AMDGPU::G_INSERT_VECTOR_ELT: {
2731     SmallVector<Register, 2> InsRegs(OpdMapper.getVRegs(2));
2732 
2733     Register DstReg = MI.getOperand(0).getReg();
2734     LLT VecTy = MRI.getType(DstReg);
2735 
2736     assert(OpdMapper.getVRegs(0).empty());
2737     assert(OpdMapper.getVRegs(3).empty());
2738 
2739     if (substituteSimpleCopyRegs(OpdMapper, 1))
2740       MRI.setType(MI.getOperand(1).getReg(), VecTy);
2741 
2742     if (foldInsertEltToCmpSelect(MI, MRI, OpdMapper))
2743       return;
2744 
2745     const RegisterBank *IdxBank =
2746       OpdMapper.getInstrMapping().getOperandMapping(3).BreakDown[0].RegBank;
2747 
2748     Register SrcReg = MI.getOperand(1).getReg();
2749     Register InsReg = MI.getOperand(2).getReg();
2750     LLT InsTy = MRI.getType(InsReg);
2751     (void)InsTy;
2752 
2753     Register BaseIdxReg;
2754     unsigned ConstOffset;
2755     std::tie(BaseIdxReg, ConstOffset) =
2756         AMDGPU::getBaseWithConstantOffset(MRI, MI.getOperand(3).getReg());
2757 
    // See if the index is an add of a constant that can be folded by moving the
    // base register of the index later, if this is going to be executed in a
    // waterfall loop. This essentially reassociates the constant add with the
    // readfirstlane.
2762     bool ShouldMoveIndexIntoLoop = IdxBank != &AMDGPU::SGPRRegBank &&
2763       ConstOffset > 0 &&
2764       ConstOffset < VecTy.getNumElements();
2765 
2766     // Move the base register. We'll re-insert the add later.
2767     if (ShouldMoveIndexIntoLoop)
      MI.getOperand(3).setReg(BaseIdxReg);

2771     if (InsRegs.empty()) {
2772       executeInWaterfallLoop(MI, MRI, { 3 });
2773 
2774       // Re-insert the constant offset add inside the waterfall loop.
2775       if (ShouldMoveIndexIntoLoop) {
2776         MachineIRBuilder B(MI);
2777         reinsertVectorIndexAdd(B, MI, 3, ConstOffset);
2778       }
2779 
2780       return;
    }

2784     assert(InsTy.getSizeInBits() == 64);
2785 
2786     const LLT S32 = LLT::scalar(32);
2787     LLT Vec32 = LLT::fixed_vector(2 * VecTy.getNumElements(), 32);
2788 
2789     MachineIRBuilder B(MI);
2790     auto CastSrc = B.buildBitcast(Vec32, SrcReg);
2791     auto One = B.buildConstant(S32, 1);
2792 
2793     // Split the vector index into 32-bit pieces. Prepare to move all of the
2794     // new instructions into a waterfall loop if necessary.
2795     //
2796     // Don't put the bitcast or constant in the loop.
2797     MachineInstrSpan Span(MachineBasicBlock::iterator(&MI), &B.getMBB());
2798 
2799     // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1).
2800     auto IdxLo = B.buildShl(S32, BaseIdxReg, One);
2801     auto IdxHi = B.buildAdd(S32, IdxLo, One);
2802 
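    // Insert the two 32-bit halves of the value at the split indices.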
2803     auto InsLo = B.buildInsertVectorElement(Vec32, CastSrc, InsRegs[0], IdxLo);
2804     auto InsHi = B.buildInsertVectorElement(Vec32, InsLo, InsRegs[1], IdxHi);
2805 
2806     const RegisterBank *DstBank =
2807       OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2808     const RegisterBank *SrcBank =
2809       OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
2810     const RegisterBank *InsSrcBank =
2811       OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
2812 
2813     MRI.setRegBank(InsReg, *InsSrcBank);
2814     MRI.setRegBank(CastSrc.getReg(0), *SrcBank);
2815     MRI.setRegBank(InsLo.getReg(0), *DstBank);
2816     MRI.setRegBank(InsHi.getReg(0), *DstBank);
2817     MRI.setRegBank(One.getReg(0), AMDGPU::SGPRRegBank);
2818     MRI.setRegBank(IdxLo.getReg(0), AMDGPU::SGPRRegBank);
    MRI.setRegBank(IdxHi.getReg(0), AMDGPU::SGPRRegBank);

2822     SmallSet<Register, 4> OpsToWaterfall;
2823     if (!collectWaterfallOperands(OpsToWaterfall, MI, MRI, { 3 })) {
2824       B.setInsertPt(B.getMBB(), MI);
2825       B.buildBitcast(DstReg, InsHi);
2826       MI.eraseFromParent();
2827       return;
2828     }
2829 
2830     B.setInstr(*Span.begin());
2831     MI.eraseFromParent();
2832 
2833     // Figure out the point after the waterfall loop before mangling the control
2834     // flow.
2835     executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()),
2836                            OpsToWaterfall, MRI);
2837 
2838     // The insertion point is now right after the original instruction.
2839     //
    // Keep the bitcast to the original vector type out of the loop. Doing this
    // saves an extra phi we don't need inside the loop.
2842     B.buildBitcast(DstReg, InsHi);
2843 
2844     // Re-insert the constant offset add inside the waterfall loop.
2845     if (ShouldMoveIndexIntoLoop)
2846       reinsertVectorIndexAdd(B, *IdxLo, 1, ConstOffset);
2847 
2848     return;
2849   }
2850   case AMDGPU::G_AMDGPU_BUFFER_LOAD:
2851   case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
2852   case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT:
2853   case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
2854   case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE:
2855   case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT:
2856   case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16:
2857   case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT:
2858   case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16:
2859   case AMDGPU::G_AMDGPU_BUFFER_STORE:
2860   case AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE:
2861   case AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT:
2862   case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT:
2863   case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16:
2864   case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT:
2865   case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16: {
2866     applyDefaultMapping(OpdMapper);
2867     executeInWaterfallLoop(MI, MRI, {1, 4});
2868     return;
2869   }
2870   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP:
2871   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD:
2872   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB:
2873   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN:
2874   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN:
2875   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX:
2876   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX:
2877   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND:
2878   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR:
2879   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR:
2880   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC:
2881   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC: {
2882     applyDefaultMapping(OpdMapper);
2883     executeInWaterfallLoop(MI, MRI, {2, 5});
2884     return;
2885   }
2886   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD:
2887   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN:
2888   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX: {
2889     applyDefaultMapping(OpdMapper);
2890     executeInWaterfallLoop(MI, MRI, {2, 5});
2891     return;
2892   }
2893   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP: {
2894     applyDefaultMapping(OpdMapper);
2895     executeInWaterfallLoop(MI, MRI, {3, 6});
2896     return;
2897   }
2898   case AMDGPU::G_AMDGPU_S_BUFFER_LOAD: {
2899     applyMappingSBufferLoad(OpdMapper);
2900     return;
2901   }
2902   case AMDGPU::G_INTRINSIC: {
2903     switch (MI.getIntrinsicID()) {
2904     case Intrinsic::amdgcn_readlane: {
2905       substituteSimpleCopyRegs(OpdMapper, 2);
2906 
2907       assert(OpdMapper.getVRegs(0).empty());
2908       assert(OpdMapper.getVRegs(3).empty());
2909 
2910       // Make sure the index is an SGPR. It doesn't make sense to run this in a
2911       // waterfall loop, so assume it's a uniform value.
2912       constrainOpWithReadfirstlane(MI, MRI, 3); // Index
2913       return;
2914     }
2915     case Intrinsic::amdgcn_writelane: {
2916       assert(OpdMapper.getVRegs(0).empty());
2917       assert(OpdMapper.getVRegs(2).empty());
2918       assert(OpdMapper.getVRegs(3).empty());
2919 
2920       substituteSimpleCopyRegs(OpdMapper, 4); // VGPR input val
2921       constrainOpWithReadfirstlane(MI, MRI, 2); // Source value
2922       constrainOpWithReadfirstlane(MI, MRI, 3); // Index
2923       return;
2924     }
2925     case Intrinsic::amdgcn_interp_p1:
2926     case Intrinsic::amdgcn_interp_p2:
2927     case Intrinsic::amdgcn_interp_mov:
2928     case Intrinsic::amdgcn_interp_p1_f16:
2929     case Intrinsic::amdgcn_interp_p2_f16: {
2930       applyDefaultMapping(OpdMapper);
2931 
2932       // Readlane for m0 value, which is always the last operand.
2933       // FIXME: Should this be a waterfall loop instead?
2934       constrainOpWithReadfirstlane(MI, MRI, MI.getNumOperands() - 1); // Index
2935       return;
2936     }
2937     case Intrinsic::amdgcn_permlane16:
2938     case Intrinsic::amdgcn_permlanex16: {
2939       // Doing a waterfall loop over these wouldn't make any sense.
2940       substituteSimpleCopyRegs(OpdMapper, 2);
2941       substituteSimpleCopyRegs(OpdMapper, 3);
2942       constrainOpWithReadfirstlane(MI, MRI, 4);
2943       constrainOpWithReadfirstlane(MI, MRI, 5);
2944       return;
2945     }
2946     case Intrinsic::amdgcn_sbfe:
2947       applyMappingBFE(OpdMapper, true);
2948       return;
2949     case Intrinsic::amdgcn_ubfe:
2950       applyMappingBFE(OpdMapper, false);
2951       return;
2952     case Intrinsic::amdgcn_ballot:
2953       // Use default handling and insert copy to vcc source.
2954       break;
2955     }
2956     break;
2957   }
2958   case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
2959   case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16:
2960   case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE:
2961   case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16: {
2962     const AMDGPU::RsrcIntrinsic *RSrcIntrin
2963       = AMDGPU::lookupRsrcIntrinsic(MI.getIntrinsicID());
2964     assert(RSrcIntrin && RSrcIntrin->IsImage);
    // Only image intrinsics are handled here. Non-images can have complications
    // from operands that allow both SGPR and VGPR, and for now it's too
    // complicated to figure out the final opcode to derive the register bank
    // from the MCInstrDesc.
2968     applyMappingImage(MI, OpdMapper, MRI, RSrcIntrin->RsrcArg);
2969     return;
2970   }
2971   case AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY: {
2972     unsigned N = MI.getNumExplicitOperands() - 2;
2973     applyDefaultMapping(OpdMapper);
2974     executeInWaterfallLoop(MI, MRI, { N });
2975     return;
2976   }
2977   case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: {
2978     auto IntrID = MI.getIntrinsicID();
2979     switch (IntrID) {
2980     case Intrinsic::amdgcn_ds_ordered_add:
2981     case Intrinsic::amdgcn_ds_ordered_swap: {
2982       // This is only allowed to execute with 1 lane, so readfirstlane is safe.
2983       assert(OpdMapper.getVRegs(0).empty());
2984       substituteSimpleCopyRegs(OpdMapper, 3);
2985       constrainOpWithReadfirstlane(MI, MRI, 2); // M0
2986       return;
2987     }
2988     case Intrinsic::amdgcn_ds_gws_init:
2989     case Intrinsic::amdgcn_ds_gws_barrier:
2990     case Intrinsic::amdgcn_ds_gws_sema_br: {
      // Only the first lane executes, so readfirstlane is safe.
2992       substituteSimpleCopyRegs(OpdMapper, 1);
2993       constrainOpWithReadfirstlane(MI, MRI, 2); // M0
2994       return;
2995     }
2996     case Intrinsic::amdgcn_ds_gws_sema_v:
2997     case Intrinsic::amdgcn_ds_gws_sema_p:
2998     case Intrinsic::amdgcn_ds_gws_sema_release_all: {
      // Only the first lane executes, so readfirstlane is safe.
3000       constrainOpWithReadfirstlane(MI, MRI, 1); // M0
3001       return;
3002     }
3003     case Intrinsic::amdgcn_ds_append:
3004     case Intrinsic::amdgcn_ds_consume: {
3005       constrainOpWithReadfirstlane(MI, MRI, 2); // M0
3006       return;
3007     }
3008     case Intrinsic::amdgcn_s_sendmsg:
3009     case Intrinsic::amdgcn_s_sendmsghalt: {
3010       // FIXME: Should this use a waterfall loop?
3011       constrainOpWithReadfirstlane(MI, MRI, 2); // M0
3012       return;
3013     }
3014     case Intrinsic::amdgcn_s_setreg: {
3015       constrainOpWithReadfirstlane(MI, MRI, 2);
3016       return;
3017     }
3018     default: {
3019       if (const AMDGPU::RsrcIntrinsic *RSrcIntrin =
3020               AMDGPU::lookupRsrcIntrinsic(IntrID)) {
3021         // Non-images can have complications from operands that allow both SGPR
3022         // and VGPR. For now it's too complicated to figure out the final opcode
3023         // to derive the register bank from the MCInstrDesc.
3024         if (RSrcIntrin->IsImage) {
3025           applyMappingImage(MI, OpdMapper, MRI, RSrcIntrin->RsrcArg);
3026           return;
3027         }
3028       }
3029 
3030       break;
3031     }
3032     }
3033     break;
3034   }
3035   case AMDGPU::G_SI_CALL: {
3036     // Use a set to avoid extra readfirstlanes in the case where multiple
3037     // operands are the same register.
3038     SmallSet<Register, 4> SGPROperandRegs;
3039 
3040     if (!collectWaterfallOperands(SGPROperandRegs, MI, MRI, {1}))
3041       break;
3042 
    // Move all copies to physical SGPRs that are used by the call instruction
    // into the loop block. Search backwards from the call for these copies
    // until the ADJCALLSTACKUP frame setup instruction is reached.
3046     unsigned FrameSetupOpcode = AMDGPU::ADJCALLSTACKUP;
3047     unsigned FrameDestroyOpcode = AMDGPU::ADJCALLSTACKDOWN;
3048 
3049     // Move all non-copies before the copies, so that a complete range can be
3050     // moved into the waterfall loop.
3051     SmallVector<MachineInstr *, 4> NonCopyInstrs;
3052     // Count of NonCopyInstrs found until the current LastCopy.
3053     unsigned NonCopyInstrsLen = 0;
3054     MachineBasicBlock::iterator Start(&MI);
3055     MachineBasicBlock::iterator LastCopy = Start;
3056     MachineBasicBlock *MBB = MI.getParent();
3057     const SIMachineFunctionInfo *Info =
3058         MBB->getParent()->getInfo<SIMachineFunctionInfo>();
3059     while (Start->getOpcode() != FrameSetupOpcode) {
3060       --Start;
3061       bool IsCopy = false;
3062       if (Start->getOpcode() == AMDGPU::COPY) {
3063         auto &Dst = Start->getOperand(0);
3064         if (Dst.isReg()) {
3065           Register Reg = Dst.getReg();
3066           if (Reg.isPhysical() && MI.readsRegister(Reg, TRI)) {
3067             IsCopy = true;
3068           } else {
3069             // Also move the copy from the scratch rsrc descriptor into the loop
3070             // to allow it to be optimized away.
3071             auto &Src = Start->getOperand(1);
3072             if (Src.isReg()) {
3073               Reg = Src.getReg();
3074               IsCopy = Info->getScratchRSrcReg() == Reg;
3075             }
3076           }
3077         }
3078       }
3079 
3080       if (IsCopy) {
3081         LastCopy = Start;
3082         NonCopyInstrsLen = NonCopyInstrs.size();
3083       } else {
3084         NonCopyInstrs.push_back(&*Start);
3085       }
3086     }
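    // Only the non-copies found between the call and the furthest copy need to
    // be moved above the copies; anything collected beyond that copy already
    // sits above them and can stay in place.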
3087     NonCopyInstrs.resize(NonCopyInstrsLen);
3088 
3089     for (auto *NonCopy : reverse(NonCopyInstrs)) {
3090       MBB->splice(LastCopy, MBB, NonCopy->getIterator());
3091     }
3092     Start = LastCopy;
3093 
    // Do the same for the copies that follow the call.
3095     NonCopyInstrs.clear();
3096     NonCopyInstrsLen = 0;
3097     MachineBasicBlock::iterator End(&MI);
3098     LastCopy = End;
3099     while (End->getOpcode() != FrameDestroyOpcode) {
3100       ++End;
3101       bool IsCopy = false;
3102       if (End->getOpcode() == AMDGPU::COPY) {
3103         auto &Src = End->getOperand(1);
3104         if (Src.isReg()) {
3105           Register Reg = Src.getReg();
3106           IsCopy = Reg.isPhysical() && MI.modifiesRegister(Reg, TRI);
3107         }
3108       }
3109 
3110       if (IsCopy) {
3111         LastCopy = End;
3112         NonCopyInstrsLen = NonCopyInstrs.size();
3113       } else {
3114         NonCopyInstrs.push_back(&*End);
3115       }
3116     }
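    // Likewise, only keep the non-copies found between the call and the last
    // trailing copy; anything beyond it already sits below the copies.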
3117     NonCopyInstrs.resize(NonCopyInstrsLen);
3118 
3119     End = LastCopy;
3120     ++LastCopy;
3121     for (auto *NonCopy : reverse(NonCopyInstrs)) {
3122       MBB->splice(LastCopy, MBB, NonCopy->getIterator());
3123     }
3124 
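    // Step past the last trailing copy so the waterfall range includes it.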
3125     ++End;
3126     MachineIRBuilder B(*Start);
3127     executeInWaterfallLoop(B, make_range(Start, End), SGPROperandRegs, MRI);
3128     break;
3129   }
3130   case AMDGPU::G_LOAD:
3131   case AMDGPU::G_ZEXTLOAD:
3132   case AMDGPU::G_SEXTLOAD: {
3133     if (applyMappingLoad(MI, OpdMapper, MRI))
3134       return;
3135     break;
3136   }
3137   case AMDGPU::G_DYN_STACKALLOC:
3138     applyMappingDynStackAlloc(MI, OpdMapper, MRI);
3139     return;
3140   case AMDGPU::G_SBFX:
3141     applyMappingBFE(OpdMapper, /*Signed*/ true);
3142     return;
3143   case AMDGPU::G_UBFX:
3144     applyMappingBFE(OpdMapper, /*Signed*/ false);
3145     return;
3146   default:
3147     break;
3148   }
3149 
3150   return applyDefaultMapping(OpdMapper);
3151 }
3152 
// sgpr, sgpr -> sgpr
// vgpr, sgpr -> vgpr
// vgpr, agpr -> vgpr
// agpr, agpr -> agpr
// agpr, sgpr -> vgpr
3157 static unsigned regBankUnion(unsigned RB0, unsigned RB1) {
3158   if (RB0 == AMDGPU::InvalidRegBankID)
3159     return RB1;
3160   if (RB1 == AMDGPU::InvalidRegBankID)
3161     return RB0;
3162 
3163   if (RB0 == AMDGPU::SGPRRegBankID && RB1 == AMDGPU::SGPRRegBankID)
3164     return AMDGPU::SGPRRegBankID;
3165 
3166   if (RB0 == AMDGPU::AGPRRegBankID && RB1 == AMDGPU::AGPRRegBankID)
3167     return AMDGPU::AGPRRegBankID;
3168 
3169   return AMDGPU::VGPRRegBankID;
3170 }
3171 
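// Pick the result bank for a boolean (s1) value combined from two input banks:
// if either input is already in vcc, the result must stay in vcc; otherwise
// fall back to regBankUnion.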
3172 static unsigned regBankBoolUnion(unsigned RB0, unsigned RB1) {
3173   if (RB0 == AMDGPU::InvalidRegBankID)
3174     return RB1;
3175   if (RB1 == AMDGPU::InvalidRegBankID)
3176     return RB0;
3177 
3178   // vcc, vcc -> vcc
3179   // vcc, sgpr -> vcc
3180   // vcc, vgpr -> vcc
3181   if (RB0 == AMDGPU::VCCRegBankID || RB1 == AMDGPU::VCCRegBankID)
3182     return AMDGPU::VCCRegBankID;
3183 
  // Neither input is vcc; fall back to the ordinary bank union
  // (sgpr/vgpr/agpr).
3185   return regBankUnion(RB0, RB1);
3186 }
3187 
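// Return a bank covering all register operands of MI that already have a bank,
// computed with regBankUnion and stopping early once the answer is VGPR.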
3188 unsigned AMDGPURegisterBankInfo::getMappingType(const MachineRegisterInfo &MRI,
3189                                                 const MachineInstr &MI) const {
3190   unsigned RegBank = AMDGPU::InvalidRegBankID;
3191 
3192   for (const MachineOperand &MO : MI.operands()) {
3193     if (!MO.isReg())
3194       continue;
3195     Register Reg = MO.getReg();
3196     if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) {
3197       RegBank = regBankUnion(RegBank, Bank->getID());
3198       if (RegBank == AMDGPU::VGPRRegBankID)
3199         break;
3200     }
3201   }
3202 
3203   return RegBank;
3204 }
3205 
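// Return true if every register operand of MI that already has a bank assigned
// is in the SGPR bank, i.e. the instruction can use a pure scalar mapping.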
3206 bool AMDGPURegisterBankInfo::isSALUMapping(const MachineInstr &MI) const {
3207   const MachineFunction &MF = *MI.getParent()->getParent();
3208   const MachineRegisterInfo &MRI = MF.getRegInfo();
3209   for (const MachineOperand &MO : MI.operands()) {
3210     if (!MO.isReg())
3211       continue;
3212     Register Reg = MO.getReg();
3213     if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) {
3214       if (Bank->getID() != AMDGPU::SGPRRegBankID)
3215         return false;
3216     }
3217   }
3218   return true;
3219 }
3220 
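// Build a mapping that assigns every register operand of MI to the SGPR bank.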
3221 const RegisterBankInfo::InstructionMapping &
3222 AMDGPURegisterBankInfo::getDefaultMappingSOP(const MachineInstr &MI) const {
3223   const MachineFunction &MF = *MI.getParent()->getParent();
3224   const MachineRegisterInfo &MRI = MF.getRegInfo();
3225   SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
3226 
3227   for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
3228     const MachineOperand &SrcOp = MI.getOperand(i);
3229     if (!SrcOp.isReg())
3230       continue;
3231 
3232     unsigned Size = getSizeInBits(SrcOp.getReg(), MRI, *TRI);
3233     OpdsMapping[i] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
3234   }
3235   return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
3236                                MI.getNumOperands());
3237 }
3238 
3239 const RegisterBankInfo::InstructionMapping &
3240 AMDGPURegisterBankInfo::getDefaultMappingVOP(const MachineInstr &MI) const {
3241   const MachineFunction &MF = *MI.getParent()->getParent();
3242   const MachineRegisterInfo &MRI = MF.getRegInfo();
3243   SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
3244 
3245   // Even though we technically could use SGPRs, this would require knowledge of
3246   // the constant bus restriction. Force all sources to VGPR (except for VCC).
3247   //
3248   // TODO: Unary ops are trivially OK, so accept SGPRs?
3249   for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
3250     const MachineOperand &Src = MI.getOperand(i);
3251     if (!Src.isReg())
3252       continue;
3253 
3254     unsigned Size = getSizeInBits(Src.getReg(), MRI, *TRI);
3255     unsigned BankID = Size == 1 ? AMDGPU::VCCRegBankID : AMDGPU::VGPRRegBankID;
3256     OpdsMapping[i] = AMDGPU::getValueMapping(BankID, Size);
3257   }
3258 
3259   return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
3260                                MI.getNumOperands());
3261 }
3262 
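// Build a mapping that assigns every register operand of MI to the VGPR bank.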
3263 const RegisterBankInfo::InstructionMapping &
3264 AMDGPURegisterBankInfo::getDefaultMappingAllVGPR(const MachineInstr &MI) const {
3265   const MachineFunction &MF = *MI.getParent()->getParent();
3266   const MachineRegisterInfo &MRI = MF.getRegInfo();
3267   SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
3268 
3269   for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
3270     const MachineOperand &Op = MI.getOperand(I);
3271     if (!Op.isReg())
3272       continue;
3273 
3274     unsigned Size = getSizeInBits(Op.getReg(), MRI, *TRI);
3275     OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3276   }
3277 
3278   return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
3279                                MI.getNumOperands());
3280 }
3281 
3282 const RegisterBankInfo::InstructionMapping &
3283 AMDGPURegisterBankInfo::getImageMapping(const MachineRegisterInfo &MRI,
3284                                         const MachineInstr &MI,
3285                                         int RsrcIdx) const {
3286   // The reported argument index is relative to the IR intrinsic call arguments,
3287   // so we need to shift by the number of defs and the intrinsic ID.
3288   RsrcIdx += MI.getNumExplicitDefs() + 1;
3289 
3290   const int NumOps = MI.getNumOperands();
3291   SmallVector<const ValueMapping *, 8> OpdsMapping(NumOps);
3292 
3293   // TODO: Should packed/unpacked D16 difference be reported here as part of
3294   // the value mapping?
3295   for (int I = 0; I != NumOps; ++I) {
3296     if (!MI.getOperand(I).isReg())
3297       continue;
3298 
3299     Register OpReg = MI.getOperand(I).getReg();
3300     // We replace some dead address operands with $noreg
3301     if (!OpReg)
3302       continue;
3303 
3304     unsigned Size = getSizeInBits(OpReg, MRI, *TRI);
3305 
3306     // FIXME: Probably need a new intrinsic register bank searchable table to
3307     // handle arbitrary intrinsics easily.
3308     //
3309     // If this has a sampler, it immediately follows rsrc.
3310     const bool MustBeSGPR = I == RsrcIdx || I == RsrcIdx + 1;
3311 
3312     if (MustBeSGPR) {
      // If this must be an SGPR, report whatever bank it currently has as legal.
3314       unsigned NewBank = getRegBankID(OpReg, MRI, AMDGPU::SGPRRegBankID);
3315       OpdsMapping[I] = AMDGPU::getValueMapping(NewBank, Size);
3316     } else {
3317       // Some operands must be VGPR, and these are easy to copy to.
3318       OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3319     }
3320   }
3321 
3322   return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping), NumOps);
3323 }
3324 
3325 /// Return the mapping for a pointer argument.
3326 const RegisterBankInfo::ValueMapping *
3327 AMDGPURegisterBankInfo::getValueMappingForPtr(const MachineRegisterInfo &MRI,
3328                                               Register PtrReg) const {
3329   LLT PtrTy = MRI.getType(PtrReg);
3330   unsigned Size = PtrTy.getSizeInBits();
3331   if (Subtarget.useFlatForGlobal() ||
3332       !AMDGPU::isFlatGlobalAddrSpace(PtrTy.getAddressSpace()))
3333     return AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3334 
3335   // If we're using MUBUF instructions for global memory, an SGPR base register
3336   // is possible. Otherwise this needs to be a VGPR.
3337   const RegisterBank *PtrBank = getRegBank(PtrReg, MRI, *TRI);
3338   return AMDGPU::getValueMapping(PtrBank->getID(), Size);
3339 }
3340 
3341 const RegisterBankInfo::InstructionMapping &
3342 AMDGPURegisterBankInfo::getInstrMappingForLoad(const MachineInstr &MI) const {
3343 
3344   const MachineFunction &MF = *MI.getParent()->getParent();
3345   const MachineRegisterInfo &MRI = MF.getRegInfo();
3346   SmallVector<const ValueMapping*, 2> OpdsMapping(2);
3347   unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
3348   Register PtrReg = MI.getOperand(1).getReg();
3349   LLT PtrTy = MRI.getType(PtrReg);
3350   unsigned AS = PtrTy.getAddressSpace();
3351   unsigned PtrSize = PtrTy.getSizeInBits();
3352 
3353   const ValueMapping *ValMapping;
3354   const ValueMapping *PtrMapping;
3355 
3356   const RegisterBank *PtrBank = getRegBank(PtrReg, MRI, *TRI);
3357 
3358   if (PtrBank == &AMDGPU::SGPRRegBank && AMDGPU::isFlatGlobalAddrSpace(AS)) {
3359     if (isScalarLoadLegal(MI)) {
3360       // We have a uniform instruction so we want to use an SMRD load
3361       ValMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
3362       PtrMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize);
3363     } else {
3364       ValMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3365 
3366       // If we're using MUBUF instructions for global memory, an SGPR base
3367       // register is possible. Otherwise this needs to be a VGPR.
3368       unsigned PtrBankID = Subtarget.useFlatForGlobal() ?
3369         AMDGPU::VGPRRegBankID : AMDGPU::SGPRRegBankID;
3370 
3371       PtrMapping = AMDGPU::getValueMapping(PtrBankID, PtrSize);
3372     }
3373   } else {
3374     ValMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3375     PtrMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize);
3376   }
3377 
3378   OpdsMapping[0] = ValMapping;
3379   OpdsMapping[1] = PtrMapping;
3380   const RegisterBankInfo::InstructionMapping &Mapping = getInstructionMapping(
3381       1, 1, getOperandsMapping(OpdsMapping), MI.getNumOperands());
3382   return Mapping;
3383 
3384   // FIXME: Do we want to add a mapping for FLAT load, or should we just
3385   // handle that during instruction selection?
3386 }
3387 
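// Return the ID of the bank already assigned to Reg, or Default if it has no
// bank yet.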
3388 unsigned
3389 AMDGPURegisterBankInfo::getRegBankID(Register Reg,
3390                                      const MachineRegisterInfo &MRI,
3391                                      unsigned Default) const {
3392   const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
3393   return Bank ? Bank->getID() : Default;
3394 }
3395 
3396 const RegisterBankInfo::ValueMapping *
3397 AMDGPURegisterBankInfo::getSGPROpMapping(Register Reg,
3398                                          const MachineRegisterInfo &MRI,
3399                                          const TargetRegisterInfo &TRI) const {
  // Lie and claim anything is legal, even though this needs to be an SGPR;
  // applyMapping will have to deal with it as a waterfall loop.
3402   unsigned Bank = getRegBankID(Reg, MRI, AMDGPU::SGPRRegBankID);
3403   unsigned Size = getSizeInBits(Reg, MRI, TRI);
3404   return AMDGPU::getValueMapping(Bank, Size);
3405 }
3406 
3407 const RegisterBankInfo::ValueMapping *
3408 AMDGPURegisterBankInfo::getVGPROpMapping(Register Reg,
3409                                          const MachineRegisterInfo &MRI,
3410                                          const TargetRegisterInfo &TRI) const {
3411   unsigned Size = getSizeInBits(Reg, MRI, TRI);
3412   return AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3413 }
3414 
3415 const RegisterBankInfo::ValueMapping *
3416 AMDGPURegisterBankInfo::getAGPROpMapping(Register Reg,
3417                                          const MachineRegisterInfo &MRI,
3418                                          const TargetRegisterInfo &TRI) const {
3419   unsigned Size = getSizeInBits(Reg, MRI, TRI);
3420   return AMDGPU::getValueMapping(AMDGPU::AGPRRegBankID, Size);
3421 }
3422 
3423 ///
3424 /// This function must return a legal mapping, because
3425 /// AMDGPURegisterBankInfo::getInstrAlternativeMappings() is not called
/// in RegBankSelect::Mode::Fast. Any mapping that would cause a copy from
/// VGPR to SGPR to be generated is illegal.
3428 ///
3429 // Operands that must be SGPRs must accept potentially divergent VGPRs as
3430 // legal. These will be dealt with in applyMappingImpl.
3431 //
3432 const RegisterBankInfo::InstructionMapping &
3433 AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
3434   const MachineFunction &MF = *MI.getParent()->getParent();
3435   const MachineRegisterInfo &MRI = MF.getRegInfo();
3436 
3437   if (MI.isCopy() || MI.getOpcode() == AMDGPU::G_FREEZE) {
    // The default logic wastes time analyzing impossible alternative mappings.
    // We want the most straightforward mapping, so just handle this directly.
3440     const RegisterBank *DstBank = getRegBank(MI.getOperand(0).getReg(), MRI,
3441                                              *TRI);
3442     const RegisterBank *SrcBank = getRegBank(MI.getOperand(1).getReg(), MRI,
3443                                              *TRI);
3444     assert(SrcBank && "src bank should have been assigned already");
3445     if (!DstBank)
3446       DstBank = SrcBank;
3447 
3448     unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
3449     if (cannotCopy(*DstBank, *SrcBank, Size))
3450       return getInvalidInstructionMapping();
3451 
3452     const ValueMapping &ValMap = getValueMapping(0, Size, *DstBank);
3453     unsigned OpdsMappingSize = MI.isCopy() ? 1 : 2;
3454     SmallVector<const ValueMapping *, 1> OpdsMapping(OpdsMappingSize);
3455     OpdsMapping[0] = &ValMap;
3456     if (MI.getOpcode() == AMDGPU::G_FREEZE)
3457       OpdsMapping[1] = &ValMap;
3458 
3459     return getInstructionMapping(
3460         1, /*Cost*/ 1,
3461         /*OperandsMapping*/ getOperandsMapping(OpdsMapping), OpdsMappingSize);
3462   }
3463 
3464   if (MI.isRegSequence()) {
3465     // If any input is a VGPR, the result must be a VGPR. The default handling
3466     // assumes any copy between banks is legal.
3467     unsigned BankID = AMDGPU::SGPRRegBankID;
3468 
3469     for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
3470       auto OpBank = getRegBankID(MI.getOperand(I).getReg(), MRI);
3471       // It doesn't make sense to use vcc or scc banks here, so just ignore
3472       // them.
3473       if (OpBank != AMDGPU::SGPRRegBankID) {
3474         BankID = AMDGPU::VGPRRegBankID;
3475         break;
3476       }
3477     }
3478     unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
3479 
3480     const ValueMapping &ValMap = getValueMapping(0, Size, getRegBank(BankID));
3481     return getInstructionMapping(
3482         1, /*Cost*/ 1,
3483         /*OperandsMapping*/ getOperandsMapping({&ValMap}), 1);
3484   }
3485 
3486   // The default handling is broken and doesn't handle illegal SGPR->VGPR copies
3487   // properly.
3488   //
3489   // TODO: There are additional exec masking dependencies to analyze.
3490   if (MI.getOpcode() == TargetOpcode::G_PHI) {
3491     unsigned ResultBank = AMDGPU::InvalidRegBankID;
3492     Register DstReg = MI.getOperand(0).getReg();
3493 
3494     // Sometimes the result may have already been assigned a bank.
3495     if (const RegisterBank *DstBank = getRegBank(DstReg, MRI, *TRI))
3496       ResultBank = DstBank->getID();
3497 
3498     for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
3499       Register Reg = MI.getOperand(I).getReg();
3500       const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
3501 
3502       // FIXME: Assuming VGPR for any undetermined inputs.
3503       if (!Bank || Bank->getID() == AMDGPU::VGPRRegBankID) {
3504         ResultBank = AMDGPU::VGPRRegBankID;
3505         break;
3506       }
3507 
3508       // FIXME: Need to promote SGPR case to s32
3509       unsigned OpBank = Bank->getID();
3510       ResultBank = regBankBoolUnion(ResultBank, OpBank);
3511     }
3512 
3513     assert(ResultBank != AMDGPU::InvalidRegBankID);
3514 
3515     unsigned Size = MRI.getType(DstReg).getSizeInBits();
3516 
3517     const ValueMapping &ValMap =
3518         getValueMapping(0, Size, getRegBank(ResultBank));
3519     return getInstructionMapping(
3520         1, /*Cost*/ 1,
3521         /*OperandsMapping*/ getOperandsMapping({&ValMap}), 1);
3522   }
3523 
3524   const RegisterBankInfo::InstructionMapping &Mapping = getInstrMappingImpl(MI);
3525   if (Mapping.isValid())
3526     return Mapping;
3527 
3528   SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
3529 
3530   switch (MI.getOpcode()) {
3531   default:
3532     return getInvalidInstructionMapping();
3533 
3534   case AMDGPU::G_AND:
3535   case AMDGPU::G_OR:
3536   case AMDGPU::G_XOR: {
3537     unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3538     if (Size == 1) {
3539       const RegisterBank *DstBank
3540         = getRegBank(MI.getOperand(0).getReg(), MRI, *TRI);
3541 
3542       unsigned TargetBankID = AMDGPU::InvalidRegBankID;
3543       unsigned BankLHS = AMDGPU::InvalidRegBankID;
3544       unsigned BankRHS = AMDGPU::InvalidRegBankID;
3545       if (DstBank) {
3546         TargetBankID = DstBank->getID();
3547         if (DstBank == &AMDGPU::VCCRegBank) {
3548           TargetBankID = AMDGPU::VCCRegBankID;
3549           BankLHS = AMDGPU::VCCRegBankID;
3550           BankRHS = AMDGPU::VCCRegBankID;
3551         } else {
3552           BankLHS = getRegBankID(MI.getOperand(1).getReg(), MRI,
3553                                  AMDGPU::SGPRRegBankID);
3554           BankRHS = getRegBankID(MI.getOperand(2).getReg(), MRI,
3555                                  AMDGPU::SGPRRegBankID);
3556         }
3557       } else {
3558         BankLHS = getRegBankID(MI.getOperand(1).getReg(), MRI,
3559                                AMDGPU::VCCRegBankID);
3560         BankRHS = getRegBankID(MI.getOperand(2).getReg(), MRI,
3561                                AMDGPU::VCCRegBankID);
3562 
3563         // Both inputs should be true booleans to produce a boolean result.
3564         if (BankLHS == AMDGPU::VGPRRegBankID || BankRHS == AMDGPU::VGPRRegBankID) {
3565           TargetBankID = AMDGPU::VGPRRegBankID;
3566         } else if (BankLHS == AMDGPU::VCCRegBankID || BankRHS == AMDGPU::VCCRegBankID) {
3567           TargetBankID = AMDGPU::VCCRegBankID;
3568           BankLHS = AMDGPU::VCCRegBankID;
3569           BankRHS = AMDGPU::VCCRegBankID;
3570         } else if (BankLHS == AMDGPU::SGPRRegBankID && BankRHS == AMDGPU::SGPRRegBankID) {
3571           TargetBankID = AMDGPU::SGPRRegBankID;
3572         }
3573       }
3574 
3575       OpdsMapping[0] = AMDGPU::getValueMapping(TargetBankID, Size);
3576       OpdsMapping[1] = AMDGPU::getValueMapping(BankLHS, Size);
3577       OpdsMapping[2] = AMDGPU::getValueMapping(BankRHS, Size);
3578       break;
3579     }
3580 
    if (Size == 64) {
3583       if (isSALUMapping(MI)) {
3584         OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::SGPRRegBankID, Size);
3585         OpdsMapping[1] = OpdsMapping[2] = OpdsMapping[0];
3586       } else {
3587         OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size);
3588         unsigned Bank1 = getRegBankID(MI.getOperand(1).getReg(), MRI /*, DefaultBankID*/);
3589         OpdsMapping[1] = AMDGPU::getValueMapping(Bank1, Size);
3590 
3591         unsigned Bank2 = getRegBankID(MI.getOperand(2).getReg(), MRI /*, DefaultBankID*/);
3592         OpdsMapping[2] = AMDGPU::getValueMapping(Bank2, Size);
3593       }
3594 
3595       break;
3596     }
3597 
3598     LLVM_FALLTHROUGH;
3599   }
3600   case AMDGPU::G_PTR_ADD:
3601   case AMDGPU::G_PTRMASK:
3602   case AMDGPU::G_ADD:
3603   case AMDGPU::G_SUB:
3604   case AMDGPU::G_MUL:
3605   case AMDGPU::G_SHL:
3606   case AMDGPU::G_LSHR:
3607   case AMDGPU::G_ASHR:
3608   case AMDGPU::G_UADDO:
3609   case AMDGPU::G_USUBO:
3610   case AMDGPU::G_UADDE:
3611   case AMDGPU::G_SADDE:
3612   case AMDGPU::G_USUBE:
3613   case AMDGPU::G_SSUBE:
3614   case AMDGPU::G_SMIN:
3615   case AMDGPU::G_SMAX:
3616   case AMDGPU::G_UMIN:
3617   case AMDGPU::G_UMAX:
3618   case AMDGPU::G_ABS:
3619   case AMDGPU::G_SHUFFLE_VECTOR:
3620   case AMDGPU::G_SBFX:
3621   case AMDGPU::G_UBFX:
3622     if (isSALUMapping(MI))
3623       return getDefaultMappingSOP(MI);
3624     LLVM_FALLTHROUGH;
3625 
3626   case AMDGPU::G_SADDSAT: // FIXME: Could lower sat ops for SALU
3627   case AMDGPU::G_SSUBSAT:
3628   case AMDGPU::G_UADDSAT:
3629   case AMDGPU::G_USUBSAT:
3630   case AMDGPU::G_FADD:
3631   case AMDGPU::G_FSUB:
3632   case AMDGPU::G_FPTOSI:
3633   case AMDGPU::G_FPTOUI:
3634   case AMDGPU::G_FMUL:
3635   case AMDGPU::G_FMA:
3636   case AMDGPU::G_FMAD:
3637   case AMDGPU::G_FSQRT:
3638   case AMDGPU::G_FFLOOR:
3639   case AMDGPU::G_FCEIL:
3640   case AMDGPU::G_FRINT:
3641   case AMDGPU::G_SITOFP:
3642   case AMDGPU::G_UITOFP:
3643   case AMDGPU::G_FPTRUNC:
3644   case AMDGPU::G_FPEXT:
3645   case AMDGPU::G_FEXP2:
3646   case AMDGPU::G_FLOG2:
3647   case AMDGPU::G_FMINNUM:
3648   case AMDGPU::G_FMAXNUM:
3649   case AMDGPU::G_FMINNUM_IEEE:
3650   case AMDGPU::G_FMAXNUM_IEEE:
3651   case AMDGPU::G_FCANONICALIZE:
3652   case AMDGPU::G_INTRINSIC_TRUNC:
3653   case AMDGPU::G_BSWAP: // TODO: Somehow expand for scalar?
3654   case AMDGPU::G_FSHR: // TODO: Expand for scalar
3655   case AMDGPU::G_AMDGPU_FMIN_LEGACY:
3656   case AMDGPU::G_AMDGPU_FMAX_LEGACY:
3657   case AMDGPU::G_AMDGPU_RCP_IFLAG:
3658   case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0:
3659   case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1:
3660   case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2:
3661   case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
3662   case AMDGPU::G_AMDGPU_CVT_PK_I16_I32:
3663   case AMDGPU::G_AMDGPU_SMED3:
3664     return getDefaultMappingVOP(MI);
3665   case AMDGPU::G_UMULH:
3666   case AMDGPU::G_SMULH: {
3667     if (Subtarget.hasScalarMulHiInsts() && isSALUMapping(MI))
3668       return getDefaultMappingSOP(MI);
3669     return getDefaultMappingVOP(MI);
3670   }
3671   case AMDGPU::G_IMPLICIT_DEF: {
3672     unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3673     OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
3674     break;
3675   }
3676   case AMDGPU::G_FCONSTANT:
3677   case AMDGPU::G_CONSTANT:
3678   case AMDGPU::G_GLOBAL_VALUE:
3679   case AMDGPU::G_BLOCK_ADDR:
3680   case AMDGPU::G_READCYCLECOUNTER: {
3681     unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3682     OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
3683     break;
3684   }
3685   case AMDGPU::G_FRAME_INDEX: {
3686     // TODO: This should be the same as other constants, but eliminateFrameIndex
3687     // currently assumes VALU uses.
3688     unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3689     OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3690     break;
3691   }
3692   case AMDGPU::G_DYN_STACKALLOC: {
3693     // Result is always uniform, and a wave reduction is needed for the source.
3694     OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
3695     unsigned SrcBankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
3696     OpdsMapping[1] = AMDGPU::getValueMapping(SrcBankID, 32);
3697     break;
3698   }
3699   case AMDGPU::G_AMDGPU_WAVE_ADDRESS: {
3700     // This case is weird because we expect a physical register in the source,
3701     // but need to set a bank anyway.
3702     //
3703     // We could select the result to SGPR or VGPR, but for the one current use
3704     // it's more practical to always use VGPR.
3705     OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
3706     OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
3707     break;
3708   }
3709   case AMDGPU::G_INSERT: {
3710     unsigned BankID = getMappingType(MRI, MI);
3711     unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
3712     unsigned SrcSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
3713     unsigned EltSize = getSizeInBits(MI.getOperand(2).getReg(), MRI, *TRI);
3714     OpdsMapping[0] = AMDGPU::getValueMapping(BankID, DstSize);
3715     OpdsMapping[1] = AMDGPU::getValueMapping(BankID, SrcSize);
3716     OpdsMapping[2] = AMDGPU::getValueMapping(BankID, EltSize);
3717     OpdsMapping[3] = nullptr;
3718     break;
3719   }
3720   case AMDGPU::G_EXTRACT: {
3721     unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
3722     unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
3723     unsigned SrcSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
3724     OpdsMapping[0] = AMDGPU::getValueMapping(BankID, DstSize);
3725     OpdsMapping[1] = AMDGPU::getValueMapping(BankID, SrcSize);
3726     OpdsMapping[2] = nullptr;
3727     break;
3728   }
3729   case AMDGPU::G_BUILD_VECTOR:
3730   case AMDGPU::G_BUILD_VECTOR_TRUNC: {
3731     LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
3732     if (DstTy == LLT::fixed_vector(2, 16)) {
3733       unsigned DstSize = DstTy.getSizeInBits();
3734       unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
3735       unsigned Src0BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
3736       unsigned Src1BankID = getRegBankID(MI.getOperand(2).getReg(), MRI);
3737       unsigned DstBankID = regBankUnion(Src0BankID, Src1BankID);
3738 
3739       OpdsMapping[0] = AMDGPU::getValueMapping(DstBankID, DstSize);
3740       OpdsMapping[1] = AMDGPU::getValueMapping(Src0BankID, SrcSize);
3741       OpdsMapping[2] = AMDGPU::getValueMapping(Src1BankID, SrcSize);
3742       break;
3743     }
3744 
3745     LLVM_FALLTHROUGH;
3746   }
3747   case AMDGPU::G_MERGE_VALUES:
3748   case AMDGPU::G_CONCAT_VECTORS: {
3749     unsigned Bank = getMappingType(MRI, MI);
3750     unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3751     unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
3752 
3753     OpdsMapping[0] = AMDGPU::getValueMapping(Bank, DstSize);
3754     // Op1 and Dst should use the same register bank.
3755     for (unsigned i = 1, e = MI.getNumOperands(); i != e; ++i)
3756       OpdsMapping[i] = AMDGPU::getValueMapping(Bank, SrcSize);
3757     break;
3758   }
3759   case AMDGPU::G_BITREVERSE:
3760   case AMDGPU::G_BITCAST:
3761   case AMDGPU::G_INTTOPTR:
3762   case AMDGPU::G_PTRTOINT:
3763   case AMDGPU::G_FABS:
3764   case AMDGPU::G_FNEG: {
3765     unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3766     unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
3767     OpdsMapping[0] = OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size);
3768     break;
3769   }
3770   case AMDGPU::G_AMDGPU_FFBH_U32:
3771   case AMDGPU::G_AMDGPU_FFBL_B32:
3772   case AMDGPU::G_CTLZ_ZERO_UNDEF:
3773   case AMDGPU::G_CTTZ_ZERO_UNDEF: {
3774     unsigned Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
3775     unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
3776     OpdsMapping[0] = AMDGPU::getValueMapping(BankID, 32);
3777     OpdsMapping[1] = AMDGPU::getValueMappingSGPR64Only(BankID, Size);
3778     break;
3779   }
3780   case AMDGPU::G_CTPOP: {
3781     unsigned Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
3782     unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
3783     OpdsMapping[0] = AMDGPU::getValueMapping(BankID, 32);
3784 
3785     // This should really be getValueMappingSGPR64Only, but allowing the generic
3786     // code to handle the register split just makes using LegalizerHelper more
3787     // difficult.
3788     OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size);
3789     break;
3790   }
3791   case AMDGPU::G_TRUNC: {
3792     Register Dst = MI.getOperand(0).getReg();
3793     Register Src = MI.getOperand(1).getReg();
3794     unsigned Bank = getRegBankID(Src, MRI);
3795     unsigned DstSize = getSizeInBits(Dst, MRI, *TRI);
3796     unsigned SrcSize = getSizeInBits(Src, MRI, *TRI);
3797     OpdsMapping[0] = AMDGPU::getValueMapping(Bank, DstSize);
3798     OpdsMapping[1] = AMDGPU::getValueMapping(Bank, SrcSize);
3799     break;
3800   }
3801   case AMDGPU::G_ZEXT:
3802   case AMDGPU::G_SEXT:
3803   case AMDGPU::G_ANYEXT:
3804   case AMDGPU::G_SEXT_INREG: {
3805     Register Dst = MI.getOperand(0).getReg();
3806     Register Src = MI.getOperand(1).getReg();
3807     unsigned DstSize = getSizeInBits(Dst, MRI, *TRI);
3808     unsigned SrcSize = getSizeInBits(Src, MRI, *TRI);
3809 
3810     unsigned DstBank;
3811     const RegisterBank *SrcBank = getRegBank(Src, MRI, *TRI);
3812     assert(SrcBank);
3813     switch (SrcBank->getID()) {
3814     case AMDGPU::SGPRRegBankID:
3815       DstBank = AMDGPU::SGPRRegBankID;
3816       break;
3817     default:
3818       DstBank = AMDGPU::VGPRRegBankID;
3819       break;
3820     }
3821 
3822     // Scalar extend can use 64-bit BFE, but VGPRs require extending to
3823     // 32-bits, and then to 64.
3824     OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(DstBank, DstSize);
3825     OpdsMapping[1] = AMDGPU::getValueMappingSGPR64Only(SrcBank->getID(),
3826                                                        SrcSize);
3827     break;
3828   }
3829   case AMDGPU::G_FCMP: {
3830     unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
3831     unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI);
3832     OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
3833     OpdsMapping[1] = nullptr; // Predicate Operand.
3834     OpdsMapping[2] = AMDGPU::getValueMapping(Op2Bank, Size);
3835     OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3836     break;
3837   }
3838   case AMDGPU::G_STORE: {
3839     assert(MI.getOperand(0).isReg());
3840     unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3841 
3842     // FIXME: We need to specify a different reg bank once scalar stores are
3843     // supported.
3844     const ValueMapping *ValMapping =
3845         AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3846     OpdsMapping[0] = ValMapping;
3847     OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg());
3848     break;
3849   }
3850   case AMDGPU::G_ICMP: {
3851     auto Pred = static_cast<CmpInst::Predicate>(MI.getOperand(1).getPredicate());
3852     unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
3853 
3854     // See if the result register has already been constrained to vcc, which may
3855     // happen due to control flow intrinsic lowering.
3856     unsigned DstBank = getRegBankID(MI.getOperand(0).getReg(), MRI,
3857                                     AMDGPU::SGPRRegBankID);
3858     unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI);
3859     unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI);
3860 
3861     bool CanUseSCC = DstBank == AMDGPU::SGPRRegBankID &&
3862                      Op2Bank == AMDGPU::SGPRRegBankID &&
3863                      Op3Bank == AMDGPU::SGPRRegBankID &&
3864       (Size == 32 || (Size == 64 &&
3865                       (Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE) &&
3866                       Subtarget.hasScalarCompareEq64()));
3867 
3868     DstBank = CanUseSCC ? AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID;
3869     unsigned SrcBank = CanUseSCC ? AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
3870 
3871     // TODO: Use 32-bit for scalar output size.
3872     // SCC results will need to be copied to a 32-bit SGPR virtual register.
3873     const unsigned ResultSize = 1;
3874 
3875     OpdsMapping[0] = AMDGPU::getValueMapping(DstBank, ResultSize);
3876     OpdsMapping[2] = AMDGPU::getValueMapping(SrcBank, Size);
3877     OpdsMapping[3] = AMDGPU::getValueMapping(SrcBank, Size);
3878     break;
3879   }
3880   case AMDGPU::G_EXTRACT_VECTOR_ELT: {
3881     // VGPR index can be used for waterfall when indexing a SGPR vector.
3882     unsigned SrcBankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
3883     unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3884     unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
3885     unsigned IdxSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
3886     unsigned IdxBank = getRegBankID(MI.getOperand(2).getReg(), MRI);
3887     unsigned OutputBankID = regBankUnion(SrcBankID, IdxBank);
3888 
3889     OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(OutputBankID, DstSize);
3890     OpdsMapping[1] = AMDGPU::getValueMapping(SrcBankID, SrcSize);
3891 
    // The index can be in either bank if the source vector is a VGPR.
3893     OpdsMapping[2] = AMDGPU::getValueMapping(IdxBank, IdxSize);
3894     break;
3895   }
3896   case AMDGPU::G_INSERT_VECTOR_ELT: {
3897     unsigned OutputBankID = isSALUMapping(MI) ?
3898       AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
3899 
3900     unsigned VecSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3901     unsigned InsertSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
3902     unsigned IdxSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits();
3903     unsigned InsertEltBankID = getRegBankID(MI.getOperand(2).getReg(), MRI);
3904     unsigned IdxBankID = getRegBankID(MI.getOperand(3).getReg(), MRI);
3905 
3906     OpdsMapping[0] = AMDGPU::getValueMapping(OutputBankID, VecSize);
3907     OpdsMapping[1] = AMDGPU::getValueMapping(OutputBankID, VecSize);
3908 
3909     // This is a weird case, because we need to break down the mapping based on
3910     // the register bank of a different operand.
3911     if (InsertSize == 64 && OutputBankID == AMDGPU::VGPRRegBankID) {
3912       OpdsMapping[2] = AMDGPU::getValueMappingSplit64(InsertEltBankID,
3913                                                       InsertSize);
3914     } else {
3915       assert(InsertSize == 32 || InsertSize == 64);
3916       OpdsMapping[2] = AMDGPU::getValueMapping(InsertEltBankID, InsertSize);
3917     }
3918 
    // The index can be in either bank if the source vector is a VGPR.
3920     OpdsMapping[3] = AMDGPU::getValueMapping(IdxBankID, IdxSize);
3921     break;
3922   }
3923   case AMDGPU::G_UNMERGE_VALUES: {
3924     unsigned Bank = getMappingType(MRI, MI);
3925 
3926     // Op1 and Dst should use the same register bank.
3927     // FIXME: Shouldn't this be the default? Why do we need to handle this?
3928     for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
3929       unsigned Size = getSizeInBits(MI.getOperand(i).getReg(), MRI, *TRI);
3930       OpdsMapping[i] = AMDGPU::getValueMapping(Bank, Size);
3931     }
3932     break;
3933   }
3934   case AMDGPU::G_AMDGPU_BUFFER_LOAD:
3935   case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
3936   case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE:
3937   case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
3938   case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT:
3939   case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT:
3940   case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16:
3941   case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT:
3942   case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16:
3943   case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT:
3944   case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16:
3945   case AMDGPU::G_AMDGPU_BUFFER_STORE:
3946   case AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE:
3947   case AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT:
3948   case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT:
3949   case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16: {
3950     OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
3951 
3952     // rsrc
3953     OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
3954 
3955     // vindex
3956     OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
3957 
3958     // voffset
3959     OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
3960 
3961     // soffset
3962     OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
3963 
3964     // Any remaining operands are immediates and were correctly null
3965     // initialized.
3966     break;
3967   }
3968   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP:
3969   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD:
3970   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB:
3971   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN:
3972   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN:
3973   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX:
3974   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX:
3975   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND:
3976   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR:
3977   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR:
3978   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC:
3979   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC:
3980   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD:
3981   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN:
3982   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX: {
3983     // vdata_out
3984     OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
3985 
3986     // vdata_in
3987     OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
3988 
3989     // rsrc
3990     OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
3991 
3992     // vindex
3993     OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
3994 
3995     // voffset
3996     OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
3997 
3998     // soffset
3999     OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
4000 
4001     // Any remaining operands are immediates and were correctly null
4002     // initialized.
4003     break;
4004   }
4005   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP: {
4006     // vdata_out
4007     OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4008 
4009     // vdata_in
4010     OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
4011 
4012     // cmp
4013     OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4014 
4015     // rsrc
4016     OpdsMapping[3] = getSGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4017 
4018     // vindex
4019     OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4020 
4021     // voffset
4022     OpdsMapping[5] = getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
4023 
4024     // soffset
4025     OpdsMapping[6] = getSGPROpMapping(MI.getOperand(6).getReg(), MRI, *TRI);
4026 
4027     // Any remaining operands are immediates and were correctly null
4028     // initialized.
4029     break;
4030   }
4031   case AMDGPU::G_AMDGPU_S_BUFFER_LOAD: {
4032     // Lie and claim everything is legal, even though some need to be
4033     // SGPRs. applyMapping will have to deal with it as a waterfall loop.
4034     OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
4035     OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4036 
    // We need to convert this to a MUBUF if either the resource or the offset
    // is a VGPR.
4039     unsigned RSrcBank = OpdsMapping[1]->BreakDown[0].RegBank->getID();
4040     unsigned OffsetBank = OpdsMapping[2]->BreakDown[0].RegBank->getID();
4041     unsigned ResultBank = regBankUnion(RSrcBank, OffsetBank);
4042 
4043     unsigned Size0 = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4044     OpdsMapping[0] = AMDGPU::getValueMapping(ResultBank, Size0);
4045     break;
4046   }
4047   case AMDGPU::G_INTRINSIC: {
4048     switch (MI.getIntrinsicID()) {
4049     default:
4050       return getInvalidInstructionMapping();
4051     case Intrinsic::amdgcn_div_fmas:
4052     case Intrinsic::amdgcn_div_fixup:
4053     case Intrinsic::amdgcn_trig_preop:
4054     case Intrinsic::amdgcn_sin:
4055     case Intrinsic::amdgcn_cos:
4056     case Intrinsic::amdgcn_log_clamp:
4057     case Intrinsic::amdgcn_rcp:
4058     case Intrinsic::amdgcn_rcp_legacy:
4059     case Intrinsic::amdgcn_sqrt:
4060     case Intrinsic::amdgcn_rsq:
4061     case Intrinsic::amdgcn_rsq_legacy:
4062     case Intrinsic::amdgcn_rsq_clamp:
4063     case Intrinsic::amdgcn_fmul_legacy:
4064     case Intrinsic::amdgcn_fma_legacy:
4065     case Intrinsic::amdgcn_ldexp:
4066     case Intrinsic::amdgcn_frexp_mant:
4067     case Intrinsic::amdgcn_frexp_exp:
4068     case Intrinsic::amdgcn_fract:
4069     case Intrinsic::amdgcn_cvt_pkrtz:
4070     case Intrinsic::amdgcn_cvt_pknorm_i16:
4071     case Intrinsic::amdgcn_cvt_pknorm_u16:
4072     case Intrinsic::amdgcn_cvt_pk_i16:
4073     case Intrinsic::amdgcn_cvt_pk_u16:
4074     case Intrinsic::amdgcn_fmed3:
4075     case Intrinsic::amdgcn_cubeid:
4076     case Intrinsic::amdgcn_cubema:
4077     case Intrinsic::amdgcn_cubesc:
4078     case Intrinsic::amdgcn_cubetc:
4079     case Intrinsic::amdgcn_sffbh:
4080     case Intrinsic::amdgcn_fmad_ftz:
4081     case Intrinsic::amdgcn_mbcnt_lo:
4082     case Intrinsic::amdgcn_mbcnt_hi:
4083     case Intrinsic::amdgcn_mul_u24:
4084     case Intrinsic::amdgcn_mul_i24:
4085     case Intrinsic::amdgcn_mulhi_u24:
4086     case Intrinsic::amdgcn_mulhi_i24:
4087     case Intrinsic::amdgcn_lerp:
4088     case Intrinsic::amdgcn_sad_u8:
4089     case Intrinsic::amdgcn_msad_u8:
4090     case Intrinsic::amdgcn_sad_hi_u8:
4091     case Intrinsic::amdgcn_sad_u16:
4092     case Intrinsic::amdgcn_qsad_pk_u16_u8:
4093     case Intrinsic::amdgcn_mqsad_pk_u16_u8:
4094     case Intrinsic::amdgcn_mqsad_u32_u8:
4095     case Intrinsic::amdgcn_cvt_pk_u8_f32:
4096     case Intrinsic::amdgcn_alignbyte:
4097     case Intrinsic::amdgcn_perm:
4098     case Intrinsic::amdgcn_fdot2:
4099     case Intrinsic::amdgcn_sdot2:
4100     case Intrinsic::amdgcn_udot2:
4101     case Intrinsic::amdgcn_sdot4:
4102     case Intrinsic::amdgcn_udot4:
4103     case Intrinsic::amdgcn_sdot8:
4104     case Intrinsic::amdgcn_udot8:
4105       return getDefaultMappingVOP(MI);
4106     case Intrinsic::amdgcn_sbfe:
4107     case Intrinsic::amdgcn_ubfe:
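      // Bitfield extract has both scalar and vector forms; keep it on the SALU
      // when all of its operands are already SGPRs.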
4108       if (isSALUMapping(MI))
4109         return getDefaultMappingSOP(MI);
4110       return getDefaultMappingVOP(MI);
4111     case Intrinsic::amdgcn_ds_swizzle:
4112     case Intrinsic::amdgcn_ds_permute:
4113     case Intrinsic::amdgcn_ds_bpermute:
4114     case Intrinsic::amdgcn_update_dpp:
4115     case Intrinsic::amdgcn_mov_dpp8:
4116     case Intrinsic::amdgcn_mov_dpp:
4117     case Intrinsic::amdgcn_strict_wwm:
4118     case Intrinsic::amdgcn_wwm:
4119     case Intrinsic::amdgcn_strict_wqm:
4120     case Intrinsic::amdgcn_wqm:
4121     case Intrinsic::amdgcn_softwqm:
4122     case Intrinsic::amdgcn_set_inactive:
4123       return getDefaultMappingAllVGPR(MI);
4124     case Intrinsic::amdgcn_kernarg_segment_ptr:
4125     case Intrinsic::amdgcn_s_getpc:
4126     case Intrinsic::amdgcn_groupstaticsize:
4127     case Intrinsic::amdgcn_reloc_constant:
4128     case Intrinsic::returnaddress: {
4129       unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4130       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
4131       break;
4132     }
4133     case Intrinsic::amdgcn_wqm_vote: {
4134       unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4135       OpdsMapping[0] = OpdsMapping[2]
4136         = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size);
4137       break;
4138     }
4139     case Intrinsic::amdgcn_ps_live: {
4140       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
4141       break;
4142     }
4143     case Intrinsic::amdgcn_div_scale: {
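      // The second result is a per-lane condition later consumed by div_fmas,
      // so it belongs in the VCC bank.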
4144       unsigned Dst0Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4145       unsigned Dst1Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
4146       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Dst0Size);
4147       OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Dst1Size);
4148 
4149       unsigned SrcSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits();
4150       OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
4151       OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
4152       break;
4153     }
4154     case Intrinsic::amdgcn_class: {
4155       Register Src0Reg = MI.getOperand(2).getReg();
4156       Register Src1Reg = MI.getOperand(3).getReg();
4157       unsigned Src0Size = MRI.getType(Src0Reg).getSizeInBits();
4158       unsigned Src1Size = MRI.getType(Src1Reg).getSizeInBits();
4159       unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4160       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, DstSize);
4161       OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Src0Size);
4162       OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Src1Size);
4163       break;
4164     }
4165     case Intrinsic::amdgcn_icmp:
4166     case Intrinsic::amdgcn_fcmp: {
4167       unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4168       // Not VCCRegBank because the result is not used in boolean contexts.
4169       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
4170       unsigned OpSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
4171       OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, OpSize);
4172       OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, OpSize);
4173       break;
4174     }
4175     case Intrinsic::amdgcn_readlane: {
4176       // This must be an SGPR, but accept a VGPR.
4177       Register IdxReg = MI.getOperand(3).getReg();
4178       unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits();
4179       unsigned IdxBank = getRegBankID(IdxReg, MRI, AMDGPU::SGPRRegBankID);
4180       OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize);
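      // Fall through: the result and the source value are mapped the same way
      // as for readfirstlane.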
4181       LLVM_FALLTHROUGH;
4182     }
4183     case Intrinsic::amdgcn_readfirstlane: {
4184       unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4185       unsigned SrcSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
4186       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
4187       OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
4188       break;
4189     }
4190     case Intrinsic::amdgcn_writelane: {
4191       unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4192       Register SrcReg = MI.getOperand(2).getReg();
4193       unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits();
4194       unsigned SrcBank = getRegBankID(SrcReg, MRI, AMDGPU::SGPRRegBankID);
4195       Register IdxReg = MI.getOperand(3).getReg();
4196       unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits();
4197       unsigned IdxBank = getRegBankID(IdxReg, MRI, AMDGPU::SGPRRegBankID);
4198       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
4199 
4200       // These two must be SGPRs, but accept VGPRs; a readfirstlane will be
4201       // inserted to legalize them.
4202       OpdsMapping[2] = AMDGPU::getValueMapping(SrcBank, SrcSize);
4203       OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize);
4204       OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
4205       break;
4206     }
4207     case Intrinsic::amdgcn_if_break: {
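      // The condition is a per-lane boolean (VCC bank); the incoming and
      // outgoing loop masks are wave-sized scalar values.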
4208       unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
4209       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
4210       OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
4211       OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
4212       break;
4213     }
4214     case Intrinsic::amdgcn_permlane16:
4215     case Intrinsic::amdgcn_permlanex16: {
4216       unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
4217       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4218       OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4219       OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4220       OpdsMapping[4] = getSGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4221       OpdsMapping[5] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4222       break;
4223     }
4224     case Intrinsic::amdgcn_mfma_f32_4x4x1f32:
4225     case Intrinsic::amdgcn_mfma_f32_4x4x4f16:
4226     case Intrinsic::amdgcn_mfma_i32_4x4x4i8:
4227     case Intrinsic::amdgcn_mfma_f32_4x4x2bf16:
4228     case Intrinsic::amdgcn_mfma_f32_16x16x1f32:
4229     case Intrinsic::amdgcn_mfma_f32_16x16x4f32:
4230     case Intrinsic::amdgcn_mfma_f32_16x16x4f16:
4231     case Intrinsic::amdgcn_mfma_f32_16x16x16f16:
4232     case Intrinsic::amdgcn_mfma_i32_16x16x4i8:
4233     case Intrinsic::amdgcn_mfma_i32_16x16x16i8:
4234     case Intrinsic::amdgcn_mfma_f32_16x16x2bf16:
4235     case Intrinsic::amdgcn_mfma_f32_16x16x8bf16:
4236     case Intrinsic::amdgcn_mfma_f32_32x32x1f32:
4237     case Intrinsic::amdgcn_mfma_f32_32x32x2f32:
4238     case Intrinsic::amdgcn_mfma_f32_32x32x4f16:
4239     case Intrinsic::amdgcn_mfma_f32_32x32x8f16:
4240     case Intrinsic::amdgcn_mfma_i32_32x32x4i8:
4241     case Intrinsic::amdgcn_mfma_i32_32x32x8i8:
4242     case Intrinsic::amdgcn_mfma_f32_32x32x2bf16:
4243     case Intrinsic::amdgcn_mfma_f32_32x32x4bf16:
4244     case Intrinsic::amdgcn_mfma_f32_32x32x4bf16_1k:
4245     case Intrinsic::amdgcn_mfma_f32_16x16x4bf16_1k:
4246     case Intrinsic::amdgcn_mfma_f32_4x4x4bf16_1k:
4247     case Intrinsic::amdgcn_mfma_f32_32x32x8bf16_1k:
4248     case Intrinsic::amdgcn_mfma_f32_16x16x16bf16_1k:
4249     case Intrinsic::amdgcn_mfma_f64_16x16x4f64:
4250     case Intrinsic::amdgcn_mfma_f64_4x4x4f64: {
4251       // Default for MAI intrinsics.
4252       // srcC can also be an immediate which can be folded later.
4253       // FIXME: Should we eventually add an alternative mapping with AGPR src
4254       // for srcA/srcB?
4255       //
4256       // vdst, srcA, srcB, srcC
4257       OpdsMapping[0] = getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4258       OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4259       OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4260       OpdsMapping[4] = getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4261       break;
4262     }
4263     case Intrinsic::amdgcn_interp_p1:
4264     case Intrinsic::amdgcn_interp_p2:
4265     case Intrinsic::amdgcn_interp_mov:
4266     case Intrinsic::amdgcn_interp_p1_f16:
4267     case Intrinsic::amdgcn_interp_p2_f16: {
4268       const int M0Idx = MI.getNumOperands() - 1;
4269       Register M0Reg = MI.getOperand(M0Idx).getReg();
4270       unsigned M0Bank = getRegBankID(M0Reg, MRI, AMDGPU::SGPRRegBankID);
4271       unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4272 
4273       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
4274       for (int I = 2; I != M0Idx && MI.getOperand(I).isReg(); ++I)
4275         OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4276 
4277       // M0 must be an SGPR, but take whatever the original bank is and fix it
4278       // up later.
4279       OpdsMapping[M0Idx] = AMDGPU::getValueMapping(M0Bank, 32);
4280       break;
4281     }
4282     case Intrinsic::amdgcn_ballot: {
4283       unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4284       unsigned SrcSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
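      // The result is a wave-wide lane mask held in scalar registers; the
      // source is a per-lane boolean and so uses the VCC bank.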
4285       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
4286       OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, SrcSize);
4287       break;
4288     }
4289     }
4290     break;
4291   }
4292   case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
4293   case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16:
4294   case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE:
4295   case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16: {
4296     auto IntrID = MI.getIntrinsicID();
4297     const AMDGPU::RsrcIntrinsic *RSrcIntrin = AMDGPU::lookupRsrcIntrinsic(IntrID);
4298     assert(RSrcIntrin && "missing RsrcIntrinsic for image intrinsic");
4299     // Non-images can have complications from operands that allow both SGPR
4300     // and VGPR. For now it's too complicated to figure out the final opcode
4301     // to derive the register bank from the MCInstrDesc.
4302     assert(RSrcIntrin->IsImage);
4303     return getImageMapping(MRI, MI, RSrcIntrin->RsrcArg);
4304   }
4305   case AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY: {
4306     unsigned N = MI.getNumExplicitOperands() - 2;
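    // Operand N is the resource descriptor, which wants an SGPR; the ray
    // operands are VGPRs, either packed into one wide register (sequential
    // form) or passed as individual dwords (NSA form).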
4307     OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 128);
4308     OpdsMapping[N] = getSGPROpMapping(MI.getOperand(N).getReg(), MRI, *TRI);
4309     if (N == 3) {
4310       // Sequential form: all operands combined into VGPR256/VGPR512
4311       unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
4312       if (Size > 256)
4313         Size = 512;
4314       OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4315     } else {
4316       // NSA form
4317       for (unsigned I = 2; I < N; ++I)
4318         OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4319     }
4320     break;
4321   }
4322   case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: {
4323     auto IntrID = MI.getIntrinsicID();
4324     switch (IntrID) {
4325     case Intrinsic::amdgcn_s_getreg:
4326     case Intrinsic::amdgcn_s_memtime:
4327     case Intrinsic::amdgcn_s_memrealtime:
4328     case Intrinsic::amdgcn_s_get_waveid_in_workgroup: {
4329       unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4330       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
4331       break;
4332     }
4333     case Intrinsic::amdgcn_global_atomic_fadd:
4334     case Intrinsic::amdgcn_global_atomic_csub:
4335     case Intrinsic::amdgcn_global_atomic_fmin:
4336     case Intrinsic::amdgcn_global_atomic_fmax:
4337     case Intrinsic::amdgcn_flat_atomic_fadd:
4338     case Intrinsic::amdgcn_flat_atomic_fmin:
4339     case Intrinsic::amdgcn_flat_atomic_fmax:
4340       return getDefaultMappingAllVGPR(MI);
4341     case Intrinsic::amdgcn_ds_ordered_add:
4342     case Intrinsic::amdgcn_ds_ordered_swap: {
4343       unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4344       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
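      // The address ends up in M0 and must ultimately be an SGPR; accept the
      // current bank here and legalize it later.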
4345       unsigned M0Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
4346                                  AMDGPU::SGPRRegBankID);
4347       OpdsMapping[2] = AMDGPU::getValueMapping(M0Bank, 32);
4348       OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4349       break;
4350     }
4351     case Intrinsic::amdgcn_ds_append:
4352     case Intrinsic::amdgcn_ds_consume: {
4353       unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4354       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
4355       OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4356       break;
4357     }
4358     case Intrinsic::amdgcn_exp_compr:
4359       OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4360       OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4361       break;
4362     case Intrinsic::amdgcn_exp:
4363       // FIXME: Could we support packed types here?
4364       OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4365       OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4366       OpdsMapping[5] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4367       OpdsMapping[6] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4368       break;
4369     case Intrinsic::amdgcn_s_sendmsg:
4370     case Intrinsic::amdgcn_s_sendmsghalt: {
4371       // This must be an SGPR, but accept a VGPR.
4372       unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
4373                                    AMDGPU::SGPRRegBankID);
4374       OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
4375       break;
4376     }
4377     case Intrinsic::amdgcn_s_setreg: {
4378       // This must be an SGPR, but accept a VGPR.
4379       unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
4380                                    AMDGPU::SGPRRegBankID);
4381       OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
4382       break;
4383     }
4384     case Intrinsic::amdgcn_end_cf: {
4385       unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
4386       OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
4387       break;
4388     }
4389     case Intrinsic::amdgcn_else: {
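      // Results are the per-lane execute condition (VCC bank) and the updated
      // wave-sized mask; the source is the mask saved by the matching amdgcn.if.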
4390       unsigned WaveSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
4391       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
4392       OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize);
4393       OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize);
4394       break;
4395     }
4396     case Intrinsic::amdgcn_live_mask: {
4397       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
4398       break;
4399     }
4400     case Intrinsic::amdgcn_wqm_demote:
4401     case Intrinsic::amdgcn_kill: {
4402       OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
4403       break;
4404     }
4405     case Intrinsic::amdgcn_raw_buffer_load:
4406     case Intrinsic::amdgcn_raw_tbuffer_load: {
4407       // FIXME: Should make the intrinsic ID the last operand of the instruction;
4408       // then this would be the same as the store case.
4409       OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4410       OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4411       OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4412       OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4413       break;
4414     }
4415     case Intrinsic::amdgcn_raw_buffer_store:
4416     case Intrinsic::amdgcn_raw_buffer_store_format:
4417     case Intrinsic::amdgcn_raw_tbuffer_store: {
4418       OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
4419       OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4420       OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4421       OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4422       break;
4423     }
4424     case Intrinsic::amdgcn_struct_buffer_load:
4425     case Intrinsic::amdgcn_struct_tbuffer_load: {
4426       OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4427       OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4428       OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4429       OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4430       OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
4431       break;
4432     }
4433     case Intrinsic::amdgcn_struct_buffer_store:
4434     case Intrinsic::amdgcn_struct_tbuffer_store: {
4435       OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
4436       OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4437       OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4438       OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4439       OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
4440       break;
4441     }
4442     case Intrinsic::amdgcn_init_exec_from_input: {
4443       unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
4444       OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
4445       break;
4446     }
4447     case Intrinsic::amdgcn_ds_gws_init:
4448     case Intrinsic::amdgcn_ds_gws_barrier:
4449     case Intrinsic::amdgcn_ds_gws_sema_br: {
4450       OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4451 
4452       // This must be an SGPR, but accept a VGPR.
4453       unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
4454                                    AMDGPU::SGPRRegBankID);
4455       OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
4456       break;
4457     }
4458     case Intrinsic::amdgcn_ds_gws_sema_v:
4459     case Intrinsic::amdgcn_ds_gws_sema_p:
4460     case Intrinsic::amdgcn_ds_gws_sema_release_all: {
4461       // This must be an SGPR, but accept a VGPR.
4462       unsigned Bank = getRegBankID(MI.getOperand(1).getReg(), MRI,
4463                                    AMDGPU::SGPRRegBankID);
4464       OpdsMapping[1] = AMDGPU::getValueMapping(Bank, 32);
4465       break;
4466     }
4467     default:
4468       return getInvalidInstructionMapping();
4469     }
4470     break;
4471   }
4472   case AMDGPU::G_SELECT: {
4473     unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4474     unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
4475                                     AMDGPU::SGPRRegBankID);
4476     unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI,
4477                                     AMDGPU::SGPRRegBankID);
4478     bool SGPRSrcs = Op2Bank == AMDGPU::SGPRRegBankID &&
4479                     Op3Bank == AMDGPU::SGPRRegBankID;
4480 
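    // If both value operands are uniform (SGPR) and the condition is scalar,
    // the whole select can stay on the SALU. Otherwise the condition must be a
    // lane mask (VCC) and the result operands must be VGPRs.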
4481     unsigned CondBankDefault = SGPRSrcs ?
4482       AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID;
4483     unsigned CondBank = getRegBankID(MI.getOperand(1).getReg(), MRI,
4484                                      CondBankDefault);
4485     if (CondBank == AMDGPU::SGPRRegBankID)
4486       CondBank = SGPRSrcs ? AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID;
4487     else if (CondBank == AMDGPU::VGPRRegBankID)
4488       CondBank = AMDGPU::VCCRegBankID;
4489 
4490     unsigned Bank = SGPRSrcs && CondBank == AMDGPU::SGPRRegBankID ?
4491       AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
4492 
4493     assert(CondBank == AMDGPU::VCCRegBankID || CondBank == AMDGPU::SGPRRegBankID);
4494 
4495     // TODO: Should report 32-bit for scalar condition type.
4496     if (Size == 64) {
4497       OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
4498       OpdsMapping[1] = AMDGPU::getValueMapping(CondBank, 1);
4499       OpdsMapping[2] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
4500       OpdsMapping[3] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
4501     } else {
4502       OpdsMapping[0] = AMDGPU::getValueMapping(Bank, Size);
4503       OpdsMapping[1] = AMDGPU::getValueMapping(CondBank, 1);
4504       OpdsMapping[2] = AMDGPU::getValueMapping(Bank, Size);
4505       OpdsMapping[3] = AMDGPU::getValueMapping(Bank, Size);
4506     }
4507 
4508     break;
4509   }
4510 
4511   case AMDGPU::G_SI_CALL: {
4512     OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 64);
4513     // Lie and claim everything is legal, even though some need to be
4514     // SGPRs. applyMapping will have to deal with it as a waterfall loop.
4515     OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
4516 
4517     // Allow anything for implicit arguments
4518     for (unsigned I = 4; I < MI.getNumOperands(); ++I) {
4519       if (MI.getOperand(I).isReg()) {
4520         Register Reg = MI.getOperand(I).getReg();
4521         auto OpBank = getRegBankID(Reg, MRI);
4522         unsigned Size = getSizeInBits(Reg, MRI, *TRI);
4523         OpdsMapping[I] = AMDGPU::getValueMapping(OpBank, Size);
4524       }
4525     }
4526     break;
4527   }
4528   case AMDGPU::G_LOAD:
4529   case AMDGPU::G_ZEXTLOAD:
4530   case AMDGPU::G_SEXTLOAD:
4531     return getInstrMappingForLoad(MI);
4532 
4533   case AMDGPU::G_ATOMICRMW_XCHG:
4534   case AMDGPU::G_ATOMICRMW_ADD:
4535   case AMDGPU::G_ATOMICRMW_SUB:
4536   case AMDGPU::G_ATOMICRMW_AND:
4537   case AMDGPU::G_ATOMICRMW_OR:
4538   case AMDGPU::G_ATOMICRMW_XOR:
4539   case AMDGPU::G_ATOMICRMW_MAX:
4540   case AMDGPU::G_ATOMICRMW_MIN:
4541   case AMDGPU::G_ATOMICRMW_UMAX:
4542   case AMDGPU::G_ATOMICRMW_UMIN:
4543   case AMDGPU::G_ATOMICRMW_FADD:
4544   case AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG:
4545   case AMDGPU::G_AMDGPU_ATOMIC_INC:
4546   case AMDGPU::G_AMDGPU_ATOMIC_DEC:
4547   case AMDGPU::G_AMDGPU_ATOMIC_FMIN:
4548   case AMDGPU::G_AMDGPU_ATOMIC_FMAX: {
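    // The result and data operands are always VGPRs; the pointer mapping
    // depends on the address space and on the bank the address already has.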
4549     OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4550     OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg());
4551     OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4552     break;
4553   }
4554   case AMDGPU::G_ATOMIC_CMPXCHG: {
4555     OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4556     OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg());
4557     OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4558     OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4559     break;
4560   }
4561   case AMDGPU::G_BRCOND: {
4562     unsigned Bank = getRegBankID(MI.getOperand(0).getReg(), MRI,
4563                                  AMDGPU::SGPRRegBankID);
4564     assert(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1);
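    // Only a condition already known to be uniform (SGPR) can use a scalar
    // branch; anything else is treated as a divergent lane mask in the VCC bank.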
4565     if (Bank != AMDGPU::SGPRRegBankID)
4566       Bank = AMDGPU::VCCRegBankID;
4567 
4568     OpdsMapping[0] = AMDGPU::getValueMapping(Bank, 1);
4569     break;
4570   }
4571   }
4572 
4573   return getInstructionMapping(/*ID*/1, /*Cost*/1,
4574                                getOperandsMapping(OpdsMapping),
4575                                MI.getNumOperands());
4576 }
4577