1 //===- AMDGPURegisterBankInfo.cpp -------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements the targeting of the RegisterBankInfo class for
10 /// AMDGPU.
11 ///
12 /// \par
13 ///
14 /// AMDGPU has unique register bank constraints that require special high level
/// strategies to deal with. There are two main true physical register banks:
/// VGPR (vector) and SGPR (scalar). Additionally, the VCC register bank is a
17 /// sort of pseudo-register bank needed to represent SGPRs used in a vector
18 /// boolean context. There is also the AGPR bank, which is a special purpose
19 /// physical register bank present on some subtargets.
20 ///
21 /// Copying from VGPR to SGPR is generally illegal, unless the value is known to
22 /// be uniform. It is generally not valid to legalize operands by inserting
23 /// copies as on other targets. Operations which require uniform, SGPR operands
24 /// generally require scalarization by repeatedly executing the instruction,
25 /// activating each set of lanes using a unique set of input values. This is
26 /// referred to as a waterfall loop.
27 ///
28 /// \par Booleans
29 ///
/// Booleans (s1 values) require special consideration. A vector compare result
31 /// is naturally a bitmask with one bit per lane, in a 32 or 64-bit
32 /// register. These are represented with the VCC bank. During selection, we need
33 /// to be able to unambiguously go back from a register class to a register
34 /// bank. To distinguish whether an SGPR should use the SGPR or VCC register
35 /// bank, we need to know the use context type. An SGPR s1 value always means a
/// VCC bank value; wider values use the SGPR bank. A scalar compare sets
37 /// SCC, which is a 1-bit unaddressable register. This will need to be copied to
38 /// a 32-bit virtual register. Taken together, this means we need to adjust the
39 /// type of boolean operations to be regbank legal. All SALU booleans need to be
40 /// widened to 32-bits, and all VALU booleans need to be s1 values.
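///
/// For example (illustrative MIR, not exact RegBankSelect output), a boolean
/// AND ends up in one of two regbank-legal forms:
///
///   ; uniform: carried as a 32-bit SGPR value, only bit 0 is meaningful
///   %r:sgpr(s32) = G_AND %a:sgpr(s32), %b:sgpr(s32)
///
///   ; divergent: a per-lane mask in the VCC bank
///   %r:vcc(s1) = G_AND %a:vcc(s1), %b:vcc(s1)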
41 ///
42 /// A noteworthy exception to the s1-means-vcc rule is for legalization artifact
43 /// casts. G_TRUNC s1 results, and G_SEXT/G_ZEXT/G_ANYEXT sources are never vcc
44 /// bank. A non-boolean source (such as a truncate from a 1-bit load from
45 /// memory) will require a copy to the VCC bank which will require clearing the
46 /// high bits and inserting a compare.
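///
/// A rough sketch of that case (illustrative, not exact MIR):
///
///   %val:sgpr(s32) = G_LOAD ...    ; arbitrary integer, only bit 0 is wanted
///   %t:sgpr(s1) = G_TRUNC %val     ; artifact cast, never assigned the VCC bank
///   ; Using %t as a vector condition requires materializing a real lane mask,
///   ; conceptually: %mask:vcc(s1) = icmp ne (and %val, 1), 0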
47 ///
48 /// \par Constant bus restriction
49 ///
50 /// VALU instructions have a limitation known as the constant bus
51 /// restriction. Most VALU instructions can use SGPR operands, but may read at
/// most 1 SGPR or constant literal value (this is raised to 2 in gfx10 for most
53 /// instructions). This is one unique SGPR, so the same SGPR may be used for
54 /// multiple operands. From a register bank perspective, any combination of
55 /// operands should be legal as an SGPR, but this is contextually dependent on
/// the SGPR operands all being the same register. It is therefore optimal to
57 /// choose the SGPR with the most uses to minimize the number of copies.
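///
/// For example, with a single constant bus read per VALU instruction:
///
///   v_fma_f32 v0, s0, s0, v1   ; legal, only one unique SGPR is read
///   v_fma_f32 v0, s0, s1, v1   ; illegal before gfx10, two unique SGPRs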
58 ///
59 /// We avoid trying to solve this problem in RegBankSelect. Any VALU G_*
60 /// operation should have its source operands all mapped to VGPRs (except for
/// VCC), inserting copies from any SGPR operands. This is the most trivial legal
62 /// mapping. Anything beyond the simplest 1:1 instruction selection would be too
/// complicated to solve here. Every optimization pattern or instruction that
/// selects to multiple output instructions would have to enforce this rule, and
/// there would be additional complexity in tracking this rule for every G_*
66 /// operation. By forcing all inputs to VGPRs, it also simplifies the task of
67 /// picking the optimal operand combination from a post-isel optimization pass.
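///
/// A minimal sketch of the resulting mapping (register names illustrative):
///
///   %s:sgpr(s32) = ...
///   %v:vgpr(s32) = ...
///   %scopy:vgpr(s32) = COPY %s       ; inserted by RegBankSelect
///   %r:vgpr(s32) = G_ADD %scopy, %v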
68 ///
69 //===----------------------------------------------------------------------===//
70 
71 #include "AMDGPURegisterBankInfo.h"
72 
73 #include "AMDGPU.h"
74 #include "AMDGPUGlobalISelUtils.h"
75 #include "AMDGPUInstrInfo.h"
76 #include "GCNSubtarget.h"
77 #include "SIMachineFunctionInfo.h"
78 #include "SIRegisterInfo.h"
79 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
80 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
81 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
82 #include "llvm/CodeGen/GlobalISel/RegisterBank.h"
83 #include "llvm/IR/IntrinsicsAMDGPU.h"
84 
85 #define GET_TARGET_REGBANK_IMPL
86 #include "AMDGPUGenRegisterBank.inc"
87 
88 // This file will be TableGen'ed at some point.
89 #include "AMDGPUGenRegisterBankInfo.def"
90 
91 using namespace llvm;
92 using namespace MIPatternMatch;
93 
94 namespace {
95 
96 // Observer to apply a register bank to new registers created by LegalizerHelper.
97 class ApplyRegBankMapping final : public GISelChangeObserver {
98 private:
99   const AMDGPURegisterBankInfo &RBI;
100   MachineRegisterInfo &MRI;
101   const RegisterBank *NewBank;
102   SmallVector<MachineInstr *, 4> NewInsts;
103 
104 public:
105   ApplyRegBankMapping(const AMDGPURegisterBankInfo &RBI_,
106                       MachineRegisterInfo &MRI_, const RegisterBank *RB)
107     : RBI(RBI_), MRI(MRI_), NewBank(RB) {}
108 
109   ~ApplyRegBankMapping() {
110     for (MachineInstr *MI : NewInsts)
111       applyBank(*MI);
112   }
113 
  /// Set any registers that don't have a register class or bank yet to the
  /// mapping's register bank.
115   void applyBank(MachineInstr &MI) {
116     const unsigned Opc = MI.getOpcode();
117     if (Opc == AMDGPU::G_ANYEXT || Opc == AMDGPU::G_ZEXT ||
118         Opc == AMDGPU::G_SEXT) {
119       // LegalizerHelper wants to use the basic legalization artifacts when
120       // widening etc. We don't handle selection with vcc in artifact sources,
      // so we need to use a select instead to handle these properly.
122       Register DstReg = MI.getOperand(0).getReg();
123       Register SrcReg = MI.getOperand(1).getReg();
124       const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, MRI, *RBI.TRI);
125       if (SrcBank == &AMDGPU::VCCRegBank) {
126         const LLT S32 = LLT::scalar(32);
127         assert(MRI.getType(SrcReg) == LLT::scalar(1));
128         assert(MRI.getType(DstReg) == S32);
129         assert(NewBank == &AMDGPU::VGPRRegBank);
130 
131         // Replace the extension with a select, which really uses the boolean
132         // source.
133         MachineIRBuilder B(MI);
134         auto True = B.buildConstant(S32, Opc == AMDGPU::G_SEXT ? -1 : 1);
135         auto False = B.buildConstant(S32, 0);
136         B.buildSelect(DstReg, SrcReg, True, False);
137         MRI.setRegBank(True.getReg(0), *NewBank);
138         MRI.setRegBank(False.getReg(0), *NewBank);
139         MI.eraseFromParent();
140       }
141 
142       assert(!MRI.getRegClassOrRegBank(DstReg));
143       MRI.setRegBank(DstReg, *NewBank);
144       return;
145     }
146 
147 #ifndef NDEBUG
148     if (Opc == AMDGPU::G_TRUNC) {
149       Register DstReg = MI.getOperand(0).getReg();
150       const RegisterBank *DstBank = RBI.getRegBank(DstReg, MRI, *RBI.TRI);
151       assert(DstBank != &AMDGPU::VCCRegBank);
152     }
153 #endif
154 
155     for (MachineOperand &Op : MI.operands()) {
156       if (!Op.isReg())
157         continue;
158 
159       // We may see physical registers if building a real MI
160       Register Reg = Op.getReg();
161       if (Reg.isPhysical() || MRI.getRegClassOrRegBank(Reg))
162         continue;
163 
164       const RegisterBank *RB = NewBank;
165       if (MRI.getType(Reg) == LLT::scalar(1)) {
166         assert(NewBank == &AMDGPU::VGPRRegBank &&
167                "s1 operands should only be used for vector bools");
168         assert((MI.getOpcode() != AMDGPU::G_TRUNC &&
169                 MI.getOpcode() != AMDGPU::G_ANYEXT) &&
170                "not expecting legalization artifacts here");
171         RB = &AMDGPU::VCCRegBank;
172       }
173 
174       MRI.setRegBank(Reg, *RB);
175     }
176   }
177 
178   void erasingInstr(MachineInstr &MI) override {}
179 
180   void createdInstr(MachineInstr &MI) override {
181     // At this point, the instruction was just inserted and has no operands.
182     NewInsts.push_back(&MI);
183   }
184 
185   void changingInstr(MachineInstr &MI) override {}
186   void changedInstr(MachineInstr &MI) override {
187     // FIXME: In principle we should probably add the instruction to NewInsts,
188     // but the way the LegalizerHelper uses the observer, we will always see the
189     // registers we need to set the regbank on also referenced in a new
190     // instruction.
191   }
192 };
193 
} // anonymous namespace

195 AMDGPURegisterBankInfo::AMDGPURegisterBankInfo(const GCNSubtarget &ST)
196     : AMDGPUGenRegisterBankInfo(),
197       Subtarget(ST),
198       TRI(Subtarget.getRegisterInfo()),
199       TII(Subtarget.getInstrInfo()) {
200 
201   // HACK: Until this is fully tablegen'd.
202   static llvm::once_flag InitializeRegisterBankFlag;
203 
204   static auto InitializeRegisterBankOnce = [this]() {
205     assert(&getRegBank(AMDGPU::SGPRRegBankID) == &AMDGPU::SGPRRegBank &&
206            &getRegBank(AMDGPU::VGPRRegBankID) == &AMDGPU::VGPRRegBank &&
207            &getRegBank(AMDGPU::AGPRRegBankID) == &AMDGPU::AGPRRegBank);
208     (void)this;
209   };
210 
211   llvm::call_once(InitializeRegisterBankFlag, InitializeRegisterBankOnce);
212 }
213 
214 static bool isVectorRegisterBank(const RegisterBank &Bank) {
215   unsigned BankID = Bank.getID();
216   return BankID == AMDGPU::VGPRRegBankID || BankID == AMDGPU::AGPRRegBankID;
217 }
218 
219 unsigned AMDGPURegisterBankInfo::copyCost(const RegisterBank &Dst,
220                                           const RegisterBank &Src,
221                                           unsigned Size) const {
222   // TODO: Should there be a UniformVGPRRegBank which can use readfirstlane?
223   if (Dst.getID() == AMDGPU::SGPRRegBankID &&
224       (isVectorRegisterBank(Src) || Src.getID() == AMDGPU::VCCRegBankID)) {
225     return std::numeric_limits<unsigned>::max();
226   }
227 
228   // Bool values are tricky, because the meaning is based on context. The SCC
229   // and VCC banks are for the natural scalar and vector conditions produced by
230   // a compare.
231   //
232   // Legalization doesn't know about the necessary context, so an s1 use may
233   // have been a truncate from an arbitrary value, in which case a copy (lowered
234   // as a compare with 0) needs to be inserted.
235   if (Size == 1 &&
236       (Dst.getID() == AMDGPU::SGPRRegBankID) &&
237       (isVectorRegisterBank(Src) ||
238        Src.getID() == AMDGPU::SGPRRegBankID ||
239        Src.getID() == AMDGPU::VCCRegBankID))
240     return std::numeric_limits<unsigned>::max();
241 
242   // There is no direct copy between AGPRs.
243   if (Dst.getID() == AMDGPU::AGPRRegBankID &&
244       Src.getID() == AMDGPU::AGPRRegBankID)
245     return 4;
246 
247   return RegisterBankInfo::copyCost(Dst, Src, Size);
248 }
249 
250 unsigned AMDGPURegisterBankInfo::getBreakDownCost(
251   const ValueMapping &ValMapping,
252   const RegisterBank *CurBank) const {
253   // Check if this is a breakdown for G_LOAD to move the pointer from SGPR to
254   // VGPR.
255   // FIXME: Is there a better way to do this?
  if (ValMapping.NumBreakDowns > 2 || ValMapping.BreakDown[0].Length >= 64)
257     return 10; // This is expensive.
258 
259   assert(ValMapping.NumBreakDowns == 2 &&
260          ValMapping.BreakDown[0].Length == 32 &&
261          ValMapping.BreakDown[0].StartIdx == 0 &&
262          ValMapping.BreakDown[1].Length == 32 &&
263          ValMapping.BreakDown[1].StartIdx == 32 &&
264          ValMapping.BreakDown[0].RegBank == ValMapping.BreakDown[1].RegBank);
265 
266   // 32-bit extract of a 64-bit value is just access of a subregister, so free.
267   // TODO: Cost of 0 hits assert, though it's not clear it's what we really
268   // want.
269 
270   // TODO: 32-bit insert to a 64-bit SGPR may incur a non-free copy due to SGPR
271   // alignment restrictions, but this probably isn't important.
272   return 1;
273 }
274 
275 const RegisterBank &
276 AMDGPURegisterBankInfo::getRegBankFromRegClass(const TargetRegisterClass &RC,
277                                                LLT Ty) const {
278   if (&RC == &AMDGPU::SReg_1RegClass)
279     return AMDGPU::VCCRegBank;
280 
281   // We promote real scalar booleans to SReg_32. Any SGPR using s1 is really a
282   // VCC-like use.
283   if (TRI->isSGPRClass(&RC)) {
284     // FIXME: This probably came from a copy from a physical register, which
    // should be inferable from the copied-to type. We don't have many boolean
286     // physical register constraints so just assume a normal SGPR for now.
287     if (!Ty.isValid())
288       return AMDGPU::SGPRRegBank;
289 
290     return Ty == LLT::scalar(1) ? AMDGPU::VCCRegBank : AMDGPU::SGPRRegBank;
291   }
292 
293   return TRI->isAGPRClass(&RC) ? AMDGPU::AGPRRegBank : AMDGPU::VGPRRegBank;
294 }
295 
296 template <unsigned NumOps>
297 RegisterBankInfo::InstructionMappings
298 AMDGPURegisterBankInfo::addMappingFromTable(
299     const MachineInstr &MI, const MachineRegisterInfo &MRI,
300     const std::array<unsigned, NumOps> RegSrcOpIdx,
301     ArrayRef<OpRegBankEntry<NumOps>> Table) const {
302 
303   InstructionMappings AltMappings;
304 
305   SmallVector<const ValueMapping *, 10> Operands(MI.getNumOperands());
306 
307   unsigned Sizes[NumOps];
308   for (unsigned I = 0; I < NumOps; ++I) {
309     Register Reg = MI.getOperand(RegSrcOpIdx[I]).getReg();
310     Sizes[I] = getSizeInBits(Reg, MRI, *TRI);
311   }
312 
313   for (unsigned I = 0, E = MI.getNumExplicitDefs(); I != E; ++I) {
314     unsigned SizeI = getSizeInBits(MI.getOperand(I).getReg(), MRI, *TRI);
315     Operands[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SizeI);
316   }
317 
318   // getInstrMapping's default mapping uses ID 1, so start at 2.
319   unsigned MappingID = 2;
320   for (const auto &Entry : Table) {
321     for (unsigned I = 0; I < NumOps; ++I) {
322       int OpIdx = RegSrcOpIdx[I];
323       Operands[OpIdx] = AMDGPU::getValueMapping(Entry.RegBanks[I], Sizes[I]);
324     }
325 
326     AltMappings.push_back(&getInstructionMapping(MappingID++, Entry.Cost,
327                                                  getOperandsMapping(Operands),
328                                                  Operands.size()));
329   }
330 
331   return AltMappings;
332 }
333 
334 RegisterBankInfo::InstructionMappings
335 AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsic(
336     const MachineInstr &MI, const MachineRegisterInfo &MRI) const {
337   switch (MI.getIntrinsicID()) {
338   case Intrinsic::amdgcn_readlane: {
339     static const OpRegBankEntry<3> Table[2] = {
340       // Perfectly legal.
341       { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },
342 
343       // Need a readfirstlane for the index.
344       { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 }
345     };
346 
347     const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 2, 3 } };
348     return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
349   }
350   case Intrinsic::amdgcn_writelane: {
351     static const OpRegBankEntry<4> Table[4] = {
352       // Perfectly legal.
353       { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },
354 
355       // Need readfirstlane of first op
356       { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 },
357 
358       // Need readfirstlane of second op
359       { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 },
360 
361       // Need readfirstlane of both ops
362       { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 3 }
363     };
364 
    // dst, value, lane, old vdst
366     const std::array<unsigned, 4> RegSrcOpIdx = { { 0, 2, 3, 4 } };
367     return addMappingFromTable<4>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
368   }
369   default:
370     return RegisterBankInfo::getInstrAlternativeMappings(MI);
371   }
372 }
373 
374 RegisterBankInfo::InstructionMappings
375 AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsicWSideEffects(
376     const MachineInstr &MI, const MachineRegisterInfo &MRI) const {
377 
378   switch (MI.getIntrinsicID()) {
379   case Intrinsic::amdgcn_s_buffer_load: {
380     static const OpRegBankEntry<2> Table[4] = {
381       // Perfectly legal.
382       { { AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },
383 
384       // Only need 1 register in loop
385       { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 300 },
386 
387       // Have to waterfall the resource.
388       { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1000 },
389 
390       // Have to waterfall the resource, and the offset.
391       { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1500 }
392     };
393 
394     // rsrc, offset
395     const std::array<unsigned, 2> RegSrcOpIdx = { { 2, 3 } };
396     return addMappingFromTable<2>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
397   }
398   case Intrinsic::amdgcn_ds_ordered_add:
399   case Intrinsic::amdgcn_ds_ordered_swap: {
400     // VGPR = M0, VGPR
401     static const OpRegBankEntry<3> Table[2] = {
402       // Perfectly legal.
403       { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID  }, 1 },
404 
405       // Need a readfirstlane for m0
406       { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 }
407     };
408 
409     const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 2, 3 } };
410     return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
411   }
412   case Intrinsic::amdgcn_s_sendmsg:
413   case Intrinsic::amdgcn_s_sendmsghalt: {
414     // FIXME: Should have no register for immediate
415     static const OpRegBankEntry<1> Table[2] = {
416       // Perfectly legal.
417       { { AMDGPU::SGPRRegBankID }, 1 },
418 
419       // Need readlane
420       { { AMDGPU::VGPRRegBankID }, 3 }
421     };
422 
423     const std::array<unsigned, 1> RegSrcOpIdx = { { 2 } };
424     return addMappingFromTable<1>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
425   }
426   default:
427     return RegisterBankInfo::getInstrAlternativeMappings(MI);
428   }
429 }
430 
431 static bool memOpHasNoClobbered(const MachineMemOperand *MMO) {
432   const Instruction *I = dyn_cast_or_null<Instruction>(MMO->getValue());
433   return I && I->getMetadata("amdgpu.noclobber");
434 }
435 
436 // FIXME: Returns uniform if there's no source value information. This is
437 // probably wrong.
438 static bool isScalarLoadLegal(const MachineInstr &MI) {
439   if (!MI.hasOneMemOperand())
440     return false;
441 
442   const MachineMemOperand *MMO = *MI.memoperands_begin();
443   const unsigned AS = MMO->getAddrSpace();
444   const bool IsConst = AS == AMDGPUAS::CONSTANT_ADDRESS ||
445                        AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT;
446 
447   // There are no extending SMRD/SMEM loads, and they require 4-byte alignment.
448   return MMO->getSize() >= 4 && MMO->getAlign() >= Align(4) &&
449          // Can't do a scalar atomic load.
450          !MMO->isAtomic() &&
451          // Don't use scalar loads for volatile accesses to non-constant address
452          // spaces.
453          (IsConst || !MMO->isVolatile()) &&
454          // Memory must be known constant, or not written before this load.
455          (IsConst || MMO->isInvariant() || memOpHasNoClobbered(MMO)) &&
456          AMDGPUInstrInfo::isUniformMMO(MMO);
457 }
458 
459 RegisterBankInfo::InstructionMappings
460 AMDGPURegisterBankInfo::getInstrAlternativeMappings(
461     const MachineInstr &MI) const {
462 
463   const MachineFunction &MF = *MI.getParent()->getParent();
464   const MachineRegisterInfo &MRI = MF.getRegInfo();
465 
466 
467   InstructionMappings AltMappings;
468   switch (MI.getOpcode()) {
469   case TargetOpcode::G_CONSTANT: {
470     unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
471     if (Size == 1) {
472       static const OpRegBankEntry<1> Table[3] = {
473         { { AMDGPU::VGPRRegBankID }, 1 },
474         { { AMDGPU::SGPRRegBankID }, 1 },
475         { { AMDGPU::VCCRegBankID }, 1 }
476       };
477 
478       return addMappingFromTable<1>(MI, MRI, {{ 0 }}, Table);
479     }
480 
481     LLVM_FALLTHROUGH;
482   }
483   case TargetOpcode::G_FCONSTANT:
484   case TargetOpcode::G_FRAME_INDEX:
485   case TargetOpcode::G_GLOBAL_VALUE: {
486     static const OpRegBankEntry<1> Table[2] = {
487       { { AMDGPU::VGPRRegBankID }, 1 },
488       { { AMDGPU::SGPRRegBankID }, 1 }
489     };
490 
491     return addMappingFromTable<1>(MI, MRI, {{ 0 }}, Table);
492   }
493   case TargetOpcode::G_AND:
494   case TargetOpcode::G_OR:
495   case TargetOpcode::G_XOR: {
496     unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
497 
498     if (Size == 1) {
499       // s_{and|or|xor}_b32 set scc when the result of the 32-bit op is not 0.
500       const InstructionMapping &SCCMapping = getInstructionMapping(
501         1, 1, getOperandsMapping(
502           {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32),
503            AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32),
504            AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32)}),
505         3); // Num Operands
506       AltMappings.push_back(&SCCMapping);
507 
508       const InstructionMapping &VCCMapping0 = getInstructionMapping(
509         2, 1, getOperandsMapping(
510           {AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size),
511            AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size),
512            AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size)}),
513         3); // Num Operands
514       AltMappings.push_back(&VCCMapping0);
515       return AltMappings;
516     }
517 
518     if (Size != 64)
519       break;
520 
521     const InstructionMapping &SSMapping = getInstructionMapping(
522       1, 1, getOperandsMapping(
523         {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
524          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
525          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
526       3); // Num Operands
527     AltMappings.push_back(&SSMapping);
528 
529     const InstructionMapping &VVMapping = getInstructionMapping(
530       2, 2, getOperandsMapping(
531         {AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
532          AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
533          AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}),
534       3); // Num Operands
535     AltMappings.push_back(&VVMapping);
536     break;
537   }
538   case TargetOpcode::G_LOAD:
539   case TargetOpcode::G_ZEXTLOAD:
540   case TargetOpcode::G_SEXTLOAD: {
541     unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
542     LLT PtrTy = MRI.getType(MI.getOperand(1).getReg());
543     unsigned PtrSize = PtrTy.getSizeInBits();
544     unsigned AS = PtrTy.getAddressSpace();
545 
546     if ((AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::REGION_ADDRESS &&
547          AS != AMDGPUAS::PRIVATE_ADDRESS) &&
548         isScalarLoadLegal(MI)) {
549       const InstructionMapping &SSMapping = getInstructionMapping(
550           1, 1, getOperandsMapping(
551                     {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
552                      AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize)}),
553           2); // Num Operands
554       AltMappings.push_back(&SSMapping);
555     }
556 
557     const InstructionMapping &VVMapping = getInstructionMapping(
558         2, 1,
559         getOperandsMapping(
560             {AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
561              AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize)}),
562         2); // Num Operands
563     AltMappings.push_back(&VVMapping);
564 
565     // It may be possible to have a vgpr = load sgpr mapping here, because
566     // the mubuf instructions support this kind of load, but probably for only
567     // gfx7 and older.  However, the addressing mode matching in the instruction
568     // selector should be able to do a better job of detecting and selecting
569     // these kinds of loads from the vgpr = load vgpr mapping.
570 
571     return AltMappings;
572 
573   }
574   case TargetOpcode::G_SELECT: {
575     unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
576     const InstructionMapping &SSMapping = getInstructionMapping(1, 1,
577       getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
578                           AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1),
579                           AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
580                           AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
581       4); // Num Operands
582     AltMappings.push_back(&SSMapping);
583 
584     const InstructionMapping &VVMapping = getInstructionMapping(2, 1,
585       getOperandsMapping({AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
586                           AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
587                           AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
588                           AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}),
589       4); // Num Operands
590     AltMappings.push_back(&VVMapping);
591 
592     return AltMappings;
593   }
594   case TargetOpcode::G_SMIN:
595   case TargetOpcode::G_SMAX:
596   case TargetOpcode::G_UMIN:
597   case TargetOpcode::G_UMAX: {
598     static const OpRegBankEntry<3> Table[2] = {
599       { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },
600 
601       // Scalar requires cmp+select, and extends if 16-bit.
602       // FIXME: Should there be separate costs for 32 and 16-bit
603       { { AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID }, 3 }
604     };
605 
606     const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 1, 2 } };
607     return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
608   }
609   case TargetOpcode::G_UADDE:
610   case TargetOpcode::G_USUBE:
611   case TargetOpcode::G_SADDE:
612   case TargetOpcode::G_SSUBE: {
613     unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
614     const InstructionMapping &SSMapping = getInstructionMapping(1, 1,
615       getOperandsMapping(
616         {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
617          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1),
618          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
619          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
620          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1)}),
621       5); // Num Operands
622     AltMappings.push_back(&SSMapping);
623 
624     const InstructionMapping &VVMapping = getInstructionMapping(2, 1,
625       getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
626                           AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
627                           AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
628                           AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
629                           AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1)}),
630       5); // Num Operands
631     AltMappings.push_back(&VVMapping);
632     return AltMappings;
633   }
634   case AMDGPU::G_BRCOND: {
635     assert(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1);
636 
637     // TODO: Change type to 32 for scalar
638     const InstructionMapping &SMapping = getInstructionMapping(
639       1, 1, getOperandsMapping(
640         {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1), nullptr}),
641       2); // Num Operands
642     AltMappings.push_back(&SMapping);
643 
644     const InstructionMapping &VMapping = getInstructionMapping(
645       1, 1, getOperandsMapping(
646         {AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1), nullptr }),
647       2); // Num Operands
648     AltMappings.push_back(&VMapping);
649     return AltMappings;
650   }
651   case AMDGPU::G_INTRINSIC:
652     return getInstrAlternativeMappingsIntrinsic(MI, MRI);
653   case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS:
654     return getInstrAlternativeMappingsIntrinsicWSideEffects(MI, MRI);
655   default:
656     break;
657   }
658   return RegisterBankInfo::getInstrAlternativeMappings(MI);
659 }
660 
661 void AMDGPURegisterBankInfo::split64BitValueForMapping(
662   MachineIRBuilder &B,
663   SmallVector<Register, 2> &Regs,
664   LLT HalfTy,
665   Register Reg) const {
666   assert(HalfTy.getSizeInBits() == 32);
667   MachineRegisterInfo *MRI = B.getMRI();
668   Register LoLHS = MRI->createGenericVirtualRegister(HalfTy);
669   Register HiLHS = MRI->createGenericVirtualRegister(HalfTy);
670   const RegisterBank *Bank = getRegBank(Reg, *MRI, *TRI);
671   MRI->setRegBank(LoLHS, *Bank);
672   MRI->setRegBank(HiLHS, *Bank);
673 
674   Regs.push_back(LoLHS);
675   Regs.push_back(HiLHS);
676 
677   B.buildInstr(AMDGPU::G_UNMERGE_VALUES)
678     .addDef(LoLHS)
679     .addDef(HiLHS)
680     .addUse(Reg);
681 }
682 
683 /// Replace the current type each register in \p Regs has with \p NewTy
684 static void setRegsToType(MachineRegisterInfo &MRI, ArrayRef<Register> Regs,
685                           LLT NewTy) {
686   for (Register Reg : Regs) {
687     assert(MRI.getType(Reg).getSizeInBits() == NewTy.getSizeInBits());
688     MRI.setType(Reg, NewTy);
689   }
690 }
691 
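/// Halve \p Ty: half the element count for vectors (e.g. v4s32 -> v2s32),
/// otherwise half the scalar width (e.g. s64 -> s32).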
692 static LLT getHalfSizedType(LLT Ty) {
693   if (Ty.isVector()) {
694     assert(Ty.getNumElements() % 2 == 0);
695     return LLT::scalarOrVector(Ty.getNumElements() / 2, Ty.getElementType());
696   }
697 
698   assert(Ty.getSizeInBits() % 2 == 0);
699   return LLT::scalar(Ty.getSizeInBits() / 2);
700 }
701 
702 /// Legalize instruction \p MI where operands in \p OpIndices must be SGPRs. If
703 /// any of the required SGPR operands are VGPRs, perform a waterfall loop to
704 /// execute the instruction for each unique combination of values in all lanes
705 /// in the wave. The block will be split such that rest of the instructions are
706 /// moved to a new block.
707 ///
708 /// Essentially performs this loop:
///
710 /// Save Execution Mask
711 /// For (Lane : Wavefront) {
712 ///   Enable Lane, Disable all other lanes
713 ///   SGPR = read SGPR value for current lane from VGPR
714 ///   VGPRResult[Lane] = use_op SGPR
715 /// }
716 /// Restore Execution Mask
717 ///
/// There is additional complexity from comparing the values across lanes so
/// that each unique value is only processed once.
720 bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
721   MachineIRBuilder &B,
722   iterator_range<MachineBasicBlock::iterator> Range,
723   SmallSet<Register, 4> &SGPROperandRegs,
724   MachineRegisterInfo &MRI) const {
725   SmallVector<Register, 4> ResultRegs;
726   SmallVector<Register, 4> InitResultRegs;
727   SmallVector<Register, 4> PhiRegs;
728 
729   // Track use registers which have already been expanded with a readfirstlane
730   // sequence. This may have multiple uses if moving a sequence.
731   DenseMap<Register, Register> WaterfalledRegMap;
732 
733   MachineBasicBlock &MBB = B.getMBB();
734   MachineFunction *MF = &B.getMF();
735 
736   const TargetRegisterClass *WaveRC = TRI->getWaveMaskRegClass();
737   const unsigned WaveAndOpc = Subtarget.isWave32() ?
738     AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
739   const unsigned MovTermOpc = Subtarget.isWave32() ?
740     AMDGPU::S_MOV_B32_term : AMDGPU::S_MOV_B64_term;
741   const unsigned XorTermOpc = Subtarget.isWave32() ?
742     AMDGPU::S_XOR_B32_term : AMDGPU::S_XOR_B64_term;
743   const unsigned AndSaveExecOpc =  Subtarget.isWave32() ?
744     AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64;
745   const unsigned ExecReg =  Subtarget.isWave32() ?
746     AMDGPU::EXEC_LO : AMDGPU::EXEC;
747 
748 #ifndef NDEBUG
749   const int OrigRangeSize = std::distance(Range.begin(), Range.end());
750 #endif
751 
752   for (MachineInstr &MI : Range) {
753     for (MachineOperand &Def : MI.defs()) {
754       if (MRI.use_nodbg_empty(Def.getReg()))
755         continue;
756 
757       LLT ResTy = MRI.getType(Def.getReg());
758       const RegisterBank *DefBank = getRegBank(Def.getReg(), MRI, *TRI);
759       ResultRegs.push_back(Def.getReg());
760       Register InitReg = B.buildUndef(ResTy).getReg(0);
761       Register PhiReg = MRI.createGenericVirtualRegister(ResTy);
762       InitResultRegs.push_back(InitReg);
763       PhiRegs.push_back(PhiReg);
764       MRI.setRegBank(PhiReg, *DefBank);
765       MRI.setRegBank(InitReg, *DefBank);
766     }
767   }
768 
769   Register SaveExecReg = MRI.createVirtualRegister(WaveRC);
770   Register InitSaveExecReg = MRI.createVirtualRegister(WaveRC);
771 
772   // Don't bother using generic instructions/registers for the exec mask.
773   B.buildInstr(TargetOpcode::IMPLICIT_DEF)
774     .addDef(InitSaveExecReg);
775 
776   Register PhiExec = MRI.createVirtualRegister(WaveRC);
777   Register NewExec = MRI.createVirtualRegister(WaveRC);
778 
779   // To insert the loop we need to split the block. Move everything before this
780   // point to a new block, and insert a new empty block before this instruction.
781   MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
782   MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
783   MachineBasicBlock *RestoreExecBB = MF->CreateMachineBasicBlock();
784   MachineFunction::iterator MBBI(MBB);
785   ++MBBI;
786   MF->insert(MBBI, LoopBB);
787   MF->insert(MBBI, RestoreExecBB);
788   MF->insert(MBBI, RemainderBB);
789 
790   LoopBB->addSuccessor(RestoreExecBB);
791   LoopBB->addSuccessor(LoopBB);
792 
793   // Move the rest of the block into a new block.
794   RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
795   RemainderBB->splice(RemainderBB->begin(), &MBB, Range.end(), MBB.end());
796 
797   MBB.addSuccessor(LoopBB);
798   RestoreExecBB->addSuccessor(RemainderBB);
799 
800   B.setInsertPt(*LoopBB, LoopBB->end());
801 
802   B.buildInstr(TargetOpcode::PHI)
803     .addDef(PhiExec)
804     .addReg(InitSaveExecReg)
805     .addMBB(&MBB)
806     .addReg(NewExec)
807     .addMBB(LoopBB);
808 
809   for (auto Result : zip(InitResultRegs, ResultRegs, PhiRegs)) {
810     B.buildInstr(TargetOpcode::G_PHI)
811       .addDef(std::get<2>(Result))
812       .addReg(std::get<0>(Result)) // Initial value / implicit_def
813       .addMBB(&MBB)
814       .addReg(std::get<1>(Result)) // Mid-loop value.
815       .addMBB(LoopBB);
816   }
817 
818   const DebugLoc &DL = B.getDL();
819 
820   MachineInstr &FirstInst = *Range.begin();
821 
822   // Move the instruction into the loop. Note we moved everything after
823   // Range.end() already into a new block, so Range.end() is no longer valid.
824   LoopBB->splice(LoopBB->end(), &MBB, Range.begin(), MBB.end());
825 
826   // Figure out the iterator range after splicing the instructions.
827   MachineBasicBlock::iterator NewBegin = FirstInst.getIterator();
828   auto NewEnd = LoopBB->end();
829 
830   MachineBasicBlock::iterator I = Range.begin();
831   B.setInsertPt(*LoopBB, I);
832 
833   Register CondReg;
834 
835   assert(std::distance(NewBegin, NewEnd) == OrigRangeSize);
836 
837   for (MachineInstr &MI : make_range(NewBegin, NewEnd)) {
838     for (MachineOperand &Op : MI.uses()) {
839       if (!Op.isReg() || Op.isDef())
840         continue;
841 
842       Register OldReg = Op.getReg();
843       if (!SGPROperandRegs.count(OldReg))
844         continue;
845 
846       // See if we already processed this register in another instruction in the
847       // sequence.
848       auto OldVal = WaterfalledRegMap.find(OldReg);
849       if (OldVal != WaterfalledRegMap.end()) {
850         Op.setReg(OldVal->second);
851         continue;
852       }
853 
854       Register OpReg = Op.getReg();
855       LLT OpTy = MRI.getType(OpReg);
856 
857       const RegisterBank *OpBank = getRegBank(OpReg, MRI, *TRI);
858       if (OpBank != &AMDGPU::VGPRRegBank) {
859         // Insert copy from AGPR to VGPR before the loop.
860         B.setMBB(MBB);
861         OpReg = B.buildCopy(OpTy, OpReg).getReg(0);
862         MRI.setRegBank(OpReg, AMDGPU::VGPRRegBank);
863         B.setInstr(*I);
864       }
865 
866       unsigned OpSize = OpTy.getSizeInBits();
867 
868       // Can only do a readlane of 32-bit pieces.
869       if (OpSize == 32) {
870         // Avoid extra copies in the simple case of one 32-bit register.
871         Register CurrentLaneOpReg
872           = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
873         MRI.setType(CurrentLaneOpReg, OpTy);
874 
875         constrainGenericRegister(OpReg, AMDGPU::VGPR_32RegClass, MRI);
876         // Read the next variant <- also loop target.
877         BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
878                 CurrentLaneOpReg)
879           .addReg(OpReg);
880 
881         Register NewCondReg = MRI.createVirtualRegister(WaveRC);
882         bool First = CondReg == AMDGPU::NoRegister;
883         if (First)
884           CondReg = NewCondReg;
885 
        // Compare the value just read against the per-lane values to find all
        // lanes that use this same value.
887         B.buildInstr(AMDGPU::V_CMP_EQ_U32_e64)
888           .addDef(NewCondReg)
889           .addReg(CurrentLaneOpReg)
890           .addReg(OpReg);
891         Op.setReg(CurrentLaneOpReg);
892 
893         if (!First) {
894           Register AndReg = MRI.createVirtualRegister(WaveRC);
895 
          // If there are multiple operands to consider, AND the conditions together.
897           B.buildInstr(WaveAndOpc)
898             .addDef(AndReg)
899             .addReg(NewCondReg)
900             .addReg(CondReg);
901           CondReg = AndReg;
902         }
903       } else {
904         LLT S32 = LLT::scalar(32);
905         SmallVector<Register, 8> ReadlanePieces;
906 
907         // The compares can be done as 64-bit, but the extract needs to be done
908         // in 32-bit pieces.
909 
        bool Is64 = OpSize % 64 == 0;

        LLT UnmergeTy = Is64 ? LLT::scalar(64) : LLT::scalar(32);
        unsigned CmpOp =
            Is64 ? AMDGPU::V_CMP_EQ_U64_e64 : AMDGPU::V_CMP_EQ_U32_e64;
918 
919         // Insert the unmerge before the loop.
920 
921         B.setMBB(MBB);
922         auto Unmerge = B.buildUnmerge(UnmergeTy, OpReg);
923         B.setInstr(*I);
924 
925         unsigned NumPieces = Unmerge->getNumOperands() - 1;
926         for (unsigned PieceIdx = 0; PieceIdx != NumPieces; ++PieceIdx) {
927           Register UnmergePiece = Unmerge.getReg(PieceIdx);
928 
929           Register CurrentLaneOpReg;
930           if (Is64) {
931             Register CurrentLaneOpRegLo = MRI.createGenericVirtualRegister(S32);
932             Register CurrentLaneOpRegHi = MRI.createGenericVirtualRegister(S32);
933 
934             MRI.setRegClass(UnmergePiece, &AMDGPU::VReg_64RegClass);
935             MRI.setRegClass(CurrentLaneOpRegLo, &AMDGPU::SReg_32_XM0RegClass);
936             MRI.setRegClass(CurrentLaneOpRegHi, &AMDGPU::SReg_32_XM0RegClass);
937 
938             // Read the next variant <- also loop target.
939             BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
940                     CurrentLaneOpRegLo)
941               .addReg(UnmergePiece, 0, AMDGPU::sub0);
942 
943             // Read the next variant <- also loop target.
944             BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
945                     CurrentLaneOpRegHi)
946               .addReg(UnmergePiece, 0, AMDGPU::sub1);
947 
948             CurrentLaneOpReg =
949               B.buildMerge(LLT::scalar(64),
950                            {CurrentLaneOpRegLo, CurrentLaneOpRegHi})
951               .getReg(0);
952 
953             MRI.setRegClass(CurrentLaneOpReg, &AMDGPU::SReg_64_XEXECRegClass);
954 
955             if (OpTy.getScalarSizeInBits() == 64) {
              // If we need to produce a 64-bit element vector, use the merged
              // pieces.
958               ReadlanePieces.push_back(CurrentLaneOpReg);
959             } else {
960               // 32-bit element type.
961               ReadlanePieces.push_back(CurrentLaneOpRegLo);
962               ReadlanePieces.push_back(CurrentLaneOpRegHi);
963             }
964           } else {
965             CurrentLaneOpReg = MRI.createGenericVirtualRegister(S32);
966             MRI.setRegClass(UnmergePiece, &AMDGPU::VGPR_32RegClass);
967             MRI.setRegClass(CurrentLaneOpReg, &AMDGPU::SReg_32_XM0RegClass);
968 
969             // Read the next variant <- also loop target.
970             BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
971                     CurrentLaneOpReg)
972               .addReg(UnmergePiece);
973             ReadlanePieces.push_back(CurrentLaneOpReg);
974           }
975 
976           Register NewCondReg = MRI.createVirtualRegister(WaveRC);
977           bool First = CondReg == AMDGPU::NoRegister;
978           if (First)
979             CondReg = NewCondReg;
980 
981           B.buildInstr(CmpOp)
982             .addDef(NewCondReg)
983             .addReg(CurrentLaneOpReg)
984             .addReg(UnmergePiece);
985 
986           if (!First) {
987             Register AndReg = MRI.createVirtualRegister(WaveRC);
988 
            // If there are multiple operands to consider, AND the conditions together.
990             B.buildInstr(WaveAndOpc)
991               .addDef(AndReg)
992               .addReg(NewCondReg)
993               .addReg(CondReg);
994             CondReg = AndReg;
995           }
996         }
997 
998         // FIXME: Build merge seems to switch to CONCAT_VECTORS but not
999         // BUILD_VECTOR
1000         if (OpTy.isVector()) {
1001           auto Merge = B.buildBuildVector(OpTy, ReadlanePieces);
1002           Op.setReg(Merge.getReg(0));
1003         } else {
1004           auto Merge = B.buildMerge(OpTy, ReadlanePieces);
1005           Op.setReg(Merge.getReg(0));
1006         }
1007 
1008         MRI.setRegBank(Op.getReg(), AMDGPU::SGPRRegBank);
1009       }
1010 
1011       // Make sure we don't re-process this register again.
1012       WaterfalledRegMap.insert(std::make_pair(OldReg, Op.getReg()));
1013     }
1014   }
1015 
1016   B.setInsertPt(*LoopBB, LoopBB->end());
1017 
  // Update EXEC, saving the original EXEC value to NewExec.
1019   B.buildInstr(AndSaveExecOpc)
1020     .addDef(NewExec)
1021     .addReg(CondReg, RegState::Kill);
1022 
1023   MRI.setSimpleHint(NewExec, CondReg);
1024 
1025   // Update EXEC, switch all done bits to 0 and all todo bits to 1.
1026   B.buildInstr(XorTermOpc)
1027     .addDef(ExecReg)
1028     .addReg(ExecReg)
1029     .addReg(NewExec);
1030 
1031   // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
1032   // s_cbranch_scc0?
1033 
1034   // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
1035   B.buildInstr(AMDGPU::S_CBRANCH_EXECNZ)
1036     .addMBB(LoopBB);
1037 
1038   // Save the EXEC mask before the loop.
1039   BuildMI(MBB, MBB.end(), DL, TII->get(MovTermOpc), SaveExecReg)
1040     .addReg(ExecReg);
1041 
1042   // Restore the EXEC mask after the loop.
1043   B.setMBB(*RestoreExecBB);
1044   B.buildInstr(MovTermOpc)
1045     .addDef(ExecReg)
1046     .addReg(SaveExecReg);
1047 
1048   // Set the insert point after the original instruction, so any new
1049   // instructions will be in the remainder.
1050   B.setInsertPt(*RemainderBB, RemainderBB->begin());
1051 
1052   return true;
1053 }
1054 
1055 // Return any unique registers used by \p MI at \p OpIndices that need to be
1056 // handled in a waterfall loop. Returns these registers in \p
1057 // SGPROperandRegs. Returns true if there are any operands to handle and a
1058 // waterfall loop is necessary.
1059 bool AMDGPURegisterBankInfo::collectWaterfallOperands(
1060   SmallSet<Register, 4> &SGPROperandRegs, MachineInstr &MI,
1061   MachineRegisterInfo &MRI, ArrayRef<unsigned> OpIndices) const {
1062   for (unsigned Op : OpIndices) {
1063     assert(MI.getOperand(Op).isUse());
1064     Register Reg = MI.getOperand(Op).getReg();
1065     const RegisterBank *OpBank = getRegBank(Reg, MRI, *TRI);
1066     if (OpBank->getID() != AMDGPU::SGPRRegBankID)
1067       SGPROperandRegs.insert(Reg);
1068   }
1069 
  // A waterfall loop is only needed if some operand wasn't already an SGPR.
1071   return !SGPROperandRegs.empty();
1072 }
1073 
1074 bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
1075   MachineIRBuilder &B, MachineInstr &MI, MachineRegisterInfo &MRI,
1076   ArrayRef<unsigned> OpIndices) const {
1077   // Use a set to avoid extra readfirstlanes in the case where multiple operands
1078   // are the same register.
1079   SmallSet<Register, 4> SGPROperandRegs;
1080 
1081   if (!collectWaterfallOperands(SGPROperandRegs, MI, MRI, OpIndices))
1082     return false;
1083 
1084   MachineBasicBlock::iterator I = MI.getIterator();
1085   return executeInWaterfallLoop(B, make_range(I, std::next(I)),
1086                                 SGPROperandRegs, MRI);
1087 }
1088 
1089 bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
1090   MachineInstr &MI, MachineRegisterInfo &MRI,
1091   ArrayRef<unsigned> OpIndices) const {
1092   MachineIRBuilder B(MI);
1093   return executeInWaterfallLoop(B, MI, MRI, OpIndices);
1094 }
1095 
1096 // Legalize an operand that must be an SGPR by inserting a readfirstlane.
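//
// Roughly (register names illustrative):
//   ... = SOME_OP ..., %v:vgpr(s32), ...
// becomes
//   %s:sreg_32(s32) = V_READFIRSTLANE_B32 %v:vgpr_32(s32)
//   ... = SOME_OP ..., %s, ...
// This is only correct if the value is uniform, or if only the first active
// lane's value matters.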
1097 void AMDGPURegisterBankInfo::constrainOpWithReadfirstlane(
1098     MachineInstr &MI, MachineRegisterInfo &MRI, unsigned OpIdx) const {
1099   Register Reg = MI.getOperand(OpIdx).getReg();
1100   const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
1101   if (Bank == &AMDGPU::SGPRRegBank)
1102     return;
1103 
1104   LLT Ty = MRI.getType(Reg);
1105   MachineIRBuilder B(MI);
1106 
1107   if (Bank != &AMDGPU::VGPRRegBank) {
1108     // We need to copy from AGPR to VGPR
1109     Reg = B.buildCopy(Ty, Reg).getReg(0);
1110     MRI.setRegBank(Reg, AMDGPU::VGPRRegBank);
1111   }
1112 
1113   Register SGPR = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1114   B.buildInstr(AMDGPU::V_READFIRSTLANE_B32)
1115     .addDef(SGPR)
1116     .addReg(Reg);
1117 
1118   MRI.setType(SGPR, Ty);
1119 
1120   const TargetRegisterClass *Constrained =
1121       constrainGenericRegister(Reg, AMDGPU::VGPR_32RegClass, MRI);
1122   (void)Constrained;
1123   assert(Constrained && "Failed to constrain readfirstlane src reg");
1124 
1125   MI.getOperand(OpIdx).setReg(SGPR);
1126 }
1127 
1128 /// Split \p Ty into 2 pieces. The first will have \p FirstSize bits, and the
1129 /// rest will be in the remainder.
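/// For example, splitUnequalType(s96, 64) produces {s64, s32}, and
/// splitUnequalType(<3 x s32>, 64) produces {<2 x s32>, s32}.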
1130 static std::pair<LLT, LLT> splitUnequalType(LLT Ty, unsigned FirstSize) {
1131   unsigned TotalSize = Ty.getSizeInBits();
1132   if (!Ty.isVector())
1133     return {LLT::scalar(FirstSize), LLT::scalar(TotalSize - FirstSize)};
1134 
1135   LLT EltTy = Ty.getElementType();
1136   unsigned EltSize = EltTy.getSizeInBits();
1137   assert(FirstSize % EltSize == 0);
1138 
1139   unsigned FirstPartNumElts = FirstSize / EltSize;
1140   unsigned RemainderElts = (TotalSize - FirstSize) / EltSize;
1141 
1142   return {LLT::scalarOrVector(FirstPartNumElts, EltTy),
1143           LLT::scalarOrVector(RemainderElts, EltTy)};
1144 }
1145 
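// Widen a 96-bit result type to the 128-bit type that is actually loaded, e.g.
// s96 -> s128 and <3 x s32> -> <4 x s32>.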
1146 static LLT widen96To128(LLT Ty) {
1147   if (!Ty.isVector())
1148     return LLT::scalar(128);
1149 
1150   LLT EltTy = Ty.getElementType();
1151   assert(128 % EltTy.getSizeInBits() == 0);
1152   return LLT::vector(128 / EltTy.getSizeInBits(), EltTy);
1153 }
1154 
1155 bool AMDGPURegisterBankInfo::applyMappingLoad(MachineInstr &MI,
1156                         const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
1157                                               MachineRegisterInfo &MRI) const {
1158   Register DstReg = MI.getOperand(0).getReg();
1159   const LLT LoadTy = MRI.getType(DstReg);
1160   unsigned LoadSize = LoadTy.getSizeInBits();
1161   const unsigned MaxNonSmrdLoadSize = 128;
1162 
1163   const RegisterBank *PtrBank =
1164     OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
1165   if (PtrBank == &AMDGPU::SGPRRegBank) {
1166     // If the pointer is an SGPR, we ordinarily have nothing to do.
1167     if (LoadSize != 96)
1168       return false;
1169 
1170     MachineMemOperand *MMO = *MI.memoperands_begin();
1171     Register PtrReg = MI.getOperand(1).getReg();
    // 96-bit loads are only available for vector loads. We need to split this
    // into a 64-bit piece and a 32-bit piece (unless we can widen it to a
    // 128-bit load).
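    //
    // For an under-aligned s96 scalar load this roughly becomes:
    //   %lo:sgpr(s64)  = load from Ptr + 0
    //   %hi:sgpr(s32)  = load from Ptr + 8
    //   %res:sgpr(s96) = insert %lo at bit 0, %hi at bit 64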
1174 
1175     ApplyRegBankMapping O(*this, MRI, &AMDGPU::SGPRRegBank);
1176     MachineIRBuilder B(MI, O);
1177 
1178     if (MMO->getAlign() < Align(16)) {
1179       LLT Part64, Part32;
1180       std::tie(Part64, Part32) = splitUnequalType(LoadTy, 64);
1181       auto Load0 = B.buildLoadFromOffset(Part64, PtrReg, *MMO, 0);
1182       auto Load1 = B.buildLoadFromOffset(Part32, PtrReg, *MMO, 8);
1183 
1184       auto Undef = B.buildUndef(LoadTy);
1185       auto Ins0 = B.buildInsert(LoadTy, Undef, Load0, 0);
1186       B.buildInsert(MI.getOperand(0), Ins0, Load1, 64);
1187     } else {
1188       LLT WiderTy = widen96To128(LoadTy);
1189       auto WideLoad = B.buildLoadFromOffset(WiderTy, PtrReg, *MMO, 0);
1190       B.buildExtract(MI.getOperand(0), WideLoad, 0);
1191     }
1192 
1193     MI.eraseFromParent();
1194     return true;
1195   }
1196 
1197   // 128-bit loads are supported for all instruction types.
1198   if (LoadSize <= MaxNonSmrdLoadSize)
1199     return false;
1200 
1201   SmallVector<Register, 16> DefRegs(OpdMapper.getVRegs(0));
1202   SmallVector<Register, 1> SrcRegs(OpdMapper.getVRegs(1));
1203 
1204   if (SrcRegs.empty())
1205     SrcRegs.push_back(MI.getOperand(1).getReg());
1206 
1207   assert(LoadSize % MaxNonSmrdLoadSize == 0);
1208 
1209   // RegBankSelect only emits scalar types, so we need to reset the pointer
1210   // operand to a pointer type.
1211   Register BasePtrReg = SrcRegs[0];
1212   LLT PtrTy = MRI.getType(MI.getOperand(1).getReg());
1213   MRI.setType(BasePtrReg, PtrTy);
1214 
1215   unsigned NumSplitParts = LoadTy.getSizeInBits() / MaxNonSmrdLoadSize;
1216   const LLT LoadSplitTy = LoadTy.divide(NumSplitParts);
1217   ApplyRegBankMapping Observer(*this, MRI, &AMDGPU::VGPRRegBank);
1218   MachineIRBuilder B(MI, Observer);
1219   LegalizerHelper Helper(B.getMF(), Observer, B);
1220 
1221   if (LoadTy.isVector()) {
1222     if (Helper.fewerElementsVector(MI, 0, LoadSplitTy) != LegalizerHelper::Legalized)
1223       return false;
1224   } else {
1225     if (Helper.narrowScalar(MI, 0, LoadSplitTy) != LegalizerHelper::Legalized)
1226       return false;
1227   }
1228 
1229   MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
1230   return true;
1231 }
1232 
1233 bool AMDGPURegisterBankInfo::applyMappingDynStackAlloc(
1234   MachineInstr &MI,
1235   const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
1236   MachineRegisterInfo &MRI) const {
1237   const MachineFunction &MF = *MI.getMF();
1238   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1239   const auto &TFI = *ST.getFrameLowering();
1240 
1241   // Guard in case the stack growth direction ever changes with scratch
1242   // instructions.
1243   if (TFI.getStackGrowthDirection() == TargetFrameLowering::StackGrowsDown)
1244     return false;
1245 
1246   Register Dst = MI.getOperand(0).getReg();
1247   Register AllocSize = MI.getOperand(1).getReg();
1248   Align Alignment = assumeAligned(MI.getOperand(2).getImm());
1249 
1250   const RegisterBank *SizeBank = getRegBank(AllocSize, MRI, *TRI);
1251 
1252   // TODO: Need to emit a wave reduction to get the maximum size.
1253   if (SizeBank != &AMDGPU::SGPRRegBank)
1254     return false;
1255 
1256   LLT PtrTy = MRI.getType(Dst);
1257   LLT IntPtrTy = LLT::scalar(PtrTy.getSizeInBits());
1258 
1259   const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1260   Register SPReg = Info->getStackPtrOffsetReg();
1261   ApplyRegBankMapping ApplyBank(*this, MRI, &AMDGPU::SGPRRegBank);
1262   MachineIRBuilder B(MI, ApplyBank);
1263 
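  // The IR allocation size is per lane, but the scratch stack pointer is a
  // per-wave offset, so scale the size by the wavefront size:
  //   NewSP = SP + (AllocSize << WavefrontSizeLog2)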
1264   auto WaveSize = B.buildConstant(LLT::scalar(32), ST.getWavefrontSizeLog2());
1265   auto ScaledSize = B.buildShl(IntPtrTy, AllocSize, WaveSize);
1266 
1267   auto SPCopy = B.buildCopy(PtrTy, SPReg);
1268   if (Alignment > TFI.getStackAlign()) {
1269     auto PtrAdd = B.buildPtrAdd(PtrTy, SPCopy, ScaledSize);
1270     B.buildMaskLowPtrBits(Dst, PtrAdd,
1271                           Log2(Alignment) + ST.getWavefrontSizeLog2());
1272   } else {
1273     B.buildPtrAdd(Dst, SPCopy, ScaledSize);
1274   }
1275 
1276   MI.eraseFromParent();
1277   return true;
1278 }
1279 
1280 bool AMDGPURegisterBankInfo::applyMappingImage(
1281     MachineInstr &MI, const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
1282     MachineRegisterInfo &MRI, int RsrcIdx) const {
1283   const int NumDefs = MI.getNumExplicitDefs();
1284 
1285   // The reported argument index is relative to the IR intrinsic call arguments,
1286   // so we need to shift by the number of defs and the intrinsic ID.
1287   RsrcIdx += NumDefs + 1;
1288 
1289   // Insert copies to VGPR arguments.
1290   applyDefaultMapping(OpdMapper);
1291 
1292   // Fixup any SGPR arguments.
1293   SmallVector<unsigned, 4> SGPRIndexes;
1294   for (int I = NumDefs, NumOps = MI.getNumOperands(); I != NumOps; ++I) {
1295     if (!MI.getOperand(I).isReg())
1296       continue;
1297 
1298     // If this intrinsic has a sampler, it immediately follows rsrc.
1299     if (I == RsrcIdx || I == RsrcIdx + 1)
1300       SGPRIndexes.push_back(I);
1301   }
1302 
1303   executeInWaterfallLoop(MI, MRI, SGPRIndexes);
1304   return true;
1305 }
1306 
1307 static Register getSrcRegIgnoringCopies(const MachineRegisterInfo &MRI,
1308                                         Register Reg) {
1309   MachineInstr *Def = getDefIgnoringCopies(Reg, MRI);
1310   if (!Def)
1311     return Reg;
1312 
1313   // TODO: Guard against this being an implicit def
1314   return Def->getOperand(0).getReg();
1315 }
1316 
// Analyze a combined offset from an llvm.amdgcn.s.buffer.load intrinsic and
// store the three offsets (voffset, soffset and instoffset).
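//
// Conceptually the decomposition satisfies:
//   CombinedOffset == VOffsetReg + SOffsetReg + InstOffsetVal
// where InstOffsetVal must fit in the instruction's immediate offset field.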
1319 static unsigned setBufferOffsets(MachineIRBuilder &B,
1320                                  const AMDGPURegisterBankInfo &RBI,
1321                                  Register CombinedOffset, Register &VOffsetReg,
1322                                  Register &SOffsetReg, int64_t &InstOffsetVal,
1323                                  Align Alignment) {
1324   const LLT S32 = LLT::scalar(32);
1325   MachineRegisterInfo *MRI = B.getMRI();
1326 
1327   if (Optional<int64_t> Imm = getConstantVRegSExtVal(CombinedOffset, *MRI)) {
1328     uint32_t SOffset, ImmOffset;
1329     if (AMDGPU::splitMUBUFOffset(*Imm, SOffset, ImmOffset, &RBI.Subtarget,
1330                                  Alignment)) {
1331       VOffsetReg = B.buildConstant(S32, 0).getReg(0);
1332       SOffsetReg = B.buildConstant(S32, SOffset).getReg(0);
1333       InstOffsetVal = ImmOffset;
1334 
1335       B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank);
1336       B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank);
1337       return SOffset + ImmOffset;
1338     }
1339   }
1340 
1341   Register Base;
1342   unsigned Offset;
1343 
1344   std::tie(Base, Offset) =
1345       AMDGPU::getBaseWithConstantOffset(*MRI, CombinedOffset);
1346 
1347   uint32_t SOffset, ImmOffset;
1348   if (Offset > 0 && AMDGPU::splitMUBUFOffset(Offset, SOffset, ImmOffset,
1349                                              &RBI.Subtarget, Alignment)) {
1350     if (RBI.getRegBank(Base, *MRI, *RBI.TRI) == &AMDGPU::VGPRRegBank) {
1351       VOffsetReg = Base;
1352       SOffsetReg = B.buildConstant(S32, SOffset).getReg(0);
1353       B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank);
1354       InstOffsetVal = ImmOffset;
1355       return 0; // XXX - Why is this 0?
1356     }
1357 
1358     // If we have SGPR base, we can use it for soffset.
1359     if (SOffset == 0) {
1360       VOffsetReg = B.buildConstant(S32, 0).getReg(0);
1361       B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank);
1362       SOffsetReg = Base;
1363       InstOffsetVal = ImmOffset;
1364       return 0; // XXX - Why is this 0?
1365     }
1366   }
1367 
1368   // Handle the variable sgpr + vgpr case.
1369   if (MachineInstr *Add = getOpcodeDef(AMDGPU::G_ADD, CombinedOffset, *MRI)) {
1370     Register Src0 = getSrcRegIgnoringCopies(*MRI, Add->getOperand(1).getReg());
1371     Register Src1 = getSrcRegIgnoringCopies(*MRI, Add->getOperand(2).getReg());
1372 
1373     const RegisterBank *Src0Bank = RBI.getRegBank(Src0, *MRI, *RBI.TRI);
1374     const RegisterBank *Src1Bank = RBI.getRegBank(Src1, *MRI, *RBI.TRI);
1375 
1376     if (Src0Bank == &AMDGPU::VGPRRegBank && Src1Bank == &AMDGPU::SGPRRegBank) {
1377       VOffsetReg = Src0;
1378       SOffsetReg = Src1;
1379       return 0;
1380     }
1381 
1382     if (Src0Bank == &AMDGPU::SGPRRegBank && Src1Bank == &AMDGPU::VGPRRegBank) {
1383       VOffsetReg = Src1;
1384       SOffsetReg = Src0;
1385       return 0;
1386     }
1387   }
1388 
1389   // Ensure we have a VGPR for the combined offset. This could be an issue if we
1390   // have an SGPR offset and a VGPR resource.
1391   if (RBI.getRegBank(CombinedOffset, *MRI, *RBI.TRI) == &AMDGPU::VGPRRegBank) {
1392     VOffsetReg = CombinedOffset;
1393   } else {
1394     VOffsetReg = B.buildCopy(S32, CombinedOffset).getReg(0);
1395     B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank);
1396   }
1397 
1398   SOffsetReg = B.buildConstant(S32, 0).getReg(0);
1399   B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank);
1400   return 0;
1401 }
1402 
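/// Rewrite a scalar buffer load whose resource or offset is not uniform into
/// one or more MUBUF buffer loads, splitting 256/512-bit results into 128-bit
/// pieces and wrapping the loads in a waterfall loop when the resource
/// descriptor is divergent.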
1403 bool AMDGPURegisterBankInfo::applyMappingSBufferLoad(
1404   const OperandsMapper &OpdMapper) const {
1405   MachineInstr &MI = OpdMapper.getMI();
1406   MachineRegisterInfo &MRI = OpdMapper.getMRI();
1407 
1408   const LLT S32 = LLT::scalar(32);
1409   Register Dst = MI.getOperand(0).getReg();
1410   LLT Ty = MRI.getType(Dst);
1411 
1412   const RegisterBank *RSrcBank =
1413     OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
1414   const RegisterBank *OffsetBank =
1415     OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
1416   if (RSrcBank == &AMDGPU::SGPRRegBank &&
1417       OffsetBank == &AMDGPU::SGPRRegBank)
1418     return true; // Legal mapping
1419 
  // FIXME: The 96-bit case was widened during legalization. We need to narrow
  // it back here, but we don't have an MMO.
1422 
1423   unsigned LoadSize = Ty.getSizeInBits();
1424   int NumLoads = 1;
1425   if (LoadSize == 256 || LoadSize == 512) {
1426     NumLoads = LoadSize / 128;
1427     Ty = Ty.divide(NumLoads);
1428   }
1429 
  // Use the alignment to ensure that the required offsets will fit into the
  // immediate offset field.
1432   const Align Alignment = NumLoads > 1 ? Align(16 * NumLoads) : Align(1);
1433 
1434   MachineIRBuilder B(MI);
1435   MachineFunction &MF = B.getMF();
1436 
1437   Register SOffset;
1438   Register VOffset;
1439   int64_t ImmOffset = 0;
1440 
1441   unsigned MMOOffset = setBufferOffsets(B, *this, MI.getOperand(2).getReg(),
1442                                         VOffset, SOffset, ImmOffset, Alignment);
1443 
  // TODO: 96-bit loads were widened to 128-bit results. Shrink the result if we
  // can, but we need to track an MMO for that.
1446   const unsigned MemSize = (Ty.getSizeInBits() + 7) / 8;
1447   const Align MemAlign(4); // FIXME: ABI type alignment?
1448   MachineMemOperand *BaseMMO = MF.getMachineMemOperand(
1449     MachinePointerInfo(),
1450     MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1451     MachineMemOperand::MOInvariant,
1452     MemSize, MemAlign);
1453   if (MMOOffset != 0)
1454     BaseMMO = MF.getMachineMemOperand(BaseMMO, MMOOffset, MemSize);
1455 
1456   // If only the offset is divergent, emit a MUBUF buffer load instead. We can
1457   // assume that the buffer is unswizzled.
1458 
1459   Register RSrc = MI.getOperand(1).getReg();
1460   Register VIndex = B.buildConstant(S32, 0).getReg(0);
1461   B.getMRI()->setRegBank(VIndex, AMDGPU::VGPRRegBank);
1462 
1463   SmallVector<Register, 4> LoadParts(NumLoads);
1464 
1465   MachineBasicBlock::iterator MII = MI.getIterator();
1466   MachineInstrSpan Span(MII, &B.getMBB());
1467 
1468   for (int i = 0; i < NumLoads; ++i) {
1469     if (NumLoads == 1) {
1470       LoadParts[i] = Dst;
1471     } else {
1472       LoadParts[i] = MRI.createGenericVirtualRegister(Ty);
1473       MRI.setRegBank(LoadParts[i], AMDGPU::VGPRRegBank);
1474     }
1475 
1476     MachineMemOperand *MMO = BaseMMO;
1477     if (i != 0)
1478       BaseMMO = MF.getMachineMemOperand(BaseMMO, MMOOffset + 16 * i, MemSize);
1479 
1480     B.buildInstr(AMDGPU::G_AMDGPU_BUFFER_LOAD)
1481       .addDef(LoadParts[i])       // vdata
1482       .addUse(RSrc)               // rsrc
1483       .addUse(VIndex)             // vindex
1484       .addUse(VOffset)            // voffset
1485       .addUse(SOffset)            // soffset
1486       .addImm(ImmOffset + 16 * i) // offset(imm)
1487       .addImm(0)                  // cachepolicy, swizzled buffer(imm)
1488       .addImm(0)                  // idxen(imm)
1489       .addMemOperand(MMO);
1490   }
1491 
1492   // TODO: If only the resource is a VGPR, it may be better to execute the
1493   // scalar load in the waterfall loop if the resource is expected to frequently
1494   // be dynamically uniform.
1495   if (RSrcBank != &AMDGPU::SGPRRegBank) {
1496     // Remove the original instruction to avoid potentially confusing the
1497     // waterfall loop logic.
1498     B.setInstr(*Span.begin());
1499     MI.eraseFromParent();
1500 
1501     SmallSet<Register, 4> OpsToWaterfall;
1502 
1503     OpsToWaterfall.insert(RSrc);
1504     executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()),
1505                            OpsToWaterfall, MRI);
1506   }
1507 
1508   if (NumLoads != 1) {
1509     if (Ty.isVector())
1510       B.buildConcatVectors(Dst, LoadParts);
1511     else
1512       B.buildMerge(Dst, LoadParts);
1513   }
1514 
  // The waterfall loop path above already erased the original instruction, so
  // only erase it here if we took the uniform (SGPR resource) path.
1516   if (RSrcBank == &AMDGPU::SGPRRegBank)
1517     MI.eraseFromParent();
1518 
1519   return true;
1520 }
1521 
1522 bool AMDGPURegisterBankInfo::applyMappingBFEIntrinsic(
1523   const OperandsMapper &OpdMapper, bool Signed) const {
1524   MachineInstr &MI = OpdMapper.getMI();
1525   MachineRegisterInfo &MRI = OpdMapper.getMRI();
1526 
1527   // Insert basic copies
1528   applyDefaultMapping(OpdMapper);
1529 
1530   Register DstReg = MI.getOperand(0).getReg();
1531   LLT Ty = MRI.getType(DstReg);
1532 
1533   const LLT S32 = LLT::scalar(32);
1534 
1535   const RegisterBank *DstBank =
1536     OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
1537   if (DstBank == &AMDGPU::VGPRRegBank) {
1538     if (Ty == S32)
1539       return true;
1540 
1541     // TODO: 64-bit version is scalar only, so we need to expand this.
1542     return false;
1543   }
1544 
1545   Register SrcReg = MI.getOperand(2).getReg();
1546   Register OffsetReg = MI.getOperand(3).getReg();
1547   Register WidthReg = MI.getOperand(4).getReg();
1548 
1549   // The scalar form packs the offset and width in a single operand.
1550 
1551   ApplyRegBankMapping ApplyBank(*this, MRI, &AMDGPU::SGPRRegBank);
1552   MachineIRBuilder B(MI, ApplyBank);
1553 
1554   // Ensure the high bits are clear to insert the offset.
1555   auto OffsetMask = B.buildConstant(S32, maskTrailingOnes<unsigned>(6));
1556   auto ClampOffset = B.buildAnd(S32, OffsetReg, OffsetMask);
1557 
1558   // Zeros out the low bits, so don't bother clamping the input value.
1559   auto ShiftWidth = B.buildShl(S32, WidthReg, B.buildConstant(S32, 16));
1560 
  // Pack the offset and width of the BFE into the format expected by
  // S_BFE_I32 / S_BFE_U32: in the second source operand, bits [5:0] contain
  // the offset and bits [22:16] the width.
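  // For example, offset = 8 and width = 4 pack to 0x00040008.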
1564   auto MergedInputs = B.buildOr(S32, ClampOffset, ShiftWidth);
1565 
1566   // TODO: It might be worth using a pseudo here to avoid scc clobber and
1567   // register class constraints.
1568   unsigned Opc = Ty == S32 ? (Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32) :
1569                              (Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64);
1570 
1571   auto MIB = B.buildInstr(Opc, {DstReg}, {SrcReg, MergedInputs});
1572   if (!constrainSelectedInstRegOperands(*MIB, *TII, *TRI, *this))
1573     llvm_unreachable("failed to constrain BFE");
1574 
1575   MI.eraseFromParent();
1576   return true;
1577 }
1578 
1579 // FIXME: Duplicated from LegalizerHelper
1580 static CmpInst::Predicate minMaxToCompare(unsigned Opc) {
1581   switch (Opc) {
1582   case TargetOpcode::G_SMIN:
1583     return CmpInst::ICMP_SLT;
1584   case TargetOpcode::G_SMAX:
1585     return CmpInst::ICMP_SGT;
1586   case TargetOpcode::G_UMIN:
1587     return CmpInst::ICMP_ULT;
1588   case TargetOpcode::G_UMAX:
1589     return CmpInst::ICMP_UGT;
1590   default:
1591     llvm_unreachable("not in integer min/max");
1592   }
1593 }
1594 
1595 static unsigned minMaxToExtend(unsigned Opc) {
1596   switch (Opc) {
1597   case TargetOpcode::G_SMIN:
1598   case TargetOpcode::G_SMAX:
1599     return TargetOpcode::G_SEXT;
1600   case TargetOpcode::G_UMIN:
1601   case TargetOpcode::G_UMAX:
1602     return TargetOpcode::G_ZEXT;
1603   default:
1604     llvm_unreachable("not in integer min/max");
1605   }
1606 }
1607 
1608 // Emit a legalized extension from <2 x s16> to 2 32-bit components, avoiding
1609 // any illegal vector extend or unmerge operations.
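// For example, unpacking 0xBBBBAAAA with a zero-extending opcode yields the
// pair (0x0000AAAA, 0x0000BBBB).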
1610 static std::pair<Register, Register>
1611 unpackV2S16ToS32(MachineIRBuilder &B, Register Src, unsigned ExtOpcode) {
1612   const LLT S32 = LLT::scalar(32);
1613   auto Bitcast = B.buildBitcast(S32, Src);
1614 
1615   if (ExtOpcode == TargetOpcode::G_SEXT) {
1616     auto ExtLo = B.buildSExtInReg(S32, Bitcast, 16);
1617     auto ShiftHi = B.buildAShr(S32, Bitcast, B.buildConstant(S32, 16));
1618     return std::make_pair(ExtLo.getReg(0), ShiftHi.getReg(0));
1619   }
1620 
1621   auto ShiftHi = B.buildLShr(S32, Bitcast, B.buildConstant(S32, 16));
1622   if (ExtOpcode == TargetOpcode::G_ZEXT) {
1623     auto ExtLo = B.buildAnd(S32, Bitcast, B.buildConstant(S32, 0xffff));
1624     return std::make_pair(ExtLo.getReg(0), ShiftHi.getReg(0));
1625   }
1626 
1627   assert(ExtOpcode == TargetOpcode::G_ANYEXT);
1628   return std::make_pair(Bitcast.getReg(0), ShiftHi.getReg(0));
1629 }
1630 
1631 static MachineInstr *buildExpandedScalarMinMax(MachineIRBuilder &B,
1632                                                CmpInst::Predicate Pred,
1633                                                Register Dst, Register Src0,
1634                                                Register Src1) {
1635   const LLT CmpType = LLT::scalar(32);
1636   auto Cmp = B.buildICmp(Pred, CmpType, Src0, Src1);
1637   return B.buildSelect(Dst, Cmp, Src0, Src1);
1638 }
1639 
1640 // FIXME: Duplicated from LegalizerHelper, except changing the boolean type.
1641 void AMDGPURegisterBankInfo::lowerScalarMinMax(MachineIRBuilder &B,
1642                                                MachineInstr &MI) const {
1643   Register Dst = MI.getOperand(0).getReg();
1644   Register Src0 = MI.getOperand(1).getReg();
1645   Register Src1 = MI.getOperand(2).getReg();
1646 
1647   const CmpInst::Predicate Pred = minMaxToCompare(MI.getOpcode());
1648   MachineInstr *Sel = buildExpandedScalarMinMax(B, Pred, Dst, Src0, Src1);
1649 
1650   Register CmpReg = Sel->getOperand(1).getReg();
1651   B.getMRI()->setRegBank(CmpReg, AMDGPU::SGPRRegBank);
1652   MI.eraseFromParent();
1653 }
1654 
// For cases where only a single copy is inserted to match register banks,
// replace the register in the instruction operand.
1657 static bool substituteSimpleCopyRegs(
1658   const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper, unsigned OpIdx) {
1659   SmallVector<unsigned, 1> SrcReg(OpdMapper.getVRegs(OpIdx));
1660   if (!SrcReg.empty()) {
1661     assert(SrcReg.size() == 1);
1662     OpdMapper.getMI().getOperand(OpIdx).setReg(SrcReg[0]);
1663     return true;
1664   }
1665 
1666   return false;
1667 }
1668 
1669 /// Handle register layout difference for f16 images for some subtargets.
1670 Register AMDGPURegisterBankInfo::handleD16VData(MachineIRBuilder &B,
1671                                                 MachineRegisterInfo &MRI,
1672                                                 Register Reg) const {
1673   if (!Subtarget.hasUnpackedD16VMem())
1674     return Reg;
1675 
1676   const LLT S16 = LLT::scalar(16);
1677   LLT StoreVT = MRI.getType(Reg);
1678   if (!StoreVT.isVector() || StoreVT.getElementType() != S16)
1679     return Reg;
1680 
1681   auto Unmerge = B.buildUnmerge(S16, Reg);
1682 
1683 
1684   SmallVector<Register, 4> WideRegs;
1685   for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
1686     WideRegs.push_back(Unmerge.getReg(I));
1687 
1688   const LLT S32 = LLT::scalar(32);
1689   int NumElts = StoreVT.getNumElements();
1690 
1691   return B.buildMerge(LLT::vector(NumElts, S32), WideRegs).getReg(0);
1692 }
1693 
1694 static std::pair<Register, unsigned>
1695 getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg) {
1696   int64_t Const;
1697   if (mi_match(Reg, MRI, m_ICst(Const)))
1698     return std::make_pair(Register(), Const);
1699 
1700   Register Base;
1701   if (mi_match(Reg, MRI, m_GAdd(m_Reg(Base), m_ICst(Const))))
1702     return std::make_pair(Base, Const);
1703 
1704   // TODO: Handle G_OR used for add case
1705   return std::make_pair(Reg, 0);
1706 }
1707 
1708 std::pair<Register, unsigned>
1709 AMDGPURegisterBankInfo::splitBufferOffsets(MachineIRBuilder &B,
1710                                            Register OrigOffset) const {
1711   const unsigned MaxImm = 4095;
1712   Register BaseReg;
1713   unsigned ImmOffset;
1714   const LLT S32 = LLT::scalar(32);
1715 
1716   std::tie(BaseReg, ImmOffset) = getBaseWithConstantOffset(*B.getMRI(),
1717                                                            OrigOffset);
1718 
1719   unsigned C1 = 0;
1720   if (ImmOffset != 0) {
1721     // If the immediate value is too big for the immoffset field, put the value
1722     // and -4096 into the immoffset field so that the value that is copied/added
1723     // for the voffset field is a multiple of 4096, and it stands more chance
1724     // of being CSEd with the copy/add for another similar load/store.
1725     // However, do not do that rounding down to a multiple of 4096 if that is a
1726     // negative number, as it appears to be illegal to have a negative offset
1727     // in the vgpr, even if adding the immediate offset makes it positive.
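    // For example, ImmOffset = 5000 becomes Overflow = 4096 (folded into the
    // base register) and an immediate offset of 904.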
1728     unsigned Overflow = ImmOffset & ~MaxImm;
1729     ImmOffset -= Overflow;
1730     if ((int32_t)Overflow < 0) {
1731       Overflow += ImmOffset;
1732       ImmOffset = 0;
1733     }
1734 
1735     C1 = ImmOffset;
1736     if (Overflow != 0) {
1737       if (!BaseReg)
1738         BaseReg = B.buildConstant(S32, Overflow).getReg(0);
1739       else {
1740         auto OverflowVal = B.buildConstant(S32, Overflow);
1741         BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
1742       }
1743     }
1744   }
1745 
1746   if (!BaseReg)
1747     BaseReg = B.buildConstant(S32, 0).getReg(0);
1748 
1749   return {BaseReg, C1};
1750 }
1751 
1752 static bool isZero(Register Reg, MachineRegisterInfo &MRI) {
1753   int64_t C;
1754   return mi_match(Reg, MRI, m_ICst(C)) && C == 0;
1755 }
1756 
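// The buffer intrinsic cachepolicy immediate packs glc in bit 0, slc in bit 1
// and dlc in bit 2.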
1757 static unsigned extractGLC(unsigned CachePolicy) {
1758   return CachePolicy & 1;
1759 }
1760 
1761 static unsigned extractSLC(unsigned CachePolicy) {
1762   return (CachePolicy >> 1) & 1;
1763 }
1764 
1765 static unsigned extractDLC(unsigned CachePolicy) {
1766   return (CachePolicy >> 2) & 1;
1767 }
1768 
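/// Select a MUBUF store instruction for a buffer store intrinsic. The rsrc
/// (operand 2) and soffset (operand 4) operands are made uniform with a
/// waterfall loop before the store is selected.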
1769 MachineInstr *
1770 AMDGPURegisterBankInfo::selectStoreIntrinsic(MachineIRBuilder &B,
1771                                              MachineInstr &MI) const {
  MachineRegisterInfo &MRI = *B.getMRI();
1773   executeInWaterfallLoop(B, MI, MRI, {2, 4});
1774 
1775   // FIXME: DAG lowering brokenly changes opcode based on FP vs. integer.
1776 
1777   Register VData = MI.getOperand(1).getReg();
1778   LLT Ty = MRI.getType(VData);
1779 
1780   int EltSize = Ty.getScalarSizeInBits();
1781   int Size = Ty.getSizeInBits();
1782 
1783   // FIXME: Broken integer truncstore.
1784   if (EltSize != 32)
1785     report_fatal_error("unhandled intrinsic store");
1786 
1787   // FIXME: Verifier should enforce 1 MMO for these intrinsics.
1788   const int MemSize = (*MI.memoperands_begin())->getSize();
1789 
1790 
1791   Register RSrc = MI.getOperand(2).getReg();
1792   Register VOffset = MI.getOperand(3).getReg();
1793   Register SOffset = MI.getOperand(4).getReg();
1794   unsigned CachePolicy = MI.getOperand(5).getImm();
1795 
1796   unsigned ImmOffset;
1797   std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
1798 
1799   const bool Offen = !isZero(VOffset, MRI);
1800 
1801   unsigned Opc = AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact;
1802   switch (8 * MemSize) {
1803   case 8:
1804     Opc = Offen ? AMDGPU::BUFFER_STORE_BYTE_OFFEN_exact :
1805                   AMDGPU::BUFFER_STORE_BYTE_OFFSET_exact;
1806     break;
1807   case 16:
1808     Opc = Offen ? AMDGPU::BUFFER_STORE_SHORT_OFFEN_exact :
1809                   AMDGPU::BUFFER_STORE_SHORT_OFFSET_exact;
1810     break;
1811   default:
1812     Opc = Offen ? AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact :
1813                   AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact;
1814     if (Size > 32)
1815       Opc = AMDGPU::getMUBUFOpcode(Opc, Size / 32);
1816     break;
1817   }
1818 
1819 
1820   // Set the insertion point back to the instruction in case it was moved into a
1821   // loop.
1822   B.setInstr(MI);
1823 
1824   MachineInstrBuilder MIB = B.buildInstr(Opc)
1825     .addUse(VData);
1826 
1827   if (Offen)
1828     MIB.addUse(VOffset);
1829 
1830   MIB.addUse(RSrc)
1831      .addUse(SOffset)
1832      .addImm(ImmOffset)
1833      .addImm(extractGLC(CachePolicy))
1834      .addImm(extractSLC(CachePolicy))
1835      .addImm(0) // tfe: FIXME: Remove from inst
1836      .addImm(extractDLC(CachePolicy))
1837      .cloneMemRefs(MI);
1838 
1839   // FIXME: We need a way to report failure from applyMappingImpl.
1840   // Insert constrain copies before inserting the loop.
1841   if (!constrainSelectedInstRegOperands(*MIB, *TII, *TRI, *this))
1842     report_fatal_error("failed to constrain selected store intrinsic");
1843 
1844   return MIB;
1845 }
1846 
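/// Copy an SGPR value \p SrcReg into the VGPR \p DstReg using V_MOV_B32 so
/// the implicit dependency on exec is explicit; 64-bit values are copied as
/// two 32-bit halves combined with a REG_SEQUENCE.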
1847 bool AMDGPURegisterBankInfo::buildVCopy(MachineIRBuilder &B, Register DstReg,
1848                                         Register SrcReg) const {
1849   MachineRegisterInfo &MRI = *B.getMRI();
1850   LLT SrcTy = MRI.getType(SrcReg);
1851   if (SrcTy.getSizeInBits() == 32) {
1852     // Use a v_mov_b32 here to make the exec dependency explicit.
1853     B.buildInstr(AMDGPU::V_MOV_B32_e32)
1854       .addDef(DstReg)
1855       .addUse(SrcReg);
1856     return constrainGenericRegister(DstReg, AMDGPU::VGPR_32RegClass, MRI) &&
1857            constrainGenericRegister(SrcReg, AMDGPU::SReg_32RegClass, MRI);
1858   }
1859 
1860   Register TmpReg0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1861   Register TmpReg1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1862 
1863   B.buildInstr(AMDGPU::V_MOV_B32_e32)
1864     .addDef(TmpReg0)
1865     .addUse(SrcReg, 0, AMDGPU::sub0);
1866   B.buildInstr(AMDGPU::V_MOV_B32_e32)
1867     .addDef(TmpReg1)
1868     .addUse(SrcReg, 0, AMDGPU::sub1);
1869   B.buildInstr(AMDGPU::REG_SEQUENCE)
1870     .addDef(DstReg)
1871     .addUse(TmpReg0)
1872     .addImm(AMDGPU::sub0)
1873     .addUse(TmpReg1)
1874     .addImm(AMDGPU::sub1);
1875 
1876   return constrainGenericRegister(SrcReg, AMDGPU::SReg_64RegClass, MRI) &&
1877          constrainGenericRegister(DstReg, AMDGPU::VReg_64RegClass, MRI);
1878 }
1879 
/// Utility function for pushing dynamic vector indexes with a constant offset
/// into waterfall loops.
1882 static void reinsertVectorIndexAdd(MachineIRBuilder &B,
1883                                    MachineInstr &IdxUseInstr,
1884                                    unsigned OpIdx,
1885                                    unsigned ConstOffset) {
1886   MachineRegisterInfo &MRI = *B.getMRI();
1887   const LLT S32 = LLT::scalar(32);
1888   Register WaterfallIdx = IdxUseInstr.getOperand(OpIdx).getReg();
1889   B.setInsertPt(*IdxUseInstr.getParent(), IdxUseInstr.getIterator());
1890 
1891   auto MaterializedOffset = B.buildConstant(S32, ConstOffset);
1892 
1893   auto Add = B.buildAdd(S32, WaterfallIdx, MaterializedOffset);
1894   MRI.setRegBank(MaterializedOffset.getReg(0), AMDGPU::SGPRRegBank);
1895   MRI.setRegBank(Add.getReg(0), AMDGPU::SGPRRegBank);
1896   IdxUseInstr.getOperand(OpIdx).setReg(Add.getReg(0));
1897 }
1898 
1899 /// Implement extending a 32-bit value to a 64-bit value. \p Lo32Reg is the
1900 /// original 32-bit source value (to be inserted in the low part of the combined
1901 /// 64-bit result), and \p Hi32Reg is the high half of the combined 64-bit
1902 /// value.
1903 static void extendLow32IntoHigh32(MachineIRBuilder &B,
1904                                   Register Hi32Reg, Register Lo32Reg,
1905                                   unsigned ExtOpc,
1906                                   const RegisterBank &RegBank,
1907                                   bool IsBooleanSrc = false) {
1908   if (ExtOpc == AMDGPU::G_ZEXT) {
1909     B.buildConstant(Hi32Reg, 0);
1910   } else if (ExtOpc == AMDGPU::G_SEXT) {
1911     if (IsBooleanSrc) {
1912       // If we know the original source was an s1, the high half is the same as
1913       // the low.
1914       B.buildCopy(Hi32Reg, Lo32Reg);
1915     } else {
1916       // Replicate sign bit from 32-bit extended part.
1917       auto ShiftAmt = B.buildConstant(LLT::scalar(32), 31);
1918       B.getMRI()->setRegBank(ShiftAmt.getReg(0), RegBank);
1919       B.buildAShr(Hi32Reg, Lo32Reg, ShiftAmt);
1920     }
1921   } else {
1922     assert(ExtOpc == AMDGPU::G_ANYEXT && "not an integer extension");
1923     B.buildUndef(Hi32Reg);
1924   }
1925 }
1926 
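/// Lower a dynamically indexed G_EXTRACT_VECTOR_ELT into a chain of compares
/// and selects over the vector elements when
/// SITargetLowering::shouldExpandVectorDynExt prefers that over a waterfall
/// loop. Returns true if the instruction was replaced.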
1927 bool AMDGPURegisterBankInfo::foldExtractEltToCmpSelect(
1928   MachineInstr &MI, MachineRegisterInfo &MRI,
1929   const OperandsMapper &OpdMapper) const {
1930 
1931   Register VecReg = MI.getOperand(1).getReg();
1932   Register Idx = MI.getOperand(2).getReg();
1933 
1934   const RegisterBank &IdxBank =
1935     *OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
1936 
1937   bool IsDivergentIdx = IdxBank != AMDGPU::SGPRRegBank;
1938 
1939   LLT VecTy = MRI.getType(VecReg);
1940   unsigned EltSize = VecTy.getScalarSizeInBits();
1941   unsigned NumElem = VecTy.getNumElements();
1942 
1943   if (!SITargetLowering::shouldExpandVectorDynExt(EltSize, NumElem,
1944                                                   IsDivergentIdx))
1945     return false;
1946 
1947   MachineIRBuilder B(MI);
1948   LLT S32 = LLT::scalar(32);
1949 
1950   const RegisterBank &DstBank =
1951     *OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
1952   const RegisterBank &SrcBank =
1953     *OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
1954 
1955   const RegisterBank &CCBank =
1956     (DstBank == AMDGPU::SGPRRegBank &&
1957      SrcBank == AMDGPU::SGPRRegBank &&
1958      IdxBank == AMDGPU::SGPRRegBank) ? AMDGPU::SGPRRegBank
1959                                      : AMDGPU::VCCRegBank;
1960   LLT CCTy = (CCBank == AMDGPU::SGPRRegBank) ? S32 : LLT::scalar(1);
1961 
1962   if (CCBank == AMDGPU::VCCRegBank && IdxBank == AMDGPU::SGPRRegBank) {
1963     Idx = B.buildCopy(S32, Idx)->getOperand(0).getReg();
1964     MRI.setRegBank(Idx, AMDGPU::VGPRRegBank);
1965   }
1966 
1967   LLT EltTy = VecTy.getScalarType();
1968   SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0));
1969   unsigned NumLanes = DstRegs.size();
1970   if (!NumLanes)
1971     NumLanes = 1;
1972   else
1973     EltTy = MRI.getType(DstRegs[0]);
1974 
1975   auto UnmergeToEltTy = B.buildUnmerge(EltTy, VecReg);
1976   SmallVector<Register, 2> Res(NumLanes);
1977   for (unsigned L = 0; L < NumLanes; ++L)
1978     Res[L] = UnmergeToEltTy.getReg(L);
1979 
1980   for (unsigned I = 1; I < NumElem; ++I) {
1981     auto IC = B.buildConstant(S32, I);
1982     MRI.setRegBank(IC->getOperand(0).getReg(), AMDGPU::SGPRRegBank);
1983     auto Cmp = B.buildICmp(CmpInst::ICMP_EQ, CCTy, Idx, IC);
1984     MRI.setRegBank(Cmp->getOperand(0).getReg(), CCBank);
1985 
1986     for (unsigned L = 0; L < NumLanes; ++L) {
1987       auto S = B.buildSelect(EltTy, Cmp,
1988                              UnmergeToEltTy.getReg(I * NumLanes + L), Res[L]);
1989 
1990       for (unsigned N : { 0, 2, 3 })
1991         MRI.setRegBank(S->getOperand(N).getReg(), DstBank);
1992 
1993       Res[L] = S->getOperand(0).getReg();
1994     }
1995   }
1996 
1997   for (unsigned L = 0; L < NumLanes; ++L) {
1998     Register DstReg = (NumLanes == 1) ? MI.getOperand(0).getReg() : DstRegs[L];
1999     B.buildCopy(DstReg, Res[L]);
2000     MRI.setRegBank(DstReg, DstBank);
2001   }
2002 
2003   MRI.setRegBank(MI.getOperand(0).getReg(), DstBank);
2004   MI.eraseFromParent();
2005 
2006   return true;
2007 }
2008 
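/// Lower a dynamically indexed G_INSERT_VECTOR_ELT similarly: each result
/// element selects between the inserted value and the original element based
/// on a compare with the index. Returns true if the instruction was replaced.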
2009 bool AMDGPURegisterBankInfo::foldInsertEltToCmpSelect(
2010   MachineInstr &MI, MachineRegisterInfo &MRI,
2011   const OperandsMapper &OpdMapper) const {
2012 
2013   Register VecReg = MI.getOperand(1).getReg();
2014   Register Idx = MI.getOperand(3).getReg();
2015 
2016   const RegisterBank &IdxBank =
2017     *OpdMapper.getInstrMapping().getOperandMapping(3).BreakDown[0].RegBank;
2018 
2019   bool IsDivergentIdx = IdxBank != AMDGPU::SGPRRegBank;
2020 
2021   LLT VecTy = MRI.getType(VecReg);
2022   unsigned EltSize = VecTy.getScalarSizeInBits();
2023   unsigned NumElem = VecTy.getNumElements();
2024 
2025   if (!SITargetLowering::shouldExpandVectorDynExt(EltSize, NumElem,
2026                                                   IsDivergentIdx))
2027     return false;
2028 
2029   MachineIRBuilder B(MI);
2030   LLT S32 = LLT::scalar(32);
2031 
2032   const RegisterBank &DstBank =
2033     *OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2034   const RegisterBank &SrcBank =
2035     *OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
2036   const RegisterBank &InsBank =
2037     *OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
2038 
2039   const RegisterBank &CCBank =
2040     (DstBank == AMDGPU::SGPRRegBank &&
2041      SrcBank == AMDGPU::SGPRRegBank &&
2042      InsBank == AMDGPU::SGPRRegBank &&
2043      IdxBank == AMDGPU::SGPRRegBank) ? AMDGPU::SGPRRegBank
2044                                      : AMDGPU::VCCRegBank;
2045   LLT CCTy = (CCBank == AMDGPU::SGPRRegBank) ? S32 : LLT::scalar(1);
2046 
2047   if (CCBank == AMDGPU::VCCRegBank && IdxBank == AMDGPU::SGPRRegBank) {
2048     Idx = B.buildCopy(S32, Idx)->getOperand(0).getReg();
2049     MRI.setRegBank(Idx, AMDGPU::VGPRRegBank);
2050   }
2051 
2052   LLT EltTy = VecTy.getScalarType();
2053   SmallVector<Register, 2> InsRegs(OpdMapper.getVRegs(2));
2054   unsigned NumLanes = InsRegs.size();
2055   if (!NumLanes) {
2056     NumLanes = 1;
2057     InsRegs.push_back(MI.getOperand(2).getReg());
2058   } else {
2059     EltTy = MRI.getType(InsRegs[0]);
2060   }
2061 
2062   auto UnmergeToEltTy = B.buildUnmerge(EltTy, VecReg);
2063   SmallVector<Register, 16> Ops(NumElem * NumLanes);
2064 
2065   for (unsigned I = 0; I < NumElem; ++I) {
2066     auto IC = B.buildConstant(S32, I);
2067     MRI.setRegBank(IC->getOperand(0).getReg(), AMDGPU::SGPRRegBank);
2068     auto Cmp = B.buildICmp(CmpInst::ICMP_EQ, CCTy, Idx, IC);
2069     MRI.setRegBank(Cmp->getOperand(0).getReg(), CCBank);
2070 
2071     for (unsigned L = 0; L < NumLanes; ++L) {
2072       auto S = B.buildSelect(EltTy, Cmp, InsRegs[L],
2073                              UnmergeToEltTy.getReg(I * NumLanes + L));
2074 
2075       for (unsigned N : { 0, 2, 3 })
2076         MRI.setRegBank(S->getOperand(N).getReg(), DstBank);
2077 
2078       Ops[I * NumLanes + L] = S->getOperand(0).getReg();
2079     }
2080   }
2081 
2082   LLT MergeTy = LLT::vector(Ops.size(), EltTy);
2083   if (MergeTy == MRI.getType(MI.getOperand(0).getReg())) {
2084     B.buildBuildVector(MI.getOperand(0), Ops);
2085   } else {
2086     auto Vec = B.buildBuildVector(MergeTy, Ops);
2087     MRI.setRegBank(Vec->getOperand(0).getReg(), DstBank);
2088     B.buildBitcast(MI.getOperand(0).getReg(), Vec);
2089   }
2090 
2091   MRI.setRegBank(MI.getOperand(0).getReg(), DstBank);
2092   MI.eraseFromParent();
2093 
2094   return true;
2095 }
2096 
2097 void AMDGPURegisterBankInfo::applyMappingImpl(
2098     const OperandsMapper &OpdMapper) const {
2099   MachineInstr &MI = OpdMapper.getMI();
2100   unsigned Opc = MI.getOpcode();
2101   MachineRegisterInfo &MRI = OpdMapper.getMRI();
2102   switch (Opc) {
2103   case AMDGPU::G_PHI: {
2104     Register DstReg = MI.getOperand(0).getReg();
2105     LLT DstTy = MRI.getType(DstReg);
2106     if (DstTy != LLT::scalar(1))
2107       break;
2108 
2109     const LLT S32 = LLT::scalar(32);
2110     const RegisterBank *DstBank =
2111       OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2112     if (DstBank == &AMDGPU::VCCRegBank) {
2113       applyDefaultMapping(OpdMapper);
2114       // The standard handling only considers the result register bank for
2115       // phis. For VCC, blindly inserting a copy when the phi is lowered will
2116       // produce an invalid copy. We can only copy with some kind of compare to
      // get a vector boolean result. Insert a register bank copy that will be
2118       // correctly lowered to a compare.
2119       MachineIRBuilder B(*MI.getParent()->getParent());
2120 
2121       for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
2122         Register SrcReg = MI.getOperand(I).getReg();
2123         const RegisterBank *SrcBank = getRegBank(SrcReg, MRI, *TRI);
2124 
2125         if (SrcBank != &AMDGPU::VCCRegBank) {
2126           MachineBasicBlock *SrcMBB = MI.getOperand(I + 1).getMBB();
2127           B.setInsertPt(*SrcMBB, SrcMBB->getFirstTerminator());
2128 
2129           auto Copy = B.buildCopy(LLT::scalar(1), SrcReg);
2130           MRI.setRegBank(Copy.getReg(0), AMDGPU::VCCRegBank);
2131           MI.getOperand(I).setReg(Copy.getReg(0));
2132         }
2133       }
2134 
2135       return;
2136     }
2137 
2138     // Phi handling is strange and only considers the bank of the destination.
2139     substituteSimpleCopyRegs(OpdMapper, 0);
2140 
2141     // Promote SGPR/VGPR booleans to s32
2142     MachineFunction *MF = MI.getParent()->getParent();
2143     ApplyRegBankMapping ApplyBank(*this, MRI, DstBank);
2144     MachineIRBuilder B(MI, ApplyBank);
2145     LegalizerHelper Helper(*MF, ApplyBank, B);
2146 
2147     if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized)
2148       llvm_unreachable("widen scalar should have succeeded");
2149 
2150     return;
2151   }
2152   case AMDGPU::G_ICMP:
2153   case AMDGPU::G_UADDO:
2154   case AMDGPU::G_USUBO:
2155   case AMDGPU::G_UADDE:
2156   case AMDGPU::G_SADDE:
2157   case AMDGPU::G_USUBE:
2158   case AMDGPU::G_SSUBE: {
2159     unsigned BoolDstOp = Opc == AMDGPU::G_ICMP ? 0 : 1;
2160     Register DstReg = MI.getOperand(BoolDstOp).getReg();
2161 
2162     const RegisterBank *DstBank =
2163       OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2164     if (DstBank != &AMDGPU::SGPRRegBank)
2165       break;
2166 
2167     const bool HasCarryIn = MI.getNumOperands() == 5;
2168 
2169     // If this is a scalar compare, promote the result to s32, as the selection
2170     // will end up using a copy to a 32-bit vreg.
2171     const LLT S32 = LLT::scalar(32);
2172     Register NewDstReg = MRI.createGenericVirtualRegister(S32);
2173     MRI.setRegBank(NewDstReg, AMDGPU::SGPRRegBank);
2174     MI.getOperand(BoolDstOp).setReg(NewDstReg);
2175     MachineIRBuilder B(MI);
2176 
2177     if (HasCarryIn) {
2178       Register NewSrcReg = MRI.createGenericVirtualRegister(S32);
2179       MRI.setRegBank(NewSrcReg, AMDGPU::SGPRRegBank);
2180       B.buildZExt(NewSrcReg, MI.getOperand(4).getReg());
2181       MI.getOperand(4).setReg(NewSrcReg);
2182     }
2183 
2184     MachineBasicBlock *MBB = MI.getParent();
2185     B.setInsertPt(*MBB, std::next(MI.getIterator()));
2186 
2187     // If we had a constrained VCC result register, a copy was inserted to VCC
2188     // from SGPR.
2189     SmallVector<Register, 1> DefRegs(OpdMapper.getVRegs(0));
2190     if (DefRegs.empty())
2191       DefRegs.push_back(DstReg);
2192     B.buildTrunc(DefRegs[0], NewDstReg);
2193     return;
2194   }
2195   case AMDGPU::G_SELECT: {
2196     Register DstReg = MI.getOperand(0).getReg();
2197     LLT DstTy = MRI.getType(DstReg);
2198 
2199     SmallVector<Register, 1> CondRegs(OpdMapper.getVRegs(1));
2200     if (CondRegs.empty())
2201       CondRegs.push_back(MI.getOperand(1).getReg());
2202     else {
2203       assert(CondRegs.size() == 1);
2204     }
2205 
2206     const RegisterBank *CondBank = getRegBank(CondRegs[0], MRI, *TRI);
2207     if (CondBank == &AMDGPU::SGPRRegBank) {
2208       MachineIRBuilder B(MI);
2209       const LLT S32 = LLT::scalar(32);
2210       Register NewCondReg = MRI.createGenericVirtualRegister(S32);
2211       MRI.setRegBank(NewCondReg, AMDGPU::SGPRRegBank);
2212 
2213       MI.getOperand(1).setReg(NewCondReg);
2214       B.buildZExt(NewCondReg, CondRegs[0]);
2215     }
2216 
2217     if (DstTy.getSizeInBits() != 64)
2218       break;
2219 
2220     MachineIRBuilder B(MI);
2221     LLT HalfTy = getHalfSizedType(DstTy);
2222 
2223     SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
2224     SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2));
2225     SmallVector<Register, 2> Src2Regs(OpdMapper.getVRegs(3));
2226 
2227     // All inputs are SGPRs, nothing special to do.
2228     if (DefRegs.empty()) {
2229       assert(Src1Regs.empty() && Src2Regs.empty());
2230       break;
2231     }
2232 
2233     if (Src1Regs.empty())
2234       split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg());
2235     else {
2236       setRegsToType(MRI, Src1Regs, HalfTy);
2237     }
2238 
2239     if (Src2Regs.empty())
2240       split64BitValueForMapping(B, Src2Regs, HalfTy, MI.getOperand(3).getReg());
2241     else
2242       setRegsToType(MRI, Src2Regs, HalfTy);
2243 
2244     setRegsToType(MRI, DefRegs, HalfTy);
2245 
2246     B.buildSelect(DefRegs[0], CondRegs[0], Src1Regs[0], Src2Regs[0]);
2247     B.buildSelect(DefRegs[1], CondRegs[0], Src1Regs[1], Src2Regs[1]);
2248 
2249     MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
2250     MI.eraseFromParent();
2251     return;
2252   }
2253   case AMDGPU::G_BRCOND: {
2254     Register CondReg = MI.getOperand(0).getReg();
2255     // FIXME: Should use legalizer helper, but should change bool ext type.
2256     const RegisterBank *CondBank =
2257       OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2258 
2259     if (CondBank == &AMDGPU::SGPRRegBank) {
2260       MachineIRBuilder B(MI);
2261       const LLT S32 = LLT::scalar(32);
2262       Register NewCondReg = MRI.createGenericVirtualRegister(S32);
2263       MRI.setRegBank(NewCondReg, AMDGPU::SGPRRegBank);
2264 
2265       MI.getOperand(0).setReg(NewCondReg);
2266       B.buildZExt(NewCondReg, CondReg);
2267       return;
2268     }
2269 
2270     break;
2271   }
2272   case AMDGPU::G_AND:
2273   case AMDGPU::G_OR:
2274   case AMDGPU::G_XOR: {
2275     // 64-bit and is only available on the SALU, so split into 2 32-bit ops if
2276     // there is a VGPR input.
2277     Register DstReg = MI.getOperand(0).getReg();
2278     LLT DstTy = MRI.getType(DstReg);
2279 
2280     if (DstTy.getSizeInBits() == 1) {
2281       const RegisterBank *DstBank =
2282         OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2283       if (DstBank == &AMDGPU::VCCRegBank)
2284         break;
2285 
2286       MachineFunction *MF = MI.getParent()->getParent();
2287       ApplyRegBankMapping ApplyBank(*this, MRI, DstBank);
2288       MachineIRBuilder B(MI, ApplyBank);
2289       LegalizerHelper Helper(*MF, ApplyBank, B);
2290 
2291       if (Helper.widenScalar(MI, 0, LLT::scalar(32)) !=
2292           LegalizerHelper::Legalized)
2293         llvm_unreachable("widen scalar should have succeeded");
2294       return;
2295     }
2296 
2297     if (DstTy.getSizeInBits() != 64)
2298       break;
2299 
2300     LLT HalfTy = getHalfSizedType(DstTy);
2301     SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
2302     SmallVector<Register, 2> Src0Regs(OpdMapper.getVRegs(1));
2303     SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2));
2304 
2305     // All inputs are SGPRs, nothing special to do.
2306     if (DefRegs.empty()) {
2307       assert(Src0Regs.empty() && Src1Regs.empty());
2308       break;
2309     }
2310 
2311     assert(DefRegs.size() == 2);
2312     assert(Src0Regs.size() == Src1Regs.size() &&
2313            (Src0Regs.empty() || Src0Regs.size() == 2));
2314 
2315     // Depending on where the source registers came from, the generic code may
2316     // have decided to split the inputs already or not. If not, we still need to
2317     // extract the values.
2318     MachineIRBuilder B(MI);
2319 
2320     if (Src0Regs.empty())
2321       split64BitValueForMapping(B, Src0Regs, HalfTy, MI.getOperand(1).getReg());
2322     else
2323       setRegsToType(MRI, Src0Regs, HalfTy);
2324 
2325     if (Src1Regs.empty())
2326       split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg());
2327     else
2328       setRegsToType(MRI, Src1Regs, HalfTy);
2329 
2330     setRegsToType(MRI, DefRegs, HalfTy);
2331 
2332     B.buildInstr(Opc, {DefRegs[0]}, {Src0Regs[0], Src1Regs[0]});
2333     B.buildInstr(Opc, {DefRegs[1]}, {Src0Regs[1], Src1Regs[1]});
2334 
2335     MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
2336     MI.eraseFromParent();
2337     return;
2338   }
2339   case AMDGPU::G_ADD:
2340   case AMDGPU::G_SUB:
2341   case AMDGPU::G_MUL:
2342   case AMDGPU::G_SHL:
2343   case AMDGPU::G_LSHR:
2344   case AMDGPU::G_ASHR: {
2345     Register DstReg = MI.getOperand(0).getReg();
2346     LLT DstTy = MRI.getType(DstReg);
2347 
2348     // 16-bit operations are VALU only, but can be promoted to 32-bit SALU.
2349     // Packed 16-bit operations need to be scalarized and promoted.
2350     if (DstTy != LLT::scalar(16) && DstTy != LLT::vector(2, 16))
2351       break;
2352 
2353     const RegisterBank *DstBank =
2354       OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2355     if (DstBank == &AMDGPU::VGPRRegBank)
2356       break;
2357 
2358     const LLT S32 = LLT::scalar(32);
2359     MachineBasicBlock *MBB = MI.getParent();
2360     MachineFunction *MF = MBB->getParent();
2361     ApplyRegBankMapping ApplySALU(*this, MRI, &AMDGPU::SGPRRegBank);
2362     MachineIRBuilder B(MI, ApplySALU);
2363 
2364     if (DstTy.isVector()) {
2365       Register WideSrc0Lo, WideSrc0Hi;
2366       Register WideSrc1Lo, WideSrc1Hi;
2367 
2368       std::tie(WideSrc0Lo, WideSrc0Hi)
2369         = unpackV2S16ToS32(B, MI.getOperand(1).getReg(), AMDGPU::G_ANYEXT);
2370       std::tie(WideSrc1Lo, WideSrc1Hi)
2371         = unpackV2S16ToS32(B, MI.getOperand(2).getReg(), AMDGPU::G_ANYEXT);
2372       auto Lo = B.buildInstr(MI.getOpcode(), {S32}, {WideSrc0Lo, WideSrc1Lo});
2373       auto Hi = B.buildInstr(MI.getOpcode(), {S32}, {WideSrc0Hi, WideSrc1Hi});
2374       B.buildBuildVectorTrunc(DstReg, {Lo.getReg(0), Hi.getReg(0)});
2375       MI.eraseFromParent();
2376     } else {
2377       LegalizerHelper Helper(*MF, ApplySALU, B);
2378 
2379       if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized)
2380         llvm_unreachable("widen scalar should have succeeded");
2381 
2382       // FIXME: s16 shift amounts should be legal.
2383       if (Opc == AMDGPU::G_SHL || Opc == AMDGPU::G_LSHR ||
2384           Opc == AMDGPU::G_ASHR) {
2385         B.setInsertPt(*MBB, MI.getIterator());
2386         if (Helper.widenScalar(MI, 1, S32) != LegalizerHelper::Legalized)
2387           llvm_unreachable("widen scalar should have succeeded");
2388       }
2389     }
2390 
2391     return;
2392   }
2393   case AMDGPU::G_SMIN:
2394   case AMDGPU::G_SMAX:
2395   case AMDGPU::G_UMIN:
2396   case AMDGPU::G_UMAX: {
2397     Register DstReg = MI.getOperand(0).getReg();
2398     const RegisterBank *DstBank =
2399       OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2400     if (DstBank == &AMDGPU::VGPRRegBank)
2401       break;
2402 
2403     MachineFunction *MF = MI.getParent()->getParent();
2404     MachineIRBuilder B(MI);
2405 
2406     // Turn scalar min/max into a compare and select.
2407     LLT Ty = MRI.getType(DstReg);
2408     const LLT S32 = LLT::scalar(32);
2409     const LLT S16 = LLT::scalar(16);
2410     const LLT V2S16 = LLT::vector(2, 16);
2411 
2412     if (Ty == V2S16) {
2413       ApplyRegBankMapping ApplySALU(*this, MRI, &AMDGPU::SGPRRegBank);
2414       B.setChangeObserver(ApplySALU);
2415 
      // Need to widen to s32 and expand as cmp + select, while avoiding
      // illegal vector extends or unmerges that would need further
      // legalization.
2419       //
2420       // TODO: Should we just readfirstlane? That should probably be handled
2421       // with a UniformVGPR register bank that wouldn't need special
2422       // consideration here.
2423 
2424       Register Dst = MI.getOperand(0).getReg();
2425       Register Src0 = MI.getOperand(1).getReg();
2426       Register Src1 = MI.getOperand(2).getReg();
2427 
2428       Register WideSrc0Lo, WideSrc0Hi;
2429       Register WideSrc1Lo, WideSrc1Hi;
2430 
2431       unsigned ExtendOp = minMaxToExtend(MI.getOpcode());
2432 
2433       std::tie(WideSrc0Lo, WideSrc0Hi) = unpackV2S16ToS32(B, Src0, ExtendOp);
2434       std::tie(WideSrc1Lo, WideSrc1Hi) = unpackV2S16ToS32(B, Src1, ExtendOp);
2435 
2436       Register Lo = MRI.createGenericVirtualRegister(S32);
2437       Register Hi = MRI.createGenericVirtualRegister(S32);
2438       const CmpInst::Predicate Pred = minMaxToCompare(MI.getOpcode());
2439       buildExpandedScalarMinMax(B, Pred, Lo, WideSrc0Lo, WideSrc1Lo);
2440       buildExpandedScalarMinMax(B, Pred, Hi, WideSrc0Hi, WideSrc1Hi);
2441 
2442       B.buildBuildVectorTrunc(Dst, {Lo, Hi});
2443       MI.eraseFromParent();
2444     } else if (Ty == S16) {
2445       ApplyRegBankMapping ApplySALU(*this, MRI, &AMDGPU::SGPRRegBank);
2446       B.setChangeObserver(ApplySALU);
2447       LegalizerHelper Helper(*MF, ApplySALU, B);
2448 
2449       // Need to widen to s32, and expand as cmp + select.
2450       if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized)
2451         llvm_unreachable("widenScalar should have succeeded");
2452 
2453       // FIXME: This is relying on widenScalar leaving MI in place.
2454       lowerScalarMinMax(B, MI);
2455     } else
2456       lowerScalarMinMax(B, MI);
2457 
2458     return;
2459   }
2460   case AMDGPU::G_SEXT_INREG: {
2461     SmallVector<Register, 2> SrcRegs(OpdMapper.getVRegs(1));
2462     if (SrcRegs.empty())
2463       break; // Nothing to repair
2464 
2465     const LLT S32 = LLT::scalar(32);
2466     MachineIRBuilder B(MI);
2467     ApplyRegBankMapping O(*this, MRI, &AMDGPU::VGPRRegBank);
2468     GISelObserverWrapper Observer(&O);
2469     B.setChangeObserver(Observer);
2470 
2471     // Don't use LegalizerHelper's narrowScalar. It produces unwanted G_SEXTs
2472     // we would need to further expand, and doesn't let us directly set the
2473     // result registers.
2474     SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0));
2475 
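    // For example, a 64-bit G_SEXT_INREG with Amt = 8 becomes
    //   lo = sext_inreg(src_lo, 8); hi = ashr(lo, 31)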
2476     int Amt = MI.getOperand(2).getImm();
2477     if (Amt <= 32) {
2478       if (Amt == 32) {
2479         // The low bits are unchanged.
2480         B.buildCopy(DstRegs[0], SrcRegs[0]);
2481       } else {
2482         // Extend in the low bits and propagate the sign bit to the high half.
2483         B.buildSExtInReg(DstRegs[0], SrcRegs[0], Amt);
2484       }
2485 
2486       B.buildAShr(DstRegs[1], DstRegs[0], B.buildConstant(S32, 31));
    } else {
      // The low bits are unchanged; sign-extend within the original high half.
      B.buildCopy(DstRegs[0], SrcRegs[0]);
      B.buildSExtInReg(DstRegs[1], SrcRegs[1], Amt - 32);
    }
2492 
2493     Register DstReg = MI.getOperand(0).getReg();
2494     MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
2495     MI.eraseFromParent();
2496     return;
2497   }
2498   case AMDGPU::G_CTPOP:
2499   case AMDGPU::G_CTLZ_ZERO_UNDEF:
2500   case AMDGPU::G_CTTZ_ZERO_UNDEF: {
2501     const RegisterBank *DstBank =
2502       OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2503     if (DstBank == &AMDGPU::SGPRRegBank)
2504       break;
2505 
2506     Register SrcReg = MI.getOperand(1).getReg();
2507     const LLT S32 = LLT::scalar(32);
2508     LLT Ty = MRI.getType(SrcReg);
2509     if (Ty == S32)
2510       break;
2511 
2512     ApplyRegBankMapping ApplyVALU(*this, MRI, &AMDGPU::VGPRRegBank);
2513     MachineIRBuilder B(MI, ApplyVALU);
2514 
2515     MachineFunction &MF = B.getMF();
2516     LegalizerHelper Helper(MF, ApplyVALU, B);
2517 
2518     if (Helper.narrowScalar(MI, 1, S32) != LegalizerHelper::Legalized)
2519       llvm_unreachable("narrowScalar should have succeeded");
2520     return;
2521   }
2522   case AMDGPU::G_SEXT:
2523   case AMDGPU::G_ZEXT:
2524   case AMDGPU::G_ANYEXT: {
2525     Register SrcReg = MI.getOperand(1).getReg();
2526     LLT SrcTy = MRI.getType(SrcReg);
2527     const bool Signed = Opc == AMDGPU::G_SEXT;
2528 
2529     assert(empty(OpdMapper.getVRegs(1)));
2530 
2531     MachineIRBuilder B(MI);
2532     const RegisterBank *SrcBank =
2533       OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
2534 
2535     Register DstReg = MI.getOperand(0).getReg();
2536     LLT DstTy = MRI.getType(DstReg);
2537     if (DstTy.isScalar() &&
2538         SrcBank != &AMDGPU::SGPRRegBank &&
2539         SrcBank != &AMDGPU::VCCRegBank &&
        // FIXME: Should handle any type that rounds to s64 when irregular
        // breakdowns are supported.
2542         DstTy.getSizeInBits() == 64 &&
2543         SrcTy.getSizeInBits() <= 32) {
2544       SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
2545 
2546       // Extend to 32-bit, and then extend the low half.
2547       if (Signed) {
2548         // TODO: Should really be buildSExtOrCopy
2549         B.buildSExtOrTrunc(DefRegs[0], SrcReg);
2550       } else if (Opc == AMDGPU::G_ZEXT) {
2551         B.buildZExtOrTrunc(DefRegs[0], SrcReg);
2552       } else {
2553         B.buildAnyExtOrTrunc(DefRegs[0], SrcReg);
2554       }
2555 
2556       extendLow32IntoHigh32(B, DefRegs[1], DefRegs[0], Opc, *SrcBank);
2557       MRI.setRegBank(DstReg, *SrcBank);
2558       MI.eraseFromParent();
2559       return;
2560     }
2561 
2562     if (SrcTy != LLT::scalar(1))
2563       return;
2564 
2565     // It is not legal to have a legalization artifact with a VCC source. Rather
2566     // than introducing a copy, insert the select we would have to select the
2567     // copy to.
2568     if (SrcBank == &AMDGPU::VCCRegBank) {
2569       SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
2570 
2571       const RegisterBank *DstBank = &AMDGPU::VGPRRegBank;
2572 
2573       unsigned DstSize = DstTy.getSizeInBits();
2574       // 64-bit select is SGPR only
2575       const bool UseSel64 = DstSize > 32 &&
2576         SrcBank->getID() == AMDGPU::SGPRRegBankID;
2577 
2578       // TODO: Should s16 select be legal?
2579       LLT SelType = UseSel64 ? LLT::scalar(64) : LLT::scalar(32);
2580       auto True = B.buildConstant(SelType, Signed ? -1 : 1);
2581       auto False = B.buildConstant(SelType, 0);
2582 
2583       MRI.setRegBank(True.getReg(0), *DstBank);
2584       MRI.setRegBank(False.getReg(0), *DstBank);
2585       MRI.setRegBank(DstReg, *DstBank);
2586 
2587       if (DstSize > 32) {
2588         B.buildSelect(DefRegs[0], SrcReg, True, False);
2589         extendLow32IntoHigh32(B, DefRegs[1], DefRegs[0], Opc, *SrcBank, true);
2590       } else if (DstSize < 32) {
2591         auto Sel = B.buildSelect(SelType, SrcReg, True, False);
2592         MRI.setRegBank(Sel.getReg(0), *DstBank);
2593         B.buildTrunc(DstReg, Sel);
2594       } else {
2595         B.buildSelect(DstReg, SrcReg, True, False);
2596       }
2597 
2598       MI.eraseFromParent();
2599       return;
2600     }
2601 
2602     break;
2603   }
2604   case AMDGPU::G_BUILD_VECTOR:
2605   case AMDGPU::G_BUILD_VECTOR_TRUNC: {
2606     Register DstReg = MI.getOperand(0).getReg();
2607     LLT DstTy = MRI.getType(DstReg);
2608     if (DstTy != LLT::vector(2, 16))
2609       break;
2610 
2611     assert(MI.getNumOperands() == 3 && OpdMapper.getVRegs(0).empty());
2612     substituteSimpleCopyRegs(OpdMapper, 1);
2613     substituteSimpleCopyRegs(OpdMapper, 2);
2614 
2615     const RegisterBank *DstBank =
2616       OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2617     if (DstBank == &AMDGPU::SGPRRegBank)
2618       break; // Can use S_PACK_* instructions.
2619 
2620     MachineIRBuilder B(MI);
2621 
2622     Register Lo = MI.getOperand(1).getReg();
2623     Register Hi = MI.getOperand(2).getReg();
2624     const LLT S32 = LLT::scalar(32);
2625 
2626     const RegisterBank *BankLo =
2627       OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
2628     const RegisterBank *BankHi =
2629       OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
2630 
2631     Register ZextLo;
2632     Register ShiftHi;
2633 
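    // Pack the result as lo | (hi << 16). G_BUILD_VECTOR zero-extends its
    // 16-bit sources; G_BUILD_VECTOR_TRUNC masks the low half and shifts the
    // high source in place.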
2634     if (Opc == AMDGPU::G_BUILD_VECTOR) {
2635       ZextLo = B.buildZExt(S32, Lo).getReg(0);
2636       MRI.setRegBank(ZextLo, *BankLo);
2637 
2638       Register ZextHi = B.buildZExt(S32, Hi).getReg(0);
2639       MRI.setRegBank(ZextHi, *BankHi);
2640 
2641       auto ShiftAmt = B.buildConstant(S32, 16);
2642       MRI.setRegBank(ShiftAmt.getReg(0), *BankHi);
2643 
2644       ShiftHi = B.buildShl(S32, ZextHi, ShiftAmt).getReg(0);
2645       MRI.setRegBank(ShiftHi, *BankHi);
2646     } else {
2647       Register MaskLo = B.buildConstant(S32, 0xffff).getReg(0);
2648       MRI.setRegBank(MaskLo, *BankLo);
2649 
2650       auto ShiftAmt = B.buildConstant(S32, 16);
2651       MRI.setRegBank(ShiftAmt.getReg(0), *BankHi);
2652 
2653       ShiftHi = B.buildShl(S32, Hi, ShiftAmt).getReg(0);
2654       MRI.setRegBank(ShiftHi, *BankHi);
2655 
2656       ZextLo = B.buildAnd(S32, Lo, MaskLo).getReg(0);
2657       MRI.setRegBank(ZextLo, *BankLo);
2658     }
2659 
2660     auto Or = B.buildOr(S32, ZextLo, ShiftHi);
2661     MRI.setRegBank(Or.getReg(0), *DstBank);
2662 
2663     B.buildBitcast(DstReg, Or);
2664     MI.eraseFromParent();
2665     return;
2666   }
2667   case AMDGPU::G_EXTRACT_VECTOR_ELT: {
2668     SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0));
2669 
2670     assert(OpdMapper.getVRegs(1).empty() && OpdMapper.getVRegs(2).empty());
2671 
2672     Register DstReg = MI.getOperand(0).getReg();
2673     Register SrcReg = MI.getOperand(1).getReg();
2674 
2675     const LLT S32 = LLT::scalar(32);
2676     LLT DstTy = MRI.getType(DstReg);
2677     LLT SrcTy = MRI.getType(SrcReg);
2678 
2679     if (foldExtractEltToCmpSelect(MI, MRI, OpdMapper))
2680       return;
2681 
2682     MachineIRBuilder B(MI);
2683 
2684     const ValueMapping &DstMapping
2685       = OpdMapper.getInstrMapping().getOperandMapping(0);
2686     const RegisterBank *DstBank = DstMapping.BreakDown[0].RegBank;
2687     const RegisterBank *SrcBank =
2688       OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
2689     const RegisterBank *IdxBank =
2690         OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
2691 
2692     Register BaseIdxReg;
2693     unsigned ConstOffset;
2694     std::tie(BaseIdxReg, ConstOffset) =
2695         AMDGPU::getBaseWithConstantOffset(MRI, MI.getOperand(2).getReg());
2696 
2697     // See if the index is an add of a constant which will be foldable by moving
2698     // the base register of the index later if this is going to be executed in a
2699     // waterfall loop. This is essentially to reassociate the add of a constant
2700     // with the readfirstlane.
2701     bool ShouldMoveIndexIntoLoop = IdxBank != &AMDGPU::SGPRRegBank &&
2702                                    ConstOffset > 0 &&
2703                                    ConstOffset < SrcTy.getNumElements();
2704 
2705     // Move the base register. We'll re-insert the add later.
2706     if (ShouldMoveIndexIntoLoop)
2707       MI.getOperand(2).setReg(BaseIdxReg);
2708 
2709     // If this is a VGPR result only because the index was a VGPR result, the
2710     // actual indexing will be done on the SGPR source vector, which will
2711     // produce a scalar result. We need to copy to the VGPR result inside the
2712     // waterfall loop.
2713     const bool NeedCopyToVGPR = DstBank == &AMDGPU::VGPRRegBank &&
2714                                 SrcBank == &AMDGPU::SGPRRegBank;
2715     if (DstRegs.empty()) {
2716       applyDefaultMapping(OpdMapper);
2717 
2718       executeInWaterfallLoop(MI, MRI, { 2 });
2719 
2720       if (NeedCopyToVGPR) {
2721         // We don't want a phi for this temporary reg.
2722         Register TmpReg = MRI.createGenericVirtualRegister(DstTy);
2723         MRI.setRegBank(TmpReg, AMDGPU::SGPRRegBank);
2724         MI.getOperand(0).setReg(TmpReg);
2725         B.setInsertPt(*MI.getParent(), ++MI.getIterator());
2726 
2727         // Use a v_mov_b32 here to make the exec dependency explicit.
2728         buildVCopy(B, DstReg, TmpReg);
2729       }
2730 
2731       // Re-insert the constant offset add inside the waterfall loop.
2732       if (ShouldMoveIndexIntoLoop)
2733         reinsertVectorIndexAdd(B, MI, 2, ConstOffset);
2734 
2735       return;
2736     }
2737 
2738     assert(DstTy.getSizeInBits() == 64);
2739 
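    // An s64 element is extracted as two s32 extracts from the bitcast
    // <2*N x s32> vector: element I becomes elements 2*I and 2*I+1.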
2740     LLT Vec32 = LLT::vector(2 * SrcTy.getNumElements(), 32);
2741 
2742     auto CastSrc = B.buildBitcast(Vec32, SrcReg);
2743     auto One = B.buildConstant(S32, 1);
2744 
2745     MachineBasicBlock::iterator MII = MI.getIterator();
2746 
2747     // Split the vector index into 32-bit pieces. Prepare to move all of the
2748     // new instructions into a waterfall loop if necessary.
2749     //
2750     // Don't put the bitcast or constant in the loop.
2751     MachineInstrSpan Span(MII, &B.getMBB());
2752 
2753     // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1).
2754     auto IdxLo = B.buildShl(S32, BaseIdxReg, One);
2755     auto IdxHi = B.buildAdd(S32, IdxLo, One);
2756 
2757     auto Extract0 = B.buildExtractVectorElement(DstRegs[0], CastSrc, IdxLo);
2758     auto Extract1 = B.buildExtractVectorElement(DstRegs[1], CastSrc, IdxHi);
2759 
2760     MRI.setRegBank(DstReg, *DstBank);
2761     MRI.setRegBank(CastSrc.getReg(0), *SrcBank);
2762     MRI.setRegBank(One.getReg(0), AMDGPU::SGPRRegBank);
2763     MRI.setRegBank(IdxLo.getReg(0), AMDGPU::SGPRRegBank);
2764     MRI.setRegBank(IdxHi.getReg(0), AMDGPU::SGPRRegBank);
2765 
2766     SmallSet<Register, 4> OpsToWaterfall;
2767     if (!collectWaterfallOperands(OpsToWaterfall, MI, MRI, { 2 })) {
2768       MI.eraseFromParent();
2769       return;
2770     }
2771 
2772     // Remove the original instruction to avoid potentially confusing the
2773     // waterfall loop logic.
2774     B.setInstr(*Span.begin());
2775     MI.eraseFromParent();
2776     executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()),
2777                            OpsToWaterfall, MRI);
2778 
2779     if (NeedCopyToVGPR) {
2780       MachineBasicBlock *LoopBB = Extract1->getParent();
2781       Register TmpReg0 = MRI.createGenericVirtualRegister(S32);
2782       Register TmpReg1 = MRI.createGenericVirtualRegister(S32);
2783       MRI.setRegBank(TmpReg0, AMDGPU::SGPRRegBank);
2784       MRI.setRegBank(TmpReg1, AMDGPU::SGPRRegBank);
2785 
2786       Extract0->getOperand(0).setReg(TmpReg0);
2787       Extract1->getOperand(0).setReg(TmpReg1);
2788 
2789       B.setInsertPt(*LoopBB, ++Extract1->getIterator());
2790 
2791       buildVCopy(B, DstRegs[0], TmpReg0);
2792       buildVCopy(B, DstRegs[1], TmpReg1);
2793     }
2794 
2795     if (ShouldMoveIndexIntoLoop)
2796       reinsertVectorIndexAdd(B, *IdxLo, 1, ConstOffset);
2797 
2798     return;
2799   }
2800   case AMDGPU::G_INSERT_VECTOR_ELT: {
2801     SmallVector<Register, 2> InsRegs(OpdMapper.getVRegs(2));
2802 
2803     Register DstReg = MI.getOperand(0).getReg();
2804     LLT VecTy = MRI.getType(DstReg);
2805 
2806     assert(OpdMapper.getVRegs(0).empty());
2807     assert(OpdMapper.getVRegs(3).empty());
2808 
2809     if (substituteSimpleCopyRegs(OpdMapper, 1))
2810       MRI.setType(MI.getOperand(1).getReg(), VecTy);
2811 
2812     if (foldInsertEltToCmpSelect(MI, MRI, OpdMapper))
2813       return;
2814 
2815     const RegisterBank *IdxBank =
2816       OpdMapper.getInstrMapping().getOperandMapping(3).BreakDown[0].RegBank;
2817 
2818     Register SrcReg = MI.getOperand(1).getReg();
2819     Register InsReg = MI.getOperand(2).getReg();
2820     LLT InsTy = MRI.getType(InsReg);
2821     (void)InsTy;
2822 
2823     Register BaseIdxReg;
2824     unsigned ConstOffset;
2825     std::tie(BaseIdxReg, ConstOffset) =
2826         AMDGPU::getBaseWithConstantOffset(MRI, MI.getOperand(3).getReg());
2827 
2828     // See if the index is an add of a constant which will be foldable by moving
2829     // the base register of the index later if this is going to be executed in a
2830     // waterfall loop. This is essentially to reassociate the add of a constant
2831     // with the readfirstlane.
2832     bool ShouldMoveIndexIntoLoop = IdxBank != &AMDGPU::SGPRRegBank &&
2833       ConstOffset > 0 &&
2834       ConstOffset < VecTy.getNumElements();
2835 
2836     // Move the base register. We'll re-insert the add later.
2837     if (ShouldMoveIndexIntoLoop)
2838       MI.getOperand(3).setReg(BaseIdxReg);
2839 
2840 
2841     if (InsRegs.empty()) {
2842       executeInWaterfallLoop(MI, MRI, { 3 });
2843 
2844       // Re-insert the constant offset add inside the waterfall loop.
2845       if (ShouldMoveIndexIntoLoop) {
2846         MachineIRBuilder B(MI);
2847         reinsertVectorIndexAdd(B, MI, 3, ConstOffset);
2848       }
2849 
2850       return;
2851     }
2852 
2853 
2854     assert(InsTy.getSizeInBits() == 64);
2855 
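    // Illustrative sketch of the expansion built below: the vector is bitcast
    // to <2N x s32>, the two 32-bit halves of the value are inserted at
    // indices 2 * Idx and 2 * Idx + 1, and the result is bitcast back:
    //   %vec32:_(<2N x s32>) = G_BITCAST %vec
    //   %lo:_(<2N x s32>) = G_INSERT_VECTOR_ELT %vec32, %val_lo, %idxlo
    //   %hi:_(<2N x s32>) = G_INSERT_VECTOR_ELT %lo, %val_hi, %idxhi
    //   %dst = G_BITCAST %hi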
2856     const LLT S32 = LLT::scalar(32);
2857     LLT Vec32 = LLT::vector(2 * VecTy.getNumElements(), 32);
2858 
2859     MachineIRBuilder B(MI);
2860     auto CastSrc = B.buildBitcast(Vec32, SrcReg);
2861     auto One = B.buildConstant(S32, 1);
2862 
2863     // Split the vector index into 32-bit pieces. Prepare to move all of the
2864     // new instructions into a waterfall loop if necessary.
2865     //
2866     // Don't put the bitcast or constant in the loop.
2867     MachineInstrSpan Span(MachineBasicBlock::iterator(&MI), &B.getMBB());
2868 
2869     // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1).
2870     auto IdxLo = B.buildShl(S32, BaseIdxReg, One);
2871     auto IdxHi = B.buildAdd(S32, IdxLo, One);
2872 
2873     auto InsLo = B.buildInsertVectorElement(Vec32, CastSrc, InsRegs[0], IdxLo);
2874     auto InsHi = B.buildInsertVectorElement(Vec32, InsLo, InsRegs[1], IdxHi);
2875 
2876     const RegisterBank *DstBank =
2877       OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2878     const RegisterBank *SrcBank =
2879       OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
2880     const RegisterBank *InsSrcBank =
2881       OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
2882 
2883     MRI.setRegBank(InsReg, *InsSrcBank);
2884     MRI.setRegBank(CastSrc.getReg(0), *SrcBank);
2885     MRI.setRegBank(InsLo.getReg(0), *DstBank);
2886     MRI.setRegBank(InsHi.getReg(0), *DstBank);
2887     MRI.setRegBank(One.getReg(0), AMDGPU::SGPRRegBank);
2888     MRI.setRegBank(IdxLo.getReg(0), AMDGPU::SGPRRegBank);
2889     MRI.setRegBank(IdxHi.getReg(0), AMDGPU::SGPRRegBank);
2890 
2891 
2892     SmallSet<Register, 4> OpsToWaterfall;
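    // If the index is already uniform there is no loop to build; just cast the
    // updated 32-bit vector back to the original type and erase the original
    // instruction.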
2893     if (!collectWaterfallOperands(OpsToWaterfall, MI, MRI, { 3 })) {
2894       B.setInsertPt(B.getMBB(), MI);
2895       B.buildBitcast(DstReg, InsHi);
2896       MI.eraseFromParent();
2897       return;
2898     }
2899 
2900     B.setInstr(*Span.begin());
2901     MI.eraseFromParent();
2902 
2903     // Figure out the point after the waterfall loop before mangling the control
2904     // flow.
2905     executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()),
2906                            OpsToWaterfall, MRI);
2907 
2908     // The insertion point is now right after the waterfall loop, where the
2909     // original instruction used to be. Keep the bitcast to the original
2910     // vector type out of the loop; doing this saves an extra phi we don't
2911     // need inside the loop.
2912     B.buildBitcast(DstReg, InsHi);
2913 
2914     // Re-insert the constant offset add inside the waterfall loop.
2915     if (ShouldMoveIndexIntoLoop)
2916       reinsertVectorIndexAdd(B, *IdxLo, 1, ConstOffset);
2917 
2918     return;
2919   }
2920   case AMDGPU::G_AMDGPU_BUFFER_LOAD:
2921   case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
2922   case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT:
2923   case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
2924   case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE:
2925   case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT:
2926   case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16:
2927   case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT:
2928   case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16:
2929   case AMDGPU::G_AMDGPU_BUFFER_STORE:
2930   case AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE:
2931   case AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT:
2932   case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT:
2933   case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16:
2934   case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT:
2935   case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16: {
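    // The resource descriptor (operand 1) and scalar offset (operand 4) must
    // end up uniform; waterfall over them if they were assigned VGPRs.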
2936     applyDefaultMapping(OpdMapper);
2937     executeInWaterfallLoop(MI, MRI, {1, 4});
2938     return;
2939   }
2940   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP:
2941   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD:
2942   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB:
2943   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN:
2944   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN:
2945   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX:
2946   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX:
2947   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND:
2948   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR:
2949   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR:
2950   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC:
2951   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC: {
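    // The extra vdata input shifts the operand numbering: the resource
    // descriptor is operand 2 and the scalar offset is operand 5 here.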
2952     applyDefaultMapping(OpdMapper);
2953     executeInWaterfallLoop(MI, MRI, {2, 5});
2954     return;
2955   }
2956   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD: {
2957     applyDefaultMapping(OpdMapper);
2958     executeInWaterfallLoop(MI, MRI, {2, 5});
2959     return;
2960   }
2961   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP: {
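    // The additional compare value shifts the numbering again: the resource
    // descriptor is operand 3 and the scalar offset is operand 6.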
2962     applyDefaultMapping(OpdMapper);
2963     executeInWaterfallLoop(MI, MRI, {3, 6});
2964     return;
2965   }
2966   case AMDGPU::G_AMDGPU_S_BUFFER_LOAD: {
2967     applyMappingSBufferLoad(OpdMapper);
2968     return;
2969   }
2970   case AMDGPU::G_INTRINSIC: {
2971     switch (MI.getIntrinsicID()) {
2972     case Intrinsic::amdgcn_readlane: {
2973       substituteSimpleCopyRegs(OpdMapper, 2);
2974 
2975       assert(OpdMapper.getVRegs(0).empty());
2976       assert(OpdMapper.getVRegs(3).empty());
2977 
2978       // Make sure the index is an SGPR. It doesn't make sense to run this in a
2979       // waterfall loop, so assume it's a uniform value.
2980       constrainOpWithReadfirstlane(MI, MRI, 3); // Index
2981       return;
2982     }
2983     case Intrinsic::amdgcn_writelane: {
2984       assert(OpdMapper.getVRegs(0).empty());
2985       assert(OpdMapper.getVRegs(2).empty());
2986       assert(OpdMapper.getVRegs(3).empty());
2987 
2988       substituteSimpleCopyRegs(OpdMapper, 4); // VGPR input val
2989       constrainOpWithReadfirstlane(MI, MRI, 2); // Source value
2990       constrainOpWithReadfirstlane(MI, MRI, 3); // Index
2991       return;
2992     }
2993     case Intrinsic::amdgcn_interp_p1:
2994     case Intrinsic::amdgcn_interp_p2:
2995     case Intrinsic::amdgcn_interp_mov:
2996     case Intrinsic::amdgcn_interp_p1_f16:
2997     case Intrinsic::amdgcn_interp_p2_f16: {
2998       applyDefaultMapping(OpdMapper);
2999 
3000       // Readlane for m0 value, which is always the last operand.
3001       // FIXME: Should this be a waterfall loop instead?
3002       constrainOpWithReadfirstlane(MI, MRI, MI.getNumOperands() - 1); // Index
3003       return;
3004     }
3005     case Intrinsic::amdgcn_permlane16:
3006     case Intrinsic::amdgcn_permlanex16: {
3007       // Doing a waterfall loop over these wouldn't make any sense.
3008       substituteSimpleCopyRegs(OpdMapper, 2);
3009       substituteSimpleCopyRegs(OpdMapper, 3);
3010       constrainOpWithReadfirstlane(MI, MRI, 4);
3011       constrainOpWithReadfirstlane(MI, MRI, 5);
3012       return;
3013     }
3014     case Intrinsic::amdgcn_sbfe:
3015       applyMappingBFEIntrinsic(OpdMapper, true);
3016       return;
3017     case Intrinsic::amdgcn_ubfe:
3018       applyMappingBFEIntrinsic(OpdMapper, false);
3019       return;
3020     case Intrinsic::amdgcn_ballot:
3021       // Use default handling and insert copy to vcc source.
3022       break;
3023     }
3024     break;
3025   }
3026   case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
3027   case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE: {
3028     const AMDGPU::RsrcIntrinsic *RSrcIntrin
3029       = AMDGPU::lookupRsrcIntrinsic(MI.getIntrinsicID());
3030     assert(RSrcIntrin && RSrcIntrin->IsImage);
3031     // Only images are handled here. Other rsrc intrinsics can have operands
3032     // that allow both SGPR and VGPR, and it's too complicated to figure out
3033     // the final opcode to derive the register bank from the MCInstrDesc.
3034     applyMappingImage(MI, OpdMapper, MRI, RSrcIntrin->RsrcArg);
3035     return;
3036   }
3037   case AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY: {
3038     unsigned N = MI.getNumExplicitOperands() - 2;
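    // Operand N must end up uniform; run the instruction through a waterfall
    // loop over it if it was assigned a VGPR.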
3039     executeInWaterfallLoop(MI, MRI, { N });
3040     return;
3041   }
3042   case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: {
3043     auto IntrID = MI.getIntrinsicID();
3044     switch (IntrID) {
3045     case Intrinsic::amdgcn_ds_ordered_add:
3046     case Intrinsic::amdgcn_ds_ordered_swap: {
3047       // This is only allowed to execute with 1 lane, so readfirstlane is safe.
3048       assert(OpdMapper.getVRegs(0).empty());
3049       substituteSimpleCopyRegs(OpdMapper, 3);
3050       constrainOpWithReadfirstlane(MI, MRI, 2); // M0
3051       return;
3052     }
3053     case Intrinsic::amdgcn_ds_gws_init:
3054     case Intrinsic::amdgcn_ds_gws_barrier:
3055     case Intrinsic::amdgcn_ds_gws_sema_br: {
3056       // Only the first lane executes, so readfirstlane is safe.
3057       substituteSimpleCopyRegs(OpdMapper, 1);
3058       constrainOpWithReadfirstlane(MI, MRI, 2); // M0
3059       return;
3060     }
3061     case Intrinsic::amdgcn_ds_gws_sema_v:
3062     case Intrinsic::amdgcn_ds_gws_sema_p:
3063     case Intrinsic::amdgcn_ds_gws_sema_release_all: {
3064       // Only the first lane executes, so readfirstlane is safe.
3065       constrainOpWithReadfirstlane(MI, MRI, 1); // M0
3066       return;
3067     }
3068     case Intrinsic::amdgcn_ds_append:
3069     case Intrinsic::amdgcn_ds_consume: {
3070       constrainOpWithReadfirstlane(MI, MRI, 2); // M0
3071       return;
3072     }
3073     case Intrinsic::amdgcn_s_sendmsg:
3074     case Intrinsic::amdgcn_s_sendmsghalt: {
3075       // FIXME: Should this use a waterfall loop?
3076       constrainOpWithReadfirstlane(MI, MRI, 2); // M0
3077       return;
3078     }
3079     case Intrinsic::amdgcn_s_setreg: {
3080       constrainOpWithReadfirstlane(MI, MRI, 2);
3081       return;
3082     }
3083     default: {
3084       if (const AMDGPU::RsrcIntrinsic *RSrcIntrin =
3085               AMDGPU::lookupRsrcIntrinsic(IntrID)) {
3086         // Non-images can have complications from operands that allow both SGPR
3087         // and VGPR. For now it's too complicated to figure out the final opcode
3088         // to derive the register bank from the MCInstrDesc.
3089         if (RSrcIntrin->IsImage) {
3090           applyMappingImage(MI, OpdMapper, MRI, RSrcIntrin->RsrcArg);
3091           return;
3092         }
3093       }
3094 
3095       break;
3096     }
3097     }
3098     break;
3099   }
3100   case AMDGPU::G_LOAD:
3101   case AMDGPU::G_ZEXTLOAD:
3102   case AMDGPU::G_SEXTLOAD: {
3103     if (applyMappingLoad(MI, OpdMapper, MRI))
3104       return;
3105     break;
3106   }
3107   case AMDGPU::G_DYN_STACKALLOC:
3108     applyMappingDynStackAlloc(MI, OpdMapper, MRI);
3109     return;
3110   default:
3111     break;
3112   }
3113 
3114   return applyDefaultMapping(OpdMapper);
3115 }
3116 
3117 // vgpr, sgpr -> vgpr
3118 // vgpr, agpr -> vgpr
3119 // agpr, agpr -> agpr
3120 // agpr, sgpr -> vgpr
3121 static unsigned regBankUnion(unsigned RB0, unsigned RB1) {
3122   if (RB0 == AMDGPU::InvalidRegBankID)
3123     return RB1;
3124   if (RB1 == AMDGPU::InvalidRegBankID)
3125     return RB0;
3126 
3127   if (RB0 == AMDGPU::SGPRRegBankID && RB1 == AMDGPU::SGPRRegBankID)
3128     return AMDGPU::SGPRRegBankID;
3129 
3130   if (RB0 == AMDGPU::AGPRRegBankID && RB1 == AMDGPU::AGPRRegBankID)
3131     return AMDGPU::AGPRRegBankID;
3132 
3133   return AMDGPU::VGPRRegBankID;
3134 }
3135 
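/// regBankUnion for boolean (s1) values: any VCC input makes the result VCC.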
3136 static unsigned regBankBoolUnion(unsigned RB0, unsigned RB1) {
3137   if (RB0 == AMDGPU::InvalidRegBankID)
3138     return RB1;
3139   if (RB1 == AMDGPU::InvalidRegBankID)
3140     return RB0;
3141 
3142   // vcc, vcc -> vcc
3143   // vcc, sgpr -> vcc
3144   // vcc, vgpr -> vcc
3145   if (RB0 == AMDGPU::VCCRegBankID || RB1 == AMDGPU::VCCRegBankID)
3146     return AMDGPU::VCCRegBankID;
3147 
3148   // Remaining sgpr/vgpr/agpr combinations use the plain union.
3149   return regBankUnion(RB0, RB1);
3150 }
3151 
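/// Fold the banks of all register operands of \p MI together with
/// regBankUnion, giving up once the result has degraded to VGPR.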
3152 unsigned AMDGPURegisterBankInfo::getMappingType(const MachineRegisterInfo &MRI,
3153                                                 const MachineInstr &MI) const {
3154   unsigned RegBank = AMDGPU::InvalidRegBankID;
3155 
3156   for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
3157     if (!MI.getOperand(i).isReg())
3158       continue;
3159     Register Reg = MI.getOperand(i).getReg();
3160     if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) {
3161       RegBank = regBankUnion(RegBank, Bank->getID());
3162       if (RegBank == AMDGPU::VGPRRegBankID)
3163         break;
3164     }
3165   }
3166 
3167   return RegBank;
3168 }
3169 
3170 bool AMDGPURegisterBankInfo::isSALUMapping(const MachineInstr &MI) const {
3171   const MachineFunction &MF = *MI.getParent()->getParent();
3172   const MachineRegisterInfo &MRI = MF.getRegInfo();
3173   for (unsigned i = 0, e = MI.getNumOperands();i != e; ++i) {
3174     if (!MI.getOperand(i).isReg())
3175       continue;
3176     Register Reg = MI.getOperand(i).getReg();
3177     if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) {
3178       if (Bank->getID() != AMDGPU::SGPRRegBankID)
3179         return false;
3180     }
3181   }
3182   return true;
3183 }
3184 
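/// Produce a mapping with every register operand assigned to the SGPR bank.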
3185 const RegisterBankInfo::InstructionMapping &
3186 AMDGPURegisterBankInfo::getDefaultMappingSOP(const MachineInstr &MI) const {
3187   const MachineFunction &MF = *MI.getParent()->getParent();
3188   const MachineRegisterInfo &MRI = MF.getRegInfo();
3189   SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
3190 
3191   for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
3192     const MachineOperand &SrcOp = MI.getOperand(i);
3193     if (!SrcOp.isReg())
3194       continue;
3195 
3196     unsigned Size = getSizeInBits(SrcOp.getReg(), MRI, *TRI);
3197     OpdsMapping[i] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
3198   }
3199   return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
3200                                MI.getNumOperands());
3201 }
3202 
3203 const RegisterBankInfo::InstructionMapping &
3204 AMDGPURegisterBankInfo::getDefaultMappingVOP(const MachineInstr &MI) const {
3205   const MachineFunction &MF = *MI.getParent()->getParent();
3206   const MachineRegisterInfo &MRI = MF.getRegInfo();
3207   SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
3208 
3209   // Even though we technically could use SGPRs, this would require knowledge of
3210   // the constant bus restriction. Force all sources to VGPR (except for VCC).
3211   //
3212   // TODO: Unary ops are trivially OK, so accept SGPRs?
3213   for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
3214     const MachineOperand &Src = MI.getOperand(i);
3215     if (!Src.isReg())
3216       continue;
3217 
3218     unsigned Size = getSizeInBits(Src.getReg(), MRI, *TRI);
3219     unsigned BankID = Size == 1 ? AMDGPU::VCCRegBankID : AMDGPU::VGPRRegBankID;
3220     OpdsMapping[i] = AMDGPU::getValueMapping(BankID, Size);
3221   }
3222 
3223   return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
3224                                MI.getNumOperands());
3225 }
3226 
3227 const RegisterBankInfo::InstructionMapping &
3228 AMDGPURegisterBankInfo::getDefaultMappingAllVGPR(const MachineInstr &MI) const {
3229   const MachineFunction &MF = *MI.getParent()->getParent();
3230   const MachineRegisterInfo &MRI = MF.getRegInfo();
3231   SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
3232 
3233   for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
3234     const MachineOperand &Op = MI.getOperand(I);
3235     if (!Op.isReg())
3236       continue;
3237 
3238     unsigned Size = getSizeInBits(Op.getReg(), MRI, *TRI);
3239     OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3240   }
3241 
3242   return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
3243                                MI.getNumOperands());
3244 }
3245 
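/// Map an image intrinsic: the resource descriptor (and the sampler, if
/// present) are reported with whatever bank they currently have, since they
/// must ultimately be SGPRs; all other register operands are mapped to VGPRs.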
3246 const RegisterBankInfo::InstructionMapping &
3247 AMDGPURegisterBankInfo::getImageMapping(const MachineRegisterInfo &MRI,
3248                                         const MachineInstr &MI,
3249                                         int RsrcIdx) const {
3250   // The reported argument index is relative to the IR intrinsic call arguments,
3251   // so we need to shift by the number of defs and the intrinsic ID.
3252   RsrcIdx += MI.getNumExplicitDefs() + 1;
3253 
3254   const int NumOps = MI.getNumOperands();
3255   SmallVector<const ValueMapping *, 8> OpdsMapping(NumOps);
3256 
3257   // TODO: Should packed/unpacked D16 difference be reported here as part of
3258   // the value mapping?
3259   for (int I = 0; I != NumOps; ++I) {
3260     if (!MI.getOperand(I).isReg())
3261       continue;
3262 
3263     Register OpReg = MI.getOperand(I).getReg();
3264     // We replace some dead address operands with $noreg.
3265     if (!OpReg)
3266       continue;
3267 
3268     unsigned Size = getSizeInBits(OpReg, MRI, *TRI);
3269 
3270     // FIXME: Probably need a new intrinsic register bank searchable table to
3271     // handle arbitrary intrinsics easily.
3272     //
3273     // If this has a sampler, it immediately follows rsrc.
3274     const bool MustBeSGPR = I == RsrcIdx || I == RsrcIdx + 1;
3275 
3276     if (MustBeSGPR) {
3277       // This must be an SGPR, so we must report whatever it is as legal.
3278       unsigned NewBank = getRegBankID(OpReg, MRI, AMDGPU::SGPRRegBankID);
3279       OpdsMapping[I] = AMDGPU::getValueMapping(NewBank, Size);
3280     } else {
3281       // Some operands must be VGPR, and these are easy to copy to.
3282       OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3283     }
3284   }
3285 
3286   return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping), NumOps);
3287 }
3288 
3289 /// Return the mapping for a pointer argument.
3290 const RegisterBankInfo::ValueMapping *
3291 AMDGPURegisterBankInfo::getValueMappingForPtr(const MachineRegisterInfo &MRI,
3292                                               Register PtrReg) const {
3293   LLT PtrTy = MRI.getType(PtrReg);
3294   unsigned Size = PtrTy.getSizeInBits();
3295   if (Subtarget.useFlatForGlobal() ||
3296       !AMDGPU::isFlatGlobalAddrSpace(PtrTy.getAddressSpace()))
3297     return AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3298 
3299   // If we're using MUBUF instructions for global memory, an SGPR base register
3300   // is possible. Otherwise this needs to be a VGPR.
3301   const RegisterBank *PtrBank = getRegBank(PtrReg, MRI, *TRI);
3302   return AMDGPU::getValueMapping(PtrBank->getID(), Size);
3303 }
3304 
3305 const RegisterBankInfo::InstructionMapping &
3306 AMDGPURegisterBankInfo::getInstrMappingForLoad(const MachineInstr &MI) const {
3307 
3308   const MachineFunction &MF = *MI.getParent()->getParent();
3309   const MachineRegisterInfo &MRI = MF.getRegInfo();
3310   SmallVector<const ValueMapping*, 2> OpdsMapping(2);
3311   unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
3312   Register PtrReg = MI.getOperand(1).getReg();
3313   LLT PtrTy = MRI.getType(PtrReg);
3314   unsigned AS = PtrTy.getAddressSpace();
3315   unsigned PtrSize = PtrTy.getSizeInBits();
3316 
3317   const ValueMapping *ValMapping;
3318   const ValueMapping *PtrMapping;
3319 
3320   const RegisterBank *PtrBank = getRegBank(PtrReg, MRI, *TRI);
3321 
3322   if (PtrBank == &AMDGPU::SGPRRegBank && AMDGPU::isFlatGlobalAddrSpace(AS)) {
3323     if (isScalarLoadLegal(MI)) {
3324       // We have a uniform instruction so we want to use an SMRD load
3325       ValMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
3326       PtrMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize);
3327     } else {
3328       ValMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3329 
3330       // If we're using MUBUF instructions for global memory, an SGPR base
3331       // register is possible. Otherwise this needs to be a VGPR.
3332       unsigned PtrBankID = Subtarget.useFlatForGlobal() ?
3333         AMDGPU::VGPRRegBankID : AMDGPU::SGPRRegBankID;
3334 
3335       PtrMapping = AMDGPU::getValueMapping(PtrBankID, PtrSize);
3336     }
3337   } else {
3338     ValMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3339     PtrMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize);
3340   }
3341 
3342   OpdsMapping[0] = ValMapping;
3343   OpdsMapping[1] = PtrMapping;
3344   const RegisterBankInfo::InstructionMapping &Mapping = getInstructionMapping(
3345       1, 1, getOperandsMapping(OpdsMapping), MI.getNumOperands());
3346   return Mapping;
3347 
3348   // FIXME: Do we want to add a mapping for FLAT load, or should we just
3349   // handle that during instruction selection?
3350 }
3351 
3352 unsigned
3353 AMDGPURegisterBankInfo::getRegBankID(Register Reg,
3354                                      const MachineRegisterInfo &MRI,
3355                                      unsigned Default) const {
3356   const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
3357   return Bank ? Bank->getID() : Default;
3358 }
3359 
3360 const RegisterBankInfo::ValueMapping *
3361 AMDGPURegisterBankInfo::getSGPROpMapping(Register Reg,
3362                                          const MachineRegisterInfo &MRI,
3363                                          const TargetRegisterInfo &TRI) const {
3364   // Lie and claim anything is legal, even though this needs to be an SGPR;
3365   // applyMapping will have to deal with it as a waterfall loop.
3366   unsigned Bank = getRegBankID(Reg, MRI, AMDGPU::SGPRRegBankID);
3367   unsigned Size = getSizeInBits(Reg, MRI, TRI);
3368   return AMDGPU::getValueMapping(Bank, Size);
3369 }
3370 
3371 const RegisterBankInfo::ValueMapping *
3372 AMDGPURegisterBankInfo::getVGPROpMapping(Register Reg,
3373                                          const MachineRegisterInfo &MRI,
3374                                          const TargetRegisterInfo &TRI) const {
3375   unsigned Size = getSizeInBits(Reg, MRI, TRI);
3376   return AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3377 }
3378 
3379 const RegisterBankInfo::ValueMapping *
3380 AMDGPURegisterBankInfo::getAGPROpMapping(Register Reg,
3381                                          const MachineRegisterInfo &MRI,
3382                                          const TargetRegisterInfo &TRI) const {
3383   unsigned Size = getSizeInBits(Reg, MRI, TRI);
3384   return AMDGPU::getValueMapping(AMDGPU::AGPRRegBankID, Size);
3385 }
3386 
3387 ///
3388 /// This function must return a legal mapping, because
3389 /// AMDGPURegisterBankInfo::getInstrAlternativeMappings() is not called
3390 /// in RegBankSelect::Mode::Fast.  Any mapping that would cause a
3391 /// VGPR-to-SGPR copy to be generated is illegal.
3392 ///
3393 // Operands that must be SGPRs must accept potentially divergent VGPRs as
3394 // legal. These will be dealt with in applyMappingImpl.
3395 //
3396 const RegisterBankInfo::InstructionMapping &
3397 AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
3398   const MachineFunction &MF = *MI.getParent()->getParent();
3399   const MachineRegisterInfo &MRI = MF.getRegInfo();
3400 
3401   if (MI.isCopy() || MI.getOpcode() == AMDGPU::G_FREEZE) {
3402     // The default logic bothers to analyze impossible alternative mappings. We
3403     // want the most straightforward mapping, so just directly handle this.
3404     const RegisterBank *DstBank = getRegBank(MI.getOperand(0).getReg(), MRI,
3405                                              *TRI);
3406     const RegisterBank *SrcBank = getRegBank(MI.getOperand(1).getReg(), MRI,
3407                                              *TRI);
3408     assert(SrcBank && "src bank should have been assigned already");
3409     if (!DstBank)
3410       DstBank = SrcBank;
3411 
3412     unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
3413     if (cannotCopy(*DstBank, *SrcBank, Size))
3414       return getInvalidInstructionMapping();
3415 
3416     const ValueMapping &ValMap = getValueMapping(0, Size, *DstBank);
3417     unsigned OpdsMappingSize = MI.isCopy() ? 1 : 2;
3418     SmallVector<const ValueMapping *, 1> OpdsMapping(OpdsMappingSize);
3419     OpdsMapping[0] = &ValMap;
3420     if (MI.getOpcode() == AMDGPU::G_FREEZE)
3421       OpdsMapping[1] = &ValMap;
3422 
3423     return getInstructionMapping(
3424         1, /*Cost*/ 1,
3425         /*OperandsMapping*/ getOperandsMapping(OpdsMapping), OpdsMappingSize);
3426   }
3427 
3428   if (MI.isRegSequence()) {
3429     // If any input is a VGPR, the result must be a VGPR. The default handling
3430     // assumes any copy between banks is legal.
3431     unsigned BankID = AMDGPU::SGPRRegBankID;
3432 
3433     for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
3434       auto OpBank = getRegBankID(MI.getOperand(I).getReg(), MRI);
3435       // It doesn't make sense to use vcc or scc banks here, so just ignore
3436       // them.
3437       if (OpBank != AMDGPU::SGPRRegBankID) {
3438         BankID = AMDGPU::VGPRRegBankID;
3439         break;
3440       }
3441     }
3442     unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
3443 
3444     const ValueMapping &ValMap = getValueMapping(0, Size, getRegBank(BankID));
3445     return getInstructionMapping(
3446         1, /*Cost*/ 1,
3447         /*OperandsMapping*/ getOperandsMapping({&ValMap}), 1);
3448   }
3449 
3450   // The default handling is broken and doesn't handle illegal SGPR->VGPR copies
3451   // properly.
3452   //
3453   // TODO: There are additional exec masking dependencies to analyze.
3454   if (MI.getOpcode() == TargetOpcode::G_PHI) {
3455     unsigned ResultBank = AMDGPU::InvalidRegBankID;
3456     Register DstReg = MI.getOperand(0).getReg();
3457 
3458     // Sometimes the result may have already been assigned a bank.
3459     if (const RegisterBank *DstBank = getRegBank(DstReg, MRI, *TRI))
3460       ResultBank = DstBank->getID();
3461 
3462     for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
3463       Register Reg = MI.getOperand(I).getReg();
3464       const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
3465 
3466       // FIXME: Assuming VGPR for any undetermined inputs.
3467       if (!Bank || Bank->getID() == AMDGPU::VGPRRegBankID) {
3468         ResultBank = AMDGPU::VGPRRegBankID;
3469         break;
3470       }
3471 
3472       // FIXME: Need to promote SGPR case to s32
3473       unsigned OpBank = Bank->getID();
3474       ResultBank = regBankBoolUnion(ResultBank, OpBank);
3475     }
3476 
3477     assert(ResultBank != AMDGPU::InvalidRegBankID);
3478 
3479     unsigned Size = MRI.getType(DstReg).getSizeInBits();
3480 
3481     const ValueMapping &ValMap =
3482         getValueMapping(0, Size, getRegBank(ResultBank));
3483     return getInstructionMapping(
3484         1, /*Cost*/ 1,
3485         /*OperandsMapping*/ getOperandsMapping({&ValMap}), 1);
3486   }
3487 
3488   const RegisterBankInfo::InstructionMapping &Mapping = getInstrMappingImpl(MI);
3489   if (Mapping.isValid())
3490     return Mapping;
3491 
3492   SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
3493 
3494   switch (MI.getOpcode()) {
3495   default:
3496     return getInvalidInstructionMapping();
3497 
3498   case AMDGPU::G_AND:
3499   case AMDGPU::G_OR:
3500   case AMDGPU::G_XOR: {
3501     unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3502     if (Size == 1) {
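      // For a boolean (s1) result the bank is context dependent: a result
      // already constrained to VCC forces VCC inputs; with no known result
      // bank it is inferred from the inputs (any VGPR input gives a VGPR
      // result, any VCC input gives VCC, and two SGPR inputs stay scalar).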
3503       const RegisterBank *DstBank
3504         = getRegBank(MI.getOperand(0).getReg(), MRI, *TRI);
3505 
3506       unsigned TargetBankID = AMDGPU::InvalidRegBankID;
3507       unsigned BankLHS = AMDGPU::InvalidRegBankID;
3508       unsigned BankRHS = AMDGPU::InvalidRegBankID;
3509       if (DstBank) {
3510         TargetBankID = DstBank->getID();
3511         if (DstBank == &AMDGPU::VCCRegBank) {
3512           TargetBankID = AMDGPU::VCCRegBankID;
3513           BankLHS = AMDGPU::VCCRegBankID;
3514           BankRHS = AMDGPU::VCCRegBankID;
3515         } else {
3516           BankLHS = getRegBankID(MI.getOperand(1).getReg(), MRI,
3517                                  AMDGPU::SGPRRegBankID);
3518           BankRHS = getRegBankID(MI.getOperand(2).getReg(), MRI,
3519                                  AMDGPU::SGPRRegBankID);
3520         }
3521       } else {
3522         BankLHS = getRegBankID(MI.getOperand(1).getReg(), MRI,
3523                                AMDGPU::VCCRegBankID);
3524         BankRHS = getRegBankID(MI.getOperand(2).getReg(), MRI,
3525                                AMDGPU::VCCRegBankID);
3526 
3527         // Both inputs should be true booleans to produce a boolean result.
3528         if (BankLHS == AMDGPU::VGPRRegBankID || BankRHS == AMDGPU::VGPRRegBankID) {
3529           TargetBankID = AMDGPU::VGPRRegBankID;
3530         } else if (BankLHS == AMDGPU::VCCRegBankID || BankRHS == AMDGPU::VCCRegBankID) {
3531           TargetBankID = AMDGPU::VCCRegBankID;
3532           BankLHS = AMDGPU::VCCRegBankID;
3533           BankRHS = AMDGPU::VCCRegBankID;
3534         } else if (BankLHS == AMDGPU::SGPRRegBankID && BankRHS == AMDGPU::SGPRRegBankID) {
3535           TargetBankID = AMDGPU::SGPRRegBankID;
3536         }
3537       }
3538 
3539       OpdsMapping[0] = AMDGPU::getValueMapping(TargetBankID, Size);
3540       OpdsMapping[1] = AMDGPU::getValueMapping(BankLHS, Size);
3541       OpdsMapping[2] = AMDGPU::getValueMapping(BankRHS, Size);
3542       break;
3543     }
3544 
3545     if (Size == 64) {
3546 
3547       if (isSALUMapping(MI)) {
3548         OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::SGPRRegBankID, Size);
3549         OpdsMapping[1] = OpdsMapping[2] = OpdsMapping[0];
3550       } else {
3551         OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size);
3552         unsigned Bank1 = getRegBankID(MI.getOperand(1).getReg(), MRI /*, DefaultBankID*/);
3553         OpdsMapping[1] = AMDGPU::getValueMapping(Bank1, Size);
3554 
3555         unsigned Bank2 = getRegBankID(MI.getOperand(2).getReg(), MRI /*, DefaultBankID*/);
3556         OpdsMapping[2] = AMDGPU::getValueMapping(Bank2, Size);
3557       }
3558 
3559       break;
3560     }
3561 
3562     LLVM_FALLTHROUGH;
3563   }
3564   case AMDGPU::G_PTR_ADD:
3565   case AMDGPU::G_PTRMASK:
3566   case AMDGPU::G_ADD:
3567   case AMDGPU::G_SUB:
3568   case AMDGPU::G_MUL:
3569   case AMDGPU::G_SHL:
3570   case AMDGPU::G_LSHR:
3571   case AMDGPU::G_ASHR:
3572   case AMDGPU::G_UADDO:
3573   case AMDGPU::G_USUBO:
3574   case AMDGPU::G_UADDE:
3575   case AMDGPU::G_SADDE:
3576   case AMDGPU::G_USUBE:
3577   case AMDGPU::G_SSUBE:
3578   case AMDGPU::G_SMIN:
3579   case AMDGPU::G_SMAX:
3580   case AMDGPU::G_UMIN:
3581   case AMDGPU::G_UMAX:
3582   case AMDGPU::G_SHUFFLE_VECTOR:
3583     if (isSALUMapping(MI))
3584       return getDefaultMappingSOP(MI);
3585     LLVM_FALLTHROUGH;
3586 
3587   case AMDGPU::G_SADDSAT: // FIXME: Could lower sat ops for SALU
3588   case AMDGPU::G_SSUBSAT:
3589   case AMDGPU::G_UADDSAT:
3590   case AMDGPU::G_USUBSAT:
3591   case AMDGPU::G_FADD:
3592   case AMDGPU::G_FSUB:
3593   case AMDGPU::G_FPTOSI:
3594   case AMDGPU::G_FPTOUI:
3595   case AMDGPU::G_FMUL:
3596   case AMDGPU::G_FMA:
3597   case AMDGPU::G_FMAD:
3598   case AMDGPU::G_FSQRT:
3599   case AMDGPU::G_FFLOOR:
3600   case AMDGPU::G_FCEIL:
3601   case AMDGPU::G_FRINT:
3602   case AMDGPU::G_SITOFP:
3603   case AMDGPU::G_UITOFP:
3604   case AMDGPU::G_FPTRUNC:
3605   case AMDGPU::G_FPEXT:
3606   case AMDGPU::G_FEXP2:
3607   case AMDGPU::G_FLOG2:
3608   case AMDGPU::G_FMINNUM:
3609   case AMDGPU::G_FMAXNUM:
3610   case AMDGPU::G_FMINNUM_IEEE:
3611   case AMDGPU::G_FMAXNUM_IEEE:
3612   case AMDGPU::G_FCANONICALIZE:
3613   case AMDGPU::G_INTRINSIC_TRUNC:
3614   case AMDGPU::G_BSWAP: // TODO: Somehow expand for scalar?
3615   case AMDGPU::G_FSHR: // TODO: Expand for scalar
3616   case AMDGPU::G_AMDGPU_FFBH_U32:
3617   case AMDGPU::G_AMDGPU_FMIN_LEGACY:
3618   case AMDGPU::G_AMDGPU_FMAX_LEGACY:
3619   case AMDGPU::G_AMDGPU_RCP_IFLAG:
3620   case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0:
3621   case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1:
3622   case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2:
3623   case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
3624     return getDefaultMappingVOP(MI);
3625   case AMDGPU::G_UMULH:
3626   case AMDGPU::G_SMULH: {
3627     if (Subtarget.hasScalarMulHiInsts() && isSALUMapping(MI))
3628       return getDefaultMappingSOP(MI);
3629     return getDefaultMappingVOP(MI);
3630   }
3631   case AMDGPU::G_IMPLICIT_DEF: {
3632     unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3633     OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
3634     break;
3635   }
3636   case AMDGPU::G_FCONSTANT:
3637   case AMDGPU::G_CONSTANT:
3638   case AMDGPU::G_GLOBAL_VALUE:
3639   case AMDGPU::G_BLOCK_ADDR:
3640   case AMDGPU::G_READCYCLECOUNTER: {
3641     unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3642     OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
3643     break;
3644   }
3645   case AMDGPU::G_FRAME_INDEX: {
3646     // TODO: This should be the same as other constants, but eliminateFrameIndex
3647     // currently assumes VALU uses.
3648     unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3649     OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3650     break;
3651   }
3652   case AMDGPU::G_DYN_STACKALLOC: {
3653     // Result is always uniform, and a wave reduction is needed for the source.
3654     OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
3655     unsigned SrcBankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
3656     OpdsMapping[1] = AMDGPU::getValueMapping(SrcBankID, 32);
3657     break;
3658   }
3659   case AMDGPU::G_INSERT: {
3660     unsigned BankID = getMappingType(MRI, MI);
3661     unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
3662     unsigned SrcSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
3663     unsigned EltSize = getSizeInBits(MI.getOperand(2).getReg(), MRI, *TRI);
3664     OpdsMapping[0] = AMDGPU::getValueMapping(BankID, DstSize);
3665     OpdsMapping[1] = AMDGPU::getValueMapping(BankID, SrcSize);
3666     OpdsMapping[2] = AMDGPU::getValueMapping(BankID, EltSize);
3667     OpdsMapping[3] = nullptr;
3668     break;
3669   }
3670   case AMDGPU::G_EXTRACT: {
3671     unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
3672     unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
3673     unsigned SrcSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
3674     OpdsMapping[0] = AMDGPU::getValueMapping(BankID, DstSize);
3675     OpdsMapping[1] = AMDGPU::getValueMapping(BankID, SrcSize);
3676     OpdsMapping[2] = nullptr;
3677     break;
3678   }
3679   case AMDGPU::G_BUILD_VECTOR:
3680   case AMDGPU::G_BUILD_VECTOR_TRUNC: {
3681     LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
3682     if (DstTy == LLT::vector(2, 16)) {
3683       unsigned DstSize = DstTy.getSizeInBits();
3684       unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
3685       unsigned Src0BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
3686       unsigned Src1BankID = getRegBankID(MI.getOperand(2).getReg(), MRI);
3687       unsigned DstBankID = regBankUnion(Src0BankID, Src1BankID);
3688 
3689       OpdsMapping[0] = AMDGPU::getValueMapping(DstBankID, DstSize);
3690       OpdsMapping[1] = AMDGPU::getValueMapping(Src0BankID, SrcSize);
3691       OpdsMapping[2] = AMDGPU::getValueMapping(Src1BankID, SrcSize);
3692       break;
3693     }
3694 
3695     LLVM_FALLTHROUGH;
3696   }
3697   case AMDGPU::G_MERGE_VALUES:
3698   case AMDGPU::G_CONCAT_VECTORS: {
3699     unsigned Bank = getMappingType(MRI, MI);
3700     unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3701     unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
3702 
3703     OpdsMapping[0] = AMDGPU::getValueMapping(Bank, DstSize);
3704     // Op1 and Dst should use the same register bank.
3705     for (unsigned i = 1, e = MI.getNumOperands(); i != e; ++i)
3706       OpdsMapping[i] = AMDGPU::getValueMapping(Bank, SrcSize);
3707     break;
3708   }
3709   case AMDGPU::G_BITCAST:
3710   case AMDGPU::G_INTTOPTR:
3711   case AMDGPU::G_PTRTOINT:
3712   case AMDGPU::G_BITREVERSE:
3713   case AMDGPU::G_FABS:
3714   case AMDGPU::G_FNEG: {
3715     unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3716     unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
3717     OpdsMapping[0] = OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size);
3718     break;
3719   }
3720   case AMDGPU::G_CTLZ_ZERO_UNDEF:
3721   case AMDGPU::G_CTTZ_ZERO_UNDEF:
3722   case AMDGPU::G_CTPOP: {
3723     unsigned Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
3724     unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
3725     OpdsMapping[0] = AMDGPU::getValueMapping(BankID, 32);
3726 
3727     // This should really be getValueMappingSGPR64Only, but allowing the generic
3728     // code to handle the register split just makes using LegalizerHelper more
3729     // difficult.
3730     OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size);
3731     break;
3732   }
3733   case AMDGPU::G_TRUNC: {
3734     Register Dst = MI.getOperand(0).getReg();
3735     Register Src = MI.getOperand(1).getReg();
3736     unsigned Bank = getRegBankID(Src, MRI);
3737     unsigned DstSize = getSizeInBits(Dst, MRI, *TRI);
3738     unsigned SrcSize = getSizeInBits(Src, MRI, *TRI);
3739     OpdsMapping[0] = AMDGPU::getValueMapping(Bank, DstSize);
3740     OpdsMapping[1] = AMDGPU::getValueMapping(Bank, SrcSize);
3741     break;
3742   }
3743   case AMDGPU::G_ZEXT:
3744   case AMDGPU::G_SEXT:
3745   case AMDGPU::G_ANYEXT:
3746   case AMDGPU::G_SEXT_INREG: {
3747     Register Dst = MI.getOperand(0).getReg();
3748     Register Src = MI.getOperand(1).getReg();
3749     unsigned DstSize = getSizeInBits(Dst, MRI, *TRI);
3750     unsigned SrcSize = getSizeInBits(Src, MRI, *TRI);
3751 
3752     unsigned DstBank;
3753     const RegisterBank *SrcBank = getRegBank(Src, MRI, *TRI);
3754     assert(SrcBank);
3755     switch (SrcBank->getID()) {
3756     case AMDGPU::SGPRRegBankID:
3757       DstBank = AMDGPU::SGPRRegBankID;
3758       break;
3759     default:
3760       DstBank = AMDGPU::VGPRRegBankID;
3761       break;
3762     }
3763 
3764     // Scalar extend can use 64-bit BFE, but VGPRs require extending to
3765     // 32-bits, and then to 64.
3766     OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(DstBank, DstSize);
3767     OpdsMapping[1] = AMDGPU::getValueMappingSGPR64Only(SrcBank->getID(),
3768                                                        SrcSize);
3769     break;
3770   }
3771   case AMDGPU::G_FCMP: {
3772     unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
3773     unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI);
3774     OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
3775     OpdsMapping[1] = nullptr; // Predicate Operand.
3776     OpdsMapping[2] = AMDGPU::getValueMapping(Op2Bank, Size);
3777     OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3778     break;
3779   }
3780   case AMDGPU::G_STORE: {
3781     assert(MI.getOperand(0).isReg());
3782     unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3783 
3784     // FIXME: We need to specify a different reg bank once scalar stores are
3785     // supported.
3786     const ValueMapping *ValMapping =
3787         AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3788     OpdsMapping[0] = ValMapping;
3789     OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg());
3790     break;
3791   }
3792   case AMDGPU::G_ICMP: {
3793     auto Pred = static_cast<CmpInst::Predicate>(MI.getOperand(1).getPredicate());
3794     unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
3795 
3796     // See if the result register has already been constrained to vcc, which may
3797     // happen due to control flow intrinsic lowering.
3798     unsigned DstBank = getRegBankID(MI.getOperand(0).getReg(), MRI,
3799                                     AMDGPU::SGPRRegBankID);
3800     unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI);
3801     unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI);
3802 
3803     bool CanUseSCC = DstBank == AMDGPU::SGPRRegBankID &&
3804                      Op2Bank == AMDGPU::SGPRRegBankID &&
3805                      Op3Bank == AMDGPU::SGPRRegBankID &&
3806       (Size == 32 || (Size == 64 &&
3807                       (Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE) &&
3808                       Subtarget.hasScalarCompareEq64()));
3809 
3810     DstBank = CanUseSCC ? AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID;
3811     unsigned SrcBank = CanUseSCC ? AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
3812 
3813     // TODO: Use 32-bit for scalar output size.
3814     // SCC results will need to be copied to a 32-bit SGPR virtual register.
3815     const unsigned ResultSize = 1;
3816 
3817     OpdsMapping[0] = AMDGPU::getValueMapping(DstBank, ResultSize);
3818     OpdsMapping[2] = AMDGPU::getValueMapping(SrcBank, Size);
3819     OpdsMapping[3] = AMDGPU::getValueMapping(SrcBank, Size);
3820     break;
3821   }
3822   case AMDGPU::G_EXTRACT_VECTOR_ELT: {
3823     // VGPR index can be used for waterfall when indexing an SGPR vector.
3824     unsigned SrcBankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
3825     unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3826     unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
3827     unsigned IdxSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
3828     unsigned IdxBank = getRegBankID(MI.getOperand(2).getReg(), MRI);
3829     unsigned OutputBankID = regBankUnion(SrcBankID, IdxBank);
3830 
3831     OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(OutputBankID, DstSize);
3832     OpdsMapping[1] = AMDGPU::getValueMapping(SrcBankID, SrcSize);
3833 
3834     // The index can be in either bank if the source vector is a VGPR.
3835     OpdsMapping[2] = AMDGPU::getValueMapping(IdxBank, IdxSize);
3836     break;
3837   }
3838   case AMDGPU::G_INSERT_VECTOR_ELT: {
3839     unsigned OutputBankID = isSALUMapping(MI) ?
3840       AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
3841 
3842     unsigned VecSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3843     unsigned InsertSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
3844     unsigned IdxSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits();
3845     unsigned InsertEltBankID = getRegBankID(MI.getOperand(2).getReg(), MRI);
3846     unsigned IdxBankID = getRegBankID(MI.getOperand(3).getReg(), MRI);
3847 
3848     OpdsMapping[0] = AMDGPU::getValueMapping(OutputBankID, VecSize);
3849     OpdsMapping[1] = AMDGPU::getValueMapping(OutputBankID, VecSize);
3850 
3851     // This is a weird case, because we need to break down the mapping based on
3852     // the register bank of a different operand.
3853     if (InsertSize == 64 && OutputBankID == AMDGPU::VGPRRegBankID) {
3854       OpdsMapping[2] = AMDGPU::getValueMappingSplit64(InsertEltBankID,
3855                                                       InsertSize);
3856     } else {
3857       assert(InsertSize == 32 || InsertSize == 64);
3858       OpdsMapping[2] = AMDGPU::getValueMapping(InsertEltBankID, InsertSize);
3859     }
3860 
3861     // The index can be in either bank if the source vector is a VGPR.
3862     OpdsMapping[3] = AMDGPU::getValueMapping(IdxBankID, IdxSize);
3863     break;
3864   }
3865   case AMDGPU::G_UNMERGE_VALUES: {
3866     unsigned Bank = getMappingType(MRI, MI);
3867 
3868     // Op1 and Dst should use the same register bank.
3869     // FIXME: Shouldn't this be the default? Why do we need to handle this?
3870     for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
3871       unsigned Size = getSizeInBits(MI.getOperand(i).getReg(), MRI, *TRI);
3872       OpdsMapping[i] = AMDGPU::getValueMapping(Bank, Size);
3873     }
3874     break;
3875   }
3876   case AMDGPU::G_AMDGPU_BUFFER_LOAD:
3877   case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
3878   case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE:
3879   case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
3880   case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT:
3881   case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT:
3882   case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16:
3883   case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT:
3884   case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16:
3885   case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT:
3886   case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16:
3887   case AMDGPU::G_AMDGPU_BUFFER_STORE:
3888   case AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE:
3889   case AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT:
3890   case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT:
3891   case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16: {
3892     OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
3893 
3894     // rsrc
3895     OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
3896 
3897     // vindex
3898     OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
3899 
3900     // voffset
3901     OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
3902 
3903     // soffset
3904     OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
3905 
3906     // Any remaining operands are immediates and were correctly null
3907     // initialized.
3908     break;
3909   }
3910   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP:
3911   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD:
3912   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB:
3913   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN:
3914   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN:
3915   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX:
3916   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX:
3917   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND:
3918   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR:
3919   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR:
3920   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC:
3921   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC:
3922   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD: {
3923     // vdata_out
3924     OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
3925 
3926     // vdata_in
3927     OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
3928 
3929     // rsrc
3930     OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
3931 
3932     // vindex
3933     OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
3934 
3935     // voffset
3936     OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
3937 
3938     // soffset
3939     OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
3940 
3941     // Any remaining operands are immediates and were correctly null
3942     // initialized.
3943     break;
3944   }
3945   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP: {
3946     // vdata_out
3947     OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
3948 
3949     // vdata_in
3950     OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
3951 
3952     // cmp
3953     OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
3954 
3955     // rsrc
3956     OpdsMapping[3] = getSGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
3957 
3958     // vindex
3959     OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
3960 
3961     // voffset
3962     OpdsMapping[5] = getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
3963 
3964     // soffset
3965     OpdsMapping[6] = getSGPROpMapping(MI.getOperand(6).getReg(), MRI, *TRI);
3966 
3967     // Any remaining operands are immediates and were correctly null
3968     // initialized.
3969     break;
3970   }
3971   case AMDGPU::G_AMDGPU_S_BUFFER_LOAD: {
3972     // Lie and claim everything is legal, even though some need to be
3973     // SGPRs. applyMapping will have to deal with it as a waterfall loop.
3974     OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
3975     OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
3976 
3977     // We need to convert this to a MUBUF if either the resource or the offset
3978     // is a VGPR.
3979     unsigned RSrcBank = OpdsMapping[1]->BreakDown[0].RegBank->getID();
3980     unsigned OffsetBank = OpdsMapping[2]->BreakDown[0].RegBank->getID();
3981     unsigned ResultBank = regBankUnion(RSrcBank, OffsetBank);
3982 
3983     unsigned Size0 = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3984     OpdsMapping[0] = AMDGPU::getValueMapping(ResultBank, Size0);
3985     break;
3986   }
3987   case AMDGPU::G_INTRINSIC: {
3988     switch (MI.getIntrinsicID()) {
3989     default:
3990       return getInvalidInstructionMapping();
3991     case Intrinsic::amdgcn_div_fmas:
3992     case Intrinsic::amdgcn_div_fixup:
3993     case Intrinsic::amdgcn_trig_preop:
3994     case Intrinsic::amdgcn_sin:
3995     case Intrinsic::amdgcn_cos:
3996     case Intrinsic::amdgcn_log_clamp:
3997     case Intrinsic::amdgcn_rcp:
3998     case Intrinsic::amdgcn_rcp_legacy:
3999     case Intrinsic::amdgcn_sqrt:
4000     case Intrinsic::amdgcn_rsq:
4001     case Intrinsic::amdgcn_rsq_legacy:
4002     case Intrinsic::amdgcn_rsq_clamp:
4003     case Intrinsic::amdgcn_fmul_legacy:
4004     case Intrinsic::amdgcn_fma_legacy:
4005     case Intrinsic::amdgcn_ldexp:
4006     case Intrinsic::amdgcn_frexp_mant:
4007     case Intrinsic::amdgcn_frexp_exp:
4008     case Intrinsic::amdgcn_fract:
4009     case Intrinsic::amdgcn_cvt_pkrtz:
4010     case Intrinsic::amdgcn_cvt_pknorm_i16:
4011     case Intrinsic::amdgcn_cvt_pknorm_u16:
4012     case Intrinsic::amdgcn_cvt_pk_i16:
4013     case Intrinsic::amdgcn_cvt_pk_u16:
4014     case Intrinsic::amdgcn_fmed3:
4015     case Intrinsic::amdgcn_cubeid:
4016     case Intrinsic::amdgcn_cubema:
4017     case Intrinsic::amdgcn_cubesc:
4018     case Intrinsic::amdgcn_cubetc:
4019     case Intrinsic::amdgcn_sffbh:
4020     case Intrinsic::amdgcn_fmad_ftz:
4021     case Intrinsic::amdgcn_mbcnt_lo:
4022     case Intrinsic::amdgcn_mbcnt_hi:
4023     case Intrinsic::amdgcn_mul_u24:
4024     case Intrinsic::amdgcn_mul_i24:
4025     case Intrinsic::amdgcn_lerp:
4026     case Intrinsic::amdgcn_sad_u8:
4027     case Intrinsic::amdgcn_msad_u8:
4028     case Intrinsic::amdgcn_sad_hi_u8:
4029     case Intrinsic::amdgcn_sad_u16:
4030     case Intrinsic::amdgcn_qsad_pk_u16_u8:
4031     case Intrinsic::amdgcn_mqsad_pk_u16_u8:
4032     case Intrinsic::amdgcn_mqsad_u32_u8:
4033     case Intrinsic::amdgcn_cvt_pk_u8_f32:
4034     case Intrinsic::amdgcn_alignbit:
4035     case Intrinsic::amdgcn_alignbyte:
4036     case Intrinsic::amdgcn_fdot2:
4037     case Intrinsic::amdgcn_sdot2:
4038     case Intrinsic::amdgcn_udot2:
4039     case Intrinsic::amdgcn_sdot4:
4040     case Intrinsic::amdgcn_udot4:
4041     case Intrinsic::amdgcn_sdot8:
4042     case Intrinsic::amdgcn_udot8:
4043       return getDefaultMappingVOP(MI);
4044     case Intrinsic::amdgcn_sbfe:
4045     case Intrinsic::amdgcn_ubfe:
4046       if (isSALUMapping(MI))
4047         return getDefaultMappingSOP(MI);
4048       return getDefaultMappingVOP(MI);
4049     case Intrinsic::amdgcn_ds_swizzle:
4050     case Intrinsic::amdgcn_ds_permute:
4051     case Intrinsic::amdgcn_ds_bpermute:
4052     case Intrinsic::amdgcn_update_dpp:
4053     case Intrinsic::amdgcn_mov_dpp8:
4054     case Intrinsic::amdgcn_mov_dpp:
4055     case Intrinsic::amdgcn_wwm:
4056     case Intrinsic::amdgcn_wqm:
4057     case Intrinsic::amdgcn_softwqm:
4058     case Intrinsic::amdgcn_set_inactive:
4059       return getDefaultMappingAllVGPR(MI);
4060     case Intrinsic::amdgcn_kernarg_segment_ptr:
4061     case Intrinsic::amdgcn_s_getpc:
4062     case Intrinsic::amdgcn_groupstaticsize:
4063     case Intrinsic::amdgcn_reloc_constant:
4064     case Intrinsic::returnaddress: {
4065       unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4066       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
4067       break;
4068     }
4069     case Intrinsic::amdgcn_wqm_vote: {
4070       unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4071       OpdsMapping[0] = OpdsMapping[2]
4072         = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size);
4073       break;
4074     }
4075     case Intrinsic::amdgcn_ps_live: {
4076       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
4077       break;
4078     }
4079     case Intrinsic::amdgcn_div_scale: {
4080       unsigned Dst0Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4081       unsigned Dst1Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
4082       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Dst0Size);
4083       OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Dst1Size);
4084 
4085       unsigned SrcSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits();
4086       OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
4087       OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
4088       break;
4089     }
4090     case Intrinsic::amdgcn_class: {
4091       Register Src0Reg = MI.getOperand(2).getReg();
4092       Register Src1Reg = MI.getOperand(3).getReg();
4093       unsigned Src0Size = MRI.getType(Src0Reg).getSizeInBits();
4094       unsigned Src1Size = MRI.getType(Src1Reg).getSizeInBits();
4095       unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4096       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, DstSize);
4097       OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Src0Size);
4098       OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Src1Size);
4099       break;
4100     }
4101     case Intrinsic::amdgcn_icmp:
4102     case Intrinsic::amdgcn_fcmp: {
4103       unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4104       // This is not VCCRegBank because this is not used in boolean contexts.
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
      unsigned OpSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
      OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, OpSize);
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, OpSize);
      break;
    }
    case Intrinsic::amdgcn_readlane: {
      // This must be an SGPR, but accept a VGPR.
      Register IdxReg = MI.getOperand(3).getReg();
      unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits();
      unsigned IdxBank = getRegBankID(IdxReg, MRI, AMDGPU::SGPRRegBankID);
      OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize);
      LLVM_FALLTHROUGH;
    }
    case Intrinsic::amdgcn_readfirstlane: {
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      unsigned SrcSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
      OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
      break;
    }
    case Intrinsic::amdgcn_writelane: {
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      Register SrcReg = MI.getOperand(2).getReg();
      unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits();
      unsigned SrcBank = getRegBankID(SrcReg, MRI, AMDGPU::SGPRRegBankID);
      Register IdxReg = MI.getOperand(3).getReg();
      unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits();
      unsigned IdxBank = getRegBankID(IdxReg, MRI, AMDGPU::SGPRRegBankID);
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);

      // These two must be SGPRs, but accept VGPRs; readfirstlanes will be
      // inserted to legalize them.
      OpdsMapping[2] = AMDGPU::getValueMapping(SrcBank, SrcSize);
      OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize);
      OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
      break;
    }
    case Intrinsic::amdgcn_if_break: {
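      // The break condition is a divergent boolean; the incoming and
      // outgoing loop masks are wave-sized scalar values.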
      unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
      OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
      break;
    }
    case Intrinsic::amdgcn_permlane16:
    case Intrinsic::amdgcn_permlanex16: {
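      // The data operands are per-lane values; the lane select operands must
      // be uniform, so map them to SGPRs.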
      unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
      OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
      OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
      OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_mfma_f32_4x4x1f32:
    case Intrinsic::amdgcn_mfma_f32_4x4x4f16:
    case Intrinsic::amdgcn_mfma_i32_4x4x4i8:
    case Intrinsic::amdgcn_mfma_f32_4x4x2bf16:
    case Intrinsic::amdgcn_mfma_f32_16x16x1f32:
    case Intrinsic::amdgcn_mfma_f32_16x16x4f32:
    case Intrinsic::amdgcn_mfma_f32_16x16x4f16:
    case Intrinsic::amdgcn_mfma_f32_16x16x16f16:
    case Intrinsic::amdgcn_mfma_i32_16x16x4i8:
    case Intrinsic::amdgcn_mfma_i32_16x16x16i8:
    case Intrinsic::amdgcn_mfma_f32_16x16x2bf16:
    case Intrinsic::amdgcn_mfma_f32_16x16x8bf16:
    case Intrinsic::amdgcn_mfma_f32_32x32x1f32:
    case Intrinsic::amdgcn_mfma_f32_32x32x2f32:
    case Intrinsic::amdgcn_mfma_f32_32x32x4f16:
    case Intrinsic::amdgcn_mfma_f32_32x32x8f16:
    case Intrinsic::amdgcn_mfma_i32_32x32x4i8:
    case Intrinsic::amdgcn_mfma_i32_32x32x8i8:
    case Intrinsic::amdgcn_mfma_f32_32x32x2bf16:
    case Intrinsic::amdgcn_mfma_f32_32x32x4bf16: {
      // Default for MAI intrinsics.
      // srcC can also be an immediate which can be folded later.
      // FIXME: Should we eventually add an alternative mapping with AGPR src
      // for srcA/srcB?
      //
      // vdst, srcA, srcB, srcC
      OpdsMapping[0] = getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
      OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
      OpdsMapping[4] = getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_interp_p1:
    case Intrinsic::amdgcn_interp_p2:
    case Intrinsic::amdgcn_interp_mov:
    case Intrinsic::amdgcn_interp_p1_f16:
    case Intrinsic::amdgcn_interp_p2_f16: {
      const int M0Idx = MI.getNumOperands() - 1;
      Register M0Reg = MI.getOperand(M0Idx).getReg();
      unsigned M0Bank = getRegBankID(M0Reg, MRI, AMDGPU::SGPRRegBankID);
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();

      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
      for (int I = 2; I != M0Idx && MI.getOperand(I).isReg(); ++I)
        OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);

      // This must be an SGPR, but take whatever the original bank is and fix
      // it later.
      OpdsMapping[M0Idx] = AMDGPU::getValueMapping(M0Bank, 32);
      break;
    }
    case Intrinsic::amdgcn_ballot: {
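      // The ballot result is a wave-wide mask read as a uniform value; the
      // source is a divergent boolean.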
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      unsigned SrcSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
      OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, SrcSize);
      break;
    }
    }
    break;
  }
  case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
  case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE: {
    auto IntrID = MI.getIntrinsicID();
    const AMDGPU::RsrcIntrinsic *RSrcIntrin = AMDGPU::lookupRsrcIntrinsic(IntrID);
    assert(RSrcIntrin && "missing RsrcIntrinsic for image intrinsic");
    // Non-images can have complications from operands that allow both SGPR
    // and VGPR. For now it's too complicated to figure out the final opcode
    // to derive the register bank from the MCInstrDesc.
    assert(RSrcIntrin->IsImage);
    return getImageMapping(MRI, MI, RSrcIntrin->RsrcArg);
  }
  case AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY: {
    unsigned N = MI.getNumExplicitOperands() - 2;
    OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 128);
    OpdsMapping[N] = getSGPROpMapping(MI.getOperand(N).getReg(), MRI, *TRI);
    for (unsigned I = 2; I < N; ++I)
      OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
    break;
  }
  case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: {
    auto IntrID = MI.getIntrinsicID();
    switch (IntrID) {
    case Intrinsic::amdgcn_s_getreg:
    case Intrinsic::amdgcn_s_memtime:
    case Intrinsic::amdgcn_s_memrealtime:
    case Intrinsic::amdgcn_s_get_waveid_in_workgroup: {
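      // These read scalar hardware registers or counters, so the result is
      // uniform.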
      unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
      break;
    }
    case Intrinsic::amdgcn_global_atomic_fadd:
    case Intrinsic::amdgcn_global_atomic_csub:
      return getDefaultMappingAllVGPR(MI);
    case Intrinsic::amdgcn_ds_ordered_add:
    case Intrinsic::amdgcn_ds_ordered_swap: {
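      // The ordered count address is read through m0, so it must end up in
      // an SGPR; accept whatever bank it has now and fix it later.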
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
      unsigned M0Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
                                     AMDGPU::SGPRRegBankID);
      OpdsMapping[2] = AMDGPU::getValueMapping(M0Bank, 32);
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      break;
    }
    case Intrinsic::amdgcn_ds_append:
    case Intrinsic::amdgcn_ds_consume: {
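      // The pointer operand is read through m0, so it must be uniform.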
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_exp_compr:
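      // Export data operands are always per-lane values.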
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      break;
    case Intrinsic::amdgcn_exp:
      // FIXME: Could we support packed types here?
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      OpdsMapping[5] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      OpdsMapping[6] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      break;
    case Intrinsic::amdgcn_s_sendmsg:
    case Intrinsic::amdgcn_s_sendmsghalt: {
      // This must be an SGPR, but accept a VGPR.
      unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
                                   AMDGPU::SGPRRegBankID);
      OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
      break;
    }
    case Intrinsic::amdgcn_s_setreg: {
      // This must be an SGPR, but accept a VGPR.
      unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
                                   AMDGPU::SGPRRegBankID);
      OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
      break;
    }
    case Intrinsic::amdgcn_end_cf: {
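      // The saved exec mask is a wave-sized scalar value.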
      unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
      break;
    }
    case Intrinsic::amdgcn_else: {
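      // Result 0 is a divergent boolean; result 1 and the incoming mask are
      // wave-sized scalar exec masks.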
      unsigned WaveSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize);
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize);
      break;
    }
    case Intrinsic::amdgcn_kill: {
      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
      break;
    }
    case Intrinsic::amdgcn_raw_buffer_load:
    case Intrinsic::amdgcn_raw_tbuffer_load: {
      // FIXME: Should make the intrinsic ID the last operand of the
      // instruction; then this would be the same as the store case.
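      // dst (VGPR), rsrc (SGPR), voffset (VGPR), soffset (SGPR)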
      OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
      OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_raw_buffer_store:
    case Intrinsic::amdgcn_raw_buffer_store_format:
    case Intrinsic::amdgcn_raw_tbuffer_store: {
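      // vdata (VGPR), rsrc (SGPR), voffset (VGPR), soffset (SGPR)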
      OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
      OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_struct_buffer_load:
    case Intrinsic::amdgcn_struct_tbuffer_load: {
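      // dst (VGPR), rsrc (SGPR), vindex/voffset (VGPR), soffset (SGPR)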
      OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
      OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
      OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_struct_buffer_store:
    case Intrinsic::amdgcn_struct_tbuffer_store: {
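      // vdata (VGPR), rsrc (SGPR), vindex/voffset (VGPR), soffset (SGPR)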
      OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
      OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
      OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_init_exec_from_input: {
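      // The input must be uniform, so it is mapped to an SGPR.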
      unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
      break;
    }
    case Intrinsic::amdgcn_ds_gws_init:
    case Intrinsic::amdgcn_ds_gws_barrier:
    case Intrinsic::amdgcn_ds_gws_sema_br: {
      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);

      // This must be an SGPR, but accept a VGPR.
      unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
                                   AMDGPU::SGPRRegBankID);
      OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
      break;
    }
    case Intrinsic::amdgcn_ds_gws_sema_v:
    case Intrinsic::amdgcn_ds_gws_sema_p:
    case Intrinsic::amdgcn_ds_gws_sema_release_all: {
      // This must be an SGPR, but accept a VGPR.
      unsigned Bank = getRegBankID(MI.getOperand(1).getReg(), MRI,
                                   AMDGPU::SGPRRegBankID);
      OpdsMapping[1] = AMDGPU::getValueMapping(Bank, 32);
      break;
    }
    default:
      return getInvalidInstructionMapping();
    }
    break;
  }
  case AMDGPU::G_SELECT: {
    unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
                                    AMDGPU::SGPRRegBankID);
    unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI,
                                    AMDGPU::SGPRRegBankID);
    bool SGPRSrcs = Op2Bank == AMDGPU::SGPRRegBankID &&
                    Op3Bank == AMDGPU::SGPRRegBankID;

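    // A select can stay on the SALU with a scalar condition only if both
    // value inputs are already uniform; any divergent input forces a lane
    // mask (VCC) condition and VGPR value operands.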
    unsigned CondBankDefault = SGPRSrcs ?
      AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID;
    unsigned CondBank = getRegBankID(MI.getOperand(1).getReg(), MRI,
                                     CondBankDefault);
    if (CondBank == AMDGPU::SGPRRegBankID)
      CondBank = SGPRSrcs ? AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID;
    else if (CondBank == AMDGPU::VGPRRegBankID)
      CondBank = AMDGPU::VCCRegBankID;

    unsigned Bank = SGPRSrcs && CondBank == AMDGPU::SGPRRegBankID ?
      AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;

    assert(CondBank == AMDGPU::VCCRegBankID ||
           CondBank == AMDGPU::SGPRRegBankID);

    // TODO: Should report 32-bit for scalar condition type.
    if (Size == 64) {
      OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
      OpdsMapping[1] = AMDGPU::getValueMapping(CondBank, 1);
      OpdsMapping[2] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
      OpdsMapping[3] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
    } else {
      OpdsMapping[0] = AMDGPU::getValueMapping(Bank, Size);
      OpdsMapping[1] = AMDGPU::getValueMapping(CondBank, 1);
      OpdsMapping[2] = AMDGPU::getValueMapping(Bank, Size);
      OpdsMapping[3] = AMDGPU::getValueMapping(Bank, Size);
    }

    break;
  }

  case AMDGPU::G_LOAD:
  case AMDGPU::G_ZEXTLOAD:
  case AMDGPU::G_SEXTLOAD:
    return getInstrMappingForLoad(MI);

  case AMDGPU::G_ATOMICRMW_XCHG:
  case AMDGPU::G_ATOMICRMW_ADD:
  case AMDGPU::G_ATOMICRMW_SUB:
  case AMDGPU::G_ATOMICRMW_AND:
  case AMDGPU::G_ATOMICRMW_OR:
  case AMDGPU::G_ATOMICRMW_XOR:
  case AMDGPU::G_ATOMICRMW_MAX:
  case AMDGPU::G_ATOMICRMW_MIN:
  case AMDGPU::G_ATOMICRMW_UMAX:
  case AMDGPU::G_ATOMICRMW_UMIN:
  case AMDGPU::G_ATOMICRMW_FADD:
  case AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG:
  case AMDGPU::G_AMDGPU_ATOMIC_INC:
  case AMDGPU::G_AMDGPU_ATOMIC_DEC:
  case AMDGPU::G_AMDGPU_ATOMIC_FMIN:
  case AMDGPU::G_AMDGPU_ATOMIC_FMAX: {
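    // The result and value operands always get VGPR mappings; the pointer
    // keeps whatever bank it already has.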
    OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
    OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg());
    OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
    break;
  }
  case AMDGPU::G_ATOMIC_CMPXCHG: {
    OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
    OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg());
    OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
    OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
    break;
  }
  case AMDGPU::G_BRCOND: {
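    // The branch condition is either a uniform (SGPR) boolean or a lane
    // mask; anything not known to be an SGPR is treated as a VCC value.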
    unsigned Bank = getRegBankID(MI.getOperand(0).getReg(), MRI,
                                 AMDGPU::SGPRRegBankID);
    assert(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1);
    if (Bank != AMDGPU::SGPRRegBankID)
      Bank = AMDGPU::VCCRegBankID;

    OpdsMapping[0] = AMDGPU::getValueMapping(Bank, 1);
    break;
  }
  }

  return getInstructionMapping(/*ID*/1, /*Cost*/1,
                               getOperandsMapping(OpdsMapping),
                               MI.getNumOperands());
}