1 //===- AMDGPURegisterBankInfo.cpp -------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements the targeting of the RegisterBankInfo class for
10 /// AMDGPU.
11 ///
12 /// \par
13 ///
14 /// AMDGPU has unique register bank constraints that require special high level
15 /// strategies to deal with. There are two main true physical register banks
16 /// VGPR (vector), and SGPR (scalar). Additionally the VCC register bank is a
17 /// sort of pseudo-register bank needed to represent SGPRs used in a vector
18 /// boolean context. There is also the AGPR bank, which is a special purpose
19 /// physical register bank present on some subtargets.
20 ///
21 /// Copying from VGPR to SGPR is generally illegal, unless the value is known to
22 /// be uniform. It is generally not valid to legalize operands by inserting
23 /// copies as on other targets. Operations which require uniform, SGPR operands
24 /// generally require scalarization by repeatedly executing the instruction,
25 /// activating each set of lanes using a unique set of input values. This is
26 /// referred to as a waterfall loop.
27 ///
28 /// \par Booleans
29 ///
30 /// Booleans (s1 values) requires special consideration. A vector compare result
31 /// is naturally a bitmask with one bit per lane, in a 32 or 64-bit
32 /// register. These are represented with the VCC bank. During selection, we need
33 /// to be able to unambiguously go back from a register class to a register
34 /// bank. To distinguish whether an SGPR should use the SGPR or VCC register
35 /// bank, we need to know the use context type. An SGPR s1 value always means a
36 /// VCC bank value, otherwise it will be the SGPR bank. A scalar compare sets
37 /// SCC, which is a 1-bit unaddressable register. This will need to be copied to
38 /// a 32-bit virtual register. Taken together, this means we need to adjust the
39 /// type of boolean operations to be regbank legal. All SALU booleans need to be
40 /// widened to 32-bits, and all VALU booleans need to be s1 values.
41 ///
42 /// A noteworthy exception to the s1-means-vcc rule is for legalization artifact
43 /// casts. G_TRUNC s1 results, and G_SEXT/G_ZEXT/G_ANYEXT sources are never vcc
44 /// bank. A non-boolean source (such as a truncate from a 1-bit load from
45 /// memory) will require a copy to the VCC bank which will require clearing the
46 /// high bits and inserting a compare.
47 ///
48 /// \par Constant bus restriction
49 ///
50 /// VALU instructions have a limitation known as the constant bus
51 /// restriction. Most VALU instructions can use SGPR operands, but may read at
52 /// most 1 SGPR or constant literal value (this to 2 in gfx10 for most
53 /// instructions). This is one unique SGPR, so the same SGPR may be used for
54 /// multiple operands. From a register bank perspective, any combination of
55 /// operands should be legal as an SGPR, but this is contextually dependent on
56 /// the SGPR operands all being the same register. There is therefore optimal to
57 /// choose the SGPR with the most uses to minimize the number of copies.
58 ///
59 /// We avoid trying to solve this problem in RegBankSelect. Any VALU G_*
60 /// operation should have its source operands all mapped to VGPRs (except for
61 /// VCC), inserting copies from any SGPR operands. This the most trival legal
62 /// mapping. Anything beyond the simplest 1:1 instruction selection would be too
63 /// complicated to solve here. Every optimization pattern or instruction
64 /// selected to multiple outputs would have to enforce this rule, and there
65 /// would be additional complexity in tracking this rule for every G_*
66 /// operation. By forcing all inputs to VGPRs, it also simplifies the task of
67 /// picking the optimal operand combination from a post-isel optimization pass.
68 ///
69 //===----------------------------------------------------------------------===//
70 
71 #include "AMDGPURegisterBankInfo.h"
72 
73 #include "AMDGPU.h"
74 #include "AMDGPUGlobalISelUtils.h"
75 #include "AMDGPUInstrInfo.h"
76 #include "GCNSubtarget.h"
77 #include "SIMachineFunctionInfo.h"
78 #include "SIRegisterInfo.h"
79 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
80 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
81 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
82 #include "llvm/CodeGen/GlobalISel/RegisterBank.h"
83 #include "llvm/IR/IntrinsicsAMDGPU.h"
84 
85 #define GET_TARGET_REGBANK_IMPL
86 #include "AMDGPUGenRegisterBank.inc"
87 
88 // This file will be TableGen'ed at some point.
89 #include "AMDGPUGenRegisterBankInfo.def"
90 
91 using namespace llvm;
92 using namespace MIPatternMatch;
93 
94 namespace {
95 
96 // Observer to apply a register bank to new registers created by LegalizerHelper.
97 class ApplyRegBankMapping final : public GISelChangeObserver {
98 private:
99   const AMDGPURegisterBankInfo &RBI;
100   MachineRegisterInfo &MRI;
101   const RegisterBank *NewBank;
102   SmallVector<MachineInstr *, 4> NewInsts;
103 
104 public:
105   ApplyRegBankMapping(const AMDGPURegisterBankInfo &RBI_,
106                       MachineRegisterInfo &MRI_, const RegisterBank *RB)
107     : RBI(RBI_), MRI(MRI_), NewBank(RB) {}
108 
109   ~ApplyRegBankMapping() {
110     for (MachineInstr *MI : NewInsts)
111       applyBank(*MI);
112   }
113 
114   /// Set any registers that don't have a set register class or bank to SALU.
115   void applyBank(MachineInstr &MI) {
116     const unsigned Opc = MI.getOpcode();
117     if (Opc == AMDGPU::G_ANYEXT || Opc == AMDGPU::G_ZEXT ||
118         Opc == AMDGPU::G_SEXT) {
119       // LegalizerHelper wants to use the basic legalization artifacts when
120       // widening etc. We don't handle selection with vcc in artifact sources,
121       // so we need to use a sslect instead to handle these properly.
122       Register DstReg = MI.getOperand(0).getReg();
123       Register SrcReg = MI.getOperand(1).getReg();
124       const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, MRI, *RBI.TRI);
125       if (SrcBank == &AMDGPU::VCCRegBank) {
126         const LLT S32 = LLT::scalar(32);
127         assert(MRI.getType(SrcReg) == LLT::scalar(1));
128         assert(MRI.getType(DstReg) == S32);
129         assert(NewBank == &AMDGPU::VGPRRegBank);
130 
131         // Replace the extension with a select, which really uses the boolean
132         // source.
133         MachineIRBuilder B(MI);
134         auto True = B.buildConstant(S32, Opc == AMDGPU::G_SEXT ? -1 : 1);
135         auto False = B.buildConstant(S32, 0);
136         B.buildSelect(DstReg, SrcReg, True, False);
137         MRI.setRegBank(True.getReg(0), *NewBank);
138         MRI.setRegBank(False.getReg(0), *NewBank);
139         MI.eraseFromParent();
140       }
141 
142       assert(!MRI.getRegClassOrRegBank(DstReg));
143       MRI.setRegBank(DstReg, *NewBank);
144       return;
145     }
146 
147 #ifndef NDEBUG
148     if (Opc == AMDGPU::G_TRUNC) {
149       Register DstReg = MI.getOperand(0).getReg();
150       const RegisterBank *DstBank = RBI.getRegBank(DstReg, MRI, *RBI.TRI);
151       assert(DstBank != &AMDGPU::VCCRegBank);
152     }
153 #endif
154 
155     for (MachineOperand &Op : MI.operands()) {
156       if (!Op.isReg())
157         continue;
158 
159       // We may see physical registers if building a real MI
160       Register Reg = Op.getReg();
161       if (Reg.isPhysical() || MRI.getRegClassOrRegBank(Reg))
162         continue;
163 
164       const RegisterBank *RB = NewBank;
165       if (MRI.getType(Reg) == LLT::scalar(1)) {
166         assert(NewBank == &AMDGPU::VGPRRegBank &&
167                "s1 operands should only be used for vector bools");
168         assert((MI.getOpcode() != AMDGPU::G_TRUNC &&
169                 MI.getOpcode() != AMDGPU::G_ANYEXT) &&
170                "not expecting legalization artifacts here");
171         RB = &AMDGPU::VCCRegBank;
172       }
173 
174       MRI.setRegBank(Reg, *RB);
175     }
176   }
177 
178   void erasingInstr(MachineInstr &MI) override {}
179 
180   void createdInstr(MachineInstr &MI) override {
181     // At this point, the instruction was just inserted and has no operands.
182     NewInsts.push_back(&MI);
183   }
184 
185   void changingInstr(MachineInstr &MI) override {}
186   void changedInstr(MachineInstr &MI) override {
187     // FIXME: In principle we should probably add the instruction to NewInsts,
188     // but the way the LegalizerHelper uses the observer, we will always see the
189     // registers we need to set the regbank on also referenced in a new
190     // instruction.
191   }
192 };
193 
194 }
195 AMDGPURegisterBankInfo::AMDGPURegisterBankInfo(const GCNSubtarget &ST)
196     : AMDGPUGenRegisterBankInfo(),
197       Subtarget(ST),
198       TRI(Subtarget.getRegisterInfo()),
199       TII(Subtarget.getInstrInfo()) {
200 
201   // HACK: Until this is fully tablegen'd.
202   static llvm::once_flag InitializeRegisterBankFlag;
203 
204   static auto InitializeRegisterBankOnce = [this]() {
205     assert(&getRegBank(AMDGPU::SGPRRegBankID) == &AMDGPU::SGPRRegBank &&
206            &getRegBank(AMDGPU::VGPRRegBankID) == &AMDGPU::VGPRRegBank &&
207            &getRegBank(AMDGPU::AGPRRegBankID) == &AMDGPU::AGPRRegBank);
208     (void)this;
209   };
210 
211   llvm::call_once(InitializeRegisterBankFlag, InitializeRegisterBankOnce);
212 }
213 
214 static bool isVectorRegisterBank(const RegisterBank &Bank) {
215   unsigned BankID = Bank.getID();
216   return BankID == AMDGPU::VGPRRegBankID || BankID == AMDGPU::AGPRRegBankID;
217 }
218 
219 unsigned AMDGPURegisterBankInfo::copyCost(const RegisterBank &Dst,
220                                           const RegisterBank &Src,
221                                           unsigned Size) const {
222   // TODO: Should there be a UniformVGPRRegBank which can use readfirstlane?
223   if (Dst.getID() == AMDGPU::SGPRRegBankID &&
224       (isVectorRegisterBank(Src) || Src.getID() == AMDGPU::VCCRegBankID)) {
225     return std::numeric_limits<unsigned>::max();
226   }
227 
228   // Bool values are tricky, because the meaning is based on context. The SCC
229   // and VCC banks are for the natural scalar and vector conditions produced by
230   // a compare.
231   //
232   // Legalization doesn't know about the necessary context, so an s1 use may
233   // have been a truncate from an arbitrary value, in which case a copy (lowered
234   // as a compare with 0) needs to be inserted.
235   if (Size == 1 &&
236       (Dst.getID() == AMDGPU::SGPRRegBankID) &&
237       (isVectorRegisterBank(Src) ||
238        Src.getID() == AMDGPU::SGPRRegBankID ||
239        Src.getID() == AMDGPU::VCCRegBankID))
240     return std::numeric_limits<unsigned>::max();
241 
242   // There is no direct copy between AGPRs.
243   if (Dst.getID() == AMDGPU::AGPRRegBankID &&
244       Src.getID() == AMDGPU::AGPRRegBankID)
245     return 4;
246 
247   return RegisterBankInfo::copyCost(Dst, Src, Size);
248 }
249 
250 unsigned AMDGPURegisterBankInfo::getBreakDownCost(
251   const ValueMapping &ValMapping,
252   const RegisterBank *CurBank) const {
253   // Check if this is a breakdown for G_LOAD to move the pointer from SGPR to
254   // VGPR.
255   // FIXME: Is there a better way to do this?
256   if (ValMapping.NumBreakDowns >= 2 || ValMapping.BreakDown[0].Length >= 64)
257     return 10; // This is expensive.
258 
259   assert(ValMapping.NumBreakDowns == 2 &&
260          ValMapping.BreakDown[0].Length == 32 &&
261          ValMapping.BreakDown[0].StartIdx == 0 &&
262          ValMapping.BreakDown[1].Length == 32 &&
263          ValMapping.BreakDown[1].StartIdx == 32 &&
264          ValMapping.BreakDown[0].RegBank == ValMapping.BreakDown[1].RegBank);
265 
266   // 32-bit extract of a 64-bit value is just access of a subregister, so free.
267   // TODO: Cost of 0 hits assert, though it's not clear it's what we really
268   // want.
269 
270   // TODO: 32-bit insert to a 64-bit SGPR may incur a non-free copy due to SGPR
271   // alignment restrictions, but this probably isn't important.
272   return 1;
273 }
274 
275 const RegisterBank &
276 AMDGPURegisterBankInfo::getRegBankFromRegClass(const TargetRegisterClass &RC,
277                                                LLT Ty) const {
278   if (&RC == &AMDGPU::SReg_1RegClass)
279     return AMDGPU::VCCRegBank;
280 
281   // We promote real scalar booleans to SReg_32. Any SGPR using s1 is really a
282   // VCC-like use.
283   if (TRI->isSGPRClass(&RC)) {
284     // FIXME: This probably came from a copy from a physical register, which
285     // should be inferrrable from the copied to-type. We don't have many boolean
286     // physical register constraints so just assume a normal SGPR for now.
287     if (!Ty.isValid())
288       return AMDGPU::SGPRRegBank;
289 
290     return Ty == LLT::scalar(1) ? AMDGPU::VCCRegBank : AMDGPU::SGPRRegBank;
291   }
292 
293   return TRI->isAGPRClass(&RC) ? AMDGPU::AGPRRegBank : AMDGPU::VGPRRegBank;
294 }
295 
296 template <unsigned NumOps>
297 RegisterBankInfo::InstructionMappings
298 AMDGPURegisterBankInfo::addMappingFromTable(
299     const MachineInstr &MI, const MachineRegisterInfo &MRI,
300     const std::array<unsigned, NumOps> RegSrcOpIdx,
301     ArrayRef<OpRegBankEntry<NumOps>> Table) const {
302 
303   InstructionMappings AltMappings;
304 
305   SmallVector<const ValueMapping *, 10> Operands(MI.getNumOperands());
306 
307   unsigned Sizes[NumOps];
308   for (unsigned I = 0; I < NumOps; ++I) {
309     Register Reg = MI.getOperand(RegSrcOpIdx[I]).getReg();
310     Sizes[I] = getSizeInBits(Reg, MRI, *TRI);
311   }
312 
313   for (unsigned I = 0, E = MI.getNumExplicitDefs(); I != E; ++I) {
314     unsigned SizeI = getSizeInBits(MI.getOperand(I).getReg(), MRI, *TRI);
315     Operands[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SizeI);
316   }
317 
318   // getInstrMapping's default mapping uses ID 1, so start at 2.
319   unsigned MappingID = 2;
320   for (const auto &Entry : Table) {
321     for (unsigned I = 0; I < NumOps; ++I) {
322       int OpIdx = RegSrcOpIdx[I];
323       Operands[OpIdx] = AMDGPU::getValueMapping(Entry.RegBanks[I], Sizes[I]);
324     }
325 
326     AltMappings.push_back(&getInstructionMapping(MappingID++, Entry.Cost,
327                                                  getOperandsMapping(Operands),
328                                                  Operands.size()));
329   }
330 
331   return AltMappings;
332 }
333 
334 RegisterBankInfo::InstructionMappings
335 AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsic(
336     const MachineInstr &MI, const MachineRegisterInfo &MRI) const {
337   switch (MI.getIntrinsicID()) {
338   case Intrinsic::amdgcn_readlane: {
339     static const OpRegBankEntry<3> Table[2] = {
340       // Perfectly legal.
341       { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },
342 
343       // Need a readfirstlane for the index.
344       { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 }
345     };
346 
347     const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 2, 3 } };
348     return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
349   }
350   case Intrinsic::amdgcn_writelane: {
351     static const OpRegBankEntry<4> Table[4] = {
352       // Perfectly legal.
353       { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },
354 
355       // Need readfirstlane of first op
356       { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 },
357 
358       // Need readfirstlane of second op
359       { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 },
360 
361       // Need readfirstlane of both ops
362       { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 3 }
363     };
364 
365     // rsrc, voffset, offset
366     const std::array<unsigned, 4> RegSrcOpIdx = { { 0, 2, 3, 4 } };
367     return addMappingFromTable<4>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
368   }
369   default:
370     return RegisterBankInfo::getInstrAlternativeMappings(MI);
371   }
372 }
373 
374 RegisterBankInfo::InstructionMappings
375 AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsicWSideEffects(
376     const MachineInstr &MI, const MachineRegisterInfo &MRI) const {
377 
378   switch (MI.getIntrinsicID()) {
379   case Intrinsic::amdgcn_s_buffer_load: {
380     static const OpRegBankEntry<2> Table[4] = {
381       // Perfectly legal.
382       { { AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },
383 
384       // Only need 1 register in loop
385       { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 300 },
386 
387       // Have to waterfall the resource.
388       { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1000 },
389 
390       // Have to waterfall the resource, and the offset.
391       { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1500 }
392     };
393 
394     // rsrc, offset
395     const std::array<unsigned, 2> RegSrcOpIdx = { { 2, 3 } };
396     return addMappingFromTable<2>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
397   }
398   case Intrinsic::amdgcn_ds_ordered_add:
399   case Intrinsic::amdgcn_ds_ordered_swap: {
400     // VGPR = M0, VGPR
401     static const OpRegBankEntry<3> Table[2] = {
402       // Perfectly legal.
403       { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID  }, 1 },
404 
405       // Need a readfirstlane for m0
406       { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 }
407     };
408 
409     const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 2, 3 } };
410     return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
411   }
412   case Intrinsic::amdgcn_s_sendmsg:
413   case Intrinsic::amdgcn_s_sendmsghalt: {
414     // FIXME: Should have no register for immediate
415     static const OpRegBankEntry<1> Table[2] = {
416       // Perfectly legal.
417       { { AMDGPU::SGPRRegBankID }, 1 },
418 
419       // Need readlane
420       { { AMDGPU::VGPRRegBankID }, 3 }
421     };
422 
423     const std::array<unsigned, 1> RegSrcOpIdx = { { 2 } };
424     return addMappingFromTable<1>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
425   }
426   default:
427     return RegisterBankInfo::getInstrAlternativeMappings(MI);
428   }
429 }
430 
431 static bool memOpHasNoClobbered(const MachineMemOperand *MMO) {
432   const Instruction *I = dyn_cast_or_null<Instruction>(MMO->getValue());
433   return I && I->getMetadata("amdgpu.noclobber");
434 }
435 
436 // FIXME: Returns uniform if there's no source value information. This is
437 // probably wrong.
438 static bool isScalarLoadLegal(const MachineInstr &MI) {
439   if (!MI.hasOneMemOperand())
440     return false;
441 
442   const MachineMemOperand *MMO = *MI.memoperands_begin();
443   const unsigned AS = MMO->getAddrSpace();
444   const bool IsConst = AS == AMDGPUAS::CONSTANT_ADDRESS ||
445                        AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT;
446   // Require 4-byte alignment.
447   return MMO->getAlign() >= Align(4) &&
448          // Can't do a scalar atomic load.
449          !MMO->isAtomic() &&
450          // Don't use scalar loads for volatile accesses to non-constant address
451          // spaces.
452          (IsConst || !MMO->isVolatile()) &&
453          // Memory must be known constant, or not written before this load.
454          (IsConst || MMO->isInvariant() || memOpHasNoClobbered(MMO)) &&
455          AMDGPUInstrInfo::isUniformMMO(MMO);
456 }
457 
458 RegisterBankInfo::InstructionMappings
459 AMDGPURegisterBankInfo::getInstrAlternativeMappings(
460     const MachineInstr &MI) const {
461 
462   const MachineFunction &MF = *MI.getParent()->getParent();
463   const MachineRegisterInfo &MRI = MF.getRegInfo();
464 
465 
466   InstructionMappings AltMappings;
467   switch (MI.getOpcode()) {
468   case TargetOpcode::G_CONSTANT: {
469     unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
470     if (Size == 1) {
471       static const OpRegBankEntry<1> Table[3] = {
472         { { AMDGPU::VGPRRegBankID }, 1 },
473         { { AMDGPU::SGPRRegBankID }, 1 },
474         { { AMDGPU::VCCRegBankID }, 1 }
475       };
476 
477       return addMappingFromTable<1>(MI, MRI, {{ 0 }}, Table);
478     }
479 
480     LLVM_FALLTHROUGH;
481   }
482   case TargetOpcode::G_FCONSTANT:
483   case TargetOpcode::G_FRAME_INDEX:
484   case TargetOpcode::G_GLOBAL_VALUE: {
485     static const OpRegBankEntry<1> Table[2] = {
486       { { AMDGPU::VGPRRegBankID }, 1 },
487       { { AMDGPU::SGPRRegBankID }, 1 }
488     };
489 
490     return addMappingFromTable<1>(MI, MRI, {{ 0 }}, Table);
491   }
492   case TargetOpcode::G_AND:
493   case TargetOpcode::G_OR:
494   case TargetOpcode::G_XOR: {
495     unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
496 
497     if (Size == 1) {
498       // s_{and|or|xor}_b32 set scc when the result of the 32-bit op is not 0.
499       const InstructionMapping &SCCMapping = getInstructionMapping(
500         1, 1, getOperandsMapping(
501           {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32),
502            AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32),
503            AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32)}),
504         3); // Num Operands
505       AltMappings.push_back(&SCCMapping);
506 
507       const InstructionMapping &VCCMapping0 = getInstructionMapping(
508         2, 1, getOperandsMapping(
509           {AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size),
510            AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size),
511            AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size)}),
512         3); // Num Operands
513       AltMappings.push_back(&VCCMapping0);
514       return AltMappings;
515     }
516 
517     if (Size != 64)
518       break;
519 
520     const InstructionMapping &SSMapping = getInstructionMapping(
521       1, 1, getOperandsMapping(
522         {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
523          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
524          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
525       3); // Num Operands
526     AltMappings.push_back(&SSMapping);
527 
528     const InstructionMapping &VVMapping = getInstructionMapping(
529       2, 2, getOperandsMapping(
530         {AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
531          AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
532          AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}),
533       3); // Num Operands
534     AltMappings.push_back(&VVMapping);
535     break;
536   }
537   case TargetOpcode::G_LOAD:
538   case TargetOpcode::G_ZEXTLOAD:
539   case TargetOpcode::G_SEXTLOAD: {
540     unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
541     LLT PtrTy = MRI.getType(MI.getOperand(1).getReg());
542     unsigned PtrSize = PtrTy.getSizeInBits();
543     unsigned AS = PtrTy.getAddressSpace();
544 
545     if ((AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::REGION_ADDRESS &&
546          AS != AMDGPUAS::PRIVATE_ADDRESS) &&
547         isScalarLoadLegal(MI)) {
548       const InstructionMapping &SSMapping = getInstructionMapping(
549           1, 1, getOperandsMapping(
550                     {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
551                      AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize)}),
552           2); // Num Operands
553       AltMappings.push_back(&SSMapping);
554     }
555 
556     const InstructionMapping &VVMapping = getInstructionMapping(
557         2, 1,
558         getOperandsMapping(
559             {AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
560              AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize)}),
561         2); // Num Operands
562     AltMappings.push_back(&VVMapping);
563 
564     // It may be possible to have a vgpr = load sgpr mapping here, because
565     // the mubuf instructions support this kind of load, but probably for only
566     // gfx7 and older.  However, the addressing mode matching in the instruction
567     // selector should be able to do a better job of detecting and selecting
568     // these kinds of loads from the vgpr = load vgpr mapping.
569 
570     return AltMappings;
571 
572   }
573   case TargetOpcode::G_SELECT: {
574     unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
575     const InstructionMapping &SSMapping = getInstructionMapping(1, 1,
576       getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
577                           AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1),
578                           AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
579                           AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
580       4); // Num Operands
581     AltMappings.push_back(&SSMapping);
582 
583     const InstructionMapping &VVMapping = getInstructionMapping(2, 1,
584       getOperandsMapping({AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
585                           AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
586                           AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
587                           AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}),
588       4); // Num Operands
589     AltMappings.push_back(&VVMapping);
590 
591     return AltMappings;
592   }
593   case TargetOpcode::G_UADDE:
594   case TargetOpcode::G_USUBE:
595   case TargetOpcode::G_SADDE:
596   case TargetOpcode::G_SSUBE: {
597     unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
598     const InstructionMapping &SSMapping = getInstructionMapping(1, 1,
599       getOperandsMapping(
600         {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
601          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1),
602          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
603          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
604          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1)}),
605       5); // Num Operands
606     AltMappings.push_back(&SSMapping);
607 
608     const InstructionMapping &VVMapping = getInstructionMapping(2, 1,
609       getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
610                           AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
611                           AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
612                           AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
613                           AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1)}),
614       5); // Num Operands
615     AltMappings.push_back(&VVMapping);
616     return AltMappings;
617   }
618   case AMDGPU::G_BRCOND: {
619     assert(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1);
620 
621     // TODO: Change type to 32 for scalar
622     const InstructionMapping &SMapping = getInstructionMapping(
623       1, 1, getOperandsMapping(
624         {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1), nullptr}),
625       2); // Num Operands
626     AltMappings.push_back(&SMapping);
627 
628     const InstructionMapping &VMapping = getInstructionMapping(
629       1, 1, getOperandsMapping(
630         {AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1), nullptr }),
631       2); // Num Operands
632     AltMappings.push_back(&VMapping);
633     return AltMappings;
634   }
635   case AMDGPU::G_INTRINSIC:
636     return getInstrAlternativeMappingsIntrinsic(MI, MRI);
637   case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS:
638     return getInstrAlternativeMappingsIntrinsicWSideEffects(MI, MRI);
639   default:
640     break;
641   }
642   return RegisterBankInfo::getInstrAlternativeMappings(MI);
643 }
644 
645 void AMDGPURegisterBankInfo::split64BitValueForMapping(
646   MachineIRBuilder &B,
647   SmallVector<Register, 2> &Regs,
648   LLT HalfTy,
649   Register Reg) const {
650   assert(HalfTy.getSizeInBits() == 32);
651   MachineRegisterInfo *MRI = B.getMRI();
652   Register LoLHS = MRI->createGenericVirtualRegister(HalfTy);
653   Register HiLHS = MRI->createGenericVirtualRegister(HalfTy);
654   const RegisterBank *Bank = getRegBank(Reg, *MRI, *TRI);
655   MRI->setRegBank(LoLHS, *Bank);
656   MRI->setRegBank(HiLHS, *Bank);
657 
658   Regs.push_back(LoLHS);
659   Regs.push_back(HiLHS);
660 
661   B.buildInstr(AMDGPU::G_UNMERGE_VALUES)
662     .addDef(LoLHS)
663     .addDef(HiLHS)
664     .addUse(Reg);
665 }
666 
667 /// Replace the current type each register in \p Regs has with \p NewTy
668 static void setRegsToType(MachineRegisterInfo &MRI, ArrayRef<Register> Regs,
669                           LLT NewTy) {
670   for (Register Reg : Regs) {
671     assert(MRI.getType(Reg).getSizeInBits() == NewTy.getSizeInBits());
672     MRI.setType(Reg, NewTy);
673   }
674 }
675 
676 static LLT getHalfSizedType(LLT Ty) {
677   if (Ty.isVector()) {
678     assert(Ty.getElementCount().isKnownMultipleOf(2));
679     return LLT::scalarOrVector(Ty.getElementCount().divideCoefficientBy(2),
680                                Ty.getElementType());
681   }
682 
683   assert(Ty.getScalarSizeInBits() % 2 == 0);
684   return LLT::scalar(Ty.getScalarSizeInBits() / 2);
685 }
686 
687 /// Legalize instruction \p MI where operands in \p OpIndices must be SGPRs. If
688 /// any of the required SGPR operands are VGPRs, perform a waterfall loop to
689 /// execute the instruction for each unique combination of values in all lanes
690 /// in the wave. The block will be split such that rest of the instructions are
691 /// moved to a new block.
692 ///
693 /// Essentially performs this loop:
694 //
695 /// Save Execution Mask
696 /// For (Lane : Wavefront) {
697 ///   Enable Lane, Disable all other lanes
698 ///   SGPR = read SGPR value for current lane from VGPR
699 ///   VGPRResult[Lane] = use_op SGPR
700 /// }
701 /// Restore Execution Mask
702 ///
703 /// There is additional complexity to try for compare values to identify the
704 /// unique values used.
705 bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
706   MachineIRBuilder &B,
707   iterator_range<MachineBasicBlock::iterator> Range,
708   SmallSet<Register, 4> &SGPROperandRegs,
709   MachineRegisterInfo &MRI) const {
710   SmallVector<Register, 4> ResultRegs;
711   SmallVector<Register, 4> InitResultRegs;
712   SmallVector<Register, 4> PhiRegs;
713 
714   // Track use registers which have already been expanded with a readfirstlane
715   // sequence. This may have multiple uses if moving a sequence.
716   DenseMap<Register, Register> WaterfalledRegMap;
717 
718   MachineBasicBlock &MBB = B.getMBB();
719   MachineFunction *MF = &B.getMF();
720 
721   const TargetRegisterClass *WaveRC = TRI->getWaveMaskRegClass();
722   const unsigned WaveAndOpc = Subtarget.isWave32() ?
723     AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
724   const unsigned MovTermOpc = Subtarget.isWave32() ?
725     AMDGPU::S_MOV_B32_term : AMDGPU::S_MOV_B64_term;
726   const unsigned XorTermOpc = Subtarget.isWave32() ?
727     AMDGPU::S_XOR_B32_term : AMDGPU::S_XOR_B64_term;
728   const unsigned AndSaveExecOpc =  Subtarget.isWave32() ?
729     AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64;
730   const unsigned ExecReg =  Subtarget.isWave32() ?
731     AMDGPU::EXEC_LO : AMDGPU::EXEC;
732 
733 #ifndef NDEBUG
734   const int OrigRangeSize = std::distance(Range.begin(), Range.end());
735 #endif
736 
737   for (MachineInstr &MI : Range) {
738     for (MachineOperand &Def : MI.defs()) {
739       if (MRI.use_nodbg_empty(Def.getReg()))
740         continue;
741 
742       LLT ResTy = MRI.getType(Def.getReg());
743       const RegisterBank *DefBank = getRegBank(Def.getReg(), MRI, *TRI);
744       ResultRegs.push_back(Def.getReg());
745       Register InitReg = B.buildUndef(ResTy).getReg(0);
746       Register PhiReg = MRI.createGenericVirtualRegister(ResTy);
747       InitResultRegs.push_back(InitReg);
748       PhiRegs.push_back(PhiReg);
749       MRI.setRegBank(PhiReg, *DefBank);
750       MRI.setRegBank(InitReg, *DefBank);
751     }
752   }
753 
754   Register SaveExecReg = MRI.createVirtualRegister(WaveRC);
755   Register InitSaveExecReg = MRI.createVirtualRegister(WaveRC);
756 
757   // Don't bother using generic instructions/registers for the exec mask.
758   B.buildInstr(TargetOpcode::IMPLICIT_DEF)
759     .addDef(InitSaveExecReg);
760 
761   Register PhiExec = MRI.createVirtualRegister(WaveRC);
762   Register NewExec = MRI.createVirtualRegister(WaveRC);
763 
764   // To insert the loop we need to split the block. Move everything before this
765   // point to a new block, and insert a new empty block before this instruction.
766   MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
767   MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
768   MachineBasicBlock *RestoreExecBB = MF->CreateMachineBasicBlock();
769   MachineFunction::iterator MBBI(MBB);
770   ++MBBI;
771   MF->insert(MBBI, LoopBB);
772   MF->insert(MBBI, RestoreExecBB);
773   MF->insert(MBBI, RemainderBB);
774 
775   LoopBB->addSuccessor(RestoreExecBB);
776   LoopBB->addSuccessor(LoopBB);
777 
778   // Move the rest of the block into a new block.
779   RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
780   RemainderBB->splice(RemainderBB->begin(), &MBB, Range.end(), MBB.end());
781 
782   MBB.addSuccessor(LoopBB);
783   RestoreExecBB->addSuccessor(RemainderBB);
784 
785   B.setInsertPt(*LoopBB, LoopBB->end());
786 
787   B.buildInstr(TargetOpcode::PHI)
788     .addDef(PhiExec)
789     .addReg(InitSaveExecReg)
790     .addMBB(&MBB)
791     .addReg(NewExec)
792     .addMBB(LoopBB);
793 
794   for (auto Result : zip(InitResultRegs, ResultRegs, PhiRegs)) {
795     B.buildInstr(TargetOpcode::G_PHI)
796       .addDef(std::get<2>(Result))
797       .addReg(std::get<0>(Result)) // Initial value / implicit_def
798       .addMBB(&MBB)
799       .addReg(std::get<1>(Result)) // Mid-loop value.
800       .addMBB(LoopBB);
801   }
802 
803   const DebugLoc &DL = B.getDL();
804 
805   MachineInstr &FirstInst = *Range.begin();
806 
807   // Move the instruction into the loop. Note we moved everything after
808   // Range.end() already into a new block, so Range.end() is no longer valid.
809   LoopBB->splice(LoopBB->end(), &MBB, Range.begin(), MBB.end());
810 
811   // Figure out the iterator range after splicing the instructions.
812   MachineBasicBlock::iterator NewBegin = FirstInst.getIterator();
813   auto NewEnd = LoopBB->end();
814 
815   MachineBasicBlock::iterator I = Range.begin();
816   B.setInsertPt(*LoopBB, I);
817 
818   Register CondReg;
819 
820   assert(std::distance(NewBegin, NewEnd) == OrigRangeSize);
821 
822   for (MachineInstr &MI : make_range(NewBegin, NewEnd)) {
823     for (MachineOperand &Op : MI.uses()) {
824       if (!Op.isReg() || Op.isDef())
825         continue;
826 
827       Register OldReg = Op.getReg();
828       if (!SGPROperandRegs.count(OldReg))
829         continue;
830 
831       // See if we already processed this register in another instruction in the
832       // sequence.
833       auto OldVal = WaterfalledRegMap.find(OldReg);
834       if (OldVal != WaterfalledRegMap.end()) {
835         Op.setReg(OldVal->second);
836         continue;
837       }
838 
839       Register OpReg = Op.getReg();
840       LLT OpTy = MRI.getType(OpReg);
841 
842       const RegisterBank *OpBank = getRegBank(OpReg, MRI, *TRI);
843       if (OpBank != &AMDGPU::VGPRRegBank) {
844         // Insert copy from AGPR to VGPR before the loop.
845         B.setMBB(MBB);
846         OpReg = B.buildCopy(OpTy, OpReg).getReg(0);
847         MRI.setRegBank(OpReg, AMDGPU::VGPRRegBank);
848         B.setInstr(*I);
849       }
850 
851       unsigned OpSize = OpTy.getSizeInBits();
852 
853       // Can only do a readlane of 32-bit pieces.
854       if (OpSize == 32) {
855         // Avoid extra copies in the simple case of one 32-bit register.
856         Register CurrentLaneOpReg
857           = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
858         MRI.setType(CurrentLaneOpReg, OpTy);
859 
860         constrainGenericRegister(OpReg, AMDGPU::VGPR_32RegClass, MRI);
861         // Read the next variant <- also loop target.
862         BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
863                 CurrentLaneOpReg)
864           .addReg(OpReg);
865 
866         Register NewCondReg = MRI.createVirtualRegister(WaveRC);
867         bool First = CondReg == AMDGPU::NoRegister;
868         if (First)
869           CondReg = NewCondReg;
870 
871         // Compare the just read M0 value to all possible Idx values.
872         B.buildInstr(AMDGPU::V_CMP_EQ_U32_e64)
873           .addDef(NewCondReg)
874           .addReg(CurrentLaneOpReg)
875           .addReg(OpReg);
876         Op.setReg(CurrentLaneOpReg);
877 
878         if (!First) {
879           Register AndReg = MRI.createVirtualRegister(WaveRC);
880 
881           // If there are multiple operands to consider, and the conditions.
882           B.buildInstr(WaveAndOpc)
883             .addDef(AndReg)
884             .addReg(NewCondReg)
885             .addReg(CondReg);
886           CondReg = AndReg;
887         }
888       } else {
889         LLT S32 = LLT::scalar(32);
890         SmallVector<Register, 8> ReadlanePieces;
891 
892         // The compares can be done as 64-bit, but the extract needs to be done
893         // in 32-bit pieces.
894 
895         bool Is64 = OpSize % 64 == 0;
896 
897         LLT UnmergeTy = OpSize % 64 == 0 ? LLT::scalar(64) : LLT::scalar(32);
898         unsigned CmpOp = OpSize % 64 == 0 ? AMDGPU::V_CMP_EQ_U64_e64
899           : AMDGPU::V_CMP_EQ_U32_e64;
900 
901         // The compares can be done as 64-bit, but the extract needs to be done
902         // in 32-bit pieces.
903 
904         // Insert the unmerge before the loop.
905 
906         B.setMBB(MBB);
907         auto Unmerge = B.buildUnmerge(UnmergeTy, OpReg);
908         B.setInstr(*I);
909 
910         unsigned NumPieces = Unmerge->getNumOperands() - 1;
911         for (unsigned PieceIdx = 0; PieceIdx != NumPieces; ++PieceIdx) {
912           Register UnmergePiece = Unmerge.getReg(PieceIdx);
913 
914           Register CurrentLaneOpReg;
915           if (Is64) {
916             Register CurrentLaneOpRegLo = MRI.createGenericVirtualRegister(S32);
917             Register CurrentLaneOpRegHi = MRI.createGenericVirtualRegister(S32);
918 
919             MRI.setRegClass(UnmergePiece, &AMDGPU::VReg_64RegClass);
920             MRI.setRegClass(CurrentLaneOpRegLo, &AMDGPU::SReg_32_XM0RegClass);
921             MRI.setRegClass(CurrentLaneOpRegHi, &AMDGPU::SReg_32_XM0RegClass);
922 
923             // Read the next variant <- also loop target.
924             BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
925                     CurrentLaneOpRegLo)
926               .addReg(UnmergePiece, 0, AMDGPU::sub0);
927 
928             // Read the next variant <- also loop target.
929             BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
930                     CurrentLaneOpRegHi)
931               .addReg(UnmergePiece, 0, AMDGPU::sub1);
932 
933             CurrentLaneOpReg =
934               B.buildMerge(LLT::scalar(64),
935                            {CurrentLaneOpRegLo, CurrentLaneOpRegHi})
936               .getReg(0);
937 
938             MRI.setRegClass(CurrentLaneOpReg, &AMDGPU::SReg_64_XEXECRegClass);
939 
940             if (OpTy.getScalarSizeInBits() == 64) {
941               // If we need to produce a 64-bit element vector, so use the
942               // merged pieces
943               ReadlanePieces.push_back(CurrentLaneOpReg);
944             } else {
945               // 32-bit element type.
946               ReadlanePieces.push_back(CurrentLaneOpRegLo);
947               ReadlanePieces.push_back(CurrentLaneOpRegHi);
948             }
949           } else {
950             CurrentLaneOpReg = MRI.createGenericVirtualRegister(S32);
951             MRI.setRegClass(UnmergePiece, &AMDGPU::VGPR_32RegClass);
952             MRI.setRegClass(CurrentLaneOpReg, &AMDGPU::SReg_32_XM0RegClass);
953 
954             // Read the next variant <- also loop target.
955             BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
956                     CurrentLaneOpReg)
957               .addReg(UnmergePiece);
958             ReadlanePieces.push_back(CurrentLaneOpReg);
959           }
960 
961           Register NewCondReg = MRI.createVirtualRegister(WaveRC);
962           bool First = CondReg == AMDGPU::NoRegister;
963           if (First)
964             CondReg = NewCondReg;
965 
966           B.buildInstr(CmpOp)
967             .addDef(NewCondReg)
968             .addReg(CurrentLaneOpReg)
969             .addReg(UnmergePiece);
970 
971           if (!First) {
972             Register AndReg = MRI.createVirtualRegister(WaveRC);
973 
974             // If there are multiple operands to consider, and the conditions.
975             B.buildInstr(WaveAndOpc)
976               .addDef(AndReg)
977               .addReg(NewCondReg)
978               .addReg(CondReg);
979             CondReg = AndReg;
980           }
981         }
982 
983         // FIXME: Build merge seems to switch to CONCAT_VECTORS but not
984         // BUILD_VECTOR
985         if (OpTy.isVector()) {
986           auto Merge = B.buildBuildVector(OpTy, ReadlanePieces);
987           Op.setReg(Merge.getReg(0));
988         } else {
989           auto Merge = B.buildMerge(OpTy, ReadlanePieces);
990           Op.setReg(Merge.getReg(0));
991         }
992 
993         MRI.setRegBank(Op.getReg(), AMDGPU::SGPRRegBank);
994       }
995 
996       // Make sure we don't re-process this register again.
997       WaterfalledRegMap.insert(std::make_pair(OldReg, Op.getReg()));
998     }
999   }
1000 
1001   B.setInsertPt(*LoopBB, LoopBB->end());
1002 
1003   // Update EXEC, save the original EXEC value to VCC.
1004   B.buildInstr(AndSaveExecOpc)
1005     .addDef(NewExec)
1006     .addReg(CondReg, RegState::Kill);
1007 
1008   MRI.setSimpleHint(NewExec, CondReg);
1009 
1010   // Update EXEC, switch all done bits to 0 and all todo bits to 1.
1011   B.buildInstr(XorTermOpc)
1012     .addDef(ExecReg)
1013     .addReg(ExecReg)
1014     .addReg(NewExec);
1015 
1016   // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
1017   // s_cbranch_scc0?
1018 
1019   // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
1020   B.buildInstr(AMDGPU::S_CBRANCH_EXECNZ)
1021     .addMBB(LoopBB);
1022 
1023   // Save the EXEC mask before the loop.
1024   BuildMI(MBB, MBB.end(), DL, TII->get(MovTermOpc), SaveExecReg)
1025     .addReg(ExecReg);
1026 
1027   // Restore the EXEC mask after the loop.
1028   B.setMBB(*RestoreExecBB);
1029   B.buildInstr(MovTermOpc)
1030     .addDef(ExecReg)
1031     .addReg(SaveExecReg);
1032 
1033   // Set the insert point after the original instruction, so any new
1034   // instructions will be in the remainder.
1035   B.setInsertPt(*RemainderBB, RemainderBB->begin());
1036 
1037   return true;
1038 }
1039 
1040 // Return any unique registers used by \p MI at \p OpIndices that need to be
1041 // handled in a waterfall loop. Returns these registers in \p
1042 // SGPROperandRegs. Returns true if there are any operands to handle and a
1043 // waterfall loop is necessary.
1044 bool AMDGPURegisterBankInfo::collectWaterfallOperands(
1045   SmallSet<Register, 4> &SGPROperandRegs, MachineInstr &MI,
1046   MachineRegisterInfo &MRI, ArrayRef<unsigned> OpIndices) const {
1047   for (unsigned Op : OpIndices) {
1048     assert(MI.getOperand(Op).isUse());
1049     Register Reg = MI.getOperand(Op).getReg();
1050     const RegisterBank *OpBank = getRegBank(Reg, MRI, *TRI);
1051     if (OpBank->getID() != AMDGPU::SGPRRegBankID)
1052       SGPROperandRegs.insert(Reg);
1053   }
1054 
1055   // No operands need to be replaced, so no need to loop.
1056   return !SGPROperandRegs.empty();
1057 }
1058 
1059 bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
1060   MachineIRBuilder &B, MachineInstr &MI, MachineRegisterInfo &MRI,
1061   ArrayRef<unsigned> OpIndices) const {
1062   // Use a set to avoid extra readfirstlanes in the case where multiple operands
1063   // are the same register.
1064   SmallSet<Register, 4> SGPROperandRegs;
1065 
1066   if (!collectWaterfallOperands(SGPROperandRegs, MI, MRI, OpIndices))
1067     return false;
1068 
1069   MachineBasicBlock::iterator I = MI.getIterator();
1070   return executeInWaterfallLoop(B, make_range(I, std::next(I)),
1071                                 SGPROperandRegs, MRI);
1072 }
1073 
1074 bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
1075   MachineInstr &MI, MachineRegisterInfo &MRI,
1076   ArrayRef<unsigned> OpIndices) const {
1077   MachineIRBuilder B(MI);
1078   return executeInWaterfallLoop(B, MI, MRI, OpIndices);
1079 }
1080 
1081 // Legalize an operand that must be an SGPR by inserting a readfirstlane.
1082 void AMDGPURegisterBankInfo::constrainOpWithReadfirstlane(
1083     MachineInstr &MI, MachineRegisterInfo &MRI, unsigned OpIdx) const {
1084   Register Reg = MI.getOperand(OpIdx).getReg();
1085   const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
1086   if (Bank == &AMDGPU::SGPRRegBank)
1087     return;
1088 
1089   LLT Ty = MRI.getType(Reg);
1090   MachineIRBuilder B(MI);
1091 
1092   if (Bank != &AMDGPU::VGPRRegBank) {
1093     // We need to copy from AGPR to VGPR
1094     Reg = B.buildCopy(Ty, Reg).getReg(0);
1095     MRI.setRegBank(Reg, AMDGPU::VGPRRegBank);
1096   }
1097 
1098   Register SGPR = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1099   B.buildInstr(AMDGPU::V_READFIRSTLANE_B32)
1100     .addDef(SGPR)
1101     .addReg(Reg);
1102 
1103   MRI.setType(SGPR, Ty);
1104 
1105   const TargetRegisterClass *Constrained =
1106       constrainGenericRegister(Reg, AMDGPU::VGPR_32RegClass, MRI);
1107   (void)Constrained;
1108   assert(Constrained && "Failed to constrain readfirstlane src reg");
1109 
1110   MI.getOperand(OpIdx).setReg(SGPR);
1111 }
1112 
1113 /// Split \p Ty into 2 pieces. The first will have \p FirstSize bits, and the
1114 /// rest will be in the remainder.
1115 static std::pair<LLT, LLT> splitUnequalType(LLT Ty, unsigned FirstSize) {
1116   unsigned TotalSize = Ty.getSizeInBits();
1117   if (!Ty.isVector())
1118     return {LLT::scalar(FirstSize), LLT::scalar(TotalSize - FirstSize)};
1119 
1120   LLT EltTy = Ty.getElementType();
1121   unsigned EltSize = EltTy.getSizeInBits();
1122   assert(FirstSize % EltSize == 0);
1123 
1124   unsigned FirstPartNumElts = FirstSize / EltSize;
1125   unsigned RemainderElts = (TotalSize - FirstSize) / EltSize;
1126 
1127   return {LLT::scalarOrVector(ElementCount::getFixed(FirstPartNumElts), EltTy),
1128           LLT::scalarOrVector(ElementCount::getFixed(RemainderElts), EltTy)};
1129 }
1130 
1131 static LLT widen96To128(LLT Ty) {
1132   if (!Ty.isVector())
1133     return LLT::scalar(128);
1134 
1135   LLT EltTy = Ty.getElementType();
1136   assert(128 % EltTy.getSizeInBits() == 0);
1137   return LLT::fixed_vector(128 / EltTy.getSizeInBits(), EltTy);
1138 }
1139 
1140 bool AMDGPURegisterBankInfo::applyMappingLoad(MachineInstr &MI,
1141                         const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
1142                                               MachineRegisterInfo &MRI) const {
1143   Register DstReg = MI.getOperand(0).getReg();
1144   const LLT LoadTy = MRI.getType(DstReg);
1145   unsigned LoadSize = LoadTy.getSizeInBits();
1146   const unsigned MaxNonSmrdLoadSize = 128;
1147 
1148   const RegisterBank *DstBank =
1149       OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
1150   if (DstBank == &AMDGPU::SGPRRegBank) {
1151     // There are some special cases that we need to look at for 32 bit and 96
1152     // bit SGPR loads otherwise we have nothing to do.
1153     if (LoadSize != 32 && LoadSize != 96)
1154       return false;
1155 
1156     MachineMemOperand *MMO = *MI.memoperands_begin();
1157     const unsigned MemSize = 8 * MMO->getSize();
1158     // Scalar loads of size 8 or 16 bit with proper alignment may be widened to
1159     // 32 bit. Check to see if we need to widen the memory access, 8 or 16 bit
1160     // scalar loads should have a load size of 32 but memory access size of less
1161     // than 32.
1162     if (LoadSize == 32 &&
1163         (MemSize == 32 || LoadTy.isVector() || !isScalarLoadLegal(MI)))
1164       return false;
1165 
1166     Register PtrReg = MI.getOperand(1).getReg();
1167 
1168     ApplyRegBankMapping O(*this, MRI, &AMDGPU::SGPRRegBank);
1169     MachineIRBuilder B(MI, O);
1170 
1171     if (LoadSize == 32) {
1172       // This is an extending load from a sub-dword size. Widen the memory
1173       // access size to 4 bytes and clear the extra high bits appropriately
1174       const LLT S32 = LLT::scalar(32);
1175       if (MI.getOpcode() == AMDGPU::G_SEXTLOAD) {
1176         // Must extend the sign bit into higher bits for a G_SEXTLOAD
1177         auto WideLoad = B.buildLoadFromOffset(S32, PtrReg, *MMO, 0);
1178         B.buildSExtInReg(MI.getOperand(0), WideLoad, MemSize);
1179       } else if (MI.getOpcode() == AMDGPU::G_ZEXTLOAD) {
1180         // Must extend zero into higher bits with an AND for a G_ZEXTLOAD
1181         auto WideLoad = B.buildLoadFromOffset(S32, PtrReg, *MMO, 0);
1182         B.buildZExtInReg(MI.getOperand(0), WideLoad, MemSize);
1183       } else
1184         // We do not need to touch the higher bits for regular loads.
1185         B.buildLoadFromOffset(MI.getOperand(0), PtrReg, *MMO, 0);
1186     } else {
1187       // 96-bit loads are only available for vector loads. We need to split this
1188       // into a 64-bit part, and 32 (unless we can widen to a 128-bit load).
1189       if (MMO->getAlign() < Align(16)) {
1190         LLT Part64, Part32;
1191         std::tie(Part64, Part32) = splitUnequalType(LoadTy, 64);
1192         auto Load0 = B.buildLoadFromOffset(Part64, PtrReg, *MMO, 0);
1193         auto Load1 = B.buildLoadFromOffset(Part32, PtrReg, *MMO, 8);
1194 
1195         auto Undef = B.buildUndef(LoadTy);
1196         auto Ins0 = B.buildInsert(LoadTy, Undef, Load0, 0);
1197         B.buildInsert(MI.getOperand(0), Ins0, Load1, 64);
1198       } else {
1199         LLT WiderTy = widen96To128(LoadTy);
1200         auto WideLoad = B.buildLoadFromOffset(WiderTy, PtrReg, *MMO, 0);
1201         B.buildExtract(MI.getOperand(0), WideLoad, 0);
1202       }
1203     }
1204 
1205     MI.eraseFromParent();
1206     return true;
1207   }
1208 
1209   // 128-bit loads are supported for all instruction types.
1210   if (LoadSize <= MaxNonSmrdLoadSize)
1211     return false;
1212 
1213   SmallVector<Register, 16> DefRegs(OpdMapper.getVRegs(0));
1214   SmallVector<Register, 1> SrcRegs(OpdMapper.getVRegs(1));
1215 
1216   if (SrcRegs.empty())
1217     SrcRegs.push_back(MI.getOperand(1).getReg());
1218 
1219   assert(LoadSize % MaxNonSmrdLoadSize == 0);
1220 
1221   // RegBankSelect only emits scalar types, so we need to reset the pointer
1222   // operand to a pointer type.
1223   Register BasePtrReg = SrcRegs[0];
1224   LLT PtrTy = MRI.getType(MI.getOperand(1).getReg());
1225   MRI.setType(BasePtrReg, PtrTy);
1226 
1227   unsigned NumSplitParts = LoadTy.getSizeInBits() / MaxNonSmrdLoadSize;
1228   const LLT LoadSplitTy = LoadTy.divide(NumSplitParts);
1229   ApplyRegBankMapping Observer(*this, MRI, &AMDGPU::VGPRRegBank);
1230   MachineIRBuilder B(MI, Observer);
1231   LegalizerHelper Helper(B.getMF(), Observer, B);
1232 
1233   if (LoadTy.isVector()) {
1234     if (Helper.fewerElementsVector(MI, 0, LoadSplitTy) != LegalizerHelper::Legalized)
1235       return false;
1236   } else {
1237     if (Helper.narrowScalar(MI, 0, LoadSplitTy) != LegalizerHelper::Legalized)
1238       return false;
1239   }
1240 
1241   MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
1242   return true;
1243 }
1244 
1245 bool AMDGPURegisterBankInfo::applyMappingDynStackAlloc(
1246   MachineInstr &MI,
1247   const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
1248   MachineRegisterInfo &MRI) const {
1249   const MachineFunction &MF = *MI.getMF();
1250   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1251   const auto &TFI = *ST.getFrameLowering();
1252 
1253   // Guard in case the stack growth direction ever changes with scratch
1254   // instructions.
1255   if (TFI.getStackGrowthDirection() == TargetFrameLowering::StackGrowsDown)
1256     return false;
1257 
1258   Register Dst = MI.getOperand(0).getReg();
1259   Register AllocSize = MI.getOperand(1).getReg();
1260   Align Alignment = assumeAligned(MI.getOperand(2).getImm());
1261 
1262   const RegisterBank *SizeBank = getRegBank(AllocSize, MRI, *TRI);
1263 
1264   // TODO: Need to emit a wave reduction to get the maximum size.
1265   if (SizeBank != &AMDGPU::SGPRRegBank)
1266     return false;
1267 
1268   LLT PtrTy = MRI.getType(Dst);
1269   LLT IntPtrTy = LLT::scalar(PtrTy.getSizeInBits());
1270 
1271   const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1272   Register SPReg = Info->getStackPtrOffsetReg();
1273   ApplyRegBankMapping ApplyBank(*this, MRI, &AMDGPU::SGPRRegBank);
1274   MachineIRBuilder B(MI, ApplyBank);
1275 
1276   auto WaveSize = B.buildConstant(LLT::scalar(32), ST.getWavefrontSizeLog2());
1277   auto ScaledSize = B.buildShl(IntPtrTy, AllocSize, WaveSize);
1278 
1279   auto SPCopy = B.buildCopy(PtrTy, SPReg);
1280   if (Alignment > TFI.getStackAlign()) {
1281     auto PtrAdd = B.buildPtrAdd(PtrTy, SPCopy, ScaledSize);
1282     B.buildMaskLowPtrBits(Dst, PtrAdd,
1283                           Log2(Alignment) + ST.getWavefrontSizeLog2());
1284   } else {
1285     B.buildPtrAdd(Dst, SPCopy, ScaledSize);
1286   }
1287 
1288   MI.eraseFromParent();
1289   return true;
1290 }
1291 
1292 bool AMDGPURegisterBankInfo::applyMappingImage(
1293     MachineInstr &MI, const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
1294     MachineRegisterInfo &MRI, int RsrcIdx) const {
1295   const int NumDefs = MI.getNumExplicitDefs();
1296 
1297   // The reported argument index is relative to the IR intrinsic call arguments,
1298   // so we need to shift by the number of defs and the intrinsic ID.
1299   RsrcIdx += NumDefs + 1;
1300 
1301   // Insert copies to VGPR arguments.
1302   applyDefaultMapping(OpdMapper);
1303 
1304   // Fixup any SGPR arguments.
1305   SmallVector<unsigned, 4> SGPRIndexes;
1306   for (int I = NumDefs, NumOps = MI.getNumOperands(); I != NumOps; ++I) {
1307     if (!MI.getOperand(I).isReg())
1308       continue;
1309 
1310     // If this intrinsic has a sampler, it immediately follows rsrc.
1311     if (I == RsrcIdx || I == RsrcIdx + 1)
1312       SGPRIndexes.push_back(I);
1313   }
1314 
1315   executeInWaterfallLoop(MI, MRI, SGPRIndexes);
1316   return true;
1317 }
1318 
1319 static Register getSrcRegIgnoringCopies(const MachineRegisterInfo &MRI,
1320                                         Register Reg) {
1321   MachineInstr *Def = getDefIgnoringCopies(Reg, MRI);
1322   if (!Def)
1323     return Reg;
1324 
1325   // TODO: Guard against this being an implicit def
1326   return Def->getOperand(0).getReg();
1327 }
1328 
1329 // Analyze a combined offset from an llvm.amdgcn.s.buffer intrinsic and store
1330 // the three offsets (voffset, soffset and instoffset)
1331 static unsigned setBufferOffsets(MachineIRBuilder &B,
1332                                  const AMDGPURegisterBankInfo &RBI,
1333                                  Register CombinedOffset, Register &VOffsetReg,
1334                                  Register &SOffsetReg, int64_t &InstOffsetVal,
1335                                  Align Alignment) {
1336   const LLT S32 = LLT::scalar(32);
1337   MachineRegisterInfo *MRI = B.getMRI();
1338 
1339   if (Optional<int64_t> Imm = getConstantVRegSExtVal(CombinedOffset, *MRI)) {
1340     uint32_t SOffset, ImmOffset;
1341     if (AMDGPU::splitMUBUFOffset(*Imm, SOffset, ImmOffset, &RBI.Subtarget,
1342                                  Alignment)) {
1343       VOffsetReg = B.buildConstant(S32, 0).getReg(0);
1344       SOffsetReg = B.buildConstant(S32, SOffset).getReg(0);
1345       InstOffsetVal = ImmOffset;
1346 
1347       B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank);
1348       B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank);
1349       return SOffset + ImmOffset;
1350     }
1351   }
1352 
1353   Register Base;
1354   unsigned Offset;
1355 
1356   std::tie(Base, Offset) =
1357       AMDGPU::getBaseWithConstantOffset(*MRI, CombinedOffset);
1358 
1359   uint32_t SOffset, ImmOffset;
1360   if ((int)Offset > 0 && AMDGPU::splitMUBUFOffset(Offset, SOffset, ImmOffset,
1361                                                   &RBI.Subtarget, Alignment)) {
1362     if (RBI.getRegBank(Base, *MRI, *RBI.TRI) == &AMDGPU::VGPRRegBank) {
1363       VOffsetReg = Base;
1364       SOffsetReg = B.buildConstant(S32, SOffset).getReg(0);
1365       B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank);
1366       InstOffsetVal = ImmOffset;
1367       return 0; // XXX - Why is this 0?
1368     }
1369 
1370     // If we have SGPR base, we can use it for soffset.
1371     if (SOffset == 0) {
1372       VOffsetReg = B.buildConstant(S32, 0).getReg(0);
1373       B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank);
1374       SOffsetReg = Base;
1375       InstOffsetVal = ImmOffset;
1376       return 0; // XXX - Why is this 0?
1377     }
1378   }
1379 
1380   // Handle the variable sgpr + vgpr case.
1381   MachineInstr *Add = getOpcodeDef(AMDGPU::G_ADD, CombinedOffset, *MRI);
1382   if (Add && (int)Offset >= 0) {
1383     Register Src0 = getSrcRegIgnoringCopies(*MRI, Add->getOperand(1).getReg());
1384     Register Src1 = getSrcRegIgnoringCopies(*MRI, Add->getOperand(2).getReg());
1385 
1386     const RegisterBank *Src0Bank = RBI.getRegBank(Src0, *MRI, *RBI.TRI);
1387     const RegisterBank *Src1Bank = RBI.getRegBank(Src1, *MRI, *RBI.TRI);
1388 
1389     if (Src0Bank == &AMDGPU::VGPRRegBank && Src1Bank == &AMDGPU::SGPRRegBank) {
1390       VOffsetReg = Src0;
1391       SOffsetReg = Src1;
1392       return 0;
1393     }
1394 
1395     if (Src0Bank == &AMDGPU::SGPRRegBank && Src1Bank == &AMDGPU::VGPRRegBank) {
1396       VOffsetReg = Src1;
1397       SOffsetReg = Src0;
1398       return 0;
1399     }
1400   }
1401 
1402   // Ensure we have a VGPR for the combined offset. This could be an issue if we
1403   // have an SGPR offset and a VGPR resource.
1404   if (RBI.getRegBank(CombinedOffset, *MRI, *RBI.TRI) == &AMDGPU::VGPRRegBank) {
1405     VOffsetReg = CombinedOffset;
1406   } else {
1407     VOffsetReg = B.buildCopy(S32, CombinedOffset).getReg(0);
1408     B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank);
1409   }
1410 
1411   SOffsetReg = B.buildConstant(S32, 0).getReg(0);
1412   B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank);
1413   return 0;
1414 }
1415 
1416 bool AMDGPURegisterBankInfo::applyMappingSBufferLoad(
1417   const OperandsMapper &OpdMapper) const {
1418   MachineInstr &MI = OpdMapper.getMI();
1419   MachineRegisterInfo &MRI = OpdMapper.getMRI();
1420 
1421   const LLT S32 = LLT::scalar(32);
1422   Register Dst = MI.getOperand(0).getReg();
1423   LLT Ty = MRI.getType(Dst);
1424 
1425   const RegisterBank *RSrcBank =
1426     OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
1427   const RegisterBank *OffsetBank =
1428     OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
1429   if (RSrcBank == &AMDGPU::SGPRRegBank &&
1430       OffsetBank == &AMDGPU::SGPRRegBank)
1431     return true; // Legal mapping
1432 
1433   // FIXME: 96-bit case was widened during legalize. We neeed to narrow it back
1434   // here but don't have an MMO.
1435 
1436   unsigned LoadSize = Ty.getSizeInBits();
1437   int NumLoads = 1;
1438   if (LoadSize == 256 || LoadSize == 512) {
1439     NumLoads = LoadSize / 128;
1440     Ty = Ty.divide(NumLoads);
1441   }
1442 
1443   // Use the alignment to ensure that the required offsets will fit into the
1444   // immediate offsets.
1445   const Align Alignment = NumLoads > 1 ? Align(16 * NumLoads) : Align(1);
1446 
1447   MachineIRBuilder B(MI);
1448   MachineFunction &MF = B.getMF();
1449 
1450   Register SOffset;
1451   Register VOffset;
1452   int64_t ImmOffset = 0;
1453 
1454   unsigned MMOOffset = setBufferOffsets(B, *this, MI.getOperand(2).getReg(),
1455                                         VOffset, SOffset, ImmOffset, Alignment);
1456 
1457   // TODO: 96-bit loads were widened to 128-bit results. Shrink the result if we
1458   // can, but we neeed to track an MMO for that.
1459   const unsigned MemSize = (Ty.getSizeInBits() + 7) / 8;
1460   const Align MemAlign(4); // FIXME: ABI type alignment?
1461   MachineMemOperand *BaseMMO = MF.getMachineMemOperand(
1462     MachinePointerInfo(),
1463     MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1464     MachineMemOperand::MOInvariant,
1465     MemSize, MemAlign);
1466   if (MMOOffset != 0)
1467     BaseMMO = MF.getMachineMemOperand(BaseMMO, MMOOffset, MemSize);
1468 
1469   // If only the offset is divergent, emit a MUBUF buffer load instead. We can
1470   // assume that the buffer is unswizzled.
1471 
1472   Register RSrc = MI.getOperand(1).getReg();
1473   Register VIndex = B.buildConstant(S32, 0).getReg(0);
1474   B.getMRI()->setRegBank(VIndex, AMDGPU::VGPRRegBank);
1475 
1476   SmallVector<Register, 4> LoadParts(NumLoads);
1477 
1478   MachineBasicBlock::iterator MII = MI.getIterator();
1479   MachineInstrSpan Span(MII, &B.getMBB());
1480 
1481   for (int i = 0; i < NumLoads; ++i) {
1482     if (NumLoads == 1) {
1483       LoadParts[i] = Dst;
1484     } else {
1485       LoadParts[i] = MRI.createGenericVirtualRegister(Ty);
1486       MRI.setRegBank(LoadParts[i], AMDGPU::VGPRRegBank);
1487     }
1488 
1489     MachineMemOperand *MMO = BaseMMO;
1490     if (i != 0)
1491       BaseMMO = MF.getMachineMemOperand(BaseMMO, MMOOffset + 16 * i, MemSize);
1492 
1493     B.buildInstr(AMDGPU::G_AMDGPU_BUFFER_LOAD)
1494       .addDef(LoadParts[i])       // vdata
1495       .addUse(RSrc)               // rsrc
1496       .addUse(VIndex)             // vindex
1497       .addUse(VOffset)            // voffset
1498       .addUse(SOffset)            // soffset
1499       .addImm(ImmOffset + 16 * i) // offset(imm)
1500       .addImm(0)                  // cachepolicy, swizzled buffer(imm)
1501       .addImm(0)                  // idxen(imm)
1502       .addMemOperand(MMO);
1503   }
1504 
1505   // TODO: If only the resource is a VGPR, it may be better to execute the
1506   // scalar load in the waterfall loop if the resource is expected to frequently
1507   // be dynamically uniform.
1508   if (RSrcBank != &AMDGPU::SGPRRegBank) {
1509     // Remove the original instruction to avoid potentially confusing the
1510     // waterfall loop logic.
1511     B.setInstr(*Span.begin());
1512     MI.eraseFromParent();
1513 
1514     SmallSet<Register, 4> OpsToWaterfall;
1515 
1516     OpsToWaterfall.insert(RSrc);
1517     executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()),
1518                            OpsToWaterfall, MRI);
1519   }
1520 
1521   if (NumLoads != 1) {
1522     if (Ty.isVector())
1523       B.buildConcatVectors(Dst, LoadParts);
1524     else
1525       B.buildMerge(Dst, LoadParts);
1526   }
1527 
1528   // We removed the instruction earlier with a waterfall loop.
1529   if (RSrcBank == &AMDGPU::SGPRRegBank)
1530     MI.eraseFromParent();
1531 
1532   return true;
1533 }
1534 
1535 bool AMDGPURegisterBankInfo::applyMappingBFE(const OperandsMapper &OpdMapper,
1536                                              bool Signed) const {
1537   MachineInstr &MI = OpdMapper.getMI();
1538   MachineRegisterInfo &MRI = OpdMapper.getMRI();
1539 
1540   // Insert basic copies
1541   applyDefaultMapping(OpdMapper);
1542 
1543   Register DstReg = MI.getOperand(0).getReg();
1544   LLT Ty = MRI.getType(DstReg);
1545 
1546   const LLT S32 = LLT::scalar(32);
1547 
1548   unsigned FirstOpnd = MI.getOpcode() == AMDGPU::G_INTRINSIC ? 2 : 1;
1549   Register SrcReg = MI.getOperand(FirstOpnd).getReg();
1550   Register OffsetReg = MI.getOperand(FirstOpnd + 1).getReg();
1551   Register WidthReg = MI.getOperand(FirstOpnd + 2).getReg();
1552 
1553   const RegisterBank *DstBank =
1554     OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
1555   if (DstBank == &AMDGPU::VGPRRegBank) {
1556     if (Ty == S32)
1557       return true;
1558 
1559     // There is no 64-bit vgpr bitfield extract instructions so the operation
1560     // is expanded to a sequence of instructions that implement the operation.
1561     ApplyRegBankMapping ApplyBank(*this, MRI, &AMDGPU::VGPRRegBank);
1562     MachineIRBuilder B(MI, ApplyBank);
1563 
1564     const LLT S64 = LLT::scalar(64);
1565     // Shift the source operand so that extracted bits start at bit 0.
1566     auto ShiftOffset = Signed ? B.buildAShr(S64, SrcReg, OffsetReg)
1567                               : B.buildLShr(S64, SrcReg, OffsetReg);
1568     auto UnmergeSOffset = B.buildUnmerge({S32, S32}, ShiftOffset);
1569 
1570     // A 64-bit bitfield extract uses the 32-bit bitfield extract instructions
1571     // if the width is a constant.
1572     if (auto ConstWidth = getConstantVRegValWithLookThrough(WidthReg, MRI)) {
1573       // Use the 32-bit bitfield extract instruction if the width is a constant.
1574       // Depending on the width size, use either the low or high 32-bits.
1575       auto Zero = B.buildConstant(S32, 0);
1576       auto WidthImm = ConstWidth->Value.getZExtValue();
1577       if (WidthImm <= 32) {
1578         // Use bitfield extract on the lower 32-bit source, and then sign-extend
1579         // or clear the upper 32-bits.
1580         auto Extract =
1581             Signed ? B.buildSbfx(S32, UnmergeSOffset.getReg(0), Zero, WidthReg)
1582                    : B.buildUbfx(S32, UnmergeSOffset.getReg(0), Zero, WidthReg);
1583         auto Extend =
1584             Signed ? B.buildAShr(S32, Extract, B.buildConstant(S32, 31)) : Zero;
1585         B.buildMerge(DstReg, {Extract, Extend});
1586       } else {
1587         // Use bitfield extract on upper 32-bit source, and combine with lower
1588         // 32-bit source.
1589         auto UpperWidth = B.buildConstant(S32, WidthImm - 32);
1590         auto Extract =
1591             Signed
1592                 ? B.buildSbfx(S32, UnmergeSOffset.getReg(1), Zero, UpperWidth)
1593                 : B.buildUbfx(S32, UnmergeSOffset.getReg(1), Zero, UpperWidth);
1594         B.buildMerge(DstReg, {UnmergeSOffset.getReg(0), Extract});
1595       }
1596       MI.eraseFromParent();
1597       return true;
1598     }
1599 
1600     // Expand to Src >> Offset << (64 - Width) >> (64 - Width) using 64-bit
1601     // operations.
1602     auto ExtShift = B.buildSub(S32, B.buildConstant(S32, 64), WidthReg);
1603     auto SignBit = B.buildShl(S64, ShiftOffset, ExtShift);
1604     if (Signed)
1605       B.buildAShr(S64, SignBit, ExtShift);
1606     else
1607       B.buildLShr(S64, SignBit, ExtShift);
1608     MI.eraseFromParent();
1609     return true;
1610   }
1611 
1612   // The scalar form packs the offset and width in a single operand.
1613 
1614   ApplyRegBankMapping ApplyBank(*this, MRI, &AMDGPU::SGPRRegBank);
1615   MachineIRBuilder B(MI, ApplyBank);
1616 
1617   // Ensure the high bits are clear to insert the offset.
1618   auto OffsetMask = B.buildConstant(S32, maskTrailingOnes<unsigned>(6));
1619   auto ClampOffset = B.buildAnd(S32, OffsetReg, OffsetMask);
1620 
1621   // Zeros out the low bits, so don't bother clamping the input value.
1622   auto ShiftWidth = B.buildShl(S32, WidthReg, B.buildConstant(S32, 16));
1623 
1624   // Transformation function, pack the offset and width of a BFE into
1625   // the format expected by the S_BFE_I32 / S_BFE_U32. In the second
1626   // source, bits [5:0] contain the offset and bits [22:16] the width.
1627   auto MergedInputs = B.buildOr(S32, ClampOffset, ShiftWidth);
1628 
1629   // TODO: It might be worth using a pseudo here to avoid scc clobber and
1630   // register class constraints.
1631   unsigned Opc = Ty == S32 ? (Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32) :
1632                              (Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64);
1633 
1634   auto MIB = B.buildInstr(Opc, {DstReg}, {SrcReg, MergedInputs});
1635   if (!constrainSelectedInstRegOperands(*MIB, *TII, *TRI, *this))
1636     llvm_unreachable("failed to constrain BFE");
1637 
1638   MI.eraseFromParent();
1639   return true;
1640 }
1641 
1642 // Return a suitable opcode for extending the operands of Opc when widening.
1643 static unsigned getExtendOp(unsigned Opc) {
1644   switch (Opc) {
1645   case TargetOpcode::G_ASHR:
1646   case TargetOpcode::G_SMIN:
1647   case TargetOpcode::G_SMAX:
1648     return TargetOpcode::G_SEXT;
1649   case TargetOpcode::G_LSHR:
1650   case TargetOpcode::G_UMIN:
1651   case TargetOpcode::G_UMAX:
1652     return TargetOpcode::G_ZEXT;
1653   default:
1654     return TargetOpcode::G_ANYEXT;
1655   }
1656 }
1657 
1658 // Emit a legalized extension from <2 x s16> to 2 32-bit components, avoiding
1659 // any illegal vector extend or unmerge operations.
1660 static std::pair<Register, Register>
1661 unpackV2S16ToS32(MachineIRBuilder &B, Register Src, unsigned ExtOpcode) {
1662   const LLT S32 = LLT::scalar(32);
1663   auto Bitcast = B.buildBitcast(S32, Src);
1664 
1665   if (ExtOpcode == TargetOpcode::G_SEXT) {
1666     auto ExtLo = B.buildSExtInReg(S32, Bitcast, 16);
1667     auto ShiftHi = B.buildAShr(S32, Bitcast, B.buildConstant(S32, 16));
1668     return std::make_pair(ExtLo.getReg(0), ShiftHi.getReg(0));
1669   }
1670 
1671   auto ShiftHi = B.buildLShr(S32, Bitcast, B.buildConstant(S32, 16));
1672   if (ExtOpcode == TargetOpcode::G_ZEXT) {
1673     auto ExtLo = B.buildAnd(S32, Bitcast, B.buildConstant(S32, 0xffff));
1674     return std::make_pair(ExtLo.getReg(0), ShiftHi.getReg(0));
1675   }
1676 
1677   assert(ExtOpcode == TargetOpcode::G_ANYEXT);
1678   return std::make_pair(Bitcast.getReg(0), ShiftHi.getReg(0));
1679 }
1680 
1681 // For cases where only a single copy is inserted for matching register banks.
1682 // Replace the register in the instruction operand
1683 static bool substituteSimpleCopyRegs(
1684   const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper, unsigned OpIdx) {
1685   SmallVector<unsigned, 1> SrcReg(OpdMapper.getVRegs(OpIdx));
1686   if (!SrcReg.empty()) {
1687     assert(SrcReg.size() == 1);
1688     OpdMapper.getMI().getOperand(OpIdx).setReg(SrcReg[0]);
1689     return true;
1690   }
1691 
1692   return false;
1693 }
1694 
1695 /// Handle register layout difference for f16 images for some subtargets.
1696 Register AMDGPURegisterBankInfo::handleD16VData(MachineIRBuilder &B,
1697                                                 MachineRegisterInfo &MRI,
1698                                                 Register Reg) const {
1699   if (!Subtarget.hasUnpackedD16VMem())
1700     return Reg;
1701 
1702   const LLT S16 = LLT::scalar(16);
1703   LLT StoreVT = MRI.getType(Reg);
1704   if (!StoreVT.isVector() || StoreVT.getElementType() != S16)
1705     return Reg;
1706 
1707   auto Unmerge = B.buildUnmerge(S16, Reg);
1708 
1709 
1710   SmallVector<Register, 4> WideRegs;
1711   for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
1712     WideRegs.push_back(Unmerge.getReg(I));
1713 
1714   const LLT S32 = LLT::scalar(32);
1715   int NumElts = StoreVT.getNumElements();
1716 
1717   return B.buildMerge(LLT::fixed_vector(NumElts, S32), WideRegs).getReg(0);
1718 }
1719 
1720 static std::pair<Register, unsigned>
1721 getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg) {
1722   int64_t Const;
1723   if (mi_match(Reg, MRI, m_ICst(Const)))
1724     return std::make_pair(Register(), Const);
1725 
1726   Register Base;
1727   if (mi_match(Reg, MRI, m_GAdd(m_Reg(Base), m_ICst(Const))))
1728     return std::make_pair(Base, Const);
1729 
1730   // TODO: Handle G_OR used for add case
1731   return std::make_pair(Reg, 0);
1732 }
1733 
1734 std::pair<Register, unsigned>
1735 AMDGPURegisterBankInfo::splitBufferOffsets(MachineIRBuilder &B,
1736                                            Register OrigOffset) const {
1737   const unsigned MaxImm = 4095;
1738   Register BaseReg;
1739   unsigned ImmOffset;
1740   const LLT S32 = LLT::scalar(32);
1741 
1742   std::tie(BaseReg, ImmOffset) = getBaseWithConstantOffset(*B.getMRI(),
1743                                                            OrigOffset);
1744 
1745   unsigned C1 = 0;
1746   if (ImmOffset != 0) {
1747     // If the immediate value is too big for the immoffset field, put the value
1748     // and -4096 into the immoffset field so that the value that is copied/added
1749     // for the voffset field is a multiple of 4096, and it stands more chance
1750     // of being CSEd with the copy/add for another similar load/store.
1751     // However, do not do that rounding down to a multiple of 4096 if that is a
1752     // negative number, as it appears to be illegal to have a negative offset
1753     // in the vgpr, even if adding the immediate offset makes it positive.
1754     unsigned Overflow = ImmOffset & ~MaxImm;
1755     ImmOffset -= Overflow;
1756     if ((int32_t)Overflow < 0) {
1757       Overflow += ImmOffset;
1758       ImmOffset = 0;
1759     }
1760 
1761     C1 = ImmOffset;
1762     if (Overflow != 0) {
1763       if (!BaseReg)
1764         BaseReg = B.buildConstant(S32, Overflow).getReg(0);
1765       else {
1766         auto OverflowVal = B.buildConstant(S32, Overflow);
1767         BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
1768       }
1769     }
1770   }
1771 
1772   if (!BaseReg)
1773     BaseReg = B.buildConstant(S32, 0).getReg(0);
1774 
1775   return {BaseReg, C1};
1776 }
1777 
1778 static bool isZero(Register Reg, MachineRegisterInfo &MRI) {
1779   int64_t C;
1780   return mi_match(Reg, MRI, m_ICst(C)) && C == 0;
1781 }
1782 
1783 static unsigned extractCPol(unsigned CachePolicy) {
1784   return CachePolicy & AMDGPU::CPol::ALL;
1785 }
1786 
1787 static unsigned extractSWZ(unsigned CachePolicy) {
1788   return (CachePolicy >> 3) & 1;
1789 }
1790 
1791 
1792 MachineInstr *
1793 AMDGPURegisterBankInfo::selectStoreIntrinsic(MachineIRBuilder &B,
1794                                              MachineInstr &MI) const {
1795    MachineRegisterInfo &MRI = *B.getMRI();
1796   executeInWaterfallLoop(B, MI, MRI, {2, 4});
1797 
1798   // FIXME: DAG lowering brokenly changes opcode based on FP vs. integer.
1799 
1800   Register VData = MI.getOperand(1).getReg();
1801   LLT Ty = MRI.getType(VData);
1802 
1803   int EltSize = Ty.getScalarSizeInBits();
1804   int Size = Ty.getSizeInBits();
1805 
1806   // FIXME: Broken integer truncstore.
1807   if (EltSize != 32)
1808     report_fatal_error("unhandled intrinsic store");
1809 
1810   // FIXME: Verifier should enforce 1 MMO for these intrinsics.
1811   const int MemSize = (*MI.memoperands_begin())->getSize();
1812 
1813 
1814   Register RSrc = MI.getOperand(2).getReg();
1815   Register VOffset = MI.getOperand(3).getReg();
1816   Register SOffset = MI.getOperand(4).getReg();
1817   unsigned CachePolicy = MI.getOperand(5).getImm();
1818 
1819   unsigned ImmOffset;
1820   std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
1821 
1822   const bool Offen = !isZero(VOffset, MRI);
1823 
1824   unsigned Opc = AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact;
1825   switch (8 * MemSize) {
1826   case 8:
1827     Opc = Offen ? AMDGPU::BUFFER_STORE_BYTE_OFFEN_exact :
1828                   AMDGPU::BUFFER_STORE_BYTE_OFFSET_exact;
1829     break;
1830   case 16:
1831     Opc = Offen ? AMDGPU::BUFFER_STORE_SHORT_OFFEN_exact :
1832                   AMDGPU::BUFFER_STORE_SHORT_OFFSET_exact;
1833     break;
1834   default:
1835     Opc = Offen ? AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact :
1836                   AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact;
1837     if (Size > 32)
1838       Opc = AMDGPU::getMUBUFOpcode(Opc, Size / 32);
1839     break;
1840   }
1841 
1842 
1843   // Set the insertion point back to the instruction in case it was moved into a
1844   // loop.
1845   B.setInstr(MI);
1846 
1847   MachineInstrBuilder MIB = B.buildInstr(Opc)
1848     .addUse(VData);
1849 
1850   if (Offen)
1851     MIB.addUse(VOffset);
1852 
1853   MIB.addUse(RSrc)
1854      .addUse(SOffset)
1855      .addImm(ImmOffset)
1856      .addImm(extractCPol(CachePolicy))
1857      .addImm(0) // tfe: FIXME: Remove from inst
1858      .addImm(extractSWZ(CachePolicy))
1859      .cloneMemRefs(MI);
1860 
1861   // FIXME: We need a way to report failure from applyMappingImpl.
1862   // Insert constrain copies before inserting the loop.
1863   if (!constrainSelectedInstRegOperands(*MIB, *TII, *TRI, *this))
1864     report_fatal_error("failed to constrain selected store intrinsic");
1865 
1866   return MIB;
1867 }
1868 
1869 bool AMDGPURegisterBankInfo::buildVCopy(MachineIRBuilder &B, Register DstReg,
1870                                         Register SrcReg) const {
1871   MachineRegisterInfo &MRI = *B.getMRI();
1872   LLT SrcTy = MRI.getType(SrcReg);
1873   if (SrcTy.getSizeInBits() == 32) {
1874     // Use a v_mov_b32 here to make the exec dependency explicit.
1875     B.buildInstr(AMDGPU::V_MOV_B32_e32)
1876       .addDef(DstReg)
1877       .addUse(SrcReg);
1878     return constrainGenericRegister(DstReg, AMDGPU::VGPR_32RegClass, MRI) &&
1879            constrainGenericRegister(SrcReg, AMDGPU::SReg_32RegClass, MRI);
1880   }
1881 
1882   Register TmpReg0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1883   Register TmpReg1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1884 
1885   B.buildInstr(AMDGPU::V_MOV_B32_e32)
1886     .addDef(TmpReg0)
1887     .addUse(SrcReg, 0, AMDGPU::sub0);
1888   B.buildInstr(AMDGPU::V_MOV_B32_e32)
1889     .addDef(TmpReg1)
1890     .addUse(SrcReg, 0, AMDGPU::sub1);
1891   B.buildInstr(AMDGPU::REG_SEQUENCE)
1892     .addDef(DstReg)
1893     .addUse(TmpReg0)
1894     .addImm(AMDGPU::sub0)
1895     .addUse(TmpReg1)
1896     .addImm(AMDGPU::sub1);
1897 
1898   return constrainGenericRegister(SrcReg, AMDGPU::SReg_64RegClass, MRI) &&
1899          constrainGenericRegister(DstReg, AMDGPU::VReg_64RegClass, MRI);
1900 }
1901 
1902 /// Utility function for pushing dynamic vector indexes with a constant offset
1903 /// into waterwall loops.
1904 static void reinsertVectorIndexAdd(MachineIRBuilder &B,
1905                                    MachineInstr &IdxUseInstr,
1906                                    unsigned OpIdx,
1907                                    unsigned ConstOffset) {
1908   MachineRegisterInfo &MRI = *B.getMRI();
1909   const LLT S32 = LLT::scalar(32);
1910   Register WaterfallIdx = IdxUseInstr.getOperand(OpIdx).getReg();
1911   B.setInsertPt(*IdxUseInstr.getParent(), IdxUseInstr.getIterator());
1912 
1913   auto MaterializedOffset = B.buildConstant(S32, ConstOffset);
1914 
1915   auto Add = B.buildAdd(S32, WaterfallIdx, MaterializedOffset);
1916   MRI.setRegBank(MaterializedOffset.getReg(0), AMDGPU::SGPRRegBank);
1917   MRI.setRegBank(Add.getReg(0), AMDGPU::SGPRRegBank);
1918   IdxUseInstr.getOperand(OpIdx).setReg(Add.getReg(0));
1919 }
1920 
1921 /// Implement extending a 32-bit value to a 64-bit value. \p Lo32Reg is the
1922 /// original 32-bit source value (to be inserted in the low part of the combined
1923 /// 64-bit result), and \p Hi32Reg is the high half of the combined 64-bit
1924 /// value.
1925 static void extendLow32IntoHigh32(MachineIRBuilder &B,
1926                                   Register Hi32Reg, Register Lo32Reg,
1927                                   unsigned ExtOpc,
1928                                   const RegisterBank &RegBank,
1929                                   bool IsBooleanSrc = false) {
1930   if (ExtOpc == AMDGPU::G_ZEXT) {
1931     B.buildConstant(Hi32Reg, 0);
1932   } else if (ExtOpc == AMDGPU::G_SEXT) {
1933     if (IsBooleanSrc) {
1934       // If we know the original source was an s1, the high half is the same as
1935       // the low.
1936       B.buildCopy(Hi32Reg, Lo32Reg);
1937     } else {
1938       // Replicate sign bit from 32-bit extended part.
1939       auto ShiftAmt = B.buildConstant(LLT::scalar(32), 31);
1940       B.getMRI()->setRegBank(ShiftAmt.getReg(0), RegBank);
1941       B.buildAShr(Hi32Reg, Lo32Reg, ShiftAmt);
1942     }
1943   } else {
1944     assert(ExtOpc == AMDGPU::G_ANYEXT && "not an integer extension");
1945     B.buildUndef(Hi32Reg);
1946   }
1947 }
1948 
1949 bool AMDGPURegisterBankInfo::foldExtractEltToCmpSelect(
1950   MachineInstr &MI, MachineRegisterInfo &MRI,
1951   const OperandsMapper &OpdMapper) const {
1952 
1953   Register VecReg = MI.getOperand(1).getReg();
1954   Register Idx = MI.getOperand(2).getReg();
1955 
1956   const RegisterBank &IdxBank =
1957     *OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
1958 
1959   bool IsDivergentIdx = IdxBank != AMDGPU::SGPRRegBank;
1960 
1961   LLT VecTy = MRI.getType(VecReg);
1962   unsigned EltSize = VecTy.getScalarSizeInBits();
1963   unsigned NumElem = VecTy.getNumElements();
1964 
1965   if (!SITargetLowering::shouldExpandVectorDynExt(EltSize, NumElem,
1966                                                   IsDivergentIdx))
1967     return false;
1968 
1969   MachineIRBuilder B(MI);
1970   LLT S32 = LLT::scalar(32);
1971 
1972   const RegisterBank &DstBank =
1973     *OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
1974   const RegisterBank &SrcBank =
1975     *OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
1976 
1977   const RegisterBank &CCBank =
1978     (DstBank == AMDGPU::SGPRRegBank &&
1979      SrcBank == AMDGPU::SGPRRegBank &&
1980      IdxBank == AMDGPU::SGPRRegBank) ? AMDGPU::SGPRRegBank
1981                                      : AMDGPU::VCCRegBank;
1982   LLT CCTy = (CCBank == AMDGPU::SGPRRegBank) ? S32 : LLT::scalar(1);
1983 
1984   if (CCBank == AMDGPU::VCCRegBank && IdxBank == AMDGPU::SGPRRegBank) {
1985     Idx = B.buildCopy(S32, Idx)->getOperand(0).getReg();
1986     MRI.setRegBank(Idx, AMDGPU::VGPRRegBank);
1987   }
1988 
1989   LLT EltTy = VecTy.getScalarType();
1990   SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0));
1991   unsigned NumLanes = DstRegs.size();
1992   if (!NumLanes)
1993     NumLanes = 1;
1994   else
1995     EltTy = MRI.getType(DstRegs[0]);
1996 
1997   auto UnmergeToEltTy = B.buildUnmerge(EltTy, VecReg);
1998   SmallVector<Register, 2> Res(NumLanes);
1999   for (unsigned L = 0; L < NumLanes; ++L)
2000     Res[L] = UnmergeToEltTy.getReg(L);
2001 
2002   for (unsigned I = 1; I < NumElem; ++I) {
2003     auto IC = B.buildConstant(S32, I);
2004     MRI.setRegBank(IC->getOperand(0).getReg(), AMDGPU::SGPRRegBank);
2005     auto Cmp = B.buildICmp(CmpInst::ICMP_EQ, CCTy, Idx, IC);
2006     MRI.setRegBank(Cmp->getOperand(0).getReg(), CCBank);
2007 
2008     for (unsigned L = 0; L < NumLanes; ++L) {
2009       auto S = B.buildSelect(EltTy, Cmp,
2010                              UnmergeToEltTy.getReg(I * NumLanes + L), Res[L]);
2011 
2012       for (unsigned N : { 0, 2, 3 })
2013         MRI.setRegBank(S->getOperand(N).getReg(), DstBank);
2014 
2015       Res[L] = S->getOperand(0).getReg();
2016     }
2017   }
2018 
2019   for (unsigned L = 0; L < NumLanes; ++L) {
2020     Register DstReg = (NumLanes == 1) ? MI.getOperand(0).getReg() : DstRegs[L];
2021     B.buildCopy(DstReg, Res[L]);
2022     MRI.setRegBank(DstReg, DstBank);
2023   }
2024 
2025   MRI.setRegBank(MI.getOperand(0).getReg(), DstBank);
2026   MI.eraseFromParent();
2027 
2028   return true;
2029 }
2030 
2031 // Insert a cross regbank copy for a register if it already has a bank that
2032 // differs from the one we want to set.
2033 static Register constrainRegToBank(MachineRegisterInfo &MRI,
2034                                    MachineIRBuilder &B, Register &Reg,
2035                                    const RegisterBank &Bank) {
2036   const RegisterBank *CurrBank = MRI.getRegBankOrNull(Reg);
2037   if (CurrBank && *CurrBank != Bank) {
2038     Register Copy = B.buildCopy(MRI.getType(Reg), Reg).getReg(0);
2039     MRI.setRegBank(Copy, Bank);
2040     return Copy;
2041   }
2042 
2043   MRI.setRegBank(Reg, Bank);
2044   return Reg;
2045 }
2046 
2047 bool AMDGPURegisterBankInfo::foldInsertEltToCmpSelect(
2048   MachineInstr &MI, MachineRegisterInfo &MRI,
2049   const OperandsMapper &OpdMapper) const {
2050 
2051   Register VecReg = MI.getOperand(1).getReg();
2052   Register Idx = MI.getOperand(3).getReg();
2053 
2054   const RegisterBank &IdxBank =
2055     *OpdMapper.getInstrMapping().getOperandMapping(3).BreakDown[0].RegBank;
2056 
2057   bool IsDivergentIdx = IdxBank != AMDGPU::SGPRRegBank;
2058 
2059   LLT VecTy = MRI.getType(VecReg);
2060   unsigned EltSize = VecTy.getScalarSizeInBits();
2061   unsigned NumElem = VecTy.getNumElements();
2062 
2063   if (!SITargetLowering::shouldExpandVectorDynExt(EltSize, NumElem,
2064                                                   IsDivergentIdx))
2065     return false;
2066 
2067   MachineIRBuilder B(MI);
2068   LLT S32 = LLT::scalar(32);
2069 
2070   const RegisterBank &DstBank =
2071     *OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2072   const RegisterBank &SrcBank =
2073     *OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
2074   const RegisterBank &InsBank =
2075     *OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
2076 
2077   const RegisterBank &CCBank =
2078     (DstBank == AMDGPU::SGPRRegBank &&
2079      SrcBank == AMDGPU::SGPRRegBank &&
2080      InsBank == AMDGPU::SGPRRegBank &&
2081      IdxBank == AMDGPU::SGPRRegBank) ? AMDGPU::SGPRRegBank
2082                                      : AMDGPU::VCCRegBank;
2083   LLT CCTy = (CCBank == AMDGPU::SGPRRegBank) ? S32 : LLT::scalar(1);
2084 
2085   if (CCBank == AMDGPU::VCCRegBank && IdxBank == AMDGPU::SGPRRegBank) {
2086     Idx = B.buildCopy(S32, Idx)->getOperand(0).getReg();
2087     MRI.setRegBank(Idx, AMDGPU::VGPRRegBank);
2088   }
2089 
2090   LLT EltTy = VecTy.getScalarType();
2091   SmallVector<Register, 2> InsRegs(OpdMapper.getVRegs(2));
2092   unsigned NumLanes = InsRegs.size();
2093   if (!NumLanes) {
2094     NumLanes = 1;
2095     InsRegs.push_back(MI.getOperand(2).getReg());
2096   } else {
2097     EltTy = MRI.getType(InsRegs[0]);
2098   }
2099 
2100   auto UnmergeToEltTy = B.buildUnmerge(EltTy, VecReg);
2101   SmallVector<Register, 16> Ops(NumElem * NumLanes);
2102 
2103   for (unsigned I = 0; I < NumElem; ++I) {
2104     auto IC = B.buildConstant(S32, I);
2105     MRI.setRegBank(IC->getOperand(0).getReg(), AMDGPU::SGPRRegBank);
2106     auto Cmp = B.buildICmp(CmpInst::ICMP_EQ, CCTy, Idx, IC);
2107     MRI.setRegBank(Cmp->getOperand(0).getReg(), CCBank);
2108 
2109     for (unsigned L = 0; L < NumLanes; ++L) {
2110       Register Op0 = constrainRegToBank(MRI, B, InsRegs[L], DstBank);
2111       Register Op1 = UnmergeToEltTy.getReg(I * NumLanes + L);
2112       Op1 = constrainRegToBank(MRI, B, Op1, DstBank);
2113 
2114       Register Select = B.buildSelect(EltTy, Cmp, Op0, Op1).getReg(0);
2115       MRI.setRegBank(Select, DstBank);
2116 
2117       Ops[I * NumLanes + L] = Select;
2118     }
2119   }
2120 
2121   LLT MergeTy = LLT::fixed_vector(Ops.size(), EltTy);
2122   if (MergeTy == MRI.getType(MI.getOperand(0).getReg())) {
2123     B.buildBuildVector(MI.getOperand(0), Ops);
2124   } else {
2125     auto Vec = B.buildBuildVector(MergeTy, Ops);
2126     MRI.setRegBank(Vec->getOperand(0).getReg(), DstBank);
2127     B.buildBitcast(MI.getOperand(0).getReg(), Vec);
2128   }
2129 
2130   MRI.setRegBank(MI.getOperand(0).getReg(), DstBank);
2131   MI.eraseFromParent();
2132 
2133   return true;
2134 }
2135 
2136 void AMDGPURegisterBankInfo::applyMappingImpl(
2137     const OperandsMapper &OpdMapper) const {
2138   MachineInstr &MI = OpdMapper.getMI();
2139   unsigned Opc = MI.getOpcode();
2140   MachineRegisterInfo &MRI = OpdMapper.getMRI();
2141   switch (Opc) {
2142   case AMDGPU::G_PHI: {
2143     Register DstReg = MI.getOperand(0).getReg();
2144     LLT DstTy = MRI.getType(DstReg);
2145     if (DstTy != LLT::scalar(1))
2146       break;
2147 
2148     const LLT S32 = LLT::scalar(32);
2149     const RegisterBank *DstBank =
2150       OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2151     if (DstBank == &AMDGPU::VCCRegBank) {
2152       applyDefaultMapping(OpdMapper);
2153       // The standard handling only considers the result register bank for
2154       // phis. For VCC, blindly inserting a copy when the phi is lowered will
2155       // produce an invalid copy. We can only copy with some kind of compare to
2156       // get a vector boolean result. Insert a regitser bank copy that will be
2157       // correctly lowered to a compare.
2158       MachineIRBuilder B(*MI.getParent()->getParent());
2159 
2160       for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
2161         Register SrcReg = MI.getOperand(I).getReg();
2162         const RegisterBank *SrcBank = getRegBank(SrcReg, MRI, *TRI);
2163 
2164         if (SrcBank != &AMDGPU::VCCRegBank) {
2165           MachineBasicBlock *SrcMBB = MI.getOperand(I + 1).getMBB();
2166           B.setInsertPt(*SrcMBB, SrcMBB->getFirstTerminator());
2167 
2168           auto Copy = B.buildCopy(LLT::scalar(1), SrcReg);
2169           MRI.setRegBank(Copy.getReg(0), AMDGPU::VCCRegBank);
2170           MI.getOperand(I).setReg(Copy.getReg(0));
2171         }
2172       }
2173 
2174       return;
2175     }
2176 
2177     // Phi handling is strange and only considers the bank of the destination.
2178     substituteSimpleCopyRegs(OpdMapper, 0);
2179 
2180     // Promote SGPR/VGPR booleans to s32
2181     MachineFunction *MF = MI.getParent()->getParent();
2182     ApplyRegBankMapping ApplyBank(*this, MRI, DstBank);
2183     MachineIRBuilder B(MI, ApplyBank);
2184     LegalizerHelper Helper(*MF, ApplyBank, B);
2185 
2186     if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized)
2187       llvm_unreachable("widen scalar should have succeeded");
2188 
2189     return;
2190   }
2191   case AMDGPU::G_ICMP:
2192   case AMDGPU::G_UADDO:
2193   case AMDGPU::G_USUBO:
2194   case AMDGPU::G_UADDE:
2195   case AMDGPU::G_SADDE:
2196   case AMDGPU::G_USUBE:
2197   case AMDGPU::G_SSUBE: {
2198     unsigned BoolDstOp = Opc == AMDGPU::G_ICMP ? 0 : 1;
2199     Register DstReg = MI.getOperand(BoolDstOp).getReg();
2200 
2201     const RegisterBank *DstBank =
2202       OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2203     if (DstBank != &AMDGPU::SGPRRegBank)
2204       break;
2205 
2206     const bool HasCarryIn = MI.getNumOperands() == 5;
2207 
2208     // If this is a scalar compare, promote the result to s32, as the selection
2209     // will end up using a copy to a 32-bit vreg.
2210     const LLT S32 = LLT::scalar(32);
2211     Register NewDstReg = MRI.createGenericVirtualRegister(S32);
2212     MRI.setRegBank(NewDstReg, AMDGPU::SGPRRegBank);
2213     MI.getOperand(BoolDstOp).setReg(NewDstReg);
2214     MachineIRBuilder B(MI);
2215 
2216     if (HasCarryIn) {
2217       Register NewSrcReg = MRI.createGenericVirtualRegister(S32);
2218       MRI.setRegBank(NewSrcReg, AMDGPU::SGPRRegBank);
2219       B.buildZExt(NewSrcReg, MI.getOperand(4).getReg());
2220       MI.getOperand(4).setReg(NewSrcReg);
2221     }
2222 
2223     MachineBasicBlock *MBB = MI.getParent();
2224     B.setInsertPt(*MBB, std::next(MI.getIterator()));
2225 
2226     // If we had a constrained VCC result register, a copy was inserted to VCC
2227     // from SGPR.
2228     SmallVector<Register, 1> DefRegs(OpdMapper.getVRegs(0));
2229     if (DefRegs.empty())
2230       DefRegs.push_back(DstReg);
2231     B.buildTrunc(DefRegs[0], NewDstReg);
2232     return;
2233   }
2234   case AMDGPU::G_SELECT: {
2235     Register DstReg = MI.getOperand(0).getReg();
2236     LLT DstTy = MRI.getType(DstReg);
2237 
2238     SmallVector<Register, 1> CondRegs(OpdMapper.getVRegs(1));
2239     if (CondRegs.empty())
2240       CondRegs.push_back(MI.getOperand(1).getReg());
2241     else {
2242       assert(CondRegs.size() == 1);
2243     }
2244 
2245     const RegisterBank *CondBank = getRegBank(CondRegs[0], MRI, *TRI);
2246     if (CondBank == &AMDGPU::SGPRRegBank) {
2247       MachineIRBuilder B(MI);
2248       const LLT S32 = LLT::scalar(32);
2249       Register NewCondReg = MRI.createGenericVirtualRegister(S32);
2250       MRI.setRegBank(NewCondReg, AMDGPU::SGPRRegBank);
2251 
2252       MI.getOperand(1).setReg(NewCondReg);
2253       B.buildZExt(NewCondReg, CondRegs[0]);
2254     }
2255 
2256     if (DstTy.getSizeInBits() != 64)
2257       break;
2258 
2259     MachineIRBuilder B(MI);
2260     LLT HalfTy = getHalfSizedType(DstTy);
2261 
2262     SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
2263     SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2));
2264     SmallVector<Register, 2> Src2Regs(OpdMapper.getVRegs(3));
2265 
2266     // All inputs are SGPRs, nothing special to do.
2267     if (DefRegs.empty()) {
2268       assert(Src1Regs.empty() && Src2Regs.empty());
2269       break;
2270     }
2271 
2272     if (Src1Regs.empty())
2273       split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg());
2274     else {
2275       setRegsToType(MRI, Src1Regs, HalfTy);
2276     }
2277 
2278     if (Src2Regs.empty())
2279       split64BitValueForMapping(B, Src2Regs, HalfTy, MI.getOperand(3).getReg());
2280     else
2281       setRegsToType(MRI, Src2Regs, HalfTy);
2282 
2283     setRegsToType(MRI, DefRegs, HalfTy);
2284 
2285     B.buildSelect(DefRegs[0], CondRegs[0], Src1Regs[0], Src2Regs[0]);
2286     B.buildSelect(DefRegs[1], CondRegs[0], Src1Regs[1], Src2Regs[1]);
2287 
2288     MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
2289     MI.eraseFromParent();
2290     return;
2291   }
2292   case AMDGPU::G_BRCOND: {
2293     Register CondReg = MI.getOperand(0).getReg();
2294     // FIXME: Should use legalizer helper, but should change bool ext type.
2295     const RegisterBank *CondBank =
2296       OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2297 
2298     if (CondBank == &AMDGPU::SGPRRegBank) {
2299       MachineIRBuilder B(MI);
2300       const LLT S32 = LLT::scalar(32);
2301       Register NewCondReg = MRI.createGenericVirtualRegister(S32);
2302       MRI.setRegBank(NewCondReg, AMDGPU::SGPRRegBank);
2303 
2304       MI.getOperand(0).setReg(NewCondReg);
2305       B.buildZExt(NewCondReg, CondReg);
2306       return;
2307     }
2308 
2309     break;
2310   }
2311   case AMDGPU::G_AND:
2312   case AMDGPU::G_OR:
2313   case AMDGPU::G_XOR: {
2314     // 64-bit and is only available on the SALU, so split into 2 32-bit ops if
2315     // there is a VGPR input.
2316     Register DstReg = MI.getOperand(0).getReg();
2317     LLT DstTy = MRI.getType(DstReg);
2318 
2319     if (DstTy.getSizeInBits() == 1) {
2320       const RegisterBank *DstBank =
2321         OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2322       if (DstBank == &AMDGPU::VCCRegBank)
2323         break;
2324 
2325       MachineFunction *MF = MI.getParent()->getParent();
2326       ApplyRegBankMapping ApplyBank(*this, MRI, DstBank);
2327       MachineIRBuilder B(MI, ApplyBank);
2328       LegalizerHelper Helper(*MF, ApplyBank, B);
2329 
2330       if (Helper.widenScalar(MI, 0, LLT::scalar(32)) !=
2331           LegalizerHelper::Legalized)
2332         llvm_unreachable("widen scalar should have succeeded");
2333       return;
2334     }
2335 
2336     if (DstTy.getSizeInBits() != 64)
2337       break;
2338 
2339     LLT HalfTy = getHalfSizedType(DstTy);
2340     SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
2341     SmallVector<Register, 2> Src0Regs(OpdMapper.getVRegs(1));
2342     SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2));
2343 
2344     // All inputs are SGPRs, nothing special to do.
2345     if (DefRegs.empty()) {
2346       assert(Src0Regs.empty() && Src1Regs.empty());
2347       break;
2348     }
2349 
2350     assert(DefRegs.size() == 2);
2351     assert(Src0Regs.size() == Src1Regs.size() &&
2352            (Src0Regs.empty() || Src0Regs.size() == 2));
2353 
2354     // Depending on where the source registers came from, the generic code may
2355     // have decided to split the inputs already or not. If not, we still need to
2356     // extract the values.
2357     MachineIRBuilder B(MI);
2358 
2359     if (Src0Regs.empty())
2360       split64BitValueForMapping(B, Src0Regs, HalfTy, MI.getOperand(1).getReg());
2361     else
2362       setRegsToType(MRI, Src0Regs, HalfTy);
2363 
2364     if (Src1Regs.empty())
2365       split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg());
2366     else
2367       setRegsToType(MRI, Src1Regs, HalfTy);
2368 
2369     setRegsToType(MRI, DefRegs, HalfTy);
2370 
2371     B.buildInstr(Opc, {DefRegs[0]}, {Src0Regs[0], Src1Regs[0]});
2372     B.buildInstr(Opc, {DefRegs[1]}, {Src0Regs[1], Src1Regs[1]});
2373 
2374     MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
2375     MI.eraseFromParent();
2376     return;
2377   }
2378   case AMDGPU::G_ABS: {
2379     Register SrcReg = MI.getOperand(1).getReg();
2380     const RegisterBank *SrcBank = MRI.getRegBankOrNull(SrcReg);
2381 
2382     // There is no VALU abs instruction so we need to replace it with a sub and
2383     // max combination.
2384     if (SrcBank && SrcBank == &AMDGPU::VGPRRegBank) {
2385       MachineFunction *MF = MI.getParent()->getParent();
2386       ApplyRegBankMapping Apply(*this, MRI, &AMDGPU::VGPRRegBank);
2387       MachineIRBuilder B(MI, Apply);
2388       LegalizerHelper Helper(*MF, Apply, B);
2389 
2390       if (Helper.lowerAbsToMaxNeg(MI) != LegalizerHelper::Legalized)
2391         llvm_unreachable("lowerAbsToMaxNeg should have succeeded");
2392       return;
2393     }
2394     LLVM_FALLTHROUGH;
2395   }
2396   case AMDGPU::G_ADD:
2397   case AMDGPU::G_SUB:
2398   case AMDGPU::G_MUL:
2399   case AMDGPU::G_SHL:
2400   case AMDGPU::G_LSHR:
2401   case AMDGPU::G_ASHR:
2402   case AMDGPU::G_SMIN:
2403   case AMDGPU::G_SMAX:
2404   case AMDGPU::G_UMIN:
2405   case AMDGPU::G_UMAX: {
2406     Register DstReg = MI.getOperand(0).getReg();
2407     LLT DstTy = MRI.getType(DstReg);
2408 
2409     // 16-bit operations are VALU only, but can be promoted to 32-bit SALU.
2410     // Packed 16-bit operations need to be scalarized and promoted.
2411     if (DstTy != LLT::scalar(16) && DstTy != LLT::fixed_vector(2, 16))
2412       break;
2413 
2414     const RegisterBank *DstBank =
2415       OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2416     if (DstBank == &AMDGPU::VGPRRegBank)
2417       break;
2418 
2419     const LLT S32 = LLT::scalar(32);
2420     MachineBasicBlock *MBB = MI.getParent();
2421     MachineFunction *MF = MBB->getParent();
2422     ApplyRegBankMapping ApplySALU(*this, MRI, &AMDGPU::SGPRRegBank);
2423     MachineIRBuilder B(MI, ApplySALU);
2424 
2425     if (DstTy.isVector()) {
2426       Register WideSrc0Lo, WideSrc0Hi;
2427       Register WideSrc1Lo, WideSrc1Hi;
2428 
2429       unsigned ExtendOp = getExtendOp(MI.getOpcode());
2430       std::tie(WideSrc0Lo, WideSrc0Hi)
2431         = unpackV2S16ToS32(B, MI.getOperand(1).getReg(), ExtendOp);
2432       std::tie(WideSrc1Lo, WideSrc1Hi)
2433         = unpackV2S16ToS32(B, MI.getOperand(2).getReg(), ExtendOp);
2434       auto Lo = B.buildInstr(MI.getOpcode(), {S32}, {WideSrc0Lo, WideSrc1Lo});
2435       auto Hi = B.buildInstr(MI.getOpcode(), {S32}, {WideSrc0Hi, WideSrc1Hi});
2436       B.buildBuildVectorTrunc(DstReg, {Lo.getReg(0), Hi.getReg(0)});
2437       MI.eraseFromParent();
2438     } else {
2439       LegalizerHelper Helper(*MF, ApplySALU, B);
2440 
2441       if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized)
2442         llvm_unreachable("widen scalar should have succeeded");
2443 
2444       // FIXME: s16 shift amounts should be legal.
2445       if (Opc == AMDGPU::G_SHL || Opc == AMDGPU::G_LSHR ||
2446           Opc == AMDGPU::G_ASHR) {
2447         B.setInsertPt(*MBB, MI.getIterator());
2448         if (Helper.widenScalar(MI, 1, S32) != LegalizerHelper::Legalized)
2449           llvm_unreachable("widen scalar should have succeeded");
2450       }
2451     }
2452 
2453     return;
2454   }
2455   case AMDGPU::G_SEXT_INREG: {
2456     SmallVector<Register, 2> SrcRegs(OpdMapper.getVRegs(1));
2457     if (SrcRegs.empty())
2458       break; // Nothing to repair
2459 
2460     const LLT S32 = LLT::scalar(32);
2461     MachineIRBuilder B(MI);
2462     ApplyRegBankMapping O(*this, MRI, &AMDGPU::VGPRRegBank);
2463     GISelObserverWrapper Observer(&O);
2464     B.setChangeObserver(Observer);
2465 
2466     // Don't use LegalizerHelper's narrowScalar. It produces unwanted G_SEXTs
2467     // we would need to further expand, and doesn't let us directly set the
2468     // result registers.
2469     SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0));
2470 
2471     int Amt = MI.getOperand(2).getImm();
2472     if (Amt <= 32) {
2473       if (Amt == 32) {
2474         // The low bits are unchanged.
2475         B.buildCopy(DstRegs[0], SrcRegs[0]);
2476       } else {
2477         // Extend in the low bits and propagate the sign bit to the high half.
2478         B.buildSExtInReg(DstRegs[0], SrcRegs[0], Amt);
2479       }
2480 
2481       B.buildAShr(DstRegs[1], DstRegs[0], B.buildConstant(S32, 31));
2482     } else {
2483       // The low bits are unchanged, and extend in the high bits.
2484       B.buildCopy(DstRegs[0], SrcRegs[0]);
2485       B.buildSExtInReg(DstRegs[1], DstRegs[0], Amt - 32);
2486     }
2487 
2488     Register DstReg = MI.getOperand(0).getReg();
2489     MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
2490     MI.eraseFromParent();
2491     return;
2492   }
2493   case AMDGPU::G_CTPOP:
2494   case AMDGPU::G_BITREVERSE:
2495   case AMDGPU::G_CTLZ_ZERO_UNDEF:
2496   case AMDGPU::G_CTTZ_ZERO_UNDEF: {
2497     const RegisterBank *DstBank =
2498       OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2499     if (DstBank == &AMDGPU::SGPRRegBank)
2500       break;
2501 
2502     Register SrcReg = MI.getOperand(1).getReg();
2503     const LLT S32 = LLT::scalar(32);
2504     LLT Ty = MRI.getType(SrcReg);
2505     if (Ty == S32)
2506       break;
2507 
2508     ApplyRegBankMapping ApplyVALU(*this, MRI, &AMDGPU::VGPRRegBank);
2509     MachineIRBuilder B(MI, ApplyVALU);
2510 
2511     MachineFunction &MF = B.getMF();
2512     LegalizerHelper Helper(MF, ApplyVALU, B);
2513 
2514     if (Helper.narrowScalar(MI, 1, S32) != LegalizerHelper::Legalized)
2515       llvm_unreachable("narrowScalar should have succeeded");
2516     return;
2517   }
2518   case AMDGPU::G_SEXT:
2519   case AMDGPU::G_ZEXT:
2520   case AMDGPU::G_ANYEXT: {
2521     Register SrcReg = MI.getOperand(1).getReg();
2522     LLT SrcTy = MRI.getType(SrcReg);
2523     const bool Signed = Opc == AMDGPU::G_SEXT;
2524 
2525     assert(empty(OpdMapper.getVRegs(1)));
2526 
2527     MachineIRBuilder B(MI);
2528     const RegisterBank *SrcBank =
2529       OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
2530 
2531     Register DstReg = MI.getOperand(0).getReg();
2532     LLT DstTy = MRI.getType(DstReg);
2533     if (DstTy.isScalar() &&
2534         SrcBank != &AMDGPU::SGPRRegBank &&
2535         SrcBank != &AMDGPU::VCCRegBank &&
2536         // FIXME: Should handle any type that round to s64 when irregular
2537         // breakdowns supported.
2538         DstTy.getSizeInBits() == 64 &&
2539         SrcTy.getSizeInBits() <= 32) {
2540       SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
2541 
2542       // Extend to 32-bit, and then extend the low half.
2543       if (Signed) {
2544         // TODO: Should really be buildSExtOrCopy
2545         B.buildSExtOrTrunc(DefRegs[0], SrcReg);
2546       } else if (Opc == AMDGPU::G_ZEXT) {
2547         B.buildZExtOrTrunc(DefRegs[0], SrcReg);
2548       } else {
2549         B.buildAnyExtOrTrunc(DefRegs[0], SrcReg);
2550       }
2551 
2552       extendLow32IntoHigh32(B, DefRegs[1], DefRegs[0], Opc, *SrcBank);
2553       MRI.setRegBank(DstReg, *SrcBank);
2554       MI.eraseFromParent();
2555       return;
2556     }
2557 
2558     if (SrcTy != LLT::scalar(1))
2559       return;
2560 
2561     // It is not legal to have a legalization artifact with a VCC source. Rather
2562     // than introducing a copy, insert the select we would have to select the
2563     // copy to.
2564     if (SrcBank == &AMDGPU::VCCRegBank) {
2565       SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
2566 
2567       const RegisterBank *DstBank = &AMDGPU::VGPRRegBank;
2568 
2569       unsigned DstSize = DstTy.getSizeInBits();
2570       // 64-bit select is SGPR only
2571       const bool UseSel64 = DstSize > 32 &&
2572         SrcBank->getID() == AMDGPU::SGPRRegBankID;
2573 
2574       // TODO: Should s16 select be legal?
2575       LLT SelType = UseSel64 ? LLT::scalar(64) : LLT::scalar(32);
2576       auto True = B.buildConstant(SelType, Signed ? -1 : 1);
2577       auto False = B.buildConstant(SelType, 0);
2578 
2579       MRI.setRegBank(True.getReg(0), *DstBank);
2580       MRI.setRegBank(False.getReg(0), *DstBank);
2581       MRI.setRegBank(DstReg, *DstBank);
2582 
2583       if (DstSize > 32) {
2584         B.buildSelect(DefRegs[0], SrcReg, True, False);
2585         extendLow32IntoHigh32(B, DefRegs[1], DefRegs[0], Opc, *SrcBank, true);
2586       } else if (DstSize < 32) {
2587         auto Sel = B.buildSelect(SelType, SrcReg, True, False);
2588         MRI.setRegBank(Sel.getReg(0), *DstBank);
2589         B.buildTrunc(DstReg, Sel);
2590       } else {
2591         B.buildSelect(DstReg, SrcReg, True, False);
2592       }
2593 
2594       MI.eraseFromParent();
2595       return;
2596     }
2597 
2598     break;
2599   }
2600   case AMDGPU::G_BUILD_VECTOR:
2601   case AMDGPU::G_BUILD_VECTOR_TRUNC: {
2602     Register DstReg = MI.getOperand(0).getReg();
2603     LLT DstTy = MRI.getType(DstReg);
2604     if (DstTy != LLT::fixed_vector(2, 16))
2605       break;
2606 
2607     assert(MI.getNumOperands() == 3 && OpdMapper.getVRegs(0).empty());
2608     substituteSimpleCopyRegs(OpdMapper, 1);
2609     substituteSimpleCopyRegs(OpdMapper, 2);
2610 
2611     const RegisterBank *DstBank =
2612       OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2613     if (DstBank == &AMDGPU::SGPRRegBank)
2614       break; // Can use S_PACK_* instructions.
2615 
2616     MachineIRBuilder B(MI);
2617 
2618     Register Lo = MI.getOperand(1).getReg();
2619     Register Hi = MI.getOperand(2).getReg();
2620     const LLT S32 = LLT::scalar(32);
2621 
2622     const RegisterBank *BankLo =
2623       OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
2624     const RegisterBank *BankHi =
2625       OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
2626 
2627     Register ZextLo;
2628     Register ShiftHi;
2629 
2630     if (Opc == AMDGPU::G_BUILD_VECTOR) {
2631       ZextLo = B.buildZExt(S32, Lo).getReg(0);
2632       MRI.setRegBank(ZextLo, *BankLo);
2633 
2634       Register ZextHi = B.buildZExt(S32, Hi).getReg(0);
2635       MRI.setRegBank(ZextHi, *BankHi);
2636 
2637       auto ShiftAmt = B.buildConstant(S32, 16);
2638       MRI.setRegBank(ShiftAmt.getReg(0), *BankHi);
2639 
2640       ShiftHi = B.buildShl(S32, ZextHi, ShiftAmt).getReg(0);
2641       MRI.setRegBank(ShiftHi, *BankHi);
2642     } else {
2643       Register MaskLo = B.buildConstant(S32, 0xffff).getReg(0);
2644       MRI.setRegBank(MaskLo, *BankLo);
2645 
2646       auto ShiftAmt = B.buildConstant(S32, 16);
2647       MRI.setRegBank(ShiftAmt.getReg(0), *BankHi);
2648 
2649       ShiftHi = B.buildShl(S32, Hi, ShiftAmt).getReg(0);
2650       MRI.setRegBank(ShiftHi, *BankHi);
2651 
2652       ZextLo = B.buildAnd(S32, Lo, MaskLo).getReg(0);
2653       MRI.setRegBank(ZextLo, *BankLo);
2654     }
2655 
2656     auto Or = B.buildOr(S32, ZextLo, ShiftHi);
2657     MRI.setRegBank(Or.getReg(0), *DstBank);
2658 
2659     B.buildBitcast(DstReg, Or);
2660     MI.eraseFromParent();
2661     return;
2662   }
2663   case AMDGPU::G_EXTRACT_VECTOR_ELT: {
2664     SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0));
2665 
2666     assert(OpdMapper.getVRegs(1).empty() && OpdMapper.getVRegs(2).empty());
2667 
2668     Register DstReg = MI.getOperand(0).getReg();
2669     Register SrcReg = MI.getOperand(1).getReg();
2670 
2671     const LLT S32 = LLT::scalar(32);
2672     LLT DstTy = MRI.getType(DstReg);
2673     LLT SrcTy = MRI.getType(SrcReg);
2674 
2675     if (foldExtractEltToCmpSelect(MI, MRI, OpdMapper))
2676       return;
2677 
2678     MachineIRBuilder B(MI);
2679 
2680     const ValueMapping &DstMapping
2681       = OpdMapper.getInstrMapping().getOperandMapping(0);
2682     const RegisterBank *DstBank = DstMapping.BreakDown[0].RegBank;
2683     const RegisterBank *SrcBank =
2684       OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
2685     const RegisterBank *IdxBank =
2686         OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
2687 
2688     Register BaseIdxReg;
2689     unsigned ConstOffset;
2690     std::tie(BaseIdxReg, ConstOffset) =
2691         AMDGPU::getBaseWithConstantOffset(MRI, MI.getOperand(2).getReg());
2692 
2693     // See if the index is an add of a constant which will be foldable by moving
2694     // the base register of the index later if this is going to be executed in a
2695     // waterfall loop. This is essentially to reassociate the add of a constant
2696     // with the readfirstlane.
2697     bool ShouldMoveIndexIntoLoop = IdxBank != &AMDGPU::SGPRRegBank &&
2698                                    ConstOffset > 0 &&
2699                                    ConstOffset < SrcTy.getNumElements();
2700 
2701     // Move the base register. We'll re-insert the add later.
2702     if (ShouldMoveIndexIntoLoop)
2703       MI.getOperand(2).setReg(BaseIdxReg);
2704 
2705     // If this is a VGPR result only because the index was a VGPR result, the
2706     // actual indexing will be done on the SGPR source vector, which will
2707     // produce a scalar result. We need to copy to the VGPR result inside the
2708     // waterfall loop.
2709     const bool NeedCopyToVGPR = DstBank == &AMDGPU::VGPRRegBank &&
2710                                 SrcBank == &AMDGPU::SGPRRegBank;
2711     if (DstRegs.empty()) {
2712       applyDefaultMapping(OpdMapper);
2713 
2714       executeInWaterfallLoop(MI, MRI, { 2 });
2715 
2716       if (NeedCopyToVGPR) {
2717         // We don't want a phi for this temporary reg.
2718         Register TmpReg = MRI.createGenericVirtualRegister(DstTy);
2719         MRI.setRegBank(TmpReg, AMDGPU::SGPRRegBank);
2720         MI.getOperand(0).setReg(TmpReg);
2721         B.setInsertPt(*MI.getParent(), ++MI.getIterator());
2722 
2723         // Use a v_mov_b32 here to make the exec dependency explicit.
2724         buildVCopy(B, DstReg, TmpReg);
2725       }
2726 
2727       // Re-insert the constant offset add inside the waterfall loop.
2728       if (ShouldMoveIndexIntoLoop)
2729         reinsertVectorIndexAdd(B, MI, 2, ConstOffset);
2730 
2731       return;
2732     }
2733 
2734     assert(DstTy.getSizeInBits() == 64);
2735 
2736     LLT Vec32 = LLT::fixed_vector(2 * SrcTy.getNumElements(), 32);
2737 
2738     auto CastSrc = B.buildBitcast(Vec32, SrcReg);
2739     auto One = B.buildConstant(S32, 1);
2740 
2741     MachineBasicBlock::iterator MII = MI.getIterator();
2742 
2743     // Split the vector index into 32-bit pieces. Prepare to move all of the
2744     // new instructions into a waterfall loop if necessary.
2745     //
2746     // Don't put the bitcast or constant in the loop.
2747     MachineInstrSpan Span(MII, &B.getMBB());
2748 
2749     // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1).
2750     auto IdxLo = B.buildShl(S32, BaseIdxReg, One);
2751     auto IdxHi = B.buildAdd(S32, IdxLo, One);
2752 
2753     auto Extract0 = B.buildExtractVectorElement(DstRegs[0], CastSrc, IdxLo);
2754     auto Extract1 = B.buildExtractVectorElement(DstRegs[1], CastSrc, IdxHi);
2755 
2756     MRI.setRegBank(DstReg, *DstBank);
2757     MRI.setRegBank(CastSrc.getReg(0), *SrcBank);
2758     MRI.setRegBank(One.getReg(0), AMDGPU::SGPRRegBank);
2759     MRI.setRegBank(IdxLo.getReg(0), AMDGPU::SGPRRegBank);
2760     MRI.setRegBank(IdxHi.getReg(0), AMDGPU::SGPRRegBank);
2761 
2762     SmallSet<Register, 4> OpsToWaterfall;
2763     if (!collectWaterfallOperands(OpsToWaterfall, MI, MRI, { 2 })) {
2764       MI.eraseFromParent();
2765       return;
2766     }
2767 
2768     // Remove the original instruction to avoid potentially confusing the
2769     // waterfall loop logic.
2770     B.setInstr(*Span.begin());
2771     MI.eraseFromParent();
2772     executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()),
2773                            OpsToWaterfall, MRI);
2774 
2775     if (NeedCopyToVGPR) {
2776       MachineBasicBlock *LoopBB = Extract1->getParent();
2777       Register TmpReg0 = MRI.createGenericVirtualRegister(S32);
2778       Register TmpReg1 = MRI.createGenericVirtualRegister(S32);
2779       MRI.setRegBank(TmpReg0, AMDGPU::SGPRRegBank);
2780       MRI.setRegBank(TmpReg1, AMDGPU::SGPRRegBank);
2781 
2782       Extract0->getOperand(0).setReg(TmpReg0);
2783       Extract1->getOperand(0).setReg(TmpReg1);
2784 
2785       B.setInsertPt(*LoopBB, ++Extract1->getIterator());
2786 
2787       buildVCopy(B, DstRegs[0], TmpReg0);
2788       buildVCopy(B, DstRegs[1], TmpReg1);
2789     }
2790 
2791     if (ShouldMoveIndexIntoLoop)
2792       reinsertVectorIndexAdd(B, *IdxLo, 1, ConstOffset);
2793 
2794     return;
2795   }
2796   case AMDGPU::G_INSERT_VECTOR_ELT: {
2797     SmallVector<Register, 2> InsRegs(OpdMapper.getVRegs(2));
2798 
2799     Register DstReg = MI.getOperand(0).getReg();
2800     LLT VecTy = MRI.getType(DstReg);
2801 
2802     assert(OpdMapper.getVRegs(0).empty());
2803     assert(OpdMapper.getVRegs(3).empty());
2804 
2805     if (substituteSimpleCopyRegs(OpdMapper, 1))
2806       MRI.setType(MI.getOperand(1).getReg(), VecTy);
2807 
2808     if (foldInsertEltToCmpSelect(MI, MRI, OpdMapper))
2809       return;
2810 
2811     const RegisterBank *IdxBank =
2812       OpdMapper.getInstrMapping().getOperandMapping(3).BreakDown[0].RegBank;
2813 
2814     Register SrcReg = MI.getOperand(1).getReg();
2815     Register InsReg = MI.getOperand(2).getReg();
2816     LLT InsTy = MRI.getType(InsReg);
2817     (void)InsTy;
2818 
2819     Register BaseIdxReg;
2820     unsigned ConstOffset;
2821     std::tie(BaseIdxReg, ConstOffset) =
2822         AMDGPU::getBaseWithConstantOffset(MRI, MI.getOperand(3).getReg());
2823 
2824     // See if the index is an add of a constant which will be foldable by moving
2825     // the base register of the index later if this is going to be executed in a
2826     // waterfall loop. This is essentially to reassociate the add of a constant
2827     // with the readfirstlane.
2828     bool ShouldMoveIndexIntoLoop = IdxBank != &AMDGPU::SGPRRegBank &&
2829       ConstOffset > 0 &&
2830       ConstOffset < VecTy.getNumElements();
2831 
2832     // Move the base register. We'll re-insert the add later.
2833     if (ShouldMoveIndexIntoLoop)
2834       MI.getOperand(3).setReg(BaseIdxReg);
2835 
2836 
2837     if (InsRegs.empty()) {
2838       executeInWaterfallLoop(MI, MRI, { 3 });
2839 
2840       // Re-insert the constant offset add inside the waterfall loop.
2841       if (ShouldMoveIndexIntoLoop) {
2842         MachineIRBuilder B(MI);
2843         reinsertVectorIndexAdd(B, MI, 3, ConstOffset);
2844       }
2845 
2846       return;
2847     }
2848 
2849 
2850     assert(InsTy.getSizeInBits() == 64);
2851 
2852     const LLT S32 = LLT::scalar(32);
2853     LLT Vec32 = LLT::fixed_vector(2 * VecTy.getNumElements(), 32);
2854 
2855     MachineIRBuilder B(MI);
2856     auto CastSrc = B.buildBitcast(Vec32, SrcReg);
2857     auto One = B.buildConstant(S32, 1);
2858 
2859     // Split the vector index into 32-bit pieces. Prepare to move all of the
2860     // new instructions into a waterfall loop if necessary.
2861     //
2862     // Don't put the bitcast or constant in the loop.
2863     MachineInstrSpan Span(MachineBasicBlock::iterator(&MI), &B.getMBB());
2864 
2865     // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1).
2866     auto IdxLo = B.buildShl(S32, BaseIdxReg, One);
2867     auto IdxHi = B.buildAdd(S32, IdxLo, One);
2868 
2869     auto InsLo = B.buildInsertVectorElement(Vec32, CastSrc, InsRegs[0], IdxLo);
2870     auto InsHi = B.buildInsertVectorElement(Vec32, InsLo, InsRegs[1], IdxHi);
2871 
2872     const RegisterBank *DstBank =
2873       OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2874     const RegisterBank *SrcBank =
2875       OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
2876     const RegisterBank *InsSrcBank =
2877       OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
2878 
2879     MRI.setRegBank(InsReg, *InsSrcBank);
2880     MRI.setRegBank(CastSrc.getReg(0), *SrcBank);
2881     MRI.setRegBank(InsLo.getReg(0), *DstBank);
2882     MRI.setRegBank(InsHi.getReg(0), *DstBank);
2883     MRI.setRegBank(One.getReg(0), AMDGPU::SGPRRegBank);
2884     MRI.setRegBank(IdxLo.getReg(0), AMDGPU::SGPRRegBank);
2885     MRI.setRegBank(IdxHi.getReg(0), AMDGPU::SGPRRegBank);
2886 
2887 
2888     SmallSet<Register, 4> OpsToWaterfall;
2889     if (!collectWaterfallOperands(OpsToWaterfall, MI, MRI, { 3 })) {
2890       B.setInsertPt(B.getMBB(), MI);
2891       B.buildBitcast(DstReg, InsHi);
2892       MI.eraseFromParent();
2893       return;
2894     }
2895 
2896     B.setInstr(*Span.begin());
2897     MI.eraseFromParent();
2898 
2899     // Figure out the point after the waterfall loop before mangling the control
2900     // flow.
2901     executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()),
2902                            OpsToWaterfall, MRI);
2903 
2904     // The insertion point is now right after the original instruction.
2905     //
2906     // Keep the bitcast to the original vector type out of the loop. Doing this
2907     // saved an extra phi we don't need inside the loop.
2908     B.buildBitcast(DstReg, InsHi);
2909 
2910     // Re-insert the constant offset add inside the waterfall loop.
2911     if (ShouldMoveIndexIntoLoop)
2912       reinsertVectorIndexAdd(B, *IdxLo, 1, ConstOffset);
2913 
2914     return;
2915   }
2916   case AMDGPU::G_AMDGPU_BUFFER_LOAD:
2917   case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
2918   case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT:
2919   case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
2920   case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE:
2921   case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT:
2922   case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16:
2923   case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT:
2924   case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16:
2925   case AMDGPU::G_AMDGPU_BUFFER_STORE:
2926   case AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE:
2927   case AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT:
2928   case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT:
2929   case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16:
2930   case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT:
2931   case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16: {
2932     applyDefaultMapping(OpdMapper);
2933     executeInWaterfallLoop(MI, MRI, {1, 4});
2934     return;
2935   }
2936   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP:
2937   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD:
2938   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB:
2939   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN:
2940   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN:
2941   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX:
2942   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX:
2943   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND:
2944   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR:
2945   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR:
2946   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC:
2947   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC: {
2948     applyDefaultMapping(OpdMapper);
2949     executeInWaterfallLoop(MI, MRI, {2, 5});
2950     return;
2951   }
2952   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD:
2953   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN:
2954   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX: {
2955     applyDefaultMapping(OpdMapper);
2956     executeInWaterfallLoop(MI, MRI, {2, 5});
2957     return;
2958   }
2959   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP: {
2960     applyDefaultMapping(OpdMapper);
2961     executeInWaterfallLoop(MI, MRI, {3, 6});
2962     return;
2963   }
2964   case AMDGPU::G_AMDGPU_S_BUFFER_LOAD: {
2965     applyMappingSBufferLoad(OpdMapper);
2966     return;
2967   }
2968   case AMDGPU::G_INTRINSIC: {
2969     switch (MI.getIntrinsicID()) {
2970     case Intrinsic::amdgcn_readlane: {
2971       substituteSimpleCopyRegs(OpdMapper, 2);
2972 
2973       assert(OpdMapper.getVRegs(0).empty());
2974       assert(OpdMapper.getVRegs(3).empty());
2975 
2976       // Make sure the index is an SGPR. It doesn't make sense to run this in a
2977       // waterfall loop, so assume it's a uniform value.
2978       constrainOpWithReadfirstlane(MI, MRI, 3); // Index
2979       return;
2980     }
2981     case Intrinsic::amdgcn_writelane: {
2982       assert(OpdMapper.getVRegs(0).empty());
2983       assert(OpdMapper.getVRegs(2).empty());
2984       assert(OpdMapper.getVRegs(3).empty());
2985 
2986       substituteSimpleCopyRegs(OpdMapper, 4); // VGPR input val
2987       constrainOpWithReadfirstlane(MI, MRI, 2); // Source value
2988       constrainOpWithReadfirstlane(MI, MRI, 3); // Index
2989       return;
2990     }
2991     case Intrinsic::amdgcn_interp_p1:
2992     case Intrinsic::amdgcn_interp_p2:
2993     case Intrinsic::amdgcn_interp_mov:
2994     case Intrinsic::amdgcn_interp_p1_f16:
2995     case Intrinsic::amdgcn_interp_p2_f16: {
2996       applyDefaultMapping(OpdMapper);
2997 
2998       // Readlane for m0 value, which is always the last operand.
2999       // FIXME: Should this be a waterfall loop instead?
3000       constrainOpWithReadfirstlane(MI, MRI, MI.getNumOperands() - 1); // Index
3001       return;
3002     }
3003     case Intrinsic::amdgcn_permlane16:
3004     case Intrinsic::amdgcn_permlanex16: {
3005       // Doing a waterfall loop over these wouldn't make any sense.
3006       substituteSimpleCopyRegs(OpdMapper, 2);
3007       substituteSimpleCopyRegs(OpdMapper, 3);
3008       constrainOpWithReadfirstlane(MI, MRI, 4);
3009       constrainOpWithReadfirstlane(MI, MRI, 5);
3010       return;
3011     }
3012     case Intrinsic::amdgcn_sbfe:
3013       applyMappingBFE(OpdMapper, true);
3014       return;
3015     case Intrinsic::amdgcn_ubfe:
3016       applyMappingBFE(OpdMapper, false);
3017       return;
3018     case Intrinsic::amdgcn_ballot:
3019       // Use default handling and insert copy to vcc source.
3020       break;
3021     }
3022     break;
3023   }
3024   case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
3025   case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE: {
3026     const AMDGPU::RsrcIntrinsic *RSrcIntrin
3027       = AMDGPU::lookupRsrcIntrinsic(MI.getIntrinsicID());
3028     assert(RSrcIntrin && RSrcIntrin->IsImage);
3029     // Non-images can have complications from operands that allow both SGPR
3030     // and VGPR. For now it's too complicated to figure out the final opcode
3031     // to derive the register bank from the MCInstrDesc.
3032     applyMappingImage(MI, OpdMapper, MRI, RSrcIntrin->RsrcArg);
3033     return;
3034   }
3035   case AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY: {
3036     unsigned N = MI.getNumExplicitOperands() - 2;
3037     executeInWaterfallLoop(MI, MRI, { N });
3038     return;
3039   }
3040   case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: {
3041     auto IntrID = MI.getIntrinsicID();
3042     switch (IntrID) {
3043     case Intrinsic::amdgcn_ds_ordered_add:
3044     case Intrinsic::amdgcn_ds_ordered_swap: {
3045       // This is only allowed to execute with 1 lane, so readfirstlane is safe.
3046       assert(OpdMapper.getVRegs(0).empty());
3047       substituteSimpleCopyRegs(OpdMapper, 3);
3048       constrainOpWithReadfirstlane(MI, MRI, 2); // M0
3049       return;
3050     }
3051     case Intrinsic::amdgcn_ds_gws_init:
3052     case Intrinsic::amdgcn_ds_gws_barrier:
3053     case Intrinsic::amdgcn_ds_gws_sema_br: {
3054       // Only the first lane is executes, so readfirstlane is safe.
3055       substituteSimpleCopyRegs(OpdMapper, 1);
3056       constrainOpWithReadfirstlane(MI, MRI, 2); // M0
3057       return;
3058     }
3059     case Intrinsic::amdgcn_ds_gws_sema_v:
3060     case Intrinsic::amdgcn_ds_gws_sema_p:
3061     case Intrinsic::amdgcn_ds_gws_sema_release_all: {
3062       // Only the first lane is executes, so readfirstlane is safe.
3063       constrainOpWithReadfirstlane(MI, MRI, 1); // M0
3064       return;
3065     }
3066     case Intrinsic::amdgcn_ds_append:
3067     case Intrinsic::amdgcn_ds_consume: {
3068       constrainOpWithReadfirstlane(MI, MRI, 2); // M0
3069       return;
3070     }
3071     case Intrinsic::amdgcn_s_sendmsg:
3072     case Intrinsic::amdgcn_s_sendmsghalt: {
3073       // FIXME: Should this use a waterfall loop?
3074       constrainOpWithReadfirstlane(MI, MRI, 2); // M0
3075       return;
3076     }
3077     case Intrinsic::amdgcn_s_setreg: {
3078       constrainOpWithReadfirstlane(MI, MRI, 2);
3079       return;
3080     }
3081     default: {
3082       if (const AMDGPU::RsrcIntrinsic *RSrcIntrin =
3083               AMDGPU::lookupRsrcIntrinsic(IntrID)) {
3084         // Non-images can have complications from operands that allow both SGPR
3085         // and VGPR. For now it's too complicated to figure out the final opcode
3086         // to derive the register bank from the MCInstrDesc.
3087         if (RSrcIntrin->IsImage) {
3088           applyMappingImage(MI, OpdMapper, MRI, RSrcIntrin->RsrcArg);
3089           return;
3090         }
3091       }
3092 
3093       break;
3094     }
3095     }
3096     break;
3097   }
3098   case AMDGPU::G_LOAD:
3099   case AMDGPU::G_ZEXTLOAD:
3100   case AMDGPU::G_SEXTLOAD: {
3101     if (applyMappingLoad(MI, OpdMapper, MRI))
3102       return;
3103     break;
3104   }
3105   case AMDGPU::G_DYN_STACKALLOC:
3106     applyMappingDynStackAlloc(MI, OpdMapper, MRI);
3107     return;
3108   case AMDGPU::G_SBFX:
3109     applyMappingBFE(OpdMapper, /*Signed*/ true);
3110     return;
3111   case AMDGPU::G_UBFX:
3112     applyMappingBFE(OpdMapper, /*Signed*/ false);
3113     return;
3114   default:
3115     break;
3116   }
3117 
3118   return applyDefaultMapping(OpdMapper);
3119 }
3120 
3121 // vgpr, sgpr -> vgpr
3122 // vgpr, agpr -> vgpr
3123 // agpr, agpr -> agpr
3124 // agpr, sgpr -> vgpr
3125 static unsigned regBankUnion(unsigned RB0, unsigned RB1) {
3126   if (RB0 == AMDGPU::InvalidRegBankID)
3127     return RB1;
3128   if (RB1 == AMDGPU::InvalidRegBankID)
3129     return RB0;
3130 
3131   if (RB0 == AMDGPU::SGPRRegBankID && RB1 == AMDGPU::SGPRRegBankID)
3132     return AMDGPU::SGPRRegBankID;
3133 
3134   if (RB0 == AMDGPU::AGPRRegBankID && RB1 == AMDGPU::AGPRRegBankID)
3135     return AMDGPU::AGPRRegBankID;
3136 
3137   return AMDGPU::VGPRRegBankID;
3138 }
3139 
3140 static unsigned regBankBoolUnion(unsigned RB0, unsigned RB1) {
3141   if (RB0 == AMDGPU::InvalidRegBankID)
3142     return RB1;
3143   if (RB1 == AMDGPU::InvalidRegBankID)
3144     return RB0;
3145 
3146   // vcc, vcc -> vcc
3147   // vcc, sgpr -> vcc
3148   // vcc, vgpr -> vcc
3149   if (RB0 == AMDGPU::VCCRegBankID || RB1 == AMDGPU::VCCRegBankID)
3150     return AMDGPU::VCCRegBankID;
3151 
3152   // vcc, vgpr -> vgpr
3153   return regBankUnion(RB0, RB1);
3154 }
3155 
3156 unsigned AMDGPURegisterBankInfo::getMappingType(const MachineRegisterInfo &MRI,
3157                                                 const MachineInstr &MI) const {
3158   unsigned RegBank = AMDGPU::InvalidRegBankID;
3159 
3160   for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
3161     if (!MI.getOperand(i).isReg())
3162       continue;
3163     Register Reg = MI.getOperand(i).getReg();
3164     if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) {
3165       RegBank = regBankUnion(RegBank, Bank->getID());
3166       if (RegBank == AMDGPU::VGPRRegBankID)
3167         break;
3168     }
3169   }
3170 
3171   return RegBank;
3172 }
3173 
3174 bool AMDGPURegisterBankInfo::isSALUMapping(const MachineInstr &MI) const {
3175   const MachineFunction &MF = *MI.getParent()->getParent();
3176   const MachineRegisterInfo &MRI = MF.getRegInfo();
3177   for (unsigned i = 0, e = MI.getNumOperands();i != e; ++i) {
3178     if (!MI.getOperand(i).isReg())
3179       continue;
3180     Register Reg = MI.getOperand(i).getReg();
3181     if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) {
3182       if (Bank->getID() != AMDGPU::SGPRRegBankID)
3183         return false;
3184     }
3185   }
3186   return true;
3187 }
3188 
3189 const RegisterBankInfo::InstructionMapping &
3190 AMDGPURegisterBankInfo::getDefaultMappingSOP(const MachineInstr &MI) const {
3191   const MachineFunction &MF = *MI.getParent()->getParent();
3192   const MachineRegisterInfo &MRI = MF.getRegInfo();
3193   SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
3194 
3195   for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
3196     const MachineOperand &SrcOp = MI.getOperand(i);
3197     if (!SrcOp.isReg())
3198       continue;
3199 
3200     unsigned Size = getSizeInBits(SrcOp.getReg(), MRI, *TRI);
3201     OpdsMapping[i] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
3202   }
3203   return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
3204                                MI.getNumOperands());
3205 }
3206 
3207 const RegisterBankInfo::InstructionMapping &
3208 AMDGPURegisterBankInfo::getDefaultMappingVOP(const MachineInstr &MI) const {
3209   const MachineFunction &MF = *MI.getParent()->getParent();
3210   const MachineRegisterInfo &MRI = MF.getRegInfo();
3211   SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
3212 
3213   // Even though we technically could use SGPRs, this would require knowledge of
3214   // the constant bus restriction. Force all sources to VGPR (except for VCC).
3215   //
3216   // TODO: Unary ops are trivially OK, so accept SGPRs?
3217   for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
3218     const MachineOperand &Src = MI.getOperand(i);
3219     if (!Src.isReg())
3220       continue;
3221 
3222     unsigned Size = getSizeInBits(Src.getReg(), MRI, *TRI);
3223     unsigned BankID = Size == 1 ? AMDGPU::VCCRegBankID : AMDGPU::VGPRRegBankID;
3224     OpdsMapping[i] = AMDGPU::getValueMapping(BankID, Size);
3225   }
3226 
3227   return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
3228                                MI.getNumOperands());
3229 }
3230 
3231 const RegisterBankInfo::InstructionMapping &
3232 AMDGPURegisterBankInfo::getDefaultMappingAllVGPR(const MachineInstr &MI) const {
3233   const MachineFunction &MF = *MI.getParent()->getParent();
3234   const MachineRegisterInfo &MRI = MF.getRegInfo();
3235   SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
3236 
3237   for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
3238     const MachineOperand &Op = MI.getOperand(I);
3239     if (!Op.isReg())
3240       continue;
3241 
3242     unsigned Size = getSizeInBits(Op.getReg(), MRI, *TRI);
3243     OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3244   }
3245 
3246   return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
3247                                MI.getNumOperands());
3248 }
3249 
3250 const RegisterBankInfo::InstructionMapping &
3251 AMDGPURegisterBankInfo::getImageMapping(const MachineRegisterInfo &MRI,
3252                                         const MachineInstr &MI,
3253                                         int RsrcIdx) const {
3254   // The reported argument index is relative to the IR intrinsic call arguments,
3255   // so we need to shift by the number of defs and the intrinsic ID.
3256   RsrcIdx += MI.getNumExplicitDefs() + 1;
3257 
3258   const int NumOps = MI.getNumOperands();
3259   SmallVector<const ValueMapping *, 8> OpdsMapping(NumOps);
3260 
3261   // TODO: Should packed/unpacked D16 difference be reported here as part of
3262   // the value mapping?
3263   for (int I = 0; I != NumOps; ++I) {
3264     if (!MI.getOperand(I).isReg())
3265       continue;
3266 
3267     Register OpReg = MI.getOperand(I).getReg();
3268     // We replace some dead address operands with $noreg
3269     if (!OpReg)
3270       continue;
3271 
3272     unsigned Size = getSizeInBits(OpReg, MRI, *TRI);
3273 
3274     // FIXME: Probably need a new intrinsic register bank searchable table to
3275     // handle arbitrary intrinsics easily.
3276     //
3277     // If this has a sampler, it immediately follows rsrc.
3278     const bool MustBeSGPR = I == RsrcIdx || I == RsrcIdx + 1;
3279 
3280     if (MustBeSGPR) {
3281       // If this must be an SGPR, so we must report whatever it is as legal.
3282       unsigned NewBank = getRegBankID(OpReg, MRI, AMDGPU::SGPRRegBankID);
3283       OpdsMapping[I] = AMDGPU::getValueMapping(NewBank, Size);
3284     } else {
3285       // Some operands must be VGPR, and these are easy to copy to.
3286       OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3287     }
3288   }
3289 
3290   return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping), NumOps);
3291 }
3292 
3293 /// Return the mapping for a pointer arugment.
3294 const RegisterBankInfo::ValueMapping *
3295 AMDGPURegisterBankInfo::getValueMappingForPtr(const MachineRegisterInfo &MRI,
3296                                               Register PtrReg) const {
3297   LLT PtrTy = MRI.getType(PtrReg);
3298   unsigned Size = PtrTy.getSizeInBits();
3299   if (Subtarget.useFlatForGlobal() ||
3300       !AMDGPU::isFlatGlobalAddrSpace(PtrTy.getAddressSpace()))
3301     return AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3302 
3303   // If we're using MUBUF instructions for global memory, an SGPR base register
3304   // is possible. Otherwise this needs to be a VGPR.
3305   const RegisterBank *PtrBank = getRegBank(PtrReg, MRI, *TRI);
3306   return AMDGPU::getValueMapping(PtrBank->getID(), Size);
3307 }
3308 
3309 const RegisterBankInfo::InstructionMapping &
3310 AMDGPURegisterBankInfo::getInstrMappingForLoad(const MachineInstr &MI) const {
3311 
3312   const MachineFunction &MF = *MI.getParent()->getParent();
3313   const MachineRegisterInfo &MRI = MF.getRegInfo();
3314   SmallVector<const ValueMapping*, 2> OpdsMapping(2);
3315   unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
3316   Register PtrReg = MI.getOperand(1).getReg();
3317   LLT PtrTy = MRI.getType(PtrReg);
3318   unsigned AS = PtrTy.getAddressSpace();
3319   unsigned PtrSize = PtrTy.getSizeInBits();
3320 
3321   const ValueMapping *ValMapping;
3322   const ValueMapping *PtrMapping;
3323 
3324   const RegisterBank *PtrBank = getRegBank(PtrReg, MRI, *TRI);
3325 
3326   if (PtrBank == &AMDGPU::SGPRRegBank && AMDGPU::isFlatGlobalAddrSpace(AS)) {
3327     if (isScalarLoadLegal(MI)) {
3328       // We have a uniform instruction so we want to use an SMRD load
3329       ValMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
3330       PtrMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize);
3331     } else {
3332       ValMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3333 
3334       // If we're using MUBUF instructions for global memory, an SGPR base
3335       // register is possible. Otherwise this needs to be a VGPR.
3336       unsigned PtrBankID = Subtarget.useFlatForGlobal() ?
3337         AMDGPU::VGPRRegBankID : AMDGPU::SGPRRegBankID;
3338 
3339       PtrMapping = AMDGPU::getValueMapping(PtrBankID, PtrSize);
3340     }
3341   } else {
3342     ValMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3343     PtrMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize);
3344   }
3345 
3346   OpdsMapping[0] = ValMapping;
3347   OpdsMapping[1] = PtrMapping;
3348   const RegisterBankInfo::InstructionMapping &Mapping = getInstructionMapping(
3349       1, 1, getOperandsMapping(OpdsMapping), MI.getNumOperands());
3350   return Mapping;
3351 
3352   // FIXME: Do we want to add a mapping for FLAT load, or should we just
3353   // handle that during instruction selection?
3354 }
3355 
3356 unsigned
3357 AMDGPURegisterBankInfo::getRegBankID(Register Reg,
3358                                      const MachineRegisterInfo &MRI,
3359                                      unsigned Default) const {
3360   const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
3361   return Bank ? Bank->getID() : Default;
3362 }
3363 
3364 const RegisterBankInfo::ValueMapping *
3365 AMDGPURegisterBankInfo::getSGPROpMapping(Register Reg,
3366                                          const MachineRegisterInfo &MRI,
3367                                          const TargetRegisterInfo &TRI) const {
3368   // Lie and claim anything is legal, even though this needs to be an SGPR
3369   // applyMapping will have to deal with it as a waterfall loop.
3370   unsigned Bank = getRegBankID(Reg, MRI, AMDGPU::SGPRRegBankID);
3371   unsigned Size = getSizeInBits(Reg, MRI, TRI);
3372   return AMDGPU::getValueMapping(Bank, Size);
3373 }
3374 
3375 const RegisterBankInfo::ValueMapping *
3376 AMDGPURegisterBankInfo::getVGPROpMapping(Register Reg,
3377                                          const MachineRegisterInfo &MRI,
3378                                          const TargetRegisterInfo &TRI) const {
3379   unsigned Size = getSizeInBits(Reg, MRI, TRI);
3380   return AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3381 }
3382 
3383 const RegisterBankInfo::ValueMapping *
3384 AMDGPURegisterBankInfo::getAGPROpMapping(Register Reg,
3385                                          const MachineRegisterInfo &MRI,
3386                                          const TargetRegisterInfo &TRI) const {
3387   unsigned Size = getSizeInBits(Reg, MRI, TRI);
3388   return AMDGPU::getValueMapping(AMDGPU::AGPRRegBankID, Size);
3389 }
3390 
3391 ///
3392 /// This function must return a legal mapping, because
3393 /// AMDGPURegisterBankInfo::getInstrAlternativeMappings() is not called
3394 /// in RegBankSelect::Mode::Fast.  Any mapping that would cause a
3395 /// VGPR to SGPR generated is illegal.
3396 ///
3397 // Operands that must be SGPRs must accept potentially divergent VGPRs as
3398 // legal. These will be dealt with in applyMappingImpl.
3399 //
3400 const RegisterBankInfo::InstructionMapping &
3401 AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
3402   const MachineFunction &MF = *MI.getParent()->getParent();
3403   const MachineRegisterInfo &MRI = MF.getRegInfo();
3404 
3405   if (MI.isCopy() || MI.getOpcode() == AMDGPU::G_FREEZE) {
3406     // The default logic bothers to analyze impossible alternative mappings. We
3407     // want the most straightforward mapping, so just directly handle this.
3408     const RegisterBank *DstBank = getRegBank(MI.getOperand(0).getReg(), MRI,
3409                                              *TRI);
3410     const RegisterBank *SrcBank = getRegBank(MI.getOperand(1).getReg(), MRI,
3411                                              *TRI);
3412     assert(SrcBank && "src bank should have been assigned already");
3413     if (!DstBank)
3414       DstBank = SrcBank;
3415 
3416     unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
3417     if (cannotCopy(*DstBank, *SrcBank, Size))
3418       return getInvalidInstructionMapping();
3419 
3420     const ValueMapping &ValMap = getValueMapping(0, Size, *DstBank);
3421     unsigned OpdsMappingSize = MI.isCopy() ? 1 : 2;
3422     SmallVector<const ValueMapping *, 1> OpdsMapping(OpdsMappingSize);
3423     OpdsMapping[0] = &ValMap;
3424     if (MI.getOpcode() == AMDGPU::G_FREEZE)
3425       OpdsMapping[1] = &ValMap;
3426 
3427     return getInstructionMapping(
3428         1, /*Cost*/ 1,
3429         /*OperandsMapping*/ getOperandsMapping(OpdsMapping), OpdsMappingSize);
3430   }
3431 
3432   if (MI.isRegSequence()) {
3433     // If any input is a VGPR, the result must be a VGPR. The default handling
3434     // assumes any copy between banks is legal.
3435     unsigned BankID = AMDGPU::SGPRRegBankID;
3436 
3437     for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
3438       auto OpBank = getRegBankID(MI.getOperand(I).getReg(), MRI);
3439       // It doesn't make sense to use vcc or scc banks here, so just ignore
3440       // them.
3441       if (OpBank != AMDGPU::SGPRRegBankID) {
3442         BankID = AMDGPU::VGPRRegBankID;
3443         break;
3444       }
3445     }
3446     unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
3447 
3448     const ValueMapping &ValMap = getValueMapping(0, Size, getRegBank(BankID));
3449     return getInstructionMapping(
3450         1, /*Cost*/ 1,
3451         /*OperandsMapping*/ getOperandsMapping({&ValMap}), 1);
3452   }
3453 
3454   // The default handling is broken and doesn't handle illegal SGPR->VGPR copies
3455   // properly.
3456   //
3457   // TODO: There are additional exec masking dependencies to analyze.
3458   if (MI.getOpcode() == TargetOpcode::G_PHI) {
3459     unsigned ResultBank = AMDGPU::InvalidRegBankID;
3460     Register DstReg = MI.getOperand(0).getReg();
3461 
3462     // Sometimes the result may have already been assigned a bank.
3463     if (const RegisterBank *DstBank = getRegBank(DstReg, MRI, *TRI))
3464       ResultBank = DstBank->getID();
3465 
3466     for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
3467       Register Reg = MI.getOperand(I).getReg();
3468       const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
3469 
3470       // FIXME: Assuming VGPR for any undetermined inputs.
3471       if (!Bank || Bank->getID() == AMDGPU::VGPRRegBankID) {
3472         ResultBank = AMDGPU::VGPRRegBankID;
3473         break;
3474       }
3475 
3476       // FIXME: Need to promote SGPR case to s32
3477       unsigned OpBank = Bank->getID();
3478       ResultBank = regBankBoolUnion(ResultBank, OpBank);
3479     }
3480 
3481     assert(ResultBank != AMDGPU::InvalidRegBankID);
3482 
3483     unsigned Size = MRI.getType(DstReg).getSizeInBits();
3484 
3485     const ValueMapping &ValMap =
3486         getValueMapping(0, Size, getRegBank(ResultBank));
3487     return getInstructionMapping(
3488         1, /*Cost*/ 1,
3489         /*OperandsMapping*/ getOperandsMapping({&ValMap}), 1);
3490   }
3491 
3492   const RegisterBankInfo::InstructionMapping &Mapping = getInstrMappingImpl(MI);
3493   if (Mapping.isValid())
3494     return Mapping;
3495 
3496   SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
3497 
3498   switch (MI.getOpcode()) {
3499   default:
3500     return getInvalidInstructionMapping();
3501 
3502   case AMDGPU::G_AND:
3503   case AMDGPU::G_OR:
3504   case AMDGPU::G_XOR: {
3505     unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3506     if (Size == 1) {
3507       const RegisterBank *DstBank
3508         = getRegBank(MI.getOperand(0).getReg(), MRI, *TRI);
3509 
3510       unsigned TargetBankID = AMDGPU::InvalidRegBankID;
3511       unsigned BankLHS = AMDGPU::InvalidRegBankID;
3512       unsigned BankRHS = AMDGPU::InvalidRegBankID;
3513       if (DstBank) {
3514         TargetBankID = DstBank->getID();
3515         if (DstBank == &AMDGPU::VCCRegBank) {
3516           TargetBankID = AMDGPU::VCCRegBankID;
3517           BankLHS = AMDGPU::VCCRegBankID;
3518           BankRHS = AMDGPU::VCCRegBankID;
3519         } else {
3520           BankLHS = getRegBankID(MI.getOperand(1).getReg(), MRI,
3521                                  AMDGPU::SGPRRegBankID);
3522           BankRHS = getRegBankID(MI.getOperand(2).getReg(), MRI,
3523                                  AMDGPU::SGPRRegBankID);
3524         }
3525       } else {
3526         BankLHS = getRegBankID(MI.getOperand(1).getReg(), MRI,
3527                                AMDGPU::VCCRegBankID);
3528         BankRHS = getRegBankID(MI.getOperand(2).getReg(), MRI,
3529                                AMDGPU::VCCRegBankID);
3530 
3531         // Both inputs should be true booleans to produce a boolean result.
3532         if (BankLHS == AMDGPU::VGPRRegBankID || BankRHS == AMDGPU::VGPRRegBankID) {
3533           TargetBankID = AMDGPU::VGPRRegBankID;
3534         } else if (BankLHS == AMDGPU::VCCRegBankID || BankRHS == AMDGPU::VCCRegBankID) {
3535           TargetBankID = AMDGPU::VCCRegBankID;
3536           BankLHS = AMDGPU::VCCRegBankID;
3537           BankRHS = AMDGPU::VCCRegBankID;
3538         } else if (BankLHS == AMDGPU::SGPRRegBankID && BankRHS == AMDGPU::SGPRRegBankID) {
3539           TargetBankID = AMDGPU::SGPRRegBankID;
3540         }
3541       }
3542 
3543       OpdsMapping[0] = AMDGPU::getValueMapping(TargetBankID, Size);
3544       OpdsMapping[1] = AMDGPU::getValueMapping(BankLHS, Size);
3545       OpdsMapping[2] = AMDGPU::getValueMapping(BankRHS, Size);
3546       break;
3547     }
3548 
3549     if (Size == 64) {
3550 
3551       if (isSALUMapping(MI)) {
3552         OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::SGPRRegBankID, Size);
3553         OpdsMapping[1] = OpdsMapping[2] = OpdsMapping[0];
3554       } else {
3555         OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size);
3556         unsigned Bank1 = getRegBankID(MI.getOperand(1).getReg(), MRI /*, DefaultBankID*/);
3557         OpdsMapping[1] = AMDGPU::getValueMapping(Bank1, Size);
3558 
3559         unsigned Bank2 = getRegBankID(MI.getOperand(2).getReg(), MRI /*, DefaultBankID*/);
3560         OpdsMapping[2] = AMDGPU::getValueMapping(Bank2, Size);
3561       }
3562 
3563       break;
3564     }
3565 
3566     LLVM_FALLTHROUGH;
3567   }
3568   case AMDGPU::G_PTR_ADD:
3569   case AMDGPU::G_PTRMASK:
3570   case AMDGPU::G_ADD:
3571   case AMDGPU::G_SUB:
3572   case AMDGPU::G_MUL:
3573   case AMDGPU::G_SHL:
3574   case AMDGPU::G_LSHR:
3575   case AMDGPU::G_ASHR:
3576   case AMDGPU::G_UADDO:
3577   case AMDGPU::G_USUBO:
3578   case AMDGPU::G_UADDE:
3579   case AMDGPU::G_SADDE:
3580   case AMDGPU::G_USUBE:
3581   case AMDGPU::G_SSUBE:
3582   case AMDGPU::G_SMIN:
3583   case AMDGPU::G_SMAX:
3584   case AMDGPU::G_UMIN:
3585   case AMDGPU::G_UMAX:
3586   case AMDGPU::G_ABS:
3587   case AMDGPU::G_SHUFFLE_VECTOR:
3588   case AMDGPU::G_SBFX:
3589   case AMDGPU::G_UBFX:
3590     if (isSALUMapping(MI))
3591       return getDefaultMappingSOP(MI);
3592     LLVM_FALLTHROUGH;
3593 
3594   case AMDGPU::G_SADDSAT: // FIXME: Could lower sat ops for SALU
3595   case AMDGPU::G_SSUBSAT:
3596   case AMDGPU::G_UADDSAT:
3597   case AMDGPU::G_USUBSAT:
3598   case AMDGPU::G_FADD:
3599   case AMDGPU::G_FSUB:
3600   case AMDGPU::G_FPTOSI:
3601   case AMDGPU::G_FPTOUI:
3602   case AMDGPU::G_FMUL:
3603   case AMDGPU::G_FMA:
3604   case AMDGPU::G_FMAD:
3605   case AMDGPU::G_FSQRT:
3606   case AMDGPU::G_FFLOOR:
3607   case AMDGPU::G_FCEIL:
3608   case AMDGPU::G_FRINT:
3609   case AMDGPU::G_SITOFP:
3610   case AMDGPU::G_UITOFP:
3611   case AMDGPU::G_FPTRUNC:
3612   case AMDGPU::G_FPEXT:
3613   case AMDGPU::G_FEXP2:
3614   case AMDGPU::G_FLOG2:
3615   case AMDGPU::G_FMINNUM:
3616   case AMDGPU::G_FMAXNUM:
3617   case AMDGPU::G_FMINNUM_IEEE:
3618   case AMDGPU::G_FMAXNUM_IEEE:
3619   case AMDGPU::G_FCANONICALIZE:
3620   case AMDGPU::G_INTRINSIC_TRUNC:
3621   case AMDGPU::G_BSWAP: // TODO: Somehow expand for scalar?
3622   case AMDGPU::G_FSHR: // TODO: Expand for scalar
3623   case AMDGPU::G_AMDGPU_FFBH_U32:
3624   case AMDGPU::G_AMDGPU_FMIN_LEGACY:
3625   case AMDGPU::G_AMDGPU_FMAX_LEGACY:
3626   case AMDGPU::G_AMDGPU_RCP_IFLAG:
3627   case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0:
3628   case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1:
3629   case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2:
3630   case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
3631   case AMDGPU::G_AMDGPU_CVT_PK_I16_I32:
3632   case AMDGPU::G_AMDGPU_SMED3:
3633     return getDefaultMappingVOP(MI);
3634   case AMDGPU::G_UMULH:
3635   case AMDGPU::G_SMULH: {
3636     if (Subtarget.hasScalarMulHiInsts() && isSALUMapping(MI))
3637       return getDefaultMappingSOP(MI);
3638     return getDefaultMappingVOP(MI);
3639   }
3640   case AMDGPU::G_IMPLICIT_DEF: {
3641     unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3642     OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
3643     break;
3644   }
3645   case AMDGPU::G_FCONSTANT:
3646   case AMDGPU::G_CONSTANT:
3647   case AMDGPU::G_GLOBAL_VALUE:
3648   case AMDGPU::G_BLOCK_ADDR:
3649   case AMDGPU::G_READCYCLECOUNTER: {
3650     unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3651     OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
3652     break;
3653   }
3654   case AMDGPU::G_FRAME_INDEX: {
3655     // TODO: This should be the same as other constants, but eliminateFrameIndex
3656     // currently assumes VALU uses.
3657     unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3658     OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3659     break;
3660   }
3661   case AMDGPU::G_DYN_STACKALLOC: {
3662     // Result is always uniform, and a wave reduction is needed for the source.
3663     OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
3664     unsigned SrcBankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
3665     OpdsMapping[1] = AMDGPU::getValueMapping(SrcBankID, 32);
3666     break;
3667   }
3668   case AMDGPU::G_INSERT: {
3669     unsigned BankID = getMappingType(MRI, MI);
3670     unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
3671     unsigned SrcSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
3672     unsigned EltSize = getSizeInBits(MI.getOperand(2).getReg(), MRI, *TRI);
3673     OpdsMapping[0] = AMDGPU::getValueMapping(BankID, DstSize);
3674     OpdsMapping[1] = AMDGPU::getValueMapping(BankID, SrcSize);
3675     OpdsMapping[2] = AMDGPU::getValueMapping(BankID, EltSize);
3676     OpdsMapping[3] = nullptr;
3677     break;
3678   }
3679   case AMDGPU::G_EXTRACT: {
3680     unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
3681     unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
3682     unsigned SrcSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
3683     OpdsMapping[0] = AMDGPU::getValueMapping(BankID, DstSize);
3684     OpdsMapping[1] = AMDGPU::getValueMapping(BankID, SrcSize);
3685     OpdsMapping[2] = nullptr;
3686     break;
3687   }
3688   case AMDGPU::G_BUILD_VECTOR:
3689   case AMDGPU::G_BUILD_VECTOR_TRUNC: {
3690     LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
3691     if (DstTy == LLT::fixed_vector(2, 16)) {
3692       unsigned DstSize = DstTy.getSizeInBits();
3693       unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
3694       unsigned Src0BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
3695       unsigned Src1BankID = getRegBankID(MI.getOperand(2).getReg(), MRI);
3696       unsigned DstBankID = regBankUnion(Src0BankID, Src1BankID);
3697 
3698       OpdsMapping[0] = AMDGPU::getValueMapping(DstBankID, DstSize);
3699       OpdsMapping[1] = AMDGPU::getValueMapping(Src0BankID, SrcSize);
3700       OpdsMapping[2] = AMDGPU::getValueMapping(Src1BankID, SrcSize);
3701       break;
3702     }
3703 
3704     LLVM_FALLTHROUGH;
3705   }
3706   case AMDGPU::G_MERGE_VALUES:
3707   case AMDGPU::G_CONCAT_VECTORS: {
3708     unsigned Bank = getMappingType(MRI, MI);
3709     unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3710     unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
3711 
3712     OpdsMapping[0] = AMDGPU::getValueMapping(Bank, DstSize);
3713     // Op1 and Dst should use the same register bank.
3714     for (unsigned i = 1, e = MI.getNumOperands(); i != e; ++i)
3715       OpdsMapping[i] = AMDGPU::getValueMapping(Bank, SrcSize);
3716     break;
3717   }
3718   case AMDGPU::G_BITREVERSE:
3719   case AMDGPU::G_BITCAST:
3720   case AMDGPU::G_INTTOPTR:
3721   case AMDGPU::G_PTRTOINT:
3722   case AMDGPU::G_FABS:
3723   case AMDGPU::G_FNEG: {
3724     unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3725     unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
3726     OpdsMapping[0] = OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size);
3727     break;
3728   }
3729   case AMDGPU::G_CTLZ_ZERO_UNDEF:
3730   case AMDGPU::G_CTTZ_ZERO_UNDEF:
3731   case AMDGPU::G_CTPOP: {
3732     unsigned Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
3733     unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
3734     OpdsMapping[0] = AMDGPU::getValueMapping(BankID, 32);
3735 
3736     // This should really be getValueMappingSGPR64Only, but allowing the generic
3737     // code to handle the register split just makes using LegalizerHelper more
3738     // difficult.
3739     OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size);
3740     break;
3741   }
3742   case AMDGPU::G_TRUNC: {
3743     Register Dst = MI.getOperand(0).getReg();
3744     Register Src = MI.getOperand(1).getReg();
3745     unsigned Bank = getRegBankID(Src, MRI);
3746     unsigned DstSize = getSizeInBits(Dst, MRI, *TRI);
3747     unsigned SrcSize = getSizeInBits(Src, MRI, *TRI);
3748     OpdsMapping[0] = AMDGPU::getValueMapping(Bank, DstSize);
3749     OpdsMapping[1] = AMDGPU::getValueMapping(Bank, SrcSize);
3750     break;
3751   }
3752   case AMDGPU::G_ZEXT:
3753   case AMDGPU::G_SEXT:
3754   case AMDGPU::G_ANYEXT:
3755   case AMDGPU::G_SEXT_INREG: {
3756     Register Dst = MI.getOperand(0).getReg();
3757     Register Src = MI.getOperand(1).getReg();
3758     unsigned DstSize = getSizeInBits(Dst, MRI, *TRI);
3759     unsigned SrcSize = getSizeInBits(Src, MRI, *TRI);
3760 
3761     unsigned DstBank;
3762     const RegisterBank *SrcBank = getRegBank(Src, MRI, *TRI);
3763     assert(SrcBank);
3764     switch (SrcBank->getID()) {
3765     case AMDGPU::SGPRRegBankID:
3766       DstBank = AMDGPU::SGPRRegBankID;
3767       break;
3768     default:
3769       DstBank = AMDGPU::VGPRRegBankID;
3770       break;
3771     }
3772 
3773     // Scalar extend can use 64-bit BFE, but VGPRs require extending to
3774     // 32-bits, and then to 64.
3775     OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(DstBank, DstSize);
3776     OpdsMapping[1] = AMDGPU::getValueMappingSGPR64Only(SrcBank->getID(),
3777                                                        SrcSize);
3778     break;
3779   }
3780   case AMDGPU::G_FCMP: {
3781     unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
3782     unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI);
3783     OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
3784     OpdsMapping[1] = nullptr; // Predicate Operand.
3785     OpdsMapping[2] = AMDGPU::getValueMapping(Op2Bank, Size);
3786     OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3787     break;
3788   }
3789   case AMDGPU::G_STORE: {
3790     assert(MI.getOperand(0).isReg());
3791     unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3792 
3793     // FIXME: We need to specify a different reg bank once scalar stores are
3794     // supported.
3795     const ValueMapping *ValMapping =
3796         AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3797     OpdsMapping[0] = ValMapping;
3798     OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg());
3799     break;
3800   }
3801   case AMDGPU::G_ICMP: {
3802     auto Pred = static_cast<CmpInst::Predicate>(MI.getOperand(1).getPredicate());
3803     unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
3804 
3805     // See if the result register has already been constrained to vcc, which may
3806     // happen due to control flow intrinsic lowering.
3807     unsigned DstBank = getRegBankID(MI.getOperand(0).getReg(), MRI,
3808                                     AMDGPU::SGPRRegBankID);
3809     unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI);
3810     unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI);
3811 
3812     bool CanUseSCC = DstBank == AMDGPU::SGPRRegBankID &&
3813                      Op2Bank == AMDGPU::SGPRRegBankID &&
3814                      Op3Bank == AMDGPU::SGPRRegBankID &&
3815       (Size == 32 || (Size == 64 &&
3816                       (Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE) &&
3817                       Subtarget.hasScalarCompareEq64()));
3818 
3819     DstBank = CanUseSCC ? AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID;
3820     unsigned SrcBank = CanUseSCC ? AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
3821 
3822     // TODO: Use 32-bit for scalar output size.
3823     // SCC results will need to be copied to a 32-bit SGPR virtual register.
3824     const unsigned ResultSize = 1;
3825 
3826     OpdsMapping[0] = AMDGPU::getValueMapping(DstBank, ResultSize);
3827     OpdsMapping[2] = AMDGPU::getValueMapping(SrcBank, Size);
3828     OpdsMapping[3] = AMDGPU::getValueMapping(SrcBank, Size);
3829     break;
3830   }
3831   case AMDGPU::G_EXTRACT_VECTOR_ELT: {
3832     // VGPR index can be used for waterfall when indexing a SGPR vector.
3833     unsigned SrcBankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
3834     unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3835     unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
3836     unsigned IdxSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
3837     unsigned IdxBank = getRegBankID(MI.getOperand(2).getReg(), MRI);
3838     unsigned OutputBankID = regBankUnion(SrcBankID, IdxBank);
3839 
3840     OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(OutputBankID, DstSize);
3841     OpdsMapping[1] = AMDGPU::getValueMapping(SrcBankID, SrcSize);
3842 
3843     // The index can be either if the source vector is VGPR.
3844     OpdsMapping[2] = AMDGPU::getValueMapping(IdxBank, IdxSize);
3845     break;
3846   }
3847   case AMDGPU::G_INSERT_VECTOR_ELT: {
3848     unsigned OutputBankID = isSALUMapping(MI) ?
3849       AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
3850 
3851     unsigned VecSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3852     unsigned InsertSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
3853     unsigned IdxSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits();
3854     unsigned InsertEltBankID = getRegBankID(MI.getOperand(2).getReg(), MRI);
3855     unsigned IdxBankID = getRegBankID(MI.getOperand(3).getReg(), MRI);
3856 
3857     OpdsMapping[0] = AMDGPU::getValueMapping(OutputBankID, VecSize);
3858     OpdsMapping[1] = AMDGPU::getValueMapping(OutputBankID, VecSize);
3859 
3860     // This is a weird case, because we need to break down the mapping based on
3861     // the register bank of a different operand.
3862     if (InsertSize == 64 && OutputBankID == AMDGPU::VGPRRegBankID) {
3863       OpdsMapping[2] = AMDGPU::getValueMappingSplit64(InsertEltBankID,
3864                                                       InsertSize);
3865     } else {
3866       assert(InsertSize == 32 || InsertSize == 64);
3867       OpdsMapping[2] = AMDGPU::getValueMapping(InsertEltBankID, InsertSize);
3868     }
3869 
3870     // The index can be either if the source vector is VGPR.
3871     OpdsMapping[3] = AMDGPU::getValueMapping(IdxBankID, IdxSize);
3872     break;
3873   }
3874   case AMDGPU::G_UNMERGE_VALUES: {
3875     unsigned Bank = getMappingType(MRI, MI);
3876 
3877     // Op1 and Dst should use the same register bank.
3878     // FIXME: Shouldn't this be the default? Why do we need to handle this?
3879     for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
3880       unsigned Size = getSizeInBits(MI.getOperand(i).getReg(), MRI, *TRI);
3881       OpdsMapping[i] = AMDGPU::getValueMapping(Bank, Size);
3882     }
3883     break;
3884   }
3885   case AMDGPU::G_AMDGPU_BUFFER_LOAD:
3886   case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
3887   case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE:
3888   case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
3889   case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT:
3890   case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT:
3891   case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16:
3892   case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT:
3893   case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16:
3894   case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT:
3895   case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16:
3896   case AMDGPU::G_AMDGPU_BUFFER_STORE:
3897   case AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE:
3898   case AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT:
3899   case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT:
3900   case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16: {
3901     OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
3902 
3903     // rsrc
3904     OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
3905 
3906     // vindex
3907     OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
3908 
3909     // voffset
3910     OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
3911 
3912     // soffset
3913     OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
3914 
3915     // Any remaining operands are immediates and were correctly null
3916     // initialized.
3917     break;
3918   }
3919   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP:
3920   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD:
3921   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB:
3922   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN:
3923   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN:
3924   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX:
3925   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX:
3926   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND:
3927   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR:
3928   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR:
3929   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC:
3930   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC:
3931   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD:
3932   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN:
3933   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX: {
3934     // vdata_out
3935     OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
3936 
3937     // vdata_in
3938     OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
3939 
3940     // rsrc
3941     OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
3942 
3943     // vindex
3944     OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
3945 
3946     // voffset
3947     OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
3948 
3949     // soffset
3950     OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
3951 
3952     // Any remaining operands are immediates and were correctly null
3953     // initialized.
3954     break;
3955   }
3956   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP: {
3957     // vdata_out
3958     OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
3959 
3960     // vdata_in
3961     OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
3962 
3963     // cmp
3964     OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
3965 
3966     // rsrc
3967     OpdsMapping[3] = getSGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
3968 
3969     // vindex
3970     OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
3971 
3972     // voffset
3973     OpdsMapping[5] = getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
3974 
3975     // soffset
3976     OpdsMapping[6] = getSGPROpMapping(MI.getOperand(6).getReg(), MRI, *TRI);
3977 
3978     // Any remaining operands are immediates and were correctly null
3979     // initialized.
3980     break;
3981   }
3982   case AMDGPU::G_AMDGPU_S_BUFFER_LOAD: {
3983     // Lie and claim everything is legal, even though some need to be
3984     // SGPRs. applyMapping will have to deal with it as a waterfall loop.
3985     OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
3986     OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
3987 
3988     // We need to convert this to a MUBUF if either the resource of offset is
3989     // VGPR.
3990     unsigned RSrcBank = OpdsMapping[1]->BreakDown[0].RegBank->getID();
3991     unsigned OffsetBank = OpdsMapping[2]->BreakDown[0].RegBank->getID();
3992     unsigned ResultBank = regBankUnion(RSrcBank, OffsetBank);
3993 
3994     unsigned Size0 = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3995     OpdsMapping[0] = AMDGPU::getValueMapping(ResultBank, Size0);
3996     break;
3997   }
3998   case AMDGPU::G_INTRINSIC: {
3999     switch (MI.getIntrinsicID()) {
4000     default:
4001       return getInvalidInstructionMapping();
4002     case Intrinsic::amdgcn_div_fmas:
4003     case Intrinsic::amdgcn_div_fixup:
4004     case Intrinsic::amdgcn_trig_preop:
4005     case Intrinsic::amdgcn_sin:
4006     case Intrinsic::amdgcn_cos:
4007     case Intrinsic::amdgcn_log_clamp:
4008     case Intrinsic::amdgcn_rcp:
4009     case Intrinsic::amdgcn_rcp_legacy:
4010     case Intrinsic::amdgcn_sqrt:
4011     case Intrinsic::amdgcn_rsq:
4012     case Intrinsic::amdgcn_rsq_legacy:
4013     case Intrinsic::amdgcn_rsq_clamp:
4014     case Intrinsic::amdgcn_fmul_legacy:
4015     case Intrinsic::amdgcn_fma_legacy:
4016     case Intrinsic::amdgcn_ldexp:
4017     case Intrinsic::amdgcn_frexp_mant:
4018     case Intrinsic::amdgcn_frexp_exp:
4019     case Intrinsic::amdgcn_fract:
4020     case Intrinsic::amdgcn_cvt_pkrtz:
4021     case Intrinsic::amdgcn_cvt_pknorm_i16:
4022     case Intrinsic::amdgcn_cvt_pknorm_u16:
4023     case Intrinsic::amdgcn_cvt_pk_i16:
4024     case Intrinsic::amdgcn_cvt_pk_u16:
4025     case Intrinsic::amdgcn_fmed3:
4026     case Intrinsic::amdgcn_cubeid:
4027     case Intrinsic::amdgcn_cubema:
4028     case Intrinsic::amdgcn_cubesc:
4029     case Intrinsic::amdgcn_cubetc:
4030     case Intrinsic::amdgcn_sffbh:
4031     case Intrinsic::amdgcn_fmad_ftz:
4032     case Intrinsic::amdgcn_mbcnt_lo:
4033     case Intrinsic::amdgcn_mbcnt_hi:
4034     case Intrinsic::amdgcn_mul_u24:
4035     case Intrinsic::amdgcn_mul_i24:
4036     case Intrinsic::amdgcn_lerp:
4037     case Intrinsic::amdgcn_sad_u8:
4038     case Intrinsic::amdgcn_msad_u8:
4039     case Intrinsic::amdgcn_sad_hi_u8:
4040     case Intrinsic::amdgcn_sad_u16:
4041     case Intrinsic::amdgcn_qsad_pk_u16_u8:
4042     case Intrinsic::amdgcn_mqsad_pk_u16_u8:
4043     case Intrinsic::amdgcn_mqsad_u32_u8:
4044     case Intrinsic::amdgcn_cvt_pk_u8_f32:
4045     case Intrinsic::amdgcn_alignbit:
4046     case Intrinsic::amdgcn_alignbyte:
4047     case Intrinsic::amdgcn_perm:
4048     case Intrinsic::amdgcn_fdot2:
4049     case Intrinsic::amdgcn_sdot2:
4050     case Intrinsic::amdgcn_udot2:
4051     case Intrinsic::amdgcn_sdot4:
4052     case Intrinsic::amdgcn_udot4:
4053     case Intrinsic::amdgcn_sdot8:
4054     case Intrinsic::amdgcn_udot8:
4055       return getDefaultMappingVOP(MI);
4056     case Intrinsic::amdgcn_sbfe:
4057     case Intrinsic::amdgcn_ubfe:
4058       if (isSALUMapping(MI))
4059         return getDefaultMappingSOP(MI);
4060       return getDefaultMappingVOP(MI);
4061     case Intrinsic::amdgcn_ds_swizzle:
4062     case Intrinsic::amdgcn_ds_permute:
4063     case Intrinsic::amdgcn_ds_bpermute:
4064     case Intrinsic::amdgcn_update_dpp:
4065     case Intrinsic::amdgcn_mov_dpp8:
4066     case Intrinsic::amdgcn_mov_dpp:
4067     case Intrinsic::amdgcn_strict_wwm:
4068     case Intrinsic::amdgcn_wwm:
4069     case Intrinsic::amdgcn_strict_wqm:
4070     case Intrinsic::amdgcn_wqm:
4071     case Intrinsic::amdgcn_softwqm:
4072     case Intrinsic::amdgcn_set_inactive:
4073       return getDefaultMappingAllVGPR(MI);
4074     case Intrinsic::amdgcn_kernarg_segment_ptr:
4075     case Intrinsic::amdgcn_s_getpc:
4076     case Intrinsic::amdgcn_groupstaticsize:
4077     case Intrinsic::amdgcn_reloc_constant:
4078     case Intrinsic::returnaddress: {
4079       unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4080       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
4081       break;
4082     }
4083     case Intrinsic::amdgcn_wqm_vote: {
4084       unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4085       OpdsMapping[0] = OpdsMapping[2]
4086         = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size);
4087       break;
4088     }
4089     case Intrinsic::amdgcn_ps_live: {
4090       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
4091       break;
4092     }
4093     case Intrinsic::amdgcn_div_scale: {
4094       unsigned Dst0Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4095       unsigned Dst1Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
4096       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Dst0Size);
4097       OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Dst1Size);
4098 
4099       unsigned SrcSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits();
4100       OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
4101       OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
4102       break;
4103     }
4104     case Intrinsic::amdgcn_class: {
4105       Register Src0Reg = MI.getOperand(2).getReg();
4106       Register Src1Reg = MI.getOperand(3).getReg();
4107       unsigned Src0Size = MRI.getType(Src0Reg).getSizeInBits();
4108       unsigned Src1Size = MRI.getType(Src1Reg).getSizeInBits();
4109       unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4110       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, DstSize);
4111       OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Src0Size);
4112       OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Src1Size);
4113       break;
4114     }
4115     case Intrinsic::amdgcn_icmp:
4116     case Intrinsic::amdgcn_fcmp: {
4117       unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4118       // This is not VCCRegBank because this is not used in boolean contexts.
4119       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
4120       unsigned OpSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
4121       OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, OpSize);
4122       OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, OpSize);
4123       break;
4124     }
4125     case Intrinsic::amdgcn_readlane: {
4126       // This must be an SGPR, but accept a VGPR.
4127       Register IdxReg = MI.getOperand(3).getReg();
4128       unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits();
4129       unsigned IdxBank = getRegBankID(IdxReg, MRI, AMDGPU::SGPRRegBankID);
4130       OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize);
4131       LLVM_FALLTHROUGH;
4132     }
4133     case Intrinsic::amdgcn_readfirstlane: {
4134       unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4135       unsigned SrcSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
4136       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
4137       OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
4138       break;
4139     }
4140     case Intrinsic::amdgcn_writelane: {
4141       unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4142       Register SrcReg = MI.getOperand(2).getReg();
4143       unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits();
4144       unsigned SrcBank = getRegBankID(SrcReg, MRI, AMDGPU::SGPRRegBankID);
4145       Register IdxReg = MI.getOperand(3).getReg();
4146       unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits();
4147       unsigned IdxBank = getRegBankID(IdxReg, MRI, AMDGPU::SGPRRegBankID);
4148       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
4149 
4150       // These 2 must be SGPRs, but accept VGPRs. Readfirstlane will be inserted
4151       // to legalize.
4152       OpdsMapping[2] = AMDGPU::getValueMapping(SrcBank, SrcSize);
4153       OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize);
4154       OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
4155       break;
4156     }
4157     case Intrinsic::amdgcn_if_break: {
4158       unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
4159       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
4160       OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
4161       OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
4162       break;
4163     }
4164     case Intrinsic::amdgcn_permlane16:
4165     case Intrinsic::amdgcn_permlanex16: {
4166       unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
4167       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4168       OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4169       OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4170       OpdsMapping[4] = getSGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4171       OpdsMapping[5] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4172       break;
4173     }
4174     case Intrinsic::amdgcn_mfma_f32_4x4x1f32:
4175     case Intrinsic::amdgcn_mfma_f32_4x4x4f16:
4176     case Intrinsic::amdgcn_mfma_i32_4x4x4i8:
4177     case Intrinsic::amdgcn_mfma_f32_4x4x2bf16:
4178     case Intrinsic::amdgcn_mfma_f32_16x16x1f32:
4179     case Intrinsic::amdgcn_mfma_f32_16x16x4f32:
4180     case Intrinsic::amdgcn_mfma_f32_16x16x4f16:
4181     case Intrinsic::amdgcn_mfma_f32_16x16x16f16:
4182     case Intrinsic::amdgcn_mfma_i32_16x16x4i8:
4183     case Intrinsic::amdgcn_mfma_i32_16x16x16i8:
4184     case Intrinsic::amdgcn_mfma_f32_16x16x2bf16:
4185     case Intrinsic::amdgcn_mfma_f32_16x16x8bf16:
4186     case Intrinsic::amdgcn_mfma_f32_32x32x1f32:
4187     case Intrinsic::amdgcn_mfma_f32_32x32x2f32:
4188     case Intrinsic::amdgcn_mfma_f32_32x32x4f16:
4189     case Intrinsic::amdgcn_mfma_f32_32x32x8f16:
4190     case Intrinsic::amdgcn_mfma_i32_32x32x4i8:
4191     case Intrinsic::amdgcn_mfma_i32_32x32x8i8:
4192     case Intrinsic::amdgcn_mfma_f32_32x32x2bf16:
4193     case Intrinsic::amdgcn_mfma_f32_32x32x4bf16:
4194     case Intrinsic::amdgcn_mfma_f32_32x32x4bf16_1k:
4195     case Intrinsic::amdgcn_mfma_f32_16x16x4bf16_1k:
4196     case Intrinsic::amdgcn_mfma_f32_4x4x4bf16_1k:
4197     case Intrinsic::amdgcn_mfma_f32_32x32x8bf16_1k:
4198     case Intrinsic::amdgcn_mfma_f32_16x16x16bf16_1k:
4199     case Intrinsic::amdgcn_mfma_f64_16x16x4f64:
4200     case Intrinsic::amdgcn_mfma_f64_4x4x4f64: {
4201       // Default for MAI intrinsics.
4202       // srcC can also be an immediate which can be folded later.
4203       // FIXME: Should we eventually add an alternative mapping with AGPR src
4204       // for srcA/srcB?
4205       //
4206       // vdst, srcA, srcB, srcC
4207       OpdsMapping[0] = getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4208       OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4209       OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4210       OpdsMapping[4] = getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4211       break;
4212     }
4213     case Intrinsic::amdgcn_interp_p1:
4214     case Intrinsic::amdgcn_interp_p2:
4215     case Intrinsic::amdgcn_interp_mov:
4216     case Intrinsic::amdgcn_interp_p1_f16:
4217     case Intrinsic::amdgcn_interp_p2_f16: {
4218       const int M0Idx = MI.getNumOperands() - 1;
4219       Register M0Reg = MI.getOperand(M0Idx).getReg();
4220       unsigned M0Bank = getRegBankID(M0Reg, MRI, AMDGPU::SGPRRegBankID);
4221       unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4222 
4223       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
4224       for (int I = 2; I != M0Idx && MI.getOperand(I).isReg(); ++I)
4225         OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4226 
4227       // Must be SGPR, but we must take whatever the original bank is and fix it
4228       // later.
4229       OpdsMapping[M0Idx] = AMDGPU::getValueMapping(M0Bank, 32);
4230       break;
4231     }
4232     case Intrinsic::amdgcn_ballot: {
4233       unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4234       unsigned SrcSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
4235       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
4236       OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, SrcSize);
4237       break;
4238     }
4239     }
4240     break;
4241   }
4242   case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
4243   case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE: {
4244     auto IntrID = MI.getIntrinsicID();
4245     const AMDGPU::RsrcIntrinsic *RSrcIntrin = AMDGPU::lookupRsrcIntrinsic(IntrID);
4246     assert(RSrcIntrin && "missing RsrcIntrinsic for image intrinsic");
4247     // Non-images can have complications from operands that allow both SGPR
4248     // and VGPR. For now it's too complicated to figure out the final opcode
4249     // to derive the register bank from the MCInstrDesc.
4250     assert(RSrcIntrin->IsImage);
4251     return getImageMapping(MRI, MI, RSrcIntrin->RsrcArg);
4252   }
4253   case AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY: {
4254     unsigned N = MI.getNumExplicitOperands() - 2;
4255     OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 128);
4256     OpdsMapping[N] = getSGPROpMapping(MI.getOperand(N).getReg(), MRI, *TRI);
4257     for (unsigned I = 2; I < N; ++I)
4258       OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4259     break;
4260   }
4261   case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: {
4262     auto IntrID = MI.getIntrinsicID();
4263     switch (IntrID) {
4264     case Intrinsic::amdgcn_s_getreg:
4265     case Intrinsic::amdgcn_s_memtime:
4266     case Intrinsic::amdgcn_s_memrealtime:
4267     case Intrinsic::amdgcn_s_get_waveid_in_workgroup: {
4268       unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4269       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
4270       break;
4271     }
4272     case Intrinsic::amdgcn_global_atomic_fadd:
4273     case Intrinsic::amdgcn_global_atomic_csub:
4274     case Intrinsic::amdgcn_global_atomic_fmin:
4275     case Intrinsic::amdgcn_global_atomic_fmax:
4276     case Intrinsic::amdgcn_flat_atomic_fadd:
4277     case Intrinsic::amdgcn_flat_atomic_fmin:
4278     case Intrinsic::amdgcn_flat_atomic_fmax:
4279       return getDefaultMappingAllVGPR(MI);
4280     case Intrinsic::amdgcn_ds_ordered_add:
4281     case Intrinsic::amdgcn_ds_ordered_swap: {
4282       unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4283       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
4284       unsigned M0Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
4285                                  AMDGPU::SGPRRegBankID);
4286       OpdsMapping[2] = AMDGPU::getValueMapping(M0Bank, 32);
4287       OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4288       break;
4289     }
4290     case Intrinsic::amdgcn_ds_append:
4291     case Intrinsic::amdgcn_ds_consume: {
4292       unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4293       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
4294       OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4295       break;
4296     }
4297     case Intrinsic::amdgcn_exp_compr:
4298       OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4299       OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4300       break;
4301     case Intrinsic::amdgcn_exp:
4302       // FIXME: Could we support packed types here?
4303       OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4304       OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4305       OpdsMapping[5] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4306       OpdsMapping[6] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4307       break;
4308     case Intrinsic::amdgcn_s_sendmsg:
4309     case Intrinsic::amdgcn_s_sendmsghalt: {
4310       // This must be an SGPR, but accept a VGPR.
4311       unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
4312                                    AMDGPU::SGPRRegBankID);
4313       OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
4314       break;
4315     }
4316     case Intrinsic::amdgcn_s_setreg: {
4317       // This must be an SGPR, but accept a VGPR.
4318       unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
4319                                    AMDGPU::SGPRRegBankID);
4320       OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
4321       break;
4322     }
4323     case Intrinsic::amdgcn_end_cf: {
4324       unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
4325       OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
4326       break;
4327     }
4328     case Intrinsic::amdgcn_else: {
4329       unsigned WaveSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
4330       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
4331       OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize);
4332       OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize);
4333       break;
4334     }
4335     case Intrinsic::amdgcn_live_mask: {
4336       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
4337       break;
4338     }
4339     case Intrinsic::amdgcn_wqm_demote:
4340     case Intrinsic::amdgcn_kill: {
4341       OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
4342       break;
4343     }
4344     case Intrinsic::amdgcn_raw_buffer_load:
4345     case Intrinsic::amdgcn_raw_tbuffer_load: {
4346       // FIXME: Should make intrinsic ID the last operand of the instruction,
4347       // then this would be the same as store
4348       OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4349       OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4350       OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4351       OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4352       break;
4353     }
4354     case Intrinsic::amdgcn_raw_buffer_store:
4355     case Intrinsic::amdgcn_raw_buffer_store_format:
4356     case Intrinsic::amdgcn_raw_tbuffer_store: {
4357       OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
4358       OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4359       OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4360       OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4361       break;
4362     }
4363     case Intrinsic::amdgcn_struct_buffer_load:
4364     case Intrinsic::amdgcn_struct_tbuffer_load: {
4365       OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4366       OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4367       OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4368       OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4369       OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
4370       break;
4371     }
4372     case Intrinsic::amdgcn_struct_buffer_store:
4373     case Intrinsic::amdgcn_struct_tbuffer_store: {
4374       OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
4375       OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4376       OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4377       OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4378       OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
4379       break;
4380     }
4381     case Intrinsic::amdgcn_init_exec_from_input: {
4382       unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
4383       OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
4384       break;
4385     }
4386     case Intrinsic::amdgcn_ds_gws_init:
4387     case Intrinsic::amdgcn_ds_gws_barrier:
4388     case Intrinsic::amdgcn_ds_gws_sema_br: {
4389       OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4390 
4391       // This must be an SGPR, but accept a VGPR.
4392       unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
4393                                    AMDGPU::SGPRRegBankID);
4394       OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
4395       break;
4396     }
4397     case Intrinsic::amdgcn_ds_gws_sema_v:
4398     case Intrinsic::amdgcn_ds_gws_sema_p:
4399     case Intrinsic::amdgcn_ds_gws_sema_release_all: {
4400       // This must be an SGPR, but accept a VGPR.
4401       unsigned Bank = getRegBankID(MI.getOperand(1).getReg(), MRI,
4402                                    AMDGPU::SGPRRegBankID);
4403       OpdsMapping[1] = AMDGPU::getValueMapping(Bank, 32);
4404       break;
4405     }
4406     default:
4407       return getInvalidInstructionMapping();
4408     }
4409     break;
4410   }
4411   case AMDGPU::G_SELECT: {
4412     unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4413     unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
4414                                     AMDGPU::SGPRRegBankID);
4415     unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI,
4416                                     AMDGPU::SGPRRegBankID);
4417     bool SGPRSrcs = Op2Bank == AMDGPU::SGPRRegBankID &&
4418                     Op3Bank == AMDGPU::SGPRRegBankID;
4419 
4420     unsigned CondBankDefault = SGPRSrcs ?
4421       AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID;
4422     unsigned CondBank = getRegBankID(MI.getOperand(1).getReg(), MRI,
4423                                      CondBankDefault);
4424     if (CondBank == AMDGPU::SGPRRegBankID)
4425       CondBank = SGPRSrcs ? AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID;
4426     else if (CondBank == AMDGPU::VGPRRegBankID)
4427       CondBank = AMDGPU::VCCRegBankID;
4428 
4429     unsigned Bank = SGPRSrcs && CondBank == AMDGPU::SGPRRegBankID ?
4430       AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
4431 
4432     assert(CondBank == AMDGPU::VCCRegBankID || CondBank == AMDGPU::SGPRRegBankID);
4433 
4434     // TODO: Should report 32-bit for scalar condition type.
4435     if (Size == 64) {
4436       OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
4437       OpdsMapping[1] = AMDGPU::getValueMapping(CondBank, 1);
4438       OpdsMapping[2] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
4439       OpdsMapping[3] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
4440     } else {
4441       OpdsMapping[0] = AMDGPU::getValueMapping(Bank, Size);
4442       OpdsMapping[1] = AMDGPU::getValueMapping(CondBank, 1);
4443       OpdsMapping[2] = AMDGPU::getValueMapping(Bank, Size);
4444       OpdsMapping[3] = AMDGPU::getValueMapping(Bank, Size);
4445     }
4446 
4447     break;
4448   }
4449 
4450   case AMDGPU::G_LOAD:
4451   case AMDGPU::G_ZEXTLOAD:
4452   case AMDGPU::G_SEXTLOAD:
4453     return getInstrMappingForLoad(MI);
4454 
4455   case AMDGPU::G_ATOMICRMW_XCHG:
4456   case AMDGPU::G_ATOMICRMW_ADD:
4457   case AMDGPU::G_ATOMICRMW_SUB:
4458   case AMDGPU::G_ATOMICRMW_AND:
4459   case AMDGPU::G_ATOMICRMW_OR:
4460   case AMDGPU::G_ATOMICRMW_XOR:
4461   case AMDGPU::G_ATOMICRMW_MAX:
4462   case AMDGPU::G_ATOMICRMW_MIN:
4463   case AMDGPU::G_ATOMICRMW_UMAX:
4464   case AMDGPU::G_ATOMICRMW_UMIN:
4465   case AMDGPU::G_ATOMICRMW_FADD:
4466   case AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG:
4467   case AMDGPU::G_AMDGPU_ATOMIC_INC:
4468   case AMDGPU::G_AMDGPU_ATOMIC_DEC:
4469   case AMDGPU::G_AMDGPU_ATOMIC_FMIN:
4470   case AMDGPU::G_AMDGPU_ATOMIC_FMAX: {
4471     OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4472     OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg());
4473     OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4474     break;
4475   }
4476   case AMDGPU::G_ATOMIC_CMPXCHG: {
4477     OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4478     OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg());
4479     OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4480     OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4481     break;
4482   }
4483   case AMDGPU::G_BRCOND: {
4484     unsigned Bank = getRegBankID(MI.getOperand(0).getReg(), MRI,
4485                                  AMDGPU::SGPRRegBankID);
4486     assert(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1);
4487     if (Bank != AMDGPU::SGPRRegBankID)
4488       Bank = AMDGPU::VCCRegBankID;
4489 
4490     OpdsMapping[0] = AMDGPU::getValueMapping(Bank, 1);
4491     break;
4492   }
4493   }
4494 
4495   return getInstructionMapping(/*ID*/1, /*Cost*/1,
4496                                getOperandsMapping(OpdsMapping),
4497                                MI.getNumOperands());
4498 }
4499