1 //===- AMDGPURegisterBankInfo.cpp -------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements the targeting of the RegisterBankInfo class for
10 /// AMDGPU.
11 ///
12 /// \par
13 ///
14 /// AMDGPU has unique register bank constraints that require special high level
15 /// strategies to deal with. There are two main true physical register banks
16 /// VGPR (vector), and SGPR (scalar). Additionally the VCC register bank is a
17 /// sort of pseudo-register bank needed to represent SGPRs used in a vector
18 /// boolean context. There is also the AGPR bank, which is a special purpose
19 /// physical register bank present on some subtargets.
20 ///
21 /// Copying from VGPR to SGPR is generally illegal, unless the value is known to
22 /// be uniform. It is generally not valid to legalize operands by inserting
23 /// copies as on other targets. Operations which require uniform, SGPR operands
24 /// generally require scalarization by repeatedly executing the instruction,
25 /// activating each set of lanes using a unique set of input values. This is
26 /// referred to as a waterfall loop.
27 ///
28 /// \par Booleans
29 ///
30 /// Booleans (s1 values) requires special consideration. A vector compare result
31 /// is naturally a bitmask with one bit per lane, in a 32 or 64-bit
32 /// register. These are represented with the VCC bank. During selection, we need
33 /// to be able to unambiguously go back from a register class to a register
34 /// bank. To distinguish whether an SGPR should use the SGPR or VCC register
35 /// bank, we need to know the use context type. An SGPR s1 value always means a
36 /// VCC bank value, otherwise it will be the SGPR bank. A scalar compare sets
37 /// SCC, which is a 1-bit unaddressable register. This will need to be copied to
38 /// a 32-bit virtual register. Taken together, this means we need to adjust the
39 /// type of boolean operations to be regbank legal. All SALU booleans need to be
40 /// widened to 32-bits, and all VALU booleans need to be s1 values.
41 ///
42 /// A noteworthy exception to the s1-means-vcc rule is for legalization artifact
43 /// casts. G_TRUNC s1 results, and G_SEXT/G_ZEXT/G_ANYEXT sources are never vcc
44 /// bank. A non-boolean source (such as a truncate from a 1-bit load from
45 /// memory) will require a copy to the VCC bank which will require clearing the
46 /// high bits and inserting a compare.
47 ///
48 /// \par Constant bus restriction
49 ///
50 /// VALU instructions have a limitation known as the constant bus
51 /// restriction. Most VALU instructions can use SGPR operands, but may read at
52 /// most 1 SGPR or constant literal value (this to 2 in gfx10 for most
53 /// instructions). This is one unique SGPR, so the same SGPR may be used for
54 /// multiple operands. From a register bank perspective, any combination of
55 /// operands should be legal as an SGPR, but this is contextually dependent on
56 /// the SGPR operands all being the same register. There is therefore optimal to
57 /// choose the SGPR with the most uses to minimize the number of copies.
58 ///
59 /// We avoid trying to solve this problem in RegBankSelect. Any VALU G_*
60 /// operation should have its source operands all mapped to VGPRs (except for
61 /// VCC), inserting copies from any SGPR operands. This the most trivial legal
62 /// mapping. Anything beyond the simplest 1:1 instruction selection would be too
63 /// complicated to solve here. Every optimization pattern or instruction
64 /// selected to multiple outputs would have to enforce this rule, and there
65 /// would be additional complexity in tracking this rule for every G_*
66 /// operation. By forcing all inputs to VGPRs, it also simplifies the task of
67 /// picking the optimal operand combination from a post-isel optimization pass.
68 ///
69 //===----------------------------------------------------------------------===//
70 
71 #include "AMDGPURegisterBankInfo.h"
72 
73 #include "AMDGPU.h"
74 #include "AMDGPUGlobalISelUtils.h"
75 #include "AMDGPUInstrInfo.h"
76 #include "GCNSubtarget.h"
77 #include "SIMachineFunctionInfo.h"
78 #include "SIRegisterInfo.h"
79 #include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
80 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
81 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
82 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
83 #include "llvm/CodeGen/RegisterBank.h"
84 #include "llvm/IR/IntrinsicsAMDGPU.h"
85 
86 #define GET_TARGET_REGBANK_IMPL
87 #include "AMDGPUGenRegisterBank.inc"
88 
89 // This file will be TableGen'ed at some point.
90 #include "AMDGPUGenRegisterBankInfo.def"
91 
92 using namespace llvm;
93 using namespace MIPatternMatch;
94 
95 namespace {
96 
97 // Observer to apply a register bank to new registers created by LegalizerHelper.
98 class ApplyRegBankMapping final : public GISelChangeObserver {
99 private:
100   const AMDGPURegisterBankInfo &RBI;
101   MachineRegisterInfo &MRI;
102   const RegisterBank *NewBank;
103   SmallVector<MachineInstr *, 4> NewInsts;
104 
105 public:
106   ApplyRegBankMapping(const AMDGPURegisterBankInfo &RBI_,
107                       MachineRegisterInfo &MRI_, const RegisterBank *RB)
108     : RBI(RBI_), MRI(MRI_), NewBank(RB) {}
109 
110   ~ApplyRegBankMapping() {
111     for (MachineInstr *MI : NewInsts)
112       applyBank(*MI);
113   }
114 
115   /// Set any registers that don't have a set register class or bank to SALU.
116   void applyBank(MachineInstr &MI) {
117     const unsigned Opc = MI.getOpcode();
118     if (Opc == AMDGPU::G_ANYEXT || Opc == AMDGPU::G_ZEXT ||
119         Opc == AMDGPU::G_SEXT) {
120       // LegalizerHelper wants to use the basic legalization artifacts when
121       // widening etc. We don't handle selection with vcc in artifact sources,
122       // so we need to use a select instead to handle these properly.
123       Register DstReg = MI.getOperand(0).getReg();
124       Register SrcReg = MI.getOperand(1).getReg();
125       const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, MRI, *RBI.TRI);
126       if (SrcBank == &AMDGPU::VCCRegBank) {
127         const LLT S32 = LLT::scalar(32);
128         assert(MRI.getType(SrcReg) == LLT::scalar(1));
129         assert(MRI.getType(DstReg) == S32);
130         assert(NewBank == &AMDGPU::VGPRRegBank);
131 
132         // Replace the extension with a select, which really uses the boolean
133         // source.
134         MachineIRBuilder B(MI);
135         auto True = B.buildConstant(S32, Opc == AMDGPU::G_SEXT ? -1 : 1);
136         auto False = B.buildConstant(S32, 0);
137         B.buildSelect(DstReg, SrcReg, True, False);
138         MRI.setRegBank(True.getReg(0), *NewBank);
139         MRI.setRegBank(False.getReg(0), *NewBank);
140         MI.eraseFromParent();
141       }
142 
143       assert(!MRI.getRegClassOrRegBank(DstReg));
144       MRI.setRegBank(DstReg, *NewBank);
145       return;
146     }
147 
148 #ifndef NDEBUG
149     if (Opc == AMDGPU::G_TRUNC) {
150       Register DstReg = MI.getOperand(0).getReg();
151       const RegisterBank *DstBank = RBI.getRegBank(DstReg, MRI, *RBI.TRI);
152       assert(DstBank != &AMDGPU::VCCRegBank);
153     }
154 #endif
155 
156     for (MachineOperand &Op : MI.operands()) {
157       if (!Op.isReg())
158         continue;
159 
160       // We may see physical registers if building a real MI
161       Register Reg = Op.getReg();
162       if (Reg.isPhysical() || MRI.getRegClassOrRegBank(Reg))
163         continue;
164 
165       const RegisterBank *RB = NewBank;
166       if (MRI.getType(Reg) == LLT::scalar(1)) {
167         assert(NewBank == &AMDGPU::VGPRRegBank &&
168                "s1 operands should only be used for vector bools");
169         assert((MI.getOpcode() != AMDGPU::G_TRUNC &&
170                 MI.getOpcode() != AMDGPU::G_ANYEXT) &&
171                "not expecting legalization artifacts here");
172         RB = &AMDGPU::VCCRegBank;
173       }
174 
175       MRI.setRegBank(Reg, *RB);
176     }
177   }
178 
179   void erasingInstr(MachineInstr &MI) override {}
180 
181   void createdInstr(MachineInstr &MI) override {
182     // At this point, the instruction was just inserted and has no operands.
183     NewInsts.push_back(&MI);
184   }
185 
186   void changingInstr(MachineInstr &MI) override {}
187   void changedInstr(MachineInstr &MI) override {
188     // FIXME: In principle we should probably add the instruction to NewInsts,
189     // but the way the LegalizerHelper uses the observer, we will always see the
190     // registers we need to set the regbank on also referenced in a new
191     // instruction.
192   }
193 };
194 
195 }
196 AMDGPURegisterBankInfo::AMDGPURegisterBankInfo(const GCNSubtarget &ST)
197     : Subtarget(ST), TRI(Subtarget.getRegisterInfo()),
198       TII(Subtarget.getInstrInfo()) {
199 
200   // HACK: Until this is fully tablegen'd.
201   static llvm::once_flag InitializeRegisterBankFlag;
202 
203   static auto InitializeRegisterBankOnce = [this]() {
204     assert(&getRegBank(AMDGPU::SGPRRegBankID) == &AMDGPU::SGPRRegBank &&
205            &getRegBank(AMDGPU::VGPRRegBankID) == &AMDGPU::VGPRRegBank &&
206            &getRegBank(AMDGPU::AGPRRegBankID) == &AMDGPU::AGPRRegBank);
207     (void)this;
208   };
209 
210   llvm::call_once(InitializeRegisterBankFlag, InitializeRegisterBankOnce);
211 }
212 
213 static bool isVectorRegisterBank(const RegisterBank &Bank) {
214   unsigned BankID = Bank.getID();
215   return BankID == AMDGPU::VGPRRegBankID || BankID == AMDGPU::AGPRRegBankID;
216 }
217 
218 unsigned AMDGPURegisterBankInfo::copyCost(const RegisterBank &Dst,
219                                           const RegisterBank &Src,
220                                           unsigned Size) const {
221   // TODO: Should there be a UniformVGPRRegBank which can use readfirstlane?
222   if (Dst.getID() == AMDGPU::SGPRRegBankID &&
223       (isVectorRegisterBank(Src) || Src.getID() == AMDGPU::VCCRegBankID)) {
224     return std::numeric_limits<unsigned>::max();
225   }
226 
227   // Bool values are tricky, because the meaning is based on context. The SCC
228   // and VCC banks are for the natural scalar and vector conditions produced by
229   // a compare.
230   //
231   // Legalization doesn't know about the necessary context, so an s1 use may
232   // have been a truncate from an arbitrary value, in which case a copy (lowered
233   // as a compare with 0) needs to be inserted.
234   if (Size == 1 &&
235       (Dst.getID() == AMDGPU::SGPRRegBankID) &&
236       (isVectorRegisterBank(Src) ||
237        Src.getID() == AMDGPU::SGPRRegBankID ||
238        Src.getID() == AMDGPU::VCCRegBankID))
239     return std::numeric_limits<unsigned>::max();
240 
241   // There is no direct copy between AGPRs.
242   if (Dst.getID() == AMDGPU::AGPRRegBankID &&
243       Src.getID() == AMDGPU::AGPRRegBankID)
244     return 4;
245 
246   return RegisterBankInfo::copyCost(Dst, Src, Size);
247 }
248 
249 unsigned AMDGPURegisterBankInfo::getBreakDownCost(
250   const ValueMapping &ValMapping,
251   const RegisterBank *CurBank) const {
252   // Check if this is a breakdown for G_LOAD to move the pointer from SGPR to
253   // VGPR.
254   // FIXME: Is there a better way to do this?
255   if (ValMapping.NumBreakDowns >= 2 || ValMapping.BreakDown[0].Length >= 64)
256     return 10; // This is expensive.
257 
258   assert(ValMapping.NumBreakDowns == 2 &&
259          ValMapping.BreakDown[0].Length == 32 &&
260          ValMapping.BreakDown[0].StartIdx == 0 &&
261          ValMapping.BreakDown[1].Length == 32 &&
262          ValMapping.BreakDown[1].StartIdx == 32 &&
263          ValMapping.BreakDown[0].RegBank == ValMapping.BreakDown[1].RegBank);
264 
265   // 32-bit extract of a 64-bit value is just access of a subregister, so free.
266   // TODO: Cost of 0 hits assert, though it's not clear it's what we really
267   // want.
268 
269   // TODO: 32-bit insert to a 64-bit SGPR may incur a non-free copy due to SGPR
270   // alignment restrictions, but this probably isn't important.
271   return 1;
272 }
273 
274 const RegisterBank &
275 AMDGPURegisterBankInfo::getRegBankFromRegClass(const TargetRegisterClass &RC,
276                                                LLT Ty) const {
277   if (&RC == &AMDGPU::SReg_1RegClass)
278     return AMDGPU::VCCRegBank;
279 
280   // We promote real scalar booleans to SReg_32. Any SGPR using s1 is really a
281   // VCC-like use.
282   if (TRI->isSGPRClass(&RC)) {
283     // FIXME: This probably came from a copy from a physical register, which
284     // should be inferable from the copied to-type. We don't have many boolean
285     // physical register constraints so just assume a normal SGPR for now.
286     if (!Ty.isValid())
287       return AMDGPU::SGPRRegBank;
288 
289     return Ty == LLT::scalar(1) ? AMDGPU::VCCRegBank : AMDGPU::SGPRRegBank;
290   }
291 
292   return TRI->isAGPRClass(&RC) ? AMDGPU::AGPRRegBank : AMDGPU::VGPRRegBank;
293 }
294 
295 template <unsigned NumOps>
296 RegisterBankInfo::InstructionMappings
297 AMDGPURegisterBankInfo::addMappingFromTable(
298     const MachineInstr &MI, const MachineRegisterInfo &MRI,
299     const std::array<unsigned, NumOps> RegSrcOpIdx,
300     ArrayRef<OpRegBankEntry<NumOps>> Table) const {
301 
302   InstructionMappings AltMappings;
303 
304   SmallVector<const ValueMapping *, 10> Operands(MI.getNumOperands());
305 
306   unsigned Sizes[NumOps];
307   for (unsigned I = 0; I < NumOps; ++I) {
308     Register Reg = MI.getOperand(RegSrcOpIdx[I]).getReg();
309     Sizes[I] = getSizeInBits(Reg, MRI, *TRI);
310   }
311 
312   for (unsigned I = 0, E = MI.getNumExplicitDefs(); I != E; ++I) {
313     unsigned SizeI = getSizeInBits(MI.getOperand(I).getReg(), MRI, *TRI);
314     Operands[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SizeI);
315   }
316 
317   // getInstrMapping's default mapping uses ID 1, so start at 2.
318   unsigned MappingID = 2;
319   for (const auto &Entry : Table) {
320     for (unsigned I = 0; I < NumOps; ++I) {
321       int OpIdx = RegSrcOpIdx[I];
322       Operands[OpIdx] = AMDGPU::getValueMapping(Entry.RegBanks[I], Sizes[I]);
323     }
324 
325     AltMappings.push_back(&getInstructionMapping(MappingID++, Entry.Cost,
326                                                  getOperandsMapping(Operands),
327                                                  Operands.size()));
328   }
329 
330   return AltMappings;
331 }
332 
333 RegisterBankInfo::InstructionMappings
334 AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsic(
335     const MachineInstr &MI, const MachineRegisterInfo &MRI) const {
336   switch (MI.getIntrinsicID()) {
337   case Intrinsic::amdgcn_readlane: {
338     static const OpRegBankEntry<3> Table[2] = {
339       // Perfectly legal.
340       { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },
341 
342       // Need a readfirstlane for the index.
343       { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 }
344     };
345 
346     const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 2, 3 } };
347     return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
348   }
349   case Intrinsic::amdgcn_writelane: {
350     static const OpRegBankEntry<4> Table[4] = {
351       // Perfectly legal.
352       { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },
353 
354       // Need readfirstlane of first op
355       { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 },
356 
357       // Need readfirstlane of second op
358       { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 },
359 
360       // Need readfirstlane of both ops
361       { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 3 }
362     };
363 
364     // rsrc, voffset, offset
365     const std::array<unsigned, 4> RegSrcOpIdx = { { 0, 2, 3, 4 } };
366     return addMappingFromTable<4>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
367   }
368   default:
369     return RegisterBankInfo::getInstrAlternativeMappings(MI);
370   }
371 }
372 
373 RegisterBankInfo::InstructionMappings
374 AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsicWSideEffects(
375     const MachineInstr &MI, const MachineRegisterInfo &MRI) const {
376 
377   switch (MI.getIntrinsicID()) {
378   case Intrinsic::amdgcn_s_buffer_load: {
379     static const OpRegBankEntry<2> Table[4] = {
380       // Perfectly legal.
381       { { AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },
382 
383       // Only need 1 register in loop
384       { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 300 },
385 
386       // Have to waterfall the resource.
387       { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1000 },
388 
389       // Have to waterfall the resource, and the offset.
390       { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1500 }
391     };
392 
393     // rsrc, offset
394     const std::array<unsigned, 2> RegSrcOpIdx = { { 2, 3 } };
395     return addMappingFromTable<2>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
396   }
397   case Intrinsic::amdgcn_ds_ordered_add:
398   case Intrinsic::amdgcn_ds_ordered_swap: {
399     // VGPR = M0, VGPR
400     static const OpRegBankEntry<3> Table[2] = {
401       // Perfectly legal.
402       { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID  }, 1 },
403 
404       // Need a readfirstlane for m0
405       { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 }
406     };
407 
408     const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 2, 3 } };
409     return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
410   }
411   case Intrinsic::amdgcn_s_sendmsg:
412   case Intrinsic::amdgcn_s_sendmsghalt: {
413     // FIXME: Should have no register for immediate
414     static const OpRegBankEntry<1> Table[2] = {
415       // Perfectly legal.
416       { { AMDGPU::SGPRRegBankID }, 1 },
417 
418       // Need readlane
419       { { AMDGPU::VGPRRegBankID }, 3 }
420     };
421 
422     const std::array<unsigned, 1> RegSrcOpIdx = { { 2 } };
423     return addMappingFromTable<1>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
424   }
425   default:
426     return RegisterBankInfo::getInstrAlternativeMappings(MI);
427   }
428 }
429 
430 // FIXME: Returns uniform if there's no source value information. This is
431 // probably wrong.
432 static bool isScalarLoadLegal(const MachineInstr &MI) {
433   if (!MI.hasOneMemOperand())
434     return false;
435 
436   const MachineMemOperand *MMO = *MI.memoperands_begin();
437   const unsigned AS = MMO->getAddrSpace();
438   const bool IsConst = AS == AMDGPUAS::CONSTANT_ADDRESS ||
439                        AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT;
440   // Require 4-byte alignment.
441   return MMO->getAlign() >= Align(4) &&
442          // Can't do a scalar atomic load.
443          !MMO->isAtomic() &&
444          // Don't use scalar loads for volatile accesses to non-constant address
445          // spaces.
446          (IsConst || !MMO->isVolatile()) &&
447          // Memory must be known constant, or not written before this load.
448          (IsConst || MMO->isInvariant() || (MMO->getFlags() & MONoClobber)) &&
449          AMDGPUInstrInfo::isUniformMMO(MMO);
450 }
451 
452 RegisterBankInfo::InstructionMappings
453 AMDGPURegisterBankInfo::getInstrAlternativeMappings(
454     const MachineInstr &MI) const {
455 
456   const MachineFunction &MF = *MI.getParent()->getParent();
457   const MachineRegisterInfo &MRI = MF.getRegInfo();
458 
459 
460   InstructionMappings AltMappings;
461   switch (MI.getOpcode()) {
462   case TargetOpcode::G_CONSTANT: {
463     unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
464     if (Size == 1) {
465       static const OpRegBankEntry<1> Table[3] = {
466         { { AMDGPU::VGPRRegBankID }, 1 },
467         { { AMDGPU::SGPRRegBankID }, 1 },
468         { { AMDGPU::VCCRegBankID }, 1 }
469       };
470 
471       return addMappingFromTable<1>(MI, MRI, {{ 0 }}, Table);
472     }
473 
474     LLVM_FALLTHROUGH;
475   }
476   case TargetOpcode::G_FCONSTANT:
477   case TargetOpcode::G_FRAME_INDEX:
478   case TargetOpcode::G_GLOBAL_VALUE: {
479     static const OpRegBankEntry<1> Table[2] = {
480       { { AMDGPU::VGPRRegBankID }, 1 },
481       { { AMDGPU::SGPRRegBankID }, 1 }
482     };
483 
484     return addMappingFromTable<1>(MI, MRI, {{ 0 }}, Table);
485   }
486   case TargetOpcode::G_AND:
487   case TargetOpcode::G_OR:
488   case TargetOpcode::G_XOR: {
489     unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
490 
491     if (Size == 1) {
492       // s_{and|or|xor}_b32 set scc when the result of the 32-bit op is not 0.
493       const InstructionMapping &SCCMapping = getInstructionMapping(
494         1, 1, getOperandsMapping(
495           {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32),
496            AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32),
497            AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32)}),
498         3); // Num Operands
499       AltMappings.push_back(&SCCMapping);
500 
501       const InstructionMapping &VCCMapping0 = getInstructionMapping(
502         2, 1, getOperandsMapping(
503           {AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size),
504            AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size),
505            AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size)}),
506         3); // Num Operands
507       AltMappings.push_back(&VCCMapping0);
508       return AltMappings;
509     }
510 
511     if (Size != 64)
512       break;
513 
514     const InstructionMapping &SSMapping = getInstructionMapping(
515       1, 1, getOperandsMapping(
516         {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
517          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
518          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
519       3); // Num Operands
520     AltMappings.push_back(&SSMapping);
521 
522     const InstructionMapping &VVMapping = getInstructionMapping(
523       2, 2, getOperandsMapping(
524         {AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
525          AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
526          AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}),
527       3); // Num Operands
528     AltMappings.push_back(&VVMapping);
529     break;
530   }
531   case TargetOpcode::G_LOAD:
532   case TargetOpcode::G_ZEXTLOAD:
533   case TargetOpcode::G_SEXTLOAD: {
534     unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
535     LLT PtrTy = MRI.getType(MI.getOperand(1).getReg());
536     unsigned PtrSize = PtrTy.getSizeInBits();
537     unsigned AS = PtrTy.getAddressSpace();
538 
539     if ((AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::REGION_ADDRESS &&
540          AS != AMDGPUAS::PRIVATE_ADDRESS) &&
541         isScalarLoadLegal(MI)) {
542       const InstructionMapping &SSMapping = getInstructionMapping(
543           1, 1, getOperandsMapping(
544                     {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
545                      AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize)}),
546           2); // Num Operands
547       AltMappings.push_back(&SSMapping);
548     }
549 
550     const InstructionMapping &VVMapping = getInstructionMapping(
551         2, 1,
552         getOperandsMapping(
553             {AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
554              AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize)}),
555         2); // Num Operands
556     AltMappings.push_back(&VVMapping);
557 
558     // It may be possible to have a vgpr = load sgpr mapping here, because
559     // the mubuf instructions support this kind of load, but probably for only
560     // gfx7 and older.  However, the addressing mode matching in the instruction
561     // selector should be able to do a better job of detecting and selecting
562     // these kinds of loads from the vgpr = load vgpr mapping.
563 
564     return AltMappings;
565 
566   }
567   case TargetOpcode::G_SELECT: {
568     unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
569     const InstructionMapping &SSMapping = getInstructionMapping(1, 1,
570       getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
571                           AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1),
572                           AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
573                           AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
574       4); // Num Operands
575     AltMappings.push_back(&SSMapping);
576 
577     const InstructionMapping &VVMapping = getInstructionMapping(2, 1,
578       getOperandsMapping({AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
579                           AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
580                           AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
581                           AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}),
582       4); // Num Operands
583     AltMappings.push_back(&VVMapping);
584 
585     return AltMappings;
586   }
587   case TargetOpcode::G_UADDE:
588   case TargetOpcode::G_USUBE:
589   case TargetOpcode::G_SADDE:
590   case TargetOpcode::G_SSUBE: {
591     unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
592     const InstructionMapping &SSMapping = getInstructionMapping(1, 1,
593       getOperandsMapping(
594         {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
595          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1),
596          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
597          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
598          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1)}),
599       5); // Num Operands
600     AltMappings.push_back(&SSMapping);
601 
602     const InstructionMapping &VVMapping = getInstructionMapping(2, 1,
603       getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
604                           AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
605                           AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
606                           AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
607                           AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1)}),
608       5); // Num Operands
609     AltMappings.push_back(&VVMapping);
610     return AltMappings;
611   }
612   case AMDGPU::G_BRCOND: {
613     assert(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1);
614 
615     // TODO: Change type to 32 for scalar
616     const InstructionMapping &SMapping = getInstructionMapping(
617       1, 1, getOperandsMapping(
618         {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1), nullptr}),
619       2); // Num Operands
620     AltMappings.push_back(&SMapping);
621 
622     const InstructionMapping &VMapping = getInstructionMapping(
623       1, 1, getOperandsMapping(
624         {AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1), nullptr }),
625       2); // Num Operands
626     AltMappings.push_back(&VMapping);
627     return AltMappings;
628   }
629   case AMDGPU::G_INTRINSIC:
630     return getInstrAlternativeMappingsIntrinsic(MI, MRI);
631   case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS:
632     return getInstrAlternativeMappingsIntrinsicWSideEffects(MI, MRI);
633   default:
634     break;
635   }
636   return RegisterBankInfo::getInstrAlternativeMappings(MI);
637 }
638 
639 void AMDGPURegisterBankInfo::split64BitValueForMapping(
640   MachineIRBuilder &B,
641   SmallVector<Register, 2> &Regs,
642   LLT HalfTy,
643   Register Reg) const {
644   assert(HalfTy.getSizeInBits() == 32);
645   MachineRegisterInfo *MRI = B.getMRI();
646   Register LoLHS = MRI->createGenericVirtualRegister(HalfTy);
647   Register HiLHS = MRI->createGenericVirtualRegister(HalfTy);
648   const RegisterBank *Bank = getRegBank(Reg, *MRI, *TRI);
649   MRI->setRegBank(LoLHS, *Bank);
650   MRI->setRegBank(HiLHS, *Bank);
651 
652   Regs.push_back(LoLHS);
653   Regs.push_back(HiLHS);
654 
655   B.buildInstr(AMDGPU::G_UNMERGE_VALUES)
656     .addDef(LoLHS)
657     .addDef(HiLHS)
658     .addUse(Reg);
659 }
660 
661 /// Replace the current type each register in \p Regs has with \p NewTy
662 static void setRegsToType(MachineRegisterInfo &MRI, ArrayRef<Register> Regs,
663                           LLT NewTy) {
664   for (Register Reg : Regs) {
665     assert(MRI.getType(Reg).getSizeInBits() == NewTy.getSizeInBits());
666     MRI.setType(Reg, NewTy);
667   }
668 }
669 
670 static LLT getHalfSizedType(LLT Ty) {
671   if (Ty.isVector()) {
672     assert(Ty.getElementCount().isKnownMultipleOf(2));
673     return LLT::scalarOrVector(Ty.getElementCount().divideCoefficientBy(2),
674                                Ty.getElementType());
675   }
676 
677   assert(Ty.getScalarSizeInBits() % 2 == 0);
678   return LLT::scalar(Ty.getScalarSizeInBits() / 2);
679 }
680 
681 // Build one or more V_READFIRSTLANE_B32 instructions to move the given vector
682 // source value into a scalar register.
683 Register AMDGPURegisterBankInfo::buildReadFirstLane(MachineIRBuilder &B,
684                                                     MachineRegisterInfo &MRI,
685                                                     Register Src) const {
686   LLT Ty = MRI.getType(Src);
687   const RegisterBank *Bank = getRegBank(Src, MRI, *TRI);
688 
689   if (Bank == &AMDGPU::SGPRRegBank)
690     return Src;
691 
692   unsigned Bits = Ty.getSizeInBits();
693   assert(Bits % 32 == 0);
694 
695   if (Bank != &AMDGPU::VGPRRegBank) {
696     // We need to copy from AGPR to VGPR
697     Src = B.buildCopy(Ty, Src).getReg(0);
698     MRI.setRegBank(Src, AMDGPU::VGPRRegBank);
699   }
700 
701   LLT S32 = LLT::scalar(32);
702   unsigned NumParts = Bits / 32;
703   SmallVector<Register, 8> SrcParts;
704   SmallVector<Register, 8> DstParts;
705 
706   if (Bits == 32) {
707     SrcParts.push_back(Src);
708   } else {
709     auto Unmerge = B.buildUnmerge(S32, Src);
710     for (unsigned i = 0; i < NumParts; ++i)
711       SrcParts.push_back(Unmerge.getReg(i));
712   }
713 
714   for (unsigned i = 0; i < NumParts; ++i) {
715     Register SrcPart = SrcParts[i];
716     Register DstPart = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
717     MRI.setType(DstPart, NumParts == 1 ? Ty : S32);
718 
719     const TargetRegisterClass *Constrained =
720         constrainGenericRegister(SrcPart, AMDGPU::VGPR_32RegClass, MRI);
721     (void)Constrained;
722     assert(Constrained && "Failed to constrain readfirstlane src reg");
723 
724     B.buildInstr(AMDGPU::V_READFIRSTLANE_B32, {DstPart}, {SrcPart});
725 
726     DstParts.push_back(DstPart);
727   }
728 
729   if (Bits == 32)
730     return DstParts[0];
731 
732   Register Dst = B.buildMerge(Ty, DstParts).getReg(0);
733   MRI.setRegBank(Dst, AMDGPU::SGPRRegBank);
734   return Dst;
735 }
736 
737 /// Legalize instruction \p MI where operands in \p OpIndices must be SGPRs. If
738 /// any of the required SGPR operands are VGPRs, perform a waterfall loop to
739 /// execute the instruction for each unique combination of values in all lanes
740 /// in the wave. The block will be split such that rest of the instructions are
741 /// moved to a new block.
742 ///
743 /// Essentially performs this loop:
744 //
745 /// Save Execution Mask
746 /// For (Lane : Wavefront) {
747 ///   Enable Lane, Disable all other lanes
748 ///   SGPR = read SGPR value for current lane from VGPR
749 ///   VGPRResult[Lane] = use_op SGPR
750 /// }
751 /// Restore Execution Mask
752 ///
753 /// There is additional complexity to try for compare values to identify the
754 /// unique values used.
755 bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
756   MachineIRBuilder &B,
757   iterator_range<MachineBasicBlock::iterator> Range,
758   SmallSet<Register, 4> &SGPROperandRegs,
759   MachineRegisterInfo &MRI) const {
760 
761   // Track use registers which have already been expanded with a readfirstlane
762   // sequence. This may have multiple uses if moving a sequence.
763   DenseMap<Register, Register> WaterfalledRegMap;
764 
765   MachineBasicBlock &MBB = B.getMBB();
766   MachineFunction *MF = &B.getMF();
767 
768   const TargetRegisterClass *WaveRC = TRI->getWaveMaskRegClass();
769   const unsigned MovExecOpc =
770       Subtarget.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
771   const unsigned MovExecTermOpc =
772       Subtarget.isWave32() ? AMDGPU::S_MOV_B32_term : AMDGPU::S_MOV_B64_term;
773 
774   const unsigned XorTermOpc = Subtarget.isWave32() ?
775     AMDGPU::S_XOR_B32_term : AMDGPU::S_XOR_B64_term;
776   const unsigned AndSaveExecOpc =  Subtarget.isWave32() ?
777     AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64;
778   const unsigned ExecReg =  Subtarget.isWave32() ?
779     AMDGPU::EXEC_LO : AMDGPU::EXEC;
780 
781 #ifndef NDEBUG
782   const int OrigRangeSize = std::distance(Range.begin(), Range.end());
783 #endif
784 
785   Register SaveExecReg = MRI.createVirtualRegister(WaveRC);
786   Register InitSaveExecReg = MRI.createVirtualRegister(WaveRC);
787 
788   // Don't bother using generic instructions/registers for the exec mask.
789   B.buildInstr(TargetOpcode::IMPLICIT_DEF)
790     .addDef(InitSaveExecReg);
791 
792   Register PhiExec = MRI.createVirtualRegister(WaveRC);
793   Register NewExec = MRI.createVirtualRegister(WaveRC);
794 
795   // To insert the loop we need to split the block. Move everything before this
796   // point to a new block, and insert a new empty block before this instruction.
797   MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
798   MachineBasicBlock *BodyBB = MF->CreateMachineBasicBlock();
799   MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
800   MachineBasicBlock *RestoreExecBB = MF->CreateMachineBasicBlock();
801   MachineFunction::iterator MBBI(MBB);
802   ++MBBI;
803   MF->insert(MBBI, LoopBB);
804   MF->insert(MBBI, BodyBB);
805   MF->insert(MBBI, RestoreExecBB);
806   MF->insert(MBBI, RemainderBB);
807 
808   LoopBB->addSuccessor(BodyBB);
809   BodyBB->addSuccessor(RestoreExecBB);
810   BodyBB->addSuccessor(LoopBB);
811 
812   // Move the rest of the block into a new block.
813   RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
814   RemainderBB->splice(RemainderBB->begin(), &MBB, Range.end(), MBB.end());
815 
816   MBB.addSuccessor(LoopBB);
817   RestoreExecBB->addSuccessor(RemainderBB);
818 
819   B.setInsertPt(*LoopBB, LoopBB->end());
820 
821   B.buildInstr(TargetOpcode::PHI)
822       .addDef(PhiExec)
823       .addReg(InitSaveExecReg)
824       .addMBB(&MBB)
825       .addReg(NewExec)
826       .addMBB(BodyBB);
827 
828   const DebugLoc &DL = B.getDL();
829 
830   MachineInstr &FirstInst = *Range.begin();
831 
832   // Move the instruction into the loop body. Note we moved everything after
833   // Range.end() already into a new block, so Range.end() is no longer valid.
834   BodyBB->splice(BodyBB->end(), &MBB, Range.begin(), MBB.end());
835 
836   // Figure out the iterator range after splicing the instructions.
837   MachineBasicBlock::iterator NewBegin = FirstInst.getIterator();
838   auto NewEnd = BodyBB->end();
839 
840   B.setMBB(*LoopBB);
841 
842   LLT S1 = LLT::scalar(1);
843   Register CondReg;
844 
845   assert(std::distance(NewBegin, NewEnd) == OrigRangeSize);
846 
847   for (MachineInstr &MI : make_range(NewBegin, NewEnd)) {
848     for (MachineOperand &Op : MI.uses()) {
849       if (!Op.isReg() || Op.isDef())
850         continue;
851 
852       Register OldReg = Op.getReg();
853       if (!SGPROperandRegs.count(OldReg))
854         continue;
855 
856       // See if we already processed this register in another instruction in the
857       // sequence.
858       auto OldVal = WaterfalledRegMap.find(OldReg);
859       if (OldVal != WaterfalledRegMap.end()) {
860         Op.setReg(OldVal->second);
861         continue;
862       }
863 
864       Register OpReg = Op.getReg();
865       LLT OpTy = MRI.getType(OpReg);
866 
867       const RegisterBank *OpBank = getRegBank(OpReg, MRI, *TRI);
868       if (OpBank != &AMDGPU::VGPRRegBank) {
869         // Insert copy from AGPR to VGPR before the loop.
870         B.setMBB(MBB);
871         OpReg = B.buildCopy(OpTy, OpReg).getReg(0);
872         MRI.setRegBank(OpReg, AMDGPU::VGPRRegBank);
873         B.setMBB(*LoopBB);
874       }
875 
876       Register CurrentLaneReg = buildReadFirstLane(B, MRI, OpReg);
877 
878       // Build the comparison(s).
879       unsigned OpSize = OpTy.getSizeInBits();
880       bool Is64 = OpSize % 64 == 0;
881       unsigned PartSize = Is64 ? 64 : 32;
882       LLT PartTy = LLT::scalar(PartSize);
883       unsigned NumParts = OpSize / PartSize;
884       SmallVector<Register, 8> OpParts;
885       SmallVector<Register, 8> CurrentLaneParts;
886 
887       if (NumParts == 1) {
888         OpParts.push_back(OpReg);
889         CurrentLaneParts.push_back(CurrentLaneReg);
890       } else {
891         auto UnmergeOp = B.buildUnmerge(PartTy, OpReg);
892         auto UnmergeCurrentLane = B.buildUnmerge(PartTy, CurrentLaneReg);
893         for (unsigned i = 0; i < NumParts; ++i) {
894           OpParts.push_back(UnmergeOp.getReg(i));
895           CurrentLaneParts.push_back(UnmergeCurrentLane.getReg(i));
896           MRI.setRegBank(OpParts[i], AMDGPU::VGPRRegBank);
897           MRI.setRegBank(CurrentLaneParts[i], AMDGPU::SGPRRegBank);
898         }
899       }
900 
901       for (unsigned i = 0; i < NumParts; ++i) {
902         auto CmpReg = B.buildICmp(CmpInst::ICMP_EQ, S1, CurrentLaneParts[i],
903                                   OpParts[i]).getReg(0);
904         MRI.setRegBank(CmpReg, AMDGPU::VCCRegBank);
905 
906         if (!CondReg) {
907           CondReg = CmpReg;
908         } else {
909           CondReg = B.buildAnd(S1, CondReg, CmpReg).getReg(0);
910           MRI.setRegBank(CondReg, AMDGPU::VCCRegBank);
911         }
912       }
913 
914       Op.setReg(CurrentLaneReg);
915 
916       // Make sure we don't re-process this register again.
917       WaterfalledRegMap.insert(std::make_pair(OldReg, Op.getReg()));
918     }
919   }
920 
921   // The ballot becomes a no-op during instruction selection.
922   CondReg = B.buildIntrinsic(Intrinsic::amdgcn_ballot,
923                              {LLT::scalar(Subtarget.isWave32() ? 32 : 64)},
924                              false)
925                 .addReg(CondReg)
926                 .getReg(0);
927   MRI.setRegClass(CondReg, WaveRC);
928 
929   // Update EXEC, save the original EXEC value to VCC.
930   B.buildInstr(AndSaveExecOpc)
931     .addDef(NewExec)
932     .addReg(CondReg, RegState::Kill);
933 
934   MRI.setSimpleHint(NewExec, CondReg);
935 
936   B.setInsertPt(*BodyBB, BodyBB->end());
937 
938   // Update EXEC, switch all done bits to 0 and all todo bits to 1.
939   B.buildInstr(XorTermOpc)
940     .addDef(ExecReg)
941     .addReg(ExecReg)
942     .addReg(NewExec);
943 
944   // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
945   // s_cbranch_scc0?
946 
947   // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
948   B.buildInstr(AMDGPU::SI_WATERFALL_LOOP).addMBB(LoopBB);
949 
950   // Save the EXEC mask before the loop.
951   BuildMI(MBB, MBB.end(), DL, TII->get(MovExecOpc), SaveExecReg)
952     .addReg(ExecReg);
953 
954   // Restore the EXEC mask after the loop.
955   B.setMBB(*RestoreExecBB);
956   B.buildInstr(MovExecTermOpc)
957     .addDef(ExecReg)
958     .addReg(SaveExecReg);
959 
960   // Set the insert point after the original instruction, so any new
961   // instructions will be in the remainder.
962   B.setInsertPt(*RemainderBB, RemainderBB->begin());
963 
964   return true;
965 }
966 
967 // Return any unique registers used by \p MI at \p OpIndices that need to be
968 // handled in a waterfall loop. Returns these registers in \p
969 // SGPROperandRegs. Returns true if there are any operands to handle and a
970 // waterfall loop is necessary.
971 bool AMDGPURegisterBankInfo::collectWaterfallOperands(
972   SmallSet<Register, 4> &SGPROperandRegs, MachineInstr &MI,
973   MachineRegisterInfo &MRI, ArrayRef<unsigned> OpIndices) const {
974   for (unsigned Op : OpIndices) {
975     assert(MI.getOperand(Op).isUse());
976     Register Reg = MI.getOperand(Op).getReg();
977     const RegisterBank *OpBank = getRegBank(Reg, MRI, *TRI);
978     if (OpBank->getID() != AMDGPU::SGPRRegBankID)
979       SGPROperandRegs.insert(Reg);
980   }
981 
982   // No operands need to be replaced, so no need to loop.
983   return !SGPROperandRegs.empty();
984 }
985 
986 bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
987   MachineIRBuilder &B, MachineInstr &MI, MachineRegisterInfo &MRI,
988   ArrayRef<unsigned> OpIndices) const {
989   // Use a set to avoid extra readfirstlanes in the case where multiple operands
990   // are the same register.
991   SmallSet<Register, 4> SGPROperandRegs;
992 
993   if (!collectWaterfallOperands(SGPROperandRegs, MI, MRI, OpIndices))
994     return false;
995 
996   MachineBasicBlock::iterator I = MI.getIterator();
997   return executeInWaterfallLoop(B, make_range(I, std::next(I)),
998                                 SGPROperandRegs, MRI);
999 }
1000 
1001 bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
1002   MachineInstr &MI, MachineRegisterInfo &MRI,
1003   ArrayRef<unsigned> OpIndices) const {
1004   MachineIRBuilder B(MI);
1005   return executeInWaterfallLoop(B, MI, MRI, OpIndices);
1006 }
1007 
1008 // Legalize an operand that must be an SGPR by inserting a readfirstlane.
1009 void AMDGPURegisterBankInfo::constrainOpWithReadfirstlane(
1010     MachineInstr &MI, MachineRegisterInfo &MRI, unsigned OpIdx) const {
1011   Register Reg = MI.getOperand(OpIdx).getReg();
1012   const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
1013   if (Bank == &AMDGPU::SGPRRegBank)
1014     return;
1015 
1016   MachineIRBuilder B(MI);
1017 
1018   Reg = buildReadFirstLane(B, MRI, Reg);
1019   MI.getOperand(OpIdx).setReg(Reg);
1020 }
1021 
1022 /// Split \p Ty into 2 pieces. The first will have \p FirstSize bits, and the
1023 /// rest will be in the remainder.
1024 static std::pair<LLT, LLT> splitUnequalType(LLT Ty, unsigned FirstSize) {
1025   unsigned TotalSize = Ty.getSizeInBits();
1026   if (!Ty.isVector())
1027     return {LLT::scalar(FirstSize), LLT::scalar(TotalSize - FirstSize)};
1028 
1029   LLT EltTy = Ty.getElementType();
1030   unsigned EltSize = EltTy.getSizeInBits();
1031   assert(FirstSize % EltSize == 0);
1032 
1033   unsigned FirstPartNumElts = FirstSize / EltSize;
1034   unsigned RemainderElts = (TotalSize - FirstSize) / EltSize;
1035 
1036   return {LLT::scalarOrVector(ElementCount::getFixed(FirstPartNumElts), EltTy),
1037           LLT::scalarOrVector(ElementCount::getFixed(RemainderElts), EltTy)};
1038 }
1039 
1040 static LLT widen96To128(LLT Ty) {
1041   if (!Ty.isVector())
1042     return LLT::scalar(128);
1043 
1044   LLT EltTy = Ty.getElementType();
1045   assert(128 % EltTy.getSizeInBits() == 0);
1046   return LLT::fixed_vector(128 / EltTy.getSizeInBits(), EltTy);
1047 }
1048 
1049 bool AMDGPURegisterBankInfo::applyMappingLoad(MachineInstr &MI,
1050                         const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
1051                                               MachineRegisterInfo &MRI) const {
1052   Register DstReg = MI.getOperand(0).getReg();
1053   const LLT LoadTy = MRI.getType(DstReg);
1054   unsigned LoadSize = LoadTy.getSizeInBits();
1055   const unsigned MaxNonSmrdLoadSize = 128;
1056 
1057   const RegisterBank *DstBank =
1058       OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
1059   if (DstBank == &AMDGPU::SGPRRegBank) {
1060     // There are some special cases that we need to look at for 32 bit and 96
1061     // bit SGPR loads otherwise we have nothing to do.
1062     if (LoadSize != 32 && LoadSize != 96)
1063       return false;
1064 
1065     MachineMemOperand *MMO = *MI.memoperands_begin();
1066     const unsigned MemSize = 8 * MMO->getSize();
1067     // Scalar loads of size 8 or 16 bit with proper alignment may be widened to
1068     // 32 bit. Check to see if we need to widen the memory access, 8 or 16 bit
1069     // scalar loads should have a load size of 32 but memory access size of less
1070     // than 32.
1071     if (LoadSize == 32 &&
1072         (MemSize == 32 || LoadTy.isVector() || !isScalarLoadLegal(MI)))
1073       return false;
1074 
1075     Register PtrReg = MI.getOperand(1).getReg();
1076 
1077     ApplyRegBankMapping O(*this, MRI, &AMDGPU::SGPRRegBank);
1078     MachineIRBuilder B(MI, O);
1079 
1080     if (LoadSize == 32) {
1081       // This is an extending load from a sub-dword size. Widen the memory
1082       // access size to 4 bytes and clear the extra high bits appropriately
1083       const LLT S32 = LLT::scalar(32);
1084       if (MI.getOpcode() == AMDGPU::G_SEXTLOAD) {
1085         // Must extend the sign bit into higher bits for a G_SEXTLOAD
1086         auto WideLoad = B.buildLoadFromOffset(S32, PtrReg, *MMO, 0);
1087         B.buildSExtInReg(MI.getOperand(0), WideLoad, MemSize);
1088       } else if (MI.getOpcode() == AMDGPU::G_ZEXTLOAD) {
1089         // Must extend zero into higher bits with an AND for a G_ZEXTLOAD
1090         auto WideLoad = B.buildLoadFromOffset(S32, PtrReg, *MMO, 0);
1091         B.buildZExtInReg(MI.getOperand(0), WideLoad, MemSize);
1092       } else
1093         // We do not need to touch the higher bits for regular loads.
1094         B.buildLoadFromOffset(MI.getOperand(0), PtrReg, *MMO, 0);
1095     } else {
1096       // 96-bit loads are only available for vector loads. We need to split this
1097       // into a 64-bit part, and 32 (unless we can widen to a 128-bit load).
1098       if (MMO->getAlign() < Align(16)) {
1099         MachineFunction *MF = MI.getParent()->getParent();
1100         ApplyRegBankMapping ApplyBank(*this, MRI, DstBank);
1101         MachineIRBuilder B(MI, ApplyBank);
1102         LegalizerHelper Helper(*MF, ApplyBank, B);
1103         LLT Part64, Part32;
1104         std::tie(Part64, Part32) = splitUnequalType(LoadTy, 64);
1105         if (Helper.reduceLoadStoreWidth(cast<GAnyLoad>(MI), 0, Part64) !=
1106             LegalizerHelper::Legalized)
1107           return false;
1108         return true;
1109       } else {
1110         LLT WiderTy = widen96To128(LoadTy);
1111         auto WideLoad = B.buildLoadFromOffset(WiderTy, PtrReg, *MMO, 0);
1112         if (WiderTy.isScalar())
1113           B.buildTrunc(MI.getOperand(0), WideLoad);
1114         else {
1115           B.buildDeleteTrailingVectorElements(MI.getOperand(0).getReg(),
1116                                               WideLoad);
1117         }
1118       }
1119     }
1120 
1121     MI.eraseFromParent();
1122     return true;
1123   }
1124 
1125   // 128-bit loads are supported for all instruction types.
1126   if (LoadSize <= MaxNonSmrdLoadSize)
1127     return false;
1128 
1129   SmallVector<Register, 16> DefRegs(OpdMapper.getVRegs(0));
1130   SmallVector<Register, 1> SrcRegs(OpdMapper.getVRegs(1));
1131 
1132   if (SrcRegs.empty())
1133     SrcRegs.push_back(MI.getOperand(1).getReg());
1134 
1135   assert(LoadSize % MaxNonSmrdLoadSize == 0);
1136 
1137   // RegBankSelect only emits scalar types, so we need to reset the pointer
1138   // operand to a pointer type.
1139   Register BasePtrReg = SrcRegs[0];
1140   LLT PtrTy = MRI.getType(MI.getOperand(1).getReg());
1141   MRI.setType(BasePtrReg, PtrTy);
1142 
1143   unsigned NumSplitParts = LoadTy.getSizeInBits() / MaxNonSmrdLoadSize;
1144   const LLT LoadSplitTy = LoadTy.divide(NumSplitParts);
1145   ApplyRegBankMapping Observer(*this, MRI, &AMDGPU::VGPRRegBank);
1146   MachineIRBuilder B(MI, Observer);
1147   LegalizerHelper Helper(B.getMF(), Observer, B);
1148 
1149   if (LoadTy.isVector()) {
1150     if (Helper.fewerElementsVector(MI, 0, LoadSplitTy) != LegalizerHelper::Legalized)
1151       return false;
1152   } else {
1153     if (Helper.narrowScalar(MI, 0, LoadSplitTy) != LegalizerHelper::Legalized)
1154       return false;
1155   }
1156 
1157   MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
1158   return true;
1159 }
1160 
1161 bool AMDGPURegisterBankInfo::applyMappingDynStackAlloc(
1162   MachineInstr &MI,
1163   const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
1164   MachineRegisterInfo &MRI) const {
1165   const MachineFunction &MF = *MI.getMF();
1166   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1167   const auto &TFI = *ST.getFrameLowering();
1168 
1169   // Guard in case the stack growth direction ever changes with scratch
1170   // instructions.
1171   if (TFI.getStackGrowthDirection() == TargetFrameLowering::StackGrowsDown)
1172     return false;
1173 
1174   Register Dst = MI.getOperand(0).getReg();
1175   Register AllocSize = MI.getOperand(1).getReg();
1176   Align Alignment = assumeAligned(MI.getOperand(2).getImm());
1177 
1178   const RegisterBank *SizeBank = getRegBank(AllocSize, MRI, *TRI);
1179 
1180   // TODO: Need to emit a wave reduction to get the maximum size.
1181   if (SizeBank != &AMDGPU::SGPRRegBank)
1182     return false;
1183 
1184   LLT PtrTy = MRI.getType(Dst);
1185   LLT IntPtrTy = LLT::scalar(PtrTy.getSizeInBits());
1186 
1187   const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1188   Register SPReg = Info->getStackPtrOffsetReg();
1189   ApplyRegBankMapping ApplyBank(*this, MRI, &AMDGPU::SGPRRegBank);
1190   MachineIRBuilder B(MI, ApplyBank);
1191 
1192   auto WaveSize = B.buildConstant(LLT::scalar(32), ST.getWavefrontSizeLog2());
1193   auto ScaledSize = B.buildShl(IntPtrTy, AllocSize, WaveSize);
1194 
1195   auto SPCopy = B.buildCopy(PtrTy, SPReg);
1196   if (Alignment > TFI.getStackAlign()) {
1197     auto PtrAdd = B.buildPtrAdd(PtrTy, SPCopy, ScaledSize);
1198     B.buildMaskLowPtrBits(Dst, PtrAdd,
1199                           Log2(Alignment) + ST.getWavefrontSizeLog2());
1200   } else {
1201     B.buildPtrAdd(Dst, SPCopy, ScaledSize);
1202   }
1203 
1204   MI.eraseFromParent();
1205   return true;
1206 }
1207 
1208 bool AMDGPURegisterBankInfo::applyMappingImage(
1209     MachineInstr &MI, const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
1210     MachineRegisterInfo &MRI, int RsrcIdx) const {
1211   const int NumDefs = MI.getNumExplicitDefs();
1212 
1213   // The reported argument index is relative to the IR intrinsic call arguments,
1214   // so we need to shift by the number of defs and the intrinsic ID.
1215   RsrcIdx += NumDefs + 1;
1216 
1217   // Insert copies to VGPR arguments.
1218   applyDefaultMapping(OpdMapper);
1219 
1220   // Fixup any SGPR arguments.
1221   SmallVector<unsigned, 4> SGPRIndexes;
1222   for (int I = NumDefs, NumOps = MI.getNumOperands(); I != NumOps; ++I) {
1223     if (!MI.getOperand(I).isReg())
1224       continue;
1225 
1226     // If this intrinsic has a sampler, it immediately follows rsrc.
1227     if (I == RsrcIdx || I == RsrcIdx + 1)
1228       SGPRIndexes.push_back(I);
1229   }
1230 
1231   executeInWaterfallLoop(MI, MRI, SGPRIndexes);
1232   return true;
1233 }
1234 
1235 static Register getSrcRegIgnoringCopies(const MachineRegisterInfo &MRI,
1236                                         Register Reg) {
1237   MachineInstr *Def = getDefIgnoringCopies(Reg, MRI);
1238   if (!Def)
1239     return Reg;
1240 
1241   // TODO: Guard against this being an implicit def
1242   return Def->getOperand(0).getReg();
1243 }
1244 
1245 // Analyze a combined offset from an llvm.amdgcn.s.buffer intrinsic and store
1246 // the three offsets (voffset, soffset and instoffset)
1247 static unsigned setBufferOffsets(MachineIRBuilder &B,
1248                                  const AMDGPURegisterBankInfo &RBI,
1249                                  Register CombinedOffset, Register &VOffsetReg,
1250                                  Register &SOffsetReg, int64_t &InstOffsetVal,
1251                                  Align Alignment) {
1252   const LLT S32 = LLT::scalar(32);
1253   MachineRegisterInfo *MRI = B.getMRI();
1254 
1255   if (Optional<int64_t> Imm = getIConstantVRegSExtVal(CombinedOffset, *MRI)) {
1256     uint32_t SOffset, ImmOffset;
1257     if (AMDGPU::splitMUBUFOffset(*Imm, SOffset, ImmOffset, &RBI.Subtarget,
1258                                  Alignment)) {
1259       VOffsetReg = B.buildConstant(S32, 0).getReg(0);
1260       SOffsetReg = B.buildConstant(S32, SOffset).getReg(0);
1261       InstOffsetVal = ImmOffset;
1262 
1263       B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank);
1264       B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank);
1265       return SOffset + ImmOffset;
1266     }
1267   }
1268 
1269   Register Base;
1270   unsigned Offset;
1271 
1272   std::tie(Base, Offset) =
1273       AMDGPU::getBaseWithConstantOffset(*MRI, CombinedOffset);
1274 
1275   uint32_t SOffset, ImmOffset;
1276   if ((int)Offset > 0 && AMDGPU::splitMUBUFOffset(Offset, SOffset, ImmOffset,
1277                                                   &RBI.Subtarget, Alignment)) {
1278     if (RBI.getRegBank(Base, *MRI, *RBI.TRI) == &AMDGPU::VGPRRegBank) {
1279       VOffsetReg = Base;
1280       SOffsetReg = B.buildConstant(S32, SOffset).getReg(0);
1281       B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank);
1282       InstOffsetVal = ImmOffset;
1283       return 0; // XXX - Why is this 0?
1284     }
1285 
1286     // If we have SGPR base, we can use it for soffset.
1287     if (SOffset == 0) {
1288       VOffsetReg = B.buildConstant(S32, 0).getReg(0);
1289       B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank);
1290       SOffsetReg = Base;
1291       InstOffsetVal = ImmOffset;
1292       return 0; // XXX - Why is this 0?
1293     }
1294   }
1295 
1296   // Handle the variable sgpr + vgpr case.
1297   MachineInstr *Add = getOpcodeDef(AMDGPU::G_ADD, CombinedOffset, *MRI);
1298   if (Add && (int)Offset >= 0) {
1299     Register Src0 = getSrcRegIgnoringCopies(*MRI, Add->getOperand(1).getReg());
1300     Register Src1 = getSrcRegIgnoringCopies(*MRI, Add->getOperand(2).getReg());
1301 
1302     const RegisterBank *Src0Bank = RBI.getRegBank(Src0, *MRI, *RBI.TRI);
1303     const RegisterBank *Src1Bank = RBI.getRegBank(Src1, *MRI, *RBI.TRI);
1304 
1305     if (Src0Bank == &AMDGPU::VGPRRegBank && Src1Bank == &AMDGPU::SGPRRegBank) {
1306       VOffsetReg = Src0;
1307       SOffsetReg = Src1;
1308       return 0;
1309     }
1310 
1311     if (Src0Bank == &AMDGPU::SGPRRegBank && Src1Bank == &AMDGPU::VGPRRegBank) {
1312       VOffsetReg = Src1;
1313       SOffsetReg = Src0;
1314       return 0;
1315     }
1316   }
1317 
1318   // Ensure we have a VGPR for the combined offset. This could be an issue if we
1319   // have an SGPR offset and a VGPR resource.
1320   if (RBI.getRegBank(CombinedOffset, *MRI, *RBI.TRI) == &AMDGPU::VGPRRegBank) {
1321     VOffsetReg = CombinedOffset;
1322   } else {
1323     VOffsetReg = B.buildCopy(S32, CombinedOffset).getReg(0);
1324     B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank);
1325   }
1326 
1327   SOffsetReg = B.buildConstant(S32, 0).getReg(0);
1328   B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank);
1329   return 0;
1330 }
1331 
1332 bool AMDGPURegisterBankInfo::applyMappingSBufferLoad(
1333   const OperandsMapper &OpdMapper) const {
1334   MachineInstr &MI = OpdMapper.getMI();
1335   MachineRegisterInfo &MRI = OpdMapper.getMRI();
1336 
1337   const LLT S32 = LLT::scalar(32);
1338   Register Dst = MI.getOperand(0).getReg();
1339   LLT Ty = MRI.getType(Dst);
1340 
1341   const RegisterBank *RSrcBank =
1342     OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
1343   const RegisterBank *OffsetBank =
1344     OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
1345   if (RSrcBank == &AMDGPU::SGPRRegBank &&
1346       OffsetBank == &AMDGPU::SGPRRegBank)
1347     return true; // Legal mapping
1348 
1349   // FIXME: 96-bit case was widened during legalize. We need to narrow it back
1350   // here but don't have an MMO.
1351 
1352   unsigned LoadSize = Ty.getSizeInBits();
1353   int NumLoads = 1;
1354   if (LoadSize == 256 || LoadSize == 512) {
1355     NumLoads = LoadSize / 128;
1356     Ty = Ty.divide(NumLoads);
1357   }
1358 
1359   // Use the alignment to ensure that the required offsets will fit into the
1360   // immediate offsets.
1361   const Align Alignment = NumLoads > 1 ? Align(16 * NumLoads) : Align(1);
1362 
1363   MachineIRBuilder B(MI);
1364   MachineFunction &MF = B.getMF();
1365 
1366   Register SOffset;
1367   Register VOffset;
1368   int64_t ImmOffset = 0;
1369 
1370   unsigned MMOOffset = setBufferOffsets(B, *this, MI.getOperand(2).getReg(),
1371                                         VOffset, SOffset, ImmOffset, Alignment);
1372 
1373   // TODO: 96-bit loads were widened to 128-bit results. Shrink the result if we
1374   // can, but we need to track an MMO for that.
1375   const unsigned MemSize = (Ty.getSizeInBits() + 7) / 8;
1376   const Align MemAlign(4); // FIXME: ABI type alignment?
1377   MachineMemOperand *BaseMMO = MF.getMachineMemOperand(
1378     MachinePointerInfo(),
1379     MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1380     MachineMemOperand::MOInvariant,
1381     MemSize, MemAlign);
1382   if (MMOOffset != 0)
1383     BaseMMO = MF.getMachineMemOperand(BaseMMO, MMOOffset, MemSize);
1384 
1385   // If only the offset is divergent, emit a MUBUF buffer load instead. We can
1386   // assume that the buffer is unswizzled.
1387 
1388   Register RSrc = MI.getOperand(1).getReg();
1389   Register VIndex = B.buildConstant(S32, 0).getReg(0);
1390   B.getMRI()->setRegBank(VIndex, AMDGPU::VGPRRegBank);
1391 
1392   SmallVector<Register, 4> LoadParts(NumLoads);
1393 
1394   MachineBasicBlock::iterator MII = MI.getIterator();
1395   MachineInstrSpan Span(MII, &B.getMBB());
1396 
1397   for (int i = 0; i < NumLoads; ++i) {
1398     if (NumLoads == 1) {
1399       LoadParts[i] = Dst;
1400     } else {
1401       LoadParts[i] = MRI.createGenericVirtualRegister(Ty);
1402       MRI.setRegBank(LoadParts[i], AMDGPU::VGPRRegBank);
1403     }
1404 
1405     MachineMemOperand *MMO = BaseMMO;
1406     if (i != 0)
1407       BaseMMO = MF.getMachineMemOperand(BaseMMO, MMOOffset + 16 * i, MemSize);
1408 
1409     B.buildInstr(AMDGPU::G_AMDGPU_BUFFER_LOAD)
1410       .addDef(LoadParts[i])       // vdata
1411       .addUse(RSrc)               // rsrc
1412       .addUse(VIndex)             // vindex
1413       .addUse(VOffset)            // voffset
1414       .addUse(SOffset)            // soffset
1415       .addImm(ImmOffset + 16 * i) // offset(imm)
1416       .addImm(0)                  // cachepolicy, swizzled buffer(imm)
1417       .addImm(0)                  // idxen(imm)
1418       .addMemOperand(MMO);
1419   }
1420 
1421   // TODO: If only the resource is a VGPR, it may be better to execute the
1422   // scalar load in the waterfall loop if the resource is expected to frequently
1423   // be dynamically uniform.
1424   if (RSrcBank != &AMDGPU::SGPRRegBank) {
1425     // Remove the original instruction to avoid potentially confusing the
1426     // waterfall loop logic.
1427     B.setInstr(*Span.begin());
1428     MI.eraseFromParent();
1429 
1430     SmallSet<Register, 4> OpsToWaterfall;
1431 
1432     OpsToWaterfall.insert(RSrc);
1433     executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()),
1434                            OpsToWaterfall, MRI);
1435   }
1436 
1437   if (NumLoads != 1) {
1438     if (Ty.isVector())
1439       B.buildConcatVectors(Dst, LoadParts);
1440     else
1441       B.buildMerge(Dst, LoadParts);
1442   }
1443 
1444   // We removed the instruction earlier with a waterfall loop.
1445   if (RSrcBank == &AMDGPU::SGPRRegBank)
1446     MI.eraseFromParent();
1447 
1448   return true;
1449 }
1450 
1451 bool AMDGPURegisterBankInfo::applyMappingBFE(const OperandsMapper &OpdMapper,
1452                                              bool Signed) const {
1453   MachineInstr &MI = OpdMapper.getMI();
1454   MachineRegisterInfo &MRI = OpdMapper.getMRI();
1455 
1456   // Insert basic copies
1457   applyDefaultMapping(OpdMapper);
1458 
1459   Register DstReg = MI.getOperand(0).getReg();
1460   LLT Ty = MRI.getType(DstReg);
1461 
1462   const LLT S32 = LLT::scalar(32);
1463 
1464   unsigned FirstOpnd = MI.getOpcode() == AMDGPU::G_INTRINSIC ? 2 : 1;
1465   Register SrcReg = MI.getOperand(FirstOpnd).getReg();
1466   Register OffsetReg = MI.getOperand(FirstOpnd + 1).getReg();
1467   Register WidthReg = MI.getOperand(FirstOpnd + 2).getReg();
1468 
1469   const RegisterBank *DstBank =
1470     OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
1471   if (DstBank == &AMDGPU::VGPRRegBank) {
1472     if (Ty == S32)
1473       return true;
1474 
1475     // There is no 64-bit vgpr bitfield extract instructions so the operation
1476     // is expanded to a sequence of instructions that implement the operation.
1477     ApplyRegBankMapping ApplyBank(*this, MRI, &AMDGPU::VGPRRegBank);
1478     MachineIRBuilder B(MI, ApplyBank);
1479 
1480     const LLT S64 = LLT::scalar(64);
1481     // Shift the source operand so that extracted bits start at bit 0.
1482     auto ShiftOffset = Signed ? B.buildAShr(S64, SrcReg, OffsetReg)
1483                               : B.buildLShr(S64, SrcReg, OffsetReg);
1484     auto UnmergeSOffset = B.buildUnmerge({S32, S32}, ShiftOffset);
1485 
1486     // A 64-bit bitfield extract uses the 32-bit bitfield extract instructions
1487     // if the width is a constant.
1488     if (auto ConstWidth = getIConstantVRegValWithLookThrough(WidthReg, MRI)) {
1489       // Use the 32-bit bitfield extract instruction if the width is a constant.
1490       // Depending on the width size, use either the low or high 32-bits.
1491       auto Zero = B.buildConstant(S32, 0);
1492       auto WidthImm = ConstWidth->Value.getZExtValue();
1493       if (WidthImm <= 32) {
1494         // Use bitfield extract on the lower 32-bit source, and then sign-extend
1495         // or clear the upper 32-bits.
1496         auto Extract =
1497             Signed ? B.buildSbfx(S32, UnmergeSOffset.getReg(0), Zero, WidthReg)
1498                    : B.buildUbfx(S32, UnmergeSOffset.getReg(0), Zero, WidthReg);
1499         auto Extend =
1500             Signed ? B.buildAShr(S32, Extract, B.buildConstant(S32, 31)) : Zero;
1501         B.buildMerge(DstReg, {Extract, Extend});
1502       } else {
1503         // Use bitfield extract on upper 32-bit source, and combine with lower
1504         // 32-bit source.
1505         auto UpperWidth = B.buildConstant(S32, WidthImm - 32);
1506         auto Extract =
1507             Signed
1508                 ? B.buildSbfx(S32, UnmergeSOffset.getReg(1), Zero, UpperWidth)
1509                 : B.buildUbfx(S32, UnmergeSOffset.getReg(1), Zero, UpperWidth);
1510         B.buildMerge(DstReg, {UnmergeSOffset.getReg(0), Extract});
1511       }
1512       MI.eraseFromParent();
1513       return true;
1514     }
1515 
1516     // Expand to Src >> Offset << (64 - Width) >> (64 - Width) using 64-bit
1517     // operations.
1518     auto ExtShift = B.buildSub(S32, B.buildConstant(S32, 64), WidthReg);
1519     auto SignBit = B.buildShl(S64, ShiftOffset, ExtShift);
1520     if (Signed)
1521       B.buildAShr(S64, SignBit, ExtShift);
1522     else
1523       B.buildLShr(S64, SignBit, ExtShift);
1524     MI.eraseFromParent();
1525     return true;
1526   }
1527 
1528   // The scalar form packs the offset and width in a single operand.
1529 
1530   ApplyRegBankMapping ApplyBank(*this, MRI, &AMDGPU::SGPRRegBank);
1531   MachineIRBuilder B(MI, ApplyBank);
1532 
1533   // Ensure the high bits are clear to insert the offset.
1534   auto OffsetMask = B.buildConstant(S32, maskTrailingOnes<unsigned>(6));
1535   auto ClampOffset = B.buildAnd(S32, OffsetReg, OffsetMask);
1536 
1537   // Zeros out the low bits, so don't bother clamping the input value.
1538   auto ShiftWidth = B.buildShl(S32, WidthReg, B.buildConstant(S32, 16));
1539 
1540   // Transformation function, pack the offset and width of a BFE into
1541   // the format expected by the S_BFE_I32 / S_BFE_U32. In the second
1542   // source, bits [5:0] contain the offset and bits [22:16] the width.
1543   auto MergedInputs = B.buildOr(S32, ClampOffset, ShiftWidth);
1544 
1545   // TODO: It might be worth using a pseudo here to avoid scc clobber and
1546   // register class constraints.
1547   unsigned Opc = Ty == S32 ? (Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32) :
1548                              (Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64);
1549 
1550   auto MIB = B.buildInstr(Opc, {DstReg}, {SrcReg, MergedInputs});
1551   if (!constrainSelectedInstRegOperands(*MIB, *TII, *TRI, *this))
1552     llvm_unreachable("failed to constrain BFE");
1553 
1554   MI.eraseFromParent();
1555   return true;
1556 }
1557 
1558 bool AMDGPURegisterBankInfo::applyMappingMAD_64_32(
1559     const OperandsMapper &OpdMapper) const {
1560   MachineInstr &MI = OpdMapper.getMI();
1561   MachineRegisterInfo &MRI = OpdMapper.getMRI();
1562 
1563   // Insert basic copies.
1564   applyDefaultMapping(OpdMapper);
1565 
1566   Register Dst0 = MI.getOperand(0).getReg();
1567   Register Dst1 = MI.getOperand(1).getReg();
1568   Register Src0 = MI.getOperand(2).getReg();
1569   Register Src1 = MI.getOperand(3).getReg();
1570   Register Src2 = MI.getOperand(4).getReg();
1571 
1572   if (MRI.getRegBankOrNull(Src0) == &AMDGPU::VGPRRegBank)
1573     return true;
1574 
1575   bool IsUnsigned = MI.getOpcode() == AMDGPU::G_AMDGPU_MAD_U64_U32;
1576   LLT S1 = LLT::scalar(1);
1577   LLT S32 = LLT::scalar(32);
1578 
1579   bool DstOnValu = MRI.getRegBankOrNull(Src2) == &AMDGPU::VGPRRegBank;
1580   bool Accumulate = true;
1581 
1582   if (!DstOnValu) {
1583     if (mi_match(Src2, MRI, m_ZeroInt()))
1584       Accumulate = false;
1585   }
1586 
1587   // Keep the multiplication on the SALU.
1588   MachineIRBuilder B(MI);
1589 
1590   Register DstHi;
1591   Register DstLo = B.buildMul(S32, Src0, Src1).getReg(0);
1592   bool MulHiInVgpr = false;
1593 
1594   MRI.setRegBank(DstLo, AMDGPU::SGPRRegBank);
1595 
1596   if (Subtarget.hasSMulHi()) {
1597     DstHi = IsUnsigned ? B.buildUMulH(S32, Src0, Src1).getReg(0)
1598                        : B.buildSMulH(S32, Src0, Src1).getReg(0);
1599     MRI.setRegBank(DstHi, AMDGPU::SGPRRegBank);
1600   } else {
1601     Register VSrc0 = B.buildCopy(S32, Src0).getReg(0);
1602     Register VSrc1 = B.buildCopy(S32, Src1).getReg(0);
1603 
1604     MRI.setRegBank(VSrc0, AMDGPU::VGPRRegBank);
1605     MRI.setRegBank(VSrc1, AMDGPU::VGPRRegBank);
1606 
1607     DstHi = IsUnsigned ? B.buildUMulH(S32, VSrc0, VSrc1).getReg(0)
1608                        : B.buildSMulH(S32, VSrc0, VSrc1).getReg(0);
1609     MRI.setRegBank(DstHi, AMDGPU::VGPRRegBank);
1610 
1611     if (!DstOnValu) {
1612       DstHi = buildReadFirstLane(B, MRI, DstHi);
1613     } else {
1614       MulHiInVgpr = true;
1615     }
1616   }
1617 
1618   // Accumulate and produce the "carry-out" bit.
1619   //
1620   // The "carry-out" is defined as bit 64 of the result when computed as a
1621   // big integer. For unsigned multiply-add, this matches the usual definition
1622   // of carry-out. For signed multiply-add, bit 64 is the sign bit of the
1623   // result, which is determined as:
1624   //   sign(Src0 * Src1) + sign(Src2) + carry-out from unsigned 64-bit add
1625   LLT CarryType = DstOnValu ? S1 : S32;
1626   const RegisterBank &CarryBank =
1627       DstOnValu ? AMDGPU::VCCRegBank : AMDGPU::SGPRRegBank;
1628   const RegisterBank &DstBank =
1629       DstOnValu ? AMDGPU::VGPRRegBank : AMDGPU::SGPRRegBank;
1630   Register Carry;
1631   Register Zero;
1632 
1633   if (!IsUnsigned) {
1634     Zero = B.buildConstant(S32, 0).getReg(0);
1635     MRI.setRegBank(Zero,
1636                    MulHiInVgpr ? AMDGPU::VGPRRegBank : AMDGPU::SGPRRegBank);
1637 
1638     Carry = B.buildICmp(CmpInst::ICMP_SLT, MulHiInVgpr ? S1 : S32, DstHi, Zero)
1639                 .getReg(0);
1640     MRI.setRegBank(Carry, MulHiInVgpr ? AMDGPU::VCCRegBank
1641                                       : AMDGPU::SGPRRegBank);
1642 
1643     if (DstOnValu && !MulHiInVgpr) {
1644       Carry = B.buildTrunc(S1, Carry).getReg(0);
1645       MRI.setRegBank(Carry, AMDGPU::VCCRegBank);
1646     }
1647   }
1648 
1649   if (Accumulate) {
1650     if (DstOnValu) {
1651       DstLo = B.buildCopy(S32, DstLo).getReg(0);
1652       DstHi = B.buildCopy(S32, DstHi).getReg(0);
1653       MRI.setRegBank(DstLo, AMDGPU::VGPRRegBank);
1654       MRI.setRegBank(DstHi, AMDGPU::VGPRRegBank);
1655     }
1656 
1657     auto Unmerge = B.buildUnmerge(S32, Src2);
1658     Register Src2Lo = Unmerge.getReg(0);
1659     Register Src2Hi = Unmerge.getReg(1);
1660     MRI.setRegBank(Src2Lo, DstBank);
1661     MRI.setRegBank(Src2Hi, DstBank);
1662 
1663     if (!IsUnsigned) {
1664       auto Src2Sign = B.buildICmp(CmpInst::ICMP_SLT, CarryType, Src2Hi, Zero);
1665       MRI.setRegBank(Src2Sign.getReg(0), CarryBank);
1666 
1667       Carry = B.buildXor(CarryType, Carry, Src2Sign).getReg(0);
1668       MRI.setRegBank(Carry, CarryBank);
1669     }
1670 
1671     auto AddLo = B.buildUAddo(S32, CarryType, DstLo, Src2Lo);
1672     DstLo = AddLo.getReg(0);
1673     Register CarryLo = AddLo.getReg(1);
1674     MRI.setRegBank(DstLo, DstBank);
1675     MRI.setRegBank(CarryLo, CarryBank);
1676 
1677     auto AddHi = B.buildUAdde(S32, CarryType, DstHi, Src2Hi, CarryLo);
1678     DstHi = AddHi.getReg(0);
1679     MRI.setRegBank(DstHi, DstBank);
1680 
1681     Register CarryHi = AddHi.getReg(1);
1682     MRI.setRegBank(CarryHi, CarryBank);
1683 
1684     if (IsUnsigned) {
1685       Carry = CarryHi;
1686     } else {
1687       Carry = B.buildXor(CarryType, Carry, CarryHi).getReg(0);
1688       MRI.setRegBank(Carry, CarryBank);
1689     }
1690   } else {
1691     if (IsUnsigned) {
1692       Carry = B.buildConstant(CarryType, 0).getReg(0);
1693       MRI.setRegBank(Carry, CarryBank);
1694     }
1695   }
1696 
1697   B.buildMerge(Dst0, {DstLo, DstHi});
1698 
1699   if (DstOnValu) {
1700     B.buildCopy(Dst1, Carry);
1701   } else {
1702     B.buildTrunc(Dst1, Carry);
1703   }
1704 
1705   MI.eraseFromParent();
1706   return true;
1707 }
1708 
1709 // Return a suitable opcode for extending the operands of Opc when widening.
1710 static unsigned getExtendOp(unsigned Opc) {
1711   switch (Opc) {
1712   case TargetOpcode::G_ASHR:
1713   case TargetOpcode::G_SMIN:
1714   case TargetOpcode::G_SMAX:
1715     return TargetOpcode::G_SEXT;
1716   case TargetOpcode::G_LSHR:
1717   case TargetOpcode::G_UMIN:
1718   case TargetOpcode::G_UMAX:
1719     return TargetOpcode::G_ZEXT;
1720   default:
1721     return TargetOpcode::G_ANYEXT;
1722   }
1723 }
1724 
1725 // Emit a legalized extension from <2 x s16> to 2 32-bit components, avoiding
1726 // any illegal vector extend or unmerge operations.
1727 static std::pair<Register, Register>
1728 unpackV2S16ToS32(MachineIRBuilder &B, Register Src, unsigned ExtOpcode) {
1729   const LLT S32 = LLT::scalar(32);
1730   auto Bitcast = B.buildBitcast(S32, Src);
1731 
1732   if (ExtOpcode == TargetOpcode::G_SEXT) {
1733     auto ExtLo = B.buildSExtInReg(S32, Bitcast, 16);
1734     auto ShiftHi = B.buildAShr(S32, Bitcast, B.buildConstant(S32, 16));
1735     return std::make_pair(ExtLo.getReg(0), ShiftHi.getReg(0));
1736   }
1737 
1738   auto ShiftHi = B.buildLShr(S32, Bitcast, B.buildConstant(S32, 16));
1739   if (ExtOpcode == TargetOpcode::G_ZEXT) {
1740     auto ExtLo = B.buildAnd(S32, Bitcast, B.buildConstant(S32, 0xffff));
1741     return std::make_pair(ExtLo.getReg(0), ShiftHi.getReg(0));
1742   }
1743 
1744   assert(ExtOpcode == TargetOpcode::G_ANYEXT);
1745   return std::make_pair(Bitcast.getReg(0), ShiftHi.getReg(0));
1746 }
1747 
1748 // For cases where only a single copy is inserted for matching register banks.
1749 // Replace the register in the instruction operand
1750 static bool substituteSimpleCopyRegs(
1751   const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper, unsigned OpIdx) {
1752   SmallVector<unsigned, 1> SrcReg(OpdMapper.getVRegs(OpIdx));
1753   if (!SrcReg.empty()) {
1754     assert(SrcReg.size() == 1);
1755     OpdMapper.getMI().getOperand(OpIdx).setReg(SrcReg[0]);
1756     return true;
1757   }
1758 
1759   return false;
1760 }
1761 
1762 /// Handle register layout difference for f16 images for some subtargets.
1763 Register AMDGPURegisterBankInfo::handleD16VData(MachineIRBuilder &B,
1764                                                 MachineRegisterInfo &MRI,
1765                                                 Register Reg) const {
1766   if (!Subtarget.hasUnpackedD16VMem())
1767     return Reg;
1768 
1769   const LLT S16 = LLT::scalar(16);
1770   LLT StoreVT = MRI.getType(Reg);
1771   if (!StoreVT.isVector() || StoreVT.getElementType() != S16)
1772     return Reg;
1773 
1774   auto Unmerge = B.buildUnmerge(S16, Reg);
1775 
1776 
1777   SmallVector<Register, 4> WideRegs;
1778   for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
1779     WideRegs.push_back(Unmerge.getReg(I));
1780 
1781   const LLT S32 = LLT::scalar(32);
1782   int NumElts = StoreVT.getNumElements();
1783 
1784   return B.buildMerge(LLT::fixed_vector(NumElts, S32), WideRegs).getReg(0);
1785 }
1786 
1787 static std::pair<Register, unsigned>
1788 getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg) {
1789   int64_t Const;
1790   if (mi_match(Reg, MRI, m_ICst(Const)))
1791     return std::make_pair(Register(), Const);
1792 
1793   Register Base;
1794   if (mi_match(Reg, MRI, m_GAdd(m_Reg(Base), m_ICst(Const))))
1795     return std::make_pair(Base, Const);
1796 
1797   // TODO: Handle G_OR used for add case
1798   return std::make_pair(Reg, 0);
1799 }
1800 
1801 std::pair<Register, unsigned>
1802 AMDGPURegisterBankInfo::splitBufferOffsets(MachineIRBuilder &B,
1803                                            Register OrigOffset) const {
1804   const unsigned MaxImm = 4095;
1805   Register BaseReg;
1806   unsigned ImmOffset;
1807   const LLT S32 = LLT::scalar(32);
1808 
1809   std::tie(BaseReg, ImmOffset) = getBaseWithConstantOffset(*B.getMRI(),
1810                                                            OrigOffset);
1811 
1812   unsigned C1 = 0;
1813   if (ImmOffset != 0) {
1814     // If the immediate value is too big for the immoffset field, put the value
1815     // and -4096 into the immoffset field so that the value that is copied/added
1816     // for the voffset field is a multiple of 4096, and it stands more chance
1817     // of being CSEd with the copy/add for another similar load/store.
1818     // However, do not do that rounding down to a multiple of 4096 if that is a
1819     // negative number, as it appears to be illegal to have a negative offset
1820     // in the vgpr, even if adding the immediate offset makes it positive.
1821     unsigned Overflow = ImmOffset & ~MaxImm;
1822     ImmOffset -= Overflow;
1823     if ((int32_t)Overflow < 0) {
1824       Overflow += ImmOffset;
1825       ImmOffset = 0;
1826     }
1827 
1828     C1 = ImmOffset;
1829     if (Overflow != 0) {
1830       if (!BaseReg)
1831         BaseReg = B.buildConstant(S32, Overflow).getReg(0);
1832       else {
1833         auto OverflowVal = B.buildConstant(S32, Overflow);
1834         BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
1835       }
1836     }
1837   }
1838 
1839   if (!BaseReg)
1840     BaseReg = B.buildConstant(S32, 0).getReg(0);
1841 
1842   return {BaseReg, C1};
1843 }
1844 
1845 bool AMDGPURegisterBankInfo::buildVCopy(MachineIRBuilder &B, Register DstReg,
1846                                         Register SrcReg) const {
1847   MachineRegisterInfo &MRI = *B.getMRI();
1848   LLT SrcTy = MRI.getType(SrcReg);
1849   if (SrcTy.getSizeInBits() == 32) {
1850     // Use a v_mov_b32 here to make the exec dependency explicit.
1851     B.buildInstr(AMDGPU::V_MOV_B32_e32)
1852       .addDef(DstReg)
1853       .addUse(SrcReg);
1854     return constrainGenericRegister(DstReg, AMDGPU::VGPR_32RegClass, MRI) &&
1855            constrainGenericRegister(SrcReg, AMDGPU::SReg_32RegClass, MRI);
1856   }
1857 
1858   Register TmpReg0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1859   Register TmpReg1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1860 
1861   B.buildInstr(AMDGPU::V_MOV_B32_e32)
1862     .addDef(TmpReg0)
1863     .addUse(SrcReg, 0, AMDGPU::sub0);
1864   B.buildInstr(AMDGPU::V_MOV_B32_e32)
1865     .addDef(TmpReg1)
1866     .addUse(SrcReg, 0, AMDGPU::sub1);
1867   B.buildInstr(AMDGPU::REG_SEQUENCE)
1868     .addDef(DstReg)
1869     .addUse(TmpReg0)
1870     .addImm(AMDGPU::sub0)
1871     .addUse(TmpReg1)
1872     .addImm(AMDGPU::sub1);
1873 
1874   return constrainGenericRegister(SrcReg, AMDGPU::SReg_64RegClass, MRI) &&
1875          constrainGenericRegister(DstReg, AMDGPU::VReg_64RegClass, MRI);
1876 }
1877 
1878 /// Utility function for pushing dynamic vector indexes with a constant offset
1879 /// into waterfall loops.
1880 static void reinsertVectorIndexAdd(MachineIRBuilder &B,
1881                                    MachineInstr &IdxUseInstr,
1882                                    unsigned OpIdx,
1883                                    unsigned ConstOffset) {
1884   MachineRegisterInfo &MRI = *B.getMRI();
1885   const LLT S32 = LLT::scalar(32);
1886   Register WaterfallIdx = IdxUseInstr.getOperand(OpIdx).getReg();
1887   B.setInsertPt(*IdxUseInstr.getParent(), IdxUseInstr.getIterator());
1888 
1889   auto MaterializedOffset = B.buildConstant(S32, ConstOffset);
1890 
1891   auto Add = B.buildAdd(S32, WaterfallIdx, MaterializedOffset);
1892   MRI.setRegBank(MaterializedOffset.getReg(0), AMDGPU::SGPRRegBank);
1893   MRI.setRegBank(Add.getReg(0), AMDGPU::SGPRRegBank);
1894   IdxUseInstr.getOperand(OpIdx).setReg(Add.getReg(0));
1895 }
1896 
1897 /// Implement extending a 32-bit value to a 64-bit value. \p Lo32Reg is the
1898 /// original 32-bit source value (to be inserted in the low part of the combined
1899 /// 64-bit result), and \p Hi32Reg is the high half of the combined 64-bit
1900 /// value.
1901 static void extendLow32IntoHigh32(MachineIRBuilder &B,
1902                                   Register Hi32Reg, Register Lo32Reg,
1903                                   unsigned ExtOpc,
1904                                   const RegisterBank &RegBank,
1905                                   bool IsBooleanSrc = false) {
1906   if (ExtOpc == AMDGPU::G_ZEXT) {
1907     B.buildConstant(Hi32Reg, 0);
1908   } else if (ExtOpc == AMDGPU::G_SEXT) {
1909     if (IsBooleanSrc) {
1910       // If we know the original source was an s1, the high half is the same as
1911       // the low.
1912       B.buildCopy(Hi32Reg, Lo32Reg);
1913     } else {
1914       // Replicate sign bit from 32-bit extended part.
1915       auto ShiftAmt = B.buildConstant(LLT::scalar(32), 31);
1916       B.getMRI()->setRegBank(ShiftAmt.getReg(0), RegBank);
1917       B.buildAShr(Hi32Reg, Lo32Reg, ShiftAmt);
1918     }
1919   } else {
1920     assert(ExtOpc == AMDGPU::G_ANYEXT && "not an integer extension");
1921     B.buildUndef(Hi32Reg);
1922   }
1923 }
1924 
1925 bool AMDGPURegisterBankInfo::foldExtractEltToCmpSelect(
1926   MachineInstr &MI, MachineRegisterInfo &MRI,
1927   const OperandsMapper &OpdMapper) const {
1928 
1929   Register VecReg = MI.getOperand(1).getReg();
1930   Register Idx = MI.getOperand(2).getReg();
1931 
1932   const RegisterBank &IdxBank =
1933     *OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
1934 
1935   bool IsDivergentIdx = IdxBank != AMDGPU::SGPRRegBank;
1936 
1937   LLT VecTy = MRI.getType(VecReg);
1938   unsigned EltSize = VecTy.getScalarSizeInBits();
1939   unsigned NumElem = VecTy.getNumElements();
1940 
1941   if (!SITargetLowering::shouldExpandVectorDynExt(EltSize, NumElem,
1942                                                   IsDivergentIdx, &Subtarget))
1943     return false;
1944 
1945   MachineIRBuilder B(MI);
1946   LLT S32 = LLT::scalar(32);
1947 
1948   const RegisterBank &DstBank =
1949     *OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
1950   const RegisterBank &SrcBank =
1951     *OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
1952 
1953   const RegisterBank &CCBank =
1954     (DstBank == AMDGPU::SGPRRegBank &&
1955      SrcBank == AMDGPU::SGPRRegBank &&
1956      IdxBank == AMDGPU::SGPRRegBank) ? AMDGPU::SGPRRegBank
1957                                      : AMDGPU::VCCRegBank;
1958   LLT CCTy = (CCBank == AMDGPU::SGPRRegBank) ? S32 : LLT::scalar(1);
1959 
1960   if (CCBank == AMDGPU::VCCRegBank && IdxBank == AMDGPU::SGPRRegBank) {
1961     Idx = B.buildCopy(S32, Idx)->getOperand(0).getReg();
1962     MRI.setRegBank(Idx, AMDGPU::VGPRRegBank);
1963   }
1964 
1965   LLT EltTy = VecTy.getScalarType();
1966   SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0));
1967   unsigned NumLanes = DstRegs.size();
1968   if (!NumLanes)
1969     NumLanes = 1;
1970   else
1971     EltTy = MRI.getType(DstRegs[0]);
1972 
1973   auto UnmergeToEltTy = B.buildUnmerge(EltTy, VecReg);
1974   SmallVector<Register, 2> Res(NumLanes);
1975   for (unsigned L = 0; L < NumLanes; ++L)
1976     Res[L] = UnmergeToEltTy.getReg(L);
1977 
1978   for (unsigned I = 1; I < NumElem; ++I) {
1979     auto IC = B.buildConstant(S32, I);
1980     MRI.setRegBank(IC->getOperand(0).getReg(), AMDGPU::SGPRRegBank);
1981     auto Cmp = B.buildICmp(CmpInst::ICMP_EQ, CCTy, Idx, IC);
1982     MRI.setRegBank(Cmp->getOperand(0).getReg(), CCBank);
1983 
1984     for (unsigned L = 0; L < NumLanes; ++L) {
1985       auto S = B.buildSelect(EltTy, Cmp,
1986                              UnmergeToEltTy.getReg(I * NumLanes + L), Res[L]);
1987 
1988       for (unsigned N : { 0, 2, 3 })
1989         MRI.setRegBank(S->getOperand(N).getReg(), DstBank);
1990 
1991       Res[L] = S->getOperand(0).getReg();
1992     }
1993   }
1994 
1995   for (unsigned L = 0; L < NumLanes; ++L) {
1996     Register DstReg = (NumLanes == 1) ? MI.getOperand(0).getReg() : DstRegs[L];
1997     B.buildCopy(DstReg, Res[L]);
1998     MRI.setRegBank(DstReg, DstBank);
1999   }
2000 
2001   MRI.setRegBank(MI.getOperand(0).getReg(), DstBank);
2002   MI.eraseFromParent();
2003 
2004   return true;
2005 }
2006 
2007 // Insert a cross regbank copy for a register if it already has a bank that
2008 // differs from the one we want to set.
2009 static Register constrainRegToBank(MachineRegisterInfo &MRI,
2010                                    MachineIRBuilder &B, Register &Reg,
2011                                    const RegisterBank &Bank) {
2012   const RegisterBank *CurrBank = MRI.getRegBankOrNull(Reg);
2013   if (CurrBank && *CurrBank != Bank) {
2014     Register Copy = B.buildCopy(MRI.getType(Reg), Reg).getReg(0);
2015     MRI.setRegBank(Copy, Bank);
2016     return Copy;
2017   }
2018 
2019   MRI.setRegBank(Reg, Bank);
2020   return Reg;
2021 }
2022 
2023 bool AMDGPURegisterBankInfo::foldInsertEltToCmpSelect(
2024   MachineInstr &MI, MachineRegisterInfo &MRI,
2025   const OperandsMapper &OpdMapper) const {
2026 
2027   Register VecReg = MI.getOperand(1).getReg();
2028   Register Idx = MI.getOperand(3).getReg();
2029 
2030   const RegisterBank &IdxBank =
2031     *OpdMapper.getInstrMapping().getOperandMapping(3).BreakDown[0].RegBank;
2032 
2033   bool IsDivergentIdx = IdxBank != AMDGPU::SGPRRegBank;
2034 
2035   LLT VecTy = MRI.getType(VecReg);
2036   unsigned EltSize = VecTy.getScalarSizeInBits();
2037   unsigned NumElem = VecTy.getNumElements();
2038 
2039   if (!SITargetLowering::shouldExpandVectorDynExt(EltSize, NumElem,
2040                                                   IsDivergentIdx, &Subtarget))
2041     return false;
2042 
2043   MachineIRBuilder B(MI);
2044   LLT S32 = LLT::scalar(32);
2045 
2046   const RegisterBank &DstBank =
2047     *OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2048   const RegisterBank &SrcBank =
2049     *OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
2050   const RegisterBank &InsBank =
2051     *OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
2052 
2053   const RegisterBank &CCBank =
2054     (DstBank == AMDGPU::SGPRRegBank &&
2055      SrcBank == AMDGPU::SGPRRegBank &&
2056      InsBank == AMDGPU::SGPRRegBank &&
2057      IdxBank == AMDGPU::SGPRRegBank) ? AMDGPU::SGPRRegBank
2058                                      : AMDGPU::VCCRegBank;
2059   LLT CCTy = (CCBank == AMDGPU::SGPRRegBank) ? S32 : LLT::scalar(1);
2060 
2061   if (CCBank == AMDGPU::VCCRegBank && IdxBank == AMDGPU::SGPRRegBank) {
2062     Idx = B.buildCopy(S32, Idx)->getOperand(0).getReg();
2063     MRI.setRegBank(Idx, AMDGPU::VGPRRegBank);
2064   }
2065 
2066   LLT EltTy = VecTy.getScalarType();
2067   SmallVector<Register, 2> InsRegs(OpdMapper.getVRegs(2));
2068   unsigned NumLanes = InsRegs.size();
2069   if (!NumLanes) {
2070     NumLanes = 1;
2071     InsRegs.push_back(MI.getOperand(2).getReg());
2072   } else {
2073     EltTy = MRI.getType(InsRegs[0]);
2074   }
2075 
2076   auto UnmergeToEltTy = B.buildUnmerge(EltTy, VecReg);
2077   SmallVector<Register, 16> Ops(NumElem * NumLanes);
2078 
2079   for (unsigned I = 0; I < NumElem; ++I) {
2080     auto IC = B.buildConstant(S32, I);
2081     MRI.setRegBank(IC->getOperand(0).getReg(), AMDGPU::SGPRRegBank);
2082     auto Cmp = B.buildICmp(CmpInst::ICMP_EQ, CCTy, Idx, IC);
2083     MRI.setRegBank(Cmp->getOperand(0).getReg(), CCBank);
2084 
2085     for (unsigned L = 0; L < NumLanes; ++L) {
2086       Register Op0 = constrainRegToBank(MRI, B, InsRegs[L], DstBank);
2087       Register Op1 = UnmergeToEltTy.getReg(I * NumLanes + L);
2088       Op1 = constrainRegToBank(MRI, B, Op1, DstBank);
2089 
2090       Register Select = B.buildSelect(EltTy, Cmp, Op0, Op1).getReg(0);
2091       MRI.setRegBank(Select, DstBank);
2092 
2093       Ops[I * NumLanes + L] = Select;
2094     }
2095   }
2096 
2097   LLT MergeTy = LLT::fixed_vector(Ops.size(), EltTy);
2098   if (MergeTy == MRI.getType(MI.getOperand(0).getReg())) {
2099     B.buildBuildVector(MI.getOperand(0), Ops);
2100   } else {
2101     auto Vec = B.buildBuildVector(MergeTy, Ops);
2102     MRI.setRegBank(Vec->getOperand(0).getReg(), DstBank);
2103     B.buildBitcast(MI.getOperand(0).getReg(), Vec);
2104   }
2105 
2106   MRI.setRegBank(MI.getOperand(0).getReg(), DstBank);
2107   MI.eraseFromParent();
2108 
2109   return true;
2110 }
2111 
2112 void AMDGPURegisterBankInfo::applyMappingImpl(
2113     const OperandsMapper &OpdMapper) const {
2114   MachineInstr &MI = OpdMapper.getMI();
2115   unsigned Opc = MI.getOpcode();
2116   MachineRegisterInfo &MRI = OpdMapper.getMRI();
2117   switch (Opc) {
2118   case AMDGPU::G_PHI: {
2119     Register DstReg = MI.getOperand(0).getReg();
2120     LLT DstTy = MRI.getType(DstReg);
2121     if (DstTy != LLT::scalar(1))
2122       break;
2123 
2124     const LLT S32 = LLT::scalar(32);
2125     const RegisterBank *DstBank =
2126       OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2127     if (DstBank == &AMDGPU::VCCRegBank) {
2128       applyDefaultMapping(OpdMapper);
2129       // The standard handling only considers the result register bank for
2130       // phis. For VCC, blindly inserting a copy when the phi is lowered will
2131       // produce an invalid copy. We can only copy with some kind of compare to
2132       // get a vector boolean result. Insert a register bank copy that will be
2133       // correctly lowered to a compare.
2134       MachineIRBuilder B(*MI.getParent()->getParent());
2135 
2136       for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
2137         Register SrcReg = MI.getOperand(I).getReg();
2138         const RegisterBank *SrcBank = getRegBank(SrcReg, MRI, *TRI);
2139 
2140         if (SrcBank != &AMDGPU::VCCRegBank) {
2141           MachineBasicBlock *SrcMBB = MI.getOperand(I + 1).getMBB();
2142           B.setInsertPt(*SrcMBB, SrcMBB->getFirstTerminator());
2143 
2144           auto Copy = B.buildCopy(LLT::scalar(1), SrcReg);
2145           MRI.setRegBank(Copy.getReg(0), AMDGPU::VCCRegBank);
2146           MI.getOperand(I).setReg(Copy.getReg(0));
2147         }
2148       }
2149 
2150       return;
2151     }
2152 
2153     // Phi handling is strange and only considers the bank of the destination.
2154     substituteSimpleCopyRegs(OpdMapper, 0);
2155 
2156     // Promote SGPR/VGPR booleans to s32
2157     MachineFunction *MF = MI.getParent()->getParent();
2158     ApplyRegBankMapping ApplyBank(*this, MRI, DstBank);
2159     MachineIRBuilder B(MI, ApplyBank);
2160     LegalizerHelper Helper(*MF, ApplyBank, B);
2161 
2162     if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized)
2163       llvm_unreachable("widen scalar should have succeeded");
2164 
2165     return;
2166   }
2167   case AMDGPU::G_ICMP:
2168   case AMDGPU::G_UADDO:
2169   case AMDGPU::G_USUBO:
2170   case AMDGPU::G_UADDE:
2171   case AMDGPU::G_SADDE:
2172   case AMDGPU::G_USUBE:
2173   case AMDGPU::G_SSUBE: {
2174     unsigned BoolDstOp = Opc == AMDGPU::G_ICMP ? 0 : 1;
2175     Register DstReg = MI.getOperand(BoolDstOp).getReg();
2176 
2177     const RegisterBank *DstBank =
2178       OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2179     if (DstBank != &AMDGPU::SGPRRegBank)
2180       break;
2181 
2182     const bool HasCarryIn = MI.getNumOperands() == 5;
2183 
2184     // If this is a scalar compare, promote the result to s32, as the selection
2185     // will end up using a copy to a 32-bit vreg.
2186     const LLT S32 = LLT::scalar(32);
2187     Register NewDstReg = MRI.createGenericVirtualRegister(S32);
2188     MRI.setRegBank(NewDstReg, AMDGPU::SGPRRegBank);
2189     MI.getOperand(BoolDstOp).setReg(NewDstReg);
2190     MachineIRBuilder B(MI);
2191 
2192     if (HasCarryIn) {
2193       Register NewSrcReg = MRI.createGenericVirtualRegister(S32);
2194       MRI.setRegBank(NewSrcReg, AMDGPU::SGPRRegBank);
2195       B.buildZExt(NewSrcReg, MI.getOperand(4).getReg());
2196       MI.getOperand(4).setReg(NewSrcReg);
2197     }
2198 
2199     MachineBasicBlock *MBB = MI.getParent();
2200     B.setInsertPt(*MBB, std::next(MI.getIterator()));
2201 
2202     // If we had a constrained VCC result register, a copy was inserted to VCC
2203     // from SGPR.
2204     SmallVector<Register, 1> DefRegs(OpdMapper.getVRegs(0));
2205     if (DefRegs.empty())
2206       DefRegs.push_back(DstReg);
2207     B.buildTrunc(DefRegs[0], NewDstReg);
2208     return;
2209   }
2210   case AMDGPU::G_SELECT: {
2211     Register DstReg = MI.getOperand(0).getReg();
2212     LLT DstTy = MRI.getType(DstReg);
2213 
2214     SmallVector<Register, 1> CondRegs(OpdMapper.getVRegs(1));
2215     if (CondRegs.empty())
2216       CondRegs.push_back(MI.getOperand(1).getReg());
2217     else {
2218       assert(CondRegs.size() == 1);
2219     }
2220 
2221     const RegisterBank *CondBank = getRegBank(CondRegs[0], MRI, *TRI);
2222     if (CondBank == &AMDGPU::SGPRRegBank) {
2223       MachineIRBuilder B(MI);
2224       const LLT S32 = LLT::scalar(32);
2225       Register NewCondReg = MRI.createGenericVirtualRegister(S32);
2226       MRI.setRegBank(NewCondReg, AMDGPU::SGPRRegBank);
2227 
2228       MI.getOperand(1).setReg(NewCondReg);
2229       B.buildZExt(NewCondReg, CondRegs[0]);
2230     }
2231 
2232     if (DstTy.getSizeInBits() != 64)
2233       break;
2234 
2235     MachineIRBuilder B(MI);
2236     LLT HalfTy = getHalfSizedType(DstTy);
2237 
2238     SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
2239     SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2));
2240     SmallVector<Register, 2> Src2Regs(OpdMapper.getVRegs(3));
2241 
2242     // All inputs are SGPRs, nothing special to do.
2243     if (DefRegs.empty()) {
2244       assert(Src1Regs.empty() && Src2Regs.empty());
2245       break;
2246     }
2247 
2248     if (Src1Regs.empty())
2249       split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg());
2250     else {
2251       setRegsToType(MRI, Src1Regs, HalfTy);
2252     }
2253 
2254     if (Src2Regs.empty())
2255       split64BitValueForMapping(B, Src2Regs, HalfTy, MI.getOperand(3).getReg());
2256     else
2257       setRegsToType(MRI, Src2Regs, HalfTy);
2258 
2259     setRegsToType(MRI, DefRegs, HalfTy);
2260 
2261     B.buildSelect(DefRegs[0], CondRegs[0], Src1Regs[0], Src2Regs[0]);
2262     B.buildSelect(DefRegs[1], CondRegs[0], Src1Regs[1], Src2Regs[1]);
2263 
2264     MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
2265     MI.eraseFromParent();
2266     return;
2267   }
2268   case AMDGPU::G_BRCOND: {
2269     Register CondReg = MI.getOperand(0).getReg();
2270     // FIXME: Should use legalizer helper, but should change bool ext type.
2271     const RegisterBank *CondBank =
2272       OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2273 
2274     if (CondBank == &AMDGPU::SGPRRegBank) {
2275       MachineIRBuilder B(MI);
2276       const LLT S32 = LLT::scalar(32);
2277       Register NewCondReg = MRI.createGenericVirtualRegister(S32);
2278       MRI.setRegBank(NewCondReg, AMDGPU::SGPRRegBank);
2279 
2280       MI.getOperand(0).setReg(NewCondReg);
2281       B.buildZExt(NewCondReg, CondReg);
2282       return;
2283     }
2284 
2285     break;
2286   }
2287   case AMDGPU::G_AND:
2288   case AMDGPU::G_OR:
2289   case AMDGPU::G_XOR: {
2290     // 64-bit and is only available on the SALU, so split into 2 32-bit ops if
2291     // there is a VGPR input.
2292     Register DstReg = MI.getOperand(0).getReg();
2293     LLT DstTy = MRI.getType(DstReg);
2294 
2295     if (DstTy.getSizeInBits() == 1) {
2296       const RegisterBank *DstBank =
2297         OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2298       if (DstBank == &AMDGPU::VCCRegBank)
2299         break;
2300 
2301       MachineFunction *MF = MI.getParent()->getParent();
2302       ApplyRegBankMapping ApplyBank(*this, MRI, DstBank);
2303       MachineIRBuilder B(MI, ApplyBank);
2304       LegalizerHelper Helper(*MF, ApplyBank, B);
2305 
2306       if (Helper.widenScalar(MI, 0, LLT::scalar(32)) !=
2307           LegalizerHelper::Legalized)
2308         llvm_unreachable("widen scalar should have succeeded");
2309       return;
2310     }
2311 
2312     if (DstTy.getSizeInBits() != 64)
2313       break;
2314 
2315     LLT HalfTy = getHalfSizedType(DstTy);
2316     SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
2317     SmallVector<Register, 2> Src0Regs(OpdMapper.getVRegs(1));
2318     SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2));
2319 
2320     // All inputs are SGPRs, nothing special to do.
2321     if (DefRegs.empty()) {
2322       assert(Src0Regs.empty() && Src1Regs.empty());
2323       break;
2324     }
2325 
2326     assert(DefRegs.size() == 2);
2327     assert(Src0Regs.size() == Src1Regs.size() &&
2328            (Src0Regs.empty() || Src0Regs.size() == 2));
2329 
2330     // Depending on where the source registers came from, the generic code may
2331     // have decided to split the inputs already or not. If not, we still need to
2332     // extract the values.
2333     MachineIRBuilder B(MI);
2334 
2335     if (Src0Regs.empty())
2336       split64BitValueForMapping(B, Src0Regs, HalfTy, MI.getOperand(1).getReg());
2337     else
2338       setRegsToType(MRI, Src0Regs, HalfTy);
2339 
2340     if (Src1Regs.empty())
2341       split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg());
2342     else
2343       setRegsToType(MRI, Src1Regs, HalfTy);
2344 
2345     setRegsToType(MRI, DefRegs, HalfTy);
2346 
2347     B.buildInstr(Opc, {DefRegs[0]}, {Src0Regs[0], Src1Regs[0]});
2348     B.buildInstr(Opc, {DefRegs[1]}, {Src0Regs[1], Src1Regs[1]});
2349 
2350     MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
2351     MI.eraseFromParent();
2352     return;
2353   }
2354   case AMDGPU::G_ABS: {
2355     Register SrcReg = MI.getOperand(1).getReg();
2356     const RegisterBank *SrcBank = MRI.getRegBankOrNull(SrcReg);
2357 
2358     // There is no VALU abs instruction so we need to replace it with a sub and
2359     // max combination.
2360     if (SrcBank && SrcBank == &AMDGPU::VGPRRegBank) {
2361       MachineFunction *MF = MI.getParent()->getParent();
2362       ApplyRegBankMapping Apply(*this, MRI, &AMDGPU::VGPRRegBank);
2363       MachineIRBuilder B(MI, Apply);
2364       LegalizerHelper Helper(*MF, Apply, B);
2365 
2366       if (Helper.lowerAbsToMaxNeg(MI) != LegalizerHelper::Legalized)
2367         llvm_unreachable("lowerAbsToMaxNeg should have succeeded");
2368       return;
2369     }
2370     LLVM_FALLTHROUGH;
2371   }
2372   case AMDGPU::G_ADD:
2373   case AMDGPU::G_SUB:
2374   case AMDGPU::G_MUL:
2375   case AMDGPU::G_SHL:
2376   case AMDGPU::G_LSHR:
2377   case AMDGPU::G_ASHR:
2378   case AMDGPU::G_SMIN:
2379   case AMDGPU::G_SMAX:
2380   case AMDGPU::G_UMIN:
2381   case AMDGPU::G_UMAX: {
2382     Register DstReg = MI.getOperand(0).getReg();
2383     LLT DstTy = MRI.getType(DstReg);
2384 
2385     // 16-bit operations are VALU only, but can be promoted to 32-bit SALU.
2386     // Packed 16-bit operations need to be scalarized and promoted.
2387     if (DstTy != LLT::scalar(16) && DstTy != LLT::fixed_vector(2, 16))
2388       break;
2389 
2390     const RegisterBank *DstBank =
2391       OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2392     if (DstBank == &AMDGPU::VGPRRegBank)
2393       break;
2394 
2395     const LLT S32 = LLT::scalar(32);
2396     MachineBasicBlock *MBB = MI.getParent();
2397     MachineFunction *MF = MBB->getParent();
2398     ApplyRegBankMapping ApplySALU(*this, MRI, &AMDGPU::SGPRRegBank);
2399     MachineIRBuilder B(MI, ApplySALU);
2400 
2401     if (DstTy.isVector()) {
2402       Register WideSrc0Lo, WideSrc0Hi;
2403       Register WideSrc1Lo, WideSrc1Hi;
2404 
2405       unsigned ExtendOp = getExtendOp(MI.getOpcode());
2406       std::tie(WideSrc0Lo, WideSrc0Hi)
2407         = unpackV2S16ToS32(B, MI.getOperand(1).getReg(), ExtendOp);
2408       std::tie(WideSrc1Lo, WideSrc1Hi)
2409         = unpackV2S16ToS32(B, MI.getOperand(2).getReg(), ExtendOp);
2410       auto Lo = B.buildInstr(MI.getOpcode(), {S32}, {WideSrc0Lo, WideSrc1Lo});
2411       auto Hi = B.buildInstr(MI.getOpcode(), {S32}, {WideSrc0Hi, WideSrc1Hi});
2412       B.buildBuildVectorTrunc(DstReg, {Lo.getReg(0), Hi.getReg(0)});
2413       MI.eraseFromParent();
2414     } else {
2415       LegalizerHelper Helper(*MF, ApplySALU, B);
2416 
2417       if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized)
2418         llvm_unreachable("widen scalar should have succeeded");
2419 
2420       // FIXME: s16 shift amounts should be legal.
2421       if (Opc == AMDGPU::G_SHL || Opc == AMDGPU::G_LSHR ||
2422           Opc == AMDGPU::G_ASHR) {
2423         B.setInsertPt(*MBB, MI.getIterator());
2424         if (Helper.widenScalar(MI, 1, S32) != LegalizerHelper::Legalized)
2425           llvm_unreachable("widen scalar should have succeeded");
2426       }
2427     }
2428 
2429     return;
2430   }
2431   case AMDGPU::G_SEXT_INREG: {
2432     SmallVector<Register, 2> SrcRegs(OpdMapper.getVRegs(1));
2433     if (SrcRegs.empty())
2434       break; // Nothing to repair
2435 
2436     const LLT S32 = LLT::scalar(32);
2437     MachineIRBuilder B(MI);
2438     ApplyRegBankMapping O(*this, MRI, &AMDGPU::VGPRRegBank);
2439     GISelObserverWrapper Observer(&O);
2440     B.setChangeObserver(Observer);
2441 
2442     // Don't use LegalizerHelper's narrowScalar. It produces unwanted G_SEXTs
2443     // we would need to further expand, and doesn't let us directly set the
2444     // result registers.
2445     SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0));
2446 
2447     int Amt = MI.getOperand(2).getImm();
2448     if (Amt <= 32) {
2449       if (Amt == 32) {
2450         // The low bits are unchanged.
2451         B.buildCopy(DstRegs[0], SrcRegs[0]);
2452       } else {
2453         // Extend in the low bits and propagate the sign bit to the high half.
2454         B.buildSExtInReg(DstRegs[0], SrcRegs[0], Amt);
2455       }
2456 
2457       B.buildAShr(DstRegs[1], DstRegs[0], B.buildConstant(S32, 31));
2458     } else {
2459       // The low bits are unchanged, and extend in the high bits.
2460       B.buildCopy(DstRegs[0], SrcRegs[0]);
2461       B.buildSExtInReg(DstRegs[1], DstRegs[0], Amt - 32);
2462     }
2463 
2464     Register DstReg = MI.getOperand(0).getReg();
2465     MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
2466     MI.eraseFromParent();
2467     return;
2468   }
2469   case AMDGPU::G_CTPOP:
2470   case AMDGPU::G_BITREVERSE: {
2471     const RegisterBank *DstBank =
2472       OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2473     if (DstBank == &AMDGPU::SGPRRegBank)
2474       break;
2475 
2476     Register SrcReg = MI.getOperand(1).getReg();
2477     const LLT S32 = LLT::scalar(32);
2478     LLT Ty = MRI.getType(SrcReg);
2479     if (Ty == S32)
2480       break;
2481 
2482     ApplyRegBankMapping ApplyVALU(*this, MRI, &AMDGPU::VGPRRegBank);
2483     MachineIRBuilder B(MI, ApplyVALU);
2484 
2485     MachineFunction &MF = B.getMF();
2486     LegalizerHelper Helper(MF, ApplyVALU, B);
2487 
2488     if (Helper.narrowScalar(MI, 1, S32) != LegalizerHelper::Legalized)
2489       llvm_unreachable("narrowScalar should have succeeded");
2490     return;
2491   }
2492   case AMDGPU::G_AMDGPU_FFBH_U32:
2493   case AMDGPU::G_AMDGPU_FFBL_B32:
2494   case AMDGPU::G_CTLZ_ZERO_UNDEF:
2495   case AMDGPU::G_CTTZ_ZERO_UNDEF: {
2496     const RegisterBank *DstBank =
2497         OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2498     if (DstBank == &AMDGPU::SGPRRegBank)
2499       break;
2500 
2501     Register SrcReg = MI.getOperand(1).getReg();
2502     const LLT S32 = LLT::scalar(32);
2503     LLT Ty = MRI.getType(SrcReg);
2504     if (Ty == S32)
2505       break;
2506 
2507     // We can narrow this more efficiently than Helper can by using ffbh/ffbl
2508     // which return -1 when the input is zero:
2509     // (ctlz_zero_undef hi:lo) -> (umin (ffbh hi), (add (ffbh lo), 32))
2510     // (cttz_zero_undef hi:lo) -> (umin (add (ffbl hi), 32), (ffbl lo))
2511     // (ffbh hi:lo) -> (umin (ffbh hi), (uaddsat (ffbh lo), 32))
2512     // (ffbl hi:lo) -> (umin (uaddsat (ffbh hi), 32), (ffbh lo))
2513     ApplyRegBankMapping ApplyVALU(*this, MRI, &AMDGPU::VGPRRegBank);
2514     MachineIRBuilder B(MI, ApplyVALU);
2515     SmallVector<Register, 2> SrcRegs(OpdMapper.getVRegs(1));
2516     unsigned NewOpc = Opc == AMDGPU::G_CTLZ_ZERO_UNDEF
2517                           ? (unsigned)AMDGPU::G_AMDGPU_FFBH_U32
2518                           : Opc == AMDGPU::G_CTTZ_ZERO_UNDEF
2519                                 ? (unsigned)AMDGPU::G_AMDGPU_FFBL_B32
2520                                 : Opc;
2521     unsigned Idx = NewOpc == AMDGPU::G_AMDGPU_FFBH_U32;
2522     auto X = B.buildInstr(NewOpc, {S32}, {SrcRegs[Idx]});
2523     auto Y = B.buildInstr(NewOpc, {S32}, {SrcRegs[Idx ^ 1]});
2524     unsigned AddOpc =
2525         Opc == AMDGPU::G_CTLZ_ZERO_UNDEF || Opc == AMDGPU::G_CTTZ_ZERO_UNDEF
2526             ? AMDGPU::G_ADD
2527             : AMDGPU::G_UADDSAT;
2528     Y = B.buildInstr(AddOpc, {S32}, {Y, B.buildConstant(S32, 32)});
2529     Register DstReg = MI.getOperand(0).getReg();
2530     B.buildUMin(DstReg, X, Y);
2531     MI.eraseFromParent();
2532     return;
2533   }
2534   case AMDGPU::G_SEXT:
2535   case AMDGPU::G_ZEXT:
2536   case AMDGPU::G_ANYEXT: {
2537     Register SrcReg = MI.getOperand(1).getReg();
2538     LLT SrcTy = MRI.getType(SrcReg);
2539     const bool Signed = Opc == AMDGPU::G_SEXT;
2540 
2541     assert(empty(OpdMapper.getVRegs(1)));
2542 
2543     MachineIRBuilder B(MI);
2544     const RegisterBank *SrcBank =
2545       OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
2546 
2547     Register DstReg = MI.getOperand(0).getReg();
2548     LLT DstTy = MRI.getType(DstReg);
2549     if (DstTy.isScalar() &&
2550         SrcBank != &AMDGPU::SGPRRegBank &&
2551         SrcBank != &AMDGPU::VCCRegBank &&
2552         // FIXME: Should handle any type that round to s64 when irregular
2553         // breakdowns supported.
2554         DstTy.getSizeInBits() == 64 &&
2555         SrcTy.getSizeInBits() <= 32) {
2556       SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
2557 
2558       // Extend to 32-bit, and then extend the low half.
2559       if (Signed) {
2560         // TODO: Should really be buildSExtOrCopy
2561         B.buildSExtOrTrunc(DefRegs[0], SrcReg);
2562       } else if (Opc == AMDGPU::G_ZEXT) {
2563         B.buildZExtOrTrunc(DefRegs[0], SrcReg);
2564       } else {
2565         B.buildAnyExtOrTrunc(DefRegs[0], SrcReg);
2566       }
2567 
2568       extendLow32IntoHigh32(B, DefRegs[1], DefRegs[0], Opc, *SrcBank);
2569       MRI.setRegBank(DstReg, *SrcBank);
2570       MI.eraseFromParent();
2571       return;
2572     }
2573 
2574     if (SrcTy != LLT::scalar(1))
2575       return;
2576 
2577     // It is not legal to have a legalization artifact with a VCC source. Rather
2578     // than introducing a copy, insert the select we would have to select the
2579     // copy to.
2580     if (SrcBank == &AMDGPU::VCCRegBank) {
2581       SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
2582 
2583       const RegisterBank *DstBank = &AMDGPU::VGPRRegBank;
2584 
2585       unsigned DstSize = DstTy.getSizeInBits();
2586       // 64-bit select is SGPR only
2587       const bool UseSel64 = DstSize > 32 &&
2588         SrcBank->getID() == AMDGPU::SGPRRegBankID;
2589 
2590       // TODO: Should s16 select be legal?
2591       LLT SelType = UseSel64 ? LLT::scalar(64) : LLT::scalar(32);
2592       auto True = B.buildConstant(SelType, Signed ? -1 : 1);
2593       auto False = B.buildConstant(SelType, 0);
2594 
2595       MRI.setRegBank(True.getReg(0), *DstBank);
2596       MRI.setRegBank(False.getReg(0), *DstBank);
2597       MRI.setRegBank(DstReg, *DstBank);
2598 
2599       if (DstSize > 32) {
2600         B.buildSelect(DefRegs[0], SrcReg, True, False);
2601         extendLow32IntoHigh32(B, DefRegs[1], DefRegs[0], Opc, *SrcBank, true);
2602       } else if (DstSize < 32) {
2603         auto Sel = B.buildSelect(SelType, SrcReg, True, False);
2604         MRI.setRegBank(Sel.getReg(0), *DstBank);
2605         B.buildTrunc(DstReg, Sel);
2606       } else {
2607         B.buildSelect(DstReg, SrcReg, True, False);
2608       }
2609 
2610       MI.eraseFromParent();
2611       return;
2612     }
2613 
2614     break;
2615   }
2616   case AMDGPU::G_BUILD_VECTOR:
2617   case AMDGPU::G_BUILD_VECTOR_TRUNC: {
2618     Register DstReg = MI.getOperand(0).getReg();
2619     LLT DstTy = MRI.getType(DstReg);
2620     if (DstTy != LLT::fixed_vector(2, 16))
2621       break;
2622 
2623     assert(MI.getNumOperands() == 3 && OpdMapper.getVRegs(0).empty());
2624     substituteSimpleCopyRegs(OpdMapper, 1);
2625     substituteSimpleCopyRegs(OpdMapper, 2);
2626 
2627     const RegisterBank *DstBank =
2628       OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2629     if (DstBank == &AMDGPU::SGPRRegBank)
2630       break; // Can use S_PACK_* instructions.
2631 
2632     MachineIRBuilder B(MI);
2633 
2634     Register Lo = MI.getOperand(1).getReg();
2635     Register Hi = MI.getOperand(2).getReg();
2636     const LLT S32 = LLT::scalar(32);
2637 
2638     const RegisterBank *BankLo =
2639       OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
2640     const RegisterBank *BankHi =
2641       OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
2642 
2643     Register ZextLo;
2644     Register ShiftHi;
2645 
2646     if (Opc == AMDGPU::G_BUILD_VECTOR) {
2647       ZextLo = B.buildZExt(S32, Lo).getReg(0);
2648       MRI.setRegBank(ZextLo, *BankLo);
2649 
2650       Register ZextHi = B.buildZExt(S32, Hi).getReg(0);
2651       MRI.setRegBank(ZextHi, *BankHi);
2652 
2653       auto ShiftAmt = B.buildConstant(S32, 16);
2654       MRI.setRegBank(ShiftAmt.getReg(0), *BankHi);
2655 
2656       ShiftHi = B.buildShl(S32, ZextHi, ShiftAmt).getReg(0);
2657       MRI.setRegBank(ShiftHi, *BankHi);
2658     } else {
2659       Register MaskLo = B.buildConstant(S32, 0xffff).getReg(0);
2660       MRI.setRegBank(MaskLo, *BankLo);
2661 
2662       auto ShiftAmt = B.buildConstant(S32, 16);
2663       MRI.setRegBank(ShiftAmt.getReg(0), *BankHi);
2664 
2665       ShiftHi = B.buildShl(S32, Hi, ShiftAmt).getReg(0);
2666       MRI.setRegBank(ShiftHi, *BankHi);
2667 
2668       ZextLo = B.buildAnd(S32, Lo, MaskLo).getReg(0);
2669       MRI.setRegBank(ZextLo, *BankLo);
2670     }
2671 
2672     auto Or = B.buildOr(S32, ZextLo, ShiftHi);
2673     MRI.setRegBank(Or.getReg(0), *DstBank);
2674 
2675     B.buildBitcast(DstReg, Or);
2676     MI.eraseFromParent();
2677     return;
2678   }
2679   case AMDGPU::G_EXTRACT_VECTOR_ELT: {
2680     SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0));
2681 
2682     assert(OpdMapper.getVRegs(1).empty() && OpdMapper.getVRegs(2).empty());
2683 
2684     Register DstReg = MI.getOperand(0).getReg();
2685     Register SrcReg = MI.getOperand(1).getReg();
2686 
2687     const LLT S32 = LLT::scalar(32);
2688     LLT DstTy = MRI.getType(DstReg);
2689     LLT SrcTy = MRI.getType(SrcReg);
2690 
2691     if (foldExtractEltToCmpSelect(MI, MRI, OpdMapper))
2692       return;
2693 
2694     MachineIRBuilder B(MI);
2695 
2696     const ValueMapping &DstMapping
2697       = OpdMapper.getInstrMapping().getOperandMapping(0);
2698     const RegisterBank *DstBank = DstMapping.BreakDown[0].RegBank;
2699     const RegisterBank *SrcBank =
2700       OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
2701     const RegisterBank *IdxBank =
2702         OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
2703 
2704     Register BaseIdxReg;
2705     unsigned ConstOffset;
2706     std::tie(BaseIdxReg, ConstOffset) =
2707         AMDGPU::getBaseWithConstantOffset(MRI, MI.getOperand(2).getReg());
2708 
2709     // See if the index is an add of a constant which will be foldable by moving
2710     // the base register of the index later if this is going to be executed in a
2711     // waterfall loop. This is essentially to reassociate the add of a constant
2712     // with the readfirstlane.
2713     bool ShouldMoveIndexIntoLoop = IdxBank != &AMDGPU::SGPRRegBank &&
2714                                    ConstOffset > 0 &&
2715                                    ConstOffset < SrcTy.getNumElements();
2716 
2717     // Move the base register. We'll re-insert the add later.
2718     if (ShouldMoveIndexIntoLoop)
2719       MI.getOperand(2).setReg(BaseIdxReg);
2720 
2721     // If this is a VGPR result only because the index was a VGPR result, the
2722     // actual indexing will be done on the SGPR source vector, which will
2723     // produce a scalar result. We need to copy to the VGPR result inside the
2724     // waterfall loop.
2725     const bool NeedCopyToVGPR = DstBank == &AMDGPU::VGPRRegBank &&
2726                                 SrcBank == &AMDGPU::SGPRRegBank;
2727     if (DstRegs.empty()) {
2728       applyDefaultMapping(OpdMapper);
2729 
2730       executeInWaterfallLoop(MI, MRI, { 2 });
2731 
2732       if (NeedCopyToVGPR) {
2733         // We don't want a phi for this temporary reg.
2734         Register TmpReg = MRI.createGenericVirtualRegister(DstTy);
2735         MRI.setRegBank(TmpReg, AMDGPU::SGPRRegBank);
2736         MI.getOperand(0).setReg(TmpReg);
2737         B.setInsertPt(*MI.getParent(), ++MI.getIterator());
2738 
2739         // Use a v_mov_b32 here to make the exec dependency explicit.
2740         buildVCopy(B, DstReg, TmpReg);
2741       }
2742 
2743       // Re-insert the constant offset add inside the waterfall loop.
2744       if (ShouldMoveIndexIntoLoop)
2745         reinsertVectorIndexAdd(B, MI, 2, ConstOffset);
2746 
2747       return;
2748     }
2749 
2750     assert(DstTy.getSizeInBits() == 64);
2751 
2752     LLT Vec32 = LLT::fixed_vector(2 * SrcTy.getNumElements(), 32);
2753 
2754     auto CastSrc = B.buildBitcast(Vec32, SrcReg);
2755     auto One = B.buildConstant(S32, 1);
2756 
2757     MachineBasicBlock::iterator MII = MI.getIterator();
2758 
2759     // Split the vector index into 32-bit pieces. Prepare to move all of the
2760     // new instructions into a waterfall loop if necessary.
2761     //
2762     // Don't put the bitcast or constant in the loop.
2763     MachineInstrSpan Span(MII, &B.getMBB());
2764 
2765     // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1).
2766     auto IdxLo = B.buildShl(S32, BaseIdxReg, One);
2767     auto IdxHi = B.buildAdd(S32, IdxLo, One);
2768 
2769     auto Extract0 = B.buildExtractVectorElement(DstRegs[0], CastSrc, IdxLo);
2770     auto Extract1 = B.buildExtractVectorElement(DstRegs[1], CastSrc, IdxHi);
2771 
2772     MRI.setRegBank(DstReg, *DstBank);
2773     MRI.setRegBank(CastSrc.getReg(0), *SrcBank);
2774     MRI.setRegBank(One.getReg(0), AMDGPU::SGPRRegBank);
2775     MRI.setRegBank(IdxLo.getReg(0), AMDGPU::SGPRRegBank);
2776     MRI.setRegBank(IdxHi.getReg(0), AMDGPU::SGPRRegBank);
2777 
2778     SmallSet<Register, 4> OpsToWaterfall;
2779     if (!collectWaterfallOperands(OpsToWaterfall, MI, MRI, { 2 })) {
2780       MI.eraseFromParent();
2781       return;
2782     }
2783 
2784     // Remove the original instruction to avoid potentially confusing the
2785     // waterfall loop logic.
2786     B.setInstr(*Span.begin());
2787     MI.eraseFromParent();
2788     executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()),
2789                            OpsToWaterfall, MRI);
2790 
2791     if (NeedCopyToVGPR) {
2792       MachineBasicBlock *LoopBB = Extract1->getParent();
2793       Register TmpReg0 = MRI.createGenericVirtualRegister(S32);
2794       Register TmpReg1 = MRI.createGenericVirtualRegister(S32);
2795       MRI.setRegBank(TmpReg0, AMDGPU::SGPRRegBank);
2796       MRI.setRegBank(TmpReg1, AMDGPU::SGPRRegBank);
2797 
2798       Extract0->getOperand(0).setReg(TmpReg0);
2799       Extract1->getOperand(0).setReg(TmpReg1);
2800 
2801       B.setInsertPt(*LoopBB, ++Extract1->getIterator());
2802 
2803       buildVCopy(B, DstRegs[0], TmpReg0);
2804       buildVCopy(B, DstRegs[1], TmpReg1);
2805     }
2806 
2807     if (ShouldMoveIndexIntoLoop)
2808       reinsertVectorIndexAdd(B, *IdxLo, 1, ConstOffset);
2809 
2810     return;
2811   }
2812   case AMDGPU::G_INSERT_VECTOR_ELT: {
2813     SmallVector<Register, 2> InsRegs(OpdMapper.getVRegs(2));
2814 
2815     Register DstReg = MI.getOperand(0).getReg();
2816     LLT VecTy = MRI.getType(DstReg);
2817 
2818     assert(OpdMapper.getVRegs(0).empty());
2819     assert(OpdMapper.getVRegs(3).empty());
2820 
2821     if (substituteSimpleCopyRegs(OpdMapper, 1))
2822       MRI.setType(MI.getOperand(1).getReg(), VecTy);
2823 
2824     if (foldInsertEltToCmpSelect(MI, MRI, OpdMapper))
2825       return;
2826 
2827     const RegisterBank *IdxBank =
2828       OpdMapper.getInstrMapping().getOperandMapping(3).BreakDown[0].RegBank;
2829 
2830     Register SrcReg = MI.getOperand(1).getReg();
2831     Register InsReg = MI.getOperand(2).getReg();
2832     LLT InsTy = MRI.getType(InsReg);
2833     (void)InsTy;
2834 
2835     Register BaseIdxReg;
2836     unsigned ConstOffset;
2837     std::tie(BaseIdxReg, ConstOffset) =
2838         AMDGPU::getBaseWithConstantOffset(MRI, MI.getOperand(3).getReg());
2839 
2840     // See if the index is an add of a constant which will be foldable by moving
2841     // the base register of the index later if this is going to be executed in a
2842     // waterfall loop. This is essentially to reassociate the add of a constant
2843     // with the readfirstlane.
2844     bool ShouldMoveIndexIntoLoop = IdxBank != &AMDGPU::SGPRRegBank &&
2845       ConstOffset > 0 &&
2846       ConstOffset < VecTy.getNumElements();
2847 
2848     // Move the base register. We'll re-insert the add later.
2849     if (ShouldMoveIndexIntoLoop)
2850       MI.getOperand(3).setReg(BaseIdxReg);
2851 
2852 
2853     if (InsRegs.empty()) {
2854       executeInWaterfallLoop(MI, MRI, { 3 });
2855 
2856       // Re-insert the constant offset add inside the waterfall loop.
2857       if (ShouldMoveIndexIntoLoop) {
2858         MachineIRBuilder B(MI);
2859         reinsertVectorIndexAdd(B, MI, 3, ConstOffset);
2860       }
2861 
2862       return;
2863     }
2864 
2865 
2866     assert(InsTy.getSizeInBits() == 64);
2867 
2868     const LLT S32 = LLT::scalar(32);
2869     LLT Vec32 = LLT::fixed_vector(2 * VecTy.getNumElements(), 32);
2870 
2871     MachineIRBuilder B(MI);
2872     auto CastSrc = B.buildBitcast(Vec32, SrcReg);
2873     auto One = B.buildConstant(S32, 1);
2874 
2875     // Split the vector index into 32-bit pieces. Prepare to move all of the
2876     // new instructions into a waterfall loop if necessary.
2877     //
2878     // Don't put the bitcast or constant in the loop.
2879     MachineInstrSpan Span(MachineBasicBlock::iterator(&MI), &B.getMBB());
2880 
2881     // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1).
2882     auto IdxLo = B.buildShl(S32, BaseIdxReg, One);
2883     auto IdxHi = B.buildAdd(S32, IdxLo, One);
2884 
2885     auto InsLo = B.buildInsertVectorElement(Vec32, CastSrc, InsRegs[0], IdxLo);
2886     auto InsHi = B.buildInsertVectorElement(Vec32, InsLo, InsRegs[1], IdxHi);
2887 
2888     const RegisterBank *DstBank =
2889       OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2890     const RegisterBank *SrcBank =
2891       OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
2892     const RegisterBank *InsSrcBank =
2893       OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
2894 
2895     MRI.setRegBank(InsReg, *InsSrcBank);
2896     MRI.setRegBank(CastSrc.getReg(0), *SrcBank);
2897     MRI.setRegBank(InsLo.getReg(0), *DstBank);
2898     MRI.setRegBank(InsHi.getReg(0), *DstBank);
2899     MRI.setRegBank(One.getReg(0), AMDGPU::SGPRRegBank);
2900     MRI.setRegBank(IdxLo.getReg(0), AMDGPU::SGPRRegBank);
2901     MRI.setRegBank(IdxHi.getReg(0), AMDGPU::SGPRRegBank);
2902 
2903 
2904     SmallSet<Register, 4> OpsToWaterfall;
2905     if (!collectWaterfallOperands(OpsToWaterfall, MI, MRI, { 3 })) {
2906       B.setInsertPt(B.getMBB(), MI);
2907       B.buildBitcast(DstReg, InsHi);
2908       MI.eraseFromParent();
2909       return;
2910     }
2911 
2912     B.setInstr(*Span.begin());
2913     MI.eraseFromParent();
2914 
2915     // Figure out the point after the waterfall loop before mangling the control
2916     // flow.
2917     executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()),
2918                            OpsToWaterfall, MRI);
2919 
2920     // The insertion point is now right after the original instruction.
2921     //
2922     // Keep the bitcast to the original vector type out of the loop. Doing this
2923     // saved an extra phi we don't need inside the loop.
2924     B.buildBitcast(DstReg, InsHi);
2925 
2926     // Re-insert the constant offset add inside the waterfall loop.
2927     if (ShouldMoveIndexIntoLoop)
2928       reinsertVectorIndexAdd(B, *IdxLo, 1, ConstOffset);
2929 
2930     return;
2931   }
2932   case AMDGPU::G_AMDGPU_BUFFER_LOAD:
2933   case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
2934   case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT:
2935   case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
2936   case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE:
2937   case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT:
2938   case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16:
2939   case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT:
2940   case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16:
2941   case AMDGPU::G_AMDGPU_BUFFER_STORE:
2942   case AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE:
2943   case AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT:
2944   case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT:
2945   case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16:
2946   case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT:
2947   case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16: {
2948     applyDefaultMapping(OpdMapper);
2949     executeInWaterfallLoop(MI, MRI, {1, 4});
2950     return;
2951   }
2952   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP:
2953   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD:
2954   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB:
2955   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN:
2956   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN:
2957   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX:
2958   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX:
2959   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND:
2960   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR:
2961   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR:
2962   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC:
2963   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC: {
2964     applyDefaultMapping(OpdMapper);
2965     executeInWaterfallLoop(MI, MRI, {2, 5});
2966     return;
2967   }
2968   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD:
2969   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN:
2970   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX: {
2971     applyDefaultMapping(OpdMapper);
2972     executeInWaterfallLoop(MI, MRI, {2, 5});
2973     return;
2974   }
2975   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP: {
2976     applyDefaultMapping(OpdMapper);
2977     executeInWaterfallLoop(MI, MRI, {3, 6});
2978     return;
2979   }
2980   case AMDGPU::G_AMDGPU_S_BUFFER_LOAD: {
2981     applyMappingSBufferLoad(OpdMapper);
2982     return;
2983   }
2984   case AMDGPU::G_INTRINSIC: {
2985     switch (MI.getIntrinsicID()) {
2986     case Intrinsic::amdgcn_readlane: {
2987       substituteSimpleCopyRegs(OpdMapper, 2);
2988 
2989       assert(OpdMapper.getVRegs(0).empty());
2990       assert(OpdMapper.getVRegs(3).empty());
2991 
2992       // Make sure the index is an SGPR. It doesn't make sense to run this in a
2993       // waterfall loop, so assume it's a uniform value.
2994       constrainOpWithReadfirstlane(MI, MRI, 3); // Index
2995       return;
2996     }
2997     case Intrinsic::amdgcn_writelane: {
2998       assert(OpdMapper.getVRegs(0).empty());
2999       assert(OpdMapper.getVRegs(2).empty());
3000       assert(OpdMapper.getVRegs(3).empty());
3001 
3002       substituteSimpleCopyRegs(OpdMapper, 4); // VGPR input val
3003       constrainOpWithReadfirstlane(MI, MRI, 2); // Source value
3004       constrainOpWithReadfirstlane(MI, MRI, 3); // Index
3005       return;
3006     }
3007     case Intrinsic::amdgcn_interp_p1:
3008     case Intrinsic::amdgcn_interp_p2:
3009     case Intrinsic::amdgcn_interp_mov:
3010     case Intrinsic::amdgcn_interp_p1_f16:
3011     case Intrinsic::amdgcn_interp_p2_f16:
3012     case Intrinsic::amdgcn_lds_param_load: {
3013       applyDefaultMapping(OpdMapper);
3014 
3015       // Readlane for m0 value, which is always the last operand.
3016       // FIXME: Should this be a waterfall loop instead?
3017       constrainOpWithReadfirstlane(MI, MRI, MI.getNumOperands() - 1); // Index
3018       return;
3019     }
3020     case Intrinsic::amdgcn_interp_inreg_p10:
3021     case Intrinsic::amdgcn_interp_inreg_p2:
3022     case Intrinsic::amdgcn_interp_inreg_p10_f16:
3023     case Intrinsic::amdgcn_interp_inreg_p2_f16:
3024       applyDefaultMapping(OpdMapper);
3025       return;
3026     case Intrinsic::amdgcn_permlane16:
3027     case Intrinsic::amdgcn_permlanex16: {
3028       // Doing a waterfall loop over these wouldn't make any sense.
3029       substituteSimpleCopyRegs(OpdMapper, 2);
3030       substituteSimpleCopyRegs(OpdMapper, 3);
3031       constrainOpWithReadfirstlane(MI, MRI, 4);
3032       constrainOpWithReadfirstlane(MI, MRI, 5);
3033       return;
3034     }
3035     case Intrinsic::amdgcn_sbfe:
3036       applyMappingBFE(OpdMapper, true);
3037       return;
3038     case Intrinsic::amdgcn_ubfe:
3039       applyMappingBFE(OpdMapper, false);
3040       return;
3041     case Intrinsic::amdgcn_ballot:
3042       // Use default handling and insert copy to vcc source.
3043       break;
3044     }
3045     break;
3046   }
3047   case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
3048   case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16:
3049   case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE:
3050   case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16: {
3051     const AMDGPU::RsrcIntrinsic *RSrcIntrin
3052       = AMDGPU::lookupRsrcIntrinsic(MI.getIntrinsicID());
3053     assert(RSrcIntrin && RSrcIntrin->IsImage);
3054     // Non-images can have complications from operands that allow both SGPR
3055     // and VGPR. For now it's too complicated to figure out the final opcode
3056     // to derive the register bank from the MCInstrDesc.
3057     applyMappingImage(MI, OpdMapper, MRI, RSrcIntrin->RsrcArg);
3058     return;
3059   }
3060   case AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY: {
3061     unsigned N = MI.getNumExplicitOperands() - 2;
3062     applyDefaultMapping(OpdMapper);
3063     executeInWaterfallLoop(MI, MRI, { N });
3064     return;
3065   }
3066   case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: {
3067     auto IntrID = MI.getIntrinsicID();
3068     switch (IntrID) {
3069     case Intrinsic::amdgcn_ds_ordered_add:
3070     case Intrinsic::amdgcn_ds_ordered_swap: {
3071       // This is only allowed to execute with 1 lane, so readfirstlane is safe.
3072       assert(OpdMapper.getVRegs(0).empty());
3073       substituteSimpleCopyRegs(OpdMapper, 3);
3074       constrainOpWithReadfirstlane(MI, MRI, 2); // M0
3075       return;
3076     }
3077     case Intrinsic::amdgcn_ds_gws_init:
3078     case Intrinsic::amdgcn_ds_gws_barrier:
3079     case Intrinsic::amdgcn_ds_gws_sema_br: {
3080       // Only the first lane is executes, so readfirstlane is safe.
3081       substituteSimpleCopyRegs(OpdMapper, 1);
3082       constrainOpWithReadfirstlane(MI, MRI, 2); // M0
3083       return;
3084     }
3085     case Intrinsic::amdgcn_ds_gws_sema_v:
3086     case Intrinsic::amdgcn_ds_gws_sema_p:
3087     case Intrinsic::amdgcn_ds_gws_sema_release_all: {
3088       // Only the first lane is executes, so readfirstlane is safe.
3089       constrainOpWithReadfirstlane(MI, MRI, 1); // M0
3090       return;
3091     }
3092     case Intrinsic::amdgcn_ds_append:
3093     case Intrinsic::amdgcn_ds_consume: {
3094       constrainOpWithReadfirstlane(MI, MRI, 2); // M0
3095       return;
3096     }
3097     case Intrinsic::amdgcn_s_sendmsg:
3098     case Intrinsic::amdgcn_s_sendmsghalt: {
3099       // FIXME: Should this use a waterfall loop?
3100       constrainOpWithReadfirstlane(MI, MRI, 2); // M0
3101       return;
3102     }
3103     case Intrinsic::amdgcn_s_setreg: {
3104       constrainOpWithReadfirstlane(MI, MRI, 2);
3105       return;
3106     }
3107     case Intrinsic::amdgcn_raw_buffer_load_lds: {
3108       applyDefaultMapping(OpdMapper);
3109       constrainOpWithReadfirstlane(MI, MRI, 1); // rsrc
3110       constrainOpWithReadfirstlane(MI, MRI, 2); // M0
3111       constrainOpWithReadfirstlane(MI, MRI, 5); // soffset
3112       return;
3113     }
3114     case Intrinsic::amdgcn_struct_buffer_load_lds: {
3115       applyDefaultMapping(OpdMapper);
3116       constrainOpWithReadfirstlane(MI, MRI, 1); // rsrc
3117       constrainOpWithReadfirstlane(MI, MRI, 2); // M0
3118       constrainOpWithReadfirstlane(MI, MRI, 6); // soffset
3119       return;
3120     }
3121     case Intrinsic::amdgcn_global_load_lds: {
3122       applyDefaultMapping(OpdMapper);
3123       constrainOpWithReadfirstlane(MI, MRI, 2);
3124       return;
3125     }
3126     case Intrinsic::amdgcn_lds_direct_load: {
3127       applyDefaultMapping(OpdMapper);
3128       // Readlane for m0 value, which is always the last operand.
3129       constrainOpWithReadfirstlane(MI, MRI, MI.getNumOperands() - 1); // Index
3130       return;
3131     }
3132     case Intrinsic::amdgcn_exp_row:
3133       applyDefaultMapping(OpdMapper);
3134       constrainOpWithReadfirstlane(MI, MRI, 8); // M0
3135       return;
3136     default: {
3137       if (const AMDGPU::RsrcIntrinsic *RSrcIntrin =
3138               AMDGPU::lookupRsrcIntrinsic(IntrID)) {
3139         // Non-images can have complications from operands that allow both SGPR
3140         // and VGPR. For now it's too complicated to figure out the final opcode
3141         // to derive the register bank from the MCInstrDesc.
3142         if (RSrcIntrin->IsImage) {
3143           applyMappingImage(MI, OpdMapper, MRI, RSrcIntrin->RsrcArg);
3144           return;
3145         }
3146       }
3147 
3148       break;
3149     }
3150     }
3151     break;
3152   }
3153   case AMDGPU::G_SI_CALL: {
3154     // Use a set to avoid extra readfirstlanes in the case where multiple
3155     // operands are the same register.
3156     SmallSet<Register, 4> SGPROperandRegs;
3157 
3158     if (!collectWaterfallOperands(SGPROperandRegs, MI, MRI, {1}))
3159       break;
3160 
3161     // Move all copies to physical SGPRs that are used by the call instruction
3162     // into the loop block. Start searching for these copies until the
3163     // ADJCALLSTACKUP.
3164     unsigned FrameSetupOpcode = AMDGPU::ADJCALLSTACKUP;
3165     unsigned FrameDestroyOpcode = AMDGPU::ADJCALLSTACKDOWN;
3166 
3167     // Move all non-copies before the copies, so that a complete range can be
3168     // moved into the waterfall loop.
3169     SmallVector<MachineInstr *, 4> NonCopyInstrs;
3170     // Count of NonCopyInstrs found until the current LastCopy.
3171     unsigned NonCopyInstrsLen = 0;
3172     MachineBasicBlock::iterator Start(&MI);
3173     MachineBasicBlock::iterator LastCopy = Start;
3174     MachineBasicBlock *MBB = MI.getParent();
3175     const SIMachineFunctionInfo *Info =
3176         MBB->getParent()->getInfo<SIMachineFunctionInfo>();
3177     while (Start->getOpcode() != FrameSetupOpcode) {
3178       --Start;
3179       bool IsCopy = false;
3180       if (Start->getOpcode() == AMDGPU::COPY) {
3181         auto &Dst = Start->getOperand(0);
3182         if (Dst.isReg()) {
3183           Register Reg = Dst.getReg();
3184           if (Reg.isPhysical() && MI.readsRegister(Reg, TRI)) {
3185             IsCopy = true;
3186           } else {
3187             // Also move the copy from the scratch rsrc descriptor into the loop
3188             // to allow it to be optimized away.
3189             auto &Src = Start->getOperand(1);
3190             if (Src.isReg()) {
3191               Reg = Src.getReg();
3192               IsCopy = Info->getScratchRSrcReg() == Reg;
3193             }
3194           }
3195         }
3196       }
3197 
3198       if (IsCopy) {
3199         LastCopy = Start;
3200         NonCopyInstrsLen = NonCopyInstrs.size();
3201       } else {
3202         NonCopyInstrs.push_back(&*Start);
3203       }
3204     }
3205     NonCopyInstrs.resize(NonCopyInstrsLen);
3206 
3207     for (auto *NonCopy : reverse(NonCopyInstrs)) {
3208       MBB->splice(LastCopy, MBB, NonCopy->getIterator());
3209     }
3210     Start = LastCopy;
3211 
3212     // Do the same for copies after the loop
3213     NonCopyInstrs.clear();
3214     NonCopyInstrsLen = 0;
3215     MachineBasicBlock::iterator End(&MI);
3216     LastCopy = End;
3217     while (End->getOpcode() != FrameDestroyOpcode) {
3218       ++End;
3219       bool IsCopy = false;
3220       if (End->getOpcode() == AMDGPU::COPY) {
3221         auto &Src = End->getOperand(1);
3222         if (Src.isReg()) {
3223           Register Reg = Src.getReg();
3224           IsCopy = Reg.isPhysical() && MI.modifiesRegister(Reg, TRI);
3225         }
3226       }
3227 
3228       if (IsCopy) {
3229         LastCopy = End;
3230         NonCopyInstrsLen = NonCopyInstrs.size();
3231       } else {
3232         NonCopyInstrs.push_back(&*End);
3233       }
3234     }
3235     NonCopyInstrs.resize(NonCopyInstrsLen);
3236 
3237     End = LastCopy;
3238     ++LastCopy;
3239     for (auto *NonCopy : reverse(NonCopyInstrs)) {
3240       MBB->splice(LastCopy, MBB, NonCopy->getIterator());
3241     }
3242 
3243     ++End;
3244     MachineIRBuilder B(*Start);
3245     executeInWaterfallLoop(B, make_range(Start, End), SGPROperandRegs, MRI);
3246     break;
3247   }
3248   case AMDGPU::G_LOAD:
3249   case AMDGPU::G_ZEXTLOAD:
3250   case AMDGPU::G_SEXTLOAD: {
3251     if (applyMappingLoad(MI, OpdMapper, MRI))
3252       return;
3253     break;
3254   }
3255   case AMDGPU::G_DYN_STACKALLOC:
3256     applyMappingDynStackAlloc(MI, OpdMapper, MRI);
3257     return;
3258   case AMDGPU::G_SBFX:
3259     applyMappingBFE(OpdMapper, /*Signed*/ true);
3260     return;
3261   case AMDGPU::G_UBFX:
3262     applyMappingBFE(OpdMapper, /*Signed*/ false);
3263     return;
3264   case AMDGPU::G_AMDGPU_MAD_U64_U32:
3265   case AMDGPU::G_AMDGPU_MAD_I64_I32:
3266     applyMappingMAD_64_32(OpdMapper);
3267     return;
3268   default:
3269     break;
3270   }
3271 
3272   return applyDefaultMapping(OpdMapper);
3273 }
3274 
3275 // vgpr, sgpr -> vgpr
3276 // vgpr, agpr -> vgpr
3277 // agpr, agpr -> agpr
3278 // agpr, sgpr -> vgpr
3279 static unsigned regBankUnion(unsigned RB0, unsigned RB1) {
3280   if (RB0 == AMDGPU::InvalidRegBankID)
3281     return RB1;
3282   if (RB1 == AMDGPU::InvalidRegBankID)
3283     return RB0;
3284 
3285   if (RB0 == AMDGPU::SGPRRegBankID && RB1 == AMDGPU::SGPRRegBankID)
3286     return AMDGPU::SGPRRegBankID;
3287 
3288   if (RB0 == AMDGPU::AGPRRegBankID && RB1 == AMDGPU::AGPRRegBankID)
3289     return AMDGPU::AGPRRegBankID;
3290 
3291   return AMDGPU::VGPRRegBankID;
3292 }
3293 
3294 static unsigned regBankBoolUnion(unsigned RB0, unsigned RB1) {
3295   if (RB0 == AMDGPU::InvalidRegBankID)
3296     return RB1;
3297   if (RB1 == AMDGPU::InvalidRegBankID)
3298     return RB0;
3299 
3300   // vcc, vcc -> vcc
3301   // vcc, sgpr -> vcc
3302   // vcc, vgpr -> vcc
3303   if (RB0 == AMDGPU::VCCRegBankID || RB1 == AMDGPU::VCCRegBankID)
3304     return AMDGPU::VCCRegBankID;
3305 
3306   // vcc, vgpr -> vgpr
3307   return regBankUnion(RB0, RB1);
3308 }
3309 
3310 unsigned AMDGPURegisterBankInfo::getMappingType(const MachineRegisterInfo &MRI,
3311                                                 const MachineInstr &MI) const {
3312   unsigned RegBank = AMDGPU::InvalidRegBankID;
3313 
3314   for (const MachineOperand &MO : MI.operands()) {
3315     if (!MO.isReg())
3316       continue;
3317     Register Reg = MO.getReg();
3318     if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) {
3319       RegBank = regBankUnion(RegBank, Bank->getID());
3320       if (RegBank == AMDGPU::VGPRRegBankID)
3321         break;
3322     }
3323   }
3324 
3325   return RegBank;
3326 }
3327 
3328 bool AMDGPURegisterBankInfo::isSALUMapping(const MachineInstr &MI) const {
3329   const MachineFunction &MF = *MI.getParent()->getParent();
3330   const MachineRegisterInfo &MRI = MF.getRegInfo();
3331   for (const MachineOperand &MO : MI.operands()) {
3332     if (!MO.isReg())
3333       continue;
3334     Register Reg = MO.getReg();
3335     if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) {
3336       if (Bank->getID() != AMDGPU::SGPRRegBankID)
3337         return false;
3338     }
3339   }
3340   return true;
3341 }
3342 
3343 const RegisterBankInfo::InstructionMapping &
3344 AMDGPURegisterBankInfo::getDefaultMappingSOP(const MachineInstr &MI) const {
3345   const MachineFunction &MF = *MI.getParent()->getParent();
3346   const MachineRegisterInfo &MRI = MF.getRegInfo();
3347   SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
3348 
3349   for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
3350     const MachineOperand &SrcOp = MI.getOperand(i);
3351     if (!SrcOp.isReg())
3352       continue;
3353 
3354     unsigned Size = getSizeInBits(SrcOp.getReg(), MRI, *TRI);
3355     OpdsMapping[i] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
3356   }
3357   return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
3358                                MI.getNumOperands());
3359 }
3360 
3361 const RegisterBankInfo::InstructionMapping &
3362 AMDGPURegisterBankInfo::getDefaultMappingVOP(const MachineInstr &MI) const {
3363   const MachineFunction &MF = *MI.getParent()->getParent();
3364   const MachineRegisterInfo &MRI = MF.getRegInfo();
3365   SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
3366 
3367   // Even though we technically could use SGPRs, this would require knowledge of
3368   // the constant bus restriction. Force all sources to VGPR (except for VCC).
3369   //
3370   // TODO: Unary ops are trivially OK, so accept SGPRs?
3371   for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
3372     const MachineOperand &Src = MI.getOperand(i);
3373     if (!Src.isReg())
3374       continue;
3375 
3376     unsigned Size = getSizeInBits(Src.getReg(), MRI, *TRI);
3377     unsigned BankID = Size == 1 ? AMDGPU::VCCRegBankID : AMDGPU::VGPRRegBankID;
3378     OpdsMapping[i] = AMDGPU::getValueMapping(BankID, Size);
3379   }
3380 
3381   return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
3382                                MI.getNumOperands());
3383 }
3384 
3385 const RegisterBankInfo::InstructionMapping &
3386 AMDGPURegisterBankInfo::getDefaultMappingAllVGPR(const MachineInstr &MI) const {
3387   const MachineFunction &MF = *MI.getParent()->getParent();
3388   const MachineRegisterInfo &MRI = MF.getRegInfo();
3389   SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
3390 
3391   for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
3392     const MachineOperand &Op = MI.getOperand(I);
3393     if (!Op.isReg())
3394       continue;
3395 
3396     unsigned Size = getSizeInBits(Op.getReg(), MRI, *TRI);
3397     OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3398   }
3399 
3400   return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
3401                                MI.getNumOperands());
3402 }
3403 
3404 const RegisterBankInfo::InstructionMapping &
3405 AMDGPURegisterBankInfo::getImageMapping(const MachineRegisterInfo &MRI,
3406                                         const MachineInstr &MI,
3407                                         int RsrcIdx) const {
3408   // The reported argument index is relative to the IR intrinsic call arguments,
3409   // so we need to shift by the number of defs and the intrinsic ID.
3410   RsrcIdx += MI.getNumExplicitDefs() + 1;
3411 
3412   const int NumOps = MI.getNumOperands();
3413   SmallVector<const ValueMapping *, 8> OpdsMapping(NumOps);
3414 
3415   // TODO: Should packed/unpacked D16 difference be reported here as part of
3416   // the value mapping?
3417   for (int I = 0; I != NumOps; ++I) {
3418     if (!MI.getOperand(I).isReg())
3419       continue;
3420 
3421     Register OpReg = MI.getOperand(I).getReg();
3422     // We replace some dead address operands with $noreg
3423     if (!OpReg)
3424       continue;
3425 
3426     unsigned Size = getSizeInBits(OpReg, MRI, *TRI);
3427 
3428     // FIXME: Probably need a new intrinsic register bank searchable table to
3429     // handle arbitrary intrinsics easily.
3430     //
3431     // If this has a sampler, it immediately follows rsrc.
3432     const bool MustBeSGPR = I == RsrcIdx || I == RsrcIdx + 1;
3433 
3434     if (MustBeSGPR) {
3435       // If this must be an SGPR, so we must report whatever it is as legal.
3436       unsigned NewBank = getRegBankID(OpReg, MRI, AMDGPU::SGPRRegBankID);
3437       OpdsMapping[I] = AMDGPU::getValueMapping(NewBank, Size);
3438     } else {
3439       // Some operands must be VGPR, and these are easy to copy to.
3440       OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3441     }
3442   }
3443 
3444   return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping), NumOps);
3445 }
3446 
3447 /// Return the mapping for a pointer argument.
3448 const RegisterBankInfo::ValueMapping *
3449 AMDGPURegisterBankInfo::getValueMappingForPtr(const MachineRegisterInfo &MRI,
3450                                               Register PtrReg) const {
3451   LLT PtrTy = MRI.getType(PtrReg);
3452   unsigned Size = PtrTy.getSizeInBits();
3453   if (Subtarget.useFlatForGlobal() ||
3454       !AMDGPU::isFlatGlobalAddrSpace(PtrTy.getAddressSpace()))
3455     return AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3456 
3457   // If we're using MUBUF instructions for global memory, an SGPR base register
3458   // is possible. Otherwise this needs to be a VGPR.
3459   const RegisterBank *PtrBank = getRegBank(PtrReg, MRI, *TRI);
3460   return AMDGPU::getValueMapping(PtrBank->getID(), Size);
3461 }
3462 
3463 const RegisterBankInfo::InstructionMapping &
3464 AMDGPURegisterBankInfo::getInstrMappingForLoad(const MachineInstr &MI) const {
3465 
3466   const MachineFunction &MF = *MI.getParent()->getParent();
3467   const MachineRegisterInfo &MRI = MF.getRegInfo();
3468   SmallVector<const ValueMapping*, 2> OpdsMapping(2);
3469   unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
3470   Register PtrReg = MI.getOperand(1).getReg();
3471   LLT PtrTy = MRI.getType(PtrReg);
3472   unsigned AS = PtrTy.getAddressSpace();
3473   unsigned PtrSize = PtrTy.getSizeInBits();
3474 
3475   const ValueMapping *ValMapping;
3476   const ValueMapping *PtrMapping;
3477 
3478   const RegisterBank *PtrBank = getRegBank(PtrReg, MRI, *TRI);
3479 
3480   if (PtrBank == &AMDGPU::SGPRRegBank && AMDGPU::isFlatGlobalAddrSpace(AS)) {
3481     if (isScalarLoadLegal(MI)) {
3482       // We have a uniform instruction so we want to use an SMRD load
3483       ValMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
3484       PtrMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize);
3485     } else {
3486       ValMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3487 
3488       // If we're using MUBUF instructions for global memory, an SGPR base
3489       // register is possible. Otherwise this needs to be a VGPR.
3490       unsigned PtrBankID = Subtarget.useFlatForGlobal() ?
3491         AMDGPU::VGPRRegBankID : AMDGPU::SGPRRegBankID;
3492 
3493       PtrMapping = AMDGPU::getValueMapping(PtrBankID, PtrSize);
3494     }
3495   } else {
3496     ValMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3497     PtrMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize);
3498   }
3499 
3500   OpdsMapping[0] = ValMapping;
3501   OpdsMapping[1] = PtrMapping;
3502   const RegisterBankInfo::InstructionMapping &Mapping = getInstructionMapping(
3503       1, 1, getOperandsMapping(OpdsMapping), MI.getNumOperands());
3504   return Mapping;
3505 
3506   // FIXME: Do we want to add a mapping for FLAT load, or should we just
3507   // handle that during instruction selection?
3508 }
3509 
3510 unsigned
3511 AMDGPURegisterBankInfo::getRegBankID(Register Reg,
3512                                      const MachineRegisterInfo &MRI,
3513                                      unsigned Default) const {
3514   const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
3515   return Bank ? Bank->getID() : Default;
3516 }
3517 
3518 const RegisterBankInfo::ValueMapping *
3519 AMDGPURegisterBankInfo::getSGPROpMapping(Register Reg,
3520                                          const MachineRegisterInfo &MRI,
3521                                          const TargetRegisterInfo &TRI) const {
3522   // Lie and claim anything is legal, even though this needs to be an SGPR
3523   // applyMapping will have to deal with it as a waterfall loop.
3524   unsigned Bank = getRegBankID(Reg, MRI, AMDGPU::SGPRRegBankID);
3525   unsigned Size = getSizeInBits(Reg, MRI, TRI);
3526   return AMDGPU::getValueMapping(Bank, Size);
3527 }
3528 
3529 const RegisterBankInfo::ValueMapping *
3530 AMDGPURegisterBankInfo::getVGPROpMapping(Register Reg,
3531                                          const MachineRegisterInfo &MRI,
3532                                          const TargetRegisterInfo &TRI) const {
3533   unsigned Size = getSizeInBits(Reg, MRI, TRI);
3534   return AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3535 }
3536 
3537 const RegisterBankInfo::ValueMapping *
3538 AMDGPURegisterBankInfo::getAGPROpMapping(Register Reg,
3539                                          const MachineRegisterInfo &MRI,
3540                                          const TargetRegisterInfo &TRI) const {
3541   unsigned Size = getSizeInBits(Reg, MRI, TRI);
3542   return AMDGPU::getValueMapping(AMDGPU::AGPRRegBankID, Size);
3543 }
3544 
3545 ///
3546 /// This function must return a legal mapping, because
3547 /// AMDGPURegisterBankInfo::getInstrAlternativeMappings() is not called
3548 /// in RegBankSelect::Mode::Fast.  Any mapping that would cause a
3549 /// VGPR to SGPR generated is illegal.
3550 ///
3551 // Operands that must be SGPRs must accept potentially divergent VGPRs as
3552 // legal. These will be dealt with in applyMappingImpl.
3553 //
3554 const RegisterBankInfo::InstructionMapping &
3555 AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
3556   const MachineFunction &MF = *MI.getParent()->getParent();
3557   const MachineRegisterInfo &MRI = MF.getRegInfo();
3558 
3559   if (MI.isCopy() || MI.getOpcode() == AMDGPU::G_FREEZE) {
3560     // The default logic bothers to analyze impossible alternative mappings. We
3561     // want the most straightforward mapping, so just directly handle this.
3562     const RegisterBank *DstBank = getRegBank(MI.getOperand(0).getReg(), MRI,
3563                                              *TRI);
3564     const RegisterBank *SrcBank = getRegBank(MI.getOperand(1).getReg(), MRI,
3565                                              *TRI);
3566     assert(SrcBank && "src bank should have been assigned already");
3567     if (!DstBank)
3568       DstBank = SrcBank;
3569 
3570     unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
3571     if (cannotCopy(*DstBank, *SrcBank, Size))
3572       return getInvalidInstructionMapping();
3573 
3574     const ValueMapping &ValMap = getValueMapping(0, Size, *DstBank);
3575     unsigned OpdsMappingSize = MI.isCopy() ? 1 : 2;
3576     SmallVector<const ValueMapping *, 1> OpdsMapping(OpdsMappingSize);
3577     OpdsMapping[0] = &ValMap;
3578     if (MI.getOpcode() == AMDGPU::G_FREEZE)
3579       OpdsMapping[1] = &ValMap;
3580 
3581     return getInstructionMapping(
3582         1, /*Cost*/ 1,
3583         /*OperandsMapping*/ getOperandsMapping(OpdsMapping), OpdsMappingSize);
3584   }
3585 
3586   if (MI.isRegSequence()) {
3587     // If any input is a VGPR, the result must be a VGPR. The default handling
3588     // assumes any copy between banks is legal.
3589     unsigned BankID = AMDGPU::SGPRRegBankID;
3590 
3591     for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
3592       auto OpBank = getRegBankID(MI.getOperand(I).getReg(), MRI);
3593       // It doesn't make sense to use vcc or scc banks here, so just ignore
3594       // them.
3595       if (OpBank != AMDGPU::SGPRRegBankID) {
3596         BankID = AMDGPU::VGPRRegBankID;
3597         break;
3598       }
3599     }
3600     unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
3601 
3602     const ValueMapping &ValMap = getValueMapping(0, Size, getRegBank(BankID));
3603     return getInstructionMapping(
3604         1, /*Cost*/ 1,
3605         /*OperandsMapping*/ getOperandsMapping({&ValMap}), 1);
3606   }
3607 
3608   // The default handling is broken and doesn't handle illegal SGPR->VGPR copies
3609   // properly.
3610   //
3611   // TODO: There are additional exec masking dependencies to analyze.
3612   if (MI.getOpcode() == TargetOpcode::G_PHI) {
3613     unsigned ResultBank = AMDGPU::InvalidRegBankID;
3614     Register DstReg = MI.getOperand(0).getReg();
3615 
3616     // Sometimes the result may have already been assigned a bank.
3617     if (const RegisterBank *DstBank = getRegBank(DstReg, MRI, *TRI))
3618       ResultBank = DstBank->getID();
3619 
3620     for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
3621       Register Reg = MI.getOperand(I).getReg();
3622       const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
3623 
3624       // FIXME: Assuming VGPR for any undetermined inputs.
3625       if (!Bank || Bank->getID() == AMDGPU::VGPRRegBankID) {
3626         ResultBank = AMDGPU::VGPRRegBankID;
3627         break;
3628       }
3629 
3630       // FIXME: Need to promote SGPR case to s32
3631       unsigned OpBank = Bank->getID();
3632       ResultBank = regBankBoolUnion(ResultBank, OpBank);
3633     }
3634 
3635     assert(ResultBank != AMDGPU::InvalidRegBankID);
3636 
3637     unsigned Size = MRI.getType(DstReg).getSizeInBits();
3638 
3639     const ValueMapping &ValMap =
3640         getValueMapping(0, Size, getRegBank(ResultBank));
3641     return getInstructionMapping(
3642         1, /*Cost*/ 1,
3643         /*OperandsMapping*/ getOperandsMapping({&ValMap}), 1);
3644   }
3645 
3646   const RegisterBankInfo::InstructionMapping &Mapping = getInstrMappingImpl(MI);
3647   if (Mapping.isValid())
3648     return Mapping;
3649 
3650   SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
3651 
3652   switch (MI.getOpcode()) {
3653   default:
3654     return getInvalidInstructionMapping();
3655 
3656   case AMDGPU::G_AND:
3657   case AMDGPU::G_OR:
3658   case AMDGPU::G_XOR: {
3659     unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3660     if (Size == 1) {
3661       const RegisterBank *DstBank
3662         = getRegBank(MI.getOperand(0).getReg(), MRI, *TRI);
3663 
3664       unsigned TargetBankID = AMDGPU::InvalidRegBankID;
3665       unsigned BankLHS = AMDGPU::InvalidRegBankID;
3666       unsigned BankRHS = AMDGPU::InvalidRegBankID;
3667       if (DstBank) {
3668         TargetBankID = DstBank->getID();
3669         if (DstBank == &AMDGPU::VCCRegBank) {
3670           TargetBankID = AMDGPU::VCCRegBankID;
3671           BankLHS = AMDGPU::VCCRegBankID;
3672           BankRHS = AMDGPU::VCCRegBankID;
3673         } else {
3674           BankLHS = getRegBankID(MI.getOperand(1).getReg(), MRI,
3675                                  AMDGPU::SGPRRegBankID);
3676           BankRHS = getRegBankID(MI.getOperand(2).getReg(), MRI,
3677                                  AMDGPU::SGPRRegBankID);
3678         }
3679       } else {
3680         BankLHS = getRegBankID(MI.getOperand(1).getReg(), MRI,
3681                                AMDGPU::VCCRegBankID);
3682         BankRHS = getRegBankID(MI.getOperand(2).getReg(), MRI,
3683                                AMDGPU::VCCRegBankID);
3684 
3685         // Both inputs should be true booleans to produce a boolean result.
3686         if (BankLHS == AMDGPU::VGPRRegBankID || BankRHS == AMDGPU::VGPRRegBankID) {
3687           TargetBankID = AMDGPU::VGPRRegBankID;
3688         } else if (BankLHS == AMDGPU::VCCRegBankID || BankRHS == AMDGPU::VCCRegBankID) {
3689           TargetBankID = AMDGPU::VCCRegBankID;
3690           BankLHS = AMDGPU::VCCRegBankID;
3691           BankRHS = AMDGPU::VCCRegBankID;
3692         } else if (BankLHS == AMDGPU::SGPRRegBankID && BankRHS == AMDGPU::SGPRRegBankID) {
3693           TargetBankID = AMDGPU::SGPRRegBankID;
3694         }
3695       }
3696 
3697       OpdsMapping[0] = AMDGPU::getValueMapping(TargetBankID, Size);
3698       OpdsMapping[1] = AMDGPU::getValueMapping(BankLHS, Size);
3699       OpdsMapping[2] = AMDGPU::getValueMapping(BankRHS, Size);
3700       break;
3701     }
3702 
3703     if (Size == 64) {
3704 
3705       if (isSALUMapping(MI)) {
3706         OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::SGPRRegBankID, Size);
3707         OpdsMapping[1] = OpdsMapping[2] = OpdsMapping[0];
3708       } else {
3709         OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size);
3710         unsigned Bank1 = getRegBankID(MI.getOperand(1).getReg(), MRI /*, DefaultBankID*/);
3711         OpdsMapping[1] = AMDGPU::getValueMapping(Bank1, Size);
3712 
3713         unsigned Bank2 = getRegBankID(MI.getOperand(2).getReg(), MRI /*, DefaultBankID*/);
3714         OpdsMapping[2] = AMDGPU::getValueMapping(Bank2, Size);
3715       }
3716 
3717       break;
3718     }
3719 
3720     LLVM_FALLTHROUGH;
3721   }
3722   case AMDGPU::G_PTR_ADD:
3723   case AMDGPU::G_PTRMASK:
3724   case AMDGPU::G_ADD:
3725   case AMDGPU::G_SUB:
3726   case AMDGPU::G_MUL:
3727   case AMDGPU::G_SHL:
3728   case AMDGPU::G_LSHR:
3729   case AMDGPU::G_ASHR:
3730   case AMDGPU::G_UADDO:
3731   case AMDGPU::G_USUBO:
3732   case AMDGPU::G_UADDE:
3733   case AMDGPU::G_SADDE:
3734   case AMDGPU::G_USUBE:
3735   case AMDGPU::G_SSUBE:
3736   case AMDGPU::G_SMIN:
3737   case AMDGPU::G_SMAX:
3738   case AMDGPU::G_UMIN:
3739   case AMDGPU::G_UMAX:
3740   case AMDGPU::G_ABS:
3741   case AMDGPU::G_SHUFFLE_VECTOR:
3742   case AMDGPU::G_SBFX:
3743   case AMDGPU::G_UBFX:
3744     if (isSALUMapping(MI))
3745       return getDefaultMappingSOP(MI);
3746     LLVM_FALLTHROUGH;
3747 
3748   case AMDGPU::G_SADDSAT: // FIXME: Could lower sat ops for SALU
3749   case AMDGPU::G_SSUBSAT:
3750   case AMDGPU::G_UADDSAT:
3751   case AMDGPU::G_USUBSAT:
3752   case AMDGPU::G_FADD:
3753   case AMDGPU::G_FSUB:
3754   case AMDGPU::G_FPTOSI:
3755   case AMDGPU::G_FPTOUI:
3756   case AMDGPU::G_FMUL:
3757   case AMDGPU::G_FMA:
3758   case AMDGPU::G_FMAD:
3759   case AMDGPU::G_FSQRT:
3760   case AMDGPU::G_FFLOOR:
3761   case AMDGPU::G_FCEIL:
3762   case AMDGPU::G_FRINT:
3763   case AMDGPU::G_SITOFP:
3764   case AMDGPU::G_UITOFP:
3765   case AMDGPU::G_FPTRUNC:
3766   case AMDGPU::G_FPEXT:
3767   case AMDGPU::G_FEXP2:
3768   case AMDGPU::G_FLOG2:
3769   case AMDGPU::G_FMINNUM:
3770   case AMDGPU::G_FMAXNUM:
3771   case AMDGPU::G_FMINNUM_IEEE:
3772   case AMDGPU::G_FMAXNUM_IEEE:
3773   case AMDGPU::G_FCANONICALIZE:
3774   case AMDGPU::G_INTRINSIC_TRUNC:
3775   case AMDGPU::G_BSWAP: // TODO: Somehow expand for scalar?
3776   case AMDGPU::G_FSHR: // TODO: Expand for scalar
3777   case AMDGPU::G_AMDGPU_FMIN_LEGACY:
3778   case AMDGPU::G_AMDGPU_FMAX_LEGACY:
3779   case AMDGPU::G_AMDGPU_RCP_IFLAG:
3780   case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0:
3781   case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1:
3782   case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2:
3783   case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
3784   case AMDGPU::G_AMDGPU_CVT_PK_I16_I32:
3785   case AMDGPU::G_AMDGPU_SMED3:
3786     return getDefaultMappingVOP(MI);
3787   case AMDGPU::G_UMULH:
3788   case AMDGPU::G_SMULH: {
3789     if (Subtarget.hasScalarMulHiInsts() && isSALUMapping(MI))
3790       return getDefaultMappingSOP(MI);
3791     return getDefaultMappingVOP(MI);
3792   }
3793   case AMDGPU::G_AMDGPU_MAD_U64_U32:
3794   case AMDGPU::G_AMDGPU_MAD_I64_I32: {
3795     // Three possible mappings:
3796     //
3797     //  - Default SOP
3798     //  - Default VOP
3799     //  - Scalar multiply: src0 and src1 are SGPRs, the rest is VOP.
3800     //
3801     // This allows instruction selection to keep the multiplication part of the
3802     // instruction on the SALU.
3803     bool AllSalu = true;
3804     bool MulSalu = true;
3805     for (unsigned i = 0; i < 5; ++i) {
3806       Register Reg = MI.getOperand(i).getReg();
3807       if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) {
3808         if (Bank->getID() != AMDGPU::SGPRRegBankID) {
3809           AllSalu = false;
3810           if (i == 2 || i == 3) {
3811             MulSalu = false;
3812             break;
3813           }
3814         }
3815       }
3816     }
3817 
3818     if (AllSalu)
3819       return getDefaultMappingSOP(MI);
3820 
3821     // If the multiply-add is full-rate in VALU, use that even if the
3822     // multiplication part is scalar. Accumulating separately on the VALU would
3823     // take two instructions.
3824     if (!MulSalu || Subtarget.hasFullRate64Ops())
3825       return getDefaultMappingVOP(MI);
3826 
3827     // Keep the multiplication on the SALU, then accumulate on the VALU.
3828     OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 64);
3829     OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
3830     OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
3831     OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
3832     OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 64);
3833     break;
3834   }
3835   case AMDGPU::G_IMPLICIT_DEF: {
3836     unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3837     OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
3838     break;
3839   }
3840   case AMDGPU::G_FCONSTANT:
3841   case AMDGPU::G_CONSTANT:
3842   case AMDGPU::G_GLOBAL_VALUE:
3843   case AMDGPU::G_BLOCK_ADDR:
3844   case AMDGPU::G_READCYCLECOUNTER: {
3845     unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3846     OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
3847     break;
3848   }
3849   case AMDGPU::G_FRAME_INDEX: {
3850     // TODO: This should be the same as other constants, but eliminateFrameIndex
3851     // currently assumes VALU uses.
3852     unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3853     OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3854     break;
3855   }
3856   case AMDGPU::G_DYN_STACKALLOC: {
3857     // Result is always uniform, and a wave reduction is needed for the source.
3858     OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
3859     unsigned SrcBankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
3860     OpdsMapping[1] = AMDGPU::getValueMapping(SrcBankID, 32);
3861     break;
3862   }
3863   case AMDGPU::G_AMDGPU_WAVE_ADDRESS: {
3864     // This case is weird because we expect a physical register in the source,
3865     // but need to set a bank anyway.
3866     //
3867     // We could select the result to SGPR or VGPR, but for the one current use
3868     // it's more practical to always use VGPR.
3869     OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
3870     OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
3871     break;
3872   }
3873   case AMDGPU::G_INSERT: {
3874     unsigned BankID = getMappingType(MRI, MI);
3875     unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
3876     unsigned SrcSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
3877     unsigned EltSize = getSizeInBits(MI.getOperand(2).getReg(), MRI, *TRI);
3878     OpdsMapping[0] = AMDGPU::getValueMapping(BankID, DstSize);
3879     OpdsMapping[1] = AMDGPU::getValueMapping(BankID, SrcSize);
3880     OpdsMapping[2] = AMDGPU::getValueMapping(BankID, EltSize);
3881     OpdsMapping[3] = nullptr;
3882     break;
3883   }
3884   case AMDGPU::G_EXTRACT: {
3885     unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
3886     unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
3887     unsigned SrcSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
3888     OpdsMapping[0] = AMDGPU::getValueMapping(BankID, DstSize);
3889     OpdsMapping[1] = AMDGPU::getValueMapping(BankID, SrcSize);
3890     OpdsMapping[2] = nullptr;
3891     break;
3892   }
3893   case AMDGPU::G_BUILD_VECTOR:
3894   case AMDGPU::G_BUILD_VECTOR_TRUNC: {
3895     LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
3896     if (DstTy == LLT::fixed_vector(2, 16)) {
3897       unsigned DstSize = DstTy.getSizeInBits();
3898       unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
3899       unsigned Src0BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
3900       unsigned Src1BankID = getRegBankID(MI.getOperand(2).getReg(), MRI);
3901       unsigned DstBankID = regBankUnion(Src0BankID, Src1BankID);
3902 
3903       OpdsMapping[0] = AMDGPU::getValueMapping(DstBankID, DstSize);
3904       OpdsMapping[1] = AMDGPU::getValueMapping(Src0BankID, SrcSize);
3905       OpdsMapping[2] = AMDGPU::getValueMapping(Src1BankID, SrcSize);
3906       break;
3907     }
3908 
3909     LLVM_FALLTHROUGH;
3910   }
3911   case AMDGPU::G_MERGE_VALUES:
3912   case AMDGPU::G_CONCAT_VECTORS: {
3913     unsigned Bank = getMappingType(MRI, MI);
3914     unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3915     unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
3916 
3917     OpdsMapping[0] = AMDGPU::getValueMapping(Bank, DstSize);
3918     // Op1 and Dst should use the same register bank.
3919     for (unsigned i = 1, e = MI.getNumOperands(); i != e; ++i)
3920       OpdsMapping[i] = AMDGPU::getValueMapping(Bank, SrcSize);
3921     break;
3922   }
3923   case AMDGPU::G_BITREVERSE:
3924   case AMDGPU::G_BITCAST:
3925   case AMDGPU::G_INTTOPTR:
3926   case AMDGPU::G_PTRTOINT:
3927   case AMDGPU::G_FABS:
3928   case AMDGPU::G_FNEG: {
3929     unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3930     unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
3931     OpdsMapping[0] = OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size);
3932     break;
3933   }
3934   case AMDGPU::G_AMDGPU_FFBH_U32:
3935   case AMDGPU::G_AMDGPU_FFBL_B32:
3936   case AMDGPU::G_CTLZ_ZERO_UNDEF:
3937   case AMDGPU::G_CTTZ_ZERO_UNDEF: {
3938     unsigned Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
3939     unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
3940     OpdsMapping[0] = AMDGPU::getValueMapping(BankID, 32);
3941     OpdsMapping[1] = AMDGPU::getValueMappingSGPR64Only(BankID, Size);
3942     break;
3943   }
3944   case AMDGPU::G_CTPOP: {
3945     unsigned Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
3946     unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
3947     OpdsMapping[0] = AMDGPU::getValueMapping(BankID, 32);
3948 
3949     // This should really be getValueMappingSGPR64Only, but allowing the generic
3950     // code to handle the register split just makes using LegalizerHelper more
3951     // difficult.
3952     OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size);
3953     break;
3954   }
3955   case AMDGPU::G_TRUNC: {
3956     Register Dst = MI.getOperand(0).getReg();
3957     Register Src = MI.getOperand(1).getReg();
3958     unsigned Bank = getRegBankID(Src, MRI);
3959     unsigned DstSize = getSizeInBits(Dst, MRI, *TRI);
3960     unsigned SrcSize = getSizeInBits(Src, MRI, *TRI);
3961     OpdsMapping[0] = AMDGPU::getValueMapping(Bank, DstSize);
3962     OpdsMapping[1] = AMDGPU::getValueMapping(Bank, SrcSize);
3963     break;
3964   }
3965   case AMDGPU::G_ZEXT:
3966   case AMDGPU::G_SEXT:
3967   case AMDGPU::G_ANYEXT:
3968   case AMDGPU::G_SEXT_INREG: {
3969     Register Dst = MI.getOperand(0).getReg();
3970     Register Src = MI.getOperand(1).getReg();
3971     unsigned DstSize = getSizeInBits(Dst, MRI, *TRI);
3972     unsigned SrcSize = getSizeInBits(Src, MRI, *TRI);
3973 
3974     unsigned DstBank;
3975     const RegisterBank *SrcBank = getRegBank(Src, MRI, *TRI);
3976     assert(SrcBank);
3977     switch (SrcBank->getID()) {
3978     case AMDGPU::SGPRRegBankID:
3979       DstBank = AMDGPU::SGPRRegBankID;
3980       break;
3981     default:
3982       DstBank = AMDGPU::VGPRRegBankID;
3983       break;
3984     }
3985 
3986     // Scalar extend can use 64-bit BFE, but VGPRs require extending to
3987     // 32-bits, and then to 64.
3988     OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(DstBank, DstSize);
3989     OpdsMapping[1] = AMDGPU::getValueMappingSGPR64Only(SrcBank->getID(),
3990                                                        SrcSize);
3991     break;
3992   }
3993   case AMDGPU::G_FCMP: {
3994     unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
3995     OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
3996     OpdsMapping[1] = nullptr; // Predicate Operand.
3997     OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3998     OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3999     break;
4000   }
4001   case AMDGPU::G_STORE: {
4002     assert(MI.getOperand(0).isReg());
4003     unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4004 
4005     // FIXME: We need to specify a different reg bank once scalar stores are
4006     // supported.
4007     const ValueMapping *ValMapping =
4008         AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4009     OpdsMapping[0] = ValMapping;
4010     OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg());
4011     break;
4012   }
4013   case AMDGPU::G_ICMP: {
4014     auto Pred = static_cast<CmpInst::Predicate>(MI.getOperand(1).getPredicate());
4015     unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
4016 
4017     // See if the result register has already been constrained to vcc, which may
4018     // happen due to control flow intrinsic lowering.
4019     unsigned DstBank = getRegBankID(MI.getOperand(0).getReg(), MRI,
4020                                     AMDGPU::SGPRRegBankID);
4021     unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI);
4022     unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI);
4023 
4024     bool CanUseSCC = DstBank == AMDGPU::SGPRRegBankID &&
4025                      Op2Bank == AMDGPU::SGPRRegBankID &&
4026                      Op3Bank == AMDGPU::SGPRRegBankID &&
4027       (Size == 32 || (Size == 64 &&
4028                       (Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE) &&
4029                       Subtarget.hasScalarCompareEq64()));
4030 
4031     DstBank = CanUseSCC ? AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID;
4032     unsigned SrcBank = CanUseSCC ? AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
4033 
4034     // TODO: Use 32-bit for scalar output size.
4035     // SCC results will need to be copied to a 32-bit SGPR virtual register.
4036     const unsigned ResultSize = 1;
4037 
4038     OpdsMapping[0] = AMDGPU::getValueMapping(DstBank, ResultSize);
4039     OpdsMapping[2] = AMDGPU::getValueMapping(SrcBank, Size);
4040     OpdsMapping[3] = AMDGPU::getValueMapping(SrcBank, Size);
4041     break;
4042   }
4043   case AMDGPU::G_EXTRACT_VECTOR_ELT: {
4044     // VGPR index can be used for waterfall when indexing a SGPR vector.
4045     unsigned SrcBankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
4046     unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4047     unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
4048     unsigned IdxSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
4049     unsigned IdxBank = getRegBankID(MI.getOperand(2).getReg(), MRI);
4050     unsigned OutputBankID = regBankUnion(SrcBankID, IdxBank);
4051 
4052     OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(OutputBankID, DstSize);
4053     OpdsMapping[1] = AMDGPU::getValueMapping(SrcBankID, SrcSize);
4054 
4055     // The index can be either if the source vector is VGPR.
4056     OpdsMapping[2] = AMDGPU::getValueMapping(IdxBank, IdxSize);
4057     break;
4058   }
4059   case AMDGPU::G_INSERT_VECTOR_ELT: {
4060     unsigned OutputBankID = isSALUMapping(MI) ?
4061       AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
4062 
4063     unsigned VecSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4064     unsigned InsertSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
4065     unsigned IdxSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits();
4066     unsigned InsertEltBankID = getRegBankID(MI.getOperand(2).getReg(), MRI);
4067     unsigned IdxBankID = getRegBankID(MI.getOperand(3).getReg(), MRI);
4068 
4069     OpdsMapping[0] = AMDGPU::getValueMapping(OutputBankID, VecSize);
4070     OpdsMapping[1] = AMDGPU::getValueMapping(OutputBankID, VecSize);
4071 
4072     // This is a weird case, because we need to break down the mapping based on
4073     // the register bank of a different operand.
4074     if (InsertSize == 64 && OutputBankID == AMDGPU::VGPRRegBankID) {
4075       OpdsMapping[2] = AMDGPU::getValueMappingSplit64(InsertEltBankID,
4076                                                       InsertSize);
4077     } else {
4078       assert(InsertSize == 32 || InsertSize == 64);
4079       OpdsMapping[2] = AMDGPU::getValueMapping(InsertEltBankID, InsertSize);
4080     }
4081 
4082     // The index can be either if the source vector is VGPR.
4083     OpdsMapping[3] = AMDGPU::getValueMapping(IdxBankID, IdxSize);
4084     break;
4085   }
4086   case AMDGPU::G_UNMERGE_VALUES: {
4087     unsigned Bank = getMappingType(MRI, MI);
4088 
4089     // Op1 and Dst should use the same register bank.
4090     // FIXME: Shouldn't this be the default? Why do we need to handle this?
4091     for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
4092       unsigned Size = getSizeInBits(MI.getOperand(i).getReg(), MRI, *TRI);
4093       OpdsMapping[i] = AMDGPU::getValueMapping(Bank, Size);
4094     }
4095     break;
4096   }
4097   case AMDGPU::G_AMDGPU_BUFFER_LOAD:
4098   case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
4099   case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE:
4100   case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
4101   case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT:
4102   case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT:
4103   case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16:
4104   case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT:
4105   case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16:
4106   case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT:
4107   case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16:
4108   case AMDGPU::G_AMDGPU_BUFFER_STORE:
4109   case AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE:
4110   case AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT:
4111   case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT:
4112   case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16: {
4113     OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4114 
4115     // rsrc
4116     OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
4117 
4118     // vindex
4119     OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4120 
4121     // voffset
4122     OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4123 
4124     // soffset
4125     OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4126 
4127     // Any remaining operands are immediates and were correctly null
4128     // initialized.
4129     break;
4130   }
4131   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP:
4132   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD:
4133   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB:
4134   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN:
4135   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN:
4136   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX:
4137   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX:
4138   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND:
4139   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR:
4140   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR:
4141   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC:
4142   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC:
4143   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD:
4144   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN:
4145   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX: {
4146     // vdata_out
4147     OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4148 
4149     // vdata_in
4150     OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
4151 
4152     // rsrc
4153     OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4154 
4155     // vindex
4156     OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4157 
4158     // voffset
4159     OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4160 
4161     // soffset
4162     OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
4163 
4164     // Any remaining operands are immediates and were correctly null
4165     // initialized.
4166     break;
4167   }
4168   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP: {
4169     // vdata_out
4170     OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4171 
4172     // vdata_in
4173     OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
4174 
4175     // cmp
4176     OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4177 
4178     // rsrc
4179     OpdsMapping[3] = getSGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4180 
4181     // vindex
4182     OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4183 
4184     // voffset
4185     OpdsMapping[5] = getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
4186 
4187     // soffset
4188     OpdsMapping[6] = getSGPROpMapping(MI.getOperand(6).getReg(), MRI, *TRI);
4189 
4190     // Any remaining operands are immediates and were correctly null
4191     // initialized.
4192     break;
4193   }
4194   case AMDGPU::G_AMDGPU_S_BUFFER_LOAD: {
4195     // Lie and claim everything is legal, even though some need to be
4196     // SGPRs. applyMapping will have to deal with it as a waterfall loop.
4197     OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
4198     OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4199 
4200     // We need to convert this to a MUBUF if either the resource of offset is
4201     // VGPR.
4202     unsigned RSrcBank = OpdsMapping[1]->BreakDown[0].RegBank->getID();
4203     unsigned OffsetBank = OpdsMapping[2]->BreakDown[0].RegBank->getID();
4204     unsigned ResultBank = regBankUnion(RSrcBank, OffsetBank);
4205 
4206     unsigned Size0 = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4207     OpdsMapping[0] = AMDGPU::getValueMapping(ResultBank, Size0);
4208     break;
4209   }
4210   case AMDGPU::G_INTRINSIC: {
4211     switch (MI.getIntrinsicID()) {
4212     default:
4213       return getInvalidInstructionMapping();
4214     case Intrinsic::amdgcn_div_fmas:
4215     case Intrinsic::amdgcn_div_fixup:
4216     case Intrinsic::amdgcn_trig_preop:
4217     case Intrinsic::amdgcn_sin:
4218     case Intrinsic::amdgcn_cos:
4219     case Intrinsic::amdgcn_log_clamp:
4220     case Intrinsic::amdgcn_rcp:
4221     case Intrinsic::amdgcn_rcp_legacy:
4222     case Intrinsic::amdgcn_sqrt:
4223     case Intrinsic::amdgcn_rsq:
4224     case Intrinsic::amdgcn_rsq_legacy:
4225     case Intrinsic::amdgcn_rsq_clamp:
4226     case Intrinsic::amdgcn_fmul_legacy:
4227     case Intrinsic::amdgcn_fma_legacy:
4228     case Intrinsic::amdgcn_ldexp:
4229     case Intrinsic::amdgcn_frexp_mant:
4230     case Intrinsic::amdgcn_frexp_exp:
4231     case Intrinsic::amdgcn_fract:
4232     case Intrinsic::amdgcn_cvt_pkrtz:
4233     case Intrinsic::amdgcn_cvt_pknorm_i16:
4234     case Intrinsic::amdgcn_cvt_pknorm_u16:
4235     case Intrinsic::amdgcn_cvt_pk_i16:
4236     case Intrinsic::amdgcn_cvt_pk_u16:
4237     case Intrinsic::amdgcn_fmed3:
4238     case Intrinsic::amdgcn_cubeid:
4239     case Intrinsic::amdgcn_cubema:
4240     case Intrinsic::amdgcn_cubesc:
4241     case Intrinsic::amdgcn_cubetc:
4242     case Intrinsic::amdgcn_sffbh:
4243     case Intrinsic::amdgcn_fmad_ftz:
4244     case Intrinsic::amdgcn_mbcnt_lo:
4245     case Intrinsic::amdgcn_mbcnt_hi:
4246     case Intrinsic::amdgcn_mul_u24:
4247     case Intrinsic::amdgcn_mul_i24:
4248     case Intrinsic::amdgcn_mulhi_u24:
4249     case Intrinsic::amdgcn_mulhi_i24:
4250     case Intrinsic::amdgcn_lerp:
4251     case Intrinsic::amdgcn_sad_u8:
4252     case Intrinsic::amdgcn_msad_u8:
4253     case Intrinsic::amdgcn_sad_hi_u8:
4254     case Intrinsic::amdgcn_sad_u16:
4255     case Intrinsic::amdgcn_qsad_pk_u16_u8:
4256     case Intrinsic::amdgcn_mqsad_pk_u16_u8:
4257     case Intrinsic::amdgcn_mqsad_u32_u8:
4258     case Intrinsic::amdgcn_cvt_pk_u8_f32:
4259     case Intrinsic::amdgcn_alignbyte:
4260     case Intrinsic::amdgcn_perm:
4261     case Intrinsic::amdgcn_fdot2:
4262     case Intrinsic::amdgcn_sdot2:
4263     case Intrinsic::amdgcn_udot2:
4264     case Intrinsic::amdgcn_sdot4:
4265     case Intrinsic::amdgcn_udot4:
4266     case Intrinsic::amdgcn_sdot8:
4267     case Intrinsic::amdgcn_udot8:
4268     case Intrinsic::amdgcn_fdot2_bf16_bf16:
4269     case Intrinsic::amdgcn_fdot2_f16_f16:
4270     case Intrinsic::amdgcn_fdot2_f32_bf16:
4271     case Intrinsic::amdgcn_sudot4:
4272     case Intrinsic::amdgcn_sudot8:
4273     case Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16:
4274     case Intrinsic::amdgcn_wmma_f16_16x16x16_f16:
4275     case Intrinsic::amdgcn_wmma_f32_16x16x16_bf16:
4276     case Intrinsic::amdgcn_wmma_f32_16x16x16_f16:
4277     case Intrinsic::amdgcn_wmma_i32_16x16x16_iu4:
4278     case Intrinsic::amdgcn_wmma_i32_16x16x16_iu8:
4279       return getDefaultMappingVOP(MI);
4280     case Intrinsic::amdgcn_sbfe:
4281     case Intrinsic::amdgcn_ubfe:
4282       if (isSALUMapping(MI))
4283         return getDefaultMappingSOP(MI);
4284       return getDefaultMappingVOP(MI);
4285     case Intrinsic::amdgcn_ds_swizzle:
4286     case Intrinsic::amdgcn_ds_permute:
4287     case Intrinsic::amdgcn_ds_bpermute:
4288     case Intrinsic::amdgcn_update_dpp:
4289     case Intrinsic::amdgcn_mov_dpp8:
4290     case Intrinsic::amdgcn_mov_dpp:
4291     case Intrinsic::amdgcn_strict_wwm:
4292     case Intrinsic::amdgcn_wwm:
4293     case Intrinsic::amdgcn_strict_wqm:
4294     case Intrinsic::amdgcn_wqm:
4295     case Intrinsic::amdgcn_softwqm:
4296     case Intrinsic::amdgcn_set_inactive:
4297     case Intrinsic::amdgcn_permlane64:
4298       return getDefaultMappingAllVGPR(MI);
4299     case Intrinsic::amdgcn_kernarg_segment_ptr:
4300     case Intrinsic::amdgcn_s_getpc:
4301     case Intrinsic::amdgcn_groupstaticsize:
4302     case Intrinsic::amdgcn_reloc_constant:
4303     case Intrinsic::returnaddress: {
4304       unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4305       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
4306       break;
4307     }
4308     case Intrinsic::amdgcn_wqm_vote: {
4309       unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4310       OpdsMapping[0] = OpdsMapping[2]
4311         = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size);
4312       break;
4313     }
4314     case Intrinsic::amdgcn_ps_live: {
4315       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
4316       break;
4317     }
4318     case Intrinsic::amdgcn_div_scale: {
4319       unsigned Dst0Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4320       unsigned Dst1Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
4321       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Dst0Size);
4322       OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Dst1Size);
4323 
4324       unsigned SrcSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits();
4325       OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
4326       OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
4327       break;
4328     }
4329     case Intrinsic::amdgcn_class: {
4330       Register Src0Reg = MI.getOperand(2).getReg();
4331       Register Src1Reg = MI.getOperand(3).getReg();
4332       unsigned Src0Size = MRI.getType(Src0Reg).getSizeInBits();
4333       unsigned Src1Size = MRI.getType(Src1Reg).getSizeInBits();
4334       unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4335       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, DstSize);
4336       OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Src0Size);
4337       OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Src1Size);
4338       break;
4339     }
4340     case Intrinsic::amdgcn_icmp:
4341     case Intrinsic::amdgcn_fcmp: {
4342       unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4343       // This is not VCCRegBank because this is not used in boolean contexts.
4344       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
4345       unsigned OpSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
4346       OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, OpSize);
4347       OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, OpSize);
4348       break;
4349     }
4350     case Intrinsic::amdgcn_readlane: {
4351       // This must be an SGPR, but accept a VGPR.
4352       Register IdxReg = MI.getOperand(3).getReg();
4353       unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits();
4354       unsigned IdxBank = getRegBankID(IdxReg, MRI, AMDGPU::SGPRRegBankID);
4355       OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize);
4356       LLVM_FALLTHROUGH;
4357     }
4358     case Intrinsic::amdgcn_readfirstlane: {
4359       unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4360       unsigned SrcSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
4361       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
4362       OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
4363       break;
4364     }
4365     case Intrinsic::amdgcn_writelane: {
4366       unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4367       Register SrcReg = MI.getOperand(2).getReg();
4368       unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits();
4369       unsigned SrcBank = getRegBankID(SrcReg, MRI, AMDGPU::SGPRRegBankID);
4370       Register IdxReg = MI.getOperand(3).getReg();
4371       unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits();
4372       unsigned IdxBank = getRegBankID(IdxReg, MRI, AMDGPU::SGPRRegBankID);
4373       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
4374 
4375       // These 2 must be SGPRs, but accept VGPRs. Readfirstlane will be inserted
4376       // to legalize.
4377       OpdsMapping[2] = AMDGPU::getValueMapping(SrcBank, SrcSize);
4378       OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize);
4379       OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
4380       break;
4381     }
4382     case Intrinsic::amdgcn_if_break: {
4383       unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
4384       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
4385       OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
4386       OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
4387       break;
4388     }
4389     case Intrinsic::amdgcn_permlane16:
4390     case Intrinsic::amdgcn_permlanex16: {
4391       unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
4392       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4393       OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4394       OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4395       OpdsMapping[4] = getSGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4396       OpdsMapping[5] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4397       break;
4398     }
4399     case Intrinsic::amdgcn_mfma_f32_4x4x1f32:
4400     case Intrinsic::amdgcn_mfma_f32_4x4x4f16:
4401     case Intrinsic::amdgcn_mfma_i32_4x4x4i8:
4402     case Intrinsic::amdgcn_mfma_f32_4x4x2bf16:
4403     case Intrinsic::amdgcn_mfma_f32_16x16x1f32:
4404     case Intrinsic::amdgcn_mfma_f32_16x16x4f32:
4405     case Intrinsic::amdgcn_mfma_f32_16x16x4f16:
4406     case Intrinsic::amdgcn_mfma_f32_16x16x16f16:
4407     case Intrinsic::amdgcn_mfma_i32_16x16x4i8:
4408     case Intrinsic::amdgcn_mfma_i32_16x16x16i8:
4409     case Intrinsic::amdgcn_mfma_f32_16x16x2bf16:
4410     case Intrinsic::amdgcn_mfma_f32_16x16x8bf16:
4411     case Intrinsic::amdgcn_mfma_f32_32x32x1f32:
4412     case Intrinsic::amdgcn_mfma_f32_32x32x2f32:
4413     case Intrinsic::amdgcn_mfma_f32_32x32x4f16:
4414     case Intrinsic::amdgcn_mfma_f32_32x32x8f16:
4415     case Intrinsic::amdgcn_mfma_i32_32x32x4i8:
4416     case Intrinsic::amdgcn_mfma_i32_32x32x8i8:
4417     case Intrinsic::amdgcn_mfma_f32_32x32x2bf16:
4418     case Intrinsic::amdgcn_mfma_f32_32x32x4bf16:
4419     case Intrinsic::amdgcn_mfma_f32_32x32x4bf16_1k:
4420     case Intrinsic::amdgcn_mfma_f32_16x16x4bf16_1k:
4421     case Intrinsic::amdgcn_mfma_f32_4x4x4bf16_1k:
4422     case Intrinsic::amdgcn_mfma_f32_32x32x8bf16_1k:
4423     case Intrinsic::amdgcn_mfma_f32_16x16x16bf16_1k:
4424     case Intrinsic::amdgcn_mfma_f64_16x16x4f64:
4425     case Intrinsic::amdgcn_mfma_f64_4x4x4f64:
4426     case Intrinsic::amdgcn_mfma_i32_16x16x32_i8:
4427     case Intrinsic::amdgcn_mfma_i32_32x32x16_i8:
4428     case Intrinsic::amdgcn_mfma_f32_16x16x8_xf32:
4429     case Intrinsic::amdgcn_mfma_f32_32x32x4_xf32:
4430     case Intrinsic::amdgcn_mfma_f32_16x16x32_bf8_bf8:
4431     case Intrinsic::amdgcn_mfma_f32_16x16x32_bf8_fp8:
4432     case Intrinsic::amdgcn_mfma_f32_16x16x32_fp8_bf8:
4433     case Intrinsic::amdgcn_mfma_f32_16x16x32_fp8_fp8:
4434     case Intrinsic::amdgcn_mfma_f32_32x32x16_bf8_bf8:
4435     case Intrinsic::amdgcn_mfma_f32_32x32x16_bf8_fp8:
4436     case Intrinsic::amdgcn_mfma_f32_32x32x16_fp8_bf8:
4437     case Intrinsic::amdgcn_mfma_f32_32x32x16_fp8_fp8: {
4438       // Default for MAI intrinsics.
4439       // srcC can also be an immediate which can be folded later.
4440       // FIXME: Should we eventually add an alternative mapping with AGPR src
4441       // for srcA/srcB?
4442       //
4443       // vdst, srcA, srcB, srcC
4444       const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
4445       OpdsMapping[0] =
4446           Info->mayNeedAGPRs()
4447               ? getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI)
4448               : getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4449       OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4450       OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4451       OpdsMapping[4] =
4452           Info->mayNeedAGPRs()
4453               ? getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI)
4454               : getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4455       break;
4456     }
4457     case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16:
4458     case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16:
4459     case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16:
4460     case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16:
4461     case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8:
4462     case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8:
4463     case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_bf8:
4464     case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_fp8:
4465     case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_bf8:
4466     case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_fp8:
4467     case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8:
4468     case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8:
4469     case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8:
4470     case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8: {
4471       // vdst, srcA, srcB, srcC, idx
4472       OpdsMapping[0] = getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4473       OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4474       OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4475       OpdsMapping[4] = getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4476       OpdsMapping[5] = getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
4477       break;
4478     }
4479     case Intrinsic::amdgcn_interp_p1:
4480     case Intrinsic::amdgcn_interp_p2:
4481     case Intrinsic::amdgcn_interp_mov:
4482     case Intrinsic::amdgcn_interp_p1_f16:
4483     case Intrinsic::amdgcn_interp_p2_f16:
4484     case Intrinsic::amdgcn_lds_param_load: {
4485       const int M0Idx = MI.getNumOperands() - 1;
4486       Register M0Reg = MI.getOperand(M0Idx).getReg();
4487       unsigned M0Bank = getRegBankID(M0Reg, MRI, AMDGPU::SGPRRegBankID);
4488       unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4489 
4490       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
4491       for (int I = 2; I != M0Idx && MI.getOperand(I).isReg(); ++I)
4492         OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4493 
4494       // Must be SGPR, but we must take whatever the original bank is and fix it
4495       // later.
4496       OpdsMapping[M0Idx] = AMDGPU::getValueMapping(M0Bank, 32);
4497       break;
4498     }
4499     case Intrinsic::amdgcn_interp_inreg_p10:
4500     case Intrinsic::amdgcn_interp_inreg_p2:
4501     case Intrinsic::amdgcn_interp_inreg_p10_f16:
4502     case Intrinsic::amdgcn_interp_inreg_p2_f16: {
4503       unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4504       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
4505       OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4506       OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4507       OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4508       break;
4509     }
4510     case Intrinsic::amdgcn_ballot: {
4511       unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4512       unsigned SrcSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
4513       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
4514       OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, SrcSize);
4515       break;
4516     }
4517     }
4518     break;
4519   }
4520   case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
4521   case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16:
4522   case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE:
4523   case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16: {
4524     auto IntrID = MI.getIntrinsicID();
4525     const AMDGPU::RsrcIntrinsic *RSrcIntrin = AMDGPU::lookupRsrcIntrinsic(IntrID);
4526     assert(RSrcIntrin && "missing RsrcIntrinsic for image intrinsic");
4527     // Non-images can have complications from operands that allow both SGPR
4528     // and VGPR. For now it's too complicated to figure out the final opcode
4529     // to derive the register bank from the MCInstrDesc.
4530     assert(RSrcIntrin->IsImage);
4531     return getImageMapping(MRI, MI, RSrcIntrin->RsrcArg);
4532   }
4533   case AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY: {
4534     unsigned N = MI.getNumExplicitOperands() - 2;
4535     OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 128);
4536     OpdsMapping[N] = getSGPROpMapping(MI.getOperand(N).getReg(), MRI, *TRI);
4537     if (N == 3) {
4538       // Sequential form: all operands combined into VGPR256/VGPR512
4539       unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
4540       if (Size > 256)
4541         Size = 512;
4542       OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4543     } else {
4544       // NSA form
4545       for (unsigned I = 2; I < N; ++I) {
4546         unsigned Size = MRI.getType(MI.getOperand(I).getReg()).getSizeInBits();
4547         OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4548       }
4549     }
4550     break;
4551   }
4552   case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: {
4553     auto IntrID = MI.getIntrinsicID();
4554     switch (IntrID) {
4555     case Intrinsic::amdgcn_s_getreg:
4556     case Intrinsic::amdgcn_s_memtime:
4557     case Intrinsic::amdgcn_s_memrealtime:
4558     case Intrinsic::amdgcn_s_get_waveid_in_workgroup:
4559     case Intrinsic::amdgcn_s_sendmsg_rtn: {
4560       unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4561       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
4562       break;
4563     }
4564     case Intrinsic::amdgcn_global_atomic_fadd:
4565     case Intrinsic::amdgcn_global_atomic_csub:
4566     case Intrinsic::amdgcn_global_atomic_fmin:
4567     case Intrinsic::amdgcn_global_atomic_fmax:
4568     case Intrinsic::amdgcn_flat_atomic_fadd:
4569     case Intrinsic::amdgcn_flat_atomic_fmin:
4570     case Intrinsic::amdgcn_flat_atomic_fmax:
4571     case Intrinsic::amdgcn_global_atomic_fadd_v2bf16:
4572     case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16:
4573       return getDefaultMappingAllVGPR(MI);
4574     case Intrinsic::amdgcn_ds_ordered_add:
4575     case Intrinsic::amdgcn_ds_ordered_swap: {
4576       unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4577       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
4578       unsigned M0Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
4579                                  AMDGPU::SGPRRegBankID);
4580       OpdsMapping[2] = AMDGPU::getValueMapping(M0Bank, 32);
4581       OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4582       break;
4583     }
4584     case Intrinsic::amdgcn_ds_append:
4585     case Intrinsic::amdgcn_ds_consume: {
4586       unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4587       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
4588       OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4589       break;
4590     }
4591     case Intrinsic::amdgcn_exp_compr:
4592       OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4593       OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4594       break;
4595     case Intrinsic::amdgcn_exp:
4596       // FIXME: Could we support packed types here?
4597       OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4598       OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4599       OpdsMapping[5] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4600       OpdsMapping[6] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4601       break;
4602     case Intrinsic::amdgcn_exp_row:
4603       OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4604       OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4605       OpdsMapping[5] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4606       OpdsMapping[6] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4607       OpdsMapping[8] = getSGPROpMapping(MI.getOperand(8).getReg(), MRI, *TRI);
4608       break;
4609     case Intrinsic::amdgcn_s_sendmsg:
4610     case Intrinsic::amdgcn_s_sendmsghalt: {
4611       // This must be an SGPR, but accept a VGPR.
4612       unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
4613                                    AMDGPU::SGPRRegBankID);
4614       OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
4615       break;
4616     }
4617     case Intrinsic::amdgcn_s_setreg: {
4618       // This must be an SGPR, but accept a VGPR.
4619       unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
4620                                    AMDGPU::SGPRRegBankID);
4621       OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
4622       break;
4623     }
4624     case Intrinsic::amdgcn_end_cf: {
4625       unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
4626       OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
4627       break;
4628     }
4629     case Intrinsic::amdgcn_else: {
4630       unsigned WaveSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
4631       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
4632       OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize);
4633       OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize);
4634       break;
4635     }
4636     case Intrinsic::amdgcn_live_mask: {
4637       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
4638       break;
4639     }
4640     case Intrinsic::amdgcn_wqm_demote:
4641     case Intrinsic::amdgcn_kill: {
4642       OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
4643       break;
4644     }
4645     case Intrinsic::amdgcn_raw_buffer_load:
4646     case Intrinsic::amdgcn_raw_tbuffer_load: {
4647       // FIXME: Should make intrinsic ID the last operand of the instruction,
4648       // then this would be the same as store
4649       OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4650       OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4651       OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4652       OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4653       break;
4654     }
4655     case Intrinsic::amdgcn_raw_buffer_load_lds: {
4656       OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
4657       OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4658       OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4659       OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
4660       break;
4661     }
4662     case Intrinsic::amdgcn_raw_buffer_store:
4663     case Intrinsic::amdgcn_raw_buffer_store_format:
4664     case Intrinsic::amdgcn_raw_tbuffer_store: {
4665       OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
4666       OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4667       OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4668       OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4669       break;
4670     }
4671     case Intrinsic::amdgcn_struct_buffer_load:
4672     case Intrinsic::amdgcn_struct_tbuffer_load: {
4673       OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4674       OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4675       OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4676       OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4677       OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
4678       break;
4679     }
4680     case Intrinsic::amdgcn_struct_buffer_load_lds: {
4681       OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
4682       OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4683       OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4684       OpdsMapping[5] = getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
4685       OpdsMapping[6] = getSGPROpMapping(MI.getOperand(6).getReg(), MRI, *TRI);
4686       break;
4687     }
4688     case Intrinsic::amdgcn_struct_buffer_store:
4689     case Intrinsic::amdgcn_struct_tbuffer_store: {
4690       OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
4691       OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4692       OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4693       OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4694       OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
4695       break;
4696     }
4697     case Intrinsic::amdgcn_init_exec_from_input: {
4698       unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
4699       OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
4700       break;
4701     }
4702     case Intrinsic::amdgcn_ds_gws_init:
4703     case Intrinsic::amdgcn_ds_gws_barrier:
4704     case Intrinsic::amdgcn_ds_gws_sema_br: {
4705       OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4706 
4707       // This must be an SGPR, but accept a VGPR.
4708       unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
4709                                    AMDGPU::SGPRRegBankID);
4710       OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
4711       break;
4712     }
4713     case Intrinsic::amdgcn_ds_gws_sema_v:
4714     case Intrinsic::amdgcn_ds_gws_sema_p:
4715     case Intrinsic::amdgcn_ds_gws_sema_release_all: {
4716       // This must be an SGPR, but accept a VGPR.
4717       unsigned Bank = getRegBankID(MI.getOperand(1).getReg(), MRI,
4718                                    AMDGPU::SGPRRegBankID);
4719       OpdsMapping[1] = AMDGPU::getValueMapping(Bank, 32);
4720       break;
4721     }
4722     case Intrinsic::amdgcn_global_load_lds: {
4723       OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
4724       OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4725       break;
4726     }
4727     case Intrinsic::amdgcn_lds_direct_load: {
4728       const int M0Idx = MI.getNumOperands() - 1;
4729       Register M0Reg = MI.getOperand(M0Idx).getReg();
4730       unsigned M0Bank = getRegBankID(M0Reg, MRI, AMDGPU::SGPRRegBankID);
4731       unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4732 
4733       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
4734       for (int I = 2; I != M0Idx && MI.getOperand(I).isReg(); ++I)
4735         OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4736 
4737       // Must be SGPR, but we must take whatever the original bank is and fix it
4738       // later.
4739       OpdsMapping[M0Idx] = AMDGPU::getValueMapping(M0Bank, 32);
4740       break;
4741     }
4742     case Intrinsic::amdgcn_ds_add_gs_reg_rtn:
4743     case Intrinsic::amdgcn_ds_sub_gs_reg_rtn:
4744       OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4745       OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4746       break;
4747     default:
4748       return getInvalidInstructionMapping();
4749     }
4750     break;
4751   }
4752   case AMDGPU::G_SELECT: {
4753     unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4754     unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
4755                                     AMDGPU::SGPRRegBankID);
4756     unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI,
4757                                     AMDGPU::SGPRRegBankID);
4758     bool SGPRSrcs = Op2Bank == AMDGPU::SGPRRegBankID &&
4759                     Op3Bank == AMDGPU::SGPRRegBankID;
4760 
4761     unsigned CondBankDefault = SGPRSrcs ?
4762       AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID;
4763     unsigned CondBank = getRegBankID(MI.getOperand(1).getReg(), MRI,
4764                                      CondBankDefault);
4765     if (CondBank == AMDGPU::SGPRRegBankID)
4766       CondBank = SGPRSrcs ? AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID;
4767     else if (CondBank == AMDGPU::VGPRRegBankID)
4768       CondBank = AMDGPU::VCCRegBankID;
4769 
4770     unsigned Bank = SGPRSrcs && CondBank == AMDGPU::SGPRRegBankID ?
4771       AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
4772 
4773     assert(CondBank == AMDGPU::VCCRegBankID || CondBank == AMDGPU::SGPRRegBankID);
4774 
4775     // TODO: Should report 32-bit for scalar condition type.
4776     if (Size == 64) {
4777       OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
4778       OpdsMapping[1] = AMDGPU::getValueMapping(CondBank, 1);
4779       OpdsMapping[2] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
4780       OpdsMapping[3] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
4781     } else {
4782       OpdsMapping[0] = AMDGPU::getValueMapping(Bank, Size);
4783       OpdsMapping[1] = AMDGPU::getValueMapping(CondBank, 1);
4784       OpdsMapping[2] = AMDGPU::getValueMapping(Bank, Size);
4785       OpdsMapping[3] = AMDGPU::getValueMapping(Bank, Size);
4786     }
4787 
4788     break;
4789   }
4790 
4791   case AMDGPU::G_SI_CALL: {
4792     OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 64);
4793     // Lie and claim everything is legal, even though some need to be
4794     // SGPRs. applyMapping will have to deal with it as a waterfall loop.
4795     OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
4796 
4797     // Allow anything for implicit arguments
4798     for (unsigned I = 4; I < MI.getNumOperands(); ++I) {
4799       if (MI.getOperand(I).isReg()) {
4800         Register Reg = MI.getOperand(I).getReg();
4801         auto OpBank = getRegBankID(Reg, MRI);
4802         unsigned Size = getSizeInBits(Reg, MRI, *TRI);
4803         OpdsMapping[I] = AMDGPU::getValueMapping(OpBank, Size);
4804       }
4805     }
4806     break;
4807   }
4808   case AMDGPU::G_LOAD:
4809   case AMDGPU::G_ZEXTLOAD:
4810   case AMDGPU::G_SEXTLOAD:
4811     return getInstrMappingForLoad(MI);
4812 
4813   case AMDGPU::G_ATOMICRMW_XCHG:
4814   case AMDGPU::G_ATOMICRMW_ADD:
4815   case AMDGPU::G_ATOMICRMW_SUB:
4816   case AMDGPU::G_ATOMICRMW_AND:
4817   case AMDGPU::G_ATOMICRMW_OR:
4818   case AMDGPU::G_ATOMICRMW_XOR:
4819   case AMDGPU::G_ATOMICRMW_MAX:
4820   case AMDGPU::G_ATOMICRMW_MIN:
4821   case AMDGPU::G_ATOMICRMW_UMAX:
4822   case AMDGPU::G_ATOMICRMW_UMIN:
4823   case AMDGPU::G_ATOMICRMW_FADD:
4824   case AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG:
4825   case AMDGPU::G_AMDGPU_ATOMIC_INC:
4826   case AMDGPU::G_AMDGPU_ATOMIC_DEC:
4827   case AMDGPU::G_AMDGPU_ATOMIC_FMIN:
4828   case AMDGPU::G_AMDGPU_ATOMIC_FMAX: {
4829     OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4830     OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg());
4831     OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4832     break;
4833   }
4834   case AMDGPU::G_ATOMIC_CMPXCHG: {
4835     OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4836     OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg());
4837     OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4838     OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4839     break;
4840   }
4841   case AMDGPU::G_BRCOND: {
4842     unsigned Bank = getRegBankID(MI.getOperand(0).getReg(), MRI,
4843                                  AMDGPU::SGPRRegBankID);
4844     assert(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1);
4845     if (Bank != AMDGPU::SGPRRegBankID)
4846       Bank = AMDGPU::VCCRegBankID;
4847 
4848     OpdsMapping[0] = AMDGPU::getValueMapping(Bank, 1);
4849     break;
4850   }
4851   case AMDGPU::G_FPTRUNC_ROUND_UPWARD:
4852   case AMDGPU::G_FPTRUNC_ROUND_DOWNWARD:
4853     return getDefaultMappingVOP(MI);
4854   }
4855 
4856   return getInstructionMapping(/*ID*/1, /*Cost*/1,
4857                                getOperandsMapping(OpdsMapping),
4858                                MI.getNumOperands());
4859 }
4860