1 //===- AMDGPURegisterBankInfo.cpp -------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements the targeting of the RegisterBankInfo class for
10 /// AMDGPU.
11 ///
12 /// \par
13 ///
14 /// AMDGPU has unique register bank constraints that require special high level
15 /// strategies to deal with. There are two main true physical register banks
16 /// VGPR (vector), and SGPR (scalar). Additionally the VCC register bank is a
17 /// sort of pseudo-register bank needed to represent SGPRs used in a vector
18 /// boolean context. There is also the AGPR bank, which is a special purpose
19 /// physical register bank present on some subtargets.
20 ///
21 /// Copying from VGPR to SGPR is generally illegal, unless the value is known to
22 /// be uniform. It is generally not valid to legalize operands by inserting
23 /// copies as on other targets. Operations which require uniform, SGPR operands
24 /// generally require scalarization by repeatedly executing the instruction,
25 /// activating each set of lanes using a unique set of input values. This is
26 /// referred to as a waterfall loop.
27 ///
28 /// \par Booleans
29 ///
30 /// Booleans (s1 values) requires special consideration. A vector compare result
31 /// is naturally a bitmask with one bit per lane, in a 32 or 64-bit
32 /// register. These are represented with the VCC bank. During selection, we need
33 /// to be able to unambiguously go back from a register class to a register
34 /// bank. To distinguish whether an SGPR should use the SGPR or VCC register
35 /// bank, we need to know the use context type. An SGPR s1 value always means a
36 /// VCC bank value, otherwise it will be the SGPR bank. A scalar compare sets
37 /// SCC, which is a 1-bit unaddressable register. This will need to be copied to
38 /// a 32-bit virtual register. Taken together, this means we need to adjust the
39 /// type of boolean operations to be regbank legal. All SALU booleans need to be
40 /// widened to 32-bits, and all VALU booleans need to be s1 values.
41 ///
42 /// A noteworthy exception to the s1-means-vcc rule is for legalization artifact
43 /// casts. G_TRUNC s1 results, and G_SEXT/G_ZEXT/G_ANYEXT sources are never vcc
44 /// bank. A non-boolean source (such as a truncate from a 1-bit load from
45 /// memory) will require a copy to the VCC bank which will require clearing the
46 /// high bits and inserting a compare.
47 ///
48 /// \par Constant bus restriction
49 ///
50 /// VALU instructions have a limitation known as the constant bus
51 /// restriction. Most VALU instructions can use SGPR operands, but may read at
52 /// most 1 SGPR or constant literal value (this to 2 in gfx10 for most
53 /// instructions). This is one unique SGPR, so the same SGPR may be used for
54 /// multiple operands. From a register bank perspective, any combination of
55 /// operands should be legal as an SGPR, but this is contextually dependent on
56 /// the SGPR operands all being the same register. There is therefore optimal to
57 /// choose the SGPR with the most uses to minimize the number of copies.
58 ///
59 /// We avoid trying to solve this problem in RegBankSelect. Any VALU G_*
60 /// operation should have its source operands all mapped to VGPRs (except for
61 /// VCC), inserting copies from any SGPR operands. This the most trivial legal
62 /// mapping. Anything beyond the simplest 1:1 instruction selection would be too
63 /// complicated to solve here. Every optimization pattern or instruction
64 /// selected to multiple outputs would have to enforce this rule, and there
65 /// would be additional complexity in tracking this rule for every G_*
66 /// operation. By forcing all inputs to VGPRs, it also simplifies the task of
67 /// picking the optimal operand combination from a post-isel optimization pass.
68 ///
69 //===----------------------------------------------------------------------===//
70 
71 #include "AMDGPURegisterBankInfo.h"
72 
73 #include "AMDGPU.h"
74 #include "AMDGPUGlobalISelUtils.h"
75 #include "AMDGPUInstrInfo.h"
76 #include "GCNSubtarget.h"
77 #include "SIMachineFunctionInfo.h"
78 #include "SIRegisterInfo.h"
79 #include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
80 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
81 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
82 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
83 #include "llvm/CodeGen/RegisterBank.h"
84 #include "llvm/IR/IntrinsicsAMDGPU.h"
85 
86 #define GET_TARGET_REGBANK_IMPL
87 #include "AMDGPUGenRegisterBank.inc"
88 
89 // This file will be TableGen'ed at some point.
90 #include "AMDGPUGenRegisterBankInfo.def"
91 
92 using namespace llvm;
93 using namespace MIPatternMatch;
94 
95 namespace {
96 
97 // Observer to apply a register bank to new registers created by LegalizerHelper.
98 class ApplyRegBankMapping final : public GISelChangeObserver {
99 private:
100   const AMDGPURegisterBankInfo &RBI;
101   MachineRegisterInfo &MRI;
102   const RegisterBank *NewBank;
103   SmallVector<MachineInstr *, 4> NewInsts;
104 
105 public:
106   ApplyRegBankMapping(const AMDGPURegisterBankInfo &RBI_,
107                       MachineRegisterInfo &MRI_, const RegisterBank *RB)
108     : RBI(RBI_), MRI(MRI_), NewBank(RB) {}
109 
110   ~ApplyRegBankMapping() {
111     for (MachineInstr *MI : NewInsts)
112       applyBank(*MI);
113   }
114 
115   /// Set any registers that don't have a set register class or bank to SALU.
116   void applyBank(MachineInstr &MI) {
117     const unsigned Opc = MI.getOpcode();
118     if (Opc == AMDGPU::G_ANYEXT || Opc == AMDGPU::G_ZEXT ||
119         Opc == AMDGPU::G_SEXT) {
120       // LegalizerHelper wants to use the basic legalization artifacts when
121       // widening etc. We don't handle selection with vcc in artifact sources,
122       // so we need to use a select instead to handle these properly.
123       Register DstReg = MI.getOperand(0).getReg();
124       Register SrcReg = MI.getOperand(1).getReg();
125       const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, MRI, *RBI.TRI);
126       if (SrcBank == &AMDGPU::VCCRegBank) {
127         const LLT S32 = LLT::scalar(32);
128         assert(MRI.getType(SrcReg) == LLT::scalar(1));
129         assert(MRI.getType(DstReg) == S32);
130         assert(NewBank == &AMDGPU::VGPRRegBank);
131 
132         // Replace the extension with a select, which really uses the boolean
133         // source.
134         MachineIRBuilder B(MI);
135         auto True = B.buildConstant(S32, Opc == AMDGPU::G_SEXT ? -1 : 1);
136         auto False = B.buildConstant(S32, 0);
137         B.buildSelect(DstReg, SrcReg, True, False);
138         MRI.setRegBank(True.getReg(0), *NewBank);
139         MRI.setRegBank(False.getReg(0), *NewBank);
140         MI.eraseFromParent();
141       }
142 
143       assert(!MRI.getRegClassOrRegBank(DstReg));
144       MRI.setRegBank(DstReg, *NewBank);
145       return;
146     }
147 
148 #ifndef NDEBUG
149     if (Opc == AMDGPU::G_TRUNC) {
150       Register DstReg = MI.getOperand(0).getReg();
151       const RegisterBank *DstBank = RBI.getRegBank(DstReg, MRI, *RBI.TRI);
152       assert(DstBank != &AMDGPU::VCCRegBank);
153     }
154 #endif
155 
156     for (MachineOperand &Op : MI.operands()) {
157       if (!Op.isReg())
158         continue;
159 
160       // We may see physical registers if building a real MI
161       Register Reg = Op.getReg();
162       if (Reg.isPhysical() || MRI.getRegClassOrRegBank(Reg))
163         continue;
164 
165       const RegisterBank *RB = NewBank;
166       if (MRI.getType(Reg) == LLT::scalar(1)) {
167         assert(NewBank == &AMDGPU::VGPRRegBank &&
168                "s1 operands should only be used for vector bools");
169         assert((MI.getOpcode() != AMDGPU::G_TRUNC &&
170                 MI.getOpcode() != AMDGPU::G_ANYEXT) &&
171                "not expecting legalization artifacts here");
172         RB = &AMDGPU::VCCRegBank;
173       }
174 
175       MRI.setRegBank(Reg, *RB);
176     }
177   }
178 
179   void erasingInstr(MachineInstr &MI) override {}
180 
181   void createdInstr(MachineInstr &MI) override {
182     // At this point, the instruction was just inserted and has no operands.
183     NewInsts.push_back(&MI);
184   }
185 
186   void changingInstr(MachineInstr &MI) override {}
187   void changedInstr(MachineInstr &MI) override {
188     // FIXME: In principle we should probably add the instruction to NewInsts,
189     // but the way the LegalizerHelper uses the observer, we will always see the
190     // registers we need to set the regbank on also referenced in a new
191     // instruction.
192   }
193 };
194 
195 }
196 AMDGPURegisterBankInfo::AMDGPURegisterBankInfo(const GCNSubtarget &ST)
197     : Subtarget(ST), TRI(Subtarget.getRegisterInfo()),
198       TII(Subtarget.getInstrInfo()) {
199 
200   // HACK: Until this is fully tablegen'd.
201   static llvm::once_flag InitializeRegisterBankFlag;
202 
203   static auto InitializeRegisterBankOnce = [this]() {
204     assert(&getRegBank(AMDGPU::SGPRRegBankID) == &AMDGPU::SGPRRegBank &&
205            &getRegBank(AMDGPU::VGPRRegBankID) == &AMDGPU::VGPRRegBank &&
206            &getRegBank(AMDGPU::AGPRRegBankID) == &AMDGPU::AGPRRegBank);
207     (void)this;
208   };
209 
210   llvm::call_once(InitializeRegisterBankFlag, InitializeRegisterBankOnce);
211 }
212 
213 static bool isVectorRegisterBank(const RegisterBank &Bank) {
214   unsigned BankID = Bank.getID();
215   return BankID == AMDGPU::VGPRRegBankID || BankID == AMDGPU::AGPRRegBankID;
216 }
217 
218 unsigned AMDGPURegisterBankInfo::copyCost(const RegisterBank &Dst,
219                                           const RegisterBank &Src,
220                                           unsigned Size) const {
221   // TODO: Should there be a UniformVGPRRegBank which can use readfirstlane?
222   if (Dst.getID() == AMDGPU::SGPRRegBankID &&
223       (isVectorRegisterBank(Src) || Src.getID() == AMDGPU::VCCRegBankID)) {
224     return std::numeric_limits<unsigned>::max();
225   }
226 
227   // Bool values are tricky, because the meaning is based on context. The SCC
228   // and VCC banks are for the natural scalar and vector conditions produced by
229   // a compare.
230   //
231   // Legalization doesn't know about the necessary context, so an s1 use may
232   // have been a truncate from an arbitrary value, in which case a copy (lowered
233   // as a compare with 0) needs to be inserted.
234   if (Size == 1 &&
235       (Dst.getID() == AMDGPU::SGPRRegBankID) &&
236       (isVectorRegisterBank(Src) ||
237        Src.getID() == AMDGPU::SGPRRegBankID ||
238        Src.getID() == AMDGPU::VCCRegBankID))
239     return std::numeric_limits<unsigned>::max();
240 
241   // There is no direct copy between AGPRs.
242   if (Dst.getID() == AMDGPU::AGPRRegBankID &&
243       Src.getID() == AMDGPU::AGPRRegBankID)
244     return 4;
245 
246   return RegisterBankInfo::copyCost(Dst, Src, Size);
247 }
248 
249 unsigned AMDGPURegisterBankInfo::getBreakDownCost(
250   const ValueMapping &ValMapping,
251   const RegisterBank *CurBank) const {
252   // Check if this is a breakdown for G_LOAD to move the pointer from SGPR to
253   // VGPR.
254   // FIXME: Is there a better way to do this?
255   if (ValMapping.NumBreakDowns >= 2 || ValMapping.BreakDown[0].Length >= 64)
256     return 10; // This is expensive.
257 
258   assert(ValMapping.NumBreakDowns == 2 &&
259          ValMapping.BreakDown[0].Length == 32 &&
260          ValMapping.BreakDown[0].StartIdx == 0 &&
261          ValMapping.BreakDown[1].Length == 32 &&
262          ValMapping.BreakDown[1].StartIdx == 32 &&
263          ValMapping.BreakDown[0].RegBank == ValMapping.BreakDown[1].RegBank);
264 
265   // 32-bit extract of a 64-bit value is just access of a subregister, so free.
266   // TODO: Cost of 0 hits assert, though it's not clear it's what we really
267   // want.
268 
269   // TODO: 32-bit insert to a 64-bit SGPR may incur a non-free copy due to SGPR
270   // alignment restrictions, but this probably isn't important.
271   return 1;
272 }
273 
274 const RegisterBank &
275 AMDGPURegisterBankInfo::getRegBankFromRegClass(const TargetRegisterClass &RC,
276                                                LLT Ty) const {
277   if (&RC == &AMDGPU::SReg_1RegClass)
278     return AMDGPU::VCCRegBank;
279 
280   // We promote real scalar booleans to SReg_32. Any SGPR using s1 is really a
281   // VCC-like use.
282   if (TRI->isSGPRClass(&RC)) {
283     // FIXME: This probably came from a copy from a physical register, which
284     // should be inferable from the copied to-type. We don't have many boolean
285     // physical register constraints so just assume a normal SGPR for now.
286     if (!Ty.isValid())
287       return AMDGPU::SGPRRegBank;
288 
289     return Ty == LLT::scalar(1) ? AMDGPU::VCCRegBank : AMDGPU::SGPRRegBank;
290   }
291 
292   return TRI->isAGPRClass(&RC) ? AMDGPU::AGPRRegBank : AMDGPU::VGPRRegBank;
293 }
294 
295 template <unsigned NumOps>
296 RegisterBankInfo::InstructionMappings
297 AMDGPURegisterBankInfo::addMappingFromTable(
298     const MachineInstr &MI, const MachineRegisterInfo &MRI,
299     const std::array<unsigned, NumOps> RegSrcOpIdx,
300     ArrayRef<OpRegBankEntry<NumOps>> Table) const {
301 
302   InstructionMappings AltMappings;
303 
304   SmallVector<const ValueMapping *, 10> Operands(MI.getNumOperands());
305 
306   unsigned Sizes[NumOps];
307   for (unsigned I = 0; I < NumOps; ++I) {
308     Register Reg = MI.getOperand(RegSrcOpIdx[I]).getReg();
309     Sizes[I] = getSizeInBits(Reg, MRI, *TRI);
310   }
311 
312   for (unsigned I = 0, E = MI.getNumExplicitDefs(); I != E; ++I) {
313     unsigned SizeI = getSizeInBits(MI.getOperand(I).getReg(), MRI, *TRI);
314     Operands[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SizeI);
315   }
316 
317   // getInstrMapping's default mapping uses ID 1, so start at 2.
318   unsigned MappingID = 2;
319   for (const auto &Entry : Table) {
320     for (unsigned I = 0; I < NumOps; ++I) {
321       int OpIdx = RegSrcOpIdx[I];
322       Operands[OpIdx] = AMDGPU::getValueMapping(Entry.RegBanks[I], Sizes[I]);
323     }
324 
325     AltMappings.push_back(&getInstructionMapping(MappingID++, Entry.Cost,
326                                                  getOperandsMapping(Operands),
327                                                  Operands.size()));
328   }
329 
330   return AltMappings;
331 }
332 
333 RegisterBankInfo::InstructionMappings
334 AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsic(
335     const MachineInstr &MI, const MachineRegisterInfo &MRI) const {
336   switch (MI.getIntrinsicID()) {
337   case Intrinsic::amdgcn_readlane: {
338     static const OpRegBankEntry<3> Table[2] = {
339       // Perfectly legal.
340       { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },
341 
342       // Need a readfirstlane for the index.
343       { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 }
344     };
345 
346     const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 2, 3 } };
347     return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, Table);
348   }
349   case Intrinsic::amdgcn_writelane: {
350     static const OpRegBankEntry<4> Table[4] = {
351       // Perfectly legal.
352       { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },
353 
354       // Need readfirstlane of first op
355       { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 },
356 
357       // Need readfirstlane of second op
358       { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 },
359 
360       // Need readfirstlane of both ops
361       { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 3 }
362     };
363 
364     // rsrc, voffset, offset
365     const std::array<unsigned, 4> RegSrcOpIdx = { { 0, 2, 3, 4 } };
366     return addMappingFromTable<4>(MI, MRI, RegSrcOpIdx, Table);
367   }
368   default:
369     return RegisterBankInfo::getInstrAlternativeMappings(MI);
370   }
371 }
372 
373 RegisterBankInfo::InstructionMappings
374 AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsicWSideEffects(
375     const MachineInstr &MI, const MachineRegisterInfo &MRI) const {
376 
377   switch (MI.getIntrinsicID()) {
378   case Intrinsic::amdgcn_s_buffer_load: {
379     static const OpRegBankEntry<2> Table[4] = {
380       // Perfectly legal.
381       { { AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },
382 
383       // Only need 1 register in loop
384       { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 300 },
385 
386       // Have to waterfall the resource.
387       { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1000 },
388 
389       // Have to waterfall the resource, and the offset.
390       { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1500 }
391     };
392 
393     // rsrc, offset
394     const std::array<unsigned, 2> RegSrcOpIdx = { { 2, 3 } };
395     return addMappingFromTable<2>(MI, MRI, RegSrcOpIdx, Table);
396   }
397   case Intrinsic::amdgcn_ds_ordered_add:
398   case Intrinsic::amdgcn_ds_ordered_swap: {
399     // VGPR = M0, VGPR
400     static const OpRegBankEntry<3> Table[2] = {
401       // Perfectly legal.
402       { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID  }, 1 },
403 
404       // Need a readfirstlane for m0
405       { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 }
406     };
407 
408     const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 2, 3 } };
409     return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, Table);
410   }
411   case Intrinsic::amdgcn_s_sendmsg:
412   case Intrinsic::amdgcn_s_sendmsghalt: {
413     // FIXME: Should have no register for immediate
414     static const OpRegBankEntry<1> Table[2] = {
415       // Perfectly legal.
416       { { AMDGPU::SGPRRegBankID }, 1 },
417 
418       // Need readlane
419       { { AMDGPU::VGPRRegBankID }, 3 }
420     };
421 
422     const std::array<unsigned, 1> RegSrcOpIdx = { { 2 } };
423     return addMappingFromTable<1>(MI, MRI, RegSrcOpIdx, Table);
424   }
425   default:
426     return RegisterBankInfo::getInstrAlternativeMappings(MI);
427   }
428 }
429 
430 // FIXME: Returns uniform if there's no source value information. This is
431 // probably wrong.
432 static bool isScalarLoadLegal(const MachineInstr &MI) {
433   if (!MI.hasOneMemOperand())
434     return false;
435 
436   const MachineMemOperand *MMO = *MI.memoperands_begin();
437   const unsigned AS = MMO->getAddrSpace();
438   const bool IsConst = AS == AMDGPUAS::CONSTANT_ADDRESS ||
439                        AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT;
440   // Require 4-byte alignment.
441   return MMO->getAlign() >= Align(4) &&
442          // Can't do a scalar atomic load.
443          !MMO->isAtomic() &&
444          // Don't use scalar loads for volatile accesses to non-constant address
445          // spaces.
446          (IsConst || !MMO->isVolatile()) &&
447          // Memory must be known constant, or not written before this load.
448          (IsConst || MMO->isInvariant() || (MMO->getFlags() & MONoClobber)) &&
449          AMDGPUInstrInfo::isUniformMMO(MMO);
450 }
451 
452 RegisterBankInfo::InstructionMappings
453 AMDGPURegisterBankInfo::getInstrAlternativeMappings(
454     const MachineInstr &MI) const {
455 
456   const MachineFunction &MF = *MI.getParent()->getParent();
457   const MachineRegisterInfo &MRI = MF.getRegInfo();
458 
459 
460   InstructionMappings AltMappings;
461   switch (MI.getOpcode()) {
462   case TargetOpcode::G_CONSTANT:
463   case TargetOpcode::G_IMPLICIT_DEF: {
464     unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
465     if (Size == 1) {
466       static const OpRegBankEntry<1> Table[3] = {
467         { { AMDGPU::VGPRRegBankID }, 1 },
468         { { AMDGPU::SGPRRegBankID }, 1 },
469         { { AMDGPU::VCCRegBankID }, 1 }
470       };
471 
472       return addMappingFromTable<1>(MI, MRI, {{ 0 }}, Table);
473     }
474 
475     [[fallthrough]];
476   }
477   case TargetOpcode::G_FCONSTANT:
478   case TargetOpcode::G_FRAME_INDEX:
479   case TargetOpcode::G_GLOBAL_VALUE: {
480     static const OpRegBankEntry<1> Table[2] = {
481       { { AMDGPU::VGPRRegBankID }, 1 },
482       { { AMDGPU::SGPRRegBankID }, 1 }
483     };
484 
485     return addMappingFromTable<1>(MI, MRI, {{ 0 }}, Table);
486   }
487   case TargetOpcode::G_AND:
488   case TargetOpcode::G_OR:
489   case TargetOpcode::G_XOR: {
490     unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
491 
492     if (Size == 1) {
493       // s_{and|or|xor}_b32 set scc when the result of the 32-bit op is not 0.
494       const InstructionMapping &SCCMapping = getInstructionMapping(
495         1, 1, getOperandsMapping(
496           {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32),
497            AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32),
498            AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32)}),
499         3); // Num Operands
500       AltMappings.push_back(&SCCMapping);
501 
502       const InstructionMapping &VCCMapping0 = getInstructionMapping(
503         2, 1, getOperandsMapping(
504           {AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size),
505            AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size),
506            AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size)}),
507         3); // Num Operands
508       AltMappings.push_back(&VCCMapping0);
509       return AltMappings;
510     }
511 
512     if (Size != 64)
513       break;
514 
515     const InstructionMapping &SSMapping = getInstructionMapping(
516       1, 1, getOperandsMapping(
517         {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
518          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
519          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
520       3); // Num Operands
521     AltMappings.push_back(&SSMapping);
522 
523     const InstructionMapping &VVMapping = getInstructionMapping(
524       2, 2, getOperandsMapping(
525         {AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
526          AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
527          AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}),
528       3); // Num Operands
529     AltMappings.push_back(&VVMapping);
530     break;
531   }
532   case TargetOpcode::G_LOAD:
533   case TargetOpcode::G_ZEXTLOAD:
534   case TargetOpcode::G_SEXTLOAD: {
535     unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
536     LLT PtrTy = MRI.getType(MI.getOperand(1).getReg());
537     unsigned PtrSize = PtrTy.getSizeInBits();
538     unsigned AS = PtrTy.getAddressSpace();
539 
540     if ((AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::REGION_ADDRESS &&
541          AS != AMDGPUAS::PRIVATE_ADDRESS) &&
542         isScalarLoadLegal(MI)) {
543       const InstructionMapping &SSMapping = getInstructionMapping(
544           1, 1, getOperandsMapping(
545                     {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
546                      AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize)}),
547           2); // Num Operands
548       AltMappings.push_back(&SSMapping);
549     }
550 
551     const InstructionMapping &VVMapping = getInstructionMapping(
552         2, 1,
553         getOperandsMapping(
554             {AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
555              AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize)}),
556         2); // Num Operands
557     AltMappings.push_back(&VVMapping);
558 
559     // It may be possible to have a vgpr = load sgpr mapping here, because
560     // the mubuf instructions support this kind of load, but probably for only
561     // gfx7 and older.  However, the addressing mode matching in the instruction
562     // selector should be able to do a better job of detecting and selecting
563     // these kinds of loads from the vgpr = load vgpr mapping.
564 
565     return AltMappings;
566 
567   }
568   case TargetOpcode::G_SELECT: {
569     unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
570     const InstructionMapping &SSMapping = getInstructionMapping(1, 1,
571       getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
572                           AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1),
573                           AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
574                           AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
575       4); // Num Operands
576     AltMappings.push_back(&SSMapping);
577 
578     const InstructionMapping &VVMapping = getInstructionMapping(2, 1,
579       getOperandsMapping({AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
580                           AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
581                           AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
582                           AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}),
583       4); // Num Operands
584     AltMappings.push_back(&VVMapping);
585 
586     return AltMappings;
587   }
588   case TargetOpcode::G_UADDE:
589   case TargetOpcode::G_USUBE:
590   case TargetOpcode::G_SADDE:
591   case TargetOpcode::G_SSUBE: {
592     unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
593     const InstructionMapping &SSMapping = getInstructionMapping(1, 1,
594       getOperandsMapping(
595         {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
596          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1),
597          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
598          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
599          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1)}),
600       5); // Num Operands
601     AltMappings.push_back(&SSMapping);
602 
603     const InstructionMapping &VVMapping = getInstructionMapping(2, 1,
604       getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
605                           AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
606                           AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
607                           AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
608                           AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1)}),
609       5); // Num Operands
610     AltMappings.push_back(&VVMapping);
611     return AltMappings;
612   }
613   case AMDGPU::G_BRCOND: {
614     assert(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1);
615 
616     // TODO: Change type to 32 for scalar
617     const InstructionMapping &SMapping = getInstructionMapping(
618       1, 1, getOperandsMapping(
619         {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1), nullptr}),
620       2); // Num Operands
621     AltMappings.push_back(&SMapping);
622 
623     const InstructionMapping &VMapping = getInstructionMapping(
624       1, 1, getOperandsMapping(
625         {AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1), nullptr }),
626       2); // Num Operands
627     AltMappings.push_back(&VMapping);
628     return AltMappings;
629   }
630   case AMDGPU::G_INTRINSIC:
631     return getInstrAlternativeMappingsIntrinsic(MI, MRI);
632   case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS:
633     return getInstrAlternativeMappingsIntrinsicWSideEffects(MI, MRI);
634   default:
635     break;
636   }
637   return RegisterBankInfo::getInstrAlternativeMappings(MI);
638 }
639 
640 void AMDGPURegisterBankInfo::split64BitValueForMapping(
641   MachineIRBuilder &B,
642   SmallVector<Register, 2> &Regs,
643   LLT HalfTy,
644   Register Reg) const {
645   assert(HalfTy.getSizeInBits() == 32);
646   MachineRegisterInfo *MRI = B.getMRI();
647   Register LoLHS = MRI->createGenericVirtualRegister(HalfTy);
648   Register HiLHS = MRI->createGenericVirtualRegister(HalfTy);
649   const RegisterBank *Bank = getRegBank(Reg, *MRI, *TRI);
650   MRI->setRegBank(LoLHS, *Bank);
651   MRI->setRegBank(HiLHS, *Bank);
652 
653   Regs.push_back(LoLHS);
654   Regs.push_back(HiLHS);
655 
656   B.buildInstr(AMDGPU::G_UNMERGE_VALUES)
657     .addDef(LoLHS)
658     .addDef(HiLHS)
659     .addUse(Reg);
660 }
661 
662 /// Replace the current type each register in \p Regs has with \p NewTy
663 static void setRegsToType(MachineRegisterInfo &MRI, ArrayRef<Register> Regs,
664                           LLT NewTy) {
665   for (Register Reg : Regs) {
666     assert(MRI.getType(Reg).getSizeInBits() == NewTy.getSizeInBits());
667     MRI.setType(Reg, NewTy);
668   }
669 }
670 
671 static LLT getHalfSizedType(LLT Ty) {
672   if (Ty.isVector()) {
673     assert(Ty.getElementCount().isKnownMultipleOf(2));
674     return LLT::scalarOrVector(Ty.getElementCount().divideCoefficientBy(2),
675                                Ty.getElementType());
676   }
677 
678   assert(Ty.getScalarSizeInBits() % 2 == 0);
679   return LLT::scalar(Ty.getScalarSizeInBits() / 2);
680 }
681 
682 // Build one or more V_READFIRSTLANE_B32 instructions to move the given vector
683 // source value into a scalar register.
684 Register AMDGPURegisterBankInfo::buildReadFirstLane(MachineIRBuilder &B,
685                                                     MachineRegisterInfo &MRI,
686                                                     Register Src) const {
687   LLT Ty = MRI.getType(Src);
688   const RegisterBank *Bank = getRegBank(Src, MRI, *TRI);
689 
690   if (Bank == &AMDGPU::SGPRRegBank)
691     return Src;
692 
693   unsigned Bits = Ty.getSizeInBits();
694   assert(Bits % 32 == 0);
695 
696   if (Bank != &AMDGPU::VGPRRegBank) {
697     // We need to copy from AGPR to VGPR
698     Src = B.buildCopy(Ty, Src).getReg(0);
699     MRI.setRegBank(Src, AMDGPU::VGPRRegBank);
700   }
701 
702   LLT S32 = LLT::scalar(32);
703   unsigned NumParts = Bits / 32;
704   SmallVector<Register, 8> SrcParts;
705   SmallVector<Register, 8> DstParts;
706 
707   if (Bits == 32) {
708     SrcParts.push_back(Src);
709   } else {
710     auto Unmerge = B.buildUnmerge(S32, Src);
711     for (unsigned i = 0; i < NumParts; ++i)
712       SrcParts.push_back(Unmerge.getReg(i));
713   }
714 
715   for (unsigned i = 0; i < NumParts; ++i) {
716     Register SrcPart = SrcParts[i];
717     Register DstPart = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
718     MRI.setType(DstPart, NumParts == 1 ? Ty : S32);
719 
720     const TargetRegisterClass *Constrained =
721         constrainGenericRegister(SrcPart, AMDGPU::VGPR_32RegClass, MRI);
722     (void)Constrained;
723     assert(Constrained && "Failed to constrain readfirstlane src reg");
724 
725     B.buildInstr(AMDGPU::V_READFIRSTLANE_B32, {DstPart}, {SrcPart});
726 
727     DstParts.push_back(DstPart);
728   }
729 
730   if (Bits == 32)
731     return DstParts[0];
732 
733   Register Dst = B.buildMergeLikeInstr(Ty, DstParts).getReg(0);
734   MRI.setRegBank(Dst, AMDGPU::SGPRRegBank);
735   return Dst;
736 }
737 
738 /// Legalize instruction \p MI where operands in \p OpIndices must be SGPRs. If
739 /// any of the required SGPR operands are VGPRs, perform a waterfall loop to
740 /// execute the instruction for each unique combination of values in all lanes
741 /// in the wave. The block will be split such that rest of the instructions are
742 /// moved to a new block.
743 ///
744 /// Essentially performs this loop:
745 //
746 /// Save Execution Mask
747 /// For (Lane : Wavefront) {
748 ///   Enable Lane, Disable all other lanes
749 ///   SGPR = read SGPR value for current lane from VGPR
750 ///   VGPRResult[Lane] = use_op SGPR
751 /// }
752 /// Restore Execution Mask
753 ///
754 /// There is additional complexity to try for compare values to identify the
755 /// unique values used.
756 bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
757   MachineIRBuilder &B,
758   iterator_range<MachineBasicBlock::iterator> Range,
759   SmallSet<Register, 4> &SGPROperandRegs,
760   MachineRegisterInfo &MRI) const {
761 
762   // Track use registers which have already been expanded with a readfirstlane
763   // sequence. This may have multiple uses if moving a sequence.
764   DenseMap<Register, Register> WaterfalledRegMap;
765 
766   MachineBasicBlock &MBB = B.getMBB();
767   MachineFunction *MF = &B.getMF();
768 
769   const TargetRegisterClass *WaveRC = TRI->getWaveMaskRegClass();
770   const unsigned MovExecOpc =
771       Subtarget.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
772   const unsigned MovExecTermOpc =
773       Subtarget.isWave32() ? AMDGPU::S_MOV_B32_term : AMDGPU::S_MOV_B64_term;
774 
775   const unsigned XorTermOpc = Subtarget.isWave32() ?
776     AMDGPU::S_XOR_B32_term : AMDGPU::S_XOR_B64_term;
777   const unsigned AndSaveExecOpc =  Subtarget.isWave32() ?
778     AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64;
779   const unsigned ExecReg =  Subtarget.isWave32() ?
780     AMDGPU::EXEC_LO : AMDGPU::EXEC;
781 
782 #ifndef NDEBUG
783   const int OrigRangeSize = std::distance(Range.begin(), Range.end());
784 #endif
785 
786   Register SaveExecReg = MRI.createVirtualRegister(WaveRC);
787   Register InitSaveExecReg = MRI.createVirtualRegister(WaveRC);
788 
789   // Don't bother using generic instructions/registers for the exec mask.
790   B.buildInstr(TargetOpcode::IMPLICIT_DEF)
791     .addDef(InitSaveExecReg);
792 
793   Register PhiExec = MRI.createVirtualRegister(WaveRC);
794   Register NewExec = MRI.createVirtualRegister(WaveRC);
795 
796   // To insert the loop we need to split the block. Move everything before this
797   // point to a new block, and insert a new empty block before this instruction.
798   MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
799   MachineBasicBlock *BodyBB = MF->CreateMachineBasicBlock();
800   MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
801   MachineBasicBlock *RestoreExecBB = MF->CreateMachineBasicBlock();
802   MachineFunction::iterator MBBI(MBB);
803   ++MBBI;
804   MF->insert(MBBI, LoopBB);
805   MF->insert(MBBI, BodyBB);
806   MF->insert(MBBI, RestoreExecBB);
807   MF->insert(MBBI, RemainderBB);
808 
809   LoopBB->addSuccessor(BodyBB);
810   BodyBB->addSuccessor(RestoreExecBB);
811   BodyBB->addSuccessor(LoopBB);
812 
813   // Move the rest of the block into a new block.
814   RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
815   RemainderBB->splice(RemainderBB->begin(), &MBB, Range.end(), MBB.end());
816 
817   MBB.addSuccessor(LoopBB);
818   RestoreExecBB->addSuccessor(RemainderBB);
819 
820   B.setInsertPt(*LoopBB, LoopBB->end());
821 
822   B.buildInstr(TargetOpcode::PHI)
823       .addDef(PhiExec)
824       .addReg(InitSaveExecReg)
825       .addMBB(&MBB)
826       .addReg(NewExec)
827       .addMBB(BodyBB);
828 
829   const DebugLoc &DL = B.getDL();
830 
831   MachineInstr &FirstInst = *Range.begin();
832 
833   // Move the instruction into the loop body. Note we moved everything after
834   // Range.end() already into a new block, so Range.end() is no longer valid.
835   BodyBB->splice(BodyBB->end(), &MBB, Range.begin(), MBB.end());
836 
837   // Figure out the iterator range after splicing the instructions.
838   MachineBasicBlock::iterator NewBegin = FirstInst.getIterator();
839   auto NewEnd = BodyBB->end();
840 
841   B.setMBB(*LoopBB);
842 
843   LLT S1 = LLT::scalar(1);
844   Register CondReg;
845 
846   assert(std::distance(NewBegin, NewEnd) == OrigRangeSize);
847 
848   for (MachineInstr &MI : make_range(NewBegin, NewEnd)) {
849     for (MachineOperand &Op : MI.uses()) {
850       if (!Op.isReg() || Op.isDef())
851         continue;
852 
853       Register OldReg = Op.getReg();
854       if (!SGPROperandRegs.count(OldReg))
855         continue;
856 
857       // See if we already processed this register in another instruction in the
858       // sequence.
859       auto OldVal = WaterfalledRegMap.find(OldReg);
860       if (OldVal != WaterfalledRegMap.end()) {
861         Op.setReg(OldVal->second);
862         continue;
863       }
864 
865       Register OpReg = Op.getReg();
866       LLT OpTy = MRI.getType(OpReg);
867 
868       const RegisterBank *OpBank = getRegBank(OpReg, MRI, *TRI);
869       if (OpBank != &AMDGPU::VGPRRegBank) {
870         // Insert copy from AGPR to VGPR before the loop.
871         B.setMBB(MBB);
872         OpReg = B.buildCopy(OpTy, OpReg).getReg(0);
873         MRI.setRegBank(OpReg, AMDGPU::VGPRRegBank);
874         B.setMBB(*LoopBB);
875       }
876 
877       Register CurrentLaneReg = buildReadFirstLane(B, MRI, OpReg);
878 
879       // Build the comparison(s).
880       unsigned OpSize = OpTy.getSizeInBits();
881       bool Is64 = OpSize % 64 == 0;
882       unsigned PartSize = Is64 ? 64 : 32;
883       LLT PartTy = LLT::scalar(PartSize);
884       unsigned NumParts = OpSize / PartSize;
885       SmallVector<Register, 8> OpParts;
886       SmallVector<Register, 8> CurrentLaneParts;
887 
888       if (NumParts == 1) {
889         OpParts.push_back(OpReg);
890         CurrentLaneParts.push_back(CurrentLaneReg);
891       } else {
892         auto UnmergeOp = B.buildUnmerge(PartTy, OpReg);
893         auto UnmergeCurrentLane = B.buildUnmerge(PartTy, CurrentLaneReg);
894         for (unsigned i = 0; i < NumParts; ++i) {
895           OpParts.push_back(UnmergeOp.getReg(i));
896           CurrentLaneParts.push_back(UnmergeCurrentLane.getReg(i));
897           MRI.setRegBank(OpParts[i], AMDGPU::VGPRRegBank);
898           MRI.setRegBank(CurrentLaneParts[i], AMDGPU::SGPRRegBank);
899         }
900       }
901 
902       for (unsigned i = 0; i < NumParts; ++i) {
903         auto CmpReg = B.buildICmp(CmpInst::ICMP_EQ, S1, CurrentLaneParts[i],
904                                   OpParts[i]).getReg(0);
905         MRI.setRegBank(CmpReg, AMDGPU::VCCRegBank);
906 
907         if (!CondReg) {
908           CondReg = CmpReg;
909         } else {
910           CondReg = B.buildAnd(S1, CondReg, CmpReg).getReg(0);
911           MRI.setRegBank(CondReg, AMDGPU::VCCRegBank);
912         }
913       }
914 
915       Op.setReg(CurrentLaneReg);
916 
917       // Make sure we don't re-process this register again.
918       WaterfalledRegMap.insert(std::pair(OldReg, Op.getReg()));
919     }
920   }
921 
922   // The ballot becomes a no-op during instruction selection.
923   CondReg = B.buildIntrinsic(Intrinsic::amdgcn_ballot,
924                              {LLT::scalar(Subtarget.isWave32() ? 32 : 64)},
925                              false)
926                 .addReg(CondReg)
927                 .getReg(0);
928   MRI.setRegClass(CondReg, WaveRC);
929 
930   // Update EXEC, save the original EXEC value to VCC.
931   B.buildInstr(AndSaveExecOpc)
932     .addDef(NewExec)
933     .addReg(CondReg, RegState::Kill);
934 
935   MRI.setSimpleHint(NewExec, CondReg);
936 
937   B.setInsertPt(*BodyBB, BodyBB->end());
938 
939   // Update EXEC, switch all done bits to 0 and all todo bits to 1.
940   B.buildInstr(XorTermOpc)
941     .addDef(ExecReg)
942     .addReg(ExecReg)
943     .addReg(NewExec);
944 
945   // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
946   // s_cbranch_scc0?
947 
948   // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
949   B.buildInstr(AMDGPU::SI_WATERFALL_LOOP).addMBB(LoopBB);
950 
951   // Save the EXEC mask before the loop.
952   BuildMI(MBB, MBB.end(), DL, TII->get(MovExecOpc), SaveExecReg)
953     .addReg(ExecReg);
954 
955   // Restore the EXEC mask after the loop.
956   B.setMBB(*RestoreExecBB);
957   B.buildInstr(MovExecTermOpc)
958     .addDef(ExecReg)
959     .addReg(SaveExecReg);
960 
961   // Set the insert point after the original instruction, so any new
962   // instructions will be in the remainder.
963   B.setInsertPt(*RemainderBB, RemainderBB->begin());
964 
965   return true;
966 }
967 
968 // Return any unique registers used by \p MI at \p OpIndices that need to be
969 // handled in a waterfall loop. Returns these registers in \p
970 // SGPROperandRegs. Returns true if there are any operands to handle and a
971 // waterfall loop is necessary.
972 bool AMDGPURegisterBankInfo::collectWaterfallOperands(
973   SmallSet<Register, 4> &SGPROperandRegs, MachineInstr &MI,
974   MachineRegisterInfo &MRI, ArrayRef<unsigned> OpIndices) const {
975   for (unsigned Op : OpIndices) {
976     assert(MI.getOperand(Op).isUse());
977     Register Reg = MI.getOperand(Op).getReg();
978     const RegisterBank *OpBank = getRegBank(Reg, MRI, *TRI);
979     if (OpBank->getID() != AMDGPU::SGPRRegBankID)
980       SGPROperandRegs.insert(Reg);
981   }
982 
983   // No operands need to be replaced, so no need to loop.
984   return !SGPROperandRegs.empty();
985 }
986 
987 bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
988   MachineIRBuilder &B, MachineInstr &MI, MachineRegisterInfo &MRI,
989   ArrayRef<unsigned> OpIndices) const {
990   // Use a set to avoid extra readfirstlanes in the case where multiple operands
991   // are the same register.
992   SmallSet<Register, 4> SGPROperandRegs;
993 
994   if (!collectWaterfallOperands(SGPROperandRegs, MI, MRI, OpIndices))
995     return false;
996 
997   MachineBasicBlock::iterator I = MI.getIterator();
998   return executeInWaterfallLoop(B, make_range(I, std::next(I)),
999                                 SGPROperandRegs, MRI);
1000 }
1001 
1002 bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
1003   MachineInstr &MI, MachineRegisterInfo &MRI,
1004   ArrayRef<unsigned> OpIndices) const {
1005   MachineIRBuilder B(MI);
1006   return executeInWaterfallLoop(B, MI, MRI, OpIndices);
1007 }
1008 
1009 // Legalize an operand that must be an SGPR by inserting a readfirstlane.
1010 void AMDGPURegisterBankInfo::constrainOpWithReadfirstlane(
1011     MachineInstr &MI, MachineRegisterInfo &MRI, unsigned OpIdx) const {
1012   Register Reg = MI.getOperand(OpIdx).getReg();
1013   const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
1014   if (Bank == &AMDGPU::SGPRRegBank)
1015     return;
1016 
1017   MachineIRBuilder B(MI);
1018 
1019   Reg = buildReadFirstLane(B, MRI, Reg);
1020   MI.getOperand(OpIdx).setReg(Reg);
1021 }
1022 
1023 /// Split \p Ty into 2 pieces. The first will have \p FirstSize bits, and the
1024 /// rest will be in the remainder.
1025 static std::pair<LLT, LLT> splitUnequalType(LLT Ty, unsigned FirstSize) {
1026   unsigned TotalSize = Ty.getSizeInBits();
1027   if (!Ty.isVector())
1028     return {LLT::scalar(FirstSize), LLT::scalar(TotalSize - FirstSize)};
1029 
1030   LLT EltTy = Ty.getElementType();
1031   unsigned EltSize = EltTy.getSizeInBits();
1032   assert(FirstSize % EltSize == 0);
1033 
1034   unsigned FirstPartNumElts = FirstSize / EltSize;
1035   unsigned RemainderElts = (TotalSize - FirstSize) / EltSize;
1036 
1037   return {LLT::scalarOrVector(ElementCount::getFixed(FirstPartNumElts), EltTy),
1038           LLT::scalarOrVector(ElementCount::getFixed(RemainderElts), EltTy)};
1039 }
1040 
1041 static LLT widen96To128(LLT Ty) {
1042   if (!Ty.isVector())
1043     return LLT::scalar(128);
1044 
1045   LLT EltTy = Ty.getElementType();
1046   assert(128 % EltTy.getSizeInBits() == 0);
1047   return LLT::fixed_vector(128 / EltTy.getSizeInBits(), EltTy);
1048 }
1049 
1050 bool AMDGPURegisterBankInfo::applyMappingLoad(MachineInstr &MI,
1051                         const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
1052                                               MachineRegisterInfo &MRI) const {
1053   Register DstReg = MI.getOperand(0).getReg();
1054   const LLT LoadTy = MRI.getType(DstReg);
1055   unsigned LoadSize = LoadTy.getSizeInBits();
1056   const unsigned MaxNonSmrdLoadSize = 128;
1057 
1058   const RegisterBank *DstBank =
1059       OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
1060   if (DstBank == &AMDGPU::SGPRRegBank) {
1061     // There are some special cases that we need to look at for 32 bit and 96
1062     // bit SGPR loads otherwise we have nothing to do.
1063     if (LoadSize != 32 && LoadSize != 96)
1064       return false;
1065 
1066     MachineMemOperand *MMO = *MI.memoperands_begin();
1067     const unsigned MemSize = 8 * MMO->getSize();
1068     // Scalar loads of size 8 or 16 bit with proper alignment may be widened to
1069     // 32 bit. Check to see if we need to widen the memory access, 8 or 16 bit
1070     // scalar loads should have a load size of 32 but memory access size of less
1071     // than 32.
1072     if (LoadSize == 32 &&
1073         (MemSize == 32 || LoadTy.isVector() || !isScalarLoadLegal(MI)))
1074       return false;
1075 
1076     Register PtrReg = MI.getOperand(1).getReg();
1077 
1078     ApplyRegBankMapping O(*this, MRI, &AMDGPU::SGPRRegBank);
1079     MachineIRBuilder B(MI, O);
1080 
1081     if (LoadSize == 32) {
1082       // This is an extending load from a sub-dword size. Widen the memory
1083       // access size to 4 bytes and clear the extra high bits appropriately
1084       const LLT S32 = LLT::scalar(32);
1085       if (MI.getOpcode() == AMDGPU::G_SEXTLOAD) {
1086         // Must extend the sign bit into higher bits for a G_SEXTLOAD
1087         auto WideLoad = B.buildLoadFromOffset(S32, PtrReg, *MMO, 0);
1088         B.buildSExtInReg(MI.getOperand(0), WideLoad, MemSize);
1089       } else if (MI.getOpcode() == AMDGPU::G_ZEXTLOAD) {
1090         // Must extend zero into higher bits with an AND for a G_ZEXTLOAD
1091         auto WideLoad = B.buildLoadFromOffset(S32, PtrReg, *MMO, 0);
1092         B.buildZExtInReg(MI.getOperand(0), WideLoad, MemSize);
1093       } else
1094         // We do not need to touch the higher bits for regular loads.
1095         B.buildLoadFromOffset(MI.getOperand(0), PtrReg, *MMO, 0);
1096     } else {
1097       // 96-bit loads are only available for vector loads. We need to split this
1098       // into a 64-bit part, and 32 (unless we can widen to a 128-bit load).
1099       if (MMO->getAlign() < Align(16)) {
1100         MachineFunction *MF = MI.getParent()->getParent();
1101         ApplyRegBankMapping ApplyBank(*this, MRI, DstBank);
1102         MachineIRBuilder B(MI, ApplyBank);
1103         LegalizerHelper Helper(*MF, ApplyBank, B);
1104         LLT Part64, Part32;
1105         std::tie(Part64, Part32) = splitUnequalType(LoadTy, 64);
1106         if (Helper.reduceLoadStoreWidth(cast<GAnyLoad>(MI), 0, Part64) !=
1107             LegalizerHelper::Legalized)
1108           return false;
1109         return true;
1110       } else {
1111         LLT WiderTy = widen96To128(LoadTy);
1112         auto WideLoad = B.buildLoadFromOffset(WiderTy, PtrReg, *MMO, 0);
1113         if (WiderTy.isScalar())
1114           B.buildTrunc(MI.getOperand(0), WideLoad);
1115         else {
1116           B.buildDeleteTrailingVectorElements(MI.getOperand(0).getReg(),
1117                                               WideLoad);
1118         }
1119       }
1120     }
1121 
1122     MI.eraseFromParent();
1123     return true;
1124   }
1125 
1126   // 128-bit loads are supported for all instruction types.
1127   if (LoadSize <= MaxNonSmrdLoadSize)
1128     return false;
1129 
1130   SmallVector<Register, 16> DefRegs(OpdMapper.getVRegs(0));
1131   SmallVector<Register, 1> SrcRegs(OpdMapper.getVRegs(1));
1132 
1133   if (SrcRegs.empty())
1134     SrcRegs.push_back(MI.getOperand(1).getReg());
1135 
1136   assert(LoadSize % MaxNonSmrdLoadSize == 0);
1137 
1138   // RegBankSelect only emits scalar types, so we need to reset the pointer
1139   // operand to a pointer type.
1140   Register BasePtrReg = SrcRegs[0];
1141   LLT PtrTy = MRI.getType(MI.getOperand(1).getReg());
1142   MRI.setType(BasePtrReg, PtrTy);
1143 
1144   unsigned NumSplitParts = LoadTy.getSizeInBits() / MaxNonSmrdLoadSize;
1145   const LLT LoadSplitTy = LoadTy.divide(NumSplitParts);
1146   ApplyRegBankMapping Observer(*this, MRI, &AMDGPU::VGPRRegBank);
1147   MachineIRBuilder B(MI, Observer);
1148   LegalizerHelper Helper(B.getMF(), Observer, B);
1149 
1150   if (LoadTy.isVector()) {
1151     if (Helper.fewerElementsVector(MI, 0, LoadSplitTy) != LegalizerHelper::Legalized)
1152       return false;
1153   } else {
1154     if (Helper.narrowScalar(MI, 0, LoadSplitTy) != LegalizerHelper::Legalized)
1155       return false;
1156   }
1157 
1158   MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
1159   return true;
1160 }
1161 
1162 bool AMDGPURegisterBankInfo::applyMappingDynStackAlloc(
1163   MachineInstr &MI,
1164   const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
1165   MachineRegisterInfo &MRI) const {
1166   const MachineFunction &MF = *MI.getMF();
1167   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1168   const auto &TFI = *ST.getFrameLowering();
1169 
1170   // Guard in case the stack growth direction ever changes with scratch
1171   // instructions.
1172   if (TFI.getStackGrowthDirection() == TargetFrameLowering::StackGrowsDown)
1173     return false;
1174 
1175   Register Dst = MI.getOperand(0).getReg();
1176   Register AllocSize = MI.getOperand(1).getReg();
1177   Align Alignment = assumeAligned(MI.getOperand(2).getImm());
1178 
1179   const RegisterBank *SizeBank = getRegBank(AllocSize, MRI, *TRI);
1180 
1181   // TODO: Need to emit a wave reduction to get the maximum size.
1182   if (SizeBank != &AMDGPU::SGPRRegBank)
1183     return false;
1184 
1185   LLT PtrTy = MRI.getType(Dst);
1186   LLT IntPtrTy = LLT::scalar(PtrTy.getSizeInBits());
1187 
1188   const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1189   Register SPReg = Info->getStackPtrOffsetReg();
1190   ApplyRegBankMapping ApplyBank(*this, MRI, &AMDGPU::SGPRRegBank);
1191   MachineIRBuilder B(MI, ApplyBank);
1192 
1193   auto WaveSize = B.buildConstant(LLT::scalar(32), ST.getWavefrontSizeLog2());
1194   auto ScaledSize = B.buildShl(IntPtrTy, AllocSize, WaveSize);
1195 
1196   auto SPCopy = B.buildCopy(PtrTy, SPReg);
1197   if (Alignment > TFI.getStackAlign()) {
1198     auto PtrAdd = B.buildPtrAdd(PtrTy, SPCopy, ScaledSize);
1199     B.buildMaskLowPtrBits(Dst, PtrAdd,
1200                           Log2(Alignment) + ST.getWavefrontSizeLog2());
1201   } else {
1202     B.buildPtrAdd(Dst, SPCopy, ScaledSize);
1203   }
1204 
1205   MI.eraseFromParent();
1206   return true;
1207 }
1208 
1209 bool AMDGPURegisterBankInfo::applyMappingImage(
1210     MachineInstr &MI, const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
1211     MachineRegisterInfo &MRI, int RsrcIdx) const {
1212   const int NumDefs = MI.getNumExplicitDefs();
1213 
1214   // The reported argument index is relative to the IR intrinsic call arguments,
1215   // so we need to shift by the number of defs and the intrinsic ID.
1216   RsrcIdx += NumDefs + 1;
1217 
1218   // Insert copies to VGPR arguments.
1219   applyDefaultMapping(OpdMapper);
1220 
1221   // Fixup any SGPR arguments.
1222   SmallVector<unsigned, 4> SGPRIndexes;
1223   for (int I = NumDefs, NumOps = MI.getNumOperands(); I != NumOps; ++I) {
1224     if (!MI.getOperand(I).isReg())
1225       continue;
1226 
1227     // If this intrinsic has a sampler, it immediately follows rsrc.
1228     if (I == RsrcIdx || I == RsrcIdx + 1)
1229       SGPRIndexes.push_back(I);
1230   }
1231 
1232   executeInWaterfallLoop(MI, MRI, SGPRIndexes);
1233   return true;
1234 }
1235 
1236 static Register getSrcRegIgnoringCopies(const MachineRegisterInfo &MRI,
1237                                         Register Reg) {
1238   MachineInstr *Def = getDefIgnoringCopies(Reg, MRI);
1239   if (!Def)
1240     return Reg;
1241 
1242   // TODO: Guard against this being an implicit def
1243   return Def->getOperand(0).getReg();
1244 }
1245 
1246 // Analyze a combined offset from an llvm.amdgcn.s.buffer intrinsic and store
1247 // the three offsets (voffset, soffset and instoffset)
1248 static unsigned setBufferOffsets(MachineIRBuilder &B,
1249                                  const AMDGPURegisterBankInfo &RBI,
1250                                  Register CombinedOffset, Register &VOffsetReg,
1251                                  Register &SOffsetReg, int64_t &InstOffsetVal,
1252                                  Align Alignment) {
1253   const LLT S32 = LLT::scalar(32);
1254   MachineRegisterInfo *MRI = B.getMRI();
1255 
1256   if (std::optional<int64_t> Imm =
1257           getIConstantVRegSExtVal(CombinedOffset, *MRI)) {
1258     uint32_t SOffset, ImmOffset;
1259     if (AMDGPU::splitMUBUFOffset(*Imm, SOffset, ImmOffset, &RBI.Subtarget,
1260                                  Alignment)) {
1261       VOffsetReg = B.buildConstant(S32, 0).getReg(0);
1262       SOffsetReg = B.buildConstant(S32, SOffset).getReg(0);
1263       InstOffsetVal = ImmOffset;
1264 
1265       B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank);
1266       B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank);
1267       return SOffset + ImmOffset;
1268     }
1269   }
1270 
1271   Register Base;
1272   unsigned Offset;
1273 
1274   std::tie(Base, Offset) =
1275       AMDGPU::getBaseWithConstantOffset(*MRI, CombinedOffset);
1276 
1277   uint32_t SOffset, ImmOffset;
1278   if ((int)Offset > 0 && AMDGPU::splitMUBUFOffset(Offset, SOffset, ImmOffset,
1279                                                   &RBI.Subtarget, Alignment)) {
1280     if (RBI.getRegBank(Base, *MRI, *RBI.TRI) == &AMDGPU::VGPRRegBank) {
1281       VOffsetReg = Base;
1282       SOffsetReg = B.buildConstant(S32, SOffset).getReg(0);
1283       B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank);
1284       InstOffsetVal = ImmOffset;
1285       return 0; // XXX - Why is this 0?
1286     }
1287 
1288     // If we have SGPR base, we can use it for soffset.
1289     if (SOffset == 0) {
1290       VOffsetReg = B.buildConstant(S32, 0).getReg(0);
1291       B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank);
1292       SOffsetReg = Base;
1293       InstOffsetVal = ImmOffset;
1294       return 0; // XXX - Why is this 0?
1295     }
1296   }
1297 
1298   // Handle the variable sgpr + vgpr case.
1299   MachineInstr *Add = getOpcodeDef(AMDGPU::G_ADD, CombinedOffset, *MRI);
1300   if (Add && (int)Offset >= 0) {
1301     Register Src0 = getSrcRegIgnoringCopies(*MRI, Add->getOperand(1).getReg());
1302     Register Src1 = getSrcRegIgnoringCopies(*MRI, Add->getOperand(2).getReg());
1303 
1304     const RegisterBank *Src0Bank = RBI.getRegBank(Src0, *MRI, *RBI.TRI);
1305     const RegisterBank *Src1Bank = RBI.getRegBank(Src1, *MRI, *RBI.TRI);
1306 
1307     if (Src0Bank == &AMDGPU::VGPRRegBank && Src1Bank == &AMDGPU::SGPRRegBank) {
1308       VOffsetReg = Src0;
1309       SOffsetReg = Src1;
1310       return 0;
1311     }
1312 
1313     if (Src0Bank == &AMDGPU::SGPRRegBank && Src1Bank == &AMDGPU::VGPRRegBank) {
1314       VOffsetReg = Src1;
1315       SOffsetReg = Src0;
1316       return 0;
1317     }
1318   }
1319 
1320   // Ensure we have a VGPR for the combined offset. This could be an issue if we
1321   // have an SGPR offset and a VGPR resource.
1322   if (RBI.getRegBank(CombinedOffset, *MRI, *RBI.TRI) == &AMDGPU::VGPRRegBank) {
1323     VOffsetReg = CombinedOffset;
1324   } else {
1325     VOffsetReg = B.buildCopy(S32, CombinedOffset).getReg(0);
1326     B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank);
1327   }
1328 
1329   SOffsetReg = B.buildConstant(S32, 0).getReg(0);
1330   B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank);
1331   return 0;
1332 }
1333 
1334 bool AMDGPURegisterBankInfo::applyMappingSBufferLoad(
1335   const OperandsMapper &OpdMapper) const {
1336   MachineInstr &MI = OpdMapper.getMI();
1337   MachineRegisterInfo &MRI = OpdMapper.getMRI();
1338 
1339   const LLT S32 = LLT::scalar(32);
1340   Register Dst = MI.getOperand(0).getReg();
1341   LLT Ty = MRI.getType(Dst);
1342 
1343   const RegisterBank *RSrcBank =
1344     OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
1345   const RegisterBank *OffsetBank =
1346     OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
1347   if (RSrcBank == &AMDGPU::SGPRRegBank &&
1348       OffsetBank == &AMDGPU::SGPRRegBank)
1349     return true; // Legal mapping
1350 
1351   // FIXME: 96-bit case was widened during legalize. We need to narrow it back
1352   // here but don't have an MMO.
1353 
1354   unsigned LoadSize = Ty.getSizeInBits();
1355   int NumLoads = 1;
1356   if (LoadSize == 256 || LoadSize == 512) {
1357     NumLoads = LoadSize / 128;
1358     Ty = Ty.divide(NumLoads);
1359   }
1360 
1361   // Use the alignment to ensure that the required offsets will fit into the
1362   // immediate offsets.
1363   const Align Alignment = NumLoads > 1 ? Align(16 * NumLoads) : Align(1);
1364 
1365   MachineIRBuilder B(MI);
1366   MachineFunction &MF = B.getMF();
1367 
1368   Register SOffset;
1369   Register VOffset;
1370   int64_t ImmOffset = 0;
1371 
1372   unsigned MMOOffset = setBufferOffsets(B, *this, MI.getOperand(2).getReg(),
1373                                         VOffset, SOffset, ImmOffset, Alignment);
1374 
1375   // TODO: 96-bit loads were widened to 128-bit results. Shrink the result if we
1376   // can, but we need to track an MMO for that.
1377   const unsigned MemSize = (Ty.getSizeInBits() + 7) / 8;
1378   const Align MemAlign(4); // FIXME: ABI type alignment?
1379   MachineMemOperand *BaseMMO = MF.getMachineMemOperand(
1380     MachinePointerInfo(),
1381     MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1382     MachineMemOperand::MOInvariant,
1383     MemSize, MemAlign);
1384   if (MMOOffset != 0)
1385     BaseMMO = MF.getMachineMemOperand(BaseMMO, MMOOffset, MemSize);
1386 
1387   // If only the offset is divergent, emit a MUBUF buffer load instead. We can
1388   // assume that the buffer is unswizzled.
1389 
1390   Register RSrc = MI.getOperand(1).getReg();
1391   Register VIndex = B.buildConstant(S32, 0).getReg(0);
1392   B.getMRI()->setRegBank(VIndex, AMDGPU::VGPRRegBank);
1393 
1394   SmallVector<Register, 4> LoadParts(NumLoads);
1395 
1396   MachineBasicBlock::iterator MII = MI.getIterator();
1397   MachineInstrSpan Span(MII, &B.getMBB());
1398 
1399   for (int i = 0; i < NumLoads; ++i) {
1400     if (NumLoads == 1) {
1401       LoadParts[i] = Dst;
1402     } else {
1403       LoadParts[i] = MRI.createGenericVirtualRegister(Ty);
1404       MRI.setRegBank(LoadParts[i], AMDGPU::VGPRRegBank);
1405     }
1406 
1407     MachineMemOperand *MMO = BaseMMO;
1408     if (i != 0)
1409       BaseMMO = MF.getMachineMemOperand(BaseMMO, MMOOffset + 16 * i, MemSize);
1410 
1411     B.buildInstr(AMDGPU::G_AMDGPU_BUFFER_LOAD)
1412       .addDef(LoadParts[i])       // vdata
1413       .addUse(RSrc)               // rsrc
1414       .addUse(VIndex)             // vindex
1415       .addUse(VOffset)            // voffset
1416       .addUse(SOffset)            // soffset
1417       .addImm(ImmOffset + 16 * i) // offset(imm)
1418       .addImm(0)                  // cachepolicy, swizzled buffer(imm)
1419       .addImm(0)                  // idxen(imm)
1420       .addMemOperand(MMO);
1421   }
1422 
1423   // TODO: If only the resource is a VGPR, it may be better to execute the
1424   // scalar load in the waterfall loop if the resource is expected to frequently
1425   // be dynamically uniform.
1426   if (RSrcBank != &AMDGPU::SGPRRegBank) {
1427     // Remove the original instruction to avoid potentially confusing the
1428     // waterfall loop logic.
1429     B.setInstr(*Span.begin());
1430     MI.eraseFromParent();
1431 
1432     SmallSet<Register, 4> OpsToWaterfall;
1433 
1434     OpsToWaterfall.insert(RSrc);
1435     executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()),
1436                            OpsToWaterfall, MRI);
1437   }
1438 
1439   if (NumLoads != 1) {
1440     if (Ty.isVector())
1441       B.buildConcatVectors(Dst, LoadParts);
1442     else
1443       B.buildMergeLikeInstr(Dst, LoadParts);
1444   }
1445 
1446   // We removed the instruction earlier with a waterfall loop.
1447   if (RSrcBank == &AMDGPU::SGPRRegBank)
1448     MI.eraseFromParent();
1449 
1450   return true;
1451 }
1452 
1453 bool AMDGPURegisterBankInfo::applyMappingBFE(const OperandsMapper &OpdMapper,
1454                                              bool Signed) const {
1455   MachineInstr &MI = OpdMapper.getMI();
1456   MachineRegisterInfo &MRI = OpdMapper.getMRI();
1457 
1458   // Insert basic copies
1459   applyDefaultMapping(OpdMapper);
1460 
1461   Register DstReg = MI.getOperand(0).getReg();
1462   LLT Ty = MRI.getType(DstReg);
1463 
1464   const LLT S32 = LLT::scalar(32);
1465 
1466   unsigned FirstOpnd = MI.getOpcode() == AMDGPU::G_INTRINSIC ? 2 : 1;
1467   Register SrcReg = MI.getOperand(FirstOpnd).getReg();
1468   Register OffsetReg = MI.getOperand(FirstOpnd + 1).getReg();
1469   Register WidthReg = MI.getOperand(FirstOpnd + 2).getReg();
1470 
1471   const RegisterBank *DstBank =
1472     OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
1473   if (DstBank == &AMDGPU::VGPRRegBank) {
1474     if (Ty == S32)
1475       return true;
1476 
1477     // There is no 64-bit vgpr bitfield extract instructions so the operation
1478     // is expanded to a sequence of instructions that implement the operation.
1479     ApplyRegBankMapping ApplyBank(*this, MRI, &AMDGPU::VGPRRegBank);
1480     MachineIRBuilder B(MI, ApplyBank);
1481 
1482     const LLT S64 = LLT::scalar(64);
1483     // Shift the source operand so that extracted bits start at bit 0.
1484     auto ShiftOffset = Signed ? B.buildAShr(S64, SrcReg, OffsetReg)
1485                               : B.buildLShr(S64, SrcReg, OffsetReg);
1486     auto UnmergeSOffset = B.buildUnmerge({S32, S32}, ShiftOffset);
1487 
1488     // A 64-bit bitfield extract uses the 32-bit bitfield extract instructions
1489     // if the width is a constant.
1490     if (auto ConstWidth = getIConstantVRegValWithLookThrough(WidthReg, MRI)) {
1491       // Use the 32-bit bitfield extract instruction if the width is a constant.
1492       // Depending on the width size, use either the low or high 32-bits.
1493       auto Zero = B.buildConstant(S32, 0);
1494       auto WidthImm = ConstWidth->Value.getZExtValue();
1495       if (WidthImm <= 32) {
1496         // Use bitfield extract on the lower 32-bit source, and then sign-extend
1497         // or clear the upper 32-bits.
1498         auto Extract =
1499             Signed ? B.buildSbfx(S32, UnmergeSOffset.getReg(0), Zero, WidthReg)
1500                    : B.buildUbfx(S32, UnmergeSOffset.getReg(0), Zero, WidthReg);
1501         auto Extend =
1502             Signed ? B.buildAShr(S32, Extract, B.buildConstant(S32, 31)) : Zero;
1503         B.buildMergeLikeInstr(DstReg, {Extract, Extend});
1504       } else {
1505         // Use bitfield extract on upper 32-bit source, and combine with lower
1506         // 32-bit source.
1507         auto UpperWidth = B.buildConstant(S32, WidthImm - 32);
1508         auto Extract =
1509             Signed
1510                 ? B.buildSbfx(S32, UnmergeSOffset.getReg(1), Zero, UpperWidth)
1511                 : B.buildUbfx(S32, UnmergeSOffset.getReg(1), Zero, UpperWidth);
1512         B.buildMergeLikeInstr(DstReg, {UnmergeSOffset.getReg(0), Extract});
1513       }
1514       MI.eraseFromParent();
1515       return true;
1516     }
1517 
1518     // Expand to Src >> Offset << (64 - Width) >> (64 - Width) using 64-bit
1519     // operations.
1520     auto ExtShift = B.buildSub(S32, B.buildConstant(S32, 64), WidthReg);
1521     auto SignBit = B.buildShl(S64, ShiftOffset, ExtShift);
1522     if (Signed)
1523       B.buildAShr(S64, SignBit, ExtShift);
1524     else
1525       B.buildLShr(S64, SignBit, ExtShift);
1526     MI.eraseFromParent();
1527     return true;
1528   }
1529 
1530   // The scalar form packs the offset and width in a single operand.
1531 
1532   ApplyRegBankMapping ApplyBank(*this, MRI, &AMDGPU::SGPRRegBank);
1533   MachineIRBuilder B(MI, ApplyBank);
1534 
1535   // Ensure the high bits are clear to insert the offset.
1536   auto OffsetMask = B.buildConstant(S32, maskTrailingOnes<unsigned>(6));
1537   auto ClampOffset = B.buildAnd(S32, OffsetReg, OffsetMask);
1538 
1539   // Zeros out the low bits, so don't bother clamping the input value.
1540   auto ShiftWidth = B.buildShl(S32, WidthReg, B.buildConstant(S32, 16));
1541 
1542   // Transformation function, pack the offset and width of a BFE into
1543   // the format expected by the S_BFE_I32 / S_BFE_U32. In the second
1544   // source, bits [5:0] contain the offset and bits [22:16] the width.
1545   auto MergedInputs = B.buildOr(S32, ClampOffset, ShiftWidth);
1546 
1547   // TODO: It might be worth using a pseudo here to avoid scc clobber and
1548   // register class constraints.
1549   unsigned Opc = Ty == S32 ? (Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32) :
1550                              (Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64);
1551 
1552   auto MIB = B.buildInstr(Opc, {DstReg}, {SrcReg, MergedInputs});
1553   if (!constrainSelectedInstRegOperands(*MIB, *TII, *TRI, *this))
1554     llvm_unreachable("failed to constrain BFE");
1555 
1556   MI.eraseFromParent();
1557   return true;
1558 }
1559 
1560 bool AMDGPURegisterBankInfo::applyMappingMAD_64_32(
1561     const OperandsMapper &OpdMapper) const {
1562   MachineInstr &MI = OpdMapper.getMI();
1563   MachineRegisterInfo &MRI = OpdMapper.getMRI();
1564 
1565   // Insert basic copies.
1566   applyDefaultMapping(OpdMapper);
1567 
1568   Register Dst0 = MI.getOperand(0).getReg();
1569   Register Dst1 = MI.getOperand(1).getReg();
1570   Register Src0 = MI.getOperand(2).getReg();
1571   Register Src1 = MI.getOperand(3).getReg();
1572   Register Src2 = MI.getOperand(4).getReg();
1573 
1574   if (MRI.getRegBankOrNull(Src0) == &AMDGPU::VGPRRegBank)
1575     return true;
1576 
1577   bool IsUnsigned = MI.getOpcode() == AMDGPU::G_AMDGPU_MAD_U64_U32;
1578   LLT S1 = LLT::scalar(1);
1579   LLT S32 = LLT::scalar(32);
1580 
1581   bool DstOnValu = MRI.getRegBankOrNull(Src2) == &AMDGPU::VGPRRegBank;
1582   bool Accumulate = true;
1583 
1584   if (!DstOnValu) {
1585     if (mi_match(Src2, MRI, m_ZeroInt()))
1586       Accumulate = false;
1587   }
1588 
1589   // Keep the multiplication on the SALU.
1590   MachineIRBuilder B(MI);
1591 
1592   Register DstHi;
1593   Register DstLo = B.buildMul(S32, Src0, Src1).getReg(0);
1594   bool MulHiInVgpr = false;
1595 
1596   MRI.setRegBank(DstLo, AMDGPU::SGPRRegBank);
1597 
1598   if (Subtarget.hasSMulHi()) {
1599     DstHi = IsUnsigned ? B.buildUMulH(S32, Src0, Src1).getReg(0)
1600                        : B.buildSMulH(S32, Src0, Src1).getReg(0);
1601     MRI.setRegBank(DstHi, AMDGPU::SGPRRegBank);
1602   } else {
1603     Register VSrc0 = B.buildCopy(S32, Src0).getReg(0);
1604     Register VSrc1 = B.buildCopy(S32, Src1).getReg(0);
1605 
1606     MRI.setRegBank(VSrc0, AMDGPU::VGPRRegBank);
1607     MRI.setRegBank(VSrc1, AMDGPU::VGPRRegBank);
1608 
1609     DstHi = IsUnsigned ? B.buildUMulH(S32, VSrc0, VSrc1).getReg(0)
1610                        : B.buildSMulH(S32, VSrc0, VSrc1).getReg(0);
1611     MRI.setRegBank(DstHi, AMDGPU::VGPRRegBank);
1612 
1613     if (!DstOnValu) {
1614       DstHi = buildReadFirstLane(B, MRI, DstHi);
1615     } else {
1616       MulHiInVgpr = true;
1617     }
1618   }
1619 
1620   // Accumulate and produce the "carry-out" bit.
1621   //
1622   // The "carry-out" is defined as bit 64 of the result when computed as a
1623   // big integer. For unsigned multiply-add, this matches the usual definition
1624   // of carry-out. For signed multiply-add, bit 64 is the sign bit of the
1625   // result, which is determined as:
1626   //   sign(Src0 * Src1) + sign(Src2) + carry-out from unsigned 64-bit add
1627   LLT CarryType = DstOnValu ? S1 : S32;
1628   const RegisterBank &CarryBank =
1629       DstOnValu ? AMDGPU::VCCRegBank : AMDGPU::SGPRRegBank;
1630   const RegisterBank &DstBank =
1631       DstOnValu ? AMDGPU::VGPRRegBank : AMDGPU::SGPRRegBank;
1632   Register Carry;
1633   Register Zero;
1634 
1635   if (!IsUnsigned) {
1636     Zero = B.buildConstant(S32, 0).getReg(0);
1637     MRI.setRegBank(Zero,
1638                    MulHiInVgpr ? AMDGPU::VGPRRegBank : AMDGPU::SGPRRegBank);
1639 
1640     Carry = B.buildICmp(CmpInst::ICMP_SLT, MulHiInVgpr ? S1 : S32, DstHi, Zero)
1641                 .getReg(0);
1642     MRI.setRegBank(Carry, MulHiInVgpr ? AMDGPU::VCCRegBank
1643                                       : AMDGPU::SGPRRegBank);
1644 
1645     if (DstOnValu && !MulHiInVgpr) {
1646       Carry = B.buildTrunc(S1, Carry).getReg(0);
1647       MRI.setRegBank(Carry, AMDGPU::VCCRegBank);
1648     }
1649   }
1650 
1651   if (Accumulate) {
1652     if (DstOnValu) {
1653       DstLo = B.buildCopy(S32, DstLo).getReg(0);
1654       DstHi = B.buildCopy(S32, DstHi).getReg(0);
1655       MRI.setRegBank(DstLo, AMDGPU::VGPRRegBank);
1656       MRI.setRegBank(DstHi, AMDGPU::VGPRRegBank);
1657     }
1658 
1659     auto Unmerge = B.buildUnmerge(S32, Src2);
1660     Register Src2Lo = Unmerge.getReg(0);
1661     Register Src2Hi = Unmerge.getReg(1);
1662     MRI.setRegBank(Src2Lo, DstBank);
1663     MRI.setRegBank(Src2Hi, DstBank);
1664 
1665     if (!IsUnsigned) {
1666       auto Src2Sign = B.buildICmp(CmpInst::ICMP_SLT, CarryType, Src2Hi, Zero);
1667       MRI.setRegBank(Src2Sign.getReg(0), CarryBank);
1668 
1669       Carry = B.buildXor(CarryType, Carry, Src2Sign).getReg(0);
1670       MRI.setRegBank(Carry, CarryBank);
1671     }
1672 
1673     auto AddLo = B.buildUAddo(S32, CarryType, DstLo, Src2Lo);
1674     DstLo = AddLo.getReg(0);
1675     Register CarryLo = AddLo.getReg(1);
1676     MRI.setRegBank(DstLo, DstBank);
1677     MRI.setRegBank(CarryLo, CarryBank);
1678 
1679     auto AddHi = B.buildUAdde(S32, CarryType, DstHi, Src2Hi, CarryLo);
1680     DstHi = AddHi.getReg(0);
1681     MRI.setRegBank(DstHi, DstBank);
1682 
1683     Register CarryHi = AddHi.getReg(1);
1684     MRI.setRegBank(CarryHi, CarryBank);
1685 
1686     if (IsUnsigned) {
1687       Carry = CarryHi;
1688     } else {
1689       Carry = B.buildXor(CarryType, Carry, CarryHi).getReg(0);
1690       MRI.setRegBank(Carry, CarryBank);
1691     }
1692   } else {
1693     if (IsUnsigned) {
1694       Carry = B.buildConstant(CarryType, 0).getReg(0);
1695       MRI.setRegBank(Carry, CarryBank);
1696     }
1697   }
1698 
1699   B.buildMergeLikeInstr(Dst0, {DstLo, DstHi});
1700 
1701   if (DstOnValu) {
1702     B.buildCopy(Dst1, Carry);
1703   } else {
1704     B.buildTrunc(Dst1, Carry);
1705   }
1706 
1707   MI.eraseFromParent();
1708   return true;
1709 }
1710 
1711 // Return a suitable opcode for extending the operands of Opc when widening.
1712 static unsigned getExtendOp(unsigned Opc) {
1713   switch (Opc) {
1714   case TargetOpcode::G_ASHR:
1715   case TargetOpcode::G_SMIN:
1716   case TargetOpcode::G_SMAX:
1717     return TargetOpcode::G_SEXT;
1718   case TargetOpcode::G_LSHR:
1719   case TargetOpcode::G_UMIN:
1720   case TargetOpcode::G_UMAX:
1721     return TargetOpcode::G_ZEXT;
1722   default:
1723     return TargetOpcode::G_ANYEXT;
1724   }
1725 }
1726 
1727 // Emit a legalized extension from <2 x s16> to 2 32-bit components, avoiding
1728 // any illegal vector extend or unmerge operations.
1729 static std::pair<Register, Register>
1730 unpackV2S16ToS32(MachineIRBuilder &B, Register Src, unsigned ExtOpcode) {
1731   const LLT S32 = LLT::scalar(32);
1732   auto Bitcast = B.buildBitcast(S32, Src);
1733 
1734   if (ExtOpcode == TargetOpcode::G_SEXT) {
1735     auto ExtLo = B.buildSExtInReg(S32, Bitcast, 16);
1736     auto ShiftHi = B.buildAShr(S32, Bitcast, B.buildConstant(S32, 16));
1737     return std::pair(ExtLo.getReg(0), ShiftHi.getReg(0));
1738   }
1739 
1740   auto ShiftHi = B.buildLShr(S32, Bitcast, B.buildConstant(S32, 16));
1741   if (ExtOpcode == TargetOpcode::G_ZEXT) {
1742     auto ExtLo = B.buildAnd(S32, Bitcast, B.buildConstant(S32, 0xffff));
1743     return std::pair(ExtLo.getReg(0), ShiftHi.getReg(0));
1744   }
1745 
1746   assert(ExtOpcode == TargetOpcode::G_ANYEXT);
1747   return std::pair(Bitcast.getReg(0), ShiftHi.getReg(0));
1748 }
1749 
1750 // For cases where only a single copy is inserted for matching register banks.
1751 // Replace the register in the instruction operand
1752 static bool substituteSimpleCopyRegs(
1753   const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper, unsigned OpIdx) {
1754   SmallVector<unsigned, 1> SrcReg(OpdMapper.getVRegs(OpIdx));
1755   if (!SrcReg.empty()) {
1756     assert(SrcReg.size() == 1);
1757     OpdMapper.getMI().getOperand(OpIdx).setReg(SrcReg[0]);
1758     return true;
1759   }
1760 
1761   return false;
1762 }
1763 
1764 /// Handle register layout difference for f16 images for some subtargets.
1765 Register AMDGPURegisterBankInfo::handleD16VData(MachineIRBuilder &B,
1766                                                 MachineRegisterInfo &MRI,
1767                                                 Register Reg) const {
1768   if (!Subtarget.hasUnpackedD16VMem())
1769     return Reg;
1770 
1771   const LLT S16 = LLT::scalar(16);
1772   LLT StoreVT = MRI.getType(Reg);
1773   if (!StoreVT.isVector() || StoreVT.getElementType() != S16)
1774     return Reg;
1775 
1776   auto Unmerge = B.buildUnmerge(S16, Reg);
1777 
1778 
1779   SmallVector<Register, 4> WideRegs;
1780   for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
1781     WideRegs.push_back(Unmerge.getReg(I));
1782 
1783   const LLT S32 = LLT::scalar(32);
1784   int NumElts = StoreVT.getNumElements();
1785 
1786   return B.buildMergeLikeInstr(LLT::fixed_vector(NumElts, S32), WideRegs)
1787       .getReg(0);
1788 }
1789 
1790 static std::pair<Register, unsigned>
1791 getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg) {
1792   int64_t Const;
1793   if (mi_match(Reg, MRI, m_ICst(Const)))
1794     return std::pair(Register(), Const);
1795 
1796   Register Base;
1797   if (mi_match(Reg, MRI, m_GAdd(m_Reg(Base), m_ICst(Const))))
1798     return std::pair(Base, Const);
1799 
1800   // TODO: Handle G_OR used for add case
1801   return std::pair(Reg, 0);
1802 }
1803 
1804 std::pair<Register, unsigned>
1805 AMDGPURegisterBankInfo::splitBufferOffsets(MachineIRBuilder &B,
1806                                            Register OrigOffset) const {
1807   const unsigned MaxImm = 4095;
1808   Register BaseReg;
1809   unsigned ImmOffset;
1810   const LLT S32 = LLT::scalar(32);
1811 
1812   // TODO: Use AMDGPU::getBaseWithConstantOffset() instead.
1813   std::tie(BaseReg, ImmOffset) = getBaseWithConstantOffset(*B.getMRI(),
1814                                                            OrigOffset);
1815 
1816   unsigned C1 = 0;
1817   if (ImmOffset != 0) {
1818     // If the immediate value is too big for the immoffset field, put the value
1819     // and -4096 into the immoffset field so that the value that is copied/added
1820     // for the voffset field is a multiple of 4096, and it stands more chance
1821     // of being CSEd with the copy/add for another similar load/store.
1822     // However, do not do that rounding down to a multiple of 4096 if that is a
1823     // negative number, as it appears to be illegal to have a negative offset
1824     // in the vgpr, even if adding the immediate offset makes it positive.
1825     unsigned Overflow = ImmOffset & ~MaxImm;
1826     ImmOffset -= Overflow;
1827     if ((int32_t)Overflow < 0) {
1828       Overflow += ImmOffset;
1829       ImmOffset = 0;
1830     }
1831 
1832     C1 = ImmOffset;
1833     if (Overflow != 0) {
1834       if (!BaseReg)
1835         BaseReg = B.buildConstant(S32, Overflow).getReg(0);
1836       else {
1837         auto OverflowVal = B.buildConstant(S32, Overflow);
1838         BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
1839       }
1840     }
1841   }
1842 
1843   if (!BaseReg)
1844     BaseReg = B.buildConstant(S32, 0).getReg(0);
1845 
1846   return {BaseReg, C1};
1847 }
1848 
1849 bool AMDGPURegisterBankInfo::buildVCopy(MachineIRBuilder &B, Register DstReg,
1850                                         Register SrcReg) const {
1851   MachineRegisterInfo &MRI = *B.getMRI();
1852   LLT SrcTy = MRI.getType(SrcReg);
1853   if (SrcTy.getSizeInBits() == 32) {
1854     // Use a v_mov_b32 here to make the exec dependency explicit.
1855     B.buildInstr(AMDGPU::V_MOV_B32_e32)
1856       .addDef(DstReg)
1857       .addUse(SrcReg);
1858     return constrainGenericRegister(DstReg, AMDGPU::VGPR_32RegClass, MRI) &&
1859            constrainGenericRegister(SrcReg, AMDGPU::SReg_32RegClass, MRI);
1860   }
1861 
1862   Register TmpReg0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1863   Register TmpReg1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1864 
1865   B.buildInstr(AMDGPU::V_MOV_B32_e32)
1866     .addDef(TmpReg0)
1867     .addUse(SrcReg, 0, AMDGPU::sub0);
1868   B.buildInstr(AMDGPU::V_MOV_B32_e32)
1869     .addDef(TmpReg1)
1870     .addUse(SrcReg, 0, AMDGPU::sub1);
1871   B.buildInstr(AMDGPU::REG_SEQUENCE)
1872     .addDef(DstReg)
1873     .addUse(TmpReg0)
1874     .addImm(AMDGPU::sub0)
1875     .addUse(TmpReg1)
1876     .addImm(AMDGPU::sub1);
1877 
1878   return constrainGenericRegister(SrcReg, AMDGPU::SReg_64RegClass, MRI) &&
1879          constrainGenericRegister(DstReg, AMDGPU::VReg_64RegClass, MRI);
1880 }
1881 
1882 /// Utility function for pushing dynamic vector indexes with a constant offset
1883 /// into waterfall loops.
1884 static void reinsertVectorIndexAdd(MachineIRBuilder &B,
1885                                    MachineInstr &IdxUseInstr,
1886                                    unsigned OpIdx,
1887                                    unsigned ConstOffset) {
1888   MachineRegisterInfo &MRI = *B.getMRI();
1889   const LLT S32 = LLT::scalar(32);
1890   Register WaterfallIdx = IdxUseInstr.getOperand(OpIdx).getReg();
1891   B.setInsertPt(*IdxUseInstr.getParent(), IdxUseInstr.getIterator());
1892 
1893   auto MaterializedOffset = B.buildConstant(S32, ConstOffset);
1894 
1895   auto Add = B.buildAdd(S32, WaterfallIdx, MaterializedOffset);
1896   MRI.setRegBank(MaterializedOffset.getReg(0), AMDGPU::SGPRRegBank);
1897   MRI.setRegBank(Add.getReg(0), AMDGPU::SGPRRegBank);
1898   IdxUseInstr.getOperand(OpIdx).setReg(Add.getReg(0));
1899 }
1900 
1901 /// Implement extending a 32-bit value to a 64-bit value. \p Lo32Reg is the
1902 /// original 32-bit source value (to be inserted in the low part of the combined
1903 /// 64-bit result), and \p Hi32Reg is the high half of the combined 64-bit
1904 /// value.
1905 static void extendLow32IntoHigh32(MachineIRBuilder &B,
1906                                   Register Hi32Reg, Register Lo32Reg,
1907                                   unsigned ExtOpc,
1908                                   const RegisterBank &RegBank,
1909                                   bool IsBooleanSrc = false) {
1910   if (ExtOpc == AMDGPU::G_ZEXT) {
1911     B.buildConstant(Hi32Reg, 0);
1912   } else if (ExtOpc == AMDGPU::G_SEXT) {
1913     if (IsBooleanSrc) {
1914       // If we know the original source was an s1, the high half is the same as
1915       // the low.
1916       B.buildCopy(Hi32Reg, Lo32Reg);
1917     } else {
1918       // Replicate sign bit from 32-bit extended part.
1919       auto ShiftAmt = B.buildConstant(LLT::scalar(32), 31);
1920       B.getMRI()->setRegBank(ShiftAmt.getReg(0), RegBank);
1921       B.buildAShr(Hi32Reg, Lo32Reg, ShiftAmt);
1922     }
1923   } else {
1924     assert(ExtOpc == AMDGPU::G_ANYEXT && "not an integer extension");
1925     B.buildUndef(Hi32Reg);
1926   }
1927 }
1928 
1929 bool AMDGPURegisterBankInfo::foldExtractEltToCmpSelect(
1930   MachineInstr &MI, MachineRegisterInfo &MRI,
1931   const OperandsMapper &OpdMapper) const {
1932 
1933   Register VecReg = MI.getOperand(1).getReg();
1934   Register Idx = MI.getOperand(2).getReg();
1935 
1936   const RegisterBank &IdxBank =
1937     *OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
1938 
1939   bool IsDivergentIdx = IdxBank != AMDGPU::SGPRRegBank;
1940 
1941   LLT VecTy = MRI.getType(VecReg);
1942   unsigned EltSize = VecTy.getScalarSizeInBits();
1943   unsigned NumElem = VecTy.getNumElements();
1944 
1945   if (!SITargetLowering::shouldExpandVectorDynExt(EltSize, NumElem,
1946                                                   IsDivergentIdx, &Subtarget))
1947     return false;
1948 
1949   MachineIRBuilder B(MI);
1950   LLT S32 = LLT::scalar(32);
1951 
1952   const RegisterBank &DstBank =
1953     *OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
1954   const RegisterBank &SrcBank =
1955     *OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
1956 
1957   const RegisterBank &CCBank =
1958     (DstBank == AMDGPU::SGPRRegBank &&
1959      SrcBank == AMDGPU::SGPRRegBank &&
1960      IdxBank == AMDGPU::SGPRRegBank) ? AMDGPU::SGPRRegBank
1961                                      : AMDGPU::VCCRegBank;
1962   LLT CCTy = (CCBank == AMDGPU::SGPRRegBank) ? S32 : LLT::scalar(1);
1963 
1964   if (CCBank == AMDGPU::VCCRegBank && IdxBank == AMDGPU::SGPRRegBank) {
1965     Idx = B.buildCopy(S32, Idx)->getOperand(0).getReg();
1966     MRI.setRegBank(Idx, AMDGPU::VGPRRegBank);
1967   }
1968 
1969   LLT EltTy = VecTy.getScalarType();
1970   SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0));
1971   unsigned NumLanes = DstRegs.size();
1972   if (!NumLanes)
1973     NumLanes = 1;
1974   else
1975     EltTy = MRI.getType(DstRegs[0]);
1976 
1977   auto UnmergeToEltTy = B.buildUnmerge(EltTy, VecReg);
1978   SmallVector<Register, 2> Res(NumLanes);
1979   for (unsigned L = 0; L < NumLanes; ++L)
1980     Res[L] = UnmergeToEltTy.getReg(L);
1981 
1982   for (unsigned I = 1; I < NumElem; ++I) {
1983     auto IC = B.buildConstant(S32, I);
1984     MRI.setRegBank(IC->getOperand(0).getReg(), AMDGPU::SGPRRegBank);
1985     auto Cmp = B.buildICmp(CmpInst::ICMP_EQ, CCTy, Idx, IC);
1986     MRI.setRegBank(Cmp->getOperand(0).getReg(), CCBank);
1987 
1988     for (unsigned L = 0; L < NumLanes; ++L) {
1989       auto S = B.buildSelect(EltTy, Cmp,
1990                              UnmergeToEltTy.getReg(I * NumLanes + L), Res[L]);
1991 
1992       for (unsigned N : { 0, 2, 3 })
1993         MRI.setRegBank(S->getOperand(N).getReg(), DstBank);
1994 
1995       Res[L] = S->getOperand(0).getReg();
1996     }
1997   }
1998 
1999   for (unsigned L = 0; L < NumLanes; ++L) {
2000     Register DstReg = (NumLanes == 1) ? MI.getOperand(0).getReg() : DstRegs[L];
2001     B.buildCopy(DstReg, Res[L]);
2002     MRI.setRegBank(DstReg, DstBank);
2003   }
2004 
2005   MRI.setRegBank(MI.getOperand(0).getReg(), DstBank);
2006   MI.eraseFromParent();
2007 
2008   return true;
2009 }
2010 
2011 // Insert a cross regbank copy for a register if it already has a bank that
2012 // differs from the one we want to set.
2013 static Register constrainRegToBank(MachineRegisterInfo &MRI,
2014                                    MachineIRBuilder &B, Register &Reg,
2015                                    const RegisterBank &Bank) {
2016   const RegisterBank *CurrBank = MRI.getRegBankOrNull(Reg);
2017   if (CurrBank && *CurrBank != Bank) {
2018     Register Copy = B.buildCopy(MRI.getType(Reg), Reg).getReg(0);
2019     MRI.setRegBank(Copy, Bank);
2020     return Copy;
2021   }
2022 
2023   MRI.setRegBank(Reg, Bank);
2024   return Reg;
2025 }
2026 
2027 bool AMDGPURegisterBankInfo::foldInsertEltToCmpSelect(
2028   MachineInstr &MI, MachineRegisterInfo &MRI,
2029   const OperandsMapper &OpdMapper) const {
2030 
2031   Register VecReg = MI.getOperand(1).getReg();
2032   Register Idx = MI.getOperand(3).getReg();
2033 
2034   const RegisterBank &IdxBank =
2035     *OpdMapper.getInstrMapping().getOperandMapping(3).BreakDown[0].RegBank;
2036 
2037   bool IsDivergentIdx = IdxBank != AMDGPU::SGPRRegBank;
2038 
2039   LLT VecTy = MRI.getType(VecReg);
2040   unsigned EltSize = VecTy.getScalarSizeInBits();
2041   unsigned NumElem = VecTy.getNumElements();
2042 
2043   if (!SITargetLowering::shouldExpandVectorDynExt(EltSize, NumElem,
2044                                                   IsDivergentIdx, &Subtarget))
2045     return false;
2046 
2047   MachineIRBuilder B(MI);
2048   LLT S32 = LLT::scalar(32);
2049 
2050   const RegisterBank &DstBank =
2051     *OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2052   const RegisterBank &SrcBank =
2053     *OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
2054   const RegisterBank &InsBank =
2055     *OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
2056 
2057   const RegisterBank &CCBank =
2058     (DstBank == AMDGPU::SGPRRegBank &&
2059      SrcBank == AMDGPU::SGPRRegBank &&
2060      InsBank == AMDGPU::SGPRRegBank &&
2061      IdxBank == AMDGPU::SGPRRegBank) ? AMDGPU::SGPRRegBank
2062                                      : AMDGPU::VCCRegBank;
2063   LLT CCTy = (CCBank == AMDGPU::SGPRRegBank) ? S32 : LLT::scalar(1);
2064 
2065   if (CCBank == AMDGPU::VCCRegBank && IdxBank == AMDGPU::SGPRRegBank) {
2066     Idx = B.buildCopy(S32, Idx)->getOperand(0).getReg();
2067     MRI.setRegBank(Idx, AMDGPU::VGPRRegBank);
2068   }
2069 
2070   LLT EltTy = VecTy.getScalarType();
2071   SmallVector<Register, 2> InsRegs(OpdMapper.getVRegs(2));
2072   unsigned NumLanes = InsRegs.size();
2073   if (!NumLanes) {
2074     NumLanes = 1;
2075     InsRegs.push_back(MI.getOperand(2).getReg());
2076   } else {
2077     EltTy = MRI.getType(InsRegs[0]);
2078   }
2079 
2080   auto UnmergeToEltTy = B.buildUnmerge(EltTy, VecReg);
2081   SmallVector<Register, 16> Ops(NumElem * NumLanes);
2082 
2083   for (unsigned I = 0; I < NumElem; ++I) {
2084     auto IC = B.buildConstant(S32, I);
2085     MRI.setRegBank(IC->getOperand(0).getReg(), AMDGPU::SGPRRegBank);
2086     auto Cmp = B.buildICmp(CmpInst::ICMP_EQ, CCTy, Idx, IC);
2087     MRI.setRegBank(Cmp->getOperand(0).getReg(), CCBank);
2088 
2089     for (unsigned L = 0; L < NumLanes; ++L) {
2090       Register Op0 = constrainRegToBank(MRI, B, InsRegs[L], DstBank);
2091       Register Op1 = UnmergeToEltTy.getReg(I * NumLanes + L);
2092       Op1 = constrainRegToBank(MRI, B, Op1, DstBank);
2093 
2094       Register Select = B.buildSelect(EltTy, Cmp, Op0, Op1).getReg(0);
2095       MRI.setRegBank(Select, DstBank);
2096 
2097       Ops[I * NumLanes + L] = Select;
2098     }
2099   }
2100 
2101   LLT MergeTy = LLT::fixed_vector(Ops.size(), EltTy);
2102   if (MergeTy == MRI.getType(MI.getOperand(0).getReg())) {
2103     B.buildBuildVector(MI.getOperand(0), Ops);
2104   } else {
2105     auto Vec = B.buildBuildVector(MergeTy, Ops);
2106     MRI.setRegBank(Vec->getOperand(0).getReg(), DstBank);
2107     B.buildBitcast(MI.getOperand(0).getReg(), Vec);
2108   }
2109 
2110   MRI.setRegBank(MI.getOperand(0).getReg(), DstBank);
2111   MI.eraseFromParent();
2112 
2113   return true;
2114 }
2115 
2116 void AMDGPURegisterBankInfo::applyMappingImpl(
2117     const OperandsMapper &OpdMapper) const {
2118   MachineInstr &MI = OpdMapper.getMI();
2119   unsigned Opc = MI.getOpcode();
2120   MachineRegisterInfo &MRI = OpdMapper.getMRI();
2121   switch (Opc) {
2122   case AMDGPU::G_CONSTANT:
2123   case AMDGPU::G_IMPLICIT_DEF: {
2124     Register DstReg = MI.getOperand(0).getReg();
2125     LLT DstTy = MRI.getType(DstReg);
2126     if (DstTy != LLT::scalar(1))
2127       break;
2128 
2129     const RegisterBank *DstBank =
2130         OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2131     if (DstBank == &AMDGPU::VCCRegBank)
2132       break;
2133     SmallVector<Register, 1> DefRegs(OpdMapper.getVRegs(0));
2134     if (DefRegs.empty())
2135       DefRegs.push_back(DstReg);
2136 
2137     MachineIRBuilder B(MI);
2138     B.setInsertPt(*MI.getParent(), ++MI.getIterator());
2139 
2140     Register NewDstReg = MRI.createGenericVirtualRegister(LLT::scalar(32));
2141     LLVMContext &Ctx = B.getMF().getFunction().getContext();
2142 
2143     MI.getOperand(0).setReg(NewDstReg);
2144     if (Opc != AMDGPU::G_IMPLICIT_DEF) {
2145       uint64_t ConstVal = MI.getOperand(1).getCImm()->getZExtValue();
2146       MI.getOperand(1).setCImm(
2147           ConstantInt::get(IntegerType::getInt32Ty(Ctx), ConstVal));
2148     }
2149 
2150     MRI.setRegBank(NewDstReg, *DstBank);
2151     B.buildTrunc(DefRegs[0], NewDstReg);
2152     return;
2153   }
2154   case AMDGPU::G_PHI: {
2155     Register DstReg = MI.getOperand(0).getReg();
2156     LLT DstTy = MRI.getType(DstReg);
2157     if (DstTy != LLT::scalar(1))
2158       break;
2159 
2160     const LLT S32 = LLT::scalar(32);
2161     const RegisterBank *DstBank =
2162       OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2163     if (DstBank == &AMDGPU::VCCRegBank) {
2164       applyDefaultMapping(OpdMapper);
2165       // The standard handling only considers the result register bank for
2166       // phis. For VCC, blindly inserting a copy when the phi is lowered will
2167       // produce an invalid copy. We can only copy with some kind of compare to
2168       // get a vector boolean result. Insert a register bank copy that will be
2169       // correctly lowered to a compare.
2170       MachineIRBuilder B(*MI.getParent()->getParent());
2171 
2172       for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
2173         Register SrcReg = MI.getOperand(I).getReg();
2174         const RegisterBank *SrcBank = getRegBank(SrcReg, MRI, *TRI);
2175 
2176         if (SrcBank != &AMDGPU::VCCRegBank) {
2177           MachineBasicBlock *SrcMBB = MI.getOperand(I + 1).getMBB();
2178           B.setInsertPt(*SrcMBB, SrcMBB->getFirstTerminator());
2179 
2180           auto Copy = B.buildCopy(LLT::scalar(1), SrcReg);
2181           MRI.setRegBank(Copy.getReg(0), AMDGPU::VCCRegBank);
2182           MI.getOperand(I).setReg(Copy.getReg(0));
2183         }
2184       }
2185 
2186       return;
2187     }
2188 
2189     // Phi handling is strange and only considers the bank of the destination.
2190     substituteSimpleCopyRegs(OpdMapper, 0);
2191 
2192     // Promote SGPR/VGPR booleans to s32
2193     MachineFunction *MF = MI.getParent()->getParent();
2194     ApplyRegBankMapping ApplyBank(*this, MRI, DstBank);
2195     MachineIRBuilder B(MI, ApplyBank);
2196     LegalizerHelper Helper(*MF, ApplyBank, B);
2197 
2198     if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized)
2199       llvm_unreachable("widen scalar should have succeeded");
2200 
2201     return;
2202   }
2203   case AMDGPU::G_ICMP:
2204   case AMDGPU::G_UADDO:
2205   case AMDGPU::G_USUBO:
2206   case AMDGPU::G_UADDE:
2207   case AMDGPU::G_SADDE:
2208   case AMDGPU::G_USUBE:
2209   case AMDGPU::G_SSUBE: {
2210     unsigned BoolDstOp = Opc == AMDGPU::G_ICMP ? 0 : 1;
2211     Register DstReg = MI.getOperand(BoolDstOp).getReg();
2212 
2213     const RegisterBank *DstBank =
2214       OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2215     if (DstBank != &AMDGPU::SGPRRegBank)
2216       break;
2217 
2218     const bool HasCarryIn = MI.getNumOperands() == 5;
2219 
2220     // If this is a scalar compare, promote the result to s32, as the selection
2221     // will end up using a copy to a 32-bit vreg.
2222     const LLT S32 = LLT::scalar(32);
2223     Register NewDstReg = MRI.createGenericVirtualRegister(S32);
2224     MRI.setRegBank(NewDstReg, AMDGPU::SGPRRegBank);
2225     MI.getOperand(BoolDstOp).setReg(NewDstReg);
2226     MachineIRBuilder B(MI);
2227 
2228     if (HasCarryIn) {
2229       Register NewSrcReg = MRI.createGenericVirtualRegister(S32);
2230       MRI.setRegBank(NewSrcReg, AMDGPU::SGPRRegBank);
2231       B.buildZExt(NewSrcReg, MI.getOperand(4).getReg());
2232       MI.getOperand(4).setReg(NewSrcReg);
2233     }
2234 
2235     MachineBasicBlock *MBB = MI.getParent();
2236     B.setInsertPt(*MBB, std::next(MI.getIterator()));
2237 
2238     // If we had a constrained VCC result register, a copy was inserted to VCC
2239     // from SGPR.
2240     SmallVector<Register, 1> DefRegs(OpdMapper.getVRegs(0));
2241     if (DefRegs.empty())
2242       DefRegs.push_back(DstReg);
2243     B.buildTrunc(DefRegs[0], NewDstReg);
2244     return;
2245   }
2246   case AMDGPU::G_SELECT: {
2247     Register DstReg = MI.getOperand(0).getReg();
2248     LLT DstTy = MRI.getType(DstReg);
2249 
2250     SmallVector<Register, 1> CondRegs(OpdMapper.getVRegs(1));
2251     if (CondRegs.empty())
2252       CondRegs.push_back(MI.getOperand(1).getReg());
2253     else {
2254       assert(CondRegs.size() == 1);
2255     }
2256 
2257     const RegisterBank *CondBank = getRegBank(CondRegs[0], MRI, *TRI);
2258     if (CondBank == &AMDGPU::SGPRRegBank) {
2259       MachineIRBuilder B(MI);
2260       const LLT S32 = LLT::scalar(32);
2261       Register NewCondReg = MRI.createGenericVirtualRegister(S32);
2262       MRI.setRegBank(NewCondReg, AMDGPU::SGPRRegBank);
2263 
2264       MI.getOperand(1).setReg(NewCondReg);
2265       B.buildZExt(NewCondReg, CondRegs[0]);
2266     }
2267 
2268     if (DstTy.getSizeInBits() != 64)
2269       break;
2270 
2271     MachineIRBuilder B(MI);
2272     LLT HalfTy = getHalfSizedType(DstTy);
2273 
2274     SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
2275     SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2));
2276     SmallVector<Register, 2> Src2Regs(OpdMapper.getVRegs(3));
2277 
2278     // All inputs are SGPRs, nothing special to do.
2279     if (DefRegs.empty()) {
2280       assert(Src1Regs.empty() && Src2Regs.empty());
2281       break;
2282     }
2283 
2284     if (Src1Regs.empty())
2285       split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg());
2286     else {
2287       setRegsToType(MRI, Src1Regs, HalfTy);
2288     }
2289 
2290     if (Src2Regs.empty())
2291       split64BitValueForMapping(B, Src2Regs, HalfTy, MI.getOperand(3).getReg());
2292     else
2293       setRegsToType(MRI, Src2Regs, HalfTy);
2294 
2295     setRegsToType(MRI, DefRegs, HalfTy);
2296 
2297     B.buildSelect(DefRegs[0], CondRegs[0], Src1Regs[0], Src2Regs[0]);
2298     B.buildSelect(DefRegs[1], CondRegs[0], Src1Regs[1], Src2Regs[1]);
2299 
2300     MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
2301     MI.eraseFromParent();
2302     return;
2303   }
2304   case AMDGPU::G_BRCOND: {
2305     Register CondReg = MI.getOperand(0).getReg();
2306     // FIXME: Should use legalizer helper, but should change bool ext type.
2307     const RegisterBank *CondBank =
2308       OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2309 
2310     if (CondBank == &AMDGPU::SGPRRegBank) {
2311       MachineIRBuilder B(MI);
2312       const LLT S32 = LLT::scalar(32);
2313       Register NewCondReg = MRI.createGenericVirtualRegister(S32);
2314       MRI.setRegBank(NewCondReg, AMDGPU::SGPRRegBank);
2315 
2316       MI.getOperand(0).setReg(NewCondReg);
2317       B.buildZExt(NewCondReg, CondReg);
2318       return;
2319     }
2320 
2321     break;
2322   }
2323   case AMDGPU::G_AND:
2324   case AMDGPU::G_OR:
2325   case AMDGPU::G_XOR: {
2326     // 64-bit and is only available on the SALU, so split into 2 32-bit ops if
2327     // there is a VGPR input.
2328     Register DstReg = MI.getOperand(0).getReg();
2329     LLT DstTy = MRI.getType(DstReg);
2330 
2331     if (DstTy.getSizeInBits() == 1) {
2332       const RegisterBank *DstBank =
2333         OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2334       if (DstBank == &AMDGPU::VCCRegBank)
2335         break;
2336 
2337       MachineFunction *MF = MI.getParent()->getParent();
2338       ApplyRegBankMapping ApplyBank(*this, MRI, DstBank);
2339       MachineIRBuilder B(MI, ApplyBank);
2340       LegalizerHelper Helper(*MF, ApplyBank, B);
2341 
2342       if (Helper.widenScalar(MI, 0, LLT::scalar(32)) !=
2343           LegalizerHelper::Legalized)
2344         llvm_unreachable("widen scalar should have succeeded");
2345       return;
2346     }
2347 
2348     if (DstTy.getSizeInBits() != 64)
2349       break;
2350 
2351     LLT HalfTy = getHalfSizedType(DstTy);
2352     SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
2353     SmallVector<Register, 2> Src0Regs(OpdMapper.getVRegs(1));
2354     SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2));
2355 
2356     // All inputs are SGPRs, nothing special to do.
2357     if (DefRegs.empty()) {
2358       assert(Src0Regs.empty() && Src1Regs.empty());
2359       break;
2360     }
2361 
2362     assert(DefRegs.size() == 2);
2363     assert(Src0Regs.size() == Src1Regs.size() &&
2364            (Src0Regs.empty() || Src0Regs.size() == 2));
2365 
2366     // Depending on where the source registers came from, the generic code may
2367     // have decided to split the inputs already or not. If not, we still need to
2368     // extract the values.
2369     MachineIRBuilder B(MI);
2370 
2371     if (Src0Regs.empty())
2372       split64BitValueForMapping(B, Src0Regs, HalfTy, MI.getOperand(1).getReg());
2373     else
2374       setRegsToType(MRI, Src0Regs, HalfTy);
2375 
2376     if (Src1Regs.empty())
2377       split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg());
2378     else
2379       setRegsToType(MRI, Src1Regs, HalfTy);
2380 
2381     setRegsToType(MRI, DefRegs, HalfTy);
2382 
2383     B.buildInstr(Opc, {DefRegs[0]}, {Src0Regs[0], Src1Regs[0]});
2384     B.buildInstr(Opc, {DefRegs[1]}, {Src0Regs[1], Src1Regs[1]});
2385 
2386     MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
2387     MI.eraseFromParent();
2388     return;
2389   }
2390   case AMDGPU::G_ABS: {
2391     Register SrcReg = MI.getOperand(1).getReg();
2392     const RegisterBank *SrcBank = MRI.getRegBankOrNull(SrcReg);
2393 
2394     // There is no VALU abs instruction so we need to replace it with a sub and
2395     // max combination.
2396     if (SrcBank && SrcBank == &AMDGPU::VGPRRegBank) {
2397       MachineFunction *MF = MI.getParent()->getParent();
2398       ApplyRegBankMapping Apply(*this, MRI, &AMDGPU::VGPRRegBank);
2399       MachineIRBuilder B(MI, Apply);
2400       LegalizerHelper Helper(*MF, Apply, B);
2401 
2402       if (Helper.lowerAbsToMaxNeg(MI) != LegalizerHelper::Legalized)
2403         llvm_unreachable("lowerAbsToMaxNeg should have succeeded");
2404       return;
2405     }
2406     [[fallthrough]];
2407   }
2408   case AMDGPU::G_ADD:
2409   case AMDGPU::G_SUB:
2410   case AMDGPU::G_MUL:
2411   case AMDGPU::G_SHL:
2412   case AMDGPU::G_LSHR:
2413   case AMDGPU::G_ASHR:
2414   case AMDGPU::G_SMIN:
2415   case AMDGPU::G_SMAX:
2416   case AMDGPU::G_UMIN:
2417   case AMDGPU::G_UMAX: {
2418     Register DstReg = MI.getOperand(0).getReg();
2419     LLT DstTy = MRI.getType(DstReg);
2420 
2421     // 16-bit operations are VALU only, but can be promoted to 32-bit SALU.
2422     // Packed 16-bit operations need to be scalarized and promoted.
2423     if (DstTy != LLT::scalar(16) && DstTy != LLT::fixed_vector(2, 16))
2424       break;
2425 
2426     const RegisterBank *DstBank =
2427       OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2428     if (DstBank == &AMDGPU::VGPRRegBank)
2429       break;
2430 
2431     const LLT S32 = LLT::scalar(32);
2432     MachineBasicBlock *MBB = MI.getParent();
2433     MachineFunction *MF = MBB->getParent();
2434     ApplyRegBankMapping ApplySALU(*this, MRI, &AMDGPU::SGPRRegBank);
2435     MachineIRBuilder B(MI, ApplySALU);
2436 
2437     if (DstTy.isVector()) {
2438       Register WideSrc0Lo, WideSrc0Hi;
2439       Register WideSrc1Lo, WideSrc1Hi;
2440 
2441       unsigned ExtendOp = getExtendOp(MI.getOpcode());
2442       std::tie(WideSrc0Lo, WideSrc0Hi)
2443         = unpackV2S16ToS32(B, MI.getOperand(1).getReg(), ExtendOp);
2444       std::tie(WideSrc1Lo, WideSrc1Hi)
2445         = unpackV2S16ToS32(B, MI.getOperand(2).getReg(), ExtendOp);
2446       auto Lo = B.buildInstr(MI.getOpcode(), {S32}, {WideSrc0Lo, WideSrc1Lo});
2447       auto Hi = B.buildInstr(MI.getOpcode(), {S32}, {WideSrc0Hi, WideSrc1Hi});
2448       B.buildBuildVectorTrunc(DstReg, {Lo.getReg(0), Hi.getReg(0)});
2449       MI.eraseFromParent();
2450     } else {
2451       LegalizerHelper Helper(*MF, ApplySALU, B);
2452 
2453       if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized)
2454         llvm_unreachable("widen scalar should have succeeded");
2455 
2456       // FIXME: s16 shift amounts should be legal.
2457       if (Opc == AMDGPU::G_SHL || Opc == AMDGPU::G_LSHR ||
2458           Opc == AMDGPU::G_ASHR) {
2459         B.setInsertPt(*MBB, MI.getIterator());
2460         if (Helper.widenScalar(MI, 1, S32) != LegalizerHelper::Legalized)
2461           llvm_unreachable("widen scalar should have succeeded");
2462       }
2463     }
2464 
2465     return;
2466   }
2467   case AMDGPU::G_SEXT_INREG: {
2468     SmallVector<Register, 2> SrcRegs(OpdMapper.getVRegs(1));
2469     if (SrcRegs.empty())
2470       break; // Nothing to repair
2471 
2472     const LLT S32 = LLT::scalar(32);
2473     MachineIRBuilder B(MI);
2474     ApplyRegBankMapping O(*this, MRI, &AMDGPU::VGPRRegBank);
2475     GISelObserverWrapper Observer(&O);
2476     B.setChangeObserver(Observer);
2477 
2478     // Don't use LegalizerHelper's narrowScalar. It produces unwanted G_SEXTs
2479     // we would need to further expand, and doesn't let us directly set the
2480     // result registers.
2481     SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0));
2482 
2483     int Amt = MI.getOperand(2).getImm();
2484     if (Amt <= 32) {
2485       // Downstream users have expectations for the high bit behavior, so freeze
2486       // incoming undefined bits.
2487       if (Amt == 32) {
2488         // The low bits are unchanged.
2489         B.buildFreeze(DstRegs[0], SrcRegs[0]);
2490       } else {
2491         auto Freeze = B.buildFreeze(S32, SrcRegs[0]);
2492         // Extend in the low bits and propagate the sign bit to the high half.
2493         B.buildSExtInReg(DstRegs[0], Freeze, Amt);
2494       }
2495 
2496       B.buildAShr(DstRegs[1], DstRegs[0], B.buildConstant(S32, 31));
2497     } else {
2498       // The low bits are unchanged, and extend in the high bits.
2499       // No freeze required
2500       B.buildCopy(DstRegs[0], SrcRegs[0]);
2501       B.buildSExtInReg(DstRegs[1], DstRegs[0], Amt - 32);
2502     }
2503 
2504     Register DstReg = MI.getOperand(0).getReg();
2505     MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
2506     MI.eraseFromParent();
2507     return;
2508   }
2509   case AMDGPU::G_CTPOP:
2510   case AMDGPU::G_BITREVERSE: {
2511     const RegisterBank *DstBank =
2512       OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2513     if (DstBank == &AMDGPU::SGPRRegBank)
2514       break;
2515 
2516     Register SrcReg = MI.getOperand(1).getReg();
2517     const LLT S32 = LLT::scalar(32);
2518     LLT Ty = MRI.getType(SrcReg);
2519     if (Ty == S32)
2520       break;
2521 
2522     ApplyRegBankMapping ApplyVALU(*this, MRI, &AMDGPU::VGPRRegBank);
2523     MachineIRBuilder B(MI, ApplyVALU);
2524 
2525     MachineFunction &MF = B.getMF();
2526     LegalizerHelper Helper(MF, ApplyVALU, B);
2527 
2528     if (Helper.narrowScalar(MI, 1, S32) != LegalizerHelper::Legalized)
2529       llvm_unreachable("narrowScalar should have succeeded");
2530     return;
2531   }
2532   case AMDGPU::G_AMDGPU_FFBH_U32:
2533   case AMDGPU::G_AMDGPU_FFBL_B32:
2534   case AMDGPU::G_CTLZ_ZERO_UNDEF:
2535   case AMDGPU::G_CTTZ_ZERO_UNDEF: {
2536     const RegisterBank *DstBank =
2537         OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2538     if (DstBank == &AMDGPU::SGPRRegBank)
2539       break;
2540 
2541     Register SrcReg = MI.getOperand(1).getReg();
2542     const LLT S32 = LLT::scalar(32);
2543     LLT Ty = MRI.getType(SrcReg);
2544     if (Ty == S32)
2545       break;
2546 
2547     // We can narrow this more efficiently than Helper can by using ffbh/ffbl
2548     // which return -1 when the input is zero:
2549     // (ctlz_zero_undef hi:lo) -> (umin (ffbh hi), (add (ffbh lo), 32))
2550     // (cttz_zero_undef hi:lo) -> (umin (add (ffbl hi), 32), (ffbl lo))
2551     // (ffbh hi:lo) -> (umin (ffbh hi), (uaddsat (ffbh lo), 32))
2552     // (ffbl hi:lo) -> (umin (uaddsat (ffbh hi), 32), (ffbh lo))
2553     ApplyRegBankMapping ApplyVALU(*this, MRI, &AMDGPU::VGPRRegBank);
2554     MachineIRBuilder B(MI, ApplyVALU);
2555     SmallVector<Register, 2> SrcRegs(OpdMapper.getVRegs(1));
2556     unsigned NewOpc = Opc == AMDGPU::G_CTLZ_ZERO_UNDEF
2557                           ? (unsigned)AMDGPU::G_AMDGPU_FFBH_U32
2558                           : Opc == AMDGPU::G_CTTZ_ZERO_UNDEF
2559                                 ? (unsigned)AMDGPU::G_AMDGPU_FFBL_B32
2560                                 : Opc;
2561     unsigned Idx = NewOpc == AMDGPU::G_AMDGPU_FFBH_U32;
2562     auto X = B.buildInstr(NewOpc, {S32}, {SrcRegs[Idx]});
2563     auto Y = B.buildInstr(NewOpc, {S32}, {SrcRegs[Idx ^ 1]});
2564     unsigned AddOpc =
2565         Opc == AMDGPU::G_CTLZ_ZERO_UNDEF || Opc == AMDGPU::G_CTTZ_ZERO_UNDEF
2566             ? AMDGPU::G_ADD
2567             : AMDGPU::G_UADDSAT;
2568     Y = B.buildInstr(AddOpc, {S32}, {Y, B.buildConstant(S32, 32)});
2569     Register DstReg = MI.getOperand(0).getReg();
2570     B.buildUMin(DstReg, X, Y);
2571     MI.eraseFromParent();
2572     return;
2573   }
2574   case AMDGPU::G_SEXT:
2575   case AMDGPU::G_ZEXT:
2576   case AMDGPU::G_ANYEXT: {
2577     Register SrcReg = MI.getOperand(1).getReg();
2578     LLT SrcTy = MRI.getType(SrcReg);
2579     const bool Signed = Opc == AMDGPU::G_SEXT;
2580 
2581     assert(OpdMapper.getVRegs(1).empty());
2582 
2583     MachineIRBuilder B(MI);
2584     const RegisterBank *SrcBank =
2585       OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
2586 
2587     Register DstReg = MI.getOperand(0).getReg();
2588     LLT DstTy = MRI.getType(DstReg);
2589     if (DstTy.isScalar() &&
2590         SrcBank != &AMDGPU::SGPRRegBank &&
2591         SrcBank != &AMDGPU::VCCRegBank &&
2592         // FIXME: Should handle any type that round to s64 when irregular
2593         // breakdowns supported.
2594         DstTy.getSizeInBits() == 64 &&
2595         SrcTy.getSizeInBits() <= 32) {
2596       SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
2597 
2598       // Extend to 32-bit, and then extend the low half.
2599       if (Signed) {
2600         // TODO: Should really be buildSExtOrCopy
2601         B.buildSExtOrTrunc(DefRegs[0], SrcReg);
2602       } else if (Opc == AMDGPU::G_ZEXT) {
2603         B.buildZExtOrTrunc(DefRegs[0], SrcReg);
2604       } else {
2605         B.buildAnyExtOrTrunc(DefRegs[0], SrcReg);
2606       }
2607 
2608       extendLow32IntoHigh32(B, DefRegs[1], DefRegs[0], Opc, *SrcBank);
2609       MRI.setRegBank(DstReg, *SrcBank);
2610       MI.eraseFromParent();
2611       return;
2612     }
2613 
2614     if (SrcTy != LLT::scalar(1))
2615       return;
2616 
2617     // It is not legal to have a legalization artifact with a VCC source. Rather
2618     // than introducing a copy, insert the select we would have to select the
2619     // copy to.
2620     if (SrcBank == &AMDGPU::VCCRegBank) {
2621       SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
2622 
2623       const RegisterBank *DstBank = &AMDGPU::VGPRRegBank;
2624 
2625       unsigned DstSize = DstTy.getSizeInBits();
2626       // 64-bit select is SGPR only
2627       const bool UseSel64 = DstSize > 32 &&
2628         SrcBank->getID() == AMDGPU::SGPRRegBankID;
2629 
2630       // TODO: Should s16 select be legal?
2631       LLT SelType = UseSel64 ? LLT::scalar(64) : LLT::scalar(32);
2632       auto True = B.buildConstant(SelType, Signed ? -1 : 1);
2633       auto False = B.buildConstant(SelType, 0);
2634 
2635       MRI.setRegBank(True.getReg(0), *DstBank);
2636       MRI.setRegBank(False.getReg(0), *DstBank);
2637       MRI.setRegBank(DstReg, *DstBank);
2638 
2639       if (DstSize > 32) {
2640         B.buildSelect(DefRegs[0], SrcReg, True, False);
2641         extendLow32IntoHigh32(B, DefRegs[1], DefRegs[0], Opc, *SrcBank, true);
2642       } else if (DstSize < 32) {
2643         auto Sel = B.buildSelect(SelType, SrcReg, True, False);
2644         MRI.setRegBank(Sel.getReg(0), *DstBank);
2645         B.buildTrunc(DstReg, Sel);
2646       } else {
2647         B.buildSelect(DstReg, SrcReg, True, False);
2648       }
2649 
2650       MI.eraseFromParent();
2651       return;
2652     }
2653 
2654     break;
2655   }
2656   case AMDGPU::G_EXTRACT_VECTOR_ELT: {
2657     SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0));
2658 
2659     assert(OpdMapper.getVRegs(1).empty() && OpdMapper.getVRegs(2).empty());
2660 
2661     Register DstReg = MI.getOperand(0).getReg();
2662     Register SrcReg = MI.getOperand(1).getReg();
2663 
2664     const LLT S32 = LLT::scalar(32);
2665     LLT DstTy = MRI.getType(DstReg);
2666     LLT SrcTy = MRI.getType(SrcReg);
2667 
2668     if (foldExtractEltToCmpSelect(MI, MRI, OpdMapper))
2669       return;
2670 
2671     MachineIRBuilder B(MI);
2672 
2673     const ValueMapping &DstMapping
2674       = OpdMapper.getInstrMapping().getOperandMapping(0);
2675     const RegisterBank *DstBank = DstMapping.BreakDown[0].RegBank;
2676     const RegisterBank *SrcBank =
2677       OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
2678     const RegisterBank *IdxBank =
2679         OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
2680 
2681     Register BaseIdxReg;
2682     unsigned ConstOffset;
2683     std::tie(BaseIdxReg, ConstOffset) =
2684         AMDGPU::getBaseWithConstantOffset(MRI, MI.getOperand(2).getReg());
2685 
2686     // See if the index is an add of a constant which will be foldable by moving
2687     // the base register of the index later if this is going to be executed in a
2688     // waterfall loop. This is essentially to reassociate the add of a constant
2689     // with the readfirstlane.
2690     bool ShouldMoveIndexIntoLoop = IdxBank != &AMDGPU::SGPRRegBank &&
2691                                    ConstOffset > 0 &&
2692                                    ConstOffset < SrcTy.getNumElements();
2693 
2694     // Move the base register. We'll re-insert the add later.
2695     if (ShouldMoveIndexIntoLoop)
2696       MI.getOperand(2).setReg(BaseIdxReg);
2697 
2698     // If this is a VGPR result only because the index was a VGPR result, the
2699     // actual indexing will be done on the SGPR source vector, which will
2700     // produce a scalar result. We need to copy to the VGPR result inside the
2701     // waterfall loop.
2702     const bool NeedCopyToVGPR = DstBank == &AMDGPU::VGPRRegBank &&
2703                                 SrcBank == &AMDGPU::SGPRRegBank;
2704     if (DstRegs.empty()) {
2705       applyDefaultMapping(OpdMapper);
2706 
2707       executeInWaterfallLoop(MI, MRI, { 2 });
2708 
2709       if (NeedCopyToVGPR) {
2710         // We don't want a phi for this temporary reg.
2711         Register TmpReg = MRI.createGenericVirtualRegister(DstTy);
2712         MRI.setRegBank(TmpReg, AMDGPU::SGPRRegBank);
2713         MI.getOperand(0).setReg(TmpReg);
2714         B.setInsertPt(*MI.getParent(), ++MI.getIterator());
2715 
2716         // Use a v_mov_b32 here to make the exec dependency explicit.
2717         buildVCopy(B, DstReg, TmpReg);
2718       }
2719 
2720       // Re-insert the constant offset add inside the waterfall loop.
2721       if (ShouldMoveIndexIntoLoop)
2722         reinsertVectorIndexAdd(B, MI, 2, ConstOffset);
2723 
2724       return;
2725     }
2726 
2727     assert(DstTy.getSizeInBits() == 64);
2728 
2729     LLT Vec32 = LLT::fixed_vector(2 * SrcTy.getNumElements(), 32);
2730 
2731     auto CastSrc = B.buildBitcast(Vec32, SrcReg);
2732     auto One = B.buildConstant(S32, 1);
2733 
2734     MachineBasicBlock::iterator MII = MI.getIterator();
2735 
2736     // Split the vector index into 32-bit pieces. Prepare to move all of the
2737     // new instructions into a waterfall loop if necessary.
2738     //
2739     // Don't put the bitcast or constant in the loop.
2740     MachineInstrSpan Span(MII, &B.getMBB());
2741 
2742     // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1).
2743     auto IdxLo = B.buildShl(S32, BaseIdxReg, One);
2744     auto IdxHi = B.buildAdd(S32, IdxLo, One);
2745 
2746     auto Extract0 = B.buildExtractVectorElement(DstRegs[0], CastSrc, IdxLo);
2747     auto Extract1 = B.buildExtractVectorElement(DstRegs[1], CastSrc, IdxHi);
2748 
2749     MRI.setRegBank(DstReg, *DstBank);
2750     MRI.setRegBank(CastSrc.getReg(0), *SrcBank);
2751     MRI.setRegBank(One.getReg(0), AMDGPU::SGPRRegBank);
2752     MRI.setRegBank(IdxLo.getReg(0), AMDGPU::SGPRRegBank);
2753     MRI.setRegBank(IdxHi.getReg(0), AMDGPU::SGPRRegBank);
2754 
2755     SmallSet<Register, 4> OpsToWaterfall;
2756     if (!collectWaterfallOperands(OpsToWaterfall, MI, MRI, { 2 })) {
2757       MI.eraseFromParent();
2758       return;
2759     }
2760 
2761     // Remove the original instruction to avoid potentially confusing the
2762     // waterfall loop logic.
2763     B.setInstr(*Span.begin());
2764     MI.eraseFromParent();
2765     executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()),
2766                            OpsToWaterfall, MRI);
2767 
2768     if (NeedCopyToVGPR) {
2769       MachineBasicBlock *LoopBB = Extract1->getParent();
2770       Register TmpReg0 = MRI.createGenericVirtualRegister(S32);
2771       Register TmpReg1 = MRI.createGenericVirtualRegister(S32);
2772       MRI.setRegBank(TmpReg0, AMDGPU::SGPRRegBank);
2773       MRI.setRegBank(TmpReg1, AMDGPU::SGPRRegBank);
2774 
2775       Extract0->getOperand(0).setReg(TmpReg0);
2776       Extract1->getOperand(0).setReg(TmpReg1);
2777 
2778       B.setInsertPt(*LoopBB, ++Extract1->getIterator());
2779 
2780       buildVCopy(B, DstRegs[0], TmpReg0);
2781       buildVCopy(B, DstRegs[1], TmpReg1);
2782     }
2783 
2784     if (ShouldMoveIndexIntoLoop)
2785       reinsertVectorIndexAdd(B, *IdxLo, 1, ConstOffset);
2786 
2787     return;
2788   }
2789   case AMDGPU::G_INSERT_VECTOR_ELT: {
2790     SmallVector<Register, 2> InsRegs(OpdMapper.getVRegs(2));
2791 
2792     Register DstReg = MI.getOperand(0).getReg();
2793     LLT VecTy = MRI.getType(DstReg);
2794 
2795     assert(OpdMapper.getVRegs(0).empty());
2796     assert(OpdMapper.getVRegs(3).empty());
2797 
2798     if (substituteSimpleCopyRegs(OpdMapper, 1))
2799       MRI.setType(MI.getOperand(1).getReg(), VecTy);
2800 
2801     if (foldInsertEltToCmpSelect(MI, MRI, OpdMapper))
2802       return;
2803 
2804     const RegisterBank *IdxBank =
2805       OpdMapper.getInstrMapping().getOperandMapping(3).BreakDown[0].RegBank;
2806 
2807     Register SrcReg = MI.getOperand(1).getReg();
2808     Register InsReg = MI.getOperand(2).getReg();
2809     LLT InsTy = MRI.getType(InsReg);
2810     (void)InsTy;
2811 
2812     Register BaseIdxReg;
2813     unsigned ConstOffset;
2814     std::tie(BaseIdxReg, ConstOffset) =
2815         AMDGPU::getBaseWithConstantOffset(MRI, MI.getOperand(3).getReg());
2816 
2817     // See if the index is an add of a constant which will be foldable by moving
2818     // the base register of the index later if this is going to be executed in a
2819     // waterfall loop. This is essentially to reassociate the add of a constant
2820     // with the readfirstlane.
2821     bool ShouldMoveIndexIntoLoop = IdxBank != &AMDGPU::SGPRRegBank &&
2822       ConstOffset > 0 &&
2823       ConstOffset < VecTy.getNumElements();
2824 
2825     // Move the base register. We'll re-insert the add later.
2826     if (ShouldMoveIndexIntoLoop)
2827       MI.getOperand(3).setReg(BaseIdxReg);
2828 
2829 
2830     if (InsRegs.empty()) {
2831       executeInWaterfallLoop(MI, MRI, { 3 });
2832 
2833       // Re-insert the constant offset add inside the waterfall loop.
2834       if (ShouldMoveIndexIntoLoop) {
2835         MachineIRBuilder B(MI);
2836         reinsertVectorIndexAdd(B, MI, 3, ConstOffset);
2837       }
2838 
2839       return;
2840     }
2841 
2842 
2843     assert(InsTy.getSizeInBits() == 64);
2844 
2845     const LLT S32 = LLT::scalar(32);
2846     LLT Vec32 = LLT::fixed_vector(2 * VecTy.getNumElements(), 32);
2847 
2848     MachineIRBuilder B(MI);
2849     auto CastSrc = B.buildBitcast(Vec32, SrcReg);
2850     auto One = B.buildConstant(S32, 1);
2851 
2852     // Split the vector index into 32-bit pieces. Prepare to move all of the
2853     // new instructions into a waterfall loop if necessary.
2854     //
2855     // Don't put the bitcast or constant in the loop.
2856     MachineInstrSpan Span(MachineBasicBlock::iterator(&MI), &B.getMBB());
2857 
2858     // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1).
2859     auto IdxLo = B.buildShl(S32, BaseIdxReg, One);
2860     auto IdxHi = B.buildAdd(S32, IdxLo, One);
2861 
2862     auto InsLo = B.buildInsertVectorElement(Vec32, CastSrc, InsRegs[0], IdxLo);
2863     auto InsHi = B.buildInsertVectorElement(Vec32, InsLo, InsRegs[1], IdxHi);
2864 
2865     const RegisterBank *DstBank =
2866       OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2867     const RegisterBank *SrcBank =
2868       OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
2869     const RegisterBank *InsSrcBank =
2870       OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
2871 
2872     MRI.setRegBank(InsReg, *InsSrcBank);
2873     MRI.setRegBank(CastSrc.getReg(0), *SrcBank);
2874     MRI.setRegBank(InsLo.getReg(0), *DstBank);
2875     MRI.setRegBank(InsHi.getReg(0), *DstBank);
2876     MRI.setRegBank(One.getReg(0), AMDGPU::SGPRRegBank);
2877     MRI.setRegBank(IdxLo.getReg(0), AMDGPU::SGPRRegBank);
2878     MRI.setRegBank(IdxHi.getReg(0), AMDGPU::SGPRRegBank);
2879 
2880 
2881     SmallSet<Register, 4> OpsToWaterfall;
2882     if (!collectWaterfallOperands(OpsToWaterfall, MI, MRI, { 3 })) {
2883       B.setInsertPt(B.getMBB(), MI);
2884       B.buildBitcast(DstReg, InsHi);
2885       MI.eraseFromParent();
2886       return;
2887     }
2888 
2889     B.setInstr(*Span.begin());
2890     MI.eraseFromParent();
2891 
2892     // Figure out the point after the waterfall loop before mangling the control
2893     // flow.
2894     executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()),
2895                            OpsToWaterfall, MRI);
2896 
2897     // The insertion point is now right after the original instruction.
2898     //
2899     // Keep the bitcast to the original vector type out of the loop. Doing this
2900     // saved an extra phi we don't need inside the loop.
2901     B.buildBitcast(DstReg, InsHi);
2902 
2903     // Re-insert the constant offset add inside the waterfall loop.
2904     if (ShouldMoveIndexIntoLoop)
2905       reinsertVectorIndexAdd(B, *IdxLo, 1, ConstOffset);
2906 
2907     return;
2908   }
2909   case AMDGPU::G_AMDGPU_BUFFER_LOAD:
2910   case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
2911   case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT:
2912   case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
2913   case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE:
2914   case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT:
2915   case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE:
2916   case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16:
2917   case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT:
2918   case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16:
2919   case AMDGPU::G_AMDGPU_BUFFER_STORE:
2920   case AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE:
2921   case AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT:
2922   case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT:
2923   case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16:
2924   case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT:
2925   case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16: {
2926     applyDefaultMapping(OpdMapper);
2927     executeInWaterfallLoop(MI, MRI, {1, 4});
2928     return;
2929   }
2930   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP:
2931   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD:
2932   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB:
2933   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN:
2934   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN:
2935   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX:
2936   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX:
2937   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND:
2938   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR:
2939   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR:
2940   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC:
2941   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC: {
2942     applyDefaultMapping(OpdMapper);
2943     executeInWaterfallLoop(MI, MRI, {2, 5});
2944     return;
2945   }
2946   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD:
2947   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN:
2948   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX: {
2949     applyDefaultMapping(OpdMapper);
2950     executeInWaterfallLoop(MI, MRI, {2, 5});
2951     return;
2952   }
2953   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP: {
2954     applyDefaultMapping(OpdMapper);
2955     executeInWaterfallLoop(MI, MRI, {3, 6});
2956     return;
2957   }
2958   case AMDGPU::G_AMDGPU_S_BUFFER_LOAD: {
2959     applyMappingSBufferLoad(OpdMapper);
2960     return;
2961   }
2962   case AMDGPU::G_INTRINSIC: {
2963     switch (MI.getIntrinsicID()) {
2964     case Intrinsic::amdgcn_readlane: {
2965       substituteSimpleCopyRegs(OpdMapper, 2);
2966 
2967       assert(OpdMapper.getVRegs(0).empty());
2968       assert(OpdMapper.getVRegs(3).empty());
2969 
2970       // Make sure the index is an SGPR. It doesn't make sense to run this in a
2971       // waterfall loop, so assume it's a uniform value.
2972       constrainOpWithReadfirstlane(MI, MRI, 3); // Index
2973       return;
2974     }
2975     case Intrinsic::amdgcn_writelane: {
2976       assert(OpdMapper.getVRegs(0).empty());
2977       assert(OpdMapper.getVRegs(2).empty());
2978       assert(OpdMapper.getVRegs(3).empty());
2979 
2980       substituteSimpleCopyRegs(OpdMapper, 4); // VGPR input val
2981       constrainOpWithReadfirstlane(MI, MRI, 2); // Source value
2982       constrainOpWithReadfirstlane(MI, MRI, 3); // Index
2983       return;
2984     }
2985     case Intrinsic::amdgcn_interp_p1:
2986     case Intrinsic::amdgcn_interp_p2:
2987     case Intrinsic::amdgcn_interp_mov:
2988     case Intrinsic::amdgcn_interp_p1_f16:
2989     case Intrinsic::amdgcn_interp_p2_f16:
2990     case Intrinsic::amdgcn_lds_param_load: {
2991       applyDefaultMapping(OpdMapper);
2992 
2993       // Readlane for m0 value, which is always the last operand.
2994       // FIXME: Should this be a waterfall loop instead?
2995       constrainOpWithReadfirstlane(MI, MRI, MI.getNumOperands() - 1); // Index
2996       return;
2997     }
2998     case Intrinsic::amdgcn_interp_inreg_p10:
2999     case Intrinsic::amdgcn_interp_inreg_p2:
3000     case Intrinsic::amdgcn_interp_inreg_p10_f16:
3001     case Intrinsic::amdgcn_interp_inreg_p2_f16:
3002       applyDefaultMapping(OpdMapper);
3003       return;
3004     case Intrinsic::amdgcn_permlane16:
3005     case Intrinsic::amdgcn_permlanex16: {
3006       // Doing a waterfall loop over these wouldn't make any sense.
3007       substituteSimpleCopyRegs(OpdMapper, 2);
3008       substituteSimpleCopyRegs(OpdMapper, 3);
3009       constrainOpWithReadfirstlane(MI, MRI, 4);
3010       constrainOpWithReadfirstlane(MI, MRI, 5);
3011       return;
3012     }
3013     case Intrinsic::amdgcn_sbfe:
3014       applyMappingBFE(OpdMapper, true);
3015       return;
3016     case Intrinsic::amdgcn_ubfe:
3017       applyMappingBFE(OpdMapper, false);
3018       return;
3019     case Intrinsic::amdgcn_ballot:
3020       // Use default handling and insert copy to vcc source.
3021       break;
3022     }
3023     break;
3024   }
3025   case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
3026   case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16:
3027   case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE:
3028   case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16: {
3029     const AMDGPU::RsrcIntrinsic *RSrcIntrin
3030       = AMDGPU::lookupRsrcIntrinsic(MI.getIntrinsicID());
3031     assert(RSrcIntrin && RSrcIntrin->IsImage);
3032     // Non-images can have complications from operands that allow both SGPR
3033     // and VGPR. For now it's too complicated to figure out the final opcode
3034     // to derive the register bank from the MCInstrDesc.
3035     applyMappingImage(MI, OpdMapper, MRI, RSrcIntrin->RsrcArg);
3036     return;
3037   }
3038   case AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY: {
3039     unsigned N = MI.getNumExplicitOperands() - 2;
3040     applyDefaultMapping(OpdMapper);
3041     executeInWaterfallLoop(MI, MRI, { N });
3042     return;
3043   }
3044   case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: {
3045     auto IntrID = MI.getIntrinsicID();
3046     switch (IntrID) {
3047     case Intrinsic::amdgcn_ds_ordered_add:
3048     case Intrinsic::amdgcn_ds_ordered_swap: {
3049       // This is only allowed to execute with 1 lane, so readfirstlane is safe.
3050       assert(OpdMapper.getVRegs(0).empty());
3051       substituteSimpleCopyRegs(OpdMapper, 3);
3052       constrainOpWithReadfirstlane(MI, MRI, 2); // M0
3053       return;
3054     }
3055     case Intrinsic::amdgcn_ds_gws_init:
3056     case Intrinsic::amdgcn_ds_gws_barrier:
3057     case Intrinsic::amdgcn_ds_gws_sema_br: {
3058       // Only the first lane is executes, so readfirstlane is safe.
3059       substituteSimpleCopyRegs(OpdMapper, 1);
3060       constrainOpWithReadfirstlane(MI, MRI, 2); // M0
3061       return;
3062     }
3063     case Intrinsic::amdgcn_ds_gws_sema_v:
3064     case Intrinsic::amdgcn_ds_gws_sema_p:
3065     case Intrinsic::amdgcn_ds_gws_sema_release_all: {
3066       // Only the first lane is executes, so readfirstlane is safe.
3067       constrainOpWithReadfirstlane(MI, MRI, 1); // M0
3068       return;
3069     }
3070     case Intrinsic::amdgcn_ds_append:
3071     case Intrinsic::amdgcn_ds_consume: {
3072       constrainOpWithReadfirstlane(MI, MRI, 2); // M0
3073       return;
3074     }
3075     case Intrinsic::amdgcn_s_sendmsg:
3076     case Intrinsic::amdgcn_s_sendmsghalt: {
3077       // FIXME: Should this use a waterfall loop?
3078       constrainOpWithReadfirstlane(MI, MRI, 2); // M0
3079       return;
3080     }
3081     case Intrinsic::amdgcn_s_setreg: {
3082       constrainOpWithReadfirstlane(MI, MRI, 2);
3083       return;
3084     }
3085     case Intrinsic::amdgcn_raw_buffer_load_lds: {
3086       applyDefaultMapping(OpdMapper);
3087       constrainOpWithReadfirstlane(MI, MRI, 1); // rsrc
3088       constrainOpWithReadfirstlane(MI, MRI, 2); // M0
3089       constrainOpWithReadfirstlane(MI, MRI, 5); // soffset
3090       return;
3091     }
3092     case Intrinsic::amdgcn_struct_buffer_load_lds: {
3093       applyDefaultMapping(OpdMapper);
3094       constrainOpWithReadfirstlane(MI, MRI, 1); // rsrc
3095       constrainOpWithReadfirstlane(MI, MRI, 2); // M0
3096       constrainOpWithReadfirstlane(MI, MRI, 6); // soffset
3097       return;
3098     }
3099     case Intrinsic::amdgcn_global_load_lds: {
3100       applyDefaultMapping(OpdMapper);
3101       constrainOpWithReadfirstlane(MI, MRI, 2);
3102       return;
3103     }
3104     case Intrinsic::amdgcn_lds_direct_load: {
3105       applyDefaultMapping(OpdMapper);
3106       // Readlane for m0 value, which is always the last operand.
3107       constrainOpWithReadfirstlane(MI, MRI, MI.getNumOperands() - 1); // Index
3108       return;
3109     }
3110     case Intrinsic::amdgcn_exp_row:
3111       applyDefaultMapping(OpdMapper);
3112       constrainOpWithReadfirstlane(MI, MRI, 8); // M0
3113       return;
3114     default: {
3115       if (const AMDGPU::RsrcIntrinsic *RSrcIntrin =
3116               AMDGPU::lookupRsrcIntrinsic(IntrID)) {
3117         // Non-images can have complications from operands that allow both SGPR
3118         // and VGPR. For now it's too complicated to figure out the final opcode
3119         // to derive the register bank from the MCInstrDesc.
3120         if (RSrcIntrin->IsImage) {
3121           applyMappingImage(MI, OpdMapper, MRI, RSrcIntrin->RsrcArg);
3122           return;
3123         }
3124       }
3125 
3126       break;
3127     }
3128     }
3129     break;
3130   }
3131   case AMDGPU::G_SI_CALL: {
3132     // Use a set to avoid extra readfirstlanes in the case where multiple
3133     // operands are the same register.
3134     SmallSet<Register, 4> SGPROperandRegs;
3135 
3136     if (!collectWaterfallOperands(SGPROperandRegs, MI, MRI, {1}))
3137       break;
3138 
3139     // Move all copies to physical SGPRs that are used by the call instruction
3140     // into the loop block. Start searching for these copies until the
3141     // ADJCALLSTACKUP.
3142     unsigned FrameSetupOpcode = AMDGPU::ADJCALLSTACKUP;
3143     unsigned FrameDestroyOpcode = AMDGPU::ADJCALLSTACKDOWN;
3144 
3145     // Move all non-copies before the copies, so that a complete range can be
3146     // moved into the waterfall loop.
3147     SmallVector<MachineInstr *, 4> NonCopyInstrs;
3148     // Count of NonCopyInstrs found until the current LastCopy.
3149     unsigned NonCopyInstrsLen = 0;
3150     MachineBasicBlock::iterator Start(&MI);
3151     MachineBasicBlock::iterator LastCopy = Start;
3152     MachineBasicBlock *MBB = MI.getParent();
3153     const SIMachineFunctionInfo *Info =
3154         MBB->getParent()->getInfo<SIMachineFunctionInfo>();
3155     while (Start->getOpcode() != FrameSetupOpcode) {
3156       --Start;
3157       bool IsCopy = false;
3158       if (Start->getOpcode() == AMDGPU::COPY) {
3159         auto &Dst = Start->getOperand(0);
3160         if (Dst.isReg()) {
3161           Register Reg = Dst.getReg();
3162           if (Reg.isPhysical() && MI.readsRegister(Reg, TRI)) {
3163             IsCopy = true;
3164           } else {
3165             // Also move the copy from the scratch rsrc descriptor into the loop
3166             // to allow it to be optimized away.
3167             auto &Src = Start->getOperand(1);
3168             if (Src.isReg()) {
3169               Reg = Src.getReg();
3170               IsCopy = Info->getScratchRSrcReg() == Reg;
3171             }
3172           }
3173         }
3174       }
3175 
3176       if (IsCopy) {
3177         LastCopy = Start;
3178         NonCopyInstrsLen = NonCopyInstrs.size();
3179       } else {
3180         NonCopyInstrs.push_back(&*Start);
3181       }
3182     }
3183     NonCopyInstrs.resize(NonCopyInstrsLen);
3184 
3185     for (auto *NonCopy : reverse(NonCopyInstrs)) {
3186       MBB->splice(LastCopy, MBB, NonCopy->getIterator());
3187     }
3188     Start = LastCopy;
3189 
3190     // Do the same for copies after the loop
3191     NonCopyInstrs.clear();
3192     NonCopyInstrsLen = 0;
3193     MachineBasicBlock::iterator End(&MI);
3194     LastCopy = End;
3195     while (End->getOpcode() != FrameDestroyOpcode) {
3196       ++End;
3197       bool IsCopy = false;
3198       if (End->getOpcode() == AMDGPU::COPY) {
3199         auto &Src = End->getOperand(1);
3200         if (Src.isReg()) {
3201           Register Reg = Src.getReg();
3202           IsCopy = Reg.isPhysical() && MI.modifiesRegister(Reg, TRI);
3203         }
3204       }
3205 
3206       if (IsCopy) {
3207         LastCopy = End;
3208         NonCopyInstrsLen = NonCopyInstrs.size();
3209       } else {
3210         NonCopyInstrs.push_back(&*End);
3211       }
3212     }
3213     NonCopyInstrs.resize(NonCopyInstrsLen);
3214 
3215     End = LastCopy;
3216     ++LastCopy;
3217     for (auto *NonCopy : reverse(NonCopyInstrs)) {
3218       MBB->splice(LastCopy, MBB, NonCopy->getIterator());
3219     }
3220 
3221     ++End;
3222     MachineIRBuilder B(*Start);
3223     executeInWaterfallLoop(B, make_range(Start, End), SGPROperandRegs, MRI);
3224     break;
3225   }
3226   case AMDGPU::G_LOAD:
3227   case AMDGPU::G_ZEXTLOAD:
3228   case AMDGPU::G_SEXTLOAD: {
3229     if (applyMappingLoad(MI, OpdMapper, MRI))
3230       return;
3231     break;
3232   }
3233   case AMDGPU::G_DYN_STACKALLOC:
3234     applyMappingDynStackAlloc(MI, OpdMapper, MRI);
3235     return;
3236   case AMDGPU::G_SBFX:
3237     applyMappingBFE(OpdMapper, /*Signed*/ true);
3238     return;
3239   case AMDGPU::G_UBFX:
3240     applyMappingBFE(OpdMapper, /*Signed*/ false);
3241     return;
3242   case AMDGPU::G_AMDGPU_MAD_U64_U32:
3243   case AMDGPU::G_AMDGPU_MAD_I64_I32:
3244     applyMappingMAD_64_32(OpdMapper);
3245     return;
3246   default:
3247     break;
3248   }
3249 
3250   return applyDefaultMapping(OpdMapper);
3251 }
3252 
3253 // vgpr, sgpr -> vgpr
3254 // vgpr, agpr -> vgpr
3255 // agpr, agpr -> agpr
3256 // agpr, sgpr -> vgpr
3257 static unsigned regBankUnion(unsigned RB0, unsigned RB1) {
3258   if (RB0 == AMDGPU::InvalidRegBankID)
3259     return RB1;
3260   if (RB1 == AMDGPU::InvalidRegBankID)
3261     return RB0;
3262 
3263   if (RB0 == AMDGPU::SGPRRegBankID && RB1 == AMDGPU::SGPRRegBankID)
3264     return AMDGPU::SGPRRegBankID;
3265 
3266   if (RB0 == AMDGPU::AGPRRegBankID && RB1 == AMDGPU::AGPRRegBankID)
3267     return AMDGPU::AGPRRegBankID;
3268 
3269   return AMDGPU::VGPRRegBankID;
3270 }
3271 
3272 static unsigned regBankBoolUnion(unsigned RB0, unsigned RB1) {
3273   if (RB0 == AMDGPU::InvalidRegBankID)
3274     return RB1;
3275   if (RB1 == AMDGPU::InvalidRegBankID)
3276     return RB0;
3277 
3278   // vcc, vcc -> vcc
3279   // vcc, sgpr -> vcc
3280   // vcc, vgpr -> vcc
3281   if (RB0 == AMDGPU::VCCRegBankID || RB1 == AMDGPU::VCCRegBankID)
3282     return AMDGPU::VCCRegBankID;
3283 
3284   // vcc, vgpr -> vgpr
3285   return regBankUnion(RB0, RB1);
3286 }
3287 
3288 unsigned AMDGPURegisterBankInfo::getMappingType(const MachineRegisterInfo &MRI,
3289                                                 const MachineInstr &MI) const {
3290   unsigned RegBank = AMDGPU::InvalidRegBankID;
3291 
3292   for (const MachineOperand &MO : MI.operands()) {
3293     if (!MO.isReg())
3294       continue;
3295     Register Reg = MO.getReg();
3296     if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) {
3297       RegBank = regBankUnion(RegBank, Bank->getID());
3298       if (RegBank == AMDGPU::VGPRRegBankID)
3299         break;
3300     }
3301   }
3302 
3303   return RegBank;
3304 }
3305 
3306 bool AMDGPURegisterBankInfo::isSALUMapping(const MachineInstr &MI) const {
3307   const MachineFunction &MF = *MI.getParent()->getParent();
3308   const MachineRegisterInfo &MRI = MF.getRegInfo();
3309   for (const MachineOperand &MO : MI.operands()) {
3310     if (!MO.isReg())
3311       continue;
3312     Register Reg = MO.getReg();
3313     if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) {
3314       if (Bank->getID() != AMDGPU::SGPRRegBankID)
3315         return false;
3316     }
3317   }
3318   return true;
3319 }
3320 
3321 const RegisterBankInfo::InstructionMapping &
3322 AMDGPURegisterBankInfo::getDefaultMappingSOP(const MachineInstr &MI) const {
3323   const MachineFunction &MF = *MI.getParent()->getParent();
3324   const MachineRegisterInfo &MRI = MF.getRegInfo();
3325   SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
3326 
3327   for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
3328     const MachineOperand &SrcOp = MI.getOperand(i);
3329     if (!SrcOp.isReg())
3330       continue;
3331 
3332     unsigned Size = getSizeInBits(SrcOp.getReg(), MRI, *TRI);
3333     OpdsMapping[i] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
3334   }
3335   return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
3336                                MI.getNumOperands());
3337 }
3338 
3339 const RegisterBankInfo::InstructionMapping &
3340 AMDGPURegisterBankInfo::getDefaultMappingVOP(const MachineInstr &MI) const {
3341   const MachineFunction &MF = *MI.getParent()->getParent();
3342   const MachineRegisterInfo &MRI = MF.getRegInfo();
3343   SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
3344 
3345   // Even though we technically could use SGPRs, this would require knowledge of
3346   // the constant bus restriction. Force all sources to VGPR (except for VCC).
3347   //
3348   // TODO: Unary ops are trivially OK, so accept SGPRs?
3349   for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
3350     const MachineOperand &Src = MI.getOperand(i);
3351     if (!Src.isReg())
3352       continue;
3353 
3354     unsigned Size = getSizeInBits(Src.getReg(), MRI, *TRI);
3355     unsigned BankID = Size == 1 ? AMDGPU::VCCRegBankID : AMDGPU::VGPRRegBankID;
3356     OpdsMapping[i] = AMDGPU::getValueMapping(BankID, Size);
3357   }
3358 
3359   return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
3360                                MI.getNumOperands());
3361 }
3362 
3363 const RegisterBankInfo::InstructionMapping &
3364 AMDGPURegisterBankInfo::getDefaultMappingAllVGPR(const MachineInstr &MI) const {
3365   const MachineFunction &MF = *MI.getParent()->getParent();
3366   const MachineRegisterInfo &MRI = MF.getRegInfo();
3367   SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
3368 
3369   for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
3370     const MachineOperand &Op = MI.getOperand(I);
3371     if (!Op.isReg())
3372       continue;
3373 
3374     unsigned Size = getSizeInBits(Op.getReg(), MRI, *TRI);
3375     OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3376   }
3377 
3378   return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
3379                                MI.getNumOperands());
3380 }
3381 
3382 const RegisterBankInfo::InstructionMapping &
3383 AMDGPURegisterBankInfo::getImageMapping(const MachineRegisterInfo &MRI,
3384                                         const MachineInstr &MI,
3385                                         int RsrcIdx) const {
3386   // The reported argument index is relative to the IR intrinsic call arguments,
3387   // so we need to shift by the number of defs and the intrinsic ID.
3388   RsrcIdx += MI.getNumExplicitDefs() + 1;
3389 
3390   const int NumOps = MI.getNumOperands();
3391   SmallVector<const ValueMapping *, 8> OpdsMapping(NumOps);
3392 
3393   // TODO: Should packed/unpacked D16 difference be reported here as part of
3394   // the value mapping?
3395   for (int I = 0; I != NumOps; ++I) {
3396     if (!MI.getOperand(I).isReg())
3397       continue;
3398 
3399     Register OpReg = MI.getOperand(I).getReg();
3400     // We replace some dead address operands with $noreg
3401     if (!OpReg)
3402       continue;
3403 
3404     unsigned Size = getSizeInBits(OpReg, MRI, *TRI);
3405 
3406     // FIXME: Probably need a new intrinsic register bank searchable table to
3407     // handle arbitrary intrinsics easily.
3408     //
3409     // If this has a sampler, it immediately follows rsrc.
3410     const bool MustBeSGPR = I == RsrcIdx || I == RsrcIdx + 1;
3411 
3412     if (MustBeSGPR) {
3413       // If this must be an SGPR, so we must report whatever it is as legal.
3414       unsigned NewBank = getRegBankID(OpReg, MRI, AMDGPU::SGPRRegBankID);
3415       OpdsMapping[I] = AMDGPU::getValueMapping(NewBank, Size);
3416     } else {
3417       // Some operands must be VGPR, and these are easy to copy to.
3418       OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3419     }
3420   }
3421 
3422   return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping), NumOps);
3423 }
3424 
3425 /// Return the mapping for a pointer argument.
3426 const RegisterBankInfo::ValueMapping *
3427 AMDGPURegisterBankInfo::getValueMappingForPtr(const MachineRegisterInfo &MRI,
3428                                               Register PtrReg) const {
3429   LLT PtrTy = MRI.getType(PtrReg);
3430   unsigned Size = PtrTy.getSizeInBits();
3431   if (Subtarget.useFlatForGlobal() ||
3432       !AMDGPU::isFlatGlobalAddrSpace(PtrTy.getAddressSpace()))
3433     return AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3434 
3435   // If we're using MUBUF instructions for global memory, an SGPR base register
3436   // is possible. Otherwise this needs to be a VGPR.
3437   const RegisterBank *PtrBank = getRegBank(PtrReg, MRI, *TRI);
3438   return AMDGPU::getValueMapping(PtrBank->getID(), Size);
3439 }
3440 
3441 const RegisterBankInfo::InstructionMapping &
3442 AMDGPURegisterBankInfo::getInstrMappingForLoad(const MachineInstr &MI) const {
3443 
3444   const MachineFunction &MF = *MI.getParent()->getParent();
3445   const MachineRegisterInfo &MRI = MF.getRegInfo();
3446   SmallVector<const ValueMapping*, 2> OpdsMapping(2);
3447   unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
3448   Register PtrReg = MI.getOperand(1).getReg();
3449   LLT PtrTy = MRI.getType(PtrReg);
3450   unsigned AS = PtrTy.getAddressSpace();
3451   unsigned PtrSize = PtrTy.getSizeInBits();
3452 
3453   const ValueMapping *ValMapping;
3454   const ValueMapping *PtrMapping;
3455 
3456   const RegisterBank *PtrBank = getRegBank(PtrReg, MRI, *TRI);
3457 
3458   if (PtrBank == &AMDGPU::SGPRRegBank && AMDGPU::isFlatGlobalAddrSpace(AS)) {
3459     if (isScalarLoadLegal(MI)) {
3460       // We have a uniform instruction so we want to use an SMRD load
3461       ValMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
3462       PtrMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize);
3463     } else {
3464       ValMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3465 
3466       // If we're using MUBUF instructions for global memory, an SGPR base
3467       // register is possible. Otherwise this needs to be a VGPR.
3468       unsigned PtrBankID = Subtarget.useFlatForGlobal() ?
3469         AMDGPU::VGPRRegBankID : AMDGPU::SGPRRegBankID;
3470 
3471       PtrMapping = AMDGPU::getValueMapping(PtrBankID, PtrSize);
3472     }
3473   } else {
3474     ValMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3475     PtrMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize);
3476   }
3477 
3478   OpdsMapping[0] = ValMapping;
3479   OpdsMapping[1] = PtrMapping;
3480   const RegisterBankInfo::InstructionMapping &Mapping = getInstructionMapping(
3481       1, 1, getOperandsMapping(OpdsMapping), MI.getNumOperands());
3482   return Mapping;
3483 
3484   // FIXME: Do we want to add a mapping for FLAT load, or should we just
3485   // handle that during instruction selection?
3486 }
3487 
3488 unsigned
3489 AMDGPURegisterBankInfo::getRegBankID(Register Reg,
3490                                      const MachineRegisterInfo &MRI,
3491                                      unsigned Default) const {
3492   const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
3493   return Bank ? Bank->getID() : Default;
3494 }
3495 
3496 const RegisterBankInfo::ValueMapping *
3497 AMDGPURegisterBankInfo::getSGPROpMapping(Register Reg,
3498                                          const MachineRegisterInfo &MRI,
3499                                          const TargetRegisterInfo &TRI) const {
3500   // Lie and claim anything is legal, even though this needs to be an SGPR
3501   // applyMapping will have to deal with it as a waterfall loop.
3502   unsigned Bank = getRegBankID(Reg, MRI, AMDGPU::SGPRRegBankID);
3503   unsigned Size = getSizeInBits(Reg, MRI, TRI);
3504   return AMDGPU::getValueMapping(Bank, Size);
3505 }
3506 
3507 const RegisterBankInfo::ValueMapping *
3508 AMDGPURegisterBankInfo::getVGPROpMapping(Register Reg,
3509                                          const MachineRegisterInfo &MRI,
3510                                          const TargetRegisterInfo &TRI) const {
3511   unsigned Size = getSizeInBits(Reg, MRI, TRI);
3512   return AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3513 }
3514 
3515 const RegisterBankInfo::ValueMapping *
3516 AMDGPURegisterBankInfo::getAGPROpMapping(Register Reg,
3517                                          const MachineRegisterInfo &MRI,
3518                                          const TargetRegisterInfo &TRI) const {
3519   unsigned Size = getSizeInBits(Reg, MRI, TRI);
3520   return AMDGPU::getValueMapping(AMDGPU::AGPRRegBankID, Size);
3521 }
3522 
3523 ///
3524 /// This function must return a legal mapping, because
3525 /// AMDGPURegisterBankInfo::getInstrAlternativeMappings() is not called
3526 /// in RegBankSelect::Mode::Fast.  Any mapping that would cause a
3527 /// VGPR to SGPR generated is illegal.
3528 ///
3529 // Operands that must be SGPRs must accept potentially divergent VGPRs as
3530 // legal. These will be dealt with in applyMappingImpl.
3531 //
3532 const RegisterBankInfo::InstructionMapping &
3533 AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
3534   const MachineFunction &MF = *MI.getParent()->getParent();
3535   const MachineRegisterInfo &MRI = MF.getRegInfo();
3536 
3537   if (MI.isCopy() || MI.getOpcode() == AMDGPU::G_FREEZE) {
3538     // The default logic bothers to analyze impossible alternative mappings. We
3539     // want the most straightforward mapping, so just directly handle this.
3540     const RegisterBank *DstBank = getRegBank(MI.getOperand(0).getReg(), MRI,
3541                                              *TRI);
3542     const RegisterBank *SrcBank = getRegBank(MI.getOperand(1).getReg(), MRI,
3543                                              *TRI);
3544     assert(SrcBank && "src bank should have been assigned already");
3545     if (!DstBank)
3546       DstBank = SrcBank;
3547 
3548     unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
3549     if (MI.getOpcode() != AMDGPU::G_FREEZE &&
3550         cannotCopy(*DstBank, *SrcBank, Size))
3551       return getInvalidInstructionMapping();
3552 
3553     const ValueMapping &ValMap = getValueMapping(0, Size, *DstBank);
3554     unsigned OpdsMappingSize = MI.isCopy() ? 1 : 2;
3555     SmallVector<const ValueMapping *, 1> OpdsMapping(OpdsMappingSize);
3556     OpdsMapping[0] = &ValMap;
3557     if (MI.getOpcode() == AMDGPU::G_FREEZE)
3558       OpdsMapping[1] = &ValMap;
3559 
3560     return getInstructionMapping(
3561         1, /*Cost*/ 1,
3562         /*OperandsMapping*/ getOperandsMapping(OpdsMapping), OpdsMappingSize);
3563   }
3564 
3565   if (MI.isRegSequence()) {
3566     // If any input is a VGPR, the result must be a VGPR. The default handling
3567     // assumes any copy between banks is legal.
3568     unsigned BankID = AMDGPU::SGPRRegBankID;
3569 
3570     for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
3571       auto OpBank = getRegBankID(MI.getOperand(I).getReg(), MRI);
3572       // It doesn't make sense to use vcc or scc banks here, so just ignore
3573       // them.
3574       if (OpBank != AMDGPU::SGPRRegBankID) {
3575         BankID = AMDGPU::VGPRRegBankID;
3576         break;
3577       }
3578     }
3579     unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
3580 
3581     const ValueMapping &ValMap = getValueMapping(0, Size, getRegBank(BankID));
3582     return getInstructionMapping(
3583         1, /*Cost*/ 1,
3584         /*OperandsMapping*/ getOperandsMapping({&ValMap}), 1);
3585   }
3586 
3587   // The default handling is broken and doesn't handle illegal SGPR->VGPR copies
3588   // properly.
3589   //
3590   // TODO: There are additional exec masking dependencies to analyze.
3591   if (MI.getOpcode() == TargetOpcode::G_PHI) {
3592     unsigned ResultBank = AMDGPU::InvalidRegBankID;
3593     Register DstReg = MI.getOperand(0).getReg();
3594 
3595     // Sometimes the result may have already been assigned a bank.
3596     if (const RegisterBank *DstBank = getRegBank(DstReg, MRI, *TRI))
3597       ResultBank = DstBank->getID();
3598 
3599     for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
3600       Register Reg = MI.getOperand(I).getReg();
3601       const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
3602 
3603       // FIXME: Assuming VGPR for any undetermined inputs.
3604       if (!Bank || Bank->getID() == AMDGPU::VGPRRegBankID) {
3605         ResultBank = AMDGPU::VGPRRegBankID;
3606         break;
3607       }
3608 
3609       // FIXME: Need to promote SGPR case to s32
3610       unsigned OpBank = Bank->getID();
3611       ResultBank = regBankBoolUnion(ResultBank, OpBank);
3612     }
3613 
3614     assert(ResultBank != AMDGPU::InvalidRegBankID);
3615 
3616     unsigned Size = MRI.getType(DstReg).getSizeInBits();
3617 
3618     const ValueMapping &ValMap =
3619         getValueMapping(0, Size, getRegBank(ResultBank));
3620     return getInstructionMapping(
3621         1, /*Cost*/ 1,
3622         /*OperandsMapping*/ getOperandsMapping({&ValMap}), 1);
3623   }
3624 
3625   const RegisterBankInfo::InstructionMapping &Mapping = getInstrMappingImpl(MI);
3626   if (Mapping.isValid())
3627     return Mapping;
3628 
3629   SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
3630 
3631   switch (MI.getOpcode()) {
3632   default:
3633     return getInvalidInstructionMapping();
3634 
3635   case AMDGPU::G_AND:
3636   case AMDGPU::G_OR:
3637   case AMDGPU::G_XOR: {
3638     unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3639     if (Size == 1) {
3640       const RegisterBank *DstBank
3641         = getRegBank(MI.getOperand(0).getReg(), MRI, *TRI);
3642 
3643       unsigned TargetBankID = AMDGPU::InvalidRegBankID;
3644       unsigned BankLHS = AMDGPU::InvalidRegBankID;
3645       unsigned BankRHS = AMDGPU::InvalidRegBankID;
3646       if (DstBank) {
3647         TargetBankID = DstBank->getID();
3648         if (DstBank == &AMDGPU::VCCRegBank) {
3649           TargetBankID = AMDGPU::VCCRegBankID;
3650           BankLHS = AMDGPU::VCCRegBankID;
3651           BankRHS = AMDGPU::VCCRegBankID;
3652         } else {
3653           BankLHS = getRegBankID(MI.getOperand(1).getReg(), MRI,
3654                                  AMDGPU::SGPRRegBankID);
3655           BankRHS = getRegBankID(MI.getOperand(2).getReg(), MRI,
3656                                  AMDGPU::SGPRRegBankID);
3657         }
3658       } else {
3659         BankLHS = getRegBankID(MI.getOperand(1).getReg(), MRI,
3660                                AMDGPU::VCCRegBankID);
3661         BankRHS = getRegBankID(MI.getOperand(2).getReg(), MRI,
3662                                AMDGPU::VCCRegBankID);
3663 
3664         // Both inputs should be true booleans to produce a boolean result.
3665         if (BankLHS == AMDGPU::VGPRRegBankID || BankRHS == AMDGPU::VGPRRegBankID) {
3666           TargetBankID = AMDGPU::VGPRRegBankID;
3667         } else if (BankLHS == AMDGPU::VCCRegBankID || BankRHS == AMDGPU::VCCRegBankID) {
3668           TargetBankID = AMDGPU::VCCRegBankID;
3669           BankLHS = AMDGPU::VCCRegBankID;
3670           BankRHS = AMDGPU::VCCRegBankID;
3671         } else if (BankLHS == AMDGPU::SGPRRegBankID && BankRHS == AMDGPU::SGPRRegBankID) {
3672           TargetBankID = AMDGPU::SGPRRegBankID;
3673         }
3674       }
3675 
3676       OpdsMapping[0] = AMDGPU::getValueMapping(TargetBankID, Size);
3677       OpdsMapping[1] = AMDGPU::getValueMapping(BankLHS, Size);
3678       OpdsMapping[2] = AMDGPU::getValueMapping(BankRHS, Size);
3679       break;
3680     }
3681 
3682     if (Size == 64) {
3683 
3684       if (isSALUMapping(MI)) {
3685         OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::SGPRRegBankID, Size);
3686         OpdsMapping[1] = OpdsMapping[2] = OpdsMapping[0];
3687       } else {
3688         OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size);
3689         unsigned Bank1 = getRegBankID(MI.getOperand(1).getReg(), MRI /*, DefaultBankID*/);
3690         OpdsMapping[1] = AMDGPU::getValueMapping(Bank1, Size);
3691 
3692         unsigned Bank2 = getRegBankID(MI.getOperand(2).getReg(), MRI /*, DefaultBankID*/);
3693         OpdsMapping[2] = AMDGPU::getValueMapping(Bank2, Size);
3694       }
3695 
3696       break;
3697     }
3698 
3699     [[fallthrough]];
3700   }
3701   case AMDGPU::G_PTR_ADD:
3702   case AMDGPU::G_PTRMASK:
3703   case AMDGPU::G_ADD:
3704   case AMDGPU::G_SUB:
3705   case AMDGPU::G_MUL:
3706   case AMDGPU::G_SHL:
3707   case AMDGPU::G_LSHR:
3708   case AMDGPU::G_ASHR:
3709   case AMDGPU::G_UADDO:
3710   case AMDGPU::G_USUBO:
3711   case AMDGPU::G_UADDE:
3712   case AMDGPU::G_SADDE:
3713   case AMDGPU::G_USUBE:
3714   case AMDGPU::G_SSUBE:
3715   case AMDGPU::G_SMIN:
3716   case AMDGPU::G_SMAX:
3717   case AMDGPU::G_UMIN:
3718   case AMDGPU::G_UMAX:
3719   case AMDGPU::G_ABS:
3720   case AMDGPU::G_SHUFFLE_VECTOR:
3721   case AMDGPU::G_SBFX:
3722   case AMDGPU::G_UBFX:
3723     if (isSALUMapping(MI))
3724       return getDefaultMappingSOP(MI);
3725     [[fallthrough]];
3726 
3727   case AMDGPU::G_SADDSAT: // FIXME: Could lower sat ops for SALU
3728   case AMDGPU::G_SSUBSAT:
3729   case AMDGPU::G_UADDSAT:
3730   case AMDGPU::G_USUBSAT:
3731   case AMDGPU::G_FADD:
3732   case AMDGPU::G_FSUB:
3733   case AMDGPU::G_FPTOSI:
3734   case AMDGPU::G_FPTOUI:
3735   case AMDGPU::G_FMUL:
3736   case AMDGPU::G_FMA:
3737   case AMDGPU::G_FMAD:
3738   case AMDGPU::G_FSQRT:
3739   case AMDGPU::G_FFLOOR:
3740   case AMDGPU::G_FCEIL:
3741   case AMDGPU::G_FRINT:
3742   case AMDGPU::G_SITOFP:
3743   case AMDGPU::G_UITOFP:
3744   case AMDGPU::G_FPTRUNC:
3745   case AMDGPU::G_FPEXT:
3746   case AMDGPU::G_FEXP2:
3747   case AMDGPU::G_FLOG2:
3748   case AMDGPU::G_FMINNUM:
3749   case AMDGPU::G_FMAXNUM:
3750   case AMDGPU::G_FMINNUM_IEEE:
3751   case AMDGPU::G_FMAXNUM_IEEE:
3752   case AMDGPU::G_FCANONICALIZE:
3753   case AMDGPU::G_INTRINSIC_TRUNC:
3754   case AMDGPU::G_STRICT_FADD:
3755   case AMDGPU::G_STRICT_FSUB:
3756   case AMDGPU::G_STRICT_FMUL:
3757   case AMDGPU::G_STRICT_FMA:
3758   case AMDGPU::G_BSWAP: // TODO: Somehow expand for scalar?
3759   case AMDGPU::G_FSHR: // TODO: Expand for scalar
3760   case AMDGPU::G_AMDGPU_FMIN_LEGACY:
3761   case AMDGPU::G_AMDGPU_FMAX_LEGACY:
3762   case AMDGPU::G_AMDGPU_RCP_IFLAG:
3763   case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0:
3764   case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1:
3765   case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2:
3766   case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
3767   case AMDGPU::G_AMDGPU_CVT_PK_I16_I32:
3768   case AMDGPU::G_AMDGPU_SMED3:
3769     return getDefaultMappingVOP(MI);
3770   case AMDGPU::G_UMULH:
3771   case AMDGPU::G_SMULH: {
3772     if (Subtarget.hasScalarMulHiInsts() && isSALUMapping(MI))
3773       return getDefaultMappingSOP(MI);
3774     return getDefaultMappingVOP(MI);
3775   }
3776   case AMDGPU::G_AMDGPU_MAD_U64_U32:
3777   case AMDGPU::G_AMDGPU_MAD_I64_I32: {
3778     // Three possible mappings:
3779     //
3780     //  - Default SOP
3781     //  - Default VOP
3782     //  - Scalar multiply: src0 and src1 are SGPRs, the rest is VOP.
3783     //
3784     // This allows instruction selection to keep the multiplication part of the
3785     // instruction on the SALU.
3786     bool AllSalu = true;
3787     bool MulSalu = true;
3788     for (unsigned i = 0; i < 5; ++i) {
3789       Register Reg = MI.getOperand(i).getReg();
3790       if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) {
3791         if (Bank->getID() != AMDGPU::SGPRRegBankID) {
3792           AllSalu = false;
3793           if (i == 2 || i == 3) {
3794             MulSalu = false;
3795             break;
3796           }
3797         }
3798       }
3799     }
3800 
3801     if (AllSalu)
3802       return getDefaultMappingSOP(MI);
3803 
3804     // If the multiply-add is full-rate in VALU, use that even if the
3805     // multiplication part is scalar. Accumulating separately on the VALU would
3806     // take two instructions.
3807     if (!MulSalu || Subtarget.hasFullRate64Ops())
3808       return getDefaultMappingVOP(MI);
3809 
3810     // Keep the multiplication on the SALU, then accumulate on the VALU.
3811     OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 64);
3812     OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
3813     OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
3814     OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
3815     OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 64);
3816     break;
3817   }
3818   case AMDGPU::G_IMPLICIT_DEF: {
3819     unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3820     OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
3821     break;
3822   }
3823   case AMDGPU::G_FCONSTANT:
3824   case AMDGPU::G_CONSTANT:
3825   case AMDGPU::G_GLOBAL_VALUE:
3826   case AMDGPU::G_BLOCK_ADDR:
3827   case AMDGPU::G_READCYCLECOUNTER: {
3828     unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3829     OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
3830     break;
3831   }
3832   case AMDGPU::G_FRAME_INDEX: {
3833     // TODO: This should be the same as other constants, but eliminateFrameIndex
3834     // currently assumes VALU uses.
3835     unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3836     OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3837     break;
3838   }
3839   case AMDGPU::G_DYN_STACKALLOC: {
3840     // Result is always uniform, and a wave reduction is needed for the source.
3841     OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
3842     unsigned SrcBankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
3843     OpdsMapping[1] = AMDGPU::getValueMapping(SrcBankID, 32);
3844     break;
3845   }
3846   case AMDGPU::G_AMDGPU_WAVE_ADDRESS: {
3847     // This case is weird because we expect a physical register in the source,
3848     // but need to set a bank anyway.
3849     //
3850     // We could select the result to SGPR or VGPR, but for the one current use
3851     // it's more practical to always use VGPR.
3852     OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
3853     OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
3854     break;
3855   }
3856   case AMDGPU::G_INSERT: {
3857     unsigned BankID = getMappingType(MRI, MI);
3858     unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
3859     unsigned SrcSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
3860     unsigned EltSize = getSizeInBits(MI.getOperand(2).getReg(), MRI, *TRI);
3861     OpdsMapping[0] = AMDGPU::getValueMapping(BankID, DstSize);
3862     OpdsMapping[1] = AMDGPU::getValueMapping(BankID, SrcSize);
3863     OpdsMapping[2] = AMDGPU::getValueMapping(BankID, EltSize);
3864     OpdsMapping[3] = nullptr;
3865     break;
3866   }
3867   case AMDGPU::G_EXTRACT: {
3868     unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
3869     unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
3870     unsigned SrcSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
3871     OpdsMapping[0] = AMDGPU::getValueMapping(BankID, DstSize);
3872     OpdsMapping[1] = AMDGPU::getValueMapping(BankID, SrcSize);
3873     OpdsMapping[2] = nullptr;
3874     break;
3875   }
3876   case AMDGPU::G_BUILD_VECTOR:
3877   case AMDGPU::G_BUILD_VECTOR_TRUNC: {
3878     LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
3879     if (DstTy == LLT::fixed_vector(2, 16)) {
3880       unsigned DstSize = DstTy.getSizeInBits();
3881       unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
3882       unsigned Src0BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
3883       unsigned Src1BankID = getRegBankID(MI.getOperand(2).getReg(), MRI);
3884       unsigned DstBankID = regBankUnion(Src0BankID, Src1BankID);
3885 
3886       OpdsMapping[0] = AMDGPU::getValueMapping(DstBankID, DstSize);
3887       OpdsMapping[1] = AMDGPU::getValueMapping(Src0BankID, SrcSize);
3888       OpdsMapping[2] = AMDGPU::getValueMapping(Src1BankID, SrcSize);
3889       break;
3890     }
3891 
3892     [[fallthrough]];
3893   }
3894   case AMDGPU::G_MERGE_VALUES:
3895   case AMDGPU::G_CONCAT_VECTORS: {
3896     unsigned Bank = getMappingType(MRI, MI);
3897     unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3898     unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
3899 
3900     OpdsMapping[0] = AMDGPU::getValueMapping(Bank, DstSize);
3901     // Op1 and Dst should use the same register bank.
3902     for (unsigned i = 1, e = MI.getNumOperands(); i != e; ++i)
3903       OpdsMapping[i] = AMDGPU::getValueMapping(Bank, SrcSize);
3904     break;
3905   }
3906   case AMDGPU::G_BITREVERSE:
3907   case AMDGPU::G_BITCAST:
3908   case AMDGPU::G_INTTOPTR:
3909   case AMDGPU::G_PTRTOINT:
3910   case AMDGPU::G_FABS:
3911   case AMDGPU::G_FNEG: {
3912     unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3913     unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
3914     OpdsMapping[0] = OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size);
3915     break;
3916   }
3917   case AMDGPU::G_AMDGPU_FFBH_U32:
3918   case AMDGPU::G_AMDGPU_FFBL_B32:
3919   case AMDGPU::G_CTLZ_ZERO_UNDEF:
3920   case AMDGPU::G_CTTZ_ZERO_UNDEF: {
3921     unsigned Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
3922     unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
3923     OpdsMapping[0] = AMDGPU::getValueMapping(BankID, 32);
3924     OpdsMapping[1] = AMDGPU::getValueMappingSGPR64Only(BankID, Size);
3925     break;
3926   }
3927   case AMDGPU::G_CTPOP: {
3928     unsigned Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
3929     unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
3930     OpdsMapping[0] = AMDGPU::getValueMapping(BankID, 32);
3931 
3932     // This should really be getValueMappingSGPR64Only, but allowing the generic
3933     // code to handle the register split just makes using LegalizerHelper more
3934     // difficult.
3935     OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size);
3936     break;
3937   }
3938   case AMDGPU::G_TRUNC: {
3939     Register Dst = MI.getOperand(0).getReg();
3940     Register Src = MI.getOperand(1).getReg();
3941     unsigned Bank = getRegBankID(Src, MRI);
3942     unsigned DstSize = getSizeInBits(Dst, MRI, *TRI);
3943     unsigned SrcSize = getSizeInBits(Src, MRI, *TRI);
3944     OpdsMapping[0] = AMDGPU::getValueMapping(Bank, DstSize);
3945     OpdsMapping[1] = AMDGPU::getValueMapping(Bank, SrcSize);
3946     break;
3947   }
3948   case AMDGPU::G_ZEXT:
3949   case AMDGPU::G_SEXT:
3950   case AMDGPU::G_ANYEXT:
3951   case AMDGPU::G_SEXT_INREG: {
3952     Register Dst = MI.getOperand(0).getReg();
3953     Register Src = MI.getOperand(1).getReg();
3954     unsigned DstSize = getSizeInBits(Dst, MRI, *TRI);
3955     unsigned SrcSize = getSizeInBits(Src, MRI, *TRI);
3956 
3957     unsigned DstBank;
3958     const RegisterBank *SrcBank = getRegBank(Src, MRI, *TRI);
3959     assert(SrcBank);
3960     switch (SrcBank->getID()) {
3961     case AMDGPU::SGPRRegBankID:
3962       DstBank = AMDGPU::SGPRRegBankID;
3963       break;
3964     default:
3965       DstBank = AMDGPU::VGPRRegBankID;
3966       break;
3967     }
3968 
3969     // Scalar extend can use 64-bit BFE, but VGPRs require extending to
3970     // 32-bits, and then to 64.
3971     OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(DstBank, DstSize);
3972     OpdsMapping[1] = AMDGPU::getValueMappingSGPR64Only(SrcBank->getID(),
3973                                                        SrcSize);
3974     break;
3975   }
3976   case AMDGPU::G_FCMP: {
3977     unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
3978     OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
3979     OpdsMapping[1] = nullptr; // Predicate Operand.
3980     OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3981     OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3982     break;
3983   }
3984   case AMDGPU::G_IS_FPCLASS: {
3985     Register SrcReg = MI.getOperand(1).getReg();
3986     unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits();
3987     unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3988     OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, DstSize);
3989     OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
3990     break;
3991   }
3992   case AMDGPU::G_STORE: {
3993     assert(MI.getOperand(0).isReg());
3994     unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3995 
3996     // FIXME: We need to specify a different reg bank once scalar stores are
3997     // supported.
3998     const ValueMapping *ValMapping =
3999         AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4000     OpdsMapping[0] = ValMapping;
4001     OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg());
4002     break;
4003   }
4004   case AMDGPU::G_ICMP: {
4005     auto Pred = static_cast<CmpInst::Predicate>(MI.getOperand(1).getPredicate());
4006     unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
4007 
4008     // See if the result register has already been constrained to vcc, which may
4009     // happen due to control flow intrinsic lowering.
4010     unsigned DstBank = getRegBankID(MI.getOperand(0).getReg(), MRI,
4011                                     AMDGPU::SGPRRegBankID);
4012     unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI);
4013     unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI);
4014 
4015     bool CanUseSCC = DstBank == AMDGPU::SGPRRegBankID &&
4016                      Op2Bank == AMDGPU::SGPRRegBankID &&
4017                      Op3Bank == AMDGPU::SGPRRegBankID &&
4018       (Size == 32 || (Size == 64 &&
4019                       (Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE) &&
4020                       Subtarget.hasScalarCompareEq64()));
4021 
4022     DstBank = CanUseSCC ? AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID;
4023     unsigned SrcBank = CanUseSCC ? AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
4024 
4025     // TODO: Use 32-bit for scalar output size.
4026     // SCC results will need to be copied to a 32-bit SGPR virtual register.
4027     const unsigned ResultSize = 1;
4028 
4029     OpdsMapping[0] = AMDGPU::getValueMapping(DstBank, ResultSize);
4030     OpdsMapping[2] = AMDGPU::getValueMapping(SrcBank, Size);
4031     OpdsMapping[3] = AMDGPU::getValueMapping(SrcBank, Size);
4032     break;
4033   }
4034   case AMDGPU::G_EXTRACT_VECTOR_ELT: {
4035     // VGPR index can be used for waterfall when indexing a SGPR vector.
4036     unsigned SrcBankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
4037     unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4038     unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
4039     unsigned IdxSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
4040     unsigned IdxBank = getRegBankID(MI.getOperand(2).getReg(), MRI);
4041     unsigned OutputBankID = regBankUnion(SrcBankID, IdxBank);
4042 
4043     OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(OutputBankID, DstSize);
4044     OpdsMapping[1] = AMDGPU::getValueMapping(SrcBankID, SrcSize);
4045 
4046     // The index can be either if the source vector is VGPR.
4047     OpdsMapping[2] = AMDGPU::getValueMapping(IdxBank, IdxSize);
4048     break;
4049   }
4050   case AMDGPU::G_INSERT_VECTOR_ELT: {
4051     unsigned OutputBankID = isSALUMapping(MI) ?
4052       AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
4053 
4054     unsigned VecSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4055     unsigned InsertSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
4056     unsigned IdxSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits();
4057     unsigned InsertEltBankID = getRegBankID(MI.getOperand(2).getReg(), MRI);
4058     unsigned IdxBankID = getRegBankID(MI.getOperand(3).getReg(), MRI);
4059 
4060     OpdsMapping[0] = AMDGPU::getValueMapping(OutputBankID, VecSize);
4061     OpdsMapping[1] = AMDGPU::getValueMapping(OutputBankID, VecSize);
4062 
4063     // This is a weird case, because we need to break down the mapping based on
4064     // the register bank of a different operand.
4065     if (InsertSize == 64 && OutputBankID == AMDGPU::VGPRRegBankID) {
4066       OpdsMapping[2] = AMDGPU::getValueMappingSplit64(InsertEltBankID,
4067                                                       InsertSize);
4068     } else {
4069       assert(InsertSize == 32 || InsertSize == 64);
4070       OpdsMapping[2] = AMDGPU::getValueMapping(InsertEltBankID, InsertSize);
4071     }
4072 
4073     // The index can be either if the source vector is VGPR.
4074     OpdsMapping[3] = AMDGPU::getValueMapping(IdxBankID, IdxSize);
4075     break;
4076   }
4077   case AMDGPU::G_UNMERGE_VALUES: {
4078     unsigned Bank = getMappingType(MRI, MI);
4079 
4080     // Op1 and Dst should use the same register bank.
4081     // FIXME: Shouldn't this be the default? Why do we need to handle this?
4082     for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
4083       unsigned Size = getSizeInBits(MI.getOperand(i).getReg(), MRI, *TRI);
4084       OpdsMapping[i] = AMDGPU::getValueMapping(Bank, Size);
4085     }
4086     break;
4087   }
4088   case AMDGPU::G_AMDGPU_BUFFER_LOAD:
4089   case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
4090   case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE:
4091   case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
4092   case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT:
4093   case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT:
4094   case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE:
4095   case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16:
4096   case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT:
4097   case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16:
4098   case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT:
4099   case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16:
4100   case AMDGPU::G_AMDGPU_BUFFER_STORE:
4101   case AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE:
4102   case AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT:
4103   case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT:
4104   case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16: {
4105     OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4106 
4107     // rsrc
4108     OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
4109 
4110     // vindex
4111     OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4112 
4113     // voffset
4114     OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4115 
4116     // soffset
4117     OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4118 
4119     // Any remaining operands are immediates and were correctly null
4120     // initialized.
4121     break;
4122   }
4123   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP:
4124   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD:
4125   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB:
4126   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN:
4127   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN:
4128   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX:
4129   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX:
4130   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND:
4131   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR:
4132   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR:
4133   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC:
4134   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC:
4135   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD:
4136   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN:
4137   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX: {
4138     // vdata_out
4139     OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4140 
4141     // vdata_in
4142     OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
4143 
4144     // rsrc
4145     OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4146 
4147     // vindex
4148     OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4149 
4150     // voffset
4151     OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4152 
4153     // soffset
4154     OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
4155 
4156     // Any remaining operands are immediates and were correctly null
4157     // initialized.
4158     break;
4159   }
4160   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP: {
4161     // vdata_out
4162     OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4163 
4164     // vdata_in
4165     OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
4166 
4167     // cmp
4168     OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4169 
4170     // rsrc
4171     OpdsMapping[3] = getSGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4172 
4173     // vindex
4174     OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4175 
4176     // voffset
4177     OpdsMapping[5] = getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
4178 
4179     // soffset
4180     OpdsMapping[6] = getSGPROpMapping(MI.getOperand(6).getReg(), MRI, *TRI);
4181 
4182     // Any remaining operands are immediates and were correctly null
4183     // initialized.
4184     break;
4185   }
4186   case AMDGPU::G_AMDGPU_S_BUFFER_LOAD: {
4187     // Lie and claim everything is legal, even though some need to be
4188     // SGPRs. applyMapping will have to deal with it as a waterfall loop.
4189     OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
4190     OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4191 
4192     // We need to convert this to a MUBUF if either the resource of offset is
4193     // VGPR.
4194     unsigned RSrcBank = OpdsMapping[1]->BreakDown[0].RegBank->getID();
4195     unsigned OffsetBank = OpdsMapping[2]->BreakDown[0].RegBank->getID();
4196     unsigned ResultBank = regBankUnion(RSrcBank, OffsetBank);
4197 
4198     unsigned Size0 = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4199     OpdsMapping[0] = AMDGPU::getValueMapping(ResultBank, Size0);
4200     break;
4201   }
4202   case AMDGPU::G_INTRINSIC: {
4203     switch (MI.getIntrinsicID()) {
4204     default:
4205       return getInvalidInstructionMapping();
4206     case Intrinsic::amdgcn_div_fmas:
4207     case Intrinsic::amdgcn_div_fixup:
4208     case Intrinsic::amdgcn_trig_preop:
4209     case Intrinsic::amdgcn_sin:
4210     case Intrinsic::amdgcn_cos:
4211     case Intrinsic::amdgcn_log_clamp:
4212     case Intrinsic::amdgcn_rcp:
4213     case Intrinsic::amdgcn_rcp_legacy:
4214     case Intrinsic::amdgcn_sqrt:
4215     case Intrinsic::amdgcn_rsq:
4216     case Intrinsic::amdgcn_rsq_legacy:
4217     case Intrinsic::amdgcn_rsq_clamp:
4218     case Intrinsic::amdgcn_fmul_legacy:
4219     case Intrinsic::amdgcn_fma_legacy:
4220     case Intrinsic::amdgcn_ldexp:
4221     case Intrinsic::amdgcn_frexp_mant:
4222     case Intrinsic::amdgcn_frexp_exp:
4223     case Intrinsic::amdgcn_fract:
4224     case Intrinsic::amdgcn_cvt_pkrtz:
4225     case Intrinsic::amdgcn_cvt_pknorm_i16:
4226     case Intrinsic::amdgcn_cvt_pknorm_u16:
4227     case Intrinsic::amdgcn_cvt_pk_i16:
4228     case Intrinsic::amdgcn_cvt_pk_u16:
4229     case Intrinsic::amdgcn_fmed3:
4230     case Intrinsic::amdgcn_cubeid:
4231     case Intrinsic::amdgcn_cubema:
4232     case Intrinsic::amdgcn_cubesc:
4233     case Intrinsic::amdgcn_cubetc:
4234     case Intrinsic::amdgcn_sffbh:
4235     case Intrinsic::amdgcn_fmad_ftz:
4236     case Intrinsic::amdgcn_mbcnt_lo:
4237     case Intrinsic::amdgcn_mbcnt_hi:
4238     case Intrinsic::amdgcn_mul_u24:
4239     case Intrinsic::amdgcn_mul_i24:
4240     case Intrinsic::amdgcn_mulhi_u24:
4241     case Intrinsic::amdgcn_mulhi_i24:
4242     case Intrinsic::amdgcn_lerp:
4243     case Intrinsic::amdgcn_sad_u8:
4244     case Intrinsic::amdgcn_msad_u8:
4245     case Intrinsic::amdgcn_sad_hi_u8:
4246     case Intrinsic::amdgcn_sad_u16:
4247     case Intrinsic::amdgcn_qsad_pk_u16_u8:
4248     case Intrinsic::amdgcn_mqsad_pk_u16_u8:
4249     case Intrinsic::amdgcn_mqsad_u32_u8:
4250     case Intrinsic::amdgcn_cvt_pk_u8_f32:
4251     case Intrinsic::amdgcn_alignbyte:
4252     case Intrinsic::amdgcn_perm:
4253     case Intrinsic::amdgcn_fdot2:
4254     case Intrinsic::amdgcn_sdot2:
4255     case Intrinsic::amdgcn_udot2:
4256     case Intrinsic::amdgcn_sdot4:
4257     case Intrinsic::amdgcn_udot4:
4258     case Intrinsic::amdgcn_sdot8:
4259     case Intrinsic::amdgcn_udot8:
4260     case Intrinsic::amdgcn_fdot2_bf16_bf16:
4261     case Intrinsic::amdgcn_fdot2_f16_f16:
4262     case Intrinsic::amdgcn_fdot2_f32_bf16:
4263     case Intrinsic::amdgcn_sudot4:
4264     case Intrinsic::amdgcn_sudot8:
4265     case Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16:
4266     case Intrinsic::amdgcn_wmma_f16_16x16x16_f16:
4267     case Intrinsic::amdgcn_wmma_f32_16x16x16_bf16:
4268     case Intrinsic::amdgcn_wmma_f32_16x16x16_f16:
4269     case Intrinsic::amdgcn_wmma_i32_16x16x16_iu4:
4270     case Intrinsic::amdgcn_wmma_i32_16x16x16_iu8:
4271       return getDefaultMappingVOP(MI);
4272     case Intrinsic::amdgcn_sbfe:
4273     case Intrinsic::amdgcn_ubfe:
4274       if (isSALUMapping(MI))
4275         return getDefaultMappingSOP(MI);
4276       return getDefaultMappingVOP(MI);
4277     case Intrinsic::amdgcn_ds_swizzle:
4278     case Intrinsic::amdgcn_ds_permute:
4279     case Intrinsic::amdgcn_ds_bpermute:
4280     case Intrinsic::amdgcn_update_dpp:
4281     case Intrinsic::amdgcn_mov_dpp8:
4282     case Intrinsic::amdgcn_mov_dpp:
4283     case Intrinsic::amdgcn_strict_wwm:
4284     case Intrinsic::amdgcn_wwm:
4285     case Intrinsic::amdgcn_strict_wqm:
4286     case Intrinsic::amdgcn_wqm:
4287     case Intrinsic::amdgcn_softwqm:
4288     case Intrinsic::amdgcn_set_inactive:
4289     case Intrinsic::amdgcn_permlane64:
4290       return getDefaultMappingAllVGPR(MI);
4291     case Intrinsic::amdgcn_kernarg_segment_ptr:
4292     case Intrinsic::amdgcn_s_getpc:
4293     case Intrinsic::amdgcn_groupstaticsize:
4294     case Intrinsic::amdgcn_reloc_constant:
4295     case Intrinsic::returnaddress: {
4296       unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4297       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
4298       break;
4299     }
4300     case Intrinsic::amdgcn_wqm_vote: {
4301       unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4302       OpdsMapping[0] = OpdsMapping[2]
4303         = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size);
4304       break;
4305     }
4306     case Intrinsic::amdgcn_ps_live: {
4307       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
4308       break;
4309     }
4310     case Intrinsic::amdgcn_div_scale: {
4311       unsigned Dst0Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4312       unsigned Dst1Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
4313       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Dst0Size);
4314       OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Dst1Size);
4315 
4316       unsigned SrcSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits();
4317       OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
4318       OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
4319       break;
4320     }
4321     case Intrinsic::amdgcn_class: {
4322       Register Src0Reg = MI.getOperand(2).getReg();
4323       Register Src1Reg = MI.getOperand(3).getReg();
4324       unsigned Src0Size = MRI.getType(Src0Reg).getSizeInBits();
4325       unsigned Src1Size = MRI.getType(Src1Reg).getSizeInBits();
4326       unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4327       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, DstSize);
4328       OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Src0Size);
4329       OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Src1Size);
4330       break;
4331     }
4332     case Intrinsic::amdgcn_icmp:
4333     case Intrinsic::amdgcn_fcmp: {
4334       unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4335       // This is not VCCRegBank because this is not used in boolean contexts.
4336       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
4337       unsigned OpSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
4338       OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, OpSize);
4339       OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, OpSize);
4340       break;
4341     }
4342     case Intrinsic::amdgcn_readlane: {
4343       // This must be an SGPR, but accept a VGPR.
4344       Register IdxReg = MI.getOperand(3).getReg();
4345       unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits();
4346       unsigned IdxBank = getRegBankID(IdxReg, MRI, AMDGPU::SGPRRegBankID);
4347       OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize);
4348       [[fallthrough]];
4349     }
4350     case Intrinsic::amdgcn_readfirstlane: {
4351       unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4352       unsigned SrcSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
4353       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
4354       OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
4355       break;
4356     }
4357     case Intrinsic::amdgcn_writelane: {
4358       unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4359       Register SrcReg = MI.getOperand(2).getReg();
4360       unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits();
4361       unsigned SrcBank = getRegBankID(SrcReg, MRI, AMDGPU::SGPRRegBankID);
4362       Register IdxReg = MI.getOperand(3).getReg();
4363       unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits();
4364       unsigned IdxBank = getRegBankID(IdxReg, MRI, AMDGPU::SGPRRegBankID);
4365       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
4366 
4367       // These 2 must be SGPRs, but accept VGPRs. Readfirstlane will be inserted
4368       // to legalize.
4369       OpdsMapping[2] = AMDGPU::getValueMapping(SrcBank, SrcSize);
4370       OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize);
4371       OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
4372       break;
4373     }
4374     case Intrinsic::amdgcn_if_break: {
4375       unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
4376       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
4377       OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
4378       OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
4379       break;
4380     }
4381     case Intrinsic::amdgcn_permlane16:
4382     case Intrinsic::amdgcn_permlanex16: {
4383       unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
4384       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4385       OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4386       OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4387       OpdsMapping[4] = getSGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4388       OpdsMapping[5] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4389       break;
4390     }
4391     case Intrinsic::amdgcn_mfma_f32_4x4x1f32:
4392     case Intrinsic::amdgcn_mfma_f32_4x4x4f16:
4393     case Intrinsic::amdgcn_mfma_i32_4x4x4i8:
4394     case Intrinsic::amdgcn_mfma_f32_4x4x2bf16:
4395     case Intrinsic::amdgcn_mfma_f32_16x16x1f32:
4396     case Intrinsic::amdgcn_mfma_f32_16x16x4f32:
4397     case Intrinsic::amdgcn_mfma_f32_16x16x4f16:
4398     case Intrinsic::amdgcn_mfma_f32_16x16x16f16:
4399     case Intrinsic::amdgcn_mfma_i32_16x16x4i8:
4400     case Intrinsic::amdgcn_mfma_i32_16x16x16i8:
4401     case Intrinsic::amdgcn_mfma_f32_16x16x2bf16:
4402     case Intrinsic::amdgcn_mfma_f32_16x16x8bf16:
4403     case Intrinsic::amdgcn_mfma_f32_32x32x1f32:
4404     case Intrinsic::amdgcn_mfma_f32_32x32x2f32:
4405     case Intrinsic::amdgcn_mfma_f32_32x32x4f16:
4406     case Intrinsic::amdgcn_mfma_f32_32x32x8f16:
4407     case Intrinsic::amdgcn_mfma_i32_32x32x4i8:
4408     case Intrinsic::amdgcn_mfma_i32_32x32x8i8:
4409     case Intrinsic::amdgcn_mfma_f32_32x32x2bf16:
4410     case Intrinsic::amdgcn_mfma_f32_32x32x4bf16:
4411     case Intrinsic::amdgcn_mfma_f32_32x32x4bf16_1k:
4412     case Intrinsic::amdgcn_mfma_f32_16x16x4bf16_1k:
4413     case Intrinsic::amdgcn_mfma_f32_4x4x4bf16_1k:
4414     case Intrinsic::amdgcn_mfma_f32_32x32x8bf16_1k:
4415     case Intrinsic::amdgcn_mfma_f32_16x16x16bf16_1k:
4416     case Intrinsic::amdgcn_mfma_f64_16x16x4f64:
4417     case Intrinsic::amdgcn_mfma_f64_4x4x4f64:
4418     case Intrinsic::amdgcn_mfma_i32_16x16x32_i8:
4419     case Intrinsic::amdgcn_mfma_i32_32x32x16_i8:
4420     case Intrinsic::amdgcn_mfma_f32_16x16x8_xf32:
4421     case Intrinsic::amdgcn_mfma_f32_32x32x4_xf32:
4422     case Intrinsic::amdgcn_mfma_f32_16x16x32_bf8_bf8:
4423     case Intrinsic::amdgcn_mfma_f32_16x16x32_bf8_fp8:
4424     case Intrinsic::amdgcn_mfma_f32_16x16x32_fp8_bf8:
4425     case Intrinsic::amdgcn_mfma_f32_16x16x32_fp8_fp8:
4426     case Intrinsic::amdgcn_mfma_f32_32x32x16_bf8_bf8:
4427     case Intrinsic::amdgcn_mfma_f32_32x32x16_bf8_fp8:
4428     case Intrinsic::amdgcn_mfma_f32_32x32x16_fp8_bf8:
4429     case Intrinsic::amdgcn_mfma_f32_32x32x16_fp8_fp8: {
4430       // Default for MAI intrinsics.
4431       // srcC can also be an immediate which can be folded later.
4432       // FIXME: Should we eventually add an alternative mapping with AGPR src
4433       // for srcA/srcB?
4434       //
4435       // vdst, srcA, srcB, srcC
4436       const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
4437       OpdsMapping[0] =
4438           Info->mayNeedAGPRs()
4439               ? getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI)
4440               : getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4441       OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4442       OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4443       OpdsMapping[4] =
4444           Info->mayNeedAGPRs()
4445               ? getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI)
4446               : getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4447       break;
4448     }
4449     case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16:
4450     case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16:
4451     case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16:
4452     case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16:
4453     case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8:
4454     case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8:
4455     case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_bf8:
4456     case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_fp8:
4457     case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_bf8:
4458     case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_fp8:
4459     case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8:
4460     case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8:
4461     case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8:
4462     case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8: {
4463       // vdst, srcA, srcB, srcC, idx
4464       OpdsMapping[0] = getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4465       OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4466       OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4467       OpdsMapping[4] = getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4468       OpdsMapping[5] = getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
4469       break;
4470     }
4471     case Intrinsic::amdgcn_interp_p1:
4472     case Intrinsic::amdgcn_interp_p2:
4473     case Intrinsic::amdgcn_interp_mov:
4474     case Intrinsic::amdgcn_interp_p1_f16:
4475     case Intrinsic::amdgcn_interp_p2_f16:
4476     case Intrinsic::amdgcn_lds_param_load: {
4477       const int M0Idx = MI.getNumOperands() - 1;
4478       Register M0Reg = MI.getOperand(M0Idx).getReg();
4479       unsigned M0Bank = getRegBankID(M0Reg, MRI, AMDGPU::SGPRRegBankID);
4480       unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4481 
4482       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
4483       for (int I = 2; I != M0Idx && MI.getOperand(I).isReg(); ++I)
4484         OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4485 
4486       // Must be SGPR, but we must take whatever the original bank is and fix it
4487       // later.
4488       OpdsMapping[M0Idx] = AMDGPU::getValueMapping(M0Bank, 32);
4489       break;
4490     }
4491     case Intrinsic::amdgcn_interp_inreg_p10:
4492     case Intrinsic::amdgcn_interp_inreg_p2:
4493     case Intrinsic::amdgcn_interp_inreg_p10_f16:
4494     case Intrinsic::amdgcn_interp_inreg_p2_f16: {
4495       unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4496       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
4497       OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4498       OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4499       OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4500       break;
4501     }
4502     case Intrinsic::amdgcn_ballot: {
4503       unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4504       unsigned SrcSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
4505       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
4506       OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, SrcSize);
4507       break;
4508     }
4509     }
4510     break;
4511   }
4512   case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
4513   case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16:
4514   case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE:
4515   case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16: {
4516     auto IntrID = MI.getIntrinsicID();
4517     const AMDGPU::RsrcIntrinsic *RSrcIntrin = AMDGPU::lookupRsrcIntrinsic(IntrID);
4518     assert(RSrcIntrin && "missing RsrcIntrinsic for image intrinsic");
4519     // Non-images can have complications from operands that allow both SGPR
4520     // and VGPR. For now it's too complicated to figure out the final opcode
4521     // to derive the register bank from the MCInstrDesc.
4522     assert(RSrcIntrin->IsImage);
4523     return getImageMapping(MRI, MI, RSrcIntrin->RsrcArg);
4524   }
4525   case AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY: {
4526     unsigned N = MI.getNumExplicitOperands() - 2;
4527     OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 128);
4528     OpdsMapping[N] = getSGPROpMapping(MI.getOperand(N).getReg(), MRI, *TRI);
4529     if (N == 3) {
4530       // Sequential form: all operands combined into VGPR256/VGPR512
4531       unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
4532       if (Size > 256)
4533         Size = 512;
4534       OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4535     } else {
4536       // NSA form
4537       for (unsigned I = 2; I < N; ++I) {
4538         unsigned Size = MRI.getType(MI.getOperand(I).getReg()).getSizeInBits();
4539         OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4540       }
4541     }
4542     break;
4543   }
4544   case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: {
4545     auto IntrID = MI.getIntrinsicID();
4546     switch (IntrID) {
4547     case Intrinsic::amdgcn_s_getreg:
4548     case Intrinsic::amdgcn_s_memtime:
4549     case Intrinsic::amdgcn_s_memrealtime:
4550     case Intrinsic::amdgcn_s_get_waveid_in_workgroup:
4551     case Intrinsic::amdgcn_s_sendmsg_rtn: {
4552       unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4553       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
4554       break;
4555     }
4556     case Intrinsic::amdgcn_global_atomic_fadd:
4557     case Intrinsic::amdgcn_global_atomic_csub:
4558     case Intrinsic::amdgcn_global_atomic_fmin:
4559     case Intrinsic::amdgcn_global_atomic_fmax:
4560     case Intrinsic::amdgcn_flat_atomic_fadd:
4561     case Intrinsic::amdgcn_flat_atomic_fmin:
4562     case Intrinsic::amdgcn_flat_atomic_fmax:
4563     case Intrinsic::amdgcn_global_atomic_fadd_v2bf16:
4564     case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16:
4565       return getDefaultMappingAllVGPR(MI);
4566     case Intrinsic::amdgcn_ds_ordered_add:
4567     case Intrinsic::amdgcn_ds_ordered_swap:
4568     case Intrinsic::amdgcn_ds_fadd_v2bf16: {
4569       unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4570       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
4571       unsigned M0Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
4572                                  AMDGPU::SGPRRegBankID);
4573       OpdsMapping[2] = AMDGPU::getValueMapping(M0Bank, 32);
4574       OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4575       break;
4576     }
4577     case Intrinsic::amdgcn_ds_append:
4578     case Intrinsic::amdgcn_ds_consume: {
4579       unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4580       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
4581       OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4582       break;
4583     }
4584     case Intrinsic::amdgcn_exp_compr:
4585       OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4586       OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4587       break;
4588     case Intrinsic::amdgcn_exp:
4589       // FIXME: Could we support packed types here?
4590       OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4591       OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4592       OpdsMapping[5] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4593       OpdsMapping[6] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4594       break;
4595     case Intrinsic::amdgcn_exp_row:
4596       OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4597       OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4598       OpdsMapping[5] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4599       OpdsMapping[6] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4600       OpdsMapping[8] = getSGPROpMapping(MI.getOperand(8).getReg(), MRI, *TRI);
4601       break;
4602     case Intrinsic::amdgcn_s_sendmsg:
4603     case Intrinsic::amdgcn_s_sendmsghalt: {
4604       // This must be an SGPR, but accept a VGPR.
4605       unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
4606                                    AMDGPU::SGPRRegBankID);
4607       OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
4608       break;
4609     }
4610     case Intrinsic::amdgcn_s_setreg: {
4611       // This must be an SGPR, but accept a VGPR.
4612       unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
4613                                    AMDGPU::SGPRRegBankID);
4614       OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
4615       break;
4616     }
4617     case Intrinsic::amdgcn_end_cf: {
4618       unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
4619       OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
4620       break;
4621     }
4622     case Intrinsic::amdgcn_else: {
4623       unsigned WaveSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
4624       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
4625       OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize);
4626       OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize);
4627       break;
4628     }
4629     case Intrinsic::amdgcn_live_mask: {
4630       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
4631       break;
4632     }
4633     case Intrinsic::amdgcn_wqm_demote:
4634     case Intrinsic::amdgcn_kill: {
4635       OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
4636       break;
4637     }
4638     case Intrinsic::amdgcn_raw_buffer_load:
4639     case Intrinsic::amdgcn_raw_tbuffer_load: {
4640       // FIXME: Should make intrinsic ID the last operand of the instruction,
4641       // then this would be the same as store
4642       OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4643       OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4644       OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4645       OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4646       break;
4647     }
4648     case Intrinsic::amdgcn_raw_buffer_load_lds: {
4649       OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
4650       OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4651       OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4652       OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
4653       break;
4654     }
4655     case Intrinsic::amdgcn_raw_buffer_store:
4656     case Intrinsic::amdgcn_raw_buffer_store_format:
4657     case Intrinsic::amdgcn_raw_tbuffer_store: {
4658       OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
4659       OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4660       OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4661       OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4662       break;
4663     }
4664     case Intrinsic::amdgcn_struct_buffer_load:
4665     case Intrinsic::amdgcn_struct_tbuffer_load: {
4666       OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4667       OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4668       OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4669       OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4670       OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
4671       break;
4672     }
4673     case Intrinsic::amdgcn_struct_buffer_load_lds: {
4674       OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
4675       OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4676       OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4677       OpdsMapping[5] = getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
4678       OpdsMapping[6] = getSGPROpMapping(MI.getOperand(6).getReg(), MRI, *TRI);
4679       break;
4680     }
4681     case Intrinsic::amdgcn_struct_buffer_store:
4682     case Intrinsic::amdgcn_struct_tbuffer_store: {
4683       OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
4684       OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4685       OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4686       OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4687       OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
4688       break;
4689     }
4690     case Intrinsic::amdgcn_init_exec_from_input: {
4691       unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
4692       OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
4693       break;
4694     }
4695     case Intrinsic::amdgcn_ds_gws_init:
4696     case Intrinsic::amdgcn_ds_gws_barrier:
4697     case Intrinsic::amdgcn_ds_gws_sema_br: {
4698       OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4699 
4700       // This must be an SGPR, but accept a VGPR.
4701       unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
4702                                    AMDGPU::SGPRRegBankID);
4703       OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
4704       break;
4705     }
4706     case Intrinsic::amdgcn_ds_gws_sema_v:
4707     case Intrinsic::amdgcn_ds_gws_sema_p:
4708     case Intrinsic::amdgcn_ds_gws_sema_release_all: {
4709       // This must be an SGPR, but accept a VGPR.
4710       unsigned Bank = getRegBankID(MI.getOperand(1).getReg(), MRI,
4711                                    AMDGPU::SGPRRegBankID);
4712       OpdsMapping[1] = AMDGPU::getValueMapping(Bank, 32);
4713       break;
4714     }
4715     case Intrinsic::amdgcn_global_load_lds: {
4716       OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
4717       OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4718       break;
4719     }
4720     case Intrinsic::amdgcn_lds_direct_load: {
4721       const int M0Idx = MI.getNumOperands() - 1;
4722       Register M0Reg = MI.getOperand(M0Idx).getReg();
4723       unsigned M0Bank = getRegBankID(M0Reg, MRI, AMDGPU::SGPRRegBankID);
4724       unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4725 
4726       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
4727       for (int I = 2; I != M0Idx && MI.getOperand(I).isReg(); ++I)
4728         OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4729 
4730       // Must be SGPR, but we must take whatever the original bank is and fix it
4731       // later.
4732       OpdsMapping[M0Idx] = AMDGPU::getValueMapping(M0Bank, 32);
4733       break;
4734     }
4735     case Intrinsic::amdgcn_ds_add_gs_reg_rtn:
4736     case Intrinsic::amdgcn_ds_sub_gs_reg_rtn:
4737       OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4738       OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4739       break;
4740     case Intrinsic::amdgcn_ds_bvh_stack_rtn: {
4741       OpdsMapping[0] =
4742           getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); // %vdst
4743       OpdsMapping[1] =
4744           getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI); // %addr
4745       OpdsMapping[3] =
4746           getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); // %addr
4747       OpdsMapping[4] =
4748           getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); // %data0
4749       OpdsMapping[5] =
4750           getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI); // %data1
4751       break;
4752     }
4753 
4754     default:
4755       return getInvalidInstructionMapping();
4756     }
4757     break;
4758   }
4759   case AMDGPU::G_SELECT: {
4760     unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4761     unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
4762                                     AMDGPU::SGPRRegBankID);
4763     unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI,
4764                                     AMDGPU::SGPRRegBankID);
4765     bool SGPRSrcs = Op2Bank == AMDGPU::SGPRRegBankID &&
4766                     Op3Bank == AMDGPU::SGPRRegBankID;
4767 
4768     unsigned CondBankDefault = SGPRSrcs ?
4769       AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID;
4770     unsigned CondBank = getRegBankID(MI.getOperand(1).getReg(), MRI,
4771                                      CondBankDefault);
4772     if (CondBank == AMDGPU::SGPRRegBankID)
4773       CondBank = SGPRSrcs ? AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID;
4774     else if (CondBank == AMDGPU::VGPRRegBankID)
4775       CondBank = AMDGPU::VCCRegBankID;
4776 
4777     unsigned Bank = SGPRSrcs && CondBank == AMDGPU::SGPRRegBankID ?
4778       AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
4779 
4780     assert(CondBank == AMDGPU::VCCRegBankID || CondBank == AMDGPU::SGPRRegBankID);
4781 
4782     // TODO: Should report 32-bit for scalar condition type.
4783     if (Size == 64) {
4784       OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
4785       OpdsMapping[1] = AMDGPU::getValueMapping(CondBank, 1);
4786       OpdsMapping[2] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
4787       OpdsMapping[3] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
4788     } else {
4789       OpdsMapping[0] = AMDGPU::getValueMapping(Bank, Size);
4790       OpdsMapping[1] = AMDGPU::getValueMapping(CondBank, 1);
4791       OpdsMapping[2] = AMDGPU::getValueMapping(Bank, Size);
4792       OpdsMapping[3] = AMDGPU::getValueMapping(Bank, Size);
4793     }
4794 
4795     break;
4796   }
4797 
4798   case AMDGPU::G_SI_CALL: {
4799     OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 64);
4800     // Lie and claim everything is legal, even though some need to be
4801     // SGPRs. applyMapping will have to deal with it as a waterfall loop.
4802     OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
4803 
4804     // Allow anything for implicit arguments
4805     for (unsigned I = 4; I < MI.getNumOperands(); ++I) {
4806       if (MI.getOperand(I).isReg()) {
4807         Register Reg = MI.getOperand(I).getReg();
4808         auto OpBank = getRegBankID(Reg, MRI);
4809         unsigned Size = getSizeInBits(Reg, MRI, *TRI);
4810         OpdsMapping[I] = AMDGPU::getValueMapping(OpBank, Size);
4811       }
4812     }
4813     break;
4814   }
4815   case AMDGPU::G_LOAD:
4816   case AMDGPU::G_ZEXTLOAD:
4817   case AMDGPU::G_SEXTLOAD:
4818     return getInstrMappingForLoad(MI);
4819 
4820   case AMDGPU::G_ATOMICRMW_XCHG:
4821   case AMDGPU::G_ATOMICRMW_ADD:
4822   case AMDGPU::G_ATOMICRMW_SUB:
4823   case AMDGPU::G_ATOMICRMW_AND:
4824   case AMDGPU::G_ATOMICRMW_OR:
4825   case AMDGPU::G_ATOMICRMW_XOR:
4826   case AMDGPU::G_ATOMICRMW_MAX:
4827   case AMDGPU::G_ATOMICRMW_MIN:
4828   case AMDGPU::G_ATOMICRMW_UMAX:
4829   case AMDGPU::G_ATOMICRMW_UMIN:
4830   case AMDGPU::G_ATOMICRMW_FADD:
4831   case AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG:
4832   case AMDGPU::G_AMDGPU_ATOMIC_INC:
4833   case AMDGPU::G_AMDGPU_ATOMIC_DEC:
4834   case AMDGPU::G_AMDGPU_ATOMIC_FMIN:
4835   case AMDGPU::G_AMDGPU_ATOMIC_FMAX: {
4836     OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4837     OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg());
4838     OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4839     break;
4840   }
4841   case AMDGPU::G_ATOMIC_CMPXCHG: {
4842     OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4843     OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg());
4844     OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4845     OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4846     break;
4847   }
4848   case AMDGPU::G_BRCOND: {
4849     unsigned Bank = getRegBankID(MI.getOperand(0).getReg(), MRI,
4850                                  AMDGPU::SGPRRegBankID);
4851     assert(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1);
4852     if (Bank != AMDGPU::SGPRRegBankID)
4853       Bank = AMDGPU::VCCRegBankID;
4854 
4855     OpdsMapping[0] = AMDGPU::getValueMapping(Bank, 1);
4856     break;
4857   }
4858   case AMDGPU::G_FPTRUNC_ROUND_UPWARD:
4859   case AMDGPU::G_FPTRUNC_ROUND_DOWNWARD:
4860     return getDefaultMappingVOP(MI);
4861   }
4862 
4863   return getInstructionMapping(/*ID*/1, /*Cost*/1,
4864                                getOperandsMapping(OpdsMapping),
4865                                MI.getNumOperands());
4866 }
4867