1 //===- AMDGPURegisterBankInfo.cpp -------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements the targeting of the RegisterBankInfo class for
10 /// AMDGPU.
11 ///
12 /// \par
13 ///
/// AMDGPU has unique register bank constraints that require special high-level
/// strategies to deal with. There are two main true physical register banks:
/// VGPR (vector) and SGPR (scalar). Additionally, the VCC register bank is a
/// sort of pseudo-register bank needed to represent SGPRs used in a vector
/// boolean context. There is also the AGPR bank, which is a special-purpose
/// physical register bank present on some subtargets.
20 ///
21 /// Copying from VGPR to SGPR is generally illegal, unless the value is known to
22 /// be uniform. It is generally not valid to legalize operands by inserting
23 /// copies as on other targets. Operations which require uniform, SGPR operands
24 /// generally require scalarization by repeatedly executing the instruction,
25 /// activating each set of lanes using a unique set of input values. This is
26 /// referred to as a waterfall loop.
27 ///
28 /// \par Booleans
29 ///
/// Booleans (s1 values) require special consideration. A vector compare result
/// is naturally a bitmask with one bit per lane, in a 32 or 64-bit
/// register. These are represented with the VCC bank. During selection, we need
/// to be able to unambiguously go back from a register class to a register
/// bank. To distinguish whether an SGPR should use the SGPR or VCC register
/// bank, we need to know the use context type. An SGPR s1 value always means a
/// VCC bank value; otherwise it will be the SGPR bank. A scalar compare sets
37 /// SCC, which is a 1-bit unaddressable register. This will need to be copied to
38 /// a 32-bit virtual register. Taken together, this means we need to adjust the
39 /// type of boolean operations to be regbank legal. All SALU booleans need to be
40 /// widened to 32-bits, and all VALU booleans need to be s1 values.
41 ///
42 /// A noteworthy exception to the s1-means-vcc rule is for legalization artifact
43 /// casts. G_TRUNC s1 results, and G_SEXT/G_ZEXT/G_ANYEXT sources are never vcc
44 /// bank. A non-boolean source (such as a truncate from a 1-bit load from
45 /// memory) will require a copy to the VCC bank which will require clearing the
46 /// high bits and inserting a compare.
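///
/// As an illustrative sketch (not literal RegBankSelect output), a divergent
/// compare keeps its boolean result in the VCC bank and can feed vector
/// operations directly:
///
///   %c:vcc(s1) = G_ICMP intpred(eq), %a:vgpr(s32), %b:vgpr(s32)
///   %r:vgpr(s32) = G_SELECT %c:vcc(s1), %x:vgpr(s32), %y:vgpr(s32)
///
/// while a uniform compare has its boolean widened to a 32-bit SGPR value:
///
///   %c:sgpr(s32) = G_ICMP intpred(eq), %a:sgpr(s32), %b:sgpr(s32)
///   %r:sgpr(s32) = G_SELECT %c:sgpr(s32), %x:sgpr(s32), %y:sgpr(s32)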
47 ///
48 /// \par Constant bus restriction
49 ///
50 /// VALU instructions have a limitation known as the constant bus
51 /// restriction. Most VALU instructions can use SGPR operands, but may read at
/// most 1 SGPR or constant literal value (this increases to 2 on gfx10 for
/// most instructions). This is one unique SGPR, so the same SGPR may be used
/// for multiple operands. From a register bank perspective, any combination of
/// operands should be legal as an SGPR, but this is contextually dependent on
/// the SGPR operands all being the same register. It is therefore optimal to
/// choose the SGPR with the most uses to minimize the number of copies.
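///
/// As an illustrative sketch (ignoring encoding details), on targets limited
/// to a single constant bus read (pre-gfx10):
///
///   v_add_f32 v0, s0, s1   (illegal: two unique SGPRs read)
///   v_add_f32 v0, s0, s0   (legal: one unique SGPR, read twice)
///   v_add_f32 v0, s0, v1   (legal: one SGPR and one VGPR)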
58 ///
59 /// We avoid trying to solve this problem in RegBankSelect. Any VALU G_*
60 /// operation should have its source operands all mapped to VGPRs (except for
/// VCC), inserting copies from any SGPR operands. This is the most trivial
/// legal mapping. Anything beyond the simplest 1:1 instruction selection would
/// be too complicated to solve here. Every optimization pattern or instruction
/// selected to multiple outputs would have to enforce this rule, and there
65 /// would be additional complexity in tracking this rule for every G_*
66 /// operation. By forcing all inputs to VGPRs, it also simplifies the task of
67 /// picking the optimal operand combination from a post-isel optimization pass.
68 ///
69 //===----------------------------------------------------------------------===//
70 
71 #include "AMDGPURegisterBankInfo.h"
72 
73 #include "AMDGPU.h"
74 #include "AMDGPUGlobalISelUtils.h"
75 #include "AMDGPUInstrInfo.h"
76 #include "GCNSubtarget.h"
77 #include "SIMachineFunctionInfo.h"
78 #include "SIRegisterInfo.h"
79 #include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
80 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
81 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
82 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
83 #include "llvm/CodeGen/RegisterBank.h"
84 #include "llvm/IR/IntrinsicsAMDGPU.h"
85 
86 #define GET_TARGET_REGBANK_IMPL
87 #include "AMDGPUGenRegisterBank.inc"
88 
89 // This file will be TableGen'ed at some point.
90 #include "AMDGPUGenRegisterBankInfo.def"
91 
92 using namespace llvm;
93 using namespace MIPatternMatch;
94 
95 namespace {
96 
// Observer to apply a register bank to new registers created by
// LegalizerHelper.
98 class ApplyRegBankMapping final : public GISelChangeObserver {
99 private:
100   const AMDGPURegisterBankInfo &RBI;
101   MachineRegisterInfo &MRI;
102   const RegisterBank *NewBank;
103   SmallVector<MachineInstr *, 4> NewInsts;
104 
105 public:
106   ApplyRegBankMapping(const AMDGPURegisterBankInfo &RBI_,
107                       MachineRegisterInfo &MRI_, const RegisterBank *RB)
108     : RBI(RBI_), MRI(MRI_), NewBank(RB) {}
109 
110   ~ApplyRegBankMapping() {
111     for (MachineInstr *MI : NewInsts)
112       applyBank(*MI);
113   }
114 
  /// Set any registers that don't have a set register class or bank to the
  /// newly assigned bank.
116   void applyBank(MachineInstr &MI) {
117     const unsigned Opc = MI.getOpcode();
118     if (Opc == AMDGPU::G_ANYEXT || Opc == AMDGPU::G_ZEXT ||
119         Opc == AMDGPU::G_SEXT) {
120       // LegalizerHelper wants to use the basic legalization artifacts when
121       // widening etc. We don't handle selection with vcc in artifact sources,
122       // so we need to use a select instead to handle these properly.
123       Register DstReg = MI.getOperand(0).getReg();
124       Register SrcReg = MI.getOperand(1).getReg();
125       const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, MRI, *RBI.TRI);
126       if (SrcBank == &AMDGPU::VCCRegBank) {
127         const LLT S32 = LLT::scalar(32);
128         assert(MRI.getType(SrcReg) == LLT::scalar(1));
129         assert(MRI.getType(DstReg) == S32);
130         assert(NewBank == &AMDGPU::VGPRRegBank);
131 
132         // Replace the extension with a select, which really uses the boolean
133         // source.
134         MachineIRBuilder B(MI);
135         auto True = B.buildConstant(S32, Opc == AMDGPU::G_SEXT ? -1 : 1);
136         auto False = B.buildConstant(S32, 0);
137         B.buildSelect(DstReg, SrcReg, True, False);
138         MRI.setRegBank(True.getReg(0), *NewBank);
139         MRI.setRegBank(False.getReg(0), *NewBank);
140         MI.eraseFromParent();
141       }
142 
143       assert(!MRI.getRegClassOrRegBank(DstReg));
144       MRI.setRegBank(DstReg, *NewBank);
145       return;
146     }
147 
148 #ifndef NDEBUG
149     if (Opc == AMDGPU::G_TRUNC) {
150       Register DstReg = MI.getOperand(0).getReg();
151       const RegisterBank *DstBank = RBI.getRegBank(DstReg, MRI, *RBI.TRI);
152       assert(DstBank != &AMDGPU::VCCRegBank);
153     }
154 #endif
155 
156     for (MachineOperand &Op : MI.operands()) {
157       if (!Op.isReg())
158         continue;
159 
160       // We may see physical registers if building a real MI
161       Register Reg = Op.getReg();
162       if (Reg.isPhysical() || MRI.getRegClassOrRegBank(Reg))
163         continue;
164 
165       const RegisterBank *RB = NewBank;
166       if (MRI.getType(Reg) == LLT::scalar(1)) {
167         assert(NewBank == &AMDGPU::VGPRRegBank &&
168                "s1 operands should only be used for vector bools");
169         assert((MI.getOpcode() != AMDGPU::G_TRUNC &&
170                 MI.getOpcode() != AMDGPU::G_ANYEXT) &&
171                "not expecting legalization artifacts here");
172         RB = &AMDGPU::VCCRegBank;
173       }
174 
175       MRI.setRegBank(Reg, *RB);
176     }
177   }
178 
179   void erasingInstr(MachineInstr &MI) override {}
180 
181   void createdInstr(MachineInstr &MI) override {
182     // At this point, the instruction was just inserted and has no operands.
183     NewInsts.push_back(&MI);
184   }
185 
186   void changingInstr(MachineInstr &MI) override {}
187   void changedInstr(MachineInstr &MI) override {
188     // FIXME: In principle we should probably add the instruction to NewInsts,
189     // but the way the LegalizerHelper uses the observer, we will always see the
190     // registers we need to set the regbank on also referenced in a new
191     // instruction.
192   }
193 };
194 
195 }
196 AMDGPURegisterBankInfo::AMDGPURegisterBankInfo(const GCNSubtarget &ST)
197     : Subtarget(ST), TRI(Subtarget.getRegisterInfo()),
198       TII(Subtarget.getInstrInfo()) {
199 
200   // HACK: Until this is fully tablegen'd.
201   static llvm::once_flag InitializeRegisterBankFlag;
202 
203   static auto InitializeRegisterBankOnce = [this]() {
204     assert(&getRegBank(AMDGPU::SGPRRegBankID) == &AMDGPU::SGPRRegBank &&
205            &getRegBank(AMDGPU::VGPRRegBankID) == &AMDGPU::VGPRRegBank &&
206            &getRegBank(AMDGPU::AGPRRegBankID) == &AMDGPU::AGPRRegBank);
207     (void)this;
208   };
209 
210   llvm::call_once(InitializeRegisterBankFlag, InitializeRegisterBankOnce);
211 }
212 
213 static bool isVectorRegisterBank(const RegisterBank &Bank) {
214   unsigned BankID = Bank.getID();
215   return BankID == AMDGPU::VGPRRegBankID || BankID == AMDGPU::AGPRRegBankID;
216 }
217 
218 bool AMDGPURegisterBankInfo::isDivergentRegBank(const RegisterBank *RB) const {
219   return RB != &AMDGPU::SGPRRegBank;
220 }
221 
222 unsigned AMDGPURegisterBankInfo::copyCost(const RegisterBank &Dst,
223                                           const RegisterBank &Src,
224                                           unsigned Size) const {
225   // TODO: Should there be a UniformVGPRRegBank which can use readfirstlane?
226   if (Dst.getID() == AMDGPU::SGPRRegBankID &&
227       (isVectorRegisterBank(Src) || Src.getID() == AMDGPU::VCCRegBankID)) {
228     return std::numeric_limits<unsigned>::max();
229   }
230 
231   // Bool values are tricky, because the meaning is based on context. The SCC
232   // and VCC banks are for the natural scalar and vector conditions produced by
233   // a compare.
234   //
235   // Legalization doesn't know about the necessary context, so an s1 use may
236   // have been a truncate from an arbitrary value, in which case a copy (lowered
237   // as a compare with 0) needs to be inserted.
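  //
  // As a rough sketch (not the literal lowering), producing a vcc(s1) value
  // from an arbitrary sgpr(s1) value requires clearing the high bits of the
  // wider source and comparing with zero, e.g.:
  //   %masked:sgpr(s32) = G_AND %val:sgpr(s32), 1
  //   %cc:vcc(s1) = G_ICMP intpred(ne), %masked:sgpr(s32), 0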
238   if (Size == 1 &&
239       (Dst.getID() == AMDGPU::SGPRRegBankID) &&
240       (isVectorRegisterBank(Src) ||
241        Src.getID() == AMDGPU::SGPRRegBankID ||
242        Src.getID() == AMDGPU::VCCRegBankID))
243     return std::numeric_limits<unsigned>::max();
244 
245   // There is no direct copy between AGPRs.
246   if (Dst.getID() == AMDGPU::AGPRRegBankID &&
247       Src.getID() == AMDGPU::AGPRRegBankID)
248     return 4;
249 
250   return RegisterBankInfo::copyCost(Dst, Src, Size);
251 }
252 
253 unsigned AMDGPURegisterBankInfo::getBreakDownCost(
254   const ValueMapping &ValMapping,
255   const RegisterBank *CurBank) const {
256   // Check if this is a breakdown for G_LOAD to move the pointer from SGPR to
257   // VGPR.
258   // FIXME: Is there a better way to do this?
259   if (ValMapping.NumBreakDowns >= 2 || ValMapping.BreakDown[0].Length >= 64)
260     return 10; // This is expensive.
261 
262   assert(ValMapping.NumBreakDowns == 2 &&
263          ValMapping.BreakDown[0].Length == 32 &&
264          ValMapping.BreakDown[0].StartIdx == 0 &&
265          ValMapping.BreakDown[1].Length == 32 &&
266          ValMapping.BreakDown[1].StartIdx == 32 &&
267          ValMapping.BreakDown[0].RegBank == ValMapping.BreakDown[1].RegBank);
268 
269   // 32-bit extract of a 64-bit value is just access of a subregister, so free.
270   // TODO: Cost of 0 hits assert, though it's not clear it's what we really
271   // want.
272 
273   // TODO: 32-bit insert to a 64-bit SGPR may incur a non-free copy due to SGPR
274   // alignment restrictions, but this probably isn't important.
275   return 1;
276 }
277 
278 const RegisterBank &
279 AMDGPURegisterBankInfo::getRegBankFromRegClass(const TargetRegisterClass &RC,
280                                                LLT Ty) const {
281   if (&RC == &AMDGPU::SReg_1RegClass)
282     return AMDGPU::VCCRegBank;
283 
284   // We promote real scalar booleans to SReg_32. Any SGPR using s1 is really a
285   // VCC-like use.
286   if (TRI->isSGPRClass(&RC)) {
287     // FIXME: This probably came from a copy from a physical register, which
288     // should be inferable from the copied to-type. We don't have many boolean
289     // physical register constraints so just assume a normal SGPR for now.
290     if (!Ty.isValid())
291       return AMDGPU::SGPRRegBank;
292 
293     return Ty == LLT::scalar(1) ? AMDGPU::VCCRegBank : AMDGPU::SGPRRegBank;
294   }
295 
296   return TRI->isAGPRClass(&RC) ? AMDGPU::AGPRRegBank : AMDGPU::VGPRRegBank;
297 }
298 
299 template <unsigned NumOps>
300 RegisterBankInfo::InstructionMappings
301 AMDGPURegisterBankInfo::addMappingFromTable(
302     const MachineInstr &MI, const MachineRegisterInfo &MRI,
303     const std::array<unsigned, NumOps> RegSrcOpIdx,
304     ArrayRef<OpRegBankEntry<NumOps>> Table) const {
305 
306   InstructionMappings AltMappings;
307 
308   SmallVector<const ValueMapping *, 10> Operands(MI.getNumOperands());
309 
310   unsigned Sizes[NumOps];
311   for (unsigned I = 0; I < NumOps; ++I) {
312     Register Reg = MI.getOperand(RegSrcOpIdx[I]).getReg();
313     Sizes[I] = getSizeInBits(Reg, MRI, *TRI);
314   }
315 
316   for (unsigned I = 0, E = MI.getNumExplicitDefs(); I != E; ++I) {
317     unsigned SizeI = getSizeInBits(MI.getOperand(I).getReg(), MRI, *TRI);
318     Operands[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SizeI);
319   }
320 
321   // getInstrMapping's default mapping uses ID 1, so start at 2.
322   unsigned MappingID = 2;
323   for (const auto &Entry : Table) {
324     for (unsigned I = 0; I < NumOps; ++I) {
325       int OpIdx = RegSrcOpIdx[I];
326       Operands[OpIdx] = AMDGPU::getValueMapping(Entry.RegBanks[I], Sizes[I]);
327     }
328 
329     AltMappings.push_back(&getInstructionMapping(MappingID++, Entry.Cost,
330                                                  getOperandsMapping(Operands),
331                                                  Operands.size()));
332   }
333 
334   return AltMappings;
335 }
336 
337 RegisterBankInfo::InstructionMappings
338 AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsic(
339     const MachineInstr &MI, const MachineRegisterInfo &MRI) const {
340   switch (MI.getIntrinsicID()) {
341   case Intrinsic::amdgcn_readlane: {
342     static const OpRegBankEntry<3> Table[2] = {
343       // Perfectly legal.
344       { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },
345 
346       // Need a readfirstlane for the index.
347       { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 }
348     };
349 
350     const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 2, 3 } };
351     return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, Table);
352   }
353   case Intrinsic::amdgcn_writelane: {
354     static const OpRegBankEntry<4> Table[4] = {
355       // Perfectly legal.
356       { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },
357 
358       // Need readfirstlane of first op
359       { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 },
360 
361       // Need readfirstlane of second op
362       { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 },
363 
364       // Need readfirstlane of both ops
365       { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 3 }
366     };
367 
    // dst, value to write, lane select, old value (vdst_in)
369     const std::array<unsigned, 4> RegSrcOpIdx = { { 0, 2, 3, 4 } };
370     return addMappingFromTable<4>(MI, MRI, RegSrcOpIdx, Table);
371   }
372   default:
373     return RegisterBankInfo::getInstrAlternativeMappings(MI);
374   }
375 }
376 
377 RegisterBankInfo::InstructionMappings
378 AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsicWSideEffects(
379     const MachineInstr &MI, const MachineRegisterInfo &MRI) const {
380 
381   switch (MI.getIntrinsicID()) {
382   case Intrinsic::amdgcn_s_buffer_load: {
383     static const OpRegBankEntry<2> Table[4] = {
384       // Perfectly legal.
385       { { AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },
386 
387       // Only need 1 register in loop
388       { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 300 },
389 
390       // Have to waterfall the resource.
391       { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1000 },
392 
393       // Have to waterfall the resource, and the offset.
394       { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1500 }
395     };
396 
397     // rsrc, offset
398     const std::array<unsigned, 2> RegSrcOpIdx = { { 2, 3 } };
399     return addMappingFromTable<2>(MI, MRI, RegSrcOpIdx, Table);
400   }
401   case Intrinsic::amdgcn_ds_ordered_add:
402   case Intrinsic::amdgcn_ds_ordered_swap: {
403     // VGPR = M0, VGPR
404     static const OpRegBankEntry<3> Table[2] = {
405       // Perfectly legal.
406       { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID  }, 1 },
407 
408       // Need a readfirstlane for m0
409       { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 }
410     };
411 
412     const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 2, 3 } };
413     return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, Table);
414   }
415   case Intrinsic::amdgcn_s_sendmsg:
416   case Intrinsic::amdgcn_s_sendmsghalt: {
417     // FIXME: Should have no register for immediate
418     static const OpRegBankEntry<1> Table[2] = {
419       // Perfectly legal.
420       { { AMDGPU::SGPRRegBankID }, 1 },
421 
422       // Need readlane
423       { { AMDGPU::VGPRRegBankID }, 3 }
424     };
425 
426     const std::array<unsigned, 1> RegSrcOpIdx = { { 2 } };
427     return addMappingFromTable<1>(MI, MRI, RegSrcOpIdx, Table);
428   }
429   default:
430     return RegisterBankInfo::getInstrAlternativeMappings(MI);
431   }
432 }
433 
434 // FIXME: Returns uniform if there's no source value information. This is
435 // probably wrong.
436 static bool isScalarLoadLegal(const MachineInstr &MI) {
437   if (!MI.hasOneMemOperand())
438     return false;
439 
440   const MachineMemOperand *MMO = *MI.memoperands_begin();
441   const unsigned AS = MMO->getAddrSpace();
442   const bool IsConst = AS == AMDGPUAS::CONSTANT_ADDRESS ||
443                        AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT;
444   // Require 4-byte alignment.
445   return MMO->getAlign() >= Align(4) &&
446          // Can't do a scalar atomic load.
447          !MMO->isAtomic() &&
448          // Don't use scalar loads for volatile accesses to non-constant address
449          // spaces.
450          (IsConst || !MMO->isVolatile()) &&
451          // Memory must be known constant, or not written before this load.
452          (IsConst || MMO->isInvariant() || (MMO->getFlags() & MONoClobber)) &&
453          AMDGPUInstrInfo::isUniformMMO(MMO);
454 }
455 
456 RegisterBankInfo::InstructionMappings
457 AMDGPURegisterBankInfo::getInstrAlternativeMappings(
458     const MachineInstr &MI) const {
459 
460   const MachineFunction &MF = *MI.getParent()->getParent();
461   const MachineRegisterInfo &MRI = MF.getRegInfo();
462 
463 
464   InstructionMappings AltMappings;
465   switch (MI.getOpcode()) {
466   case TargetOpcode::G_CONSTANT:
467   case TargetOpcode::G_IMPLICIT_DEF: {
468     unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
469     if (Size == 1) {
470       static const OpRegBankEntry<1> Table[3] = {
471         { { AMDGPU::VGPRRegBankID }, 1 },
472         { { AMDGPU::SGPRRegBankID }, 1 },
473         { { AMDGPU::VCCRegBankID }, 1 }
474       };
475 
476       return addMappingFromTable<1>(MI, MRI, {{ 0 }}, Table);
477     }
478 
479     [[fallthrough]];
480   }
481   case TargetOpcode::G_FCONSTANT:
482   case TargetOpcode::G_FRAME_INDEX:
483   case TargetOpcode::G_GLOBAL_VALUE: {
484     static const OpRegBankEntry<1> Table[2] = {
485       { { AMDGPU::VGPRRegBankID }, 1 },
486       { { AMDGPU::SGPRRegBankID }, 1 }
487     };
488 
489     return addMappingFromTable<1>(MI, MRI, {{ 0 }}, Table);
490   }
491   case TargetOpcode::G_AND:
492   case TargetOpcode::G_OR:
493   case TargetOpcode::G_XOR: {
494     unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
495 
496     if (Size == 1) {
497       // s_{and|or|xor}_b32 set scc when the result of the 32-bit op is not 0.
498       const InstructionMapping &SCCMapping = getInstructionMapping(
499         1, 1, getOperandsMapping(
500           {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32),
501            AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32),
502            AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32)}),
503         3); // Num Operands
504       AltMappings.push_back(&SCCMapping);
505 
506       const InstructionMapping &VCCMapping0 = getInstructionMapping(
507         2, 1, getOperandsMapping(
508           {AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size),
509            AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size),
510            AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size)}),
511         3); // Num Operands
512       AltMappings.push_back(&VCCMapping0);
513       return AltMappings;
514     }
515 
516     if (Size != 64)
517       break;
518 
519     const InstructionMapping &SSMapping = getInstructionMapping(
520       1, 1, getOperandsMapping(
521         {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
522          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
523          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
524       3); // Num Operands
525     AltMappings.push_back(&SSMapping);
526 
527     const InstructionMapping &VVMapping = getInstructionMapping(
528       2, 2, getOperandsMapping(
529         {AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
530          AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
531          AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}),
532       3); // Num Operands
533     AltMappings.push_back(&VVMapping);
534     break;
535   }
536   case TargetOpcode::G_LOAD:
537   case TargetOpcode::G_ZEXTLOAD:
538   case TargetOpcode::G_SEXTLOAD: {
539     unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
540     LLT PtrTy = MRI.getType(MI.getOperand(1).getReg());
541     unsigned PtrSize = PtrTy.getSizeInBits();
542     unsigned AS = PtrTy.getAddressSpace();
543 
544     if ((AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::REGION_ADDRESS &&
545          AS != AMDGPUAS::PRIVATE_ADDRESS) &&
546         isScalarLoadLegal(MI)) {
547       const InstructionMapping &SSMapping = getInstructionMapping(
548           1, 1, getOperandsMapping(
549                     {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
550                      AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize)}),
551           2); // Num Operands
552       AltMappings.push_back(&SSMapping);
553     }
554 
555     const InstructionMapping &VVMapping = getInstructionMapping(
556         2, 1,
557         getOperandsMapping(
558             {AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
559              AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize)}),
560         2); // Num Operands
561     AltMappings.push_back(&VVMapping);
562 
563     // It may be possible to have a vgpr = load sgpr mapping here, because
564     // the mubuf instructions support this kind of load, but probably for only
565     // gfx7 and older.  However, the addressing mode matching in the instruction
566     // selector should be able to do a better job of detecting and selecting
567     // these kinds of loads from the vgpr = load vgpr mapping.
568 
569     return AltMappings;
570 
571   }
572   case TargetOpcode::G_SELECT: {
573     unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
574     const InstructionMapping &SSMapping = getInstructionMapping(1, 1,
575       getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
576                           AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1),
577                           AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
578                           AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
579       4); // Num Operands
580     AltMappings.push_back(&SSMapping);
581 
582     const InstructionMapping &VVMapping = getInstructionMapping(2, 1,
583       getOperandsMapping({AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
584                           AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
585                           AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
586                           AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}),
587       4); // Num Operands
588     AltMappings.push_back(&VVMapping);
589 
590     return AltMappings;
591   }
592   case TargetOpcode::G_UADDE:
593   case TargetOpcode::G_USUBE:
594   case TargetOpcode::G_SADDE:
595   case TargetOpcode::G_SSUBE: {
596     unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
597     const InstructionMapping &SSMapping = getInstructionMapping(1, 1,
598       getOperandsMapping(
599         {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
600          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1),
601          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
602          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
603          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1)}),
604       5); // Num Operands
605     AltMappings.push_back(&SSMapping);
606 
607     const InstructionMapping &VVMapping = getInstructionMapping(2, 1,
608       getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
609                           AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
610                           AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
611                           AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
612                           AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1)}),
613       5); // Num Operands
614     AltMappings.push_back(&VVMapping);
615     return AltMappings;
616   }
617   case AMDGPU::G_BRCOND: {
618     assert(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1);
619 
620     // TODO: Change type to 32 for scalar
621     const InstructionMapping &SMapping = getInstructionMapping(
622       1, 1, getOperandsMapping(
623         {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1), nullptr}),
624       2); // Num Operands
625     AltMappings.push_back(&SMapping);
626 
627     const InstructionMapping &VMapping = getInstructionMapping(
628       1, 1, getOperandsMapping(
629         {AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1), nullptr }),
630       2); // Num Operands
631     AltMappings.push_back(&VMapping);
632     return AltMappings;
633   }
634   case AMDGPU::G_INTRINSIC:
635     return getInstrAlternativeMappingsIntrinsic(MI, MRI);
636   case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS:
637     return getInstrAlternativeMappingsIntrinsicWSideEffects(MI, MRI);
638   default:
639     break;
640   }
641   return RegisterBankInfo::getInstrAlternativeMappings(MI);
642 }
643 
644 void AMDGPURegisterBankInfo::split64BitValueForMapping(
645   MachineIRBuilder &B,
646   SmallVector<Register, 2> &Regs,
647   LLT HalfTy,
648   Register Reg) const {
649   assert(HalfTy.getSizeInBits() == 32);
650   MachineRegisterInfo *MRI = B.getMRI();
651   Register LoLHS = MRI->createGenericVirtualRegister(HalfTy);
652   Register HiLHS = MRI->createGenericVirtualRegister(HalfTy);
653   const RegisterBank *Bank = getRegBank(Reg, *MRI, *TRI);
654   MRI->setRegBank(LoLHS, *Bank);
655   MRI->setRegBank(HiLHS, *Bank);
656 
657   Regs.push_back(LoLHS);
658   Regs.push_back(HiLHS);
659 
660   B.buildInstr(AMDGPU::G_UNMERGE_VALUES)
661     .addDef(LoLHS)
662     .addDef(HiLHS)
663     .addUse(Reg);
664 }
665 
666 /// Replace the current type each register in \p Regs has with \p NewTy
667 static void setRegsToType(MachineRegisterInfo &MRI, ArrayRef<Register> Regs,
668                           LLT NewTy) {
669   for (Register Reg : Regs) {
670     assert(MRI.getType(Reg).getSizeInBits() == NewTy.getSizeInBits());
671     MRI.setType(Reg, NewTy);
672   }
673 }
674 
675 static LLT getHalfSizedType(LLT Ty) {
676   if (Ty.isVector()) {
677     assert(Ty.getElementCount().isKnownMultipleOf(2));
678     return LLT::scalarOrVector(Ty.getElementCount().divideCoefficientBy(2),
679                                Ty.getElementType());
680   }
681 
682   assert(Ty.getScalarSizeInBits() % 2 == 0);
683   return LLT::scalar(Ty.getScalarSizeInBits() / 2);
684 }
685 
686 // Build one or more V_READFIRSTLANE_B32 instructions to move the given vector
687 // source value into a scalar register.
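//
// As an illustrative sketch (virtual register names are made up), a 64-bit
// VGPR source is split into 32-bit pieces, each piece is read with
// V_READFIRSTLANE_B32, and the results are merged back together:
//
//   %lo:vgpr(s32), %hi:vgpr(s32) = G_UNMERGE_VALUES %src:vgpr(s64)
//   %slo:sreg_32(s32) = V_READFIRSTLANE_B32 %lo:vgpr_32(s32)
//   %shi:sreg_32(s32) = V_READFIRSTLANE_B32 %hi:vgpr_32(s32)
//   %dst:sgpr(s64) = G_MERGE_VALUES %slo:sreg_32(s32), %shi:sreg_32(s32)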
688 Register AMDGPURegisterBankInfo::buildReadFirstLane(MachineIRBuilder &B,
689                                                     MachineRegisterInfo &MRI,
690                                                     Register Src) const {
691   LLT Ty = MRI.getType(Src);
692   const RegisterBank *Bank = getRegBank(Src, MRI, *TRI);
693 
694   if (Bank == &AMDGPU::SGPRRegBank)
695     return Src;
696 
697   unsigned Bits = Ty.getSizeInBits();
698   assert(Bits % 32 == 0);
699 
700   if (Bank != &AMDGPU::VGPRRegBank) {
701     // We need to copy from AGPR to VGPR
702     Src = B.buildCopy(Ty, Src).getReg(0);
703     MRI.setRegBank(Src, AMDGPU::VGPRRegBank);
704   }
705 
706   LLT S32 = LLT::scalar(32);
707   unsigned NumParts = Bits / 32;
708   SmallVector<Register, 8> SrcParts;
709   SmallVector<Register, 8> DstParts;
710 
711   if (Bits == 32) {
712     SrcParts.push_back(Src);
713   } else {
714     auto Unmerge = B.buildUnmerge(S32, Src);
715     for (unsigned i = 0; i < NumParts; ++i)
716       SrcParts.push_back(Unmerge.getReg(i));
717   }
718 
719   for (unsigned i = 0; i < NumParts; ++i) {
720     Register SrcPart = SrcParts[i];
721     Register DstPart = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
722     MRI.setType(DstPart, NumParts == 1 ? Ty : S32);
723 
724     const TargetRegisterClass *Constrained =
725         constrainGenericRegister(SrcPart, AMDGPU::VGPR_32RegClass, MRI);
726     (void)Constrained;
727     assert(Constrained && "Failed to constrain readfirstlane src reg");
728 
729     B.buildInstr(AMDGPU::V_READFIRSTLANE_B32, {DstPart}, {SrcPart});
730 
731     DstParts.push_back(DstPart);
732   }
733 
734   if (Bits == 32)
735     return DstParts[0];
736 
737   Register Dst = B.buildMergeLikeInstr(Ty, DstParts).getReg(0);
738   MRI.setRegBank(Dst, AMDGPU::SGPRRegBank);
739   return Dst;
740 }
741 
742 /// Legalize instruction \p MI where operands in \p OpIndices must be SGPRs. If
743 /// any of the required SGPR operands are VGPRs, perform a waterfall loop to
744 /// execute the instruction for each unique combination of values in all lanes
/// in the wave. The block will be split such that the rest of the instructions
/// are moved to a new block.
747 ///
748 /// Essentially performs this loop:
749 //
750 /// Save Execution Mask
751 /// For (Lane : Wavefront) {
752 ///   Enable Lane, Disable all other lanes
753 ///   SGPR = read SGPR value for current lane from VGPR
754 ///   VGPRResult[Lane] = use_op SGPR
755 /// }
756 /// Restore Execution Mask
757 ///
/// There is additional complexity in comparing the operand values in order to
/// identify the unique values actually used across lanes.
760 bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
761   MachineIRBuilder &B,
762   iterator_range<MachineBasicBlock::iterator> Range,
763   SmallSet<Register, 4> &SGPROperandRegs,
764   MachineRegisterInfo &MRI) const {
765 
766   // Track use registers which have already been expanded with a readfirstlane
767   // sequence. This may have multiple uses if moving a sequence.
768   DenseMap<Register, Register> WaterfalledRegMap;
769 
770   MachineBasicBlock &MBB = B.getMBB();
771   MachineFunction *MF = &B.getMF();
772 
773   const TargetRegisterClass *WaveRC = TRI->getWaveMaskRegClass();
774   const unsigned MovExecOpc =
775       Subtarget.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
776   const unsigned MovExecTermOpc =
777       Subtarget.isWave32() ? AMDGPU::S_MOV_B32_term : AMDGPU::S_MOV_B64_term;
778 
779   const unsigned XorTermOpc = Subtarget.isWave32() ?
780     AMDGPU::S_XOR_B32_term : AMDGPU::S_XOR_B64_term;
781   const unsigned AndSaveExecOpc =  Subtarget.isWave32() ?
782     AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64;
783   const unsigned ExecReg =  Subtarget.isWave32() ?
784     AMDGPU::EXEC_LO : AMDGPU::EXEC;
785 
786 #ifndef NDEBUG
787   const int OrigRangeSize = std::distance(Range.begin(), Range.end());
788 #endif
789 
790   Register SaveExecReg = MRI.createVirtualRegister(WaveRC);
791   Register InitSaveExecReg = MRI.createVirtualRegister(WaveRC);
792 
793   // Don't bother using generic instructions/registers for the exec mask.
794   B.buildInstr(TargetOpcode::IMPLICIT_DEF)
795     .addDef(InitSaveExecReg);
796 
797   Register PhiExec = MRI.createVirtualRegister(WaveRC);
798   Register NewExec = MRI.createVirtualRegister(WaveRC);
799 
  // To insert the loop we need to split the block. Move everything after the
  // waterfalled instructions to a new block (the remainder), and insert the
  // new loop blocks in between.
802   MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
803   MachineBasicBlock *BodyBB = MF->CreateMachineBasicBlock();
804   MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
805   MachineBasicBlock *RestoreExecBB = MF->CreateMachineBasicBlock();
806   MachineFunction::iterator MBBI(MBB);
807   ++MBBI;
808   MF->insert(MBBI, LoopBB);
809   MF->insert(MBBI, BodyBB);
810   MF->insert(MBBI, RestoreExecBB);
811   MF->insert(MBBI, RemainderBB);
812 
813   LoopBB->addSuccessor(BodyBB);
814   BodyBB->addSuccessor(RestoreExecBB);
815   BodyBB->addSuccessor(LoopBB);
816 
817   // Move the rest of the block into a new block.
818   RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
819   RemainderBB->splice(RemainderBB->begin(), &MBB, Range.end(), MBB.end());
820 
821   MBB.addSuccessor(LoopBB);
822   RestoreExecBB->addSuccessor(RemainderBB);
823 
824   B.setInsertPt(*LoopBB, LoopBB->end());
825 
826   B.buildInstr(TargetOpcode::PHI)
827       .addDef(PhiExec)
828       .addReg(InitSaveExecReg)
829       .addMBB(&MBB)
830       .addReg(NewExec)
831       .addMBB(BodyBB);
832 
833   const DebugLoc &DL = B.getDL();
834 
835   MachineInstr &FirstInst = *Range.begin();
836 
837   // Move the instruction into the loop body. Note we moved everything after
838   // Range.end() already into a new block, so Range.end() is no longer valid.
839   BodyBB->splice(BodyBB->end(), &MBB, Range.begin(), MBB.end());
840 
841   // Figure out the iterator range after splicing the instructions.
842   MachineBasicBlock::iterator NewBegin = FirstInst.getIterator();
843   auto NewEnd = BodyBB->end();
844 
845   B.setMBB(*LoopBB);
846 
847   LLT S1 = LLT::scalar(1);
848   Register CondReg;
849 
850   assert(std::distance(NewBegin, NewEnd) == OrigRangeSize);
851 
852   for (MachineInstr &MI : make_range(NewBegin, NewEnd)) {
853     for (MachineOperand &Op : MI.all_uses()) {
854       Register OldReg = Op.getReg();
855       if (!SGPROperandRegs.count(OldReg))
856         continue;
857 
858       // See if we already processed this register in another instruction in the
859       // sequence.
860       auto OldVal = WaterfalledRegMap.find(OldReg);
861       if (OldVal != WaterfalledRegMap.end()) {
862         Op.setReg(OldVal->second);
863         continue;
864       }
865 
866       Register OpReg = Op.getReg();
867       LLT OpTy = MRI.getType(OpReg);
868 
869       const RegisterBank *OpBank = getRegBank(OpReg, MRI, *TRI);
870       if (OpBank != &AMDGPU::VGPRRegBank) {
871         // Insert copy from AGPR to VGPR before the loop.
872         B.setMBB(MBB);
873         OpReg = B.buildCopy(OpTy, OpReg).getReg(0);
874         MRI.setRegBank(OpReg, AMDGPU::VGPRRegBank);
875         B.setMBB(*LoopBB);
876       }
877 
878       Register CurrentLaneReg = buildReadFirstLane(B, MRI, OpReg);
879 
880       // Build the comparison(s).
881       unsigned OpSize = OpTy.getSizeInBits();
882       bool Is64 = OpSize % 64 == 0;
883       unsigned PartSize = Is64 ? 64 : 32;
884       LLT PartTy = LLT::scalar(PartSize);
885       unsigned NumParts = OpSize / PartSize;
886       SmallVector<Register, 8> OpParts;
887       SmallVector<Register, 8> CurrentLaneParts;
888 
889       if (NumParts == 1) {
890         OpParts.push_back(OpReg);
891         CurrentLaneParts.push_back(CurrentLaneReg);
892       } else {
893         auto UnmergeOp = B.buildUnmerge(PartTy, OpReg);
894         auto UnmergeCurrentLane = B.buildUnmerge(PartTy, CurrentLaneReg);
895         for (unsigned i = 0; i < NumParts; ++i) {
896           OpParts.push_back(UnmergeOp.getReg(i));
897           CurrentLaneParts.push_back(UnmergeCurrentLane.getReg(i));
898           MRI.setRegBank(OpParts[i], AMDGPU::VGPRRegBank);
899           MRI.setRegBank(CurrentLaneParts[i], AMDGPU::SGPRRegBank);
900         }
901       }
902 
903       for (unsigned i = 0; i < NumParts; ++i) {
904         auto CmpReg = B.buildICmp(CmpInst::ICMP_EQ, S1, CurrentLaneParts[i],
905                                   OpParts[i]).getReg(0);
906         MRI.setRegBank(CmpReg, AMDGPU::VCCRegBank);
907 
908         if (!CondReg) {
909           CondReg = CmpReg;
910         } else {
911           CondReg = B.buildAnd(S1, CondReg, CmpReg).getReg(0);
912           MRI.setRegBank(CondReg, AMDGPU::VCCRegBank);
913         }
914       }
915 
916       Op.setReg(CurrentLaneReg);
917 
918       // Make sure we don't re-process this register again.
919       WaterfalledRegMap.insert(std::pair(OldReg, Op.getReg()));
920     }
921   }
922 
923   // The ballot becomes a no-op during instruction selection.
924   CondReg = B.buildIntrinsic(Intrinsic::amdgcn_ballot,
925                              {LLT::scalar(Subtarget.isWave32() ? 32 : 64)},
926                              false)
927                 .addReg(CondReg)
928                 .getReg(0);
929   MRI.setRegClass(CondReg, WaveRC);
930 
  // Update EXEC, saving the original EXEC value to NewExec.
932   B.buildInstr(AndSaveExecOpc)
933     .addDef(NewExec)
934     .addReg(CondReg, RegState::Kill);
935 
936   MRI.setSimpleHint(NewExec, CondReg);
937 
938   B.setInsertPt(*BodyBB, BodyBB->end());
939 
940   // Update EXEC, switch all done bits to 0 and all todo bits to 1.
941   B.buildInstr(XorTermOpc)
942     .addDef(ExecReg)
943     .addReg(ExecReg)
944     .addReg(NewExec);
945 
946   // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
947   // s_cbranch_scc0?
948 
949   // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
950   B.buildInstr(AMDGPU::SI_WATERFALL_LOOP).addMBB(LoopBB);
951 
952   // Save the EXEC mask before the loop.
953   BuildMI(MBB, MBB.end(), DL, TII->get(MovExecOpc), SaveExecReg)
954     .addReg(ExecReg);
955 
956   // Restore the EXEC mask after the loop.
957   B.setMBB(*RestoreExecBB);
958   B.buildInstr(MovExecTermOpc)
959     .addDef(ExecReg)
960     .addReg(SaveExecReg);
961 
962   // Set the insert point after the original instruction, so any new
963   // instructions will be in the remainder.
964   B.setInsertPt(*RemainderBB, RemainderBB->begin());
965 
966   return true;
967 }
968 
969 // Return any unique registers used by \p MI at \p OpIndices that need to be
970 // handled in a waterfall loop. Returns these registers in \p
971 // SGPROperandRegs. Returns true if there are any operands to handle and a
972 // waterfall loop is necessary.
973 bool AMDGPURegisterBankInfo::collectWaterfallOperands(
974   SmallSet<Register, 4> &SGPROperandRegs, MachineInstr &MI,
975   MachineRegisterInfo &MRI, ArrayRef<unsigned> OpIndices) const {
976   for (unsigned Op : OpIndices) {
977     assert(MI.getOperand(Op).isUse());
978     Register Reg = MI.getOperand(Op).getReg();
979     const RegisterBank *OpBank = getRegBank(Reg, MRI, *TRI);
980     if (OpBank->getID() != AMDGPU::SGPRRegBankID)
981       SGPROperandRegs.insert(Reg);
982   }
983 
  // If no operands need to be replaced, there is no need to loop.
985   return !SGPROperandRegs.empty();
986 }
987 
988 bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
989   MachineIRBuilder &B, MachineInstr &MI, MachineRegisterInfo &MRI,
990   ArrayRef<unsigned> OpIndices) const {
991   // Use a set to avoid extra readfirstlanes in the case where multiple operands
992   // are the same register.
993   SmallSet<Register, 4> SGPROperandRegs;
994 
995   if (!collectWaterfallOperands(SGPROperandRegs, MI, MRI, OpIndices))
996     return false;
997 
998   MachineBasicBlock::iterator I = MI.getIterator();
999   return executeInWaterfallLoop(B, make_range(I, std::next(I)),
1000                                 SGPROperandRegs, MRI);
1001 }
1002 
1003 bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
1004   MachineInstr &MI, MachineRegisterInfo &MRI,
1005   ArrayRef<unsigned> OpIndices) const {
1006   MachineIRBuilder B(MI);
1007   return executeInWaterfallLoop(B, MI, MRI, OpIndices);
1008 }
1009 
1010 // Legalize an operand that must be an SGPR by inserting a readfirstlane.
1011 void AMDGPURegisterBankInfo::constrainOpWithReadfirstlane(
1012     MachineInstr &MI, MachineRegisterInfo &MRI, unsigned OpIdx) const {
1013   Register Reg = MI.getOperand(OpIdx).getReg();
1014   const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
1015   if (Bank == &AMDGPU::SGPRRegBank)
1016     return;
1017 
1018   MachineIRBuilder B(MI);
1019 
1020   Reg = buildReadFirstLane(B, MRI, Reg);
1021   MI.getOperand(OpIdx).setReg(Reg);
1022 }
1023 
1024 /// Split \p Ty into 2 pieces. The first will have \p FirstSize bits, and the
1025 /// rest will be in the remainder.
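/// For example, splitUnequalType(<3 x s32>, 64) is {<2 x s32>, s32} and
/// splitUnequalType(s96, 64) is {s64, s32}.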
1026 static std::pair<LLT, LLT> splitUnequalType(LLT Ty, unsigned FirstSize) {
1027   unsigned TotalSize = Ty.getSizeInBits();
1028   if (!Ty.isVector())
1029     return {LLT::scalar(FirstSize), LLT::scalar(TotalSize - FirstSize)};
1030 
1031   LLT EltTy = Ty.getElementType();
1032   unsigned EltSize = EltTy.getSizeInBits();
1033   assert(FirstSize % EltSize == 0);
1034 
1035   unsigned FirstPartNumElts = FirstSize / EltSize;
1036   unsigned RemainderElts = (TotalSize - FirstSize) / EltSize;
1037 
1038   return {LLT::scalarOrVector(ElementCount::getFixed(FirstPartNumElts), EltTy),
1039           LLT::scalarOrVector(ElementCount::getFixed(RemainderElts), EltTy)};
1040 }
1041 
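// Widen a 96-bit value or vector type to 128 bits, e.g. s96 -> s128 and
// <3 x s32> -> <4 x s32>.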
1042 static LLT widen96To128(LLT Ty) {
1043   if (!Ty.isVector())
1044     return LLT::scalar(128);
1045 
1046   LLT EltTy = Ty.getElementType();
1047   assert(128 % EltTy.getSizeInBits() == 0);
1048   return LLT::fixed_vector(128 / EltTy.getSizeInBits(), EltTy);
1049 }
1050 
1051 bool AMDGPURegisterBankInfo::applyMappingLoad(MachineInstr &MI,
1052                         const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
1053                                               MachineRegisterInfo &MRI) const {
1054   Register DstReg = MI.getOperand(0).getReg();
1055   const LLT LoadTy = MRI.getType(DstReg);
1056   unsigned LoadSize = LoadTy.getSizeInBits();
1057   const unsigned MaxNonSmrdLoadSize = 128;
1058 
1059   const RegisterBank *DstBank =
1060       OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
1061   if (DstBank == &AMDGPU::SGPRRegBank) {
    // There are some special cases that we need to look at for 32-bit and
    // 96-bit SGPR loads; otherwise we have nothing to do.
1064     if (LoadSize != 32 && LoadSize != 96)
1065       return false;
1066 
1067     MachineMemOperand *MMO = *MI.memoperands_begin();
1068     const unsigned MemSize = 8 * MMO->getSize();
    // Scalar loads of size 8 or 16 bit with proper alignment may be widened to
    // 32 bits. Check to see if we need to widen the memory access; 8 or 16 bit
    // scalar loads should have a load size of 32 but a memory access size of
    // less than 32.
1073     if (LoadSize == 32 &&
1074         (MemSize == 32 || LoadTy.isVector() || !isScalarLoadLegal(MI)))
1075       return false;
1076 
1077     Register PtrReg = MI.getOperand(1).getReg();
1078 
1079     ApplyRegBankMapping O(*this, MRI, &AMDGPU::SGPRRegBank);
1080     MachineIRBuilder B(MI, O);
1081 
1082     if (LoadSize == 32) {
1083       // This is an extending load from a sub-dword size. Widen the memory
1084       // access size to 4 bytes and clear the extra high bits appropriately
1085       const LLT S32 = LLT::scalar(32);
1086       if (MI.getOpcode() == AMDGPU::G_SEXTLOAD) {
1087         // Must extend the sign bit into higher bits for a G_SEXTLOAD
1088         auto WideLoad = B.buildLoadFromOffset(S32, PtrReg, *MMO, 0);
1089         B.buildSExtInReg(MI.getOperand(0), WideLoad, MemSize);
1090       } else if (MI.getOpcode() == AMDGPU::G_ZEXTLOAD) {
1091         // Must extend zero into higher bits with an AND for a G_ZEXTLOAD
1092         auto WideLoad = B.buildLoadFromOffset(S32, PtrReg, *MMO, 0);
1093         B.buildZExtInReg(MI.getOperand(0), WideLoad, MemSize);
1094       } else
1095         // We do not need to touch the higher bits for regular loads.
1096         B.buildLoadFromOffset(MI.getOperand(0), PtrReg, *MMO, 0);
1097     } else {
      // 96-bit loads are only available for vector loads. We need to split
      // this into a 64-bit part and a 32-bit part (unless we can widen to a
      // 128-bit load).
1100       if (MMO->getAlign() < Align(16)) {
1101         MachineFunction *MF = MI.getParent()->getParent();
1102         ApplyRegBankMapping ApplyBank(*this, MRI, DstBank);
1103         MachineIRBuilder B(MI, ApplyBank);
1104         LegalizerHelper Helper(*MF, ApplyBank, B);
1105         LLT Part64, Part32;
1106         std::tie(Part64, Part32) = splitUnequalType(LoadTy, 64);
1107         if (Helper.reduceLoadStoreWidth(cast<GAnyLoad>(MI), 0, Part64) !=
1108             LegalizerHelper::Legalized)
1109           return false;
1110         return true;
1111       } else {
1112         LLT WiderTy = widen96To128(LoadTy);
1113         auto WideLoad = B.buildLoadFromOffset(WiderTy, PtrReg, *MMO, 0);
1114         if (WiderTy.isScalar())
1115           B.buildTrunc(MI.getOperand(0), WideLoad);
1116         else {
1117           B.buildDeleteTrailingVectorElements(MI.getOperand(0).getReg(),
1118                                               WideLoad);
1119         }
1120       }
1121     }
1122 
1123     MI.eraseFromParent();
1124     return true;
1125   }
1126 
1127   // 128-bit loads are supported for all instruction types.
1128   if (LoadSize <= MaxNonSmrdLoadSize)
1129     return false;
1130 
1131   SmallVector<Register, 16> DefRegs(OpdMapper.getVRegs(0));
1132   SmallVector<Register, 1> SrcRegs(OpdMapper.getVRegs(1));
1133 
1134   if (SrcRegs.empty())
1135     SrcRegs.push_back(MI.getOperand(1).getReg());
1136 
1137   assert(LoadSize % MaxNonSmrdLoadSize == 0);
1138 
1139   // RegBankSelect only emits scalar types, so we need to reset the pointer
1140   // operand to a pointer type.
1141   Register BasePtrReg = SrcRegs[0];
1142   LLT PtrTy = MRI.getType(MI.getOperand(1).getReg());
1143   MRI.setType(BasePtrReg, PtrTy);
1144 
1145   unsigned NumSplitParts = LoadTy.getSizeInBits() / MaxNonSmrdLoadSize;
1146   const LLT LoadSplitTy = LoadTy.divide(NumSplitParts);
1147   ApplyRegBankMapping Observer(*this, MRI, &AMDGPU::VGPRRegBank);
1148   MachineIRBuilder B(MI, Observer);
1149   LegalizerHelper Helper(B.getMF(), Observer, B);
1150 
1151   if (LoadTy.isVector()) {
    if (Helper.fewerElementsVector(MI, 0, LoadSplitTy) !=
        LegalizerHelper::Legalized)
1153       return false;
1154   } else {
1155     if (Helper.narrowScalar(MI, 0, LoadSplitTy) != LegalizerHelper::Legalized)
1156       return false;
1157   }
1158 
1159   MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
1160   return true;
1161 }
1162 
1163 bool AMDGPURegisterBankInfo::applyMappingDynStackAlloc(
1164   MachineInstr &MI,
1165   const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
1166   MachineRegisterInfo &MRI) const {
1167   const MachineFunction &MF = *MI.getMF();
1168   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1169   const auto &TFI = *ST.getFrameLowering();
1170 
1171   // Guard in case the stack growth direction ever changes with scratch
1172   // instructions.
1173   if (TFI.getStackGrowthDirection() == TargetFrameLowering::StackGrowsDown)
1174     return false;
1175 
1176   Register Dst = MI.getOperand(0).getReg();
1177   Register AllocSize = MI.getOperand(1).getReg();
1178   Align Alignment = assumeAligned(MI.getOperand(2).getImm());
1179 
1180   const RegisterBank *SizeBank = getRegBank(AllocSize, MRI, *TRI);
1181 
1182   // TODO: Need to emit a wave reduction to get the maximum size.
1183   if (SizeBank != &AMDGPU::SGPRRegBank)
1184     return false;
1185 
1186   LLT PtrTy = MRI.getType(Dst);
1187   LLT IntPtrTy = LLT::scalar(PtrTy.getSizeInBits());
1188 
1189   const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1190   Register SPReg = Info->getStackPtrOffsetReg();
1191   ApplyRegBankMapping ApplyBank(*this, MRI, &AMDGPU::SGPRRegBank);
1192   MachineIRBuilder B(MI, ApplyBank);
1193 
1194   auto WaveSize = B.buildConstant(LLT::scalar(32), ST.getWavefrontSizeLog2());
1195   auto ScaledSize = B.buildShl(IntPtrTy, AllocSize, WaveSize);
1196 
1197   auto SPCopy = B.buildCopy(PtrTy, SPReg);
1198   if (Alignment > TFI.getStackAlign()) {
1199     auto PtrAdd = B.buildPtrAdd(PtrTy, SPCopy, ScaledSize);
1200     B.buildMaskLowPtrBits(Dst, PtrAdd,
1201                           Log2(Alignment) + ST.getWavefrontSizeLog2());
1202   } else {
1203     B.buildPtrAdd(Dst, SPCopy, ScaledSize);
1204   }
1205 
1206   MI.eraseFromParent();
1207   return true;
1208 }
1209 
1210 bool AMDGPURegisterBankInfo::applyMappingImage(
1211     MachineInstr &MI, const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
1212     MachineRegisterInfo &MRI, int RsrcIdx) const {
1213   const int NumDefs = MI.getNumExplicitDefs();
1214 
1215   // The reported argument index is relative to the IR intrinsic call arguments,
1216   // so we need to shift by the number of defs and the intrinsic ID.
1217   RsrcIdx += NumDefs + 1;
1218 
1219   // Insert copies to VGPR arguments.
1220   applyDefaultMapping(OpdMapper);
1221 
1222   // Fixup any SGPR arguments.
1223   SmallVector<unsigned, 4> SGPRIndexes;
1224   for (int I = NumDefs, NumOps = MI.getNumOperands(); I != NumOps; ++I) {
1225     if (!MI.getOperand(I).isReg())
1226       continue;
1227 
1228     // If this intrinsic has a sampler, it immediately follows rsrc.
1229     if (I == RsrcIdx || I == RsrcIdx + 1)
1230       SGPRIndexes.push_back(I);
1231   }
1232 
1233   executeInWaterfallLoop(MI, MRI, SGPRIndexes);
1234   return true;
1235 }
1236 
// Analyze a combined offset from an llvm.amdgcn.s.buffer.load intrinsic and
// store the three offsets (voffset, soffset and instoffset).
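//
// As an illustrative example (the exact split is decided by
// splitMUBUFOffset), a constant combined offset of 4100 may become
// voffset = 0, soffset = 4096 and instoffset = 4, so that the immediate part
// fits in the MUBUF instruction offset field.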
1239 unsigned AMDGPURegisterBankInfo::setBufferOffsets(
1240     MachineIRBuilder &B, Register CombinedOffset, Register &VOffsetReg,
1241     Register &SOffsetReg, int64_t &InstOffsetVal, Align Alignment) const {
1242   const LLT S32 = LLT::scalar(32);
1243   MachineRegisterInfo *MRI = B.getMRI();
1244 
1245   if (std::optional<int64_t> Imm =
1246           getIConstantVRegSExtVal(CombinedOffset, *MRI)) {
1247     uint32_t SOffset, ImmOffset;
1248     if (TII->splitMUBUFOffset(*Imm, SOffset, ImmOffset, Alignment)) {
1249       VOffsetReg = B.buildConstant(S32, 0).getReg(0);
1250       SOffsetReg = B.buildConstant(S32, SOffset).getReg(0);
1251       InstOffsetVal = ImmOffset;
1252 
1253       B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank);
1254       B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank);
1255       return SOffset + ImmOffset;
1256     }
1257   }
1258 
1259   Register Base;
1260   unsigned Offset;
1261 
1262   std::tie(Base, Offset) =
1263       AMDGPU::getBaseWithConstantOffset(*MRI, CombinedOffset);
1264 
1265   uint32_t SOffset, ImmOffset;
1266   if ((int)Offset > 0 &&
1267       TII->splitMUBUFOffset(Offset, SOffset, ImmOffset, Alignment)) {
1268     if (getRegBank(Base, *MRI, *TRI) == &AMDGPU::VGPRRegBank) {
1269       VOffsetReg = Base;
1270       SOffsetReg = B.buildConstant(S32, SOffset).getReg(0);
1271       B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank);
1272       InstOffsetVal = ImmOffset;
1273       return 0; // XXX - Why is this 0?
1274     }
1275 
1276     // If we have SGPR base, we can use it for soffset.
1277     if (SOffset == 0) {
1278       VOffsetReg = B.buildConstant(S32, 0).getReg(0);
1279       B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank);
1280       SOffsetReg = Base;
1281       InstOffsetVal = ImmOffset;
1282       return 0; // XXX - Why is this 0?
1283     }
1284   }
1285 
1286   // Handle the variable sgpr + vgpr case.
1287   MachineInstr *Add = getOpcodeDef(AMDGPU::G_ADD, CombinedOffset, *MRI);
1288   if (Add && (int)Offset >= 0) {
1289     Register Src0 = getSrcRegIgnoringCopies(Add->getOperand(1).getReg(), *MRI);
1290     Register Src1 = getSrcRegIgnoringCopies(Add->getOperand(2).getReg(), *MRI);
1291 
1292     const RegisterBank *Src0Bank = getRegBank(Src0, *MRI, *TRI);
1293     const RegisterBank *Src1Bank = getRegBank(Src1, *MRI, *TRI);
1294 
1295     if (Src0Bank == &AMDGPU::VGPRRegBank && Src1Bank == &AMDGPU::SGPRRegBank) {
1296       VOffsetReg = Src0;
1297       SOffsetReg = Src1;
1298       return 0;
1299     }
1300 
1301     if (Src0Bank == &AMDGPU::SGPRRegBank && Src1Bank == &AMDGPU::VGPRRegBank) {
1302       VOffsetReg = Src1;
1303       SOffsetReg = Src0;
1304       return 0;
1305     }
1306   }
1307 
1308   // Ensure we have a VGPR for the combined offset. This could be an issue if we
1309   // have an SGPR offset and a VGPR resource.
1310   if (getRegBank(CombinedOffset, *MRI, *TRI) == &AMDGPU::VGPRRegBank) {
1311     VOffsetReg = CombinedOffset;
1312   } else {
1313     VOffsetReg = B.buildCopy(S32, CombinedOffset).getReg(0);
1314     B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank);
1315   }
1316 
1317   SOffsetReg = B.buildConstant(S32, 0).getReg(0);
1318   B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank);
1319   return 0;
1320 }
1321 
1322 bool AMDGPURegisterBankInfo::applyMappingSBufferLoad(
1323   const OperandsMapper &OpdMapper) const {
1324   MachineInstr &MI = OpdMapper.getMI();
1325   MachineRegisterInfo &MRI = OpdMapper.getMRI();
1326 
1327   const LLT S32 = LLT::scalar(32);
1328   Register Dst = MI.getOperand(0).getReg();
1329   LLT Ty = MRI.getType(Dst);
1330 
1331   const RegisterBank *RSrcBank =
1332     OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
1333   const RegisterBank *OffsetBank =
1334     OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
1335   if (RSrcBank == &AMDGPU::SGPRRegBank &&
1336       OffsetBank == &AMDGPU::SGPRRegBank)
1337     return true; // Legal mapping
1338 
1339   // FIXME: 96-bit case was widened during legalize. We need to narrow it back
1340   // here but don't have an MMO.
1341 
1342   unsigned LoadSize = Ty.getSizeInBits();
1343   int NumLoads = 1;
1344   if (LoadSize == 256 || LoadSize == 512) {
1345     NumLoads = LoadSize / 128;
1346     Ty = Ty.divide(NumLoads);
1347   }
1348 
1349   // Use the alignment to ensure that the required offsets will fit into the
1350   // immediate offsets.
1351   const Align Alignment = NumLoads > 1 ? Align(16 * NumLoads) : Align(1);
1352 
1353   MachineIRBuilder B(MI);
1354   MachineFunction &MF = B.getMF();
1355 
1356   Register SOffset;
1357   Register VOffset;
1358   int64_t ImmOffset = 0;
1359 
1360   unsigned MMOOffset = setBufferOffsets(B, MI.getOperand(2).getReg(), VOffset,
1361                                         SOffset, ImmOffset, Alignment);
1362 
1363   // TODO: 96-bit loads were widened to 128-bit results. Shrink the result if we
1364   // can, but we need to track an MMO for that.
1365   const unsigned MemSize = (Ty.getSizeInBits() + 7) / 8;
1366   const Align MemAlign(4); // FIXME: ABI type alignment?
1367   MachineMemOperand *BaseMMO = MF.getMachineMemOperand(
1368     MachinePointerInfo(),
1369     MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1370     MachineMemOperand::MOInvariant,
1371     MemSize, MemAlign);
1372   if (MMOOffset != 0)
1373     BaseMMO = MF.getMachineMemOperand(BaseMMO, MMOOffset, MemSize);
1374 
1375   // If only the offset is divergent, emit a MUBUF buffer load instead. We can
1376   // assume that the buffer is unswizzled.
1377 
1378   Register RSrc = MI.getOperand(1).getReg();
1379   Register VIndex = B.buildConstant(S32, 0).getReg(0);
1380   B.getMRI()->setRegBank(VIndex, AMDGPU::VGPRRegBank);
1381 
1382   SmallVector<Register, 4> LoadParts(NumLoads);
1383 
1384   MachineBasicBlock::iterator MII = MI.getIterator();
1385   MachineInstrSpan Span(MII, &B.getMBB());
1386 
1387   for (int i = 0; i < NumLoads; ++i) {
1388     if (NumLoads == 1) {
1389       LoadParts[i] = Dst;
1390     } else {
1391       LoadParts[i] = MRI.createGenericVirtualRegister(Ty);
1392       MRI.setRegBank(LoadParts[i], AMDGPU::VGPRRegBank);
1393     }
1394 
1395     MachineMemOperand *MMO = BaseMMO;
1396     if (i != 0)
1397       BaseMMO = MF.getMachineMemOperand(BaseMMO, MMOOffset + 16 * i, MemSize);
1398 
1399     B.buildInstr(AMDGPU::G_AMDGPU_BUFFER_LOAD)
1400       .addDef(LoadParts[i])       // vdata
1401       .addUse(RSrc)               // rsrc
1402       .addUse(VIndex)             // vindex
1403       .addUse(VOffset)            // voffset
1404       .addUse(SOffset)            // soffset
1405       .addImm(ImmOffset + 16 * i) // offset(imm)
1406       .addImm(0)                  // cachepolicy, swizzled buffer(imm)
1407       .addImm(0)                  // idxen(imm)
1408       .addMemOperand(MMO);
1409   }
1410 
1411   // TODO: If only the resource is a VGPR, it may be better to execute the
1412   // scalar load in the waterfall loop if the resource is expected to frequently
1413   // be dynamically uniform.
1414   if (RSrcBank != &AMDGPU::SGPRRegBank) {
1415     // Remove the original instruction to avoid potentially confusing the
1416     // waterfall loop logic.
1417     B.setInstr(*Span.begin());
1418     MI.eraseFromParent();
1419 
1420     SmallSet<Register, 4> OpsToWaterfall;
1421 
1422     OpsToWaterfall.insert(RSrc);
1423     executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()),
1424                            OpsToWaterfall, MRI);
1425   }
1426 
1427   if (NumLoads != 1) {
1428     if (Ty.isVector())
1429       B.buildConcatVectors(Dst, LoadParts);
1430     else
1431       B.buildMergeLikeInstr(Dst, LoadParts);
1432   }
1433 
  // If a waterfall loop was needed, the original instruction was already
  // erased above.
1435   if (RSrcBank == &AMDGPU::SGPRRegBank)
1436     MI.eraseFromParent();
1437 
1438   return true;
1439 }
1440 
1441 bool AMDGPURegisterBankInfo::applyMappingBFE(const OperandsMapper &OpdMapper,
1442                                              bool Signed) const {
1443   MachineInstr &MI = OpdMapper.getMI();
1444   MachineRegisterInfo &MRI = OpdMapper.getMRI();
1445 
1446   // Insert basic copies
1447   applyDefaultMapping(OpdMapper);
1448 
1449   Register DstReg = MI.getOperand(0).getReg();
1450   LLT Ty = MRI.getType(DstReg);
1451 
1452   const LLT S32 = LLT::scalar(32);
1453 
1454   unsigned FirstOpnd = MI.getOpcode() == AMDGPU::G_INTRINSIC ? 2 : 1;
1455   Register SrcReg = MI.getOperand(FirstOpnd).getReg();
1456   Register OffsetReg = MI.getOperand(FirstOpnd + 1).getReg();
1457   Register WidthReg = MI.getOperand(FirstOpnd + 2).getReg();
1458 
1459   const RegisterBank *DstBank =
1460     OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
1461   if (DstBank == &AMDGPU::VGPRRegBank) {
1462     if (Ty == S32)
1463       return true;
1464 
    // There is no 64-bit VGPR bitfield extract instruction, so the operation is
    // expanded to a sequence of instructions that implement it.
1467     ApplyRegBankMapping ApplyBank(*this, MRI, &AMDGPU::VGPRRegBank);
1468     MachineIRBuilder B(MI, ApplyBank);
1469 
1470     const LLT S64 = LLT::scalar(64);
1471     // Shift the source operand so that extracted bits start at bit 0.
1472     auto ShiftOffset = Signed ? B.buildAShr(S64, SrcReg, OffsetReg)
1473                               : B.buildLShr(S64, SrcReg, OffsetReg);
1474     auto UnmergeSOffset = B.buildUnmerge({S32, S32}, ShiftOffset);
1475 
1476     // A 64-bit bitfield extract uses the 32-bit bitfield extract instructions
1477     // if the width is a constant.
1478     if (auto ConstWidth = getIConstantVRegValWithLookThrough(WidthReg, MRI)) {
      // Depending on the width, use either the low or the high 32 bits.
1481       auto Zero = B.buildConstant(S32, 0);
1482       auto WidthImm = ConstWidth->Value.getZExtValue();
1483       if (WidthImm <= 32) {
1484         // Use bitfield extract on the lower 32-bit source, and then sign-extend
1485         // or clear the upper 32-bits.
1486         auto Extract =
1487             Signed ? B.buildSbfx(S32, UnmergeSOffset.getReg(0), Zero, WidthReg)
1488                    : B.buildUbfx(S32, UnmergeSOffset.getReg(0), Zero, WidthReg);
1489         auto Extend =
1490             Signed ? B.buildAShr(S32, Extract, B.buildConstant(S32, 31)) : Zero;
1491         B.buildMergeLikeInstr(DstReg, {Extract, Extend});
1492       } else {
1493         // Use bitfield extract on upper 32-bit source, and combine with lower
1494         // 32-bit source.
1495         auto UpperWidth = B.buildConstant(S32, WidthImm - 32);
1496         auto Extract =
1497             Signed
1498                 ? B.buildSbfx(S32, UnmergeSOffset.getReg(1), Zero, UpperWidth)
1499                 : B.buildUbfx(S32, UnmergeSOffset.getReg(1), Zero, UpperWidth);
1500         B.buildMergeLikeInstr(DstReg, {UnmergeSOffset.getReg(0), Extract});
1501       }
1502       MI.eraseFromParent();
1503       return true;
1504     }
1505 
1506     // Expand to Src >> Offset << (64 - Width) >> (64 - Width) using 64-bit
1507     // operations.
1508     auto ExtShift = B.buildSub(S32, B.buildConstant(S32, 64), WidthReg);
1509     auto SignBit = B.buildShl(S64, ShiftOffset, ExtShift);
    if (Signed)
      B.buildAShr(DstReg, SignBit, ExtShift);
    else
      B.buildLShr(DstReg, SignBit, ExtShift);
1514     MI.eraseFromParent();
1515     return true;
1516   }
1517 
1518   // The scalar form packs the offset and width in a single operand.
1519 
1520   ApplyRegBankMapping ApplyBank(*this, MRI, &AMDGPU::SGPRRegBank);
1521   MachineIRBuilder B(MI, ApplyBank);
1522 
1523   // Ensure the high bits are clear to insert the offset.
1524   auto OffsetMask = B.buildConstant(S32, maskTrailingOnes<unsigned>(6));
1525   auto ClampOffset = B.buildAnd(S32, OffsetReg, OffsetMask);
1526 
  // The shift zeroes out the low bits, so the input doesn't need clamping.
1528   auto ShiftWidth = B.buildShl(S32, WidthReg, B.buildConstant(S32, 16));
1529 
  // Pack the offset and width of the BFE into the format expected by
  // S_BFE_I32 / S_BFE_U32: in the second source operand, bits [5:0] hold the
  // offset and bits [22:16] the width.
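  // For example, an offset of 8 and a width of 16 pack to 0x00100008.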
1533   auto MergedInputs = B.buildOr(S32, ClampOffset, ShiftWidth);
1534 
1535   // TODO: It might be worth using a pseudo here to avoid scc clobber and
1536   // register class constraints.
1537   unsigned Opc = Ty == S32 ? (Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32) :
1538                              (Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64);
1539 
1540   auto MIB = B.buildInstr(Opc, {DstReg}, {SrcReg, MergedInputs});
1541   if (!constrainSelectedInstRegOperands(*MIB, *TII, *TRI, *this))
1542     llvm_unreachable("failed to constrain BFE");
1543 
1544   MI.eraseFromParent();
1545   return true;
1546 }
1547 
1548 bool AMDGPURegisterBankInfo::applyMappingMAD_64_32(
1549     const OperandsMapper &OpdMapper) const {
1550   MachineInstr &MI = OpdMapper.getMI();
1551   MachineRegisterInfo &MRI = OpdMapper.getMRI();
1552 
1553   // Insert basic copies.
1554   applyDefaultMapping(OpdMapper);
1555 
1556   Register Dst0 = MI.getOperand(0).getReg();
1557   Register Dst1 = MI.getOperand(1).getReg();
1558   Register Src0 = MI.getOperand(2).getReg();
1559   Register Src1 = MI.getOperand(3).getReg();
1560   Register Src2 = MI.getOperand(4).getReg();
1561 
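  // If the multiply operands are already on the VALU, the default mapping
  // applied above is all that is needed.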
1562   if (MRI.getRegBankOrNull(Src0) == &AMDGPU::VGPRRegBank)
1563     return true;
1564 
1565   bool IsUnsigned = MI.getOpcode() == AMDGPU::G_AMDGPU_MAD_U64_U32;
1566   LLT S1 = LLT::scalar(1);
1567   LLT S32 = LLT::scalar(32);
1568 
1569   bool DstOnValu = MRI.getRegBankOrNull(Src2) == &AMDGPU::VGPRRegBank;
1570   bool Accumulate = true;
1571 
1572   if (!DstOnValu) {
1573     if (mi_match(Src2, MRI, m_ZeroInt()))
1574       Accumulate = false;
1575   }
1576 
1577   // Keep the multiplication on the SALU.
1578   MachineIRBuilder B(MI);
1579 
1580   Register DstHi;
1581   Register DstLo = B.buildMul(S32, Src0, Src1).getReg(0);
1582   bool MulHiInVgpr = false;
1583 
1584   MRI.setRegBank(DstLo, AMDGPU::SGPRRegBank);
1585 
1586   if (Subtarget.hasSMulHi()) {
1587     DstHi = IsUnsigned ? B.buildUMulH(S32, Src0, Src1).getReg(0)
1588                        : B.buildSMulH(S32, Src0, Src1).getReg(0);
1589     MRI.setRegBank(DstHi, AMDGPU::SGPRRegBank);
1590   } else {
1591     Register VSrc0 = B.buildCopy(S32, Src0).getReg(0);
1592     Register VSrc1 = B.buildCopy(S32, Src1).getReg(0);
1593 
1594     MRI.setRegBank(VSrc0, AMDGPU::VGPRRegBank);
1595     MRI.setRegBank(VSrc1, AMDGPU::VGPRRegBank);
1596 
1597     DstHi = IsUnsigned ? B.buildUMulH(S32, VSrc0, VSrc1).getReg(0)
1598                        : B.buildSMulH(S32, VSrc0, VSrc1).getReg(0);
1599     MRI.setRegBank(DstHi, AMDGPU::VGPRRegBank);
1600 
1601     if (!DstOnValu) {
1602       DstHi = buildReadFirstLane(B, MRI, DstHi);
1603     } else {
1604       MulHiInVgpr = true;
1605     }
1606   }
1607 
1608   // Accumulate and produce the "carry-out" bit.
1609   //
1610   // The "carry-out" is defined as bit 64 of the result when computed as a
1611   // big integer. For unsigned multiply-add, this matches the usual definition
1612   // of carry-out. For signed multiply-add, bit 64 is the sign bit of the
1613   // result, which is determined as:
1614   //   sign(Src0 * Src1) + sign(Src2) + carry-out from unsigned 64-bit add
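  // Each term is a single bit, so the sum is taken modulo 2 and is implemented
  // with the XORs below.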
1615   LLT CarryType = DstOnValu ? S1 : S32;
1616   const RegisterBank &CarryBank =
1617       DstOnValu ? AMDGPU::VCCRegBank : AMDGPU::SGPRRegBank;
1618   const RegisterBank &DstBank =
1619       DstOnValu ? AMDGPU::VGPRRegBank : AMDGPU::SGPRRegBank;
1620   Register Carry;
1621   Register Zero;
1622 
1623   if (!IsUnsigned) {
1624     Zero = B.buildConstant(S32, 0).getReg(0);
1625     MRI.setRegBank(Zero,
1626                    MulHiInVgpr ? AMDGPU::VGPRRegBank : AMDGPU::SGPRRegBank);
1627 
1628     Carry = B.buildICmp(CmpInst::ICMP_SLT, MulHiInVgpr ? S1 : S32, DstHi, Zero)
1629                 .getReg(0);
1630     MRI.setRegBank(Carry, MulHiInVgpr ? AMDGPU::VCCRegBank
1631                                       : AMDGPU::SGPRRegBank);
1632 
1633     if (DstOnValu && !MulHiInVgpr) {
1634       Carry = B.buildTrunc(S1, Carry).getReg(0);
1635       MRI.setRegBank(Carry, AMDGPU::VCCRegBank);
1636     }
1637   }
1638 
1639   if (Accumulate) {
1640     if (DstOnValu) {
1641       DstLo = B.buildCopy(S32, DstLo).getReg(0);
1642       DstHi = B.buildCopy(S32, DstHi).getReg(0);
1643       MRI.setRegBank(DstLo, AMDGPU::VGPRRegBank);
1644       MRI.setRegBank(DstHi, AMDGPU::VGPRRegBank);
1645     }
1646 
1647     auto Unmerge = B.buildUnmerge(S32, Src2);
1648     Register Src2Lo = Unmerge.getReg(0);
1649     Register Src2Hi = Unmerge.getReg(1);
1650     MRI.setRegBank(Src2Lo, DstBank);
1651     MRI.setRegBank(Src2Hi, DstBank);
1652 
1653     if (!IsUnsigned) {
1654       auto Src2Sign = B.buildICmp(CmpInst::ICMP_SLT, CarryType, Src2Hi, Zero);
1655       MRI.setRegBank(Src2Sign.getReg(0), CarryBank);
1656 
1657       Carry = B.buildXor(CarryType, Carry, Src2Sign).getReg(0);
1658       MRI.setRegBank(Carry, CarryBank);
1659     }
1660 
1661     auto AddLo = B.buildUAddo(S32, CarryType, DstLo, Src2Lo);
1662     DstLo = AddLo.getReg(0);
1663     Register CarryLo = AddLo.getReg(1);
1664     MRI.setRegBank(DstLo, DstBank);
1665     MRI.setRegBank(CarryLo, CarryBank);
1666 
1667     auto AddHi = B.buildUAdde(S32, CarryType, DstHi, Src2Hi, CarryLo);
1668     DstHi = AddHi.getReg(0);
1669     MRI.setRegBank(DstHi, DstBank);
1670 
1671     Register CarryHi = AddHi.getReg(1);
1672     MRI.setRegBank(CarryHi, CarryBank);
1673 
1674     if (IsUnsigned) {
1675       Carry = CarryHi;
1676     } else {
1677       Carry = B.buildXor(CarryType, Carry, CarryHi).getReg(0);
1678       MRI.setRegBank(Carry, CarryBank);
1679     }
1680   } else {
1681     if (IsUnsigned) {
1682       Carry = B.buildConstant(CarryType, 0).getReg(0);
1683       MRI.setRegBank(Carry, CarryBank);
1684     }
1685   }
1686 
1687   B.buildMergeLikeInstr(Dst0, {DstLo, DstHi});
1688 
1689   if (DstOnValu) {
1690     B.buildCopy(Dst1, Carry);
1691   } else {
1692     B.buildTrunc(Dst1, Carry);
1693   }
1694 
1695   MI.eraseFromParent();
1696   return true;
1697 }
1698 
1699 // Return a suitable opcode for extending the operands of Opc when widening.
1700 static unsigned getExtendOp(unsigned Opc) {
1701   switch (Opc) {
1702   case TargetOpcode::G_ASHR:
1703   case TargetOpcode::G_SMIN:
1704   case TargetOpcode::G_SMAX:
1705     return TargetOpcode::G_SEXT;
1706   case TargetOpcode::G_LSHR:
1707   case TargetOpcode::G_UMIN:
1708   case TargetOpcode::G_UMAX:
1709     return TargetOpcode::G_ZEXT;
1710   default:
1711     return TargetOpcode::G_ANYEXT;
1712   }
1713 }
1714 
1715 // Emit a legalized extension from <2 x s16> to 2 32-bit components, avoiding
1716 // any illegal vector extend or unmerge operations.
1717 static std::pair<Register, Register>
1718 unpackV2S16ToS32(MachineIRBuilder &B, Register Src, unsigned ExtOpcode) {
1719   const LLT S32 = LLT::scalar(32);
1720   auto Bitcast = B.buildBitcast(S32, Src);
1721 
1722   if (ExtOpcode == TargetOpcode::G_SEXT) {
1723     auto ExtLo = B.buildSExtInReg(S32, Bitcast, 16);
1724     auto ShiftHi = B.buildAShr(S32, Bitcast, B.buildConstant(S32, 16));
1725     return std::pair(ExtLo.getReg(0), ShiftHi.getReg(0));
1726   }
1727 
1728   auto ShiftHi = B.buildLShr(S32, Bitcast, B.buildConstant(S32, 16));
1729   if (ExtOpcode == TargetOpcode::G_ZEXT) {
1730     auto ExtLo = B.buildAnd(S32, Bitcast, B.buildConstant(S32, 0xffff));
1731     return std::pair(ExtLo.getReg(0), ShiftHi.getReg(0));
1732   }
1733 
1734   assert(ExtOpcode == TargetOpcode::G_ANYEXT);
1735   return std::pair(Bitcast.getReg(0), ShiftHi.getReg(0));
1736 }
1737 
// For cases where only a single copy is inserted for matching register banks,
// replace the register in the instruction operand.
1740 static bool substituteSimpleCopyRegs(
1741   const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper, unsigned OpIdx) {
1742   SmallVector<unsigned, 1> SrcReg(OpdMapper.getVRegs(OpIdx));
1743   if (!SrcReg.empty()) {
1744     assert(SrcReg.size() == 1);
1745     OpdMapper.getMI().getOperand(OpIdx).setReg(SrcReg[0]);
1746     return true;
1747   }
1748 
1749   return false;
1750 }
1751 
1752 /// Handle register layout difference for f16 images for some subtargets.
1753 Register AMDGPURegisterBankInfo::handleD16VData(MachineIRBuilder &B,
1754                                                 MachineRegisterInfo &MRI,
1755                                                 Register Reg) const {
1756   if (!Subtarget.hasUnpackedD16VMem())
1757     return Reg;
1758 
1759   const LLT S16 = LLT::scalar(16);
1760   LLT StoreVT = MRI.getType(Reg);
1761   if (!StoreVT.isVector() || StoreVT.getElementType() != S16)
1762     return Reg;
1763 
1764   auto Unmerge = B.buildUnmerge(S16, Reg);
1765 
1766 
1767   SmallVector<Register, 4> WideRegs;
1768   for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
1769     WideRegs.push_back(Unmerge.getReg(I));
1770 
1771   const LLT S32 = LLT::scalar(32);
1772   int NumElts = StoreVT.getNumElements();
1773 
1774   return B.buildMergeLikeInstr(LLT::fixed_vector(NumElts, S32), WideRegs)
1775       .getReg(0);
1776 }
1777 
1778 static std::pair<Register, unsigned>
1779 getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg) {
1780   int64_t Const;
1781   if (mi_match(Reg, MRI, m_ICst(Const)))
1782     return std::pair(Register(), Const);
1783 
1784   Register Base;
1785   if (mi_match(Reg, MRI, m_GAdd(m_Reg(Base), m_ICst(Const))))
1786     return std::pair(Base, Const);
1787 
1788   // TODO: Handle G_OR used for add case
1789   return std::pair(Reg, 0);
1790 }
1791 
1792 std::pair<Register, unsigned>
1793 AMDGPURegisterBankInfo::splitBufferOffsets(MachineIRBuilder &B,
1794                                            Register OrigOffset) const {
1795   const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset();
1796   Register BaseReg;
1797   unsigned ImmOffset;
1798   const LLT S32 = LLT::scalar(32);
1799 
1800   // TODO: Use AMDGPU::getBaseWithConstantOffset() instead.
1801   std::tie(BaseReg, ImmOffset) = getBaseWithConstantOffset(*B.getMRI(),
1802                                                            OrigOffset);
1803 
1804   unsigned C1 = 0;
1805   if (ImmOffset != 0) {
    // If the immediate value is too big for the immoffset field, put only the
    // bits that would normally fit into the immoffset field. The remaining
    // value that is copied/added for the voffset field is a multiple of a
    // large power of 2, so it stands more chance of being CSEd with the
    // copy/add for another similar load/store.
    // However, do not round down if that would produce a negative number, as
    // it appears to be illegal to have a negative offset in the vgpr, even if
    // adding the immediate offset makes it positive.
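    // For example, with a maximum immediate of 4095, an offset of 5000 splits
    // into Overflow = 4096 and ImmOffset = 904.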
1814     unsigned Overflow = ImmOffset & ~MaxImm;
1815     ImmOffset -= Overflow;
1816     if ((int32_t)Overflow < 0) {
1817       Overflow += ImmOffset;
1818       ImmOffset = 0;
1819     }
1820 
1821     C1 = ImmOffset;
1822     if (Overflow != 0) {
1823       if (!BaseReg)
1824         BaseReg = B.buildConstant(S32, Overflow).getReg(0);
1825       else {
1826         auto OverflowVal = B.buildConstant(S32, Overflow);
1827         BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
1828       }
1829     }
1830   }
1831 
1832   if (!BaseReg)
1833     BaseReg = B.buildConstant(S32, 0).getReg(0);
1834 
1835   return {BaseReg, C1};
1836 }
1837 
1838 bool AMDGPURegisterBankInfo::buildVCopy(MachineIRBuilder &B, Register DstReg,
1839                                         Register SrcReg) const {
1840   MachineRegisterInfo &MRI = *B.getMRI();
1841   LLT SrcTy = MRI.getType(SrcReg);
1842   if (SrcTy.getSizeInBits() == 32) {
1843     // Use a v_mov_b32 here to make the exec dependency explicit.
1844     B.buildInstr(AMDGPU::V_MOV_B32_e32)
1845       .addDef(DstReg)
1846       .addUse(SrcReg);
1847     return constrainGenericRegister(DstReg, AMDGPU::VGPR_32RegClass, MRI) &&
1848            constrainGenericRegister(SrcReg, AMDGPU::SReg_32RegClass, MRI);
1849   }
1850 
1851   Register TmpReg0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1852   Register TmpReg1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1853 
1854   B.buildInstr(AMDGPU::V_MOV_B32_e32)
1855     .addDef(TmpReg0)
1856     .addUse(SrcReg, 0, AMDGPU::sub0);
1857   B.buildInstr(AMDGPU::V_MOV_B32_e32)
1858     .addDef(TmpReg1)
1859     .addUse(SrcReg, 0, AMDGPU::sub1);
1860   B.buildInstr(AMDGPU::REG_SEQUENCE)
1861     .addDef(DstReg)
1862     .addUse(TmpReg0)
1863     .addImm(AMDGPU::sub0)
1864     .addUse(TmpReg1)
1865     .addImm(AMDGPU::sub1);
1866 
1867   return constrainGenericRegister(SrcReg, AMDGPU::SReg_64RegClass, MRI) &&
1868          constrainGenericRegister(DstReg, AMDGPU::VReg_64RegClass, MRI);
1869 }
1870 
1871 /// Utility function for pushing dynamic vector indexes with a constant offset
1872 /// into waterfall loops.
1873 static void reinsertVectorIndexAdd(MachineIRBuilder &B,
1874                                    MachineInstr &IdxUseInstr,
1875                                    unsigned OpIdx,
1876                                    unsigned ConstOffset) {
1877   MachineRegisterInfo &MRI = *B.getMRI();
1878   const LLT S32 = LLT::scalar(32);
1879   Register WaterfallIdx = IdxUseInstr.getOperand(OpIdx).getReg();
1880   B.setInsertPt(*IdxUseInstr.getParent(), IdxUseInstr.getIterator());
1881 
1882   auto MaterializedOffset = B.buildConstant(S32, ConstOffset);
1883 
1884   auto Add = B.buildAdd(S32, WaterfallIdx, MaterializedOffset);
1885   MRI.setRegBank(MaterializedOffset.getReg(0), AMDGPU::SGPRRegBank);
1886   MRI.setRegBank(Add.getReg(0), AMDGPU::SGPRRegBank);
1887   IdxUseInstr.getOperand(OpIdx).setReg(Add.getReg(0));
1888 }
1889 
1890 /// Implement extending a 32-bit value to a 64-bit value. \p Lo32Reg is the
1891 /// original 32-bit source value (to be inserted in the low part of the combined
1892 /// 64-bit result), and \p Hi32Reg is the high half of the combined 64-bit
1893 /// value.
1894 static void extendLow32IntoHigh32(MachineIRBuilder &B,
1895                                   Register Hi32Reg, Register Lo32Reg,
1896                                   unsigned ExtOpc,
1897                                   const RegisterBank &RegBank,
1898                                   bool IsBooleanSrc = false) {
1899   if (ExtOpc == AMDGPU::G_ZEXT) {
1900     B.buildConstant(Hi32Reg, 0);
1901   } else if (ExtOpc == AMDGPU::G_SEXT) {
1902     if (IsBooleanSrc) {
1903       // If we know the original source was an s1, the high half is the same as
1904       // the low.
1905       B.buildCopy(Hi32Reg, Lo32Reg);
1906     } else {
1907       // Replicate sign bit from 32-bit extended part.
1908       auto ShiftAmt = B.buildConstant(LLT::scalar(32), 31);
1909       B.getMRI()->setRegBank(ShiftAmt.getReg(0), RegBank);
1910       B.buildAShr(Hi32Reg, Lo32Reg, ShiftAmt);
1911     }
1912   } else {
1913     assert(ExtOpc == AMDGPU::G_ANYEXT && "not an integer extension");
1914     B.buildUndef(Hi32Reg);
1915   }
1916 }
1917 
1918 bool AMDGPURegisterBankInfo::foldExtractEltToCmpSelect(
1919   MachineInstr &MI, MachineRegisterInfo &MRI,
1920   const OperandsMapper &OpdMapper) const {
1921 
1922   Register VecReg = MI.getOperand(1).getReg();
1923   Register Idx = MI.getOperand(2).getReg();
1924 
1925   const RegisterBank &IdxBank =
1926     *OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
1927 
1928   bool IsDivergentIdx = IdxBank != AMDGPU::SGPRRegBank;
1929 
1930   LLT VecTy = MRI.getType(VecReg);
1931   unsigned EltSize = VecTy.getScalarSizeInBits();
1932   unsigned NumElem = VecTy.getNumElements();
1933 
1934   if (!SITargetLowering::shouldExpandVectorDynExt(EltSize, NumElem,
1935                                                   IsDivergentIdx, &Subtarget))
1936     return false;
1937 
1938   MachineIRBuilder B(MI);
1939   LLT S32 = LLT::scalar(32);
1940 
1941   const RegisterBank &DstBank =
1942     *OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
1943   const RegisterBank &SrcBank =
1944     *OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
1945 
1946   const RegisterBank &CCBank =
1947     (DstBank == AMDGPU::SGPRRegBank &&
1948      SrcBank == AMDGPU::SGPRRegBank &&
1949      IdxBank == AMDGPU::SGPRRegBank) ? AMDGPU::SGPRRegBank
1950                                      : AMDGPU::VCCRegBank;
1951   LLT CCTy = (CCBank == AMDGPU::SGPRRegBank) ? S32 : LLT::scalar(1);
1952 
1953   if (CCBank == AMDGPU::VCCRegBank && IdxBank == AMDGPU::SGPRRegBank) {
1954     Idx = B.buildCopy(S32, Idx)->getOperand(0).getReg();
1955     MRI.setRegBank(Idx, AMDGPU::VGPRRegBank);
1956   }
1957 
1958   LLT EltTy = VecTy.getScalarType();
1959   SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0));
1960   unsigned NumLanes = DstRegs.size();
1961   if (!NumLanes)
1962     NumLanes = 1;
1963   else
1964     EltTy = MRI.getType(DstRegs[0]);
1965 
1966   auto UnmergeToEltTy = B.buildUnmerge(EltTy, VecReg);
1967   SmallVector<Register, 2> Res(NumLanes);
1968   for (unsigned L = 0; L < NumLanes; ++L)
1969     Res[L] = UnmergeToEltTy.getReg(L);
1970 
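  // Start from element 0 and conditionally replace the result with element I
  // whenever the index compares equal to I, forming a chain of selects.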
1971   for (unsigned I = 1; I < NumElem; ++I) {
1972     auto IC = B.buildConstant(S32, I);
1973     MRI.setRegBank(IC->getOperand(0).getReg(), AMDGPU::SGPRRegBank);
1974     auto Cmp = B.buildICmp(CmpInst::ICMP_EQ, CCTy, Idx, IC);
1975     MRI.setRegBank(Cmp->getOperand(0).getReg(), CCBank);
1976 
1977     for (unsigned L = 0; L < NumLanes; ++L) {
1978       auto S = B.buildSelect(EltTy, Cmp,
1979                              UnmergeToEltTy.getReg(I * NumLanes + L), Res[L]);
1980 
1981       for (unsigned N : { 0, 2, 3 })
1982         MRI.setRegBank(S->getOperand(N).getReg(), DstBank);
1983 
1984       Res[L] = S->getOperand(0).getReg();
1985     }
1986   }
1987 
1988   for (unsigned L = 0; L < NumLanes; ++L) {
1989     Register DstReg = (NumLanes == 1) ? MI.getOperand(0).getReg() : DstRegs[L];
1990     B.buildCopy(DstReg, Res[L]);
1991     MRI.setRegBank(DstReg, DstBank);
1992   }
1993 
1994   MRI.setRegBank(MI.getOperand(0).getReg(), DstBank);
1995   MI.eraseFromParent();
1996 
1997   return true;
1998 }
1999 
2000 // Insert a cross regbank copy for a register if it already has a bank that
2001 // differs from the one we want to set.
2002 static Register constrainRegToBank(MachineRegisterInfo &MRI,
2003                                    MachineIRBuilder &B, Register &Reg,
2004                                    const RegisterBank &Bank) {
2005   const RegisterBank *CurrBank = MRI.getRegBankOrNull(Reg);
2006   if (CurrBank && *CurrBank != Bank) {
2007     Register Copy = B.buildCopy(MRI.getType(Reg), Reg).getReg(0);
2008     MRI.setRegBank(Copy, Bank);
2009     return Copy;
2010   }
2011 
2012   MRI.setRegBank(Reg, Bank);
2013   return Reg;
2014 }
2015 
2016 bool AMDGPURegisterBankInfo::foldInsertEltToCmpSelect(
2017   MachineInstr &MI, MachineRegisterInfo &MRI,
2018   const OperandsMapper &OpdMapper) const {
2019 
2020   Register VecReg = MI.getOperand(1).getReg();
2021   Register Idx = MI.getOperand(3).getReg();
2022 
2023   const RegisterBank &IdxBank =
2024     *OpdMapper.getInstrMapping().getOperandMapping(3).BreakDown[0].RegBank;
2025 
2026   bool IsDivergentIdx = IdxBank != AMDGPU::SGPRRegBank;
2027 
2028   LLT VecTy = MRI.getType(VecReg);
2029   unsigned EltSize = VecTy.getScalarSizeInBits();
2030   unsigned NumElem = VecTy.getNumElements();
2031 
2032   if (!SITargetLowering::shouldExpandVectorDynExt(EltSize, NumElem,
2033                                                   IsDivergentIdx, &Subtarget))
2034     return false;
2035 
2036   MachineIRBuilder B(MI);
2037   LLT S32 = LLT::scalar(32);
2038 
2039   const RegisterBank &DstBank =
2040     *OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2041   const RegisterBank &SrcBank =
2042     *OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
2043   const RegisterBank &InsBank =
2044     *OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
2045 
2046   const RegisterBank &CCBank =
2047     (DstBank == AMDGPU::SGPRRegBank &&
2048      SrcBank == AMDGPU::SGPRRegBank &&
2049      InsBank == AMDGPU::SGPRRegBank &&
2050      IdxBank == AMDGPU::SGPRRegBank) ? AMDGPU::SGPRRegBank
2051                                      : AMDGPU::VCCRegBank;
2052   LLT CCTy = (CCBank == AMDGPU::SGPRRegBank) ? S32 : LLT::scalar(1);
2053 
2054   if (CCBank == AMDGPU::VCCRegBank && IdxBank == AMDGPU::SGPRRegBank) {
2055     Idx = B.buildCopy(S32, Idx)->getOperand(0).getReg();
2056     MRI.setRegBank(Idx, AMDGPU::VGPRRegBank);
2057   }
2058 
2059   LLT EltTy = VecTy.getScalarType();
2060   SmallVector<Register, 2> InsRegs(OpdMapper.getVRegs(2));
2061   unsigned NumLanes = InsRegs.size();
2062   if (!NumLanes) {
2063     NumLanes = 1;
2064     InsRegs.push_back(MI.getOperand(2).getReg());
2065   } else {
2066     EltTy = MRI.getType(InsRegs[0]);
2067   }
2068 
2069   auto UnmergeToEltTy = B.buildUnmerge(EltTy, VecReg);
2070   SmallVector<Register, 16> Ops(NumElem * NumLanes);
2071 
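  // For each element, select between the inserted value and the original
  // element depending on whether the dynamic index matches.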
2072   for (unsigned I = 0; I < NumElem; ++I) {
2073     auto IC = B.buildConstant(S32, I);
2074     MRI.setRegBank(IC->getOperand(0).getReg(), AMDGPU::SGPRRegBank);
2075     auto Cmp = B.buildICmp(CmpInst::ICMP_EQ, CCTy, Idx, IC);
2076     MRI.setRegBank(Cmp->getOperand(0).getReg(), CCBank);
2077 
2078     for (unsigned L = 0; L < NumLanes; ++L) {
2079       Register Op0 = constrainRegToBank(MRI, B, InsRegs[L], DstBank);
2080       Register Op1 = UnmergeToEltTy.getReg(I * NumLanes + L);
2081       Op1 = constrainRegToBank(MRI, B, Op1, DstBank);
2082 
2083       Register Select = B.buildSelect(EltTy, Cmp, Op0, Op1).getReg(0);
2084       MRI.setRegBank(Select, DstBank);
2085 
2086       Ops[I * NumLanes + L] = Select;
2087     }
2088   }
2089 
2090   LLT MergeTy = LLT::fixed_vector(Ops.size(), EltTy);
2091   if (MergeTy == MRI.getType(MI.getOperand(0).getReg())) {
2092     B.buildBuildVector(MI.getOperand(0), Ops);
2093   } else {
2094     auto Vec = B.buildBuildVector(MergeTy, Ops);
2095     MRI.setRegBank(Vec->getOperand(0).getReg(), DstBank);
2096     B.buildBitcast(MI.getOperand(0).getReg(), Vec);
2097   }
2098 
2099   MRI.setRegBank(MI.getOperand(0).getReg(), DstBank);
2100   MI.eraseFromParent();
2101 
2102   return true;
2103 }
2104 
2105 void AMDGPURegisterBankInfo::applyMappingImpl(
2106     const OperandsMapper &OpdMapper) const {
2107   MachineInstr &MI = OpdMapper.getMI();
2108   unsigned Opc = MI.getOpcode();
2109   MachineRegisterInfo &MRI = OpdMapper.getMRI();
2110   switch (Opc) {
2111   case AMDGPU::G_CONSTANT:
2112   case AMDGPU::G_IMPLICIT_DEF: {
2113     Register DstReg = MI.getOperand(0).getReg();
2114     LLT DstTy = MRI.getType(DstReg);
2115     if (DstTy != LLT::scalar(1))
2116       break;
2117 
2118     const RegisterBank *DstBank =
2119         OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2120     if (DstBank == &AMDGPU::VCCRegBank)
2121       break;
2122     SmallVector<Register, 1> DefRegs(OpdMapper.getVRegs(0));
2123     if (DefRegs.empty())
2124       DefRegs.push_back(DstReg);
2125 
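    // Widen the s1 def to s32 in the destination bank and truncate back for
    // the original s1 uses.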
2126     MachineIRBuilder B(MI);
2127     B.setInsertPt(*MI.getParent(), ++MI.getIterator());
2128 
2129     Register NewDstReg = MRI.createGenericVirtualRegister(LLT::scalar(32));
2130     LLVMContext &Ctx = B.getMF().getFunction().getContext();
2131 
2132     MI.getOperand(0).setReg(NewDstReg);
2133     if (Opc != AMDGPU::G_IMPLICIT_DEF) {
2134       uint64_t ConstVal = MI.getOperand(1).getCImm()->getZExtValue();
2135       MI.getOperand(1).setCImm(
2136           ConstantInt::get(IntegerType::getInt32Ty(Ctx), ConstVal));
2137     }
2138 
2139     MRI.setRegBank(NewDstReg, *DstBank);
2140     B.buildTrunc(DefRegs[0], NewDstReg);
2141     return;
2142   }
2143   case AMDGPU::G_PHI: {
2144     Register DstReg = MI.getOperand(0).getReg();
2145     LLT DstTy = MRI.getType(DstReg);
2146     if (DstTy != LLT::scalar(1))
2147       break;
2148 
2149     const LLT S32 = LLT::scalar(32);
2150     const RegisterBank *DstBank =
2151       OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2152     if (DstBank == &AMDGPU::VCCRegBank) {
2153       applyDefaultMapping(OpdMapper);
2154       // The standard handling only considers the result register bank for
2155       // phis. For VCC, blindly inserting a copy when the phi is lowered will
2156       // produce an invalid copy. We can only copy with some kind of compare to
2157       // get a vector boolean result. Insert a register bank copy that will be
2158       // correctly lowered to a compare.
2159       MachineIRBuilder B(*MI.getParent()->getParent());
2160 
2161       for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
2162         Register SrcReg = MI.getOperand(I).getReg();
2163         const RegisterBank *SrcBank = getRegBank(SrcReg, MRI, *TRI);
2164 
2165         if (SrcBank != &AMDGPU::VCCRegBank) {
2166           MachineBasicBlock *SrcMBB = MI.getOperand(I + 1).getMBB();
2167           B.setInsertPt(*SrcMBB, SrcMBB->getFirstTerminator());
2168 
2169           auto Copy = B.buildCopy(LLT::scalar(1), SrcReg);
2170           MRI.setRegBank(Copy.getReg(0), AMDGPU::VCCRegBank);
2171           MI.getOperand(I).setReg(Copy.getReg(0));
2172         }
2173       }
2174 
2175       return;
2176     }
2177 
2178     // Phi handling is strange and only considers the bank of the destination.
2179     substituteSimpleCopyRegs(OpdMapper, 0);
2180 
2181     // Promote SGPR/VGPR booleans to s32
2182     MachineFunction *MF = MI.getParent()->getParent();
2183     ApplyRegBankMapping ApplyBank(*this, MRI, DstBank);
2184     MachineIRBuilder B(MI, ApplyBank);
2185     LegalizerHelper Helper(*MF, ApplyBank, B);
2186 
2187     if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized)
2188       llvm_unreachable("widen scalar should have succeeded");
2189 
2190     return;
2191   }
2192   case AMDGPU::G_ICMP:
2193   case AMDGPU::G_UADDO:
2194   case AMDGPU::G_USUBO:
2195   case AMDGPU::G_UADDE:
2196   case AMDGPU::G_SADDE:
2197   case AMDGPU::G_USUBE:
2198   case AMDGPU::G_SSUBE: {
2199     unsigned BoolDstOp = Opc == AMDGPU::G_ICMP ? 0 : 1;
2200     Register DstReg = MI.getOperand(BoolDstOp).getReg();
2201 
2202     const RegisterBank *DstBank =
2203       OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2204     if (DstBank != &AMDGPU::SGPRRegBank)
2205       break;
2206 
2207     const bool HasCarryIn = MI.getNumOperands() == 5;
2208 
2209     // If this is a scalar compare, promote the result to s32, as the selection
2210     // will end up using a copy to a 32-bit vreg.
2211     const LLT S32 = LLT::scalar(32);
2212     Register NewDstReg = MRI.createGenericVirtualRegister(S32);
2213     MRI.setRegBank(NewDstReg, AMDGPU::SGPRRegBank);
2214     MI.getOperand(BoolDstOp).setReg(NewDstReg);
2215     MachineIRBuilder B(MI);
2216 
2217     if (HasCarryIn) {
2218       Register NewSrcReg = MRI.createGenericVirtualRegister(S32);
2219       MRI.setRegBank(NewSrcReg, AMDGPU::SGPRRegBank);
2220       B.buildZExt(NewSrcReg, MI.getOperand(4).getReg());
2221       MI.getOperand(4).setReg(NewSrcReg);
2222     }
2223 
2224     MachineBasicBlock *MBB = MI.getParent();
2225     B.setInsertPt(*MBB, std::next(MI.getIterator()));
2226 
2227     // If we had a constrained VCC result register, a copy was inserted to VCC
2228     // from SGPR.
2229     SmallVector<Register, 1> DefRegs(OpdMapper.getVRegs(0));
2230     if (DefRegs.empty())
2231       DefRegs.push_back(DstReg);
2232     B.buildTrunc(DefRegs[0], NewDstReg);
2233     return;
2234   }
2235   case AMDGPU::G_SELECT: {
2236     Register DstReg = MI.getOperand(0).getReg();
2237     LLT DstTy = MRI.getType(DstReg);
2238 
2239     SmallVector<Register, 1> CondRegs(OpdMapper.getVRegs(1));
2240     if (CondRegs.empty())
2241       CondRegs.push_back(MI.getOperand(1).getReg());
2242     else {
2243       assert(CondRegs.size() == 1);
2244     }
2245 
2246     const RegisterBank *CondBank = getRegBank(CondRegs[0], MRI, *TRI);
2247     if (CondBank == &AMDGPU::SGPRRegBank) {
2248       MachineIRBuilder B(MI);
2249       const LLT S32 = LLT::scalar(32);
2250       Register NewCondReg = MRI.createGenericVirtualRegister(S32);
2251       MRI.setRegBank(NewCondReg, AMDGPU::SGPRRegBank);
2252 
2253       MI.getOperand(1).setReg(NewCondReg);
2254       B.buildZExt(NewCondReg, CondRegs[0]);
2255     }
2256 
2257     if (DstTy.getSizeInBits() != 64)
2258       break;
2259 
2260     MachineIRBuilder B(MI);
2261     LLT HalfTy = getHalfSizedType(DstTy);
2262 
2263     SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
2264     SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2));
2265     SmallVector<Register, 2> Src2Regs(OpdMapper.getVRegs(3));
2266 
2267     // All inputs are SGPRs, nothing special to do.
2268     if (DefRegs.empty()) {
2269       assert(Src1Regs.empty() && Src2Regs.empty());
2270       break;
2271     }
2272 
2273     if (Src1Regs.empty())
2274       split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg());
2275     else {
2276       setRegsToType(MRI, Src1Regs, HalfTy);
2277     }
2278 
2279     if (Src2Regs.empty())
2280       split64BitValueForMapping(B, Src2Regs, HalfTy, MI.getOperand(3).getReg());
2281     else
2282       setRegsToType(MRI, Src2Regs, HalfTy);
2283 
2284     setRegsToType(MRI, DefRegs, HalfTy);
2285 
2286     B.buildSelect(DefRegs[0], CondRegs[0], Src1Regs[0], Src2Regs[0]);
2287     B.buildSelect(DefRegs[1], CondRegs[0], Src1Regs[1], Src2Regs[1]);
2288 
2289     MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
2290     MI.eraseFromParent();
2291     return;
2292   }
2293   case AMDGPU::G_BRCOND: {
2294     Register CondReg = MI.getOperand(0).getReg();
2295     // FIXME: Should use legalizer helper, but should change bool ext type.
2296     const RegisterBank *CondBank =
2297       OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2298 
2299     if (CondBank == &AMDGPU::SGPRRegBank) {
2300       MachineIRBuilder B(MI);
2301       const LLT S32 = LLT::scalar(32);
2302       Register NewCondReg = MRI.createGenericVirtualRegister(S32);
2303       MRI.setRegBank(NewCondReg, AMDGPU::SGPRRegBank);
2304 
2305       MI.getOperand(0).setReg(NewCondReg);
2306       B.buildZExt(NewCondReg, CondReg);
2307       return;
2308     }
2309 
2310     break;
2311   }
2312   case AMDGPU::G_AND:
2313   case AMDGPU::G_OR:
2314   case AMDGPU::G_XOR: {
2315     // 64-bit and is only available on the SALU, so split into 2 32-bit ops if
2316     // there is a VGPR input.
2317     Register DstReg = MI.getOperand(0).getReg();
2318     LLT DstTy = MRI.getType(DstReg);
2319 
2320     if (DstTy.getSizeInBits() == 1) {
2321       const RegisterBank *DstBank =
2322         OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2323       if (DstBank == &AMDGPU::VCCRegBank)
2324         break;
2325 
2326       MachineFunction *MF = MI.getParent()->getParent();
2327       ApplyRegBankMapping ApplyBank(*this, MRI, DstBank);
2328       MachineIRBuilder B(MI, ApplyBank);
2329       LegalizerHelper Helper(*MF, ApplyBank, B);
2330 
2331       if (Helper.widenScalar(MI, 0, LLT::scalar(32)) !=
2332           LegalizerHelper::Legalized)
2333         llvm_unreachable("widen scalar should have succeeded");
2334       return;
2335     }
2336 
2337     if (DstTy.getSizeInBits() != 64)
2338       break;
2339 
2340     LLT HalfTy = getHalfSizedType(DstTy);
2341     SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
2342     SmallVector<Register, 2> Src0Regs(OpdMapper.getVRegs(1));
2343     SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2));
2344 
2345     // All inputs are SGPRs, nothing special to do.
2346     if (DefRegs.empty()) {
2347       assert(Src0Regs.empty() && Src1Regs.empty());
2348       break;
2349     }
2350 
2351     assert(DefRegs.size() == 2);
2352     assert(Src0Regs.size() == Src1Regs.size() &&
2353            (Src0Regs.empty() || Src0Regs.size() == 2));
2354 
2355     // Depending on where the source registers came from, the generic code may
2356     // have decided to split the inputs already or not. If not, we still need to
2357     // extract the values.
2358     MachineIRBuilder B(MI);
2359 
2360     if (Src0Regs.empty())
2361       split64BitValueForMapping(B, Src0Regs, HalfTy, MI.getOperand(1).getReg());
2362     else
2363       setRegsToType(MRI, Src0Regs, HalfTy);
2364 
2365     if (Src1Regs.empty())
2366       split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg());
2367     else
2368       setRegsToType(MRI, Src1Regs, HalfTy);
2369 
2370     setRegsToType(MRI, DefRegs, HalfTy);
2371 
2372     B.buildInstr(Opc, {DefRegs[0]}, {Src0Regs[0], Src1Regs[0]});
2373     B.buildInstr(Opc, {DefRegs[1]}, {Src0Regs[1], Src1Regs[1]});
2374 
2375     MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
2376     MI.eraseFromParent();
2377     return;
2378   }
2379   case AMDGPU::G_ABS: {
2380     Register SrcReg = MI.getOperand(1).getReg();
2381     const RegisterBank *SrcBank = MRI.getRegBankOrNull(SrcReg);
2382 
2383     // There is no VALU abs instruction so we need to replace it with a sub and
2384     // max combination.
2385     if (SrcBank && SrcBank == &AMDGPU::VGPRRegBank) {
2386       MachineFunction *MF = MI.getParent()->getParent();
2387       ApplyRegBankMapping Apply(*this, MRI, &AMDGPU::VGPRRegBank);
2388       MachineIRBuilder B(MI, Apply);
2389       LegalizerHelper Helper(*MF, Apply, B);
2390 
2391       if (Helper.lowerAbsToMaxNeg(MI) != LegalizerHelper::Legalized)
2392         llvm_unreachable("lowerAbsToMaxNeg should have succeeded");
2393       return;
2394     }
2395     [[fallthrough]];
2396   }
2397   case AMDGPU::G_ADD:
2398   case AMDGPU::G_SUB:
2399   case AMDGPU::G_MUL:
2400   case AMDGPU::G_SHL:
2401   case AMDGPU::G_LSHR:
2402   case AMDGPU::G_ASHR:
2403   case AMDGPU::G_SMIN:
2404   case AMDGPU::G_SMAX:
2405   case AMDGPU::G_UMIN:
2406   case AMDGPU::G_UMAX: {
2407     Register DstReg = MI.getOperand(0).getReg();
2408     LLT DstTy = MRI.getType(DstReg);
2409 
2410     // 16-bit operations are VALU only, but can be promoted to 32-bit SALU.
2411     // Packed 16-bit operations need to be scalarized and promoted.
2412     if (DstTy != LLT::scalar(16) && DstTy != LLT::fixed_vector(2, 16))
2413       break;
2414 
2415     const RegisterBank *DstBank =
2416       OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2417     if (DstBank == &AMDGPU::VGPRRegBank)
2418       break;
2419 
2420     const LLT S32 = LLT::scalar(32);
2421     MachineBasicBlock *MBB = MI.getParent();
2422     MachineFunction *MF = MBB->getParent();
2423     ApplyRegBankMapping ApplySALU(*this, MRI, &AMDGPU::SGPRRegBank);
2424     MachineIRBuilder B(MI, ApplySALU);
2425 
2426     if (DstTy.isVector()) {
2427       Register WideSrc0Lo, WideSrc0Hi;
2428       Register WideSrc1Lo, WideSrc1Hi;
2429 
2430       unsigned ExtendOp = getExtendOp(MI.getOpcode());
2431       std::tie(WideSrc0Lo, WideSrc0Hi)
2432         = unpackV2S16ToS32(B, MI.getOperand(1).getReg(), ExtendOp);
2433       std::tie(WideSrc1Lo, WideSrc1Hi)
2434         = unpackV2S16ToS32(B, MI.getOperand(2).getReg(), ExtendOp);
2435       auto Lo = B.buildInstr(MI.getOpcode(), {S32}, {WideSrc0Lo, WideSrc1Lo});
2436       auto Hi = B.buildInstr(MI.getOpcode(), {S32}, {WideSrc0Hi, WideSrc1Hi});
2437       B.buildBuildVectorTrunc(DstReg, {Lo.getReg(0), Hi.getReg(0)});
2438       MI.eraseFromParent();
2439     } else {
2440       LegalizerHelper Helper(*MF, ApplySALU, B);
2441 
2442       if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized)
2443         llvm_unreachable("widen scalar should have succeeded");
2444 
2445       // FIXME: s16 shift amounts should be legal.
2446       if (Opc == AMDGPU::G_SHL || Opc == AMDGPU::G_LSHR ||
2447           Opc == AMDGPU::G_ASHR) {
2448         B.setInsertPt(*MBB, MI.getIterator());
2449         if (Helper.widenScalar(MI, 1, S32) != LegalizerHelper::Legalized)
2450           llvm_unreachable("widen scalar should have succeeded");
2451       }
2452     }
2453 
2454     return;
2455   }
2456   case AMDGPU::G_SEXT_INREG: {
2457     SmallVector<Register, 2> SrcRegs(OpdMapper.getVRegs(1));
2458     if (SrcRegs.empty())
2459       break; // Nothing to repair
2460 
2461     const LLT S32 = LLT::scalar(32);
2462     MachineIRBuilder B(MI);
2463     ApplyRegBankMapping O(*this, MRI, &AMDGPU::VGPRRegBank);
2464     GISelObserverWrapper Observer(&O);
2465     B.setChangeObserver(Observer);
2466 
2467     // Don't use LegalizerHelper's narrowScalar. It produces unwanted G_SEXTs
2468     // we would need to further expand, and doesn't let us directly set the
2469     // result registers.
2470     SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0));
2471 
2472     int Amt = MI.getOperand(2).getImm();
2473     if (Amt <= 32) {
2474       // Downstream users have expectations for the high bit behavior, so freeze
2475       // incoming undefined bits.
2476       if (Amt == 32) {
2477         // The low bits are unchanged.
2478         B.buildFreeze(DstRegs[0], SrcRegs[0]);
2479       } else {
2480         auto Freeze = B.buildFreeze(S32, SrcRegs[0]);
2481         // Extend in the low bits and propagate the sign bit to the high half.
2482         B.buildSExtInReg(DstRegs[0], Freeze, Amt);
2483       }
2484 
2485       B.buildAShr(DstRegs[1], DstRegs[0], B.buildConstant(S32, 31));
2486     } else {
2487       // The low bits are unchanged, and extend in the high bits.
2488       // No freeze required
2489       B.buildCopy(DstRegs[0], SrcRegs[0]);
      B.buildSExtInReg(DstRegs[1], SrcRegs[1], Amt - 32);
2491     }
2492 
2493     Register DstReg = MI.getOperand(0).getReg();
2494     MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
2495     MI.eraseFromParent();
2496     return;
2497   }
2498   case AMDGPU::G_CTPOP:
2499   case AMDGPU::G_BITREVERSE: {
2500     const RegisterBank *DstBank =
2501       OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2502     if (DstBank == &AMDGPU::SGPRRegBank)
2503       break;
2504 
2505     Register SrcReg = MI.getOperand(1).getReg();
2506     const LLT S32 = LLT::scalar(32);
2507     LLT Ty = MRI.getType(SrcReg);
2508     if (Ty == S32)
2509       break;
2510 
2511     ApplyRegBankMapping ApplyVALU(*this, MRI, &AMDGPU::VGPRRegBank);
2512     MachineIRBuilder B(MI, ApplyVALU);
2513 
2514     MachineFunction &MF = B.getMF();
2515     LegalizerHelper Helper(MF, ApplyVALU, B);
2516 
2517     if (Helper.narrowScalar(MI, 1, S32) != LegalizerHelper::Legalized)
2518       llvm_unreachable("narrowScalar should have succeeded");
2519     return;
2520   }
2521   case AMDGPU::G_AMDGPU_FFBH_U32:
2522   case AMDGPU::G_AMDGPU_FFBL_B32:
2523   case AMDGPU::G_CTLZ_ZERO_UNDEF:
2524   case AMDGPU::G_CTTZ_ZERO_UNDEF: {
2525     const RegisterBank *DstBank =
2526         OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2527     if (DstBank == &AMDGPU::SGPRRegBank)
2528       break;
2529 
2530     Register SrcReg = MI.getOperand(1).getReg();
2531     const LLT S32 = LLT::scalar(32);
2532     LLT Ty = MRI.getType(SrcReg);
2533     if (Ty == S32)
2534       break;
2535 
2536     // We can narrow this more efficiently than Helper can by using ffbh/ffbl
2537     // which return -1 when the input is zero:
2538     // (ctlz_zero_undef hi:lo) -> (umin (ffbh hi), (add (ffbh lo), 32))
2539     // (cttz_zero_undef hi:lo) -> (umin (add (ffbl hi), 32), (ffbl lo))
2540     // (ffbh hi:lo) -> (umin (ffbh hi), (uaddsat (ffbh lo), 32))
    // (ffbl hi:lo) -> (umin (uaddsat (ffbl hi), 32), (ffbl lo))
2542     ApplyRegBankMapping ApplyVALU(*this, MRI, &AMDGPU::VGPRRegBank);
2543     MachineIRBuilder B(MI, ApplyVALU);
2544     SmallVector<Register, 2> SrcRegs(OpdMapper.getVRegs(1));
2545     unsigned NewOpc = Opc == AMDGPU::G_CTLZ_ZERO_UNDEF
2546                           ? (unsigned)AMDGPU::G_AMDGPU_FFBH_U32
2547                           : Opc == AMDGPU::G_CTTZ_ZERO_UNDEF
2548                                 ? (unsigned)AMDGPU::G_AMDGPU_FFBL_B32
2549                                 : Opc;
2550     unsigned Idx = NewOpc == AMDGPU::G_AMDGPU_FFBH_U32;
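    // For ffbh, scan the high half first; for ffbl, scan the low half first.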
2551     auto X = B.buildInstr(NewOpc, {S32}, {SrcRegs[Idx]});
2552     auto Y = B.buildInstr(NewOpc, {S32}, {SrcRegs[Idx ^ 1]});
2553     unsigned AddOpc =
2554         Opc == AMDGPU::G_CTLZ_ZERO_UNDEF || Opc == AMDGPU::G_CTTZ_ZERO_UNDEF
2555             ? AMDGPU::G_ADD
2556             : AMDGPU::G_UADDSAT;
2557     Y = B.buildInstr(AddOpc, {S32}, {Y, B.buildConstant(S32, 32)});
2558     Register DstReg = MI.getOperand(0).getReg();
2559     B.buildUMin(DstReg, X, Y);
2560     MI.eraseFromParent();
2561     return;
2562   }
2563   case AMDGPU::G_SEXT:
2564   case AMDGPU::G_ZEXT:
2565   case AMDGPU::G_ANYEXT: {
2566     Register SrcReg = MI.getOperand(1).getReg();
2567     LLT SrcTy = MRI.getType(SrcReg);
2568     const bool Signed = Opc == AMDGPU::G_SEXT;
2569 
2570     assert(OpdMapper.getVRegs(1).empty());
2571 
2572     MachineIRBuilder B(MI);
2573     const RegisterBank *SrcBank =
2574       OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
2575 
2576     Register DstReg = MI.getOperand(0).getReg();
2577     LLT DstTy = MRI.getType(DstReg);
2578     if (DstTy.isScalar() &&
2579         SrcBank != &AMDGPU::SGPRRegBank &&
2580         SrcBank != &AMDGPU::VCCRegBank &&
        // FIXME: Should handle any type that rounds to s64 when irregular
        // breakdowns are supported.
2583         DstTy.getSizeInBits() == 64 &&
2584         SrcTy.getSizeInBits() <= 32) {
2585       SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
2586 
2587       // Extend to 32-bit, and then extend the low half.
2588       if (Signed) {
2589         // TODO: Should really be buildSExtOrCopy
2590         B.buildSExtOrTrunc(DefRegs[0], SrcReg);
2591       } else if (Opc == AMDGPU::G_ZEXT) {
2592         B.buildZExtOrTrunc(DefRegs[0], SrcReg);
2593       } else {
2594         B.buildAnyExtOrTrunc(DefRegs[0], SrcReg);
2595       }
2596 
2597       extendLow32IntoHigh32(B, DefRegs[1], DefRegs[0], Opc, *SrcBank);
2598       MRI.setRegBank(DstReg, *SrcBank);
2599       MI.eraseFromParent();
2600       return;
2601     }
2602 
2603     if (SrcTy != LLT::scalar(1))
2604       return;
2605 
    // It is not legal to have a legalization artifact with a VCC source.
    // Rather than introducing a copy, insert the select that the copy would
    // have been lowered to.
2609     if (SrcBank == &AMDGPU::VCCRegBank) {
2610       SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
2611 
2612       const RegisterBank *DstBank = &AMDGPU::VGPRRegBank;
2613 
2614       unsigned DstSize = DstTy.getSizeInBits();
2615       // 64-bit select is SGPR only
2616       const bool UseSel64 = DstSize > 32 &&
2617         SrcBank->getID() == AMDGPU::SGPRRegBankID;
2618 
2619       // TODO: Should s16 select be legal?
2620       LLT SelType = UseSel64 ? LLT::scalar(64) : LLT::scalar(32);
2621       auto True = B.buildConstant(SelType, Signed ? -1 : 1);
2622       auto False = B.buildConstant(SelType, 0);
2623 
2624       MRI.setRegBank(True.getReg(0), *DstBank);
2625       MRI.setRegBank(False.getReg(0), *DstBank);
2626       MRI.setRegBank(DstReg, *DstBank);
2627 
2628       if (DstSize > 32) {
2629         B.buildSelect(DefRegs[0], SrcReg, True, False);
2630         extendLow32IntoHigh32(B, DefRegs[1], DefRegs[0], Opc, *SrcBank, true);
2631       } else if (DstSize < 32) {
2632         auto Sel = B.buildSelect(SelType, SrcReg, True, False);
2633         MRI.setRegBank(Sel.getReg(0), *DstBank);
2634         B.buildTrunc(DstReg, Sel);
2635       } else {
2636         B.buildSelect(DstReg, SrcReg, True, False);
2637       }
2638 
2639       MI.eraseFromParent();
2640       return;
2641     }
2642 
2643     break;
2644   }
2645   case AMDGPU::G_EXTRACT_VECTOR_ELT: {
2646     SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0));
2647 
2648     assert(OpdMapper.getVRegs(1).empty() && OpdMapper.getVRegs(2).empty());
2649 
2650     Register DstReg = MI.getOperand(0).getReg();
2651     Register SrcReg = MI.getOperand(1).getReg();
2652 
2653     const LLT S32 = LLT::scalar(32);
2654     LLT DstTy = MRI.getType(DstReg);
2655     LLT SrcTy = MRI.getType(SrcReg);
2656 
2657     if (foldExtractEltToCmpSelect(MI, MRI, OpdMapper))
2658       return;
2659 
2660     MachineIRBuilder B(MI);
2661 
2662     const ValueMapping &DstMapping
2663       = OpdMapper.getInstrMapping().getOperandMapping(0);
2664     const RegisterBank *DstBank = DstMapping.BreakDown[0].RegBank;
2665     const RegisterBank *SrcBank =
2666       OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
2667     const RegisterBank *IdxBank =
2668         OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
2669 
2670     Register BaseIdxReg;
2671     unsigned ConstOffset;
2672     std::tie(BaseIdxReg, ConstOffset) =
2673         AMDGPU::getBaseWithConstantOffset(MRI, MI.getOperand(2).getReg());
2674 
2675     // See if the index is an add of a constant which will be foldable by moving
2676     // the base register of the index later if this is going to be executed in a
2677     // waterfall loop. This is essentially to reassociate the add of a constant
2678     // with the readfirstlane.
2679     bool ShouldMoveIndexIntoLoop = IdxBank != &AMDGPU::SGPRRegBank &&
2680                                    ConstOffset > 0 &&
2681                                    ConstOffset < SrcTy.getNumElements();
2682 
2683     // Move the base register. We'll re-insert the add later.
2684     if (ShouldMoveIndexIntoLoop)
2685       MI.getOperand(2).setReg(BaseIdxReg);
2686 
2687     // If this is a VGPR result only because the index was a VGPR result, the
2688     // actual indexing will be done on the SGPR source vector, which will
2689     // produce a scalar result. We need to copy to the VGPR result inside the
2690     // waterfall loop.
2691     const bool NeedCopyToVGPR = DstBank == &AMDGPU::VGPRRegBank &&
2692                                 SrcBank == &AMDGPU::SGPRRegBank;
2693     if (DstRegs.empty()) {
2694       applyDefaultMapping(OpdMapper);
2695 
2696       executeInWaterfallLoop(MI, MRI, { 2 });
2697 
2698       if (NeedCopyToVGPR) {
2699         // We don't want a phi for this temporary reg.
2700         Register TmpReg = MRI.createGenericVirtualRegister(DstTy);
2701         MRI.setRegBank(TmpReg, AMDGPU::SGPRRegBank);
2702         MI.getOperand(0).setReg(TmpReg);
2703         B.setInsertPt(*MI.getParent(), ++MI.getIterator());
2704 
2705         // Use a v_mov_b32 here to make the exec dependency explicit.
2706         buildVCopy(B, DstReg, TmpReg);
2707       }
2708 
2709       // Re-insert the constant offset add inside the waterfall loop.
2710       if (ShouldMoveIndexIntoLoop)
2711         reinsertVectorIndexAdd(B, MI, 2, ConstOffset);
2712 
2713       return;
2714     }
2715 
2716     assert(DstTy.getSizeInBits() == 64);
2717 
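    // A 64-bit element is extracted as two 32-bit pieces from a bitcast of the
    // source vector.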
2718     LLT Vec32 = LLT::fixed_vector(2 * SrcTy.getNumElements(), 32);
2719 
2720     auto CastSrc = B.buildBitcast(Vec32, SrcReg);
2721     auto One = B.buildConstant(S32, 1);
2722 
2723     MachineBasicBlock::iterator MII = MI.getIterator();
2724 
2725     // Split the vector index into 32-bit pieces. Prepare to move all of the
2726     // new instructions into a waterfall loop if necessary.
2727     //
2728     // Don't put the bitcast or constant in the loop.
2729     MachineInstrSpan Span(MII, &B.getMBB());
2730 
2731     // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1).
2732     auto IdxLo = B.buildShl(S32, BaseIdxReg, One);
2733     auto IdxHi = B.buildAdd(S32, IdxLo, One);
2734 
2735     auto Extract0 = B.buildExtractVectorElement(DstRegs[0], CastSrc, IdxLo);
2736     auto Extract1 = B.buildExtractVectorElement(DstRegs[1], CastSrc, IdxHi);
2737 
2738     MRI.setRegBank(DstReg, *DstBank);
2739     MRI.setRegBank(CastSrc.getReg(0), *SrcBank);
2740     MRI.setRegBank(One.getReg(0), AMDGPU::SGPRRegBank);
2741     MRI.setRegBank(IdxLo.getReg(0), AMDGPU::SGPRRegBank);
2742     MRI.setRegBank(IdxHi.getReg(0), AMDGPU::SGPRRegBank);
2743 
2744     SmallSet<Register, 4> OpsToWaterfall;
2745     if (!collectWaterfallOperands(OpsToWaterfall, MI, MRI, { 2 })) {
2746       MI.eraseFromParent();
2747       return;
2748     }
2749 
2750     // Remove the original instruction to avoid potentially confusing the
2751     // waterfall loop logic.
2752     B.setInstr(*Span.begin());
2753     MI.eraseFromParent();
2754     executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()),
2755                            OpsToWaterfall, MRI);
2756 
2757     if (NeedCopyToVGPR) {
2758       MachineBasicBlock *LoopBB = Extract1->getParent();
2759       Register TmpReg0 = MRI.createGenericVirtualRegister(S32);
2760       Register TmpReg1 = MRI.createGenericVirtualRegister(S32);
2761       MRI.setRegBank(TmpReg0, AMDGPU::SGPRRegBank);
2762       MRI.setRegBank(TmpReg1, AMDGPU::SGPRRegBank);
2763 
2764       Extract0->getOperand(0).setReg(TmpReg0);
2765       Extract1->getOperand(0).setReg(TmpReg1);
2766 
2767       B.setInsertPt(*LoopBB, ++Extract1->getIterator());
2768 
2769       buildVCopy(B, DstRegs[0], TmpReg0);
2770       buildVCopy(B, DstRegs[1], TmpReg1);
2771     }
2772 
2773     if (ShouldMoveIndexIntoLoop)
2774       reinsertVectorIndexAdd(B, *IdxLo, 1, ConstOffset);
2775 
2776     return;
2777   }
2778   case AMDGPU::G_INSERT_VECTOR_ELT: {
2779     SmallVector<Register, 2> InsRegs(OpdMapper.getVRegs(2));
2780 
2781     Register DstReg = MI.getOperand(0).getReg();
2782     LLT VecTy = MRI.getType(DstReg);
2783 
2784     assert(OpdMapper.getVRegs(0).empty());
2785     assert(OpdMapper.getVRegs(3).empty());
2786 
2787     if (substituteSimpleCopyRegs(OpdMapper, 1))
2788       MRI.setType(MI.getOperand(1).getReg(), VecTy);
2789 
2790     if (foldInsertEltToCmpSelect(MI, MRI, OpdMapper))
2791       return;
2792 
2793     const RegisterBank *IdxBank =
2794       OpdMapper.getInstrMapping().getOperandMapping(3).BreakDown[0].RegBank;
2795 
2796     Register SrcReg = MI.getOperand(1).getReg();
2797     Register InsReg = MI.getOperand(2).getReg();
2798     LLT InsTy = MRI.getType(InsReg);
2799     (void)InsTy;
2800 
2801     Register BaseIdxReg;
2802     unsigned ConstOffset;
2803     std::tie(BaseIdxReg, ConstOffset) =
2804         AMDGPU::getBaseWithConstantOffset(MRI, MI.getOperand(3).getReg());
2805 
2806     // See if the index is an add of a constant which will be foldable by moving
2807     // the base register of the index later if this is going to be executed in a
2808     // waterfall loop. This is essentially to reassociate the add of a constant
2809     // with the readfirstlane.
2810     bool ShouldMoveIndexIntoLoop = IdxBank != &AMDGPU::SGPRRegBank &&
2811       ConstOffset > 0 &&
2812       ConstOffset < VecTy.getNumElements();
2813 
2814     // Move the base register. We'll re-insert the add later.
2815     if (ShouldMoveIndexIntoLoop)
2816       MI.getOperand(3).setReg(BaseIdxReg);
2817 
2818 
2819     if (InsRegs.empty()) {
2820       executeInWaterfallLoop(MI, MRI, { 3 });
2821 
2822       // Re-insert the constant offset add inside the waterfall loop.
2823       if (ShouldMoveIndexIntoLoop) {
2824         MachineIRBuilder B(MI);
2825         reinsertVectorIndexAdd(B, MI, 3, ConstOffset);
2826       }
2827 
2828       return;
2829     }
2830 
2831 
2832     assert(InsTy.getSizeInBits() == 64);
2833 
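    // Likewise, a 64-bit insert is decomposed into two 32-bit inserts on the
    // bitcast vector.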
2834     const LLT S32 = LLT::scalar(32);
2835     LLT Vec32 = LLT::fixed_vector(2 * VecTy.getNumElements(), 32);
2836 
2837     MachineIRBuilder B(MI);
2838     auto CastSrc = B.buildBitcast(Vec32, SrcReg);
2839     auto One = B.buildConstant(S32, 1);
2840 
2841     // Split the vector index into 32-bit pieces. Prepare to move all of the
2842     // new instructions into a waterfall loop if necessary.
2843     //
2844     // Don't put the bitcast or constant in the loop.
2845     MachineInstrSpan Span(MachineBasicBlock::iterator(&MI), &B.getMBB());
2846 
2847     // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1).
2848     auto IdxLo = B.buildShl(S32, BaseIdxReg, One);
2849     auto IdxHi = B.buildAdd(S32, IdxLo, One);
2850 
2851     auto InsLo = B.buildInsertVectorElement(Vec32, CastSrc, InsRegs[0], IdxLo);
2852     auto InsHi = B.buildInsertVectorElement(Vec32, InsLo, InsRegs[1], IdxHi);
2853 
2854     const RegisterBank *DstBank =
2855       OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2856     const RegisterBank *SrcBank =
2857       OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
2858     const RegisterBank *InsSrcBank =
2859       OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
2860 
2861     MRI.setRegBank(InsReg, *InsSrcBank);
2862     MRI.setRegBank(CastSrc.getReg(0), *SrcBank);
2863     MRI.setRegBank(InsLo.getReg(0), *DstBank);
2864     MRI.setRegBank(InsHi.getReg(0), *DstBank);
2865     MRI.setRegBank(One.getReg(0), AMDGPU::SGPRRegBank);
2866     MRI.setRegBank(IdxLo.getReg(0), AMDGPU::SGPRRegBank);
2867     MRI.setRegBank(IdxHi.getReg(0), AMDGPU::SGPRRegBank);
2868 
2869 
2870     SmallSet<Register, 4> OpsToWaterfall;
2871     if (!collectWaterfallOperands(OpsToWaterfall, MI, MRI, { 3 })) {
2872       B.setInsertPt(B.getMBB(), MI);
2873       B.buildBitcast(DstReg, InsHi);
2874       MI.eraseFromParent();
2875       return;
2876     }
2877 
2878     B.setInstr(*Span.begin());
2879     MI.eraseFromParent();
2880 
2881     // Figure out the point after the waterfall loop before mangling the control
2882     // flow.
2883     executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()),
2884                            OpsToWaterfall, MRI);
2885 
2886     // The insertion point is now right after the original instruction.
2887     //
2888     // Keep the bitcast to the original vector type out of the loop. Doing this
2889     // saves an extra phi we don't need inside the loop.
2890     B.buildBitcast(DstReg, InsHi);
2891 
2892     // Re-insert the constant offset add inside the waterfall loop.
2893     if (ShouldMoveIndexIntoLoop)
2894       reinsertVectorIndexAdd(B, *IdxLo, 1, ConstOffset);
2895 
2896     return;
2897   }
2898   case AMDGPU::G_AMDGPU_BUFFER_LOAD:
2899   case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
2900   case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT:
2901   case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
2902   case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE:
2903   case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT:
2904   case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE:
2905   case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16:
2906   case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT:
2907   case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16:
2908   case AMDGPU::G_AMDGPU_BUFFER_STORE:
2909   case AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE:
2910   case AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT:
2911   case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT:
2912   case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16:
2913   case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT:
2914   case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16: {
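    // The resource descriptor (operand 1) and the scalar offset (operand 4)
    // must be uniform; executeInWaterfallLoop wraps the access in a loop if
    // either was assigned a VGPR.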
2915     applyDefaultMapping(OpdMapper);
2916     executeInWaterfallLoop(MI, MRI, {1, 4});
2917     return;
2918   }
2919   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP:
2920   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD:
2921   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB:
2922   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN:
2923   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN:
2924   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX:
2925   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX:
2926   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND:
2927   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR:
2928   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR:
2929   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC:
2930   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC: {
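    // For the atomics, the resource descriptor is operand 2 and the scalar
    // offset is operand 5; both must be uniform.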
2931     applyDefaultMapping(OpdMapper);
2932     executeInWaterfallLoop(MI, MRI, {2, 5});
2933     return;
2934   }
2935   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD:
2936   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN:
2937   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX: {
2938     applyDefaultMapping(OpdMapper);
2939     executeInWaterfallLoop(MI, MRI, {2, 5});
2940     return;
2941   }
2942   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP: {
2943     applyDefaultMapping(OpdMapper);
2944     executeInWaterfallLoop(MI, MRI, {3, 6});
2945     return;
2946   }
2947   case AMDGPU::G_AMDGPU_S_BUFFER_LOAD: {
2948     applyMappingSBufferLoad(OpdMapper);
2949     return;
2950   }
2951   case AMDGPU::G_INTRINSIC: {
2952     switch (MI.getIntrinsicID()) {
2953     case Intrinsic::amdgcn_readlane: {
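      // Operand 2 is the VGPR source value; operand 3 is the lane index, which
      // must be uniform.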
2954       substituteSimpleCopyRegs(OpdMapper, 2);
2955 
2956       assert(OpdMapper.getVRegs(0).empty());
2957       assert(OpdMapper.getVRegs(3).empty());
2958 
2959       // Make sure the index is an SGPR. It doesn't make sense to run this in a
2960       // waterfall loop, so assume it's a uniform value.
2961       constrainOpWithReadfirstlane(MI, MRI, 3); // Index
2962       return;
2963     }
2964     case Intrinsic::amdgcn_writelane: {
2965       assert(OpdMapper.getVRegs(0).empty());
2966       assert(OpdMapper.getVRegs(2).empty());
2967       assert(OpdMapper.getVRegs(3).empty());
2968 
2969       substituteSimpleCopyRegs(OpdMapper, 4); // VGPR input val
2970       constrainOpWithReadfirstlane(MI, MRI, 2); // Source value
2971       constrainOpWithReadfirstlane(MI, MRI, 3); // Index
2972       return;
2973     }
2974     case Intrinsic::amdgcn_interp_p1:
2975     case Intrinsic::amdgcn_interp_p2:
2976     case Intrinsic::amdgcn_interp_mov:
2977     case Intrinsic::amdgcn_interp_p1_f16:
2978     case Intrinsic::amdgcn_interp_p2_f16:
2979     case Intrinsic::amdgcn_lds_param_load: {
2980       applyDefaultMapping(OpdMapper);
2981 
2982       // Readfirstlane the m0 value, which is always the last operand.
2983       // FIXME: Should this be a waterfall loop instead?
2984       constrainOpWithReadfirstlane(MI, MRI, MI.getNumOperands() - 1); // Index
2985       return;
2986     }
2987     case Intrinsic::amdgcn_interp_inreg_p10:
2988     case Intrinsic::amdgcn_interp_inreg_p2:
2989     case Intrinsic::amdgcn_interp_inreg_p10_f16:
2990     case Intrinsic::amdgcn_interp_inreg_p2_f16:
2991       applyDefaultMapping(OpdMapper);
2992       return;
2993     case Intrinsic::amdgcn_permlane16:
2994     case Intrinsic::amdgcn_permlanex16: {
2995       // Doing a waterfall loop over these wouldn't make any sense.
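      // Operands 4 and 5 are the src1/src2 inputs, which the hardware requires
      // to be uniform.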
2996       substituteSimpleCopyRegs(OpdMapper, 2);
2997       substituteSimpleCopyRegs(OpdMapper, 3);
2998       constrainOpWithReadfirstlane(MI, MRI, 4);
2999       constrainOpWithReadfirstlane(MI, MRI, 5);
3000       return;
3001     }
3002     case Intrinsic::amdgcn_sbfe:
3003       applyMappingBFE(OpdMapper, true);
3004       return;
3005     case Intrinsic::amdgcn_ubfe:
3006       applyMappingBFE(OpdMapper, false);
3007       return;
3008     case Intrinsic::amdgcn_inverse_ballot:
3009       applyDefaultMapping(OpdMapper);
3010       constrainOpWithReadfirstlane(MI, MRI, 2); // Mask
3011       return;
3012     case Intrinsic::amdgcn_ballot:
3013       // Use default handling and insert a copy to the vcc source.
3014       break;
3015     }
3016     break;
3017   }
3018   case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
3019   case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16:
3020   case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE:
3021   case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16: {
3022     const AMDGPU::RsrcIntrinsic *RSrcIntrin
3023       = AMDGPU::lookupRsrcIntrinsic(MI.getIntrinsicID());
3024     assert(RSrcIntrin && RSrcIntrin->IsImage);
3025     // Non-images can have complications from operands that allow both SGPR
3026     // and VGPR. For now it's too complicated to figure out the final opcode
3027     // to derive the register bank from the MCInstrDesc.
3028     applyMappingImage(MI, OpdMapper, MRI, RSrcIntrin->RsrcArg);
3029     return;
3030   }
3031   case AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY: {
3032     unsigned N = MI.getNumExplicitOperands() - 2;
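    // Operand N is the resource descriptor, which must be uniform; waterfall
    // over it if it was assigned a VGPR.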
3033     applyDefaultMapping(OpdMapper);
3034     executeInWaterfallLoop(MI, MRI, { N });
3035     return;
3036   }
3037   case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: {
3038     auto IntrID = MI.getIntrinsicID();
3039     switch (IntrID) {
3040     case Intrinsic::amdgcn_ds_ordered_add:
3041     case Intrinsic::amdgcn_ds_ordered_swap: {
3042       // This is only allowed to execute with 1 lane, so readfirstlane is safe.
3043       assert(OpdMapper.getVRegs(0).empty());
3044       substituteSimpleCopyRegs(OpdMapper, 3);
3045       constrainOpWithReadfirstlane(MI, MRI, 2); // M0
3046       return;
3047     }
3048     case Intrinsic::amdgcn_ds_gws_init:
3049     case Intrinsic::amdgcn_ds_gws_barrier:
3050     case Intrinsic::amdgcn_ds_gws_sema_br: {
3051       // Only the first lane is executed, so readfirstlane is safe.
3052       substituteSimpleCopyRegs(OpdMapper, 1);
3053       constrainOpWithReadfirstlane(MI, MRI, 2); // M0
3054       return;
3055     }
3056     case Intrinsic::amdgcn_ds_gws_sema_v:
3057     case Intrinsic::amdgcn_ds_gws_sema_p:
3058     case Intrinsic::amdgcn_ds_gws_sema_release_all: {
3059       // Only the first lane is executed, so readfirstlane is safe.
3060       constrainOpWithReadfirstlane(MI, MRI, 1); // M0
3061       return;
3062     }
3063     case Intrinsic::amdgcn_ds_append:
3064     case Intrinsic::amdgcn_ds_consume: {
3065       constrainOpWithReadfirstlane(MI, MRI, 2); // M0
3066       return;
3067     }
3068     case Intrinsic::amdgcn_s_sendmsg:
3069     case Intrinsic::amdgcn_s_sendmsghalt: {
3070       // FIXME: Should this use a waterfall loop?
3071       constrainOpWithReadfirstlane(MI, MRI, 2); // M0
3072       return;
3073     }
3074     case Intrinsic::amdgcn_s_setreg: {
3075       constrainOpWithReadfirstlane(MI, MRI, 2);
3076       return;
3077     }
3078     case Intrinsic::amdgcn_raw_buffer_load_lds:
3079     case Intrinsic::amdgcn_raw_ptr_buffer_load_lds: {
3080       applyDefaultMapping(OpdMapper);
3081       constrainOpWithReadfirstlane(MI, MRI, 1); // rsrc
3082       constrainOpWithReadfirstlane(MI, MRI, 2); // M0
3083       constrainOpWithReadfirstlane(MI, MRI, 5); // soffset
3084       return;
3085     }
3086     case Intrinsic::amdgcn_struct_buffer_load_lds:
3087     case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
3088       applyDefaultMapping(OpdMapper);
3089       constrainOpWithReadfirstlane(MI, MRI, 1); // rsrc
3090       constrainOpWithReadfirstlane(MI, MRI, 2); // M0
3091       constrainOpWithReadfirstlane(MI, MRI, 6); // soffset
3092       return;
3093     }
3094     case Intrinsic::amdgcn_global_load_lds: {
3095       applyDefaultMapping(OpdMapper);
3096       constrainOpWithReadfirstlane(MI, MRI, 2);
3097       return;
3098     }
3099     case Intrinsic::amdgcn_lds_direct_load: {
3100       applyDefaultMapping(OpdMapper);
3101       // Readfirstlane the m0 value, which is always the last operand.
3102       constrainOpWithReadfirstlane(MI, MRI, MI.getNumOperands() - 1); // Index
3103       return;
3104     }
3105     case Intrinsic::amdgcn_exp_row:
3106       applyDefaultMapping(OpdMapper);
3107       constrainOpWithReadfirstlane(MI, MRI, 8); // M0
3108       return;
3109     default: {
3110       if (const AMDGPU::RsrcIntrinsic *RSrcIntrin =
3111               AMDGPU::lookupRsrcIntrinsic(IntrID)) {
3112         // Non-images can have complications from operands that allow both SGPR
3113         // and VGPR. For now it's too complicated to figure out the final opcode
3114         // to derive the register bank from the MCInstrDesc.
3115         if (RSrcIntrin->IsImage) {
3116           applyMappingImage(MI, OpdMapper, MRI, RSrcIntrin->RsrcArg);
3117           return;
3118         }
3119       }
3120 
3121       break;
3122     }
3123     }
3124     break;
3125   }
3126   case AMDGPU::G_SI_CALL: {
3127     // Use a set to avoid extra readfirstlanes in the case where multiple
3128     // operands are the same register.
3129     SmallSet<Register, 4> SGPROperandRegs;
3130 
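    // Operand 1 is the callee. If it is already uniform, no waterfall loop is
    // needed.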
3131     if (!collectWaterfallOperands(SGPROperandRegs, MI, MRI, {1}))
3132       break;
3133 
3134     // Move all copies to physical SGPRs that are used by the call instruction
3135     // into the loop block. Search backwards from the call for these copies,
3136     // stopping at the ADJCALLSTACKUP.
3137     unsigned FrameSetupOpcode = AMDGPU::ADJCALLSTACKUP;
3138     unsigned FrameDestroyOpcode = AMDGPU::ADJCALLSTACKDOWN;
3139 
3140     // Move all non-copies before the copies, so that a complete range can be
3141     // moved into the waterfall loop.
3142     SmallVector<MachineInstr *, 4> NonCopyInstrs;
3143     // Count of NonCopyInstrs found until the current LastCopy.
3144     unsigned NonCopyInstrsLen = 0;
3145     MachineBasicBlock::iterator Start(&MI);
3146     MachineBasicBlock::iterator LastCopy = Start;
3147     MachineBasicBlock *MBB = MI.getParent();
3148     const SIMachineFunctionInfo *Info =
3149         MBB->getParent()->getInfo<SIMachineFunctionInfo>();
3150     while (Start->getOpcode() != FrameSetupOpcode) {
3151       --Start;
3152       bool IsCopy = false;
3153       if (Start->getOpcode() == AMDGPU::COPY) {
3154         auto &Dst = Start->getOperand(0);
3155         if (Dst.isReg()) {
3156           Register Reg = Dst.getReg();
3157           if (Reg.isPhysical() && MI.readsRegister(Reg, TRI)) {
3158             IsCopy = true;
3159           } else {
3160             // Also move the copy from the scratch rsrc descriptor into the loop
3161             // to allow it to be optimized away.
3162             auto &Src = Start->getOperand(1);
3163             if (Src.isReg()) {
3164               Reg = Src.getReg();
3165               IsCopy = Info->getScratchRSrcReg() == Reg;
3166             }
3167           }
3168         }
3169       }
3170 
3171       if (IsCopy) {
3172         LastCopy = Start;
3173         NonCopyInstrsLen = NonCopyInstrs.size();
3174       } else {
3175         NonCopyInstrs.push_back(&*Start);
3176       }
3177     }
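    // Drop the non-copies that were found beyond the earliest copy (they
    // already precede all of the copies); only the ones interleaved with the
    // copies need to be hoisted.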
3178     NonCopyInstrs.resize(NonCopyInstrsLen);
3179 
3180     for (auto *NonCopy : reverse(NonCopyInstrs)) {
3181       MBB->splice(LastCopy, MBB, NonCopy->getIterator());
3182     }
3183     Start = LastCopy;
3184 
3185     // Do the same for the copies after the call.
3186     NonCopyInstrs.clear();
3187     NonCopyInstrsLen = 0;
3188     MachineBasicBlock::iterator End(&MI);
3189     LastCopy = End;
3190     while (End->getOpcode() != FrameDestroyOpcode) {
3191       ++End;
3192       bool IsCopy = false;
3193       if (End->getOpcode() == AMDGPU::COPY) {
3194         auto &Src = End->getOperand(1);
3195         if (Src.isReg()) {
3196           Register Reg = Src.getReg();
3197           IsCopy = Reg.isPhysical() && MI.modifiesRegister(Reg, TRI);
3198         }
3199       }
3200 
3201       if (IsCopy) {
3202         LastCopy = End;
3203         NonCopyInstrsLen = NonCopyInstrs.size();
3204       } else {
3205         NonCopyInstrs.push_back(&*End);
3206       }
3207     }
3208     NonCopyInstrs.resize(NonCopyInstrsLen);
3209 
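    // Keep the last result copy inside the waterfall range and re-insert the
    // hoisted non-copies just after it.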
3210     End = LastCopy;
3211     ++LastCopy;
3212     for (auto *NonCopy : reverse(NonCopyInstrs)) {
3213       MBB->splice(LastCopy, MBB, NonCopy->getIterator());
3214     }
3215 
3216     ++End;
3217     MachineIRBuilder B(*Start);
3218     executeInWaterfallLoop(B, make_range(Start, End), SGPROperandRegs, MRI);
3219     break;
3220   }
3221   case AMDGPU::G_LOAD:
3222   case AMDGPU::G_ZEXTLOAD:
3223   case AMDGPU::G_SEXTLOAD: {
3224     if (applyMappingLoad(MI, OpdMapper, MRI))
3225       return;
3226     break;
3227   }
3228   case AMDGPU::G_DYN_STACKALLOC:
3229     applyMappingDynStackAlloc(MI, OpdMapper, MRI);
3230     return;
3231   case AMDGPU::G_SBFX:
3232     applyMappingBFE(OpdMapper, /*Signed*/ true);
3233     return;
3234   case AMDGPU::G_UBFX:
3235     applyMappingBFE(OpdMapper, /*Signed*/ false);
3236     return;
3237   case AMDGPU::G_AMDGPU_MAD_U64_U32:
3238   case AMDGPU::G_AMDGPU_MAD_I64_I32:
3239     applyMappingMAD_64_32(OpdMapper);
3240     return;
3241   default:
3242     break;
3243   }
3244 
3245   return applyDefaultMapping(OpdMapper);
3246 }
3247 
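// sgpr, sgpr -> sgpr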
3248 // vgpr, sgpr -> vgpr
3249 // vgpr, agpr -> vgpr
3250 // agpr, agpr -> agpr
3251 // agpr, sgpr -> vgpr
3252 static unsigned regBankUnion(unsigned RB0, unsigned RB1) {
3253   if (RB0 == AMDGPU::InvalidRegBankID)
3254     return RB1;
3255   if (RB1 == AMDGPU::InvalidRegBankID)
3256     return RB0;
3257 
3258   if (RB0 == AMDGPU::SGPRRegBankID && RB1 == AMDGPU::SGPRRegBankID)
3259     return AMDGPU::SGPRRegBankID;
3260 
3261   if (RB0 == AMDGPU::AGPRRegBankID && RB1 == AMDGPU::AGPRRegBankID)
3262     return AMDGPU::AGPRRegBankID;
3263 
3264   return AMDGPU::VGPRRegBankID;
3265 }
3266 
3267 static unsigned regBankBoolUnion(unsigned RB0, unsigned RB1) {
3268   if (RB0 == AMDGPU::InvalidRegBankID)
3269     return RB1;
3270   if (RB1 == AMDGPU::InvalidRegBankID)
3271     return RB0;
3272 
3273   // vcc, vcc -> vcc
3274   // vcc, sgpr -> vcc
3275   // vcc, vgpr -> vcc
3276   if (RB0 == AMDGPU::VCCRegBankID || RB1 == AMDGPU::VCCRegBankID)
3277     return AMDGPU::VCCRegBankID;
3278 
3279   // Neither operand is vcc, so fall back to the normal bank union.
3280   return regBankUnion(RB0, RB1);
3281 }
3282 
3283 unsigned AMDGPURegisterBankInfo::getMappingType(const MachineRegisterInfo &MRI,
3284                                                 const MachineInstr &MI) const {
3285   unsigned RegBank = AMDGPU::InvalidRegBankID;
3286 
3287   for (const MachineOperand &MO : MI.operands()) {
3288     if (!MO.isReg())
3289       continue;
3290     Register Reg = MO.getReg();
3291     if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) {
3292       RegBank = regBankUnion(RegBank, Bank->getID());
3293       if (RegBank == AMDGPU::VGPRRegBankID)
3294         break;
3295     }
3296   }
3297 
3298   return RegBank;
3299 }
3300 
3301 bool AMDGPURegisterBankInfo::isSALUMapping(const MachineInstr &MI) const {
3302   const MachineFunction &MF = *MI.getParent()->getParent();
3303   const MachineRegisterInfo &MRI = MF.getRegInfo();
3304   for (const MachineOperand &MO : MI.operands()) {
3305     if (!MO.isReg())
3306       continue;
3307     Register Reg = MO.getReg();
3308     if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) {
3309       if (Bank->getID() != AMDGPU::SGPRRegBankID)
3310         return false;
3311     }
3312   }
3313   return true;
3314 }
3315 
3316 const RegisterBankInfo::InstructionMapping &
3317 AMDGPURegisterBankInfo::getDefaultMappingSOP(const MachineInstr &MI) const {
3318   const MachineFunction &MF = *MI.getParent()->getParent();
3319   const MachineRegisterInfo &MRI = MF.getRegInfo();
3320   SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
3321 
3322   for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
3323     const MachineOperand &SrcOp = MI.getOperand(i);
3324     if (!SrcOp.isReg())
3325       continue;
3326 
3327     unsigned Size = getSizeInBits(SrcOp.getReg(), MRI, *TRI);
3328     OpdsMapping[i] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
3329   }
3330   return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
3331                                MI.getNumOperands());
3332 }
3333 
3334 const RegisterBankInfo::InstructionMapping &
3335 AMDGPURegisterBankInfo::getDefaultMappingVOP(const MachineInstr &MI) const {
3336   const MachineFunction &MF = *MI.getParent()->getParent();
3337   const MachineRegisterInfo &MRI = MF.getRegInfo();
3338   SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
3339 
3340   // Even though we technically could use SGPRs, this would require knowledge of
3341   // the constant bus restriction. Force all sources to VGPR (except for VCC).
3342   //
3343   // TODO: Unary ops are trivially OK, so accept SGPRs?
3344   for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
3345     const MachineOperand &Src = MI.getOperand(i);
3346     if (!Src.isReg())
3347       continue;
3348 
3349     unsigned Size = getSizeInBits(Src.getReg(), MRI, *TRI);
3350     unsigned BankID = Size == 1 ? AMDGPU::VCCRegBankID : AMDGPU::VGPRRegBankID;
3351     OpdsMapping[i] = AMDGPU::getValueMapping(BankID, Size);
3352   }
3353 
3354   return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
3355                                MI.getNumOperands());
3356 }
3357 
3358 const RegisterBankInfo::InstructionMapping &
3359 AMDGPURegisterBankInfo::getDefaultMappingAllVGPR(const MachineInstr &MI) const {
3360   const MachineFunction &MF = *MI.getParent()->getParent();
3361   const MachineRegisterInfo &MRI = MF.getRegInfo();
3362   SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
3363 
3364   for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
3365     const MachineOperand &Op = MI.getOperand(I);
3366     if (!Op.isReg())
3367       continue;
3368 
3369     unsigned Size = getSizeInBits(Op.getReg(), MRI, *TRI);
3370     OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3371   }
3372 
3373   return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
3374                                MI.getNumOperands());
3375 }
3376 
3377 const RegisterBankInfo::InstructionMapping &
3378 AMDGPURegisterBankInfo::getImageMapping(const MachineRegisterInfo &MRI,
3379                                         const MachineInstr &MI,
3380                                         int RsrcIdx) const {
3381   // The reported argument index is relative to the IR intrinsic call arguments,
3382   // so we need to shift by the number of defs and the intrinsic ID.
3383   RsrcIdx += MI.getNumExplicitDefs() + 1;
3384 
3385   const int NumOps = MI.getNumOperands();
3386   SmallVector<const ValueMapping *, 8> OpdsMapping(NumOps);
3387 
3388   // TODO: Should packed/unpacked D16 difference be reported here as part of
3389   // the value mapping?
3390   for (int I = 0; I != NumOps; ++I) {
3391     if (!MI.getOperand(I).isReg())
3392       continue;
3393 
3394     Register OpReg = MI.getOperand(I).getReg();
3395     // We replace some dead address operands with $noreg
3396     if (!OpReg)
3397       continue;
3398 
3399     unsigned Size = getSizeInBits(OpReg, MRI, *TRI);
3400 
3401     // FIXME: Probably need a new intrinsic register bank searchable table to
3402     // handle arbitrary intrinsics easily.
3403     //
3404     // If this has a sampler, it immediately follows rsrc.
3405     const bool MustBeSGPR = I == RsrcIdx || I == RsrcIdx + 1;
3406 
3407     if (MustBeSGPR) {
3408       // If this must be an SGPR, we must report whatever it is as legal.
3409       unsigned NewBank = getRegBankID(OpReg, MRI, AMDGPU::SGPRRegBankID);
3410       OpdsMapping[I] = AMDGPU::getValueMapping(NewBank, Size);
3411     } else {
3412       // Some operands must be VGPR, and these are easy to copy to.
3413       OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3414     }
3415   }
3416 
3417   return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping), NumOps);
3418 }
3419 
3420 /// Return the mapping for a pointer argument.
3421 const RegisterBankInfo::ValueMapping *
3422 AMDGPURegisterBankInfo::getValueMappingForPtr(const MachineRegisterInfo &MRI,
3423                                               Register PtrReg) const {
3424   LLT PtrTy = MRI.getType(PtrReg);
3425   unsigned Size = PtrTy.getSizeInBits();
3426   if (Subtarget.useFlatForGlobal() ||
3427       !AMDGPU::isFlatGlobalAddrSpace(PtrTy.getAddressSpace()))
3428     return AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3429 
3430   // If we're using MUBUF instructions for global memory, an SGPR base register
3431   // is possible. Otherwise this needs to be a VGPR.
3432   const RegisterBank *PtrBank = getRegBank(PtrReg, MRI, *TRI);
3433   return AMDGPU::getValueMapping(PtrBank->getID(), Size);
3434 }
3435 
3436 const RegisterBankInfo::InstructionMapping &
3437 AMDGPURegisterBankInfo::getInstrMappingForLoad(const MachineInstr &MI) const {
3438 
3439   const MachineFunction &MF = *MI.getParent()->getParent();
3440   const MachineRegisterInfo &MRI = MF.getRegInfo();
3441   SmallVector<const ValueMapping*, 2> OpdsMapping(2);
3442   unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
3443   Register PtrReg = MI.getOperand(1).getReg();
3444   LLT PtrTy = MRI.getType(PtrReg);
3445   unsigned AS = PtrTy.getAddressSpace();
3446   unsigned PtrSize = PtrTy.getSizeInBits();
3447 
3448   const ValueMapping *ValMapping;
3449   const ValueMapping *PtrMapping;
3450 
3451   const RegisterBank *PtrBank = getRegBank(PtrReg, MRI, *TRI);
3452 
3453   if (PtrBank == &AMDGPU::SGPRRegBank && AMDGPU::isFlatGlobalAddrSpace(AS)) {
3454     if (isScalarLoadLegal(MI)) {
3455       // We have a uniform instruction, so we want to use an SMRD load.
3456       ValMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
3457       PtrMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize);
3458     } else {
3459       ValMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3460 
3461       // If we're using MUBUF instructions for global memory, an SGPR base
3462       // register is possible. Otherwise this needs to be a VGPR.
3463       unsigned PtrBankID = Subtarget.useFlatForGlobal() ?
3464         AMDGPU::VGPRRegBankID : AMDGPU::SGPRRegBankID;
3465 
3466       PtrMapping = AMDGPU::getValueMapping(PtrBankID, PtrSize);
3467     }
3468   } else {
3469     ValMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3470     PtrMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize);
3471   }
3472 
3473   OpdsMapping[0] = ValMapping;
3474   OpdsMapping[1] = PtrMapping;
3475   const RegisterBankInfo::InstructionMapping &Mapping = getInstructionMapping(
3476       1, 1, getOperandsMapping(OpdsMapping), MI.getNumOperands());
3477   return Mapping;
3478 
3479   // FIXME: Do we want to add a mapping for FLAT load, or should we just
3480   // handle that during instruction selection?
3481 }
3482 
3483 unsigned
3484 AMDGPURegisterBankInfo::getRegBankID(Register Reg,
3485                                      const MachineRegisterInfo &MRI,
3486                                      unsigned Default) const {
3487   const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
3488   return Bank ? Bank->getID() : Default;
3489 }
3490 
3491 const RegisterBankInfo::ValueMapping *
3492 AMDGPURegisterBankInfo::getSGPROpMapping(Register Reg,
3493                                          const MachineRegisterInfo &MRI,
3494                                          const TargetRegisterInfo &TRI) const {
3495   // Lie and claim anything is legal, even though this needs to be an SGPR;
3496   // applyMapping will have to deal with it as a waterfall loop.
3497   unsigned Bank = getRegBankID(Reg, MRI, AMDGPU::SGPRRegBankID);
3498   unsigned Size = getSizeInBits(Reg, MRI, TRI);
3499   return AMDGPU::getValueMapping(Bank, Size);
3500 }
3501 
3502 const RegisterBankInfo::ValueMapping *
3503 AMDGPURegisterBankInfo::getVGPROpMapping(Register Reg,
3504                                          const MachineRegisterInfo &MRI,
3505                                          const TargetRegisterInfo &TRI) const {
3506   unsigned Size = getSizeInBits(Reg, MRI, TRI);
3507   return AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3508 }
3509 
3510 const RegisterBankInfo::ValueMapping *
3511 AMDGPURegisterBankInfo::getAGPROpMapping(Register Reg,
3512                                          const MachineRegisterInfo &MRI,
3513                                          const TargetRegisterInfo &TRI) const {
3514   unsigned Size = getSizeInBits(Reg, MRI, TRI);
3515   return AMDGPU::getValueMapping(AMDGPU::AGPRRegBankID, Size);
3516 }
3517 
3518 ///
3519 /// This function must return a legal mapping, because
3520 /// AMDGPURegisterBankInfo::getInstrAlternativeMappings() is not called
3521 /// in RegBankSelect::Mode::Fast.  Any mapping that would cause a copy from
3522 /// VGPR to SGPR to be generated is illegal.
3523 ///
3524 // Operands that must be SGPRs must accept potentially divergent VGPRs as
3525 // legal. These will be dealt with in applyMappingImpl.
3526 //
3527 const RegisterBankInfo::InstructionMapping &
3528 AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
3529   const MachineFunction &MF = *MI.getParent()->getParent();
3530   const MachineRegisterInfo &MRI = MF.getRegInfo();
3531 
3532   if (MI.isCopy() || MI.getOpcode() == AMDGPU::G_FREEZE) {
3533     // The default logic bothers to analyze impossible alternative mappings. We
3534     // want the most straightforward mapping, so just directly handle this.
3535     const RegisterBank *DstBank = getRegBank(MI.getOperand(0).getReg(), MRI,
3536                                              *TRI);
3537     const RegisterBank *SrcBank = getRegBank(MI.getOperand(1).getReg(), MRI,
3538                                              *TRI);
3539     assert(SrcBank && "src bank should have been assigned already");
3540     if (!DstBank)
3541       DstBank = SrcBank;
3542 
3543     unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
3544     if (MI.getOpcode() != AMDGPU::G_FREEZE &&
3545         cannotCopy(*DstBank, *SrcBank, Size))
3546       return getInvalidInstructionMapping();
3547 
3548     const ValueMapping &ValMap = getValueMapping(0, Size, *DstBank);
3549     unsigned OpdsMappingSize = MI.isCopy() ? 1 : 2;
3550     SmallVector<const ValueMapping *, 1> OpdsMapping(OpdsMappingSize);
3551     OpdsMapping[0] = &ValMap;
3552     if (MI.getOpcode() == AMDGPU::G_FREEZE)
3553       OpdsMapping[1] = &ValMap;
3554 
3555     return getInstructionMapping(
3556         1, /*Cost*/ 1,
3557         /*OperandsMapping*/ getOperandsMapping(OpdsMapping), OpdsMappingSize);
3558   }
3559 
3560   if (MI.isRegSequence()) {
3561     // If any input is a VGPR, the result must be a VGPR. The default handling
3562     // assumes any copy between banks is legal.
3563     unsigned BankID = AMDGPU::SGPRRegBankID;
3564 
3565     for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
3566       auto OpBank = getRegBankID(MI.getOperand(I).getReg(), MRI);
3567       // It doesn't make sense to use vcc or scc banks here, so just ignore
3568       // them.
3569       if (OpBank != AMDGPU::SGPRRegBankID) {
3570         BankID = AMDGPU::VGPRRegBankID;
3571         break;
3572       }
3573     }
3574     unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
3575 
3576     const ValueMapping &ValMap = getValueMapping(0, Size, getRegBank(BankID));
3577     return getInstructionMapping(
3578         1, /*Cost*/ 1,
3579         /*OperandsMapping*/ getOperandsMapping({&ValMap}), 1);
3580   }
3581 
3582   // The default handling is broken and doesn't handle illegal SGPR->VGPR copies
3583   // properly.
3584   //
3585   // TODO: There are additional exec masking dependencies to analyze.
3586   if (MI.getOpcode() == TargetOpcode::G_PHI) {
3587     unsigned ResultBank = AMDGPU::InvalidRegBankID;
3588     Register DstReg = MI.getOperand(0).getReg();
3589 
3590     // Sometimes the result may have already been assigned a bank.
3591     if (const RegisterBank *DstBank = getRegBank(DstReg, MRI, *TRI))
3592       ResultBank = DstBank->getID();
3593 
3594     for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
3595       Register Reg = MI.getOperand(I).getReg();
3596       const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
3597 
3598       // FIXME: Assuming VGPR for any undetermined inputs.
3599       if (!Bank || Bank->getID() == AMDGPU::VGPRRegBankID) {
3600         ResultBank = AMDGPU::VGPRRegBankID;
3601         break;
3602       }
3603 
3604       // FIXME: Need to promote SGPR case to s32
3605       unsigned OpBank = Bank->getID();
3606       ResultBank = regBankBoolUnion(ResultBank, OpBank);
3607     }
3608 
3609     assert(ResultBank != AMDGPU::InvalidRegBankID);
3610 
3611     unsigned Size = MRI.getType(DstReg).getSizeInBits();
3612 
3613     const ValueMapping &ValMap =
3614         getValueMapping(0, Size, getRegBank(ResultBank));
3615     return getInstructionMapping(
3616         1, /*Cost*/ 1,
3617         /*OperandsMapping*/ getOperandsMapping({&ValMap}), 1);
3618   }
3619 
3620   const RegisterBankInfo::InstructionMapping &Mapping = getInstrMappingImpl(MI);
3621   if (Mapping.isValid())
3622     return Mapping;
3623 
3624   SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
3625 
3626   switch (MI.getOpcode()) {
3627   default:
3628     return getInvalidInstructionMapping();
3629 
3630   case AMDGPU::G_AND:
3631   case AMDGPU::G_OR:
3632   case AMDGPU::G_XOR: {
3633     unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3634     if (Size == 1) {
3635       const RegisterBank *DstBank
3636         = getRegBank(MI.getOperand(0).getReg(), MRI, *TRI);
3637 
3638       unsigned TargetBankID = AMDGPU::InvalidRegBankID;
3639       unsigned BankLHS = AMDGPU::InvalidRegBankID;
3640       unsigned BankRHS = AMDGPU::InvalidRegBankID;
3641       if (DstBank) {
3642         TargetBankID = DstBank->getID();
3643         if (DstBank == &AMDGPU::VCCRegBank) {
3644           TargetBankID = AMDGPU::VCCRegBankID;
3645           BankLHS = AMDGPU::VCCRegBankID;
3646           BankRHS = AMDGPU::VCCRegBankID;
3647         } else {
3648           BankLHS = getRegBankID(MI.getOperand(1).getReg(), MRI,
3649                                  AMDGPU::SGPRRegBankID);
3650           BankRHS = getRegBankID(MI.getOperand(2).getReg(), MRI,
3651                                  AMDGPU::SGPRRegBankID);
3652         }
3653       } else {
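        // The result has not been assigned a bank yet; infer one from the
        // source banks, defaulting to vcc.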
3654         BankLHS = getRegBankID(MI.getOperand(1).getReg(), MRI,
3655                                AMDGPU::VCCRegBankID);
3656         BankRHS = getRegBankID(MI.getOperand(2).getReg(), MRI,
3657                                AMDGPU::VCCRegBankID);
3658 
3659         // Both inputs should be true booleans to produce a boolean result.
3660         if (BankLHS == AMDGPU::VGPRRegBankID || BankRHS == AMDGPU::VGPRRegBankID) {
3661           TargetBankID = AMDGPU::VGPRRegBankID;
3662         } else if (BankLHS == AMDGPU::VCCRegBankID || BankRHS == AMDGPU::VCCRegBankID) {
3663           TargetBankID = AMDGPU::VCCRegBankID;
3664           BankLHS = AMDGPU::VCCRegBankID;
3665           BankRHS = AMDGPU::VCCRegBankID;
3666         } else if (BankLHS == AMDGPU::SGPRRegBankID && BankRHS == AMDGPU::SGPRRegBankID) {
3667           TargetBankID = AMDGPU::SGPRRegBankID;
3668         }
3669       }
3670 
3671       OpdsMapping[0] = AMDGPU::getValueMapping(TargetBankID, Size);
3672       OpdsMapping[1] = AMDGPU::getValueMapping(BankLHS, Size);
3673       OpdsMapping[2] = AMDGPU::getValueMapping(BankRHS, Size);
3674       break;
3675     }
3676 
3677     if (Size == 64) {
3678 
3679       if (isSALUMapping(MI)) {
3680         OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(AMDGPU::SGPRRegBankID, Size);
3681         OpdsMapping[1] = OpdsMapping[2] = OpdsMapping[0];
3682       } else {
3683         OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size);
3684         unsigned Bank1 = getRegBankID(MI.getOperand(1).getReg(), MRI /*, DefaultBankID*/);
3685         OpdsMapping[1] = AMDGPU::getValueMapping(Bank1, Size);
3686 
3687         unsigned Bank2 = getRegBankID(MI.getOperand(2).getReg(), MRI /*, DefaultBankID*/);
3688         OpdsMapping[2] = AMDGPU::getValueMapping(Bank2, Size);
3689       }
3690 
3691       break;
3692     }
3693 
3694     [[fallthrough]];
3695   }
3696   case AMDGPU::G_PTR_ADD:
3697   case AMDGPU::G_PTRMASK:
3698   case AMDGPU::G_ADD:
3699   case AMDGPU::G_SUB:
3700   case AMDGPU::G_MUL:
3701   case AMDGPU::G_SHL:
3702   case AMDGPU::G_LSHR:
3703   case AMDGPU::G_ASHR:
3704   case AMDGPU::G_UADDO:
3705   case AMDGPU::G_USUBO:
3706   case AMDGPU::G_UADDE:
3707   case AMDGPU::G_SADDE:
3708   case AMDGPU::G_USUBE:
3709   case AMDGPU::G_SSUBE:
3710   case AMDGPU::G_SMIN:
3711   case AMDGPU::G_SMAX:
3712   case AMDGPU::G_UMIN:
3713   case AMDGPU::G_UMAX:
3714   case AMDGPU::G_ABS:
3715   case AMDGPU::G_SHUFFLE_VECTOR:
3716   case AMDGPU::G_SBFX:
3717   case AMDGPU::G_UBFX:
3718     if (isSALUMapping(MI))
3719       return getDefaultMappingSOP(MI);
3720     [[fallthrough]];
3721 
3722   case AMDGPU::G_SADDSAT: // FIXME: Could lower sat ops for SALU
3723   case AMDGPU::G_SSUBSAT:
3724   case AMDGPU::G_UADDSAT:
3725   case AMDGPU::G_USUBSAT:
3726   case AMDGPU::G_FADD:
3727   case AMDGPU::G_FSUB:
3728   case AMDGPU::G_FPTOSI:
3729   case AMDGPU::G_FPTOUI:
3730   case AMDGPU::G_FMUL:
3731   case AMDGPU::G_FMA:
3732   case AMDGPU::G_FMAD:
3733   case AMDGPU::G_FSQRT:
3734   case AMDGPU::G_FFLOOR:
3735   case AMDGPU::G_FCEIL:
3736   case AMDGPU::G_FRINT:
3737   case AMDGPU::G_SITOFP:
3738   case AMDGPU::G_UITOFP:
3739   case AMDGPU::G_FPTRUNC:
3740   case AMDGPU::G_FPEXT:
3741   case AMDGPU::G_FEXP2:
3742   case AMDGPU::G_FLOG2:
3743   case AMDGPU::G_FLDEXP:
3744   case AMDGPU::G_FMINNUM:
3745   case AMDGPU::G_FMAXNUM:
3746   case AMDGPU::G_FMINNUM_IEEE:
3747   case AMDGPU::G_FMAXNUM_IEEE:
3748   case AMDGPU::G_FCANONICALIZE:
3749   case AMDGPU::G_INTRINSIC_TRUNC:
3750   case AMDGPU::G_STRICT_FADD:
3751   case AMDGPU::G_STRICT_FSUB:
3752   case AMDGPU::G_STRICT_FMUL:
3753   case AMDGPU::G_STRICT_FMA:
3754   case AMDGPU::G_STRICT_FLDEXP:
3755   case AMDGPU::G_BSWAP: // TODO: Somehow expand for scalar?
3756   case AMDGPU::G_FSHR: // TODO: Expand for scalar
3757   case AMDGPU::G_AMDGPU_FMIN_LEGACY:
3758   case AMDGPU::G_AMDGPU_FMAX_LEGACY:
3759   case AMDGPU::G_AMDGPU_RCP_IFLAG:
3760   case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0:
3761   case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1:
3762   case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2:
3763   case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
3764   case AMDGPU::G_AMDGPU_CVT_PK_I16_I32:
3765   case AMDGPU::G_AMDGPU_SMED3:
3766   case AMDGPU::G_AMDGPU_FMED3:
3767     return getDefaultMappingVOP(MI);
3768   case AMDGPU::G_UMULH:
3769   case AMDGPU::G_SMULH: {
3770     if (Subtarget.hasScalarMulHiInsts() && isSALUMapping(MI))
3771       return getDefaultMappingSOP(MI);
3772     return getDefaultMappingVOP(MI);
3773   }
3774   case AMDGPU::G_AMDGPU_MAD_U64_U32:
3775   case AMDGPU::G_AMDGPU_MAD_I64_I32: {
3776     // Three possible mappings:
3777     //
3778     //  - Default SOP
3779     //  - Default VOP
3780     //  - Scalar multiply: src0 and src1 are SGPRs, the rest is VOP.
3781     //
3782     // This allows instruction selection to keep the multiplication part of the
3783     // instruction on the SALU.
3784     bool AllSalu = true;
3785     bool MulSalu = true;
3786     for (unsigned i = 0; i < 5; ++i) {
3787       Register Reg = MI.getOperand(i).getReg();
3788       if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) {
3789         if (Bank->getID() != AMDGPU::SGPRRegBankID) {
3790           AllSalu = false;
3791           if (i == 2 || i == 3) {
3792             MulSalu = false;
3793             break;
3794           }
3795         }
3796       }
3797     }
3798 
3799     if (AllSalu)
3800       return getDefaultMappingSOP(MI);
3801 
3802     // If the multiply-add is full-rate in VALU, use that even if the
3803     // multiplication part is scalar. Accumulating separately on the VALU would
3804     // take two instructions.
3805     if (!MulSalu || Subtarget.hasFullRate64Ops())
3806       return getDefaultMappingVOP(MI);
3807 
3808     // Keep the multiplication on the SALU, then accumulate on the VALU.
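    // Operands: [0] 64-bit result, [1] carry-out, [2]/[3] 32-bit multiply
    // sources, [4] 64-bit addend.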
3809     OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 64);
3810     OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
3811     OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
3812     OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
3813     OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 64);
3814     break;
3815   }
3816   case AMDGPU::G_IMPLICIT_DEF: {
3817     unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3818     OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
3819     break;
3820   }
3821   case AMDGPU::G_FCONSTANT:
3822   case AMDGPU::G_CONSTANT:
3823   case AMDGPU::G_GLOBAL_VALUE:
3824   case AMDGPU::G_BLOCK_ADDR:
3825   case AMDGPU::G_READCYCLECOUNTER: {
3826     unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3827     OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
3828     break;
3829   }
3830   case AMDGPU::G_FRAME_INDEX: {
3831     // TODO: This should be the same as other constants, but eliminateFrameIndex
3832     // currently assumes VALU uses.
3833     unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3834     OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3835     break;
3836   }
3837   case AMDGPU::G_DYN_STACKALLOC: {
3838     // Result is always uniform, and a wave reduction is needed for the source.
3839     OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
3840     unsigned SrcBankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
3841     OpdsMapping[1] = AMDGPU::getValueMapping(SrcBankID, 32);
3842     break;
3843   }
3844   case AMDGPU::G_AMDGPU_WAVE_ADDRESS: {
3845     // This case is weird because we expect a physical register in the source,
3846     // but need to set a bank anyway.
3847     //
3848     // We could select the result to SGPR or VGPR, but for the one current use
3849     // it's more practical to always use VGPR.
3850     OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
3851     OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
3852     break;
3853   }
3854   case AMDGPU::G_INSERT: {
3855     unsigned BankID = getMappingType(MRI, MI);
3856     unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
3857     unsigned SrcSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
3858     unsigned EltSize = getSizeInBits(MI.getOperand(2).getReg(), MRI, *TRI);
3859     OpdsMapping[0] = AMDGPU::getValueMapping(BankID, DstSize);
3860     OpdsMapping[1] = AMDGPU::getValueMapping(BankID, SrcSize);
3861     OpdsMapping[2] = AMDGPU::getValueMapping(BankID, EltSize);
3862     OpdsMapping[3] = nullptr;
3863     break;
3864   }
3865   case AMDGPU::G_EXTRACT: {
3866     unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
3867     unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
3868     unsigned SrcSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
3869     OpdsMapping[0] = AMDGPU::getValueMapping(BankID, DstSize);
3870     OpdsMapping[1] = AMDGPU::getValueMapping(BankID, SrcSize);
3871     OpdsMapping[2] = nullptr;
3872     break;
3873   }
3874   case AMDGPU::G_BUILD_VECTOR:
3875   case AMDGPU::G_BUILD_VECTOR_TRUNC: {
3876     LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
3877     if (DstTy == LLT::fixed_vector(2, 16)) {
3878       unsigned DstSize = DstTy.getSizeInBits();
3879       unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
3880       unsigned Src0BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
3881       unsigned Src1BankID = getRegBankID(MI.getOperand(2).getReg(), MRI);
3882       unsigned DstBankID = regBankUnion(Src0BankID, Src1BankID);
3883 
3884       OpdsMapping[0] = AMDGPU::getValueMapping(DstBankID, DstSize);
3885       OpdsMapping[1] = AMDGPU::getValueMapping(Src0BankID, SrcSize);
3886       OpdsMapping[2] = AMDGPU::getValueMapping(Src1BankID, SrcSize);
3887       break;
3888     }
3889 
3890     [[fallthrough]];
3891   }
3892   case AMDGPU::G_MERGE_VALUES:
3893   case AMDGPU::G_CONCAT_VECTORS: {
3894     unsigned Bank = getMappingType(MRI, MI);
3895     unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3896     unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
3897 
3898     OpdsMapping[0] = AMDGPU::getValueMapping(Bank, DstSize);
3899     // Op1 and Dst should use the same register bank.
3900     for (unsigned i = 1, e = MI.getNumOperands(); i != e; ++i)
3901       OpdsMapping[i] = AMDGPU::getValueMapping(Bank, SrcSize);
3902     break;
3903   }
3904   case AMDGPU::G_BITREVERSE:
3905   case AMDGPU::G_BITCAST:
3906   case AMDGPU::G_INTTOPTR:
3907   case AMDGPU::G_PTRTOINT:
3908   case AMDGPU::G_FABS:
3909   case AMDGPU::G_FNEG: {
3910     unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3911     unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
3912     OpdsMapping[0] = OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size);
3913     break;
3914   }
3915   case AMDGPU::G_AMDGPU_FFBH_U32:
3916   case AMDGPU::G_AMDGPU_FFBL_B32:
3917   case AMDGPU::G_CTLZ_ZERO_UNDEF:
3918   case AMDGPU::G_CTTZ_ZERO_UNDEF: {
3919     unsigned Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
3920     unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
3921     OpdsMapping[0] = AMDGPU::getValueMapping(BankID, 32);
3922     OpdsMapping[1] = AMDGPU::getValueMappingSGPR64Only(BankID, Size);
3923     break;
3924   }
3925   case AMDGPU::G_CTPOP: {
3926     unsigned Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
3927     unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
3928     OpdsMapping[0] = AMDGPU::getValueMapping(BankID, 32);
3929 
3930     // This should really be getValueMappingSGPR64Only, but allowing the generic
3931     // code to handle the register split just makes using LegalizerHelper more
3932     // difficult.
3933     OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size);
3934     break;
3935   }
3936   case AMDGPU::G_TRUNC: {
3937     Register Dst = MI.getOperand(0).getReg();
3938     Register Src = MI.getOperand(1).getReg();
3939     unsigned Bank = getRegBankID(Src, MRI);
3940     unsigned DstSize = getSizeInBits(Dst, MRI, *TRI);
3941     unsigned SrcSize = getSizeInBits(Src, MRI, *TRI);
3942     OpdsMapping[0] = AMDGPU::getValueMapping(Bank, DstSize);
3943     OpdsMapping[1] = AMDGPU::getValueMapping(Bank, SrcSize);
3944     break;
3945   }
3946   case AMDGPU::G_ZEXT:
3947   case AMDGPU::G_SEXT:
3948   case AMDGPU::G_ANYEXT:
3949   case AMDGPU::G_SEXT_INREG: {
3950     Register Dst = MI.getOperand(0).getReg();
3951     Register Src = MI.getOperand(1).getReg();
3952     unsigned DstSize = getSizeInBits(Dst, MRI, *TRI);
3953     unsigned SrcSize = getSizeInBits(Src, MRI, *TRI);
3954 
3955     unsigned DstBank;
3956     const RegisterBank *SrcBank = getRegBank(Src, MRI, *TRI);
3957     assert(SrcBank);
3958     switch (SrcBank->getID()) {
3959     case AMDGPU::SGPRRegBankID:
3960       DstBank = AMDGPU::SGPRRegBankID;
3961       break;
3962     default:
3963       DstBank = AMDGPU::VGPRRegBankID;
3964       break;
3965     }
3966 
3967     // Scalar extend can use 64-bit BFE, but VGPRs require extending to
3968     // 32-bits, and then to 64.
3969     OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(DstBank, DstSize);
3970     OpdsMapping[1] = AMDGPU::getValueMappingSGPR64Only(SrcBank->getID(),
3971                                                        SrcSize);
3972     break;
3973   }
3974   case AMDGPU::G_FCMP: {
3975     unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
3976     OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
3977     OpdsMapping[1] = nullptr; // Predicate Operand.
3978     OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3979     OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3980     break;
3981   }
3982   case AMDGPU::G_IS_FPCLASS: {
3983     Register SrcReg = MI.getOperand(1).getReg();
3984     unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits();
3985     unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3986     OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, DstSize);
3987     OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
3988     break;
3989   }
3990   case AMDGPU::G_STORE: {
3991     assert(MI.getOperand(0).isReg());
3992     unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3993 
3994     // FIXME: We need to specify a different reg bank once scalar stores are
3995     // supported.
3996     const ValueMapping *ValMapping =
3997         AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3998     OpdsMapping[0] = ValMapping;
3999     OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg());
4000     break;
4001   }
4002   case AMDGPU::G_ICMP: {
4003     auto Pred = static_cast<CmpInst::Predicate>(MI.getOperand(1).getPredicate());
4004     unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
4005 
4006     // See if the result register has already been constrained to vcc, which may
4007     // happen due to control flow intrinsic lowering.
4008     unsigned DstBank = getRegBankID(MI.getOperand(0).getReg(), MRI,
4009                                     AMDGPU::SGPRRegBankID);
4010     unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI);
4011     unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI);
4012 
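    // A scalar (SCC-producing) compare is only usable when all operands are
    // uniform and the compare is 32-bit, or a 64-bit equality on subtargets
    // with scalar 64-bit compares.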
4013     bool CanUseSCC = DstBank == AMDGPU::SGPRRegBankID &&
4014                      Op2Bank == AMDGPU::SGPRRegBankID &&
4015                      Op3Bank == AMDGPU::SGPRRegBankID &&
4016       (Size == 32 || (Size == 64 &&
4017                       (Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE) &&
4018                       Subtarget.hasScalarCompareEq64()));
4019 
4020     DstBank = CanUseSCC ? AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID;
4021     unsigned SrcBank = CanUseSCC ? AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
4022 
4023     // TODO: Use 32-bit for scalar output size.
4024     // SCC results will need to be copied to a 32-bit SGPR virtual register.
4025     const unsigned ResultSize = 1;
4026 
4027     OpdsMapping[0] = AMDGPU::getValueMapping(DstBank, ResultSize);
4028     OpdsMapping[2] = AMDGPU::getValueMapping(SrcBank, Size);
4029     OpdsMapping[3] = AMDGPU::getValueMapping(SrcBank, Size);
4030     break;
4031   }
4032   case AMDGPU::G_EXTRACT_VECTOR_ELT: {
4033     // A VGPR index is handled with a waterfall loop when indexing an SGPR vector.
4034     unsigned SrcBankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
4035     unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4036     unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
4037     unsigned IdxSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
4038     unsigned IdxBank = getRegBankID(MI.getOperand(2).getReg(), MRI);
4039     unsigned OutputBankID = regBankUnion(SrcBankID, IdxBank);
4040 
4041     OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(OutputBankID, DstSize);
4042     OpdsMapping[1] = AMDGPU::getValueMapping(SrcBankID, SrcSize);
4043 
4044     // The index can be in either bank if the source vector is VGPR.
4045     OpdsMapping[2] = AMDGPU::getValueMapping(IdxBank, IdxSize);
4046     break;
4047   }
4048   case AMDGPU::G_INSERT_VECTOR_ELT: {
4049     unsigned OutputBankID = isSALUMapping(MI) ?
4050       AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
4051 
4052     unsigned VecSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4053     unsigned InsertSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
4054     unsigned IdxSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits();
4055     unsigned InsertEltBankID = getRegBankID(MI.getOperand(2).getReg(), MRI);
4056     unsigned IdxBankID = getRegBankID(MI.getOperand(3).getReg(), MRI);
4057 
4058     OpdsMapping[0] = AMDGPU::getValueMapping(OutputBankID, VecSize);
4059     OpdsMapping[1] = AMDGPU::getValueMapping(OutputBankID, VecSize);
4060 
4061     // This is a weird case, because we need to break down the mapping based on
4062     // the register bank of a different operand.
4063     if (InsertSize == 64 && OutputBankID == AMDGPU::VGPRRegBankID) {
4064       OpdsMapping[2] = AMDGPU::getValueMappingSplit64(InsertEltBankID,
4065                                                       InsertSize);
4066     } else {
4067       assert(InsertSize == 32 || InsertSize == 64);
4068       OpdsMapping[2] = AMDGPU::getValueMapping(InsertEltBankID, InsertSize);
4069     }
4070 
4071     // The index can be in either bank if the source vector is VGPR.
4072     OpdsMapping[3] = AMDGPU::getValueMapping(IdxBankID, IdxSize);
4073     break;
4074   }
4075   case AMDGPU::G_UNMERGE_VALUES: {
4076     unsigned Bank = getMappingType(MRI, MI);
4077 
4078     // Op1 and Dst should use the same register bank.
4079     // FIXME: Shouldn't this be the default? Why do we need to handle this?
4080     for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
4081       unsigned Size = getSizeInBits(MI.getOperand(i).getReg(), MRI, *TRI);
4082       OpdsMapping[i] = AMDGPU::getValueMapping(Bank, Size);
4083     }
4084     break;
4085   }
4086   case AMDGPU::G_AMDGPU_BUFFER_LOAD:
4087   case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
4088   case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE:
4089   case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
4090   case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT:
4091   case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT:
4092   case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE:
4093   case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16:
4094   case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT:
4095   case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16:
4096   case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT:
4097   case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16:
4098   case AMDGPU::G_AMDGPU_BUFFER_STORE:
4099   case AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE:
4100   case AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT:
4101   case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT:
4102   case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16: {
4103     OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4104 
4105     // rsrc
4106     OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
4107 
4108     // vindex
4109     OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4110 
4111     // voffset
4112     OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4113 
4114     // soffset
4115     OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4116 
4117     // Any remaining operands are immediates and were correctly null
4118     // initialized.
4119     break;
4120   }
4121   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP:
4122   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD:
4123   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB:
4124   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN:
4125   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN:
4126   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX:
4127   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX:
4128   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND:
4129   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR:
4130   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR:
4131   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC:
4132   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC:
4133   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD:
4134   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN:
4135   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX: {
4136     // vdata_out
4137     OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4138 
4139     // vdata_in
4140     OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
4141 
4142     // rsrc
4143     OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4144 
4145     // vindex
4146     OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4147 
4148     // voffset
4149     OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4150 
4151     // soffset
4152     OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
4153 
4154     // Any remaining operands are immediates and were correctly null
4155     // initialized.
4156     break;
4157   }
4158   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP: {
4159     // vdata_out
4160     OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4161 
4162     // vdata_in
4163     OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
4164 
4165     // cmp
4166     OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4167 
4168     // rsrc
4169     OpdsMapping[3] = getSGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4170 
4171     // vindex
4172     OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4173 
4174     // voffset
4175     OpdsMapping[5] = getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
4176 
4177     // soffset
4178     OpdsMapping[6] = getSGPROpMapping(MI.getOperand(6).getReg(), MRI, *TRI);
4179 
4180     // Any remaining operands are immediates and were correctly null
4181     // initialized.
4182     break;
4183   }
4184   case AMDGPU::G_AMDGPU_S_BUFFER_LOAD: {
4185     // Lie and claim everything is legal, even though some need to be
4186     // SGPRs. applyMapping will have to deal with it as a waterfall loop.
4187     OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
4188     OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4189 
    // We need to convert this to a MUBUF if either the resource or the offset
    // is a VGPR.
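    // For example (illustrative only), a divergent offset such as
    //   %val:sgpr(s32) = G_AMDGPU_S_BUFFER_LOAD %rsrc:sgpr(<4 x s32>),
    //                                           %off:vgpr(s32), 0
    // cannot remain a scalar load and becomes a VGPR-result buffer load; a
    // divergent resource additionally requires a waterfall loop.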
4192     unsigned RSrcBank = OpdsMapping[1]->BreakDown[0].RegBank->getID();
4193     unsigned OffsetBank = OpdsMapping[2]->BreakDown[0].RegBank->getID();
4194     unsigned ResultBank = regBankUnion(RSrcBank, OffsetBank);
4195 
4196     unsigned Size0 = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4197     OpdsMapping[0] = AMDGPU::getValueMapping(ResultBank, Size0);
4198     break;
4199   }
4200   case AMDGPU::G_INTRINSIC: {
4201     switch (MI.getIntrinsicID()) {
4202     default:
4203       return getInvalidInstructionMapping();
4204     case Intrinsic::amdgcn_div_fmas:
4205     case Intrinsic::amdgcn_div_fixup:
4206     case Intrinsic::amdgcn_trig_preop:
4207     case Intrinsic::amdgcn_sin:
4208     case Intrinsic::amdgcn_cos:
4209     case Intrinsic::amdgcn_log_clamp:
4210     case Intrinsic::amdgcn_log:
4211     case Intrinsic::amdgcn_exp2:
4212     case Intrinsic::amdgcn_rcp:
4213     case Intrinsic::amdgcn_rcp_legacy:
4214     case Intrinsic::amdgcn_sqrt:
4215     case Intrinsic::amdgcn_rsq:
4216     case Intrinsic::amdgcn_rsq_legacy:
4217     case Intrinsic::amdgcn_rsq_clamp:
4218     case Intrinsic::amdgcn_fmul_legacy:
4219     case Intrinsic::amdgcn_fma_legacy:
4220     case Intrinsic::amdgcn_frexp_mant:
4221     case Intrinsic::amdgcn_frexp_exp:
4222     case Intrinsic::amdgcn_fract:
4223     case Intrinsic::amdgcn_cvt_pkrtz:
4224     case Intrinsic::amdgcn_cvt_pknorm_i16:
4225     case Intrinsic::amdgcn_cvt_pknorm_u16:
4226     case Intrinsic::amdgcn_cvt_pk_i16:
4227     case Intrinsic::amdgcn_cvt_pk_u16:
4228     case Intrinsic::amdgcn_fmed3:
4229     case Intrinsic::amdgcn_cubeid:
4230     case Intrinsic::amdgcn_cubema:
4231     case Intrinsic::amdgcn_cubesc:
4232     case Intrinsic::amdgcn_cubetc:
4233     case Intrinsic::amdgcn_sffbh:
4234     case Intrinsic::amdgcn_fmad_ftz:
4235     case Intrinsic::amdgcn_mbcnt_lo:
4236     case Intrinsic::amdgcn_mbcnt_hi:
4237     case Intrinsic::amdgcn_mul_u24:
4238     case Intrinsic::amdgcn_mul_i24:
4239     case Intrinsic::amdgcn_mulhi_u24:
4240     case Intrinsic::amdgcn_mulhi_i24:
4241     case Intrinsic::amdgcn_lerp:
4242     case Intrinsic::amdgcn_sad_u8:
4243     case Intrinsic::amdgcn_msad_u8:
4244     case Intrinsic::amdgcn_sad_hi_u8:
4245     case Intrinsic::amdgcn_sad_u16:
4246     case Intrinsic::amdgcn_qsad_pk_u16_u8:
4247     case Intrinsic::amdgcn_mqsad_pk_u16_u8:
4248     case Intrinsic::amdgcn_mqsad_u32_u8:
4249     case Intrinsic::amdgcn_cvt_pk_u8_f32:
4250     case Intrinsic::amdgcn_alignbyte:
4251     case Intrinsic::amdgcn_perm:
4252     case Intrinsic::amdgcn_fdot2:
4253     case Intrinsic::amdgcn_sdot2:
4254     case Intrinsic::amdgcn_udot2:
4255     case Intrinsic::amdgcn_sdot4:
4256     case Intrinsic::amdgcn_udot4:
4257     case Intrinsic::amdgcn_sdot8:
4258     case Intrinsic::amdgcn_udot8:
4259     case Intrinsic::amdgcn_fdot2_bf16_bf16:
4260     case Intrinsic::amdgcn_fdot2_f16_f16:
4261     case Intrinsic::amdgcn_fdot2_f32_bf16:
4262     case Intrinsic::amdgcn_sudot4:
4263     case Intrinsic::amdgcn_sudot8:
4264     case Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16:
4265     case Intrinsic::amdgcn_wmma_f16_16x16x16_f16:
4266     case Intrinsic::amdgcn_wmma_f32_16x16x16_bf16:
4267     case Intrinsic::amdgcn_wmma_f32_16x16x16_f16:
4268     case Intrinsic::amdgcn_wmma_i32_16x16x16_iu4:
4269     case Intrinsic::amdgcn_wmma_i32_16x16x16_iu8:
4270       return getDefaultMappingVOP(MI);
4271     case Intrinsic::amdgcn_sbfe:
4272     case Intrinsic::amdgcn_ubfe:
4273       if (isSALUMapping(MI))
4274         return getDefaultMappingSOP(MI);
4275       return getDefaultMappingVOP(MI);
4276     case Intrinsic::amdgcn_ds_swizzle:
4277     case Intrinsic::amdgcn_ds_permute:
4278     case Intrinsic::amdgcn_ds_bpermute:
4279     case Intrinsic::amdgcn_update_dpp:
4280     case Intrinsic::amdgcn_mov_dpp8:
4281     case Intrinsic::amdgcn_mov_dpp:
4282     case Intrinsic::amdgcn_strict_wwm:
4283     case Intrinsic::amdgcn_wwm:
4284     case Intrinsic::amdgcn_strict_wqm:
4285     case Intrinsic::amdgcn_wqm:
4286     case Intrinsic::amdgcn_softwqm:
4287     case Intrinsic::amdgcn_set_inactive:
4288     case Intrinsic::amdgcn_permlane64:
4289       return getDefaultMappingAllVGPR(MI);
4290     case Intrinsic::amdgcn_kernarg_segment_ptr:
4291     case Intrinsic::amdgcn_s_getpc:
4292     case Intrinsic::amdgcn_groupstaticsize:
4293     case Intrinsic::amdgcn_reloc_constant:
4294     case Intrinsic::returnaddress: {
4295       unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4296       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
4297       break;
4298     }
4299     case Intrinsic::amdgcn_wqm_vote: {
4300       unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4301       OpdsMapping[0] = OpdsMapping[2]
4302         = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size);
4303       break;
4304     }
4305     case Intrinsic::amdgcn_ps_live: {
4306       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
4307       break;
4308     }
4309     case Intrinsic::amdgcn_div_scale: {
4310       unsigned Dst0Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4311       unsigned Dst1Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
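      // The first result is the scaled value; the second is the i1 flag that
      // feeds llvm.amdgcn.div.fmas and therefore lives in VCC.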
4312       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Dst0Size);
4313       OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Dst1Size);
4314 
4315       unsigned SrcSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits();
4316       OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
4317       OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
4318       break;
4319     }
4320     case Intrinsic::amdgcn_class: {
4321       Register Src0Reg = MI.getOperand(2).getReg();
4322       Register Src1Reg = MI.getOperand(3).getReg();
4323       unsigned Src0Size = MRI.getType(Src0Reg).getSizeInBits();
4324       unsigned Src1Size = MRI.getType(Src1Reg).getSizeInBits();
4325       unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4326       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, DstSize);
4327       OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Src0Size);
4328       OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Src1Size);
4329       break;
4330     }
4331     case Intrinsic::amdgcn_icmp:
4332     case Intrinsic::amdgcn_fcmp: {
4333       unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      // This is not VCCRegBank because the result is not used in a boolean
      // context.
4335       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
4336       unsigned OpSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
4337       OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, OpSize);
4338       OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, OpSize);
4339       break;
4340     }
4341     case Intrinsic::amdgcn_readlane: {
4342       // This must be an SGPR, but accept a VGPR.
4343       Register IdxReg = MI.getOperand(3).getReg();
4344       unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits();
4345       unsigned IdxBank = getRegBankID(IdxReg, MRI, AMDGPU::SGPRRegBankID);
4346       OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize);
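      // The destination and value source operands share the mapping assigned
      // in the readfirstlane case below.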
4347       [[fallthrough]];
4348     }
4349     case Intrinsic::amdgcn_readfirstlane: {
4350       unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4351       unsigned SrcSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
4352       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
4353       OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
4354       break;
4355     }
4356     case Intrinsic::amdgcn_writelane: {
4357       unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4358       Register SrcReg = MI.getOperand(2).getReg();
4359       unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits();
4360       unsigned SrcBank = getRegBankID(SrcReg, MRI, AMDGPU::SGPRRegBankID);
4361       Register IdxReg = MI.getOperand(3).getReg();
4362       unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits();
4363       unsigned IdxBank = getRegBankID(IdxReg, MRI, AMDGPU::SGPRRegBankID);
4364       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
4365 
      // These two must be SGPRs, but accept VGPRs. A readfirstlane will be
      // inserted to legalize them.
4368       OpdsMapping[2] = AMDGPU::getValueMapping(SrcBank, SrcSize);
4369       OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize);
4370       OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
4371       break;
4372     }
4373     case Intrinsic::amdgcn_if_break: {
4374       unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
4375       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
4376       OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
4377       OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
4378       break;
4379     }
4380     case Intrinsic::amdgcn_permlane16:
4381     case Intrinsic::amdgcn_permlanex16: {
4382       unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
4383       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4384       OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4385       OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
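      // The lane-select operands must be uniform, so they are mapped as SGPRs.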
4386       OpdsMapping[4] = getSGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4387       OpdsMapping[5] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4388       break;
4389     }
4390     case Intrinsic::amdgcn_mfma_f32_4x4x1f32:
4391     case Intrinsic::amdgcn_mfma_f32_4x4x4f16:
4392     case Intrinsic::amdgcn_mfma_i32_4x4x4i8:
4393     case Intrinsic::amdgcn_mfma_f32_4x4x2bf16:
4394     case Intrinsic::amdgcn_mfma_f32_16x16x1f32:
4395     case Intrinsic::amdgcn_mfma_f32_16x16x4f32:
4396     case Intrinsic::amdgcn_mfma_f32_16x16x4f16:
4397     case Intrinsic::amdgcn_mfma_f32_16x16x16f16:
4398     case Intrinsic::amdgcn_mfma_i32_16x16x4i8:
4399     case Intrinsic::amdgcn_mfma_i32_16x16x16i8:
4400     case Intrinsic::amdgcn_mfma_f32_16x16x2bf16:
4401     case Intrinsic::amdgcn_mfma_f32_16x16x8bf16:
4402     case Intrinsic::amdgcn_mfma_f32_32x32x1f32:
4403     case Intrinsic::amdgcn_mfma_f32_32x32x2f32:
4404     case Intrinsic::amdgcn_mfma_f32_32x32x4f16:
4405     case Intrinsic::amdgcn_mfma_f32_32x32x8f16:
4406     case Intrinsic::amdgcn_mfma_i32_32x32x4i8:
4407     case Intrinsic::amdgcn_mfma_i32_32x32x8i8:
4408     case Intrinsic::amdgcn_mfma_f32_32x32x2bf16:
4409     case Intrinsic::amdgcn_mfma_f32_32x32x4bf16:
4410     case Intrinsic::amdgcn_mfma_f32_32x32x4bf16_1k:
4411     case Intrinsic::amdgcn_mfma_f32_16x16x4bf16_1k:
4412     case Intrinsic::amdgcn_mfma_f32_4x4x4bf16_1k:
4413     case Intrinsic::amdgcn_mfma_f32_32x32x8bf16_1k:
4414     case Intrinsic::amdgcn_mfma_f32_16x16x16bf16_1k:
4415     case Intrinsic::amdgcn_mfma_f64_16x16x4f64:
4416     case Intrinsic::amdgcn_mfma_f64_4x4x4f64:
4417     case Intrinsic::amdgcn_mfma_i32_16x16x32_i8:
4418     case Intrinsic::amdgcn_mfma_i32_32x32x16_i8:
4419     case Intrinsic::amdgcn_mfma_f32_16x16x8_xf32:
4420     case Intrinsic::amdgcn_mfma_f32_32x32x4_xf32:
4421     case Intrinsic::amdgcn_mfma_f32_16x16x32_bf8_bf8:
4422     case Intrinsic::amdgcn_mfma_f32_16x16x32_bf8_fp8:
4423     case Intrinsic::amdgcn_mfma_f32_16x16x32_fp8_bf8:
4424     case Intrinsic::amdgcn_mfma_f32_16x16x32_fp8_fp8:
4425     case Intrinsic::amdgcn_mfma_f32_32x32x16_bf8_bf8:
4426     case Intrinsic::amdgcn_mfma_f32_32x32x16_bf8_fp8:
4427     case Intrinsic::amdgcn_mfma_f32_32x32x16_fp8_bf8:
4428     case Intrinsic::amdgcn_mfma_f32_32x32x16_fp8_fp8: {
4429       // Default for MAI intrinsics.
4430       // srcC can also be an immediate which can be folded later.
4431       // FIXME: Should we eventually add an alternative mapping with AGPR src
4432       // for srcA/srcB?
4433       //
4434       // vdst, srcA, srcB, srcC
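      //
      // vdst and srcC are mapped to AGPRs only if the function may need AGPRs
      // (SIMachineFunctionInfo::mayNeedAGPRs()); otherwise the plain VGPR
      // mapping is used, which avoids AGPR<->VGPR copies on subtargets where
      // MFMA can take VGPR operands.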
4435       const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
4436       OpdsMapping[0] =
4437           Info->mayNeedAGPRs()
4438               ? getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI)
4439               : getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4440       OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4441       OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4442       OpdsMapping[4] =
4443           Info->mayNeedAGPRs()
4444               ? getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI)
4445               : getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4446       break;
4447     }
4448     case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16:
4449     case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16:
4450     case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16:
4451     case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16:
4452     case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8:
4453     case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8:
4454     case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_bf8:
4455     case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_fp8:
4456     case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_bf8:
4457     case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_fp8:
4458     case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8:
4459     case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8:
4460     case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8:
4461     case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8: {
4462       // vdst, srcA, srcB, srcC, idx
4463       OpdsMapping[0] = getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4464       OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4465       OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4466       OpdsMapping[4] = getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4467       OpdsMapping[5] = getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
4468       break;
4469     }
4470     case Intrinsic::amdgcn_interp_p1:
4471     case Intrinsic::amdgcn_interp_p2:
4472     case Intrinsic::amdgcn_interp_mov:
4473     case Intrinsic::amdgcn_interp_p1_f16:
4474     case Intrinsic::amdgcn_interp_p2_f16:
4475     case Intrinsic::amdgcn_lds_param_load: {
4476       const int M0Idx = MI.getNumOperands() - 1;
4477       Register M0Reg = MI.getOperand(M0Idx).getReg();
4478       unsigned M0Bank = getRegBankID(M0Reg, MRI, AMDGPU::SGPRRegBankID);
4479       unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4480 
4481       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
4482       for (int I = 2; I != M0Idx && MI.getOperand(I).isReg(); ++I)
4483         OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4484 
      // This must be an SGPR, but take whatever the original bank is and fix
      // it up later.
4487       OpdsMapping[M0Idx] = AMDGPU::getValueMapping(M0Bank, 32);
4488       break;
4489     }
4490     case Intrinsic::amdgcn_interp_inreg_p10:
4491     case Intrinsic::amdgcn_interp_inreg_p2:
4492     case Intrinsic::amdgcn_interp_inreg_p10_f16:
4493     case Intrinsic::amdgcn_interp_inreg_p2_f16: {
4494       unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4495       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
4496       OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4497       OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4498       OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4499       break;
4500     }
4501     case Intrinsic::amdgcn_ballot: {
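      // The result is a uniform, wave-sized lane mask in SGPRs; the source is
      // a per-lane condition in VCC.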
4502       unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4503       unsigned SrcSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
4504       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
4505       OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, SrcSize);
4506       break;
4507     }
4508     case Intrinsic::amdgcn_inverse_ballot: {
4509       // This must be an SGPR, but accept a VGPR.
4510       Register MaskReg = MI.getOperand(2).getReg();
4511       unsigned MaskSize = MRI.getType(MaskReg).getSizeInBits();
4512       unsigned MaskBank = getRegBankID(MaskReg, MRI, AMDGPU::SGPRRegBankID);
4513       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
4514       OpdsMapping[2] = AMDGPU::getValueMapping(MaskBank, MaskSize);
4515       break;
4516     }
4517     case Intrinsic::amdgcn_wave_reduce_umin:
4518     case Intrinsic::amdgcn_wave_reduce_umax: {
4519       unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4520       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
4521       unsigned OpSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
4522       auto regBankID =
4523           isSALUMapping(MI) ? AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
4524       OpdsMapping[2] = AMDGPU::getValueMapping(regBankID, OpSize);
4525       break;
4526     }
4527     }
4528     break;
4529   }
4530   case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
4531   case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16:
4532   case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE:
4533   case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16: {
4534     auto IntrID = MI.getIntrinsicID();
4535     const AMDGPU::RsrcIntrinsic *RSrcIntrin = AMDGPU::lookupRsrcIntrinsic(IntrID);
4536     assert(RSrcIntrin && "missing RsrcIntrinsic for image intrinsic");
4537     // Non-images can have complications from operands that allow both SGPR
4538     // and VGPR. For now it's too complicated to figure out the final opcode
4539     // to derive the register bank from the MCInstrDesc.
4540     assert(RSrcIntrin->IsImage);
4541     return getImageMapping(MRI, MI, RSrcIntrin->RsrcArg);
4542   }
4543   case AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY: {
4544     unsigned N = MI.getNumExplicitOperands() - 2;
4545     OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 128);
4546     OpdsMapping[N] = getSGPROpMapping(MI.getOperand(N).getReg(), MRI, *TRI);
4547     if (N == 3) {
4548       // Sequential form: all operands combined into VGPR256/VGPR512
4549       unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
4550       if (Size > 256)
4551         Size = 512;
4552       OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4553     } else {
4554       // NSA form
4555       for (unsigned I = 2; I < N; ++I) {
4556         unsigned Size = MRI.getType(MI.getOperand(I).getReg()).getSizeInBits();
4557         OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4558       }
4559     }
4560     break;
4561   }
4562   case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: {
4563     auto IntrID = MI.getIntrinsicID();
4564     switch (IntrID) {
4565     case Intrinsic::amdgcn_s_getreg:
4566     case Intrinsic::amdgcn_s_memtime:
4567     case Intrinsic::amdgcn_s_memrealtime:
4568     case Intrinsic::amdgcn_s_get_waveid_in_workgroup:
4569     case Intrinsic::amdgcn_s_sendmsg_rtn: {
4570       unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4571       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
4572       break;
4573     }
4574     case Intrinsic::amdgcn_global_atomic_fadd:
4575     case Intrinsic::amdgcn_global_atomic_csub:
4576     case Intrinsic::amdgcn_global_atomic_fmin:
4577     case Intrinsic::amdgcn_global_atomic_fmax:
4578     case Intrinsic::amdgcn_flat_atomic_fadd:
4579     case Intrinsic::amdgcn_flat_atomic_fmin:
4580     case Intrinsic::amdgcn_flat_atomic_fmax:
4581     case Intrinsic::amdgcn_global_atomic_fadd_v2bf16:
4582     case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16:
4583       return getDefaultMappingAllVGPR(MI);
4584     case Intrinsic::amdgcn_ds_ordered_add:
4585     case Intrinsic::amdgcn_ds_ordered_swap:
4586     case Intrinsic::amdgcn_ds_fadd_v2bf16: {
4587       unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4588       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
4589       unsigned M0Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
4590                                  AMDGPU::SGPRRegBankID);
4591       OpdsMapping[2] = AMDGPU::getValueMapping(M0Bank, 32);
4592       OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4593       break;
4594     }
4595     case Intrinsic::amdgcn_ds_append:
4596     case Intrinsic::amdgcn_ds_consume: {
4597       unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4598       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
4599       OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4600       break;
4601     }
4602     case Intrinsic::amdgcn_exp_compr:
4603       OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4604       OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4605       break;
4606     case Intrinsic::amdgcn_exp:
4607       // FIXME: Could we support packed types here?
4608       OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4609       OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4610       OpdsMapping[5] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4611       OpdsMapping[6] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4612       break;
4613     case Intrinsic::amdgcn_exp_row:
4614       OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4615       OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4616       OpdsMapping[5] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4617       OpdsMapping[6] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4618       OpdsMapping[8] = getSGPROpMapping(MI.getOperand(8).getReg(), MRI, *TRI);
4619       break;
4620     case Intrinsic::amdgcn_s_sendmsg:
4621     case Intrinsic::amdgcn_s_sendmsghalt: {
4622       // This must be an SGPR, but accept a VGPR.
4623       unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
4624                                    AMDGPU::SGPRRegBankID);
4625       OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
4626       break;
4627     }
4628     case Intrinsic::amdgcn_s_setreg: {
4629       // This must be an SGPR, but accept a VGPR.
4630       unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
4631                                    AMDGPU::SGPRRegBankID);
4632       OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
4633       break;
4634     }
4635     case Intrinsic::amdgcn_end_cf: {
4636       unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
4637       OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
4638       break;
4639     }
4640     case Intrinsic::amdgcn_else: {
4641       unsigned WaveSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
4642       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
4643       OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize);
4644       OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize);
4645       break;
4646     }
4647     case Intrinsic::amdgcn_live_mask: {
4648       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
4649       break;
4650     }
4651     case Intrinsic::amdgcn_wqm_demote:
4652     case Intrinsic::amdgcn_kill: {
4653       OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
4654       break;
4655     }
4656     case Intrinsic::amdgcn_raw_buffer_load:
4657     case Intrinsic::amdgcn_raw_ptr_buffer_load:
4658     case Intrinsic::amdgcn_raw_tbuffer_load:
4659     case Intrinsic::amdgcn_raw_ptr_tbuffer_load: {
      // FIXME: Should make the intrinsic ID the last operand of the
      // instruction; then this would be the same as the store case.
4662       OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4663       OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4664       OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4665       OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4666       break;
4667     }
4668     case Intrinsic::amdgcn_raw_buffer_load_lds:
4669     case Intrinsic::amdgcn_raw_ptr_buffer_load_lds: {
4670       OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
4671       OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4672       OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4673       OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
4674       break;
4675     }
4676     case Intrinsic::amdgcn_raw_buffer_store:
4677     case Intrinsic::amdgcn_raw_ptr_buffer_store:
4678     case Intrinsic::amdgcn_raw_buffer_store_format:
4679     case Intrinsic::amdgcn_raw_ptr_buffer_store_format:
4680     case Intrinsic::amdgcn_raw_tbuffer_store:
4681     case Intrinsic::amdgcn_raw_ptr_tbuffer_store: {
4682       OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
4683       OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4684       OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4685       OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4686       break;
4687     }
4688     case Intrinsic::amdgcn_struct_buffer_load:
4689     case Intrinsic::amdgcn_struct_ptr_buffer_load:
4690     case Intrinsic::amdgcn_struct_tbuffer_load:
4691     case Intrinsic::amdgcn_struct_ptr_tbuffer_load: {
4692       OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4693       OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4694       OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4695       OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4696       OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
4697       break;
4698     }
4699     case Intrinsic::amdgcn_struct_buffer_load_lds:
4700     case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
4701       OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
4702       OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4703       OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4704       OpdsMapping[5] = getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
4705       OpdsMapping[6] = getSGPROpMapping(MI.getOperand(6).getReg(), MRI, *TRI);
4706       break;
4707     }
4708     case Intrinsic::amdgcn_struct_buffer_store:
4709     case Intrinsic::amdgcn_struct_ptr_buffer_store:
4710     case Intrinsic::amdgcn_struct_tbuffer_store:
4711     case Intrinsic::amdgcn_struct_ptr_tbuffer_store: {
4712       OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
4713       OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4714       OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4715       OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4716       OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
4717       break;
4718     }
4719     case Intrinsic::amdgcn_init_exec_from_input: {
4720       unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
4721       OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
4722       break;
4723     }
4724     case Intrinsic::amdgcn_ds_gws_init:
4725     case Intrinsic::amdgcn_ds_gws_barrier:
4726     case Intrinsic::amdgcn_ds_gws_sema_br: {
4727       OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4728 
4729       // This must be an SGPR, but accept a VGPR.
4730       unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
4731                                    AMDGPU::SGPRRegBankID);
4732       OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
4733       break;
4734     }
4735     case Intrinsic::amdgcn_ds_gws_sema_v:
4736     case Intrinsic::amdgcn_ds_gws_sema_p:
4737     case Intrinsic::amdgcn_ds_gws_sema_release_all: {
4738       // This must be an SGPR, but accept a VGPR.
4739       unsigned Bank = getRegBankID(MI.getOperand(1).getReg(), MRI,
4740                                    AMDGPU::SGPRRegBankID);
4741       OpdsMapping[1] = AMDGPU::getValueMapping(Bank, 32);
4742       break;
4743     }
4744     case Intrinsic::amdgcn_global_load_lds: {
4745       OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
4746       OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4747       break;
4748     }
4749     case Intrinsic::amdgcn_lds_direct_load: {
4750       const int M0Idx = MI.getNumOperands() - 1;
4751       Register M0Reg = MI.getOperand(M0Idx).getReg();
4752       unsigned M0Bank = getRegBankID(M0Reg, MRI, AMDGPU::SGPRRegBankID);
4753       unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4754 
4755       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
4756       for (int I = 2; I != M0Idx && MI.getOperand(I).isReg(); ++I)
4757         OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4758 
      // This must be an SGPR, but take whatever the original bank is and fix
      // it up later.
4761       OpdsMapping[M0Idx] = AMDGPU::getValueMapping(M0Bank, 32);
4762       break;
4763     }
4764     case Intrinsic::amdgcn_ds_add_gs_reg_rtn:
4765     case Intrinsic::amdgcn_ds_sub_gs_reg_rtn:
4766       OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4767       OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4768       break;
4769     case Intrinsic::amdgcn_ds_bvh_stack_rtn: {
4770       OpdsMapping[0] =
4771           getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); // %vdst
4772       OpdsMapping[1] =
          getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI); // %addr out
4774       OpdsMapping[3] =
          getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); // %addr in
4776       OpdsMapping[4] =
4777           getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); // %data0
4778       OpdsMapping[5] =
4779           getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI); // %data1
4780       break;
4781     }
4782 
4783     default:
4784       return getInvalidInstructionMapping();
4785     }
4786     break;
4787   }
4788   case AMDGPU::G_SELECT: {
4789     unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4790     unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
4791                                     AMDGPU::SGPRRegBankID);
4792     unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI,
4793                                     AMDGPU::SGPRRegBankID);
4794     bool SGPRSrcs = Op2Bank == AMDGPU::SGPRRegBankID &&
4795                     Op3Bank == AMDGPU::SGPRRegBankID;
4796 
4797     unsigned CondBankDefault = SGPRSrcs ?
4798       AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID;
4799     unsigned CondBank = getRegBankID(MI.getOperand(1).getReg(), MRI,
4800                                      CondBankDefault);
4801     if (CondBank == AMDGPU::SGPRRegBankID)
4802       CondBank = SGPRSrcs ? AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID;
4803     else if (CondBank == AMDGPU::VGPRRegBankID)
4804       CondBank = AMDGPU::VCCRegBankID;
4805 
4806     unsigned Bank = SGPRSrcs && CondBank == AMDGPU::SGPRRegBankID ?
4807       AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
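    // Net effect: the select stays scalar only if both value sources and the
    // condition are scalar; otherwise the values go to VGPRs and the condition
    // to VCC.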
4808 
4809     assert(CondBank == AMDGPU::VCCRegBankID || CondBank == AMDGPU::SGPRRegBankID);
4810 
4811     // TODO: Should report 32-bit for scalar condition type.
4812     if (Size == 64) {
4813       OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
4814       OpdsMapping[1] = AMDGPU::getValueMapping(CondBank, 1);
4815       OpdsMapping[2] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
4816       OpdsMapping[3] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
4817     } else {
4818       OpdsMapping[0] = AMDGPU::getValueMapping(Bank, Size);
4819       OpdsMapping[1] = AMDGPU::getValueMapping(CondBank, 1);
4820       OpdsMapping[2] = AMDGPU::getValueMapping(Bank, Size);
4821       OpdsMapping[3] = AMDGPU::getValueMapping(Bank, Size);
4822     }
4823 
4824     break;
4825   }
4826 
4827   case AMDGPU::G_SI_CALL: {
4828     OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 64);
4829     // Lie and claim everything is legal, even though some need to be
4830     // SGPRs. applyMapping will have to deal with it as a waterfall loop.
4831     OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
4832 
4833     // Allow anything for implicit arguments
4834     for (unsigned I = 4; I < MI.getNumOperands(); ++I) {
4835       if (MI.getOperand(I).isReg()) {
4836         Register Reg = MI.getOperand(I).getReg();
4837         auto OpBank = getRegBankID(Reg, MRI);
4838         unsigned Size = getSizeInBits(Reg, MRI, *TRI);
4839         OpdsMapping[I] = AMDGPU::getValueMapping(OpBank, Size);
4840       }
4841     }
4842     break;
4843   }
4844   case AMDGPU::G_LOAD:
4845   case AMDGPU::G_ZEXTLOAD:
4846   case AMDGPU::G_SEXTLOAD:
4847     return getInstrMappingForLoad(MI);
4848 
4849   case AMDGPU::G_ATOMICRMW_XCHG:
4850   case AMDGPU::G_ATOMICRMW_ADD:
4851   case AMDGPU::G_ATOMICRMW_SUB:
4852   case AMDGPU::G_ATOMICRMW_AND:
4853   case AMDGPU::G_ATOMICRMW_OR:
4854   case AMDGPU::G_ATOMICRMW_XOR:
4855   case AMDGPU::G_ATOMICRMW_MAX:
4856   case AMDGPU::G_ATOMICRMW_MIN:
4857   case AMDGPU::G_ATOMICRMW_UMAX:
4858   case AMDGPU::G_ATOMICRMW_UMIN:
4859   case AMDGPU::G_ATOMICRMW_FADD:
4860   case AMDGPU::G_ATOMICRMW_UINC_WRAP:
4861   case AMDGPU::G_ATOMICRMW_UDEC_WRAP:
4862   case AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG:
4863   case AMDGPU::G_AMDGPU_ATOMIC_FMIN:
4864   case AMDGPU::G_AMDGPU_ATOMIC_FMAX: {
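    // The pointer may be able to stay on the SGPR bank when a scalar base
    // address can be used; the data operands are always VGPRs.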
4865     OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4866     OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg());
4867     OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4868     break;
4869   }
4870   case AMDGPU::G_ATOMIC_CMPXCHG: {
4871     OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4872     OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg());
4873     OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4874     OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4875     break;
4876   }
4877   case AMDGPU::G_BRCOND: {
4878     unsigned Bank = getRegBankID(MI.getOperand(0).getReg(), MRI,
4879                                  AMDGPU::SGPRRegBankID);
4880     assert(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1);
4881     if (Bank != AMDGPU::SGPRRegBankID)
4882       Bank = AMDGPU::VCCRegBankID;
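    // An SGPR s1 condition selects a uniform (scalar) branch; any other bank
    // is treated as a divergent condition and uses VCC.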
4883 
4884     OpdsMapping[0] = AMDGPU::getValueMapping(Bank, 1);
4885     break;
4886   }
4887   case AMDGPU::G_FPTRUNC_ROUND_UPWARD:
4888   case AMDGPU::G_FPTRUNC_ROUND_DOWNWARD:
4889     return getDefaultMappingVOP(MI);
4890   }
4891 
4892   return getInstructionMapping(/*ID*/1, /*Cost*/1,
4893                                getOperandsMapping(OpdsMapping),
4894                                MI.getNumOperands());
4895 }
4896