1 //===- AMDGPURegisterBankInfo.cpp -------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements the targeting of the RegisterBankInfo class for
10 /// AMDGPU.
11 ///
12 /// \par
13 ///
14 /// AMDGPU has unique register bank constraints that require special high level
15 /// strategies to deal with. There are two main true physical register banks:
16 /// VGPR (vector) and SGPR (scalar). Additionally the VCC register bank is a
17 /// sort of pseudo-register bank needed to represent SGPRs used in a vector
18 /// boolean context. There is also the AGPR bank, which is a special purpose
19 /// physical register bank present on some subtargets.
20 ///
21 /// Copying from VGPR to SGPR is generally illegal, unless the value is known to
22 /// be uniform. It is generally not valid to legalize operands by inserting
23 /// copies as on other targets. Operations which require uniform, SGPR operands
24 /// generally require scalarization by repeatedly executing the instruction,
25 /// activating each set of lanes using a unique set of input values. This is
26 /// referred to as a waterfall loop.
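///
/// For example, an operation that requires an SGPR descriptor operand but was
/// handed a VGPR value cannot be fixed with a plain copy; it is conceptually
/// rewritten into a loop that uses V_READFIRSTLANE_B32 to pick one lane's
/// value, masks EXEC to the lanes sharing that value, executes the operation,
/// and repeats until every unique value has been handled (see
/// executeInWaterfallLoop below).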
27 ///
28 /// \par Booleans
29 ///
30 /// Booleans (s1 values) require special consideration. A vector compare result
31 /// is naturally a bitmask with one bit per lane, in a 32 or 64-bit
32 /// register. These are represented with the VCC bank. During selection, we need
33 /// to be able to unambiguously go back from a register class to a register
34 /// bank. To distinguish whether an SGPR should use the SGPR or VCC register
35 /// bank, we need to know the use context type. An SGPR s1 value always means a
36 /// VCC bank value, otherwise it will be the SGPR bank. A scalar compare sets
37 /// SCC, which is a 1-bit unaddressable register. This will need to be copied to
38 /// a 32-bit virtual register. Taken together, this means we need to adjust the
39 /// type of boolean operations to be regbank legal. All SALU booleans need to be
40 /// widened to 32-bits, and all VALU booleans need to be s1 values.
41 ///
42 /// A noteworthy exception to the s1-means-vcc rule is for legalization artifact
43 /// casts. G_TRUNC s1 results, and G_SEXT/G_ZEXT/G_ANYEXT sources are never vcc
44 /// bank. A non-boolean source (such as a truncate from a 1-bit load from
45 /// memory) will require a copy to the VCC bank which will require clearing the
46 /// high bits and inserting a compare.
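///
/// As a rough illustration in generic MIR (the register names are purely
/// illustrative):
///   %val:_(s32) = G_LOAD %ptr
///   %bit:_(s1) = G_TRUNC %val
///   %res:_(s32) = G_SELECT %bit, %a, %b
/// The G_TRUNC result is not VCC bank, so using it as a VALU G_SELECT
/// condition requires a copy to the VCC bank, lowered by clearing the high
/// bits and comparing the result against zero.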
47 ///
48 /// \par Constant bus restriction
49 ///
50 /// VALU instructions have a limitation known as the constant bus
51 /// restriction. Most VALU instructions can use SGPR operands, but may read at
52 /// most 1 SGPR or constant literal value (this increases to 2 in gfx10 for
53 /// most instructions). The limit counts unique SGPRs, so the same SGPR may be
54 /// used for multiple operands. From a register bank perspective, any
55 /// combination of operands should be legal as an SGPR, but this is contextually
56 /// dependent on the SGPR operands all being the same register. It is therefore
57 /// optimal to choose the SGPR with the most uses to minimize the number of copies.
58 ///
59 /// We avoid trying to solve this problem in RegBankSelect. Any VALU G_*
60 /// operation should have its source operands all mapped to VGPRs (except for
61 /// VCC), inserting copies from any SGPR operands. This is the most trivial
62 /// legal mapping. Anything beyond the simplest 1:1 instruction selection would
63 /// be too complicated to solve here. Every optimization pattern or instruction
64 /// selected to multiple output instructions would have to enforce this rule, and there
65 /// would be additional complexity in tracking this rule for every G_*
66 /// operation. By forcing all inputs to VGPRs, it also simplifies the task of
67 /// picking the optimal operand combination from a post-isel optimization pass.
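///
/// For example, on subtargets with a single constant bus slot,
///   v_add_f32 v0, s0, s1
/// is illegal because it reads two unique SGPRs, while
///   v_add_f32 v0, s0, s0
/// is fine because only one unique SGPR is read.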
68 ///
69 //===----------------------------------------------------------------------===//
70 
71 #include "AMDGPURegisterBankInfo.h"
72 
73 #include "AMDGPUGlobalISelUtils.h"
74 #include "AMDGPUInstrInfo.h"
75 #include "AMDGPUSubtarget.h"
76 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
77 #include "SIMachineFunctionInfo.h"
78 #include "SIRegisterInfo.h"
79 #include "llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h"
80 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
81 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
82 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
83 #include "llvm/CodeGen/GlobalISel/RegisterBank.h"
84 #include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
85 #include "llvm/CodeGen/TargetRegisterInfo.h"
86 #include "llvm/CodeGen/TargetSubtargetInfo.h"
87 #include "llvm/IR/Constants.h"
88 
89 #define GET_TARGET_REGBANK_IMPL
90 #include "AMDGPUGenRegisterBank.inc"
91 
92 // This file will be TableGen'ed at some point.
93 #include "AMDGPUGenRegisterBankInfo.def"
94 
95 using namespace llvm;
96 using namespace MIPatternMatch;
97 
98 namespace {
99 
100 // Observer to apply a register bank to new registers created by LegalizerHelper.
101 class ApplyRegBankMapping final : public GISelChangeObserver {
102 private:
103   const AMDGPURegisterBankInfo &RBI;
104   MachineRegisterInfo &MRI;
105   const RegisterBank *NewBank;
106   SmallVector<MachineInstr *, 4> NewInsts;
107 
108 public:
109   ApplyRegBankMapping(const AMDGPURegisterBankInfo &RBI_,
110                       MachineRegisterInfo &MRI_, const RegisterBank *RB)
111     : RBI(RBI_), MRI(MRI_), NewBank(RB) {}
112 
113   ~ApplyRegBankMapping() {
114     for (MachineInstr *MI : NewInsts)
115       applyBank(*MI);
116   }
117 
118   /// Set any registers that don't have a set register class or bank to the new bank.
119   void applyBank(MachineInstr &MI) {
120     const unsigned Opc = MI.getOpcode();
121     if (Opc == AMDGPU::G_ANYEXT || Opc == AMDGPU::G_ZEXT ||
122         Opc == AMDGPU::G_SEXT) {
123       // LegalizerHelper wants to use the basic legalization artifacts when
124       // widening etc. We don't handle selection with vcc in artifact sources,
125       // so we need to use a select instead to handle these properly.
126       Register DstReg = MI.getOperand(0).getReg();
127       Register SrcReg = MI.getOperand(1).getReg();
128       const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, MRI, *RBI.TRI);
129       if (SrcBank == &AMDGPU::VCCRegBank) {
130         const LLT S32 = LLT::scalar(32);
131         assert(MRI.getType(SrcReg) == LLT::scalar(1));
132         assert(MRI.getType(DstReg) == S32);
133         assert(NewBank == &AMDGPU::VGPRRegBank);
134 
135         // Replace the extension with a select, which really uses the boolean
136         // source.
137         MachineIRBuilder B(MI);
138         auto True = B.buildConstant(S32, Opc == AMDGPU::G_SEXT ? -1 : 1);
139         auto False = B.buildConstant(S32, 0);
140         B.buildSelect(DstReg, SrcReg, True, False);
141         MRI.setRegBank(True.getReg(0), *NewBank);
142         MRI.setRegBank(False.getReg(0), *NewBank);
143         MI.eraseFromParent();
144       }
145 
146       assert(!MRI.getRegClassOrRegBank(DstReg));
147       MRI.setRegBank(DstReg, *NewBank);
148       return;
149     }
150 
151 #ifndef NDEBUG
152     if (Opc == AMDGPU::G_TRUNC) {
153       Register DstReg = MI.getOperand(0).getReg();
154       const RegisterBank *DstBank = RBI.getRegBank(DstReg, MRI, *RBI.TRI);
155       assert(DstBank != &AMDGPU::VCCRegBank);
156     }
157 #endif
158 
159     for (MachineOperand &Op : MI.operands()) {
160       if (!Op.isReg())
161         continue;
162 
163       // We may see physical registers if building a real MI
164       Register Reg = Op.getReg();
165       if (Reg.isPhysical() || MRI.getRegClassOrRegBank(Reg))
166         continue;
167 
168       const RegisterBank *RB = NewBank;
169       if (MRI.getType(Reg) == LLT::scalar(1)) {
170         assert(NewBank == &AMDGPU::VGPRRegBank &&
171                "s1 operands should only be used for vector bools");
172         assert((MI.getOpcode() != AMDGPU::G_TRUNC &&
173                 MI.getOpcode() != AMDGPU::G_ANYEXT) &&
174                "not expecting legalization artifacts here");
175         RB = &AMDGPU::VCCRegBank;
176       }
177 
178       MRI.setRegBank(Reg, *RB);
179     }
180   }
181 
182   void erasingInstr(MachineInstr &MI) override {}
183 
184   void createdInstr(MachineInstr &MI) override {
185     // At this point, the instruction was just inserted and has no operands.
186     NewInsts.push_back(&MI);
187   }
188 
189   void changingInstr(MachineInstr &MI) override {}
190   void changedInstr(MachineInstr &MI) override {}
191 };
192 
193 }
194 AMDGPURegisterBankInfo::AMDGPURegisterBankInfo(const GCNSubtarget &ST)
195     : AMDGPUGenRegisterBankInfo(),
196       Subtarget(ST),
197       TRI(Subtarget.getRegisterInfo()),
198       TII(Subtarget.getInstrInfo()) {
199 
200   // HACK: Until this is fully tablegen'd.
201   static llvm::once_flag InitializeRegisterBankFlag;
202 
203   static auto InitializeRegisterBankOnce = [this]() {
204     assert(&getRegBank(AMDGPU::SGPRRegBankID) == &AMDGPU::SGPRRegBank &&
205            &getRegBank(AMDGPU::VGPRRegBankID) == &AMDGPU::VGPRRegBank &&
206            &getRegBank(AMDGPU::AGPRRegBankID) == &AMDGPU::AGPRRegBank);
207     (void)this;
208   };
209 
210   llvm::call_once(InitializeRegisterBankFlag, InitializeRegisterBankOnce);
211 }
212 
213 static bool isVectorRegisterBank(const RegisterBank &Bank) {
214   unsigned BankID = Bank.getID();
215   return BankID == AMDGPU::VGPRRegBankID || BankID == AMDGPU::AGPRRegBankID;
216 }
217 
218 unsigned AMDGPURegisterBankInfo::copyCost(const RegisterBank &Dst,
219                                           const RegisterBank &Src,
220                                           unsigned Size) const {
221   // TODO: Should there be a UniformVGPRRegBank which can use readfirstlane?
222   if (Dst.getID() == AMDGPU::SGPRRegBankID &&
223       (isVectorRegisterBank(Src) || Src.getID() == AMDGPU::VCCRegBankID)) {
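    // A plain copy from a divergent bank to an SGPR is not generally possible,
    // so treat it as prohibitively expensive.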
224     return std::numeric_limits<unsigned>::max();
225   }
226 
227   // Bool values are tricky, because the meaning is based on context. The SCC
228   // and VCC banks are for the natural scalar and vector conditions produced by
229   // a compare.
230   //
231   // Legalization doesn't know about the necessary context, so an s1 use may
232   // have been a truncate from an arbitrary value, in which case a copy (lowered
233   // as a compare with 0) needs to be inserted.
234   if (Size == 1 &&
235       (Dst.getID() == AMDGPU::SGPRRegBankID) &&
236       (isVectorRegisterBank(Src) ||
237        Src.getID() == AMDGPU::SGPRRegBankID ||
238        Src.getID() == AMDGPU::VCCRegBankID))
239     return std::numeric_limits<unsigned>::max();
240 
241   // There is no direct copy between AGPRs.
242   if (Dst.getID() == AMDGPU::AGPRRegBankID &&
243       Src.getID() == AMDGPU::AGPRRegBankID)
244     return 4;
245 
246   return RegisterBankInfo::copyCost(Dst, Src, Size);
247 }
248 
249 unsigned AMDGPURegisterBankInfo::getBreakDownCost(
250   const ValueMapping &ValMapping,
251   const RegisterBank *CurBank) const {
252   // Check if this is a breakdown for G_LOAD to move the pointer from SGPR to
253   // VGPR.
254   // FIXME: Is there a better way to do this?
255   if (ValMapping.NumBreakDowns >= 2 || ValMapping.BreakDown[0].Length >= 64)
256     return 10; // This is expensive.
257 
258   assert(ValMapping.NumBreakDowns == 2 &&
259          ValMapping.BreakDown[0].Length == 32 &&
260          ValMapping.BreakDown[0].StartIdx == 0 &&
261          ValMapping.BreakDown[1].Length == 32 &&
262          ValMapping.BreakDown[1].StartIdx == 32 &&
263          ValMapping.BreakDown[0].RegBank == ValMapping.BreakDown[1].RegBank);
264 
265   // 32-bit extract of a 64-bit value is just access of a subregister, so free.
266   // TODO: Cost of 0 hits assert, though it's not clear it's what we really
267   // want.
268 
269   // TODO: 32-bit insert to a 64-bit SGPR may incur a non-free copy due to SGPR
270   // alignment restrictions, but this probably isn't important.
271   return 1;
272 }
273 
274 const RegisterBank &
275 AMDGPURegisterBankInfo::getRegBankFromRegClass(const TargetRegisterClass &RC,
276                                                LLT Ty) const {
277   if (&RC == &AMDGPU::SReg_1RegClass)
278     return AMDGPU::VCCRegBank;
279 
280   // We promote real scalar booleans to SReg_32. Any SGPR using s1 is really a
281   // VCC-like use.
282   if (TRI->isSGPRClass(&RC)) {
283     // FIXME: This probably came from a copy from a physical register, which
284     // should be inferable from the copied-to type. We don't have many boolean
285     // physical register constraints so just assume a normal SGPR for now.
286     if (!Ty.isValid())
287       return AMDGPU::SGPRRegBank;
288 
289     return Ty == LLT::scalar(1) ? AMDGPU::VCCRegBank : AMDGPU::SGPRRegBank;
290   }
291 
292   return TRI->isAGPRClass(&RC) ? AMDGPU::AGPRRegBank : AMDGPU::VGPRRegBank;
293 }
294 
295 template <unsigned NumOps>
296 RegisterBankInfo::InstructionMappings
297 AMDGPURegisterBankInfo::addMappingFromTable(
298     const MachineInstr &MI, const MachineRegisterInfo &MRI,
299     const std::array<unsigned, NumOps> RegSrcOpIdx,
300     ArrayRef<OpRegBankEntry<NumOps>> Table) const {
301 
302   InstructionMappings AltMappings;
303 
304   SmallVector<const ValueMapping *, 10> Operands(MI.getNumOperands());
305 
306   unsigned Sizes[NumOps];
307   for (unsigned I = 0; I < NumOps; ++I) {
308     Register Reg = MI.getOperand(RegSrcOpIdx[I]).getReg();
309     Sizes[I] = getSizeInBits(Reg, MRI, *TRI);
310   }
311 
312   for (unsigned I = 0, E = MI.getNumExplicitDefs(); I != E; ++I) {
313     unsigned SizeI = getSizeInBits(MI.getOperand(I).getReg(), MRI, *TRI);
314     Operands[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SizeI);
315   }
316 
317   // getInstrMapping's default mapping uses ID 1, so start at 2.
318   unsigned MappingID = 2;
319   for (const auto &Entry : Table) {
320     for (unsigned I = 0; I < NumOps; ++I) {
321       int OpIdx = RegSrcOpIdx[I];
322       Operands[OpIdx] = AMDGPU::getValueMapping(Entry.RegBanks[I], Sizes[I]);
323     }
324 
325     AltMappings.push_back(&getInstructionMapping(MappingID++, Entry.Cost,
326                                                  getOperandsMapping(Operands),
327                                                  Operands.size()));
328   }
329 
330   return AltMappings;
331 }
332 
333 RegisterBankInfo::InstructionMappings
334 AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsic(
335     const MachineInstr &MI, const MachineRegisterInfo &MRI) const {
336   switch (MI.getIntrinsicID()) {
337   case Intrinsic::amdgcn_readlane: {
338     static const OpRegBankEntry<3> Table[2] = {
339       // Perfectly legal.
340       { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },
341 
342       // Need a readfirstlane for the index.
343       { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 }
344     };
345 
346     const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 2, 3 } };
347     return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
348   }
349   case Intrinsic::amdgcn_writelane: {
350     static const OpRegBankEntry<4> Table[4] = {
351       // Perfectly legal.
352       { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },
353 
354       // Need readfirstlane of first op
355       { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 },
356 
357       // Need readfirstlane of second op
358       { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 },
359 
360       // Need readfirstlane of both ops
361       { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 3 }
362     };
363 
364     // dst, value, lane select, old value
365     const std::array<unsigned, 4> RegSrcOpIdx = { { 0, 2, 3, 4 } };
366     return addMappingFromTable<4>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
367   }
368   default:
369     return RegisterBankInfo::getInstrAlternativeMappings(MI);
370   }
371 }
372 
373 RegisterBankInfo::InstructionMappings
374 AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsicWSideEffects(
375     const MachineInstr &MI, const MachineRegisterInfo &MRI) const {
376 
377   switch (MI.getIntrinsicID()) {
378   case Intrinsic::amdgcn_s_buffer_load: {
379     static const OpRegBankEntry<2> Table[4] = {
380       // Perfectly legal.
381       { { AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },
382 
383       // Only need 1 register in loop
384       { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 300 },
385 
386       // Have to waterfall the resource.
387       { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1000 },
388 
389       // Have to waterfall the resource, and the offset.
390       { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1500 }
391     };
392 
393     // rsrc, offset
394     const std::array<unsigned, 2> RegSrcOpIdx = { { 2, 3 } };
395     return addMappingFromTable<2>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
396   }
397   case Intrinsic::amdgcn_ds_ordered_add:
398   case Intrinsic::amdgcn_ds_ordered_swap: {
399     // VGPR = M0, VGPR
400     static const OpRegBankEntry<3> Table[2] = {
401       // Perfectly legal.
402       { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID  }, 1 },
403 
404       // Need a readfirstlane for m0
405       { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 }
406     };
407 
408     const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 2, 3 } };
409     return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
410   }
411   case Intrinsic::amdgcn_s_sendmsg:
412   case Intrinsic::amdgcn_s_sendmsghalt: {
413     // FIXME: Should have no register for immediate
414     static const OpRegBankEntry<1> Table[2] = {
415       // Perfectly legal.
416       { { AMDGPU::SGPRRegBankID }, 1 },
417 
418       // Need readlane
419       { { AMDGPU::VGPRRegBankID }, 3 }
420     };
421 
422     const std::array<unsigned, 1> RegSrcOpIdx = { { 2 } };
423     return addMappingFromTable<1>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
424   }
425   default:
426     return RegisterBankInfo::getInstrAlternativeMappings(MI);
427   }
428 }
429 
430 static bool memOpHasNoClobbered(const MachineMemOperand *MMO) {
431   const Instruction *I = dyn_cast_or_null<Instruction>(MMO->getValue());
432   return I && I->getMetadata("amdgpu.noclobber");
433 }
434 
435 // FIXME: Returns uniform if there's no source value information. This is
436 // probably wrong.
437 static bool isScalarLoadLegal(const MachineInstr &MI) {
438   if (!MI.hasOneMemOperand())
439     return false;
440 
441   const MachineMemOperand *MMO = *MI.memoperands_begin();
442   const unsigned AS = MMO->getAddrSpace();
443   const bool IsConst = AS == AMDGPUAS::CONSTANT_ADDRESS ||
444                        AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT;
445 
446   // There are no extending SMRD/SMEM loads, and they require 4-byte alignment.
447   return MMO->getSize() >= 4 && MMO->getAlign() >= Align(4) &&
448          // Can't do a scalar atomic load.
449          !MMO->isAtomic() &&
450          // Don't use scalar loads for volatile accesses to non-constant address
451          // spaces.
452          (IsConst || !MMO->isVolatile()) &&
453          // Memory must be known constant, or not written before this load.
454          (IsConst || MMO->isInvariant() || memOpHasNoClobbered(MMO)) &&
455          AMDGPUInstrInfo::isUniformMMO(MMO);
456 }
457 
458 RegisterBankInfo::InstructionMappings
459 AMDGPURegisterBankInfo::getInstrAlternativeMappings(
460     const MachineInstr &MI) const {
461 
462   const MachineFunction &MF = *MI.getParent()->getParent();
463   const MachineRegisterInfo &MRI = MF.getRegInfo();
464 
465 
466   InstructionMappings AltMappings;
467   switch (MI.getOpcode()) {
468   case TargetOpcode::G_CONSTANT: {
469     unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
470     if (Size == 1) {
471       static const OpRegBankEntry<1> Table[3] = {
472         { { AMDGPU::VGPRRegBankID }, 1 },
473         { { AMDGPU::SGPRRegBankID }, 1 },
474         { { AMDGPU::VCCRegBankID }, 1 }
475       };
476 
477       return addMappingFromTable<1>(MI, MRI, {{ 0 }}, Table);
478     }
479 
480     LLVM_FALLTHROUGH;
481   }
482   case TargetOpcode::G_FCONSTANT:
483   case TargetOpcode::G_FRAME_INDEX:
484   case TargetOpcode::G_GLOBAL_VALUE: {
485     static const OpRegBankEntry<1> Table[2] = {
486       { { AMDGPU::VGPRRegBankID }, 1 },
487       { { AMDGPU::SGPRRegBankID }, 1 }
488     };
489 
490     return addMappingFromTable<1>(MI, MRI, {{ 0 }}, Table);
491   }
492   case TargetOpcode::G_AND:
493   case TargetOpcode::G_OR:
494   case TargetOpcode::G_XOR: {
495     unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
496 
497     if (Size == 1) {
498       // s_{and|or|xor}_b32 set scc when the result of the 32-bit op is not 0.
499       const InstructionMapping &SCCMapping = getInstructionMapping(
500         1, 1, getOperandsMapping(
501           {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32),
502            AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32),
503            AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32)}),
504         3); // Num Operands
505       AltMappings.push_back(&SCCMapping);
506 
507       const InstructionMapping &VCCMapping0 = getInstructionMapping(
508         2, 1, getOperandsMapping(
509           {AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size),
510            AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size),
511            AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size)}),
512         3); // Num Operands
513       AltMappings.push_back(&VCCMapping0);
514       return AltMappings;
515     }
516 
517     if (Size != 64)
518       break;
519 
520     const InstructionMapping &SSMapping = getInstructionMapping(
521       1, 1, getOperandsMapping(
522         {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
523          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
524          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
525       3); // Num Operands
526     AltMappings.push_back(&SSMapping);
527 
528     const InstructionMapping &VVMapping = getInstructionMapping(
529       2, 2, getOperandsMapping(
530         {AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
531          AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
532          AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}),
533       3); // Num Operands
534     AltMappings.push_back(&VVMapping);
535     break;
536   }
537   case TargetOpcode::G_LOAD:
538   case TargetOpcode::G_ZEXTLOAD:
539   case TargetOpcode::G_SEXTLOAD: {
540     unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
541     LLT PtrTy = MRI.getType(MI.getOperand(1).getReg());
542     unsigned PtrSize = PtrTy.getSizeInBits();
543     unsigned AS = PtrTy.getAddressSpace();
544 
545     if ((AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::REGION_ADDRESS &&
546          AS != AMDGPUAS::PRIVATE_ADDRESS) &&
547         isScalarLoadLegal(MI)) {
548       const InstructionMapping &SSMapping = getInstructionMapping(
549           1, 1, getOperandsMapping(
550                     {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
551                      AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize)}),
552           2); // Num Operands
553       AltMappings.push_back(&SSMapping);
554     }
555 
556     const InstructionMapping &VVMapping = getInstructionMapping(
557         2, 1,
558         getOperandsMapping(
559             {AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
560              AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize)}),
561         2); // Num Operands
562     AltMappings.push_back(&VVMapping);
563 
564     // It may be possible to have a vgpr = load sgpr mapping here, because
565     // the mubuf instructions support this kind of load, but probably for only
566     // gfx7 and older.  However, the addressing mode matching in the instruction
567     // selector should be able to do a better job of detecting and selecting
568     // these kinds of loads from the vgpr = load vgpr mapping.
569 
570     return AltMappings;
571 
572   }
573   case TargetOpcode::G_SELECT: {
574     unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
575     const InstructionMapping &SSMapping = getInstructionMapping(1, 1,
576       getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
577                           AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1),
578                           AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
579                           AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
580       4); // Num Operands
581     AltMappings.push_back(&SSMapping);
582 
583     const InstructionMapping &VVMapping = getInstructionMapping(2, 1,
584       getOperandsMapping({AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
585                           AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
586                           AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
587                           AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}),
588       4); // Num Operands
589     AltMappings.push_back(&VVMapping);
590 
591     return AltMappings;
592   }
593   case TargetOpcode::G_SMIN:
594   case TargetOpcode::G_SMAX:
595   case TargetOpcode::G_UMIN:
596   case TargetOpcode::G_UMAX: {
597     static const OpRegBankEntry<3> Table[2] = {
598       { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },
599 
600       // Scalar requires cmp+select, and extends if 16-bit.
601       // FIXME: Should there be separate costs for 32 and 16-bit
602       { { AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID }, 3 }
603     };
604 
605     const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 1, 2 } };
606     return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
607   }
608   case TargetOpcode::G_UADDE:
609   case TargetOpcode::G_USUBE:
610   case TargetOpcode::G_SADDE:
611   case TargetOpcode::G_SSUBE: {
612     unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
613     const InstructionMapping &SSMapping = getInstructionMapping(1, 1,
614       getOperandsMapping(
615         {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
616          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1),
617          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
618          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
619          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1)}),
620       5); // Num Operands
621     AltMappings.push_back(&SSMapping);
622 
623     const InstructionMapping &VVMapping = getInstructionMapping(2, 1,
624       getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
625                           AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
626                           AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
627                           AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
628                           AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1)}),
629       5); // Num Operands
630     AltMappings.push_back(&VVMapping);
631     return AltMappings;
632   }
633   case AMDGPU::G_BRCOND: {
634     assert(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1);
635 
636     // TODO: Change type to 32 for scalar
637     const InstructionMapping &SMapping = getInstructionMapping(
638       1, 1, getOperandsMapping(
639         {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1), nullptr}),
640       2); // Num Operands
641     AltMappings.push_back(&SMapping);
642 
643     const InstructionMapping &VMapping = getInstructionMapping(
644       1, 1, getOperandsMapping(
645         {AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1), nullptr }),
646       2); // Num Operands
647     AltMappings.push_back(&VMapping);
648     return AltMappings;
649   }
650   case AMDGPU::G_INTRINSIC:
651     return getInstrAlternativeMappingsIntrinsic(MI, MRI);
652   case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS:
653     return getInstrAlternativeMappingsIntrinsicWSideEffects(MI, MRI);
654   default:
655     break;
656   }
657   return RegisterBankInfo::getInstrAlternativeMappings(MI);
658 }
659 
660 void AMDGPURegisterBankInfo::split64BitValueForMapping(
661   MachineIRBuilder &B,
662   SmallVector<Register, 2> &Regs,
663   LLT HalfTy,
664   Register Reg) const {
665   assert(HalfTy.getSizeInBits() == 32);
666   MachineRegisterInfo *MRI = B.getMRI();
667   Register LoLHS = MRI->createGenericVirtualRegister(HalfTy);
668   Register HiLHS = MRI->createGenericVirtualRegister(HalfTy);
669   const RegisterBank *Bank = getRegBank(Reg, *MRI, *TRI);
670   MRI->setRegBank(LoLHS, *Bank);
671   MRI->setRegBank(HiLHS, *Bank);
672 
673   Regs.push_back(LoLHS);
674   Regs.push_back(HiLHS);
675 
676   B.buildInstr(AMDGPU::G_UNMERGE_VALUES)
677     .addDef(LoLHS)
678     .addDef(HiLHS)
679     .addUse(Reg);
680 }
681 
682 /// Replace the current type each register in \p Regs has with \p NewTy
683 static void setRegsToType(MachineRegisterInfo &MRI, ArrayRef<Register> Regs,
684                           LLT NewTy) {
685   for (Register Reg : Regs) {
686     assert(MRI.getType(Reg).getSizeInBits() == NewTy.getSizeInBits());
687     MRI.setType(Reg, NewTy);
688   }
689 }
690 
691 static LLT getHalfSizedType(LLT Ty) {
692   if (Ty.isVector()) {
693     assert(Ty.getNumElements() % 2 == 0);
694     return LLT::scalarOrVector(Ty.getNumElements() / 2, Ty.getElementType());
695   }
696 
697   assert(Ty.getSizeInBits() % 2 == 0);
698   return LLT::scalar(Ty.getSizeInBits() / 2);
699 }
700 
701 /// Legalize instruction \p MI where operands in \p OpIndices must be SGPRs. If
702 /// any of the required SGPR operands are VGPRs, perform a waterfall loop to
703 /// execute the instruction for each unique combination of values in all lanes
704 /// in the wave. The block will be split such that the rest of the instructions
705 /// are moved to a new block.
706 ///
707 /// Essentially performs this loop:
708 ///
709 /// Save Execution Mask
710 /// For (Lane : Wavefront) {
711 ///   Enable Lane, Disable all other lanes
712 ///   SGPR = read SGPR value for current lane from VGPR
713 ///   VGPRResult[Lane] = use_op SGPR
714 /// }
715 /// Restore Execution Mask
716 ///
717 /// There is additional complexity from comparing the operand values to
718 /// identify the unique values actually used.
719 bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
720   MachineIRBuilder &B,
721   iterator_range<MachineBasicBlock::iterator> Range,
722   SmallSet<Register, 4> &SGPROperandRegs,
723   MachineRegisterInfo &MRI) const {
724   SmallVector<Register, 4> ResultRegs;
725   SmallVector<Register, 4> InitResultRegs;
726   SmallVector<Register, 4> PhiRegs;
727 
728   // Track use registers which have already been expanded with a readfirstlane
729   // sequence. This may have multiple uses if moving a sequence.
730   DenseMap<Register, Register> WaterfalledRegMap;
731 
732   MachineBasicBlock &MBB = B.getMBB();
733   MachineFunction *MF = &B.getMF();
734 
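  // Pick the wave size specific exec mask register class, exec register and
  // mask-manipulation opcodes.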
735   const TargetRegisterClass *WaveRC = TRI->getWaveMaskRegClass();
736   const unsigned WaveAndOpc = Subtarget.isWave32() ?
737     AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
738   const unsigned MovTermOpc = Subtarget.isWave32() ?
739     AMDGPU::S_MOV_B32_term : AMDGPU::S_MOV_B64_term;
740   const unsigned XorTermOpc = Subtarget.isWave32() ?
741     AMDGPU::S_XOR_B32_term : AMDGPU::S_XOR_B64_term;
742   const unsigned AndSaveExecOpc =  Subtarget.isWave32() ?
743     AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64;
744   const unsigned ExecReg =  Subtarget.isWave32() ?
745     AMDGPU::EXEC_LO : AMDGPU::EXEC;
746 
747 #ifndef NDEBUG
748   const int OrigRangeSize = std::distance(Range.begin(), Range.end());
749 #endif
750 
751   for (MachineInstr &MI : Range) {
752     for (MachineOperand &Def : MI.defs()) {
753       if (MRI.use_nodbg_empty(Def.getReg()))
754         continue;
755 
756       LLT ResTy = MRI.getType(Def.getReg());
757       const RegisterBank *DefBank = getRegBank(Def.getReg(), MRI, *TRI);
758       ResultRegs.push_back(Def.getReg());
759       Register InitReg = B.buildUndef(ResTy).getReg(0);
760       Register PhiReg = MRI.createGenericVirtualRegister(ResTy);
761       InitResultRegs.push_back(InitReg);
762       PhiRegs.push_back(PhiReg);
763       MRI.setRegBank(PhiReg, *DefBank);
764       MRI.setRegBank(InitReg, *DefBank);
765     }
766   }
767 
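  // Registers for saving and restoring the exec mask around the loop; the
  // initial (undef) value feeds the exec PHI in the loop header.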
768   Register SaveExecReg = MRI.createVirtualRegister(WaveRC);
769   Register InitSaveExecReg = MRI.createVirtualRegister(WaveRC);
770 
771   // Don't bother using generic instructions/registers for the exec mask.
772   B.buildInstr(TargetOpcode::IMPLICIT_DEF)
773     .addDef(InitSaveExecReg);
774 
775   Register PhiExec = MRI.createVirtualRegister(WaveRC);
776   Register NewExec = MRI.createVirtualRegister(WaveRC);
777 
778   // To insert the loop we need to split the block. Move everything before this
779   // point to a new block, and insert a new empty block before this instruction.
780   MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
781   MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
782   MachineBasicBlock *RestoreExecBB = MF->CreateMachineBasicBlock();
783   MachineFunction::iterator MBBI(MBB);
784   ++MBBI;
785   MF->insert(MBBI, LoopBB);
786   MF->insert(MBBI, RestoreExecBB);
787   MF->insert(MBBI, RemainderBB);
788 
789   LoopBB->addSuccessor(RestoreExecBB);
790   LoopBB->addSuccessor(LoopBB);
791 
792   // Move the rest of the block into a new block.
793   RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
794   RemainderBB->splice(RemainderBB->begin(), &MBB, Range.end(), MBB.end());
795 
796   MBB.addSuccessor(LoopBB);
797   RestoreExecBB->addSuccessor(RemainderBB);
798 
799   B.setInsertPt(*LoopBB, LoopBB->end());
800 
801   B.buildInstr(TargetOpcode::PHI)
802     .addDef(PhiExec)
803     .addReg(InitSaveExecReg)
804     .addMBB(&MBB)
805     .addReg(NewExec)
806     .addMBB(LoopBB);
807 
808   for (auto Result : zip(InitResultRegs, ResultRegs, PhiRegs)) {
809     B.buildInstr(TargetOpcode::G_PHI)
810       .addDef(std::get<2>(Result))
811       .addReg(std::get<0>(Result)) // Initial value / implicit_def
812       .addMBB(&MBB)
813       .addReg(std::get<1>(Result)) // Mid-loop value.
814       .addMBB(LoopBB);
815   }
816 
817   const DebugLoc &DL = B.getDL();
818 
819   MachineInstr &FirstInst = *Range.begin();
820 
821   // Move the instruction into the loop. Note we moved everything after
822   // Range.end() already into a new block, so Range.end() is no longer valid.
823   LoopBB->splice(LoopBB->end(), &MBB, Range.begin(), MBB.end());
824 
825   // Figure out the iterator range after splicing the instructions.
826   MachineBasicBlock::iterator NewBegin = FirstInst.getIterator();
827   auto NewEnd = LoopBB->end();
828 
829   MachineBasicBlock::iterator I = Range.begin();
830   B.setInsertPt(*LoopBB, I);
831 
832   Register CondReg;
833 
834   assert(std::distance(NewBegin, NewEnd) == OrigRangeSize);
835 
836   for (MachineInstr &MI : make_range(NewBegin, NewEnd)) {
837     for (MachineOperand &Op : MI.uses()) {
838       if (!Op.isReg() || Op.isDef())
839         continue;
840 
841       Register OldReg = Op.getReg();
842       if (!SGPROperandRegs.count(OldReg))
843         continue;
844 
845       // See if we already processed this register in another instruction in the
846       // sequence.
847       auto OldVal = WaterfalledRegMap.find(OldReg);
848       if (OldVal != WaterfalledRegMap.end()) {
849         Op.setReg(OldVal->second);
850         continue;
851       }
852 
853       Register OpReg = Op.getReg();
854       LLT OpTy = MRI.getType(OpReg);
855 
856       const RegisterBank *OpBank = getRegBank(OpReg, MRI, *TRI);
857       if (OpBank != &AMDGPU::VGPRRegBank) {
858         // Insert copy from AGPR to VGPR before the loop.
859         B.setMBB(MBB);
860         OpReg = B.buildCopy(OpTy, OpReg).getReg(0);
861         MRI.setRegBank(OpReg, AMDGPU::VGPRRegBank);
862         B.setInstr(*I);
863       }
864 
865       unsigned OpSize = OpTy.getSizeInBits();
866 
867       // Can only do a readlane of 32-bit pieces.
868       if (OpSize == 32) {
869         // Avoid extra copies in the simple case of one 32-bit register.
870         Register CurrentLaneOpReg
871           = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
872         MRI.setType(CurrentLaneOpReg, OpTy);
873 
874         constrainGenericRegister(OpReg, AMDGPU::VGPR_32RegClass, MRI);
875         // Read the next variant <- also loop target.
876         BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
877                 CurrentLaneOpReg)
878           .addReg(OpReg);
879 
880         Register NewCondReg = MRI.createVirtualRegister(WaveRC);
881         bool First = CondReg == AMDGPU::NoRegister;
882         if (First)
883           CondReg = NewCondReg;
884 
885         // Compare the just-read scalar value against the value in each lane.
886         B.buildInstr(AMDGPU::V_CMP_EQ_U32_e64)
887           .addDef(NewCondReg)
888           .addReg(CurrentLaneOpReg)
889           .addReg(OpReg);
890         Op.setReg(CurrentLaneOpReg);
891 
892         if (!First) {
893           Register AndReg = MRI.createVirtualRegister(WaveRC);
894 
895           // If there are multiple operands to consider, AND the conditions together.
896           B.buildInstr(WaveAndOpc)
897             .addDef(AndReg)
898             .addReg(NewCondReg)
899             .addReg(CondReg);
900           CondReg = AndReg;
901         }
902       } else {
903         LLT S32 = LLT::scalar(32);
904         SmallVector<Register, 8> ReadlanePieces;
905 
906         // The compares can be done as 64-bit, but the extract needs to be done
907         // in 32-bit pieces.
908 
909         bool Is64 = OpSize % 64 == 0;
910 
911         LLT UnmergeTy = OpSize % 64 == 0 ? LLT::scalar(64) : LLT::scalar(32);
912         unsigned CmpOp = OpSize % 64 == 0 ? AMDGPU::V_CMP_EQ_U64_e64
913           : AMDGPU::V_CMP_EQ_U32_e64;
917 
918         // Insert the unmerge before the loop.
919 
920         B.setMBB(MBB);
921         auto Unmerge = B.buildUnmerge(UnmergeTy, OpReg);
922         B.setInstr(*I);
923 
924         unsigned NumPieces = Unmerge->getNumOperands() - 1;
925         for (unsigned PieceIdx = 0; PieceIdx != NumPieces; ++PieceIdx) {
926           Register UnmergePiece = Unmerge.getReg(PieceIdx);
927 
928           Register CurrentLaneOpReg;
929           if (Is64) {
930             Register CurrentLaneOpRegLo = MRI.createGenericVirtualRegister(S32);
931             Register CurrentLaneOpRegHi = MRI.createGenericVirtualRegister(S32);
932 
933             MRI.setRegClass(UnmergePiece, &AMDGPU::VReg_64RegClass);
934             MRI.setRegClass(CurrentLaneOpRegLo, &AMDGPU::SReg_32_XM0RegClass);
935             MRI.setRegClass(CurrentLaneOpRegHi, &AMDGPU::SReg_32_XM0RegClass);
936 
937             // Read the next variant <- also loop target.
938             BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
939                     CurrentLaneOpRegLo)
940               .addReg(UnmergePiece, 0, AMDGPU::sub0);
941 
942             // Read the next variant <- also loop target.
943             BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
944                     CurrentLaneOpRegHi)
945               .addReg(UnmergePiece, 0, AMDGPU::sub1);
946 
947             CurrentLaneOpReg =
948               B.buildMerge(LLT::scalar(64),
949                            {CurrentLaneOpRegLo, CurrentLaneOpRegHi})
950               .getReg(0);
951 
952             MRI.setRegClass(CurrentLaneOpReg, &AMDGPU::SReg_64_XEXECRegClass);
953 
954             if (OpTy.getScalarSizeInBits() == 64) {
955               // If we need to produce a 64-bit element vector, use the
956               // merged pieces.
957               ReadlanePieces.push_back(CurrentLaneOpReg);
958             } else {
959               // 32-bit element type.
960               ReadlanePieces.push_back(CurrentLaneOpRegLo);
961               ReadlanePieces.push_back(CurrentLaneOpRegHi);
962             }
963           } else {
964             CurrentLaneOpReg = MRI.createGenericVirtualRegister(S32);
965             MRI.setRegClass(UnmergePiece, &AMDGPU::VGPR_32RegClass);
966             MRI.setRegClass(CurrentLaneOpReg, &AMDGPU::SReg_32_XM0RegClass);
967 
968             // Read the next variant <- also loop target.
969             BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
970                     CurrentLaneOpReg)
971               .addReg(UnmergePiece);
972             ReadlanePieces.push_back(CurrentLaneOpReg);
973           }
974 
975           Register NewCondReg = MRI.createVirtualRegister(WaveRC);
976           bool First = CondReg == AMDGPU::NoRegister;
977           if (First)
978             CondReg = NewCondReg;
979 
980           B.buildInstr(CmpOp)
981             .addDef(NewCondReg)
982             .addReg(CurrentLaneOpReg)
983             .addReg(UnmergePiece);
984 
985           if (!First) {
986             Register AndReg = MRI.createVirtualRegister(WaveRC);
987 
988             // If there are multiple operands to consider, AND the conditions together.
989             B.buildInstr(WaveAndOpc)
990               .addDef(AndReg)
991               .addReg(NewCondReg)
992               .addReg(CondReg);
993             CondReg = AndReg;
994           }
995         }
996 
997         // FIXME: Build merge seems to switch to CONCAT_VECTORS but not
998         // BUILD_VECTOR
999         if (OpTy.isVector()) {
1000           auto Merge = B.buildBuildVector(OpTy, ReadlanePieces);
1001           Op.setReg(Merge.getReg(0));
1002         } else {
1003           auto Merge = B.buildMerge(OpTy, ReadlanePieces);
1004           Op.setReg(Merge.getReg(0));
1005         }
1006 
1007         MRI.setRegBank(Op.getReg(), AMDGPU::SGPRRegBank);
1008       }
1009 
1010       // Make sure we don't re-process this register again.
1011       WaterfalledRegMap.insert(std::make_pair(OldReg, Op.getReg()));
1012     }
1013   }
1014 
1015   B.setInsertPt(*LoopBB, LoopBB->end());
1016 
1017   // Update EXEC, save the original EXEC value to NewExec.
1018   B.buildInstr(AndSaveExecOpc)
1019     .addDef(NewExec)
1020     .addReg(CondReg, RegState::Kill);
1021 
1022   MRI.setSimpleHint(NewExec, CondReg);
1023 
1024   // Update EXEC, switch all done bits to 0 and all todo bits to 1.
1025   B.buildInstr(XorTermOpc)
1026     .addDef(ExecReg)
1027     .addReg(ExecReg)
1028     .addReg(NewExec);
1029 
1030   // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
1031   // s_cbranch_scc0?
1032 
1033   // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
1034   B.buildInstr(AMDGPU::S_CBRANCH_EXECNZ)
1035     .addMBB(LoopBB);
1036 
1037   // Save the EXEC mask before the loop.
1038   BuildMI(MBB, MBB.end(), DL, TII->get(MovTermOpc), SaveExecReg)
1039     .addReg(ExecReg);
1040 
1041   // Restore the EXEC mask after the loop.
1042   B.setMBB(*RestoreExecBB);
1043   B.buildInstr(MovTermOpc)
1044     .addDef(ExecReg)
1045     .addReg(SaveExecReg);
1046 
1047   // Set the insert point after the original instruction, so any new
1048   // instructions will be in the remainder.
1049   B.setInsertPt(*RemainderBB, RemainderBB->begin());
1050 
1051   return true;
1052 }
1053 
1054 // Return any unique registers used by \p MI at \p OpIndices that need to be
1055 // handled in a waterfall loop. Returns these registers in \p
1056 // SGPROperandRegs. Returns true if there are any operands to handle and a
1057 // waterfall loop is necessary.
1058 bool AMDGPURegisterBankInfo::collectWaterfallOperands(
1059   SmallSet<Register, 4> &SGPROperandRegs, MachineInstr &MI,
1060   MachineRegisterInfo &MRI, ArrayRef<unsigned> OpIndices) const {
1061   for (unsigned Op : OpIndices) {
1062     assert(MI.getOperand(Op).isUse());
1063     Register Reg = MI.getOperand(Op).getReg();
1064     const RegisterBank *OpBank = getRegBank(Reg, MRI, *TRI);
1065     if (OpBank->getID() != AMDGPU::SGPRRegBankID)
1066       SGPROperandRegs.insert(Reg);
1067   }
1068 
1069   // No operands need to be replaced, so no need to loop.
1070   return !SGPROperandRegs.empty();
1071 }
1072 
1073 bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
1074   MachineIRBuilder &B, MachineInstr &MI, MachineRegisterInfo &MRI,
1075   ArrayRef<unsigned> OpIndices) const {
1076   // Use a set to avoid extra readfirstlanes in the case where multiple operands
1077   // are the same register.
1078   SmallSet<Register, 4> SGPROperandRegs;
1079 
1080   if (!collectWaterfallOperands(SGPROperandRegs, MI, MRI, OpIndices))
1081     return false;
1082 
1083   MachineBasicBlock::iterator I = MI.getIterator();
1084   return executeInWaterfallLoop(B, make_range(I, std::next(I)),
1085                                 SGPROperandRegs, MRI);
1086 }
1087 
1088 bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
1089   MachineInstr &MI, MachineRegisterInfo &MRI,
1090   ArrayRef<unsigned> OpIndices) const {
1091   MachineIRBuilder B(MI);
1092   return executeInWaterfallLoop(B, MI, MRI, OpIndices);
1093 }
1094 
1095 // Legalize an operand that must be an SGPR by inserting a readfirstlane.
1096 void AMDGPURegisterBankInfo::constrainOpWithReadfirstlane(
1097     MachineInstr &MI, MachineRegisterInfo &MRI, unsigned OpIdx) const {
1098   Register Reg = MI.getOperand(OpIdx).getReg();
1099   const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
1100   if (Bank == &AMDGPU::SGPRRegBank)
1101     return;
1102 
1103   LLT Ty = MRI.getType(Reg);
1104   MachineIRBuilder B(MI);
1105 
1106   if (Bank != &AMDGPU::VGPRRegBank) {
1107     // We need to copy from AGPR to VGPR
1108     Reg = B.buildCopy(Ty, Reg).getReg(0);
1109     MRI.setRegBank(Reg, AMDGPU::VGPRRegBank);
1110   }
1111 
1112   Register SGPR = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1113   B.buildInstr(AMDGPU::V_READFIRSTLANE_B32)
1114     .addDef(SGPR)
1115     .addReg(Reg);
1116 
1117   MRI.setType(SGPR, Ty);
1118 
1119   const TargetRegisterClass *Constrained =
1120       constrainGenericRegister(Reg, AMDGPU::VGPR_32RegClass, MRI);
1121   (void)Constrained;
1122   assert(Constrained && "Failed to constrain readfirstlane src reg");
1123 
1124   MI.getOperand(OpIdx).setReg(SGPR);
1125 }
1126 
1127 /// Split \p Ty into 2 pieces. The first will have \p FirstSize bits, and the
1128 /// rest will be in the remainder.
1129 static std::pair<LLT, LLT> splitUnequalType(LLT Ty, unsigned FirstSize) {
1130   unsigned TotalSize = Ty.getSizeInBits();
1131   if (!Ty.isVector())
1132     return {LLT::scalar(FirstSize), LLT::scalar(TotalSize - FirstSize)};
1133 
1134   LLT EltTy = Ty.getElementType();
1135   unsigned EltSize = EltTy.getSizeInBits();
1136   assert(FirstSize % EltSize == 0);
1137 
1138   unsigned FirstPartNumElts = FirstSize / EltSize;
1139   unsigned RemainderElts = (TotalSize - FirstSize) / EltSize;
1140 
1141   return {LLT::scalarOrVector(FirstPartNumElts, EltTy),
1142           LLT::scalarOrVector(RemainderElts, EltTy)};
1143 }
1144 
1145 static LLT widen96To128(LLT Ty) {
1146   if (!Ty.isVector())
1147     return LLT::scalar(128);
1148 
1149   LLT EltTy = Ty.getElementType();
1150   assert(128 % EltTy.getSizeInBits() == 0);
1151   return LLT::vector(128 / EltTy.getSizeInBits(), EltTy);
1152 }
1153 
1154 bool AMDGPURegisterBankInfo::applyMappingLoad(MachineInstr &MI,
1155                         const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
1156                                               MachineRegisterInfo &MRI) const {
1157   Register DstReg = MI.getOperand(0).getReg();
1158   const LLT LoadTy = MRI.getType(DstReg);
1159   unsigned LoadSize = LoadTy.getSizeInBits();
1160   const unsigned MaxNonSmrdLoadSize = 128;
1161 
1162   const RegisterBank *PtrBank =
1163     OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
1164   if (PtrBank == &AMDGPU::SGPRRegBank) {
1165     // If the pointer is an SGPR, we ordinarily have nothing to do.
1166     if (LoadSize != 96)
1167       return false;
1168 
1169     MachineMemOperand *MMO = *MI.memoperands_begin();
1170     Register PtrReg = MI.getOperand(1).getReg();
1171     // 96-bit loads are only available for vector loads. We need to split this
1172     // into a 64-bit part and a 32-bit part (unless we can widen to a 128-bit load).
1173 
1174     MachineIRBuilder B(MI);
1175     ApplyRegBankMapping O(*this, MRI, &AMDGPU::SGPRRegBank);
1176     GISelObserverWrapper Observer(&O);
1177     B.setChangeObserver(Observer);
1178 
1179     if (MMO->getAlign() < Align(16)) {
1180       LLT Part64, Part32;
1181       std::tie(Part64, Part32) = splitUnequalType(LoadTy, 64);
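      // Load the 64-bit piece at offset 0 and the remaining 32-bit piece at
      // byte offset 8, then reassemble the original 96-bit value.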
1182       auto Load0 = B.buildLoadFromOffset(Part64, PtrReg, *MMO, 0);
1183       auto Load1 = B.buildLoadFromOffset(Part32, PtrReg, *MMO, 8);
1184 
1185       auto Undef = B.buildUndef(LoadTy);
1186       auto Ins0 = B.buildInsert(LoadTy, Undef, Load0, 0);
1187       B.buildInsert(MI.getOperand(0), Ins0, Load1, 64);
1188     } else {
1189       LLT WiderTy = widen96To128(LoadTy);
1190       auto WideLoad = B.buildLoadFromOffset(WiderTy, PtrReg, *MMO, 0);
1191       B.buildExtract(MI.getOperand(0), WideLoad, 0);
1192     }
1193 
1194     MI.eraseFromParent();
1195     return true;
1196   }
1197 
1198   // 128-bit loads are supported for all instruction types.
1199   if (LoadSize <= MaxNonSmrdLoadSize)
1200     return false;
1201 
1202   SmallVector<Register, 16> DefRegs(OpdMapper.getVRegs(0));
1203   SmallVector<Register, 1> SrcRegs(OpdMapper.getVRegs(1));
1204 
1205   if (SrcRegs.empty())
1206     SrcRegs.push_back(MI.getOperand(1).getReg());
1207 
1208   assert(LoadSize % MaxNonSmrdLoadSize == 0);
1209 
1210   // RegBankSelect only emits scalar types, so we need to reset the pointer
1211   // operand to a pointer type.
1212   Register BasePtrReg = SrcRegs[0];
1213   LLT PtrTy = MRI.getType(MI.getOperand(1).getReg());
1214   MRI.setType(BasePtrReg, PtrTy);
1215 
1216   MachineIRBuilder B(MI);
1217 
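  // Use the legalizer to split the too-wide load into 128-bit
  // (MaxNonSmrdLoadSize) pieces.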
1218   unsigned NumSplitParts = LoadTy.getSizeInBits() / MaxNonSmrdLoadSize;
1219   const LLT LoadSplitTy = LoadTy.divide(NumSplitParts);
1220   ApplyRegBankMapping O(*this, MRI, &AMDGPU::VGPRRegBank);
1221   GISelObserverWrapper Observer(&O);
1222   B.setChangeObserver(Observer);
1223   LegalizerHelper Helper(B.getMF(), Observer, B);
1224 
1225   if (LoadTy.isVector()) {
1226     if (Helper.fewerElementsVector(MI, 0, LoadSplitTy) != LegalizerHelper::Legalized)
1227       return false;
1228   } else {
1229     if (Helper.narrowScalar(MI, 0, LoadSplitTy) != LegalizerHelper::Legalized)
1230       return false;
1231   }
1232 
1233   MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
1234   return true;
1235 }
1236 
1237 bool AMDGPURegisterBankInfo::applyMappingDynStackAlloc(
1238   MachineInstr &MI,
1239   const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
1240   MachineRegisterInfo &MRI) const {
1241   const MachineFunction &MF = *MI.getMF();
1242   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1243   const auto &TFI = *ST.getFrameLowering();
1244 
1245   // Guard in case the stack growth direction ever changes with scratch
1246   // instructions.
1247   if (TFI.getStackGrowthDirection() == TargetFrameLowering::StackGrowsDown)
1248     return false;
1249 
1250   Register Dst = MI.getOperand(0).getReg();
1251   Register AllocSize = MI.getOperand(1).getReg();
1252   Align Alignment = assumeAligned(MI.getOperand(2).getImm());
1253 
1254   const RegisterBank *SizeBank = getRegBank(AllocSize, MRI, *TRI);
1255 
1256   // TODO: Need to emit a wave reduction to get the maximum size.
1257   if (SizeBank != &AMDGPU::SGPRRegBank)
1258     return false;
1259 
1260   LLT PtrTy = MRI.getType(Dst);
1261   LLT IntPtrTy = LLT::scalar(PtrTy.getSizeInBits());
1262 
1263   const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1264   Register SPReg = Info->getStackPtrOffsetReg();
1265   ApplyRegBankMapping ApplyBank(*this, MRI, &AMDGPU::SGPRRegBank);
1266   GISelObserverWrapper Observer(&ApplyBank);
1267 
1268   MachineIRBuilder B(MI);
1269   B.setChangeObserver(Observer);
1270 
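  // Scratch is allocated per lane, so scale the per-lane allocation size by
  // the wave size to get the stack pointer increment.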
1271   auto WaveSize = B.buildConstant(LLT::scalar(32), ST.getWavefrontSizeLog2());
1272   auto ScaledSize = B.buildShl(IntPtrTy, AllocSize, WaveSize);
1273 
1274   auto SPCopy = B.buildCopy(PtrTy, SPReg);
1275   if (Alignment > TFI.getStackAlign()) {
1276     auto PtrAdd = B.buildPtrAdd(PtrTy, SPCopy, ScaledSize);
1277     B.buildMaskLowPtrBits(Dst, PtrAdd,
1278                           Log2(Alignment) + ST.getWavefrontSizeLog2());
1279   } else {
1280     B.buildPtrAdd(Dst, SPCopy, ScaledSize);
1281   }
1282 
1283   MI.eraseFromParent();
1284   return true;
1285 }
1286 
1287 bool AMDGPURegisterBankInfo::applyMappingImage(
1288     MachineInstr &MI, const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
1289     MachineRegisterInfo &MRI, int RsrcIdx) const {
1290   const int NumDefs = MI.getNumExplicitDefs();
1291 
1292   // The reported argument index is relative to the IR intrinsic call arguments,
1293   // so we need to shift by the number of defs and the intrinsic ID.
1294   RsrcIdx += NumDefs + 1;
1295 
1296   // Insert copies to VGPR arguments.
1297   applyDefaultMapping(OpdMapper);
1298 
1299   // Fixup any SGPR arguments.
1300   SmallVector<unsigned, 4> SGPRIndexes;
1301   for (int I = NumDefs, NumOps = MI.getNumOperands(); I != NumOps; ++I) {
1302     if (!MI.getOperand(I).isReg())
1303       continue;
1304 
1305     // If this intrinsic has a sampler, it immediately follows rsrc.
1306     if (I == RsrcIdx || I == RsrcIdx + 1)
1307       SGPRIndexes.push_back(I);
1308   }
1309 
1310   executeInWaterfallLoop(MI, MRI, SGPRIndexes);
1311   return true;
1312 }
1313 
1314 static Register getSrcRegIgnoringCopies(const MachineRegisterInfo &MRI,
1315                                         Register Reg) {
1316   MachineInstr *Def = getDefIgnoringCopies(Reg, MRI);
1317   if (!Def)
1318     return Reg;
1319 
1320   // TODO: Guard against this being an implicit def
1321   return Def->getOperand(0).getReg();
1322 }
1323 
1324 // Analyze a combined offset from an llvm.amdgcn.s.buffer intrinsic and store
1325 // the three offsets (voffset, soffset and instoffset)
1326 static unsigned setBufferOffsets(MachineIRBuilder &B,
1327                                  const AMDGPURegisterBankInfo &RBI,
1328                                  Register CombinedOffset, Register &VOffsetReg,
1329                                  Register &SOffsetReg, int64_t &InstOffsetVal,
1330                                  Align Alignment) {
1331   const LLT S32 = LLT::scalar(32);
1332   MachineRegisterInfo *MRI = B.getMRI();
1333 
1334   if (Optional<int64_t> Imm = getConstantVRegVal(CombinedOffset, *MRI)) {
1335     uint32_t SOffset, ImmOffset;
1336     if (AMDGPU::splitMUBUFOffset(*Imm, SOffset, ImmOffset, &RBI.Subtarget,
1337                                  Alignment)) {
1338       VOffsetReg = B.buildConstant(S32, 0).getReg(0);
1339       SOffsetReg = B.buildConstant(S32, SOffset).getReg(0);
1340       InstOffsetVal = ImmOffset;
1341 
1342       B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank);
1343       B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank);
1344       return SOffset + ImmOffset;
1345     }
1346   }
1347 
1348   Register Base;
1349   unsigned Offset;
1350 
1351   std::tie(Base, Offset) =
1352       AMDGPU::getBaseWithConstantOffset(*MRI, CombinedOffset);
1353 
1354   uint32_t SOffset, ImmOffset;
1355   if (Offset > 0 && AMDGPU::splitMUBUFOffset(Offset, SOffset, ImmOffset,
1356                                              &RBI.Subtarget, Alignment)) {
1357     if (RBI.getRegBank(Base, *MRI, *RBI.TRI) == &AMDGPU::VGPRRegBank) {
1358       VOffsetReg = Base;
1359       SOffsetReg = B.buildConstant(S32, SOffset).getReg(0);
1360       B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank);
1361       InstOffsetVal = ImmOffset;
1362       return 0; // XXX - Why is this 0?
1363     }
1364 
1365     // If we have an SGPR base, we can use it for soffset.
1366     if (SOffset == 0) {
1367       VOffsetReg = B.buildConstant(S32, 0).getReg(0);
1368       B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank);
1369       SOffsetReg = Base;
1370       InstOffsetVal = ImmOffset;
1371       return 0; // XXX - Why is this 0?
1372     }
1373   }
1374 
1375   // Handle the variable sgpr + vgpr case.
1376   if (MachineInstr *Add = getOpcodeDef(AMDGPU::G_ADD, CombinedOffset, *MRI)) {
1377     Register Src0 = getSrcRegIgnoringCopies(*MRI, Add->getOperand(1).getReg());
1378     Register Src1 = getSrcRegIgnoringCopies(*MRI, Add->getOperand(2).getReg());
1379 
1380     const RegisterBank *Src0Bank = RBI.getRegBank(Src0, *MRI, *RBI.TRI);
1381     const RegisterBank *Src1Bank = RBI.getRegBank(Src1, *MRI, *RBI.TRI);
1382 
1383     if (Src0Bank == &AMDGPU::VGPRRegBank && Src1Bank == &AMDGPU::SGPRRegBank) {
1384       VOffsetReg = Src0;
1385       SOffsetReg = Src1;
1386       return 0;
1387     }
1388 
1389     if (Src0Bank == &AMDGPU::SGPRRegBank && Src1Bank == &AMDGPU::VGPRRegBank) {
1390       VOffsetReg = Src1;
1391       SOffsetReg = Src0;
1392       return 0;
1393     }
1394   }
1395 
1396   // Ensure we have a VGPR for the combined offset. This could be an issue if we
1397   // have an SGPR offset and a VGPR resource.
1398   if (RBI.getRegBank(CombinedOffset, *MRI, *RBI.TRI) == &AMDGPU::VGPRRegBank) {
1399     VOffsetReg = CombinedOffset;
1400   } else {
1401     VOffsetReg = B.buildCopy(S32, CombinedOffset).getReg(0);
1402     B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank);
1403   }
1404 
1405   SOffsetReg = B.buildConstant(S32, 0).getReg(0);
1406   B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank);
1407   return 0;
1408 }
1409 
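// Handle an S_BUFFER_LOAD whose operands did not all map to SGPRs. The load is
// rewritten as one or more MUBUF buffer loads (splitting 256/512-bit results
// into 128-bit pieces), with a waterfall loop around them if the resource
// itself is divergent.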
1410 bool AMDGPURegisterBankInfo::applyMappingSBufferLoad(
1411   const OperandsMapper &OpdMapper) const {
1412   MachineInstr &MI = OpdMapper.getMI();
1413   MachineRegisterInfo &MRI = OpdMapper.getMRI();
1414 
1415   const LLT S32 = LLT::scalar(32);
1416   Register Dst = MI.getOperand(0).getReg();
1417   LLT Ty = MRI.getType(Dst);
1418 
1419   const RegisterBank *RSrcBank =
1420     OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
1421   const RegisterBank *OffsetBank =
1422     OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
1423   if (RSrcBank == &AMDGPU::SGPRRegBank &&
1424       OffsetBank == &AMDGPU::SGPRRegBank)
1425     return true; // Legal mapping
1426 
1427   // FIXME: 96-bit case was widened during legalize. We need to narrow it back
1428   // here but don't have an MMO.
1429 
1430   unsigned LoadSize = Ty.getSizeInBits();
1431   int NumLoads = 1;
1432   if (LoadSize == 256 || LoadSize == 512) {
1433     NumLoads = LoadSize / 128;
1434     Ty = Ty.divide(NumLoads);
1435   }
1436 
1437   // Use the alignment to ensure that the required offsets will fit into the
1438   // immediate offsets.
1439   const Align Alignment = NumLoads > 1 ? Align(16 * NumLoads) : Align(1);
1440 
1441   MachineIRBuilder B(MI);
1442   MachineFunction &MF = B.getMF();
1443 
1444   Register SOffset;
1445   Register VOffset;
1446   int64_t ImmOffset = 0;
1447 
1448   unsigned MMOOffset = setBufferOffsets(B, *this, MI.getOperand(2).getReg(),
1449                                         VOffset, SOffset, ImmOffset, Alignment);
1450 
1451   // TODO: 96-bit loads were widened to 128-bit results. Shrink the result if we
1452   // can, but we need to track an MMO for that.
1453   const unsigned MemSize = (Ty.getSizeInBits() + 7) / 8;
1454   const Align MemAlign(4); // FIXME: ABI type alignment?
1455   MachineMemOperand *BaseMMO = MF.getMachineMemOperand(
1456     MachinePointerInfo(),
1457     MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1458     MachineMemOperand::MOInvariant,
1459     MemSize, MemAlign);
1460   if (MMOOffset != 0)
1461     BaseMMO = MF.getMachineMemOperand(BaseMMO, MMOOffset, MemSize);
1462 
1463   // If only the offset is divergent, emit a MUBUF buffer load instead. We can
1464   // assume that the buffer is unswizzled.
1465 
1466   Register RSrc = MI.getOperand(1).getReg();
1467   Register VIndex = B.buildConstant(S32, 0).getReg(0);
1468   B.getMRI()->setRegBank(VIndex, AMDGPU::VGPRRegBank);
1469 
1470   SmallVector<Register, 4> LoadParts(NumLoads);
1471 
1472   MachineBasicBlock::iterator MII = MI.getIterator();
1473   MachineInstrSpan Span(MII, &B.getMBB());
1474 
1475   for (int i = 0; i < NumLoads; ++i) {
1476     if (NumLoads == 1) {
1477       LoadParts[i] = Dst;
1478     } else {
1479       LoadParts[i] = MRI.createGenericVirtualRegister(Ty);
1480       MRI.setRegBank(LoadParts[i], AMDGPU::VGPRRegBank);
1481     }
1482 
1483     MachineMemOperand *MMO = BaseMMO;
1484     if (i != 0)
1485       BaseMMO = MF.getMachineMemOperand(BaseMMO, MMOOffset + 16 * i, MemSize);
1486 
1487     B.buildInstr(AMDGPU::G_AMDGPU_BUFFER_LOAD)
1488       .addDef(LoadParts[i])       // vdata
1489       .addUse(RSrc)               // rsrc
1490       .addUse(VIndex)             // vindex
1491       .addUse(VOffset)            // voffset
1492       .addUse(SOffset)            // soffset
1493       .addImm(ImmOffset + 16 * i) // offset(imm)
1494       .addImm(0)                  // cachepolicy, swizzled buffer(imm)
1495       .addImm(0)                  // idxen(imm)
1496       .addMemOperand(MMO);
1497   }
1498 
1499   // TODO: If only the resource is a VGPR, it may be better to execute the
1500   // scalar load in the waterfall loop if the resource is expected to frequently
1501   // be dynamically uniform.
1502   if (RSrcBank != &AMDGPU::SGPRRegBank) {
1503     // Remove the original instruction to avoid potentially confusing the
1504     // waterfall loop logic.
1505     B.setInstr(*Span.begin());
1506     MI.eraseFromParent();
1507 
1508     SmallSet<Register, 4> OpsToWaterfall;
1509 
1510     OpsToWaterfall.insert(RSrc);
1511     executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()),
1512                            OpsToWaterfall, MRI);
1513   }
1514 
1515   if (NumLoads != 1) {
1516     if (Ty.isVector())
1517       B.buildConcatVectors(Dst, LoadParts);
1518     else
1519       B.buildMerge(Dst, LoadParts);
1520   }
1521 
1522   // The original was already erased above if we used a waterfall loop.
1523   if (RSrcBank == &AMDGPU::SGPRRegBank)
1524     MI.eraseFromParent();
1525 
1526   return true;
1527 }
1528 
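// Lower the 32/64-bit BFE intrinsics to S_BFE_* when the result is uniform,
// packing the offset and width into the single operand the scalar form
// expects. Divergent results are only handled for the 32-bit case.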
1529 bool AMDGPURegisterBankInfo::applyMappingBFEIntrinsic(
1530   const OperandsMapper &OpdMapper, bool Signed) const {
1531   MachineInstr &MI = OpdMapper.getMI();
1532   MachineRegisterInfo &MRI = OpdMapper.getMRI();
1533 
1534   // Insert basic copies
1535   applyDefaultMapping(OpdMapper);
1536 
1537   Register DstReg = MI.getOperand(0).getReg();
1538   LLT Ty = MRI.getType(DstReg);
1539 
1540   const LLT S32 = LLT::scalar(32);
1541 
1542   const RegisterBank *DstBank =
1543     OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
1544   if (DstBank == &AMDGPU::VGPRRegBank) {
1545     if (Ty == S32)
1546       return true;
1547 
1548     // TODO: 64-bit version is scalar only, so we need to expand this.
1549     return false;
1550   }
1551 
1552   Register SrcReg = MI.getOperand(2).getReg();
1553   Register OffsetReg = MI.getOperand(3).getReg();
1554   Register WidthReg = MI.getOperand(4).getReg();
1555 
1556   // The scalar form packs the offset and width in a single operand.
1557 
1558   ApplyRegBankMapping ApplyBank(*this, MRI, &AMDGPU::SGPRRegBank);
1559   GISelObserverWrapper Observer(&ApplyBank);
1560   MachineIRBuilder B(MI);
1561   B.setChangeObserver(Observer);
1562 
1563   // Ensure the high bits are clear to insert the offset.
1564   auto OffsetMask = B.buildConstant(S32, maskTrailingOnes<unsigned>(6));
1565   auto ClampOffset = B.buildAnd(S32, OffsetReg, OffsetMask);
1566 
1567   // Zeros out the low bits, so don't bother clamping the input value.
1568   auto ShiftWidth = B.buildShl(S32, WidthReg, B.buildConstant(S32, 16));
1569 
1570   // Pack the offset and width of a BFE into the format expected by
1571   // S_BFE_I32 / S_BFE_U32: in the second source operand, bits [5:0] hold the
1572   // offset and bits [22:16] the width.
1573   auto MergedInputs = B.buildOr(S32, ClampOffset, ShiftWidth);
1574 
1575   // TODO: It might be worth using a pseudo here to avoid scc clobber and
1576   // register class constraints.
1577   unsigned Opc = Ty == S32 ? (Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32) :
1578                              (Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64);
1579 
1580   auto MIB = B.buildInstr(Opc, {DstReg}, {SrcReg, MergedInputs});
1581   if (!constrainSelectedInstRegOperands(*MIB, *TII, *TRI, *this))
1582     llvm_unreachable("failed to constrain BFE");
1583 
1584   MI.eraseFromParent();
1585   return true;
1586 }
1587 
1588 // FIXME: Duplicated from LegalizerHelper
1589 static CmpInst::Predicate minMaxToCompare(unsigned Opc) {
1590   switch (Opc) {
1591   case TargetOpcode::G_SMIN:
1592     return CmpInst::ICMP_SLT;
1593   case TargetOpcode::G_SMAX:
1594     return CmpInst::ICMP_SGT;
1595   case TargetOpcode::G_UMIN:
1596     return CmpInst::ICMP_ULT;
1597   case TargetOpcode::G_UMAX:
1598     return CmpInst::ICMP_UGT;
1599   default:
1600     llvm_unreachable("not in integer min/max");
1601   }
1602 }
1603 
1604 static unsigned minMaxToExtend(unsigned Opc) {
1605   switch (Opc) {
1606   case TargetOpcode::G_SMIN:
1607   case TargetOpcode::G_SMAX:
1608     return TargetOpcode::G_SEXT;
1609   case TargetOpcode::G_UMIN:
1610   case TargetOpcode::G_UMAX:
1611     return TargetOpcode::G_ZEXT;
1612   default:
1613     llvm_unreachable("not in integer min/max");
1614   }
1615 }
1616 
1617 // Emit a legalized extension from <2 x s16> to 2 32-bit components, avoiding
1618 // any illegal vector extend or unmerge operations.
1619 static std::pair<Register, Register>
1620 unpackV2S16ToS32(MachineIRBuilder &B, Register Src, unsigned ExtOpcode) {
1621   const LLT S32 = LLT::scalar(32);
1622   auto Bitcast = B.buildBitcast(S32, Src);
1623 
1624   if (ExtOpcode == TargetOpcode::G_SEXT) {
1625     auto ExtLo = B.buildSExtInReg(S32, Bitcast, 16);
1626     auto ShiftHi = B.buildAShr(S32, Bitcast, B.buildConstant(S32, 16));
1627     return std::make_pair(ExtLo.getReg(0), ShiftHi.getReg(0));
1628   }
1629 
1630   auto ShiftHi = B.buildLShr(S32, Bitcast, B.buildConstant(S32, 16));
1631   if (ExtOpcode == TargetOpcode::G_ZEXT) {
1632     auto ExtLo = B.buildAnd(S32, Bitcast, B.buildConstant(S32, 0xffff));
1633     return std::make_pair(ExtLo.getReg(0), ShiftHi.getReg(0));
1634   }
1635 
1636   assert(ExtOpcode == TargetOpcode::G_ANYEXT);
1637   return std::make_pair(Bitcast.getReg(0), ShiftHi.getReg(0));
1638 }
1639 
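// Expand an integer min/max into an icmp + select, using a 32-bit condition
// type as required for scalar (SGPR) booleans.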
1640 static MachineInstr *buildExpandedScalarMinMax(MachineIRBuilder &B,
1641                                                CmpInst::Predicate Pred,
1642                                                Register Dst, Register Src0,
1643                                                Register Src1) {
1644   const LLT CmpType = LLT::scalar(32);
1645   auto Cmp = B.buildICmp(Pred, CmpType, Src0, Src1);
1646   return B.buildSelect(Dst, Cmp, Src0, Src1);
1647 }
1648 
1649 // FIXME: Duplicated from LegalizerHelper, except changing the boolean type.
1650 void AMDGPURegisterBankInfo::lowerScalarMinMax(MachineIRBuilder &B,
1651                                                MachineInstr &MI) const {
1652   Register Dst = MI.getOperand(0).getReg();
1653   Register Src0 = MI.getOperand(1).getReg();
1654   Register Src1 = MI.getOperand(2).getReg();
1655 
1656   const CmpInst::Predicate Pred = minMaxToCompare(MI.getOpcode());
1657   MachineInstr *Sel = buildExpandedScalarMinMax(B, Pred, Dst, Src0, Src1);
1658 
1659   Register CmpReg = Sel->getOperand(1).getReg();
1660   B.getMRI()->setRegBank(CmpReg, AMDGPU::SGPRRegBank);
1661   MI.eraseFromParent();
1662 }
1663 
1664 // For cases where only a single copy is inserted for matching register banks,
1665 // replace the register in the instruction operand.
1666 static bool substituteSimpleCopyRegs(
1667   const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper, unsigned OpIdx) {
1668   SmallVector<unsigned, 1> SrcReg(OpdMapper.getVRegs(OpIdx));
1669   if (!SrcReg.empty()) {
1670     assert(SrcReg.size() == 1);
1671     OpdMapper.getMI().getOperand(OpIdx).setReg(SrcReg[0]);
1672     return true;
1673   }
1674 
1675   return false;
1676 }
1677 
1678 /// Handle register layout difference for f16 images for some subtargets.
1679 Register AMDGPURegisterBankInfo::handleD16VData(MachineIRBuilder &B,
1680                                                 MachineRegisterInfo &MRI,
1681                                                 Register Reg) const {
1682   if (!Subtarget.hasUnpackedD16VMem())
1683     return Reg;
1684 
1685   const LLT S16 = LLT::scalar(16);
1686   LLT StoreVT = MRI.getType(Reg);
1687   if (!StoreVT.isVector() || StoreVT.getElementType() != S16)
1688     return Reg;
1689 
1690   auto Unmerge = B.buildUnmerge(S16, Reg);
1691 
1692 
1693   SmallVector<Register, 4> WideRegs;
1694   for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
1695     WideRegs.push_back(Unmerge.getReg(I));
1696 
1697   const LLT S32 = LLT::scalar(32);
1698   int NumElts = StoreVT.getNumElements();
1699 
1700   return B.buildMerge(LLT::vector(NumElts, S32), WideRegs).getReg(0);
1701 }
1702 
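// Match either a plain constant or (add base, constant), returning the base
// register and constant offset. A null base register means the whole value is
// the returned constant.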
1703 static std::pair<Register, unsigned>
1704 getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg) {
1705   int64_t Const;
1706   if (mi_match(Reg, MRI, m_ICst(Const)))
1707     return std::make_pair(Register(), Const);
1708 
1709   Register Base;
1710   if (mi_match(Reg, MRI, m_GAdd(m_Reg(Base), m_ICst(Const))))
1711     return std::make_pair(Base, Const);
1712 
1713   // TODO: Handle G_OR used for add case
1714   return std::make_pair(Reg, 0);
1715 }
1716 
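// Split a buffer offset into a base register and an immediate that fits in
// the MUBUF immediate offset field (at most 4095).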
1717 std::pair<Register, unsigned>
1718 AMDGPURegisterBankInfo::splitBufferOffsets(MachineIRBuilder &B,
1719                                            Register OrigOffset) const {
1720   const unsigned MaxImm = 4095;
1721   Register BaseReg;
1722   unsigned ImmOffset;
1723   const LLT S32 = LLT::scalar(32);
1724 
1725   std::tie(BaseReg, ImmOffset) = getBaseWithConstantOffset(*B.getMRI(),
1726                                                            OrigOffset);
1727 
1728   unsigned C1 = 0;
1729   if (ImmOffset != 0) {
1730     // If the immediate value is too big for the immoffset field, put the value
1731     // and -4096 into the immoffset field so that the value that is copied/added
1732     // for the voffset field is a multiple of 4096, and it stands more chance
1733     // of being CSEd with the copy/add for another similar load/store.
1734     // However, do not do that rounding down to a multiple of 4096 if that is a
1735     // negative number, as it appears to be illegal to have a negative offset
1736     // in the vgpr, even if adding the immediate offset makes it positive.
1737     unsigned Overflow = ImmOffset & ~MaxImm;
1738     ImmOffset -= Overflow;
1739     if ((int32_t)Overflow < 0) {
1740       Overflow += ImmOffset;
1741       ImmOffset = 0;
1742     }
1743 
1744     C1 = ImmOffset;
1745     if (Overflow != 0) {
1746       if (!BaseReg)
1747         BaseReg = B.buildConstant(S32, Overflow).getReg(0);
1748       else {
1749         auto OverflowVal = B.buildConstant(S32, Overflow);
1750         BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
1751       }
1752     }
1753   }
1754 
1755   if (!BaseReg)
1756     BaseReg = B.buildConstant(S32, 0).getReg(0);
1757 
1758   return {BaseReg, C1};
1759 }
1760 
1761 static bool isZero(Register Reg, MachineRegisterInfo &MRI) {
1762   int64_t C;
1763   return mi_match(Reg, MRI, m_ICst(C)) && C == 0;
1764 }
1765 
1766 static unsigned extractGLC(unsigned CachePolicy) {
1767   return CachePolicy & 1;
1768 }
1769 
1770 static unsigned extractSLC(unsigned CachePolicy) {
1771   return (CachePolicy >> 1) & 1;
1772 }
1773 
1774 static unsigned extractDLC(unsigned CachePolicy) {
1775   return (CachePolicy >> 2) & 1;
1776 }
1777 
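// Select a raw buffer store intrinsic directly to a MUBUF store instruction,
// first running a waterfall loop over the rsrc and soffset operands.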
1778 MachineInstr *
1779 AMDGPURegisterBankInfo::selectStoreIntrinsic(MachineIRBuilder &B,
1780                                              MachineInstr &MI) const {
1781   MachineRegisterInfo &MRI = *B.getMRI();
1782   executeInWaterfallLoop(B, MI, MRI, {2, 4});
1783 
1784   // FIXME: DAG lowering brokenly changes opcode based on FP vs. integer.
1785 
1786   Register VData = MI.getOperand(1).getReg();
1787   LLT Ty = MRI.getType(VData);
1788 
1789   int EltSize = Ty.getScalarSizeInBits();
1790   int Size = Ty.getSizeInBits();
1791 
1792   // FIXME: Broken integer truncstore.
1793   if (EltSize != 32)
1794     report_fatal_error("unhandled intrinsic store");
1795 
1796   // FIXME: Verifier should enforce 1 MMO for these intrinsics.
1797   const int MemSize = (*MI.memoperands_begin())->getSize();
1798 
1799 
1800   Register RSrc = MI.getOperand(2).getReg();
1801   Register VOffset = MI.getOperand(3).getReg();
1802   Register SOffset = MI.getOperand(4).getReg();
1803   unsigned CachePolicy = MI.getOperand(5).getImm();
1804 
1805   unsigned ImmOffset;
1806   std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
1807 
1808   const bool Offen = !isZero(VOffset, MRI);
1809 
1810   unsigned Opc = AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact;
1811   switch (8 * MemSize) {
1812   case 8:
1813     Opc = Offen ? AMDGPU::BUFFER_STORE_BYTE_OFFEN_exact :
1814                   AMDGPU::BUFFER_STORE_BYTE_OFFSET_exact;
1815     break;
1816   case 16:
1817     Opc = Offen ? AMDGPU::BUFFER_STORE_SHORT_OFFEN_exact :
1818                   AMDGPU::BUFFER_STORE_SHORT_OFFSET_exact;
1819     break;
1820   default:
1821     Opc = Offen ? AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact :
1822                   AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact;
1823     if (Size > 32)
1824       Opc = AMDGPU::getMUBUFOpcode(Opc, Size / 32);
1825     break;
1826   }
1827 
1828 
1829   // Set the insertion point back to the instruction in case it was moved into a
1830   // loop.
1831   B.setInstr(MI);
1832 
1833   MachineInstrBuilder MIB = B.buildInstr(Opc)
1834     .addUse(VData);
1835 
1836   if (Offen)
1837     MIB.addUse(VOffset);
1838 
1839   MIB.addUse(RSrc)
1840      .addUse(SOffset)
1841      .addImm(ImmOffset)
1842      .addImm(extractGLC(CachePolicy))
1843      .addImm(extractSLC(CachePolicy))
1844      .addImm(0) // tfe: FIXME: Remove from inst
1845      .addImm(extractDLC(CachePolicy))
1846      .cloneMemRefs(MI);
1847 
1848   // FIXME: We need a way to report failure from applyMappingImpl.
1849   // Insert constrain copies before inserting the loop.
1850   if (!constrainSelectedInstRegOperands(*MIB, *TII, *TRI, *this))
1851     report_fatal_error("failed to constrain selected store intrinsic");
1852 
1853   return MIB;
1854 }
1855 
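// Copy an SGPR value to a VGPR using V_MOV_B32 (a pair of moves plus a
// REG_SEQUENCE for 64-bit values) so the implicit EXEC dependency is explicit.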
1856 bool AMDGPURegisterBankInfo::buildVCopy(MachineIRBuilder &B, Register DstReg,
1857                                         Register SrcReg) const {
1858   MachineRegisterInfo &MRI = *B.getMRI();
1859   LLT SrcTy = MRI.getType(SrcReg);
1860   if (SrcTy.getSizeInBits() == 32) {
1861     // Use a v_mov_b32 here to make the exec dependency explicit.
1862     B.buildInstr(AMDGPU::V_MOV_B32_e32)
1863       .addDef(DstReg)
1864       .addUse(SrcReg);
1865     return constrainGenericRegister(DstReg, AMDGPU::VGPR_32RegClass, MRI) &&
1866            constrainGenericRegister(SrcReg, AMDGPU::SReg_32RegClass, MRI);
1867   }
1868 
1869   Register TmpReg0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1870   Register TmpReg1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1871 
1872   B.buildInstr(AMDGPU::V_MOV_B32_e32)
1873     .addDef(TmpReg0)
1874     .addUse(SrcReg, 0, AMDGPU::sub0);
1875   B.buildInstr(AMDGPU::V_MOV_B32_e32)
1876     .addDef(TmpReg1)
1877     .addUse(SrcReg, 0, AMDGPU::sub1);
1878   B.buildInstr(AMDGPU::REG_SEQUENCE)
1879     .addDef(DstReg)
1880     .addUse(TmpReg0)
1881     .addImm(AMDGPU::sub0)
1882     .addUse(TmpReg1)
1883     .addImm(AMDGPU::sub1);
1884 
1885   return constrainGenericRegister(SrcReg, AMDGPU::SReg_64RegClass, MRI) &&
1886          constrainGenericRegister(DstReg, AMDGPU::VReg_64RegClass, MRI);
1887 }
1888 
1889 /// Utility function for pushing dynamic vector indexes with a constant offset
1890 /// into waterfall loops.
1891 static void reinsertVectorIndexAdd(MachineIRBuilder &B,
1892                                    MachineInstr &IdxUseInstr,
1893                                    unsigned OpIdx,
1894                                    unsigned ConstOffset) {
1895   MachineRegisterInfo &MRI = *B.getMRI();
1896   const LLT S32 = LLT::scalar(32);
1897   Register WaterfallIdx = IdxUseInstr.getOperand(OpIdx).getReg();
1898   B.setInsertPt(*IdxUseInstr.getParent(), IdxUseInstr.getIterator());
1899 
1900   auto MaterializedOffset = B.buildConstant(S32, ConstOffset);
1901 
1902   auto Add = B.buildAdd(S32, WaterfallIdx, MaterializedOffset);
1903   MRI.setRegBank(MaterializedOffset.getReg(0), AMDGPU::SGPRRegBank);
1904   MRI.setRegBank(Add.getReg(0), AMDGPU::SGPRRegBank);
1905   IdxUseInstr.getOperand(OpIdx).setReg(Add.getReg(0));
1906 }
1907 
1908 /// Implement extending a 32-bit value to a 64-bit value. \p Lo32Reg is the
1909 /// original 32-bit source value (to be inserted in the low part of the combined
1910 /// 64-bit result), and \p Hi32Reg is the high half of the combined 64-bit
1911 /// value.
1912 static void extendLow32IntoHigh32(MachineIRBuilder &B,
1913                                   Register Hi32Reg, Register Lo32Reg,
1914                                   unsigned ExtOpc,
1915                                   const RegisterBank &RegBank,
1916                                   bool IsBooleanSrc = false) {
1917   if (ExtOpc == AMDGPU::G_ZEXT) {
1918     B.buildConstant(Hi32Reg, 0);
1919   } else if (ExtOpc == AMDGPU::G_SEXT) {
1920     if (IsBooleanSrc) {
1921       // If we know the original source was an s1, the high half is the same as
1922       // the low.
1923       B.buildCopy(Hi32Reg, Lo32Reg);
1924     } else {
1925       // Replicate sign bit from 32-bit extended part.
1926       auto ShiftAmt = B.buildConstant(LLT::scalar(32), 31);
1927       B.getMRI()->setRegBank(ShiftAmt.getReg(0), RegBank);
1928       B.buildAShr(Hi32Reg, Lo32Reg, ShiftAmt);
1929     }
1930   } else {
1931     assert(ExtOpc == AMDGPU::G_ANYEXT && "not an integer extension");
1932     B.buildUndef(Hi32Reg);
1933   }
1934 }
1935 
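// Expand a dynamically indexed G_EXTRACT_VECTOR_ELT into a chain of compares
// and selects over the unmerged elements when SITargetLowering reports the
// expansion is worthwhile for this element size and count.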
1936 bool AMDGPURegisterBankInfo::foldExtractEltToCmpSelect(
1937   MachineInstr &MI, MachineRegisterInfo &MRI,
1938   const OperandsMapper &OpdMapper) const {
1939 
1940   Register VecReg = MI.getOperand(1).getReg();
1941   Register Idx = MI.getOperand(2).getReg();
1942 
1943   const RegisterBank &IdxBank =
1944     *OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
1945 
1946   bool IsDivergentIdx = IdxBank != AMDGPU::SGPRRegBank;
1947 
1948   LLT VecTy = MRI.getType(VecReg);
1949   unsigned EltSize = VecTy.getScalarSizeInBits();
1950   unsigned NumElem = VecTy.getNumElements();
1951 
1952   if (!SITargetLowering::shouldExpandVectorDynExt(EltSize, NumElem,
1953                                                   IsDivergentIdx))
1954     return false;
1955 
1956   MachineIRBuilder B(MI);
1957   LLT S32 = LLT::scalar(32);
1958 
1959   const RegisterBank &DstBank =
1960     *OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
1961   const RegisterBank &SrcBank =
1962     *OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
1963 
1964   const RegisterBank &CCBank =
1965     (DstBank == AMDGPU::SGPRRegBank &&
1966      SrcBank == AMDGPU::SGPRRegBank &&
1967      IdxBank == AMDGPU::SGPRRegBank) ? AMDGPU::SGPRRegBank
1968                                      : AMDGPU::VCCRegBank;
1969   LLT CCTy = (CCBank == AMDGPU::SGPRRegBank) ? S32 : LLT::scalar(1);
1970 
1971   if (CCBank == AMDGPU::VCCRegBank && IdxBank == AMDGPU::SGPRRegBank) {
1972     Idx = B.buildCopy(S32, Idx)->getOperand(0).getReg();
1973     MRI.setRegBank(Idx, AMDGPU::VGPRRegBank);
1974   }
1975 
1976   LLT EltTy = VecTy.getScalarType();
1977   SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0));
1978   unsigned NumLanes = DstRegs.size();
1979   if (!NumLanes)
1980     NumLanes = 1;
1981   else
1982     EltTy = MRI.getType(DstRegs[0]);
1983 
1984   auto UnmergeToEltTy = B.buildUnmerge(EltTy, VecReg);
1985   SmallVector<Register, 2> Res(NumLanes);
1986   for (unsigned L = 0; L < NumLanes; ++L)
1987     Res[L] = UnmergeToEltTy.getReg(L);
1988 
1989   for (unsigned I = 1; I < NumElem; ++I) {
1990     auto IC = B.buildConstant(S32, I);
1991     MRI.setRegBank(IC->getOperand(0).getReg(), AMDGPU::SGPRRegBank);
1992     auto Cmp = B.buildICmp(CmpInst::ICMP_EQ, CCTy, Idx, IC);
1993     MRI.setRegBank(Cmp->getOperand(0).getReg(), CCBank);
1994 
1995     for (unsigned L = 0; L < NumLanes; ++L) {
1996       auto S = B.buildSelect(EltTy, Cmp,
1997                              UnmergeToEltTy.getReg(I * NumLanes + L), Res[L]);
1998 
1999       for (unsigned N : { 0, 2, 3 })
2000         MRI.setRegBank(S->getOperand(N).getReg(), DstBank);
2001 
2002       Res[L] = S->getOperand(0).getReg();
2003     }
2004   }
2005 
2006   for (unsigned L = 0; L < NumLanes; ++L) {
2007     Register DstReg = (NumLanes == 1) ? MI.getOperand(0).getReg() : DstRegs[L];
2008     B.buildCopy(DstReg, Res[L]);
2009     MRI.setRegBank(DstReg, DstBank);
2010   }
2011 
2012   MRI.setRegBank(MI.getOperand(0).getReg(), DstBank);
2013   MI.eraseFromParent();
2014 
2015   return true;
2016 }
2017 
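// Expand a dynamically indexed G_INSERT_VECTOR_ELT the same way, selecting
// between the inserted value and each original element based on the index.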
2018 bool AMDGPURegisterBankInfo::foldInsertEltToCmpSelect(
2019   MachineInstr &MI, MachineRegisterInfo &MRI,
2020   const OperandsMapper &OpdMapper) const {
2021 
2022   Register VecReg = MI.getOperand(1).getReg();
2023   Register Idx = MI.getOperand(3).getReg();
2024 
2025   const RegisterBank &IdxBank =
2026     *OpdMapper.getInstrMapping().getOperandMapping(3).BreakDown[0].RegBank;
2027 
2028   bool IsDivergentIdx = IdxBank != AMDGPU::SGPRRegBank;
2029 
2030   LLT VecTy = MRI.getType(VecReg);
2031   unsigned EltSize = VecTy.getScalarSizeInBits();
2032   unsigned NumElem = VecTy.getNumElements();
2033 
2034   if (!SITargetLowering::shouldExpandVectorDynExt(EltSize, NumElem,
2035                                                   IsDivergentIdx))
2036     return false;
2037 
2038   MachineIRBuilder B(MI);
2039   LLT S32 = LLT::scalar(32);
2040 
2041   const RegisterBank &DstBank =
2042     *OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2043   const RegisterBank &SrcBank =
2044     *OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
2045   const RegisterBank &InsBank =
2046     *OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
2047 
2048   const RegisterBank &CCBank =
2049     (DstBank == AMDGPU::SGPRRegBank &&
2050      SrcBank == AMDGPU::SGPRRegBank &&
2051      InsBank == AMDGPU::SGPRRegBank &&
2052      IdxBank == AMDGPU::SGPRRegBank) ? AMDGPU::SGPRRegBank
2053                                      : AMDGPU::VCCRegBank;
2054   LLT CCTy = (CCBank == AMDGPU::SGPRRegBank) ? S32 : LLT::scalar(1);
2055 
2056   if (CCBank == AMDGPU::VCCRegBank && IdxBank == AMDGPU::SGPRRegBank) {
2057     Idx = B.buildCopy(S32, Idx)->getOperand(0).getReg();
2058     MRI.setRegBank(Idx, AMDGPU::VGPRRegBank);
2059   }
2060 
2061   LLT EltTy = VecTy.getScalarType();
2062   SmallVector<Register, 2> InsRegs(OpdMapper.getVRegs(2));
2063   unsigned NumLanes = InsRegs.size();
2064   if (!NumLanes) {
2065     NumLanes = 1;
2066     InsRegs.push_back(MI.getOperand(2).getReg());
2067   } else {
2068     EltTy = MRI.getType(InsRegs[0]);
2069   }
2070 
2071   auto UnmergeToEltTy = B.buildUnmerge(EltTy, VecReg);
2072   SmallVector<Register, 16> Ops(NumElem * NumLanes);
2073 
2074   for (unsigned I = 0; I < NumElem; ++I) {
2075     auto IC = B.buildConstant(S32, I);
2076     MRI.setRegBank(IC->getOperand(0).getReg(), AMDGPU::SGPRRegBank);
2077     auto Cmp = B.buildICmp(CmpInst::ICMP_EQ, CCTy, Idx, IC);
2078     MRI.setRegBank(Cmp->getOperand(0).getReg(), CCBank);
2079 
2080     for (unsigned L = 0; L < NumLanes; ++L) {
2081       auto S = B.buildSelect(EltTy, Cmp, InsRegs[L],
2082                              UnmergeToEltTy.getReg(I * NumLanes + L));
2083 
2084       for (unsigned N : { 0, 2, 3 })
2085         MRI.setRegBank(S->getOperand(N).getReg(), DstBank);
2086 
2087       Ops[I * NumLanes + L] = S->getOperand(0).getReg();
2088     }
2089   }
2090 
2091   LLT MergeTy = LLT::vector(Ops.size(), EltTy);
2092   if (MergeTy == MRI.getType(MI.getOperand(0).getReg())) {
2093     B.buildBuildVector(MI.getOperand(0), Ops);
2094   } else {
2095     auto Vec = B.buildBuildVector(MergeTy, Ops);
2096     MRI.setRegBank(Vec->getOperand(0).getReg(), DstBank);
2097     B.buildBitcast(MI.getOperand(0).getReg(), Vec);
2098   }
2099 
2100   MRI.setRegBank(MI.getOperand(0).getReg(), DstBank);
2101   MI.eraseFromParent();
2102 
2103   return true;
2104 }
2105 
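// Apply target-specific fixups after a register bank mapping has been chosen.
// The individual cases mostly widen SGPR booleans to s32, split 64-bit
// operations into 32-bit halves, or arrange waterfall loops for operands that
// must be uniform.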
2106 void AMDGPURegisterBankInfo::applyMappingImpl(
2107     const OperandsMapper &OpdMapper) const {
2108   MachineInstr &MI = OpdMapper.getMI();
2109   unsigned Opc = MI.getOpcode();
2110   MachineRegisterInfo &MRI = OpdMapper.getMRI();
2111   switch (Opc) {
2112   case AMDGPU::G_PHI: {
2113     Register DstReg = MI.getOperand(0).getReg();
2114     LLT DstTy = MRI.getType(DstReg);
2115     if (DstTy != LLT::scalar(1))
2116       break;
2117 
2118     const LLT S32 = LLT::scalar(32);
2119     const RegisterBank *DstBank =
2120       OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2121     if (DstBank == &AMDGPU::VCCRegBank) {
2122       applyDefaultMapping(OpdMapper);
2123       // The standard handling only considers the result register bank for
2124       // phis. For VCC, blindly inserting a copy when the phi is lowered will
2125       // produce an invalid copy. We can only copy with some kind of compare to
2126       // get a vector boolean result. Insert a register bank copy that will be
2127       // correctly lowered to a compare.
2128       MachineIRBuilder B(*MI.getParent()->getParent());
2129 
2130       for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
2131         Register SrcReg = MI.getOperand(I).getReg();
2132         const RegisterBank *SrcBank = getRegBank(SrcReg, MRI, *TRI);
2133 
2134         if (SrcBank != &AMDGPU::VCCRegBank) {
2135           MachineBasicBlock *SrcMBB = MI.getOperand(I + 1).getMBB();
2136           B.setInsertPt(*SrcMBB, SrcMBB->getFirstTerminator());
2137 
2138           auto Copy = B.buildCopy(LLT::scalar(1), SrcReg);
2139           MRI.setRegBank(Copy.getReg(0), AMDGPU::VCCRegBank);
2140           MI.getOperand(I).setReg(Copy.getReg(0));
2141         }
2142       }
2143 
2144       return;
2145     }
2146 
2147     // Phi handling is strange and only considers the bank of the destination.
2148     substituteSimpleCopyRegs(OpdMapper, 0);
2149 
2150     // Promote SGPR/VGPR booleans to s32
2151     MachineFunction *MF = MI.getParent()->getParent();
2152     ApplyRegBankMapping ApplyBank(*this, MRI, DstBank);
2153     GISelObserverWrapper Observer(&ApplyBank);
2154     MachineIRBuilder B(MI);
2155     LegalizerHelper Helper(*MF, Observer, B);
2156 
2157     if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized)
2158       llvm_unreachable("widen scalar should have succeeded");
2159 
2160     return;
2161   }
2162   case AMDGPU::G_ICMP:
2163   case AMDGPU::G_UADDO:
2164   case AMDGPU::G_USUBO:
2165   case AMDGPU::G_UADDE:
2166   case AMDGPU::G_SADDE:
2167   case AMDGPU::G_USUBE:
2168   case AMDGPU::G_SSUBE: {
2169     unsigned BoolDstOp = Opc == AMDGPU::G_ICMP ? 0 : 1;
2170     Register DstReg = MI.getOperand(BoolDstOp).getReg();
2171 
2172     const RegisterBank *DstBank =
2173       OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2174     if (DstBank != &AMDGPU::SGPRRegBank)
2175       break;
2176 
2177     const bool HasCarryIn = MI.getNumOperands() == 5;
2178 
2179     // If this is a scalar compare, promote the result to s32, as the selection
2180     // will end up using a copy to a 32-bit vreg.
2181     const LLT S32 = LLT::scalar(32);
2182     Register NewDstReg = MRI.createGenericVirtualRegister(S32);
2183     MRI.setRegBank(NewDstReg, AMDGPU::SGPRRegBank);
2184     MI.getOperand(BoolDstOp).setReg(NewDstReg);
2185     MachineIRBuilder B(MI);
2186 
2187     if (HasCarryIn) {
2188       Register NewSrcReg = MRI.createGenericVirtualRegister(S32);
2189       MRI.setRegBank(NewSrcReg, AMDGPU::SGPRRegBank);
2190       B.buildZExt(NewSrcReg, MI.getOperand(4).getReg());
2191       MI.getOperand(4).setReg(NewSrcReg);
2192     }
2193 
2194     MachineBasicBlock *MBB = MI.getParent();
2195     B.setInsertPt(*MBB, std::next(MI.getIterator()));
2196 
2197     // If we had a constrained VCC result register, a copy was inserted to VCC
2198     // from SGPR.
2199     SmallVector<Register, 1> DefRegs(OpdMapper.getVRegs(0));
2200     if (DefRegs.empty())
2201       DefRegs.push_back(DstReg);
2202     B.buildTrunc(DefRegs[0], NewDstReg);
2203     return;
2204   }
2205   case AMDGPU::G_SELECT: {
2206     Register DstReg = MI.getOperand(0).getReg();
2207     LLT DstTy = MRI.getType(DstReg);
2208 
2209     SmallVector<Register, 1> CondRegs(OpdMapper.getVRegs(1));
2210     if (CondRegs.empty())
2211       CondRegs.push_back(MI.getOperand(1).getReg());
2212     else {
2213       assert(CondRegs.size() == 1);
2214     }
2215 
2216     const RegisterBank *CondBank = getRegBank(CondRegs[0], MRI, *TRI);
2217     if (CondBank == &AMDGPU::SGPRRegBank) {
2218       MachineIRBuilder B(MI);
2219       const LLT S32 = LLT::scalar(32);
2220       Register NewCondReg = MRI.createGenericVirtualRegister(S32);
2221       MRI.setRegBank(NewCondReg, AMDGPU::SGPRRegBank);
2222 
2223       MI.getOperand(1).setReg(NewCondReg);
2224       B.buildZExt(NewCondReg, CondRegs[0]);
2225     }
2226 
2227     if (DstTy.getSizeInBits() != 64)
2228       break;
2229 
2230     MachineIRBuilder B(MI);
2231     LLT HalfTy = getHalfSizedType(DstTy);
2232 
2233     SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
2234     SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2));
2235     SmallVector<Register, 2> Src2Regs(OpdMapper.getVRegs(3));
2236 
2237     // All inputs are SGPRs, nothing special to do.
2238     if (DefRegs.empty()) {
2239       assert(Src1Regs.empty() && Src2Regs.empty());
2240       break;
2241     }
2242 
2243     if (Src1Regs.empty())
2244       split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg());
2245     else {
2246       setRegsToType(MRI, Src1Regs, HalfTy);
2247     }
2248 
2249     if (Src2Regs.empty())
2250       split64BitValueForMapping(B, Src2Regs, HalfTy, MI.getOperand(3).getReg());
2251     else
2252       setRegsToType(MRI, Src2Regs, HalfTy);
2253 
2254     setRegsToType(MRI, DefRegs, HalfTy);
2255 
2256     B.buildSelect(DefRegs[0], CondRegs[0], Src1Regs[0], Src2Regs[0]);
2257     B.buildSelect(DefRegs[1], CondRegs[0], Src1Regs[1], Src2Regs[1]);
2258 
2259     MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
2260     MI.eraseFromParent();
2261     return;
2262   }
2263   case AMDGPU::G_BRCOND: {
2264     Register CondReg = MI.getOperand(0).getReg();
2265     // FIXME: Should use legalizer helper, but should change bool ext type.
2266     const RegisterBank *CondBank =
2267       OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2268 
2269     if (CondBank == &AMDGPU::SGPRRegBank) {
2270       MachineIRBuilder B(MI);
2271       const LLT S32 = LLT::scalar(32);
2272       Register NewCondReg = MRI.createGenericVirtualRegister(S32);
2273       MRI.setRegBank(NewCondReg, AMDGPU::SGPRRegBank);
2274 
2275       MI.getOperand(0).setReg(NewCondReg);
2276       B.buildZExt(NewCondReg, CondReg);
2277       return;
2278     }
2279 
2280     break;
2281   }
2282   case AMDGPU::G_AND:
2283   case AMDGPU::G_OR:
2284   case AMDGPU::G_XOR: {
2285     // 64-bit and is only available on the SALU, so split into 2 32-bit ops if
2286     // there is a VGPR input.
2287     Register DstReg = MI.getOperand(0).getReg();
2288     LLT DstTy = MRI.getType(DstReg);
2289 
2290     if (DstTy.getSizeInBits() == 1) {
2291       const RegisterBank *DstBank =
2292         OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2293       if (DstBank == &AMDGPU::VCCRegBank)
2294         break;
2295 
2296       MachineFunction *MF = MI.getParent()->getParent();
2297       ApplyRegBankMapping ApplyBank(*this, MRI, DstBank);
2298       GISelObserverWrapper Observer(&ApplyBank);
2299       MachineIRBuilder B(MI);
2300       LegalizerHelper Helper(*MF, Observer, B);
2301 
2302       if (Helper.widenScalar(MI, 0, LLT::scalar(32)) !=
2303           LegalizerHelper::Legalized)
2304         llvm_unreachable("widen scalar should have succeeded");
2305       return;
2306     }
2307 
2308     if (DstTy.getSizeInBits() != 64)
2309       break;
2310 
2311     LLT HalfTy = getHalfSizedType(DstTy);
2312     SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
2313     SmallVector<Register, 2> Src0Regs(OpdMapper.getVRegs(1));
2314     SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2));
2315 
2316     // All inputs are SGPRs, nothing special to do.
2317     if (DefRegs.empty()) {
2318       assert(Src0Regs.empty() && Src1Regs.empty());
2319       break;
2320     }
2321 
2322     assert(DefRegs.size() == 2);
2323     assert(Src0Regs.size() == Src1Regs.size() &&
2324            (Src0Regs.empty() || Src0Regs.size() == 2));
2325 
2326     // Depending on where the source registers came from, the generic code may
2327     // have decided to split the inputs already or not. If not, we still need to
2328     // extract the values.
2329     MachineIRBuilder B(MI);
2330 
2331     if (Src0Regs.empty())
2332       split64BitValueForMapping(B, Src0Regs, HalfTy, MI.getOperand(1).getReg());
2333     else
2334       setRegsToType(MRI, Src0Regs, HalfTy);
2335 
2336     if (Src1Regs.empty())
2337       split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg());
2338     else
2339       setRegsToType(MRI, Src1Regs, HalfTy);
2340 
2341     setRegsToType(MRI, DefRegs, HalfTy);
2342 
2343     B.buildInstr(Opc, {DefRegs[0]}, {Src0Regs[0], Src1Regs[0]});
2344     B.buildInstr(Opc, {DefRegs[1]}, {Src0Regs[1], Src1Regs[1]});
2345 
2346     MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
2347     MI.eraseFromParent();
2348     return;
2349   }
2350   case AMDGPU::G_ADD:
2351   case AMDGPU::G_SUB:
2352   case AMDGPU::G_MUL:
2353   case AMDGPU::G_SHL:
2354   case AMDGPU::G_LSHR:
2355   case AMDGPU::G_ASHR: {
2356     Register DstReg = MI.getOperand(0).getReg();
2357     LLT DstTy = MRI.getType(DstReg);
2358 
2359     // 16-bit operations are VALU only, but can be promoted to 32-bit SALU.
2360     // Packed 16-bit operations need to be scalarized and promoted.
2361     if (DstTy != LLT::scalar(16) && DstTy != LLT::vector(2, 16))
2362       break;
2363 
2364     const RegisterBank *DstBank =
2365       OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2366     if (DstBank == &AMDGPU::VGPRRegBank)
2367       break;
2368 
2369     const LLT S32 = LLT::scalar(32);
2370     MachineBasicBlock *MBB = MI.getParent();
2371     MachineFunction *MF = MBB->getParent();
2372     MachineIRBuilder B(MI);
2373     ApplyRegBankMapping ApplySALU(*this, MRI, &AMDGPU::SGPRRegBank);
2374     GISelObserverWrapper Observer(&ApplySALU);
2375 
2376     if (DstTy.isVector()) {
2377       B.setChangeObserver(Observer);
2378 
2379       Register WideSrc0Lo, WideSrc0Hi;
2380       Register WideSrc1Lo, WideSrc1Hi;
2381 
2382       std::tie(WideSrc0Lo, WideSrc0Hi)
2383         = unpackV2S16ToS32(B, MI.getOperand(1).getReg(), AMDGPU::G_ANYEXT);
2384       std::tie(WideSrc1Lo, WideSrc1Hi)
2385         = unpackV2S16ToS32(B, MI.getOperand(2).getReg(), AMDGPU::G_ANYEXT);
2386       auto Lo = B.buildInstr(MI.getOpcode(), {S32}, {WideSrc0Lo, WideSrc1Lo});
2387       auto Hi = B.buildInstr(MI.getOpcode(), {S32}, {WideSrc0Hi, WideSrc1Hi});
2388       B.buildBuildVectorTrunc(DstReg, {Lo.getReg(0), Hi.getReg(0)});
2389       MI.eraseFromParent();
2390     } else {
2391       LegalizerHelper Helper(*MF, Observer, B);
2392 
2393       if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized)
2394         llvm_unreachable("widen scalar should have succeeded");
2395 
2396       // FIXME: s16 shift amounts should be legal.
2397       if (Opc == AMDGPU::G_SHL || Opc == AMDGPU::G_LSHR ||
2398           Opc == AMDGPU::G_ASHR) {
2399         B.setInsertPt(*MBB, MI.getIterator());
2400         if (Helper.widenScalar(MI, 1, S32) != LegalizerHelper::Legalized)
2401           llvm_unreachable("widen scalar should have succeeded");
2402       }
2403     }
2404 
2405     return;
2406   }
2407   case AMDGPU::G_SMIN:
2408   case AMDGPU::G_SMAX:
2409   case AMDGPU::G_UMIN:
2410   case AMDGPU::G_UMAX: {
2411     Register DstReg = MI.getOperand(0).getReg();
2412     const RegisterBank *DstBank =
2413       OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2414     if (DstBank == &AMDGPU::VGPRRegBank)
2415       break;
2416 
2417     MachineFunction *MF = MI.getParent()->getParent();
2418     MachineIRBuilder B(MI);
2419 
2420     // Turn scalar min/max into a compare and select.
2421     LLT Ty = MRI.getType(DstReg);
2422     const LLT S32 = LLT::scalar(32);
2423     const LLT S16 = LLT::scalar(16);
2424     const LLT V2S16 = LLT::vector(2, 16);
2425 
2426     if (Ty == V2S16) {
2427       ApplyRegBankMapping ApplySALU(*this, MRI, &AMDGPU::SGPRRegBank);
2428       GISelObserverWrapper Observer(&ApplySALU);
2429       B.setChangeObserver(Observer);
2430 
2431       // Need to widen to s32, and expand as cmp + select, and avoid producing
2432       // illegal vector extends or unmerges that would need further
2433       // legalization.
2434       //
2435       // TODO: Should we just readfirstlane? That should probably be handled
2436       // with a UniformVGPR register bank that wouldn't need special
2437       // consideration here.
2438 
2439       Register Dst = MI.getOperand(0).getReg();
2440       Register Src0 = MI.getOperand(1).getReg();
2441       Register Src1 = MI.getOperand(2).getReg();
2442 
2443       Register WideSrc0Lo, WideSrc0Hi;
2444       Register WideSrc1Lo, WideSrc1Hi;
2445 
2446       unsigned ExtendOp = minMaxToExtend(MI.getOpcode());
2447 
2448       std::tie(WideSrc0Lo, WideSrc0Hi) = unpackV2S16ToS32(B, Src0, ExtendOp);
2449       std::tie(WideSrc1Lo, WideSrc1Hi) = unpackV2S16ToS32(B, Src1, ExtendOp);
2450 
2451       Register Lo = MRI.createGenericVirtualRegister(S32);
2452       Register Hi = MRI.createGenericVirtualRegister(S32);
2453       const CmpInst::Predicate Pred = minMaxToCompare(MI.getOpcode());
2454       buildExpandedScalarMinMax(B, Pred, Lo, WideSrc0Lo, WideSrc1Lo);
2455       buildExpandedScalarMinMax(B, Pred, Hi, WideSrc0Hi, WideSrc1Hi);
2456 
2457       B.buildBuildVectorTrunc(Dst, {Lo, Hi});
2458       MI.eraseFromParent();
2459     } else if (Ty == S16) {
2460       ApplyRegBankMapping ApplySALU(*this, MRI, &AMDGPU::SGPRRegBank);
2461       GISelObserverWrapper Observer(&ApplySALU);
2462       LegalizerHelper Helper(*MF, Observer, B);
2463 
2464       // Need to widen to s32, and expand as cmp + select.
2465       if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized)
2466         llvm_unreachable("widenScalar should have succeeded");
2467 
2468       // FIXME: This is relying on widenScalar leaving MI in place.
2469       lowerScalarMinMax(B, MI);
2470     } else
2471       lowerScalarMinMax(B, MI);
2472 
2473     return;
2474   }
2475   case AMDGPU::G_SEXT_INREG: {
2476     SmallVector<Register, 2> SrcRegs(OpdMapper.getVRegs(1));
2477     if (SrcRegs.empty())
2478       break; // Nothing to repair
2479 
2480     const LLT S32 = LLT::scalar(32);
2481     MachineIRBuilder B(MI);
2482     ApplyRegBankMapping O(*this, MRI, &AMDGPU::VGPRRegBank);
2483     GISelObserverWrapper Observer(&O);
2484     B.setChangeObserver(Observer);
2485 
2486     // Don't use LegalizerHelper's narrowScalar. It produces unwanted G_SEXTs
2487     // we would need to further expand, and doesn't let us directly set the
2488     // result registers.
2489     SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0));
2490 
2491     int Amt = MI.getOperand(2).getImm();
2492     if (Amt <= 32) {
2493       if (Amt == 32) {
2494         // The low bits are unchanged.
2495         B.buildCopy(DstRegs[0], SrcRegs[0]);
2496       } else {
2497         // Extend in the low bits and propagate the sign bit to the high half.
2498         B.buildSExtInReg(DstRegs[0], SrcRegs[0], Amt);
2499       }
2500 
2501       B.buildAShr(DstRegs[1], DstRegs[0], B.buildConstant(S32, 31));
2502     } else {
2503       // The low bits are unchanged, and extend in the high bits.
2504       B.buildCopy(DstRegs[0], SrcRegs[0]);
2505       B.buildSExtInReg(DstRegs[1], DstRegs[0], Amt - 32);
2506     }
2507 
2508     Register DstReg = MI.getOperand(0).getReg();
2509     MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
2510     MI.eraseFromParent();
2511     return;
2512   }
2513   case AMDGPU::G_CTPOP:
2514   case AMDGPU::G_CTLZ_ZERO_UNDEF:
2515   case AMDGPU::G_CTTZ_ZERO_UNDEF: {
2516     MachineIRBuilder B(MI);
2517     MachineFunction &MF = B.getMF();
2518 
2519     const RegisterBank *DstBank =
2520       OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2521     if (DstBank == &AMDGPU::SGPRRegBank)
2522       break;
2523 
2524     Register SrcReg = MI.getOperand(1).getReg();
2525     const LLT S32 = LLT::scalar(32);
2526     LLT Ty = MRI.getType(SrcReg);
2527     if (Ty == S32)
2528       break;
2529 
2530     ApplyRegBankMapping ApplyVALU(*this, MRI, &AMDGPU::VGPRRegBank);
2531     GISelObserverWrapper Observer(&ApplyVALU);
2532     LegalizerHelper Helper(MF, Observer, B);
2533 
2534     if (Helper.narrowScalar(MI, 1, S32) != LegalizerHelper::Legalized)
2535       llvm_unreachable("narrowScalar should have succeeded");
2536     return;
2537   }
2538   case AMDGPU::G_SEXT:
2539   case AMDGPU::G_ZEXT:
2540   case AMDGPU::G_ANYEXT: {
2541     Register SrcReg = MI.getOperand(1).getReg();
2542     LLT SrcTy = MRI.getType(SrcReg);
2543     const bool Signed = Opc == AMDGPU::G_SEXT;
2544 
2545     assert(empty(OpdMapper.getVRegs(1)));
2546 
2547     MachineIRBuilder B(MI);
2548     const RegisterBank *SrcBank =
2549       OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
2550 
2551     Register DstReg = MI.getOperand(0).getReg();
2552     LLT DstTy = MRI.getType(DstReg);
2553     if (DstTy.isScalar() &&
2554         SrcBank != &AMDGPU::SGPRRegBank &&
2555         SrcBank != &AMDGPU::VCCRegBank &&
2556         // FIXME: Should handle any type that rounds to s64 once irregular
2557         // breakdowns are supported.
2558         DstTy.getSizeInBits() == 64 &&
2559         SrcTy.getSizeInBits() <= 32) {
2560       SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
2561 
2562       // Extend to 32-bit, and then extend the low half.
2563       if (Signed) {
2564         // TODO: Should really be buildSExtOrCopy
2565         B.buildSExtOrTrunc(DefRegs[0], SrcReg);
2566       } else if (Opc == AMDGPU::G_ZEXT) {
2567         B.buildZExtOrTrunc(DefRegs[0], SrcReg);
2568       } else {
2569         B.buildAnyExtOrTrunc(DefRegs[0], SrcReg);
2570       }
2571 
2572       extendLow32IntoHigh32(B, DefRegs[1], DefRegs[0], Opc, *SrcBank);
2573       MRI.setRegBank(DstReg, *SrcBank);
2574       MI.eraseFromParent();
2575       return;
2576     }
2577 
2578     if (SrcTy != LLT::scalar(1))
2579       return;
2580 
2581     // It is not legal to have a legalization artifact with a VCC source. Rather
2582     // than introducing a copy, insert the select we would have to select the
2583     // copy to.
2584     if (SrcBank == &AMDGPU::VCCRegBank) {
2585       SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
2586 
2587       const RegisterBank *DstBank = &AMDGPU::VGPRRegBank;
2588 
2589       unsigned DstSize = DstTy.getSizeInBits();
2590       // 64-bit select is SGPR only
2591       const bool UseSel64 = DstSize > 32 &&
2592         SrcBank->getID() == AMDGPU::SGPRRegBankID;
2593 
2594       // TODO: Should s16 select be legal?
2595       LLT SelType = UseSel64 ? LLT::scalar(64) : LLT::scalar(32);
2596       auto True = B.buildConstant(SelType, Signed ? -1 : 1);
2597       auto False = B.buildConstant(SelType, 0);
2598 
2599       MRI.setRegBank(True.getReg(0), *DstBank);
2600       MRI.setRegBank(False.getReg(0), *DstBank);
2601       MRI.setRegBank(DstReg, *DstBank);
2602 
2603       if (DstSize > 32) {
2604         B.buildSelect(DefRegs[0], SrcReg, True, False);
2605         extendLow32IntoHigh32(B, DefRegs[1], DefRegs[0], Opc, *SrcBank, true);
2606       } else if (DstSize < 32) {
2607         auto Sel = B.buildSelect(SelType, SrcReg, True, False);
2608         MRI.setRegBank(Sel.getReg(0), *DstBank);
2609         B.buildTrunc(DstReg, Sel);
2610       } else {
2611         B.buildSelect(DstReg, SrcReg, True, False);
2612       }
2613 
2614       MI.eraseFromParent();
2615       return;
2616     }
2617 
2618     break;
2619   }
2620   case AMDGPU::G_BUILD_VECTOR:
2621   case AMDGPU::G_BUILD_VECTOR_TRUNC: {
2622     Register DstReg = MI.getOperand(0).getReg();
2623     LLT DstTy = MRI.getType(DstReg);
2624     if (DstTy != LLT::vector(2, 16))
2625       break;
2626 
2627     assert(MI.getNumOperands() == 3 && OpdMapper.getVRegs(0).empty());
2628     substituteSimpleCopyRegs(OpdMapper, 1);
2629     substituteSimpleCopyRegs(OpdMapper, 2);
2630 
2631     const RegisterBank *DstBank =
2632       OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2633     if (DstBank == &AMDGPU::SGPRRegBank)
2634       break; // Can use S_PACK_* instructions.
2635 
2636     MachineIRBuilder B(MI);
2637 
2638     Register Lo = MI.getOperand(1).getReg();
2639     Register Hi = MI.getOperand(2).getReg();
2640     const LLT S32 = LLT::scalar(32);
2641 
2642     const RegisterBank *BankLo =
2643       OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
2644     const RegisterBank *BankHi =
2645       OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
2646 
2647     Register ZextLo;
2648     Register ShiftHi;
2649 
2650     if (Opc == AMDGPU::G_BUILD_VECTOR) {
2651       ZextLo = B.buildZExt(S32, Lo).getReg(0);
2652       MRI.setRegBank(ZextLo, *BankLo);
2653 
2654       Register ZextHi = B.buildZExt(S32, Hi).getReg(0);
2655       MRI.setRegBank(ZextHi, *BankHi);
2656 
2657       auto ShiftAmt = B.buildConstant(S32, 16);
2658       MRI.setRegBank(ShiftAmt.getReg(0), *BankHi);
2659 
2660       ShiftHi = B.buildShl(S32, ZextHi, ShiftAmt).getReg(0);
2661       MRI.setRegBank(ShiftHi, *BankHi);
2662     } else {
2663       Register MaskLo = B.buildConstant(S32, 0xffff).getReg(0);
2664       MRI.setRegBank(MaskLo, *BankLo);
2665 
2666       auto ShiftAmt = B.buildConstant(S32, 16);
2667       MRI.setRegBank(ShiftAmt.getReg(0), *BankHi);
2668 
2669       ShiftHi = B.buildShl(S32, Hi, ShiftAmt).getReg(0);
2670       MRI.setRegBank(ShiftHi, *BankHi);
2671 
2672       ZextLo = B.buildAnd(S32, Lo, MaskLo).getReg(0);
2673       MRI.setRegBank(ZextLo, *BankLo);
2674     }
2675 
2676     auto Or = B.buildOr(S32, ZextLo, ShiftHi);
2677     MRI.setRegBank(Or.getReg(0), *DstBank);
2678 
2679     B.buildBitcast(DstReg, Or);
2680     MI.eraseFromParent();
2681     return;
2682   }
2683   case AMDGPU::G_EXTRACT_VECTOR_ELT: {
2684     SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0));
2685 
2686     assert(OpdMapper.getVRegs(1).empty() && OpdMapper.getVRegs(2).empty());
2687 
2688     Register DstReg = MI.getOperand(0).getReg();
2689     Register SrcReg = MI.getOperand(1).getReg();
2690 
2691     const LLT S32 = LLT::scalar(32);
2692     LLT DstTy = MRI.getType(DstReg);
2693     LLT SrcTy = MRI.getType(SrcReg);
2694 
2695     if (foldExtractEltToCmpSelect(MI, MRI, OpdMapper))
2696       return;
2697 
2698     MachineIRBuilder B(MI);
2699 
2700     const ValueMapping &DstMapping
2701       = OpdMapper.getInstrMapping().getOperandMapping(0);
2702     const RegisterBank *DstBank = DstMapping.BreakDown[0].RegBank;
2703     const RegisterBank *SrcBank =
2704       OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
2705     const RegisterBank *IdxBank =
2706         OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
2707 
2708     Register BaseIdxReg;
2709     unsigned ConstOffset;
2710     std::tie(BaseIdxReg, ConstOffset) =
2711         AMDGPU::getBaseWithConstantOffset(MRI, MI.getOperand(2).getReg());
2712 
2713     // See if the index is an add of a constant which will be foldable by moving
2714     // the base register of the index later if this is going to be executed in a
2715     // waterfall loop. This is essentially to reassociate the add of a constant
2716     // with the readfirstlane.
2717     bool ShouldMoveIndexIntoLoop = IdxBank != &AMDGPU::SGPRRegBank &&
2718                                    ConstOffset > 0 &&
2719                                    ConstOffset < SrcTy.getNumElements();
2720 
2721     // Move the base register. We'll re-insert the add later.
2722     if (ShouldMoveIndexIntoLoop)
2723       MI.getOperand(2).setReg(BaseIdxReg);
2724 
2725     // If this is a VGPR result only because the index was a VGPR result, the
2726     // actual indexing will be done on the SGPR source vector, which will
2727     // produce a scalar result. We need to copy to the VGPR result inside the
2728     // waterfall loop.
2729     const bool NeedCopyToVGPR = DstBank == &AMDGPU::VGPRRegBank &&
2730                                 SrcBank == &AMDGPU::SGPRRegBank;
2731     if (DstRegs.empty()) {
2732       applyDefaultMapping(OpdMapper);
2733 
2734       executeInWaterfallLoop(MI, MRI, { 2 });
2735 
2736       if (NeedCopyToVGPR) {
2737         // We don't want a phi for this temporary reg.
2738         Register TmpReg = MRI.createGenericVirtualRegister(DstTy);
2739         MRI.setRegBank(TmpReg, AMDGPU::SGPRRegBank);
2740         MI.getOperand(0).setReg(TmpReg);
2741         B.setInsertPt(*MI.getParent(), ++MI.getIterator());
2742 
2743         // Use a v_mov_b32 here to make the exec dependency explicit.
2744         buildVCopy(B, DstReg, TmpReg);
2745       }
2746 
2747       // Re-insert the constant offset add inside the waterfall loop.
2748       if (ShouldMoveIndexIntoLoop)
2749         reinsertVectorIndexAdd(B, MI, 2, ConstOffset);
2750 
2751       return;
2752     }
2753 
2754     assert(DstTy.getSizeInBits() == 64);
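    // The 64-bit element extract is split into two 32-bit extracts from a
    // bitcast of the source vector, roughly (illustrative):
    //   %vec32 = G_BITCAST %src(<N x s64>) to <2N x s32>
    //   %lo    = G_EXTRACT_VECTOR_ELT %vec32, 2 * %idx
    //   %hi    = G_EXTRACT_VECTOR_ELT %vec32, 2 * %idx + 1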
2755 
2756     LLT Vec32 = LLT::vector(2 * SrcTy.getNumElements(), 32);
2757 
2758     auto CastSrc = B.buildBitcast(Vec32, SrcReg);
2759     auto One = B.buildConstant(S32, 1);
2760 
2761     MachineBasicBlock::iterator MII = MI.getIterator();
2762 
2763     // Split the vector index into 32-bit pieces. Prepare to move all of the
2764     // new instructions into a waterfall loop if necessary.
2765     //
2766     // Don't put the bitcast or constant in the loop.
2767     MachineInstrSpan Span(MII, &B.getMBB());
2768 
2769     // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1).
2770     auto IdxLo = B.buildShl(S32, BaseIdxReg, One);
2771     auto IdxHi = B.buildAdd(S32, IdxLo, One);
2772 
2773     auto Extract0 = B.buildExtractVectorElement(DstRegs[0], CastSrc, IdxLo);
2774     auto Extract1 = B.buildExtractVectorElement(DstRegs[1], CastSrc, IdxHi);
2775 
2776     MRI.setRegBank(DstReg, *DstBank);
2777     MRI.setRegBank(CastSrc.getReg(0), *SrcBank);
2778     MRI.setRegBank(One.getReg(0), AMDGPU::SGPRRegBank);
2779     MRI.setRegBank(IdxLo.getReg(0), AMDGPU::SGPRRegBank);
2780     MRI.setRegBank(IdxHi.getReg(0), AMDGPU::SGPRRegBank);
2781 
2782     SmallSet<Register, 4> OpsToWaterfall;
2783     if (!collectWaterfallOperands(OpsToWaterfall, MI, MRI, { 2 })) {
2784       MI.eraseFromParent();
2785       return;
2786     }
2787 
2788     // Remove the original instruction to avoid potentially confusing the
2789     // waterfall loop logic.
2790     B.setInstr(*Span.begin());
2791     MI.eraseFromParent();
2792     executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()),
2793                            OpsToWaterfall, MRI);
2794 
2795     if (NeedCopyToVGPR) {
2796       MachineBasicBlock *LoopBB = Extract1->getParent();
2797       Register TmpReg0 = MRI.createGenericVirtualRegister(S32);
2798       Register TmpReg1 = MRI.createGenericVirtualRegister(S32);
2799       MRI.setRegBank(TmpReg0, AMDGPU::SGPRRegBank);
2800       MRI.setRegBank(TmpReg1, AMDGPU::SGPRRegBank);
2801 
2802       Extract0->getOperand(0).setReg(TmpReg0);
2803       Extract1->getOperand(0).setReg(TmpReg1);
2804 
2805       B.setInsertPt(*LoopBB, ++Extract1->getIterator());
2806 
2807       buildVCopy(B, DstRegs[0], TmpReg0);
2808       buildVCopy(B, DstRegs[1], TmpReg1);
2809     }
2810 
2811     if (ShouldMoveIndexIntoLoop)
2812       reinsertVectorIndexAdd(B, *IdxLo, 1, ConstOffset);
2813 
2814     return;
2815   }
2816   case AMDGPU::G_INSERT_VECTOR_ELT: {
2817     SmallVector<Register, 2> InsRegs(OpdMapper.getVRegs(2));
2818 
2819     Register DstReg = MI.getOperand(0).getReg();
2820     LLT VecTy = MRI.getType(DstReg);
2821 
2822     assert(OpdMapper.getVRegs(0).empty());
2823     assert(OpdMapper.getVRegs(3).empty());
2824 
2825     if (substituteSimpleCopyRegs(OpdMapper, 1))
2826       MRI.setType(MI.getOperand(1).getReg(), VecTy);
2827 
2828     if (foldInsertEltToCmpSelect(MI, MRI, OpdMapper))
2829       return;
2830 
2831     const RegisterBank *IdxBank =
2832       OpdMapper.getInstrMapping().getOperandMapping(3).BreakDown[0].RegBank;
2833 
2834     Register SrcReg = MI.getOperand(1).getReg();
2835     Register InsReg = MI.getOperand(2).getReg();
2836     LLT InsTy = MRI.getType(InsReg);
2837     (void)InsTy;
2838 
2839     Register BaseIdxReg;
2840     unsigned ConstOffset;
2841     std::tie(BaseIdxReg, ConstOffset) =
2842         AMDGPU::getBaseWithConstantOffset(MRI, MI.getOperand(3).getReg());
2843 
2844     // See if the index is an add of a constant which will be foldable by moving
2845     // the base register of the index later if this is going to be executed in a
2846     // waterfall loop. This is essentially to reassociate the add of a constant
2847     // with the readfirstlane.
2848     bool ShouldMoveIndexIntoLoop = IdxBank != &AMDGPU::SGPRRegBank &&
2849       ConstOffset > 0 &&
2850       ConstOffset < VecTy.getNumElements();
2851 
2852     // Move the base register. We'll re-insert the add later.
2853     if (ShouldMoveIndexIntoLoop)
2854       MI.getOperand(3).setReg(BaseIdxReg);
2855 
2856 
2857     if (InsRegs.empty()) {
2858       executeInWaterfallLoop(MI, MRI, { 3 });
2859 
2860       // Re-insert the constant offset add inside the waterfall loop.
2861       if (ShouldMoveIndexIntoLoop) {
2862         MachineIRBuilder B(MI);
2863         reinsertVectorIndexAdd(B, MI, 3, ConstOffset);
2864       }
2865 
2866       return;
2867     }
2868 
2869 
2870     assert(InsTy.getSizeInBits() == 64);
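    // Likewise the 64-bit element insert is split into two chained 32-bit
    // inserts on a bitcast of the vector, roughly (illustrative):
    //   %vec32 = G_BITCAST %src(<N x s64>) to <2N x s32>
    //   %ins0  = G_INSERT_VECTOR_ELT %vec32, %val_lo, 2 * %idx
    //   %ins1  = G_INSERT_VECTOR_ELT %ins0, %val_hi, 2 * %idx + 1
    //   %dst   = G_BITCAST %ins1 to <N x s64>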
2871 
2872     const LLT S32 = LLT::scalar(32);
2873     LLT Vec32 = LLT::vector(2 * VecTy.getNumElements(), 32);
2874 
2875     MachineIRBuilder B(MI);
2876     auto CastSrc = B.buildBitcast(Vec32, SrcReg);
2877     auto One = B.buildConstant(S32, 1);
2878 
2879     // Split the vector index into 32-bit pieces. Prepare to move all of the
2880     // new instructions into a waterfall loop if necessary.
2881     //
2882     // Don't put the bitcast or constant in the loop.
2883     MachineInstrSpan Span(MachineBasicBlock::iterator(&MI), &B.getMBB());
2884 
2885     // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1).
2886     auto IdxLo = B.buildShl(S32, BaseIdxReg, One);
2887     auto IdxHi = B.buildAdd(S32, IdxLo, One);
2888 
2889     auto InsLo = B.buildInsertVectorElement(Vec32, CastSrc, InsRegs[0], IdxLo);
2890     auto InsHi = B.buildInsertVectorElement(Vec32, InsLo, InsRegs[1], IdxHi);
2891 
2892     const RegisterBank *DstBank =
2893       OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2894     const RegisterBank *SrcBank =
2895       OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
2896     const RegisterBank *InsSrcBank =
2897       OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
2898 
2899     MRI.setRegBank(InsReg, *InsSrcBank);
2900     MRI.setRegBank(CastSrc.getReg(0), *SrcBank);
2901     MRI.setRegBank(InsLo.getReg(0), *DstBank);
2902     MRI.setRegBank(InsHi.getReg(0), *DstBank);
2903     MRI.setRegBank(One.getReg(0), AMDGPU::SGPRRegBank);
2904     MRI.setRegBank(IdxLo.getReg(0), AMDGPU::SGPRRegBank);
2905     MRI.setRegBank(IdxHi.getReg(0), AMDGPU::SGPRRegBank);
2906 
2907 
2908     SmallSet<Register, 4> OpsToWaterfall;
2909     if (!collectWaterfallOperands(OpsToWaterfall, MI, MRI, { 3 })) {
2910       B.setInsertPt(B.getMBB(), MI);
2911       B.buildBitcast(DstReg, InsHi);
2912       MI.eraseFromParent();
2913       return;
2914     }
2915 
2916     B.setInstr(*Span.begin());
2917     MI.eraseFromParent();
2918 
2919     // Figure out the point after the waterfall loop before mangling the control
2920     // flow.
2921     executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()),
2922                            OpsToWaterfall, MRI);
2923 
2924     // The insertion point is now right after the original instruction.
2925     //
2926     // Keep the bitcast to the original vector type out of the loop. Doing this
2927     // saves an extra phi we don't need inside the loop.
2928     B.buildBitcast(DstReg, InsHi);
2929 
2930     // Re-insert the constant offset add inside the waterfall loop.
2931     if (ShouldMoveIndexIntoLoop)
2932       reinsertVectorIndexAdd(B, *IdxLo, 1, ConstOffset);
2933 
2934     return;
2935   }
2936   case AMDGPU::G_AMDGPU_BUFFER_LOAD:
2937   case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
2938   case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT:
2939   case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
2940   case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE:
2941   case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT:
2942   case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16:
2943   case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT:
2944   case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16:
2945   case AMDGPU::G_AMDGPU_BUFFER_STORE:
2946   case AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE:
2947   case AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT:
2948   case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT:
2949   case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16:
2950   case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT:
2951   case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16: {
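    // Operands 1 and 4 are the rsrc and soffset, which must end up in SGPRs;
    // if either was assigned the VGPR bank, wrap the access in a waterfall
    // loop.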
2952     applyDefaultMapping(OpdMapper);
2953     executeInWaterfallLoop(MI, MRI, {1, 4});
2954     return;
2955   }
2956   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP:
2957   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD:
2958   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB:
2959   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN:
2960   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN:
2961   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX:
2962   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX:
2963   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND:
2964   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR:
2965   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR:
2966   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC:
2967   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC: {
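    // For the atomics, operand 1 is the vdata input, so the rsrc and soffset
    // are operands 2 and 5; both must end up uniform.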
2968     applyDefaultMapping(OpdMapper);
2969     executeInWaterfallLoop(MI, MRI, {2, 5});
2970     return;
2971   }
2972   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD: {
2973     applyDefaultMapping(OpdMapper);
2974     executeInWaterfallLoop(MI, MRI, {2, 5});
2975     return;
2976   }
2977   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP: {
2978     applyDefaultMapping(OpdMapper);
2979     executeInWaterfallLoop(MI, MRI, {3, 6});
2980     return;
2981   }
2982   case AMDGPU::G_AMDGPU_S_BUFFER_LOAD: {
2983     applyMappingSBufferLoad(OpdMapper);
2984     return;
2985   }
2986   case AMDGPU::G_INTRINSIC: {
2987     switch (MI.getIntrinsicID()) {
2988     case Intrinsic::amdgcn_readlane: {
2989       substituteSimpleCopyRegs(OpdMapper, 2);
2990 
2991       assert(OpdMapper.getVRegs(0).empty());
2992       assert(OpdMapper.getVRegs(3).empty());
2993 
2994       // Make sure the index is an SGPR. It doesn't make sense to run this in a
2995       // waterfall loop, so assume it's a uniform value.
2996       constrainOpWithReadfirstlane(MI, MRI, 3); // Index
2997       return;
2998     }
2999     case Intrinsic::amdgcn_writelane: {
3000       assert(OpdMapper.getVRegs(0).empty());
3001       assert(OpdMapper.getVRegs(2).empty());
3002       assert(OpdMapper.getVRegs(3).empty());
3003 
3004       substituteSimpleCopyRegs(OpdMapper, 4); // VGPR input val
3005       constrainOpWithReadfirstlane(MI, MRI, 2); // Source value
3006       constrainOpWithReadfirstlane(MI, MRI, 3); // Index
3007       return;
3008     }
3009     case Intrinsic::amdgcn_interp_p1:
3010     case Intrinsic::amdgcn_interp_p2:
3011     case Intrinsic::amdgcn_interp_mov:
3012     case Intrinsic::amdgcn_interp_p1_f16:
3013     case Intrinsic::amdgcn_interp_p2_f16: {
3014       applyDefaultMapping(OpdMapper);
3015 
3016       // Readlane for m0 value, which is always the last operand.
3017       // FIXME: Should this be a waterfall loop instead?
3018       constrainOpWithReadfirstlane(MI, MRI, MI.getNumOperands() - 1); // Index
3019       return;
3020     }
3021     case Intrinsic::amdgcn_permlane16:
3022     case Intrinsic::amdgcn_permlanex16: {
3023       // Doing a waterfall loop over these wouldn't make any sense.
3024       substituteSimpleCopyRegs(OpdMapper, 2);
3025       substituteSimpleCopyRegs(OpdMapper, 3);
3026       constrainOpWithReadfirstlane(MI, MRI, 4);
3027       constrainOpWithReadfirstlane(MI, MRI, 5);
3028       return;
3029     }
3030     case Intrinsic::amdgcn_sbfe:
3031       applyMappingBFEIntrinsic(OpdMapper, true);
3032       return;
3033     case Intrinsic::amdgcn_ubfe:
3034       applyMappingBFEIntrinsic(OpdMapper, false);
3035       return;
3036     case Intrinsic::amdgcn_ballot:
3037       // Use default handling and insert copy to vcc source.
3038       break;
3039     }
3040     break;
3041   }
3042   case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
3043   case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE: {
3044     const AMDGPU::RsrcIntrinsic *RSrcIntrin
3045       = AMDGPU::lookupRsrcIntrinsic(MI.getIntrinsicID());
3046     assert(RSrcIntrin && RSrcIntrin->IsImage);
3047     // Non-images can have complications from operands that allow both SGPR
3048     // and VGPR. For now it's too complicated to figure out the final opcode
3049     // to derive the register bank from the MCInstrDesc.
3050     applyMappingImage(MI, OpdMapper, MRI, RSrcIntrin->RsrcArg);
3051     return;
3052   }
3053   case AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY: {
3054     unsigned N = MI.getNumExplicitOperands() - 2;
3055     executeInWaterfallLoop(MI, MRI, { N });
3056     return;
3057   }
3058   case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: {
3059     auto IntrID = MI.getIntrinsicID();
3060     switch (IntrID) {
3061     case Intrinsic::amdgcn_ds_ordered_add:
3062     case Intrinsic::amdgcn_ds_ordered_swap: {
3063       // This is only allowed to execute with 1 lane, so readfirstlane is safe.
3064       assert(OpdMapper.getVRegs(0).empty());
3065       substituteSimpleCopyRegs(OpdMapper, 3);
3066       constrainOpWithReadfirstlane(MI, MRI, 2); // M0
3067       return;
3068     }
3069     case Intrinsic::amdgcn_ds_gws_init:
3070     case Intrinsic::amdgcn_ds_gws_barrier:
3071     case Intrinsic::amdgcn_ds_gws_sema_br: {
3072       // Only the first lane executes, so readfirstlane is safe.
3073       substituteSimpleCopyRegs(OpdMapper, 1);
3074       constrainOpWithReadfirstlane(MI, MRI, 2); // M0
3075       return;
3076     }
3077     case Intrinsic::amdgcn_ds_gws_sema_v:
3078     case Intrinsic::amdgcn_ds_gws_sema_p:
3079     case Intrinsic::amdgcn_ds_gws_sema_release_all: {
3080       // Only the first lane executes, so readfirstlane is safe.
3081       constrainOpWithReadfirstlane(MI, MRI, 1); // M0
3082       return;
3083     }
3084     case Intrinsic::amdgcn_ds_append:
3085     case Intrinsic::amdgcn_ds_consume: {
3086       constrainOpWithReadfirstlane(MI, MRI, 2); // M0
3087       return;
3088     }
3089     case Intrinsic::amdgcn_s_sendmsg:
3090     case Intrinsic::amdgcn_s_sendmsghalt: {
3091       // FIXME: Should this use a waterfall loop?
3092       constrainOpWithReadfirstlane(MI, MRI, 2); // M0
3093       return;
3094     }
3095     case Intrinsic::amdgcn_s_setreg: {
3096       constrainOpWithReadfirstlane(MI, MRI, 2);
3097       return;
3098     }
3099     default: {
3100       if (const AMDGPU::RsrcIntrinsic *RSrcIntrin =
3101               AMDGPU::lookupRsrcIntrinsic(IntrID)) {
3102         // Non-images can have complications from operands that allow both SGPR
3103         // and VGPR. For now it's too complicated to figure out the final opcode
3104         // to derive the register bank from the MCInstrDesc.
3105         if (RSrcIntrin->IsImage) {
3106           applyMappingImage(MI, OpdMapper, MRI, RSrcIntrin->RsrcArg);
3107           return;
3108         }
3109       }
3110 
3111       break;
3112     }
3113     }
3114     break;
3115   }
3116   case AMDGPU::G_LOAD:
3117   case AMDGPU::G_ZEXTLOAD:
3118   case AMDGPU::G_SEXTLOAD: {
3119     if (applyMappingLoad(MI, OpdMapper, MRI))
3120       return;
3121     break;
3122   }
3123   case AMDGPU::G_DYN_STACKALLOC:
3124     applyMappingDynStackAlloc(MI, OpdMapper, MRI);
3125     return;
3126   default:
3127     break;
3128   }
3129 
3130   return applyDefaultMapping(OpdMapper);
3131 }
3132 
3133 // vgpr, sgpr -> vgpr
3134 // vgpr, agpr -> vgpr
3135 // agpr, agpr -> agpr
3136 // agpr, sgpr -> vgpr
3137 static unsigned regBankUnion(unsigned RB0, unsigned RB1) {
3138   if (RB0 == AMDGPU::InvalidRegBankID)
3139     return RB1;
3140   if (RB1 == AMDGPU::InvalidRegBankID)
3141     return RB0;
3142 
3143   if (RB0 == AMDGPU::SGPRRegBankID && RB1 == AMDGPU::SGPRRegBankID)
3144     return AMDGPU::SGPRRegBankID;
3145 
3146   if (RB0 == AMDGPU::AGPRRegBankID && RB1 == AMDGPU::AGPRRegBankID)
3147     return AMDGPU::AGPRRegBankID;
3148 
3149   return AMDGPU::VGPRRegBankID;
3150 }
3151 
3152 static unsigned regBankBoolUnion(unsigned RB0, unsigned RB1) {
3153   if (RB0 == AMDGPU::InvalidRegBankID)
3154     return RB1;
3155   if (RB1 == AMDGPU::InvalidRegBankID)
3156     return RB0;
3157 
3158   // vcc, vcc -> vcc
3159   // vcc, sgpr -> vcc
3160   // vcc, vgpr -> vcc
3161   if (RB0 == AMDGPU::VCCRegBankID || RB1 == AMDGPU::VCCRegBankID)
3162     return AMDGPU::VCCRegBankID;
3163 
3164   // Neither operand is vcc, so fall back to the plain bank union.
3165   return regBankUnion(RB0, RB1);
3166 }
3167 
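// Compute a common register bank for all register operands of \p MI by folding
// regBankUnion over them; operands without an assigned bank are ignored.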
3168 unsigned AMDGPURegisterBankInfo::getMappingType(const MachineRegisterInfo &MRI,
3169                                                 const MachineInstr &MI) const {
3170   unsigned RegBank = AMDGPU::InvalidRegBankID;
3171 
3172   for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
3173     if (!MI.getOperand(i).isReg())
3174       continue;
3175     Register Reg = MI.getOperand(i).getReg();
3176     if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) {
3177       RegBank = regBankUnion(RegBank, Bank->getID());
3178       if (RegBank == AMDGPU::VGPRRegBankID)
3179         break;
3180     }
3181   }
3182 
3183   return RegBank;
3184 }
3185 
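// Return true if no register operand of \p MI has been assigned a bank other
// than SGPR, i.e. the instruction can use a purely scalar (SALU) mapping.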
3186 bool AMDGPURegisterBankInfo::isSALUMapping(const MachineInstr &MI) const {
3187   const MachineFunction &MF = *MI.getParent()->getParent();
3188   const MachineRegisterInfo &MRI = MF.getRegInfo();
3189   for (unsigned i = 0, e = MI.getNumOperands();i != e; ++i) {
3190     if (!MI.getOperand(i).isReg())
3191       continue;
3192     Register Reg = MI.getOperand(i).getReg();
3193     if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) {
3194       if (Bank->getID() != AMDGPU::SGPRRegBankID)
3195         return false;
3196     }
3197   }
3198   return true;
3199 }
3200 
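// Map every register operand of \p MI to the SGPR bank at its natural size.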
3201 const RegisterBankInfo::InstructionMapping &
3202 AMDGPURegisterBankInfo::getDefaultMappingSOP(const MachineInstr &MI) const {
3203   const MachineFunction &MF = *MI.getParent()->getParent();
3204   const MachineRegisterInfo &MRI = MF.getRegInfo();
3205   SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
3206 
3207   for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
3208     const MachineOperand &SrcOp = MI.getOperand(i);
3209     if (!SrcOp.isReg())
3210       continue;
3211 
3212     unsigned Size = getSizeInBits(SrcOp.getReg(), MRI, *TRI);
3213     OpdsMapping[i] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
3214   }
3215   return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
3216                                MI.getNumOperands());
3217 }
3218 
3219 const RegisterBankInfo::InstructionMapping &
3220 AMDGPURegisterBankInfo::getDefaultMappingVOP(const MachineInstr &MI) const {
3221   const MachineFunction &MF = *MI.getParent()->getParent();
3222   const MachineRegisterInfo &MRI = MF.getRegInfo();
3223   SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
3224 
3225   // Even though we technically could use SGPRs, this would require knowledge of
3226   // the constant bus restriction. Force all sources to VGPR (except for VCC).
3227   //
3228   // TODO: Unary ops are trivially OK, so accept SGPRs?
3229   for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
3230     const MachineOperand &Src = MI.getOperand(i);
3231     if (!Src.isReg())
3232       continue;
3233 
3234     unsigned Size = getSizeInBits(Src.getReg(), MRI, *TRI);
3235     unsigned BankID = Size == 1 ? AMDGPU::VCCRegBankID : AMDGPU::VGPRRegBankID;
3236     OpdsMapping[i] = AMDGPU::getValueMapping(BankID, Size);
3237   }
3238 
3239   return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
3240                                MI.getNumOperands());
3241 }
3242 
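// Map every register operand of \p MI to the VGPR bank at its natural size.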
3243 const RegisterBankInfo::InstructionMapping &
3244 AMDGPURegisterBankInfo::getDefaultMappingAllVGPR(const MachineInstr &MI) const {
3245   const MachineFunction &MF = *MI.getParent()->getParent();
3246   const MachineRegisterInfo &MRI = MF.getRegInfo();
3247   SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
3248 
3249   for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
3250     const MachineOperand &Op = MI.getOperand(I);
3251     if (!Op.isReg())
3252       continue;
3253 
3254     unsigned Size = getSizeInBits(Op.getReg(), MRI, *TRI);
3255     OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3256   }
3257 
3258   return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
3259                                MI.getNumOperands());
3260 }
3261 
3262 const RegisterBankInfo::InstructionMapping &
3263 AMDGPURegisterBankInfo::getImageMapping(const MachineRegisterInfo &MRI,
3264                                         const MachineInstr &MI,
3265                                         int RsrcIdx) const {
3266   // The reported argument index is relative to the IR intrinsic call arguments,
3267   // so we need to shift by the number of defs and the intrinsic ID.
3268   RsrcIdx += MI.getNumExplicitDefs() + 1;
3269 
3270   const int NumOps = MI.getNumOperands();
3271   SmallVector<const ValueMapping *, 8> OpdsMapping(NumOps);
3272 
3273   // TODO: Should packed/unpacked D16 difference be reported here as part of
3274   // the value mapping?
3275   for (int I = 0; I != NumOps; ++I) {
3276     if (!MI.getOperand(I).isReg())
3277       continue;
3278 
3279     Register OpReg = MI.getOperand(I).getReg();
3280     // We replace some dead address operands with $noreg
3281     if (!OpReg)
3282       continue;
3283 
3284     unsigned Size = getSizeInBits(OpReg, MRI, *TRI);
3285 
3286     // FIXME: Probably need a new intrinsic register bank searchable table to
3287     // handle arbitrary intrinsics easily.
3288     //
3289     // If this has a sampler, it immediately follows rsrc.
3290     const bool MustBeSGPR = I == RsrcIdx || I == RsrcIdx + 1;
3291 
3292     if (MustBeSGPR) {
3293       // This must be an SGPR, but report its current bank as legal.
3294       unsigned NewBank = getRegBankID(OpReg, MRI, AMDGPU::SGPRRegBankID);
3295       OpdsMapping[I] = AMDGPU::getValueMapping(NewBank, Size);
3296     } else {
3297       // Some operands must be VGPR, and these are easy to copy to.
3298       OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3299     }
3300   }
3301 
3302   return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping), NumOps);
3303 }
3304 
3305 /// Return the mapping for a pointer argument.
3306 const RegisterBankInfo::ValueMapping *
3307 AMDGPURegisterBankInfo::getValueMappingForPtr(const MachineRegisterInfo &MRI,
3308                                               Register PtrReg) const {
3309   LLT PtrTy = MRI.getType(PtrReg);
3310   unsigned Size = PtrTy.getSizeInBits();
3311   if (Subtarget.useFlatForGlobal() ||
3312       !AMDGPU::isFlatGlobalAddrSpace(PtrTy.getAddressSpace()))
3313     return AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3314 
3315   // If we're using MUBUF instructions for global memory, an SGPR base register
3316   // is possible. Otherwise this needs to be a VGPR.
3317   const RegisterBank *PtrBank = getRegBank(PtrReg, MRI, *TRI);
3318   return AMDGPU::getValueMapping(PtrBank->getID(), Size);
3319 }
3320 
3321 const RegisterBankInfo::InstructionMapping &
3322 AMDGPURegisterBankInfo::getInstrMappingForLoad(const MachineInstr &MI) const {
3323 
3324   const MachineFunction &MF = *MI.getParent()->getParent();
3325   const MachineRegisterInfo &MRI = MF.getRegInfo();
3326   SmallVector<const ValueMapping*, 2> OpdsMapping(2);
3327   unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
3328   Register PtrReg = MI.getOperand(1).getReg();
3329   LLT PtrTy = MRI.getType(PtrReg);
3330   unsigned AS = PtrTy.getAddressSpace();
3331   unsigned PtrSize = PtrTy.getSizeInBits();
3332 
3333   const ValueMapping *ValMapping;
3334   const ValueMapping *PtrMapping;
3335 
3336   const RegisterBank *PtrBank = getRegBank(PtrReg, MRI, *TRI);
3337 
3338   if (PtrBank == &AMDGPU::SGPRRegBank && AMDGPU::isFlatGlobalAddrSpace(AS)) {
3339     if (isScalarLoadLegal(MI)) {
3340       // We have a uniform instruction so we want to use an SMRD load
3341       ValMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
3342       PtrMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize);
3343     } else {
3344       ValMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3345 
3346       // If we're using MUBUF instructions for global memory, an SGPR base
3347       // register is possible. Otherwise this needs to be a VGPR.
3348       unsigned PtrBankID = Subtarget.useFlatForGlobal() ?
3349         AMDGPU::VGPRRegBankID : AMDGPU::SGPRRegBankID;
3350 
3351       PtrMapping = AMDGPU::getValueMapping(PtrBankID, PtrSize);
3352     }
3353   } else {
3354     ValMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3355     PtrMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize);
3356   }
3357 
3358   OpdsMapping[0] = ValMapping;
3359   OpdsMapping[1] = PtrMapping;
3360   const RegisterBankInfo::InstructionMapping &Mapping = getInstructionMapping(
3361       1, 1, getOperandsMapping(OpdsMapping), MI.getNumOperands());
3362   return Mapping;
3363 
3364   // FIXME: Do we want to add a mapping for FLAT load, or should we just
3365   // handle that during instruction selection?
3366 }
3367 
3368 unsigned
3369 AMDGPURegisterBankInfo::getRegBankID(Register Reg,
3370                                      const MachineRegisterInfo &MRI,
3371                                      unsigned Default) const {
3372   const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
3373   return Bank ? Bank->getID() : Default;
3374 }
3375 
3376 const RegisterBankInfo::ValueMapping *
3377 AMDGPURegisterBankInfo::getSGPROpMapping(Register Reg,
3378                                          const MachineRegisterInfo &MRI,
3379                                          const TargetRegisterInfo &TRI) const {
3380   // Lie and claim anything is legal, even though this needs to be an SGPR;
3381   // applyMapping will have to deal with it as a waterfall loop.
3382   unsigned Bank = getRegBankID(Reg, MRI, AMDGPU::SGPRRegBankID);
3383   unsigned Size = getSizeInBits(Reg, MRI, TRI);
3384   return AMDGPU::getValueMapping(Bank, Size);
3385 }
3386 
3387 const RegisterBankInfo::ValueMapping *
3388 AMDGPURegisterBankInfo::getVGPROpMapping(Register Reg,
3389                                          const MachineRegisterInfo &MRI,
3390                                          const TargetRegisterInfo &TRI) const {
3391   unsigned Size = getSizeInBits(Reg, MRI, TRI);
3392   return AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3393 }
3394 
3395 const RegisterBankInfo::ValueMapping *
3396 AMDGPURegisterBankInfo::getAGPROpMapping(Register Reg,
3397                                          const MachineRegisterInfo &MRI,
3398                                          const TargetRegisterInfo &TRI) const {
3399   unsigned Size = getSizeInBits(Reg, MRI, TRI);
3400   return AMDGPU::getValueMapping(AMDGPU::AGPRRegBankID, Size);
3401 }
3402 
3403 ///
3404 /// This function must return a legal mapping, because
3405 /// AMDGPURegisterBankInfo::getInstrAlternativeMappings() is not called
3406 /// in RegBankSelect::Mode::Fast.  Any mapping that would cause a
3407 /// VGPR-to-SGPR copy to be generated is illegal.
3408 ///
3409 // Operands that must be SGPRs must accept potentially divergent VGPRs as
3410 // legal. These will be dealt with in applyMappingImpl.
3411 //
3412 const RegisterBankInfo::InstructionMapping &
3413 AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
3414   const MachineFunction &MF = *MI.getParent()->getParent();
3415   const MachineRegisterInfo &MRI = MF.getRegInfo();
3416 
3417   if (MI.isCopy() || MI.getOpcode() == AMDGPU::G_FREEZE) {
3418     // The default logic bothers to analyze impossible alternative mappings. We
3419     // want the most straightforward mapping, so just directly handle this.
3420     const RegisterBank *DstBank = getRegBank(MI.getOperand(0).getReg(), MRI,
3421                                              *TRI);
3422     const RegisterBank *SrcBank = getRegBank(MI.getOperand(1).getReg(), MRI,
3423                                              *TRI);
3424     assert(SrcBank && "src bank should have been assigned already");
3425     if (!DstBank)
3426       DstBank = SrcBank;
3427 
3428     unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
3429     if (cannotCopy(*DstBank, *SrcBank, Size))
3430       return getInvalidInstructionMapping();
3431 
3432     const ValueMapping &ValMap = getValueMapping(0, Size, *DstBank);
3433     unsigned OpdsMappingSize = MI.isCopy() ? 1 : 2;
3434     SmallVector<const ValueMapping *, 1> OpdsMapping(OpdsMappingSize);
3435     OpdsMapping[0] = &ValMap;
3436     if (MI.getOpcode() == AMDGPU::G_FREEZE)
3437       OpdsMapping[1] = &ValMap;
3438 
3439     return getInstructionMapping(
3440         1, /*Cost*/ 1,
3441         /*OperandsMapping*/ getOperandsMapping(OpdsMapping), OpdsMappingSize);
3442   }
3443 
3444   if (MI.isRegSequence()) {
3445     // If any input is a VGPR, the result must be a VGPR. The default handling
3446     // assumes any copy between banks is legal.
3447     unsigned BankID = AMDGPU::SGPRRegBankID;
3448 
3449     for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
3450       auto OpBank = getRegBankID(MI.getOperand(I).getReg(), MRI);
3451       // It doesn't make sense to use vcc or scc banks here, so just ignore
3452       // them.
3453       if (OpBank != AMDGPU::SGPRRegBankID) {
3454         BankID = AMDGPU::VGPRRegBankID;
3455         break;
3456       }
3457     }
3458     unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
3459 
3460     const ValueMapping &ValMap = getValueMapping(0, Size, getRegBank(BankID));
3461     return getInstructionMapping(
3462         1, /*Cost*/ 1,
3463         /*OperandsMapping*/ getOperandsMapping({&ValMap}), 1);
3464   }
3465 
3466   // The default handling is broken and doesn't handle illegal VGPR->SGPR
3467   // copies properly.
3468   //
3469   // TODO: There are additional exec masking dependencies to analyze.
3470   if (MI.getOpcode() == TargetOpcode::G_PHI) {
3471     unsigned ResultBank = AMDGPU::InvalidRegBankID;
3472     Register DstReg = MI.getOperand(0).getReg();
3473 
3474     // Sometimes the result may have already been assigned a bank.
3475     if (const RegisterBank *DstBank = getRegBank(DstReg, MRI, *TRI))
3476       ResultBank = DstBank->getID();
3477 
3478     for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
3479       Register Reg = MI.getOperand(I).getReg();
3480       const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
3481 
3482       // FIXME: Assuming VGPR for any undetermined inputs.
3483       if (!Bank || Bank->getID() == AMDGPU::VGPRRegBankID) {
3484         ResultBank = AMDGPU::VGPRRegBankID;
3485         break;
3486       }
3487 
3488       // FIXME: Need to promote SGPR case to s32
3489       unsigned OpBank = Bank->getID();
3490       ResultBank = regBankBoolUnion(ResultBank, OpBank);
3491     }
3492 
3493     assert(ResultBank != AMDGPU::InvalidRegBankID);
3494 
3495     unsigned Size = MRI.getType(DstReg).getSizeInBits();
3496 
3497     const ValueMapping &ValMap =
3498         getValueMapping(0, Size, getRegBank(ResultBank));
3499     return getInstructionMapping(
3500         1, /*Cost*/ 1,
3501         /*OperandsMapping*/ getOperandsMapping({&ValMap}), 1);
3502   }
3503 
3504   const RegisterBankInfo::InstructionMapping &Mapping = getInstrMappingImpl(MI);
3505   if (Mapping.isValid())
3506     return Mapping;
3507 
3508   SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
3509 
3510   switch (MI.getOpcode()) {
3511   default:
3512     return getInvalidInstructionMapping();
3513 
3514   case AMDGPU::G_AND:
3515   case AMDGPU::G_OR:
3516   case AMDGPU::G_XOR: {
3517     unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3518     if (Size == 1) {
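      // 1-bit logic ops: decide between a vcc boolean and a scalar (SGPR)
      // boolean based on the banks already assigned to the result and inputs.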
3519       const RegisterBank *DstBank
3520         = getRegBank(MI.getOperand(0).getReg(), MRI, *TRI);
3521 
3522       unsigned TargetBankID = AMDGPU::InvalidRegBankID;
3523       unsigned BankLHS = AMDGPU::InvalidRegBankID;
3524       unsigned BankRHS = AMDGPU::InvalidRegBankID;
3525       if (DstBank) {
3526         TargetBankID = DstBank->getID();
3527         if (DstBank == &AMDGPU::VCCRegBank) {
3528           TargetBankID = AMDGPU::VCCRegBankID;
3529           BankLHS = AMDGPU::VCCRegBankID;
3530           BankRHS = AMDGPU::VCCRegBankID;
3531         } else {
3532           BankLHS = getRegBankID(MI.getOperand(1).getReg(), MRI,
3533                                  AMDGPU::SGPRRegBankID);
3534           BankRHS = getRegBankID(MI.getOperand(2).getReg(), MRI,
3535                                  AMDGPU::SGPRRegBankID);
3536         }
3537       } else {
3538         BankLHS = getRegBankID(MI.getOperand(1).getReg(), MRI,
3539                                AMDGPU::VCCRegBankID);
3540         BankRHS = getRegBankID(MI.getOperand(2).getReg(), MRI,
3541                                AMDGPU::VCCRegBankID);
3542 
3543         // Both inputs should be true booleans to produce a boolean result.
3544         if (BankLHS == AMDGPU::VGPRRegBankID || BankRHS == AMDGPU::VGPRRegBankID) {
3545           TargetBankID = AMDGPU::VGPRRegBankID;
3546         } else if (BankLHS == AMDGPU::VCCRegBankID || BankRHS == AMDGPU::VCCRegBankID) {
3547           TargetBankID = AMDGPU::VCCRegBankID;
3548           BankLHS = AMDGPU::VCCRegBankID;
3549           BankRHS = AMDGPU::VCCRegBankID;
3550         } else if (BankLHS == AMDGPU::SGPRRegBankID && BankRHS == AMDGPU::SGPRRegBankID) {
3551           TargetBankID = AMDGPU::SGPRRegBankID;
3552         }
3553       }
3554 
3555       OpdsMapping[0] = AMDGPU::getValueMapping(TargetBankID, Size);
3556       OpdsMapping[1] = AMDGPU::getValueMapping(BankLHS, Size);
3557       OpdsMapping[2] = AMDGPU::getValueMapping(BankRHS, Size);
3558       break;
3559     }
3560 
3561     if (Size == 64) {
3562 
3563       if (isSALUMapping(MI)) {
3564         OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::SGPRRegBankID, Size);
3565         OpdsMapping[1] = OpdsMapping[2] = OpdsMapping[0];
3566       } else {
3567         OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size);
3568         unsigned Bank1 = getRegBankID(MI.getOperand(1).getReg(), MRI /*, DefaultBankID*/);
3569         OpdsMapping[1] = AMDGPU::getValueMapping(Bank1, Size);
3570 
3571         unsigned Bank2 = getRegBankID(MI.getOperand(2).getReg(), MRI /*, DefaultBankID*/);
3572         OpdsMapping[2] = AMDGPU::getValueMapping(Bank2, Size);
3573       }
3574 
3575       break;
3576     }
3577 
3578     LLVM_FALLTHROUGH;
3579   }
3580   case AMDGPU::G_PTR_ADD:
3581   case AMDGPU::G_PTRMASK:
3582   case AMDGPU::G_ADD:
3583   case AMDGPU::G_SUB:
3584   case AMDGPU::G_MUL:
3585   case AMDGPU::G_SHL:
3586   case AMDGPU::G_LSHR:
3587   case AMDGPU::G_ASHR:
3588   case AMDGPU::G_UADDO:
3589   case AMDGPU::G_USUBO:
3590   case AMDGPU::G_UADDE:
3591   case AMDGPU::G_SADDE:
3592   case AMDGPU::G_USUBE:
3593   case AMDGPU::G_SSUBE:
3594   case AMDGPU::G_SMIN:
3595   case AMDGPU::G_SMAX:
3596   case AMDGPU::G_UMIN:
3597   case AMDGPU::G_UMAX:
3598   case AMDGPU::G_SHUFFLE_VECTOR:
3599     if (isSALUMapping(MI))
3600       return getDefaultMappingSOP(MI);
3601     LLVM_FALLTHROUGH;
3602 
3603   case AMDGPU::G_SADDSAT: // FIXME: Could lower sat ops for SALU
3604   case AMDGPU::G_SSUBSAT:
3605   case AMDGPU::G_UADDSAT:
3606   case AMDGPU::G_USUBSAT:
3607   case AMDGPU::G_FADD:
3608   case AMDGPU::G_FSUB:
3609   case AMDGPU::G_FPTOSI:
3610   case AMDGPU::G_FPTOUI:
3611   case AMDGPU::G_FMUL:
3612   case AMDGPU::G_FMA:
3613   case AMDGPU::G_FMAD:
3614   case AMDGPU::G_FSQRT:
3615   case AMDGPU::G_FFLOOR:
3616   case AMDGPU::G_FCEIL:
3617   case AMDGPU::G_FRINT:
3618   case AMDGPU::G_SITOFP:
3619   case AMDGPU::G_UITOFP:
3620   case AMDGPU::G_FPTRUNC:
3621   case AMDGPU::G_FPEXT:
3622   case AMDGPU::G_FEXP2:
3623   case AMDGPU::G_FLOG2:
3624   case AMDGPU::G_FMINNUM:
3625   case AMDGPU::G_FMAXNUM:
3626   case AMDGPU::G_FMINNUM_IEEE:
3627   case AMDGPU::G_FMAXNUM_IEEE:
3628   case AMDGPU::G_FCANONICALIZE:
3629   case AMDGPU::G_INTRINSIC_TRUNC:
3630   case AMDGPU::G_BSWAP: // TODO: Somehow expand for scalar?
3631   case AMDGPU::G_FSHR: // TODO: Expand for scalar
3632   case AMDGPU::G_AMDGPU_FFBH_U32:
3633   case AMDGPU::G_AMDGPU_FMIN_LEGACY:
3634   case AMDGPU::G_AMDGPU_FMAX_LEGACY:
3635   case AMDGPU::G_AMDGPU_RCP_IFLAG:
3636   case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0:
3637   case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1:
3638   case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2:
3639   case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
3640     return getDefaultMappingVOP(MI);
3641   case AMDGPU::G_UMULH:
3642   case AMDGPU::G_SMULH: {
3643     if (Subtarget.hasScalarMulHiInsts() && isSALUMapping(MI))
3644       return getDefaultMappingSOP(MI);
3645     return getDefaultMappingVOP(MI);
3646   }
3647   case AMDGPU::G_IMPLICIT_DEF: {
3648     unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3649     OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
3650     break;
3651   }
3652   case AMDGPU::G_FCONSTANT:
3653   case AMDGPU::G_CONSTANT:
3654   case AMDGPU::G_GLOBAL_VALUE:
3655   case AMDGPU::G_BLOCK_ADDR:
3656   case AMDGPU::G_READCYCLECOUNTER: {
3657     unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3658     OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
3659     break;
3660   }
3661   case AMDGPU::G_FRAME_INDEX: {
3662     // TODO: This should be the same as other constants, but eliminateFrameIndex
3663     // currently assumes VALU uses.
3664     unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3665     OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3666     break;
3667   }
3668   case AMDGPU::G_DYN_STACKALLOC: {
3669     // Result is always uniform, and a wave reduction is needed for the source.
3670     OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
3671     unsigned SrcBankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
3672     OpdsMapping[1] = AMDGPU::getValueMapping(SrcBankID, 32);
3673     break;
3674   }
3675   case AMDGPU::G_INSERT: {
3676     unsigned BankID = getMappingType(MRI, MI);
3677     unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
3678     unsigned SrcSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
3679     unsigned EltSize = getSizeInBits(MI.getOperand(2).getReg(), MRI, *TRI);
3680     OpdsMapping[0] = AMDGPU::getValueMapping(BankID, DstSize);
3681     OpdsMapping[1] = AMDGPU::getValueMapping(BankID, SrcSize);
3682     OpdsMapping[2] = AMDGPU::getValueMapping(BankID, EltSize);
3683     OpdsMapping[3] = nullptr;
3684     break;
3685   }
3686   case AMDGPU::G_EXTRACT: {
3687     unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
3688     unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
3689     unsigned SrcSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
3690     OpdsMapping[0] = AMDGPU::getValueMapping(BankID, DstSize);
3691     OpdsMapping[1] = AMDGPU::getValueMapping(BankID, SrcSize);
3692     OpdsMapping[2] = nullptr;
3693     break;
3694   }
3695   case AMDGPU::G_BUILD_VECTOR:
3696   case AMDGPU::G_BUILD_VECTOR_TRUNC: {
3697     LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
3698     if (DstTy == LLT::vector(2, 16)) {
3699       unsigned DstSize = DstTy.getSizeInBits();
3700       unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
3701       unsigned Src0BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
3702       unsigned Src1BankID = getRegBankID(MI.getOperand(2).getReg(), MRI);
3703       unsigned DstBankID = regBankUnion(Src0BankID, Src1BankID);
3704 
3705       OpdsMapping[0] = AMDGPU::getValueMapping(DstBankID, DstSize);
3706       OpdsMapping[1] = AMDGPU::getValueMapping(Src0BankID, SrcSize);
3707       OpdsMapping[2] = AMDGPU::getValueMapping(Src1BankID, SrcSize);
3708       break;
3709     }
3710 
3711     LLVM_FALLTHROUGH;
3712   }
3713   case AMDGPU::G_MERGE_VALUES:
3714   case AMDGPU::G_CONCAT_VECTORS: {
3715     unsigned Bank = getMappingType(MRI, MI);
3716     unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3717     unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
3718 
3719     OpdsMapping[0] = AMDGPU::getValueMapping(Bank, DstSize);
3720     // Op1 and Dst should use the same register bank.
3721     for (unsigned i = 1, e = MI.getNumOperands(); i != e; ++i)
3722       OpdsMapping[i] = AMDGPU::getValueMapping(Bank, SrcSize);
3723     break;
3724   }
3725   case AMDGPU::G_BITCAST:
3726   case AMDGPU::G_INTTOPTR:
3727   case AMDGPU::G_PTRTOINT:
3728   case AMDGPU::G_BITREVERSE:
3729   case AMDGPU::G_FABS:
3730   case AMDGPU::G_FNEG: {
3731     unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3732     unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
3733     OpdsMapping[0] = OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size);
3734     break;
3735   }
3736   case AMDGPU::G_CTLZ_ZERO_UNDEF:
3737   case AMDGPU::G_CTTZ_ZERO_UNDEF:
3738   case AMDGPU::G_CTPOP: {
3739     unsigned Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
3740     unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
3741     OpdsMapping[0] = AMDGPU::getValueMapping(BankID, 32);
3742 
3743     // This should really be getValueMappingSGPR64Only, but allowing the generic
3744     // code to handle the register split just makes using LegalizerHelper more
3745     // difficult.
3746     OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size);
3747     break;
3748   }
3749   case AMDGPU::G_TRUNC: {
3750     Register Dst = MI.getOperand(0).getReg();
3751     Register Src = MI.getOperand(1).getReg();
3752     unsigned Bank = getRegBankID(Src, MRI);
3753     unsigned DstSize = getSizeInBits(Dst, MRI, *TRI);
3754     unsigned SrcSize = getSizeInBits(Src, MRI, *TRI);
3755     OpdsMapping[0] = AMDGPU::getValueMapping(Bank, DstSize);
3756     OpdsMapping[1] = AMDGPU::getValueMapping(Bank, SrcSize);
3757     break;
3758   }
3759   case AMDGPU::G_ZEXT:
3760   case AMDGPU::G_SEXT:
3761   case AMDGPU::G_ANYEXT:
3762   case AMDGPU::G_SEXT_INREG: {
3763     Register Dst = MI.getOperand(0).getReg();
3764     Register Src = MI.getOperand(1).getReg();
3765     unsigned DstSize = getSizeInBits(Dst, MRI, *TRI);
3766     unsigned SrcSize = getSizeInBits(Src, MRI, *TRI);
3767 
3768     unsigned DstBank;
3769     const RegisterBank *SrcBank = getRegBank(Src, MRI, *TRI);
3770     assert(SrcBank);
3771     switch (SrcBank->getID()) {
3772     case AMDGPU::SGPRRegBankID:
3773       DstBank = AMDGPU::SGPRRegBankID;
3774       break;
3775     default:
3776       DstBank = AMDGPU::VGPRRegBankID;
3777       break;
3778     }
3779 
3780     // Scalar extend can use 64-bit BFE, but VGPRs require extending to
3781     // 32-bits, and then to 64.
3782     OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(DstBank, DstSize);
3783     OpdsMapping[1] = AMDGPU::getValueMappingSGPR64Only(SrcBank->getID(),
3784                                                        SrcSize);
3785     break;
3786   }
3787   case AMDGPU::G_FCMP: {
3788     unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
3789     unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI);
3790     OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
3791     OpdsMapping[1] = nullptr; // Predicate Operand.
3792     OpdsMapping[2] = AMDGPU::getValueMapping(Op2Bank, Size);
3793     OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3794     break;
3795   }
3796   case AMDGPU::G_STORE: {
3797     assert(MI.getOperand(0).isReg());
3798     unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3799 
3800     // FIXME: We need to specify a different reg bank once scalar stores are
3801     // supported.
3802     const ValueMapping *ValMapping =
3803         AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3804     OpdsMapping[0] = ValMapping;
3805     OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg());
3806     break;
3807   }
3808   case AMDGPU::G_ICMP: {
3809     auto Pred = static_cast<CmpInst::Predicate>(MI.getOperand(1).getPredicate());
3810     unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
3811 
3812     // See if the result register has already been constrained to vcc, which may
3813     // happen due to control flow intrinsic lowering.
3814     unsigned DstBank = getRegBankID(MI.getOperand(0).getReg(), MRI,
3815                                     AMDGPU::SGPRRegBankID);
3816     unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI);
3817     unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI);
3818 
3819     bool CanUseSCC = DstBank == AMDGPU::SGPRRegBankID &&
3820                      Op2Bank == AMDGPU::SGPRRegBankID &&
3821                      Op3Bank == AMDGPU::SGPRRegBankID &&
3822       (Size == 32 || (Size == 64 &&
3823                       (Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE) &&
3824                       Subtarget.hasScalarCompareEq64()));
3825 
3826     DstBank = CanUseSCC ? AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID;
3827     unsigned SrcBank = CanUseSCC ? AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
3828 
3829     // TODO: Use 32-bit for scalar output size.
3830     // SCC results will need to be copied to a 32-bit SGPR virtual register.
3831     const unsigned ResultSize = 1;
3832 
3833     OpdsMapping[0] = AMDGPU::getValueMapping(DstBank, ResultSize);
3834     OpdsMapping[2] = AMDGPU::getValueMapping(SrcBank, Size);
3835     OpdsMapping[3] = AMDGPU::getValueMapping(SrcBank, Size);
3836     break;
3837   }
3838   case AMDGPU::G_EXTRACT_VECTOR_ELT: {
3839     // VGPR index can be used for waterfall when indexing an SGPR vector.
3840     unsigned SrcBankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
3841     unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3842     unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
3843     unsigned IdxSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
3844     unsigned IdxBank = getRegBankID(MI.getOperand(2).getReg(), MRI);
3845     unsigned OutputBankID = regBankUnion(SrcBankID, IdxBank);
3846 
3847     OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(OutputBankID, DstSize);
3848     OpdsMapping[1] = AMDGPU::getValueMapping(SrcBankID, SrcSize);
3849 
3850     // The index can be in either bank if the source vector is a VGPR.
3851     OpdsMapping[2] = AMDGPU::getValueMapping(IdxBank, IdxSize);
3852     break;
3853   }
3854   case AMDGPU::G_INSERT_VECTOR_ELT: {
3855     unsigned OutputBankID = isSALUMapping(MI) ?
3856       AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
3857 
3858     unsigned VecSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3859     unsigned InsertSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
3860     unsigned IdxSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits();
3861     unsigned InsertEltBankID = getRegBankID(MI.getOperand(2).getReg(), MRI);
3862     unsigned IdxBankID = getRegBankID(MI.getOperand(3).getReg(), MRI);
3863 
3864     OpdsMapping[0] = AMDGPU::getValueMapping(OutputBankID, VecSize);
3865     OpdsMapping[1] = AMDGPU::getValueMapping(OutputBankID, VecSize);
3866 
3867     // This is a weird case, because we need to break down the mapping based on
3868     // the register bank of a different operand.
3869     if (InsertSize == 64 && OutputBankID == AMDGPU::VGPRRegBankID) {
3870       OpdsMapping[2] = AMDGPU::getValueMappingSplit64(InsertEltBankID,
3871                                                       InsertSize);
3872     } else {
3873       assert(InsertSize == 32 || InsertSize == 64);
3874       OpdsMapping[2] = AMDGPU::getValueMapping(InsertEltBankID, InsertSize);
3875     }
3876 
3877     // The index can be in either bank if the source vector is a VGPR.
3878     OpdsMapping[3] = AMDGPU::getValueMapping(IdxBankID, IdxSize);
3879     break;
3880   }
3881   case AMDGPU::G_UNMERGE_VALUES: {
3882     unsigned Bank = getMappingType(MRI, MI);
3883 
3884     // Op1 and Dst should use the same register bank.
3885     // FIXME: Shouldn't this be the default? Why do we need to handle this?
3886     for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
3887       unsigned Size = getSizeInBits(MI.getOperand(i).getReg(), MRI, *TRI);
3888       OpdsMapping[i] = AMDGPU::getValueMapping(Bank, Size);
3889     }
3890     break;
3891   }
3892   case AMDGPU::G_AMDGPU_BUFFER_LOAD:
3893   case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
3894   case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE:
3895   case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
3896   case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT:
3897   case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT:
3898   case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16:
3899   case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT:
3900   case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16:
3901   case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT:
3902   case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16:
3903   case AMDGPU::G_AMDGPU_BUFFER_STORE:
3904   case AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE:
3905   case AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT:
3906   case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT:
3907   case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16: {
3908     OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
3909 
3910     // rsrc
3911     OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
3912 
3913     // vindex
3914     OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
3915 
3916     // voffset
3917     OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
3918 
3919     // soffset
3920     OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
3921 
3922     // Any remaining operands are immediates and were correctly null
3923     // initialized.
3924     break;
3925   }
3926   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP:
3927   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD:
3928   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB:
3929   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN:
3930   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN:
3931   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX:
3932   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX:
3933   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND:
3934   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR:
3935   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR:
3936   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC:
3937   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC:
3938   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD: {
3939     // vdata_out
3940     OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
3941 
3942     // vdata_in
3943     OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
3944 
3945     // rsrc
3946     OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
3947 
3948     // vindex
3949     OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
3950 
3951     // voffset
3952     OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
3953 
3954     // soffset
3955     OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
3956 
3957     // Any remaining operands are immediates and were correctly null
3958     // initialized.
3959     break;
3960   }
3961   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP: {
3962     // vdata_out
3963     OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
3964 
3965     // vdata_in
3966     OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
3967 
3968     // cmp
3969     OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
3970 
3971     // rsrc
3972     OpdsMapping[3] = getSGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
3973 
3974     // vindex
3975     OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
3976 
3977     // voffset
3978     OpdsMapping[5] = getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
3979 
3980     // soffset
3981     OpdsMapping[6] = getSGPROpMapping(MI.getOperand(6).getReg(), MRI, *TRI);
3982 
3983     // Any remaining operands are immediates and were correctly null
3984     // initialized.
3985     break;
3986   }
3987   case AMDGPU::G_AMDGPU_S_BUFFER_LOAD: {
3988     // Lie and claim everything is legal, even though some need to be
3989     // SGPRs. applyMapping will have to deal with it as a waterfall loop.
3990     OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
3991     OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
3992 
3993     // We need to convert this to a MUBUF if either the resource or offset is
3994     // a VGPR.
    unsigned RSrcBank = OpdsMapping[1]->BreakDown[0].RegBank->getID();
    unsigned OffsetBank = OpdsMapping[2]->BreakDown[0].RegBank->getID();
    unsigned ResultBank = regBankUnion(RSrcBank, OffsetBank);

    unsigned Size0 = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    OpdsMapping[0] = AMDGPU::getValueMapping(ResultBank, Size0);
    break;
  }
  case AMDGPU::G_INTRINSIC: {
    switch (MI.getIntrinsicID()) {
    default:
      return getInvalidInstructionMapping();
    case Intrinsic::amdgcn_div_fmas:
    case Intrinsic::amdgcn_div_fixup:
    case Intrinsic::amdgcn_trig_preop:
    case Intrinsic::amdgcn_sin:
    case Intrinsic::amdgcn_cos:
    case Intrinsic::amdgcn_log_clamp:
    case Intrinsic::amdgcn_rcp:
    case Intrinsic::amdgcn_rcp_legacy:
    case Intrinsic::amdgcn_sqrt:
    case Intrinsic::amdgcn_rsq:
    case Intrinsic::amdgcn_rsq_legacy:
    case Intrinsic::amdgcn_rsq_clamp:
    case Intrinsic::amdgcn_fmul_legacy:
    case Intrinsic::amdgcn_fma_legacy:
    case Intrinsic::amdgcn_ldexp:
    case Intrinsic::amdgcn_frexp_mant:
    case Intrinsic::amdgcn_frexp_exp:
    case Intrinsic::amdgcn_fract:
    case Intrinsic::amdgcn_cvt_pkrtz:
    case Intrinsic::amdgcn_cvt_pknorm_i16:
    case Intrinsic::amdgcn_cvt_pknorm_u16:
    case Intrinsic::amdgcn_cvt_pk_i16:
    case Intrinsic::amdgcn_cvt_pk_u16:
    case Intrinsic::amdgcn_fmed3:
    case Intrinsic::amdgcn_cubeid:
    case Intrinsic::amdgcn_cubema:
    case Intrinsic::amdgcn_cubesc:
    case Intrinsic::amdgcn_cubetc:
    case Intrinsic::amdgcn_sffbh:
    case Intrinsic::amdgcn_fmad_ftz:
    case Intrinsic::amdgcn_mbcnt_lo:
    case Intrinsic::amdgcn_mbcnt_hi:
    case Intrinsic::amdgcn_mul_u24:
    case Intrinsic::amdgcn_mul_i24:
    case Intrinsic::amdgcn_lerp:
    case Intrinsic::amdgcn_sad_u8:
    case Intrinsic::amdgcn_msad_u8:
    case Intrinsic::amdgcn_sad_hi_u8:
    case Intrinsic::amdgcn_sad_u16:
    case Intrinsic::amdgcn_qsad_pk_u16_u8:
    case Intrinsic::amdgcn_mqsad_pk_u16_u8:
    case Intrinsic::amdgcn_mqsad_u32_u8:
    case Intrinsic::amdgcn_cvt_pk_u8_f32:
    case Intrinsic::amdgcn_alignbit:
    case Intrinsic::amdgcn_alignbyte:
    case Intrinsic::amdgcn_fdot2:
    case Intrinsic::amdgcn_sdot2:
    case Intrinsic::amdgcn_udot2:
    case Intrinsic::amdgcn_sdot4:
    case Intrinsic::amdgcn_udot4:
    case Intrinsic::amdgcn_sdot8:
    case Intrinsic::amdgcn_udot8:
      return getDefaultMappingVOP(MI);
    case Intrinsic::amdgcn_sbfe:
    case Intrinsic::amdgcn_ubfe:
      if (isSALUMapping(MI))
        return getDefaultMappingSOP(MI);
      return getDefaultMappingVOP(MI);
    case Intrinsic::amdgcn_ds_swizzle:
    case Intrinsic::amdgcn_ds_permute:
    case Intrinsic::amdgcn_ds_bpermute:
    case Intrinsic::amdgcn_update_dpp:
    case Intrinsic::amdgcn_mov_dpp8:
    case Intrinsic::amdgcn_mov_dpp:
    case Intrinsic::amdgcn_wwm:
    case Intrinsic::amdgcn_wqm:
    case Intrinsic::amdgcn_softwqm:
    case Intrinsic::amdgcn_set_inactive:
      return getDefaultMappingAllVGPR(MI);
    case Intrinsic::amdgcn_kernarg_segment_ptr:
    case Intrinsic::amdgcn_s_getpc:
    case Intrinsic::amdgcn_groupstaticsize:
    case Intrinsic::amdgcn_reloc_constant:
    case Intrinsic::returnaddress: {
      unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
      break;
    }
    case Intrinsic::amdgcn_wqm_vote: {
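      // Both the result and the source are divergent booleans, so use the VCC
      // bank for each.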
      unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      OpdsMapping[0] = OpdsMapping[2]
        = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size);
      break;
    }
    case Intrinsic::amdgcn_ps_live: {
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
      break;
    }
    case Intrinsic::amdgcn_div_scale: {
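      // The first result is the scaled value (VGPR); the second is a
      // lane-wise boolean flag (VCC).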
      unsigned Dst0Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      unsigned Dst1Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Dst0Size);
      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Dst1Size);

      unsigned SrcSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits();
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
      OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
      break;
    }
    case Intrinsic::amdgcn_class: {
      Register Src0Reg = MI.getOperand(2).getReg();
      Register Src1Reg = MI.getOperand(3).getReg();
      unsigned Src0Size = MRI.getType(Src0Reg).getSizeInBits();
      unsigned Src1Size = MRI.getType(Src1Reg).getSizeInBits();
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, DstSize);
      OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Src0Size);
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Src1Size);
      break;
    }
    case Intrinsic::amdgcn_icmp:
    case Intrinsic::amdgcn_fcmp: {
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      // This is not VCCRegBank because the result is not used in a boolean
      // context.
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
      unsigned OpSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
      OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, OpSize);
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, OpSize);
      break;
    }
    case Intrinsic::amdgcn_readlane: {
      // This must be an SGPR, but accept a VGPR.
      Register IdxReg = MI.getOperand(3).getReg();
      unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits();
      unsigned IdxBank = getRegBankID(IdxReg, MRI, AMDGPU::SGPRRegBankID);
      OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize);
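      // The dst and source operands are mapped the same way as for
      // readfirstlane, so fall through.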
      LLVM_FALLTHROUGH;
    }
    case Intrinsic::amdgcn_readfirstlane: {
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      unsigned SrcSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
      OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
      break;
    }
    case Intrinsic::amdgcn_writelane: {
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      Register SrcReg = MI.getOperand(2).getReg();
      unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits();
      unsigned SrcBank = getRegBankID(SrcReg, MRI, AMDGPU::SGPRRegBankID);
      Register IdxReg = MI.getOperand(3).getReg();
      unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits();
      unsigned IdxBank = getRegBankID(IdxReg, MRI, AMDGPU::SGPRRegBankID);
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);

      // These two operands must be SGPRs, but VGPRs are accepted; a
      // readfirstlane will be inserted later to legalize them.
      OpdsMapping[2] = AMDGPU::getValueMapping(SrcBank, SrcSize);
      OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize);
      OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
      break;
    }
    case Intrinsic::amdgcn_if_break: {
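      // The result and the incoming mask are wave-sized scalars; the
      // condition is a divergent boolean.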
      unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
      OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
      break;
    }
    case Intrinsic::amdgcn_permlane16:
    case Intrinsic::amdgcn_permlanex16: {
      unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
      OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
      OpdsMapping[4] = getSGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
      OpdsMapping[5] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_mfma_f32_4x4x1f32:
    case Intrinsic::amdgcn_mfma_f32_4x4x4f16:
    case Intrinsic::amdgcn_mfma_i32_4x4x4i8:
    case Intrinsic::amdgcn_mfma_f32_4x4x2bf16:
    case Intrinsic::amdgcn_mfma_f32_16x16x1f32:
    case Intrinsic::amdgcn_mfma_f32_16x16x4f32:
    case Intrinsic::amdgcn_mfma_f32_16x16x4f16:
    case Intrinsic::amdgcn_mfma_f32_16x16x16f16:
    case Intrinsic::amdgcn_mfma_i32_16x16x4i8:
    case Intrinsic::amdgcn_mfma_i32_16x16x16i8:
    case Intrinsic::amdgcn_mfma_f32_16x16x2bf16:
    case Intrinsic::amdgcn_mfma_f32_16x16x8bf16:
    case Intrinsic::amdgcn_mfma_f32_32x32x1f32:
    case Intrinsic::amdgcn_mfma_f32_32x32x2f32:
    case Intrinsic::amdgcn_mfma_f32_32x32x4f16:
    case Intrinsic::amdgcn_mfma_f32_32x32x8f16:
    case Intrinsic::amdgcn_mfma_i32_32x32x4i8:
    case Intrinsic::amdgcn_mfma_i32_32x32x8i8:
    case Intrinsic::amdgcn_mfma_f32_32x32x2bf16:
    case Intrinsic::amdgcn_mfma_f32_32x32x4bf16: {
      // Default for MAI intrinsics.
      // srcC can also be an immediate which can be folded later.
      // FIXME: Should we eventually add an alternative mapping with AGPR src
      // for srcA/srcB?
      //
      // vdst, srcA, srcB, srcC
      OpdsMapping[0] = getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
      OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
      OpdsMapping[4] = getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_interp_p1:
    case Intrinsic::amdgcn_interp_p2:
    case Intrinsic::amdgcn_interp_mov:
    case Intrinsic::amdgcn_interp_p1_f16:
    case Intrinsic::amdgcn_interp_p2_f16: {
      const int M0Idx = MI.getNumOperands() - 1;
      Register M0Reg = MI.getOperand(M0Idx).getReg();
      unsigned M0Bank = getRegBankID(M0Reg, MRI, AMDGPU::SGPRRegBankID);
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();

      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
      for (int I = 2; I != M0Idx && MI.getOperand(I).isReg(); ++I)
        OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);

      // This must be an SGPR, but we take whatever the original bank is and
      // fix it later.
      OpdsMapping[M0Idx] = AMDGPU::getValueMapping(M0Bank, 32);
      break;
    }
    case Intrinsic::amdgcn_ballot: {
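      // The result is a wave-wide mask held in SGPRs; the source is treated
      // as a VCC boolean.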
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      unsigned SrcSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
      OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, SrcSize);
      break;
    }
    }
    break;
  }
  case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
  case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE: {
    auto IntrID = MI.getIntrinsicID();
    const AMDGPU::RsrcIntrinsic *RSrcIntrin = AMDGPU::lookupRsrcIntrinsic(IntrID);
    assert(RSrcIntrin && "missing RsrcIntrinsic for image intrinsic");
    // Non-images can have complications from operands that allow both SGPR
    // and VGPR. For now it's too complicated to figure out the final opcode
    // to derive the register bank from the MCInstrDesc.
    assert(RSrcIntrin->IsImage);
    return getImageMapping(MRI, MI, RSrcIntrin->RsrcArg);
  }
  case AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY: {
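    // The 128-bit result and the ray inputs are VGPRs; only the descriptor
    // operand (index N) must be an SGPR.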
    unsigned N = MI.getNumExplicitOperands() - 2;
    OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 128);
    OpdsMapping[N] = getSGPROpMapping(MI.getOperand(N).getReg(), MRI, *TRI);
    for (unsigned I = 2; I < N; ++I)
      OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
    break;
  }
  case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: {
    auto IntrID = MI.getIntrinsicID();
    switch (IntrID) {
    case Intrinsic::amdgcn_s_getreg:
    case Intrinsic::amdgcn_s_memtime:
    case Intrinsic::amdgcn_s_memrealtime:
    case Intrinsic::amdgcn_s_get_waveid_in_workgroup: {
      unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
      break;
    }
    case Intrinsic::amdgcn_global_atomic_fadd:
    case Intrinsic::amdgcn_global_atomic_csub:
      return getDefaultMappingAllVGPR(MI);
    case Intrinsic::amdgcn_ds_ordered_add:
    case Intrinsic::amdgcn_ds_ordered_swap: {
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
      unsigned M0Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
                                     AMDGPU::SGPRRegBankID);
      OpdsMapping[2] = AMDGPU::getValueMapping(M0Bank, 32);
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      break;
    }
    case Intrinsic::amdgcn_ds_append:
    case Intrinsic::amdgcn_ds_consume: {
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_exp_compr:
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      break;
    case Intrinsic::amdgcn_exp:
      // FIXME: Could we support packed types here?
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      OpdsMapping[5] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      OpdsMapping[6] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      break;
    case Intrinsic::amdgcn_s_sendmsg:
    case Intrinsic::amdgcn_s_sendmsghalt: {
      // This must be an SGPR, but accept a VGPR.
      unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
                                   AMDGPU::SGPRRegBankID);
      OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
      break;
    }
    case Intrinsic::amdgcn_s_setreg: {
      // This must be an SGPR, but accept a VGPR.
      unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
                                   AMDGPU::SGPRRegBankID);
      OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
      break;
    }
    case Intrinsic::amdgcn_end_cf: {
      unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
      break;
    }
    case Intrinsic::amdgcn_else: {
      unsigned WaveSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize);
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize);
      break;
    }
    case Intrinsic::amdgcn_kill: {
      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
      break;
    }
    case Intrinsic::amdgcn_raw_buffer_load:
    case Intrinsic::amdgcn_raw_tbuffer_load: {
      // FIXME: Should make the intrinsic ID the last operand of the
      // instruction; then this would be the same as the store case.
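      // Operands: vdata (result), rsrc, voffset, soffset.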
      OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
      OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_raw_buffer_store:
    case Intrinsic::amdgcn_raw_buffer_store_format:
    case Intrinsic::amdgcn_raw_tbuffer_store: {
      OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
      OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_struct_buffer_load:
    case Intrinsic::amdgcn_struct_tbuffer_load: {
      OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
      OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
      OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_struct_buffer_store:
    case Intrinsic::amdgcn_struct_tbuffer_store: {
      OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
      OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
      OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_init_exec_from_input: {
      unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
      break;
    }
    case Intrinsic::amdgcn_ds_gws_init:
    case Intrinsic::amdgcn_ds_gws_barrier:
    case Intrinsic::amdgcn_ds_gws_sema_br: {
      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);

      // This must be an SGPR, but accept a VGPR.
      unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
                                   AMDGPU::SGPRRegBankID);
      OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
      break;
    }
    case Intrinsic::amdgcn_ds_gws_sema_v:
    case Intrinsic::amdgcn_ds_gws_sema_p:
    case Intrinsic::amdgcn_ds_gws_sema_release_all: {
      // This must be an SGPR, but accept a VGPR.
      unsigned Bank = getRegBankID(MI.getOperand(1).getReg(), MRI,
                                   AMDGPU::SGPRRegBankID);
      OpdsMapping[1] = AMDGPU::getValueMapping(Bank, 32);
      break;
    }
    default:
      return getInvalidInstructionMapping();
    }
    break;
  }
  case AMDGPU::G_SELECT: {
    unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
                                    AMDGPU::SGPRRegBankID);
    unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI,
                                    AMDGPU::SGPRRegBankID);
    bool SGPRSrcs = Op2Bank == AMDGPU::SGPRRegBankID &&
                    Op3Bank == AMDGPU::SGPRRegBankID;

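    // The select can only stay on the SALU if both value operands are SGPRs
    // and the condition is a scalar boolean; otherwise the condition is
    // treated as a VCC value and the results must be VGPRs.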
    unsigned CondBankDefault = SGPRSrcs ?
      AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID;
    unsigned CondBank = getRegBankID(MI.getOperand(1).getReg(), MRI,
                                     CondBankDefault);
    if (CondBank == AMDGPU::SGPRRegBankID)
      CondBank = SGPRSrcs ? AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID;
    else if (CondBank == AMDGPU::VGPRRegBankID)
      CondBank = AMDGPU::VCCRegBankID;

    unsigned Bank = SGPRSrcs && CondBank == AMDGPU::SGPRRegBankID ?
      AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;

    assert(CondBank == AMDGPU::VCCRegBankID ||
           CondBank == AMDGPU::SGPRRegBankID);

    // TODO: Should report 32-bit for scalar condition type.
    if (Size == 64) {
      OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
      OpdsMapping[1] = AMDGPU::getValueMapping(CondBank, 1);
      OpdsMapping[2] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
      OpdsMapping[3] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
    } else {
      OpdsMapping[0] = AMDGPU::getValueMapping(Bank, Size);
      OpdsMapping[1] = AMDGPU::getValueMapping(CondBank, 1);
      OpdsMapping[2] = AMDGPU::getValueMapping(Bank, Size);
      OpdsMapping[3] = AMDGPU::getValueMapping(Bank, Size);
    }

    break;
  }

  case AMDGPU::G_LOAD:
  case AMDGPU::G_ZEXTLOAD:
  case AMDGPU::G_SEXTLOAD:
    return getInstrMappingForLoad(MI);

  case AMDGPU::G_ATOMICRMW_XCHG:
  case AMDGPU::G_ATOMICRMW_ADD:
  case AMDGPU::G_ATOMICRMW_SUB:
  case AMDGPU::G_ATOMICRMW_AND:
  case AMDGPU::G_ATOMICRMW_OR:
  case AMDGPU::G_ATOMICRMW_XOR:
  case AMDGPU::G_ATOMICRMW_MAX:
  case AMDGPU::G_ATOMICRMW_MIN:
  case AMDGPU::G_ATOMICRMW_UMAX:
  case AMDGPU::G_ATOMICRMW_UMIN:
  case AMDGPU::G_ATOMICRMW_FADD:
  case AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG:
  case AMDGPU::G_AMDGPU_ATOMIC_INC:
  case AMDGPU::G_AMDGPU_ATOMIC_DEC:
  case AMDGPU::G_AMDGPU_ATOMIC_FMIN:
  case AMDGPU::G_AMDGPU_ATOMIC_FMAX: {
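    // The result and data operands must be VGPRs; the pointer mapping is
    // derived from the pointer's address space and current bank.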
    OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
    OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg());
    OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
    break;
  }
  case AMDGPU::G_ATOMIC_CMPXCHG: {
    OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
    OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg());
    OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
    OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
    break;
  }
  case AMDGPU::G_BRCOND: {
    unsigned Bank = getRegBankID(MI.getOperand(0).getReg(), MRI,
                                 AMDGPU::SGPRRegBankID);
    assert(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1);
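    // Only a condition that is already scalar may stay in the SGPR bank;
    // anything else must be treated as a VCC boolean.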
    if (Bank != AMDGPU::SGPRRegBankID)
      Bank = AMDGPU::VCCRegBankID;

    OpdsMapping[0] = AMDGPU::getValueMapping(Bank, 1);
    break;
  }
  }

  return getInstructionMapping(/*ID*/1, /*Cost*/1,
                               getOperandsMapping(OpdsMapping),
                               MI.getNumOperands());
}