//===- AMDGPURegisterBankInfo.cpp -------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the RegisterBankInfo class for
/// AMDGPU.
///
/// \par
///
/// AMDGPU has unique register bank constraints that require special high level
/// strategies to deal with. There are two main true physical register banks:
/// VGPR (vector) and SGPR (scalar). Additionally the VCC register bank is a
/// sort of pseudo-register bank needed to represent SGPRs used in a vector
/// boolean context. There is also the AGPR bank, which is a special purpose
/// physical register bank present on some subtargets.
///
/// Copying from VGPR to SGPR is generally illegal, unless the value is known to
/// be uniform. It is generally not valid to legalize operands by inserting
/// copies as on other targets. Operations which require uniform, SGPR operands
/// generally require scalarization by repeatedly executing the instruction,
/// activating each set of lanes using a unique set of input values. This is
/// referred to as a waterfall loop.
///
/// \par Booleans
///
/// Booleans (s1 values) require special consideration. A vector compare result
/// is naturally a bitmask with one bit per lane, in a 32 or 64-bit
/// register. These are represented with the VCC bank. During selection, we need
/// to be able to unambiguously go back from a register class to a register
/// bank. To distinguish whether an SGPR should use the SGPR or VCC register
/// bank, we need to know the use context type: an SGPR s1 value always means a
/// VCC bank value, otherwise it will be the SGPR bank. A scalar compare sets
/// SCC, which is a 1-bit unaddressable register. This will need to be copied to
/// a 32-bit virtual register. Taken together, this means we need to adjust the
/// type of boolean operations to be regbank legal. All SALU booleans need to be
/// widened to 32-bits, and all VALU booleans need to be s1 values.
///
/// A noteworthy exception to the s1-means-vcc rule is for legalization artifact
/// casts. G_TRUNC s1 results, and G_SEXT/G_ZEXT/G_ANYEXT sources are never vcc
/// bank. A non-boolean source (such as a truncate from a 1-bit load from
/// memory) will require a copy to the VCC bank which will require clearing the
/// high bits and inserting a compare.
///
/// \par Constant bus restriction
///
/// VALU instructions have a limitation known as the constant bus
/// restriction. Most VALU instructions can use SGPR operands, but may read at
/// most 1 SGPR or constant literal value (this is raised to 2 in gfx10 for most
/// instructions). This is one unique SGPR, so the same SGPR may be used for
/// multiple operands. From a register bank perspective, any combination of
/// operands should be legal as an SGPR, but this is contextually dependent on
/// the SGPR operands all being the same register. It is therefore optimal to
/// choose the SGPR with the most uses to minimize the number of copies.
///
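/// For illustration (a sketch, not taken from compiler output):
///
///   v_add_f32 v0, s0, s0  ; legal, reads one unique SGPR
///   v_add_f32 v0, s0, s1  ; violates the restriction before gfx10
///
/// so a mapping that leaves two distinct SGPR source operands on such an
/// instruction requires copying at least one of them into a VGPR.
///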
/// We avoid trying to solve this problem in RegBankSelect. Any VALU G_*
/// operation should have its source operands all mapped to VGPRs (except for
/// VCC), inserting copies from any SGPR operands. This is the most trivial
/// legal mapping. Anything beyond the simplest 1:1 instruction selection would
/// be too complicated to solve here. Every optimization pattern or instruction
/// selected to multiple outputs would have to enforce this rule, and there
/// would be additional complexity in tracking this rule for every G_*
/// operation. By forcing all inputs to VGPRs, it also simplifies the task of
/// picking the optimal operand combination from a post-isel optimization pass.
///
//===----------------------------------------------------------------------===//

#include "AMDGPURegisterBankInfo.h"

#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUInstrInfo.h"
#include "GCNSubtarget.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/RegisterBank.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"

#define GET_TARGET_REGBANK_IMPL
#include "AMDGPUGenRegisterBank.inc"

// This file will be TableGen'ed at some point.
#include "AMDGPUGenRegisterBankInfo.def"

using namespace llvm;
using namespace MIPatternMatch;

namespace {

// Observer to apply a register bank to new registers created by
// LegalizerHelper.
class ApplyRegBankMapping final : public GISelChangeObserver {
private:
  const AMDGPURegisterBankInfo &RBI;
  MachineRegisterInfo &MRI;
  const RegisterBank *NewBank;
  SmallVector<MachineInstr *, 4> NewInsts;

public:
  ApplyRegBankMapping(const AMDGPURegisterBankInfo &RBI_,
                      MachineRegisterInfo &MRI_, const RegisterBank *RB)
      : RBI(RBI_), MRI(MRI_), NewBank(RB) {}

  ~ApplyRegBankMapping() {
    for (MachineInstr *MI : NewInsts)
      applyBank(*MI);
  }

  /// Set any registers that don't have a set register class or bank to the
  /// mapped bank.
  void applyBank(MachineInstr &MI) {
    const unsigned Opc = MI.getOpcode();
    if (Opc == AMDGPU::G_ANYEXT || Opc == AMDGPU::G_ZEXT ||
        Opc == AMDGPU::G_SEXT) {
      // LegalizerHelper wants to use the basic legalization artifacts when
      // widening etc. We don't handle selection with vcc in artifact sources,
      // so we need to use a select instead to handle these properly.
      Register DstReg = MI.getOperand(0).getReg();
      Register SrcReg = MI.getOperand(1).getReg();
      const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, MRI, *RBI.TRI);
      if (SrcBank == &AMDGPU::VCCRegBank) {
        const LLT S32 = LLT::scalar(32);
        assert(MRI.getType(SrcReg) == LLT::scalar(1));
        assert(MRI.getType(DstReg) == S32);
        assert(NewBank == &AMDGPU::VGPRRegBank);
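
        // For a hypothetical G_ZEXT of a vcc-bank boolean, the rewrite below
        // produces roughly (a sketch; register names invented):
        //   %one:vgpr(s32)  = G_CONSTANT i32 1
        //   %zero:vgpr(s32) = G_CONSTANT i32 0
        //   %dst:vgpr(s32)  = G_SELECT %src:vcc(s1), %one, %zero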
        // Replace the extension with a select, which really uses the boolean
        // source.
        MachineIRBuilder B(MI);
        auto True = B.buildConstant(S32, Opc == AMDGPU::G_SEXT ? -1 : 1);
        auto False = B.buildConstant(S32, 0);
        B.buildSelect(DstReg, SrcReg, True, False);
        MRI.setRegBank(True.getReg(0), *NewBank);
        MRI.setRegBank(False.getReg(0), *NewBank);
        MI.eraseFromParent();
      }

      assert(!MRI.getRegClassOrRegBank(DstReg));
      MRI.setRegBank(DstReg, *NewBank);
      return;
    }

#ifndef NDEBUG
    if (Opc == AMDGPU::G_TRUNC) {
      Register DstReg = MI.getOperand(0).getReg();
      const RegisterBank *DstBank = RBI.getRegBank(DstReg, MRI, *RBI.TRI);
      assert(DstBank != &AMDGPU::VCCRegBank);
    }
#endif

    for (MachineOperand &Op : MI.operands()) {
      if (!Op.isReg())
        continue;

      // We may see physical registers if building a real MI.
      Register Reg = Op.getReg();
      if (Reg.isPhysical() || MRI.getRegClassOrRegBank(Reg))
        continue;

      const RegisterBank *RB = NewBank;
      if (MRI.getType(Reg) == LLT::scalar(1)) {
        assert(NewBank == &AMDGPU::VGPRRegBank &&
               "s1 operands should only be used for vector bools");
        assert((MI.getOpcode() != AMDGPU::G_TRUNC &&
                MI.getOpcode() != AMDGPU::G_ANYEXT) &&
               "not expecting legalization artifacts here");
        RB = &AMDGPU::VCCRegBank;
      }

      MRI.setRegBank(Reg, *RB);
    }
  }

  void erasingInstr(MachineInstr &MI) override {}

  void createdInstr(MachineInstr &MI) override {
    // At this point, the instruction was just inserted and has no operands.
    NewInsts.push_back(&MI);
  }

  void changingInstr(MachineInstr &MI) override {}
  void changedInstr(MachineInstr &MI) override {
    // FIXME: In principle we should probably add the instruction to NewInsts,
    // but the way the LegalizerHelper uses the observer, we will always see the
    // registers we need to set the regbank on also referenced in a new
    // instruction.
  }
};

} // anonymous namespace

AMDGPURegisterBankInfo::AMDGPURegisterBankInfo(const GCNSubtarget &ST)
    : Subtarget(ST), TRI(Subtarget.getRegisterInfo()),
      TII(Subtarget.getInstrInfo()) {

  // HACK: Until this is fully tablegen'd.
  static llvm::once_flag InitializeRegisterBankFlag;

  static auto InitializeRegisterBankOnce = [this]() {
    assert(&getRegBank(AMDGPU::SGPRRegBankID) == &AMDGPU::SGPRRegBank &&
           &getRegBank(AMDGPU::VGPRRegBankID) == &AMDGPU::VGPRRegBank &&
           &getRegBank(AMDGPU::AGPRRegBankID) == &AMDGPU::AGPRRegBank);
    (void)this;
  };

  llvm::call_once(InitializeRegisterBankFlag, InitializeRegisterBankOnce);
}

static bool isVectorRegisterBank(const RegisterBank &Bank) {
  unsigned BankID = Bank.getID();
  return BankID == AMDGPU::VGPRRegBankID || BankID == AMDGPU::AGPRRegBankID;
}

bool AMDGPURegisterBankInfo::isDivergentRegBank(const RegisterBank *RB) const {
  return RB != &AMDGPU::SGPRRegBank;
}

unsigned AMDGPURegisterBankInfo::copyCost(const RegisterBank &Dst,
                                          const RegisterBank &Src,
                                          unsigned Size) const {
  // TODO: Should there be a UniformVGPRRegBank which can use readfirstlane?
  if (Dst.getID() == AMDGPU::SGPRRegBankID &&
      (isVectorRegisterBank(Src) || Src.getID() == AMDGPU::VCCRegBankID)) {
    return std::numeric_limits<unsigned>::max();
  }

  // Bool values are tricky, because the meaning is based on context. The SCC
  // and VCC banks are for the natural scalar and vector conditions produced by
  // a compare.
  //
  // Legalization doesn't know about the necessary context, so an s1 use may
  // have been a truncate from an arbitrary value, in which case a copy (lowered
  // as a compare with 0) needs to be inserted.
  if (Size == 1 &&
      (Dst.getID() == AMDGPU::SGPRRegBankID) &&
      (isVectorRegisterBank(Src) ||
       Src.getID() == AMDGPU::SGPRRegBankID ||
       Src.getID() == AMDGPU::VCCRegBankID))
    return std::numeric_limits<unsigned>::max();

  // There is no direct copy between AGPRs.
  if (Dst.getID() == AMDGPU::AGPRRegBankID &&
      Src.getID() == AMDGPU::AGPRRegBankID)
    return 4;

  return RegisterBankInfo::copyCost(Dst, Src, Size);
}

unsigned AMDGPURegisterBankInfo::getBreakDownCost(
  const ValueMapping &ValMapping,
  const RegisterBank *CurBank) const {
  // Check if this is a breakdown for G_LOAD to move the pointer from SGPR to
  // VGPR.
  // FIXME: Is there a better way to do this?
  if (ValMapping.NumBreakDowns >= 2 || ValMapping.BreakDown[0].Length >= 64)
    return 10; // This is expensive.

  assert(ValMapping.NumBreakDowns == 2 &&
         ValMapping.BreakDown[0].Length == 32 &&
         ValMapping.BreakDown[0].StartIdx == 0 &&
         ValMapping.BreakDown[1].Length == 32 &&
         ValMapping.BreakDown[1].StartIdx == 32 &&
         ValMapping.BreakDown[0].RegBank == ValMapping.BreakDown[1].RegBank);

  // 32-bit extract of a 64-bit value is just access of a subregister, so free.
  // TODO: Cost of 0 hits assert, though it's not clear it's what we really
  // want.

  // TODO: 32-bit insert to a 64-bit SGPR may incur a non-free copy due to SGPR
  // alignment restrictions, but this probably isn't important.
  return 1;
}

const RegisterBank &
AMDGPURegisterBankInfo::getRegBankFromRegClass(const TargetRegisterClass &RC,
                                               LLT Ty) const {
  if (&RC == &AMDGPU::SReg_1RegClass)
    return AMDGPU::VCCRegBank;

  // We promote real scalar booleans to SReg_32. Any SGPR using s1 is really a
  // VCC-like use.
  if (TRI->isSGPRClass(&RC)) {
    // FIXME: This probably came from a copy from a physical register, which
    // should be inferable from the copied-to type. We don't have many boolean
    // physical register constraints so just assume a normal SGPR for now.
    if (!Ty.isValid())
      return AMDGPU::SGPRRegBank;

    return Ty == LLT::scalar(1) ? AMDGPU::VCCRegBank : AMDGPU::SGPRRegBank;
  }

  return TRI->isAGPRClass(&RC) ? AMDGPU::AGPRRegBank : AMDGPU::VGPRRegBank;
}
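
// As an example of the rule above (a sketch): an SReg_32 virtual register
// with type s1 maps to VCCRegBank, SReg_32 with s32 maps to SGPRRegBank, and
// any VGPR class maps to VGPRRegBank regardless of type.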

template <unsigned NumOps>
RegisterBankInfo::InstructionMappings
AMDGPURegisterBankInfo::addMappingFromTable(
    const MachineInstr &MI, const MachineRegisterInfo &MRI,
    const std::array<unsigned, NumOps> RegSrcOpIdx,
    ArrayRef<OpRegBankEntry<NumOps>> Table) const {

  InstructionMappings AltMappings;

  SmallVector<const ValueMapping *, 10> Operands(MI.getNumOperands());

  unsigned Sizes[NumOps];
  for (unsigned I = 0; I < NumOps; ++I) {
    Register Reg = MI.getOperand(RegSrcOpIdx[I]).getReg();
    Sizes[I] = getSizeInBits(Reg, MRI, *TRI);
  }

  for (unsigned I = 0, E = MI.getNumExplicitDefs(); I != E; ++I) {
    unsigned SizeI = getSizeInBits(MI.getOperand(I).getReg(), MRI, *TRI);
    Operands[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SizeI);
  }

  // getInstrMapping's default mapping uses ID 1, so start at 2.
  unsigned MappingID = 2;
  for (const auto &Entry : Table) {
    for (unsigned I = 0; I < NumOps; ++I) {
      int OpIdx = RegSrcOpIdx[I];
      Operands[OpIdx] = AMDGPU::getValueMapping(Entry.RegBanks[I], Sizes[I]);
    }

    AltMappings.push_back(&getInstructionMapping(MappingID++, Entry.Cost,
                                                 getOperandsMapping(Operands),
                                                 Operands.size()));
  }

  return AltMappings;
}

RegisterBankInfo::InstructionMappings
AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsic(
    const MachineInstr &MI, const MachineRegisterInfo &MRI) const {
  switch (MI.getIntrinsicID()) {
  case Intrinsic::amdgcn_readlane: {
    static const OpRegBankEntry<3> Table[2] = {
      // Perfectly legal.
      { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },

      // Need a readfirstlane for the index.
      { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 }
    };

    const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 2, 3 } };
    return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, Table);
  }
  case Intrinsic::amdgcn_writelane: {
    static const OpRegBankEntry<4> Table[4] = {
      // Perfectly legal.
      { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },

      // Need readfirstlane of first op
      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 },

      // Need readfirstlane of second op
      { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 },

      // Need readfirstlane of both ops
      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 3 }
    };

    // dst, value, lane, vdst_in
    const std::array<unsigned, 4> RegSrcOpIdx = { { 0, 2, 3, 4 } };
    return addMappingFromTable<4>(MI, MRI, RegSrcOpIdx, Table);
  }
  default:
    return RegisterBankInfo::getInstrAlternativeMappings(MI);
  }
}

RegisterBankInfo::InstructionMappings
AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsicWSideEffects(
    const MachineInstr &MI, const MachineRegisterInfo &MRI) const {

  switch (MI.getIntrinsicID()) {
  case Intrinsic::amdgcn_s_buffer_load: {
    static const OpRegBankEntry<2> Table[4] = {
      // Perfectly legal.
      { { AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },

      // Only need 1 register in loop
      { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 300 },

      // Have to waterfall the resource.
      { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1000 },

      // Have to waterfall the resource, and the offset.
      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1500 }
    };

    // rsrc, offset
    const std::array<unsigned, 2> RegSrcOpIdx = { { 2, 3 } };
    return addMappingFromTable<2>(MI, MRI, RegSrcOpIdx, Table);
  }
  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap: {
    // VGPR = M0, VGPR
    static const OpRegBankEntry<3> Table[2] = {
      // Perfectly legal.
      { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },

      // Need a readfirstlane for m0
      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 }
    };

    const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 2, 3 } };
    return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, Table);
  }
  case Intrinsic::amdgcn_s_sendmsg:
  case Intrinsic::amdgcn_s_sendmsghalt: {
    // FIXME: Should have no register for immediate
    static const OpRegBankEntry<1> Table[2] = {
      // Perfectly legal.
      { { AMDGPU::SGPRRegBankID }, 1 },

      // Need readlane
      { { AMDGPU::VGPRRegBankID }, 3 }
    };

    const std::array<unsigned, 1> RegSrcOpIdx = { { 2 } };
    return addMappingFromTable<1>(MI, MRI, RegSrcOpIdx, Table);
  }
  default:
    return RegisterBankInfo::getInstrAlternativeMappings(MI);
  }
}

// FIXME: Returns uniform if there's no source value information. This is
// probably wrong.
static bool isScalarLoadLegal(const MachineInstr &MI) {
  if (!MI.hasOneMemOperand())
    return false;

  const MachineMemOperand *MMO = *MI.memoperands_begin();
  const unsigned AS = MMO->getAddrSpace();
  const bool IsConst = AS == AMDGPUAS::CONSTANT_ADDRESS ||
                       AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT;
  // Require 4-byte alignment.
  return MMO->getAlign() >= Align(4) &&
         // Can't do a scalar atomic load.
         !MMO->isAtomic() &&
         // Don't use scalar loads for volatile accesses to non-constant address
         // spaces.
         (IsConst || !MMO->isVolatile()) &&
         // Memory must be known constant, or not written before this load.
         (IsConst || MMO->isInvariant() || (MMO->getFlags() & MONoClobber)) &&
         AMDGPUInstrInfo::isUniformMMO(MMO);
}

RegisterBankInfo::InstructionMappings
AMDGPURegisterBankInfo::getInstrAlternativeMappings(
    const MachineInstr &MI) const {

  const MachineFunction &MF = *MI.getParent()->getParent();
  const MachineRegisterInfo &MRI = MF.getRegInfo();

  InstructionMappings AltMappings;
  switch (MI.getOpcode()) {
  case TargetOpcode::G_CONSTANT:
  case TargetOpcode::G_IMPLICIT_DEF: {
    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
    if (Size == 1) {
      static const OpRegBankEntry<1> Table[3] = {
        { { AMDGPU::VGPRRegBankID }, 1 },
        { { AMDGPU::SGPRRegBankID }, 1 },
        { { AMDGPU::VCCRegBankID }, 1 }
      };

      return addMappingFromTable<1>(MI, MRI, {{ 0 }}, Table);
    }

    [[fallthrough]];
  }
  case TargetOpcode::G_FCONSTANT:
  case TargetOpcode::G_FRAME_INDEX:
  case TargetOpcode::G_GLOBAL_VALUE: {
    static const OpRegBankEntry<1> Table[2] = {
      { { AMDGPU::VGPRRegBankID }, 1 },
      { { AMDGPU::SGPRRegBankID }, 1 }
    };

    return addMappingFromTable<1>(MI, MRI, {{ 0 }}, Table);
  }
  case TargetOpcode::G_AND:
  case TargetOpcode::G_OR:
  case TargetOpcode::G_XOR: {
    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);

    if (Size == 1) {
      // s_{and|or|xor}_b32 set scc when the result of the 32-bit op is not 0.
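      // For example (a sketch): a uniform s1 G_AND can be mapped as a 32-bit
      // op on zero/one-valued SGPRs and selected to s_and_b32, rather than
      // operating on a lane mask.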
      const InstructionMapping &SCCMapping = getInstructionMapping(
        1, 1, getOperandsMapping(
          {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32),
           AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32),
           AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32)}),
        3); // Num Operands
      AltMappings.push_back(&SCCMapping);

      const InstructionMapping &VCCMapping0 = getInstructionMapping(
        2, 1, getOperandsMapping(
          {AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size),
           AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size),
           AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size)}),
        3); // Num Operands
      AltMappings.push_back(&VCCMapping0);
      return AltMappings;
    }

    if (Size != 64)
      break;

    const InstructionMapping &SSMapping = getInstructionMapping(
      1, 1, getOperandsMapping(
        {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
      3); // Num Operands
    AltMappings.push_back(&SSMapping);

    const InstructionMapping &VVMapping = getInstructionMapping(
      2, 2, getOperandsMapping(
        {AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
         AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
         AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}),
      3); // Num Operands
    AltMappings.push_back(&VVMapping);
    break;
  }
  case TargetOpcode::G_LOAD:
  case TargetOpcode::G_ZEXTLOAD:
  case TargetOpcode::G_SEXTLOAD: {
    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
    LLT PtrTy = MRI.getType(MI.getOperand(1).getReg());
    unsigned PtrSize = PtrTy.getSizeInBits();
    unsigned AS = PtrTy.getAddressSpace();

    if ((AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::REGION_ADDRESS &&
         AS != AMDGPUAS::PRIVATE_ADDRESS) &&
        isScalarLoadLegal(MI)) {
      const InstructionMapping &SSMapping = getInstructionMapping(
          1, 1, getOperandsMapping(
                    {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
                     AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize)}),
          2); // Num Operands
      AltMappings.push_back(&SSMapping);
    }

    const InstructionMapping &VVMapping = getInstructionMapping(
        2, 1,
        getOperandsMapping(
            {AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
             AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize)}),
        2); // Num Operands
    AltMappings.push_back(&VVMapping);

    // It may be possible to have a vgpr = load sgpr mapping here, because
    // the mubuf instructions support this kind of load, but probably only for
    // gfx7 and older. However, the addressing mode matching in the instruction
    // selector should be able to do a better job of detecting and selecting
    // these kinds of loads from the vgpr = load vgpr mapping.

    return AltMappings;
  }
  case TargetOpcode::G_SELECT: {
    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
    const InstructionMapping &SSMapping = getInstructionMapping(1, 1,
      getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1),
                          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
      4); // Num Operands
    AltMappings.push_back(&SSMapping);

    const InstructionMapping &VVMapping = getInstructionMapping(2, 1,
      getOperandsMapping({AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
                          AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}),
      4); // Num Operands
    AltMappings.push_back(&VVMapping);

    return AltMappings;
  }
  case TargetOpcode::G_UADDE:
  case TargetOpcode::G_USUBE:
  case TargetOpcode::G_SADDE:
  case TargetOpcode::G_SSUBE: {
    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
    const InstructionMapping &SSMapping = getInstructionMapping(1, 1,
      getOperandsMapping(
        {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1),
         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1)}),
      5); // Num Operands
    AltMappings.push_back(&SSMapping);

    const InstructionMapping &VVMapping = getInstructionMapping(2, 1,
      getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
                          AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1)}),
      5); // Num Operands
    AltMappings.push_back(&VVMapping);
    return AltMappings;
  }
  case AMDGPU::G_BRCOND: {
    assert(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1);

    // TODO: Change type to 32 for scalar
    const InstructionMapping &SMapping = getInstructionMapping(
      1, 1, getOperandsMapping(
        {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1), nullptr}),
      2); // Num Operands
    AltMappings.push_back(&SMapping);

    const InstructionMapping &VMapping = getInstructionMapping(
      1, 1, getOperandsMapping(
        {AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1), nullptr}),
      2); // Num Operands
    AltMappings.push_back(&VMapping);
    return AltMappings;
  }
  case AMDGPU::G_INTRINSIC:
    return getInstrAlternativeMappingsIntrinsic(MI, MRI);
  case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS:
    return getInstrAlternativeMappingsIntrinsicWSideEffects(MI, MRI);
  default:
    break;
  }
  return RegisterBankInfo::getInstrAlternativeMappings(MI);
}

void AMDGPURegisterBankInfo::split64BitValueForMapping(
  MachineIRBuilder &B,
  SmallVector<Register, 2> &Regs,
  LLT HalfTy,
  Register Reg) const {
  assert(HalfTy.getSizeInBits() == 32);
  MachineRegisterInfo *MRI = B.getMRI();
  Register LoLHS = MRI->createGenericVirtualRegister(HalfTy);
  Register HiLHS = MRI->createGenericVirtualRegister(HalfTy);
  const RegisterBank *Bank = getRegBank(Reg, *MRI, *TRI);
  MRI->setRegBank(LoLHS, *Bank);
  MRI->setRegBank(HiLHS, *Bank);

  Regs.push_back(LoLHS);
  Regs.push_back(HiLHS);

  B.buildInstr(AMDGPU::G_UNMERGE_VALUES)
    .addDef(LoLHS)
    .addDef(HiLHS)
    .addUse(Reg);
}

/// Replace the current type each register in \p Regs has with \p NewTy
static void setRegsToType(MachineRegisterInfo &MRI, ArrayRef<Register> Regs,
                          LLT NewTy) {
  for (Register Reg : Regs) {
    assert(MRI.getType(Reg).getSizeInBits() == NewTy.getSizeInBits());
    MRI.setType(Reg, NewTy);
  }
}

static LLT getHalfSizedType(LLT Ty) {
  if (Ty.isVector()) {
    assert(Ty.getElementCount().isKnownMultipleOf(2));
    return LLT::scalarOrVector(Ty.getElementCount().divideCoefficientBy(2),
                               Ty.getElementType());
  }

  assert(Ty.getScalarSizeInBits() % 2 == 0);
  return LLT::scalar(Ty.getScalarSizeInBits() / 2);
}

// Build one or more V_READFIRSTLANE_B32 instructions to move the given vector
// source value into a scalar register.
Register AMDGPURegisterBankInfo::buildReadFirstLane(MachineIRBuilder &B,
                                                    MachineRegisterInfo &MRI,
                                                    Register Src) const {
  LLT Ty = MRI.getType(Src);
  const RegisterBank *Bank = getRegBank(Src, MRI, *TRI);

  if (Bank == &AMDGPU::SGPRRegBank)
    return Src;

  unsigned Bits = Ty.getSizeInBits();
  assert(Bits % 32 == 0);

  if (Bank != &AMDGPU::VGPRRegBank) {
    // We need to copy from AGPR to VGPR.
    Src = B.buildCopy(Ty, Src).getReg(0);
    MRI.setRegBank(Src, AMDGPU::VGPRRegBank);
  }

  LLT S32 = LLT::scalar(32);
  unsigned NumParts = Bits / 32;
  SmallVector<Register, 8> SrcParts;
  SmallVector<Register, 8> DstParts;

  if (Bits == 32) {
    SrcParts.push_back(Src);
  } else {
    auto Unmerge = B.buildUnmerge(S32, Src);
    for (unsigned i = 0; i < NumParts; ++i)
      SrcParts.push_back(Unmerge.getReg(i));
  }

  for (unsigned i = 0; i < NumParts; ++i) {
    Register SrcPart = SrcParts[i];
    Register DstPart = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
    MRI.setType(DstPart, NumParts == 1 ? Ty : S32);

    const TargetRegisterClass *Constrained =
        constrainGenericRegister(SrcPart, AMDGPU::VGPR_32RegClass, MRI);
    (void)Constrained;
    assert(Constrained && "Failed to constrain readfirstlane src reg");

    B.buildInstr(AMDGPU::V_READFIRSTLANE_B32, {DstPart}, {SrcPart});

    DstParts.push_back(DstPart);
  }

  if (Bits == 32)
    return DstParts[0];

  Register Dst = B.buildMergeLikeInstr(Ty, DstParts).getReg(0);
  MRI.setRegBank(Dst, AMDGPU::SGPRRegBank);
  return Dst;
}
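
// For a 64-bit VGPR source, the loop above emits roughly (a sketch; register
// names invented):
//   %lo:vgpr(s32), %hi:vgpr(s32) = G_UNMERGE_VALUES %src:vgpr(s64)
//   %slo:sreg_32(s32) = V_READFIRSTLANE_B32 %lo
//   %shi:sreg_32(s32) = V_READFIRSTLANE_B32 %hi
//   %dst:sgpr(s64) = G_MERGE_VALUES %slo, %shi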

/// Legalize instruction \p MI where operands in \p OpIndices must be SGPRs. If
/// any of the required SGPR operands are VGPRs, perform a waterfall loop to
/// execute the instruction for each unique combination of values in all lanes
/// in the wave. The block will be split such that the rest of the instructions
/// are moved to a new block.
///
/// Essentially performs this loop:
///
/// Save Execution Mask
/// For (Lane : Wavefront) {
///   Enable Lane, Disable all other lanes
///   SGPR = read SGPR value for current lane from VGPR
///   VGPRResult[Lane] = use_op SGPR
/// }
/// Restore Execution Mask
///
/// There is additional complexity: compares are used to identify all lanes
/// sharing the current value, so each iteration of the loop handles every
/// matching lane at once rather than a single lane.
bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
  MachineIRBuilder &B,
  iterator_range<MachineBasicBlock::iterator> Range,
  SmallSet<Register, 4> &SGPROperandRegs,
  MachineRegisterInfo &MRI) const {

  // Track use registers which have already been expanded with a readfirstlane
  // sequence. This may have multiple uses if moving a sequence.
  DenseMap<Register, Register> WaterfalledRegMap;

  MachineBasicBlock &MBB = B.getMBB();
  MachineFunction *MF = &B.getMF();

  const TargetRegisterClass *WaveRC = TRI->getWaveMaskRegClass();
  const unsigned MovExecOpc =
      Subtarget.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
  const unsigned MovExecTermOpc =
      Subtarget.isWave32() ? AMDGPU::S_MOV_B32_term : AMDGPU::S_MOV_B64_term;

  const unsigned XorTermOpc = Subtarget.isWave32() ?
    AMDGPU::S_XOR_B32_term : AMDGPU::S_XOR_B64_term;
  const unsigned AndSaveExecOpc = Subtarget.isWave32() ?
    AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64;
  const unsigned ExecReg = Subtarget.isWave32() ?
    AMDGPU::EXEC_LO : AMDGPU::EXEC;

#ifndef NDEBUG
  const int OrigRangeSize = std::distance(Range.begin(), Range.end());
#endif

  Register SaveExecReg = MRI.createVirtualRegister(WaveRC);
  Register InitSaveExecReg = MRI.createVirtualRegister(WaveRC);

  // Don't bother using generic instructions/registers for the exec mask.
  B.buildInstr(TargetOpcode::IMPLICIT_DEF)
    .addDef(InitSaveExecReg);

  Register PhiExec = MRI.createVirtualRegister(WaveRC);
  Register NewExec = MRI.createVirtualRegister(WaveRC);

  // To insert the loop we need to split the block. Move everything before this
  // point to a new block, and insert a new empty block before this instruction.
  MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
  MachineBasicBlock *BodyBB = MF->CreateMachineBasicBlock();
  MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
  MachineBasicBlock *RestoreExecBB = MF->CreateMachineBasicBlock();
  MachineFunction::iterator MBBI(MBB);
  ++MBBI;
  MF->insert(MBBI, LoopBB);
  MF->insert(MBBI, BodyBB);
  MF->insert(MBBI, RestoreExecBB);
  MF->insert(MBBI, RemainderBB);

  LoopBB->addSuccessor(BodyBB);
  BodyBB->addSuccessor(RestoreExecBB);
  BodyBB->addSuccessor(LoopBB);

  // Move the rest of the block into a new block.
  RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
  RemainderBB->splice(RemainderBB->begin(), &MBB, Range.end(), MBB.end());

  MBB.addSuccessor(LoopBB);
  RestoreExecBB->addSuccessor(RemainderBB);

  B.setInsertPt(*LoopBB, LoopBB->end());

  B.buildInstr(TargetOpcode::PHI)
    .addDef(PhiExec)
    .addReg(InitSaveExecReg)
    .addMBB(&MBB)
    .addReg(NewExec)
    .addMBB(BodyBB);

  const DebugLoc &DL = B.getDL();

  MachineInstr &FirstInst = *Range.begin();

  // Move the instruction into the loop body. Note we moved everything after
  // Range.end() already into a new block, so Range.end() is no longer valid.
  BodyBB->splice(BodyBB->end(), &MBB, Range.begin(), MBB.end());
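
  // At this point the CFG is (a sketch):
  //
  //   MBB -> LoopBB <--+
  //             |      |
  //             v      |
  //           BodyBB --+
  //             |
  //             v
  //     RestoreExecBB -> RemainderBB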

  // Figure out the iterator range after splicing the instructions.
  MachineBasicBlock::iterator NewBegin = FirstInst.getIterator();
  auto NewEnd = BodyBB->end();

  B.setMBB(*LoopBB);

  LLT S1 = LLT::scalar(1);
  Register CondReg;

  assert(std::distance(NewBegin, NewEnd) == OrigRangeSize);

  for (MachineInstr &MI : make_range(NewBegin, NewEnd)) {
    for (MachineOperand &Op : MI.all_uses()) {
      Register OldReg = Op.getReg();
      if (!SGPROperandRegs.count(OldReg))
        continue;

      // See if we already processed this register in another instruction in
      // the sequence.
      auto OldVal = WaterfalledRegMap.find(OldReg);
      if (OldVal != WaterfalledRegMap.end()) {
        Op.setReg(OldVal->second);
        continue;
      }

      Register OpReg = Op.getReg();
      LLT OpTy = MRI.getType(OpReg);

      const RegisterBank *OpBank = getRegBank(OpReg, MRI, *TRI);
      if (OpBank != &AMDGPU::VGPRRegBank) {
        // Insert copy from AGPR to VGPR before the loop.
        B.setMBB(MBB);
        OpReg = B.buildCopy(OpTy, OpReg).getReg(0);
        MRI.setRegBank(OpReg, AMDGPU::VGPRRegBank);
        B.setMBB(*LoopBB);
      }

      Register CurrentLaneReg = buildReadFirstLane(B, MRI, OpReg);

      // Build the comparison(s).
      unsigned OpSize = OpTy.getSizeInBits();
      bool Is64 = OpSize % 64 == 0;
      unsigned PartSize = Is64 ? 64 : 32;
      LLT PartTy = LLT::scalar(PartSize);
      unsigned NumParts = OpSize / PartSize;
      SmallVector<Register, 8> OpParts;
      SmallVector<Register, 8> CurrentLaneParts;

      if (NumParts == 1) {
        OpParts.push_back(OpReg);
        CurrentLaneParts.push_back(CurrentLaneReg);
      } else {
        auto UnmergeOp = B.buildUnmerge(PartTy, OpReg);
        auto UnmergeCurrentLane = B.buildUnmerge(PartTy, CurrentLaneReg);
        for (unsigned i = 0; i < NumParts; ++i) {
          OpParts.push_back(UnmergeOp.getReg(i));
          CurrentLaneParts.push_back(UnmergeCurrentLane.getReg(i));
          MRI.setRegBank(OpParts[i], AMDGPU::VGPRRegBank);
          MRI.setRegBank(CurrentLaneParts[i], AMDGPU::SGPRRegBank);
        }
      }

      for (unsigned i = 0; i < NumParts; ++i) {
        auto CmpReg = B.buildICmp(CmpInst::ICMP_EQ, S1, CurrentLaneParts[i],
                                  OpParts[i]).getReg(0);
        MRI.setRegBank(CmpReg, AMDGPU::VCCRegBank);

        if (!CondReg) {
          CondReg = CmpReg;
        } else {
          CondReg = B.buildAnd(S1, CondReg, CmpReg).getReg(0);
          MRI.setRegBank(CondReg, AMDGPU::VCCRegBank);
        }
      }

      Op.setReg(CurrentLaneReg);

      // Make sure we don't re-process this register again.
      WaterfalledRegMap.insert(std::pair(OldReg, Op.getReg()));
    }
  }

  // The ballot becomes a no-op during instruction selection.
  CondReg = B.buildIntrinsic(Intrinsic::amdgcn_ballot,
                             {LLT::scalar(Subtarget.isWave32() ? 32 : 64)},
                             false)
                .addReg(CondReg)
                .getReg(0);
  MRI.setRegClass(CondReg, WaveRC);

  // Update EXEC, save the original EXEC value to VCC.
  B.buildInstr(AndSaveExecOpc)
    .addDef(NewExec)
    .addReg(CondReg, RegState::Kill);

  MRI.setSimpleHint(NewExec, CondReg);

  B.setInsertPt(*BodyBB, BodyBB->end());

  // Update EXEC, switch all done bits to 0 and all todo bits to 1.
  B.buildInstr(XorTermOpc)
    .addDef(ExecReg)
    .addReg(ExecReg)
    .addReg(NewExec);

  // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
  // s_cbranch_scc0?

  // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
  B.buildInstr(AMDGPU::SI_WATERFALL_LOOP).addMBB(LoopBB);

  // Save the EXEC mask before the loop.
  BuildMI(MBB, MBB.end(), DL, TII->get(MovExecOpc), SaveExecReg)
    .addReg(ExecReg);

  // Restore the EXEC mask after the loop.
  B.setMBB(*RestoreExecBB);
  B.buildInstr(MovExecTermOpc)
    .addDef(ExecReg)
    .addReg(SaveExecReg);

  // Set the insert point after the original instruction, so any new
  // instructions will be in the remainder.
  B.setInsertPt(*RemainderBB, RemainderBB->begin());

  return true;
}

// Collect any unique registers used by \p MI at \p OpIndices that need to be
// handled in a waterfall loop. Returns these registers in \p
// SGPROperandRegs. Returns true if there are any operands to handle and a
// waterfall loop is necessary.
bool AMDGPURegisterBankInfo::collectWaterfallOperands(
  SmallSet<Register, 4> &SGPROperandRegs, MachineInstr &MI,
  MachineRegisterInfo &MRI, ArrayRef<unsigned> OpIndices) const {
  for (unsigned Op : OpIndices) {
    assert(MI.getOperand(Op).isUse());
    Register Reg = MI.getOperand(Op).getReg();
    const RegisterBank *OpBank = getRegBank(Reg, MRI, *TRI);
    if (OpBank->getID() != AMDGPU::SGPRRegBankID)
      SGPROperandRegs.insert(Reg);
  }

  // No operands need to be replaced, so no need to loop.
  return !SGPROperandRegs.empty();
}

bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
  MachineIRBuilder &B, MachineInstr &MI, MachineRegisterInfo &MRI,
  ArrayRef<unsigned> OpIndices) const {
  // Use a set to avoid extra readfirstlanes in the case where multiple operands
  // are the same register.
  SmallSet<Register, 4> SGPROperandRegs;

  if (!collectWaterfallOperands(SGPROperandRegs, MI, MRI, OpIndices))
    return false;

  MachineBasicBlock::iterator I = MI.getIterator();
  return executeInWaterfallLoop(B, make_range(I, std::next(I)),
                                SGPROperandRegs, MRI);
}

bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  ArrayRef<unsigned> OpIndices) const {
  MachineIRBuilder B(MI);
  return executeInWaterfallLoop(B, MI, MRI, OpIndices);
}

// Legalize an operand that must be an SGPR by inserting a readfirstlane.
void AMDGPURegisterBankInfo::constrainOpWithReadfirstlane(
    MachineInstr &MI, MachineRegisterInfo &MRI, unsigned OpIdx) const {
  Register Reg = MI.getOperand(OpIdx).getReg();
  const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
  if (Bank == &AMDGPU::SGPRRegBank)
    return;

  MachineIRBuilder B(MI);

  Reg = buildReadFirstLane(B, MRI, Reg);
  MI.getOperand(OpIdx).setReg(Reg);
}

/// Split \p Ty into 2 pieces. The first will have \p FirstSize bits, and the
/// rest will be in the remainder.
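/// For example (a sketch): splitting s96 at FirstSize = 64 yields (s64, s32);
/// splitting <3 x s32> yields (<2 x s32>, s32).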
static std::pair<LLT, LLT> splitUnequalType(LLT Ty, unsigned FirstSize) {
  unsigned TotalSize = Ty.getSizeInBits();
  if (!Ty.isVector())
    return {LLT::scalar(FirstSize), LLT::scalar(TotalSize - FirstSize)};

  LLT EltTy = Ty.getElementType();
  unsigned EltSize = EltTy.getSizeInBits();
  assert(FirstSize % EltSize == 0);

  unsigned FirstPartNumElts = FirstSize / EltSize;
  unsigned RemainderElts = (TotalSize - FirstSize) / EltSize;

  return {LLT::scalarOrVector(ElementCount::getFixed(FirstPartNumElts), EltTy),
          LLT::scalarOrVector(ElementCount::getFixed(RemainderElts), EltTy)};
}

static LLT widen96To128(LLT Ty) {
  if (!Ty.isVector())
    return LLT::scalar(128);

  LLT EltTy = Ty.getElementType();
  assert(128 % EltTy.getSizeInBits() == 0);
  return LLT::fixed_vector(128 / EltTy.getSizeInBits(), EltTy);
}

bool AMDGPURegisterBankInfo::applyMappingLoad(MachineInstr &MI,
                        const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
                        MachineRegisterInfo &MRI) const {
  Register DstReg = MI.getOperand(0).getReg();
  const LLT LoadTy = MRI.getType(DstReg);
  unsigned LoadSize = LoadTy.getSizeInBits();
  const unsigned MaxNonSmrdLoadSize = 128;

  const RegisterBank *DstBank =
    OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
  if (DstBank == &AMDGPU::SGPRRegBank) {
    // There are some special cases that we need to look at for 32-bit and
    // 96-bit SGPR loads; otherwise we have nothing to do.
    if (LoadSize != 32 && LoadSize != 96)
      return false;

    MachineMemOperand *MMO = *MI.memoperands_begin();
    const unsigned MemSize = 8 * MMO->getSize();
    // Scalar loads of size 8 or 16 bits with proper alignment may be widened
    // to 32 bits. Check to see if we need to widen the memory access; 8 and
    // 16-bit scalar loads should have a load size of 32 but a memory access
    // size of less than 32.
    if (LoadSize == 32 &&
        (MemSize == 32 || LoadTy.isVector() || !isScalarLoadLegal(MI)))
      return false;

    Register PtrReg = MI.getOperand(1).getReg();

    ApplyRegBankMapping O(*this, MRI, &AMDGPU::SGPRRegBank);
    MachineIRBuilder B(MI, O);

    if (LoadSize == 32) {
      // This is an extending load from a sub-dword size. Widen the memory
      // access size to 4 bytes and clear the extra high bits appropriately.
      const LLT S32 = LLT::scalar(32);
      if (MI.getOpcode() == AMDGPU::G_SEXTLOAD) {
        // Must extend the sign bit into higher bits for a G_SEXTLOAD.
        auto WideLoad = B.buildLoadFromOffset(S32, PtrReg, *MMO, 0);
        B.buildSExtInReg(MI.getOperand(0), WideLoad, MemSize);
      } else if (MI.getOpcode() == AMDGPU::G_ZEXTLOAD) {
        // Must extend zero into higher bits with an AND for a G_ZEXTLOAD.
        auto WideLoad = B.buildLoadFromOffset(S32, PtrReg, *MMO, 0);
        B.buildZExtInReg(MI.getOperand(0), WideLoad, MemSize);
      } else
        // We do not need to touch the higher bits for regular loads.
        B.buildLoadFromOffset(MI.getOperand(0), PtrReg, *MMO, 0);
    } else {
      // 96-bit loads are only available for vector loads. We need to split
      // this into a 64-bit part and a 32-bit part (unless we can widen it to
      // a 128-bit load).
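      // For instance (a sketch): a uniform <3 x s32> load with only 4-byte
      // alignment becomes a 64-bit plus a 32-bit scalar load, while with
      // 16-byte alignment it is widened to a 128-bit load and the extra
      // element dropped.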
      if (MMO->getAlign() < Align(16)) {
        MachineFunction *MF = MI.getParent()->getParent();
        ApplyRegBankMapping ApplyBank(*this, MRI, DstBank);
        MachineIRBuilder B(MI, ApplyBank);
        LegalizerHelper Helper(*MF, ApplyBank, B);
        LLT Part64, Part32;
        std::tie(Part64, Part32) = splitUnequalType(LoadTy, 64);
        if (Helper.reduceLoadStoreWidth(cast<GAnyLoad>(MI), 0, Part64) !=
            LegalizerHelper::Legalized)
          return false;
        return true;
      } else {
        LLT WiderTy = widen96To128(LoadTy);
        auto WideLoad = B.buildLoadFromOffset(WiderTy, PtrReg, *MMO, 0);
        if (WiderTy.isScalar())
          B.buildTrunc(MI.getOperand(0), WideLoad);
        else {
          B.buildDeleteTrailingVectorElements(MI.getOperand(0).getReg(),
                                              WideLoad);
        }
      }
    }

    MI.eraseFromParent();
    return true;
  }

  // 128-bit loads are supported for all instruction types.
  if (LoadSize <= MaxNonSmrdLoadSize)
    return false;

  SmallVector<Register, 16> DefRegs(OpdMapper.getVRegs(0));
  SmallVector<Register, 1> SrcRegs(OpdMapper.getVRegs(1));

  if (SrcRegs.empty())
    SrcRegs.push_back(MI.getOperand(1).getReg());

  assert(LoadSize % MaxNonSmrdLoadSize == 0);

  // RegBankSelect only emits scalar types, so we need to reset the pointer
  // operand to a pointer type.
  Register BasePtrReg = SrcRegs[0];
  LLT PtrTy = MRI.getType(MI.getOperand(1).getReg());
  MRI.setType(BasePtrReg, PtrTy);

  unsigned NumSplitParts = LoadTy.getSizeInBits() / MaxNonSmrdLoadSize;
  const LLT LoadSplitTy = LoadTy.divide(NumSplitParts);
  ApplyRegBankMapping Observer(*this, MRI, &AMDGPU::VGPRRegBank);
  MachineIRBuilder B(MI, Observer);
  LegalizerHelper Helper(B.getMF(), Observer, B);

  if (LoadTy.isVector()) {
    if (Helper.fewerElementsVector(MI, 0, LoadSplitTy) !=
        LegalizerHelper::Legalized)
      return false;
  } else {
    if (Helper.narrowScalar(MI, 0, LoadSplitTy) != LegalizerHelper::Legalized)
      return false;
  }

  MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
  return true;
}

bool AMDGPURegisterBankInfo::applyMappingDynStackAlloc(
  MachineInstr &MI,
  const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
  MachineRegisterInfo &MRI) const {
  const MachineFunction &MF = *MI.getMF();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const auto &TFI = *ST.getFrameLowering();

  // Guard in case the stack growth direction ever changes with scratch
  // instructions.
  if (TFI.getStackGrowthDirection() == TargetFrameLowering::StackGrowsDown)
    return false;

  Register Dst = MI.getOperand(0).getReg();
  Register AllocSize = MI.getOperand(1).getReg();
  Align Alignment = assumeAligned(MI.getOperand(2).getImm());

  const RegisterBank *SizeBank = getRegBank(AllocSize, MRI, *TRI);

  // TODO: Need to emit a wave reduction to get the maximum size.
  if (SizeBank != &AMDGPU::SGPRRegBank)
    return false;

  LLT PtrTy = MRI.getType(Dst);
  LLT IntPtrTy = LLT::scalar(PtrTy.getSizeInBits());

  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  Register SPReg = Info->getStackPtrOffsetReg();
  ApplyRegBankMapping ApplyBank(*this, MRI, &AMDGPU::SGPRRegBank);
  MachineIRBuilder B(MI, ApplyBank);

  auto WaveSize = B.buildConstant(LLT::scalar(32), ST.getWavefrontSizeLog2());
  auto ScaledSize = B.buildShl(IntPtrTy, AllocSize, WaveSize);

  auto SPCopy = B.buildCopy(PtrTy, SPReg);
  if (Alignment > TFI.getStackAlign()) {
    auto PtrAdd = B.buildPtrAdd(PtrTy, SPCopy, ScaledSize);
    B.buildMaskLowPtrBits(Dst, PtrAdd,
                          Log2(Alignment) + ST.getWavefrontSizeLog2());
  } else {
    B.buildPtrAdd(Dst, SPCopy, ScaledSize);
  }

  MI.eraseFromParent();
  return true;
}

bool AMDGPURegisterBankInfo::applyMappingImage(
    MachineInstr &MI, const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
    MachineRegisterInfo &MRI, int RsrcIdx) const {
  const int NumDefs = MI.getNumExplicitDefs();

  // The reported argument index is relative to the IR intrinsic call arguments,
  // so we need to shift by the number of defs and the intrinsic ID.
  RsrcIdx += NumDefs + 1;

  // Insert copies to VGPR arguments.
  applyDefaultMapping(OpdMapper);

  // Fixup any SGPR arguments.
  SmallVector<unsigned, 4> SGPRIndexes;
  for (int I = NumDefs, NumOps = MI.getNumOperands(); I != NumOps; ++I) {
    if (!MI.getOperand(I).isReg())
      continue;

    // If this intrinsic has a sampler, it immediately follows rsrc.
    if (I == RsrcIdx || I == RsrcIdx + 1)
      SGPRIndexes.push_back(I);
  }

  executeInWaterfallLoop(MI, MRI, SGPRIndexes);
  return true;
}

// Analyze a combined offset from an llvm.amdgcn.s.buffer.load intrinsic and
// store the three offsets (voffset, soffset, and instoffset).
unsigned AMDGPURegisterBankInfo::setBufferOffsets(
    MachineIRBuilder &B, Register CombinedOffset, Register &VOffsetReg,
    Register &SOffsetReg, int64_t &InstOffsetVal, Align Alignment) const {
  const LLT S32 = LLT::scalar(32);
  MachineRegisterInfo *MRI = B.getMRI();

  if (std::optional<int64_t> Imm =
          getIConstantVRegSExtVal(CombinedOffset, *MRI)) {
    uint32_t SOffset, ImmOffset;
    if (TII->splitMUBUFOffset(*Imm, SOffset, ImmOffset, Alignment)) {
      VOffsetReg = B.buildConstant(S32, 0).getReg(0);
      SOffsetReg = B.buildConstant(S32, SOffset).getReg(0);
      InstOffsetVal = ImmOffset;

      B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank);
      B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank);
      return SOffset + ImmOffset;
    }
  }

  Register Base;
  unsigned Offset;

  std::tie(Base, Offset) =
      AMDGPU::getBaseWithConstantOffset(*MRI, CombinedOffset);

  uint32_t SOffset, ImmOffset;
  if ((int)Offset > 0 &&
      TII->splitMUBUFOffset(Offset, SOffset, ImmOffset, Alignment)) {
    if (getRegBank(Base, *MRI, *TRI) == &AMDGPU::VGPRRegBank) {
      VOffsetReg = Base;
      SOffsetReg = B.buildConstant(S32, SOffset).getReg(0);
      B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank);
      InstOffsetVal = ImmOffset;
      return 0; // XXX - Why is this 0?
    }

    // If we have an SGPR base, we can use it for soffset.
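    // For instance (a sketch): CombinedOffset = sgpr_base + 40, where 40
    // splits into SOffset = 0 and ImmOffset = 40, is emitted as
    // voffset = 0, soffset = sgpr_base, instoffset = 40.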
    if (SOffset == 0) {
      VOffsetReg = B.buildConstant(S32, 0).getReg(0);
      B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank);
      SOffsetReg = Base;
      InstOffsetVal = ImmOffset;
      return 0; // XXX - Why is this 0?
    }
  }

  // Handle the variable sgpr + vgpr case.
  MachineInstr *Add = getOpcodeDef(AMDGPU::G_ADD, CombinedOffset, *MRI);
  if (Add && (int)Offset >= 0) {
    Register Src0 = getSrcRegIgnoringCopies(Add->getOperand(1).getReg(), *MRI);
    Register Src1 = getSrcRegIgnoringCopies(Add->getOperand(2).getReg(), *MRI);

    const RegisterBank *Src0Bank = getRegBank(Src0, *MRI, *TRI);
    const RegisterBank *Src1Bank = getRegBank(Src1, *MRI, *TRI);

    if (Src0Bank == &AMDGPU::VGPRRegBank && Src1Bank == &AMDGPU::SGPRRegBank) {
      VOffsetReg = Src0;
      SOffsetReg = Src1;
      return 0;
    }

    if (Src0Bank == &AMDGPU::SGPRRegBank && Src1Bank == &AMDGPU::VGPRRegBank) {
      VOffsetReg = Src1;
      SOffsetReg = Src0;
      return 0;
    }
  }

  // Ensure we have a VGPR for the combined offset. This could be an issue if we
  // have an SGPR offset and a VGPR resource.
  if (getRegBank(CombinedOffset, *MRI, *TRI) == &AMDGPU::VGPRRegBank) {
    VOffsetReg = CombinedOffset;
  } else {
    VOffsetReg = B.buildCopy(S32, CombinedOffset).getReg(0);
    B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank);
  }

  SOffsetReg = B.buildConstant(S32, 0).getReg(0);
  B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank);
  return 0;
}

bool AMDGPURegisterBankInfo::applyMappingSBufferLoad(
    const OperandsMapper &OpdMapper) const {
  MachineInstr &MI = OpdMapper.getMI();
  MachineRegisterInfo &MRI = OpdMapper.getMRI();

  const LLT S32 = LLT::scalar(32);
  Register Dst = MI.getOperand(0).getReg();
  LLT Ty = MRI.getType(Dst);

  const RegisterBank *RSrcBank =
    OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
  const RegisterBank *OffsetBank =
    OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
  if (RSrcBank == &AMDGPU::SGPRRegBank &&
      OffsetBank == &AMDGPU::SGPRRegBank)
    return true; // Legal mapping

  // FIXME: 96-bit case was widened during legalize. We need to narrow it back
  // here but don't have an MMO.

  unsigned LoadSize = Ty.getSizeInBits();
  int NumLoads = 1;
  if (LoadSize == 256 || LoadSize == 512) {
    NumLoads = LoadSize / 128;
    Ty = Ty.divide(NumLoads);
  }

  // Use the alignment to ensure that the required offsets will fit into the
  // immediate offsets.
  const Align Alignment = NumLoads > 1 ? Align(16 * NumLoads) : Align(1);

  MachineIRBuilder B(MI);
  MachineFunction &MF = B.getMF();

  Register SOffset;
  Register VOffset;
  int64_t ImmOffset = 0;

  unsigned MMOOffset = setBufferOffsets(B, MI.getOperand(2).getReg(), VOffset,
                                        SOffset, ImmOffset, Alignment);

  // TODO: 96-bit loads were widened to 128-bit results. Shrink the result if we
  // can, but we need to track an MMO for that.
  const unsigned MemSize = (Ty.getSizeInBits() + 7) / 8;
  const Align MemAlign(4); // FIXME: ABI type alignment?
  MachineMemOperand *BaseMMO = MF.getMachineMemOperand(
    MachinePointerInfo(),
    MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
        MachineMemOperand::MOInvariant,
    MemSize, MemAlign);
  if (MMOOffset != 0)
    BaseMMO = MF.getMachineMemOperand(BaseMMO, MMOOffset, MemSize);

  // If only the offset is divergent, emit a MUBUF buffer load instead. We can
  // assume that the buffer is unswizzled.

  Register RSrc = MI.getOperand(1).getReg();
  Register VIndex = B.buildConstant(S32, 0).getReg(0);
  B.getMRI()->setRegBank(VIndex, AMDGPU::VGPRRegBank);

  SmallVector<Register, 4> LoadParts(NumLoads);

  MachineBasicBlock::iterator MII = MI.getIterator();
  MachineInstrSpan Span(MII, &B.getMBB());

  for (int i = 0; i < NumLoads; ++i) {
    if (NumLoads == 1) {
      LoadParts[i] = Dst;
    } else {
      LoadParts[i] = MRI.createGenericVirtualRegister(Ty);
      MRI.setRegBank(LoadParts[i], AMDGPU::VGPRRegBank);
    }

    MachineMemOperand *MMO = BaseMMO;
    if (i != 0)
      BaseMMO = MF.getMachineMemOperand(BaseMMO, MMOOffset + 16 * i, MemSize);

    B.buildInstr(AMDGPU::G_AMDGPU_BUFFER_LOAD)
        .addDef(LoadParts[i])       // vdata
        .addUse(RSrc)               // rsrc
        .addUse(VIndex)             // vindex
        .addUse(VOffset)            // voffset
        .addUse(SOffset)            // soffset
        .addImm(ImmOffset + 16 * i) // offset(imm)
        .addImm(0)                  // cachepolicy, swizzled buffer(imm)
        .addImm(0)                  // idxen(imm)
        .addMemOperand(MMO);
  }

  // TODO: If only the resource is a VGPR, it may be better to execute the
  // scalar load in the waterfall loop if the resource is expected to frequently
  // be dynamically uniform.
  if (RSrcBank != &AMDGPU::SGPRRegBank) {
    // Remove the original instruction to avoid potentially confusing the
    // waterfall loop logic.
    B.setInstr(*Span.begin());
    MI.eraseFromParent();

    SmallSet<Register, 4> OpsToWaterfall;

    OpsToWaterfall.insert(RSrc);
    executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()),
                           OpsToWaterfall, MRI);
  }

  if (NumLoads != 1) {
    if (Ty.isVector())
      B.buildConcatVectors(Dst, LoadParts);
    else
      B.buildMergeLikeInstr(Dst, LoadParts);
  }

  // If we used a waterfall loop, the original instruction was already removed.
  if (RSrcBank == &AMDGPU::SGPRRegBank)
    MI.eraseFromParent();

  return true;
}

bool AMDGPURegisterBankInfo::applyMappingBFE(const OperandsMapper &OpdMapper,
                                             bool Signed) const {
  MachineInstr &MI = OpdMapper.getMI();
  MachineRegisterInfo &MRI = OpdMapper.getMRI();

  // Insert basic copies.
  applyDefaultMapping(OpdMapper);

  Register DstReg = MI.getOperand(0).getReg();
  LLT Ty = MRI.getType(DstReg);

  const LLT S32 = LLT::scalar(32);

  unsigned FirstOpnd = MI.getOpcode() == AMDGPU::G_INTRINSIC ? 2 : 1;
  Register SrcReg = MI.getOperand(FirstOpnd).getReg();
  Register OffsetReg = MI.getOperand(FirstOpnd + 1).getReg();
  Register WidthReg = MI.getOperand(FirstOpnd + 2).getReg();

  const RegisterBank *DstBank =
    OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
  if (DstBank == &AMDGPU::VGPRRegBank) {
    if (Ty == S32)
      return true;

    // There are no 64-bit VGPR bitfield extract instructions, so the operation
    // is expanded to a sequence of instructions that implement it.
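    // For a constant width <= 32, the expansion below is roughly (a sketch;
    // register names invented):
    //   %shifted:vgpr(s64) = G_LSHR %src, %offset   ; G_ASHR if signed
    //   %lo:vgpr(s32), %hi:vgpr(s32) = G_UNMERGE_VALUES %shifted
    //   %ext:vgpr(s32) = G_UBFX %lo, 0, %width      ; G_SBFX if signed
    //   %dst:vgpr(s64) = G_MERGE_VALUES %ext, <zero or sign bits>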
    ApplyRegBankMapping ApplyBank(*this, MRI, &AMDGPU::VGPRRegBank);
    MachineIRBuilder B(MI, ApplyBank);

    const LLT S64 = LLT::scalar(64);
    // Shift the source operand so that extracted bits start at bit 0.
    auto ShiftOffset = Signed ? B.buildAShr(S64, SrcReg, OffsetReg)
                              : B.buildLShr(S64, SrcReg, OffsetReg);
    auto UnmergeSOffset = B.buildUnmerge({S32, S32}, ShiftOffset);

    // A 64-bit bitfield extract uses the 32-bit bitfield extract instructions
    // if the width is a constant.
    if (auto ConstWidth = getIConstantVRegValWithLookThrough(WidthReg, MRI)) {
      // Use the 32-bit bitfield extract instruction if the width is a constant.
      // Depending on the width size, use either the low or high 32-bits.
      auto Zero = B.buildConstant(S32, 0);
      auto WidthImm = ConstWidth->Value.getZExtValue();
      if (WidthImm <= 32) {
        // Use bitfield extract on the lower 32-bit source, and then sign-extend
        // or clear the upper 32-bits.
        auto Extract =
            Signed ? B.buildSbfx(S32, UnmergeSOffset.getReg(0), Zero, WidthReg)
                   : B.buildUbfx(S32, UnmergeSOffset.getReg(0), Zero, WidthReg);
        auto Extend =
            Signed ? B.buildAShr(S32, Extract, B.buildConstant(S32, 31)) : Zero;
        B.buildMergeLikeInstr(DstReg, {Extract, Extend});
      } else {
        // Use bitfield extract on upper 32-bit source, and combine with lower
        // 32-bit source.
        auto UpperWidth = B.buildConstant(S32, WidthImm - 32);
        auto Extract =
            Signed
                ? B.buildSbfx(S32, UnmergeSOffset.getReg(1), Zero, UpperWidth)
                : B.buildUbfx(S32, UnmergeSOffset.getReg(1), Zero, UpperWidth);
        B.buildMergeLikeInstr(DstReg, {UnmergeSOffset.getReg(0), Extract});
      }
      MI.eraseFromParent();
      return true;
    }

    // Expand to Src >> Offset << (64 - Width) >> (64 - Width) using 64-bit
    // operations.
    auto ExtShift = B.buildSub(S32, B.buildConstant(S32, 64), WidthReg);
    auto SignBit = B.buildShl(S64, ShiftOffset, ExtShift);
    if (Signed)
      B.buildAShr(S64, SignBit, ExtShift);
    else
      B.buildLShr(S64, SignBit, ExtShift);
    MI.eraseFromParent();
    return true;
  }

  // The scalar form packs the offset and width in a single operand.

  ApplyRegBankMapping ApplyBank(*this, MRI, &AMDGPU::SGPRRegBank);
  MachineIRBuilder B(MI, ApplyBank);

  // Ensure the high bits are clear to insert the offset.
  auto OffsetMask = B.buildConstant(S32, maskTrailingOnes<unsigned>(6));
  auto ClampOffset = B.buildAnd(S32, OffsetReg, OffsetMask);

  // The shift zeros out the low bits, so don't bother clamping the input value.
  auto ShiftWidth = B.buildShl(S32, WidthReg, B.buildConstant(S32, 16));

  // Pack the offset and width of a BFE into the format expected by
  // S_BFE_I32 / S_BFE_U32: in the second source operand, bits [5:0] contain
  // the offset and bits [22:16] the width.
  auto MergedInputs = B.buildOr(S32, ClampOffset, ShiftWidth);

  // TODO: It might be worth using a pseudo here to avoid scc clobber and
  // register class constraints.
  unsigned Opc = Ty == S32 ? (Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32) :
                             (Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64);
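  // For example (a sketch): offset = 8, width = 16 packs as
  // (16 << 16) | 8 = 0x00100008 in the second source operand.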
AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64); 1539 1540 auto MIB = B.buildInstr(Opc, {DstReg}, {SrcReg, MergedInputs}); 1541 if (!constrainSelectedInstRegOperands(*MIB, *TII, *TRI, *this)) 1542 llvm_unreachable("failed to constrain BFE"); 1543 1544 MI.eraseFromParent(); 1545 return true; 1546 } 1547 1548 bool AMDGPURegisterBankInfo::applyMappingMAD_64_32( 1549 const OperandsMapper &OpdMapper) const { 1550 MachineInstr &MI = OpdMapper.getMI(); 1551 MachineRegisterInfo &MRI = OpdMapper.getMRI(); 1552 1553 // Insert basic copies. 1554 applyDefaultMapping(OpdMapper); 1555 1556 Register Dst0 = MI.getOperand(0).getReg(); 1557 Register Dst1 = MI.getOperand(1).getReg(); 1558 Register Src0 = MI.getOperand(2).getReg(); 1559 Register Src1 = MI.getOperand(3).getReg(); 1560 Register Src2 = MI.getOperand(4).getReg(); 1561 1562 if (MRI.getRegBankOrNull(Src0) == &AMDGPU::VGPRRegBank) 1563 return true; 1564 1565 bool IsUnsigned = MI.getOpcode() == AMDGPU::G_AMDGPU_MAD_U64_U32; 1566 LLT S1 = LLT::scalar(1); 1567 LLT S32 = LLT::scalar(32); 1568 1569 bool DstOnValu = MRI.getRegBankOrNull(Src2) == &AMDGPU::VGPRRegBank; 1570 bool Accumulate = true; 1571 1572 if (!DstOnValu) { 1573 if (mi_match(Src2, MRI, m_ZeroInt())) 1574 Accumulate = false; 1575 } 1576 1577 // Keep the multiplication on the SALU. 1578 MachineIRBuilder B(MI); 1579 1580 Register DstHi; 1581 Register DstLo = B.buildMul(S32, Src0, Src1).getReg(0); 1582 bool MulHiInVgpr = false; 1583 1584 MRI.setRegBank(DstLo, AMDGPU::SGPRRegBank); 1585 1586 if (Subtarget.hasSMulHi()) { 1587 DstHi = IsUnsigned ? B.buildUMulH(S32, Src0, Src1).getReg(0) 1588 : B.buildSMulH(S32, Src0, Src1).getReg(0); 1589 MRI.setRegBank(DstHi, AMDGPU::SGPRRegBank); 1590 } else { 1591 Register VSrc0 = B.buildCopy(S32, Src0).getReg(0); 1592 Register VSrc1 = B.buildCopy(S32, Src1).getReg(0); 1593 1594 MRI.setRegBank(VSrc0, AMDGPU::VGPRRegBank); 1595 MRI.setRegBank(VSrc1, AMDGPU::VGPRRegBank); 1596 1597 DstHi = IsUnsigned ? B.buildUMulH(S32, VSrc0, VSrc1).getReg(0) 1598 : B.buildSMulH(S32, VSrc0, VSrc1).getReg(0); 1599 MRI.setRegBank(DstHi, AMDGPU::VGPRRegBank); 1600 1601 if (!DstOnValu) { 1602 DstHi = buildReadFirstLane(B, MRI, DstHi); 1603 } else { 1604 MulHiInVgpr = true; 1605 } 1606 } 1607 1608 // Accumulate and produce the "carry-out" bit. 1609 // 1610 // The "carry-out" is defined as bit 64 of the result when computed as a 1611 // big integer. For unsigned multiply-add, this matches the usual definition 1612 // of carry-out. For signed multiply-add, bit 64 is the sign bit of the 1613 // result, which is determined as: 1614 // sign(Src0 * Src1) + sign(Src2) + carry-out from unsigned 64-bit add 1615 LLT CarryType = DstOnValu ? S1 : S32; 1616 const RegisterBank &CarryBank = 1617 DstOnValu ? AMDGPU::VCCRegBank : AMDGPU::SGPRRegBank; 1618 const RegisterBank &DstBank = 1619 DstOnValu ? AMDGPU::VGPRRegBank : AMDGPU::SGPRRegBank; 1620 Register Carry; 1621 Register Zero; 1622 1623 if (!IsUnsigned) { 1624 Zero = B.buildConstant(S32, 0).getReg(0); 1625 MRI.setRegBank(Zero, 1626 MulHiInVgpr ? AMDGPU::VGPRRegBank : AMDGPU::SGPRRegBank); 1627 1628 Carry = B.buildICmp(CmpInst::ICMP_SLT, MulHiInVgpr ? S1 : S32, DstHi, Zero) 1629 .getReg(0); 1630 MRI.setRegBank(Carry, MulHiInVgpr ? 
AMDGPU::VCCRegBank 1631 : AMDGPU::SGPRRegBank); 1632 1633 if (DstOnValu && !MulHiInVgpr) { 1634 Carry = B.buildTrunc(S1, Carry).getReg(0); 1635 MRI.setRegBank(Carry, AMDGPU::VCCRegBank); 1636 } 1637 } 1638 1639 if (Accumulate) { 1640 if (DstOnValu) { 1641 DstLo = B.buildCopy(S32, DstLo).getReg(0); 1642 DstHi = B.buildCopy(S32, DstHi).getReg(0); 1643 MRI.setRegBank(DstLo, AMDGPU::VGPRRegBank); 1644 MRI.setRegBank(DstHi, AMDGPU::VGPRRegBank); 1645 } 1646 1647 auto Unmerge = B.buildUnmerge(S32, Src2); 1648 Register Src2Lo = Unmerge.getReg(0); 1649 Register Src2Hi = Unmerge.getReg(1); 1650 MRI.setRegBank(Src2Lo, DstBank); 1651 MRI.setRegBank(Src2Hi, DstBank); 1652 1653 if (!IsUnsigned) { 1654 auto Src2Sign = B.buildICmp(CmpInst::ICMP_SLT, CarryType, Src2Hi, Zero); 1655 MRI.setRegBank(Src2Sign.getReg(0), CarryBank); 1656 1657 Carry = B.buildXor(CarryType, Carry, Src2Sign).getReg(0); 1658 MRI.setRegBank(Carry, CarryBank); 1659 } 1660 1661 auto AddLo = B.buildUAddo(S32, CarryType, DstLo, Src2Lo); 1662 DstLo = AddLo.getReg(0); 1663 Register CarryLo = AddLo.getReg(1); 1664 MRI.setRegBank(DstLo, DstBank); 1665 MRI.setRegBank(CarryLo, CarryBank); 1666 1667 auto AddHi = B.buildUAdde(S32, CarryType, DstHi, Src2Hi, CarryLo); 1668 DstHi = AddHi.getReg(0); 1669 MRI.setRegBank(DstHi, DstBank); 1670 1671 Register CarryHi = AddHi.getReg(1); 1672 MRI.setRegBank(CarryHi, CarryBank); 1673 1674 if (IsUnsigned) { 1675 Carry = CarryHi; 1676 } else { 1677 Carry = B.buildXor(CarryType, Carry, CarryHi).getReg(0); 1678 MRI.setRegBank(Carry, CarryBank); 1679 } 1680 } else { 1681 if (IsUnsigned) { 1682 Carry = B.buildConstant(CarryType, 0).getReg(0); 1683 MRI.setRegBank(Carry, CarryBank); 1684 } 1685 } 1686 1687 B.buildMergeLikeInstr(Dst0, {DstLo, DstHi}); 1688 1689 if (DstOnValu) { 1690 B.buildCopy(Dst1, Carry); 1691 } else { 1692 B.buildTrunc(Dst1, Carry); 1693 } 1694 1695 MI.eraseFromParent(); 1696 return true; 1697 } 1698 1699 // Return a suitable opcode for extending the operands of Opc when widening. 1700 static unsigned getExtendOp(unsigned Opc) { 1701 switch (Opc) { 1702 case TargetOpcode::G_ASHR: 1703 case TargetOpcode::G_SMIN: 1704 case TargetOpcode::G_SMAX: 1705 return TargetOpcode::G_SEXT; 1706 case TargetOpcode::G_LSHR: 1707 case TargetOpcode::G_UMIN: 1708 case TargetOpcode::G_UMAX: 1709 return TargetOpcode::G_ZEXT; 1710 default: 1711 return TargetOpcode::G_ANYEXT; 1712 } 1713 } 1714 1715 // Emit a legalized extension from <2 x s16> to 2 32-bit components, avoiding 1716 // any illegal vector extend or unmerge operations. 1717 static std::pair<Register, Register> 1718 unpackV2S16ToS32(MachineIRBuilder &B, Register Src, unsigned ExtOpcode) { 1719 const LLT S32 = LLT::scalar(32); 1720 auto Bitcast = B.buildBitcast(S32, Src); 1721 1722 if (ExtOpcode == TargetOpcode::G_SEXT) { 1723 auto ExtLo = B.buildSExtInReg(S32, Bitcast, 16); 1724 auto ShiftHi = B.buildAShr(S32, Bitcast, B.buildConstant(S32, 16)); 1725 return std::pair(ExtLo.getReg(0), ShiftHi.getReg(0)); 1726 } 1727 1728 auto ShiftHi = B.buildLShr(S32, Bitcast, B.buildConstant(S32, 16)); 1729 if (ExtOpcode == TargetOpcode::G_ZEXT) { 1730 auto ExtLo = B.buildAnd(S32, Bitcast, B.buildConstant(S32, 0xffff)); 1731 return std::pair(ExtLo.getReg(0), ShiftHi.getReg(0)); 1732 } 1733 1734 assert(ExtOpcode == TargetOpcode::G_ANYEXT); 1735 return std::pair(Bitcast.getReg(0), ShiftHi.getReg(0)); 1736 } 1737 1738 // For cases where only a single copy is inserted for matching register banks. 
1739 // Replace the register in the instruction operand 1740 static bool substituteSimpleCopyRegs( 1741 const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper, unsigned OpIdx) { 1742 SmallVector<unsigned, 1> SrcReg(OpdMapper.getVRegs(OpIdx)); 1743 if (!SrcReg.empty()) { 1744 assert(SrcReg.size() == 1); 1745 OpdMapper.getMI().getOperand(OpIdx).setReg(SrcReg[0]); 1746 return true; 1747 } 1748 1749 return false; 1750 } 1751 1752 /// Handle register layout difference for f16 images for some subtargets. 1753 Register AMDGPURegisterBankInfo::handleD16VData(MachineIRBuilder &B, 1754 MachineRegisterInfo &MRI, 1755 Register Reg) const { 1756 if (!Subtarget.hasUnpackedD16VMem()) 1757 return Reg; 1758 1759 const LLT S16 = LLT::scalar(16); 1760 LLT StoreVT = MRI.getType(Reg); 1761 if (!StoreVT.isVector() || StoreVT.getElementType() != S16) 1762 return Reg; 1763 1764 auto Unmerge = B.buildUnmerge(S16, Reg); 1765 1766 1767 SmallVector<Register, 4> WideRegs; 1768 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 1769 WideRegs.push_back(Unmerge.getReg(I)); 1770 1771 const LLT S32 = LLT::scalar(32); 1772 int NumElts = StoreVT.getNumElements(); 1773 1774 return B.buildMergeLikeInstr(LLT::fixed_vector(NumElts, S32), WideRegs) 1775 .getReg(0); 1776 } 1777 1778 static std::pair<Register, unsigned> 1779 getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg) { 1780 int64_t Const; 1781 if (mi_match(Reg, MRI, m_ICst(Const))) 1782 return std::pair(Register(), Const); 1783 1784 Register Base; 1785 if (mi_match(Reg, MRI, m_GAdd(m_Reg(Base), m_ICst(Const)))) 1786 return std::pair(Base, Const); 1787 1788 // TODO: Handle G_OR used for add case 1789 return std::pair(Reg, 0); 1790 } 1791 1792 std::pair<Register, unsigned> 1793 AMDGPURegisterBankInfo::splitBufferOffsets(MachineIRBuilder &B, 1794 Register OrigOffset) const { 1795 const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(); 1796 Register BaseReg; 1797 unsigned ImmOffset; 1798 const LLT S32 = LLT::scalar(32); 1799 1800 // TODO: Use AMDGPU::getBaseWithConstantOffset() instead. 1801 std::tie(BaseReg, ImmOffset) = getBaseWithConstantOffset(*B.getMRI(), 1802 OrigOffset); 1803 1804 unsigned C1 = 0; 1805 if (ImmOffset != 0) { 1806 // If the immediate value is too big for the immoffset field, put only bits 1807 // that would normally fit in the immoffset field. The remaining value that 1808 // is copied/added for the voffset field is a large power of 2, and it 1809 // stands more chance of being CSEd with the copy/add for another similar 1810 // load/store. 1811 // However, do not do that rounding down if that is a negative 1812 // number, as it appears to be illegal to have a negative offset in the 1813 // vgpr, even if adding the immediate offset makes it positive. 
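    // Worked example (assuming the common 12-bit immoffset field, so
    // MaxImm == 4095): a constant offset of 8212 splits into
    // Overflow == (8212 & ~4095) == 8192, which is added to the voffset
    // register, and ImmOffset == 20, which stays in the immediate field.
    // If the split would leave a negative 32-bit Overflow, the entire
    // offset is moved into the voffset register instead.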
1814 unsigned Overflow = ImmOffset & ~MaxImm; 1815 ImmOffset -= Overflow; 1816 if ((int32_t)Overflow < 0) { 1817 Overflow += ImmOffset; 1818 ImmOffset = 0; 1819 } 1820 1821 C1 = ImmOffset; 1822 if (Overflow != 0) { 1823 if (!BaseReg) 1824 BaseReg = B.buildConstant(S32, Overflow).getReg(0); 1825 else { 1826 auto OverflowVal = B.buildConstant(S32, Overflow); 1827 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0); 1828 } 1829 } 1830 } 1831 1832 if (!BaseReg) 1833 BaseReg = B.buildConstant(S32, 0).getReg(0); 1834 1835 return {BaseReg, C1}; 1836 } 1837 1838 bool AMDGPURegisterBankInfo::buildVCopy(MachineIRBuilder &B, Register DstReg, 1839 Register SrcReg) const { 1840 MachineRegisterInfo &MRI = *B.getMRI(); 1841 LLT SrcTy = MRI.getType(SrcReg); 1842 if (SrcTy.getSizeInBits() == 32) { 1843 // Use a v_mov_b32 here to make the exec dependency explicit. 1844 B.buildInstr(AMDGPU::V_MOV_B32_e32) 1845 .addDef(DstReg) 1846 .addUse(SrcReg); 1847 return constrainGenericRegister(DstReg, AMDGPU::VGPR_32RegClass, MRI) && 1848 constrainGenericRegister(SrcReg, AMDGPU::SReg_32RegClass, MRI); 1849 } 1850 1851 Register TmpReg0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 1852 Register TmpReg1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 1853 1854 B.buildInstr(AMDGPU::V_MOV_B32_e32) 1855 .addDef(TmpReg0) 1856 .addUse(SrcReg, 0, AMDGPU::sub0); 1857 B.buildInstr(AMDGPU::V_MOV_B32_e32) 1858 .addDef(TmpReg1) 1859 .addUse(SrcReg, 0, AMDGPU::sub1); 1860 B.buildInstr(AMDGPU::REG_SEQUENCE) 1861 .addDef(DstReg) 1862 .addUse(TmpReg0) 1863 .addImm(AMDGPU::sub0) 1864 .addUse(TmpReg1) 1865 .addImm(AMDGPU::sub1); 1866 1867 return constrainGenericRegister(SrcReg, AMDGPU::SReg_64RegClass, MRI) && 1868 constrainGenericRegister(DstReg, AMDGPU::VReg_64RegClass, MRI); 1869 } 1870 1871 /// Utility function for pushing dynamic vector indexes with a constant offset 1872 /// into waterfall loops. 1873 static void reinsertVectorIndexAdd(MachineIRBuilder &B, 1874 MachineInstr &IdxUseInstr, 1875 unsigned OpIdx, 1876 unsigned ConstOffset) { 1877 MachineRegisterInfo &MRI = *B.getMRI(); 1878 const LLT S32 = LLT::scalar(32); 1879 Register WaterfallIdx = IdxUseInstr.getOperand(OpIdx).getReg(); 1880 B.setInsertPt(*IdxUseInstr.getParent(), IdxUseInstr.getIterator()); 1881 1882 auto MaterializedOffset = B.buildConstant(S32, ConstOffset); 1883 1884 auto Add = B.buildAdd(S32, WaterfallIdx, MaterializedOffset); 1885 MRI.setRegBank(MaterializedOffset.getReg(0), AMDGPU::SGPRRegBank); 1886 MRI.setRegBank(Add.getReg(0), AMDGPU::SGPRRegBank); 1887 IdxUseInstr.getOperand(OpIdx).setReg(Add.getReg(0)); 1888 } 1889 1890 /// Implement extending a 32-bit value to a 64-bit value. \p Lo32Reg is the 1891 /// original 32-bit source value (to be inserted in the low part of the combined 1892 /// 64-bit result), and \p Hi32Reg is the high half of the combined 64-bit 1893 /// value. 1894 static void extendLow32IntoHigh32(MachineIRBuilder &B, 1895 Register Hi32Reg, Register Lo32Reg, 1896 unsigned ExtOpc, 1897 const RegisterBank &RegBank, 1898 bool IsBooleanSrc = false) { 1899 if (ExtOpc == AMDGPU::G_ZEXT) { 1900 B.buildConstant(Hi32Reg, 0); 1901 } else if (ExtOpc == AMDGPU::G_SEXT) { 1902 if (IsBooleanSrc) { 1903 // If we know the original source was an s1, the high half is the same as 1904 // the low. 1905 B.buildCopy(Hi32Reg, Lo32Reg); 1906 } else { 1907 // Replicate sign bit from 32-bit extended part. 
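      // e.g. a negative low half such as 0x80000000 produces an all-ones
      // high half, since (0x80000000 ashr 31) == 0xffffffff.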
1908 auto ShiftAmt = B.buildConstant(LLT::scalar(32), 31); 1909 B.getMRI()->setRegBank(ShiftAmt.getReg(0), RegBank); 1910 B.buildAShr(Hi32Reg, Lo32Reg, ShiftAmt); 1911 } 1912 } else { 1913 assert(ExtOpc == AMDGPU::G_ANYEXT && "not an integer extension"); 1914 B.buildUndef(Hi32Reg); 1915 } 1916 } 1917 1918 bool AMDGPURegisterBankInfo::foldExtractEltToCmpSelect( 1919 MachineInstr &MI, MachineRegisterInfo &MRI, 1920 const OperandsMapper &OpdMapper) const { 1921 1922 Register VecReg = MI.getOperand(1).getReg(); 1923 Register Idx = MI.getOperand(2).getReg(); 1924 1925 const RegisterBank &IdxBank = 1926 *OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank; 1927 1928 bool IsDivergentIdx = IdxBank != AMDGPU::SGPRRegBank; 1929 1930 LLT VecTy = MRI.getType(VecReg); 1931 unsigned EltSize = VecTy.getScalarSizeInBits(); 1932 unsigned NumElem = VecTy.getNumElements(); 1933 1934 if (!SITargetLowering::shouldExpandVectorDynExt(EltSize, NumElem, 1935 IsDivergentIdx, &Subtarget)) 1936 return false; 1937 1938 MachineIRBuilder B(MI); 1939 LLT S32 = LLT::scalar(32); 1940 1941 const RegisterBank &DstBank = 1942 *OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; 1943 const RegisterBank &SrcBank = 1944 *OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank; 1945 1946 const RegisterBank &CCBank = 1947 (DstBank == AMDGPU::SGPRRegBank && 1948 SrcBank == AMDGPU::SGPRRegBank && 1949 IdxBank == AMDGPU::SGPRRegBank) ? AMDGPU::SGPRRegBank 1950 : AMDGPU::VCCRegBank; 1951 LLT CCTy = (CCBank == AMDGPU::SGPRRegBank) ? S32 : LLT::scalar(1); 1952 1953 if (CCBank == AMDGPU::VCCRegBank && IdxBank == AMDGPU::SGPRRegBank) { 1954 Idx = B.buildCopy(S32, Idx)->getOperand(0).getReg(); 1955 MRI.setRegBank(Idx, AMDGPU::VGPRRegBank); 1956 } 1957 1958 LLT EltTy = VecTy.getScalarType(); 1959 SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0)); 1960 unsigned NumLanes = DstRegs.size(); 1961 if (!NumLanes) 1962 NumLanes = 1; 1963 else 1964 EltTy = MRI.getType(DstRegs[0]); 1965 1966 auto UnmergeToEltTy = B.buildUnmerge(EltTy, VecReg); 1967 SmallVector<Register, 2> Res(NumLanes); 1968 for (unsigned L = 0; L < NumLanes; ++L) 1969 Res[L] = UnmergeToEltTy.getReg(L); 1970 1971 for (unsigned I = 1; I < NumElem; ++I) { 1972 auto IC = B.buildConstant(S32, I); 1973 MRI.setRegBank(IC->getOperand(0).getReg(), AMDGPU::SGPRRegBank); 1974 auto Cmp = B.buildICmp(CmpInst::ICMP_EQ, CCTy, Idx, IC); 1975 MRI.setRegBank(Cmp->getOperand(0).getReg(), CCBank); 1976 1977 for (unsigned L = 0; L < NumLanes; ++L) { 1978 auto S = B.buildSelect(EltTy, Cmp, 1979 UnmergeToEltTy.getReg(I * NumLanes + L), Res[L]); 1980 1981 for (unsigned N : { 0, 2, 3 }) 1982 MRI.setRegBank(S->getOperand(N).getReg(), DstBank); 1983 1984 Res[L] = S->getOperand(0).getReg(); 1985 } 1986 } 1987 1988 for (unsigned L = 0; L < NumLanes; ++L) { 1989 Register DstReg = (NumLanes == 1) ? MI.getOperand(0).getReg() : DstRegs[L]; 1990 B.buildCopy(DstReg, Res[L]); 1991 MRI.setRegBank(DstReg, DstBank); 1992 } 1993 1994 MRI.setRegBank(MI.getOperand(0).getReg(), DstBank); 1995 MI.eraseFromParent(); 1996 1997 return true; 1998 } 1999 2000 // Insert a cross regbank copy for a register if it already has a bank that 2001 // differs from the one we want to set. 
2002 static Register constrainRegToBank(MachineRegisterInfo &MRI, 2003 MachineIRBuilder &B, Register &Reg, 2004 const RegisterBank &Bank) { 2005 const RegisterBank *CurrBank = MRI.getRegBankOrNull(Reg); 2006 if (CurrBank && *CurrBank != Bank) { 2007 Register Copy = B.buildCopy(MRI.getType(Reg), Reg).getReg(0); 2008 MRI.setRegBank(Copy, Bank); 2009 return Copy; 2010 } 2011 2012 MRI.setRegBank(Reg, Bank); 2013 return Reg; 2014 } 2015 2016 bool AMDGPURegisterBankInfo::foldInsertEltToCmpSelect( 2017 MachineInstr &MI, MachineRegisterInfo &MRI, 2018 const OperandsMapper &OpdMapper) const { 2019 2020 Register VecReg = MI.getOperand(1).getReg(); 2021 Register Idx = MI.getOperand(3).getReg(); 2022 2023 const RegisterBank &IdxBank = 2024 *OpdMapper.getInstrMapping().getOperandMapping(3).BreakDown[0].RegBank; 2025 2026 bool IsDivergentIdx = IdxBank != AMDGPU::SGPRRegBank; 2027 2028 LLT VecTy = MRI.getType(VecReg); 2029 unsigned EltSize = VecTy.getScalarSizeInBits(); 2030 unsigned NumElem = VecTy.getNumElements(); 2031 2032 if (!SITargetLowering::shouldExpandVectorDynExt(EltSize, NumElem, 2033 IsDivergentIdx, &Subtarget)) 2034 return false; 2035 2036 MachineIRBuilder B(MI); 2037 LLT S32 = LLT::scalar(32); 2038 2039 const RegisterBank &DstBank = 2040 *OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; 2041 const RegisterBank &SrcBank = 2042 *OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank; 2043 const RegisterBank &InsBank = 2044 *OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank; 2045 2046 const RegisterBank &CCBank = 2047 (DstBank == AMDGPU::SGPRRegBank && 2048 SrcBank == AMDGPU::SGPRRegBank && 2049 InsBank == AMDGPU::SGPRRegBank && 2050 IdxBank == AMDGPU::SGPRRegBank) ? AMDGPU::SGPRRegBank 2051 : AMDGPU::VCCRegBank; 2052 LLT CCTy = (CCBank == AMDGPU::SGPRRegBank) ? 
S32 : LLT::scalar(1); 2053 2054 if (CCBank == AMDGPU::VCCRegBank && IdxBank == AMDGPU::SGPRRegBank) { 2055 Idx = B.buildCopy(S32, Idx)->getOperand(0).getReg(); 2056 MRI.setRegBank(Idx, AMDGPU::VGPRRegBank); 2057 } 2058 2059 LLT EltTy = VecTy.getScalarType(); 2060 SmallVector<Register, 2> InsRegs(OpdMapper.getVRegs(2)); 2061 unsigned NumLanes = InsRegs.size(); 2062 if (!NumLanes) { 2063 NumLanes = 1; 2064 InsRegs.push_back(MI.getOperand(2).getReg()); 2065 } else { 2066 EltTy = MRI.getType(InsRegs[0]); 2067 } 2068 2069 auto UnmergeToEltTy = B.buildUnmerge(EltTy, VecReg); 2070 SmallVector<Register, 16> Ops(NumElem * NumLanes); 2071 2072 for (unsigned I = 0; I < NumElem; ++I) { 2073 auto IC = B.buildConstant(S32, I); 2074 MRI.setRegBank(IC->getOperand(0).getReg(), AMDGPU::SGPRRegBank); 2075 auto Cmp = B.buildICmp(CmpInst::ICMP_EQ, CCTy, Idx, IC); 2076 MRI.setRegBank(Cmp->getOperand(0).getReg(), CCBank); 2077 2078 for (unsigned L = 0; L < NumLanes; ++L) { 2079 Register Op0 = constrainRegToBank(MRI, B, InsRegs[L], DstBank); 2080 Register Op1 = UnmergeToEltTy.getReg(I * NumLanes + L); 2081 Op1 = constrainRegToBank(MRI, B, Op1, DstBank); 2082 2083 Register Select = B.buildSelect(EltTy, Cmp, Op0, Op1).getReg(0); 2084 MRI.setRegBank(Select, DstBank); 2085 2086 Ops[I * NumLanes + L] = Select; 2087 } 2088 } 2089 2090 LLT MergeTy = LLT::fixed_vector(Ops.size(), EltTy); 2091 if (MergeTy == MRI.getType(MI.getOperand(0).getReg())) { 2092 B.buildBuildVector(MI.getOperand(0), Ops); 2093 } else { 2094 auto Vec = B.buildBuildVector(MergeTy, Ops); 2095 MRI.setRegBank(Vec->getOperand(0).getReg(), DstBank); 2096 B.buildBitcast(MI.getOperand(0).getReg(), Vec); 2097 } 2098 2099 MRI.setRegBank(MI.getOperand(0).getReg(), DstBank); 2100 MI.eraseFromParent(); 2101 2102 return true; 2103 } 2104 2105 void AMDGPURegisterBankInfo::applyMappingImpl( 2106 const OperandsMapper &OpdMapper) const { 2107 MachineInstr &MI = OpdMapper.getMI(); 2108 unsigned Opc = MI.getOpcode(); 2109 MachineRegisterInfo &MRI = OpdMapper.getMRI(); 2110 switch (Opc) { 2111 case AMDGPU::G_CONSTANT: 2112 case AMDGPU::G_IMPLICIT_DEF: { 2113 Register DstReg = MI.getOperand(0).getReg(); 2114 LLT DstTy = MRI.getType(DstReg); 2115 if (DstTy != LLT::scalar(1)) 2116 break; 2117 2118 const RegisterBank *DstBank = 2119 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; 2120 if (DstBank == &AMDGPU::VCCRegBank) 2121 break; 2122 SmallVector<Register, 1> DefRegs(OpdMapper.getVRegs(0)); 2123 if (DefRegs.empty()) 2124 DefRegs.push_back(DstReg); 2125 2126 MachineIRBuilder B(MI); 2127 B.setInsertPt(*MI.getParent(), ++MI.getIterator()); 2128 2129 Register NewDstReg = MRI.createGenericVirtualRegister(LLT::scalar(32)); 2130 LLVMContext &Ctx = B.getMF().getFunction().getContext(); 2131 2132 MI.getOperand(0).setReg(NewDstReg); 2133 if (Opc != AMDGPU::G_IMPLICIT_DEF) { 2134 uint64_t ConstVal = MI.getOperand(1).getCImm()->getZExtValue(); 2135 MI.getOperand(1).setCImm( 2136 ConstantInt::get(IntegerType::getInt32Ty(Ctx), ConstVal)); 2137 } 2138 2139 MRI.setRegBank(NewDstReg, *DstBank); 2140 B.buildTrunc(DefRegs[0], NewDstReg); 2141 return; 2142 } 2143 case AMDGPU::G_PHI: { 2144 Register DstReg = MI.getOperand(0).getReg(); 2145 LLT DstTy = MRI.getType(DstReg); 2146 if (DstTy != LLT::scalar(1)) 2147 break; 2148 2149 const LLT S32 = LLT::scalar(32); 2150 const RegisterBank *DstBank = 2151 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; 2152 if (DstBank == &AMDGPU::VCCRegBank) { 2153 applyDefaultMapping(OpdMapper); 2154 // The standard 
handling only considers the result register bank for 2155 // phis. For VCC, blindly inserting a copy when the phi is lowered will 2156 // produce an invalid copy. We can only copy with some kind of compare to 2157 // get a vector boolean result. Insert a register bank copy that will be 2158 // correctly lowered to a compare. 2159 MachineIRBuilder B(*MI.getParent()->getParent()); 2160 2161 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) { 2162 Register SrcReg = MI.getOperand(I).getReg(); 2163 const RegisterBank *SrcBank = getRegBank(SrcReg, MRI, *TRI); 2164 2165 if (SrcBank != &AMDGPU::VCCRegBank) { 2166 MachineBasicBlock *SrcMBB = MI.getOperand(I + 1).getMBB(); 2167 B.setInsertPt(*SrcMBB, SrcMBB->getFirstTerminator()); 2168 2169 auto Copy = B.buildCopy(LLT::scalar(1), SrcReg); 2170 MRI.setRegBank(Copy.getReg(0), AMDGPU::VCCRegBank); 2171 MI.getOperand(I).setReg(Copy.getReg(0)); 2172 } 2173 } 2174 2175 return; 2176 } 2177 2178 // Phi handling is strange and only considers the bank of the destination. 2179 substituteSimpleCopyRegs(OpdMapper, 0); 2180 2181 // Promote SGPR/VGPR booleans to s32 2182 MachineFunction *MF = MI.getParent()->getParent(); 2183 ApplyRegBankMapping ApplyBank(*this, MRI, DstBank); 2184 MachineIRBuilder B(MI, ApplyBank); 2185 LegalizerHelper Helper(*MF, ApplyBank, B); 2186 2187 if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized) 2188 llvm_unreachable("widen scalar should have succeeded"); 2189 2190 return; 2191 } 2192 case AMDGPU::G_ICMP: 2193 case AMDGPU::G_UADDO: 2194 case AMDGPU::G_USUBO: 2195 case AMDGPU::G_UADDE: 2196 case AMDGPU::G_SADDE: 2197 case AMDGPU::G_USUBE: 2198 case AMDGPU::G_SSUBE: { 2199 unsigned BoolDstOp = Opc == AMDGPU::G_ICMP ? 0 : 1; 2200 Register DstReg = MI.getOperand(BoolDstOp).getReg(); 2201 2202 const RegisterBank *DstBank = 2203 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; 2204 if (DstBank != &AMDGPU::SGPRRegBank) 2205 break; 2206 2207 const bool HasCarryIn = MI.getNumOperands() == 5; 2208 2209 // If this is a scalar compare, promote the result to s32, as the selection 2210 // will end up using a copy to a 32-bit vreg. 2211 const LLT S32 = LLT::scalar(32); 2212 Register NewDstReg = MRI.createGenericVirtualRegister(S32); 2213 MRI.setRegBank(NewDstReg, AMDGPU::SGPRRegBank); 2214 MI.getOperand(BoolDstOp).setReg(NewDstReg); 2215 MachineIRBuilder B(MI); 2216 2217 if (HasCarryIn) { 2218 Register NewSrcReg = MRI.createGenericVirtualRegister(S32); 2219 MRI.setRegBank(NewSrcReg, AMDGPU::SGPRRegBank); 2220 B.buildZExt(NewSrcReg, MI.getOperand(4).getReg()); 2221 MI.getOperand(4).setReg(NewSrcReg); 2222 } 2223 2224 MachineBasicBlock *MBB = MI.getParent(); 2225 B.setInsertPt(*MBB, std::next(MI.getIterator())); 2226 2227 // If we had a constrained VCC result register, a copy was inserted to VCC 2228 // from SGPR. 
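    // The net effect is roughly (illustrative, not the exact MIR):
    //   %c:sgpr(s1) = G_ICMP intpred(eq), %a, %b
    // ->
    //   %c32:sgpr(s32) = G_ICMP intpred(eq), %a, %b
    //   %c:sgpr(s1) = G_TRUNC %c32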
2229 SmallVector<Register, 1> DefRegs(OpdMapper.getVRegs(0)); 2230 if (DefRegs.empty()) 2231 DefRegs.push_back(DstReg); 2232 B.buildTrunc(DefRegs[0], NewDstReg); 2233 return; 2234 } 2235 case AMDGPU::G_SELECT: { 2236 Register DstReg = MI.getOperand(0).getReg(); 2237 LLT DstTy = MRI.getType(DstReg); 2238 2239 SmallVector<Register, 1> CondRegs(OpdMapper.getVRegs(1)); 2240 if (CondRegs.empty()) 2241 CondRegs.push_back(MI.getOperand(1).getReg()); 2242 else { 2243 assert(CondRegs.size() == 1); 2244 } 2245 2246 const RegisterBank *CondBank = getRegBank(CondRegs[0], MRI, *TRI); 2247 if (CondBank == &AMDGPU::SGPRRegBank) { 2248 MachineIRBuilder B(MI); 2249 const LLT S32 = LLT::scalar(32); 2250 Register NewCondReg = MRI.createGenericVirtualRegister(S32); 2251 MRI.setRegBank(NewCondReg, AMDGPU::SGPRRegBank); 2252 2253 MI.getOperand(1).setReg(NewCondReg); 2254 B.buildZExt(NewCondReg, CondRegs[0]); 2255 } 2256 2257 if (DstTy.getSizeInBits() != 64) 2258 break; 2259 2260 MachineIRBuilder B(MI); 2261 LLT HalfTy = getHalfSizedType(DstTy); 2262 2263 SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0)); 2264 SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2)); 2265 SmallVector<Register, 2> Src2Regs(OpdMapper.getVRegs(3)); 2266 2267 // All inputs are SGPRs, nothing special to do. 2268 if (DefRegs.empty()) { 2269 assert(Src1Regs.empty() && Src2Regs.empty()); 2270 break; 2271 } 2272 2273 if (Src1Regs.empty()) 2274 split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg()); 2275 else { 2276 setRegsToType(MRI, Src1Regs, HalfTy); 2277 } 2278 2279 if (Src2Regs.empty()) 2280 split64BitValueForMapping(B, Src2Regs, HalfTy, MI.getOperand(3).getReg()); 2281 else 2282 setRegsToType(MRI, Src2Regs, HalfTy); 2283 2284 setRegsToType(MRI, DefRegs, HalfTy); 2285 2286 B.buildSelect(DefRegs[0], CondRegs[0], Src1Regs[0], Src2Regs[0]); 2287 B.buildSelect(DefRegs[1], CondRegs[0], Src1Regs[1], Src2Regs[1]); 2288 2289 MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank); 2290 MI.eraseFromParent(); 2291 return; 2292 } 2293 case AMDGPU::G_BRCOND: { 2294 Register CondReg = MI.getOperand(0).getReg(); 2295 // FIXME: Should use legalizer helper, but should change bool ext type. 2296 const RegisterBank *CondBank = 2297 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; 2298 2299 if (CondBank == &AMDGPU::SGPRRegBank) { 2300 MachineIRBuilder B(MI); 2301 const LLT S32 = LLT::scalar(32); 2302 Register NewCondReg = MRI.createGenericVirtualRegister(S32); 2303 MRI.setRegBank(NewCondReg, AMDGPU::SGPRRegBank); 2304 2305 MI.getOperand(0).setReg(NewCondReg); 2306 B.buildZExt(NewCondReg, CondReg); 2307 return; 2308 } 2309 2310 break; 2311 } 2312 case AMDGPU::G_AND: 2313 case AMDGPU::G_OR: 2314 case AMDGPU::G_XOR: { 2315 // 64-bit and is only available on the SALU, so split into 2 32-bit ops if 2316 // there is a VGPR input. 
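    // Illustrative expansion for a 64-bit VALU G_AND (G_OR and G_XOR split
    // the same way; the final merge is implied by the operand mapping):
    //   %xlo:vgpr(s32), %xhi:vgpr(s32) = G_UNMERGE_VALUES %x
    //   %ylo:vgpr(s32), %yhi:vgpr(s32) = G_UNMERGE_VALUES %y
    //   %dlo:vgpr(s32) = G_AND %xlo, %ylo
    //   %dhi:vgpr(s32) = G_AND %xhi, %yhi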
2317 Register DstReg = MI.getOperand(0).getReg(); 2318 LLT DstTy = MRI.getType(DstReg); 2319 2320 if (DstTy.getSizeInBits() == 1) { 2321 const RegisterBank *DstBank = 2322 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; 2323 if (DstBank == &AMDGPU::VCCRegBank) 2324 break; 2325 2326 MachineFunction *MF = MI.getParent()->getParent(); 2327 ApplyRegBankMapping ApplyBank(*this, MRI, DstBank); 2328 MachineIRBuilder B(MI, ApplyBank); 2329 LegalizerHelper Helper(*MF, ApplyBank, B); 2330 2331 if (Helper.widenScalar(MI, 0, LLT::scalar(32)) != 2332 LegalizerHelper::Legalized) 2333 llvm_unreachable("widen scalar should have succeeded"); 2334 return; 2335 } 2336 2337 if (DstTy.getSizeInBits() != 64) 2338 break; 2339 2340 LLT HalfTy = getHalfSizedType(DstTy); 2341 SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0)); 2342 SmallVector<Register, 2> Src0Regs(OpdMapper.getVRegs(1)); 2343 SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2)); 2344 2345 // All inputs are SGPRs, nothing special to do. 2346 if (DefRegs.empty()) { 2347 assert(Src0Regs.empty() && Src1Regs.empty()); 2348 break; 2349 } 2350 2351 assert(DefRegs.size() == 2); 2352 assert(Src0Regs.size() == Src1Regs.size() && 2353 (Src0Regs.empty() || Src0Regs.size() == 2)); 2354 2355 // Depending on where the source registers came from, the generic code may 2356 // have decided to split the inputs already or not. If not, we still need to 2357 // extract the values. 2358 MachineIRBuilder B(MI); 2359 2360 if (Src0Regs.empty()) 2361 split64BitValueForMapping(B, Src0Regs, HalfTy, MI.getOperand(1).getReg()); 2362 else 2363 setRegsToType(MRI, Src0Regs, HalfTy); 2364 2365 if (Src1Regs.empty()) 2366 split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg()); 2367 else 2368 setRegsToType(MRI, Src1Regs, HalfTy); 2369 2370 setRegsToType(MRI, DefRegs, HalfTy); 2371 2372 B.buildInstr(Opc, {DefRegs[0]}, {Src0Regs[0], Src1Regs[0]}); 2373 B.buildInstr(Opc, {DefRegs[1]}, {Src0Regs[1], Src1Regs[1]}); 2374 2375 MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank); 2376 MI.eraseFromParent(); 2377 return; 2378 } 2379 case AMDGPU::G_ABS: { 2380 Register SrcReg = MI.getOperand(1).getReg(); 2381 const RegisterBank *SrcBank = MRI.getRegBankOrNull(SrcReg); 2382 2383 // There is no VALU abs instruction so we need to replace it with a sub and 2384 // max combination. 2385 if (SrcBank && SrcBank == &AMDGPU::VGPRRegBank) { 2386 MachineFunction *MF = MI.getParent()->getParent(); 2387 ApplyRegBankMapping Apply(*this, MRI, &AMDGPU::VGPRRegBank); 2388 MachineIRBuilder B(MI, Apply); 2389 LegalizerHelper Helper(*MF, Apply, B); 2390 2391 if (Helper.lowerAbsToMaxNeg(MI) != LegalizerHelper::Legalized) 2392 llvm_unreachable("lowerAbsToMaxNeg should have succeeded"); 2393 return; 2394 } 2395 [[fallthrough]]; 2396 } 2397 case AMDGPU::G_ADD: 2398 case AMDGPU::G_SUB: 2399 case AMDGPU::G_MUL: 2400 case AMDGPU::G_SHL: 2401 case AMDGPU::G_LSHR: 2402 case AMDGPU::G_ASHR: 2403 case AMDGPU::G_SMIN: 2404 case AMDGPU::G_SMAX: 2405 case AMDGPU::G_UMIN: 2406 case AMDGPU::G_UMAX: { 2407 Register DstReg = MI.getOperand(0).getReg(); 2408 LLT DstTy = MRI.getType(DstReg); 2409 2410 // 16-bit operations are VALU only, but can be promoted to 32-bit SALU. 2411 // Packed 16-bit operations need to be scalarized and promoted. 
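    // e.g. a uniform <2 x s16> G_ADD becomes roughly (illustrative):
    //   %a0:sgpr(s32), %a1:sgpr(s32) = unpacked halves of %a (shifts/masks)
    //   %b0:sgpr(s32), %b1:sgpr(s32) = unpacked halves of %b
    //   %lo:sgpr(s32) = G_ADD %a0, %b0
    //   %hi:sgpr(s32) = G_ADD %a1, %b1
    //   %dst:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC %lo, %hi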
2412 if (DstTy != LLT::scalar(16) && DstTy != LLT::fixed_vector(2, 16)) 2413 break; 2414 2415 const RegisterBank *DstBank = 2416 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; 2417 if (DstBank == &AMDGPU::VGPRRegBank) 2418 break; 2419 2420 const LLT S32 = LLT::scalar(32); 2421 MachineBasicBlock *MBB = MI.getParent(); 2422 MachineFunction *MF = MBB->getParent(); 2423 ApplyRegBankMapping ApplySALU(*this, MRI, &AMDGPU::SGPRRegBank); 2424 MachineIRBuilder B(MI, ApplySALU); 2425 2426 if (DstTy.isVector()) { 2427 Register WideSrc0Lo, WideSrc0Hi; 2428 Register WideSrc1Lo, WideSrc1Hi; 2429 2430 unsigned ExtendOp = getExtendOp(MI.getOpcode()); 2431 std::tie(WideSrc0Lo, WideSrc0Hi) 2432 = unpackV2S16ToS32(B, MI.getOperand(1).getReg(), ExtendOp); 2433 std::tie(WideSrc1Lo, WideSrc1Hi) 2434 = unpackV2S16ToS32(B, MI.getOperand(2).getReg(), ExtendOp); 2435 auto Lo = B.buildInstr(MI.getOpcode(), {S32}, {WideSrc0Lo, WideSrc1Lo}); 2436 auto Hi = B.buildInstr(MI.getOpcode(), {S32}, {WideSrc0Hi, WideSrc1Hi}); 2437 B.buildBuildVectorTrunc(DstReg, {Lo.getReg(0), Hi.getReg(0)}); 2438 MI.eraseFromParent(); 2439 } else { 2440 LegalizerHelper Helper(*MF, ApplySALU, B); 2441 2442 if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized) 2443 llvm_unreachable("widen scalar should have succeeded"); 2444 2445 // FIXME: s16 shift amounts should be legal. 2446 if (Opc == AMDGPU::G_SHL || Opc == AMDGPU::G_LSHR || 2447 Opc == AMDGPU::G_ASHR) { 2448 B.setInsertPt(*MBB, MI.getIterator()); 2449 if (Helper.widenScalar(MI, 1, S32) != LegalizerHelper::Legalized) 2450 llvm_unreachable("widen scalar should have succeeded"); 2451 } 2452 } 2453 2454 return; 2455 } 2456 case AMDGPU::G_SEXT_INREG: { 2457 SmallVector<Register, 2> SrcRegs(OpdMapper.getVRegs(1)); 2458 if (SrcRegs.empty()) 2459 break; // Nothing to repair 2460 2461 const LLT S32 = LLT::scalar(32); 2462 MachineIRBuilder B(MI); 2463 ApplyRegBankMapping O(*this, MRI, &AMDGPU::VGPRRegBank); 2464 GISelObserverWrapper Observer(&O); 2465 B.setChangeObserver(Observer); 2466 2467 // Don't use LegalizerHelper's narrowScalar. It produces unwanted G_SEXTs 2468 // we would need to further expand, and doesn't let us directly set the 2469 // result registers. 2470 SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0)); 2471 2472 int Amt = MI.getOperand(2).getImm(); 2473 if (Amt <= 32) { 2474 // Downstream users have expectations for the high bit behavior, so freeze 2475 // incoming undefined bits. 2476 if (Amt == 32) { 2477 // The low bits are unchanged. 2478 B.buildFreeze(DstRegs[0], SrcRegs[0]); 2479 } else { 2480 auto Freeze = B.buildFreeze(S32, SrcRegs[0]); 2481 // Extend in the low bits and propagate the sign bit to the high half. 2482 B.buildSExtInReg(DstRegs[0], Freeze, Amt); 2483 } 2484 2485 B.buildAShr(DstRegs[1], DstRegs[0], B.buildConstant(S32, 31)); 2486 } else { 2487 // The low bits are unchanged, and extend in the high bits. 
2488 // No freeze required 2489 B.buildCopy(DstRegs[0], SrcRegs[0]); 2490 B.buildSExtInReg(DstRegs[1], DstRegs[0], Amt - 32); 2491 } 2492 2493 Register DstReg = MI.getOperand(0).getReg(); 2494 MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank); 2495 MI.eraseFromParent(); 2496 return; 2497 } 2498 case AMDGPU::G_CTPOP: 2499 case AMDGPU::G_BITREVERSE: { 2500 const RegisterBank *DstBank = 2501 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; 2502 if (DstBank == &AMDGPU::SGPRRegBank) 2503 break; 2504 2505 Register SrcReg = MI.getOperand(1).getReg(); 2506 const LLT S32 = LLT::scalar(32); 2507 LLT Ty = MRI.getType(SrcReg); 2508 if (Ty == S32) 2509 break; 2510 2511 ApplyRegBankMapping ApplyVALU(*this, MRI, &AMDGPU::VGPRRegBank); 2512 MachineIRBuilder B(MI, ApplyVALU); 2513 2514 MachineFunction &MF = B.getMF(); 2515 LegalizerHelper Helper(MF, ApplyVALU, B); 2516 2517 if (Helper.narrowScalar(MI, 1, S32) != LegalizerHelper::Legalized) 2518 llvm_unreachable("narrowScalar should have succeeded"); 2519 return; 2520 } 2521 case AMDGPU::G_AMDGPU_FFBH_U32: 2522 case AMDGPU::G_AMDGPU_FFBL_B32: 2523 case AMDGPU::G_CTLZ_ZERO_UNDEF: 2524 case AMDGPU::G_CTTZ_ZERO_UNDEF: { 2525 const RegisterBank *DstBank = 2526 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; 2527 if (DstBank == &AMDGPU::SGPRRegBank) 2528 break; 2529 2530 Register SrcReg = MI.getOperand(1).getReg(); 2531 const LLT S32 = LLT::scalar(32); 2532 LLT Ty = MRI.getType(SrcReg); 2533 if (Ty == S32) 2534 break; 2535 2536 // We can narrow this more efficiently than Helper can by using ffbh/ffbl 2537 // which return -1 when the input is zero: 2538 // (ctlz_zero_undef hi:lo) -> (umin (ffbh hi), (add (ffbh lo), 32)) 2539 // (cttz_zero_undef hi:lo) -> (umin (add (ffbl hi), 32), (ffbl lo)) 2540 // (ffbh hi:lo) -> (umin (ffbh hi), (uaddsat (ffbh lo), 32)) 2541 // (ffbl hi:lo) -> (umin (uaddsat (ffbh hi), 32), (ffbh lo)) 2542 ApplyRegBankMapping ApplyVALU(*this, MRI, &AMDGPU::VGPRRegBank); 2543 MachineIRBuilder B(MI, ApplyVALU); 2544 SmallVector<Register, 2> SrcRegs(OpdMapper.getVRegs(1)); 2545 unsigned NewOpc = Opc == AMDGPU::G_CTLZ_ZERO_UNDEF 2546 ? (unsigned)AMDGPU::G_AMDGPU_FFBH_U32 2547 : Opc == AMDGPU::G_CTTZ_ZERO_UNDEF 2548 ? (unsigned)AMDGPU::G_AMDGPU_FFBL_B32 2549 : Opc; 2550 unsigned Idx = NewOpc == AMDGPU::G_AMDGPU_FFBH_U32; 2551 auto X = B.buildInstr(NewOpc, {S32}, {SrcRegs[Idx]}); 2552 auto Y = B.buildInstr(NewOpc, {S32}, {SrcRegs[Idx ^ 1]}); 2553 unsigned AddOpc = 2554 Opc == AMDGPU::G_CTLZ_ZERO_UNDEF || Opc == AMDGPU::G_CTTZ_ZERO_UNDEF 2555 ? AMDGPU::G_ADD 2556 : AMDGPU::G_UADDSAT; 2557 Y = B.buildInstr(AddOpc, {S32}, {Y, B.buildConstant(S32, 32)}); 2558 Register DstReg = MI.getOperand(0).getReg(); 2559 B.buildUMin(DstReg, X, Y); 2560 MI.eraseFromParent(); 2561 return; 2562 } 2563 case AMDGPU::G_SEXT: 2564 case AMDGPU::G_ZEXT: 2565 case AMDGPU::G_ANYEXT: { 2566 Register SrcReg = MI.getOperand(1).getReg(); 2567 LLT SrcTy = MRI.getType(SrcReg); 2568 const bool Signed = Opc == AMDGPU::G_SEXT; 2569 2570 assert(OpdMapper.getVRegs(1).empty()); 2571 2572 MachineIRBuilder B(MI); 2573 const RegisterBank *SrcBank = 2574 OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank; 2575 2576 Register DstReg = MI.getOperand(0).getReg(); 2577 LLT DstTy = MRI.getType(DstReg); 2578 if (DstTy.isScalar() && 2579 SrcBank != &AMDGPU::SGPRRegBank && 2580 SrcBank != &AMDGPU::VCCRegBank && 2581 // FIXME: Should handle any type that round to s64 when irregular 2582 // breakdowns supported. 
2583 DstTy.getSizeInBits() == 64 && 2584 SrcTy.getSizeInBits() <= 32) { 2585 SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0)); 2586 2587 // Extend to 32-bit, and then extend the low half. 2588 if (Signed) { 2589 // TODO: Should really be buildSExtOrCopy 2590 B.buildSExtOrTrunc(DefRegs[0], SrcReg); 2591 } else if (Opc == AMDGPU::G_ZEXT) { 2592 B.buildZExtOrTrunc(DefRegs[0], SrcReg); 2593 } else { 2594 B.buildAnyExtOrTrunc(DefRegs[0], SrcReg); 2595 } 2596 2597 extendLow32IntoHigh32(B, DefRegs[1], DefRegs[0], Opc, *SrcBank); 2598 MRI.setRegBank(DstReg, *SrcBank); 2599 MI.eraseFromParent(); 2600 return; 2601 } 2602 2603 if (SrcTy != LLT::scalar(1)) 2604 return; 2605 2606 // It is not legal to have a legalization artifact with a VCC source. Rather 2607 // than introducing a copy, insert the select we would have to select the 2608 // copy to. 2609 if (SrcBank == &AMDGPU::VCCRegBank) { 2610 SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0)); 2611 2612 const RegisterBank *DstBank = &AMDGPU::VGPRRegBank; 2613 2614 unsigned DstSize = DstTy.getSizeInBits(); 2615 // 64-bit select is SGPR only 2616 const bool UseSel64 = DstSize > 32 && 2617 SrcBank->getID() == AMDGPU::SGPRRegBankID; 2618 2619 // TODO: Should s16 select be legal? 2620 LLT SelType = UseSel64 ? LLT::scalar(64) : LLT::scalar(32); 2621 auto True = B.buildConstant(SelType, Signed ? -1 : 1); 2622 auto False = B.buildConstant(SelType, 0); 2623 2624 MRI.setRegBank(True.getReg(0), *DstBank); 2625 MRI.setRegBank(False.getReg(0), *DstBank); 2626 MRI.setRegBank(DstReg, *DstBank); 2627 2628 if (DstSize > 32) { 2629 B.buildSelect(DefRegs[0], SrcReg, True, False); 2630 extendLow32IntoHigh32(B, DefRegs[1], DefRegs[0], Opc, *SrcBank, true); 2631 } else if (DstSize < 32) { 2632 auto Sel = B.buildSelect(SelType, SrcReg, True, False); 2633 MRI.setRegBank(Sel.getReg(0), *DstBank); 2634 B.buildTrunc(DstReg, Sel); 2635 } else { 2636 B.buildSelect(DstReg, SrcReg, True, False); 2637 } 2638 2639 MI.eraseFromParent(); 2640 return; 2641 } 2642 2643 break; 2644 } 2645 case AMDGPU::G_EXTRACT_VECTOR_ELT: { 2646 SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0)); 2647 2648 assert(OpdMapper.getVRegs(1).empty() && OpdMapper.getVRegs(2).empty()); 2649 2650 Register DstReg = MI.getOperand(0).getReg(); 2651 Register SrcReg = MI.getOperand(1).getReg(); 2652 2653 const LLT S32 = LLT::scalar(32); 2654 LLT DstTy = MRI.getType(DstReg); 2655 LLT SrcTy = MRI.getType(SrcReg); 2656 2657 if (foldExtractEltToCmpSelect(MI, MRI, OpdMapper)) 2658 return; 2659 2660 MachineIRBuilder B(MI); 2661 2662 const ValueMapping &DstMapping 2663 = OpdMapper.getInstrMapping().getOperandMapping(0); 2664 const RegisterBank *DstBank = DstMapping.BreakDown[0].RegBank; 2665 const RegisterBank *SrcBank = 2666 OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank; 2667 const RegisterBank *IdxBank = 2668 OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank; 2669 2670 Register BaseIdxReg; 2671 unsigned ConstOffset; 2672 std::tie(BaseIdxReg, ConstOffset) = 2673 AMDGPU::getBaseWithConstantOffset(MRI, MI.getOperand(2).getReg()); 2674 2675 // See if the index is an add of a constant which will be foldable by moving 2676 // the base register of the index later if this is going to be executed in a 2677 // waterfall loop. This is essentially to reassociate the add of a constant 2678 // with the readfirstlane. 
2679 bool ShouldMoveIndexIntoLoop = IdxBank != &AMDGPU::SGPRRegBank && 2680 ConstOffset > 0 && 2681 ConstOffset < SrcTy.getNumElements(); 2682 2683 // Move the base register. We'll re-insert the add later. 2684 if (ShouldMoveIndexIntoLoop) 2685 MI.getOperand(2).setReg(BaseIdxReg); 2686 2687 // If this is a VGPR result only because the index was a VGPR result, the 2688 // actual indexing will be done on the SGPR source vector, which will 2689 // produce a scalar result. We need to copy to the VGPR result inside the 2690 // waterfall loop. 2691 const bool NeedCopyToVGPR = DstBank == &AMDGPU::VGPRRegBank && 2692 SrcBank == &AMDGPU::SGPRRegBank; 2693 if (DstRegs.empty()) { 2694 applyDefaultMapping(OpdMapper); 2695 2696 executeInWaterfallLoop(MI, MRI, { 2 }); 2697 2698 if (NeedCopyToVGPR) { 2699 // We don't want a phi for this temporary reg. 2700 Register TmpReg = MRI.createGenericVirtualRegister(DstTy); 2701 MRI.setRegBank(TmpReg, AMDGPU::SGPRRegBank); 2702 MI.getOperand(0).setReg(TmpReg); 2703 B.setInsertPt(*MI.getParent(), ++MI.getIterator()); 2704 2705 // Use a v_mov_b32 here to make the exec dependency explicit. 2706 buildVCopy(B, DstReg, TmpReg); 2707 } 2708 2709 // Re-insert the constant offset add inside the waterfall loop. 2710 if (ShouldMoveIndexIntoLoop) 2711 reinsertVectorIndexAdd(B, MI, 2, ConstOffset); 2712 2713 return; 2714 } 2715 2716 assert(DstTy.getSizeInBits() == 64); 2717 2718 LLT Vec32 = LLT::fixed_vector(2 * SrcTy.getNumElements(), 32); 2719 2720 auto CastSrc = B.buildBitcast(Vec32, SrcReg); 2721 auto One = B.buildConstant(S32, 1); 2722 2723 MachineBasicBlock::iterator MII = MI.getIterator(); 2724 2725 // Split the vector index into 32-bit pieces. Prepare to move all of the 2726 // new instructions into a waterfall loop if necessary. 2727 // 2728 // Don't put the bitcast or constant in the loop. 2729 MachineInstrSpan Span(MII, &B.getMBB()); 2730 2731 // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1). 2732 auto IdxLo = B.buildShl(S32, BaseIdxReg, One); 2733 auto IdxHi = B.buildAdd(S32, IdxLo, One); 2734 2735 auto Extract0 = B.buildExtractVectorElement(DstRegs[0], CastSrc, IdxLo); 2736 auto Extract1 = B.buildExtractVectorElement(DstRegs[1], CastSrc, IdxHi); 2737 2738 MRI.setRegBank(DstReg, *DstBank); 2739 MRI.setRegBank(CastSrc.getReg(0), *SrcBank); 2740 MRI.setRegBank(One.getReg(0), AMDGPU::SGPRRegBank); 2741 MRI.setRegBank(IdxLo.getReg(0), AMDGPU::SGPRRegBank); 2742 MRI.setRegBank(IdxHi.getReg(0), AMDGPU::SGPRRegBank); 2743 2744 SmallSet<Register, 4> OpsToWaterfall; 2745 if (!collectWaterfallOperands(OpsToWaterfall, MI, MRI, { 2 })) { 2746 MI.eraseFromParent(); 2747 return; 2748 } 2749 2750 // Remove the original instruction to avoid potentially confusing the 2751 // waterfall loop logic. 
2752 B.setInstr(*Span.begin()); 2753 MI.eraseFromParent(); 2754 executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()), 2755 OpsToWaterfall, MRI); 2756 2757 if (NeedCopyToVGPR) { 2758 MachineBasicBlock *LoopBB = Extract1->getParent(); 2759 Register TmpReg0 = MRI.createGenericVirtualRegister(S32); 2760 Register TmpReg1 = MRI.createGenericVirtualRegister(S32); 2761 MRI.setRegBank(TmpReg0, AMDGPU::SGPRRegBank); 2762 MRI.setRegBank(TmpReg1, AMDGPU::SGPRRegBank); 2763 2764 Extract0->getOperand(0).setReg(TmpReg0); 2765 Extract1->getOperand(0).setReg(TmpReg1); 2766 2767 B.setInsertPt(*LoopBB, ++Extract1->getIterator()); 2768 2769 buildVCopy(B, DstRegs[0], TmpReg0); 2770 buildVCopy(B, DstRegs[1], TmpReg1); 2771 } 2772 2773 if (ShouldMoveIndexIntoLoop) 2774 reinsertVectorIndexAdd(B, *IdxLo, 1, ConstOffset); 2775 2776 return; 2777 } 2778 case AMDGPU::G_INSERT_VECTOR_ELT: { 2779 SmallVector<Register, 2> InsRegs(OpdMapper.getVRegs(2)); 2780 2781 Register DstReg = MI.getOperand(0).getReg(); 2782 LLT VecTy = MRI.getType(DstReg); 2783 2784 assert(OpdMapper.getVRegs(0).empty()); 2785 assert(OpdMapper.getVRegs(3).empty()); 2786 2787 if (substituteSimpleCopyRegs(OpdMapper, 1)) 2788 MRI.setType(MI.getOperand(1).getReg(), VecTy); 2789 2790 if (foldInsertEltToCmpSelect(MI, MRI, OpdMapper)) 2791 return; 2792 2793 const RegisterBank *IdxBank = 2794 OpdMapper.getInstrMapping().getOperandMapping(3).BreakDown[0].RegBank; 2795 2796 Register SrcReg = MI.getOperand(1).getReg(); 2797 Register InsReg = MI.getOperand(2).getReg(); 2798 LLT InsTy = MRI.getType(InsReg); 2799 (void)InsTy; 2800 2801 Register BaseIdxReg; 2802 unsigned ConstOffset; 2803 std::tie(BaseIdxReg, ConstOffset) = 2804 AMDGPU::getBaseWithConstantOffset(MRI, MI.getOperand(3).getReg()); 2805 2806 // See if the index is an add of a constant which will be foldable by moving 2807 // the base register of the index later if this is going to be executed in a 2808 // waterfall loop. This is essentially to reassociate the add of a constant 2809 // with the readfirstlane. 2810 bool ShouldMoveIndexIntoLoop = IdxBank != &AMDGPU::SGPRRegBank && 2811 ConstOffset > 0 && 2812 ConstOffset < VecTy.getNumElements(); 2813 2814 // Move the base register. We'll re-insert the add later. 2815 if (ShouldMoveIndexIntoLoop) 2816 MI.getOperand(3).setReg(BaseIdxReg); 2817 2818 2819 if (InsRegs.empty()) { 2820 executeInWaterfallLoop(MI, MRI, { 3 }); 2821 2822 // Re-insert the constant offset add inside the waterfall loop. 2823 if (ShouldMoveIndexIntoLoop) { 2824 MachineIRBuilder B(MI); 2825 reinsertVectorIndexAdd(B, MI, 3, ConstOffset); 2826 } 2827 2828 return; 2829 } 2830 2831 2832 assert(InsTy.getSizeInBits() == 64); 2833 2834 const LLT S32 = LLT::scalar(32); 2835 LLT Vec32 = LLT::fixed_vector(2 * VecTy.getNumElements(), 32); 2836 2837 MachineIRBuilder B(MI); 2838 auto CastSrc = B.buildBitcast(Vec32, SrcReg); 2839 auto One = B.buildConstant(S32, 1); 2840 2841 // Split the vector index into 32-bit pieces. Prepare to move all of the 2842 // new instructions into a waterfall loop if necessary. 2843 // 2844 // Don't put the bitcast or constant in the loop. 2845 MachineInstrSpan Span(MachineBasicBlock::iterator(&MI), &B.getMBB()); 2846 2847 // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1). 
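    // e.g. inserting into element 1 of a <2 x s64> vector, viewed as
    // <4 x s32>, writes 32-bit elements 2 and 3.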
2848 auto IdxLo = B.buildShl(S32, BaseIdxReg, One); 2849 auto IdxHi = B.buildAdd(S32, IdxLo, One); 2850 2851 auto InsLo = B.buildInsertVectorElement(Vec32, CastSrc, InsRegs[0], IdxLo); 2852 auto InsHi = B.buildInsertVectorElement(Vec32, InsLo, InsRegs[1], IdxHi); 2853 2854 const RegisterBank *DstBank = 2855 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; 2856 const RegisterBank *SrcBank = 2857 OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank; 2858 const RegisterBank *InsSrcBank = 2859 OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank; 2860 2861 MRI.setRegBank(InsReg, *InsSrcBank); 2862 MRI.setRegBank(CastSrc.getReg(0), *SrcBank); 2863 MRI.setRegBank(InsLo.getReg(0), *DstBank); 2864 MRI.setRegBank(InsHi.getReg(0), *DstBank); 2865 MRI.setRegBank(One.getReg(0), AMDGPU::SGPRRegBank); 2866 MRI.setRegBank(IdxLo.getReg(0), AMDGPU::SGPRRegBank); 2867 MRI.setRegBank(IdxHi.getReg(0), AMDGPU::SGPRRegBank); 2868 2869 2870 SmallSet<Register, 4> OpsToWaterfall; 2871 if (!collectWaterfallOperands(OpsToWaterfall, MI, MRI, { 3 })) { 2872 B.setInsertPt(B.getMBB(), MI); 2873 B.buildBitcast(DstReg, InsHi); 2874 MI.eraseFromParent(); 2875 return; 2876 } 2877 2878 B.setInstr(*Span.begin()); 2879 MI.eraseFromParent(); 2880 2881 // Figure out the point after the waterfall loop before mangling the control 2882 // flow. 2883 executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()), 2884 OpsToWaterfall, MRI); 2885 2886 // The insertion point is now right after the original instruction. 2887 // 2888 // Keep the bitcast to the original vector type out of the loop. Doing this 2889 // saved an extra phi we don't need inside the loop. 2890 B.buildBitcast(DstReg, InsHi); 2891 2892 // Re-insert the constant offset add inside the waterfall loop. 
2893 if (ShouldMoveIndexIntoLoop) 2894 reinsertVectorIndexAdd(B, *IdxLo, 1, ConstOffset); 2895 2896 return; 2897 } 2898 case AMDGPU::G_AMDGPU_BUFFER_LOAD: 2899 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT: 2900 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT: 2901 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE: 2902 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE: 2903 case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT: 2904 case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE: 2905 case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16: 2906 case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT: 2907 case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16: 2908 case AMDGPU::G_AMDGPU_BUFFER_STORE: 2909 case AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE: 2910 case AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT: 2911 case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT: 2912 case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16: 2913 case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT: 2914 case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16: { 2915 applyDefaultMapping(OpdMapper); 2916 executeInWaterfallLoop(MI, MRI, {1, 4}); 2917 return; 2918 } 2919 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP: 2920 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD: 2921 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB: 2922 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN: 2923 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN: 2924 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX: 2925 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX: 2926 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND: 2927 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR: 2928 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR: 2929 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC: 2930 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC: { 2931 applyDefaultMapping(OpdMapper); 2932 executeInWaterfallLoop(MI, MRI, {2, 5}); 2933 return; 2934 } 2935 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD: 2936 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN: 2937 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX: { 2938 applyDefaultMapping(OpdMapper); 2939 executeInWaterfallLoop(MI, MRI, {2, 5}); 2940 return; 2941 } 2942 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP: { 2943 applyDefaultMapping(OpdMapper); 2944 executeInWaterfallLoop(MI, MRI, {3, 6}); 2945 return; 2946 } 2947 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD: { 2948 applyMappingSBufferLoad(OpdMapper); 2949 return; 2950 } 2951 case AMDGPU::G_INTRINSIC: { 2952 switch (MI.getIntrinsicID()) { 2953 case Intrinsic::amdgcn_readlane: { 2954 substituteSimpleCopyRegs(OpdMapper, 2); 2955 2956 assert(OpdMapper.getVRegs(0).empty()); 2957 assert(OpdMapper.getVRegs(3).empty()); 2958 2959 // Make sure the index is an SGPR. It doesn't make sense to run this in a 2960 // waterfall loop, so assume it's a uniform value. 2961 constrainOpWithReadfirstlane(MI, MRI, 3); // Index 2962 return; 2963 } 2964 case Intrinsic::amdgcn_writelane: { 2965 assert(OpdMapper.getVRegs(0).empty()); 2966 assert(OpdMapper.getVRegs(2).empty()); 2967 assert(OpdMapper.getVRegs(3).empty()); 2968 2969 substituteSimpleCopyRegs(OpdMapper, 4); // VGPR input val 2970 constrainOpWithReadfirstlane(MI, MRI, 2); // Source value 2971 constrainOpWithReadfirstlane(MI, MRI, 3); // Index 2972 return; 2973 } 2974 case Intrinsic::amdgcn_interp_p1: 2975 case Intrinsic::amdgcn_interp_p2: 2976 case Intrinsic::amdgcn_interp_mov: 2977 case Intrinsic::amdgcn_interp_p1_f16: 2978 case Intrinsic::amdgcn_interp_p2_f16: 2979 case Intrinsic::amdgcn_lds_param_load: { 2980 applyDefaultMapping(OpdMapper); 2981 2982 // Readlane for m0 value, which is always the last operand. 2983 // FIXME: Should this be a waterfall loop instead? 
      constrainOpWithReadfirstlane(MI, MRI, MI.getNumOperands() - 1); // Index
      return;
    }
    case Intrinsic::amdgcn_interp_inreg_p10:
    case Intrinsic::amdgcn_interp_inreg_p2:
    case Intrinsic::amdgcn_interp_inreg_p10_f16:
    case Intrinsic::amdgcn_interp_inreg_p2_f16:
      applyDefaultMapping(OpdMapper);
      return;
    case Intrinsic::amdgcn_permlane16:
    case Intrinsic::amdgcn_permlanex16: {
      // Doing a waterfall loop over these wouldn't make any sense.
      substituteSimpleCopyRegs(OpdMapper, 2);
      substituteSimpleCopyRegs(OpdMapper, 3);
      constrainOpWithReadfirstlane(MI, MRI, 4);
      constrainOpWithReadfirstlane(MI, MRI, 5);
      return;
    }
    case Intrinsic::amdgcn_sbfe:
      applyMappingBFE(OpdMapper, true);
      return;
    case Intrinsic::amdgcn_ubfe:
      applyMappingBFE(OpdMapper, false);
      return;
    case Intrinsic::amdgcn_inverse_ballot:
      applyDefaultMapping(OpdMapper);
      constrainOpWithReadfirstlane(MI, MRI, 2); // Mask
      return;
    case Intrinsic::amdgcn_ballot:
      // Use default handling and insert copy to vcc source.
      break;
    }
    break;
  }
  case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
  case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16:
  case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE:
  case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16: {
    const AMDGPU::RsrcIntrinsic *RSrcIntrin
      = AMDGPU::lookupRsrcIntrinsic(MI.getIntrinsicID());
    assert(RSrcIntrin && RSrcIntrin->IsImage);
    // Non-images can have complications from operands that allow both SGPR
    // and VGPR. For now it's too complicated to figure out the final opcode
    // to derive the register bank from the MCInstrDesc.
    applyMappingImage(MI, OpdMapper, MRI, RSrcIntrin->RsrcArg);
    return;
  }
  case AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY: {
    unsigned N = MI.getNumExplicitOperands() - 2;
    applyDefaultMapping(OpdMapper);
    executeInWaterfallLoop(MI, MRI, { N });
    return;
  }
  case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: {
    auto IntrID = MI.getIntrinsicID();
    switch (IntrID) {
    case Intrinsic::amdgcn_ds_ordered_add:
    case Intrinsic::amdgcn_ds_ordered_swap: {
      // This is only allowed to execute with 1 lane, so readfirstlane is safe.
      assert(OpdMapper.getVRegs(0).empty());
      substituteSimpleCopyRegs(OpdMapper, 3);
      constrainOpWithReadfirstlane(MI, MRI, 2); // M0
      return;
    }
    case Intrinsic::amdgcn_ds_gws_init:
    case Intrinsic::amdgcn_ds_gws_barrier:
    case Intrinsic::amdgcn_ds_gws_sema_br: {
      // Only the first lane executes, so readfirstlane is safe.
      substituteSimpleCopyRegs(OpdMapper, 1);
      constrainOpWithReadfirstlane(MI, MRI, 2); // M0
      return;
    }
    case Intrinsic::amdgcn_ds_gws_sema_v:
    case Intrinsic::amdgcn_ds_gws_sema_p:
    case Intrinsic::amdgcn_ds_gws_sema_release_all: {
      // Only the first lane executes, so readfirstlane is safe.
      constrainOpWithReadfirstlane(MI, MRI, 1); // M0
      return;
    }
    case Intrinsic::amdgcn_ds_append:
    case Intrinsic::amdgcn_ds_consume: {
      constrainOpWithReadfirstlane(MI, MRI, 2); // M0
      return;
    }
    case Intrinsic::amdgcn_s_sendmsg:
    case Intrinsic::amdgcn_s_sendmsghalt: {
      // FIXME: Should this use a waterfall loop?
      constrainOpWithReadfirstlane(MI, MRI, 2); // M0
      return;
    }
    case Intrinsic::amdgcn_s_setreg: {
      constrainOpWithReadfirstlane(MI, MRI, 2);
      return;
    }
    case Intrinsic::amdgcn_raw_buffer_load_lds:
    case Intrinsic::amdgcn_raw_ptr_buffer_load_lds: {
      applyDefaultMapping(OpdMapper);
      constrainOpWithReadfirstlane(MI, MRI, 1); // rsrc
      constrainOpWithReadfirstlane(MI, MRI, 2); // M0
      constrainOpWithReadfirstlane(MI, MRI, 5); // soffset
      return;
    }
    case Intrinsic::amdgcn_struct_buffer_load_lds:
    case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
      applyDefaultMapping(OpdMapper);
      constrainOpWithReadfirstlane(MI, MRI, 1); // rsrc
      constrainOpWithReadfirstlane(MI, MRI, 2); // M0
      constrainOpWithReadfirstlane(MI, MRI, 6); // soffset
      return;
    }
    case Intrinsic::amdgcn_global_load_lds: {
      applyDefaultMapping(OpdMapper);
      constrainOpWithReadfirstlane(MI, MRI, 2);
      return;
    }
    case Intrinsic::amdgcn_lds_direct_load: {
      applyDefaultMapping(OpdMapper);
      // Readlane for m0 value, which is always the last operand.
      constrainOpWithReadfirstlane(MI, MRI, MI.getNumOperands() - 1); // Index
      return;
    }
    case Intrinsic::amdgcn_exp_row:
      applyDefaultMapping(OpdMapper);
      constrainOpWithReadfirstlane(MI, MRI, 8); // M0
      return;
    default: {
      if (const AMDGPU::RsrcIntrinsic *RSrcIntrin =
              AMDGPU::lookupRsrcIntrinsic(IntrID)) {
        // Non-images can have complications from operands that allow both SGPR
        // and VGPR. For now it's too complicated to figure out the final opcode
        // to derive the register bank from the MCInstrDesc.
        if (RSrcIntrin->IsImage) {
          applyMappingImage(MI, OpdMapper, MRI, RSrcIntrin->RsrcArg);
          return;
        }
      }

      break;
    }
    }
    break;
  }
  case AMDGPU::G_SI_CALL: {
    // Use a set to avoid extra readfirstlanes in the case where multiple
    // operands are the same register.
    SmallSet<Register, 4> SGPROperandRegs;

    if (!collectWaterfallOperands(SGPROperandRegs, MI, MRI, {1}))
      break;

    // Move all copies to physical SGPRs that are used by the call instruction
    // into the loop block. Search backwards from the call for these copies,
    // stopping at the ADJCALLSTACKUP.
    unsigned FrameSetupOpcode = AMDGPU::ADJCALLSTACKUP;
    unsigned FrameDestroyOpcode = AMDGPU::ADJCALLSTACKDOWN;

    // Move all non-copies before the copies, so that a complete range can be
    // moved into the waterfall loop.
    SmallVector<MachineInstr *, 4> NonCopyInstrs;
    // Count of NonCopyInstrs found until the current LastCopy.
    unsigned NonCopyInstrsLen = 0;
    MachineBasicBlock::iterator Start(&MI);
    MachineBasicBlock::iterator LastCopy = Start;
    MachineBasicBlock *MBB = MI.getParent();
    const SIMachineFunctionInfo *Info =
        MBB->getParent()->getInfo<SIMachineFunctionInfo>();
    while (Start->getOpcode() != FrameSetupOpcode) {
      --Start;
      bool IsCopy = false;
      if (Start->getOpcode() == AMDGPU::COPY) {
        auto &Dst = Start->getOperand(0);
        if (Dst.isReg()) {
          Register Reg = Dst.getReg();
          if (Reg.isPhysical() && MI.readsRegister(Reg, TRI)) {
            IsCopy = true;
          } else {
            // Also move the copy from the scratch rsrc descriptor into the loop
            // to allow it to be optimized away.
3162 auto &Src = Start->getOperand(1); 3163 if (Src.isReg()) { 3164 Reg = Src.getReg(); 3165 IsCopy = Info->getScratchRSrcReg() == Reg; 3166 } 3167 } 3168 } 3169 } 3170 3171 if (IsCopy) { 3172 LastCopy = Start; 3173 NonCopyInstrsLen = NonCopyInstrs.size(); 3174 } else { 3175 NonCopyInstrs.push_back(&*Start); 3176 } 3177 } 3178 NonCopyInstrs.resize(NonCopyInstrsLen); 3179 3180 for (auto *NonCopy : reverse(NonCopyInstrs)) { 3181 MBB->splice(LastCopy, MBB, NonCopy->getIterator()); 3182 } 3183 Start = LastCopy; 3184 3185 // Do the same for copies after the loop 3186 NonCopyInstrs.clear(); 3187 NonCopyInstrsLen = 0; 3188 MachineBasicBlock::iterator End(&MI); 3189 LastCopy = End; 3190 while (End->getOpcode() != FrameDestroyOpcode) { 3191 ++End; 3192 bool IsCopy = false; 3193 if (End->getOpcode() == AMDGPU::COPY) { 3194 auto &Src = End->getOperand(1); 3195 if (Src.isReg()) { 3196 Register Reg = Src.getReg(); 3197 IsCopy = Reg.isPhysical() && MI.modifiesRegister(Reg, TRI); 3198 } 3199 } 3200 3201 if (IsCopy) { 3202 LastCopy = End; 3203 NonCopyInstrsLen = NonCopyInstrs.size(); 3204 } else { 3205 NonCopyInstrs.push_back(&*End); 3206 } 3207 } 3208 NonCopyInstrs.resize(NonCopyInstrsLen); 3209 3210 End = LastCopy; 3211 ++LastCopy; 3212 for (auto *NonCopy : reverse(NonCopyInstrs)) { 3213 MBB->splice(LastCopy, MBB, NonCopy->getIterator()); 3214 } 3215 3216 ++End; 3217 MachineIRBuilder B(*Start); 3218 executeInWaterfallLoop(B, make_range(Start, End), SGPROperandRegs, MRI); 3219 break; 3220 } 3221 case AMDGPU::G_LOAD: 3222 case AMDGPU::G_ZEXTLOAD: 3223 case AMDGPU::G_SEXTLOAD: { 3224 if (applyMappingLoad(MI, OpdMapper, MRI)) 3225 return; 3226 break; 3227 } 3228 case AMDGPU::G_DYN_STACKALLOC: 3229 applyMappingDynStackAlloc(MI, OpdMapper, MRI); 3230 return; 3231 case AMDGPU::G_SBFX: 3232 applyMappingBFE(OpdMapper, /*Signed*/ true); 3233 return; 3234 case AMDGPU::G_UBFX: 3235 applyMappingBFE(OpdMapper, /*Signed*/ false); 3236 return; 3237 case AMDGPU::G_AMDGPU_MAD_U64_U32: 3238 case AMDGPU::G_AMDGPU_MAD_I64_I32: 3239 applyMappingMAD_64_32(OpdMapper); 3240 return; 3241 default: 3242 break; 3243 } 3244 3245 return applyDefaultMapping(OpdMapper); 3246 } 3247 3248 // vgpr, sgpr -> vgpr 3249 // vgpr, agpr -> vgpr 3250 // agpr, agpr -> agpr 3251 // agpr, sgpr -> vgpr 3252 static unsigned regBankUnion(unsigned RB0, unsigned RB1) { 3253 if (RB0 == AMDGPU::InvalidRegBankID) 3254 return RB1; 3255 if (RB1 == AMDGPU::InvalidRegBankID) 3256 return RB0; 3257 3258 if (RB0 == AMDGPU::SGPRRegBankID && RB1 == AMDGPU::SGPRRegBankID) 3259 return AMDGPU::SGPRRegBankID; 3260 3261 if (RB0 == AMDGPU::AGPRRegBankID && RB1 == AMDGPU::AGPRRegBankID) 3262 return AMDGPU::AGPRRegBankID; 3263 3264 return AMDGPU::VGPRRegBankID; 3265 } 3266 3267 static unsigned regBankBoolUnion(unsigned RB0, unsigned RB1) { 3268 if (RB0 == AMDGPU::InvalidRegBankID) 3269 return RB1; 3270 if (RB1 == AMDGPU::InvalidRegBankID) 3271 return RB0; 3272 3273 // vcc, vcc -> vcc 3274 // vcc, sgpr -> vcc 3275 // vcc, vgpr -> vcc 3276 if (RB0 == AMDGPU::VCCRegBankID || RB1 == AMDGPU::VCCRegBankID) 3277 return AMDGPU::VCCRegBankID; 3278 3279 // vcc, vgpr -> vgpr 3280 return regBankUnion(RB0, RB1); 3281 } 3282 3283 unsigned AMDGPURegisterBankInfo::getMappingType(const MachineRegisterInfo &MRI, 3284 const MachineInstr &MI) const { 3285 unsigned RegBank = AMDGPU::InvalidRegBankID; 3286 3287 for (const MachineOperand &MO : MI.operands()) { 3288 if (!MO.isReg()) 3289 continue; 3290 Register Reg = MO.getReg(); 3291 if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) { 
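// Join this operand's bank into the running union. VGPR is the top of the lattice implemented by regBankUnion above, so once it is reached the result can no longer change and the scan can stop early.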
3292 RegBank = regBankUnion(RegBank, Bank->getID()); 3293 if (RegBank == AMDGPU::VGPRRegBankID) 3294 break; 3295 } 3296 } 3297 3298 return RegBank; 3299 } 3300 3301 bool AMDGPURegisterBankInfo::isSALUMapping(const MachineInstr &MI) const { 3302 const MachineFunction &MF = *MI.getParent()->getParent(); 3303 const MachineRegisterInfo &MRI = MF.getRegInfo(); 3304 for (const MachineOperand &MO : MI.operands()) { 3305 if (!MO.isReg()) 3306 continue; 3307 Register Reg = MO.getReg(); 3308 if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) { 3309 if (Bank->getID() != AMDGPU::SGPRRegBankID) 3310 return false; 3311 } 3312 } 3313 return true; 3314 } 3315 3316 const RegisterBankInfo::InstructionMapping & 3317 AMDGPURegisterBankInfo::getDefaultMappingSOP(const MachineInstr &MI) const { 3318 const MachineFunction &MF = *MI.getParent()->getParent(); 3319 const MachineRegisterInfo &MRI = MF.getRegInfo(); 3320 SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands()); 3321 3322 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { 3323 const MachineOperand &SrcOp = MI.getOperand(i); 3324 if (!SrcOp.isReg()) 3325 continue; 3326 3327 unsigned Size = getSizeInBits(SrcOp.getReg(), MRI, *TRI); 3328 OpdsMapping[i] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); 3329 } 3330 return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping), 3331 MI.getNumOperands()); 3332 } 3333 3334 const RegisterBankInfo::InstructionMapping & 3335 AMDGPURegisterBankInfo::getDefaultMappingVOP(const MachineInstr &MI) const { 3336 const MachineFunction &MF = *MI.getParent()->getParent(); 3337 const MachineRegisterInfo &MRI = MF.getRegInfo(); 3338 SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands()); 3339 3340 // Even though we technically could use SGPRs, this would require knowledge of 3341 // the constant bus restriction. Force all sources to VGPR (except for VCC). 3342 // 3343 // TODO: Unary ops are trivially OK, so accept SGPRs? 3344 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { 3345 const MachineOperand &Src = MI.getOperand(i); 3346 if (!Src.isReg()) 3347 continue; 3348 3349 unsigned Size = getSizeInBits(Src.getReg(), MRI, *TRI); 3350 unsigned BankID = Size == 1 ? AMDGPU::VCCRegBankID : AMDGPU::VGPRRegBankID; 3351 OpdsMapping[i] = AMDGPU::getValueMapping(BankID, Size); 3352 } 3353 3354 return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping), 3355 MI.getNumOperands()); 3356 } 3357 3358 const RegisterBankInfo::InstructionMapping & 3359 AMDGPURegisterBankInfo::getDefaultMappingAllVGPR(const MachineInstr &MI) const { 3360 const MachineFunction &MF = *MI.getParent()->getParent(); 3361 const MachineRegisterInfo &MRI = MF.getRegInfo(); 3362 SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands()); 3363 3364 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) { 3365 const MachineOperand &Op = MI.getOperand(I); 3366 if (!Op.isReg()) 3367 continue; 3368 3369 unsigned Size = getSizeInBits(Op.getReg(), MRI, *TRI); 3370 OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); 3371 } 3372 3373 return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping), 3374 MI.getNumOperands()); 3375 } 3376 3377 const RegisterBankInfo::InstructionMapping & 3378 AMDGPURegisterBankInfo::getImageMapping(const MachineRegisterInfo &MRI, 3379 const MachineInstr &MI, 3380 int RsrcIdx) const { 3381 // The reported argument index is relative to the IR intrinsic call arguments, 3382 // so we need to shift by the number of defs and the intrinsic ID. 
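// Worked example (assuming a single def): IR argument index 1 lands at machine operand index 1 + 1 (def) + 1 (intrinsic ID) = 3.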
3383 RsrcIdx += MI.getNumExplicitDefs() + 1; 3384 3385 const int NumOps = MI.getNumOperands(); 3386 SmallVector<const ValueMapping *, 8> OpdsMapping(NumOps); 3387 3388 // TODO: Should packed/unpacked D16 difference be reported here as part of 3389 // the value mapping? 3390 for (int I = 0; I != NumOps; ++I) { 3391 if (!MI.getOperand(I).isReg()) 3392 continue; 3393 3394 Register OpReg = MI.getOperand(I).getReg(); 3395 // We replace some dead address operands with $noreg. 3396 if (!OpReg) 3397 continue; 3398 3399 unsigned Size = getSizeInBits(OpReg, MRI, *TRI); 3400 3401 // FIXME: Probably need a new intrinsic register bank searchable table to 3402 // handle arbitrary intrinsics easily. 3403 // 3404 // If this has a sampler, it immediately follows rsrc. 3405 const bool MustBeSGPR = I == RsrcIdx || I == RsrcIdx + 1; 3406 3407 if (MustBeSGPR) { 3408 // This must be an SGPR, so we must report whatever bank it currently has as legal. 3409 unsigned NewBank = getRegBankID(OpReg, MRI, AMDGPU::SGPRRegBankID); 3410 OpdsMapping[I] = AMDGPU::getValueMapping(NewBank, Size); 3411 } else { 3412 // Some operands must be VGPR, and these are easy to copy to. 3413 OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); 3414 } 3415 } 3416 3417 return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping), NumOps); 3418 } 3419 3420 /// Return the mapping for a pointer argument. 3421 const RegisterBankInfo::ValueMapping * 3422 AMDGPURegisterBankInfo::getValueMappingForPtr(const MachineRegisterInfo &MRI, 3423 Register PtrReg) const { 3424 LLT PtrTy = MRI.getType(PtrReg); 3425 unsigned Size = PtrTy.getSizeInBits(); 3426 if (Subtarget.useFlatForGlobal() || 3427 !AMDGPU::isFlatGlobalAddrSpace(PtrTy.getAddressSpace())) 3428 return AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); 3429 3430 // If we're using MUBUF instructions for global memory, an SGPR base register 3431 // is possible. Otherwise this needs to be a VGPR. 3432 const RegisterBank *PtrBank = getRegBank(PtrReg, MRI, *TRI); 3433 return AMDGPU::getValueMapping(PtrBank->getID(), Size); 3434 } 3435 3436 const RegisterBankInfo::InstructionMapping & 3437 AMDGPURegisterBankInfo::getInstrMappingForLoad(const MachineInstr &MI) const { 3438 3439 const MachineFunction &MF = *MI.getParent()->getParent(); 3440 const MachineRegisterInfo &MRI = MF.getRegInfo(); 3441 SmallVector<const ValueMapping*, 2> OpdsMapping(2); 3442 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); 3443 Register PtrReg = MI.getOperand(1).getReg(); 3444 LLT PtrTy = MRI.getType(PtrReg); 3445 unsigned AS = PtrTy.getAddressSpace(); 3446 unsigned PtrSize = PtrTy.getSizeInBits(); 3447 3448 const ValueMapping *ValMapping; 3449 const ValueMapping *PtrMapping; 3450 3451 const RegisterBank *PtrBank = getRegBank(PtrReg, MRI, *TRI); 3452 3453 if (PtrBank == &AMDGPU::SGPRRegBank && AMDGPU::isFlatGlobalAddrSpace(AS)) { 3454 if (isScalarLoadLegal(MI)) { 3455 // We have a uniform instruction, so we want to use an SMRD load. 3456 ValMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); 3457 PtrMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize); 3458 } else { 3459 ValMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); 3460 3461 // If we're using MUBUF instructions for global memory, an SGPR base 3462 // register is possible. Otherwise this needs to be a VGPR. 3463 unsigned PtrBankID = Subtarget.useFlatForGlobal() ?
AMDGPU::VGPRRegBankID : AMDGPU::SGPRRegBankID; 3465 3466 PtrMapping = AMDGPU::getValueMapping(PtrBankID, PtrSize); 3467 } 3468 } else { 3469 ValMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); 3470 PtrMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize); 3471 } 3472 3473 OpdsMapping[0] = ValMapping; 3474 OpdsMapping[1] = PtrMapping; 3475 const RegisterBankInfo::InstructionMapping &Mapping = getInstructionMapping( 3476 1, 1, getOperandsMapping(OpdsMapping), MI.getNumOperands()); 3477 return Mapping; 3478 3479 // FIXME: Do we want to add a mapping for FLAT load, or should we just 3480 // handle that during instruction selection? 3481 } 3482 3483 unsigned 3484 AMDGPURegisterBankInfo::getRegBankID(Register Reg, 3485 const MachineRegisterInfo &MRI, 3486 unsigned Default) const { 3487 const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI); 3488 return Bank ? Bank->getID() : Default; 3489 } 3490 3491 const RegisterBankInfo::ValueMapping * 3492 AMDGPURegisterBankInfo::getSGPROpMapping(Register Reg, 3493 const MachineRegisterInfo &MRI, 3494 const TargetRegisterInfo &TRI) const { 3495 // Lie and claim anything is legal, even though this needs to be an SGPR; 3496 // applyMapping will have to deal with it as a waterfall loop. 3497 unsigned Bank = getRegBankID(Reg, MRI, AMDGPU::SGPRRegBankID); 3498 unsigned Size = getSizeInBits(Reg, MRI, TRI); 3499 return AMDGPU::getValueMapping(Bank, Size); 3500 } 3501 3502 const RegisterBankInfo::ValueMapping * 3503 AMDGPURegisterBankInfo::getVGPROpMapping(Register Reg, 3504 const MachineRegisterInfo &MRI, 3505 const TargetRegisterInfo &TRI) const { 3506 unsigned Size = getSizeInBits(Reg, MRI, TRI); 3507 return AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); 3508 } 3509 3510 const RegisterBankInfo::ValueMapping * 3511 AMDGPURegisterBankInfo::getAGPROpMapping(Register Reg, 3512 const MachineRegisterInfo &MRI, 3513 const TargetRegisterInfo &TRI) const { 3514 unsigned Size = getSizeInBits(Reg, MRI, TRI); 3515 return AMDGPU::getValueMapping(AMDGPU::AGPRRegBankID, Size); 3516 } 3517 3518 /// 3519 /// This function must return a legal mapping, because 3520 /// AMDGPURegisterBankInfo::getInstrAlternativeMappings() is not called 3521 /// in RegBankSelect::Mode::Fast. Any mapping that would cause a 3522 /// VGPR-to-SGPR copy to be generated is illegal. 3523 /// 3524 // Operands that must be SGPRs must accept potentially divergent VGPRs as 3525 // legal. These will be dealt with in applyMappingImpl. 3526 // 3527 const RegisterBankInfo::InstructionMapping & 3528 AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { 3529 const MachineFunction &MF = *MI.getParent()->getParent(); 3530 const MachineRegisterInfo &MRI = MF.getRegInfo(); 3531 3532 if (MI.isCopy() || MI.getOpcode() == AMDGPU::G_FREEZE) { 3533 // The default logic wastes effort analyzing impossible alternative mappings. We 3534 // want the most straightforward mapping, so just directly handle this.
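// A single value mapping is reused for the destination (and, for G_FREEZE, the source); a destination with no assigned bank simply inherits the source's bank.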
3535 const RegisterBank *DstBank = getRegBank(MI.getOperand(0).getReg(), MRI, 3536 *TRI); 3537 const RegisterBank *SrcBank = getRegBank(MI.getOperand(1).getReg(), MRI, 3538 *TRI); 3539 assert(SrcBank && "src bank should have been assigned already"); 3540 if (!DstBank) 3541 DstBank = SrcBank; 3542 3543 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); 3544 if (MI.getOpcode() != AMDGPU::G_FREEZE && 3545 cannotCopy(*DstBank, *SrcBank, Size)) 3546 return getInvalidInstructionMapping(); 3547 3548 const ValueMapping &ValMap = getValueMapping(0, Size, *DstBank); 3549 unsigned OpdsMappingSize = MI.isCopy() ? 1 : 2; 3550 SmallVector<const ValueMapping *, 1> OpdsMapping(OpdsMappingSize); 3551 OpdsMapping[0] = &ValMap; 3552 if (MI.getOpcode() == AMDGPU::G_FREEZE) 3553 OpdsMapping[1] = &ValMap; 3554 3555 return getInstructionMapping( 3556 1, /*Cost*/ 1, 3557 /*OperandsMapping*/ getOperandsMapping(OpdsMapping), OpdsMappingSize); 3558 } 3559 3560 if (MI.isRegSequence()) { 3561 // If any input is a VGPR, the result must be a VGPR. The default handling 3562 // assumes any copy between banks is legal. 3563 unsigned BankID = AMDGPU::SGPRRegBankID; 3564 3565 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) { 3566 auto OpBank = getRegBankID(MI.getOperand(I).getReg(), MRI); 3567 // It doesn't make sense to use vcc or scc banks here, so just ignore 3568 // them. 3569 if (OpBank != AMDGPU::SGPRRegBankID) { 3570 BankID = AMDGPU::VGPRRegBankID; 3571 break; 3572 } 3573 } 3574 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); 3575 3576 const ValueMapping &ValMap = getValueMapping(0, Size, getRegBank(BankID)); 3577 return getInstructionMapping( 3578 1, /*Cost*/ 1, 3579 /*OperandsMapping*/ getOperandsMapping({&ValMap}), 1); 3580 } 3581 3582 // The default handling is broken and doesn't handle illegal SGPR->VGPR copies 3583 // properly. 3584 // 3585 // TODO: There are additional exec masking dependencies to analyze. 3586 if (MI.getOpcode() == TargetOpcode::G_PHI) { 3587 unsigned ResultBank = AMDGPU::InvalidRegBankID; 3588 Register DstReg = MI.getOperand(0).getReg(); 3589 3590 // Sometimes the result may have already been assigned a bank. 3591 if (const RegisterBank *DstBank = getRegBank(DstReg, MRI, *TRI)) 3592 ResultBank = DstBank->getID(); 3593 3594 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) { 3595 Register Reg = MI.getOperand(I).getReg(); 3596 const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI); 3597 3598 // FIXME: Assuming VGPR for any undetermined inputs. 
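// A single VGPR (or still-unknown) incoming value forces the whole phi to VGPR, so there is no need to look at the remaining operands.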
3599 if (!Bank || Bank->getID() == AMDGPU::VGPRRegBankID) { 3600 ResultBank = AMDGPU::VGPRRegBankID; 3601 break; 3602 } 3603 3604 // FIXME: Need to promote SGPR case to s32 3605 unsigned OpBank = Bank->getID(); 3606 ResultBank = regBankBoolUnion(ResultBank, OpBank); 3607 } 3608 3609 assert(ResultBank != AMDGPU::InvalidRegBankID); 3610 3611 unsigned Size = MRI.getType(DstReg).getSizeInBits(); 3612 3613 const ValueMapping &ValMap = 3614 getValueMapping(0, Size, getRegBank(ResultBank)); 3615 return getInstructionMapping( 3616 1, /*Cost*/ 1, 3617 /*OperandsMapping*/ getOperandsMapping({&ValMap}), 1); 3618 } 3619 3620 const RegisterBankInfo::InstructionMapping &Mapping = getInstrMappingImpl(MI); 3621 if (Mapping.isValid()) 3622 return Mapping; 3623 3624 SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands()); 3625 3626 switch (MI.getOpcode()) { 3627 default: 3628 return getInvalidInstructionMapping(); 3629 3630 case AMDGPU::G_AND: 3631 case AMDGPU::G_OR: 3632 case AMDGPU::G_XOR: { 3633 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 3634 if (Size == 1) { 3635 const RegisterBank *DstBank 3636 = getRegBank(MI.getOperand(0).getReg(), MRI, *TRI); 3637 3638 unsigned TargetBankID = AMDGPU::InvalidRegBankID; 3639 unsigned BankLHS = AMDGPU::InvalidRegBankID; 3640 unsigned BankRHS = AMDGPU::InvalidRegBankID; 3641 if (DstBank) { 3642 TargetBankID = DstBank->getID(); 3643 if (DstBank == &AMDGPU::VCCRegBank) { 3644 TargetBankID = AMDGPU::VCCRegBankID; 3645 BankLHS = AMDGPU::VCCRegBankID; 3646 BankRHS = AMDGPU::VCCRegBankID; 3647 } else { 3648 BankLHS = getRegBankID(MI.getOperand(1).getReg(), MRI, 3649 AMDGPU::SGPRRegBankID); 3650 BankRHS = getRegBankID(MI.getOperand(2).getReg(), MRI, 3651 AMDGPU::SGPRRegBankID); 3652 } 3653 } else { 3654 BankLHS = getRegBankID(MI.getOperand(1).getReg(), MRI, 3655 AMDGPU::VCCRegBankID); 3656 BankRHS = getRegBankID(MI.getOperand(2).getReg(), MRI, 3657 AMDGPU::VCCRegBankID); 3658 3659 // Both inputs should be true booleans to produce a boolean result. 
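// Resolution order below: any VGPR input makes the result VGPR; otherwise any vcc input promotes both sides to vcc; only two SGPR inputs stay SGPR.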
3660 if (BankLHS == AMDGPU::VGPRRegBankID || BankRHS == AMDGPU::VGPRRegBankID) { 3661 TargetBankID = AMDGPU::VGPRRegBankID; 3662 } else if (BankLHS == AMDGPU::VCCRegBankID || BankRHS == AMDGPU::VCCRegBankID) { 3663 TargetBankID = AMDGPU::VCCRegBankID; 3664 BankLHS = AMDGPU::VCCRegBankID; 3665 BankRHS = AMDGPU::VCCRegBankID; 3666 } else if (BankLHS == AMDGPU::SGPRRegBankID && BankRHS == AMDGPU::SGPRRegBankID) { 3667 TargetBankID = AMDGPU::SGPRRegBankID; 3668 } 3669 } 3670 3671 OpdsMapping[0] = AMDGPU::getValueMapping(TargetBankID, Size); 3672 OpdsMapping[1] = AMDGPU::getValueMapping(BankLHS, Size); 3673 OpdsMapping[2] = AMDGPU::getValueMapping(BankRHS, Size); 3674 break; 3675 } 3676 3677 if (Size == 64) { 3678 3679 if (isSALUMapping(MI)) { 3680 OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::SGPRRegBankID, Size); 3681 OpdsMapping[1] = OpdsMapping[2] = OpdsMapping[0]; 3682 } else { 3683 OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size); 3684 unsigned Bank1 = getRegBankID(MI.getOperand(1).getReg(), MRI /*, DefaultBankID*/); 3685 OpdsMapping[1] = AMDGPU::getValueMapping(Bank1, Size); 3686 3687 unsigned Bank2 = getRegBankID(MI.getOperand(2).getReg(), MRI /*, DefaultBankID*/); 3688 OpdsMapping[2] = AMDGPU::getValueMapping(Bank2, Size); 3689 } 3690 3691 break; 3692 } 3693 3694 [[fallthrough]]; 3695 } 3696 case AMDGPU::G_PTR_ADD: 3697 case AMDGPU::G_PTRMASK: 3698 case AMDGPU::G_ADD: 3699 case AMDGPU::G_SUB: 3700 case AMDGPU::G_MUL: 3701 case AMDGPU::G_SHL: 3702 case AMDGPU::G_LSHR: 3703 case AMDGPU::G_ASHR: 3704 case AMDGPU::G_UADDO: 3705 case AMDGPU::G_USUBO: 3706 case AMDGPU::G_UADDE: 3707 case AMDGPU::G_SADDE: 3708 case AMDGPU::G_USUBE: 3709 case AMDGPU::G_SSUBE: 3710 case AMDGPU::G_SMIN: 3711 case AMDGPU::G_SMAX: 3712 case AMDGPU::G_UMIN: 3713 case AMDGPU::G_UMAX: 3714 case AMDGPU::G_ABS: 3715 case AMDGPU::G_SHUFFLE_VECTOR: 3716 case AMDGPU::G_SBFX: 3717 case AMDGPU::G_UBFX: 3718 if (isSALUMapping(MI)) 3719 return getDefaultMappingSOP(MI); 3720 [[fallthrough]]; 3721 3722 case AMDGPU::G_SADDSAT: // FIXME: Could lower sat ops for SALU 3723 case AMDGPU::G_SSUBSAT: 3724 case AMDGPU::G_UADDSAT: 3725 case AMDGPU::G_USUBSAT: 3726 case AMDGPU::G_FADD: 3727 case AMDGPU::G_FSUB: 3728 case AMDGPU::G_FPTOSI: 3729 case AMDGPU::G_FPTOUI: 3730 case AMDGPU::G_FMUL: 3731 case AMDGPU::G_FMA: 3732 case AMDGPU::G_FMAD: 3733 case AMDGPU::G_FSQRT: 3734 case AMDGPU::G_FFLOOR: 3735 case AMDGPU::G_FCEIL: 3736 case AMDGPU::G_FRINT: 3737 case AMDGPU::G_SITOFP: 3738 case AMDGPU::G_UITOFP: 3739 case AMDGPU::G_FPTRUNC: 3740 case AMDGPU::G_FPEXT: 3741 case AMDGPU::G_FEXP2: 3742 case AMDGPU::G_FLOG2: 3743 case AMDGPU::G_FLDEXP: 3744 case AMDGPU::G_FMINNUM: 3745 case AMDGPU::G_FMAXNUM: 3746 case AMDGPU::G_FMINNUM_IEEE: 3747 case AMDGPU::G_FMAXNUM_IEEE: 3748 case AMDGPU::G_FCANONICALIZE: 3749 case AMDGPU::G_INTRINSIC_TRUNC: 3750 case AMDGPU::G_STRICT_FADD: 3751 case AMDGPU::G_STRICT_FSUB: 3752 case AMDGPU::G_STRICT_FMUL: 3753 case AMDGPU::G_STRICT_FMA: 3754 case AMDGPU::G_STRICT_FLDEXP: 3755 case AMDGPU::G_BSWAP: // TODO: Somehow expand for scalar? 
3756 case AMDGPU::G_FSHR: // TODO: Expand for scalar 3757 case AMDGPU::G_AMDGPU_FMIN_LEGACY: 3758 case AMDGPU::G_AMDGPU_FMAX_LEGACY: 3759 case AMDGPU::G_AMDGPU_RCP_IFLAG: 3760 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0: 3761 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1: 3762 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2: 3763 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3: 3764 case AMDGPU::G_AMDGPU_CVT_PK_I16_I32: 3765 case AMDGPU::G_AMDGPU_SMED3: 3766 case AMDGPU::G_AMDGPU_FMED3: 3767 return getDefaultMappingVOP(MI); 3768 case AMDGPU::G_UMULH: 3769 case AMDGPU::G_SMULH: { 3770 if (Subtarget.hasScalarMulHiInsts() && isSALUMapping(MI)) 3771 return getDefaultMappingSOP(MI); 3772 return getDefaultMappingVOP(MI); 3773 } 3774 case AMDGPU::G_AMDGPU_MAD_U64_U32: 3775 case AMDGPU::G_AMDGPU_MAD_I64_I32: { 3776 // Three possible mappings: 3777 // 3778 // - Default SOP 3779 // - Default VOP 3780 // - Scalar multiply: src0 and src1 are SGPRs, the rest is VOP. 3781 // 3782 // This allows instruction selection to keep the multiplication part of the 3783 // instruction on the SALU. 3784 bool AllSalu = true; 3785 bool MulSalu = true; 3786 for (unsigned i = 0; i < 5; ++i) { 3787 Register Reg = MI.getOperand(i).getReg(); 3788 if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) { 3789 if (Bank->getID() != AMDGPU::SGPRRegBankID) { 3790 AllSalu = false; 3791 if (i == 2 || i == 3) { 3792 MulSalu = false; 3793 break; 3794 } 3795 } 3796 } 3797 } 3798 3799 if (AllSalu) 3800 return getDefaultMappingSOP(MI); 3801 3802 // If the multiply-add is full-rate in VALU, use that even if the 3803 // multiplication part is scalar. Accumulating separately on the VALU would 3804 // take two instructions. 3805 if (!MulSalu || Subtarget.hasFullRate64Ops()) 3806 return getDefaultMappingVOP(MI); 3807 3808 // Keep the multiplication on the SALU, then accumulate on the VALU. 3809 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 64); 3810 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1); 3811 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32); 3812 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32); 3813 OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 64); 3814 break; 3815 } 3816 case AMDGPU::G_IMPLICIT_DEF: { 3817 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 3818 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); 3819 break; 3820 } 3821 case AMDGPU::G_FCONSTANT: 3822 case AMDGPU::G_CONSTANT: 3823 case AMDGPU::G_GLOBAL_VALUE: 3824 case AMDGPU::G_BLOCK_ADDR: 3825 case AMDGPU::G_READCYCLECOUNTER: { 3826 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 3827 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); 3828 break; 3829 } 3830 case AMDGPU::G_FRAME_INDEX: { 3831 // TODO: This should be the same as other constants, but eliminateFrameIndex 3832 // currently assumes VALU uses. 3833 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 3834 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); 3835 break; 3836 } 3837 case AMDGPU::G_DYN_STACKALLOC: { 3838 // Result is always uniform, and a wave reduction is needed for the source. 
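// The stack pointer is managed per-wave, so the result is an SGPR even when the requested size is divergent; the reduction itself is handled later in applyMappingDynStackAlloc.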
3839 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32); 3840 unsigned SrcBankID = getRegBankID(MI.getOperand(1).getReg(), MRI); 3841 OpdsMapping[1] = AMDGPU::getValueMapping(SrcBankID, 32); 3842 break; 3843 } 3844 case AMDGPU::G_AMDGPU_WAVE_ADDRESS: { 3845 // This case is weird because we expect a physical register in the source, 3846 // but need to set a bank anyway. 3847 // 3848 // We could select the result to SGPR or VGPR, but for the one current use 3849 // it's more practical to always use VGPR. 3850 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); 3851 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32); 3852 break; 3853 } 3854 case AMDGPU::G_INSERT: { 3855 unsigned BankID = getMappingType(MRI, MI); 3856 unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); 3857 unsigned SrcSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI); 3858 unsigned EltSize = getSizeInBits(MI.getOperand(2).getReg(), MRI, *TRI); 3859 OpdsMapping[0] = AMDGPU::getValueMapping(BankID, DstSize); 3860 OpdsMapping[1] = AMDGPU::getValueMapping(BankID, SrcSize); 3861 OpdsMapping[2] = AMDGPU::getValueMapping(BankID, EltSize); 3862 OpdsMapping[3] = nullptr; 3863 break; 3864 } 3865 case AMDGPU::G_EXTRACT: { 3866 unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI); 3867 unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); 3868 unsigned SrcSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI); 3869 OpdsMapping[0] = AMDGPU::getValueMapping(BankID, DstSize); 3870 OpdsMapping[1] = AMDGPU::getValueMapping(BankID, SrcSize); 3871 OpdsMapping[2] = nullptr; 3872 break; 3873 } 3874 case AMDGPU::G_BUILD_VECTOR: 3875 case AMDGPU::G_BUILD_VECTOR_TRUNC: { 3876 LLT DstTy = MRI.getType(MI.getOperand(0).getReg()); 3877 if (DstTy == LLT::fixed_vector(2, 16)) { 3878 unsigned DstSize = DstTy.getSizeInBits(); 3879 unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits(); 3880 unsigned Src0BankID = getRegBankID(MI.getOperand(1).getReg(), MRI); 3881 unsigned Src1BankID = getRegBankID(MI.getOperand(2).getReg(), MRI); 3882 unsigned DstBankID = regBankUnion(Src0BankID, Src1BankID); 3883 3884 OpdsMapping[0] = AMDGPU::getValueMapping(DstBankID, DstSize); 3885 OpdsMapping[1] = AMDGPU::getValueMapping(Src0BankID, SrcSize); 3886 OpdsMapping[2] = AMDGPU::getValueMapping(Src1BankID, SrcSize); 3887 break; 3888 } 3889 3890 [[fallthrough]]; 3891 } 3892 case AMDGPU::G_MERGE_VALUES: 3893 case AMDGPU::G_CONCAT_VECTORS: { 3894 unsigned Bank = getMappingType(MRI, MI); 3895 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 3896 unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits(); 3897 3898 OpdsMapping[0] = AMDGPU::getValueMapping(Bank, DstSize); 3899 // Op1 and Dst should use the same register bank. 
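// e.g. merging two SGPR s32 values yields an SGPR s64; any VGPR input would already have made getMappingType return the VGPR bank for the whole instruction.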
3900 for (unsigned i = 1, e = MI.getNumOperands(); i != e; ++i) 3901 OpdsMapping[i] = AMDGPU::getValueMapping(Bank, SrcSize); 3902 break; 3903 } 3904 case AMDGPU::G_BITREVERSE: 3905 case AMDGPU::G_BITCAST: 3906 case AMDGPU::G_INTTOPTR: 3907 case AMDGPU::G_PTRTOINT: 3908 case AMDGPU::G_FABS: 3909 case AMDGPU::G_FNEG: { 3910 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 3911 unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI); 3912 OpdsMapping[0] = OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size); 3913 break; 3914 } 3915 case AMDGPU::G_AMDGPU_FFBH_U32: 3916 case AMDGPU::G_AMDGPU_FFBL_B32: 3917 case AMDGPU::G_CTLZ_ZERO_UNDEF: 3918 case AMDGPU::G_CTTZ_ZERO_UNDEF: { 3919 unsigned Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits(); 3920 unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI); 3921 OpdsMapping[0] = AMDGPU::getValueMapping(BankID, 32); 3922 OpdsMapping[1] = AMDGPU::getValueMappingSGPR64Only(BankID, Size); 3923 break; 3924 } 3925 case AMDGPU::G_CTPOP: { 3926 unsigned Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits(); 3927 unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI); 3928 OpdsMapping[0] = AMDGPU::getValueMapping(BankID, 32); 3929 3930 // This should really be getValueMappingSGPR64Only, but allowing the generic 3931 // code to handle the register split just makes using LegalizerHelper more 3932 // difficult. 3933 OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size); 3934 break; 3935 } 3936 case AMDGPU::G_TRUNC: { 3937 Register Dst = MI.getOperand(0).getReg(); 3938 Register Src = MI.getOperand(1).getReg(); 3939 unsigned Bank = getRegBankID(Src, MRI); 3940 unsigned DstSize = getSizeInBits(Dst, MRI, *TRI); 3941 unsigned SrcSize = getSizeInBits(Src, MRI, *TRI); 3942 OpdsMapping[0] = AMDGPU::getValueMapping(Bank, DstSize); 3943 OpdsMapping[1] = AMDGPU::getValueMapping(Bank, SrcSize); 3944 break; 3945 } 3946 case AMDGPU::G_ZEXT: 3947 case AMDGPU::G_SEXT: 3948 case AMDGPU::G_ANYEXT: 3949 case AMDGPU::G_SEXT_INREG: { 3950 Register Dst = MI.getOperand(0).getReg(); 3951 Register Src = MI.getOperand(1).getReg(); 3952 unsigned DstSize = getSizeInBits(Dst, MRI, *TRI); 3953 unsigned SrcSize = getSizeInBits(Src, MRI, *TRI); 3954 3955 unsigned DstBank; 3956 const RegisterBank *SrcBank = getRegBank(Src, MRI, *TRI); 3957 assert(SrcBank); 3958 switch (SrcBank->getID()) { 3959 case AMDGPU::SGPRRegBankID: 3960 DstBank = AMDGPU::SGPRRegBankID; 3961 break; 3962 default: 3963 DstBank = AMDGPU::VGPRRegBankID; 3964 break; 3965 } 3966 3967 // Scalar extend can use 64-bit BFE, but VGPRs require extending to 3968 // 32-bits, and then to 64. 3969 OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(DstBank, DstSize); 3970 OpdsMapping[1] = AMDGPU::getValueMappingSGPR64Only(SrcBank->getID(), 3971 SrcSize); 3972 break; 3973 } 3974 case AMDGPU::G_FCMP: { 3975 unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits(); 3976 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1); 3977 OpdsMapping[1] = nullptr; // Predicate Operand. 
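// FP compares are always executed on the VALU here, so the sources are VGPRs and the result is a lane mask in the vcc bank.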
3978 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); 3979 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); 3980 break; 3981 } 3982 case AMDGPU::G_IS_FPCLASS: { 3983 Register SrcReg = MI.getOperand(1).getReg(); 3984 unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits(); 3985 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 3986 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, DstSize); 3987 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize); 3988 break; 3989 } 3990 case AMDGPU::G_STORE: { 3991 assert(MI.getOperand(0).isReg()); 3992 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 3993 3994 // FIXME: We need to specify a different reg bank once scalar stores are 3995 // supported. 3996 const ValueMapping *ValMapping = 3997 AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); 3998 OpdsMapping[0] = ValMapping; 3999 OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg()); 4000 break; 4001 } 4002 case AMDGPU::G_ICMP: { 4003 auto Pred = static_cast<CmpInst::Predicate>(MI.getOperand(1).getPredicate()); 4004 unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits(); 4005 4006 // See if the result register has already been constrained to vcc, which may 4007 // happen due to control flow intrinsic lowering. 4008 unsigned DstBank = getRegBankID(MI.getOperand(0).getReg(), MRI, 4009 AMDGPU::SGPRRegBankID); 4010 unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI); 4011 unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI); 4012 4013 bool CanUseSCC = DstBank == AMDGPU::SGPRRegBankID && 4014 Op2Bank == AMDGPU::SGPRRegBankID && 4015 Op3Bank == AMDGPU::SGPRRegBankID && 4016 (Size == 32 || (Size == 64 && 4017 (Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE) && 4018 Subtarget.hasScalarCompareEq64())); 4019 4020 DstBank = CanUseSCC ? AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID; 4021 unsigned SrcBank = CanUseSCC ? AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID; 4022 4023 // TODO: Use 32-bit for scalar output size. 4024 // SCC results will need to be copied to a 32-bit SGPR virtual register. 4025 const unsigned ResultSize = 1; 4026 4027 OpdsMapping[0] = AMDGPU::getValueMapping(DstBank, ResultSize); 4028 OpdsMapping[2] = AMDGPU::getValueMapping(SrcBank, Size); 4029 OpdsMapping[3] = AMDGPU::getValueMapping(SrcBank, Size); 4030 break; 4031 } 4032 case AMDGPU::G_EXTRACT_VECTOR_ELT: { 4033 // A VGPR index can be handled via a waterfall loop when indexing an SGPR vector. 4034 unsigned SrcBankID = getRegBankID(MI.getOperand(1).getReg(), MRI); 4035 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4036 unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits(); 4037 unsigned IdxSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits(); 4038 unsigned IdxBank = getRegBankID(MI.getOperand(2).getReg(), MRI); 4039 unsigned OutputBankID = regBankUnion(SrcBankID, IdxBank); 4040 4041 OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(OutputBankID, DstSize); 4042 OpdsMapping[1] = AMDGPU::getValueMapping(SrcBankID, SrcSize); 4043 4044 // The index can be in either bank if the source vector is VGPR. 4045 OpdsMapping[2] = AMDGPU::getValueMapping(IdxBank, IdxSize); 4046 break; 4047 } 4048 case AMDGPU::G_INSERT_VECTOR_ELT: { 4049 unsigned OutputBankID = isSALUMapping(MI) ?
4050 AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID; 4051 4052 unsigned VecSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4053 unsigned InsertSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits(); 4054 unsigned IdxSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits(); 4055 unsigned InsertEltBankID = getRegBankID(MI.getOperand(2).getReg(), MRI); 4056 unsigned IdxBankID = getRegBankID(MI.getOperand(3).getReg(), MRI); 4057 4058 OpdsMapping[0] = AMDGPU::getValueMapping(OutputBankID, VecSize); 4059 OpdsMapping[1] = AMDGPU::getValueMapping(OutputBankID, VecSize); 4060 4061 // This is a weird case, because we need to break down the mapping based on 4062 // the register bank of a different operand. 4063 if (InsertSize == 64 && OutputBankID == AMDGPU::VGPRRegBankID) { 4064 OpdsMapping[2] = AMDGPU::getValueMappingSplit64(InsertEltBankID, 4065 InsertSize); 4066 } else { 4067 assert(InsertSize == 32 || InsertSize == 64); 4068 OpdsMapping[2] = AMDGPU::getValueMapping(InsertEltBankID, InsertSize); 4069 } 4070 4071 // The index can be in either bank if the source vector is VGPR. 4072 OpdsMapping[3] = AMDGPU::getValueMapping(IdxBankID, IdxSize); 4073 break; 4074 } 4075 case AMDGPU::G_UNMERGE_VALUES: { 4076 unsigned Bank = getMappingType(MRI, MI); 4077 4078 // Op1 and Dst should use the same register bank. 4079 // FIXME: Shouldn't this be the default? Why do we need to handle this? 4080 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { 4081 unsigned Size = getSizeInBits(MI.getOperand(i).getReg(), MRI, *TRI); 4082 OpdsMapping[i] = AMDGPU::getValueMapping(Bank, Size); 4083 } 4084 break; 4085 } 4086 case AMDGPU::G_AMDGPU_BUFFER_LOAD: 4087 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE: 4088 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE: 4089 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT: 4090 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT: 4091 case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT: 4092 case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE: 4093 case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16: 4094 case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT: 4095 case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16: 4096 case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT: 4097 case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16: 4098 case AMDGPU::G_AMDGPU_BUFFER_STORE: 4099 case AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE: 4100 case AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT: 4101 case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT: 4102 case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16: { 4103 OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); 4104 4105 // rsrc 4106 OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI); 4107 4108 // vindex 4109 OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); 4110 4111 // voffset 4112 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); 4113 4114 // soffset 4115 OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); 4116 4117 // Any remaining operands are immediates and were correctly null 4118 // initialized.
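// Note that getSGPROpMapping only reports the current bank of rsrc/soffset; divergent values are legalized afterwards with a waterfall loop.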
4119 break; 4120 } 4121 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP: 4122 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD: 4123 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB: 4124 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN: 4125 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN: 4126 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX: 4127 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX: 4128 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND: 4129 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR: 4130 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR: 4131 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC: 4132 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC: 4133 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD: 4134 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN: 4135 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX: { 4136 // vdata_out 4137 OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); 4138 4139 // vdata_in 4140 OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI); 4141 4142 // rsrc 4143 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); 4144 4145 // vindex 4146 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); 4147 4148 // voffset 4149 OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); 4150 4151 // soffset 4152 OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI); 4153 4154 // Any remaining operands are immediates and were correctly null 4155 // initialized. 4156 break; 4157 } 4158 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP: { 4159 // vdata_out 4160 OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); 4161 4162 // vdata_in 4163 OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI); 4164 4165 // cmp 4166 OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); 4167 4168 // rsrc 4169 OpdsMapping[3] = getSGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); 4170 4171 // vindex 4172 OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); 4173 4174 // voffset 4175 OpdsMapping[5] = getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI); 4176 4177 // soffset 4178 OpdsMapping[6] = getSGPROpMapping(MI.getOperand(6).getReg(), MRI, *TRI); 4179 4180 // Any remaining operands are immediates and were correctly null 4181 // initialized. 4182 break; 4183 } 4184 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD: { 4185 // Lie and claim everything is legal, even though some need to be 4186 // SGPRs. applyMapping will have to deal with it as a waterfall loop. 4187 OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI); 4188 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); 4189 4190 // We need to convert this to a MUBUF if either the resource or offset is 4191 // VGPR.
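// The result bank is therefore the union of the rsrc and offset banks: all-SGPR inputs keep the load scalar, anything else moves it to the VALU.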
4192 unsigned RSrcBank = OpdsMapping[1]->BreakDown[0].RegBank->getID(); 4193 unsigned OffsetBank = OpdsMapping[2]->BreakDown[0].RegBank->getID(); 4194 unsigned ResultBank = regBankUnion(RSrcBank, OffsetBank); 4195 4196 unsigned Size0 = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4197 OpdsMapping[0] = AMDGPU::getValueMapping(ResultBank, Size0); 4198 break; 4199 } 4200 case AMDGPU::G_INTRINSIC: { 4201 switch (MI.getIntrinsicID()) { 4202 default: 4203 return getInvalidInstructionMapping(); 4204 case Intrinsic::amdgcn_div_fmas: 4205 case Intrinsic::amdgcn_div_fixup: 4206 case Intrinsic::amdgcn_trig_preop: 4207 case Intrinsic::amdgcn_sin: 4208 case Intrinsic::amdgcn_cos: 4209 case Intrinsic::amdgcn_log_clamp: 4210 case Intrinsic::amdgcn_log: 4211 case Intrinsic::amdgcn_exp2: 4212 case Intrinsic::amdgcn_rcp: 4213 case Intrinsic::amdgcn_rcp_legacy: 4214 case Intrinsic::amdgcn_sqrt: 4215 case Intrinsic::amdgcn_rsq: 4216 case Intrinsic::amdgcn_rsq_legacy: 4217 case Intrinsic::amdgcn_rsq_clamp: 4218 case Intrinsic::amdgcn_fmul_legacy: 4219 case Intrinsic::amdgcn_fma_legacy: 4220 case Intrinsic::amdgcn_frexp_mant: 4221 case Intrinsic::amdgcn_frexp_exp: 4222 case Intrinsic::amdgcn_fract: 4223 case Intrinsic::amdgcn_cvt_pkrtz: 4224 case Intrinsic::amdgcn_cvt_pknorm_i16: 4225 case Intrinsic::amdgcn_cvt_pknorm_u16: 4226 case Intrinsic::amdgcn_cvt_pk_i16: 4227 case Intrinsic::amdgcn_cvt_pk_u16: 4228 case Intrinsic::amdgcn_fmed3: 4229 case Intrinsic::amdgcn_cubeid: 4230 case Intrinsic::amdgcn_cubema: 4231 case Intrinsic::amdgcn_cubesc: 4232 case Intrinsic::amdgcn_cubetc: 4233 case Intrinsic::amdgcn_sffbh: 4234 case Intrinsic::amdgcn_fmad_ftz: 4235 case Intrinsic::amdgcn_mbcnt_lo: 4236 case Intrinsic::amdgcn_mbcnt_hi: 4237 case Intrinsic::amdgcn_mul_u24: 4238 case Intrinsic::amdgcn_mul_i24: 4239 case Intrinsic::amdgcn_mulhi_u24: 4240 case Intrinsic::amdgcn_mulhi_i24: 4241 case Intrinsic::amdgcn_lerp: 4242 case Intrinsic::amdgcn_sad_u8: 4243 case Intrinsic::amdgcn_msad_u8: 4244 case Intrinsic::amdgcn_sad_hi_u8: 4245 case Intrinsic::amdgcn_sad_u16: 4246 case Intrinsic::amdgcn_qsad_pk_u16_u8: 4247 case Intrinsic::amdgcn_mqsad_pk_u16_u8: 4248 case Intrinsic::amdgcn_mqsad_u32_u8: 4249 case Intrinsic::amdgcn_cvt_pk_u8_f32: 4250 case Intrinsic::amdgcn_alignbyte: 4251 case Intrinsic::amdgcn_perm: 4252 case Intrinsic::amdgcn_fdot2: 4253 case Intrinsic::amdgcn_sdot2: 4254 case Intrinsic::amdgcn_udot2: 4255 case Intrinsic::amdgcn_sdot4: 4256 case Intrinsic::amdgcn_udot4: 4257 case Intrinsic::amdgcn_sdot8: 4258 case Intrinsic::amdgcn_udot8: 4259 case Intrinsic::amdgcn_fdot2_bf16_bf16: 4260 case Intrinsic::amdgcn_fdot2_f16_f16: 4261 case Intrinsic::amdgcn_fdot2_f32_bf16: 4262 case Intrinsic::amdgcn_sudot4: 4263 case Intrinsic::amdgcn_sudot8: 4264 case Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16: 4265 case Intrinsic::amdgcn_wmma_f16_16x16x16_f16: 4266 case Intrinsic::amdgcn_wmma_f32_16x16x16_bf16: 4267 case Intrinsic::amdgcn_wmma_f32_16x16x16_f16: 4268 case Intrinsic::amdgcn_wmma_i32_16x16x16_iu4: 4269 case Intrinsic::amdgcn_wmma_i32_16x16x16_iu8: 4270 return getDefaultMappingVOP(MI); 4271 case Intrinsic::amdgcn_sbfe: 4272 case Intrinsic::amdgcn_ubfe: 4273 if (isSALUMapping(MI)) 4274 return getDefaultMappingSOP(MI); 4275 return getDefaultMappingVOP(MI); 4276 case Intrinsic::amdgcn_ds_swizzle: 4277 case Intrinsic::amdgcn_ds_permute: 4278 case Intrinsic::amdgcn_ds_bpermute: 4279 case Intrinsic::amdgcn_update_dpp: 4280 case Intrinsic::amdgcn_mov_dpp8: 4281 case Intrinsic::amdgcn_mov_dpp: 4282 case 
Intrinsic::amdgcn_strict_wwm: 4283 case Intrinsic::amdgcn_wwm: 4284 case Intrinsic::amdgcn_strict_wqm: 4285 case Intrinsic::amdgcn_wqm: 4286 case Intrinsic::amdgcn_softwqm: 4287 case Intrinsic::amdgcn_set_inactive: 4288 case Intrinsic::amdgcn_permlane64: 4289 return getDefaultMappingAllVGPR(MI); 4290 case Intrinsic::amdgcn_kernarg_segment_ptr: 4291 case Intrinsic::amdgcn_s_getpc: 4292 case Intrinsic::amdgcn_groupstaticsize: 4293 case Intrinsic::amdgcn_reloc_constant: 4294 case Intrinsic::returnaddress: { 4295 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4296 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); 4297 break; 4298 } 4299 case Intrinsic::amdgcn_wqm_vote: { 4300 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4301 OpdsMapping[0] = OpdsMapping[2] 4302 = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size); 4303 break; 4304 } 4305 case Intrinsic::amdgcn_ps_live: { 4306 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1); 4307 break; 4308 } 4309 case Intrinsic::amdgcn_div_scale: { 4310 unsigned Dst0Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4311 unsigned Dst1Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits(); 4312 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Dst0Size); 4313 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Dst1Size); 4314 4315 unsigned SrcSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits(); 4316 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize); 4317 OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize); 4318 break; 4319 } 4320 case Intrinsic::amdgcn_class: { 4321 Register Src0Reg = MI.getOperand(2).getReg(); 4322 Register Src1Reg = MI.getOperand(3).getReg(); 4323 unsigned Src0Size = MRI.getType(Src0Reg).getSizeInBits(); 4324 unsigned Src1Size = MRI.getType(Src1Reg).getSizeInBits(); 4325 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4326 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, DstSize); 4327 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Src0Size); 4328 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Src1Size); 4329 break; 4330 } 4331 case Intrinsic::amdgcn_icmp: 4332 case Intrinsic::amdgcn_fcmp: { 4333 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4334 // This is not VCCRegBank because this is not used in boolean contexts. 4335 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize); 4336 unsigned OpSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits(); 4337 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, OpSize); 4338 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, OpSize); 4339 break; 4340 } 4341 case Intrinsic::amdgcn_readlane: { 4342 // This must be an SGPR, but accept a VGPR. 
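// Report whatever bank the index currently has; a divergent index is presumably legalized after mapping (with a readfirstlane), matching the pattern used elsewhere in this file.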
4343 Register IdxReg = MI.getOperand(3).getReg(); 4344 unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits(); 4345 unsigned IdxBank = getRegBankID(IdxReg, MRI, AMDGPU::SGPRRegBankID); 4346 OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize); 4347 [[fallthrough]]; 4348 } 4349 case Intrinsic::amdgcn_readfirstlane: { 4350 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4351 unsigned SrcSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits(); 4352 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize); 4353 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize); 4354 break; 4355 } 4356 case Intrinsic::amdgcn_writelane: { 4357 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4358 Register SrcReg = MI.getOperand(2).getReg(); 4359 unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits(); 4360 unsigned SrcBank = getRegBankID(SrcReg, MRI, AMDGPU::SGPRRegBankID); 4361 Register IdxReg = MI.getOperand(3).getReg(); 4362 unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits(); 4363 unsigned IdxBank = getRegBankID(IdxReg, MRI, AMDGPU::SGPRRegBankID); 4364 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize); 4365 4366 // These 2 must be SGPRs, but accept VGPRs. Readfirstlane will be inserted 4367 // to legalize. 4368 OpdsMapping[2] = AMDGPU::getValueMapping(SrcBank, SrcSize); 4369 OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize); 4370 OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize); 4371 break; 4372 } 4373 case Intrinsic::amdgcn_if_break: { 4374 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); 4375 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); 4376 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1); 4377 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); 4378 break; 4379 } 4380 case Intrinsic::amdgcn_permlane16: 4381 case Intrinsic::amdgcn_permlanex16: { 4382 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); 4383 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); 4384 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); 4385 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); 4386 OpdsMapping[4] = getSGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); 4387 OpdsMapping[5] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); 4388 break; 4389 } 4390 case Intrinsic::amdgcn_mfma_f32_4x4x1f32: 4391 case Intrinsic::amdgcn_mfma_f32_4x4x4f16: 4392 case Intrinsic::amdgcn_mfma_i32_4x4x4i8: 4393 case Intrinsic::amdgcn_mfma_f32_4x4x2bf16: 4394 case Intrinsic::amdgcn_mfma_f32_16x16x1f32: 4395 case Intrinsic::amdgcn_mfma_f32_16x16x4f32: 4396 case Intrinsic::amdgcn_mfma_f32_16x16x4f16: 4397 case Intrinsic::amdgcn_mfma_f32_16x16x16f16: 4398 case Intrinsic::amdgcn_mfma_i32_16x16x4i8: 4399 case Intrinsic::amdgcn_mfma_i32_16x16x16i8: 4400 case Intrinsic::amdgcn_mfma_f32_16x16x2bf16: 4401 case Intrinsic::amdgcn_mfma_f32_16x16x8bf16: 4402 case Intrinsic::amdgcn_mfma_f32_32x32x1f32: 4403 case Intrinsic::amdgcn_mfma_f32_32x32x2f32: 4404 case Intrinsic::amdgcn_mfma_f32_32x32x4f16: 4405 case Intrinsic::amdgcn_mfma_f32_32x32x8f16: 4406 case Intrinsic::amdgcn_mfma_i32_32x32x4i8: 4407 case Intrinsic::amdgcn_mfma_i32_32x32x8i8: 4408 case Intrinsic::amdgcn_mfma_f32_32x32x2bf16: 4409 case Intrinsic::amdgcn_mfma_f32_32x32x4bf16: 4410 case Intrinsic::amdgcn_mfma_f32_32x32x4bf16_1k: 4411 case 
Intrinsic::amdgcn_mfma_f32_16x16x4bf16_1k: 4412 case Intrinsic::amdgcn_mfma_f32_4x4x4bf16_1k: 4413 case Intrinsic::amdgcn_mfma_f32_32x32x8bf16_1k: 4414 case Intrinsic::amdgcn_mfma_f32_16x16x16bf16_1k: 4415 case Intrinsic::amdgcn_mfma_f64_16x16x4f64: 4416 case Intrinsic::amdgcn_mfma_f64_4x4x4f64: 4417 case Intrinsic::amdgcn_mfma_i32_16x16x32_i8: 4418 case Intrinsic::amdgcn_mfma_i32_32x32x16_i8: 4419 case Intrinsic::amdgcn_mfma_f32_16x16x8_xf32: 4420 case Intrinsic::amdgcn_mfma_f32_32x32x4_xf32: 4421 case Intrinsic::amdgcn_mfma_f32_16x16x32_bf8_bf8: 4422 case Intrinsic::amdgcn_mfma_f32_16x16x32_bf8_fp8: 4423 case Intrinsic::amdgcn_mfma_f32_16x16x32_fp8_bf8: 4424 case Intrinsic::amdgcn_mfma_f32_16x16x32_fp8_fp8: 4425 case Intrinsic::amdgcn_mfma_f32_32x32x16_bf8_bf8: 4426 case Intrinsic::amdgcn_mfma_f32_32x32x16_bf8_fp8: 4427 case Intrinsic::amdgcn_mfma_f32_32x32x16_fp8_bf8: 4428 case Intrinsic::amdgcn_mfma_f32_32x32x16_fp8_fp8: { 4429 // Default for MAI intrinsics. 4430 // srcC can also be an immediate which can be folded later. 4431 // FIXME: Should we eventually add an alternative mapping with AGPR src 4432 // for srcA/srcB? 4433 // 4434 // vdst, srcA, srcB, srcC 4435 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 4436 OpdsMapping[0] = 4437 Info->mayNeedAGPRs() 4438 ? getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI) 4439 : getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); 4440 OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); 4441 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); 4442 OpdsMapping[4] = 4443 Info->mayNeedAGPRs() 4444 ? getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI) 4445 : getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); 4446 break; 4447 } 4448 case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16: 4449 case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16: 4450 case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16: 4451 case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16: 4452 case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8: 4453 case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8: 4454 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_bf8: 4455 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_fp8: 4456 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_bf8: 4457 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_fp8: 4458 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8: 4459 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8: 4460 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8: 4461 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8: { 4462 // vdst, srcA, srcB, srcC, idx 4463 OpdsMapping[0] = getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); 4464 OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); 4465 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); 4466 OpdsMapping[4] = getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); 4467 OpdsMapping[5] = getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI); 4468 break; 4469 } 4470 case Intrinsic::amdgcn_interp_p1: 4471 case Intrinsic::amdgcn_interp_p2: 4472 case Intrinsic::amdgcn_interp_mov: 4473 case Intrinsic::amdgcn_interp_p1_f16: 4474 case Intrinsic::amdgcn_interp_p2_f16: 4475 case Intrinsic::amdgcn_lds_param_load: { 4476 const int M0Idx = MI.getNumOperands() - 1; 4477 Register M0Reg = MI.getOperand(M0Idx).getReg(); 4478 unsigned M0Bank = getRegBankID(M0Reg, MRI, AMDGPU::SGPRRegBankID); 4479 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4480 4481 OpdsMapping[0] = 
AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize); 4482 for (int I = 2; I != M0Idx && MI.getOperand(I).isReg(); ++I) 4483 OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); 4484 4485 // Must be SGPR, but we must take whatever the original bank is and fix it 4486 // later. 4487 OpdsMapping[M0Idx] = AMDGPU::getValueMapping(M0Bank, 32); 4488 break; 4489 } 4490 case Intrinsic::amdgcn_interp_inreg_p10: 4491 case Intrinsic::amdgcn_interp_inreg_p2: 4492 case Intrinsic::amdgcn_interp_inreg_p10_f16: 4493 case Intrinsic::amdgcn_interp_inreg_p2_f16: { 4494 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4495 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize); 4496 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); 4497 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); 4498 OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); 4499 break; 4500 } 4501 case Intrinsic::amdgcn_ballot: { 4502 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4503 unsigned SrcSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits(); 4504 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize); 4505 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, SrcSize); 4506 break; 4507 } 4508 case Intrinsic::amdgcn_inverse_ballot: { 4509 // This must be an SGPR, but accept a VGPR. 4510 Register MaskReg = MI.getOperand(2).getReg(); 4511 unsigned MaskSize = MRI.getType(MaskReg).getSizeInBits(); 4512 unsigned MaskBank = getRegBankID(MaskReg, MRI, AMDGPU::SGPRRegBankID); 4513 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1); 4514 OpdsMapping[2] = AMDGPU::getValueMapping(MaskBank, MaskSize); 4515 break; 4516 } 4517 case Intrinsic::amdgcn_wave_reduce_umin: 4518 case Intrinsic::amdgcn_wave_reduce_umax: { 4519 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4520 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize); 4521 unsigned OpSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits(); 4522 auto regBankID = 4523 isSALUMapping(MI) ? AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID; 4524 OpdsMapping[2] = AMDGPU::getValueMapping(regBankID, OpSize); 4525 break; 4526 } 4527 } 4528 break; 4529 } 4530 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD: 4531 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16: 4532 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE: 4533 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16: { 4534 auto IntrID = MI.getIntrinsicID(); 4535 const AMDGPU::RsrcIntrinsic *RSrcIntrin = AMDGPU::lookupRsrcIntrinsic(IntrID); 4536 assert(RSrcIntrin && "missing RsrcIntrinsic for image intrinsic"); 4537 // Non-images can have complications from operands that allow both SGPR 4538 // and VGPR. For now it's too complicated to figure out the final opcode 4539 // to derive the register bank from the MCInstrDesc. 
4540 assert(RSrcIntrin->IsImage); 4541 return getImageMapping(MRI, MI, RSrcIntrin->RsrcArg); 4542 } 4543 case AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY: { 4544 unsigned N = MI.getNumExplicitOperands() - 2; 4545 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 128); 4546 OpdsMapping[N] = getSGPROpMapping(MI.getOperand(N).getReg(), MRI, *TRI); 4547 if (N == 3) { 4548 // Sequential form: all operands combined into VGPR256/VGPR512 4549 unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits(); 4550 if (Size > 256) 4551 Size = 512; 4552 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); 4553 } else { 4554 // NSA form 4555 for (unsigned I = 2; I < N; ++I) { 4556 unsigned Size = MRI.getType(MI.getOperand(I).getReg()).getSizeInBits(); 4557 OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); 4558 } 4559 } 4560 break; 4561 } 4562 case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: { 4563 auto IntrID = MI.getIntrinsicID(); 4564 switch (IntrID) { 4565 case Intrinsic::amdgcn_s_getreg: 4566 case Intrinsic::amdgcn_s_memtime: 4567 case Intrinsic::amdgcn_s_memrealtime: 4568 case Intrinsic::amdgcn_s_get_waveid_in_workgroup: 4569 case Intrinsic::amdgcn_s_sendmsg_rtn: { 4570 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4571 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); 4572 break; 4573 } 4574 case Intrinsic::amdgcn_global_atomic_fadd: 4575 case Intrinsic::amdgcn_global_atomic_csub: 4576 case Intrinsic::amdgcn_global_atomic_fmin: 4577 case Intrinsic::amdgcn_global_atomic_fmax: 4578 case Intrinsic::amdgcn_flat_atomic_fadd: 4579 case Intrinsic::amdgcn_flat_atomic_fmin: 4580 case Intrinsic::amdgcn_flat_atomic_fmax: 4581 case Intrinsic::amdgcn_global_atomic_fadd_v2bf16: 4582 case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16: 4583 return getDefaultMappingAllVGPR(MI); 4584 case Intrinsic::amdgcn_ds_ordered_add: 4585 case Intrinsic::amdgcn_ds_ordered_swap: 4586 case Intrinsic::amdgcn_ds_fadd_v2bf16: { 4587 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4588 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize); 4589 unsigned M0Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, 4590 AMDGPU::SGPRRegBankID); 4591 OpdsMapping[2] = AMDGPU::getValueMapping(M0Bank, 32); 4592 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); 4593 break; 4594 } 4595 case Intrinsic::amdgcn_ds_append: 4596 case Intrinsic::amdgcn_ds_consume: { 4597 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4598 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize); 4599 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); 4600 break; 4601 } 4602 case Intrinsic::amdgcn_exp_compr: 4603 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); 4604 OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); 4605 break; 4606 case Intrinsic::amdgcn_exp: 4607 // FIXME: Could we support packed types here? 
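// Each of the four export sources below is an independent 32-bit VGPR value.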
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      OpdsMapping[5] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      OpdsMapping[6] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      break;
    case Intrinsic::amdgcn_exp_row:
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      OpdsMapping[5] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      OpdsMapping[6] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      OpdsMapping[8] = getSGPROpMapping(MI.getOperand(8).getReg(), MRI, *TRI);
      break;
    case Intrinsic::amdgcn_s_sendmsg:
    case Intrinsic::amdgcn_s_sendmsghalt: {
      // This must be an SGPR, but accept a VGPR.
      unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
                                   AMDGPU::SGPRRegBankID);
      OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
      break;
    }
    case Intrinsic::amdgcn_s_setreg: {
      // This must be an SGPR, but accept a VGPR.
      unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
                                   AMDGPU::SGPRRegBankID);
      OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
      break;
    }
    case Intrinsic::amdgcn_end_cf: {
      unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
      break;
    }
    case Intrinsic::amdgcn_else: {
      unsigned WaveSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize);
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize);
      break;
    }
    case Intrinsic::amdgcn_live_mask: {
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
      break;
    }
    case Intrinsic::amdgcn_wqm_demote:
    case Intrinsic::amdgcn_kill: {
      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
      break;
    }
    case Intrinsic::amdgcn_raw_buffer_load:
    case Intrinsic::amdgcn_raw_ptr_buffer_load:
    case Intrinsic::amdgcn_raw_tbuffer_load:
    case Intrinsic::amdgcn_raw_ptr_tbuffer_load: {
      // FIXME: Should make intrinsic ID the last operand of the instruction,
      // then this would be the same as store.
      OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
      OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_raw_buffer_load_lds:
    case Intrinsic::amdgcn_raw_ptr_buffer_load_lds: {
      OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
      OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_raw_buffer_store:
    case Intrinsic::amdgcn_raw_ptr_buffer_store:
    case Intrinsic::amdgcn_raw_buffer_store_format:
    case Intrinsic::amdgcn_raw_ptr_buffer_store_format:
    case Intrinsic::amdgcn_raw_tbuffer_store:
    case Intrinsic::amdgcn_raw_ptr_tbuffer_store: {
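      // vdata goes in a VGPR; the resource descriptor and scalar offset get
      // SGPR mappings (a divergent value there is handled later, e.g. with a
      // waterfall loop), while voffset stays in a VGPR.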
      OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
      OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_struct_buffer_load:
    case Intrinsic::amdgcn_struct_ptr_buffer_load:
    case Intrinsic::amdgcn_struct_tbuffer_load:
    case Intrinsic::amdgcn_struct_ptr_tbuffer_load: {
      OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
      OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
      OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_struct_buffer_load_lds:
    case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
      OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
      OpdsMapping[5] = getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
      OpdsMapping[6] = getSGPROpMapping(MI.getOperand(6).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_struct_buffer_store:
    case Intrinsic::amdgcn_struct_ptr_buffer_store:
    case Intrinsic::amdgcn_struct_tbuffer_store:
    case Intrinsic::amdgcn_struct_ptr_tbuffer_store: {
      OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
      OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
      OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_init_exec_from_input: {
      unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
      break;
    }
    case Intrinsic::amdgcn_ds_gws_init:
    case Intrinsic::amdgcn_ds_gws_barrier:
    case Intrinsic::amdgcn_ds_gws_sema_br: {
      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);

      // This must be an SGPR, but accept a VGPR.
      unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
                                   AMDGPU::SGPRRegBankID);
      OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
      break;
    }
    case Intrinsic::amdgcn_ds_gws_sema_v:
    case Intrinsic::amdgcn_ds_gws_sema_p:
    case Intrinsic::amdgcn_ds_gws_sema_release_all: {
      // This must be an SGPR, but accept a VGPR.
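      // A VGPR here is repaired after mapping by copying the value back to an
      // SGPR with a readfirstlane, which is safe since only the first lane's
      // value is used.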
      unsigned Bank = getRegBankID(MI.getOperand(1).getReg(), MRI,
                                   AMDGPU::SGPRRegBankID);
      OpdsMapping[1] = AMDGPU::getValueMapping(Bank, 32);
      break;
    }
    case Intrinsic::amdgcn_global_load_lds: {
      OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_lds_direct_load: {
      const int M0Idx = MI.getNumOperands() - 1;
      Register M0Reg = MI.getOperand(M0Idx).getReg();
      unsigned M0Bank = getRegBankID(M0Reg, MRI, AMDGPU::SGPRRegBankID);
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();

      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
      for (int I = 2; I != M0Idx && MI.getOperand(I).isReg(); ++I)
        OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);

      // Must be SGPR, but we must take whatever the original bank is and fix it
      // later.
      OpdsMapping[M0Idx] = AMDGPU::getValueMapping(M0Bank, 32);
      break;
    }
    case Intrinsic::amdgcn_ds_add_gs_reg_rtn:
    case Intrinsic::amdgcn_ds_sub_gs_reg_rtn:
      OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
      OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      break;
    case Intrinsic::amdgcn_ds_bvh_stack_rtn: {
      OpdsMapping[0] =
          getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); // %vdst
      OpdsMapping[1] =
          getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI); // %addr
      OpdsMapping[3] =
          getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); // %addr
      OpdsMapping[4] =
          getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); // %data0
      OpdsMapping[5] =
          getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI); // %data1
      break;
    }

    default:
      return getInvalidInstructionMapping();
    }
    break;
  }
  case AMDGPU::G_SELECT: {
    unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
                                    AMDGPU::SGPRRegBankID);
    unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI,
                                    AMDGPU::SGPRRegBankID);
    bool SGPRSrcs = Op2Bank == AMDGPU::SGPRRegBankID &&
                    Op3Bank == AMDGPU::SGPRRegBankID;

    unsigned CondBankDefault = SGPRSrcs ?
      AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID;
    unsigned CondBank = getRegBankID(MI.getOperand(1).getReg(), MRI,
                                     CondBankDefault);
    if (CondBank == AMDGPU::SGPRRegBankID)
      CondBank = SGPRSrcs ? AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID;
    else if (CondBank == AMDGPU::VGPRRegBankID)
      CondBank = AMDGPU::VCCRegBankID;

    unsigned Bank = SGPRSrcs && CondBank == AMDGPU::SGPRRegBankID ?
      AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;

    assert(CondBank == AMDGPU::VCCRegBankID || CondBank == AMDGPU::SGPRRegBankID);

    // TODO: Should report 32-bit for scalar condition type.
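    // Only the scalar unit can select a full 64-bit value in one operation;
    // on the vector path the value mapping splits 64-bit operands into two
    // 32-bit halves.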
    if (Size == 64) {
      OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
      OpdsMapping[1] = AMDGPU::getValueMapping(CondBank, 1);
      OpdsMapping[2] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
      OpdsMapping[3] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
    } else {
      OpdsMapping[0] = AMDGPU::getValueMapping(Bank, Size);
      OpdsMapping[1] = AMDGPU::getValueMapping(CondBank, 1);
      OpdsMapping[2] = AMDGPU::getValueMapping(Bank, Size);
      OpdsMapping[3] = AMDGPU::getValueMapping(Bank, Size);
    }

    break;
  }

  case AMDGPU::G_SI_CALL: {
    OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 64);
    // Lie and claim everything is legal, even though some need to be
    // SGPRs. applyMapping will have to deal with it as a waterfall loop.
    OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);

    // Allow anything for implicit arguments.
    for (unsigned I = 4; I < MI.getNumOperands(); ++I) {
      if (MI.getOperand(I).isReg()) {
        Register Reg = MI.getOperand(I).getReg();
        auto OpBank = getRegBankID(Reg, MRI);
        unsigned Size = getSizeInBits(Reg, MRI, *TRI);
        OpdsMapping[I] = AMDGPU::getValueMapping(OpBank, Size);
      }
    }
    break;
  }
  case AMDGPU::G_LOAD:
  case AMDGPU::G_ZEXTLOAD:
  case AMDGPU::G_SEXTLOAD:
    return getInstrMappingForLoad(MI);

  case AMDGPU::G_ATOMICRMW_XCHG:
  case AMDGPU::G_ATOMICRMW_ADD:
  case AMDGPU::G_ATOMICRMW_SUB:
  case AMDGPU::G_ATOMICRMW_AND:
  case AMDGPU::G_ATOMICRMW_OR:
  case AMDGPU::G_ATOMICRMW_XOR:
  case AMDGPU::G_ATOMICRMW_MAX:
  case AMDGPU::G_ATOMICRMW_MIN:
  case AMDGPU::G_ATOMICRMW_UMAX:
  case AMDGPU::G_ATOMICRMW_UMIN:
  case AMDGPU::G_ATOMICRMW_FADD:
  case AMDGPU::G_ATOMICRMW_UINC_WRAP:
  case AMDGPU::G_ATOMICRMW_UDEC_WRAP:
  case AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG:
  case AMDGPU::G_AMDGPU_ATOMIC_FMIN:
  case AMDGPU::G_AMDGPU_ATOMIC_FMAX: {
    OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
    OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg());
    OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
    break;
  }
  case AMDGPU::G_ATOMIC_CMPXCHG: {
    OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
    OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg());
    OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
    OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
    break;
  }
  case AMDGPU::G_BRCOND: {
    unsigned Bank = getRegBankID(MI.getOperand(0).getReg(), MRI,
                                 AMDGPU::SGPRRegBankID);
    assert(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1);
    if (Bank != AMDGPU::SGPRRegBankID)
      Bank = AMDGPU::VCCRegBankID;

    OpdsMapping[0] = AMDGPU::getValueMapping(Bank, 1);
    break;
  }
  case AMDGPU::G_FPTRUNC_ROUND_UPWARD:
  case AMDGPU::G_FPTRUNC_ROUND_DOWNWARD:
    return getDefaultMappingVOP(MI);
  }

  return getInstructionMapping(/*ID*/1, /*Cost*/1,
                               getOperandsMapping(OpdsMapping),
                               MI.getNumOperands());
}