//===- AMDGPURegisterBankInfo.cpp -------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the RegisterBankInfo class for
/// AMDGPU.
///
/// \par
///
/// AMDGPU has unique register bank constraints that require special high level
/// strategies to deal with. There are two main true physical register banks
/// VGPR (vector), and SGPR (scalar). Additionally the VCC register bank is a
/// sort of pseudo-register bank needed to represent SGPRs used in a vector
/// boolean context. There is also the AGPR bank, which is a special purpose
/// physical register bank present on some subtargets.
///
/// Copying from VGPR to SGPR is generally illegal, unless the value is known to
/// be uniform. It is generally not valid to legalize operands by inserting
/// copies as on other targets. Operations which require uniform, SGPR operands
/// generally require scalarization by repeatedly executing the instruction,
/// activating each set of lanes using a unique set of input values. This is
/// referred to as a waterfall loop.
///
/// \par Booleans
///
/// Booleans (s1 values) require special consideration. A vector compare result
/// is naturally a bitmask with one bit per lane, in a 32 or 64-bit
/// register. These are represented with the VCC bank. During selection, we need
/// to be able to unambiguously go back from a register class to a register
/// bank. To distinguish whether an SGPR should use the SGPR or VCC register
/// bank, we need to know the use context type. An SGPR s1 value always means a
/// VCC bank value, otherwise it will be the SGPR bank. A scalar compare sets
/// SCC, which is a 1-bit unaddressable register. This will need to be copied to
/// a 32-bit virtual register. Taken together, this means we need to adjust the
/// type of boolean operations to be regbank legal. All SALU booleans need to be
/// widened to 32-bits, and all VALU booleans need to be s1 values.
///
/// A noteworthy exception to the s1-means-vcc rule is for legalization artifact
/// casts. G_TRUNC s1 results, and G_SEXT/G_ZEXT/G_ANYEXT sources are never vcc
/// bank. A non-boolean source (such as a truncate from a 1-bit load from
/// memory) will require a copy to the VCC bank which will require clearing the
/// high bits and inserting a compare.
///
/// \par Constant bus restriction
///
/// VALU instructions have a limitation known as the constant bus
/// restriction. Most VALU instructions can use SGPR operands, but may read at
/// most 1 SGPR or constant literal value (this is relaxed to 2 in gfx10 for
/// most instructions). This is one unique SGPR, so the same SGPR may be used
/// for multiple operands. From a register bank perspective, any combination of
/// operands should be legal as an SGPR, but this is contextually dependent on
/// the SGPR operands all being the same register. It is therefore optimal to
/// choose the SGPR with the most uses to minimize the number of copies.
///
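/// For example, pre-gfx10
///   v_fma_f32 v0, s0, s1, v2
/// is illegal because it reads two unique SGPRs, while
///   v_fma_f32 v0, s0, s0, v2
/// is legal since the single unique SGPR only counts once against the
/// constant bus.
///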
/// We avoid trying to solve this problem in RegBankSelect. Any VALU G_*
/// operation should have its source operands all mapped to VGPRs (except for
/// VCC), inserting copies from any SGPR operands. This is the most trivial
/// legal mapping. Anything beyond the simplest 1:1 instruction selection would
/// be too complicated to solve here. Every optimization pattern or instruction
/// selected to multiple outputs would have to enforce this rule, and there
/// would be additional complexity in tracking this rule for every G_*
/// operation. By forcing all inputs to VGPRs, it also simplifies the task of
/// picking the optimal operand combination from a post-isel optimization pass.
///
//===----------------------------------------------------------------------===//

#include "AMDGPURegisterBankInfo.h"

#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUInstrInfo.h"
#include "GCNSubtarget.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/RegisterBank.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"

#define GET_TARGET_REGBANK_IMPL
#include "AMDGPUGenRegisterBank.inc"

// This file will be TableGen'ed at some point.
#include "AMDGPUGenRegisterBankInfo.def"

using namespace llvm;
using namespace MIPatternMatch;

namespace {

// Observer to apply a register bank to new registers created by
// LegalizerHelper.
class ApplyRegBankMapping final : public GISelChangeObserver {
private:
  const AMDGPURegisterBankInfo &RBI;
  MachineRegisterInfo &MRI;
  const RegisterBank *NewBank;
  SmallVector<MachineInstr *, 4> NewInsts;

public:
  ApplyRegBankMapping(const AMDGPURegisterBankInfo &RBI_,
                      MachineRegisterInfo &MRI_, const RegisterBank *RB)
      : RBI(RBI_), MRI(MRI_), NewBank(RB) {}

  ~ApplyRegBankMapping() {
    for (MachineInstr *MI : NewInsts)
      applyBank(*MI);
  }

  /// Set any registers that don't have a set register class or bank to SALU.
  void applyBank(MachineInstr &MI) {
    const unsigned Opc = MI.getOpcode();
    if (Opc == AMDGPU::G_ANYEXT || Opc == AMDGPU::G_ZEXT ||
        Opc == AMDGPU::G_SEXT) {
      // LegalizerHelper wants to use the basic legalization artifacts when
      // widening etc. We don't handle selection with vcc in artifact sources,
      // so we need to use a select instead to handle these properly.
      Register DstReg = MI.getOperand(0).getReg();
      Register SrcReg = MI.getOperand(1).getReg();
      const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, MRI, *RBI.TRI);
      if (SrcBank == &AMDGPU::VCCRegBank) {
        const LLT S32 = LLT::scalar(32);
        assert(MRI.getType(SrcReg) == LLT::scalar(1));
        assert(MRI.getType(DstReg) == S32);
        assert(NewBank == &AMDGPU::VGPRRegBank);

        // Replace the extension with a select, which really uses the boolean
        // source.
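        // Sketched in MIR-like notation, the rewrite turns
        //   %r:vgpr(s32) = G_SEXT %c:vcc(s1)
        // into
        //   %r:vgpr(s32) = G_SELECT %c:vcc(s1), -1, 0
        // (with 1 instead of -1 for G_ZEXT/G_ANYEXT).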
        MachineIRBuilder B(MI);
        auto True = B.buildConstant(S32, Opc == AMDGPU::G_SEXT ? -1 : 1);
        auto False = B.buildConstant(S32, 0);
        B.buildSelect(DstReg, SrcReg, True, False);
        MRI.setRegBank(True.getReg(0), *NewBank);
        MRI.setRegBank(False.getReg(0), *NewBank);
        MI.eraseFromParent();
      }

      assert(!MRI.getRegClassOrRegBank(DstReg));
      MRI.setRegBank(DstReg, *NewBank);
      return;
    }

#ifndef NDEBUG
    if (Opc == AMDGPU::G_TRUNC) {
      Register DstReg = MI.getOperand(0).getReg();
      const RegisterBank *DstBank = RBI.getRegBank(DstReg, MRI, *RBI.TRI);
      assert(DstBank != &AMDGPU::VCCRegBank);
    }
#endif

    for (MachineOperand &Op : MI.operands()) {
      if (!Op.isReg())
        continue;

      // We may see physical registers if building a real MI
      Register Reg = Op.getReg();
      if (Reg.isPhysical() || MRI.getRegClassOrRegBank(Reg))
        continue;

      const RegisterBank *RB = NewBank;
      if (MRI.getType(Reg) == LLT::scalar(1)) {
        assert(NewBank == &AMDGPU::VGPRRegBank &&
               "s1 operands should only be used for vector bools");
        assert((MI.getOpcode() != AMDGPU::G_TRUNC &&
                MI.getOpcode() != AMDGPU::G_ANYEXT) &&
               "not expecting legalization artifacts here");
        RB = &AMDGPU::VCCRegBank;
      }

      MRI.setRegBank(Reg, *RB);
    }
  }

  void erasingInstr(MachineInstr &MI) override {}

  void createdInstr(MachineInstr &MI) override {
    // At this point, the instruction was just inserted and has no operands.
    NewInsts.push_back(&MI);
  }

  void changingInstr(MachineInstr &MI) override {}
  void changedInstr(MachineInstr &MI) override {
    // FIXME: In principle we should probably add the instruction to NewInsts,
    // but the way the LegalizerHelper uses the observer, we will always see the
    // registers we need to set the regbank on also referenced in a new
    // instruction.
  }
};

} // anonymous namespace

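// The observer above is typically attached to a MachineIRBuilder so that
// anything built while rewriting an instruction gets the chosen bank applied
// when the observer is destroyed, e.g. (mirroring the uses later in this
// file):
//   ApplyRegBankMapping ApplyBank(*this, MRI, &AMDGPU::SGPRRegBank);
//   MachineIRBuilder B(MI, ApplyBank);
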
AMDGPURegisterBankInfo::AMDGPURegisterBankInfo(const GCNSubtarget &ST)
    : AMDGPUGenRegisterBankInfo(),
      Subtarget(ST),
      TRI(Subtarget.getRegisterInfo()),
      TII(Subtarget.getInstrInfo()) {

  // HACK: Until this is fully tablegen'd.
  static llvm::once_flag InitializeRegisterBankFlag;

  static auto InitializeRegisterBankOnce = [this]() {
    assert(&getRegBank(AMDGPU::SGPRRegBankID) == &AMDGPU::SGPRRegBank &&
           &getRegBank(AMDGPU::VGPRRegBankID) == &AMDGPU::VGPRRegBank &&
           &getRegBank(AMDGPU::AGPRRegBankID) == &AMDGPU::AGPRRegBank);
    (void)this;
  };

  llvm::call_once(InitializeRegisterBankFlag, InitializeRegisterBankOnce);
}

static bool isVectorRegisterBank(const RegisterBank &Bank) {
  unsigned BankID = Bank.getID();
  return BankID == AMDGPU::VGPRRegBankID || BankID == AMDGPU::AGPRRegBankID;
}

unsigned AMDGPURegisterBankInfo::copyCost(const RegisterBank &Dst,
                                          const RegisterBank &Src,
                                          unsigned Size) const {
  // TODO: Should there be a UniformVGPRRegBank which can use readfirstlane?
  if (Dst.getID() == AMDGPU::SGPRRegBankID &&
      (isVectorRegisterBank(Src) || Src.getID() == AMDGPU::VCCRegBankID)) {
    return std::numeric_limits<unsigned>::max();
  }

  // Bool values are tricky, because the meaning is based on context. The SCC
  // and VCC banks are for the natural scalar and vector conditions produced by
  // a compare.
  //
  // Legalization doesn't know about the necessary context, so an s1 use may
  // have been a truncate from an arbitrary value, in which case a copy (lowered
  // as a compare with 0) needs to be inserted.
  if (Size == 1 &&
      (Dst.getID() == AMDGPU::SGPRRegBankID) &&
      (isVectorRegisterBank(Src) ||
       Src.getID() == AMDGPU::SGPRRegBankID ||
       Src.getID() == AMDGPU::VCCRegBankID))
    return std::numeric_limits<unsigned>::max();

  // There is no direct copy between AGPRs.
  if (Dst.getID() == AMDGPU::AGPRRegBankID &&
      Src.getID() == AMDGPU::AGPRRegBankID)
    return 4;

  return RegisterBankInfo::copyCost(Dst, Src, Size);
}

unsigned AMDGPURegisterBankInfo::getBreakDownCost(
  const ValueMapping &ValMapping,
  const RegisterBank *CurBank) const {
  // Check if this is a breakdown for G_LOAD to move the pointer from SGPR to
  // VGPR.
  // FIXME: Is there a better way to do this?
  if (ValMapping.NumBreakDowns >= 2 || ValMapping.BreakDown[0].Length >= 64)
    return 10; // This is expensive.

  assert(ValMapping.NumBreakDowns == 2 &&
         ValMapping.BreakDown[0].Length == 32 &&
         ValMapping.BreakDown[0].StartIdx == 0 &&
         ValMapping.BreakDown[1].Length == 32 &&
         ValMapping.BreakDown[1].StartIdx == 32 &&
         ValMapping.BreakDown[0].RegBank == ValMapping.BreakDown[1].RegBank);

  // 32-bit extract of a 64-bit value is just access of a subregister, so free.
  // TODO: Cost of 0 hits assert, though it's not clear it's what we really
  // want.

  // TODO: 32-bit insert to a 64-bit SGPR may incur a non-free copy due to SGPR
  // alignment restrictions, but this probably isn't important.
  return 1;
}

const RegisterBank &
AMDGPURegisterBankInfo::getRegBankFromRegClass(const TargetRegisterClass &RC,
                                               LLT Ty) const {
  if (&RC == &AMDGPU::SReg_1RegClass)
    return AMDGPU::VCCRegBank;

  // We promote real scalar booleans to SReg_32. Any SGPR using s1 is really a
  // VCC-like use.
  if (TRI->isSGPRClass(&RC)) {
    // FIXME: This probably came from a copy from a physical register, which
    // should be inferable from the copied to-type. We don't have many boolean
    // physical register constraints so just assume a normal SGPR for now.
    if (!Ty.isValid())
      return AMDGPU::SGPRRegBank;

    return Ty == LLT::scalar(1) ? AMDGPU::VCCRegBank : AMDGPU::SGPRRegBank;
  }

  return TRI->isAGPRClass(&RC) ? AMDGPU::AGPRRegBank : AMDGPU::VGPRRegBank;
}

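// Build alternative instruction mappings from a table of per-operand bank
// assignments. For example, an OpRegBankEntry<2> of
//   { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 300 }
// maps the first listed register operand to an SGPR and the second to a VGPR,
// with a cost of 300.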
template <unsigned NumOps>
RegisterBankInfo::InstructionMappings
AMDGPURegisterBankInfo::addMappingFromTable(
    const MachineInstr &MI, const MachineRegisterInfo &MRI,
    const std::array<unsigned, NumOps> RegSrcOpIdx,
    ArrayRef<OpRegBankEntry<NumOps>> Table) const {

  InstructionMappings AltMappings;

  SmallVector<const ValueMapping *, 10> Operands(MI.getNumOperands());

  unsigned Sizes[NumOps];
  for (unsigned I = 0; I < NumOps; ++I) {
    Register Reg = MI.getOperand(RegSrcOpIdx[I]).getReg();
    Sizes[I] = getSizeInBits(Reg, MRI, *TRI);
  }

  for (unsigned I = 0, E = MI.getNumExplicitDefs(); I != E; ++I) {
    unsigned SizeI = getSizeInBits(MI.getOperand(I).getReg(), MRI, *TRI);
    Operands[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SizeI);
  }

  // getInstrMapping's default mapping uses ID 1, so start at 2.
  unsigned MappingID = 2;
  for (const auto &Entry : Table) {
    for (unsigned I = 0; I < NumOps; ++I) {
      int OpIdx = RegSrcOpIdx[I];
      Operands[OpIdx] = AMDGPU::getValueMapping(Entry.RegBanks[I], Sizes[I]);
    }

    AltMappings.push_back(&getInstructionMapping(MappingID++, Entry.Cost,
                                                 getOperandsMapping(Operands),
                                                 Operands.size()));
  }

  return AltMappings;
}

RegisterBankInfo::InstructionMappings
AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsic(
    const MachineInstr &MI, const MachineRegisterInfo &MRI) const {
  switch (MI.getIntrinsicID()) {
  case Intrinsic::amdgcn_readlane: {
    static const OpRegBankEntry<3> Table[2] = {
      // Perfectly legal.
      { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },

      // Need a readfirstlane for the index.
      { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 }
    };

    const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 2, 3 } };
    return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
  }
  case Intrinsic::amdgcn_writelane: {
    static const OpRegBankEntry<4> Table[4] = {
      // Perfectly legal.
      { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },

      // Need readfirstlane of first op
      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 },

      // Need readfirstlane of second op
      { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 },

      // Need readfirstlane of both ops
      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 3 }
    };

    // dst, data, lane select, vdst.in
    const std::array<unsigned, 4> RegSrcOpIdx = { { 0, 2, 3, 4 } };
    return addMappingFromTable<4>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
  }
  default:
    return RegisterBankInfo::getInstrAlternativeMappings(MI);
  }
}

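// Note: alternative mappings like the tables above are generally only
// consulted when RegBankSelect runs in greedy mode; fast mode just applies
// the default mapping from getInstrMapping.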
RegisterBankInfo::InstructionMappings
AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsicWSideEffects(
    const MachineInstr &MI, const MachineRegisterInfo &MRI) const {

  switch (MI.getIntrinsicID()) {
  case Intrinsic::amdgcn_s_buffer_load: {
    static const OpRegBankEntry<2> Table[4] = {
      // Perfectly legal.
      { { AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },

      // Only need 1 register in loop
      { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 300 },

      // Have to waterfall the resource.
      { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1000 },

      // Have to waterfall the resource, and the offset.
      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1500 }
    };

    // rsrc, offset
    const std::array<unsigned, 2> RegSrcOpIdx = { { 2, 3 } };
    return addMappingFromTable<2>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
  }
  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap: {
    // VGPR = M0, VGPR
    static const OpRegBankEntry<3> Table[2] = {
      // Perfectly legal.
      { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },

      // Need a readfirstlane for m0
      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 }
    };

    const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 2, 3 } };
    return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
  }
  case Intrinsic::amdgcn_s_sendmsg:
  case Intrinsic::amdgcn_s_sendmsghalt: {
    // FIXME: Should have no register for immediate
    static const OpRegBankEntry<1> Table[2] = {
      // Perfectly legal.
      { { AMDGPU::SGPRRegBankID }, 1 },

      // Need readlane
      { { AMDGPU::VGPRRegBankID }, 3 }
    };

    const std::array<unsigned, 1> RegSrcOpIdx = { { 2 } };
    return addMappingFromTable<1>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
  }
  default:
    return RegisterBankInfo::getInstrAlternativeMappings(MI);
  }
}

static bool memOpHasNoClobbered(const MachineMemOperand *MMO) {
  const Instruction *I = dyn_cast_or_null<Instruction>(MMO->getValue());
  return I && I->getMetadata("amdgpu.noclobber");
}

// FIXME: Returns uniform if there's no source value information. This is
// probably wrong.
static bool isScalarLoadLegal(const MachineInstr &MI) {
  if (!MI.hasOneMemOperand())
    return false;

  const MachineMemOperand *MMO = *MI.memoperands_begin();
  const unsigned AS = MMO->getAddrSpace();
  const bool IsConst = AS == AMDGPUAS::CONSTANT_ADDRESS ||
                       AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT;
  // Require 4-byte alignment.
  return MMO->getAlign() >= Align(4) &&
         // Can't do a scalar atomic load.
         !MMO->isAtomic() &&
         // Don't use scalar loads for volatile accesses to non-constant
         // address spaces.
         (IsConst || !MMO->isVolatile()) &&
         // Memory must be known constant, or not written before this load.
         (IsConst || MMO->isInvariant() || memOpHasNoClobbered(MMO)) &&
         AMDGPUInstrInfo::isUniformMMO(MMO);
}

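// For example (an IR-level sketch), a load passing all of the checks above:
//   %v = load i32, i32 addrspace(4)* %p, align 4
// i.e. a naturally aligned, non-atomic, non-volatile load of a uniform
// pointer into the constant address space (addrspace 4 on amdgcn).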
RegisterBankInfo::InstructionMappings
AMDGPURegisterBankInfo::getInstrAlternativeMappings(
    const MachineInstr &MI) const {

  const MachineFunction &MF = *MI.getParent()->getParent();
  const MachineRegisterInfo &MRI = MF.getRegInfo();

  InstructionMappings AltMappings;
  switch (MI.getOpcode()) {
  case TargetOpcode::G_CONSTANT: {
    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
    if (Size == 1) {
      static const OpRegBankEntry<1> Table[3] = {
        { { AMDGPU::VGPRRegBankID }, 1 },
        { { AMDGPU::SGPRRegBankID }, 1 },
        { { AMDGPU::VCCRegBankID }, 1 }
      };

      return addMappingFromTable<1>(MI, MRI, {{ 0 }}, Table);
    }

    LLVM_FALLTHROUGH;
  }
  case TargetOpcode::G_FCONSTANT:
  case TargetOpcode::G_FRAME_INDEX:
  case TargetOpcode::G_GLOBAL_VALUE: {
    static const OpRegBankEntry<1> Table[2] = {
      { { AMDGPU::VGPRRegBankID }, 1 },
      { { AMDGPU::SGPRRegBankID }, 1 }
    };

    return addMappingFromTable<1>(MI, MRI, {{ 0 }}, Table);
  }
  case TargetOpcode::G_AND:
  case TargetOpcode::G_OR:
  case TargetOpcode::G_XOR: {
    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);

    if (Size == 1) {
      // s_{and|or|xor}_b32 set scc when the result of the 32-bit op is not 0.
      const InstructionMapping &SCCMapping = getInstructionMapping(
        1, 1, getOperandsMapping(
          {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32),
           AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32),
           AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32)}),
        3); // Num Operands
      AltMappings.push_back(&SCCMapping);

      const InstructionMapping &VCCMapping0 = getInstructionMapping(
        2, 1, getOperandsMapping(
          {AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size),
           AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size),
           AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size)}),
        3); // Num Operands
      AltMappings.push_back(&VCCMapping0);
      return AltMappings;
    }

    if (Size != 64)
      break;

    const InstructionMapping &SSMapping = getInstructionMapping(
      1, 1, getOperandsMapping(
        {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
      3); // Num Operands
    AltMappings.push_back(&SSMapping);

    const InstructionMapping &VVMapping = getInstructionMapping(
      2, 2, getOperandsMapping(
        {AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
         AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
         AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}),
      3); // Num Operands
    AltMappings.push_back(&VVMapping);
    break;
  }
  case TargetOpcode::G_LOAD:
  case TargetOpcode::G_ZEXTLOAD:
  case TargetOpcode::G_SEXTLOAD: {
    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
    LLT PtrTy = MRI.getType(MI.getOperand(1).getReg());
    unsigned PtrSize = PtrTy.getSizeInBits();
    unsigned AS = PtrTy.getAddressSpace();

    if ((AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::REGION_ADDRESS &&
         AS != AMDGPUAS::PRIVATE_ADDRESS) &&
        isScalarLoadLegal(MI)) {
      const InstructionMapping &SSMapping = getInstructionMapping(
        1, 1, getOperandsMapping(
          {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
           AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize)}),
        2); // Num Operands
      AltMappings.push_back(&SSMapping);
    }

    const InstructionMapping &VVMapping = getInstructionMapping(
      2, 1,
      getOperandsMapping(
        {AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
         AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize)}),
      2); // Num Operands
    AltMappings.push_back(&VVMapping);

    // It may be possible to have a vgpr = load sgpr mapping here, because
    // the mubuf instructions support this kind of load, but probably for only
    // gfx7 and older. However, the addressing mode matching in the instruction
    // selector should be able to do a better job of detecting and selecting
    // these kinds of loads from the vgpr = load vgpr mapping.

    return AltMappings;
  }
  case TargetOpcode::G_SELECT: {
    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
    const InstructionMapping &SSMapping = getInstructionMapping(1, 1,
      getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1),
                          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
      4); // Num Operands
    AltMappings.push_back(&SSMapping);

    const InstructionMapping &VVMapping = getInstructionMapping(2, 1,
      getOperandsMapping({AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
                          AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}),
      4); // Num Operands
    AltMappings.push_back(&VVMapping);

    return AltMappings;
  }
  case TargetOpcode::G_UADDE:
  case TargetOpcode::G_USUBE:
  case TargetOpcode::G_SADDE:
  case TargetOpcode::G_SSUBE: {
    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
    const InstructionMapping &SSMapping = getInstructionMapping(1, 1,
      getOperandsMapping(
        {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1),
         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1)}),
      5); // Num Operands
    AltMappings.push_back(&SSMapping);

    const InstructionMapping &VVMapping = getInstructionMapping(2, 1,
      getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
                          AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1)}),
      5); // Num Operands
    AltMappings.push_back(&VVMapping);
    return AltMappings;
  }
  case AMDGPU::G_BRCOND: {
    assert(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1);

    // TODO: Change type to 32 for scalar
    const InstructionMapping &SMapping = getInstructionMapping(
      1, 1, getOperandsMapping(
        {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1), nullptr}),
      2); // Num Operands
    AltMappings.push_back(&SMapping);

    const InstructionMapping &VMapping = getInstructionMapping(
      1, 1, getOperandsMapping(
        {AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1), nullptr}),
      2); // Num Operands
    AltMappings.push_back(&VMapping);
    return AltMappings;
  }
  case AMDGPU::G_INTRINSIC:
    return getInstrAlternativeMappingsIntrinsic(MI, MRI);
  case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS:
    return getInstrAlternativeMappingsIntrinsicWSideEffects(MI, MRI);
  default:
    break;
  }
  return RegisterBankInfo::getInstrAlternativeMappings(MI);
}

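// A sketch of what the helper below produces, in MIR-like notation:
//   %lo:<bank>(s32), %hi:<bank>(s32) = G_UNMERGE_VALUES %reg:<bank>(s64)
// with both halves inheriting the bank of the input register.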
void AMDGPURegisterBankInfo::split64BitValueForMapping(
  MachineIRBuilder &B,
  SmallVector<Register, 2> &Regs,
  LLT HalfTy,
  Register Reg) const {
  assert(HalfTy.getSizeInBits() == 32);
  MachineRegisterInfo *MRI = B.getMRI();
  Register LoLHS = MRI->createGenericVirtualRegister(HalfTy);
  Register HiLHS = MRI->createGenericVirtualRegister(HalfTy);
  const RegisterBank *Bank = getRegBank(Reg, *MRI, *TRI);
  MRI->setRegBank(LoLHS, *Bank);
  MRI->setRegBank(HiLHS, *Bank);

  Regs.push_back(LoLHS);
  Regs.push_back(HiLHS);

  B.buildInstr(AMDGPU::G_UNMERGE_VALUES)
    .addDef(LoLHS)
    .addDef(HiLHS)
    .addUse(Reg);
}

/// Replace the current type each register in \p Regs has with \p NewTy
static void setRegsToType(MachineRegisterInfo &MRI, ArrayRef<Register> Regs,
                          LLT NewTy) {
  for (Register Reg : Regs) {
    assert(MRI.getType(Reg).getSizeInBits() == NewTy.getSizeInBits());
    MRI.setType(Reg, NewTy);
  }
}

static LLT getHalfSizedType(LLT Ty) {
  if (Ty.isVector()) {
    assert(Ty.getElementCount().isKnownMultipleOf(2));
    return LLT::scalarOrVector(Ty.getElementCount().divideCoefficientBy(2),
                               Ty.getElementType());
  }

  assert(Ty.getScalarSizeInBits() % 2 == 0);
  return LLT::scalar(Ty.getScalarSizeInBits() / 2);
}

/// Legalize instruction \p MI where operands in \p OpIndices must be SGPRs. If
/// any of the required SGPR operands are VGPRs, perform a waterfall loop to
/// execute the instruction for each unique combination of values in all lanes
/// in the wave. The block will be split such that the rest of the instructions
/// are moved to a new block.
///
/// Essentially performs this loop:
///
/// Save Execution Mask
/// For (Lane : Wavefront) {
///   Enable Lane, Disable all other lanes
///   SGPR = read SGPR value for current lane from VGPR
///   VGPRResult[Lane] = use_op SGPR
/// }
/// Restore Execution Mask
///
/// There is additional complexity in that the compares identify whole sets of
/// lanes sharing the same value, so each unique value only needs one
/// iteration.
bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
  MachineIRBuilder &B,
  iterator_range<MachineBasicBlock::iterator> Range,
  SmallSet<Register, 4> &SGPROperandRegs,
  MachineRegisterInfo &MRI) const {
  SmallVector<Register, 4> ResultRegs;
  SmallVector<Register, 4> InitResultRegs;
  SmallVector<Register, 4> PhiRegs;

  // Track use registers which have already been expanded with a readfirstlane
  // sequence. This may have multiple uses if moving a sequence.
  DenseMap<Register, Register> WaterfalledRegMap;

  MachineBasicBlock &MBB = B.getMBB();
  MachineFunction *MF = &B.getMF();

  const TargetRegisterClass *WaveRC = TRI->getWaveMaskRegClass();
  const unsigned WaveAndOpc = Subtarget.isWave32() ?
    AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
  const unsigned MovTermOpc = Subtarget.isWave32() ?
    AMDGPU::S_MOV_B32_term : AMDGPU::S_MOV_B64_term;
  const unsigned XorTermOpc = Subtarget.isWave32() ?
    AMDGPU::S_XOR_B32_term : AMDGPU::S_XOR_B64_term;
  const unsigned AndSaveExecOpc = Subtarget.isWave32() ?
    AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64;
  const unsigned ExecReg = Subtarget.isWave32() ?
    AMDGPU::EXEC_LO : AMDGPU::EXEC;
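
  // Note: the exec mask and the per-lane condition masks are 32 bits wide in
  // wave32 mode, hence the opcode and exec register selection above.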

#ifndef NDEBUG
  const int OrigRangeSize = std::distance(Range.begin(), Range.end());
#endif

  for (MachineInstr &MI : Range) {
    for (MachineOperand &Def : MI.defs()) {
      if (MRI.use_nodbg_empty(Def.getReg()))
        continue;

      LLT ResTy = MRI.getType(Def.getReg());
      const RegisterBank *DefBank = getRegBank(Def.getReg(), MRI, *TRI);
      ResultRegs.push_back(Def.getReg());
      Register InitReg = B.buildUndef(ResTy).getReg(0);
      Register PhiReg = MRI.createGenericVirtualRegister(ResTy);
      InitResultRegs.push_back(InitReg);
      PhiRegs.push_back(PhiReg);
      MRI.setRegBank(PhiReg, *DefBank);
      MRI.setRegBank(InitReg, *DefBank);
    }
  }

  Register SaveExecReg = MRI.createVirtualRegister(WaveRC);
  Register InitSaveExecReg = MRI.createVirtualRegister(WaveRC);

  // Don't bother using generic instructions/registers for the exec mask.
  B.buildInstr(TargetOpcode::IMPLICIT_DEF)
    .addDef(InitSaveExecReg);

  Register PhiExec = MRI.createVirtualRegister(WaveRC);
  Register NewExec = MRI.createVirtualRegister(WaveRC);

  // To insert the loop we need to split the block. Move everything after the
  // range into a new remainder block, and insert empty loop and exec-restore
  // blocks between it and the current block.
  MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
  MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
  MachineBasicBlock *RestoreExecBB = MF->CreateMachineBasicBlock();
  MachineFunction::iterator MBBI(MBB);
  ++MBBI;
  MF->insert(MBBI, LoopBB);
  MF->insert(MBBI, RestoreExecBB);
  MF->insert(MBBI, RemainderBB);

  LoopBB->addSuccessor(RestoreExecBB);
  LoopBB->addSuccessor(LoopBB);

  // Move the rest of the block into a new block.
  RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
  RemainderBB->splice(RemainderBB->begin(), &MBB, Range.end(), MBB.end());

  MBB.addSuccessor(LoopBB);
  RestoreExecBB->addSuccessor(RemainderBB);

  B.setInsertPt(*LoopBB, LoopBB->end());

  B.buildInstr(TargetOpcode::PHI)
    .addDef(PhiExec)
    .addReg(InitSaveExecReg)
    .addMBB(&MBB)
    .addReg(NewExec)
    .addMBB(LoopBB);

  for (auto Result : zip(InitResultRegs, ResultRegs, PhiRegs)) {
    B.buildInstr(TargetOpcode::G_PHI)
      .addDef(std::get<2>(Result))
      .addReg(std::get<0>(Result)) // Initial value / implicit_def
      .addMBB(&MBB)
      .addReg(std::get<1>(Result)) // Mid-loop value.
      .addMBB(LoopBB);
  }

  const DebugLoc &DL = B.getDL();

  MachineInstr &FirstInst = *Range.begin();

  // Move the instruction into the loop. Note we moved everything after
  // Range.end() already into a new block, so Range.end() is no longer valid.
  LoopBB->splice(LoopBB->end(), &MBB, Range.begin(), MBB.end());

  // Figure out the iterator range after splicing the instructions.
  MachineBasicBlock::iterator NewBegin = FirstInst.getIterator();
  auto NewEnd = LoopBB->end();

  MachineBasicBlock::iterator I = Range.begin();
  B.setInsertPt(*LoopBB, I);

  Register CondReg;

  assert(std::distance(NewBegin, NewEnd) == OrigRangeSize);

  for (MachineInstr &MI : make_range(NewBegin, NewEnd)) {
    for (MachineOperand &Op : MI.uses()) {
      if (!Op.isReg() || Op.isDef())
        continue;

      Register OldReg = Op.getReg();
      if (!SGPROperandRegs.count(OldReg))
        continue;

      // See if we already processed this register in another instruction in
      // the sequence.
      auto OldVal = WaterfalledRegMap.find(OldReg);
      if (OldVal != WaterfalledRegMap.end()) {
        Op.setReg(OldVal->second);
        continue;
      }

      Register OpReg = Op.getReg();
      LLT OpTy = MRI.getType(OpReg);

      const RegisterBank *OpBank = getRegBank(OpReg, MRI, *TRI);
      if (OpBank != &AMDGPU::VGPRRegBank) {
        // Insert copy from AGPR to VGPR before the loop.
        B.setMBB(MBB);
        OpReg = B.buildCopy(OpTy, OpReg).getReg(0);
        MRI.setRegBank(OpReg, AMDGPU::VGPRRegBank);
        B.setInstr(*I);
      }

      unsigned OpSize = OpTy.getSizeInBits();

      // Can only do a readlane of 32-bit pieces.
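      // The per-iteration pattern for a single 32-bit operand is (sketch):
      //   %cur:sreg_32 = V_READFIRSTLANE_B32 %op:vgpr_32
      //   %cond       = V_CMP_EQ_U32_e64 %cur, %op
      // so every lane whose value matches the first active lane is handled
      // in the same trip of the loop.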
      if (OpSize == 32) {
        // Avoid extra copies in the simple case of one 32-bit register.
        Register CurrentLaneOpReg
          = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
        MRI.setType(CurrentLaneOpReg, OpTy);

        constrainGenericRegister(OpReg, AMDGPU::VGPR_32RegClass, MRI);
        // Read the next variant <- also loop target.
        BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
                CurrentLaneOpReg)
          .addReg(OpReg);

        Register NewCondReg = MRI.createVirtualRegister(WaveRC);
        bool First = CondReg == AMDGPU::NoRegister;
        if (First)
          CondReg = NewCondReg;

        // Compare the just-read value to the value in each lane.
        B.buildInstr(AMDGPU::V_CMP_EQ_U32_e64)
          .addDef(NewCondReg)
          .addReg(CurrentLaneOpReg)
          .addReg(OpReg);
        Op.setReg(CurrentLaneOpReg);

        if (!First) {
          Register AndReg = MRI.createVirtualRegister(WaveRC);

          // If there are multiple operands to consider, AND the conditions
          // together.
          B.buildInstr(WaveAndOpc)
            .addDef(AndReg)
            .addReg(NewCondReg)
            .addReg(CondReg);
          CondReg = AndReg;
        }
      } else {
        LLT S32 = LLT::scalar(32);
        SmallVector<Register, 8> ReadlanePieces;

        // The compares can be done as 64-bit, but the extract needs to be done
        // in 32-bit pieces.

        bool Is64 = OpSize % 64 == 0;

        LLT UnmergeTy = Is64 ? LLT::scalar(64) : LLT::scalar(32);
        unsigned CmpOp = Is64 ? AMDGPU::V_CMP_EQ_U64_e64
                              : AMDGPU::V_CMP_EQ_U32_e64;

        // Insert the unmerge before the loop.
        B.setMBB(MBB);
        auto Unmerge = B.buildUnmerge(UnmergeTy, OpReg);
        B.setInstr(*I);

        unsigned NumPieces = Unmerge->getNumOperands() - 1;
        for (unsigned PieceIdx = 0; PieceIdx != NumPieces; ++PieceIdx) {
          Register UnmergePiece = Unmerge.getReg(PieceIdx);

          Register CurrentLaneOpReg;
          if (Is64) {
            Register CurrentLaneOpRegLo = MRI.createGenericVirtualRegister(S32);
            Register CurrentLaneOpRegHi = MRI.createGenericVirtualRegister(S32);

            MRI.setRegClass(UnmergePiece, &AMDGPU::VReg_64RegClass);
            MRI.setRegClass(CurrentLaneOpRegLo, &AMDGPU::SReg_32_XM0RegClass);
            MRI.setRegClass(CurrentLaneOpRegHi, &AMDGPU::SReg_32_XM0RegClass);

            // Read the next variant <- also loop target.
            BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
                    CurrentLaneOpRegLo)
              .addReg(UnmergePiece, 0, AMDGPU::sub0);

            // Read the next variant <- also loop target.
            BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
                    CurrentLaneOpRegHi)
              .addReg(UnmergePiece, 0, AMDGPU::sub1);

            CurrentLaneOpReg =
              B.buildMerge(LLT::scalar(64),
                           {CurrentLaneOpRegLo, CurrentLaneOpRegHi})
                .getReg(0);

            MRI.setRegClass(CurrentLaneOpReg, &AMDGPU::SReg_64_XEXECRegClass);

            if (OpTy.getScalarSizeInBits() == 64) {
              // If we need to produce a 64-bit element vector, use the merged
              // pieces.
              ReadlanePieces.push_back(CurrentLaneOpReg);
            } else {
              // 32-bit element type.
              ReadlanePieces.push_back(CurrentLaneOpRegLo);
              ReadlanePieces.push_back(CurrentLaneOpRegHi);
            }
          } else {
            CurrentLaneOpReg = MRI.createGenericVirtualRegister(S32);
            MRI.setRegClass(UnmergePiece, &AMDGPU::VGPR_32RegClass);
            MRI.setRegClass(CurrentLaneOpReg, &AMDGPU::SReg_32_XM0RegClass);

            // Read the next variant <- also loop target.
            BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
                    CurrentLaneOpReg)
              .addReg(UnmergePiece);
            ReadlanePieces.push_back(CurrentLaneOpReg);
          }

          Register NewCondReg = MRI.createVirtualRegister(WaveRC);
          bool First = CondReg == AMDGPU::NoRegister;
          if (First)
            CondReg = NewCondReg;

          B.buildInstr(CmpOp)
            .addDef(NewCondReg)
            .addReg(CurrentLaneOpReg)
            .addReg(UnmergePiece);

          if (!First) {
            Register AndReg = MRI.createVirtualRegister(WaveRC);

            // If there are multiple operands to consider, AND the conditions
            // together.
            B.buildInstr(WaveAndOpc)
              .addDef(AndReg)
              .addReg(NewCondReg)
              .addReg(CondReg);
            CondReg = AndReg;
          }
        }

        // FIXME: Build merge seems to switch to CONCAT_VECTORS but not
        // BUILD_VECTOR
        if (OpTy.isVector()) {
          auto Merge = B.buildBuildVector(OpTy, ReadlanePieces);
          Op.setReg(Merge.getReg(0));
        } else {
          auto Merge = B.buildMerge(OpTy, ReadlanePieces);
          Op.setReg(Merge.getReg(0));
        }

        MRI.setRegBank(Op.getReg(), AMDGPU::SGPRRegBank);
      }

      // Make sure we don't re-process this register again.
      WaterfalledRegMap.insert(std::make_pair(OldReg, Op.getReg()));
    }
  }

  B.setInsertPt(*LoopBB, LoopBB->end());

  // Update EXEC, saving the original EXEC value into NewExec.
  B.buildInstr(AndSaveExecOpc)
    .addDef(NewExec)
    .addReg(CondReg, RegState::Kill);

  MRI.setSimpleHint(NewExec, CondReg);

  // Update EXEC, switch all done bits to 0 and all todo bits to 1.
  B.buildInstr(XorTermOpc)
    .addDef(ExecReg)
    .addReg(ExecReg)
    .addReg(NewExec);

  // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
  // s_cbranch_scc0?

  // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
  B.buildInstr(AMDGPU::S_CBRANCH_EXECNZ)
    .addMBB(LoopBB);

  // Save the EXEC mask before the loop.
  BuildMI(MBB, MBB.end(), DL, TII->get(MovTermOpc), SaveExecReg)
    .addReg(ExecReg);

  // Restore the EXEC mask after the loop.
  B.setMBB(*RestoreExecBB);
  B.buildInstr(MovTermOpc)
    .addDef(ExecReg)
    .addReg(SaveExecReg);

  // Set the insert point after the original instruction, so any new
  // instructions will be in the remainder.
  B.setInsertPt(*RemainderBB, RemainderBB->begin());

  return true;
}

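// The resulting control flow, sketched with the wave64 opcodes:
//   MBB:           s_mov_b64_term sSaveExec, exec
//   LoopBB:        phis; v_readfirstlane_b32 + v_cmp; the moved instructions
//                  s_and_saveexec_b64 sNewExec, sCond
//                  exec = s_xor_b64_term exec, sNewExec
//                  s_cbranch_execnz LoopBB
//   RestoreExecBB: exec = s_mov_b64_term sSaveExec
//   RemainderBB:   everything after the original range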
// Return any unique registers used by \p MI at \p OpIndices that need to be
// handled in a waterfall loop. Returns these registers in \p
// SGPROperandRegs. Returns true if there are any operands to handle and a
// waterfall loop is necessary.
bool AMDGPURegisterBankInfo::collectWaterfallOperands(
  SmallSet<Register, 4> &SGPROperandRegs, MachineInstr &MI,
  MachineRegisterInfo &MRI, ArrayRef<unsigned> OpIndices) const {
  for (unsigned Op : OpIndices) {
    assert(MI.getOperand(Op).isUse());
    Register Reg = MI.getOperand(Op).getReg();
    const RegisterBank *OpBank = getRegBank(Reg, MRI, *TRI);
    if (OpBank->getID() != AMDGPU::SGPRRegBankID)
      SGPROperandRegs.insert(Reg);
  }

  // No operands need to be replaced, so no need to loop.
  return !SGPROperandRegs.empty();
}

bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
  MachineIRBuilder &B, MachineInstr &MI, MachineRegisterInfo &MRI,
  ArrayRef<unsigned> OpIndices) const {
  // Use a set to avoid extra readfirstlanes in the case where multiple operands
  // are the same register.
  SmallSet<Register, 4> SGPROperandRegs;

  if (!collectWaterfallOperands(SGPROperandRegs, MI, MRI, OpIndices))
    return false;

  MachineBasicBlock::iterator I = MI.getIterator();
  return executeInWaterfallLoop(B, make_range(I, std::next(I)),
                                SGPROperandRegs, MRI);
}

bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  ArrayRef<unsigned> OpIndices) const {
  MachineIRBuilder B(MI);
  return executeInWaterfallLoop(B, MI, MRI, OpIndices);
}

// Legalize an operand that must be an SGPR by inserting a readfirstlane.
void AMDGPURegisterBankInfo::constrainOpWithReadfirstlane(
    MachineInstr &MI, MachineRegisterInfo &MRI, unsigned OpIdx) const {
  Register Reg = MI.getOperand(OpIdx).getReg();
  const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
  if (Bank == &AMDGPU::SGPRRegBank)
    return;

  LLT Ty = MRI.getType(Reg);
  MachineIRBuilder B(MI);

  if (Bank != &AMDGPU::VGPRRegBank) {
    // We need to copy from AGPR to VGPR
    Reg = B.buildCopy(Ty, Reg).getReg(0);
    MRI.setRegBank(Reg, AMDGPU::VGPRRegBank);
  }

  Register SGPR = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
  B.buildInstr(AMDGPU::V_READFIRSTLANE_B32)
    .addDef(SGPR)
    .addReg(Reg);

  MRI.setType(SGPR, Ty);

  const TargetRegisterClass *Constrained =
      constrainGenericRegister(Reg, AMDGPU::VGPR_32RegClass, MRI);
  (void)Constrained;
  assert(Constrained && "Failed to constrain readfirstlane src reg");

  MI.getOperand(OpIdx).setReg(SGPR);
}

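// Note that unlike the waterfall loop above, this simply takes the value in
// the first active lane, so it is only used where that is acceptable (e.g.
// operands that are required to be uniform anyway).
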
/// Split \p Ty into 2 pieces. The first will have \p FirstSize bits, and the
/// rest will be in the remainder.
static std::pair<LLT, LLT> splitUnequalType(LLT Ty, unsigned FirstSize) {
  unsigned TotalSize = Ty.getSizeInBits();
  if (!Ty.isVector())
    return {LLT::scalar(FirstSize), LLT::scalar(TotalSize - FirstSize)};

  LLT EltTy = Ty.getElementType();
  unsigned EltSize = EltTy.getSizeInBits();
  assert(FirstSize % EltSize == 0);

  unsigned FirstPartNumElts = FirstSize / EltSize;
  unsigned RemainderElts = (TotalSize - FirstSize) / EltSize;

  return {LLT::scalarOrVector(ElementCount::getFixed(FirstPartNumElts), EltTy),
          LLT::scalarOrVector(ElementCount::getFixed(RemainderElts), EltTy)};
}

static LLT widen96To128(LLT Ty) {
  if (!Ty.isVector())
    return LLT::scalar(128);

  LLT EltTy = Ty.getElementType();
  assert(128 % EltTy.getSizeInBits() == 0);
  return LLT::fixed_vector(128 / EltTy.getSizeInBits(), EltTy);
}

bool AMDGPURegisterBankInfo::applyMappingLoad(MachineInstr &MI,
                        const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
                        MachineRegisterInfo &MRI) const {
  Register DstReg = MI.getOperand(0).getReg();
  const LLT LoadTy = MRI.getType(DstReg);
  unsigned LoadSize = LoadTy.getSizeInBits();
  const unsigned MaxNonSmrdLoadSize = 128;

  const RegisterBank *DstBank =
    OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
  if (DstBank == &AMDGPU::SGPRRegBank) {
    // There are some special cases that we need to look at for 32-bit and
    // 96-bit SGPR loads; otherwise we have nothing to do.
    if (LoadSize != 32 && LoadSize != 96)
      return false;

    MachineMemOperand *MMO = *MI.memoperands_begin();
    const unsigned MemSize = 8 * MMO->getSize();
    // Scalar loads of size 8 or 16 bit with proper alignment may be widened to
    // 32 bit. Check to see if we need to widen the memory access: 8 or 16 bit
    // scalar loads should have a load size of 32 but a memory access size of
    // less than 32.
    if (LoadSize == 32 &&
        (MemSize == 32 || LoadTy.isVector() || !isScalarLoadLegal(MI)))
      return false;

    Register PtrReg = MI.getOperand(1).getReg();

    ApplyRegBankMapping O(*this, MRI, &AMDGPU::SGPRRegBank);
    MachineIRBuilder B(MI, O);

    if (LoadSize == 32) {
      // This is an extending load from a sub-dword size. Widen the memory
      // access size to 4 bytes and clear the extra high bits appropriately.
      const LLT S32 = LLT::scalar(32);
      if (MI.getOpcode() == AMDGPU::G_SEXTLOAD) {
        // Must extend the sign bit into higher bits for a G_SEXTLOAD
        auto WideLoad = B.buildLoadFromOffset(S32, PtrReg, *MMO, 0);
        B.buildSExtInReg(MI.getOperand(0), WideLoad, MemSize);
      } else if (MI.getOpcode() == AMDGPU::G_ZEXTLOAD) {
        // Must extend zero into higher bits with an AND for a G_ZEXTLOAD
        auto WideLoad = B.buildLoadFromOffset(S32, PtrReg, *MMO, 0);
        B.buildZExtInReg(MI.getOperand(0), WideLoad, MemSize);
      } else
        // We do not need to touch the higher bits for regular loads.
        B.buildLoadFromOffset(MI.getOperand(0), PtrReg, *MMO, 0);
    } else {
      // 96-bit loads are only available for vector loads. We need to split
      // this into a 64-bit part and a 32-bit part (unless we can widen to a
      // 128-bit load).
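      // e.g. splitUnequalType(<3 x s32>, 64) yields (<2 x s32>, s32) for the
      // under-aligned path below.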
      if (MMO->getAlign() < Align(16)) {
        LLT Part64, Part32;
        std::tie(Part64, Part32) = splitUnequalType(LoadTy, 64);
        auto Load0 = B.buildLoadFromOffset(Part64, PtrReg, *MMO, 0);
        auto Load1 = B.buildLoadFromOffset(Part32, PtrReg, *MMO, 8);

        auto Undef = B.buildUndef(LoadTy);
        auto Ins0 = B.buildInsert(LoadTy, Undef, Load0, 0);
        B.buildInsert(MI.getOperand(0), Ins0, Load1, 64);
      } else {
        LLT WiderTy = widen96To128(LoadTy);
        auto WideLoad = B.buildLoadFromOffset(WiderTy, PtrReg, *MMO, 0);
        B.buildExtract(MI.getOperand(0), WideLoad, 0);
      }
    }

    MI.eraseFromParent();
    return true;
  }

  // 128-bit loads are supported for all instruction types.
  if (LoadSize <= MaxNonSmrdLoadSize)
    return false;

  SmallVector<Register, 16> DefRegs(OpdMapper.getVRegs(0));
  SmallVector<Register, 1> SrcRegs(OpdMapper.getVRegs(1));

  if (SrcRegs.empty())
    SrcRegs.push_back(MI.getOperand(1).getReg());

  assert(LoadSize % MaxNonSmrdLoadSize == 0);

  // RegBankSelect only emits scalar types, so we need to reset the pointer
  // operand to a pointer type.
  Register BasePtrReg = SrcRegs[0];
  LLT PtrTy = MRI.getType(MI.getOperand(1).getReg());
  MRI.setType(BasePtrReg, PtrTy);

  unsigned NumSplitParts = LoadTy.getSizeInBits() / MaxNonSmrdLoadSize;
  const LLT LoadSplitTy = LoadTy.divide(NumSplitParts);
  ApplyRegBankMapping Observer(*this, MRI, &AMDGPU::VGPRRegBank);
  MachineIRBuilder B(MI, Observer);
  LegalizerHelper Helper(B.getMF(), Observer, B);

  if (LoadTy.isVector()) {
    if (Helper.fewerElementsVector(MI, 0, LoadSplitTy) !=
        LegalizerHelper::Legalized)
      return false;
  } else {
    if (Helper.narrowScalar(MI, 0, LoadSplitTy) != LegalizerHelper::Legalized)
      return false;
  }

  MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
  return true;
}

bool AMDGPURegisterBankInfo::applyMappingDynStackAlloc(
  MachineInstr &MI,
  const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
  MachineRegisterInfo &MRI) const {
  const MachineFunction &MF = *MI.getMF();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const auto &TFI = *ST.getFrameLowering();

  // Guard in case the stack growth direction ever changes with scratch
  // instructions.
  if (TFI.getStackGrowthDirection() == TargetFrameLowering::StackGrowsDown)
    return false;

  Register Dst = MI.getOperand(0).getReg();
  Register AllocSize = MI.getOperand(1).getReg();
  Align Alignment = assumeAligned(MI.getOperand(2).getImm());

  const RegisterBank *SizeBank = getRegBank(AllocSize, MRI, *TRI);

  // TODO: Need to emit a wave reduction to get the maximum size.
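  // Only the uniform (SGPR) size case is handled for now; a divergent size
  // would require taking the wave-wide maximum first.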
  if (SizeBank != &AMDGPU::SGPRRegBank)
    return false;

  LLT PtrTy = MRI.getType(Dst);
  LLT IntPtrTy = LLT::scalar(PtrTy.getSizeInBits());

  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  Register SPReg = Info->getStackPtrOffsetReg();
  ApplyRegBankMapping ApplyBank(*this, MRI, &AMDGPU::SGPRRegBank);
  MachineIRBuilder B(MI, ApplyBank);

  auto WaveSize = B.buildConstant(LLT::scalar(32), ST.getWavefrontSizeLog2());
  auto ScaledSize = B.buildShl(IntPtrTy, AllocSize, WaveSize);

  auto SPCopy = B.buildCopy(PtrTy, SPReg);
  if (Alignment > TFI.getStackAlign()) {
    auto PtrAdd = B.buildPtrAdd(PtrTy, SPCopy, ScaledSize);
    B.buildMaskLowPtrBits(Dst, PtrAdd,
                          Log2(Alignment) + ST.getWavefrontSizeLog2());
  } else {
    B.buildPtrAdd(Dst, SPCopy, ScaledSize);
  }

  MI.eraseFromParent();
  return true;
}

bool AMDGPURegisterBankInfo::applyMappingImage(
    MachineInstr &MI, const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
    MachineRegisterInfo &MRI, int RsrcIdx) const {
  const int NumDefs = MI.getNumExplicitDefs();

  // The reported argument index is relative to the IR intrinsic call arguments,
  // so we need to shift by the number of defs and the intrinsic ID.
  RsrcIdx += NumDefs + 1;

  // Insert copies to VGPR arguments.
  applyDefaultMapping(OpdMapper);

  // Fixup any SGPR arguments.
  SmallVector<unsigned, 4> SGPRIndexes;
  for (int I = NumDefs, NumOps = MI.getNumOperands(); I != NumOps; ++I) {
    if (!MI.getOperand(I).isReg())
      continue;

    // If this intrinsic has a sampler, it immediately follows rsrc.
    if (I == RsrcIdx || I == RsrcIdx + 1)
      SGPRIndexes.push_back(I);
  }

  executeInWaterfallLoop(MI, MRI, SGPRIndexes);
  return true;
}

static Register getSrcRegIgnoringCopies(const MachineRegisterInfo &MRI,
                                        Register Reg) {
  MachineInstr *Def = getDefIgnoringCopies(Reg, MRI);
  if (!Def)
    return Reg;

  // TODO: Guard against this being an implicit def
  return Def->getOperand(0).getReg();
}

// Analyze a combined offset from an llvm.amdgcn.s.buffer.load intrinsic and
// store the three offsets (voffset, soffset and instoffset).
static unsigned setBufferOffsets(MachineIRBuilder &B,
                                 const AMDGPURegisterBankInfo &RBI,
                                 Register CombinedOffset, Register &VOffsetReg,
                                 Register &SOffsetReg, int64_t &InstOffsetVal,
                                 Align Alignment) {
  const LLT S32 = LLT::scalar(32);
  MachineRegisterInfo *MRI = B.getMRI();

  if (Optional<int64_t> Imm = getConstantVRegSExtVal(CombinedOffset, *MRI)) {
    uint32_t SOffset, ImmOffset;
    if (AMDGPU::splitMUBUFOffset(*Imm, SOffset, ImmOffset, &RBI.Subtarget,
                                 Alignment)) {
      VOffsetReg = B.buildConstant(S32, 0).getReg(0);
      SOffsetReg = B.buildConstant(S32, SOffset).getReg(0);
      InstOffsetVal = ImmOffset;

      B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank);
      B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank);
      return SOffset + ImmOffset;
    }
  }

  Register Base;
  unsigned Offset;

  std::tie(Base, Offset) =
      AMDGPU::getBaseWithConstantOffset(*MRI, CombinedOffset);

  uint32_t SOffset, ImmOffset;
  if ((int)Offset > 0 && AMDGPU::splitMUBUFOffset(Offset, SOffset, ImmOffset,
                                                  &RBI.Subtarget, Alignment)) {
    if (RBI.getRegBank(Base, *MRI, *RBI.TRI) == &AMDGPU::VGPRRegBank) {
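      // The base is already a VGPR, so use it directly as the voffset.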
      VOffsetReg = Base;
      SOffsetReg = B.buildConstant(S32, SOffset).getReg(0);
      B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank);
      InstOffsetVal = ImmOffset;
      return 0; // XXX - Why is this 0?
    }

    // If we have SGPR base, we can use it for soffset.
    if (SOffset == 0) {
      VOffsetReg = B.buildConstant(S32, 0).getReg(0);
      B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank);
      SOffsetReg = Base;
      InstOffsetVal = ImmOffset;
      return 0; // XXX - Why is this 0?
    }
  }

  // Handle the variable sgpr + vgpr case.
  MachineInstr *Add = getOpcodeDef(AMDGPU::G_ADD, CombinedOffset, *MRI);
  if (Add && (int)Offset >= 0) {
    Register Src0 = getSrcRegIgnoringCopies(*MRI, Add->getOperand(1).getReg());
    Register Src1 = getSrcRegIgnoringCopies(*MRI, Add->getOperand(2).getReg());

    const RegisterBank *Src0Bank = RBI.getRegBank(Src0, *MRI, *RBI.TRI);
    const RegisterBank *Src1Bank = RBI.getRegBank(Src1, *MRI, *RBI.TRI);

    if (Src0Bank == &AMDGPU::VGPRRegBank && Src1Bank == &AMDGPU::SGPRRegBank) {
      VOffsetReg = Src0;
      SOffsetReg = Src1;
      return 0;
    }

    if (Src0Bank == &AMDGPU::SGPRRegBank && Src1Bank == &AMDGPU::VGPRRegBank) {
      VOffsetReg = Src1;
      SOffsetReg = Src0;
      return 0;
    }
  }

  // Ensure we have a VGPR for the combined offset. This could be an issue if we
  // have an SGPR offset and a VGPR resource.
  if (RBI.getRegBank(CombinedOffset, *MRI, *RBI.TRI) == &AMDGPU::VGPRRegBank) {
    VOffsetReg = CombinedOffset;
  } else {
    VOffsetReg = B.buildCopy(S32, CombinedOffset).getReg(0);
    B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank);
  }

  SOffsetReg = B.buildConstant(S32, 0).getReg(0);
  B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank);
  return 0;
}

bool AMDGPURegisterBankInfo::applyMappingSBufferLoad(
  const OperandsMapper &OpdMapper) const {
  MachineInstr &MI = OpdMapper.getMI();
  MachineRegisterInfo &MRI = OpdMapper.getMRI();

  const LLT S32 = LLT::scalar(32);
  Register Dst = MI.getOperand(0).getReg();
  LLT Ty = MRI.getType(Dst);

  const RegisterBank *RSrcBank =
    OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
  const RegisterBank *OffsetBank =
    OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
  if (RSrcBank == &AMDGPU::SGPRRegBank &&
      OffsetBank == &AMDGPU::SGPRRegBank)
    return true; // Legal mapping

  // FIXME: 96-bit case was widened during legalize. We need to narrow it back
  // here but don't have an MMO.

  unsigned LoadSize = Ty.getSizeInBits();
  int NumLoads = 1;
  if (LoadSize == 256 || LoadSize == 512) {
    NumLoads = LoadSize / 128;
    Ty = Ty.divide(NumLoads);
  }

  // Use the alignment to ensure that the required offsets will fit into the
  // immediate offsets.
  const Align Alignment = NumLoads > 1 ? Align(16 * NumLoads) : Align(1);

  MachineIRBuilder B(MI);
  MachineFunction &MF = B.getMF();

  Register SOffset;
  Register VOffset;
  int64_t ImmOffset = 0;

  unsigned MMOOffset = setBufferOffsets(B, *this, MI.getOperand(2).getReg(),
                                        VOffset, SOffset, ImmOffset, Alignment);

  // TODO: 96-bit loads were widened to 128-bit results. Shrink the result if we
  // can, but we need to track an MMO for that.
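  // Wide (256/512-bit) results are split into 128-bit G_AMDGPU_BUFFER_LOADs
  // below, each with its own MMO at a 16 * i byte offset.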
  const unsigned MemSize = (Ty.getSizeInBits() + 7) / 8;
  const Align MemAlign(4); // FIXME: ABI type alignment?
  MachineMemOperand *BaseMMO = MF.getMachineMemOperand(
    MachinePointerInfo(),
    MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
        MachineMemOperand::MOInvariant,
    MemSize, MemAlign);
  if (MMOOffset != 0)
    BaseMMO = MF.getMachineMemOperand(BaseMMO, MMOOffset, MemSize);

  // If only the offset is divergent, emit a MUBUF buffer load instead. We can
  // assume that the buffer is unswizzled.

  Register RSrc = MI.getOperand(1).getReg();
  Register VIndex = B.buildConstant(S32, 0).getReg(0);
  B.getMRI()->setRegBank(VIndex, AMDGPU::VGPRRegBank);

  SmallVector<Register, 4> LoadParts(NumLoads);

  MachineBasicBlock::iterator MII = MI.getIterator();
  MachineInstrSpan Span(MII, &B.getMBB());

  for (int i = 0; i < NumLoads; ++i) {
    if (NumLoads == 1) {
      LoadParts[i] = Dst;
    } else {
      LoadParts[i] = MRI.createGenericVirtualRegister(Ty);
      MRI.setRegBank(LoadParts[i], AMDGPU::VGPRRegBank);
    }

    MachineMemOperand *MMO = BaseMMO;
    if (i != 0)
      BaseMMO = MF.getMachineMemOperand(BaseMMO, MMOOffset + 16 * i, MemSize);

    B.buildInstr(AMDGPU::G_AMDGPU_BUFFER_LOAD)
        .addDef(LoadParts[i])       // vdata
        .addUse(RSrc)               // rsrc
        .addUse(VIndex)             // vindex
        .addUse(VOffset)            // voffset
        .addUse(SOffset)            // soffset
        .addImm(ImmOffset + 16 * i) // offset(imm)
        .addImm(0)                  // cachepolicy, swizzled buffer(imm)
        .addImm(0)                  // idxen(imm)
        .addMemOperand(MMO);
  }

  // TODO: If only the resource is a VGPR, it may be better to execute the
  // scalar load in the waterfall loop if the resource is expected to frequently
  // be dynamically uniform.
  if (RSrcBank != &AMDGPU::SGPRRegBank) {
    // Remove the original instruction to avoid potentially confusing the
    // waterfall loop logic.
    B.setInstr(*Span.begin());
    MI.eraseFromParent();

    SmallSet<Register, 4> OpsToWaterfall;

    OpsToWaterfall.insert(RSrc);
    executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()),
                           OpsToWaterfall, MRI);
  }

  if (NumLoads != 1) {
    if (Ty.isVector())
      B.buildConcatVectors(Dst, LoadParts);
    else
      B.buildMerge(Dst, LoadParts);
  }

  // The waterfall loop path already erased the original instruction above;
  // otherwise erase it now.
  if (RSrcBank == &AMDGPU::SGPRRegBank)
    MI.eraseFromParent();

  return true;
}

bool AMDGPURegisterBankInfo::applyMappingBFE(const OperandsMapper &OpdMapper,
                                             bool Signed) const {
  MachineInstr &MI = OpdMapper.getMI();
  MachineRegisterInfo &MRI = OpdMapper.getMRI();

  // Insert basic copies
  applyDefaultMapping(OpdMapper);

  Register DstReg = MI.getOperand(0).getReg();
  LLT Ty = MRI.getType(DstReg);

  const LLT S32 = LLT::scalar(32);

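  // Operand layout: for the G_INTRINSIC form the source begins at operand 2
  // (dst, intrinsic ID, src, offset, width); for the generic G_SBFX/G_UBFX
  // form it begins at operand 1 (dst, src, offset, width).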
2 : 1;
1549 Register SrcReg = MI.getOperand(FirstOpnd).getReg();
1550 Register OffsetReg = MI.getOperand(FirstOpnd + 1).getReg();
1551 Register WidthReg = MI.getOperand(FirstOpnd + 2).getReg();
1552
1553 const RegisterBank *DstBank =
1554 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
1555 if (DstBank == &AMDGPU::VGPRRegBank) {
1556 if (Ty == S32)
1557 return true;
1558
1559 // There is no 64-bit VGPR bitfield extract instruction, so the operation
1560 // is expanded to a sequence of instructions that implement it.
1561 ApplyRegBankMapping ApplyBank(*this, MRI, &AMDGPU::VGPRRegBank);
1562 MachineIRBuilder B(MI, ApplyBank);
1563
1564 const LLT S64 = LLT::scalar(64);
1565 // Shift the source operand so that the extracted bits start at bit 0.
1566 auto ShiftOffset = Signed ? B.buildAShr(S64, SrcReg, OffsetReg)
1567 : B.buildLShr(S64, SrcReg, OffsetReg);
1568 auto UnmergeSOffset = B.buildUnmerge({S32, S32}, ShiftOffset);
1569
1570 // A 64-bit bitfield extract uses the 32-bit bitfield extract instructions
1571 // if the width is a constant.
1572 if (auto ConstWidth = getConstantVRegValWithLookThrough(WidthReg, MRI)) {
1573 // Depending on the width, extract from either the low or high 32-bit
1574 // half of the shifted source.
1575 auto Zero = B.buildConstant(S32, 0);
1576 auto WidthImm = ConstWidth->Value.getZExtValue();
1577 if (WidthImm <= 32) {
1578 // Use bitfield extract on the lower 32-bit source, and then sign-extend
1579 // or clear the upper 32 bits.
1580 auto Extract =
1581 Signed ? B.buildSbfx(S32, UnmergeSOffset.getReg(0), Zero, WidthReg)
1582 : B.buildUbfx(S32, UnmergeSOffset.getReg(0), Zero, WidthReg);
1583 auto Extend =
1584 Signed ? B.buildAShr(S32, Extract, B.buildConstant(S32, 31)) : Zero;
1585 B.buildMerge(DstReg, {Extract, Extend});
1586 } else {
1587 // Use bitfield extract on the upper 32-bit source, and combine with the
1588 // lower 32-bit source.
1589 auto UpperWidth = B.buildConstant(S32, WidthImm - 32);
1590 auto Extract =
1591 Signed
1592 ? B.buildSbfx(S32, UnmergeSOffset.getReg(1), Zero, UpperWidth)
1593 : B.buildUbfx(S32, UnmergeSOffset.getReg(1), Zero, UpperWidth);
1594 B.buildMerge(DstReg, {UnmergeSOffset.getReg(0), Extract});
1595 }
1596 MI.eraseFromParent();
1597 return true;
1598 }
1599
1600 // Expand to ((Src >> Offset) << (64 - Width)) >> (64 - Width) using 64-bit
1601 // operations.
1602 auto ExtShift = B.buildSub(S32, B.buildConstant(S32, 64), WidthReg);
1603 auto SignBit = B.buildShl(S64, ShiftOffset, ExtShift);
1604 if (Signed)
1605 B.buildAShr(S64, SignBit, ExtShift);
1606 else
1607 B.buildLShr(S64, SignBit, ExtShift);
1608 MI.eraseFromParent();
1609 return true;
1610 }
1611
1612 // The scalar form packs the offset and width in a single operand.
1613
1614 ApplyRegBankMapping ApplyBank(*this, MRI, &AMDGPU::SGPRRegBank);
1615 MachineIRBuilder B(MI, ApplyBank);
1616
1617 // Ensure the high bits are clear to insert the offset.
1618 auto OffsetMask = B.buildConstant(S32, maskTrailingOnes<unsigned>(6));
1619 auto ClampOffset = B.buildAnd(S32, OffsetReg, OffsetMask);
1620
1621 // The shift zeros out the low bits, so don't bother clamping the input value.
1622 auto ShiftWidth = B.buildShl(S32, WidthReg, B.buildConstant(S32, 16));
1623
1624 // Pack the offset and width of the BFE into the format expected by
1625 // S_BFE_I32 / S_BFE_U32: in the second source operand, bits [5:0] contain
1626 // the offset and bits [22:16] the width.
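// Worked example (hypothetical operand values, not from the original source):
// for offset = 3 and width = 8, the OR below yields (8 << 16) | 3 = 0x80003,
// and S_BFE_U32 then extracts bits [10:3] of the source.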
1627 auto MergedInputs = B.buildOr(S32, ClampOffset, ShiftWidth); 1628 1629 // TODO: It might be worth using a pseudo here to avoid scc clobber and 1630 // register class constraints. 1631 unsigned Opc = Ty == S32 ? (Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32) : 1632 (Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64); 1633 1634 auto MIB = B.buildInstr(Opc, {DstReg}, {SrcReg, MergedInputs}); 1635 if (!constrainSelectedInstRegOperands(*MIB, *TII, *TRI, *this)) 1636 llvm_unreachable("failed to constrain BFE"); 1637 1638 MI.eraseFromParent(); 1639 return true; 1640 } 1641 1642 // Return a suitable opcode for extending the operands of Opc when widening. 1643 static unsigned getExtendOp(unsigned Opc) { 1644 switch (Opc) { 1645 case TargetOpcode::G_ASHR: 1646 case TargetOpcode::G_SMIN: 1647 case TargetOpcode::G_SMAX: 1648 return TargetOpcode::G_SEXT; 1649 case TargetOpcode::G_LSHR: 1650 case TargetOpcode::G_UMIN: 1651 case TargetOpcode::G_UMAX: 1652 return TargetOpcode::G_ZEXT; 1653 default: 1654 return TargetOpcode::G_ANYEXT; 1655 } 1656 } 1657 1658 // Emit a legalized extension from <2 x s16> to 2 32-bit components, avoiding 1659 // any illegal vector extend or unmerge operations. 1660 static std::pair<Register, Register> 1661 unpackV2S16ToS32(MachineIRBuilder &B, Register Src, unsigned ExtOpcode) { 1662 const LLT S32 = LLT::scalar(32); 1663 auto Bitcast = B.buildBitcast(S32, Src); 1664 1665 if (ExtOpcode == TargetOpcode::G_SEXT) { 1666 auto ExtLo = B.buildSExtInReg(S32, Bitcast, 16); 1667 auto ShiftHi = B.buildAShr(S32, Bitcast, B.buildConstant(S32, 16)); 1668 return std::make_pair(ExtLo.getReg(0), ShiftHi.getReg(0)); 1669 } 1670 1671 auto ShiftHi = B.buildLShr(S32, Bitcast, B.buildConstant(S32, 16)); 1672 if (ExtOpcode == TargetOpcode::G_ZEXT) { 1673 auto ExtLo = B.buildAnd(S32, Bitcast, B.buildConstant(S32, 0xffff)); 1674 return std::make_pair(ExtLo.getReg(0), ShiftHi.getReg(0)); 1675 } 1676 1677 assert(ExtOpcode == TargetOpcode::G_ANYEXT); 1678 return std::make_pair(Bitcast.getReg(0), ShiftHi.getReg(0)); 1679 } 1680 1681 // For cases where only a single copy is inserted for matching register banks. 1682 // Replace the register in the instruction operand 1683 static bool substituteSimpleCopyRegs( 1684 const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper, unsigned OpIdx) { 1685 SmallVector<unsigned, 1> SrcReg(OpdMapper.getVRegs(OpIdx)); 1686 if (!SrcReg.empty()) { 1687 assert(SrcReg.size() == 1); 1688 OpdMapper.getMI().getOperand(OpIdx).setReg(SrcReg[0]); 1689 return true; 1690 } 1691 1692 return false; 1693 } 1694 1695 /// Handle register layout difference for f16 images for some subtargets. 
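/// For illustration (hypothetical case): with unpacked D16 memory, a
/// <2 x s16> store value is rewritten below as a <2 x s32>, each 16-bit
/// element occupying the low half of its own 32-bit register.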
1696 Register AMDGPURegisterBankInfo::handleD16VData(MachineIRBuilder &B, 1697 MachineRegisterInfo &MRI, 1698 Register Reg) const { 1699 if (!Subtarget.hasUnpackedD16VMem()) 1700 return Reg; 1701 1702 const LLT S16 = LLT::scalar(16); 1703 LLT StoreVT = MRI.getType(Reg); 1704 if (!StoreVT.isVector() || StoreVT.getElementType() != S16) 1705 return Reg; 1706 1707 auto Unmerge = B.buildUnmerge(S16, Reg); 1708 1709 1710 SmallVector<Register, 4> WideRegs; 1711 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 1712 WideRegs.push_back(Unmerge.getReg(I)); 1713 1714 const LLT S32 = LLT::scalar(32); 1715 int NumElts = StoreVT.getNumElements(); 1716 1717 return B.buildMerge(LLT::fixed_vector(NumElts, S32), WideRegs).getReg(0); 1718 } 1719 1720 static std::pair<Register, unsigned> 1721 getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg) { 1722 int64_t Const; 1723 if (mi_match(Reg, MRI, m_ICst(Const))) 1724 return std::make_pair(Register(), Const); 1725 1726 Register Base; 1727 if (mi_match(Reg, MRI, m_GAdd(m_Reg(Base), m_ICst(Const)))) 1728 return std::make_pair(Base, Const); 1729 1730 // TODO: Handle G_OR used for add case 1731 return std::make_pair(Reg, 0); 1732 } 1733 1734 std::pair<Register, unsigned> 1735 AMDGPURegisterBankInfo::splitBufferOffsets(MachineIRBuilder &B, 1736 Register OrigOffset) const { 1737 const unsigned MaxImm = 4095; 1738 Register BaseReg; 1739 unsigned ImmOffset; 1740 const LLT S32 = LLT::scalar(32); 1741 1742 std::tie(BaseReg, ImmOffset) = getBaseWithConstantOffset(*B.getMRI(), 1743 OrigOffset); 1744 1745 unsigned C1 = 0; 1746 if (ImmOffset != 0) { 1747 // If the immediate value is too big for the immoffset field, put the value 1748 // and -4096 into the immoffset field so that the value that is copied/added 1749 // for the voffset field is a multiple of 4096, and it stands more chance 1750 // of being CSEd with the copy/add for another similar load/store. 1751 // However, do not do that rounding down to a multiple of 4096 if that is a 1752 // negative number, as it appears to be illegal to have a negative offset 1753 // in the vgpr, even if adding the immediate offset makes it positive. 1754 unsigned Overflow = ImmOffset & ~MaxImm; 1755 ImmOffset -= Overflow; 1756 if ((int32_t)Overflow < 0) { 1757 Overflow += ImmOffset; 1758 ImmOffset = 0; 1759 } 1760 1761 C1 = ImmOffset; 1762 if (Overflow != 0) { 1763 if (!BaseReg) 1764 BaseReg = B.buildConstant(S32, Overflow).getReg(0); 1765 else { 1766 auto OverflowVal = B.buildConstant(S32, Overflow); 1767 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0); 1768 } 1769 } 1770 } 1771 1772 if (!BaseReg) 1773 BaseReg = B.buildConstant(S32, 0).getReg(0); 1774 1775 return {BaseReg, C1}; 1776 } 1777 1778 static bool isZero(Register Reg, MachineRegisterInfo &MRI) { 1779 int64_t C; 1780 return mi_match(Reg, MRI, m_ICst(C)) && C == 0; 1781 } 1782 1783 static unsigned extractCPol(unsigned CachePolicy) { 1784 return CachePolicy & AMDGPU::CPol::ALL; 1785 } 1786 1787 static unsigned extractSWZ(unsigned CachePolicy) { 1788 return (CachePolicy >> 3) & 1; 1789 } 1790 1791 1792 MachineInstr * 1793 AMDGPURegisterBankInfo::selectStoreIntrinsic(MachineIRBuilder &B, 1794 MachineInstr &MI) const { 1795 MachineRegisterInfo &MRI = *B.getMRI(); 1796 executeInWaterfallLoop(B, MI, MRI, {2, 4}); 1797 1798 // FIXME: DAG lowering brokenly changes opcode based on FP vs. integer. 
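// Operand layout assumed by the extraction below: 1 = vdata, 2 = rsrc,
// 3 = voffset, 4 = soffset, 5 = cachepolicy immediate.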
1799
1800 Register VData = MI.getOperand(1).getReg();
1801 LLT Ty = MRI.getType(VData);
1802
1803 int EltSize = Ty.getScalarSizeInBits();
1804 int Size = Ty.getSizeInBits();
1805
1806 // FIXME: Broken integer truncstore.
1807 if (EltSize != 32)
1808 report_fatal_error("unhandled intrinsic store");
1809
1810 // FIXME: Verifier should enforce 1 MMO for these intrinsics.
1811 const int MemSize = (*MI.memoperands_begin())->getSize();
1812
1813
1814 Register RSrc = MI.getOperand(2).getReg();
1815 Register VOffset = MI.getOperand(3).getReg();
1816 Register SOffset = MI.getOperand(4).getReg();
1817 unsigned CachePolicy = MI.getOperand(5).getImm();
1818
1819 unsigned ImmOffset;
1820 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
1821
1822 const bool Offen = !isZero(VOffset, MRI);
1823
1824 unsigned Opc = AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact;
1825 switch (8 * MemSize) {
1826 case 8:
1827 Opc = Offen ? AMDGPU::BUFFER_STORE_BYTE_OFFEN_exact :
1828 AMDGPU::BUFFER_STORE_BYTE_OFFSET_exact;
1829 break;
1830 case 16:
1831 Opc = Offen ? AMDGPU::BUFFER_STORE_SHORT_OFFEN_exact :
1832 AMDGPU::BUFFER_STORE_SHORT_OFFSET_exact;
1833 break;
1834 default:
1835 Opc = Offen ? AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact :
1836 AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact;
1837 if (Size > 32)
1838 Opc = AMDGPU::getMUBUFOpcode(Opc, Size / 32);
1839 break;
1840 }
1841
1842
1843 // Set the insertion point back to the instruction in case it was moved into a
1844 // loop.
1845 B.setInstr(MI);
1846
1847 MachineInstrBuilder MIB = B.buildInstr(Opc)
1848 .addUse(VData);
1849
1850 if (Offen)
1851 MIB.addUse(VOffset);
1852
1853 MIB.addUse(RSrc)
1854 .addUse(SOffset)
1855 .addImm(ImmOffset)
1856 .addImm(extractCPol(CachePolicy))
1857 .addImm(0) // tfe: FIXME: Remove from inst
1858 .addImm(extractSWZ(CachePolicy))
1859 .cloneMemRefs(MI);
1860
1861 // FIXME: We need a way to report failure from applyMappingImpl.
1862 // Insert constraining copies before inserting the loop.
1863 if (!constrainSelectedInstRegOperands(*MIB, *TII, *TRI, *this))
1864 report_fatal_error("failed to constrain selected store intrinsic");
1865
1866 return MIB;
1867 }
1868
1869 bool AMDGPURegisterBankInfo::buildVCopy(MachineIRBuilder &B, Register DstReg,
1870 Register SrcReg) const {
1871 MachineRegisterInfo &MRI = *B.getMRI();
1872 LLT SrcTy = MRI.getType(SrcReg);
1873 if (SrcTy.getSizeInBits() == 32) {
1874 // Use a v_mov_b32 here to make the exec dependency explicit.
1875 B.buildInstr(AMDGPU::V_MOV_B32_e32)
1876 .addDef(DstReg)
1877 .addUse(SrcReg);
1878 return constrainGenericRegister(DstReg, AMDGPU::VGPR_32RegClass, MRI) &&
1879 constrainGenericRegister(SrcReg, AMDGPU::SReg_32RegClass, MRI);
1880 }
1881
1882 Register TmpReg0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1883 Register TmpReg1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1884
1885 B.buildInstr(AMDGPU::V_MOV_B32_e32)
1886 .addDef(TmpReg0)
1887 .addUse(SrcReg, 0, AMDGPU::sub0);
1888 B.buildInstr(AMDGPU::V_MOV_B32_e32)
1889 .addDef(TmpReg1)
1890 .addUse(SrcReg, 0, AMDGPU::sub1);
1891 B.buildInstr(AMDGPU::REG_SEQUENCE)
1892 .addDef(DstReg)
1893 .addUse(TmpReg0)
1894 .addImm(AMDGPU::sub0)
1895 .addUse(TmpReg1)
1896 .addImm(AMDGPU::sub1);
1897
1898 return constrainGenericRegister(SrcReg, AMDGPU::SReg_64RegClass, MRI) &&
1899 constrainGenericRegister(DstReg, AMDGPU::VReg_64RegClass, MRI);
1900 }
1901
1902 /// Utility function for pushing dynamic vector indexes with a constant offset
1903 /// into waterfall loops.
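/// For example (illustrative), an index of the form add(%base, 4) has the
/// constant stripped by the caller before the loop is formed; the add is
/// rebuilt here next to the use, so only the variable base feeds the loop's
/// readfirstlane.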
1904 static void reinsertVectorIndexAdd(MachineIRBuilder &B, 1905 MachineInstr &IdxUseInstr, 1906 unsigned OpIdx, 1907 unsigned ConstOffset) { 1908 MachineRegisterInfo &MRI = *B.getMRI(); 1909 const LLT S32 = LLT::scalar(32); 1910 Register WaterfallIdx = IdxUseInstr.getOperand(OpIdx).getReg(); 1911 B.setInsertPt(*IdxUseInstr.getParent(), IdxUseInstr.getIterator()); 1912 1913 auto MaterializedOffset = B.buildConstant(S32, ConstOffset); 1914 1915 auto Add = B.buildAdd(S32, WaterfallIdx, MaterializedOffset); 1916 MRI.setRegBank(MaterializedOffset.getReg(0), AMDGPU::SGPRRegBank); 1917 MRI.setRegBank(Add.getReg(0), AMDGPU::SGPRRegBank); 1918 IdxUseInstr.getOperand(OpIdx).setReg(Add.getReg(0)); 1919 } 1920 1921 /// Implement extending a 32-bit value to a 64-bit value. \p Lo32Reg is the 1922 /// original 32-bit source value (to be inserted in the low part of the combined 1923 /// 64-bit result), and \p Hi32Reg is the high half of the combined 64-bit 1924 /// value. 1925 static void extendLow32IntoHigh32(MachineIRBuilder &B, 1926 Register Hi32Reg, Register Lo32Reg, 1927 unsigned ExtOpc, 1928 const RegisterBank &RegBank, 1929 bool IsBooleanSrc = false) { 1930 if (ExtOpc == AMDGPU::G_ZEXT) { 1931 B.buildConstant(Hi32Reg, 0); 1932 } else if (ExtOpc == AMDGPU::G_SEXT) { 1933 if (IsBooleanSrc) { 1934 // If we know the original source was an s1, the high half is the same as 1935 // the low. 1936 B.buildCopy(Hi32Reg, Lo32Reg); 1937 } else { 1938 // Replicate sign bit from 32-bit extended part. 1939 auto ShiftAmt = B.buildConstant(LLT::scalar(32), 31); 1940 B.getMRI()->setRegBank(ShiftAmt.getReg(0), RegBank); 1941 B.buildAShr(Hi32Reg, Lo32Reg, ShiftAmt); 1942 } 1943 } else { 1944 assert(ExtOpc == AMDGPU::G_ANYEXT && "not an integer extension"); 1945 B.buildUndef(Hi32Reg); 1946 } 1947 } 1948 1949 bool AMDGPURegisterBankInfo::foldExtractEltToCmpSelect( 1950 MachineInstr &MI, MachineRegisterInfo &MRI, 1951 const OperandsMapper &OpdMapper) const { 1952 1953 Register VecReg = MI.getOperand(1).getReg(); 1954 Register Idx = MI.getOperand(2).getReg(); 1955 1956 const RegisterBank &IdxBank = 1957 *OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank; 1958 1959 bool IsDivergentIdx = IdxBank != AMDGPU::SGPRRegBank; 1960 1961 LLT VecTy = MRI.getType(VecReg); 1962 unsigned EltSize = VecTy.getScalarSizeInBits(); 1963 unsigned NumElem = VecTy.getNumElements(); 1964 1965 if (!SITargetLowering::shouldExpandVectorDynExt(EltSize, NumElem, 1966 IsDivergentIdx)) 1967 return false; 1968 1969 MachineIRBuilder B(MI); 1970 LLT S32 = LLT::scalar(32); 1971 1972 const RegisterBank &DstBank = 1973 *OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; 1974 const RegisterBank &SrcBank = 1975 *OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank; 1976 1977 const RegisterBank &CCBank = 1978 (DstBank == AMDGPU::SGPRRegBank && 1979 SrcBank == AMDGPU::SGPRRegBank && 1980 IdxBank == AMDGPU::SGPRRegBank) ? AMDGPU::SGPRRegBank 1981 : AMDGPU::VCCRegBank; 1982 LLT CCTy = (CCBank == AMDGPU::SGPRRegBank) ? 
S32 : LLT::scalar(1); 1983 1984 if (CCBank == AMDGPU::VCCRegBank && IdxBank == AMDGPU::SGPRRegBank) { 1985 Idx = B.buildCopy(S32, Idx)->getOperand(0).getReg(); 1986 MRI.setRegBank(Idx, AMDGPU::VGPRRegBank); 1987 } 1988 1989 LLT EltTy = VecTy.getScalarType(); 1990 SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0)); 1991 unsigned NumLanes = DstRegs.size(); 1992 if (!NumLanes) 1993 NumLanes = 1; 1994 else 1995 EltTy = MRI.getType(DstRegs[0]); 1996 1997 auto UnmergeToEltTy = B.buildUnmerge(EltTy, VecReg); 1998 SmallVector<Register, 2> Res(NumLanes); 1999 for (unsigned L = 0; L < NumLanes; ++L) 2000 Res[L] = UnmergeToEltTy.getReg(L); 2001 2002 for (unsigned I = 1; I < NumElem; ++I) { 2003 auto IC = B.buildConstant(S32, I); 2004 MRI.setRegBank(IC->getOperand(0).getReg(), AMDGPU::SGPRRegBank); 2005 auto Cmp = B.buildICmp(CmpInst::ICMP_EQ, CCTy, Idx, IC); 2006 MRI.setRegBank(Cmp->getOperand(0).getReg(), CCBank); 2007 2008 for (unsigned L = 0; L < NumLanes; ++L) { 2009 auto S = B.buildSelect(EltTy, Cmp, 2010 UnmergeToEltTy.getReg(I * NumLanes + L), Res[L]); 2011 2012 for (unsigned N : { 0, 2, 3 }) 2013 MRI.setRegBank(S->getOperand(N).getReg(), DstBank); 2014 2015 Res[L] = S->getOperand(0).getReg(); 2016 } 2017 } 2018 2019 for (unsigned L = 0; L < NumLanes; ++L) { 2020 Register DstReg = (NumLanes == 1) ? MI.getOperand(0).getReg() : DstRegs[L]; 2021 B.buildCopy(DstReg, Res[L]); 2022 MRI.setRegBank(DstReg, DstBank); 2023 } 2024 2025 MRI.setRegBank(MI.getOperand(0).getReg(), DstBank); 2026 MI.eraseFromParent(); 2027 2028 return true; 2029 } 2030 2031 // Insert a cross regbank copy for a register if it already has a bank that 2032 // differs from the one we want to set. 2033 static Register constrainRegToBank(MachineRegisterInfo &MRI, 2034 MachineIRBuilder &B, Register &Reg, 2035 const RegisterBank &Bank) { 2036 const RegisterBank *CurrBank = MRI.getRegBankOrNull(Reg); 2037 if (CurrBank && *CurrBank != Bank) { 2038 Register Copy = B.buildCopy(MRI.getType(Reg), Reg).getReg(0); 2039 MRI.setRegBank(Copy, Bank); 2040 return Copy; 2041 } 2042 2043 MRI.setRegBank(Reg, Bank); 2044 return Reg; 2045 } 2046 2047 bool AMDGPURegisterBankInfo::foldInsertEltToCmpSelect( 2048 MachineInstr &MI, MachineRegisterInfo &MRI, 2049 const OperandsMapper &OpdMapper) const { 2050 2051 Register VecReg = MI.getOperand(1).getReg(); 2052 Register Idx = MI.getOperand(3).getReg(); 2053 2054 const RegisterBank &IdxBank = 2055 *OpdMapper.getInstrMapping().getOperandMapping(3).BreakDown[0].RegBank; 2056 2057 bool IsDivergentIdx = IdxBank != AMDGPU::SGPRRegBank; 2058 2059 LLT VecTy = MRI.getType(VecReg); 2060 unsigned EltSize = VecTy.getScalarSizeInBits(); 2061 unsigned NumElem = VecTy.getNumElements(); 2062 2063 if (!SITargetLowering::shouldExpandVectorDynExt(EltSize, NumElem, 2064 IsDivergentIdx)) 2065 return false; 2066 2067 MachineIRBuilder B(MI); 2068 LLT S32 = LLT::scalar(32); 2069 2070 const RegisterBank &DstBank = 2071 *OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; 2072 const RegisterBank &SrcBank = 2073 *OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank; 2074 const RegisterBank &InsBank = 2075 *OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank; 2076 2077 const RegisterBank &CCBank = 2078 (DstBank == AMDGPU::SGPRRegBank && 2079 SrcBank == AMDGPU::SGPRRegBank && 2080 InsBank == AMDGPU::SGPRRegBank && 2081 IdxBank == AMDGPU::SGPRRegBank) ? AMDGPU::SGPRRegBank 2082 : AMDGPU::VCCRegBank; 2083 LLT CCTy = (CCBank == AMDGPU::SGPRRegBank) ? 
S32 : LLT::scalar(1);
2084
2085 if (CCBank == AMDGPU::VCCRegBank && IdxBank == AMDGPU::SGPRRegBank) {
2086 Idx = B.buildCopy(S32, Idx)->getOperand(0).getReg();
2087 MRI.setRegBank(Idx, AMDGPU::VGPRRegBank);
2088 }
2089
2090 LLT EltTy = VecTy.getScalarType();
2091 SmallVector<Register, 2> InsRegs(OpdMapper.getVRegs(2));
2092 unsigned NumLanes = InsRegs.size();
2093 if (!NumLanes) {
2094 NumLanes = 1;
2095 InsRegs.push_back(MI.getOperand(2).getReg());
2096 } else {
2097 EltTy = MRI.getType(InsRegs[0]);
2098 }
2099
2100 auto UnmergeToEltTy = B.buildUnmerge(EltTy, VecReg);
2101 SmallVector<Register, 16> Ops(NumElem * NumLanes);
2102
2103 for (unsigned I = 0; I < NumElem; ++I) {
2104 auto IC = B.buildConstant(S32, I);
2105 MRI.setRegBank(IC->getOperand(0).getReg(), AMDGPU::SGPRRegBank);
2106 auto Cmp = B.buildICmp(CmpInst::ICMP_EQ, CCTy, Idx, IC);
2107 MRI.setRegBank(Cmp->getOperand(0).getReg(), CCBank);
2108
2109 for (unsigned L = 0; L < NumLanes; ++L) {
2110 Register Op0 = constrainRegToBank(MRI, B, InsRegs[L], DstBank);
2111 Register Op1 = UnmergeToEltTy.getReg(I * NumLanes + L);
2112 Op1 = constrainRegToBank(MRI, B, Op1, DstBank);
2113
2114 Register Select = B.buildSelect(EltTy, Cmp, Op0, Op1).getReg(0);
2115 MRI.setRegBank(Select, DstBank);
2116
2117 Ops[I * NumLanes + L] = Select;
2118 }
2119 }
2120
2121 LLT MergeTy = LLT::fixed_vector(Ops.size(), EltTy);
2122 if (MergeTy == MRI.getType(MI.getOperand(0).getReg())) {
2123 B.buildBuildVector(MI.getOperand(0), Ops);
2124 } else {
2125 auto Vec = B.buildBuildVector(MergeTy, Ops);
2126 MRI.setRegBank(Vec->getOperand(0).getReg(), DstBank);
2127 B.buildBitcast(MI.getOperand(0).getReg(), Vec);
2128 }
2129
2130 MRI.setRegBank(MI.getOperand(0).getReg(), DstBank);
2131 MI.eraseFromParent();
2132
2133 return true;
2134 }
2135
2136 void AMDGPURegisterBankInfo::applyMappingImpl(
2137 const OperandsMapper &OpdMapper) const {
2138 MachineInstr &MI = OpdMapper.getMI();
2139 unsigned Opc = MI.getOpcode();
2140 MachineRegisterInfo &MRI = OpdMapper.getMRI();
2141 switch (Opc) {
2142 case AMDGPU::G_PHI: {
2143 Register DstReg = MI.getOperand(0).getReg();
2144 LLT DstTy = MRI.getType(DstReg);
2145 if (DstTy != LLT::scalar(1))
2146 break;
2147
2148 const LLT S32 = LLT::scalar(32);
2149 const RegisterBank *DstBank =
2150 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2151 if (DstBank == &AMDGPU::VCCRegBank) {
2152 applyDefaultMapping(OpdMapper);
2153 // The standard handling only considers the result register bank for
2154 // phis. For VCC, blindly inserting a copy when the phi is lowered will
2155 // produce an invalid copy. We can only copy with some kind of compare to
2156 // get a vector boolean result. Insert a register bank copy that will be
2157 // correctly lowered to a compare.
2158 MachineIRBuilder B(*MI.getParent()->getParent());
2159
2160 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
2161 Register SrcReg = MI.getOperand(I).getReg();
2162 const RegisterBank *SrcBank = getRegBank(SrcReg, MRI, *TRI);
2163
2164 if (SrcBank != &AMDGPU::VCCRegBank) {
2165 MachineBasicBlock *SrcMBB = MI.getOperand(I + 1).getMBB();
2166 B.setInsertPt(*SrcMBB, SrcMBB->getFirstTerminator());
2167
2168 auto Copy = B.buildCopy(LLT::scalar(1), SrcReg);
2169 MRI.setRegBank(Copy.getReg(0), AMDGPU::VCCRegBank);
2170 MI.getOperand(I).setReg(Copy.getReg(0));
2171 }
2172 }
2173
2174 return;
2175 }
2176
2177 // Phi handling is strange and only considers the bank of the destination.
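// Illustration (not in the original source): an s1 phi whose destination is
// in the SGPR or VGPR bank is rewritten below as an s32 phi, matching the
// convention that non-VCC booleans live in 32-bit registers; only a VCC-bank
// phi keeps the s1 type.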
2178 substituteSimpleCopyRegs(OpdMapper, 0); 2179 2180 // Promote SGPR/VGPR booleans to s32 2181 MachineFunction *MF = MI.getParent()->getParent(); 2182 ApplyRegBankMapping ApplyBank(*this, MRI, DstBank); 2183 MachineIRBuilder B(MI, ApplyBank); 2184 LegalizerHelper Helper(*MF, ApplyBank, B); 2185 2186 if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized) 2187 llvm_unreachable("widen scalar should have succeeded"); 2188 2189 return; 2190 } 2191 case AMDGPU::G_ICMP: 2192 case AMDGPU::G_UADDO: 2193 case AMDGPU::G_USUBO: 2194 case AMDGPU::G_UADDE: 2195 case AMDGPU::G_SADDE: 2196 case AMDGPU::G_USUBE: 2197 case AMDGPU::G_SSUBE: { 2198 unsigned BoolDstOp = Opc == AMDGPU::G_ICMP ? 0 : 1; 2199 Register DstReg = MI.getOperand(BoolDstOp).getReg(); 2200 2201 const RegisterBank *DstBank = 2202 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; 2203 if (DstBank != &AMDGPU::SGPRRegBank) 2204 break; 2205 2206 const bool HasCarryIn = MI.getNumOperands() == 5; 2207 2208 // If this is a scalar compare, promote the result to s32, as the selection 2209 // will end up using a copy to a 32-bit vreg. 2210 const LLT S32 = LLT::scalar(32); 2211 Register NewDstReg = MRI.createGenericVirtualRegister(S32); 2212 MRI.setRegBank(NewDstReg, AMDGPU::SGPRRegBank); 2213 MI.getOperand(BoolDstOp).setReg(NewDstReg); 2214 MachineIRBuilder B(MI); 2215 2216 if (HasCarryIn) { 2217 Register NewSrcReg = MRI.createGenericVirtualRegister(S32); 2218 MRI.setRegBank(NewSrcReg, AMDGPU::SGPRRegBank); 2219 B.buildZExt(NewSrcReg, MI.getOperand(4).getReg()); 2220 MI.getOperand(4).setReg(NewSrcReg); 2221 } 2222 2223 MachineBasicBlock *MBB = MI.getParent(); 2224 B.setInsertPt(*MBB, std::next(MI.getIterator())); 2225 2226 // If we had a constrained VCC result register, a copy was inserted to VCC 2227 // from SGPR. 2228 SmallVector<Register, 1> DefRegs(OpdMapper.getVRegs(0)); 2229 if (DefRegs.empty()) 2230 DefRegs.push_back(DstReg); 2231 B.buildTrunc(DefRegs[0], NewDstReg); 2232 return; 2233 } 2234 case AMDGPU::G_SELECT: { 2235 Register DstReg = MI.getOperand(0).getReg(); 2236 LLT DstTy = MRI.getType(DstReg); 2237 2238 SmallVector<Register, 1> CondRegs(OpdMapper.getVRegs(1)); 2239 if (CondRegs.empty()) 2240 CondRegs.push_back(MI.getOperand(1).getReg()); 2241 else { 2242 assert(CondRegs.size() == 1); 2243 } 2244 2245 const RegisterBank *CondBank = getRegBank(CondRegs[0], MRI, *TRI); 2246 if (CondBank == &AMDGPU::SGPRRegBank) { 2247 MachineIRBuilder B(MI); 2248 const LLT S32 = LLT::scalar(32); 2249 Register NewCondReg = MRI.createGenericVirtualRegister(S32); 2250 MRI.setRegBank(NewCondReg, AMDGPU::SGPRRegBank); 2251 2252 MI.getOperand(1).setReg(NewCondReg); 2253 B.buildZExt(NewCondReg, CondRegs[0]); 2254 } 2255 2256 if (DstTy.getSizeInBits() != 64) 2257 break; 2258 2259 MachineIRBuilder B(MI); 2260 LLT HalfTy = getHalfSizedType(DstTy); 2261 2262 SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0)); 2263 SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2)); 2264 SmallVector<Register, 2> Src2Regs(OpdMapper.getVRegs(3)); 2265 2266 // All inputs are SGPRs, nothing special to do. 
2267 if (DefRegs.empty()) { 2268 assert(Src1Regs.empty() && Src2Regs.empty()); 2269 break; 2270 } 2271 2272 if (Src1Regs.empty()) 2273 split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg()); 2274 else { 2275 setRegsToType(MRI, Src1Regs, HalfTy); 2276 } 2277 2278 if (Src2Regs.empty()) 2279 split64BitValueForMapping(B, Src2Regs, HalfTy, MI.getOperand(3).getReg()); 2280 else 2281 setRegsToType(MRI, Src2Regs, HalfTy); 2282 2283 setRegsToType(MRI, DefRegs, HalfTy); 2284 2285 B.buildSelect(DefRegs[0], CondRegs[0], Src1Regs[0], Src2Regs[0]); 2286 B.buildSelect(DefRegs[1], CondRegs[0], Src1Regs[1], Src2Regs[1]); 2287 2288 MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank); 2289 MI.eraseFromParent(); 2290 return; 2291 } 2292 case AMDGPU::G_BRCOND: { 2293 Register CondReg = MI.getOperand(0).getReg(); 2294 // FIXME: Should use legalizer helper, but should change bool ext type. 2295 const RegisterBank *CondBank = 2296 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; 2297 2298 if (CondBank == &AMDGPU::SGPRRegBank) { 2299 MachineIRBuilder B(MI); 2300 const LLT S32 = LLT::scalar(32); 2301 Register NewCondReg = MRI.createGenericVirtualRegister(S32); 2302 MRI.setRegBank(NewCondReg, AMDGPU::SGPRRegBank); 2303 2304 MI.getOperand(0).setReg(NewCondReg); 2305 B.buildZExt(NewCondReg, CondReg); 2306 return; 2307 } 2308 2309 break; 2310 } 2311 case AMDGPU::G_AND: 2312 case AMDGPU::G_OR: 2313 case AMDGPU::G_XOR: { 2314 // 64-bit and is only available on the SALU, so split into 2 32-bit ops if 2315 // there is a VGPR input. 2316 Register DstReg = MI.getOperand(0).getReg(); 2317 LLT DstTy = MRI.getType(DstReg); 2318 2319 if (DstTy.getSizeInBits() == 1) { 2320 const RegisterBank *DstBank = 2321 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; 2322 if (DstBank == &AMDGPU::VCCRegBank) 2323 break; 2324 2325 MachineFunction *MF = MI.getParent()->getParent(); 2326 ApplyRegBankMapping ApplyBank(*this, MRI, DstBank); 2327 MachineIRBuilder B(MI, ApplyBank); 2328 LegalizerHelper Helper(*MF, ApplyBank, B); 2329 2330 if (Helper.widenScalar(MI, 0, LLT::scalar(32)) != 2331 LegalizerHelper::Legalized) 2332 llvm_unreachable("widen scalar should have succeeded"); 2333 return; 2334 } 2335 2336 if (DstTy.getSizeInBits() != 64) 2337 break; 2338 2339 LLT HalfTy = getHalfSizedType(DstTy); 2340 SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0)); 2341 SmallVector<Register, 2> Src0Regs(OpdMapper.getVRegs(1)); 2342 SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2)); 2343 2344 // All inputs are SGPRs, nothing special to do. 2345 if (DefRegs.empty()) { 2346 assert(Src0Regs.empty() && Src1Regs.empty()); 2347 break; 2348 } 2349 2350 assert(DefRegs.size() == 2); 2351 assert(Src0Regs.size() == Src1Regs.size() && 2352 (Src0Regs.empty() || Src0Regs.size() == 2)); 2353 2354 // Depending on where the source registers came from, the generic code may 2355 // have decided to split the inputs already or not. If not, we still need to 2356 // extract the values. 
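// For instance (illustrative), a 64-bit G_XOR with one VGPR input becomes
// two 32-bit G_XORs over the unmerged low and high halves built below.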
2357 MachineIRBuilder B(MI); 2358 2359 if (Src0Regs.empty()) 2360 split64BitValueForMapping(B, Src0Regs, HalfTy, MI.getOperand(1).getReg()); 2361 else 2362 setRegsToType(MRI, Src0Regs, HalfTy); 2363 2364 if (Src1Regs.empty()) 2365 split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg()); 2366 else 2367 setRegsToType(MRI, Src1Regs, HalfTy); 2368 2369 setRegsToType(MRI, DefRegs, HalfTy); 2370 2371 B.buildInstr(Opc, {DefRegs[0]}, {Src0Regs[0], Src1Regs[0]}); 2372 B.buildInstr(Opc, {DefRegs[1]}, {Src0Regs[1], Src1Regs[1]}); 2373 2374 MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank); 2375 MI.eraseFromParent(); 2376 return; 2377 } 2378 case AMDGPU::G_ABS: { 2379 Register SrcReg = MI.getOperand(1).getReg(); 2380 const RegisterBank *SrcBank = MRI.getRegBankOrNull(SrcReg); 2381 2382 // There is no VALU abs instruction so we need to replace it with a sub and 2383 // max combination. 2384 if (SrcBank && SrcBank == &AMDGPU::VGPRRegBank) { 2385 MachineFunction *MF = MI.getParent()->getParent(); 2386 ApplyRegBankMapping Apply(*this, MRI, &AMDGPU::VGPRRegBank); 2387 MachineIRBuilder B(MI, Apply); 2388 LegalizerHelper Helper(*MF, Apply, B); 2389 2390 if (Helper.lowerAbsToMaxNeg(MI) != LegalizerHelper::Legalized) 2391 llvm_unreachable("lowerAbsToMaxNeg should have succeeded"); 2392 return; 2393 } 2394 LLVM_FALLTHROUGH; 2395 } 2396 case AMDGPU::G_ADD: 2397 case AMDGPU::G_SUB: 2398 case AMDGPU::G_MUL: 2399 case AMDGPU::G_SHL: 2400 case AMDGPU::G_LSHR: 2401 case AMDGPU::G_ASHR: 2402 case AMDGPU::G_SMIN: 2403 case AMDGPU::G_SMAX: 2404 case AMDGPU::G_UMIN: 2405 case AMDGPU::G_UMAX: { 2406 Register DstReg = MI.getOperand(0).getReg(); 2407 LLT DstTy = MRI.getType(DstReg); 2408 2409 // 16-bit operations are VALU only, but can be promoted to 32-bit SALU. 2410 // Packed 16-bit operations need to be scalarized and promoted. 2411 if (DstTy != LLT::scalar(16) && DstTy != LLT::fixed_vector(2, 16)) 2412 break; 2413 2414 const RegisterBank *DstBank = 2415 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; 2416 if (DstBank == &AMDGPU::VGPRRegBank) 2417 break; 2418 2419 const LLT S32 = LLT::scalar(32); 2420 MachineBasicBlock *MBB = MI.getParent(); 2421 MachineFunction *MF = MBB->getParent(); 2422 ApplyRegBankMapping ApplySALU(*this, MRI, &AMDGPU::SGPRRegBank); 2423 MachineIRBuilder B(MI, ApplySALU); 2424 2425 if (DstTy.isVector()) { 2426 Register WideSrc0Lo, WideSrc0Hi; 2427 Register WideSrc1Lo, WideSrc1Hi; 2428 2429 unsigned ExtendOp = getExtendOp(MI.getOpcode()); 2430 std::tie(WideSrc0Lo, WideSrc0Hi) 2431 = unpackV2S16ToS32(B, MI.getOperand(1).getReg(), ExtendOp); 2432 std::tie(WideSrc1Lo, WideSrc1Hi) 2433 = unpackV2S16ToS32(B, MI.getOperand(2).getReg(), ExtendOp); 2434 auto Lo = B.buildInstr(MI.getOpcode(), {S32}, {WideSrc0Lo, WideSrc1Lo}); 2435 auto Hi = B.buildInstr(MI.getOpcode(), {S32}, {WideSrc0Hi, WideSrc1Hi}); 2436 B.buildBuildVectorTrunc(DstReg, {Lo.getReg(0), Hi.getReg(0)}); 2437 MI.eraseFromParent(); 2438 } else { 2439 LegalizerHelper Helper(*MF, ApplySALU, B); 2440 2441 if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized) 2442 llvm_unreachable("widen scalar should have succeeded"); 2443 2444 // FIXME: s16 shift amounts should be legal. 
2445 if (Opc == AMDGPU::G_SHL || Opc == AMDGPU::G_LSHR ||
2446 Opc == AMDGPU::G_ASHR) {
2447 B.setInsertPt(*MBB, MI.getIterator());
2448 if (Helper.widenScalar(MI, 1, S32) != LegalizerHelper::Legalized)
2449 llvm_unreachable("widen scalar should have succeeded");
2450 }
2451 }
2452
2453 return;
2454 }
2455 case AMDGPU::G_SEXT_INREG: {
2456 SmallVector<Register, 2> SrcRegs(OpdMapper.getVRegs(1));
2457 if (SrcRegs.empty())
2458 break; // Nothing to repair
2459
2460 const LLT S32 = LLT::scalar(32);
2461 MachineIRBuilder B(MI);
2462 ApplyRegBankMapping O(*this, MRI, &AMDGPU::VGPRRegBank);
2463 GISelObserverWrapper Observer(&O);
2464 B.setChangeObserver(Observer);
2465
2466 // Don't use LegalizerHelper's narrowScalar. It produces unwanted G_SEXTs
2467 // we would need to further expand, and doesn't let us directly set the
2468 // result registers.
2469 SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0));
2470
2471 int Amt = MI.getOperand(2).getImm();
2472 if (Amt <= 32) {
2473 if (Amt == 32) {
2474 // The low bits are unchanged.
2475 B.buildCopy(DstRegs[0], SrcRegs[0]);
2476 } else {
2477 // Extend in the low bits and propagate the sign bit to the high half.
2478 B.buildSExtInReg(DstRegs[0], SrcRegs[0], Amt);
2479 }
2480
2481 B.buildAShr(DstRegs[1], DstRegs[0], B.buildConstant(S32, 31));
2482 } else {
2483 // The low bits are unchanged, and the extension happens in the high bits.
2484 B.buildCopy(DstRegs[0], SrcRegs[0]);
2485 B.buildSExtInReg(DstRegs[1], DstRegs[0], Amt - 32);
2486 }
2487
2488 Register DstReg = MI.getOperand(0).getReg();
2489 MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
2490 MI.eraseFromParent();
2491 return;
2492 }
2493 case AMDGPU::G_CTPOP:
2494 case AMDGPU::G_BITREVERSE:
2495 case AMDGPU::G_CTLZ_ZERO_UNDEF:
2496 case AMDGPU::G_CTTZ_ZERO_UNDEF: {
2497 const RegisterBank *DstBank =
2498 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2499 if (DstBank == &AMDGPU::SGPRRegBank)
2500 break;
2501
2502 Register SrcReg = MI.getOperand(1).getReg();
2503 const LLT S32 = LLT::scalar(32);
2504 LLT Ty = MRI.getType(SrcReg);
2505 if (Ty == S32)
2506 break;
2507
2508 ApplyRegBankMapping ApplyVALU(*this, MRI, &AMDGPU::VGPRRegBank);
2509 MachineIRBuilder B(MI, ApplyVALU);
2510
2511 MachineFunction &MF = B.getMF();
2512 LegalizerHelper Helper(MF, ApplyVALU, B);
2513
2514 if (Helper.narrowScalar(MI, 1, S32) != LegalizerHelper::Legalized)
2515 llvm_unreachable("narrowScalar should have succeeded");
2516 return;
2517 }
2518 case AMDGPU::G_SEXT:
2519 case AMDGPU::G_ZEXT:
2520 case AMDGPU::G_ANYEXT: {
2521 Register SrcReg = MI.getOperand(1).getReg();
2522 LLT SrcTy = MRI.getType(SrcReg);
2523 const bool Signed = Opc == AMDGPU::G_SEXT;
2524
2525 assert(empty(OpdMapper.getVRegs(1)));
2526
2527 MachineIRBuilder B(MI);
2528 const RegisterBank *SrcBank =
2529 OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
2530
2531 Register DstReg = MI.getOperand(0).getReg();
2532 LLT DstTy = MRI.getType(DstReg);
2533 if (DstTy.isScalar() &&
2534 SrcBank != &AMDGPU::SGPRRegBank &&
2535 SrcBank != &AMDGPU::VCCRegBank &&
2536 // FIXME: Should handle any type that rounds to s64 when irregular
2537 // breakdowns are supported.
2538 DstTy.getSizeInBits() == 64 &&
2539 SrcTy.getSizeInBits() <= 32) {
2540 SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
2541
2542 // Extend to 32-bit, and then extend the low half into the high half.
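// e.g. (illustrative) an s16 -> s64 G_SEXT: the source is sign-extended
// into the 32-bit low half here, and the high half is then filled from the
// low half's sign bit by extendLow32IntoHigh32.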
2543 if (Signed) { 2544 // TODO: Should really be buildSExtOrCopy 2545 B.buildSExtOrTrunc(DefRegs[0], SrcReg); 2546 } else if (Opc == AMDGPU::G_ZEXT) { 2547 B.buildZExtOrTrunc(DefRegs[0], SrcReg); 2548 } else { 2549 B.buildAnyExtOrTrunc(DefRegs[0], SrcReg); 2550 } 2551 2552 extendLow32IntoHigh32(B, DefRegs[1], DefRegs[0], Opc, *SrcBank); 2553 MRI.setRegBank(DstReg, *SrcBank); 2554 MI.eraseFromParent(); 2555 return; 2556 } 2557 2558 if (SrcTy != LLT::scalar(1)) 2559 return; 2560 2561 // It is not legal to have a legalization artifact with a VCC source. Rather 2562 // than introducing a copy, insert the select we would have to select the 2563 // copy to. 2564 if (SrcBank == &AMDGPU::VCCRegBank) { 2565 SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0)); 2566 2567 const RegisterBank *DstBank = &AMDGPU::VGPRRegBank; 2568 2569 unsigned DstSize = DstTy.getSizeInBits(); 2570 // 64-bit select is SGPR only 2571 const bool UseSel64 = DstSize > 32 && 2572 SrcBank->getID() == AMDGPU::SGPRRegBankID; 2573 2574 // TODO: Should s16 select be legal? 2575 LLT SelType = UseSel64 ? LLT::scalar(64) : LLT::scalar(32); 2576 auto True = B.buildConstant(SelType, Signed ? -1 : 1); 2577 auto False = B.buildConstant(SelType, 0); 2578 2579 MRI.setRegBank(True.getReg(0), *DstBank); 2580 MRI.setRegBank(False.getReg(0), *DstBank); 2581 MRI.setRegBank(DstReg, *DstBank); 2582 2583 if (DstSize > 32) { 2584 B.buildSelect(DefRegs[0], SrcReg, True, False); 2585 extendLow32IntoHigh32(B, DefRegs[1], DefRegs[0], Opc, *SrcBank, true); 2586 } else if (DstSize < 32) { 2587 auto Sel = B.buildSelect(SelType, SrcReg, True, False); 2588 MRI.setRegBank(Sel.getReg(0), *DstBank); 2589 B.buildTrunc(DstReg, Sel); 2590 } else { 2591 B.buildSelect(DstReg, SrcReg, True, False); 2592 } 2593 2594 MI.eraseFromParent(); 2595 return; 2596 } 2597 2598 break; 2599 } 2600 case AMDGPU::G_BUILD_VECTOR: 2601 case AMDGPU::G_BUILD_VECTOR_TRUNC: { 2602 Register DstReg = MI.getOperand(0).getReg(); 2603 LLT DstTy = MRI.getType(DstReg); 2604 if (DstTy != LLT::fixed_vector(2, 16)) 2605 break; 2606 2607 assert(MI.getNumOperands() == 3 && OpdMapper.getVRegs(0).empty()); 2608 substituteSimpleCopyRegs(OpdMapper, 1); 2609 substituteSimpleCopyRegs(OpdMapper, 2); 2610 2611 const RegisterBank *DstBank = 2612 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; 2613 if (DstBank == &AMDGPU::SGPRRegBank) 2614 break; // Can use S_PACK_* instructions. 
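// Illustration of the VGPR expansion below (hypothetical inputs): for a
// G_BUILD_VECTOR of (lo, hi), it computes zext(lo) | (zext(hi) << 16) and
// bitcasts the 32-bit result to <2 x s16>.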
2615 2616 MachineIRBuilder B(MI); 2617 2618 Register Lo = MI.getOperand(1).getReg(); 2619 Register Hi = MI.getOperand(2).getReg(); 2620 const LLT S32 = LLT::scalar(32); 2621 2622 const RegisterBank *BankLo = 2623 OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank; 2624 const RegisterBank *BankHi = 2625 OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank; 2626 2627 Register ZextLo; 2628 Register ShiftHi; 2629 2630 if (Opc == AMDGPU::G_BUILD_VECTOR) { 2631 ZextLo = B.buildZExt(S32, Lo).getReg(0); 2632 MRI.setRegBank(ZextLo, *BankLo); 2633 2634 Register ZextHi = B.buildZExt(S32, Hi).getReg(0); 2635 MRI.setRegBank(ZextHi, *BankHi); 2636 2637 auto ShiftAmt = B.buildConstant(S32, 16); 2638 MRI.setRegBank(ShiftAmt.getReg(0), *BankHi); 2639 2640 ShiftHi = B.buildShl(S32, ZextHi, ShiftAmt).getReg(0); 2641 MRI.setRegBank(ShiftHi, *BankHi); 2642 } else { 2643 Register MaskLo = B.buildConstant(S32, 0xffff).getReg(0); 2644 MRI.setRegBank(MaskLo, *BankLo); 2645 2646 auto ShiftAmt = B.buildConstant(S32, 16); 2647 MRI.setRegBank(ShiftAmt.getReg(0), *BankHi); 2648 2649 ShiftHi = B.buildShl(S32, Hi, ShiftAmt).getReg(0); 2650 MRI.setRegBank(ShiftHi, *BankHi); 2651 2652 ZextLo = B.buildAnd(S32, Lo, MaskLo).getReg(0); 2653 MRI.setRegBank(ZextLo, *BankLo); 2654 } 2655 2656 auto Or = B.buildOr(S32, ZextLo, ShiftHi); 2657 MRI.setRegBank(Or.getReg(0), *DstBank); 2658 2659 B.buildBitcast(DstReg, Or); 2660 MI.eraseFromParent(); 2661 return; 2662 } 2663 case AMDGPU::G_EXTRACT_VECTOR_ELT: { 2664 SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0)); 2665 2666 assert(OpdMapper.getVRegs(1).empty() && OpdMapper.getVRegs(2).empty()); 2667 2668 Register DstReg = MI.getOperand(0).getReg(); 2669 Register SrcReg = MI.getOperand(1).getReg(); 2670 2671 const LLT S32 = LLT::scalar(32); 2672 LLT DstTy = MRI.getType(DstReg); 2673 LLT SrcTy = MRI.getType(SrcReg); 2674 2675 if (foldExtractEltToCmpSelect(MI, MRI, OpdMapper)) 2676 return; 2677 2678 MachineIRBuilder B(MI); 2679 2680 const ValueMapping &DstMapping 2681 = OpdMapper.getInstrMapping().getOperandMapping(0); 2682 const RegisterBank *DstBank = DstMapping.BreakDown[0].RegBank; 2683 const RegisterBank *SrcBank = 2684 OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank; 2685 const RegisterBank *IdxBank = 2686 OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank; 2687 2688 Register BaseIdxReg; 2689 unsigned ConstOffset; 2690 std::tie(BaseIdxReg, ConstOffset) = 2691 AMDGPU::getBaseWithConstantOffset(MRI, MI.getOperand(2).getReg()); 2692 2693 // See if the index is an add of a constant which will be foldable by moving 2694 // the base register of the index later if this is going to be executed in a 2695 // waterfall loop. This is essentially to reassociate the add of a constant 2696 // with the readfirstlane. 2697 bool ShouldMoveIndexIntoLoop = IdxBank != &AMDGPU::SGPRRegBank && 2698 ConstOffset > 0 && 2699 ConstOffset < SrcTy.getNumElements(); 2700 2701 // Move the base register. We'll re-insert the add later. 2702 if (ShouldMoveIndexIntoLoop) 2703 MI.getOperand(2).setReg(BaseIdxReg); 2704 2705 // If this is a VGPR result only because the index was a VGPR result, the 2706 // actual indexing will be done on the SGPR source vector, which will 2707 // produce a scalar result. We need to copy to the VGPR result inside the 2708 // waterfall loop. 
2709 const bool NeedCopyToVGPR = DstBank == &AMDGPU::VGPRRegBank && 2710 SrcBank == &AMDGPU::SGPRRegBank; 2711 if (DstRegs.empty()) { 2712 applyDefaultMapping(OpdMapper); 2713 2714 executeInWaterfallLoop(MI, MRI, { 2 }); 2715 2716 if (NeedCopyToVGPR) { 2717 // We don't want a phi for this temporary reg. 2718 Register TmpReg = MRI.createGenericVirtualRegister(DstTy); 2719 MRI.setRegBank(TmpReg, AMDGPU::SGPRRegBank); 2720 MI.getOperand(0).setReg(TmpReg); 2721 B.setInsertPt(*MI.getParent(), ++MI.getIterator()); 2722 2723 // Use a v_mov_b32 here to make the exec dependency explicit. 2724 buildVCopy(B, DstReg, TmpReg); 2725 } 2726 2727 // Re-insert the constant offset add inside the waterfall loop. 2728 if (ShouldMoveIndexIntoLoop) 2729 reinsertVectorIndexAdd(B, MI, 2, ConstOffset); 2730 2731 return; 2732 } 2733 2734 assert(DstTy.getSizeInBits() == 64); 2735 2736 LLT Vec32 = LLT::fixed_vector(2 * SrcTy.getNumElements(), 32); 2737 2738 auto CastSrc = B.buildBitcast(Vec32, SrcReg); 2739 auto One = B.buildConstant(S32, 1); 2740 2741 MachineBasicBlock::iterator MII = MI.getIterator(); 2742 2743 // Split the vector index into 32-bit pieces. Prepare to move all of the 2744 // new instructions into a waterfall loop if necessary. 2745 // 2746 // Don't put the bitcast or constant in the loop. 2747 MachineInstrSpan Span(MII, &B.getMBB()); 2748 2749 // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1). 2750 auto IdxLo = B.buildShl(S32, BaseIdxReg, One); 2751 auto IdxHi = B.buildAdd(S32, IdxLo, One); 2752 2753 auto Extract0 = B.buildExtractVectorElement(DstRegs[0], CastSrc, IdxLo); 2754 auto Extract1 = B.buildExtractVectorElement(DstRegs[1], CastSrc, IdxHi); 2755 2756 MRI.setRegBank(DstReg, *DstBank); 2757 MRI.setRegBank(CastSrc.getReg(0), *SrcBank); 2758 MRI.setRegBank(One.getReg(0), AMDGPU::SGPRRegBank); 2759 MRI.setRegBank(IdxLo.getReg(0), AMDGPU::SGPRRegBank); 2760 MRI.setRegBank(IdxHi.getReg(0), AMDGPU::SGPRRegBank); 2761 2762 SmallSet<Register, 4> OpsToWaterfall; 2763 if (!collectWaterfallOperands(OpsToWaterfall, MI, MRI, { 2 })) { 2764 MI.eraseFromParent(); 2765 return; 2766 } 2767 2768 // Remove the original instruction to avoid potentially confusing the 2769 // waterfall loop logic. 
2770 B.setInstr(*Span.begin()); 2771 MI.eraseFromParent(); 2772 executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()), 2773 OpsToWaterfall, MRI); 2774 2775 if (NeedCopyToVGPR) { 2776 MachineBasicBlock *LoopBB = Extract1->getParent(); 2777 Register TmpReg0 = MRI.createGenericVirtualRegister(S32); 2778 Register TmpReg1 = MRI.createGenericVirtualRegister(S32); 2779 MRI.setRegBank(TmpReg0, AMDGPU::SGPRRegBank); 2780 MRI.setRegBank(TmpReg1, AMDGPU::SGPRRegBank); 2781 2782 Extract0->getOperand(0).setReg(TmpReg0); 2783 Extract1->getOperand(0).setReg(TmpReg1); 2784 2785 B.setInsertPt(*LoopBB, ++Extract1->getIterator()); 2786 2787 buildVCopy(B, DstRegs[0], TmpReg0); 2788 buildVCopy(B, DstRegs[1], TmpReg1); 2789 } 2790 2791 if (ShouldMoveIndexIntoLoop) 2792 reinsertVectorIndexAdd(B, *IdxLo, 1, ConstOffset); 2793 2794 return; 2795 } 2796 case AMDGPU::G_INSERT_VECTOR_ELT: { 2797 SmallVector<Register, 2> InsRegs(OpdMapper.getVRegs(2)); 2798 2799 Register DstReg = MI.getOperand(0).getReg(); 2800 LLT VecTy = MRI.getType(DstReg); 2801 2802 assert(OpdMapper.getVRegs(0).empty()); 2803 assert(OpdMapper.getVRegs(3).empty()); 2804 2805 if (substituteSimpleCopyRegs(OpdMapper, 1)) 2806 MRI.setType(MI.getOperand(1).getReg(), VecTy); 2807 2808 if (foldInsertEltToCmpSelect(MI, MRI, OpdMapper)) 2809 return; 2810 2811 const RegisterBank *IdxBank = 2812 OpdMapper.getInstrMapping().getOperandMapping(3).BreakDown[0].RegBank; 2813 2814 Register SrcReg = MI.getOperand(1).getReg(); 2815 Register InsReg = MI.getOperand(2).getReg(); 2816 LLT InsTy = MRI.getType(InsReg); 2817 (void)InsTy; 2818 2819 Register BaseIdxReg; 2820 unsigned ConstOffset; 2821 std::tie(BaseIdxReg, ConstOffset) = 2822 AMDGPU::getBaseWithConstantOffset(MRI, MI.getOperand(3).getReg()); 2823 2824 // See if the index is an add of a constant which will be foldable by moving 2825 // the base register of the index later if this is going to be executed in a 2826 // waterfall loop. This is essentially to reassociate the add of a constant 2827 // with the readfirstlane. 2828 bool ShouldMoveIndexIntoLoop = IdxBank != &AMDGPU::SGPRRegBank && 2829 ConstOffset > 0 && 2830 ConstOffset < VecTy.getNumElements(); 2831 2832 // Move the base register. We'll re-insert the add later. 2833 if (ShouldMoveIndexIntoLoop) 2834 MI.getOperand(3).setReg(BaseIdxReg); 2835 2836 2837 if (InsRegs.empty()) { 2838 executeInWaterfallLoop(MI, MRI, { 3 }); 2839 2840 // Re-insert the constant offset add inside the waterfall loop. 2841 if (ShouldMoveIndexIntoLoop) { 2842 MachineIRBuilder B(MI); 2843 reinsertVectorIndexAdd(B, MI, 3, ConstOffset); 2844 } 2845 2846 return; 2847 } 2848 2849 2850 assert(InsTy.getSizeInBits() == 64); 2851 2852 const LLT S32 = LLT::scalar(32); 2853 LLT Vec32 = LLT::fixed_vector(2 * VecTy.getNumElements(), 32); 2854 2855 MachineIRBuilder B(MI); 2856 auto CastSrc = B.buildBitcast(Vec32, SrcReg); 2857 auto One = B.buildConstant(S32, 1); 2858 2859 // Split the vector index into 32-bit pieces. Prepare to move all of the 2860 // new instructions into a waterfall loop if necessary. 2861 // 2862 // Don't put the bitcast or constant in the loop. 2863 MachineInstrSpan Span(MachineBasicBlock::iterator(&MI), &B.getMBB()); 2864 2865 // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1). 
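// (Illustrative: a dynamic index 5 into <8 x s64> becomes indices 10 and 11
// into the bitcast <16 x s32>.)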
2866 auto IdxLo = B.buildShl(S32, BaseIdxReg, One); 2867 auto IdxHi = B.buildAdd(S32, IdxLo, One); 2868 2869 auto InsLo = B.buildInsertVectorElement(Vec32, CastSrc, InsRegs[0], IdxLo); 2870 auto InsHi = B.buildInsertVectorElement(Vec32, InsLo, InsRegs[1], IdxHi); 2871 2872 const RegisterBank *DstBank = 2873 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; 2874 const RegisterBank *SrcBank = 2875 OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank; 2876 const RegisterBank *InsSrcBank = 2877 OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank; 2878 2879 MRI.setRegBank(InsReg, *InsSrcBank); 2880 MRI.setRegBank(CastSrc.getReg(0), *SrcBank); 2881 MRI.setRegBank(InsLo.getReg(0), *DstBank); 2882 MRI.setRegBank(InsHi.getReg(0), *DstBank); 2883 MRI.setRegBank(One.getReg(0), AMDGPU::SGPRRegBank); 2884 MRI.setRegBank(IdxLo.getReg(0), AMDGPU::SGPRRegBank); 2885 MRI.setRegBank(IdxHi.getReg(0), AMDGPU::SGPRRegBank); 2886 2887 2888 SmallSet<Register, 4> OpsToWaterfall; 2889 if (!collectWaterfallOperands(OpsToWaterfall, MI, MRI, { 3 })) { 2890 B.setInsertPt(B.getMBB(), MI); 2891 B.buildBitcast(DstReg, InsHi); 2892 MI.eraseFromParent(); 2893 return; 2894 } 2895 2896 B.setInstr(*Span.begin()); 2897 MI.eraseFromParent(); 2898 2899 // Figure out the point after the waterfall loop before mangling the control 2900 // flow. 2901 executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()), 2902 OpsToWaterfall, MRI); 2903 2904 // The insertion point is now right after the original instruction. 2905 // 2906 // Keep the bitcast to the original vector type out of the loop. Doing this 2907 // saved an extra phi we don't need inside the loop. 2908 B.buildBitcast(DstReg, InsHi); 2909 2910 // Re-insert the constant offset add inside the waterfall loop. 
2911 if (ShouldMoveIndexIntoLoop) 2912 reinsertVectorIndexAdd(B, *IdxLo, 1, ConstOffset); 2913 2914 return; 2915 } 2916 case AMDGPU::G_AMDGPU_BUFFER_LOAD: 2917 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT: 2918 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT: 2919 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE: 2920 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE: 2921 case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT: 2922 case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16: 2923 case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT: 2924 case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16: 2925 case AMDGPU::G_AMDGPU_BUFFER_STORE: 2926 case AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE: 2927 case AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT: 2928 case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT: 2929 case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16: 2930 case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT: 2931 case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16: { 2932 applyDefaultMapping(OpdMapper); 2933 executeInWaterfallLoop(MI, MRI, {1, 4}); 2934 return; 2935 } 2936 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP: 2937 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD: 2938 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB: 2939 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN: 2940 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN: 2941 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX: 2942 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX: 2943 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND: 2944 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR: 2945 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR: 2946 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC: 2947 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC: { 2948 applyDefaultMapping(OpdMapper); 2949 executeInWaterfallLoop(MI, MRI, {2, 5}); 2950 return; 2951 } 2952 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD: 2953 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN: 2954 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX: { 2955 applyDefaultMapping(OpdMapper); 2956 executeInWaterfallLoop(MI, MRI, {2, 5}); 2957 return; 2958 } 2959 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP: { 2960 applyDefaultMapping(OpdMapper); 2961 executeInWaterfallLoop(MI, MRI, {3, 6}); 2962 return; 2963 } 2964 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD: { 2965 applyMappingSBufferLoad(OpdMapper); 2966 return; 2967 } 2968 case AMDGPU::G_INTRINSIC: { 2969 switch (MI.getIntrinsicID()) { 2970 case Intrinsic::amdgcn_readlane: { 2971 substituteSimpleCopyRegs(OpdMapper, 2); 2972 2973 assert(OpdMapper.getVRegs(0).empty()); 2974 assert(OpdMapper.getVRegs(3).empty()); 2975 2976 // Make sure the index is an SGPR. It doesn't make sense to run this in a 2977 // waterfall loop, so assume it's a uniform value. 2978 constrainOpWithReadfirstlane(MI, MRI, 3); // Index 2979 return; 2980 } 2981 case Intrinsic::amdgcn_writelane: { 2982 assert(OpdMapper.getVRegs(0).empty()); 2983 assert(OpdMapper.getVRegs(2).empty()); 2984 assert(OpdMapper.getVRegs(3).empty()); 2985 2986 substituteSimpleCopyRegs(OpdMapper, 4); // VGPR input val 2987 constrainOpWithReadfirstlane(MI, MRI, 2); // Source value 2988 constrainOpWithReadfirstlane(MI, MRI, 3); // Index 2989 return; 2990 } 2991 case Intrinsic::amdgcn_interp_p1: 2992 case Intrinsic::amdgcn_interp_p2: 2993 case Intrinsic::amdgcn_interp_mov: 2994 case Intrinsic::amdgcn_interp_p1_f16: 2995 case Intrinsic::amdgcn_interp_p2_f16: { 2996 applyDefaultMapping(OpdMapper); 2997 2998 // Readlane for m0 value, which is always the last operand. 2999 // FIXME: Should this be a waterfall loop instead? 
3000 constrainOpWithReadfirstlane(MI, MRI, MI.getNumOperands() - 1); // Index
3001 return;
3002 }
3003 case Intrinsic::amdgcn_permlane16:
3004 case Intrinsic::amdgcn_permlanex16: {
3005 // Doing a waterfall loop over these wouldn't make any sense.
3006 substituteSimpleCopyRegs(OpdMapper, 2);
3007 substituteSimpleCopyRegs(OpdMapper, 3);
3008 constrainOpWithReadfirstlane(MI, MRI, 4);
3009 constrainOpWithReadfirstlane(MI, MRI, 5);
3010 return;
3011 }
3012 case Intrinsic::amdgcn_sbfe:
3013 applyMappingBFE(OpdMapper, true);
3014 return;
3015 case Intrinsic::amdgcn_ubfe:
3016 applyMappingBFE(OpdMapper, false);
3017 return;
3018 case Intrinsic::amdgcn_ballot:
3019 // Use the default handling and insert a copy to the vcc source.
3020 break;
3021 }
3022 break;
3023 }
3024 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
3025 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE: {
3026 const AMDGPU::RsrcIntrinsic *RSrcIntrin
3027 = AMDGPU::lookupRsrcIntrinsic(MI.getIntrinsicID());
3028 assert(RSrcIntrin && RSrcIntrin->IsImage);
3029 // Non-images can have complications from operands that allow both SGPR
3030 // and VGPR. For now it's too complicated to figure out the final opcode
3031 // to derive the register bank from the MCInstrDesc.
3032 applyMappingImage(MI, OpdMapper, MRI, RSrcIntrin->RsrcArg);
3033 return;
3034 }
3035 case AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY: {
3036 unsigned N = MI.getNumExplicitOperands() - 2;
3037 executeInWaterfallLoop(MI, MRI, { N });
3038 return;
3039 }
3040 case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: {
3041 auto IntrID = MI.getIntrinsicID();
3042 switch (IntrID) {
3043 case Intrinsic::amdgcn_ds_ordered_add:
3044 case Intrinsic::amdgcn_ds_ordered_swap: {
3045 // This is only allowed to execute with 1 lane, so readfirstlane is safe.
3046 assert(OpdMapper.getVRegs(0).empty());
3047 substituteSimpleCopyRegs(OpdMapper, 3);
3048 constrainOpWithReadfirstlane(MI, MRI, 2); // M0
3049 return;
3050 }
3051 case Intrinsic::amdgcn_ds_gws_init:
3052 case Intrinsic::amdgcn_ds_gws_barrier:
3053 case Intrinsic::amdgcn_ds_gws_sema_br: {
3054 // Only the first lane executes, so readfirstlane is safe.
3055 substituteSimpleCopyRegs(OpdMapper, 1);
3056 constrainOpWithReadfirstlane(MI, MRI, 2); // M0
3057 return;
3058 }
3059 case Intrinsic::amdgcn_ds_gws_sema_v:
3060 case Intrinsic::amdgcn_ds_gws_sema_p:
3061 case Intrinsic::amdgcn_ds_gws_sema_release_all: {
3062 // Only the first lane executes, so readfirstlane is safe.
3063 constrainOpWithReadfirstlane(MI, MRI, 1); // M0
3064 return;
3065 }
3066 case Intrinsic::amdgcn_ds_append:
3067 case Intrinsic::amdgcn_ds_consume: {
3068 constrainOpWithReadfirstlane(MI, MRI, 2); // M0
3069 return;
3070 }
3071 case Intrinsic::amdgcn_s_sendmsg:
3072 case Intrinsic::amdgcn_s_sendmsghalt: {
3073 // FIXME: Should this use a waterfall loop?
3074 constrainOpWithReadfirstlane(MI, MRI, 2); // M0
3075 return;
3076 }
3077 case Intrinsic::amdgcn_s_setreg: {
3078 constrainOpWithReadfirstlane(MI, MRI, 2);
3079 return;
3080 }
3081 default: {
3082 if (const AMDGPU::RsrcIntrinsic *RSrcIntrin =
3083 AMDGPU::lookupRsrcIntrinsic(IntrID)) {
3084 // Non-images can have complications from operands that allow both SGPR
3085 // and VGPR. For now it's too complicated to figure out the final opcode
3086 // to derive the register bank from the MCInstrDesc.
3087 if (RSrcIntrin->IsImage) {
3088 applyMappingImage(MI, OpdMapper, MRI, RSrcIntrin->RsrcArg);
3089 return;
3090 }
3091 }
3092
3093 break;
3094 }
3095 }
3096 break;
3097 }
3098 case AMDGPU::G_LOAD:
3099 case AMDGPU::G_ZEXTLOAD:
3100 case AMDGPU::G_SEXTLOAD: {
3101 if (applyMappingLoad(MI, OpdMapper, MRI))
3102 return;
3103 break;
3104 }
3105 case AMDGPU::G_DYN_STACKALLOC:
3106 applyMappingDynStackAlloc(MI, OpdMapper, MRI);
3107 return;
3108 case AMDGPU::G_SBFX:
3109 applyMappingBFE(OpdMapper, /*Signed*/ true);
3110 return;
3111 case AMDGPU::G_UBFX:
3112 applyMappingBFE(OpdMapper, /*Signed*/ false);
3113 return;
3114 default:
3115 break;
3116 }
3117
3118 return applyDefaultMapping(OpdMapper);
3119 }
3120
3121 // vgpr, sgpr -> vgpr
3122 // vgpr, agpr -> vgpr
3123 // agpr, agpr -> agpr
3124 // agpr, sgpr -> vgpr
3125 static unsigned regBankUnion(unsigned RB0, unsigned RB1) {
3126 if (RB0 == AMDGPU::InvalidRegBankID)
3127 return RB1;
3128 if (RB1 == AMDGPU::InvalidRegBankID)
3129 return RB0;
3130
3131 if (RB0 == AMDGPU::SGPRRegBankID && RB1 == AMDGPU::SGPRRegBankID)
3132 return AMDGPU::SGPRRegBankID;
3133
3134 if (RB0 == AMDGPU::AGPRRegBankID && RB1 == AMDGPU::AGPRRegBankID)
3135 return AMDGPU::AGPRRegBankID;
3136
3137 return AMDGPU::VGPRRegBankID;
3138 }
3139
3140 static unsigned regBankBoolUnion(unsigned RB0, unsigned RB1) {
3141 if (RB0 == AMDGPU::InvalidRegBankID)
3142 return RB1;
3143 if (RB1 == AMDGPU::InvalidRegBankID)
3144 return RB0;
3145
3146 // vcc, vcc -> vcc
3147 // vcc, sgpr -> vcc
3148 // vcc, vgpr -> vcc
3149 if (RB0 == AMDGPU::VCCRegBankID || RB1 == AMDGPU::VCCRegBankID)
3150 return AMDGPU::VCCRegBankID;
3151
3152 // Neither input is vcc here, so fall back to the normal bank union.
3153 return regBankUnion(RB0, RB1);
3154 }
3155
3156 unsigned AMDGPURegisterBankInfo::getMappingType(const MachineRegisterInfo &MRI,
3157 const MachineInstr &MI) const {
3158 unsigned RegBank = AMDGPU::InvalidRegBankID;
3159
3160 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
3161 if (!MI.getOperand(i).isReg())
3162 continue;
3163 Register Reg = MI.getOperand(i).getReg();
3164 if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) {
3165 RegBank = regBankUnion(RegBank, Bank->getID());
3166 if (RegBank == AMDGPU::VGPRRegBankID)
3167 break;
3168 }
3169 }
3170
3171 return RegBank;
3172 }
3173
3174 bool AMDGPURegisterBankInfo::isSALUMapping(const MachineInstr &MI) const {
3175 const MachineFunction &MF = *MI.getParent()->getParent();
3176 const MachineRegisterInfo &MRI = MF.getRegInfo();
3177 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
3178 if (!MI.getOperand(i).isReg())
3179 continue;
3180 Register Reg = MI.getOperand(i).getReg();
3181 if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) {
3182 if (Bank->getID() != AMDGPU::SGPRRegBankID)
3183 return false;
3184 }
3185 }
3186 return true;
3187 }
3188
3189 const RegisterBankInfo::InstructionMapping &
3190 AMDGPURegisterBankInfo::getDefaultMappingSOP(const MachineInstr &MI) const {
3191 const MachineFunction &MF = *MI.getParent()->getParent();
3192 const MachineRegisterInfo &MRI = MF.getRegInfo();
3193 SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
3194
3195 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
3196 const MachineOperand &SrcOp = MI.getOperand(i);
3197 if (!SrcOp.isReg())
3198 continue;
3199
3200 unsigned Size = getSizeInBits(SrcOp.getReg(), MRI, *TRI);
3201 OpdsMapping[i] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
3202 }
3203 return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
3204

const RegisterBankInfo::InstructionMapping &
AMDGPURegisterBankInfo::getDefaultMappingVOP(const MachineInstr &MI) const {
  const MachineFunction &MF = *MI.getParent()->getParent();
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());

  // Even though we technically could use SGPRs, this would require knowledge of
  // the constant bus restriction. Force all sources to VGPR (except for VCC).
  //
  // TODO: Unary ops are trivially OK, so accept SGPRs?
  for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
    const MachineOperand &Src = MI.getOperand(i);
    if (!Src.isReg())
      continue;

    unsigned Size = getSizeInBits(Src.getReg(), MRI, *TRI);
    unsigned BankID = Size == 1 ? AMDGPU::VCCRegBankID : AMDGPU::VGPRRegBankID;
    OpdsMapping[i] = AMDGPU::getValueMapping(BankID, Size);
  }

  return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
                               MI.getNumOperands());
}

const RegisterBankInfo::InstructionMapping &
AMDGPURegisterBankInfo::getDefaultMappingAllVGPR(const MachineInstr &MI) const {
  const MachineFunction &MF = *MI.getParent()->getParent();
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());

  for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
    const MachineOperand &Op = MI.getOperand(I);
    if (!Op.isReg())
      continue;

    unsigned Size = getSizeInBits(Op.getReg(), MRI, *TRI);
    OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
  }

  return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
                               MI.getNumOperands());
}

const RegisterBankInfo::InstructionMapping &
AMDGPURegisterBankInfo::getImageMapping(const MachineRegisterInfo &MRI,
                                        const MachineInstr &MI,
                                        int RsrcIdx) const {
  // The reported argument index is relative to the IR intrinsic call arguments,
  // so we need to shift by the number of defs and the intrinsic ID.
  RsrcIdx += MI.getNumExplicitDefs() + 1;

  const int NumOps = MI.getNumOperands();
  SmallVector<const ValueMapping *, 8> OpdsMapping(NumOps);

  // TODO: Should packed/unpacked D16 difference be reported here as part of
  // the value mapping?
  for (int I = 0; I != NumOps; ++I) {
    if (!MI.getOperand(I).isReg())
      continue;

    Register OpReg = MI.getOperand(I).getReg();
    // We replace some dead address operands with $noreg.
    if (!OpReg)
      continue;

    unsigned Size = getSizeInBits(OpReg, MRI, *TRI);

    // FIXME: Probably need a new intrinsic register bank searchable table to
    // handle arbitrary intrinsics easily.
    //
    // If this has a sampler, it immediately follows rsrc.
    const bool MustBeSGPR = I == RsrcIdx || I == RsrcIdx + 1;

    if (MustBeSGPR) {
      // If this must be an SGPR, we must report whatever it is as legal.
      unsigned NewBank = getRegBankID(OpReg, MRI, AMDGPU::SGPRRegBankID);
      OpdsMapping[I] = AMDGPU::getValueMapping(NewBank, Size);
    } else {
      // Some operands must be VGPR, and these are easy to copy to.
      OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
    }
  }

  return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping), NumOps);
}
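
// Worked example (illustrative): for an image sample instruction,
// getImageMapping reports the rsrc operand (and the sampler immediately
// following it) in whatever bank it already occupies, and every other
// register operand as VGPR; a divergent rsrc is then legalized afterwards by
// the waterfall-loop handling in applyMappingImpl.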

/// Return the mapping for a pointer argument.
const RegisterBankInfo::ValueMapping *
AMDGPURegisterBankInfo::getValueMappingForPtr(const MachineRegisterInfo &MRI,
                                              Register PtrReg) const {
  LLT PtrTy = MRI.getType(PtrReg);
  unsigned Size = PtrTy.getSizeInBits();
  if (Subtarget.useFlatForGlobal() ||
      !AMDGPU::isFlatGlobalAddrSpace(PtrTy.getAddressSpace()))
    return AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);

  // If we're using MUBUF instructions for global memory, an SGPR base register
  // is possible. Otherwise this needs to be a VGPR.
  const RegisterBank *PtrBank = getRegBank(PtrReg, MRI, *TRI);
  return AMDGPU::getValueMapping(PtrBank->getID(), Size);
}

const RegisterBankInfo::InstructionMapping &
AMDGPURegisterBankInfo::getInstrMappingForLoad(const MachineInstr &MI) const {

  const MachineFunction &MF = *MI.getParent()->getParent();
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  SmallVector<const ValueMapping*, 2> OpdsMapping(2);
  unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
  Register PtrReg = MI.getOperand(1).getReg();
  LLT PtrTy = MRI.getType(PtrReg);
  unsigned AS = PtrTy.getAddressSpace();
  unsigned PtrSize = PtrTy.getSizeInBits();

  const ValueMapping *ValMapping;
  const ValueMapping *PtrMapping;

  const RegisterBank *PtrBank = getRegBank(PtrReg, MRI, *TRI);

  if (PtrBank == &AMDGPU::SGPRRegBank && AMDGPU::isFlatGlobalAddrSpace(AS)) {
    if (isScalarLoadLegal(MI)) {
      // We have a uniform instruction, so we want to use an SMRD load.
      ValMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
      PtrMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize);
    } else {
      ValMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);

      // If we're using MUBUF instructions for global memory, an SGPR base
      // register is possible. Otherwise this needs to be a VGPR.
      unsigned PtrBankID = Subtarget.useFlatForGlobal() ?
        AMDGPU::VGPRRegBankID : AMDGPU::SGPRRegBankID;

      PtrMapping = AMDGPU::getValueMapping(PtrBankID, PtrSize);
    }
  } else {
    ValMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
    PtrMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize);
  }

  OpdsMapping[0] = ValMapping;
  OpdsMapping[1] = PtrMapping;
  const RegisterBankInfo::InstructionMapping &Mapping = getInstructionMapping(
      1, 1, getOperandsMapping(OpdsMapping), MI.getNumOperands());
  return Mapping;

  // FIXME: Do we want to add a mapping for FLAT load, or should we just
  // handle that during instruction selection?
}

unsigned
AMDGPURegisterBankInfo::getRegBankID(Register Reg,
                                     const MachineRegisterInfo &MRI,
                                     unsigned Default) const {
  const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
  return Bank ? Bank->getID() : Default;
}

const RegisterBankInfo::ValueMapping *
AMDGPURegisterBankInfo::getSGPROpMapping(Register Reg,
                                         const MachineRegisterInfo &MRI,
                                         const TargetRegisterInfo &TRI) const {
  // Lie and claim anything is legal, even though this needs to be an SGPR.
  // applyMapping will have to deal with it as a waterfall loop.
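  //
  // Rough sketch of that fallback (illustrative, not the literal expansion):
  // applyMappingImpl wraps the user in a loop that reads one active lane's
  // value with V_READFIRSTLANE_B32, masks exec down to the lanes whose value
  // matches it, executes the instruction, and repeats until all lanes have
  // been serviced.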
  unsigned Bank = getRegBankID(Reg, MRI, AMDGPU::SGPRRegBankID);
  unsigned Size = getSizeInBits(Reg, MRI, TRI);
  return AMDGPU::getValueMapping(Bank, Size);
}

const RegisterBankInfo::ValueMapping *
AMDGPURegisterBankInfo::getVGPROpMapping(Register Reg,
                                         const MachineRegisterInfo &MRI,
                                         const TargetRegisterInfo &TRI) const {
  unsigned Size = getSizeInBits(Reg, MRI, TRI);
  return AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
}

const RegisterBankInfo::ValueMapping *
AMDGPURegisterBankInfo::getAGPROpMapping(Register Reg,
                                         const MachineRegisterInfo &MRI,
                                         const TargetRegisterInfo &TRI) const {
  unsigned Size = getSizeInBits(Reg, MRI, TRI);
  return AMDGPU::getValueMapping(AMDGPU::AGPRRegBankID, Size);
}

///
/// This function must return a legal mapping, because
/// AMDGPURegisterBankInfo::getInstrAlternativeMappings() is not called
/// in RegBankSelect::Mode::Fast. Any mapping that would cause a
/// VGPR-to-SGPR copy to be generated is illegal.
///
// Operands that must be SGPRs must accept potentially divergent VGPRs as
// legal. These will be dealt with in applyMappingImpl.
//
const RegisterBankInfo::InstructionMapping &
AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
  const MachineFunction &MF = *MI.getParent()->getParent();
  const MachineRegisterInfo &MRI = MF.getRegInfo();

  if (MI.isCopy() || MI.getOpcode() == AMDGPU::G_FREEZE) {
    // The default logic bothers to analyze impossible alternative mappings. We
    // want the most straightforward mapping, so just directly handle this.
    const RegisterBank *DstBank = getRegBank(MI.getOperand(0).getReg(), MRI,
                                             *TRI);
    const RegisterBank *SrcBank = getRegBank(MI.getOperand(1).getReg(), MRI,
                                             *TRI);
    assert(SrcBank && "src bank should have been assigned already");
    if (!DstBank)
      DstBank = SrcBank;

    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
    if (cannotCopy(*DstBank, *SrcBank, Size))
      return getInvalidInstructionMapping();

    const ValueMapping &ValMap = getValueMapping(0, Size, *DstBank);
    unsigned OpdsMappingSize = MI.isCopy() ? 1 : 2;
    SmallVector<const ValueMapping *, 1> OpdsMapping(OpdsMappingSize);
    OpdsMapping[0] = &ValMap;
    if (MI.getOpcode() == AMDGPU::G_FREEZE)
      OpdsMapping[1] = &ValMap;

    return getInstructionMapping(
        1, /*Cost*/ 1,
        /*OperandsMapping*/ getOperandsMapping(OpdsMapping), OpdsMappingSize);
  }

  if (MI.isRegSequence()) {
    // If any input is a VGPR, the result must be a VGPR. The default handling
    // assumes any copy between banks is legal.
    unsigned BankID = AMDGPU::SGPRRegBankID;

    for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
      auto OpBank = getRegBankID(MI.getOperand(I).getReg(), MRI);
      // It doesn't make sense to use vcc or scc banks here, so just ignore
      // them.
      if (OpBank != AMDGPU::SGPRRegBankID) {
        BankID = AMDGPU::VGPRRegBankID;
        break;
      }
    }
    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);

    const ValueMapping &ValMap = getValueMapping(0, Size, getRegBank(BankID));
    return getInstructionMapping(
        1, /*Cost*/ 1,
        /*OperandsMapping*/ getOperandsMapping({&ValMap}), 1);
  }
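
  // For instance (illustrative): a REG_SEQUENCE combining one vgpr(s32) input
  // with several sgpr(s32) inputs is mapped as a single VGPR result, since
  // the SGPR pieces can always be copied up into VGPRs but not the other way
  // around.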

  // The default handling is broken and doesn't handle illegal VGPR->SGPR
  // copies properly.
  //
  // TODO: There are additional exec masking dependencies to analyze.
  if (MI.getOpcode() == TargetOpcode::G_PHI) {
    unsigned ResultBank = AMDGPU::InvalidRegBankID;
    Register DstReg = MI.getOperand(0).getReg();

    // Sometimes the result may have already been assigned a bank.
    if (const RegisterBank *DstBank = getRegBank(DstReg, MRI, *TRI))
      ResultBank = DstBank->getID();

    for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
      Register Reg = MI.getOperand(I).getReg();
      const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);

      // FIXME: Assuming VGPR for any undetermined inputs.
      if (!Bank || Bank->getID() == AMDGPU::VGPRRegBankID) {
        ResultBank = AMDGPU::VGPRRegBankID;
        break;
      }

      // FIXME: Need to promote SGPR case to s32
      unsigned OpBank = Bank->getID();
      ResultBank = regBankBoolUnion(ResultBank, OpBank);
    }

    assert(ResultBank != AMDGPU::InvalidRegBankID);

    unsigned Size = MRI.getType(DstReg).getSizeInBits();

    const ValueMapping &ValMap =
        getValueMapping(0, Size, getRegBank(ResultBank));
    return getInstructionMapping(
        1, /*Cost*/ 1,
        /*OperandsMapping*/ getOperandsMapping({&ValMap}), 1);
  }

  const RegisterBankInfo::InstructionMapping &Mapping = getInstrMappingImpl(MI);
  if (Mapping.isValid())
    return Mapping;

  SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());

  switch (MI.getOpcode()) {
  default:
    return getInvalidInstructionMapping();

  case AMDGPU::G_AND:
  case AMDGPU::G_OR:
  case AMDGPU::G_XOR: {
    unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    if (Size == 1) {
      const RegisterBank *DstBank
        = getRegBank(MI.getOperand(0).getReg(), MRI, *TRI);

      unsigned TargetBankID = AMDGPU::InvalidRegBankID;
      unsigned BankLHS = AMDGPU::InvalidRegBankID;
      unsigned BankRHS = AMDGPU::InvalidRegBankID;
      if (DstBank) {
        TargetBankID = DstBank->getID();
        if (DstBank == &AMDGPU::VCCRegBank) {
          TargetBankID = AMDGPU::VCCRegBankID;
          BankLHS = AMDGPU::VCCRegBankID;
          BankRHS = AMDGPU::VCCRegBankID;
        } else {
          BankLHS = getRegBankID(MI.getOperand(1).getReg(), MRI,
                                 AMDGPU::SGPRRegBankID);
          BankRHS = getRegBankID(MI.getOperand(2).getReg(), MRI,
                                 AMDGPU::SGPRRegBankID);
        }
      } else {
        BankLHS = getRegBankID(MI.getOperand(1).getReg(), MRI,
                               AMDGPU::VCCRegBankID);
        BankRHS = getRegBankID(MI.getOperand(2).getReg(), MRI,
                               AMDGPU::VCCRegBankID);

        // Both inputs should be true booleans to produce a boolean result.
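        //
        // Spelled out, the cases below implement (illustrative):
        //   vgpr op anything -> vgpr
        //   vcc  op anything -> vcc  (both sources re-reported as vcc)
        //   sgpr op sgpr     -> sgpr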
        if (BankLHS == AMDGPU::VGPRRegBankID ||
            BankRHS == AMDGPU::VGPRRegBankID) {
          TargetBankID = AMDGPU::VGPRRegBankID;
        } else if (BankLHS == AMDGPU::VCCRegBankID ||
                   BankRHS == AMDGPU::VCCRegBankID) {
          TargetBankID = AMDGPU::VCCRegBankID;
          BankLHS = AMDGPU::VCCRegBankID;
          BankRHS = AMDGPU::VCCRegBankID;
        } else if (BankLHS == AMDGPU::SGPRRegBankID &&
                   BankRHS == AMDGPU::SGPRRegBankID) {
          TargetBankID = AMDGPU::SGPRRegBankID;
        }
      }

      OpdsMapping[0] = AMDGPU::getValueMapping(TargetBankID, Size);
      OpdsMapping[1] = AMDGPU::getValueMapping(BankLHS, Size);
      OpdsMapping[2] = AMDGPU::getValueMapping(BankRHS, Size);
      break;
    }

    if (Size == 64) {
      if (isSALUMapping(MI)) {
        OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::SGPRRegBankID, Size);
        OpdsMapping[1] = OpdsMapping[2] = OpdsMapping[0];
      } else {
        OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size);
        unsigned Bank1 = getRegBankID(MI.getOperand(1).getReg(), MRI
                                      /*, DefaultBankID*/);
        OpdsMapping[1] = AMDGPU::getValueMapping(Bank1, Size);

        unsigned Bank2 = getRegBankID(MI.getOperand(2).getReg(), MRI
                                      /*, DefaultBankID*/);
        OpdsMapping[2] = AMDGPU::getValueMapping(Bank2, Size);
      }

      break;
    }

    LLVM_FALLTHROUGH;
  }
  case AMDGPU::G_PTR_ADD:
  case AMDGPU::G_PTRMASK:
  case AMDGPU::G_ADD:
  case AMDGPU::G_SUB:
  case AMDGPU::G_MUL:
  case AMDGPU::G_SHL:
  case AMDGPU::G_LSHR:
  case AMDGPU::G_ASHR:
  case AMDGPU::G_UADDO:
  case AMDGPU::G_USUBO:
  case AMDGPU::G_UADDE:
  case AMDGPU::G_SADDE:
  case AMDGPU::G_USUBE:
  case AMDGPU::G_SSUBE:
  case AMDGPU::G_SMIN:
  case AMDGPU::G_SMAX:
  case AMDGPU::G_UMIN:
  case AMDGPU::G_UMAX:
  case AMDGPU::G_ABS:
  case AMDGPU::G_SHUFFLE_VECTOR:
  case AMDGPU::G_SBFX:
  case AMDGPU::G_UBFX:
    if (isSALUMapping(MI))
      return getDefaultMappingSOP(MI);
    LLVM_FALLTHROUGH;

  case AMDGPU::G_SADDSAT: // FIXME: Could lower sat ops for SALU
  case AMDGPU::G_SSUBSAT:
  case AMDGPU::G_UADDSAT:
  case AMDGPU::G_USUBSAT:
  case AMDGPU::G_FADD:
  case AMDGPU::G_FSUB:
  case AMDGPU::G_FPTOSI:
  case AMDGPU::G_FPTOUI:
  case AMDGPU::G_FMUL:
  case AMDGPU::G_FMA:
  case AMDGPU::G_FMAD:
  case AMDGPU::G_FSQRT:
  case AMDGPU::G_FFLOOR:
  case AMDGPU::G_FCEIL:
  case AMDGPU::G_FRINT:
  case AMDGPU::G_SITOFP:
  case AMDGPU::G_UITOFP:
  case AMDGPU::G_FPTRUNC:
  case AMDGPU::G_FPEXT:
  case AMDGPU::G_FEXP2:
  case AMDGPU::G_FLOG2:
  case AMDGPU::G_FMINNUM:
  case AMDGPU::G_FMAXNUM:
  case AMDGPU::G_FMINNUM_IEEE:
  case AMDGPU::G_FMAXNUM_IEEE:
  case AMDGPU::G_FCANONICALIZE:
  case AMDGPU::G_INTRINSIC_TRUNC:
  case AMDGPU::G_BSWAP: // TODO: Somehow expand for scalar?
  case AMDGPU::G_FSHR: // TODO: Expand for scalar
  case AMDGPU::G_AMDGPU_FFBH_U32:
  case AMDGPU::G_AMDGPU_FMIN_LEGACY:
  case AMDGPU::G_AMDGPU_FMAX_LEGACY:
  case AMDGPU::G_AMDGPU_RCP_IFLAG:
  case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0:
  case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1:
  case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2:
  case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
  case AMDGPU::G_AMDGPU_CVT_PK_I16_I32:
  case AMDGPU::G_AMDGPU_SMED3:
    return getDefaultMappingVOP(MI);
  case AMDGPU::G_UMULH:
  case AMDGPU::G_SMULH: {
    if (Subtarget.hasScalarMulHiInsts() && isSALUMapping(MI))
      return getDefaultMappingSOP(MI);
    return getDefaultMappingVOP(MI);
  }
  case AMDGPU::G_IMPLICIT_DEF: {
    unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
    break;
  }
  case AMDGPU::G_FCONSTANT:
  case AMDGPU::G_CONSTANT:
  case AMDGPU::G_GLOBAL_VALUE:
  case AMDGPU::G_BLOCK_ADDR:
  case AMDGPU::G_READCYCLECOUNTER: {
    unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
    break;
  }
  case AMDGPU::G_FRAME_INDEX: {
    // TODO: This should be the same as other constants, but eliminateFrameIndex
    // currently assumes VALU uses.
    unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
    break;
  }
  case AMDGPU::G_DYN_STACKALLOC: {
    // Result is always uniform, and a wave reduction is needed for the source.
    OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
    unsigned SrcBankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
    OpdsMapping[1] = AMDGPU::getValueMapping(SrcBankID, 32);
    break;
  }
  case AMDGPU::G_INSERT: {
    unsigned BankID = getMappingType(MRI, MI);
    unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
    unsigned SrcSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
    unsigned EltSize = getSizeInBits(MI.getOperand(2).getReg(), MRI, *TRI);
    OpdsMapping[0] = AMDGPU::getValueMapping(BankID, DstSize);
    OpdsMapping[1] = AMDGPU::getValueMapping(BankID, SrcSize);
    OpdsMapping[2] = AMDGPU::getValueMapping(BankID, EltSize);
    OpdsMapping[3] = nullptr;
    break;
  }
  case AMDGPU::G_EXTRACT: {
    unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
    unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
    unsigned SrcSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
    OpdsMapping[0] = AMDGPU::getValueMapping(BankID, DstSize);
    OpdsMapping[1] = AMDGPU::getValueMapping(BankID, SrcSize);
    OpdsMapping[2] = nullptr;
    break;
  }
  case AMDGPU::G_BUILD_VECTOR:
  case AMDGPU::G_BUILD_VECTOR_TRUNC: {
    LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
    if (DstTy == LLT::fixed_vector(2, 16)) {
      unsigned DstSize = DstTy.getSizeInBits();
      unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
      unsigned Src0BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
      unsigned Src1BankID = getRegBankID(MI.getOperand(2).getReg(), MRI);
      unsigned DstBankID = regBankUnion(Src0BankID, Src1BankID);

      OpdsMapping[0] = AMDGPU::getValueMapping(DstBankID, DstSize);
      OpdsMapping[1] = AMDGPU::getValueMapping(Src0BankID, SrcSize);
      OpdsMapping[2] = AMDGPU::getValueMapping(Src1BankID, SrcSize);
      break;
    }

    LLVM_FALLTHROUGH;
  }
  case AMDGPU::G_MERGE_VALUES:
  case AMDGPU::G_CONCAT_VECTORS: {
    unsigned Bank = getMappingType(MRI, MI);
    unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();

    OpdsMapping[0] = AMDGPU::getValueMapping(Bank, DstSize);
    // Op1 and Dst should use the same register bank.
    for (unsigned i = 1, e = MI.getNumOperands(); i != e; ++i)
      OpdsMapping[i] = AMDGPU::getValueMapping(Bank, SrcSize);
    break;
  }
  case AMDGPU::G_BITREVERSE:
  case AMDGPU::G_BITCAST:
  case AMDGPU::G_INTTOPTR:
  case AMDGPU::G_PTRTOINT:
  case AMDGPU::G_FABS:
  case AMDGPU::G_FNEG: {
    unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
    OpdsMapping[0] = OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size);
    break;
  }
  case AMDGPU::G_CTLZ_ZERO_UNDEF:
  case AMDGPU::G_CTTZ_ZERO_UNDEF:
  case AMDGPU::G_CTPOP: {
    unsigned Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
    unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
    OpdsMapping[0] = AMDGPU::getValueMapping(BankID, 32);

    // This should really be getValueMappingSGPR64Only, but allowing the generic
    // code to handle the register split just makes using LegalizerHelper more
    // difficult.
    OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size);
    break;
  }
  case AMDGPU::G_TRUNC: {
    Register Dst = MI.getOperand(0).getReg();
    Register Src = MI.getOperand(1).getReg();
    unsigned Bank = getRegBankID(Src, MRI);
    unsigned DstSize = getSizeInBits(Dst, MRI, *TRI);
    unsigned SrcSize = getSizeInBits(Src, MRI, *TRI);
    OpdsMapping[0] = AMDGPU::getValueMapping(Bank, DstSize);
    OpdsMapping[1] = AMDGPU::getValueMapping(Bank, SrcSize);
    break;
  }
  case AMDGPU::G_ZEXT:
  case AMDGPU::G_SEXT:
  case AMDGPU::G_ANYEXT:
  case AMDGPU::G_SEXT_INREG: {
    Register Dst = MI.getOperand(0).getReg();
    Register Src = MI.getOperand(1).getReg();
    unsigned DstSize = getSizeInBits(Dst, MRI, *TRI);
    unsigned SrcSize = getSizeInBits(Src, MRI, *TRI);

    unsigned DstBank;
    const RegisterBank *SrcBank = getRegBank(Src, MRI, *TRI);
    assert(SrcBank);
    switch (SrcBank->getID()) {
    case AMDGPU::SGPRRegBankID:
      DstBank = AMDGPU::SGPRRegBankID;
      break;
    default:
      DstBank = AMDGPU::VGPRRegBankID;
      break;
    }

    // Scalar extend can use 64-bit BFE, but VGPRs require extending to
    // 32-bits, and then to 64.
    OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(DstBank, DstSize);
    OpdsMapping[1] = AMDGPU::getValueMappingSGPR64Only(SrcBank->getID(),
                                                       SrcSize);
    break;
  }
  case AMDGPU::G_FCMP: {
    unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
    unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI);
    OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
    OpdsMapping[1] = nullptr; // Predicate Operand.
    OpdsMapping[2] = AMDGPU::getValueMapping(Op2Bank, Size);
    OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
    break;
  }
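  // (Note on the G_FCMP case above, illustrative: the result is always a
  // VCC-bank s1, e.g.
  //   %c:vcc(s1) = G_FCMP floatpred(olt), %a:vgpr(s32), %b:vgpr(s32)
  // even when both inputs are uniform; only G_ICMP below can use the scalar
  // SCC form.)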
  case AMDGPU::G_STORE: {
    assert(MI.getOperand(0).isReg());
    unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();

    // FIXME: We need to specify a different reg bank once scalar stores are
    // supported.
    const ValueMapping *ValMapping =
        AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
    OpdsMapping[0] = ValMapping;
    OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg());
    break;
  }
  case AMDGPU::G_ICMP: {
    auto Pred =
        static_cast<CmpInst::Predicate>(MI.getOperand(1).getPredicate());
    unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();

    // See if the result register has already been constrained to vcc, which may
    // happen due to control flow intrinsic lowering.
    unsigned DstBank = getRegBankID(MI.getOperand(0).getReg(), MRI,
                                    AMDGPU::SGPRRegBankID);
    unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI);
    unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI);

    // A scalar compare (setting SCC) is only usable when everything is uniform
    // and the comparison has a scalar form: any 32-bit compare, or a 64-bit
    // eq/ne compare on subtargets that have it.
    bool CanUseSCC = DstBank == AMDGPU::SGPRRegBankID &&
                     Op2Bank == AMDGPU::SGPRRegBankID &&
                     Op3Bank == AMDGPU::SGPRRegBankID &&
                     (Size == 32 ||
                      (Size == 64 &&
                       (Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE) &&
                       Subtarget.hasScalarCompareEq64()));

    DstBank = CanUseSCC ? AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID;
    unsigned SrcBank =
        CanUseSCC ? AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;

    // TODO: Use 32-bit for scalar output size.
    // SCC results will need to be copied to a 32-bit SGPR virtual register.
    const unsigned ResultSize = 1;

    OpdsMapping[0] = AMDGPU::getValueMapping(DstBank, ResultSize);
    OpdsMapping[2] = AMDGPU::getValueMapping(SrcBank, Size);
    OpdsMapping[3] = AMDGPU::getValueMapping(SrcBank, Size);
    break;
  }
  case AMDGPU::G_EXTRACT_VECTOR_ELT: {
    // VGPR index can be used for waterfall when indexing a SGPR vector.
    unsigned SrcBankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
    unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
    unsigned IdxSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
    unsigned IdxBank = getRegBankID(MI.getOperand(2).getReg(), MRI);
    unsigned OutputBankID = regBankUnion(SrcBankID, IdxBank);

    OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(OutputBankID, DstSize);
    OpdsMapping[1] = AMDGPU::getValueMapping(SrcBankID, SrcSize);

    // The index can be in either bank if the source vector is VGPR.
    OpdsMapping[2] = AMDGPU::getValueMapping(IdxBank, IdxSize);
    break;
  }
  case AMDGPU::G_INSERT_VECTOR_ELT: {
    unsigned OutputBankID = isSALUMapping(MI) ?
      AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;

    unsigned VecSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    unsigned InsertSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
    unsigned IdxSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits();
    unsigned InsertEltBankID = getRegBankID(MI.getOperand(2).getReg(), MRI);
    unsigned IdxBankID = getRegBankID(MI.getOperand(3).getReg(), MRI);

    OpdsMapping[0] = AMDGPU::getValueMapping(OutputBankID, VecSize);
    OpdsMapping[1] = AMDGPU::getValueMapping(OutputBankID, VecSize);

    // This is a weird case, because we need to break down the mapping based on
    // the register bank of a different operand.
    if (InsertSize == 64 && OutputBankID == AMDGPU::VGPRRegBankID) {
      OpdsMapping[2] = AMDGPU::getValueMappingSplit64(InsertEltBankID,
                                                      InsertSize);
    } else {
      assert(InsertSize == 32 || InsertSize == 64);
      OpdsMapping[2] = AMDGPU::getValueMapping(InsertEltBankID, InsertSize);
    }

    // The index can be in either bank if the source vector is VGPR.
    OpdsMapping[3] = AMDGPU::getValueMapping(IdxBankID, IdxSize);
    break;
  }
  case AMDGPU::G_UNMERGE_VALUES: {
    unsigned Bank = getMappingType(MRI, MI);

    // Op1 and Dst should use the same register bank.
    // FIXME: Shouldn't this be the default? Why do we need to handle this?
    for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
      unsigned Size = getSizeInBits(MI.getOperand(i).getReg(), MRI, *TRI);
      OpdsMapping[i] = AMDGPU::getValueMapping(Bank, Size);
    }
    break;
  }
  case AMDGPU::G_AMDGPU_BUFFER_LOAD:
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE:
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT:
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT:
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16:
  case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT:
  case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16:
  case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT:
  case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16:
  case AMDGPU::G_AMDGPU_BUFFER_STORE:
  case AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE:
  case AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT:
  case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT:
  case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16: {
    OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);

    // rsrc
    OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);

    // vindex
    OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);

    // voffset
    OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);

    // soffset
    OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);

    // Any remaining operands are immediates and were correctly null
    // initialized.
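    //
    // If the rsrc or soffset value is actually in a VGPR at this point, the
    // getSGPROpMapping calls above still report its current bank as legal;
    // applyMappingImpl is then responsible for legalizing it with a waterfall
    // loop (see the comment in getSGPROpMapping).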
    break;
  }
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX: {
    // vdata_out
    OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);

    // vdata_in
    OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);

    // rsrc
    OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);

    // vindex
    OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);

    // voffset
    OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);

    // soffset
    OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);

    // Any remaining operands are immediates and were correctly null
    // initialized.
    break;
  }
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP: {
    // vdata_out
    OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);

    // vdata_in
    OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);

    // cmp
    OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);

    // rsrc
    OpdsMapping[3] = getSGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);

    // vindex
    OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);

    // voffset
    OpdsMapping[5] = getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);

    // soffset
    OpdsMapping[6] = getSGPROpMapping(MI.getOperand(6).getReg(), MRI, *TRI);

    // Any remaining operands are immediates and were correctly null
    // initialized.
    break;
  }
  case AMDGPU::G_AMDGPU_S_BUFFER_LOAD: {
    // Lie and claim everything is legal, even though some need to be
    // SGPRs. applyMapping will have to deal with it as a waterfall loop.
    OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
    OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);

    // We need to convert this to a MUBUF if either the resource or offset is
    // VGPR.
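    //
    // For example (illustrative): an SGPR rsrc combined with a VGPR offset
    // makes the union below VGPR, so the result is reported in the VGPR bank
    // and the load is handled as a MUBUF access rather than an SMEM one.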
    unsigned RSrcBank = OpdsMapping[1]->BreakDown[0].RegBank->getID();
    unsigned OffsetBank = OpdsMapping[2]->BreakDown[0].RegBank->getID();
    unsigned ResultBank = regBankUnion(RSrcBank, OffsetBank);

    unsigned Size0 = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    OpdsMapping[0] = AMDGPU::getValueMapping(ResultBank, Size0);
    break;
  }
  case AMDGPU::G_INTRINSIC: {
    switch (MI.getIntrinsicID()) {
    default:
      return getInvalidInstructionMapping();
    case Intrinsic::amdgcn_div_fmas:
    case Intrinsic::amdgcn_div_fixup:
    case Intrinsic::amdgcn_trig_preop:
    case Intrinsic::amdgcn_sin:
    case Intrinsic::amdgcn_cos:
    case Intrinsic::amdgcn_log_clamp:
    case Intrinsic::amdgcn_rcp:
    case Intrinsic::amdgcn_rcp_legacy:
    case Intrinsic::amdgcn_sqrt:
    case Intrinsic::amdgcn_rsq:
    case Intrinsic::amdgcn_rsq_legacy:
    case Intrinsic::amdgcn_rsq_clamp:
    case Intrinsic::amdgcn_fmul_legacy:
    case Intrinsic::amdgcn_fma_legacy:
    case Intrinsic::amdgcn_ldexp:
    case Intrinsic::amdgcn_frexp_mant:
    case Intrinsic::amdgcn_frexp_exp:
    case Intrinsic::amdgcn_fract:
    case Intrinsic::amdgcn_cvt_pkrtz:
    case Intrinsic::amdgcn_cvt_pknorm_i16:
    case Intrinsic::amdgcn_cvt_pknorm_u16:
    case Intrinsic::amdgcn_cvt_pk_i16:
    case Intrinsic::amdgcn_cvt_pk_u16:
    case Intrinsic::amdgcn_fmed3:
    case Intrinsic::amdgcn_cubeid:
    case Intrinsic::amdgcn_cubema:
    case Intrinsic::amdgcn_cubesc:
    case Intrinsic::amdgcn_cubetc:
    case Intrinsic::amdgcn_sffbh:
    case Intrinsic::amdgcn_fmad_ftz:
    case Intrinsic::amdgcn_mbcnt_lo:
    case Intrinsic::amdgcn_mbcnt_hi:
    case Intrinsic::amdgcn_mul_u24:
    case Intrinsic::amdgcn_mul_i24:
    case Intrinsic::amdgcn_lerp:
    case Intrinsic::amdgcn_sad_u8:
    case Intrinsic::amdgcn_msad_u8:
    case Intrinsic::amdgcn_sad_hi_u8:
    case Intrinsic::amdgcn_sad_u16:
    case Intrinsic::amdgcn_qsad_pk_u16_u8:
    case Intrinsic::amdgcn_mqsad_pk_u16_u8:
    case Intrinsic::amdgcn_mqsad_u32_u8:
    case Intrinsic::amdgcn_cvt_pk_u8_f32:
    case Intrinsic::amdgcn_alignbit:
    case Intrinsic::amdgcn_alignbyte:
    case Intrinsic::amdgcn_perm:
    case Intrinsic::amdgcn_fdot2:
    case Intrinsic::amdgcn_sdot2:
    case Intrinsic::amdgcn_udot2:
    case Intrinsic::amdgcn_sdot4:
    case Intrinsic::amdgcn_udot4:
    case Intrinsic::amdgcn_sdot8:
    case Intrinsic::amdgcn_udot8:
      return getDefaultMappingVOP(MI);
    case Intrinsic::amdgcn_sbfe:
    case Intrinsic::amdgcn_ubfe:
      if (isSALUMapping(MI))
        return getDefaultMappingSOP(MI);
      return getDefaultMappingVOP(MI);
    case Intrinsic::amdgcn_ds_swizzle:
    case Intrinsic::amdgcn_ds_permute:
    case Intrinsic::amdgcn_ds_bpermute:
    case Intrinsic::amdgcn_update_dpp:
    case Intrinsic::amdgcn_mov_dpp8:
    case Intrinsic::amdgcn_mov_dpp:
    case Intrinsic::amdgcn_strict_wwm:
    case Intrinsic::amdgcn_wwm:
    case Intrinsic::amdgcn_strict_wqm:
    case Intrinsic::amdgcn_wqm:
    case Intrinsic::amdgcn_softwqm:
    case Intrinsic::amdgcn_set_inactive:
      return getDefaultMappingAllVGPR(MI);
    case Intrinsic::amdgcn_kernarg_segment_ptr:
    case Intrinsic::amdgcn_s_getpc:
    case Intrinsic::amdgcn_groupstaticsize:
    case Intrinsic::amdgcn_reloc_constant:
    case Intrinsic::returnaddress: {
      unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
      break;
    }
    case Intrinsic::amdgcn_wqm_vote: {
      unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      OpdsMapping[0] = OpdsMapping[2]
        = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size);
      break;
    }
    case Intrinsic::amdgcn_ps_live: {
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
      break;
    }
    case Intrinsic::amdgcn_div_scale: {
      unsigned Dst0Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      unsigned Dst1Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Dst0Size);
      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Dst1Size);

      unsigned SrcSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits();
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
      OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
      break;
    }
    case Intrinsic::amdgcn_class: {
      Register Src0Reg = MI.getOperand(2).getReg();
      Register Src1Reg = MI.getOperand(3).getReg();
      unsigned Src0Size = MRI.getType(Src0Reg).getSizeInBits();
      unsigned Src1Size = MRI.getType(Src1Reg).getSizeInBits();
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, DstSize);
      OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Src0Size);
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Src1Size);
      break;
    }
    case Intrinsic::amdgcn_icmp:
    case Intrinsic::amdgcn_fcmp: {
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      // This is not VCCRegBank because this is not used in boolean contexts.
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
      unsigned OpSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
      OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, OpSize);
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, OpSize);
      break;
    }
    case Intrinsic::amdgcn_readlane: {
      // This must be an SGPR, but accept a VGPR.
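      // (A VGPR index is expected to be legalized later via
      // constrainOpWithReadfirstlane in applyMappingImpl.)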
      Register IdxReg = MI.getOperand(3).getReg();
      unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits();
      unsigned IdxBank = getRegBankID(IdxReg, MRI, AMDGPU::SGPRRegBankID);
      OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize);
      LLVM_FALLTHROUGH;
    }
    case Intrinsic::amdgcn_readfirstlane: {
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      unsigned SrcSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
      OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
      break;
    }
    case Intrinsic::amdgcn_writelane: {
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      Register SrcReg = MI.getOperand(2).getReg();
      unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits();
      unsigned SrcBank = getRegBankID(SrcReg, MRI, AMDGPU::SGPRRegBankID);
      Register IdxReg = MI.getOperand(3).getReg();
      unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits();
      unsigned IdxBank = getRegBankID(IdxReg, MRI, AMDGPU::SGPRRegBankID);
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);

      // These 2 must be SGPRs, but accept VGPRs. Readfirstlane will be inserted
      // to legalize.
      OpdsMapping[2] = AMDGPU::getValueMapping(SrcBank, SrcSize);
      OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize);
      OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
      break;
    }
    case Intrinsic::amdgcn_if_break: {
      unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
      OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
      break;
    }
    case Intrinsic::amdgcn_permlane16:
    case Intrinsic::amdgcn_permlanex16: {
      unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
      OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
      OpdsMapping[4] = getSGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
      OpdsMapping[5] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_mfma_f32_4x4x1f32:
    case Intrinsic::amdgcn_mfma_f32_4x4x4f16:
    case Intrinsic::amdgcn_mfma_i32_4x4x4i8:
    case Intrinsic::amdgcn_mfma_f32_4x4x2bf16:
    case Intrinsic::amdgcn_mfma_f32_16x16x1f32:
    case Intrinsic::amdgcn_mfma_f32_16x16x4f32:
    case Intrinsic::amdgcn_mfma_f32_16x16x4f16:
    case Intrinsic::amdgcn_mfma_f32_16x16x16f16:
    case Intrinsic::amdgcn_mfma_i32_16x16x4i8:
    case Intrinsic::amdgcn_mfma_i32_16x16x16i8:
    case Intrinsic::amdgcn_mfma_f32_16x16x2bf16:
    case Intrinsic::amdgcn_mfma_f32_16x16x8bf16:
    case Intrinsic::amdgcn_mfma_f32_32x32x1f32:
    case Intrinsic::amdgcn_mfma_f32_32x32x2f32:
    case Intrinsic::amdgcn_mfma_f32_32x32x4f16:
    case Intrinsic::amdgcn_mfma_f32_32x32x8f16:
    case Intrinsic::amdgcn_mfma_i32_32x32x4i8:
    case Intrinsic::amdgcn_mfma_i32_32x32x8i8:
    case Intrinsic::amdgcn_mfma_f32_32x32x2bf16:
    case Intrinsic::amdgcn_mfma_f32_32x32x4bf16:
    case Intrinsic::amdgcn_mfma_f32_32x32x4bf16_1k:
    case Intrinsic::amdgcn_mfma_f32_16x16x4bf16_1k:
    case Intrinsic::amdgcn_mfma_f32_4x4x4bf16_1k:
    case Intrinsic::amdgcn_mfma_f32_32x32x8bf16_1k:
    case Intrinsic::amdgcn_mfma_f32_16x16x16bf16_1k:
    case Intrinsic::amdgcn_mfma_f64_16x16x4f64:
    case Intrinsic::amdgcn_mfma_f64_4x4x4f64: {
      // Default for MAI intrinsics.
      // srcC can also be an immediate which can be folded later.
      // FIXME: Should we eventually add an alternative mapping with AGPR src
      // for srcA/srcB?
      //
      // vdst, srcA, srcB, srcC
      OpdsMapping[0] = getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
      OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
      OpdsMapping[4] = getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_interp_p1:
    case Intrinsic::amdgcn_interp_p2:
    case Intrinsic::amdgcn_interp_mov:
    case Intrinsic::amdgcn_interp_p1_f16:
    case Intrinsic::amdgcn_interp_p2_f16: {
      const int M0Idx = MI.getNumOperands() - 1;
      Register M0Reg = MI.getOperand(M0Idx).getReg();
      unsigned M0Bank = getRegBankID(M0Reg, MRI, AMDGPU::SGPRRegBankID);
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();

      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
      for (int I = 2; I != M0Idx && MI.getOperand(I).isReg(); ++I)
        OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);

      // Must be SGPR, but we must take whatever the original bank is and fix it
      // later.
      OpdsMapping[M0Idx] = AMDGPU::getValueMapping(M0Bank, 32);
      break;
    }
    case Intrinsic::amdgcn_ballot: {
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      unsigned SrcSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
      OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, SrcSize);
      break;
    }
    }
    break;
  }
  case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
  case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE: {
    auto IntrID = MI.getIntrinsicID();
    const AMDGPU::RsrcIntrinsic *RSrcIntrin = AMDGPU::lookupRsrcIntrinsic(IntrID);
    assert(RSrcIntrin && "missing RsrcIntrinsic for image intrinsic");
    // Non-images can have complications from operands that allow both SGPR
    // and VGPR. For now it's too complicated to figure out the final opcode
    // to derive the register bank from the MCInstrDesc.
    assert(RSrcIntrin->IsImage);
    return getImageMapping(MRI, MI, RSrcIntrin->RsrcArg);
  }
  case AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY: {
    unsigned N = MI.getNumExplicitOperands() - 2;
    OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 128);
    OpdsMapping[N] = getSGPROpMapping(MI.getOperand(N).getReg(), MRI, *TRI);
    for (unsigned I = 2; I < N; ++I)
      OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
    break;
  }
  case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: {
    auto IntrID = MI.getIntrinsicID();
    switch (IntrID) {
    case Intrinsic::amdgcn_s_getreg:
    case Intrinsic::amdgcn_s_memtime:
    case Intrinsic::amdgcn_s_memrealtime:
    case Intrinsic::amdgcn_s_get_waveid_in_workgroup: {
      unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
      break;
    }
    case Intrinsic::amdgcn_global_atomic_fadd:
    case Intrinsic::amdgcn_global_atomic_csub:
    case Intrinsic::amdgcn_global_atomic_fmin:
    case Intrinsic::amdgcn_global_atomic_fmax:
    case Intrinsic::amdgcn_flat_atomic_fadd:
    case Intrinsic::amdgcn_flat_atomic_fmin:
    case Intrinsic::amdgcn_flat_atomic_fmax:
      return getDefaultMappingAllVGPR(MI);
    case Intrinsic::amdgcn_ds_ordered_add:
    case Intrinsic::amdgcn_ds_ordered_swap: {
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
      unsigned M0Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
                                     AMDGPU::SGPRRegBankID);
      OpdsMapping[2] = AMDGPU::getValueMapping(M0Bank, 32);
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      break;
    }
    case Intrinsic::amdgcn_ds_append:
    case Intrinsic::amdgcn_ds_consume: {
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_exp_compr:
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      break;
    case Intrinsic::amdgcn_exp:
      // FIXME: Could we support packed types here?
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      OpdsMapping[5] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      OpdsMapping[6] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      break;
    case Intrinsic::amdgcn_s_sendmsg:
    case Intrinsic::amdgcn_s_sendmsghalt: {
      // This must be an SGPR, but accept a VGPR.
      unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
                                   AMDGPU::SGPRRegBankID);
      OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
      break;
    }
    case Intrinsic::amdgcn_s_setreg: {
      // This must be an SGPR, but accept a VGPR.
      unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
                                   AMDGPU::SGPRRegBankID);
      OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
      break;
    }
    case Intrinsic::amdgcn_end_cf: {
      unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
      break;
    }
    case Intrinsic::amdgcn_else: {
      unsigned WaveSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize);
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize);
      break;
    }
    case Intrinsic::amdgcn_live_mask: {
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
      break;
    }
    case Intrinsic::amdgcn_wqm_demote:
    case Intrinsic::amdgcn_kill: {
      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
      break;
    }
    case Intrinsic::amdgcn_raw_buffer_load:
    case Intrinsic::amdgcn_raw_tbuffer_load: {
      // FIXME: Should make intrinsic ID the last operand of the instruction,
      // then this would be the same as store
      OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
      OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_raw_buffer_store:
    case Intrinsic::amdgcn_raw_buffer_store_format:
    case Intrinsic::amdgcn_raw_tbuffer_store: {
      OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
      OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_struct_buffer_load:
    case Intrinsic::amdgcn_struct_tbuffer_load: {
      OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
      OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
      OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_struct_buffer_store:
    case Intrinsic::amdgcn_struct_tbuffer_store: {
      OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
      OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
      OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_init_exec_from_input: {
      unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
      break;
    }
    case Intrinsic::amdgcn_ds_gws_init:
    case Intrinsic::amdgcn_ds_gws_barrier:
    case Intrinsic::amdgcn_ds_gws_sema_br: {
      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);

      // This must be an SGPR, but accept a VGPR.
      unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
                                   AMDGPU::SGPRRegBankID);
      OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
      break;
    }
    case Intrinsic::amdgcn_ds_gws_sema_v:
    case Intrinsic::amdgcn_ds_gws_sema_p:
    case Intrinsic::amdgcn_ds_gws_sema_release_all: {
      // This must be an SGPR, but accept a VGPR.
      unsigned Bank = getRegBankID(MI.getOperand(1).getReg(), MRI,
                                   AMDGPU::SGPRRegBankID);
      OpdsMapping[1] = AMDGPU::getValueMapping(Bank, 32);
      break;
    }
    default:
      return getInvalidInstructionMapping();
    }
    break;
  }
  case AMDGPU::G_SELECT: {
    unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
                                    AMDGPU::SGPRRegBankID);
    unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI,
                                    AMDGPU::SGPRRegBankID);
    bool SGPRSrcs = Op2Bank == AMDGPU::SGPRRegBankID &&
                    Op3Bank == AMDGPU::SGPRRegBankID;

    unsigned CondBankDefault = SGPRSrcs ?
      AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID;
    unsigned CondBank = getRegBankID(MI.getOperand(1).getReg(), MRI,
                                     CondBankDefault);
    if (CondBank == AMDGPU::SGPRRegBankID)
      CondBank = SGPRSrcs ? AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID;
    else if (CondBank == AMDGPU::VGPRRegBankID)
      CondBank = AMDGPU::VCCRegBankID;

    // A vcc condition makes this a VALU select, so the value operands must be
    // VGPRs even if they happen to be uniform.
    unsigned Bank = SGPRSrcs && CondBank == AMDGPU::SGPRRegBankID ?
      AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;

    assert(CondBank == AMDGPU::VCCRegBankID ||
           CondBank == AMDGPU::SGPRRegBankID);

    // TODO: Should report 32-bit for scalar condition type.
    if (Size == 64) {
      OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
      OpdsMapping[1] = AMDGPU::getValueMapping(CondBank, 1);
      OpdsMapping[2] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
      OpdsMapping[3] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
    } else {
      OpdsMapping[0] = AMDGPU::getValueMapping(Bank, Size);
      OpdsMapping[1] = AMDGPU::getValueMapping(CondBank, 1);
      OpdsMapping[2] = AMDGPU::getValueMapping(Bank, Size);
      OpdsMapping[3] = AMDGPU::getValueMapping(Bank, Size);
    }

    break;
  }

  case AMDGPU::G_LOAD:
  case AMDGPU::G_ZEXTLOAD:
  case AMDGPU::G_SEXTLOAD:
    return getInstrMappingForLoad(MI);

  case AMDGPU::G_ATOMICRMW_XCHG:
  case AMDGPU::G_ATOMICRMW_ADD:
  case AMDGPU::G_ATOMICRMW_SUB:
  case AMDGPU::G_ATOMICRMW_AND:
  case AMDGPU::G_ATOMICRMW_OR:
  case AMDGPU::G_ATOMICRMW_XOR:
  case AMDGPU::G_ATOMICRMW_MAX:
  case AMDGPU::G_ATOMICRMW_MIN:
  case AMDGPU::G_ATOMICRMW_UMAX:
  case AMDGPU::G_ATOMICRMW_UMIN:
  case AMDGPU::G_ATOMICRMW_FADD:
  case AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG:
  case AMDGPU::G_AMDGPU_ATOMIC_INC:
  case AMDGPU::G_AMDGPU_ATOMIC_DEC:
  case AMDGPU::G_AMDGPU_ATOMIC_FMIN:
  case AMDGPU::G_AMDGPU_ATOMIC_FMAX: {
    OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
    OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg());
    OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
    break;
  }
  case AMDGPU::G_ATOMIC_CMPXCHG: {
    OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
    OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg());
    OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
    OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
    break;
  }
  case AMDGPU::G_BRCOND: {
    unsigned Bank = getRegBankID(MI.getOperand(0).getReg(), MRI,
                                 AMDGPU::SGPRRegBankID);
    assert(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1);
    if (Bank != AMDGPU::SGPRRegBankID)
      Bank = AMDGPU::VCCRegBankID;

    OpdsMapping[0] = AMDGPU::getValueMapping(Bank, 1);
    break;
  }
  }

  return getInstructionMapping(/*ID*/1, /*Cost*/1,
                               getOperandsMapping(OpdsMapping),
                               MI.getNumOperands());
}