//===- AMDGPURegisterBankInfo.cpp -------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the RegisterBankInfo class for
/// AMDGPU.
///
/// \par
///
/// AMDGPU has unique register bank constraints that require special high level
/// strategies to deal with. There are two main true physical register banks
/// VGPR (vector), and SGPR (scalar). Additionally the VCC register bank is a
/// sort of pseudo-register bank needed to represent SGPRs used in a vector
/// boolean context. There is also the AGPR bank, which is a special purpose
/// physical register bank present on some subtargets.
///
/// Copying from VGPR to SGPR is generally illegal, unless the value is known to
/// be uniform. It is generally not valid to legalize operands by inserting
/// copies as on other targets. Operations which require uniform, SGPR operands
/// generally require scalarization by repeatedly executing the instruction,
/// activating each set of lanes using a unique set of input values. This is
/// referred to as a waterfall loop.
///
/// \par Booleans
///
/// Booleans (s1 values) require special consideration. A vector compare result
/// is naturally a bitmask with one bit per lane, in a 32 or 64-bit
/// register. These are represented with the VCC bank. During selection, we need
/// to be able to unambiguously go back from a register class to a register
/// bank. To distinguish whether an SGPR should use the SGPR or VCC register
/// bank, we need to know the use context type. An SGPR s1 value always means a
/// VCC bank value, otherwise it will be the SGPR bank. A scalar compare sets
/// SCC, which is a 1-bit unaddressable register. This will need to be copied to
/// a 32-bit virtual register. Taken together, this means we need to adjust the
/// type of boolean operations to be regbank legal. All SALU booleans need to be
/// widened to 32-bits, and all VALU booleans need to be s1 values.
///
/// A noteworthy exception to the s1-means-vcc rule is for legalization artifact
/// casts. G_TRUNC s1 results, and G_SEXT/G_ZEXT/G_ANYEXT sources are never vcc
/// bank. A non-boolean source (such as a truncate from a 1-bit load from
/// memory) will require a copy to the VCC bank which will require clearing the
/// high bits and inserting a compare.
///
/// \par Constant bus restriction
///
/// VALU instructions have a limitation known as the constant bus
/// restriction. Most VALU instructions can use SGPR operands, but may read at
/// most 1 SGPR or constant literal value (this is raised to 2 in gfx10 for most
/// instructions). This is one unique SGPR, so the same SGPR may be used for
/// multiple operands. From a register bank perspective, any combination of
/// operands should be legal as an SGPR, but this is contextually dependent on
/// the SGPR operands all being the same register. It is therefore optimal to
/// choose the SGPR with the most uses to minimize the number of copies.
///
/// We avoid trying to solve this problem in RegBankSelect. Any VALU G_*
/// operation should have its source operands all mapped to VGPRs (except for
/// VCC), inserting copies from any SGPR operands. This is the most trivial
/// legal mapping. Anything beyond the simplest 1:1 instruction selection would
/// be too complicated to solve here. Every optimization pattern or instruction
/// selected to multiple outputs would have to enforce this rule, and there
/// would be additional complexity in tracking this rule for every G_*
/// operation. By forcing all inputs to VGPRs, it also simplifies the task of
/// picking the optimal operand combination from a post-isel optimization pass.
///
//===----------------------------------------------------------------------===//

#include "AMDGPURegisterBankInfo.h"

#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUInstrInfo.h"
#include "GCNSubtarget.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/RegisterBank.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"

#define GET_TARGET_REGBANK_IMPL
#include "AMDGPUGenRegisterBank.inc"

// This file will be TableGen'ed at some point.
#include "AMDGPUGenRegisterBankInfo.def"

using namespace llvm;
using namespace MIPatternMatch;

namespace {

// Observer to apply a register bank to new registers created by
// LegalizerHelper.
class ApplyRegBankMapping final : public GISelChangeObserver {
private:
  const AMDGPURegisterBankInfo &RBI;
  MachineRegisterInfo &MRI;
  const RegisterBank *NewBank;
  SmallVector<MachineInstr *, 4> NewInsts;

public:
  ApplyRegBankMapping(const AMDGPURegisterBankInfo &RBI_,
                      MachineRegisterInfo &MRI_, const RegisterBank *RB)
      : RBI(RBI_), MRI(MRI_), NewBank(RB) {}

  ~ApplyRegBankMapping() {
    for (MachineInstr *MI : NewInsts)
      applyBank(*MI);
  }

  /// Set any registers that don't have a set register class or bank to SALU.
  void applyBank(MachineInstr &MI) {
    const unsigned Opc = MI.getOpcode();
    if (Opc == AMDGPU::G_ANYEXT || Opc == AMDGPU::G_ZEXT ||
        Opc == AMDGPU::G_SEXT) {
      // LegalizerHelper wants to use the basic legalization artifacts when
      // widening etc. We don't handle selection with vcc in artifact sources,
      // so we need to use a select instead to handle these properly.
      Register DstReg = MI.getOperand(0).getReg();
      Register SrcReg = MI.getOperand(1).getReg();
      const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, MRI, *RBI.TRI);
      if (SrcBank == &AMDGPU::VCCRegBank) {
        const LLT S32 = LLT::scalar(32);
        assert(MRI.getType(SrcReg) == LLT::scalar(1));
        assert(MRI.getType(DstReg) == S32);
        assert(NewBank == &AMDGPU::VGPRRegBank);

        // Replace the extension with a select, which really uses the boolean
        // source.
        MachineIRBuilder B(MI);
        auto True = B.buildConstant(S32, Opc == AMDGPU::G_SEXT ? -1 : 1);
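        // (G_SEXT of a true s1 value yields all ones, i.e. -1, while
        // G_ZEXT/G_ANYEXT yield 1, hence the constants selected here.)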
        auto False = B.buildConstant(S32, 0);
        B.buildSelect(DstReg, SrcReg, True, False);
        MRI.setRegBank(True.getReg(0), *NewBank);
        MRI.setRegBank(False.getReg(0), *NewBank);
        MI.eraseFromParent();
      }

      assert(!MRI.getRegClassOrRegBank(DstReg));
      MRI.setRegBank(DstReg, *NewBank);
      return;
    }

#ifndef NDEBUG
    if (Opc == AMDGPU::G_TRUNC) {
      Register DstReg = MI.getOperand(0).getReg();
      const RegisterBank *DstBank = RBI.getRegBank(DstReg, MRI, *RBI.TRI);
      assert(DstBank != &AMDGPU::VCCRegBank);
    }
#endif

    for (MachineOperand &Op : MI.operands()) {
      if (!Op.isReg())
        continue;

      // We may see physical registers if building a real MI
      Register Reg = Op.getReg();
      if (Reg.isPhysical() || MRI.getRegClassOrRegBank(Reg))
        continue;

      const RegisterBank *RB = NewBank;
      if (MRI.getType(Reg) == LLT::scalar(1)) {
        assert(NewBank == &AMDGPU::VGPRRegBank &&
               "s1 operands should only be used for vector bools");
        assert((MI.getOpcode() != AMDGPU::G_TRUNC &&
                MI.getOpcode() != AMDGPU::G_ANYEXT) &&
               "not expecting legalization artifacts here");
        RB = &AMDGPU::VCCRegBank;
      }

      MRI.setRegBank(Reg, *RB);
    }
  }

  void erasingInstr(MachineInstr &MI) override {}

  void createdInstr(MachineInstr &MI) override {
    // At this point, the instruction was just inserted and has no operands.
    NewInsts.push_back(&MI);
  }

  void changingInstr(MachineInstr &MI) override {}
  void changedInstr(MachineInstr &MI) override {
    // FIXME: In principle we should probably add the instruction to NewInsts,
    // but the way the LegalizerHelper uses the observer, we will always see the
    // registers we need to set the regbank on also referenced in a new
    // instruction.
  }
};

} // anonymous namespace

AMDGPURegisterBankInfo::AMDGPURegisterBankInfo(const GCNSubtarget &ST)
    : AMDGPUGenRegisterBankInfo(),
      Subtarget(ST),
      TRI(Subtarget.getRegisterInfo()),
      TII(Subtarget.getInstrInfo()) {

  // HACK: Until this is fully tablegen'd.
  static llvm::once_flag InitializeRegisterBankFlag;

  static auto InitializeRegisterBankOnce = [this]() {
    assert(&getRegBank(AMDGPU::SGPRRegBankID) == &AMDGPU::SGPRRegBank &&
           &getRegBank(AMDGPU::VGPRRegBankID) == &AMDGPU::VGPRRegBank &&
           &getRegBank(AMDGPU::AGPRRegBankID) == &AMDGPU::AGPRRegBank);
    (void)this;
  };

  llvm::call_once(InitializeRegisterBankFlag, InitializeRegisterBankOnce);
}

static bool isVectorRegisterBank(const RegisterBank &Bank) {
  unsigned BankID = Bank.getID();
  return BankID == AMDGPU::VGPRRegBankID || BankID == AMDGPU::AGPRRegBankID;
}

unsigned AMDGPURegisterBankInfo::copyCost(const RegisterBank &Dst,
                                          const RegisterBank &Src,
                                          unsigned Size) const {
  // TODO: Should there be a UniformVGPRRegBank which can use readfirstlane?
  if (Dst.getID() == AMDGPU::SGPRRegBankID &&
      (isVectorRegisterBank(Src) || Src.getID() == AMDGPU::VCCRegBankID)) {
    return std::numeric_limits<unsigned>::max();
  }

  // Bool values are tricky, because the meaning is based on context. The SCC
  // and VCC banks are for the natural scalar and vector conditions produced by
  // a compare.
  //
  // Legalization doesn't know about the necessary context, so an s1 use may
  // have been a truncate from an arbitrary value, in which case a copy (lowered
  // as a compare with 0) needs to be inserted.
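  // Report such s1 copies as maximally expensive so the mapping logic never
  // tries to materialize one directly.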
  if (Size == 1 &&
      (Dst.getID() == AMDGPU::SGPRRegBankID) &&
      (isVectorRegisterBank(Src) ||
       Src.getID() == AMDGPU::SGPRRegBankID ||
       Src.getID() == AMDGPU::VCCRegBankID))
    return std::numeric_limits<unsigned>::max();

  // There is no direct copy between AGPRs.
  if (Dst.getID() == AMDGPU::AGPRRegBankID &&
      Src.getID() == AMDGPU::AGPRRegBankID)
    return 4;

  return RegisterBankInfo::copyCost(Dst, Src, Size);
}

unsigned AMDGPURegisterBankInfo::getBreakDownCost(
  const ValueMapping &ValMapping,
  const RegisterBank *CurBank) const {
  // Check if this is a breakdown for G_LOAD to move the pointer from SGPR to
  // VGPR.
  // FIXME: Is there a better way to do this?
  if (ValMapping.NumBreakDowns >= 2 || ValMapping.BreakDown[0].Length >= 64)
    return 10; // This is expensive.

  assert(ValMapping.NumBreakDowns == 2 &&
         ValMapping.BreakDown[0].Length == 32 &&
         ValMapping.BreakDown[0].StartIdx == 0 &&
         ValMapping.BreakDown[1].Length == 32 &&
         ValMapping.BreakDown[1].StartIdx == 32 &&
         ValMapping.BreakDown[0].RegBank == ValMapping.BreakDown[1].RegBank);

  // 32-bit extract of a 64-bit value is just access of a subregister, so free.
  // TODO: Cost of 0 hits assert, though it's not clear it's what we really
  // want.

  // TODO: 32-bit insert to a 64-bit SGPR may incur a non-free copy due to SGPR
  // alignment restrictions, but this probably isn't important.
  return 1;
}

const RegisterBank &
AMDGPURegisterBankInfo::getRegBankFromRegClass(const TargetRegisterClass &RC,
                                               LLT Ty) const {
  if (&RC == &AMDGPU::SReg_1RegClass)
    return AMDGPU::VCCRegBank;

  // We promote real scalar booleans to SReg_32. Any SGPR using s1 is really a
  // VCC-like use.
  if (TRI->isSGPRClass(&RC)) {
    // FIXME: This probably came from a copy from a physical register, which
    // should be inferable from the copied to-type. We don't have many boolean
    // physical register constraints so just assume a normal SGPR for now.
    if (!Ty.isValid())
      return AMDGPU::SGPRRegBank;

    return Ty == LLT::scalar(1) ? AMDGPU::VCCRegBank : AMDGPU::SGPRRegBank;
  }

  return TRI->isAGPRClass(&RC) ? AMDGPU::AGPRRegBank : AMDGPU::VGPRRegBank;
}

template <unsigned NumOps>
RegisterBankInfo::InstructionMappings
AMDGPURegisterBankInfo::addMappingFromTable(
    const MachineInstr &MI, const MachineRegisterInfo &MRI,
    const std::array<unsigned, NumOps> RegSrcOpIdx,
    ArrayRef<OpRegBankEntry<NumOps>> Table) const {

  InstructionMappings AltMappings;

  SmallVector<const ValueMapping *, 10> Operands(MI.getNumOperands());

  unsigned Sizes[NumOps];
  for (unsigned I = 0; I < NumOps; ++I) {
    Register Reg = MI.getOperand(RegSrcOpIdx[I]).getReg();
    Sizes[I] = getSizeInBits(Reg, MRI, *TRI);
  }

  for (unsigned I = 0, E = MI.getNumExplicitDefs(); I != E; ++I) {
    unsigned SizeI = getSizeInBits(MI.getOperand(I).getReg(), MRI, *TRI);
    Operands[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SizeI);
  }

  // getInstrMapping's default mapping uses ID 1, so start at 2.
  unsigned MappingID = 2;
  for (const auto &Entry : Table) {
    for (unsigned I = 0; I < NumOps; ++I) {
      int OpIdx = RegSrcOpIdx[I];
      Operands[OpIdx] = AMDGPU::getValueMapping(Entry.RegBanks[I], Sizes[I]);
    }

    AltMappings.push_back(&getInstructionMapping(MappingID++, Entry.Cost,
                                                 getOperandsMapping(Operands),
                                                 Operands.size()));
  }

  return AltMappings;
}

RegisterBankInfo::InstructionMappings
AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsic(
    const MachineInstr &MI, const MachineRegisterInfo &MRI) const {
  switch (MI.getIntrinsicID()) {
  case Intrinsic::amdgcn_readlane: {
    static const OpRegBankEntry<3> Table[2] = {
      // Perfectly legal.
      { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },

      // Need a readfirstlane for the index.
      { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 }
    };

    const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 2, 3 } };
    return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
  }
  case Intrinsic::amdgcn_writelane: {
    static const OpRegBankEntry<4> Table[4] = {
      // Perfectly legal.
      { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },

      // Need readfirstlane of first op
      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 },

      // Need readfirstlane of second op
      { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 },

      // Need readfirstlane of both ops
      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 3 }
    };

    // dst, value, lane, vdst_in
    const std::array<unsigned, 4> RegSrcOpIdx = { { 0, 2, 3, 4 } };
    return addMappingFromTable<4>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
  }
  default:
    return RegisterBankInfo::getInstrAlternativeMappings(MI);
  }
}

RegisterBankInfo::InstructionMappings
AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsicWSideEffects(
    const MachineInstr &MI, const MachineRegisterInfo &MRI) const {

  switch (MI.getIntrinsicID()) {
  case Intrinsic::amdgcn_s_buffer_load: {
    static const OpRegBankEntry<2> Table[4] = {
      // Perfectly legal.
      { { AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },

      // Only need 1 register in loop
      { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 300 },

      // Have to waterfall the resource.
      { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1000 },

      // Have to waterfall the resource, and the offset.
      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1500 }
    };

    // rsrc, offset
    const std::array<unsigned, 2> RegSrcOpIdx = { { 2, 3 } };
    return addMappingFromTable<2>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
  }
  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap: {
    // VGPR = M0, VGPR
    static const OpRegBankEntry<3> Table[2] = {
      // Perfectly legal.
      { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },

      // Need a readfirstlane for m0
      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 }
    };

    const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 2, 3 } };
    return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
  }
  case Intrinsic::amdgcn_s_sendmsg:
  case Intrinsic::amdgcn_s_sendmsghalt: {
    // FIXME: Should have no register for immediate
    static const OpRegBankEntry<1> Table[2] = {
      // Perfectly legal.
      { { AMDGPU::SGPRRegBankID }, 1 },

      // Need readlane
      { { AMDGPU::VGPRRegBankID }, 3 }
    };

    const std::array<unsigned, 1> RegSrcOpIdx = { { 2 } };
    return addMappingFromTable<1>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
  }
  default:
    return RegisterBankInfo::getInstrAlternativeMappings(MI);
  }
}

static bool memOpHasNoClobbered(const MachineMemOperand *MMO) {
  const Instruction *I = dyn_cast_or_null<Instruction>(MMO->getValue());
  return I && I->getMetadata("amdgpu.noclobber");
}

// FIXME: Returns uniform if there's no source value information. This is
// probably wrong.
static bool isScalarLoadLegal(const MachineInstr &MI) {
  if (!MI.hasOneMemOperand())
    return false;

  const MachineMemOperand *MMO = *MI.memoperands_begin();
  const unsigned AS = MMO->getAddrSpace();
  const bool IsConst = AS == AMDGPUAS::CONSTANT_ADDRESS ||
                       AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT;
  // Require 4-byte alignment.
  return MMO->getAlign() >= Align(4) &&
         // Can't do a scalar atomic load.
         !MMO->isAtomic() &&
         // Don't use scalar loads for volatile accesses to non-constant address
         // spaces.
         (IsConst || !MMO->isVolatile()) &&
         // Memory must be known constant, or not written before this load.
         (IsConst || MMO->isInvariant() || memOpHasNoClobbered(MMO)) &&
         AMDGPUInstrInfo::isUniformMMO(MMO);
}

RegisterBankInfo::InstructionMappings
AMDGPURegisterBankInfo::getInstrAlternativeMappings(
    const MachineInstr &MI) const {

  const MachineFunction &MF = *MI.getParent()->getParent();
  const MachineRegisterInfo &MRI = MF.getRegInfo();

  InstructionMappings AltMappings;
  switch (MI.getOpcode()) {
  case TargetOpcode::G_CONSTANT: {
    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
    if (Size == 1) {
      static const OpRegBankEntry<1> Table[3] = {
        { { AMDGPU::VGPRRegBankID }, 1 },
        { { AMDGPU::SGPRRegBankID }, 1 },
        { { AMDGPU::VCCRegBankID }, 1 }
      };

      return addMappingFromTable<1>(MI, MRI, {{ 0 }}, Table);
    }

    LLVM_FALLTHROUGH;
  }
  case TargetOpcode::G_FCONSTANT:
  case TargetOpcode::G_FRAME_INDEX:
  case TargetOpcode::G_GLOBAL_VALUE: {
    static const OpRegBankEntry<1> Table[2] = {
      { { AMDGPU::VGPRRegBankID }, 1 },
      { { AMDGPU::SGPRRegBankID }, 1 }
    };

    return addMappingFromTable<1>(MI, MRI, {{ 0 }}, Table);
  }
  case TargetOpcode::G_AND:
  case TargetOpcode::G_OR:
  case TargetOpcode::G_XOR: {
    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);

    if (Size == 1) {
      // s_{and|or|xor}_b32 set scc when the result of the 32-bit op is not 0.
      const InstructionMapping &SCCMapping = getInstructionMapping(
        1, 1, getOperandsMapping(
          {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32),
           AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32),
           AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32)}),
        3); // Num Operands
      AltMappings.push_back(&SCCMapping);

      const InstructionMapping &VCCMapping0 = getInstructionMapping(
        2, 1, getOperandsMapping(
          {AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size),
           AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size),
           AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size)}),
        3); // Num Operands
      AltMappings.push_back(&VCCMapping0);
      return AltMappings;
    }

    if (Size != 64)
      break;

    const InstructionMapping &SSMapping = getInstructionMapping(
      1, 1, getOperandsMapping(
        {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
      3); // Num Operands
    AltMappings.push_back(&SSMapping);

    const InstructionMapping &VVMapping = getInstructionMapping(
      2, 2, getOperandsMapping(
        {AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
         AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
         AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}),
      3); // Num Operands
    AltMappings.push_back(&VVMapping);
    break;
  }
  case TargetOpcode::G_LOAD:
  case TargetOpcode::G_ZEXTLOAD:
  case TargetOpcode::G_SEXTLOAD: {
    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
    LLT PtrTy = MRI.getType(MI.getOperand(1).getReg());
    unsigned PtrSize = PtrTy.getSizeInBits();
    unsigned AS = PtrTy.getAddressSpace();

    if ((AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::REGION_ADDRESS &&
         AS != AMDGPUAS::PRIVATE_ADDRESS) &&
        isScalarLoadLegal(MI)) {
      const InstructionMapping &SSMapping = getInstructionMapping(
          1, 1, getOperandsMapping(
                    {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
                     AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize)}),
          2); // Num Operands
      AltMappings.push_back(&SSMapping);
    }

    const InstructionMapping &VVMapping = getInstructionMapping(
        2, 1,
        getOperandsMapping(
            {AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
             AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize)}),
        2); // Num Operands
    AltMappings.push_back(&VVMapping);

    // It may be possible to have a vgpr = load sgpr mapping here, because
    // the mubuf instructions support this kind of load, but probably for only
    // gfx7 and older. However, the addressing mode matching in the instruction
    // selector should be able to do a better job of detecting and selecting
    // these kinds of loads from the vgpr = load vgpr mapping.
    return AltMappings;
  }
  case TargetOpcode::G_SELECT: {
    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
    const InstructionMapping &SSMapping = getInstructionMapping(1, 1,
      getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1),
                          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
      4); // Num Operands
    AltMappings.push_back(&SSMapping);

    const InstructionMapping &VVMapping = getInstructionMapping(2, 1,
      getOperandsMapping({AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
                          AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}),
      4); // Num Operands
    AltMappings.push_back(&VVMapping);

    return AltMappings;
  }
  case TargetOpcode::G_UADDE:
  case TargetOpcode::G_USUBE:
  case TargetOpcode::G_SADDE:
  case TargetOpcode::G_SSUBE: {
    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
    const InstructionMapping &SSMapping = getInstructionMapping(1, 1,
      getOperandsMapping(
        {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1),
         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1)}),
      5); // Num Operands
    AltMappings.push_back(&SSMapping);

    const InstructionMapping &VVMapping = getInstructionMapping(2, 1,
      getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
                          AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1)}),
      5); // Num Operands
    AltMappings.push_back(&VVMapping);
    return AltMappings;
  }
  case AMDGPU::G_BRCOND: {
    assert(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1);

    // TODO: Change type to 32 for scalar
    const InstructionMapping &SMapping = getInstructionMapping(
      1, 1, getOperandsMapping(
        {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1), nullptr}),
      2); // Num Operands
    AltMappings.push_back(&SMapping);

    const InstructionMapping &VMapping = getInstructionMapping(
      1, 1, getOperandsMapping(
        {AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1), nullptr}),
      2); // Num Operands
    AltMappings.push_back(&VMapping);
    return AltMappings;
  }
  case AMDGPU::G_INTRINSIC:
    return getInstrAlternativeMappingsIntrinsic(MI, MRI);
  case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS:
    return getInstrAlternativeMappingsIntrinsicWSideEffects(MI, MRI);
  default:
    break;
  }
  return RegisterBankInfo::getInstrAlternativeMappings(MI);
}

void AMDGPURegisterBankInfo::split64BitValueForMapping(
  MachineIRBuilder &B,
  SmallVector<Register, 2> &Regs,
  LLT HalfTy,
  Register Reg) const {
  assert(HalfTy.getSizeInBits() == 32);
  MachineRegisterInfo *MRI = B.getMRI();
  Register LoLHS = MRI->createGenericVirtualRegister(HalfTy);
  Register HiLHS = MRI->createGenericVirtualRegister(HalfTy);
  const RegisterBank *Bank = getRegBank(Reg, *MRI, *TRI);
  MRI->setRegBank(LoLHS, *Bank);
  MRI->setRegBank(HiLHS, *Bank);

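  // Return the two halves in low, high order; e.g. an s64 value comes back as
  // {lo32, hi32}, produced by the G_UNMERGE_VALUES built below.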
  Regs.push_back(LoLHS);
  Regs.push_back(HiLHS);

  B.buildInstr(AMDGPU::G_UNMERGE_VALUES)
    .addDef(LoLHS)
    .addDef(HiLHS)
    .addUse(Reg);
}

/// Replace the current type each register in \p Regs has with \p NewTy
static void setRegsToType(MachineRegisterInfo &MRI, ArrayRef<Register> Regs,
                          LLT NewTy) {
  for (Register Reg : Regs) {
    assert(MRI.getType(Reg).getSizeInBits() == NewTy.getSizeInBits());
    MRI.setType(Reg, NewTy);
  }
}

static LLT getHalfSizedType(LLT Ty) {
  if (Ty.isVector()) {
    assert(Ty.getElementCount().isKnownMultipleOf(2));
    return LLT::scalarOrVector(Ty.getElementCount().divideCoefficientBy(2),
                               Ty.getElementType());
  }

  assert(Ty.getScalarSizeInBits() % 2 == 0);
  return LLT::scalar(Ty.getScalarSizeInBits() / 2);
}

/// Legalize instruction \p MI where operands in \p OpIndices must be SGPRs. If
/// any of the required SGPR operands are VGPRs, perform a waterfall loop to
/// execute the instruction for each unique combination of values in all lanes
/// in the wave. The block will be split such that the rest of the instructions
/// are moved to a new block.
///
/// Essentially performs this loop:
///
/// Save Execution Mask
/// For (Lane : Wavefront) {
///   Enable Lane, Disable all other lanes
///   SGPR = read SGPR value for current lane from VGPR
///   VGPRResult[Lane] = use_op SGPR
/// }
/// Restore Execution Mask
///
/// There is additional complexity in comparing the values to identify the
/// unique values used.
bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
  MachineIRBuilder &B,
  iterator_range<MachineBasicBlock::iterator> Range,
  SmallSet<Register, 4> &SGPROperandRegs,
  MachineRegisterInfo &MRI) const {

  // Track use registers which have already been expanded with a readfirstlane
  // sequence. This may have multiple uses if moving a sequence.
  DenseMap<Register, Register> WaterfalledRegMap;

  MachineBasicBlock &MBB = B.getMBB();
  MachineFunction *MF = &B.getMF();

  const TargetRegisterClass *WaveRC = TRI->getWaveMaskRegClass();
  const unsigned WaveAndOpc = Subtarget.isWave32() ?
    AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
  const unsigned MovExecOpc =
    Subtarget.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
  const unsigned MovExecTermOpc =
    Subtarget.isWave32() ? AMDGPU::S_MOV_B32_term : AMDGPU::S_MOV_B64_term;

  const unsigned XorTermOpc = Subtarget.isWave32() ?
    AMDGPU::S_XOR_B32_term : AMDGPU::S_XOR_B64_term;
  const unsigned AndSaveExecOpc = Subtarget.isWave32() ?
    AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64;
  const unsigned ExecReg = Subtarget.isWave32() ?
    AMDGPU::EXEC_LO : AMDGPU::EXEC;

#ifndef NDEBUG
  const int OrigRangeSize = std::distance(Range.begin(), Range.end());
#endif

  Register SaveExecReg = MRI.createVirtualRegister(WaveRC);
  Register InitSaveExecReg = MRI.createVirtualRegister(WaveRC);

  // Don't bother using generic instructions/registers for the exec mask.
  B.buildInstr(TargetOpcode::IMPLICIT_DEF)
    .addDef(InitSaveExecReg);

  Register PhiExec = MRI.createVirtualRegister(WaveRC);
  Register NewExec = MRI.createVirtualRegister(WaveRC);

  // To insert the loop we need to split the block. Move everything before this
  // point to a new block, and insert a new empty block before this instruction.
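  // The resulting CFG is sketched below; LoopBB branches back to itself until
  // every unique operand value has been processed:
  //
  //   MBB -> LoopBB -> RestoreExecBB -> RemainderBB
  //            ^    |
  //            +----+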
  MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
  MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
  MachineBasicBlock *RestoreExecBB = MF->CreateMachineBasicBlock();
  MachineFunction::iterator MBBI(MBB);
  ++MBBI;
  MF->insert(MBBI, LoopBB);
  MF->insert(MBBI, RestoreExecBB);
  MF->insert(MBBI, RemainderBB);

  LoopBB->addSuccessor(RestoreExecBB);
  LoopBB->addSuccessor(LoopBB);

  // Move the rest of the block into a new block.
  RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
  RemainderBB->splice(RemainderBB->begin(), &MBB, Range.end(), MBB.end());

  MBB.addSuccessor(LoopBB);
  RestoreExecBB->addSuccessor(RemainderBB);

  B.setInsertPt(*LoopBB, LoopBB->end());

  B.buildInstr(TargetOpcode::PHI)
    .addDef(PhiExec)
    .addReg(InitSaveExecReg)
    .addMBB(&MBB)
    .addReg(NewExec)
    .addMBB(LoopBB);

  const DebugLoc &DL = B.getDL();

  MachineInstr &FirstInst = *Range.begin();

  // Move the instruction into the loop. Note we moved everything after
  // Range.end() already into a new block, so Range.end() is no longer valid.
  LoopBB->splice(LoopBB->end(), &MBB, Range.begin(), MBB.end());

  // Figure out the iterator range after splicing the instructions.
  MachineBasicBlock::iterator NewBegin = FirstInst.getIterator();
  auto NewEnd = LoopBB->end();

  MachineBasicBlock::iterator I = Range.begin();
  B.setInsertPt(*LoopBB, I);

  Register CondReg;

  assert(std::distance(NewBegin, NewEnd) == OrigRangeSize);

  for (MachineInstr &MI : make_range(NewBegin, NewEnd)) {
    for (MachineOperand &Op : MI.uses()) {
      if (!Op.isReg() || Op.isDef())
        continue;

      Register OldReg = Op.getReg();
      if (!SGPROperandRegs.count(OldReg))
        continue;

      // See if we already processed this register in another instruction in
      // the sequence.
      auto OldVal = WaterfalledRegMap.find(OldReg);
      if (OldVal != WaterfalledRegMap.end()) {
        Op.setReg(OldVal->second);
        continue;
      }

      Register OpReg = Op.getReg();
      LLT OpTy = MRI.getType(OpReg);

      const RegisterBank *OpBank = getRegBank(OpReg, MRI, *TRI);
      if (OpBank != &AMDGPU::VGPRRegBank) {
        // Insert copy from AGPR to VGPR before the loop.
        B.setMBB(MBB);
        OpReg = B.buildCopy(OpTy, OpReg).getReg(0);
        MRI.setRegBank(OpReg, AMDGPU::VGPRRegBank);
        B.setInstr(*I);
      }

      unsigned OpSize = OpTy.getSizeInBits();

      // Can only do a readlane of 32-bit pieces.
      if (OpSize == 32) {
        // Avoid extra copies in the simple case of one 32-bit register.
        Register CurrentLaneOpReg
          = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
        MRI.setType(CurrentLaneOpReg, OpTy);

        constrainGenericRegister(OpReg, AMDGPU::VGPR_32RegClass, MRI);
        // Read the next variant <- also loop target.
        BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
                CurrentLaneOpReg)
          .addReg(OpReg);

        Register NewCondReg = MRI.createVirtualRegister(WaveRC);
        bool First = CondReg == AMDGPU::NoRegister;
        if (First)
          CondReg = NewCondReg;

        // Compare the value just read from the first active lane against the
        // value in every lane.
        B.buildInstr(AMDGPU::V_CMP_EQ_U32_e64)
          .addDef(NewCondReg)
          .addReg(CurrentLaneOpReg)
          .addReg(OpReg);
        Op.setReg(CurrentLaneOpReg);

        if (!First) {
          Register AndReg = MRI.createVirtualRegister(WaveRC);

          // If there are multiple operands to consider, AND the conditions
          // together.
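          // (CondReg accumulates the lane-equality mask across all waterfalled
          // operands, so each iteration handles one unique operand tuple.)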
          B.buildInstr(WaveAndOpc)
            .addDef(AndReg)
            .addReg(NewCondReg)
            .addReg(CondReg);
          CondReg = AndReg;
        }
      } else {
        LLT S32 = LLT::scalar(32);
        SmallVector<Register, 8> ReadlanePieces;

        // The compares can be done as 64-bit, but the extract needs to be done
        // in 32-bit pieces.

        bool Is64 = OpSize % 64 == 0;

        unsigned UnmergeTySize = Is64 ? 64 : 32;
        unsigned CmpOp =
            Is64 ? AMDGPU::V_CMP_EQ_U64_e64 : AMDGPU::V_CMP_EQ_U32_e64;

        // Insert the unmerge before the loop.

        B.setMBB(MBB);
        unsigned NumPieces = OpSize / UnmergeTySize;
        SmallVector<Register, 8> UnmergePieces;
        if (NumPieces == 1) {
          UnmergePieces.push_back(OpReg);
        } else {
          LLT UnmergeTy = LLT::scalar(UnmergeTySize);
          MachineInstrBuilder Unmerge = B.buildUnmerge(UnmergeTy, OpReg);
          for (unsigned PieceIdx = 0; PieceIdx != NumPieces; ++PieceIdx)
            UnmergePieces.push_back(Unmerge.getReg(PieceIdx));
        }
        B.setInstr(*I);

        for (Register UnmergePiece : UnmergePieces) {
          Register CurrentLaneOpReg;
          if (Is64) {
            Register CurrentLaneOpRegLo = MRI.createGenericVirtualRegister(S32);
            Register CurrentLaneOpRegHi = MRI.createGenericVirtualRegister(S32);

            MRI.setRegClass(UnmergePiece, &AMDGPU::VReg_64RegClass);
            MRI.setRegClass(CurrentLaneOpRegLo, &AMDGPU::SReg_32_XM0RegClass);
            MRI.setRegClass(CurrentLaneOpRegHi, &AMDGPU::SReg_32_XM0RegClass);

            // Read the next variant <- also loop target.
            BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
                    CurrentLaneOpRegLo)
              .addReg(UnmergePiece, 0, AMDGPU::sub0);

            // Read the next variant <- also loop target.
            BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
                    CurrentLaneOpRegHi)
              .addReg(UnmergePiece, 0, AMDGPU::sub1);

            CurrentLaneOpReg =
              B.buildMerge(LLT::scalar(64),
                           {CurrentLaneOpRegLo, CurrentLaneOpRegHi})
                .getReg(0);

            MRI.setRegClass(CurrentLaneOpReg, &AMDGPU::SReg_64_XEXECRegClass);

            if (OpTy.getScalarSizeInBits() == 64) {
              // If we need to produce a 64-bit element vector, use the
              // merged pieces.
              ReadlanePieces.push_back(CurrentLaneOpReg);
            } else {
              // 32-bit element type.
              ReadlanePieces.push_back(CurrentLaneOpRegLo);
              ReadlanePieces.push_back(CurrentLaneOpRegHi);
            }
          } else {
            CurrentLaneOpReg = MRI.createGenericVirtualRegister(S32);
            MRI.setRegClass(UnmergePiece, &AMDGPU::VGPR_32RegClass);
            MRI.setRegClass(CurrentLaneOpReg, &AMDGPU::SReg_32_XM0RegClass);

            // Read the next variant <- also loop target.
            BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
                    CurrentLaneOpReg)
              .addReg(UnmergePiece);
            ReadlanePieces.push_back(CurrentLaneOpReg);
          }

          Register NewCondReg = MRI.createVirtualRegister(WaveRC);
          bool First = CondReg == AMDGPU::NoRegister;
          if (First)
            CondReg = NewCondReg;

          B.buildInstr(CmpOp)
            .addDef(NewCondReg)
            .addReg(CurrentLaneOpReg)
            .addReg(UnmergePiece);

          if (!First) {
            Register AndReg = MRI.createVirtualRegister(WaveRC);

            // If there are multiple operands to consider, AND the conditions
            // together.
            B.buildInstr(WaveAndOpc)
              .addDef(AndReg)
              .addReg(NewCondReg)
              .addReg(CondReg);
            CondReg = AndReg;
          }
        }

        // FIXME: Build merge seems to switch to CONCAT_VECTORS but not
        // BUILD_VECTOR
        if (OpTy.isVector()) {
          auto Merge = B.buildBuildVector(OpTy, ReadlanePieces);
          Op.setReg(Merge.getReg(0));
          MRI.setRegBank(Op.getReg(), AMDGPU::SGPRRegBank);
        } else if (ReadlanePieces.size() > 1) {
          auto Merge = B.buildMerge(OpTy, ReadlanePieces);
          Op.setReg(Merge.getReg(0));
          MRI.setRegBank(Op.getReg(), AMDGPU::SGPRRegBank);
        } else {
          Op.setReg(ReadlanePieces[0]);
        }
      }

      // Make sure we don't re-process this register again.
      WaterfalledRegMap.insert(std::make_pair(OldReg, Op.getReg()));
    }
  }

  // Update EXEC, save the original EXEC value in NewExec.
  B.buildInstr(AndSaveExecOpc)
    .addDef(NewExec)
    .addReg(CondReg, RegState::Kill);

  MRI.setSimpleHint(NewExec, CondReg);

  B.setInsertPt(*LoopBB, LoopBB->end());

  // Update EXEC, switch all done bits to 0 and all todo bits to 1.
  B.buildInstr(XorTermOpc)
    .addDef(ExecReg)
    .addReg(ExecReg)
    .addReg(NewExec);

  // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
  // s_cbranch_scc0?

  // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
  B.buildInstr(AMDGPU::SI_WATERFALL_LOOP).addMBB(LoopBB);

  // Save the EXEC mask before the loop.
  BuildMI(MBB, MBB.end(), DL, TII->get(MovExecOpc), SaveExecReg)
    .addReg(ExecReg);

  // Restore the EXEC mask after the loop.
  B.setMBB(*RestoreExecBB);
  B.buildInstr(MovExecTermOpc)
    .addDef(ExecReg)
    .addReg(SaveExecReg);

  // Set the insert point after the original instruction, so any new
  // instructions will be in the remainder.
  B.setInsertPt(*RemainderBB, RemainderBB->begin());

  return true;
}

// Return any unique registers used by \p MI at \p OpIndices that need to be
// handled in a waterfall loop. Returns these registers in \p
// SGPROperandRegs. Returns true if there are any operands to handle and a
// waterfall loop is necessary.
bool AMDGPURegisterBankInfo::collectWaterfallOperands(
  SmallSet<Register, 4> &SGPROperandRegs, MachineInstr &MI,
  MachineRegisterInfo &MRI, ArrayRef<unsigned> OpIndices) const {
  for (unsigned Op : OpIndices) {
    assert(MI.getOperand(Op).isUse());
    Register Reg = MI.getOperand(Op).getReg();
    const RegisterBank *OpBank = getRegBank(Reg, MRI, *TRI);
    if (OpBank->getID() != AMDGPU::SGPRRegBankID)
      SGPROperandRegs.insert(Reg);
  }

  // No operands need to be replaced, so no need to loop.
  return !SGPROperandRegs.empty();
}

bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
  MachineIRBuilder &B, MachineInstr &MI, MachineRegisterInfo &MRI,
  ArrayRef<unsigned> OpIndices) const {
  // Use a set to avoid extra readfirstlanes in the case where multiple
  // operands are the same register.
  SmallSet<Register, 4> SGPROperandRegs;

  if (!collectWaterfallOperands(SGPROperandRegs, MI, MRI, OpIndices))
    return false;

  MachineBasicBlock::iterator I = MI.getIterator();
  return executeInWaterfallLoop(B, make_range(I, std::next(I)),
                                SGPROperandRegs, MRI);
}

bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  ArrayRef<unsigned> OpIndices) const {
  MachineIRBuilder B(MI);
  return executeInWaterfallLoop(B, MI, MRI, OpIndices);
}

// Legalize an operand that must be an SGPR by inserting a readfirstlane.
void AMDGPURegisterBankInfo::constrainOpWithReadfirstlane(
    MachineInstr &MI, MachineRegisterInfo &MRI, unsigned OpIdx) const {
  Register Reg = MI.getOperand(OpIdx).getReg();
  const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
  if (Bank == &AMDGPU::SGPRRegBank)
    return;

  LLT Ty = MRI.getType(Reg);
  MachineIRBuilder B(MI);

  if (Bank != &AMDGPU::VGPRRegBank) {
    // We need to copy from AGPR to VGPR
    Reg = B.buildCopy(Ty, Reg).getReg(0);
    MRI.setRegBank(Reg, AMDGPU::VGPRRegBank);
  }

  Register SGPR = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
  B.buildInstr(AMDGPU::V_READFIRSTLANE_B32)
    .addDef(SGPR)
    .addReg(Reg);

  MRI.setType(SGPR, Ty);

  const TargetRegisterClass *Constrained =
      constrainGenericRegister(Reg, AMDGPU::VGPR_32RegClass, MRI);
  (void)Constrained;
  assert(Constrained && "Failed to constrain readfirstlane src reg");

  MI.getOperand(OpIdx).setReg(SGPR);
}

/// Split \p Ty into 2 pieces. The first will have \p FirstSize bits, and the
/// rest will be in the remainder.
static std::pair<LLT, LLT> splitUnequalType(LLT Ty, unsigned FirstSize) {
  unsigned TotalSize = Ty.getSizeInBits();
  if (!Ty.isVector())
    return {LLT::scalar(FirstSize), LLT::scalar(TotalSize - FirstSize)};

  LLT EltTy = Ty.getElementType();
  unsigned EltSize = EltTy.getSizeInBits();
  assert(FirstSize % EltSize == 0);

  unsigned FirstPartNumElts = FirstSize / EltSize;
  unsigned RemainderElts = (TotalSize - FirstSize) / EltSize;

  return {LLT::scalarOrVector(ElementCount::getFixed(FirstPartNumElts), EltTy),
          LLT::scalarOrVector(ElementCount::getFixed(RemainderElts), EltTy)};
}

static LLT widen96To128(LLT Ty) {
  if (!Ty.isVector())
    return LLT::scalar(128);

  LLT EltTy = Ty.getElementType();
  assert(128 % EltTy.getSizeInBits() == 0);
  return LLT::fixed_vector(128 / EltTy.getSizeInBits(), EltTy);
}

bool AMDGPURegisterBankInfo::applyMappingLoad(MachineInstr &MI,
                        const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
                        MachineRegisterInfo &MRI) const {
  Register DstReg = MI.getOperand(0).getReg();
  const LLT LoadTy = MRI.getType(DstReg);
  unsigned LoadSize = LoadTy.getSizeInBits();
  const unsigned MaxNonSmrdLoadSize = 128;

  const RegisterBank *DstBank =
      OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
  if (DstBank == &AMDGPU::SGPRRegBank) {
    // There are some special cases that we need to look at for 32 bit and 96
    // bit SGPR loads; otherwise we have nothing to do.
    if (LoadSize != 32 && LoadSize != 96)
      return false;

    MachineMemOperand *MMO = *MI.memoperands_begin();
    const unsigned MemSize = 8 * MMO->getSize();
    // Scalar loads of size 8 or 16 bit with proper alignment may be widened to
    // 32 bit. Check to see if we need to widen the memory access: 8 or 16 bit
    // scalar loads should have a load size of 32 but a memory access size of
    // less than 32.
    if (LoadSize == 32 &&
        (MemSize == 32 || LoadTy.isVector() || !isScalarLoadLegal(MI)))
      return false;

    Register PtrReg = MI.getOperand(1).getReg();

    ApplyRegBankMapping O(*this, MRI, &AMDGPU::SGPRRegBank);
    MachineIRBuilder B(MI, O);

    if (LoadSize == 32) {
      // This is an extending load from a sub-dword size. Widen the memory
      // access size to 4 bytes and clear the extra high bits appropriately.
      const LLT S32 = LLT::scalar(32);
      if (MI.getOpcode() == AMDGPU::G_SEXTLOAD) {
        // Must extend the sign bit into higher bits for a G_SEXTLOAD
        auto WideLoad = B.buildLoadFromOffset(S32, PtrReg, *MMO, 0);
        B.buildSExtInReg(MI.getOperand(0), WideLoad, MemSize);
      } else if (MI.getOpcode() == AMDGPU::G_ZEXTLOAD) {
        // Must extend zero into higher bits with an AND for a G_ZEXTLOAD
        auto WideLoad = B.buildLoadFromOffset(S32, PtrReg, *MMO, 0);
        B.buildZExtInReg(MI.getOperand(0), WideLoad, MemSize);
      } else
        // We do not need to touch the higher bits for regular loads.
        B.buildLoadFromOffset(MI.getOperand(0), PtrReg, *MMO, 0);
    } else {
      // 96-bit loads are only available for vector loads. We need to split
      // this into a 64-bit part and a 32-bit part (unless we can widen to a
      // 128-bit load).
      if (MMO->getAlign() < Align(16)) {
        MachineFunction *MF = MI.getParent()->getParent();
        ApplyRegBankMapping ApplyBank(*this, MRI, DstBank);
        MachineIRBuilder B(MI, ApplyBank);
        LegalizerHelper Helper(*MF, ApplyBank, B);
        LLT Part64, Part32;
        std::tie(Part64, Part32) = splitUnequalType(LoadTy, 64);
        if (Helper.reduceLoadStoreWidth(cast<GAnyLoad>(MI), 0, Part64) !=
            LegalizerHelper::Legalized)
          return false;
        return true;
      } else {
        LLT WiderTy = widen96To128(LoadTy);
        auto WideLoad = B.buildLoadFromOffset(WiderTy, PtrReg, *MMO, 0);
        if (WiderTy.isScalar())
          B.buildTrunc(MI.getOperand(0), WideLoad);
        else {
          B.buildDeleteTrailingVectorElements(MI.getOperand(0).getReg(),
                                              WideLoad);
        }
      }
    }

    MI.eraseFromParent();
    return true;
  }

  // 128-bit loads are supported for all instruction types.
  if (LoadSize <= MaxNonSmrdLoadSize)
    return false;

  SmallVector<Register, 16> DefRegs(OpdMapper.getVRegs(0));
  SmallVector<Register, 1> SrcRegs(OpdMapper.getVRegs(1));

  if (SrcRegs.empty())
    SrcRegs.push_back(MI.getOperand(1).getReg());

  assert(LoadSize % MaxNonSmrdLoadSize == 0);

  // RegBankSelect only emits scalar types, so we need to reset the pointer
  // operand to a pointer type.
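  // (For example, a 64-bit pointer piece may arrive here typed as s64; it gets
  // its p* pointer type back so the split loads below see a pointer operand.)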
  Register BasePtrReg = SrcRegs[0];
  LLT PtrTy = MRI.getType(MI.getOperand(1).getReg());
  MRI.setType(BasePtrReg, PtrTy);

  unsigned NumSplitParts = LoadTy.getSizeInBits() / MaxNonSmrdLoadSize;
  const LLT LoadSplitTy = LoadTy.divide(NumSplitParts);
  ApplyRegBankMapping Observer(*this, MRI, &AMDGPU::VGPRRegBank);
  MachineIRBuilder B(MI, Observer);
  LegalizerHelper Helper(B.getMF(), Observer, B);

  if (LoadTy.isVector()) {
    if (Helper.fewerElementsVector(MI, 0, LoadSplitTy) !=
        LegalizerHelper::Legalized)
      return false;
  } else {
    if (Helper.narrowScalar(MI, 0, LoadSplitTy) != LegalizerHelper::Legalized)
      return false;
  }

  MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
  return true;
}

bool AMDGPURegisterBankInfo::applyMappingDynStackAlloc(
  MachineInstr &MI,
  const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
  MachineRegisterInfo &MRI) const {
  const MachineFunction &MF = *MI.getMF();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const auto &TFI = *ST.getFrameLowering();

  // Guard in case the stack growth direction ever changes with scratch
  // instructions.
  if (TFI.getStackGrowthDirection() == TargetFrameLowering::StackGrowsDown)
    return false;

  Register Dst = MI.getOperand(0).getReg();
  Register AllocSize = MI.getOperand(1).getReg();
  Align Alignment = assumeAligned(MI.getOperand(2).getImm());

  const RegisterBank *SizeBank = getRegBank(AllocSize, MRI, *TRI);

  // TODO: Need to emit a wave reduction to get the maximum size.
  if (SizeBank != &AMDGPU::SGPRRegBank)
    return false;

  LLT PtrTy = MRI.getType(Dst);
  LLT IntPtrTy = LLT::scalar(PtrTy.getSizeInBits());

  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  Register SPReg = Info->getStackPtrOffsetReg();
  ApplyRegBankMapping ApplyBank(*this, MRI, &AMDGPU::SGPRRegBank);
  MachineIRBuilder B(MI, ApplyBank);

  auto WaveSize = B.buildConstant(LLT::scalar(32), ST.getWavefrontSizeLog2());
  auto ScaledSize = B.buildShl(IntPtrTy, AllocSize, WaveSize);

  auto SPCopy = B.buildCopy(PtrTy, SPReg);
  if (Alignment > TFI.getStackAlign()) {
    auto PtrAdd = B.buildPtrAdd(PtrTy, SPCopy, ScaledSize);
    B.buildMaskLowPtrBits(Dst, PtrAdd,
                          Log2(Alignment) + ST.getWavefrontSizeLog2());
  } else {
    B.buildPtrAdd(Dst, SPCopy, ScaledSize);
  }

  MI.eraseFromParent();
  return true;
}

bool AMDGPURegisterBankInfo::applyMappingImage(
    MachineInstr &MI, const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
    MachineRegisterInfo &MRI, int RsrcIdx) const {
  const int NumDefs = MI.getNumExplicitDefs();

  // The reported argument index is relative to the IR intrinsic call arguments,
  // so we need to shift by the number of defs and the intrinsic ID.
  RsrcIdx += NumDefs + 1;

  // Insert copies to VGPR arguments.
  applyDefaultMapping(OpdMapper);

  // Fixup any SGPR arguments.
  SmallVector<unsigned, 4> SGPRIndexes;
  for (int I = NumDefs, NumOps = MI.getNumOperands(); I != NumOps; ++I) {
    if (!MI.getOperand(I).isReg())
      continue;

    // If this intrinsic has a sampler, it immediately follows rsrc.
    if (I == RsrcIdx || I == RsrcIdx + 1)
      SGPRIndexes.push_back(I);
  }

  executeInWaterfallLoop(MI, MRI, SGPRIndexes);
  return true;
}

static Register getSrcRegIgnoringCopies(const MachineRegisterInfo &MRI,
                                        Register Reg) {
  MachineInstr *Def = getDefIgnoringCopies(Reg, MRI);
  if (!Def)
    return Reg;

  // TODO: Guard against this being an implicit def
  return Def->getOperand(0).getReg();
}

// Analyze a combined offset from an llvm.amdgcn.s.buffer.load intrinsic and
// store the three offsets (voffset, soffset and instoffset).
static unsigned setBufferOffsets(MachineIRBuilder &B,
                                 const AMDGPURegisterBankInfo &RBI,
                                 Register CombinedOffset, Register &VOffsetReg,
                                 Register &SOffsetReg, int64_t &InstOffsetVal,
                                 Align Alignment) {
  const LLT S32 = LLT::scalar(32);
  MachineRegisterInfo *MRI = B.getMRI();

  if (Optional<int64_t> Imm = getIConstantVRegSExtVal(CombinedOffset, *MRI)) {
    uint32_t SOffset, ImmOffset;
    if (AMDGPU::splitMUBUFOffset(*Imm, SOffset, ImmOffset, &RBI.Subtarget,
                                 Alignment)) {
      VOffsetReg = B.buildConstant(S32, 0).getReg(0);
      SOffsetReg = B.buildConstant(S32, SOffset).getReg(0);
      InstOffsetVal = ImmOffset;

      B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank);
      B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank);
      return SOffset + ImmOffset;
    }
  }

  Register Base;
  unsigned Offset;

  std::tie(Base, Offset) =
      AMDGPU::getBaseWithConstantOffset(*MRI, CombinedOffset);

  uint32_t SOffset, ImmOffset;
  if ((int)Offset > 0 && AMDGPU::splitMUBUFOffset(Offset, SOffset, ImmOffset,
                                                  &RBI.Subtarget, Alignment)) {
    if (RBI.getRegBank(Base, *MRI, *RBI.TRI) == &AMDGPU::VGPRRegBank) {
      VOffsetReg = Base;
      SOffsetReg = B.buildConstant(S32, SOffset).getReg(0);
      B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank);
      InstOffsetVal = ImmOffset;
      return 0; // XXX - Why is this 0?
    }

    // If we have an SGPR base, we can use it for soffset.
    if (SOffset == 0) {
      VOffsetReg = B.buildConstant(S32, 0).getReg(0);
      B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank);
      SOffsetReg = Base;
      InstOffsetVal = ImmOffset;
      return 0; // XXX - Why is this 0?
    }
  }

  // Handle the variable sgpr + vgpr case.
  MachineInstr *Add = getOpcodeDef(AMDGPU::G_ADD, CombinedOffset, *MRI);
  if (Add && (int)Offset >= 0) {
    Register Src0 = getSrcRegIgnoringCopies(*MRI, Add->getOperand(1).getReg());
    Register Src1 = getSrcRegIgnoringCopies(*MRI, Add->getOperand(2).getReg());

    const RegisterBank *Src0Bank = RBI.getRegBank(Src0, *MRI, *RBI.TRI);
    const RegisterBank *Src1Bank = RBI.getRegBank(Src1, *MRI, *RBI.TRI);

    if (Src0Bank == &AMDGPU::VGPRRegBank && Src1Bank == &AMDGPU::SGPRRegBank) {
      VOffsetReg = Src0;
      SOffsetReg = Src1;
      return 0;
    }

    if (Src0Bank == &AMDGPU::SGPRRegBank && Src1Bank == &AMDGPU::VGPRRegBank) {
      VOffsetReg = Src1;
      SOffsetReg = Src0;
      return 0;
    }
  }

  // Ensure we have a VGPR for the combined offset. This could be an issue if
  // we have an SGPR offset and a VGPR resource.
  if (RBI.getRegBank(CombinedOffset, *MRI, *RBI.TRI) == &AMDGPU::VGPRRegBank) {
    VOffsetReg = CombinedOffset;
  } else {
    VOffsetReg = B.buildCopy(S32, CombinedOffset).getReg(0);
    B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank);
  }

  SOffsetReg = B.buildConstant(S32, 0).getReg(0);
  B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank);
  return 0;
}

bool AMDGPURegisterBankInfo::applyMappingSBufferLoad(
    const OperandsMapper &OpdMapper) const {
  MachineInstr &MI = OpdMapper.getMI();
  MachineRegisterInfo &MRI = OpdMapper.getMRI();

  const LLT S32 = LLT::scalar(32);
  Register Dst = MI.getOperand(0).getReg();
  LLT Ty = MRI.getType(Dst);

  const RegisterBank *RSrcBank =
      OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
  const RegisterBank *OffsetBank =
      OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
  if (RSrcBank == &AMDGPU::SGPRRegBank &&
      OffsetBank == &AMDGPU::SGPRRegBank)
    return true; // Legal mapping

  // FIXME: 96-bit case was widened during legalize. We need to narrow it back
  // here but don't have an MMO.

  unsigned LoadSize = Ty.getSizeInBits();
  int NumLoads = 1;
  if (LoadSize == 256 || LoadSize == 512) {
    NumLoads = LoadSize / 128;
    Ty = Ty.divide(NumLoads);
  }

  // Use the alignment to ensure that the required offsets will fit into the
  // immediate offsets.
  const Align Alignment = NumLoads > 1 ? Align(16 * NumLoads) : Align(1);

  MachineIRBuilder B(MI);
  MachineFunction &MF = B.getMF();

  Register SOffset;
  Register VOffset;
  int64_t ImmOffset = 0;

  unsigned MMOOffset = setBufferOffsets(B, *this, MI.getOperand(2).getReg(),
                                        VOffset, SOffset, ImmOffset, Alignment);

  // TODO: 96-bit loads were widened to 128-bit results. Shrink the result if
  // we can, but we need to track an MMO for that.
  const unsigned MemSize = (Ty.getSizeInBits() + 7) / 8;
  const Align MemAlign(4); // FIXME: ABI type alignment?
  MachineMemOperand *BaseMMO = MF.getMachineMemOperand(
      MachinePointerInfo(),
      MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
          MachineMemOperand::MOInvariant,
      MemSize, MemAlign);
  if (MMOOffset != 0)
    BaseMMO = MF.getMachineMemOperand(BaseMMO, MMOOffset, MemSize);

  // If only the offset is divergent, emit a MUBUF buffer load instead. We can
  // assume that the buffer is unswizzled.

  Register RSrc = MI.getOperand(1).getReg();
  Register VIndex = B.buildConstant(S32, 0).getReg(0);
  B.getMRI()->setRegBank(VIndex, AMDGPU::VGPRRegBank);

  SmallVector<Register, 4> LoadParts(NumLoads);

  MachineBasicBlock::iterator MII = MI.getIterator();
  MachineInstrSpan Span(MII, &B.getMBB());

  for (int i = 0; i < NumLoads; ++i) {
    if (NumLoads == 1) {
      LoadParts[i] = Dst;
    } else {
      LoadParts[i] = MRI.createGenericVirtualRegister(Ty);
      MRI.setRegBank(LoadParts[i], AMDGPU::VGPRRegBank);
    }

    MachineMemOperand *MMO = BaseMMO;
    if (i != 0)
      BaseMMO = MF.getMachineMemOperand(BaseMMO, MMOOffset + 16 * i, MemSize);

    B.buildInstr(AMDGPU::G_AMDGPU_BUFFER_LOAD)
        .addDef(LoadParts[i])       // vdata
        .addUse(RSrc)               // rsrc
        .addUse(VIndex)             // vindex
        .addUse(VOffset)            // voffset
        .addUse(SOffset)            // soffset
        .addImm(ImmOffset + 16 * i) // offset(imm)
        .addImm(0)                  // cachepolicy, swizzled buffer(imm)
        .addImm(0)                  // idxen(imm)
        .addMemOperand(MMO);
  }

  // TODO: If only the resource is a VGPR, it may be better to execute the
  // scalar load in the waterfall loop if the resource is expected to frequently
  // be dynamically uniform.
  if (RSrcBank != &AMDGPU::SGPRRegBank) {
    // Remove the original instruction to avoid potentially confusing the
    // waterfall loop logic.
    B.setInstr(*Span.begin());
    MI.eraseFromParent();

    SmallSet<Register, 4> OpsToWaterfall;

    OpsToWaterfall.insert(RSrc);
    executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()),
                           OpsToWaterfall, MRI);
  }

  if (NumLoads != 1) {
    if (Ty.isVector())
      B.buildConcatVectors(Dst, LoadParts);
    else
      B.buildMerge(Dst, LoadParts);
  }

  // We removed the instruction earlier with a waterfall loop.
  if (RSrcBank == &AMDGPU::SGPRRegBank)
    MI.eraseFromParent();

  return true;
}

bool AMDGPURegisterBankInfo::applyMappingBFE(const OperandsMapper &OpdMapper,
                                             bool Signed) const {
  MachineInstr &MI = OpdMapper.getMI();
  MachineRegisterInfo &MRI = OpdMapper.getMRI();

  // Insert basic copies
  applyDefaultMapping(OpdMapper);

  Register DstReg = MI.getOperand(0).getReg();
  LLT Ty = MRI.getType(DstReg);

  const LLT S32 = LLT::scalar(32);

  unsigned FirstOpnd = MI.getOpcode() == AMDGPU::G_INTRINSIC ? 2 : 1;
  Register SrcReg = MI.getOperand(FirstOpnd).getReg();
  Register OffsetReg = MI.getOperand(FirstOpnd + 1).getReg();
  Register WidthReg = MI.getOperand(FirstOpnd + 2).getReg();

  const RegisterBank *DstBank =
      OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
  if (DstBank == &AMDGPU::VGPRRegBank) {
    if (Ty == S32)
      return true;

    // There are no 64-bit vgpr bitfield extract instructions, so the operation
    // is expanded to a sequence of instructions that implement the operation.
    ApplyRegBankMapping ApplyBank(*this, MRI, &AMDGPU::VGPRRegBank);
    MachineIRBuilder B(MI, ApplyBank);

    const LLT S64 = LLT::scalar(64);
    // Shift the source operand so that extracted bits start at bit 0.
    auto ShiftOffset = Signed ? B.buildAShr(S64, SrcReg, OffsetReg)
    auto ShiftOffset = Signed ? B.buildAShr(S64, SrcReg, OffsetReg)
                              : B.buildLShr(S64, SrcReg, OffsetReg);
    auto UnmergeSOffset = B.buildUnmerge({S32, S32}, ShiftOffset);

    // A 64-bit bitfield extract uses the 32-bit bitfield extract instructions
    // if the width is a constant.
    if (auto ConstWidth = getIConstantVRegValWithLookThrough(WidthReg, MRI)) {
      // Depending on the width size, use either the low or high 32-bits.
      auto Zero = B.buildConstant(S32, 0);
      auto WidthImm = ConstWidth->Value.getZExtValue();
      if (WidthImm <= 32) {
        // Use bitfield extract on the lower 32-bit source, and then sign-extend
        // or clear the upper 32-bits.
        auto Extract =
            Signed ? B.buildSbfx(S32, UnmergeSOffset.getReg(0), Zero, WidthReg)
                   : B.buildUbfx(S32, UnmergeSOffset.getReg(0), Zero, WidthReg);
        auto Extend =
            Signed ? B.buildAShr(S32, Extract, B.buildConstant(S32, 31)) : Zero;
        B.buildMerge(DstReg, {Extract, Extend});
      } else {
        // Use bitfield extract on upper 32-bit source, and combine with lower
        // 32-bit source.
        auto UpperWidth = B.buildConstant(S32, WidthImm - 32);
        auto Extract =
            Signed
                ? B.buildSbfx(S32, UnmergeSOffset.getReg(1), Zero, UpperWidth)
                : B.buildUbfx(S32, UnmergeSOffset.getReg(1), Zero, UpperWidth);
        B.buildMerge(DstReg, {UnmergeSOffset.getReg(0), Extract});
      }
      MI.eraseFromParent();
      return true;
    }

    // Expand to Src >> Offset << (64 - Width) >> (64 - Width) using 64-bit
    // operations.
    auto ExtShift = B.buildSub(S32, B.buildConstant(S32, 64), WidthReg);
    auto SignBit = B.buildShl(S64, ShiftOffset, ExtShift);
    if (Signed)
      B.buildAShr(S64, SignBit, ExtShift);
    else
      B.buildLShr(S64, SignBit, ExtShift);
    MI.eraseFromParent();
    return true;
  }

  // The scalar form packs the offset and width in a single operand.

  ApplyRegBankMapping ApplyBank(*this, MRI, &AMDGPU::SGPRRegBank);
  MachineIRBuilder B(MI, ApplyBank);

  // Ensure the high bits are clear to insert the offset.
  auto OffsetMask = B.buildConstant(S32, maskTrailingOnes<unsigned>(6));
  auto ClampOffset = B.buildAnd(S32, OffsetReg, OffsetMask);

  // Zeros out the low bits, so don't bother clamping the input value.
  auto ShiftWidth = B.buildShl(S32, WidthReg, B.buildConstant(S32, 16));

  // Transformation function, pack the offset and width of a BFE into
  // the format expected by the S_BFE_I32 / S_BFE_U32. In the second
  // source, bits [5:0] contain the offset and bits [22:16] the width.
  auto MergedInputs = B.buildOr(S32, ClampOffset, ShiftWidth);

  // TODO: It might be worth using a pseudo here to avoid scc clobber and
  // register class constraints.
  unsigned Opc = Ty == S32 ? (Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32) :
                             (Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64);

  auto MIB = B.buildInstr(Opc, {DstReg}, {SrcReg, MergedInputs});
  if (!constrainSelectedInstRegOperands(*MIB, *TII, *TRI, *this))
    llvm_unreachable("failed to constrain BFE");

  MI.eraseFromParent();
  return true;
}

// Return a suitable opcode for extending the operands of Opc when widening.
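// For example, a narrowed G_ASHR or G_SMIN/G_SMAX result depends on the sign
// bits of its inputs, so the operands must be sign-extended (G_SEXT); G_LSHR
// and G_UMIN/G_UMAX only need zero-extended inputs (G_ZEXT); everything else
// here is insensitive to the high bits and can use G_ANYEXT.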
1628 static unsigned getExtendOp(unsigned Opc) { 1629 switch (Opc) { 1630 case TargetOpcode::G_ASHR: 1631 case TargetOpcode::G_SMIN: 1632 case TargetOpcode::G_SMAX: 1633 return TargetOpcode::G_SEXT; 1634 case TargetOpcode::G_LSHR: 1635 case TargetOpcode::G_UMIN: 1636 case TargetOpcode::G_UMAX: 1637 return TargetOpcode::G_ZEXT; 1638 default: 1639 return TargetOpcode::G_ANYEXT; 1640 } 1641 } 1642 1643 // Emit a legalized extension from <2 x s16> to 2 32-bit components, avoiding 1644 // any illegal vector extend or unmerge operations. 1645 static std::pair<Register, Register> 1646 unpackV2S16ToS32(MachineIRBuilder &B, Register Src, unsigned ExtOpcode) { 1647 const LLT S32 = LLT::scalar(32); 1648 auto Bitcast = B.buildBitcast(S32, Src); 1649 1650 if (ExtOpcode == TargetOpcode::G_SEXT) { 1651 auto ExtLo = B.buildSExtInReg(S32, Bitcast, 16); 1652 auto ShiftHi = B.buildAShr(S32, Bitcast, B.buildConstant(S32, 16)); 1653 return std::make_pair(ExtLo.getReg(0), ShiftHi.getReg(0)); 1654 } 1655 1656 auto ShiftHi = B.buildLShr(S32, Bitcast, B.buildConstant(S32, 16)); 1657 if (ExtOpcode == TargetOpcode::G_ZEXT) { 1658 auto ExtLo = B.buildAnd(S32, Bitcast, B.buildConstant(S32, 0xffff)); 1659 return std::make_pair(ExtLo.getReg(0), ShiftHi.getReg(0)); 1660 } 1661 1662 assert(ExtOpcode == TargetOpcode::G_ANYEXT); 1663 return std::make_pair(Bitcast.getReg(0), ShiftHi.getReg(0)); 1664 } 1665 1666 // For cases where only a single copy is inserted for matching register banks. 1667 // Replace the register in the instruction operand 1668 static bool substituteSimpleCopyRegs( 1669 const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper, unsigned OpIdx) { 1670 SmallVector<unsigned, 1> SrcReg(OpdMapper.getVRegs(OpIdx)); 1671 if (!SrcReg.empty()) { 1672 assert(SrcReg.size() == 1); 1673 OpdMapper.getMI().getOperand(OpIdx).setReg(SrcReg[0]); 1674 return true; 1675 } 1676 1677 return false; 1678 } 1679 1680 /// Handle register layout difference for f16 images for some subtargets. 
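/// e.g. (illustrative) on a subtarget with unpacked D16 memory instructions,
/// a <4 x s16> store value is widened to <4 x s32>, one half-word element per
/// 32-bit register, instead of two packed elements per register.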
1681 Register AMDGPURegisterBankInfo::handleD16VData(MachineIRBuilder &B, 1682 MachineRegisterInfo &MRI, 1683 Register Reg) const { 1684 if (!Subtarget.hasUnpackedD16VMem()) 1685 return Reg; 1686 1687 const LLT S16 = LLT::scalar(16); 1688 LLT StoreVT = MRI.getType(Reg); 1689 if (!StoreVT.isVector() || StoreVT.getElementType() != S16) 1690 return Reg; 1691 1692 auto Unmerge = B.buildUnmerge(S16, Reg); 1693 1694 1695 SmallVector<Register, 4> WideRegs; 1696 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 1697 WideRegs.push_back(Unmerge.getReg(I)); 1698 1699 const LLT S32 = LLT::scalar(32); 1700 int NumElts = StoreVT.getNumElements(); 1701 1702 return B.buildMerge(LLT::fixed_vector(NumElts, S32), WideRegs).getReg(0); 1703 } 1704 1705 static std::pair<Register, unsigned> 1706 getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg) { 1707 int64_t Const; 1708 if (mi_match(Reg, MRI, m_ICst(Const))) 1709 return std::make_pair(Register(), Const); 1710 1711 Register Base; 1712 if (mi_match(Reg, MRI, m_GAdd(m_Reg(Base), m_ICst(Const)))) 1713 return std::make_pair(Base, Const); 1714 1715 // TODO: Handle G_OR used for add case 1716 return std::make_pair(Reg, 0); 1717 } 1718 1719 std::pair<Register, unsigned> 1720 AMDGPURegisterBankInfo::splitBufferOffsets(MachineIRBuilder &B, 1721 Register OrigOffset) const { 1722 const unsigned MaxImm = 4095; 1723 Register BaseReg; 1724 unsigned ImmOffset; 1725 const LLT S32 = LLT::scalar(32); 1726 1727 std::tie(BaseReg, ImmOffset) = getBaseWithConstantOffset(*B.getMRI(), 1728 OrigOffset); 1729 1730 unsigned C1 = 0; 1731 if (ImmOffset != 0) { 1732 // If the immediate value is too big for the immoffset field, put the value 1733 // and -4096 into the immoffset field so that the value that is copied/added 1734 // for the voffset field is a multiple of 4096, and it stands more chance 1735 // of being CSEd with the copy/add for another similar load/store. 1736 // However, do not do that rounding down to a multiple of 4096 if that is a 1737 // negative number, as it appears to be illegal to have a negative offset 1738 // in the vgpr, even if adding the immediate offset makes it positive. 1739 unsigned Overflow = ImmOffset & ~MaxImm; 1740 ImmOffset -= Overflow; 1741 if ((int32_t)Overflow < 0) { 1742 Overflow += ImmOffset; 1743 ImmOffset = 0; 1744 } 1745 1746 C1 = ImmOffset; 1747 if (Overflow != 0) { 1748 if (!BaseReg) 1749 BaseReg = B.buildConstant(S32, Overflow).getReg(0); 1750 else { 1751 auto OverflowVal = B.buildConstant(S32, Overflow); 1752 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0); 1753 } 1754 } 1755 } 1756 1757 if (!BaseReg) 1758 BaseReg = B.buildConstant(S32, 0).getReg(0); 1759 1760 return {BaseReg, C1}; 1761 } 1762 1763 bool AMDGPURegisterBankInfo::buildVCopy(MachineIRBuilder &B, Register DstReg, 1764 Register SrcReg) const { 1765 MachineRegisterInfo &MRI = *B.getMRI(); 1766 LLT SrcTy = MRI.getType(SrcReg); 1767 if (SrcTy.getSizeInBits() == 32) { 1768 // Use a v_mov_b32 here to make the exec dependency explicit. 
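    // (A generic COPY could be coalesced or moved without regard to which
    // lanes are currently enabled; V_MOV_B32 is a VALU write, so the
    // dependence on exec stays visible.)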
    B.buildInstr(AMDGPU::V_MOV_B32_e32)
        .addDef(DstReg)
        .addUse(SrcReg);
    return constrainGenericRegister(DstReg, AMDGPU::VGPR_32RegClass, MRI) &&
           constrainGenericRegister(SrcReg, AMDGPU::SReg_32RegClass, MRI);
  }

  Register TmpReg0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  Register TmpReg1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

  B.buildInstr(AMDGPU::V_MOV_B32_e32)
      .addDef(TmpReg0)
      .addUse(SrcReg, 0, AMDGPU::sub0);
  B.buildInstr(AMDGPU::V_MOV_B32_e32)
      .addDef(TmpReg1)
      .addUse(SrcReg, 0, AMDGPU::sub1);
  B.buildInstr(AMDGPU::REG_SEQUENCE)
      .addDef(DstReg)
      .addUse(TmpReg0)
      .addImm(AMDGPU::sub0)
      .addUse(TmpReg1)
      .addImm(AMDGPU::sub1);

  return constrainGenericRegister(SrcReg, AMDGPU::SReg_64RegClass, MRI) &&
         constrainGenericRegister(DstReg, AMDGPU::VReg_64RegClass, MRI);
}

/// Utility function for pushing dynamic vector indexes with a constant offset
/// into waterfall loops.
static void reinsertVectorIndexAdd(MachineIRBuilder &B,
                                   MachineInstr &IdxUseInstr,
                                   unsigned OpIdx,
                                   unsigned ConstOffset) {
  MachineRegisterInfo &MRI = *B.getMRI();
  const LLT S32 = LLT::scalar(32);
  Register WaterfallIdx = IdxUseInstr.getOperand(OpIdx).getReg();
  B.setInsertPt(*IdxUseInstr.getParent(), IdxUseInstr.getIterator());

  auto MaterializedOffset = B.buildConstant(S32, ConstOffset);

  auto Add = B.buildAdd(S32, WaterfallIdx, MaterializedOffset);
  MRI.setRegBank(MaterializedOffset.getReg(0), AMDGPU::SGPRRegBank);
  MRI.setRegBank(Add.getReg(0), AMDGPU::SGPRRegBank);
  IdxUseInstr.getOperand(OpIdx).setReg(Add.getReg(0));
}

/// Implement extending a 32-bit value to a 64-bit value. \p Lo32Reg is the
/// original 32-bit source value (to be inserted in the low part of the combined
/// 64-bit result), and \p Hi32Reg is the high half of the combined 64-bit
/// value.
static void extendLow32IntoHigh32(MachineIRBuilder &B,
                                  Register Hi32Reg, Register Lo32Reg,
                                  unsigned ExtOpc,
                                  const RegisterBank &RegBank,
                                  bool IsBooleanSrc = false) {
  if (ExtOpc == AMDGPU::G_ZEXT) {
    B.buildConstant(Hi32Reg, 0);
  } else if (ExtOpc == AMDGPU::G_SEXT) {
    if (IsBooleanSrc) {
      // If we know the original source was an s1, the high half is the same as
      // the low.
      B.buildCopy(Hi32Reg, Lo32Reg);
    } else {
      // Replicate sign bit from 32-bit extended part.
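      // e.g. sign-extending a negative low half such as 0x80000000: the ashr
      // by 31 produces 0xffffffff for the high half; a non-negative low half
      // produces 0.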
1833 auto ShiftAmt = B.buildConstant(LLT::scalar(32), 31); 1834 B.getMRI()->setRegBank(ShiftAmt.getReg(0), RegBank); 1835 B.buildAShr(Hi32Reg, Lo32Reg, ShiftAmt); 1836 } 1837 } else { 1838 assert(ExtOpc == AMDGPU::G_ANYEXT && "not an integer extension"); 1839 B.buildUndef(Hi32Reg); 1840 } 1841 } 1842 1843 bool AMDGPURegisterBankInfo::foldExtractEltToCmpSelect( 1844 MachineInstr &MI, MachineRegisterInfo &MRI, 1845 const OperandsMapper &OpdMapper) const { 1846 1847 Register VecReg = MI.getOperand(1).getReg(); 1848 Register Idx = MI.getOperand(2).getReg(); 1849 1850 const RegisterBank &IdxBank = 1851 *OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank; 1852 1853 bool IsDivergentIdx = IdxBank != AMDGPU::SGPRRegBank; 1854 1855 LLT VecTy = MRI.getType(VecReg); 1856 unsigned EltSize = VecTy.getScalarSizeInBits(); 1857 unsigned NumElem = VecTy.getNumElements(); 1858 1859 if (!SITargetLowering::shouldExpandVectorDynExt(EltSize, NumElem, 1860 IsDivergentIdx)) 1861 return false; 1862 1863 MachineIRBuilder B(MI); 1864 LLT S32 = LLT::scalar(32); 1865 1866 const RegisterBank &DstBank = 1867 *OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; 1868 const RegisterBank &SrcBank = 1869 *OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank; 1870 1871 const RegisterBank &CCBank = 1872 (DstBank == AMDGPU::SGPRRegBank && 1873 SrcBank == AMDGPU::SGPRRegBank && 1874 IdxBank == AMDGPU::SGPRRegBank) ? AMDGPU::SGPRRegBank 1875 : AMDGPU::VCCRegBank; 1876 LLT CCTy = (CCBank == AMDGPU::SGPRRegBank) ? S32 : LLT::scalar(1); 1877 1878 if (CCBank == AMDGPU::VCCRegBank && IdxBank == AMDGPU::SGPRRegBank) { 1879 Idx = B.buildCopy(S32, Idx)->getOperand(0).getReg(); 1880 MRI.setRegBank(Idx, AMDGPU::VGPRRegBank); 1881 } 1882 1883 LLT EltTy = VecTy.getScalarType(); 1884 SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0)); 1885 unsigned NumLanes = DstRegs.size(); 1886 if (!NumLanes) 1887 NumLanes = 1; 1888 else 1889 EltTy = MRI.getType(DstRegs[0]); 1890 1891 auto UnmergeToEltTy = B.buildUnmerge(EltTy, VecReg); 1892 SmallVector<Register, 2> Res(NumLanes); 1893 for (unsigned L = 0; L < NumLanes; ++L) 1894 Res[L] = UnmergeToEltTy.getReg(L); 1895 1896 for (unsigned I = 1; I < NumElem; ++I) { 1897 auto IC = B.buildConstant(S32, I); 1898 MRI.setRegBank(IC->getOperand(0).getReg(), AMDGPU::SGPRRegBank); 1899 auto Cmp = B.buildICmp(CmpInst::ICMP_EQ, CCTy, Idx, IC); 1900 MRI.setRegBank(Cmp->getOperand(0).getReg(), CCBank); 1901 1902 for (unsigned L = 0; L < NumLanes; ++L) { 1903 auto S = B.buildSelect(EltTy, Cmp, 1904 UnmergeToEltTy.getReg(I * NumLanes + L), Res[L]); 1905 1906 for (unsigned N : { 0, 2, 3 }) 1907 MRI.setRegBank(S->getOperand(N).getReg(), DstBank); 1908 1909 Res[L] = S->getOperand(0).getReg(); 1910 } 1911 } 1912 1913 for (unsigned L = 0; L < NumLanes; ++L) { 1914 Register DstReg = (NumLanes == 1) ? MI.getOperand(0).getReg() : DstRegs[L]; 1915 B.buildCopy(DstReg, Res[L]); 1916 MRI.setRegBank(DstReg, DstBank); 1917 } 1918 1919 MRI.setRegBank(MI.getOperand(0).getReg(), DstBank); 1920 MI.eraseFromParent(); 1921 1922 return true; 1923 } 1924 1925 // Insert a cross regbank copy for a register if it already has a bank that 1926 // differs from the one we want to set. 
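// e.g. (illustrative) if Reg already carries the SGPR bank but a VGPR value
// is needed, the copy is returned and Reg itself is left on SGPR.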
1927 static Register constrainRegToBank(MachineRegisterInfo &MRI, 1928 MachineIRBuilder &B, Register &Reg, 1929 const RegisterBank &Bank) { 1930 const RegisterBank *CurrBank = MRI.getRegBankOrNull(Reg); 1931 if (CurrBank && *CurrBank != Bank) { 1932 Register Copy = B.buildCopy(MRI.getType(Reg), Reg).getReg(0); 1933 MRI.setRegBank(Copy, Bank); 1934 return Copy; 1935 } 1936 1937 MRI.setRegBank(Reg, Bank); 1938 return Reg; 1939 } 1940 1941 bool AMDGPURegisterBankInfo::foldInsertEltToCmpSelect( 1942 MachineInstr &MI, MachineRegisterInfo &MRI, 1943 const OperandsMapper &OpdMapper) const { 1944 1945 Register VecReg = MI.getOperand(1).getReg(); 1946 Register Idx = MI.getOperand(3).getReg(); 1947 1948 const RegisterBank &IdxBank = 1949 *OpdMapper.getInstrMapping().getOperandMapping(3).BreakDown[0].RegBank; 1950 1951 bool IsDivergentIdx = IdxBank != AMDGPU::SGPRRegBank; 1952 1953 LLT VecTy = MRI.getType(VecReg); 1954 unsigned EltSize = VecTy.getScalarSizeInBits(); 1955 unsigned NumElem = VecTy.getNumElements(); 1956 1957 if (!SITargetLowering::shouldExpandVectorDynExt(EltSize, NumElem, 1958 IsDivergentIdx)) 1959 return false; 1960 1961 MachineIRBuilder B(MI); 1962 LLT S32 = LLT::scalar(32); 1963 1964 const RegisterBank &DstBank = 1965 *OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; 1966 const RegisterBank &SrcBank = 1967 *OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank; 1968 const RegisterBank &InsBank = 1969 *OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank; 1970 1971 const RegisterBank &CCBank = 1972 (DstBank == AMDGPU::SGPRRegBank && 1973 SrcBank == AMDGPU::SGPRRegBank && 1974 InsBank == AMDGPU::SGPRRegBank && 1975 IdxBank == AMDGPU::SGPRRegBank) ? AMDGPU::SGPRRegBank 1976 : AMDGPU::VCCRegBank; 1977 LLT CCTy = (CCBank == AMDGPU::SGPRRegBank) ? 
S32 : LLT::scalar(1); 1978 1979 if (CCBank == AMDGPU::VCCRegBank && IdxBank == AMDGPU::SGPRRegBank) { 1980 Idx = B.buildCopy(S32, Idx)->getOperand(0).getReg(); 1981 MRI.setRegBank(Idx, AMDGPU::VGPRRegBank); 1982 } 1983 1984 LLT EltTy = VecTy.getScalarType(); 1985 SmallVector<Register, 2> InsRegs(OpdMapper.getVRegs(2)); 1986 unsigned NumLanes = InsRegs.size(); 1987 if (!NumLanes) { 1988 NumLanes = 1; 1989 InsRegs.push_back(MI.getOperand(2).getReg()); 1990 } else { 1991 EltTy = MRI.getType(InsRegs[0]); 1992 } 1993 1994 auto UnmergeToEltTy = B.buildUnmerge(EltTy, VecReg); 1995 SmallVector<Register, 16> Ops(NumElem * NumLanes); 1996 1997 for (unsigned I = 0; I < NumElem; ++I) { 1998 auto IC = B.buildConstant(S32, I); 1999 MRI.setRegBank(IC->getOperand(0).getReg(), AMDGPU::SGPRRegBank); 2000 auto Cmp = B.buildICmp(CmpInst::ICMP_EQ, CCTy, Idx, IC); 2001 MRI.setRegBank(Cmp->getOperand(0).getReg(), CCBank); 2002 2003 for (unsigned L = 0; L < NumLanes; ++L) { 2004 Register Op0 = constrainRegToBank(MRI, B, InsRegs[L], DstBank); 2005 Register Op1 = UnmergeToEltTy.getReg(I * NumLanes + L); 2006 Op1 = constrainRegToBank(MRI, B, Op1, DstBank); 2007 2008 Register Select = B.buildSelect(EltTy, Cmp, Op0, Op1).getReg(0); 2009 MRI.setRegBank(Select, DstBank); 2010 2011 Ops[I * NumLanes + L] = Select; 2012 } 2013 } 2014 2015 LLT MergeTy = LLT::fixed_vector(Ops.size(), EltTy); 2016 if (MergeTy == MRI.getType(MI.getOperand(0).getReg())) { 2017 B.buildBuildVector(MI.getOperand(0), Ops); 2018 } else { 2019 auto Vec = B.buildBuildVector(MergeTy, Ops); 2020 MRI.setRegBank(Vec->getOperand(0).getReg(), DstBank); 2021 B.buildBitcast(MI.getOperand(0).getReg(), Vec); 2022 } 2023 2024 MRI.setRegBank(MI.getOperand(0).getReg(), DstBank); 2025 MI.eraseFromParent(); 2026 2027 return true; 2028 } 2029 2030 void AMDGPURegisterBankInfo::applyMappingImpl( 2031 const OperandsMapper &OpdMapper) const { 2032 MachineInstr &MI = OpdMapper.getMI(); 2033 unsigned Opc = MI.getOpcode(); 2034 MachineRegisterInfo &MRI = OpdMapper.getMRI(); 2035 switch (Opc) { 2036 case AMDGPU::G_PHI: { 2037 Register DstReg = MI.getOperand(0).getReg(); 2038 LLT DstTy = MRI.getType(DstReg); 2039 if (DstTy != LLT::scalar(1)) 2040 break; 2041 2042 const LLT S32 = LLT::scalar(32); 2043 const RegisterBank *DstBank = 2044 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; 2045 if (DstBank == &AMDGPU::VCCRegBank) { 2046 applyDefaultMapping(OpdMapper); 2047 // The standard handling only considers the result register bank for 2048 // phis. For VCC, blindly inserting a copy when the phi is lowered will 2049 // produce an invalid copy. We can only copy with some kind of compare to 2050 // get a vector boolean result. Insert a register bank copy that will be 2051 // correctly lowered to a compare. 2052 MachineIRBuilder B(*MI.getParent()->getParent()); 2053 2054 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) { 2055 Register SrcReg = MI.getOperand(I).getReg(); 2056 const RegisterBank *SrcBank = getRegBank(SrcReg, MRI, *TRI); 2057 2058 if (SrcBank != &AMDGPU::VCCRegBank) { 2059 MachineBasicBlock *SrcMBB = MI.getOperand(I + 1).getMBB(); 2060 B.setInsertPt(*SrcMBB, SrcMBB->getFirstTerminator()); 2061 2062 auto Copy = B.buildCopy(LLT::scalar(1), SrcReg); 2063 MRI.setRegBank(Copy.getReg(0), AMDGPU::VCCRegBank); 2064 MI.getOperand(I).setReg(Copy.getReg(0)); 2065 } 2066 } 2067 2068 return; 2069 } 2070 2071 // Phi handling is strange and only considers the bank of the destination. 
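    // The s1 phi is widened to s32 below, roughly (illustrative; names are
    // hypothetical):
    //   %r:sgpr(s1) = G_PHI %a(s1), %bb0, %b(s1), %bb1
    // becomes
    //   %r32:sgpr(s32) = G_PHI %a32(s32), %bb0, %b32(s32), %bb1
    // with the incoming values extended and the result truncated back to s1
    // by the widening.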
2072 substituteSimpleCopyRegs(OpdMapper, 0); 2073 2074 // Promote SGPR/VGPR booleans to s32 2075 MachineFunction *MF = MI.getParent()->getParent(); 2076 ApplyRegBankMapping ApplyBank(*this, MRI, DstBank); 2077 MachineIRBuilder B(MI, ApplyBank); 2078 LegalizerHelper Helper(*MF, ApplyBank, B); 2079 2080 if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized) 2081 llvm_unreachable("widen scalar should have succeeded"); 2082 2083 return; 2084 } 2085 case AMDGPU::G_ICMP: 2086 case AMDGPU::G_UADDO: 2087 case AMDGPU::G_USUBO: 2088 case AMDGPU::G_UADDE: 2089 case AMDGPU::G_SADDE: 2090 case AMDGPU::G_USUBE: 2091 case AMDGPU::G_SSUBE: { 2092 unsigned BoolDstOp = Opc == AMDGPU::G_ICMP ? 0 : 1; 2093 Register DstReg = MI.getOperand(BoolDstOp).getReg(); 2094 2095 const RegisterBank *DstBank = 2096 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; 2097 if (DstBank != &AMDGPU::SGPRRegBank) 2098 break; 2099 2100 const bool HasCarryIn = MI.getNumOperands() == 5; 2101 2102 // If this is a scalar compare, promote the result to s32, as the selection 2103 // will end up using a copy to a 32-bit vreg. 2104 const LLT S32 = LLT::scalar(32); 2105 Register NewDstReg = MRI.createGenericVirtualRegister(S32); 2106 MRI.setRegBank(NewDstReg, AMDGPU::SGPRRegBank); 2107 MI.getOperand(BoolDstOp).setReg(NewDstReg); 2108 MachineIRBuilder B(MI); 2109 2110 if (HasCarryIn) { 2111 Register NewSrcReg = MRI.createGenericVirtualRegister(S32); 2112 MRI.setRegBank(NewSrcReg, AMDGPU::SGPRRegBank); 2113 B.buildZExt(NewSrcReg, MI.getOperand(4).getReg()); 2114 MI.getOperand(4).setReg(NewSrcReg); 2115 } 2116 2117 MachineBasicBlock *MBB = MI.getParent(); 2118 B.setInsertPt(*MBB, std::next(MI.getIterator())); 2119 2120 // If we had a constrained VCC result register, a copy was inserted to VCC 2121 // from SGPR. 2122 SmallVector<Register, 1> DefRegs(OpdMapper.getVRegs(0)); 2123 if (DefRegs.empty()) 2124 DefRegs.push_back(DstReg); 2125 B.buildTrunc(DefRegs[0], NewDstReg); 2126 return; 2127 } 2128 case AMDGPU::G_SELECT: { 2129 Register DstReg = MI.getOperand(0).getReg(); 2130 LLT DstTy = MRI.getType(DstReg); 2131 2132 SmallVector<Register, 1> CondRegs(OpdMapper.getVRegs(1)); 2133 if (CondRegs.empty()) 2134 CondRegs.push_back(MI.getOperand(1).getReg()); 2135 else { 2136 assert(CondRegs.size() == 1); 2137 } 2138 2139 const RegisterBank *CondBank = getRegBank(CondRegs[0], MRI, *TRI); 2140 if (CondBank == &AMDGPU::SGPRRegBank) { 2141 MachineIRBuilder B(MI); 2142 const LLT S32 = LLT::scalar(32); 2143 Register NewCondReg = MRI.createGenericVirtualRegister(S32); 2144 MRI.setRegBank(NewCondReg, AMDGPU::SGPRRegBank); 2145 2146 MI.getOperand(1).setReg(NewCondReg); 2147 B.buildZExt(NewCondReg, CondRegs[0]); 2148 } 2149 2150 if (DstTy.getSizeInBits() != 64) 2151 break; 2152 2153 MachineIRBuilder B(MI); 2154 LLT HalfTy = getHalfSizedType(DstTy); 2155 2156 SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0)); 2157 SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2)); 2158 SmallVector<Register, 2> Src2Regs(OpdMapper.getVRegs(3)); 2159 2160 // All inputs are SGPRs, nothing special to do. 
2161 if (DefRegs.empty()) { 2162 assert(Src1Regs.empty() && Src2Regs.empty()); 2163 break; 2164 } 2165 2166 if (Src1Regs.empty()) 2167 split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg()); 2168 else { 2169 setRegsToType(MRI, Src1Regs, HalfTy); 2170 } 2171 2172 if (Src2Regs.empty()) 2173 split64BitValueForMapping(B, Src2Regs, HalfTy, MI.getOperand(3).getReg()); 2174 else 2175 setRegsToType(MRI, Src2Regs, HalfTy); 2176 2177 setRegsToType(MRI, DefRegs, HalfTy); 2178 2179 B.buildSelect(DefRegs[0], CondRegs[0], Src1Regs[0], Src2Regs[0]); 2180 B.buildSelect(DefRegs[1], CondRegs[0], Src1Regs[1], Src2Regs[1]); 2181 2182 MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank); 2183 MI.eraseFromParent(); 2184 return; 2185 } 2186 case AMDGPU::G_BRCOND: { 2187 Register CondReg = MI.getOperand(0).getReg(); 2188 // FIXME: Should use legalizer helper, but should change bool ext type. 2189 const RegisterBank *CondBank = 2190 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; 2191 2192 if (CondBank == &AMDGPU::SGPRRegBank) { 2193 MachineIRBuilder B(MI); 2194 const LLT S32 = LLT::scalar(32); 2195 Register NewCondReg = MRI.createGenericVirtualRegister(S32); 2196 MRI.setRegBank(NewCondReg, AMDGPU::SGPRRegBank); 2197 2198 MI.getOperand(0).setReg(NewCondReg); 2199 B.buildZExt(NewCondReg, CondReg); 2200 return; 2201 } 2202 2203 break; 2204 } 2205 case AMDGPU::G_AND: 2206 case AMDGPU::G_OR: 2207 case AMDGPU::G_XOR: { 2208 // 64-bit and is only available on the SALU, so split into 2 32-bit ops if 2209 // there is a VGPR input. 2210 Register DstReg = MI.getOperand(0).getReg(); 2211 LLT DstTy = MRI.getType(DstReg); 2212 2213 if (DstTy.getSizeInBits() == 1) { 2214 const RegisterBank *DstBank = 2215 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; 2216 if (DstBank == &AMDGPU::VCCRegBank) 2217 break; 2218 2219 MachineFunction *MF = MI.getParent()->getParent(); 2220 ApplyRegBankMapping ApplyBank(*this, MRI, DstBank); 2221 MachineIRBuilder B(MI, ApplyBank); 2222 LegalizerHelper Helper(*MF, ApplyBank, B); 2223 2224 if (Helper.widenScalar(MI, 0, LLT::scalar(32)) != 2225 LegalizerHelper::Legalized) 2226 llvm_unreachable("widen scalar should have succeeded"); 2227 return; 2228 } 2229 2230 if (DstTy.getSizeInBits() != 64) 2231 break; 2232 2233 LLT HalfTy = getHalfSizedType(DstTy); 2234 SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0)); 2235 SmallVector<Register, 2> Src0Regs(OpdMapper.getVRegs(1)); 2236 SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2)); 2237 2238 // All inputs are SGPRs, nothing special to do. 2239 if (DefRegs.empty()) { 2240 assert(Src0Regs.empty() && Src1Regs.empty()); 2241 break; 2242 } 2243 2244 assert(DefRegs.size() == 2); 2245 assert(Src0Regs.size() == Src1Regs.size() && 2246 (Src0Regs.empty() || Src0Regs.size() == 2)); 2247 2248 // Depending on where the source registers came from, the generic code may 2249 // have decided to split the inputs already or not. If not, we still need to 2250 // extract the values. 
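    // e.g. (illustrative) a vgpr s64 G_XOR becomes two s32 G_XORs on the
    // unmerged halves:
    //   %dst_lo:vgpr(s32) = G_XOR %src0_lo, %src1_lo
    //   %dst_hi:vgpr(s32) = G_XOR %src0_hi, %src1_hi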
2251 MachineIRBuilder B(MI); 2252 2253 if (Src0Regs.empty()) 2254 split64BitValueForMapping(B, Src0Regs, HalfTy, MI.getOperand(1).getReg()); 2255 else 2256 setRegsToType(MRI, Src0Regs, HalfTy); 2257 2258 if (Src1Regs.empty()) 2259 split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg()); 2260 else 2261 setRegsToType(MRI, Src1Regs, HalfTy); 2262 2263 setRegsToType(MRI, DefRegs, HalfTy); 2264 2265 B.buildInstr(Opc, {DefRegs[0]}, {Src0Regs[0], Src1Regs[0]}); 2266 B.buildInstr(Opc, {DefRegs[1]}, {Src0Regs[1], Src1Regs[1]}); 2267 2268 MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank); 2269 MI.eraseFromParent(); 2270 return; 2271 } 2272 case AMDGPU::G_ABS: { 2273 Register SrcReg = MI.getOperand(1).getReg(); 2274 const RegisterBank *SrcBank = MRI.getRegBankOrNull(SrcReg); 2275 2276 // There is no VALU abs instruction so we need to replace it with a sub and 2277 // max combination. 2278 if (SrcBank && SrcBank == &AMDGPU::VGPRRegBank) { 2279 MachineFunction *MF = MI.getParent()->getParent(); 2280 ApplyRegBankMapping Apply(*this, MRI, &AMDGPU::VGPRRegBank); 2281 MachineIRBuilder B(MI, Apply); 2282 LegalizerHelper Helper(*MF, Apply, B); 2283 2284 if (Helper.lowerAbsToMaxNeg(MI) != LegalizerHelper::Legalized) 2285 llvm_unreachable("lowerAbsToMaxNeg should have succeeded"); 2286 return; 2287 } 2288 LLVM_FALLTHROUGH; 2289 } 2290 case AMDGPU::G_ADD: 2291 case AMDGPU::G_SUB: 2292 case AMDGPU::G_MUL: 2293 case AMDGPU::G_SHL: 2294 case AMDGPU::G_LSHR: 2295 case AMDGPU::G_ASHR: 2296 case AMDGPU::G_SMIN: 2297 case AMDGPU::G_SMAX: 2298 case AMDGPU::G_UMIN: 2299 case AMDGPU::G_UMAX: { 2300 Register DstReg = MI.getOperand(0).getReg(); 2301 LLT DstTy = MRI.getType(DstReg); 2302 2303 // 16-bit operations are VALU only, but can be promoted to 32-bit SALU. 2304 // Packed 16-bit operations need to be scalarized and promoted. 2305 if (DstTy != LLT::scalar(16) && DstTy != LLT::fixed_vector(2, 16)) 2306 break; 2307 2308 const RegisterBank *DstBank = 2309 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; 2310 if (DstBank == &AMDGPU::VGPRRegBank) 2311 break; 2312 2313 const LLT S32 = LLT::scalar(32); 2314 MachineBasicBlock *MBB = MI.getParent(); 2315 MachineFunction *MF = MBB->getParent(); 2316 ApplyRegBankMapping ApplySALU(*this, MRI, &AMDGPU::SGPRRegBank); 2317 MachineIRBuilder B(MI, ApplySALU); 2318 2319 if (DstTy.isVector()) { 2320 Register WideSrc0Lo, WideSrc0Hi; 2321 Register WideSrc1Lo, WideSrc1Hi; 2322 2323 unsigned ExtendOp = getExtendOp(MI.getOpcode()); 2324 std::tie(WideSrc0Lo, WideSrc0Hi) 2325 = unpackV2S16ToS32(B, MI.getOperand(1).getReg(), ExtendOp); 2326 std::tie(WideSrc1Lo, WideSrc1Hi) 2327 = unpackV2S16ToS32(B, MI.getOperand(2).getReg(), ExtendOp); 2328 auto Lo = B.buildInstr(MI.getOpcode(), {S32}, {WideSrc0Lo, WideSrc1Lo}); 2329 auto Hi = B.buildInstr(MI.getOpcode(), {S32}, {WideSrc0Hi, WideSrc1Hi}); 2330 B.buildBuildVectorTrunc(DstReg, {Lo.getReg(0), Hi.getReg(0)}); 2331 MI.eraseFromParent(); 2332 } else { 2333 LegalizerHelper Helper(*MF, ApplySALU, B); 2334 2335 if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized) 2336 llvm_unreachable("widen scalar should have succeeded"); 2337 2338 // FIXME: s16 shift amounts should be legal. 
      if (Opc == AMDGPU::G_SHL || Opc == AMDGPU::G_LSHR ||
          Opc == AMDGPU::G_ASHR) {
        B.setInsertPt(*MBB, MI.getIterator());
        if (Helper.widenScalar(MI, 1, S32) != LegalizerHelper::Legalized)
          llvm_unreachable("widen scalar should have succeeded");
      }
    }

    return;
  }
  case AMDGPU::G_SEXT_INREG: {
    SmallVector<Register, 2> SrcRegs(OpdMapper.getVRegs(1));
    if (SrcRegs.empty())
      break; // Nothing to repair

    const LLT S32 = LLT::scalar(32);
    MachineIRBuilder B(MI);
    ApplyRegBankMapping O(*this, MRI, &AMDGPU::VGPRRegBank);
    GISelObserverWrapper Observer(&O);
    B.setChangeObserver(Observer);

    // Don't use LegalizerHelper's narrowScalar. It produces unwanted G_SEXTs
    // we would need to further expand, and doesn't let us directly set the
    // result registers.
    SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0));

    int Amt = MI.getOperand(2).getImm();
    if (Amt <= 32) {
      if (Amt == 32) {
        // The low bits are unchanged.
        B.buildCopy(DstRegs[0], SrcRegs[0]);
      } else {
        // Extend in the low bits and propagate the sign bit to the high half.
        B.buildSExtInReg(DstRegs[0], SrcRegs[0], Amt);
      }

      B.buildAShr(DstRegs[1], DstRegs[0], B.buildConstant(S32, 31));
    } else {
      // The low bits are unchanged, and extend in the high bits.
      B.buildCopy(DstRegs[0], SrcRegs[0]);
      B.buildSExtInReg(DstRegs[1], SrcRegs[1], Amt - 32);
    }

    Register DstReg = MI.getOperand(0).getReg();
    MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
    MI.eraseFromParent();
    return;
  }
  case AMDGPU::G_CTPOP:
  case AMDGPU::G_BITREVERSE: {
    const RegisterBank *DstBank =
      OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
    if (DstBank == &AMDGPU::SGPRRegBank)
      break;

    Register SrcReg = MI.getOperand(1).getReg();
    const LLT S32 = LLT::scalar(32);
    LLT Ty = MRI.getType(SrcReg);
    if (Ty == S32)
      break;

    ApplyRegBankMapping ApplyVALU(*this, MRI, &AMDGPU::VGPRRegBank);
    MachineIRBuilder B(MI, ApplyVALU);

    MachineFunction &MF = B.getMF();
    LegalizerHelper Helper(MF, ApplyVALU, B);

    if (Helper.narrowScalar(MI, 1, S32) != LegalizerHelper::Legalized)
      llvm_unreachable("narrowScalar should have succeeded");
    return;
  }
  case AMDGPU::G_AMDGPU_FFBH_U32:
  case AMDGPU::G_AMDGPU_FFBL_B32:
  case AMDGPU::G_CTLZ_ZERO_UNDEF:
  case AMDGPU::G_CTTZ_ZERO_UNDEF: {
    const RegisterBank *DstBank =
      OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
    if (DstBank == &AMDGPU::SGPRRegBank)
      break;

    Register SrcReg = MI.getOperand(1).getReg();
    const LLT S32 = LLT::scalar(32);
    LLT Ty = MRI.getType(SrcReg);
    if (Ty == S32)
      break;

    // We can narrow this more efficiently than Helper can by using ffbh/ffbl
    // which return -1 when the input is zero:
    // (ctlz_zero_undef hi:lo) -> (umin (ffbh hi), (add (ffbh lo), 32))
    // (cttz_zero_undef hi:lo) -> (umin (add (ffbl hi), 32), (ffbl lo))
    // (ffbh hi:lo) -> (umin (ffbh hi), (uaddsat (ffbh lo), 32))
    // (ffbl hi:lo) -> (umin (uaddsat (ffbl hi), 32), (ffbl lo))
    ApplyRegBankMapping ApplyVALU(*this, MRI, &AMDGPU::VGPRRegBank);
    MachineIRBuilder B(MI, ApplyVALU);
    SmallVector<Register, 2> SrcRegs(OpdMapper.getVRegs(1));
    unsigned NewOpc = Opc == AMDGPU::G_CTLZ_ZERO_UNDEF
                          ?
(unsigned)AMDGPU::G_AMDGPU_FFBH_U32 2436 : Opc == AMDGPU::G_CTTZ_ZERO_UNDEF 2437 ? (unsigned)AMDGPU::G_AMDGPU_FFBL_B32 2438 : Opc; 2439 unsigned Idx = NewOpc == AMDGPU::G_AMDGPU_FFBH_U32; 2440 auto X = B.buildInstr(NewOpc, {S32}, {SrcRegs[Idx]}); 2441 auto Y = B.buildInstr(NewOpc, {S32}, {SrcRegs[Idx ^ 1]}); 2442 unsigned AddOpc = 2443 Opc == AMDGPU::G_CTLZ_ZERO_UNDEF || Opc == AMDGPU::G_CTTZ_ZERO_UNDEF 2444 ? AMDGPU::G_ADD 2445 : AMDGPU::G_UADDSAT; 2446 Y = B.buildInstr(AddOpc, {S32}, {Y, B.buildConstant(S32, 32)}); 2447 Register DstReg = MI.getOperand(0).getReg(); 2448 B.buildUMin(DstReg, X, Y); 2449 MI.eraseFromParent(); 2450 return; 2451 } 2452 case AMDGPU::G_SEXT: 2453 case AMDGPU::G_ZEXT: 2454 case AMDGPU::G_ANYEXT: { 2455 Register SrcReg = MI.getOperand(1).getReg(); 2456 LLT SrcTy = MRI.getType(SrcReg); 2457 const bool Signed = Opc == AMDGPU::G_SEXT; 2458 2459 assert(empty(OpdMapper.getVRegs(1))); 2460 2461 MachineIRBuilder B(MI); 2462 const RegisterBank *SrcBank = 2463 OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank; 2464 2465 Register DstReg = MI.getOperand(0).getReg(); 2466 LLT DstTy = MRI.getType(DstReg); 2467 if (DstTy.isScalar() && 2468 SrcBank != &AMDGPU::SGPRRegBank && 2469 SrcBank != &AMDGPU::VCCRegBank && 2470 // FIXME: Should handle any type that round to s64 when irregular 2471 // breakdowns supported. 2472 DstTy.getSizeInBits() == 64 && 2473 SrcTy.getSizeInBits() <= 32) { 2474 SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0)); 2475 2476 // Extend to 32-bit, and then extend the low half. 2477 if (Signed) { 2478 // TODO: Should really be buildSExtOrCopy 2479 B.buildSExtOrTrunc(DefRegs[0], SrcReg); 2480 } else if (Opc == AMDGPU::G_ZEXT) { 2481 B.buildZExtOrTrunc(DefRegs[0], SrcReg); 2482 } else { 2483 B.buildAnyExtOrTrunc(DefRegs[0], SrcReg); 2484 } 2485 2486 extendLow32IntoHigh32(B, DefRegs[1], DefRegs[0], Opc, *SrcBank); 2487 MRI.setRegBank(DstReg, *SrcBank); 2488 MI.eraseFromParent(); 2489 return; 2490 } 2491 2492 if (SrcTy != LLT::scalar(1)) 2493 return; 2494 2495 // It is not legal to have a legalization artifact with a VCC source. Rather 2496 // than introducing a copy, insert the select we would have to select the 2497 // copy to. 2498 if (SrcBank == &AMDGPU::VCCRegBank) { 2499 SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0)); 2500 2501 const RegisterBank *DstBank = &AMDGPU::VGPRRegBank; 2502 2503 unsigned DstSize = DstTy.getSizeInBits(); 2504 // 64-bit select is SGPR only 2505 const bool UseSel64 = DstSize > 32 && 2506 SrcBank->getID() == AMDGPU::SGPRRegBankID; 2507 2508 // TODO: Should s16 select be legal? 2509 LLT SelType = UseSel64 ? LLT::scalar(64) : LLT::scalar(32); 2510 auto True = B.buildConstant(SelType, Signed ? 
-1 : 1); 2511 auto False = B.buildConstant(SelType, 0); 2512 2513 MRI.setRegBank(True.getReg(0), *DstBank); 2514 MRI.setRegBank(False.getReg(0), *DstBank); 2515 MRI.setRegBank(DstReg, *DstBank); 2516 2517 if (DstSize > 32) { 2518 B.buildSelect(DefRegs[0], SrcReg, True, False); 2519 extendLow32IntoHigh32(B, DefRegs[1], DefRegs[0], Opc, *SrcBank, true); 2520 } else if (DstSize < 32) { 2521 auto Sel = B.buildSelect(SelType, SrcReg, True, False); 2522 MRI.setRegBank(Sel.getReg(0), *DstBank); 2523 B.buildTrunc(DstReg, Sel); 2524 } else { 2525 B.buildSelect(DstReg, SrcReg, True, False); 2526 } 2527 2528 MI.eraseFromParent(); 2529 return; 2530 } 2531 2532 break; 2533 } 2534 case AMDGPU::G_BUILD_VECTOR: 2535 case AMDGPU::G_BUILD_VECTOR_TRUNC: { 2536 Register DstReg = MI.getOperand(0).getReg(); 2537 LLT DstTy = MRI.getType(DstReg); 2538 if (DstTy != LLT::fixed_vector(2, 16)) 2539 break; 2540 2541 assert(MI.getNumOperands() == 3 && OpdMapper.getVRegs(0).empty()); 2542 substituteSimpleCopyRegs(OpdMapper, 1); 2543 substituteSimpleCopyRegs(OpdMapper, 2); 2544 2545 const RegisterBank *DstBank = 2546 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; 2547 if (DstBank == &AMDGPU::SGPRRegBank) 2548 break; // Can use S_PACK_* instructions. 2549 2550 MachineIRBuilder B(MI); 2551 2552 Register Lo = MI.getOperand(1).getReg(); 2553 Register Hi = MI.getOperand(2).getReg(); 2554 const LLT S32 = LLT::scalar(32); 2555 2556 const RegisterBank *BankLo = 2557 OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank; 2558 const RegisterBank *BankHi = 2559 OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank; 2560 2561 Register ZextLo; 2562 Register ShiftHi; 2563 2564 if (Opc == AMDGPU::G_BUILD_VECTOR) { 2565 ZextLo = B.buildZExt(S32, Lo).getReg(0); 2566 MRI.setRegBank(ZextLo, *BankLo); 2567 2568 Register ZextHi = B.buildZExt(S32, Hi).getReg(0); 2569 MRI.setRegBank(ZextHi, *BankHi); 2570 2571 auto ShiftAmt = B.buildConstant(S32, 16); 2572 MRI.setRegBank(ShiftAmt.getReg(0), *BankHi); 2573 2574 ShiftHi = B.buildShl(S32, ZextHi, ShiftAmt).getReg(0); 2575 MRI.setRegBank(ShiftHi, *BankHi); 2576 } else { 2577 Register MaskLo = B.buildConstant(S32, 0xffff).getReg(0); 2578 MRI.setRegBank(MaskLo, *BankLo); 2579 2580 auto ShiftAmt = B.buildConstant(S32, 16); 2581 MRI.setRegBank(ShiftAmt.getReg(0), *BankHi); 2582 2583 ShiftHi = B.buildShl(S32, Hi, ShiftAmt).getReg(0); 2584 MRI.setRegBank(ShiftHi, *BankHi); 2585 2586 ZextLo = B.buildAnd(S32, Lo, MaskLo).getReg(0); 2587 MRI.setRegBank(ZextLo, *BankLo); 2588 } 2589 2590 auto Or = B.buildOr(S32, ZextLo, ShiftHi); 2591 MRI.setRegBank(Or.getReg(0), *DstBank); 2592 2593 B.buildBitcast(DstReg, Or); 2594 MI.eraseFromParent(); 2595 return; 2596 } 2597 case AMDGPU::G_EXTRACT_VECTOR_ELT: { 2598 SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0)); 2599 2600 assert(OpdMapper.getVRegs(1).empty() && OpdMapper.getVRegs(2).empty()); 2601 2602 Register DstReg = MI.getOperand(0).getReg(); 2603 Register SrcReg = MI.getOperand(1).getReg(); 2604 2605 const LLT S32 = LLT::scalar(32); 2606 LLT DstTy = MRI.getType(DstReg); 2607 LLT SrcTy = MRI.getType(SrcReg); 2608 2609 if (foldExtractEltToCmpSelect(MI, MRI, OpdMapper)) 2610 return; 2611 2612 MachineIRBuilder B(MI); 2613 2614 const ValueMapping &DstMapping 2615 = OpdMapper.getInstrMapping().getOperandMapping(0); 2616 const RegisterBank *DstBank = DstMapping.BreakDown[0].RegBank; 2617 const RegisterBank *SrcBank = 2618 OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank; 2619 const 
RegisterBank *IdxBank = 2620 OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank; 2621 2622 Register BaseIdxReg; 2623 unsigned ConstOffset; 2624 std::tie(BaseIdxReg, ConstOffset) = 2625 AMDGPU::getBaseWithConstantOffset(MRI, MI.getOperand(2).getReg()); 2626 2627 // See if the index is an add of a constant which will be foldable by moving 2628 // the base register of the index later if this is going to be executed in a 2629 // waterfall loop. This is essentially to reassociate the add of a constant 2630 // with the readfirstlane. 2631 bool ShouldMoveIndexIntoLoop = IdxBank != &AMDGPU::SGPRRegBank && 2632 ConstOffset > 0 && 2633 ConstOffset < SrcTy.getNumElements(); 2634 2635 // Move the base register. We'll re-insert the add later. 2636 if (ShouldMoveIndexIntoLoop) 2637 MI.getOperand(2).setReg(BaseIdxReg); 2638 2639 // If this is a VGPR result only because the index was a VGPR result, the 2640 // actual indexing will be done on the SGPR source vector, which will 2641 // produce a scalar result. We need to copy to the VGPR result inside the 2642 // waterfall loop. 2643 const bool NeedCopyToVGPR = DstBank == &AMDGPU::VGPRRegBank && 2644 SrcBank == &AMDGPU::SGPRRegBank; 2645 if (DstRegs.empty()) { 2646 applyDefaultMapping(OpdMapper); 2647 2648 executeInWaterfallLoop(MI, MRI, { 2 }); 2649 2650 if (NeedCopyToVGPR) { 2651 // We don't want a phi for this temporary reg. 2652 Register TmpReg = MRI.createGenericVirtualRegister(DstTy); 2653 MRI.setRegBank(TmpReg, AMDGPU::SGPRRegBank); 2654 MI.getOperand(0).setReg(TmpReg); 2655 B.setInsertPt(*MI.getParent(), ++MI.getIterator()); 2656 2657 // Use a v_mov_b32 here to make the exec dependency explicit. 2658 buildVCopy(B, DstReg, TmpReg); 2659 } 2660 2661 // Re-insert the constant offset add inside the waterfall loop. 2662 if (ShouldMoveIndexIntoLoop) 2663 reinsertVectorIndexAdd(B, MI, 2, ConstOffset); 2664 2665 return; 2666 } 2667 2668 assert(DstTy.getSizeInBits() == 64); 2669 2670 LLT Vec32 = LLT::fixed_vector(2 * SrcTy.getNumElements(), 32); 2671 2672 auto CastSrc = B.buildBitcast(Vec32, SrcReg); 2673 auto One = B.buildConstant(S32, 1); 2674 2675 MachineBasicBlock::iterator MII = MI.getIterator(); 2676 2677 // Split the vector index into 32-bit pieces. Prepare to move all of the 2678 // new instructions into a waterfall loop if necessary. 2679 // 2680 // Don't put the bitcast or constant in the loop. 2681 MachineInstrSpan Span(MII, &B.getMBB()); 2682 2683 // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1). 2684 auto IdxLo = B.buildShl(S32, BaseIdxReg, One); 2685 auto IdxHi = B.buildAdd(S32, IdxLo, One); 2686 2687 auto Extract0 = B.buildExtractVectorElement(DstRegs[0], CastSrc, IdxLo); 2688 auto Extract1 = B.buildExtractVectorElement(DstRegs[1], CastSrc, IdxHi); 2689 2690 MRI.setRegBank(DstReg, *DstBank); 2691 MRI.setRegBank(CastSrc.getReg(0), *SrcBank); 2692 MRI.setRegBank(One.getReg(0), AMDGPU::SGPRRegBank); 2693 MRI.setRegBank(IdxLo.getReg(0), AMDGPU::SGPRRegBank); 2694 MRI.setRegBank(IdxHi.getReg(0), AMDGPU::SGPRRegBank); 2695 2696 SmallSet<Register, 4> OpsToWaterfall; 2697 if (!collectWaterfallOperands(OpsToWaterfall, MI, MRI, { 2 })) { 2698 MI.eraseFromParent(); 2699 return; 2700 } 2701 2702 // Remove the original instruction to avoid potentially confusing the 2703 // waterfall loop logic. 
2704 B.setInstr(*Span.begin()); 2705 MI.eraseFromParent(); 2706 executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()), 2707 OpsToWaterfall, MRI); 2708 2709 if (NeedCopyToVGPR) { 2710 MachineBasicBlock *LoopBB = Extract1->getParent(); 2711 Register TmpReg0 = MRI.createGenericVirtualRegister(S32); 2712 Register TmpReg1 = MRI.createGenericVirtualRegister(S32); 2713 MRI.setRegBank(TmpReg0, AMDGPU::SGPRRegBank); 2714 MRI.setRegBank(TmpReg1, AMDGPU::SGPRRegBank); 2715 2716 Extract0->getOperand(0).setReg(TmpReg0); 2717 Extract1->getOperand(0).setReg(TmpReg1); 2718 2719 B.setInsertPt(*LoopBB, ++Extract1->getIterator()); 2720 2721 buildVCopy(B, DstRegs[0], TmpReg0); 2722 buildVCopy(B, DstRegs[1], TmpReg1); 2723 } 2724 2725 if (ShouldMoveIndexIntoLoop) 2726 reinsertVectorIndexAdd(B, *IdxLo, 1, ConstOffset); 2727 2728 return; 2729 } 2730 case AMDGPU::G_INSERT_VECTOR_ELT: { 2731 SmallVector<Register, 2> InsRegs(OpdMapper.getVRegs(2)); 2732 2733 Register DstReg = MI.getOperand(0).getReg(); 2734 LLT VecTy = MRI.getType(DstReg); 2735 2736 assert(OpdMapper.getVRegs(0).empty()); 2737 assert(OpdMapper.getVRegs(3).empty()); 2738 2739 if (substituteSimpleCopyRegs(OpdMapper, 1)) 2740 MRI.setType(MI.getOperand(1).getReg(), VecTy); 2741 2742 if (foldInsertEltToCmpSelect(MI, MRI, OpdMapper)) 2743 return; 2744 2745 const RegisterBank *IdxBank = 2746 OpdMapper.getInstrMapping().getOperandMapping(3).BreakDown[0].RegBank; 2747 2748 Register SrcReg = MI.getOperand(1).getReg(); 2749 Register InsReg = MI.getOperand(2).getReg(); 2750 LLT InsTy = MRI.getType(InsReg); 2751 (void)InsTy; 2752 2753 Register BaseIdxReg; 2754 unsigned ConstOffset; 2755 std::tie(BaseIdxReg, ConstOffset) = 2756 AMDGPU::getBaseWithConstantOffset(MRI, MI.getOperand(3).getReg()); 2757 2758 // See if the index is an add of a constant which will be foldable by moving 2759 // the base register of the index later if this is going to be executed in a 2760 // waterfall loop. This is essentially to reassociate the add of a constant 2761 // with the readfirstlane. 2762 bool ShouldMoveIndexIntoLoop = IdxBank != &AMDGPU::SGPRRegBank && 2763 ConstOffset > 0 && 2764 ConstOffset < VecTy.getNumElements(); 2765 2766 // Move the base register. We'll re-insert the add later. 2767 if (ShouldMoveIndexIntoLoop) 2768 MI.getOperand(3).setReg(BaseIdxReg); 2769 2770 2771 if (InsRegs.empty()) { 2772 executeInWaterfallLoop(MI, MRI, { 3 }); 2773 2774 // Re-insert the constant offset add inside the waterfall loop. 2775 if (ShouldMoveIndexIntoLoop) { 2776 MachineIRBuilder B(MI); 2777 reinsertVectorIndexAdd(B, MI, 3, ConstOffset); 2778 } 2779 2780 return; 2781 } 2782 2783 2784 assert(InsTy.getSizeInBits() == 64); 2785 2786 const LLT S32 = LLT::scalar(32); 2787 LLT Vec32 = LLT::fixed_vector(2 * VecTy.getNumElements(), 32); 2788 2789 MachineIRBuilder B(MI); 2790 auto CastSrc = B.buildBitcast(Vec32, SrcReg); 2791 auto One = B.buildConstant(S32, 1); 2792 2793 // Split the vector index into 32-bit pieces. Prepare to move all of the 2794 // new instructions into a waterfall loop if necessary. 2795 // 2796 // Don't put the bitcast or constant in the loop. 2797 MachineInstrSpan Span(MachineBasicBlock::iterator(&MI), &B.getMBB()); 2798 2799 // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1). 
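    // e.g. inserting the s64 element at index 3 of a <4 x s64> touches 32-bit
    // elements 6 and 7 of the bitcast <8 x s32>: IdxLo = 3 << 1 = 6,
    // IdxHi = IdxLo + 1 = 7.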
2800 auto IdxLo = B.buildShl(S32, BaseIdxReg, One); 2801 auto IdxHi = B.buildAdd(S32, IdxLo, One); 2802 2803 auto InsLo = B.buildInsertVectorElement(Vec32, CastSrc, InsRegs[0], IdxLo); 2804 auto InsHi = B.buildInsertVectorElement(Vec32, InsLo, InsRegs[1], IdxHi); 2805 2806 const RegisterBank *DstBank = 2807 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; 2808 const RegisterBank *SrcBank = 2809 OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank; 2810 const RegisterBank *InsSrcBank = 2811 OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank; 2812 2813 MRI.setRegBank(InsReg, *InsSrcBank); 2814 MRI.setRegBank(CastSrc.getReg(0), *SrcBank); 2815 MRI.setRegBank(InsLo.getReg(0), *DstBank); 2816 MRI.setRegBank(InsHi.getReg(0), *DstBank); 2817 MRI.setRegBank(One.getReg(0), AMDGPU::SGPRRegBank); 2818 MRI.setRegBank(IdxLo.getReg(0), AMDGPU::SGPRRegBank); 2819 MRI.setRegBank(IdxHi.getReg(0), AMDGPU::SGPRRegBank); 2820 2821 2822 SmallSet<Register, 4> OpsToWaterfall; 2823 if (!collectWaterfallOperands(OpsToWaterfall, MI, MRI, { 3 })) { 2824 B.setInsertPt(B.getMBB(), MI); 2825 B.buildBitcast(DstReg, InsHi); 2826 MI.eraseFromParent(); 2827 return; 2828 } 2829 2830 B.setInstr(*Span.begin()); 2831 MI.eraseFromParent(); 2832 2833 // Figure out the point after the waterfall loop before mangling the control 2834 // flow. 2835 executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()), 2836 OpsToWaterfall, MRI); 2837 2838 // The insertion point is now right after the original instruction. 2839 // 2840 // Keep the bitcast to the original vector type out of the loop. Doing this 2841 // saved an extra phi we don't need inside the loop. 2842 B.buildBitcast(DstReg, InsHi); 2843 2844 // Re-insert the constant offset add inside the waterfall loop. 
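    // e.g. (illustrative) for an original index (%idx + 2), the waterfall
    // loop readfirstlanes %idx alone, and the +2 is re-added to the base
    // index so the add lands inside the loop.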
2845 if (ShouldMoveIndexIntoLoop) 2846 reinsertVectorIndexAdd(B, *IdxLo, 1, ConstOffset); 2847 2848 return; 2849 } 2850 case AMDGPU::G_AMDGPU_BUFFER_LOAD: 2851 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT: 2852 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT: 2853 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE: 2854 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE: 2855 case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT: 2856 case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16: 2857 case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT: 2858 case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16: 2859 case AMDGPU::G_AMDGPU_BUFFER_STORE: 2860 case AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE: 2861 case AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT: 2862 case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT: 2863 case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16: 2864 case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT: 2865 case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16: { 2866 applyDefaultMapping(OpdMapper); 2867 executeInWaterfallLoop(MI, MRI, {1, 4}); 2868 return; 2869 } 2870 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP: 2871 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD: 2872 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB: 2873 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN: 2874 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN: 2875 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX: 2876 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX: 2877 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND: 2878 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR: 2879 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR: 2880 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC: 2881 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC: { 2882 applyDefaultMapping(OpdMapper); 2883 executeInWaterfallLoop(MI, MRI, {2, 5}); 2884 return; 2885 } 2886 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD: 2887 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN: 2888 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX: { 2889 applyDefaultMapping(OpdMapper); 2890 executeInWaterfallLoop(MI, MRI, {2, 5}); 2891 return; 2892 } 2893 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP: { 2894 applyDefaultMapping(OpdMapper); 2895 executeInWaterfallLoop(MI, MRI, {3, 6}); 2896 return; 2897 } 2898 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD: { 2899 applyMappingSBufferLoad(OpdMapper); 2900 return; 2901 } 2902 case AMDGPU::G_INTRINSIC: { 2903 switch (MI.getIntrinsicID()) { 2904 case Intrinsic::amdgcn_readlane: { 2905 substituteSimpleCopyRegs(OpdMapper, 2); 2906 2907 assert(OpdMapper.getVRegs(0).empty()); 2908 assert(OpdMapper.getVRegs(3).empty()); 2909 2910 // Make sure the index is an SGPR. It doesn't make sense to run this in a 2911 // waterfall loop, so assume it's a uniform value. 2912 constrainOpWithReadfirstlane(MI, MRI, 3); // Index 2913 return; 2914 } 2915 case Intrinsic::amdgcn_writelane: { 2916 assert(OpdMapper.getVRegs(0).empty()); 2917 assert(OpdMapper.getVRegs(2).empty()); 2918 assert(OpdMapper.getVRegs(3).empty()); 2919 2920 substituteSimpleCopyRegs(OpdMapper, 4); // VGPR input val 2921 constrainOpWithReadfirstlane(MI, MRI, 2); // Source value 2922 constrainOpWithReadfirstlane(MI, MRI, 3); // Index 2923 return; 2924 } 2925 case Intrinsic::amdgcn_interp_p1: 2926 case Intrinsic::amdgcn_interp_p2: 2927 case Intrinsic::amdgcn_interp_mov: 2928 case Intrinsic::amdgcn_interp_p1_f16: 2929 case Intrinsic::amdgcn_interp_p2_f16: { 2930 applyDefaultMapping(OpdMapper); 2931 2932 // Readlane for m0 value, which is always the last operand. 2933 // FIXME: Should this be a waterfall loop instead? 
      constrainOpWithReadfirstlane(MI, MRI, MI.getNumOperands() - 1); // Index
      return;
    }
    case Intrinsic::amdgcn_permlane16:
    case Intrinsic::amdgcn_permlanex16: {
      // Doing a waterfall loop over these wouldn't make any sense.
      substituteSimpleCopyRegs(OpdMapper, 2);
      substituteSimpleCopyRegs(OpdMapper, 3);
      constrainOpWithReadfirstlane(MI, MRI, 4);
      constrainOpWithReadfirstlane(MI, MRI, 5);
      return;
    }
    case Intrinsic::amdgcn_sbfe:
      applyMappingBFE(OpdMapper, true);
      return;
    case Intrinsic::amdgcn_ubfe:
      applyMappingBFE(OpdMapper, false);
      return;
    case Intrinsic::amdgcn_ballot:
      // Use default handling and insert copy to vcc source.
      break;
    }
    break;
  }
  case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
  case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16:
  case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE:
  case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16: {
    const AMDGPU::RsrcIntrinsic *RSrcIntrin
      = AMDGPU::lookupRsrcIntrinsic(MI.getIntrinsicID());
    assert(RSrcIntrin && RSrcIntrin->IsImage);
    // Non-images can have complications from operands that allow both SGPR
    // and VGPR. For now it's too complicated to figure out the final opcode
    // to derive the register bank from the MCInstrDesc.
    applyMappingImage(MI, OpdMapper, MRI, RSrcIntrin->RsrcArg);
    return;
  }
  case AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY: {
    unsigned N = MI.getNumExplicitOperands() - 2;
    applyDefaultMapping(OpdMapper);
    executeInWaterfallLoop(MI, MRI, { N });
    return;
  }
  case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: {
    auto IntrID = MI.getIntrinsicID();
    switch (IntrID) {
    case Intrinsic::amdgcn_ds_ordered_add:
    case Intrinsic::amdgcn_ds_ordered_swap: {
      // This is only allowed to execute with 1 lane, so readfirstlane is safe.
      assert(OpdMapper.getVRegs(0).empty());
      substituteSimpleCopyRegs(OpdMapper, 3);
      constrainOpWithReadfirstlane(MI, MRI, 2); // M0
      return;
    }
    case Intrinsic::amdgcn_ds_gws_init:
    case Intrinsic::amdgcn_ds_gws_barrier:
    case Intrinsic::amdgcn_ds_gws_sema_br: {
      // Only the first lane is executed, so readfirstlane is safe.
      substituteSimpleCopyRegs(OpdMapper, 1);
      constrainOpWithReadfirstlane(MI, MRI, 2); // M0
      return;
    }
    case Intrinsic::amdgcn_ds_gws_sema_v:
    case Intrinsic::amdgcn_ds_gws_sema_p:
    case Intrinsic::amdgcn_ds_gws_sema_release_all: {
      // Only the first lane is executed, so readfirstlane is safe.
      constrainOpWithReadfirstlane(MI, MRI, 1); // M0
      return;
    }
    case Intrinsic::amdgcn_ds_append:
    case Intrinsic::amdgcn_ds_consume: {
      constrainOpWithReadfirstlane(MI, MRI, 2); // M0
      return;
    }
    case Intrinsic::amdgcn_s_sendmsg:
    case Intrinsic::amdgcn_s_sendmsghalt: {
      // FIXME: Should this use a waterfall loop?
      constrainOpWithReadfirstlane(MI, MRI, 2); // M0
      return;
    }
    case Intrinsic::amdgcn_s_setreg: {
      constrainOpWithReadfirstlane(MI, MRI, 2);
      return;
    }
    default: {
      if (const AMDGPU::RsrcIntrinsic *RSrcIntrin =
              AMDGPU::lookupRsrcIntrinsic(IntrID)) {
        // Non-images can have complications from operands that allow both SGPR
        // and VGPR. For now it's too complicated to figure out the final opcode
        // to derive the register bank from the MCInstrDesc.
3024 if (RSrcIntrin->IsImage) { 3025 applyMappingImage(MI, OpdMapper, MRI, RSrcIntrin->RsrcArg); 3026 return; 3027 } 3028 } 3029 3030 break; 3031 } 3032 } 3033 break; 3034 } 3035 case AMDGPU::G_SI_CALL: { 3036 // Use a set to avoid extra readfirstlanes in the case where multiple 3037 // operands are the same register. 3038 SmallSet<Register, 4> SGPROperandRegs; 3039 3040 if (!collectWaterfallOperands(SGPROperandRegs, MI, MRI, {1})) 3041 break; 3042 3043 // Move all copies to physical SGPRs that are used by the call instruction 3044 // into the loop block. Start searching for these copies until the 3045 // ADJCALLSTACKUP. 3046 unsigned FrameSetupOpcode = AMDGPU::ADJCALLSTACKUP; 3047 unsigned FrameDestroyOpcode = AMDGPU::ADJCALLSTACKDOWN; 3048 3049 // Move all non-copies before the copies, so that a complete range can be 3050 // moved into the waterfall loop. 3051 SmallVector<MachineInstr *, 4> NonCopyInstrs; 3052 // Count of NonCopyInstrs found until the current LastCopy. 3053 unsigned NonCopyInstrsLen = 0; 3054 MachineBasicBlock::iterator Start(&MI); 3055 MachineBasicBlock::iterator LastCopy = Start; 3056 MachineBasicBlock *MBB = MI.getParent(); 3057 const SIMachineFunctionInfo *Info = 3058 MBB->getParent()->getInfo<SIMachineFunctionInfo>(); 3059 while (Start->getOpcode() != FrameSetupOpcode) { 3060 --Start; 3061 bool IsCopy = false; 3062 if (Start->getOpcode() == AMDGPU::COPY) { 3063 auto &Dst = Start->getOperand(0); 3064 if (Dst.isReg()) { 3065 Register Reg = Dst.getReg(); 3066 if (Reg.isPhysical() && MI.readsRegister(Reg, TRI)) { 3067 IsCopy = true; 3068 } else { 3069 // Also move the copy from the scratch rsrc descriptor into the loop 3070 // to allow it to be optimized away. 3071 auto &Src = Start->getOperand(1); 3072 if (Src.isReg()) { 3073 Reg = Src.getReg(); 3074 IsCopy = Info->getScratchRSrcReg() == Reg; 3075 } 3076 } 3077 } 3078 } 3079 3080 if (IsCopy) { 3081 LastCopy = Start; 3082 NonCopyInstrsLen = NonCopyInstrs.size(); 3083 } else { 3084 NonCopyInstrs.push_back(&*Start); 3085 } 3086 } 3087 NonCopyInstrs.resize(NonCopyInstrsLen); 3088 3089 for (auto *NonCopy : reverse(NonCopyInstrs)) { 3090 MBB->splice(LastCopy, MBB, NonCopy->getIterator()); 3091 } 3092 Start = LastCopy; 3093 3094 // Do the same for copies after the loop 3095 NonCopyInstrs.clear(); 3096 NonCopyInstrsLen = 0; 3097 MachineBasicBlock::iterator End(&MI); 3098 LastCopy = End; 3099 while (End->getOpcode() != FrameDestroyOpcode) { 3100 ++End; 3101 bool IsCopy = false; 3102 if (End->getOpcode() == AMDGPU::COPY) { 3103 auto &Src = End->getOperand(1); 3104 if (Src.isReg()) { 3105 Register Reg = Src.getReg(); 3106 IsCopy = Reg.isPhysical() && MI.modifiesRegister(Reg, TRI); 3107 } 3108 } 3109 3110 if (IsCopy) { 3111 LastCopy = End; 3112 NonCopyInstrsLen = NonCopyInstrs.size(); 3113 } else { 3114 NonCopyInstrs.push_back(&*End); 3115 } 3116 } 3117 NonCopyInstrs.resize(NonCopyInstrsLen); 3118 3119 End = LastCopy; 3120 ++LastCopy; 3121 for (auto *NonCopy : reverse(NonCopyInstrs)) { 3122 MBB->splice(LastCopy, MBB, NonCopy->getIterator()); 3123 } 3124 3125 ++End; 3126 MachineIRBuilder B(*Start); 3127 executeInWaterfallLoop(B, make_range(Start, End), SGPROperandRegs, MRI); 3128 break; 3129 } 3130 case AMDGPU::G_LOAD: 3131 case AMDGPU::G_ZEXTLOAD: 3132 case AMDGPU::G_SEXTLOAD: { 3133 if (applyMappingLoad(MI, OpdMapper, MRI)) 3134 return; 3135 break; 3136 } 3137 case AMDGPU::G_DYN_STACKALLOC: 3138 applyMappingDynStackAlloc(MI, OpdMapper, MRI); 3139 return; 3140 case AMDGPU::G_SBFX: 3141 applyMappingBFE(OpdMapper, /*Signed*/ true); 
3142 return; 3143 case AMDGPU::G_UBFX: 3144 applyMappingBFE(OpdMapper, /*Signed*/ false); 3145 return; 3146 default: 3147 break; 3148 } 3149 3150 return applyDefaultMapping(OpdMapper); 3151 } 3152 3153 // vgpr, sgpr -> vgpr 3154 // vgpr, agpr -> vgpr 3155 // agpr, agpr -> agpr 3156 // agpr, sgpr -> vgpr 3157 static unsigned regBankUnion(unsigned RB0, unsigned RB1) { 3158 if (RB0 == AMDGPU::InvalidRegBankID) 3159 return RB1; 3160 if (RB1 == AMDGPU::InvalidRegBankID) 3161 return RB0; 3162 3163 if (RB0 == AMDGPU::SGPRRegBankID && RB1 == AMDGPU::SGPRRegBankID) 3164 return AMDGPU::SGPRRegBankID; 3165 3166 if (RB0 == AMDGPU::AGPRRegBankID && RB1 == AMDGPU::AGPRRegBankID) 3167 return AMDGPU::AGPRRegBankID; 3168 3169 return AMDGPU::VGPRRegBankID; 3170 } 3171 3172 static unsigned regBankBoolUnion(unsigned RB0, unsigned RB1) { 3173 if (RB0 == AMDGPU::InvalidRegBankID) 3174 return RB1; 3175 if (RB1 == AMDGPU::InvalidRegBankID) 3176 return RB0; 3177 3178 // vcc, vcc -> vcc 3179 // vcc, sgpr -> vcc 3180 // vcc, vgpr -> vcc 3181 if (RB0 == AMDGPU::VCCRegBankID || RB1 == AMDGPU::VCCRegBankID) 3182 return AMDGPU::VCCRegBankID; 3183 3184 // Neither bank is vcc at this point, so fall back to the plain bank union. 3185 return regBankUnion(RB0, RB1); 3186 } 3187 3188 unsigned AMDGPURegisterBankInfo::getMappingType(const MachineRegisterInfo &MRI, 3189 const MachineInstr &MI) const { 3190 unsigned RegBank = AMDGPU::InvalidRegBankID; 3191 3192 for (const MachineOperand &MO : MI.operands()) { 3193 if (!MO.isReg()) 3194 continue; 3195 Register Reg = MO.getReg(); 3196 if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) { 3197 RegBank = regBankUnion(RegBank, Bank->getID()); 3198 if (RegBank == AMDGPU::VGPRRegBankID) 3199 break; 3200 } 3201 } 3202 3203 return RegBank; 3204 } 3205 3206 bool AMDGPURegisterBankInfo::isSALUMapping(const MachineInstr &MI) const { 3207 const MachineFunction &MF = *MI.getParent()->getParent(); 3208 const MachineRegisterInfo &MRI = MF.getRegInfo(); 3209 for (const MachineOperand &MO : MI.operands()) { 3210 if (!MO.isReg()) 3211 continue; 3212 Register Reg = MO.getReg(); 3213 if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) { 3214 if (Bank->getID() != AMDGPU::SGPRRegBankID) 3215 return false; 3216 } 3217 } 3218 return true; 3219 } 3220 3221 const RegisterBankInfo::InstructionMapping & 3222 AMDGPURegisterBankInfo::getDefaultMappingSOP(const MachineInstr &MI) const { 3223 const MachineFunction &MF = *MI.getParent()->getParent(); 3224 const MachineRegisterInfo &MRI = MF.getRegInfo(); 3225 SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands()); 3226 3227 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { 3228 const MachineOperand &SrcOp = MI.getOperand(i); 3229 if (!SrcOp.isReg()) 3230 continue; 3231 3232 unsigned Size = getSizeInBits(SrcOp.getReg(), MRI, *TRI); 3233 OpdsMapping[i] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); 3234 } 3235 return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping), 3236 MI.getNumOperands()); 3237 } 3238 3239 const RegisterBankInfo::InstructionMapping & 3240 AMDGPURegisterBankInfo::getDefaultMappingVOP(const MachineInstr &MI) const { 3241 const MachineFunction &MF = *MI.getParent()->getParent(); 3242 const MachineRegisterInfo &MRI = MF.getRegInfo(); 3243 SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands()); 3244 3245 // Even though we technically could use SGPRs, this would require knowledge of 3246 // the constant bus restriction. Force all sources to VGPR (except for VCC). 3247 // 3248 // TODO: Unary ops are trivially OK, so accept SGPRs?
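// Illustrative sketch (hypothetical MIR, not taken from a real test): with
// this all-VGPR mapping, a VALU op with an SGPR input gets a legalizing copy
// from RegBankSelect rather than risking a constant bus violation:
//   %2:sgpr(s32) = ...
//   %3:vgpr(s32) = COPY %2:sgpr(s32)   ; inserted to satisfy the mapping
//   %4:vgpr(s32) = G_FADD %1:vgpr(s32), %3:vgpr(s32)
// An s1 source instead reports the VCC bank, per the loop below.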
3249 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { 3250 const MachineOperand &Src = MI.getOperand(i); 3251 if (!Src.isReg()) 3252 continue; 3253 3254 unsigned Size = getSizeInBits(Src.getReg(), MRI, *TRI); 3255 unsigned BankID = Size == 1 ? AMDGPU::VCCRegBankID : AMDGPU::VGPRRegBankID; 3256 OpdsMapping[i] = AMDGPU::getValueMapping(BankID, Size); 3257 } 3258 3259 return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping), 3260 MI.getNumOperands()); 3261 } 3262 3263 const RegisterBankInfo::InstructionMapping & 3264 AMDGPURegisterBankInfo::getDefaultMappingAllVGPR(const MachineInstr &MI) const { 3265 const MachineFunction &MF = *MI.getParent()->getParent(); 3266 const MachineRegisterInfo &MRI = MF.getRegInfo(); 3267 SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands()); 3268 3269 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) { 3270 const MachineOperand &Op = MI.getOperand(I); 3271 if (!Op.isReg()) 3272 continue; 3273 3274 unsigned Size = getSizeInBits(Op.getReg(), MRI, *TRI); 3275 OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); 3276 } 3277 3278 return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping), 3279 MI.getNumOperands()); 3280 } 3281 3282 const RegisterBankInfo::InstructionMapping & 3283 AMDGPURegisterBankInfo::getImageMapping(const MachineRegisterInfo &MRI, 3284 const MachineInstr &MI, 3285 int RsrcIdx) const { 3286 // The reported argument index is relative to the IR intrinsic call arguments, 3287 // so we need to shift by the number of defs and the intrinsic ID. 3288 RsrcIdx += MI.getNumExplicitDefs() + 1; 3289 3290 const int NumOps = MI.getNumOperands(); 3291 SmallVector<const ValueMapping *, 8> OpdsMapping(NumOps); 3292 3293 // TODO: Should packed/unpacked D16 difference be reported here as part of 3294 // the value mapping? 3295 for (int I = 0; I != NumOps; ++I) { 3296 if (!MI.getOperand(I).isReg()) 3297 continue; 3298 3299 Register OpReg = MI.getOperand(I).getReg(); 3300 // We replace some dead address operands with $noreg. 3301 if (!OpReg) 3302 continue; 3303 3304 unsigned Size = getSizeInBits(OpReg, MRI, *TRI); 3305 3306 // FIXME: Probably need a new intrinsic register bank searchable table to 3307 // handle arbitrary intrinsics easily. 3308 // 3309 // If this has a sampler, it immediately follows rsrc. 3310 const bool MustBeSGPR = I == RsrcIdx || I == RsrcIdx + 1; 3311 3312 if (MustBeSGPR) { 3313 // This must be an SGPR, so we must report whatever it is as legal. 3314 unsigned NewBank = getRegBankID(OpReg, MRI, AMDGPU::SGPRRegBankID); 3315 OpdsMapping[I] = AMDGPU::getValueMapping(NewBank, Size); 3316 } else { 3317 // Some operands must be VGPR, and these are easy to copy to. 3318 OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); 3319 } 3320 } 3321 3322 return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping), NumOps); 3323 } 3324 3325 /// Return the mapping for a pointer argument. 3326 const RegisterBankInfo::ValueMapping * 3327 AMDGPURegisterBankInfo::getValueMappingForPtr(const MachineRegisterInfo &MRI, 3328 Register PtrReg) const { 3329 LLT PtrTy = MRI.getType(PtrReg); 3330 unsigned Size = PtrTy.getSizeInBits(); 3331 if (Subtarget.useFlatForGlobal() || 3332 !AMDGPU::isFlatGlobalAddrSpace(PtrTy.getAddressSpace())) 3333 return AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); 3334 3335 // If we're using MUBUF instructions for global memory, an SGPR base register 3336 // is possible. Otherwise this needs to be a VGPR.
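// Illustrative (assumed example, not from a real test): on a MUBUF-based
// subtarget a uniform global pointer may keep its SGPR bank and serve as a
// scalar base:
//   %ptr:sgpr(p1) = COPY $sgpr4_sgpr5   ; reported with its SGPR bank below
// When useFlatForGlobal() is set, the early return above already forced the
// VGPR bank instead.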
3337 const RegisterBank *PtrBank = getRegBank(PtrReg, MRI, *TRI); 3338 return AMDGPU::getValueMapping(PtrBank->getID(), Size); 3339 } 3340 3341 const RegisterBankInfo::InstructionMapping & 3342 AMDGPURegisterBankInfo::getInstrMappingForLoad(const MachineInstr &MI) const { 3343 3344 const MachineFunction &MF = *MI.getParent()->getParent(); 3345 const MachineRegisterInfo &MRI = MF.getRegInfo(); 3346 SmallVector<const ValueMapping*, 2> OpdsMapping(2); 3347 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); 3348 Register PtrReg = MI.getOperand(1).getReg(); 3349 LLT PtrTy = MRI.getType(PtrReg); 3350 unsigned AS = PtrTy.getAddressSpace(); 3351 unsigned PtrSize = PtrTy.getSizeInBits(); 3352 3353 const ValueMapping *ValMapping; 3354 const ValueMapping *PtrMapping; 3355 3356 const RegisterBank *PtrBank = getRegBank(PtrReg, MRI, *TRI); 3357 3358 if (PtrBank == &AMDGPU::SGPRRegBank && AMDGPU::isFlatGlobalAddrSpace(AS)) { 3359 if (isScalarLoadLegal(MI)) { 3360 // We have a uniform instruction, so we want to use an SMRD load. 3361 ValMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); 3362 PtrMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize); 3363 } else { 3364 ValMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); 3365 3366 // If we're using MUBUF instructions for global memory, an SGPR base 3367 // register is possible. Otherwise this needs to be a VGPR. 3368 unsigned PtrBankID = Subtarget.useFlatForGlobal() ? 3369 AMDGPU::VGPRRegBankID : AMDGPU::SGPRRegBankID; 3370 3371 PtrMapping = AMDGPU::getValueMapping(PtrBankID, PtrSize); 3372 } 3373 } else { 3374 ValMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); 3375 PtrMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize); 3376 } 3377 3378 OpdsMapping[0] = ValMapping; 3379 OpdsMapping[1] = PtrMapping; 3380 const RegisterBankInfo::InstructionMapping &Mapping = getInstructionMapping( 3381 1, 1, getOperandsMapping(OpdsMapping), MI.getNumOperands()); 3382 return Mapping; 3383 3384 // FIXME: Do we want to add a mapping for FLAT load, or should we just 3385 // handle that during instruction selection? 3386 } 3387 3388 unsigned 3389 AMDGPURegisterBankInfo::getRegBankID(Register Reg, 3390 const MachineRegisterInfo &MRI, 3391 unsigned Default) const { 3392 const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI); 3393 return Bank ? Bank->getID() : Default; 3394 } 3395 3396 const RegisterBankInfo::ValueMapping * 3397 AMDGPURegisterBankInfo::getSGPROpMapping(Register Reg, 3398 const MachineRegisterInfo &MRI, 3399 const TargetRegisterInfo &TRI) const { 3400 // Lie and claim anything is legal, even though this needs to be an SGPR; 3401 // applyMapping will have to deal with it as a waterfall loop.
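// E.g. (hypothetical MIR) a divergent rsrc operand is reported with its
// current VGPR bank here:
//   %rsrc:vgpr(<4 x s32>) = ...
// and applyMappingImpl later rewrites the user inside a waterfall loop,
// peeling off one uniform value per iteration via readfirstlane.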
3402 unsigned Bank = getRegBankID(Reg, MRI, AMDGPU::SGPRRegBankID); 3403 unsigned Size = getSizeInBits(Reg, MRI, TRI); 3404 return AMDGPU::getValueMapping(Bank, Size); 3405 } 3406 3407 const RegisterBankInfo::ValueMapping * 3408 AMDGPURegisterBankInfo::getVGPROpMapping(Register Reg, 3409 const MachineRegisterInfo &MRI, 3410 const TargetRegisterInfo &TRI) const { 3411 unsigned Size = getSizeInBits(Reg, MRI, TRI); 3412 return AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); 3413 } 3414 3415 const RegisterBankInfo::ValueMapping * 3416 AMDGPURegisterBankInfo::getAGPROpMapping(Register Reg, 3417 const MachineRegisterInfo &MRI, 3418 const TargetRegisterInfo &TRI) const { 3419 unsigned Size = getSizeInBits(Reg, MRI, TRI); 3420 return AMDGPU::getValueMapping(AMDGPU::AGPRRegBankID, Size); 3421 } 3422 3423 /// 3424 /// This function must return a legal mapping, because 3425 /// AMDGPURegisterBankInfo::getInstrAlternativeMappings() is not called 3426 /// in RegBankSelect::Mode::Fast. Any mapping that would cause a 3427 /// VGPR to SGPR copy to be generated is illegal. 3428 /// 3429 // Operands that must be SGPRs must accept potentially divergent VGPRs as 3430 // legal. These will be dealt with in applyMappingImpl. 3431 // 3432 const RegisterBankInfo::InstructionMapping & 3433 AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { 3434 const MachineFunction &MF = *MI.getParent()->getParent(); 3435 const MachineRegisterInfo &MRI = MF.getRegInfo(); 3436 3437 if (MI.isCopy() || MI.getOpcode() == AMDGPU::G_FREEZE) { 3438 // The default logic wastes time analyzing impossible alternative mappings. 3439 // We want the most straightforward mapping, so just directly handle this. 3440 const RegisterBank *DstBank = getRegBank(MI.getOperand(0).getReg(), MRI, 3441 *TRI); 3442 const RegisterBank *SrcBank = getRegBank(MI.getOperand(1).getReg(), MRI, 3443 *TRI); 3444 assert(SrcBank && "src bank should have been assigned already"); 3445 if (!DstBank) 3446 DstBank = SrcBank; 3447 3448 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); 3449 if (cannotCopy(*DstBank, *SrcBank, Size)) 3450 return getInvalidInstructionMapping(); 3451 3452 const ValueMapping &ValMap = getValueMapping(0, Size, *DstBank); 3453 unsigned OpdsMappingSize = MI.isCopy() ? 1 : 2; 3454 SmallVector<const ValueMapping *, 1> OpdsMapping(OpdsMappingSize); 3455 OpdsMapping[0] = &ValMap; 3456 if (MI.getOpcode() == AMDGPU::G_FREEZE) 3457 OpdsMapping[1] = &ValMap; 3458 3459 return getInstructionMapping( 3460 1, /*Cost*/ 1, 3461 /*OperandsMapping*/ getOperandsMapping(OpdsMapping), OpdsMappingSize); 3462 } 3463 3464 if (MI.isRegSequence()) { 3465 // If any input is a VGPR, the result must be a VGPR. The default handling 3466 // assumes any copy between banks is legal. 3467 unsigned BankID = AMDGPU::SGPRRegBankID; 3468 3469 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) { 3470 auto OpBank = getRegBankID(MI.getOperand(I).getReg(), MRI); 3471 // It doesn't make sense to use vcc or scc banks here, so just ignore 3472 // them. 3473 if (OpBank != AMDGPU::SGPRRegBankID) { 3474 BankID = AMDGPU::VGPRRegBankID; 3475 break; 3476 } 3477 } 3478 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); 3479 3480 const ValueMapping &ValMap = getValueMapping(0, Size, getRegBank(BankID)); 3481 return getInstructionMapping( 3482 1, /*Cost*/ 1, 3483 /*OperandsMapping*/ getOperandsMapping({&ValMap}), 1); 3484 } 3485 3486 // The default handling is broken and doesn't handle illegal SGPR->VGPR copies 3487 // properly.
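// For example (hypothetical MIR), a phi merging an SGPR and a VGPR input
// must define a VGPR, since an SGPR def would require an illegal
// VGPR->SGPR copy on one incoming edge:
//   %5:vgpr(s32) = G_PHI %3:sgpr(s32), %bb.0, %4:vgpr(s32), %bb.1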
3488 // 3489 // TODO: There are additional exec masking dependencies to analyze. 3490 if (MI.getOpcode() == TargetOpcode::G_PHI) { 3491 unsigned ResultBank = AMDGPU::InvalidRegBankID; 3492 Register DstReg = MI.getOperand(0).getReg(); 3493 3494 // Sometimes the result may have already been assigned a bank. 3495 if (const RegisterBank *DstBank = getRegBank(DstReg, MRI, *TRI)) 3496 ResultBank = DstBank->getID(); 3497 3498 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) { 3499 Register Reg = MI.getOperand(I).getReg(); 3500 const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI); 3501 3502 // FIXME: Assuming VGPR for any undetermined inputs. 3503 if (!Bank || Bank->getID() == AMDGPU::VGPRRegBankID) { 3504 ResultBank = AMDGPU::VGPRRegBankID; 3505 break; 3506 } 3507 3508 // FIXME: Need to promote SGPR case to s32 3509 unsigned OpBank = Bank->getID(); 3510 ResultBank = regBankBoolUnion(ResultBank, OpBank); 3511 } 3512 3513 assert(ResultBank != AMDGPU::InvalidRegBankID); 3514 3515 unsigned Size = MRI.getType(DstReg).getSizeInBits(); 3516 3517 const ValueMapping &ValMap = 3518 getValueMapping(0, Size, getRegBank(ResultBank)); 3519 return getInstructionMapping( 3520 1, /*Cost*/ 1, 3521 /*OperandsMapping*/ getOperandsMapping({&ValMap}), 1); 3522 } 3523 3524 const RegisterBankInfo::InstructionMapping &Mapping = getInstrMappingImpl(MI); 3525 if (Mapping.isValid()) 3526 return Mapping; 3527 3528 SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands()); 3529 3530 switch (MI.getOpcode()) { 3531 default: 3532 return getInvalidInstructionMapping(); 3533 3534 case AMDGPU::G_AND: 3535 case AMDGPU::G_OR: 3536 case AMDGPU::G_XOR: { 3537 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 3538 if (Size == 1) { 3539 const RegisterBank *DstBank 3540 = getRegBank(MI.getOperand(0).getReg(), MRI, *TRI); 3541 3542 unsigned TargetBankID = AMDGPU::InvalidRegBankID; 3543 unsigned BankLHS = AMDGPU::InvalidRegBankID; 3544 unsigned BankRHS = AMDGPU::InvalidRegBankID; 3545 if (DstBank) { 3546 TargetBankID = DstBank->getID(); 3547 if (DstBank == &AMDGPU::VCCRegBank) { 3548 TargetBankID = AMDGPU::VCCRegBankID; 3549 BankLHS = AMDGPU::VCCRegBankID; 3550 BankRHS = AMDGPU::VCCRegBankID; 3551 } else { 3552 BankLHS = getRegBankID(MI.getOperand(1).getReg(), MRI, 3553 AMDGPU::SGPRRegBankID); 3554 BankRHS = getRegBankID(MI.getOperand(2).getReg(), MRI, 3555 AMDGPU::SGPRRegBankID); 3556 } 3557 } else { 3558 BankLHS = getRegBankID(MI.getOperand(1).getReg(), MRI, 3559 AMDGPU::VCCRegBankID); 3560 BankRHS = getRegBankID(MI.getOperand(2).getReg(), MRI, 3561 AMDGPU::VCCRegBankID); 3562 3563 // Both inputs should be true booleans to produce a boolean result. 
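// Illustrative summary of the cases resolved below (assumed examples):
//   and(s1 vcc, s1 sgpr-bool) -> all three operands forced to vcc
//   and(s1 vgpr, s1 any)      -> vgpr (not a true boolean context)
//   and(s1 sgpr, s1 sgpr)     -> sgpr (a scalar boolean)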
3564 if (BankLHS == AMDGPU::VGPRRegBankID || BankRHS == AMDGPU::VGPRRegBankID) { 3565 TargetBankID = AMDGPU::VGPRRegBankID; 3566 } else if (BankLHS == AMDGPU::VCCRegBankID || BankRHS == AMDGPU::VCCRegBankID) { 3567 TargetBankID = AMDGPU::VCCRegBankID; 3568 BankLHS = AMDGPU::VCCRegBankID; 3569 BankRHS = AMDGPU::VCCRegBankID; 3570 } else if (BankLHS == AMDGPU::SGPRRegBankID && BankRHS == AMDGPU::SGPRRegBankID) { 3571 TargetBankID = AMDGPU::SGPRRegBankID; 3572 } 3573 } 3574 3575 OpdsMapping[0] = AMDGPU::getValueMapping(TargetBankID, Size); 3576 OpdsMapping[1] = AMDGPU::getValueMapping(BankLHS, Size); 3577 OpdsMapping[2] = AMDGPU::getValueMapping(BankRHS, Size); 3578 break; 3579 } 3580 3581 if (Size == 64) { 3582 3583 if (isSALUMapping(MI)) { 3584 OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::SGPRRegBankID, Size); 3585 OpdsMapping[1] = OpdsMapping[2] = OpdsMapping[0]; 3586 } else { 3587 OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size); 3588 unsigned Bank1 = getRegBankID(MI.getOperand(1).getReg(), MRI /*, DefaultBankID*/); 3589 OpdsMapping[1] = AMDGPU::getValueMapping(Bank1, Size); 3590 3591 unsigned Bank2 = getRegBankID(MI.getOperand(2).getReg(), MRI /*, DefaultBankID*/); 3592 OpdsMapping[2] = AMDGPU::getValueMapping(Bank2, Size); 3593 } 3594 3595 break; 3596 } 3597 3598 LLVM_FALLTHROUGH; 3599 } 3600 case AMDGPU::G_PTR_ADD: 3601 case AMDGPU::G_PTRMASK: 3602 case AMDGPU::G_ADD: 3603 case AMDGPU::G_SUB: 3604 case AMDGPU::G_MUL: 3605 case AMDGPU::G_SHL: 3606 case AMDGPU::G_LSHR: 3607 case AMDGPU::G_ASHR: 3608 case AMDGPU::G_UADDO: 3609 case AMDGPU::G_USUBO: 3610 case AMDGPU::G_UADDE: 3611 case AMDGPU::G_SADDE: 3612 case AMDGPU::G_USUBE: 3613 case AMDGPU::G_SSUBE: 3614 case AMDGPU::G_SMIN: 3615 case AMDGPU::G_SMAX: 3616 case AMDGPU::G_UMIN: 3617 case AMDGPU::G_UMAX: 3618 case AMDGPU::G_ABS: 3619 case AMDGPU::G_SHUFFLE_VECTOR: 3620 case AMDGPU::G_SBFX: 3621 case AMDGPU::G_UBFX: 3622 if (isSALUMapping(MI)) 3623 return getDefaultMappingSOP(MI); 3624 LLVM_FALLTHROUGH; 3625 3626 case AMDGPU::G_SADDSAT: // FIXME: Could lower sat ops for SALU 3627 case AMDGPU::G_SSUBSAT: 3628 case AMDGPU::G_UADDSAT: 3629 case AMDGPU::G_USUBSAT: 3630 case AMDGPU::G_FADD: 3631 case AMDGPU::G_FSUB: 3632 case AMDGPU::G_FPTOSI: 3633 case AMDGPU::G_FPTOUI: 3634 case AMDGPU::G_FMUL: 3635 case AMDGPU::G_FMA: 3636 case AMDGPU::G_FMAD: 3637 case AMDGPU::G_FSQRT: 3638 case AMDGPU::G_FFLOOR: 3639 case AMDGPU::G_FCEIL: 3640 case AMDGPU::G_FRINT: 3641 case AMDGPU::G_SITOFP: 3642 case AMDGPU::G_UITOFP: 3643 case AMDGPU::G_FPTRUNC: 3644 case AMDGPU::G_FPEXT: 3645 case AMDGPU::G_FEXP2: 3646 case AMDGPU::G_FLOG2: 3647 case AMDGPU::G_FMINNUM: 3648 case AMDGPU::G_FMAXNUM: 3649 case AMDGPU::G_FMINNUM_IEEE: 3650 case AMDGPU::G_FMAXNUM_IEEE: 3651 case AMDGPU::G_FCANONICALIZE: 3652 case AMDGPU::G_INTRINSIC_TRUNC: 3653 case AMDGPU::G_BSWAP: // TODO: Somehow expand for scalar? 
3654 case AMDGPU::G_FSHR: // TODO: Expand for scalar 3655 case AMDGPU::G_AMDGPU_FMIN_LEGACY: 3656 case AMDGPU::G_AMDGPU_FMAX_LEGACY: 3657 case AMDGPU::G_AMDGPU_RCP_IFLAG: 3658 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0: 3659 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1: 3660 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2: 3661 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3: 3662 case AMDGPU::G_AMDGPU_CVT_PK_I16_I32: 3663 case AMDGPU::G_AMDGPU_SMED3: 3664 return getDefaultMappingVOP(MI); 3665 case AMDGPU::G_UMULH: 3666 case AMDGPU::G_SMULH: { 3667 if (Subtarget.hasScalarMulHiInsts() && isSALUMapping(MI)) 3668 return getDefaultMappingSOP(MI); 3669 return getDefaultMappingVOP(MI); 3670 } 3671 case AMDGPU::G_IMPLICIT_DEF: { 3672 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 3673 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); 3674 break; 3675 } 3676 case AMDGPU::G_FCONSTANT: 3677 case AMDGPU::G_CONSTANT: 3678 case AMDGPU::G_GLOBAL_VALUE: 3679 case AMDGPU::G_BLOCK_ADDR: 3680 case AMDGPU::G_READCYCLECOUNTER: { 3681 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 3682 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); 3683 break; 3684 } 3685 case AMDGPU::G_FRAME_INDEX: { 3686 // TODO: This should be the same as other constants, but eliminateFrameIndex 3687 // currently assumes VALU uses. 3688 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 3689 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); 3690 break; 3691 } 3692 case AMDGPU::G_DYN_STACKALLOC: { 3693 // Result is always uniform, and a wave reduction is needed for the source. 3694 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32); 3695 unsigned SrcBankID = getRegBankID(MI.getOperand(1).getReg(), MRI); 3696 OpdsMapping[1] = AMDGPU::getValueMapping(SrcBankID, 32); 3697 break; 3698 } 3699 case AMDGPU::G_AMDGPU_WAVE_ADDRESS: { 3700 // This case is weird because we expect a physical register in the source, 3701 // but need to set a bank anyway. 3702 // 3703 // We could select the result to SGPR or VGPR, but for the one current use 3704 // it's more practical to always use VGPR. 
3705 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); 3706 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32); 3707 break; 3708 } 3709 case AMDGPU::G_INSERT: { 3710 unsigned BankID = getMappingType(MRI, MI); 3711 unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); 3712 unsigned SrcSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI); 3713 unsigned EltSize = getSizeInBits(MI.getOperand(2).getReg(), MRI, *TRI); 3714 OpdsMapping[0] = AMDGPU::getValueMapping(BankID, DstSize); 3715 OpdsMapping[1] = AMDGPU::getValueMapping(BankID, SrcSize); 3716 OpdsMapping[2] = AMDGPU::getValueMapping(BankID, EltSize); 3717 OpdsMapping[3] = nullptr; 3718 break; 3719 } 3720 case AMDGPU::G_EXTRACT: { 3721 unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI); 3722 unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); 3723 unsigned SrcSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI); 3724 OpdsMapping[0] = AMDGPU::getValueMapping(BankID, DstSize); 3725 OpdsMapping[1] = AMDGPU::getValueMapping(BankID, SrcSize); 3726 OpdsMapping[2] = nullptr; 3727 break; 3728 } 3729 case AMDGPU::G_BUILD_VECTOR: 3730 case AMDGPU::G_BUILD_VECTOR_TRUNC: { 3731 LLT DstTy = MRI.getType(MI.getOperand(0).getReg()); 3732 if (DstTy == LLT::fixed_vector(2, 16)) { 3733 unsigned DstSize = DstTy.getSizeInBits(); 3734 unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits(); 3735 unsigned Src0BankID = getRegBankID(MI.getOperand(1).getReg(), MRI); 3736 unsigned Src1BankID = getRegBankID(MI.getOperand(2).getReg(), MRI); 3737 unsigned DstBankID = regBankUnion(Src0BankID, Src1BankID); 3738 3739 OpdsMapping[0] = AMDGPU::getValueMapping(DstBankID, DstSize); 3740 OpdsMapping[1] = AMDGPU::getValueMapping(Src0BankID, SrcSize); 3741 OpdsMapping[2] = AMDGPU::getValueMapping(Src1BankID, SrcSize); 3742 break; 3743 } 3744 3745 LLVM_FALLTHROUGH; 3746 } 3747 case AMDGPU::G_MERGE_VALUES: 3748 case AMDGPU::G_CONCAT_VECTORS: { 3749 unsigned Bank = getMappingType(MRI, MI); 3750 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 3751 unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits(); 3752 3753 OpdsMapping[0] = AMDGPU::getValueMapping(Bank, DstSize); 3754 // Op1 and Dst should use the same register bank. 
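// E.g. (illustrative) merging two SGPR halves stays scalar:
//   %2:sgpr(s64) = G_MERGE_VALUES %0:sgpr(s32), %1:sgpr(s32)
// whereas any VGPR input makes getMappingType() above report the VGPR bank
// for every operand.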
3755 for (unsigned i = 1, e = MI.getNumOperands(); i != e; ++i) 3756 OpdsMapping[i] = AMDGPU::getValueMapping(Bank, SrcSize); 3757 break; 3758 } 3759 case AMDGPU::G_BITREVERSE: 3760 case AMDGPU::G_BITCAST: 3761 case AMDGPU::G_INTTOPTR: 3762 case AMDGPU::G_PTRTOINT: 3763 case AMDGPU::G_FABS: 3764 case AMDGPU::G_FNEG: { 3765 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 3766 unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI); 3767 OpdsMapping[0] = OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size); 3768 break; 3769 } 3770 case AMDGPU::G_AMDGPU_FFBH_U32: 3771 case AMDGPU::G_AMDGPU_FFBL_B32: 3772 case AMDGPU::G_CTLZ_ZERO_UNDEF: 3773 case AMDGPU::G_CTTZ_ZERO_UNDEF: { 3774 unsigned Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits(); 3775 unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI); 3776 OpdsMapping[0] = AMDGPU::getValueMapping(BankID, 32); 3777 OpdsMapping[1] = AMDGPU::getValueMappingSGPR64Only(BankID, Size); 3778 break; 3779 } 3780 case AMDGPU::G_CTPOP: { 3781 unsigned Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits(); 3782 unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI); 3783 OpdsMapping[0] = AMDGPU::getValueMapping(BankID, 32); 3784 3785 // This should really be getValueMappingSGPR64Only, but allowing the generic 3786 // code to handle the register split just makes using LegalizerHelper more 3787 // difficult. 3788 OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size); 3789 break; 3790 } 3791 case AMDGPU::G_TRUNC: { 3792 Register Dst = MI.getOperand(0).getReg(); 3793 Register Src = MI.getOperand(1).getReg(); 3794 unsigned Bank = getRegBankID(Src, MRI); 3795 unsigned DstSize = getSizeInBits(Dst, MRI, *TRI); 3796 unsigned SrcSize = getSizeInBits(Src, MRI, *TRI); 3797 OpdsMapping[0] = AMDGPU::getValueMapping(Bank, DstSize); 3798 OpdsMapping[1] = AMDGPU::getValueMapping(Bank, SrcSize); 3799 break; 3800 } 3801 case AMDGPU::G_ZEXT: 3802 case AMDGPU::G_SEXT: 3803 case AMDGPU::G_ANYEXT: 3804 case AMDGPU::G_SEXT_INREG: { 3805 Register Dst = MI.getOperand(0).getReg(); 3806 Register Src = MI.getOperand(1).getReg(); 3807 unsigned DstSize = getSizeInBits(Dst, MRI, *TRI); 3808 unsigned SrcSize = getSizeInBits(Src, MRI, *TRI); 3809 3810 unsigned DstBank; 3811 const RegisterBank *SrcBank = getRegBank(Src, MRI, *TRI); 3812 assert(SrcBank); 3813 switch (SrcBank->getID()) { 3814 case AMDGPU::SGPRRegBankID: 3815 DstBank = AMDGPU::SGPRRegBankID; 3816 break; 3817 default: 3818 DstBank = AMDGPU::VGPRRegBankID; 3819 break; 3820 } 3821 3822 // Scalar extend can use 64-bit BFE, but VGPRs require extending to 3823 // 32-bits, and then to 64. 3824 OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(DstBank, DstSize); 3825 OpdsMapping[1] = AMDGPU::getValueMappingSGPR64Only(SrcBank->getID(), 3826 SrcSize); 3827 break; 3828 } 3829 case AMDGPU::G_FCMP: { 3830 unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits(); 3831 unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI); 3832 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1); 3833 OpdsMapping[1] = nullptr; // Predicate Operand. 3834 OpdsMapping[2] = AMDGPU::getValueMapping(Op2Bank, Size); 3835 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); 3836 break; 3837 } 3838 case AMDGPU::G_STORE: { 3839 assert(MI.getOperand(0).isReg()); 3840 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 3841 3842 // FIXME: We need to specify a different reg bank once scalar stores are 3843 // supported. 
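// Consequence (illustrative): even a store of a uniform value currently
// reports VGPR data, so a copy is inserted when the source is an SGPR:
//   G_STORE %val:vgpr(s32), %ptr(p1) ; %val copied from an SGPR if needed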
3844 const ValueMapping *ValMapping = 3845 AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); 3846 OpdsMapping[0] = ValMapping; 3847 OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg()); 3848 break; 3849 } 3850 case AMDGPU::G_ICMP: { 3851 auto Pred = static_cast<CmpInst::Predicate>(MI.getOperand(1).getPredicate()); 3852 unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits(); 3853 3854 // See if the result register has already been constrained to vcc, which may 3855 // happen due to control flow intrinsic lowering. 3856 unsigned DstBank = getRegBankID(MI.getOperand(0).getReg(), MRI, 3857 AMDGPU::SGPRRegBankID); 3858 unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI); 3859 unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI); 3860 3861 bool CanUseSCC = DstBank == AMDGPU::SGPRRegBankID && 3862 Op2Bank == AMDGPU::SGPRRegBankID && 3863 Op3Bank == AMDGPU::SGPRRegBankID && 3864 (Size == 32 || (Size == 64 && 3865 (Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE) && 3866 Subtarget.hasScalarCompareEq64())); 3867 3868 DstBank = CanUseSCC ? AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID; 3869 unsigned SrcBank = CanUseSCC ? AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID; 3870 3871 // TODO: Use 32-bit for scalar output size. 3872 // SCC results will need to be copied to a 32-bit SGPR virtual register. 3873 const unsigned ResultSize = 1; 3874 3875 OpdsMapping[0] = AMDGPU::getValueMapping(DstBank, ResultSize); 3876 OpdsMapping[2] = AMDGPU::getValueMapping(SrcBank, Size); 3877 OpdsMapping[3] = AMDGPU::getValueMapping(SrcBank, Size); 3878 break; 3879 } 3880 case AMDGPU::G_EXTRACT_VECTOR_ELT: { 3881 // A VGPR index can be handled with a waterfall loop when indexing an SGPR vector. 3882 unsigned SrcBankID = getRegBankID(MI.getOperand(1).getReg(), MRI); 3883 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 3884 unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits(); 3885 unsigned IdxSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits(); 3886 unsigned IdxBank = getRegBankID(MI.getOperand(2).getReg(), MRI); 3887 unsigned OutputBankID = regBankUnion(SrcBankID, IdxBank); 3888 3889 OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(OutputBankID, DstSize); 3890 OpdsMapping[1] = AMDGPU::getValueMapping(SrcBankID, SrcSize); 3891 3892 // The index can be in either bank if the source vector is VGPR. 3893 OpdsMapping[2] = AMDGPU::getValueMapping(IdxBank, IdxSize); 3894 break; 3895 } 3896 case AMDGPU::G_INSERT_VECTOR_ELT: { 3897 unsigned OutputBankID = isSALUMapping(MI) ? 3898 AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID; 3899 3900 unsigned VecSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 3901 unsigned InsertSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits(); 3902 unsigned IdxSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits(); 3903 unsigned InsertEltBankID = getRegBankID(MI.getOperand(2).getReg(), MRI); 3904 unsigned IdxBankID = getRegBankID(MI.getOperand(3).getReg(), MRI); 3905 3906 OpdsMapping[0] = AMDGPU::getValueMapping(OutputBankID, VecSize); 3907 OpdsMapping[1] = AMDGPU::getValueMapping(OutputBankID, VecSize); 3908 3909 // This is a weird case, because we need to break down the mapping based on 3910 // the register bank of a different operand.
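// Illustrative: inserting an s64 element into a VGPR vector reports the
// element as two 32-bit pieces (getValueMappingSplit64) so selection can
// emit two 32-bit indexed writes; a 32-bit element keeps a plain mapping.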
3911 if (InsertSize == 64 && OutputBankID == AMDGPU::VGPRRegBankID) { 3912 OpdsMapping[2] = AMDGPU::getValueMappingSplit64(InsertEltBankID, 3913 InsertSize); 3914 } else { 3915 assert(InsertSize == 32 || InsertSize == 64); 3916 OpdsMapping[2] = AMDGPU::getValueMapping(InsertEltBankID, InsertSize); 3917 } 3918 3919 // The index can be in either bank if the source vector is VGPR. 3920 OpdsMapping[3] = AMDGPU::getValueMapping(IdxBankID, IdxSize); 3921 break; 3922 } 3923 case AMDGPU::G_UNMERGE_VALUES: { 3924 unsigned Bank = getMappingType(MRI, MI); 3925 3926 // Op1 and Dst should use the same register bank. 3927 // FIXME: Shouldn't this be the default? Why do we need to handle this? 3928 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { 3929 unsigned Size = getSizeInBits(MI.getOperand(i).getReg(), MRI, *TRI); 3930 OpdsMapping[i] = AMDGPU::getValueMapping(Bank, Size); 3931 } 3932 break; 3933 } 3934 case AMDGPU::G_AMDGPU_BUFFER_LOAD: 3935 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE: 3936 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE: 3937 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT: 3938 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT: 3939 case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT: 3940 case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16: 3941 case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT: 3942 case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16: 3943 case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT: 3944 case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16: 3945 case AMDGPU::G_AMDGPU_BUFFER_STORE: 3946 case AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE: 3947 case AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT: 3948 case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT: 3949 case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16: { 3950 OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); 3951 3952 // rsrc 3953 OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI); 3954 3955 // vindex 3956 OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); 3957 3958 // voffset 3959 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); 3960 3961 // soffset 3962 OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); 3963 3964 // Any remaining operands are immediates and were correctly null 3965 // initialized.
3966 break; 3967 } 3968 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP: 3969 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD: 3970 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB: 3971 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN: 3972 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN: 3973 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX: 3974 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX: 3975 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND: 3976 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR: 3977 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR: 3978 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC: 3979 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC: 3980 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD: 3981 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN: 3982 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX: { 3983 // vdata_out 3984 OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); 3985 3986 // vdata_in 3987 OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI); 3988 3989 // rsrc 3990 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); 3991 3992 // vindex 3993 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); 3994 3995 // voffset 3996 OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); 3997 3998 // soffset 3999 OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI); 4000 4001 // Any remaining operands are immediates and were correctly null 4002 // initialized. 4003 break; 4004 } 4005 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP: { 4006 // vdata_out 4007 OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); 4008 4009 // vdata_in 4010 OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI); 4011 4012 // cmp 4013 OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); 4014 4015 // rsrc 4016 OpdsMapping[3] = getSGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); 4017 4018 // vindex 4019 OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); 4020 4021 // voffset 4022 OpdsMapping[5] = getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI); 4023 4024 // soffset 4025 OpdsMapping[6] = getSGPROpMapping(MI.getOperand(6).getReg(), MRI, *TRI); 4026 4027 // Any remaining operands are immediates and were correctly null 4028 // initialized. 4029 break; 4030 } 4031 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD: { 4032 // Lie and claim everything is legal, even though some need to be 4033 // SGPRs. applyMapping will have to deal with it as a waterfall loop. 4034 OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI); 4035 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); 4036 4037 // We need to convert this to a MUBUF if either the resource or offset is 4038 // VGPR.
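// E.g. (illustrative) a divergent soffset decays the result bank:
//   regBankUnion(SGPRRegBankID, VGPRRegBankID) == VGPRRegBankID
// so the result below is reported as VGPR and the load must be selected as
// a buffer (MUBUF) load instead of an SMEM load.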
4039 unsigned RSrcBank = OpdsMapping[1]->BreakDown[0].RegBank->getID(); 4040 unsigned OffsetBank = OpdsMapping[2]->BreakDown[0].RegBank->getID(); 4041 unsigned ResultBank = regBankUnion(RSrcBank, OffsetBank); 4042 4043 unsigned Size0 = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4044 OpdsMapping[0] = AMDGPU::getValueMapping(ResultBank, Size0); 4045 break; 4046 } 4047 case AMDGPU::G_INTRINSIC: { 4048 switch (MI.getIntrinsicID()) { 4049 default: 4050 return getInvalidInstructionMapping(); 4051 case Intrinsic::amdgcn_div_fmas: 4052 case Intrinsic::amdgcn_div_fixup: 4053 case Intrinsic::amdgcn_trig_preop: 4054 case Intrinsic::amdgcn_sin: 4055 case Intrinsic::amdgcn_cos: 4056 case Intrinsic::amdgcn_log_clamp: 4057 case Intrinsic::amdgcn_rcp: 4058 case Intrinsic::amdgcn_rcp_legacy: 4059 case Intrinsic::amdgcn_sqrt: 4060 case Intrinsic::amdgcn_rsq: 4061 case Intrinsic::amdgcn_rsq_legacy: 4062 case Intrinsic::amdgcn_rsq_clamp: 4063 case Intrinsic::amdgcn_fmul_legacy: 4064 case Intrinsic::amdgcn_fma_legacy: 4065 case Intrinsic::amdgcn_ldexp: 4066 case Intrinsic::amdgcn_frexp_mant: 4067 case Intrinsic::amdgcn_frexp_exp: 4068 case Intrinsic::amdgcn_fract: 4069 case Intrinsic::amdgcn_cvt_pkrtz: 4070 case Intrinsic::amdgcn_cvt_pknorm_i16: 4071 case Intrinsic::amdgcn_cvt_pknorm_u16: 4072 case Intrinsic::amdgcn_cvt_pk_i16: 4073 case Intrinsic::amdgcn_cvt_pk_u16: 4074 case Intrinsic::amdgcn_fmed3: 4075 case Intrinsic::amdgcn_cubeid: 4076 case Intrinsic::amdgcn_cubema: 4077 case Intrinsic::amdgcn_cubesc: 4078 case Intrinsic::amdgcn_cubetc: 4079 case Intrinsic::amdgcn_sffbh: 4080 case Intrinsic::amdgcn_fmad_ftz: 4081 case Intrinsic::amdgcn_mbcnt_lo: 4082 case Intrinsic::amdgcn_mbcnt_hi: 4083 case Intrinsic::amdgcn_mul_u24: 4084 case Intrinsic::amdgcn_mul_i24: 4085 case Intrinsic::amdgcn_mulhi_u24: 4086 case Intrinsic::amdgcn_mulhi_i24: 4087 case Intrinsic::amdgcn_lerp: 4088 case Intrinsic::amdgcn_sad_u8: 4089 case Intrinsic::amdgcn_msad_u8: 4090 case Intrinsic::amdgcn_sad_hi_u8: 4091 case Intrinsic::amdgcn_sad_u16: 4092 case Intrinsic::amdgcn_qsad_pk_u16_u8: 4093 case Intrinsic::amdgcn_mqsad_pk_u16_u8: 4094 case Intrinsic::amdgcn_mqsad_u32_u8: 4095 case Intrinsic::amdgcn_cvt_pk_u8_f32: 4096 case Intrinsic::amdgcn_alignbyte: 4097 case Intrinsic::amdgcn_perm: 4098 case Intrinsic::amdgcn_fdot2: 4099 case Intrinsic::amdgcn_sdot2: 4100 case Intrinsic::amdgcn_udot2: 4101 case Intrinsic::amdgcn_sdot4: 4102 case Intrinsic::amdgcn_udot4: 4103 case Intrinsic::amdgcn_sdot8: 4104 case Intrinsic::amdgcn_udot8: 4105 return getDefaultMappingVOP(MI); 4106 case Intrinsic::amdgcn_sbfe: 4107 case Intrinsic::amdgcn_ubfe: 4108 if (isSALUMapping(MI)) 4109 return getDefaultMappingSOP(MI); 4110 return getDefaultMappingVOP(MI); 4111 case Intrinsic::amdgcn_ds_swizzle: 4112 case Intrinsic::amdgcn_ds_permute: 4113 case Intrinsic::amdgcn_ds_bpermute: 4114 case Intrinsic::amdgcn_update_dpp: 4115 case Intrinsic::amdgcn_mov_dpp8: 4116 case Intrinsic::amdgcn_mov_dpp: 4117 case Intrinsic::amdgcn_strict_wwm: 4118 case Intrinsic::amdgcn_wwm: 4119 case Intrinsic::amdgcn_strict_wqm: 4120 case Intrinsic::amdgcn_wqm: 4121 case Intrinsic::amdgcn_softwqm: 4122 case Intrinsic::amdgcn_set_inactive: 4123 return getDefaultMappingAllVGPR(MI); 4124 case Intrinsic::amdgcn_kernarg_segment_ptr: 4125 case Intrinsic::amdgcn_s_getpc: 4126 case Intrinsic::amdgcn_groupstaticsize: 4127 case Intrinsic::amdgcn_reloc_constant: 4128 case Intrinsic::returnaddress: { 4129 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4130 
OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); 4131 break; 4132 } 4133 case Intrinsic::amdgcn_wqm_vote: { 4134 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4135 OpdsMapping[0] = OpdsMapping[2] 4136 = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size); 4137 break; 4138 } 4139 case Intrinsic::amdgcn_ps_live: { 4140 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1); 4141 break; 4142 } 4143 case Intrinsic::amdgcn_div_scale: { 4144 unsigned Dst0Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4145 unsigned Dst1Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits(); 4146 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Dst0Size); 4147 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Dst1Size); 4148 4149 unsigned SrcSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits(); 4150 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize); 4151 OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize); 4152 break; 4153 } 4154 case Intrinsic::amdgcn_class: { 4155 Register Src0Reg = MI.getOperand(2).getReg(); 4156 Register Src1Reg = MI.getOperand(3).getReg(); 4157 unsigned Src0Size = MRI.getType(Src0Reg).getSizeInBits(); 4158 unsigned Src1Size = MRI.getType(Src1Reg).getSizeInBits(); 4159 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4160 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, DstSize); 4161 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Src0Size); 4162 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Src1Size); 4163 break; 4164 } 4165 case Intrinsic::amdgcn_icmp: 4166 case Intrinsic::amdgcn_fcmp: { 4167 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4168 // This is not VCCRegBank because this is not used in boolean contexts. 4169 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize); 4170 unsigned OpSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits(); 4171 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, OpSize); 4172 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, OpSize); 4173 break; 4174 } 4175 case Intrinsic::amdgcn_readlane: { 4176 // This must be an SGPR, but accept a VGPR. 
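// Sketch (hypothetical MIR): a divergent lane index keeps its VGPR bank
// here, and applyMapping is expected to legalize it later, e.g. with a
// waterfall loop / readfirstlane:
//   %idx:vgpr(s32) = ...
//   %out:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readlane), %src, %idx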
4177 Register IdxReg = MI.getOperand(3).getReg(); 4178 unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits(); 4179 unsigned IdxBank = getRegBankID(IdxReg, MRI, AMDGPU::SGPRRegBankID); 4180 OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize); 4181 LLVM_FALLTHROUGH; 4182 } 4183 case Intrinsic::amdgcn_readfirstlane: { 4184 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4185 unsigned SrcSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits(); 4186 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize); 4187 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize); 4188 break; 4189 } 4190 case Intrinsic::amdgcn_writelane: { 4191 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4192 Register SrcReg = MI.getOperand(2).getReg(); 4193 unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits(); 4194 unsigned SrcBank = getRegBankID(SrcReg, MRI, AMDGPU::SGPRRegBankID); 4195 Register IdxReg = MI.getOperand(3).getReg(); 4196 unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits(); 4197 unsigned IdxBank = getRegBankID(IdxReg, MRI, AMDGPU::SGPRRegBankID); 4198 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize); 4199 4200 // These 2 must be SGPRs, but accept VGPRs. Readfirstlane will be inserted 4201 // to legalize. 4202 OpdsMapping[2] = AMDGPU::getValueMapping(SrcBank, SrcSize); 4203 OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize); 4204 OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize); 4205 break; 4206 } 4207 case Intrinsic::amdgcn_if_break: { 4208 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); 4209 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); 4210 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1); 4211 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); 4212 break; 4213 } 4214 case Intrinsic::amdgcn_permlane16: 4215 case Intrinsic::amdgcn_permlanex16: { 4216 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); 4217 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); 4218 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); 4219 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); 4220 OpdsMapping[4] = getSGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); 4221 OpdsMapping[5] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); 4222 break; 4223 } 4224 case Intrinsic::amdgcn_mfma_f32_4x4x1f32: 4225 case Intrinsic::amdgcn_mfma_f32_4x4x4f16: 4226 case Intrinsic::amdgcn_mfma_i32_4x4x4i8: 4227 case Intrinsic::amdgcn_mfma_f32_4x4x2bf16: 4228 case Intrinsic::amdgcn_mfma_f32_16x16x1f32: 4229 case Intrinsic::amdgcn_mfma_f32_16x16x4f32: 4230 case Intrinsic::amdgcn_mfma_f32_16x16x4f16: 4231 case Intrinsic::amdgcn_mfma_f32_16x16x16f16: 4232 case Intrinsic::amdgcn_mfma_i32_16x16x4i8: 4233 case Intrinsic::amdgcn_mfma_i32_16x16x16i8: 4234 case Intrinsic::amdgcn_mfma_f32_16x16x2bf16: 4235 case Intrinsic::amdgcn_mfma_f32_16x16x8bf16: 4236 case Intrinsic::amdgcn_mfma_f32_32x32x1f32: 4237 case Intrinsic::amdgcn_mfma_f32_32x32x2f32: 4238 case Intrinsic::amdgcn_mfma_f32_32x32x4f16: 4239 case Intrinsic::amdgcn_mfma_f32_32x32x8f16: 4240 case Intrinsic::amdgcn_mfma_i32_32x32x4i8: 4241 case Intrinsic::amdgcn_mfma_i32_32x32x8i8: 4242 case Intrinsic::amdgcn_mfma_f32_32x32x2bf16: 4243 case Intrinsic::amdgcn_mfma_f32_32x32x4bf16: 4244 case Intrinsic::amdgcn_mfma_f32_32x32x4bf16_1k: 4245 case 
Intrinsic::amdgcn_mfma_f32_16x16x4bf16_1k: 4246 case Intrinsic::amdgcn_mfma_f32_4x4x4bf16_1k: 4247 case Intrinsic::amdgcn_mfma_f32_32x32x8bf16_1k: 4248 case Intrinsic::amdgcn_mfma_f32_16x16x16bf16_1k: 4249 case Intrinsic::amdgcn_mfma_f64_16x16x4f64: 4250 case Intrinsic::amdgcn_mfma_f64_4x4x4f64: { 4251 // Default for MAI intrinsics. 4252 // srcC can also be an immediate which can be folded later. 4253 // FIXME: Should we eventually add an alternative mapping with AGPR src 4254 // for srcA/srcB? 4255 // 4256 // vdst, srcA, srcB, srcC 4257 OpdsMapping[0] = getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); 4258 OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); 4259 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); 4260 OpdsMapping[4] = getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); 4261 break; 4262 } 4263 case Intrinsic::amdgcn_interp_p1: 4264 case Intrinsic::amdgcn_interp_p2: 4265 case Intrinsic::amdgcn_interp_mov: 4266 case Intrinsic::amdgcn_interp_p1_f16: 4267 case Intrinsic::amdgcn_interp_p2_f16: { 4268 const int M0Idx = MI.getNumOperands() - 1; 4269 Register M0Reg = MI.getOperand(M0Idx).getReg(); 4270 unsigned M0Bank = getRegBankID(M0Reg, MRI, AMDGPU::SGPRRegBankID); 4271 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4272 4273 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize); 4274 for (int I = 2; I != M0Idx && MI.getOperand(I).isReg(); ++I) 4275 OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); 4276 4277 // Must be SGPR, but we must take whatever the original bank is and fix it 4278 // later. 4279 OpdsMapping[M0Idx] = AMDGPU::getValueMapping(M0Bank, 32); 4280 break; 4281 } 4282 case Intrinsic::amdgcn_ballot: { 4283 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4284 unsigned SrcSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits(); 4285 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize); 4286 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, SrcSize); 4287 break; 4288 } 4289 } 4290 break; 4291 } 4292 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD: 4293 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16: 4294 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE: 4295 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16: { 4296 auto IntrID = MI.getIntrinsicID(); 4297 const AMDGPU::RsrcIntrinsic *RSrcIntrin = AMDGPU::lookupRsrcIntrinsic(IntrID); 4298 assert(RSrcIntrin && "missing RsrcIntrinsic for image intrinsic"); 4299 // Non-images can have complications from operands that allow both SGPR 4300 // and VGPR. For now it's too complicated to figure out the final opcode 4301 // to derive the register bank from the MCInstrDesc. 
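// Images are regular enough to handle generically (illustrative): vaddr and
// vdata operands map to VGPR, while the rsrc (and the sampler immediately
// following it, when present) are the must-be-SGPR operands reported by
// getImageMapping().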
4302 assert(RSrcIntrin->IsImage); 4303 return getImageMapping(MRI, MI, RSrcIntrin->RsrcArg); 4304 } 4305 case AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY: { 4306 unsigned N = MI.getNumExplicitOperands() - 2; 4307 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 128); 4308 OpdsMapping[N] = getSGPROpMapping(MI.getOperand(N).getReg(), MRI, *TRI); 4309 if (N == 3) { 4310 // Sequential form: all operands combined into VGPR256/VGPR512 4311 unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits(); 4312 if (Size > 256) 4313 Size = 512; 4314 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); 4315 } else { 4316 // NSA form 4317 for (unsigned I = 2; I < N; ++I) 4318 OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); 4319 } 4320 break; 4321 } 4322 case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: { 4323 auto IntrID = MI.getIntrinsicID(); 4324 switch (IntrID) { 4325 case Intrinsic::amdgcn_s_getreg: 4326 case Intrinsic::amdgcn_s_memtime: 4327 case Intrinsic::amdgcn_s_memrealtime: 4328 case Intrinsic::amdgcn_s_get_waveid_in_workgroup: { 4329 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4330 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); 4331 break; 4332 } 4333 case Intrinsic::amdgcn_global_atomic_fadd: 4334 case Intrinsic::amdgcn_global_atomic_csub: 4335 case Intrinsic::amdgcn_global_atomic_fmin: 4336 case Intrinsic::amdgcn_global_atomic_fmax: 4337 case Intrinsic::amdgcn_flat_atomic_fadd: 4338 case Intrinsic::amdgcn_flat_atomic_fmin: 4339 case Intrinsic::amdgcn_flat_atomic_fmax: 4340 return getDefaultMappingAllVGPR(MI); 4341 case Intrinsic::amdgcn_ds_ordered_add: 4342 case Intrinsic::amdgcn_ds_ordered_swap: { 4343 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4344 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize); 4345 unsigned M0Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, 4346 AMDGPU::SGPRRegBankID); 4347 OpdsMapping[2] = AMDGPU::getValueMapping(M0Bank, 32); 4348 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); 4349 break; 4350 } 4351 case Intrinsic::amdgcn_ds_append: 4352 case Intrinsic::amdgcn_ds_consume: { 4353 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4354 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize); 4355 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); 4356 break; 4357 } 4358 case Intrinsic::amdgcn_exp_compr: 4359 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); 4360 OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); 4361 break; 4362 case Intrinsic::amdgcn_exp: 4363 // FIXME: Could we support packed types here? 4364 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); 4365 OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); 4366 OpdsMapping[5] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); 4367 OpdsMapping[6] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); 4368 break; 4369 case Intrinsic::amdgcn_s_sendmsg: 4370 case Intrinsic::amdgcn_s_sendmsghalt: { 4371 // This must be an SGPR, but accept a VGPR. 4372 unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, 4373 AMDGPU::SGPRRegBankID); 4374 OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32); 4375 break; 4376 } 4377 case Intrinsic::amdgcn_s_setreg: { 4378 // This must be an SGPR, but accept a VGPR. 
4379 unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, 4380 AMDGPU::SGPRRegBankID); 4381 OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32); 4382 break; 4383 } 4384 case Intrinsic::amdgcn_end_cf: { 4385 unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI); 4386 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); 4387 break; 4388 } 4389 case Intrinsic::amdgcn_else: { 4390 unsigned WaveSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI); 4391 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1); 4392 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize); 4393 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize); 4394 break; 4395 } 4396 case Intrinsic::amdgcn_live_mask: { 4397 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1); 4398 break; 4399 } 4400 case Intrinsic::amdgcn_wqm_demote: 4401 case Intrinsic::amdgcn_kill: { 4402 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1); 4403 break; 4404 } 4405 case Intrinsic::amdgcn_raw_buffer_load: 4406 case Intrinsic::amdgcn_raw_tbuffer_load: { 4407 // FIXME: Should make intrinsic ID the last operand of the instruction, 4408 // then this would be the same as store 4409 OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); 4410 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); 4411 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); 4412 OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); 4413 break; 4414 } 4415 case Intrinsic::amdgcn_raw_buffer_store: 4416 case Intrinsic::amdgcn_raw_buffer_store_format: 4417 case Intrinsic::amdgcn_raw_tbuffer_store: { 4418 OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI); 4419 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); 4420 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); 4421 OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); 4422 break; 4423 } 4424 case Intrinsic::amdgcn_struct_buffer_load: 4425 case Intrinsic::amdgcn_struct_tbuffer_load: { 4426 OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); 4427 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); 4428 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); 4429 OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); 4430 OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI); 4431 break; 4432 } 4433 case Intrinsic::amdgcn_struct_buffer_store: 4434 case Intrinsic::amdgcn_struct_tbuffer_store: { 4435 OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI); 4436 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); 4437 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); 4438 OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); 4439 OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI); 4440 break; 4441 } 4442 case Intrinsic::amdgcn_init_exec_from_input: { 4443 unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI); 4444 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); 4445 break; 4446 } 4447 case Intrinsic::amdgcn_ds_gws_init: 4448 case Intrinsic::amdgcn_ds_gws_barrier: 4449 case Intrinsic::amdgcn_ds_gws_sema_br: { 4450 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); 4451 4452 // This must be an SGPR, but 
accept a VGPR. 4453 unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, 4454 AMDGPU::SGPRRegBankID); 4455 OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32); 4456 break; 4457 } 4458 case Intrinsic::amdgcn_ds_gws_sema_v: 4459 case Intrinsic::amdgcn_ds_gws_sema_p: 4460 case Intrinsic::amdgcn_ds_gws_sema_release_all: { 4461 // This must be an SGPR, but accept a VGPR. 4462 unsigned Bank = getRegBankID(MI.getOperand(1).getReg(), MRI, 4463 AMDGPU::SGPRRegBankID); 4464 OpdsMapping[1] = AMDGPU::getValueMapping(Bank, 32); 4465 break; 4466 } 4467 default: 4468 return getInvalidInstructionMapping(); 4469 } 4470 break; 4471 } 4472 case AMDGPU::G_SELECT: { 4473 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4474 unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, 4475 AMDGPU::SGPRRegBankID); 4476 unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI, 4477 AMDGPU::SGPRRegBankID); 4478 bool SGPRSrcs = Op2Bank == AMDGPU::SGPRRegBankID && 4479 Op3Bank == AMDGPU::SGPRRegBankID; 4480 4481 unsigned CondBankDefault = SGPRSrcs ? 4482 AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID; 4483 unsigned CondBank = getRegBankID(MI.getOperand(1).getReg(), MRI, 4484 CondBankDefault); 4485 if (CondBank == AMDGPU::SGPRRegBankID) 4486 CondBank = SGPRSrcs ? AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID; 4487 else if (CondBank == AMDGPU::VGPRRegBankID) 4488 CondBank = AMDGPU::VCCRegBankID; 4489 4490 unsigned Bank = SGPRSrcs && CondBank == AMDGPU::SGPRRegBankID ? 4491 AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID; 4492 4493 assert(CondBank == AMDGPU::VCCRegBankID || CondBank == AMDGPU::SGPRRegBankID); 4494 4495 // TODO: Should report 32-bit for scalar condition type. 4496 if (Size == 64) { 4497 OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(Bank, Size); 4498 OpdsMapping[1] = AMDGPU::getValueMapping(CondBank, 1); 4499 OpdsMapping[2] = AMDGPU::getValueMappingSGPR64Only(Bank, Size); 4500 OpdsMapping[3] = AMDGPU::getValueMappingSGPR64Only(Bank, Size); 4501 } else { 4502 OpdsMapping[0] = AMDGPU::getValueMapping(Bank, Size); 4503 OpdsMapping[1] = AMDGPU::getValueMapping(CondBank, 1); 4504 OpdsMapping[2] = AMDGPU::getValueMapping(Bank, Size); 4505 OpdsMapping[3] = AMDGPU::getValueMapping(Bank, Size); 4506 } 4507 4508 break; 4509 } 4510 4511 case AMDGPU::G_SI_CALL: { 4512 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 64); 4513 // Lie and claim everything is legal, even though some need to be 4514 // SGPRs. applyMapping will have to deal with it as a waterfall loop. 
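// E.g. (illustrative) an indirect call through a divergent function pointer
// is reported with its VGPR bank here; the G_SI_CALL arm of applyMappingImpl
// then wraps the call in a waterfall loop that makes the pointer uniform
// with readfirstlane.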
4515 OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI); 4516 4517 // Allow anything for implicit arguments 4518 for (unsigned I = 4; I < MI.getNumOperands(); ++I) { 4519 if (MI.getOperand(I).isReg()) { 4520 Register Reg = MI.getOperand(I).getReg(); 4521 auto OpBank = getRegBankID(Reg, MRI); 4522 unsigned Size = getSizeInBits(Reg, MRI, *TRI); 4523 OpdsMapping[I] = AMDGPU::getValueMapping(OpBank, Size); 4524 } 4525 } 4526 break; 4527 } 4528 case AMDGPU::G_LOAD: 4529 case AMDGPU::G_ZEXTLOAD: 4530 case AMDGPU::G_SEXTLOAD: 4531 return getInstrMappingForLoad(MI); 4532 4533 case AMDGPU::G_ATOMICRMW_XCHG: 4534 case AMDGPU::G_ATOMICRMW_ADD: 4535 case AMDGPU::G_ATOMICRMW_SUB: 4536 case AMDGPU::G_ATOMICRMW_AND: 4537 case AMDGPU::G_ATOMICRMW_OR: 4538 case AMDGPU::G_ATOMICRMW_XOR: 4539 case AMDGPU::G_ATOMICRMW_MAX: 4540 case AMDGPU::G_ATOMICRMW_MIN: 4541 case AMDGPU::G_ATOMICRMW_UMAX: 4542 case AMDGPU::G_ATOMICRMW_UMIN: 4543 case AMDGPU::G_ATOMICRMW_FADD: 4544 case AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG: 4545 case AMDGPU::G_AMDGPU_ATOMIC_INC: 4546 case AMDGPU::G_AMDGPU_ATOMIC_DEC: 4547 case AMDGPU::G_AMDGPU_ATOMIC_FMIN: 4548 case AMDGPU::G_AMDGPU_ATOMIC_FMAX: { 4549 OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); 4550 OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg()); 4551 OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); 4552 break; 4553 } 4554 case AMDGPU::G_ATOMIC_CMPXCHG: { 4555 OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); 4556 OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg()); 4557 OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); 4558 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); 4559 break; 4560 } 4561 case AMDGPU::G_BRCOND: { 4562 unsigned Bank = getRegBankID(MI.getOperand(0).getReg(), MRI, 4563 AMDGPU::SGPRRegBankID); 4564 assert(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1); 4565 if (Bank != AMDGPU::SGPRRegBankID) 4566 Bank = AMDGPU::VCCRegBankID; 4567 4568 OpdsMapping[0] = AMDGPU::getValueMapping(Bank, 1); 4569 break; 4570 } 4571 } 4572 4573 return getInstructionMapping(/*ID*/1, /*Cost*/1, 4574 getOperandsMapping(OpdsMapping), 4575 MI.getNumOperands()); 4576 } 4577