//===- AMDGPURegisterBankInfo.cpp -------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the RegisterBankInfo class for
/// AMDGPU.
///
/// \par
///
/// AMDGPU has unique register bank constraints that require special high level
/// strategies to deal with. There are two main true physical register banks
/// VGPR (vector), and SGPR (scalar). Additionally the VCC register bank is a
/// sort of pseudo-register bank needed to represent SGPRs used in a vector
/// boolean context. There is also the AGPR bank, which is a special purpose
/// physical register bank present on some subtargets.
///
/// Copying from VGPR to SGPR is generally illegal, unless the value is known to
/// be uniform. It is generally not valid to legalize operands by inserting
/// copies as on other targets. Operations which require uniform, SGPR operands
/// generally require scalarization by repeatedly executing the instruction,
/// activating each set of lanes using a unique set of input values. This is
/// referred to as a waterfall loop.
///
/// \par Booleans
///
/// Booleans (s1 values) require special consideration. A vector compare result
/// is naturally a bitmask with one bit per lane, in a 32 or 64-bit
/// register. These are represented with the VCC bank. During selection, we need
/// to be able to unambiguously go back from a register class to a register
/// bank. To distinguish whether an SGPR should use the SGPR or VCC register
/// bank, we need to know the use context type. An SGPR s1 value always means a
/// VCC bank value, otherwise it will be the SGPR bank. A scalar compare sets
/// SCC, which is a 1-bit unaddressable register. This will need to be copied to
/// a 32-bit virtual register. Taken together, this means we need to adjust the
/// type of boolean operations to be regbank legal. All SALU booleans need to be
/// widened to 32-bits, and all VALU booleans need to be s1 values.
///
/// A noteworthy exception to the s1-means-vcc rule is for legalization artifact
/// casts. G_TRUNC s1 results, and G_SEXT/G_ZEXT/G_ANYEXT sources are never vcc
/// bank. A non-boolean source (such as a truncate from a 1-bit load from
/// memory) will require a copy to the VCC bank which will require clearing the
/// high bits and inserting a compare.
///
/// \par Constant bus restriction
///
/// VALU instructions have a limitation known as the constant bus
/// restriction. Most VALU instructions can use SGPR operands, but may read at
/// most 1 SGPR or constant literal value (this is raised to 2 on gfx10 for most
/// instructions). This is one unique SGPR, so the same SGPR may be used for
/// multiple operands. From a register bank perspective, any combination of
/// operands should be legal as an SGPR, but this is contextually dependent on
/// the SGPR operands all being the same register. It is therefore optimal to
/// choose the SGPR with the most uses to minimize the number of copies.
///
/// We avoid trying to solve this problem in RegBankSelect.
/// Any VALU G_* operation should have its source operands all mapped to VGPRs
/// (except for VCC), inserting copies from any SGPR operands. This is the most
/// trivial legal mapping. Anything beyond the simplest 1:1 instruction
/// selection would be too complicated to solve here. Every optimization pattern
/// or instruction selected to multiple outputs would have to enforce this rule,
/// and there would be additional complexity in tracking this rule for every G_*
/// operation. By forcing all inputs to VGPRs, it also simplifies the task of
/// picking the optimal operand combination from a post-isel optimization pass.
///
//===----------------------------------------------------------------------===//

#include "AMDGPURegisterBankInfo.h"

#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUInstrInfo.h"
#include "GCNSubtarget.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/RegisterBank.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"

#define GET_TARGET_REGBANK_IMPL
#include "AMDGPUGenRegisterBank.inc"

// This file will be TableGen'ed at some point.
#include "AMDGPUGenRegisterBankInfo.def"

using namespace llvm;
using namespace MIPatternMatch;

namespace {

// Observer to apply a register bank to new registers created by
// LegalizerHelper.
class ApplyRegBankMapping final : public GISelChangeObserver {
private:
  const AMDGPURegisterBankInfo &RBI;
  MachineRegisterInfo &MRI;
  const RegisterBank *NewBank;
  SmallVector<MachineInstr *, 4> NewInsts;

public:
  ApplyRegBankMapping(const AMDGPURegisterBankInfo &RBI_,
                      MachineRegisterInfo &MRI_, const RegisterBank *RB)
      : RBI(RBI_), MRI(MRI_), NewBank(RB) {}

  ~ApplyRegBankMapping() {
    for (MachineInstr *MI : NewInsts)
      applyBank(*MI);
  }

  /// Set any registers that don't have a set register class or bank to SALU.
  void applyBank(MachineInstr &MI) {
    const unsigned Opc = MI.getOpcode();
    if (Opc == AMDGPU::G_ANYEXT || Opc == AMDGPU::G_ZEXT ||
        Opc == AMDGPU::G_SEXT) {
      // LegalizerHelper wants to use the basic legalization artifacts when
      // widening etc. We don't handle selection with vcc in artifact sources,
      // so we need to use a select instead to handle these properly.
      Register DstReg = MI.getOperand(0).getReg();
      Register SrcReg = MI.getOperand(1).getReg();
      const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, MRI, *RBI.TRI);
      if (SrcBank == &AMDGPU::VCCRegBank) {
        const LLT S32 = LLT::scalar(32);
        assert(MRI.getType(SrcReg) == LLT::scalar(1));
        assert(MRI.getType(DstReg) == S32);
        assert(NewBank == &AMDGPU::VGPRRegBank);

        // Replace the extension with a select, which really uses the boolean
        // source.
        MachineIRBuilder B(MI);
        auto True = B.buildConstant(S32, Opc == AMDGPU::G_SEXT ? -1 : 1);
        auto False = B.buildConstant(S32, 0);
        B.buildSelect(DstReg, SrcReg, True, False);
        MRI.setRegBank(True.getReg(0), *NewBank);
        MRI.setRegBank(False.getReg(0), *NewBank);
        MI.eraseFromParent();
      }

      assert(!MRI.getRegClassOrRegBank(DstReg));
      MRI.setRegBank(DstReg, *NewBank);
      return;
    }

#ifndef NDEBUG
    if (Opc == AMDGPU::G_TRUNC) {
      Register DstReg = MI.getOperand(0).getReg();
      const RegisterBank *DstBank = RBI.getRegBank(DstReg, MRI, *RBI.TRI);
      assert(DstBank != &AMDGPU::VCCRegBank);
    }
#endif

    for (MachineOperand &Op : MI.operands()) {
      if (!Op.isReg())
        continue;

      // We may see physical registers if building a real MI
      Register Reg = Op.getReg();
      if (Reg.isPhysical() || MRI.getRegClassOrRegBank(Reg))
        continue;

      const RegisterBank *RB = NewBank;
      if (MRI.getType(Reg) == LLT::scalar(1)) {
        assert(NewBank == &AMDGPU::VGPRRegBank &&
               "s1 operands should only be used for vector bools");
        assert((MI.getOpcode() != AMDGPU::G_TRUNC &&
                MI.getOpcode() != AMDGPU::G_ANYEXT) &&
               "not expecting legalization artifacts here");
        RB = &AMDGPU::VCCRegBank;
      }

      MRI.setRegBank(Reg, *RB);
    }
  }

  void erasingInstr(MachineInstr &MI) override {}

  void createdInstr(MachineInstr &MI) override {
    // At this point, the instruction was just inserted and has no operands.
    NewInsts.push_back(&MI);
  }

  void changingInstr(MachineInstr &MI) override {}
  void changedInstr(MachineInstr &MI) override {
    // FIXME: In principle we should probably add the instruction to NewInsts,
    // but the way the LegalizerHelper uses the observer, we will always see the
    // registers we need to set the regbank on also referenced in a new
    // instruction.
  }
};

}

AMDGPURegisterBankInfo::AMDGPURegisterBankInfo(const GCNSubtarget &ST)
    : AMDGPUGenRegisterBankInfo(),
      Subtarget(ST),
      TRI(Subtarget.getRegisterInfo()),
      TII(Subtarget.getInstrInfo()) {

  // HACK: Until this is fully tablegen'd.
  static llvm::once_flag InitializeRegisterBankFlag;

  static auto InitializeRegisterBankOnce = [this]() {
    assert(&getRegBank(AMDGPU::SGPRRegBankID) == &AMDGPU::SGPRRegBank &&
           &getRegBank(AMDGPU::VGPRRegBankID) == &AMDGPU::VGPRRegBank &&
           &getRegBank(AMDGPU::AGPRRegBankID) == &AMDGPU::AGPRRegBank);
    (void)this;
  };

  llvm::call_once(InitializeRegisterBankFlag, InitializeRegisterBankOnce);
}

static bool isVectorRegisterBank(const RegisterBank &Bank) {
  unsigned BankID = Bank.getID();
  return BankID == AMDGPU::VGPRRegBankID || BankID == AMDGPU::AGPRRegBankID;
}

unsigned AMDGPURegisterBankInfo::copyCost(const RegisterBank &Dst,
                                          const RegisterBank &Src,
                                          unsigned Size) const {
  // TODO: Should there be a UniformVGPRRegBank which can use readfirstlane?
  if (Dst.getID() == AMDGPU::SGPRRegBankID &&
      (isVectorRegisterBank(Src) || Src.getID() == AMDGPU::VCCRegBankID)) {
    return std::numeric_limits<unsigned>::max();
  }

  // Bool values are tricky, because the meaning is based on context. The SCC
  // and VCC banks are for the natural scalar and vector conditions produced by
  // a compare.
  //
  // Legalization doesn't know about the necessary context, so an s1 use may
  // have been a truncate from an arbitrary value, in which case a copy (lowered
  // as a compare with 0) needs to be inserted.
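  // As an illustrative sketch (not necessarily the exact code that gets
  // emitted), turning an arbitrary s1 SGPR value into a VCC-bank value looks
  // roughly like:
  //   s_and_b32 s0, s0, 1          ; clear the high bits
  //   v_cmp_ne_u32_e64 vcc, s0, 0  ; broadcast the boolean to all lanes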
  if (Size == 1 &&
      (Dst.getID() == AMDGPU::SGPRRegBankID) &&
      (isVectorRegisterBank(Src) ||
       Src.getID() == AMDGPU::SGPRRegBankID ||
       Src.getID() == AMDGPU::VCCRegBankID))
    return std::numeric_limits<unsigned>::max();

  // There is no direct copy between AGPRs.
  if (Dst.getID() == AMDGPU::AGPRRegBankID &&
      Src.getID() == AMDGPU::AGPRRegBankID)
    return 4;

  return RegisterBankInfo::copyCost(Dst, Src, Size);
}

unsigned AMDGPURegisterBankInfo::getBreakDownCost(
  const ValueMapping &ValMapping,
  const RegisterBank *CurBank) const {
  // Check if this is a breakdown for G_LOAD to move the pointer from SGPR to
  // VGPR.
  // FIXME: Is there a better way to do this?
  if (ValMapping.NumBreakDowns >= 2 || ValMapping.BreakDown[0].Length >= 64)
    return 10; // This is expensive.

  assert(ValMapping.NumBreakDowns == 2 &&
         ValMapping.BreakDown[0].Length == 32 &&
         ValMapping.BreakDown[0].StartIdx == 0 &&
         ValMapping.BreakDown[1].Length == 32 &&
         ValMapping.BreakDown[1].StartIdx == 32 &&
         ValMapping.BreakDown[0].RegBank == ValMapping.BreakDown[1].RegBank);

  // 32-bit extract of a 64-bit value is just access of a subregister, so free.
  // TODO: Cost of 0 hits assert, though it's not clear it's what we really
  // want.

  // TODO: 32-bit insert to a 64-bit SGPR may incur a non-free copy due to SGPR
  // alignment restrictions, but this probably isn't important.
  return 1;
}

const RegisterBank &
AMDGPURegisterBankInfo::getRegBankFromRegClass(const TargetRegisterClass &RC,
                                               LLT Ty) const {
  if (&RC == &AMDGPU::SReg_1RegClass)
    return AMDGPU::VCCRegBank;

  // We promote real scalar booleans to SReg_32. Any SGPR using s1 is really a
  // VCC-like use.
  if (TRI->isSGPRClass(&RC)) {
    // FIXME: This probably came from a copy from a physical register, which
    // should be inferable from the copied to-type. We don't have many boolean
    // physical register constraints so just assume a normal SGPR for now.
    if (!Ty.isValid())
      return AMDGPU::SGPRRegBank;

    return Ty == LLT::scalar(1) ? AMDGPU::VCCRegBank : AMDGPU::SGPRRegBank;
  }

  return TRI->isAGPRClass(&RC) ? AMDGPU::AGPRRegBank : AMDGPU::VGPRRegBank;
}

template <unsigned NumOps>
RegisterBankInfo::InstructionMappings
AMDGPURegisterBankInfo::addMappingFromTable(
    const MachineInstr &MI, const MachineRegisterInfo &MRI,
    const std::array<unsigned, NumOps> RegSrcOpIdx,
    ArrayRef<OpRegBankEntry<NumOps>> Table) const {

  InstructionMappings AltMappings;

  SmallVector<const ValueMapping *, 10> Operands(MI.getNumOperands());

  unsigned Sizes[NumOps];
  for (unsigned I = 0; I < NumOps; ++I) {
    Register Reg = MI.getOperand(RegSrcOpIdx[I]).getReg();
    Sizes[I] = getSizeInBits(Reg, MRI, *TRI);
  }

  for (unsigned I = 0, E = MI.getNumExplicitDefs(); I != E; ++I) {
    unsigned SizeI = getSizeInBits(MI.getOperand(I).getReg(), MRI, *TRI);
    Operands[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SizeI);
  }

  // getInstrMapping's default mapping uses ID 1, so start at 2.
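  // (Each alternative needs its own ID so the greedy RegBankSelect mode can
  // tell the candidate mappings apart when comparing their costs.)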
  unsigned MappingID = 2;
  for (const auto &Entry : Table) {
    for (unsigned I = 0; I < NumOps; ++I) {
      int OpIdx = RegSrcOpIdx[I];
      Operands[OpIdx] = AMDGPU::getValueMapping(Entry.RegBanks[I], Sizes[I]);
    }

    AltMappings.push_back(&getInstructionMapping(MappingID++, Entry.Cost,
                                                 getOperandsMapping(Operands),
                                                 Operands.size()));
  }

  return AltMappings;
}

RegisterBankInfo::InstructionMappings
AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsic(
    const MachineInstr &MI, const MachineRegisterInfo &MRI) const {
  switch (MI.getIntrinsicID()) {
  case Intrinsic::amdgcn_readlane: {
    static const OpRegBankEntry<3> Table[2] = {
      // Perfectly legal.
      { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },

      // Need a readfirstlane for the index.
      { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 }
    };

    const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 2, 3 } };
    return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
  }
  case Intrinsic::amdgcn_writelane: {
    static const OpRegBankEntry<4> Table[4] = {
      // Perfectly legal.
      { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },

      // Need readfirstlane of first op
      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 },

      // Need readfirstlane of second op
      { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 },

      // Need readfirstlane of both ops
      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 3 }
    };

    const std::array<unsigned, 4> RegSrcOpIdx = { { 0, 2, 3, 4 } };
    return addMappingFromTable<4>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
  }
  default:
    return RegisterBankInfo::getInstrAlternativeMappings(MI);
  }
}

RegisterBankInfo::InstructionMappings
AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsicWSideEffects(
    const MachineInstr &MI, const MachineRegisterInfo &MRI) const {

  switch (MI.getIntrinsicID()) {
  case Intrinsic::amdgcn_s_buffer_load: {
    static const OpRegBankEntry<2> Table[4] = {
      // Perfectly legal.
      { { AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },

      // Only need 1 register in loop
      { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 300 },

      // Have to waterfall the resource.
      { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1000 },

      // Have to waterfall the resource, and the offset.
      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1500 }
    };

    // rsrc, offset
    const std::array<unsigned, 2> RegSrcOpIdx = { { 2, 3 } };
    return addMappingFromTable<2>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
  }
  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap: {
    // VGPR = M0, VGPR
    static const OpRegBankEntry<3> Table[2] = {
      // Perfectly legal.
      { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },

      // Need a readfirstlane for m0
      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 }
    };

    const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 2, 3 } };
    return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
  }
  case Intrinsic::amdgcn_s_sendmsg:
  case Intrinsic::amdgcn_s_sendmsghalt: {
    // FIXME: Should have no register for immediate
    static const OpRegBankEntry<1> Table[2] = {
      // Perfectly legal.
      { { AMDGPU::SGPRRegBankID }, 1 },

      // Need readlane
      { { AMDGPU::VGPRRegBankID }, 3 }
    };

    const std::array<unsigned, 1> RegSrcOpIdx = { { 2 } };
    return addMappingFromTable<1>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
  }
  default:
    return RegisterBankInfo::getInstrAlternativeMappings(MI);
  }
}

static bool memOpHasNoClobbered(const MachineMemOperand *MMO) {
  const Instruction *I = dyn_cast_or_null<Instruction>(MMO->getValue());
  return I && I->getMetadata("amdgpu.noclobber");
}

// FIXME: Returns uniform if there's no source value information. This is
// probably wrong.
static bool isScalarLoadLegal(const MachineInstr &MI) {
  if (!MI.hasOneMemOperand())
    return false;

  const MachineMemOperand *MMO = *MI.memoperands_begin();
  const unsigned AS = MMO->getAddrSpace();
  const bool IsConst = AS == AMDGPUAS::CONSTANT_ADDRESS ||
                       AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT;

  // There are no extending SMRD/SMEM loads, and they require 4-byte alignment.
  return MMO->getSize() >= 4 && MMO->getAlign() >= Align(4) &&
    // Can't do a scalar atomic load.
    !MMO->isAtomic() &&
    // Don't use scalar loads for volatile accesses to non-constant address
    // spaces.
    (IsConst || !MMO->isVolatile()) &&
    // Memory must be known constant, or not written before this load.
    (IsConst || MMO->isInvariant() || memOpHasNoClobbered(MMO)) &&
    AMDGPUInstrInfo::isUniformMMO(MMO);
}

RegisterBankInfo::InstructionMappings
AMDGPURegisterBankInfo::getInstrAlternativeMappings(
    const MachineInstr &MI) const {

  const MachineFunction &MF = *MI.getParent()->getParent();
  const MachineRegisterInfo &MRI = MF.getRegInfo();

  InstructionMappings AltMappings;
  switch (MI.getOpcode()) {
  case TargetOpcode::G_CONSTANT: {
    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
    if (Size == 1) {
      static const OpRegBankEntry<1> Table[3] = {
        { { AMDGPU::VGPRRegBankID }, 1 },
        { { AMDGPU::SGPRRegBankID }, 1 },
        { { AMDGPU::VCCRegBankID }, 1 }
      };

      return addMappingFromTable<1>(MI, MRI, {{ 0 }}, Table);
    }

    LLVM_FALLTHROUGH;
  }
  case TargetOpcode::G_FCONSTANT:
  case TargetOpcode::G_FRAME_INDEX:
  case TargetOpcode::G_GLOBAL_VALUE: {
    static const OpRegBankEntry<1> Table[2] = {
      { { AMDGPU::VGPRRegBankID }, 1 },
      { { AMDGPU::SGPRRegBankID }, 1 }
    };

    return addMappingFromTable<1>(MI, MRI, {{ 0 }}, Table);
  }
  case TargetOpcode::G_AND:
  case TargetOpcode::G_OR:
  case TargetOpcode::G_XOR: {
    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);

    if (Size == 1) {
      // s_{and|or|xor}_b32 set scc when the result of the 32-bit op is not 0.
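      // For instance, a uniform i1 AND can be selected as s_and_b32 on the
      // widened 32-bit values, with the boolean carried in the low bit.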
      const InstructionMapping &SCCMapping = getInstructionMapping(
        1, 1, getOperandsMapping(
          {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32),
           AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32),
           AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32)}),
        3); // Num Operands
      AltMappings.push_back(&SCCMapping);

      const InstructionMapping &VCCMapping0 = getInstructionMapping(
        2, 1, getOperandsMapping(
          {AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size),
           AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size),
           AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size)}),
        3); // Num Operands
      AltMappings.push_back(&VCCMapping0);
      return AltMappings;
    }

    if (Size != 64)
      break;

    const InstructionMapping &SSMapping = getInstructionMapping(
      1, 1, getOperandsMapping(
        {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
      3); // Num Operands
    AltMappings.push_back(&SSMapping);

    const InstructionMapping &VVMapping = getInstructionMapping(
      2, 2, getOperandsMapping(
        {AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
         AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
         AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}),
      3); // Num Operands
    AltMappings.push_back(&VVMapping);
    break;
  }
  case TargetOpcode::G_LOAD:
  case TargetOpcode::G_ZEXTLOAD:
  case TargetOpcode::G_SEXTLOAD: {
    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
    LLT PtrTy = MRI.getType(MI.getOperand(1).getReg());
    unsigned PtrSize = PtrTy.getSizeInBits();
    unsigned AS = PtrTy.getAddressSpace();

    if ((AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::REGION_ADDRESS &&
         AS != AMDGPUAS::PRIVATE_ADDRESS) &&
        isScalarLoadLegal(MI)) {
      const InstructionMapping &SSMapping = getInstructionMapping(
        1, 1, getOperandsMapping(
          {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
           AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize)}),
        2); // Num Operands
      AltMappings.push_back(&SSMapping);
    }

    const InstructionMapping &VVMapping = getInstructionMapping(
      2, 1,
      getOperandsMapping(
        {AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
         AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize)}),
      2); // Num Operands
    AltMappings.push_back(&VVMapping);

    // It may be possible to have a vgpr = load sgpr mapping here, because
    // the mubuf instructions support this kind of load, but probably for only
    // gfx7 and older. However, the addressing mode matching in the instruction
    // selector should be able to do a better job of detecting and selecting
    // these kinds of loads from the vgpr = load vgpr mapping.
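    // (As a concrete illustration: a uniform constant-address load would take
    // the SS mapping and can select to an s_load_dword, while the VV mapping
    // selects to a flat/global VGPR load.)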

    return AltMappings;
  }
  case TargetOpcode::G_SELECT: {
    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
    const InstructionMapping &SSMapping = getInstructionMapping(1, 1,
      getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1),
                          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
      4); // Num Operands
    AltMappings.push_back(&SSMapping);

    const InstructionMapping &VVMapping = getInstructionMapping(2, 1,
      getOperandsMapping({AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
                          AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}),
      4); // Num Operands
    AltMappings.push_back(&VVMapping);

    return AltMappings;
  }
  case TargetOpcode::G_SMIN:
  case TargetOpcode::G_SMAX:
  case TargetOpcode::G_UMIN:
  case TargetOpcode::G_UMAX: {
    static const OpRegBankEntry<3> Table[2] = {
      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },

      // Scalar requires cmp+select, and extends if 16-bit.
      // FIXME: Should there be separate costs for 32 and 16-bit?
      { { AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID }, 3 }
    };

    const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 1, 2 } };
    return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
  }
  case TargetOpcode::G_UADDE:
  case TargetOpcode::G_USUBE:
  case TargetOpcode::G_SADDE:
  case TargetOpcode::G_SSUBE: {
    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
    const InstructionMapping &SSMapping = getInstructionMapping(1, 1,
      getOperandsMapping(
        {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1),
         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1)}),
      5); // Num Operands
    AltMappings.push_back(&SSMapping);

    const InstructionMapping &VVMapping = getInstructionMapping(2, 1,
      getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
                          AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1)}),
      5); // Num Operands
    AltMappings.push_back(&VVMapping);
    return AltMappings;
  }
  case AMDGPU::G_BRCOND: {
    assert(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1);

    // TODO: Change type to 32 for scalar
    const InstructionMapping &SMapping = getInstructionMapping(
      1, 1, getOperandsMapping(
        {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1), nullptr}),
      2); // Num Operands
    AltMappings.push_back(&SMapping);

    const InstructionMapping &VMapping = getInstructionMapping(
      1, 1, getOperandsMapping(
        {AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1), nullptr }),
      2); // Num Operands
    AltMappings.push_back(&VMapping);
    return AltMappings;
  }
  case AMDGPU::G_INTRINSIC:
    return getInstrAlternativeMappingsIntrinsic(MI, MRI);
  case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS:
    return getInstrAlternativeMappingsIntrinsicWSideEffects(MI, MRI);
  default:
    break;
  }
  return RegisterBankInfo::getInstrAlternativeMappings(MI);
}

void AMDGPURegisterBankInfo::split64BitValueForMapping(
  MachineIRBuilder &B,
  SmallVector<Register, 2> &Regs,
  LLT HalfTy,
  Register Reg) const {
  assert(HalfTy.getSizeInBits() == 32);
  MachineRegisterInfo *MRI = B.getMRI();
  Register LoLHS = MRI->createGenericVirtualRegister(HalfTy);
  Register HiLHS = MRI->createGenericVirtualRegister(HalfTy);
  const RegisterBank *Bank = getRegBank(Reg, *MRI, *TRI);
  MRI->setRegBank(LoLHS, *Bank);
  MRI->setRegBank(HiLHS, *Bank);

  Regs.push_back(LoLHS);
  Regs.push_back(HiLHS);

  B.buildInstr(AMDGPU::G_UNMERGE_VALUES)
    .addDef(LoLHS)
    .addDef(HiLHS)
    .addUse(Reg);
}

/// Replace the current type each register in \p Regs has with \p NewTy
static void setRegsToType(MachineRegisterInfo &MRI, ArrayRef<Register> Regs,
                          LLT NewTy) {
  for (Register Reg : Regs) {
    assert(MRI.getType(Reg).getSizeInBits() == NewTy.getSizeInBits());
    MRI.setType(Reg, NewTy);
  }
}

static LLT getHalfSizedType(LLT Ty) {
  if (Ty.isVector()) {
    assert(Ty.getNumElements() % 2 == 0);
    return LLT::scalarOrVector(Ty.getNumElements() / 2, Ty.getElementType());
  }

  assert(Ty.getSizeInBits() % 2 == 0);
  return LLT::scalar(Ty.getSizeInBits() / 2);
}

/// Legalize instruction \p MI where operands in \p OpIndices must be SGPRs. If
/// any of the required SGPR operands are VGPRs, perform a waterfall loop to
/// execute the instruction for each unique combination of values in all lanes
/// in the wave. The block will be split such that rest of the instructions are
/// moved to a new block.
///
/// Essentially performs this loop:
///
/// Save Execution Mask
/// For (Lane : Wavefront) {
///   Enable Lane, Disable all other lanes
///   SGPR = read SGPR value for current lane from VGPR
///   VGPRResult[Lane] = use_op SGPR
/// }
/// Restore Execution Mask
///
/// There is additional complexity: compares are used to identify all lanes
/// sharing a value, so each unique value is processed only once.
bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
  MachineIRBuilder &B,
  iterator_range<MachineBasicBlock::iterator> Range,
  SmallSet<Register, 4> &SGPROperandRegs,
  MachineRegisterInfo &MRI) const {
  SmallVector<Register, 4> ResultRegs;
  SmallVector<Register, 4> InitResultRegs;
  SmallVector<Register, 4> PhiRegs;

  // Track use registers which have already been expanded with a readfirstlane
  // sequence. This may have multiple uses if moving a sequence.
  DenseMap<Register, Register> WaterfalledRegMap;

  MachineBasicBlock &MBB = B.getMBB();
  MachineFunction *MF = &B.getMF();

  const TargetRegisterClass *WaveRC = TRI->getWaveMaskRegClass();
  const unsigned WaveAndOpc = Subtarget.isWave32() ?
    AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
  const unsigned MovTermOpc = Subtarget.isWave32() ?
    AMDGPU::S_MOV_B32_term : AMDGPU::S_MOV_B64_term;
  const unsigned XorTermOpc = Subtarget.isWave32() ?
    AMDGPU::S_XOR_B32_term : AMDGPU::S_XOR_B64_term;
  const unsigned AndSaveExecOpc = Subtarget.isWave32() ?
    AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64;
  const unsigned ExecReg = Subtarget.isWave32() ?
    AMDGPU::EXEC_LO : AMDGPU::EXEC;

#ifndef NDEBUG
  const int OrigRangeSize = std::distance(Range.begin(), Range.end());
#endif

  for (MachineInstr &MI : Range) {
    for (MachineOperand &Def : MI.defs()) {
      if (MRI.use_nodbg_empty(Def.getReg()))
        continue;

      LLT ResTy = MRI.getType(Def.getReg());
      const RegisterBank *DefBank = getRegBank(Def.getReg(), MRI, *TRI);
      ResultRegs.push_back(Def.getReg());
      Register InitReg = B.buildUndef(ResTy).getReg(0);
      Register PhiReg = MRI.createGenericVirtualRegister(ResTy);
      InitResultRegs.push_back(InitReg);
      PhiRegs.push_back(PhiReg);
      MRI.setRegBank(PhiReg, *DefBank);
      MRI.setRegBank(InitReg, *DefBank);
    }
  }

  Register SaveExecReg = MRI.createVirtualRegister(WaveRC);
  Register InitSaveExecReg = MRI.createVirtualRegister(WaveRC);

  // Don't bother using generic instructions/registers for the exec mask.
  B.buildInstr(TargetOpcode::IMPLICIT_DEF)
    .addDef(InitSaveExecReg);

  Register PhiExec = MRI.createVirtualRegister(WaveRC);
  Register NewExec = MRI.createVirtualRegister(WaveRC);

  // To insert the loop we need to split the block. Move everything before this
  // point to a new block, and insert a new empty block before this instruction.
  MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
  MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
  MachineBasicBlock *RestoreExecBB = MF->CreateMachineBasicBlock();
  MachineFunction::iterator MBBI(MBB);
  ++MBBI;
  MF->insert(MBBI, LoopBB);
  MF->insert(MBBI, RestoreExecBB);
  MF->insert(MBBI, RemainderBB);

  LoopBB->addSuccessor(RestoreExecBB);
  LoopBB->addSuccessor(LoopBB);

  // Move the rest of the block into a new block.
  RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
  RemainderBB->splice(RemainderBB->begin(), &MBB, Range.end(), MBB.end());

  MBB.addSuccessor(LoopBB);
  RestoreExecBB->addSuccessor(RemainderBB);

  B.setInsertPt(*LoopBB, LoopBB->end());

  B.buildInstr(TargetOpcode::PHI)
    .addDef(PhiExec)
    .addReg(InitSaveExecReg)
    .addMBB(&MBB)
    .addReg(NewExec)
    .addMBB(LoopBB);

  for (auto Result : zip(InitResultRegs, ResultRegs, PhiRegs)) {
    B.buildInstr(TargetOpcode::G_PHI)
      .addDef(std::get<2>(Result))
      .addReg(std::get<0>(Result)) // Initial value / implicit_def
      .addMBB(&MBB)
      .addReg(std::get<1>(Result)) // Mid-loop value.
      .addMBB(LoopBB);
  }

  const DebugLoc &DL = B.getDL();

  MachineInstr &FirstInst = *Range.begin();

  // Move the instruction into the loop. Note we moved everything after
  // Range.end() already into a new block, so Range.end() is no longer valid.
  LoopBB->splice(LoopBB->end(), &MBB, Range.begin(), MBB.end());

  // Figure out the iterator range after splicing the instructions.
  MachineBasicBlock::iterator NewBegin = FirstInst.getIterator();
  auto NewEnd = LoopBB->end();

  MachineBasicBlock::iterator I = Range.begin();
  B.setInsertPt(*LoopBB, I);

  Register CondReg;

  assert(std::distance(NewBegin, NewEnd) == OrigRangeSize);

  for (MachineInstr &MI : make_range(NewBegin, NewEnd)) {
    for (MachineOperand &Op : MI.uses()) {
      if (!Op.isReg() || Op.isDef())
        continue;

      Register OldReg = Op.getReg();
      if (!SGPROperandRegs.count(OldReg))
        continue;

      // See if we already processed this register in another instruction in
      // the sequence.
      auto OldVal = WaterfalledRegMap.find(OldReg);
      if (OldVal != WaterfalledRegMap.end()) {
        Op.setReg(OldVal->second);
        continue;
      }

      Register OpReg = Op.getReg();
      LLT OpTy = MRI.getType(OpReg);

      const RegisterBank *OpBank = getRegBank(OpReg, MRI, *TRI);
      if (OpBank != &AMDGPU::VGPRRegBank) {
        // Insert copy from AGPR to VGPR before the loop.
        B.setMBB(MBB);
        OpReg = B.buildCopy(OpTy, OpReg).getReg(0);
        MRI.setRegBank(OpReg, AMDGPU::VGPRRegBank);
        B.setInstr(*I);
      }

      unsigned OpSize = OpTy.getSizeInBits();

      // Can only do a readlane of 32-bit pieces.
      if (OpSize == 32) {
        // Avoid extra copies in the simple case of one 32-bit register.
        Register CurrentLaneOpReg
          = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
        MRI.setType(CurrentLaneOpReg, OpTy);

        constrainGenericRegister(OpReg, AMDGPU::VGPR_32RegClass, MRI);
        // Read the next variant <- also loop target.
        BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
                CurrentLaneOpReg)
          .addReg(OpReg);

        Register NewCondReg = MRI.createVirtualRegister(WaveRC);
        bool First = CondReg == AMDGPU::NoRegister;
        if (First)
          CondReg = NewCondReg;

        // Compare the just read M0 value to all possible Idx values.
        B.buildInstr(AMDGPU::V_CMP_EQ_U32_e64)
          .addDef(NewCondReg)
          .addReg(CurrentLaneOpReg)
          .addReg(OpReg);
        Op.setReg(CurrentLaneOpReg);

        if (!First) {
          Register AndReg = MRI.createVirtualRegister(WaveRC);

          // If there are multiple operands to consider, AND the conditions
          // together.
          B.buildInstr(WaveAndOpc)
            .addDef(AndReg)
            .addReg(NewCondReg)
            .addReg(CondReg);
          CondReg = AndReg;
        }
      } else {
        LLT S32 = LLT::scalar(32);
        SmallVector<Register, 8> ReadlanePieces;

        // The compares can be done as 64-bit, but the extract needs to be done
        // in 32-bit pieces.

        bool Is64 = OpSize % 64 == 0;

        LLT UnmergeTy = OpSize % 64 == 0 ? LLT::scalar(64) : LLT::scalar(32);
        unsigned CmpOp = OpSize % 64 == 0 ? AMDGPU::V_CMP_EQ_U64_e64
          : AMDGPU::V_CMP_EQ_U32_e64;

        // Insert the unmerge before the loop.
        B.setMBB(MBB);
        auto Unmerge = B.buildUnmerge(UnmergeTy, OpReg);
        B.setInstr(*I);

        unsigned NumPieces = Unmerge->getNumOperands() - 1;
        for (unsigned PieceIdx = 0; PieceIdx != NumPieces; ++PieceIdx) {
          Register UnmergePiece = Unmerge.getReg(PieceIdx);

          Register CurrentLaneOpReg;
          if (Is64) {
            Register CurrentLaneOpRegLo = MRI.createGenericVirtualRegister(S32);
            Register CurrentLaneOpRegHi = MRI.createGenericVirtualRegister(S32);

            MRI.setRegClass(UnmergePiece, &AMDGPU::VReg_64RegClass);
            MRI.setRegClass(CurrentLaneOpRegLo, &AMDGPU::SReg_32_XM0RegClass);
            MRI.setRegClass(CurrentLaneOpRegHi, &AMDGPU::SReg_32_XM0RegClass);

            // Read the next variant <- also loop target.
            BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
                    CurrentLaneOpRegLo)
              .addReg(UnmergePiece, 0, AMDGPU::sub0);

            // Read the next variant <- also loop target.
            BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
                    CurrentLaneOpRegHi)
              .addReg(UnmergePiece, 0, AMDGPU::sub1);

            CurrentLaneOpReg =
              B.buildMerge(LLT::scalar(64),
                           {CurrentLaneOpRegLo, CurrentLaneOpRegHi})
              .getReg(0);

            MRI.setRegClass(CurrentLaneOpReg, &AMDGPU::SReg_64_XEXECRegClass);

            if (OpTy.getScalarSizeInBits() == 64) {
              // If we need to produce a 64-bit element vector, use the
              // merged pieces.
              ReadlanePieces.push_back(CurrentLaneOpReg);
            } else {
              // 32-bit element type.
              ReadlanePieces.push_back(CurrentLaneOpRegLo);
              ReadlanePieces.push_back(CurrentLaneOpRegHi);
            }
          } else {
            CurrentLaneOpReg = MRI.createGenericVirtualRegister(S32);
            MRI.setRegClass(UnmergePiece, &AMDGPU::VGPR_32RegClass);
            MRI.setRegClass(CurrentLaneOpReg, &AMDGPU::SReg_32_XM0RegClass);

            // Read the next variant <- also loop target.
            BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
                    CurrentLaneOpReg)
              .addReg(UnmergePiece);
            ReadlanePieces.push_back(CurrentLaneOpReg);
          }

          Register NewCondReg = MRI.createVirtualRegister(WaveRC);
          bool First = CondReg == AMDGPU::NoRegister;
          if (First)
            CondReg = NewCondReg;

          B.buildInstr(CmpOp)
            .addDef(NewCondReg)
            .addReg(CurrentLaneOpReg)
            .addReg(UnmergePiece);

          if (!First) {
            Register AndReg = MRI.createVirtualRegister(WaveRC);

            // If there are multiple operands to consider, AND the conditions
            // together.
            B.buildInstr(WaveAndOpc)
              .addDef(AndReg)
              .addReg(NewCondReg)
              .addReg(CondReg);
            CondReg = AndReg;
          }
        }

        // FIXME: Build merge seems to switch to CONCAT_VECTORS but not
        // BUILD_VECTOR
        if (OpTy.isVector()) {
          auto Merge = B.buildBuildVector(OpTy, ReadlanePieces);
          Op.setReg(Merge.getReg(0));
        } else {
          auto Merge = B.buildMerge(OpTy, ReadlanePieces);
          Op.setReg(Merge.getReg(0));
        }

        MRI.setRegBank(Op.getReg(), AMDGPU::SGPRRegBank);
      }

      // Make sure we don't re-process this register again.
      WaterfalledRegMap.insert(std::make_pair(OldReg, Op.getReg()));
    }
  }

  B.setInsertPt(*LoopBB, LoopBB->end());

  // Update EXEC, save the original EXEC value to VCC.
  B.buildInstr(AndSaveExecOpc)
    .addDef(NewExec)
    .addReg(CondReg, RegState::Kill);

  MRI.setSimpleHint(NewExec, CondReg);

  // Update EXEC, switch all done bits to 0 and all todo bits to 1.
  B.buildInstr(XorTermOpc)
    .addDef(ExecReg)
    .addReg(ExecReg)
    .addReg(NewExec);

  // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
  // s_cbranch_scc0?

  // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
  B.buildInstr(AMDGPU::S_CBRANCH_EXECNZ)
    .addMBB(LoopBB);

  // Save the EXEC mask before the loop.
  BuildMI(MBB, MBB.end(), DL, TII->get(MovTermOpc), SaveExecReg)
    .addReg(ExecReg);

  // Restore the EXEC mask after the loop.
  B.setMBB(*RestoreExecBB);
  B.buildInstr(MovTermOpc)
    .addDef(ExecReg)
    .addReg(SaveExecReg);

  // Set the insert point after the original instruction, so any new
  // instructions will be in the remainder.
  B.setInsertPt(*RemainderBB, RemainderBB->begin());

  return true;
}

// Return any unique registers used by \p MI at \p OpIndices that need to be
// handled in a waterfall loop. Returns these registers in \p SGPROperandRegs.
// Returns true if there are any operands to handle and a waterfall loop is
// necessary.
bool AMDGPURegisterBankInfo::collectWaterfallOperands(
  SmallSet<Register, 4> &SGPROperandRegs, MachineInstr &MI,
  MachineRegisterInfo &MRI, ArrayRef<unsigned> OpIndices) const {
  for (unsigned Op : OpIndices) {
    assert(MI.getOperand(Op).isUse());
    Register Reg = MI.getOperand(Op).getReg();
    const RegisterBank *OpBank = getRegBank(Reg, MRI, *TRI);
    if (OpBank->getID() != AMDGPU::SGPRRegBankID)
      SGPROperandRegs.insert(Reg);
  }

  // No operands need to be replaced, so no need to loop.
  return !SGPROperandRegs.empty();
}

bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
  MachineIRBuilder &B, MachineInstr &MI, MachineRegisterInfo &MRI,
  ArrayRef<unsigned> OpIndices) const {
  // Use a set to avoid extra readfirstlanes in the case where multiple operands
  // are the same register.
  SmallSet<Register, 4> SGPROperandRegs;

  if (!collectWaterfallOperands(SGPROperandRegs, MI, MRI, OpIndices))
    return false;

  MachineBasicBlock::iterator I = MI.getIterator();
  return executeInWaterfallLoop(B, make_range(I, std::next(I)),
                                SGPROperandRegs, MRI);
}

bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  ArrayRef<unsigned> OpIndices) const {
  MachineIRBuilder B(MI);
  return executeInWaterfallLoop(B, MI, MRI, OpIndices);
}

// Legalize an operand that must be an SGPR by inserting a readfirstlane.
void AMDGPURegisterBankInfo::constrainOpWithReadfirstlane(
    MachineInstr &MI, MachineRegisterInfo &MRI, unsigned OpIdx) const {
  Register Reg = MI.getOperand(OpIdx).getReg();
  const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
  if (Bank == &AMDGPU::SGPRRegBank)
    return;

  LLT Ty = MRI.getType(Reg);
  MachineIRBuilder B(MI);

  if (Bank != &AMDGPU::VGPRRegBank) {
    // We need to copy from AGPR to VGPR
    Reg = B.buildCopy(Ty, Reg).getReg(0);
    MRI.setRegBank(Reg, AMDGPU::VGPRRegBank);
  }

  Register SGPR = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
  B.buildInstr(AMDGPU::V_READFIRSTLANE_B32)
    .addDef(SGPR)
    .addReg(Reg);

  MRI.setType(SGPR, Ty);

  const TargetRegisterClass *Constrained =
      constrainGenericRegister(Reg, AMDGPU::VGPR_32RegClass, MRI);
  (void)Constrained;
  assert(Constrained && "Failed to constrain readfirstlane src reg");

  MI.getOperand(OpIdx).setReg(SGPR);
}

/// Split \p Ty into 2 pieces. The first will have \p FirstSize bits, and the
/// rest will be in the remainder.
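/// For example (illustrative): splitting <3 x s32> at FirstSize = 64 yields
/// {<2 x s32>, s32}, and splitting s96 yields {s64, s32}.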
static std::pair<LLT, LLT> splitUnequalType(LLT Ty, unsigned FirstSize) {
  unsigned TotalSize = Ty.getSizeInBits();
  if (!Ty.isVector())
    return {LLT::scalar(FirstSize), LLT::scalar(TotalSize - FirstSize)};

  LLT EltTy = Ty.getElementType();
  unsigned EltSize = EltTy.getSizeInBits();
  assert(FirstSize % EltSize == 0);

  unsigned FirstPartNumElts = FirstSize / EltSize;
  unsigned RemainderElts = (TotalSize - FirstSize) / EltSize;

  return {LLT::scalarOrVector(FirstPartNumElts, EltTy),
          LLT::scalarOrVector(RemainderElts, EltTy)};
}

static LLT widen96To128(LLT Ty) {
  if (!Ty.isVector())
    return LLT::scalar(128);

  LLT EltTy = Ty.getElementType();
  assert(128 % EltTy.getSizeInBits() == 0);
  return LLT::vector(128 / EltTy.getSizeInBits(), EltTy);
}

bool AMDGPURegisterBankInfo::applyMappingLoad(MachineInstr &MI,
                        const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
                        MachineRegisterInfo &MRI) const {
  Register DstReg = MI.getOperand(0).getReg();
  const LLT LoadTy = MRI.getType(DstReg);
  unsigned LoadSize = LoadTy.getSizeInBits();
  const unsigned MaxNonSmrdLoadSize = 128;

  const RegisterBank *PtrBank =
    OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
  if (PtrBank == &AMDGPU::SGPRRegBank) {
    // If the pointer is an SGPR, we ordinarily have nothing to do.
    if (LoadSize != 96)
      return false;

    MachineMemOperand *MMO = *MI.memoperands_begin();
    Register PtrReg = MI.getOperand(1).getReg();
    // 96-bit loads are only available for vector loads. We need to split this
    // into a 64-bit part and a 32-bit part (unless we can widen to a 128-bit
    // load).

    ApplyRegBankMapping O(*this, MRI, &AMDGPU::SGPRRegBank);
    MachineIRBuilder B(MI, O);

    if (MMO->getAlign() < Align(16)) {
      LLT Part64, Part32;
      std::tie(Part64, Part32) = splitUnequalType(LoadTy, 64);
      auto Load0 = B.buildLoadFromOffset(Part64, PtrReg, *MMO, 0);
      auto Load1 = B.buildLoadFromOffset(Part32, PtrReg, *MMO, 8);

      auto Undef = B.buildUndef(LoadTy);
      auto Ins0 = B.buildInsert(LoadTy, Undef, Load0, 0);
      B.buildInsert(MI.getOperand(0), Ins0, Load1, 64);
    } else {
      LLT WiderTy = widen96To128(LoadTy);
      auto WideLoad = B.buildLoadFromOffset(WiderTy, PtrReg, *MMO, 0);
      B.buildExtract(MI.getOperand(0), WideLoad, 0);
    }

    MI.eraseFromParent();
    return true;
  }

  // 128-bit loads are supported for all instruction types.
  if (LoadSize <= MaxNonSmrdLoadSize)
    return false;

  SmallVector<Register, 16> DefRegs(OpdMapper.getVRegs(0));
  SmallVector<Register, 1> SrcRegs(OpdMapper.getVRegs(1));

  if (SrcRegs.empty())
    SrcRegs.push_back(MI.getOperand(1).getReg());

  assert(LoadSize % MaxNonSmrdLoadSize == 0);

  // RegBankSelect only emits scalar types, so we need to reset the pointer
  // operand to a pointer type.
  Register BasePtrReg = SrcRegs[0];
  LLT PtrTy = MRI.getType(MI.getOperand(1).getReg());
  MRI.setType(BasePtrReg, PtrTy);

  unsigned NumSplitParts = LoadTy.getSizeInBits() / MaxNonSmrdLoadSize;
  const LLT LoadSplitTy = LoadTy.divide(NumSplitParts);
  ApplyRegBankMapping Observer(*this, MRI, &AMDGPU::VGPRRegBank);
  MachineIRBuilder B(MI, Observer);
  LegalizerHelper Helper(B.getMF(), Observer, B);

  if (LoadTy.isVector()) {
    if (Helper.fewerElementsVector(MI, 0, LoadSplitTy) !=
        LegalizerHelper::Legalized)
      return false;
  } else {
    if (Helper.narrowScalar(MI, 0, LoadSplitTy) != LegalizerHelper::Legalized)
      return false;
  }

  MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
  return true;
}

bool AMDGPURegisterBankInfo::applyMappingDynStackAlloc(
  MachineInstr &MI,
  const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
  MachineRegisterInfo &MRI) const {
  const MachineFunction &MF = *MI.getMF();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const auto &TFI = *ST.getFrameLowering();

  // Guard in case the stack growth direction ever changes with scratch
  // instructions.
  if (TFI.getStackGrowthDirection() == TargetFrameLowering::StackGrowsDown)
    return false;

  Register Dst = MI.getOperand(0).getReg();
  Register AllocSize = MI.getOperand(1).getReg();
  Align Alignment = assumeAligned(MI.getOperand(2).getImm());

  const RegisterBank *SizeBank = getRegBank(AllocSize, MRI, *TRI);

  // TODO: Need to emit a wave reduction to get the maximum size.
  if (SizeBank != &AMDGPU::SGPRRegBank)
    return false;

  LLT PtrTy = MRI.getType(Dst);
  LLT IntPtrTy = LLT::scalar(PtrTy.getSizeInBits());

  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  Register SPReg = Info->getStackPtrOffsetReg();
  ApplyRegBankMapping ApplyBank(*this, MRI, &AMDGPU::SGPRRegBank);
  MachineIRBuilder B(MI, ApplyBank);

  auto WaveSize = B.buildConstant(LLT::scalar(32), ST.getWavefrontSizeLog2());
  auto ScaledSize = B.buildShl(IntPtrTy, AllocSize, WaveSize);

  auto SPCopy = B.buildCopy(PtrTy, SPReg);
  if (Alignment > TFI.getStackAlign()) {
    auto PtrAdd = B.buildPtrAdd(PtrTy, SPCopy, ScaledSize);
    B.buildMaskLowPtrBits(Dst, PtrAdd,
                          Log2(Alignment) + ST.getWavefrontSizeLog2());
  } else {
    B.buildPtrAdd(Dst, SPCopy, ScaledSize);
  }

  MI.eraseFromParent();
  return true;
}

bool AMDGPURegisterBankInfo::applyMappingImage(
    MachineInstr &MI, const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
    MachineRegisterInfo &MRI, int RsrcIdx) const {
  const int NumDefs = MI.getNumExplicitDefs();

  // The reported argument index is relative to the IR intrinsic call arguments,
  // so we need to shift by the number of defs and the intrinsic ID.
  RsrcIdx += NumDefs + 1;

  // Insert copies to VGPR arguments.
  applyDefaultMapping(OpdMapper);

  // Fixup any SGPR arguments.
  SmallVector<unsigned, 4> SGPRIndexes;
  for (int I = NumDefs, NumOps = MI.getNumOperands(); I != NumOps; ++I) {
    if (!MI.getOperand(I).isReg())
      continue;

    // If this intrinsic has a sampler, it immediately follows rsrc.
    if (I == RsrcIdx || I == RsrcIdx + 1)
      SGPRIndexes.push_back(I);
  }

  executeInWaterfallLoop(MI, MRI, SGPRIndexes);
  return true;
}

static Register getSrcRegIgnoringCopies(const MachineRegisterInfo &MRI,
                                        Register Reg) {
  MachineInstr *Def = getDefIgnoringCopies(Reg, MRI);
  if (!Def)
    return Reg;

  // TODO: Guard against this being an implicit def
  return Def->getOperand(0).getReg();
}

// Analyze a combined offset from an llvm.amdgcn.s.buffer intrinsic and store
// the three offsets (voffset, soffset and instoffset).
static unsigned setBufferOffsets(MachineIRBuilder &B,
                                 const AMDGPURegisterBankInfo &RBI,
                                 Register CombinedOffset, Register &VOffsetReg,
                                 Register &SOffsetReg, int64_t &InstOffsetVal,
                                 Align Alignment) {
  const LLT S32 = LLT::scalar(32);
  MachineRegisterInfo *MRI = B.getMRI();

  if (Optional<int64_t> Imm = getConstantVRegSExtVal(CombinedOffset, *MRI)) {
    uint32_t SOffset, ImmOffset;
    if (AMDGPU::splitMUBUFOffset(*Imm, SOffset, ImmOffset, &RBI.Subtarget,
                                 Alignment)) {
      VOffsetReg = B.buildConstant(S32, 0).getReg(0);
      SOffsetReg = B.buildConstant(S32, SOffset).getReg(0);
      InstOffsetVal = ImmOffset;

      B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank);
      B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank);
      return SOffset + ImmOffset;
    }
  }

  Register Base;
  unsigned Offset;

  std::tie(Base, Offset) =
      AMDGPU::getBaseWithConstantOffset(*MRI, CombinedOffset);

  uint32_t SOffset, ImmOffset;
  if (Offset > 0 && AMDGPU::splitMUBUFOffset(Offset, SOffset, ImmOffset,
                                             &RBI.Subtarget, Alignment)) {
    if (RBI.getRegBank(Base, *MRI, *RBI.TRI) == &AMDGPU::VGPRRegBank) {
      VOffsetReg = Base;
      SOffsetReg = B.buildConstant(S32, SOffset).getReg(0);
      B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank);
      InstOffsetVal = ImmOffset;
      return 0; // XXX - Why is this 0?
    }

    // If we have an SGPR base, we can use it for soffset.
    if (SOffset == 0) {
      VOffsetReg = B.buildConstant(S32, 0).getReg(0);
      B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank);
      SOffsetReg = Base;
      InstOffsetVal = ImmOffset;
      return 0; // XXX - Why is this 0?
    }
  }

  // Handle the variable sgpr + vgpr case.
  if (MachineInstr *Add = getOpcodeDef(AMDGPU::G_ADD, CombinedOffset, *MRI)) {
    Register Src0 = getSrcRegIgnoringCopies(*MRI, Add->getOperand(1).getReg());
    Register Src1 = getSrcRegIgnoringCopies(*MRI, Add->getOperand(2).getReg());

    const RegisterBank *Src0Bank = RBI.getRegBank(Src0, *MRI, *RBI.TRI);
    const RegisterBank *Src1Bank = RBI.getRegBank(Src1, *MRI, *RBI.TRI);

    if (Src0Bank == &AMDGPU::VGPRRegBank && Src1Bank == &AMDGPU::SGPRRegBank) {
      VOffsetReg = Src0;
      SOffsetReg = Src1;
      return 0;
    }

    if (Src0Bank == &AMDGPU::SGPRRegBank && Src1Bank == &AMDGPU::VGPRRegBank) {
      VOffsetReg = Src1;
      SOffsetReg = Src0;
      return 0;
    }
  }

  // Ensure we have a VGPR for the combined offset. This could be an issue if
  // we have an SGPR offset and a VGPR resource.
  if (RBI.getRegBank(CombinedOffset, *MRI, *RBI.TRI) == &AMDGPU::VGPRRegBank) {
    VOffsetReg = CombinedOffset;
  } else {
    VOffsetReg = B.buildCopy(S32, CombinedOffset).getReg(0);
    B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank);
  }

  SOffsetReg = B.buildConstant(S32, 0).getReg(0);
  B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank);
  return 0;
}

bool AMDGPURegisterBankInfo::applyMappingSBufferLoad(
  const OperandsMapper &OpdMapper) const {
  MachineInstr &MI = OpdMapper.getMI();
  MachineRegisterInfo &MRI = OpdMapper.getMRI();

  const LLT S32 = LLT::scalar(32);
  Register Dst = MI.getOperand(0).getReg();
  LLT Ty = MRI.getType(Dst);

  const RegisterBank *RSrcBank =
    OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
  const RegisterBank *OffsetBank =
    OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
  if (RSrcBank == &AMDGPU::SGPRRegBank &&
      OffsetBank == &AMDGPU::SGPRRegBank)
    return true; // Legal mapping

  // FIXME: 96-bit case was widened during legalize. We need to narrow it back
  // here but don't have an MMO.

  unsigned LoadSize = Ty.getSizeInBits();
  int NumLoads = 1;
  if (LoadSize == 256 || LoadSize == 512) {
    NumLoads = LoadSize / 128;
    Ty = Ty.divide(NumLoads);
  }

  // Use the alignment to ensure that the required offsets will fit into the
  // immediate offsets.
  const Align Alignment = NumLoads > 1 ? Align(16 * NumLoads) : Align(1);

  MachineIRBuilder B(MI);
  MachineFunction &MF = B.getMF();

  Register SOffset;
  Register VOffset;
  int64_t ImmOffset = 0;

  unsigned MMOOffset = setBufferOffsets(B, *this, MI.getOperand(2).getReg(),
                                        VOffset, SOffset, ImmOffset, Alignment);

  // TODO: 96-bit loads were widened to 128-bit results. Shrink the result if we
  // can, but we need to track an MMO for that.
  const unsigned MemSize = (Ty.getSizeInBits() + 7) / 8;
  const Align MemAlign(4); // FIXME: ABI type alignment?
  MachineMemOperand *BaseMMO = MF.getMachineMemOperand(
    MachinePointerInfo(),
    MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
    MachineMemOperand::MOInvariant,
    MemSize, MemAlign);
  if (MMOOffset != 0)
    BaseMMO = MF.getMachineMemOperand(BaseMMO, MMOOffset, MemSize);

  // If only the offset is divergent, emit a MUBUF buffer load instead. We can
  // assume that the buffer is unswizzled.
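  // Illustrative shape of what gets built below, one 128-bit part per load:
  //   %dst = G_AMDGPU_BUFFER_LOAD %rsrc, %vindex(=0), %voffset, %soffset, imm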

  Register RSrc = MI.getOperand(1).getReg();
  Register VIndex = B.buildConstant(S32, 0).getReg(0);
  B.getMRI()->setRegBank(VIndex, AMDGPU::VGPRRegBank);

  SmallVector<Register, 4> LoadParts(NumLoads);

  MachineBasicBlock::iterator MII = MI.getIterator();
  MachineInstrSpan Span(MII, &B.getMBB());

  for (int i = 0; i < NumLoads; ++i) {
    if (NumLoads == 1) {
      LoadParts[i] = Dst;
    } else {
      LoadParts[i] = MRI.createGenericVirtualRegister(Ty);
      MRI.setRegBank(LoadParts[i], AMDGPU::VGPRRegBank);
    }

    MachineMemOperand *MMO = BaseMMO;
    if (i != 0)
      BaseMMO = MF.getMachineMemOperand(BaseMMO, MMOOffset + 16 * i, MemSize);

    B.buildInstr(AMDGPU::G_AMDGPU_BUFFER_LOAD)
      .addDef(LoadParts[i])       // vdata
      .addUse(RSrc)               // rsrc
      .addUse(VIndex)             // vindex
      .addUse(VOffset)            // voffset
      .addUse(SOffset)            // soffset
      .addImm(ImmOffset + 16 * i) // offset(imm)
      .addImm(0)                  // cachepolicy, swizzled buffer(imm)
      .addImm(0)                  // idxen(imm)
      .addMemOperand(MMO);
  }

  // TODO: If only the resource is a VGPR, it may be better to execute the
  // scalar load in the waterfall loop if the resource is expected to frequently
  // be dynamically uniform.
  if (RSrcBank != &AMDGPU::SGPRRegBank) {
    // Remove the original instruction to avoid potentially confusing the
    // waterfall loop logic.
    B.setInstr(*Span.begin());
    MI.eraseFromParent();

    SmallSet<Register, 4> OpsToWaterfall;

    OpsToWaterfall.insert(RSrc);
    executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()),
                           OpsToWaterfall, MRI);
  }

  if (NumLoads != 1) {
    if (Ty.isVector())
      B.buildConcatVectors(Dst, LoadParts);
    else
      B.buildMerge(Dst, LoadParts);
  }

  // We removed the instruction earlier with a waterfall loop.
  if (RSrcBank == &AMDGPU::SGPRRegBank)
    MI.eraseFromParent();

  return true;
}

bool AMDGPURegisterBankInfo::applyMappingBFEIntrinsic(
  const OperandsMapper &OpdMapper, bool Signed) const {
  MachineInstr &MI = OpdMapper.getMI();
  MachineRegisterInfo &MRI = OpdMapper.getMRI();

  // Insert basic copies
  applyDefaultMapping(OpdMapper);

  Register DstReg = MI.getOperand(0).getReg();
  LLT Ty = MRI.getType(DstReg);

  const LLT S32 = LLT::scalar(32);

  const RegisterBank *DstBank =
    OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
  if (DstBank == &AMDGPU::VGPRRegBank) {
    if (Ty == S32)
      return true;

    // TODO: 64-bit version is scalar only, so we need to expand this.
    return false;
  }

  Register SrcReg = MI.getOperand(2).getReg();
  Register OffsetReg = MI.getOperand(3).getReg();
  Register WidthReg = MI.getOperand(4).getReg();

  // The scalar form packs the offset and width in a single operand.

  ApplyRegBankMapping ApplyBank(*this, MRI, &AMDGPU::SGPRRegBank);
  MachineIRBuilder B(MI, ApplyBank);

  // Ensure the high bits are clear to insert the offset.
  auto OffsetMask = B.buildConstant(S32, maskTrailingOnes<unsigned>(6));
  auto ClampOffset = B.buildAnd(S32, OffsetReg, OffsetMask);

  // Zeros out the low bits, so don't bother clamping the input value.
1559 auto ShiftWidth = B.buildShl(S32, WidthReg, B.buildConstant(S32, 16)); 1560 1561 // Transformation function, pack the offset and width of a BFE into 1562 // the format expected by the S_BFE_I32 / S_BFE_U32. In the second 1563 // source, bits [5:0] contain the offset and bits [22:16] the width. 1564 auto MergedInputs = B.buildOr(S32, ClampOffset, ShiftWidth); 1565 1566 // TODO: It might be worth using a pseudo here to avoid scc clobber and 1567 // register class constraints. 1568 unsigned Opc = Ty == S32 ? (Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32) : 1569 (Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64); 1570 1571 auto MIB = B.buildInstr(Opc, {DstReg}, {SrcReg, MergedInputs}); 1572 if (!constrainSelectedInstRegOperands(*MIB, *TII, *TRI, *this)) 1573 llvm_unreachable("failed to constrain BFE"); 1574 1575 MI.eraseFromParent(); 1576 return true; 1577 } 1578 1579 // FIXME: Duplicated from LegalizerHelper 1580 static CmpInst::Predicate minMaxToCompare(unsigned Opc) { 1581 switch (Opc) { 1582 case TargetOpcode::G_SMIN: 1583 return CmpInst::ICMP_SLT; 1584 case TargetOpcode::G_SMAX: 1585 return CmpInst::ICMP_SGT; 1586 case TargetOpcode::G_UMIN: 1587 return CmpInst::ICMP_ULT; 1588 case TargetOpcode::G_UMAX: 1589 return CmpInst::ICMP_UGT; 1590 default: 1591 llvm_unreachable("not in integer min/max"); 1592 } 1593 } 1594 1595 static unsigned minMaxToExtend(unsigned Opc) { 1596 switch (Opc) { 1597 case TargetOpcode::G_SMIN: 1598 case TargetOpcode::G_SMAX: 1599 return TargetOpcode::G_SEXT; 1600 case TargetOpcode::G_UMIN: 1601 case TargetOpcode::G_UMAX: 1602 return TargetOpcode::G_ZEXT; 1603 default: 1604 llvm_unreachable("not in integer min/max"); 1605 } 1606 } 1607 1608 // Emit a legalized extension from <2 x s16> to 2 32-bit components, avoiding 1609 // any illegal vector extend or unmerge operations. 1610 static std::pair<Register, Register> 1611 unpackV2S16ToS32(MachineIRBuilder &B, Register Src, unsigned ExtOpcode) { 1612 const LLT S32 = LLT::scalar(32); 1613 auto Bitcast = B.buildBitcast(S32, Src); 1614 1615 if (ExtOpcode == TargetOpcode::G_SEXT) { 1616 auto ExtLo = B.buildSExtInReg(S32, Bitcast, 16); 1617 auto ShiftHi = B.buildAShr(S32, Bitcast, B.buildConstant(S32, 16)); 1618 return std::make_pair(ExtLo.getReg(0), ShiftHi.getReg(0)); 1619 } 1620 1621 auto ShiftHi = B.buildLShr(S32, Bitcast, B.buildConstant(S32, 16)); 1622 if (ExtOpcode == TargetOpcode::G_ZEXT) { 1623 auto ExtLo = B.buildAnd(S32, Bitcast, B.buildConstant(S32, 0xffff)); 1624 return std::make_pair(ExtLo.getReg(0), ShiftHi.getReg(0)); 1625 } 1626 1627 assert(ExtOpcode == TargetOpcode::G_ANYEXT); 1628 return std::make_pair(Bitcast.getReg(0), ShiftHi.getReg(0)); 1629 } 1630 1631 static MachineInstr *buildExpandedScalarMinMax(MachineIRBuilder &B, 1632 CmpInst::Predicate Pred, 1633 Register Dst, Register Src0, 1634 Register Src1) { 1635 const LLT CmpType = LLT::scalar(32); 1636 auto Cmp = B.buildICmp(Pred, CmpType, Src0, Src1); 1637 return B.buildSelect(Dst, Cmp, Src0, Src1); 1638 } 1639 1640 // FIXME: Duplicated from LegalizerHelper, except changing the boolean type. 
1641 void AMDGPURegisterBankInfo::lowerScalarMinMax(MachineIRBuilder &B,
1642                                                MachineInstr &MI) const {
1643   Register Dst = MI.getOperand(0).getReg();
1644   Register Src0 = MI.getOperand(1).getReg();
1645   Register Src1 = MI.getOperand(2).getReg();
1646
1647   const CmpInst::Predicate Pred = minMaxToCompare(MI.getOpcode());
1648   MachineInstr *Sel = buildExpandedScalarMinMax(B, Pred, Dst, Src0, Src1);
1649
1650   Register CmpReg = Sel->getOperand(1).getReg();
1651   B.getMRI()->setRegBank(CmpReg, AMDGPU::SGPRRegBank);
1652   MI.eraseFromParent();
1653 }
1654
1655 // For cases where only a single copy is inserted for matching register banks,
1656 // replace the register in the instruction operand.
1657 static bool substituteSimpleCopyRegs(
1658   const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper, unsigned OpIdx) {
1659   SmallVector<unsigned, 1> SrcReg(OpdMapper.getVRegs(OpIdx));
1660   if (!SrcReg.empty()) {
1661     assert(SrcReg.size() == 1);
1662     OpdMapper.getMI().getOperand(OpIdx).setReg(SrcReg[0]);
1663     return true;
1664   }
1665
1666   return false;
1667 }
1668
1669 /// Handle register layout difference for f16 images for some subtargets.
1670 Register AMDGPURegisterBankInfo::handleD16VData(MachineIRBuilder &B,
1671                                                 MachineRegisterInfo &MRI,
1672                                                 Register Reg) const {
1673   if (!Subtarget.hasUnpackedD16VMem())
1674     return Reg;
1675
1676   const LLT S16 = LLT::scalar(16);
1677   LLT StoreVT = MRI.getType(Reg);
1678   if (!StoreVT.isVector() || StoreVT.getElementType() != S16)
1679     return Reg;
1680
1681   auto Unmerge = B.buildUnmerge(S16, Reg);
1682
1683
1684   SmallVector<Register, 4> WideRegs;
1685   for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
1686     WideRegs.push_back(Unmerge.getReg(I));
1687
1688   const LLT S32 = LLT::scalar(32);
1689   int NumElts = StoreVT.getNumElements();
1690
1691   return B.buildMerge(LLT::vector(NumElts, S32), WideRegs).getReg(0);
1692 }
1693
1694 static std::pair<Register, unsigned>
1695 getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg) {
1696   int64_t Const;
1697   if (mi_match(Reg, MRI, m_ICst(Const)))
1698     return std::make_pair(Register(), Const);
1699
1700   Register Base;
1701   if (mi_match(Reg, MRI, m_GAdd(m_Reg(Base), m_ICst(Const))))
1702     return std::make_pair(Base, Const);
1703
1704   // TODO: Handle G_OR used for add case
1705   return std::make_pair(Reg, 0);
1706 }
1707
1708 std::pair<Register, unsigned>
1709 AMDGPURegisterBankInfo::splitBufferOffsets(MachineIRBuilder &B,
1710                                            Register OrigOffset) const {
1711   const unsigned MaxImm = 4095;
1712   Register BaseReg;
1713   unsigned ImmOffset;
1714   const LLT S32 = LLT::scalar(32);
1715
1716   std::tie(BaseReg, ImmOffset) = getBaseWithConstantOffset(*B.getMRI(),
1717                                                            OrigOffset);
1718
1719   unsigned C1 = 0;
1720   if (ImmOffset != 0) {
1721     // If the immediate value is too big for the immoffset field, put the value
1722     // and -4096 into the immoffset field so that the value that is copied/added
1723     // for the voffset field is a multiple of 4096, and it stands more chance
1724     // of being CSEd with the copy/add for another similar load/store.
1725     // However, do not do that rounding down to a multiple of 4096 if that is a
1726     // negative number, as it appears to be illegal to have a negative offset
1727     // in the vgpr, even if adding the immediate offset makes it positive.
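    // Worked example (illustrative values): an incoming constant offset of
    // 8196 splits into Overflow = 8196 & ~4095 == 8192 and ImmOffset == 4,
    // so 4 ends up in the instruction's immediate field and 8192 is folded
    // into the base register. If the original offset has bit 31 set (e.g.
    // 0x80000004), Overflow is negative as an int32, so the whole value is
    // moved into the register and the immediate field gets 0.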
1728 unsigned Overflow = ImmOffset & ~MaxImm; 1729 ImmOffset -= Overflow; 1730 if ((int32_t)Overflow < 0) { 1731 Overflow += ImmOffset; 1732 ImmOffset = 0; 1733 } 1734 1735 C1 = ImmOffset; 1736 if (Overflow != 0) { 1737 if (!BaseReg) 1738 BaseReg = B.buildConstant(S32, Overflow).getReg(0); 1739 else { 1740 auto OverflowVal = B.buildConstant(S32, Overflow); 1741 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0); 1742 } 1743 } 1744 } 1745 1746 if (!BaseReg) 1747 BaseReg = B.buildConstant(S32, 0).getReg(0); 1748 1749 return {BaseReg, C1}; 1750 } 1751 1752 static bool isZero(Register Reg, MachineRegisterInfo &MRI) { 1753 int64_t C; 1754 return mi_match(Reg, MRI, m_ICst(C)) && C == 0; 1755 } 1756 1757 static unsigned extractGLC(unsigned CachePolicy) { 1758 return CachePolicy & 1; 1759 } 1760 1761 static unsigned extractSLC(unsigned CachePolicy) { 1762 return (CachePolicy >> 1) & 1; 1763 } 1764 1765 static unsigned extractDLC(unsigned CachePolicy) { 1766 return (CachePolicy >> 2) & 1; 1767 } 1768 1769 MachineInstr * 1770 AMDGPURegisterBankInfo::selectStoreIntrinsic(MachineIRBuilder &B, 1771 MachineInstr &MI) const { 1772 MachineRegisterInfo &MRI = *B.getMRI(); 1773 executeInWaterfallLoop(B, MI, MRI, {2, 4}); 1774 1775 // FIXME: DAG lowering brokenly changes opcode based on FP vs. integer. 1776 1777 Register VData = MI.getOperand(1).getReg(); 1778 LLT Ty = MRI.getType(VData); 1779 1780 int EltSize = Ty.getScalarSizeInBits(); 1781 int Size = Ty.getSizeInBits(); 1782 1783 // FIXME: Broken integer truncstore. 1784 if (EltSize != 32) 1785 report_fatal_error("unhandled intrinsic store"); 1786 1787 // FIXME: Verifier should enforce 1 MMO for these intrinsics. 1788 const int MemSize = (*MI.memoperands_begin())->getSize(); 1789 1790 1791 Register RSrc = MI.getOperand(2).getReg(); 1792 Register VOffset = MI.getOperand(3).getReg(); 1793 Register SOffset = MI.getOperand(4).getReg(); 1794 unsigned CachePolicy = MI.getOperand(5).getImm(); 1795 1796 unsigned ImmOffset; 1797 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset); 1798 1799 const bool Offen = !isZero(VOffset, MRI); 1800 1801 unsigned Opc = AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact; 1802 switch (8 * MemSize) { 1803 case 8: 1804 Opc = Offen ? AMDGPU::BUFFER_STORE_BYTE_OFFEN_exact : 1805 AMDGPU::BUFFER_STORE_BYTE_OFFSET_exact; 1806 break; 1807 case 16: 1808 Opc = Offen ? AMDGPU::BUFFER_STORE_SHORT_OFFEN_exact : 1809 AMDGPU::BUFFER_STORE_SHORT_OFFSET_exact; 1810 break; 1811 default: 1812 Opc = Offen ? AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact : 1813 AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact; 1814 if (Size > 32) 1815 Opc = AMDGPU::getMUBUFOpcode(Opc, Size / 32); 1816 break; 1817 } 1818 1819 1820 // Set the insertion point back to the instruction in case it was moved into a 1821 // loop. 1822 B.setInstr(MI); 1823 1824 MachineInstrBuilder MIB = B.buildInstr(Opc) 1825 .addUse(VData); 1826 1827 if (Offen) 1828 MIB.addUse(VOffset); 1829 1830 MIB.addUse(RSrc) 1831 .addUse(SOffset) 1832 .addImm(ImmOffset) 1833 .addImm(extractGLC(CachePolicy)) 1834 .addImm(extractSLC(CachePolicy)) 1835 .addImm(0) // tfe: FIXME: Remove from inst 1836 .addImm(extractDLC(CachePolicy)) 1837 .cloneMemRefs(MI); 1838 1839 // FIXME: We need a way to report failure from applyMappingImpl. 1840 // Insert constrain copies before inserting the loop. 
1841   if (!constrainSelectedInstRegOperands(*MIB, *TII, *TRI, *this))
1842     report_fatal_error("failed to constrain selected store intrinsic");
1843
1844   return MIB;
1845 }
1846
1847 bool AMDGPURegisterBankInfo::buildVCopy(MachineIRBuilder &B, Register DstReg,
1848                                         Register SrcReg) const {
1849   MachineRegisterInfo &MRI = *B.getMRI();
1850   LLT SrcTy = MRI.getType(SrcReg);
1851   if (SrcTy.getSizeInBits() == 32) {
1852     // Use a v_mov_b32 here to make the exec dependency explicit.
1853     B.buildInstr(AMDGPU::V_MOV_B32_e32)
1854       .addDef(DstReg)
1855       .addUse(SrcReg);
1856     return constrainGenericRegister(DstReg, AMDGPU::VGPR_32RegClass, MRI) &&
1857            constrainGenericRegister(SrcReg, AMDGPU::SReg_32RegClass, MRI);
1858   }
1859
1860   Register TmpReg0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1861   Register TmpReg1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1862
1863   B.buildInstr(AMDGPU::V_MOV_B32_e32)
1864     .addDef(TmpReg0)
1865     .addUse(SrcReg, 0, AMDGPU::sub0);
1866   B.buildInstr(AMDGPU::V_MOV_B32_e32)
1867     .addDef(TmpReg1)
1868     .addUse(SrcReg, 0, AMDGPU::sub1);
1869   B.buildInstr(AMDGPU::REG_SEQUENCE)
1870     .addDef(DstReg)
1871     .addUse(TmpReg0)
1872     .addImm(AMDGPU::sub0)
1873     .addUse(TmpReg1)
1874     .addImm(AMDGPU::sub1);
1875
1876   return constrainGenericRegister(SrcReg, AMDGPU::SReg_64RegClass, MRI) &&
1877          constrainGenericRegister(DstReg, AMDGPU::VReg_64RegClass, MRI);
1878 }
1879
1880 /// Utility function for pushing dynamic vector indexes with a constant offset
1881 /// into waterfall loops.
1882 static void reinsertVectorIndexAdd(MachineIRBuilder &B,
1883                                    MachineInstr &IdxUseInstr,
1884                                    unsigned OpIdx,
1885                                    unsigned ConstOffset) {
1886   MachineRegisterInfo &MRI = *B.getMRI();
1887   const LLT S32 = LLT::scalar(32);
1888   Register WaterfallIdx = IdxUseInstr.getOperand(OpIdx).getReg();
1889   B.setInsertPt(*IdxUseInstr.getParent(), IdxUseInstr.getIterator());
1890
1891   auto MaterializedOffset = B.buildConstant(S32, ConstOffset);
1892
1893   auto Add = B.buildAdd(S32, WaterfallIdx, MaterializedOffset);
1894   MRI.setRegBank(MaterializedOffset.getReg(0), AMDGPU::SGPRRegBank);
1895   MRI.setRegBank(Add.getReg(0), AMDGPU::SGPRRegBank);
1896   IdxUseInstr.getOperand(OpIdx).setReg(Add.getReg(0));
1897 }
1898
1899 /// Implement extending a 32-bit value to a 64-bit value. \p Lo32Reg is the
1900 /// original 32-bit source value (to be inserted in the low part of the combined
1901 /// 64-bit result), and \p Hi32Reg is the high half of the combined 64-bit
1902 /// value.
1903 static void extendLow32IntoHigh32(MachineIRBuilder &B,
1904                                   Register Hi32Reg, Register Lo32Reg,
1905                                   unsigned ExtOpc,
1906                                   const RegisterBank &RegBank,
1907                                   bool IsBooleanSrc = false) {
1908   if (ExtOpc == AMDGPU::G_ZEXT) {
1909     B.buildConstant(Hi32Reg, 0);
1910   } else if (ExtOpc == AMDGPU::G_SEXT) {
1911     if (IsBooleanSrc) {
1912       // If we know the original source was an s1, the high half is the same as
1913       // the low.
1914       B.buildCopy(Hi32Reg, Lo32Reg);
1915     } else {
1916       // Replicate sign bit from 32-bit extended part.
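      // Example (illustrative): a low half of 0x80000000 yields a high half
      // of 0xFFFFFFFF after the arithmetic shift by 31; 0x7FFFFFFF yields 0.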
1917 auto ShiftAmt = B.buildConstant(LLT::scalar(32), 31); 1918 B.getMRI()->setRegBank(ShiftAmt.getReg(0), RegBank); 1919 B.buildAShr(Hi32Reg, Lo32Reg, ShiftAmt); 1920 } 1921 } else { 1922 assert(ExtOpc == AMDGPU::G_ANYEXT && "not an integer extension"); 1923 B.buildUndef(Hi32Reg); 1924 } 1925 } 1926 1927 bool AMDGPURegisterBankInfo::foldExtractEltToCmpSelect( 1928 MachineInstr &MI, MachineRegisterInfo &MRI, 1929 const OperandsMapper &OpdMapper) const { 1930 1931 Register VecReg = MI.getOperand(1).getReg(); 1932 Register Idx = MI.getOperand(2).getReg(); 1933 1934 const RegisterBank &IdxBank = 1935 *OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank; 1936 1937 bool IsDivergentIdx = IdxBank != AMDGPU::SGPRRegBank; 1938 1939 LLT VecTy = MRI.getType(VecReg); 1940 unsigned EltSize = VecTy.getScalarSizeInBits(); 1941 unsigned NumElem = VecTy.getNumElements(); 1942 1943 if (!SITargetLowering::shouldExpandVectorDynExt(EltSize, NumElem, 1944 IsDivergentIdx)) 1945 return false; 1946 1947 MachineIRBuilder B(MI); 1948 LLT S32 = LLT::scalar(32); 1949 1950 const RegisterBank &DstBank = 1951 *OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; 1952 const RegisterBank &SrcBank = 1953 *OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank; 1954 1955 const RegisterBank &CCBank = 1956 (DstBank == AMDGPU::SGPRRegBank && 1957 SrcBank == AMDGPU::SGPRRegBank && 1958 IdxBank == AMDGPU::SGPRRegBank) ? AMDGPU::SGPRRegBank 1959 : AMDGPU::VCCRegBank; 1960 LLT CCTy = (CCBank == AMDGPU::SGPRRegBank) ? S32 : LLT::scalar(1); 1961 1962 if (CCBank == AMDGPU::VCCRegBank && IdxBank == AMDGPU::SGPRRegBank) { 1963 Idx = B.buildCopy(S32, Idx)->getOperand(0).getReg(); 1964 MRI.setRegBank(Idx, AMDGPU::VGPRRegBank); 1965 } 1966 1967 LLT EltTy = VecTy.getScalarType(); 1968 SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0)); 1969 unsigned NumLanes = DstRegs.size(); 1970 if (!NumLanes) 1971 NumLanes = 1; 1972 else 1973 EltTy = MRI.getType(DstRegs[0]); 1974 1975 auto UnmergeToEltTy = B.buildUnmerge(EltTy, VecReg); 1976 SmallVector<Register, 2> Res(NumLanes); 1977 for (unsigned L = 0; L < NumLanes; ++L) 1978 Res[L] = UnmergeToEltTy.getReg(L); 1979 1980 for (unsigned I = 1; I < NumElem; ++I) { 1981 auto IC = B.buildConstant(S32, I); 1982 MRI.setRegBank(IC->getOperand(0).getReg(), AMDGPU::SGPRRegBank); 1983 auto Cmp = B.buildICmp(CmpInst::ICMP_EQ, CCTy, Idx, IC); 1984 MRI.setRegBank(Cmp->getOperand(0).getReg(), CCBank); 1985 1986 for (unsigned L = 0; L < NumLanes; ++L) { 1987 auto S = B.buildSelect(EltTy, Cmp, 1988 UnmergeToEltTy.getReg(I * NumLanes + L), Res[L]); 1989 1990 for (unsigned N : { 0, 2, 3 }) 1991 MRI.setRegBank(S->getOperand(N).getReg(), DstBank); 1992 1993 Res[L] = S->getOperand(0).getReg(); 1994 } 1995 } 1996 1997 for (unsigned L = 0; L < NumLanes; ++L) { 1998 Register DstReg = (NumLanes == 1) ? 
MI.getOperand(0).getReg() : DstRegs[L]; 1999 B.buildCopy(DstReg, Res[L]); 2000 MRI.setRegBank(DstReg, DstBank); 2001 } 2002 2003 MRI.setRegBank(MI.getOperand(0).getReg(), DstBank); 2004 MI.eraseFromParent(); 2005 2006 return true; 2007 } 2008 2009 bool AMDGPURegisterBankInfo::foldInsertEltToCmpSelect( 2010 MachineInstr &MI, MachineRegisterInfo &MRI, 2011 const OperandsMapper &OpdMapper) const { 2012 2013 Register VecReg = MI.getOperand(1).getReg(); 2014 Register Idx = MI.getOperand(3).getReg(); 2015 2016 const RegisterBank &IdxBank = 2017 *OpdMapper.getInstrMapping().getOperandMapping(3).BreakDown[0].RegBank; 2018 2019 bool IsDivergentIdx = IdxBank != AMDGPU::SGPRRegBank; 2020 2021 LLT VecTy = MRI.getType(VecReg); 2022 unsigned EltSize = VecTy.getScalarSizeInBits(); 2023 unsigned NumElem = VecTy.getNumElements(); 2024 2025 if (!SITargetLowering::shouldExpandVectorDynExt(EltSize, NumElem, 2026 IsDivergentIdx)) 2027 return false; 2028 2029 MachineIRBuilder B(MI); 2030 LLT S32 = LLT::scalar(32); 2031 2032 const RegisterBank &DstBank = 2033 *OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; 2034 const RegisterBank &SrcBank = 2035 *OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank; 2036 const RegisterBank &InsBank = 2037 *OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank; 2038 2039 const RegisterBank &CCBank = 2040 (DstBank == AMDGPU::SGPRRegBank && 2041 SrcBank == AMDGPU::SGPRRegBank && 2042 InsBank == AMDGPU::SGPRRegBank && 2043 IdxBank == AMDGPU::SGPRRegBank) ? AMDGPU::SGPRRegBank 2044 : AMDGPU::VCCRegBank; 2045 LLT CCTy = (CCBank == AMDGPU::SGPRRegBank) ? S32 : LLT::scalar(1); 2046 2047 if (CCBank == AMDGPU::VCCRegBank && IdxBank == AMDGPU::SGPRRegBank) { 2048 Idx = B.buildCopy(S32, Idx)->getOperand(0).getReg(); 2049 MRI.setRegBank(Idx, AMDGPU::VGPRRegBank); 2050 } 2051 2052 LLT EltTy = VecTy.getScalarType(); 2053 SmallVector<Register, 2> InsRegs(OpdMapper.getVRegs(2)); 2054 unsigned NumLanes = InsRegs.size(); 2055 if (!NumLanes) { 2056 NumLanes = 1; 2057 InsRegs.push_back(MI.getOperand(2).getReg()); 2058 } else { 2059 EltTy = MRI.getType(InsRegs[0]); 2060 } 2061 2062 auto UnmergeToEltTy = B.buildUnmerge(EltTy, VecReg); 2063 SmallVector<Register, 16> Ops(NumElem * NumLanes); 2064 2065 for (unsigned I = 0; I < NumElem; ++I) { 2066 auto IC = B.buildConstant(S32, I); 2067 MRI.setRegBank(IC->getOperand(0).getReg(), AMDGPU::SGPRRegBank); 2068 auto Cmp = B.buildICmp(CmpInst::ICMP_EQ, CCTy, Idx, IC); 2069 MRI.setRegBank(Cmp->getOperand(0).getReg(), CCBank); 2070 2071 for (unsigned L = 0; L < NumLanes; ++L) { 2072 auto S = B.buildSelect(EltTy, Cmp, InsRegs[L], 2073 UnmergeToEltTy.getReg(I * NumLanes + L)); 2074 2075 for (unsigned N : { 0, 2, 3 }) 2076 MRI.setRegBank(S->getOperand(N).getReg(), DstBank); 2077 2078 Ops[I * NumLanes + L] = S->getOperand(0).getReg(); 2079 } 2080 } 2081 2082 LLT MergeTy = LLT::vector(Ops.size(), EltTy); 2083 if (MergeTy == MRI.getType(MI.getOperand(0).getReg())) { 2084 B.buildBuildVector(MI.getOperand(0), Ops); 2085 } else { 2086 auto Vec = B.buildBuildVector(MergeTy, Ops); 2087 MRI.setRegBank(Vec->getOperand(0).getReg(), DstBank); 2088 B.buildBitcast(MI.getOperand(0).getReg(), Vec); 2089 } 2090 2091 MRI.setRegBank(MI.getOperand(0).getReg(), DstBank); 2092 MI.eraseFromParent(); 2093 2094 return true; 2095 } 2096 2097 void AMDGPURegisterBankInfo::applyMappingImpl( 2098 const OperandsMapper &OpdMapper) const { 2099 MachineInstr &MI = OpdMapper.getMI(); 2100 unsigned Opc = MI.getOpcode(); 2101 MachineRegisterInfo 
&MRI = OpdMapper.getMRI();
2102   switch (Opc) {
2103   case AMDGPU::G_PHI: {
2104     Register DstReg = MI.getOperand(0).getReg();
2105     LLT DstTy = MRI.getType(DstReg);
2106     if (DstTy != LLT::scalar(1))
2107       break;
2108
2109     const LLT S32 = LLT::scalar(32);
2110     const RegisterBank *DstBank =
2111       OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2112     if (DstBank == &AMDGPU::VCCRegBank) {
2113       applyDefaultMapping(OpdMapper);
2114       // The standard handling only considers the result register bank for
2115       // phis. For VCC, blindly inserting a copy when the phi is lowered will
2116       // produce an invalid copy. We can only copy with some kind of compare to
2117       // get a vector boolean result. Insert a register bank copy that will be
2118       // correctly lowered to a compare.
2119       MachineIRBuilder B(*MI.getParent()->getParent());
2120
2121       for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
2122         Register SrcReg = MI.getOperand(I).getReg();
2123         const RegisterBank *SrcBank = getRegBank(SrcReg, MRI, *TRI);
2124
2125         if (SrcBank != &AMDGPU::VCCRegBank) {
2126           MachineBasicBlock *SrcMBB = MI.getOperand(I + 1).getMBB();
2127           B.setInsertPt(*SrcMBB, SrcMBB->getFirstTerminator());
2128
2129           auto Copy = B.buildCopy(LLT::scalar(1), SrcReg);
2130           MRI.setRegBank(Copy.getReg(0), AMDGPU::VCCRegBank);
2131           MI.getOperand(I).setReg(Copy.getReg(0));
2132         }
2133       }
2134
2135       return;
2136     }
2137
2138     // Phi handling is strange and only considers the bank of the destination.
2139     substituteSimpleCopyRegs(OpdMapper, 0);
2140
2141     // Promote SGPR/VGPR booleans to s32
2142     MachineFunction *MF = MI.getParent()->getParent();
2143     ApplyRegBankMapping ApplyBank(*this, MRI, DstBank);
2144     MachineIRBuilder B(MI, ApplyBank);
2145     LegalizerHelper Helper(*MF, ApplyBank, B);
2146
2147     if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized)
2148       llvm_unreachable("widen scalar should have succeeded");
2149
2150     return;
2151   }
2152   case AMDGPU::G_ICMP:
2153   case AMDGPU::G_UADDO:
2154   case AMDGPU::G_USUBO:
2155   case AMDGPU::G_UADDE:
2156   case AMDGPU::G_SADDE:
2157   case AMDGPU::G_USUBE:
2158   case AMDGPU::G_SSUBE: {
2159     unsigned BoolDstOp = Opc == AMDGPU::G_ICMP ? 0 : 1;
2160     Register DstReg = MI.getOperand(BoolDstOp).getReg();
2161
2162     const RegisterBank *DstBank =
2163       OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2164     if (DstBank != &AMDGPU::SGPRRegBank)
2165       break;
2166
2167     const bool HasCarryIn = MI.getNumOperands() == 5;
2168
2169     // If this is a scalar compare, promote the result to s32, as the selection
2170     // will end up using a copy to a 32-bit vreg.
2171     const LLT S32 = LLT::scalar(32);
2172     Register NewDstReg = MRI.createGenericVirtualRegister(S32);
2173     MRI.setRegBank(NewDstReg, AMDGPU::SGPRRegBank);
2174     MI.getOperand(BoolDstOp).setReg(NewDstReg);
2175     MachineIRBuilder B(MI);
2176
2177     if (HasCarryIn) {
2178       Register NewSrcReg = MRI.createGenericVirtualRegister(S32);
2179       MRI.setRegBank(NewSrcReg, AMDGPU::SGPRRegBank);
2180       B.buildZExt(NewSrcReg, MI.getOperand(4).getReg());
2181       MI.getOperand(4).setReg(NewSrcReg);
2182     }
2183
2184     MachineBasicBlock *MBB = MI.getParent();
2185     B.setInsertPt(*MBB, std::next(MI.getIterator()));
2186
2187     // If we had a constrained VCC result register, a copy was inserted to VCC
2188     // from SGPR.
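    // Note (assumption based on the surrounding mapping code): in that case
    // getVRegs(0) is non-empty and holds the SGPR-side s1 def feeding the
    // copy, so the widened s32 result is truncated into it below; otherwise
    // the truncation writes straight to the original dst.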
2189 SmallVector<Register, 1> DefRegs(OpdMapper.getVRegs(0)); 2190 if (DefRegs.empty()) 2191 DefRegs.push_back(DstReg); 2192 B.buildTrunc(DefRegs[0], NewDstReg); 2193 return; 2194 } 2195 case AMDGPU::G_SELECT: { 2196 Register DstReg = MI.getOperand(0).getReg(); 2197 LLT DstTy = MRI.getType(DstReg); 2198 2199 SmallVector<Register, 1> CondRegs(OpdMapper.getVRegs(1)); 2200 if (CondRegs.empty()) 2201 CondRegs.push_back(MI.getOperand(1).getReg()); 2202 else { 2203 assert(CondRegs.size() == 1); 2204 } 2205 2206 const RegisterBank *CondBank = getRegBank(CondRegs[0], MRI, *TRI); 2207 if (CondBank == &AMDGPU::SGPRRegBank) { 2208 MachineIRBuilder B(MI); 2209 const LLT S32 = LLT::scalar(32); 2210 Register NewCondReg = MRI.createGenericVirtualRegister(S32); 2211 MRI.setRegBank(NewCondReg, AMDGPU::SGPRRegBank); 2212 2213 MI.getOperand(1).setReg(NewCondReg); 2214 B.buildZExt(NewCondReg, CondRegs[0]); 2215 } 2216 2217 if (DstTy.getSizeInBits() != 64) 2218 break; 2219 2220 MachineIRBuilder B(MI); 2221 LLT HalfTy = getHalfSizedType(DstTy); 2222 2223 SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0)); 2224 SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2)); 2225 SmallVector<Register, 2> Src2Regs(OpdMapper.getVRegs(3)); 2226 2227 // All inputs are SGPRs, nothing special to do. 2228 if (DefRegs.empty()) { 2229 assert(Src1Regs.empty() && Src2Regs.empty()); 2230 break; 2231 } 2232 2233 if (Src1Regs.empty()) 2234 split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg()); 2235 else { 2236 setRegsToType(MRI, Src1Regs, HalfTy); 2237 } 2238 2239 if (Src2Regs.empty()) 2240 split64BitValueForMapping(B, Src2Regs, HalfTy, MI.getOperand(3).getReg()); 2241 else 2242 setRegsToType(MRI, Src2Regs, HalfTy); 2243 2244 setRegsToType(MRI, DefRegs, HalfTy); 2245 2246 B.buildSelect(DefRegs[0], CondRegs[0], Src1Regs[0], Src2Regs[0]); 2247 B.buildSelect(DefRegs[1], CondRegs[0], Src1Regs[1], Src2Regs[1]); 2248 2249 MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank); 2250 MI.eraseFromParent(); 2251 return; 2252 } 2253 case AMDGPU::G_BRCOND: { 2254 Register CondReg = MI.getOperand(0).getReg(); 2255 // FIXME: Should use legalizer helper, but should change bool ext type. 2256 const RegisterBank *CondBank = 2257 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; 2258 2259 if (CondBank == &AMDGPU::SGPRRegBank) { 2260 MachineIRBuilder B(MI); 2261 const LLT S32 = LLT::scalar(32); 2262 Register NewCondReg = MRI.createGenericVirtualRegister(S32); 2263 MRI.setRegBank(NewCondReg, AMDGPU::SGPRRegBank); 2264 2265 MI.getOperand(0).setReg(NewCondReg); 2266 B.buildZExt(NewCondReg, CondReg); 2267 return; 2268 } 2269 2270 break; 2271 } 2272 case AMDGPU::G_AND: 2273 case AMDGPU::G_OR: 2274 case AMDGPU::G_XOR: { 2275 // 64-bit and is only available on the SALU, so split into 2 32-bit ops if 2276 // there is a VGPR input. 
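    // Sketch of the expansion (illustrative MIR, not from the original
    // source): with a VGPR input,
    //   %d:s64 = G_AND %a:s64, %b:s64
    // becomes
    //   %dlo:s32 = G_AND %alo:s32, %blo:s32
    //   %dhi:s32 = G_AND %ahi:s32, %bhi:s32
    // where the halves come from split64BitValueForMapping (or were already
    // split by the generic code), so each piece can be selected as a 32-bit
    // VALU op.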
2277 Register DstReg = MI.getOperand(0).getReg(); 2278 LLT DstTy = MRI.getType(DstReg); 2279 2280 if (DstTy.getSizeInBits() == 1) { 2281 const RegisterBank *DstBank = 2282 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; 2283 if (DstBank == &AMDGPU::VCCRegBank) 2284 break; 2285 2286 MachineFunction *MF = MI.getParent()->getParent(); 2287 ApplyRegBankMapping ApplyBank(*this, MRI, DstBank); 2288 MachineIRBuilder B(MI, ApplyBank); 2289 LegalizerHelper Helper(*MF, ApplyBank, B); 2290 2291 if (Helper.widenScalar(MI, 0, LLT::scalar(32)) != 2292 LegalizerHelper::Legalized) 2293 llvm_unreachable("widen scalar should have succeeded"); 2294 return; 2295 } 2296 2297 if (DstTy.getSizeInBits() != 64) 2298 break; 2299 2300 LLT HalfTy = getHalfSizedType(DstTy); 2301 SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0)); 2302 SmallVector<Register, 2> Src0Regs(OpdMapper.getVRegs(1)); 2303 SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2)); 2304 2305 // All inputs are SGPRs, nothing special to do. 2306 if (DefRegs.empty()) { 2307 assert(Src0Regs.empty() && Src1Regs.empty()); 2308 break; 2309 } 2310 2311 assert(DefRegs.size() == 2); 2312 assert(Src0Regs.size() == Src1Regs.size() && 2313 (Src0Regs.empty() || Src0Regs.size() == 2)); 2314 2315 // Depending on where the source registers came from, the generic code may 2316 // have decided to split the inputs already or not. If not, we still need to 2317 // extract the values. 2318 MachineIRBuilder B(MI); 2319 2320 if (Src0Regs.empty()) 2321 split64BitValueForMapping(B, Src0Regs, HalfTy, MI.getOperand(1).getReg()); 2322 else 2323 setRegsToType(MRI, Src0Regs, HalfTy); 2324 2325 if (Src1Regs.empty()) 2326 split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg()); 2327 else 2328 setRegsToType(MRI, Src1Regs, HalfTy); 2329 2330 setRegsToType(MRI, DefRegs, HalfTy); 2331 2332 B.buildInstr(Opc, {DefRegs[0]}, {Src0Regs[0], Src1Regs[0]}); 2333 B.buildInstr(Opc, {DefRegs[1]}, {Src0Regs[1], Src1Regs[1]}); 2334 2335 MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank); 2336 MI.eraseFromParent(); 2337 return; 2338 } 2339 case AMDGPU::G_ADD: 2340 case AMDGPU::G_SUB: 2341 case AMDGPU::G_MUL: 2342 case AMDGPU::G_SHL: 2343 case AMDGPU::G_LSHR: 2344 case AMDGPU::G_ASHR: { 2345 Register DstReg = MI.getOperand(0).getReg(); 2346 LLT DstTy = MRI.getType(DstReg); 2347 2348 // 16-bit operations are VALU only, but can be promoted to 32-bit SALU. 2349 // Packed 16-bit operations need to be scalarized and promoted. 
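    // For example (illustrative): a uniform s16 G_ADD is widened to an s32
    // G_ADD selectable as a 32-bit SALU op, while a uniform <2 x s16> G_ADD
    // is unpacked into two s32 adds and repacked with G_BUILD_VECTOR_TRUNC
    // below.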
2350 if (DstTy != LLT::scalar(16) && DstTy != LLT::vector(2, 16)) 2351 break; 2352 2353 const RegisterBank *DstBank = 2354 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; 2355 if (DstBank == &AMDGPU::VGPRRegBank) 2356 break; 2357 2358 const LLT S32 = LLT::scalar(32); 2359 MachineBasicBlock *MBB = MI.getParent(); 2360 MachineFunction *MF = MBB->getParent(); 2361 ApplyRegBankMapping ApplySALU(*this, MRI, &AMDGPU::SGPRRegBank); 2362 MachineIRBuilder B(MI, ApplySALU); 2363 2364 if (DstTy.isVector()) { 2365 Register WideSrc0Lo, WideSrc0Hi; 2366 Register WideSrc1Lo, WideSrc1Hi; 2367 2368 std::tie(WideSrc0Lo, WideSrc0Hi) 2369 = unpackV2S16ToS32(B, MI.getOperand(1).getReg(), AMDGPU::G_ANYEXT); 2370 std::tie(WideSrc1Lo, WideSrc1Hi) 2371 = unpackV2S16ToS32(B, MI.getOperand(2).getReg(), AMDGPU::G_ANYEXT); 2372 auto Lo = B.buildInstr(MI.getOpcode(), {S32}, {WideSrc0Lo, WideSrc1Lo}); 2373 auto Hi = B.buildInstr(MI.getOpcode(), {S32}, {WideSrc0Hi, WideSrc1Hi}); 2374 B.buildBuildVectorTrunc(DstReg, {Lo.getReg(0), Hi.getReg(0)}); 2375 MI.eraseFromParent(); 2376 } else { 2377 LegalizerHelper Helper(*MF, ApplySALU, B); 2378 2379 if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized) 2380 llvm_unreachable("widen scalar should have succeeded"); 2381 2382 // FIXME: s16 shift amounts should be legal. 2383 if (Opc == AMDGPU::G_SHL || Opc == AMDGPU::G_LSHR || 2384 Opc == AMDGPU::G_ASHR) { 2385 B.setInsertPt(*MBB, MI.getIterator()); 2386 if (Helper.widenScalar(MI, 1, S32) != LegalizerHelper::Legalized) 2387 llvm_unreachable("widen scalar should have succeeded"); 2388 } 2389 } 2390 2391 return; 2392 } 2393 case AMDGPU::G_SMIN: 2394 case AMDGPU::G_SMAX: 2395 case AMDGPU::G_UMIN: 2396 case AMDGPU::G_UMAX: { 2397 Register DstReg = MI.getOperand(0).getReg(); 2398 const RegisterBank *DstBank = 2399 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; 2400 if (DstBank == &AMDGPU::VGPRRegBank) 2401 break; 2402 2403 MachineFunction *MF = MI.getParent()->getParent(); 2404 MachineIRBuilder B(MI); 2405 2406 // Turn scalar min/max into a compare and select. 2407 LLT Ty = MRI.getType(DstReg); 2408 const LLT S32 = LLT::scalar(32); 2409 const LLT S16 = LLT::scalar(16); 2410 const LLT V2S16 = LLT::vector(2, 16); 2411 2412 if (Ty == V2S16) { 2413 ApplyRegBankMapping ApplySALU(*this, MRI, &AMDGPU::SGPRRegBank); 2414 B.setChangeObserver(ApplySALU); 2415 2416 // Need to widen to s32, and expand as cmp + select, and avoid producing 2417 // illegal vector extends or unmerges that would need further 2418 // legalization. 2419 // 2420 // TODO: Should we just readfirstlane? That should probably be handled 2421 // with a UniformVGPR register bank that wouldn't need special 2422 // consideration here. 
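      // Sketch of the expansion used below (illustrative): bitcast the
      // <2 x s16> input to s32, widen each half to s32 with the extend
      // matching the min/max flavor (sext for smin/smax, zext for
      // umin/umax), emit a 32-bit compare + select per half, and repack the
      // two results with G_BUILD_VECTOR_TRUNC.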
2423 2424 Register Dst = MI.getOperand(0).getReg(); 2425 Register Src0 = MI.getOperand(1).getReg(); 2426 Register Src1 = MI.getOperand(2).getReg(); 2427 2428 Register WideSrc0Lo, WideSrc0Hi; 2429 Register WideSrc1Lo, WideSrc1Hi; 2430 2431 unsigned ExtendOp = minMaxToExtend(MI.getOpcode()); 2432 2433 std::tie(WideSrc0Lo, WideSrc0Hi) = unpackV2S16ToS32(B, Src0, ExtendOp); 2434 std::tie(WideSrc1Lo, WideSrc1Hi) = unpackV2S16ToS32(B, Src1, ExtendOp); 2435 2436 Register Lo = MRI.createGenericVirtualRegister(S32); 2437 Register Hi = MRI.createGenericVirtualRegister(S32); 2438 const CmpInst::Predicate Pred = minMaxToCompare(MI.getOpcode()); 2439 buildExpandedScalarMinMax(B, Pred, Lo, WideSrc0Lo, WideSrc1Lo); 2440 buildExpandedScalarMinMax(B, Pred, Hi, WideSrc0Hi, WideSrc1Hi); 2441 2442 B.buildBuildVectorTrunc(Dst, {Lo, Hi}); 2443 MI.eraseFromParent(); 2444 } else if (Ty == S16) { 2445 ApplyRegBankMapping ApplySALU(*this, MRI, &AMDGPU::SGPRRegBank); 2446 B.setChangeObserver(ApplySALU); 2447 LegalizerHelper Helper(*MF, ApplySALU, B); 2448 2449 // Need to widen to s32, and expand as cmp + select. 2450 if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized) 2451 llvm_unreachable("widenScalar should have succeeded"); 2452 2453 // FIXME: This is relying on widenScalar leaving MI in place. 2454 lowerScalarMinMax(B, MI); 2455 } else 2456 lowerScalarMinMax(B, MI); 2457 2458 return; 2459 } 2460 case AMDGPU::G_SEXT_INREG: { 2461 SmallVector<Register, 2> SrcRegs(OpdMapper.getVRegs(1)); 2462 if (SrcRegs.empty()) 2463 break; // Nothing to repair 2464 2465 const LLT S32 = LLT::scalar(32); 2466 MachineIRBuilder B(MI); 2467 ApplyRegBankMapping O(*this, MRI, &AMDGPU::VGPRRegBank); 2468 GISelObserverWrapper Observer(&O); 2469 B.setChangeObserver(Observer); 2470 2471 // Don't use LegalizerHelper's narrowScalar. It produces unwanted G_SEXTs 2472 // we would need to further expand, and doesn't let us directly set the 2473 // result registers. 2474 SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0)); 2475 2476 int Amt = MI.getOperand(2).getImm(); 2477 if (Amt <= 32) { 2478 if (Amt == 32) { 2479 // The low bits are unchanged. 2480 B.buildCopy(DstRegs[0], SrcRegs[0]); 2481 } else { 2482 // Extend in the low bits and propagate the sign bit to the high half. 2483 B.buildSExtInReg(DstRegs[0], SrcRegs[0], Amt); 2484 } 2485 2486 B.buildAShr(DstRegs[1], DstRegs[0], B.buildConstant(S32, 31)); 2487 } else { 2488 // The low bits are unchanged, and extend in the high bits. 
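      // (Illustrative: for Amt == 48 on an s64 value, the low 32 bits pass
      // through unchanged and the high half becomes sext_inreg(src_hi, 16).)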
2489       B.buildCopy(DstRegs[0], SrcRegs[0]);
2490       B.buildSExtInReg(DstRegs[1], SrcRegs[1], Amt - 32);
2491     }
2492
2493     Register DstReg = MI.getOperand(0).getReg();
2494     MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
2495     MI.eraseFromParent();
2496     return;
2497   }
2498   case AMDGPU::G_CTPOP:
2499   case AMDGPU::G_CTLZ_ZERO_UNDEF:
2500   case AMDGPU::G_CTTZ_ZERO_UNDEF: {
2501     const RegisterBank *DstBank =
2502       OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2503     if (DstBank == &AMDGPU::SGPRRegBank)
2504       break;
2505
2506     Register SrcReg = MI.getOperand(1).getReg();
2507     const LLT S32 = LLT::scalar(32);
2508     LLT Ty = MRI.getType(SrcReg);
2509     if (Ty == S32)
2510       break;
2511
2512     ApplyRegBankMapping ApplyVALU(*this, MRI, &AMDGPU::VGPRRegBank);
2513     MachineIRBuilder B(MI, ApplyVALU);
2514
2515     MachineFunction &MF = B.getMF();
2516     LegalizerHelper Helper(MF, ApplyVALU, B);
2517
2518     if (Helper.narrowScalar(MI, 1, S32) != LegalizerHelper::Legalized)
2519       llvm_unreachable("narrowScalar should have succeeded");
2520     return;
2521   }
2522   case AMDGPU::G_SEXT:
2523   case AMDGPU::G_ZEXT:
2524   case AMDGPU::G_ANYEXT: {
2525     Register SrcReg = MI.getOperand(1).getReg();
2526     LLT SrcTy = MRI.getType(SrcReg);
2527     const bool Signed = Opc == AMDGPU::G_SEXT;
2528
2529     assert(empty(OpdMapper.getVRegs(1)));
2530
2531     MachineIRBuilder B(MI);
2532     const RegisterBank *SrcBank =
2533       OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
2534
2535     Register DstReg = MI.getOperand(0).getReg();
2536     LLT DstTy = MRI.getType(DstReg);
2537     if (DstTy.isScalar() &&
2538         SrcBank != &AMDGPU::SGPRRegBank &&
2539         SrcBank != &AMDGPU::VCCRegBank &&
2540         // FIXME: Should handle any type that rounds to s64 when irregular
2541         // breakdowns are supported.
2542         DstTy.getSizeInBits() == 64 &&
2543         SrcTy.getSizeInBits() <= 32) {
2544       SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
2545
2546       // Extend to 32-bit, and then extend the low half.
2547       if (Signed) {
2548         // TODO: Should really be buildSExtOrCopy
2549         B.buildSExtOrTrunc(DefRegs[0], SrcReg);
2550       } else if (Opc == AMDGPU::G_ZEXT) {
2551         B.buildZExtOrTrunc(DefRegs[0], SrcReg);
2552       } else {
2553         B.buildAnyExtOrTrunc(DefRegs[0], SrcReg);
2554       }
2555
2556       extendLow32IntoHigh32(B, DefRegs[1], DefRegs[0], Opc, *SrcBank);
2557       MRI.setRegBank(DstReg, *SrcBank);
2558       MI.eraseFromParent();
2559       return;
2560     }
2561
2562     if (SrcTy != LLT::scalar(1))
2563       return;
2564
2565     // It is not legal to have a legalization artifact with a VCC source. Rather
2566     // than introducing a copy, insert the select we would have to select the
2567     // copy to.
2568     if (SrcBank == &AMDGPU::VCCRegBank) {
2569       SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
2570
2571       const RegisterBank *DstBank = &AMDGPU::VGPRRegBank;
2572
2573       unsigned DstSize = DstTy.getSizeInBits();
2574       // 64-bit select is SGPR only
2575       const bool UseSel64 = DstSize > 32 &&
2576         SrcBank->getID() == AMDGPU::SGPRRegBankID;
2577
2578       // TODO: Should s16 select be legal?
2579       LLT SelType = UseSel64 ? LLT::scalar(64) : LLT::scalar(32);
2580       auto True = B.buildConstant(SelType, Signed ?
-1 : 1); 2581 auto False = B.buildConstant(SelType, 0); 2582 2583 MRI.setRegBank(True.getReg(0), *DstBank); 2584 MRI.setRegBank(False.getReg(0), *DstBank); 2585 MRI.setRegBank(DstReg, *DstBank); 2586 2587 if (DstSize > 32) { 2588 B.buildSelect(DefRegs[0], SrcReg, True, False); 2589 extendLow32IntoHigh32(B, DefRegs[1], DefRegs[0], Opc, *SrcBank, true); 2590 } else if (DstSize < 32) { 2591 auto Sel = B.buildSelect(SelType, SrcReg, True, False); 2592 MRI.setRegBank(Sel.getReg(0), *DstBank); 2593 B.buildTrunc(DstReg, Sel); 2594 } else { 2595 B.buildSelect(DstReg, SrcReg, True, False); 2596 } 2597 2598 MI.eraseFromParent(); 2599 return; 2600 } 2601 2602 break; 2603 } 2604 case AMDGPU::G_BUILD_VECTOR: 2605 case AMDGPU::G_BUILD_VECTOR_TRUNC: { 2606 Register DstReg = MI.getOperand(0).getReg(); 2607 LLT DstTy = MRI.getType(DstReg); 2608 if (DstTy != LLT::vector(2, 16)) 2609 break; 2610 2611 assert(MI.getNumOperands() == 3 && OpdMapper.getVRegs(0).empty()); 2612 substituteSimpleCopyRegs(OpdMapper, 1); 2613 substituteSimpleCopyRegs(OpdMapper, 2); 2614 2615 const RegisterBank *DstBank = 2616 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; 2617 if (DstBank == &AMDGPU::SGPRRegBank) 2618 break; // Can use S_PACK_* instructions. 2619 2620 MachineIRBuilder B(MI); 2621 2622 Register Lo = MI.getOperand(1).getReg(); 2623 Register Hi = MI.getOperand(2).getReg(); 2624 const LLT S32 = LLT::scalar(32); 2625 2626 const RegisterBank *BankLo = 2627 OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank; 2628 const RegisterBank *BankHi = 2629 OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank; 2630 2631 Register ZextLo; 2632 Register ShiftHi; 2633 2634 if (Opc == AMDGPU::G_BUILD_VECTOR) { 2635 ZextLo = B.buildZExt(S32, Lo).getReg(0); 2636 MRI.setRegBank(ZextLo, *BankLo); 2637 2638 Register ZextHi = B.buildZExt(S32, Hi).getReg(0); 2639 MRI.setRegBank(ZextHi, *BankHi); 2640 2641 auto ShiftAmt = B.buildConstant(S32, 16); 2642 MRI.setRegBank(ShiftAmt.getReg(0), *BankHi); 2643 2644 ShiftHi = B.buildShl(S32, ZextHi, ShiftAmt).getReg(0); 2645 MRI.setRegBank(ShiftHi, *BankHi); 2646 } else { 2647 Register MaskLo = B.buildConstant(S32, 0xffff).getReg(0); 2648 MRI.setRegBank(MaskLo, *BankLo); 2649 2650 auto ShiftAmt = B.buildConstant(S32, 16); 2651 MRI.setRegBank(ShiftAmt.getReg(0), *BankHi); 2652 2653 ShiftHi = B.buildShl(S32, Hi, ShiftAmt).getReg(0); 2654 MRI.setRegBank(ShiftHi, *BankHi); 2655 2656 ZextLo = B.buildAnd(S32, Lo, MaskLo).getReg(0); 2657 MRI.setRegBank(ZextLo, *BankLo); 2658 } 2659 2660 auto Or = B.buildOr(S32, ZextLo, ShiftHi); 2661 MRI.setRegBank(Or.getReg(0), *DstBank); 2662 2663 B.buildBitcast(DstReg, Or); 2664 MI.eraseFromParent(); 2665 return; 2666 } 2667 case AMDGPU::G_EXTRACT_VECTOR_ELT: { 2668 SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0)); 2669 2670 assert(OpdMapper.getVRegs(1).empty() && OpdMapper.getVRegs(2).empty()); 2671 2672 Register DstReg = MI.getOperand(0).getReg(); 2673 Register SrcReg = MI.getOperand(1).getReg(); 2674 2675 const LLT S32 = LLT::scalar(32); 2676 LLT DstTy = MRI.getType(DstReg); 2677 LLT SrcTy = MRI.getType(SrcReg); 2678 2679 if (foldExtractEltToCmpSelect(MI, MRI, OpdMapper)) 2680 return; 2681 2682 MachineIRBuilder B(MI); 2683 2684 const ValueMapping &DstMapping 2685 = OpdMapper.getInstrMapping().getOperandMapping(0); 2686 const RegisterBank *DstBank = DstMapping.BreakDown[0].RegBank; 2687 const RegisterBank *SrcBank = 2688 OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank; 2689 const 
RegisterBank *IdxBank = 2690 OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank; 2691 2692 Register BaseIdxReg; 2693 unsigned ConstOffset; 2694 std::tie(BaseIdxReg, ConstOffset) = 2695 AMDGPU::getBaseWithConstantOffset(MRI, MI.getOperand(2).getReg()); 2696 2697 // See if the index is an add of a constant which will be foldable by moving 2698 // the base register of the index later if this is going to be executed in a 2699 // waterfall loop. This is essentially to reassociate the add of a constant 2700 // with the readfirstlane. 2701 bool ShouldMoveIndexIntoLoop = IdxBank != &AMDGPU::SGPRRegBank && 2702 ConstOffset > 0 && 2703 ConstOffset < SrcTy.getNumElements(); 2704 2705 // Move the base register. We'll re-insert the add later. 2706 if (ShouldMoveIndexIntoLoop) 2707 MI.getOperand(2).setReg(BaseIdxReg); 2708 2709 // If this is a VGPR result only because the index was a VGPR result, the 2710 // actual indexing will be done on the SGPR source vector, which will 2711 // produce a scalar result. We need to copy to the VGPR result inside the 2712 // waterfall loop. 2713 const bool NeedCopyToVGPR = DstBank == &AMDGPU::VGPRRegBank && 2714 SrcBank == &AMDGPU::SGPRRegBank; 2715 if (DstRegs.empty()) { 2716 applyDefaultMapping(OpdMapper); 2717 2718 executeInWaterfallLoop(MI, MRI, { 2 }); 2719 2720 if (NeedCopyToVGPR) { 2721 // We don't want a phi for this temporary reg. 2722 Register TmpReg = MRI.createGenericVirtualRegister(DstTy); 2723 MRI.setRegBank(TmpReg, AMDGPU::SGPRRegBank); 2724 MI.getOperand(0).setReg(TmpReg); 2725 B.setInsertPt(*MI.getParent(), ++MI.getIterator()); 2726 2727 // Use a v_mov_b32 here to make the exec dependency explicit. 2728 buildVCopy(B, DstReg, TmpReg); 2729 } 2730 2731 // Re-insert the constant offset add inside the waterfall loop. 2732 if (ShouldMoveIndexIntoLoop) 2733 reinsertVectorIndexAdd(B, MI, 2, ConstOffset); 2734 2735 return; 2736 } 2737 2738 assert(DstTy.getSizeInBits() == 64); 2739 2740 LLT Vec32 = LLT::vector(2 * SrcTy.getNumElements(), 32); 2741 2742 auto CastSrc = B.buildBitcast(Vec32, SrcReg); 2743 auto One = B.buildConstant(S32, 1); 2744 2745 MachineBasicBlock::iterator MII = MI.getIterator(); 2746 2747 // Split the vector index into 32-bit pieces. Prepare to move all of the 2748 // new instructions into a waterfall loop if necessary. 2749 // 2750 // Don't put the bitcast or constant in the loop. 2751 MachineInstrSpan Span(MII, &B.getMBB()); 2752 2753 // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1). 2754 auto IdxLo = B.buildShl(S32, BaseIdxReg, One); 2755 auto IdxHi = B.buildAdd(S32, IdxLo, One); 2756 2757 auto Extract0 = B.buildExtractVectorElement(DstRegs[0], CastSrc, IdxLo); 2758 auto Extract1 = B.buildExtractVectorElement(DstRegs[1], CastSrc, IdxHi); 2759 2760 MRI.setRegBank(DstReg, *DstBank); 2761 MRI.setRegBank(CastSrc.getReg(0), *SrcBank); 2762 MRI.setRegBank(One.getReg(0), AMDGPU::SGPRRegBank); 2763 MRI.setRegBank(IdxLo.getReg(0), AMDGPU::SGPRRegBank); 2764 MRI.setRegBank(IdxHi.getReg(0), AMDGPU::SGPRRegBank); 2765 2766 SmallSet<Register, 4> OpsToWaterfall; 2767 if (!collectWaterfallOperands(OpsToWaterfall, MI, MRI, { 2 })) { 2768 MI.eraseFromParent(); 2769 return; 2770 } 2771 2772 // Remove the original instruction to avoid potentially confusing the 2773 // waterfall loop logic. 
2774 B.setInstr(*Span.begin()); 2775 MI.eraseFromParent(); 2776 executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()), 2777 OpsToWaterfall, MRI); 2778 2779 if (NeedCopyToVGPR) { 2780 MachineBasicBlock *LoopBB = Extract1->getParent(); 2781 Register TmpReg0 = MRI.createGenericVirtualRegister(S32); 2782 Register TmpReg1 = MRI.createGenericVirtualRegister(S32); 2783 MRI.setRegBank(TmpReg0, AMDGPU::SGPRRegBank); 2784 MRI.setRegBank(TmpReg1, AMDGPU::SGPRRegBank); 2785 2786 Extract0->getOperand(0).setReg(TmpReg0); 2787 Extract1->getOperand(0).setReg(TmpReg1); 2788 2789 B.setInsertPt(*LoopBB, ++Extract1->getIterator()); 2790 2791 buildVCopy(B, DstRegs[0], TmpReg0); 2792 buildVCopy(B, DstRegs[1], TmpReg1); 2793 } 2794 2795 if (ShouldMoveIndexIntoLoop) 2796 reinsertVectorIndexAdd(B, *IdxLo, 1, ConstOffset); 2797 2798 return; 2799 } 2800 case AMDGPU::G_INSERT_VECTOR_ELT: { 2801 SmallVector<Register, 2> InsRegs(OpdMapper.getVRegs(2)); 2802 2803 Register DstReg = MI.getOperand(0).getReg(); 2804 LLT VecTy = MRI.getType(DstReg); 2805 2806 assert(OpdMapper.getVRegs(0).empty()); 2807 assert(OpdMapper.getVRegs(3).empty()); 2808 2809 if (substituteSimpleCopyRegs(OpdMapper, 1)) 2810 MRI.setType(MI.getOperand(1).getReg(), VecTy); 2811 2812 if (foldInsertEltToCmpSelect(MI, MRI, OpdMapper)) 2813 return; 2814 2815 const RegisterBank *IdxBank = 2816 OpdMapper.getInstrMapping().getOperandMapping(3).BreakDown[0].RegBank; 2817 2818 Register SrcReg = MI.getOperand(1).getReg(); 2819 Register InsReg = MI.getOperand(2).getReg(); 2820 LLT InsTy = MRI.getType(InsReg); 2821 (void)InsTy; 2822 2823 Register BaseIdxReg; 2824 unsigned ConstOffset; 2825 std::tie(BaseIdxReg, ConstOffset) = 2826 AMDGPU::getBaseWithConstantOffset(MRI, MI.getOperand(3).getReg()); 2827 2828 // See if the index is an add of a constant which will be foldable by moving 2829 // the base register of the index later if this is going to be executed in a 2830 // waterfall loop. This is essentially to reassociate the add of a constant 2831 // with the readfirstlane. 2832 bool ShouldMoveIndexIntoLoop = IdxBank != &AMDGPU::SGPRRegBank && 2833 ConstOffset > 0 && 2834 ConstOffset < VecTy.getNumElements(); 2835 2836 // Move the base register. We'll re-insert the add later. 2837 if (ShouldMoveIndexIntoLoop) 2838 MI.getOperand(3).setReg(BaseIdxReg); 2839 2840 2841 if (InsRegs.empty()) { 2842 executeInWaterfallLoop(MI, MRI, { 3 }); 2843 2844 // Re-insert the constant offset add inside the waterfall loop. 2845 if (ShouldMoveIndexIntoLoop) { 2846 MachineIRBuilder B(MI); 2847 reinsertVectorIndexAdd(B, MI, 3, ConstOffset); 2848 } 2849 2850 return; 2851 } 2852 2853 2854 assert(InsTy.getSizeInBits() == 64); 2855 2856 const LLT S32 = LLT::scalar(32); 2857 LLT Vec32 = LLT::vector(2 * VecTy.getNumElements(), 32); 2858 2859 MachineIRBuilder B(MI); 2860 auto CastSrc = B.buildBitcast(Vec32, SrcReg); 2861 auto One = B.buildConstant(S32, 1); 2862 2863 // Split the vector index into 32-bit pieces. Prepare to move all of the 2864 // new instructions into a waterfall loop if necessary. 2865 // 2866 // Don't put the bitcast or constant in the loop. 2867 MachineInstrSpan Span(MachineBasicBlock::iterator(&MI), &B.getMBB()); 2868 2869 // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1). 
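    // Example (illustrative): inserting a 64-bit element at dynamic index
    // %idx into <4 x s64> becomes two 32-bit insert-elements into <8 x s32>
    // at indices 2 * %idx and 2 * %idx + 1.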
2870 auto IdxLo = B.buildShl(S32, BaseIdxReg, One); 2871 auto IdxHi = B.buildAdd(S32, IdxLo, One); 2872 2873 auto InsLo = B.buildInsertVectorElement(Vec32, CastSrc, InsRegs[0], IdxLo); 2874 auto InsHi = B.buildInsertVectorElement(Vec32, InsLo, InsRegs[1], IdxHi); 2875 2876 const RegisterBank *DstBank = 2877 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; 2878 const RegisterBank *SrcBank = 2879 OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank; 2880 const RegisterBank *InsSrcBank = 2881 OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank; 2882 2883 MRI.setRegBank(InsReg, *InsSrcBank); 2884 MRI.setRegBank(CastSrc.getReg(0), *SrcBank); 2885 MRI.setRegBank(InsLo.getReg(0), *DstBank); 2886 MRI.setRegBank(InsHi.getReg(0), *DstBank); 2887 MRI.setRegBank(One.getReg(0), AMDGPU::SGPRRegBank); 2888 MRI.setRegBank(IdxLo.getReg(0), AMDGPU::SGPRRegBank); 2889 MRI.setRegBank(IdxHi.getReg(0), AMDGPU::SGPRRegBank); 2890 2891 2892 SmallSet<Register, 4> OpsToWaterfall; 2893 if (!collectWaterfallOperands(OpsToWaterfall, MI, MRI, { 3 })) { 2894 B.setInsertPt(B.getMBB(), MI); 2895 B.buildBitcast(DstReg, InsHi); 2896 MI.eraseFromParent(); 2897 return; 2898 } 2899 2900 B.setInstr(*Span.begin()); 2901 MI.eraseFromParent(); 2902 2903 // Figure out the point after the waterfall loop before mangling the control 2904 // flow. 2905 executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()), 2906 OpsToWaterfall, MRI); 2907 2908 // The insertion point is now right after the original instruction. 2909 // 2910 // Keep the bitcast to the original vector type out of the loop. Doing this 2911 // saved an extra phi we don't need inside the loop. 2912 B.buildBitcast(DstReg, InsHi); 2913 2914 // Re-insert the constant offset add inside the waterfall loop. 
2915 if (ShouldMoveIndexIntoLoop) 2916 reinsertVectorIndexAdd(B, *IdxLo, 1, ConstOffset); 2917 2918 return; 2919 } 2920 case AMDGPU::G_AMDGPU_BUFFER_LOAD: 2921 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT: 2922 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT: 2923 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE: 2924 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE: 2925 case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT: 2926 case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16: 2927 case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT: 2928 case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16: 2929 case AMDGPU::G_AMDGPU_BUFFER_STORE: 2930 case AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE: 2931 case AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT: 2932 case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT: 2933 case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16: 2934 case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT: 2935 case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16: { 2936 applyDefaultMapping(OpdMapper); 2937 executeInWaterfallLoop(MI, MRI, {1, 4}); 2938 return; 2939 } 2940 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP: 2941 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD: 2942 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB: 2943 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN: 2944 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN: 2945 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX: 2946 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX: 2947 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND: 2948 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR: 2949 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR: 2950 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC: 2951 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC: { 2952 applyDefaultMapping(OpdMapper); 2953 executeInWaterfallLoop(MI, MRI, {2, 5}); 2954 return; 2955 } 2956 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD: { 2957 applyDefaultMapping(OpdMapper); 2958 executeInWaterfallLoop(MI, MRI, {2, 5}); 2959 return; 2960 } 2961 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP: { 2962 applyDefaultMapping(OpdMapper); 2963 executeInWaterfallLoop(MI, MRI, {3, 6}); 2964 return; 2965 } 2966 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD: { 2967 applyMappingSBufferLoad(OpdMapper); 2968 return; 2969 } 2970 case AMDGPU::G_INTRINSIC: { 2971 switch (MI.getIntrinsicID()) { 2972 case Intrinsic::amdgcn_readlane: { 2973 substituteSimpleCopyRegs(OpdMapper, 2); 2974 2975 assert(OpdMapper.getVRegs(0).empty()); 2976 assert(OpdMapper.getVRegs(3).empty()); 2977 2978 // Make sure the index is an SGPR. It doesn't make sense to run this in a 2979 // waterfall loop, so assume it's a uniform value. 2980 constrainOpWithReadfirstlane(MI, MRI, 3); // Index 2981 return; 2982 } 2983 case Intrinsic::amdgcn_writelane: { 2984 assert(OpdMapper.getVRegs(0).empty()); 2985 assert(OpdMapper.getVRegs(2).empty()); 2986 assert(OpdMapper.getVRegs(3).empty()); 2987 2988 substituteSimpleCopyRegs(OpdMapper, 4); // VGPR input val 2989 constrainOpWithReadfirstlane(MI, MRI, 2); // Source value 2990 constrainOpWithReadfirstlane(MI, MRI, 3); // Index 2991 return; 2992 } 2993 case Intrinsic::amdgcn_interp_p1: 2994 case Intrinsic::amdgcn_interp_p2: 2995 case Intrinsic::amdgcn_interp_mov: 2996 case Intrinsic::amdgcn_interp_p1_f16: 2997 case Intrinsic::amdgcn_interp_p2_f16: { 2998 applyDefaultMapping(OpdMapper); 2999 3000 // Readlane for m0 value, which is always the last operand. 3001 // FIXME: Should this be a waterfall loop instead? 3002 constrainOpWithReadfirstlane(MI, MRI, MI.getNumOperands() - 1); // Index 3003 return; 3004 } 3005 case Intrinsic::amdgcn_permlane16: 3006 case Intrinsic::amdgcn_permlanex16: { 3007 // Doing a waterfall loop over these wouldn't make any sense. 
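      // The lane-select operands (operands 4 and 5 here) are required to be
      // uniform, so they are constrained with readfirstlane below instead of
      // being waterfalled.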
3008       substituteSimpleCopyRegs(OpdMapper, 2);
3009       substituteSimpleCopyRegs(OpdMapper, 3);
3010       constrainOpWithReadfirstlane(MI, MRI, 4);
3011       constrainOpWithReadfirstlane(MI, MRI, 5);
3012       return;
3013     }
3014     case Intrinsic::amdgcn_sbfe:
3015       applyMappingBFEIntrinsic(OpdMapper, true);
3016       return;
3017     case Intrinsic::amdgcn_ubfe:
3018       applyMappingBFEIntrinsic(OpdMapper, false);
3019       return;
3020     case Intrinsic::amdgcn_ballot:
3021       // Use default handling and insert copy to vcc source.
3022       break;
3023     }
3024     break;
3025   }
3026   case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
3027   case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE: {
3028     const AMDGPU::RsrcIntrinsic *RSrcIntrin
3029       = AMDGPU::lookupRsrcIntrinsic(MI.getIntrinsicID());
3030     assert(RSrcIntrin && RSrcIntrin->IsImage);
3031     // Non-images can have complications from operands that allow both SGPR
3032     // and VGPR. For now it's too complicated to figure out the final opcode
3033     // to derive the register bank from the MCInstrDesc.
3034     applyMappingImage(MI, OpdMapper, MRI, RSrcIntrin->RsrcArg);
3035     return;
3036   }
3037   case AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY: {
3038     unsigned N = MI.getNumExplicitOperands() - 2;
3039     executeInWaterfallLoop(MI, MRI, { N });
3040     return;
3041   }
3042   case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: {
3043     auto IntrID = MI.getIntrinsicID();
3044     switch (IntrID) {
3045     case Intrinsic::amdgcn_ds_ordered_add:
3046     case Intrinsic::amdgcn_ds_ordered_swap: {
3047       // This is only allowed to execute with 1 lane, so readfirstlane is safe.
3048       assert(OpdMapper.getVRegs(0).empty());
3049       substituteSimpleCopyRegs(OpdMapper, 3);
3050       constrainOpWithReadfirstlane(MI, MRI, 2); // M0
3051       return;
3052     }
3053     case Intrinsic::amdgcn_ds_gws_init:
3054     case Intrinsic::amdgcn_ds_gws_barrier:
3055     case Intrinsic::amdgcn_ds_gws_sema_br: {
3056       // Only the first lane executes, so readfirstlane is safe.
3057       substituteSimpleCopyRegs(OpdMapper, 1);
3058       constrainOpWithReadfirstlane(MI, MRI, 2); // M0
3059       return;
3060     }
3061     case Intrinsic::amdgcn_ds_gws_sema_v:
3062     case Intrinsic::amdgcn_ds_gws_sema_p:
3063     case Intrinsic::amdgcn_ds_gws_sema_release_all: {
3064       // Only the first lane executes, so readfirstlane is safe.
3065       constrainOpWithReadfirstlane(MI, MRI, 1); // M0
3066       return;
3067     }
3068     case Intrinsic::amdgcn_ds_append:
3069     case Intrinsic::amdgcn_ds_consume: {
3070       constrainOpWithReadfirstlane(MI, MRI, 2); // M0
3071       return;
3072     }
3073     case Intrinsic::amdgcn_s_sendmsg:
3074     case Intrinsic::amdgcn_s_sendmsghalt: {
3075       // FIXME: Should this use a waterfall loop?
3076       constrainOpWithReadfirstlane(MI, MRI, 2); // M0
3077       return;
3078     }
3079     case Intrinsic::amdgcn_s_setreg: {
3080       constrainOpWithReadfirstlane(MI, MRI, 2);
3081       return;
3082     }
3083     default: {
3084       if (const AMDGPU::RsrcIntrinsic *RSrcIntrin =
3085             AMDGPU::lookupRsrcIntrinsic(IntrID)) {
3086         // Non-images can have complications from operands that allow both SGPR
3087         // and VGPR. For now it's too complicated to figure out the final opcode
3088         // to derive the register bank from the MCInstrDesc.
3089         if (RSrcIntrin->IsImage) {
3090           applyMappingImage(MI, OpdMapper, MRI, RSrcIntrin->RsrcArg);
3091           return;
3092         }
3093       }
3094
3095       break;
3096     }
3097     }
3098     break;
3099   }
3100   case AMDGPU::G_LOAD:
3101   case AMDGPU::G_ZEXTLOAD:
3102   case AMDGPU::G_SEXTLOAD: {
3103     if (applyMappingLoad(MI, OpdMapper, MRI))
3104       return;
3105     break;
3106   }
3107   case AMDGPU::G_DYN_STACKALLOC:
3108     applyMappingDynStackAlloc(MI, OpdMapper, MRI);
3109     return;
3110   default:
3111     break;
3112   }
3113
3114   return applyDefaultMapping(OpdMapper);
3115 }
3116
3117 // vgpr, sgpr -> vgpr
3118 // vgpr, agpr -> vgpr
3119 // agpr, agpr -> agpr
3120 // agpr, sgpr -> vgpr
3121 static unsigned regBankUnion(unsigned RB0, unsigned RB1) {
3122   if (RB0 == AMDGPU::InvalidRegBankID)
3123     return RB1;
3124   if (RB1 == AMDGPU::InvalidRegBankID)
3125     return RB0;
3126
3127   if (RB0 == AMDGPU::SGPRRegBankID && RB1 == AMDGPU::SGPRRegBankID)
3128     return AMDGPU::SGPRRegBankID;
3129
3130   if (RB0 == AMDGPU::AGPRRegBankID && RB1 == AMDGPU::AGPRRegBankID)
3131     return AMDGPU::AGPRRegBankID;
3132
3133   return AMDGPU::VGPRRegBankID;
3134 }
3135
3136 static unsigned regBankBoolUnion(unsigned RB0, unsigned RB1) {
3137   if (RB0 == AMDGPU::InvalidRegBankID)
3138     return RB1;
3139   if (RB1 == AMDGPU::InvalidRegBankID)
3140     return RB0;
3141
3142   // vcc, vcc -> vcc
3143   // vcc, sgpr -> vcc
3144   // vcc, vgpr -> vcc
3145   if (RB0 == AMDGPU::VCCRegBankID || RB1 == AMDGPU::VCCRegBankID)
3146     return AMDGPU::VCCRegBankID;
3147
3148   // sgpr, vgpr -> vgpr
3149   return regBankUnion(RB0, RB1);
3150 }
3151
3152 unsigned AMDGPURegisterBankInfo::getMappingType(const MachineRegisterInfo &MRI,
3153                                                 const MachineInstr &MI) const {
3154   unsigned RegBank = AMDGPU::InvalidRegBankID;
3155
3156   for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
3157     if (!MI.getOperand(i).isReg())
3158       continue;
3159     Register Reg = MI.getOperand(i).getReg();
3160     if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) {
3161       RegBank = regBankUnion(RegBank, Bank->getID());
3162       if (RegBank == AMDGPU::VGPRRegBankID)
3163         break;
3164     }
3165   }
3166
3167   return RegBank;
3168 }
3169
3170 bool AMDGPURegisterBankInfo::isSALUMapping(const MachineInstr &MI) const {
3171   const MachineFunction &MF = *MI.getParent()->getParent();
3172   const MachineRegisterInfo &MRI = MF.getRegInfo();
3173   for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
3174     if (!MI.getOperand(i).isReg())
3175       continue;
3176     Register Reg = MI.getOperand(i).getReg();
3177     if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) {
3178       if (Bank->getID() != AMDGPU::SGPRRegBankID)
3179         return false;
3180     }
3181   }
3182   return true;
3183 }
3184
3185 const RegisterBankInfo::InstructionMapping &
3186 AMDGPURegisterBankInfo::getDefaultMappingSOP(const MachineInstr &MI) const {
3187   const MachineFunction &MF = *MI.getParent()->getParent();
3188   const MachineRegisterInfo &MRI = MF.getRegInfo();
3189   SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
3190
3191   for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
3192     const MachineOperand &SrcOp = MI.getOperand(i);
3193     if (!SrcOp.isReg())
3194       continue;
3195
3196     unsigned Size = getSizeInBits(SrcOp.getReg(), MRI, *TRI);
3197     OpdsMapping[i] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
3198   }
3199   return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
3200                                MI.getNumOperands());
3201 }
3202
3203 const RegisterBankInfo::InstructionMapping &
3204 AMDGPURegisterBankInfo::getDefaultMappingVOP(const MachineInstr &MI) const {
3205   const MachineFunction &MF = *MI.getParent()->getParent();
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());

  // Even though we technically could use SGPRs, this would require knowledge of
  // the constant bus restriction. Force all sources to VGPR (except for VCC).
  //
  // TODO: Unary ops are trivially OK, so accept SGPRs?
  for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
    const MachineOperand &Src = MI.getOperand(i);
    if (!Src.isReg())
      continue;

    unsigned Size = getSizeInBits(Src.getReg(), MRI, *TRI);
    unsigned BankID = Size == 1 ? AMDGPU::VCCRegBankID : AMDGPU::VGPRRegBankID;
    OpdsMapping[i] = AMDGPU::getValueMapping(BankID, Size);
  }

  return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
                               MI.getNumOperands());
}

const RegisterBankInfo::InstructionMapping &
AMDGPURegisterBankInfo::getDefaultMappingAllVGPR(const MachineInstr &MI) const {
  const MachineFunction &MF = *MI.getParent()->getParent();
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());

  for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
    const MachineOperand &Op = MI.getOperand(I);
    if (!Op.isReg())
      continue;

    unsigned Size = getSizeInBits(Op.getReg(), MRI, *TRI);
    OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
  }

  return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
                               MI.getNumOperands());
}

const RegisterBankInfo::InstructionMapping &
AMDGPURegisterBankInfo::getImageMapping(const MachineRegisterInfo &MRI,
                                        const MachineInstr &MI,
                                        int RsrcIdx) const {
  // The reported argument index is relative to the IR intrinsic call arguments,
  // so we need to shift by the number of defs and the intrinsic ID.
  RsrcIdx += MI.getNumExplicitDefs() + 1;

  const int NumOps = MI.getNumOperands();
  SmallVector<const ValueMapping *, 8> OpdsMapping(NumOps);

  // TODO: Should packed/unpacked D16 difference be reported here as part of
  // the value mapping?
  for (int I = 0; I != NumOps; ++I) {
    if (!MI.getOperand(I).isReg())
      continue;

    Register OpReg = MI.getOperand(I).getReg();
    // We replace some dead address operands with $noreg.
    if (!OpReg)
      continue;

    unsigned Size = getSizeInBits(OpReg, MRI, *TRI);

    // FIXME: Probably need a new intrinsic register bank searchable table to
    // handle arbitrary intrinsics easily.
    //
    // If this has a sampler, it immediately follows rsrc.
    const bool MustBeSGPR = I == RsrcIdx || I == RsrcIdx + 1;

    if (MustBeSGPR) {
      // If this must be an SGPR, we must report whatever it is as legal.
      unsigned NewBank = getRegBankID(OpReg, MRI, AMDGPU::SGPRRegBankID);
      OpdsMapping[I] = AMDGPU::getValueMapping(NewBank, Size);
    } else {
      // Some operands must be VGPR, and these are easy to copy to.
      OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
    }
  }

  return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping), NumOps);
}

/// Return the mapping for a pointer argument.
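/// Pointers outside a flat-global address space, or any pointer when the
/// subtarget uses flat instructions for global accesses, are mapped to VGPR;
/// otherwise the pointer keeps its currently assigned bank, since an SGPR
/// base register remains usable.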
const RegisterBankInfo::ValueMapping *
AMDGPURegisterBankInfo::getValueMappingForPtr(const MachineRegisterInfo &MRI,
                                              Register PtrReg) const {
  LLT PtrTy = MRI.getType(PtrReg);
  unsigned Size = PtrTy.getSizeInBits();
  if (Subtarget.useFlatForGlobal() ||
      !AMDGPU::isFlatGlobalAddrSpace(PtrTy.getAddressSpace()))
    return AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);

  // If we're using MUBUF instructions for global memory, an SGPR base register
  // is possible. Otherwise this needs to be a VGPR.
  const RegisterBank *PtrBank = getRegBank(PtrReg, MRI, *TRI);
  return AMDGPU::getValueMapping(PtrBank->getID(), Size);
}

const RegisterBankInfo::InstructionMapping &
AMDGPURegisterBankInfo::getInstrMappingForLoad(const MachineInstr &MI) const {

  const MachineFunction &MF = *MI.getParent()->getParent();
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  SmallVector<const ValueMapping*, 2> OpdsMapping(2);
  unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
  Register PtrReg = MI.getOperand(1).getReg();
  LLT PtrTy = MRI.getType(PtrReg);
  unsigned AS = PtrTy.getAddressSpace();
  unsigned PtrSize = PtrTy.getSizeInBits();

  const ValueMapping *ValMapping;
  const ValueMapping *PtrMapping;

  const RegisterBank *PtrBank = getRegBank(PtrReg, MRI, *TRI);

  if (PtrBank == &AMDGPU::SGPRRegBank && AMDGPU::isFlatGlobalAddrSpace(AS)) {
    if (isScalarLoadLegal(MI)) {
      // We have a uniform instruction, so we want to use an SMRD load.
      ValMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
      PtrMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize);
    } else {
      ValMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);

      // If we're using MUBUF instructions for global memory, an SGPR base
      // register is possible. Otherwise this needs to be a VGPR.
      unsigned PtrBankID = Subtarget.useFlatForGlobal() ?
        AMDGPU::VGPRRegBankID : AMDGPU::SGPRRegBankID;

      PtrMapping = AMDGPU::getValueMapping(PtrBankID, PtrSize);
    }
  } else {
    ValMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
    PtrMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize);
  }

  OpdsMapping[0] = ValMapping;
  OpdsMapping[1] = PtrMapping;
  const RegisterBankInfo::InstructionMapping &Mapping = getInstructionMapping(
      1, 1, getOperandsMapping(OpdsMapping), MI.getNumOperands());
  return Mapping;

  // FIXME: Do we want to add a mapping for FLAT load, or should we just
  // handle that during instruction selection?
}

unsigned
AMDGPURegisterBankInfo::getRegBankID(Register Reg,
                                     const MachineRegisterInfo &MRI,
                                     unsigned Default) const {
  const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
  return Bank ? Bank->getID() : Default;
}

const RegisterBankInfo::ValueMapping *
AMDGPURegisterBankInfo::getSGPROpMapping(Register Reg,
                                         const MachineRegisterInfo &MRI,
                                         const TargetRegisterInfo &TRI) const {
  // Lie and claim anything is legal, even though this needs to be an SGPR;
  // applyMapping will have to deal with it as a waterfall loop.
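  // Returning the operand's current bank (defaulting to SGPR) means the
  // mapping is never rejected here; applyMappingImpl repairs divergent VGPR
  // inputs afterwards.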
  unsigned Bank = getRegBankID(Reg, MRI, AMDGPU::SGPRRegBankID);
  unsigned Size = getSizeInBits(Reg, MRI, TRI);
  return AMDGPU::getValueMapping(Bank, Size);
}

const RegisterBankInfo::ValueMapping *
AMDGPURegisterBankInfo::getVGPROpMapping(Register Reg,
                                         const MachineRegisterInfo &MRI,
                                         const TargetRegisterInfo &TRI) const {
  unsigned Size = getSizeInBits(Reg, MRI, TRI);
  return AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
}

const RegisterBankInfo::ValueMapping *
AMDGPURegisterBankInfo::getAGPROpMapping(Register Reg,
                                         const MachineRegisterInfo &MRI,
                                         const TargetRegisterInfo &TRI) const {
  unsigned Size = getSizeInBits(Reg, MRI, TRI);
  return AMDGPU::getValueMapping(AMDGPU::AGPRRegBankID, Size);
}

///
/// This function must return a legal mapping, because
/// AMDGPURegisterBankInfo::getInstrAlternativeMappings() is not called
/// in RegBankSelect::Mode::Fast. Any mapping that would cause a
/// VGPR to SGPR copy to be generated is illegal.
///
/// Operands that must be SGPRs must accept potentially divergent VGPRs as
/// legal. These will be dealt with in applyMappingImpl.
///
const RegisterBankInfo::InstructionMapping &
AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
  const MachineFunction &MF = *MI.getParent()->getParent();
  const MachineRegisterInfo &MRI = MF.getRegInfo();

  if (MI.isCopy() || MI.getOpcode() == AMDGPU::G_FREEZE) {
    // The default logic bothers to analyze impossible alternative mappings. We
    // want the most straightforward mapping, so just directly handle this.
    const RegisterBank *DstBank = getRegBank(MI.getOperand(0).getReg(), MRI,
                                             *TRI);
    const RegisterBank *SrcBank = getRegBank(MI.getOperand(1).getReg(), MRI,
                                             *TRI);
    assert(SrcBank && "src bank should have been assigned already");
    if (!DstBank)
      DstBank = SrcBank;

    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
    if (cannotCopy(*DstBank, *SrcBank, Size))
      return getInvalidInstructionMapping();

    const ValueMapping &ValMap = getValueMapping(0, Size, *DstBank);
    unsigned OpdsMappingSize = MI.isCopy() ? 1 : 2;
    SmallVector<const ValueMapping *, 1> OpdsMapping(OpdsMappingSize);
    OpdsMapping[0] = &ValMap;
    if (MI.getOpcode() == AMDGPU::G_FREEZE)
      OpdsMapping[1] = &ValMap;

    return getInstructionMapping(
        1, /*Cost*/ 1,
        /*OperandsMapping*/ getOperandsMapping(OpdsMapping), OpdsMappingSize);
  }

  if (MI.isRegSequence()) {
    // If any input is a VGPR, the result must be a VGPR. The default handling
    // assumes any copy between banks is legal.
    unsigned BankID = AMDGPU::SGPRRegBankID;

    for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
      auto OpBank = getRegBankID(MI.getOperand(I).getReg(), MRI);
      // It doesn't make sense to use vcc or scc banks here, so just ignore
      // them.
      if (OpBank != AMDGPU::SGPRRegBankID) {
        BankID = AMDGPU::VGPRRegBankID;
        break;
      }
    }
    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);

    const ValueMapping &ValMap = getValueMapping(0, Size, getRegBank(BankID));
    return getInstructionMapping(
        1, /*Cost*/ 1,
        /*OperandsMapping*/ getOperandsMapping({&ValMap}), 1);
  }

  // The default handling is broken and doesn't handle illegal VGPR to SGPR
  // copies properly.
  //
  // TODO: There are additional exec masking dependencies to analyze.
  if (MI.getOpcode() == TargetOpcode::G_PHI) {
    unsigned ResultBank = AMDGPU::InvalidRegBankID;
    Register DstReg = MI.getOperand(0).getReg();

    // Sometimes the result may have already been assigned a bank.
    if (const RegisterBank *DstBank = getRegBank(DstReg, MRI, *TRI))
      ResultBank = DstBank->getID();

    for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
      Register Reg = MI.getOperand(I).getReg();
      const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);

      // FIXME: Assuming VGPR for any undetermined inputs.
      if (!Bank || Bank->getID() == AMDGPU::VGPRRegBankID) {
        ResultBank = AMDGPU::VGPRRegBankID;
        break;
      }

      // FIXME: Need to promote SGPR case to s32.
      unsigned OpBank = Bank->getID();
      ResultBank = regBankBoolUnion(ResultBank, OpBank);
    }

    assert(ResultBank != AMDGPU::InvalidRegBankID);

    unsigned Size = MRI.getType(DstReg).getSizeInBits();

    const ValueMapping &ValMap =
        getValueMapping(0, Size, getRegBank(ResultBank));
    return getInstructionMapping(
        1, /*Cost*/ 1,
        /*OperandsMapping*/ getOperandsMapping({&ValMap}), 1);
  }

  const RegisterBankInfo::InstructionMapping &Mapping = getInstrMappingImpl(MI);
  if (Mapping.isValid())
    return Mapping;

  SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());

  switch (MI.getOpcode()) {
  default:
    return getInvalidInstructionMapping();

  case AMDGPU::G_AND:
  case AMDGPU::G_OR:
  case AMDGPU::G_XOR: {
    unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    if (Size == 1) {
      const RegisterBank *DstBank
        = getRegBank(MI.getOperand(0).getReg(), MRI, *TRI);

      unsigned TargetBankID = AMDGPU::InvalidRegBankID;
      unsigned BankLHS = AMDGPU::InvalidRegBankID;
      unsigned BankRHS = AMDGPU::InvalidRegBankID;
      if (DstBank) {
        TargetBankID = DstBank->getID();
        if (DstBank == &AMDGPU::VCCRegBank) {
          TargetBankID = AMDGPU::VCCRegBankID;
          BankLHS = AMDGPU::VCCRegBankID;
          BankRHS = AMDGPU::VCCRegBankID;
        } else {
          BankLHS = getRegBankID(MI.getOperand(1).getReg(), MRI,
                                 AMDGPU::SGPRRegBankID);
          BankRHS = getRegBankID(MI.getOperand(2).getReg(), MRI,
                                 AMDGPU::SGPRRegBankID);
        }
      } else {
        BankLHS = getRegBankID(MI.getOperand(1).getReg(), MRI,
                               AMDGPU::VCCRegBankID);
        BankRHS = getRegBankID(MI.getOperand(2).getReg(), MRI,
                               AMDGPU::VCCRegBankID);

        // Both inputs should be true booleans to produce a boolean result.
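        // Any VGPR input forces a VGPR result, any vcc input forces vcc, and
        // only all-SGPR inputs keep the SGPR mapping.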
        if (BankLHS == AMDGPU::VGPRRegBankID ||
            BankRHS == AMDGPU::VGPRRegBankID) {
          TargetBankID = AMDGPU::VGPRRegBankID;
        } else if (BankLHS == AMDGPU::VCCRegBankID ||
                   BankRHS == AMDGPU::VCCRegBankID) {
          TargetBankID = AMDGPU::VCCRegBankID;
          BankLHS = AMDGPU::VCCRegBankID;
          BankRHS = AMDGPU::VCCRegBankID;
        } else if (BankLHS == AMDGPU::SGPRRegBankID &&
                   BankRHS == AMDGPU::SGPRRegBankID) {
          TargetBankID = AMDGPU::SGPRRegBankID;
        }
      }

      OpdsMapping[0] = AMDGPU::getValueMapping(TargetBankID, Size);
      OpdsMapping[1] = AMDGPU::getValueMapping(BankLHS, Size);
      OpdsMapping[2] = AMDGPU::getValueMapping(BankRHS, Size);
      break;
    }

    if (Size == 64) {
      if (isSALUMapping(MI)) {
        OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::SGPRRegBankID, Size);
        OpdsMapping[1] = OpdsMapping[2] = OpdsMapping[0];
      } else {
        OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size);
        unsigned Bank1 = getRegBankID(MI.getOperand(1).getReg(), MRI /*, DefaultBankID*/);
        OpdsMapping[1] = AMDGPU::getValueMapping(Bank1, Size);

        unsigned Bank2 = getRegBankID(MI.getOperand(2).getReg(), MRI /*, DefaultBankID*/);
        OpdsMapping[2] = AMDGPU::getValueMapping(Bank2, Size);
      }

      break;
    }

    LLVM_FALLTHROUGH;
  }
  case AMDGPU::G_PTR_ADD:
  case AMDGPU::G_PTRMASK:
  case AMDGPU::G_ADD:
  case AMDGPU::G_SUB:
  case AMDGPU::G_MUL:
  case AMDGPU::G_SHL:
  case AMDGPU::G_LSHR:
  case AMDGPU::G_ASHR:
  case AMDGPU::G_UADDO:
  case AMDGPU::G_USUBO:
  case AMDGPU::G_UADDE:
  case AMDGPU::G_SADDE:
  case AMDGPU::G_USUBE:
  case AMDGPU::G_SSUBE:
  case AMDGPU::G_SMIN:
  case AMDGPU::G_SMAX:
  case AMDGPU::G_UMIN:
  case AMDGPU::G_UMAX:
  case AMDGPU::G_SHUFFLE_VECTOR:
    if (isSALUMapping(MI))
      return getDefaultMappingSOP(MI);
    LLVM_FALLTHROUGH;

  case AMDGPU::G_SADDSAT: // FIXME: Could lower sat ops for SALU
  case AMDGPU::G_SSUBSAT:
  case AMDGPU::G_UADDSAT:
  case AMDGPU::G_USUBSAT:
  case AMDGPU::G_FADD:
  case AMDGPU::G_FSUB:
  case AMDGPU::G_FPTOSI:
  case AMDGPU::G_FPTOUI:
  case AMDGPU::G_FMUL:
  case AMDGPU::G_FMA:
  case AMDGPU::G_FMAD:
  case AMDGPU::G_FSQRT:
  case AMDGPU::G_FFLOOR:
  case AMDGPU::G_FCEIL:
  case AMDGPU::G_FRINT:
  case AMDGPU::G_SITOFP:
  case AMDGPU::G_UITOFP:
  case AMDGPU::G_FPTRUNC:
  case AMDGPU::G_FPEXT:
  case AMDGPU::G_FEXP2:
  case AMDGPU::G_FLOG2:
  case AMDGPU::G_FMINNUM:
  case AMDGPU::G_FMAXNUM:
  case AMDGPU::G_FMINNUM_IEEE:
  case AMDGPU::G_FMAXNUM_IEEE:
  case AMDGPU::G_FCANONICALIZE:
  case AMDGPU::G_INTRINSIC_TRUNC:
  case AMDGPU::G_BSWAP: // TODO: Somehow expand for scalar?
  case AMDGPU::G_FSHR: // TODO: Expand for scalar
  case AMDGPU::G_AMDGPU_FFBH_U32:
  case AMDGPU::G_AMDGPU_FMIN_LEGACY:
  case AMDGPU::G_AMDGPU_FMAX_LEGACY:
  case AMDGPU::G_AMDGPU_RCP_IFLAG:
  case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0:
  case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1:
  case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2:
  case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
    return getDefaultMappingVOP(MI);
  case AMDGPU::G_UMULH:
  case AMDGPU::G_SMULH: {
    if (Subtarget.hasScalarMulHiInsts() && isSALUMapping(MI))
      return getDefaultMappingSOP(MI);
    return getDefaultMappingVOP(MI);
  }
  case AMDGPU::G_IMPLICIT_DEF: {
    unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
    break;
  }
  case AMDGPU::G_FCONSTANT:
  case AMDGPU::G_CONSTANT:
  case AMDGPU::G_GLOBAL_VALUE:
  case AMDGPU::G_BLOCK_ADDR:
  case AMDGPU::G_READCYCLECOUNTER: {
    unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
    break;
  }
  case AMDGPU::G_FRAME_INDEX: {
    // TODO: This should be the same as other constants, but eliminateFrameIndex
    // currently assumes VALU uses.
    unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
    break;
  }
  case AMDGPU::G_DYN_STACKALLOC: {
    // Result is always uniform, and a wave reduction is needed for the source.
    OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
    unsigned SrcBankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
    OpdsMapping[1] = AMDGPU::getValueMapping(SrcBankID, 32);
    break;
  }
  case AMDGPU::G_INSERT: {
    unsigned BankID = getMappingType(MRI, MI);
    unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
    unsigned SrcSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
    unsigned EltSize = getSizeInBits(MI.getOperand(2).getReg(), MRI, *TRI);
    OpdsMapping[0] = AMDGPU::getValueMapping(BankID, DstSize);
    OpdsMapping[1] = AMDGPU::getValueMapping(BankID, SrcSize);
    OpdsMapping[2] = AMDGPU::getValueMapping(BankID, EltSize);
    OpdsMapping[3] = nullptr;
    break;
  }
  case AMDGPU::G_EXTRACT: {
    unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
    unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
    unsigned SrcSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
    OpdsMapping[0] = AMDGPU::getValueMapping(BankID, DstSize);
    OpdsMapping[1] = AMDGPU::getValueMapping(BankID, SrcSize);
    OpdsMapping[2] = nullptr;
    break;
  }
  case AMDGPU::G_BUILD_VECTOR:
  case AMDGPU::G_BUILD_VECTOR_TRUNC: {
    LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
    if (DstTy == LLT::vector(2, 16)) {
      unsigned DstSize = DstTy.getSizeInBits();
      unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
      unsigned Src0BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
      unsigned Src1BankID = getRegBankID(MI.getOperand(2).getReg(), MRI);
      unsigned DstBankID = regBankUnion(Src0BankID, Src1BankID);

      OpdsMapping[0] = AMDGPU::getValueMapping(DstBankID, DstSize);
      OpdsMapping[1] = AMDGPU::getValueMapping(Src0BankID, SrcSize);
      OpdsMapping[2] = AMDGPU::getValueMapping(Src1BankID, SrcSize);
      break;
    }

    LLVM_FALLTHROUGH;
  }
  case AMDGPU::G_MERGE_VALUES:
  case AMDGPU::G_CONCAT_VECTORS: {
    unsigned Bank = getMappingType(MRI, MI);
    unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();

    OpdsMapping[0] = AMDGPU::getValueMapping(Bank, DstSize);
    // Op1 and Dst should use the same register bank.
    for (unsigned i = 1, e = MI.getNumOperands(); i != e; ++i)
      OpdsMapping[i] = AMDGPU::getValueMapping(Bank, SrcSize);
    break;
  }
  case AMDGPU::G_BITCAST:
  case AMDGPU::G_INTTOPTR:
  case AMDGPU::G_PTRTOINT:
  case AMDGPU::G_BITREVERSE:
  case AMDGPU::G_FABS:
  case AMDGPU::G_FNEG: {
    unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
    OpdsMapping[0] = OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size);
    break;
  }
  case AMDGPU::G_CTLZ_ZERO_UNDEF:
  case AMDGPU::G_CTTZ_ZERO_UNDEF:
  case AMDGPU::G_CTPOP: {
    unsigned Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
    unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
    OpdsMapping[0] = AMDGPU::getValueMapping(BankID, 32);

    // This should really be getValueMappingSGPR64Only, but allowing the generic
    // code to handle the register split just makes using LegalizerHelper more
    // difficult.
    OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size);
    break;
  }
  case AMDGPU::G_TRUNC: {
    Register Dst = MI.getOperand(0).getReg();
    Register Src = MI.getOperand(1).getReg();
    unsigned Bank = getRegBankID(Src, MRI);
    unsigned DstSize = getSizeInBits(Dst, MRI, *TRI);
    unsigned SrcSize = getSizeInBits(Src, MRI, *TRI);
    OpdsMapping[0] = AMDGPU::getValueMapping(Bank, DstSize);
    OpdsMapping[1] = AMDGPU::getValueMapping(Bank, SrcSize);
    break;
  }
  case AMDGPU::G_ZEXT:
  case AMDGPU::G_SEXT:
  case AMDGPU::G_ANYEXT:
  case AMDGPU::G_SEXT_INREG: {
    Register Dst = MI.getOperand(0).getReg();
    Register Src = MI.getOperand(1).getReg();
    unsigned DstSize = getSizeInBits(Dst, MRI, *TRI);
    unsigned SrcSize = getSizeInBits(Src, MRI, *TRI);

    unsigned DstBank;
    const RegisterBank *SrcBank = getRegBank(Src, MRI, *TRI);
    assert(SrcBank);
    switch (SrcBank->getID()) {
    case AMDGPU::SGPRRegBankID:
      DstBank = AMDGPU::SGPRRegBankID;
      break;
    default:
      DstBank = AMDGPU::VGPRRegBankID;
      break;
    }

    // Scalar extend can use 64-bit BFE, but VGPRs require extending to
    // 32-bits, and then to 64.
    OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(DstBank, DstSize);
    OpdsMapping[1] = AMDGPU::getValueMappingSGPR64Only(SrcBank->getID(),
                                                       SrcSize);
    break;
  }
  case AMDGPU::G_FCMP: {
    unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
    unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI);
    OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
    OpdsMapping[1] = nullptr; // Predicate Operand.
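    // G_FCMP only has VALU forms, so the result is always vcc. Src0 keeps
    // whatever bank it already has; forcing src1 to VGPR trivially satisfies
    // the constant bus restriction.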
    OpdsMapping[2] = AMDGPU::getValueMapping(Op2Bank, Size);
    OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
    break;
  }
  case AMDGPU::G_STORE: {
    assert(MI.getOperand(0).isReg());
    unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();

    // FIXME: We need to specify a different reg bank once scalar stores are
    // supported.
    const ValueMapping *ValMapping =
        AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
    OpdsMapping[0] = ValMapping;
    OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg());
    break;
  }
  case AMDGPU::G_ICMP: {
    auto Pred = static_cast<CmpInst::Predicate>(MI.getOperand(1).getPredicate());
    unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();

    // See if the result register has already been constrained to vcc, which may
    // happen due to control flow intrinsic lowering.
    unsigned DstBank = getRegBankID(MI.getOperand(0).getReg(), MRI,
                                    AMDGPU::SGPRRegBankID);
    unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI);
    unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI);

    bool CanUseSCC = DstBank == AMDGPU::SGPRRegBankID &&
                     Op2Bank == AMDGPU::SGPRRegBankID &&
                     Op3Bank == AMDGPU::SGPRRegBankID &&
                     (Size == 32 || (Size == 64 &&
                      (Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE) &&
                      Subtarget.hasScalarCompareEq64()));

    DstBank = CanUseSCC ? AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID;
    unsigned SrcBank = CanUseSCC ? AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;

    // TODO: Use 32-bit for scalar output size.
    // SCC results will need to be copied to a 32-bit SGPR virtual register.
    const unsigned ResultSize = 1;

    OpdsMapping[0] = AMDGPU::getValueMapping(DstBank, ResultSize);
    OpdsMapping[2] = AMDGPU::getValueMapping(SrcBank, Size);
    OpdsMapping[3] = AMDGPU::getValueMapping(SrcBank, Size);
    break;
  }
  case AMDGPU::G_EXTRACT_VECTOR_ELT: {
    // A VGPR index can be handled with a waterfall loop when indexing an SGPR
    // vector.
    unsigned SrcBankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
    unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
    unsigned IdxSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
    unsigned IdxBank = getRegBankID(MI.getOperand(2).getReg(), MRI);
    unsigned OutputBankID = regBankUnion(SrcBankID, IdxBank);

    OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(OutputBankID, DstSize);
    OpdsMapping[1] = AMDGPU::getValueMapping(SrcBankID, SrcSize);

    // The index can be in either bank if the source vector is VGPR.
    OpdsMapping[2] = AMDGPU::getValueMapping(IdxBank, IdxSize);
    break;
  }
  case AMDGPU::G_INSERT_VECTOR_ELT: {
    unsigned OutputBankID = isSALUMapping(MI) ?
      AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;

    unsigned VecSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    unsigned InsertSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
    unsigned IdxSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits();
    unsigned InsertEltBankID = getRegBankID(MI.getOperand(2).getReg(), MRI);
    unsigned IdxBankID = getRegBankID(MI.getOperand(3).getReg(), MRI);

    OpdsMapping[0] = AMDGPU::getValueMapping(OutputBankID, VecSize);
    OpdsMapping[1] = AMDGPU::getValueMapping(OutputBankID, VecSize);

    // This is a weird case, because we need to break down the mapping based on
    // the register bank of a different operand.
    if (InsertSize == 64 && OutputBankID == AMDGPU::VGPRRegBankID) {
      OpdsMapping[2] = AMDGPU::getValueMappingSplit64(InsertEltBankID,
                                                      InsertSize);
    } else {
      assert(InsertSize == 32 || InsertSize == 64);
      OpdsMapping[2] = AMDGPU::getValueMapping(InsertEltBankID, InsertSize);
    }

    // The index can be in either bank if the source vector is VGPR.
    OpdsMapping[3] = AMDGPU::getValueMapping(IdxBankID, IdxSize);
    break;
  }
  case AMDGPU::G_UNMERGE_VALUES: {
    unsigned Bank = getMappingType(MRI, MI);

    // Op1 and Dst should use the same register bank.
    // FIXME: Shouldn't this be the default? Why do we need to handle this?
    for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
      unsigned Size = getSizeInBits(MI.getOperand(i).getReg(), MRI, *TRI);
      OpdsMapping[i] = AMDGPU::getValueMapping(Bank, Size);
    }
    break;
  }
  case AMDGPU::G_AMDGPU_BUFFER_LOAD:
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE:
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT:
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT:
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16:
  case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT:
  case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16:
  case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT:
  case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16:
  case AMDGPU::G_AMDGPU_BUFFER_STORE:
  case AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE:
  case AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT:
  case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT:
  case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16: {
    OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);

    // rsrc
    OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);

    // vindex
    OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);

    // voffset
    OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);

    // soffset
    OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);

    // Any remaining operands are immediates and were correctly null
    // initialized.
    break;
  }
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD: {
    // vdata_out
    OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);

    // vdata_in
    OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);

    // rsrc
    OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);

    // vindex
    OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);

    // voffset
    OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);

    // soffset
    OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);

    // Any remaining operands are immediates and were correctly null
    // initialized.
    break;
  }
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP: {
    // vdata_out
    OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);

    // vdata_in
    OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);

    // cmp
    OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);

    // rsrc
    OpdsMapping[3] = getSGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);

    // vindex
    OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);

    // voffset
    OpdsMapping[5] = getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);

    // soffset
    OpdsMapping[6] = getSGPROpMapping(MI.getOperand(6).getReg(), MRI, *TRI);

    // Any remaining operands are immediates and were correctly null
    // initialized.
    break;
  }
  case AMDGPU::G_AMDGPU_S_BUFFER_LOAD: {
    // Lie and claim everything is legal, even though some need to be
    // SGPRs. applyMapping will have to deal with it as a waterfall loop.
    OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
    OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);

    // We need to convert this to a MUBUF if either the resource or offset is
    // VGPR.
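    // An all-SGPR mapping can stay a scalar buffer load; any VGPR input makes
    // the result divergent, so take the union of the rsrc and offset banks.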
    unsigned RSrcBank = OpdsMapping[1]->BreakDown[0].RegBank->getID();
    unsigned OffsetBank = OpdsMapping[2]->BreakDown[0].RegBank->getID();
    unsigned ResultBank = regBankUnion(RSrcBank, OffsetBank);

    unsigned Size0 = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    OpdsMapping[0] = AMDGPU::getValueMapping(ResultBank, Size0);
    break;
  }
  case AMDGPU::G_INTRINSIC: {
    switch (MI.getIntrinsicID()) {
    default:
      return getInvalidInstructionMapping();
    case Intrinsic::amdgcn_div_fmas:
    case Intrinsic::amdgcn_div_fixup:
    case Intrinsic::amdgcn_trig_preop:
    case Intrinsic::amdgcn_sin:
    case Intrinsic::amdgcn_cos:
    case Intrinsic::amdgcn_log_clamp:
    case Intrinsic::amdgcn_rcp:
    case Intrinsic::amdgcn_rcp_legacy:
    case Intrinsic::amdgcn_sqrt:
    case Intrinsic::amdgcn_rsq:
    case Intrinsic::amdgcn_rsq_legacy:
    case Intrinsic::amdgcn_rsq_clamp:
    case Intrinsic::amdgcn_fmul_legacy:
    case Intrinsic::amdgcn_fma_legacy:
    case Intrinsic::amdgcn_ldexp:
    case Intrinsic::amdgcn_frexp_mant:
    case Intrinsic::amdgcn_frexp_exp:
    case Intrinsic::amdgcn_fract:
    case Intrinsic::amdgcn_cvt_pkrtz:
    case Intrinsic::amdgcn_cvt_pknorm_i16:
    case Intrinsic::amdgcn_cvt_pknorm_u16:
    case Intrinsic::amdgcn_cvt_pk_i16:
    case Intrinsic::amdgcn_cvt_pk_u16:
    case Intrinsic::amdgcn_fmed3:
    case Intrinsic::amdgcn_cubeid:
    case Intrinsic::amdgcn_cubema:
    case Intrinsic::amdgcn_cubesc:
    case Intrinsic::amdgcn_cubetc:
    case Intrinsic::amdgcn_sffbh:
    case Intrinsic::amdgcn_fmad_ftz:
    case Intrinsic::amdgcn_mbcnt_lo:
    case Intrinsic::amdgcn_mbcnt_hi:
    case Intrinsic::amdgcn_mul_u24:
    case Intrinsic::amdgcn_mul_i24:
    case Intrinsic::amdgcn_lerp:
    case Intrinsic::amdgcn_sad_u8:
    case Intrinsic::amdgcn_msad_u8:
    case Intrinsic::amdgcn_sad_hi_u8:
    case Intrinsic::amdgcn_sad_u16:
    case Intrinsic::amdgcn_qsad_pk_u16_u8:
    case Intrinsic::amdgcn_mqsad_pk_u16_u8:
    case Intrinsic::amdgcn_mqsad_u32_u8:
    case Intrinsic::amdgcn_cvt_pk_u8_f32:
    case Intrinsic::amdgcn_alignbit:
    case Intrinsic::amdgcn_alignbyte:
    case Intrinsic::amdgcn_fdot2:
    case Intrinsic::amdgcn_sdot2:
    case Intrinsic::amdgcn_udot2:
    case Intrinsic::amdgcn_sdot4:
    case Intrinsic::amdgcn_udot4:
    case Intrinsic::amdgcn_sdot8:
    case Intrinsic::amdgcn_udot8:
      return getDefaultMappingVOP(MI);
    case Intrinsic::amdgcn_sbfe:
    case Intrinsic::amdgcn_ubfe:
      if (isSALUMapping(MI))
        return getDefaultMappingSOP(MI);
      return getDefaultMappingVOP(MI);
    case Intrinsic::amdgcn_ds_swizzle:
    case Intrinsic::amdgcn_ds_permute:
    case Intrinsic::amdgcn_ds_bpermute:
    case Intrinsic::amdgcn_update_dpp:
    case Intrinsic::amdgcn_mov_dpp8:
    case Intrinsic::amdgcn_mov_dpp:
    case Intrinsic::amdgcn_wwm:
    case Intrinsic::amdgcn_wqm:
    case Intrinsic::amdgcn_softwqm:
    case Intrinsic::amdgcn_set_inactive:
      return getDefaultMappingAllVGPR(MI);
    case Intrinsic::amdgcn_kernarg_segment_ptr:
    case Intrinsic::amdgcn_s_getpc:
    case Intrinsic::amdgcn_groupstaticsize:
    case Intrinsic::amdgcn_reloc_constant:
    case Intrinsic::returnaddress: {
      unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
      break;
    }
    case Intrinsic::amdgcn_wqm_vote: {
      unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      OpdsMapping[0] = OpdsMapping[2]
        = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size);
      break;
    }
    case Intrinsic::amdgcn_ps_live: {
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
      break;
    }
    case Intrinsic::amdgcn_div_scale: {
      unsigned Dst0Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      unsigned Dst1Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Dst0Size);
      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Dst1Size);

      unsigned SrcSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits();
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
      OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
      break;
    }
    case Intrinsic::amdgcn_class: {
      Register Src0Reg = MI.getOperand(2).getReg();
      Register Src1Reg = MI.getOperand(3).getReg();
      unsigned Src0Size = MRI.getType(Src0Reg).getSizeInBits();
      unsigned Src1Size = MRI.getType(Src1Reg).getSizeInBits();
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, DstSize);
      OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Src0Size);
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Src1Size);
      break;
    }
    case Intrinsic::amdgcn_icmp:
    case Intrinsic::amdgcn_fcmp: {
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      // This is not VCCRegBank because this is not used in boolean contexts.
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
      unsigned OpSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
      OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, OpSize);
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, OpSize);
      break;
    }
    case Intrinsic::amdgcn_readlane: {
      // This must be an SGPR, but accept a VGPR.
      Register IdxReg = MI.getOperand(3).getReg();
      unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits();
      unsigned IdxBank = getRegBankID(IdxReg, MRI, AMDGPU::SGPRRegBankID);
      OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize);
      LLVM_FALLTHROUGH;
    }
    case Intrinsic::amdgcn_readfirstlane: {
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      unsigned SrcSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
      OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
      break;
    }
    case Intrinsic::amdgcn_writelane: {
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      Register SrcReg = MI.getOperand(2).getReg();
      unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits();
      unsigned SrcBank = getRegBankID(SrcReg, MRI, AMDGPU::SGPRRegBankID);
      Register IdxReg = MI.getOperand(3).getReg();
      unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits();
      unsigned IdxBank = getRegBankID(IdxReg, MRI, AMDGPU::SGPRRegBankID);
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);

      // These 2 must be SGPRs, but accept VGPRs. Readfirstlane will be inserted
      // to legalize.
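      // Operand 4 is the tied old-value input and, like the result, must be a
      // VGPR.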
      OpdsMapping[2] = AMDGPU::getValueMapping(SrcBank, SrcSize);
      OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize);
      OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
      break;
    }
    case Intrinsic::amdgcn_if_break: {
      unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
      OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
      break;
    }
    case Intrinsic::amdgcn_permlane16:
    case Intrinsic::amdgcn_permlanex16: {
      unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
      OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
      OpdsMapping[4] = getSGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
      OpdsMapping[5] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_mfma_f32_4x4x1f32:
    case Intrinsic::amdgcn_mfma_f32_4x4x4f16:
    case Intrinsic::amdgcn_mfma_i32_4x4x4i8:
    case Intrinsic::amdgcn_mfma_f32_4x4x2bf16:
    case Intrinsic::amdgcn_mfma_f32_16x16x1f32:
    case Intrinsic::amdgcn_mfma_f32_16x16x4f32:
    case Intrinsic::amdgcn_mfma_f32_16x16x4f16:
    case Intrinsic::amdgcn_mfma_f32_16x16x16f16:
    case Intrinsic::amdgcn_mfma_i32_16x16x4i8:
    case Intrinsic::amdgcn_mfma_i32_16x16x16i8:
    case Intrinsic::amdgcn_mfma_f32_16x16x2bf16:
    case Intrinsic::amdgcn_mfma_f32_16x16x8bf16:
    case Intrinsic::amdgcn_mfma_f32_32x32x1f32:
    case Intrinsic::amdgcn_mfma_f32_32x32x2f32:
    case Intrinsic::amdgcn_mfma_f32_32x32x4f16:
    case Intrinsic::amdgcn_mfma_f32_32x32x8f16:
    case Intrinsic::amdgcn_mfma_i32_32x32x4i8:
    case Intrinsic::amdgcn_mfma_i32_32x32x8i8:
    case Intrinsic::amdgcn_mfma_f32_32x32x2bf16:
    case Intrinsic::amdgcn_mfma_f32_32x32x4bf16: {
      // Default for MAI intrinsics.
      // srcC can also be an immediate which can be folded later.
      // FIXME: Should we eventually add an alternative mapping with AGPR src
      // for srcA/srcB?
      //
      // vdst, srcA, srcB, srcC
      OpdsMapping[0] = getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
      OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
      OpdsMapping[4] = getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_interp_p1:
    case Intrinsic::amdgcn_interp_p2:
    case Intrinsic::amdgcn_interp_mov:
    case Intrinsic::amdgcn_interp_p1_f16:
    case Intrinsic::amdgcn_interp_p2_f16: {
      const int M0Idx = MI.getNumOperands() - 1;
      Register M0Reg = MI.getOperand(M0Idx).getReg();
      unsigned M0Bank = getRegBankID(M0Reg, MRI, AMDGPU::SGPRRegBankID);
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();

      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
      for (int I = 2; I != M0Idx && MI.getOperand(I).isReg(); ++I)
        OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);

      // M0 must be an SGPR, but we take whatever the original bank is and fix
      // it later.
      OpdsMapping[M0Idx] = AMDGPU::getValueMapping(M0Bank, 32);
      break;
    }
    case Intrinsic::amdgcn_ballot: {
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      unsigned SrcSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
      OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, SrcSize);
      break;
    }
    }
    break;
  }
  case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
  case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE: {
    auto IntrID = MI.getIntrinsicID();
    const AMDGPU::RsrcIntrinsic *RSrcIntrin = AMDGPU::lookupRsrcIntrinsic(IntrID);
    assert(RSrcIntrin && "missing RsrcIntrinsic for image intrinsic");
    // Non-images can have complications from operands that allow both SGPR
    // and VGPR. For now it's too complicated to figure out the final opcode
    // to derive the register bank from the MCInstrDesc.
    assert(RSrcIntrin->IsImage);
    return getImageMapping(MRI, MI, RSrcIntrin->RsrcArg);
  }
  case AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY: {
    unsigned N = MI.getNumExplicitOperands() - 2;
    OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 128);
    OpdsMapping[N] = getSGPROpMapping(MI.getOperand(N).getReg(), MRI, *TRI);
    for (unsigned I = 2; I < N; ++I)
      OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
    break;
  }
  case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: {
    auto IntrID = MI.getIntrinsicID();
    switch (IntrID) {
    case Intrinsic::amdgcn_s_getreg:
    case Intrinsic::amdgcn_s_memtime:
    case Intrinsic::amdgcn_s_memrealtime:
    case Intrinsic::amdgcn_s_get_waveid_in_workgroup: {
      unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
      break;
    }
    case Intrinsic::amdgcn_global_atomic_fadd:
    case Intrinsic::amdgcn_global_atomic_csub:
      return getDefaultMappingAllVGPR(MI);
    case Intrinsic::amdgcn_ds_ordered_add:
    case Intrinsic::amdgcn_ds_ordered_swap: {
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
      unsigned M0Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
                                     AMDGPU::SGPRRegBankID);
      OpdsMapping[2] = AMDGPU::getValueMapping(M0Bank, 32);
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      break;
    }
    case Intrinsic::amdgcn_ds_append:
    case Intrinsic::amdgcn_ds_consume: {
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_exp_compr:
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      break;
    case Intrinsic::amdgcn_exp:
      // FIXME: Could we support packed types here?
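      // The four export sources are individual 32-bit VGPR values.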
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      OpdsMapping[5] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      OpdsMapping[6] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      break;
    case Intrinsic::amdgcn_s_sendmsg:
    case Intrinsic::amdgcn_s_sendmsghalt: {
      // This must be an SGPR, but accept a VGPR.
      unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
                                   AMDGPU::SGPRRegBankID);
      OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
      break;
    }
    case Intrinsic::amdgcn_s_setreg: {
      // This must be an SGPR, but accept a VGPR.
      unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
                                   AMDGPU::SGPRRegBankID);
      OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
      break;
    }
    case Intrinsic::amdgcn_end_cf: {
      unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
      break;
    }
    case Intrinsic::amdgcn_else: {
      unsigned WaveSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize);
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize);
      break;
    }
    case Intrinsic::amdgcn_kill: {
      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
      break;
    }
    case Intrinsic::amdgcn_raw_buffer_load:
    case Intrinsic::amdgcn_raw_tbuffer_load: {
      // FIXME: Should make intrinsic ID the last operand of the instruction,
      // then this would be the same as store.
      OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
      OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_raw_buffer_store:
    case Intrinsic::amdgcn_raw_buffer_store_format:
    case Intrinsic::amdgcn_raw_tbuffer_store: {
      OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
      OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_struct_buffer_load:
    case Intrinsic::amdgcn_struct_tbuffer_load: {
      OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
      OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
      OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_struct_buffer_store:
    case Intrinsic::amdgcn_struct_tbuffer_store: {
      OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
      OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
      OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_init_exec_from_input: {
      unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
      break;
    }
    case Intrinsic::amdgcn_ds_gws_init:
    case Intrinsic::amdgcn_ds_gws_barrier:
    case Intrinsic::amdgcn_ds_gws_sema_br: {
      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);

      // This must be an SGPR, but accept a VGPR.
      unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
                                   AMDGPU::SGPRRegBankID);
      OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
      break;
    }
    case Intrinsic::amdgcn_ds_gws_sema_v:
    case Intrinsic::amdgcn_ds_gws_sema_p:
    case Intrinsic::amdgcn_ds_gws_sema_release_all: {
      // This must be an SGPR, but accept a VGPR.
      unsigned Bank = getRegBankID(MI.getOperand(1).getReg(), MRI,
                                   AMDGPU::SGPRRegBankID);
      OpdsMapping[1] = AMDGPU::getValueMapping(Bank, 32);
      break;
    }
    default:
      return getInvalidInstructionMapping();
    }
    break;
  }
  case AMDGPU::G_SELECT: {
    unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
                                    AMDGPU::SGPRRegBankID);
    unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI,
                                    AMDGPU::SGPRRegBankID);
    bool SGPRSrcs = Op2Bank == AMDGPU::SGPRRegBankID &&
                    Op3Bank == AMDGPU::SGPRRegBankID;

    unsigned CondBankDefault = SGPRSrcs ?
      AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID;
    unsigned CondBank = getRegBankID(MI.getOperand(1).getReg(), MRI,
                                     CondBankDefault);
    if (CondBank == AMDGPU::SGPRRegBankID)
      CondBank = SGPRSrcs ? AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID;
    else if (CondBank == AMDGPU::VGPRRegBankID)
      CondBank = AMDGPU::VCCRegBankID;

    unsigned Bank = SGPRSrcs && CondBank == AMDGPU::SGPRRegBankID ?
      AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;

    assert(CondBank == AMDGPU::VCCRegBankID ||
           CondBank == AMDGPU::SGPRRegBankID);

    // TODO: Should report 32-bit for scalar condition type.
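    // A 64-bit VALU select is later split into two 32-bit selects, so report
    // the split VGPR mapping; an SGPR select can remain a single 64-bit
    // operation.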
    if (Size == 64) {
      OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
      OpdsMapping[1] = AMDGPU::getValueMapping(CondBank, 1);
      OpdsMapping[2] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
      OpdsMapping[3] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
    } else {
      OpdsMapping[0] = AMDGPU::getValueMapping(Bank, Size);
      OpdsMapping[1] = AMDGPU::getValueMapping(CondBank, 1);
      OpdsMapping[2] = AMDGPU::getValueMapping(Bank, Size);
      OpdsMapping[3] = AMDGPU::getValueMapping(Bank, Size);
    }

    break;
  }

  case AMDGPU::G_LOAD:
  case AMDGPU::G_ZEXTLOAD:
  case AMDGPU::G_SEXTLOAD:
    return getInstrMappingForLoad(MI);

  case AMDGPU::G_ATOMICRMW_XCHG:
  case AMDGPU::G_ATOMICRMW_ADD:
  case AMDGPU::G_ATOMICRMW_SUB:
  case AMDGPU::G_ATOMICRMW_AND:
  case AMDGPU::G_ATOMICRMW_OR:
  case AMDGPU::G_ATOMICRMW_XOR:
  case AMDGPU::G_ATOMICRMW_MAX:
  case AMDGPU::G_ATOMICRMW_MIN:
  case AMDGPU::G_ATOMICRMW_UMAX:
  case AMDGPU::G_ATOMICRMW_UMIN:
  case AMDGPU::G_ATOMICRMW_FADD:
  case AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG:
  case AMDGPU::G_AMDGPU_ATOMIC_INC:
  case AMDGPU::G_AMDGPU_ATOMIC_DEC:
  case AMDGPU::G_AMDGPU_ATOMIC_FMIN:
  case AMDGPU::G_AMDGPU_ATOMIC_FMAX: {
    OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
    OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg());
    OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
    break;
  }
  case AMDGPU::G_ATOMIC_CMPXCHG: {
    OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
    OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg());
    OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
    OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
    break;
  }
  case AMDGPU::G_BRCOND: {
    unsigned Bank = getRegBankID(MI.getOperand(0).getReg(), MRI,
                                 AMDGPU::SGPRRegBankID);
    assert(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1);
    if (Bank != AMDGPU::SGPRRegBankID)
      Bank = AMDGPU::VCCRegBankID;

    OpdsMapping[0] = AMDGPU::getValueMapping(Bank, 1);
    break;
  }
  }

  return getInstructionMapping(/*ID*/1, /*Cost*/1,
                               getOperandsMapping(OpdsMapping),
                               MI.getNumOperands());
}