//===- AMDGPURegisterBankInfo.cpp -------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the RegisterBankInfo class for
/// AMDGPU.
///
/// \par
///
/// AMDGPU has unique register bank constraints that require special high level
/// strategies to deal with. There are two main true physical register banks:
/// VGPR (vector), and SGPR (scalar). Additionally the VCC register bank is a
/// sort of pseudo-register bank needed to represent SGPRs used in a vector
/// boolean context. There is also the AGPR bank, which is a special purpose
/// physical register bank present on some subtargets.
///
/// Copying from VGPR to SGPR is generally illegal, unless the value is known to
/// be uniform. It is generally not valid to legalize operands by inserting
/// copies as on other targets. Operations which require uniform, SGPR operands
/// generally require scalarization by repeatedly executing the instruction,
/// activating each set of lanes using a unique set of input values. This is
/// referred to as a waterfall loop.
///
/// \par Booleans
///
/// Booleans (s1 values) require special consideration. A vector compare result
/// is naturally a bitmask with one bit per lane, in a 32 or 64-bit
/// register. These are represented with the VCC bank. During selection, we need
/// to be able to unambiguously go back from a register class to a register
/// bank. To distinguish whether an SGPR should use the SGPR or VCC register
/// bank, we need to know the use context type. An SGPR s1 value always means a
/// VCC bank value, otherwise it will be the SGPR bank. A scalar compare sets
/// SCC, which is a 1-bit unaddressable register. This will need to be copied to
/// a 32-bit virtual register. Taken together, this means we need to adjust the
/// type of boolean operations to be regbank legal. All SALU booleans need to be
/// widened to 32-bits, and all VALU booleans need to be s1 values.
///
/// A noteworthy exception to the s1-means-vcc rule is for legalization artifact
/// casts. G_TRUNC s1 results, and G_SEXT/G_ZEXT/G_ANYEXT sources are never vcc
/// bank. A non-boolean source (such as a truncate from a 1-bit load from
/// memory) will require a copy to the VCC bank which will require clearing the
/// high bits and inserting a compare.
///
/// \par Constant bus restriction
///
/// VALU instructions have a limitation known as the constant bus
/// restriction. Most VALU instructions can use SGPR operands, but may read at
/// most 1 SGPR or constant literal value (this is relaxed to 2 in gfx10 for
/// most instructions). This is one unique SGPR, so the same SGPR may be used
/// for multiple operands. From a register bank perspective, any combination of
/// operands should be legal as an SGPR, but this is contextually dependent on
/// the SGPR operands all being the same register. It is therefore optimal to
/// choose the SGPR with the most uses to minimize the number of copies.
///
/// We avoid trying to solve this problem in RegBankSelect.
/// Any VALU G_*
/// operation should have its source operands all mapped to VGPRs (except for
/// VCC), inserting copies from any SGPR operands. This is the most trivial
/// legal mapping. Anything beyond the simplest 1:1 instruction selection would
/// be too complicated to solve here. Every optimization pattern or instruction
/// selected to multiple outputs would have to enforce this rule, and there
/// would be additional complexity in tracking this rule for every G_*
/// operation. By forcing all inputs to VGPRs, it also simplifies the task of
/// picking the optimal operand combination from a post-isel optimization pass.
///
//===----------------------------------------------------------------------===//

#include "AMDGPURegisterBankInfo.h"

#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUInstrInfo.h"
#include "GCNSubtarget.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/RegisterBank.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"

#define GET_TARGET_REGBANK_IMPL
#include "AMDGPUGenRegisterBank.inc"

// This file will be TableGen'ed at some point.
#include "AMDGPUGenRegisterBankInfo.def"

using namespace llvm;
using namespace MIPatternMatch;

namespace {

// Observer to apply a register bank to new registers created by
// LegalizerHelper.
class ApplyRegBankMapping final : public GISelChangeObserver {
private:
  const AMDGPURegisterBankInfo &RBI;
  MachineRegisterInfo &MRI;
  const RegisterBank *NewBank;
  SmallVector<MachineInstr *, 4> NewInsts;

public:
  ApplyRegBankMapping(const AMDGPURegisterBankInfo &RBI_,
                      MachineRegisterInfo &MRI_, const RegisterBank *RB)
      : RBI(RBI_), MRI(MRI_), NewBank(RB) {}

  ~ApplyRegBankMapping() {
    for (MachineInstr *MI : NewInsts)
      applyBank(*MI);
  }

  /// Set any registers that don't have a set register class or bank to SALU.
  void applyBank(MachineInstr &MI) {
    const unsigned Opc = MI.getOpcode();
    if (Opc == AMDGPU::G_ANYEXT || Opc == AMDGPU::G_ZEXT ||
        Opc == AMDGPU::G_SEXT) {
      // LegalizerHelper wants to use the basic legalization artifacts when
      // widening etc. We don't handle selection with vcc in artifact sources,
      // so we need to use a select instead to handle these properly.
      Register DstReg = MI.getOperand(0).getReg();
      Register SrcReg = MI.getOperand(1).getReg();
      const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, MRI, *RBI.TRI);
      if (SrcBank == &AMDGPU::VCCRegBank) {
        const LLT S32 = LLT::scalar(32);
        assert(MRI.getType(SrcReg) == LLT::scalar(1));
        assert(MRI.getType(DstReg) == S32);
        assert(NewBank == &AMDGPU::VGPRRegBank);

        // Replace the extension with a select, which really uses the boolean
        // source.
        MachineIRBuilder B(MI);
        auto True = B.buildConstant(S32, Opc == AMDGPU::G_SEXT ? -1 : 1);
        auto False = B.buildConstant(S32, 0);
        B.buildSelect(DstReg, SrcReg, True, False);
        MRI.setRegBank(True.getReg(0), *NewBank);
        MRI.setRegBank(False.getReg(0), *NewBank);
        MI.eraseFromParent();
      }

      assert(!MRI.getRegClassOrRegBank(DstReg));
      MRI.setRegBank(DstReg, *NewBank);
      return;
    }

#ifndef NDEBUG
    if (Opc == AMDGPU::G_TRUNC) {
      Register DstReg = MI.getOperand(0).getReg();
      const RegisterBank *DstBank = RBI.getRegBank(DstReg, MRI, *RBI.TRI);
      assert(DstBank != &AMDGPU::VCCRegBank);
    }
#endif

    for (MachineOperand &Op : MI.operands()) {
      if (!Op.isReg())
        continue;

      // We may see physical registers if building a real MI
      Register Reg = Op.getReg();
      if (Reg.isPhysical() || MRI.getRegClassOrRegBank(Reg))
        continue;

      const RegisterBank *RB = NewBank;
      if (MRI.getType(Reg) == LLT::scalar(1)) {
        assert(NewBank == &AMDGPU::VGPRRegBank &&
               "s1 operands should only be used for vector bools");
        assert((MI.getOpcode() != AMDGPU::G_TRUNC &&
                MI.getOpcode() != AMDGPU::G_ANYEXT) &&
               "not expecting legalization artifacts here");
        RB = &AMDGPU::VCCRegBank;
      }

      MRI.setRegBank(Reg, *RB);
    }
  }

  void erasingInstr(MachineInstr &MI) override {}

  void createdInstr(MachineInstr &MI) override {
    // At this point, the instruction was just inserted and has no operands.
    NewInsts.push_back(&MI);
  }

  void changingInstr(MachineInstr &MI) override {}
  void changedInstr(MachineInstr &MI) override {
    // FIXME: In principle we should probably add the instruction to NewInsts,
    // but the way the LegalizerHelper uses the observer, we will always see
    // the registers we need to set the regbank on also referenced in a new
    // instruction.
  }
};

} // anonymous namespace

AMDGPURegisterBankInfo::AMDGPURegisterBankInfo(const GCNSubtarget &ST)
    : Subtarget(ST), TRI(Subtarget.getRegisterInfo()),
      TII(Subtarget.getInstrInfo()) {

  // HACK: Until this is fully tablegen'd.
  static llvm::once_flag InitializeRegisterBankFlag;

  static auto InitializeRegisterBankOnce = [this]() {
    assert(&getRegBank(AMDGPU::SGPRRegBankID) == &AMDGPU::SGPRRegBank &&
           &getRegBank(AMDGPU::VGPRRegBankID) == &AMDGPU::VGPRRegBank &&
           &getRegBank(AMDGPU::AGPRRegBankID) == &AMDGPU::AGPRRegBank);
    (void)this;
  };

  llvm::call_once(InitializeRegisterBankFlag, InitializeRegisterBankOnce);
}

static bool isVectorRegisterBank(const RegisterBank &Bank) {
  unsigned BankID = Bank.getID();
  return BankID == AMDGPU::VGPRRegBankID || BankID == AMDGPU::AGPRRegBankID;
}

unsigned AMDGPURegisterBankInfo::copyCost(const RegisterBank &Dst,
                                          const RegisterBank &Src,
                                          unsigned Size) const {
  // TODO: Should there be a UniformVGPRRegBank which can use readfirstlane?
  if (Dst.getID() == AMDGPU::SGPRRegBankID &&
      (isVectorRegisterBank(Src) || Src.getID() == AMDGPU::VCCRegBankID)) {
    return std::numeric_limits<unsigned>::max();
  }

  // Bool values are tricky, because the meaning is based on context. The SCC
  // and VCC banks are for the natural scalar and vector conditions produced by
  // a compare.
  //
  // Legalization doesn't know about the necessary context, so an s1 use may
  // have been a truncate from an arbitrary value, in which case a copy (lowered
  // as a compare with 0) needs to be inserted.
  if (Size == 1 &&
      (Dst.getID() == AMDGPU::SGPRRegBankID) &&
      (isVectorRegisterBank(Src) ||
       Src.getID() == AMDGPU::SGPRRegBankID ||
       Src.getID() == AMDGPU::VCCRegBankID))
    return std::numeric_limits<unsigned>::max();

  // There is no direct copy between AGPRs.
  if (Dst.getID() == AMDGPU::AGPRRegBankID &&
      Src.getID() == AMDGPU::AGPRRegBankID)
    return 4;

  return RegisterBankInfo::copyCost(Dst, Src, Size);
}

unsigned AMDGPURegisterBankInfo::getBreakDownCost(
  const ValueMapping &ValMapping,
  const RegisterBank *CurBank) const {
  // Check if this is a breakdown for G_LOAD to move the pointer from SGPR to
  // VGPR.
  // FIXME: Is there a better way to do this?
  if (ValMapping.NumBreakDowns >= 2 || ValMapping.BreakDown[0].Length >= 64)
    return 10; // This is expensive.

  assert(ValMapping.NumBreakDowns == 2 &&
         ValMapping.BreakDown[0].Length == 32 &&
         ValMapping.BreakDown[0].StartIdx == 0 &&
         ValMapping.BreakDown[1].Length == 32 &&
         ValMapping.BreakDown[1].StartIdx == 32 &&
         ValMapping.BreakDown[0].RegBank == ValMapping.BreakDown[1].RegBank);

  // 32-bit extract of a 64-bit value is just access of a subregister, so free.
  // TODO: Cost of 0 hits assert, though it's not clear it's what we really
  // want.

  // TODO: 32-bit insert to a 64-bit SGPR may incur a non-free copy due to SGPR
  // alignment restrictions, but this probably isn't important.
  return 1;
}

const RegisterBank &
AMDGPURegisterBankInfo::getRegBankFromRegClass(const TargetRegisterClass &RC,
                                               LLT Ty) const {
  if (&RC == &AMDGPU::SReg_1RegClass)
    return AMDGPU::VCCRegBank;

  // We promote real scalar booleans to SReg_32. Any SGPR using s1 is really a
  // VCC-like use.
  if (TRI->isSGPRClass(&RC)) {
    // FIXME: This probably came from a copy from a physical register, which
    // should be inferable from the copied to-type. We don't have many boolean
    // physical register constraints so just assume a normal SGPR for now.
    if (!Ty.isValid())
      return AMDGPU::SGPRRegBank;

    return Ty == LLT::scalar(1) ? AMDGPU::VCCRegBank : AMDGPU::SGPRRegBank;
  }

  return TRI->isAGPRClass(&RC) ? AMDGPU::AGPRRegBank : AMDGPU::VGPRRegBank;
}

template <unsigned NumOps>
RegisterBankInfo::InstructionMappings
AMDGPURegisterBankInfo::addMappingFromTable(
    const MachineInstr &MI, const MachineRegisterInfo &MRI,
    const std::array<unsigned, NumOps> RegSrcOpIdx,
    ArrayRef<OpRegBankEntry<NumOps>> Table) const {

  InstructionMappings AltMappings;

  SmallVector<const ValueMapping *, 10> Operands(MI.getNumOperands());

  unsigned Sizes[NumOps];
  for (unsigned I = 0; I < NumOps; ++I) {
    Register Reg = MI.getOperand(RegSrcOpIdx[I]).getReg();
    Sizes[I] = getSizeInBits(Reg, MRI, *TRI);
  }

  for (unsigned I = 0, E = MI.getNumExplicitDefs(); I != E; ++I) {
    unsigned SizeI = getSizeInBits(MI.getOperand(I).getReg(), MRI, *TRI);
    Operands[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SizeI);
  }

  // getInstrMapping's default mapping uses ID 1, so start at 2.
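  // Each table entry below gets a distinct mapping ID and carries its own
  // cost estimate.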
  unsigned MappingID = 2;
  for (const auto &Entry : Table) {
    for (unsigned I = 0; I < NumOps; ++I) {
      int OpIdx = RegSrcOpIdx[I];
      Operands[OpIdx] = AMDGPU::getValueMapping(Entry.RegBanks[I], Sizes[I]);
    }

    AltMappings.push_back(&getInstructionMapping(MappingID++, Entry.Cost,
                                                 getOperandsMapping(Operands),
                                                 Operands.size()));
  }

  return AltMappings;
}

RegisterBankInfo::InstructionMappings
AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsic(
    const MachineInstr &MI, const MachineRegisterInfo &MRI) const {
  switch (MI.getIntrinsicID()) {
  case Intrinsic::amdgcn_readlane: {
    static const OpRegBankEntry<3> Table[2] = {
      // Perfectly legal.
      { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },

      // Need a readfirstlane for the index.
      { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 }
    };

    const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 2, 3 } };
    return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
  }
  case Intrinsic::amdgcn_writelane: {
    static const OpRegBankEntry<4> Table[4] = {
      // Perfectly legal.
      { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },

      // Need readfirstlane of first op
      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 },

      // Need readfirstlane of second op
      { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 },

      // Need readfirstlane of both ops
      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 3 }
    };

    // dst, value, lane select, vdst_in
    const std::array<unsigned, 4> RegSrcOpIdx = { { 0, 2, 3, 4 } };
    return addMappingFromTable<4>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
  }
  default:
    return RegisterBankInfo::getInstrAlternativeMappings(MI);
  }
}

RegisterBankInfo::InstructionMappings
AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsicWSideEffects(
    const MachineInstr &MI, const MachineRegisterInfo &MRI) const {

  switch (MI.getIntrinsicID()) {
  case Intrinsic::amdgcn_s_buffer_load: {
    static const OpRegBankEntry<2> Table[4] = {
      // Perfectly legal.
      { { AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },

      // Only need 1 register in loop
      { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 300 },

      // Have to waterfall the resource.
      { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1000 },

      // Have to waterfall the resource, and the offset.
      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1500 }
    };

    // rsrc, offset
    const std::array<unsigned, 2> RegSrcOpIdx = { { 2, 3 } };
    return addMappingFromTable<2>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
  }
  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap: {
    // VGPR = M0, VGPR
    static const OpRegBankEntry<3> Table[2] = {
      // Perfectly legal.
      { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },

      // Need a readfirstlane for m0
      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 }
    };

    const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 2, 3 } };
    return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
  }
  case Intrinsic::amdgcn_s_sendmsg:
  case Intrinsic::amdgcn_s_sendmsghalt: {
    // FIXME: Should have no register for immediate
    static const OpRegBankEntry<1> Table[2] = {
      // Perfectly legal.
      { { AMDGPU::SGPRRegBankID }, 1 },

      // Need readlane
      { { AMDGPU::VGPRRegBankID }, 3 }
    };

    const std::array<unsigned, 1> RegSrcOpIdx = { { 2 } };
    return addMappingFromTable<1>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
  }
  default:
    return RegisterBankInfo::getInstrAlternativeMappings(MI);
  }
}

// FIXME: Returns uniform if there's no source value information. This is
// probably wrong.
static bool isScalarLoadLegal(const MachineInstr &MI) {
  if (!MI.hasOneMemOperand())
    return false;

  const MachineMemOperand *MMO = *MI.memoperands_begin();
  const unsigned AS = MMO->getAddrSpace();
  const bool IsConst = AS == AMDGPUAS::CONSTANT_ADDRESS ||
                       AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT;
  // Require 4-byte alignment.
  return MMO->getAlign() >= Align(4) &&
         // Can't do a scalar atomic load.
         !MMO->isAtomic() &&
         // Don't use scalar loads for volatile accesses to non-constant address
         // spaces.
         (IsConst || !MMO->isVolatile()) &&
         // Memory must be known constant, or not written before this load.
         (IsConst || MMO->isInvariant() || (MMO->getFlags() & MONoClobber)) &&
         AMDGPUInstrInfo::isUniformMMO(MMO);
}

RegisterBankInfo::InstructionMappings
AMDGPURegisterBankInfo::getInstrAlternativeMappings(
    const MachineInstr &MI) const {

  const MachineFunction &MF = *MI.getParent()->getParent();
  const MachineRegisterInfo &MRI = MF.getRegInfo();

  InstructionMappings AltMappings;
  switch (MI.getOpcode()) {
  case TargetOpcode::G_CONSTANT: {
    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
    if (Size == 1) {
      static const OpRegBankEntry<1> Table[3] = {
        { { AMDGPU::VGPRRegBankID }, 1 },
        { { AMDGPU::SGPRRegBankID }, 1 },
        { { AMDGPU::VCCRegBankID }, 1 }
      };

      return addMappingFromTable<1>(MI, MRI, {{ 0 }}, Table);
    }

    LLVM_FALLTHROUGH;
  }
  case TargetOpcode::G_FCONSTANT:
  case TargetOpcode::G_FRAME_INDEX:
  case TargetOpcode::G_GLOBAL_VALUE: {
    static const OpRegBankEntry<1> Table[2] = {
      { { AMDGPU::VGPRRegBankID }, 1 },
      { { AMDGPU::SGPRRegBankID }, 1 }
    };

    return addMappingFromTable<1>(MI, MRI, {{ 0 }}, Table);
  }
  case TargetOpcode::G_AND:
  case TargetOpcode::G_OR:
  case TargetOpcode::G_XOR: {
    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);

    if (Size == 1) {
      // s_{and|or|xor}_b32 set scc when the result of the 32-bit op is not 0.
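      // Per the file header notes on booleans, the scalar mapping widens the
      // s1 booleans to 32 bits, while the VCC mapping keeps the natural s1.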
      const InstructionMapping &SCCMapping = getInstructionMapping(
        1, 1, getOperandsMapping(
          {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32),
           AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32),
           AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32)}),
        3); // Num Operands
      AltMappings.push_back(&SCCMapping);

      const InstructionMapping &VCCMapping0 = getInstructionMapping(
        2, 1, getOperandsMapping(
          {AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size),
           AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size),
           AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size)}),
        3); // Num Operands
      AltMappings.push_back(&VCCMapping0);
      return AltMappings;
    }

    if (Size != 64)
      break;

    const InstructionMapping &SSMapping = getInstructionMapping(
      1, 1, getOperandsMapping(
        {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
      3); // Num Operands
    AltMappings.push_back(&SSMapping);

    const InstructionMapping &VVMapping = getInstructionMapping(
      2, 2, getOperandsMapping(
        {AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
         AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
         AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}),
      3); // Num Operands
    AltMappings.push_back(&VVMapping);
    break;
  }
  case TargetOpcode::G_LOAD:
  case TargetOpcode::G_ZEXTLOAD:
  case TargetOpcode::G_SEXTLOAD: {
    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
    LLT PtrTy = MRI.getType(MI.getOperand(1).getReg());
    unsigned PtrSize = PtrTy.getSizeInBits();
    unsigned AS = PtrTy.getAddressSpace();

    if ((AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::REGION_ADDRESS &&
         AS != AMDGPUAS::PRIVATE_ADDRESS) &&
        isScalarLoadLegal(MI)) {
      const InstructionMapping &SSMapping = getInstructionMapping(
        1, 1, getOperandsMapping(
          {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
           AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize)}),
        2); // Num Operands
      AltMappings.push_back(&SSMapping);
    }

    const InstructionMapping &VVMapping = getInstructionMapping(
      2, 1,
      getOperandsMapping(
        {AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
         AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize)}),
      2); // Num Operands
    AltMappings.push_back(&VVMapping);

    // It may be possible to have a vgpr = load sgpr mapping here, because
    // the mubuf instructions support this kind of load, but probably for only
    // gfx7 and older. However, the addressing mode matching in the instruction
    // selector should be able to do a better job of detecting and selecting
    // these kinds of loads from the vgpr = load vgpr mapping.
    return AltMappings;
  }
  case TargetOpcode::G_SELECT: {
    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
    const InstructionMapping &SSMapping = getInstructionMapping(1, 1,
      getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1),
                          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
      4); // Num Operands
    AltMappings.push_back(&SSMapping);

    const InstructionMapping &VVMapping = getInstructionMapping(2, 1,
      getOperandsMapping({AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
                          AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}),
      4); // Num Operands
    AltMappings.push_back(&VVMapping);

    return AltMappings;
  }
  case TargetOpcode::G_UADDE:
  case TargetOpcode::G_USUBE:
  case TargetOpcode::G_SADDE:
  case TargetOpcode::G_SSUBE: {
    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
    const InstructionMapping &SSMapping = getInstructionMapping(1, 1,
      getOperandsMapping(
        {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1),
         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1)}),
      5); // Num Operands
    AltMappings.push_back(&SSMapping);

    const InstructionMapping &VVMapping = getInstructionMapping(2, 1,
      getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
                          AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1)}),
      5); // Num Operands
    AltMappings.push_back(&VVMapping);
    return AltMappings;
  }
  case AMDGPU::G_BRCOND: {
    assert(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1);

    // TODO: Change type to 32 for scalar
    const InstructionMapping &SMapping = getInstructionMapping(
      1, 1, getOperandsMapping(
        {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1), nullptr}),
      2); // Num Operands
    AltMappings.push_back(&SMapping);

    const InstructionMapping &VMapping = getInstructionMapping(
      1, 1, getOperandsMapping(
        {AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1), nullptr}),
      2); // Num Operands
    AltMappings.push_back(&VMapping);
    return AltMappings;
  }
  case AMDGPU::G_INTRINSIC:
    return getInstrAlternativeMappingsIntrinsic(MI, MRI);
  case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS:
    return getInstrAlternativeMappingsIntrinsicWSideEffects(MI, MRI);
  default:
    break;
  }
  return RegisterBankInfo::getInstrAlternativeMappings(MI);
}

void AMDGPURegisterBankInfo::split64BitValueForMapping(
  MachineIRBuilder &B,
  SmallVector<Register, 2> &Regs,
  LLT HalfTy,
  Register Reg) const {
  assert(HalfTy.getSizeInBits() == 32);
  MachineRegisterInfo *MRI = B.getMRI();
  Register LoLHS = MRI->createGenericVirtualRegister(HalfTy);
  Register HiLHS = MRI->createGenericVirtualRegister(HalfTy);
  const RegisterBank *Bank = getRegBank(Reg, *MRI, *TRI);
  MRI->setRegBank(LoLHS, *Bank);
  MRI->setRegBank(HiLHS, *Bank);

  Regs.push_back(LoLHS);
  Regs.push_back(HiLHS);

  B.buildInstr(AMDGPU::G_UNMERGE_VALUES)
    .addDef(LoLHS)
    .addDef(HiLHS)
    .addUse(Reg);
}

/// Replace the current type each register in \p Regs has with \p NewTy
static void setRegsToType(MachineRegisterInfo &MRI, ArrayRef<Register> Regs,
                          LLT NewTy) {
  for (Register Reg : Regs) {
    assert(MRI.getType(Reg).getSizeInBits() == NewTy.getSizeInBits());
    MRI.setType(Reg, NewTy);
  }
}

static LLT getHalfSizedType(LLT Ty) {
  if (Ty.isVector()) {
    assert(Ty.getElementCount().isKnownMultipleOf(2));
    return LLT::scalarOrVector(Ty.getElementCount().divideCoefficientBy(2),
                               Ty.getElementType());
  }

  assert(Ty.getScalarSizeInBits() % 2 == 0);
  return LLT::scalar(Ty.getScalarSizeInBits() / 2);
}

// Build one or more V_READFIRSTLANE_B32 instructions to move the given vector
// source value into a scalar register.
Register AMDGPURegisterBankInfo::buildReadFirstLane(MachineIRBuilder &B,
                                                    MachineRegisterInfo &MRI,
                                                    Register Src) const {
  LLT Ty = MRI.getType(Src);
  const RegisterBank *Bank = getRegBank(Src, MRI, *TRI);

  if (Bank == &AMDGPU::SGPRRegBank)
    return Src;

  unsigned Bits = Ty.getSizeInBits();
  assert(Bits % 32 == 0);

  if (Bank != &AMDGPU::VGPRRegBank) {
    // We need to copy from AGPR to VGPR
    Src = B.buildCopy(Ty, Src).getReg(0);
    MRI.setRegBank(Src, AMDGPU::VGPRRegBank);
  }

  LLT S32 = LLT::scalar(32);
  unsigned NumParts = Bits / 32;
  SmallVector<Register, 8> SrcParts;
  SmallVector<Register, 8> DstParts;

  if (Bits == 32) {
    SrcParts.push_back(Src);
  } else {
    auto Unmerge = B.buildUnmerge(S32, Src);
    for (unsigned i = 0; i < NumParts; ++i)
      SrcParts.push_back(Unmerge.getReg(i));
  }

  for (unsigned i = 0; i < NumParts; ++i) {
    Register SrcPart = SrcParts[i];
    Register DstPart = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
    MRI.setType(DstPart, NumParts == 1 ? Ty : S32);

    const TargetRegisterClass *Constrained =
        constrainGenericRegister(SrcPart, AMDGPU::VGPR_32RegClass, MRI);
    (void)Constrained;
    assert(Constrained && "Failed to constrain readfirstlane src reg");

    B.buildInstr(AMDGPU::V_READFIRSTLANE_B32, {DstPart}, {SrcPart});

    DstParts.push_back(DstPart);
  }

  if (Bits == 32)
    return DstParts[0];

  Register Dst = B.buildMerge(Ty, DstParts).getReg(0);
  MRI.setRegBank(Dst, AMDGPU::SGPRRegBank);
  return Dst;
}

/// Legalize instruction \p MI where operands in \p OpIndices must be SGPRs. If
/// any of the required SGPR operands are VGPRs, perform a waterfall loop to
/// execute the instruction for each unique combination of values in all lanes
/// in the wave. The block will be split such that the rest of the instructions
/// are moved to a new block.
///
/// Essentially performs this loop:
///
/// Save Execution Mask
/// For (Lane : Wavefront) {
///   Enable Lane, Disable all other lanes
///   SGPR = read SGPR value for current lane from VGPR
///   VGPRResult[Lane] = use_op SGPR
/// }
/// Restore Execution Mask
///
/// There is additional complexity: comparing the current lane's value against
/// every lane lets all lanes that share the same value be handled in one
/// iteration, so the loop runs once per unique value rather than once per
/// lane.
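///
/// The control flow constructed below looks like:
///
///   MBB -> LoopBB (readfirstlane, compare, s_and_saveexec) <--+
///            |                                                |
///          BodyBB (the rewritten range, s_xor of exec) -------+
///            |
///          RestoreExecBB -> RemainderBB (rest of the original block)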
bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
  MachineIRBuilder &B,
  iterator_range<MachineBasicBlock::iterator> Range,
  SmallSet<Register, 4> &SGPROperandRegs,
  MachineRegisterInfo &MRI) const {

  // Track use registers which have already been expanded with a readfirstlane
  // sequence. This may have multiple uses if moving a sequence.
  DenseMap<Register, Register> WaterfalledRegMap;

  MachineBasicBlock &MBB = B.getMBB();
  MachineFunction *MF = &B.getMF();

  const TargetRegisterClass *WaveRC = TRI->getWaveMaskRegClass();
  const unsigned MovExecOpc =
      Subtarget.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
  const unsigned MovExecTermOpc =
      Subtarget.isWave32() ? AMDGPU::S_MOV_B32_term : AMDGPU::S_MOV_B64_term;

  const unsigned XorTermOpc = Subtarget.isWave32() ?
    AMDGPU::S_XOR_B32_term : AMDGPU::S_XOR_B64_term;
  const unsigned AndSaveExecOpc = Subtarget.isWave32() ?
    AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64;
  const unsigned ExecReg = Subtarget.isWave32() ?
    AMDGPU::EXEC_LO : AMDGPU::EXEC;

#ifndef NDEBUG
  const int OrigRangeSize = std::distance(Range.begin(), Range.end());
#endif

  Register SaveExecReg = MRI.createVirtualRegister(WaveRC);
  Register InitSaveExecReg = MRI.createVirtualRegister(WaveRC);

  // Don't bother using generic instructions/registers for the exec mask.
  B.buildInstr(TargetOpcode::IMPLICIT_DEF)
    .addDef(InitSaveExecReg);

  Register PhiExec = MRI.createVirtualRegister(WaveRC);
  Register NewExec = MRI.createVirtualRegister(WaveRC);

  // To insert the loop we need to split the block. Move everything after this
  // point to a new block, and insert a new empty block before this
  // instruction.
  MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
  MachineBasicBlock *BodyBB = MF->CreateMachineBasicBlock();
  MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
  MachineBasicBlock *RestoreExecBB = MF->CreateMachineBasicBlock();
  MachineFunction::iterator MBBI(MBB);
  ++MBBI;
  MF->insert(MBBI, LoopBB);
  MF->insert(MBBI, BodyBB);
  MF->insert(MBBI, RestoreExecBB);
  MF->insert(MBBI, RemainderBB);

  LoopBB->addSuccessor(BodyBB);
  BodyBB->addSuccessor(RestoreExecBB);
  BodyBB->addSuccessor(LoopBB);

  // Move the rest of the block into a new block.
  RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
  RemainderBB->splice(RemainderBB->begin(), &MBB, Range.end(), MBB.end());

  MBB.addSuccessor(LoopBB);
  RestoreExecBB->addSuccessor(RemainderBB);

  B.setInsertPt(*LoopBB, LoopBB->end());

  B.buildInstr(TargetOpcode::PHI)
    .addDef(PhiExec)
    .addReg(InitSaveExecReg)
    .addMBB(&MBB)
    .addReg(NewExec)
    .addMBB(BodyBB);

  const DebugLoc &DL = B.getDL();

  MachineInstr &FirstInst = *Range.begin();

  // Move the instruction into the loop body. Note we moved everything after
  // Range.end() already into a new block, so Range.end() is no longer valid.
  BodyBB->splice(BodyBB->end(), &MBB, Range.begin(), MBB.end());

  // Figure out the iterator range after splicing the instructions.
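  // FirstInst's iterator remains valid: splicing moves the instructions
  // without invalidating iterators to them.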
  MachineBasicBlock::iterator NewBegin = FirstInst.getIterator();
  auto NewEnd = BodyBB->end();

  B.setMBB(*LoopBB);

  LLT S1 = LLT::scalar(1);
  Register CondReg;

  assert(std::distance(NewBegin, NewEnd) == OrigRangeSize);

  for (MachineInstr &MI : make_range(NewBegin, NewEnd)) {
    for (MachineOperand &Op : MI.uses()) {
      if (!Op.isReg() || Op.isDef())
        continue;

      Register OldReg = Op.getReg();
      if (!SGPROperandRegs.count(OldReg))
        continue;

      // See if we already processed this register in another instruction in
      // the sequence.
      auto OldVal = WaterfalledRegMap.find(OldReg);
      if (OldVal != WaterfalledRegMap.end()) {
        Op.setReg(OldVal->second);
        continue;
      }

      Register OpReg = Op.getReg();
      LLT OpTy = MRI.getType(OpReg);

      const RegisterBank *OpBank = getRegBank(OpReg, MRI, *TRI);
      if (OpBank != &AMDGPU::VGPRRegBank) {
        // Insert copy from AGPR to VGPR before the loop.
        B.setMBB(MBB);
        OpReg = B.buildCopy(OpTy, OpReg).getReg(0);
        MRI.setRegBank(OpReg, AMDGPU::VGPRRegBank);
        B.setMBB(*LoopBB);
      }

      Register CurrentLaneReg = buildReadFirstLane(B, MRI, OpReg);

      // Build the comparison(s).
      unsigned OpSize = OpTy.getSizeInBits();
      bool Is64 = OpSize % 64 == 0;
      unsigned PartSize = Is64 ? 64 : 32;
      LLT PartTy = LLT::scalar(PartSize);
      unsigned NumParts = OpSize / PartSize;
      SmallVector<Register, 8> OpParts;
      SmallVector<Register, 8> CurrentLaneParts;

      if (NumParts == 1) {
        OpParts.push_back(OpReg);
        CurrentLaneParts.push_back(CurrentLaneReg);
      } else {
        auto UnmergeOp = B.buildUnmerge(PartTy, OpReg);
        auto UnmergeCurrentLane = B.buildUnmerge(PartTy, CurrentLaneReg);
        for (unsigned i = 0; i < NumParts; ++i) {
          OpParts.push_back(UnmergeOp.getReg(i));
          CurrentLaneParts.push_back(UnmergeCurrentLane.getReg(i));
          MRI.setRegBank(OpParts[i], AMDGPU::VGPRRegBank);
          MRI.setRegBank(CurrentLaneParts[i], AMDGPU::SGPRRegBank);
        }
      }

      for (unsigned i = 0; i < NumParts; ++i) {
        auto CmpReg = B.buildICmp(CmpInst::ICMP_EQ, S1, CurrentLaneParts[i],
                                  OpParts[i]).getReg(0);
        MRI.setRegBank(CmpReg, AMDGPU::VCCRegBank);

        if (!CondReg) {
          CondReg = CmpReg;
        } else {
          CondReg = B.buildAnd(S1, CondReg, CmpReg).getReg(0);
          MRI.setRegBank(CondReg, AMDGPU::VCCRegBank);
        }
      }

      Op.setReg(CurrentLaneReg);

      // Make sure we don't re-process this register again.
      WaterfalledRegMap.insert(std::make_pair(OldReg, Op.getReg()));
    }
  }

  // The ballot becomes a no-op during instruction selection.
  CondReg = B.buildIntrinsic(Intrinsic::amdgcn_ballot,
                             {LLT::scalar(Subtarget.isWave32() ? 32 : 64)},
                             false)
                .addReg(CondReg)
                .getReg(0);
  MRI.setRegClass(CondReg, WaveRC);

  // Update EXEC, save the original EXEC value to VCC.
  B.buildInstr(AndSaveExecOpc)
    .addDef(NewExec)
    .addReg(CondReg, RegState::Kill);

  MRI.setSimpleHint(NewExec, CondReg);

  B.setInsertPt(*BodyBB, BodyBB->end());

  // Update EXEC, switch all done bits to 0 and all todo bits to 1.
  B.buildInstr(XorTermOpc)
    .addDef(ExecReg)
    .addReg(ExecReg)
    .addReg(NewExec);

  // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
  // s_cbranch_scc0?

  // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
  B.buildInstr(AMDGPU::SI_WATERFALL_LOOP).addMBB(LoopBB);

  // Save the EXEC mask before the loop.
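  // Use BuildMI directly here: the MachineIRBuilder's insertion point is
  // still at the end of the loop body, but this mov belongs at the end of the
  // original block, ahead of the loop.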
  BuildMI(MBB, MBB.end(), DL, TII->get(MovExecOpc), SaveExecReg)
    .addReg(ExecReg);

  // Restore the EXEC mask after the loop.
  B.setMBB(*RestoreExecBB);
  B.buildInstr(MovExecTermOpc)
    .addDef(ExecReg)
    .addReg(SaveExecReg);

  // Set the insert point after the original instruction, so any new
  // instructions will be in the remainder.
  B.setInsertPt(*RemainderBB, RemainderBB->begin());

  return true;
}

// Return any unique registers used by \p MI at \p OpIndices that need to be
// handled in a waterfall loop. Returns these registers in \p
// SGPROperandRegs. Returns true if there are any operands to handle and a
// waterfall loop is necessary.
bool AMDGPURegisterBankInfo::collectWaterfallOperands(
  SmallSet<Register, 4> &SGPROperandRegs, MachineInstr &MI,
  MachineRegisterInfo &MRI, ArrayRef<unsigned> OpIndices) const {
  for (unsigned Op : OpIndices) {
    assert(MI.getOperand(Op).isUse());
    Register Reg = MI.getOperand(Op).getReg();
    const RegisterBank *OpBank = getRegBank(Reg, MRI, *TRI);
    if (OpBank->getID() != AMDGPU::SGPRRegBankID)
      SGPROperandRegs.insert(Reg);
  }

  // No operands need to be replaced, so no need to loop.
  return !SGPROperandRegs.empty();
}

bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
  MachineIRBuilder &B, MachineInstr &MI, MachineRegisterInfo &MRI,
  ArrayRef<unsigned> OpIndices) const {
  // Use a set to avoid extra readfirstlanes in the case where multiple
  // operands are the same register.
  SmallSet<Register, 4> SGPROperandRegs;

  if (!collectWaterfallOperands(SGPROperandRegs, MI, MRI, OpIndices))
    return false;

  MachineBasicBlock::iterator I = MI.getIterator();
  return executeInWaterfallLoop(B, make_range(I, std::next(I)),
                                SGPROperandRegs, MRI);
}

bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  ArrayRef<unsigned> OpIndices) const {
  MachineIRBuilder B(MI);
  return executeInWaterfallLoop(B, MI, MRI, OpIndices);
}

// Legalize an operand that must be an SGPR by inserting a readfirstlane.
void AMDGPURegisterBankInfo::constrainOpWithReadfirstlane(
    MachineInstr &MI, MachineRegisterInfo &MRI, unsigned OpIdx) const {
  Register Reg = MI.getOperand(OpIdx).getReg();
  const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
  if (Bank == &AMDGPU::SGPRRegBank)
    return;

  MachineIRBuilder B(MI);

  Reg = buildReadFirstLane(B, MRI, Reg);
  MI.getOperand(OpIdx).setReg(Reg);
}

/// Split \p Ty into 2 pieces. The first will have \p FirstSize bits, and the
/// rest will be in the remainder.
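/// For example, splitting a 96-bit <3 x s32> with \p FirstSize = 64 yields
/// {<2 x s32>, s32}.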
static std::pair<LLT, LLT> splitUnequalType(LLT Ty, unsigned FirstSize) {
  unsigned TotalSize = Ty.getSizeInBits();
  if (!Ty.isVector())
    return {LLT::scalar(FirstSize), LLT::scalar(TotalSize - FirstSize)};

  LLT EltTy = Ty.getElementType();
  unsigned EltSize = EltTy.getSizeInBits();
  assert(FirstSize % EltSize == 0);

  unsigned FirstPartNumElts = FirstSize / EltSize;
  unsigned RemainderElts = (TotalSize - FirstSize) / EltSize;

  return {LLT::scalarOrVector(ElementCount::getFixed(FirstPartNumElts), EltTy),
          LLT::scalarOrVector(ElementCount::getFixed(RemainderElts), EltTy)};
}

static LLT widen96To128(LLT Ty) {
  if (!Ty.isVector())
    return LLT::scalar(128);

  LLT EltTy = Ty.getElementType();
  assert(128 % EltTy.getSizeInBits() == 0);
  return LLT::fixed_vector(128 / EltTy.getSizeInBits(), EltTy);
}

bool AMDGPURegisterBankInfo::applyMappingLoad(MachineInstr &MI,
                        const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
                        MachineRegisterInfo &MRI) const {
  Register DstReg = MI.getOperand(0).getReg();
  const LLT LoadTy = MRI.getType(DstReg);
  unsigned LoadSize = LoadTy.getSizeInBits();
  const unsigned MaxNonSmrdLoadSize = 128;

  const RegisterBank *DstBank =
    OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
  if (DstBank == &AMDGPU::SGPRRegBank) {
    // There are some special cases that we need to look at for 32-bit and
    // 96-bit SGPR loads; otherwise we have nothing to do.
    if (LoadSize != 32 && LoadSize != 96)
      return false;

    MachineMemOperand *MMO = *MI.memoperands_begin();
    const unsigned MemSize = 8 * MMO->getSize();
    // Scalar loads of size 8 or 16 bit with proper alignment may be widened to
    // 32 bit. Check to see if we need to widen the memory access: 8 or 16 bit
    // scalar loads should have a load size of 32 but a memory access size of
    // less than 32.
    if (LoadSize == 32 &&
        (MemSize == 32 || LoadTy.isVector() || !isScalarLoadLegal(MI)))
      return false;

    Register PtrReg = MI.getOperand(1).getReg();

    ApplyRegBankMapping O(*this, MRI, &AMDGPU::SGPRRegBank);
    MachineIRBuilder B(MI, O);

    if (LoadSize == 32) {
      // This is an extending load from a sub-dword size. Widen the memory
      // access size to 4 bytes and clear the extra high bits appropriately.
      const LLT S32 = LLT::scalar(32);
      if (MI.getOpcode() == AMDGPU::G_SEXTLOAD) {
        // Must extend the sign bit into higher bits for a G_SEXTLOAD
        auto WideLoad = B.buildLoadFromOffset(S32, PtrReg, *MMO, 0);
        B.buildSExtInReg(MI.getOperand(0), WideLoad, MemSize);
      } else if (MI.getOpcode() == AMDGPU::G_ZEXTLOAD) {
        // Must extend zero into higher bits with an AND for a G_ZEXTLOAD
        auto WideLoad = B.buildLoadFromOffset(S32, PtrReg, *MMO, 0);
        B.buildZExtInReg(MI.getOperand(0), WideLoad, MemSize);
      } else
        // We do not need to touch the higher bits for regular loads.
        B.buildLoadFromOffset(MI.getOperand(0), PtrReg, *MMO, 0);
    } else {
      // 96-bit loads are only available for vector loads. We need to split
      // this into a 64-bit part, and 32 (unless we can widen to a 128-bit
      // load).
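      // With 16-byte alignment the load can instead be widened to 128 bits,
      // handled in the else branch below.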
      if (MMO->getAlign() < Align(16)) {
        MachineFunction *MF = MI.getParent()->getParent();
        ApplyRegBankMapping ApplyBank(*this, MRI, DstBank);
        MachineIRBuilder B(MI, ApplyBank);
        LegalizerHelper Helper(*MF, ApplyBank, B);
        LLT Part64, Part32;
        std::tie(Part64, Part32) = splitUnequalType(LoadTy, 64);
        if (Helper.reduceLoadStoreWidth(cast<GAnyLoad>(MI), 0, Part64) !=
            LegalizerHelper::Legalized)
          return false;
        return true;
      } else {
        LLT WiderTy = widen96To128(LoadTy);
        auto WideLoad = B.buildLoadFromOffset(WiderTy, PtrReg, *MMO, 0);
        if (WiderTy.isScalar())
          B.buildTrunc(MI.getOperand(0), WideLoad);
        else {
          B.buildDeleteTrailingVectorElements(MI.getOperand(0).getReg(),
                                              WideLoad);
        }
      }
    }

    MI.eraseFromParent();
    return true;
  }

  // 128-bit loads are supported for all instruction types.
  if (LoadSize <= MaxNonSmrdLoadSize)
    return false;

  SmallVector<Register, 16> DefRegs(OpdMapper.getVRegs(0));
  SmallVector<Register, 1> SrcRegs(OpdMapper.getVRegs(1));

  if (SrcRegs.empty())
    SrcRegs.push_back(MI.getOperand(1).getReg());

  assert(LoadSize % MaxNonSmrdLoadSize == 0);

  // RegBankSelect only emits scalar types, so we need to reset the pointer
  // operand to a pointer type.
  Register BasePtrReg = SrcRegs[0];
  LLT PtrTy = MRI.getType(MI.getOperand(1).getReg());
  MRI.setType(BasePtrReg, PtrTy);

  unsigned NumSplitParts = LoadTy.getSizeInBits() / MaxNonSmrdLoadSize;
  const LLT LoadSplitTy = LoadTy.divide(NumSplitParts);
  ApplyRegBankMapping Observer(*this, MRI, &AMDGPU::VGPRRegBank);
  MachineIRBuilder B(MI, Observer);
  LegalizerHelper Helper(B.getMF(), Observer, B);

  if (LoadTy.isVector()) {
    if (Helper.fewerElementsVector(MI, 0, LoadSplitTy) !=
        LegalizerHelper::Legalized)
      return false;
  } else {
    if (Helper.narrowScalar(MI, 0, LoadSplitTy) !=
        LegalizerHelper::Legalized)
      return false;
  }

  MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
  return true;
}

bool AMDGPURegisterBankInfo::applyMappingDynStackAlloc(
  MachineInstr &MI,
  const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
  MachineRegisterInfo &MRI) const {
  const MachineFunction &MF = *MI.getMF();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const auto &TFI = *ST.getFrameLowering();

  // Guard in case the stack growth direction ever changes with scratch
  // instructions.
  if (TFI.getStackGrowthDirection() == TargetFrameLowering::StackGrowsDown)
    return false;

  Register Dst = MI.getOperand(0).getReg();
  Register AllocSize = MI.getOperand(1).getReg();
  Align Alignment = assumeAligned(MI.getOperand(2).getImm());

  const RegisterBank *SizeBank = getRegBank(AllocSize, MRI, *TRI);

  // TODO: Need to emit a wave reduction to get the maximum size.
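  // A divergent allocation size is rejected for now; supporting it would
  // require every lane to allocate the wave-wide maximum.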
  if (SizeBank != &AMDGPU::SGPRRegBank)
    return false;

  LLT PtrTy = MRI.getType(Dst);
  LLT IntPtrTy = LLT::scalar(PtrTy.getSizeInBits());

  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  Register SPReg = Info->getStackPtrOffsetReg();
  ApplyRegBankMapping ApplyBank(*this, MRI, &AMDGPU::SGPRRegBank);
  MachineIRBuilder B(MI, ApplyBank);

  auto WaveSize = B.buildConstant(LLT::scalar(32), ST.getWavefrontSizeLog2());
  auto ScaledSize = B.buildShl(IntPtrTy, AllocSize, WaveSize);

  auto SPCopy = B.buildCopy(PtrTy, SPReg);
  if (Alignment > TFI.getStackAlign()) {
    auto PtrAdd = B.buildPtrAdd(PtrTy, SPCopy, ScaledSize);
    B.buildMaskLowPtrBits(Dst, PtrAdd,
                          Log2(Alignment) + ST.getWavefrontSizeLog2());
  } else {
    B.buildPtrAdd(Dst, SPCopy, ScaledSize);
  }

  MI.eraseFromParent();
  return true;
}

bool AMDGPURegisterBankInfo::applyMappingImage(
    MachineInstr &MI, const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
    MachineRegisterInfo &MRI, int RsrcIdx) const {
  const int NumDefs = MI.getNumExplicitDefs();

  // The reported argument index is relative to the IR intrinsic call arguments,
  // so we need to shift by the number of defs and the intrinsic ID.
  RsrcIdx += NumDefs + 1;

  // Insert copies to VGPR arguments.
  applyDefaultMapping(OpdMapper);

  // Fixup any SGPR arguments.
  SmallVector<unsigned, 4> SGPRIndexes;
  for (int I = NumDefs, NumOps = MI.getNumOperands(); I != NumOps; ++I) {
    if (!MI.getOperand(I).isReg())
      continue;

    // If this intrinsic has a sampler, it immediately follows rsrc.
    if (I == RsrcIdx || I == RsrcIdx + 1)
      SGPRIndexes.push_back(I);
  }

  executeInWaterfallLoop(MI, MRI, SGPRIndexes);
  return true;
}

static Register getSrcRegIgnoringCopies(const MachineRegisterInfo &MRI,
                                        Register Reg) {
  MachineInstr *Def = getDefIgnoringCopies(Reg, MRI);
  if (!Def)
    return Reg;

  // TODO: Guard against this being an implicit def
  return Def->getOperand(0).getReg();
}

// Analyze a combined offset from an llvm.amdgcn.s.buffer intrinsic and store
// the three offsets (voffset, soffset and instoffset).
static unsigned setBufferOffsets(MachineIRBuilder &B,
                                 const AMDGPURegisterBankInfo &RBI,
                                 Register CombinedOffset, Register &VOffsetReg,
                                 Register &SOffsetReg, int64_t &InstOffsetVal,
                                 Align Alignment) {
  const LLT S32 = LLT::scalar(32);
  MachineRegisterInfo *MRI = B.getMRI();

  if (Optional<int64_t> Imm = getIConstantVRegSExtVal(CombinedOffset, *MRI)) {
    uint32_t SOffset, ImmOffset;
    if (AMDGPU::splitMUBUFOffset(*Imm, SOffset, ImmOffset, &RBI.Subtarget,
                                 Alignment)) {
      VOffsetReg = B.buildConstant(S32, 0).getReg(0);
      SOffsetReg = B.buildConstant(S32, SOffset).getReg(0);
      InstOffsetVal = ImmOffset;

      B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank);
      B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank);
      return SOffset + ImmOffset;
    }
  }

  Register Base;
  unsigned Offset;

  std::tie(Base, Offset) =
      AMDGPU::getBaseWithConstantOffset(*MRI, CombinedOffset);

  uint32_t SOffset, ImmOffset;
  if ((int)Offset > 0 && AMDGPU::splitMUBUFOffset(Offset, SOffset, ImmOffset,
                                                  &RBI.Subtarget, Alignment)) {
    if (RBI.getRegBank(Base, *MRI, *RBI.TRI) == &AMDGPU::VGPRRegBank) {
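      // The base is already a VGPR, so use it directly as the voffset.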
      VOffsetReg = Base;
      SOffsetReg = B.buildConstant(S32, SOffset).getReg(0);
      B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank);
      InstOffsetVal = ImmOffset;
      return 0; // XXX - Why is this 0?
    }

    // If we have SGPR base, we can use it for soffset.
    if (SOffset == 0) {
      VOffsetReg = B.buildConstant(S32, 0).getReg(0);
      B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank);
      SOffsetReg = Base;
      InstOffsetVal = ImmOffset;
      return 0; // XXX - Why is this 0?
    }
  }

  // Handle the variable sgpr + vgpr case.
  MachineInstr *Add = getOpcodeDef(AMDGPU::G_ADD, CombinedOffset, *MRI);
  if (Add && (int)Offset >= 0) {
    Register Src0 = getSrcRegIgnoringCopies(*MRI, Add->getOperand(1).getReg());
    Register Src1 = getSrcRegIgnoringCopies(*MRI, Add->getOperand(2).getReg());

    const RegisterBank *Src0Bank = RBI.getRegBank(Src0, *MRI, *RBI.TRI);
    const RegisterBank *Src1Bank = RBI.getRegBank(Src1, *MRI, *RBI.TRI);

    if (Src0Bank == &AMDGPU::VGPRRegBank && Src1Bank == &AMDGPU::SGPRRegBank) {
      VOffsetReg = Src0;
      SOffsetReg = Src1;
      return 0;
    }

    if (Src0Bank == &AMDGPU::SGPRRegBank && Src1Bank == &AMDGPU::VGPRRegBank) {
      VOffsetReg = Src1;
      SOffsetReg = Src0;
      return 0;
    }
  }

  // Ensure we have a VGPR for the combined offset. This could be an issue if
  // we have an SGPR offset and a VGPR resource.
  if (RBI.getRegBank(CombinedOffset, *MRI, *RBI.TRI) == &AMDGPU::VGPRRegBank) {
    VOffsetReg = CombinedOffset;
  } else {
    VOffsetReg = B.buildCopy(S32, CombinedOffset).getReg(0);
    B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank);
  }

  SOffsetReg = B.buildConstant(S32, 0).getReg(0);
  B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank);
  return 0;
}

bool AMDGPURegisterBankInfo::applyMappingSBufferLoad(
    const OperandsMapper &OpdMapper) const {
  MachineInstr &MI = OpdMapper.getMI();
  MachineRegisterInfo &MRI = OpdMapper.getMRI();

  const LLT S32 = LLT::scalar(32);
  Register Dst = MI.getOperand(0).getReg();
  LLT Ty = MRI.getType(Dst);

  const RegisterBank *RSrcBank =
    OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
  const RegisterBank *OffsetBank =
    OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
  if (RSrcBank == &AMDGPU::SGPRRegBank &&
      OffsetBank == &AMDGPU::SGPRRegBank)
    return true; // Legal mapping

  // FIXME: 96-bit case was widened during legalize. We need to narrow it back
  // here but don't have an MMO.

  unsigned LoadSize = Ty.getSizeInBits();
  int NumLoads = 1;
  if (LoadSize == 256 || LoadSize == 512) {
    NumLoads = LoadSize / 128;
    Ty = Ty.divide(NumLoads);
  }

  // Use the alignment to ensure that the required offsets will fit into the
  // immediate offsets.
  const Align Alignment = NumLoads > 1 ? Align(16 * NumLoads) : Align(1);

  MachineIRBuilder B(MI);
  MachineFunction &MF = B.getMF();

  Register SOffset;
  Register VOffset;
  int64_t ImmOffset = 0;

  unsigned MMOOffset = setBufferOffsets(B, *this, MI.getOperand(2).getReg(),
                                        VOffset, SOffset, ImmOffset, Alignment);

  // TODO: 96-bit loads were widened to 128-bit results. Shrink the result if we
  // can, but we need to track an MMO for that.
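  // Round the in-memory size of each part up to whole bytes.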
  const unsigned MemSize = (Ty.getSizeInBits() + 7) / 8;
  const Align MemAlign(4); // FIXME: ABI type alignment?
  MachineMemOperand *BaseMMO = MF.getMachineMemOperand(
      MachinePointerInfo(),
      MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
          MachineMemOperand::MOInvariant,
      MemSize, MemAlign);
  if (MMOOffset != 0)
    BaseMMO = MF.getMachineMemOperand(BaseMMO, MMOOffset, MemSize);

  // If only the offset is divergent, emit a MUBUF buffer load instead. We can
  // assume that the buffer is unswizzled.

  Register RSrc = MI.getOperand(1).getReg();
  Register VIndex = B.buildConstant(S32, 0).getReg(0);
  B.getMRI()->setRegBank(VIndex, AMDGPU::VGPRRegBank);

  SmallVector<Register, 4> LoadParts(NumLoads);

  MachineBasicBlock::iterator MII = MI.getIterator();
  MachineInstrSpan Span(MII, &B.getMBB());

  for (int i = 0; i < NumLoads; ++i) {
    if (NumLoads == 1) {
      LoadParts[i] = Dst;
    } else {
      LoadParts[i] = MRI.createGenericVirtualRegister(Ty);
      MRI.setRegBank(LoadParts[i], AMDGPU::VGPRRegBank);
    }

    MachineMemOperand *MMO = BaseMMO;
    if (i != 0)
      BaseMMO = MF.getMachineMemOperand(BaseMMO, MMOOffset + 16 * i, MemSize);

    B.buildInstr(AMDGPU::G_AMDGPU_BUFFER_LOAD)
        .addDef(LoadParts[i])       // vdata
        .addUse(RSrc)               // rsrc
        .addUse(VIndex)             // vindex
        .addUse(VOffset)            // voffset
        .addUse(SOffset)            // soffset
        .addImm(ImmOffset + 16 * i) // offset(imm)
        .addImm(0)                  // cachepolicy, swizzled buffer(imm)
        .addImm(0)                  // idxen(imm)
        .addMemOperand(MMO);
  }

  // TODO: If only the resource is a VGPR, it may be better to execute the
  // scalar load in the waterfall loop if the resource is expected to frequently
  // be dynamically uniform.
  if (RSrcBank != &AMDGPU::SGPRRegBank) {
    // Remove the original instruction to avoid potentially confusing the
    // waterfall loop logic.
    B.setInstr(*Span.begin());
    MI.eraseFromParent();

    SmallSet<Register, 4> OpsToWaterfall;

    OpsToWaterfall.insert(RSrc);
    executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()),
                           OpsToWaterfall, MRI);
  }

  if (NumLoads != 1) {
    if (Ty.isVector())
      B.buildConcatVectors(Dst, LoadParts);
    else
      B.buildMerge(Dst, LoadParts);
  }

  // We removed the instruction earlier with a waterfall loop.
  if (RSrcBank == &AMDGPU::SGPRRegBank)
    MI.eraseFromParent();

  return true;
}

bool AMDGPURegisterBankInfo::applyMappingBFE(const OperandsMapper &OpdMapper,
                                             bool Signed) const {
  MachineInstr &MI = OpdMapper.getMI();
  MachineRegisterInfo &MRI = OpdMapper.getMRI();

  // Insert basic copies
  applyDefaultMapping(OpdMapper);

  Register DstReg = MI.getOperand(0).getReg();
  LLT Ty = MRI.getType(DstReg);

  const LLT S32 = LLT::scalar(32);

  unsigned FirstOpnd = MI.getOpcode() == AMDGPU::G_INTRINSIC ? 2 : 1;
  Register SrcReg = MI.getOperand(FirstOpnd).getReg();
  Register OffsetReg = MI.getOperand(FirstOpnd + 1).getReg();
  Register WidthReg = MI.getOperand(FirstOpnd + 2).getReg();

  const RegisterBank *DstBank =
    OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
  if (DstBank == &AMDGPU::VGPRRegBank) {
    if (Ty == S32)
      return true;

    // There are no 64-bit vgpr bitfield extract instructions, so the operation
    // is expanded to a sequence of instructions that implement the operation.
    ApplyRegBankMapping ApplyBank(*this, MRI, &AMDGPU::VGPRRegBank);
    MachineIRBuilder B(MI, ApplyBank);

    const LLT S64 = LLT::scalar(64);
    // Shift the source operand so that extracted bits start at bit 0.
    auto ShiftOffset = Signed ? B.buildAShr(S64, SrcReg, OffsetReg)
                              : B.buildLShr(S64, SrcReg, OffsetReg);
    auto UnmergeSOffset = B.buildUnmerge({S32, S32}, ShiftOffset);

    // A 64-bit bitfield extract uses the 32-bit bitfield extract instructions
    // if the width is a constant.
    if (auto ConstWidth = getIConstantVRegValWithLookThrough(WidthReg, MRI)) {
      // Use the 32-bit bitfield extract instruction if the width is a constant.
      // Depending on the width size, use either the low or high 32-bits.
      auto Zero = B.buildConstant(S32, 0);
      auto WidthImm = ConstWidth->Value.getZExtValue();
      if (WidthImm <= 32) {
        // Use bitfield extract on the lower 32-bit source, and then sign-extend
        // or clear the upper 32-bits.
        auto Extract =
            Signed ? B.buildSbfx(S32, UnmergeSOffset.getReg(0), Zero, WidthReg)
                   : B.buildUbfx(S32, UnmergeSOffset.getReg(0), Zero, WidthReg);
        auto Extend =
            Signed ? B.buildAShr(S32, Extract, B.buildConstant(S32, 31)) : Zero;
        B.buildMerge(DstReg, {Extract, Extend});
      } else {
        // Use bitfield extract on upper 32-bit source, and combine with lower
        // 32-bit source.
        auto UpperWidth = B.buildConstant(S32, WidthImm - 32);
        auto Extract =
            Signed
                ? B.buildSbfx(S32, UnmergeSOffset.getReg(1), Zero, UpperWidth)
                : B.buildUbfx(S32, UnmergeSOffset.getReg(1), Zero, UpperWidth);
        B.buildMerge(DstReg, {UnmergeSOffset.getReg(0), Extract});
      }
      MI.eraseFromParent();
      return true;
    }

    // Expand to Src >> Offset << (64 - Width) >> (64 - Width) using 64-bit
    // operations.
    auto ExtShift = B.buildSub(S32, B.buildConstant(S32, 64), WidthReg);
    auto SignBit = B.buildShl(S64, ShiftOffset, ExtShift);
    if (Signed)
      B.buildAShr(S64, SignBit, ExtShift);
    else
      B.buildLShr(S64, SignBit, ExtShift);
    MI.eraseFromParent();
    return true;
  }

  // The scalar form packs the offset and width in a single operand.

  ApplyRegBankMapping ApplyBank(*this, MRI, &AMDGPU::SGPRRegBank);
  MachineIRBuilder B(MI, ApplyBank);

  // Ensure the high bits are clear to insert the offset.
  auto OffsetMask = B.buildConstant(S32, maskTrailingOnes<unsigned>(6));
  auto ClampOffset = B.buildAnd(S32, OffsetReg, OffsetMask);

  // Zeros out the low bits, so don't bother clamping the input value.
  auto ShiftWidth = B.buildShl(S32, WidthReg, B.buildConstant(S32, 16));

  // Transformation function, pack the offset and width of a BFE into
  // the format expected by the S_BFE_I32 / S_BFE_U32. In the second
  // source, bits [5:0] contain the offset and bits [22:16] the width.
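  // For example, offset = 8 and width = 16 pack to (16 << 16) | 8 = 0x100008.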
1543 auto MergedInputs = B.buildOr(S32, ClampOffset, ShiftWidth); 1544 1545 // TODO: It might be worth using a pseudo here to avoid scc clobber and 1546 // register class constraints. 1547 unsigned Opc = Ty == S32 ? (Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32) : 1548 (Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64); 1549 1550 auto MIB = B.buildInstr(Opc, {DstReg}, {SrcReg, MergedInputs}); 1551 if (!constrainSelectedInstRegOperands(*MIB, *TII, *TRI, *this)) 1552 llvm_unreachable("failed to constrain BFE"); 1553 1554 MI.eraseFromParent(); 1555 return true; 1556 } 1557 1558 bool AMDGPURegisterBankInfo::applyMappingMAD_64_32( 1559 const OperandsMapper &OpdMapper) const { 1560 MachineInstr &MI = OpdMapper.getMI(); 1561 MachineRegisterInfo &MRI = OpdMapper.getMRI(); 1562 1563 // Insert basic copies. 1564 applyDefaultMapping(OpdMapper); 1565 1566 Register Dst0 = MI.getOperand(0).getReg(); 1567 Register Dst1 = MI.getOperand(1).getReg(); 1568 Register Src0 = MI.getOperand(2).getReg(); 1569 Register Src1 = MI.getOperand(3).getReg(); 1570 Register Src2 = MI.getOperand(4).getReg(); 1571 1572 if (MRI.getRegBankOrNull(Src0) == &AMDGPU::VGPRRegBank) 1573 return true; 1574 1575 bool IsUnsigned = MI.getOpcode() == AMDGPU::G_AMDGPU_MAD_U64_U32; 1576 LLT S1 = LLT::scalar(1); 1577 LLT S32 = LLT::scalar(32); 1578 1579 bool DstOnValu = MRI.getRegBankOrNull(Src2) == &AMDGPU::VGPRRegBank; 1580 bool Accumulate = true; 1581 1582 if (!DstOnValu) { 1583 if (mi_match(Src2, MRI, m_ZeroInt())) 1584 Accumulate = false; 1585 } 1586 1587 // Keep the multiplication on the SALU. 1588 MachineIRBuilder B(MI); 1589 1590 Register DstHi; 1591 Register DstLo = B.buildMul(S32, Src0, Src1).getReg(0); 1592 bool MulHiInVgpr = false; 1593 1594 MRI.setRegBank(DstLo, AMDGPU::SGPRRegBank); 1595 1596 if (Subtarget.hasSMulHi()) { 1597 DstHi = IsUnsigned ? B.buildUMulH(S32, Src0, Src1).getReg(0) 1598 : B.buildSMulH(S32, Src0, Src1).getReg(0); 1599 MRI.setRegBank(DstHi, AMDGPU::SGPRRegBank); 1600 } else { 1601 Register VSrc0 = B.buildCopy(S32, Src0).getReg(0); 1602 Register VSrc1 = B.buildCopy(S32, Src1).getReg(0); 1603 1604 MRI.setRegBank(VSrc0, AMDGPU::VGPRRegBank); 1605 MRI.setRegBank(VSrc1, AMDGPU::VGPRRegBank); 1606 1607 DstHi = IsUnsigned ? B.buildUMulH(S32, VSrc0, VSrc1).getReg(0) 1608 : B.buildSMulH(S32, VSrc0, VSrc1).getReg(0); 1609 MRI.setRegBank(DstHi, AMDGPU::VGPRRegBank); 1610 1611 if (!DstOnValu) { 1612 DstHi = buildReadFirstLane(B, MRI, DstHi); 1613 } else { 1614 MulHiInVgpr = true; 1615 } 1616 } 1617 1618 // Accumulate and produce the "carry-out" bit. 1619 // 1620 // The "carry-out" is defined as bit 64 of the result when computed as a 1621 // big integer. For unsigned multiply-add, this matches the usual definition 1622 // of carry-out. For signed multiply-add, bit 64 is the sign bit of the 1623 // result, which is determined as: 1624 // sign(Src0 * Src1) + sign(Src2) + carry-out from unsigned 64-bit add 1625 LLT CarryType = DstOnValu ? S1 : S32; 1626 const RegisterBank &CarryBank = 1627 DstOnValu ? AMDGPU::VCCRegBank : AMDGPU::SGPRRegBank; 1628 const RegisterBank &DstBank = 1629 DstOnValu ? AMDGPU::VGPRRegBank : AMDGPU::SGPRRegBank; 1630 Register Carry; 1631 Register Zero; 1632 1633 if (!IsUnsigned) { 1634 Zero = B.buildConstant(S32, 0).getReg(0); 1635 MRI.setRegBank(Zero, 1636 MulHiInVgpr ? AMDGPU::VGPRRegBank : AMDGPU::SGPRRegBank); 1637 1638 Carry = B.buildICmp(CmpInst::ICMP_SLT, MulHiInVgpr ? S1 : S32, DstHi, Zero) 1639 .getReg(0); 1640 MRI.setRegBank(Carry, MulHiInVgpr ? 
AMDGPU::VCCRegBank 1641 : AMDGPU::SGPRRegBank); 1642 1643 if (DstOnValu && !MulHiInVgpr) { 1644 Carry = B.buildTrunc(S1, Carry).getReg(0); 1645 MRI.setRegBank(Carry, AMDGPU::VCCRegBank); 1646 } 1647 } 1648 1649 if (Accumulate) { 1650 if (DstOnValu) { 1651 DstLo = B.buildCopy(S32, DstLo).getReg(0); 1652 DstHi = B.buildCopy(S32, DstHi).getReg(0); 1653 MRI.setRegBank(DstLo, AMDGPU::VGPRRegBank); 1654 MRI.setRegBank(DstHi, AMDGPU::VGPRRegBank); 1655 } 1656 1657 auto Unmerge = B.buildUnmerge(S32, Src2); 1658 Register Src2Lo = Unmerge.getReg(0); 1659 Register Src2Hi = Unmerge.getReg(1); 1660 MRI.setRegBank(Src2Lo, DstBank); 1661 MRI.setRegBank(Src2Hi, DstBank); 1662 1663 if (!IsUnsigned) { 1664 auto Src2Sign = B.buildICmp(CmpInst::ICMP_SLT, CarryType, Src2Hi, Zero); 1665 MRI.setRegBank(Src2Sign.getReg(0), CarryBank); 1666 1667 Carry = B.buildXor(CarryType, Carry, Src2Sign).getReg(0); 1668 MRI.setRegBank(Carry, CarryBank); 1669 } 1670 1671 auto AddLo = B.buildUAddo(S32, CarryType, DstLo, Src2Lo); 1672 DstLo = AddLo.getReg(0); 1673 Register CarryLo = AddLo.getReg(1); 1674 MRI.setRegBank(DstLo, DstBank); 1675 MRI.setRegBank(CarryLo, CarryBank); 1676 1677 auto AddHi = B.buildUAdde(S32, CarryType, DstHi, Src2Hi, CarryLo); 1678 DstHi = AddHi.getReg(0); 1679 MRI.setRegBank(DstHi, DstBank); 1680 1681 Register CarryHi = AddHi.getReg(1); 1682 MRI.setRegBank(CarryHi, CarryBank); 1683 1684 if (IsUnsigned) { 1685 Carry = CarryHi; 1686 } else { 1687 Carry = B.buildXor(CarryType, Carry, CarryHi).getReg(0); 1688 MRI.setRegBank(Carry, CarryBank); 1689 } 1690 } else { 1691 if (IsUnsigned) { 1692 Carry = B.buildConstant(CarryType, 0).getReg(0); 1693 MRI.setRegBank(Carry, CarryBank); 1694 } 1695 } 1696 1697 B.buildMerge(Dst0, {DstLo, DstHi}); 1698 1699 if (DstOnValu) { 1700 B.buildCopy(Dst1, Carry); 1701 } else { 1702 B.buildTrunc(Dst1, Carry); 1703 } 1704 1705 MI.eraseFromParent(); 1706 return true; 1707 } 1708 1709 // Return a suitable opcode for extending the operands of Opc when widening. 1710 static unsigned getExtendOp(unsigned Opc) { 1711 switch (Opc) { 1712 case TargetOpcode::G_ASHR: 1713 case TargetOpcode::G_SMIN: 1714 case TargetOpcode::G_SMAX: 1715 return TargetOpcode::G_SEXT; 1716 case TargetOpcode::G_LSHR: 1717 case TargetOpcode::G_UMIN: 1718 case TargetOpcode::G_UMAX: 1719 return TargetOpcode::G_ZEXT; 1720 default: 1721 return TargetOpcode::G_ANYEXT; 1722 } 1723 } 1724 1725 // Emit a legalized extension from <2 x s16> to 2 32-bit components, avoiding 1726 // any illegal vector extend or unmerge operations. 1727 static std::pair<Register, Register> 1728 unpackV2S16ToS32(MachineIRBuilder &B, Register Src, unsigned ExtOpcode) { 1729 const LLT S32 = LLT::scalar(32); 1730 auto Bitcast = B.buildBitcast(S32, Src); 1731 1732 if (ExtOpcode == TargetOpcode::G_SEXT) { 1733 auto ExtLo = B.buildSExtInReg(S32, Bitcast, 16); 1734 auto ShiftHi = B.buildAShr(S32, Bitcast, B.buildConstant(S32, 16)); 1735 return std::make_pair(ExtLo.getReg(0), ShiftHi.getReg(0)); 1736 } 1737 1738 auto ShiftHi = B.buildLShr(S32, Bitcast, B.buildConstant(S32, 16)); 1739 if (ExtOpcode == TargetOpcode::G_ZEXT) { 1740 auto ExtLo = B.buildAnd(S32, Bitcast, B.buildConstant(S32, 0xffff)); 1741 return std::make_pair(ExtLo.getReg(0), ShiftHi.getReg(0)); 1742 } 1743 1744 assert(ExtOpcode == TargetOpcode::G_ANYEXT); 1745 return std::make_pair(Bitcast.getReg(0), ShiftHi.getReg(0)); 1746 } 1747 1748 // For cases where only a single copy is inserted for matching register banks. 
1749 // Replace the register in the instruction operand.
1750 static bool substituteSimpleCopyRegs(
1751 const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper, unsigned OpIdx) {
1752 SmallVector<unsigned, 1> SrcReg(OpdMapper.getVRegs(OpIdx));
1753 if (!SrcReg.empty()) {
1754 assert(SrcReg.size() == 1);
1755 OpdMapper.getMI().getOperand(OpIdx).setReg(SrcReg[0]);
1756 return true;
1757 }
1758
1759 return false;
1760 }
1761
1762 /// Handle register layout difference for f16 images for some subtargets.
1763 Register AMDGPURegisterBankInfo::handleD16VData(MachineIRBuilder &B,
1764 MachineRegisterInfo &MRI,
1765 Register Reg) const {
1766 if (!Subtarget.hasUnpackedD16VMem())
1767 return Reg;
1768
1769 const LLT S16 = LLT::scalar(16);
1770 LLT StoreVT = MRI.getType(Reg);
1771 if (!StoreVT.isVector() || StoreVT.getElementType() != S16)
1772 return Reg;
1773
1774 auto Unmerge = B.buildUnmerge(S16, Reg);
1775
1776
1777 SmallVector<Register, 4> WideRegs;
1778 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
1779 WideRegs.push_back(Unmerge.getReg(I));
1780
1781 const LLT S32 = LLT::scalar(32);
1782 int NumElts = StoreVT.getNumElements();
1783
1784 return B.buildMerge(LLT::fixed_vector(NumElts, S32), WideRegs).getReg(0);
1785 }
1786
1787 static std::pair<Register, unsigned>
1788 getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg) {
1789 int64_t Const;
1790 if (mi_match(Reg, MRI, m_ICst(Const)))
1791 return std::make_pair(Register(), Const);
1792
1793 Register Base;
1794 if (mi_match(Reg, MRI, m_GAdd(m_Reg(Base), m_ICst(Const))))
1795 return std::make_pair(Base, Const);
1796
1797 // TODO: Handle G_OR used for add case
1798 return std::make_pair(Reg, 0);
1799 }
1800
1801 std::pair<Register, unsigned>
1802 AMDGPURegisterBankInfo::splitBufferOffsets(MachineIRBuilder &B,
1803 Register OrigOffset) const {
1804 const unsigned MaxImm = 4095;
1805 Register BaseReg;
1806 unsigned ImmOffset;
1807 const LLT S32 = LLT::scalar(32);
1808
1809 std::tie(BaseReg, ImmOffset) = getBaseWithConstantOffset(*B.getMRI(),
1810 OrigOffset);
1811
1812 unsigned C1 = 0;
1813 if (ImmOffset != 0) {
1814 // If the immediate value is too big for the immoffset field, keep only the
1815 // low bits (value & 4095) in the immoffset field, so that the value that is
1816 // copied/added for the voffset field is a multiple of 4096 and stands more
1817 // chance of being CSEd with the copy/add for another similar load/store.
1818 // However, do not do this rounding down to a multiple of 4096 if that is a
1819 // negative number, as it appears to be illegal to have a negative offset
1820 // in the vgpr, even if adding the immediate offset makes it positive.
1821 unsigned Overflow = ImmOffset & ~MaxImm;
1822 ImmOffset -= Overflow;
1823 if ((int32_t)Overflow < 0) {
1824 Overflow += ImmOffset;
1825 ImmOffset = 0;
1826 }
1827
1828 C1 = ImmOffset;
1829 if (Overflow != 0) {
1830 if (!BaseReg)
1831 BaseReg = B.buildConstant(S32, Overflow).getReg(0);
1832 else {
1833 auto OverflowVal = B.buildConstant(S32, Overflow);
1834 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
1835 }
1836 }
1837 }
1838
1839 if (!BaseReg)
1840 BaseReg = B.buildConstant(S32, 0).getReg(0);
1841
1842 return {BaseReg, C1};
1843 }
1844
1845 bool AMDGPURegisterBankInfo::buildVCopy(MachineIRBuilder &B, Register DstReg,
1846 Register SrcReg) const {
1847 MachineRegisterInfo &MRI = *B.getMRI();
1848 LLT SrcTy = MRI.getType(SrcReg);
1849 if (SrcTy.getSizeInBits() == 32) {
1850 // Use a v_mov_b32 here to make the exec dependency explicit.
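// (Illustrative MachineIR: $vgpr0 = V_MOV_B32_e32 $sgpr0 carries an implicit
// $exec use, unlike a plain COPY, so the lane-mask dependency stays visible.)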
1851 B.buildInstr(AMDGPU::V_MOV_B32_e32) 1852 .addDef(DstReg) 1853 .addUse(SrcReg); 1854 return constrainGenericRegister(DstReg, AMDGPU::VGPR_32RegClass, MRI) && 1855 constrainGenericRegister(SrcReg, AMDGPU::SReg_32RegClass, MRI); 1856 } 1857 1858 Register TmpReg0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 1859 Register TmpReg1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 1860 1861 B.buildInstr(AMDGPU::V_MOV_B32_e32) 1862 .addDef(TmpReg0) 1863 .addUse(SrcReg, 0, AMDGPU::sub0); 1864 B.buildInstr(AMDGPU::V_MOV_B32_e32) 1865 .addDef(TmpReg1) 1866 .addUse(SrcReg, 0, AMDGPU::sub1); 1867 B.buildInstr(AMDGPU::REG_SEQUENCE) 1868 .addDef(DstReg) 1869 .addUse(TmpReg0) 1870 .addImm(AMDGPU::sub0) 1871 .addUse(TmpReg1) 1872 .addImm(AMDGPU::sub1); 1873 1874 return constrainGenericRegister(SrcReg, AMDGPU::SReg_64RegClass, MRI) && 1875 constrainGenericRegister(DstReg, AMDGPU::VReg_64RegClass, MRI); 1876 } 1877 1878 /// Utility function for pushing dynamic vector indexes with a constant offset 1879 /// into waterfall loops. 1880 static void reinsertVectorIndexAdd(MachineIRBuilder &B, 1881 MachineInstr &IdxUseInstr, 1882 unsigned OpIdx, 1883 unsigned ConstOffset) { 1884 MachineRegisterInfo &MRI = *B.getMRI(); 1885 const LLT S32 = LLT::scalar(32); 1886 Register WaterfallIdx = IdxUseInstr.getOperand(OpIdx).getReg(); 1887 B.setInsertPt(*IdxUseInstr.getParent(), IdxUseInstr.getIterator()); 1888 1889 auto MaterializedOffset = B.buildConstant(S32, ConstOffset); 1890 1891 auto Add = B.buildAdd(S32, WaterfallIdx, MaterializedOffset); 1892 MRI.setRegBank(MaterializedOffset.getReg(0), AMDGPU::SGPRRegBank); 1893 MRI.setRegBank(Add.getReg(0), AMDGPU::SGPRRegBank); 1894 IdxUseInstr.getOperand(OpIdx).setReg(Add.getReg(0)); 1895 } 1896 1897 /// Implement extending a 32-bit value to a 64-bit value. \p Lo32Reg is the 1898 /// original 32-bit source value (to be inserted in the low part of the combined 1899 /// 64-bit result), and \p Hi32Reg is the high half of the combined 64-bit 1900 /// value. 1901 static void extendLow32IntoHigh32(MachineIRBuilder &B, 1902 Register Hi32Reg, Register Lo32Reg, 1903 unsigned ExtOpc, 1904 const RegisterBank &RegBank, 1905 bool IsBooleanSrc = false) { 1906 if (ExtOpc == AMDGPU::G_ZEXT) { 1907 B.buildConstant(Hi32Reg, 0); 1908 } else if (ExtOpc == AMDGPU::G_SEXT) { 1909 if (IsBooleanSrc) { 1910 // If we know the original source was an s1, the high half is the same as 1911 // the low. 1912 B.buildCopy(Hi32Reg, Lo32Reg); 1913 } else { 1914 // Replicate sign bit from 32-bit extended part. 
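// i.e. Hi32 = AShr(Lo32, 31): all ones when Lo32 is negative, zero otherwise.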
1915 auto ShiftAmt = B.buildConstant(LLT::scalar(32), 31); 1916 B.getMRI()->setRegBank(ShiftAmt.getReg(0), RegBank); 1917 B.buildAShr(Hi32Reg, Lo32Reg, ShiftAmt); 1918 } 1919 } else { 1920 assert(ExtOpc == AMDGPU::G_ANYEXT && "not an integer extension"); 1921 B.buildUndef(Hi32Reg); 1922 } 1923 } 1924 1925 bool AMDGPURegisterBankInfo::foldExtractEltToCmpSelect( 1926 MachineInstr &MI, MachineRegisterInfo &MRI, 1927 const OperandsMapper &OpdMapper) const { 1928 1929 Register VecReg = MI.getOperand(1).getReg(); 1930 Register Idx = MI.getOperand(2).getReg(); 1931 1932 const RegisterBank &IdxBank = 1933 *OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank; 1934 1935 bool IsDivergentIdx = IdxBank != AMDGPU::SGPRRegBank; 1936 1937 LLT VecTy = MRI.getType(VecReg); 1938 unsigned EltSize = VecTy.getScalarSizeInBits(); 1939 unsigned NumElem = VecTy.getNumElements(); 1940 1941 if (!SITargetLowering::shouldExpandVectorDynExt(EltSize, NumElem, 1942 IsDivergentIdx, &Subtarget)) 1943 return false; 1944 1945 MachineIRBuilder B(MI); 1946 LLT S32 = LLT::scalar(32); 1947 1948 const RegisterBank &DstBank = 1949 *OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; 1950 const RegisterBank &SrcBank = 1951 *OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank; 1952 1953 const RegisterBank &CCBank = 1954 (DstBank == AMDGPU::SGPRRegBank && 1955 SrcBank == AMDGPU::SGPRRegBank && 1956 IdxBank == AMDGPU::SGPRRegBank) ? AMDGPU::SGPRRegBank 1957 : AMDGPU::VCCRegBank; 1958 LLT CCTy = (CCBank == AMDGPU::SGPRRegBank) ? S32 : LLT::scalar(1); 1959 1960 if (CCBank == AMDGPU::VCCRegBank && IdxBank == AMDGPU::SGPRRegBank) { 1961 Idx = B.buildCopy(S32, Idx)->getOperand(0).getReg(); 1962 MRI.setRegBank(Idx, AMDGPU::VGPRRegBank); 1963 } 1964 1965 LLT EltTy = VecTy.getScalarType(); 1966 SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0)); 1967 unsigned NumLanes = DstRegs.size(); 1968 if (!NumLanes) 1969 NumLanes = 1; 1970 else 1971 EltTy = MRI.getType(DstRegs[0]); 1972 1973 auto UnmergeToEltTy = B.buildUnmerge(EltTy, VecReg); 1974 SmallVector<Register, 2> Res(NumLanes); 1975 for (unsigned L = 0; L < NumLanes; ++L) 1976 Res[L] = UnmergeToEltTy.getReg(L); 1977 1978 for (unsigned I = 1; I < NumElem; ++I) { 1979 auto IC = B.buildConstant(S32, I); 1980 MRI.setRegBank(IC->getOperand(0).getReg(), AMDGPU::SGPRRegBank); 1981 auto Cmp = B.buildICmp(CmpInst::ICMP_EQ, CCTy, Idx, IC); 1982 MRI.setRegBank(Cmp->getOperand(0).getReg(), CCBank); 1983 1984 for (unsigned L = 0; L < NumLanes; ++L) { 1985 auto S = B.buildSelect(EltTy, Cmp, 1986 UnmergeToEltTy.getReg(I * NumLanes + L), Res[L]); 1987 1988 for (unsigned N : { 0, 2, 3 }) 1989 MRI.setRegBank(S->getOperand(N).getReg(), DstBank); 1990 1991 Res[L] = S->getOperand(0).getReg(); 1992 } 1993 } 1994 1995 for (unsigned L = 0; L < NumLanes; ++L) { 1996 Register DstReg = (NumLanes == 1) ? MI.getOperand(0).getReg() : DstRegs[L]; 1997 B.buildCopy(DstReg, Res[L]); 1998 MRI.setRegBank(DstReg, DstBank); 1999 } 2000 2001 MRI.setRegBank(MI.getOperand(0).getReg(), DstBank); 2002 MI.eraseFromParent(); 2003 2004 return true; 2005 } 2006 2007 // Insert a cross regbank copy for a register if it already has a bank that 2008 // differs from the one we want to set. 
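// For example (illustrative vregs), constraining an SGPR-assigned %x to the
// VGPR bank returns a fresh %y = COPY %x placed in the VGPR bank; a register
// with no bank yet is simply assigned the requested bank and returned as-is.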
2009 static Register constrainRegToBank(MachineRegisterInfo &MRI, 2010 MachineIRBuilder &B, Register &Reg, 2011 const RegisterBank &Bank) { 2012 const RegisterBank *CurrBank = MRI.getRegBankOrNull(Reg); 2013 if (CurrBank && *CurrBank != Bank) { 2014 Register Copy = B.buildCopy(MRI.getType(Reg), Reg).getReg(0); 2015 MRI.setRegBank(Copy, Bank); 2016 return Copy; 2017 } 2018 2019 MRI.setRegBank(Reg, Bank); 2020 return Reg; 2021 } 2022 2023 bool AMDGPURegisterBankInfo::foldInsertEltToCmpSelect( 2024 MachineInstr &MI, MachineRegisterInfo &MRI, 2025 const OperandsMapper &OpdMapper) const { 2026 2027 Register VecReg = MI.getOperand(1).getReg(); 2028 Register Idx = MI.getOperand(3).getReg(); 2029 2030 const RegisterBank &IdxBank = 2031 *OpdMapper.getInstrMapping().getOperandMapping(3).BreakDown[0].RegBank; 2032 2033 bool IsDivergentIdx = IdxBank != AMDGPU::SGPRRegBank; 2034 2035 LLT VecTy = MRI.getType(VecReg); 2036 unsigned EltSize = VecTy.getScalarSizeInBits(); 2037 unsigned NumElem = VecTy.getNumElements(); 2038 2039 if (!SITargetLowering::shouldExpandVectorDynExt(EltSize, NumElem, 2040 IsDivergentIdx, &Subtarget)) 2041 return false; 2042 2043 MachineIRBuilder B(MI); 2044 LLT S32 = LLT::scalar(32); 2045 2046 const RegisterBank &DstBank = 2047 *OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; 2048 const RegisterBank &SrcBank = 2049 *OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank; 2050 const RegisterBank &InsBank = 2051 *OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank; 2052 2053 const RegisterBank &CCBank = 2054 (DstBank == AMDGPU::SGPRRegBank && 2055 SrcBank == AMDGPU::SGPRRegBank && 2056 InsBank == AMDGPU::SGPRRegBank && 2057 IdxBank == AMDGPU::SGPRRegBank) ? AMDGPU::SGPRRegBank 2058 : AMDGPU::VCCRegBank; 2059 LLT CCTy = (CCBank == AMDGPU::SGPRRegBank) ? 
S32 : LLT::scalar(1); 2060 2061 if (CCBank == AMDGPU::VCCRegBank && IdxBank == AMDGPU::SGPRRegBank) { 2062 Idx = B.buildCopy(S32, Idx)->getOperand(0).getReg(); 2063 MRI.setRegBank(Idx, AMDGPU::VGPRRegBank); 2064 } 2065 2066 LLT EltTy = VecTy.getScalarType(); 2067 SmallVector<Register, 2> InsRegs(OpdMapper.getVRegs(2)); 2068 unsigned NumLanes = InsRegs.size(); 2069 if (!NumLanes) { 2070 NumLanes = 1; 2071 InsRegs.push_back(MI.getOperand(2).getReg()); 2072 } else { 2073 EltTy = MRI.getType(InsRegs[0]); 2074 } 2075 2076 auto UnmergeToEltTy = B.buildUnmerge(EltTy, VecReg); 2077 SmallVector<Register, 16> Ops(NumElem * NumLanes); 2078 2079 for (unsigned I = 0; I < NumElem; ++I) { 2080 auto IC = B.buildConstant(S32, I); 2081 MRI.setRegBank(IC->getOperand(0).getReg(), AMDGPU::SGPRRegBank); 2082 auto Cmp = B.buildICmp(CmpInst::ICMP_EQ, CCTy, Idx, IC); 2083 MRI.setRegBank(Cmp->getOperand(0).getReg(), CCBank); 2084 2085 for (unsigned L = 0; L < NumLanes; ++L) { 2086 Register Op0 = constrainRegToBank(MRI, B, InsRegs[L], DstBank); 2087 Register Op1 = UnmergeToEltTy.getReg(I * NumLanes + L); 2088 Op1 = constrainRegToBank(MRI, B, Op1, DstBank); 2089 2090 Register Select = B.buildSelect(EltTy, Cmp, Op0, Op1).getReg(0); 2091 MRI.setRegBank(Select, DstBank); 2092 2093 Ops[I * NumLanes + L] = Select; 2094 } 2095 } 2096 2097 LLT MergeTy = LLT::fixed_vector(Ops.size(), EltTy); 2098 if (MergeTy == MRI.getType(MI.getOperand(0).getReg())) { 2099 B.buildBuildVector(MI.getOperand(0), Ops); 2100 } else { 2101 auto Vec = B.buildBuildVector(MergeTy, Ops); 2102 MRI.setRegBank(Vec->getOperand(0).getReg(), DstBank); 2103 B.buildBitcast(MI.getOperand(0).getReg(), Vec); 2104 } 2105 2106 MRI.setRegBank(MI.getOperand(0).getReg(), DstBank); 2107 MI.eraseFromParent(); 2108 2109 return true; 2110 } 2111 2112 void AMDGPURegisterBankInfo::applyMappingImpl( 2113 const OperandsMapper &OpdMapper) const { 2114 MachineInstr &MI = OpdMapper.getMI(); 2115 unsigned Opc = MI.getOpcode(); 2116 MachineRegisterInfo &MRI = OpdMapper.getMRI(); 2117 switch (Opc) { 2118 case AMDGPU::G_PHI: { 2119 Register DstReg = MI.getOperand(0).getReg(); 2120 LLT DstTy = MRI.getType(DstReg); 2121 if (DstTy != LLT::scalar(1)) 2122 break; 2123 2124 const LLT S32 = LLT::scalar(32); 2125 const RegisterBank *DstBank = 2126 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; 2127 if (DstBank == &AMDGPU::VCCRegBank) { 2128 applyDefaultMapping(OpdMapper); 2129 // The standard handling only considers the result register bank for 2130 // phis. For VCC, blindly inserting a copy when the phi is lowered will 2131 // produce an invalid copy. We can only copy with some kind of compare to 2132 // get a vector boolean result. Insert a register bank copy that will be 2133 // correctly lowered to a compare. 2134 MachineIRBuilder B(*MI.getParent()->getParent()); 2135 2136 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) { 2137 Register SrcReg = MI.getOperand(I).getReg(); 2138 const RegisterBank *SrcBank = getRegBank(SrcReg, MRI, *TRI); 2139 2140 if (SrcBank != &AMDGPU::VCCRegBank) { 2141 MachineBasicBlock *SrcMBB = MI.getOperand(I + 1).getMBB(); 2142 B.setInsertPt(*SrcMBB, SrcMBB->getFirstTerminator()); 2143 2144 auto Copy = B.buildCopy(LLT::scalar(1), SrcReg); 2145 MRI.setRegBank(Copy.getReg(0), AMDGPU::VCCRegBank); 2146 MI.getOperand(I).setReg(Copy.getReg(0)); 2147 } 2148 } 2149 2150 return; 2151 } 2152 2153 // Phi handling is strange and only considers the bank of the destination. 
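// For a non-VCC s1 phi, the widening below rewrites, roughly (sketch):
//   %r:sgpr(s1) = G_PHI %a(s1), %bb.0, %b(s1), %bb.1
// into an s32 phi whose result is truncated back to s1.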
2154 substituteSimpleCopyRegs(OpdMapper, 0); 2155 2156 // Promote SGPR/VGPR booleans to s32 2157 MachineFunction *MF = MI.getParent()->getParent(); 2158 ApplyRegBankMapping ApplyBank(*this, MRI, DstBank); 2159 MachineIRBuilder B(MI, ApplyBank); 2160 LegalizerHelper Helper(*MF, ApplyBank, B); 2161 2162 if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized) 2163 llvm_unreachable("widen scalar should have succeeded"); 2164 2165 return; 2166 } 2167 case AMDGPU::G_ICMP: 2168 case AMDGPU::G_UADDO: 2169 case AMDGPU::G_USUBO: 2170 case AMDGPU::G_UADDE: 2171 case AMDGPU::G_SADDE: 2172 case AMDGPU::G_USUBE: 2173 case AMDGPU::G_SSUBE: { 2174 unsigned BoolDstOp = Opc == AMDGPU::G_ICMP ? 0 : 1; 2175 Register DstReg = MI.getOperand(BoolDstOp).getReg(); 2176 2177 const RegisterBank *DstBank = 2178 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; 2179 if (DstBank != &AMDGPU::SGPRRegBank) 2180 break; 2181 2182 const bool HasCarryIn = MI.getNumOperands() == 5; 2183 2184 // If this is a scalar compare, promote the result to s32, as the selection 2185 // will end up using a copy to a 32-bit vreg. 2186 const LLT S32 = LLT::scalar(32); 2187 Register NewDstReg = MRI.createGenericVirtualRegister(S32); 2188 MRI.setRegBank(NewDstReg, AMDGPU::SGPRRegBank); 2189 MI.getOperand(BoolDstOp).setReg(NewDstReg); 2190 MachineIRBuilder B(MI); 2191 2192 if (HasCarryIn) { 2193 Register NewSrcReg = MRI.createGenericVirtualRegister(S32); 2194 MRI.setRegBank(NewSrcReg, AMDGPU::SGPRRegBank); 2195 B.buildZExt(NewSrcReg, MI.getOperand(4).getReg()); 2196 MI.getOperand(4).setReg(NewSrcReg); 2197 } 2198 2199 MachineBasicBlock *MBB = MI.getParent(); 2200 B.setInsertPt(*MBB, std::next(MI.getIterator())); 2201 2202 // If we had a constrained VCC result register, a copy was inserted to VCC 2203 // from SGPR. 2204 SmallVector<Register, 1> DefRegs(OpdMapper.getVRegs(0)); 2205 if (DefRegs.empty()) 2206 DefRegs.push_back(DstReg); 2207 B.buildTrunc(DefRegs[0], NewDstReg); 2208 return; 2209 } 2210 case AMDGPU::G_SELECT: { 2211 Register DstReg = MI.getOperand(0).getReg(); 2212 LLT DstTy = MRI.getType(DstReg); 2213 2214 SmallVector<Register, 1> CondRegs(OpdMapper.getVRegs(1)); 2215 if (CondRegs.empty()) 2216 CondRegs.push_back(MI.getOperand(1).getReg()); 2217 else { 2218 assert(CondRegs.size() == 1); 2219 } 2220 2221 const RegisterBank *CondBank = getRegBank(CondRegs[0], MRI, *TRI); 2222 if (CondBank == &AMDGPU::SGPRRegBank) { 2223 MachineIRBuilder B(MI); 2224 const LLT S32 = LLT::scalar(32); 2225 Register NewCondReg = MRI.createGenericVirtualRegister(S32); 2226 MRI.setRegBank(NewCondReg, AMDGPU::SGPRRegBank); 2227 2228 MI.getOperand(1).setReg(NewCondReg); 2229 B.buildZExt(NewCondReg, CondRegs[0]); 2230 } 2231 2232 if (DstTy.getSizeInBits() != 64) 2233 break; 2234 2235 MachineIRBuilder B(MI); 2236 LLT HalfTy = getHalfSizedType(DstTy); 2237 2238 SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0)); 2239 SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2)); 2240 SmallVector<Register, 2> Src2Regs(OpdMapper.getVRegs(3)); 2241 2242 // All inputs are SGPRs, nothing special to do. 
2243 if (DefRegs.empty()) { 2244 assert(Src1Regs.empty() && Src2Regs.empty()); 2245 break; 2246 } 2247 2248 if (Src1Regs.empty()) 2249 split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg()); 2250 else { 2251 setRegsToType(MRI, Src1Regs, HalfTy); 2252 } 2253 2254 if (Src2Regs.empty()) 2255 split64BitValueForMapping(B, Src2Regs, HalfTy, MI.getOperand(3).getReg()); 2256 else 2257 setRegsToType(MRI, Src2Regs, HalfTy); 2258 2259 setRegsToType(MRI, DefRegs, HalfTy); 2260 2261 B.buildSelect(DefRegs[0], CondRegs[0], Src1Regs[0], Src2Regs[0]); 2262 B.buildSelect(DefRegs[1], CondRegs[0], Src1Regs[1], Src2Regs[1]); 2263 2264 MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank); 2265 MI.eraseFromParent(); 2266 return; 2267 } 2268 case AMDGPU::G_BRCOND: { 2269 Register CondReg = MI.getOperand(0).getReg(); 2270 // FIXME: Should use legalizer helper, but should change bool ext type. 2271 const RegisterBank *CondBank = 2272 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; 2273 2274 if (CondBank == &AMDGPU::SGPRRegBank) { 2275 MachineIRBuilder B(MI); 2276 const LLT S32 = LLT::scalar(32); 2277 Register NewCondReg = MRI.createGenericVirtualRegister(S32); 2278 MRI.setRegBank(NewCondReg, AMDGPU::SGPRRegBank); 2279 2280 MI.getOperand(0).setReg(NewCondReg); 2281 B.buildZExt(NewCondReg, CondReg); 2282 return; 2283 } 2284 2285 break; 2286 } 2287 case AMDGPU::G_AND: 2288 case AMDGPU::G_OR: 2289 case AMDGPU::G_XOR: { 2290 // 64-bit and is only available on the SALU, so split into 2 32-bit ops if 2291 // there is a VGPR input. 2292 Register DstReg = MI.getOperand(0).getReg(); 2293 LLT DstTy = MRI.getType(DstReg); 2294 2295 if (DstTy.getSizeInBits() == 1) { 2296 const RegisterBank *DstBank = 2297 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; 2298 if (DstBank == &AMDGPU::VCCRegBank) 2299 break; 2300 2301 MachineFunction *MF = MI.getParent()->getParent(); 2302 ApplyRegBankMapping ApplyBank(*this, MRI, DstBank); 2303 MachineIRBuilder B(MI, ApplyBank); 2304 LegalizerHelper Helper(*MF, ApplyBank, B); 2305 2306 if (Helper.widenScalar(MI, 0, LLT::scalar(32)) != 2307 LegalizerHelper::Legalized) 2308 llvm_unreachable("widen scalar should have succeeded"); 2309 return; 2310 } 2311 2312 if (DstTy.getSizeInBits() != 64) 2313 break; 2314 2315 LLT HalfTy = getHalfSizedType(DstTy); 2316 SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0)); 2317 SmallVector<Register, 2> Src0Regs(OpdMapper.getVRegs(1)); 2318 SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2)); 2319 2320 // All inputs are SGPRs, nothing special to do. 2321 if (DefRegs.empty()) { 2322 assert(Src0Regs.empty() && Src1Regs.empty()); 2323 break; 2324 } 2325 2326 assert(DefRegs.size() == 2); 2327 assert(Src0Regs.size() == Src1Regs.size() && 2328 (Src0Regs.empty() || Src0Regs.size() == 2)); 2329 2330 // Depending on where the source registers came from, the generic code may 2331 // have decided to split the inputs already or not. If not, we still need to 2332 // extract the values. 
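// For example (a rough sketch), a 64-bit G_XOR with a VGPR input becomes two
// 32-bit ops over the unmerged halves:
//   %lo = G_XOR %a_lo, %b_lo
//   %hi = G_XOR %a_hi, %b_hi
// with the halves written directly to the pre-split def registers.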
2333 MachineIRBuilder B(MI); 2334 2335 if (Src0Regs.empty()) 2336 split64BitValueForMapping(B, Src0Regs, HalfTy, MI.getOperand(1).getReg()); 2337 else 2338 setRegsToType(MRI, Src0Regs, HalfTy); 2339 2340 if (Src1Regs.empty()) 2341 split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg()); 2342 else 2343 setRegsToType(MRI, Src1Regs, HalfTy); 2344 2345 setRegsToType(MRI, DefRegs, HalfTy); 2346 2347 B.buildInstr(Opc, {DefRegs[0]}, {Src0Regs[0], Src1Regs[0]}); 2348 B.buildInstr(Opc, {DefRegs[1]}, {Src0Regs[1], Src1Regs[1]}); 2349 2350 MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank); 2351 MI.eraseFromParent(); 2352 return; 2353 } 2354 case AMDGPU::G_ABS: { 2355 Register SrcReg = MI.getOperand(1).getReg(); 2356 const RegisterBank *SrcBank = MRI.getRegBankOrNull(SrcReg); 2357 2358 // There is no VALU abs instruction so we need to replace it with a sub and 2359 // max combination. 2360 if (SrcBank && SrcBank == &AMDGPU::VGPRRegBank) { 2361 MachineFunction *MF = MI.getParent()->getParent(); 2362 ApplyRegBankMapping Apply(*this, MRI, &AMDGPU::VGPRRegBank); 2363 MachineIRBuilder B(MI, Apply); 2364 LegalizerHelper Helper(*MF, Apply, B); 2365 2366 if (Helper.lowerAbsToMaxNeg(MI) != LegalizerHelper::Legalized) 2367 llvm_unreachable("lowerAbsToMaxNeg should have succeeded"); 2368 return; 2369 } 2370 LLVM_FALLTHROUGH; 2371 } 2372 case AMDGPU::G_ADD: 2373 case AMDGPU::G_SUB: 2374 case AMDGPU::G_MUL: 2375 case AMDGPU::G_SHL: 2376 case AMDGPU::G_LSHR: 2377 case AMDGPU::G_ASHR: 2378 case AMDGPU::G_SMIN: 2379 case AMDGPU::G_SMAX: 2380 case AMDGPU::G_UMIN: 2381 case AMDGPU::G_UMAX: { 2382 Register DstReg = MI.getOperand(0).getReg(); 2383 LLT DstTy = MRI.getType(DstReg); 2384 2385 // 16-bit operations are VALU only, but can be promoted to 32-bit SALU. 2386 // Packed 16-bit operations need to be scalarized and promoted. 2387 if (DstTy != LLT::scalar(16) && DstTy != LLT::fixed_vector(2, 16)) 2388 break; 2389 2390 const RegisterBank *DstBank = 2391 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; 2392 if (DstBank == &AMDGPU::VGPRRegBank) 2393 break; 2394 2395 const LLT S32 = LLT::scalar(32); 2396 MachineBasicBlock *MBB = MI.getParent(); 2397 MachineFunction *MF = MBB->getParent(); 2398 ApplyRegBankMapping ApplySALU(*this, MRI, &AMDGPU::SGPRRegBank); 2399 MachineIRBuilder B(MI, ApplySALU); 2400 2401 if (DstTy.isVector()) { 2402 Register WideSrc0Lo, WideSrc0Hi; 2403 Register WideSrc1Lo, WideSrc1Hi; 2404 2405 unsigned ExtendOp = getExtendOp(MI.getOpcode()); 2406 std::tie(WideSrc0Lo, WideSrc0Hi) 2407 = unpackV2S16ToS32(B, MI.getOperand(1).getReg(), ExtendOp); 2408 std::tie(WideSrc1Lo, WideSrc1Hi) 2409 = unpackV2S16ToS32(B, MI.getOperand(2).getReg(), ExtendOp); 2410 auto Lo = B.buildInstr(MI.getOpcode(), {S32}, {WideSrc0Lo, WideSrc1Lo}); 2411 auto Hi = B.buildInstr(MI.getOpcode(), {S32}, {WideSrc0Hi, WideSrc1Hi}); 2412 B.buildBuildVectorTrunc(DstReg, {Lo.getReg(0), Hi.getReg(0)}); 2413 MI.eraseFromParent(); 2414 } else { 2415 LegalizerHelper Helper(*MF, ApplySALU, B); 2416 2417 if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized) 2418 llvm_unreachable("widen scalar should have succeeded"); 2419 2420 // FIXME: s16 shift amounts should be legal. 
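// (For now, an s16 shift such as G_SHL is widened so that both the shifted
// value and the shift amount become s32; hence the extra widenScalar on
// operand 1 below.)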
2421 if (Opc == AMDGPU::G_SHL || Opc == AMDGPU::G_LSHR ||
2422 Opc == AMDGPU::G_ASHR) {
2423 B.setInsertPt(*MBB, MI.getIterator());
2424 if (Helper.widenScalar(MI, 1, S32) != LegalizerHelper::Legalized)
2425 llvm_unreachable("widen scalar should have succeeded");
2426 }
2427 }
2428
2429 return;
2430 }
2431 case AMDGPU::G_SEXT_INREG: {
2432 SmallVector<Register, 2> SrcRegs(OpdMapper.getVRegs(1));
2433 if (SrcRegs.empty())
2434 break; // Nothing to repair
2435
2436 const LLT S32 = LLT::scalar(32);
2437 MachineIRBuilder B(MI);
2438 ApplyRegBankMapping O(*this, MRI, &AMDGPU::VGPRRegBank);
2439 GISelObserverWrapper Observer(&O);
2440 B.setChangeObserver(Observer);
2441
2442 // Don't use LegalizerHelper's narrowScalar. It produces unwanted G_SEXTs
2443 // we would need to further expand, and doesn't let us directly set the
2444 // result registers.
2445 SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0));
2446
2447 int Amt = MI.getOperand(2).getImm();
2448 if (Amt <= 32) {
2449 if (Amt == 32) {
2450 // The low bits are unchanged.
2451 B.buildCopy(DstRegs[0], SrcRegs[0]);
2452 } else {
2453 // Extend in the low bits and propagate the sign bit to the high half.
2454 B.buildSExtInReg(DstRegs[0], SrcRegs[0], Amt);
2455 }
2456
2457 B.buildAShr(DstRegs[1], DstRegs[0], B.buildConstant(S32, 31));
2458 } else {
2459 // The low bits are unchanged; do the sign extension in the high half.
2460 B.buildCopy(DstRegs[0], SrcRegs[0]);
2461 B.buildSExtInReg(DstRegs[1], DstRegs[0], Amt - 32);
2462 }
2463
2464 Register DstReg = MI.getOperand(0).getReg();
2465 MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
2466 MI.eraseFromParent();
2467 return;
2468 }
2469 case AMDGPU::G_CTPOP:
2470 case AMDGPU::G_BITREVERSE: {
2471 const RegisterBank *DstBank =
2472 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2473 if (DstBank == &AMDGPU::SGPRRegBank)
2474 break;
2475
2476 Register SrcReg = MI.getOperand(1).getReg();
2477 const LLT S32 = LLT::scalar(32);
2478 LLT Ty = MRI.getType(SrcReg);
2479 if (Ty == S32)
2480 break;
2481
2482 ApplyRegBankMapping ApplyVALU(*this, MRI, &AMDGPU::VGPRRegBank);
2483 MachineIRBuilder B(MI, ApplyVALU);
2484
2485 MachineFunction &MF = B.getMF();
2486 LegalizerHelper Helper(MF, ApplyVALU, B);
2487
2488 if (Helper.narrowScalar(MI, 1, S32) != LegalizerHelper::Legalized)
2489 llvm_unreachable("narrowScalar should have succeeded");
2490 return;
2491 }
2492 case AMDGPU::G_AMDGPU_FFBH_U32:
2493 case AMDGPU::G_AMDGPU_FFBL_B32:
2494 case AMDGPU::G_CTLZ_ZERO_UNDEF:
2495 case AMDGPU::G_CTTZ_ZERO_UNDEF: {
2496 const RegisterBank *DstBank =
2497 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2498 if (DstBank == &AMDGPU::SGPRRegBank)
2499 break;
2500
2501 Register SrcReg = MI.getOperand(1).getReg();
2502 const LLT S32 = LLT::scalar(32);
2503 LLT Ty = MRI.getType(SrcReg);
2504 if (Ty == S32)
2505 break;
2506
2507 // We can narrow this more efficiently than Helper can by using ffbh/ffbl
2508 // which return -1 when the input is zero:
2509 // (ctlz_zero_undef hi:lo) -> (umin (ffbh hi), (add (ffbh lo), 32))
2510 // (cttz_zero_undef hi:lo) -> (umin (add (ffbl hi), 32), (ffbl lo))
2511 // (ffbh hi:lo) -> (umin (ffbh hi), (uaddsat (ffbh lo), 32))
2512 // (ffbl hi:lo) -> (umin (uaddsat (ffbl hi), 32), (ffbl lo))
2513 ApplyRegBankMapping ApplyVALU(*this, MRI, &AMDGPU::VGPRRegBank);
2514 MachineIRBuilder B(MI, ApplyVALU);
2515 SmallVector<Register, 2> SrcRegs(OpdMapper.getVRegs(1));
2516 unsigned NewOpc = Opc == AMDGPU::G_CTLZ_ZERO_UNDEF
2517 ?
(unsigned)AMDGPU::G_AMDGPU_FFBH_U32
2518 : Opc == AMDGPU::G_CTTZ_ZERO_UNDEF
2519 ? (unsigned)AMDGPU::G_AMDGPU_FFBL_B32
2520 : Opc;
2521 unsigned Idx = NewOpc == AMDGPU::G_AMDGPU_FFBH_U32;
2522 auto X = B.buildInstr(NewOpc, {S32}, {SrcRegs[Idx]});
2523 auto Y = B.buildInstr(NewOpc, {S32}, {SrcRegs[Idx ^ 1]});
2524 unsigned AddOpc =
2525 Opc == AMDGPU::G_CTLZ_ZERO_UNDEF || Opc == AMDGPU::G_CTTZ_ZERO_UNDEF
2526 ? AMDGPU::G_ADD
2527 : AMDGPU::G_UADDSAT;
2528 Y = B.buildInstr(AddOpc, {S32}, {Y, B.buildConstant(S32, 32)});
2529 Register DstReg = MI.getOperand(0).getReg();
2530 B.buildUMin(DstReg, X, Y);
2531 MI.eraseFromParent();
2532 return;
2533 }
2534 case AMDGPU::G_SEXT:
2535 case AMDGPU::G_ZEXT:
2536 case AMDGPU::G_ANYEXT: {
2537 Register SrcReg = MI.getOperand(1).getReg();
2538 LLT SrcTy = MRI.getType(SrcReg);
2539 const bool Signed = Opc == AMDGPU::G_SEXT;
2540
2541 assert(empty(OpdMapper.getVRegs(1)));
2542
2543 MachineIRBuilder B(MI);
2544 const RegisterBank *SrcBank =
2545 OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
2546
2547 Register DstReg = MI.getOperand(0).getReg();
2548 LLT DstTy = MRI.getType(DstReg);
2549 if (DstTy.isScalar() &&
2550 SrcBank != &AMDGPU::SGPRRegBank &&
2551 SrcBank != &AMDGPU::VCCRegBank &&
2552 // FIXME: Should handle any type that rounds to s64 when irregular
2553 // breakdowns are supported.
2554 DstTy.getSizeInBits() == 64 &&
2555 SrcTy.getSizeInBits() <= 32) {
2556 SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
2557
2558 // Extend the source to the 32-bit low half, then extend into the high half.
2559 if (Signed) {
2560 // TODO: Should really be buildSExtOrCopy
2561 B.buildSExtOrTrunc(DefRegs[0], SrcReg);
2562 } else if (Opc == AMDGPU::G_ZEXT) {
2563 B.buildZExtOrTrunc(DefRegs[0], SrcReg);
2564 } else {
2565 B.buildAnyExtOrTrunc(DefRegs[0], SrcReg);
2566 }
2567
2568 extendLow32IntoHigh32(B, DefRegs[1], DefRegs[0], Opc, *SrcBank);
2569 MRI.setRegBank(DstReg, *SrcBank);
2570 MI.eraseFromParent();
2571 return;
2572 }
2573
2574 if (SrcTy != LLT::scalar(1))
2575 return;
2576
2577 // It is not legal to have a legalization artifact with a VCC source. Rather
2578 // than introducing a copy, insert the select that such a copy would have
2579 // been selected to.
2580 if (SrcBank == &AMDGPU::VCCRegBank) {
2581 SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
2582
2583 const RegisterBank *DstBank = &AMDGPU::VGPRRegBank;
2584
2585 unsigned DstSize = DstTy.getSizeInBits();
2586 // 64-bit select is SGPR only.
2587 const bool UseSel64 = DstSize > 32 &&
2588 SrcBank->getID() == AMDGPU::SGPRRegBankID;
2589
2590 // TODO: Should s16 select be legal?
2591 LLT SelType = UseSel64 ? LLT::scalar(64) : LLT::scalar(32);
2592 auto True = B.buildConstant(SelType, Signed ?
-1 : 1); 2593 auto False = B.buildConstant(SelType, 0); 2594 2595 MRI.setRegBank(True.getReg(0), *DstBank); 2596 MRI.setRegBank(False.getReg(0), *DstBank); 2597 MRI.setRegBank(DstReg, *DstBank); 2598 2599 if (DstSize > 32) { 2600 B.buildSelect(DefRegs[0], SrcReg, True, False); 2601 extendLow32IntoHigh32(B, DefRegs[1], DefRegs[0], Opc, *SrcBank, true); 2602 } else if (DstSize < 32) { 2603 auto Sel = B.buildSelect(SelType, SrcReg, True, False); 2604 MRI.setRegBank(Sel.getReg(0), *DstBank); 2605 B.buildTrunc(DstReg, Sel); 2606 } else { 2607 B.buildSelect(DstReg, SrcReg, True, False); 2608 } 2609 2610 MI.eraseFromParent(); 2611 return; 2612 } 2613 2614 break; 2615 } 2616 case AMDGPU::G_BUILD_VECTOR: 2617 case AMDGPU::G_BUILD_VECTOR_TRUNC: { 2618 Register DstReg = MI.getOperand(0).getReg(); 2619 LLT DstTy = MRI.getType(DstReg); 2620 if (DstTy != LLT::fixed_vector(2, 16)) 2621 break; 2622 2623 assert(MI.getNumOperands() == 3 && OpdMapper.getVRegs(0).empty()); 2624 substituteSimpleCopyRegs(OpdMapper, 1); 2625 substituteSimpleCopyRegs(OpdMapper, 2); 2626 2627 const RegisterBank *DstBank = 2628 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; 2629 if (DstBank == &AMDGPU::SGPRRegBank) 2630 break; // Can use S_PACK_* instructions. 2631 2632 MachineIRBuilder B(MI); 2633 2634 Register Lo = MI.getOperand(1).getReg(); 2635 Register Hi = MI.getOperand(2).getReg(); 2636 const LLT S32 = LLT::scalar(32); 2637 2638 const RegisterBank *BankLo = 2639 OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank; 2640 const RegisterBank *BankHi = 2641 OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank; 2642 2643 Register ZextLo; 2644 Register ShiftHi; 2645 2646 if (Opc == AMDGPU::G_BUILD_VECTOR) { 2647 ZextLo = B.buildZExt(S32, Lo).getReg(0); 2648 MRI.setRegBank(ZextLo, *BankLo); 2649 2650 Register ZextHi = B.buildZExt(S32, Hi).getReg(0); 2651 MRI.setRegBank(ZextHi, *BankHi); 2652 2653 auto ShiftAmt = B.buildConstant(S32, 16); 2654 MRI.setRegBank(ShiftAmt.getReg(0), *BankHi); 2655 2656 ShiftHi = B.buildShl(S32, ZextHi, ShiftAmt).getReg(0); 2657 MRI.setRegBank(ShiftHi, *BankHi); 2658 } else { 2659 Register MaskLo = B.buildConstant(S32, 0xffff).getReg(0); 2660 MRI.setRegBank(MaskLo, *BankLo); 2661 2662 auto ShiftAmt = B.buildConstant(S32, 16); 2663 MRI.setRegBank(ShiftAmt.getReg(0), *BankHi); 2664 2665 ShiftHi = B.buildShl(S32, Hi, ShiftAmt).getReg(0); 2666 MRI.setRegBank(ShiftHi, *BankHi); 2667 2668 ZextLo = B.buildAnd(S32, Lo, MaskLo).getReg(0); 2669 MRI.setRegBank(ZextLo, *BankLo); 2670 } 2671 2672 auto Or = B.buildOr(S32, ZextLo, ShiftHi); 2673 MRI.setRegBank(Or.getReg(0), *DstBank); 2674 2675 B.buildBitcast(DstReg, Or); 2676 MI.eraseFromParent(); 2677 return; 2678 } 2679 case AMDGPU::G_EXTRACT_VECTOR_ELT: { 2680 SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0)); 2681 2682 assert(OpdMapper.getVRegs(1).empty() && OpdMapper.getVRegs(2).empty()); 2683 2684 Register DstReg = MI.getOperand(0).getReg(); 2685 Register SrcReg = MI.getOperand(1).getReg(); 2686 2687 const LLT S32 = LLT::scalar(32); 2688 LLT DstTy = MRI.getType(DstReg); 2689 LLT SrcTy = MRI.getType(SrcReg); 2690 2691 if (foldExtractEltToCmpSelect(MI, MRI, OpdMapper)) 2692 return; 2693 2694 MachineIRBuilder B(MI); 2695 2696 const ValueMapping &DstMapping 2697 = OpdMapper.getInstrMapping().getOperandMapping(0); 2698 const RegisterBank *DstBank = DstMapping.BreakDown[0].RegBank; 2699 const RegisterBank *SrcBank = 2700 OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank; 2701 const 
RegisterBank *IdxBank = 2702 OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank; 2703 2704 Register BaseIdxReg; 2705 unsigned ConstOffset; 2706 std::tie(BaseIdxReg, ConstOffset) = 2707 AMDGPU::getBaseWithConstantOffset(MRI, MI.getOperand(2).getReg()); 2708 2709 // See if the index is an add of a constant which will be foldable by moving 2710 // the base register of the index later if this is going to be executed in a 2711 // waterfall loop. This is essentially to reassociate the add of a constant 2712 // with the readfirstlane. 2713 bool ShouldMoveIndexIntoLoop = IdxBank != &AMDGPU::SGPRRegBank && 2714 ConstOffset > 0 && 2715 ConstOffset < SrcTy.getNumElements(); 2716 2717 // Move the base register. We'll re-insert the add later. 2718 if (ShouldMoveIndexIntoLoop) 2719 MI.getOperand(2).setReg(BaseIdxReg); 2720 2721 // If this is a VGPR result only because the index was a VGPR result, the 2722 // actual indexing will be done on the SGPR source vector, which will 2723 // produce a scalar result. We need to copy to the VGPR result inside the 2724 // waterfall loop. 2725 const bool NeedCopyToVGPR = DstBank == &AMDGPU::VGPRRegBank && 2726 SrcBank == &AMDGPU::SGPRRegBank; 2727 if (DstRegs.empty()) { 2728 applyDefaultMapping(OpdMapper); 2729 2730 executeInWaterfallLoop(MI, MRI, { 2 }); 2731 2732 if (NeedCopyToVGPR) { 2733 // We don't want a phi for this temporary reg. 2734 Register TmpReg = MRI.createGenericVirtualRegister(DstTy); 2735 MRI.setRegBank(TmpReg, AMDGPU::SGPRRegBank); 2736 MI.getOperand(0).setReg(TmpReg); 2737 B.setInsertPt(*MI.getParent(), ++MI.getIterator()); 2738 2739 // Use a v_mov_b32 here to make the exec dependency explicit. 2740 buildVCopy(B, DstReg, TmpReg); 2741 } 2742 2743 // Re-insert the constant offset add inside the waterfall loop. 2744 if (ShouldMoveIndexIntoLoop) 2745 reinsertVectorIndexAdd(B, MI, 2, ConstOffset); 2746 2747 return; 2748 } 2749 2750 assert(DstTy.getSizeInBits() == 64); 2751 2752 LLT Vec32 = LLT::fixed_vector(2 * SrcTy.getNumElements(), 32); 2753 2754 auto CastSrc = B.buildBitcast(Vec32, SrcReg); 2755 auto One = B.buildConstant(S32, 1); 2756 2757 MachineBasicBlock::iterator MII = MI.getIterator(); 2758 2759 // Split the vector index into 32-bit pieces. Prepare to move all of the 2760 // new instructions into a waterfall loop if necessary. 2761 // 2762 // Don't put the bitcast or constant in the loop. 2763 MachineInstrSpan Span(MII, &B.getMBB()); 2764 2765 // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1). 2766 auto IdxLo = B.buildShl(S32, BaseIdxReg, One); 2767 auto IdxHi = B.buildAdd(S32, IdxLo, One); 2768 2769 auto Extract0 = B.buildExtractVectorElement(DstRegs[0], CastSrc, IdxLo); 2770 auto Extract1 = B.buildExtractVectorElement(DstRegs[1], CastSrc, IdxHi); 2771 2772 MRI.setRegBank(DstReg, *DstBank); 2773 MRI.setRegBank(CastSrc.getReg(0), *SrcBank); 2774 MRI.setRegBank(One.getReg(0), AMDGPU::SGPRRegBank); 2775 MRI.setRegBank(IdxLo.getReg(0), AMDGPU::SGPRRegBank); 2776 MRI.setRegBank(IdxHi.getReg(0), AMDGPU::SGPRRegBank); 2777 2778 SmallSet<Register, 4> OpsToWaterfall; 2779 if (!collectWaterfallOperands(OpsToWaterfall, MI, MRI, { 2 })) { 2780 MI.eraseFromParent(); 2781 return; 2782 } 2783 2784 // Remove the original instruction to avoid potentially confusing the 2785 // waterfall loop logic. 
2786 B.setInstr(*Span.begin()); 2787 MI.eraseFromParent(); 2788 executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()), 2789 OpsToWaterfall, MRI); 2790 2791 if (NeedCopyToVGPR) { 2792 MachineBasicBlock *LoopBB = Extract1->getParent(); 2793 Register TmpReg0 = MRI.createGenericVirtualRegister(S32); 2794 Register TmpReg1 = MRI.createGenericVirtualRegister(S32); 2795 MRI.setRegBank(TmpReg0, AMDGPU::SGPRRegBank); 2796 MRI.setRegBank(TmpReg1, AMDGPU::SGPRRegBank); 2797 2798 Extract0->getOperand(0).setReg(TmpReg0); 2799 Extract1->getOperand(0).setReg(TmpReg1); 2800 2801 B.setInsertPt(*LoopBB, ++Extract1->getIterator()); 2802 2803 buildVCopy(B, DstRegs[0], TmpReg0); 2804 buildVCopy(B, DstRegs[1], TmpReg1); 2805 } 2806 2807 if (ShouldMoveIndexIntoLoop) 2808 reinsertVectorIndexAdd(B, *IdxLo, 1, ConstOffset); 2809 2810 return; 2811 } 2812 case AMDGPU::G_INSERT_VECTOR_ELT: { 2813 SmallVector<Register, 2> InsRegs(OpdMapper.getVRegs(2)); 2814 2815 Register DstReg = MI.getOperand(0).getReg(); 2816 LLT VecTy = MRI.getType(DstReg); 2817 2818 assert(OpdMapper.getVRegs(0).empty()); 2819 assert(OpdMapper.getVRegs(3).empty()); 2820 2821 if (substituteSimpleCopyRegs(OpdMapper, 1)) 2822 MRI.setType(MI.getOperand(1).getReg(), VecTy); 2823 2824 if (foldInsertEltToCmpSelect(MI, MRI, OpdMapper)) 2825 return; 2826 2827 const RegisterBank *IdxBank = 2828 OpdMapper.getInstrMapping().getOperandMapping(3).BreakDown[0].RegBank; 2829 2830 Register SrcReg = MI.getOperand(1).getReg(); 2831 Register InsReg = MI.getOperand(2).getReg(); 2832 LLT InsTy = MRI.getType(InsReg); 2833 (void)InsTy; 2834 2835 Register BaseIdxReg; 2836 unsigned ConstOffset; 2837 std::tie(BaseIdxReg, ConstOffset) = 2838 AMDGPU::getBaseWithConstantOffset(MRI, MI.getOperand(3).getReg()); 2839 2840 // See if the index is an add of a constant which will be foldable by moving 2841 // the base register of the index later if this is going to be executed in a 2842 // waterfall loop. This is essentially to reassociate the add of a constant 2843 // with the readfirstlane. 2844 bool ShouldMoveIndexIntoLoop = IdxBank != &AMDGPU::SGPRRegBank && 2845 ConstOffset > 0 && 2846 ConstOffset < VecTy.getNumElements(); 2847 2848 // Move the base register. We'll re-insert the add later. 2849 if (ShouldMoveIndexIntoLoop) 2850 MI.getOperand(3).setReg(BaseIdxReg); 2851 2852 2853 if (InsRegs.empty()) { 2854 executeInWaterfallLoop(MI, MRI, { 3 }); 2855 2856 // Re-insert the constant offset add inside the waterfall loop. 2857 if (ShouldMoveIndexIntoLoop) { 2858 MachineIRBuilder B(MI); 2859 reinsertVectorIndexAdd(B, MI, 3, ConstOffset); 2860 } 2861 2862 return; 2863 } 2864 2865 2866 assert(InsTy.getSizeInBits() == 64); 2867 2868 const LLT S32 = LLT::scalar(32); 2869 LLT Vec32 = LLT::fixed_vector(2 * VecTy.getNumElements(), 32); 2870 2871 MachineIRBuilder B(MI); 2872 auto CastSrc = B.buildBitcast(Vec32, SrcReg); 2873 auto One = B.buildConstant(S32, 1); 2874 2875 // Split the vector index into 32-bit pieces. Prepare to move all of the 2876 // new instructions into a waterfall loop if necessary. 2877 // 2878 // Don't put the bitcast or constant in the loop. 2879 MachineInstrSpan Span(MachineBasicBlock::iterator(&MI), &B.getMBB()); 2880 2881 // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1). 
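// (An s64 element at index i of the original vector occupies 32-bit elements
// 2*i and 2*i+1 of the bitcast Vec32, hence IdxLo = i << 1 and
// IdxHi = IdxLo + 1.)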
2882 auto IdxLo = B.buildShl(S32, BaseIdxReg, One);
2883 auto IdxHi = B.buildAdd(S32, IdxLo, One);
2884
2885 auto InsLo = B.buildInsertVectorElement(Vec32, CastSrc, InsRegs[0], IdxLo);
2886 auto InsHi = B.buildInsertVectorElement(Vec32, InsLo, InsRegs[1], IdxHi);
2887
2888 const RegisterBank *DstBank =
2889 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2890 const RegisterBank *SrcBank =
2891 OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
2892 const RegisterBank *InsSrcBank =
2893 OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
2894
2895 MRI.setRegBank(InsReg, *InsSrcBank);
2896 MRI.setRegBank(CastSrc.getReg(0), *SrcBank);
2897 MRI.setRegBank(InsLo.getReg(0), *DstBank);
2898 MRI.setRegBank(InsHi.getReg(0), *DstBank);
2899 MRI.setRegBank(One.getReg(0), AMDGPU::SGPRRegBank);
2900 MRI.setRegBank(IdxLo.getReg(0), AMDGPU::SGPRRegBank);
2901 MRI.setRegBank(IdxHi.getReg(0), AMDGPU::SGPRRegBank);
2902
2903
2904 SmallSet<Register, 4> OpsToWaterfall;
2905 if (!collectWaterfallOperands(OpsToWaterfall, MI, MRI, { 3 })) {
2906 B.setInsertPt(B.getMBB(), MI);
2907 B.buildBitcast(DstReg, InsHi);
2908 MI.eraseFromParent();
2909 return;
2910 }
2911
2912 B.setInstr(*Span.begin());
2913 MI.eraseFromParent();
2914
2915 // Figure out the point after the waterfall loop before mangling the control
2916 // flow.
2917 executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()),
2918 OpsToWaterfall, MRI);
2919
2920 // The insertion point is now right after the original instruction.
2921 //
2922 // Keep the bitcast to the original vector type out of the loop. Doing this
2923 // saves an extra phi we don't need inside the loop.
2924 B.buildBitcast(DstReg, InsHi);
2925
2926 // Re-insert the constant offset add inside the waterfall loop.
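// (The waterfall loop made only the base index lane-uniform; adding the
// constant back inside the loop, after the readfirstlane, completes the
// reassociation described above.)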
2927 if (ShouldMoveIndexIntoLoop) 2928 reinsertVectorIndexAdd(B, *IdxLo, 1, ConstOffset); 2929 2930 return; 2931 } 2932 case AMDGPU::G_AMDGPU_BUFFER_LOAD: 2933 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT: 2934 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT: 2935 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE: 2936 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE: 2937 case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT: 2938 case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16: 2939 case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT: 2940 case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16: 2941 case AMDGPU::G_AMDGPU_BUFFER_STORE: 2942 case AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE: 2943 case AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT: 2944 case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT: 2945 case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16: 2946 case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT: 2947 case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16: { 2948 applyDefaultMapping(OpdMapper); 2949 executeInWaterfallLoop(MI, MRI, {1, 4}); 2950 return; 2951 } 2952 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP: 2953 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD: 2954 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB: 2955 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN: 2956 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN: 2957 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX: 2958 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX: 2959 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND: 2960 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR: 2961 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR: 2962 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC: 2963 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC: { 2964 applyDefaultMapping(OpdMapper); 2965 executeInWaterfallLoop(MI, MRI, {2, 5}); 2966 return; 2967 } 2968 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD: 2969 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN: 2970 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX: { 2971 applyDefaultMapping(OpdMapper); 2972 executeInWaterfallLoop(MI, MRI, {2, 5}); 2973 return; 2974 } 2975 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP: { 2976 applyDefaultMapping(OpdMapper); 2977 executeInWaterfallLoop(MI, MRI, {3, 6}); 2978 return; 2979 } 2980 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD: { 2981 applyMappingSBufferLoad(OpdMapper); 2982 return; 2983 } 2984 case AMDGPU::G_INTRINSIC: { 2985 switch (MI.getIntrinsicID()) { 2986 case Intrinsic::amdgcn_readlane: { 2987 substituteSimpleCopyRegs(OpdMapper, 2); 2988 2989 assert(OpdMapper.getVRegs(0).empty()); 2990 assert(OpdMapper.getVRegs(3).empty()); 2991 2992 // Make sure the index is an SGPR. It doesn't make sense to run this in a 2993 // waterfall loop, so assume it's a uniform value. 2994 constrainOpWithReadfirstlane(MI, MRI, 3); // Index 2995 return; 2996 } 2997 case Intrinsic::amdgcn_writelane: { 2998 assert(OpdMapper.getVRegs(0).empty()); 2999 assert(OpdMapper.getVRegs(2).empty()); 3000 assert(OpdMapper.getVRegs(3).empty()); 3001 3002 substituteSimpleCopyRegs(OpdMapper, 4); // VGPR input val 3003 constrainOpWithReadfirstlane(MI, MRI, 2); // Source value 3004 constrainOpWithReadfirstlane(MI, MRI, 3); // Index 3005 return; 3006 } 3007 case Intrinsic::amdgcn_interp_p1: 3008 case Intrinsic::amdgcn_interp_p2: 3009 case Intrinsic::amdgcn_interp_mov: 3010 case Intrinsic::amdgcn_interp_p1_f16: 3011 case Intrinsic::amdgcn_interp_p2_f16: 3012 case Intrinsic::amdgcn_lds_param_load: { 3013 applyDefaultMapping(OpdMapper); 3014 3015 // Readlane for m0 value, which is always the last operand. 3016 // FIXME: Should this be a waterfall loop instead? 
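// (constrainOpWithReadfirstlane rewrites the operand to something like
//   %u:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), %v
// as a sketch; no readfirstlane is inserted if the operand is already in
// the SGPR bank.)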
3017 constrainOpWithReadfirstlane(MI, MRI, MI.getNumOperands() - 1); // Index
3018 return;
3019 }
3020 case Intrinsic::amdgcn_interp_inreg_p10:
3021 case Intrinsic::amdgcn_interp_inreg_p2:
3022 case Intrinsic::amdgcn_interp_inreg_p10_f16:
3023 case Intrinsic::amdgcn_interp_inreg_p2_f16:
3024 applyDefaultMapping(OpdMapper);
3025 return;
3026 case Intrinsic::amdgcn_permlane16:
3027 case Intrinsic::amdgcn_permlanex16: {
3028 // Doing a waterfall loop over these wouldn't make any sense.
3029 substituteSimpleCopyRegs(OpdMapper, 2);
3030 substituteSimpleCopyRegs(OpdMapper, 3);
3031 constrainOpWithReadfirstlane(MI, MRI, 4);
3032 constrainOpWithReadfirstlane(MI, MRI, 5);
3033 return;
3034 }
3035 case Intrinsic::amdgcn_sbfe:
3036 applyMappingBFE(OpdMapper, true);
3037 return;
3038 case Intrinsic::amdgcn_ubfe:
3039 applyMappingBFE(OpdMapper, false);
3040 return;
3041 case Intrinsic::amdgcn_ballot:
3042 // Use default handling and insert copy to vcc source.
3043 break;
3044 }
3045 break;
3046 }
3047 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
3048 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16:
3049 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE:
3050 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16: {
3051 const AMDGPU::RsrcIntrinsic *RSrcIntrin
3052 = AMDGPU::lookupRsrcIntrinsic(MI.getIntrinsicID());
3053 assert(RSrcIntrin && RSrcIntrin->IsImage);
3054 // Non-images can have complications from operands that allow both SGPR
3055 // and VGPR. For now it's too complicated to figure out the final opcode
3056 // to derive the register bank from the MCInstrDesc.
3057 applyMappingImage(MI, OpdMapper, MRI, RSrcIntrin->RsrcArg);
3058 return;
3059 }
3060 case AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY: {
3061 unsigned N = MI.getNumExplicitOperands() - 2;
3062 applyDefaultMapping(OpdMapper);
3063 executeInWaterfallLoop(MI, MRI, { N });
3064 return;
3065 }
3066 case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: {
3067 auto IntrID = MI.getIntrinsicID();
3068 switch (IntrID) {
3069 case Intrinsic::amdgcn_ds_ordered_add:
3070 case Intrinsic::amdgcn_ds_ordered_swap: {
3071 // This is only allowed to execute with 1 lane, so readfirstlane is safe.
3072 assert(OpdMapper.getVRegs(0).empty());
3073 substituteSimpleCopyRegs(OpdMapper, 3);
3074 constrainOpWithReadfirstlane(MI, MRI, 2); // M0
3075 return;
3076 }
3077 case Intrinsic::amdgcn_ds_gws_init:
3078 case Intrinsic::amdgcn_ds_gws_barrier:
3079 case Intrinsic::amdgcn_ds_gws_sema_br: {
3080 // Only the first lane executes, so readfirstlane is safe.
3081 substituteSimpleCopyRegs(OpdMapper, 1);
3082 constrainOpWithReadfirstlane(MI, MRI, 2); // M0
3083 return;
3084 }
3085 case Intrinsic::amdgcn_ds_gws_sema_v:
3086 case Intrinsic::amdgcn_ds_gws_sema_p:
3087 case Intrinsic::amdgcn_ds_gws_sema_release_all: {
3088 // Only the first lane executes, so readfirstlane is safe.
3089 constrainOpWithReadfirstlane(MI, MRI, 1); // M0
3090 return;
3091 }
3092 case Intrinsic::amdgcn_ds_append:
3093 case Intrinsic::amdgcn_ds_consume: {
3094 constrainOpWithReadfirstlane(MI, MRI, 2); // M0
3095 return;
3096 }
3097 case Intrinsic::amdgcn_s_sendmsg:
3098 case Intrinsic::amdgcn_s_sendmsghalt: {
3099 // FIXME: Should this use a waterfall loop?
3100 constrainOpWithReadfirstlane(MI, MRI, 2); // M0
3101 return;
3102 }
3103 case Intrinsic::amdgcn_s_setreg: {
3104 constrainOpWithReadfirstlane(MI, MRI, 2);
3105 return;
3106 }
3107 case Intrinsic::amdgcn_raw_buffer_load_lds: {
3108 applyDefaultMapping(OpdMapper);
3109 constrainOpWithReadfirstlane(MI, MRI, 1); // rsrc
3110 constrainOpWithReadfirstlane(MI, MRI, 2); // M0
3111 constrainOpWithReadfirstlane(MI, MRI, 5); // soffset
3112 return;
3113 }
3114 case Intrinsic::amdgcn_struct_buffer_load_lds: {
3115 applyDefaultMapping(OpdMapper);
3116 constrainOpWithReadfirstlane(MI, MRI, 1); // rsrc
3117 constrainOpWithReadfirstlane(MI, MRI, 2); // M0
3118 constrainOpWithReadfirstlane(MI, MRI, 6); // soffset
3119 return;
3120 }
3121 case Intrinsic::amdgcn_global_load_lds: {
3122 applyDefaultMapping(OpdMapper);
3123 constrainOpWithReadfirstlane(MI, MRI, 2);
3124 return;
3125 }
3126 case Intrinsic::amdgcn_lds_direct_load: {
3127 applyDefaultMapping(OpdMapper);
3128 // Readlane for m0 value, which is always the last operand.
3129 constrainOpWithReadfirstlane(MI, MRI, MI.getNumOperands() - 1); // Index
3130 return;
3131 }
3132 case Intrinsic::amdgcn_exp_row:
3133 applyDefaultMapping(OpdMapper);
3134 constrainOpWithReadfirstlane(MI, MRI, 8); // M0
3135 return;
3136 default: {
3137 if (const AMDGPU::RsrcIntrinsic *RSrcIntrin =
3138 AMDGPU::lookupRsrcIntrinsic(IntrID)) {
3139 // Non-images can have complications from operands that allow both SGPR
3140 // and VGPR. For now it's too complicated to figure out the final opcode
3141 // to derive the register bank from the MCInstrDesc.
3142 if (RSrcIntrin->IsImage) {
3143 applyMappingImage(MI, OpdMapper, MRI, RSrcIntrin->RsrcArg);
3144 return;
3145 }
3146 }
3147
3148 break;
3149 }
3150 }
3151 break;
3152 }
3153 case AMDGPU::G_SI_CALL: {
3154 // Use a set to avoid extra readfirstlanes in the case where multiple
3155 // operands are the same register.
3156 SmallSet<Register, 4> SGPROperandRegs;
3157
3158 if (!collectWaterfallOperands(SGPROperandRegs, MI, MRI, {1}))
3159 break;
3160
3161 // Move all copies to physical SGPRs that are used by the call instruction
3162 // into the loop block, searching backwards from the call until the
3163 // ADJCALLSTACKUP.
3164 unsigned FrameSetupOpcode = AMDGPU::ADJCALLSTACKUP;
3165 unsigned FrameDestroyOpcode = AMDGPU::ADJCALLSTACKDOWN;
3166
3167 // Move all non-copies before the copies, so that a complete range can be
3168 // moved into the waterfall loop.
3169 SmallVector<MachineInstr *, 4> NonCopyInstrs;
3170 // Count of NonCopyInstrs found up to the current LastCopy.
3171 unsigned NonCopyInstrsLen = 0;
3172 MachineBasicBlock::iterator Start(&MI);
3173 MachineBasicBlock::iterator LastCopy = Start;
3174 MachineBasicBlock *MBB = MI.getParent();
3175 const SIMachineFunctionInfo *Info =
3176 MBB->getParent()->getInfo<SIMachineFunctionInfo>();
3177 while (Start->getOpcode() != FrameSetupOpcode) {
3178 --Start;
3179 bool IsCopy = false;
3180 if (Start->getOpcode() == AMDGPU::COPY) {
3181 auto &Dst = Start->getOperand(0);
3182 if (Dst.isReg()) {
3183 Register Reg = Dst.getReg();
3184 if (Reg.isPhysical() && MI.readsRegister(Reg, TRI)) {
3185 IsCopy = true;
3186 } else {
3187 // Also move the copy from the scratch rsrc descriptor into the loop
3188 // to allow it to be optimized away.
            auto &Src = Start->getOperand(1);
            if (Src.isReg()) {
              Reg = Src.getReg();
              IsCopy = Info->getScratchRSrcReg() == Reg;
            }
          }
        }
      }

      if (IsCopy) {
        LastCopy = Start;
        NonCopyInstrsLen = NonCopyInstrs.size();
      } else {
        NonCopyInstrs.push_back(&*Start);
      }
    }
    NonCopyInstrs.resize(NonCopyInstrsLen);

    for (auto *NonCopy : reverse(NonCopyInstrs)) {
      MBB->splice(LastCopy, MBB, NonCopy->getIterator());
    }
    Start = LastCopy;

    // Do the same for copies after the loop.
    NonCopyInstrs.clear();
    NonCopyInstrsLen = 0;
    MachineBasicBlock::iterator End(&MI);
    LastCopy = End;
    while (End->getOpcode() != FrameDestroyOpcode) {
      ++End;
      bool IsCopy = false;
      if (End->getOpcode() == AMDGPU::COPY) {
        auto &Src = End->getOperand(1);
        if (Src.isReg()) {
          Register Reg = Src.getReg();
          IsCopy = Reg.isPhysical() && MI.modifiesRegister(Reg, TRI);
        }
      }

      if (IsCopy) {
        LastCopy = End;
        NonCopyInstrsLen = NonCopyInstrs.size();
      } else {
        NonCopyInstrs.push_back(&*End);
      }
    }
    NonCopyInstrs.resize(NonCopyInstrsLen);

    End = LastCopy;
    ++LastCopy;
    for (auto *NonCopy : reverse(NonCopyInstrs)) {
      MBB->splice(LastCopy, MBB, NonCopy->getIterator());
    }

    ++End;
    MachineIRBuilder B(*Start);
    executeInWaterfallLoop(B, make_range(Start, End), SGPROperandRegs, MRI);
    break;
  }
  case AMDGPU::G_LOAD:
  case AMDGPU::G_ZEXTLOAD:
  case AMDGPU::G_SEXTLOAD: {
    if (applyMappingLoad(MI, OpdMapper, MRI))
      return;
    break;
  }
  case AMDGPU::G_DYN_STACKALLOC:
    applyMappingDynStackAlloc(MI, OpdMapper, MRI);
    return;
  case AMDGPU::G_SBFX:
    applyMappingBFE(OpdMapper, /*Signed*/ true);
    return;
  case AMDGPU::G_UBFX:
    applyMappingBFE(OpdMapper, /*Signed*/ false);
    return;
  case AMDGPU::G_AMDGPU_MAD_U64_U32:
  case AMDGPU::G_AMDGPU_MAD_I64_I32:
    applyMappingMAD_64_32(OpdMapper);
    return;
  default:
    break;
  }

  return applyDefaultMapping(OpdMapper);
}

// sgpr, sgpr -> sgpr
// vgpr, sgpr -> vgpr
// vgpr, agpr -> vgpr
// agpr, agpr -> agpr
// agpr, sgpr -> vgpr
static unsigned regBankUnion(unsigned RB0, unsigned RB1) {
  if (RB0 == AMDGPU::InvalidRegBankID)
    return RB1;
  if (RB1 == AMDGPU::InvalidRegBankID)
    return RB0;

  if (RB0 == AMDGPU::SGPRRegBankID && RB1 == AMDGPU::SGPRRegBankID)
    return AMDGPU::SGPRRegBankID;

  if (RB0 == AMDGPU::AGPRRegBankID && RB1 == AMDGPU::AGPRRegBankID)
    return AMDGPU::AGPRRegBankID;

  return AMDGPU::VGPRRegBankID;
}

static unsigned regBankBoolUnion(unsigned RB0, unsigned RB1) {
  if (RB0 == AMDGPU::InvalidRegBankID)
    return RB1;
  if (RB1 == AMDGPU::InvalidRegBankID)
    return RB0;

  // vcc, vcc -> vcc
  // vcc, sgpr -> vcc
  // vcc, vgpr -> vcc
  if (RB0 == AMDGPU::VCCRegBankID || RB1 == AMDGPU::VCCRegBankID)
    return AMDGPU::VCCRegBankID;

  // Any remaining sgpr/vgpr/agpr combination resolves via regBankUnion.
  return regBankUnion(RB0, RB1);
}

unsigned AMDGPURegisterBankInfo::getMappingType(const MachineRegisterInfo &MRI,
                                                const MachineInstr &MI) const {
  unsigned RegBank = AMDGPU::InvalidRegBankID;

  for (const MachineOperand &MO : MI.operands()) {
    if (!MO.isReg())
      continue;
    Register Reg = MO.getReg();
    if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) {
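      // regBankUnion only moves toward VGPR, so once any operand forces VGPR
      // the result cannot change and the scan below stops early.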
3319 RegBank = regBankUnion(RegBank, Bank->getID()); 3320 if (RegBank == AMDGPU::VGPRRegBankID) 3321 break; 3322 } 3323 } 3324 3325 return RegBank; 3326 } 3327 3328 bool AMDGPURegisterBankInfo::isSALUMapping(const MachineInstr &MI) const { 3329 const MachineFunction &MF = *MI.getParent()->getParent(); 3330 const MachineRegisterInfo &MRI = MF.getRegInfo(); 3331 for (const MachineOperand &MO : MI.operands()) { 3332 if (!MO.isReg()) 3333 continue; 3334 Register Reg = MO.getReg(); 3335 if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) { 3336 if (Bank->getID() != AMDGPU::SGPRRegBankID) 3337 return false; 3338 } 3339 } 3340 return true; 3341 } 3342 3343 const RegisterBankInfo::InstructionMapping & 3344 AMDGPURegisterBankInfo::getDefaultMappingSOP(const MachineInstr &MI) const { 3345 const MachineFunction &MF = *MI.getParent()->getParent(); 3346 const MachineRegisterInfo &MRI = MF.getRegInfo(); 3347 SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands()); 3348 3349 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { 3350 const MachineOperand &SrcOp = MI.getOperand(i); 3351 if (!SrcOp.isReg()) 3352 continue; 3353 3354 unsigned Size = getSizeInBits(SrcOp.getReg(), MRI, *TRI); 3355 OpdsMapping[i] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); 3356 } 3357 return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping), 3358 MI.getNumOperands()); 3359 } 3360 3361 const RegisterBankInfo::InstructionMapping & 3362 AMDGPURegisterBankInfo::getDefaultMappingVOP(const MachineInstr &MI) const { 3363 const MachineFunction &MF = *MI.getParent()->getParent(); 3364 const MachineRegisterInfo &MRI = MF.getRegInfo(); 3365 SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands()); 3366 3367 // Even though we technically could use SGPRs, this would require knowledge of 3368 // the constant bus restriction. Force all sources to VGPR (except for VCC). 3369 // 3370 // TODO: Unary ops are trivially OK, so accept SGPRs? 3371 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { 3372 const MachineOperand &Src = MI.getOperand(i); 3373 if (!Src.isReg()) 3374 continue; 3375 3376 unsigned Size = getSizeInBits(Src.getReg(), MRI, *TRI); 3377 unsigned BankID = Size == 1 ? AMDGPU::VCCRegBankID : AMDGPU::VGPRRegBankID; 3378 OpdsMapping[i] = AMDGPU::getValueMapping(BankID, Size); 3379 } 3380 3381 return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping), 3382 MI.getNumOperands()); 3383 } 3384 3385 const RegisterBankInfo::InstructionMapping & 3386 AMDGPURegisterBankInfo::getDefaultMappingAllVGPR(const MachineInstr &MI) const { 3387 const MachineFunction &MF = *MI.getParent()->getParent(); 3388 const MachineRegisterInfo &MRI = MF.getRegInfo(); 3389 SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands()); 3390 3391 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) { 3392 const MachineOperand &Op = MI.getOperand(I); 3393 if (!Op.isReg()) 3394 continue; 3395 3396 unsigned Size = getSizeInBits(Op.getReg(), MRI, *TRI); 3397 OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); 3398 } 3399 3400 return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping), 3401 MI.getNumOperands()); 3402 } 3403 3404 const RegisterBankInfo::InstructionMapping & 3405 AMDGPURegisterBankInfo::getImageMapping(const MachineRegisterInfo &MRI, 3406 const MachineInstr &MI, 3407 int RsrcIdx) const { 3408 // The reported argument index is relative to the IR intrinsic call arguments, 3409 // so we need to shift by the number of defs and the intrinsic ID. 
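  // For example (illustrative operand layout, not checked here): an image
  // load such as
  //   %d:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrin-id, %dmask, ..., %rsrc, ...
  // has one def plus the intrinsic ID operand, so IR argument i corresponds
  // to machine operand i + NumExplicitDefs + 1.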
  RsrcIdx += MI.getNumExplicitDefs() + 1;

  const int NumOps = MI.getNumOperands();
  SmallVector<const ValueMapping *, 8> OpdsMapping(NumOps);

  // TODO: Should packed/unpacked D16 difference be reported here as part of
  // the value mapping?
  for (int I = 0; I != NumOps; ++I) {
    if (!MI.getOperand(I).isReg())
      continue;

    Register OpReg = MI.getOperand(I).getReg();
    // We replace some dead address operands with $noreg.
    if (!OpReg)
      continue;

    unsigned Size = getSizeInBits(OpReg, MRI, *TRI);

    // FIXME: Probably need a new intrinsic register bank searchable table to
    // handle arbitrary intrinsics easily.
    //
    // If this has a sampler, it immediately follows rsrc.
    const bool MustBeSGPR = I == RsrcIdx || I == RsrcIdx + 1;

    if (MustBeSGPR) {
      // This must be an SGPR, so we must report whatever it is as legal.
      unsigned NewBank = getRegBankID(OpReg, MRI, AMDGPU::SGPRRegBankID);
      OpdsMapping[I] = AMDGPU::getValueMapping(NewBank, Size);
    } else {
      // Some operands must be VGPR, and these are easy to copy to.
      OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
    }
  }

  return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping), NumOps);
}

/// Return the mapping for a pointer argument.
const RegisterBankInfo::ValueMapping *
AMDGPURegisterBankInfo::getValueMappingForPtr(const MachineRegisterInfo &MRI,
                                              Register PtrReg) const {
  LLT PtrTy = MRI.getType(PtrReg);
  unsigned Size = PtrTy.getSizeInBits();
  if (Subtarget.useFlatForGlobal() ||
      !AMDGPU::isFlatGlobalAddrSpace(PtrTy.getAddressSpace()))
    return AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);

  // If we're using MUBUF instructions for global memory, an SGPR base register
  // is possible. Otherwise this needs to be a VGPR.
  const RegisterBank *PtrBank = getRegBank(PtrReg, MRI, *TRI);
  return AMDGPU::getValueMapping(PtrBank->getID(), Size);
}

const RegisterBankInfo::InstructionMapping &
AMDGPURegisterBankInfo::getInstrMappingForLoad(const MachineInstr &MI) const {

  const MachineFunction &MF = *MI.getParent()->getParent();
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  SmallVector<const ValueMapping*, 2> OpdsMapping(2);
  unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
  Register PtrReg = MI.getOperand(1).getReg();
  LLT PtrTy = MRI.getType(PtrReg);
  unsigned AS = PtrTy.getAddressSpace();
  unsigned PtrSize = PtrTy.getSizeInBits();

  const ValueMapping *ValMapping;
  const ValueMapping *PtrMapping;

  const RegisterBank *PtrBank = getRegBank(PtrReg, MRI, *TRI);

  if (PtrBank == &AMDGPU::SGPRRegBank && AMDGPU::isFlatGlobalAddrSpace(AS)) {
    if (isScalarLoadLegal(MI)) {
      // We have a uniform instruction, so we want to use an SMRD load.
      ValMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
      PtrMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize);
    } else {
      ValMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);

      // If we're using MUBUF instructions for global memory, an SGPR base
      // register is possible. Otherwise this needs to be a VGPR.
      unsigned PtrBankID = Subtarget.useFlatForGlobal() ?
        AMDGPU::VGPRRegBankID : AMDGPU::SGPRRegBankID;

      PtrMapping = AMDGPU::getValueMapping(PtrBankID, PtrSize);
    }
  } else {
    ValMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
    PtrMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize);
  }

  OpdsMapping[0] = ValMapping;
  OpdsMapping[1] = PtrMapping;
  const RegisterBankInfo::InstructionMapping &Mapping = getInstructionMapping(
      1, 1, getOperandsMapping(OpdsMapping), MI.getNumOperands());
  return Mapping;

  // FIXME: Do we want to add a mapping for FLAT load, or should we just
  // handle that during instruction selection?
}

unsigned
AMDGPURegisterBankInfo::getRegBankID(Register Reg,
                                     const MachineRegisterInfo &MRI,
                                     unsigned Default) const {
  const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
  return Bank ? Bank->getID() : Default;
}

const RegisterBankInfo::ValueMapping *
AMDGPURegisterBankInfo::getSGPROpMapping(Register Reg,
                                         const MachineRegisterInfo &MRI,
                                         const TargetRegisterInfo &TRI) const {
  // Lie and claim anything is legal, even though this needs to be an SGPR;
  // applyMapping will have to deal with it as a waterfall loop.
  unsigned Bank = getRegBankID(Reg, MRI, AMDGPU::SGPRRegBankID);
  unsigned Size = getSizeInBits(Reg, MRI, TRI);
  return AMDGPU::getValueMapping(Bank, Size);
}

const RegisterBankInfo::ValueMapping *
AMDGPURegisterBankInfo::getVGPROpMapping(Register Reg,
                                         const MachineRegisterInfo &MRI,
                                         const TargetRegisterInfo &TRI) const {
  unsigned Size = getSizeInBits(Reg, MRI, TRI);
  return AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
}

const RegisterBankInfo::ValueMapping *
AMDGPURegisterBankInfo::getAGPROpMapping(Register Reg,
                                         const MachineRegisterInfo &MRI,
                                         const TargetRegisterInfo &TRI) const {
  unsigned Size = getSizeInBits(Reg, MRI, TRI);
  return AMDGPU::getValueMapping(AMDGPU::AGPRRegBankID, Size);
}

///
/// This function must return a legal mapping, because
/// AMDGPURegisterBankInfo::getInstrAlternativeMappings() is not called
/// in RegBankSelect::Mode::Fast. Any mapping that would cause a
/// VGPR-to-SGPR copy to be generated is illegal.
///
/// Operands that must be SGPRs must accept potentially divergent VGPRs as
/// legal. These will be dealt with in applyMappingImpl.
///
const RegisterBankInfo::InstructionMapping &
AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
  const MachineFunction &MF = *MI.getParent()->getParent();
  const MachineRegisterInfo &MRI = MF.getRegInfo();

  if (MI.isCopy() || MI.getOpcode() == AMDGPU::G_FREEZE) {
    // The default logic bothers to analyze impossible alternative mappings. We
    // want the most straightforward mapping, so just directly handle this.
    const RegisterBank *DstBank = getRegBank(MI.getOperand(0).getReg(), MRI,
                                             *TRI);
    const RegisterBank *SrcBank = getRegBank(MI.getOperand(1).getReg(), MRI,
                                             *TRI);
    assert(SrcBank && "src bank should have been assigned already");
    if (!DstBank)
      DstBank = SrcBank;

    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
    if (cannotCopy(*DstBank, *SrcBank, Size))
      return getInvalidInstructionMapping();

    const ValueMapping &ValMap = getValueMapping(0, Size, *DstBank);
    unsigned OpdsMappingSize = MI.isCopy() ?
1 : 2; 3576 SmallVector<const ValueMapping *, 1> OpdsMapping(OpdsMappingSize); 3577 OpdsMapping[0] = &ValMap; 3578 if (MI.getOpcode() == AMDGPU::G_FREEZE) 3579 OpdsMapping[1] = &ValMap; 3580 3581 return getInstructionMapping( 3582 1, /*Cost*/ 1, 3583 /*OperandsMapping*/ getOperandsMapping(OpdsMapping), OpdsMappingSize); 3584 } 3585 3586 if (MI.isRegSequence()) { 3587 // If any input is a VGPR, the result must be a VGPR. The default handling 3588 // assumes any copy between banks is legal. 3589 unsigned BankID = AMDGPU::SGPRRegBankID; 3590 3591 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) { 3592 auto OpBank = getRegBankID(MI.getOperand(I).getReg(), MRI); 3593 // It doesn't make sense to use vcc or scc banks here, so just ignore 3594 // them. 3595 if (OpBank != AMDGPU::SGPRRegBankID) { 3596 BankID = AMDGPU::VGPRRegBankID; 3597 break; 3598 } 3599 } 3600 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); 3601 3602 const ValueMapping &ValMap = getValueMapping(0, Size, getRegBank(BankID)); 3603 return getInstructionMapping( 3604 1, /*Cost*/ 1, 3605 /*OperandsMapping*/ getOperandsMapping({&ValMap}), 1); 3606 } 3607 3608 // The default handling is broken and doesn't handle illegal SGPR->VGPR copies 3609 // properly. 3610 // 3611 // TODO: There are additional exec masking dependencies to analyze. 3612 if (MI.getOpcode() == TargetOpcode::G_PHI) { 3613 unsigned ResultBank = AMDGPU::InvalidRegBankID; 3614 Register DstReg = MI.getOperand(0).getReg(); 3615 3616 // Sometimes the result may have already been assigned a bank. 3617 if (const RegisterBank *DstBank = getRegBank(DstReg, MRI, *TRI)) 3618 ResultBank = DstBank->getID(); 3619 3620 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) { 3621 Register Reg = MI.getOperand(I).getReg(); 3622 const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI); 3623 3624 // FIXME: Assuming VGPR for any undetermined inputs. 
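      // For example, a phi over {sgpr, vcc} inputs resolves to vcc via
      // regBankBoolUnion below, while any vgpr (or unassigned) input forces
      // the whole phi to vgpr.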
3625 if (!Bank || Bank->getID() == AMDGPU::VGPRRegBankID) { 3626 ResultBank = AMDGPU::VGPRRegBankID; 3627 break; 3628 } 3629 3630 // FIXME: Need to promote SGPR case to s32 3631 unsigned OpBank = Bank->getID(); 3632 ResultBank = regBankBoolUnion(ResultBank, OpBank); 3633 } 3634 3635 assert(ResultBank != AMDGPU::InvalidRegBankID); 3636 3637 unsigned Size = MRI.getType(DstReg).getSizeInBits(); 3638 3639 const ValueMapping &ValMap = 3640 getValueMapping(0, Size, getRegBank(ResultBank)); 3641 return getInstructionMapping( 3642 1, /*Cost*/ 1, 3643 /*OperandsMapping*/ getOperandsMapping({&ValMap}), 1); 3644 } 3645 3646 const RegisterBankInfo::InstructionMapping &Mapping = getInstrMappingImpl(MI); 3647 if (Mapping.isValid()) 3648 return Mapping; 3649 3650 SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands()); 3651 3652 switch (MI.getOpcode()) { 3653 default: 3654 return getInvalidInstructionMapping(); 3655 3656 case AMDGPU::G_AND: 3657 case AMDGPU::G_OR: 3658 case AMDGPU::G_XOR: { 3659 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 3660 if (Size == 1) { 3661 const RegisterBank *DstBank 3662 = getRegBank(MI.getOperand(0).getReg(), MRI, *TRI); 3663 3664 unsigned TargetBankID = AMDGPU::InvalidRegBankID; 3665 unsigned BankLHS = AMDGPU::InvalidRegBankID; 3666 unsigned BankRHS = AMDGPU::InvalidRegBankID; 3667 if (DstBank) { 3668 TargetBankID = DstBank->getID(); 3669 if (DstBank == &AMDGPU::VCCRegBank) { 3670 TargetBankID = AMDGPU::VCCRegBankID; 3671 BankLHS = AMDGPU::VCCRegBankID; 3672 BankRHS = AMDGPU::VCCRegBankID; 3673 } else { 3674 BankLHS = getRegBankID(MI.getOperand(1).getReg(), MRI, 3675 AMDGPU::SGPRRegBankID); 3676 BankRHS = getRegBankID(MI.getOperand(2).getReg(), MRI, 3677 AMDGPU::SGPRRegBankID); 3678 } 3679 } else { 3680 BankLHS = getRegBankID(MI.getOperand(1).getReg(), MRI, 3681 AMDGPU::VCCRegBankID); 3682 BankRHS = getRegBankID(MI.getOperand(2).getReg(), MRI, 3683 AMDGPU::VCCRegBankID); 3684 3685 // Both inputs should be true booleans to produce a boolean result. 
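        // Bank choice made by the cascade below, summarized:
        //   any vgpr input      -> vgpr
        //   else any vcc input  -> vcc (both inputs coerced to vcc)
        //   else sgpr, sgpr     -> sgpr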
3686 if (BankLHS == AMDGPU::VGPRRegBankID || BankRHS == AMDGPU::VGPRRegBankID) { 3687 TargetBankID = AMDGPU::VGPRRegBankID; 3688 } else if (BankLHS == AMDGPU::VCCRegBankID || BankRHS == AMDGPU::VCCRegBankID) { 3689 TargetBankID = AMDGPU::VCCRegBankID; 3690 BankLHS = AMDGPU::VCCRegBankID; 3691 BankRHS = AMDGPU::VCCRegBankID; 3692 } else if (BankLHS == AMDGPU::SGPRRegBankID && BankRHS == AMDGPU::SGPRRegBankID) { 3693 TargetBankID = AMDGPU::SGPRRegBankID; 3694 } 3695 } 3696 3697 OpdsMapping[0] = AMDGPU::getValueMapping(TargetBankID, Size); 3698 OpdsMapping[1] = AMDGPU::getValueMapping(BankLHS, Size); 3699 OpdsMapping[2] = AMDGPU::getValueMapping(BankRHS, Size); 3700 break; 3701 } 3702 3703 if (Size == 64) { 3704 3705 if (isSALUMapping(MI)) { 3706 OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::SGPRRegBankID, Size); 3707 OpdsMapping[1] = OpdsMapping[2] = OpdsMapping[0]; 3708 } else { 3709 OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size); 3710 unsigned Bank1 = getRegBankID(MI.getOperand(1).getReg(), MRI /*, DefaultBankID*/); 3711 OpdsMapping[1] = AMDGPU::getValueMapping(Bank1, Size); 3712 3713 unsigned Bank2 = getRegBankID(MI.getOperand(2).getReg(), MRI /*, DefaultBankID*/); 3714 OpdsMapping[2] = AMDGPU::getValueMapping(Bank2, Size); 3715 } 3716 3717 break; 3718 } 3719 3720 LLVM_FALLTHROUGH; 3721 } 3722 case AMDGPU::G_PTR_ADD: 3723 case AMDGPU::G_PTRMASK: 3724 case AMDGPU::G_ADD: 3725 case AMDGPU::G_SUB: 3726 case AMDGPU::G_MUL: 3727 case AMDGPU::G_SHL: 3728 case AMDGPU::G_LSHR: 3729 case AMDGPU::G_ASHR: 3730 case AMDGPU::G_UADDO: 3731 case AMDGPU::G_USUBO: 3732 case AMDGPU::G_UADDE: 3733 case AMDGPU::G_SADDE: 3734 case AMDGPU::G_USUBE: 3735 case AMDGPU::G_SSUBE: 3736 case AMDGPU::G_SMIN: 3737 case AMDGPU::G_SMAX: 3738 case AMDGPU::G_UMIN: 3739 case AMDGPU::G_UMAX: 3740 case AMDGPU::G_ABS: 3741 case AMDGPU::G_SHUFFLE_VECTOR: 3742 case AMDGPU::G_SBFX: 3743 case AMDGPU::G_UBFX: 3744 if (isSALUMapping(MI)) 3745 return getDefaultMappingSOP(MI); 3746 LLVM_FALLTHROUGH; 3747 3748 case AMDGPU::G_SADDSAT: // FIXME: Could lower sat ops for SALU 3749 case AMDGPU::G_SSUBSAT: 3750 case AMDGPU::G_UADDSAT: 3751 case AMDGPU::G_USUBSAT: 3752 case AMDGPU::G_FADD: 3753 case AMDGPU::G_FSUB: 3754 case AMDGPU::G_FPTOSI: 3755 case AMDGPU::G_FPTOUI: 3756 case AMDGPU::G_FMUL: 3757 case AMDGPU::G_FMA: 3758 case AMDGPU::G_FMAD: 3759 case AMDGPU::G_FSQRT: 3760 case AMDGPU::G_FFLOOR: 3761 case AMDGPU::G_FCEIL: 3762 case AMDGPU::G_FRINT: 3763 case AMDGPU::G_SITOFP: 3764 case AMDGPU::G_UITOFP: 3765 case AMDGPU::G_FPTRUNC: 3766 case AMDGPU::G_FPEXT: 3767 case AMDGPU::G_FEXP2: 3768 case AMDGPU::G_FLOG2: 3769 case AMDGPU::G_FMINNUM: 3770 case AMDGPU::G_FMAXNUM: 3771 case AMDGPU::G_FMINNUM_IEEE: 3772 case AMDGPU::G_FMAXNUM_IEEE: 3773 case AMDGPU::G_FCANONICALIZE: 3774 case AMDGPU::G_INTRINSIC_TRUNC: 3775 case AMDGPU::G_BSWAP: // TODO: Somehow expand for scalar? 
3776 case AMDGPU::G_FSHR: // TODO: Expand for scalar 3777 case AMDGPU::G_AMDGPU_FMIN_LEGACY: 3778 case AMDGPU::G_AMDGPU_FMAX_LEGACY: 3779 case AMDGPU::G_AMDGPU_RCP_IFLAG: 3780 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0: 3781 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1: 3782 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2: 3783 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3: 3784 case AMDGPU::G_AMDGPU_CVT_PK_I16_I32: 3785 case AMDGPU::G_AMDGPU_SMED3: 3786 return getDefaultMappingVOP(MI); 3787 case AMDGPU::G_UMULH: 3788 case AMDGPU::G_SMULH: { 3789 if (Subtarget.hasScalarMulHiInsts() && isSALUMapping(MI)) 3790 return getDefaultMappingSOP(MI); 3791 return getDefaultMappingVOP(MI); 3792 } 3793 case AMDGPU::G_AMDGPU_MAD_U64_U32: 3794 case AMDGPU::G_AMDGPU_MAD_I64_I32: { 3795 // Three possible mappings: 3796 // 3797 // - Default SOP 3798 // - Default VOP 3799 // - Scalar multiply: src0 and src1 are SGPRs, the rest is VOP. 3800 // 3801 // This allows instruction selection to keep the multiplication part of the 3802 // instruction on the SALU. 3803 bool AllSalu = true; 3804 bool MulSalu = true; 3805 for (unsigned i = 0; i < 5; ++i) { 3806 Register Reg = MI.getOperand(i).getReg(); 3807 if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) { 3808 if (Bank->getID() != AMDGPU::SGPRRegBankID) { 3809 AllSalu = false; 3810 if (i == 2 || i == 3) { 3811 MulSalu = false; 3812 break; 3813 } 3814 } 3815 } 3816 } 3817 3818 if (AllSalu) 3819 return getDefaultMappingSOP(MI); 3820 3821 // If the multiply-add is full-rate in VALU, use that even if the 3822 // multiplication part is scalar. Accumulating separately on the VALU would 3823 // take two instructions. 3824 if (!MulSalu || Subtarget.hasFullRate64Ops()) 3825 return getDefaultMappingVOP(MI); 3826 3827 // Keep the multiplication on the SALU, then accumulate on the VALU. 3828 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 64); 3829 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1); 3830 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32); 3831 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32); 3832 OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 64); 3833 break; 3834 } 3835 case AMDGPU::G_IMPLICIT_DEF: { 3836 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 3837 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); 3838 break; 3839 } 3840 case AMDGPU::G_FCONSTANT: 3841 case AMDGPU::G_CONSTANT: 3842 case AMDGPU::G_GLOBAL_VALUE: 3843 case AMDGPU::G_BLOCK_ADDR: 3844 case AMDGPU::G_READCYCLECOUNTER: { 3845 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 3846 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); 3847 break; 3848 } 3849 case AMDGPU::G_FRAME_INDEX: { 3850 // TODO: This should be the same as other constants, but eliminateFrameIndex 3851 // currently assumes VALU uses. 3852 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 3853 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); 3854 break; 3855 } 3856 case AMDGPU::G_DYN_STACKALLOC: { 3857 // Result is always uniform, and a wave reduction is needed for the source. 
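    // The new stack pointer is wave-uniform; if the size operand turns out to
    // be divergent, applyMappingDynStackAlloc is the place that deals with it,
    // so the mapping below just records the source's current bank.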
3858 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32); 3859 unsigned SrcBankID = getRegBankID(MI.getOperand(1).getReg(), MRI); 3860 OpdsMapping[1] = AMDGPU::getValueMapping(SrcBankID, 32); 3861 break; 3862 } 3863 case AMDGPU::G_AMDGPU_WAVE_ADDRESS: { 3864 // This case is weird because we expect a physical register in the source, 3865 // but need to set a bank anyway. 3866 // 3867 // We could select the result to SGPR or VGPR, but for the one current use 3868 // it's more practical to always use VGPR. 3869 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); 3870 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32); 3871 break; 3872 } 3873 case AMDGPU::G_INSERT: { 3874 unsigned BankID = getMappingType(MRI, MI); 3875 unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); 3876 unsigned SrcSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI); 3877 unsigned EltSize = getSizeInBits(MI.getOperand(2).getReg(), MRI, *TRI); 3878 OpdsMapping[0] = AMDGPU::getValueMapping(BankID, DstSize); 3879 OpdsMapping[1] = AMDGPU::getValueMapping(BankID, SrcSize); 3880 OpdsMapping[2] = AMDGPU::getValueMapping(BankID, EltSize); 3881 OpdsMapping[3] = nullptr; 3882 break; 3883 } 3884 case AMDGPU::G_EXTRACT: { 3885 unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI); 3886 unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); 3887 unsigned SrcSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI); 3888 OpdsMapping[0] = AMDGPU::getValueMapping(BankID, DstSize); 3889 OpdsMapping[1] = AMDGPU::getValueMapping(BankID, SrcSize); 3890 OpdsMapping[2] = nullptr; 3891 break; 3892 } 3893 case AMDGPU::G_BUILD_VECTOR: 3894 case AMDGPU::G_BUILD_VECTOR_TRUNC: { 3895 LLT DstTy = MRI.getType(MI.getOperand(0).getReg()); 3896 if (DstTy == LLT::fixed_vector(2, 16)) { 3897 unsigned DstSize = DstTy.getSizeInBits(); 3898 unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits(); 3899 unsigned Src0BankID = getRegBankID(MI.getOperand(1).getReg(), MRI); 3900 unsigned Src1BankID = getRegBankID(MI.getOperand(2).getReg(), MRI); 3901 unsigned DstBankID = regBankUnion(Src0BankID, Src1BankID); 3902 3903 OpdsMapping[0] = AMDGPU::getValueMapping(DstBankID, DstSize); 3904 OpdsMapping[1] = AMDGPU::getValueMapping(Src0BankID, SrcSize); 3905 OpdsMapping[2] = AMDGPU::getValueMapping(Src1BankID, SrcSize); 3906 break; 3907 } 3908 3909 LLVM_FALLTHROUGH; 3910 } 3911 case AMDGPU::G_MERGE_VALUES: 3912 case AMDGPU::G_CONCAT_VECTORS: { 3913 unsigned Bank = getMappingType(MRI, MI); 3914 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 3915 unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits(); 3916 3917 OpdsMapping[0] = AMDGPU::getValueMapping(Bank, DstSize); 3918 // Op1 and Dst should use the same register bank. 
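    // For example (illustrative): %d:_(s64) = G_MERGE_VALUES %a:_(s32), %b:_(s32)
    // maps all operands to the unioned bank: sgpr only if every input is
    // sgpr, otherwise vgpr.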
3919 for (unsigned i = 1, e = MI.getNumOperands(); i != e; ++i) 3920 OpdsMapping[i] = AMDGPU::getValueMapping(Bank, SrcSize); 3921 break; 3922 } 3923 case AMDGPU::G_BITREVERSE: 3924 case AMDGPU::G_BITCAST: 3925 case AMDGPU::G_INTTOPTR: 3926 case AMDGPU::G_PTRTOINT: 3927 case AMDGPU::G_FABS: 3928 case AMDGPU::G_FNEG: { 3929 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 3930 unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI); 3931 OpdsMapping[0] = OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size); 3932 break; 3933 } 3934 case AMDGPU::G_AMDGPU_FFBH_U32: 3935 case AMDGPU::G_AMDGPU_FFBL_B32: 3936 case AMDGPU::G_CTLZ_ZERO_UNDEF: 3937 case AMDGPU::G_CTTZ_ZERO_UNDEF: { 3938 unsigned Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits(); 3939 unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI); 3940 OpdsMapping[0] = AMDGPU::getValueMapping(BankID, 32); 3941 OpdsMapping[1] = AMDGPU::getValueMappingSGPR64Only(BankID, Size); 3942 break; 3943 } 3944 case AMDGPU::G_CTPOP: { 3945 unsigned Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits(); 3946 unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI); 3947 OpdsMapping[0] = AMDGPU::getValueMapping(BankID, 32); 3948 3949 // This should really be getValueMappingSGPR64Only, but allowing the generic 3950 // code to handle the register split just makes using LegalizerHelper more 3951 // difficult. 3952 OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size); 3953 break; 3954 } 3955 case AMDGPU::G_TRUNC: { 3956 Register Dst = MI.getOperand(0).getReg(); 3957 Register Src = MI.getOperand(1).getReg(); 3958 unsigned Bank = getRegBankID(Src, MRI); 3959 unsigned DstSize = getSizeInBits(Dst, MRI, *TRI); 3960 unsigned SrcSize = getSizeInBits(Src, MRI, *TRI); 3961 OpdsMapping[0] = AMDGPU::getValueMapping(Bank, DstSize); 3962 OpdsMapping[1] = AMDGPU::getValueMapping(Bank, SrcSize); 3963 break; 3964 } 3965 case AMDGPU::G_ZEXT: 3966 case AMDGPU::G_SEXT: 3967 case AMDGPU::G_ANYEXT: 3968 case AMDGPU::G_SEXT_INREG: { 3969 Register Dst = MI.getOperand(0).getReg(); 3970 Register Src = MI.getOperand(1).getReg(); 3971 unsigned DstSize = getSizeInBits(Dst, MRI, *TRI); 3972 unsigned SrcSize = getSizeInBits(Src, MRI, *TRI); 3973 3974 unsigned DstBank; 3975 const RegisterBank *SrcBank = getRegBank(Src, MRI, *TRI); 3976 assert(SrcBank); 3977 switch (SrcBank->getID()) { 3978 case AMDGPU::SGPRRegBankID: 3979 DstBank = AMDGPU::SGPRRegBankID; 3980 break; 3981 default: 3982 DstBank = AMDGPU::VGPRRegBankID; 3983 break; 3984 } 3985 3986 // Scalar extend can use 64-bit BFE, but VGPRs require extending to 3987 // 32-bits, and then to 64. 3988 OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(DstBank, DstSize); 3989 OpdsMapping[1] = AMDGPU::getValueMappingSGPR64Only(SrcBank->getID(), 3990 SrcSize); 3991 break; 3992 } 3993 case AMDGPU::G_FCMP: { 3994 unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits(); 3995 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1); 3996 OpdsMapping[1] = nullptr; // Predicate Operand. 3997 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); 3998 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); 3999 break; 4000 } 4001 case AMDGPU::G_STORE: { 4002 assert(MI.getOperand(0).isReg()); 4003 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4004 4005 // FIXME: We need to specify a different reg bank once scalar stores are 4006 // supported. 
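    // A store of a uniform value via scalar memory would want an SGPR data
    // mapping here, but until scalar stores are selectable the data is always
    // treated as VGPR.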
    const ValueMapping *ValMapping =
        AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
    OpdsMapping[0] = ValMapping;
    OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg());
    break;
  }
  case AMDGPU::G_ICMP: {
    auto Pred = static_cast<CmpInst::Predicate>(MI.getOperand(1).getPredicate());
    unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();

    // See if the result register has already been constrained to vcc, which may
    // happen due to control flow intrinsic lowering.
    unsigned DstBank = getRegBankID(MI.getOperand(0).getReg(), MRI,
                                    AMDGPU::SGPRRegBankID);
    unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI);
    unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI);

    bool CanUseSCC = DstBank == AMDGPU::SGPRRegBankID &&
                     Op2Bank == AMDGPU::SGPRRegBankID &&
                     Op3Bank == AMDGPU::SGPRRegBankID &&
                     (Size == 32 || (Size == 64 &&
                      (Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE) &&
                      Subtarget.hasScalarCompareEq64()));

    DstBank = CanUseSCC ? AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID;
    unsigned SrcBank = CanUseSCC ? AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;

    // TODO: Use 32-bit for scalar output size.
    // SCC results will need to be copied to a 32-bit SGPR virtual register.
    const unsigned ResultSize = 1;

    OpdsMapping[0] = AMDGPU::getValueMapping(DstBank, ResultSize);
    OpdsMapping[2] = AMDGPU::getValueMapping(SrcBank, Size);
    OpdsMapping[3] = AMDGPU::getValueMapping(SrcBank, Size);
    break;
  }
  case AMDGPU::G_EXTRACT_VECTOR_ELT: {
    // A VGPR index can be used for waterfall when indexing an SGPR vector.
    unsigned SrcBankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
    unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
    unsigned IdxSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
    unsigned IdxBank = getRegBankID(MI.getOperand(2).getReg(), MRI);
    unsigned OutputBankID = regBankUnion(SrcBankID, IdxBank);

    OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(OutputBankID, DstSize);
    OpdsMapping[1] = AMDGPU::getValueMapping(SrcBankID, SrcSize);

    // The index can be either bank if the source vector is VGPR.
    OpdsMapping[2] = AMDGPU::getValueMapping(IdxBank, IdxSize);
    break;
  }
  case AMDGPU::G_INSERT_VECTOR_ELT: {
    unsigned OutputBankID = isSALUMapping(MI) ?
      AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;

    unsigned VecSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    unsigned InsertSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
    unsigned IdxSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits();
    unsigned InsertEltBankID = getRegBankID(MI.getOperand(2).getReg(), MRI);
    unsigned IdxBankID = getRegBankID(MI.getOperand(3).getReg(), MRI);

    OpdsMapping[0] = AMDGPU::getValueMapping(OutputBankID, VecSize);
    OpdsMapping[1] = AMDGPU::getValueMapping(OutputBankID, VecSize);

    // This is a weird case, because we need to break down the mapping based on
    // the register bank of a different operand.
    if (InsertSize == 64 && OutputBankID == AMDGPU::VGPRRegBankID) {
      OpdsMapping[2] = AMDGPU::getValueMappingSplit64(InsertEltBankID,
                                                      InsertSize);
    } else {
      assert(InsertSize == 32 || InsertSize == 64);
      OpdsMapping[2] = AMDGPU::getValueMapping(InsertEltBankID, InsertSize);
    }

    // The index can be either bank if the source vector is VGPR.
    OpdsMapping[3] = AMDGPU::getValueMapping(IdxBankID, IdxSize);
    break;
  }
  case AMDGPU::G_UNMERGE_VALUES: {
    unsigned Bank = getMappingType(MRI, MI);

    // Op1 and Dst should use the same register bank.
    // FIXME: Shouldn't this be the default? Why do we need to handle this?
    for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
      unsigned Size = getSizeInBits(MI.getOperand(i).getReg(), MRI, *TRI);
      OpdsMapping[i] = AMDGPU::getValueMapping(Bank, Size);
    }
    break;
  }
  case AMDGPU::G_AMDGPU_BUFFER_LOAD:
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE:
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT:
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT:
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16:
  case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT:
  case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16:
  case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT:
  case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16:
  case AMDGPU::G_AMDGPU_BUFFER_STORE:
  case AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE:
  case AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT:
  case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT:
  case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16: {
    OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);

    // rsrc
    OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);

    // vindex
    OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);

    // voffset
    OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);

    // soffset
    OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);

    // Any remaining operands are immediates and were correctly null
    // initialized.
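    // If rsrc or soffset was actually assigned a VGPR, the mapping above
    // still reports it as legal; applyMappingImpl is expected to repair it
    // with a waterfall loop instead of an illegal VGPR->SGPR copy.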
    break;
  }
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX: {
    // vdata_out
    OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);

    // vdata_in
    OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);

    // rsrc
    OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);

    // vindex
    OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);

    // voffset
    OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);

    // soffset
    OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);

    // Any remaining operands are immediates and were correctly null
    // initialized.
    break;
  }
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP: {
    // vdata_out
    OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);

    // vdata_in
    OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);

    // cmp
    OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);

    // rsrc
    OpdsMapping[3] = getSGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);

    // vindex
    OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);

    // voffset
    OpdsMapping[5] = getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);

    // soffset
    OpdsMapping[6] = getSGPROpMapping(MI.getOperand(6).getReg(), MRI, *TRI);

    // Any remaining operands are immediates and were correctly null
    // initialized.
    break;
  }
  case AMDGPU::G_AMDGPU_S_BUFFER_LOAD: {
    // Lie and claim everything is legal, even though some need to be
    // SGPRs. applyMapping will have to deal with it as a waterfall loop.
    OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
    OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);

    // We need to convert this to a MUBUF if either the resource or the
    // offset is a VGPR.
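    // For example, all-SGPR operands keep the scalar form with an SGPR
    // result, while a divergent offset makes the result VGPR here so that
    // selection can fall back to a VALU/buffer form.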
4202 unsigned RSrcBank = OpdsMapping[1]->BreakDown[0].RegBank->getID(); 4203 unsigned OffsetBank = OpdsMapping[2]->BreakDown[0].RegBank->getID(); 4204 unsigned ResultBank = regBankUnion(RSrcBank, OffsetBank); 4205 4206 unsigned Size0 = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4207 OpdsMapping[0] = AMDGPU::getValueMapping(ResultBank, Size0); 4208 break; 4209 } 4210 case AMDGPU::G_INTRINSIC: { 4211 switch (MI.getIntrinsicID()) { 4212 default: 4213 return getInvalidInstructionMapping(); 4214 case Intrinsic::amdgcn_div_fmas: 4215 case Intrinsic::amdgcn_div_fixup: 4216 case Intrinsic::amdgcn_trig_preop: 4217 case Intrinsic::amdgcn_sin: 4218 case Intrinsic::amdgcn_cos: 4219 case Intrinsic::amdgcn_log_clamp: 4220 case Intrinsic::amdgcn_rcp: 4221 case Intrinsic::amdgcn_rcp_legacy: 4222 case Intrinsic::amdgcn_sqrt: 4223 case Intrinsic::amdgcn_rsq: 4224 case Intrinsic::amdgcn_rsq_legacy: 4225 case Intrinsic::amdgcn_rsq_clamp: 4226 case Intrinsic::amdgcn_fmul_legacy: 4227 case Intrinsic::amdgcn_fma_legacy: 4228 case Intrinsic::amdgcn_ldexp: 4229 case Intrinsic::amdgcn_frexp_mant: 4230 case Intrinsic::amdgcn_frexp_exp: 4231 case Intrinsic::amdgcn_fract: 4232 case Intrinsic::amdgcn_cvt_pkrtz: 4233 case Intrinsic::amdgcn_cvt_pknorm_i16: 4234 case Intrinsic::amdgcn_cvt_pknorm_u16: 4235 case Intrinsic::amdgcn_cvt_pk_i16: 4236 case Intrinsic::amdgcn_cvt_pk_u16: 4237 case Intrinsic::amdgcn_fmed3: 4238 case Intrinsic::amdgcn_cubeid: 4239 case Intrinsic::amdgcn_cubema: 4240 case Intrinsic::amdgcn_cubesc: 4241 case Intrinsic::amdgcn_cubetc: 4242 case Intrinsic::amdgcn_sffbh: 4243 case Intrinsic::amdgcn_fmad_ftz: 4244 case Intrinsic::amdgcn_mbcnt_lo: 4245 case Intrinsic::amdgcn_mbcnt_hi: 4246 case Intrinsic::amdgcn_mul_u24: 4247 case Intrinsic::amdgcn_mul_i24: 4248 case Intrinsic::amdgcn_mulhi_u24: 4249 case Intrinsic::amdgcn_mulhi_i24: 4250 case Intrinsic::amdgcn_lerp: 4251 case Intrinsic::amdgcn_sad_u8: 4252 case Intrinsic::amdgcn_msad_u8: 4253 case Intrinsic::amdgcn_sad_hi_u8: 4254 case Intrinsic::amdgcn_sad_u16: 4255 case Intrinsic::amdgcn_qsad_pk_u16_u8: 4256 case Intrinsic::amdgcn_mqsad_pk_u16_u8: 4257 case Intrinsic::amdgcn_mqsad_u32_u8: 4258 case Intrinsic::amdgcn_cvt_pk_u8_f32: 4259 case Intrinsic::amdgcn_alignbyte: 4260 case Intrinsic::amdgcn_perm: 4261 case Intrinsic::amdgcn_fdot2: 4262 case Intrinsic::amdgcn_sdot2: 4263 case Intrinsic::amdgcn_udot2: 4264 case Intrinsic::amdgcn_sdot4: 4265 case Intrinsic::amdgcn_udot4: 4266 case Intrinsic::amdgcn_sdot8: 4267 case Intrinsic::amdgcn_udot8: 4268 case Intrinsic::amdgcn_fdot2_bf16_bf16: 4269 case Intrinsic::amdgcn_fdot2_f16_f16: 4270 case Intrinsic::amdgcn_fdot2_f32_bf16: 4271 case Intrinsic::amdgcn_sudot4: 4272 case Intrinsic::amdgcn_sudot8: 4273 case Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16: 4274 case Intrinsic::amdgcn_wmma_f16_16x16x16_f16: 4275 case Intrinsic::amdgcn_wmma_f32_16x16x16_bf16: 4276 case Intrinsic::amdgcn_wmma_f32_16x16x16_f16: 4277 case Intrinsic::amdgcn_wmma_i32_16x16x16_iu4: 4278 case Intrinsic::amdgcn_wmma_i32_16x16x16_iu8: 4279 return getDefaultMappingVOP(MI); 4280 case Intrinsic::amdgcn_sbfe: 4281 case Intrinsic::amdgcn_ubfe: 4282 if (isSALUMapping(MI)) 4283 return getDefaultMappingSOP(MI); 4284 return getDefaultMappingVOP(MI); 4285 case Intrinsic::amdgcn_ds_swizzle: 4286 case Intrinsic::amdgcn_ds_permute: 4287 case Intrinsic::amdgcn_ds_bpermute: 4288 case Intrinsic::amdgcn_update_dpp: 4289 case Intrinsic::amdgcn_mov_dpp8: 4290 case Intrinsic::amdgcn_mov_dpp: 4291 case Intrinsic::amdgcn_strict_wwm: 4292 case 
Intrinsic::amdgcn_wwm: 4293 case Intrinsic::amdgcn_strict_wqm: 4294 case Intrinsic::amdgcn_wqm: 4295 case Intrinsic::amdgcn_softwqm: 4296 case Intrinsic::amdgcn_set_inactive: 4297 case Intrinsic::amdgcn_permlane64: 4298 return getDefaultMappingAllVGPR(MI); 4299 case Intrinsic::amdgcn_kernarg_segment_ptr: 4300 case Intrinsic::amdgcn_s_getpc: 4301 case Intrinsic::amdgcn_groupstaticsize: 4302 case Intrinsic::amdgcn_reloc_constant: 4303 case Intrinsic::returnaddress: { 4304 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4305 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); 4306 break; 4307 } 4308 case Intrinsic::amdgcn_wqm_vote: { 4309 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4310 OpdsMapping[0] = OpdsMapping[2] 4311 = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size); 4312 break; 4313 } 4314 case Intrinsic::amdgcn_ps_live: { 4315 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1); 4316 break; 4317 } 4318 case Intrinsic::amdgcn_div_scale: { 4319 unsigned Dst0Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4320 unsigned Dst1Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits(); 4321 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Dst0Size); 4322 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Dst1Size); 4323 4324 unsigned SrcSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits(); 4325 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize); 4326 OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize); 4327 break; 4328 } 4329 case Intrinsic::amdgcn_class: { 4330 Register Src0Reg = MI.getOperand(2).getReg(); 4331 Register Src1Reg = MI.getOperand(3).getReg(); 4332 unsigned Src0Size = MRI.getType(Src0Reg).getSizeInBits(); 4333 unsigned Src1Size = MRI.getType(Src1Reg).getSizeInBits(); 4334 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4335 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, DstSize); 4336 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Src0Size); 4337 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Src1Size); 4338 break; 4339 } 4340 case Intrinsic::amdgcn_icmp: 4341 case Intrinsic::amdgcn_fcmp: { 4342 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4343 // This is not VCCRegBank because this is not used in boolean contexts. 4344 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize); 4345 unsigned OpSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits(); 4346 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, OpSize); 4347 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, OpSize); 4348 break; 4349 } 4350 case Intrinsic::amdgcn_readlane: { 4351 // This must be an SGPR, but accept a VGPR. 
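      // Note the fallthrough below: readlane shares the readfirstlane mapping
      // for its result and source operands, and only adds the lane index here.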
4352 Register IdxReg = MI.getOperand(3).getReg(); 4353 unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits(); 4354 unsigned IdxBank = getRegBankID(IdxReg, MRI, AMDGPU::SGPRRegBankID); 4355 OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize); 4356 LLVM_FALLTHROUGH; 4357 } 4358 case Intrinsic::amdgcn_readfirstlane: { 4359 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4360 unsigned SrcSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits(); 4361 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize); 4362 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize); 4363 break; 4364 } 4365 case Intrinsic::amdgcn_writelane: { 4366 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4367 Register SrcReg = MI.getOperand(2).getReg(); 4368 unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits(); 4369 unsigned SrcBank = getRegBankID(SrcReg, MRI, AMDGPU::SGPRRegBankID); 4370 Register IdxReg = MI.getOperand(3).getReg(); 4371 unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits(); 4372 unsigned IdxBank = getRegBankID(IdxReg, MRI, AMDGPU::SGPRRegBankID); 4373 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize); 4374 4375 // These 2 must be SGPRs, but accept VGPRs. Readfirstlane will be inserted 4376 // to legalize. 4377 OpdsMapping[2] = AMDGPU::getValueMapping(SrcBank, SrcSize); 4378 OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize); 4379 OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize); 4380 break; 4381 } 4382 case Intrinsic::amdgcn_if_break: { 4383 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); 4384 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); 4385 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1); 4386 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); 4387 break; 4388 } 4389 case Intrinsic::amdgcn_permlane16: 4390 case Intrinsic::amdgcn_permlanex16: { 4391 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); 4392 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); 4393 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); 4394 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); 4395 OpdsMapping[4] = getSGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); 4396 OpdsMapping[5] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); 4397 break; 4398 } 4399 case Intrinsic::amdgcn_mfma_f32_4x4x1f32: 4400 case Intrinsic::amdgcn_mfma_f32_4x4x4f16: 4401 case Intrinsic::amdgcn_mfma_i32_4x4x4i8: 4402 case Intrinsic::amdgcn_mfma_f32_4x4x2bf16: 4403 case Intrinsic::amdgcn_mfma_f32_16x16x1f32: 4404 case Intrinsic::amdgcn_mfma_f32_16x16x4f32: 4405 case Intrinsic::amdgcn_mfma_f32_16x16x4f16: 4406 case Intrinsic::amdgcn_mfma_f32_16x16x16f16: 4407 case Intrinsic::amdgcn_mfma_i32_16x16x4i8: 4408 case Intrinsic::amdgcn_mfma_i32_16x16x16i8: 4409 case Intrinsic::amdgcn_mfma_f32_16x16x2bf16: 4410 case Intrinsic::amdgcn_mfma_f32_16x16x8bf16: 4411 case Intrinsic::amdgcn_mfma_f32_32x32x1f32: 4412 case Intrinsic::amdgcn_mfma_f32_32x32x2f32: 4413 case Intrinsic::amdgcn_mfma_f32_32x32x4f16: 4414 case Intrinsic::amdgcn_mfma_f32_32x32x8f16: 4415 case Intrinsic::amdgcn_mfma_i32_32x32x4i8: 4416 case Intrinsic::amdgcn_mfma_i32_32x32x8i8: 4417 case Intrinsic::amdgcn_mfma_f32_32x32x2bf16: 4418 case Intrinsic::amdgcn_mfma_f32_32x32x4bf16: 4419 case Intrinsic::amdgcn_mfma_f32_32x32x4bf16_1k: 4420 case 
Intrinsic::amdgcn_mfma_f32_16x16x4bf16_1k: 4421 case Intrinsic::amdgcn_mfma_f32_4x4x4bf16_1k: 4422 case Intrinsic::amdgcn_mfma_f32_32x32x8bf16_1k: 4423 case Intrinsic::amdgcn_mfma_f32_16x16x16bf16_1k: 4424 case Intrinsic::amdgcn_mfma_f64_16x16x4f64: 4425 case Intrinsic::amdgcn_mfma_f64_4x4x4f64: 4426 case Intrinsic::amdgcn_mfma_i32_16x16x32_i8: 4427 case Intrinsic::amdgcn_mfma_i32_32x32x16_i8: 4428 case Intrinsic::amdgcn_mfma_f32_16x16x8_xf32: 4429 case Intrinsic::amdgcn_mfma_f32_32x32x4_xf32: 4430 case Intrinsic::amdgcn_mfma_f32_16x16x32_bf8_bf8: 4431 case Intrinsic::amdgcn_mfma_f32_16x16x32_bf8_fp8: 4432 case Intrinsic::amdgcn_mfma_f32_16x16x32_fp8_bf8: 4433 case Intrinsic::amdgcn_mfma_f32_16x16x32_fp8_fp8: 4434 case Intrinsic::amdgcn_mfma_f32_32x32x16_bf8_bf8: 4435 case Intrinsic::amdgcn_mfma_f32_32x32x16_bf8_fp8: 4436 case Intrinsic::amdgcn_mfma_f32_32x32x16_fp8_bf8: 4437 case Intrinsic::amdgcn_mfma_f32_32x32x16_fp8_fp8: { 4438 // Default for MAI intrinsics. 4439 // srcC can also be an immediate which can be folded later. 4440 // FIXME: Should we eventually add an alternative mapping with AGPR src 4441 // for srcA/srcB? 4442 // 4443 // vdst, srcA, srcB, srcC 4444 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 4445 OpdsMapping[0] = 4446 Info->mayNeedAGPRs() 4447 ? getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI) 4448 : getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); 4449 OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); 4450 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); 4451 OpdsMapping[4] = 4452 Info->mayNeedAGPRs() 4453 ? getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI) 4454 : getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); 4455 break; 4456 } 4457 case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16: 4458 case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16: 4459 case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16: 4460 case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16: 4461 case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8: 4462 case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8: 4463 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_bf8: 4464 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_fp8: 4465 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_bf8: 4466 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_fp8: 4467 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8: 4468 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8: 4469 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8: 4470 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8: { 4471 // vdst, srcA, srcB, srcC, idx 4472 OpdsMapping[0] = getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); 4473 OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); 4474 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); 4475 OpdsMapping[4] = getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); 4476 OpdsMapping[5] = getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI); 4477 break; 4478 } 4479 case Intrinsic::amdgcn_interp_p1: 4480 case Intrinsic::amdgcn_interp_p2: 4481 case Intrinsic::amdgcn_interp_mov: 4482 case Intrinsic::amdgcn_interp_p1_f16: 4483 case Intrinsic::amdgcn_interp_p2_f16: 4484 case Intrinsic::amdgcn_lds_param_load: { 4485 const int M0Idx = MI.getNumOperands() - 1; 4486 Register M0Reg = MI.getOperand(M0Idx).getReg(); 4487 unsigned M0Bank = getRegBankID(M0Reg, MRI, AMDGPU::SGPRRegBankID); 4488 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4489 4490 OpdsMapping[0] = 
AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize); 4491 for (int I = 2; I != M0Idx && MI.getOperand(I).isReg(); ++I) 4492 OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); 4493 4494 // Must be SGPR, but we must take whatever the original bank is and fix it 4495 // later. 4496 OpdsMapping[M0Idx] = AMDGPU::getValueMapping(M0Bank, 32); 4497 break; 4498 } 4499 case Intrinsic::amdgcn_interp_inreg_p10: 4500 case Intrinsic::amdgcn_interp_inreg_p2: 4501 case Intrinsic::amdgcn_interp_inreg_p10_f16: 4502 case Intrinsic::amdgcn_interp_inreg_p2_f16: { 4503 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4504 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize); 4505 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); 4506 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); 4507 OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); 4508 break; 4509 } 4510 case Intrinsic::amdgcn_ballot: { 4511 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4512 unsigned SrcSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits(); 4513 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize); 4514 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, SrcSize); 4515 break; 4516 } 4517 } 4518 break; 4519 } 4520 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD: 4521 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16: 4522 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE: 4523 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16: { 4524 auto IntrID = MI.getIntrinsicID(); 4525 const AMDGPU::RsrcIntrinsic *RSrcIntrin = AMDGPU::lookupRsrcIntrinsic(IntrID); 4526 assert(RSrcIntrin && "missing RsrcIntrinsic for image intrinsic"); 4527 // Non-images can have complications from operands that allow both SGPR 4528 // and VGPR. For now it's too complicated to figure out the final opcode 4529 // to derive the register bank from the MCInstrDesc. 
4530 assert(RSrcIntrin->IsImage); 4531 return getImageMapping(MRI, MI, RSrcIntrin->RsrcArg); 4532 } 4533 case AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY: { 4534 unsigned N = MI.getNumExplicitOperands() - 2; 4535 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 128); 4536 OpdsMapping[N] = getSGPROpMapping(MI.getOperand(N).getReg(), MRI, *TRI); 4537 if (N == 3) { 4538 // Sequential form: all operands combined into VGPR256/VGPR512 4539 unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits(); 4540 if (Size > 256) 4541 Size = 512; 4542 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); 4543 } else { 4544 // NSA form 4545 for (unsigned I = 2; I < N; ++I) { 4546 unsigned Size = MRI.getType(MI.getOperand(I).getReg()).getSizeInBits(); 4547 OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); 4548 } 4549 } 4550 break; 4551 } 4552 case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: { 4553 auto IntrID = MI.getIntrinsicID(); 4554 switch (IntrID) { 4555 case Intrinsic::amdgcn_s_getreg: 4556 case Intrinsic::amdgcn_s_memtime: 4557 case Intrinsic::amdgcn_s_memrealtime: 4558 case Intrinsic::amdgcn_s_get_waveid_in_workgroup: 4559 case Intrinsic::amdgcn_s_sendmsg_rtn: { 4560 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4561 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); 4562 break; 4563 } 4564 case Intrinsic::amdgcn_global_atomic_fadd: 4565 case Intrinsic::amdgcn_global_atomic_csub: 4566 case Intrinsic::amdgcn_global_atomic_fmin: 4567 case Intrinsic::amdgcn_global_atomic_fmax: 4568 case Intrinsic::amdgcn_flat_atomic_fadd: 4569 case Intrinsic::amdgcn_flat_atomic_fmin: 4570 case Intrinsic::amdgcn_flat_atomic_fmax: 4571 case Intrinsic::amdgcn_global_atomic_fadd_v2bf16: 4572 case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16: 4573 return getDefaultMappingAllVGPR(MI); 4574 case Intrinsic::amdgcn_ds_ordered_add: 4575 case Intrinsic::amdgcn_ds_ordered_swap: { 4576 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4577 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize); 4578 unsigned M0Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, 4579 AMDGPU::SGPRRegBankID); 4580 OpdsMapping[2] = AMDGPU::getValueMapping(M0Bank, 32); 4581 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); 4582 break; 4583 } 4584 case Intrinsic::amdgcn_ds_append: 4585 case Intrinsic::amdgcn_ds_consume: { 4586 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4587 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize); 4588 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); 4589 break; 4590 } 4591 case Intrinsic::amdgcn_exp_compr: 4592 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); 4593 OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); 4594 break; 4595 case Intrinsic::amdgcn_exp: 4596 // FIXME: Could we support packed types here? 
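      // For now each exp source is mapped as an independent 32-bit VGPR value.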
    case Intrinsic::amdgcn_exp:
      // FIXME: Could we support packed types here?
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      OpdsMapping[5] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      OpdsMapping[6] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      break;
    case Intrinsic::amdgcn_exp_row:
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      OpdsMapping[5] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      OpdsMapping[6] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      OpdsMapping[8] = getSGPROpMapping(MI.getOperand(8).getReg(), MRI, *TRI);
      break;
    case Intrinsic::amdgcn_s_sendmsg:
    case Intrinsic::amdgcn_s_sendmsghalt: {
      // This must be an SGPR, but accept a VGPR.
      unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
                                   AMDGPU::SGPRRegBankID);
      OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
      break;
    }
    case Intrinsic::amdgcn_s_setreg: {
      // This must be an SGPR, but accept a VGPR.
      unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
                                   AMDGPU::SGPRRegBankID);
      OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
      break;
    }
    case Intrinsic::amdgcn_end_cf: {
      unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
      break;
    }
    case Intrinsic::amdgcn_else: {
      unsigned WaveSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize);
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize);
      break;
    }
    case Intrinsic::amdgcn_live_mask: {
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
      break;
    }
    case Intrinsic::amdgcn_wqm_demote:
    case Intrinsic::amdgcn_kill: {
      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
      break;
    }
    case Intrinsic::amdgcn_raw_buffer_load:
    case Intrinsic::amdgcn_raw_tbuffer_load: {
      // FIXME: Should make intrinsic ID the last operand of the instruction,
      // then this would be the same as store.
      OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
      OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_raw_buffer_load_lds: {
      OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
      OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
      break;
    }
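    // For the raw buffer stores below, operand 1 is vdata (VGPR), operand 2
    // the rsrc descriptor, operand 3 voffset (VGPR), and operand 4 soffset.
    // getSGPROpMapping reports whatever bank the operand currently has as
    // legal; if the descriptor or soffset turns out to be divergent,
    // applyMappingImpl has to repair it with a waterfall loop.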
    case Intrinsic::amdgcn_raw_buffer_store:
    case Intrinsic::amdgcn_raw_buffer_store_format:
    case Intrinsic::amdgcn_raw_tbuffer_store: {
      OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
      OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_struct_buffer_load:
    case Intrinsic::amdgcn_struct_tbuffer_load: {
      OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
      OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
      OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_struct_buffer_load_lds: {
      OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
      OpdsMapping[5] = getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
      OpdsMapping[6] = getSGPROpMapping(MI.getOperand(6).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_struct_buffer_store:
    case Intrinsic::amdgcn_struct_tbuffer_store: {
      OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
      OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
      OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_init_exec_from_input: {
      unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
      break;
    }
    case Intrinsic::amdgcn_ds_gws_init:
    case Intrinsic::amdgcn_ds_gws_barrier:
    case Intrinsic::amdgcn_ds_gws_sema_br: {
      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);

      // This must be an SGPR, but accept a VGPR.
      unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
                                   AMDGPU::SGPRRegBankID);
      OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
      break;
    }
    case Intrinsic::amdgcn_ds_gws_sema_v:
    case Intrinsic::amdgcn_ds_gws_sema_p:
    case Intrinsic::amdgcn_ds_gws_sema_release_all: {
      // This must be an SGPR, but accept a VGPR.
      unsigned Bank = getRegBankID(MI.getOperand(1).getReg(), MRI,
                                   AMDGPU::SGPRRegBankID);
      OpdsMapping[1] = AMDGPU::getValueMapping(Bank, 32);
      break;
    }
    case Intrinsic::amdgcn_global_load_lds: {
      OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      break;
    }
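    // In the lds_direct_load case below, M0 is located positionally as the
    // last operand, and the registers between the result and M0 are plain
    // 32-bit VGPR data. M0 must ultimately be an SGPR, so the mapping keeps
    // whatever bank it currently has and leaves the fixup of a divergent
    // value (e.g. a readfirstlane) to applyMappingImpl.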
    case Intrinsic::amdgcn_lds_direct_load: {
      const int M0Idx = MI.getNumOperands() - 1;
      Register M0Reg = MI.getOperand(M0Idx).getReg();
      unsigned M0Bank = getRegBankID(M0Reg, MRI, AMDGPU::SGPRRegBankID);
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();

      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
      for (int I = 2; I != M0Idx && MI.getOperand(I).isReg(); ++I)
        OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);

      // Must be an SGPR, but we must take whatever the original bank is and
      // fix it later.
      OpdsMapping[M0Idx] = AMDGPU::getValueMapping(M0Bank, 32);
      break;
    }
    case Intrinsic::amdgcn_ds_add_gs_reg_rtn:
    case Intrinsic::amdgcn_ds_sub_gs_reg_rtn:
      OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
      OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      break;
    default:
      return getInvalidInstructionMapping();
    }
    break;
  }
  case AMDGPU::G_SELECT: {
    unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
                                    AMDGPU::SGPRRegBankID);
    unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI,
                                    AMDGPU::SGPRRegBankID);
    bool SGPRSrcs = Op2Bank == AMDGPU::SGPRRegBankID &&
                    Op3Bank == AMDGPU::SGPRRegBankID;

    // The select can only stay on the SALU if both value operands and the
    // condition are uniform; any divergent operand forces the select to the
    // VALU, and the condition must then be treated as a VCC boolean.
    unsigned CondBankDefault = SGPRSrcs ?
      AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID;
    unsigned CondBank = getRegBankID(MI.getOperand(1).getReg(), MRI,
                                     CondBankDefault);
    if (CondBank == AMDGPU::SGPRRegBankID)
      CondBank = SGPRSrcs ? AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID;
    else if (CondBank == AMDGPU::VGPRRegBankID)
      CondBank = AMDGPU::VCCRegBankID;

    unsigned Bank = SGPRSrcs && CondBank == AMDGPU::SGPRRegBankID ?
      AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;

    assert(CondBank == AMDGPU::VCCRegBankID ||
           CondBank == AMDGPU::SGPRRegBankID);

    // TODO: Should report 32-bit for scalar condition type.
    if (Size == 64) {
      OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
      OpdsMapping[1] = AMDGPU::getValueMapping(CondBank, 1);
      OpdsMapping[2] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
      OpdsMapping[3] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
    } else {
      OpdsMapping[0] = AMDGPU::getValueMapping(Bank, Size);
      OpdsMapping[1] = AMDGPU::getValueMapping(CondBank, 1);
      OpdsMapping[2] = AMDGPU::getValueMapping(Bank, Size);
      OpdsMapping[3] = AMDGPU::getValueMapping(Bank, Size);
    }

    break;
  }

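  // For G_SI_CALL, the 64-bit def is always reported as an SGPR, and operand
  // 1 is the callee. The trailing register operands are the arguments
  // already placed by call lowering, so whatever bank each one currently has
  // is reported as legal. A divergent callee must be handled later as a
  // waterfall loop around the call.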
  case AMDGPU::G_SI_CALL: {
    OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 64);
    // Lie and claim everything is legal, even though some need to be
    // SGPRs. applyMapping will have to deal with it as a waterfall loop.
    OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);

    // Allow anything for implicit arguments.
    for (unsigned I = 4; I < MI.getNumOperands(); ++I) {
      if (MI.getOperand(I).isReg()) {
        Register Reg = MI.getOperand(I).getReg();
        auto OpBank = getRegBankID(Reg, MRI);
        unsigned Size = getSizeInBits(Reg, MRI, *TRI);
        OpdsMapping[I] = AMDGPU::getValueMapping(OpBank, Size);
      }
    }
    break;
  }
  case AMDGPU::G_LOAD:
  case AMDGPU::G_ZEXTLOAD:
  case AMDGPU::G_SEXTLOAD:
    return getInstrMappingForLoad(MI);

  case AMDGPU::G_ATOMICRMW_XCHG:
  case AMDGPU::G_ATOMICRMW_ADD:
  case AMDGPU::G_ATOMICRMW_SUB:
  case AMDGPU::G_ATOMICRMW_AND:
  case AMDGPU::G_ATOMICRMW_OR:
  case AMDGPU::G_ATOMICRMW_XOR:
  case AMDGPU::G_ATOMICRMW_MAX:
  case AMDGPU::G_ATOMICRMW_MIN:
  case AMDGPU::G_ATOMICRMW_UMAX:
  case AMDGPU::G_ATOMICRMW_UMIN:
  case AMDGPU::G_ATOMICRMW_FADD:
  case AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG:
  case AMDGPU::G_AMDGPU_ATOMIC_INC:
  case AMDGPU::G_AMDGPU_ATOMIC_DEC:
  case AMDGPU::G_AMDGPU_ATOMIC_FMIN:
  case AMDGPU::G_AMDGPU_ATOMIC_FMAX: {
    OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
    OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg());
    OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
    break;
  }
  case AMDGPU::G_ATOMIC_CMPXCHG: {
    OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
    OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg());
    OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
    OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
    break;
  }
  case AMDGPU::G_BRCOND: {
    // A 1-bit SGPR condition is a scalar boolean; any other bank means the
    // condition is divergent and must use VCC.
    unsigned Bank = getRegBankID(MI.getOperand(0).getReg(), MRI,
                                 AMDGPU::SGPRRegBankID);
    assert(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1);
    if (Bank != AMDGPU::SGPRRegBankID)
      Bank = AMDGPU::VCCRegBankID;

    OpdsMapping[0] = AMDGPU::getValueMapping(Bank, 1);
    break;
  }
  case AMDGPU::G_FPTRUNC_ROUND_UPWARD:
  case AMDGPU::G_FPTRUNC_ROUND_DOWNWARD:
    return getDefaultMappingVOP(MI);
  }

  return getInstructionMapping(/*ID*/1, /*Cost*/1,
                               getOperandsMapping(OpdsMapping),
                               MI.getNumOperands());
}