//===- AMDGPURegisterBankInfo.cpp -------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the RegisterBankInfo class for
/// AMDGPU.
///
/// \par
///
/// AMDGPU has unique register bank constraints that require special high level
/// strategies to deal with. There are two main true physical register banks:
/// VGPR (vector), and SGPR (scalar). Additionally the VCC register bank is a
/// sort of pseudo-register bank needed to represent SGPRs used in a vector
/// boolean context. There is also the AGPR bank, which is a special purpose
/// physical register bank present on some subtargets.
///
/// Copying from VGPR to SGPR is generally illegal, unless the value is known to
/// be uniform. It is generally not valid to legalize operands by inserting
/// copies as on other targets. Operations which require uniform, SGPR operands
/// generally require scalarization by repeatedly executing the instruction,
/// activating each set of lanes using a unique set of input values. This is
/// referred to as a waterfall loop.
///
/// \par Booleans
///
/// Booleans (s1 values) require special consideration. A vector compare result
/// is naturally a bitmask with one bit per lane, in a 32 or 64-bit
/// register. These are represented with the VCC bank. During selection, we need
/// to be able to unambiguously go back from a register class to a register
/// bank. To distinguish whether an SGPR should use the SGPR or VCC register
/// bank, we need to know the use context type. An SGPR s1 value always means a
/// VCC bank value, otherwise it will be the SGPR bank. A scalar compare sets
/// SCC, which is a 1-bit unaddressable register. This will need to be copied to
/// a 32-bit virtual register. Taken together, this means we need to adjust the
/// type of boolean operations to be regbank legal. All SALU booleans need to be
/// widened to 32-bits, and all VALU booleans need to be s1 values.
///
/// A noteworthy exception to the s1-means-vcc rule is for legalization artifact
/// casts. G_TRUNC s1 results, and G_SEXT/G_ZEXT/G_ANYEXT sources are never vcc
/// bank. A non-boolean source (such as a truncate from a 1-bit load from
/// memory) will require a copy to the VCC bank which will require clearing the
/// high bits and inserting a compare.
///
/// \par Constant bus restriction
///
/// VALU instructions have a limitation known as the constant bus
/// restriction. Most VALU instructions can use SGPR operands, but may read at
/// most 1 SGPR or constant literal value (this is raised to 2 in gfx10 for most
/// instructions). This is one unique SGPR, so the same SGPR may be used for
/// multiple operands. From a register bank perspective, any combination of
/// operands should be legal as an SGPR, but this is contextually dependent on
/// the SGPR operands all being the same register. It is therefore optimal to
/// choose the SGPR with the most uses to minimize the number of copies.
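///
/// For example, on pre-gfx10 targets an instruction like
/// "v_add_f32 v0, s0, s1" reads two unique SGPRs and violates the restriction,
/// while "v_add_f32 v0, s0, s0" reads only one and is fine (illustrative
/// registers; only the count of unique SGPRs matters).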
///
/// We avoid trying to solve this problem in RegBankSelect. Any VALU G_*
/// operation should have its source operands all mapped to VGPRs (except for
/// VCC), inserting copies from any SGPR operands. This is the most trivial
/// legal mapping. Anything beyond the simplest 1:1 instruction selection would
/// be too complicated to solve here. Every optimization pattern or instruction
/// selected to multiple outputs would have to enforce this rule, and there
/// would be additional complexity in tracking this rule for every G_*
/// operation. By forcing all inputs to VGPRs, it also simplifies the task of
/// picking the optimal operand combination from a post-isel optimization pass.
///
//===----------------------------------------------------------------------===//

#include "AMDGPURegisterBankInfo.h"

#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUInstrInfo.h"
#include "GCNSubtarget.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/RegisterBank.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"

#define GET_TARGET_REGBANK_IMPL
#include "AMDGPUGenRegisterBank.inc"

// This file will be TableGen'ed at some point.
#include "AMDGPUGenRegisterBankInfo.def"

using namespace llvm;
using namespace MIPatternMatch;

namespace {

// Observer to apply a register bank to new registers created by
// LegalizerHelper.
class ApplyRegBankMapping final : public GISelChangeObserver {
private:
  const AMDGPURegisterBankInfo &RBI;
  MachineRegisterInfo &MRI;
  const RegisterBank *NewBank;
  SmallVector<MachineInstr *, 4> NewInsts;

public:
  ApplyRegBankMapping(const AMDGPURegisterBankInfo &RBI_,
                      MachineRegisterInfo &MRI_, const RegisterBank *RB)
      : RBI(RBI_), MRI(MRI_), NewBank(RB) {}

  ~ApplyRegBankMapping() {
    for (MachineInstr *MI : NewInsts)
      applyBank(*MI);
  }

  /// Set any registers that don't have a set register class or bank to SALU.
  void applyBank(MachineInstr &MI) {
    const unsigned Opc = MI.getOpcode();
    if (Opc == AMDGPU::G_ANYEXT || Opc == AMDGPU::G_ZEXT ||
        Opc == AMDGPU::G_SEXT) {
      // LegalizerHelper wants to use the basic legalization artifacts when
      // widening etc. We don't handle selection with vcc in artifact sources,
      // so we need to use a select instead to handle these properly.
      Register DstReg = MI.getOperand(0).getReg();
      Register SrcReg = MI.getOperand(1).getReg();
      const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, MRI, *RBI.TRI);
      if (SrcBank == &AMDGPU::VCCRegBank) {
        const LLT S32 = LLT::scalar(32);
        assert(MRI.getType(SrcReg) == LLT::scalar(1));
        assert(MRI.getType(DstReg) == S32);
        assert(NewBank == &AMDGPU::VGPRRegBank);

        // Replace the extension with a select, which really uses the boolean
        // source.
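        // (A VCC-bank s1 is a lane mask, so the extension becomes a per-lane
        // select: -1 for G_SEXT or 1 for G_ZEXT/G_ANYEXT where the bit is set,
        // 0 elsewhere.)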
        MachineIRBuilder B(MI);
        auto True = B.buildConstant(S32, Opc == AMDGPU::G_SEXT ? -1 : 1);
        auto False = B.buildConstant(S32, 0);
        B.buildSelect(DstReg, SrcReg, True, False);
        MRI.setRegBank(True.getReg(0), *NewBank);
        MRI.setRegBank(False.getReg(0), *NewBank);
        MI.eraseFromParent();
      }

      assert(!MRI.getRegClassOrRegBank(DstReg));
      MRI.setRegBank(DstReg, *NewBank);
      return;
    }

#ifndef NDEBUG
    if (Opc == AMDGPU::G_TRUNC) {
      Register DstReg = MI.getOperand(0).getReg();
      const RegisterBank *DstBank = RBI.getRegBank(DstReg, MRI, *RBI.TRI);
      assert(DstBank != &AMDGPU::VCCRegBank);
    }
#endif

    for (MachineOperand &Op : MI.operands()) {
      if (!Op.isReg())
        continue;

      // We may see physical registers if building a real MI
      Register Reg = Op.getReg();
      if (Reg.isPhysical() || MRI.getRegClassOrRegBank(Reg))
        continue;

      const RegisterBank *RB = NewBank;
      if (MRI.getType(Reg) == LLT::scalar(1)) {
        assert(NewBank == &AMDGPU::VGPRRegBank &&
               "s1 operands should only be used for vector bools");
        assert((MI.getOpcode() != AMDGPU::G_TRUNC &&
                MI.getOpcode() != AMDGPU::G_ANYEXT) &&
               "not expecting legalization artifacts here");
        RB = &AMDGPU::VCCRegBank;
      }

      MRI.setRegBank(Reg, *RB);
    }
  }

  void erasingInstr(MachineInstr &MI) override {}

  void createdInstr(MachineInstr &MI) override {
    // At this point, the instruction was just inserted and has no operands.
    NewInsts.push_back(&MI);
  }

  void changingInstr(MachineInstr &MI) override {}
  void changedInstr(MachineInstr &MI) override {
    // FIXME: In principle we should probably add the instruction to NewInsts,
    // but the way the LegalizerHelper uses the observer, we will always see the
    // registers we need to set the regbank on also referenced in a new
    // instruction.
  }
};

} // anonymous namespace

AMDGPURegisterBankInfo::AMDGPURegisterBankInfo(const GCNSubtarget &ST)
    : Subtarget(ST), TRI(Subtarget.getRegisterInfo()),
      TII(Subtarget.getInstrInfo()) {

  // HACK: Until this is fully tablegen'd.
  static llvm::once_flag InitializeRegisterBankFlag;

  static auto InitializeRegisterBankOnce = [this]() {
    assert(&getRegBank(AMDGPU::SGPRRegBankID) == &AMDGPU::SGPRRegBank &&
           &getRegBank(AMDGPU::VGPRRegBankID) == &AMDGPU::VGPRRegBank &&
           &getRegBank(AMDGPU::AGPRRegBankID) == &AMDGPU::AGPRRegBank);
    (void)this;
  };

  llvm::call_once(InitializeRegisterBankFlag, InitializeRegisterBankOnce);
}

static bool isVectorRegisterBank(const RegisterBank &Bank) {
  unsigned BankID = Bank.getID();
  return BankID == AMDGPU::VGPRRegBankID || BankID == AMDGPU::AGPRRegBankID;
}

unsigned AMDGPURegisterBankInfo::copyCost(const RegisterBank &Dst,
                                          const RegisterBank &Src,
                                          unsigned Size) const {
  // TODO: Should there be a UniformVGPRRegBank which can use readfirstlane?
  if (Dst.getID() == AMDGPU::SGPRRegBankID &&
      (isVectorRegisterBank(Src) || Src.getID() == AMDGPU::VCCRegBankID)) {
    return std::numeric_limits<unsigned>::max();
  }

  // Bool values are tricky, because the meaning is based on context. The SCC
  // and VCC banks are for the natural scalar and vector conditions produced by
  // a compare.
  //
  // Legalization doesn't know about the necessary context, so an s1 use may
  // have been a truncate from an arbitrary value, in which case a copy (lowered
  // as a compare with 0) needs to be inserted.
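  // (Clearing the high bits with an AND by 1 and comparing the result against
  // zero, e.g. with V_CMP_NE_U32, is the typical lowering of such a copy.)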
  if (Size == 1 &&
      (Dst.getID() == AMDGPU::SGPRRegBankID) &&
      (isVectorRegisterBank(Src) ||
       Src.getID() == AMDGPU::SGPRRegBankID ||
       Src.getID() == AMDGPU::VCCRegBankID))
    return std::numeric_limits<unsigned>::max();

  // There is no direct copy between AGPRs.
  if (Dst.getID() == AMDGPU::AGPRRegBankID &&
      Src.getID() == AMDGPU::AGPRRegBankID)
    return 4;

  return RegisterBankInfo::copyCost(Dst, Src, Size);
}

unsigned AMDGPURegisterBankInfo::getBreakDownCost(
  const ValueMapping &ValMapping,
  const RegisterBank *CurBank) const {
  // Check if this is a breakdown for G_LOAD to move the pointer from SGPR to
  // VGPR.
  // FIXME: Is there a better way to do this?
  if (ValMapping.NumBreakDowns >= 2 || ValMapping.BreakDown[0].Length >= 64)
    return 10; // This is expensive.

  assert(ValMapping.NumBreakDowns == 2 &&
         ValMapping.BreakDown[0].Length == 32 &&
         ValMapping.BreakDown[0].StartIdx == 0 &&
         ValMapping.BreakDown[1].Length == 32 &&
         ValMapping.BreakDown[1].StartIdx == 32 &&
         ValMapping.BreakDown[0].RegBank == ValMapping.BreakDown[1].RegBank);

  // 32-bit extract of a 64-bit value is just access of a subregister, so free.
  // TODO: Cost of 0 hits assert, though it's not clear it's what we really
  // want.

  // TODO: 32-bit insert to a 64-bit SGPR may incur a non-free copy due to SGPR
  // alignment restrictions, but this probably isn't important.
  return 1;
}

const RegisterBank &
AMDGPURegisterBankInfo::getRegBankFromRegClass(const TargetRegisterClass &RC,
                                               LLT Ty) const {
  if (&RC == &AMDGPU::SReg_1RegClass)
    return AMDGPU::VCCRegBank;

  // We promote real scalar booleans to SReg_32. Any SGPR using s1 is really a
  // VCC-like use.
  if (TRI->isSGPRClass(&RC)) {
    // FIXME: This probably came from a copy from a physical register, which
    // should be inferable from the copied to-type. We don't have many boolean
    // physical register constraints so just assume a normal SGPR for now.
    if (!Ty.isValid())
      return AMDGPU::SGPRRegBank;

    return Ty == LLT::scalar(1) ? AMDGPU::VCCRegBank : AMDGPU::SGPRRegBank;
  }

  return TRI->isAGPRClass(&RC) ? AMDGPU::AGPRRegBank : AMDGPU::VGPRRegBank;
}

template <unsigned NumOps>
RegisterBankInfo::InstructionMappings
AMDGPURegisterBankInfo::addMappingFromTable(
    const MachineInstr &MI, const MachineRegisterInfo &MRI,
    const std::array<unsigned, NumOps> RegSrcOpIdx,
    ArrayRef<OpRegBankEntry<NumOps>> Table) const {

  InstructionMappings AltMappings;

  SmallVector<const ValueMapping *, 10> Operands(MI.getNumOperands());

  unsigned Sizes[NumOps];
  for (unsigned I = 0; I < NumOps; ++I) {
    Register Reg = MI.getOperand(RegSrcOpIdx[I]).getReg();
    Sizes[I] = getSizeInBits(Reg, MRI, *TRI);
  }

  for (unsigned I = 0, E = MI.getNumExplicitDefs(); I != E; ++I) {
    unsigned SizeI = getSizeInBits(MI.getOperand(I).getReg(), MRI, *TRI);
    Operands[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SizeI);
  }

  // getInstrMapping's default mapping uses ID 1, so start at 2.
  unsigned MappingID = 2;
  for (const auto &Entry : Table) {
    for (unsigned I = 0; I < NumOps; ++I) {
      int OpIdx = RegSrcOpIdx[I];
      Operands[OpIdx] = AMDGPU::getValueMapping(Entry.RegBanks[I], Sizes[I]);
    }

    AltMappings.push_back(&getInstructionMapping(MappingID++, Entry.Cost,
                                                 getOperandsMapping(Operands),
                                                 Operands.size()));
  }

  return AltMappings;
}

RegisterBankInfo::InstructionMappings
AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsic(
    const MachineInstr &MI, const MachineRegisterInfo &MRI) const {
  switch (MI.getIntrinsicID()) {
  case Intrinsic::amdgcn_readlane: {
    static const OpRegBankEntry<3> Table[2] = {
      // Perfectly legal.
      { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },

      // Need a readfirstlane for the index.
      { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 }
    };

    const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 2, 3 } };
    return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, Table);
  }
  case Intrinsic::amdgcn_writelane: {
    static const OpRegBankEntry<4> Table[4] = {
      // Perfectly legal.
      { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },

      // Need readfirstlane of first op
      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 },

      // Need readfirstlane of second op
      { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 },

      // Need readfirstlane of both ops
      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 3 }
    };

    // dst, value, lane, vdst_in
    const std::array<unsigned, 4> RegSrcOpIdx = { { 0, 2, 3, 4 } };
    return addMappingFromTable<4>(MI, MRI, RegSrcOpIdx, Table);
  }
  default:
    return RegisterBankInfo::getInstrAlternativeMappings(MI);
  }
}

RegisterBankInfo::InstructionMappings
AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsicWSideEffects(
    const MachineInstr &MI, const MachineRegisterInfo &MRI) const {

  switch (MI.getIntrinsicID()) {
  case Intrinsic::amdgcn_s_buffer_load: {
    static const OpRegBankEntry<2> Table[4] = {
      // Perfectly legal.
      { { AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },

      // Only need 1 register in loop
      { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 300 },

      // Have to waterfall the resource.
      { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1000 },

      // Have to waterfall the resource, and the offset.
      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1500 }
    };

    // rsrc, offset
    const std::array<unsigned, 2> RegSrcOpIdx = { { 2, 3 } };
    return addMappingFromTable<2>(MI, MRI, RegSrcOpIdx, Table);
  }
  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap: {
    // VGPR = M0, VGPR
    static const OpRegBankEntry<3> Table[2] = {
      // Perfectly legal.
      { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },

      // Need a readfirstlane for m0
      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 }
    };

    const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 2, 3 } };
    return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, Table);
  }
  case Intrinsic::amdgcn_s_sendmsg:
  case Intrinsic::amdgcn_s_sendmsghalt: {
    // FIXME: Should have no register for immediate
    static const OpRegBankEntry<1> Table[2] = {
      // Perfectly legal.
      { { AMDGPU::SGPRRegBankID }, 1 },

      // Need readlane
      { { AMDGPU::VGPRRegBankID }, 3 }
    };

    const std::array<unsigned, 1> RegSrcOpIdx = { { 2 } };
    return addMappingFromTable<1>(MI, MRI, RegSrcOpIdx, Table);
  }
  default:
    return RegisterBankInfo::getInstrAlternativeMappings(MI);
  }
}

// FIXME: Returns uniform if there's no source value information. This is
// probably wrong.
static bool isScalarLoadLegal(const MachineInstr &MI) {
  if (!MI.hasOneMemOperand())
    return false;

  const MachineMemOperand *MMO = *MI.memoperands_begin();
  const unsigned AS = MMO->getAddrSpace();
  const bool IsConst = AS == AMDGPUAS::CONSTANT_ADDRESS ||
                       AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT;
  // Require 4-byte alignment.
  return MMO->getAlign() >= Align(4) &&
         // Can't do a scalar atomic load.
         !MMO->isAtomic() &&
         // Don't use scalar loads for volatile accesses to non-constant address
         // spaces.
         (IsConst || !MMO->isVolatile()) &&
         // Memory must be known constant, or not written before this load.
         (IsConst || MMO->isInvariant() || (MMO->getFlags() & MONoClobber)) &&
         AMDGPUInstrInfo::isUniformMMO(MMO);
}

RegisterBankInfo::InstructionMappings
AMDGPURegisterBankInfo::getInstrAlternativeMappings(
    const MachineInstr &MI) const {

  const MachineFunction &MF = *MI.getParent()->getParent();
  const MachineRegisterInfo &MRI = MF.getRegInfo();

  InstructionMappings AltMappings;
  switch (MI.getOpcode()) {
  case TargetOpcode::G_CONSTANT:
  case TargetOpcode::G_IMPLICIT_DEF: {
    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
    if (Size == 1) {
      static const OpRegBankEntry<1> Table[3] = {
        { { AMDGPU::VGPRRegBankID }, 1 },
        { { AMDGPU::SGPRRegBankID }, 1 },
        { { AMDGPU::VCCRegBankID }, 1 }
      };

      return addMappingFromTable<1>(MI, MRI, {{ 0 }}, Table);
    }

    [[fallthrough]];
  }
  case TargetOpcode::G_FCONSTANT:
  case TargetOpcode::G_FRAME_INDEX:
  case TargetOpcode::G_GLOBAL_VALUE: {
    static const OpRegBankEntry<1> Table[2] = {
      { { AMDGPU::VGPRRegBankID }, 1 },
      { { AMDGPU::SGPRRegBankID }, 1 }
    };

    return addMappingFromTable<1>(MI, MRI, {{ 0 }}, Table);
  }
  case TargetOpcode::G_AND:
  case TargetOpcode::G_OR:
  case TargetOpcode::G_XOR: {
    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);

    if (Size == 1) {
      // s_{and|or|xor}_b32 set scc when the result of the 32-bit op is not 0.
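      // A scalar s1 boolean is therefore operated on as a full 32-bit SGPR
      // value.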
      const InstructionMapping &SCCMapping = getInstructionMapping(
        1, 1, getOperandsMapping(
          {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32),
           AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32),
           AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32)}),
        3); // Num Operands
      AltMappings.push_back(&SCCMapping);

      const InstructionMapping &VCCMapping0 = getInstructionMapping(
        2, 1, getOperandsMapping(
          {AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size),
           AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size),
           AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size)}),
        3); // Num Operands
      AltMappings.push_back(&VCCMapping0);
      return AltMappings;
    }

    if (Size != 64)
      break;

    const InstructionMapping &SSMapping = getInstructionMapping(
      1, 1, getOperandsMapping(
        {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
      3); // Num Operands
    AltMappings.push_back(&SSMapping);

    const InstructionMapping &VVMapping = getInstructionMapping(
      2, 2, getOperandsMapping(
        {AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
         AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
         AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}),
      3); // Num Operands
    AltMappings.push_back(&VVMapping);
    break;
  }
  case TargetOpcode::G_LOAD:
  case TargetOpcode::G_ZEXTLOAD:
  case TargetOpcode::G_SEXTLOAD: {
    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
    LLT PtrTy = MRI.getType(MI.getOperand(1).getReg());
    unsigned PtrSize = PtrTy.getSizeInBits();
    unsigned AS = PtrTy.getAddressSpace();

    if ((AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::REGION_ADDRESS &&
         AS != AMDGPUAS::PRIVATE_ADDRESS) &&
        isScalarLoadLegal(MI)) {
      const InstructionMapping &SSMapping = getInstructionMapping(
        1, 1, getOperandsMapping(
          {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
           AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize)}),
        2); // Num Operands
      AltMappings.push_back(&SSMapping);
    }

    const InstructionMapping &VVMapping = getInstructionMapping(
      2, 1,
      getOperandsMapping(
        {AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
         AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize)}),
      2); // Num Operands
    AltMappings.push_back(&VVMapping);

    // It may be possible to have a vgpr = load sgpr mapping here, because
    // the mubuf instructions support this kind of load, but probably only for
    // gfx7 and older. However, the addressing mode matching in the instruction
    // selector should be able to do a better job of detecting and selecting
    // these kinds of loads from the vgpr = load vgpr mapping.

    return AltMappings;
  }
  case TargetOpcode::G_SELECT: {
    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
    const InstructionMapping &SSMapping = getInstructionMapping(1, 1,
      getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1),
                          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
      4); // Num Operands
    AltMappings.push_back(&SSMapping);

    const InstructionMapping &VVMapping = getInstructionMapping(2, 1,
      getOperandsMapping({AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
                          AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}),
      4); // Num Operands
    AltMappings.push_back(&VVMapping);

    return AltMappings;
  }
  case TargetOpcode::G_UADDE:
  case TargetOpcode::G_USUBE:
  case TargetOpcode::G_SADDE:
  case TargetOpcode::G_SSUBE: {
    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
    const InstructionMapping &SSMapping = getInstructionMapping(1, 1,
      getOperandsMapping(
        {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1),
         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1)}),
      5); // Num Operands
    AltMappings.push_back(&SSMapping);

    const InstructionMapping &VVMapping = getInstructionMapping(2, 1,
      getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
                          AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1)}),
      5); // Num Operands
    AltMappings.push_back(&VVMapping);
    return AltMappings;
  }
  case AMDGPU::G_BRCOND: {
    assert(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1);

    // TODO: Change type to 32 for scalar
    const InstructionMapping &SMapping = getInstructionMapping(
      1, 1, getOperandsMapping(
        {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1), nullptr}),
      2); // Num Operands
    AltMappings.push_back(&SMapping);

    const InstructionMapping &VMapping = getInstructionMapping(
      1, 1, getOperandsMapping(
        {AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1), nullptr}),
      2); // Num Operands
    AltMappings.push_back(&VMapping);
    return AltMappings;
  }
  case AMDGPU::G_INTRINSIC:
    return getInstrAlternativeMappingsIntrinsic(MI, MRI);
  case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS:
    return getInstrAlternativeMappingsIntrinsicWSideEffects(MI, MRI);
  default:
    break;
  }
  return RegisterBankInfo::getInstrAlternativeMappings(MI);
}

void AMDGPURegisterBankInfo::split64BitValueForMapping(
  MachineIRBuilder &B,
  SmallVector<Register, 2> &Regs,
  LLT HalfTy,
  Register Reg) const {
  assert(HalfTy.getSizeInBits() == 32);
  MachineRegisterInfo *MRI = B.getMRI();
  Register LoLHS = MRI->createGenericVirtualRegister(HalfTy);
  Register HiLHS = MRI->createGenericVirtualRegister(HalfTy);
  const RegisterBank *Bank = getRegBank(Reg, *MRI, *TRI);
  MRI->setRegBank(LoLHS, *Bank);
  MRI->setRegBank(HiLHS, *Bank);
  Regs.push_back(LoLHS);
  Regs.push_back(HiLHS);

  B.buildInstr(AMDGPU::G_UNMERGE_VALUES)
    .addDef(LoLHS)
    .addDef(HiLHS)
    .addUse(Reg);
}

/// Replace the current type each register in \p Regs has with \p NewTy
static void setRegsToType(MachineRegisterInfo &MRI, ArrayRef<Register> Regs,
                          LLT NewTy) {
  for (Register Reg : Regs) {
    assert(MRI.getType(Reg).getSizeInBits() == NewTy.getSizeInBits());
    MRI.setType(Reg, NewTy);
  }
}

static LLT getHalfSizedType(LLT Ty) {
  if (Ty.isVector()) {
    assert(Ty.getElementCount().isKnownMultipleOf(2));
    return LLT::scalarOrVector(Ty.getElementCount().divideCoefficientBy(2),
                               Ty.getElementType());
  }

  assert(Ty.getScalarSizeInBits() % 2 == 0);
  return LLT::scalar(Ty.getScalarSizeInBits() / 2);
}

// Build one or more V_READFIRSTLANE_B32 instructions to move the given vector
// source value into a scalar register.
Register AMDGPURegisterBankInfo::buildReadFirstLane(MachineIRBuilder &B,
                                                    MachineRegisterInfo &MRI,
                                                    Register Src) const {
  LLT Ty = MRI.getType(Src);
  const RegisterBank *Bank = getRegBank(Src, MRI, *TRI);

  if (Bank == &AMDGPU::SGPRRegBank)
    return Src;

  unsigned Bits = Ty.getSizeInBits();
  assert(Bits % 32 == 0);

  if (Bank != &AMDGPU::VGPRRegBank) {
    // We need to copy from AGPR to VGPR
    Src = B.buildCopy(Ty, Src).getReg(0);
    MRI.setRegBank(Src, AMDGPU::VGPRRegBank);
  }

  LLT S32 = LLT::scalar(32);
  unsigned NumParts = Bits / 32;
  SmallVector<Register, 8> SrcParts;
  SmallVector<Register, 8> DstParts;

  if (Bits == 32) {
    SrcParts.push_back(Src);
  } else {
    auto Unmerge = B.buildUnmerge(S32, Src);
    for (unsigned i = 0; i < NumParts; ++i)
      SrcParts.push_back(Unmerge.getReg(i));
  }

  for (unsigned i = 0; i < NumParts; ++i) {
    Register SrcPart = SrcParts[i];
    Register DstPart = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
    MRI.setType(DstPart, NumParts == 1 ? Ty : S32);

    const TargetRegisterClass *Constrained =
        constrainGenericRegister(SrcPart, AMDGPU::VGPR_32RegClass, MRI);
    (void)Constrained;
    assert(Constrained && "Failed to constrain readfirstlane src reg");

    B.buildInstr(AMDGPU::V_READFIRSTLANE_B32, {DstPart}, {SrcPart});

    DstParts.push_back(DstPart);
  }

  if (Bits == 32)
    return DstParts[0];

  Register Dst = B.buildMergeLikeInstr(Ty, DstParts).getReg(0);
  MRI.setRegBank(Dst, AMDGPU::SGPRRegBank);
  return Dst;
}

/// Legalize instruction \p MI where operands in \p OpIndices must be SGPRs. If
/// any of the required SGPR operands are VGPRs, perform a waterfall loop to
/// execute the instruction for each unique combination of values in all lanes
/// in the wave. The block will be split such that the rest of the instructions
/// are moved to a new block.
///
/// Essentially performs this loop:
///
/// Save Execution Mask
/// For (Lane : Wavefront) {
///   Enable Lane, Disable all other lanes
///   SGPR = read SGPR value for current lane from VGPR
///   VGPRResult[Lane] = use_op SGPR
/// }
/// Restore Execution Mask
///
/// There is additional complexity in trying to compare values to identify the
/// unique values used.
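/// Wide values are read back with one V_READFIRSTLANE_B32 per 32-bit piece,
/// and the per-piece equality checks are combined with AND to form the loop
/// condition.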
bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
  MachineIRBuilder &B,
  iterator_range<MachineBasicBlock::iterator> Range,
  SmallSet<Register, 4> &SGPROperandRegs,
  MachineRegisterInfo &MRI) const {

  // Track use registers which have already been expanded with a readfirstlane
  // sequence. This may have multiple uses if moving a sequence.
  DenseMap<Register, Register> WaterfalledRegMap;

  MachineBasicBlock &MBB = B.getMBB();
  MachineFunction *MF = &B.getMF();

  const TargetRegisterClass *WaveRC = TRI->getWaveMaskRegClass();
  const unsigned MovExecOpc =
      Subtarget.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
  const unsigned MovExecTermOpc =
      Subtarget.isWave32() ? AMDGPU::S_MOV_B32_term : AMDGPU::S_MOV_B64_term;

  const unsigned XorTermOpc = Subtarget.isWave32() ?
    AMDGPU::S_XOR_B32_term : AMDGPU::S_XOR_B64_term;
  const unsigned AndSaveExecOpc = Subtarget.isWave32() ?
    AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64;
  const unsigned ExecReg = Subtarget.isWave32() ?
    AMDGPU::EXEC_LO : AMDGPU::EXEC;

#ifndef NDEBUG
  const int OrigRangeSize = std::distance(Range.begin(), Range.end());
#endif

  Register SaveExecReg = MRI.createVirtualRegister(WaveRC);
  Register InitSaveExecReg = MRI.createVirtualRegister(WaveRC);

  // Don't bother using generic instructions/registers for the exec mask.
  B.buildInstr(TargetOpcode::IMPLICIT_DEF)
    .addDef(InitSaveExecReg);

  Register PhiExec = MRI.createVirtualRegister(WaveRC);
  Register NewExec = MRI.createVirtualRegister(WaveRC);

  // To insert the loop we need to split the block. Move everything before this
  // point to a new block, and insert a new empty block before this instruction.
  MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
  MachineBasicBlock *BodyBB = MF->CreateMachineBasicBlock();
  MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
  MachineBasicBlock *RestoreExecBB = MF->CreateMachineBasicBlock();
  MachineFunction::iterator MBBI(MBB);
  ++MBBI;
  MF->insert(MBBI, LoopBB);
  MF->insert(MBBI, BodyBB);
  MF->insert(MBBI, RestoreExecBB);
  MF->insert(MBBI, RemainderBB);

  LoopBB->addSuccessor(BodyBB);
  BodyBB->addSuccessor(RestoreExecBB);
  BodyBB->addSuccessor(LoopBB);

  // Move the rest of the block into a new block.
  RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
  RemainderBB->splice(RemainderBB->begin(), &MBB, Range.end(), MBB.end());

  MBB.addSuccessor(LoopBB);
  RestoreExecBB->addSuccessor(RemainderBB);

  B.setInsertPt(*LoopBB, LoopBB->end());

  B.buildInstr(TargetOpcode::PHI)
    .addDef(PhiExec)
    .addReg(InitSaveExecReg)
    .addMBB(&MBB)
    .addReg(NewExec)
    .addMBB(BodyBB);

  const DebugLoc &DL = B.getDL();

  MachineInstr &FirstInst = *Range.begin();

  // Move the instruction into the loop body. Note we moved everything after
  // Range.end() already into a new block, so Range.end() is no longer valid.
  BodyBB->splice(BodyBB->end(), &MBB, Range.begin(), MBB.end());

  // Figure out the iterator range after splicing the instructions.
  MachineBasicBlock::iterator NewBegin = FirstInst.getIterator();
  auto NewEnd = BodyBB->end();

  B.setMBB(*LoopBB);

  LLT S1 = LLT::scalar(1);
  Register CondReg;

  assert(std::distance(NewBegin, NewEnd) == OrigRangeSize);

  for (MachineInstr &MI : make_range(NewBegin, NewEnd)) {
    for (MachineOperand &Op : MI.uses()) {
      if (!Op.isReg() || Op.isDef())
        continue;

      Register OldReg = Op.getReg();
      if (!SGPROperandRegs.count(OldReg))
        continue;

      // See if we already processed this register in another instruction in
      // the sequence.
      auto OldVal = WaterfalledRegMap.find(OldReg);
      if (OldVal != WaterfalledRegMap.end()) {
        Op.setReg(OldVal->second);
        continue;
      }

      Register OpReg = Op.getReg();
      LLT OpTy = MRI.getType(OpReg);

      const RegisterBank *OpBank = getRegBank(OpReg, MRI, *TRI);
      if (OpBank != &AMDGPU::VGPRRegBank) {
        // Insert copy from AGPR to VGPR before the loop.
        B.setMBB(MBB);
        OpReg = B.buildCopy(OpTy, OpReg).getReg(0);
        MRI.setRegBank(OpReg, AMDGPU::VGPRRegBank);
        B.setMBB(*LoopBB);
      }

      Register CurrentLaneReg = buildReadFirstLane(B, MRI, OpReg);

      // Build the comparison(s).
      unsigned OpSize = OpTy.getSizeInBits();
      bool Is64 = OpSize % 64 == 0;
      unsigned PartSize = Is64 ? 64 : 32;
      LLT PartTy = LLT::scalar(PartSize);
      unsigned NumParts = OpSize / PartSize;
      SmallVector<Register, 8> OpParts;
      SmallVector<Register, 8> CurrentLaneParts;

      if (NumParts == 1) {
        OpParts.push_back(OpReg);
        CurrentLaneParts.push_back(CurrentLaneReg);
      } else {
        auto UnmergeOp = B.buildUnmerge(PartTy, OpReg);
        auto UnmergeCurrentLane = B.buildUnmerge(PartTy, CurrentLaneReg);
        for (unsigned i = 0; i < NumParts; ++i) {
          OpParts.push_back(UnmergeOp.getReg(i));
          CurrentLaneParts.push_back(UnmergeCurrentLane.getReg(i));
          MRI.setRegBank(OpParts[i], AMDGPU::VGPRRegBank);
          MRI.setRegBank(CurrentLaneParts[i], AMDGPU::SGPRRegBank);
        }
      }

      for (unsigned i = 0; i < NumParts; ++i) {
        auto CmpReg = B.buildICmp(CmpInst::ICMP_EQ, S1, CurrentLaneParts[i],
                                  OpParts[i]).getReg(0);
        MRI.setRegBank(CmpReg, AMDGPU::VCCRegBank);

        if (!CondReg) {
          CondReg = CmpReg;
        } else {
          CondReg = B.buildAnd(S1, CondReg, CmpReg).getReg(0);
          MRI.setRegBank(CondReg, AMDGPU::VCCRegBank);
        }
      }

      Op.setReg(CurrentLaneReg);

      // Make sure we don't re-process this register again.
      WaterfalledRegMap.insert(std::pair(OldReg, Op.getReg()));
    }
  }

  // The ballot becomes a no-op during instruction selection.
  CondReg = B.buildIntrinsic(Intrinsic::amdgcn_ballot,
                             {LLT::scalar(Subtarget.isWave32() ? 32 : 64)},
                             false)
                .addReg(CondReg)
                .getReg(0);
  MRI.setRegClass(CondReg, WaveRC);

  // Update EXEC, save the original EXEC value to VCC.
  B.buildInstr(AndSaveExecOpc)
    .addDef(NewExec)
    .addReg(CondReg, RegState::Kill);

  MRI.setSimpleHint(NewExec, CondReg);

  B.setInsertPt(*BodyBB, BodyBB->end());

  // Update EXEC, switch all done bits to 0 and all todo bits to 1.
  B.buildInstr(XorTermOpc)
    .addDef(ExecReg)
    .addReg(ExecReg)
    .addReg(NewExec);

  // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
  // s_cbranch_scc0?

  // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
  B.buildInstr(AMDGPU::SI_WATERFALL_LOOP).addMBB(LoopBB);

  // Save the EXEC mask before the loop.
  BuildMI(MBB, MBB.end(), DL, TII->get(MovExecOpc), SaveExecReg)
    .addReg(ExecReg);

  // Restore the EXEC mask after the loop.
  B.setMBB(*RestoreExecBB);
  B.buildInstr(MovExecTermOpc)
    .addDef(ExecReg)
    .addReg(SaveExecReg);

  // Set the insert point after the original instruction, so any new
  // instructions will be in the remainder.
  B.setInsertPt(*RemainderBB, RemainderBB->begin());

  return true;
}

// Return any unique registers used by \p MI at \p OpIndices that need to be
// handled in a waterfall loop. Returns these registers in \p
// SGPROperandRegs. Returns true if there are any operands to handle and a
// waterfall loop is necessary.
bool AMDGPURegisterBankInfo::collectWaterfallOperands(
  SmallSet<Register, 4> &SGPROperandRegs, MachineInstr &MI,
  MachineRegisterInfo &MRI, ArrayRef<unsigned> OpIndices) const {
  for (unsigned Op : OpIndices) {
    assert(MI.getOperand(Op).isUse());
    Register Reg = MI.getOperand(Op).getReg();
    const RegisterBank *OpBank = getRegBank(Reg, MRI, *TRI);
    if (OpBank->getID() != AMDGPU::SGPRRegBankID)
      SGPROperandRegs.insert(Reg);
  }

  // No operands need to be replaced, so no need to loop.
  return !SGPROperandRegs.empty();
}

bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
  MachineIRBuilder &B, MachineInstr &MI, MachineRegisterInfo &MRI,
  ArrayRef<unsigned> OpIndices) const {
  // Use a set to avoid extra readfirstlanes in the case where multiple
  // operands are the same register.
  SmallSet<Register, 4> SGPROperandRegs;

  if (!collectWaterfallOperands(SGPROperandRegs, MI, MRI, OpIndices))
    return false;

  MachineBasicBlock::iterator I = MI.getIterator();
  return executeInWaterfallLoop(B, make_range(I, std::next(I)),
                                SGPROperandRegs, MRI);
}

bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  ArrayRef<unsigned> OpIndices) const {
  MachineIRBuilder B(MI);
  return executeInWaterfallLoop(B, MI, MRI, OpIndices);
}

// Legalize an operand that must be an SGPR by inserting a readfirstlane.
void AMDGPURegisterBankInfo::constrainOpWithReadfirstlane(
    MachineInstr &MI, MachineRegisterInfo &MRI, unsigned OpIdx) const {
  Register Reg = MI.getOperand(OpIdx).getReg();
  const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
  if (Bank == &AMDGPU::SGPRRegBank)
    return;

  MachineIRBuilder B(MI);

  Reg = buildReadFirstLane(B, MRI, Reg);
  MI.getOperand(OpIdx).setReg(Reg);
}

/// Split \p Ty into 2 pieces. The first will have \p FirstSize bits, and the
/// rest will be in the remainder.
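/// For example, splitting <3 x s32> at 64 bits yields {<2 x s32>, s32}, and
/// splitting s96 yields {s64, s32}.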
static std::pair<LLT, LLT> splitUnequalType(LLT Ty, unsigned FirstSize) {
  unsigned TotalSize = Ty.getSizeInBits();
  if (!Ty.isVector())
    return {LLT::scalar(FirstSize), LLT::scalar(TotalSize - FirstSize)};

  LLT EltTy = Ty.getElementType();
  unsigned EltSize = EltTy.getSizeInBits();
  assert(FirstSize % EltSize == 0);

  unsigned FirstPartNumElts = FirstSize / EltSize;
  unsigned RemainderElts = (TotalSize - FirstSize) / EltSize;

  return {LLT::scalarOrVector(ElementCount::getFixed(FirstPartNumElts), EltTy),
          LLT::scalarOrVector(ElementCount::getFixed(RemainderElts), EltTy)};
}

static LLT widen96To128(LLT Ty) {
  if (!Ty.isVector())
    return LLT::scalar(128);

  LLT EltTy = Ty.getElementType();
  assert(128 % EltTy.getSizeInBits() == 0);
  return LLT::fixed_vector(128 / EltTy.getSizeInBits(), EltTy);
}

bool AMDGPURegisterBankInfo::applyMappingLoad(MachineInstr &MI,
                        const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
                                              MachineRegisterInfo &MRI) const {
  Register DstReg = MI.getOperand(0).getReg();
  const LLT LoadTy = MRI.getType(DstReg);
  unsigned LoadSize = LoadTy.getSizeInBits();
  const unsigned MaxNonSmrdLoadSize = 128;

  const RegisterBank *DstBank =
    OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
  if (DstBank == &AMDGPU::SGPRRegBank) {
    // There are some special cases that we need to look at for 32 bit and 96
    // bit SGPR loads; otherwise we have nothing to do.
    if (LoadSize != 32 && LoadSize != 96)
      return false;

    MachineMemOperand *MMO = *MI.memoperands_begin();
    const unsigned MemSize = 8 * MMO->getSize();
    // Scalar loads of size 8 or 16 bit with proper alignment may be widened to
    // 32 bit. Check to see if we need to widen the memory access: 8 or 16 bit
    // scalar loads should have a load size of 32 but a memory access size of
    // less than 32.
    if (LoadSize == 32 &&
        (MemSize == 32 || LoadTy.isVector() || !isScalarLoadLegal(MI)))
      return false;

    Register PtrReg = MI.getOperand(1).getReg();

    ApplyRegBankMapping O(*this, MRI, &AMDGPU::SGPRRegBank);
    MachineIRBuilder B(MI, O);

    if (LoadSize == 32) {
      // This is an extending load from a sub-dword size. Widen the memory
      // access size to 4 bytes and clear the extra high bits appropriately.
      const LLT S32 = LLT::scalar(32);
      if (MI.getOpcode() == AMDGPU::G_SEXTLOAD) {
        // Must extend the sign bit into higher bits for a G_SEXTLOAD
        auto WideLoad = B.buildLoadFromOffset(S32, PtrReg, *MMO, 0);
        B.buildSExtInReg(MI.getOperand(0), WideLoad, MemSize);
      } else if (MI.getOpcode() == AMDGPU::G_ZEXTLOAD) {
        // Must extend zero into higher bits with an AND for a G_ZEXTLOAD
        auto WideLoad = B.buildLoadFromOffset(S32, PtrReg, *MMO, 0);
        B.buildZExtInReg(MI.getOperand(0), WideLoad, MemSize);
      } else
        // We do not need to touch the higher bits for regular loads.
        B.buildLoadFromOffset(MI.getOperand(0), PtrReg, *MMO, 0);
    } else {
      // 96-bit loads are only available for vector loads. We need to split
      // this into a 64-bit part and a 32-bit part (unless we can widen to a
      // 128-bit load).
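      // Widening to 128 bits requires the load be 16-byte aligned; otherwise
      // do the unequal split below.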
      if (MMO->getAlign() < Align(16)) {
        MachineFunction *MF = MI.getParent()->getParent();
        ApplyRegBankMapping ApplyBank(*this, MRI, DstBank);
        MachineIRBuilder B(MI, ApplyBank);
        LegalizerHelper Helper(*MF, ApplyBank, B);
        LLT Part64, Part32;
        std::tie(Part64, Part32) = splitUnequalType(LoadTy, 64);
        if (Helper.reduceLoadStoreWidth(cast<GAnyLoad>(MI), 0, Part64) !=
            LegalizerHelper::Legalized)
          return false;
        return true;
      } else {
        LLT WiderTy = widen96To128(LoadTy);
        auto WideLoad = B.buildLoadFromOffset(WiderTy, PtrReg, *MMO, 0);
        if (WiderTy.isScalar())
          B.buildTrunc(MI.getOperand(0), WideLoad);
        else {
          B.buildDeleteTrailingVectorElements(MI.getOperand(0).getReg(),
                                              WideLoad);
        }
      }
    }

    MI.eraseFromParent();
    return true;
  }

  // 128-bit loads are supported for all instruction types.
  if (LoadSize <= MaxNonSmrdLoadSize)
    return false;

  SmallVector<Register, 16> DefRegs(OpdMapper.getVRegs(0));
  SmallVector<Register, 1> SrcRegs(OpdMapper.getVRegs(1));

  if (SrcRegs.empty())
    SrcRegs.push_back(MI.getOperand(1).getReg());

  assert(LoadSize % MaxNonSmrdLoadSize == 0);

  // RegBankSelect only emits scalar types, so we need to reset the pointer
  // operand to a pointer type.
  Register BasePtrReg = SrcRegs[0];
  LLT PtrTy = MRI.getType(MI.getOperand(1).getReg());
  MRI.setType(BasePtrReg, PtrTy);

  unsigned NumSplitParts = LoadTy.getSizeInBits() / MaxNonSmrdLoadSize;
  const LLT LoadSplitTy = LoadTy.divide(NumSplitParts);
  ApplyRegBankMapping Observer(*this, MRI, &AMDGPU::VGPRRegBank);
  MachineIRBuilder B(MI, Observer);
  LegalizerHelper Helper(B.getMF(), Observer, B);

  if (LoadTy.isVector()) {
    if (Helper.fewerElementsVector(MI, 0, LoadSplitTy) !=
        LegalizerHelper::Legalized)
      return false;
  } else {
    if (Helper.narrowScalar(MI, 0, LoadSplitTy) != LegalizerHelper::Legalized)
      return false;
  }

  MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
  return true;
}

bool AMDGPURegisterBankInfo::applyMappingDynStackAlloc(
  MachineInstr &MI,
  const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
  MachineRegisterInfo &MRI) const {
  const MachineFunction &MF = *MI.getMF();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const auto &TFI = *ST.getFrameLowering();

  // Guard in case the stack growth direction ever changes with scratch
  // instructions.
  if (TFI.getStackGrowthDirection() == TargetFrameLowering::StackGrowsDown)
    return false;

  Register Dst = MI.getOperand(0).getReg();
  Register AllocSize = MI.getOperand(1).getReg();
  Align Alignment = assumeAligned(MI.getOperand(2).getImm());

  const RegisterBank *SizeBank = getRegBank(AllocSize, MRI, *TRI);

  // TODO: Need to emit a wave reduction to get the maximum size.
  if (SizeBank != &AMDGPU::SGPRRegBank)
    return false;

  LLT PtrTy = MRI.getType(Dst);
  LLT IntPtrTy = LLT::scalar(PtrTy.getSizeInBits());

  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  Register SPReg = Info->getStackPtrOffsetReg();
  ApplyRegBankMapping ApplyBank(*this, MRI, &AMDGPU::SGPRRegBank);
  MachineIRBuilder B(MI, ApplyBank);

  auto WaveSize = B.buildConstant(LLT::scalar(32), ST.getWavefrontSizeLog2());
  auto ScaledSize = B.buildShl(IntPtrTy, AllocSize, WaveSize);

  auto SPCopy = B.buildCopy(PtrTy, SPReg);
  if (Alignment > TFI.getStackAlign()) {
    auto PtrAdd = B.buildPtrAdd(PtrTy, SPCopy, ScaledSize);
    B.buildMaskLowPtrBits(Dst, PtrAdd,
                          Log2(Alignment) + ST.getWavefrontSizeLog2());
  } else {
    B.buildPtrAdd(Dst, SPCopy, ScaledSize);
  }

  MI.eraseFromParent();
  return true;
}

bool AMDGPURegisterBankInfo::applyMappingImage(
    MachineInstr &MI, const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
    MachineRegisterInfo &MRI, int RsrcIdx) const {
  const int NumDefs = MI.getNumExplicitDefs();

  // The reported argument index is relative to the IR intrinsic call arguments,
  // so we need to shift by the number of defs and the intrinsic ID.
  RsrcIdx += NumDefs + 1;

  // Insert copies to VGPR arguments.
  applyDefaultMapping(OpdMapper);

  // Fixup any SGPR arguments.
  SmallVector<unsigned, 4> SGPRIndexes;
  for (int I = NumDefs, NumOps = MI.getNumOperands(); I != NumOps; ++I) {
    if (!MI.getOperand(I).isReg())
      continue;

    // If this intrinsic has a sampler, it immediately follows rsrc.
    if (I == RsrcIdx || I == RsrcIdx + 1)
      SGPRIndexes.push_back(I);
  }

  executeInWaterfallLoop(MI, MRI, SGPRIndexes);
  return true;
}

static Register getSrcRegIgnoringCopies(const MachineRegisterInfo &MRI,
                                        Register Reg) {
  MachineInstr *Def = getDefIgnoringCopies(Reg, MRI);
  if (!Def)
    return Reg;

  // TODO: Guard against this being an implicit def
  return Def->getOperand(0).getReg();
}

// Analyze a combined offset from an llvm.amdgcn.s.buffer intrinsic and store
// the three offsets (voffset, soffset and instoffset).
static unsigned setBufferOffsets(MachineIRBuilder &B,
                                 const AMDGPURegisterBankInfo &RBI,
                                 Register CombinedOffset, Register &VOffsetReg,
                                 Register &SOffsetReg, int64_t &InstOffsetVal,
                                 Align Alignment) {
  const LLT S32 = LLT::scalar(32);
  MachineRegisterInfo *MRI = B.getMRI();

  if (std::optional<int64_t> Imm =
          getIConstantVRegSExtVal(CombinedOffset, *MRI)) {
    uint32_t SOffset, ImmOffset;
    if (AMDGPU::splitMUBUFOffset(*Imm, SOffset, ImmOffset, &RBI.Subtarget,
                                 Alignment)) {
      VOffsetReg = B.buildConstant(S32, 0).getReg(0);
      SOffsetReg = B.buildConstant(S32, SOffset).getReg(0);
      InstOffsetVal = ImmOffset;

      B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank);
      B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank);
      return SOffset + ImmOffset;
    }
  }

  Register Base;
  unsigned Offset;

  std::tie(Base, Offset) =
      AMDGPU::getBaseWithConstantOffset(*MRI, CombinedOffset);

  uint32_t SOffset, ImmOffset;
  if ((int)Offset > 0 && AMDGPU::splitMUBUFOffset(Offset, SOffset, ImmOffset,
                                                  &RBI.Subtarget, Alignment)) {
    if (RBI.getRegBank(Base, *MRI, *RBI.TRI) == &AMDGPU::VGPRRegBank) {
      VOffsetReg = Base;
      SOffsetReg = B.buildConstant(S32, SOffset).getReg(0);
      B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank);
      InstOffsetVal = ImmOffset;
      return 0; // XXX - Why is this 0?
    }

    // If we have an SGPR base, we can use it for soffset.
    if (SOffset == 0) {
      VOffsetReg = B.buildConstant(S32, 0).getReg(0);
      B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank);
      SOffsetReg = Base;
      InstOffsetVal = ImmOffset;
      return 0; // XXX - Why is this 0?
    }
  }

  // Handle the variable sgpr + vgpr case.
  MachineInstr *Add = getOpcodeDef(AMDGPU::G_ADD, CombinedOffset, *MRI);
  if (Add && (int)Offset >= 0) {
    Register Src0 = getSrcRegIgnoringCopies(*MRI, Add->getOperand(1).getReg());
    Register Src1 = getSrcRegIgnoringCopies(*MRI, Add->getOperand(2).getReg());

    const RegisterBank *Src0Bank = RBI.getRegBank(Src0, *MRI, *RBI.TRI);
    const RegisterBank *Src1Bank = RBI.getRegBank(Src1, *MRI, *RBI.TRI);

    if (Src0Bank == &AMDGPU::VGPRRegBank && Src1Bank == &AMDGPU::SGPRRegBank) {
      VOffsetReg = Src0;
      SOffsetReg = Src1;
      return 0;
    }

    if (Src0Bank == &AMDGPU::SGPRRegBank && Src1Bank == &AMDGPU::VGPRRegBank) {
      VOffsetReg = Src1;
      SOffsetReg = Src0;
      return 0;
    }
  }

  // Ensure we have a VGPR for the combined offset. This could be an issue if
  // we have an SGPR offset and a VGPR resource.
  if (RBI.getRegBank(CombinedOffset, *MRI, *RBI.TRI) == &AMDGPU::VGPRRegBank) {
    VOffsetReg = CombinedOffset;
  } else {
    VOffsetReg = B.buildCopy(S32, CombinedOffset).getReg(0);
    B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank);
  }

  SOffsetReg = B.buildConstant(S32, 0).getReg(0);
  B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank);
  return 0;
}

bool AMDGPURegisterBankInfo::applyMappingSBufferLoad(
  const OperandsMapper &OpdMapper) const {
  MachineInstr &MI = OpdMapper.getMI();
  MachineRegisterInfo &MRI = OpdMapper.getMRI();

  const LLT S32 = LLT::scalar(32);
  Register Dst = MI.getOperand(0).getReg();
  LLT Ty = MRI.getType(Dst);

  const RegisterBank *RSrcBank =
    OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
  const RegisterBank *OffsetBank =
    OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
  if (RSrcBank == &AMDGPU::SGPRRegBank &&
      OffsetBank == &AMDGPU::SGPRRegBank)
    return true; // Legal mapping

  // FIXME: 96-bit case was widened during legalize. We need to narrow it back
  // here but don't have an MMO.

  unsigned LoadSize = Ty.getSizeInBits();
  int NumLoads = 1;
  if (LoadSize == 256 || LoadSize == 512) {
    NumLoads = LoadSize / 128;
    Ty = Ty.divide(NumLoads);
  }

  // Use the alignment to ensure that the required offsets will fit into the
  // immediate offsets.
  const Align Alignment = NumLoads > 1 ? Align(16 * NumLoads) : Align(1);

  MachineIRBuilder B(MI);
  MachineFunction &MF = B.getMF();

  Register SOffset;
  Register VOffset;
  int64_t ImmOffset = 0;

  unsigned MMOOffset = setBufferOffsets(B, *this, MI.getOperand(2).getReg(),
                                        VOffset, SOffset, ImmOffset, Alignment);

  // TODO: 96-bit loads were widened to 128-bit results. Shrink the result if we
  // can, but we need to track an MMO for that.
  const unsigned MemSize = (Ty.getSizeInBits() + 7) / 8;
  const Align MemAlign(4); // FIXME: ABI type alignment?
  MachineMemOperand *BaseMMO = MF.getMachineMemOperand(
    MachinePointerInfo(),
    MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
    MachineMemOperand::MOInvariant,
    MemSize, MemAlign);
  if (MMOOffset != 0)
    BaseMMO = MF.getMachineMemOperand(BaseMMO, MMOOffset, MemSize);

  // If only the offset is divergent, emit a MUBUF buffer load instead. We can
  // assume that the buffer is unswizzled.

  Register RSrc = MI.getOperand(1).getReg();
  Register VIndex = B.buildConstant(S32, 0).getReg(0);
  B.getMRI()->setRegBank(VIndex, AMDGPU::VGPRRegBank);

  SmallVector<Register, 4> LoadParts(NumLoads);

  MachineBasicBlock::iterator MII = MI.getIterator();
  MachineInstrSpan Span(MII, &B.getMBB());

  for (int i = 0; i < NumLoads; ++i) {
    if (NumLoads == 1) {
      LoadParts[i] = Dst;
    } else {
      LoadParts[i] = MRI.createGenericVirtualRegister(Ty);
      MRI.setRegBank(LoadParts[i], AMDGPU::VGPRRegBank);
    }

    MachineMemOperand *MMO = BaseMMO;
    if (i != 0)
      BaseMMO = MF.getMachineMemOperand(BaseMMO, MMOOffset + 16 * i, MemSize);

    B.buildInstr(AMDGPU::G_AMDGPU_BUFFER_LOAD)
        .addDef(LoadParts[i])       // vdata
        .addUse(RSrc)               // rsrc
        .addUse(VIndex)             // vindex
        .addUse(VOffset)            // voffset
        .addUse(SOffset)            // soffset
        .addImm(ImmOffset + 16 * i) // offset(imm)
        .addImm(0)                  // cachepolicy, swizzled buffer(imm)
        .addImm(0)                  // idxen(imm)
        .addMemOperand(MMO);
  }

  // TODO: If only the resource is a VGPR, it may be better to execute the
  // scalar load in the waterfall loop if the resource is expected to frequently
  // be dynamically uniform.
  if (RSrcBank != &AMDGPU::SGPRRegBank) {
    // Remove the original instruction to avoid potentially confusing the
    // waterfall loop logic.
    B.setInstr(*Span.begin());
    MI.eraseFromParent();

    SmallSet<Register, 4> OpsToWaterfall;

    OpsToWaterfall.insert(RSrc);
    executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()),
                           OpsToWaterfall, MRI);
  }

  if (NumLoads != 1) {
    if (Ty.isVector())
      B.buildConcatVectors(Dst, LoadParts);
    else
      B.buildMergeLikeInstr(Dst, LoadParts);
  }

  // We removed the instruction earlier with a waterfall loop.
  if (RSrcBank == &AMDGPU::SGPRRegBank)
    MI.eraseFromParent();

  return true;
}

bool AMDGPURegisterBankInfo::applyMappingBFE(const OperandsMapper &OpdMapper,
                                             bool Signed) const {
  MachineInstr &MI = OpdMapper.getMI();
  MachineRegisterInfo &MRI = OpdMapper.getMRI();

  // Insert basic copies
  applyDefaultMapping(OpdMapper);

  Register DstReg = MI.getOperand(0).getReg();
  LLT Ty = MRI.getType(DstReg);

  const LLT S32 = LLT::scalar(32);

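  // For G_INTRINSIC, operand 1 is the intrinsic ID, so the value operands
  // start at index 2; for the G_SBFX/G_UBFX form they start at index 1.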
  unsigned FirstOpnd = MI.getOpcode() == AMDGPU::G_INTRINSIC ? 2 : 1;
  Register SrcReg = MI.getOperand(FirstOpnd).getReg();
  Register OffsetReg = MI.getOperand(FirstOpnd + 1).getReg();
  Register WidthReg = MI.getOperand(FirstOpnd + 2).getReg();

  const RegisterBank *DstBank =
    OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
  if (DstBank == &AMDGPU::VGPRRegBank) {
    if (Ty == S32)
      return true;

    // There are no 64-bit vgpr bitfield extract instructions, so the operation
    // is expanded to a sequence of instructions that implement the operation.
    ApplyRegBankMapping ApplyBank(*this, MRI, &AMDGPU::VGPRRegBank);
    MachineIRBuilder B(MI, ApplyBank);

    const LLT S64 = LLT::scalar(64);
    // Shift the source operand so that the extracted bits start at bit 0.
    auto ShiftOffset = Signed ? B.buildAShr(S64, SrcReg, OffsetReg)
                              : B.buildLShr(S64, SrcReg, OffsetReg);
    auto UnmergeSOffset = B.buildUnmerge({S32, S32}, ShiftOffset);

    // A 64-bit bitfield extract uses the 32-bit bitfield extract instructions
    // if the width is a constant.
    if (auto ConstWidth = getIConstantVRegValWithLookThrough(WidthReg, MRI)) {
      // Use the 32-bit bitfield extract instruction if the width is a constant.
      // Depending on the width size, use either the low or high 32-bits.
      auto Zero = B.buildConstant(S32, 0);
      auto WidthImm = ConstWidth->Value.getZExtValue();
      if (WidthImm <= 32) {
        // Use bitfield extract on the lower 32-bit source, and then sign-extend
        // or clear the upper 32-bits.
        auto Extract =
          Signed ? B.buildSbfx(S32, UnmergeSOffset.getReg(0), Zero, WidthReg)
                 : B.buildUbfx(S32, UnmergeSOffset.getReg(0), Zero, WidthReg);
        auto Extend =
          Signed ? B.buildAShr(S32, Extract, B.buildConstant(S32, 31)) : Zero;
        B.buildMergeLikeInstr(DstReg, {Extract, Extend});
      } else {
        // Use bitfield extract on upper 32-bit source, and combine with lower
        // 32-bit source.
        auto UpperWidth = B.buildConstant(S32, WidthImm - 32);
        auto Extract =
          Signed
            ? B.buildSbfx(S32, UnmergeSOffset.getReg(1), Zero, UpperWidth)
            : B.buildUbfx(S32, UnmergeSOffset.getReg(1), Zero, UpperWidth);
        B.buildMergeLikeInstr(DstReg, {UnmergeSOffset.getReg(0), Extract});
      }
      MI.eraseFromParent();
      return true;
    }

    // Expand to Src >> Offset << (64 - Width) >> (64 - Width) using 64-bit
    // operations.
    auto ExtShift = B.buildSub(S32, B.buildConstant(S32, 64), WidthReg);
    auto SignBit = B.buildShl(S64, ShiftOffset, ExtShift);
    if (Signed)
      B.buildAShr(S64, SignBit, ExtShift);
    else
      B.buildLShr(S64, SignBit, ExtShift);
    MI.eraseFromParent();
    return true;
  }

  // The scalar form packs the offset and width in a single operand.

  ApplyRegBankMapping ApplyBank(*this, MRI, &AMDGPU::SGPRRegBank);
  MachineIRBuilder B(MI, ApplyBank);

  // Ensure the high bits are clear to insert the offset.
  auto OffsetMask = B.buildConstant(S32, maskTrailingOnes<unsigned>(6));
  auto ClampOffset = B.buildAnd(S32, OffsetReg, OffsetMask);

  // Zeros out the low bits, so don't bother clamping the input value.
  auto ShiftWidth = B.buildShl(S32, WidthReg, B.buildConstant(S32, 16));

  // Transformation function, pack the offset and width of a BFE into
  // the format expected by the S_BFE_I32 / S_BFE_U32. In the second
  // source, bits [5:0] contain the offset and bits [22:16] the width.
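  // For example, offset 8 with width 16 packs to (16 << 16) | 8 = 0x100008.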
1545 auto MergedInputs = B.buildOr(S32, ClampOffset, ShiftWidth); 1546 1547 // TODO: It might be worth using a pseudo here to avoid scc clobber and 1548 // register class constraints. 1549 unsigned Opc = Ty == S32 ? (Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32) : 1550 (Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64); 1551 1552 auto MIB = B.buildInstr(Opc, {DstReg}, {SrcReg, MergedInputs}); 1553 if (!constrainSelectedInstRegOperands(*MIB, *TII, *TRI, *this)) 1554 llvm_unreachable("failed to constrain BFE"); 1555 1556 MI.eraseFromParent(); 1557 return true; 1558 } 1559 1560 bool AMDGPURegisterBankInfo::applyMappingMAD_64_32( 1561 const OperandsMapper &OpdMapper) const { 1562 MachineInstr &MI = OpdMapper.getMI(); 1563 MachineRegisterInfo &MRI = OpdMapper.getMRI(); 1564 1565 // Insert basic copies. 1566 applyDefaultMapping(OpdMapper); 1567 1568 Register Dst0 = MI.getOperand(0).getReg(); 1569 Register Dst1 = MI.getOperand(1).getReg(); 1570 Register Src0 = MI.getOperand(2).getReg(); 1571 Register Src1 = MI.getOperand(3).getReg(); 1572 Register Src2 = MI.getOperand(4).getReg(); 1573 1574 if (MRI.getRegBankOrNull(Src0) == &AMDGPU::VGPRRegBank) 1575 return true; 1576 1577 bool IsUnsigned = MI.getOpcode() == AMDGPU::G_AMDGPU_MAD_U64_U32; 1578 LLT S1 = LLT::scalar(1); 1579 LLT S32 = LLT::scalar(32); 1580 1581 bool DstOnValu = MRI.getRegBankOrNull(Src2) == &AMDGPU::VGPRRegBank; 1582 bool Accumulate = true; 1583 1584 if (!DstOnValu) { 1585 if (mi_match(Src2, MRI, m_ZeroInt())) 1586 Accumulate = false; 1587 } 1588 1589 // Keep the multiplication on the SALU. 1590 MachineIRBuilder B(MI); 1591 1592 Register DstHi; 1593 Register DstLo = B.buildMul(S32, Src0, Src1).getReg(0); 1594 bool MulHiInVgpr = false; 1595 1596 MRI.setRegBank(DstLo, AMDGPU::SGPRRegBank); 1597 1598 if (Subtarget.hasSMulHi()) { 1599 DstHi = IsUnsigned ? B.buildUMulH(S32, Src0, Src1).getReg(0) 1600 : B.buildSMulH(S32, Src0, Src1).getReg(0); 1601 MRI.setRegBank(DstHi, AMDGPU::SGPRRegBank); 1602 } else { 1603 Register VSrc0 = B.buildCopy(S32, Src0).getReg(0); 1604 Register VSrc1 = B.buildCopy(S32, Src1).getReg(0); 1605 1606 MRI.setRegBank(VSrc0, AMDGPU::VGPRRegBank); 1607 MRI.setRegBank(VSrc1, AMDGPU::VGPRRegBank); 1608 1609 DstHi = IsUnsigned ? B.buildUMulH(S32, VSrc0, VSrc1).getReg(0) 1610 : B.buildSMulH(S32, VSrc0, VSrc1).getReg(0); 1611 MRI.setRegBank(DstHi, AMDGPU::VGPRRegBank); 1612 1613 if (!DstOnValu) { 1614 DstHi = buildReadFirstLane(B, MRI, DstHi); 1615 } else { 1616 MulHiInVgpr = true; 1617 } 1618 } 1619 1620 // Accumulate and produce the "carry-out" bit. 1621 // 1622 // The "carry-out" is defined as bit 64 of the result when computed as a 1623 // big integer. For unsigned multiply-add, this matches the usual definition 1624 // of carry-out. For signed multiply-add, bit 64 is the sign bit of the 1625 // result, which is determined as: 1626 // sign(Src0 * Src1) + sign(Src2) + carry-out from unsigned 64-bit add 1627 LLT CarryType = DstOnValu ? S1 : S32; 1628 const RegisterBank &CarryBank = 1629 DstOnValu ? AMDGPU::VCCRegBank : AMDGPU::SGPRRegBank; 1630 const RegisterBank &DstBank = 1631 DstOnValu ? AMDGPU::VGPRRegBank : AMDGPU::SGPRRegBank; 1632 Register Carry; 1633 Register Zero; 1634 1635 if (!IsUnsigned) { 1636 Zero = B.buildConstant(S32, 0).getReg(0); 1637 MRI.setRegBank(Zero, 1638 MulHiInVgpr ? AMDGPU::VGPRRegBank : AMDGPU::SGPRRegBank); 1639 1640 Carry = B.buildICmp(CmpInst::ICMP_SLT, MulHiInVgpr ? S1 : S32, DstHi, Zero) 1641 .getReg(0); 1642 MRI.setRegBank(Carry, MulHiInVgpr ? 
AMDGPU::VCCRegBank 1643 : AMDGPU::SGPRRegBank); 1644 1645 if (DstOnValu && !MulHiInVgpr) { 1646 Carry = B.buildTrunc(S1, Carry).getReg(0); 1647 MRI.setRegBank(Carry, AMDGPU::VCCRegBank); 1648 } 1649 } 1650 1651 if (Accumulate) { 1652 if (DstOnValu) { 1653 DstLo = B.buildCopy(S32, DstLo).getReg(0); 1654 DstHi = B.buildCopy(S32, DstHi).getReg(0); 1655 MRI.setRegBank(DstLo, AMDGPU::VGPRRegBank); 1656 MRI.setRegBank(DstHi, AMDGPU::VGPRRegBank); 1657 } 1658 1659 auto Unmerge = B.buildUnmerge(S32, Src2); 1660 Register Src2Lo = Unmerge.getReg(0); 1661 Register Src2Hi = Unmerge.getReg(1); 1662 MRI.setRegBank(Src2Lo, DstBank); 1663 MRI.setRegBank(Src2Hi, DstBank); 1664 1665 if (!IsUnsigned) { 1666 auto Src2Sign = B.buildICmp(CmpInst::ICMP_SLT, CarryType, Src2Hi, Zero); 1667 MRI.setRegBank(Src2Sign.getReg(0), CarryBank); 1668 1669 Carry = B.buildXor(CarryType, Carry, Src2Sign).getReg(0); 1670 MRI.setRegBank(Carry, CarryBank); 1671 } 1672 1673 auto AddLo = B.buildUAddo(S32, CarryType, DstLo, Src2Lo); 1674 DstLo = AddLo.getReg(0); 1675 Register CarryLo = AddLo.getReg(1); 1676 MRI.setRegBank(DstLo, DstBank); 1677 MRI.setRegBank(CarryLo, CarryBank); 1678 1679 auto AddHi = B.buildUAdde(S32, CarryType, DstHi, Src2Hi, CarryLo); 1680 DstHi = AddHi.getReg(0); 1681 MRI.setRegBank(DstHi, DstBank); 1682 1683 Register CarryHi = AddHi.getReg(1); 1684 MRI.setRegBank(CarryHi, CarryBank); 1685 1686 if (IsUnsigned) { 1687 Carry = CarryHi; 1688 } else { 1689 Carry = B.buildXor(CarryType, Carry, CarryHi).getReg(0); 1690 MRI.setRegBank(Carry, CarryBank); 1691 } 1692 } else { 1693 if (IsUnsigned) { 1694 Carry = B.buildConstant(CarryType, 0).getReg(0); 1695 MRI.setRegBank(Carry, CarryBank); 1696 } 1697 } 1698 1699 B.buildMergeLikeInstr(Dst0, {DstLo, DstHi}); 1700 1701 if (DstOnValu) { 1702 B.buildCopy(Dst1, Carry); 1703 } else { 1704 B.buildTrunc(Dst1, Carry); 1705 } 1706 1707 MI.eraseFromParent(); 1708 return true; 1709 } 1710 1711 // Return a suitable opcode for extending the operands of Opc when widening. 1712 static unsigned getExtendOp(unsigned Opc) { 1713 switch (Opc) { 1714 case TargetOpcode::G_ASHR: 1715 case TargetOpcode::G_SMIN: 1716 case TargetOpcode::G_SMAX: 1717 return TargetOpcode::G_SEXT; 1718 case TargetOpcode::G_LSHR: 1719 case TargetOpcode::G_UMIN: 1720 case TargetOpcode::G_UMAX: 1721 return TargetOpcode::G_ZEXT; 1722 default: 1723 return TargetOpcode::G_ANYEXT; 1724 } 1725 } 1726 1727 // Emit a legalized extension from <2 x s16> to 2 32-bit components, avoiding 1728 // any illegal vector extend or unmerge operations. 1729 static std::pair<Register, Register> 1730 unpackV2S16ToS32(MachineIRBuilder &B, Register Src, unsigned ExtOpcode) { 1731 const LLT S32 = LLT::scalar(32); 1732 auto Bitcast = B.buildBitcast(S32, Src); 1733 1734 if (ExtOpcode == TargetOpcode::G_SEXT) { 1735 auto ExtLo = B.buildSExtInReg(S32, Bitcast, 16); 1736 auto ShiftHi = B.buildAShr(S32, Bitcast, B.buildConstant(S32, 16)); 1737 return std::pair(ExtLo.getReg(0), ShiftHi.getReg(0)); 1738 } 1739 1740 auto ShiftHi = B.buildLShr(S32, Bitcast, B.buildConstant(S32, 16)); 1741 if (ExtOpcode == TargetOpcode::G_ZEXT) { 1742 auto ExtLo = B.buildAnd(S32, Bitcast, B.buildConstant(S32, 0xffff)); 1743 return std::pair(ExtLo.getReg(0), ShiftHi.getReg(0)); 1744 } 1745 1746 assert(ExtOpcode == TargetOpcode::G_ANYEXT); 1747 return std::pair(Bitcast.getReg(0), ShiftHi.getReg(0)); 1748 } 1749 1750 // For cases where only a single copy is inserted for matching register banks. 
// Replace the register in the instruction operand.
static bool substituteSimpleCopyRegs(
  const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper, unsigned OpIdx) {
  SmallVector<unsigned, 1> SrcReg(OpdMapper.getVRegs(OpIdx));
  if (!SrcReg.empty()) {
    assert(SrcReg.size() == 1);
    OpdMapper.getMI().getOperand(OpIdx).setReg(SrcReg[0]);
    return true;
  }

  return false;
}

/// Handle register layout difference for f16 images for some subtargets.
Register AMDGPURegisterBankInfo::handleD16VData(MachineIRBuilder &B,
                                                MachineRegisterInfo &MRI,
                                                Register Reg) const {
  if (!Subtarget.hasUnpackedD16VMem())
    return Reg;

  const LLT S16 = LLT::scalar(16);
  LLT StoreVT = MRI.getType(Reg);
  if (!StoreVT.isVector() || StoreVT.getElementType() != S16)
    return Reg;

  auto Unmerge = B.buildUnmerge(S16, Reg);

  SmallVector<Register, 4> WideRegs;
  for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
    WideRegs.push_back(Unmerge.getReg(I));

  const LLT S32 = LLT::scalar(32);
  int NumElts = StoreVT.getNumElements();

  return B.buildMergeLikeInstr(LLT::fixed_vector(NumElts, S32), WideRegs)
      .getReg(0);
}

static std::pair<Register, unsigned>
getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg) {
  int64_t Const;
  if (mi_match(Reg, MRI, m_ICst(Const)))
    return std::pair(Register(), Const);

  Register Base;
  if (mi_match(Reg, MRI, m_GAdd(m_Reg(Base), m_ICst(Const))))
    return std::pair(Base, Const);

  // TODO: Handle G_OR used for add case
  return std::pair(Reg, 0);
}

std::pair<Register, unsigned>
AMDGPURegisterBankInfo::splitBufferOffsets(MachineIRBuilder &B,
                                           Register OrigOffset) const {
  const unsigned MaxImm = 4095;
  Register BaseReg;
  unsigned ImmOffset;
  const LLT S32 = LLT::scalar(32);

  // TODO: Use AMDGPU::getBaseWithConstantOffset() instead.
  std::tie(BaseReg, ImmOffset) = getBaseWithConstantOffset(*B.getMRI(),
                                                           OrigOffset);

  unsigned C1 = 0;
  if (ImmOffset != 0) {
    // If the immediate value is too big for the immoffset field, keep only the
    // low 12 bits (the value modulo 4096) in the immoffset field, so that the
    // value that is copied/added for the voffset field is a multiple of 4096.
    // That way it stands more chance of being CSEd with the copy/add for
    // another similar load/store.
    // However, do not do that rounding down to a multiple of 4096 if that is a
    // negative number, as it appears to be illegal to have a negative offset
    // in the vgpr, even if adding the immediate offset makes it positive.
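    // Worked example (a sketch): an incoming offset of 8200 splits into
    // Overflow = 8192, which is added into the base register, and
    // ImmOffset = 8, which stays in the immediate field. A negative value
    // such as -2048 is instead moved entirely into the base register,
    // leaving the immediate field 0.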
1825 unsigned Overflow = ImmOffset & ~MaxImm; 1826 ImmOffset -= Overflow; 1827 if ((int32_t)Overflow < 0) { 1828 Overflow += ImmOffset; 1829 ImmOffset = 0; 1830 } 1831 1832 C1 = ImmOffset; 1833 if (Overflow != 0) { 1834 if (!BaseReg) 1835 BaseReg = B.buildConstant(S32, Overflow).getReg(0); 1836 else { 1837 auto OverflowVal = B.buildConstant(S32, Overflow); 1838 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0); 1839 } 1840 } 1841 } 1842 1843 if (!BaseReg) 1844 BaseReg = B.buildConstant(S32, 0).getReg(0); 1845 1846 return {BaseReg, C1}; 1847 } 1848 1849 bool AMDGPURegisterBankInfo::buildVCopy(MachineIRBuilder &B, Register DstReg, 1850 Register SrcReg) const { 1851 MachineRegisterInfo &MRI = *B.getMRI(); 1852 LLT SrcTy = MRI.getType(SrcReg); 1853 if (SrcTy.getSizeInBits() == 32) { 1854 // Use a v_mov_b32 here to make the exec dependency explicit. 1855 B.buildInstr(AMDGPU::V_MOV_B32_e32) 1856 .addDef(DstReg) 1857 .addUse(SrcReg); 1858 return constrainGenericRegister(DstReg, AMDGPU::VGPR_32RegClass, MRI) && 1859 constrainGenericRegister(SrcReg, AMDGPU::SReg_32RegClass, MRI); 1860 } 1861 1862 Register TmpReg0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 1863 Register TmpReg1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 1864 1865 B.buildInstr(AMDGPU::V_MOV_B32_e32) 1866 .addDef(TmpReg0) 1867 .addUse(SrcReg, 0, AMDGPU::sub0); 1868 B.buildInstr(AMDGPU::V_MOV_B32_e32) 1869 .addDef(TmpReg1) 1870 .addUse(SrcReg, 0, AMDGPU::sub1); 1871 B.buildInstr(AMDGPU::REG_SEQUENCE) 1872 .addDef(DstReg) 1873 .addUse(TmpReg0) 1874 .addImm(AMDGPU::sub0) 1875 .addUse(TmpReg1) 1876 .addImm(AMDGPU::sub1); 1877 1878 return constrainGenericRegister(SrcReg, AMDGPU::SReg_64RegClass, MRI) && 1879 constrainGenericRegister(DstReg, AMDGPU::VReg_64RegClass, MRI); 1880 } 1881 1882 /// Utility function for pushing dynamic vector indexes with a constant offset 1883 /// into waterfall loops. 1884 static void reinsertVectorIndexAdd(MachineIRBuilder &B, 1885 MachineInstr &IdxUseInstr, 1886 unsigned OpIdx, 1887 unsigned ConstOffset) { 1888 MachineRegisterInfo &MRI = *B.getMRI(); 1889 const LLT S32 = LLT::scalar(32); 1890 Register WaterfallIdx = IdxUseInstr.getOperand(OpIdx).getReg(); 1891 B.setInsertPt(*IdxUseInstr.getParent(), IdxUseInstr.getIterator()); 1892 1893 auto MaterializedOffset = B.buildConstant(S32, ConstOffset); 1894 1895 auto Add = B.buildAdd(S32, WaterfallIdx, MaterializedOffset); 1896 MRI.setRegBank(MaterializedOffset.getReg(0), AMDGPU::SGPRRegBank); 1897 MRI.setRegBank(Add.getReg(0), AMDGPU::SGPRRegBank); 1898 IdxUseInstr.getOperand(OpIdx).setReg(Add.getReg(0)); 1899 } 1900 1901 /// Implement extending a 32-bit value to a 64-bit value. \p Lo32Reg is the 1902 /// original 32-bit source value (to be inserted in the low part of the combined 1903 /// 64-bit result), and \p Hi32Reg is the high half of the combined 64-bit 1904 /// value. 1905 static void extendLow32IntoHigh32(MachineIRBuilder &B, 1906 Register Hi32Reg, Register Lo32Reg, 1907 unsigned ExtOpc, 1908 const RegisterBank &RegBank, 1909 bool IsBooleanSrc = false) { 1910 if (ExtOpc == AMDGPU::G_ZEXT) { 1911 B.buildConstant(Hi32Reg, 0); 1912 } else if (ExtOpc == AMDGPU::G_SEXT) { 1913 if (IsBooleanSrc) { 1914 // If we know the original source was an s1, the high half is the same as 1915 // the low. 1916 B.buildCopy(Hi32Reg, Lo32Reg); 1917 } else { 1918 // Replicate sign bit from 32-bit extended part. 
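      // i.e. Hi = Lo >>s 31 (a sketch of what the ashr below computes):
      // all ones when the value is negative, zero otherwise.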
1919 auto ShiftAmt = B.buildConstant(LLT::scalar(32), 31); 1920 B.getMRI()->setRegBank(ShiftAmt.getReg(0), RegBank); 1921 B.buildAShr(Hi32Reg, Lo32Reg, ShiftAmt); 1922 } 1923 } else { 1924 assert(ExtOpc == AMDGPU::G_ANYEXT && "not an integer extension"); 1925 B.buildUndef(Hi32Reg); 1926 } 1927 } 1928 1929 bool AMDGPURegisterBankInfo::foldExtractEltToCmpSelect( 1930 MachineInstr &MI, MachineRegisterInfo &MRI, 1931 const OperandsMapper &OpdMapper) const { 1932 1933 Register VecReg = MI.getOperand(1).getReg(); 1934 Register Idx = MI.getOperand(2).getReg(); 1935 1936 const RegisterBank &IdxBank = 1937 *OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank; 1938 1939 bool IsDivergentIdx = IdxBank != AMDGPU::SGPRRegBank; 1940 1941 LLT VecTy = MRI.getType(VecReg); 1942 unsigned EltSize = VecTy.getScalarSizeInBits(); 1943 unsigned NumElem = VecTy.getNumElements(); 1944 1945 if (!SITargetLowering::shouldExpandVectorDynExt(EltSize, NumElem, 1946 IsDivergentIdx, &Subtarget)) 1947 return false; 1948 1949 MachineIRBuilder B(MI); 1950 LLT S32 = LLT::scalar(32); 1951 1952 const RegisterBank &DstBank = 1953 *OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; 1954 const RegisterBank &SrcBank = 1955 *OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank; 1956 1957 const RegisterBank &CCBank = 1958 (DstBank == AMDGPU::SGPRRegBank && 1959 SrcBank == AMDGPU::SGPRRegBank && 1960 IdxBank == AMDGPU::SGPRRegBank) ? AMDGPU::SGPRRegBank 1961 : AMDGPU::VCCRegBank; 1962 LLT CCTy = (CCBank == AMDGPU::SGPRRegBank) ? S32 : LLT::scalar(1); 1963 1964 if (CCBank == AMDGPU::VCCRegBank && IdxBank == AMDGPU::SGPRRegBank) { 1965 Idx = B.buildCopy(S32, Idx)->getOperand(0).getReg(); 1966 MRI.setRegBank(Idx, AMDGPU::VGPRRegBank); 1967 } 1968 1969 LLT EltTy = VecTy.getScalarType(); 1970 SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0)); 1971 unsigned NumLanes = DstRegs.size(); 1972 if (!NumLanes) 1973 NumLanes = 1; 1974 else 1975 EltTy = MRI.getType(DstRegs[0]); 1976 1977 auto UnmergeToEltTy = B.buildUnmerge(EltTy, VecReg); 1978 SmallVector<Register, 2> Res(NumLanes); 1979 for (unsigned L = 0; L < NumLanes; ++L) 1980 Res[L] = UnmergeToEltTy.getReg(L); 1981 1982 for (unsigned I = 1; I < NumElem; ++I) { 1983 auto IC = B.buildConstant(S32, I); 1984 MRI.setRegBank(IC->getOperand(0).getReg(), AMDGPU::SGPRRegBank); 1985 auto Cmp = B.buildICmp(CmpInst::ICMP_EQ, CCTy, Idx, IC); 1986 MRI.setRegBank(Cmp->getOperand(0).getReg(), CCBank); 1987 1988 for (unsigned L = 0; L < NumLanes; ++L) { 1989 auto S = B.buildSelect(EltTy, Cmp, 1990 UnmergeToEltTy.getReg(I * NumLanes + L), Res[L]); 1991 1992 for (unsigned N : { 0, 2, 3 }) 1993 MRI.setRegBank(S->getOperand(N).getReg(), DstBank); 1994 1995 Res[L] = S->getOperand(0).getReg(); 1996 } 1997 } 1998 1999 for (unsigned L = 0; L < NumLanes; ++L) { 2000 Register DstReg = (NumLanes == 1) ? MI.getOperand(0).getReg() : DstRegs[L]; 2001 B.buildCopy(DstReg, Res[L]); 2002 MRI.setRegBank(DstReg, DstBank); 2003 } 2004 2005 MRI.setRegBank(MI.getOperand(0).getReg(), DstBank); 2006 MI.eraseFromParent(); 2007 2008 return true; 2009 } 2010 2011 // Insert a cross regbank copy for a register if it already has a bank that 2012 // differs from the one we want to set. 
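// For example (a sketch): if Reg is already VGPR-banked and the VCC bank is
// requested, a fresh VCC-banked copy is created and returned; if Reg has no
// bank assigned yet, it is simply given the requested bank in place.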
2013 static Register constrainRegToBank(MachineRegisterInfo &MRI, 2014 MachineIRBuilder &B, Register &Reg, 2015 const RegisterBank &Bank) { 2016 const RegisterBank *CurrBank = MRI.getRegBankOrNull(Reg); 2017 if (CurrBank && *CurrBank != Bank) { 2018 Register Copy = B.buildCopy(MRI.getType(Reg), Reg).getReg(0); 2019 MRI.setRegBank(Copy, Bank); 2020 return Copy; 2021 } 2022 2023 MRI.setRegBank(Reg, Bank); 2024 return Reg; 2025 } 2026 2027 bool AMDGPURegisterBankInfo::foldInsertEltToCmpSelect( 2028 MachineInstr &MI, MachineRegisterInfo &MRI, 2029 const OperandsMapper &OpdMapper) const { 2030 2031 Register VecReg = MI.getOperand(1).getReg(); 2032 Register Idx = MI.getOperand(3).getReg(); 2033 2034 const RegisterBank &IdxBank = 2035 *OpdMapper.getInstrMapping().getOperandMapping(3).BreakDown[0].RegBank; 2036 2037 bool IsDivergentIdx = IdxBank != AMDGPU::SGPRRegBank; 2038 2039 LLT VecTy = MRI.getType(VecReg); 2040 unsigned EltSize = VecTy.getScalarSizeInBits(); 2041 unsigned NumElem = VecTy.getNumElements(); 2042 2043 if (!SITargetLowering::shouldExpandVectorDynExt(EltSize, NumElem, 2044 IsDivergentIdx, &Subtarget)) 2045 return false; 2046 2047 MachineIRBuilder B(MI); 2048 LLT S32 = LLT::scalar(32); 2049 2050 const RegisterBank &DstBank = 2051 *OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; 2052 const RegisterBank &SrcBank = 2053 *OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank; 2054 const RegisterBank &InsBank = 2055 *OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank; 2056 2057 const RegisterBank &CCBank = 2058 (DstBank == AMDGPU::SGPRRegBank && 2059 SrcBank == AMDGPU::SGPRRegBank && 2060 InsBank == AMDGPU::SGPRRegBank && 2061 IdxBank == AMDGPU::SGPRRegBank) ? AMDGPU::SGPRRegBank 2062 : AMDGPU::VCCRegBank; 2063 LLT CCTy = (CCBank == AMDGPU::SGPRRegBank) ? 
S32 : LLT::scalar(1); 2064 2065 if (CCBank == AMDGPU::VCCRegBank && IdxBank == AMDGPU::SGPRRegBank) { 2066 Idx = B.buildCopy(S32, Idx)->getOperand(0).getReg(); 2067 MRI.setRegBank(Idx, AMDGPU::VGPRRegBank); 2068 } 2069 2070 LLT EltTy = VecTy.getScalarType(); 2071 SmallVector<Register, 2> InsRegs(OpdMapper.getVRegs(2)); 2072 unsigned NumLanes = InsRegs.size(); 2073 if (!NumLanes) { 2074 NumLanes = 1; 2075 InsRegs.push_back(MI.getOperand(2).getReg()); 2076 } else { 2077 EltTy = MRI.getType(InsRegs[0]); 2078 } 2079 2080 auto UnmergeToEltTy = B.buildUnmerge(EltTy, VecReg); 2081 SmallVector<Register, 16> Ops(NumElem * NumLanes); 2082 2083 for (unsigned I = 0; I < NumElem; ++I) { 2084 auto IC = B.buildConstant(S32, I); 2085 MRI.setRegBank(IC->getOperand(0).getReg(), AMDGPU::SGPRRegBank); 2086 auto Cmp = B.buildICmp(CmpInst::ICMP_EQ, CCTy, Idx, IC); 2087 MRI.setRegBank(Cmp->getOperand(0).getReg(), CCBank); 2088 2089 for (unsigned L = 0; L < NumLanes; ++L) { 2090 Register Op0 = constrainRegToBank(MRI, B, InsRegs[L], DstBank); 2091 Register Op1 = UnmergeToEltTy.getReg(I * NumLanes + L); 2092 Op1 = constrainRegToBank(MRI, B, Op1, DstBank); 2093 2094 Register Select = B.buildSelect(EltTy, Cmp, Op0, Op1).getReg(0); 2095 MRI.setRegBank(Select, DstBank); 2096 2097 Ops[I * NumLanes + L] = Select; 2098 } 2099 } 2100 2101 LLT MergeTy = LLT::fixed_vector(Ops.size(), EltTy); 2102 if (MergeTy == MRI.getType(MI.getOperand(0).getReg())) { 2103 B.buildBuildVector(MI.getOperand(0), Ops); 2104 } else { 2105 auto Vec = B.buildBuildVector(MergeTy, Ops); 2106 MRI.setRegBank(Vec->getOperand(0).getReg(), DstBank); 2107 B.buildBitcast(MI.getOperand(0).getReg(), Vec); 2108 } 2109 2110 MRI.setRegBank(MI.getOperand(0).getReg(), DstBank); 2111 MI.eraseFromParent(); 2112 2113 return true; 2114 } 2115 2116 void AMDGPURegisterBankInfo::applyMappingImpl( 2117 const OperandsMapper &OpdMapper) const { 2118 MachineInstr &MI = OpdMapper.getMI(); 2119 unsigned Opc = MI.getOpcode(); 2120 MachineRegisterInfo &MRI = OpdMapper.getMRI(); 2121 switch (Opc) { 2122 case AMDGPU::G_CONSTANT: 2123 case AMDGPU::G_IMPLICIT_DEF: { 2124 Register DstReg = MI.getOperand(0).getReg(); 2125 LLT DstTy = MRI.getType(DstReg); 2126 if (DstTy != LLT::scalar(1)) 2127 break; 2128 2129 const RegisterBank *DstBank = 2130 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; 2131 if (DstBank == &AMDGPU::VCCRegBank) 2132 break; 2133 SmallVector<Register, 1> DefRegs(OpdMapper.getVRegs(0)); 2134 if (DefRegs.empty()) 2135 DefRegs.push_back(DstReg); 2136 2137 MachineIRBuilder B(MI); 2138 B.setInsertPt(*MI.getParent(), ++MI.getIterator()); 2139 2140 Register NewDstReg = MRI.createGenericVirtualRegister(LLT::scalar(32)); 2141 LLVMContext &Ctx = B.getMF().getFunction().getContext(); 2142 2143 MI.getOperand(0).setReg(NewDstReg); 2144 if (Opc != AMDGPU::G_IMPLICIT_DEF) { 2145 uint64_t ConstVal = MI.getOperand(1).getCImm()->getZExtValue(); 2146 MI.getOperand(1).setCImm( 2147 ConstantInt::get(IntegerType::getInt32Ty(Ctx), ConstVal)); 2148 } 2149 2150 MRI.setRegBank(NewDstReg, *DstBank); 2151 B.buildTrunc(DefRegs[0], NewDstReg); 2152 return; 2153 } 2154 case AMDGPU::G_PHI: { 2155 Register DstReg = MI.getOperand(0).getReg(); 2156 LLT DstTy = MRI.getType(DstReg); 2157 if (DstTy != LLT::scalar(1)) 2158 break; 2159 2160 const LLT S32 = LLT::scalar(32); 2161 const RegisterBank *DstBank = 2162 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; 2163 if (DstBank == &AMDGPU::VCCRegBank) { 2164 applyDefaultMapping(OpdMapper); 2165 // The standard 
handling only considers the result register bank for 2166 // phis. For VCC, blindly inserting a copy when the phi is lowered will 2167 // produce an invalid copy. We can only copy with some kind of compare to 2168 // get a vector boolean result. Insert a register bank copy that will be 2169 // correctly lowered to a compare. 2170 MachineIRBuilder B(*MI.getParent()->getParent()); 2171 2172 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) { 2173 Register SrcReg = MI.getOperand(I).getReg(); 2174 const RegisterBank *SrcBank = getRegBank(SrcReg, MRI, *TRI); 2175 2176 if (SrcBank != &AMDGPU::VCCRegBank) { 2177 MachineBasicBlock *SrcMBB = MI.getOperand(I + 1).getMBB(); 2178 B.setInsertPt(*SrcMBB, SrcMBB->getFirstTerminator()); 2179 2180 auto Copy = B.buildCopy(LLT::scalar(1), SrcReg); 2181 MRI.setRegBank(Copy.getReg(0), AMDGPU::VCCRegBank); 2182 MI.getOperand(I).setReg(Copy.getReg(0)); 2183 } 2184 } 2185 2186 return; 2187 } 2188 2189 // Phi handling is strange and only considers the bank of the destination. 2190 substituteSimpleCopyRegs(OpdMapper, 0); 2191 2192 // Promote SGPR/VGPR booleans to s32 2193 MachineFunction *MF = MI.getParent()->getParent(); 2194 ApplyRegBankMapping ApplyBank(*this, MRI, DstBank); 2195 MachineIRBuilder B(MI, ApplyBank); 2196 LegalizerHelper Helper(*MF, ApplyBank, B); 2197 2198 if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized) 2199 llvm_unreachable("widen scalar should have succeeded"); 2200 2201 return; 2202 } 2203 case AMDGPU::G_ICMP: 2204 case AMDGPU::G_UADDO: 2205 case AMDGPU::G_USUBO: 2206 case AMDGPU::G_UADDE: 2207 case AMDGPU::G_SADDE: 2208 case AMDGPU::G_USUBE: 2209 case AMDGPU::G_SSUBE: { 2210 unsigned BoolDstOp = Opc == AMDGPU::G_ICMP ? 0 : 1; 2211 Register DstReg = MI.getOperand(BoolDstOp).getReg(); 2212 2213 const RegisterBank *DstBank = 2214 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; 2215 if (DstBank != &AMDGPU::SGPRRegBank) 2216 break; 2217 2218 const bool HasCarryIn = MI.getNumOperands() == 5; 2219 2220 // If this is a scalar compare, promote the result to s32, as the selection 2221 // will end up using a copy to a 32-bit vreg. 2222 const LLT S32 = LLT::scalar(32); 2223 Register NewDstReg = MRI.createGenericVirtualRegister(S32); 2224 MRI.setRegBank(NewDstReg, AMDGPU::SGPRRegBank); 2225 MI.getOperand(BoolDstOp).setReg(NewDstReg); 2226 MachineIRBuilder B(MI); 2227 2228 if (HasCarryIn) { 2229 Register NewSrcReg = MRI.createGenericVirtualRegister(S32); 2230 MRI.setRegBank(NewSrcReg, AMDGPU::SGPRRegBank); 2231 B.buildZExt(NewSrcReg, MI.getOperand(4).getReg()); 2232 MI.getOperand(4).setReg(NewSrcReg); 2233 } 2234 2235 MachineBasicBlock *MBB = MI.getParent(); 2236 B.setInsertPt(*MBB, std::next(MI.getIterator())); 2237 2238 // If we had a constrained VCC result register, a copy was inserted to VCC 2239 // from SGPR. 
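    // The emitted sequence is then roughly (a sketch):
    //   %newdst:sgpr(s32) = G_ICMP ...
    //   %olddst:sgpr(s1)  = G_TRUNC %newdst
    // leaving the boolean in a 32-bit SGPR, with the original s1 def rewritten
    // as a trunc of it.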
2240 SmallVector<Register, 1> DefRegs(OpdMapper.getVRegs(0)); 2241 if (DefRegs.empty()) 2242 DefRegs.push_back(DstReg); 2243 B.buildTrunc(DefRegs[0], NewDstReg); 2244 return; 2245 } 2246 case AMDGPU::G_SELECT: { 2247 Register DstReg = MI.getOperand(0).getReg(); 2248 LLT DstTy = MRI.getType(DstReg); 2249 2250 SmallVector<Register, 1> CondRegs(OpdMapper.getVRegs(1)); 2251 if (CondRegs.empty()) 2252 CondRegs.push_back(MI.getOperand(1).getReg()); 2253 else { 2254 assert(CondRegs.size() == 1); 2255 } 2256 2257 const RegisterBank *CondBank = getRegBank(CondRegs[0], MRI, *TRI); 2258 if (CondBank == &AMDGPU::SGPRRegBank) { 2259 MachineIRBuilder B(MI); 2260 const LLT S32 = LLT::scalar(32); 2261 Register NewCondReg = MRI.createGenericVirtualRegister(S32); 2262 MRI.setRegBank(NewCondReg, AMDGPU::SGPRRegBank); 2263 2264 MI.getOperand(1).setReg(NewCondReg); 2265 B.buildZExt(NewCondReg, CondRegs[0]); 2266 } 2267 2268 if (DstTy.getSizeInBits() != 64) 2269 break; 2270 2271 MachineIRBuilder B(MI); 2272 LLT HalfTy = getHalfSizedType(DstTy); 2273 2274 SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0)); 2275 SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2)); 2276 SmallVector<Register, 2> Src2Regs(OpdMapper.getVRegs(3)); 2277 2278 // All inputs are SGPRs, nothing special to do. 2279 if (DefRegs.empty()) { 2280 assert(Src1Regs.empty() && Src2Regs.empty()); 2281 break; 2282 } 2283 2284 if (Src1Regs.empty()) 2285 split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg()); 2286 else { 2287 setRegsToType(MRI, Src1Regs, HalfTy); 2288 } 2289 2290 if (Src2Regs.empty()) 2291 split64BitValueForMapping(B, Src2Regs, HalfTy, MI.getOperand(3).getReg()); 2292 else 2293 setRegsToType(MRI, Src2Regs, HalfTy); 2294 2295 setRegsToType(MRI, DefRegs, HalfTy); 2296 2297 B.buildSelect(DefRegs[0], CondRegs[0], Src1Regs[0], Src2Regs[0]); 2298 B.buildSelect(DefRegs[1], CondRegs[0], Src1Regs[1], Src2Regs[1]); 2299 2300 MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank); 2301 MI.eraseFromParent(); 2302 return; 2303 } 2304 case AMDGPU::G_BRCOND: { 2305 Register CondReg = MI.getOperand(0).getReg(); 2306 // FIXME: Should use legalizer helper, but should change bool ext type. 2307 const RegisterBank *CondBank = 2308 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; 2309 2310 if (CondBank == &AMDGPU::SGPRRegBank) { 2311 MachineIRBuilder B(MI); 2312 const LLT S32 = LLT::scalar(32); 2313 Register NewCondReg = MRI.createGenericVirtualRegister(S32); 2314 MRI.setRegBank(NewCondReg, AMDGPU::SGPRRegBank); 2315 2316 MI.getOperand(0).setReg(NewCondReg); 2317 B.buildZExt(NewCondReg, CondReg); 2318 return; 2319 } 2320 2321 break; 2322 } 2323 case AMDGPU::G_AND: 2324 case AMDGPU::G_OR: 2325 case AMDGPU::G_XOR: { 2326 // 64-bit and is only available on the SALU, so split into 2 32-bit ops if 2327 // there is a VGPR input. 
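    // Rough shape of the split (a sketch; the actual registers come from the
    // operand mapping):
    //   %dst:vgpr(s64) = G_AND %a:vgpr(s64), %b:vgpr(s64)
    // ->
    //   %d0:vgpr(s32) = G_AND %a0, %b0
    //   %d1:vgpr(s32) = G_AND %a1, %b1
    // where (%a0, %a1) and (%b0, %b1) are the 32-bit halves of the inputs.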
2328 Register DstReg = MI.getOperand(0).getReg(); 2329 LLT DstTy = MRI.getType(DstReg); 2330 2331 if (DstTy.getSizeInBits() == 1) { 2332 const RegisterBank *DstBank = 2333 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; 2334 if (DstBank == &AMDGPU::VCCRegBank) 2335 break; 2336 2337 MachineFunction *MF = MI.getParent()->getParent(); 2338 ApplyRegBankMapping ApplyBank(*this, MRI, DstBank); 2339 MachineIRBuilder B(MI, ApplyBank); 2340 LegalizerHelper Helper(*MF, ApplyBank, B); 2341 2342 if (Helper.widenScalar(MI, 0, LLT::scalar(32)) != 2343 LegalizerHelper::Legalized) 2344 llvm_unreachable("widen scalar should have succeeded"); 2345 return; 2346 } 2347 2348 if (DstTy.getSizeInBits() != 64) 2349 break; 2350 2351 LLT HalfTy = getHalfSizedType(DstTy); 2352 SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0)); 2353 SmallVector<Register, 2> Src0Regs(OpdMapper.getVRegs(1)); 2354 SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2)); 2355 2356 // All inputs are SGPRs, nothing special to do. 2357 if (DefRegs.empty()) { 2358 assert(Src0Regs.empty() && Src1Regs.empty()); 2359 break; 2360 } 2361 2362 assert(DefRegs.size() == 2); 2363 assert(Src0Regs.size() == Src1Regs.size() && 2364 (Src0Regs.empty() || Src0Regs.size() == 2)); 2365 2366 // Depending on where the source registers came from, the generic code may 2367 // have decided to split the inputs already or not. If not, we still need to 2368 // extract the values. 2369 MachineIRBuilder B(MI); 2370 2371 if (Src0Regs.empty()) 2372 split64BitValueForMapping(B, Src0Regs, HalfTy, MI.getOperand(1).getReg()); 2373 else 2374 setRegsToType(MRI, Src0Regs, HalfTy); 2375 2376 if (Src1Regs.empty()) 2377 split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg()); 2378 else 2379 setRegsToType(MRI, Src1Regs, HalfTy); 2380 2381 setRegsToType(MRI, DefRegs, HalfTy); 2382 2383 B.buildInstr(Opc, {DefRegs[0]}, {Src0Regs[0], Src1Regs[0]}); 2384 B.buildInstr(Opc, {DefRegs[1]}, {Src0Regs[1], Src1Regs[1]}); 2385 2386 MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank); 2387 MI.eraseFromParent(); 2388 return; 2389 } 2390 case AMDGPU::G_ABS: { 2391 Register SrcReg = MI.getOperand(1).getReg(); 2392 const RegisterBank *SrcBank = MRI.getRegBankOrNull(SrcReg); 2393 2394 // There is no VALU abs instruction so we need to replace it with a sub and 2395 // max combination. 2396 if (SrcBank && SrcBank == &AMDGPU::VGPRRegBank) { 2397 MachineFunction *MF = MI.getParent()->getParent(); 2398 ApplyRegBankMapping Apply(*this, MRI, &AMDGPU::VGPRRegBank); 2399 MachineIRBuilder B(MI, Apply); 2400 LegalizerHelper Helper(*MF, Apply, B); 2401 2402 if (Helper.lowerAbsToMaxNeg(MI) != LegalizerHelper::Legalized) 2403 llvm_unreachable("lowerAbsToMaxNeg should have succeeded"); 2404 return; 2405 } 2406 [[fallthrough]]; 2407 } 2408 case AMDGPU::G_ADD: 2409 case AMDGPU::G_SUB: 2410 case AMDGPU::G_MUL: 2411 case AMDGPU::G_SHL: 2412 case AMDGPU::G_LSHR: 2413 case AMDGPU::G_ASHR: 2414 case AMDGPU::G_SMIN: 2415 case AMDGPU::G_SMAX: 2416 case AMDGPU::G_UMIN: 2417 case AMDGPU::G_UMAX: { 2418 Register DstReg = MI.getOperand(0).getReg(); 2419 LLT DstTy = MRI.getType(DstReg); 2420 2421 // 16-bit operations are VALU only, but can be promoted to 32-bit SALU. 2422 // Packed 16-bit operations need to be scalarized and promoted. 
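    // E.g. an SGPR <2 x s16> add is handled roughly as (a sketch):
    //   (lo0, hi0) = unpack %src0;  (lo1, hi1) = unpack %src1
    //   %lo:sgpr(s32) = G_ADD %lo0, %lo1
    //   %hi:sgpr(s32) = G_ADD %hi0, %hi1
    //   %dst = G_BUILD_VECTOR_TRUNC %lo, %hi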
2423 if (DstTy != LLT::scalar(16) && DstTy != LLT::fixed_vector(2, 16)) 2424 break; 2425 2426 const RegisterBank *DstBank = 2427 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; 2428 if (DstBank == &AMDGPU::VGPRRegBank) 2429 break; 2430 2431 const LLT S32 = LLT::scalar(32); 2432 MachineBasicBlock *MBB = MI.getParent(); 2433 MachineFunction *MF = MBB->getParent(); 2434 ApplyRegBankMapping ApplySALU(*this, MRI, &AMDGPU::SGPRRegBank); 2435 MachineIRBuilder B(MI, ApplySALU); 2436 2437 if (DstTy.isVector()) { 2438 Register WideSrc0Lo, WideSrc0Hi; 2439 Register WideSrc1Lo, WideSrc1Hi; 2440 2441 unsigned ExtendOp = getExtendOp(MI.getOpcode()); 2442 std::tie(WideSrc0Lo, WideSrc0Hi) 2443 = unpackV2S16ToS32(B, MI.getOperand(1).getReg(), ExtendOp); 2444 std::tie(WideSrc1Lo, WideSrc1Hi) 2445 = unpackV2S16ToS32(B, MI.getOperand(2).getReg(), ExtendOp); 2446 auto Lo = B.buildInstr(MI.getOpcode(), {S32}, {WideSrc0Lo, WideSrc1Lo}); 2447 auto Hi = B.buildInstr(MI.getOpcode(), {S32}, {WideSrc0Hi, WideSrc1Hi}); 2448 B.buildBuildVectorTrunc(DstReg, {Lo.getReg(0), Hi.getReg(0)}); 2449 MI.eraseFromParent(); 2450 } else { 2451 LegalizerHelper Helper(*MF, ApplySALU, B); 2452 2453 if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized) 2454 llvm_unreachable("widen scalar should have succeeded"); 2455 2456 // FIXME: s16 shift amounts should be legal. 2457 if (Opc == AMDGPU::G_SHL || Opc == AMDGPU::G_LSHR || 2458 Opc == AMDGPU::G_ASHR) { 2459 B.setInsertPt(*MBB, MI.getIterator()); 2460 if (Helper.widenScalar(MI, 1, S32) != LegalizerHelper::Legalized) 2461 llvm_unreachable("widen scalar should have succeeded"); 2462 } 2463 } 2464 2465 return; 2466 } 2467 case AMDGPU::G_SEXT_INREG: { 2468 SmallVector<Register, 2> SrcRegs(OpdMapper.getVRegs(1)); 2469 if (SrcRegs.empty()) 2470 break; // Nothing to repair 2471 2472 const LLT S32 = LLT::scalar(32); 2473 MachineIRBuilder B(MI); 2474 ApplyRegBankMapping O(*this, MRI, &AMDGPU::VGPRRegBank); 2475 GISelObserverWrapper Observer(&O); 2476 B.setChangeObserver(Observer); 2477 2478 // Don't use LegalizerHelper's narrowScalar. It produces unwanted G_SEXTs 2479 // we would need to further expand, and doesn't let us directly set the 2480 // result registers. 2481 SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0)); 2482 2483 int Amt = MI.getOperand(2).getImm(); 2484 if (Amt <= 32) { 2485 // Downstream users have expectations for the high bit behavior, so freeze 2486 // incoming undefined bits. 2487 if (Amt == 32) { 2488 // The low bits are unchanged. 2489 B.buildFreeze(DstRegs[0], SrcRegs[0]); 2490 } else { 2491 auto Freeze = B.buildFreeze(S32, SrcRegs[0]); 2492 // Extend in the low bits and propagate the sign bit to the high half. 2493 B.buildSExtInReg(DstRegs[0], Freeze, Amt); 2494 } 2495 2496 B.buildAShr(DstRegs[1], DstRegs[0], B.buildConstant(S32, 31)); 2497 } else { 2498 // The low bits are unchanged, and extend in the high bits. 
      // No freeze required
      B.buildCopy(DstRegs[0], SrcRegs[0]);
      B.buildSExtInReg(DstRegs[1], DstRegs[0], Amt - 32);
    }

    Register DstReg = MI.getOperand(0).getReg();
    MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
    MI.eraseFromParent();
    return;
  }
  case AMDGPU::G_CTPOP:
  case AMDGPU::G_BITREVERSE: {
    const RegisterBank *DstBank =
      OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
    if (DstBank == &AMDGPU::SGPRRegBank)
      break;

    Register SrcReg = MI.getOperand(1).getReg();
    const LLT S32 = LLT::scalar(32);
    LLT Ty = MRI.getType(SrcReg);
    if (Ty == S32)
      break;

    ApplyRegBankMapping ApplyVALU(*this, MRI, &AMDGPU::VGPRRegBank);
    MachineIRBuilder B(MI, ApplyVALU);

    MachineFunction &MF = B.getMF();
    LegalizerHelper Helper(MF, ApplyVALU, B);

    if (Helper.narrowScalar(MI, 1, S32) != LegalizerHelper::Legalized)
      llvm_unreachable("narrowScalar should have succeeded");
    return;
  }
  case AMDGPU::G_AMDGPU_FFBH_U32:
  case AMDGPU::G_AMDGPU_FFBL_B32:
  case AMDGPU::G_CTLZ_ZERO_UNDEF:
  case AMDGPU::G_CTTZ_ZERO_UNDEF: {
    const RegisterBank *DstBank =
      OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
    if (DstBank == &AMDGPU::SGPRRegBank)
      break;

    Register SrcReg = MI.getOperand(1).getReg();
    const LLT S32 = LLT::scalar(32);
    LLT Ty = MRI.getType(SrcReg);
    if (Ty == S32)
      break;

    // We can narrow this more efficiently than Helper can by using ffbh/ffbl
    // which return -1 when the input is zero:
    // (ctlz_zero_undef hi:lo) -> (umin (ffbh hi), (add (ffbh lo), 32))
    // (cttz_zero_undef hi:lo) -> (umin (add (ffbl hi), 32), (ffbl lo))
    // (ffbh hi:lo) -> (umin (ffbh hi), (uaddsat (ffbh lo), 32))
    // (ffbl hi:lo) -> (umin (uaddsat (ffbl hi), 32), (ffbl lo))
    ApplyRegBankMapping ApplyVALU(*this, MRI, &AMDGPU::VGPRRegBank);
    MachineIRBuilder B(MI, ApplyVALU);
    SmallVector<Register, 2> SrcRegs(OpdMapper.getVRegs(1));
    unsigned NewOpc = Opc == AMDGPU::G_CTLZ_ZERO_UNDEF
                          ? (unsigned)AMDGPU::G_AMDGPU_FFBH_U32
                          : Opc == AMDGPU::G_CTTZ_ZERO_UNDEF
                                ? (unsigned)AMDGPU::G_AMDGPU_FFBL_B32
                                : Opc;
    unsigned Idx = NewOpc == AMDGPU::G_AMDGPU_FFBH_U32;
    auto X = B.buildInstr(NewOpc, {S32}, {SrcRegs[Idx]});
    auto Y = B.buildInstr(NewOpc, {S32}, {SrcRegs[Idx ^ 1]});
    unsigned AddOpc =
        Opc == AMDGPU::G_CTLZ_ZERO_UNDEF || Opc == AMDGPU::G_CTTZ_ZERO_UNDEF
            ? AMDGPU::G_ADD
            : AMDGPU::G_UADDSAT;
    Y = B.buildInstr(AddOpc, {S32}, {Y, B.buildConstant(S32, 32)});
    Register DstReg = MI.getOperand(0).getReg();
    B.buildUMin(DstReg, X, Y);
    MI.eraseFromParent();
    return;
  }
  case AMDGPU::G_SEXT:
  case AMDGPU::G_ZEXT:
  case AMDGPU::G_ANYEXT: {
    Register SrcReg = MI.getOperand(1).getReg();
    LLT SrcTy = MRI.getType(SrcReg);
    const bool Signed = Opc == AMDGPU::G_SEXT;

    assert(OpdMapper.getVRegs(1).empty());

    MachineIRBuilder B(MI);
    const RegisterBank *SrcBank =
      OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;

    Register DstReg = MI.getOperand(0).getReg();
    LLT DstTy = MRI.getType(DstReg);
    if (DstTy.isScalar() &&
        SrcBank != &AMDGPU::SGPRRegBank &&
        SrcBank != &AMDGPU::VCCRegBank &&
        // FIXME: Should handle any type that rounds to s64 when irregular
        // breakdowns are supported.
        DstTy.getSizeInBits() == 64 &&
        SrcTy.getSizeInBits() <= 32) {
      SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));

      // Extend to 32-bit, and then extend the low half.
      if (Signed) {
        // TODO: Should really be buildSExtOrCopy
        B.buildSExtOrTrunc(DefRegs[0], SrcReg);
      } else if (Opc == AMDGPU::G_ZEXT) {
        B.buildZExtOrTrunc(DefRegs[0], SrcReg);
      } else {
        B.buildAnyExtOrTrunc(DefRegs[0], SrcReg);
      }

      extendLow32IntoHigh32(B, DefRegs[1], DefRegs[0], Opc, *SrcBank);
      MRI.setRegBank(DstReg, *SrcBank);
      MI.eraseFromParent();
      return;
    }

    if (SrcTy != LLT::scalar(1))
      return;

    // It is not legal to have a legalization artifact with a VCC source.
    // Rather than introducing a copy, directly emit the select that such a
    // copy would have been selected to.
    if (SrcBank == &AMDGPU::VCCRegBank) {
      SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));

      const RegisterBank *DstBank = &AMDGPU::VGPRRegBank;

      unsigned DstSize = DstTy.getSizeInBits();
      // 64-bit select is SGPR only
      const bool UseSel64 = DstSize > 32 &&
        SrcBank->getID() == AMDGPU::SGPRRegBankID;

      // TODO: Should s16 select be legal?
      LLT SelType = UseSel64 ? LLT::scalar(64) : LLT::scalar(32);
      auto True = B.buildConstant(SelType, Signed ? -1 : 1);
      auto False = B.buildConstant(SelType, 0);

      MRI.setRegBank(True.getReg(0), *DstBank);
      MRI.setRegBank(False.getReg(0), *DstBank);
      MRI.setRegBank(DstReg, *DstBank);

      if (DstSize > 32) {
        B.buildSelect(DefRegs[0], SrcReg, True, False);
        extendLow32IntoHigh32(B, DefRegs[1], DefRegs[0], Opc, *SrcBank, true);
      } else if (DstSize < 32) {
        auto Sel = B.buildSelect(SelType, SrcReg, True, False);
        MRI.setRegBank(Sel.getReg(0), *DstBank);
        B.buildTrunc(DstReg, Sel);
      } else {
        B.buildSelect(DstReg, SrcReg, True, False);
      }

      MI.eraseFromParent();
      return;
    }

    break;
  }
  case AMDGPU::G_EXTRACT_VECTOR_ELT: {
    SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0));

    assert(OpdMapper.getVRegs(1).empty() && OpdMapper.getVRegs(2).empty());

    Register DstReg = MI.getOperand(0).getReg();
    Register SrcReg = MI.getOperand(1).getReg();

    const LLT S32 = LLT::scalar(32);
    LLT DstTy = MRI.getType(DstReg);
    LLT SrcTy = MRI.getType(SrcReg);

    if (foldExtractEltToCmpSelect(MI, MRI, OpdMapper))
      return;

    MachineIRBuilder B(MI);

    const ValueMapping &DstMapping
      = OpdMapper.getInstrMapping().getOperandMapping(0);
    const RegisterBank *DstBank = DstMapping.BreakDown[0].RegBank;
    const RegisterBank *SrcBank =
      OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
    const RegisterBank *IdxBank =
        OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;

    Register BaseIdxReg;
    unsigned ConstOffset;
    std::tie(BaseIdxReg, ConstOffset) =
        AMDGPU::getBaseWithConstantOffset(MRI, MI.getOperand(2).getReg());

    // See if the index is an add of a constant which will be foldable by moving
    // the base register of the index later if this is going to be executed in a
    // waterfall loop. This is essentially to reassociate the add of a constant
    // with the readfirstlane.
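    // E.g. with an index %idx = G_ADD %base, 4 (a sketch), the waterfall loop
    // can run on %base alone, and the "+ 4" is re-materialized inside the loop
    // next to the use of the lane-uniform index.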
2690 bool ShouldMoveIndexIntoLoop = IdxBank != &AMDGPU::SGPRRegBank && 2691 ConstOffset > 0 && 2692 ConstOffset < SrcTy.getNumElements(); 2693 2694 // Move the base register. We'll re-insert the add later. 2695 if (ShouldMoveIndexIntoLoop) 2696 MI.getOperand(2).setReg(BaseIdxReg); 2697 2698 // If this is a VGPR result only because the index was a VGPR result, the 2699 // actual indexing will be done on the SGPR source vector, which will 2700 // produce a scalar result. We need to copy to the VGPR result inside the 2701 // waterfall loop. 2702 const bool NeedCopyToVGPR = DstBank == &AMDGPU::VGPRRegBank && 2703 SrcBank == &AMDGPU::SGPRRegBank; 2704 if (DstRegs.empty()) { 2705 applyDefaultMapping(OpdMapper); 2706 2707 executeInWaterfallLoop(MI, MRI, { 2 }); 2708 2709 if (NeedCopyToVGPR) { 2710 // We don't want a phi for this temporary reg. 2711 Register TmpReg = MRI.createGenericVirtualRegister(DstTy); 2712 MRI.setRegBank(TmpReg, AMDGPU::SGPRRegBank); 2713 MI.getOperand(0).setReg(TmpReg); 2714 B.setInsertPt(*MI.getParent(), ++MI.getIterator()); 2715 2716 // Use a v_mov_b32 here to make the exec dependency explicit. 2717 buildVCopy(B, DstReg, TmpReg); 2718 } 2719 2720 // Re-insert the constant offset add inside the waterfall loop. 2721 if (ShouldMoveIndexIntoLoop) 2722 reinsertVectorIndexAdd(B, MI, 2, ConstOffset); 2723 2724 return; 2725 } 2726 2727 assert(DstTy.getSizeInBits() == 64); 2728 2729 LLT Vec32 = LLT::fixed_vector(2 * SrcTy.getNumElements(), 32); 2730 2731 auto CastSrc = B.buildBitcast(Vec32, SrcReg); 2732 auto One = B.buildConstant(S32, 1); 2733 2734 MachineBasicBlock::iterator MII = MI.getIterator(); 2735 2736 // Split the vector index into 32-bit pieces. Prepare to move all of the 2737 // new instructions into a waterfall loop if necessary. 2738 // 2739 // Don't put the bitcast or constant in the loop. 2740 MachineInstrSpan Span(MII, &B.getMBB()); 2741 2742 // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1). 2743 auto IdxLo = B.buildShl(S32, BaseIdxReg, One); 2744 auto IdxHi = B.buildAdd(S32, IdxLo, One); 2745 2746 auto Extract0 = B.buildExtractVectorElement(DstRegs[0], CastSrc, IdxLo); 2747 auto Extract1 = B.buildExtractVectorElement(DstRegs[1], CastSrc, IdxHi); 2748 2749 MRI.setRegBank(DstReg, *DstBank); 2750 MRI.setRegBank(CastSrc.getReg(0), *SrcBank); 2751 MRI.setRegBank(One.getReg(0), AMDGPU::SGPRRegBank); 2752 MRI.setRegBank(IdxLo.getReg(0), AMDGPU::SGPRRegBank); 2753 MRI.setRegBank(IdxHi.getReg(0), AMDGPU::SGPRRegBank); 2754 2755 SmallSet<Register, 4> OpsToWaterfall; 2756 if (!collectWaterfallOperands(OpsToWaterfall, MI, MRI, { 2 })) { 2757 MI.eraseFromParent(); 2758 return; 2759 } 2760 2761 // Remove the original instruction to avoid potentially confusing the 2762 // waterfall loop logic. 
2763 B.setInstr(*Span.begin()); 2764 MI.eraseFromParent(); 2765 executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()), 2766 OpsToWaterfall, MRI); 2767 2768 if (NeedCopyToVGPR) { 2769 MachineBasicBlock *LoopBB = Extract1->getParent(); 2770 Register TmpReg0 = MRI.createGenericVirtualRegister(S32); 2771 Register TmpReg1 = MRI.createGenericVirtualRegister(S32); 2772 MRI.setRegBank(TmpReg0, AMDGPU::SGPRRegBank); 2773 MRI.setRegBank(TmpReg1, AMDGPU::SGPRRegBank); 2774 2775 Extract0->getOperand(0).setReg(TmpReg0); 2776 Extract1->getOperand(0).setReg(TmpReg1); 2777 2778 B.setInsertPt(*LoopBB, ++Extract1->getIterator()); 2779 2780 buildVCopy(B, DstRegs[0], TmpReg0); 2781 buildVCopy(B, DstRegs[1], TmpReg1); 2782 } 2783 2784 if (ShouldMoveIndexIntoLoop) 2785 reinsertVectorIndexAdd(B, *IdxLo, 1, ConstOffset); 2786 2787 return; 2788 } 2789 case AMDGPU::G_INSERT_VECTOR_ELT: { 2790 SmallVector<Register, 2> InsRegs(OpdMapper.getVRegs(2)); 2791 2792 Register DstReg = MI.getOperand(0).getReg(); 2793 LLT VecTy = MRI.getType(DstReg); 2794 2795 assert(OpdMapper.getVRegs(0).empty()); 2796 assert(OpdMapper.getVRegs(3).empty()); 2797 2798 if (substituteSimpleCopyRegs(OpdMapper, 1)) 2799 MRI.setType(MI.getOperand(1).getReg(), VecTy); 2800 2801 if (foldInsertEltToCmpSelect(MI, MRI, OpdMapper)) 2802 return; 2803 2804 const RegisterBank *IdxBank = 2805 OpdMapper.getInstrMapping().getOperandMapping(3).BreakDown[0].RegBank; 2806 2807 Register SrcReg = MI.getOperand(1).getReg(); 2808 Register InsReg = MI.getOperand(2).getReg(); 2809 LLT InsTy = MRI.getType(InsReg); 2810 (void)InsTy; 2811 2812 Register BaseIdxReg; 2813 unsigned ConstOffset; 2814 std::tie(BaseIdxReg, ConstOffset) = 2815 AMDGPU::getBaseWithConstantOffset(MRI, MI.getOperand(3).getReg()); 2816 2817 // See if the index is an add of a constant which will be foldable by moving 2818 // the base register of the index later if this is going to be executed in a 2819 // waterfall loop. This is essentially to reassociate the add of a constant 2820 // with the readfirstlane. 2821 bool ShouldMoveIndexIntoLoop = IdxBank != &AMDGPU::SGPRRegBank && 2822 ConstOffset > 0 && 2823 ConstOffset < VecTy.getNumElements(); 2824 2825 // Move the base register. We'll re-insert the add later. 2826 if (ShouldMoveIndexIntoLoop) 2827 MI.getOperand(3).setReg(BaseIdxReg); 2828 2829 2830 if (InsRegs.empty()) { 2831 executeInWaterfallLoop(MI, MRI, { 3 }); 2832 2833 // Re-insert the constant offset add inside the waterfall loop. 2834 if (ShouldMoveIndexIntoLoop) { 2835 MachineIRBuilder B(MI); 2836 reinsertVectorIndexAdd(B, MI, 3, ConstOffset); 2837 } 2838 2839 return; 2840 } 2841 2842 2843 assert(InsTy.getSizeInBits() == 64); 2844 2845 const LLT S32 = LLT::scalar(32); 2846 LLT Vec32 = LLT::fixed_vector(2 * VecTy.getNumElements(), 32); 2847 2848 MachineIRBuilder B(MI); 2849 auto CastSrc = B.buildBitcast(Vec32, SrcReg); 2850 auto One = B.buildConstant(S32, 1); 2851 2852 // Split the vector index into 32-bit pieces. Prepare to move all of the 2853 // new instructions into a waterfall loop if necessary. 2854 // 2855 // Don't put the bitcast or constant in the loop. 2856 MachineInstrSpan Span(MachineBasicBlock::iterator(&MI), &B.getMBB()); 2857 2858 // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1). 
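    // E.g. inserting a 64-bit element at index 2 of a <4 x s64> (a sketch)
    // becomes two 32-bit inserts at indices 4 and 5 of the bitcast <8 x s32>.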
    auto IdxLo = B.buildShl(S32, BaseIdxReg, One);
    auto IdxHi = B.buildAdd(S32, IdxLo, One);

    auto InsLo = B.buildInsertVectorElement(Vec32, CastSrc, InsRegs[0], IdxLo);
    auto InsHi = B.buildInsertVectorElement(Vec32, InsLo, InsRegs[1], IdxHi);

    const RegisterBank *DstBank =
      OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
    const RegisterBank *SrcBank =
      OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
    const RegisterBank *InsSrcBank =
      OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;

    MRI.setRegBank(InsReg, *InsSrcBank);
    MRI.setRegBank(CastSrc.getReg(0), *SrcBank);
    MRI.setRegBank(InsLo.getReg(0), *DstBank);
    MRI.setRegBank(InsHi.getReg(0), *DstBank);
    MRI.setRegBank(One.getReg(0), AMDGPU::SGPRRegBank);
    MRI.setRegBank(IdxLo.getReg(0), AMDGPU::SGPRRegBank);
    MRI.setRegBank(IdxHi.getReg(0), AMDGPU::SGPRRegBank);

    SmallSet<Register, 4> OpsToWaterfall;
    if (!collectWaterfallOperands(OpsToWaterfall, MI, MRI, { 3 })) {
      B.setInsertPt(B.getMBB(), MI);
      B.buildBitcast(DstReg, InsHi);
      MI.eraseFromParent();
      return;
    }

    B.setInstr(*Span.begin());
    MI.eraseFromParent();

    // Figure out the point after the waterfall loop before mangling the control
    // flow.
    executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()),
                           OpsToWaterfall, MRI);

    // The insertion point is now right after the original instruction.
    //
    // Keep the bitcast to the original vector type out of the loop. Doing this
    // saves an extra phi we don't need inside the loop.
    B.buildBitcast(DstReg, InsHi);

    // Re-insert the constant offset add inside the waterfall loop.
2904 if (ShouldMoveIndexIntoLoop) 2905 reinsertVectorIndexAdd(B, *IdxLo, 1, ConstOffset); 2906 2907 return; 2908 } 2909 case AMDGPU::G_AMDGPU_BUFFER_LOAD: 2910 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT: 2911 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT: 2912 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE: 2913 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE: 2914 case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT: 2915 case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE: 2916 case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16: 2917 case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT: 2918 case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16: 2919 case AMDGPU::G_AMDGPU_BUFFER_STORE: 2920 case AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE: 2921 case AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT: 2922 case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT: 2923 case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16: 2924 case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT: 2925 case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16: { 2926 applyDefaultMapping(OpdMapper); 2927 executeInWaterfallLoop(MI, MRI, {1, 4}); 2928 return; 2929 } 2930 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP: 2931 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD: 2932 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB: 2933 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN: 2934 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN: 2935 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX: 2936 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX: 2937 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND: 2938 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR: 2939 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR: 2940 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC: 2941 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC: { 2942 applyDefaultMapping(OpdMapper); 2943 executeInWaterfallLoop(MI, MRI, {2, 5}); 2944 return; 2945 } 2946 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD: 2947 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN: 2948 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX: { 2949 applyDefaultMapping(OpdMapper); 2950 executeInWaterfallLoop(MI, MRI, {2, 5}); 2951 return; 2952 } 2953 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP: { 2954 applyDefaultMapping(OpdMapper); 2955 executeInWaterfallLoop(MI, MRI, {3, 6}); 2956 return; 2957 } 2958 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD: { 2959 applyMappingSBufferLoad(OpdMapper); 2960 return; 2961 } 2962 case AMDGPU::G_INTRINSIC: { 2963 switch (MI.getIntrinsicID()) { 2964 case Intrinsic::amdgcn_readlane: { 2965 substituteSimpleCopyRegs(OpdMapper, 2); 2966 2967 assert(OpdMapper.getVRegs(0).empty()); 2968 assert(OpdMapper.getVRegs(3).empty()); 2969 2970 // Make sure the index is an SGPR. It doesn't make sense to run this in a 2971 // waterfall loop, so assume it's a uniform value. 2972 constrainOpWithReadfirstlane(MI, MRI, 3); // Index 2973 return; 2974 } 2975 case Intrinsic::amdgcn_writelane: { 2976 assert(OpdMapper.getVRegs(0).empty()); 2977 assert(OpdMapper.getVRegs(2).empty()); 2978 assert(OpdMapper.getVRegs(3).empty()); 2979 2980 substituteSimpleCopyRegs(OpdMapper, 4); // VGPR input val 2981 constrainOpWithReadfirstlane(MI, MRI, 2); // Source value 2982 constrainOpWithReadfirstlane(MI, MRI, 3); // Index 2983 return; 2984 } 2985 case Intrinsic::amdgcn_interp_p1: 2986 case Intrinsic::amdgcn_interp_p2: 2987 case Intrinsic::amdgcn_interp_mov: 2988 case Intrinsic::amdgcn_interp_p1_f16: 2989 case Intrinsic::amdgcn_interp_p2_f16: 2990 case Intrinsic::amdgcn_lds_param_load: { 2991 applyDefaultMapping(OpdMapper); 2992 2993 // Readlane for m0 value, which is always the last operand. 2994 // FIXME: Should this be a waterfall loop instead? 
      constrainOpWithReadfirstlane(MI, MRI, MI.getNumOperands() - 1); // Index
      return;
    }
    case Intrinsic::amdgcn_interp_inreg_p10:
    case Intrinsic::amdgcn_interp_inreg_p2:
    case Intrinsic::amdgcn_interp_inreg_p10_f16:
    case Intrinsic::amdgcn_interp_inreg_p2_f16:
      applyDefaultMapping(OpdMapper);
      return;
    case Intrinsic::amdgcn_permlane16:
    case Intrinsic::amdgcn_permlanex16: {
      // Doing a waterfall loop over these wouldn't make any sense.
      substituteSimpleCopyRegs(OpdMapper, 2);
      substituteSimpleCopyRegs(OpdMapper, 3);
      constrainOpWithReadfirstlane(MI, MRI, 4);
      constrainOpWithReadfirstlane(MI, MRI, 5);
      return;
    }
    case Intrinsic::amdgcn_sbfe:
      applyMappingBFE(OpdMapper, true);
      return;
    case Intrinsic::amdgcn_ubfe:
      applyMappingBFE(OpdMapper, false);
      return;
    case Intrinsic::amdgcn_ballot:
      // Use default handling and insert a copy to the vcc source.
      break;
    }
    break;
  }
  case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
  case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16:
  case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE:
  case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16: {
    const AMDGPU::RsrcIntrinsic *RSrcIntrin
      = AMDGPU::lookupRsrcIntrinsic(MI.getIntrinsicID());
    assert(RSrcIntrin && RSrcIntrin->IsImage);
    // Non-images can have complications from operands that allow both SGPR
    // and VGPR. For now it's too complicated to figure out the final opcode
    // to derive the register bank from the MCInstrDesc.
    applyMappingImage(MI, OpdMapper, MRI, RSrcIntrin->RsrcArg);
    return;
  }
  case AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY: {
    unsigned N = MI.getNumExplicitOperands() - 2;
    applyDefaultMapping(OpdMapper);
    executeInWaterfallLoop(MI, MRI, { N });
    return;
  }
  case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: {
    auto IntrID = MI.getIntrinsicID();
    switch (IntrID) {
    case Intrinsic::amdgcn_ds_ordered_add:
    case Intrinsic::amdgcn_ds_ordered_swap: {
      // This is only allowed to execute with 1 lane, so readfirstlane is safe.
      assert(OpdMapper.getVRegs(0).empty());
      substituteSimpleCopyRegs(OpdMapper, 3);
      constrainOpWithReadfirstlane(MI, MRI, 2); // M0
      return;
    }
    case Intrinsic::amdgcn_ds_gws_init:
    case Intrinsic::amdgcn_ds_gws_barrier:
    case Intrinsic::amdgcn_ds_gws_sema_br: {
      // Only the first lane is executed, so readfirstlane is safe.
      substituteSimpleCopyRegs(OpdMapper, 1);
      constrainOpWithReadfirstlane(MI, MRI, 2); // M0
      return;
    }
    case Intrinsic::amdgcn_ds_gws_sema_v:
    case Intrinsic::amdgcn_ds_gws_sema_p:
    case Intrinsic::amdgcn_ds_gws_sema_release_all: {
      // Only the first lane is executed, so readfirstlane is safe.
      constrainOpWithReadfirstlane(MI, MRI, 1); // M0
      return;
    }
    case Intrinsic::amdgcn_ds_append:
    case Intrinsic::amdgcn_ds_consume: {
      constrainOpWithReadfirstlane(MI, MRI, 2); // M0
      return;
    }
    case Intrinsic::amdgcn_s_sendmsg:
    case Intrinsic::amdgcn_s_sendmsghalt: {
      // FIXME: Should this use a waterfall loop?
      constrainOpWithReadfirstlane(MI, MRI, 2); // M0
      return;
    }
    case Intrinsic::amdgcn_s_setreg: {
      constrainOpWithReadfirstlane(MI, MRI, 2);
      return;
    }
    case Intrinsic::amdgcn_raw_buffer_load_lds: {
      applyDefaultMapping(OpdMapper);
      constrainOpWithReadfirstlane(MI, MRI, 1); // rsrc
      constrainOpWithReadfirstlane(MI, MRI, 2); // M0
      constrainOpWithReadfirstlane(MI, MRI, 5); // soffset
      return;
    }
    case Intrinsic::amdgcn_struct_buffer_load_lds: {
      applyDefaultMapping(OpdMapper);
      constrainOpWithReadfirstlane(MI, MRI, 1); // rsrc
      constrainOpWithReadfirstlane(MI, MRI, 2); // M0
      constrainOpWithReadfirstlane(MI, MRI, 6); // soffset
      return;
    }
    case Intrinsic::amdgcn_global_load_lds: {
      applyDefaultMapping(OpdMapper);
      constrainOpWithReadfirstlane(MI, MRI, 2);
      return;
    }
    case Intrinsic::amdgcn_lds_direct_load: {
      applyDefaultMapping(OpdMapper);
      // Readlane for m0 value, which is always the last operand.
      constrainOpWithReadfirstlane(MI, MRI, MI.getNumOperands() - 1); // Index
      return;
    }
    case Intrinsic::amdgcn_exp_row:
      applyDefaultMapping(OpdMapper);
      constrainOpWithReadfirstlane(MI, MRI, 8); // M0
      return;
    default: {
      if (const AMDGPU::RsrcIntrinsic *RSrcIntrin =
              AMDGPU::lookupRsrcIntrinsic(IntrID)) {
        // Non-images can have complications from operands that allow both SGPR
        // and VGPR. For now it's too complicated to figure out the final opcode
        // to derive the register bank from the MCInstrDesc.
        if (RSrcIntrin->IsImage) {
          applyMappingImage(MI, OpdMapper, MRI, RSrcIntrin->RsrcArg);
          return;
        }
      }

      break;
    }
    }
    break;
  }
  case AMDGPU::G_SI_CALL: {
    // Use a set to avoid extra readfirstlanes in the case where multiple
    // operands are the same register.
    SmallSet<Register, 4> SGPROperandRegs;

    if (!collectWaterfallOperands(SGPROperandRegs, MI, MRI, {1}))
      break;

    // Move all copies to physical SGPRs that are used by the call instruction
    // into the loop block. Search backwards from the call for these copies
    // until the ADJCALLSTACKUP is reached.
    unsigned FrameSetupOpcode = AMDGPU::ADJCALLSTACKUP;
    unsigned FrameDestroyOpcode = AMDGPU::ADJCALLSTACKDOWN;

    // Move all non-copies before the copies, so that a complete range can be
    // moved into the waterfall loop.
    SmallVector<MachineInstr *, 4> NonCopyInstrs;
    // Count of NonCopyInstrs found until the current LastCopy.
    unsigned NonCopyInstrsLen = 0;
    MachineBasicBlock::iterator Start(&MI);
    MachineBasicBlock::iterator LastCopy = Start;
    MachineBasicBlock *MBB = MI.getParent();
    const SIMachineFunctionInfo *Info =
        MBB->getParent()->getInfo<SIMachineFunctionInfo>();
    while (Start->getOpcode() != FrameSetupOpcode) {
      --Start;
      bool IsCopy = false;
      if (Start->getOpcode() == AMDGPU::COPY) {
        auto &Dst = Start->getOperand(0);
        if (Dst.isReg()) {
          Register Reg = Dst.getReg();
          if (Reg.isPhysical() && MI.readsRegister(Reg, TRI)) {
            IsCopy = true;
          } else {
            // Also move the copy from the scratch rsrc descriptor into the loop
            // to allow it to be optimized away.
3167 auto &Src = Start->getOperand(1); 3168 if (Src.isReg()) { 3169 Reg = Src.getReg(); 3170 IsCopy = Info->getScratchRSrcReg() == Reg; 3171 } 3172 } 3173 } 3174 } 3175 3176 if (IsCopy) { 3177 LastCopy = Start; 3178 NonCopyInstrsLen = NonCopyInstrs.size(); 3179 } else { 3180 NonCopyInstrs.push_back(&*Start); 3181 } 3182 } 3183 NonCopyInstrs.resize(NonCopyInstrsLen); 3184 3185 for (auto *NonCopy : reverse(NonCopyInstrs)) { 3186 MBB->splice(LastCopy, MBB, NonCopy->getIterator()); 3187 } 3188 Start = LastCopy; 3189 3190 // Do the same for copies after the loop 3191 NonCopyInstrs.clear(); 3192 NonCopyInstrsLen = 0; 3193 MachineBasicBlock::iterator End(&MI); 3194 LastCopy = End; 3195 while (End->getOpcode() != FrameDestroyOpcode) { 3196 ++End; 3197 bool IsCopy = false; 3198 if (End->getOpcode() == AMDGPU::COPY) { 3199 auto &Src = End->getOperand(1); 3200 if (Src.isReg()) { 3201 Register Reg = Src.getReg(); 3202 IsCopy = Reg.isPhysical() && MI.modifiesRegister(Reg, TRI); 3203 } 3204 } 3205 3206 if (IsCopy) { 3207 LastCopy = End; 3208 NonCopyInstrsLen = NonCopyInstrs.size(); 3209 } else { 3210 NonCopyInstrs.push_back(&*End); 3211 } 3212 } 3213 NonCopyInstrs.resize(NonCopyInstrsLen); 3214 3215 End = LastCopy; 3216 ++LastCopy; 3217 for (auto *NonCopy : reverse(NonCopyInstrs)) { 3218 MBB->splice(LastCopy, MBB, NonCopy->getIterator()); 3219 } 3220 3221 ++End; 3222 MachineIRBuilder B(*Start); 3223 executeInWaterfallLoop(B, make_range(Start, End), SGPROperandRegs, MRI); 3224 break; 3225 } 3226 case AMDGPU::G_LOAD: 3227 case AMDGPU::G_ZEXTLOAD: 3228 case AMDGPU::G_SEXTLOAD: { 3229 if (applyMappingLoad(MI, OpdMapper, MRI)) 3230 return; 3231 break; 3232 } 3233 case AMDGPU::G_DYN_STACKALLOC: 3234 applyMappingDynStackAlloc(MI, OpdMapper, MRI); 3235 return; 3236 case AMDGPU::G_SBFX: 3237 applyMappingBFE(OpdMapper, /*Signed*/ true); 3238 return; 3239 case AMDGPU::G_UBFX: 3240 applyMappingBFE(OpdMapper, /*Signed*/ false); 3241 return; 3242 case AMDGPU::G_AMDGPU_MAD_U64_U32: 3243 case AMDGPU::G_AMDGPU_MAD_I64_I32: 3244 applyMappingMAD_64_32(OpdMapper); 3245 return; 3246 default: 3247 break; 3248 } 3249 3250 return applyDefaultMapping(OpdMapper); 3251 } 3252 3253 // vgpr, sgpr -> vgpr 3254 // vgpr, agpr -> vgpr 3255 // agpr, agpr -> agpr 3256 // agpr, sgpr -> vgpr 3257 static unsigned regBankUnion(unsigned RB0, unsigned RB1) { 3258 if (RB0 == AMDGPU::InvalidRegBankID) 3259 return RB1; 3260 if (RB1 == AMDGPU::InvalidRegBankID) 3261 return RB0; 3262 3263 if (RB0 == AMDGPU::SGPRRegBankID && RB1 == AMDGPU::SGPRRegBankID) 3264 return AMDGPU::SGPRRegBankID; 3265 3266 if (RB0 == AMDGPU::AGPRRegBankID && RB1 == AMDGPU::AGPRRegBankID) 3267 return AMDGPU::AGPRRegBankID; 3268 3269 return AMDGPU::VGPRRegBankID; 3270 } 3271 3272 static unsigned regBankBoolUnion(unsigned RB0, unsigned RB1) { 3273 if (RB0 == AMDGPU::InvalidRegBankID) 3274 return RB1; 3275 if (RB1 == AMDGPU::InvalidRegBankID) 3276 return RB0; 3277 3278 // vcc, vcc -> vcc 3279 // vcc, sgpr -> vcc 3280 // vcc, vgpr -> vcc 3281 if (RB0 == AMDGPU::VCCRegBankID || RB1 == AMDGPU::VCCRegBankID) 3282 return AMDGPU::VCCRegBankID; 3283 3284 // vcc, vgpr -> vgpr 3285 return regBankUnion(RB0, RB1); 3286 } 3287 3288 unsigned AMDGPURegisterBankInfo::getMappingType(const MachineRegisterInfo &MRI, 3289 const MachineInstr &MI) const { 3290 unsigned RegBank = AMDGPU::InvalidRegBankID; 3291 3292 for (const MachineOperand &MO : MI.operands()) { 3293 if (!MO.isReg()) 3294 continue; 3295 Register Reg = MO.getReg(); 3296 if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) { 
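// Fold this operand's bank into the running union. VGPR absorbs
// everything in regBankUnion above, so once we see it the result is
// final and we can stop scanning.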
3297 RegBank = regBankUnion(RegBank, Bank->getID()); 3298 if (RegBank == AMDGPU::VGPRRegBankID) 3299 break; 3300 } 3301 } 3302 3303 return RegBank; 3304 } 3305 3306 bool AMDGPURegisterBankInfo::isSALUMapping(const MachineInstr &MI) const { 3307 const MachineFunction &MF = *MI.getParent()->getParent(); 3308 const MachineRegisterInfo &MRI = MF.getRegInfo(); 3309 for (const MachineOperand &MO : MI.operands()) { 3310 if (!MO.isReg()) 3311 continue; 3312 Register Reg = MO.getReg(); 3313 if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) { 3314 if (Bank->getID() != AMDGPU::SGPRRegBankID) 3315 return false; 3316 } 3317 } 3318 return true; 3319 } 3320 3321 const RegisterBankInfo::InstructionMapping & 3322 AMDGPURegisterBankInfo::getDefaultMappingSOP(const MachineInstr &MI) const { 3323 const MachineFunction &MF = *MI.getParent()->getParent(); 3324 const MachineRegisterInfo &MRI = MF.getRegInfo(); 3325 SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands()); 3326 3327 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { 3328 const MachineOperand &SrcOp = MI.getOperand(i); 3329 if (!SrcOp.isReg()) 3330 continue; 3331 3332 unsigned Size = getSizeInBits(SrcOp.getReg(), MRI, *TRI); 3333 OpdsMapping[i] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); 3334 } 3335 return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping), 3336 MI.getNumOperands()); 3337 } 3338 3339 const RegisterBankInfo::InstructionMapping & 3340 AMDGPURegisterBankInfo::getDefaultMappingVOP(const MachineInstr &MI) const { 3341 const MachineFunction &MF = *MI.getParent()->getParent(); 3342 const MachineRegisterInfo &MRI = MF.getRegInfo(); 3343 SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands()); 3344 3345 // Even though we technically could use SGPRs, this would require knowledge of 3346 // the constant bus restriction. Force all sources to VGPR (except for VCC). 3347 // 3348 // TODO: Unary ops are trivially OK, so accept SGPRs? 3349 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { 3350 const MachineOperand &Src = MI.getOperand(i); 3351 if (!Src.isReg()) 3352 continue; 3353 3354 unsigned Size = getSizeInBits(Src.getReg(), MRI, *TRI); 3355 unsigned BankID = Size == 1 ? AMDGPU::VCCRegBankID : AMDGPU::VGPRRegBankID; 3356 OpdsMapping[i] = AMDGPU::getValueMapping(BankID, Size); 3357 } 3358 3359 return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping), 3360 MI.getNumOperands()); 3361 } 3362 3363 const RegisterBankInfo::InstructionMapping & 3364 AMDGPURegisterBankInfo::getDefaultMappingAllVGPR(const MachineInstr &MI) const { 3365 const MachineFunction &MF = *MI.getParent()->getParent(); 3366 const MachineRegisterInfo &MRI = MF.getRegInfo(); 3367 SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands()); 3368 3369 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) { 3370 const MachineOperand &Op = MI.getOperand(I); 3371 if (!Op.isReg()) 3372 continue; 3373 3374 unsigned Size = getSizeInBits(Op.getReg(), MRI, *TRI); 3375 OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); 3376 } 3377 3378 return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping), 3379 MI.getNumOperands()); 3380 } 3381 3382 const RegisterBankInfo::InstructionMapping & 3383 AMDGPURegisterBankInfo::getImageMapping(const MachineRegisterInfo &MRI, 3384 const MachineInstr &MI, 3385 int RsrcIdx) const { 3386 // The reported argument index is relative to the IR intrinsic call arguments, 3387 // so we need to shift by the number of defs and the intrinsic ID. 
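// For example (hypothetical operand layout), with one def and the
// intrinsic ID in operand 1, an IR-relative rsrc index of 2 would land
// at machine operand 1 + 1 + 2 = 4.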
3388 RsrcIdx += MI.getNumExplicitDefs() + 1;
3389
3390 const int NumOps = MI.getNumOperands();
3391 SmallVector<const ValueMapping *, 8> OpdsMapping(NumOps);
3392
3393 // TODO: Should packed/unpacked D16 difference be reported here as part of
3394 // the value mapping?
3395 for (int I = 0; I != NumOps; ++I) {
3396 if (!MI.getOperand(I).isReg())
3397 continue;
3398
3399 Register OpReg = MI.getOperand(I).getReg();
3400 // We replace some dead address operands with $noreg.
3401 if (!OpReg)
3402 continue;
3403
3404 unsigned Size = getSizeInBits(OpReg, MRI, *TRI);
3405
3406 // FIXME: Probably need a new intrinsic register bank searchable table to
3407 // handle arbitrary intrinsics easily.
3408 //
3409 // If this has a sampler, it immediately follows rsrc.
3410 const bool MustBeSGPR = I == RsrcIdx || I == RsrcIdx + 1;
3411
3412 if (MustBeSGPR) {
3413 // This must be an SGPR, so we must report whatever it is as legal.
3414 unsigned NewBank = getRegBankID(OpReg, MRI, AMDGPU::SGPRRegBankID);
3415 OpdsMapping[I] = AMDGPU::getValueMapping(NewBank, Size);
3416 } else {
3417 // Some operands must be VGPR, and these are easy to copy to.
3418 OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3419 }
3420 }
3421
3422 return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping), NumOps);
3423 }
3424
3425 /// Return the mapping for a pointer argument.
3426 const RegisterBankInfo::ValueMapping *
3427 AMDGPURegisterBankInfo::getValueMappingForPtr(const MachineRegisterInfo &MRI,
3428 Register PtrReg) const {
3429 LLT PtrTy = MRI.getType(PtrReg);
3430 unsigned Size = PtrTy.getSizeInBits();
3431 if (Subtarget.useFlatForGlobal() ||
3432 !AMDGPU::isFlatGlobalAddrSpace(PtrTy.getAddressSpace()))
3433 return AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3434
3435 // If we're using MUBUF instructions for global memory, an SGPR base register
3436 // is possible. Otherwise this needs to be a VGPR.
3437 const RegisterBank *PtrBank = getRegBank(PtrReg, MRI, *TRI);
3438 return AMDGPU::getValueMapping(PtrBank->getID(), Size);
3439 }
3440
3441 const RegisterBankInfo::InstructionMapping &
3442 AMDGPURegisterBankInfo::getInstrMappingForLoad(const MachineInstr &MI) const {
3443
3444 const MachineFunction &MF = *MI.getParent()->getParent();
3445 const MachineRegisterInfo &MRI = MF.getRegInfo();
3446 SmallVector<const ValueMapping*, 2> OpdsMapping(2);
3447 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
3448 Register PtrReg = MI.getOperand(1).getReg();
3449 LLT PtrTy = MRI.getType(PtrReg);
3450 unsigned AS = PtrTy.getAddressSpace();
3451 unsigned PtrSize = PtrTy.getSizeInBits();
3452
3453 const ValueMapping *ValMapping;
3454 const ValueMapping *PtrMapping;
3455
3456 const RegisterBank *PtrBank = getRegBank(PtrReg, MRI, *TRI);
3457
3458 if (PtrBank == &AMDGPU::SGPRRegBank && AMDGPU::isFlatGlobalAddrSpace(AS)) {
3459 if (isScalarLoadLegal(MI)) {
3460 // We have a uniform instruction, so we want to use an SMRD load.
3461 ValMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
3462 PtrMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize);
3463 } else {
3464 ValMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3465
3466 // If we're using MUBUF instructions for global memory, an SGPR base
3467 // register is possible. Otherwise this needs to be a VGPR.
3468 unsigned PtrBankID = Subtarget.useFlatForGlobal() ?
3469 AMDGPU::VGPRRegBankID : AMDGPU::SGPRRegBankID;
3470
3471 PtrMapping = AMDGPU::getValueMapping(PtrBankID, PtrSize);
3472 }
3473 } else {
3474 ValMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3475 PtrMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize);
3476 }
3477
3478 OpdsMapping[0] = ValMapping;
3479 OpdsMapping[1] = PtrMapping;
3480 const RegisterBankInfo::InstructionMapping &Mapping = getInstructionMapping(
3481 1, 1, getOperandsMapping(OpdsMapping), MI.getNumOperands());
3482 return Mapping;
3483
3484 // FIXME: Do we want to add a mapping for FLAT load, or should we just
3485 // handle that during instruction selection?
3486 }
3487
3488 unsigned
3489 AMDGPURegisterBankInfo::getRegBankID(Register Reg,
3490 const MachineRegisterInfo &MRI,
3491 unsigned Default) const {
3492 const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
3493 return Bank ? Bank->getID() : Default;
3494 }
3495
3496 const RegisterBankInfo::ValueMapping *
3497 AMDGPURegisterBankInfo::getSGPROpMapping(Register Reg,
3498 const MachineRegisterInfo &MRI,
3499 const TargetRegisterInfo &TRI) const {
3500 // Lie and claim anything is legal, even though this needs to be an SGPR.
3501 // applyMapping will have to deal with it as a waterfall loop.
3502 unsigned Bank = getRegBankID(Reg, MRI, AMDGPU::SGPRRegBankID);
3503 unsigned Size = getSizeInBits(Reg, MRI, TRI);
3504 return AMDGPU::getValueMapping(Bank, Size);
3505 }
3506
3507 const RegisterBankInfo::ValueMapping *
3508 AMDGPURegisterBankInfo::getVGPROpMapping(Register Reg,
3509 const MachineRegisterInfo &MRI,
3510 const TargetRegisterInfo &TRI) const {
3511 unsigned Size = getSizeInBits(Reg, MRI, TRI);
3512 return AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3513 }
3514
3515 const RegisterBankInfo::ValueMapping *
3516 AMDGPURegisterBankInfo::getAGPROpMapping(Register Reg,
3517 const MachineRegisterInfo &MRI,
3518 const TargetRegisterInfo &TRI) const {
3519 unsigned Size = getSizeInBits(Reg, MRI, TRI);
3520 return AMDGPU::getValueMapping(AMDGPU::AGPRRegBankID, Size);
3521 }
3522
3523 ///
3524 /// This function must return a legal mapping, because
3525 /// AMDGPURegisterBankInfo::getInstrAlternativeMappings() is not called
3526 /// in RegBankSelect::Mode::Fast. Any mapping that would cause a
3527 /// VGPR to SGPR copy to be generated is illegal.
3528 ///
3529 // Operands that must be SGPRs must accept potentially divergent VGPRs as
3530 // legal. These will be dealt with in applyMappingImpl.
3531 //
3532 const RegisterBankInfo::InstructionMapping &
3533 AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
3534 const MachineFunction &MF = *MI.getParent()->getParent();
3535 const MachineRegisterInfo &MRI = MF.getRegInfo();
3536
3537 if (MI.isCopy() || MI.getOpcode() == AMDGPU::G_FREEZE) {
3538 // The default logic bothers to analyze impossible alternative mappings. We
3539 // want the most straightforward mapping, so just directly handle this.
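// Use the destination bank when it is already known; an unassigned
// destination inherits the source bank, which degenerates to a trivially
// legal same-bank copy.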
3540 const RegisterBank *DstBank = getRegBank(MI.getOperand(0).getReg(), MRI,
3541 *TRI);
3542 const RegisterBank *SrcBank = getRegBank(MI.getOperand(1).getReg(), MRI,
3543 *TRI);
3544 assert(SrcBank && "src bank should have been assigned already");
3545 if (!DstBank)
3546 DstBank = SrcBank;
3547
3548 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
3549 if (MI.getOpcode() != AMDGPU::G_FREEZE &&
3550 cannotCopy(*DstBank, *SrcBank, Size))
3551 return getInvalidInstructionMapping();
3552
3553 const ValueMapping &ValMap = getValueMapping(0, Size, *DstBank);
3554 unsigned OpdsMappingSize = MI.isCopy() ? 1 : 2;
3555 SmallVector<const ValueMapping *, 1> OpdsMapping(OpdsMappingSize);
3556 OpdsMapping[0] = &ValMap;
3557 if (MI.getOpcode() == AMDGPU::G_FREEZE)
3558 OpdsMapping[1] = &ValMap;
3559
3560 return getInstructionMapping(
3561 1, /*Cost*/ 1,
3562 /*OperandsMapping*/ getOperandsMapping(OpdsMapping), OpdsMappingSize);
3563 }
3564
3565 if (MI.isRegSequence()) {
3566 // If any input is a VGPR, the result must be a VGPR. The default handling
3567 // assumes any copy between banks is legal.
3568 unsigned BankID = AMDGPU::SGPRRegBankID;
3569
3570 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
3571 auto OpBank = getRegBankID(MI.getOperand(I).getReg(), MRI);
3572 // It doesn't make sense to use vcc or scc banks here, so just ignore
3573 // them.
3574 if (OpBank != AMDGPU::SGPRRegBankID) {
3575 BankID = AMDGPU::VGPRRegBankID;
3576 break;
3577 }
3578 }
3579 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
3580
3581 const ValueMapping &ValMap = getValueMapping(0, Size, getRegBank(BankID));
3582 return getInstructionMapping(
3583 1, /*Cost*/ 1,
3584 /*OperandsMapping*/ getOperandsMapping({&ValMap}), 1);
3585 }
3586
3587 // The default handling is broken and doesn't handle illegal VGPR->SGPR
3588 // copies properly.
3589 //
3590 // TODO: There are additional exec masking dependencies to analyze.
3591 if (MI.getOpcode() == TargetOpcode::G_PHI) {
3592 unsigned ResultBank = AMDGPU::InvalidRegBankID;
3593 Register DstReg = MI.getOperand(0).getReg();
3594
3595 // Sometimes the result may have already been assigned a bank.
3596 if (const RegisterBank *DstBank = getRegBank(DstReg, MRI, *TRI))
3597 ResultBank = DstBank->getID();
3598
3599 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
3600 Register Reg = MI.getOperand(I).getReg();
3601 const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
3602
3603 // FIXME: Assuming VGPR for any undetermined inputs.
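// VGPR is the conservative answer: an SGPR value can always be copied
// into a VGPR, but not the other way around, so any unknown input has
// to be treated as potentially divergent.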
3604 if (!Bank || Bank->getID() == AMDGPU::VGPRRegBankID) { 3605 ResultBank = AMDGPU::VGPRRegBankID; 3606 break; 3607 } 3608 3609 // FIXME: Need to promote SGPR case to s32 3610 unsigned OpBank = Bank->getID(); 3611 ResultBank = regBankBoolUnion(ResultBank, OpBank); 3612 } 3613 3614 assert(ResultBank != AMDGPU::InvalidRegBankID); 3615 3616 unsigned Size = MRI.getType(DstReg).getSizeInBits(); 3617 3618 const ValueMapping &ValMap = 3619 getValueMapping(0, Size, getRegBank(ResultBank)); 3620 return getInstructionMapping( 3621 1, /*Cost*/ 1, 3622 /*OperandsMapping*/ getOperandsMapping({&ValMap}), 1); 3623 } 3624 3625 const RegisterBankInfo::InstructionMapping &Mapping = getInstrMappingImpl(MI); 3626 if (Mapping.isValid()) 3627 return Mapping; 3628 3629 SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands()); 3630 3631 switch (MI.getOpcode()) { 3632 default: 3633 return getInvalidInstructionMapping(); 3634 3635 case AMDGPU::G_AND: 3636 case AMDGPU::G_OR: 3637 case AMDGPU::G_XOR: { 3638 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 3639 if (Size == 1) { 3640 const RegisterBank *DstBank 3641 = getRegBank(MI.getOperand(0).getReg(), MRI, *TRI); 3642 3643 unsigned TargetBankID = AMDGPU::InvalidRegBankID; 3644 unsigned BankLHS = AMDGPU::InvalidRegBankID; 3645 unsigned BankRHS = AMDGPU::InvalidRegBankID; 3646 if (DstBank) { 3647 TargetBankID = DstBank->getID(); 3648 if (DstBank == &AMDGPU::VCCRegBank) { 3649 TargetBankID = AMDGPU::VCCRegBankID; 3650 BankLHS = AMDGPU::VCCRegBankID; 3651 BankRHS = AMDGPU::VCCRegBankID; 3652 } else { 3653 BankLHS = getRegBankID(MI.getOperand(1).getReg(), MRI, 3654 AMDGPU::SGPRRegBankID); 3655 BankRHS = getRegBankID(MI.getOperand(2).getReg(), MRI, 3656 AMDGPU::SGPRRegBankID); 3657 } 3658 } else { 3659 BankLHS = getRegBankID(MI.getOperand(1).getReg(), MRI, 3660 AMDGPU::VCCRegBankID); 3661 BankRHS = getRegBankID(MI.getOperand(2).getReg(), MRI, 3662 AMDGPU::VCCRegBankID); 3663 3664 // Both inputs should be true booleans to produce a boolean result. 
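// Summarizing the cases below, in the spirit of regBankUnion:
//   vgpr with anything -> vgpr
//   vcc  with vcc/sgpr -> vcc (the sgpr side is copied into vcc)
//   sgpr with sgpr     -> sgpr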
3665 if (BankLHS == AMDGPU::VGPRRegBankID || BankRHS == AMDGPU::VGPRRegBankID) { 3666 TargetBankID = AMDGPU::VGPRRegBankID; 3667 } else if (BankLHS == AMDGPU::VCCRegBankID || BankRHS == AMDGPU::VCCRegBankID) { 3668 TargetBankID = AMDGPU::VCCRegBankID; 3669 BankLHS = AMDGPU::VCCRegBankID; 3670 BankRHS = AMDGPU::VCCRegBankID; 3671 } else if (BankLHS == AMDGPU::SGPRRegBankID && BankRHS == AMDGPU::SGPRRegBankID) { 3672 TargetBankID = AMDGPU::SGPRRegBankID; 3673 } 3674 } 3675 3676 OpdsMapping[0] = AMDGPU::getValueMapping(TargetBankID, Size); 3677 OpdsMapping[1] = AMDGPU::getValueMapping(BankLHS, Size); 3678 OpdsMapping[2] = AMDGPU::getValueMapping(BankRHS, Size); 3679 break; 3680 } 3681 3682 if (Size == 64) { 3683 3684 if (isSALUMapping(MI)) { 3685 OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::SGPRRegBankID, Size); 3686 OpdsMapping[1] = OpdsMapping[2] = OpdsMapping[0]; 3687 } else { 3688 OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size); 3689 unsigned Bank1 = getRegBankID(MI.getOperand(1).getReg(), MRI /*, DefaultBankID*/); 3690 OpdsMapping[1] = AMDGPU::getValueMapping(Bank1, Size); 3691 3692 unsigned Bank2 = getRegBankID(MI.getOperand(2).getReg(), MRI /*, DefaultBankID*/); 3693 OpdsMapping[2] = AMDGPU::getValueMapping(Bank2, Size); 3694 } 3695 3696 break; 3697 } 3698 3699 [[fallthrough]]; 3700 } 3701 case AMDGPU::G_PTR_ADD: 3702 case AMDGPU::G_PTRMASK: 3703 case AMDGPU::G_ADD: 3704 case AMDGPU::G_SUB: 3705 case AMDGPU::G_MUL: 3706 case AMDGPU::G_SHL: 3707 case AMDGPU::G_LSHR: 3708 case AMDGPU::G_ASHR: 3709 case AMDGPU::G_UADDO: 3710 case AMDGPU::G_USUBO: 3711 case AMDGPU::G_UADDE: 3712 case AMDGPU::G_SADDE: 3713 case AMDGPU::G_USUBE: 3714 case AMDGPU::G_SSUBE: 3715 case AMDGPU::G_SMIN: 3716 case AMDGPU::G_SMAX: 3717 case AMDGPU::G_UMIN: 3718 case AMDGPU::G_UMAX: 3719 case AMDGPU::G_ABS: 3720 case AMDGPU::G_SHUFFLE_VECTOR: 3721 case AMDGPU::G_SBFX: 3722 case AMDGPU::G_UBFX: 3723 if (isSALUMapping(MI)) 3724 return getDefaultMappingSOP(MI); 3725 [[fallthrough]]; 3726 3727 case AMDGPU::G_SADDSAT: // FIXME: Could lower sat ops for SALU 3728 case AMDGPU::G_SSUBSAT: 3729 case AMDGPU::G_UADDSAT: 3730 case AMDGPU::G_USUBSAT: 3731 case AMDGPU::G_FADD: 3732 case AMDGPU::G_FSUB: 3733 case AMDGPU::G_FPTOSI: 3734 case AMDGPU::G_FPTOUI: 3735 case AMDGPU::G_FMUL: 3736 case AMDGPU::G_FMA: 3737 case AMDGPU::G_FMAD: 3738 case AMDGPU::G_FSQRT: 3739 case AMDGPU::G_FFLOOR: 3740 case AMDGPU::G_FCEIL: 3741 case AMDGPU::G_FRINT: 3742 case AMDGPU::G_SITOFP: 3743 case AMDGPU::G_UITOFP: 3744 case AMDGPU::G_FPTRUNC: 3745 case AMDGPU::G_FPEXT: 3746 case AMDGPU::G_FEXP2: 3747 case AMDGPU::G_FLOG2: 3748 case AMDGPU::G_FMINNUM: 3749 case AMDGPU::G_FMAXNUM: 3750 case AMDGPU::G_FMINNUM_IEEE: 3751 case AMDGPU::G_FMAXNUM_IEEE: 3752 case AMDGPU::G_FCANONICALIZE: 3753 case AMDGPU::G_INTRINSIC_TRUNC: 3754 case AMDGPU::G_STRICT_FADD: 3755 case AMDGPU::G_STRICT_FSUB: 3756 case AMDGPU::G_STRICT_FMUL: 3757 case AMDGPU::G_STRICT_FMA: 3758 case AMDGPU::G_BSWAP: // TODO: Somehow expand for scalar? 
3759 case AMDGPU::G_FSHR: // TODO: Expand for scalar 3760 case AMDGPU::G_AMDGPU_FMIN_LEGACY: 3761 case AMDGPU::G_AMDGPU_FMAX_LEGACY: 3762 case AMDGPU::G_AMDGPU_RCP_IFLAG: 3763 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0: 3764 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1: 3765 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2: 3766 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3: 3767 case AMDGPU::G_AMDGPU_CVT_PK_I16_I32: 3768 case AMDGPU::G_AMDGPU_SMED3: 3769 return getDefaultMappingVOP(MI); 3770 case AMDGPU::G_UMULH: 3771 case AMDGPU::G_SMULH: { 3772 if (Subtarget.hasScalarMulHiInsts() && isSALUMapping(MI)) 3773 return getDefaultMappingSOP(MI); 3774 return getDefaultMappingVOP(MI); 3775 } 3776 case AMDGPU::G_AMDGPU_MAD_U64_U32: 3777 case AMDGPU::G_AMDGPU_MAD_I64_I32: { 3778 // Three possible mappings: 3779 // 3780 // - Default SOP 3781 // - Default VOP 3782 // - Scalar multiply: src0 and src1 are SGPRs, the rest is VOP. 3783 // 3784 // This allows instruction selection to keep the multiplication part of the 3785 // instruction on the SALU. 3786 bool AllSalu = true; 3787 bool MulSalu = true; 3788 for (unsigned i = 0; i < 5; ++i) { 3789 Register Reg = MI.getOperand(i).getReg(); 3790 if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) { 3791 if (Bank->getID() != AMDGPU::SGPRRegBankID) { 3792 AllSalu = false; 3793 if (i == 2 || i == 3) { 3794 MulSalu = false; 3795 break; 3796 } 3797 } 3798 } 3799 } 3800 3801 if (AllSalu) 3802 return getDefaultMappingSOP(MI); 3803 3804 // If the multiply-add is full-rate in VALU, use that even if the 3805 // multiplication part is scalar. Accumulating separately on the VALU would 3806 // take two instructions. 3807 if (!MulSalu || Subtarget.hasFullRate64Ops()) 3808 return getDefaultMappingVOP(MI); 3809 3810 // Keep the multiplication on the SALU, then accumulate on the VALU. 3811 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 64); 3812 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1); 3813 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32); 3814 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32); 3815 OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 64); 3816 break; 3817 } 3818 case AMDGPU::G_IMPLICIT_DEF: { 3819 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 3820 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); 3821 break; 3822 } 3823 case AMDGPU::G_FCONSTANT: 3824 case AMDGPU::G_CONSTANT: 3825 case AMDGPU::G_GLOBAL_VALUE: 3826 case AMDGPU::G_BLOCK_ADDR: 3827 case AMDGPU::G_READCYCLECOUNTER: { 3828 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 3829 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); 3830 break; 3831 } 3832 case AMDGPU::G_FRAME_INDEX: { 3833 // TODO: This should be the same as other constants, but eliminateFrameIndex 3834 // currently assumes VALU uses. 3835 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 3836 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); 3837 break; 3838 } 3839 case AMDGPU::G_DYN_STACKALLOC: { 3840 // Result is always uniform, and a wave reduction is needed for the source. 
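// The stack pointer is adjusted once per wave, so a divergent size
// operand cannot be consumed directly; applyMappingDynStackAlloc is
// expected to rewrite a VGPR source into a wave-uniform value.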
3841 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32); 3842 unsigned SrcBankID = getRegBankID(MI.getOperand(1).getReg(), MRI); 3843 OpdsMapping[1] = AMDGPU::getValueMapping(SrcBankID, 32); 3844 break; 3845 } 3846 case AMDGPU::G_AMDGPU_WAVE_ADDRESS: { 3847 // This case is weird because we expect a physical register in the source, 3848 // but need to set a bank anyway. 3849 // 3850 // We could select the result to SGPR or VGPR, but for the one current use 3851 // it's more practical to always use VGPR. 3852 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); 3853 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32); 3854 break; 3855 } 3856 case AMDGPU::G_INSERT: { 3857 unsigned BankID = getMappingType(MRI, MI); 3858 unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); 3859 unsigned SrcSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI); 3860 unsigned EltSize = getSizeInBits(MI.getOperand(2).getReg(), MRI, *TRI); 3861 OpdsMapping[0] = AMDGPU::getValueMapping(BankID, DstSize); 3862 OpdsMapping[1] = AMDGPU::getValueMapping(BankID, SrcSize); 3863 OpdsMapping[2] = AMDGPU::getValueMapping(BankID, EltSize); 3864 OpdsMapping[3] = nullptr; 3865 break; 3866 } 3867 case AMDGPU::G_EXTRACT: { 3868 unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI); 3869 unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); 3870 unsigned SrcSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI); 3871 OpdsMapping[0] = AMDGPU::getValueMapping(BankID, DstSize); 3872 OpdsMapping[1] = AMDGPU::getValueMapping(BankID, SrcSize); 3873 OpdsMapping[2] = nullptr; 3874 break; 3875 } 3876 case AMDGPU::G_BUILD_VECTOR: 3877 case AMDGPU::G_BUILD_VECTOR_TRUNC: { 3878 LLT DstTy = MRI.getType(MI.getOperand(0).getReg()); 3879 if (DstTy == LLT::fixed_vector(2, 16)) { 3880 unsigned DstSize = DstTy.getSizeInBits(); 3881 unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits(); 3882 unsigned Src0BankID = getRegBankID(MI.getOperand(1).getReg(), MRI); 3883 unsigned Src1BankID = getRegBankID(MI.getOperand(2).getReg(), MRI); 3884 unsigned DstBankID = regBankUnion(Src0BankID, Src1BankID); 3885 3886 OpdsMapping[0] = AMDGPU::getValueMapping(DstBankID, DstSize); 3887 OpdsMapping[1] = AMDGPU::getValueMapping(Src0BankID, SrcSize); 3888 OpdsMapping[2] = AMDGPU::getValueMapping(Src1BankID, SrcSize); 3889 break; 3890 } 3891 3892 [[fallthrough]]; 3893 } 3894 case AMDGPU::G_MERGE_VALUES: 3895 case AMDGPU::G_CONCAT_VECTORS: { 3896 unsigned Bank = getMappingType(MRI, MI); 3897 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 3898 unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits(); 3899 3900 OpdsMapping[0] = AMDGPU::getValueMapping(Bank, DstSize); 3901 // Op1 and Dst should use the same register bank. 
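// e.g. an all-SGPR %d:(s64) = G_MERGE_VALUES %a:(s32), %b:(s32) stays
// entirely in the SGPR bank; any mixed input already forced Bank to
// VGPR via getMappingType.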
3902 for (unsigned i = 1, e = MI.getNumOperands(); i != e; ++i) 3903 OpdsMapping[i] = AMDGPU::getValueMapping(Bank, SrcSize); 3904 break; 3905 } 3906 case AMDGPU::G_BITREVERSE: 3907 case AMDGPU::G_BITCAST: 3908 case AMDGPU::G_INTTOPTR: 3909 case AMDGPU::G_PTRTOINT: 3910 case AMDGPU::G_FABS: 3911 case AMDGPU::G_FNEG: { 3912 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 3913 unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI); 3914 OpdsMapping[0] = OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size); 3915 break; 3916 } 3917 case AMDGPU::G_AMDGPU_FFBH_U32: 3918 case AMDGPU::G_AMDGPU_FFBL_B32: 3919 case AMDGPU::G_CTLZ_ZERO_UNDEF: 3920 case AMDGPU::G_CTTZ_ZERO_UNDEF: { 3921 unsigned Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits(); 3922 unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI); 3923 OpdsMapping[0] = AMDGPU::getValueMapping(BankID, 32); 3924 OpdsMapping[1] = AMDGPU::getValueMappingSGPR64Only(BankID, Size); 3925 break; 3926 } 3927 case AMDGPU::G_CTPOP: { 3928 unsigned Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits(); 3929 unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI); 3930 OpdsMapping[0] = AMDGPU::getValueMapping(BankID, 32); 3931 3932 // This should really be getValueMappingSGPR64Only, but allowing the generic 3933 // code to handle the register split just makes using LegalizerHelper more 3934 // difficult. 3935 OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size); 3936 break; 3937 } 3938 case AMDGPU::G_TRUNC: { 3939 Register Dst = MI.getOperand(0).getReg(); 3940 Register Src = MI.getOperand(1).getReg(); 3941 unsigned Bank = getRegBankID(Src, MRI); 3942 unsigned DstSize = getSizeInBits(Dst, MRI, *TRI); 3943 unsigned SrcSize = getSizeInBits(Src, MRI, *TRI); 3944 OpdsMapping[0] = AMDGPU::getValueMapping(Bank, DstSize); 3945 OpdsMapping[1] = AMDGPU::getValueMapping(Bank, SrcSize); 3946 break; 3947 } 3948 case AMDGPU::G_ZEXT: 3949 case AMDGPU::G_SEXT: 3950 case AMDGPU::G_ANYEXT: 3951 case AMDGPU::G_SEXT_INREG: { 3952 Register Dst = MI.getOperand(0).getReg(); 3953 Register Src = MI.getOperand(1).getReg(); 3954 unsigned DstSize = getSizeInBits(Dst, MRI, *TRI); 3955 unsigned SrcSize = getSizeInBits(Src, MRI, *TRI); 3956 3957 unsigned DstBank; 3958 const RegisterBank *SrcBank = getRegBank(Src, MRI, *TRI); 3959 assert(SrcBank); 3960 switch (SrcBank->getID()) { 3961 case AMDGPU::SGPRRegBankID: 3962 DstBank = AMDGPU::SGPRRegBankID; 3963 break; 3964 default: 3965 DstBank = AMDGPU::VGPRRegBankID; 3966 break; 3967 } 3968 3969 // Scalar extend can use 64-bit BFE, but VGPRs require extending to 3970 // 32-bits, and then to 64. 3971 OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(DstBank, DstSize); 3972 OpdsMapping[1] = AMDGPU::getValueMappingSGPR64Only(SrcBank->getID(), 3973 SrcSize); 3974 break; 3975 } 3976 case AMDGPU::G_FCMP: { 3977 unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits(); 3978 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1); 3979 OpdsMapping[1] = nullptr; // Predicate Operand. 
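// G_FCMP is only given a VALU mapping here, so the sources are VGPRs
// and the result is a vcc boolean regardless of operand uniformity.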
3980 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3981 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3982 break;
3983 }
3984 case AMDGPU::G_IS_FPCLASS: {
3985 Register SrcReg = MI.getOperand(1).getReg();
3986 unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits();
3987 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3988 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, DstSize);
3989 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
3990 break;
3991 }
3992 case AMDGPU::G_STORE: {
3993 assert(MI.getOperand(0).isReg());
3994 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3995
3996 // FIXME: We need to specify a different reg bank once scalar stores are
3997 // supported.
3998 const ValueMapping *ValMapping =
3999 AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4000 OpdsMapping[0] = ValMapping;
4001 OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg());
4002 break;
4003 }
4004 case AMDGPU::G_ICMP: {
4005 auto Pred = static_cast<CmpInst::Predicate>(MI.getOperand(1).getPredicate());
4006 unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
4007
4008 // See if the result register has already been constrained to vcc, which may
4009 // happen due to control flow intrinsic lowering.
4010 unsigned DstBank = getRegBankID(MI.getOperand(0).getReg(), MRI,
4011 AMDGPU::SGPRRegBankID);
4012 unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI);
4013 unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI);
4014
4015 bool CanUseSCC = DstBank == AMDGPU::SGPRRegBankID &&
4016 Op2Bank == AMDGPU::SGPRRegBankID &&
4017 Op3Bank == AMDGPU::SGPRRegBankID &&
4018 (Size == 32 || (Size == 64 &&
4019 (Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE) &&
4020 Subtarget.hasScalarCompareEq64()));
4021
4022 DstBank = CanUseSCC ? AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID;
4023 unsigned SrcBank = CanUseSCC ? AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
4024
4025 // TODO: Use 32-bit for scalar output size.
4026 // SCC results will need to be copied to a 32-bit SGPR virtual register.
4027 const unsigned ResultSize = 1;
4028
4029 OpdsMapping[0] = AMDGPU::getValueMapping(DstBank, ResultSize);
4030 OpdsMapping[2] = AMDGPU::getValueMapping(SrcBank, Size);
4031 OpdsMapping[3] = AMDGPU::getValueMapping(SrcBank, Size);
4032 break;
4033 }
4034 case AMDGPU::G_EXTRACT_VECTOR_ELT: {
4035 // A VGPR index can be used for a waterfall loop when indexing an SGPR vector.
4036 unsigned SrcBankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
4037 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4038 unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
4039 unsigned IdxSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
4040 unsigned IdxBank = getRegBankID(MI.getOperand(2).getReg(), MRI);
4041 unsigned OutputBankID = regBankUnion(SrcBankID, IdxBank);
4042
4043 OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(OutputBankID, DstSize);
4044 OpdsMapping[1] = AMDGPU::getValueMapping(SrcBankID, SrcSize);
4045
4046 // The index can be in either bank if the source vector is VGPR.
4047 OpdsMapping[2] = AMDGPU::getValueMapping(IdxBank, IdxSize);
4048 break;
4049 }
4050 case AMDGPU::G_INSERT_VECTOR_ELT: {
4051 unsigned OutputBankID = isSALUMapping(MI) ?
4052 AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
4053
4054 unsigned VecSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4055 unsigned InsertSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
4056 unsigned IdxSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits();
4057 unsigned InsertEltBankID = getRegBankID(MI.getOperand(2).getReg(), MRI);
4058 unsigned IdxBankID = getRegBankID(MI.getOperand(3).getReg(), MRI);
4059
4060 OpdsMapping[0] = AMDGPU::getValueMapping(OutputBankID, VecSize);
4061 OpdsMapping[1] = AMDGPU::getValueMapping(OutputBankID, VecSize);
4062
4063 // This is a weird case, because we need to break down the mapping based on
4064 // the register bank of a different operand.
4065 if (InsertSize == 64 && OutputBankID == AMDGPU::VGPRRegBankID) {
4066 OpdsMapping[2] = AMDGPU::getValueMappingSplit64(InsertEltBankID,
4067 InsertSize);
4068 } else {
4069 assert(InsertSize == 32 || InsertSize == 64);
4070 OpdsMapping[2] = AMDGPU::getValueMapping(InsertEltBankID, InsertSize);
4071 }
4072
4073 // The index can be in either bank if the source vector is VGPR.
4074 OpdsMapping[3] = AMDGPU::getValueMapping(IdxBankID, IdxSize);
4075 break;
4076 }
4077 case AMDGPU::G_UNMERGE_VALUES: {
4078 unsigned Bank = getMappingType(MRI, MI);
4079
4080 // Op1 and Dst should use the same register bank.
4081 // FIXME: Shouldn't this be the default? Why do we need to handle this?
4082 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
4083 unsigned Size = getSizeInBits(MI.getOperand(i).getReg(), MRI, *TRI);
4084 OpdsMapping[i] = AMDGPU::getValueMapping(Bank, Size);
4085 }
4086 break;
4087 }
4088 case AMDGPU::G_AMDGPU_BUFFER_LOAD:
4089 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
4090 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE:
4091 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
4092 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT:
4093 case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT:
4094 case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE:
4095 case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16:
4096 case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT:
4097 case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16:
4098 case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT:
4099 case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16:
4100 case AMDGPU::G_AMDGPU_BUFFER_STORE:
4101 case AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE:
4102 case AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT:
4103 case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT:
4104 case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16: {
4105 OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4106
4107 // rsrc
4108 OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
4109
4110 // vindex
4111 OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4112
4113 // voffset
4114 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4115
4116 // soffset
4117 OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4118
4119 // Any remaining operands are immediates and were correctly null
4120 // initialized.
4121 break;
4122 }
4123 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP:
4124 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD:
4125 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB:
4126 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN:
4127 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN:
4128 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX:
4129 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX:
4130 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND:
4131 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR:
4132 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR:
4133 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC:
4134 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC:
4135 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD:
4136 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN:
4137 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX: {
4138 // vdata_out
4139 OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4140
4141 // vdata_in
4142 OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
4143
4144 // rsrc
4145 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4146
4147 // vindex
4148 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4149
4150 // voffset
4151 OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4152
4153 // soffset
4154 OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
4155
4156 // Any remaining operands are immediates and were correctly null
4157 // initialized.
4158 break;
4159 }
4160 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP: {
4161 // vdata_out
4162 OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4163
4164 // vdata_in
4165 OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
4166
4167 // cmp
4168 OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4169
4170 // rsrc
4171 OpdsMapping[3] = getSGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4172
4173 // vindex
4174 OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4175
4176 // voffset
4177 OpdsMapping[5] = getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
4178
4179 // soffset
4180 OpdsMapping[6] = getSGPROpMapping(MI.getOperand(6).getReg(), MRI, *TRI);
4181
4182 // Any remaining operands are immediates and were correctly null
4183 // initialized.
4184 break;
4185 }
4186 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD: {
4187 // Lie and claim everything is legal, even though some need to be
4188 // SGPRs. applyMapping will have to deal with it as a waterfall loop.
4189 OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
4190 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4191
4192 // We need to convert this to a MUBUF if either the resource or offset is
4193 // VGPR.
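// In other words, the result is only uniform when both inputs are; the
// regBankUnion below reports SGPR only for the all-SGPR case.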
4194 unsigned RSrcBank = OpdsMapping[1]->BreakDown[0].RegBank->getID(); 4195 unsigned OffsetBank = OpdsMapping[2]->BreakDown[0].RegBank->getID(); 4196 unsigned ResultBank = regBankUnion(RSrcBank, OffsetBank); 4197 4198 unsigned Size0 = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4199 OpdsMapping[0] = AMDGPU::getValueMapping(ResultBank, Size0); 4200 break; 4201 } 4202 case AMDGPU::G_INTRINSIC: { 4203 switch (MI.getIntrinsicID()) { 4204 default: 4205 return getInvalidInstructionMapping(); 4206 case Intrinsic::amdgcn_div_fmas: 4207 case Intrinsic::amdgcn_div_fixup: 4208 case Intrinsic::amdgcn_trig_preop: 4209 case Intrinsic::amdgcn_sin: 4210 case Intrinsic::amdgcn_cos: 4211 case Intrinsic::amdgcn_log_clamp: 4212 case Intrinsic::amdgcn_rcp: 4213 case Intrinsic::amdgcn_rcp_legacy: 4214 case Intrinsic::amdgcn_sqrt: 4215 case Intrinsic::amdgcn_rsq: 4216 case Intrinsic::amdgcn_rsq_legacy: 4217 case Intrinsic::amdgcn_rsq_clamp: 4218 case Intrinsic::amdgcn_fmul_legacy: 4219 case Intrinsic::amdgcn_fma_legacy: 4220 case Intrinsic::amdgcn_ldexp: 4221 case Intrinsic::amdgcn_frexp_mant: 4222 case Intrinsic::amdgcn_frexp_exp: 4223 case Intrinsic::amdgcn_fract: 4224 case Intrinsic::amdgcn_cvt_pkrtz: 4225 case Intrinsic::amdgcn_cvt_pknorm_i16: 4226 case Intrinsic::amdgcn_cvt_pknorm_u16: 4227 case Intrinsic::amdgcn_cvt_pk_i16: 4228 case Intrinsic::amdgcn_cvt_pk_u16: 4229 case Intrinsic::amdgcn_fmed3: 4230 case Intrinsic::amdgcn_cubeid: 4231 case Intrinsic::amdgcn_cubema: 4232 case Intrinsic::amdgcn_cubesc: 4233 case Intrinsic::amdgcn_cubetc: 4234 case Intrinsic::amdgcn_sffbh: 4235 case Intrinsic::amdgcn_fmad_ftz: 4236 case Intrinsic::amdgcn_mbcnt_lo: 4237 case Intrinsic::amdgcn_mbcnt_hi: 4238 case Intrinsic::amdgcn_mul_u24: 4239 case Intrinsic::amdgcn_mul_i24: 4240 case Intrinsic::amdgcn_mulhi_u24: 4241 case Intrinsic::amdgcn_mulhi_i24: 4242 case Intrinsic::amdgcn_lerp: 4243 case Intrinsic::amdgcn_sad_u8: 4244 case Intrinsic::amdgcn_msad_u8: 4245 case Intrinsic::amdgcn_sad_hi_u8: 4246 case Intrinsic::amdgcn_sad_u16: 4247 case Intrinsic::amdgcn_qsad_pk_u16_u8: 4248 case Intrinsic::amdgcn_mqsad_pk_u16_u8: 4249 case Intrinsic::amdgcn_mqsad_u32_u8: 4250 case Intrinsic::amdgcn_cvt_pk_u8_f32: 4251 case Intrinsic::amdgcn_alignbyte: 4252 case Intrinsic::amdgcn_perm: 4253 case Intrinsic::amdgcn_fdot2: 4254 case Intrinsic::amdgcn_sdot2: 4255 case Intrinsic::amdgcn_udot2: 4256 case Intrinsic::amdgcn_sdot4: 4257 case Intrinsic::amdgcn_udot4: 4258 case Intrinsic::amdgcn_sdot8: 4259 case Intrinsic::amdgcn_udot8: 4260 case Intrinsic::amdgcn_fdot2_bf16_bf16: 4261 case Intrinsic::amdgcn_fdot2_f16_f16: 4262 case Intrinsic::amdgcn_fdot2_f32_bf16: 4263 case Intrinsic::amdgcn_sudot4: 4264 case Intrinsic::amdgcn_sudot8: 4265 case Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16: 4266 case Intrinsic::amdgcn_wmma_f16_16x16x16_f16: 4267 case Intrinsic::amdgcn_wmma_f32_16x16x16_bf16: 4268 case Intrinsic::amdgcn_wmma_f32_16x16x16_f16: 4269 case Intrinsic::amdgcn_wmma_i32_16x16x16_iu4: 4270 case Intrinsic::amdgcn_wmma_i32_16x16x16_iu8: 4271 return getDefaultMappingVOP(MI); 4272 case Intrinsic::amdgcn_sbfe: 4273 case Intrinsic::amdgcn_ubfe: 4274 if (isSALUMapping(MI)) 4275 return getDefaultMappingSOP(MI); 4276 return getDefaultMappingVOP(MI); 4277 case Intrinsic::amdgcn_ds_swizzle: 4278 case Intrinsic::amdgcn_ds_permute: 4279 case Intrinsic::amdgcn_ds_bpermute: 4280 case Intrinsic::amdgcn_update_dpp: 4281 case Intrinsic::amdgcn_mov_dpp8: 4282 case Intrinsic::amdgcn_mov_dpp: 4283 case Intrinsic::amdgcn_strict_wwm: 4284 case 
Intrinsic::amdgcn_wwm: 4285 case Intrinsic::amdgcn_strict_wqm: 4286 case Intrinsic::amdgcn_wqm: 4287 case Intrinsic::amdgcn_softwqm: 4288 case Intrinsic::amdgcn_set_inactive: 4289 case Intrinsic::amdgcn_permlane64: 4290 return getDefaultMappingAllVGPR(MI); 4291 case Intrinsic::amdgcn_kernarg_segment_ptr: 4292 case Intrinsic::amdgcn_s_getpc: 4293 case Intrinsic::amdgcn_groupstaticsize: 4294 case Intrinsic::amdgcn_reloc_constant: 4295 case Intrinsic::returnaddress: { 4296 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4297 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); 4298 break; 4299 } 4300 case Intrinsic::amdgcn_wqm_vote: { 4301 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4302 OpdsMapping[0] = OpdsMapping[2] 4303 = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size); 4304 break; 4305 } 4306 case Intrinsic::amdgcn_ps_live: { 4307 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1); 4308 break; 4309 } 4310 case Intrinsic::amdgcn_div_scale: { 4311 unsigned Dst0Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4312 unsigned Dst1Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits(); 4313 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Dst0Size); 4314 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Dst1Size); 4315 4316 unsigned SrcSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits(); 4317 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize); 4318 OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize); 4319 break; 4320 } 4321 case Intrinsic::amdgcn_class: { 4322 Register Src0Reg = MI.getOperand(2).getReg(); 4323 Register Src1Reg = MI.getOperand(3).getReg(); 4324 unsigned Src0Size = MRI.getType(Src0Reg).getSizeInBits(); 4325 unsigned Src1Size = MRI.getType(Src1Reg).getSizeInBits(); 4326 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4327 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, DstSize); 4328 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Src0Size); 4329 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Src1Size); 4330 break; 4331 } 4332 case Intrinsic::amdgcn_icmp: 4333 case Intrinsic::amdgcn_fcmp: { 4334 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4335 // This is not VCCRegBank because this is not used in boolean contexts. 4336 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize); 4337 unsigned OpSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits(); 4338 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, OpSize); 4339 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, OpSize); 4340 break; 4341 } 4342 case Intrinsic::amdgcn_readlane: { 4343 // This must be an SGPR, but accept a VGPR. 
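// A VGPR index is expected to be legalized later with a readfirstlane,
// as with the other must-be-SGPR operands handled in applyMappingImpl.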
4344 Register IdxReg = MI.getOperand(3).getReg(); 4345 unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits(); 4346 unsigned IdxBank = getRegBankID(IdxReg, MRI, AMDGPU::SGPRRegBankID); 4347 OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize); 4348 [[fallthrough]]; 4349 } 4350 case Intrinsic::amdgcn_readfirstlane: { 4351 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4352 unsigned SrcSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits(); 4353 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize); 4354 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize); 4355 break; 4356 } 4357 case Intrinsic::amdgcn_writelane: { 4358 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4359 Register SrcReg = MI.getOperand(2).getReg(); 4360 unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits(); 4361 unsigned SrcBank = getRegBankID(SrcReg, MRI, AMDGPU::SGPRRegBankID); 4362 Register IdxReg = MI.getOperand(3).getReg(); 4363 unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits(); 4364 unsigned IdxBank = getRegBankID(IdxReg, MRI, AMDGPU::SGPRRegBankID); 4365 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize); 4366 4367 // These 2 must be SGPRs, but accept VGPRs. Readfirstlane will be inserted 4368 // to legalize. 4369 OpdsMapping[2] = AMDGPU::getValueMapping(SrcBank, SrcSize); 4370 OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize); 4371 OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize); 4372 break; 4373 } 4374 case Intrinsic::amdgcn_if_break: { 4375 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); 4376 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); 4377 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1); 4378 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); 4379 break; 4380 } 4381 case Intrinsic::amdgcn_permlane16: 4382 case Intrinsic::amdgcn_permlanex16: { 4383 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); 4384 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); 4385 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); 4386 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); 4387 OpdsMapping[4] = getSGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); 4388 OpdsMapping[5] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); 4389 break; 4390 } 4391 case Intrinsic::amdgcn_mfma_f32_4x4x1f32: 4392 case Intrinsic::amdgcn_mfma_f32_4x4x4f16: 4393 case Intrinsic::amdgcn_mfma_i32_4x4x4i8: 4394 case Intrinsic::amdgcn_mfma_f32_4x4x2bf16: 4395 case Intrinsic::amdgcn_mfma_f32_16x16x1f32: 4396 case Intrinsic::amdgcn_mfma_f32_16x16x4f32: 4397 case Intrinsic::amdgcn_mfma_f32_16x16x4f16: 4398 case Intrinsic::amdgcn_mfma_f32_16x16x16f16: 4399 case Intrinsic::amdgcn_mfma_i32_16x16x4i8: 4400 case Intrinsic::amdgcn_mfma_i32_16x16x16i8: 4401 case Intrinsic::amdgcn_mfma_f32_16x16x2bf16: 4402 case Intrinsic::amdgcn_mfma_f32_16x16x8bf16: 4403 case Intrinsic::amdgcn_mfma_f32_32x32x1f32: 4404 case Intrinsic::amdgcn_mfma_f32_32x32x2f32: 4405 case Intrinsic::amdgcn_mfma_f32_32x32x4f16: 4406 case Intrinsic::amdgcn_mfma_f32_32x32x8f16: 4407 case Intrinsic::amdgcn_mfma_i32_32x32x4i8: 4408 case Intrinsic::amdgcn_mfma_i32_32x32x8i8: 4409 case Intrinsic::amdgcn_mfma_f32_32x32x2bf16: 4410 case Intrinsic::amdgcn_mfma_f32_32x32x4bf16: 4411 case Intrinsic::amdgcn_mfma_f32_32x32x4bf16_1k: 4412 case 
Intrinsic::amdgcn_mfma_f32_16x16x4bf16_1k: 4413 case Intrinsic::amdgcn_mfma_f32_4x4x4bf16_1k: 4414 case Intrinsic::amdgcn_mfma_f32_32x32x8bf16_1k: 4415 case Intrinsic::amdgcn_mfma_f32_16x16x16bf16_1k: 4416 case Intrinsic::amdgcn_mfma_f64_16x16x4f64: 4417 case Intrinsic::amdgcn_mfma_f64_4x4x4f64: 4418 case Intrinsic::amdgcn_mfma_i32_16x16x32_i8: 4419 case Intrinsic::amdgcn_mfma_i32_32x32x16_i8: 4420 case Intrinsic::amdgcn_mfma_f32_16x16x8_xf32: 4421 case Intrinsic::amdgcn_mfma_f32_32x32x4_xf32: 4422 case Intrinsic::amdgcn_mfma_f32_16x16x32_bf8_bf8: 4423 case Intrinsic::amdgcn_mfma_f32_16x16x32_bf8_fp8: 4424 case Intrinsic::amdgcn_mfma_f32_16x16x32_fp8_bf8: 4425 case Intrinsic::amdgcn_mfma_f32_16x16x32_fp8_fp8: 4426 case Intrinsic::amdgcn_mfma_f32_32x32x16_bf8_bf8: 4427 case Intrinsic::amdgcn_mfma_f32_32x32x16_bf8_fp8: 4428 case Intrinsic::amdgcn_mfma_f32_32x32x16_fp8_bf8: 4429 case Intrinsic::amdgcn_mfma_f32_32x32x16_fp8_fp8: { 4430 // Default for MAI intrinsics. 4431 // srcC can also be an immediate which can be folded later. 4432 // FIXME: Should we eventually add an alternative mapping with AGPR src 4433 // for srcA/srcB? 4434 // 4435 // vdst, srcA, srcB, srcC 4436 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 4437 OpdsMapping[0] = 4438 Info->mayNeedAGPRs() 4439 ? getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI) 4440 : getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); 4441 OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); 4442 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); 4443 OpdsMapping[4] = 4444 Info->mayNeedAGPRs() 4445 ? getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI) 4446 : getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); 4447 break; 4448 } 4449 case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16: 4450 case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16: 4451 case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16: 4452 case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16: 4453 case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8: 4454 case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8: 4455 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_bf8: 4456 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_fp8: 4457 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_bf8: 4458 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_fp8: 4459 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8: 4460 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8: 4461 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8: 4462 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8: { 4463 // vdst, srcA, srcB, srcC, idx 4464 OpdsMapping[0] = getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); 4465 OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); 4466 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); 4467 OpdsMapping[4] = getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); 4468 OpdsMapping[5] = getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI); 4469 break; 4470 } 4471 case Intrinsic::amdgcn_interp_p1: 4472 case Intrinsic::amdgcn_interp_p2: 4473 case Intrinsic::amdgcn_interp_mov: 4474 case Intrinsic::amdgcn_interp_p1_f16: 4475 case Intrinsic::amdgcn_interp_p2_f16: 4476 case Intrinsic::amdgcn_lds_param_load: { 4477 const int M0Idx = MI.getNumOperands() - 1; 4478 Register M0Reg = MI.getOperand(M0Idx).getReg(); 4479 unsigned M0Bank = getRegBankID(M0Reg, MRI, AMDGPU::SGPRRegBankID); 4480 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4481 4482 OpdsMapping[0] = 
AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize); 4483 for (int I = 2; I != M0Idx && MI.getOperand(I).isReg(); ++I) 4484 OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); 4485 4486 // Must be SGPR, but we must take whatever the original bank is and fix it 4487 // later. 4488 OpdsMapping[M0Idx] = AMDGPU::getValueMapping(M0Bank, 32); 4489 break; 4490 } 4491 case Intrinsic::amdgcn_interp_inreg_p10: 4492 case Intrinsic::amdgcn_interp_inreg_p2: 4493 case Intrinsic::amdgcn_interp_inreg_p10_f16: 4494 case Intrinsic::amdgcn_interp_inreg_p2_f16: { 4495 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4496 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize); 4497 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); 4498 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); 4499 OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); 4500 break; 4501 } 4502 case Intrinsic::amdgcn_ballot: { 4503 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4504 unsigned SrcSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits(); 4505 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize); 4506 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, SrcSize); 4507 break; 4508 } 4509 } 4510 break; 4511 } 4512 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD: 4513 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16: 4514 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE: 4515 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16: { 4516 auto IntrID = MI.getIntrinsicID(); 4517 const AMDGPU::RsrcIntrinsic *RSrcIntrin = AMDGPU::lookupRsrcIntrinsic(IntrID); 4518 assert(RSrcIntrin && "missing RsrcIntrinsic for image intrinsic"); 4519 // Non-images can have complications from operands that allow both SGPR 4520 // and VGPR. For now it's too complicated to figure out the final opcode 4521 // to derive the register bank from the MCInstrDesc. 
4522 assert(RSrcIntrin->IsImage); 4523 return getImageMapping(MRI, MI, RSrcIntrin->RsrcArg); 4524 } 4525 case AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY: { 4526 unsigned N = MI.getNumExplicitOperands() - 2; 4527 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 128); 4528 OpdsMapping[N] = getSGPROpMapping(MI.getOperand(N).getReg(), MRI, *TRI); 4529 if (N == 3) { 4530 // Sequential form: all operands combined into VGPR256/VGPR512 4531 unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits(); 4532 if (Size > 256) 4533 Size = 512; 4534 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); 4535 } else { 4536 // NSA form 4537 for (unsigned I = 2; I < N; ++I) { 4538 unsigned Size = MRI.getType(MI.getOperand(I).getReg()).getSizeInBits(); 4539 OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); 4540 } 4541 } 4542 break; 4543 } 4544 case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: { 4545 auto IntrID = MI.getIntrinsicID(); 4546 switch (IntrID) { 4547 case Intrinsic::amdgcn_s_getreg: 4548 case Intrinsic::amdgcn_s_memtime: 4549 case Intrinsic::amdgcn_s_memrealtime: 4550 case Intrinsic::amdgcn_s_get_waveid_in_workgroup: 4551 case Intrinsic::amdgcn_s_sendmsg_rtn: { 4552 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4553 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); 4554 break; 4555 } 4556 case Intrinsic::amdgcn_global_atomic_fadd: 4557 case Intrinsic::amdgcn_global_atomic_csub: 4558 case Intrinsic::amdgcn_global_atomic_fmin: 4559 case Intrinsic::amdgcn_global_atomic_fmax: 4560 case Intrinsic::amdgcn_flat_atomic_fadd: 4561 case Intrinsic::amdgcn_flat_atomic_fmin: 4562 case Intrinsic::amdgcn_flat_atomic_fmax: 4563 case Intrinsic::amdgcn_global_atomic_fadd_v2bf16: 4564 case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16: 4565 return getDefaultMappingAllVGPR(MI); 4566 case Intrinsic::amdgcn_ds_ordered_add: 4567 case Intrinsic::amdgcn_ds_ordered_swap: 4568 case Intrinsic::amdgcn_ds_fadd_v2bf16: { 4569 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4570 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize); 4571 unsigned M0Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, 4572 AMDGPU::SGPRRegBankID); 4573 OpdsMapping[2] = AMDGPU::getValueMapping(M0Bank, 32); 4574 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); 4575 break; 4576 } 4577 case Intrinsic::amdgcn_ds_append: 4578 case Intrinsic::amdgcn_ds_consume: { 4579 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4580 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize); 4581 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); 4582 break; 4583 } 4584 case Intrinsic::amdgcn_exp_compr: 4585 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); 4586 OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); 4587 break; 4588 case Intrinsic::amdgcn_exp: 4589 // FIXME: Could we support packed types here? 
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      OpdsMapping[5] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      OpdsMapping[6] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      break;
    case Intrinsic::amdgcn_exp_row:
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      OpdsMapping[5] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      OpdsMapping[6] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      OpdsMapping[8] = getSGPROpMapping(MI.getOperand(8).getReg(), MRI, *TRI);
      break;
    case Intrinsic::amdgcn_s_sendmsg:
    case Intrinsic::amdgcn_s_sendmsghalt: {
      // This must be an SGPR, but accept a VGPR.
      unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
                                   AMDGPU::SGPRRegBankID);
      OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
      break;
    }
    case Intrinsic::amdgcn_s_setreg: {
      // This must be an SGPR, but accept a VGPR.
      unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
                                   AMDGPU::SGPRRegBankID);
      OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
      break;
    }
    case Intrinsic::amdgcn_end_cf: {
      unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
      break;
    }
    case Intrinsic::amdgcn_else: {
      unsigned WaveSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize);
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize);
      break;
    }
    case Intrinsic::amdgcn_live_mask: {
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
      break;
    }
    case Intrinsic::amdgcn_wqm_demote:
    case Intrinsic::amdgcn_kill: {
      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
      break;
    }
    case Intrinsic::amdgcn_raw_buffer_load:
    case Intrinsic::amdgcn_raw_tbuffer_load: {
      // FIXME: Should make intrinsic ID the last operand of the instruction,
      // then this would be the same as store.
      OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
      OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_raw_buffer_load_lds: {
      OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
      OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_raw_buffer_store:
    case Intrinsic::amdgcn_raw_buffer_store_format:
    case Intrinsic::amdgcn_raw_tbuffer_store: {
      OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
      OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
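      // The resource (2) and soffset (4) operands must ultimately land in
      // SGPRs; getSGPROpMapping reports the SGPR bank unconditionally, and
      // applyMapping repairs divergent values with readfirstlane/waterfall.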
      break;
    }
    case Intrinsic::amdgcn_struct_buffer_load:
    case Intrinsic::amdgcn_struct_tbuffer_load: {
      OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
      OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
      OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_struct_buffer_load_lds: {
      OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
      OpdsMapping[5] = getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
      OpdsMapping[6] = getSGPROpMapping(MI.getOperand(6).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_struct_buffer_store:
    case Intrinsic::amdgcn_struct_tbuffer_store: {
      OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
      OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
      OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_init_exec_from_input: {
      unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
      break;
    }
    case Intrinsic::amdgcn_ds_gws_init:
    case Intrinsic::amdgcn_ds_gws_barrier:
    case Intrinsic::amdgcn_ds_gws_sema_br: {
      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);

      // This must be an SGPR, but accept a VGPR.
      unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
                                   AMDGPU::SGPRRegBankID);
      OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
      break;
    }
    case Intrinsic::amdgcn_ds_gws_sema_v:
    case Intrinsic::amdgcn_ds_gws_sema_p:
    case Intrinsic::amdgcn_ds_gws_sema_release_all: {
      // This must be an SGPR, but accept a VGPR.
      unsigned Bank = getRegBankID(MI.getOperand(1).getReg(), MRI,
                                   AMDGPU::SGPRRegBankID);
      OpdsMapping[1] = AMDGPU::getValueMapping(Bank, 32);
      break;
    }
    case Intrinsic::amdgcn_global_load_lds: {
      OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_lds_direct_load: {
      const int M0Idx = MI.getNumOperands() - 1;
      Register M0Reg = MI.getOperand(M0Idx).getReg();
      unsigned M0Bank = getRegBankID(M0Reg, MRI, AMDGPU::SGPRRegBankID);
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();

      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
      for (int I = 2; I != M0Idx && MI.getOperand(I).isReg(); ++I)
        OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);

      // Must be SGPR, but we must take whatever the original bank is and fix it
      // later.
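      // (applyMapping typically repairs a VGPR value here with a
      // readfirstlane before it is copied to m0.)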
      OpdsMapping[M0Idx] = AMDGPU::getValueMapping(M0Bank, 32);
      break;
    }
    case Intrinsic::amdgcn_ds_add_gs_reg_rtn:
    case Intrinsic::amdgcn_ds_sub_gs_reg_rtn:
      OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
      OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      break;
    case Intrinsic::amdgcn_ds_bvh_stack_rtn: {
      OpdsMapping[0] =
          getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); // %vdst
      OpdsMapping[1] =
          getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI); // %addr
      OpdsMapping[3] =
          getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); // %addr
      OpdsMapping[4] =
          getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); // %data0
      OpdsMapping[5] =
          getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI); // %data1
      break;
    }

    default:
      return getInvalidInstructionMapping();
    }
    break;
  }
  case AMDGPU::G_SELECT: {
    unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
                                    AMDGPU::SGPRRegBankID);
    unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI,
                                    AMDGPU::SGPRRegBankID);
    bool SGPRSrcs = Op2Bank == AMDGPU::SGPRRegBankID &&
                    Op3Bank == AMDGPU::SGPRRegBankID;

    unsigned CondBankDefault = SGPRSrcs ?
      AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID;
    unsigned CondBank = getRegBankID(MI.getOperand(1).getReg(), MRI,
                                     CondBankDefault);
    if (CondBank == AMDGPU::SGPRRegBankID)
      CondBank = SGPRSrcs ? AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID;
    else if (CondBank == AMDGPU::VGPRRegBankID)
      CondBank = AMDGPU::VCCRegBankID;

    unsigned Bank = SGPRSrcs && CondBank == AMDGPU::SGPRRegBankID ?
      AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;

    assert(CondBank == AMDGPU::VCCRegBankID ||
           CondBank == AMDGPU::SGPRRegBankID);

    // TODO: Should report 32-bit for scalar condition type.
    if (Size == 64) {
      OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
      OpdsMapping[1] = AMDGPU::getValueMapping(CondBank, 1);
      OpdsMapping[2] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
      OpdsMapping[3] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
    } else {
      OpdsMapping[0] = AMDGPU::getValueMapping(Bank, Size);
      OpdsMapping[1] = AMDGPU::getValueMapping(CondBank, 1);
      OpdsMapping[2] = AMDGPU::getValueMapping(Bank, Size);
      OpdsMapping[3] = AMDGPU::getValueMapping(Bank, Size);
    }

    break;
  }

  case AMDGPU::G_SI_CALL: {
    OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 64);
    // Lie and claim everything is legal, even though some need to be
    // SGPRs. applyMapping will have to deal with it as a waterfall loop.
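    // A divergent callee (operand 1) forces the call itself to execute
    // inside the waterfall loop.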
    OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);

    // Allow anything for implicit arguments.
    for (unsigned I = 4; I < MI.getNumOperands(); ++I) {
      if (MI.getOperand(I).isReg()) {
        Register Reg = MI.getOperand(I).getReg();
        auto OpBank = getRegBankID(Reg, MRI);
        unsigned Size = getSizeInBits(Reg, MRI, *TRI);
        OpdsMapping[I] = AMDGPU::getValueMapping(OpBank, Size);
      }
    }
    break;
  }
  case AMDGPU::G_LOAD:
  case AMDGPU::G_ZEXTLOAD:
  case AMDGPU::G_SEXTLOAD:
    return getInstrMappingForLoad(MI);

  case AMDGPU::G_ATOMICRMW_XCHG:
  case AMDGPU::G_ATOMICRMW_ADD:
  case AMDGPU::G_ATOMICRMW_SUB:
  case AMDGPU::G_ATOMICRMW_AND:
  case AMDGPU::G_ATOMICRMW_OR:
  case AMDGPU::G_ATOMICRMW_XOR:
  case AMDGPU::G_ATOMICRMW_MAX:
  case AMDGPU::G_ATOMICRMW_MIN:
  case AMDGPU::G_ATOMICRMW_UMAX:
  case AMDGPU::G_ATOMICRMW_UMIN:
  case AMDGPU::G_ATOMICRMW_FADD:
  case AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG:
  case AMDGPU::G_AMDGPU_ATOMIC_INC:
  case AMDGPU::G_AMDGPU_ATOMIC_DEC:
  case AMDGPU::G_AMDGPU_ATOMIC_FMIN:
  case AMDGPU::G_AMDGPU_ATOMIC_FMAX: {
    OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
    OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg());
    OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
    break;
  }
  case AMDGPU::G_ATOMIC_CMPXCHG: {
    OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
    OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg());
    OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
    OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
    break;
  }
  case AMDGPU::G_BRCOND: {
    unsigned Bank = getRegBankID(MI.getOperand(0).getReg(), MRI,
                                 AMDGPU::SGPRRegBankID);
    assert(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1);
    if (Bank != AMDGPU::SGPRRegBankID)
      Bank = AMDGPU::VCCRegBankID;

    OpdsMapping[0] = AMDGPU::getValueMapping(Bank, 1);
    break;
  }
  case AMDGPU::G_FPTRUNC_ROUND_UPWARD:
  case AMDGPU::G_FPTRUNC_ROUND_DOWNWARD:
    return getDefaultMappingVOP(MI);
  }

  return getInstructionMapping(/*ID*/1, /*Cost*/1,
                               getOperandsMapping(OpdsMapping),
                               MI.getNumOperands());
}