1 //===-- SIRegisterInfo.cpp - SI Register Information ---------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 /// \file 10 /// SI implementation of the TargetRegisterInfo class. 11 // 12 //===----------------------------------------------------------------------===// 13 14 #include "SIRegisterInfo.h" 15 #include "AMDGPU.h" 16 #include "AMDGPURegisterBankInfo.h" 17 #include "GCNSubtarget.h" 18 #include "MCTargetDesc/AMDGPUInstPrinter.h" 19 #include "MCTargetDesc/AMDGPUMCTargetDesc.h" 20 #include "SIMachineFunctionInfo.h" 21 #include "llvm/CodeGen/LiveIntervals.h" 22 #include "llvm/CodeGen/MachineDominators.h" 23 #include "llvm/CodeGen/RegisterScavenging.h" 24 25 using namespace llvm; 26 27 #define GET_REGINFO_TARGET_DESC 28 #include "AMDGPUGenRegisterInfo.inc" 29 30 static cl::opt<bool> EnableSpillSGPRToVGPR( 31 "amdgpu-spill-sgpr-to-vgpr", 32 cl::desc("Enable spilling VGPRs to SGPRs"), 33 cl::ReallyHidden, 34 cl::init(true)); 35 36 std::array<std::vector<int16_t>, 16> SIRegisterInfo::RegSplitParts; 37 std::array<std::array<uint16_t, 32>, 9> SIRegisterInfo::SubRegFromChannelTable; 38 39 // Map numbers of DWORDs to indexes in SubRegFromChannelTable. 40 // Valid indexes are shifted 1, such that a 0 mapping means unsupported. 41 // e.g. for 8 DWORDs (256-bit), SubRegFromChannelTableWidthMap[8] = 8, 42 // meaning index 7 in SubRegFromChannelTable. 43 static const std::array<unsigned, 17> SubRegFromChannelTableWidthMap = { 44 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 0, 0, 0, 0, 0, 0, 9}; 45 46 SIRegisterInfo::SIRegisterInfo(const GCNSubtarget &ST) 47 : AMDGPUGenRegisterInfo(AMDGPU::PC_REG, ST.getAMDGPUDwarfFlavour()), ST(ST), 48 SpillSGPRToVGPR(EnableSpillSGPRToVGPR), isWave32(ST.isWave32()) { 49 50 assert(getSubRegIndexLaneMask(AMDGPU::sub0).getAsInteger() == 3 && 51 getSubRegIndexLaneMask(AMDGPU::sub31).getAsInteger() == (3ULL << 62) && 52 (getSubRegIndexLaneMask(AMDGPU::lo16) | 53 getSubRegIndexLaneMask(AMDGPU::hi16)).getAsInteger() == 54 getSubRegIndexLaneMask(AMDGPU::sub0).getAsInteger() && 55 "getNumCoveredRegs() will not work with generated subreg masks!"); 56 57 RegPressureIgnoredUnits.resize(getNumRegUnits()); 58 RegPressureIgnoredUnits.set( 59 *MCRegUnitIterator(MCRegister::from(AMDGPU::M0), this)); 60 for (auto Reg : AMDGPU::VGPR_HI16RegClass) 61 RegPressureIgnoredUnits.set(*MCRegUnitIterator(Reg, this)); 62 63 // HACK: Until this is fully tablegen'd. 64 static llvm::once_flag InitializeRegSplitPartsFlag; 65 66 static auto InitializeRegSplitPartsOnce = [this]() { 67 for (unsigned Idx = 1, E = getNumSubRegIndices() - 1; Idx < E; ++Idx) { 68 unsigned Size = getSubRegIdxSize(Idx); 69 if (Size & 31) 70 continue; 71 std::vector<int16_t> &Vec = RegSplitParts[Size / 32 - 1]; 72 unsigned Pos = getSubRegIdxOffset(Idx); 73 if (Pos % Size) 74 continue; 75 Pos /= Size; 76 if (Vec.empty()) { 77 unsigned MaxNumParts = 1024 / Size; // Maximum register is 1024 bits. 78 Vec.resize(MaxNumParts); 79 } 80 Vec[Pos] = Idx; 81 } 82 }; 83 84 static llvm::once_flag InitializeSubRegFromChannelTableFlag; 85 86 static auto InitializeSubRegFromChannelTableOnce = [this]() { 87 for (auto &Row : SubRegFromChannelTable) 88 Row.fill(AMDGPU::NoSubRegister); 89 for (uint16_t Idx = 1; Idx < getNumSubRegIndices(); ++Idx) { 90 unsigned Width = AMDGPUSubRegIdxRanges[Idx].Size / 32; 91 unsigned Offset = AMDGPUSubRegIdxRanges[Idx].Offset / 32; 92 assert(Width < SubRegFromChannelTableWidthMap.size()); 93 Width = SubRegFromChannelTableWidthMap[Width]; 94 if (Width == 0) 95 continue; 96 unsigned TableIdx = Width - 1; 97 assert(TableIdx < SubRegFromChannelTable.size()); 98 assert(Offset < SubRegFromChannelTable[TableIdx].size()); 99 SubRegFromChannelTable[TableIdx][Offset] = Idx; 100 } 101 }; 102 103 llvm::call_once(InitializeRegSplitPartsFlag, InitializeRegSplitPartsOnce); 104 llvm::call_once(InitializeSubRegFromChannelTableFlag, 105 InitializeSubRegFromChannelTableOnce); 106 } 107 108 void SIRegisterInfo::reserveRegisterTuples(BitVector &Reserved, 109 MCRegister Reg) const { 110 MCRegAliasIterator R(Reg, this, true); 111 112 for (; R.isValid(); ++R) 113 Reserved.set(*R); 114 } 115 116 // Forced to be here by one .inc 117 const MCPhysReg *SIRegisterInfo::getCalleeSavedRegs( 118 const MachineFunction *MF) const { 119 CallingConv::ID CC = MF->getFunction().getCallingConv(); 120 switch (CC) { 121 case CallingConv::C: 122 case CallingConv::Fast: 123 case CallingConv::Cold: 124 case CallingConv::AMDGPU_Gfx: 125 return CSR_AMDGPU_HighRegs_SaveList; 126 default: { 127 // Dummy to not crash RegisterClassInfo. 128 static const MCPhysReg NoCalleeSavedReg = AMDGPU::NoRegister; 129 return &NoCalleeSavedReg; 130 } 131 } 132 } 133 134 const MCPhysReg * 135 SIRegisterInfo::getCalleeSavedRegsViaCopy(const MachineFunction *MF) const { 136 return nullptr; 137 } 138 139 const uint32_t *SIRegisterInfo::getCallPreservedMask(const MachineFunction &MF, 140 CallingConv::ID CC) const { 141 switch (CC) { 142 case CallingConv::C: 143 case CallingConv::Fast: 144 case CallingConv::Cold: 145 case CallingConv::AMDGPU_Gfx: 146 return CSR_AMDGPU_HighRegs_RegMask; 147 default: 148 return nullptr; 149 } 150 } 151 152 const uint32_t *SIRegisterInfo::getNoPreservedMask() const { 153 return CSR_AMDGPU_NoRegs_RegMask; 154 } 155 156 Register SIRegisterInfo::getFrameRegister(const MachineFunction &MF) const { 157 const SIFrameLowering *TFI = 158 MF.getSubtarget<GCNSubtarget>().getFrameLowering(); 159 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); 160 // During ISel lowering we always reserve the stack pointer in entry 161 // functions, but never actually want to reference it when accessing our own 162 // frame. If we need a frame pointer we use it, but otherwise we can just use 163 // an immediate "0" which we represent by returning NoRegister. 164 if (FuncInfo->isEntryFunction()) { 165 return TFI->hasFP(MF) ? FuncInfo->getFrameOffsetReg() : Register(); 166 } 167 return TFI->hasFP(MF) ? FuncInfo->getFrameOffsetReg() 168 : FuncInfo->getStackPtrOffsetReg(); 169 } 170 171 bool SIRegisterInfo::hasBasePointer(const MachineFunction &MF) const { 172 // When we need stack realignment, we can't reference off of the 173 // stack pointer, so we reserve a base pointer. 174 const MachineFrameInfo &MFI = MF.getFrameInfo(); 175 return MFI.getNumFixedObjects() && needsStackRealignment(MF); 176 } 177 178 Register SIRegisterInfo::getBaseRegister() const { return AMDGPU::SGPR34; } 179 180 const uint32_t *SIRegisterInfo::getAllVGPRRegMask() const { 181 return CSR_AMDGPU_AllVGPRs_RegMask; 182 } 183 184 const uint32_t *SIRegisterInfo::getAllAllocatableSRegMask() const { 185 return CSR_AMDGPU_AllAllocatableSRegs_RegMask; 186 } 187 188 unsigned SIRegisterInfo::getSubRegFromChannel(unsigned Channel, 189 unsigned NumRegs) { 190 assert(NumRegs < SubRegFromChannelTableWidthMap.size()); 191 unsigned NumRegIndex = SubRegFromChannelTableWidthMap[NumRegs]; 192 assert(NumRegIndex && "Not implemented"); 193 assert(Channel < SubRegFromChannelTable[NumRegIndex - 1].size()); 194 return SubRegFromChannelTable[NumRegIndex - 1][Channel]; 195 } 196 197 MCRegister SIRegisterInfo::reservedPrivateSegmentBufferReg( 198 const MachineFunction &MF) const { 199 unsigned BaseIdx = alignDown(ST.getMaxNumSGPRs(MF), 4) - 4; 200 MCRegister BaseReg(AMDGPU::SGPR_32RegClass.getRegister(BaseIdx)); 201 return getMatchingSuperReg(BaseReg, AMDGPU::sub0, &AMDGPU::SGPR_128RegClass); 202 } 203 204 BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { 205 BitVector Reserved(getNumRegs()); 206 Reserved.set(AMDGPU::MODE); 207 208 // EXEC_LO and EXEC_HI could be allocated and used as regular register, but 209 // this seems likely to result in bugs, so I'm marking them as reserved. 210 reserveRegisterTuples(Reserved, AMDGPU::EXEC); 211 reserveRegisterTuples(Reserved, AMDGPU::FLAT_SCR); 212 213 // M0 has to be reserved so that llvm accepts it as a live-in into a block. 214 reserveRegisterTuples(Reserved, AMDGPU::M0); 215 216 // Reserve src_vccz, src_execz, src_scc. 217 reserveRegisterTuples(Reserved, AMDGPU::SRC_VCCZ); 218 reserveRegisterTuples(Reserved, AMDGPU::SRC_EXECZ); 219 reserveRegisterTuples(Reserved, AMDGPU::SRC_SCC); 220 221 // Reserve the memory aperture registers. 222 reserveRegisterTuples(Reserved, AMDGPU::SRC_SHARED_BASE); 223 reserveRegisterTuples(Reserved, AMDGPU::SRC_SHARED_LIMIT); 224 reserveRegisterTuples(Reserved, AMDGPU::SRC_PRIVATE_BASE); 225 reserveRegisterTuples(Reserved, AMDGPU::SRC_PRIVATE_LIMIT); 226 227 // Reserve src_pops_exiting_wave_id - support is not implemented in Codegen. 228 reserveRegisterTuples(Reserved, AMDGPU::SRC_POPS_EXITING_WAVE_ID); 229 230 // Reserve xnack_mask registers - support is not implemented in Codegen. 231 reserveRegisterTuples(Reserved, AMDGPU::XNACK_MASK); 232 233 // Reserve lds_direct register - support is not implemented in Codegen. 234 reserveRegisterTuples(Reserved, AMDGPU::LDS_DIRECT); 235 236 // Reserve Trap Handler registers - support is not implemented in Codegen. 237 reserveRegisterTuples(Reserved, AMDGPU::TBA); 238 reserveRegisterTuples(Reserved, AMDGPU::TMA); 239 reserveRegisterTuples(Reserved, AMDGPU::TTMP0_TTMP1); 240 reserveRegisterTuples(Reserved, AMDGPU::TTMP2_TTMP3); 241 reserveRegisterTuples(Reserved, AMDGPU::TTMP4_TTMP5); 242 reserveRegisterTuples(Reserved, AMDGPU::TTMP6_TTMP7); 243 reserveRegisterTuples(Reserved, AMDGPU::TTMP8_TTMP9); 244 reserveRegisterTuples(Reserved, AMDGPU::TTMP10_TTMP11); 245 reserveRegisterTuples(Reserved, AMDGPU::TTMP12_TTMP13); 246 reserveRegisterTuples(Reserved, AMDGPU::TTMP14_TTMP15); 247 248 // Reserve null register - it shall never be allocated 249 reserveRegisterTuples(Reserved, AMDGPU::SGPR_NULL); 250 251 // Disallow vcc_hi allocation in wave32. It may be allocated but most likely 252 // will result in bugs. 253 if (isWave32) { 254 Reserved.set(AMDGPU::VCC); 255 Reserved.set(AMDGPU::VCC_HI); 256 } 257 258 unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF); 259 unsigned TotalNumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs(); 260 for (unsigned i = MaxNumSGPRs; i < TotalNumSGPRs; ++i) { 261 unsigned Reg = AMDGPU::SGPR_32RegClass.getRegister(i); 262 reserveRegisterTuples(Reserved, Reg); 263 } 264 265 unsigned MaxNumVGPRs = ST.getMaxNumVGPRs(MF); 266 unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs(); 267 for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i) { 268 unsigned Reg = AMDGPU::VGPR_32RegClass.getRegister(i); 269 reserveRegisterTuples(Reserved, Reg); 270 Reg = AMDGPU::AGPR_32RegClass.getRegister(i); 271 reserveRegisterTuples(Reserved, Reg); 272 } 273 274 for (auto Reg : AMDGPU::SReg_32RegClass) { 275 Reserved.set(getSubReg(Reg, AMDGPU::hi16)); 276 Register Low = getSubReg(Reg, AMDGPU::lo16); 277 // This is to prevent BB vcc liveness errors. 278 if (!AMDGPU::SGPR_LO16RegClass.contains(Low)) 279 Reserved.set(Low); 280 } 281 282 for (auto Reg : AMDGPU::AGPR_32RegClass) { 283 Reserved.set(getSubReg(Reg, AMDGPU::hi16)); 284 } 285 286 // Reserve all the rest AGPRs if there are no instructions to use it. 287 if (!ST.hasMAIInsts()) { 288 for (unsigned i = 0; i < MaxNumVGPRs; ++i) { 289 unsigned Reg = AMDGPU::AGPR_32RegClass.getRegister(i); 290 reserveRegisterTuples(Reserved, Reg); 291 } 292 } 293 294 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 295 296 Register ScratchRSrcReg = MFI->getScratchRSrcReg(); 297 if (ScratchRSrcReg != AMDGPU::NoRegister) { 298 // Reserve 4 SGPRs for the scratch buffer resource descriptor in case we need 299 // to spill. 300 // TODO: May need to reserve a VGPR if doing LDS spilling. 301 reserveRegisterTuples(Reserved, ScratchRSrcReg); 302 } 303 304 // We have to assume the SP is needed in case there are calls in the function, 305 // which is detected after the function is lowered. If we aren't really going 306 // to need SP, don't bother reserving it. 307 MCRegister StackPtrReg = MFI->getStackPtrOffsetReg(); 308 309 if (StackPtrReg) { 310 reserveRegisterTuples(Reserved, StackPtrReg); 311 assert(!isSubRegister(ScratchRSrcReg, StackPtrReg)); 312 } 313 314 MCRegister FrameReg = MFI->getFrameOffsetReg(); 315 if (FrameReg) { 316 reserveRegisterTuples(Reserved, FrameReg); 317 assert(!isSubRegister(ScratchRSrcReg, FrameReg)); 318 } 319 320 if (hasBasePointer(MF)) { 321 MCRegister BasePtrReg = getBaseRegister(); 322 reserveRegisterTuples(Reserved, BasePtrReg); 323 assert(!isSubRegister(ScratchRSrcReg, BasePtrReg)); 324 } 325 326 for (MCRegister Reg : MFI->WWMReservedRegs) { 327 reserveRegisterTuples(Reserved, Reg); 328 } 329 330 // FIXME: Stop using reserved registers for this. 331 for (MCPhysReg Reg : MFI->getAGPRSpillVGPRs()) 332 reserveRegisterTuples(Reserved, Reg); 333 334 for (MCPhysReg Reg : MFI->getVGPRSpillAGPRs()) 335 reserveRegisterTuples(Reserved, Reg); 336 337 for (auto SSpill : MFI->getSGPRSpillVGPRs()) 338 reserveRegisterTuples(Reserved, SSpill.VGPR); 339 340 return Reserved; 341 } 342 343 bool SIRegisterInfo::canRealignStack(const MachineFunction &MF) const { 344 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 345 // On entry, the base address is 0, so it can't possibly need any more 346 // alignment. 347 348 // FIXME: Should be able to specify the entry frame alignment per calling 349 // convention instead. 350 if (Info->isEntryFunction()) 351 return false; 352 353 return TargetRegisterInfo::canRealignStack(MF); 354 } 355 356 bool SIRegisterInfo::requiresRegisterScavenging(const MachineFunction &Fn) const { 357 const SIMachineFunctionInfo *Info = Fn.getInfo<SIMachineFunctionInfo>(); 358 if (Info->isEntryFunction()) { 359 const MachineFrameInfo &MFI = Fn.getFrameInfo(); 360 return MFI.hasStackObjects() || MFI.hasCalls(); 361 } 362 363 // May need scavenger for dealing with callee saved registers. 364 return true; 365 } 366 367 bool SIRegisterInfo::requiresFrameIndexScavenging( 368 const MachineFunction &MF) const { 369 // Do not use frame virtual registers. They used to be used for SGPRs, but 370 // once we reach PrologEpilogInserter, we can no longer spill SGPRs. If the 371 // scavenger fails, we can increment/decrement the necessary SGPRs to avoid a 372 // spill. 373 return false; 374 } 375 376 bool SIRegisterInfo::requiresFrameIndexReplacementScavenging( 377 const MachineFunction &MF) const { 378 const MachineFrameInfo &MFI = MF.getFrameInfo(); 379 return MFI.hasStackObjects(); 380 } 381 382 bool SIRegisterInfo::requiresVirtualBaseRegisters( 383 const MachineFunction &) const { 384 // There are no special dedicated stack or frame pointers. 385 return true; 386 } 387 388 int64_t SIRegisterInfo::getScratchInstrOffset(const MachineInstr *MI) const { 389 assert(SIInstrInfo::isMUBUF(*MI) || SIInstrInfo::isFLATScratch(*MI)); 390 391 int OffIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), 392 AMDGPU::OpName::offset); 393 return MI->getOperand(OffIdx).getImm(); 394 } 395 396 int64_t SIRegisterInfo::getFrameIndexInstrOffset(const MachineInstr *MI, 397 int Idx) const { 398 if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isFLATScratch(*MI)) 399 return 0; 400 401 assert((Idx == AMDGPU::getNamedOperandIdx(MI->getOpcode(), 402 AMDGPU::OpName::vaddr) || 403 (Idx == AMDGPU::getNamedOperandIdx(MI->getOpcode(), 404 AMDGPU::OpName::saddr))) && 405 "Should never see frame index on non-address operand"); 406 407 return getScratchInstrOffset(MI); 408 } 409 410 bool SIRegisterInfo::needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const { 411 if (!MI->mayLoadOrStore()) 412 return false; 413 414 int64_t FullOffset = Offset + getScratchInstrOffset(MI); 415 416 if (SIInstrInfo::isMUBUF(*MI)) 417 return !SIInstrInfo::isLegalMUBUFImmOffset(FullOffset); 418 419 const SIInstrInfo *TII = ST.getInstrInfo(); 420 return !TII->isLegalFLATOffset(FullOffset, AMDGPUAS::PRIVATE_ADDRESS, true); 421 } 422 423 Register SIRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB, 424 int FrameIdx, 425 int64_t Offset) const { 426 MachineBasicBlock::iterator Ins = MBB->begin(); 427 DebugLoc DL; // Defaults to "unknown" 428 429 if (Ins != MBB->end()) 430 DL = Ins->getDebugLoc(); 431 432 MachineFunction *MF = MBB->getParent(); 433 const SIInstrInfo *TII = ST.getInstrInfo(); 434 MachineRegisterInfo &MRI = MF->getRegInfo(); 435 unsigned MovOpc = ST.enableFlatScratch() ? AMDGPU::S_MOV_B32 436 : AMDGPU::V_MOV_B32_e32; 437 438 Register BaseReg = MRI.createVirtualRegister( 439 ST.enableFlatScratch() ? &AMDGPU::SReg_32_XEXEC_HIRegClass 440 : &AMDGPU::VGPR_32RegClass); 441 442 if (Offset == 0) { 443 BuildMI(*MBB, Ins, DL, TII->get(MovOpc), BaseReg) 444 .addFrameIndex(FrameIdx); 445 return BaseReg; 446 } 447 448 Register OffsetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); 449 450 Register FIReg = MRI.createVirtualRegister( 451 ST.enableFlatScratch() ? &AMDGPU::SReg_32_XM0RegClass 452 : &AMDGPU::VGPR_32RegClass); 453 454 BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg) 455 .addImm(Offset); 456 BuildMI(*MBB, Ins, DL, TII->get(MovOpc), FIReg) 457 .addFrameIndex(FrameIdx); 458 459 if (ST.enableFlatScratch() ) { 460 BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::S_ADD_U32), BaseReg) 461 .addReg(OffsetReg, RegState::Kill) 462 .addReg(FIReg); 463 return BaseReg; 464 } 465 466 TII->getAddNoCarry(*MBB, Ins, DL, BaseReg) 467 .addReg(OffsetReg, RegState::Kill) 468 .addReg(FIReg) 469 .addImm(0); // clamp bit 470 471 return BaseReg; 472 } 473 474 void SIRegisterInfo::resolveFrameIndex(MachineInstr &MI, Register BaseReg, 475 int64_t Offset) const { 476 const SIInstrInfo *TII = ST.getInstrInfo(); 477 bool IsFlat = TII->isFLATScratch(MI); 478 479 #ifndef NDEBUG 480 // FIXME: Is it possible to be storing a frame index to itself? 481 bool SeenFI = false; 482 for (const MachineOperand &MO: MI.operands()) { 483 if (MO.isFI()) { 484 if (SeenFI) 485 llvm_unreachable("should not see multiple frame indices"); 486 487 SeenFI = true; 488 } 489 } 490 #endif 491 492 MachineOperand *FIOp = 493 TII->getNamedOperand(MI, IsFlat ? AMDGPU::OpName::saddr 494 : AMDGPU::OpName::vaddr); 495 496 MachineOperand *OffsetOp = TII->getNamedOperand(MI, AMDGPU::OpName::offset); 497 int64_t NewOffset = OffsetOp->getImm() + Offset; 498 499 assert(FIOp && FIOp->isFI() && "frame index must be address operand"); 500 assert(TII->isMUBUF(MI) || TII->isFLATScratch(MI)); 501 502 if (IsFlat) { 503 assert(TII->isLegalFLATOffset(NewOffset, AMDGPUAS::PRIVATE_ADDRESS, true) && 504 "offset should be legal"); 505 FIOp->ChangeToRegister(BaseReg, false); 506 OffsetOp->setImm(NewOffset); 507 return; 508 } 509 510 #ifndef NDEBUG 511 MachineOperand *SOffset = TII->getNamedOperand(MI, AMDGPU::OpName::soffset); 512 assert(SOffset->isImm() && SOffset->getImm() == 0); 513 #endif 514 515 assert(SIInstrInfo::isLegalMUBUFImmOffset(NewOffset) && 516 "offset should be legal"); 517 518 FIOp->ChangeToRegister(BaseReg, false); 519 OffsetOp->setImm(NewOffset); 520 } 521 522 bool SIRegisterInfo::isFrameOffsetLegal(const MachineInstr *MI, 523 Register BaseReg, 524 int64_t Offset) const { 525 if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isFLATScratch(*MI)) 526 return false; 527 528 int64_t NewOffset = Offset + getScratchInstrOffset(MI); 529 530 if (SIInstrInfo::isMUBUF(*MI)) 531 return SIInstrInfo::isLegalMUBUFImmOffset(NewOffset); 532 533 const SIInstrInfo *TII = ST.getInstrInfo(); 534 return TII->isLegalFLATOffset(NewOffset, AMDGPUAS::PRIVATE_ADDRESS, true); 535 } 536 537 const TargetRegisterClass *SIRegisterInfo::getPointerRegClass( 538 const MachineFunction &MF, unsigned Kind) const { 539 // This is inaccurate. It depends on the instruction and address space. The 540 // only place where we should hit this is for dealing with frame indexes / 541 // private accesses, so this is correct in that case. 542 return &AMDGPU::VGPR_32RegClass; 543 } 544 545 static unsigned getNumSubRegsForSpillOp(unsigned Op) { 546 547 switch (Op) { 548 case AMDGPU::SI_SPILL_S1024_SAVE: 549 case AMDGPU::SI_SPILL_S1024_RESTORE: 550 case AMDGPU::SI_SPILL_V1024_SAVE: 551 case AMDGPU::SI_SPILL_V1024_RESTORE: 552 case AMDGPU::SI_SPILL_A1024_SAVE: 553 case AMDGPU::SI_SPILL_A1024_RESTORE: 554 return 32; 555 case AMDGPU::SI_SPILL_S512_SAVE: 556 case AMDGPU::SI_SPILL_S512_RESTORE: 557 case AMDGPU::SI_SPILL_V512_SAVE: 558 case AMDGPU::SI_SPILL_V512_RESTORE: 559 case AMDGPU::SI_SPILL_A512_SAVE: 560 case AMDGPU::SI_SPILL_A512_RESTORE: 561 return 16; 562 case AMDGPU::SI_SPILL_S256_SAVE: 563 case AMDGPU::SI_SPILL_S256_RESTORE: 564 case AMDGPU::SI_SPILL_V256_SAVE: 565 case AMDGPU::SI_SPILL_V256_RESTORE: 566 case AMDGPU::SI_SPILL_A256_SAVE: 567 case AMDGPU::SI_SPILL_A256_RESTORE: 568 return 8; 569 case AMDGPU::SI_SPILL_S192_SAVE: 570 case AMDGPU::SI_SPILL_S192_RESTORE: 571 case AMDGPU::SI_SPILL_V192_SAVE: 572 case AMDGPU::SI_SPILL_V192_RESTORE: 573 case AMDGPU::SI_SPILL_A192_SAVE: 574 case AMDGPU::SI_SPILL_A192_RESTORE: 575 return 6; 576 case AMDGPU::SI_SPILL_S160_SAVE: 577 case AMDGPU::SI_SPILL_S160_RESTORE: 578 case AMDGPU::SI_SPILL_V160_SAVE: 579 case AMDGPU::SI_SPILL_V160_RESTORE: 580 case AMDGPU::SI_SPILL_A160_SAVE: 581 case AMDGPU::SI_SPILL_A160_RESTORE: 582 return 5; 583 case AMDGPU::SI_SPILL_S128_SAVE: 584 case AMDGPU::SI_SPILL_S128_RESTORE: 585 case AMDGPU::SI_SPILL_V128_SAVE: 586 case AMDGPU::SI_SPILL_V128_RESTORE: 587 case AMDGPU::SI_SPILL_A128_SAVE: 588 case AMDGPU::SI_SPILL_A128_RESTORE: 589 return 4; 590 case AMDGPU::SI_SPILL_S96_SAVE: 591 case AMDGPU::SI_SPILL_S96_RESTORE: 592 case AMDGPU::SI_SPILL_V96_SAVE: 593 case AMDGPU::SI_SPILL_V96_RESTORE: 594 case AMDGPU::SI_SPILL_A96_SAVE: 595 case AMDGPU::SI_SPILL_A96_RESTORE: 596 return 3; 597 case AMDGPU::SI_SPILL_S64_SAVE: 598 case AMDGPU::SI_SPILL_S64_RESTORE: 599 case AMDGPU::SI_SPILL_V64_SAVE: 600 case AMDGPU::SI_SPILL_V64_RESTORE: 601 case AMDGPU::SI_SPILL_A64_SAVE: 602 case AMDGPU::SI_SPILL_A64_RESTORE: 603 return 2; 604 case AMDGPU::SI_SPILL_S32_SAVE: 605 case AMDGPU::SI_SPILL_S32_RESTORE: 606 case AMDGPU::SI_SPILL_V32_SAVE: 607 case AMDGPU::SI_SPILL_V32_RESTORE: 608 case AMDGPU::SI_SPILL_A32_SAVE: 609 case AMDGPU::SI_SPILL_A32_RESTORE: 610 return 1; 611 default: llvm_unreachable("Invalid spill opcode"); 612 } 613 } 614 615 static int getOffsetMUBUFStore(unsigned Opc) { 616 switch (Opc) { 617 case AMDGPU::BUFFER_STORE_DWORD_OFFEN: 618 return AMDGPU::BUFFER_STORE_DWORD_OFFSET; 619 case AMDGPU::BUFFER_STORE_BYTE_OFFEN: 620 return AMDGPU::BUFFER_STORE_BYTE_OFFSET; 621 case AMDGPU::BUFFER_STORE_SHORT_OFFEN: 622 return AMDGPU::BUFFER_STORE_SHORT_OFFSET; 623 case AMDGPU::BUFFER_STORE_DWORDX2_OFFEN: 624 return AMDGPU::BUFFER_STORE_DWORDX2_OFFSET; 625 case AMDGPU::BUFFER_STORE_DWORDX4_OFFEN: 626 return AMDGPU::BUFFER_STORE_DWORDX4_OFFSET; 627 case AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFEN: 628 return AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFSET; 629 case AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFEN: 630 return AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFSET; 631 default: 632 return -1; 633 } 634 } 635 636 static int getOffsetMUBUFLoad(unsigned Opc) { 637 switch (Opc) { 638 case AMDGPU::BUFFER_LOAD_DWORD_OFFEN: 639 return AMDGPU::BUFFER_LOAD_DWORD_OFFSET; 640 case AMDGPU::BUFFER_LOAD_UBYTE_OFFEN: 641 return AMDGPU::BUFFER_LOAD_UBYTE_OFFSET; 642 case AMDGPU::BUFFER_LOAD_SBYTE_OFFEN: 643 return AMDGPU::BUFFER_LOAD_SBYTE_OFFSET; 644 case AMDGPU::BUFFER_LOAD_USHORT_OFFEN: 645 return AMDGPU::BUFFER_LOAD_USHORT_OFFSET; 646 case AMDGPU::BUFFER_LOAD_SSHORT_OFFEN: 647 return AMDGPU::BUFFER_LOAD_SSHORT_OFFSET; 648 case AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN: 649 return AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET; 650 case AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN: 651 return AMDGPU::BUFFER_LOAD_DWORDX4_OFFSET; 652 case AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFEN: 653 return AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFSET; 654 case AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFEN: 655 return AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFSET; 656 case AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFEN: 657 return AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFSET; 658 case AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFEN: 659 return AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFSET; 660 case AMDGPU::BUFFER_LOAD_SHORT_D16_OFFEN: 661 return AMDGPU::BUFFER_LOAD_SHORT_D16_OFFSET; 662 case AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFEN: 663 return AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFSET; 664 default: 665 return -1; 666 } 667 } 668 669 static MachineInstrBuilder spillVGPRtoAGPR(const GCNSubtarget &ST, 670 MachineBasicBlock::iterator MI, 671 int Index, 672 unsigned Lane, 673 unsigned ValueReg, 674 bool IsKill) { 675 MachineBasicBlock *MBB = MI->getParent(); 676 MachineFunction *MF = MI->getParent()->getParent(); 677 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); 678 const SIInstrInfo *TII = ST.getInstrInfo(); 679 680 MCPhysReg Reg = MFI->getVGPRToAGPRSpill(Index, Lane); 681 682 if (Reg == AMDGPU::NoRegister) 683 return MachineInstrBuilder(); 684 685 bool IsStore = MI->mayStore(); 686 MachineRegisterInfo &MRI = MF->getRegInfo(); 687 auto *TRI = static_cast<const SIRegisterInfo*>(MRI.getTargetRegisterInfo()); 688 689 unsigned Dst = IsStore ? Reg : ValueReg; 690 unsigned Src = IsStore ? ValueReg : Reg; 691 unsigned Opc = (IsStore ^ TRI->isVGPR(MRI, Reg)) ? AMDGPU::V_ACCVGPR_WRITE_B32_e64 692 : AMDGPU::V_ACCVGPR_READ_B32_e64; 693 694 auto MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(Opc), Dst) 695 .addReg(Src, getKillRegState(IsKill)); 696 MIB->setAsmPrinterFlag(MachineInstr::ReloadReuse); 697 return MIB; 698 } 699 700 // This differs from buildSpillLoadStore by only scavenging a VGPR. It does not 701 // need to handle the case where an SGPR may need to be spilled while spilling. 702 static bool buildMUBUFOffsetLoadStore(const GCNSubtarget &ST, 703 MachineFrameInfo &MFI, 704 MachineBasicBlock::iterator MI, 705 int Index, 706 int64_t Offset) { 707 const SIInstrInfo *TII = ST.getInstrInfo(); 708 MachineBasicBlock *MBB = MI->getParent(); 709 const DebugLoc &DL = MI->getDebugLoc(); 710 bool IsStore = MI->mayStore(); 711 712 unsigned Opc = MI->getOpcode(); 713 int LoadStoreOp = IsStore ? 714 getOffsetMUBUFStore(Opc) : getOffsetMUBUFLoad(Opc); 715 if (LoadStoreOp == -1) 716 return false; 717 718 const MachineOperand *Reg = TII->getNamedOperand(*MI, AMDGPU::OpName::vdata); 719 if (spillVGPRtoAGPR(ST, MI, Index, 0, Reg->getReg(), false).getInstr()) 720 return true; 721 722 MachineInstrBuilder NewMI = 723 BuildMI(*MBB, MI, DL, TII->get(LoadStoreOp)) 724 .add(*Reg) 725 .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::srsrc)) 726 .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)) 727 .addImm(Offset) 728 .addImm(0) // glc 729 .addImm(0) // slc 730 .addImm(0) // tfe 731 .addImm(0) // dlc 732 .addImm(0) // swz 733 .cloneMemRefs(*MI); 734 735 const MachineOperand *VDataIn = TII->getNamedOperand(*MI, 736 AMDGPU::OpName::vdata_in); 737 if (VDataIn) 738 NewMI.add(*VDataIn); 739 return true; 740 } 741 742 static unsigned getFlatScratchSpillOpcode(const SIInstrInfo *TII, 743 unsigned LoadStoreOp, 744 unsigned EltSize) { 745 bool IsStore = TII->get(LoadStoreOp).mayStore(); 746 bool UseST = 747 AMDGPU::getNamedOperandIdx(LoadStoreOp, AMDGPU::OpName::vaddr) < 0 && 748 AMDGPU::getNamedOperandIdx(LoadStoreOp, AMDGPU::OpName::saddr) < 0; 749 750 switch (EltSize) { 751 case 4: 752 LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORD_SADDR 753 : AMDGPU::SCRATCH_LOAD_DWORD_SADDR; 754 break; 755 case 8: 756 LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORDX2_SADDR 757 : AMDGPU::SCRATCH_LOAD_DWORDX2_SADDR; 758 break; 759 case 12: 760 LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORDX3_SADDR 761 : AMDGPU::SCRATCH_LOAD_DWORDX3_SADDR; 762 break; 763 case 16: 764 LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORDX4_SADDR 765 : AMDGPU::SCRATCH_LOAD_DWORDX4_SADDR; 766 break; 767 default: 768 llvm_unreachable("Unexpected spill load/store size!"); 769 } 770 771 if (UseST) 772 LoadStoreOp = AMDGPU::getFlatScratchInstSTfromSS(LoadStoreOp); 773 774 return LoadStoreOp; 775 } 776 777 void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI, 778 unsigned LoadStoreOp, 779 int Index, 780 Register ValueReg, 781 bool IsKill, 782 MCRegister ScratchOffsetReg, 783 int64_t InstOffset, 784 MachineMemOperand *MMO, 785 RegScavenger *RS) const { 786 MachineBasicBlock *MBB = MI->getParent(); 787 MachineFunction *MF = MI->getParent()->getParent(); 788 const SIInstrInfo *TII = ST.getInstrInfo(); 789 const MachineFrameInfo &MFI = MF->getFrameInfo(); 790 const SIMachineFunctionInfo *FuncInfo = MF->getInfo<SIMachineFunctionInfo>(); 791 792 const MCInstrDesc *Desc = &TII->get(LoadStoreOp); 793 const DebugLoc &DL = MI->getDebugLoc(); 794 bool IsStore = Desc->mayStore(); 795 bool IsFlat = TII->isFLATScratch(LoadStoreOp); 796 797 bool Scavenged = false; 798 MCRegister SOffset = ScratchOffsetReg; 799 800 const TargetRegisterClass *RC = getRegClassForReg(MF->getRegInfo(), ValueReg); 801 const bool IsAGPR = hasAGPRs(RC); 802 const unsigned RegWidth = AMDGPU::getRegBitWidth(RC->getID()) / 8; 803 804 // Always use 4 byte operations for AGPRs because we need to scavenge 805 // a temporary VGPR. 806 unsigned EltSize = (IsFlat && !IsAGPR) ? std::min(RegWidth, 16u) : 4u; 807 unsigned NumSubRegs = RegWidth / EltSize; 808 unsigned Size = NumSubRegs * EltSize; 809 unsigned RemSize = RegWidth - Size; 810 unsigned NumRemSubRegs = RemSize ? 1 : 0; 811 int64_t Offset = InstOffset + MFI.getObjectOffset(Index); 812 int64_t MaxOffset = Offset + Size + RemSize - EltSize; 813 int64_t ScratchOffsetRegDelta = 0; 814 815 if (IsFlat && EltSize > 4) { 816 LoadStoreOp = getFlatScratchSpillOpcode(TII, LoadStoreOp, EltSize); 817 Desc = &TII->get(LoadStoreOp); 818 } 819 820 Align Alignment = MFI.getObjectAlign(Index); 821 const MachinePointerInfo &BasePtrInfo = MMO->getPointerInfo(); 822 823 assert((IsFlat || ((Offset % EltSize) == 0)) && 824 "unexpected VGPR spill offset"); 825 826 bool IsOffsetLegal = IsFlat 827 ? TII->isLegalFLATOffset(MaxOffset, AMDGPUAS::PRIVATE_ADDRESS, true) 828 : SIInstrInfo::isLegalMUBUFImmOffset(MaxOffset); 829 if (!IsOffsetLegal || (IsFlat && !SOffset && !ST.hasFlatScratchSTMode())) { 830 SOffset = MCRegister(); 831 832 // We currently only support spilling VGPRs to EltSize boundaries, meaning 833 // we can simplify the adjustment of Offset here to just scale with 834 // WavefrontSize. 835 if (!IsFlat) 836 Offset *= ST.getWavefrontSize(); 837 838 // We don't have access to the register scavenger if this function is called 839 // during PEI::scavengeFrameVirtualRegs(). 840 if (RS) 841 SOffset = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, MI, 0, false); 842 843 if (!SOffset) { 844 // There are no free SGPRs, and since we are in the process of spilling 845 // VGPRs too. Since we need a VGPR in order to spill SGPRs (this is true 846 // on SI/CI and on VI it is true until we implement spilling using scalar 847 // stores), we have no way to free up an SGPR. Our solution here is to 848 // add the offset directly to the ScratchOffset or StackPtrOffset 849 // register, and then subtract the offset after the spill to return the 850 // register to it's original value. 851 if (!ScratchOffsetReg) 852 ScratchOffsetReg = FuncInfo->getStackPtrOffsetReg(); 853 SOffset = ScratchOffsetReg; 854 ScratchOffsetRegDelta = Offset; 855 } else { 856 Scavenged = true; 857 } 858 859 if (!SOffset) 860 report_fatal_error("could not scavenge SGPR to spill in entry function"); 861 862 if (ScratchOffsetReg == AMDGPU::NoRegister) { 863 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), SOffset) 864 .addImm(Offset); 865 } else { 866 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), SOffset) 867 .addReg(ScratchOffsetReg) 868 .addImm(Offset); 869 } 870 871 Offset = 0; 872 } 873 874 if (IsFlat && SOffset == AMDGPU::NoRegister) { 875 assert(AMDGPU::getNamedOperandIdx(LoadStoreOp, AMDGPU::OpName::vaddr) < 0 876 && "Unexpected vaddr for flat scratch with a FI operand"); 877 878 assert(ST.hasFlatScratchSTMode()); 879 LoadStoreOp = AMDGPU::getFlatScratchInstSTfromSS(LoadStoreOp); 880 Desc = &TII->get(LoadStoreOp); 881 } 882 883 Register TmpReg; 884 885 for (unsigned i = 0, e = NumSubRegs + NumRemSubRegs, RegOffset = 0; i != e; 886 ++i, RegOffset += EltSize) { 887 if (i == NumSubRegs) { 888 EltSize = RemSize; 889 LoadStoreOp = getFlatScratchSpillOpcode(TII, LoadStoreOp, EltSize); 890 } 891 Desc = &TII->get(LoadStoreOp); 892 893 unsigned NumRegs = EltSize / 4; 894 Register SubReg = e == 1 895 ? ValueReg 896 : Register(getSubReg(ValueReg, 897 getSubRegFromChannel(RegOffset / 4, NumRegs))); 898 899 unsigned SOffsetRegState = 0; 900 unsigned SrcDstRegState = getDefRegState(!IsStore); 901 if (i + 1 == e) { 902 SOffsetRegState |= getKillRegState(Scavenged); 903 // The last implicit use carries the "Kill" flag. 904 SrcDstRegState |= getKillRegState(IsKill); 905 } 906 907 // Make sure the whole register is defined if there are undef components by 908 // adding an implicit def of the super-reg on the first instruction. 909 bool NeedSuperRegDef = e > 1 && IsStore && i == 0; 910 bool NeedSuperRegImpOperand = e > 1; 911 912 unsigned Lane = RegOffset / 4; 913 unsigned LaneE = (RegOffset + EltSize) / 4; 914 for ( ; Lane != LaneE; ++Lane) { 915 bool IsSubReg = e > 1 || EltSize > 4; 916 Register Sub = IsSubReg 917 ? Register(getSubReg(ValueReg, getSubRegFromChannel(Lane))) 918 : ValueReg; 919 auto MIB = spillVGPRtoAGPR(ST, MI, Index, Lane, Sub, IsKill); 920 if (!MIB.getInstr()) 921 break; 922 if (NeedSuperRegDef || (IsSubReg && IsStore && Lane == 0)) { 923 MIB.addReg(ValueReg, RegState::ImplicitDefine); 924 NeedSuperRegDef = false; 925 } 926 if (IsSubReg || NeedSuperRegImpOperand) { 927 NeedSuperRegImpOperand = true; 928 unsigned State = SrcDstRegState; 929 if (Lane + 1 != LaneE) 930 State &= ~RegState::Kill; 931 MIB.addReg(ValueReg, RegState::Implicit | State); 932 } 933 } 934 935 if (Lane == LaneE) // Fully spilled into AGPRs. 936 continue; 937 938 // Offset in bytes from the beginning of the ValueReg to its portion we 939 // still need to spill. It may differ from RegOffset if a portion of 940 // current SubReg has been already spilled into AGPRs by the loop above. 941 unsigned RemRegOffset = Lane * 4; 942 unsigned RemEltSize = EltSize - (RemRegOffset - RegOffset); 943 if (RemEltSize != EltSize) { // Partially spilled to AGPRs 944 assert(IsFlat && EltSize > 4); 945 946 unsigned NumRegs = RemEltSize / 4; 947 SubReg = Register(getSubReg(ValueReg, 948 getSubRegFromChannel(RemRegOffset / 4, NumRegs))); 949 unsigned Opc = getFlatScratchSpillOpcode(TII, LoadStoreOp, RemEltSize); 950 Desc = &TII->get(Opc); 951 } 952 953 unsigned FinalReg = SubReg; 954 955 if (IsAGPR) { 956 assert(EltSize == 4); 957 958 if (!TmpReg) { 959 assert(RS && "Needs to have RegScavenger to spill an AGPR!"); 960 // FIXME: change to scavengeRegisterBackwards() 961 TmpReg = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0); 962 RS->setRegUsed(TmpReg); 963 } 964 if (IsStore) { 965 auto AccRead = BuildMI(*MBB, MI, DL, 966 TII->get(AMDGPU::V_ACCVGPR_READ_B32_e64), TmpReg) 967 .addReg(SubReg, getKillRegState(IsKill)); 968 if (NeedSuperRegDef) 969 AccRead.addReg(ValueReg, RegState::ImplicitDefine); 970 AccRead->setAsmPrinterFlag(MachineInstr::ReloadReuse); 971 } 972 SubReg = TmpReg; 973 } 974 975 MachinePointerInfo PInfo = BasePtrInfo.getWithOffset(RemRegOffset); 976 MachineMemOperand *NewMMO = 977 MF->getMachineMemOperand(PInfo, MMO->getFlags(), RemEltSize, 978 commonAlignment(Alignment, RemRegOffset)); 979 980 auto MIB = BuildMI(*MBB, MI, DL, *Desc) 981 .addReg(SubReg, 982 getDefRegState(!IsStore) | getKillRegState(IsKill)); 983 if (!IsFlat) 984 MIB.addReg(FuncInfo->getScratchRSrcReg()); 985 986 if (SOffset == AMDGPU::NoRegister) { 987 if (!IsFlat) 988 MIB.addImm(0); 989 } else { 990 MIB.addReg(SOffset, SOffsetRegState); 991 } 992 MIB.addImm(Offset + RemRegOffset) 993 .addImm(0) // glc 994 .addImm(0) // slc 995 .addImm(0); // tfe for MUBUF or dlc for FLAT 996 if (!IsFlat) 997 MIB.addImm(0) // dlc 998 .addImm(0); // swz 999 MIB.addMemOperand(NewMMO); 1000 1001 if (!IsAGPR && NeedSuperRegDef) 1002 MIB.addReg(ValueReg, RegState::ImplicitDefine); 1003 1004 if (!IsStore && TmpReg != AMDGPU::NoRegister) { 1005 MIB = BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), 1006 FinalReg) 1007 .addReg(TmpReg, RegState::Kill); 1008 MIB->setAsmPrinterFlag(MachineInstr::ReloadReuse); 1009 } 1010 1011 if (NeedSuperRegImpOperand) 1012 MIB.addReg(ValueReg, RegState::Implicit | SrcDstRegState); 1013 } 1014 1015 if (ScratchOffsetRegDelta != 0) { 1016 // Subtract the offset we added to the ScratchOffset register. 1017 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_SUB_U32), SOffset) 1018 .addReg(SOffset) 1019 .addImm(ScratchOffsetRegDelta); 1020 } 1021 } 1022 1023 // Generate a VMEM access which loads or stores the VGPR containing an SGPR 1024 // spill such that all the lanes set in VGPRLanes are loaded or stored. 1025 // This generates exec mask manipulation and will use SGPRs available in MI 1026 // or VGPR lanes in the VGPR to save and restore the exec mask. 1027 void SIRegisterInfo::buildSGPRSpillLoadStore(MachineBasicBlock::iterator MI, 1028 int Index, int Offset, 1029 unsigned EltSize, Register VGPR, 1030 int64_t VGPRLanes, 1031 RegScavenger *RS, 1032 bool IsLoad) const { 1033 MachineBasicBlock *MBB = MI->getParent(); 1034 MachineFunction *MF = MBB->getParent(); 1035 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); 1036 const SIInstrInfo *TII = ST.getInstrInfo(); 1037 1038 Register SuperReg = MI->getOperand(0).getReg(); 1039 const TargetRegisterClass *RC = getPhysRegClass(SuperReg); 1040 ArrayRef<int16_t> SplitParts = getRegSplitParts(RC, EltSize); 1041 unsigned NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size(); 1042 unsigned FirstPart = Offset * 32; 1043 unsigned ExecLane = 0; 1044 1045 bool IsKill = MI->getOperand(0).isKill(); 1046 const DebugLoc &DL = MI->getDebugLoc(); 1047 1048 // Cannot handle load/store to EXEC 1049 assert(SuperReg != AMDGPU::EXEC_LO && SuperReg != AMDGPU::EXEC_HI && 1050 SuperReg != AMDGPU::EXEC && "exec should never spill"); 1051 1052 // On Wave32 only handle EXEC_LO. 1053 // On Wave64 only update EXEC_HI if there is sufficent space for a copy. 1054 bool OnlyExecLo = isWave32 || NumSubRegs == 1 || SuperReg == AMDGPU::EXEC_HI; 1055 1056 unsigned ExecMovOpc = OnlyExecLo ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; 1057 Register ExecReg = OnlyExecLo ? AMDGPU::EXEC_LO : AMDGPU::EXEC; 1058 Register SavedExecReg; 1059 1060 // Backup EXEC 1061 if (OnlyExecLo) { 1062 SavedExecReg = 1063 NumSubRegs == 1 1064 ? SuperReg 1065 : Register(getSubReg(SuperReg, SplitParts[FirstPart + ExecLane])); 1066 } else { 1067 // If src/dst is an odd size it is possible subreg0 is not aligned. 1068 for (; ExecLane < (NumSubRegs - 1); ++ExecLane) { 1069 SavedExecReg = getMatchingSuperReg( 1070 getSubReg(SuperReg, SplitParts[FirstPart + ExecLane]), AMDGPU::sub0, 1071 &AMDGPU::SReg_64_XEXECRegClass); 1072 if (SavedExecReg) 1073 break; 1074 } 1075 } 1076 assert(SavedExecReg); 1077 BuildMI(*MBB, MI, DL, TII->get(ExecMovOpc), SavedExecReg).addReg(ExecReg); 1078 1079 // Setup EXEC 1080 BuildMI(*MBB, MI, DL, TII->get(ExecMovOpc), ExecReg).addImm(VGPRLanes); 1081 1082 // Load/store VGPR 1083 MachineFrameInfo &FrameInfo = MF->getFrameInfo(); 1084 assert(FrameInfo.getStackID(Index) != TargetStackID::SGPRSpill); 1085 1086 Register FrameReg = FrameInfo.isFixedObjectIndex(Index) && hasBasePointer(*MF) 1087 ? getBaseRegister() 1088 : getFrameRegister(*MF); 1089 1090 Align Alignment = FrameInfo.getObjectAlign(Index); 1091 MachinePointerInfo PtrInfo = 1092 MachinePointerInfo::getFixedStack(*MF, Index); 1093 MachineMemOperand *MMO = MF->getMachineMemOperand( 1094 PtrInfo, IsLoad ? MachineMemOperand::MOLoad : MachineMemOperand::MOStore, 1095 EltSize, Alignment); 1096 1097 if (IsLoad) { 1098 unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_LOAD_DWORD_SADDR 1099 : AMDGPU::BUFFER_LOAD_DWORD_OFFSET; 1100 buildSpillLoadStore(MI, Opc, 1101 Index, 1102 VGPR, false, 1103 FrameReg, 1104 Offset * EltSize, MMO, 1105 RS); 1106 } else { 1107 unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_STORE_DWORD_SADDR 1108 : AMDGPU::BUFFER_STORE_DWORD_OFFSET; 1109 buildSpillLoadStore(MI, Opc, Index, VGPR, 1110 IsKill, FrameReg, 1111 Offset * EltSize, MMO, RS); 1112 // This only ever adds one VGPR spill 1113 MFI->addToSpilledVGPRs(1); 1114 } 1115 1116 // Restore EXEC 1117 BuildMI(*MBB, MI, DL, TII->get(ExecMovOpc), ExecReg) 1118 .addReg(SavedExecReg, getKillRegState(IsLoad || IsKill)); 1119 1120 // Restore clobbered SGPRs 1121 if (IsLoad) { 1122 // Nothing to do; register will be overwritten 1123 } else if (!IsKill) { 1124 // Restore SGPRs from appropriate VGPR lanes 1125 if (!OnlyExecLo) { 1126 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READLANE_B32), 1127 getSubReg(SuperReg, SplitParts[FirstPart + ExecLane + 1])) 1128 .addReg(VGPR) 1129 .addImm(ExecLane + 1); 1130 } 1131 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READLANE_B32), 1132 NumSubRegs == 1 ? SavedExecReg 1133 : Register(getSubReg( 1134 SuperReg, SplitParts[FirstPart + ExecLane]))) 1135 .addReg(VGPR, RegState::Kill) 1136 .addImm(ExecLane); 1137 } 1138 } 1139 1140 bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI, 1141 int Index, 1142 RegScavenger *RS, 1143 bool OnlyToVGPR) const { 1144 MachineBasicBlock *MBB = MI->getParent(); 1145 MachineFunction *MF = MBB->getParent(); 1146 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); 1147 1148 ArrayRef<SIMachineFunctionInfo::SpilledReg> VGPRSpills 1149 = MFI->getSGPRToVGPRSpills(Index); 1150 bool SpillToVGPR = !VGPRSpills.empty(); 1151 if (OnlyToVGPR && !SpillToVGPR) 1152 return false; 1153 1154 const SIInstrInfo *TII = ST.getInstrInfo(); 1155 1156 Register SuperReg = MI->getOperand(0).getReg(); 1157 bool IsKill = MI->getOperand(0).isKill(); 1158 const DebugLoc &DL = MI->getDebugLoc(); 1159 1160 assert(SpillToVGPR || (SuperReg != MFI->getStackPtrOffsetReg() && 1161 SuperReg != MFI->getFrameOffsetReg())); 1162 1163 assert(SuperReg != AMDGPU::M0 && "m0 should never spill"); 1164 assert(SuperReg != AMDGPU::EXEC_LO && SuperReg != AMDGPU::EXEC_HI && 1165 SuperReg != AMDGPU::EXEC && "exec should never spill"); 1166 1167 unsigned EltSize = 4; 1168 const TargetRegisterClass *RC = getPhysRegClass(SuperReg); 1169 1170 ArrayRef<int16_t> SplitParts = getRegSplitParts(RC, EltSize); 1171 unsigned NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size(); 1172 1173 if (SpillToVGPR) { 1174 for (unsigned i = 0, e = NumSubRegs; i < e; ++i) { 1175 Register SubReg = NumSubRegs == 1 1176 ? SuperReg 1177 : Register(getSubReg(SuperReg, SplitParts[i])); 1178 SIMachineFunctionInfo::SpilledReg Spill = VGPRSpills[i]; 1179 1180 bool UseKill = IsKill && i == NumSubRegs - 1; 1181 1182 // Mark the "old value of vgpr" input undef only if this is the first sgpr 1183 // spill to this specific vgpr in the first basic block. 1184 auto MIB = 1185 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_WRITELANE_B32), Spill.VGPR) 1186 .addReg(SubReg, getKillRegState(UseKill)) 1187 .addImm(Spill.Lane) 1188 .addReg(Spill.VGPR); 1189 1190 if (i == 0 && NumSubRegs > 1) { 1191 // We may be spilling a super-register which is only partially defined, 1192 // and need to ensure later spills think the value is defined. 1193 MIB.addReg(SuperReg, RegState::ImplicitDefine); 1194 } 1195 1196 if (NumSubRegs > 1) 1197 MIB.addReg(SuperReg, getKillRegState(UseKill) | RegState::Implicit); 1198 1199 // FIXME: Since this spills to another register instead of an actual 1200 // frame index, we should delete the frame index when all references to 1201 // it are fixed. 1202 } 1203 } else { 1204 // Scavenged temporary VGPR to use. It must be scavenged once for any number 1205 // of spilled subregs. 1206 Register TmpVGPR = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0); 1207 RS->setRegUsed(TmpVGPR); 1208 1209 // SubReg carries the "Kill" flag when SubReg == SuperReg. 1210 unsigned SubKillState = getKillRegState((NumSubRegs == 1) && IsKill); 1211 1212 unsigned PerVGPR = 32; 1213 unsigned NumVGPRs = (NumSubRegs + (PerVGPR - 1)) / PerVGPR; 1214 int64_t VGPRLanes = (1LL << std::min(PerVGPR, NumSubRegs)) - 1LL; 1215 1216 for (unsigned Offset = 0; Offset < NumVGPRs; ++Offset) { 1217 unsigned TmpVGPRFlags = RegState::Undef; 1218 1219 // Write sub registers into the VGPR 1220 for (unsigned i = Offset * PerVGPR, 1221 e = std::min((Offset + 1) * PerVGPR, NumSubRegs); 1222 i < e; ++i) { 1223 Register SubReg = NumSubRegs == 1 1224 ? SuperReg 1225 : Register(getSubReg(SuperReg, SplitParts[i])); 1226 1227 MachineInstrBuilder WriteLane = 1228 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_WRITELANE_B32), TmpVGPR) 1229 .addReg(SubReg, SubKillState) 1230 .addImm(i % PerVGPR) 1231 .addReg(TmpVGPR, TmpVGPRFlags); 1232 TmpVGPRFlags = 0; 1233 1234 // There could be undef components of a spilled super register. 1235 // TODO: Can we detect this and skip the spill? 1236 if (NumSubRegs > 1) { 1237 // The last implicit use of the SuperReg carries the "Kill" flag. 1238 unsigned SuperKillState = 0; 1239 if (i + 1 == NumSubRegs) 1240 SuperKillState |= getKillRegState(IsKill); 1241 WriteLane.addReg(SuperReg, RegState::Implicit | SuperKillState); 1242 } 1243 } 1244 1245 // Write out VGPR 1246 buildSGPRSpillLoadStore(MI, Index, Offset, EltSize, TmpVGPR, VGPRLanes, 1247 RS, false); 1248 } 1249 } 1250 1251 MI->eraseFromParent(); 1252 MFI->addToSpilledSGPRs(NumSubRegs); 1253 return true; 1254 } 1255 1256 bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI, 1257 int Index, 1258 RegScavenger *RS, 1259 bool OnlyToVGPR) const { 1260 MachineFunction *MF = MI->getParent()->getParent(); 1261 MachineBasicBlock *MBB = MI->getParent(); 1262 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); 1263 1264 ArrayRef<SIMachineFunctionInfo::SpilledReg> VGPRSpills 1265 = MFI->getSGPRToVGPRSpills(Index); 1266 bool SpillToVGPR = !VGPRSpills.empty(); 1267 if (OnlyToVGPR && !SpillToVGPR) 1268 return false; 1269 1270 const SIInstrInfo *TII = ST.getInstrInfo(); 1271 const DebugLoc &DL = MI->getDebugLoc(); 1272 1273 Register SuperReg = MI->getOperand(0).getReg(); 1274 1275 assert(SuperReg != AMDGPU::M0 && "m0 should never spill"); 1276 assert(SuperReg != AMDGPU::EXEC_LO && SuperReg != AMDGPU::EXEC_HI && 1277 SuperReg != AMDGPU::EXEC && "exec should never spill"); 1278 1279 unsigned EltSize = 4; 1280 1281 const TargetRegisterClass *RC = getPhysRegClass(SuperReg); 1282 1283 ArrayRef<int16_t> SplitParts = getRegSplitParts(RC, EltSize); 1284 unsigned NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size(); 1285 1286 if (SpillToVGPR) { 1287 for (unsigned i = 0, e = NumSubRegs; i < e; ++i) { 1288 Register SubReg = NumSubRegs == 1 1289 ? SuperReg 1290 : Register(getSubReg(SuperReg, SplitParts[i])); 1291 1292 SIMachineFunctionInfo::SpilledReg Spill = VGPRSpills[i]; 1293 auto MIB = BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READLANE_B32), SubReg) 1294 .addReg(Spill.VGPR) 1295 .addImm(Spill.Lane); 1296 if (NumSubRegs > 1 && i == 0) 1297 MIB.addReg(SuperReg, RegState::ImplicitDefine); 1298 } 1299 } else { 1300 Register TmpVGPR = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0); 1301 RS->setRegUsed(TmpVGPR); 1302 1303 unsigned PerVGPR = 32; 1304 unsigned NumVGPRs = (NumSubRegs + (PerVGPR - 1)) / PerVGPR; 1305 int64_t VGPRLanes = (1LL << std::min(PerVGPR, NumSubRegs)) - 1LL; 1306 1307 for (unsigned Offset = 0; Offset < NumVGPRs; ++Offset) { 1308 // Load in VGPR data 1309 buildSGPRSpillLoadStore(MI, Index, Offset, EltSize, TmpVGPR, VGPRLanes, 1310 RS, true); 1311 1312 // Unpack lanes 1313 for (unsigned i = Offset * PerVGPR, 1314 e = std::min((Offset + 1) * PerVGPR, NumSubRegs); 1315 i < e; ++i) { 1316 Register SubReg = NumSubRegs == 1 1317 ? SuperReg 1318 : Register(getSubReg(SuperReg, SplitParts[i])); 1319 1320 bool LastSubReg = (i + 1 == e); 1321 auto MIB = 1322 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READLANE_B32), SubReg) 1323 .addReg(TmpVGPR, getKillRegState(LastSubReg)) 1324 .addImm(i); 1325 if (NumSubRegs > 1 && i == 0) 1326 MIB.addReg(SuperReg, RegState::ImplicitDefine); 1327 } 1328 } 1329 } 1330 1331 MI->eraseFromParent(); 1332 return true; 1333 } 1334 1335 /// Special case of eliminateFrameIndex. Returns true if the SGPR was spilled to 1336 /// a VGPR and the stack slot can be safely eliminated when all other users are 1337 /// handled. 1338 bool SIRegisterInfo::eliminateSGPRToVGPRSpillFrameIndex( 1339 MachineBasicBlock::iterator MI, 1340 int FI, 1341 RegScavenger *RS) const { 1342 switch (MI->getOpcode()) { 1343 case AMDGPU::SI_SPILL_S1024_SAVE: 1344 case AMDGPU::SI_SPILL_S512_SAVE: 1345 case AMDGPU::SI_SPILL_S256_SAVE: 1346 case AMDGPU::SI_SPILL_S192_SAVE: 1347 case AMDGPU::SI_SPILL_S160_SAVE: 1348 case AMDGPU::SI_SPILL_S128_SAVE: 1349 case AMDGPU::SI_SPILL_S96_SAVE: 1350 case AMDGPU::SI_SPILL_S64_SAVE: 1351 case AMDGPU::SI_SPILL_S32_SAVE: 1352 return spillSGPR(MI, FI, RS, true); 1353 case AMDGPU::SI_SPILL_S1024_RESTORE: 1354 case AMDGPU::SI_SPILL_S512_RESTORE: 1355 case AMDGPU::SI_SPILL_S256_RESTORE: 1356 case AMDGPU::SI_SPILL_S192_RESTORE: 1357 case AMDGPU::SI_SPILL_S160_RESTORE: 1358 case AMDGPU::SI_SPILL_S128_RESTORE: 1359 case AMDGPU::SI_SPILL_S96_RESTORE: 1360 case AMDGPU::SI_SPILL_S64_RESTORE: 1361 case AMDGPU::SI_SPILL_S32_RESTORE: 1362 return restoreSGPR(MI, FI, RS, true); 1363 default: 1364 llvm_unreachable("not an SGPR spill instruction"); 1365 } 1366 } 1367 1368 void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, 1369 int SPAdj, unsigned FIOperandNum, 1370 RegScavenger *RS) const { 1371 MachineFunction *MF = MI->getParent()->getParent(); 1372 MachineBasicBlock *MBB = MI->getParent(); 1373 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); 1374 MachineFrameInfo &FrameInfo = MF->getFrameInfo(); 1375 const SIInstrInfo *TII = ST.getInstrInfo(); 1376 DebugLoc DL = MI->getDebugLoc(); 1377 1378 assert(SPAdj == 0 && "unhandled SP adjustment in call sequence?"); 1379 1380 MachineOperand &FIOp = MI->getOperand(FIOperandNum); 1381 int Index = MI->getOperand(FIOperandNum).getIndex(); 1382 1383 Register FrameReg = FrameInfo.isFixedObjectIndex(Index) && hasBasePointer(*MF) 1384 ? getBaseRegister() 1385 : getFrameRegister(*MF); 1386 1387 switch (MI->getOpcode()) { 1388 // SGPR register spill 1389 case AMDGPU::SI_SPILL_S1024_SAVE: 1390 case AMDGPU::SI_SPILL_S512_SAVE: 1391 case AMDGPU::SI_SPILL_S256_SAVE: 1392 case AMDGPU::SI_SPILL_S192_SAVE: 1393 case AMDGPU::SI_SPILL_S160_SAVE: 1394 case AMDGPU::SI_SPILL_S128_SAVE: 1395 case AMDGPU::SI_SPILL_S96_SAVE: 1396 case AMDGPU::SI_SPILL_S64_SAVE: 1397 case AMDGPU::SI_SPILL_S32_SAVE: { 1398 spillSGPR(MI, Index, RS); 1399 break; 1400 } 1401 1402 // SGPR register restore 1403 case AMDGPU::SI_SPILL_S1024_RESTORE: 1404 case AMDGPU::SI_SPILL_S512_RESTORE: 1405 case AMDGPU::SI_SPILL_S256_RESTORE: 1406 case AMDGPU::SI_SPILL_S192_RESTORE: 1407 case AMDGPU::SI_SPILL_S160_RESTORE: 1408 case AMDGPU::SI_SPILL_S128_RESTORE: 1409 case AMDGPU::SI_SPILL_S96_RESTORE: 1410 case AMDGPU::SI_SPILL_S64_RESTORE: 1411 case AMDGPU::SI_SPILL_S32_RESTORE: { 1412 restoreSGPR(MI, Index, RS); 1413 break; 1414 } 1415 1416 // VGPR register spill 1417 case AMDGPU::SI_SPILL_V1024_SAVE: 1418 case AMDGPU::SI_SPILL_V512_SAVE: 1419 case AMDGPU::SI_SPILL_V256_SAVE: 1420 case AMDGPU::SI_SPILL_V192_SAVE: 1421 case AMDGPU::SI_SPILL_V160_SAVE: 1422 case AMDGPU::SI_SPILL_V128_SAVE: 1423 case AMDGPU::SI_SPILL_V96_SAVE: 1424 case AMDGPU::SI_SPILL_V64_SAVE: 1425 case AMDGPU::SI_SPILL_V32_SAVE: 1426 case AMDGPU::SI_SPILL_A1024_SAVE: 1427 case AMDGPU::SI_SPILL_A512_SAVE: 1428 case AMDGPU::SI_SPILL_A256_SAVE: 1429 case AMDGPU::SI_SPILL_A192_SAVE: 1430 case AMDGPU::SI_SPILL_A160_SAVE: 1431 case AMDGPU::SI_SPILL_A128_SAVE: 1432 case AMDGPU::SI_SPILL_A96_SAVE: 1433 case AMDGPU::SI_SPILL_A64_SAVE: 1434 case AMDGPU::SI_SPILL_A32_SAVE: { 1435 const MachineOperand *VData = TII->getNamedOperand(*MI, 1436 AMDGPU::OpName::vdata); 1437 assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() == 1438 MFI->getStackPtrOffsetReg()); 1439 1440 unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_STORE_DWORD_SADDR 1441 : AMDGPU::BUFFER_STORE_DWORD_OFFSET; 1442 buildSpillLoadStore(MI, Opc, 1443 Index, 1444 VData->getReg(), VData->isKill(), 1445 FrameReg, 1446 TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(), 1447 *MI->memoperands_begin(), 1448 RS); 1449 MFI->addToSpilledVGPRs(getNumSubRegsForSpillOp(MI->getOpcode())); 1450 MI->eraseFromParent(); 1451 break; 1452 } 1453 case AMDGPU::SI_SPILL_V32_RESTORE: 1454 case AMDGPU::SI_SPILL_V64_RESTORE: 1455 case AMDGPU::SI_SPILL_V96_RESTORE: 1456 case AMDGPU::SI_SPILL_V128_RESTORE: 1457 case AMDGPU::SI_SPILL_V160_RESTORE: 1458 case AMDGPU::SI_SPILL_V192_RESTORE: 1459 case AMDGPU::SI_SPILL_V256_RESTORE: 1460 case AMDGPU::SI_SPILL_V512_RESTORE: 1461 case AMDGPU::SI_SPILL_V1024_RESTORE: 1462 case AMDGPU::SI_SPILL_A32_RESTORE: 1463 case AMDGPU::SI_SPILL_A64_RESTORE: 1464 case AMDGPU::SI_SPILL_A96_RESTORE: 1465 case AMDGPU::SI_SPILL_A128_RESTORE: 1466 case AMDGPU::SI_SPILL_A160_RESTORE: 1467 case AMDGPU::SI_SPILL_A192_RESTORE: 1468 case AMDGPU::SI_SPILL_A256_RESTORE: 1469 case AMDGPU::SI_SPILL_A512_RESTORE: 1470 case AMDGPU::SI_SPILL_A1024_RESTORE: { 1471 const MachineOperand *VData = TII->getNamedOperand(*MI, 1472 AMDGPU::OpName::vdata); 1473 assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() == 1474 MFI->getStackPtrOffsetReg()); 1475 1476 unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_LOAD_DWORD_SADDR 1477 : AMDGPU::BUFFER_LOAD_DWORD_OFFSET; 1478 buildSpillLoadStore(MI, Opc, 1479 Index, 1480 VData->getReg(), VData->isKill(), 1481 FrameReg, 1482 TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(), 1483 *MI->memoperands_begin(), 1484 RS); 1485 MI->eraseFromParent(); 1486 break; 1487 } 1488 1489 default: { 1490 const DebugLoc &DL = MI->getDebugLoc(); 1491 1492 int64_t Offset = FrameInfo.getObjectOffset(Index); 1493 if (ST.enableFlatScratch()) { 1494 if (TII->isFLATScratch(*MI)) { 1495 assert((int16_t)FIOperandNum == 1496 AMDGPU::getNamedOperandIdx(MI->getOpcode(), 1497 AMDGPU::OpName::saddr)); 1498 1499 // The offset is always swizzled, just replace it 1500 if (FrameReg) 1501 FIOp.ChangeToRegister(FrameReg, false); 1502 1503 if (!Offset) 1504 return; 1505 1506 MachineOperand *OffsetOp = 1507 TII->getNamedOperand(*MI, AMDGPU::OpName::offset); 1508 int64_t NewOffset = Offset + OffsetOp->getImm(); 1509 if (TII->isLegalFLATOffset(NewOffset, AMDGPUAS::PRIVATE_ADDRESS, 1510 true)) { 1511 OffsetOp->setImm(NewOffset); 1512 if (FrameReg) 1513 return; 1514 Offset = 0; 1515 } 1516 1517 assert(!TII->getNamedOperand(*MI, AMDGPU::OpName::vaddr) && 1518 "Unexpected vaddr for flat scratch with a FI operand"); 1519 1520 // On GFX10 we have ST mode to use no registers for an address. 1521 // Otherwise we need to materialize 0 into an SGPR. 1522 if (!Offset && ST.hasFlatScratchSTMode()) { 1523 unsigned Opc = MI->getOpcode(); 1524 unsigned NewOpc = AMDGPU::getFlatScratchInstSTfromSS(Opc); 1525 MI->RemoveOperand( 1526 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr)); 1527 MI->setDesc(TII->get(NewOpc)); 1528 return; 1529 } 1530 } 1531 1532 if (!FrameReg) { 1533 FIOp.ChangeToImmediate(Offset); 1534 if (TII->isImmOperandLegal(*MI, FIOperandNum, FIOp)) 1535 return; 1536 } 1537 1538 // We need to use register here. Check if we can use an SGPR or need 1539 // a VGPR. 1540 FIOp.ChangeToRegister(AMDGPU::M0, false); 1541 bool UseSGPR = TII->isOperandLegal(*MI, FIOperandNum, &FIOp); 1542 1543 if (!Offset && FrameReg && UseSGPR) { 1544 FIOp.setReg(FrameReg); 1545 return; 1546 } 1547 1548 const TargetRegisterClass *RC = UseSGPR ? &AMDGPU::SReg_32_XM0RegClass 1549 : &AMDGPU::VGPR_32RegClass; 1550 1551 Register TmpReg = RS->scavengeRegister(RC, MI, 0, !UseSGPR); 1552 FIOp.setReg(TmpReg); 1553 FIOp.setIsKill(true); 1554 1555 if ((!FrameReg || !Offset) && TmpReg) { 1556 unsigned Opc = UseSGPR ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32; 1557 auto MIB = BuildMI(*MBB, MI, DL, TII->get(Opc), TmpReg); 1558 if (FrameReg) 1559 MIB.addReg(FrameReg); 1560 else 1561 MIB.addImm(Offset); 1562 1563 return; 1564 } 1565 1566 Register TmpSReg = 1567 UseSGPR ? TmpReg 1568 : RS->scavengeRegister(&AMDGPU::SReg_32_XM0RegClass, MI, 0, 1569 !UseSGPR); 1570 1571 // TODO: for flat scratch another attempt can be made with a VGPR index 1572 // if no SGPRs can be scavenged. 1573 if ((!TmpSReg && !FrameReg) || (!TmpReg && !UseSGPR)) 1574 report_fatal_error("Cannot scavenge register in FI elimination!"); 1575 1576 if (!TmpSReg) { 1577 // Use frame register and restore it after. 1578 TmpSReg = FrameReg; 1579 FIOp.setReg(FrameReg); 1580 FIOp.setIsKill(false); 1581 } 1582 1583 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), TmpSReg) 1584 .addReg(FrameReg) 1585 .addImm(Offset); 1586 1587 if (!UseSGPR) 1588 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg) 1589 .addReg(TmpSReg, RegState::Kill); 1590 1591 if (TmpSReg == FrameReg) { 1592 // Undo frame register modification. 1593 BuildMI(*MBB, std::next(MI), DL, TII->get(AMDGPU::S_SUB_U32), 1594 FrameReg) 1595 .addReg(FrameReg) 1596 .addImm(Offset); 1597 } 1598 1599 return; 1600 } 1601 1602 bool IsMUBUF = TII->isMUBUF(*MI); 1603 1604 if (!IsMUBUF && !MFI->isEntryFunction()) { 1605 // Convert to a swizzled stack address by scaling by the wave size. 1606 // 1607 // In an entry function/kernel the offset is already swizzled. 1608 1609 bool IsCopy = MI->getOpcode() == AMDGPU::V_MOV_B32_e32; 1610 Register ResultReg = 1611 IsCopy ? MI->getOperand(0).getReg() 1612 : RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0); 1613 1614 int64_t Offset = FrameInfo.getObjectOffset(Index); 1615 if (Offset == 0) { 1616 // XXX - This never happens because of emergency scavenging slot at 0? 1617 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64), ResultReg) 1618 .addImm(ST.getWavefrontSizeLog2()) 1619 .addReg(FrameReg); 1620 } else { 1621 if (auto MIB = TII->getAddNoCarry(*MBB, MI, DL, ResultReg, *RS)) { 1622 // Reuse ResultReg in intermediate step. 1623 Register ScaledReg = ResultReg; 1624 1625 BuildMI(*MBB, *MIB, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64), 1626 ScaledReg) 1627 .addImm(ST.getWavefrontSizeLog2()) 1628 .addReg(FrameReg); 1629 1630 const bool IsVOP2 = MIB->getOpcode() == AMDGPU::V_ADD_U32_e32; 1631 1632 // TODO: Fold if use instruction is another add of a constant. 1633 if (IsVOP2 || AMDGPU::isInlinableLiteral32(Offset, ST.hasInv2PiInlineImm())) { 1634 // FIXME: This can fail 1635 MIB.addImm(Offset); 1636 MIB.addReg(ScaledReg, RegState::Kill); 1637 if (!IsVOP2) 1638 MIB.addImm(0); // clamp bit 1639 } else { 1640 assert(MIB->getOpcode() == AMDGPU::V_ADD_CO_U32_e64 && 1641 "Need to reuse carry out register"); 1642 1643 // Use scavenged unused carry out as offset register. 1644 Register ConstOffsetReg; 1645 if (!isWave32) 1646 ConstOffsetReg = getSubReg(MIB.getReg(1), AMDGPU::sub0); 1647 else 1648 ConstOffsetReg = MIB.getReg(1); 1649 1650 BuildMI(*MBB, *MIB, DL, TII->get(AMDGPU::S_MOV_B32), ConstOffsetReg) 1651 .addImm(Offset); 1652 MIB.addReg(ConstOffsetReg, RegState::Kill); 1653 MIB.addReg(ScaledReg, RegState::Kill); 1654 MIB.addImm(0); // clamp bit 1655 } 1656 } else { 1657 // We have to produce a carry out, and there isn't a free SGPR pair 1658 // for it. We can keep the whole computation on the SALU to avoid 1659 // clobbering an additional register at the cost of an extra mov. 1660 1661 // We may have 1 free scratch SGPR even though a carry out is 1662 // unavailable. Only one additional mov is needed. 1663 Register TmpScaledReg = 1664 RS->scavengeRegister(&AMDGPU::SReg_32_XM0RegClass, MI, 0, false); 1665 Register ScaledReg = TmpScaledReg.isValid() ? TmpScaledReg : FrameReg; 1666 1667 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHR_B32), ScaledReg) 1668 .addReg(FrameReg) 1669 .addImm(ST.getWavefrontSizeLog2()); 1670 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), ScaledReg) 1671 .addReg(ScaledReg, RegState::Kill) 1672 .addImm(Offset); 1673 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), ResultReg) 1674 .addReg(ScaledReg, RegState::Kill); 1675 1676 // If there were truly no free SGPRs, we need to undo everything. 1677 if (!TmpScaledReg.isValid()) { 1678 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_SUB_U32), ScaledReg) 1679 .addReg(ScaledReg, RegState::Kill) 1680 .addImm(Offset); 1681 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHL_B32), ScaledReg) 1682 .addReg(FrameReg) 1683 .addImm(ST.getWavefrontSizeLog2()); 1684 } 1685 } 1686 } 1687 1688 // Don't introduce an extra copy if we're just materializing in a mov. 1689 if (IsCopy) 1690 MI->eraseFromParent(); 1691 else 1692 FIOp.ChangeToRegister(ResultReg, false, false, true); 1693 return; 1694 } 1695 1696 if (IsMUBUF) { 1697 // Disable offen so we don't need a 0 vgpr base. 1698 assert(static_cast<int>(FIOperandNum) == 1699 AMDGPU::getNamedOperandIdx(MI->getOpcode(), 1700 AMDGPU::OpName::vaddr)); 1701 1702 auto &SOffset = *TII->getNamedOperand(*MI, AMDGPU::OpName::soffset); 1703 assert((SOffset.isImm() && SOffset.getImm() == 0)); 1704 1705 if (FrameReg != AMDGPU::NoRegister) 1706 SOffset.ChangeToRegister(FrameReg, false); 1707 1708 int64_t Offset = FrameInfo.getObjectOffset(Index); 1709 int64_t OldImm 1710 = TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(); 1711 int64_t NewOffset = OldImm + Offset; 1712 1713 if (SIInstrInfo::isLegalMUBUFImmOffset(NewOffset) && 1714 buildMUBUFOffsetLoadStore(ST, FrameInfo, MI, Index, NewOffset)) { 1715 MI->eraseFromParent(); 1716 return; 1717 } 1718 } 1719 1720 // If the offset is simply too big, don't convert to a scratch wave offset 1721 // relative index. 1722 1723 FIOp.ChangeToImmediate(Offset); 1724 if (!TII->isImmOperandLegal(*MI, FIOperandNum, FIOp)) { 1725 Register TmpReg = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0); 1726 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg) 1727 .addImm(Offset); 1728 FIOp.ChangeToRegister(TmpReg, false, false, true); 1729 } 1730 } 1731 } 1732 } 1733 1734 StringRef SIRegisterInfo::getRegAsmName(MCRegister Reg) const { 1735 return AMDGPUInstPrinter::getRegisterName(Reg); 1736 } 1737 1738 const TargetRegisterClass * 1739 SIRegisterInfo::getVGPRClassForBitWidth(unsigned BitWidth) { 1740 if (BitWidth == 1) 1741 return &AMDGPU::VReg_1RegClass; 1742 if (BitWidth <= 16) 1743 return &AMDGPU::VGPR_LO16RegClass; 1744 if (BitWidth <= 32) 1745 return &AMDGPU::VGPR_32RegClass; 1746 if (BitWidth <= 64) 1747 return &AMDGPU::VReg_64RegClass; 1748 if (BitWidth <= 96) 1749 return &AMDGPU::VReg_96RegClass; 1750 if (BitWidth <= 128) 1751 return &AMDGPU::VReg_128RegClass; 1752 if (BitWidth <= 160) 1753 return &AMDGPU::VReg_160RegClass; 1754 if (BitWidth <= 192) 1755 return &AMDGPU::VReg_192RegClass; 1756 if (BitWidth <= 256) 1757 return &AMDGPU::VReg_256RegClass; 1758 if (BitWidth <= 512) 1759 return &AMDGPU::VReg_512RegClass; 1760 if (BitWidth <= 1024) 1761 return &AMDGPU::VReg_1024RegClass; 1762 1763 return nullptr; 1764 } 1765 1766 const TargetRegisterClass * 1767 SIRegisterInfo::getAGPRClassForBitWidth(unsigned BitWidth) { 1768 if (BitWidth <= 16) 1769 return &AMDGPU::AGPR_LO16RegClass; 1770 if (BitWidth <= 32) 1771 return &AMDGPU::AGPR_32RegClass; 1772 if (BitWidth <= 64) 1773 return &AMDGPU::AReg_64RegClass; 1774 if (BitWidth <= 96) 1775 return &AMDGPU::AReg_96RegClass; 1776 if (BitWidth <= 128) 1777 return &AMDGPU::AReg_128RegClass; 1778 if (BitWidth <= 160) 1779 return &AMDGPU::AReg_160RegClass; 1780 if (BitWidth <= 192) 1781 return &AMDGPU::AReg_192RegClass; 1782 if (BitWidth <= 256) 1783 return &AMDGPU::AReg_256RegClass; 1784 if (BitWidth <= 512) 1785 return &AMDGPU::AReg_512RegClass; 1786 if (BitWidth <= 1024) 1787 return &AMDGPU::AReg_1024RegClass; 1788 1789 return nullptr; 1790 } 1791 1792 const TargetRegisterClass * 1793 SIRegisterInfo::getSGPRClassForBitWidth(unsigned BitWidth) { 1794 if (BitWidth <= 16) 1795 return &AMDGPU::SGPR_LO16RegClass; 1796 if (BitWidth <= 32) 1797 return &AMDGPU::SReg_32RegClass; 1798 if (BitWidth <= 64) 1799 return &AMDGPU::SReg_64RegClass; 1800 if (BitWidth <= 96) 1801 return &AMDGPU::SGPR_96RegClass; 1802 if (BitWidth <= 128) 1803 return &AMDGPU::SGPR_128RegClass; 1804 if (BitWidth <= 160) 1805 return &AMDGPU::SGPR_160RegClass; 1806 if (BitWidth <= 192) 1807 return &AMDGPU::SGPR_192RegClass; 1808 if (BitWidth <= 256) 1809 return &AMDGPU::SGPR_256RegClass; 1810 if (BitWidth <= 512) 1811 return &AMDGPU::SGPR_512RegClass; 1812 if (BitWidth <= 1024) 1813 return &AMDGPU::SGPR_1024RegClass; 1814 1815 return nullptr; 1816 } 1817 1818 // FIXME: This is very slow. It might be worth creating a map from physreg to 1819 // register class. 1820 const TargetRegisterClass * 1821 SIRegisterInfo::getPhysRegClass(MCRegister Reg) const { 1822 static const TargetRegisterClass *const BaseClasses[] = { 1823 &AMDGPU::VGPR_LO16RegClass, 1824 &AMDGPU::VGPR_HI16RegClass, 1825 &AMDGPU::SReg_LO16RegClass, 1826 &AMDGPU::AGPR_LO16RegClass, 1827 &AMDGPU::VGPR_32RegClass, 1828 &AMDGPU::SReg_32RegClass, 1829 &AMDGPU::AGPR_32RegClass, 1830 &AMDGPU::VReg_64RegClass, 1831 &AMDGPU::SReg_64RegClass, 1832 &AMDGPU::AReg_64RegClass, 1833 &AMDGPU::VReg_96RegClass, 1834 &AMDGPU::SReg_96RegClass, 1835 &AMDGPU::AReg_96RegClass, 1836 &AMDGPU::VReg_128RegClass, 1837 &AMDGPU::SReg_128RegClass, 1838 &AMDGPU::AReg_128RegClass, 1839 &AMDGPU::VReg_160RegClass, 1840 &AMDGPU::SReg_160RegClass, 1841 &AMDGPU::AReg_160RegClass, 1842 &AMDGPU::VReg_192RegClass, 1843 &AMDGPU::SReg_192RegClass, 1844 &AMDGPU::AReg_192RegClass, 1845 &AMDGPU::VReg_256RegClass, 1846 &AMDGPU::SReg_256RegClass, 1847 &AMDGPU::AReg_256RegClass, 1848 &AMDGPU::VReg_512RegClass, 1849 &AMDGPU::SReg_512RegClass, 1850 &AMDGPU::AReg_512RegClass, 1851 &AMDGPU::SReg_1024RegClass, 1852 &AMDGPU::VReg_1024RegClass, 1853 &AMDGPU::AReg_1024RegClass, 1854 &AMDGPU::SCC_CLASSRegClass, 1855 &AMDGPU::Pseudo_SReg_32RegClass, 1856 &AMDGPU::Pseudo_SReg_128RegClass, 1857 }; 1858 1859 for (const TargetRegisterClass *BaseClass : BaseClasses) { 1860 if (BaseClass->contains(Reg)) { 1861 return BaseClass; 1862 } 1863 } 1864 return nullptr; 1865 } 1866 1867 bool SIRegisterInfo::isSGPRReg(const MachineRegisterInfo &MRI, 1868 Register Reg) const { 1869 const TargetRegisterClass *RC; 1870 if (Reg.isVirtual()) 1871 RC = MRI.getRegClass(Reg); 1872 else 1873 RC = getPhysRegClass(Reg); 1874 return isSGPRClass(RC); 1875 } 1876 1877 // TODO: It might be helpful to have some target specific flags in 1878 // TargetRegisterClass to mark which classes are VGPRs to make this trivial. 1879 bool SIRegisterInfo::hasVGPRs(const TargetRegisterClass *RC) const { 1880 unsigned Size = getRegSizeInBits(*RC); 1881 if (Size == 16) { 1882 return getCommonSubClass(&AMDGPU::VGPR_LO16RegClass, RC) != nullptr || 1883 getCommonSubClass(&AMDGPU::VGPR_HI16RegClass, RC) != nullptr; 1884 } 1885 const TargetRegisterClass *VRC = getVGPRClassForBitWidth(Size); 1886 if (!VRC) { 1887 assert(Size < 32 && "Invalid register class size"); 1888 return false; 1889 } 1890 return getCommonSubClass(VRC, RC) != nullptr; 1891 } 1892 1893 bool SIRegisterInfo::hasAGPRs(const TargetRegisterClass *RC) const { 1894 unsigned Size = getRegSizeInBits(*RC); 1895 if (Size < 16) 1896 return false; 1897 const TargetRegisterClass *ARC = getAGPRClassForBitWidth(Size); 1898 if (!ARC) { 1899 assert(getVGPRClassForBitWidth(Size) && "Invalid register class size"); 1900 return false; 1901 } 1902 return getCommonSubClass(ARC, RC) != nullptr; 1903 } 1904 1905 const TargetRegisterClass * 1906 SIRegisterInfo::getEquivalentVGPRClass(const TargetRegisterClass *SRC) const { 1907 unsigned Size = getRegSizeInBits(*SRC); 1908 const TargetRegisterClass *VRC = getVGPRClassForBitWidth(Size); 1909 assert(VRC && "Invalid register class size"); 1910 return VRC; 1911 } 1912 1913 const TargetRegisterClass * 1914 SIRegisterInfo::getEquivalentAGPRClass(const TargetRegisterClass *SRC) const { 1915 unsigned Size = getRegSizeInBits(*SRC); 1916 const TargetRegisterClass *ARC = getAGPRClassForBitWidth(Size); 1917 assert(ARC && "Invalid register class size"); 1918 return ARC; 1919 } 1920 1921 const TargetRegisterClass * 1922 SIRegisterInfo::getEquivalentSGPRClass(const TargetRegisterClass *VRC) const { 1923 unsigned Size = getRegSizeInBits(*VRC); 1924 if (Size == 32) 1925 return &AMDGPU::SGPR_32RegClass; 1926 const TargetRegisterClass *SRC = getSGPRClassForBitWidth(Size); 1927 assert(SRC && "Invalid register class size"); 1928 return SRC; 1929 } 1930 1931 const TargetRegisterClass *SIRegisterInfo::getSubRegClass( 1932 const TargetRegisterClass *RC, unsigned SubIdx) const { 1933 if (SubIdx == AMDGPU::NoSubRegister) 1934 return RC; 1935 1936 // We can assume that each lane corresponds to one 32-bit register. 1937 unsigned Size = getNumChannelsFromSubReg(SubIdx) * 32; 1938 if (isSGPRClass(RC)) { 1939 if (Size == 32) 1940 RC = &AMDGPU::SGPR_32RegClass; 1941 else 1942 RC = getSGPRClassForBitWidth(Size); 1943 } else if (hasAGPRs(RC)) { 1944 RC = getAGPRClassForBitWidth(Size); 1945 } else { 1946 RC = getVGPRClassForBitWidth(Size); 1947 } 1948 assert(RC && "Invalid sub-register class size"); 1949 return RC; 1950 } 1951 1952 bool SIRegisterInfo::opCanUseInlineConstant(unsigned OpType) const { 1953 if (OpType >= AMDGPU::OPERAND_REG_INLINE_AC_FIRST && 1954 OpType <= AMDGPU::OPERAND_REG_INLINE_AC_LAST) 1955 return !ST.hasMFMAInlineLiteralBug(); 1956 1957 return OpType >= AMDGPU::OPERAND_SRC_FIRST && 1958 OpType <= AMDGPU::OPERAND_SRC_LAST; 1959 } 1960 1961 bool SIRegisterInfo::shouldRewriteCopySrc( 1962 const TargetRegisterClass *DefRC, 1963 unsigned DefSubReg, 1964 const TargetRegisterClass *SrcRC, 1965 unsigned SrcSubReg) const { 1966 // We want to prefer the smallest register class possible, so we don't want to 1967 // stop and rewrite on anything that looks like a subregister 1968 // extract. Operations mostly don't care about the super register class, so we 1969 // only want to stop on the most basic of copies between the same register 1970 // class. 1971 // 1972 // e.g. if we have something like 1973 // %0 = ... 1974 // %1 = ... 1975 // %2 = REG_SEQUENCE %0, sub0, %1, sub1, %2, sub2 1976 // %3 = COPY %2, sub0 1977 // 1978 // We want to look through the COPY to find: 1979 // => %3 = COPY %0 1980 1981 // Plain copy. 1982 return getCommonSubClass(DefRC, SrcRC) != nullptr; 1983 } 1984 1985 bool SIRegisterInfo::opCanUseLiteralConstant(unsigned OpType) const { 1986 // TODO: 64-bit operands have extending behavior from 32-bit literal. 1987 return OpType >= AMDGPU::OPERAND_REG_IMM_FIRST && 1988 OpType <= AMDGPU::OPERAND_REG_IMM_LAST; 1989 } 1990 1991 /// Returns a lowest register that is not used at any point in the function. 1992 /// If all registers are used, then this function will return 1993 /// AMDGPU::NoRegister. If \p ReserveHighestVGPR = true, then return 1994 /// highest unused register. 1995 MCRegister SIRegisterInfo::findUnusedRegister(const MachineRegisterInfo &MRI, 1996 const TargetRegisterClass *RC, 1997 const MachineFunction &MF, 1998 bool ReserveHighestVGPR) const { 1999 if (ReserveHighestVGPR) { 2000 for (MCRegister Reg : reverse(*RC)) 2001 if (MRI.isAllocatable(Reg) && !MRI.isPhysRegUsed(Reg)) 2002 return Reg; 2003 } else { 2004 for (MCRegister Reg : *RC) 2005 if (MRI.isAllocatable(Reg) && !MRI.isPhysRegUsed(Reg)) 2006 return Reg; 2007 } 2008 return MCRegister(); 2009 } 2010 2011 ArrayRef<int16_t> SIRegisterInfo::getRegSplitParts(const TargetRegisterClass *RC, 2012 unsigned EltSize) const { 2013 const unsigned RegBitWidth = AMDGPU::getRegBitWidth(*RC->MC); 2014 assert(RegBitWidth >= 32 && RegBitWidth <= 1024); 2015 2016 const unsigned RegDWORDs = RegBitWidth / 32; 2017 const unsigned EltDWORDs = EltSize / 4; 2018 assert(RegSplitParts.size() + 1 >= EltDWORDs); 2019 2020 const std::vector<int16_t> &Parts = RegSplitParts[EltDWORDs - 1]; 2021 const unsigned NumParts = RegDWORDs / EltDWORDs; 2022 2023 return makeArrayRef(Parts.data(), NumParts); 2024 } 2025 2026 const TargetRegisterClass* 2027 SIRegisterInfo::getRegClassForReg(const MachineRegisterInfo &MRI, 2028 Register Reg) const { 2029 return Reg.isVirtual() ? MRI.getRegClass(Reg) : getPhysRegClass(Reg); 2030 } 2031 2032 bool SIRegisterInfo::isVGPR(const MachineRegisterInfo &MRI, 2033 Register Reg) const { 2034 const TargetRegisterClass *RC = getRegClassForReg(MRI, Reg); 2035 // Registers without classes are unaddressable, SGPR-like registers. 2036 return RC && hasVGPRs(RC); 2037 } 2038 2039 bool SIRegisterInfo::isAGPR(const MachineRegisterInfo &MRI, 2040 Register Reg) const { 2041 const TargetRegisterClass *RC = getRegClassForReg(MRI, Reg); 2042 2043 // Registers without classes are unaddressable, SGPR-like registers. 2044 return RC && hasAGPRs(RC); 2045 } 2046 2047 bool SIRegisterInfo::shouldCoalesce(MachineInstr *MI, 2048 const TargetRegisterClass *SrcRC, 2049 unsigned SubReg, 2050 const TargetRegisterClass *DstRC, 2051 unsigned DstSubReg, 2052 const TargetRegisterClass *NewRC, 2053 LiveIntervals &LIS) const { 2054 unsigned SrcSize = getRegSizeInBits(*SrcRC); 2055 unsigned DstSize = getRegSizeInBits(*DstRC); 2056 unsigned NewSize = getRegSizeInBits(*NewRC); 2057 2058 // Do not increase size of registers beyond dword, we would need to allocate 2059 // adjacent registers and constraint regalloc more than needed. 2060 2061 // Always allow dword coalescing. 2062 if (SrcSize <= 32 || DstSize <= 32) 2063 return true; 2064 2065 return NewSize <= DstSize || NewSize <= SrcSize; 2066 } 2067 2068 unsigned SIRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC, 2069 MachineFunction &MF) const { 2070 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2071 2072 unsigned Occupancy = ST.getOccupancyWithLocalMemSize(MFI->getLDSSize(), 2073 MF.getFunction()); 2074 switch (RC->getID()) { 2075 default: 2076 return AMDGPUGenRegisterInfo::getRegPressureLimit(RC, MF); 2077 case AMDGPU::VGPR_32RegClassID: 2078 case AMDGPU::VGPR_LO16RegClassID: 2079 case AMDGPU::VGPR_HI16RegClassID: 2080 return std::min(ST.getMaxNumVGPRs(Occupancy), ST.getMaxNumVGPRs(MF)); 2081 case AMDGPU::SGPR_32RegClassID: 2082 case AMDGPU::SGPR_LO16RegClassID: 2083 return std::min(ST.getMaxNumSGPRs(Occupancy, true), ST.getMaxNumSGPRs(MF)); 2084 } 2085 } 2086 2087 unsigned SIRegisterInfo::getRegPressureSetLimit(const MachineFunction &MF, 2088 unsigned Idx) const { 2089 if (Idx == AMDGPU::RegisterPressureSets::VGPR_32 || 2090 Idx == AMDGPU::RegisterPressureSets::AGPR_32) 2091 return getRegPressureLimit(&AMDGPU::VGPR_32RegClass, 2092 const_cast<MachineFunction &>(MF)); 2093 2094 if (Idx == AMDGPU::RegisterPressureSets::SReg_32) 2095 return getRegPressureLimit(&AMDGPU::SGPR_32RegClass, 2096 const_cast<MachineFunction &>(MF)); 2097 2098 llvm_unreachable("Unexpected register pressure set!"); 2099 } 2100 2101 const int *SIRegisterInfo::getRegUnitPressureSets(unsigned RegUnit) const { 2102 static const int Empty[] = { -1 }; 2103 2104 if (RegPressureIgnoredUnits[RegUnit]) 2105 return Empty; 2106 2107 return AMDGPUGenRegisterInfo::getRegUnitPressureSets(RegUnit); 2108 } 2109 2110 MCRegister SIRegisterInfo::getReturnAddressReg(const MachineFunction &MF) const { 2111 // Not a callee saved register. 2112 return AMDGPU::SGPR30_SGPR31; 2113 } 2114 2115 const TargetRegisterClass * 2116 SIRegisterInfo::getRegClassForSizeOnBank(unsigned Size, 2117 const RegisterBank &RB, 2118 const MachineRegisterInfo &MRI) const { 2119 switch (RB.getID()) { 2120 case AMDGPU::VGPRRegBankID: 2121 return getVGPRClassForBitWidth(std::max(32u, Size)); 2122 case AMDGPU::VCCRegBankID: 2123 assert(Size == 1); 2124 return isWave32 ? &AMDGPU::SReg_32_XM0_XEXECRegClass 2125 : &AMDGPU::SReg_64_XEXECRegClass; 2126 case AMDGPU::SGPRRegBankID: 2127 return getSGPRClassForBitWidth(std::max(32u, Size)); 2128 case AMDGPU::AGPRRegBankID: 2129 return getAGPRClassForBitWidth(std::max(32u, Size)); 2130 default: 2131 llvm_unreachable("unknown register bank"); 2132 } 2133 } 2134 2135 const TargetRegisterClass * 2136 SIRegisterInfo::getConstrainedRegClassForOperand(const MachineOperand &MO, 2137 const MachineRegisterInfo &MRI) const { 2138 const RegClassOrRegBank &RCOrRB = MRI.getRegClassOrRegBank(MO.getReg()); 2139 if (const RegisterBank *RB = RCOrRB.dyn_cast<const RegisterBank*>()) 2140 return getRegClassForTypeOnBank(MRI.getType(MO.getReg()), *RB, MRI); 2141 2142 const TargetRegisterClass *RC = RCOrRB.get<const TargetRegisterClass*>(); 2143 return getAllocatableClass(RC); 2144 } 2145 2146 MCRegister SIRegisterInfo::getVCC() const { 2147 return isWave32 ? AMDGPU::VCC_LO : AMDGPU::VCC; 2148 } 2149 2150 const TargetRegisterClass * 2151 SIRegisterInfo::getRegClass(unsigned RCID) const { 2152 switch ((int)RCID) { 2153 case AMDGPU::SReg_1RegClassID: 2154 return getBoolRC(); 2155 case AMDGPU::SReg_1_XEXECRegClassID: 2156 return isWave32 ? &AMDGPU::SReg_32_XM0_XEXECRegClass 2157 : &AMDGPU::SReg_64_XEXECRegClass; 2158 case -1: 2159 return nullptr; 2160 default: 2161 return AMDGPUGenRegisterInfo::getRegClass(RCID); 2162 } 2163 } 2164 2165 // Find reaching register definition 2166 MachineInstr *SIRegisterInfo::findReachingDef(Register Reg, unsigned SubReg, 2167 MachineInstr &Use, 2168 MachineRegisterInfo &MRI, 2169 LiveIntervals *LIS) const { 2170 auto &MDT = LIS->getAnalysis<MachineDominatorTree>(); 2171 SlotIndex UseIdx = LIS->getInstructionIndex(Use); 2172 SlotIndex DefIdx; 2173 2174 if (Reg.isVirtual()) { 2175 if (!LIS->hasInterval(Reg)) 2176 return nullptr; 2177 LiveInterval &LI = LIS->getInterval(Reg); 2178 LaneBitmask SubLanes = SubReg ? getSubRegIndexLaneMask(SubReg) 2179 : MRI.getMaxLaneMaskForVReg(Reg); 2180 VNInfo *V = nullptr; 2181 if (LI.hasSubRanges()) { 2182 for (auto &S : LI.subranges()) { 2183 if ((S.LaneMask & SubLanes) == SubLanes) { 2184 V = S.getVNInfoAt(UseIdx); 2185 break; 2186 } 2187 } 2188 } else { 2189 V = LI.getVNInfoAt(UseIdx); 2190 } 2191 if (!V) 2192 return nullptr; 2193 DefIdx = V->def; 2194 } else { 2195 // Find last def. 2196 for (MCRegUnitIterator Units(Reg.asMCReg(), this); Units.isValid(); 2197 ++Units) { 2198 LiveRange &LR = LIS->getRegUnit(*Units); 2199 if (VNInfo *V = LR.getVNInfoAt(UseIdx)) { 2200 if (!DefIdx.isValid() || 2201 MDT.dominates(LIS->getInstructionFromIndex(DefIdx), 2202 LIS->getInstructionFromIndex(V->def))) 2203 DefIdx = V->def; 2204 } else { 2205 return nullptr; 2206 } 2207 } 2208 } 2209 2210 MachineInstr *Def = LIS->getInstructionFromIndex(DefIdx); 2211 2212 if (!Def || !MDT.dominates(Def, &Use)) 2213 return nullptr; 2214 2215 assert(Def->modifiesRegister(Reg, this)); 2216 2217 return Def; 2218 } 2219 2220 MCPhysReg SIRegisterInfo::get32BitRegister(MCPhysReg Reg) const { 2221 assert(getRegSizeInBits(*getPhysRegClass(Reg)) <= 32); 2222 2223 for (const TargetRegisterClass &RC : { AMDGPU::VGPR_32RegClass, 2224 AMDGPU::SReg_32RegClass, 2225 AMDGPU::AGPR_32RegClass } ) { 2226 if (MCPhysReg Super = getMatchingSuperReg(Reg, AMDGPU::lo16, &RC)) 2227 return Super; 2228 } 2229 if (MCPhysReg Super = getMatchingSuperReg(Reg, AMDGPU::hi16, 2230 &AMDGPU::VGPR_32RegClass)) { 2231 return Super; 2232 } 2233 2234 return AMDGPU::NoRegister; 2235 } 2236 2237 bool SIRegisterInfo::isConstantPhysReg(MCRegister PhysReg) const { 2238 switch (PhysReg) { 2239 case AMDGPU::SGPR_NULL: 2240 case AMDGPU::SRC_SHARED_BASE: 2241 case AMDGPU::SRC_PRIVATE_BASE: 2242 case AMDGPU::SRC_SHARED_LIMIT: 2243 case AMDGPU::SRC_PRIVATE_LIMIT: 2244 return true; 2245 default: 2246 return false; 2247 } 2248 } 2249 2250 ArrayRef<MCPhysReg> 2251 SIRegisterInfo::getAllSGPR128(const MachineFunction &MF) const { 2252 return makeArrayRef(AMDGPU::SGPR_128RegClass.begin(), 2253 ST.getMaxNumSGPRs(MF) / 4); 2254 } 2255 2256 ArrayRef<MCPhysReg> 2257 SIRegisterInfo::getAllSGPR64(const MachineFunction &MF) const { 2258 return makeArrayRef(AMDGPU::SGPR_64RegClass.begin(), 2259 ST.getMaxNumSGPRs(MF) / 2); 2260 } 2261 2262 ArrayRef<MCPhysReg> 2263 SIRegisterInfo::getAllSGPR32(const MachineFunction &MF) const { 2264 return makeArrayRef(AMDGPU::SGPR_32RegClass.begin(), ST.getMaxNumSGPRs(MF)); 2265 } 2266