//===-- SIRegisterInfo.cpp - SI Register Information ---------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// SI implementation of the TargetRegisterInfo class.
//
//===----------------------------------------------------------------------===//

#include "SIRegisterInfo.h"
#include "AMDGPU.h"
#include "AMDGPURegisterBankInfo.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUInstPrinter.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/RegisterScavenging.h"

using namespace llvm;

#define GET_REGINFO_TARGET_DESC
#include "AMDGPUGenRegisterInfo.inc"

static cl::opt<bool> EnableSpillSGPRToVGPR(
    "amdgpu-spill-sgpr-to-vgpr",
    cl::desc("Enable spilling SGPRs to VGPRs"),
    cl::ReallyHidden,
    cl::init(true));

std::array<std::vector<int16_t>, 16> SIRegisterInfo::RegSplitParts;
std::array<std::array<uint16_t, 32>, 9> SIRegisterInfo::SubRegFromChannelTable;

// Map numbers of DWORDs to indexes in SubRegFromChannelTable.
// Valid indexes are shifted 1, such that a 0 mapping means unsupported.
// e.g. for 8 DWORDs (256-bit), SubRegFromChannelTableWidthMap[8] = 8,
// meaning index 7 in SubRegFromChannelTable.
static const std::array<unsigned, 17> SubRegFromChannelTableWidthMap = {
    0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 0, 0, 0, 0, 0, 0, 9};

namespace llvm {

// A temporary struct to spill SGPRs.
// This is mostly used to spill SGPRs to memory; spilling SGPRs into VGPR lanes
// emits just v_writelane and v_readlane.
//
// When spilling to memory, the SGPRs are written into VGPR lanes and the VGPR
// is saved to scratch (or the other way around for loads).
// For this, a VGPR is required where the needed lanes can be clobbered. The
// RegScavenger can provide a VGPR where currently active lanes can be
// clobbered, but we still need to save inactive lanes.
// The high-level steps are:
// - Try to scavenge SGPR(s) to save exec
// - Try to scavenge VGPR
// - Save the needed lanes of a TmpVGPR (all lanes, or only the inactive ones)
// - Spill/Restore SGPRs using TmpVGPR
// - Restore TmpVGPR
//
// To save all lanes of TmpVGPR, exec needs to be saved and modified. If we
// cannot scavenge temporary SGPRs to save exec, we use the following code:
// buffer_store_dword TmpVGPR ; only if active lanes need to be saved
// s_not exec, exec
// buffer_store_dword TmpVGPR ; save inactive lanes
// s_not exec, exec
struct SGPRSpillBuilder {
  struct PerVGPRData {
    unsigned PerVGPR;
    unsigned NumVGPRs;
    int64_t VGPRLanes;
  };

  // The SGPR to save
  Register SuperReg;
  MachineBasicBlock::iterator MI;
  ArrayRef<int16_t> SplitParts;
  unsigned NumSubRegs;
  bool IsKill;
  const DebugLoc &DL;

  /* When spilling to stack */
  // The SGPRs are written into this VGPR, which is then written to scratch
  // (or vice versa for loads).
  Register TmpVGPR = AMDGPU::NoRegister;
  // Temporary spill slot to save TmpVGPR to.
  int TmpVGPRIndex = 0;
  // True if TmpVGPR is live before the spill, false if it was scavenged.
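  // When it is live (prepare() had to fall back to v0), the lanes that will be
  // clobbered are first saved to the emergency stack slot and restored
  // afterwards; when it was scavenged, only the inactive lanes need preserving.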
  bool TmpVGPRLive = false;
  // Scavenged SGPR to save EXEC.
  Register SavedExecReg = AMDGPU::NoRegister;
  // Stack index to write the SGPRs to.
  int Index;
  unsigned EltSize = 4;

  RegScavenger *RS;
  MachineBasicBlock *MBB;
  MachineFunction &MF;
  SIMachineFunctionInfo &MFI;
  const SIInstrInfo &TII;
  const SIRegisterInfo &TRI;
  bool IsWave32;
  Register ExecReg;
  unsigned MovOpc;
  unsigned NotOpc;

  SGPRSpillBuilder(const SIRegisterInfo &TRI, const SIInstrInfo &TII,
                   bool IsWave32, MachineBasicBlock::iterator MI, int Index,
                   RegScavenger *RS)
      : SGPRSpillBuilder(TRI, TII, IsWave32, MI, MI->getOperand(0).getReg(),
                         MI->getOperand(0).isKill(), Index, RS) {}

  SGPRSpillBuilder(const SIRegisterInfo &TRI, const SIInstrInfo &TII,
                   bool IsWave32, MachineBasicBlock::iterator MI, Register Reg,
                   bool IsKill, int Index, RegScavenger *RS)
      : SuperReg(Reg), MI(MI), IsKill(IsKill), DL(MI->getDebugLoc()),
        Index(Index), RS(RS), MBB(MI->getParent()), MF(*MBB->getParent()),
        MFI(*MF.getInfo<SIMachineFunctionInfo>()), TII(TII), TRI(TRI),
        IsWave32(IsWave32) {
    const TargetRegisterClass *RC = TRI.getPhysRegClass(SuperReg);
    SplitParts = TRI.getRegSplitParts(RC, EltSize);
    NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size();

    if (IsWave32) {
      ExecReg = AMDGPU::EXEC_LO;
      MovOpc = AMDGPU::S_MOV_B32;
      NotOpc = AMDGPU::S_NOT_B32;
    } else {
      ExecReg = AMDGPU::EXEC;
      MovOpc = AMDGPU::S_MOV_B64;
      NotOpc = AMDGPU::S_NOT_B64;
    }

    assert(SuperReg != AMDGPU::M0 && "m0 should never spill");
    assert(SuperReg != AMDGPU::EXEC_LO && SuperReg != AMDGPU::EXEC_HI &&
           SuperReg != AMDGPU::EXEC && "exec should never spill");
  }

  PerVGPRData getPerVGPRData() {
    PerVGPRData Data;
    Data.PerVGPR = IsWave32 ? 32 : 64;
    Data.NumVGPRs = (NumSubRegs + (Data.PerVGPR - 1)) / Data.PerVGPR;
    Data.VGPRLanes = (1LL << std::min(Data.PerVGPR, NumSubRegs)) - 1LL;
    return Data;
  }

  // Tries to scavenge SGPRs to save EXEC and a VGPR. Uses v0 if no VGPR is
  // free.
  // Writes these instructions if an SGPR can be scavenged:
  // s_mov_b64 s[6:7], exec   ; Save exec
  // s_mov_b64 exec, 3        ; Wanted lanemask
  // buffer_store_dword v1    ; Write scavenged VGPR to emergency slot
  //
  // Writes these instructions if no SGPR can be scavenged:
  // buffer_store_dword v0    ; Only if no free VGPR was found
  // s_not_b64 exec, exec
  // buffer_store_dword v0    ; Save inactive lanes
  //                          ; exec stays inverted, it is flipped back in
  //                          ; restore.
  void prepare() {
    // Scavenged temporary VGPR to use. It must be scavenged once for any number
    // of spilled subregs.
    // FIXME: The liveness analysis is limited and does not tell if a register
    // is in use in lanes that are currently inactive. We can never be sure if
    // a register is actually in use in another lane, so we need to save all
    // used lanes of the chosen VGPR.
    assert(RS && "Cannot spill SGPR to memory without RegScavenger");
    TmpVGPR = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0, false);

    // Reserve temporary stack slot
    TmpVGPRIndex = MFI.getScavengeFI(MF.getFrameInfo(), TRI);
    if (TmpVGPR) {
      // Found a register that is dead in the currently active lanes; we only
      // need to spill inactive lanes.
      TmpVGPRLive = false;
    } else {
      // Pick v0 because it doesn't make a difference.
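      // Any VGPR would do: TmpVGPRLive is set below, so every lane that gets
      // clobbered is saved to the emergency slot first and restored afterwards.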
      TmpVGPR = AMDGPU::VGPR0;
      TmpVGPRLive = true;
    }

    // Try to scavenge SGPRs to save exec
    assert(!SavedExecReg && "Exec is already saved, refuse to save again");
    const TargetRegisterClass &RC =
        IsWave32 ? AMDGPU::SGPR_32RegClass : AMDGPU::SGPR_64RegClass;
    RS->setRegUsed(SuperReg);
    SavedExecReg = RS->scavengeRegister(&RC, MI, 0, false);

    int64_t VGPRLanes = getPerVGPRData().VGPRLanes;

    if (SavedExecReg) {
      RS->setRegUsed(SavedExecReg);
      // Set exec to needed lanes
      BuildMI(*MBB, MI, DL, TII.get(MovOpc), SavedExecReg).addReg(ExecReg);
      auto I =
          BuildMI(*MBB, MI, DL, TII.get(MovOpc), ExecReg).addImm(VGPRLanes);
      if (!TmpVGPRLive)
        I.addReg(TmpVGPR, RegState::ImplicitDefine);
      // Spill needed lanes
      TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ false);
    } else {
      // Spill active lanes
      if (TmpVGPRLive)
        TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ false,
                                    /*IsKill*/ false);
      // Spill inactive lanes
      auto I = BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg);
      if (!TmpVGPRLive)
        I.addReg(TmpVGPR, RegState::ImplicitDefine);
      I->getOperand(2).setIsDead(true); // Mark SCC as dead.
      TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ false);
    }
  }

  // Writes these instructions if an SGPR can be scavenged:
  // buffer_load_dword v1     ; Reload scavenged VGPR from emergency slot
  // s_waitcnt vmcnt(0)       ; If a free VGPR was found
  // s_mov_b64 exec, s[6:7]   ; Restore exec
  //
  // Writes these instructions if no SGPR can be scavenged:
  // buffer_load_dword v0     ; Restore inactive lanes
  // s_waitcnt vmcnt(0)       ; If a free VGPR was found
  // s_not_b64 exec, exec
  // buffer_load_dword v0     ; Only if no free VGPR was found
  void restore() {
    if (SavedExecReg) {
      // Restore used lanes
      TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ true,
                                  /*IsKill*/ false);
      // Restore exec
      auto I = BuildMI(*MBB, MI, DL, TII.get(MovOpc), ExecReg)
                   .addReg(SavedExecReg, RegState::Kill);
      // Add an implicit use of the load so it is not dead.
      // FIXME This inserts an unnecessary waitcnt
      if (!TmpVGPRLive) {
        I.addReg(TmpVGPR, RegState::ImplicitKill);
      }
    } else {
      // Restore inactive lanes
      TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ true,
                                  /*IsKill*/ false);
      auto I = BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg);
      if (!TmpVGPRLive)
        I.addReg(TmpVGPR, RegState::ImplicitKill);
      I->getOperand(2).setIsDead(true); // Mark SCC as dead.

      // Restore active lanes
      if (TmpVGPRLive)
        TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ true);
    }
  }

  // Write TmpVGPR to memory or read TmpVGPR from memory.
  // Either using a single buffer_load/store if exec is set to the needed mask
  // or using
  // buffer_load
  // s_not exec, exec
  // buffer_load
  // s_not exec, exec
  void readWriteTmpVGPR(unsigned Offset, bool IsLoad) {
    if (SavedExecReg) {
      // Spill needed lanes
      TRI.buildVGPRSpillLoadStore(*this, Index, Offset, IsLoad);
    } else {
      // Spill active lanes
      TRI.buildVGPRSpillLoadStore(*this, Index, Offset, IsLoad,
                                  /*IsKill*/ false);
      // Spill inactive lanes
      auto Not0 =
          BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg);
      Not0->getOperand(2).setIsDead(); // Mark SCC as dead.
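      // Exec is now inverted, so this second access covers the lanes that were
      // inactive above; the trailing s_not then restores the original exec mask.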
274 TRI.buildVGPRSpillLoadStore(*this, Index, Offset, IsLoad); 275 auto Not1 = BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg); 276 Not1->getOperand(2).setIsDead(); // Mark SCC as dead. 277 } 278 } 279 280 void setMI(MachineBasicBlock *NewMBB, MachineBasicBlock::iterator NewMI) { 281 assert(MBB->getParent() == &MF); 282 MI = NewMI; 283 MBB = NewMBB; 284 } 285 }; 286 287 } // namespace llvm 288 289 SIRegisterInfo::SIRegisterInfo(const GCNSubtarget &ST) 290 : AMDGPUGenRegisterInfo(AMDGPU::PC_REG, ST.getAMDGPUDwarfFlavour()), ST(ST), 291 SpillSGPRToVGPR(EnableSpillSGPRToVGPR), isWave32(ST.isWave32()) { 292 293 assert(getSubRegIndexLaneMask(AMDGPU::sub0).getAsInteger() == 3 && 294 getSubRegIndexLaneMask(AMDGPU::sub31).getAsInteger() == (3ULL << 62) && 295 (getSubRegIndexLaneMask(AMDGPU::lo16) | 296 getSubRegIndexLaneMask(AMDGPU::hi16)).getAsInteger() == 297 getSubRegIndexLaneMask(AMDGPU::sub0).getAsInteger() && 298 "getNumCoveredRegs() will not work with generated subreg masks!"); 299 300 RegPressureIgnoredUnits.resize(getNumRegUnits()); 301 RegPressureIgnoredUnits.set( 302 *MCRegUnitIterator(MCRegister::from(AMDGPU::M0), this)); 303 for (auto Reg : AMDGPU::VGPR_HI16RegClass) 304 RegPressureIgnoredUnits.set(*MCRegUnitIterator(Reg, this)); 305 306 // HACK: Until this is fully tablegen'd. 307 static llvm::once_flag InitializeRegSplitPartsFlag; 308 309 static auto InitializeRegSplitPartsOnce = [this]() { 310 for (unsigned Idx = 1, E = getNumSubRegIndices() - 1; Idx < E; ++Idx) { 311 unsigned Size = getSubRegIdxSize(Idx); 312 if (Size & 31) 313 continue; 314 std::vector<int16_t> &Vec = RegSplitParts[Size / 32 - 1]; 315 unsigned Pos = getSubRegIdxOffset(Idx); 316 if (Pos % Size) 317 continue; 318 Pos /= Size; 319 if (Vec.empty()) { 320 unsigned MaxNumParts = 1024 / Size; // Maximum register is 1024 bits. 321 Vec.resize(MaxNumParts); 322 } 323 Vec[Pos] = Idx; 324 } 325 }; 326 327 static llvm::once_flag InitializeSubRegFromChannelTableFlag; 328 329 static auto InitializeSubRegFromChannelTableOnce = [this]() { 330 for (auto &Row : SubRegFromChannelTable) 331 Row.fill(AMDGPU::NoSubRegister); 332 for (uint16_t Idx = 1; Idx < getNumSubRegIndices(); ++Idx) { 333 unsigned Width = AMDGPUSubRegIdxRanges[Idx].Size / 32; 334 unsigned Offset = AMDGPUSubRegIdxRanges[Idx].Offset / 32; 335 assert(Width < SubRegFromChannelTableWidthMap.size()); 336 Width = SubRegFromChannelTableWidthMap[Width]; 337 if (Width == 0) 338 continue; 339 unsigned TableIdx = Width - 1; 340 assert(TableIdx < SubRegFromChannelTable.size()); 341 assert(Offset < SubRegFromChannelTable[TableIdx].size()); 342 SubRegFromChannelTable[TableIdx][Offset] = Idx; 343 } 344 }; 345 346 llvm::call_once(InitializeRegSplitPartsFlag, InitializeRegSplitPartsOnce); 347 llvm::call_once(InitializeSubRegFromChannelTableFlag, 348 InitializeSubRegFromChannelTableOnce); 349 } 350 351 void SIRegisterInfo::reserveRegisterTuples(BitVector &Reserved, 352 MCRegister Reg) const { 353 MCRegAliasIterator R(Reg, this, true); 354 355 for (; R.isValid(); ++R) 356 Reserved.set(*R); 357 } 358 359 // Forced to be here by one .inc 360 const MCPhysReg *SIRegisterInfo::getCalleeSavedRegs( 361 const MachineFunction *MF) const { 362 CallingConv::ID CC = MF->getFunction().getCallingConv(); 363 switch (CC) { 364 case CallingConv::C: 365 case CallingConv::Fast: 366 case CallingConv::Cold: 367 return MF->getSubtarget<GCNSubtarget>().hasGFX90AInsts() 368 ? 
CSR_AMDGPU_HighRegs_With_AGPRs_SaveList 369 : CSR_AMDGPU_HighRegs_SaveList; 370 case CallingConv::AMDGPU_Gfx: 371 return MF->getSubtarget<GCNSubtarget>().hasGFX90AInsts() 372 ? CSR_AMDGPU_SI_Gfx_With_AGPRs_SaveList 373 : CSR_AMDGPU_SI_Gfx_SaveList; 374 default: { 375 // Dummy to not crash RegisterClassInfo. 376 static const MCPhysReg NoCalleeSavedReg = AMDGPU::NoRegister; 377 return &NoCalleeSavedReg; 378 } 379 } 380 } 381 382 const MCPhysReg * 383 SIRegisterInfo::getCalleeSavedRegsViaCopy(const MachineFunction *MF) const { 384 return nullptr; 385 } 386 387 const uint32_t *SIRegisterInfo::getCallPreservedMask(const MachineFunction &MF, 388 CallingConv::ID CC) const { 389 switch (CC) { 390 case CallingConv::C: 391 case CallingConv::Fast: 392 case CallingConv::Cold: 393 return MF.getSubtarget<GCNSubtarget>().hasGFX90AInsts() 394 ? CSR_AMDGPU_HighRegs_With_AGPRs_RegMask 395 : CSR_AMDGPU_HighRegs_RegMask; 396 case CallingConv::AMDGPU_Gfx: 397 return MF.getSubtarget<GCNSubtarget>().hasGFX90AInsts() 398 ? CSR_AMDGPU_SI_Gfx_With_AGPRs_RegMask 399 : CSR_AMDGPU_SI_Gfx_RegMask; 400 default: 401 return nullptr; 402 } 403 } 404 405 const uint32_t *SIRegisterInfo::getNoPreservedMask() const { 406 return CSR_AMDGPU_NoRegs_RegMask; 407 } 408 409 const TargetRegisterClass * 410 SIRegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC, 411 const MachineFunction &MF) const { 412 // FIXME: Should have a helper function like getEquivalentVGPRClass to get the 413 // equivalent AV class. If used one, the verifier will crash after 414 // RegBankSelect in the GISel flow. The aligned regclasses are not fully given 415 // until Instruction selection. 416 if (MF.getSubtarget<GCNSubtarget>().hasMAIInsts() && 417 (isVGPRClass(RC) || isAGPRClass(RC))) { 418 if (RC == &AMDGPU::VGPR_32RegClass || RC == &AMDGPU::AGPR_32RegClass) 419 return &AMDGPU::AV_32RegClass; 420 if (RC == &AMDGPU::VReg_64RegClass || RC == &AMDGPU::AReg_64RegClass) 421 return &AMDGPU::AV_64RegClass; 422 if (RC == &AMDGPU::VReg_64_Align2RegClass || 423 RC == &AMDGPU::AReg_64_Align2RegClass) 424 return &AMDGPU::AV_64_Align2RegClass; 425 if (RC == &AMDGPU::VReg_96RegClass || RC == &AMDGPU::AReg_96RegClass) 426 return &AMDGPU::AV_96RegClass; 427 if (RC == &AMDGPU::VReg_96_Align2RegClass || 428 RC == &AMDGPU::AReg_96_Align2RegClass) 429 return &AMDGPU::AV_96_Align2RegClass; 430 if (RC == &AMDGPU::VReg_128RegClass || RC == &AMDGPU::AReg_128RegClass) 431 return &AMDGPU::AV_128RegClass; 432 if (RC == &AMDGPU::VReg_128_Align2RegClass || 433 RC == &AMDGPU::AReg_128_Align2RegClass) 434 return &AMDGPU::AV_128_Align2RegClass; 435 if (RC == &AMDGPU::VReg_160RegClass || RC == &AMDGPU::AReg_160RegClass) 436 return &AMDGPU::AV_160RegClass; 437 if (RC == &AMDGPU::VReg_160_Align2RegClass || 438 RC == &AMDGPU::AReg_160_Align2RegClass) 439 return &AMDGPU::AV_160_Align2RegClass; 440 if (RC == &AMDGPU::VReg_192RegClass || RC == &AMDGPU::AReg_192RegClass) 441 return &AMDGPU::AV_192RegClass; 442 if (RC == &AMDGPU::VReg_192_Align2RegClass || 443 RC == &AMDGPU::AReg_192_Align2RegClass) 444 return &AMDGPU::AV_192_Align2RegClass; 445 if (RC == &AMDGPU::VReg_256RegClass || RC == &AMDGPU::AReg_256RegClass) 446 return &AMDGPU::AV_256RegClass; 447 if (RC == &AMDGPU::VReg_256_Align2RegClass || 448 RC == &AMDGPU::AReg_256_Align2RegClass) 449 return &AMDGPU::AV_256_Align2RegClass; 450 if (RC == &AMDGPU::VReg_512RegClass || RC == &AMDGPU::AReg_512RegClass) 451 return &AMDGPU::AV_512RegClass; 452 if (RC == &AMDGPU::VReg_512_Align2RegClass || 453 RC == 
&AMDGPU::AReg_512_Align2RegClass) 454 return &AMDGPU::AV_512_Align2RegClass; 455 if (RC == &AMDGPU::VReg_1024RegClass || RC == &AMDGPU::AReg_1024RegClass) 456 return &AMDGPU::AV_1024RegClass; 457 if (RC == &AMDGPU::VReg_1024_Align2RegClass || 458 RC == &AMDGPU::AReg_1024_Align2RegClass) 459 return &AMDGPU::AV_1024_Align2RegClass; 460 } 461 462 return TargetRegisterInfo::getLargestLegalSuperClass(RC, MF); 463 } 464 465 Register SIRegisterInfo::getFrameRegister(const MachineFunction &MF) const { 466 const SIFrameLowering *TFI = 467 MF.getSubtarget<GCNSubtarget>().getFrameLowering(); 468 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); 469 // During ISel lowering we always reserve the stack pointer in entry 470 // functions, but never actually want to reference it when accessing our own 471 // frame. If we need a frame pointer we use it, but otherwise we can just use 472 // an immediate "0" which we represent by returning NoRegister. 473 if (FuncInfo->isEntryFunction()) { 474 return TFI->hasFP(MF) ? FuncInfo->getFrameOffsetReg() : Register(); 475 } 476 return TFI->hasFP(MF) ? FuncInfo->getFrameOffsetReg() 477 : FuncInfo->getStackPtrOffsetReg(); 478 } 479 480 bool SIRegisterInfo::hasBasePointer(const MachineFunction &MF) const { 481 // When we need stack realignment, we can't reference off of the 482 // stack pointer, so we reserve a base pointer. 483 const MachineFrameInfo &MFI = MF.getFrameInfo(); 484 return MFI.getNumFixedObjects() && shouldRealignStack(MF); 485 } 486 487 Register SIRegisterInfo::getBaseRegister() const { return AMDGPU::SGPR34; } 488 489 const uint32_t *SIRegisterInfo::getAllVGPRRegMask() const { 490 return CSR_AMDGPU_AllVGPRs_RegMask; 491 } 492 493 const uint32_t *SIRegisterInfo::getAllAGPRRegMask() const { 494 return CSR_AMDGPU_AllAGPRs_RegMask; 495 } 496 497 const uint32_t *SIRegisterInfo::getAllVectorRegMask() const { 498 return CSR_AMDGPU_AllVectorRegs_RegMask; 499 } 500 501 const uint32_t *SIRegisterInfo::getAllAllocatableSRegMask() const { 502 return CSR_AMDGPU_AllAllocatableSRegs_RegMask; 503 } 504 505 unsigned SIRegisterInfo::getSubRegFromChannel(unsigned Channel, 506 unsigned NumRegs) { 507 assert(NumRegs < SubRegFromChannelTableWidthMap.size()); 508 unsigned NumRegIndex = SubRegFromChannelTableWidthMap[NumRegs]; 509 assert(NumRegIndex && "Not implemented"); 510 assert(Channel < SubRegFromChannelTable[NumRegIndex - 1].size()); 511 return SubRegFromChannelTable[NumRegIndex - 1][Channel]; 512 } 513 514 MCRegister SIRegisterInfo::reservedPrivateSegmentBufferReg( 515 const MachineFunction &MF) const { 516 unsigned BaseIdx = alignDown(ST.getMaxNumSGPRs(MF), 4) - 4; 517 MCRegister BaseReg(AMDGPU::SGPR_32RegClass.getRegister(BaseIdx)); 518 return getMatchingSuperReg(BaseReg, AMDGPU::sub0, &AMDGPU::SGPR_128RegClass); 519 } 520 521 BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { 522 BitVector Reserved(getNumRegs()); 523 Reserved.set(AMDGPU::MODE); 524 525 // EXEC_LO and EXEC_HI could be allocated and used as regular register, but 526 // this seems likely to result in bugs, so I'm marking them as reserved. 527 reserveRegisterTuples(Reserved, AMDGPU::EXEC); 528 reserveRegisterTuples(Reserved, AMDGPU::FLAT_SCR); 529 530 // M0 has to be reserved so that llvm accepts it as a live-in into a block. 531 reserveRegisterTuples(Reserved, AMDGPU::M0); 532 533 // Reserve src_vccz, src_execz, src_scc. 
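  // These encode reads of the VCCZ, EXECZ and SCC status bits; they are only
  // ever usable as source operands, so they must never be allocation candidates.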
534 reserveRegisterTuples(Reserved, AMDGPU::SRC_VCCZ); 535 reserveRegisterTuples(Reserved, AMDGPU::SRC_EXECZ); 536 reserveRegisterTuples(Reserved, AMDGPU::SRC_SCC); 537 538 // Reserve the memory aperture registers. 539 reserveRegisterTuples(Reserved, AMDGPU::SRC_SHARED_BASE); 540 reserveRegisterTuples(Reserved, AMDGPU::SRC_SHARED_LIMIT); 541 reserveRegisterTuples(Reserved, AMDGPU::SRC_PRIVATE_BASE); 542 reserveRegisterTuples(Reserved, AMDGPU::SRC_PRIVATE_LIMIT); 543 544 // Reserve src_pops_exiting_wave_id - support is not implemented in Codegen. 545 reserveRegisterTuples(Reserved, AMDGPU::SRC_POPS_EXITING_WAVE_ID); 546 547 // Reserve xnack_mask registers - support is not implemented in Codegen. 548 reserveRegisterTuples(Reserved, AMDGPU::XNACK_MASK); 549 550 // Reserve lds_direct register - support is not implemented in Codegen. 551 reserveRegisterTuples(Reserved, AMDGPU::LDS_DIRECT); 552 553 // Reserve Trap Handler registers - support is not implemented in Codegen. 554 reserveRegisterTuples(Reserved, AMDGPU::TBA); 555 reserveRegisterTuples(Reserved, AMDGPU::TMA); 556 reserveRegisterTuples(Reserved, AMDGPU::TTMP0_TTMP1); 557 reserveRegisterTuples(Reserved, AMDGPU::TTMP2_TTMP3); 558 reserveRegisterTuples(Reserved, AMDGPU::TTMP4_TTMP5); 559 reserveRegisterTuples(Reserved, AMDGPU::TTMP6_TTMP7); 560 reserveRegisterTuples(Reserved, AMDGPU::TTMP8_TTMP9); 561 reserveRegisterTuples(Reserved, AMDGPU::TTMP10_TTMP11); 562 reserveRegisterTuples(Reserved, AMDGPU::TTMP12_TTMP13); 563 reserveRegisterTuples(Reserved, AMDGPU::TTMP14_TTMP15); 564 565 // Reserve null register - it shall never be allocated 566 reserveRegisterTuples(Reserved, AMDGPU::SGPR_NULL); 567 568 // Disallow vcc_hi allocation in wave32. It may be allocated but most likely 569 // will result in bugs. 570 if (isWave32) { 571 Reserved.set(AMDGPU::VCC); 572 Reserved.set(AMDGPU::VCC_HI); 573 } 574 575 unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF); 576 unsigned TotalNumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs(); 577 for (unsigned i = MaxNumSGPRs; i < TotalNumSGPRs; ++i) { 578 unsigned Reg = AMDGPU::SGPR_32RegClass.getRegister(i); 579 reserveRegisterTuples(Reserved, Reg); 580 } 581 582 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 583 unsigned MaxNumVGPRs = ST.getMaxNumVGPRs(MF); 584 unsigned MaxNumAGPRs = MaxNumVGPRs; 585 unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs(); 586 587 if (ST.hasGFX90AInsts()) { 588 // In an entry function without calls and AGPRs used it is possible to use 589 // the whole register budget for VGPRs. 590 591 // TODO: it shall be possible to estimate maximum AGPR/VGPR pressure and 592 // split register file accordingly. 593 if (MFI->usesAGPRs(MF)) { 594 MaxNumVGPRs /= 2; 595 MaxNumAGPRs = MaxNumVGPRs; 596 } else { 597 if (MaxNumVGPRs > TotalNumVGPRs) { 598 MaxNumAGPRs = MaxNumVGPRs - TotalNumVGPRs; 599 MaxNumVGPRs = TotalNumVGPRs; 600 } else 601 MaxNumAGPRs = 0; 602 } 603 } 604 605 for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i) { 606 unsigned Reg = AMDGPU::VGPR_32RegClass.getRegister(i); 607 reserveRegisterTuples(Reserved, Reg); 608 } 609 610 for (unsigned i = MaxNumAGPRs; i < TotalNumVGPRs; ++i) { 611 unsigned Reg = AMDGPU::AGPR_32RegClass.getRegister(i); 612 reserveRegisterTuples(Reserved, Reg); 613 } 614 615 for (auto Reg : AMDGPU::SReg_32RegClass) { 616 Reserved.set(getSubReg(Reg, AMDGPU::hi16)); 617 Register Low = getSubReg(Reg, AMDGPU::lo16); 618 // This is to prevent BB vcc liveness errors. 
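  // Only the lo16 halves of ordinary SGPRs stay allocatable; the 16-bit halves
  // of VCC and the other special SReg_32 registers are reserved below.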
619 if (!AMDGPU::SGPR_LO16RegClass.contains(Low)) 620 Reserved.set(Low); 621 } 622 623 for (auto Reg : AMDGPU::AGPR_32RegClass) { 624 Reserved.set(getSubReg(Reg, AMDGPU::hi16)); 625 } 626 627 // Reserve all the rest AGPRs if there are no instructions to use it. 628 if (!ST.hasMAIInsts()) { 629 for (unsigned i = 0; i < MaxNumVGPRs; ++i) { 630 unsigned Reg = AMDGPU::AGPR_32RegClass.getRegister(i); 631 reserveRegisterTuples(Reserved, Reg); 632 } 633 } 634 635 Register ScratchRSrcReg = MFI->getScratchRSrcReg(); 636 if (ScratchRSrcReg != AMDGPU::NoRegister) { 637 // Reserve 4 SGPRs for the scratch buffer resource descriptor in case we need 638 // to spill. 639 // TODO: May need to reserve a VGPR if doing LDS spilling. 640 reserveRegisterTuples(Reserved, ScratchRSrcReg); 641 } 642 643 // We have to assume the SP is needed in case there are calls in the function, 644 // which is detected after the function is lowered. If we aren't really going 645 // to need SP, don't bother reserving it. 646 MCRegister StackPtrReg = MFI->getStackPtrOffsetReg(); 647 648 if (StackPtrReg) { 649 reserveRegisterTuples(Reserved, StackPtrReg); 650 assert(!isSubRegister(ScratchRSrcReg, StackPtrReg)); 651 } 652 653 MCRegister FrameReg = MFI->getFrameOffsetReg(); 654 if (FrameReg) { 655 reserveRegisterTuples(Reserved, FrameReg); 656 assert(!isSubRegister(ScratchRSrcReg, FrameReg)); 657 } 658 659 if (hasBasePointer(MF)) { 660 MCRegister BasePtrReg = getBaseRegister(); 661 reserveRegisterTuples(Reserved, BasePtrReg); 662 assert(!isSubRegister(ScratchRSrcReg, BasePtrReg)); 663 } 664 665 for (auto Reg : MFI->WWMReservedRegs) { 666 reserveRegisterTuples(Reserved, Reg.first); 667 } 668 669 // Reserve VGPRs used for SGPR spilling. 670 // Note we treat freezeReservedRegs unusually because we run register 671 // allocation in two phases. It's OK to re-freeze with new registers for the 672 // second run. 673 #if 0 674 for (auto &SpilledFI : MFI->sgpr_spill_vgprs()) { 675 for (auto &SpilledVGPR : SpilledFI.second) 676 reserveRegisterTuples(Reserved, SpilledVGPR.VGPR); 677 } 678 #endif 679 680 // FIXME: Stop using reserved registers for this. 681 for (MCPhysReg Reg : MFI->getAGPRSpillVGPRs()) 682 reserveRegisterTuples(Reserved, Reg); 683 684 for (MCPhysReg Reg : MFI->getVGPRSpillAGPRs()) 685 reserveRegisterTuples(Reserved, Reg); 686 687 for (auto SSpill : MFI->getSGPRSpillVGPRs()) 688 reserveRegisterTuples(Reserved, SSpill.VGPR); 689 690 return Reserved; 691 } 692 693 bool SIRegisterInfo::shouldRealignStack(const MachineFunction &MF) const { 694 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 695 // On entry, the base address is 0, so it can't possibly need any more 696 // alignment. 697 698 // FIXME: Should be able to specify the entry frame alignment per calling 699 // convention instead. 700 if (Info->isEntryFunction()) 701 return false; 702 703 return TargetRegisterInfo::shouldRealignStack(MF); 704 } 705 706 bool SIRegisterInfo::requiresRegisterScavenging(const MachineFunction &Fn) const { 707 const SIMachineFunctionInfo *Info = Fn.getInfo<SIMachineFunctionInfo>(); 708 if (Info->isEntryFunction()) { 709 const MachineFrameInfo &MFI = Fn.getFrameInfo(); 710 return MFI.hasStackObjects() || MFI.hasCalls(); 711 } 712 713 // May need scavenger for dealing with callee saved registers. 714 return true; 715 } 716 717 bool SIRegisterInfo::requiresFrameIndexScavenging( 718 const MachineFunction &MF) const { 719 // Do not use frame virtual registers. 
They used to be used for SGPRs, but 720 // once we reach PrologEpilogInserter, we can no longer spill SGPRs. If the 721 // scavenger fails, we can increment/decrement the necessary SGPRs to avoid a 722 // spill. 723 return false; 724 } 725 726 bool SIRegisterInfo::requiresFrameIndexReplacementScavenging( 727 const MachineFunction &MF) const { 728 const MachineFrameInfo &MFI = MF.getFrameInfo(); 729 return MFI.hasStackObjects(); 730 } 731 732 bool SIRegisterInfo::requiresVirtualBaseRegisters( 733 const MachineFunction &) const { 734 // There are no special dedicated stack or frame pointers. 735 return true; 736 } 737 738 int64_t SIRegisterInfo::getScratchInstrOffset(const MachineInstr *MI) const { 739 assert(SIInstrInfo::isMUBUF(*MI) || SIInstrInfo::isFLATScratch(*MI)); 740 741 int OffIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), 742 AMDGPU::OpName::offset); 743 return MI->getOperand(OffIdx).getImm(); 744 } 745 746 int64_t SIRegisterInfo::getFrameIndexInstrOffset(const MachineInstr *MI, 747 int Idx) const { 748 if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isFLATScratch(*MI)) 749 return 0; 750 751 assert((Idx == AMDGPU::getNamedOperandIdx(MI->getOpcode(), 752 AMDGPU::OpName::vaddr) || 753 (Idx == AMDGPU::getNamedOperandIdx(MI->getOpcode(), 754 AMDGPU::OpName::saddr))) && 755 "Should never see frame index on non-address operand"); 756 757 return getScratchInstrOffset(MI); 758 } 759 760 bool SIRegisterInfo::needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const { 761 if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isFLATScratch(*MI)) 762 return false; 763 764 int64_t FullOffset = Offset + getScratchInstrOffset(MI); 765 766 if (SIInstrInfo::isMUBUF(*MI)) 767 return !SIInstrInfo::isLegalMUBUFImmOffset(FullOffset); 768 769 const SIInstrInfo *TII = ST.getInstrInfo(); 770 return !TII->isLegalFLATOffset(FullOffset, AMDGPUAS::PRIVATE_ADDRESS, 771 SIInstrFlags::FlatScratch); 772 } 773 774 Register SIRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB, 775 int FrameIdx, 776 int64_t Offset) const { 777 MachineBasicBlock::iterator Ins = MBB->begin(); 778 DebugLoc DL; // Defaults to "unknown" 779 780 if (Ins != MBB->end()) 781 DL = Ins->getDebugLoc(); 782 783 MachineFunction *MF = MBB->getParent(); 784 const SIInstrInfo *TII = ST.getInstrInfo(); 785 MachineRegisterInfo &MRI = MF->getRegInfo(); 786 unsigned MovOpc = ST.enableFlatScratch() ? AMDGPU::S_MOV_B32 787 : AMDGPU::V_MOV_B32_e32; 788 789 Register BaseReg = MRI.createVirtualRegister( 790 ST.enableFlatScratch() ? &AMDGPU::SReg_32_XEXEC_HIRegClass 791 : &AMDGPU::VGPR_32RegClass); 792 793 if (Offset == 0) { 794 BuildMI(*MBB, Ins, DL, TII->get(MovOpc), BaseReg) 795 .addFrameIndex(FrameIdx); 796 return BaseReg; 797 } 798 799 Register OffsetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); 800 801 Register FIReg = MRI.createVirtualRegister( 802 ST.enableFlatScratch() ? 
&AMDGPU::SReg_32_XM0RegClass 803 : &AMDGPU::VGPR_32RegClass); 804 805 BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg) 806 .addImm(Offset); 807 BuildMI(*MBB, Ins, DL, TII->get(MovOpc), FIReg) 808 .addFrameIndex(FrameIdx); 809 810 if (ST.enableFlatScratch() ) { 811 BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::S_ADD_I32), BaseReg) 812 .addReg(OffsetReg, RegState::Kill) 813 .addReg(FIReg); 814 return BaseReg; 815 } 816 817 TII->getAddNoCarry(*MBB, Ins, DL, BaseReg) 818 .addReg(OffsetReg, RegState::Kill) 819 .addReg(FIReg) 820 .addImm(0); // clamp bit 821 822 return BaseReg; 823 } 824 825 void SIRegisterInfo::resolveFrameIndex(MachineInstr &MI, Register BaseReg, 826 int64_t Offset) const { 827 const SIInstrInfo *TII = ST.getInstrInfo(); 828 bool IsFlat = TII->isFLATScratch(MI); 829 830 #ifndef NDEBUG 831 // FIXME: Is it possible to be storing a frame index to itself? 832 bool SeenFI = false; 833 for (const MachineOperand &MO: MI.operands()) { 834 if (MO.isFI()) { 835 if (SeenFI) 836 llvm_unreachable("should not see multiple frame indices"); 837 838 SeenFI = true; 839 } 840 } 841 #endif 842 843 MachineOperand *FIOp = 844 TII->getNamedOperand(MI, IsFlat ? AMDGPU::OpName::saddr 845 : AMDGPU::OpName::vaddr); 846 847 MachineOperand *OffsetOp = TII->getNamedOperand(MI, AMDGPU::OpName::offset); 848 int64_t NewOffset = OffsetOp->getImm() + Offset; 849 850 assert(FIOp && FIOp->isFI() && "frame index must be address operand"); 851 assert(TII->isMUBUF(MI) || TII->isFLATScratch(MI)); 852 853 if (IsFlat) { 854 assert(TII->isLegalFLATOffset(NewOffset, AMDGPUAS::PRIVATE_ADDRESS, 855 SIInstrFlags::FlatScratch) && 856 "offset should be legal"); 857 FIOp->ChangeToRegister(BaseReg, false); 858 OffsetOp->setImm(NewOffset); 859 return; 860 } 861 862 #ifndef NDEBUG 863 MachineOperand *SOffset = TII->getNamedOperand(MI, AMDGPU::OpName::soffset); 864 assert(SOffset->isImm() && SOffset->getImm() == 0); 865 #endif 866 867 assert(SIInstrInfo::isLegalMUBUFImmOffset(NewOffset) && 868 "offset should be legal"); 869 870 FIOp->ChangeToRegister(BaseReg, false); 871 OffsetOp->setImm(NewOffset); 872 } 873 874 bool SIRegisterInfo::isFrameOffsetLegal(const MachineInstr *MI, 875 Register BaseReg, 876 int64_t Offset) const { 877 if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isFLATScratch(*MI)) 878 return false; 879 880 int64_t NewOffset = Offset + getScratchInstrOffset(MI); 881 882 if (SIInstrInfo::isMUBUF(*MI)) 883 return SIInstrInfo::isLegalMUBUFImmOffset(NewOffset); 884 885 const SIInstrInfo *TII = ST.getInstrInfo(); 886 return TII->isLegalFLATOffset(NewOffset, AMDGPUAS::PRIVATE_ADDRESS, 887 SIInstrFlags::FlatScratch); 888 } 889 890 const TargetRegisterClass *SIRegisterInfo::getPointerRegClass( 891 const MachineFunction &MF, unsigned Kind) const { 892 // This is inaccurate. It depends on the instruction and address space. The 893 // only place where we should hit this is for dealing with frame indexes / 894 // private accesses, so this is correct in that case. 
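  // Private (scratch) accesses use a 32-bit per-lane address held in a VGPR,
  // which is why the 32-bit VGPR class is the conservative answer here.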
895 return &AMDGPU::VGPR_32RegClass; 896 } 897 898 const TargetRegisterClass * 899 SIRegisterInfo::getCrossCopyRegClass(const TargetRegisterClass *RC) const { 900 if (isAGPRClass(RC) && !ST.hasGFX90AInsts()) 901 return getEquivalentVGPRClass(RC); 902 903 return RC; 904 } 905 906 static unsigned getNumSubRegsForSpillOp(unsigned Op) { 907 908 switch (Op) { 909 case AMDGPU::SI_SPILL_S1024_SAVE: 910 case AMDGPU::SI_SPILL_S1024_RESTORE: 911 case AMDGPU::SI_SPILL_V1024_SAVE: 912 case AMDGPU::SI_SPILL_V1024_RESTORE: 913 case AMDGPU::SI_SPILL_A1024_SAVE: 914 case AMDGPU::SI_SPILL_A1024_RESTORE: 915 case AMDGPU::SI_SPILL_AV1024_SAVE: 916 case AMDGPU::SI_SPILL_AV1024_RESTORE: 917 return 32; 918 case AMDGPU::SI_SPILL_S512_SAVE: 919 case AMDGPU::SI_SPILL_S512_RESTORE: 920 case AMDGPU::SI_SPILL_V512_SAVE: 921 case AMDGPU::SI_SPILL_V512_RESTORE: 922 case AMDGPU::SI_SPILL_A512_SAVE: 923 case AMDGPU::SI_SPILL_A512_RESTORE: 924 case AMDGPU::SI_SPILL_AV512_SAVE: 925 case AMDGPU::SI_SPILL_AV512_RESTORE: 926 return 16; 927 case AMDGPU::SI_SPILL_S256_SAVE: 928 case AMDGPU::SI_SPILL_S256_RESTORE: 929 case AMDGPU::SI_SPILL_V256_SAVE: 930 case AMDGPU::SI_SPILL_V256_RESTORE: 931 case AMDGPU::SI_SPILL_A256_SAVE: 932 case AMDGPU::SI_SPILL_A256_RESTORE: 933 case AMDGPU::SI_SPILL_AV256_SAVE: 934 case AMDGPU::SI_SPILL_AV256_RESTORE: 935 return 8; 936 case AMDGPU::SI_SPILL_S224_SAVE: 937 case AMDGPU::SI_SPILL_S224_RESTORE: 938 case AMDGPU::SI_SPILL_V224_SAVE: 939 case AMDGPU::SI_SPILL_V224_RESTORE: 940 case AMDGPU::SI_SPILL_A224_SAVE: 941 case AMDGPU::SI_SPILL_A224_RESTORE: 942 case AMDGPU::SI_SPILL_AV224_SAVE: 943 case AMDGPU::SI_SPILL_AV224_RESTORE: 944 return 7; 945 case AMDGPU::SI_SPILL_S192_SAVE: 946 case AMDGPU::SI_SPILL_S192_RESTORE: 947 case AMDGPU::SI_SPILL_V192_SAVE: 948 case AMDGPU::SI_SPILL_V192_RESTORE: 949 case AMDGPU::SI_SPILL_A192_SAVE: 950 case AMDGPU::SI_SPILL_A192_RESTORE: 951 case AMDGPU::SI_SPILL_AV192_SAVE: 952 case AMDGPU::SI_SPILL_AV192_RESTORE: 953 return 6; 954 case AMDGPU::SI_SPILL_S160_SAVE: 955 case AMDGPU::SI_SPILL_S160_RESTORE: 956 case AMDGPU::SI_SPILL_V160_SAVE: 957 case AMDGPU::SI_SPILL_V160_RESTORE: 958 case AMDGPU::SI_SPILL_A160_SAVE: 959 case AMDGPU::SI_SPILL_A160_RESTORE: 960 case AMDGPU::SI_SPILL_AV160_SAVE: 961 case AMDGPU::SI_SPILL_AV160_RESTORE: 962 return 5; 963 case AMDGPU::SI_SPILL_S128_SAVE: 964 case AMDGPU::SI_SPILL_S128_RESTORE: 965 case AMDGPU::SI_SPILL_V128_SAVE: 966 case AMDGPU::SI_SPILL_V128_RESTORE: 967 case AMDGPU::SI_SPILL_A128_SAVE: 968 case AMDGPU::SI_SPILL_A128_RESTORE: 969 case AMDGPU::SI_SPILL_AV128_SAVE: 970 case AMDGPU::SI_SPILL_AV128_RESTORE: 971 return 4; 972 case AMDGPU::SI_SPILL_S96_SAVE: 973 case AMDGPU::SI_SPILL_S96_RESTORE: 974 case AMDGPU::SI_SPILL_V96_SAVE: 975 case AMDGPU::SI_SPILL_V96_RESTORE: 976 case AMDGPU::SI_SPILL_A96_SAVE: 977 case AMDGPU::SI_SPILL_A96_RESTORE: 978 case AMDGPU::SI_SPILL_AV96_SAVE: 979 case AMDGPU::SI_SPILL_AV96_RESTORE: 980 return 3; 981 case AMDGPU::SI_SPILL_S64_SAVE: 982 case AMDGPU::SI_SPILL_S64_RESTORE: 983 case AMDGPU::SI_SPILL_V64_SAVE: 984 case AMDGPU::SI_SPILL_V64_RESTORE: 985 case AMDGPU::SI_SPILL_A64_SAVE: 986 case AMDGPU::SI_SPILL_A64_RESTORE: 987 case AMDGPU::SI_SPILL_AV64_SAVE: 988 case AMDGPU::SI_SPILL_AV64_RESTORE: 989 return 2; 990 case AMDGPU::SI_SPILL_S32_SAVE: 991 case AMDGPU::SI_SPILL_S32_RESTORE: 992 case AMDGPU::SI_SPILL_V32_SAVE: 993 case AMDGPU::SI_SPILL_V32_RESTORE: 994 case AMDGPU::SI_SPILL_A32_SAVE: 995 case AMDGPU::SI_SPILL_A32_RESTORE: 996 case AMDGPU::SI_SPILL_AV32_SAVE: 997 case 
AMDGPU::SI_SPILL_AV32_RESTORE: 998 return 1; 999 default: llvm_unreachable("Invalid spill opcode"); 1000 } 1001 } 1002 1003 static int getOffsetMUBUFStore(unsigned Opc) { 1004 switch (Opc) { 1005 case AMDGPU::BUFFER_STORE_DWORD_OFFEN: 1006 return AMDGPU::BUFFER_STORE_DWORD_OFFSET; 1007 case AMDGPU::BUFFER_STORE_BYTE_OFFEN: 1008 return AMDGPU::BUFFER_STORE_BYTE_OFFSET; 1009 case AMDGPU::BUFFER_STORE_SHORT_OFFEN: 1010 return AMDGPU::BUFFER_STORE_SHORT_OFFSET; 1011 case AMDGPU::BUFFER_STORE_DWORDX2_OFFEN: 1012 return AMDGPU::BUFFER_STORE_DWORDX2_OFFSET; 1013 case AMDGPU::BUFFER_STORE_DWORDX4_OFFEN: 1014 return AMDGPU::BUFFER_STORE_DWORDX4_OFFSET; 1015 case AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFEN: 1016 return AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFSET; 1017 case AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFEN: 1018 return AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFSET; 1019 default: 1020 return -1; 1021 } 1022 } 1023 1024 static int getOffsetMUBUFLoad(unsigned Opc) { 1025 switch (Opc) { 1026 case AMDGPU::BUFFER_LOAD_DWORD_OFFEN: 1027 return AMDGPU::BUFFER_LOAD_DWORD_OFFSET; 1028 case AMDGPU::BUFFER_LOAD_UBYTE_OFFEN: 1029 return AMDGPU::BUFFER_LOAD_UBYTE_OFFSET; 1030 case AMDGPU::BUFFER_LOAD_SBYTE_OFFEN: 1031 return AMDGPU::BUFFER_LOAD_SBYTE_OFFSET; 1032 case AMDGPU::BUFFER_LOAD_USHORT_OFFEN: 1033 return AMDGPU::BUFFER_LOAD_USHORT_OFFSET; 1034 case AMDGPU::BUFFER_LOAD_SSHORT_OFFEN: 1035 return AMDGPU::BUFFER_LOAD_SSHORT_OFFSET; 1036 case AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN: 1037 return AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET; 1038 case AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN: 1039 return AMDGPU::BUFFER_LOAD_DWORDX4_OFFSET; 1040 case AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFEN: 1041 return AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFSET; 1042 case AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFEN: 1043 return AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFSET; 1044 case AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFEN: 1045 return AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFSET; 1046 case AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFEN: 1047 return AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFSET; 1048 case AMDGPU::BUFFER_LOAD_SHORT_D16_OFFEN: 1049 return AMDGPU::BUFFER_LOAD_SHORT_D16_OFFSET; 1050 case AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFEN: 1051 return AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFSET; 1052 default: 1053 return -1; 1054 } 1055 } 1056 1057 static MachineInstrBuilder spillVGPRtoAGPR(const GCNSubtarget &ST, 1058 MachineBasicBlock &MBB, 1059 MachineBasicBlock::iterator MI, 1060 int Index, unsigned Lane, 1061 unsigned ValueReg, bool IsKill) { 1062 MachineFunction *MF = MBB.getParent(); 1063 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); 1064 const SIInstrInfo *TII = ST.getInstrInfo(); 1065 1066 MCPhysReg Reg = MFI->getVGPRToAGPRSpill(Index, Lane); 1067 1068 if (Reg == AMDGPU::NoRegister) 1069 return MachineInstrBuilder(); 1070 1071 bool IsStore = MI->mayStore(); 1072 MachineRegisterInfo &MRI = MF->getRegInfo(); 1073 auto *TRI = static_cast<const SIRegisterInfo*>(MRI.getTargetRegisterInfo()); 1074 1075 unsigned Dst = IsStore ? Reg : ValueReg; 1076 unsigned Src = IsStore ? ValueReg : Reg; 1077 bool IsVGPR = TRI->isVGPR(MRI, Reg); 1078 DebugLoc DL = MI->getDebugLoc(); 1079 if (IsVGPR == TRI->isVGPR(MRI, ValueReg)) { 1080 // Spiller during regalloc may restore a spilled register to its superclass. 1081 // It could result in AGPR spills restored to VGPRs or the other way around, 1082 // making the src and dst with identical regclasses at this point. It just 1083 // needs a copy in such cases. 
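  // When exactly one side is an AGPR, the V_ACCVGPR_READ/WRITE selection below
  // handles the cross-file move instead of a plain COPY.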
1084 auto CopyMIB = BuildMI(MBB, MI, DL, TII->get(AMDGPU::COPY), Dst) 1085 .addReg(Src, getKillRegState(IsKill)); 1086 CopyMIB->setAsmPrinterFlag(MachineInstr::ReloadReuse); 1087 return CopyMIB; 1088 } 1089 unsigned Opc = (IsStore ^ IsVGPR) ? AMDGPU::V_ACCVGPR_WRITE_B32_e64 1090 : AMDGPU::V_ACCVGPR_READ_B32_e64; 1091 1092 auto MIB = BuildMI(MBB, MI, DL, TII->get(Opc), Dst) 1093 .addReg(Src, getKillRegState(IsKill)); 1094 MIB->setAsmPrinterFlag(MachineInstr::ReloadReuse); 1095 return MIB; 1096 } 1097 1098 // This differs from buildSpillLoadStore by only scavenging a VGPR. It does not 1099 // need to handle the case where an SGPR may need to be spilled while spilling. 1100 static bool buildMUBUFOffsetLoadStore(const GCNSubtarget &ST, 1101 MachineFrameInfo &MFI, 1102 MachineBasicBlock::iterator MI, 1103 int Index, 1104 int64_t Offset) { 1105 const SIInstrInfo *TII = ST.getInstrInfo(); 1106 MachineBasicBlock *MBB = MI->getParent(); 1107 const DebugLoc &DL = MI->getDebugLoc(); 1108 bool IsStore = MI->mayStore(); 1109 1110 unsigned Opc = MI->getOpcode(); 1111 int LoadStoreOp = IsStore ? 1112 getOffsetMUBUFStore(Opc) : getOffsetMUBUFLoad(Opc); 1113 if (LoadStoreOp == -1) 1114 return false; 1115 1116 const MachineOperand *Reg = TII->getNamedOperand(*MI, AMDGPU::OpName::vdata); 1117 if (spillVGPRtoAGPR(ST, *MBB, MI, Index, 0, Reg->getReg(), false).getInstr()) 1118 return true; 1119 1120 MachineInstrBuilder NewMI = 1121 BuildMI(*MBB, MI, DL, TII->get(LoadStoreOp)) 1122 .add(*Reg) 1123 .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::srsrc)) 1124 .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)) 1125 .addImm(Offset) 1126 .addImm(0) // cpol 1127 .addImm(0) // tfe 1128 .addImm(0) // swz 1129 .cloneMemRefs(*MI); 1130 1131 const MachineOperand *VDataIn = TII->getNamedOperand(*MI, 1132 AMDGPU::OpName::vdata_in); 1133 if (VDataIn) 1134 NewMI.add(*VDataIn); 1135 return true; 1136 } 1137 1138 static unsigned getFlatScratchSpillOpcode(const SIInstrInfo *TII, 1139 unsigned LoadStoreOp, 1140 unsigned EltSize) { 1141 bool IsStore = TII->get(LoadStoreOp).mayStore(); 1142 bool UseST = 1143 AMDGPU::getNamedOperandIdx(LoadStoreOp, AMDGPU::OpName::vaddr) < 0 && 1144 AMDGPU::getNamedOperandIdx(LoadStoreOp, AMDGPU::OpName::saddr) < 0; 1145 1146 switch (EltSize) { 1147 case 4: 1148 LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORD_SADDR 1149 : AMDGPU::SCRATCH_LOAD_DWORD_SADDR; 1150 break; 1151 case 8: 1152 LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORDX2_SADDR 1153 : AMDGPU::SCRATCH_LOAD_DWORDX2_SADDR; 1154 break; 1155 case 12: 1156 LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORDX3_SADDR 1157 : AMDGPU::SCRATCH_LOAD_DWORDX3_SADDR; 1158 break; 1159 case 16: 1160 LoadStoreOp = IsStore ? 
AMDGPU::SCRATCH_STORE_DWORDX4_SADDR 1161 : AMDGPU::SCRATCH_LOAD_DWORDX4_SADDR; 1162 break; 1163 default: 1164 llvm_unreachable("Unexpected spill load/store size!"); 1165 } 1166 1167 if (UseST) 1168 LoadStoreOp = AMDGPU::getFlatScratchInstSTfromSS(LoadStoreOp); 1169 1170 return LoadStoreOp; 1171 } 1172 1173 void SIRegisterInfo::buildSpillLoadStore( 1174 MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, 1175 unsigned LoadStoreOp, int Index, Register ValueReg, bool IsKill, 1176 MCRegister ScratchOffsetReg, int64_t InstOffset, MachineMemOperand *MMO, 1177 RegScavenger *RS, LivePhysRegs *LiveRegs) const { 1178 assert((!RS || !LiveRegs) && "Only RS or LiveRegs can be set but not both"); 1179 1180 MachineFunction *MF = MBB.getParent(); 1181 const SIInstrInfo *TII = ST.getInstrInfo(); 1182 const MachineFrameInfo &MFI = MF->getFrameInfo(); 1183 const SIMachineFunctionInfo *FuncInfo = MF->getInfo<SIMachineFunctionInfo>(); 1184 1185 const MCInstrDesc *Desc = &TII->get(LoadStoreOp); 1186 bool IsStore = Desc->mayStore(); 1187 bool IsFlat = TII->isFLATScratch(LoadStoreOp); 1188 1189 bool Scavenged = false; 1190 MCRegister SOffset = ScratchOffsetReg; 1191 1192 const TargetRegisterClass *RC = getRegClassForReg(MF->getRegInfo(), ValueReg); 1193 // On gfx90a+ AGPR is a regular VGPR acceptable for loads and stores. 1194 const bool IsAGPR = !ST.hasGFX90AInsts() && isAGPRClass(RC); 1195 const unsigned RegWidth = AMDGPU::getRegBitWidth(RC->getID()) / 8; 1196 1197 // Always use 4 byte operations for AGPRs because we need to scavenge 1198 // a temporary VGPR. 1199 unsigned EltSize = (IsFlat && !IsAGPR) ? std::min(RegWidth, 16u) : 4u; 1200 unsigned NumSubRegs = RegWidth / EltSize; 1201 unsigned Size = NumSubRegs * EltSize; 1202 unsigned RemSize = RegWidth - Size; 1203 unsigned NumRemSubRegs = RemSize ? 1 : 0; 1204 int64_t Offset = InstOffset + MFI.getObjectOffset(Index); 1205 int64_t MaxOffset = Offset + Size + RemSize - EltSize; 1206 int64_t ScratchOffsetRegDelta = 0; 1207 1208 if (IsFlat && EltSize > 4) { 1209 LoadStoreOp = getFlatScratchSpillOpcode(TII, LoadStoreOp, EltSize); 1210 Desc = &TII->get(LoadStoreOp); 1211 } 1212 1213 Align Alignment = MFI.getObjectAlign(Index); 1214 const MachinePointerInfo &BasePtrInfo = MMO->getPointerInfo(); 1215 1216 assert((IsFlat || ((Offset % EltSize) == 0)) && 1217 "unexpected VGPR spill offset"); 1218 1219 bool IsOffsetLegal = 1220 IsFlat ? TII->isLegalFLATOffset(MaxOffset, AMDGPUAS::PRIVATE_ADDRESS, 1221 SIInstrFlags::FlatScratch) 1222 : SIInstrInfo::isLegalMUBUFImmOffset(MaxOffset); 1223 if (!IsOffsetLegal || (IsFlat && !SOffset && !ST.hasFlatScratchSTMode())) { 1224 SOffset = MCRegister(); 1225 1226 // We currently only support spilling VGPRs to EltSize boundaries, meaning 1227 // we can simplify the adjustment of Offset here to just scale with 1228 // WavefrontSize. 1229 if (!IsFlat) 1230 Offset *= ST.getWavefrontSize(); 1231 1232 // We don't have access to the register scavenger if this function is called 1233 // during PEI::scavengeFrameVirtualRegs() so use LiveRegs in this case. 1234 if (RS) { 1235 SOffset = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, MI, 0, false); 1236 } else if (LiveRegs) { 1237 for (MCRegister Reg : AMDGPU::SGPR_32RegClass) { 1238 if (LiveRegs->available(MF->getRegInfo(), Reg)) { 1239 SOffset = Reg; 1240 break; 1241 } 1242 } 1243 } 1244 1245 if (!SOffset) { 1246 // There are no free SGPRs, and since we are in the process of spilling 1247 // VGPRs too. 
Since we need a VGPR in order to spill SGPRs (this is true 1248 // on SI/CI and on VI it is true until we implement spilling using scalar 1249 // stores), we have no way to free up an SGPR. Our solution here is to 1250 // add the offset directly to the ScratchOffset or StackPtrOffset 1251 // register, and then subtract the offset after the spill to return the 1252 // register to it's original value. 1253 if (!ScratchOffsetReg) 1254 ScratchOffsetReg = FuncInfo->getStackPtrOffsetReg(); 1255 SOffset = ScratchOffsetReg; 1256 ScratchOffsetRegDelta = Offset; 1257 } else { 1258 Scavenged = true; 1259 } 1260 1261 if (!SOffset) 1262 report_fatal_error("could not scavenge SGPR to spill in entry function"); 1263 1264 if (ScratchOffsetReg == AMDGPU::NoRegister) { 1265 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), SOffset).addImm(Offset); 1266 } else { 1267 auto Add = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), SOffset) 1268 .addReg(ScratchOffsetReg) 1269 .addImm(Offset); 1270 Add->getOperand(3).setIsDead(); // Mark SCC as dead. 1271 } 1272 1273 Offset = 0; 1274 } 1275 1276 if (IsFlat && SOffset == AMDGPU::NoRegister) { 1277 assert(AMDGPU::getNamedOperandIdx(LoadStoreOp, AMDGPU::OpName::vaddr) < 0 1278 && "Unexpected vaddr for flat scratch with a FI operand"); 1279 1280 assert(ST.hasFlatScratchSTMode()); 1281 LoadStoreOp = AMDGPU::getFlatScratchInstSTfromSS(LoadStoreOp); 1282 Desc = &TII->get(LoadStoreOp); 1283 } 1284 1285 Register TmpReg; 1286 1287 for (unsigned i = 0, e = NumSubRegs + NumRemSubRegs, RegOffset = 0; i != e; 1288 ++i, RegOffset += EltSize) { 1289 if (i == NumSubRegs) { 1290 EltSize = RemSize; 1291 LoadStoreOp = getFlatScratchSpillOpcode(TII, LoadStoreOp, EltSize); 1292 } 1293 Desc = &TII->get(LoadStoreOp); 1294 1295 unsigned NumRegs = EltSize / 4; 1296 Register SubReg = e == 1 1297 ? ValueReg 1298 : Register(getSubReg(ValueReg, 1299 getSubRegFromChannel(RegOffset / 4, NumRegs))); 1300 1301 unsigned SOffsetRegState = 0; 1302 unsigned SrcDstRegState = getDefRegState(!IsStore); 1303 if (i + 1 == e) { 1304 SOffsetRegState |= getKillRegState(Scavenged); 1305 // The last implicit use carries the "Kill" flag. 1306 SrcDstRegState |= getKillRegState(IsKill); 1307 } 1308 1309 // Make sure the whole register is defined if there are undef components by 1310 // adding an implicit def of the super-reg on the first instruction. 1311 bool NeedSuperRegDef = e > 1 && IsStore && i == 0; 1312 bool NeedSuperRegImpOperand = e > 1; 1313 1314 // Remaining element size to spill into memory after some parts of it 1315 // spilled into either AGPRs or VGPRs. 1316 unsigned RemEltSize = EltSize; 1317 1318 // AGPRs to spill VGPRs and vice versa are allocated in a reverse order, 1319 // starting from the last lane. In case if a register cannot be completely 1320 // spilled into another register that will ensure its alignment does not 1321 // change. For targets with VGPR alignment requirement this is important 1322 // in case of flat scratch usage as we might get a scratch_load or 1323 // scratch_store of an unaligned register otherwise. 1324 for (int LaneS = (RegOffset + EltSize) / 4 - 1, Lane = LaneS, 1325 LaneE = RegOffset / 4; 1326 Lane >= LaneE; --Lane) { 1327 bool IsSubReg = e > 1 || EltSize > 4; 1328 Register Sub = IsSubReg 1329 ? 
Register(getSubReg(ValueReg, getSubRegFromChannel(Lane))) 1330 : ValueReg; 1331 auto MIB = spillVGPRtoAGPR(ST, MBB, MI, Index, Lane, Sub, IsKill); 1332 if (!MIB.getInstr()) 1333 break; 1334 if (NeedSuperRegDef || (IsSubReg && IsStore && Lane == LaneS && !i)) { 1335 MIB.addReg(ValueReg, RegState::ImplicitDefine); 1336 NeedSuperRegDef = false; 1337 } 1338 if (IsSubReg || NeedSuperRegImpOperand) { 1339 NeedSuperRegImpOperand = true; 1340 unsigned State = SrcDstRegState; 1341 if (Lane != LaneE) 1342 State &= ~RegState::Kill; 1343 MIB.addReg(ValueReg, RegState::Implicit | State); 1344 } 1345 RemEltSize -= 4; 1346 } 1347 1348 if (!RemEltSize) // Fully spilled into AGPRs. 1349 continue; 1350 1351 if (RemEltSize != EltSize) { // Partially spilled to AGPRs 1352 assert(IsFlat && EltSize > 4); 1353 1354 unsigned NumRegs = RemEltSize / 4; 1355 SubReg = Register(getSubReg(ValueReg, 1356 getSubRegFromChannel(RegOffset / 4, NumRegs))); 1357 unsigned Opc = getFlatScratchSpillOpcode(TII, LoadStoreOp, RemEltSize); 1358 Desc = &TII->get(Opc); 1359 } 1360 1361 unsigned FinalReg = SubReg; 1362 1363 if (IsAGPR) { 1364 assert(EltSize == 4); 1365 1366 if (!TmpReg) { 1367 assert(RS && "Needs to have RegScavenger to spill an AGPR!"); 1368 // FIXME: change to scavengeRegisterBackwards() 1369 TmpReg = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0); 1370 RS->setRegUsed(TmpReg); 1371 } 1372 if (IsStore) { 1373 auto AccRead = BuildMI(MBB, MI, DL, 1374 TII->get(AMDGPU::V_ACCVGPR_READ_B32_e64), TmpReg) 1375 .addReg(SubReg, getKillRegState(IsKill)); 1376 if (NeedSuperRegDef) 1377 AccRead.addReg(ValueReg, RegState::ImplicitDefine); 1378 AccRead->setAsmPrinterFlag(MachineInstr::ReloadReuse); 1379 } 1380 SubReg = TmpReg; 1381 } 1382 1383 MachinePointerInfo PInfo = BasePtrInfo.getWithOffset(RegOffset); 1384 MachineMemOperand *NewMMO = 1385 MF->getMachineMemOperand(PInfo, MMO->getFlags(), RemEltSize, 1386 commonAlignment(Alignment, RegOffset)); 1387 1388 auto MIB = 1389 BuildMI(MBB, MI, DL, *Desc) 1390 .addReg(SubReg, getDefRegState(!IsStore) | getKillRegState(IsKill)); 1391 if (!IsFlat) 1392 MIB.addReg(FuncInfo->getScratchRSrcReg()); 1393 1394 if (SOffset == AMDGPU::NoRegister) { 1395 if (!IsFlat) 1396 MIB.addImm(0); 1397 } else { 1398 MIB.addReg(SOffset, SOffsetRegState); 1399 } 1400 MIB.addImm(Offset + RegOffset) 1401 .addImm(0); // cpol 1402 if (!IsFlat) 1403 MIB.addImm(0) // tfe 1404 .addImm(0); // swz 1405 MIB.addMemOperand(NewMMO); 1406 1407 if (!IsAGPR && NeedSuperRegDef) 1408 MIB.addReg(ValueReg, RegState::ImplicitDefine); 1409 1410 if (!IsStore && TmpReg != AMDGPU::NoRegister) { 1411 MIB = BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), 1412 FinalReg) 1413 .addReg(TmpReg, RegState::Kill); 1414 MIB->setAsmPrinterFlag(MachineInstr::ReloadReuse); 1415 } 1416 1417 if (NeedSuperRegImpOperand) 1418 MIB.addReg(ValueReg, RegState::Implicit | SrcDstRegState); 1419 } 1420 1421 if (ScratchOffsetRegDelta != 0) { 1422 // Subtract the offset we added to the ScratchOffset register. 1423 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), SOffset) 1424 .addReg(SOffset) 1425 .addImm(-ScratchOffsetRegDelta); 1426 } 1427 } 1428 1429 void SIRegisterInfo::buildVGPRSpillLoadStore(SGPRSpillBuilder &SB, int Index, 1430 int Offset, bool IsLoad, 1431 bool IsKill) const { 1432 // Load/store VGPR 1433 MachineFrameInfo &FrameInfo = SB.MF.getFrameInfo(); 1434 assert(FrameInfo.getStackID(Index) != TargetStackID::SGPRSpill); 1435 1436 Register FrameReg = 1437 FrameInfo.isFixedObjectIndex(Index) && hasBasePointer(SB.MF) 1438 ? 
getBaseRegister() 1439 : getFrameRegister(SB.MF); 1440 1441 Align Alignment = FrameInfo.getObjectAlign(Index); 1442 MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(SB.MF, Index); 1443 MachineMemOperand *MMO = SB.MF.getMachineMemOperand( 1444 PtrInfo, IsLoad ? MachineMemOperand::MOLoad : MachineMemOperand::MOStore, 1445 SB.EltSize, Alignment); 1446 1447 if (IsLoad) { 1448 unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_LOAD_DWORD_SADDR 1449 : AMDGPU::BUFFER_LOAD_DWORD_OFFSET; 1450 buildSpillLoadStore(*SB.MBB, SB.MI, SB.DL, Opc, Index, SB.TmpVGPR, false, 1451 FrameReg, Offset * SB.EltSize, MMO, SB.RS); 1452 } else { 1453 unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_STORE_DWORD_SADDR 1454 : AMDGPU::BUFFER_STORE_DWORD_OFFSET; 1455 buildSpillLoadStore(*SB.MBB, SB.MI, SB.DL, Opc, Index, SB.TmpVGPR, IsKill, 1456 FrameReg, Offset * SB.EltSize, MMO, SB.RS); 1457 // This only ever adds one VGPR spill 1458 SB.MFI.addToSpilledVGPRs(1); 1459 } 1460 } 1461 1462 bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI, 1463 int Index, 1464 RegScavenger *RS, 1465 LiveIntervals *LIS, 1466 bool OnlyToVGPR) const { 1467 SGPRSpillBuilder SB(*this, *ST.getInstrInfo(), isWave32, MI, Index, RS); 1468 1469 ArrayRef<SIMachineFunctionInfo::SpilledReg> VGPRSpills = 1470 SB.MFI.getSGPRToVGPRSpills(Index); 1471 bool SpillToVGPR = !VGPRSpills.empty(); 1472 if (OnlyToVGPR && !SpillToVGPR) 1473 return false; 1474 1475 assert(SpillToVGPR || (SB.SuperReg != SB.MFI.getStackPtrOffsetReg() && 1476 SB.SuperReg != SB.MFI.getFrameOffsetReg())); 1477 1478 if (SpillToVGPR) { 1479 1480 assert(SB.NumSubRegs == VGPRSpills.size() && 1481 "Num of VGPR lanes should be equal to num of SGPRs spilled"); 1482 1483 for (unsigned i = 0, e = SB.NumSubRegs; i < e; ++i) { 1484 Register SubReg = 1485 SB.NumSubRegs == 1 1486 ? SB.SuperReg 1487 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i])); 1488 SIMachineFunctionInfo::SpilledReg Spill = VGPRSpills[i]; 1489 1490 bool UseKill = SB.IsKill && i == SB.NumSubRegs - 1; 1491 1492 // Mark the "old value of vgpr" input undef only if this is the first sgpr 1493 // spill to this specific vgpr in the first basic block. 1494 auto MIB = BuildMI(*SB.MBB, MI, SB.DL, 1495 SB.TII.get(AMDGPU::V_WRITELANE_B32), Spill.VGPR) 1496 .addReg(SubReg, getKillRegState(UseKill)) 1497 .addImm(Spill.Lane) 1498 .addReg(Spill.VGPR); 1499 if (LIS) { 1500 if (i == 0) 1501 LIS->ReplaceMachineInstrInMaps(*MI, *MIB); 1502 else 1503 LIS->InsertMachineInstrInMaps(*MIB); 1504 } 1505 1506 if (i == 0 && SB.NumSubRegs > 1) { 1507 // We may be spilling a super-register which is only partially defined, 1508 // and need to ensure later spills think the value is defined. 1509 MIB.addReg(SB.SuperReg, RegState::ImplicitDefine); 1510 } 1511 1512 if (SB.NumSubRegs > 1) 1513 MIB.addReg(SB.SuperReg, getKillRegState(UseKill) | RegState::Implicit); 1514 1515 // FIXME: Since this spills to another register instead of an actual 1516 // frame index, we should delete the frame index when all references to 1517 // it are fixed. 1518 } 1519 } else { 1520 SB.prepare(); 1521 1522 // SubReg carries the "Kill" flag when SubReg == SB.SuperReg. 
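    // Example: spilling a 16-dword SGPR tuple on a wave64 target yields
    // PerVGPR = 64, NumVGPRs = 1 and VGPRLanes = 0xffff, so all sixteen
    // 32-bit parts are packed into lanes 0-15 of a single TmpVGPR below.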
1523 unsigned SubKillState = getKillRegState((SB.NumSubRegs == 1) && SB.IsKill); 1524 1525 // Per VGPR helper data 1526 auto PVD = SB.getPerVGPRData(); 1527 1528 for (unsigned Offset = 0; Offset < PVD.NumVGPRs; ++Offset) { 1529 unsigned TmpVGPRFlags = RegState::Undef; 1530 1531 // Write sub registers into the VGPR 1532 for (unsigned i = Offset * PVD.PerVGPR, 1533 e = std::min((Offset + 1) * PVD.PerVGPR, SB.NumSubRegs); 1534 i < e; ++i) { 1535 Register SubReg = 1536 SB.NumSubRegs == 1 1537 ? SB.SuperReg 1538 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i])); 1539 1540 MachineInstrBuilder WriteLane = 1541 BuildMI(*SB.MBB, MI, SB.DL, SB.TII.get(AMDGPU::V_WRITELANE_B32), 1542 SB.TmpVGPR) 1543 .addReg(SubReg, SubKillState) 1544 .addImm(i % PVD.PerVGPR) 1545 .addReg(SB.TmpVGPR, TmpVGPRFlags); 1546 TmpVGPRFlags = 0; 1547 1548 if (LIS) { 1549 if (i == 0) 1550 LIS->ReplaceMachineInstrInMaps(*MI, *WriteLane); 1551 else 1552 LIS->InsertMachineInstrInMaps(*WriteLane); 1553 } 1554 1555 // There could be undef components of a spilled super register. 1556 // TODO: Can we detect this and skip the spill? 1557 if (SB.NumSubRegs > 1) { 1558 // The last implicit use of the SB.SuperReg carries the "Kill" flag. 1559 unsigned SuperKillState = 0; 1560 if (i + 1 == SB.NumSubRegs) 1561 SuperKillState |= getKillRegState(SB.IsKill); 1562 WriteLane.addReg(SB.SuperReg, RegState::Implicit | SuperKillState); 1563 } 1564 } 1565 1566 // Write out VGPR 1567 SB.readWriteTmpVGPR(Offset, /*IsLoad*/ false); 1568 } 1569 1570 SB.restore(); 1571 } 1572 1573 MI->eraseFromParent(); 1574 SB.MFI.addToSpilledSGPRs(SB.NumSubRegs); 1575 1576 if (LIS) 1577 LIS->removeAllRegUnitsForPhysReg(SB.SuperReg); 1578 1579 return true; 1580 } 1581 1582 bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI, 1583 int Index, 1584 RegScavenger *RS, 1585 LiveIntervals *LIS, 1586 bool OnlyToVGPR) const { 1587 SGPRSpillBuilder SB(*this, *ST.getInstrInfo(), isWave32, MI, Index, RS); 1588 1589 ArrayRef<SIMachineFunctionInfo::SpilledReg> VGPRSpills = 1590 SB.MFI.getSGPRToVGPRSpills(Index); 1591 bool SpillToVGPR = !VGPRSpills.empty(); 1592 if (OnlyToVGPR && !SpillToVGPR) 1593 return false; 1594 1595 if (SpillToVGPR) { 1596 for (unsigned i = 0, e = SB.NumSubRegs; i < e; ++i) { 1597 Register SubReg = 1598 SB.NumSubRegs == 1 1599 ? SB.SuperReg 1600 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i])); 1601 1602 SIMachineFunctionInfo::SpilledReg Spill = VGPRSpills[i]; 1603 auto MIB = BuildMI(*SB.MBB, MI, SB.DL, SB.TII.get(AMDGPU::V_READLANE_B32), 1604 SubReg) 1605 .addReg(Spill.VGPR) 1606 .addImm(Spill.Lane); 1607 if (SB.NumSubRegs > 1 && i == 0) 1608 MIB.addReg(SB.SuperReg, RegState::ImplicitDefine); 1609 if (LIS) { 1610 if (i == e - 1) 1611 LIS->ReplaceMachineInstrInMaps(*MI, *MIB); 1612 else 1613 LIS->InsertMachineInstrInMaps(*MIB); 1614 } 1615 1616 } 1617 } else { 1618 SB.prepare(); 1619 1620 // Per VGPR helper data 1621 auto PVD = SB.getPerVGPRData(); 1622 1623 for (unsigned Offset = 0; Offset < PVD.NumVGPRs; ++Offset) { 1624 // Load in VGPR data 1625 SB.readWriteTmpVGPR(Offset, /*IsLoad*/ true); 1626 1627 // Unpack lanes 1628 for (unsigned i = Offset * PVD.PerVGPR, 1629 e = std::min((Offset + 1) * PVD.PerVGPR, SB.NumSubRegs); 1630 i < e; ++i) { 1631 Register SubReg = 1632 SB.NumSubRegs == 1 1633 ? 
SB.SuperReg 1634 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i])); 1635 1636 bool LastSubReg = (i + 1 == e); 1637 auto MIB = BuildMI(*SB.MBB, MI, SB.DL, 1638 SB.TII.get(AMDGPU::V_READLANE_B32), SubReg) 1639 .addReg(SB.TmpVGPR, getKillRegState(LastSubReg)) 1640 .addImm(i); 1641 if (SB.NumSubRegs > 1 && i == 0) 1642 MIB.addReg(SB.SuperReg, RegState::ImplicitDefine); 1643 if (LIS) { 1644 if (i == e - 1) 1645 LIS->ReplaceMachineInstrInMaps(*MI, *MIB); 1646 else 1647 LIS->InsertMachineInstrInMaps(*MIB); 1648 } 1649 } 1650 } 1651 1652 SB.restore(); 1653 } 1654 1655 MI->eraseFromParent(); 1656 1657 if (LIS) 1658 LIS->removeAllRegUnitsForPhysReg(SB.SuperReg); 1659 1660 return true; 1661 } 1662 1663 bool SIRegisterInfo::spillEmergencySGPR(MachineBasicBlock::iterator MI, 1664 MachineBasicBlock &RestoreMBB, 1665 Register SGPR, RegScavenger *RS) const { 1666 SGPRSpillBuilder SB(*this, *ST.getInstrInfo(), isWave32, MI, SGPR, false, 0, 1667 RS); 1668 SB.prepare(); 1669 // Generate the spill of SGPR to SB.TmpVGPR. 1670 unsigned SubKillState = getKillRegState((SB.NumSubRegs == 1) && SB.IsKill); 1671 auto PVD = SB.getPerVGPRData(); 1672 for (unsigned Offset = 0; Offset < PVD.NumVGPRs; ++Offset) { 1673 unsigned TmpVGPRFlags = RegState::Undef; 1674 // Write sub registers into the VGPR 1675 for (unsigned i = Offset * PVD.PerVGPR, 1676 e = std::min((Offset + 1) * PVD.PerVGPR, SB.NumSubRegs); 1677 i < e; ++i) { 1678 Register SubReg = 1679 SB.NumSubRegs == 1 1680 ? SB.SuperReg 1681 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i])); 1682 1683 MachineInstrBuilder WriteLane = 1684 BuildMI(*SB.MBB, MI, SB.DL, SB.TII.get(AMDGPU::V_WRITELANE_B32), 1685 SB.TmpVGPR) 1686 .addReg(SubReg, SubKillState) 1687 .addImm(i % PVD.PerVGPR) 1688 .addReg(SB.TmpVGPR, TmpVGPRFlags); 1689 TmpVGPRFlags = 0; 1690 // There could be undef components of a spilled super register. 1691 // TODO: Can we detect this and skip the spill? 1692 if (SB.NumSubRegs > 1) { 1693 // The last implicit use of the SB.SuperReg carries the "Kill" flag. 1694 unsigned SuperKillState = 0; 1695 if (i + 1 == SB.NumSubRegs) 1696 SuperKillState |= getKillRegState(SB.IsKill); 1697 WriteLane.addReg(SB.SuperReg, RegState::Implicit | SuperKillState); 1698 } 1699 } 1700 // Don't need to write VGPR out. 1701 } 1702 1703 // Restore clobbered registers in the specified restore block. 1704 MI = RestoreMBB.end(); 1705 SB.setMI(&RestoreMBB, MI); 1706 // Generate the restore of SGPR from SB.TmpVGPR. 1707 for (unsigned Offset = 0; Offset < PVD.NumVGPRs; ++Offset) { 1708 // Don't need to load VGPR in. 1709 // Unpack lanes 1710 for (unsigned i = Offset * PVD.PerVGPR, 1711 e = std::min((Offset + 1) * PVD.PerVGPR, SB.NumSubRegs); 1712 i < e; ++i) { 1713 Register SubReg = 1714 SB.NumSubRegs == 1 1715 ? SB.SuperReg 1716 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i])); 1717 bool LastSubReg = (i + 1 == e); 1718 auto MIB = BuildMI(*SB.MBB, MI, SB.DL, SB.TII.get(AMDGPU::V_READLANE_B32), 1719 SubReg) 1720 .addReg(SB.TmpVGPR, getKillRegState(LastSubReg)) 1721 .addImm(i); 1722 if (SB.NumSubRegs > 1 && i == 0) 1723 MIB.addReg(SB.SuperReg, RegState::ImplicitDefine); 1724 } 1725 } 1726 SB.restore(); 1727 1728 SB.MFI.addToSpilledSGPRs(SB.NumSubRegs); 1729 return false; 1730 } 1731 1732 /// Special case of eliminateFrameIndex. Returns true if the SGPR was spilled to 1733 /// a VGPR and the stack slot can be safely eliminated when all other users are 1734 /// handled. 
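/// Only the SI_SPILL_S*_SAVE and SI_SPILL_S*_RESTORE pseudo instructions are
/// expected here; any other opcode hits llvm_unreachable in the switch below.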
1735 bool SIRegisterInfo::eliminateSGPRToVGPRSpillFrameIndex( 1736 MachineBasicBlock::iterator MI, 1737 int FI, 1738 RegScavenger *RS, 1739 LiveIntervals *LIS) const { 1740 switch (MI->getOpcode()) { 1741 case AMDGPU::SI_SPILL_S1024_SAVE: 1742 case AMDGPU::SI_SPILL_S512_SAVE: 1743 case AMDGPU::SI_SPILL_S256_SAVE: 1744 case AMDGPU::SI_SPILL_S224_SAVE: 1745 case AMDGPU::SI_SPILL_S192_SAVE: 1746 case AMDGPU::SI_SPILL_S160_SAVE: 1747 case AMDGPU::SI_SPILL_S128_SAVE: 1748 case AMDGPU::SI_SPILL_S96_SAVE: 1749 case AMDGPU::SI_SPILL_S64_SAVE: 1750 case AMDGPU::SI_SPILL_S32_SAVE: 1751 return spillSGPR(MI, FI, RS, LIS, true); 1752 case AMDGPU::SI_SPILL_S1024_RESTORE: 1753 case AMDGPU::SI_SPILL_S512_RESTORE: 1754 case AMDGPU::SI_SPILL_S256_RESTORE: 1755 case AMDGPU::SI_SPILL_S224_RESTORE: 1756 case AMDGPU::SI_SPILL_S192_RESTORE: 1757 case AMDGPU::SI_SPILL_S160_RESTORE: 1758 case AMDGPU::SI_SPILL_S128_RESTORE: 1759 case AMDGPU::SI_SPILL_S96_RESTORE: 1760 case AMDGPU::SI_SPILL_S64_RESTORE: 1761 case AMDGPU::SI_SPILL_S32_RESTORE: 1762 return restoreSGPR(MI, FI, RS, LIS, true); 1763 default: 1764 llvm_unreachable("not an SGPR spill instruction"); 1765 } 1766 } 1767 1768 void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, 1769 int SPAdj, unsigned FIOperandNum, 1770 RegScavenger *RS) const { 1771 MachineFunction *MF = MI->getParent()->getParent(); 1772 MachineBasicBlock *MBB = MI->getParent(); 1773 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); 1774 MachineFrameInfo &FrameInfo = MF->getFrameInfo(); 1775 const SIInstrInfo *TII = ST.getInstrInfo(); 1776 DebugLoc DL = MI->getDebugLoc(); 1777 1778 assert(SPAdj == 0 && "unhandled SP adjustment in call sequence?"); 1779 1780 MachineOperand &FIOp = MI->getOperand(FIOperandNum); 1781 int Index = MI->getOperand(FIOperandNum).getIndex(); 1782 1783 Register FrameReg = FrameInfo.isFixedObjectIndex(Index) && hasBasePointer(*MF) 1784 ? 
getBaseRegister() 1785 : getFrameRegister(*MF); 1786 1787 switch (MI->getOpcode()) { 1788 // SGPR register spill 1789 case AMDGPU::SI_SPILL_S1024_SAVE: 1790 case AMDGPU::SI_SPILL_S512_SAVE: 1791 case AMDGPU::SI_SPILL_S256_SAVE: 1792 case AMDGPU::SI_SPILL_S224_SAVE: 1793 case AMDGPU::SI_SPILL_S192_SAVE: 1794 case AMDGPU::SI_SPILL_S160_SAVE: 1795 case AMDGPU::SI_SPILL_S128_SAVE: 1796 case AMDGPU::SI_SPILL_S96_SAVE: 1797 case AMDGPU::SI_SPILL_S64_SAVE: 1798 case AMDGPU::SI_SPILL_S32_SAVE: { 1799 spillSGPR(MI, Index, RS); 1800 break; 1801 } 1802 1803 // SGPR register restore 1804 case AMDGPU::SI_SPILL_S1024_RESTORE: 1805 case AMDGPU::SI_SPILL_S512_RESTORE: 1806 case AMDGPU::SI_SPILL_S256_RESTORE: 1807 case AMDGPU::SI_SPILL_S224_RESTORE: 1808 case AMDGPU::SI_SPILL_S192_RESTORE: 1809 case AMDGPU::SI_SPILL_S160_RESTORE: 1810 case AMDGPU::SI_SPILL_S128_RESTORE: 1811 case AMDGPU::SI_SPILL_S96_RESTORE: 1812 case AMDGPU::SI_SPILL_S64_RESTORE: 1813 case AMDGPU::SI_SPILL_S32_RESTORE: { 1814 restoreSGPR(MI, Index, RS); 1815 break; 1816 } 1817 1818 // VGPR register spill 1819 case AMDGPU::SI_SPILL_V1024_SAVE: 1820 case AMDGPU::SI_SPILL_V512_SAVE: 1821 case AMDGPU::SI_SPILL_V256_SAVE: 1822 case AMDGPU::SI_SPILL_V224_SAVE: 1823 case AMDGPU::SI_SPILL_V192_SAVE: 1824 case AMDGPU::SI_SPILL_V160_SAVE: 1825 case AMDGPU::SI_SPILL_V128_SAVE: 1826 case AMDGPU::SI_SPILL_V96_SAVE: 1827 case AMDGPU::SI_SPILL_V64_SAVE: 1828 case AMDGPU::SI_SPILL_V32_SAVE: 1829 case AMDGPU::SI_SPILL_A1024_SAVE: 1830 case AMDGPU::SI_SPILL_A512_SAVE: 1831 case AMDGPU::SI_SPILL_A256_SAVE: 1832 case AMDGPU::SI_SPILL_A224_SAVE: 1833 case AMDGPU::SI_SPILL_A192_SAVE: 1834 case AMDGPU::SI_SPILL_A160_SAVE: 1835 case AMDGPU::SI_SPILL_A128_SAVE: 1836 case AMDGPU::SI_SPILL_A96_SAVE: 1837 case AMDGPU::SI_SPILL_A64_SAVE: 1838 case AMDGPU::SI_SPILL_A32_SAVE: 1839 case AMDGPU::SI_SPILL_AV1024_SAVE: 1840 case AMDGPU::SI_SPILL_AV512_SAVE: 1841 case AMDGPU::SI_SPILL_AV256_SAVE: 1842 case AMDGPU::SI_SPILL_AV224_SAVE: 1843 case AMDGPU::SI_SPILL_AV192_SAVE: 1844 case AMDGPU::SI_SPILL_AV160_SAVE: 1845 case AMDGPU::SI_SPILL_AV128_SAVE: 1846 case AMDGPU::SI_SPILL_AV96_SAVE: 1847 case AMDGPU::SI_SPILL_AV64_SAVE: 1848 case AMDGPU::SI_SPILL_AV32_SAVE: { 1849 const MachineOperand *VData = TII->getNamedOperand(*MI, 1850 AMDGPU::OpName::vdata); 1851 assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() == 1852 MFI->getStackPtrOffsetReg()); 1853 1854 unsigned Opc = ST.enableFlatScratch() ? 
AMDGPU::SCRATCH_STORE_DWORD_SADDR 1855 : AMDGPU::BUFFER_STORE_DWORD_OFFSET; 1856 auto *MBB = MI->getParent(); 1857 buildSpillLoadStore( 1858 *MBB, MI, DL, Opc, Index, VData->getReg(), VData->isKill(), FrameReg, 1859 TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(), 1860 *MI->memoperands_begin(), RS); 1861 MFI->addToSpilledVGPRs(getNumSubRegsForSpillOp(MI->getOpcode())); 1862 MI->eraseFromParent(); 1863 break; 1864 } 1865 case AMDGPU::SI_SPILL_V32_RESTORE: 1866 case AMDGPU::SI_SPILL_V64_RESTORE: 1867 case AMDGPU::SI_SPILL_V96_RESTORE: 1868 case AMDGPU::SI_SPILL_V128_RESTORE: 1869 case AMDGPU::SI_SPILL_V160_RESTORE: 1870 case AMDGPU::SI_SPILL_V192_RESTORE: 1871 case AMDGPU::SI_SPILL_V224_RESTORE: 1872 case AMDGPU::SI_SPILL_V256_RESTORE: 1873 case AMDGPU::SI_SPILL_V512_RESTORE: 1874 case AMDGPU::SI_SPILL_V1024_RESTORE: 1875 case AMDGPU::SI_SPILL_A32_RESTORE: 1876 case AMDGPU::SI_SPILL_A64_RESTORE: 1877 case AMDGPU::SI_SPILL_A96_RESTORE: 1878 case AMDGPU::SI_SPILL_A128_RESTORE: 1879 case AMDGPU::SI_SPILL_A160_RESTORE: 1880 case AMDGPU::SI_SPILL_A192_RESTORE: 1881 case AMDGPU::SI_SPILL_A224_RESTORE: 1882 case AMDGPU::SI_SPILL_A256_RESTORE: 1883 case AMDGPU::SI_SPILL_A512_RESTORE: 1884 case AMDGPU::SI_SPILL_A1024_RESTORE: 1885 case AMDGPU::SI_SPILL_AV32_RESTORE: 1886 case AMDGPU::SI_SPILL_AV64_RESTORE: 1887 case AMDGPU::SI_SPILL_AV96_RESTORE: 1888 case AMDGPU::SI_SPILL_AV128_RESTORE: 1889 case AMDGPU::SI_SPILL_AV160_RESTORE: 1890 case AMDGPU::SI_SPILL_AV192_RESTORE: 1891 case AMDGPU::SI_SPILL_AV224_RESTORE: 1892 case AMDGPU::SI_SPILL_AV256_RESTORE: 1893 case AMDGPU::SI_SPILL_AV512_RESTORE: 1894 case AMDGPU::SI_SPILL_AV1024_RESTORE: { 1895 const MachineOperand *VData = TII->getNamedOperand(*MI, 1896 AMDGPU::OpName::vdata); 1897 assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() == 1898 MFI->getStackPtrOffsetReg()); 1899 1900 unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_LOAD_DWORD_SADDR 1901 : AMDGPU::BUFFER_LOAD_DWORD_OFFSET; 1902 auto *MBB = MI->getParent(); 1903 buildSpillLoadStore( 1904 *MBB, MI, DL, Opc, Index, VData->getReg(), VData->isKill(), FrameReg, 1905 TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(), 1906 *MI->memoperands_begin(), RS); 1907 MI->eraseFromParent(); 1908 break; 1909 } 1910 1911 default: { 1912 // Other access to frame index 1913 const DebugLoc &DL = MI->getDebugLoc(); 1914 1915 int64_t Offset = FrameInfo.getObjectOffset(Index); 1916 if (ST.enableFlatScratch()) { 1917 if (TII->isFLATScratch(*MI)) { 1918 assert((int16_t)FIOperandNum == 1919 AMDGPU::getNamedOperandIdx(MI->getOpcode(), 1920 AMDGPU::OpName::saddr)); 1921 1922 // The offset is always swizzled, just replace it 1923 if (FrameReg) 1924 FIOp.ChangeToRegister(FrameReg, false); 1925 1926 if (!Offset) 1927 return; 1928 1929 MachineOperand *OffsetOp = 1930 TII->getNamedOperand(*MI, AMDGPU::OpName::offset); 1931 int64_t NewOffset = Offset + OffsetOp->getImm(); 1932 if (TII->isLegalFLATOffset(NewOffset, AMDGPUAS::PRIVATE_ADDRESS, 1933 SIInstrFlags::FlatScratch)) { 1934 OffsetOp->setImm(NewOffset); 1935 if (FrameReg) 1936 return; 1937 Offset = 0; 1938 } 1939 1940 assert(!TII->getNamedOperand(*MI, AMDGPU::OpName::vaddr) && 1941 "Unexpected vaddr for flat scratch with a FI operand"); 1942 1943 // On GFX10 we have ST mode to use no registers for an address. 1944 // Otherwise we need to materialize 0 into an SGPR. 
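      // For example (registers purely illustrative), once the offset has been
      // folded and no frame register is needed, an SS-form spill such as
      //   scratch_store_dword off, v2, s32 offset:8
      // can be rewritten to the ST form
      //   scratch_store_dword off, v2, off offset:8
      // by dropping the saddr operand below.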
1945 if (!Offset && ST.hasFlatScratchSTMode()) { 1946 unsigned Opc = MI->getOpcode(); 1947 unsigned NewOpc = AMDGPU::getFlatScratchInstSTfromSS(Opc); 1948 MI->RemoveOperand( 1949 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr)); 1950 MI->setDesc(TII->get(NewOpc)); 1951 return; 1952 } 1953 } 1954 1955 if (!FrameReg) { 1956 FIOp.ChangeToImmediate(Offset); 1957 if (TII->isImmOperandLegal(*MI, FIOperandNum, FIOp)) 1958 return; 1959 } 1960 1961 // We need to use register here. Check if we can use an SGPR or need 1962 // a VGPR. 1963 FIOp.ChangeToRegister(AMDGPU::M0, false); 1964 bool UseSGPR = TII->isOperandLegal(*MI, FIOperandNum, &FIOp); 1965 1966 if (!Offset && FrameReg && UseSGPR) { 1967 FIOp.setReg(FrameReg); 1968 return; 1969 } 1970 1971 const TargetRegisterClass *RC = UseSGPR ? &AMDGPU::SReg_32_XM0RegClass 1972 : &AMDGPU::VGPR_32RegClass; 1973 1974 Register TmpReg = RS->scavengeRegister(RC, MI, 0, !UseSGPR); 1975 FIOp.setReg(TmpReg); 1976 FIOp.setIsKill(true); 1977 1978 if ((!FrameReg || !Offset) && TmpReg) { 1979 unsigned Opc = UseSGPR ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32; 1980 auto MIB = BuildMI(*MBB, MI, DL, TII->get(Opc), TmpReg); 1981 if (FrameReg) 1982 MIB.addReg(FrameReg); 1983 else 1984 MIB.addImm(Offset); 1985 1986 return; 1987 } 1988 1989 Register TmpSReg = 1990 UseSGPR ? TmpReg 1991 : RS->scavengeRegister(&AMDGPU::SReg_32_XM0RegClass, MI, 0, 1992 !UseSGPR); 1993 1994 // TODO: for flat scratch another attempt can be made with a VGPR index 1995 // if no SGPRs can be scavenged. 1996 if ((!TmpSReg && !FrameReg) || (!TmpReg && !UseSGPR)) 1997 report_fatal_error("Cannot scavenge register in FI elimination!"); 1998 1999 if (!TmpSReg) { 2000 // Use frame register and restore it after. 2001 TmpSReg = FrameReg; 2002 FIOp.setReg(FrameReg); 2003 FIOp.setIsKill(false); 2004 } 2005 2006 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), TmpSReg) 2007 .addReg(FrameReg) 2008 .addImm(Offset); 2009 2010 if (!UseSGPR) 2011 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg) 2012 .addReg(TmpSReg, RegState::Kill); 2013 2014 if (TmpSReg == FrameReg) { 2015 // Undo frame register modification. 2016 BuildMI(*MBB, std::next(MI), DL, TII->get(AMDGPU::S_ADD_I32), 2017 FrameReg) 2018 .addReg(FrameReg) 2019 .addImm(-Offset); 2020 } 2021 2022 return; 2023 } 2024 2025 bool IsMUBUF = TII->isMUBUF(*MI); 2026 2027 if (!IsMUBUF && !MFI->isEntryFunction()) { 2028 // Convert to a swizzled stack address by scaling by the wave size. 2029 // 2030 // In an entry function/kernel the offset is already swizzled. 2031 2032 bool IsCopy = MI->getOpcode() == AMDGPU::V_MOV_B32_e32; 2033 Register ResultReg = 2034 IsCopy ? MI->getOperand(0).getReg() 2035 : RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0); 2036 2037 int64_t Offset = FrameInfo.getObjectOffset(Index); 2038 if (Offset == 0) { 2039 // XXX - This never happens because of emergency scavenging slot at 0? 2040 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64), ResultReg) 2041 .addImm(ST.getWavefrontSizeLog2()) 2042 .addReg(FrameReg); 2043 } else { 2044 if (auto MIB = TII->getAddNoCarry(*MBB, MI, DL, ResultReg, *RS)) { 2045 // Reuse ResultReg in intermediate step. 2046 Register ScaledReg = ResultReg; 2047 2048 BuildMI(*MBB, *MIB, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64), 2049 ScaledReg) 2050 .addImm(ST.getWavefrontSizeLog2()) 2051 .addReg(FrameReg); 2052 2053 const bool IsVOP2 = MIB->getOpcode() == AMDGPU::V_ADD_U32_e32; 2054 2055 // TODO: Fold if use instruction is another add of a constant. 
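        // The common expansion is roughly (wave64 shown; vR and sFP are
        // placeholders for the scavenged result register and frame register):
        //   v_lshrrev_b32_e64 vR, 6, sFP        ; unswizzle the frame pointer
        //   v_add_u32_e32     vR, <offset>, vR  ; or v_add_co_u32_e64 when a
        //                                       ; carry out is required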
2056 if (IsVOP2 || AMDGPU::isInlinableLiteral32(Offset, ST.hasInv2PiInlineImm())) { 2057 // FIXME: This can fail 2058 MIB.addImm(Offset); 2059 MIB.addReg(ScaledReg, RegState::Kill); 2060 if (!IsVOP2) 2061 MIB.addImm(0); // clamp bit 2062 } else { 2063 assert(MIB->getOpcode() == AMDGPU::V_ADD_CO_U32_e64 && 2064 "Need to reuse carry out register"); 2065 2066 // Use scavenged unused carry out as offset register. 2067 Register ConstOffsetReg; 2068 if (!isWave32) 2069 ConstOffsetReg = getSubReg(MIB.getReg(1), AMDGPU::sub0); 2070 else 2071 ConstOffsetReg = MIB.getReg(1); 2072 2073 BuildMI(*MBB, *MIB, DL, TII->get(AMDGPU::S_MOV_B32), ConstOffsetReg) 2074 .addImm(Offset); 2075 MIB.addReg(ConstOffsetReg, RegState::Kill); 2076 MIB.addReg(ScaledReg, RegState::Kill); 2077 MIB.addImm(0); // clamp bit 2078 } 2079 } else { 2080 // We have to produce a carry out, and there isn't a free SGPR pair 2081 // for it. We can keep the whole computation on the SALU to avoid 2082 // clobbering an additional register at the cost of an extra mov. 2083 2084 // We may have 1 free scratch SGPR even though a carry out is 2085 // unavailable. Only one additional mov is needed. 2086 Register TmpScaledReg = 2087 RS->scavengeRegister(&AMDGPU::SReg_32_XM0RegClass, MI, 0, false); 2088 Register ScaledReg = TmpScaledReg.isValid() ? TmpScaledReg : FrameReg; 2089 2090 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHR_B32), ScaledReg) 2091 .addReg(FrameReg) 2092 .addImm(ST.getWavefrontSizeLog2()); 2093 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), ScaledReg) 2094 .addReg(ScaledReg, RegState::Kill) 2095 .addImm(Offset); 2096 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), ResultReg) 2097 .addReg(ScaledReg, RegState::Kill); 2098 2099 // If there were truly no free SGPRs, we need to undo everything. 2100 if (!TmpScaledReg.isValid()) { 2101 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), ScaledReg) 2102 .addReg(ScaledReg, RegState::Kill) 2103 .addImm(-Offset); 2104 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHL_B32), ScaledReg) 2105 .addReg(FrameReg) 2106 .addImm(ST.getWavefrontSizeLog2()); 2107 } 2108 } 2109 } 2110 2111 // Don't introduce an extra copy if we're just materializing in a mov. 2112 if (IsCopy) 2113 MI->eraseFromParent(); 2114 else 2115 FIOp.ChangeToRegister(ResultReg, false, false, true); 2116 return; 2117 } 2118 2119 if (IsMUBUF) { 2120 // Disable offen so we don't need a 0 vgpr base. 2121 assert(static_cast<int>(FIOperandNum) == 2122 AMDGPU::getNamedOperandIdx(MI->getOpcode(), 2123 AMDGPU::OpName::vaddr)); 2124 2125 auto &SOffset = *TII->getNamedOperand(*MI, AMDGPU::OpName::soffset); 2126 assert((SOffset.isImm() && SOffset.getImm() == 0)); 2127 2128 if (FrameReg != AMDGPU::NoRegister) 2129 SOffset.ChangeToRegister(FrameReg, false); 2130 2131 int64_t Offset = FrameInfo.getObjectOffset(Index); 2132 int64_t OldImm 2133 = TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(); 2134 int64_t NewOffset = OldImm + Offset; 2135 2136 if (SIInstrInfo::isLegalMUBUFImmOffset(NewOffset) && 2137 buildMUBUFOffsetLoadStore(ST, FrameInfo, MI, Index, NewOffset)) { 2138 MI->eraseFromParent(); 2139 return; 2140 } 2141 } 2142 2143 // If the offset is simply too big, don't convert to a scratch wave offset 2144 // relative index. 
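    // Instead, use the raw frame-object offset: fold it as an immediate when
    // the operand allows it, otherwise materialize it in a scavenged VGPR, e.g.
    //   v_mov_b32_e32 vT, <offset>   ; vT stands for whatever VGPR is scavenged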
2145 2146 FIOp.ChangeToImmediate(Offset); 2147 if (!TII->isImmOperandLegal(*MI, FIOperandNum, FIOp)) { 2148 Register TmpReg = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0); 2149 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg) 2150 .addImm(Offset); 2151 FIOp.ChangeToRegister(TmpReg, false, false, true); 2152 } 2153 } 2154 } 2155 } 2156 2157 StringRef SIRegisterInfo::getRegAsmName(MCRegister Reg) const { 2158 return AMDGPUInstPrinter::getRegisterName(Reg); 2159 } 2160 2161 static const TargetRegisterClass * 2162 getAnyVGPRClassForBitWidth(unsigned BitWidth) { 2163 if (BitWidth <= 64) 2164 return &AMDGPU::VReg_64RegClass; 2165 if (BitWidth <= 96) 2166 return &AMDGPU::VReg_96RegClass; 2167 if (BitWidth <= 128) 2168 return &AMDGPU::VReg_128RegClass; 2169 if (BitWidth <= 160) 2170 return &AMDGPU::VReg_160RegClass; 2171 if (BitWidth <= 192) 2172 return &AMDGPU::VReg_192RegClass; 2173 if (BitWidth <= 224) 2174 return &AMDGPU::VReg_224RegClass; 2175 if (BitWidth <= 256) 2176 return &AMDGPU::VReg_256RegClass; 2177 if (BitWidth <= 512) 2178 return &AMDGPU::VReg_512RegClass; 2179 if (BitWidth <= 1024) 2180 return &AMDGPU::VReg_1024RegClass; 2181 2182 return nullptr; 2183 } 2184 2185 static const TargetRegisterClass * 2186 getAlignedVGPRClassForBitWidth(unsigned BitWidth) { 2187 if (BitWidth <= 64) 2188 return &AMDGPU::VReg_64_Align2RegClass; 2189 if (BitWidth <= 96) 2190 return &AMDGPU::VReg_96_Align2RegClass; 2191 if (BitWidth <= 128) 2192 return &AMDGPU::VReg_128_Align2RegClass; 2193 if (BitWidth <= 160) 2194 return &AMDGPU::VReg_160_Align2RegClass; 2195 if (BitWidth <= 192) 2196 return &AMDGPU::VReg_192_Align2RegClass; 2197 if (BitWidth <= 224) 2198 return &AMDGPU::VReg_224_Align2RegClass; 2199 if (BitWidth <= 256) 2200 return &AMDGPU::VReg_256_Align2RegClass; 2201 if (BitWidth <= 512) 2202 return &AMDGPU::VReg_512_Align2RegClass; 2203 if (BitWidth <= 1024) 2204 return &AMDGPU::VReg_1024_Align2RegClass; 2205 2206 return nullptr; 2207 } 2208 2209 const TargetRegisterClass * 2210 SIRegisterInfo::getVGPRClassForBitWidth(unsigned BitWidth) const { 2211 if (BitWidth == 1) 2212 return &AMDGPU::VReg_1RegClass; 2213 if (BitWidth <= 16) 2214 return &AMDGPU::VGPR_LO16RegClass; 2215 if (BitWidth <= 32) 2216 return &AMDGPU::VGPR_32RegClass; 2217 return ST.needsAlignedVGPRs() ? 
getAlignedVGPRClassForBitWidth(BitWidth) 2218 : getAnyVGPRClassForBitWidth(BitWidth); 2219 } 2220 2221 static const TargetRegisterClass * 2222 getAnyAGPRClassForBitWidth(unsigned BitWidth) { 2223 if (BitWidth <= 64) 2224 return &AMDGPU::AReg_64RegClass; 2225 if (BitWidth <= 96) 2226 return &AMDGPU::AReg_96RegClass; 2227 if (BitWidth <= 128) 2228 return &AMDGPU::AReg_128RegClass; 2229 if (BitWidth <= 160) 2230 return &AMDGPU::AReg_160RegClass; 2231 if (BitWidth <= 192) 2232 return &AMDGPU::AReg_192RegClass; 2233 if (BitWidth <= 224) 2234 return &AMDGPU::AReg_224RegClass; 2235 if (BitWidth <= 256) 2236 return &AMDGPU::AReg_256RegClass; 2237 if (BitWidth <= 512) 2238 return &AMDGPU::AReg_512RegClass; 2239 if (BitWidth <= 1024) 2240 return &AMDGPU::AReg_1024RegClass; 2241 2242 return nullptr; 2243 } 2244 2245 static const TargetRegisterClass * 2246 getAlignedAGPRClassForBitWidth(unsigned BitWidth) { 2247 if (BitWidth <= 64) 2248 return &AMDGPU::AReg_64_Align2RegClass; 2249 if (BitWidth <= 96) 2250 return &AMDGPU::AReg_96_Align2RegClass; 2251 if (BitWidth <= 128) 2252 return &AMDGPU::AReg_128_Align2RegClass; 2253 if (BitWidth <= 160) 2254 return &AMDGPU::AReg_160_Align2RegClass; 2255 if (BitWidth <= 192) 2256 return &AMDGPU::AReg_192_Align2RegClass; 2257 if (BitWidth <= 224) 2258 return &AMDGPU::AReg_224_Align2RegClass; 2259 if (BitWidth <= 256) 2260 return &AMDGPU::AReg_256_Align2RegClass; 2261 if (BitWidth <= 512) 2262 return &AMDGPU::AReg_512_Align2RegClass; 2263 if (BitWidth <= 1024) 2264 return &AMDGPU::AReg_1024_Align2RegClass; 2265 2266 return nullptr; 2267 } 2268 2269 const TargetRegisterClass * 2270 SIRegisterInfo::getAGPRClassForBitWidth(unsigned BitWidth) const { 2271 if (BitWidth <= 16) 2272 return &AMDGPU::AGPR_LO16RegClass; 2273 if (BitWidth <= 32) 2274 return &AMDGPU::AGPR_32RegClass; 2275 return ST.needsAlignedVGPRs() ? 
getAlignedAGPRClassForBitWidth(BitWidth) 2276 : getAnyAGPRClassForBitWidth(BitWidth); 2277 } 2278 2279 static const TargetRegisterClass * 2280 getAnyVectorSuperClassForBitWidth(unsigned BitWidth) { 2281 if (BitWidth <= 64) 2282 return &AMDGPU::AV_64RegClass; 2283 if (BitWidth <= 96) 2284 return &AMDGPU::AV_96RegClass; 2285 if (BitWidth <= 128) 2286 return &AMDGPU::AV_128RegClass; 2287 if (BitWidth <= 160) 2288 return &AMDGPU::AV_160RegClass; 2289 if (BitWidth <= 192) 2290 return &AMDGPU::AV_192RegClass; 2291 if (BitWidth <= 224) 2292 return &AMDGPU::AV_224RegClass; 2293 if (BitWidth <= 256) 2294 return &AMDGPU::AV_256RegClass; 2295 if (BitWidth <= 512) 2296 return &AMDGPU::AV_512RegClass; 2297 if (BitWidth <= 1024) 2298 return &AMDGPU::AV_1024RegClass; 2299 2300 return nullptr; 2301 } 2302 2303 static const TargetRegisterClass * 2304 getAlignedVectorSuperClassForBitWidth(unsigned BitWidth) { 2305 if (BitWidth <= 64) 2306 return &AMDGPU::AV_64_Align2RegClass; 2307 if (BitWidth <= 96) 2308 return &AMDGPU::AV_96_Align2RegClass; 2309 if (BitWidth <= 128) 2310 return &AMDGPU::AV_128_Align2RegClass; 2311 if (BitWidth <= 160) 2312 return &AMDGPU::AV_160_Align2RegClass; 2313 if (BitWidth <= 192) 2314 return &AMDGPU::AV_192_Align2RegClass; 2315 if (BitWidth <= 224) 2316 return &AMDGPU::AV_224_Align2RegClass; 2317 if (BitWidth <= 256) 2318 return &AMDGPU::AV_256_Align2RegClass; 2319 if (BitWidth <= 512) 2320 return &AMDGPU::AV_512_Align2RegClass; 2321 if (BitWidth <= 1024) 2322 return &AMDGPU::AV_1024_Align2RegClass; 2323 2324 return nullptr; 2325 } 2326 2327 const TargetRegisterClass * 2328 SIRegisterInfo::getVectorSuperClassForBitWidth(unsigned BitWidth) const { 2329 if (BitWidth <= 16) 2330 return &AMDGPU::VGPR_LO16RegClass; 2331 if (BitWidth <= 32) 2332 return &AMDGPU::AV_32RegClass; 2333 return ST.needsAlignedVGPRs() 2334 ? getAlignedVectorSuperClassForBitWidth(BitWidth) 2335 : getAnyVectorSuperClassForBitWidth(BitWidth); 2336 } 2337 2338 const TargetRegisterClass * 2339 SIRegisterInfo::getSGPRClassForBitWidth(unsigned BitWidth) { 2340 if (BitWidth <= 16) 2341 return &AMDGPU::SGPR_LO16RegClass; 2342 if (BitWidth <= 32) 2343 return &AMDGPU::SReg_32RegClass; 2344 if (BitWidth <= 64) 2345 return &AMDGPU::SReg_64RegClass; 2346 if (BitWidth <= 96) 2347 return &AMDGPU::SGPR_96RegClass; 2348 if (BitWidth <= 128) 2349 return &AMDGPU::SGPR_128RegClass; 2350 if (BitWidth <= 160) 2351 return &AMDGPU::SGPR_160RegClass; 2352 if (BitWidth <= 192) 2353 return &AMDGPU::SGPR_192RegClass; 2354 if (BitWidth <= 224) 2355 return &AMDGPU::SGPR_224RegClass; 2356 if (BitWidth <= 256) 2357 return &AMDGPU::SGPR_256RegClass; 2358 if (BitWidth <= 512) 2359 return &AMDGPU::SGPR_512RegClass; 2360 if (BitWidth <= 1024) 2361 return &AMDGPU::SGPR_1024RegClass; 2362 2363 return nullptr; 2364 } 2365 2366 // FIXME: This is very slow. It might be worth creating a map from physreg to 2367 // register class. 
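// The base classes are ordered from narrowest (16-bit) to widest (1024-bit),
// so the linear scan below returns the smallest class containing the register.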
2368 const TargetRegisterClass * 2369 SIRegisterInfo::getPhysRegClass(MCRegister Reg) const { 2370 static const TargetRegisterClass *const BaseClasses[] = { 2371 &AMDGPU::VGPR_LO16RegClass, 2372 &AMDGPU::VGPR_HI16RegClass, 2373 &AMDGPU::SReg_LO16RegClass, 2374 &AMDGPU::AGPR_LO16RegClass, 2375 &AMDGPU::VGPR_32RegClass, 2376 &AMDGPU::SReg_32RegClass, 2377 &AMDGPU::AGPR_32RegClass, 2378 &AMDGPU::AGPR_32RegClass, 2379 &AMDGPU::VReg_64_Align2RegClass, 2380 &AMDGPU::VReg_64RegClass, 2381 &AMDGPU::SReg_64RegClass, 2382 &AMDGPU::AReg_64_Align2RegClass, 2383 &AMDGPU::AReg_64RegClass, 2384 &AMDGPU::VReg_96_Align2RegClass, 2385 &AMDGPU::VReg_96RegClass, 2386 &AMDGPU::SReg_96RegClass, 2387 &AMDGPU::AReg_96_Align2RegClass, 2388 &AMDGPU::AReg_96RegClass, 2389 &AMDGPU::VReg_128_Align2RegClass, 2390 &AMDGPU::VReg_128RegClass, 2391 &AMDGPU::SReg_128RegClass, 2392 &AMDGPU::AReg_128_Align2RegClass, 2393 &AMDGPU::AReg_128RegClass, 2394 &AMDGPU::VReg_160_Align2RegClass, 2395 &AMDGPU::VReg_160RegClass, 2396 &AMDGPU::SReg_160RegClass, 2397 &AMDGPU::AReg_160_Align2RegClass, 2398 &AMDGPU::AReg_160RegClass, 2399 &AMDGPU::VReg_192_Align2RegClass, 2400 &AMDGPU::VReg_192RegClass, 2401 &AMDGPU::SReg_192RegClass, 2402 &AMDGPU::AReg_192_Align2RegClass, 2403 &AMDGPU::AReg_192RegClass, 2404 &AMDGPU::VReg_224_Align2RegClass, 2405 &AMDGPU::VReg_224RegClass, 2406 &AMDGPU::SReg_224RegClass, 2407 &AMDGPU::AReg_224_Align2RegClass, 2408 &AMDGPU::AReg_224RegClass, 2409 &AMDGPU::VReg_256_Align2RegClass, 2410 &AMDGPU::VReg_256RegClass, 2411 &AMDGPU::SReg_256RegClass, 2412 &AMDGPU::AReg_256_Align2RegClass, 2413 &AMDGPU::AReg_256RegClass, 2414 &AMDGPU::VReg_512_Align2RegClass, 2415 &AMDGPU::VReg_512RegClass, 2416 &AMDGPU::SReg_512RegClass, 2417 &AMDGPU::AReg_512_Align2RegClass, 2418 &AMDGPU::AReg_512RegClass, 2419 &AMDGPU::SReg_1024RegClass, 2420 &AMDGPU::VReg_1024_Align2RegClass, 2421 &AMDGPU::VReg_1024RegClass, 2422 &AMDGPU::AReg_1024_Align2RegClass, 2423 &AMDGPU::AReg_1024RegClass, 2424 &AMDGPU::SCC_CLASSRegClass, 2425 &AMDGPU::Pseudo_SReg_32RegClass, 2426 &AMDGPU::Pseudo_SReg_128RegClass, 2427 }; 2428 2429 for (const TargetRegisterClass *BaseClass : BaseClasses) { 2430 if (BaseClass->contains(Reg)) { 2431 return BaseClass; 2432 } 2433 } 2434 return nullptr; 2435 } 2436 2437 bool SIRegisterInfo::isSGPRReg(const MachineRegisterInfo &MRI, 2438 Register Reg) const { 2439 const TargetRegisterClass *RC; 2440 if (Reg.isVirtual()) 2441 RC = MRI.getRegClass(Reg); 2442 else 2443 RC = getPhysRegClass(Reg); 2444 return isSGPRClass(RC); 2445 } 2446 2447 const TargetRegisterClass * 2448 SIRegisterInfo::getEquivalentVGPRClass(const TargetRegisterClass *SRC) const { 2449 unsigned Size = getRegSizeInBits(*SRC); 2450 const TargetRegisterClass *VRC = getVGPRClassForBitWidth(Size); 2451 assert(VRC && "Invalid register class size"); 2452 return VRC; 2453 } 2454 2455 const TargetRegisterClass * 2456 SIRegisterInfo::getEquivalentAGPRClass(const TargetRegisterClass *SRC) const { 2457 unsigned Size = getRegSizeInBits(*SRC); 2458 const TargetRegisterClass *ARC = getAGPRClassForBitWidth(Size); 2459 assert(ARC && "Invalid register class size"); 2460 return ARC; 2461 } 2462 2463 const TargetRegisterClass * 2464 SIRegisterInfo::getEquivalentSGPRClass(const TargetRegisterClass *VRC) const { 2465 unsigned Size = getRegSizeInBits(*VRC); 2466 if (Size == 32) 2467 return &AMDGPU::SGPR_32RegClass; 2468 const TargetRegisterClass *SRC = getSGPRClassForBitWidth(Size); 2469 assert(SRC && "Invalid register class size"); 2470 return SRC; 2471 } 2472 2473 const 
TargetRegisterClass *SIRegisterInfo::getSubRegClass( 2474 const TargetRegisterClass *RC, unsigned SubIdx) const { 2475 if (SubIdx == AMDGPU::NoSubRegister) 2476 return RC; 2477 2478 // We can assume that each lane corresponds to one 32-bit register. 2479 unsigned Size = getNumChannelsFromSubReg(SubIdx) * 32; 2480 if (isAGPRClass(RC)) { 2481 RC = getAGPRClassForBitWidth(Size); 2482 } else if (isVGPRClass(RC)) { 2483 RC = getVGPRClassForBitWidth(Size); 2484 } else if (isVectorSuperClass(RC)) { 2485 RC = getVectorSuperClassForBitWidth(Size); 2486 } else { 2487 RC = getSGPRClassForBitWidth(Size); 2488 } 2489 assert(RC && "Invalid sub-register class size"); 2490 return RC; 2491 } 2492 2493 const TargetRegisterClass * 2494 SIRegisterInfo::getCompatibleSubRegClass(const TargetRegisterClass *SuperRC, 2495 const TargetRegisterClass *SubRC, 2496 unsigned SubIdx) const { 2497 // Ensure this subregister index is aligned in the super register. 2498 const TargetRegisterClass *MatchRC = 2499 getMatchingSuperRegClass(SuperRC, SubRC, SubIdx); 2500 return MatchRC && MatchRC->hasSubClassEq(SuperRC) ? MatchRC : nullptr; 2501 } 2502 2503 bool SIRegisterInfo::opCanUseInlineConstant(unsigned OpType) const { 2504 if (OpType >= AMDGPU::OPERAND_REG_INLINE_AC_FIRST && 2505 OpType <= AMDGPU::OPERAND_REG_INLINE_AC_LAST) 2506 return !ST.hasMFMAInlineLiteralBug(); 2507 2508 return OpType >= AMDGPU::OPERAND_SRC_FIRST && 2509 OpType <= AMDGPU::OPERAND_SRC_LAST; 2510 } 2511 2512 bool SIRegisterInfo::shouldRewriteCopySrc( 2513 const TargetRegisterClass *DefRC, 2514 unsigned DefSubReg, 2515 const TargetRegisterClass *SrcRC, 2516 unsigned SrcSubReg) const { 2517 // We want to prefer the smallest register class possible, so we don't want to 2518 // stop and rewrite on anything that looks like a subregister 2519 // extract. Operations mostly don't care about the super register class, so we 2520 // only want to stop on the most basic of copies between the same register 2521 // class. 2522 // 2523 // e.g. if we have something like 2524 // %0 = ... 2525 // %1 = ... 2526 // %2 = REG_SEQUENCE %0, sub0, %1, sub1, %2, sub2 2527 // %3 = COPY %2, sub0 2528 // 2529 // We want to look through the COPY to find: 2530 // => %3 = COPY %0 2531 2532 // Plain copy. 2533 return getCommonSubClass(DefRC, SrcRC) != nullptr; 2534 } 2535 2536 bool SIRegisterInfo::opCanUseLiteralConstant(unsigned OpType) const { 2537 // TODO: 64-bit operands have extending behavior from 32-bit literal. 2538 return OpType >= AMDGPU::OPERAND_REG_IMM_FIRST && 2539 OpType <= AMDGPU::OPERAND_REG_IMM_LAST; 2540 } 2541 2542 /// Returns a lowest register that is not used at any point in the function. 2543 /// If all registers are used, then this function will return 2544 /// AMDGPU::NoRegister. If \p ReserveHighestVGPR = true, then return 2545 /// highest unused register. 
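/// Only registers that are allocatable in \p MRI are considered; reserved
/// registers are skipped.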
2546 MCRegister SIRegisterInfo::findUnusedRegister(const MachineRegisterInfo &MRI,
2547                                               const TargetRegisterClass *RC,
2548                                               const MachineFunction &MF,
2549                                               bool ReserveHighestVGPR) const {
2550   if (ReserveHighestVGPR) {
2551     for (MCRegister Reg : reverse(*RC))
2552       if (MRI.isAllocatable(Reg) && !MRI.isPhysRegUsed(Reg))
2553         return Reg;
2554   } else {
2555     for (MCRegister Reg : *RC)
2556       if (MRI.isAllocatable(Reg) && !MRI.isPhysRegUsed(Reg))
2557         return Reg;
2558   }
2559   return MCRegister();
2560 }
2561 
2562 ArrayRef<int16_t> SIRegisterInfo::getRegSplitParts(const TargetRegisterClass *RC,
2563                                                    unsigned EltSize) const {
2564   const unsigned RegBitWidth = AMDGPU::getRegBitWidth(*RC->MC);
2565   assert(RegBitWidth >= 32 && RegBitWidth <= 1024);
2566 
2567   const unsigned RegDWORDs = RegBitWidth / 32;
2568   const unsigned EltDWORDs = EltSize / 4;
2569   assert(RegSplitParts.size() + 1 >= EltDWORDs);
2570 
2571   const std::vector<int16_t> &Parts = RegSplitParts[EltDWORDs - 1];
2572   const unsigned NumParts = RegDWORDs / EltDWORDs;
2573 
2574   return makeArrayRef(Parts.data(), NumParts);
2575 }
2576 
2577 const TargetRegisterClass*
2578 SIRegisterInfo::getRegClassForReg(const MachineRegisterInfo &MRI,
2579                                   Register Reg) const {
2580   return Reg.isVirtual() ? MRI.getRegClass(Reg) : getPhysRegClass(Reg);
2581 }
2582 
2583 bool SIRegisterInfo::isVGPR(const MachineRegisterInfo &MRI,
2584                             Register Reg) const {
2585   const TargetRegisterClass *RC = getRegClassForReg(MRI, Reg);
2586   // Registers without classes are unaddressable, SGPR-like registers.
2587   return RC && isVGPRClass(RC);
2588 }
2589 
2590 bool SIRegisterInfo::isAGPR(const MachineRegisterInfo &MRI,
2591                             Register Reg) const {
2592   const TargetRegisterClass *RC = getRegClassForReg(MRI, Reg);
2593 
2594   // Registers without classes are unaddressable, SGPR-like registers.
2595   return RC && isAGPRClass(RC);
2596 }
2597 
2598 bool SIRegisterInfo::shouldCoalesce(MachineInstr *MI,
2599                                     const TargetRegisterClass *SrcRC,
2600                                     unsigned SubReg,
2601                                     const TargetRegisterClass *DstRC,
2602                                     unsigned DstSubReg,
2603                                     const TargetRegisterClass *NewRC,
2604                                     LiveIntervals &LIS) const {
2605   unsigned SrcSize = getRegSizeInBits(*SrcRC);
2606   unsigned DstSize = getRegSizeInBits(*DstRC);
2607   unsigned NewSize = getRegSizeInBits(*NewRC);
2608 
2609   // Do not increase the size of registers beyond a dword; we would need to
2610   // allocate adjacent registers and constrain regalloc more than needed.
2611 
2612   // Always allow dword coalescing.
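  // (For example, merging two 64-bit registers into a 128-bit super-register
  // is rejected below, whereas any coalesce whose result is no wider than the
  // wider of the two inputs, or that involves a 32-bit register, is allowed.)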
2613 if (SrcSize <= 32 || DstSize <= 32) 2614 return true; 2615 2616 return NewSize <= DstSize || NewSize <= SrcSize; 2617 } 2618 2619 unsigned SIRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC, 2620 MachineFunction &MF) const { 2621 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2622 2623 unsigned Occupancy = ST.getOccupancyWithLocalMemSize(MFI->getLDSSize(), 2624 MF.getFunction()); 2625 switch (RC->getID()) { 2626 default: 2627 return AMDGPUGenRegisterInfo::getRegPressureLimit(RC, MF); 2628 case AMDGPU::VGPR_32RegClassID: 2629 case AMDGPU::VGPR_LO16RegClassID: 2630 case AMDGPU::VGPR_HI16RegClassID: 2631 return std::min(ST.getMaxNumVGPRs(Occupancy), ST.getMaxNumVGPRs(MF)); 2632 case AMDGPU::SGPR_32RegClassID: 2633 case AMDGPU::SGPR_LO16RegClassID: 2634 return std::min(ST.getMaxNumSGPRs(Occupancy, true), ST.getMaxNumSGPRs(MF)); 2635 } 2636 } 2637 2638 unsigned SIRegisterInfo::getRegPressureSetLimit(const MachineFunction &MF, 2639 unsigned Idx) const { 2640 if (Idx == AMDGPU::RegisterPressureSets::VGPR_32 || 2641 Idx == AMDGPU::RegisterPressureSets::AGPR_32) 2642 return getRegPressureLimit(&AMDGPU::VGPR_32RegClass, 2643 const_cast<MachineFunction &>(MF)); 2644 2645 if (Idx == AMDGPU::RegisterPressureSets::SReg_32) 2646 return getRegPressureLimit(&AMDGPU::SGPR_32RegClass, 2647 const_cast<MachineFunction &>(MF)); 2648 2649 llvm_unreachable("Unexpected register pressure set!"); 2650 } 2651 2652 const int *SIRegisterInfo::getRegUnitPressureSets(unsigned RegUnit) const { 2653 static const int Empty[] = { -1 }; 2654 2655 if (RegPressureIgnoredUnits[RegUnit]) 2656 return Empty; 2657 2658 return AMDGPUGenRegisterInfo::getRegUnitPressureSets(RegUnit); 2659 } 2660 2661 MCRegister SIRegisterInfo::getReturnAddressReg(const MachineFunction &MF) const { 2662 // Not a callee saved register. 2663 return AMDGPU::SGPR30_SGPR31; 2664 } 2665 2666 const TargetRegisterClass * 2667 SIRegisterInfo::getRegClassForSizeOnBank(unsigned Size, 2668 const RegisterBank &RB, 2669 const MachineRegisterInfo &MRI) const { 2670 switch (RB.getID()) { 2671 case AMDGPU::VGPRRegBankID: 2672 return getVGPRClassForBitWidth(std::max(32u, Size)); 2673 case AMDGPU::VCCRegBankID: 2674 assert(Size == 1); 2675 return isWave32 ? &AMDGPU::SReg_32_XM0_XEXECRegClass 2676 : &AMDGPU::SReg_64_XEXECRegClass; 2677 case AMDGPU::SGPRRegBankID: 2678 return getSGPRClassForBitWidth(std::max(32u, Size)); 2679 case AMDGPU::AGPRRegBankID: 2680 return getAGPRClassForBitWidth(std::max(32u, Size)); 2681 default: 2682 llvm_unreachable("unknown register bank"); 2683 } 2684 } 2685 2686 const TargetRegisterClass * 2687 SIRegisterInfo::getConstrainedRegClassForOperand(const MachineOperand &MO, 2688 const MachineRegisterInfo &MRI) const { 2689 const RegClassOrRegBank &RCOrRB = MRI.getRegClassOrRegBank(MO.getReg()); 2690 if (const RegisterBank *RB = RCOrRB.dyn_cast<const RegisterBank*>()) 2691 return getRegClassForTypeOnBank(MRI.getType(MO.getReg()), *RB, MRI); 2692 2693 if (const auto *RC = RCOrRB.dyn_cast<const TargetRegisterClass *>()) 2694 return getAllocatableClass(RC); 2695 2696 return nullptr; 2697 } 2698 2699 MCRegister SIRegisterInfo::getVCC() const { 2700 return isWave32 ? AMDGPU::VCC_LO : AMDGPU::VCC; 2701 } 2702 2703 const TargetRegisterClass *SIRegisterInfo::getVGPR64Class() const { 2704 // VGPR tuples have an alignment requirement on gfx90a variants. 2705 return ST.needsAlignedVGPRs() ? 
&AMDGPU::VReg_64_Align2RegClass 2706 : &AMDGPU::VReg_64RegClass; 2707 } 2708 2709 const TargetRegisterClass * 2710 SIRegisterInfo::getRegClass(unsigned RCID) const { 2711 switch ((int)RCID) { 2712 case AMDGPU::SReg_1RegClassID: 2713 return getBoolRC(); 2714 case AMDGPU::SReg_1_XEXECRegClassID: 2715 return isWave32 ? &AMDGPU::SReg_32_XM0_XEXECRegClass 2716 : &AMDGPU::SReg_64_XEXECRegClass; 2717 case -1: 2718 return nullptr; 2719 default: 2720 return AMDGPUGenRegisterInfo::getRegClass(RCID); 2721 } 2722 } 2723 2724 // Find reaching register definition 2725 MachineInstr *SIRegisterInfo::findReachingDef(Register Reg, unsigned SubReg, 2726 MachineInstr &Use, 2727 MachineRegisterInfo &MRI, 2728 LiveIntervals *LIS) const { 2729 auto &MDT = LIS->getAnalysis<MachineDominatorTree>(); 2730 SlotIndex UseIdx = LIS->getInstructionIndex(Use); 2731 SlotIndex DefIdx; 2732 2733 if (Reg.isVirtual()) { 2734 if (!LIS->hasInterval(Reg)) 2735 return nullptr; 2736 LiveInterval &LI = LIS->getInterval(Reg); 2737 LaneBitmask SubLanes = SubReg ? getSubRegIndexLaneMask(SubReg) 2738 : MRI.getMaxLaneMaskForVReg(Reg); 2739 VNInfo *V = nullptr; 2740 if (LI.hasSubRanges()) { 2741 for (auto &S : LI.subranges()) { 2742 if ((S.LaneMask & SubLanes) == SubLanes) { 2743 V = S.getVNInfoAt(UseIdx); 2744 break; 2745 } 2746 } 2747 } else { 2748 V = LI.getVNInfoAt(UseIdx); 2749 } 2750 if (!V) 2751 return nullptr; 2752 DefIdx = V->def; 2753 } else { 2754 // Find last def. 2755 for (MCRegUnitIterator Units(Reg.asMCReg(), this); Units.isValid(); 2756 ++Units) { 2757 LiveRange &LR = LIS->getRegUnit(*Units); 2758 if (VNInfo *V = LR.getVNInfoAt(UseIdx)) { 2759 if (!DefIdx.isValid() || 2760 MDT.dominates(LIS->getInstructionFromIndex(DefIdx), 2761 LIS->getInstructionFromIndex(V->def))) 2762 DefIdx = V->def; 2763 } else { 2764 return nullptr; 2765 } 2766 } 2767 } 2768 2769 MachineInstr *Def = LIS->getInstructionFromIndex(DefIdx); 2770 2771 if (!Def || !MDT.dominates(Def, &Use)) 2772 return nullptr; 2773 2774 assert(Def->modifiesRegister(Reg, this)); 2775 2776 return Def; 2777 } 2778 2779 MCPhysReg SIRegisterInfo::get32BitRegister(MCPhysReg Reg) const { 2780 assert(getRegSizeInBits(*getPhysRegClass(Reg)) <= 32); 2781 2782 for (const TargetRegisterClass &RC : { AMDGPU::VGPR_32RegClass, 2783 AMDGPU::SReg_32RegClass, 2784 AMDGPU::AGPR_32RegClass } ) { 2785 if (MCPhysReg Super = getMatchingSuperReg(Reg, AMDGPU::lo16, &RC)) 2786 return Super; 2787 } 2788 if (MCPhysReg Super = getMatchingSuperReg(Reg, AMDGPU::hi16, 2789 &AMDGPU::VGPR_32RegClass)) { 2790 return Super; 2791 } 2792 2793 return AMDGPU::NoRegister; 2794 } 2795 2796 bool SIRegisterInfo::isProperlyAlignedRC(const TargetRegisterClass &RC) const { 2797 if (!ST.needsAlignedVGPRs()) 2798 return true; 2799 2800 if (isVGPRClass(&RC)) 2801 return RC.hasSuperClassEq(getVGPRClassForBitWidth(getRegSizeInBits(RC))); 2802 if (isAGPRClass(&RC)) 2803 return RC.hasSuperClassEq(getAGPRClassForBitWidth(getRegSizeInBits(RC))); 2804 if (isVectorSuperClass(&RC)) 2805 return RC.hasSuperClassEq( 2806 getVectorSuperClassForBitWidth(getRegSizeInBits(RC))); 2807 2808 return true; 2809 } 2810 2811 bool SIRegisterInfo::isConstantPhysReg(MCRegister PhysReg) const { 2812 switch (PhysReg) { 2813 case AMDGPU::SGPR_NULL: 2814 case AMDGPU::SRC_SHARED_BASE: 2815 case AMDGPU::SRC_PRIVATE_BASE: 2816 case AMDGPU::SRC_SHARED_LIMIT: 2817 case AMDGPU::SRC_PRIVATE_LIMIT: 2818 return true; 2819 default: 2820 return false; 2821 } 2822 } 2823 2824 ArrayRef<MCPhysReg> 2825 SIRegisterInfo::getAllSGPR128(const MachineFunction &MF) 
const { 2826 return makeArrayRef(AMDGPU::SGPR_128RegClass.begin(), 2827 ST.getMaxNumSGPRs(MF) / 4); 2828 } 2829 2830 ArrayRef<MCPhysReg> 2831 SIRegisterInfo::getAllSGPR64(const MachineFunction &MF) const { 2832 return makeArrayRef(AMDGPU::SGPR_64RegClass.begin(), 2833 ST.getMaxNumSGPRs(MF) / 2); 2834 } 2835 2836 ArrayRef<MCPhysReg> 2837 SIRegisterInfo::getAllSGPR32(const MachineFunction &MF) const { 2838 return makeArrayRef(AMDGPU::SGPR_32RegClass.begin(), ST.getMaxNumSGPRs(MF)); 2839 } 2840