//===-- SIRegisterInfo.cpp - SI Register Information ---------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// SI implementation of the TargetRegisterInfo class.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPURegisterBankInfo.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUInstPrinter.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/LivePhysRegs.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/RegisterScavenging.h"

using namespace llvm;

#define GET_REGINFO_TARGET_DESC
#include "AMDGPUGenRegisterInfo.inc"

static cl::opt<bool> EnableSpillSGPRToVGPR(
    "amdgpu-spill-sgpr-to-vgpr",
    cl::desc("Enable spilling SGPRs to VGPRs"),
    cl::ReallyHidden,
    cl::init(true));

std::array<std::vector<int16_t>, 16> SIRegisterInfo::RegSplitParts;
std::array<std::array<uint16_t, 32>, 9> SIRegisterInfo::SubRegFromChannelTable;

// Map numbers of DWORDs to indexes in SubRegFromChannelTable.
// Valid indexes are shifted 1, such that a 0 mapping means unsupported.
// e.g. for 8 DWORDs (256-bit), SubRegFromChannelTableWidthMap[8] = 8,
// meaning index 7 in SubRegFromChannelTable.
static const std::array<unsigned, 17> SubRegFromChannelTableWidthMap = {
    0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 0, 0, 0, 0, 0, 0, 9};

namespace llvm {

// A temporary struct to spill SGPRs.
// This is mostly to spill SGPRs to memory. Spilling SGPRs into VGPR lanes emits
// just v_writelane and v_readlane.
//
// When spilling to memory, the SGPRs are written into VGPR lanes and the VGPR
// is saved to scratch (or the other way around for loads).
// For this, a VGPR is required where the needed lanes can be clobbered. The
// RegScavenger can provide a VGPR where currently active lanes can be
// clobbered, but we still need to save inactive lanes.
// The high-level steps are:
// - Try to scavenge SGPR(s) to save exec
// - Try to scavenge VGPR
// - Save the needed lanes of a TmpVGPR (all lanes, or only the inactive ones)
// - Spill/Restore SGPRs using TmpVGPR
// - Restore TmpVGPR
//
// To save all lanes of TmpVGPR, exec needs to be saved and modified. If we
// cannot scavenge temporary SGPRs to save exec, we use the following code:
// buffer_store_dword TmpVGPR ; only if active lanes need to be saved
// s_not exec, exec
// buffer_store_dword TmpVGPR ; save inactive lanes
// s_not exec, exec
struct SGPRSpillBuilder {
  struct PerVGPRData {
    unsigned PerVGPR;
    unsigned NumVGPRs;
    int64_t VGPRLanes;
  };

  // The SGPR to save
  Register SuperReg;
  MachineBasicBlock::iterator MI;
  ArrayRef<int16_t> SplitParts;
  unsigned NumSubRegs;
  bool IsKill;
  const DebugLoc &DL;

  /* When spilling to stack */
  // The SGPRs are written into this VGPR, which is then written to scratch
  // (or vice versa for loads).
  Register TmpVGPR = AMDGPU::NoRegister;
  // Temporary spill slot to save TmpVGPR to.
  int TmpVGPRIndex = 0;
  // If TmpVGPR is live before the spill or if it is scavenged.
  bool TmpVGPRLive = false;
  // Scavenged SGPR to save EXEC.
  Register SavedExecReg = AMDGPU::NoRegister;
  // Stack index to write the SGPRs to.
  int Index;
  unsigned EltSize = 4;

  RegScavenger *RS;
  MachineBasicBlock *MBB;
  MachineFunction &MF;
  SIMachineFunctionInfo &MFI;
  const SIInstrInfo &TII;
  const SIRegisterInfo &TRI;
  bool IsWave32;
  Register ExecReg;
  unsigned MovOpc;
  unsigned NotOpc;

  SGPRSpillBuilder(const SIRegisterInfo &TRI, const SIInstrInfo &TII,
                   bool IsWave32, MachineBasicBlock::iterator MI, int Index,
                   RegScavenger *RS)
      : SGPRSpillBuilder(TRI, TII, IsWave32, MI, MI->getOperand(0).getReg(),
                         MI->getOperand(0).isKill(), Index, RS) {}

  SGPRSpillBuilder(const SIRegisterInfo &TRI, const SIInstrInfo &TII,
                   bool IsWave32, MachineBasicBlock::iterator MI, Register Reg,
                   bool IsKill, int Index, RegScavenger *RS)
      : SuperReg(Reg), MI(MI), IsKill(IsKill), DL(MI->getDebugLoc()),
        Index(Index), RS(RS), MBB(MI->getParent()), MF(*MBB->getParent()),
        MFI(*MF.getInfo<SIMachineFunctionInfo>()), TII(TII), TRI(TRI),
        IsWave32(IsWave32) {
    const TargetRegisterClass *RC = TRI.getPhysRegBaseClass(SuperReg);
    SplitParts = TRI.getRegSplitParts(RC, EltSize);
    NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size();

    if (IsWave32) {
      ExecReg = AMDGPU::EXEC_LO;
      MovOpc = AMDGPU::S_MOV_B32;
      NotOpc = AMDGPU::S_NOT_B32;
    } else {
      ExecReg = AMDGPU::EXEC;
      MovOpc = AMDGPU::S_MOV_B64;
      NotOpc = AMDGPU::S_NOT_B64;
    }

    assert(SuperReg != AMDGPU::M0 && "m0 should never spill");
    assert(SuperReg != AMDGPU::EXEC_LO && SuperReg != AMDGPU::EXEC_HI &&
           SuperReg != AMDGPU::EXEC && "exec should never spill");
  }

  PerVGPRData getPerVGPRData() {
    PerVGPRData Data;
    Data.PerVGPR = IsWave32 ? 32 : 64;
    Data.NumVGPRs = (NumSubRegs + (Data.PerVGPR - 1)) / Data.PerVGPR;
    Data.VGPRLanes = (1LL << std::min(Data.PerVGPR, NumSubRegs)) - 1LL;
    return Data;
  }

  // Tries to scavenge SGPRs to save EXEC and a VGPR. Uses v0 if no VGPR is
  // free.
  // Writes these instructions if an SGPR can be scavenged:
  // s_mov_b64 s[6:7], exec   ; Save exec
  // s_mov_b64 exec, 3        ; Wanted lanemask
  // buffer_store_dword v1    ; Write scavenged VGPR to emergency slot
  //
  // Writes these instructions if no SGPR can be scavenged:
  // buffer_store_dword v0    ; Only if no free VGPR was found
  // s_not_b64 exec, exec
  // buffer_store_dword v0    ; Save inactive lanes
  //                          ; exec stays inverted, it is flipped back in
  //                          ; restore.
  void prepare() {
    // Scavenged temporary VGPR to use. It must be scavenged once for any number
    // of spilled subregs.
    // FIXME: The liveness analysis is limited and does not tell if a register
    // is in use in lanes that are currently inactive. We can never be sure if
    // a register is actually in use in another lane, so we need to save all
    // used lanes of the chosen VGPR.
    assert(RS && "Cannot spill SGPR to memory without RegScavenger");
    TmpVGPR = RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass, MI, false,
                                            0, false);

    // Reserve temporary stack slot
    TmpVGPRIndex = MFI.getScavengeFI(MF.getFrameInfo(), TRI);
    if (TmpVGPR) {
      // Found a register that is dead in the currently active lanes, we only
      // need to spill inactive lanes.
      TmpVGPRLive = false;
    } else {
      // Pick v0 because it doesn't make a difference.
      TmpVGPR = AMDGPU::VGPR0;
      TmpVGPRLive = true;
    }

    if (TmpVGPRLive) {
      // We need to inform the scavenger that this index is already in use until
      // we're done with the custom emergency spill.
      RS->assignRegToScavengingIndex(TmpVGPRIndex, TmpVGPR);
    }

    // We may end up recursively calling the scavenger, and don't want to re-use
    // the same register.
    RS->setRegUsed(TmpVGPR);

    // Try to scavenge SGPRs to save exec
    assert(!SavedExecReg && "Exec is already saved, refuse to save again");
    const TargetRegisterClass &RC =
        IsWave32 ? AMDGPU::SGPR_32RegClass : AMDGPU::SGPR_64RegClass;
    RS->setRegUsed(SuperReg);
    SavedExecReg = RS->scavengeRegisterBackwards(RC, MI, false, 0, false);

    int64_t VGPRLanes = getPerVGPRData().VGPRLanes;

    if (SavedExecReg) {
      RS->setRegUsed(SavedExecReg);
      // Set exec to needed lanes
      BuildMI(*MBB, MI, DL, TII.get(MovOpc), SavedExecReg).addReg(ExecReg);
      auto I =
          BuildMI(*MBB, MI, DL, TII.get(MovOpc), ExecReg).addImm(VGPRLanes);
      if (!TmpVGPRLive)
        I.addReg(TmpVGPR, RegState::ImplicitDefine);
      // Spill needed lanes
      TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ false);
    } else {
      // The modify and restore of exec clobber SCC, which we would have to save
      // and restore. FIXME: We probably would need to reserve a register for
      // this.
      if (RS->isRegUsed(AMDGPU::SCC))
        MI->emitError("unhandled SGPR spill to memory");

      // Spill active lanes
      if (TmpVGPRLive)
        TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ false,
                                    /*IsKill*/ false);
      // Spill inactive lanes
      auto I = BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg);
      if (!TmpVGPRLive)
        I.addReg(TmpVGPR, RegState::ImplicitDefine);
      I->getOperand(2).setIsDead(); // Mark SCC as dead.
      TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ false);
    }
  }

  // Writes these instructions if an SGPR can be scavenged:
  // buffer_load_dword v1     ; Reload scavenged VGPR from emergency slot
  // s_waitcnt vmcnt(0)       ; If a free VGPR was found
  // s_mov_b64 exec, s[6:7]   ; Restore exec
  //
  // Writes these instructions if no SGPR can be scavenged:
  // buffer_load_dword v0     ; Restore inactive lanes
  // s_waitcnt vmcnt(0)       ; If a free VGPR was found
  // s_not_b64 exec, exec
  // buffer_load_dword v0     ; Only if no free VGPR was found
  void restore() {
    if (SavedExecReg) {
      // Restore used lanes
      TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ true,
                                  /*IsKill*/ false);
      // Restore exec
      auto I = BuildMI(*MBB, MI, DL, TII.get(MovOpc), ExecReg)
                   .addReg(SavedExecReg, RegState::Kill);
      // Add an implicit use of the load so it is not dead.
      // FIXME This inserts an unnecessary waitcnt
      if (!TmpVGPRLive) {
        I.addReg(TmpVGPR, RegState::ImplicitKill);
      }
    } else {
      // Restore inactive lanes
      TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ true,
                                  /*IsKill*/ false);
      auto I = BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg);
      if (!TmpVGPRLive)
        I.addReg(TmpVGPR, RegState::ImplicitKill);
      I->getOperand(2).setIsDead(); // Mark SCC as dead.

      // Restore active lanes
      if (TmpVGPRLive)
        TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ true);
    }

    // Inform the scavenger where we're releasing our custom scavenged register.
    if (TmpVGPRLive) {
      MachineBasicBlock::iterator RestorePt = std::prev(MI);
      RS->assignRegToScavengingIndex(TmpVGPRIndex, TmpVGPR, &*RestorePt);
    }
  }

  // Write TmpVGPR to memory or read TmpVGPR from memory.
  // Either using a single buffer_load/store if exec is set to the needed mask
  // or using
  // buffer_load
  // s_not exec, exec
  // buffer_load
  // s_not exec, exec
  void readWriteTmpVGPR(unsigned Offset, bool IsLoad) {
    if (SavedExecReg) {
      // Spill needed lanes
      TRI.buildVGPRSpillLoadStore(*this, Index, Offset, IsLoad);
    } else {
      // The modify and restore of exec clobber SCC, which we would have to save
      // and restore. FIXME: We probably would need to reserve a register for
      // this.
      if (RS->isRegUsed(AMDGPU::SCC))
        MI->emitError("unhandled SGPR spill to memory");

      // Spill active lanes
      TRI.buildVGPRSpillLoadStore(*this, Index, Offset, IsLoad,
                                  /*IsKill*/ false);
      // Spill inactive lanes
      auto Not0 = BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg)
                      .addReg(ExecReg);
      Not0->getOperand(2).setIsDead(); // Mark SCC as dead.
      TRI.buildVGPRSpillLoadStore(*this, Index, Offset, IsLoad);
      auto Not1 = BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg)
                      .addReg(ExecReg);
      Not1->getOperand(2).setIsDead(); // Mark SCC as dead.
    }
  }

  void setMI(MachineBasicBlock *NewMBB, MachineBasicBlock::iterator NewMI) {
    assert(MBB->getParent() == &MF);
    MI = NewMI;
    MBB = NewMBB;
  }
};

} // namespace llvm

SIRegisterInfo::SIRegisterInfo(const GCNSubtarget &ST)
    : AMDGPUGenRegisterInfo(AMDGPU::PC_REG, ST.getAMDGPUDwarfFlavour()), ST(ST),
      SpillSGPRToVGPR(EnableSpillSGPRToVGPR), isWave32(ST.isWave32()) {

  assert(getSubRegIndexLaneMask(AMDGPU::sub0).getAsInteger() == 3 &&
         getSubRegIndexLaneMask(AMDGPU::sub31).getAsInteger() == (3ULL << 62) &&
         (getSubRegIndexLaneMask(AMDGPU::lo16) |
          getSubRegIndexLaneMask(AMDGPU::hi16)).getAsInteger() ==
             getSubRegIndexLaneMask(AMDGPU::sub0).getAsInteger() &&
         "getNumCoveredRegs() will not work with generated subreg masks!");

  RegPressureIgnoredUnits.resize(getNumRegUnits());
  RegPressureIgnoredUnits.set(*regunits(MCRegister::from(AMDGPU::M0)).begin());
  for (auto Reg : AMDGPU::VGPR_HI16RegClass)
    RegPressureIgnoredUnits.set(*regunits(Reg).begin());

  // HACK: Until this is fully tablegen'd.
  static llvm::once_flag InitializeRegSplitPartsFlag;

  static auto InitializeRegSplitPartsOnce = [this]() {
    for (unsigned Idx = 1, E = getNumSubRegIndices() - 1; Idx < E; ++Idx) {
      unsigned Size = getSubRegIdxSize(Idx);
      if (Size & 31)
        continue;
      std::vector<int16_t> &Vec = RegSplitParts[Size / 32 - 1];
      unsigned Pos = getSubRegIdxOffset(Idx);
      if (Pos % Size)
        continue;
      Pos /= Size;
      if (Vec.empty()) {
        unsigned MaxNumParts = 1024 / Size; // Maximum register is 1024 bits.
        Vec.resize(MaxNumParts);
      }
      Vec[Pos] = Idx;
    }
  };

  static llvm::once_flag InitializeSubRegFromChannelTableFlag;

  static auto InitializeSubRegFromChannelTableOnce = [this]() {
    for (auto &Row : SubRegFromChannelTable)
      Row.fill(AMDGPU::NoSubRegister);
    for (unsigned Idx = 1; Idx < getNumSubRegIndices(); ++Idx) {
      unsigned Width = AMDGPUSubRegIdxRanges[Idx].Size / 32;
      unsigned Offset = AMDGPUSubRegIdxRanges[Idx].Offset / 32;
      assert(Width < SubRegFromChannelTableWidthMap.size());
      Width = SubRegFromChannelTableWidthMap[Width];
      if (Width == 0)
        continue;
      unsigned TableIdx = Width - 1;
      assert(TableIdx < SubRegFromChannelTable.size());
      assert(Offset < SubRegFromChannelTable[TableIdx].size());
      SubRegFromChannelTable[TableIdx][Offset] = Idx;
    }
  };

  llvm::call_once(InitializeRegSplitPartsFlag, InitializeRegSplitPartsOnce);
  llvm::call_once(InitializeSubRegFromChannelTableFlag,
                  InitializeSubRegFromChannelTableOnce);
}

void SIRegisterInfo::reserveRegisterTuples(BitVector &Reserved,
                                           MCRegister Reg) const {
  for (MCRegAliasIterator R(Reg, this, true); R.isValid(); ++R)
    Reserved.set(*R);
}

// Forced to be here by one .inc
const MCPhysReg *SIRegisterInfo::getCalleeSavedRegs(
    const MachineFunction *MF) const {
  CallingConv::ID CC = MF->getFunction().getCallingConv();
  switch (CC) {
  case CallingConv::C:
  case CallingConv::Fast:
  case CallingConv::Cold:
    return ST.hasGFX90AInsts() ? CSR_AMDGPU_GFX90AInsts_SaveList
                               : CSR_AMDGPU_SaveList;
  case CallingConv::AMDGPU_Gfx:
    return ST.hasGFX90AInsts() ? CSR_AMDGPU_SI_Gfx_GFX90AInsts_SaveList
                               : CSR_AMDGPU_SI_Gfx_SaveList;
  default: {
    // Dummy to not crash RegisterClassInfo.
    static const MCPhysReg NoCalleeSavedReg = AMDGPU::NoRegister;
    return &NoCalleeSavedReg;
  }
  }
}

const MCPhysReg *
SIRegisterInfo::getCalleeSavedRegsViaCopy(const MachineFunction *MF) const {
  return nullptr;
}

const uint32_t *SIRegisterInfo::getCallPreservedMask(const MachineFunction &MF,
                                                     CallingConv::ID CC) const {
  switch (CC) {
  case CallingConv::C:
  case CallingConv::Fast:
  case CallingConv::Cold:
    return ST.hasGFX90AInsts() ? CSR_AMDGPU_GFX90AInsts_RegMask
                               : CSR_AMDGPU_RegMask;
  case CallingConv::AMDGPU_Gfx:
    return ST.hasGFX90AInsts() ? CSR_AMDGPU_SI_Gfx_GFX90AInsts_RegMask
                               : CSR_AMDGPU_SI_Gfx_RegMask;
  default:
    return nullptr;
  }
}

const uint32_t *SIRegisterInfo::getNoPreservedMask() const {
  return CSR_AMDGPU_NoRegs_RegMask;
}

const TargetRegisterClass *
SIRegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC,
                                          const MachineFunction &MF) const {
  // FIXME: Should have a helper function like getEquivalentVGPRClass to get
  // the equivalent AV class. If one were used here, the verifier would crash
  // after RegBankSelect in the GISel flow, because the aligned regclasses are
  // not fully settled until instruction selection.
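  // The chain below widens each V or A class (and its alignment variant) to
  // the combined AV class of the same size, e.g. both VReg_64 and AReg_64
  // widen to AV_64. This lets the register allocator place such a value in
  // either register bank on subtargets with MAI instructions.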
  if (ST.hasMAIInsts() && (isVGPRClass(RC) || isAGPRClass(RC))) {
    if (RC == &AMDGPU::VGPR_32RegClass || RC == &AMDGPU::AGPR_32RegClass)
      return &AMDGPU::AV_32RegClass;
    if (RC == &AMDGPU::VReg_64RegClass || RC == &AMDGPU::AReg_64RegClass)
      return &AMDGPU::AV_64RegClass;
    if (RC == &AMDGPU::VReg_64_Align2RegClass ||
        RC == &AMDGPU::AReg_64_Align2RegClass)
      return &AMDGPU::AV_64_Align2RegClass;
    if (RC == &AMDGPU::VReg_96RegClass || RC == &AMDGPU::AReg_96RegClass)
      return &AMDGPU::AV_96RegClass;
    if (RC == &AMDGPU::VReg_96_Align2RegClass ||
        RC == &AMDGPU::AReg_96_Align2RegClass)
      return &AMDGPU::AV_96_Align2RegClass;
    if (RC == &AMDGPU::VReg_128RegClass || RC == &AMDGPU::AReg_128RegClass)
      return &AMDGPU::AV_128RegClass;
    if (RC == &AMDGPU::VReg_128_Align2RegClass ||
        RC == &AMDGPU::AReg_128_Align2RegClass)
      return &AMDGPU::AV_128_Align2RegClass;
    if (RC == &AMDGPU::VReg_160RegClass || RC == &AMDGPU::AReg_160RegClass)
      return &AMDGPU::AV_160RegClass;
    if (RC == &AMDGPU::VReg_160_Align2RegClass ||
        RC == &AMDGPU::AReg_160_Align2RegClass)
      return &AMDGPU::AV_160_Align2RegClass;
    if (RC == &AMDGPU::VReg_192RegClass || RC == &AMDGPU::AReg_192RegClass)
      return &AMDGPU::AV_192RegClass;
    if (RC == &AMDGPU::VReg_192_Align2RegClass ||
        RC == &AMDGPU::AReg_192_Align2RegClass)
      return &AMDGPU::AV_192_Align2RegClass;
    if (RC == &AMDGPU::VReg_256RegClass || RC == &AMDGPU::AReg_256RegClass)
      return &AMDGPU::AV_256RegClass;
    if (RC == &AMDGPU::VReg_256_Align2RegClass ||
        RC == &AMDGPU::AReg_256_Align2RegClass)
      return &AMDGPU::AV_256_Align2RegClass;
    if (RC == &AMDGPU::VReg_512RegClass || RC == &AMDGPU::AReg_512RegClass)
      return &AMDGPU::AV_512RegClass;
    if (RC == &AMDGPU::VReg_512_Align2RegClass ||
        RC == &AMDGPU::AReg_512_Align2RegClass)
      return &AMDGPU::AV_512_Align2RegClass;
    if (RC == &AMDGPU::VReg_1024RegClass || RC == &AMDGPU::AReg_1024RegClass)
      return &AMDGPU::AV_1024RegClass;
    if (RC == &AMDGPU::VReg_1024_Align2RegClass ||
        RC == &AMDGPU::AReg_1024_Align2RegClass)
      return &AMDGPU::AV_1024_Align2RegClass;
  }

  return TargetRegisterInfo::getLargestLegalSuperClass(RC, MF);
}

Register SIRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
  const SIFrameLowering *TFI = ST.getFrameLowering();
  const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
  // During ISel lowering we always reserve the stack pointer in entry
  // functions, but never actually want to reference it when accessing our own
  // frame. If we need a frame pointer we use it, but otherwise we can just use
  // an immediate "0" which we represent by returning NoRegister.
  if (FuncInfo->isEntryFunction()) {
    return TFI->hasFP(MF) ? FuncInfo->getFrameOffsetReg() : Register();
  }
  return TFI->hasFP(MF) ? FuncInfo->getFrameOffsetReg()
                        : FuncInfo->getStackPtrOffsetReg();
}

bool SIRegisterInfo::hasBasePointer(const MachineFunction &MF) const {
  // When we need stack realignment, we can't reference off of the
  // stack pointer, so we reserve a base pointer.
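  // Only fixed-frame objects (e.g. incoming stack arguments) must be reached
  // from the unmodified incoming stack pointer, so the base pointer is only
  // needed when fixed objects and realignment occur together, as checked
  // below.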
  const MachineFrameInfo &MFI = MF.getFrameInfo();
  return MFI.getNumFixedObjects() && shouldRealignStack(MF);
}

Register SIRegisterInfo::getBaseRegister() const { return AMDGPU::SGPR34; }

const uint32_t *SIRegisterInfo::getAllVGPRRegMask() const {
  return AMDGPU_AllVGPRs_RegMask;
}

const uint32_t *SIRegisterInfo::getAllAGPRRegMask() const {
  return AMDGPU_AllAGPRs_RegMask;
}

const uint32_t *SIRegisterInfo::getAllVectorRegMask() const {
  return AMDGPU_AllVectorRegs_RegMask;
}

const uint32_t *SIRegisterInfo::getAllAllocatableSRegMask() const {
  return AMDGPU_AllAllocatableSRegs_RegMask;
}

unsigned SIRegisterInfo::getSubRegFromChannel(unsigned Channel,
                                              unsigned NumRegs) {
  assert(NumRegs < SubRegFromChannelTableWidthMap.size());
  unsigned NumRegIndex = SubRegFromChannelTableWidthMap[NumRegs];
  assert(NumRegIndex && "Not implemented");
  assert(Channel < SubRegFromChannelTable[NumRegIndex - 1].size());
  return SubRegFromChannelTable[NumRegIndex - 1][Channel];
}

MCRegister
SIRegisterInfo::getAlignedHighSGPRForRC(const MachineFunction &MF,
                                        const unsigned Align,
                                        const TargetRegisterClass *RC) const {
  unsigned BaseIdx = alignDown(ST.getMaxNumSGPRs(MF), Align) - Align;
  MCRegister BaseReg(AMDGPU::SGPR_32RegClass.getRegister(BaseIdx));
  return getMatchingSuperReg(BaseReg, AMDGPU::sub0, RC);
}

MCRegister SIRegisterInfo::reservedPrivateSegmentBufferReg(
    const MachineFunction &MF) const {
  return getAlignedHighSGPRForRC(MF, /*Align=*/4, &AMDGPU::SGPR_128RegClass);
}

BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
  BitVector Reserved(getNumRegs());
  Reserved.set(AMDGPU::MODE);

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  // Reserve special purpose registers.
  //
  // EXEC_LO and EXEC_HI could be allocated and used as regular registers, but
  // this seems likely to result in bugs, so I'm marking them as reserved.
  reserveRegisterTuples(Reserved, AMDGPU::EXEC);
  reserveRegisterTuples(Reserved, AMDGPU::FLAT_SCR);

  // M0 has to be reserved so that llvm accepts it as a live-in into a block.
  reserveRegisterTuples(Reserved, AMDGPU::M0);

  // Reserve src_vccz, src_execz, src_scc.
  reserveRegisterTuples(Reserved, AMDGPU::SRC_VCCZ);
  reserveRegisterTuples(Reserved, AMDGPU::SRC_EXECZ);
  reserveRegisterTuples(Reserved, AMDGPU::SRC_SCC);

  // Reserve the memory aperture registers
  reserveRegisterTuples(Reserved, AMDGPU::SRC_SHARED_BASE);
  reserveRegisterTuples(Reserved, AMDGPU::SRC_SHARED_LIMIT);
  reserveRegisterTuples(Reserved, AMDGPU::SRC_PRIVATE_BASE);
  reserveRegisterTuples(Reserved, AMDGPU::SRC_PRIVATE_LIMIT);

  // Reserve src_pops_exiting_wave_id - support is not implemented in Codegen.
  reserveRegisterTuples(Reserved, AMDGPU::SRC_POPS_EXITING_WAVE_ID);

  // Reserve xnack_mask registers - support is not implemented in Codegen.
  reserveRegisterTuples(Reserved, AMDGPU::XNACK_MASK);

  // Reserve lds_direct register - support is not implemented in Codegen.
  reserveRegisterTuples(Reserved, AMDGPU::LDS_DIRECT);

  // Reserve Trap Handler registers - support is not implemented in Codegen.
  reserveRegisterTuples(Reserved, AMDGPU::TBA);
  reserveRegisterTuples(Reserved, AMDGPU::TMA);
  reserveRegisterTuples(Reserved, AMDGPU::TTMP0_TTMP1);
  reserveRegisterTuples(Reserved, AMDGPU::TTMP2_TTMP3);
  reserveRegisterTuples(Reserved, AMDGPU::TTMP4_TTMP5);
  reserveRegisterTuples(Reserved, AMDGPU::TTMP6_TTMP7);
  reserveRegisterTuples(Reserved, AMDGPU::TTMP8_TTMP9);
  reserveRegisterTuples(Reserved, AMDGPU::TTMP10_TTMP11);
  reserveRegisterTuples(Reserved, AMDGPU::TTMP12_TTMP13);
  reserveRegisterTuples(Reserved, AMDGPU::TTMP14_TTMP15);

  // Reserve null register - it shall never be allocated
  reserveRegisterTuples(Reserved, AMDGPU::SGPR_NULL64);

  // Disallow vcc_hi allocation in wave32. It may be allocated but most likely
  // will result in bugs.
  if (isWave32) {
    Reserved.set(AMDGPU::VCC);
    Reserved.set(AMDGPU::VCC_HI);
  }

  // Reserve SGPRs.
  //
  unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
  unsigned TotalNumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
  for (unsigned i = MaxNumSGPRs; i < TotalNumSGPRs; ++i) {
    unsigned Reg = AMDGPU::SGPR_32RegClass.getRegister(i);
    reserveRegisterTuples(Reserved, Reg);
  }

  Register ScratchRSrcReg = MFI->getScratchRSrcReg();
  if (ScratchRSrcReg != AMDGPU::NoRegister) {
    // Reserve 4 SGPRs for the scratch buffer resource descriptor in case we
    // need to spill.
    // TODO: May need to reserve a VGPR if doing LDS spilling.
    reserveRegisterTuples(Reserved, ScratchRSrcReg);
  }

  Register LongBranchReservedReg = MFI->getLongBranchReservedReg();
  if (LongBranchReservedReg)
    reserveRegisterTuples(Reserved, LongBranchReservedReg);

  // We have to assume the SP is needed in case there are calls in the function,
  // which is detected after the function is lowered. If we aren't really going
  // to need SP, don't bother reserving it.
  MCRegister StackPtrReg = MFI->getStackPtrOffsetReg();
  if (StackPtrReg) {
    reserveRegisterTuples(Reserved, StackPtrReg);
    assert(!isSubRegister(ScratchRSrcReg, StackPtrReg));
  }

  MCRegister FrameReg = MFI->getFrameOffsetReg();
  if (FrameReg) {
    reserveRegisterTuples(Reserved, FrameReg);
    assert(!isSubRegister(ScratchRSrcReg, FrameReg));
  }

  if (hasBasePointer(MF)) {
    MCRegister BasePtrReg = getBaseRegister();
    reserveRegisterTuples(Reserved, BasePtrReg);
    assert(!isSubRegister(ScratchRSrcReg, BasePtrReg));
  }

  // FIXME: Use same reserved register introduced in D149775
  // SGPR used to preserve EXEC MASK around WWM spill/copy instructions.
  Register ExecCopyReg = MFI->getSGPRForEXECCopy();
  if (ExecCopyReg)
    reserveRegisterTuples(Reserved, ExecCopyReg);

  // Reserve VGPRs/AGPRs.
  //
  unsigned MaxNumVGPRs = ST.getMaxNumVGPRs(MF);
  unsigned MaxNumAGPRs = MaxNumVGPRs;
  unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs();

  // On GFX90A, the number of VGPRs and AGPRs need not be equal. Theoretically,
  // a wave may have up to 512 total vector registers combining together both
  // VGPRs and AGPRs. Hence, in an entry function without calls and without
  // AGPRs used within it, it is possible to use the whole vector register
  // budget for VGPRs.
  //
  // TODO: it shall be possible to estimate maximum AGPR/VGPR pressure and split
  // register file accordingly.
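  // A rough illustration of the split below, assuming a 128-register budget:
  // a function that uses AGPRs gets 64 VGPRs plus 64 AGPRs, while one that
  // does not takes the else branch (128 <= 256 total VGPRs) and keeps all 128
  // registers as VGPRs with no allocatable AGPRs.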
  if (ST.hasGFX90AInsts()) {
    if (MFI->usesAGPRs(MF)) {
      MaxNumVGPRs /= 2;
      MaxNumAGPRs = MaxNumVGPRs;
    } else {
      if (MaxNumVGPRs > TotalNumVGPRs) {
        MaxNumAGPRs = MaxNumVGPRs - TotalNumVGPRs;
        MaxNumVGPRs = TotalNumVGPRs;
      } else
        MaxNumAGPRs = 0;
    }
  }

  for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i) {
    unsigned Reg = AMDGPU::VGPR_32RegClass.getRegister(i);
    reserveRegisterTuples(Reserved, Reg);
  }

  if (ST.hasMAIInsts()) {
    for (unsigned i = MaxNumAGPRs; i < TotalNumVGPRs; ++i) {
      unsigned Reg = AMDGPU::AGPR_32RegClass.getRegister(i);
      reserveRegisterTuples(Reserved, Reg);
    }
  } else {
    // Reserve all the AGPRs if there are no instructions to use them.
    for (MCRegister Reg : AMDGPU::AGPR_32RegClass)
      reserveRegisterTuples(Reserved, Reg);
  }

  // On GFX908, in order to guarantee copying between AGPRs, we need a scratch
  // VGPR available at all times.
  if (ST.hasMAIInsts() && !ST.hasGFX90AInsts()) {
    reserveRegisterTuples(Reserved, MFI->getVGPRForAGPRCopy());
  }

  for (Register Reg : MFI->getWWMReservedRegs())
    reserveRegisterTuples(Reserved, Reg);

  // FIXME: Stop using reserved registers for this.
  for (MCPhysReg Reg : MFI->getAGPRSpillVGPRs())
    reserveRegisterTuples(Reserved, Reg);

  for (MCPhysReg Reg : MFI->getVGPRSpillAGPRs())
    reserveRegisterTuples(Reserved, Reg);

  for (auto Reg : MFI->getSGPRSpillVGPRs())
    reserveRegisterTuples(Reserved, Reg);

  return Reserved;
}

bool SIRegisterInfo::isAsmClobberable(const MachineFunction &MF,
                                      MCRegister PhysReg) const {
  return !MF.getRegInfo().isReserved(PhysReg);
}

bool SIRegisterInfo::shouldRealignStack(const MachineFunction &MF) const {
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  // On entry, the base address is 0, so it can't possibly need any more
  // alignment.

  // FIXME: Should be able to specify the entry frame alignment per calling
  // convention instead.
  if (Info->isEntryFunction())
    return false;

  return TargetRegisterInfo::shouldRealignStack(MF);
}

bool SIRegisterInfo::requiresRegisterScavenging(const MachineFunction &Fn) const {
  const SIMachineFunctionInfo *Info = Fn.getInfo<SIMachineFunctionInfo>();
  if (Info->isEntryFunction()) {
    const MachineFrameInfo &MFI = Fn.getFrameInfo();
    return MFI.hasStackObjects() || MFI.hasCalls();
  }

  // May need scavenger for dealing with callee saved registers.
  return true;
}

bool SIRegisterInfo::requiresFrameIndexScavenging(
    const MachineFunction &MF) const {
  // Do not use frame virtual registers. They used to be used for SGPRs, but
  // once we reach PrologEpilogInserter, we can no longer spill SGPRs. If the
  // scavenger fails, we can increment/decrement the necessary SGPRs to avoid a
  // spill.
  return false;
}

bool SIRegisterInfo::requiresFrameIndexReplacementScavenging(
    const MachineFunction &MF) const {
  const MachineFrameInfo &MFI = MF.getFrameInfo();
  return MFI.hasStackObjects();
}

bool SIRegisterInfo::requiresVirtualBaseRegisters(
    const MachineFunction &) const {
  // There are no special dedicated stack or frame pointers.
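  // Returning true lets the local stack slot allocation pass materialize
  // base registers on demand via materializeFrameBaseRegister() and
  // resolveFrameIndex() below.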
  return true;
}

int64_t SIRegisterInfo::getScratchInstrOffset(const MachineInstr *MI) const {
  assert(SIInstrInfo::isMUBUF(*MI) || SIInstrInfo::isFLATScratch(*MI));

  int OffIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
                                          AMDGPU::OpName::offset);
  return MI->getOperand(OffIdx).getImm();
}

int64_t SIRegisterInfo::getFrameIndexInstrOffset(const MachineInstr *MI,
                                                 int Idx) const {
  if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isFLATScratch(*MI))
    return 0;

  assert((Idx == AMDGPU::getNamedOperandIdx(MI->getOpcode(),
                                            AMDGPU::OpName::vaddr) ||
          (Idx == AMDGPU::getNamedOperandIdx(MI->getOpcode(),
                                             AMDGPU::OpName::saddr))) &&
         "Should never see frame index on non-address operand");

  return getScratchInstrOffset(MI);
}

bool SIRegisterInfo::needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const {
  if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isFLATScratch(*MI))
    return false;

  int64_t FullOffset = Offset + getScratchInstrOffset(MI);

  if (SIInstrInfo::isMUBUF(*MI))
    return !SIInstrInfo::isLegalMUBUFImmOffset(FullOffset);

  const SIInstrInfo *TII = ST.getInstrInfo();
  return !TII->isLegalFLATOffset(FullOffset, AMDGPUAS::PRIVATE_ADDRESS,
                                 SIInstrFlags::FlatScratch);
}

Register SIRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB,
                                                      int FrameIdx,
                                                      int64_t Offset) const {
  MachineBasicBlock::iterator Ins = MBB->begin();
  DebugLoc DL; // Defaults to "unknown"

  if (Ins != MBB->end())
    DL = Ins->getDebugLoc();

  MachineFunction *MF = MBB->getParent();
  const SIInstrInfo *TII = ST.getInstrInfo();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  unsigned MovOpc = ST.enableFlatScratch() ? AMDGPU::S_MOV_B32
                                           : AMDGPU::V_MOV_B32_e32;

  Register BaseReg = MRI.createVirtualRegister(
      ST.enableFlatScratch() ? &AMDGPU::SReg_32_XEXEC_HIRegClass
                             : &AMDGPU::VGPR_32RegClass);

  if (Offset == 0) {
    BuildMI(*MBB, Ins, DL, TII->get(MovOpc), BaseReg)
        .addFrameIndex(FrameIdx);
    return BaseReg;
  }

  Register OffsetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);

  Register FIReg = MRI.createVirtualRegister(
      ST.enableFlatScratch() ? &AMDGPU::SReg_32_XM0RegClass
                             : &AMDGPU::VGPR_32RegClass);

  BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg)
      .addImm(Offset);
  BuildMI(*MBB, Ins, DL, TII->get(MovOpc), FIReg)
      .addFrameIndex(FrameIdx);

  if (ST.enableFlatScratch()) {
    BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::S_ADD_I32), BaseReg)
        .addReg(OffsetReg, RegState::Kill)
        .addReg(FIReg);
    return BaseReg;
  }

  TII->getAddNoCarry(*MBB, Ins, DL, BaseReg)
      .addReg(OffsetReg, RegState::Kill)
      .addReg(FIReg)
      .addImm(0); // clamp bit

  return BaseReg;
}

void SIRegisterInfo::resolveFrameIndex(MachineInstr &MI, Register BaseReg,
                                       int64_t Offset) const {
  const SIInstrInfo *TII = ST.getInstrInfo();
  bool IsFlat = TII->isFLATScratch(MI);

#ifndef NDEBUG
  // FIXME: Is it possible to be storing a frame index to itself?
  bool SeenFI = false;
  for (const MachineOperand &MO: MI.operands()) {
    if (MO.isFI()) {
      if (SeenFI)
        llvm_unreachable("should not see multiple frame indices");

      SeenFI = true;
    }
  }
#endif

  MachineOperand *FIOp =
      TII->getNamedOperand(MI, IsFlat ? AMDGPU::OpName::saddr
                                      : AMDGPU::OpName::vaddr);

  MachineOperand *OffsetOp = TII->getNamedOperand(MI, AMDGPU::OpName::offset);
  int64_t NewOffset = OffsetOp->getImm() + Offset;

  assert(FIOp && FIOp->isFI() && "frame index must be address operand");
  assert(TII->isMUBUF(MI) || TII->isFLATScratch(MI));

  if (IsFlat) {
    assert(TII->isLegalFLATOffset(NewOffset, AMDGPUAS::PRIVATE_ADDRESS,
                                  SIInstrFlags::FlatScratch) &&
           "offset should be legal");
    FIOp->ChangeToRegister(BaseReg, false);
    OffsetOp->setImm(NewOffset);
    return;
  }

#ifndef NDEBUG
  MachineOperand *SOffset = TII->getNamedOperand(MI, AMDGPU::OpName::soffset);
  assert(SOffset->isImm() && SOffset->getImm() == 0);
#endif

  assert(SIInstrInfo::isLegalMUBUFImmOffset(NewOffset) &&
         "offset should be legal");

  FIOp->ChangeToRegister(BaseReg, false);
  OffsetOp->setImm(NewOffset);
}

bool SIRegisterInfo::isFrameOffsetLegal(const MachineInstr *MI,
                                        Register BaseReg,
                                        int64_t Offset) const {
  if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isFLATScratch(*MI))
    return false;

  int64_t NewOffset = Offset + getScratchInstrOffset(MI);

  if (SIInstrInfo::isMUBUF(*MI))
    return SIInstrInfo::isLegalMUBUFImmOffset(NewOffset);

  const SIInstrInfo *TII = ST.getInstrInfo();
  return TII->isLegalFLATOffset(NewOffset, AMDGPUAS::PRIVATE_ADDRESS,
                                SIInstrFlags::FlatScratch);
}

const TargetRegisterClass *SIRegisterInfo::getPointerRegClass(
    const MachineFunction &MF, unsigned Kind) const {
  // This is inaccurate. It depends on the instruction and address space. The
  // only place where we should hit this is for dealing with frame indexes /
  // private accesses, so this is correct in that case.
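  // Note that flat-scratch addressing takes its frame base in an SGPR (saddr);
  // materializeFrameBaseRegister() above picks an SGPR class when flat scratch
  // is enabled, so this VGPR answer mainly matters for MUBUF scratch access.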
  return &AMDGPU::VGPR_32RegClass;
}

const TargetRegisterClass *
SIRegisterInfo::getCrossCopyRegClass(const TargetRegisterClass *RC) const {
  if (isAGPRClass(RC) && !ST.hasGFX90AInsts())
    return getEquivalentVGPRClass(RC);
  if (RC == &AMDGPU::SCC_CLASSRegClass)
    return getWaveMaskRegClass();

  return RC;
}

static unsigned getNumSubRegsForSpillOp(unsigned Op) {

  switch (Op) {
  case AMDGPU::SI_SPILL_S1024_SAVE:
  case AMDGPU::SI_SPILL_S1024_RESTORE:
  case AMDGPU::SI_SPILL_V1024_SAVE:
  case AMDGPU::SI_SPILL_V1024_RESTORE:
  case AMDGPU::SI_SPILL_A1024_SAVE:
  case AMDGPU::SI_SPILL_A1024_RESTORE:
  case AMDGPU::SI_SPILL_AV1024_SAVE:
  case AMDGPU::SI_SPILL_AV1024_RESTORE:
    return 32;
  case AMDGPU::SI_SPILL_S512_SAVE:
  case AMDGPU::SI_SPILL_S512_RESTORE:
  case AMDGPU::SI_SPILL_V512_SAVE:
  case AMDGPU::SI_SPILL_V512_RESTORE:
  case AMDGPU::SI_SPILL_A512_SAVE:
  case AMDGPU::SI_SPILL_A512_RESTORE:
  case AMDGPU::SI_SPILL_AV512_SAVE:
  case AMDGPU::SI_SPILL_AV512_RESTORE:
    return 16;
  case AMDGPU::SI_SPILL_S384_SAVE:
  case AMDGPU::SI_SPILL_S384_RESTORE:
  case AMDGPU::SI_SPILL_V384_SAVE:
  case AMDGPU::SI_SPILL_V384_RESTORE:
  case AMDGPU::SI_SPILL_A384_SAVE:
  case AMDGPU::SI_SPILL_A384_RESTORE:
  case AMDGPU::SI_SPILL_AV384_SAVE:
  case AMDGPU::SI_SPILL_AV384_RESTORE:
    return 12;
  case AMDGPU::SI_SPILL_S352_SAVE:
  case AMDGPU::SI_SPILL_S352_RESTORE:
  case AMDGPU::SI_SPILL_V352_SAVE:
  case AMDGPU::SI_SPILL_V352_RESTORE:
  case AMDGPU::SI_SPILL_A352_SAVE:
  case AMDGPU::SI_SPILL_A352_RESTORE:
  case AMDGPU::SI_SPILL_AV352_SAVE:
  case AMDGPU::SI_SPILL_AV352_RESTORE:
    return 11;
  case AMDGPU::SI_SPILL_S320_SAVE:
  case AMDGPU::SI_SPILL_S320_RESTORE:
  case AMDGPU::SI_SPILL_V320_SAVE:
  case AMDGPU::SI_SPILL_V320_RESTORE:
  case AMDGPU::SI_SPILL_A320_SAVE:
  case AMDGPU::SI_SPILL_A320_RESTORE:
  case AMDGPU::SI_SPILL_AV320_SAVE:
  case AMDGPU::SI_SPILL_AV320_RESTORE:
    return 10;
  case AMDGPU::SI_SPILL_S288_SAVE:
  case AMDGPU::SI_SPILL_S288_RESTORE:
  case AMDGPU::SI_SPILL_V288_SAVE:
  case AMDGPU::SI_SPILL_V288_RESTORE:
  case AMDGPU::SI_SPILL_A288_SAVE:
  case AMDGPU::SI_SPILL_A288_RESTORE:
  case AMDGPU::SI_SPILL_AV288_SAVE:
  case AMDGPU::SI_SPILL_AV288_RESTORE:
    return 9;
  case AMDGPU::SI_SPILL_S256_SAVE:
  case AMDGPU::SI_SPILL_S256_RESTORE:
  case AMDGPU::SI_SPILL_V256_SAVE:
  case AMDGPU::SI_SPILL_V256_RESTORE:
  case AMDGPU::SI_SPILL_A256_SAVE:
  case AMDGPU::SI_SPILL_A256_RESTORE:
  case AMDGPU::SI_SPILL_AV256_SAVE:
  case AMDGPU::SI_SPILL_AV256_RESTORE:
    return 8;
  case AMDGPU::SI_SPILL_S224_SAVE:
  case AMDGPU::SI_SPILL_S224_RESTORE:
  case AMDGPU::SI_SPILL_V224_SAVE:
  case AMDGPU::SI_SPILL_V224_RESTORE:
  case AMDGPU::SI_SPILL_A224_SAVE:
  case AMDGPU::SI_SPILL_A224_RESTORE:
  case AMDGPU::SI_SPILL_AV224_SAVE:
  case AMDGPU::SI_SPILL_AV224_RESTORE:
    return 7;
  case AMDGPU::SI_SPILL_S192_SAVE:
  case AMDGPU::SI_SPILL_S192_RESTORE:
  case AMDGPU::SI_SPILL_V192_SAVE:
  case AMDGPU::SI_SPILL_V192_RESTORE:
  case AMDGPU::SI_SPILL_A192_SAVE:
  case AMDGPU::SI_SPILL_A192_RESTORE:
  case AMDGPU::SI_SPILL_AV192_SAVE:
  case AMDGPU::SI_SPILL_AV192_RESTORE:
    return 6;
  case AMDGPU::SI_SPILL_S160_SAVE:
  case AMDGPU::SI_SPILL_S160_RESTORE:
  case AMDGPU::SI_SPILL_V160_SAVE:
  case AMDGPU::SI_SPILL_V160_RESTORE:
  case AMDGPU::SI_SPILL_A160_SAVE:
  case AMDGPU::SI_SPILL_A160_RESTORE:
  case AMDGPU::SI_SPILL_AV160_SAVE:
  case AMDGPU::SI_SPILL_AV160_RESTORE:
    return 5;
  case AMDGPU::SI_SPILL_S128_SAVE:
  case AMDGPU::SI_SPILL_S128_RESTORE:
  case AMDGPU::SI_SPILL_V128_SAVE:
  case AMDGPU::SI_SPILL_V128_RESTORE:
  case AMDGPU::SI_SPILL_A128_SAVE:
  case AMDGPU::SI_SPILL_A128_RESTORE:
  case AMDGPU::SI_SPILL_AV128_SAVE:
  case AMDGPU::SI_SPILL_AV128_RESTORE:
    return 4;
  case AMDGPU::SI_SPILL_S96_SAVE:
  case AMDGPU::SI_SPILL_S96_RESTORE:
  case AMDGPU::SI_SPILL_V96_SAVE:
  case AMDGPU::SI_SPILL_V96_RESTORE:
  case AMDGPU::SI_SPILL_A96_SAVE:
  case AMDGPU::SI_SPILL_A96_RESTORE:
  case AMDGPU::SI_SPILL_AV96_SAVE:
  case AMDGPU::SI_SPILL_AV96_RESTORE:
    return 3;
  case AMDGPU::SI_SPILL_S64_SAVE:
  case AMDGPU::SI_SPILL_S64_RESTORE:
  case AMDGPU::SI_SPILL_V64_SAVE:
  case AMDGPU::SI_SPILL_V64_RESTORE:
  case AMDGPU::SI_SPILL_A64_SAVE:
  case AMDGPU::SI_SPILL_A64_RESTORE:
  case AMDGPU::SI_SPILL_AV64_SAVE:
  case AMDGPU::SI_SPILL_AV64_RESTORE:
    return 2;
  case AMDGPU::SI_SPILL_S32_SAVE:
  case AMDGPU::SI_SPILL_S32_RESTORE:
  case AMDGPU::SI_SPILL_V32_SAVE:
  case AMDGPU::SI_SPILL_V32_RESTORE:
  case AMDGPU::SI_SPILL_A32_SAVE:
  case AMDGPU::SI_SPILL_A32_RESTORE:
  case AMDGPU::SI_SPILL_AV32_SAVE:
  case AMDGPU::SI_SPILL_AV32_RESTORE:
  case AMDGPU::SI_SPILL_WWM_V32_SAVE:
  case AMDGPU::SI_SPILL_WWM_V32_RESTORE:
    return 1;
  default: llvm_unreachable("Invalid spill opcode");
  }
}

static int getOffsetMUBUFStore(unsigned Opc) {
  switch (Opc) {
  case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
    return AMDGPU::BUFFER_STORE_DWORD_OFFSET;
  case AMDGPU::BUFFER_STORE_BYTE_OFFEN:
    return AMDGPU::BUFFER_STORE_BYTE_OFFSET;
  case AMDGPU::BUFFER_STORE_SHORT_OFFEN:
    return AMDGPU::BUFFER_STORE_SHORT_OFFSET;
  case AMDGPU::BUFFER_STORE_DWORDX2_OFFEN:
    return AMDGPU::BUFFER_STORE_DWORDX2_OFFSET;
  case AMDGPU::BUFFER_STORE_DWORDX3_OFFEN:
    return AMDGPU::BUFFER_STORE_DWORDX3_OFFSET;
  case AMDGPU::BUFFER_STORE_DWORDX4_OFFEN:
    return AMDGPU::BUFFER_STORE_DWORDX4_OFFSET;
  case AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFEN:
    return AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFSET;
  case AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFEN:
    return AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFSET;
  default:
    return -1;
  }
}

static int getOffsetMUBUFLoad(unsigned Opc) {
  switch (Opc) {
  case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
    return AMDGPU::BUFFER_LOAD_DWORD_OFFSET;
  case AMDGPU::BUFFER_LOAD_UBYTE_OFFEN:
    return AMDGPU::BUFFER_LOAD_UBYTE_OFFSET;
  case AMDGPU::BUFFER_LOAD_SBYTE_OFFEN:
    return AMDGPU::BUFFER_LOAD_SBYTE_OFFSET;
  case AMDGPU::BUFFER_LOAD_USHORT_OFFEN:
    return AMDGPU::BUFFER_LOAD_USHORT_OFFSET;
  case AMDGPU::BUFFER_LOAD_SSHORT_OFFEN:
    return AMDGPU::BUFFER_LOAD_SSHORT_OFFSET;
  case AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN:
    return AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET;
  case AMDGPU::BUFFER_LOAD_DWORDX3_OFFEN:
    return AMDGPU::BUFFER_LOAD_DWORDX3_OFFSET;
  case AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN:
    return AMDGPU::BUFFER_LOAD_DWORDX4_OFFSET;
  case AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFEN:
    return AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFSET;
  case AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFEN:
    return AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFSET;
  case AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFEN:
    return AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFSET;
  case AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFEN:
    return AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFSET;
  case AMDGPU::BUFFER_LOAD_SHORT_D16_OFFEN:
    return AMDGPU::BUFFER_LOAD_SHORT_D16_OFFSET;
  case AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFEN:
    return AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFSET;
  default:
    return -1;
  }
}

static int getOffenMUBUFStore(unsigned Opc) {
  switch (Opc) {
  case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
    return AMDGPU::BUFFER_STORE_DWORD_OFFEN;
  case AMDGPU::BUFFER_STORE_BYTE_OFFSET:
    return AMDGPU::BUFFER_STORE_BYTE_OFFEN;
  case AMDGPU::BUFFER_STORE_SHORT_OFFSET:
    return AMDGPU::BUFFER_STORE_SHORT_OFFEN;
  case AMDGPU::BUFFER_STORE_DWORDX2_OFFSET:
    return AMDGPU::BUFFER_STORE_DWORDX2_OFFEN;
  case AMDGPU::BUFFER_STORE_DWORDX3_OFFSET:
    return AMDGPU::BUFFER_STORE_DWORDX3_OFFEN;
  case AMDGPU::BUFFER_STORE_DWORDX4_OFFSET:
    return AMDGPU::BUFFER_STORE_DWORDX4_OFFEN;
  case AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFSET:
    return AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFEN;
  case AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFSET:
    return AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFEN;
  default:
    return -1;
  }
}

static int getOffenMUBUFLoad(unsigned Opc) {
  switch (Opc) {
  case AMDGPU::BUFFER_LOAD_DWORD_OFFSET:
    return AMDGPU::BUFFER_LOAD_DWORD_OFFEN;
  case AMDGPU::BUFFER_LOAD_UBYTE_OFFSET:
    return AMDGPU::BUFFER_LOAD_UBYTE_OFFEN;
  case AMDGPU::BUFFER_LOAD_SBYTE_OFFSET:
    return AMDGPU::BUFFER_LOAD_SBYTE_OFFEN;
  case AMDGPU::BUFFER_LOAD_USHORT_OFFSET:
    return AMDGPU::BUFFER_LOAD_USHORT_OFFEN;
  case AMDGPU::BUFFER_LOAD_SSHORT_OFFSET:
    return AMDGPU::BUFFER_LOAD_SSHORT_OFFEN;
  case AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET:
    return AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN;
  case AMDGPU::BUFFER_LOAD_DWORDX3_OFFSET:
    return AMDGPU::BUFFER_LOAD_DWORDX3_OFFEN;
  case AMDGPU::BUFFER_LOAD_DWORDX4_OFFSET:
    return AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN;
  case AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFSET:
    return AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFEN;
  case AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFSET:
    return AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFEN;
  case AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFSET:
    return AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFEN;
  case AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFSET:
    return AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFEN;
  case AMDGPU::BUFFER_LOAD_SHORT_D16_OFFSET:
    return AMDGPU::BUFFER_LOAD_SHORT_D16_OFFEN;
  case AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFSET:
    return AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFEN;
  default:
    return -1;
  }
}

static MachineInstrBuilder spillVGPRtoAGPR(const GCNSubtarget &ST,
                                           MachineBasicBlock &MBB,
                                           MachineBasicBlock::iterator MI,
                                           int Index, unsigned Lane,
                                           unsigned ValueReg, bool IsKill) {
  MachineFunction *MF = MBB.getParent();
  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
  const SIInstrInfo *TII = ST.getInstrInfo();

  MCPhysReg Reg = MFI->getVGPRToAGPRSpill(Index, Lane);

  if (Reg == AMDGPU::NoRegister)
    return MachineInstrBuilder();

  bool IsStore = MI->mayStore();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  auto *TRI = static_cast<const SIRegisterInfo*>(MRI.getTargetRegisterInfo());

  unsigned Dst = IsStore ? Reg : ValueReg;
  unsigned Src = IsStore ? ValueReg : Reg;
  bool IsVGPR = TRI->isVGPR(MRI, Reg);
  DebugLoc DL = MI->getDebugLoc();
  if (IsVGPR == TRI->isVGPR(MRI, ValueReg)) {
    // The spiller during regalloc may restore a spilled register to its
    // superclass. That can result in AGPR spills restored to VGPRs or the
    // other way around, leaving src and dst with identical regclasses at this
    // point. A plain copy suffices in such cases.
    auto CopyMIB = BuildMI(MBB, MI, DL, TII->get(AMDGPU::COPY), Dst)
                       .addReg(Src, getKillRegState(IsKill));
    CopyMIB->setAsmPrinterFlag(MachineInstr::ReloadReuse);
    return CopyMIB;
  }
  unsigned Opc = (IsStore ^ IsVGPR) ? AMDGPU::V_ACCVGPR_WRITE_B32_e64
                                    : AMDGPU::V_ACCVGPR_READ_B32_e64;

  auto MIB = BuildMI(MBB, MI, DL, TII->get(Opc), Dst)
                 .addReg(Src, getKillRegState(IsKill));
  MIB->setAsmPrinterFlag(MachineInstr::ReloadReuse);
  return MIB;
}

// This differs from buildSpillLoadStore by only scavenging a VGPR. It does not
// need to handle the case where an SGPR may need to be spilled while spilling.
static bool buildMUBUFOffsetLoadStore(const GCNSubtarget &ST,
                                      MachineFrameInfo &MFI,
                                      MachineBasicBlock::iterator MI,
                                      int Index,
                                      int64_t Offset) {
  const SIInstrInfo *TII = ST.getInstrInfo();
  MachineBasicBlock *MBB = MI->getParent();
  const DebugLoc &DL = MI->getDebugLoc();
  bool IsStore = MI->mayStore();

  unsigned Opc = MI->getOpcode();
  int LoadStoreOp = IsStore ?
    getOffsetMUBUFStore(Opc) : getOffsetMUBUFLoad(Opc);
  if (LoadStoreOp == -1)
    return false;

  const MachineOperand *Reg = TII->getNamedOperand(*MI, AMDGPU::OpName::vdata);
  if (spillVGPRtoAGPR(ST, *MBB, MI, Index, 0, Reg->getReg(), false).getInstr())
    return true;

  MachineInstrBuilder NewMI =
      BuildMI(*MBB, MI, DL, TII->get(LoadStoreOp))
          .add(*Reg)
          .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::srsrc))
          .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::soffset))
          .addImm(Offset)
          .addImm(0) // cpol
          .addImm(0) // swz
          .cloneMemRefs(*MI);

  const MachineOperand *VDataIn = TII->getNamedOperand(*MI,
                                                       AMDGPU::OpName::vdata_in);
  if (VDataIn)
    NewMI.add(*VDataIn);
  return true;
}

static unsigned getFlatScratchSpillOpcode(const SIInstrInfo *TII,
                                          unsigned LoadStoreOp,
                                          unsigned EltSize) {
  bool IsStore = TII->get(LoadStoreOp).mayStore();
  bool HasVAddr = AMDGPU::hasNamedOperand(LoadStoreOp, AMDGPU::OpName::vaddr);
  bool UseST =
      !HasVAddr && !AMDGPU::hasNamedOperand(LoadStoreOp, AMDGPU::OpName::saddr);

  switch (EltSize) {
  case 4:
    LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORD_SADDR
                          : AMDGPU::SCRATCH_LOAD_DWORD_SADDR;
    break;
  case 8:
    LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORDX2_SADDR
                          : AMDGPU::SCRATCH_LOAD_DWORDX2_SADDR;
    break;
  case 12:
    LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORDX3_SADDR
                          : AMDGPU::SCRATCH_LOAD_DWORDX3_SADDR;
    break;
  case 16:
    LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORDX4_SADDR
                          : AMDGPU::SCRATCH_LOAD_DWORDX4_SADDR;
    break;
  default:
    llvm_unreachable("Unexpected spill load/store size!");
  }

  if (HasVAddr)
    LoadStoreOp = AMDGPU::getFlatScratchInstSVfromSS(LoadStoreOp);
  else if (UseST)
    LoadStoreOp = AMDGPU::getFlatScratchInstSTfromSS(LoadStoreOp);

  return LoadStoreOp;
}

void SIRegisterInfo::buildSpillLoadStore(
    MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL,
    unsigned LoadStoreOp, int Index, Register ValueReg, bool IsKill,
    MCRegister ScratchOffsetReg, int64_t InstOffset, MachineMemOperand *MMO,
    RegScavenger *RS, LivePhysRegs *LiveRegs) const {
  assert((!RS || !LiveRegs) && "Only RS or LiveRegs can be set but not both");

  MachineFunction *MF = MBB.getParent();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const MachineFrameInfo &MFI = MF->getFrameInfo();
  const SIMachineFunctionInfo *FuncInfo = MF->getInfo<SIMachineFunctionInfo>();

  const MCInstrDesc *Desc = &TII->get(LoadStoreOp);
  bool IsStore = Desc->mayStore();
  bool IsFlat = TII->isFLATScratch(LoadStoreOp);

  bool CanClobberSCC = false;
  bool Scavenged = false;
  MCRegister SOffset = ScratchOffsetReg;

  const TargetRegisterClass *RC = getRegClassForReg(MF->getRegInfo(), ValueReg);
  // On gfx90a+ AGPR is a regular VGPR acceptable for loads and stores.
  const bool IsAGPR = !ST.hasGFX90AInsts() && isAGPRClass(RC);
  const unsigned RegWidth = AMDGPU::getRegBitWidth(*RC) / 8;

  // Always use 4 byte operations for AGPRs because we need to scavenge
  // a temporary VGPR.
  unsigned EltSize = (IsFlat && !IsAGPR) ? std::min(RegWidth, 16u) : 4u;
  unsigned NumSubRegs = RegWidth / EltSize;
  unsigned Size = NumSubRegs * EltSize;
  unsigned RemSize = RegWidth - Size;
  unsigned NumRemSubRegs = RemSize ? 1 : 0;
  int64_t Offset = InstOffset + MFI.getObjectOffset(Index);
  int64_t MaterializedOffset = Offset;

  int64_t MaxOffset = Offset + Size + RemSize - EltSize;
  int64_t ScratchOffsetRegDelta = 0;

  if (IsFlat && EltSize > 4) {
    LoadStoreOp = getFlatScratchSpillOpcode(TII, LoadStoreOp, EltSize);
    Desc = &TII->get(LoadStoreOp);
  }

  Align Alignment = MFI.getObjectAlign(Index);
  const MachinePointerInfo &BasePtrInfo = MMO->getPointerInfo();

  assert((IsFlat || ((Offset % EltSize) == 0)) &&
         "unexpected VGPR spill offset");

  // Track a VGPR to use for a constant offset we need to materialize.
  Register TmpOffsetVGPR;

  // Track a VGPR to use as an intermediate value.
  Register TmpIntermediateVGPR;
  bool UseVGPROffset = false;

  // Materialize a VGPR offset required for the given SGPR/VGPR/Immediate
  // combination.
  auto MaterializeVOffset = [&](Register SGPRBase, Register TmpVGPR,
                                int64_t VOffset) {
    // We are using a VGPR offset
    if (IsFlat && SGPRBase) {
      // We only have 1 VGPR offset, or 1 SGPR offset. We don't have a free
      // SGPR, so perform the add as vector.
      // We don't need a base SGPR in the kernel.
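      // Why two paths: targets with a constant-bus limit of at least 2 can
      // read the SGPR base and the literal offset in one VOP3 add, while
      // older targets allow only one scalar value per VALU instruction, so
      // the base is copied into the VGPR first and the literal is added with
      // the VOP2 form afterwards.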

      if (ST.getConstantBusLimit(AMDGPU::V_ADD_U32_e64) >= 2) {
        BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_ADD_U32_e64), TmpVGPR)
            .addReg(SGPRBase)
            .addImm(VOffset)
            .addImm(0); // clamp
      } else {
        BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR)
            .addReg(SGPRBase);
        BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_ADD_U32_e32), TmpVGPR)
            .addImm(VOffset)
            .addReg(TmpOffsetVGPR);
      }
    } else {
      assert(TmpOffsetVGPR);
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR)
          .addImm(VOffset);
    }
  };

  bool IsOffsetLegal =
      IsFlat ? TII->isLegalFLATOffset(MaxOffset, AMDGPUAS::PRIVATE_ADDRESS,
                                      SIInstrFlags::FlatScratch)
             : SIInstrInfo::isLegalMUBUFImmOffset(MaxOffset);
  if (!IsOffsetLegal || (IsFlat && !SOffset && !ST.hasFlatScratchSTMode())) {
    SOffset = MCRegister();

    // We don't have access to the register scavenger if this function is
    // called during PEI::scavengeFrameVirtualRegs(), so use LiveRegs in this
    // case.
    // TODO: Clobbering SCC is not necessary for scratch instructions in the
    // entry.
    if (RS) {
      SOffset = RS->scavengeRegisterBackwards(AMDGPU::SGPR_32RegClass, MI,
                                              false, 0, false);

      // Piggy back on the liveness scan we just did to see if SCC is dead.
      CanClobberSCC = !RS->isRegUsed(AMDGPU::SCC);
    } else if (LiveRegs) {
      CanClobberSCC = !LiveRegs->contains(AMDGPU::SCC);
      for (MCRegister Reg : AMDGPU::SGPR_32RegClass) {
        if (LiveRegs->available(MF->getRegInfo(), Reg)) {
          SOffset = Reg;
          break;
        }
      }
    }

    if (ScratchOffsetReg != AMDGPU::NoRegister && !CanClobberSCC)
      SOffset = Register();

    if (!SOffset) {
      UseVGPROffset = true;

      if (RS) {
        TmpOffsetVGPR = RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass,
                                                      MI, false, 0);
      } else {
        assert(LiveRegs);
        for (MCRegister Reg : AMDGPU::VGPR_32RegClass) {
          if (LiveRegs->available(MF->getRegInfo(), Reg)) {
            TmpOffsetVGPR = Reg;
            break;
          }
        }
      }

      assert(TmpOffsetVGPR);
    } else if (!SOffset && CanClobberSCC) {
      // There are no free SGPRs, and we are in the process of spilling VGPRs
      // too. Since we need a VGPR in order to spill SGPRs (this is true on
      // SI/CI, and on VI it is true until we implement spilling using scalar
      // stores), we have no way to free up an SGPR. Our solution here is to
      // add the offset directly to the ScratchOffset or StackPtrOffset
      // register, and then subtract the offset after the spill to return the
      // register to its original value.

      // TODO: If we don't have to do an emergency stack slot spill, converting
      // to use the VGPR offset is fewer instructions.
      if (!ScratchOffsetReg)
        ScratchOffsetReg = FuncInfo->getStackPtrOffsetReg();
      SOffset = ScratchOffsetReg;
      ScratchOffsetRegDelta = Offset;
    } else {
      Scavenged = true;
    }

    // We currently only support spilling VGPRs to EltSize boundaries, meaning
    // we can simplify the adjustment of Offset here to just scale with
    // WavefrontSize.
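    // Roughly, for the wave64 MUBUF path: a per-lane offset of 8 bytes
    // scales to 8 * 64 = 512 bytes in the backing store, since scratch is
    // swizzled so each lane's element is interleaved across the wave.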
    if (!IsFlat && !UseVGPROffset)
      Offset *= ST.getWavefrontSize();

    if (!UseVGPROffset && !SOffset)
      report_fatal_error("could not scavenge SGPR to spill in entry function");

    if (UseVGPROffset) {
      // We are using a VGPR offset
      MaterializeVOffset(ScratchOffsetReg, TmpOffsetVGPR, Offset);
    } else if (ScratchOffsetReg == AMDGPU::NoRegister) {
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), SOffset).addImm(Offset);
    } else {
      assert(Offset != 0);
      auto Add = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), SOffset)
                     .addReg(ScratchOffsetReg)
                     .addImm(Offset);
      Add->getOperand(3).setIsDead(); // Mark SCC as dead.
    }

    Offset = 0;
  }

  if (IsFlat && SOffset == AMDGPU::NoRegister) {
    assert(AMDGPU::getNamedOperandIdx(LoadStoreOp, AMDGPU::OpName::vaddr) < 0
           && "Unexpected vaddr for flat scratch with a FI operand");

    if (UseVGPROffset) {
      LoadStoreOp = AMDGPU::getFlatScratchInstSVfromSS(LoadStoreOp);
    } else {
      assert(ST.hasFlatScratchSTMode());
      LoadStoreOp = AMDGPU::getFlatScratchInstSTfromSS(LoadStoreOp);
    }

    Desc = &TII->get(LoadStoreOp);
  }

  for (unsigned i = 0, e = NumSubRegs + NumRemSubRegs, RegOffset = 0; i != e;
       ++i, RegOffset += EltSize) {
    if (i == NumSubRegs) {
      EltSize = RemSize;
      LoadStoreOp = getFlatScratchSpillOpcode(TII, LoadStoreOp, EltSize);
    }
    Desc = &TII->get(LoadStoreOp);

    if (!IsFlat && UseVGPROffset) {
      int NewLoadStoreOp = IsStore ? getOffenMUBUFStore(LoadStoreOp)
                                   : getOffenMUBUFLoad(LoadStoreOp);
      Desc = &TII->get(NewLoadStoreOp);
    }

    if (UseVGPROffset && TmpOffsetVGPR == TmpIntermediateVGPR) {
      // If we are spilling an AGPR beyond the range of the memory instruction
      // offset and need to use a VGPR offset, we ideally have at least 2
      // scratch VGPRs. If we don't have a second free VGPR without spilling,
      // recycle the VGPR used for the offset which requires resetting after
      // each subregister.

      MaterializeVOffset(ScratchOffsetReg, TmpOffsetVGPR, MaterializedOffset);
    }

    unsigned NumRegs = EltSize / 4;
    Register SubReg = e == 1
        ? ValueReg
        : Register(getSubReg(ValueReg,
                             getSubRegFromChannel(RegOffset / 4, NumRegs)));

    unsigned SOffsetRegState = 0;
    unsigned SrcDstRegState = getDefRegState(!IsStore);
    const bool IsLastSubReg = i + 1 == e;
    const bool IsFirstSubReg = i == 0;
    if (IsLastSubReg) {
      SOffsetRegState |= getKillRegState(Scavenged);
      // The last implicit use carries the "Kill" flag.
      SrcDstRegState |= getKillRegState(IsKill);
    }

    // Make sure the whole register is defined if there are undef components by
    // adding an implicit def of the super-reg on the first instruction.
    bool NeedSuperRegDef = e > 1 && IsStore && IsFirstSubReg;
    bool NeedSuperRegImpOperand = e > 1;

    // Remaining element size to spill into memory after some parts of it
    // spilled into either AGPRs or VGPRs.
    unsigned RemEltSize = EltSize;

    // AGPRs to spill VGPRs and vice versa are allocated in reverse order,
    // starting from the last lane. If a register cannot be completely spilled
    // into another register, this ensures its alignment does not change.
1552   for (int LaneS = (RegOffset + EltSize) / 4 - 1, Lane = LaneS,
1553            LaneE = RegOffset / 4;
1554        Lane >= LaneE; --Lane) {
1555     bool IsSubReg = e > 1 || EltSize > 4;
1556     Register Sub = IsSubReg
1557         ? Register(getSubReg(ValueReg, getSubRegFromChannel(Lane)))
1558         : ValueReg;
1559     auto MIB = spillVGPRtoAGPR(ST, MBB, MI, Index, Lane, Sub, IsKill);
1560     if (!MIB.getInstr())
1561       break;
1562     if (NeedSuperRegDef || (IsSubReg && IsStore && Lane == LaneS && IsFirstSubReg)) {
1563       MIB.addReg(ValueReg, RegState::ImplicitDefine);
1564       NeedSuperRegDef = false;
1565     }
1566     if ((IsSubReg || NeedSuperRegImpOperand) && (IsFirstSubReg || IsLastSubReg)) {
1567       NeedSuperRegImpOperand = true;
1568       unsigned State = SrcDstRegState;
1569       if (!IsLastSubReg || (Lane != LaneE))
1570         State &= ~RegState::Kill;
1571       if (!IsFirstSubReg || (Lane != LaneS))
1572         State &= ~RegState::Define;
1573       MIB.addReg(ValueReg, RegState::Implicit | State);
1574     }
1575     RemEltSize -= 4;
1576   }
1577
1578   if (!RemEltSize) // Fully spilled into AGPRs.
1579     continue;
1580
1581   if (RemEltSize != EltSize) { // Partially spilled to AGPRs.
1582     assert(IsFlat && EltSize > 4);
1583
1584     unsigned NumRegs = RemEltSize / 4;
1585     SubReg = Register(getSubReg(ValueReg,
1586                                 getSubRegFromChannel(RegOffset / 4, NumRegs)));
1587     unsigned Opc = getFlatScratchSpillOpcode(TII, LoadStoreOp, RemEltSize);
1588     Desc = &TII->get(Opc);
1589   }
1590
1591   Register FinalReg = SubReg;
1592
1593   if (IsAGPR) {
1594     assert(EltSize == 4);
1595
1596     if (!TmpIntermediateVGPR) {
1597       TmpIntermediateVGPR = FuncInfo->getVGPRForAGPRCopy();
1598       assert(MF->getRegInfo().isReserved(TmpIntermediateVGPR));
1599     }
1600     if (IsStore) {
1601       auto AccRead = BuildMI(MBB, MI, DL,
1602                              TII->get(AMDGPU::V_ACCVGPR_READ_B32_e64),
1603                              TmpIntermediateVGPR)
1604                          .addReg(SubReg, getKillRegState(IsKill));
1605       if (NeedSuperRegDef)
1606         AccRead.addReg(ValueReg, RegState::ImplicitDefine);
1607       AccRead->setAsmPrinterFlag(MachineInstr::ReloadReuse);
1608     }
1609     SubReg = TmpIntermediateVGPR;
1610   } else if (UseVGPROffset) {
1611     if (!TmpOffsetVGPR) {
1612       TmpOffsetVGPR = RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass,
1613                                                     MI, false, 0);
1614       RS->setRegUsed(TmpOffsetVGPR);
1615     }
1616   }
1617
1618
1619   MachinePointerInfo PInfo = BasePtrInfo.getWithOffset(RegOffset);
1620   MachineMemOperand *NewMMO =
1621       MF->getMachineMemOperand(PInfo, MMO->getFlags(), RemEltSize,
1622                                commonAlignment(Alignment, RegOffset));
1623
1624   auto MIB =
1625       BuildMI(MBB, MI, DL, *Desc)
1626           .addReg(SubReg, getDefRegState(!IsStore) | getKillRegState(IsKill));
1627
1628   if (UseVGPROffset) {
1629     // For an AGPR spill, we reuse the same temp VGPR for the offset and the
1630     // intermediate accvgpr_write.
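    // The offset VGPR is killed only on the last subregister, and never for
    // AGPR spills, where the temporary may double as the accvgpr
    // intermediate and is rematerialized for each subregister above.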
1631     MIB.addReg(TmpOffsetVGPR, getKillRegState(IsLastSubReg && !IsAGPR));
1632   }
1633
1634   if (!IsFlat)
1635     MIB.addReg(FuncInfo->getScratchRSrcReg());
1636
1637   if (SOffset == AMDGPU::NoRegister) {
1638     if (!IsFlat) {
1639       if (UseVGPROffset && ScratchOffsetReg) {
1640         MIB.addReg(ScratchOffsetReg);
1641       } else {
1642         assert(FuncInfo->isEntryFunction());
1643         MIB.addImm(0);
1644       }
1645     }
1646   } else {
1647     MIB.addReg(SOffset, SOffsetRegState);
1648   }
1649   MIB.addImm(Offset + RegOffset)
1650      .addImm(0); // cpol
1651   if (!IsFlat)
1652     MIB.addImm(0); // swz
1653   MIB.addMemOperand(NewMMO);
1654
1655   if (!IsAGPR && NeedSuperRegDef)
1656     MIB.addReg(ValueReg, RegState::ImplicitDefine);
1657
1658   if (!IsStore && IsAGPR && TmpIntermediateVGPR != AMDGPU::NoRegister) {
1659     MIB = BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64),
1660                   FinalReg)
1661               .addReg(TmpIntermediateVGPR, RegState::Kill);
1662     MIB->setAsmPrinterFlag(MachineInstr::ReloadReuse);
1663   }
1664
1665   if (NeedSuperRegImpOperand && (IsFirstSubReg || IsLastSubReg))
1666     MIB.addReg(ValueReg, RegState::Implicit | SrcDstRegState);
1667
1668   // The epilog restore of a WWM scratch register can cause an undesired
1669   // optimization during machine-cp after PrologEpilogInserter if the same
1670   // register was assigned for return-value ABI lowering with a COPY
1671   // instruction. As shown below, once the epilog reload is inserted, the
1672   // earlier COPY appears dead to machine-cp.
1673   // ...
1674   // v0 is used in a WWM operation and needs the WWM spill at prolog/epilog.
1675   // $vgpr0 = V_WRITELANE_B32 $sgpr20, 0, $vgpr0
1676   // ...
1677   // Epilog block:
1678   // $vgpr0 = COPY $vgpr1 // outgoing value moved to v0
1679   // ...
1680   // WWM spill restore to preserve the inactive lanes of v0.
1681   // $sgpr4_sgpr5 = S_XOR_SAVEEXEC_B64 -1
1682   // $vgpr0 = BUFFER_LOAD $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0
1683   // $exec = S_MOV_B64 killed $sgpr4_sgpr5
1684   // ...
1685   // SI_RETURN implicit $vgpr0
1686   // ...
1687   // To fix it, mark the same register as a tied operand of such restore
1688   // instructions so that it records a use for the preceding COPY.
1689   if (!IsStore && MI != MBB.end() && MI->isReturn() &&
1690       MI->readsRegister(SubReg, this)) {
1691     MIB.addReg(SubReg, RegState::Implicit);
1692     MIB->tieOperands(0, MIB->getNumOperands() - 1);
1693   }
1694 }
1695
1696 if (ScratchOffsetRegDelta != 0) {
1697   // Subtract the offset we added to the ScratchOffset register.
1698   BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), SOffset)
1699       .addReg(SOffset)
1700       .addImm(-ScratchOffsetRegDelta);
1701 }
1702 }
1703
1704 void SIRegisterInfo::buildVGPRSpillLoadStore(SGPRSpillBuilder &SB, int Index,
1705                                              int Offset, bool IsLoad,
1706                                              bool IsKill) const {
1707   // Load/store VGPR
1708   MachineFrameInfo &FrameInfo = SB.MF.getFrameInfo();
1709   assert(FrameInfo.getStackID(Index) != TargetStackID::SGPRSpill);
1710
1711   Register FrameReg =
1712       FrameInfo.isFixedObjectIndex(Index) && hasBasePointer(SB.MF)
1713           ? getBaseRegister()
1714           : getFrameRegister(SB.MF);
1715
1716   Align Alignment = FrameInfo.getObjectAlign(Index);
1717   MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(SB.MF, Index);
1718   MachineMemOperand *MMO = SB.MF.getMachineMemOperand(
1719       PtrInfo, IsLoad ? MachineMemOperand::MOLoad : MachineMemOperand::MOStore,
1720       SB.EltSize, Alignment);
1721
1722   if (IsLoad) {
1723     unsigned Opc = ST.enableFlatScratch() ?
AMDGPU::SCRATCH_LOAD_DWORD_SADDR 1724 : AMDGPU::BUFFER_LOAD_DWORD_OFFSET; 1725 buildSpillLoadStore(*SB.MBB, SB.MI, SB.DL, Opc, Index, SB.TmpVGPR, false, 1726 FrameReg, Offset * SB.EltSize, MMO, SB.RS); 1727 } else { 1728 unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_STORE_DWORD_SADDR 1729 : AMDGPU::BUFFER_STORE_DWORD_OFFSET; 1730 buildSpillLoadStore(*SB.MBB, SB.MI, SB.DL, Opc, Index, SB.TmpVGPR, IsKill, 1731 FrameReg, Offset * SB.EltSize, MMO, SB.RS); 1732 // This only ever adds one VGPR spill 1733 SB.MFI.addToSpilledVGPRs(1); 1734 } 1735 } 1736 1737 bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI, int Index, 1738 RegScavenger *RS, SlotIndexes *Indexes, 1739 LiveIntervals *LIS, bool OnlyToVGPR) const { 1740 SGPRSpillBuilder SB(*this, *ST.getInstrInfo(), isWave32, MI, Index, RS); 1741 1742 ArrayRef<SpilledReg> VGPRSpills = SB.MFI.getSGPRSpillToVGPRLanes(Index); 1743 bool SpillToVGPR = !VGPRSpills.empty(); 1744 if (OnlyToVGPR && !SpillToVGPR) 1745 return false; 1746 1747 assert(SpillToVGPR || (SB.SuperReg != SB.MFI.getStackPtrOffsetReg() && 1748 SB.SuperReg != SB.MFI.getFrameOffsetReg())); 1749 1750 if (SpillToVGPR) { 1751 1752 assert(SB.NumSubRegs == VGPRSpills.size() && 1753 "Num of VGPR lanes should be equal to num of SGPRs spilled"); 1754 1755 for (unsigned i = 0, e = SB.NumSubRegs; i < e; ++i) { 1756 Register SubReg = 1757 SB.NumSubRegs == 1 1758 ? SB.SuperReg 1759 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i])); 1760 SpilledReg Spill = VGPRSpills[i]; 1761 1762 bool IsFirstSubreg = i == 0; 1763 bool IsLastSubreg = i == SB.NumSubRegs - 1; 1764 bool UseKill = SB.IsKill && IsLastSubreg; 1765 1766 1767 // Mark the "old value of vgpr" input undef only if this is the first sgpr 1768 // spill to this specific vgpr in the first basic block. 1769 auto MIB = BuildMI(*SB.MBB, MI, SB.DL, 1770 SB.TII.get(AMDGPU::V_WRITELANE_B32), Spill.VGPR) 1771 .addReg(SubReg, getKillRegState(UseKill)) 1772 .addImm(Spill.Lane) 1773 .addReg(Spill.VGPR); 1774 if (Indexes) { 1775 if (IsFirstSubreg) 1776 Indexes->replaceMachineInstrInMaps(*MI, *MIB); 1777 else 1778 Indexes->insertMachineInstrInMaps(*MIB); 1779 } 1780 1781 if (IsFirstSubreg && SB.NumSubRegs > 1) { 1782 // We may be spilling a super-register which is only partially defined, 1783 // and need to ensure later spills think the value is defined. 1784 MIB.addReg(SB.SuperReg, RegState::ImplicitDefine); 1785 } 1786 1787 if (SB.NumSubRegs > 1 && (IsFirstSubreg || IsLastSubreg)) 1788 MIB.addReg(SB.SuperReg, getKillRegState(UseKill) | RegState::Implicit); 1789 1790 // FIXME: Since this spills to another register instead of an actual 1791 // frame index, we should delete the frame index when all references to 1792 // it are fixed. 1793 } 1794 } else { 1795 SB.prepare(); 1796 1797 // SubReg carries the "Kill" flag when SubReg == SB.SuperReg. 1798 unsigned SubKillState = getKillRegState((SB.NumSubRegs == 1) && SB.IsKill); 1799 1800 // Per VGPR helper data 1801 auto PVD = SB.getPerVGPRData(); 1802 1803 for (unsigned Offset = 0; Offset < PVD.NumVGPRs; ++Offset) { 1804 unsigned TmpVGPRFlags = RegState::Undef; 1805 1806 // Write sub registers into the VGPR 1807 for (unsigned i = Offset * PVD.PerVGPR, 1808 e = std::min((Offset + 1) * PVD.PerVGPR, SB.NumSubRegs); 1809 i < e; ++i) { 1810 Register SubReg = 1811 SB.NumSubRegs == 1 1812 ? 
SB.SuperReg 1813 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i])); 1814 1815 MachineInstrBuilder WriteLane = 1816 BuildMI(*SB.MBB, MI, SB.DL, SB.TII.get(AMDGPU::V_WRITELANE_B32), 1817 SB.TmpVGPR) 1818 .addReg(SubReg, SubKillState) 1819 .addImm(i % PVD.PerVGPR) 1820 .addReg(SB.TmpVGPR, TmpVGPRFlags); 1821 TmpVGPRFlags = 0; 1822 1823 if (Indexes) { 1824 if (i == 0) 1825 Indexes->replaceMachineInstrInMaps(*MI, *WriteLane); 1826 else 1827 Indexes->insertMachineInstrInMaps(*WriteLane); 1828 } 1829 1830 // There could be undef components of a spilled super register. 1831 // TODO: Can we detect this and skip the spill? 1832 if (SB.NumSubRegs > 1) { 1833 // The last implicit use of the SB.SuperReg carries the "Kill" flag. 1834 unsigned SuperKillState = 0; 1835 if (i + 1 == SB.NumSubRegs) 1836 SuperKillState |= getKillRegState(SB.IsKill); 1837 WriteLane.addReg(SB.SuperReg, RegState::Implicit | SuperKillState); 1838 } 1839 } 1840 1841 // Write out VGPR 1842 SB.readWriteTmpVGPR(Offset, /*IsLoad*/ false); 1843 } 1844 1845 SB.restore(); 1846 } 1847 1848 MI->eraseFromParent(); 1849 SB.MFI.addToSpilledSGPRs(SB.NumSubRegs); 1850 1851 if (LIS) 1852 LIS->removeAllRegUnitsForPhysReg(SB.SuperReg); 1853 1854 return true; 1855 } 1856 1857 bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI, int Index, 1858 RegScavenger *RS, SlotIndexes *Indexes, 1859 LiveIntervals *LIS, bool OnlyToVGPR) const { 1860 SGPRSpillBuilder SB(*this, *ST.getInstrInfo(), isWave32, MI, Index, RS); 1861 1862 ArrayRef<SpilledReg> VGPRSpills = SB.MFI.getSGPRSpillToVGPRLanes(Index); 1863 bool SpillToVGPR = !VGPRSpills.empty(); 1864 if (OnlyToVGPR && !SpillToVGPR) 1865 return false; 1866 1867 if (SpillToVGPR) { 1868 for (unsigned i = 0, e = SB.NumSubRegs; i < e; ++i) { 1869 Register SubReg = 1870 SB.NumSubRegs == 1 1871 ? SB.SuperReg 1872 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i])); 1873 1874 SpilledReg Spill = VGPRSpills[i]; 1875 auto MIB = BuildMI(*SB.MBB, MI, SB.DL, SB.TII.get(AMDGPU::V_READLANE_B32), 1876 SubReg) 1877 .addReg(Spill.VGPR) 1878 .addImm(Spill.Lane); 1879 if (SB.NumSubRegs > 1 && i == 0) 1880 MIB.addReg(SB.SuperReg, RegState::ImplicitDefine); 1881 if (Indexes) { 1882 if (i == e - 1) 1883 Indexes->replaceMachineInstrInMaps(*MI, *MIB); 1884 else 1885 Indexes->insertMachineInstrInMaps(*MIB); 1886 } 1887 } 1888 } else { 1889 SB.prepare(); 1890 1891 // Per VGPR helper data 1892 auto PVD = SB.getPerVGPRData(); 1893 1894 for (unsigned Offset = 0; Offset < PVD.NumVGPRs; ++Offset) { 1895 // Load in VGPR data 1896 SB.readWriteTmpVGPR(Offset, /*IsLoad*/ true); 1897 1898 // Unpack lanes 1899 for (unsigned i = Offset * PVD.PerVGPR, 1900 e = std::min((Offset + 1) * PVD.PerVGPR, SB.NumSubRegs); 1901 i < e; ++i) { 1902 Register SubReg = 1903 SB.NumSubRegs == 1 1904 ? 
SB.SuperReg 1905 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i])); 1906 1907 bool LastSubReg = (i + 1 == e); 1908 auto MIB = BuildMI(*SB.MBB, MI, SB.DL, 1909 SB.TII.get(AMDGPU::V_READLANE_B32), SubReg) 1910 .addReg(SB.TmpVGPR, getKillRegState(LastSubReg)) 1911 .addImm(i); 1912 if (SB.NumSubRegs > 1 && i == 0) 1913 MIB.addReg(SB.SuperReg, RegState::ImplicitDefine); 1914 if (Indexes) { 1915 if (i == e - 1) 1916 Indexes->replaceMachineInstrInMaps(*MI, *MIB); 1917 else 1918 Indexes->insertMachineInstrInMaps(*MIB); 1919 } 1920 } 1921 } 1922 1923 SB.restore(); 1924 } 1925 1926 MI->eraseFromParent(); 1927 1928 if (LIS) 1929 LIS->removeAllRegUnitsForPhysReg(SB.SuperReg); 1930 1931 return true; 1932 } 1933 1934 bool SIRegisterInfo::spillEmergencySGPR(MachineBasicBlock::iterator MI, 1935 MachineBasicBlock &RestoreMBB, 1936 Register SGPR, RegScavenger *RS) const { 1937 SGPRSpillBuilder SB(*this, *ST.getInstrInfo(), isWave32, MI, SGPR, false, 0, 1938 RS); 1939 SB.prepare(); 1940 // Generate the spill of SGPR to SB.TmpVGPR. 1941 unsigned SubKillState = getKillRegState((SB.NumSubRegs == 1) && SB.IsKill); 1942 auto PVD = SB.getPerVGPRData(); 1943 for (unsigned Offset = 0; Offset < PVD.NumVGPRs; ++Offset) { 1944 unsigned TmpVGPRFlags = RegState::Undef; 1945 // Write sub registers into the VGPR 1946 for (unsigned i = Offset * PVD.PerVGPR, 1947 e = std::min((Offset + 1) * PVD.PerVGPR, SB.NumSubRegs); 1948 i < e; ++i) { 1949 Register SubReg = 1950 SB.NumSubRegs == 1 1951 ? SB.SuperReg 1952 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i])); 1953 1954 MachineInstrBuilder WriteLane = 1955 BuildMI(*SB.MBB, MI, SB.DL, SB.TII.get(AMDGPU::V_WRITELANE_B32), 1956 SB.TmpVGPR) 1957 .addReg(SubReg, SubKillState) 1958 .addImm(i % PVD.PerVGPR) 1959 .addReg(SB.TmpVGPR, TmpVGPRFlags); 1960 TmpVGPRFlags = 0; 1961 // There could be undef components of a spilled super register. 1962 // TODO: Can we detect this and skip the spill? 1963 if (SB.NumSubRegs > 1) { 1964 // The last implicit use of the SB.SuperReg carries the "Kill" flag. 1965 unsigned SuperKillState = 0; 1966 if (i + 1 == SB.NumSubRegs) 1967 SuperKillState |= getKillRegState(SB.IsKill); 1968 WriteLane.addReg(SB.SuperReg, RegState::Implicit | SuperKillState); 1969 } 1970 } 1971 // Don't need to write VGPR out. 1972 } 1973 1974 // Restore clobbered registers in the specified restore block. 1975 MI = RestoreMBB.end(); 1976 SB.setMI(&RestoreMBB, MI); 1977 // Generate the restore of SGPR from SB.TmpVGPR. 1978 for (unsigned Offset = 0; Offset < PVD.NumVGPRs; ++Offset) { 1979 // Don't need to load VGPR in. 1980 // Unpack lanes 1981 for (unsigned i = Offset * PVD.PerVGPR, 1982 e = std::min((Offset + 1) * PVD.PerVGPR, SB.NumSubRegs); 1983 i < e; ++i) { 1984 Register SubReg = 1985 SB.NumSubRegs == 1 1986 ? SB.SuperReg 1987 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i])); 1988 bool LastSubReg = (i + 1 == e); 1989 auto MIB = BuildMI(*SB.MBB, MI, SB.DL, SB.TII.get(AMDGPU::V_READLANE_B32), 1990 SubReg) 1991 .addReg(SB.TmpVGPR, getKillRegState(LastSubReg)) 1992 .addImm(i); 1993 if (SB.NumSubRegs > 1 && i == 0) 1994 MIB.addReg(SB.SuperReg, RegState::ImplicitDefine); 1995 } 1996 } 1997 SB.restore(); 1998 1999 SB.MFI.addToSpilledSGPRs(SB.NumSubRegs); 2000 return false; 2001 } 2002 2003 /// Special case of eliminateFrameIndex. Returns true if the SGPR was spilled to 2004 /// a VGPR and the stack slot can be safely eliminated when all other users are 2005 /// handled. 
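/// Returns false if the spill was not lowered to VGPR lanes (no lanes were
/// reserved for this frame index), in which case the index must still be
/// handled by the generic eliminateFrameIndex path.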
2006 bool SIRegisterInfo::eliminateSGPRToVGPRSpillFrameIndex( 2007 MachineBasicBlock::iterator MI, int FI, RegScavenger *RS, 2008 SlotIndexes *Indexes, LiveIntervals *LIS) const { 2009 switch (MI->getOpcode()) { 2010 case AMDGPU::SI_SPILL_S1024_SAVE: 2011 case AMDGPU::SI_SPILL_S512_SAVE: 2012 case AMDGPU::SI_SPILL_S384_SAVE: 2013 case AMDGPU::SI_SPILL_S352_SAVE: 2014 case AMDGPU::SI_SPILL_S320_SAVE: 2015 case AMDGPU::SI_SPILL_S288_SAVE: 2016 case AMDGPU::SI_SPILL_S256_SAVE: 2017 case AMDGPU::SI_SPILL_S224_SAVE: 2018 case AMDGPU::SI_SPILL_S192_SAVE: 2019 case AMDGPU::SI_SPILL_S160_SAVE: 2020 case AMDGPU::SI_SPILL_S128_SAVE: 2021 case AMDGPU::SI_SPILL_S96_SAVE: 2022 case AMDGPU::SI_SPILL_S64_SAVE: 2023 case AMDGPU::SI_SPILL_S32_SAVE: 2024 return spillSGPR(MI, FI, RS, Indexes, LIS, true); 2025 case AMDGPU::SI_SPILL_S1024_RESTORE: 2026 case AMDGPU::SI_SPILL_S512_RESTORE: 2027 case AMDGPU::SI_SPILL_S384_RESTORE: 2028 case AMDGPU::SI_SPILL_S352_RESTORE: 2029 case AMDGPU::SI_SPILL_S320_RESTORE: 2030 case AMDGPU::SI_SPILL_S288_RESTORE: 2031 case AMDGPU::SI_SPILL_S256_RESTORE: 2032 case AMDGPU::SI_SPILL_S224_RESTORE: 2033 case AMDGPU::SI_SPILL_S192_RESTORE: 2034 case AMDGPU::SI_SPILL_S160_RESTORE: 2035 case AMDGPU::SI_SPILL_S128_RESTORE: 2036 case AMDGPU::SI_SPILL_S96_RESTORE: 2037 case AMDGPU::SI_SPILL_S64_RESTORE: 2038 case AMDGPU::SI_SPILL_S32_RESTORE: 2039 return restoreSGPR(MI, FI, RS, Indexes, LIS, true); 2040 default: 2041 llvm_unreachable("not an SGPR spill instruction"); 2042 } 2043 } 2044 2045 bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, 2046 int SPAdj, unsigned FIOperandNum, 2047 RegScavenger *RS) const { 2048 MachineFunction *MF = MI->getParent()->getParent(); 2049 MachineBasicBlock *MBB = MI->getParent(); 2050 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); 2051 MachineFrameInfo &FrameInfo = MF->getFrameInfo(); 2052 const SIInstrInfo *TII = ST.getInstrInfo(); 2053 DebugLoc DL = MI->getDebugLoc(); 2054 2055 assert(SPAdj == 0 && "unhandled SP adjustment in call sequence?"); 2056 2057 MachineOperand &FIOp = MI->getOperand(FIOperandNum); 2058 int Index = MI->getOperand(FIOperandNum).getIndex(); 2059 2060 Register FrameReg = FrameInfo.isFixedObjectIndex(Index) && hasBasePointer(*MF) 2061 ? 
getBaseRegister() 2062 : getFrameRegister(*MF); 2063 2064 switch (MI->getOpcode()) { 2065 // SGPR register spill 2066 case AMDGPU::SI_SPILL_S1024_SAVE: 2067 case AMDGPU::SI_SPILL_S512_SAVE: 2068 case AMDGPU::SI_SPILL_S384_SAVE: 2069 case AMDGPU::SI_SPILL_S352_SAVE: 2070 case AMDGPU::SI_SPILL_S320_SAVE: 2071 case AMDGPU::SI_SPILL_S288_SAVE: 2072 case AMDGPU::SI_SPILL_S256_SAVE: 2073 case AMDGPU::SI_SPILL_S224_SAVE: 2074 case AMDGPU::SI_SPILL_S192_SAVE: 2075 case AMDGPU::SI_SPILL_S160_SAVE: 2076 case AMDGPU::SI_SPILL_S128_SAVE: 2077 case AMDGPU::SI_SPILL_S96_SAVE: 2078 case AMDGPU::SI_SPILL_S64_SAVE: 2079 case AMDGPU::SI_SPILL_S32_SAVE: { 2080 return spillSGPR(MI, Index, RS); 2081 } 2082 2083 // SGPR register restore 2084 case AMDGPU::SI_SPILL_S1024_RESTORE: 2085 case AMDGPU::SI_SPILL_S512_RESTORE: 2086 case AMDGPU::SI_SPILL_S384_RESTORE: 2087 case AMDGPU::SI_SPILL_S352_RESTORE: 2088 case AMDGPU::SI_SPILL_S320_RESTORE: 2089 case AMDGPU::SI_SPILL_S288_RESTORE: 2090 case AMDGPU::SI_SPILL_S256_RESTORE: 2091 case AMDGPU::SI_SPILL_S224_RESTORE: 2092 case AMDGPU::SI_SPILL_S192_RESTORE: 2093 case AMDGPU::SI_SPILL_S160_RESTORE: 2094 case AMDGPU::SI_SPILL_S128_RESTORE: 2095 case AMDGPU::SI_SPILL_S96_RESTORE: 2096 case AMDGPU::SI_SPILL_S64_RESTORE: 2097 case AMDGPU::SI_SPILL_S32_RESTORE: { 2098 return restoreSGPR(MI, Index, RS); 2099 } 2100 2101 // VGPR register spill 2102 case AMDGPU::SI_SPILL_V1024_SAVE: 2103 case AMDGPU::SI_SPILL_V512_SAVE: 2104 case AMDGPU::SI_SPILL_V384_SAVE: 2105 case AMDGPU::SI_SPILL_V352_SAVE: 2106 case AMDGPU::SI_SPILL_V320_SAVE: 2107 case AMDGPU::SI_SPILL_V288_SAVE: 2108 case AMDGPU::SI_SPILL_V256_SAVE: 2109 case AMDGPU::SI_SPILL_V224_SAVE: 2110 case AMDGPU::SI_SPILL_V192_SAVE: 2111 case AMDGPU::SI_SPILL_V160_SAVE: 2112 case AMDGPU::SI_SPILL_V128_SAVE: 2113 case AMDGPU::SI_SPILL_V96_SAVE: 2114 case AMDGPU::SI_SPILL_V64_SAVE: 2115 case AMDGPU::SI_SPILL_V32_SAVE: 2116 case AMDGPU::SI_SPILL_A1024_SAVE: 2117 case AMDGPU::SI_SPILL_A512_SAVE: 2118 case AMDGPU::SI_SPILL_A384_SAVE: 2119 case AMDGPU::SI_SPILL_A352_SAVE: 2120 case AMDGPU::SI_SPILL_A320_SAVE: 2121 case AMDGPU::SI_SPILL_A288_SAVE: 2122 case AMDGPU::SI_SPILL_A256_SAVE: 2123 case AMDGPU::SI_SPILL_A224_SAVE: 2124 case AMDGPU::SI_SPILL_A192_SAVE: 2125 case AMDGPU::SI_SPILL_A160_SAVE: 2126 case AMDGPU::SI_SPILL_A128_SAVE: 2127 case AMDGPU::SI_SPILL_A96_SAVE: 2128 case AMDGPU::SI_SPILL_A64_SAVE: 2129 case AMDGPU::SI_SPILL_A32_SAVE: 2130 case AMDGPU::SI_SPILL_AV1024_SAVE: 2131 case AMDGPU::SI_SPILL_AV512_SAVE: 2132 case AMDGPU::SI_SPILL_AV384_SAVE: 2133 case AMDGPU::SI_SPILL_AV352_SAVE: 2134 case AMDGPU::SI_SPILL_AV320_SAVE: 2135 case AMDGPU::SI_SPILL_AV288_SAVE: 2136 case AMDGPU::SI_SPILL_AV256_SAVE: 2137 case AMDGPU::SI_SPILL_AV224_SAVE: 2138 case AMDGPU::SI_SPILL_AV192_SAVE: 2139 case AMDGPU::SI_SPILL_AV160_SAVE: 2140 case AMDGPU::SI_SPILL_AV128_SAVE: 2141 case AMDGPU::SI_SPILL_AV96_SAVE: 2142 case AMDGPU::SI_SPILL_AV64_SAVE: 2143 case AMDGPU::SI_SPILL_AV32_SAVE: 2144 case AMDGPU::SI_SPILL_WWM_V32_SAVE: { 2145 const MachineOperand *VData = TII->getNamedOperand(*MI, 2146 AMDGPU::OpName::vdata); 2147 assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() == 2148 MFI->getStackPtrOffsetReg()); 2149 2150 unsigned Opc = ST.enableFlatScratch() ? 
AMDGPU::SCRATCH_STORE_DWORD_SADDR 2151 : AMDGPU::BUFFER_STORE_DWORD_OFFSET; 2152 auto *MBB = MI->getParent(); 2153 bool IsWWMRegSpill = TII->isWWMRegSpillOpcode(MI->getOpcode()); 2154 if (IsWWMRegSpill) { 2155 TII->insertScratchExecCopy(*MF, *MBB, MI, DL, MFI->getSGPRForEXECCopy(), 2156 RS->isRegUsed(AMDGPU::SCC)); 2157 } 2158 buildSpillLoadStore( 2159 *MBB, MI, DL, Opc, Index, VData->getReg(), VData->isKill(), FrameReg, 2160 TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(), 2161 *MI->memoperands_begin(), RS); 2162 MFI->addToSpilledVGPRs(getNumSubRegsForSpillOp(MI->getOpcode())); 2163 if (IsWWMRegSpill) 2164 TII->restoreExec(*MF, *MBB, MI, DL, MFI->getSGPRForEXECCopy()); 2165 2166 MI->eraseFromParent(); 2167 return true; 2168 } 2169 case AMDGPU::SI_SPILL_V32_RESTORE: 2170 case AMDGPU::SI_SPILL_V64_RESTORE: 2171 case AMDGPU::SI_SPILL_V96_RESTORE: 2172 case AMDGPU::SI_SPILL_V128_RESTORE: 2173 case AMDGPU::SI_SPILL_V160_RESTORE: 2174 case AMDGPU::SI_SPILL_V192_RESTORE: 2175 case AMDGPU::SI_SPILL_V224_RESTORE: 2176 case AMDGPU::SI_SPILL_V256_RESTORE: 2177 case AMDGPU::SI_SPILL_V288_RESTORE: 2178 case AMDGPU::SI_SPILL_V320_RESTORE: 2179 case AMDGPU::SI_SPILL_V352_RESTORE: 2180 case AMDGPU::SI_SPILL_V384_RESTORE: 2181 case AMDGPU::SI_SPILL_V512_RESTORE: 2182 case AMDGPU::SI_SPILL_V1024_RESTORE: 2183 case AMDGPU::SI_SPILL_A32_RESTORE: 2184 case AMDGPU::SI_SPILL_A64_RESTORE: 2185 case AMDGPU::SI_SPILL_A96_RESTORE: 2186 case AMDGPU::SI_SPILL_A128_RESTORE: 2187 case AMDGPU::SI_SPILL_A160_RESTORE: 2188 case AMDGPU::SI_SPILL_A192_RESTORE: 2189 case AMDGPU::SI_SPILL_A224_RESTORE: 2190 case AMDGPU::SI_SPILL_A256_RESTORE: 2191 case AMDGPU::SI_SPILL_A288_RESTORE: 2192 case AMDGPU::SI_SPILL_A320_RESTORE: 2193 case AMDGPU::SI_SPILL_A352_RESTORE: 2194 case AMDGPU::SI_SPILL_A384_RESTORE: 2195 case AMDGPU::SI_SPILL_A512_RESTORE: 2196 case AMDGPU::SI_SPILL_A1024_RESTORE: 2197 case AMDGPU::SI_SPILL_AV32_RESTORE: 2198 case AMDGPU::SI_SPILL_AV64_RESTORE: 2199 case AMDGPU::SI_SPILL_AV96_RESTORE: 2200 case AMDGPU::SI_SPILL_AV128_RESTORE: 2201 case AMDGPU::SI_SPILL_AV160_RESTORE: 2202 case AMDGPU::SI_SPILL_AV192_RESTORE: 2203 case AMDGPU::SI_SPILL_AV224_RESTORE: 2204 case AMDGPU::SI_SPILL_AV256_RESTORE: 2205 case AMDGPU::SI_SPILL_AV288_RESTORE: 2206 case AMDGPU::SI_SPILL_AV320_RESTORE: 2207 case AMDGPU::SI_SPILL_AV352_RESTORE: 2208 case AMDGPU::SI_SPILL_AV384_RESTORE: 2209 case AMDGPU::SI_SPILL_AV512_RESTORE: 2210 case AMDGPU::SI_SPILL_AV1024_RESTORE: 2211 case AMDGPU::SI_SPILL_WWM_V32_RESTORE: { 2212 const MachineOperand *VData = TII->getNamedOperand(*MI, 2213 AMDGPU::OpName::vdata); 2214 assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() == 2215 MFI->getStackPtrOffsetReg()); 2216 2217 unsigned Opc = ST.enableFlatScratch() ? 
AMDGPU::SCRATCH_LOAD_DWORD_SADDR
2218                              : AMDGPU::BUFFER_LOAD_DWORD_OFFSET;
2219     auto *MBB = MI->getParent();
2220     bool IsWWMRegSpill = TII->isWWMRegSpillOpcode(MI->getOpcode());
2221     if (IsWWMRegSpill) {
2222       TII->insertScratchExecCopy(*MF, *MBB, MI, DL, MFI->getSGPRForEXECCopy(),
2223                                  RS->isRegUsed(AMDGPU::SCC));
2224     }
2225     buildSpillLoadStore(
2226         *MBB, MI, DL, Opc, Index, VData->getReg(), VData->isKill(), FrameReg,
2227         TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(),
2228         *MI->memoperands_begin(), RS);
2229
2230     if (IsWWMRegSpill)
2231       TII->restoreExec(*MF, *MBB, MI, DL, MFI->getSGPRForEXECCopy());
2232
2233     MI->eraseFromParent();
2234     return true;
2235   }
2236
2237   default: {
2238     // Other access to frame index
2239     const DebugLoc &DL = MI->getDebugLoc();
2240
2241     int64_t Offset = FrameInfo.getObjectOffset(Index);
2242     if (ST.enableFlatScratch()) {
2243       if (TII->isFLATScratch(*MI)) {
2244         assert((int16_t)FIOperandNum ==
2245                AMDGPU::getNamedOperandIdx(MI->getOpcode(),
2246                                           AMDGPU::OpName::saddr));
2247
2248         // The offset is always swizzled, just replace it.
2249         if (FrameReg)
2250           FIOp.ChangeToRegister(FrameReg, false);
2251
2252         if (!Offset)
2253           return false;
2254
2255         MachineOperand *OffsetOp =
2256             TII->getNamedOperand(*MI, AMDGPU::OpName::offset);
2257         int64_t NewOffset = Offset + OffsetOp->getImm();
2258         if (TII->isLegalFLATOffset(NewOffset, AMDGPUAS::PRIVATE_ADDRESS,
2259                                    SIInstrFlags::FlatScratch)) {
2260           OffsetOp->setImm(NewOffset);
2261           if (FrameReg)
2262             return false;
2263           Offset = 0;
2264         }
2265
2266         if (!Offset) {
2267           unsigned Opc = MI->getOpcode();
2268           int NewOpc = -1;
2269           if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr)) {
2270             NewOpc = AMDGPU::getFlatScratchInstSVfromSVS(Opc);
2271           } else if (ST.hasFlatScratchSTMode()) {
2272             // On GFX10 we have ST mode to use no registers for an address.
2273             // Otherwise we need to materialize 0 into an SGPR.
2274             NewOpc = AMDGPU::getFlatScratchInstSTfromSS(Opc);
2275           }
2276
2277           if (NewOpc != -1) {
2278             // removeOperand doesn't fix up tied operand indexes as it goes,
2279             // so it asserts. Untie vdst_in for now and retie it afterwards.
2280             int VDstIn = AMDGPU::getNamedOperandIdx(Opc,
2281                                                     AMDGPU::OpName::vdst_in);
2282             bool TiedVDst = VDstIn != -1 &&
2283                             MI->getOperand(VDstIn).isReg() &&
2284                             MI->getOperand(VDstIn).isTied();
2285             if (TiedVDst)
2286               MI->untieRegOperand(VDstIn);
2287
2288             MI->removeOperand(
2289                 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr));
2290
2291             if (TiedVDst) {
2292               int NewVDst =
2293                   AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vdst);
2294               int NewVDstIn =
2295                   AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vdst_in);
2296               assert(NewVDst != -1 && NewVDstIn != -1 && "Must be tied!");
2297               MI->tieOperands(NewVDst, NewVDstIn);
2298             }
2299             MI->setDesc(TII->get(NewOpc));
2300             return false;
2301           }
2302         }
2303       }
2304
2305       if (!FrameReg) {
2306         FIOp.ChangeToImmediate(Offset);
2307         if (TII->isImmOperandLegal(*MI, FIOperandNum, FIOp))
2308           return false;
2309       }
2310
2311       // We need to use a register here. Check if we can use an SGPR or need
2312       // a VGPR.
2313       FIOp.ChangeToRegister(AMDGPU::M0, false);
2314       bool UseSGPR = TII->isOperandLegal(*MI, FIOperandNum, &FIOp);
2315
2316       if (!Offset && FrameReg && UseSGPR) {
2317         FIOp.setReg(FrameReg);
2318         return false;
2319       }
2320
2321       const TargetRegisterClass *RC = UseSGPR ?
&AMDGPU::SReg_32_XM0RegClass 2322 : &AMDGPU::VGPR_32RegClass; 2323 2324 Register TmpReg = 2325 RS->scavengeRegisterBackwards(*RC, MI, false, 0, !UseSGPR); 2326 FIOp.setReg(TmpReg); 2327 FIOp.setIsKill(); 2328 2329 if ((!FrameReg || !Offset) && TmpReg) { 2330 unsigned Opc = UseSGPR ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32; 2331 auto MIB = BuildMI(*MBB, MI, DL, TII->get(Opc), TmpReg); 2332 if (FrameReg) 2333 MIB.addReg(FrameReg); 2334 else 2335 MIB.addImm(Offset); 2336 2337 return false; 2338 } 2339 2340 bool NeedSaveSCC = 2341 RS->isRegUsed(AMDGPU::SCC) && !MI->definesRegister(AMDGPU::SCC); 2342 2343 Register TmpSReg = 2344 UseSGPR ? TmpReg 2345 : RS->scavengeRegisterBackwards(AMDGPU::SReg_32_XM0RegClass, 2346 MI, false, 0, !UseSGPR); 2347 2348 // TODO: for flat scratch another attempt can be made with a VGPR index 2349 // if no SGPRs can be scavenged. 2350 if ((!TmpSReg && !FrameReg) || (!TmpReg && !UseSGPR)) 2351 report_fatal_error("Cannot scavenge register in FI elimination!"); 2352 2353 if (!TmpSReg) { 2354 // Use frame register and restore it after. 2355 TmpSReg = FrameReg; 2356 FIOp.setReg(FrameReg); 2357 FIOp.setIsKill(false); 2358 } 2359 2360 if (NeedSaveSCC) { 2361 assert(!(Offset & 0x1) && "Flat scratch offset must be aligned!"); 2362 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADDC_U32), TmpSReg) 2363 .addReg(FrameReg) 2364 .addImm(Offset); 2365 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_BITCMP1_B32)) 2366 .addReg(TmpSReg) 2367 .addImm(0); 2368 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_BITSET0_B32), TmpSReg) 2369 .addImm(0) 2370 .addReg(TmpSReg); 2371 } else { 2372 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), TmpSReg) 2373 .addReg(FrameReg) 2374 .addImm(Offset); 2375 } 2376 2377 if (!UseSGPR) 2378 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg) 2379 .addReg(TmpSReg, RegState::Kill); 2380 2381 if (TmpSReg == FrameReg) { 2382 // Undo frame register modification. 2383 if (NeedSaveSCC && !MI->registerDefIsDead(AMDGPU::SCC)) { 2384 MachineBasicBlock::iterator I = 2385 BuildMI(*MBB, std::next(MI), DL, TII->get(AMDGPU::S_ADDC_U32), 2386 TmpSReg) 2387 .addReg(FrameReg) 2388 .addImm(-Offset); 2389 I = BuildMI(*MBB, std::next(I), DL, TII->get(AMDGPU::S_BITCMP1_B32)) 2390 .addReg(TmpSReg) 2391 .addImm(0); 2392 BuildMI(*MBB, std::next(I), DL, TII->get(AMDGPU::S_BITSET0_B32), 2393 TmpSReg) 2394 .addImm(0) 2395 .addReg(TmpSReg); 2396 } else { 2397 BuildMI(*MBB, std::next(MI), DL, TII->get(AMDGPU::S_ADD_I32), 2398 FrameReg) 2399 .addReg(FrameReg) 2400 .addImm(-Offset); 2401 } 2402 } 2403 2404 return false; 2405 } 2406 2407 bool IsMUBUF = TII->isMUBUF(*MI); 2408 2409 if (!IsMUBUF && !MFI->isEntryFunction()) { 2410 // Convert to a swizzled stack address by scaling by the wave size. 2411 // In an entry function/kernel the offset is already swizzled. 2412 bool IsSALU = isSGPRClass(TII->getOpRegClass(*MI, FIOperandNum)); 2413 bool LiveSCC = 2414 RS->isRegUsed(AMDGPU::SCC) && !MI->definesRegister(AMDGPU::SCC); 2415 const TargetRegisterClass *RC = IsSALU && !LiveSCC 2416 ? &AMDGPU::SReg_32RegClass 2417 : &AMDGPU::VGPR_32RegClass; 2418 bool IsCopy = MI->getOpcode() == AMDGPU::V_MOV_B32_e32 || 2419 MI->getOpcode() == AMDGPU::V_MOV_B32_e64; 2420 Register ResultReg = 2421 IsCopy ? MI->getOperand(0).getReg() 2422 : RS->scavengeRegisterBackwards(*RC, MI, false, 0); 2423 2424 int64_t Offset = FrameInfo.getObjectOffset(Index); 2425 if (Offset == 0) { 2426 unsigned OpCode = IsSALU && !LiveSCC ? 
AMDGPU::S_LSHR_B32 2427 : AMDGPU::V_LSHRREV_B32_e64; 2428 // XXX - This never happens because of emergency scavenging slot at 0? 2429 auto Shift = BuildMI(*MBB, MI, DL, TII->get(OpCode), ResultReg) 2430 .addImm(ST.getWavefrontSizeLog2()) 2431 .addReg(FrameReg); 2432 if (IsSALU && !LiveSCC) 2433 Shift.getInstr()->getOperand(3).setIsDead(); // Mark SCC as dead. 2434 if (IsSALU && LiveSCC) { 2435 Register NewDest = RS->scavengeRegisterBackwards( 2436 AMDGPU::SReg_32RegClass, Shift, false, 0); 2437 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), 2438 NewDest) 2439 .addReg(ResultReg); 2440 ResultReg = NewDest; 2441 } 2442 } else { 2443 MachineInstrBuilder MIB; 2444 if (!IsSALU) { 2445 if ((MIB = TII->getAddNoCarry(*MBB, MI, DL, ResultReg, *RS)) != 2446 nullptr) { 2447 // Reuse ResultReg in intermediate step. 2448 Register ScaledReg = ResultReg; 2449 2450 BuildMI(*MBB, *MIB, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64), 2451 ScaledReg) 2452 .addImm(ST.getWavefrontSizeLog2()) 2453 .addReg(FrameReg); 2454 2455 const bool IsVOP2 = MIB->getOpcode() == AMDGPU::V_ADD_U32_e32; 2456 2457 // TODO: Fold if use instruction is another add of a constant. 2458 if (IsVOP2 || AMDGPU::isInlinableLiteral32(Offset, ST.hasInv2PiInlineImm())) { 2459 // FIXME: This can fail 2460 MIB.addImm(Offset); 2461 MIB.addReg(ScaledReg, RegState::Kill); 2462 if (!IsVOP2) 2463 MIB.addImm(0); // clamp bit 2464 } else { 2465 assert(MIB->getOpcode() == AMDGPU::V_ADD_CO_U32_e64 && 2466 "Need to reuse carry out register"); 2467 2468 // Use scavenged unused carry out as offset register. 2469 Register ConstOffsetReg; 2470 if (!isWave32) 2471 ConstOffsetReg = getSubReg(MIB.getReg(1), AMDGPU::sub0); 2472 else 2473 ConstOffsetReg = MIB.getReg(1); 2474 2475 BuildMI(*MBB, *MIB, DL, TII->get(AMDGPU::S_MOV_B32), ConstOffsetReg) 2476 .addImm(Offset); 2477 MIB.addReg(ConstOffsetReg, RegState::Kill); 2478 MIB.addReg(ScaledReg, RegState::Kill); 2479 MIB.addImm(0); // clamp bit 2480 } 2481 } 2482 } 2483 if (!MIB || IsSALU) { 2484 // We have to produce a carry out, and there isn't a free SGPR pair 2485 // for it. We can keep the whole computation on the SALU to avoid 2486 // clobbering an additional register at the cost of an extra mov. 2487 2488 // We may have 1 free scratch SGPR even though a carry out is 2489 // unavailable. Only one additional mov is needed. 2490 Register TmpScaledReg = RS->scavengeRegisterBackwards( 2491 AMDGPU::SReg_32_XM0RegClass, MI, false, 0, false); 2492 Register ScaledReg = TmpScaledReg.isValid() ? TmpScaledReg : FrameReg; 2493 2494 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHR_B32), ScaledReg) 2495 .addReg(FrameReg) 2496 .addImm(ST.getWavefrontSizeLog2()); 2497 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), ScaledReg) 2498 .addReg(ScaledReg, RegState::Kill) 2499 .addImm(Offset); 2500 if (!IsSALU) 2501 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), ResultReg) 2502 .addReg(ScaledReg, RegState::Kill); 2503 else 2504 ResultReg = ScaledReg; 2505 2506 // If there were truly no free SGPRs, we need to undo everything. 2507 if (!TmpScaledReg.isValid()) { 2508 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), ScaledReg) 2509 .addReg(ScaledReg, RegState::Kill) 2510 .addImm(-Offset); 2511 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHL_B32), ScaledReg) 2512 .addReg(FrameReg) 2513 .addImm(ST.getWavefrontSizeLog2()); 2514 } 2515 } 2516 } 2517 2518 // Don't introduce an extra copy if we're just materializing in a mov. 
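    // E.g. "$vgpr0 = V_MOV_B32_e32 %stack.0" has already had its result
    // computed into $vgpr0 by the sequence above, so the now-redundant
    // V_MOV is erased instead of being rewritten as a self-copy.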
2519 if (IsCopy) { 2520 MI->eraseFromParent(); 2521 return true; 2522 } 2523 FIOp.ChangeToRegister(ResultReg, false, false, true); 2524 return false; 2525 } 2526 2527 if (IsMUBUF) { 2528 // Disable offen so we don't need a 0 vgpr base. 2529 assert(static_cast<int>(FIOperandNum) == 2530 AMDGPU::getNamedOperandIdx(MI->getOpcode(), 2531 AMDGPU::OpName::vaddr)); 2532 2533 auto &SOffset = *TII->getNamedOperand(*MI, AMDGPU::OpName::soffset); 2534 assert((SOffset.isImm() && SOffset.getImm() == 0)); 2535 2536 if (FrameReg != AMDGPU::NoRegister) 2537 SOffset.ChangeToRegister(FrameReg, false); 2538 2539 int64_t Offset = FrameInfo.getObjectOffset(Index); 2540 int64_t OldImm 2541 = TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(); 2542 int64_t NewOffset = OldImm + Offset; 2543 2544 if (SIInstrInfo::isLegalMUBUFImmOffset(NewOffset) && 2545 buildMUBUFOffsetLoadStore(ST, FrameInfo, MI, Index, NewOffset)) { 2546 MI->eraseFromParent(); 2547 return true; 2548 } 2549 } 2550 2551 // If the offset is simply too big, don't convert to a scratch wave offset 2552 // relative index. 2553 2554 FIOp.ChangeToImmediate(Offset); 2555 if (!TII->isImmOperandLegal(*MI, FIOperandNum, FIOp)) { 2556 Register TmpReg = RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass, 2557 MI, false, 0); 2558 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg) 2559 .addImm(Offset); 2560 FIOp.ChangeToRegister(TmpReg, false, false, true); 2561 } 2562 } 2563 } 2564 return false; 2565 } 2566 2567 StringRef SIRegisterInfo::getRegAsmName(MCRegister Reg) const { 2568 return AMDGPUInstPrinter::getRegisterName(Reg); 2569 } 2570 2571 static const TargetRegisterClass * 2572 getAnyVGPRClassForBitWidth(unsigned BitWidth) { 2573 if (BitWidth == 64) 2574 return &AMDGPU::VReg_64RegClass; 2575 if (BitWidth == 96) 2576 return &AMDGPU::VReg_96RegClass; 2577 if (BitWidth == 128) 2578 return &AMDGPU::VReg_128RegClass; 2579 if (BitWidth == 160) 2580 return &AMDGPU::VReg_160RegClass; 2581 if (BitWidth == 192) 2582 return &AMDGPU::VReg_192RegClass; 2583 if (BitWidth == 224) 2584 return &AMDGPU::VReg_224RegClass; 2585 if (BitWidth == 256) 2586 return &AMDGPU::VReg_256RegClass; 2587 if (BitWidth == 288) 2588 return &AMDGPU::VReg_288RegClass; 2589 if (BitWidth == 320) 2590 return &AMDGPU::VReg_320RegClass; 2591 if (BitWidth == 352) 2592 return &AMDGPU::VReg_352RegClass; 2593 if (BitWidth == 384) 2594 return &AMDGPU::VReg_384RegClass; 2595 if (BitWidth == 512) 2596 return &AMDGPU::VReg_512RegClass; 2597 if (BitWidth == 1024) 2598 return &AMDGPU::VReg_1024RegClass; 2599 2600 return nullptr; 2601 } 2602 2603 static const TargetRegisterClass * 2604 getAlignedVGPRClassForBitWidth(unsigned BitWidth) { 2605 if (BitWidth == 64) 2606 return &AMDGPU::VReg_64_Align2RegClass; 2607 if (BitWidth == 96) 2608 return &AMDGPU::VReg_96_Align2RegClass; 2609 if (BitWidth == 128) 2610 return &AMDGPU::VReg_128_Align2RegClass; 2611 if (BitWidth == 160) 2612 return &AMDGPU::VReg_160_Align2RegClass; 2613 if (BitWidth == 192) 2614 return &AMDGPU::VReg_192_Align2RegClass; 2615 if (BitWidth == 224) 2616 return &AMDGPU::VReg_224_Align2RegClass; 2617 if (BitWidth == 256) 2618 return &AMDGPU::VReg_256_Align2RegClass; 2619 if (BitWidth == 288) 2620 return &AMDGPU::VReg_288_Align2RegClass; 2621 if (BitWidth == 320) 2622 return &AMDGPU::VReg_320_Align2RegClass; 2623 if (BitWidth == 352) 2624 return &AMDGPU::VReg_352_Align2RegClass; 2625 if (BitWidth == 384) 2626 return &AMDGPU::VReg_384_Align2RegClass; 2627 if (BitWidth == 512) 2628 return &AMDGPU::VReg_512_Align2RegClass; 2629 
if (BitWidth == 1024) 2630 return &AMDGPU::VReg_1024_Align2RegClass; 2631 2632 return nullptr; 2633 } 2634 2635 const TargetRegisterClass * 2636 SIRegisterInfo::getVGPRClassForBitWidth(unsigned BitWidth) const { 2637 if (BitWidth == 1) 2638 return &AMDGPU::VReg_1RegClass; 2639 if (BitWidth == 16) 2640 return &AMDGPU::VGPR_LO16RegClass; 2641 if (BitWidth == 32) 2642 return &AMDGPU::VGPR_32RegClass; 2643 return ST.needsAlignedVGPRs() ? getAlignedVGPRClassForBitWidth(BitWidth) 2644 : getAnyVGPRClassForBitWidth(BitWidth); 2645 } 2646 2647 static const TargetRegisterClass * 2648 getAnyAGPRClassForBitWidth(unsigned BitWidth) { 2649 if (BitWidth == 64) 2650 return &AMDGPU::AReg_64RegClass; 2651 if (BitWidth == 96) 2652 return &AMDGPU::AReg_96RegClass; 2653 if (BitWidth == 128) 2654 return &AMDGPU::AReg_128RegClass; 2655 if (BitWidth == 160) 2656 return &AMDGPU::AReg_160RegClass; 2657 if (BitWidth == 192) 2658 return &AMDGPU::AReg_192RegClass; 2659 if (BitWidth == 224) 2660 return &AMDGPU::AReg_224RegClass; 2661 if (BitWidth == 256) 2662 return &AMDGPU::AReg_256RegClass; 2663 if (BitWidth == 288) 2664 return &AMDGPU::AReg_288RegClass; 2665 if (BitWidth == 320) 2666 return &AMDGPU::AReg_320RegClass; 2667 if (BitWidth == 352) 2668 return &AMDGPU::AReg_352RegClass; 2669 if (BitWidth == 384) 2670 return &AMDGPU::AReg_384RegClass; 2671 if (BitWidth == 512) 2672 return &AMDGPU::AReg_512RegClass; 2673 if (BitWidth == 1024) 2674 return &AMDGPU::AReg_1024RegClass; 2675 2676 return nullptr; 2677 } 2678 2679 static const TargetRegisterClass * 2680 getAlignedAGPRClassForBitWidth(unsigned BitWidth) { 2681 if (BitWidth == 64) 2682 return &AMDGPU::AReg_64_Align2RegClass; 2683 if (BitWidth == 96) 2684 return &AMDGPU::AReg_96_Align2RegClass; 2685 if (BitWidth == 128) 2686 return &AMDGPU::AReg_128_Align2RegClass; 2687 if (BitWidth == 160) 2688 return &AMDGPU::AReg_160_Align2RegClass; 2689 if (BitWidth == 192) 2690 return &AMDGPU::AReg_192_Align2RegClass; 2691 if (BitWidth == 224) 2692 return &AMDGPU::AReg_224_Align2RegClass; 2693 if (BitWidth == 256) 2694 return &AMDGPU::AReg_256_Align2RegClass; 2695 if (BitWidth == 288) 2696 return &AMDGPU::AReg_288_Align2RegClass; 2697 if (BitWidth == 320) 2698 return &AMDGPU::AReg_320_Align2RegClass; 2699 if (BitWidth == 352) 2700 return &AMDGPU::AReg_352_Align2RegClass; 2701 if (BitWidth == 384) 2702 return &AMDGPU::AReg_384_Align2RegClass; 2703 if (BitWidth == 512) 2704 return &AMDGPU::AReg_512_Align2RegClass; 2705 if (BitWidth == 1024) 2706 return &AMDGPU::AReg_1024_Align2RegClass; 2707 2708 return nullptr; 2709 } 2710 2711 const TargetRegisterClass * 2712 SIRegisterInfo::getAGPRClassForBitWidth(unsigned BitWidth) const { 2713 if (BitWidth == 16) 2714 return &AMDGPU::AGPR_LO16RegClass; 2715 if (BitWidth == 32) 2716 return &AMDGPU::AGPR_32RegClass; 2717 return ST.needsAlignedVGPRs() ? 
getAlignedAGPRClassForBitWidth(BitWidth) 2718 : getAnyAGPRClassForBitWidth(BitWidth); 2719 } 2720 2721 static const TargetRegisterClass * 2722 getAnyVectorSuperClassForBitWidth(unsigned BitWidth) { 2723 if (BitWidth == 64) 2724 return &AMDGPU::AV_64RegClass; 2725 if (BitWidth == 96) 2726 return &AMDGPU::AV_96RegClass; 2727 if (BitWidth == 128) 2728 return &AMDGPU::AV_128RegClass; 2729 if (BitWidth == 160) 2730 return &AMDGPU::AV_160RegClass; 2731 if (BitWidth == 192) 2732 return &AMDGPU::AV_192RegClass; 2733 if (BitWidth == 224) 2734 return &AMDGPU::AV_224RegClass; 2735 if (BitWidth == 256) 2736 return &AMDGPU::AV_256RegClass; 2737 if (BitWidth == 288) 2738 return &AMDGPU::AV_288RegClass; 2739 if (BitWidth == 320) 2740 return &AMDGPU::AV_320RegClass; 2741 if (BitWidth == 352) 2742 return &AMDGPU::AV_352RegClass; 2743 if (BitWidth == 384) 2744 return &AMDGPU::AV_384RegClass; 2745 if (BitWidth == 512) 2746 return &AMDGPU::AV_512RegClass; 2747 if (BitWidth == 1024) 2748 return &AMDGPU::AV_1024RegClass; 2749 2750 return nullptr; 2751 } 2752 2753 static const TargetRegisterClass * 2754 getAlignedVectorSuperClassForBitWidth(unsigned BitWidth) { 2755 if (BitWidth == 64) 2756 return &AMDGPU::AV_64_Align2RegClass; 2757 if (BitWidth == 96) 2758 return &AMDGPU::AV_96_Align2RegClass; 2759 if (BitWidth == 128) 2760 return &AMDGPU::AV_128_Align2RegClass; 2761 if (BitWidth == 160) 2762 return &AMDGPU::AV_160_Align2RegClass; 2763 if (BitWidth == 192) 2764 return &AMDGPU::AV_192_Align2RegClass; 2765 if (BitWidth == 224) 2766 return &AMDGPU::AV_224_Align2RegClass; 2767 if (BitWidth == 256) 2768 return &AMDGPU::AV_256_Align2RegClass; 2769 if (BitWidth == 288) 2770 return &AMDGPU::AV_288_Align2RegClass; 2771 if (BitWidth == 320) 2772 return &AMDGPU::AV_320_Align2RegClass; 2773 if (BitWidth == 352) 2774 return &AMDGPU::AV_352_Align2RegClass; 2775 if (BitWidth == 384) 2776 return &AMDGPU::AV_384_Align2RegClass; 2777 if (BitWidth == 512) 2778 return &AMDGPU::AV_512_Align2RegClass; 2779 if (BitWidth == 1024) 2780 return &AMDGPU::AV_1024_Align2RegClass; 2781 2782 return nullptr; 2783 } 2784 2785 const TargetRegisterClass * 2786 SIRegisterInfo::getVectorSuperClassForBitWidth(unsigned BitWidth) const { 2787 if (BitWidth == 16) 2788 return &AMDGPU::VGPR_LO16RegClass; 2789 if (BitWidth == 32) 2790 return &AMDGPU::AV_32RegClass; 2791 return ST.needsAlignedVGPRs() 2792 ? 
getAlignedVectorSuperClassForBitWidth(BitWidth) 2793 : getAnyVectorSuperClassForBitWidth(BitWidth); 2794 } 2795 2796 const TargetRegisterClass * 2797 SIRegisterInfo::getSGPRClassForBitWidth(unsigned BitWidth) { 2798 if (BitWidth == 16) 2799 return &AMDGPU::SGPR_LO16RegClass; 2800 if (BitWidth == 32) 2801 return &AMDGPU::SReg_32RegClass; 2802 if (BitWidth == 64) 2803 return &AMDGPU::SReg_64RegClass; 2804 if (BitWidth == 96) 2805 return &AMDGPU::SGPR_96RegClass; 2806 if (BitWidth == 128) 2807 return &AMDGPU::SGPR_128RegClass; 2808 if (BitWidth == 160) 2809 return &AMDGPU::SGPR_160RegClass; 2810 if (BitWidth == 192) 2811 return &AMDGPU::SGPR_192RegClass; 2812 if (BitWidth == 224) 2813 return &AMDGPU::SGPR_224RegClass; 2814 if (BitWidth == 256) 2815 return &AMDGPU::SGPR_256RegClass; 2816 if (BitWidth == 288) 2817 return &AMDGPU::SGPR_288RegClass; 2818 if (BitWidth == 320) 2819 return &AMDGPU::SGPR_320RegClass; 2820 if (BitWidth == 352) 2821 return &AMDGPU::SGPR_352RegClass; 2822 if (BitWidth == 384) 2823 return &AMDGPU::SGPR_384RegClass; 2824 if (BitWidth == 512) 2825 return &AMDGPU::SGPR_512RegClass; 2826 if (BitWidth == 1024) 2827 return &AMDGPU::SGPR_1024RegClass; 2828 2829 return nullptr; 2830 } 2831 2832 bool SIRegisterInfo::isSGPRReg(const MachineRegisterInfo &MRI, 2833 Register Reg) const { 2834 const TargetRegisterClass *RC; 2835 if (Reg.isVirtual()) 2836 RC = MRI.getRegClass(Reg); 2837 else 2838 RC = getPhysRegBaseClass(Reg); 2839 return RC ? isSGPRClass(RC) : false; 2840 } 2841 2842 const TargetRegisterClass * 2843 SIRegisterInfo::getEquivalentVGPRClass(const TargetRegisterClass *SRC) const { 2844 unsigned Size = getRegSizeInBits(*SRC); 2845 const TargetRegisterClass *VRC = getVGPRClassForBitWidth(Size); 2846 assert(VRC && "Invalid register class size"); 2847 return VRC; 2848 } 2849 2850 const TargetRegisterClass * 2851 SIRegisterInfo::getEquivalentAGPRClass(const TargetRegisterClass *SRC) const { 2852 unsigned Size = getRegSizeInBits(*SRC); 2853 const TargetRegisterClass *ARC = getAGPRClassForBitWidth(Size); 2854 assert(ARC && "Invalid register class size"); 2855 return ARC; 2856 } 2857 2858 const TargetRegisterClass * 2859 SIRegisterInfo::getEquivalentSGPRClass(const TargetRegisterClass *VRC) const { 2860 unsigned Size = getRegSizeInBits(*VRC); 2861 if (Size == 32) 2862 return &AMDGPU::SGPR_32RegClass; 2863 const TargetRegisterClass *SRC = getSGPRClassForBitWidth(Size); 2864 assert(SRC && "Invalid register class size"); 2865 return SRC; 2866 } 2867 2868 const TargetRegisterClass * 2869 SIRegisterInfo::getCompatibleSubRegClass(const TargetRegisterClass *SuperRC, 2870 const TargetRegisterClass *SubRC, 2871 unsigned SubIdx) const { 2872 // Ensure this subregister index is aligned in the super register. 2873 const TargetRegisterClass *MatchRC = 2874 getMatchingSuperRegClass(SuperRC, SubRC, SubIdx); 2875 return MatchRC && MatchRC->hasSubClassEq(SuperRC) ? 
MatchRC : nullptr;
2876 }
2877
2878 bool SIRegisterInfo::opCanUseInlineConstant(unsigned OpType) const {
2879   if (OpType >= AMDGPU::OPERAND_REG_INLINE_AC_FIRST &&
2880       OpType <= AMDGPU::OPERAND_REG_INLINE_AC_LAST)
2881     return !ST.hasMFMAInlineLiteralBug();
2882
2883   return OpType >= AMDGPU::OPERAND_SRC_FIRST &&
2884          OpType <= AMDGPU::OPERAND_SRC_LAST;
2885 }
2886
2887 bool SIRegisterInfo::shouldRewriteCopySrc(
2888     const TargetRegisterClass *DefRC,
2889     unsigned DefSubReg,
2890     const TargetRegisterClass *SrcRC,
2891     unsigned SrcSubReg) const {
2892   // We want to prefer the smallest register class possible, so we don't want
2893   // to stop and rewrite on anything that looks like a subregister
2894   // extract. Operations mostly don't care about the super register class, so
2895   // we only want to stop on the most basic of copies between the same
2896   // register class.
2897   //
2898   // e.g. if we have something like
2899   // %0 = ...
2900   // %1 = ...
2901   // %2 = REG_SEQUENCE %0, sub0, %1, sub1
2902   // %3 = COPY %2.sub0
2903   //
2904   // We want to look through the COPY to find:
2905   //  => %3 = COPY %0
2906
2907   // Plain copy.
2908   return getCommonSubClass(DefRC, SrcRC) != nullptr;
2909 }
2910
2911 bool SIRegisterInfo::opCanUseLiteralConstant(unsigned OpType) const {
2912   // TODO: 64-bit operands have extending behavior from 32-bit literal.
2913   return OpType >= AMDGPU::OPERAND_REG_IMM_FIRST &&
2914          OpType <= AMDGPU::OPERAND_REG_IMM_LAST;
2915 }
2916
2917 /// Returns the lowest register that is not used at any point in the function.
2918 /// If all registers are used, then this function will return
2919 /// AMDGPU::NoRegister. If \p ReserveHighestRegister = true, then return the
2920 /// highest unused register.
2921 MCRegister SIRegisterInfo::findUnusedRegister(
2922     const MachineRegisterInfo &MRI, const TargetRegisterClass *RC,
2923     const MachineFunction &MF, bool ReserveHighestRegister) const {
2924   if (ReserveHighestRegister) {
2925     for (MCRegister Reg : reverse(*RC))
2926       if (MRI.isAllocatable(Reg) && !MRI.isPhysRegUsed(Reg))
2927         return Reg;
2928   } else {
2929     for (MCRegister Reg : *RC)
2930       if (MRI.isAllocatable(Reg) && !MRI.isPhysRegUsed(Reg))
2931         return Reg;
2932   }
2933   return MCRegister();
2934 }
2935
2936 bool SIRegisterInfo::isUniformReg(const MachineRegisterInfo &MRI,
2937                                   const RegisterBankInfo &RBI,
2938                                   Register Reg) const {
2939   auto *RB = RBI.getRegBank(Reg, MRI, *MRI.getTargetRegisterInfo());
2940   if (!RB)
2941     return false;
2942
2943   return !RBI.isDivergentRegBank(RB);
2944 }
2945
2946 ArrayRef<int16_t> SIRegisterInfo::getRegSplitParts(const TargetRegisterClass *RC,
2947                                                    unsigned EltSize) const {
2948   const unsigned RegBitWidth = AMDGPU::getRegBitWidth(*RC);
2949   assert(RegBitWidth >= 32 && RegBitWidth <= 1024);
2950
2951   const unsigned RegDWORDs = RegBitWidth / 32;
2952   const unsigned EltDWORDs = EltSize / 4;
2953   assert(RegSplitParts.size() + 1 >= EltDWORDs);
2954
2955   const std::vector<int16_t> &Parts = RegSplitParts[EltDWORDs - 1];
2956   const unsigned NumParts = RegDWORDs / EltDWORDs;
2957
2958   return ArrayRef(Parts.data(), NumParts);
2959 }
2960
2961 const TargetRegisterClass*
2962 SIRegisterInfo::getRegClassForReg(const MachineRegisterInfo &MRI,
2963                                   Register Reg) const {
2964   return Reg.isVirtual() ?
MRI.getRegClass(Reg) : getPhysRegBaseClass(Reg);
2965 }
2966
2967 const TargetRegisterClass *
2968 SIRegisterInfo::getRegClassForOperandReg(const MachineRegisterInfo &MRI,
2969                                          const MachineOperand &MO) const {
2970   const TargetRegisterClass *SrcRC = getRegClassForReg(MRI, MO.getReg());
2971   return getSubRegisterClass(SrcRC, MO.getSubReg());
2972 }
2973
2974 bool SIRegisterInfo::isVGPR(const MachineRegisterInfo &MRI,
2975                             Register Reg) const {
2976   const TargetRegisterClass *RC = getRegClassForReg(MRI, Reg);
2977   // Registers without classes are unaddressable, SGPR-like registers.
2978   return RC && isVGPRClass(RC);
2979 }
2980
2981 bool SIRegisterInfo::isAGPR(const MachineRegisterInfo &MRI,
2982                             Register Reg) const {
2983   const TargetRegisterClass *RC = getRegClassForReg(MRI, Reg);
2984
2985   // Registers without classes are unaddressable, SGPR-like registers.
2986   return RC && isAGPRClass(RC);
2987 }
2988
2989 bool SIRegisterInfo::shouldCoalesce(MachineInstr *MI,
2990                                     const TargetRegisterClass *SrcRC,
2991                                     unsigned SubReg,
2992                                     const TargetRegisterClass *DstRC,
2993                                     unsigned DstSubReg,
2994                                     const TargetRegisterClass *NewRC,
2995                                     LiveIntervals &LIS) const {
2996   unsigned SrcSize = getRegSizeInBits(*SrcRC);
2997   unsigned DstSize = getRegSizeInBits(*DstRC);
2998   unsigned NewSize = getRegSizeInBits(*NewRC);
2999
3000   // Do not increase the size of registers beyond a dword; we would need to
3001   // allocate adjacent registers and constrain regalloc more than needed.
3002
3003   // Always allow dword coalescing.
3004   if (SrcSize <= 32 || DstSize <= 32)
3005     return true;
3006
3007   return NewSize <= DstSize || NewSize <= SrcSize;
3008 }
3009
3010 unsigned SIRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
3011                                              MachineFunction &MF) const {
3012   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
3013
3014   unsigned Occupancy = ST.getOccupancyWithLocalMemSize(MFI->getLDSSize(),
3015                                                        MF.getFunction());
3016   switch (RC->getID()) {
3017   default:
3018     return AMDGPUGenRegisterInfo::getRegPressureLimit(RC, MF);
3019   case AMDGPU::VGPR_32RegClassID:
3020   case AMDGPU::VGPR_LO16RegClassID:
3021   case AMDGPU::VGPR_HI16RegClassID:
3022     return std::min(ST.getMaxNumVGPRs(Occupancy), ST.getMaxNumVGPRs(MF));
3023   case AMDGPU::SGPR_32RegClassID:
3024   case AMDGPU::SGPR_LO16RegClassID:
3025     return std::min(ST.getMaxNumSGPRs(Occupancy, true), ST.getMaxNumSGPRs(MF));
3026   }
3027 }
3028
3029 unsigned SIRegisterInfo::getRegPressureSetLimit(const MachineFunction &MF,
3030                                                 unsigned Idx) const {
3031   if (Idx == AMDGPU::RegisterPressureSets::VGPR_32 ||
3032       Idx == AMDGPU::RegisterPressureSets::AGPR_32)
3033     return getRegPressureLimit(&AMDGPU::VGPR_32RegClass,
3034                                const_cast<MachineFunction &>(MF));
3035
3036   if (Idx == AMDGPU::RegisterPressureSets::SReg_32)
3037     return getRegPressureLimit(&AMDGPU::SGPR_32RegClass,
3038                                const_cast<MachineFunction &>(MF));
3039
3040   llvm_unreachable("Unexpected register pressure set!");
3041 }
3042
3043 const int *SIRegisterInfo::getRegUnitPressureSets(unsigned RegUnit) const {
3044   static const int Empty[] = { -1 };
3045
3046   if (RegPressureIgnoredUnits[RegUnit])
3047     return Empty;
3048
3049   return AMDGPUGenRegisterInfo::getRegUnitPressureSets(RegUnit);
3050 }
3051
3052 MCRegister SIRegisterInfo::getReturnAddressReg(const MachineFunction &MF) const {
3053   // Not a callee-saved register.
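  // Per the AMDGPU calling convention, the return address lives in s[30:31]:
  // the caller's s_swappc_b64 writes it, and a callee that clobbers the pair
  // must restore it before its terminating s_setpc_b64.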
3054 return AMDGPU::SGPR30_SGPR31; 3055 } 3056 3057 const TargetRegisterClass * 3058 SIRegisterInfo::getRegClassForSizeOnBank(unsigned Size, 3059 const RegisterBank &RB) const { 3060 switch (RB.getID()) { 3061 case AMDGPU::VGPRRegBankID: 3062 return getVGPRClassForBitWidth(std::max(32u, Size)); 3063 case AMDGPU::VCCRegBankID: 3064 assert(Size == 1); 3065 return isWave32 ? &AMDGPU::SReg_32_XM0_XEXECRegClass 3066 : &AMDGPU::SReg_64_XEXECRegClass; 3067 case AMDGPU::SGPRRegBankID: 3068 return getSGPRClassForBitWidth(std::max(32u, Size)); 3069 case AMDGPU::AGPRRegBankID: 3070 return getAGPRClassForBitWidth(std::max(32u, Size)); 3071 default: 3072 llvm_unreachable("unknown register bank"); 3073 } 3074 } 3075 3076 const TargetRegisterClass * 3077 SIRegisterInfo::getConstrainedRegClassForOperand(const MachineOperand &MO, 3078 const MachineRegisterInfo &MRI) const { 3079 const RegClassOrRegBank &RCOrRB = MRI.getRegClassOrRegBank(MO.getReg()); 3080 if (const RegisterBank *RB = RCOrRB.dyn_cast<const RegisterBank*>()) 3081 return getRegClassForTypeOnBank(MRI.getType(MO.getReg()), *RB); 3082 3083 if (const auto *RC = RCOrRB.dyn_cast<const TargetRegisterClass *>()) 3084 return getAllocatableClass(RC); 3085 3086 return nullptr; 3087 } 3088 3089 MCRegister SIRegisterInfo::getVCC() const { 3090 return isWave32 ? AMDGPU::VCC_LO : AMDGPU::VCC; 3091 } 3092 3093 MCRegister SIRegisterInfo::getExec() const { 3094 return isWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC; 3095 } 3096 3097 const TargetRegisterClass *SIRegisterInfo::getVGPR64Class() const { 3098 // VGPR tuples have an alignment requirement on gfx90a variants. 3099 return ST.needsAlignedVGPRs() ? &AMDGPU::VReg_64_Align2RegClass 3100 : &AMDGPU::VReg_64RegClass; 3101 } 3102 3103 const TargetRegisterClass * 3104 SIRegisterInfo::getRegClass(unsigned RCID) const { 3105 switch ((int)RCID) { 3106 case AMDGPU::SReg_1RegClassID: 3107 return getBoolRC(); 3108 case AMDGPU::SReg_1_XEXECRegClassID: 3109 return isWave32 ? &AMDGPU::SReg_32_XM0_XEXECRegClass 3110 : &AMDGPU::SReg_64_XEXECRegClass; 3111 case -1: 3112 return nullptr; 3113 default: 3114 return AMDGPUGenRegisterInfo::getRegClass(RCID); 3115 } 3116 } 3117 3118 // Find reaching register definition 3119 MachineInstr *SIRegisterInfo::findReachingDef(Register Reg, unsigned SubReg, 3120 MachineInstr &Use, 3121 MachineRegisterInfo &MRI, 3122 LiveIntervals *LIS) const { 3123 auto &MDT = LIS->getAnalysis<MachineDominatorTree>(); 3124 SlotIndex UseIdx = LIS->getInstructionIndex(Use); 3125 SlotIndex DefIdx; 3126 3127 if (Reg.isVirtual()) { 3128 if (!LIS->hasInterval(Reg)) 3129 return nullptr; 3130 LiveInterval &LI = LIS->getInterval(Reg); 3131 LaneBitmask SubLanes = SubReg ? getSubRegIndexLaneMask(SubReg) 3132 : MRI.getMaxLaneMaskForVReg(Reg); 3133 VNInfo *V = nullptr; 3134 if (LI.hasSubRanges()) { 3135 for (auto &S : LI.subranges()) { 3136 if ((S.LaneMask & SubLanes) == SubLanes) { 3137 V = S.getVNInfoAt(UseIdx); 3138 break; 3139 } 3140 } 3141 } else { 3142 V = LI.getVNInfoAt(UseIdx); 3143 } 3144 if (!V) 3145 return nullptr; 3146 DefIdx = V->def; 3147 } else { 3148 // Find last def. 
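    // For a physical register, take the latest value number live at UseIdx
    // across all of the register's units; if any unit has no live value
    // there, no single reaching def exists.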
3149 for (MCRegUnit Unit : regunits(Reg.asMCReg())) { 3150 LiveRange &LR = LIS->getRegUnit(Unit); 3151 if (VNInfo *V = LR.getVNInfoAt(UseIdx)) { 3152 if (!DefIdx.isValid() || 3153 MDT.dominates(LIS->getInstructionFromIndex(DefIdx), 3154 LIS->getInstructionFromIndex(V->def))) 3155 DefIdx = V->def; 3156 } else { 3157 return nullptr; 3158 } 3159 } 3160 } 3161 3162 MachineInstr *Def = LIS->getInstructionFromIndex(DefIdx); 3163 3164 if (!Def || !MDT.dominates(Def, &Use)) 3165 return nullptr; 3166 3167 assert(Def->modifiesRegister(Reg, this)); 3168 3169 return Def; 3170 } 3171 3172 MCPhysReg SIRegisterInfo::get32BitRegister(MCPhysReg Reg) const { 3173 assert(getRegSizeInBits(*getPhysRegBaseClass(Reg)) <= 32); 3174 3175 for (const TargetRegisterClass &RC : { AMDGPU::VGPR_32RegClass, 3176 AMDGPU::SReg_32RegClass, 3177 AMDGPU::AGPR_32RegClass } ) { 3178 if (MCPhysReg Super = getMatchingSuperReg(Reg, AMDGPU::lo16, &RC)) 3179 return Super; 3180 } 3181 if (MCPhysReg Super = getMatchingSuperReg(Reg, AMDGPU::hi16, 3182 &AMDGPU::VGPR_32RegClass)) { 3183 return Super; 3184 } 3185 3186 return AMDGPU::NoRegister; 3187 } 3188 3189 bool SIRegisterInfo::isProperlyAlignedRC(const TargetRegisterClass &RC) const { 3190 if (!ST.needsAlignedVGPRs()) 3191 return true; 3192 3193 if (isVGPRClass(&RC)) 3194 return RC.hasSuperClassEq(getVGPRClassForBitWidth(getRegSizeInBits(RC))); 3195 if (isAGPRClass(&RC)) 3196 return RC.hasSuperClassEq(getAGPRClassForBitWidth(getRegSizeInBits(RC))); 3197 if (isVectorSuperClass(&RC)) 3198 return RC.hasSuperClassEq( 3199 getVectorSuperClassForBitWidth(getRegSizeInBits(RC))); 3200 3201 return true; 3202 } 3203 3204 const TargetRegisterClass * 3205 SIRegisterInfo::getProperlyAlignedRC(const TargetRegisterClass *RC) const { 3206 if (!RC || !ST.needsAlignedVGPRs()) 3207 return RC; 3208 3209 unsigned Size = getRegSizeInBits(*RC); 3210 if (Size <= 32) 3211 return RC; 3212 3213 if (isVGPRClass(RC)) 3214 return getAlignedVGPRClassForBitWidth(Size); 3215 if (isAGPRClass(RC)) 3216 return getAlignedAGPRClassForBitWidth(Size); 3217 if (isVectorSuperClass(RC)) 3218 return getAlignedVectorSuperClassForBitWidth(Size); 3219 3220 return RC; 3221 } 3222 3223 ArrayRef<MCPhysReg> 3224 SIRegisterInfo::getAllSGPR128(const MachineFunction &MF) const { 3225 return ArrayRef(AMDGPU::SGPR_128RegClass.begin(), ST.getMaxNumSGPRs(MF) / 4); 3226 } 3227 3228 ArrayRef<MCPhysReg> 3229 SIRegisterInfo::getAllSGPR64(const MachineFunction &MF) const { 3230 return ArrayRef(AMDGPU::SGPR_64RegClass.begin(), ST.getMaxNumSGPRs(MF) / 2); 3231 } 3232 3233 ArrayRef<MCPhysReg> 3234 SIRegisterInfo::getAllSGPR32(const MachineFunction &MF) const { 3235 return ArrayRef(AMDGPU::SGPR_32RegClass.begin(), ST.getMaxNumSGPRs(MF)); 3236 } 3237 3238 unsigned 3239 SIRegisterInfo::getSubRegAlignmentNumBits(const TargetRegisterClass *RC, 3240 unsigned SubReg) const { 3241 switch (RC->TSFlags & SIRCFlags::RegKindMask) { 3242 case SIRCFlags::HasSGPR: 3243 return std::min(128u, getSubRegIdxSize(SubReg)); 3244 case SIRCFlags::HasAGPR: 3245 case SIRCFlags::HasVGPR: 3246 case SIRCFlags::HasVGPR | SIRCFlags::HasAGPR: 3247 return std::min(32u, getSubRegIdxSize(SubReg)); 3248 default: 3249 break; 3250 } 3251 return 0; 3252 } 3253
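// Example for getSubRegAlignmentNumBits() above (a sketch): a 64-bit subreg
// such as sub0_sub1 of a VGPR tuple yields min(32, 64) = 32, i.e. only
// 32-bit alignment is required, while a 256-bit SGPR subreg is capped at
// the 128-bit maximum.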