//===-- SIRegisterInfo.cpp - SI Register Information ---------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// SI implementation of the TargetRegisterInfo class.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPURegisterBankInfo.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUInstPrinter.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/LivePhysRegs.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/RegisterScavenging.h"

using namespace llvm;

#define GET_REGINFO_TARGET_DESC
#include "AMDGPUGenRegisterInfo.inc"

static cl::opt<bool> EnableSpillSGPRToVGPR(
    "amdgpu-spill-sgpr-to-vgpr",
    cl::desc("Enable spilling SGPRs to VGPRs"),
    cl::ReallyHidden,
    cl::init(true));

std::array<std::vector<int16_t>, 16> SIRegisterInfo::RegSplitParts;
std::array<std::array<uint16_t, 32>, 9> SIRegisterInfo::SubRegFromChannelTable;

// Map numbers of DWORDs to indexes in SubRegFromChannelTable.
// Valid indexes are shifted 1, such that a 0 mapping means unsupported.
// e.g. for 8 DWORDs (256-bit), SubRegFromChannelTableWidthMap[8] = 8,
// meaning index 7 in SubRegFromChannelTable.
static const std::array<unsigned, 17> SubRegFromChannelTableWidthMap = {
    0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 0, 0, 0, 0, 0, 0, 9};

namespace llvm {

// A temporary struct to spill SGPRs.
// This is mostly to spill SGPRs to memory. Spilling SGPRs into VGPR lanes
// emits just v_writelane and v_readlane.
//
// When spilling to memory, the SGPRs are written into VGPR lanes and the VGPR
// is saved to scratch (or the other way around for loads).
// For this, a VGPR is required where the needed lanes can be clobbered. The
// RegScavenger can provide a VGPR where currently active lanes can be
// clobbered, but we still need to save inactive lanes.
// The high-level steps are:
// - Try to scavenge SGPR(s) to save exec
// - Try to scavenge VGPR
// - Save needed, all or inactive lanes of a TmpVGPR
// - Spill/Restore SGPRs using TmpVGPR
// - Restore TmpVGPR
//
// To save all lanes of TmpVGPR, exec needs to be saved and modified. If we
// cannot scavenge temporary SGPRs to save exec, we use the following code:
// buffer_store_dword TmpVGPR ; only if active lanes need to be saved
// s_not exec, exec
// buffer_store_dword TmpVGPR ; save inactive lanes
// s_not exec, exec
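// As a sketch of the cheap (non-memory) path: spilling s[4:5] into lanes 0
// and 1 of an example VGPR, say v63, is just
//   v_writelane_b32 v63, s4, 0
//   v_writelane_b32 v63, s5, 1
// and the restore is the matching pair of v_readlane_b32 instructions.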
struct SGPRSpillBuilder {
  struct PerVGPRData {
    unsigned PerVGPR;
    unsigned NumVGPRs;
    int64_t VGPRLanes;
  };

  // The SGPR to save
  Register SuperReg;
  MachineBasicBlock::iterator MI;
  ArrayRef<int16_t> SplitParts;
  unsigned NumSubRegs;
  bool IsKill;
  const DebugLoc &DL;

  /* When spilling to stack */
  // The SGPRs are written into this VGPR, which is then written to scratch
  // (or vice versa for loads).
  Register TmpVGPR = AMDGPU::NoRegister;
  // Temporary spill slot to save TmpVGPR to.
  int TmpVGPRIndex = 0;
  // True if TmpVGPR carries a live value before the spill (i.e. it could not
  // be scavenged and its contents must be saved and restored).
  bool TmpVGPRLive = false;
  // Scavenged SGPR to save EXEC.
  Register SavedExecReg = AMDGPU::NoRegister;
  // Stack index to write the SGPRs to.
  int Index;
  unsigned EltSize = 4;

  RegScavenger *RS;
  MachineBasicBlock *MBB;
  MachineFunction &MF;
  SIMachineFunctionInfo &MFI;
  const SIInstrInfo &TII;
  const SIRegisterInfo &TRI;
  bool IsWave32;
  Register ExecReg;
  unsigned MovOpc;
  unsigned NotOpc;

  SGPRSpillBuilder(const SIRegisterInfo &TRI, const SIInstrInfo &TII,
                   bool IsWave32, MachineBasicBlock::iterator MI, int Index,
                   RegScavenger *RS)
      : SGPRSpillBuilder(TRI, TII, IsWave32, MI, MI->getOperand(0).getReg(),
                         MI->getOperand(0).isKill(), Index, RS) {}

  SGPRSpillBuilder(const SIRegisterInfo &TRI, const SIInstrInfo &TII,
                   bool IsWave32, MachineBasicBlock::iterator MI, Register Reg,
                   bool IsKill, int Index, RegScavenger *RS)
      : SuperReg(Reg), MI(MI), IsKill(IsKill), DL(MI->getDebugLoc()),
        Index(Index), RS(RS), MBB(MI->getParent()), MF(*MBB->getParent()),
        MFI(*MF.getInfo<SIMachineFunctionInfo>()), TII(TII), TRI(TRI),
        IsWave32(IsWave32) {
    const TargetRegisterClass *RC = TRI.getPhysRegBaseClass(SuperReg);
    SplitParts = TRI.getRegSplitParts(RC, EltSize);
    NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size();

    if (IsWave32) {
      ExecReg = AMDGPU::EXEC_LO;
      MovOpc = AMDGPU::S_MOV_B32;
      NotOpc = AMDGPU::S_NOT_B32;
    } else {
      ExecReg = AMDGPU::EXEC;
      MovOpc = AMDGPU::S_MOV_B64;
      NotOpc = AMDGPU::S_NOT_B64;
    }

    assert(SuperReg != AMDGPU::M0 && "m0 should never spill");
    assert(SuperReg != AMDGPU::EXEC_LO && SuperReg != AMDGPU::EXEC_HI &&
           SuperReg != AMDGPU::EXEC && "exec should never spill");
  }

  PerVGPRData getPerVGPRData() {
    PerVGPRData Data;
    Data.PerVGPR = IsWave32 ? 32 : 64;
    Data.NumVGPRs = (NumSubRegs + (Data.PerVGPR - 1)) / Data.PerVGPR;
    Data.VGPRLanes = (1LL << std::min(Data.PerVGPR, NumSubRegs)) - 1LL;
    return Data;
  }
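  // Worked example, assuming wave64: spilling a 128-bit SGPR tuple
  // (NumSubRegs = 4) yields PerVGPR = 64, NumVGPRs = 1 and
  // VGPRLanes = (1 << 4) - 1 = 0xf, i.e. lanes 0-3 of a single temporary
  // VGPR hold the four 32-bit parts.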
  // Tries to scavenge SGPRs to save EXEC and a VGPR. Uses v0 if no VGPR is
  // free.
  // Writes these instructions if an SGPR can be scavenged:
  // s_mov_b64 s[6:7], exec   ; Save exec
  // s_mov_b64 exec, 3        ; Wanted lanemask
  // buffer_store_dword v1    ; Write scavenged VGPR to emergency slot
  //
  // Writes these instructions if no SGPR can be scavenged:
  // buffer_store_dword v0    ; Only if no free VGPR was found
  // s_not_b64 exec, exec
  // buffer_store_dword v0    ; Save inactive lanes
  //                          ; exec stays inverted, it is flipped back in
  //                          ; restore.
  void prepare() {
    // Scavenged temporary VGPR to use. It must be scavenged once for any
    // number of spilled subregs.
    // FIXME: The liveness analysis is limited and does not tell if a register
    // is in use in lanes that are currently inactive. We can never be sure if
    // a register is actually in use in another lane, so we need to save all
    // used lanes of the chosen VGPR.
    assert(RS && "Cannot spill SGPR to memory without RegScavenger");
    TmpVGPR = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0, false);

    // Reserve temporary stack slot
    TmpVGPRIndex = MFI.getScavengeFI(MF.getFrameInfo(), TRI);
    if (TmpVGPR) {
      // Found a register that is dead in the currently active lanes; we only
      // need to spill inactive lanes.
      TmpVGPRLive = false;
    } else {
      // Pick v0 because it doesn't make a difference.
      TmpVGPR = AMDGPU::VGPR0;
      TmpVGPRLive = true;
    }

    if (TmpVGPRLive) {
      // We need to inform the scavenger that this index is already in use
      // until we're done with the custom emergency spill.
      RS->assignRegToScavengingIndex(TmpVGPRIndex, TmpVGPR);
    }

    // We may end up recursively calling the scavenger, and don't want to
    // re-use the same register.
    RS->setRegUsed(TmpVGPR);

    // Try to scavenge SGPRs to save exec
    assert(!SavedExecReg && "Exec is already saved, refuse to save again");
    const TargetRegisterClass &RC =
        IsWave32 ? AMDGPU::SGPR_32RegClass : AMDGPU::SGPR_64RegClass;
    RS->setRegUsed(SuperReg);
    SavedExecReg = RS->scavengeRegister(&RC, MI, 0, false);

    int64_t VGPRLanes = getPerVGPRData().VGPRLanes;

    if (SavedExecReg) {
      RS->setRegUsed(SavedExecReg);
      // Set exec to needed lanes
      BuildMI(*MBB, MI, DL, TII.get(MovOpc), SavedExecReg).addReg(ExecReg);
      auto I =
          BuildMI(*MBB, MI, DL, TII.get(MovOpc), ExecReg).addImm(VGPRLanes);
      if (!TmpVGPRLive)
        I.addReg(TmpVGPR, RegState::ImplicitDefine);
      // Spill needed lanes
      TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ false);
    } else {
      // The modify and restore of exec clobber SCC, which we would have to
      // save and restore. FIXME: We probably would need to reserve a register
      // for this.
      if (RS->isRegUsed(AMDGPU::SCC))
        MI->emitError("unhandled SGPR spill to memory");

      // Spill active lanes
      if (TmpVGPRLive)
        TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ false,
                                    /*IsKill*/ false);
      // Spill inactive lanes
      auto I = BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg);
      if (!TmpVGPRLive)
        I.addReg(TmpVGPR, RegState::ImplicitDefine);
      I->getOperand(2).setIsDead(); // Mark SCC as dead.
      TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ false);
    }
  }
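  // Note that in the no-SGPR case exec deliberately stays inverted between
  // prepare() and restore(). This is harmless for the SGPR transfer itself,
  // since v_writelane/v_readlane address lanes directly rather than through
  // the exec mask.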
  // Writes these instructions if an SGPR can be scavenged:
  // buffer_load_dword v1     ; Reload scavenged VGPR from emergency slot
  // s_waitcnt vmcnt(0)       ; If a free VGPR was found
  // s_mov_b64 exec, s[6:7]   ; Restore exec
  //
  // Writes these instructions if no SGPR can be scavenged:
  // buffer_load_dword v0     ; Restore inactive lanes
  // s_waitcnt vmcnt(0)       ; If a free VGPR was found
  // s_not_b64 exec, exec
  // buffer_load_dword v0     ; Only if no free VGPR was found
  void restore() {
    if (SavedExecReg) {
      // Restore used lanes
      TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ true,
                                  /*IsKill*/ false);
      // Restore exec
      auto I = BuildMI(*MBB, MI, DL, TII.get(MovOpc), ExecReg)
                   .addReg(SavedExecReg, RegState::Kill);
      // Add an implicit use of the load so it is not dead.
      // FIXME: This inserts an unnecessary waitcnt
      if (!TmpVGPRLive) {
        I.addReg(TmpVGPR, RegState::ImplicitKill);
      }
    } else {
      // Restore inactive lanes
      TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ true,
                                  /*IsKill*/ false);
      auto I = BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg);
      if (!TmpVGPRLive)
        I.addReg(TmpVGPR, RegState::ImplicitKill);
      I->getOperand(2).setIsDead(); // Mark SCC as dead.

      // Restore active lanes
      if (TmpVGPRLive)
        TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ true);
    }

    // Inform the scavenger where we're releasing our custom scavenged
    // register.
    if (TmpVGPRLive) {
      MachineBasicBlock::iterator RestorePt = std::prev(MI);
      RS->assignRegToScavengingIndex(TmpVGPRIndex, TmpVGPR, &*RestorePt);
    }
  }

  // Write TmpVGPR to memory or read TmpVGPR from memory.
  // Either using a single buffer_load/store if exec is set to the needed mask
  // or using
  // buffer_load
  // s_not exec, exec
  // buffer_load
  // s_not exec, exec
  void readWriteTmpVGPR(unsigned Offset, bool IsLoad) {
    if (SavedExecReg) {
      // Spill needed lanes
      TRI.buildVGPRSpillLoadStore(*this, Index, Offset, IsLoad);
    } else {
      // The modify and restore of exec clobber SCC, which we would have to
      // save and restore. FIXME: We probably would need to reserve a register
      // for this.
      if (RS->isRegUsed(AMDGPU::SCC))
        MI->emitError("unhandled SGPR spill to memory");

      // Spill active lanes
      TRI.buildVGPRSpillLoadStore(*this, Index, Offset, IsLoad,
                                  /*IsKill*/ false);
      // Spill inactive lanes
      auto Not0 =
          BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg);
      Not0->getOperand(2).setIsDead(); // Mark SCC as dead.
      TRI.buildVGPRSpillLoadStore(*this, Index, Offset, IsLoad);
      auto Not1 =
          BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg);
      Not1->getOperand(2).setIsDead(); // Mark SCC as dead.
    }
  }

  void setMI(MachineBasicBlock *NewMBB, MachineBasicBlock::iterator NewMI) {
    assert(MBB->getParent() == &MF);
    MI = NewMI;
    MBB = NewMBB;
  }
};

} // namespace llvm

SIRegisterInfo::SIRegisterInfo(const GCNSubtarget &ST)
    : AMDGPUGenRegisterInfo(AMDGPU::PC_REG, ST.getAMDGPUDwarfFlavour()), ST(ST),
      SpillSGPRToVGPR(EnableSpillSGPRToVGPR), isWave32(ST.isWave32()) {

  assert(getSubRegIndexLaneMask(AMDGPU::sub0).getAsInteger() == 3 &&
         getSubRegIndexLaneMask(AMDGPU::sub31).getAsInteger() == (3ULL << 62) &&
         (getSubRegIndexLaneMask(AMDGPU::lo16) |
          getSubRegIndexLaneMask(AMDGPU::hi16)).getAsInteger() ==
             getSubRegIndexLaneMask(AMDGPU::sub0).getAsInteger() &&
         "getNumCoveredRegs() will not work with generated subreg masks!");

  RegPressureIgnoredUnits.resize(getNumRegUnits());
  RegPressureIgnoredUnits.set(
      *MCRegUnitIterator(MCRegister::from(AMDGPU::M0), this));
  for (auto Reg : AMDGPU::VGPR_HI16RegClass)
    RegPressureIgnoredUnits.set(*MCRegUnitIterator(Reg, this));

  // HACK: Until this is fully tablegen'd.
  static llvm::once_flag InitializeRegSplitPartsFlag;

  static auto InitializeRegSplitPartsOnce = [this]() {
    for (unsigned Idx = 1, E = getNumSubRegIndices() - 1; Idx < E; ++Idx) {
      unsigned Size = getSubRegIdxSize(Idx);
      if (Size & 31)
        continue;
      std::vector<int16_t> &Vec = RegSplitParts[Size / 32 - 1];
      unsigned Pos = getSubRegIdxOffset(Idx);
      if (Pos % Size)
        continue;
      Pos /= Size;
      if (Vec.empty()) {
        unsigned MaxNumParts = 1024 / Size; // Maximum register is 1024 bits.
        Vec.resize(MaxNumParts);
      }
      Vec[Pos] = Idx;
    }
  };
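  // Roughly, RegSplitParts[N - 1] lists the N-DWORD aligned pieces of a
  // register; e.g. the 64-bit row ends up as { sub0_sub1, sub2_sub3, ... },
  // so a 1024-bit register splits into 16 aligned 64-bit parts.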
  static llvm::once_flag InitializeSubRegFromChannelTableFlag;

  static auto InitializeSubRegFromChannelTableOnce = [this]() {
    for (auto &Row : SubRegFromChannelTable)
      Row.fill(AMDGPU::NoSubRegister);
    for (unsigned Idx = 1; Idx < getNumSubRegIndices(); ++Idx) {
      unsigned Width = AMDGPUSubRegIdxRanges[Idx].Size / 32;
      unsigned Offset = AMDGPUSubRegIdxRanges[Idx].Offset / 32;
      assert(Width < SubRegFromChannelTableWidthMap.size());
      Width = SubRegFromChannelTableWidthMap[Width];
      if (Width == 0)
        continue;
      unsigned TableIdx = Width - 1;
      assert(TableIdx < SubRegFromChannelTable.size());
      assert(Offset < SubRegFromChannelTable[TableIdx].size());
      SubRegFromChannelTable[TableIdx][Offset] = Idx;
    }
  };

  llvm::call_once(InitializeRegSplitPartsFlag, InitializeRegSplitPartsOnce);
  llvm::call_once(InitializeSubRegFromChannelTableFlag,
                  InitializeSubRegFromChannelTableOnce);
}

void SIRegisterInfo::reserveRegisterTuples(BitVector &Reserved,
                                           MCRegister Reg) const {
  MCRegAliasIterator R(Reg, this, true);

  for (; R.isValid(); ++R)
    Reserved.set(*R);
}

// Forced to be here by one .inc
const MCPhysReg *SIRegisterInfo::getCalleeSavedRegs(
    const MachineFunction *MF) const {
  CallingConv::ID CC = MF->getFunction().getCallingConv();
  switch (CC) {
  case CallingConv::C:
  case CallingConv::Fast:
  case CallingConv::Cold:
    return ST.hasGFX90AInsts() ? CSR_AMDGPU_GFX90AInsts_SaveList
                               : CSR_AMDGPU_SaveList;
  case CallingConv::AMDGPU_Gfx:
    return ST.hasGFX90AInsts() ? CSR_AMDGPU_SI_Gfx_GFX90AInsts_SaveList
                               : CSR_AMDGPU_SI_Gfx_SaveList;
  default: {
    // Dummy to not crash RegisterClassInfo.
    static const MCPhysReg NoCalleeSavedReg = AMDGPU::NoRegister;
    return &NoCalleeSavedReg;
  }
  }
}

const MCPhysReg *
SIRegisterInfo::getCalleeSavedRegsViaCopy(const MachineFunction *MF) const {
  return nullptr;
}

const uint32_t *SIRegisterInfo::getCallPreservedMask(const MachineFunction &MF,
                                                     CallingConv::ID CC) const {
  switch (CC) {
  case CallingConv::C:
  case CallingConv::Fast:
  case CallingConv::Cold:
    return ST.hasGFX90AInsts() ? CSR_AMDGPU_GFX90AInsts_RegMask
                               : CSR_AMDGPU_RegMask;
  case CallingConv::AMDGPU_Gfx:
    return ST.hasGFX90AInsts() ? CSR_AMDGPU_SI_Gfx_GFX90AInsts_RegMask
                               : CSR_AMDGPU_SI_Gfx_RegMask;
  default:
    return nullptr;
  }
}

const uint32_t *SIRegisterInfo::getNoPreservedMask() const {
  return CSR_AMDGPU_NoRegs_RegMask;
}

const TargetRegisterClass *
SIRegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC,
                                          const MachineFunction &MF) const {
  // FIXME: Should have a helper function like getEquivalentVGPRClass to get
  // the equivalent AV class. If we used one here, the verifier would crash
  // after RegBankSelect in the GISel flow, because the aligned register
  // classes are not fully settled until instruction selection.
  if (ST.hasMAIInsts() && (isVGPRClass(RC) || isAGPRClass(RC))) {
    if (RC == &AMDGPU::VGPR_32RegClass || RC == &AMDGPU::AGPR_32RegClass)
      return &AMDGPU::AV_32RegClass;
    if (RC == &AMDGPU::VReg_64RegClass || RC == &AMDGPU::AReg_64RegClass)
      return &AMDGPU::AV_64RegClass;
    if (RC == &AMDGPU::VReg_64_Align2RegClass ||
        RC == &AMDGPU::AReg_64_Align2RegClass)
      return &AMDGPU::AV_64_Align2RegClass;
    if (RC == &AMDGPU::VReg_96RegClass || RC == &AMDGPU::AReg_96RegClass)
      return &AMDGPU::AV_96RegClass;
    if (RC == &AMDGPU::VReg_96_Align2RegClass ||
        RC == &AMDGPU::AReg_96_Align2RegClass)
      return &AMDGPU::AV_96_Align2RegClass;
    if (RC == &AMDGPU::VReg_128RegClass || RC == &AMDGPU::AReg_128RegClass)
      return &AMDGPU::AV_128RegClass;
    if (RC == &AMDGPU::VReg_128_Align2RegClass ||
        RC == &AMDGPU::AReg_128_Align2RegClass)
      return &AMDGPU::AV_128_Align2RegClass;
    if (RC == &AMDGPU::VReg_160RegClass || RC == &AMDGPU::AReg_160RegClass)
      return &AMDGPU::AV_160RegClass;
    if (RC == &AMDGPU::VReg_160_Align2RegClass ||
        RC == &AMDGPU::AReg_160_Align2RegClass)
      return &AMDGPU::AV_160_Align2RegClass;
    if (RC == &AMDGPU::VReg_192RegClass || RC == &AMDGPU::AReg_192RegClass)
      return &AMDGPU::AV_192RegClass;
    if (RC == &AMDGPU::VReg_192_Align2RegClass ||
        RC == &AMDGPU::AReg_192_Align2RegClass)
      return &AMDGPU::AV_192_Align2RegClass;
    if (RC == &AMDGPU::VReg_256RegClass || RC == &AMDGPU::AReg_256RegClass)
      return &AMDGPU::AV_256RegClass;
    if (RC == &AMDGPU::VReg_256_Align2RegClass ||
        RC == &AMDGPU::AReg_256_Align2RegClass)
      return &AMDGPU::AV_256_Align2RegClass;
    if (RC == &AMDGPU::VReg_512RegClass || RC == &AMDGPU::AReg_512RegClass)
      return &AMDGPU::AV_512RegClass;
    if (RC == &AMDGPU::VReg_512_Align2RegClass ||
        RC == &AMDGPU::AReg_512_Align2RegClass)
      return &AMDGPU::AV_512_Align2RegClass;
    if (RC == &AMDGPU::VReg_1024RegClass || RC == &AMDGPU::AReg_1024RegClass)
      return &AMDGPU::AV_1024RegClass;
    if (RC == &AMDGPU::VReg_1024_Align2RegClass ||
        RC == &AMDGPU::AReg_1024_Align2RegClass)
      return &AMDGPU::AV_1024_Align2RegClass;
  }

  return TargetRegisterInfo::getLargestLegalSuperClass(RC, MF);
}

Register SIRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
  const SIFrameLowering *TFI = ST.getFrameLowering();
  const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
  // During ISel lowering we always reserve the stack pointer in entry
  // functions, but never actually want to reference it when accessing our own
  // frame. If we need a frame pointer we use it, but otherwise we can just use
  // an immediate "0" which we represent by returning NoRegister.
  if (FuncInfo->isEntryFunction()) {
    return TFI->hasFP(MF) ? FuncInfo->getFrameOffsetReg() : Register();
  }
  return TFI->hasFP(MF) ? FuncInfo->getFrameOffsetReg()
                        : FuncInfo->getStackPtrOffsetReg();
}

bool SIRegisterInfo::hasBasePointer(const MachineFunction &MF) const {
  // When we need stack realignment, we can't reference off of the
  // stack pointer, so we reserve a base pointer.
  const MachineFrameInfo &MFI = MF.getFrameInfo();
  return MFI.getNumFixedObjects() && shouldRealignStack(MF);
}

Register SIRegisterInfo::getBaseRegister() const { return AMDGPU::SGPR34; }

const uint32_t *SIRegisterInfo::getAllVGPRRegMask() const {
  return AMDGPU_AllVGPRs_RegMask;
}

const uint32_t *SIRegisterInfo::getAllAGPRRegMask() const {
  return AMDGPU_AllAGPRs_RegMask;
}

const uint32_t *SIRegisterInfo::getAllVectorRegMask() const {
  return AMDGPU_AllVectorRegs_RegMask;
}

const uint32_t *SIRegisterInfo::getAllAllocatableSRegMask() const {
  return AMDGPU_AllAllocatableSRegs_RegMask;
}

unsigned SIRegisterInfo::getSubRegFromChannel(unsigned Channel,
                                              unsigned NumRegs) {
  assert(NumRegs < SubRegFromChannelTableWidthMap.size());
  unsigned NumRegIndex = SubRegFromChannelTableWidthMap[NumRegs];
  assert(NumRegIndex && "Not implemented");
  assert(Channel < SubRegFromChannelTable[NumRegIndex - 1].size());
  return SubRegFromChannelTable[NumRegIndex - 1][Channel];
}
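// E.g. getSubRegFromChannel(2, 2) looks up the 2-DWORD row (table index
// SubRegFromChannelTableWidthMap[2] - 1 == 1) at channel 2 and yields the
// subreg index covering channels 2-3, i.e. AMDGPU::sub2_sub3.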
MCRegister SIRegisterInfo::reservedPrivateSegmentBufferReg(
    const MachineFunction &MF) const {
  unsigned BaseIdx = alignDown(ST.getMaxNumSGPRs(MF), 4) - 4;
  MCRegister BaseReg(AMDGPU::SGPR_32RegClass.getRegister(BaseIdx));
  return getMatchingSuperReg(BaseReg, AMDGPU::sub0, &AMDGPU::SGPR_128RegClass);
}
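// E.g. with a limit of 102 SGPRs this computes alignDown(102, 4) - 4 == 96,
// so the 128-bit resource descriptor lands in s[96:99], the highest aligned
// 4-SGPR tuple that still fits below the limit.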
BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
  BitVector Reserved(getNumRegs());
  Reserved.set(AMDGPU::MODE);

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  // Reserve special purpose registers.
  //
  // EXEC_LO and EXEC_HI could be allocated and used as regular registers, but
  // this seems likely to result in bugs, so I'm marking them as reserved.
  reserveRegisterTuples(Reserved, AMDGPU::EXEC);
  reserveRegisterTuples(Reserved, AMDGPU::FLAT_SCR);

  // M0 has to be reserved so that llvm accepts it as a live-in into a block.
  reserveRegisterTuples(Reserved, AMDGPU::M0);

  // Reserve src_vccz, src_execz, src_scc.
  reserveRegisterTuples(Reserved, AMDGPU::SRC_VCCZ);
  reserveRegisterTuples(Reserved, AMDGPU::SRC_EXECZ);
  reserveRegisterTuples(Reserved, AMDGPU::SRC_SCC);

  // Reserve the memory aperture registers.
  reserveRegisterTuples(Reserved, AMDGPU::SRC_SHARED_BASE);
  reserveRegisterTuples(Reserved, AMDGPU::SRC_SHARED_LIMIT);
  reserveRegisterTuples(Reserved, AMDGPU::SRC_PRIVATE_BASE);
  reserveRegisterTuples(Reserved, AMDGPU::SRC_PRIVATE_LIMIT);

  // Reserve src_pops_exiting_wave_id - support is not implemented in Codegen.
  reserveRegisterTuples(Reserved, AMDGPU::SRC_POPS_EXITING_WAVE_ID);

  // Reserve xnack_mask registers - support is not implemented in Codegen.
  reserveRegisterTuples(Reserved, AMDGPU::XNACK_MASK);

  // Reserve lds_direct register - support is not implemented in Codegen.
  reserveRegisterTuples(Reserved, AMDGPU::LDS_DIRECT);

  // Reserve Trap Handler registers - support is not implemented in Codegen.
  reserveRegisterTuples(Reserved, AMDGPU::TBA);
  reserveRegisterTuples(Reserved, AMDGPU::TMA);
  reserveRegisterTuples(Reserved, AMDGPU::TTMP0_TTMP1);
  reserveRegisterTuples(Reserved, AMDGPU::TTMP2_TTMP3);
  reserveRegisterTuples(Reserved, AMDGPU::TTMP4_TTMP5);
  reserveRegisterTuples(Reserved, AMDGPU::TTMP6_TTMP7);
  reserveRegisterTuples(Reserved, AMDGPU::TTMP8_TTMP9);
  reserveRegisterTuples(Reserved, AMDGPU::TTMP10_TTMP11);
  reserveRegisterTuples(Reserved, AMDGPU::TTMP12_TTMP13);
  reserveRegisterTuples(Reserved, AMDGPU::TTMP14_TTMP15);

  // Reserve null register - it shall never be allocated.
  reserveRegisterTuples(Reserved, AMDGPU::SGPR_NULL64);

  // Disallow vcc_hi allocation in wave32. It may be allocated but most likely
  // will result in bugs.
  if (isWave32) {
    Reserved.set(AMDGPU::VCC);
    Reserved.set(AMDGPU::VCC_HI);
  }

  // Reserve SGPRs.
  //
  unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
  unsigned TotalNumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
  for (unsigned i = MaxNumSGPRs; i < TotalNumSGPRs; ++i) {
    unsigned Reg = AMDGPU::SGPR_32RegClass.getRegister(i);
    reserveRegisterTuples(Reserved, Reg);
  }

  for (auto Reg : AMDGPU::SReg_32RegClass) {
    Reserved.set(getSubReg(Reg, AMDGPU::hi16));
    Register Low = getSubReg(Reg, AMDGPU::lo16);
    // This is to prevent BB vcc liveness errors.
    if (!AMDGPU::SGPR_LO16RegClass.contains(Low))
      Reserved.set(Low);
  }

  Register ScratchRSrcReg = MFI->getScratchRSrcReg();
  if (ScratchRSrcReg != AMDGPU::NoRegister) {
    // Reserve 4 SGPRs for the scratch buffer resource descriptor in case we
    // need to spill.
    // TODO: May need to reserve a VGPR if doing LDS spilling.
    reserveRegisterTuples(Reserved, ScratchRSrcReg);
  }

  // We have to assume the SP is needed in case there are calls in the
  // function, which is detected after the function is lowered. If we aren't
  // really going to need SP, don't bother reserving it.
  MCRegister StackPtrReg = MFI->getStackPtrOffsetReg();
  if (StackPtrReg) {
    reserveRegisterTuples(Reserved, StackPtrReg);
    assert(!isSubRegister(ScratchRSrcReg, StackPtrReg));
  }

  MCRegister FrameReg = MFI->getFrameOffsetReg();
  if (FrameReg) {
    reserveRegisterTuples(Reserved, FrameReg);
    assert(!isSubRegister(ScratchRSrcReg, FrameReg));
  }

  if (hasBasePointer(MF)) {
    MCRegister BasePtrReg = getBaseRegister();
    reserveRegisterTuples(Reserved, BasePtrReg);
    assert(!isSubRegister(ScratchRSrcReg, BasePtrReg));
  }

  // Reserve VGPRs/AGPRs.
  //
  unsigned MaxNumVGPRs = ST.getMaxNumVGPRs(MF);
  unsigned MaxNumAGPRs = MaxNumVGPRs;
  unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs();

  // Reserve all the AGPRs if there are no instructions to use them.
  if (!ST.hasMAIInsts()) {
    for (unsigned i = 0; i < MaxNumAGPRs; ++i) {
      unsigned Reg = AMDGPU::AGPR_32RegClass.getRegister(i);
      reserveRegisterTuples(Reserved, Reg);
    }
  }

  for (auto Reg : AMDGPU::AGPR_32RegClass) {
    Reserved.set(getSubReg(Reg, AMDGPU::hi16));
  }

  // On GFX90A, the number of VGPRs and AGPRs need not be equal. Theoretically,
  // a wave may have up to 512 total vector registers combining together both
  // VGPRs and AGPRs. Hence, in an entry function without calls and without
  // AGPRs used within it, it is possible to use the whole vector register
  // budget for VGPRs.
  //
  // TODO: it shall be possible to estimate maximum AGPR/VGPR pressure and
  // split the register file accordingly.
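  // For example, a 256-register budget with no AGPR uses leaves all 256
  // VGPRs allocatable and reserves every AGPR (MaxNumAGPRs = 0); once AGPRs
  // are used, the budget is split evenly into 128 VGPRs plus 128 AGPRs.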
  if (ST.hasGFX90AInsts()) {
    if (MFI->usesAGPRs(MF)) {
      MaxNumVGPRs /= 2;
      MaxNumAGPRs = MaxNumVGPRs;
    } else {
      if (MaxNumVGPRs > TotalNumVGPRs) {
        MaxNumAGPRs = MaxNumVGPRs - TotalNumVGPRs;
        MaxNumVGPRs = TotalNumVGPRs;
      } else
        MaxNumAGPRs = 0;
    }
  }

  for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i) {
    unsigned Reg = AMDGPU::VGPR_32RegClass.getRegister(i);
    reserveRegisterTuples(Reserved, Reg);
  }

  for (unsigned i = MaxNumAGPRs; i < TotalNumVGPRs; ++i) {
    unsigned Reg = AMDGPU::AGPR_32RegClass.getRegister(i);
    reserveRegisterTuples(Reserved, Reg);
  }

  // On GFX908, in order to guarantee copying between AGPRs, we need a scratch
  // VGPR available at all times.
  if (ST.hasMAIInsts() && !ST.hasGFX90AInsts()) {
    reserveRegisterTuples(Reserved, MFI->getVGPRForAGPRCopy());
  }

  for (Register Reg : MFI->getWWMReservedRegs())
    reserveRegisterTuples(Reserved, Reg);

  // FIXME: Stop using reserved registers for this.
  for (MCPhysReg Reg : MFI->getAGPRSpillVGPRs())
    reserveRegisterTuples(Reserved, Reg);

  for (MCPhysReg Reg : MFI->getVGPRSpillAGPRs())
    reserveRegisterTuples(Reserved, Reg);

  for (auto Reg : MFI->getSGPRSpillVGPRs())
    reserveRegisterTuples(Reserved, Reg);

  return Reserved;
}

bool SIRegisterInfo::isAsmClobberable(const MachineFunction &MF,
                                      MCRegister PhysReg) const {
  return !MF.getRegInfo().isReserved(PhysReg);
}

bool SIRegisterInfo::shouldRealignStack(const MachineFunction &MF) const {
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  // On entry, the base address is 0, so it can't possibly need any more
  // alignment.

  // FIXME: Should be able to specify the entry frame alignment per calling
  // convention instead.
  if (Info->isEntryFunction())
    return false;

  return TargetRegisterInfo::shouldRealignStack(MF);
}

bool SIRegisterInfo::requiresRegisterScavenging(
    const MachineFunction &Fn) const {
  const SIMachineFunctionInfo *Info = Fn.getInfo<SIMachineFunctionInfo>();
  if (Info->isEntryFunction()) {
    const MachineFrameInfo &MFI = Fn.getFrameInfo();
    return MFI.hasStackObjects() || MFI.hasCalls();
  }

  // May need scavenger for dealing with callee saved registers.
  return true;
}

bool SIRegisterInfo::requiresFrameIndexScavenging(
    const MachineFunction &MF) const {
  // Do not use frame virtual registers. They used to be used for SGPRs, but
  // once we reach PrologEpilogInserter, we can no longer spill SGPRs. If the
  // scavenger fails, we can increment/decrement the necessary SGPRs to avoid
  // a spill.
  return false;
}

bool SIRegisterInfo::requiresFrameIndexReplacementScavenging(
    const MachineFunction &MF) const {
  const MachineFrameInfo &MFI = MF.getFrameInfo();
  return MFI.hasStackObjects();
}

bool SIRegisterInfo::requiresVirtualBaseRegisters(
    const MachineFunction &) const {
  // There are no special dedicated stack or frame pointers.
  return true;
}

int64_t SIRegisterInfo::getScratchInstrOffset(const MachineInstr *MI) const {
  assert(SIInstrInfo::isMUBUF(*MI) || SIInstrInfo::isFLATScratch(*MI));

  int OffIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
                                          AMDGPU::OpName::offset);
  return MI->getOperand(OffIdx).getImm();
}

int64_t SIRegisterInfo::getFrameIndexInstrOffset(const MachineInstr *MI,
                                                 int Idx) const {
  if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isFLATScratch(*MI))
    return 0;

  assert((Idx == AMDGPU::getNamedOperandIdx(MI->getOpcode(),
                                            AMDGPU::OpName::vaddr) ||
          (Idx == AMDGPU::getNamedOperandIdx(MI->getOpcode(),
                                             AMDGPU::OpName::saddr))) &&
         "Should never see frame index on non-address operand");

  return getScratchInstrOffset(MI);
}

bool SIRegisterInfo::needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const {
  if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isFLATScratch(*MI))
    return false;

  int64_t FullOffset = Offset + getScratchInstrOffset(MI);

  if (SIInstrInfo::isMUBUF(*MI))
    return !SIInstrInfo::isLegalMUBUFImmOffset(FullOffset);

  const SIInstrInfo *TII = ST.getInstrInfo();
  return !TII->isLegalFLATOffset(FullOffset, AMDGPUAS::PRIVATE_ADDRESS,
                                 SIInstrFlags::FlatScratch);
}

Register SIRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB,
                                                      int FrameIdx,
                                                      int64_t Offset) const {
  MachineBasicBlock::iterator Ins = MBB->begin();
  DebugLoc DL; // Defaults to "unknown"

  if (Ins != MBB->end())
    DL = Ins->getDebugLoc();

  MachineFunction *MF = MBB->getParent();
  const SIInstrInfo *TII = ST.getInstrInfo();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  unsigned MovOpc = ST.enableFlatScratch() ? AMDGPU::S_MOV_B32
                                           : AMDGPU::V_MOV_B32_e32;

  Register BaseReg = MRI.createVirtualRegister(
      ST.enableFlatScratch() ? &AMDGPU::SReg_32_XEXEC_HIRegClass
                             : &AMDGPU::VGPR_32RegClass);

  if (Offset == 0) {
    BuildMI(*MBB, Ins, DL, TII->get(MovOpc), BaseReg)
        .addFrameIndex(FrameIdx);
    return BaseReg;
  }

  Register OffsetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);

  Register FIReg = MRI.createVirtualRegister(
      ST.enableFlatScratch() ? &AMDGPU::SReg_32_XM0RegClass
                             : &AMDGPU::VGPR_32RegClass);

  BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg)
      .addImm(Offset);
  BuildMI(*MBB, Ins, DL, TII->get(MovOpc), FIReg)
      .addFrameIndex(FrameIdx);

  if (ST.enableFlatScratch()) {
    BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::S_ADD_I32), BaseReg)
        .addReg(OffsetReg, RegState::Kill)
        .addReg(FIReg);
    return BaseReg;
  }

  TII->getAddNoCarry(*MBB, Ins, DL, BaseReg)
      .addReg(OffsetReg, RegState::Kill)
      .addReg(FIReg)
      .addImm(0); // clamp bit

  return BaseReg;
}

void SIRegisterInfo::resolveFrameIndex(MachineInstr &MI, Register BaseReg,
                                       int64_t Offset) const {
  const SIInstrInfo *TII = ST.getInstrInfo();
  bool IsFlat = TII->isFLATScratch(MI);

#ifndef NDEBUG
  // FIXME: Is it possible to be storing a frame index to itself?
  bool SeenFI = false;
  for (const MachineOperand &MO : MI.operands()) {
    if (MO.isFI()) {
      if (SeenFI)
        llvm_unreachable("should not see multiple frame indices");

      SeenFI = true;
    }
  }
#endif

  MachineOperand *FIOp =
      TII->getNamedOperand(MI, IsFlat ? AMDGPU::OpName::saddr
                                      : AMDGPU::OpName::vaddr);

  MachineOperand *OffsetOp = TII->getNamedOperand(MI, AMDGPU::OpName::offset);
  int64_t NewOffset = OffsetOp->getImm() + Offset;

  assert(FIOp && FIOp->isFI() && "frame index must be address operand");
  assert(TII->isMUBUF(MI) || TII->isFLATScratch(MI));

  if (IsFlat) {
    assert(TII->isLegalFLATOffset(NewOffset, AMDGPUAS::PRIVATE_ADDRESS,
                                  SIInstrFlags::FlatScratch) &&
           "offset should be legal");
    FIOp->ChangeToRegister(BaseReg, false);
    OffsetOp->setImm(NewOffset);
    return;
  }

#ifndef NDEBUG
  MachineOperand *SOffset = TII->getNamedOperand(MI, AMDGPU::OpName::soffset);
  assert(SOffset->isImm() && SOffset->getImm() == 0);
#endif

  assert(SIInstrInfo::isLegalMUBUFImmOffset(NewOffset) &&
         "offset should be legal");

  FIOp->ChangeToRegister(BaseReg, false);
  OffsetOp->setImm(NewOffset);
}
bool SIRegisterInfo::isFrameOffsetLegal(const MachineInstr *MI,
                                        Register BaseReg,
                                        int64_t Offset) const {
  if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isFLATScratch(*MI))
    return false;

  int64_t NewOffset = Offset + getScratchInstrOffset(MI);

  if (SIInstrInfo::isMUBUF(*MI))
    return SIInstrInfo::isLegalMUBUFImmOffset(NewOffset);

  const SIInstrInfo *TII = ST.getInstrInfo();
  return TII->isLegalFLATOffset(NewOffset, AMDGPUAS::PRIVATE_ADDRESS,
                                SIInstrFlags::FlatScratch);
}

const TargetRegisterClass *SIRegisterInfo::getPointerRegClass(
    const MachineFunction &MF, unsigned Kind) const {
  // This is inaccurate. It depends on the instruction and address space. The
  // only place where we should hit this is for dealing with frame indexes /
  // private accesses, so this is correct in that case.
  return &AMDGPU::VGPR_32RegClass;
}

const TargetRegisterClass *
SIRegisterInfo::getCrossCopyRegClass(const TargetRegisterClass *RC) const {
  if (isAGPRClass(RC) && !ST.hasGFX90AInsts())
    return getEquivalentVGPRClass(RC);
  if (RC == &AMDGPU::SCC_CLASSRegClass)
    return getWaveMaskRegClass();

  return RC;
}

static unsigned getNumSubRegsForSpillOp(unsigned Op) {
  switch (Op) {
  case AMDGPU::SI_SPILL_S1024_SAVE:
  case AMDGPU::SI_SPILL_S1024_RESTORE:
  case AMDGPU::SI_SPILL_V1024_SAVE:
  case AMDGPU::SI_SPILL_V1024_RESTORE:
  case AMDGPU::SI_SPILL_A1024_SAVE:
  case AMDGPU::SI_SPILL_A1024_RESTORE:
  case AMDGPU::SI_SPILL_AV1024_SAVE:
  case AMDGPU::SI_SPILL_AV1024_RESTORE:
    return 32;
  case AMDGPU::SI_SPILL_S512_SAVE:
  case AMDGPU::SI_SPILL_S512_RESTORE:
  case AMDGPU::SI_SPILL_V512_SAVE:
  case AMDGPU::SI_SPILL_V512_RESTORE:
  case AMDGPU::SI_SPILL_A512_SAVE:
  case AMDGPU::SI_SPILL_A512_RESTORE:
  case AMDGPU::SI_SPILL_AV512_SAVE:
  case AMDGPU::SI_SPILL_AV512_RESTORE:
    return 16;
  case AMDGPU::SI_SPILL_S384_SAVE:
  case AMDGPU::SI_SPILL_S384_RESTORE:
  case AMDGPU::SI_SPILL_V384_SAVE:
  case AMDGPU::SI_SPILL_V384_RESTORE:
  case AMDGPU::SI_SPILL_A384_SAVE:
  case AMDGPU::SI_SPILL_A384_RESTORE:
  case AMDGPU::SI_SPILL_AV384_SAVE:
  case AMDGPU::SI_SPILL_AV384_RESTORE:
    return 12;
  case AMDGPU::SI_SPILL_S352_SAVE:
  case AMDGPU::SI_SPILL_S352_RESTORE:
  case AMDGPU::SI_SPILL_V352_SAVE:
  case AMDGPU::SI_SPILL_V352_RESTORE:
  case AMDGPU::SI_SPILL_A352_SAVE:
  case AMDGPU::SI_SPILL_A352_RESTORE:
  case AMDGPU::SI_SPILL_AV352_SAVE:
  case AMDGPU::SI_SPILL_AV352_RESTORE:
    return 11;
  case AMDGPU::SI_SPILL_S320_SAVE:
  case AMDGPU::SI_SPILL_S320_RESTORE:
  case AMDGPU::SI_SPILL_V320_SAVE:
  case AMDGPU::SI_SPILL_V320_RESTORE:
  case AMDGPU::SI_SPILL_A320_SAVE:
  case AMDGPU::SI_SPILL_A320_RESTORE:
  case AMDGPU::SI_SPILL_AV320_SAVE:
  case AMDGPU::SI_SPILL_AV320_RESTORE:
    return 10;
  case AMDGPU::SI_SPILL_S288_SAVE:
  case AMDGPU::SI_SPILL_S288_RESTORE:
  case AMDGPU::SI_SPILL_V288_SAVE:
  case AMDGPU::SI_SPILL_V288_RESTORE:
  case AMDGPU::SI_SPILL_A288_SAVE:
  case AMDGPU::SI_SPILL_A288_RESTORE:
  case AMDGPU::SI_SPILL_AV288_SAVE:
  case AMDGPU::SI_SPILL_AV288_RESTORE:
    return 9;
  case AMDGPU::SI_SPILL_S256_SAVE:
  case AMDGPU::SI_SPILL_S256_RESTORE:
  case AMDGPU::SI_SPILL_V256_SAVE:
  case AMDGPU::SI_SPILL_V256_RESTORE:
  case AMDGPU::SI_SPILL_A256_SAVE:
  case AMDGPU::SI_SPILL_A256_RESTORE:
  case AMDGPU::SI_SPILL_AV256_SAVE:
  case AMDGPU::SI_SPILL_AV256_RESTORE:
    return 8;
  case AMDGPU::SI_SPILL_S224_SAVE:
  case AMDGPU::SI_SPILL_S224_RESTORE:
  case AMDGPU::SI_SPILL_V224_SAVE:
  case AMDGPU::SI_SPILL_V224_RESTORE:
  case AMDGPU::SI_SPILL_A224_SAVE:
  case AMDGPU::SI_SPILL_A224_RESTORE:
  case AMDGPU::SI_SPILL_AV224_SAVE:
  case AMDGPU::SI_SPILL_AV224_RESTORE:
    return 7;
  case AMDGPU::SI_SPILL_S192_SAVE:
  case AMDGPU::SI_SPILL_S192_RESTORE:
  case AMDGPU::SI_SPILL_V192_SAVE:
  case AMDGPU::SI_SPILL_V192_RESTORE:
  case AMDGPU::SI_SPILL_A192_SAVE:
  case AMDGPU::SI_SPILL_A192_RESTORE:
  case AMDGPU::SI_SPILL_AV192_SAVE:
  case AMDGPU::SI_SPILL_AV192_RESTORE:
    return 6;
  case AMDGPU::SI_SPILL_S160_SAVE:
  case AMDGPU::SI_SPILL_S160_RESTORE:
  case AMDGPU::SI_SPILL_V160_SAVE:
  case AMDGPU::SI_SPILL_V160_RESTORE:
  case AMDGPU::SI_SPILL_A160_SAVE:
  case AMDGPU::SI_SPILL_A160_RESTORE:
  case AMDGPU::SI_SPILL_AV160_SAVE:
  case AMDGPU::SI_SPILL_AV160_RESTORE:
    return 5;
  case AMDGPU::SI_SPILL_S128_SAVE:
  case AMDGPU::SI_SPILL_S128_RESTORE:
  case AMDGPU::SI_SPILL_V128_SAVE:
  case AMDGPU::SI_SPILL_V128_RESTORE:
  case AMDGPU::SI_SPILL_A128_SAVE:
  case AMDGPU::SI_SPILL_A128_RESTORE:
  case AMDGPU::SI_SPILL_AV128_SAVE:
  case AMDGPU::SI_SPILL_AV128_RESTORE:
    return 4;
  case AMDGPU::SI_SPILL_S96_SAVE:
  case AMDGPU::SI_SPILL_S96_RESTORE:
  case AMDGPU::SI_SPILL_V96_SAVE:
  case AMDGPU::SI_SPILL_V96_RESTORE:
  case AMDGPU::SI_SPILL_A96_SAVE:
  case AMDGPU::SI_SPILL_A96_RESTORE:
  case AMDGPU::SI_SPILL_AV96_SAVE:
  case AMDGPU::SI_SPILL_AV96_RESTORE:
    return 3;
  case AMDGPU::SI_SPILL_S64_SAVE:
  case AMDGPU::SI_SPILL_S64_RESTORE:
  case AMDGPU::SI_SPILL_V64_SAVE:
  case AMDGPU::SI_SPILL_V64_RESTORE:
  case AMDGPU::SI_SPILL_A64_SAVE:
  case AMDGPU::SI_SPILL_A64_RESTORE:
  case AMDGPU::SI_SPILL_AV64_SAVE:
  case AMDGPU::SI_SPILL_AV64_RESTORE:
    return 2;
  case AMDGPU::SI_SPILL_S32_SAVE:
  case AMDGPU::SI_SPILL_S32_RESTORE:
  case AMDGPU::SI_SPILL_V32_SAVE:
  case AMDGPU::SI_SPILL_V32_RESTORE:
  case AMDGPU::SI_SPILL_A32_SAVE:
  case AMDGPU::SI_SPILL_A32_RESTORE:
  case AMDGPU::SI_SPILL_AV32_SAVE:
  case AMDGPU::SI_SPILL_AV32_RESTORE:
    return 1;
  default: llvm_unreachable("Invalid spill opcode");
  }
}

static int getOffsetMUBUFStore(unsigned Opc) {
  switch (Opc) {
  case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
    return AMDGPU::BUFFER_STORE_DWORD_OFFSET;
  case AMDGPU::BUFFER_STORE_BYTE_OFFEN:
    return AMDGPU::BUFFER_STORE_BYTE_OFFSET;
  case AMDGPU::BUFFER_STORE_SHORT_OFFEN:
    return AMDGPU::BUFFER_STORE_SHORT_OFFSET;
  case AMDGPU::BUFFER_STORE_DWORDX2_OFFEN:
    return AMDGPU::BUFFER_STORE_DWORDX2_OFFSET;
  case AMDGPU::BUFFER_STORE_DWORDX3_OFFEN:
    return AMDGPU::BUFFER_STORE_DWORDX3_OFFSET;
  case AMDGPU::BUFFER_STORE_DWORDX4_OFFEN:
    return AMDGPU::BUFFER_STORE_DWORDX4_OFFSET;
  case AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFEN:
    return AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFSET;
  case AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFEN:
    return AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFSET;
  default:
    return -1;
  }
}

static int getOffsetMUBUFLoad(unsigned Opc) {
  switch (Opc) {
  case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
    return AMDGPU::BUFFER_LOAD_DWORD_OFFSET;
  case AMDGPU::BUFFER_LOAD_UBYTE_OFFEN:
    return AMDGPU::BUFFER_LOAD_UBYTE_OFFSET;
  case AMDGPU::BUFFER_LOAD_SBYTE_OFFEN:
    return AMDGPU::BUFFER_LOAD_SBYTE_OFFSET;
  case AMDGPU::BUFFER_LOAD_USHORT_OFFEN:
    return AMDGPU::BUFFER_LOAD_USHORT_OFFSET;
  case AMDGPU::BUFFER_LOAD_SSHORT_OFFEN:
    return AMDGPU::BUFFER_LOAD_SSHORT_OFFSET;
  case AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN:
    return AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET;
  case AMDGPU::BUFFER_LOAD_DWORDX3_OFFEN:
    return AMDGPU::BUFFER_LOAD_DWORDX3_OFFSET;
  case AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN:
    return AMDGPU::BUFFER_LOAD_DWORDX4_OFFSET;
  case AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFEN:
    return AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFSET;
  case AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFEN:
    return AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFSET;
  case AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFEN:
    return AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFSET;
  case AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFEN:
    return AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFSET;
  case AMDGPU::BUFFER_LOAD_SHORT_D16_OFFEN:
    return AMDGPU::BUFFER_LOAD_SHORT_D16_OFFSET;
  case AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFEN:
    return AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFSET;
  default:
    return -1;
  }
}

static int getOffenMUBUFStore(unsigned Opc) {
  switch (Opc) {
  case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
    return AMDGPU::BUFFER_STORE_DWORD_OFFEN;
  case AMDGPU::BUFFER_STORE_BYTE_OFFSET:
    return AMDGPU::BUFFER_STORE_BYTE_OFFEN;
  case AMDGPU::BUFFER_STORE_SHORT_OFFSET:
    return AMDGPU::BUFFER_STORE_SHORT_OFFEN;
  case AMDGPU::BUFFER_STORE_DWORDX2_OFFSET:
    return AMDGPU::BUFFER_STORE_DWORDX2_OFFEN;
  case AMDGPU::BUFFER_STORE_DWORDX3_OFFSET:
    return AMDGPU::BUFFER_STORE_DWORDX3_OFFEN;
  case AMDGPU::BUFFER_STORE_DWORDX4_OFFSET:
    return AMDGPU::BUFFER_STORE_DWORDX4_OFFEN;
  case AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFSET:
    return AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFEN;
  case AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFSET:
    return AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFEN;
  default:
    return -1;
  }
}

static int getOffenMUBUFLoad(unsigned Opc) {
  switch (Opc) {
  case AMDGPU::BUFFER_LOAD_DWORD_OFFSET:
    return AMDGPU::BUFFER_LOAD_DWORD_OFFEN;
  case AMDGPU::BUFFER_LOAD_UBYTE_OFFSET:
    return AMDGPU::BUFFER_LOAD_UBYTE_OFFEN;
  case AMDGPU::BUFFER_LOAD_SBYTE_OFFSET:
    return AMDGPU::BUFFER_LOAD_SBYTE_OFFEN;
  case AMDGPU::BUFFER_LOAD_USHORT_OFFSET:
    return AMDGPU::BUFFER_LOAD_USHORT_OFFEN;
  case AMDGPU::BUFFER_LOAD_SSHORT_OFFSET:
    return AMDGPU::BUFFER_LOAD_SSHORT_OFFEN;
  case AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET:
    return AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN;
  case AMDGPU::BUFFER_LOAD_DWORDX3_OFFSET:
    return AMDGPU::BUFFER_LOAD_DWORDX3_OFFEN;
  case AMDGPU::BUFFER_LOAD_DWORDX4_OFFSET:
    return AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN;
  case AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFSET:
    return AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFEN;
  case AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFSET:
    return AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFEN;
  case AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFSET:
    return AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFEN;
  case AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFSET:
    return AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFEN;
  case AMDGPU::BUFFER_LOAD_SHORT_D16_OFFSET:
    return AMDGPU::BUFFER_LOAD_SHORT_D16_OFFEN;
  case AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFSET:
    return AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFEN;
  default:
    return -1;
  }
}

static MachineInstrBuilder spillVGPRtoAGPR(const GCNSubtarget &ST,
                                           MachineBasicBlock &MBB,
                                           MachineBasicBlock::iterator MI,
                                           int Index, unsigned Lane,
                                           unsigned ValueReg, bool IsKill) {
  MachineFunction *MF = MBB.getParent();
  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
  const SIInstrInfo *TII = ST.getInstrInfo();

  MCPhysReg Reg = MFI->getVGPRToAGPRSpill(Index, Lane);

  if (Reg == AMDGPU::NoRegister)
    return MachineInstrBuilder();

  bool IsStore = MI->mayStore();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  auto *TRI = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

  unsigned Dst = IsStore ? Reg : ValueReg;
  unsigned Src = IsStore ? ValueReg : Reg;
  bool IsVGPR = TRI->isVGPR(MRI, Reg);
  DebugLoc DL = MI->getDebugLoc();
  if (IsVGPR == TRI->isVGPR(MRI, ValueReg)) {
    // The spiller during regalloc may restore a spilled register to its
    // superclass. That can land AGPR spills in VGPRs or the other way around,
    // leaving src and dst with identical register classes at this point; a
    // plain copy is all that is needed in such cases.
    auto CopyMIB = BuildMI(MBB, MI, DL, TII->get(AMDGPU::COPY), Dst)
                       .addReg(Src, getKillRegState(IsKill));
    CopyMIB->setAsmPrinterFlag(MachineInstr::ReloadReuse);
    return CopyMIB;
  }
  unsigned Opc = (IsStore ^ IsVGPR) ? AMDGPU::V_ACCVGPR_WRITE_B32_e64
                                    : AMDGPU::V_ACCVGPR_READ_B32_e64;

  auto MIB = BuildMI(MBB, MI, DL, TII->get(Opc), Dst)
                 .addReg(Src, getKillRegState(IsKill));
  MIB->setAsmPrinterFlag(MachineInstr::ReloadReuse);
  return MIB;
}
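// For a store whose value lives in, say, v1 and whose slot maps to a free
// AGPR, say a2, this emits
//   v_accvgpr_write_b32 a2, v1
// and the reload emits the matching v_accvgpr_read_b32 v1, a2.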
// This differs from buildSpillLoadStore by only scavenging a VGPR. It does not
// need to handle the case where an SGPR may need to be spilled while spilling.
static bool buildMUBUFOffsetLoadStore(const GCNSubtarget &ST,
                                      MachineFrameInfo &MFI,
                                      MachineBasicBlock::iterator MI,
                                      int Index,
                                      int64_t Offset) {
  const SIInstrInfo *TII = ST.getInstrInfo();
  MachineBasicBlock *MBB = MI->getParent();
  const DebugLoc &DL = MI->getDebugLoc();
  bool IsStore = MI->mayStore();

  unsigned Opc = MI->getOpcode();
  int LoadStoreOp = IsStore ?
    getOffsetMUBUFStore(Opc) : getOffsetMUBUFLoad(Opc);
  if (LoadStoreOp == -1)
    return false;

  const MachineOperand *Reg = TII->getNamedOperand(*MI, AMDGPU::OpName::vdata);
  if (spillVGPRtoAGPR(ST, *MBB, MI, Index, 0, Reg->getReg(), false).getInstr())
    return true;

  MachineInstrBuilder NewMI =
      BuildMI(*MBB, MI, DL, TII->get(LoadStoreOp))
          .add(*Reg)
          .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::srsrc))
          .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::soffset))
          .addImm(Offset)
          .addImm(0) // cpol
          .addImm(0) // swz
          .cloneMemRefs(*MI);

  const MachineOperand *VDataIn =
      TII->getNamedOperand(*MI, AMDGPU::OpName::vdata_in);
  if (VDataIn)
    NewMI.add(*VDataIn);
  return true;
}

static unsigned getFlatScratchSpillOpcode(const SIInstrInfo *TII,
                                          unsigned LoadStoreOp,
                                          unsigned EltSize) {
  bool IsStore = TII->get(LoadStoreOp).mayStore();
  bool HasVAddr = AMDGPU::hasNamedOperand(LoadStoreOp, AMDGPU::OpName::vaddr);
  bool UseST =
      !HasVAddr && !AMDGPU::hasNamedOperand(LoadStoreOp, AMDGPU::OpName::saddr);

  switch (EltSize) {
  case 4:
    LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORD_SADDR
                          : AMDGPU::SCRATCH_LOAD_DWORD_SADDR;
    break;
  case 8:
    LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORDX2_SADDR
                          : AMDGPU::SCRATCH_LOAD_DWORDX2_SADDR;
    break;
  case 12:
    LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORDX3_SADDR
                          : AMDGPU::SCRATCH_LOAD_DWORDX3_SADDR;
    break;
  case 16:
    LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORDX4_SADDR
                          : AMDGPU::SCRATCH_LOAD_DWORDX4_SADDR;
    break;
  default:
    llvm_unreachable("Unexpected spill load/store size!");
  }

  if (HasVAddr)
    LoadStoreOp = AMDGPU::getFlatScratchInstSVfromSS(LoadStoreOp);
  else if (UseST)
    LoadStoreOp = AMDGPU::getFlatScratchInstSTfromSS(LoadStoreOp);

  return LoadStoreOp;
}

void SIRegisterInfo::buildSpillLoadStore(
    MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL,
    unsigned LoadStoreOp, int Index, Register ValueReg, bool IsKill,
    MCRegister ScratchOffsetReg, int64_t InstOffset, MachineMemOperand *MMO,
    RegScavenger *RS, LivePhysRegs *LiveRegs) const {
  assert((!RS || !LiveRegs) && "Only RS or LiveRegs can be set but not both");

  MachineFunction *MF = MBB.getParent();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const MachineFrameInfo &MFI = MF->getFrameInfo();
  const SIMachineFunctionInfo *FuncInfo = MF->getInfo<SIMachineFunctionInfo>();

  const MCInstrDesc *Desc = &TII->get(LoadStoreOp);
  bool IsStore = Desc->mayStore();
  bool IsFlat = TII->isFLATScratch(LoadStoreOp);

  bool CanClobberSCC = false;
  bool Scavenged = false;
  MCRegister SOffset = ScratchOffsetReg;

  const TargetRegisterClass *RC = getRegClassForReg(MF->getRegInfo(), ValueReg);
  // On gfx90a+ AGPR is a regular VGPR acceptable for loads and stores.
  const bool IsAGPR = !ST.hasGFX90AInsts() && isAGPRClass(RC);
  const unsigned RegWidth = AMDGPU::getRegBitWidth(RC->getID()) / 8;

  // Always use 4 byte operations for AGPRs because we need to scavenge
  // a temporary VGPR.
  unsigned EltSize = (IsFlat && !IsAGPR) ? std::min(RegWidth, 16u) : 4u;
  unsigned NumSubRegs = RegWidth / EltSize;
  unsigned Size = NumSubRegs * EltSize;
  unsigned RemSize = RegWidth - Size;
  unsigned NumRemSubRegs = RemSize ? 1 : 0;
  int64_t Offset = InstOffset + MFI.getObjectOffset(Index);
  int64_t MaterializedOffset = Offset;

  int64_t MaxOffset = Offset + Size + RemSize - EltSize;
  int64_t ScratchOffsetRegDelta = 0;
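  // E.g. a 96-bit AGPR tuple (IsAGPR forces EltSize = 4) becomes three
  // single-DWORD operations, while a plain 192-bit VGPR tuple spilled via
  // flat scratch gets EltSize = 16: one DWORDX4 access plus a DWORDX2
  // remainder (RemSize = 8).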
  if (IsFlat && EltSize > 4) {
    LoadStoreOp = getFlatScratchSpillOpcode(TII, LoadStoreOp, EltSize);
    Desc = &TII->get(LoadStoreOp);
  }

  Align Alignment = MFI.getObjectAlign(Index);
  const MachinePointerInfo &BasePtrInfo = MMO->getPointerInfo();

  assert((IsFlat || ((Offset % EltSize) == 0)) &&
         "unexpected VGPR spill offset");

  // Track a VGPR to use for a constant offset we need to materialize.
  Register TmpOffsetVGPR;

  // Track a VGPR to use as an intermediate value.
  Register TmpIntermediateVGPR;
  bool UseVGPROffset = false;

  // Materialize a VGPR offset required for the given SGPR/VGPR/Immediate
  // combination.
  auto MaterializeVOffset = [&](Register SGPRBase, Register TmpVGPR,
                                int64_t VOffset) {
    // We are using a VGPR offset
    if (IsFlat && SGPRBase) {
      // We only have 1 VGPR offset, or 1 SGPR offset. We don't have a free
      // SGPR, so perform the add as vector.
      // We don't need a base SGPR in the kernel.

      if (ST.getConstantBusLimit(AMDGPU::V_ADD_U32_e64) >= 2) {
        BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_ADD_U32_e64), TmpVGPR)
            .addReg(SGPRBase)
            .addImm(VOffset)
            .addImm(0); // clamp
      } else {
        BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR)
            .addReg(SGPRBase);
        BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_ADD_U32_e32), TmpVGPR)
            .addImm(VOffset)
            .addReg(TmpOffsetVGPR);
      }
    } else {
      assert(TmpOffsetVGPR);
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR)
          .addImm(VOffset);
    }
  };
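  // When the constant bus allows two scalar sources, this folds to a single
  //   v_add_u32_e64 v2, s32, 0x1000
  // where v2 stands for the scavenged TmpVGPR and s32 for the base SGPR;
  // otherwise it is a v_mov_b32 of the base followed by a v_add_u32_e32.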
  bool IsOffsetLegal =
      IsFlat ? TII->isLegalFLATOffset(MaxOffset, AMDGPUAS::PRIVATE_ADDRESS,
                                      SIInstrFlags::FlatScratch)
             : SIInstrInfo::isLegalMUBUFImmOffset(MaxOffset);
  if (!IsOffsetLegal || (IsFlat && !SOffset && !ST.hasFlatScratchSTMode())) {
    SOffset = MCRegister();

    // We don't have access to the register scavenger if this function is
    // called during PEI::scavengeFrameVirtualRegs() so use LiveRegs in this
    // case.
    // TODO: Clobbering SCC is not necessary for scratch instructions in the
    // entry.
    if (RS) {
      SOffset = RS->scavengeRegisterBackwards(AMDGPU::SGPR_32RegClass, MI,
                                              false, 0, false);

      // Piggy back on the liveness scan we just did to see if SCC is dead.
      CanClobberSCC = !RS->isRegUsed(AMDGPU::SCC);
    } else if (LiveRegs) {
      CanClobberSCC = !LiveRegs->contains(AMDGPU::SCC);
      for (MCRegister Reg : AMDGPU::SGPR_32RegClass) {
        if (LiveRegs->available(MF->getRegInfo(), Reg)) {
          SOffset = Reg;
          break;
        }
      }
    }

    if (ScratchOffsetReg != AMDGPU::NoRegister && !CanClobberSCC)
      SOffset = Register();

    if (!SOffset) {
      UseVGPROffset = true;

      if (RS) {
        TmpOffsetVGPR = RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass,
                                                      MI, false, 0);
      } else {
        assert(LiveRegs);
        for (MCRegister Reg : AMDGPU::VGPR_32RegClass) {
          if (LiveRegs->available(MF->getRegInfo(), Reg)) {
            TmpOffsetVGPR = Reg;
            break;
          }
        }
      }

      assert(TmpOffsetVGPR);
    } else if (!SOffset && CanClobberSCC) {
      // There are no free SGPRs, and we are in the process of spilling VGPRs
      // too. Since we need a VGPR in order to spill SGPRs (this is true on
      // SI/CI and on VI it is true until we implement spilling using scalar
      // stores), we have no way to free up an SGPR. Our solution here is to
      // add the offset directly to the ScratchOffset or StackPtrOffset
      // register, and then subtract the offset after the spill to return the
      // register to its original value.

      // TODO: If we don't have to do an emergency stack slot spill, converting
      // to use the VGPR offset is fewer instructions.
      if (!ScratchOffsetReg)
        ScratchOffsetReg = FuncInfo->getStackPtrOffsetReg();
      SOffset = ScratchOffsetReg;
      ScratchOffsetRegDelta = Offset;
    } else {
      Scavenged = true;
    }

    // We currently only support spilling VGPRs to EltSize boundaries, meaning
    // we can simplify the adjustment of Offset here to just scale with
    // WavefrontSize.
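    // E.g. in a wave64 MUBUF spill, a frame-object byte offset of 16 becomes
    // a wave-level scratch offset of 16 * 64 = 1024, since scratch memory is
    // swizzled per lane.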
    if (!IsFlat && !UseVGPROffset)
      Offset *= ST.getWavefrontSize();

    if (!UseVGPROffset && !SOffset)
      report_fatal_error("could not scavenge SGPR to spill in entry function");

    if (UseVGPROffset) {
      // We are using a VGPR offset
      MaterializeVOffset(ScratchOffsetReg, TmpOffsetVGPR, Offset);
    } else if (ScratchOffsetReg == AMDGPU::NoRegister) {
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), SOffset).addImm(Offset);
    } else {
      assert(Offset != 0);
      auto Add = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), SOffset)
                     .addReg(ScratchOffsetReg)
                     .addImm(Offset);
      Add->getOperand(3).setIsDead(); // Mark SCC as dead.
    }

    Offset = 0;
  }

  if (IsFlat && SOffset == AMDGPU::NoRegister) {
    assert(AMDGPU::getNamedOperandIdx(LoadStoreOp, AMDGPU::OpName::vaddr) < 0 &&
           "Unexpected vaddr for flat scratch with a FI operand");

    if (UseVGPROffset) {
      LoadStoreOp = AMDGPU::getFlatScratchInstSVfromSS(LoadStoreOp);
    } else {
      assert(ST.hasFlatScratchSTMode());
      LoadStoreOp = AMDGPU::getFlatScratchInstSTfromSS(LoadStoreOp);
    }

    Desc = &TII->get(LoadStoreOp);
  }

  for (unsigned i = 0, e = NumSubRegs + NumRemSubRegs, RegOffset = 0; i != e;
       ++i, RegOffset += EltSize) {
    if (i == NumSubRegs) {
      EltSize = RemSize;
      LoadStoreOp = getFlatScratchSpillOpcode(TII, LoadStoreOp, EltSize);
    }
    Desc = &TII->get(LoadStoreOp);

    if (!IsFlat && UseVGPROffset) {
      int NewLoadStoreOp = IsStore ? getOffenMUBUFStore(LoadStoreOp)
                                   : getOffenMUBUFLoad(LoadStoreOp);
      Desc = &TII->get(NewLoadStoreOp);
    }

    if (UseVGPROffset && TmpOffsetVGPR == TmpIntermediateVGPR) {
      // If we are spilling an AGPR beyond the range of the memory instruction
      // offset and need to use a VGPR offset, we ideally have at least 2
      // scratch VGPRs. If we don't have a second free VGPR without spilling,
      // recycle the VGPR used for the offset, which requires resetting it
      // after each subregister.
      MaterializeVOffset(ScratchOffsetReg, TmpOffsetVGPR, MaterializedOffset);
    }

    unsigned NumRegs = EltSize / 4;
    Register SubReg = e == 1
        ? ValueReg
        : Register(getSubReg(ValueReg,
                             getSubRegFromChannel(RegOffset / 4, NumRegs)));

    unsigned SOffsetRegState = 0;
    unsigned SrcDstRegState = getDefRegState(!IsStore);
    const bool IsLastSubReg = i + 1 == e;
    const bool IsFirstSubReg = i == 0;
    if (IsLastSubReg) {
      SOffsetRegState |= getKillRegState(Scavenged);
      // The last implicit use carries the "Kill" flag.
      SrcDstRegState |= getKillRegState(IsKill);
    }

    // Make sure the whole register is defined if there are undef components by
    // adding an implicit def of the super-reg on the first instruction.
    bool NeedSuperRegDef = e > 1 && IsStore && IsFirstSubReg;
    bool NeedSuperRegImpOperand = e > 1;

    // Remaining element size to spill into memory after some parts of it
    // spilled into either AGPRs or VGPRs.
    unsigned RemEltSize = EltSize;

    // AGPRs to spill VGPRs and vice versa are allocated in a reverse order,
    // starting from the last lane, so that a register that cannot be
    // completely spilled into another register keeps its alignment. For
    // targets with a VGPR alignment requirement this is important with flat
    // scratch usage, as we might otherwise get a scratch_load or
    // scratch_store of an unaligned register.
    // For targets with a VGPR alignment requirement this is important in case
1547     // of flat scratch usage, as we might otherwise get a scratch_load or
1548     // scratch_store of an unaligned register.
1549     for (int LaneS = (RegOffset + EltSize) / 4 - 1, Lane = LaneS,
1550              LaneE = RegOffset / 4;
1551          Lane >= LaneE; --Lane) {
1552       bool IsSubReg = e > 1 || EltSize > 4;
1553       Register Sub = IsSubReg
1554           ? Register(getSubReg(ValueReg, getSubRegFromChannel(Lane)))
1555           : ValueReg;
1556       auto MIB = spillVGPRtoAGPR(ST, MBB, MI, Index, Lane, Sub, IsKill);
1557       if (!MIB.getInstr())
1558         break;
1559       if (NeedSuperRegDef || (IsSubReg && IsStore && Lane == LaneS && IsFirstSubReg)) {
1560         MIB.addReg(ValueReg, RegState::ImplicitDefine);
1561         NeedSuperRegDef = false;
1562       }
1563       if ((IsSubReg || NeedSuperRegImpOperand) && (IsFirstSubReg || IsLastSubReg)) {
1564         NeedSuperRegImpOperand = true;
1565         unsigned State = SrcDstRegState;
1566         if (!IsLastSubReg || (Lane != LaneE))
1567           State &= ~RegState::Kill;
1568         if (!IsFirstSubReg || (Lane != LaneS))
1569           State &= ~RegState::Define;
1570         MIB.addReg(ValueReg, RegState::Implicit | State);
1571       }
1572       RemEltSize -= 4;
1573     }
1574
1575     if (!RemEltSize) // Fully spilled into AGPRs.
1576       continue;
1577
1578     if (RemEltSize != EltSize) { // Partially spilled to AGPRs.
1579       assert(IsFlat && EltSize > 4);
1580
1581       unsigned NumRegs = RemEltSize / 4;
1582       SubReg = Register(getSubReg(ValueReg,
1583                                   getSubRegFromChannel(RegOffset / 4, NumRegs)));
1584       unsigned Opc = getFlatScratchSpillOpcode(TII, LoadStoreOp, RemEltSize);
1585       Desc = &TII->get(Opc);
1586     }
1587
1588     Register FinalReg = SubReg;
1589
1590     if (IsAGPR) {
1591       assert(EltSize == 4);
1592
1593       if (!TmpIntermediateVGPR) {
1594         TmpIntermediateVGPR = FuncInfo->getVGPRForAGPRCopy();
1595         assert(MF->getRegInfo().isReserved(TmpIntermediateVGPR));
1596       }
1597       if (IsStore) {
1598         auto AccRead = BuildMI(MBB, MI, DL,
1599                                TII->get(AMDGPU::V_ACCVGPR_READ_B32_e64),
1600                                TmpIntermediateVGPR)
1601                            .addReg(SubReg, getKillRegState(IsKill));
1602         if (NeedSuperRegDef)
1603           AccRead.addReg(ValueReg, RegState::ImplicitDefine);
1604         AccRead->setAsmPrinterFlag(MachineInstr::ReloadReuse);
1605       }
1606       SubReg = TmpIntermediateVGPR;
1607     } else if (UseVGPROffset) {
1608       // FIXME: change to scavengeRegisterBackwards()
1609       if (!TmpOffsetVGPR) {
1610         TmpOffsetVGPR = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0);
1611         RS->setRegUsed(TmpOffsetVGPR);
1612       }
1613     }
1614
1615     MachinePointerInfo PInfo = BasePtrInfo.getWithOffset(RegOffset);
1616     MachineMemOperand *NewMMO =
1617         MF->getMachineMemOperand(PInfo, MMO->getFlags(), RemEltSize,
1618                                  commonAlignment(Alignment, RegOffset));
1619
1620     auto MIB =
1621         BuildMI(MBB, MI, DL, *Desc)
1622             .addReg(SubReg, getDefRegState(!IsStore) | getKillRegState(IsKill));
1623
1624     if (UseVGPROffset) {
1625       // For an AGPR spill, we reuse the same temp VGPR for the offset and the
1626       // intermediate accvgpr_write.
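      // It is therefore only marked killed on the last subregister, and only
      // when it is not also still needed as the accvgpr_write intermediate
      // (this describes the kill state computed on the next line).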
1627 MIB.addReg(TmpOffsetVGPR, getKillRegState(IsLastSubReg && !IsAGPR)); 1628 } 1629 1630 if (!IsFlat) 1631 MIB.addReg(FuncInfo->getScratchRSrcReg()); 1632 1633 if (SOffset == AMDGPU::NoRegister) { 1634 if (!IsFlat) { 1635 if (UseVGPROffset && ScratchOffsetReg) { 1636 MIB.addReg(ScratchOffsetReg); 1637 } else { 1638 assert(FuncInfo->isEntryFunction()); 1639 MIB.addImm(0); 1640 } 1641 } 1642 } else { 1643 MIB.addReg(SOffset, SOffsetRegState); 1644 } 1645 MIB.addImm(Offset + RegOffset) 1646 .addImm(0); // cpol 1647 if (!IsFlat) 1648 MIB.addImm(0); // swz 1649 MIB.addMemOperand(NewMMO); 1650 1651 if (!IsAGPR && NeedSuperRegDef) 1652 MIB.addReg(ValueReg, RegState::ImplicitDefine); 1653 1654 if (!IsStore && IsAGPR && TmpIntermediateVGPR != AMDGPU::NoRegister) { 1655 MIB = BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), 1656 FinalReg) 1657 .addReg(TmpIntermediateVGPR, RegState::Kill); 1658 MIB->setAsmPrinterFlag(MachineInstr::ReloadReuse); 1659 } 1660 1661 if (NeedSuperRegImpOperand && (IsFirstSubReg || IsLastSubReg)) 1662 MIB.addReg(ValueReg, RegState::Implicit | SrcDstRegState); 1663 } 1664 1665 if (ScratchOffsetRegDelta != 0) { 1666 // Subtract the offset we added to the ScratchOffset register. 1667 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), SOffset) 1668 .addReg(SOffset) 1669 .addImm(-ScratchOffsetRegDelta); 1670 } 1671 } 1672 1673 void SIRegisterInfo::buildVGPRSpillLoadStore(SGPRSpillBuilder &SB, int Index, 1674 int Offset, bool IsLoad, 1675 bool IsKill) const { 1676 // Load/store VGPR 1677 MachineFrameInfo &FrameInfo = SB.MF.getFrameInfo(); 1678 assert(FrameInfo.getStackID(Index) != TargetStackID::SGPRSpill); 1679 1680 Register FrameReg = 1681 FrameInfo.isFixedObjectIndex(Index) && hasBasePointer(SB.MF) 1682 ? getBaseRegister() 1683 : getFrameRegister(SB.MF); 1684 1685 Align Alignment = FrameInfo.getObjectAlign(Index); 1686 MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(SB.MF, Index); 1687 MachineMemOperand *MMO = SB.MF.getMachineMemOperand( 1688 PtrInfo, IsLoad ? MachineMemOperand::MOLoad : MachineMemOperand::MOStore, 1689 SB.EltSize, Alignment); 1690 1691 if (IsLoad) { 1692 unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_LOAD_DWORD_SADDR 1693 : AMDGPU::BUFFER_LOAD_DWORD_OFFSET; 1694 buildSpillLoadStore(*SB.MBB, SB.MI, SB.DL, Opc, Index, SB.TmpVGPR, false, 1695 FrameReg, Offset * SB.EltSize, MMO, SB.RS); 1696 } else { 1697 unsigned Opc = ST.enableFlatScratch() ? 
AMDGPU::SCRATCH_STORE_DWORD_SADDR 1698 : AMDGPU::BUFFER_STORE_DWORD_OFFSET; 1699 buildSpillLoadStore(*SB.MBB, SB.MI, SB.DL, Opc, Index, SB.TmpVGPR, IsKill, 1700 FrameReg, Offset * SB.EltSize, MMO, SB.RS); 1701 // This only ever adds one VGPR spill 1702 SB.MFI.addToSpilledVGPRs(1); 1703 } 1704 } 1705 1706 bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI, int Index, 1707 RegScavenger *RS, SlotIndexes *Indexes, 1708 LiveIntervals *LIS, bool OnlyToVGPR) const { 1709 SGPRSpillBuilder SB(*this, *ST.getInstrInfo(), isWave32, MI, Index, RS); 1710 1711 ArrayRef<SpilledReg> VGPRSpills = SB.MFI.getSGPRSpillToVGPRLanes(Index); 1712 bool SpillToVGPR = !VGPRSpills.empty(); 1713 if (OnlyToVGPR && !SpillToVGPR) 1714 return false; 1715 1716 assert(SpillToVGPR || (SB.SuperReg != SB.MFI.getStackPtrOffsetReg() && 1717 SB.SuperReg != SB.MFI.getFrameOffsetReg())); 1718 1719 if (SpillToVGPR) { 1720 1721 assert(SB.NumSubRegs == VGPRSpills.size() && 1722 "Num of VGPR lanes should be equal to num of SGPRs spilled"); 1723 1724 for (unsigned i = 0, e = SB.NumSubRegs; i < e; ++i) { 1725 Register SubReg = 1726 SB.NumSubRegs == 1 1727 ? SB.SuperReg 1728 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i])); 1729 SpilledReg Spill = VGPRSpills[i]; 1730 1731 bool IsFirstSubreg = i == 0; 1732 bool IsLastSubreg = i == SB.NumSubRegs - 1; 1733 bool UseKill = SB.IsKill && IsLastSubreg; 1734 1735 1736 // Mark the "old value of vgpr" input undef only if this is the first sgpr 1737 // spill to this specific vgpr in the first basic block. 1738 auto MIB = BuildMI(*SB.MBB, MI, SB.DL, 1739 SB.TII.get(AMDGPU::V_WRITELANE_B32), Spill.VGPR) 1740 .addReg(SubReg, getKillRegState(UseKill)) 1741 .addImm(Spill.Lane) 1742 .addReg(Spill.VGPR); 1743 if (Indexes) { 1744 if (IsFirstSubreg) 1745 Indexes->replaceMachineInstrInMaps(*MI, *MIB); 1746 else 1747 Indexes->insertMachineInstrInMaps(*MIB); 1748 } 1749 1750 if (IsFirstSubreg && SB.NumSubRegs > 1) { 1751 // We may be spilling a super-register which is only partially defined, 1752 // and need to ensure later spills think the value is defined. 1753 MIB.addReg(SB.SuperReg, RegState::ImplicitDefine); 1754 } 1755 1756 if (SB.NumSubRegs > 1 && (IsFirstSubreg || IsLastSubreg)) 1757 MIB.addReg(SB.SuperReg, getKillRegState(UseKill) | RegState::Implicit); 1758 1759 // FIXME: Since this spills to another register instead of an actual 1760 // frame index, we should delete the frame index when all references to 1761 // it are fixed. 1762 } 1763 } else { 1764 SB.prepare(); 1765 1766 // SubReg carries the "Kill" flag when SubReg == SB.SuperReg. 1767 unsigned SubKillState = getKillRegState((SB.NumSubRegs == 1) && SB.IsKill); 1768 1769 // Per VGPR helper data 1770 auto PVD = SB.getPerVGPRData(); 1771 1772 for (unsigned Offset = 0; Offset < PVD.NumVGPRs; ++Offset) { 1773 unsigned TmpVGPRFlags = RegState::Undef; 1774 1775 // Write sub registers into the VGPR 1776 for (unsigned i = Offset * PVD.PerVGPR, 1777 e = std::min((Offset + 1) * PVD.PerVGPR, SB.NumSubRegs); 1778 i < e; ++i) { 1779 Register SubReg = 1780 SB.NumSubRegs == 1 1781 ? 
SB.SuperReg 1782 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i])); 1783 1784 MachineInstrBuilder WriteLane = 1785 BuildMI(*SB.MBB, MI, SB.DL, SB.TII.get(AMDGPU::V_WRITELANE_B32), 1786 SB.TmpVGPR) 1787 .addReg(SubReg, SubKillState) 1788 .addImm(i % PVD.PerVGPR) 1789 .addReg(SB.TmpVGPR, TmpVGPRFlags); 1790 TmpVGPRFlags = 0; 1791 1792 if (Indexes) { 1793 if (i == 0) 1794 Indexes->replaceMachineInstrInMaps(*MI, *WriteLane); 1795 else 1796 Indexes->insertMachineInstrInMaps(*WriteLane); 1797 } 1798 1799 // There could be undef components of a spilled super register. 1800 // TODO: Can we detect this and skip the spill? 1801 if (SB.NumSubRegs > 1) { 1802 // The last implicit use of the SB.SuperReg carries the "Kill" flag. 1803 unsigned SuperKillState = 0; 1804 if (i + 1 == SB.NumSubRegs) 1805 SuperKillState |= getKillRegState(SB.IsKill); 1806 WriteLane.addReg(SB.SuperReg, RegState::Implicit | SuperKillState); 1807 } 1808 } 1809 1810 // Write out VGPR 1811 SB.readWriteTmpVGPR(Offset, /*IsLoad*/ false); 1812 } 1813 1814 SB.restore(); 1815 } 1816 1817 MI->eraseFromParent(); 1818 SB.MFI.addToSpilledSGPRs(SB.NumSubRegs); 1819 1820 if (LIS) 1821 LIS->removeAllRegUnitsForPhysReg(SB.SuperReg); 1822 1823 return true; 1824 } 1825 1826 bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI, int Index, 1827 RegScavenger *RS, SlotIndexes *Indexes, 1828 LiveIntervals *LIS, bool OnlyToVGPR) const { 1829 SGPRSpillBuilder SB(*this, *ST.getInstrInfo(), isWave32, MI, Index, RS); 1830 1831 ArrayRef<SpilledReg> VGPRSpills = SB.MFI.getSGPRSpillToVGPRLanes(Index); 1832 bool SpillToVGPR = !VGPRSpills.empty(); 1833 if (OnlyToVGPR && !SpillToVGPR) 1834 return false; 1835 1836 if (SpillToVGPR) { 1837 for (unsigned i = 0, e = SB.NumSubRegs; i < e; ++i) { 1838 Register SubReg = 1839 SB.NumSubRegs == 1 1840 ? SB.SuperReg 1841 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i])); 1842 1843 SpilledReg Spill = VGPRSpills[i]; 1844 auto MIB = BuildMI(*SB.MBB, MI, SB.DL, SB.TII.get(AMDGPU::V_READLANE_B32), 1845 SubReg) 1846 .addReg(Spill.VGPR) 1847 .addImm(Spill.Lane); 1848 if (SB.NumSubRegs > 1 && i == 0) 1849 MIB.addReg(SB.SuperReg, RegState::ImplicitDefine); 1850 if (Indexes) { 1851 if (i == e - 1) 1852 Indexes->replaceMachineInstrInMaps(*MI, *MIB); 1853 else 1854 Indexes->insertMachineInstrInMaps(*MIB); 1855 } 1856 } 1857 } else { 1858 SB.prepare(); 1859 1860 // Per VGPR helper data 1861 auto PVD = SB.getPerVGPRData(); 1862 1863 for (unsigned Offset = 0; Offset < PVD.NumVGPRs; ++Offset) { 1864 // Load in VGPR data 1865 SB.readWriteTmpVGPR(Offset, /*IsLoad*/ true); 1866 1867 // Unpack lanes 1868 for (unsigned i = Offset * PVD.PerVGPR, 1869 e = std::min((Offset + 1) * PVD.PerVGPR, SB.NumSubRegs); 1870 i < e; ++i) { 1871 Register SubReg = 1872 SB.NumSubRegs == 1 1873 ? 
SB.SuperReg 1874 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i])); 1875 1876 bool LastSubReg = (i + 1 == e); 1877 auto MIB = BuildMI(*SB.MBB, MI, SB.DL, 1878 SB.TII.get(AMDGPU::V_READLANE_B32), SubReg) 1879 .addReg(SB.TmpVGPR, getKillRegState(LastSubReg)) 1880 .addImm(i); 1881 if (SB.NumSubRegs > 1 && i == 0) 1882 MIB.addReg(SB.SuperReg, RegState::ImplicitDefine); 1883 if (Indexes) { 1884 if (i == e - 1) 1885 Indexes->replaceMachineInstrInMaps(*MI, *MIB); 1886 else 1887 Indexes->insertMachineInstrInMaps(*MIB); 1888 } 1889 } 1890 } 1891 1892 SB.restore(); 1893 } 1894 1895 MI->eraseFromParent(); 1896 1897 if (LIS) 1898 LIS->removeAllRegUnitsForPhysReg(SB.SuperReg); 1899 1900 return true; 1901 } 1902 1903 bool SIRegisterInfo::spillEmergencySGPR(MachineBasicBlock::iterator MI, 1904 MachineBasicBlock &RestoreMBB, 1905 Register SGPR, RegScavenger *RS) const { 1906 SGPRSpillBuilder SB(*this, *ST.getInstrInfo(), isWave32, MI, SGPR, false, 0, 1907 RS); 1908 SB.prepare(); 1909 // Generate the spill of SGPR to SB.TmpVGPR. 1910 unsigned SubKillState = getKillRegState((SB.NumSubRegs == 1) && SB.IsKill); 1911 auto PVD = SB.getPerVGPRData(); 1912 for (unsigned Offset = 0; Offset < PVD.NumVGPRs; ++Offset) { 1913 unsigned TmpVGPRFlags = RegState::Undef; 1914 // Write sub registers into the VGPR 1915 for (unsigned i = Offset * PVD.PerVGPR, 1916 e = std::min((Offset + 1) * PVD.PerVGPR, SB.NumSubRegs); 1917 i < e; ++i) { 1918 Register SubReg = 1919 SB.NumSubRegs == 1 1920 ? SB.SuperReg 1921 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i])); 1922 1923 MachineInstrBuilder WriteLane = 1924 BuildMI(*SB.MBB, MI, SB.DL, SB.TII.get(AMDGPU::V_WRITELANE_B32), 1925 SB.TmpVGPR) 1926 .addReg(SubReg, SubKillState) 1927 .addImm(i % PVD.PerVGPR) 1928 .addReg(SB.TmpVGPR, TmpVGPRFlags); 1929 TmpVGPRFlags = 0; 1930 // There could be undef components of a spilled super register. 1931 // TODO: Can we detect this and skip the spill? 1932 if (SB.NumSubRegs > 1) { 1933 // The last implicit use of the SB.SuperReg carries the "Kill" flag. 1934 unsigned SuperKillState = 0; 1935 if (i + 1 == SB.NumSubRegs) 1936 SuperKillState |= getKillRegState(SB.IsKill); 1937 WriteLane.addReg(SB.SuperReg, RegState::Implicit | SuperKillState); 1938 } 1939 } 1940 // Don't need to write VGPR out. 1941 } 1942 1943 // Restore clobbered registers in the specified restore block. 1944 MI = RestoreMBB.end(); 1945 SB.setMI(&RestoreMBB, MI); 1946 // Generate the restore of SGPR from SB.TmpVGPR. 1947 for (unsigned Offset = 0; Offset < PVD.NumVGPRs; ++Offset) { 1948 // Don't need to load VGPR in. 1949 // Unpack lanes 1950 for (unsigned i = Offset * PVD.PerVGPR, 1951 e = std::min((Offset + 1) * PVD.PerVGPR, SB.NumSubRegs); 1952 i < e; ++i) { 1953 Register SubReg = 1954 SB.NumSubRegs == 1 1955 ? SB.SuperReg 1956 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i])); 1957 bool LastSubReg = (i + 1 == e); 1958 auto MIB = BuildMI(*SB.MBB, MI, SB.DL, SB.TII.get(AMDGPU::V_READLANE_B32), 1959 SubReg) 1960 .addReg(SB.TmpVGPR, getKillRegState(LastSubReg)) 1961 .addImm(i); 1962 if (SB.NumSubRegs > 1 && i == 0) 1963 MIB.addReg(SB.SuperReg, RegState::ImplicitDefine); 1964 } 1965 } 1966 SB.restore(); 1967 1968 SB.MFI.addToSpilledSGPRs(SB.NumSubRegs); 1969 return false; 1970 } 1971 1972 /// Special case of eliminateFrameIndex. Returns true if the SGPR was spilled to 1973 /// a VGPR and the stack slot can be safely eliminated when all other users are 1974 /// handled. 
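/// Only SGPR spill and restore pseudos are expected here; any other opcode
/// hits the llvm_unreachable in the switch below.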
1975 bool SIRegisterInfo::eliminateSGPRToVGPRSpillFrameIndex( 1976 MachineBasicBlock::iterator MI, int FI, RegScavenger *RS, 1977 SlotIndexes *Indexes, LiveIntervals *LIS) const { 1978 switch (MI->getOpcode()) { 1979 case AMDGPU::SI_SPILL_S1024_SAVE: 1980 case AMDGPU::SI_SPILL_S512_SAVE: 1981 case AMDGPU::SI_SPILL_S384_SAVE: 1982 case AMDGPU::SI_SPILL_S352_SAVE: 1983 case AMDGPU::SI_SPILL_S320_SAVE: 1984 case AMDGPU::SI_SPILL_S288_SAVE: 1985 case AMDGPU::SI_SPILL_S256_SAVE: 1986 case AMDGPU::SI_SPILL_S224_SAVE: 1987 case AMDGPU::SI_SPILL_S192_SAVE: 1988 case AMDGPU::SI_SPILL_S160_SAVE: 1989 case AMDGPU::SI_SPILL_S128_SAVE: 1990 case AMDGPU::SI_SPILL_S96_SAVE: 1991 case AMDGPU::SI_SPILL_S64_SAVE: 1992 case AMDGPU::SI_SPILL_S32_SAVE: 1993 return spillSGPR(MI, FI, RS, Indexes, LIS, true); 1994 case AMDGPU::SI_SPILL_S1024_RESTORE: 1995 case AMDGPU::SI_SPILL_S512_RESTORE: 1996 case AMDGPU::SI_SPILL_S384_RESTORE: 1997 case AMDGPU::SI_SPILL_S352_RESTORE: 1998 case AMDGPU::SI_SPILL_S320_RESTORE: 1999 case AMDGPU::SI_SPILL_S288_RESTORE: 2000 case AMDGPU::SI_SPILL_S256_RESTORE: 2001 case AMDGPU::SI_SPILL_S224_RESTORE: 2002 case AMDGPU::SI_SPILL_S192_RESTORE: 2003 case AMDGPU::SI_SPILL_S160_RESTORE: 2004 case AMDGPU::SI_SPILL_S128_RESTORE: 2005 case AMDGPU::SI_SPILL_S96_RESTORE: 2006 case AMDGPU::SI_SPILL_S64_RESTORE: 2007 case AMDGPU::SI_SPILL_S32_RESTORE: 2008 return restoreSGPR(MI, FI, RS, Indexes, LIS, true); 2009 default: 2010 llvm_unreachable("not an SGPR spill instruction"); 2011 } 2012 } 2013 2014 bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, 2015 int SPAdj, unsigned FIOperandNum, 2016 RegScavenger *RS) const { 2017 MachineFunction *MF = MI->getParent()->getParent(); 2018 MachineBasicBlock *MBB = MI->getParent(); 2019 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); 2020 MachineFrameInfo &FrameInfo = MF->getFrameInfo(); 2021 const SIInstrInfo *TII = ST.getInstrInfo(); 2022 DebugLoc DL = MI->getDebugLoc(); 2023 2024 assert(SPAdj == 0 && "unhandled SP adjustment in call sequence?"); 2025 2026 MachineOperand &FIOp = MI->getOperand(FIOperandNum); 2027 int Index = MI->getOperand(FIOperandNum).getIndex(); 2028 2029 Register FrameReg = FrameInfo.isFixedObjectIndex(Index) && hasBasePointer(*MF) 2030 ? 
getBaseRegister() 2031 : getFrameRegister(*MF); 2032 2033 switch (MI->getOpcode()) { 2034 // SGPR register spill 2035 case AMDGPU::SI_SPILL_S1024_SAVE: 2036 case AMDGPU::SI_SPILL_S512_SAVE: 2037 case AMDGPU::SI_SPILL_S384_SAVE: 2038 case AMDGPU::SI_SPILL_S352_SAVE: 2039 case AMDGPU::SI_SPILL_S320_SAVE: 2040 case AMDGPU::SI_SPILL_S288_SAVE: 2041 case AMDGPU::SI_SPILL_S256_SAVE: 2042 case AMDGPU::SI_SPILL_S224_SAVE: 2043 case AMDGPU::SI_SPILL_S192_SAVE: 2044 case AMDGPU::SI_SPILL_S160_SAVE: 2045 case AMDGPU::SI_SPILL_S128_SAVE: 2046 case AMDGPU::SI_SPILL_S96_SAVE: 2047 case AMDGPU::SI_SPILL_S64_SAVE: 2048 case AMDGPU::SI_SPILL_S32_SAVE: { 2049 return spillSGPR(MI, Index, RS); 2050 } 2051 2052 // SGPR register restore 2053 case AMDGPU::SI_SPILL_S1024_RESTORE: 2054 case AMDGPU::SI_SPILL_S512_RESTORE: 2055 case AMDGPU::SI_SPILL_S384_RESTORE: 2056 case AMDGPU::SI_SPILL_S352_RESTORE: 2057 case AMDGPU::SI_SPILL_S320_RESTORE: 2058 case AMDGPU::SI_SPILL_S288_RESTORE: 2059 case AMDGPU::SI_SPILL_S256_RESTORE: 2060 case AMDGPU::SI_SPILL_S224_RESTORE: 2061 case AMDGPU::SI_SPILL_S192_RESTORE: 2062 case AMDGPU::SI_SPILL_S160_RESTORE: 2063 case AMDGPU::SI_SPILL_S128_RESTORE: 2064 case AMDGPU::SI_SPILL_S96_RESTORE: 2065 case AMDGPU::SI_SPILL_S64_RESTORE: 2066 case AMDGPU::SI_SPILL_S32_RESTORE: { 2067 return restoreSGPR(MI, Index, RS); 2068 } 2069 2070 // VGPR register spill 2071 case AMDGPU::SI_SPILL_V1024_SAVE: 2072 case AMDGPU::SI_SPILL_V512_SAVE: 2073 case AMDGPU::SI_SPILL_V384_SAVE: 2074 case AMDGPU::SI_SPILL_V352_SAVE: 2075 case AMDGPU::SI_SPILL_V320_SAVE: 2076 case AMDGPU::SI_SPILL_V288_SAVE: 2077 case AMDGPU::SI_SPILL_V256_SAVE: 2078 case AMDGPU::SI_SPILL_V224_SAVE: 2079 case AMDGPU::SI_SPILL_V192_SAVE: 2080 case AMDGPU::SI_SPILL_V160_SAVE: 2081 case AMDGPU::SI_SPILL_V128_SAVE: 2082 case AMDGPU::SI_SPILL_V96_SAVE: 2083 case AMDGPU::SI_SPILL_V64_SAVE: 2084 case AMDGPU::SI_SPILL_V32_SAVE: 2085 case AMDGPU::SI_SPILL_A1024_SAVE: 2086 case AMDGPU::SI_SPILL_A512_SAVE: 2087 case AMDGPU::SI_SPILL_A384_SAVE: 2088 case AMDGPU::SI_SPILL_A352_SAVE: 2089 case AMDGPU::SI_SPILL_A320_SAVE: 2090 case AMDGPU::SI_SPILL_A288_SAVE: 2091 case AMDGPU::SI_SPILL_A256_SAVE: 2092 case AMDGPU::SI_SPILL_A224_SAVE: 2093 case AMDGPU::SI_SPILL_A192_SAVE: 2094 case AMDGPU::SI_SPILL_A160_SAVE: 2095 case AMDGPU::SI_SPILL_A128_SAVE: 2096 case AMDGPU::SI_SPILL_A96_SAVE: 2097 case AMDGPU::SI_SPILL_A64_SAVE: 2098 case AMDGPU::SI_SPILL_A32_SAVE: 2099 case AMDGPU::SI_SPILL_AV1024_SAVE: 2100 case AMDGPU::SI_SPILL_AV512_SAVE: 2101 case AMDGPU::SI_SPILL_AV384_SAVE: 2102 case AMDGPU::SI_SPILL_AV352_SAVE: 2103 case AMDGPU::SI_SPILL_AV320_SAVE: 2104 case AMDGPU::SI_SPILL_AV288_SAVE: 2105 case AMDGPU::SI_SPILL_AV256_SAVE: 2106 case AMDGPU::SI_SPILL_AV224_SAVE: 2107 case AMDGPU::SI_SPILL_AV192_SAVE: 2108 case AMDGPU::SI_SPILL_AV160_SAVE: 2109 case AMDGPU::SI_SPILL_AV128_SAVE: 2110 case AMDGPU::SI_SPILL_AV96_SAVE: 2111 case AMDGPU::SI_SPILL_AV64_SAVE: 2112 case AMDGPU::SI_SPILL_AV32_SAVE: { 2113 const MachineOperand *VData = TII->getNamedOperand(*MI, 2114 AMDGPU::OpName::vdata); 2115 assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() == 2116 MFI->getStackPtrOffsetReg()); 2117 2118 unsigned Opc = ST.enableFlatScratch() ? 
AMDGPU::SCRATCH_STORE_DWORD_SADDR 2119 : AMDGPU::BUFFER_STORE_DWORD_OFFSET; 2120 auto *MBB = MI->getParent(); 2121 buildSpillLoadStore( 2122 *MBB, MI, DL, Opc, Index, VData->getReg(), VData->isKill(), FrameReg, 2123 TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(), 2124 *MI->memoperands_begin(), RS); 2125 MFI->addToSpilledVGPRs(getNumSubRegsForSpillOp(MI->getOpcode())); 2126 MI->eraseFromParent(); 2127 return true; 2128 } 2129 case AMDGPU::SI_SPILL_V32_RESTORE: 2130 case AMDGPU::SI_SPILL_V64_RESTORE: 2131 case AMDGPU::SI_SPILL_V96_RESTORE: 2132 case AMDGPU::SI_SPILL_V128_RESTORE: 2133 case AMDGPU::SI_SPILL_V160_RESTORE: 2134 case AMDGPU::SI_SPILL_V192_RESTORE: 2135 case AMDGPU::SI_SPILL_V224_RESTORE: 2136 case AMDGPU::SI_SPILL_V256_RESTORE: 2137 case AMDGPU::SI_SPILL_V288_RESTORE: 2138 case AMDGPU::SI_SPILL_V320_RESTORE: 2139 case AMDGPU::SI_SPILL_V352_RESTORE: 2140 case AMDGPU::SI_SPILL_V384_RESTORE: 2141 case AMDGPU::SI_SPILL_V512_RESTORE: 2142 case AMDGPU::SI_SPILL_V1024_RESTORE: 2143 case AMDGPU::SI_SPILL_A32_RESTORE: 2144 case AMDGPU::SI_SPILL_A64_RESTORE: 2145 case AMDGPU::SI_SPILL_A96_RESTORE: 2146 case AMDGPU::SI_SPILL_A128_RESTORE: 2147 case AMDGPU::SI_SPILL_A160_RESTORE: 2148 case AMDGPU::SI_SPILL_A192_RESTORE: 2149 case AMDGPU::SI_SPILL_A224_RESTORE: 2150 case AMDGPU::SI_SPILL_A256_RESTORE: 2151 case AMDGPU::SI_SPILL_A288_RESTORE: 2152 case AMDGPU::SI_SPILL_A320_RESTORE: 2153 case AMDGPU::SI_SPILL_A352_RESTORE: 2154 case AMDGPU::SI_SPILL_A384_RESTORE: 2155 case AMDGPU::SI_SPILL_A512_RESTORE: 2156 case AMDGPU::SI_SPILL_A1024_RESTORE: 2157 case AMDGPU::SI_SPILL_AV32_RESTORE: 2158 case AMDGPU::SI_SPILL_AV64_RESTORE: 2159 case AMDGPU::SI_SPILL_AV96_RESTORE: 2160 case AMDGPU::SI_SPILL_AV128_RESTORE: 2161 case AMDGPU::SI_SPILL_AV160_RESTORE: 2162 case AMDGPU::SI_SPILL_AV192_RESTORE: 2163 case AMDGPU::SI_SPILL_AV224_RESTORE: 2164 case AMDGPU::SI_SPILL_AV256_RESTORE: 2165 case AMDGPU::SI_SPILL_AV288_RESTORE: 2166 case AMDGPU::SI_SPILL_AV320_RESTORE: 2167 case AMDGPU::SI_SPILL_AV352_RESTORE: 2168 case AMDGPU::SI_SPILL_AV384_RESTORE: 2169 case AMDGPU::SI_SPILL_AV512_RESTORE: 2170 case AMDGPU::SI_SPILL_AV1024_RESTORE: { 2171 const MachineOperand *VData = TII->getNamedOperand(*MI, 2172 AMDGPU::OpName::vdata); 2173 assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() == 2174 MFI->getStackPtrOffsetReg()); 2175 2176 unsigned Opc = ST.enableFlatScratch() ? 
AMDGPU::SCRATCH_LOAD_DWORD_SADDR
2177                                 : AMDGPU::BUFFER_LOAD_DWORD_OFFSET;
2178     auto *MBB = MI->getParent();
2179     buildSpillLoadStore(
2180         *MBB, MI, DL, Opc, Index, VData->getReg(), VData->isKill(), FrameReg,
2181         TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(),
2182         *MI->memoperands_begin(), RS);
2183     MI->eraseFromParent();
2184     return true;
2185   }
2186
2187   default: {
2188     // Other access to frame index.
2189     const DebugLoc &DL = MI->getDebugLoc();
2190
2191     int64_t Offset = FrameInfo.getObjectOffset(Index);
2192     if (ST.enableFlatScratch()) {
2193       if (TII->isFLATScratch(*MI)) {
2194         assert((int16_t)FIOperandNum ==
2195                AMDGPU::getNamedOperandIdx(MI->getOpcode(),
2196                                           AMDGPU::OpName::saddr));
2197
2198         // The offset is always swizzled, just replace it.
2199         if (FrameReg)
2200           FIOp.ChangeToRegister(FrameReg, false);
2201
2202         if (!Offset)
2203           return false;
2204
2205         MachineOperand *OffsetOp =
2206             TII->getNamedOperand(*MI, AMDGPU::OpName::offset);
2207         int64_t NewOffset = Offset + OffsetOp->getImm();
2208         if (TII->isLegalFLATOffset(NewOffset, AMDGPUAS::PRIVATE_ADDRESS,
2209                                    SIInstrFlags::FlatScratch)) {
2210           OffsetOp->setImm(NewOffset);
2211           if (FrameReg)
2212             return false;
2213           Offset = 0;
2214         }
2215
2216         if (!Offset) {
2217           unsigned Opc = MI->getOpcode();
2218           int NewOpc = -1;
2219           if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr)) {
2220             NewOpc = AMDGPU::getFlatScratchInstSVfromSVS(Opc);
2221           } else if (ST.hasFlatScratchSTMode()) {
2222             // On GFX10 we have ST mode to use no registers for an address.
2223             // Otherwise we need to materialize 0 into an SGPR.
2224             NewOpc = AMDGPU::getFlatScratchInstSTfromSS(Opc);
2225           }
2226
2227           if (NewOpc != -1) {
2228             // removeOperand doesn't fix up tied operand indexes as it goes,
2229             // so it asserts. Untie vdst_in for now and retie it afterwards.
2230             int VDstIn = AMDGPU::getNamedOperandIdx(Opc,
2231                                                     AMDGPU::OpName::vdst_in);
2232             bool TiedVDst = VDstIn != -1 &&
2233                             MI->getOperand(VDstIn).isReg() &&
2234                             MI->getOperand(VDstIn).isTied();
2235             if (TiedVDst)
2236               MI->untieRegOperand(VDstIn);
2237
2238             MI->removeOperand(
2239                 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr));
2240
2241             if (TiedVDst) {
2242               int NewVDst =
2243                   AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vdst);
2244               int NewVDstIn =
2245                   AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vdst_in);
2246               assert(NewVDst != -1 && NewVDstIn != -1 && "Must be tied!");
2247               MI->tieOperands(NewVDst, NewVDstIn);
2248             }
2249             MI->setDesc(TII->get(NewOpc));
2250             return false;
2251           }
2252         }
2253       }
2254
2255       if (!FrameReg) {
2256         FIOp.ChangeToImmediate(Offset);
2257         if (TII->isImmOperandLegal(*MI, FIOperandNum, FIOp))
2258           return false;
2259       }
2260
2261       // We need to use a register here. Check if we can use an SGPR or need
2262       // a VGPR.
2263       FIOp.ChangeToRegister(AMDGPU::M0, false);
2264       bool UseSGPR = TII->isOperandLegal(*MI, FIOperandNum, &FIOp);
2265
2266       if (!Offset && FrameReg && UseSGPR) {
2267         FIOp.setReg(FrameReg);
2268         return false;
2269       }
2270
2271       const TargetRegisterClass *RC = UseSGPR ? &AMDGPU::SReg_32_XM0RegClass
2272                                               : &AMDGPU::VGPR_32RegClass;
2273
2274       Register TmpReg = RS->scavengeRegister(RC, MI, 0, !UseSGPR);
2275       FIOp.setReg(TmpReg);
2276       FIOp.setIsKill();
2277
2278       if ((!FrameReg || !Offset) && TmpReg) {
2279         unsigned Opc = UseSGPR ?
AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32; 2280 auto MIB = BuildMI(*MBB, MI, DL, TII->get(Opc), TmpReg); 2281 if (FrameReg) 2282 MIB.addReg(FrameReg); 2283 else 2284 MIB.addImm(Offset); 2285 2286 return false; 2287 } 2288 2289 bool NeedSaveSCC = 2290 RS->isRegUsed(AMDGPU::SCC) && !MI->definesRegister(AMDGPU::SCC); 2291 2292 Register TmpSReg = 2293 UseSGPR ? TmpReg 2294 : RS->scavengeRegister(&AMDGPU::SReg_32_XM0RegClass, MI, 0, 2295 !UseSGPR); 2296 2297 // TODO: for flat scratch another attempt can be made with a VGPR index 2298 // if no SGPRs can be scavenged. 2299 if ((!TmpSReg && !FrameReg) || (!TmpReg && !UseSGPR)) 2300 report_fatal_error("Cannot scavenge register in FI elimination!"); 2301 2302 if (!TmpSReg) { 2303 // Use frame register and restore it after. 2304 TmpSReg = FrameReg; 2305 FIOp.setReg(FrameReg); 2306 FIOp.setIsKill(false); 2307 } 2308 2309 if (NeedSaveSCC) { 2310 assert(!(Offset & 0x1) && "Flat scratch offset must be aligned!"); 2311 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADDC_U32), TmpSReg) 2312 .addReg(FrameReg) 2313 .addImm(Offset); 2314 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_BITCMP1_B32)) 2315 .addReg(TmpSReg) 2316 .addImm(0); 2317 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_BITSET0_B32), TmpSReg) 2318 .addImm(0) 2319 .addReg(TmpSReg); 2320 } else { 2321 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), TmpSReg) 2322 .addReg(FrameReg) 2323 .addImm(Offset); 2324 } 2325 2326 if (!UseSGPR) 2327 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg) 2328 .addReg(TmpSReg, RegState::Kill); 2329 2330 if (TmpSReg == FrameReg) { 2331 // Undo frame register modification. 2332 if (NeedSaveSCC && !MI->registerDefIsDead(AMDGPU::SCC)) { 2333 MachineBasicBlock::iterator I = 2334 BuildMI(*MBB, std::next(MI), DL, TII->get(AMDGPU::S_ADDC_U32), 2335 TmpSReg) 2336 .addReg(FrameReg) 2337 .addImm(-Offset); 2338 I = BuildMI(*MBB, std::next(I), DL, TII->get(AMDGPU::S_BITCMP1_B32)) 2339 .addReg(TmpSReg) 2340 .addImm(0); 2341 BuildMI(*MBB, std::next(I), DL, TII->get(AMDGPU::S_BITSET0_B32), 2342 TmpSReg) 2343 .addImm(0) 2344 .addReg(TmpSReg); 2345 } else { 2346 BuildMI(*MBB, std::next(MI), DL, TII->get(AMDGPU::S_ADD_I32), 2347 FrameReg) 2348 .addReg(FrameReg) 2349 .addImm(-Offset); 2350 } 2351 } 2352 2353 return false; 2354 } 2355 2356 bool IsMUBUF = TII->isMUBUF(*MI); 2357 2358 if (!IsMUBUF && !MFI->isEntryFunction()) { 2359 // Convert to a swizzled stack address by scaling by the wave size. 2360 // In an entry function/kernel the offset is already swizzled. 2361 bool IsSALU = isSGPRClass(TII->getOpRegClass(*MI, FIOperandNum)); 2362 bool LiveSCC = 2363 RS->isRegUsed(AMDGPU::SCC) && !MI->definesRegister(AMDGPU::SCC); 2364 const TargetRegisterClass *RC = IsSALU && !LiveSCC 2365 ? &AMDGPU::SReg_32RegClass 2366 : &AMDGPU::VGPR_32RegClass; 2367 bool IsCopy = MI->getOpcode() == AMDGPU::V_MOV_B32_e32 || 2368 MI->getOpcode() == AMDGPU::V_MOV_B32_e64; 2369 Register ResultReg = IsCopy ? MI->getOperand(0).getReg() 2370 : RS->scavengeRegister(RC, MI, 0); 2371 2372 int64_t Offset = FrameInfo.getObjectOffset(Index); 2373 if (Offset == 0) { 2374 unsigned OpCode = IsSALU && !LiveSCC ? AMDGPU::S_LSHR_B32 2375 : AMDGPU::V_LSHRREV_B32_e64; 2376 // XXX - This never happens because of emergency scavenging slot at 0? 2377 auto Shift = BuildMI(*MBB, MI, DL, TII->get(OpCode), ResultReg) 2378 .addImm(ST.getWavefrontSizeLog2()) 2379 .addReg(FrameReg); 2380 if (IsSALU && !LiveSCC) 2381 Shift.getInstr()->getOperand(3).setIsDead(); // Mark SCC as dead. 
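          // If SCC is live, the shift above was done on the VALU into a VGPR
          // instead, so copy the uniform result back to an SGPR with
          // v_readfirstlane.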
2382 if (IsSALU && LiveSCC) { 2383 Register NewDest = 2384 RS->scavengeRegister(&AMDGPU::SReg_32RegClass, Shift, 0); 2385 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), 2386 NewDest) 2387 .addReg(ResultReg); 2388 ResultReg = NewDest; 2389 } 2390 } else { 2391 MachineInstrBuilder MIB; 2392 if (!IsSALU) { 2393 if ((MIB = TII->getAddNoCarry(*MBB, MI, DL, ResultReg, *RS)) != 2394 nullptr) { 2395 // Reuse ResultReg in intermediate step. 2396 Register ScaledReg = ResultReg; 2397 2398 BuildMI(*MBB, *MIB, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64), 2399 ScaledReg) 2400 .addImm(ST.getWavefrontSizeLog2()) 2401 .addReg(FrameReg); 2402 2403 const bool IsVOP2 = MIB->getOpcode() == AMDGPU::V_ADD_U32_e32; 2404 2405 // TODO: Fold if use instruction is another add of a constant. 2406 if (IsVOP2 || AMDGPU::isInlinableLiteral32(Offset, ST.hasInv2PiInlineImm())) { 2407 // FIXME: This can fail 2408 MIB.addImm(Offset); 2409 MIB.addReg(ScaledReg, RegState::Kill); 2410 if (!IsVOP2) 2411 MIB.addImm(0); // clamp bit 2412 } else { 2413 assert(MIB->getOpcode() == AMDGPU::V_ADD_CO_U32_e64 && 2414 "Need to reuse carry out register"); 2415 2416 // Use scavenged unused carry out as offset register. 2417 Register ConstOffsetReg; 2418 if (!isWave32) 2419 ConstOffsetReg = getSubReg(MIB.getReg(1), AMDGPU::sub0); 2420 else 2421 ConstOffsetReg = MIB.getReg(1); 2422 2423 BuildMI(*MBB, *MIB, DL, TII->get(AMDGPU::S_MOV_B32), ConstOffsetReg) 2424 .addImm(Offset); 2425 MIB.addReg(ConstOffsetReg, RegState::Kill); 2426 MIB.addReg(ScaledReg, RegState::Kill); 2427 MIB.addImm(0); // clamp bit 2428 } 2429 } 2430 } 2431 if (!MIB || IsSALU) { 2432 // We have to produce a carry out, and there isn't a free SGPR pair 2433 // for it. We can keep the whole computation on the SALU to avoid 2434 // clobbering an additional register at the cost of an extra mov. 2435 2436 // We may have 1 free scratch SGPR even though a carry out is 2437 // unavailable. Only one additional mov is needed. 2438 Register TmpScaledReg = 2439 RS->scavengeRegister(&AMDGPU::SReg_32_XM0RegClass, MI, 0, false); 2440 Register ScaledReg = TmpScaledReg.isValid() ? TmpScaledReg : FrameReg; 2441 2442 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHR_B32), ScaledReg) 2443 .addReg(FrameReg) 2444 .addImm(ST.getWavefrontSizeLog2()); 2445 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), ScaledReg) 2446 .addReg(ScaledReg, RegState::Kill) 2447 .addImm(Offset); 2448 if (!IsSALU) 2449 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), ResultReg) 2450 .addReg(ScaledReg, RegState::Kill); 2451 else 2452 ResultReg = ScaledReg; 2453 2454 // If there were truly no free SGPRs, we need to undo everything. 2455 if (!TmpScaledReg.isValid()) { 2456 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), ScaledReg) 2457 .addReg(ScaledReg, RegState::Kill) 2458 .addImm(-Offset); 2459 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHL_B32), ScaledReg) 2460 .addReg(FrameReg) 2461 .addImm(ST.getWavefrontSizeLog2()); 2462 } 2463 } 2464 } 2465 2466 // Don't introduce an extra copy if we're just materializing in a mov. 2467 if (IsCopy) { 2468 MI->eraseFromParent(); 2469 return true; 2470 } 2471 FIOp.ChangeToRegister(ResultReg, false, false, true); 2472 return false; 2473 } 2474 2475 if (IsMUBUF) { 2476 // Disable offen so we don't need a 0 vgpr base. 
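      // The frame index sits in the vaddr slot, so rewrite to the no-vaddr
      // OFFSET form where possible, folding the frame register into soffset
      // and the object offset into the immediate.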
2477 assert(static_cast<int>(FIOperandNum) == 2478 AMDGPU::getNamedOperandIdx(MI->getOpcode(), 2479 AMDGPU::OpName::vaddr)); 2480 2481 auto &SOffset = *TII->getNamedOperand(*MI, AMDGPU::OpName::soffset); 2482 assert((SOffset.isImm() && SOffset.getImm() == 0)); 2483 2484 if (FrameReg != AMDGPU::NoRegister) 2485 SOffset.ChangeToRegister(FrameReg, false); 2486 2487 int64_t Offset = FrameInfo.getObjectOffset(Index); 2488 int64_t OldImm 2489 = TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(); 2490 int64_t NewOffset = OldImm + Offset; 2491 2492 if (SIInstrInfo::isLegalMUBUFImmOffset(NewOffset) && 2493 buildMUBUFOffsetLoadStore(ST, FrameInfo, MI, Index, NewOffset)) { 2494 MI->eraseFromParent(); 2495 return true; 2496 } 2497 } 2498 2499 // If the offset is simply too big, don't convert to a scratch wave offset 2500 // relative index. 2501 2502 FIOp.ChangeToImmediate(Offset); 2503 if (!TII->isImmOperandLegal(*MI, FIOperandNum, FIOp)) { 2504 Register TmpReg = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0); 2505 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg) 2506 .addImm(Offset); 2507 FIOp.ChangeToRegister(TmpReg, false, false, true); 2508 } 2509 } 2510 } 2511 return false; 2512 } 2513 2514 StringRef SIRegisterInfo::getRegAsmName(MCRegister Reg) const { 2515 return AMDGPUInstPrinter::getRegisterName(Reg); 2516 } 2517 2518 static const TargetRegisterClass * 2519 getAnyVGPRClassForBitWidth(unsigned BitWidth) { 2520 if (BitWidth <= 64) 2521 return &AMDGPU::VReg_64RegClass; 2522 if (BitWidth <= 96) 2523 return &AMDGPU::VReg_96RegClass; 2524 if (BitWidth <= 128) 2525 return &AMDGPU::VReg_128RegClass; 2526 if (BitWidth <= 160) 2527 return &AMDGPU::VReg_160RegClass; 2528 if (BitWidth <= 192) 2529 return &AMDGPU::VReg_192RegClass; 2530 if (BitWidth <= 224) 2531 return &AMDGPU::VReg_224RegClass; 2532 if (BitWidth <= 256) 2533 return &AMDGPU::VReg_256RegClass; 2534 if (BitWidth <= 288) 2535 return &AMDGPU::VReg_288RegClass; 2536 if (BitWidth <= 320) 2537 return &AMDGPU::VReg_320RegClass; 2538 if (BitWidth <= 352) 2539 return &AMDGPU::VReg_352RegClass; 2540 if (BitWidth <= 384) 2541 return &AMDGPU::VReg_384RegClass; 2542 if (BitWidth <= 512) 2543 return &AMDGPU::VReg_512RegClass; 2544 if (BitWidth <= 1024) 2545 return &AMDGPU::VReg_1024RegClass; 2546 2547 return nullptr; 2548 } 2549 2550 static const TargetRegisterClass * 2551 getAlignedVGPRClassForBitWidth(unsigned BitWidth) { 2552 if (BitWidth <= 64) 2553 return &AMDGPU::VReg_64_Align2RegClass; 2554 if (BitWidth <= 96) 2555 return &AMDGPU::VReg_96_Align2RegClass; 2556 if (BitWidth <= 128) 2557 return &AMDGPU::VReg_128_Align2RegClass; 2558 if (BitWidth <= 160) 2559 return &AMDGPU::VReg_160_Align2RegClass; 2560 if (BitWidth <= 192) 2561 return &AMDGPU::VReg_192_Align2RegClass; 2562 if (BitWidth <= 224) 2563 return &AMDGPU::VReg_224_Align2RegClass; 2564 if (BitWidth <= 256) 2565 return &AMDGPU::VReg_256_Align2RegClass; 2566 if (BitWidth <= 288) 2567 return &AMDGPU::VReg_288_Align2RegClass; 2568 if (BitWidth <= 320) 2569 return &AMDGPU::VReg_320_Align2RegClass; 2570 if (BitWidth <= 352) 2571 return &AMDGPU::VReg_352_Align2RegClass; 2572 if (BitWidth <= 384) 2573 return &AMDGPU::VReg_384_Align2RegClass; 2574 if (BitWidth <= 512) 2575 return &AMDGPU::VReg_512_Align2RegClass; 2576 if (BitWidth <= 1024) 2577 return &AMDGPU::VReg_1024_Align2RegClass; 2578 2579 return nullptr; 2580 } 2581 2582 const TargetRegisterClass * 2583 SIRegisterInfo::getVGPRClassForBitWidth(unsigned BitWidth) const { 2584 if (BitWidth == 1) 2585 return 
&AMDGPU::VReg_1RegClass; 2586 if (BitWidth <= 16) 2587 return &AMDGPU::VGPR_LO16RegClass; 2588 if (BitWidth <= 32) 2589 return &AMDGPU::VGPR_32RegClass; 2590 return ST.needsAlignedVGPRs() ? getAlignedVGPRClassForBitWidth(BitWidth) 2591 : getAnyVGPRClassForBitWidth(BitWidth); 2592 } 2593 2594 static const TargetRegisterClass * 2595 getAnyAGPRClassForBitWidth(unsigned BitWidth) { 2596 if (BitWidth <= 64) 2597 return &AMDGPU::AReg_64RegClass; 2598 if (BitWidth <= 96) 2599 return &AMDGPU::AReg_96RegClass; 2600 if (BitWidth <= 128) 2601 return &AMDGPU::AReg_128RegClass; 2602 if (BitWidth <= 160) 2603 return &AMDGPU::AReg_160RegClass; 2604 if (BitWidth <= 192) 2605 return &AMDGPU::AReg_192RegClass; 2606 if (BitWidth <= 224) 2607 return &AMDGPU::AReg_224RegClass; 2608 if (BitWidth <= 256) 2609 return &AMDGPU::AReg_256RegClass; 2610 if (BitWidth <= 288) 2611 return &AMDGPU::AReg_288RegClass; 2612 if (BitWidth <= 320) 2613 return &AMDGPU::AReg_320RegClass; 2614 if (BitWidth <= 352) 2615 return &AMDGPU::AReg_352RegClass; 2616 if (BitWidth <= 384) 2617 return &AMDGPU::AReg_384RegClass; 2618 if (BitWidth <= 512) 2619 return &AMDGPU::AReg_512RegClass; 2620 if (BitWidth <= 1024) 2621 return &AMDGPU::AReg_1024RegClass; 2622 2623 return nullptr; 2624 } 2625 2626 static const TargetRegisterClass * 2627 getAlignedAGPRClassForBitWidth(unsigned BitWidth) { 2628 if (BitWidth <= 64) 2629 return &AMDGPU::AReg_64_Align2RegClass; 2630 if (BitWidth <= 96) 2631 return &AMDGPU::AReg_96_Align2RegClass; 2632 if (BitWidth <= 128) 2633 return &AMDGPU::AReg_128_Align2RegClass; 2634 if (BitWidth <= 160) 2635 return &AMDGPU::AReg_160_Align2RegClass; 2636 if (BitWidth <= 192) 2637 return &AMDGPU::AReg_192_Align2RegClass; 2638 if (BitWidth <= 224) 2639 return &AMDGPU::AReg_224_Align2RegClass; 2640 if (BitWidth <= 256) 2641 return &AMDGPU::AReg_256_Align2RegClass; 2642 if (BitWidth <= 288) 2643 return &AMDGPU::AReg_288_Align2RegClass; 2644 if (BitWidth <= 320) 2645 return &AMDGPU::AReg_320_Align2RegClass; 2646 if (BitWidth <= 352) 2647 return &AMDGPU::AReg_352_Align2RegClass; 2648 if (BitWidth <= 384) 2649 return &AMDGPU::AReg_384_Align2RegClass; 2650 if (BitWidth <= 512) 2651 return &AMDGPU::AReg_512_Align2RegClass; 2652 if (BitWidth <= 1024) 2653 return &AMDGPU::AReg_1024_Align2RegClass; 2654 2655 return nullptr; 2656 } 2657 2658 const TargetRegisterClass * 2659 SIRegisterInfo::getAGPRClassForBitWidth(unsigned BitWidth) const { 2660 if (BitWidth <= 16) 2661 return &AMDGPU::AGPR_LO16RegClass; 2662 if (BitWidth <= 32) 2663 return &AMDGPU::AGPR_32RegClass; 2664 return ST.needsAlignedVGPRs() ? 
getAlignedAGPRClassForBitWidth(BitWidth) 2665 : getAnyAGPRClassForBitWidth(BitWidth); 2666 } 2667 2668 static const TargetRegisterClass * 2669 getAnyVectorSuperClassForBitWidth(unsigned BitWidth) { 2670 if (BitWidth <= 64) 2671 return &AMDGPU::AV_64RegClass; 2672 if (BitWidth <= 96) 2673 return &AMDGPU::AV_96RegClass; 2674 if (BitWidth <= 128) 2675 return &AMDGPU::AV_128RegClass; 2676 if (BitWidth <= 160) 2677 return &AMDGPU::AV_160RegClass; 2678 if (BitWidth <= 192) 2679 return &AMDGPU::AV_192RegClass; 2680 if (BitWidth <= 224) 2681 return &AMDGPU::AV_224RegClass; 2682 if (BitWidth <= 256) 2683 return &AMDGPU::AV_256RegClass; 2684 if (BitWidth <= 288) 2685 return &AMDGPU::AV_288RegClass; 2686 if (BitWidth <= 320) 2687 return &AMDGPU::AV_320RegClass; 2688 if (BitWidth <= 352) 2689 return &AMDGPU::AV_352RegClass; 2690 if (BitWidth <= 384) 2691 return &AMDGPU::AV_384RegClass; 2692 if (BitWidth <= 512) 2693 return &AMDGPU::AV_512RegClass; 2694 if (BitWidth <= 1024) 2695 return &AMDGPU::AV_1024RegClass; 2696 2697 return nullptr; 2698 } 2699 2700 static const TargetRegisterClass * 2701 getAlignedVectorSuperClassForBitWidth(unsigned BitWidth) { 2702 if (BitWidth <= 64) 2703 return &AMDGPU::AV_64_Align2RegClass; 2704 if (BitWidth <= 96) 2705 return &AMDGPU::AV_96_Align2RegClass; 2706 if (BitWidth <= 128) 2707 return &AMDGPU::AV_128_Align2RegClass; 2708 if (BitWidth <= 160) 2709 return &AMDGPU::AV_160_Align2RegClass; 2710 if (BitWidth <= 192) 2711 return &AMDGPU::AV_192_Align2RegClass; 2712 if (BitWidth <= 224) 2713 return &AMDGPU::AV_224_Align2RegClass; 2714 if (BitWidth <= 256) 2715 return &AMDGPU::AV_256_Align2RegClass; 2716 if (BitWidth <= 288) 2717 return &AMDGPU::AV_288_Align2RegClass; 2718 if (BitWidth <= 320) 2719 return &AMDGPU::AV_320_Align2RegClass; 2720 if (BitWidth <= 352) 2721 return &AMDGPU::AV_352_Align2RegClass; 2722 if (BitWidth <= 384) 2723 return &AMDGPU::AV_384_Align2RegClass; 2724 if (BitWidth <= 512) 2725 return &AMDGPU::AV_512_Align2RegClass; 2726 if (BitWidth <= 1024) 2727 return &AMDGPU::AV_1024_Align2RegClass; 2728 2729 return nullptr; 2730 } 2731 2732 const TargetRegisterClass * 2733 SIRegisterInfo::getVectorSuperClassForBitWidth(unsigned BitWidth) const { 2734 if (BitWidth <= 16) 2735 return &AMDGPU::VGPR_LO16RegClass; 2736 if (BitWidth <= 32) 2737 return &AMDGPU::AV_32RegClass; 2738 return ST.needsAlignedVGPRs() 2739 ? 
getAlignedVectorSuperClassForBitWidth(BitWidth) 2740 : getAnyVectorSuperClassForBitWidth(BitWidth); 2741 } 2742 2743 const TargetRegisterClass * 2744 SIRegisterInfo::getSGPRClassForBitWidth(unsigned BitWidth) { 2745 if (BitWidth <= 16) 2746 return &AMDGPU::SGPR_LO16RegClass; 2747 if (BitWidth <= 32) 2748 return &AMDGPU::SReg_32RegClass; 2749 if (BitWidth <= 64) 2750 return &AMDGPU::SReg_64RegClass; 2751 if (BitWidth <= 96) 2752 return &AMDGPU::SGPR_96RegClass; 2753 if (BitWidth <= 128) 2754 return &AMDGPU::SGPR_128RegClass; 2755 if (BitWidth <= 160) 2756 return &AMDGPU::SGPR_160RegClass; 2757 if (BitWidth <= 192) 2758 return &AMDGPU::SGPR_192RegClass; 2759 if (BitWidth <= 224) 2760 return &AMDGPU::SGPR_224RegClass; 2761 if (BitWidth <= 256) 2762 return &AMDGPU::SGPR_256RegClass; 2763 if (BitWidth <= 288) 2764 return &AMDGPU::SGPR_288RegClass; 2765 if (BitWidth <= 320) 2766 return &AMDGPU::SGPR_320RegClass; 2767 if (BitWidth <= 352) 2768 return &AMDGPU::SGPR_352RegClass; 2769 if (BitWidth <= 384) 2770 return &AMDGPU::SGPR_384RegClass; 2771 if (BitWidth <= 512) 2772 return &AMDGPU::SGPR_512RegClass; 2773 if (BitWidth <= 1024) 2774 return &AMDGPU::SGPR_1024RegClass; 2775 2776 return nullptr; 2777 } 2778 2779 bool SIRegisterInfo::isSGPRReg(const MachineRegisterInfo &MRI, 2780 Register Reg) const { 2781 const TargetRegisterClass *RC; 2782 if (Reg.isVirtual()) 2783 RC = MRI.getRegClass(Reg); 2784 else 2785 RC = getPhysRegBaseClass(Reg); 2786 return RC ? isSGPRClass(RC) : false; 2787 } 2788 2789 const TargetRegisterClass * 2790 SIRegisterInfo::getEquivalentVGPRClass(const TargetRegisterClass *SRC) const { 2791 unsigned Size = getRegSizeInBits(*SRC); 2792 const TargetRegisterClass *VRC = getVGPRClassForBitWidth(Size); 2793 assert(VRC && "Invalid register class size"); 2794 return VRC; 2795 } 2796 2797 const TargetRegisterClass * 2798 SIRegisterInfo::getEquivalentAGPRClass(const TargetRegisterClass *SRC) const { 2799 unsigned Size = getRegSizeInBits(*SRC); 2800 const TargetRegisterClass *ARC = getAGPRClassForBitWidth(Size); 2801 assert(ARC && "Invalid register class size"); 2802 return ARC; 2803 } 2804 2805 const TargetRegisterClass * 2806 SIRegisterInfo::getEquivalentSGPRClass(const TargetRegisterClass *VRC) const { 2807 unsigned Size = getRegSizeInBits(*VRC); 2808 if (Size == 32) 2809 return &AMDGPU::SGPR_32RegClass; 2810 const TargetRegisterClass *SRC = getSGPRClassForBitWidth(Size); 2811 assert(SRC && "Invalid register class size"); 2812 return SRC; 2813 } 2814 2815 const TargetRegisterClass * 2816 SIRegisterInfo::getCompatibleSubRegClass(const TargetRegisterClass *SuperRC, 2817 const TargetRegisterClass *SubRC, 2818 unsigned SubIdx) const { 2819 // Ensure this subregister index is aligned in the super register. 2820 const TargetRegisterClass *MatchRC = 2821 getMatchingSuperRegClass(SuperRC, SubRC, SubIdx); 2822 return MatchRC && MatchRC->hasSubClassEq(SuperRC) ? 
MatchRC : nullptr;
2823 }
2824
2825 bool SIRegisterInfo::opCanUseInlineConstant(unsigned OpType) const {
2826   if (OpType >= AMDGPU::OPERAND_REG_INLINE_AC_FIRST &&
2827       OpType <= AMDGPU::OPERAND_REG_INLINE_AC_LAST)
2828     return !ST.hasMFMAInlineLiteralBug();
2829
2830   return OpType >= AMDGPU::OPERAND_SRC_FIRST &&
2831          OpType <= AMDGPU::OPERAND_SRC_LAST;
2832 }
2833
2834 bool SIRegisterInfo::shouldRewriteCopySrc(
2835     const TargetRegisterClass *DefRC,
2836     unsigned DefSubReg,
2837     const TargetRegisterClass *SrcRC,
2838     unsigned SrcSubReg) const {
2839   // We want to prefer the smallest register class possible, so we don't want
2840   // to stop and rewrite on anything that looks like a subregister
2841   // extract. Operations mostly don't care about the super register class, so
2842   // we only want to stop on the most basic of copies between the same register
2843   // class.
2844   //
2845   // e.g. if we have something like
2846   // %0 = ...
2847   // %1 = ...
2848   // %2 = REG_SEQUENCE %0, sub0, %1, sub1
2849   // %3 = COPY %2.sub0
2850   //
2851   // We want to look through the COPY to find:
2852   //  => %3 = COPY %0
2853
2854   // Plain copy.
2855   return getCommonSubClass(DefRC, SrcRC) != nullptr;
2856 }
2857
2858 bool SIRegisterInfo::opCanUseLiteralConstant(unsigned OpType) const {
2859   // TODO: 64-bit operands have extending behavior from 32-bit literal.
2860   return OpType >= AMDGPU::OPERAND_REG_IMM_FIRST &&
2861          OpType <= AMDGPU::OPERAND_REG_IMM_LAST;
2862 }
2863
2864 /// Returns the lowest register that is not used at any point in the function.
2865 /// If all registers are used, then this function will return
2866 /// AMDGPU::NoRegister. If \p ReserveHighestVGPR = true, then return the
2867 /// highest unused register.
2868 MCRegister SIRegisterInfo::findUnusedRegister(const MachineRegisterInfo &MRI,
2869                                               const TargetRegisterClass *RC,
2870                                               const MachineFunction &MF,
2871                                               bool ReserveHighestVGPR) const {
2872   if (ReserveHighestVGPR) {
2873     for (MCRegister Reg : reverse(*RC))
2874       if (MRI.isAllocatable(Reg) && !MRI.isPhysRegUsed(Reg))
2875         return Reg;
2876   } else {
2877     for (MCRegister Reg : *RC)
2878       if (MRI.isAllocatable(Reg) && !MRI.isPhysRegUsed(Reg))
2879         return Reg;
2880   }
2881   return MCRegister();
2882 }
2883
2884 ArrayRef<int16_t> SIRegisterInfo::getRegSplitParts(const TargetRegisterClass *RC,
2885                                                    unsigned EltSize) const {
2886   const unsigned RegBitWidth = AMDGPU::getRegBitWidth(*RC->MC);
2887   assert(RegBitWidth >= 32 && RegBitWidth <= 1024);
2888
2889   const unsigned RegDWORDs = RegBitWidth / 32;
2890   const unsigned EltDWORDs = EltSize / 4;
2891   assert(RegSplitParts.size() + 1 >= EltDWORDs);
2892
2893   const std::vector<int16_t> &Parts = RegSplitParts[EltDWORDs - 1];
2894   const unsigned NumParts = RegDWORDs / EltDWORDs;
2895
2896   return ArrayRef(Parts.data(), NumParts);
2897 }
2898
2899 const TargetRegisterClass*
2900 SIRegisterInfo::getRegClassForReg(const MachineRegisterInfo &MRI,
2901                                   Register Reg) const {
2902   return Reg.isVirtual() ?
MRI.getRegClass(Reg) : getPhysRegBaseClass(Reg);
2903 }
2904
2905 const TargetRegisterClass *
2906 SIRegisterInfo::getRegClassForOperandReg(const MachineRegisterInfo &MRI,
2907                                          const MachineOperand &MO) const {
2908   const TargetRegisterClass *SrcRC = getRegClassForReg(MRI, MO.getReg());
2909   return getSubRegisterClass(SrcRC, MO.getSubReg());
2910 }
2911
2912 bool SIRegisterInfo::isVGPR(const MachineRegisterInfo &MRI,
2913                             Register Reg) const {
2914   const TargetRegisterClass *RC = getRegClassForReg(MRI, Reg);
2915   // Registers without classes are unaddressable, SGPR-like registers.
2916   return RC && isVGPRClass(RC);
2917 }
2918
2919 bool SIRegisterInfo::isAGPR(const MachineRegisterInfo &MRI,
2920                             Register Reg) const {
2921   const TargetRegisterClass *RC = getRegClassForReg(MRI, Reg);
2922
2923   // Registers without classes are unaddressable, SGPR-like registers.
2924   return RC && isAGPRClass(RC);
2925 }
2926
2927 bool SIRegisterInfo::shouldCoalesce(MachineInstr *MI,
2928                                     const TargetRegisterClass *SrcRC,
2929                                     unsigned SubReg,
2930                                     const TargetRegisterClass *DstRC,
2931                                     unsigned DstSubReg,
2932                                     const TargetRegisterClass *NewRC,
2933                                     LiveIntervals &LIS) const {
2934   unsigned SrcSize = getRegSizeInBits(*SrcRC);
2935   unsigned DstSize = getRegSizeInBits(*DstRC);
2936   unsigned NewSize = getRegSizeInBits(*NewRC);
2937
2938   // Do not increase the size of registers beyond a dword; we would need to
2939   // allocate adjacent registers and constrain regalloc more than needed.
2940
2941   // Always allow dword coalescing.
2942   if (SrcSize <= 32 || DstSize <= 32)
2943     return true;
2944
2945   return NewSize <= DstSize || NewSize <= SrcSize;
2946 }
2947
2948 unsigned SIRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
2949                                              MachineFunction &MF) const {
2950   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2951
2952   unsigned Occupancy = ST.getOccupancyWithLocalMemSize(MFI->getLDSSize(),
2953                                                        MF.getFunction());
2954   switch (RC->getID()) {
2955   default:
2956     return AMDGPUGenRegisterInfo::getRegPressureLimit(RC, MF);
2957   case AMDGPU::VGPR_32RegClassID:
2958   case AMDGPU::VGPR_LO16RegClassID:
2959   case AMDGPU::VGPR_HI16RegClassID:
2960     return std::min(ST.getMaxNumVGPRs(Occupancy), ST.getMaxNumVGPRs(MF));
2961   case AMDGPU::SGPR_32RegClassID:
2962   case AMDGPU::SGPR_LO16RegClassID:
2963     return std::min(ST.getMaxNumSGPRs(Occupancy, true), ST.getMaxNumSGPRs(MF));
2964   }
2965 }
2966
2967 unsigned SIRegisterInfo::getRegPressureSetLimit(const MachineFunction &MF,
2968                                                 unsigned Idx) const {
2969   if (Idx == AMDGPU::RegisterPressureSets::VGPR_32 ||
2970       Idx == AMDGPU::RegisterPressureSets::AGPR_32)
2971     return getRegPressureLimit(&AMDGPU::VGPR_32RegClass,
2972                                const_cast<MachineFunction &>(MF));
2973
2974   if (Idx == AMDGPU::RegisterPressureSets::SReg_32)
2975     return getRegPressureLimit(&AMDGPU::SGPR_32RegClass,
2976                                const_cast<MachineFunction &>(MF));
2977
2978   llvm_unreachable("Unexpected register pressure set!");
2979 }
2980
2981 const int *SIRegisterInfo::getRegUnitPressureSets(unsigned RegUnit) const {
2982   static const int Empty[] = { -1 };
2983
2984   if (RegPressureIgnoredUnits[RegUnit])
2985     return Empty;
2986
2987   return AMDGPUGenRegisterInfo::getRegUnitPressureSets(RegUnit);
2988 }
2989
2990 MCRegister SIRegisterInfo::getReturnAddressReg(const MachineFunction &MF) const {
2991   // Not a callee-saved register.
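  // The return address is passed in s[30:31] per the AMDGPU calling
  // convention.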
2992 return AMDGPU::SGPR30_SGPR31; 2993 } 2994 2995 const TargetRegisterClass * 2996 SIRegisterInfo::getRegClassForSizeOnBank(unsigned Size, 2997 const RegisterBank &RB) const { 2998 switch (RB.getID()) { 2999 case AMDGPU::VGPRRegBankID: 3000 return getVGPRClassForBitWidth(std::max(32u, Size)); 3001 case AMDGPU::VCCRegBankID: 3002 assert(Size == 1); 3003 return isWave32 ? &AMDGPU::SReg_32_XM0_XEXECRegClass 3004 : &AMDGPU::SReg_64_XEXECRegClass; 3005 case AMDGPU::SGPRRegBankID: 3006 return getSGPRClassForBitWidth(std::max(32u, Size)); 3007 case AMDGPU::AGPRRegBankID: 3008 return getAGPRClassForBitWidth(std::max(32u, Size)); 3009 default: 3010 llvm_unreachable("unknown register bank"); 3011 } 3012 } 3013 3014 const TargetRegisterClass * 3015 SIRegisterInfo::getConstrainedRegClassForOperand(const MachineOperand &MO, 3016 const MachineRegisterInfo &MRI) const { 3017 const RegClassOrRegBank &RCOrRB = MRI.getRegClassOrRegBank(MO.getReg()); 3018 if (const RegisterBank *RB = RCOrRB.dyn_cast<const RegisterBank*>()) 3019 return getRegClassForTypeOnBank(MRI.getType(MO.getReg()), *RB); 3020 3021 if (const auto *RC = RCOrRB.dyn_cast<const TargetRegisterClass *>()) 3022 return getAllocatableClass(RC); 3023 3024 return nullptr; 3025 } 3026 3027 MCRegister SIRegisterInfo::getVCC() const { 3028 return isWave32 ? AMDGPU::VCC_LO : AMDGPU::VCC; 3029 } 3030 3031 MCRegister SIRegisterInfo::getExec() const { 3032 return isWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC; 3033 } 3034 3035 const TargetRegisterClass *SIRegisterInfo::getVGPR64Class() const { 3036 // VGPR tuples have an alignment requirement on gfx90a variants. 3037 return ST.needsAlignedVGPRs() ? &AMDGPU::VReg_64_Align2RegClass 3038 : &AMDGPU::VReg_64RegClass; 3039 } 3040 3041 const TargetRegisterClass * 3042 SIRegisterInfo::getRegClass(unsigned RCID) const { 3043 switch ((int)RCID) { 3044 case AMDGPU::SReg_1RegClassID: 3045 return getBoolRC(); 3046 case AMDGPU::SReg_1_XEXECRegClassID: 3047 return isWave32 ? &AMDGPU::SReg_32_XM0_XEXECRegClass 3048 : &AMDGPU::SReg_64_XEXECRegClass; 3049 case -1: 3050 return nullptr; 3051 default: 3052 return AMDGPUGenRegisterInfo::getRegClass(RCID); 3053 } 3054 } 3055 3056 // Find reaching register definition 3057 MachineInstr *SIRegisterInfo::findReachingDef(Register Reg, unsigned SubReg, 3058 MachineInstr &Use, 3059 MachineRegisterInfo &MRI, 3060 LiveIntervals *LIS) const { 3061 auto &MDT = LIS->getAnalysis<MachineDominatorTree>(); 3062 SlotIndex UseIdx = LIS->getInstructionIndex(Use); 3063 SlotIndex DefIdx; 3064 3065 if (Reg.isVirtual()) { 3066 if (!LIS->hasInterval(Reg)) 3067 return nullptr; 3068 LiveInterval &LI = LIS->getInterval(Reg); 3069 LaneBitmask SubLanes = SubReg ? getSubRegIndexLaneMask(SubReg) 3070 : MRI.getMaxLaneMaskForVReg(Reg); 3071 VNInfo *V = nullptr; 3072 if (LI.hasSubRanges()) { 3073 for (auto &S : LI.subranges()) { 3074 if ((S.LaneMask & SubLanes) == SubLanes) { 3075 V = S.getVNInfoAt(UseIdx); 3076 break; 3077 } 3078 } 3079 } else { 3080 V = LI.getVNInfoAt(UseIdx); 3081 } 3082 if (!V) 3083 return nullptr; 3084 DefIdx = V->def; 3085 } else { 3086 // Find last def. 
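    // A physical register may cover several register units. Take the most
    // recent of the defs reaching the use across all units, and give up if
    // any unit has no value live at the use.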
3087 for (MCRegUnitIterator Units(Reg.asMCReg(), this); Units.isValid(); 3088 ++Units) { 3089 LiveRange &LR = LIS->getRegUnit(*Units); 3090 if (VNInfo *V = LR.getVNInfoAt(UseIdx)) { 3091 if (!DefIdx.isValid() || 3092 MDT.dominates(LIS->getInstructionFromIndex(DefIdx), 3093 LIS->getInstructionFromIndex(V->def))) 3094 DefIdx = V->def; 3095 } else { 3096 return nullptr; 3097 } 3098 } 3099 } 3100 3101 MachineInstr *Def = LIS->getInstructionFromIndex(DefIdx); 3102 3103 if (!Def || !MDT.dominates(Def, &Use)) 3104 return nullptr; 3105 3106 assert(Def->modifiesRegister(Reg, this)); 3107 3108 return Def; 3109 } 3110 3111 MCPhysReg SIRegisterInfo::get32BitRegister(MCPhysReg Reg) const { 3112 assert(getRegSizeInBits(*getPhysRegBaseClass(Reg)) <= 32); 3113 3114 for (const TargetRegisterClass &RC : { AMDGPU::VGPR_32RegClass, 3115 AMDGPU::SReg_32RegClass, 3116 AMDGPU::AGPR_32RegClass } ) { 3117 if (MCPhysReg Super = getMatchingSuperReg(Reg, AMDGPU::lo16, &RC)) 3118 return Super; 3119 } 3120 if (MCPhysReg Super = getMatchingSuperReg(Reg, AMDGPU::hi16, 3121 &AMDGPU::VGPR_32RegClass)) { 3122 return Super; 3123 } 3124 3125 return AMDGPU::NoRegister; 3126 } 3127 3128 bool SIRegisterInfo::isProperlyAlignedRC(const TargetRegisterClass &RC) const { 3129 if (!ST.needsAlignedVGPRs()) 3130 return true; 3131 3132 if (isVGPRClass(&RC)) 3133 return RC.hasSuperClassEq(getVGPRClassForBitWidth(getRegSizeInBits(RC))); 3134 if (isAGPRClass(&RC)) 3135 return RC.hasSuperClassEq(getAGPRClassForBitWidth(getRegSizeInBits(RC))); 3136 if (isVectorSuperClass(&RC)) 3137 return RC.hasSuperClassEq( 3138 getVectorSuperClassForBitWidth(getRegSizeInBits(RC))); 3139 3140 return true; 3141 } 3142 3143 const TargetRegisterClass * 3144 SIRegisterInfo::getProperlyAlignedRC(const TargetRegisterClass *RC) const { 3145 if (!RC || !ST.needsAlignedVGPRs()) 3146 return RC; 3147 3148 unsigned Size = getRegSizeInBits(*RC); 3149 if (Size <= 32) 3150 return RC; 3151 3152 if (isVGPRClass(RC)) 3153 return getAlignedVGPRClassForBitWidth(Size); 3154 if (isAGPRClass(RC)) 3155 return getAlignedAGPRClassForBitWidth(Size); 3156 if (isVectorSuperClass(RC)) 3157 return getAlignedVectorSuperClassForBitWidth(Size); 3158 3159 return RC; 3160 } 3161 3162 ArrayRef<MCPhysReg> 3163 SIRegisterInfo::getAllSGPR128(const MachineFunction &MF) const { 3164 return ArrayRef(AMDGPU::SGPR_128RegClass.begin(), ST.getMaxNumSGPRs(MF) / 4); 3165 } 3166 3167 ArrayRef<MCPhysReg> 3168 SIRegisterInfo::getAllSGPR64(const MachineFunction &MF) const { 3169 return ArrayRef(AMDGPU::SGPR_64RegClass.begin(), ST.getMaxNumSGPRs(MF) / 2); 3170 } 3171 3172 ArrayRef<MCPhysReg> 3173 SIRegisterInfo::getAllSGPR32(const MachineFunction &MF) const { 3174 return ArrayRef(AMDGPU::SGPR_32RegClass.begin(), ST.getMaxNumSGPRs(MF)); 3175 } 3176