//===-- SIRegisterInfo.cpp - SI Register Information ---------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// SI implementation of the TargetRegisterInfo class.
//
//===----------------------------------------------------------------------===//

#include "SIRegisterInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "MCTargetDesc/AMDGPUInstPrinter.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/RegisterScavenging.h"
#include "llvm/CodeGen/SlotIndexes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/LLVMContext.h"

using namespace llvm;

static bool hasPressureSet(const int *PSets, unsigned PSetID) {
  for (unsigned i = 0; PSets[i] != -1; ++i) {
    if (PSets[i] == (int)PSetID)
      return true;
  }
  return false;
}

void SIRegisterInfo::classifyPressureSet(unsigned PSetID, unsigned Reg,
                                         BitVector &PressureSets) const {
  for (MCRegUnitIterator U(Reg, this); U.isValid(); ++U) {
    const int *PSets = getRegUnitPressureSets(*U);
    if (hasPressureSet(PSets, PSetID)) {
      PressureSets.set(PSetID);
      break;
    }
  }
}

static cl::opt<bool> EnableSpillSGPRToVGPR(
  "amdgpu-spill-sgpr-to-vgpr",
  cl::desc("Enable spilling SGPRs to VGPRs"),
  cl::ReallyHidden,
  cl::init(true));

SIRegisterInfo::SIRegisterInfo(const GCNSubtarget &ST) :
  AMDGPURegisterInfo(),
  ST(ST),
  SGPRPressureSets(getNumRegPressureSets()),
  VGPRPressureSets(getNumRegPressureSets()),
  AGPRPressureSets(getNumRegPressureSets()),
  SpillSGPRToVGPR(EnableSpillSGPRToVGPR),
  isWave32(ST.isWave32()) {
  unsigned NumRegPressureSets = getNumRegPressureSets();

  SGPRSetID = NumRegPressureSets;
  VGPRSetID = NumRegPressureSets;
  AGPRSetID = NumRegPressureSets;

  for (unsigned i = 0; i < NumRegPressureSets; ++i) {
    classifyPressureSet(i, AMDGPU::SGPR0, SGPRPressureSets);
    classifyPressureSet(i, AMDGPU::VGPR0, VGPRPressureSets);
    classifyPressureSet(i, AMDGPU::AGPR0, AGPRPressureSets);
  }

  // Determine the number of reg units for each pressure set.
  std::vector<unsigned> PressureSetRegUnits(NumRegPressureSets, 0);
  for (unsigned i = 0, e = getNumRegUnits(); i != e; ++i) {
    const int *PSets = getRegUnitPressureSets(i);
    for (unsigned j = 0; PSets[j] != -1; ++j) {
      ++PressureSetRegUnits[PSets[j]];
    }
  }

  unsigned VGPRMax = 0, SGPRMax = 0, AGPRMax = 0;
  for (unsigned i = 0; i < NumRegPressureSets; ++i) {
    if (isVGPRPressureSet(i) && PressureSetRegUnits[i] > VGPRMax) {
      VGPRSetID = i;
      VGPRMax = PressureSetRegUnits[i];
      continue;
    }
    if (isSGPRPressureSet(i) && PressureSetRegUnits[i] > SGPRMax) {
      SGPRSetID = i;
      SGPRMax = PressureSetRegUnits[i];
    }
    if (isAGPRPressureSet(i) && PressureSetRegUnits[i] > AGPRMax) {
      AGPRSetID = i;
      AGPRMax = PressureSetRegUnits[i];
      continue;
    }
  }

  assert(SGPRSetID < NumRegPressureSets &&
         VGPRSetID < NumRegPressureSets &&
         AGPRSetID < NumRegPressureSets);
}

unsigned SIRegisterInfo::reservedPrivateSegmentBufferReg(
  const MachineFunction &MF) const {
  unsigned BaseIdx = alignDown(ST.getMaxNumSGPRs(MF), 4) - 4;
  unsigned BaseReg(AMDGPU::SGPR_32RegClass.getRegister(BaseIdx));
  return getMatchingSuperReg(BaseReg, AMDGPU::sub0, &AMDGPU::SGPR_128RegClass);
}

static unsigned findPrivateSegmentWaveByteOffsetRegIndex(unsigned RegCount) {
  unsigned Reg;

  // Try to place it in a hole after PrivateSegmentBufferReg.
  if (RegCount & 3) {
    // We cannot put the segment buffer in (Idx - 4) ... (Idx - 1) due to
    // alignment constraints, so we have a hole where we can put the wave
    // offset.
    Reg = RegCount - 1;
  } else {
    // We can put the segment buffer in (Idx - 4) ... (Idx - 1) and put the
    // wave offset before it.
    Reg = RegCount - 5;
  }

  return Reg;
}

unsigned SIRegisterInfo::reservedPrivateSegmentWaveByteOffsetReg(
  const MachineFunction &MF) const {
  unsigned Reg = findPrivateSegmentWaveByteOffsetRegIndex(ST.getMaxNumSGPRs(MF));
  return AMDGPU::SGPR_32RegClass.getRegister(Reg);
}

BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
  BitVector Reserved(getNumRegs());

  // EXEC_LO and EXEC_HI could be allocated and used as regular registers, but
  // this seems likely to result in bugs, so I'm marking them as reserved.
  reserveRegisterTuples(Reserved, AMDGPU::EXEC);
  reserveRegisterTuples(Reserved, AMDGPU::FLAT_SCR);

  // M0 has to be reserved so that llvm accepts it as a live-in into a block.
  reserveRegisterTuples(Reserved, AMDGPU::M0);

  // Reserve src_vccz, src_execz, src_scc.
  reserveRegisterTuples(Reserved, AMDGPU::SRC_VCCZ);
  reserveRegisterTuples(Reserved, AMDGPU::SRC_EXECZ);
  reserveRegisterTuples(Reserved, AMDGPU::SRC_SCC);

  // Reserve the memory aperture registers.
  reserveRegisterTuples(Reserved, AMDGPU::SRC_SHARED_BASE);
  reserveRegisterTuples(Reserved, AMDGPU::SRC_SHARED_LIMIT);
  reserveRegisterTuples(Reserved, AMDGPU::SRC_PRIVATE_BASE);
  reserveRegisterTuples(Reserved, AMDGPU::SRC_PRIVATE_LIMIT);

  // Reserve src_pops_exiting_wave_id - support is not implemented in Codegen.
  reserveRegisterTuples(Reserved, AMDGPU::SRC_POPS_EXITING_WAVE_ID);

  // Reserve xnack_mask registers - support is not implemented in Codegen.
  reserveRegisterTuples(Reserved, AMDGPU::XNACK_MASK);

  // Reserve lds_direct register - support is not implemented in Codegen.
168 reserveRegisterTuples(Reserved, AMDGPU::LDS_DIRECT); 169 170 // Reserve Trap Handler registers - support is not implemented in Codegen. 171 reserveRegisterTuples(Reserved, AMDGPU::TBA); 172 reserveRegisterTuples(Reserved, AMDGPU::TMA); 173 reserveRegisterTuples(Reserved, AMDGPU::TTMP0_TTMP1); 174 reserveRegisterTuples(Reserved, AMDGPU::TTMP2_TTMP3); 175 reserveRegisterTuples(Reserved, AMDGPU::TTMP4_TTMP5); 176 reserveRegisterTuples(Reserved, AMDGPU::TTMP6_TTMP7); 177 reserveRegisterTuples(Reserved, AMDGPU::TTMP8_TTMP9); 178 reserveRegisterTuples(Reserved, AMDGPU::TTMP10_TTMP11); 179 reserveRegisterTuples(Reserved, AMDGPU::TTMP12_TTMP13); 180 reserveRegisterTuples(Reserved, AMDGPU::TTMP14_TTMP15); 181 182 // Reserve null register - it shall never be allocated 183 reserveRegisterTuples(Reserved, AMDGPU::SGPR_NULL); 184 185 // Disallow vcc_hi allocation in wave32. It may be allocated but most likely 186 // will result in bugs. 187 if (isWave32) { 188 Reserved.set(AMDGPU::VCC); 189 Reserved.set(AMDGPU::VCC_HI); 190 } 191 192 unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF); 193 unsigned TotalNumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs(); 194 for (unsigned i = MaxNumSGPRs; i < TotalNumSGPRs; ++i) { 195 unsigned Reg = AMDGPU::SGPR_32RegClass.getRegister(i); 196 reserveRegisterTuples(Reserved, Reg); 197 } 198 199 unsigned MaxNumVGPRs = ST.getMaxNumVGPRs(MF); 200 unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs(); 201 for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i) { 202 unsigned Reg = AMDGPU::VGPR_32RegClass.getRegister(i); 203 reserveRegisterTuples(Reserved, Reg); 204 Reg = AMDGPU::AGPR_32RegClass.getRegister(i); 205 reserveRegisterTuples(Reserved, Reg); 206 } 207 208 // Reserve all the rest AGPRs if there are no instructions to use it. 209 if (!ST.hasMAIInsts()) { 210 for (unsigned i = 0; i < MaxNumVGPRs; ++i) { 211 unsigned Reg = AMDGPU::AGPR_32RegClass.getRegister(i); 212 reserveRegisterTuples(Reserved, Reg); 213 } 214 } 215 216 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 217 218 unsigned ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg(); 219 if (ScratchWaveOffsetReg != AMDGPU::NoRegister) { 220 // Reserve 1 SGPR for scratch wave offset in case we need to spill. 221 reserveRegisterTuples(Reserved, ScratchWaveOffsetReg); 222 } 223 224 unsigned ScratchRSrcReg = MFI->getScratchRSrcReg(); 225 if (ScratchRSrcReg != AMDGPU::NoRegister) { 226 // Reserve 4 SGPRs for the scratch buffer resource descriptor in case we need 227 // to spill. 228 // TODO: May need to reserve a VGPR if doing LDS spilling. 229 reserveRegisterTuples(Reserved, ScratchRSrcReg); 230 assert(!isSubRegister(ScratchRSrcReg, ScratchWaveOffsetReg)); 231 } 232 233 // We have to assume the SP is needed in case there are calls in the function, 234 // which is detected after the function is lowered. If we aren't really going 235 // to need SP, don't bother reserving it. 236 unsigned StackPtrReg = MFI->getStackPtrOffsetReg(); 237 238 if (StackPtrReg != AMDGPU::NoRegister) { 239 reserveRegisterTuples(Reserved, StackPtrReg); 240 assert(!isSubRegister(ScratchRSrcReg, StackPtrReg)); 241 } 242 243 unsigned FrameReg = MFI->getFrameOffsetReg(); 244 if (FrameReg != AMDGPU::NoRegister) { 245 reserveRegisterTuples(Reserved, FrameReg); 246 assert(!isSubRegister(ScratchRSrcReg, FrameReg)); 247 } 248 249 for (unsigned Reg : MFI->WWMReservedRegs) { 250 reserveRegisterTuples(Reserved, Reg); 251 } 252 253 // FIXME: Stop using reserved registers for this. 
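  // The VGPRs picked as AGPR spill destinations (and the AGPRs picked for
  // VGPR spills) are reserved here so the allocator cannot hand them out
  // again; tracking them as ordinary live ranges would be preferable.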
254 for (MCPhysReg Reg : MFI->getAGPRSpillVGPRs()) 255 reserveRegisterTuples(Reserved, Reg); 256 257 for (MCPhysReg Reg : MFI->getVGPRSpillAGPRs()) 258 reserveRegisterTuples(Reserved, Reg); 259 260 return Reserved; 261 } 262 263 bool SIRegisterInfo::canRealignStack(const MachineFunction &MF) const { 264 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 265 // On entry, the base address is 0, so it can't possibly need any more 266 // alignment. 267 268 // FIXME: Should be able to specify the entry frame alignment per calling 269 // convention instead. 270 if (Info->isEntryFunction()) 271 return false; 272 273 return TargetRegisterInfo::canRealignStack(MF); 274 } 275 276 bool SIRegisterInfo::requiresRegisterScavenging(const MachineFunction &Fn) const { 277 const SIMachineFunctionInfo *Info = Fn.getInfo<SIMachineFunctionInfo>(); 278 if (Info->isEntryFunction()) { 279 const MachineFrameInfo &MFI = Fn.getFrameInfo(); 280 return MFI.hasStackObjects() || MFI.hasCalls(); 281 } 282 283 // May need scavenger for dealing with callee saved registers. 284 return true; 285 } 286 287 bool SIRegisterInfo::requiresFrameIndexScavenging( 288 const MachineFunction &MF) const { 289 // Do not use frame virtual registers. They used to be used for SGPRs, but 290 // once we reach PrologEpilogInserter, we can no longer spill SGPRs. If the 291 // scavenger fails, we can increment/decrement the necessary SGPRs to avoid a 292 // spill. 293 return false; 294 } 295 296 bool SIRegisterInfo::requiresFrameIndexReplacementScavenging( 297 const MachineFunction &MF) const { 298 const MachineFrameInfo &MFI = MF.getFrameInfo(); 299 return MFI.hasStackObjects(); 300 } 301 302 bool SIRegisterInfo::requiresVirtualBaseRegisters( 303 const MachineFunction &) const { 304 // There are no special dedicated stack or frame pointers. 305 return true; 306 } 307 308 bool SIRegisterInfo::trackLivenessAfterRegAlloc(const MachineFunction &MF) const { 309 // This helps catch bugs as verifier errors. 
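  // Requiring liveness to stay up to date after register allocation lets the
  // machine verifier catch passes that leave block live-in lists stale.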
310 return true; 311 } 312 313 int64_t SIRegisterInfo::getMUBUFInstrOffset(const MachineInstr *MI) const { 314 assert(SIInstrInfo::isMUBUF(*MI)); 315 316 int OffIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), 317 AMDGPU::OpName::offset); 318 return MI->getOperand(OffIdx).getImm(); 319 } 320 321 int64_t SIRegisterInfo::getFrameIndexInstrOffset(const MachineInstr *MI, 322 int Idx) const { 323 if (!SIInstrInfo::isMUBUF(*MI)) 324 return 0; 325 326 assert(Idx == AMDGPU::getNamedOperandIdx(MI->getOpcode(), 327 AMDGPU::OpName::vaddr) && 328 "Should never see frame index on non-address operand"); 329 330 return getMUBUFInstrOffset(MI); 331 } 332 333 bool SIRegisterInfo::needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const { 334 if (!MI->mayLoadOrStore()) 335 return false; 336 337 int64_t FullOffset = Offset + getMUBUFInstrOffset(MI); 338 339 return !isUInt<12>(FullOffset); 340 } 341 342 void SIRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB, 343 unsigned BaseReg, 344 int FrameIdx, 345 int64_t Offset) const { 346 MachineBasicBlock::iterator Ins = MBB->begin(); 347 DebugLoc DL; // Defaults to "unknown" 348 349 if (Ins != MBB->end()) 350 DL = Ins->getDebugLoc(); 351 352 MachineFunction *MF = MBB->getParent(); 353 const SIInstrInfo *TII = ST.getInstrInfo(); 354 355 if (Offset == 0) { 356 BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::V_MOV_B32_e32), BaseReg) 357 .addFrameIndex(FrameIdx); 358 return; 359 } 360 361 MachineRegisterInfo &MRI = MF->getRegInfo(); 362 Register OffsetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); 363 364 Register FIReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 365 366 BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg) 367 .addImm(Offset); 368 BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::V_MOV_B32_e32), FIReg) 369 .addFrameIndex(FrameIdx); 370 371 TII->getAddNoCarry(*MBB, Ins, DL, BaseReg) 372 .addReg(OffsetReg, RegState::Kill) 373 .addReg(FIReg) 374 .addImm(0); // clamp bit 375 } 376 377 void SIRegisterInfo::resolveFrameIndex(MachineInstr &MI, unsigned BaseReg, 378 int64_t Offset) const { 379 const SIInstrInfo *TII = ST.getInstrInfo(); 380 381 #ifndef NDEBUG 382 // FIXME: Is it possible to be storing a frame index to itself? 
383 bool SeenFI = false; 384 for (const MachineOperand &MO: MI.operands()) { 385 if (MO.isFI()) { 386 if (SeenFI) 387 llvm_unreachable("should not see multiple frame indices"); 388 389 SeenFI = true; 390 } 391 } 392 #endif 393 394 MachineOperand *FIOp = TII->getNamedOperand(MI, AMDGPU::OpName::vaddr); 395 #ifndef NDEBUG 396 MachineBasicBlock *MBB = MI.getParent(); 397 MachineFunction *MF = MBB->getParent(); 398 #endif 399 assert(FIOp && FIOp->isFI() && "frame index must be address operand"); 400 assert(TII->isMUBUF(MI)); 401 assert(TII->getNamedOperand(MI, AMDGPU::OpName::soffset)->getReg() == 402 MF->getInfo<SIMachineFunctionInfo>()->getStackPtrOffsetReg() && 403 "should only be seeing stack pointer offset relative FrameIndex"); 404 405 MachineOperand *OffsetOp = TII->getNamedOperand(MI, AMDGPU::OpName::offset); 406 int64_t NewOffset = OffsetOp->getImm() + Offset; 407 assert(isUInt<12>(NewOffset) && "offset should be legal"); 408 409 FIOp->ChangeToRegister(BaseReg, false); 410 OffsetOp->setImm(NewOffset); 411 } 412 413 bool SIRegisterInfo::isFrameOffsetLegal(const MachineInstr *MI, 414 unsigned BaseReg, 415 int64_t Offset) const { 416 if (!SIInstrInfo::isMUBUF(*MI)) 417 return false; 418 419 int64_t NewOffset = Offset + getMUBUFInstrOffset(MI); 420 421 return isUInt<12>(NewOffset); 422 } 423 424 const TargetRegisterClass *SIRegisterInfo::getPointerRegClass( 425 const MachineFunction &MF, unsigned Kind) const { 426 // This is inaccurate. It depends on the instruction and address space. The 427 // only place where we should hit this is for dealing with frame indexes / 428 // private accesses, so this is correct in that case. 429 return &AMDGPU::VGPR_32RegClass; 430 } 431 432 static unsigned getNumSubRegsForSpillOp(unsigned Op) { 433 434 switch (Op) { 435 case AMDGPU::SI_SPILL_S1024_SAVE: 436 case AMDGPU::SI_SPILL_S1024_RESTORE: 437 case AMDGPU::SI_SPILL_V1024_SAVE: 438 case AMDGPU::SI_SPILL_V1024_RESTORE: 439 case AMDGPU::SI_SPILL_A1024_SAVE: 440 case AMDGPU::SI_SPILL_A1024_RESTORE: 441 return 32; 442 case AMDGPU::SI_SPILL_S512_SAVE: 443 case AMDGPU::SI_SPILL_S512_RESTORE: 444 case AMDGPU::SI_SPILL_V512_SAVE: 445 case AMDGPU::SI_SPILL_V512_RESTORE: 446 case AMDGPU::SI_SPILL_A512_SAVE: 447 case AMDGPU::SI_SPILL_A512_RESTORE: 448 return 16; 449 case AMDGPU::SI_SPILL_S256_SAVE: 450 case AMDGPU::SI_SPILL_S256_RESTORE: 451 case AMDGPU::SI_SPILL_V256_SAVE: 452 case AMDGPU::SI_SPILL_V256_RESTORE: 453 return 8; 454 case AMDGPU::SI_SPILL_S160_SAVE: 455 case AMDGPU::SI_SPILL_S160_RESTORE: 456 case AMDGPU::SI_SPILL_V160_SAVE: 457 case AMDGPU::SI_SPILL_V160_RESTORE: 458 return 5; 459 case AMDGPU::SI_SPILL_S128_SAVE: 460 case AMDGPU::SI_SPILL_S128_RESTORE: 461 case AMDGPU::SI_SPILL_V128_SAVE: 462 case AMDGPU::SI_SPILL_V128_RESTORE: 463 case AMDGPU::SI_SPILL_A128_SAVE: 464 case AMDGPU::SI_SPILL_A128_RESTORE: 465 return 4; 466 case AMDGPU::SI_SPILL_S96_SAVE: 467 case AMDGPU::SI_SPILL_S96_RESTORE: 468 case AMDGPU::SI_SPILL_V96_SAVE: 469 case AMDGPU::SI_SPILL_V96_RESTORE: 470 return 3; 471 case AMDGPU::SI_SPILL_S64_SAVE: 472 case AMDGPU::SI_SPILL_S64_RESTORE: 473 case AMDGPU::SI_SPILL_V64_SAVE: 474 case AMDGPU::SI_SPILL_V64_RESTORE: 475 case AMDGPU::SI_SPILL_A64_SAVE: 476 case AMDGPU::SI_SPILL_A64_RESTORE: 477 return 2; 478 case AMDGPU::SI_SPILL_S32_SAVE: 479 case AMDGPU::SI_SPILL_S32_RESTORE: 480 case AMDGPU::SI_SPILL_V32_SAVE: 481 case AMDGPU::SI_SPILL_V32_RESTORE: 482 case AMDGPU::SI_SPILL_A32_SAVE: 483 case AMDGPU::SI_SPILL_A32_RESTORE: 484 return 1; 485 default: llvm_unreachable("Invalid spill 
opcode"); 486 } 487 } 488 489 static int getOffsetMUBUFStore(unsigned Opc) { 490 switch (Opc) { 491 case AMDGPU::BUFFER_STORE_DWORD_OFFEN: 492 return AMDGPU::BUFFER_STORE_DWORD_OFFSET; 493 case AMDGPU::BUFFER_STORE_BYTE_OFFEN: 494 return AMDGPU::BUFFER_STORE_BYTE_OFFSET; 495 case AMDGPU::BUFFER_STORE_SHORT_OFFEN: 496 return AMDGPU::BUFFER_STORE_SHORT_OFFSET; 497 case AMDGPU::BUFFER_STORE_DWORDX2_OFFEN: 498 return AMDGPU::BUFFER_STORE_DWORDX2_OFFSET; 499 case AMDGPU::BUFFER_STORE_DWORDX4_OFFEN: 500 return AMDGPU::BUFFER_STORE_DWORDX4_OFFSET; 501 case AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFEN: 502 return AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFSET; 503 case AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFEN: 504 return AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFSET; 505 default: 506 return -1; 507 } 508 } 509 510 static int getOffsetMUBUFLoad(unsigned Opc) { 511 switch (Opc) { 512 case AMDGPU::BUFFER_LOAD_DWORD_OFFEN: 513 return AMDGPU::BUFFER_LOAD_DWORD_OFFSET; 514 case AMDGPU::BUFFER_LOAD_UBYTE_OFFEN: 515 return AMDGPU::BUFFER_LOAD_UBYTE_OFFSET; 516 case AMDGPU::BUFFER_LOAD_SBYTE_OFFEN: 517 return AMDGPU::BUFFER_LOAD_SBYTE_OFFSET; 518 case AMDGPU::BUFFER_LOAD_USHORT_OFFEN: 519 return AMDGPU::BUFFER_LOAD_USHORT_OFFSET; 520 case AMDGPU::BUFFER_LOAD_SSHORT_OFFEN: 521 return AMDGPU::BUFFER_LOAD_SSHORT_OFFSET; 522 case AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN: 523 return AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET; 524 case AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN: 525 return AMDGPU::BUFFER_LOAD_DWORDX4_OFFSET; 526 case AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFEN: 527 return AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFSET; 528 case AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFEN: 529 return AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFSET; 530 case AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFEN: 531 return AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFSET; 532 case AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFEN: 533 return AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFSET; 534 case AMDGPU::BUFFER_LOAD_SHORT_D16_OFFEN: 535 return AMDGPU::BUFFER_LOAD_SHORT_D16_OFFSET; 536 case AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFEN: 537 return AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFSET; 538 default: 539 return -1; 540 } 541 } 542 543 static MachineInstrBuilder spillVGPRtoAGPR(const GCNSubtarget &ST, 544 MachineBasicBlock::iterator MI, 545 int Index, 546 unsigned Lane, 547 unsigned ValueReg, 548 bool IsKill) { 549 MachineBasicBlock *MBB = MI->getParent(); 550 MachineFunction *MF = MI->getParent()->getParent(); 551 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); 552 const SIInstrInfo *TII = ST.getInstrInfo(); 553 554 MCPhysReg Reg = MFI->getVGPRToAGPRSpill(Index, Lane); 555 556 if (Reg == AMDGPU::NoRegister) 557 return MachineInstrBuilder(); 558 559 bool IsStore = MI->mayStore(); 560 MachineRegisterInfo &MRI = MF->getRegInfo(); 561 auto *TRI = static_cast<const SIRegisterInfo*>(MRI.getTargetRegisterInfo()); 562 563 unsigned Dst = IsStore ? Reg : ValueReg; 564 unsigned Src = IsStore ? ValueReg : Reg; 565 unsigned Opc = (IsStore ^ TRI->isVGPR(MRI, Reg)) ? AMDGPU::V_ACCVGPR_WRITE_B32 566 : AMDGPU::V_ACCVGPR_READ_B32; 567 568 return BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(Opc), Dst) 569 .addReg(Src, getKillRegState(IsKill)); 570 } 571 572 // This differs from buildSpillLoadStore by only scavenging a VGPR. It does not 573 // need to handle the case where an SGPR may need to be spilled while spilling. 
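// Returns true if the access was rewritten (either to the unindexed _OFFSET
// form of the opcode or to an AGPR copy); returns false if the opcode has no
// _OFFSET form and the caller must expand the spill some other way.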
574 static bool buildMUBUFOffsetLoadStore(const GCNSubtarget &ST, 575 MachineFrameInfo &MFI, 576 MachineBasicBlock::iterator MI, 577 int Index, 578 int64_t Offset) { 579 const SIInstrInfo *TII = ST.getInstrInfo(); 580 MachineBasicBlock *MBB = MI->getParent(); 581 const DebugLoc &DL = MI->getDebugLoc(); 582 bool IsStore = MI->mayStore(); 583 584 unsigned Opc = MI->getOpcode(); 585 int LoadStoreOp = IsStore ? 586 getOffsetMUBUFStore(Opc) : getOffsetMUBUFLoad(Opc); 587 if (LoadStoreOp == -1) 588 return false; 589 590 const MachineOperand *Reg = TII->getNamedOperand(*MI, AMDGPU::OpName::vdata); 591 if (spillVGPRtoAGPR(ST, MI, Index, 0, Reg->getReg(), false).getInstr()) 592 return true; 593 594 MachineInstrBuilder NewMI = 595 BuildMI(*MBB, MI, DL, TII->get(LoadStoreOp)) 596 .add(*Reg) 597 .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::srsrc)) 598 .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)) 599 .addImm(Offset) 600 .addImm(0) // glc 601 .addImm(0) // slc 602 .addImm(0) // tfe 603 .addImm(0) // dlc 604 .addImm(0) // swz 605 .cloneMemRefs(*MI); 606 607 const MachineOperand *VDataIn = TII->getNamedOperand(*MI, 608 AMDGPU::OpName::vdata_in); 609 if (VDataIn) 610 NewMI.add(*VDataIn); 611 return true; 612 } 613 614 void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI, 615 unsigned LoadStoreOp, 616 int Index, 617 unsigned ValueReg, 618 bool IsKill, 619 unsigned ScratchRsrcReg, 620 unsigned ScratchOffsetReg, 621 int64_t InstOffset, 622 MachineMemOperand *MMO, 623 RegScavenger *RS) const { 624 MachineBasicBlock *MBB = MI->getParent(); 625 MachineFunction *MF = MI->getParent()->getParent(); 626 const SIInstrInfo *TII = ST.getInstrInfo(); 627 const MachineFrameInfo &MFI = MF->getFrameInfo(); 628 629 const MCInstrDesc &Desc = TII->get(LoadStoreOp); 630 const DebugLoc &DL = MI->getDebugLoc(); 631 bool IsStore = Desc.mayStore(); 632 633 bool Scavenged = false; 634 unsigned SOffset = ScratchOffsetReg; 635 636 const unsigned EltSize = 4; 637 const TargetRegisterClass *RC = getRegClassForReg(MF->getRegInfo(), ValueReg); 638 unsigned NumSubRegs = AMDGPU::getRegBitWidth(RC->getID()) / (EltSize * CHAR_BIT); 639 unsigned Size = NumSubRegs * EltSize; 640 int64_t Offset = InstOffset + MFI.getObjectOffset(Index); 641 int64_t ScratchOffsetRegDelta = 0; 642 643 unsigned Align = MFI.getObjectAlignment(Index); 644 const MachinePointerInfo &BasePtrInfo = MMO->getPointerInfo(); 645 646 Register TmpReg = 647 hasAGPRs(RC) ? TII->getNamedOperand(*MI, AMDGPU::OpName::tmp)->getReg() 648 : Register(); 649 650 assert((Offset % EltSize) == 0 && "unexpected VGPR spill offset"); 651 652 if (!isUInt<12>(Offset + Size - EltSize)) { 653 SOffset = AMDGPU::NoRegister; 654 655 // We currently only support spilling VGPRs to EltSize boundaries, meaning 656 // we can simplify the adjustment of Offset here to just scale with 657 // WavefrontSize. 658 Offset *= ST.getWavefrontSize(); 659 660 // We don't have access to the register scavenger if this function is called 661 // during PEI::scavengeFrameVirtualRegs(). 662 if (RS) 663 SOffset = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, MI, 0, false); 664 665 if (SOffset == AMDGPU::NoRegister) { 666 // There are no free SGPRs, and since we are in the process of spilling 667 // VGPRs too. Since we need a VGPR in order to spill SGPRs (this is true 668 // on SI/CI and on VI it is true until we implement spilling using scalar 669 // stores), we have no way to free up an SGPR. 
Our solution here is to 670 // add the offset directly to the ScratchOffset register, and then 671 // subtract the offset after the spill to return ScratchOffset to it's 672 // original value. 673 SOffset = ScratchOffsetReg; 674 ScratchOffsetRegDelta = Offset; 675 } else { 676 Scavenged = true; 677 } 678 679 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), SOffset) 680 .addReg(ScratchOffsetReg) 681 .addImm(Offset); 682 683 Offset = 0; 684 } 685 686 for (unsigned i = 0, e = NumSubRegs; i != e; ++i, Offset += EltSize) { 687 Register SubReg = NumSubRegs == 1 688 ? Register(ValueReg) 689 : getSubReg(ValueReg, getSubRegFromChannel(i)); 690 691 unsigned SOffsetRegState = 0; 692 unsigned SrcDstRegState = getDefRegState(!IsStore); 693 if (i + 1 == e) { 694 SOffsetRegState |= getKillRegState(Scavenged); 695 // The last implicit use carries the "Kill" flag. 696 SrcDstRegState |= getKillRegState(IsKill); 697 } 698 699 auto MIB = spillVGPRtoAGPR(ST, MI, Index, i, SubReg, IsKill); 700 701 if (!MIB.getInstr()) { 702 unsigned FinalReg = SubReg; 703 if (TmpReg != AMDGPU::NoRegister) { 704 if (IsStore) 705 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_ACCVGPR_READ_B32), TmpReg) 706 .addReg(SubReg, getKillRegState(IsKill)); 707 SubReg = TmpReg; 708 } 709 710 MachinePointerInfo PInfo = BasePtrInfo.getWithOffset(EltSize * i); 711 MachineMemOperand *NewMMO 712 = MF->getMachineMemOperand(PInfo, MMO->getFlags(), 713 EltSize, MinAlign(Align, EltSize * i)); 714 715 MIB = BuildMI(*MBB, MI, DL, Desc) 716 .addReg(SubReg, getDefRegState(!IsStore) | getKillRegState(IsKill)) 717 .addReg(ScratchRsrcReg) 718 .addReg(SOffset, SOffsetRegState) 719 .addImm(Offset) 720 .addImm(0) // glc 721 .addImm(0) // slc 722 .addImm(0) // tfe 723 .addImm(0) // dlc 724 .addImm(0) // swz 725 .addMemOperand(NewMMO); 726 727 if (!IsStore && TmpReg != AMDGPU::NoRegister) 728 MIB = BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_ACCVGPR_WRITE_B32), 729 FinalReg) 730 .addReg(TmpReg, RegState::Kill); 731 } 732 733 if (NumSubRegs > 1) 734 MIB.addReg(ValueReg, RegState::Implicit | SrcDstRegState); 735 } 736 737 if (ScratchOffsetRegDelta != 0) { 738 // Subtract the offset we added to the ScratchOffset register. 
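    // This undoes the in-place S_ADD_U32 emitted above for the case where no
    // SGPR could be scavenged to hold a separate scratch offset.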
739 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_SUB_U32), ScratchOffsetReg) 740 .addReg(ScratchOffsetReg) 741 .addImm(ScratchOffsetRegDelta); 742 } 743 } 744 745 bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI, 746 int Index, 747 RegScavenger *RS, 748 bool OnlyToVGPR) const { 749 MachineBasicBlock *MBB = MI->getParent(); 750 MachineFunction *MF = MBB->getParent(); 751 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); 752 DenseSet<unsigned> SGPRSpillVGPRDefinedSet; 753 754 ArrayRef<SIMachineFunctionInfo::SpilledReg> VGPRSpills 755 = MFI->getSGPRToVGPRSpills(Index); 756 bool SpillToVGPR = !VGPRSpills.empty(); 757 if (OnlyToVGPR && !SpillToVGPR) 758 return false; 759 760 const SIInstrInfo *TII = ST.getInstrInfo(); 761 762 Register SuperReg = MI->getOperand(0).getReg(); 763 bool IsKill = MI->getOperand(0).isKill(); 764 const DebugLoc &DL = MI->getDebugLoc(); 765 766 MachineFrameInfo &FrameInfo = MF->getFrameInfo(); 767 768 assert(SpillToVGPR || (SuperReg != MFI->getStackPtrOffsetReg() && 769 SuperReg != MFI->getFrameOffsetReg() && 770 SuperReg != MFI->getScratchWaveOffsetReg())); 771 772 assert(SuperReg != AMDGPU::M0 && "m0 should never spill"); 773 774 unsigned EltSize = 4; 775 const TargetRegisterClass *RC = getPhysRegClass(SuperReg); 776 777 ArrayRef<int16_t> SplitParts = getRegSplitParts(RC, EltSize); 778 unsigned NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size(); 779 780 // Scavenged temporary VGPR to use. It must be scavenged once for any number 781 // of spilled subregs. 782 Register TmpVGPR; 783 784 // SubReg carries the "Kill" flag when SubReg == SuperReg. 785 unsigned SubKillState = getKillRegState((NumSubRegs == 1) && IsKill); 786 for (unsigned i = 0, e = NumSubRegs; i < e; ++i) { 787 Register SubReg = 788 NumSubRegs == 1 ? SuperReg : getSubReg(SuperReg, SplitParts[i]); 789 790 if (SpillToVGPR) { 791 SIMachineFunctionInfo::SpilledReg Spill = VGPRSpills[i]; 792 793 // During SGPR spilling to VGPR, determine if the VGPR is defined. The 794 // only circumstance in which we say it is undefined is when it is the 795 // first spill to this VGPR in the first basic block. 796 bool VGPRDefined = true; 797 if (MBB == &MF->front()) 798 VGPRDefined = !SGPRSpillVGPRDefinedSet.insert(Spill.VGPR).second; 799 800 // Mark the "old value of vgpr" input undef only if this is the first sgpr 801 // spill to this specific vgpr in the first basic block. 802 BuildMI(*MBB, MI, DL, 803 TII->getMCOpcodeFromPseudo(AMDGPU::V_WRITELANE_B32), 804 Spill.VGPR) 805 .addReg(SubReg, getKillRegState(IsKill)) 806 .addImm(Spill.Lane) 807 .addReg(Spill.VGPR, VGPRDefined ? 0 : RegState::Undef); 808 809 // FIXME: Since this spills to another register instead of an actual 810 // frame index, we should delete the frame index when all references to 811 // it are fixed. 812 } else { 813 // XXX - Can to VGPR spill fail for some subregisters but not others? 814 if (OnlyToVGPR) 815 return false; 816 817 // Spill SGPR to a frame index. 818 if (!TmpVGPR.isValid()) 819 TmpVGPR = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0); 820 821 MachineInstrBuilder Mov 822 = BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR) 823 .addReg(SubReg, SubKillState); 824 825 // There could be undef components of a spilled super register. 826 // TODO: Can we detect this and skip the spill? 827 if (NumSubRegs > 1) { 828 // The last implicit use of the SuperReg carries the "Kill" flag. 
829 unsigned SuperKillState = 0; 830 if (i + 1 == e) 831 SuperKillState |= getKillRegState(IsKill); 832 Mov.addReg(SuperReg, RegState::Implicit | SuperKillState); 833 } 834 835 unsigned Align = FrameInfo.getObjectAlignment(Index); 836 MachinePointerInfo PtrInfo 837 = MachinePointerInfo::getFixedStack(*MF, Index, EltSize * i); 838 MachineMemOperand *MMO 839 = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore, 840 EltSize, MinAlign(Align, EltSize * i)); 841 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::SI_SPILL_V32_SAVE)) 842 .addReg(TmpVGPR, RegState::Kill) // src 843 .addFrameIndex(Index) // vaddr 844 .addReg(MFI->getScratchRSrcReg()) // srrsrc 845 .addReg(MFI->getStackPtrOffsetReg()) // soffset 846 .addImm(i * 4) // offset 847 .addMemOperand(MMO); 848 } 849 } 850 851 MI->eraseFromParent(); 852 MFI->addToSpilledSGPRs(NumSubRegs); 853 return true; 854 } 855 856 bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI, 857 int Index, 858 RegScavenger *RS, 859 bool OnlyToVGPR) const { 860 MachineFunction *MF = MI->getParent()->getParent(); 861 MachineBasicBlock *MBB = MI->getParent(); 862 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); 863 864 ArrayRef<SIMachineFunctionInfo::SpilledReg> VGPRSpills 865 = MFI->getSGPRToVGPRSpills(Index); 866 bool SpillToVGPR = !VGPRSpills.empty(); 867 if (OnlyToVGPR && !SpillToVGPR) 868 return false; 869 870 MachineFrameInfo &FrameInfo = MF->getFrameInfo(); 871 const SIInstrInfo *TII = ST.getInstrInfo(); 872 const DebugLoc &DL = MI->getDebugLoc(); 873 874 Register SuperReg = MI->getOperand(0).getReg(); 875 876 assert(SuperReg != AMDGPU::M0 && "m0 should never spill"); 877 878 unsigned EltSize = 4; 879 880 const TargetRegisterClass *RC = getPhysRegClass(SuperReg); 881 882 ArrayRef<int16_t> SplitParts = getRegSplitParts(RC, EltSize); 883 unsigned NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size(); 884 885 Register TmpVGPR; 886 887 for (unsigned i = 0, e = NumSubRegs; i < e; ++i) { 888 Register SubReg = 889 NumSubRegs == 1 ? SuperReg : getSubReg(SuperReg, SplitParts[i]); 890 891 if (SpillToVGPR) { 892 SIMachineFunctionInfo::SpilledReg Spill = VGPRSpills[i]; 893 auto MIB = 894 BuildMI(*MBB, MI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_READLANE_B32), 895 SubReg) 896 .addReg(Spill.VGPR) 897 .addImm(Spill.Lane); 898 899 if (NumSubRegs > 1 && i == 0) 900 MIB.addReg(SuperReg, RegState::ImplicitDefine); 901 } else { 902 if (OnlyToVGPR) 903 return false; 904 905 // Restore SGPR from a stack slot. 906 // FIXME: We should use S_LOAD_DWORD here for VI. 907 if (!TmpVGPR.isValid()) 908 TmpVGPR = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0); 909 unsigned Align = FrameInfo.getObjectAlignment(Index); 910 911 MachinePointerInfo PtrInfo 912 = MachinePointerInfo::getFixedStack(*MF, Index, EltSize * i); 913 914 MachineMemOperand *MMO = MF->getMachineMemOperand(PtrInfo, 915 MachineMemOperand::MOLoad, EltSize, 916 MinAlign(Align, EltSize * i)); 917 918 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::SI_SPILL_V32_RESTORE), TmpVGPR) 919 .addFrameIndex(Index) // vaddr 920 .addReg(MFI->getScratchRSrcReg()) // srsrc 921 .addReg(MFI->getStackPtrOffsetReg()) // soffset 922 .addImm(i * 4) // offset 923 .addMemOperand(MMO); 924 925 auto MIB = 926 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), SubReg) 927 .addReg(TmpVGPR, RegState::Kill); 928 929 if (NumSubRegs > 1) 930 MIB.addReg(MI->getOperand(0).getReg(), RegState::ImplicitDefine); 931 } 932 } 933 934 MI->eraseFromParent(); 935 return true; 936 } 937 938 /// Special case of eliminateFrameIndex. 
Returns true if the SGPR was spilled to 939 /// a VGPR and the stack slot can be safely eliminated when all other users are 940 /// handled. 941 bool SIRegisterInfo::eliminateSGPRToVGPRSpillFrameIndex( 942 MachineBasicBlock::iterator MI, 943 int FI, 944 RegScavenger *RS) const { 945 switch (MI->getOpcode()) { 946 case AMDGPU::SI_SPILL_S1024_SAVE: 947 case AMDGPU::SI_SPILL_S512_SAVE: 948 case AMDGPU::SI_SPILL_S256_SAVE: 949 case AMDGPU::SI_SPILL_S160_SAVE: 950 case AMDGPU::SI_SPILL_S128_SAVE: 951 case AMDGPU::SI_SPILL_S96_SAVE: 952 case AMDGPU::SI_SPILL_S64_SAVE: 953 case AMDGPU::SI_SPILL_S32_SAVE: 954 return spillSGPR(MI, FI, RS, true); 955 case AMDGPU::SI_SPILL_S1024_RESTORE: 956 case AMDGPU::SI_SPILL_S512_RESTORE: 957 case AMDGPU::SI_SPILL_S256_RESTORE: 958 case AMDGPU::SI_SPILL_S160_RESTORE: 959 case AMDGPU::SI_SPILL_S128_RESTORE: 960 case AMDGPU::SI_SPILL_S96_RESTORE: 961 case AMDGPU::SI_SPILL_S64_RESTORE: 962 case AMDGPU::SI_SPILL_S32_RESTORE: 963 return restoreSGPR(MI, FI, RS, true); 964 default: 965 llvm_unreachable("not an SGPR spill instruction"); 966 } 967 } 968 969 void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, 970 int SPAdj, unsigned FIOperandNum, 971 RegScavenger *RS) const { 972 MachineFunction *MF = MI->getParent()->getParent(); 973 MachineBasicBlock *MBB = MI->getParent(); 974 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); 975 MachineFrameInfo &FrameInfo = MF->getFrameInfo(); 976 const SIInstrInfo *TII = ST.getInstrInfo(); 977 DebugLoc DL = MI->getDebugLoc(); 978 979 assert(SPAdj == 0 && "unhandled SP adjustment in call sequence?"); 980 981 MachineOperand &FIOp = MI->getOperand(FIOperandNum); 982 int Index = MI->getOperand(FIOperandNum).getIndex(); 983 984 Register FrameReg = getFrameRegister(*MF); 985 986 switch (MI->getOpcode()) { 987 // SGPR register spill 988 case AMDGPU::SI_SPILL_S1024_SAVE: 989 case AMDGPU::SI_SPILL_S512_SAVE: 990 case AMDGPU::SI_SPILL_S256_SAVE: 991 case AMDGPU::SI_SPILL_S160_SAVE: 992 case AMDGPU::SI_SPILL_S128_SAVE: 993 case AMDGPU::SI_SPILL_S96_SAVE: 994 case AMDGPU::SI_SPILL_S64_SAVE: 995 case AMDGPU::SI_SPILL_S32_SAVE: { 996 spillSGPR(MI, Index, RS); 997 break; 998 } 999 1000 // SGPR register restore 1001 case AMDGPU::SI_SPILL_S1024_RESTORE: 1002 case AMDGPU::SI_SPILL_S512_RESTORE: 1003 case AMDGPU::SI_SPILL_S256_RESTORE: 1004 case AMDGPU::SI_SPILL_S160_RESTORE: 1005 case AMDGPU::SI_SPILL_S128_RESTORE: 1006 case AMDGPU::SI_SPILL_S96_RESTORE: 1007 case AMDGPU::SI_SPILL_S64_RESTORE: 1008 case AMDGPU::SI_SPILL_S32_RESTORE: { 1009 restoreSGPR(MI, Index, RS); 1010 break; 1011 } 1012 1013 // VGPR register spill 1014 case AMDGPU::SI_SPILL_V1024_SAVE: 1015 case AMDGPU::SI_SPILL_V512_SAVE: 1016 case AMDGPU::SI_SPILL_V256_SAVE: 1017 case AMDGPU::SI_SPILL_V160_SAVE: 1018 case AMDGPU::SI_SPILL_V128_SAVE: 1019 case AMDGPU::SI_SPILL_V96_SAVE: 1020 case AMDGPU::SI_SPILL_V64_SAVE: 1021 case AMDGPU::SI_SPILL_V32_SAVE: 1022 case AMDGPU::SI_SPILL_A1024_SAVE: 1023 case AMDGPU::SI_SPILL_A512_SAVE: 1024 case AMDGPU::SI_SPILL_A128_SAVE: 1025 case AMDGPU::SI_SPILL_A64_SAVE: 1026 case AMDGPU::SI_SPILL_A32_SAVE: { 1027 const MachineOperand *VData = TII->getNamedOperand(*MI, 1028 AMDGPU::OpName::vdata); 1029 assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() == 1030 MFI->getStackPtrOffsetReg()); 1031 1032 buildSpillLoadStore(MI, AMDGPU::BUFFER_STORE_DWORD_OFFSET, 1033 Index, 1034 VData->getReg(), VData->isKill(), 1035 TII->getNamedOperand(*MI, AMDGPU::OpName::srsrc)->getReg(), 1036 FrameReg, 1037 
TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(), 1038 *MI->memoperands_begin(), 1039 RS); 1040 MFI->addToSpilledVGPRs(getNumSubRegsForSpillOp(MI->getOpcode())); 1041 MI->eraseFromParent(); 1042 break; 1043 } 1044 case AMDGPU::SI_SPILL_V32_RESTORE: 1045 case AMDGPU::SI_SPILL_V64_RESTORE: 1046 case AMDGPU::SI_SPILL_V96_RESTORE: 1047 case AMDGPU::SI_SPILL_V128_RESTORE: 1048 case AMDGPU::SI_SPILL_V160_RESTORE: 1049 case AMDGPU::SI_SPILL_V256_RESTORE: 1050 case AMDGPU::SI_SPILL_V512_RESTORE: 1051 case AMDGPU::SI_SPILL_V1024_RESTORE: 1052 case AMDGPU::SI_SPILL_A32_RESTORE: 1053 case AMDGPU::SI_SPILL_A64_RESTORE: 1054 case AMDGPU::SI_SPILL_A128_RESTORE: 1055 case AMDGPU::SI_SPILL_A512_RESTORE: 1056 case AMDGPU::SI_SPILL_A1024_RESTORE: { 1057 const MachineOperand *VData = TII->getNamedOperand(*MI, 1058 AMDGPU::OpName::vdata); 1059 assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() == 1060 MFI->getStackPtrOffsetReg()); 1061 1062 buildSpillLoadStore(MI, AMDGPU::BUFFER_LOAD_DWORD_OFFSET, 1063 Index, 1064 VData->getReg(), VData->isKill(), 1065 TII->getNamedOperand(*MI, AMDGPU::OpName::srsrc)->getReg(), 1066 FrameReg, 1067 TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(), 1068 *MI->memoperands_begin(), 1069 RS); 1070 MI->eraseFromParent(); 1071 break; 1072 } 1073 1074 default: { 1075 const DebugLoc &DL = MI->getDebugLoc(); 1076 bool IsMUBUF = TII->isMUBUF(*MI); 1077 1078 if (!IsMUBUF && !MFI->isEntryFunction()) { 1079 // Convert to an absolute stack address by finding the offset from the 1080 // scratch wave base and scaling by the wave size. 1081 // 1082 // In an entry function/kernel the offset is already the absolute 1083 // address relative to the frame register. 1084 1085 Register TmpDiffReg = 1086 RS->scavengeRegister(&AMDGPU::SReg_32_XM0RegClass, MI, 0, false); 1087 1088 // If there's no free SGPR, in-place modify the FP 1089 Register DiffReg = TmpDiffReg.isValid() ? TmpDiffReg : FrameReg; 1090 1091 bool IsCopy = MI->getOpcode() == AMDGPU::V_MOV_B32_e32; 1092 Register ResultReg = IsCopy ? 1093 MI->getOperand(0).getReg() : 1094 RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0); 1095 1096 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_SUB_U32), DiffReg) 1097 .addReg(FrameReg) 1098 .addReg(MFI->getScratchWaveOffsetReg()); 1099 1100 int64_t Offset = FrameInfo.getObjectOffset(Index); 1101 if (Offset == 0) { 1102 // XXX - This never happens because of emergency scavenging slot at 0? 1103 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64), ResultReg) 1104 .addImm(ST.getWavefrontSizeLog2()) 1105 .addReg(DiffReg); 1106 } else { 1107 if (auto MIB = TII->getAddNoCarry(*MBB, MI, DL, ResultReg, *RS)) { 1108 Register ScaledReg = 1109 RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MIB, 0); 1110 1111 BuildMI(*MBB, *MIB, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64), 1112 ScaledReg) 1113 .addImm(ST.getWavefrontSizeLog2()) 1114 .addReg(DiffReg, RegState::Kill); 1115 1116 const bool IsVOP2 = MIB->getOpcode() == AMDGPU::V_ADD_U32_e32; 1117 1118 // TODO: Fold if use instruction is another add of a constant. 1119 if (IsVOP2 || AMDGPU::isInlinableLiteral32(Offset, ST.hasInv2PiInlineImm())) { 1120 // FIXME: This can fail 1121 MIB.addImm(Offset); 1122 MIB.addReg(ScaledReg, RegState::Kill); 1123 if (!IsVOP2) 1124 MIB.addImm(0); // clamp bit 1125 } else { 1126 assert(MIB->getOpcode() == AMDGPU::V_ADD_I32_e64 && 1127 "Need to reuse carry out register"); 1128 1129 // Use scavenged unused carry out as offset register. 
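          // In wave64 the carry out is a 64-bit SGPR pair, so only its low
          // half (sub0) is needed to hold the constant; in wave32 it is
          // already a single 32-bit SGPR.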
1130 Register ConstOffsetReg; 1131 if (!isWave32) 1132 ConstOffsetReg = getSubReg(MIB.getReg(1), AMDGPU::sub0); 1133 else 1134 ConstOffsetReg = MIB.getReg(1); 1135 1136 BuildMI(*MBB, *MIB, DL, TII->get(AMDGPU::S_MOV_B32), ConstOffsetReg) 1137 .addImm(Offset); 1138 MIB.addReg(ConstOffsetReg, RegState::Kill); 1139 MIB.addReg(ScaledReg, RegState::Kill); 1140 MIB.addImm(0); // clamp bit 1141 } 1142 } else { 1143 // We have to produce a carry out, and there isn't a free SGPR pair 1144 // for it. We can keep the whole computation on the SALU to avoid 1145 // clobbering an additional register at the cost of an extra mov. 1146 1147 // We may have 1 free scratch SGPR even though a carry out is 1148 // unavailable. Only one additional mov is needed. 1149 Register TmpScaledReg = 1150 RS->scavengeRegister(&AMDGPU::SReg_32_XM0RegClass, MI, 0, false); 1151 Register ScaledReg = TmpScaledReg.isValid() ? TmpScaledReg : DiffReg; 1152 1153 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHR_B32), ScaledReg) 1154 .addReg(DiffReg, RegState::Kill) 1155 .addImm(ST.getWavefrontSizeLog2()); 1156 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), ScaledReg) 1157 .addReg(ScaledReg, RegState::Kill) 1158 .addImm(Offset); 1159 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), ResultReg) 1160 .addReg(ScaledReg, RegState::Kill); 1161 1162 // If there were truly no free SGPRs, we need to undo everything. 1163 if (!TmpScaledReg.isValid()) { 1164 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_SUB_U32), ScaledReg) 1165 .addReg(ScaledReg, RegState::Kill) 1166 .addImm(Offset); 1167 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHL_B32), ScaledReg) 1168 .addReg(DiffReg, RegState::Kill) 1169 .addImm(ST.getWavefrontSizeLog2()); 1170 } 1171 } 1172 } 1173 1174 if (!TmpDiffReg.isValid()) { 1175 // Restore the FP. 1176 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), FrameReg) 1177 .addReg(FrameReg) 1178 .addReg(MFI->getScratchWaveOffsetReg()); 1179 } 1180 1181 // Don't introduce an extra copy if we're just materializing in a mov. 1182 if (IsCopy) 1183 MI->eraseFromParent(); 1184 else 1185 FIOp.ChangeToRegister(ResultReg, false, false, true); 1186 return; 1187 } 1188 1189 if (IsMUBUF) { 1190 // Disable offen so we don't need a 0 vgpr base. 1191 assert(static_cast<int>(FIOperandNum) == 1192 AMDGPU::getNamedOperandIdx(MI->getOpcode(), 1193 AMDGPU::OpName::vaddr)); 1194 1195 assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() == 1196 MFI->getStackPtrOffsetReg()); 1197 1198 TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->setReg(FrameReg); 1199 1200 int64_t Offset = FrameInfo.getObjectOffset(Index); 1201 int64_t OldImm 1202 = TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(); 1203 int64_t NewOffset = OldImm + Offset; 1204 1205 if (isUInt<12>(NewOffset) && 1206 buildMUBUFOffsetLoadStore(ST, FrameInfo, MI, Index, NewOffset)) { 1207 MI->eraseFromParent(); 1208 return; 1209 } 1210 } 1211 1212 // If the offset is simply too big, don't convert to a scratch wave offset 1213 // relative index. 
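      // Instead, use the raw frame offset directly, materializing it in a
      // VGPR first if it is not a legal immediate for this operand.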
1214 1215 int64_t Offset = FrameInfo.getObjectOffset(Index); 1216 FIOp.ChangeToImmediate(Offset); 1217 if (!TII->isImmOperandLegal(*MI, FIOperandNum, FIOp)) { 1218 Register TmpReg = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0); 1219 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg) 1220 .addImm(Offset); 1221 FIOp.ChangeToRegister(TmpReg, false, false, true); 1222 } 1223 } 1224 } 1225 } 1226 1227 StringRef SIRegisterInfo::getRegAsmName(unsigned Reg) const { 1228 return AMDGPUInstPrinter::getRegisterName(Reg); 1229 } 1230 1231 // FIXME: This is very slow. It might be worth creating a map from physreg to 1232 // register class. 1233 const TargetRegisterClass *SIRegisterInfo::getPhysRegClass(unsigned Reg) const { 1234 assert(!Register::isVirtualRegister(Reg)); 1235 1236 static const TargetRegisterClass *const BaseClasses[] = { 1237 &AMDGPU::VGPR_32RegClass, 1238 &AMDGPU::SReg_32RegClass, 1239 &AMDGPU::AGPR_32RegClass, 1240 &AMDGPU::VReg_64RegClass, 1241 &AMDGPU::SReg_64RegClass, 1242 &AMDGPU::AReg_64RegClass, 1243 &AMDGPU::VReg_96RegClass, 1244 &AMDGPU::SReg_96RegClass, 1245 &AMDGPU::VReg_128RegClass, 1246 &AMDGPU::SReg_128RegClass, 1247 &AMDGPU::AReg_128RegClass, 1248 &AMDGPU::VReg_160RegClass, 1249 &AMDGPU::SReg_160RegClass, 1250 &AMDGPU::VReg_256RegClass, 1251 &AMDGPU::SReg_256RegClass, 1252 &AMDGPU::VReg_512RegClass, 1253 &AMDGPU::SReg_512RegClass, 1254 &AMDGPU::AReg_512RegClass, 1255 &AMDGPU::SReg_1024RegClass, 1256 &AMDGPU::VReg_1024RegClass, 1257 &AMDGPU::AReg_1024RegClass, 1258 &AMDGPU::SCC_CLASSRegClass, 1259 &AMDGPU::Pseudo_SReg_32RegClass, 1260 &AMDGPU::Pseudo_SReg_128RegClass, 1261 }; 1262 1263 for (const TargetRegisterClass *BaseClass : BaseClasses) { 1264 if (BaseClass->contains(Reg)) { 1265 return BaseClass; 1266 } 1267 } 1268 return nullptr; 1269 } 1270 1271 // TODO: It might be helpful to have some target specific flags in 1272 // TargetRegisterClass to mark which classes are VGPRs to make this trivial. 
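// In the meantime, membership is determined by whether the class shares a
// common subclass with the VGPR (or AGPR) class of the same register size.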
1273 bool SIRegisterInfo::hasVGPRs(const TargetRegisterClass *RC) const { 1274 unsigned Size = getRegSizeInBits(*RC); 1275 switch (Size) { 1276 case 32: 1277 return getCommonSubClass(&AMDGPU::VGPR_32RegClass, RC) != nullptr; 1278 case 64: 1279 return getCommonSubClass(&AMDGPU::VReg_64RegClass, RC) != nullptr; 1280 case 96: 1281 return getCommonSubClass(&AMDGPU::VReg_96RegClass, RC) != nullptr; 1282 case 128: 1283 return getCommonSubClass(&AMDGPU::VReg_128RegClass, RC) != nullptr; 1284 case 160: 1285 return getCommonSubClass(&AMDGPU::VReg_160RegClass, RC) != nullptr; 1286 case 256: 1287 return getCommonSubClass(&AMDGPU::VReg_256RegClass, RC) != nullptr; 1288 case 512: 1289 return getCommonSubClass(&AMDGPU::VReg_512RegClass, RC) != nullptr; 1290 case 1024: 1291 return getCommonSubClass(&AMDGPU::VReg_1024RegClass, RC) != nullptr; 1292 case 1: 1293 return getCommonSubClass(&AMDGPU::VReg_1RegClass, RC) != nullptr; 1294 default: 1295 assert(Size < 32 && "Invalid register class size"); 1296 return false; 1297 } 1298 } 1299 1300 bool SIRegisterInfo::hasAGPRs(const TargetRegisterClass *RC) const { 1301 unsigned Size = getRegSizeInBits(*RC); 1302 if (Size < 32) 1303 return false; 1304 switch (Size) { 1305 case 32: 1306 return getCommonSubClass(&AMDGPU::AGPR_32RegClass, RC) != nullptr; 1307 case 64: 1308 return getCommonSubClass(&AMDGPU::AReg_64RegClass, RC) != nullptr; 1309 case 96: 1310 return false; 1311 case 128: 1312 return getCommonSubClass(&AMDGPU::AReg_128RegClass, RC) != nullptr; 1313 case 160: 1314 case 256: 1315 return false; 1316 case 512: 1317 return getCommonSubClass(&AMDGPU::AReg_512RegClass, RC) != nullptr; 1318 case 1024: 1319 return getCommonSubClass(&AMDGPU::AReg_1024RegClass, RC) != nullptr; 1320 default: 1321 llvm_unreachable("Invalid register class size"); 1322 } 1323 } 1324 1325 const TargetRegisterClass *SIRegisterInfo::getEquivalentVGPRClass( 1326 const TargetRegisterClass *SRC) const { 1327 switch (getRegSizeInBits(*SRC)) { 1328 case 32: 1329 return &AMDGPU::VGPR_32RegClass; 1330 case 64: 1331 return &AMDGPU::VReg_64RegClass; 1332 case 96: 1333 return &AMDGPU::VReg_96RegClass; 1334 case 128: 1335 return &AMDGPU::VReg_128RegClass; 1336 case 160: 1337 return &AMDGPU::VReg_160RegClass; 1338 case 256: 1339 return &AMDGPU::VReg_256RegClass; 1340 case 512: 1341 return &AMDGPU::VReg_512RegClass; 1342 case 1024: 1343 return &AMDGPU::VReg_1024RegClass; 1344 case 1: 1345 return &AMDGPU::VReg_1RegClass; 1346 default: 1347 llvm_unreachable("Invalid register class size"); 1348 } 1349 } 1350 1351 const TargetRegisterClass *SIRegisterInfo::getEquivalentAGPRClass( 1352 const TargetRegisterClass *SRC) const { 1353 switch (getRegSizeInBits(*SRC)) { 1354 case 32: 1355 return &AMDGPU::AGPR_32RegClass; 1356 case 64: 1357 return &AMDGPU::AReg_64RegClass; 1358 case 128: 1359 return &AMDGPU::AReg_128RegClass; 1360 case 512: 1361 return &AMDGPU::AReg_512RegClass; 1362 case 1024: 1363 return &AMDGPU::AReg_1024RegClass; 1364 default: 1365 llvm_unreachable("Invalid register class size"); 1366 } 1367 } 1368 1369 const TargetRegisterClass *SIRegisterInfo::getEquivalentSGPRClass( 1370 const TargetRegisterClass *VRC) const { 1371 switch (getRegSizeInBits(*VRC)) { 1372 case 32: 1373 return &AMDGPU::SGPR_32RegClass; 1374 case 64: 1375 return &AMDGPU::SReg_64RegClass; 1376 case 96: 1377 return &AMDGPU::SReg_96RegClass; 1378 case 128: 1379 return &AMDGPU::SGPR_128RegClass; 1380 case 160: 1381 return &AMDGPU::SReg_160RegClass; 1382 case 256: 1383 return &AMDGPU::SReg_256RegClass; 1384 case 512: 1385 return 
&AMDGPU::SReg_512RegClass; 1386 case 1024: 1387 return &AMDGPU::SReg_1024RegClass; 1388 default: 1389 llvm_unreachable("Invalid register class size"); 1390 } 1391 } 1392 1393 const TargetRegisterClass *SIRegisterInfo::getSubRegClass( 1394 const TargetRegisterClass *RC, unsigned SubIdx) const { 1395 if (SubIdx == AMDGPU::NoSubRegister) 1396 return RC; 1397 1398 // We can assume that each lane corresponds to one 32-bit register. 1399 unsigned Count = getSubRegIndexLaneMask(SubIdx).getNumLanes(); 1400 if (isSGPRClass(RC)) { 1401 switch (Count) { 1402 case 1: 1403 return &AMDGPU::SGPR_32RegClass; 1404 case 2: 1405 return &AMDGPU::SReg_64RegClass; 1406 case 3: 1407 return &AMDGPU::SReg_96RegClass; 1408 case 4: 1409 return &AMDGPU::SGPR_128RegClass; 1410 case 5: 1411 return &AMDGPU::SReg_160RegClass; 1412 case 8: 1413 return &AMDGPU::SReg_256RegClass; 1414 case 16: 1415 return &AMDGPU::SReg_512RegClass; 1416 case 32: /* fall-through */ 1417 default: 1418 llvm_unreachable("Invalid sub-register class size"); 1419 } 1420 } else if (hasAGPRs(RC)) { 1421 switch (Count) { 1422 case 1: 1423 return &AMDGPU::AGPR_32RegClass; 1424 case 2: 1425 return &AMDGPU::AReg_64RegClass; 1426 case 4: 1427 return &AMDGPU::AReg_128RegClass; 1428 case 16: 1429 return &AMDGPU::AReg_512RegClass; 1430 case 32: /* fall-through */ 1431 default: 1432 llvm_unreachable("Invalid sub-register class size"); 1433 } 1434 } else { 1435 switch (Count) { 1436 case 1: 1437 return &AMDGPU::VGPR_32RegClass; 1438 case 2: 1439 return &AMDGPU::VReg_64RegClass; 1440 case 3: 1441 return &AMDGPU::VReg_96RegClass; 1442 case 4: 1443 return &AMDGPU::VReg_128RegClass; 1444 case 5: 1445 return &AMDGPU::VReg_160RegClass; 1446 case 8: 1447 return &AMDGPU::VReg_256RegClass; 1448 case 16: 1449 return &AMDGPU::VReg_512RegClass; 1450 case 32: /* fall-through */ 1451 default: 1452 llvm_unreachable("Invalid sub-register class size"); 1453 } 1454 } 1455 } 1456 1457 bool SIRegisterInfo::opCanUseInlineConstant(unsigned OpType) const { 1458 if (OpType >= AMDGPU::OPERAND_REG_INLINE_AC_FIRST && 1459 OpType <= AMDGPU::OPERAND_REG_INLINE_AC_LAST) 1460 return !ST.hasMFMAInlineLiteralBug(); 1461 1462 return OpType >= AMDGPU::OPERAND_SRC_FIRST && 1463 OpType <= AMDGPU::OPERAND_SRC_LAST; 1464 } 1465 1466 bool SIRegisterInfo::shouldRewriteCopySrc( 1467 const TargetRegisterClass *DefRC, 1468 unsigned DefSubReg, 1469 const TargetRegisterClass *SrcRC, 1470 unsigned SrcSubReg) const { 1471 // We want to prefer the smallest register class possible, so we don't want to 1472 // stop and rewrite on anything that looks like a subregister 1473 // extract. Operations mostly don't care about the super register class, so we 1474 // only want to stop on the most basic of copies between the same register 1475 // class. 1476 // 1477 // e.g. if we have something like 1478 // %0 = ... 1479 // %1 = ... 1480 // %2 = REG_SEQUENCE %0, sub0, %1, sub1, %2, sub2 1481 // %3 = COPY %2, sub0 1482 // 1483 // We want to look through the COPY to find: 1484 // => %3 = COPY %0 1485 1486 // Plain copy. 1487 return getCommonSubClass(DefRC, SrcRC) != nullptr; 1488 } 1489 1490 /// Returns a register that is not used at any point in the function. 1491 /// If all registers are used, then this function will return 1492 // AMDGPU::NoRegister. 
1493 unsigned 1494 SIRegisterInfo::findUnusedRegister(const MachineRegisterInfo &MRI, 1495 const TargetRegisterClass *RC, 1496 const MachineFunction &MF) const { 1497 1498 for (unsigned Reg : *RC) 1499 if (MRI.isAllocatable(Reg) && !MRI.isPhysRegUsed(Reg)) 1500 return Reg; 1501 return AMDGPU::NoRegister; 1502 } 1503 1504 ArrayRef<int16_t> SIRegisterInfo::getRegSplitParts(const TargetRegisterClass *RC, 1505 unsigned EltSize) const { 1506 if (EltSize == 4) { 1507 static const int16_t Sub0_31[] = { 1508 AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, 1509 AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7, 1510 AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11, 1511 AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15, 1512 AMDGPU::sub16, AMDGPU::sub17, AMDGPU::sub18, AMDGPU::sub19, 1513 AMDGPU::sub20, AMDGPU::sub21, AMDGPU::sub22, AMDGPU::sub23, 1514 AMDGPU::sub24, AMDGPU::sub25, AMDGPU::sub26, AMDGPU::sub27, 1515 AMDGPU::sub28, AMDGPU::sub29, AMDGPU::sub30, AMDGPU::sub31, 1516 }; 1517 1518 static const int16_t Sub0_15[] = { 1519 AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, 1520 AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7, 1521 AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11, 1522 AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15, 1523 }; 1524 1525 static const int16_t Sub0_7[] = { 1526 AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, 1527 AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7, 1528 }; 1529 1530 static const int16_t Sub0_4[] = { 1531 AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, AMDGPU::sub4, 1532 }; 1533 1534 static const int16_t Sub0_3[] = { 1535 AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, 1536 }; 1537 1538 static const int16_t Sub0_2[] = { 1539 AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, 1540 }; 1541 1542 static const int16_t Sub0_1[] = { 1543 AMDGPU::sub0, AMDGPU::sub1, 1544 }; 1545 1546 switch (AMDGPU::getRegBitWidth(*RC->MC)) { 1547 case 32: 1548 return {}; 1549 case 64: 1550 return makeArrayRef(Sub0_1); 1551 case 96: 1552 return makeArrayRef(Sub0_2); 1553 case 128: 1554 return makeArrayRef(Sub0_3); 1555 case 160: 1556 return makeArrayRef(Sub0_4); 1557 case 256: 1558 return makeArrayRef(Sub0_7); 1559 case 512: 1560 return makeArrayRef(Sub0_15); 1561 case 1024: 1562 return makeArrayRef(Sub0_31); 1563 default: 1564 llvm_unreachable("unhandled register size"); 1565 } 1566 } 1567 1568 if (EltSize == 8) { 1569 static const int16_t Sub0_31_64[] = { 1570 AMDGPU::sub0_sub1, AMDGPU::sub2_sub3, 1571 AMDGPU::sub4_sub5, AMDGPU::sub6_sub7, 1572 AMDGPU::sub8_sub9, AMDGPU::sub10_sub11, 1573 AMDGPU::sub12_sub13, AMDGPU::sub14_sub15, 1574 AMDGPU::sub16_sub17, AMDGPU::sub18_sub19, 1575 AMDGPU::sub20_sub21, AMDGPU::sub22_sub23, 1576 AMDGPU::sub24_sub25, AMDGPU::sub26_sub27, 1577 AMDGPU::sub28_sub29, AMDGPU::sub30_sub31 1578 }; 1579 1580 static const int16_t Sub0_15_64[] = { 1581 AMDGPU::sub0_sub1, AMDGPU::sub2_sub3, 1582 AMDGPU::sub4_sub5, AMDGPU::sub6_sub7, 1583 AMDGPU::sub8_sub9, AMDGPU::sub10_sub11, 1584 AMDGPU::sub12_sub13, AMDGPU::sub14_sub15 1585 }; 1586 1587 static const int16_t Sub0_7_64[] = { 1588 AMDGPU::sub0_sub1, AMDGPU::sub2_sub3, 1589 AMDGPU::sub4_sub5, AMDGPU::sub6_sub7 1590 }; 1591 1592 1593 static const int16_t Sub0_3_64[] = { 1594 AMDGPU::sub0_sub1, AMDGPU::sub2_sub3 1595 }; 1596 1597 switch (AMDGPU::getRegBitWidth(*RC->MC)) { 1598 case 64: 1599 return {}; 1600 case 128: 1601 return makeArrayRef(Sub0_3_64); 1602 case 256: 1603 return makeArrayRef(Sub0_7_64); 1604 case 512: 1605 
return makeArrayRef(Sub0_15_64); 1606 case 1024: 1607 return makeArrayRef(Sub0_31_64); 1608 default: 1609 llvm_unreachable("unhandled register size"); 1610 } 1611 } 1612 1613 if (EltSize == 16) { 1614 1615 static const int16_t Sub0_31_128[] = { 1616 AMDGPU::sub0_sub1_sub2_sub3, 1617 AMDGPU::sub4_sub5_sub6_sub7, 1618 AMDGPU::sub8_sub9_sub10_sub11, 1619 AMDGPU::sub12_sub13_sub14_sub15, 1620 AMDGPU::sub16_sub17_sub18_sub19, 1621 AMDGPU::sub20_sub21_sub22_sub23, 1622 AMDGPU::sub24_sub25_sub26_sub27, 1623 AMDGPU::sub28_sub29_sub30_sub31 1624 }; 1625 1626 static const int16_t Sub0_15_128[] = { 1627 AMDGPU::sub0_sub1_sub2_sub3, 1628 AMDGPU::sub4_sub5_sub6_sub7, 1629 AMDGPU::sub8_sub9_sub10_sub11, 1630 AMDGPU::sub12_sub13_sub14_sub15 1631 }; 1632 1633 static const int16_t Sub0_7_128[] = { 1634 AMDGPU::sub0_sub1_sub2_sub3, 1635 AMDGPU::sub4_sub5_sub6_sub7 1636 }; 1637 1638 switch (AMDGPU::getRegBitWidth(*RC->MC)) { 1639 case 128: 1640 return {}; 1641 case 256: 1642 return makeArrayRef(Sub0_7_128); 1643 case 512: 1644 return makeArrayRef(Sub0_15_128); 1645 case 1024: 1646 return makeArrayRef(Sub0_31_128); 1647 default: 1648 llvm_unreachable("unhandled register size"); 1649 } 1650 } 1651 1652 assert(EltSize == 32 && "unhandled elt size"); 1653 1654 static const int16_t Sub0_31_256[] = { 1655 AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7, 1656 AMDGPU::sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15, 1657 AMDGPU::sub16_sub17_sub18_sub19_sub20_sub21_sub22_sub23, 1658 AMDGPU::sub24_sub25_sub26_sub27_sub28_sub29_sub30_sub31 1659 }; 1660 1661 static const int16_t Sub0_15_256[] = { 1662 AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7, 1663 AMDGPU::sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15 1664 }; 1665 1666 switch (AMDGPU::getRegBitWidth(*RC->MC)) { 1667 case 256: 1668 return {}; 1669 case 512: 1670 return makeArrayRef(Sub0_15_256); 1671 case 1024: 1672 return makeArrayRef(Sub0_31_256); 1673 default: 1674 llvm_unreachable("unhandled register size"); 1675 } 1676 } 1677 1678 const TargetRegisterClass* 1679 SIRegisterInfo::getRegClassForReg(const MachineRegisterInfo &MRI, 1680 unsigned Reg) const { 1681 if (Register::isVirtualRegister(Reg)) 1682 return MRI.getRegClass(Reg); 1683 1684 return getPhysRegClass(Reg); 1685 } 1686 1687 bool SIRegisterInfo::isVGPR(const MachineRegisterInfo &MRI, 1688 unsigned Reg) const { 1689 const TargetRegisterClass * RC = getRegClassForReg(MRI, Reg); 1690 assert(RC && "Register class for the reg not found"); 1691 return hasVGPRs(RC); 1692 } 1693 1694 bool SIRegisterInfo::isAGPR(const MachineRegisterInfo &MRI, 1695 unsigned Reg) const { 1696 const TargetRegisterClass * RC = getRegClassForReg(MRI, Reg); 1697 assert(RC && "Register class for the reg not found"); 1698 return hasAGPRs(RC); 1699 } 1700 1701 bool SIRegisterInfo::shouldCoalesce(MachineInstr *MI, 1702 const TargetRegisterClass *SrcRC, 1703 unsigned SubReg, 1704 const TargetRegisterClass *DstRC, 1705 unsigned DstSubReg, 1706 const TargetRegisterClass *NewRC, 1707 LiveIntervals &LIS) const { 1708 unsigned SrcSize = getRegSizeInBits(*SrcRC); 1709 unsigned DstSize = getRegSizeInBits(*DstRC); 1710 unsigned NewSize = getRegSizeInBits(*NewRC); 1711 1712 // Do not increase size of registers beyond dword, we would need to allocate 1713 // adjacent registers and constraint regalloc more than needed. 1714 1715 // Always allow dword coalescing. 
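  // Otherwise, only coalesce when the combined class is no larger than one of
  // the original source or destination classes.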
  if (SrcSize <= 32 || DstSize <= 32)
    return true;

  return NewSize <= DstSize || NewSize <= SrcSize;
}

unsigned SIRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
                                             MachineFunction &MF) const {
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  unsigned Occupancy = ST.getOccupancyWithLocalMemSize(MFI->getLDSSize(),
                                                       MF.getFunction());
  switch (RC->getID()) {
  default:
    return AMDGPURegisterInfo::getRegPressureLimit(RC, MF);
  case AMDGPU::VGPR_32RegClassID:
    return std::min(ST.getMaxNumVGPRs(Occupancy), ST.getMaxNumVGPRs(MF));
  case AMDGPU::SGPR_32RegClassID:
    return std::min(ST.getMaxNumSGPRs(Occupancy, true), ST.getMaxNumSGPRs(MF));
  }
}

unsigned SIRegisterInfo::getRegPressureSetLimit(const MachineFunction &MF,
                                                unsigned Idx) const {
  if (Idx == getVGPRPressureSet() || Idx == getAGPRPressureSet())
    return getRegPressureLimit(&AMDGPU::VGPR_32RegClass,
                               const_cast<MachineFunction &>(MF));

  if (Idx == getSGPRPressureSet())
    return getRegPressureLimit(&AMDGPU::SGPR_32RegClass,
                               const_cast<MachineFunction &>(MF));

  return AMDGPURegisterInfo::getRegPressureSetLimit(MF, Idx);
}

const int *SIRegisterInfo::getRegUnitPressureSets(unsigned RegUnit) const {
  static const int Empty[] = { -1 };

  if (hasRegUnit(AMDGPU::M0, RegUnit))
    return Empty;
  return AMDGPURegisterInfo::getRegUnitPressureSets(RegUnit);
}

unsigned SIRegisterInfo::getReturnAddressReg(const MachineFunction &MF) const {
  // Not a callee saved register.
  return AMDGPU::SGPR30_SGPR31;
}

const TargetRegisterClass *
SIRegisterInfo::getRegClassForSizeOnBank(unsigned Size,
                                         const RegisterBank &RB,
                                         const MachineRegisterInfo &MRI) const {
  switch (Size) {
  case 1: {
    switch (RB.getID()) {
    case AMDGPU::VGPRRegBankID:
      return &AMDGPU::VGPR_32RegClass;
    case AMDGPU::VCCRegBankID:
      return isWave32 ?
        &AMDGPU::SReg_32_XM0_XEXECRegClass : &AMDGPU::SReg_64_XEXECRegClass;
    case AMDGPU::SGPRRegBankID:
      return &AMDGPU::SReg_32RegClass;
    default:
      llvm_unreachable("unknown register bank");
    }
  }
  case 32:
    return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VGPR_32RegClass :
                                                 &AMDGPU::SReg_32RegClass;
  case 64:
    return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_64RegClass :
                                                 &AMDGPU::SReg_64RegClass;
  case 96:
    return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_96RegClass :
                                                 &AMDGPU::SReg_96RegClass;
  case 128:
    return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_128RegClass :
                                                 &AMDGPU::SGPR_128RegClass;
  case 160:
    return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_160RegClass :
                                                 &AMDGPU::SReg_160RegClass;
  case 256:
    return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_256RegClass :
                                                 &AMDGPU::SReg_256RegClass;
  case 512:
    return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_512RegClass :
                                                 &AMDGPU::SReg_512RegClass;
  case 1024:
    return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_1024RegClass :
                                                 &AMDGPU::SReg_1024RegClass;
  default:
    if (Size < 32)
      return RB.getID() == AMDGPU::VGPRRegBankID ?
               &AMDGPU::VGPR_32RegClass :
               &AMDGPU::SReg_32RegClass;
    return nullptr;
  }
}

const TargetRegisterClass *
SIRegisterInfo::getConstrainedRegClassForOperand(const MachineOperand &MO,
                                                 const MachineRegisterInfo &MRI) const {
  const RegClassOrRegBank &RCOrRB = MRI.getRegClassOrRegBank(MO.getReg());
  if (const RegisterBank *RB = RCOrRB.dyn_cast<const RegisterBank*>())
    return getRegClassForTypeOnBank(MRI.getType(MO.getReg()), *RB, MRI);

  const TargetRegisterClass *RC = RCOrRB.get<const TargetRegisterClass*>();
  return getAllocatableClass(RC);
}

unsigned SIRegisterInfo::getVCC() const {
  return isWave32 ? AMDGPU::VCC_LO : AMDGPU::VCC;
}

const TargetRegisterClass *
SIRegisterInfo::getRegClass(unsigned RCID) const {
  switch ((int)RCID) {
  case AMDGPU::SReg_1RegClassID:
    return getBoolRC();
  case AMDGPU::SReg_1_XEXECRegClassID:
    return isWave32 ? &AMDGPU::SReg_32_XM0_XEXECRegClass
                    : &AMDGPU::SReg_64_XEXECRegClass;
  case -1:
    return nullptr;
  default:
    return AMDGPURegisterInfo::getRegClass(RCID);
  }
}

// Find reaching register definition.
MachineInstr *SIRegisterInfo::findReachingDef(unsigned Reg, unsigned SubReg,
                                              MachineInstr &Use,
                                              MachineRegisterInfo &MRI,
                                              LiveIntervals *LIS) const {
  auto &MDT = LIS->getAnalysis<MachineDominatorTree>();
  SlotIndex UseIdx = LIS->getInstructionIndex(Use);
  SlotIndex DefIdx;

  if (Register::isVirtualRegister(Reg)) {
    if (!LIS->hasInterval(Reg))
      return nullptr;
    LiveInterval &LI = LIS->getInterval(Reg);
    LaneBitmask SubLanes = SubReg ? getSubRegIndexLaneMask(SubReg)
                                  : MRI.getMaxLaneMaskForVReg(Reg);
    VNInfo *V = nullptr;
    if (LI.hasSubRanges()) {
      for (auto &S : LI.subranges()) {
        if ((S.LaneMask & SubLanes) == SubLanes) {
          V = S.getVNInfoAt(UseIdx);
          break;
        }
      }
    } else {
      V = LI.getVNInfoAt(UseIdx);
    }
    if (!V)
      return nullptr;
    DefIdx = V->def;
  } else {
    // Find last def.
    for (MCRegUnitIterator Units(Reg, this); Units.isValid(); ++Units) {
      LiveRange &LR = LIS->getRegUnit(*Units);
      if (VNInfo *V = LR.getVNInfoAt(UseIdx)) {
        if (!DefIdx.isValid() ||
            MDT.dominates(LIS->getInstructionFromIndex(DefIdx),
                          LIS->getInstructionFromIndex(V->def)))
          DefIdx = V->def;
      } else {
        return nullptr;
      }
    }
  }

  MachineInstr *Def = LIS->getInstructionFromIndex(DefIdx);

  if (!Def || !MDT.dominates(Def, &Use))
    return nullptr;

  assert(Def->modifiesRegister(Reg, this));

  return Def;
}