//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Implements the AMDGPU specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUSubtarget.h"
#include "AMDGPU.h"
#include "AMDGPUTargetMachine.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "SIMachineFunctionInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenSubtargetInfo.inc"
#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#undef AMDGPUSubtarget
#include "R600GenSubtargetInfo.inc"

static cl::opt<bool> DisablePowerSched(
  "amdgpu-disable-power-sched",
  cl::desc("Disable scheduling to minimize MAI power bursts"),
  cl::init(false));

GCNSubtarget::~GCNSubtarget() = default;

R600Subtarget &
R600Subtarget::initializeSubtargetDependencies(const Triple &TT,
                                               StringRef GPU, StringRef FS) {
  SmallString<256> FullFS("+promote-alloca,");
  FullFS += FS;
  ParseSubtargetFeatures(GPU, FullFS);

  // FIXME: I don't think Evergreen has any useful support for denormals, but
  // this should be checked. Should we issue a warning somewhere if someone
  // tries to enable these?
  if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
    FP32Denormals = false;
  }

  HasMulU24 = getGeneration() >= EVERGREEN;
  HasMulI24 = hasCaymanISA();

  return *this;
}

GCNSubtarget &
GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                              StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics.
  // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be
  // enabled, but some instructions do not respect them and they run at the
  // double precision rate, so don't enable by default.
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.
  //
  // Similarly, we want enable-prt-strict-null to be on by default and not to
  // unset everything else if it is disabled.

  // Assuming ECC is enabled is the conservative default.
  SmallString<256> FullFS("+promote-alloca,+load-store-opt,+sram-ecc,+xnack,");

  if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
    FullFS += "+flat-for-global,+unaligned-buffer-access,+trap-handler,";
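  // Note (illustrative, not an exhaustive list): at this point the running
  // feature string for an amdhsa target looks like
  //   "+promote-alloca,+load-store-opt,+sram-ecc,+xnack,+flat-for-global,..."
  // Features appended later, including the user-provided FS below, take
  // precedence, since ParseSubtargetFeatures applies entries left to right.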
  // FIXME: I don't think Evergreen has any useful support for denormals, but
  // this should be checked. Should we issue a warning somewhere if someone
  // tries to enable these?
  if (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
    FullFS += "+fp64-fp16-denormals,";
  } else {
    FullFS += "-fp32-denormals,";
  }

  FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS

  // Disable mutually exclusive bits.
  if (FS.find_lower("+wavefrontsize") != StringRef::npos) {
    if (FS.find_lower("wavefrontsize16") == StringRef::npos)
      FullFS += "-wavefrontsize16,";
    if (FS.find_lower("wavefrontsize32") == StringRef::npos)
      FullFS += "-wavefrontsize32,";
    if (FS.find_lower("wavefrontsize64") == StringRef::npos)
      FullFS += "-wavefrontsize64,";
  }

  FullFS += FS;

  ParseSubtargetFeatures(GPU, FullFS);

  // We don't support FP64 for EG/NI atm.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Unless +-flat-for-global is specified, turn on FlatForGlobal for all OS-es
  // on VI and newer hardware to avoid assertion failures due to missing ADDR64
  // variants of MUBUF instructions.
  if (!hasAddr64() && !FS.contains("flat-for-global")) {
    FlatForGlobal = true;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.getArch() == Triple::amdgcn) {
    if (LocalMemorySize == 0)
      LocalMemorySize = 32768;

    // Do something sensible for unspecified target.
    if (!HasMovrel && !HasVGPRIndexMode)
      HasMovrel = true;
  }

  // Don't crash on invalid devices.
  if (WavefrontSize == 0)
    WavefrontSize = 64;

  HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;

  if (DoesNotSupportXNACK && EnableXNACK) {
    ToggleFeature(AMDGPU::FeatureXNACK);
    EnableXNACK = false;
  }

  // ECC is on by default, but turn it off if the hardware doesn't support it
  // anyway. This matters for the gfx9 targets that have d16 loads but don't
  // support ECC.
  if (DoesNotSupportSRAMECC && EnableSRAMECC) {
    ToggleFeature(AMDGPU::FeatureSRAMECC);
    EnableSRAMECC = false;
  }

  return *this;
}

AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
  TargetTriple(TT),
  Has16BitInsts(false),
  HasMadMixInsts(false),
  FP32Denormals(false),
  FPExceptions(false),
  HasSDWA(false),
  HasVOP3PInsts(false),
  HasMulI24(true),
  HasMulU24(true),
  HasInv2PiInlineImm(false),
  HasFminFmaxLegacy(true),
  EnablePromoteAlloca(false),
  HasTrigReducedRange(false),
  LocalMemorySize(0),
  WavefrontSize(0)
  { }
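// Note on the constructor below: it relies on C++ member initialization
// order. InstrInfo's initializer runs initializeSubtargetDependencies()
// first, so all feature flags are parsed before TLInfo and FrameLowering are
// constructed from them.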
GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                           const GCNTargetMachine &TM) :
    AMDGPUGenSubtargetInfo(TT, GPU, FS),
    AMDGPUSubtarget(TT),
    TargetTriple(TT),
    Gen(TT.getOS() == Triple::AMDHSA ? SEA_ISLANDS : SOUTHERN_ISLANDS),
    InstrItins(getInstrItineraryForCPU(GPU)),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    HalfRate64Ops(false),

    FP64FP16Denormals(false),
    FlatForGlobal(false),
    AutoWaitcntBeforeBarrier(false),
    CodeObjectV3(false),
    UnalignedScratchAccess(false),
    UnalignedBufferAccess(false),

    HasApertureRegs(false),
    EnableXNACK(false),
    DoesNotSupportXNACK(false),
    EnableCuMode(false),
    TrapHandler(false),

    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    EnableDS128(false),
    EnablePRTStrictNull(false),
    DumpCode(false),

    FP64(false),
    GCN3Encoding(false),
    CIInsts(false),
    GFX8Insts(false),
    GFX9Insts(false),
    GFX10Insts(false),
    GFX7GFX8GFX9Insts(false),
    SGPRInitBug(false),
    HasSMemRealTime(false),
    HasIntClamp(false),
    HasFmaMixInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasScalarAtomics(false),
    HasSDWAOmod(false),
    HasSDWAScalar(false),
    HasSDWASdst(false),
    HasSDWAMac(false),
    HasSDWAOutModsVOPC(false),
    HasDPP(false),
    HasDPP8(false),
    HasR128A16(false),
    HasNSAEncoding(false),
    HasDLInsts(false),
    HasDot1Insts(false),
    HasDot2Insts(false),
    HasDot3Insts(false),
    HasDot4Insts(false),
    HasDot5Insts(false),
    HasDot6Insts(false),
    HasMAIInsts(false),
    HasPkFmacF16Inst(false),
    HasAtomicFaddInsts(false),
    EnableSRAMECC(false),
    DoesNotSupportSRAMECC(false),
    HasNoSdstCMPX(false),
    HasVscnt(false),
    HasRegisterBanking(false),
    HasVOP3Literal(false),
    HasNoDataDepHazard(false),
    FlatAddressSpace(false),
    FlatInstOffsets(false),
    FlatGlobalInsts(false),
    FlatScratchInsts(false),
    ScalarFlatScratchInsts(false),
    AddNoCarryInsts(false),
    HasUnpackedD16VMem(false),
    LDSMisalignedBug(false),

    ScalarizeGlobal(false),

    HasVcmpxPermlaneHazard(false),
    HasVMEMtoScalarWriteHazard(false),
    HasSMEMtoVectorWriteHazard(false),
    HasInstFwdPrefetchBug(false),
    HasVcmpxExecWARHazard(false),
    HasLdsBranchVmemWARHazard(false),
    HasNSAtoVMEMBug(false),
    HasOffset3fBug(false),
    HasFlatSegmentOffsetBug(false),

    FeatureDisable(false),
    InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
    TLInfo(TM, *this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
  CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
  Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
  RegBankInfo.reset(new AMDGPURegisterBankInfo(*getRegisterInfo()));
  InstSelector.reset(new AMDGPUInstructionSelector(
      *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
}

unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
  if (getGeneration() < GFX10)
    return 1;

  switch (Opcode) {
  case AMDGPU::V_LSHLREV_B64:
  case AMDGPU::V_LSHLREV_B64_gfx10:
  case AMDGPU::V_LSHL_B64:
  case AMDGPU::V_LSHRREV_B64:
  case AMDGPU::V_LSHRREV_B64_gfx10:
  case AMDGPU::V_LSHR_B64:
  case AMDGPU::V_ASHRREV_I64:
  case AMDGPU::V_ASHRREV_I64_gfx10:
  case AMDGPU::V_ASHR_I64:
    return 1;
  }

  return 2;
}
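// Usage sketch for getConstantBusLimit (illustrative; the caller and the
// operand counting are assumed, not part of this file):
//   unsigned Limit = ST.getConstantBusLimit(MI.getOpcode());
//   // A verifier-style check would count the distinct SGPR/literal operands
//   // of MI and reject the instruction if that count exceeds Limit.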
unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
  const Function &F) const {
  if (NWaves == 1)
    return getLocalMemorySize();
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  if (!WorkGroupsPerCu)
    return 0;
  unsigned MaxWaves = getMaxWavesPerEU();
  return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
}

unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
  const Function &F) const {
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  if (!WorkGroupsPerCu)
    return 0;
  unsigned MaxWaves = getMaxWavesPerEU();
  unsigned Limit = getLocalMemorySize() * MaxWaves / WorkGroupsPerCu;
  unsigned NumWaves = Limit / (Bytes ? Bytes : 1u);
  NumWaves = std::min(NumWaves, MaxWaves);
  NumWaves = std::max(NumWaves, 1u);
  return NumWaves;
}

unsigned
AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
  const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
}

std::pair<unsigned, unsigned>
AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
  switch (CC) {
  case CallingConv::AMDGPU_CS:
  case CallingConv::AMDGPU_KERNEL:
  case CallingConv::SPIR_KERNEL:
    return std::make_pair(getWavefrontSize() * 2,
                          std::max(getWavefrontSize() * 4, 256u));
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
    return std::make_pair(1, getWavefrontSize());
  default:
    return std::make_pair(1, 16 * getWavefrontSize());
  }
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
  const Function &F) const {
  // FIXME: 1024 if function.
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
    getDefaultFlatWorkGroupSize(F.getCallingConv());

  // Requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-flat-work-group-size", Default);

  // Make sure the requested minimum does not exceed the requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}
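// For example, IR like the following (illustrative) requests a flat work
// group size range of [64, 256], which getFlatWorkGroupSizes returns as long
// as it stays within the subtarget's limits:
//   attributes #0 = { "amdgpu-flat-work-group-size"="64,256" }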
std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
  const Function &F) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // Default/requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);

  // If minimum/maximum flat work group sizes were explicitly requested using
  // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
  // number of waves per execution unit to values implied by requested
  // minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
    getMaxWavesPerEU(FlatWorkGroupSizes.second);
  bool RequestedFlatWorkGroupSize = false;

  if (F.hasFnAttribute("amdgpu-flat-work-group-size")) {
    Default.first = MinImpliedByFlatWorkGroupSize;
    RequestedFlatWorkGroupSize = true;
  }

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-waves-per-eu", Default, true);

  // Make sure the requested minimum does not exceed the requested maximum.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.first > getMaxWavesPerEU())
    return Default;
  if (Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with values implied by requested
  // minimum/maximum flat work group sizes.
  if (RequestedFlatWorkGroupSize &&
      Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}
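// The function below attaches !range metadata to workitem ID and local size
// queries. For example (illustrative OpenCL-style IR), a kernel carrying
//   !reqd_work_group_size !{i32 256, i32 1, i32 1}
// pins both the minimum and maximum size of dimension 0 to 256, so a
// workitem.id.x query gets the range [0, 256).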
bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present it narrows the value down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }
      if (Dim <= 3) {
        if (auto Node = Kernel->getMetadata("reqd_work_group_size"))
          if (Node->getNumOperands() == 3)
            MinSize = MaxSize = mdconst::extract<ConstantInt>(
                                  Node->getOperand(Dim))->getZExtValue();
      }
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For ID query we need to pass max size
  // as Hi. For size query we need to pass Hi + 1.
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}

uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
                                                 unsigned &MaxAlign) const {
  assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
         F.getCallingConv() == CallingConv::SPIR_KERNEL);

  const DataLayout &DL = F.getParent()->getDataLayout();
  uint64_t ExplicitArgBytes = 0;
  MaxAlign = 1;

  for (const Argument &Arg : F.args()) {
    Type *ArgTy = Arg.getType();

    unsigned Align = DL.getABITypeAlignment(ArgTy);
    uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
    ExplicitArgBytes = alignTo(ExplicitArgBytes, Align) + AllocSize;
    MaxAlign = std::max(MaxAlign, Align);
  }

  return ExplicitArgBytes;
}

unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
                                                unsigned &MaxAlign) const {
  uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);

  unsigned ExplicitOffset = getExplicitKernelArgOffset(F);

  uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
  unsigned ImplicitBytes = getImplicitArgNumBytes(F);
  if (ImplicitBytes != 0) {
    unsigned Alignment = getAlignmentForImplicitArgPtr();
    TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
  }

  // Being able to dereference past the end is useful for emitting scalar loads.
  return alignTo(TotalSize, 4);
}
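// Worked example for the two functions above (illustrative; assumes an
// amdhsa target where getExplicitKernelArgOffset returns 0 and no implicit
// arguments are requested): a kernel taking (i32, ptr addrspace(1)) packs
// the i32 at offset 0 and the 8-byte pointer at offset 8, so
// getExplicitKernArgSize returns 16 with MaxAlign = 8, and
// getKernArgSegmentSize returns alignTo(16, 4) = 16 bytes.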
R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  R600GenSubtargetInfo(TT, GPU, FS),
  AMDGPUSubtarget(TT),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  FMA(false),
  CaymanISA(false),
  CFALUBug(false),
  HasVertexCache(false),
  R600ALUInst(false),
  FP64(false),
  TexVTXClauseSize(0),
  Gen(R600),
  TLInfo(TM, initializeSubtargetDependencies(TT, GPU, FS)),
  InstrItins(getInstrItineraryForCPU(GPU)) { }

void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                       unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease
  // pressure once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit().
  Policy.ShouldTrackPressure = true;

  // Enabling both top down and bottom up scheduling seems to give us fewer
  // register spills than just using one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

bool GCNSubtarget::hasMadF16() const {
  return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16) != -1;
}

unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return 10;

  if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
    if (SGPRs <= 80)
      return 10;
    if (SGPRs <= 88)
      return 9;
    if (SGPRs <= 100)
      return 8;
    return 7;
  }
  if (SGPRs <= 48)
    return 10;
  if (SGPRs <= 56)
    return 9;
  if (SGPRs <= 64)
    return 8;
  if (SGPRs <= 72)
    return 7;
  if (SGPRs <= 80)
    return 6;
  return 5;
}

unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
  if (VGPRs <= 24)
    return 10;
  if (VGPRs <= 28)
    return 9;
  if (VGPRs <= 32)
    return 8;
  if (VGPRs <= 36)
    return 7;
  if (VGPRs <= 40)
    return 6;
  if (VGPRs <= 48)
    return 5;
  if (VGPRs <= 64)
    return 4;
  if (VGPRs <= 84)
    return 3;
  if (VGPRs <= 128)
    return 2;
  return 1;
}

unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.

  if (MFI.hasFlatScratchInit()) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2; // VCC.
}
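// Worked example from the tables above (illustrative): on a
// VOLCANIC_ISLANDS target, a function using 90 SGPRs and 40 VGPRs gets
// getOccupancyWithNumSGPRs(90) = 8 and getOccupancyWithNumVGPRs(40) = 6,
// so the register-limited occupancy is min(8, 6) = 6 waves per EU.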
unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute maximum number of SGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if maximum number of SGPRs was explicitly requested using
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure requested value does not violate subtarget's specifications.
    if (Requested && (Requested <= getReservedNumSGPRs(MF)))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
                  MaxAddressableNumSGPRs);
}

unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute maximum number of VGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if maximum number of VGPRs was explicitly requested using
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-vgpr", MaxNumVGPRs);

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs;
}
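// For example (illustrative IR), a function annotated with
//   attributes #0 = { "amdgpu-num-vgpr"="64" }
// caps MaxNumVGPRs at 64, provided 64 stays inside the bounds implied by the
// waves-per-EU range checked above; otherwise the request is ignored.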
namespace {
struct MemOpClusterMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  MemOpClusterMutation(const SIInstrInfo *tii) : TII(tii) {}

  void apply(ScheduleDAGInstrs *DAG) override {
    SUnit *SUa = nullptr;
    // Search for two consecutive memory operations and link them
    // to prevent the scheduler from moving them apart.
    // In DAG pre-process SUnits are in the original order of
    // the instructions before scheduling.
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MI2 = *SU.getInstr();
      if (!MI2.mayLoad() && !MI2.mayStore()) {
        SUa = nullptr;
        continue;
      }
      if (!SUa) {
        SUa = &SU;
        continue;
      }

      MachineInstr &MI1 = *SUa->getInstr();
      if ((TII->isVMEM(MI1) && TII->isVMEM(MI2)) ||
          (TII->isFLAT(MI1) && TII->isFLAT(MI2)) ||
          (TII->isSMRD(MI1) && TII->isSMRD(MI2)) ||
          (TII->isDS(MI1) && TII->isDS(MI2))) {
        SU.addPredBarrier(SUa);

        for (const SDep &SI : SU.Preds) {
          if (SI.getSUnit() != SUa)
            SUa->addPred(SDep(SI.getSUnit(), SDep::Artificial));
        }

        if (&SU != &DAG->ExitSU) {
          for (const SDep &SI : SUa->Succs) {
            if (SI.getSUnit() != &SU)
              SI.getSUnit()->addPred(SDep(&SU, SDep::Artificial));
          }
        }
      }

      SUa = &SU;
    }
  }
};

struct FillMFMAShadowMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  ScheduleDAGMI *DAG;

  FillMFMAShadowMutation(const SIInstrInfo *tii) : TII(tii) {}

  bool isSALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isSALU(*MI) && !MI->isTerminator();
  }

  bool canAddEdge(const SUnit *Succ, const SUnit *Pred) const {
    if (Pred->NodeNum < Succ->NodeNum)
      return true;

    SmallVector<const SUnit*, 64> Succs({Succ}), Preds({Pred});

    for (unsigned I = 0; I < Succs.size(); ++I) {
      for (const SDep &SI : Succs[I]->Succs) {
        const SUnit *SU = SI.getSUnit();
        if (SU != Succs[I] && llvm::find(Succs, SU) == Succs.end())
          Succs.push_back(SU);
      }
    }

    SmallPtrSet<const SUnit*, 32> Visited;
    while (!Preds.empty()) {
      const SUnit *SU = Preds.pop_back_val();
      if (llvm::find(Succs, SU) != Succs.end())
        return false;
      Visited.insert(SU);
      for (const SDep &SI : SU->Preds)
        if (SI.getSUnit() != SU && !Visited.count(SI.getSUnit()))
          Preds.push_back(SI.getSUnit());
    }

    return true;
  }
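  // canAddEdge(Succ, Pred) above is a reachability check: the artificial
  // edge Pred -> Succ is only legal if Pred is not already (transitively)
  // reachable from Succ. For example, with existing edges A -> B -> C,
  // canAddEdge(A, C) returns false, since the new edge C -> A would close a
  // cycle A -> B -> C -> A.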
  // Link as many SALU instructions in a chain as possible. Return the number
  // of linked instructions. Links up to MaxChain instructions.
  unsigned linkSALUChain(SUnit *From, SUnit *To, unsigned MaxChain,
                         SmallPtrSetImpl<SUnit *> &Visited) const {
    SmallVector<SUnit *, 8> Worklist({To});
    unsigned Linked = 0;

    while (!Worklist.empty() && MaxChain-- > 0) {
      SUnit *SU = Worklist.pop_back_val();
      if (!Visited.insert(SU).second)
        continue;

      LLVM_DEBUG(dbgs() << "Inserting edge from\n" ; DAG->dumpNode(*From);
                 dbgs() << "to\n"; DAG->dumpNode(*SU); dbgs() << '\n');

      if (SU->addPred(SDep(From, SDep::Artificial), false))
        ++Linked;

      for (SDep &SI : From->Succs) {
        SUnit *SUv = SI.getSUnit();
        if (SUv != From && TII->isVALU(*SUv->getInstr()) && canAddEdge(SUv, SU))
          SUv->addPred(SDep(SU, SDep::Artificial), false);
      }

      for (SDep &SI : SU->Succs) {
        SUnit *Succ = SI.getSUnit();
        if (Succ != SU && isSALU(Succ) && canAddEdge(From, Succ))
          Worklist.push_back(Succ);
      }
    }

    return Linked;
  }

  void apply(ScheduleDAGInstrs *DAGInstrs) override {
    const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
    if (!ST.hasMAIInsts() || DisablePowerSched)
      return;
    DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
    const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
    if (!TSchedModel || DAG->SUnits.empty())
      return;

    // Scan for MFMA long latency instructions and try to add a dependency
    // of available SALU instructions to give them a chance to fill MFMA
    // shadow. That is desirable to fill MFMA shadow with SALU instructions
    // rather than VALU to prevent power consumption bursts and throttling.
    auto LastSALU = DAG->SUnits.begin();
    auto E = DAG->SUnits.end();
    SmallPtrSet<SUnit*, 32> Visited;
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MAI = *SU.getInstr();
      if (!TII->isMAI(MAI) ||
          MAI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32 ||
          MAI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32)
        continue;

      unsigned Lat = TSchedModel->computeInstrLatency(&MAI) - 1;

      LLVM_DEBUG(dbgs() << "Found MFMA: "; DAG->dumpNode(SU);
                 dbgs() << "Need " << Lat
                        << " instructions to cover latency.\n");

      // Find up to Lat independent scalar instructions as early as
      // possible such that they can be scheduled after this MFMA.
      for ( ; Lat && LastSALU != E; ++LastSALU) {
        if (Visited.count(&*LastSALU))
          continue;

        if (!isSALU(&*LastSALU) || !canAddEdge(&*LastSALU, &SU))
          continue;

        Lat -= linkSALUChain(&SU, &*LastSALU, Lat, Visited);
      }
    }
  }
};
} // namespace

void GCNSubtarget::getPostRAMutations(
    std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
  Mutations.push_back(llvm::make_unique<MemOpClusterMutation>(&InstrInfo));
  Mutations.push_back(llvm::make_unique<FillMFMAShadowMutation>(&InstrInfo));
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
  if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
  else
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM, const Function &F) {
  if (TM.getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
  else
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
}
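// Usage sketch for the accessors above (illustrative; the surrounding pass
// boilerplate is assumed, not part of this file):
//   const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F);
//   unsigned MaxFlatWGSize = ST.getFlatWorkGroupSizes(F).second;
//   unsigned Occ = ST.getOccupancyWithLocalMemSize(LDSBytes, F);
// The dispatch on the triple lets IR passes share code between the GCN and
// R600 subtargets without knowing which one they are running on.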