1 //===-- AMDGPUAsmPrinter.cpp - AMDGPU assembly printer --------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 ///
/// The AMDGPUAsmPrinter is used to print both the assembly string and the
/// binary code. When passed an MCAsmStreamer it prints assembly and when
/// passed an MCObjectStreamer it outputs binary code.
14 //
15 //===----------------------------------------------------------------------===//
16 //
17
18 #include "AMDGPUAsmPrinter.h"
19 #include "AMDGPU.h"
20 #include "AMDGPUHSAMetadataStreamer.h"
21 #include "AMDKernelCodeT.h"
22 #include "GCNSubtarget.h"
23 #include "MCTargetDesc/AMDGPUInstPrinter.h"
24 #include "MCTargetDesc/AMDGPUTargetStreamer.h"
25 #include "R600AsmPrinter.h"
26 #include "SIMachineFunctionInfo.h"
27 #include "TargetInfo/AMDGPUTargetInfo.h"
28 #include "Utils/AMDGPUBaseInfo.h"
29 #include "llvm/IR/DiagnosticInfo.h"
30 #include "llvm/MC/MCAssembler.h"
31 #include "llvm/MC/MCContext.h"
32 #include "llvm/MC/MCSectionELF.h"
33 #include "llvm/MC/MCStreamer.h"
34 #include "llvm/Support/AMDHSAKernelDescriptor.h"
35 #include "llvm/Support/TargetRegistry.h"
36 #include "llvm/Target/TargetLoweringObjectFile.h"
37 #include "llvm/Target/TargetMachine.h"
38
39 using namespace llvm;
40 using namespace llvm::AMDGPU;
41
42 // We need to tell the runtime some amount ahead of time if we don't know the
43 // true stack size. Assume a smaller number if this is only due to dynamic /
44 // non-entry block allocas.
45 static cl::opt<uint32_t> AssumedStackSizeForExternalCall(
46 "amdgpu-assume-external-call-stack-size",
47 cl::desc("Assumed stack use of any external call (in bytes)"),
48 cl::Hidden,
49 cl::init(16384));
50
51 static cl::opt<uint32_t> AssumedStackSizeForDynamicSizeObjects(
52 "amdgpu-assume-dynamic-stack-object-size",
53 cl::desc("Assumed extra stack use if there are any "
54 "variable sized objects (in bytes)"),
55 cl::Hidden,
56 cl::init(4096));
57
// This should get the default rounding mode from the kernel. We just set the
// default here, but this could change if the OpenCL rounding mode pragmas are
// used.
//
// The denormal mode here should match what is reported by the OpenCL runtime
// for the CL_FP_DENORM bit from CL_DEVICE_{HALF|SINGLE|DOUBLE}_FP_CONFIG, but
// can also be overridden to flush with the -cl-denorms-are-zero compiler flag.
//
// AMD OpenCL only sets flush none and reports CL_FP_DENORM for double
// precision, and leaves single precision to flush all and does not report
// CL_FP_DENORM for CL_DEVICE_SINGLE_FP_CONFIG. Mesa's OpenCL currently reports
// CL_FP_DENORM for both.
//
// FIXME: It seems some instructions do not support single precision denormals
// regardless of the mode (exp_*_f32, rcp_*_f32, rsq_*_f32, sqrt_f32, and
// sin_f32, cos_f32 on most parts).

// We want to use these instructions, and using fp32 denormals also causes
// instructions to run at the double precision rate for the device, so it's
// probably best to just report no single precision denormals.
static uint32_t getFPMode(AMDGPU::SIModeRegisterDefaults Mode) {
79 return FP_ROUND_MODE_SP(FP_ROUND_ROUND_TO_NEAREST) |
80 FP_ROUND_MODE_DP(FP_ROUND_ROUND_TO_NEAREST) |
81 FP_DENORM_MODE_SP(Mode.fpDenormModeSPValue()) |
82 FP_DENORM_MODE_DP(Mode.fpDenormModeDPValue());
83 }
84
85 static AsmPrinter *
createAMDGPUAsmPrinterPass(TargetMachine &tm,
87 std::unique_ptr<MCStreamer> &&Streamer) {
88 return new AMDGPUAsmPrinter(tm, std::move(Streamer));
89 }
90
extern "C" void LLVM_EXTERNAL_VISIBILITY LLVMInitializeAMDGPUAsmPrinter() {
92 TargetRegistry::RegisterAsmPrinter(getTheAMDGPUTarget(),
93 llvm::createR600AsmPrinterPass);
94 TargetRegistry::RegisterAsmPrinter(getTheGCNTarget(),
95 createAMDGPUAsmPrinterPass);
96 }
97
AMDGPUAsmPrinter::AMDGPUAsmPrinter(TargetMachine &TM,
99 std::unique_ptr<MCStreamer> Streamer)
100 : AsmPrinter(TM, std::move(Streamer)) {
101 if (TM.getTargetTriple().getOS() == Triple::AMDHSA) {
102 if (isHsaAbiVersion2(getGlobalSTI())) {
103 HSAMetadataStream.reset(new HSAMD::MetadataStreamerV2());
104 } else if (isHsaAbiVersion3(getGlobalSTI())) {
105 HSAMetadataStream.reset(new HSAMD::MetadataStreamerV3());
106 } else {
107 HSAMetadataStream.reset(new HSAMD::MetadataStreamerV4());
108 }
109 }
110 }
111
StringRef AMDGPUAsmPrinter::getPassName() const {
113 return "AMDGPU Assembly Printer";
114 }
115
const MCSubtargetInfo *AMDGPUAsmPrinter::getGlobalSTI() const {
117 return TM.getMCSubtargetInfo();
118 }
119
AMDGPUTargetStreamer* AMDGPUAsmPrinter::getTargetStreamer() const {
121 if (!OutStreamer)
122 return nullptr;
123 return static_cast<AMDGPUTargetStreamer*>(OutStreamer->getTargetStreamer());
124 }
125
void AMDGPUAsmPrinter::emitStartOfAsmFile(Module &M) {
127 // TODO: Which one is called first, emitStartOfAsmFile or
128 // emitFunctionBodyStart?
129 if (getTargetStreamer() && !getTargetStreamer()->getTargetID())
130 initializeTargetID(M);
131
132 if (TM.getTargetTriple().getOS() != Triple::AMDHSA &&
133 TM.getTargetTriple().getOS() != Triple::AMDPAL)
134 return;
135
136 if (isHsaAbiVersion3Or4(getGlobalSTI()))
137 getTargetStreamer()->EmitDirectiveAMDGCNTarget();
138
139 if (TM.getTargetTriple().getOS() == Triple::AMDHSA)
140 HSAMetadataStream->begin(M, *getTargetStreamer()->getTargetID());
141
142 if (TM.getTargetTriple().getOS() == Triple::AMDPAL)
143 getTargetStreamer()->getPALMetadata()->readFromIR(M);
144
145 if (isHsaAbiVersion3Or4(getGlobalSTI()))
146 return;
147
148 // HSA emits NT_AMD_HSA_CODE_OBJECT_VERSION for code objects v2.
149 if (TM.getTargetTriple().getOS() == Triple::AMDHSA)
150 getTargetStreamer()->EmitDirectiveHSACodeObjectVersion(2, 1);
151
152 // HSA and PAL emit NT_AMD_HSA_ISA_VERSION for code objects v2.
153 IsaVersion Version = getIsaVersion(getGlobalSTI()->getCPU());
154 getTargetStreamer()->EmitDirectiveHSACodeObjectISAV2(
155 Version.Major, Version.Minor, Version.Stepping, "AMD", "AMDGPU");
156 }
157
void AMDGPUAsmPrinter::emitEndOfAsmFile(Module &M) {
159 // Following code requires TargetStreamer to be present.
160 if (!getTargetStreamer())
161 return;
162
163 if (TM.getTargetTriple().getOS() != Triple::AMDHSA ||
164 isHsaAbiVersion2(getGlobalSTI()))
165 getTargetStreamer()->EmitISAVersion();
166
  // Emit HSA Metadata (NT_AMD_AMDGPU_HSA_METADATA).
169 if (TM.getTargetTriple().getOS() == Triple::AMDHSA) {
170 HSAMetadataStream->end();
171 bool Success = HSAMetadataStream->emitTo(*getTargetStreamer());
172 (void)Success;
173 assert(Success && "Malformed HSA Metadata");
174 }
175 }
176
bool AMDGPUAsmPrinter::isBlockOnlyReachableByFallthrough(
178 const MachineBasicBlock *MBB) const {
179 if (!AsmPrinter::isBlockOnlyReachableByFallthrough(MBB))
180 return false;
181
182 if (MBB->empty())
183 return true;
184
  // If this is a block implementing a long branch, an expression relative to
  // the start of the block is needed.
187 // XXX - Is there a smarter way to check this?
188 return (MBB->back().getOpcode() != AMDGPU::S_SETPC_B64);
189 }
190
void AMDGPUAsmPrinter::emitFunctionBodyStart() {
192 const SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>();
193 const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>();
194 const Function &F = MF->getFunction();
195
196 // TODO: Which one is called first, emitStartOfAsmFile or
197 // emitFunctionBodyStart?
198 if (getTargetStreamer() && !getTargetStreamer()->getTargetID())
199 initializeTargetID(*F.getParent());
200
201 const auto &FunctionTargetID = STM.getTargetID();
202 // Make sure function's xnack settings are compatible with module's
203 // xnack settings.
204 if (FunctionTargetID.isXnackSupported() &&
205 FunctionTargetID.getXnackSetting() != IsaInfo::TargetIDSetting::Any &&
206 FunctionTargetID.getXnackSetting() != getTargetStreamer()->getTargetID()->getXnackSetting()) {
207 OutContext.reportError({}, "xnack setting of '" + Twine(MF->getName()) +
208 "' function does not match module xnack setting");
209 return;
210 }
211 // Make sure function's sramecc settings are compatible with module's
212 // sramecc settings.
213 if (FunctionTargetID.isSramEccSupported() &&
214 FunctionTargetID.getSramEccSetting() != IsaInfo::TargetIDSetting::Any &&
215 FunctionTargetID.getSramEccSetting() != getTargetStreamer()->getTargetID()->getSramEccSetting()) {
216 OutContext.reportError({}, "sramecc setting of '" + Twine(MF->getName()) +
217 "' function does not match module sramecc setting");
218 return;
219 }
220
221 if (!MFI.isEntryFunction())
222 return;
223
224 if ((STM.isMesaKernel(F) || isHsaAbiVersion2(getGlobalSTI())) &&
225 (F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
226 F.getCallingConv() == CallingConv::SPIR_KERNEL)) {
227 amd_kernel_code_t KernelCode;
228 getAmdKernelCode(KernelCode, CurrentProgramInfo, *MF);
229 getTargetStreamer()->EmitAMDKernelCodeT(KernelCode);
230 }
231
232 if (STM.isAmdHsaOS())
233 HSAMetadataStream->emitKernel(*MF, CurrentProgramInfo);
234 }
235
void AMDGPUAsmPrinter::emitFunctionBodyEnd() {
237 const SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>();
238 if (!MFI.isEntryFunction())
239 return;
240
241 if (TM.getTargetTriple().getOS() != Triple::AMDHSA ||
242 isHsaAbiVersion2(getGlobalSTI()))
243 return;
244
245 auto &Streamer = getTargetStreamer()->getStreamer();
246 auto &Context = Streamer.getContext();
247 auto &ObjectFileInfo = *Context.getObjectFileInfo();
248 auto &ReadOnlySection = *ObjectFileInfo.getReadOnlySection();
249
250 Streamer.PushSection();
251 Streamer.SwitchSection(&ReadOnlySection);
252
253 // CP microcode requires the kernel descriptor to be allocated on 64 byte
254 // alignment.
255 Streamer.emitValueToAlignment(64, 0, 1, 0);
256 if (ReadOnlySection.getAlignment() < 64)
257 ReadOnlySection.setAlignment(Align(64));
258
259 const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>();
260
261 SmallString<128> KernelName;
262 getNameWithPrefix(KernelName, &MF->getFunction());
263 getTargetStreamer()->EmitAmdhsaKernelDescriptor(
264 STM, KernelName, getAmdhsaKernelDescriptor(*MF, CurrentProgramInfo),
265 CurrentProgramInfo.NumVGPRsForWavesPerEU,
266 CurrentProgramInfo.NumSGPRsForWavesPerEU -
267 IsaInfo::getNumExtraSGPRs(&STM,
268 CurrentProgramInfo.VCCUsed,
269 CurrentProgramInfo.FlatUsed),
270 CurrentProgramInfo.VCCUsed, CurrentProgramInfo.FlatUsed);
271
272 Streamer.PopSection();
273 }
274
void AMDGPUAsmPrinter::emitFunctionEntryLabel() {
276 if (TM.getTargetTriple().getOS() == Triple::AMDHSA &&
277 isHsaAbiVersion3Or4(getGlobalSTI())) {
278 AsmPrinter::emitFunctionEntryLabel();
279 return;
280 }
281
282 const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
283 const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>();
284 if (MFI->isEntryFunction() && STM.isAmdHsaOrMesa(MF->getFunction())) {
285 SmallString<128> SymbolName;
286 getNameWithPrefix(SymbolName, &MF->getFunction()),
287 getTargetStreamer()->EmitAMDGPUSymbolType(
288 SymbolName, ELF::STT_AMDGPU_HSA_KERNEL);
289 }
290 if (DumpCodeInstEmitter) {
291 // Disassemble function name label to text.
292 DisasmLines.push_back(MF->getName().str() + ":");
293 DisasmLineMaxLen = std::max(DisasmLineMaxLen, DisasmLines.back().size());
294 HexLines.push_back("");
295 }
296
297 AsmPrinter::emitFunctionEntryLabel();
298 }
299
void AMDGPUAsmPrinter::emitBasicBlockStart(const MachineBasicBlock &MBB) {
301 if (DumpCodeInstEmitter && !isBlockOnlyReachableByFallthrough(&MBB)) {
302 // Write a line for the basic block label if it is not only fallthrough.
303 DisasmLines.push_back(
304 (Twine("BB") + Twine(getFunctionNumber())
305 + "_" + Twine(MBB.getNumber()) + ":").str());
306 DisasmLineMaxLen = std::max(DisasmLineMaxLen, DisasmLines.back().size());
307 HexLines.push_back("");
308 }
309 AsmPrinter::emitBasicBlockStart(MBB);
310 }
311
void AMDGPUAsmPrinter::emitGlobalVariable(const GlobalVariable *GV) {
313 if (GV->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
314 if (GV->hasInitializer() && !isa<UndefValue>(GV->getInitializer())) {
315 OutContext.reportError({},
316 Twine(GV->getName()) +
317 ": unsupported initializer for address space");
318 return;
319 }
320
321 // LDS variables aren't emitted in HSA or PAL yet.
322 const Triple::OSType OS = TM.getTargetTriple().getOS();
323 if (OS == Triple::AMDHSA || OS == Triple::AMDPAL)
324 return;
325
326 MCSymbol *GVSym = getSymbol(GV);
327
328 GVSym->redefineIfPossible();
329 if (GVSym->isDefined() || GVSym->isVariable())
330 report_fatal_error("symbol '" + Twine(GVSym->getName()) +
331 "' is already defined");
332
333 const DataLayout &DL = GV->getParent()->getDataLayout();
334 uint64_t Size = DL.getTypeAllocSize(GV->getValueType());
335 Align Alignment = GV->getAlign().getValueOr(Align(4));
336
337 emitVisibility(GVSym, GV->getVisibility(), !GV->isDeclaration());
338 emitLinkage(GV, GVSym);
339 if (auto TS = getTargetStreamer())
340 TS->emitAMDGPULDS(GVSym, Size, Alignment);
341 return;
342 }
343
344 AsmPrinter::emitGlobalVariable(GV);
345 }
346
bool AMDGPUAsmPrinter::doFinalization(Module &M) {
348 CallGraphResourceInfo.clear();
349
350 // Pad with s_code_end to help tools and guard against instruction prefetch
351 // causing stale data in caches. Arguably this should be done by the linker,
352 // which is why this isn't done for Mesa.
353 const MCSubtargetInfo &STI = *getGlobalSTI();
354 if ((AMDGPU::isGFX10Plus(STI) || AMDGPU::isGFX90A(STI)) &&
355 (STI.getTargetTriple().getOS() == Triple::AMDHSA ||
356 STI.getTargetTriple().getOS() == Triple::AMDPAL)) {
357 OutStreamer->SwitchSection(getObjFileLowering().getTextSection());
358 getTargetStreamer()->EmitCodeEnd(STI);
359 }
360
361 return AsmPrinter::doFinalization(M);
362 }
363
364 // Print comments that apply to both callable functions and entry points.
void AMDGPUAsmPrinter::emitCommonFunctionComments(
366 uint32_t NumVGPR,
367 Optional<uint32_t> NumAGPR,
368 uint32_t TotalNumVGPR,
369 uint32_t NumSGPR,
370 uint64_t ScratchSize,
371 uint64_t CodeSize,
372 const AMDGPUMachineFunction *MFI) {
373 OutStreamer->emitRawComment(" codeLenInByte = " + Twine(CodeSize), false);
374 OutStreamer->emitRawComment(" NumSgprs: " + Twine(NumSGPR), false);
375 OutStreamer->emitRawComment(" NumVgprs: " + Twine(NumVGPR), false);
376 if (NumAGPR) {
377 OutStreamer->emitRawComment(" NumAgprs: " + Twine(*NumAGPR), false);
378 OutStreamer->emitRawComment(" TotalNumVgprs: " + Twine(TotalNumVGPR),
379 false);
380 }
381 OutStreamer->emitRawComment(" ScratchSize: " + Twine(ScratchSize), false);
382 OutStreamer->emitRawComment(" MemoryBound: " + Twine(MFI->isMemoryBound()),
383 false);
384 }
385
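// Derive the kernel_code_properties bits of the HSA kernel descriptor from
// which preloaded user SGPRs this kernel uses and from the wavefront size.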
uint16_t AMDGPUAsmPrinter::getAmdhsaKernelCodeProperties(
387 const MachineFunction &MF) const {
388 const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
389 uint16_t KernelCodeProperties = 0;
390
391 if (MFI.hasPrivateSegmentBuffer()) {
392 KernelCodeProperties |=
393 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER;
394 }
395 if (MFI.hasDispatchPtr()) {
396 KernelCodeProperties |=
397 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR;
398 }
399 if (MFI.hasQueuePtr()) {
400 KernelCodeProperties |=
401 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR;
402 }
403 if (MFI.hasKernargSegmentPtr()) {
404 KernelCodeProperties |=
405 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR;
406 }
407 if (MFI.hasDispatchID()) {
408 KernelCodeProperties |=
409 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID;
410 }
411 if (MFI.hasFlatScratchInit()) {
412 KernelCodeProperties |=
413 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT;
414 }
415 if (MF.getSubtarget<GCNSubtarget>().isWave32()) {
416 KernelCodeProperties |=
417 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32;
418 }
419
420 return KernelCodeProperties;
421 }
422
amdhsa::kernel_descriptor_t AMDGPUAsmPrinter::getAmdhsaKernelDescriptor(
424 const MachineFunction &MF,
425 const SIProgramInfo &PI) const {
426 const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
427 const Function &F = MF.getFunction();
428
429 amdhsa::kernel_descriptor_t KernelDescriptor;
430 memset(&KernelDescriptor, 0x0, sizeof(KernelDescriptor));
431
432 assert(isUInt<32>(PI.ScratchSize));
433 assert(isUInt<32>(PI.getComputePGMRSrc1()));
434 assert(isUInt<32>(PI.ComputePGMRSrc2));
435
436 KernelDescriptor.group_segment_fixed_size = PI.LDSSize;
437 KernelDescriptor.private_segment_fixed_size = PI.ScratchSize;
438
439 Align MaxKernArgAlign;
440 KernelDescriptor.kernarg_size = STM.getKernArgSegmentSize(F, MaxKernArgAlign);
441
442 KernelDescriptor.compute_pgm_rsrc1 = PI.getComputePGMRSrc1();
443 KernelDescriptor.compute_pgm_rsrc2 = PI.ComputePGMRSrc2;
444 KernelDescriptor.kernel_code_properties = getAmdhsaKernelCodeProperties(MF);
445
446 assert(STM.hasGFX90AInsts() || CurrentProgramInfo.ComputePGMRSrc3GFX90A == 0);
447 if (STM.hasGFX90AInsts())
448 KernelDescriptor.compute_pgm_rsrc3 =
449 CurrentProgramInfo.ComputePGMRSrc3GFX90A;
450
451 return KernelDescriptor;
452 }
453
bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
455 CurrentProgramInfo = SIProgramInfo();
456
457 const AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>();
458
459 // The starting address of all shader programs must be 256 bytes aligned.
460 // Regular functions just need the basic required instruction alignment.
461 MF.setAlignment(MFI->isEntryFunction() ? Align(256) : Align(4));
462
463 SetupMachineFunction(MF);
464
465 const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
466 MCContext &Context = getObjFileLowering().getContext();
467 // FIXME: This should be an explicit check for Mesa.
468 if (!STM.isAmdHsaOS() && !STM.isAmdPalOS()) {
469 MCSectionELF *ConfigSection =
470 Context.getELFSection(".AMDGPU.config", ELF::SHT_PROGBITS, 0);
471 OutStreamer->SwitchSection(ConfigSection);
472 }
473
474 if (MFI->isModuleEntryFunction()) {
475 getSIProgramInfo(CurrentProgramInfo, MF);
476 } else {
477 auto I = CallGraphResourceInfo.insert(
478 std::make_pair(&MF.getFunction(), SIFunctionResourceInfo()));
479 SIFunctionResourceInfo &Info = I.first->second;
480 assert(I.second && "should only be called once per function");
481 Info = analyzeResourceUsage(MF);
482 }
483
484 if (STM.isAmdPalOS()) {
485 if (MFI->isEntryFunction())
486 EmitPALMetadata(MF, CurrentProgramInfo);
487 else if (MFI->isModuleEntryFunction())
488 emitPALFunctionMetadata(MF);
489 } else if (!STM.isAmdHsaOS()) {
490 EmitProgramInfoSI(MF, CurrentProgramInfo);
491 }
492
493 DumpCodeInstEmitter = nullptr;
494 if (STM.dumpCode()) {
495 // For -dumpcode, get the assembler out of the streamer, even if it does
496 // not really want to let us have it. This only works with -filetype=obj.
497 bool SaveFlag = OutStreamer->getUseAssemblerInfoForParsing();
498 OutStreamer->setUseAssemblerInfoForParsing(true);
499 MCAssembler *Assembler = OutStreamer->getAssemblerPtr();
500 OutStreamer->setUseAssemblerInfoForParsing(SaveFlag);
501 if (Assembler)
502 DumpCodeInstEmitter = Assembler->getEmitterPtr();
503 }
504
505 DisasmLines.clear();
506 HexLines.clear();
507 DisasmLineMaxLen = 0;
508
509 emitFunctionBody();
510
511 if (isVerbose()) {
512 MCSectionELF *CommentSection =
513 Context.getELFSection(".AMDGPU.csdata", ELF::SHT_PROGBITS, 0);
514 OutStreamer->SwitchSection(CommentSection);
515
516 if (!MFI->isEntryFunction()) {
517 OutStreamer->emitRawComment(" Function info:", false);
518 SIFunctionResourceInfo &Info = CallGraphResourceInfo[&MF.getFunction()];
519 emitCommonFunctionComments(
520 Info.NumVGPR,
521 STM.hasMAIInsts() ? Info.NumAGPR : Optional<uint32_t>(),
522 Info.getTotalNumVGPRs(STM),
523 Info.getTotalNumSGPRs(MF.getSubtarget<GCNSubtarget>()),
524 Info.PrivateSegmentSize,
525 getFunctionCodeSize(MF), MFI);
526 return false;
527 }
528
529 OutStreamer->emitRawComment(" Kernel info:", false);
530 emitCommonFunctionComments(CurrentProgramInfo.NumArchVGPR,
531 STM.hasMAIInsts()
532 ? CurrentProgramInfo.NumAccVGPR
533 : Optional<uint32_t>(),
534 CurrentProgramInfo.NumVGPR,
535 CurrentProgramInfo.NumSGPR,
536 CurrentProgramInfo.ScratchSize,
537 getFunctionCodeSize(MF), MFI);
538
539 OutStreamer->emitRawComment(
540 " FloatMode: " + Twine(CurrentProgramInfo.FloatMode), false);
541 OutStreamer->emitRawComment(
542 " IeeeMode: " + Twine(CurrentProgramInfo.IEEEMode), false);
543 OutStreamer->emitRawComment(
544 " LDSByteSize: " + Twine(CurrentProgramInfo.LDSSize) +
545 " bytes/workgroup (compile time only)", false);
546
547 OutStreamer->emitRawComment(
548 " SGPRBlocks: " + Twine(CurrentProgramInfo.SGPRBlocks), false);
549 OutStreamer->emitRawComment(
550 " VGPRBlocks: " + Twine(CurrentProgramInfo.VGPRBlocks), false);
551
552 OutStreamer->emitRawComment(
553 " NumSGPRsForWavesPerEU: " +
554 Twine(CurrentProgramInfo.NumSGPRsForWavesPerEU), false);
555 OutStreamer->emitRawComment(
556 " NumVGPRsForWavesPerEU: " +
557 Twine(CurrentProgramInfo.NumVGPRsForWavesPerEU), false);
558
559 if (STM.hasGFX90AInsts())
560 OutStreamer->emitRawComment(
561 " AccumOffset: " +
562 Twine((CurrentProgramInfo.AccumOffset + 1) * 4), false);
563
564 OutStreamer->emitRawComment(
565 " Occupancy: " +
566 Twine(CurrentProgramInfo.Occupancy), false);
567
568 OutStreamer->emitRawComment(
569 " WaveLimiterHint : " + Twine(MFI->needsWaveLimiter()), false);
570
571 OutStreamer->emitRawComment(
572 " COMPUTE_PGM_RSRC2:SCRATCH_EN: " +
573 Twine(G_00B84C_SCRATCH_EN(CurrentProgramInfo.ComputePGMRSrc2)), false);
574 OutStreamer->emitRawComment(
575 " COMPUTE_PGM_RSRC2:USER_SGPR: " +
576 Twine(G_00B84C_USER_SGPR(CurrentProgramInfo.ComputePGMRSrc2)), false);
577 OutStreamer->emitRawComment(
578 " COMPUTE_PGM_RSRC2:TRAP_HANDLER: " +
579 Twine(G_00B84C_TRAP_HANDLER(CurrentProgramInfo.ComputePGMRSrc2)), false);
580 OutStreamer->emitRawComment(
581 " COMPUTE_PGM_RSRC2:TGID_X_EN: " +
582 Twine(G_00B84C_TGID_X_EN(CurrentProgramInfo.ComputePGMRSrc2)), false);
583 OutStreamer->emitRawComment(
584 " COMPUTE_PGM_RSRC2:TGID_Y_EN: " +
585 Twine(G_00B84C_TGID_Y_EN(CurrentProgramInfo.ComputePGMRSrc2)), false);
586 OutStreamer->emitRawComment(
587 " COMPUTE_PGM_RSRC2:TGID_Z_EN: " +
588 Twine(G_00B84C_TGID_Z_EN(CurrentProgramInfo.ComputePGMRSrc2)), false);
589 OutStreamer->emitRawComment(
590 " COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: " +
591 Twine(G_00B84C_TIDIG_COMP_CNT(CurrentProgramInfo.ComputePGMRSrc2)),
592 false);
593
594 assert(STM.hasGFX90AInsts() ||
595 CurrentProgramInfo.ComputePGMRSrc3GFX90A == 0);
596 if (STM.hasGFX90AInsts()) {
597 OutStreamer->emitRawComment(
598 " COMPUTE_PGM_RSRC3_GFX90A:ACCUM_OFFSET: " +
599 Twine((AMDHSA_BITS_GET(CurrentProgramInfo.ComputePGMRSrc3GFX90A,
600 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET))),
601 false);
602 OutStreamer->emitRawComment(
603 " COMPUTE_PGM_RSRC3_GFX90A:TG_SPLIT: " +
604 Twine((AMDHSA_BITS_GET(CurrentProgramInfo.ComputePGMRSrc3GFX90A,
605 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT))),
606 false);
607 }
608 }
609
610 if (DumpCodeInstEmitter) {
611
612 OutStreamer->SwitchSection(
613 Context.getELFSection(".AMDGPU.disasm", ELF::SHT_PROGBITS, 0));
614
615 for (size_t i = 0; i < DisasmLines.size(); ++i) {
616 std::string Comment = "\n";
617 if (!HexLines[i].empty()) {
618 Comment = std::string(DisasmLineMaxLen - DisasmLines[i].size(), ' ');
619 Comment += " ; " + HexLines[i] + "\n";
620 }
621
622 OutStreamer->emitBytes(StringRef(DisasmLines[i]));
623 OutStreamer->emitBytes(StringRef(Comment));
624 }
625 }
626
627 return false;
628 }
629
630 // TODO: Fold this into emitFunctionBodyStart.
void AMDGPUAsmPrinter::initializeTargetID(const Module &M) {
632 // In the beginning all features are either 'Any' or 'NotSupported',
633 // depending on global target features. This will cover empty modules.
634 getTargetStreamer()->initializeTargetID(
635 *getGlobalSTI(), getGlobalSTI()->getFeatureString());
636
637 // If module is empty, we are done.
638 if (M.empty())
639 return;
640
641 // If module is not empty, need to find first 'Off' or 'On' feature
642 // setting per feature from functions in module.
643 for (auto &F : M) {
644 auto &TSTargetID = getTargetStreamer()->getTargetID();
645 if ((!TSTargetID->isXnackSupported() || TSTargetID->isXnackOnOrOff()) &&
646 (!TSTargetID->isSramEccSupported() || TSTargetID->isSramEccOnOrOff()))
647 break;
648
649 const GCNSubtarget &STM = TM.getSubtarget<GCNSubtarget>(F);
650 const IsaInfo::AMDGPUTargetID &STMTargetID = STM.getTargetID();
651 if (TSTargetID->isXnackSupported())
652 if (TSTargetID->getXnackSetting() == IsaInfo::TargetIDSetting::Any)
653 TSTargetID->setXnackSetting(STMTargetID.getXnackSetting());
654 if (TSTargetID->isSramEccSupported())
655 if (TSTargetID->getSramEccSetting() == IsaInfo::TargetIDSetting::Any)
656 TSTargetID->setSramEccSetting(STMTargetID.getSramEccSetting());
657 }
658 }
659
uint64_t AMDGPUAsmPrinter::getFunctionCodeSize(const MachineFunction &MF) const {
661 const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
662 const SIInstrInfo *TII = STM.getInstrInfo();
663
664 uint64_t CodeSize = 0;
665
666 for (const MachineBasicBlock &MBB : MF) {
667 for (const MachineInstr &MI : MBB) {
668 // TODO: CodeSize should account for multiple functions.
669
670 // TODO: Should we count size of debug info?
671 if (MI.isDebugInstr())
672 continue;
673
674 CodeSize += TII->getInstSizeInBytes(MI);
675 }
676 }
677
678 return CodeSize;
679 }
680
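// Return true if \p Reg has any use that is not an implicit operand of a FLAT
// instruction, i.e. the register is really needed and is not merely an
// artifact of FLAT instructions implicitly using flat_scr.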
static bool hasAnyNonFlatUseOfReg(const MachineRegisterInfo &MRI,
682 const SIInstrInfo &TII,
683 unsigned Reg) {
684 for (const MachineOperand &UseOp : MRI.reg_operands(Reg)) {
685 if (!UseOp.isImplicit() || !TII.isFLAT(*UseOp.getParent()))
686 return true;
687 }
688
689 return false;
690 }
691
int32_t AMDGPUAsmPrinter::SIFunctionResourceInfo::getTotalNumSGPRs(
693 const GCNSubtarget &ST) const {
694 return NumExplicitSGPR + IsaInfo::getNumExtraSGPRs(
695 &ST, UsesVCC, UsesFlatScratch, ST.getTargetID().isXnackOnOrAny());
696 }
697
int32_t AMDGPUAsmPrinter::SIFunctionResourceInfo::getTotalNumVGPRs(
699 const GCNSubtarget &ST) const {
700 if (ST.hasGFX90AInsts() && NumAGPR)
701 return alignTo(NumVGPR, 4) + NumAGPR;
702 return std::max(NumVGPR, NumAGPR);
703 }
704
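// Get the callee of a call pseudo. The callee operand is either a global
// (direct call) or an immediate 0, in which case the callee is unknown and
// nullptr is returned.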
static const Function *getCalleeFunction(const MachineOperand &Op) {
706 if (Op.isImm()) {
707 assert(Op.getImm() == 0);
708 return nullptr;
709 }
710
711 return cast<Function>(Op.getGlobal());
712 }
713
AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage(
715 const MachineFunction &MF) const {
716 SIFunctionResourceInfo Info;
717
718 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
719 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
720 const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
721 const MachineRegisterInfo &MRI = MF.getRegInfo();
722 const SIInstrInfo *TII = ST.getInstrInfo();
723 const SIRegisterInfo &TRI = TII->getRegisterInfo();
724
725 Info.UsesFlatScratch = MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_LO) ||
726 MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_HI) ||
727 MRI.isLiveIn(MFI->getPreloadedReg(
728 AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT));
729
730 // Even if FLAT_SCRATCH is implicitly used, it has no effect if flat
731 // instructions aren't used to access the scratch buffer. Inline assembly may
732 // need it though.
733 //
734 // If we only have implicit uses of flat_scr on flat instructions, it is not
735 // really needed.
736 if (Info.UsesFlatScratch && !MFI->hasFlatScratchInit() &&
737 (!hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR) &&
738 !hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_LO) &&
739 !hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_HI))) {
740 Info.UsesFlatScratch = false;
741 }
742
743 Info.PrivateSegmentSize = FrameInfo.getStackSize();
744
745 // Assume a big number if there are any unknown sized objects.
746 Info.HasDynamicallySizedStack = FrameInfo.hasVarSizedObjects();
747 if (Info.HasDynamicallySizedStack)
748 Info.PrivateSegmentSize += AssumedStackSizeForDynamicSizeObjects;
749
750 if (MFI->isStackRealigned())
751 Info.PrivateSegmentSize += FrameInfo.getMaxAlign().value();
752
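  // VCC counts as used if either half of the 64-bit register pair is used.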
753 Info.UsesVCC = MRI.isPhysRegUsed(AMDGPU::VCC_LO) ||
754 MRI.isPhysRegUsed(AMDGPU::VCC_HI);
755
756 // If there are no calls, MachineRegisterInfo can tell us the used register
757 // count easily.
758 // A tail call isn't considered a call for MachineFrameInfo's purposes.
759 if (!FrameInfo.hasCalls() && !FrameInfo.hasTailCall()) {
760 MCPhysReg HighestVGPRReg = AMDGPU::NoRegister;
761 for (MCPhysReg Reg : reverse(AMDGPU::VGPR_32RegClass.getRegisters())) {
762 if (MRI.isPhysRegUsed(Reg)) {
763 HighestVGPRReg = Reg;
764 break;
765 }
766 }
767
768 if (ST.hasMAIInsts()) {
769 MCPhysReg HighestAGPRReg = AMDGPU::NoRegister;
770 for (MCPhysReg Reg : reverse(AMDGPU::AGPR_32RegClass.getRegisters())) {
771 if (MRI.isPhysRegUsed(Reg)) {
772 HighestAGPRReg = Reg;
773 break;
774 }
775 }
776 Info.NumAGPR = HighestAGPRReg == AMDGPU::NoRegister ? 0 :
777 TRI.getHWRegIndex(HighestAGPRReg) + 1;
778 }
779
780 MCPhysReg HighestSGPRReg = AMDGPU::NoRegister;
781 for (MCPhysReg Reg : reverse(AMDGPU::SGPR_32RegClass.getRegisters())) {
782 if (MRI.isPhysRegUsed(Reg)) {
783 HighestSGPRReg = Reg;
784 break;
785 }
786 }
787
788 // We found the maximum register index. They start at 0, so add one to get the
789 // number of registers.
790 Info.NumVGPR = HighestVGPRReg == AMDGPU::NoRegister ? 0 :
791 TRI.getHWRegIndex(HighestVGPRReg) + 1;
792 Info.NumExplicitSGPR = HighestSGPRReg == AMDGPU::NoRegister ? 0 :
793 TRI.getHWRegIndex(HighestSGPRReg) + 1;
794
795 return Info;
796 }
797
798 int32_t MaxVGPR = -1;
799 int32_t MaxAGPR = -1;
800 int32_t MaxSGPR = -1;
801 uint64_t CalleeFrameSize = 0;
802
803 for (const MachineBasicBlock &MBB : MF) {
804 for (const MachineInstr &MI : MBB) {
805 // TODO: Check regmasks? Do they occur anywhere except calls?
806 for (const MachineOperand &MO : MI.operands()) {
807 unsigned Width = 0;
808 bool IsSGPR = false;
809 bool IsAGPR = false;
810
811 if (!MO.isReg())
812 continue;
813
814 Register Reg = MO.getReg();
815 switch (Reg) {
816 case AMDGPU::EXEC:
817 case AMDGPU::EXEC_LO:
818 case AMDGPU::EXEC_HI:
819 case AMDGPU::SCC:
820 case AMDGPU::M0:
821 case AMDGPU::SRC_SHARED_BASE:
822 case AMDGPU::SRC_SHARED_LIMIT:
823 case AMDGPU::SRC_PRIVATE_BASE:
824 case AMDGPU::SRC_PRIVATE_LIMIT:
825 case AMDGPU::SGPR_NULL:
826 case AMDGPU::MODE:
827 continue;
828
829 case AMDGPU::SRC_POPS_EXITING_WAVE_ID:
830 llvm_unreachable("src_pops_exiting_wave_id should not be used");
831
832 case AMDGPU::NoRegister:
833 assert(MI.isDebugInstr() && "Instruction uses invalid noreg register");
834 continue;
835
836 case AMDGPU::VCC:
837 case AMDGPU::VCC_LO:
838 case AMDGPU::VCC_HI:
839 case AMDGPU::VCC_LO_LO16:
840 case AMDGPU::VCC_LO_HI16:
841 case AMDGPU::VCC_HI_LO16:
842 case AMDGPU::VCC_HI_HI16:
843 Info.UsesVCC = true;
844 continue;
845
846 case AMDGPU::FLAT_SCR:
847 case AMDGPU::FLAT_SCR_LO:
848 case AMDGPU::FLAT_SCR_HI:
849 continue;
850
851 case AMDGPU::XNACK_MASK:
852 case AMDGPU::XNACK_MASK_LO:
853 case AMDGPU::XNACK_MASK_HI:
854 llvm_unreachable("xnack_mask registers should not be used");
855
856 case AMDGPU::LDS_DIRECT:
857 llvm_unreachable("lds_direct register should not be used");
858
859 case AMDGPU::TBA:
860 case AMDGPU::TBA_LO:
861 case AMDGPU::TBA_HI:
862 case AMDGPU::TMA:
863 case AMDGPU::TMA_LO:
864 case AMDGPU::TMA_HI:
865 llvm_unreachable("trap handler registers should not be used");
866
867 case AMDGPU::SRC_VCCZ:
868 llvm_unreachable("src_vccz register should not be used");
869
870 case AMDGPU::SRC_EXECZ:
871 llvm_unreachable("src_execz register should not be used");
872
873 case AMDGPU::SRC_SCC:
874 llvm_unreachable("src_scc register should not be used");
875
876 default:
877 break;
878 }
879
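        // Classify any remaining register by bank (SGPR, VGPR or AGPR) and
        // record its width in units of 32-bit registers.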
880 if (AMDGPU::SReg_32RegClass.contains(Reg) ||
881 AMDGPU::SReg_LO16RegClass.contains(Reg) ||
882 AMDGPU::SGPR_HI16RegClass.contains(Reg)) {
883 assert(!AMDGPU::TTMP_32RegClass.contains(Reg) &&
884 "trap handler registers should not be used");
885 IsSGPR = true;
886 Width = 1;
887 } else if (AMDGPU::VGPR_32RegClass.contains(Reg) ||
888 AMDGPU::VGPR_LO16RegClass.contains(Reg) ||
889 AMDGPU::VGPR_HI16RegClass.contains(Reg)) {
890 IsSGPR = false;
891 Width = 1;
892 } else if (AMDGPU::AGPR_32RegClass.contains(Reg) ||
893 AMDGPU::AGPR_LO16RegClass.contains(Reg)) {
894 IsSGPR = false;
895 IsAGPR = true;
896 Width = 1;
897 } else if (AMDGPU::SReg_64RegClass.contains(Reg)) {
898 assert(!AMDGPU::TTMP_64RegClass.contains(Reg) &&
899 "trap handler registers should not be used");
900 IsSGPR = true;
901 Width = 2;
902 } else if (AMDGPU::VReg_64RegClass.contains(Reg)) {
903 IsSGPR = false;
904 Width = 2;
905 } else if (AMDGPU::AReg_64RegClass.contains(Reg)) {
906 IsSGPR = false;
907 IsAGPR = true;
908 Width = 2;
909 } else if (AMDGPU::VReg_96RegClass.contains(Reg)) {
910 IsSGPR = false;
911 Width = 3;
912 } else if (AMDGPU::SReg_96RegClass.contains(Reg)) {
913 IsSGPR = true;
914 Width = 3;
915 } else if (AMDGPU::AReg_96RegClass.contains(Reg)) {
916 IsSGPR = false;
917 IsAGPR = true;
918 Width = 3;
919 } else if (AMDGPU::SReg_128RegClass.contains(Reg)) {
920 assert(!AMDGPU::TTMP_128RegClass.contains(Reg) &&
921 "trap handler registers should not be used");
922 IsSGPR = true;
923 Width = 4;
924 } else if (AMDGPU::VReg_128RegClass.contains(Reg)) {
925 IsSGPR = false;
926 Width = 4;
927 } else if (AMDGPU::AReg_128RegClass.contains(Reg)) {
928 IsSGPR = false;
929 IsAGPR = true;
930 Width = 4;
931 } else if (AMDGPU::VReg_160RegClass.contains(Reg)) {
932 IsSGPR = false;
933 Width = 5;
934 } else if (AMDGPU::SReg_160RegClass.contains(Reg)) {
935 IsSGPR = true;
936 Width = 5;
937 } else if (AMDGPU::AReg_160RegClass.contains(Reg)) {
938 IsSGPR = false;
939 IsAGPR = true;
940 Width = 5;
941 } else if (AMDGPU::VReg_192RegClass.contains(Reg)) {
942 IsSGPR = false;
943 Width = 6;
944 } else if (AMDGPU::SReg_192RegClass.contains(Reg)) {
945 IsSGPR = true;
946 Width = 6;
947 } else if (AMDGPU::AReg_192RegClass.contains(Reg)) {
948 IsSGPR = false;
949 IsAGPR = true;
950 Width = 6;
951 } else if (AMDGPU::SReg_256RegClass.contains(Reg)) {
952 assert(!AMDGPU::TTMP_256RegClass.contains(Reg) &&
953 "trap handler registers should not be used");
954 IsSGPR = true;
955 Width = 8;
956 } else if (AMDGPU::VReg_256RegClass.contains(Reg)) {
957 IsSGPR = false;
958 Width = 8;
959 } else if (AMDGPU::AReg_256RegClass.contains(Reg)) {
960 IsSGPR = false;
961 IsAGPR = true;
962 Width = 8;
963 } else if (AMDGPU::SReg_512RegClass.contains(Reg)) {
964 assert(!AMDGPU::TTMP_512RegClass.contains(Reg) &&
965 "trap handler registers should not be used");
966 IsSGPR = true;
967 Width = 16;
968 } else if (AMDGPU::VReg_512RegClass.contains(Reg)) {
969 IsSGPR = false;
970 Width = 16;
971 } else if (AMDGPU::AReg_512RegClass.contains(Reg)) {
972 IsSGPR = false;
973 IsAGPR = true;
974 Width = 16;
975 } else if (AMDGPU::SReg_1024RegClass.contains(Reg)) {
976 IsSGPR = true;
977 Width = 32;
978 } else if (AMDGPU::VReg_1024RegClass.contains(Reg)) {
979 IsSGPR = false;
980 Width = 32;
981 } else if (AMDGPU::AReg_1024RegClass.contains(Reg)) {
982 IsSGPR = false;
983 IsAGPR = true;
984 Width = 32;
985 } else {
986 llvm_unreachable("Unknown register class");
987 }
988 unsigned HWReg = TRI.getHWRegIndex(Reg);
989 int MaxUsed = HWReg + Width - 1;
990 if (IsSGPR) {
991 MaxSGPR = MaxUsed > MaxSGPR ? MaxUsed : MaxSGPR;
992 } else if (IsAGPR) {
993 MaxAGPR = MaxUsed > MaxAGPR ? MaxUsed : MaxAGPR;
994 } else {
995 MaxVGPR = MaxUsed > MaxVGPR ? MaxUsed : MaxVGPR;
996 }
997 }
998
999 if (MI.isCall()) {
1000 // Pseudo used just to encode the underlying global. Is there a better
1001 // way to track this?
1002
1003 const MachineOperand *CalleeOp
1004 = TII->getNamedOperand(MI, AMDGPU::OpName::callee);
1005
1006 const Function *Callee = getCalleeFunction(*CalleeOp);
1007 DenseMap<const Function *, SIFunctionResourceInfo>::const_iterator I =
1008 CallGraphResourceInfo.end();
1009 bool IsExternal = !Callee || Callee->isDeclaration();
1010 if (!IsExternal)
1011 I = CallGraphResourceInfo.find(Callee);
1012
1013 if (IsExternal || I == CallGraphResourceInfo.end()) {
1014 // Avoid crashing on undefined behavior with an illegal call to a
1015 // kernel. If a callsite's calling convention doesn't match the
1016 // function's, it's undefined behavior. If the callsite calling
1017 // convention does match, that would have errored earlier.
1018 // FIXME: The verifier shouldn't allow this.
1019 if (!IsExternal &&
1020 AMDGPU::isEntryFunctionCC(Callee->getCallingConv()))
1021 report_fatal_error("invalid call to entry function");
1022
1023 // If this is a call to an external function, we can't do much. Make
1024 // conservative guesses.
1025
          // 48 SGPRs, minus vcc, flat_scr and xnack.
1027 int MaxSGPRGuess =
1028 47 - IsaInfo::getNumExtraSGPRs(&ST, true, ST.hasFlatAddressSpace());
1029 MaxSGPR = std::max(MaxSGPR, MaxSGPRGuess);
1030 MaxVGPR = std::max(MaxVGPR, 23);
1031 MaxAGPR = std::max(MaxAGPR, 23);
1032
1033 CalleeFrameSize = std::max(CalleeFrameSize,
1034 static_cast<uint64_t>(AssumedStackSizeForExternalCall));
1035
1036 Info.UsesVCC = true;
1037 Info.UsesFlatScratch = ST.hasFlatAddressSpace();
1038 Info.HasDynamicallySizedStack = true;
1039 } else {
1040 // We force CodeGen to run in SCC order, so the callee's register
1041 // usage etc. should be the cumulative usage of all callees.
1042
1043 MaxSGPR = std::max(I->second.NumExplicitSGPR - 1, MaxSGPR);
1044 MaxVGPR = std::max(I->second.NumVGPR - 1, MaxVGPR);
1045 MaxAGPR = std::max(I->second.NumAGPR - 1, MaxAGPR);
1046 CalleeFrameSize
1047 = std::max(I->second.PrivateSegmentSize, CalleeFrameSize);
1048 Info.UsesVCC |= I->second.UsesVCC;
1049 Info.UsesFlatScratch |= I->second.UsesFlatScratch;
1050 Info.HasDynamicallySizedStack |= I->second.HasDynamicallySizedStack;
1051 Info.HasRecursion |= I->second.HasRecursion;
1052 }
1053
1054 // FIXME: Call site could have norecurse on it
1055 if (!Callee || !Callee->doesNotRecurse())
1056 Info.HasRecursion = true;
1057 }
1058 }
1059 }
1060
1061 Info.NumExplicitSGPR = MaxSGPR + 1;
1062 Info.NumVGPR = MaxVGPR + 1;
1063 Info.NumAGPR = MaxAGPR + 1;
1064 Info.PrivateSegmentSize += CalleeFrameSize;
1065
1066 return Info;
1067 }
1068
void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
1070 const MachineFunction &MF) {
1071 SIFunctionResourceInfo Info = analyzeResourceUsage(MF);
1072 const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
1073
1074 ProgInfo.NumArchVGPR = Info.NumVGPR;
1075 ProgInfo.NumAccVGPR = Info.NumAGPR;
1076 ProgInfo.NumVGPR = Info.getTotalNumVGPRs(STM);
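  // AccumOffset is encoded in granules of 4 VGPRs, biased by -1 (see the
  // COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET field written below).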
1077 ProgInfo.AccumOffset = alignTo(std::max(1, Info.NumVGPR), 4) / 4 - 1;
1078 ProgInfo.TgSplit = STM.isTgSplitEnabled();
1079 ProgInfo.NumSGPR = Info.NumExplicitSGPR;
1080 ProgInfo.ScratchSize = Info.PrivateSegmentSize;
1081 ProgInfo.VCCUsed = Info.UsesVCC;
1082 ProgInfo.FlatUsed = Info.UsesFlatScratch;
1083 ProgInfo.DynamicCallStack = Info.HasDynamicallySizedStack || Info.HasRecursion;
1084
1085 const uint64_t MaxScratchPerWorkitem =
1086 GCNSubtarget::MaxWaveScratchSize / STM.getWavefrontSize();
1087 if (ProgInfo.ScratchSize > MaxScratchPerWorkitem) {
1088 DiagnosticInfoStackSize DiagStackSize(MF.getFunction(),
1089 ProgInfo.ScratchSize, DS_Error);
1090 MF.getFunction().getContext().diagnose(DiagStackSize);
1091 }
1092
1093 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1094
1095 // TODO(scott.linder): The calculations related to SGPR/VGPR blocks are
1096 // duplicated in part in AMDGPUAsmParser::calculateGPRBlocks, and could be
1097 // unified.
1098 unsigned ExtraSGPRs = IsaInfo::getNumExtraSGPRs(
1099 &STM, ProgInfo.VCCUsed, ProgInfo.FlatUsed);
1100
1101 // Check the addressable register limit before we add ExtraSGPRs.
1102 if (STM.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
1103 !STM.hasSGPRInitBug()) {
1104 unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
1105 if (ProgInfo.NumSGPR > MaxAddressableNumSGPRs) {
1106 // This can happen due to a compiler bug or when using inline asm.
1107 LLVMContext &Ctx = MF.getFunction().getContext();
1108 DiagnosticInfoResourceLimit Diag(MF.getFunction(),
1109 "addressable scalar registers",
1110 ProgInfo.NumSGPR, DS_Error,
1111 DK_ResourceLimit,
1112 MaxAddressableNumSGPRs);
1113 Ctx.diagnose(Diag);
1114 ProgInfo.NumSGPR = MaxAddressableNumSGPRs - 1;
1115 }
1116 }
1117
1118 // Account for extra SGPRs and VGPRs reserved for debugger use.
1119 ProgInfo.NumSGPR += ExtraSGPRs;
1120
1121 const Function &F = MF.getFunction();
1122
1123 // Ensure there are enough SGPRs and VGPRs for wave dispatch, where wave
1124 // dispatch registers are function args.
1125 unsigned WaveDispatchNumSGPR = 0, WaveDispatchNumVGPR = 0;
1126
1127 if (isShader(F.getCallingConv())) {
1128 // FIXME: We should be using the number of registers determined during
1129 // calling convention lowering to legalize the types.
1130 const DataLayout &DL = F.getParent()->getDataLayout();
1131 for (auto &Arg : F.args()) {
1132 unsigned NumRegs = (DL.getTypeSizeInBits(Arg.getType()) + 31) / 32;
1133 if (Arg.hasAttribute(Attribute::InReg))
1134 WaveDispatchNumSGPR += NumRegs;
1135 else
1136 WaveDispatchNumVGPR += NumRegs;
1137 }
1138 ProgInfo.NumSGPR = std::max(ProgInfo.NumSGPR, WaveDispatchNumSGPR);
1139 ProgInfo.NumVGPR = std::max(ProgInfo.NumVGPR, WaveDispatchNumVGPR);
1140 }
1141
1142 // Adjust number of registers used to meet default/requested minimum/maximum
1143 // number of waves per execution unit request.
1144 ProgInfo.NumSGPRsForWavesPerEU = std::max(
1145 std::max(ProgInfo.NumSGPR, 1u), STM.getMinNumSGPRs(MFI->getMaxWavesPerEU()));
1146 ProgInfo.NumVGPRsForWavesPerEU = std::max(
1147 std::max(ProgInfo.NumVGPR, 1u), STM.getMinNumVGPRs(MFI->getMaxWavesPerEU()));
1148
1149 if (STM.getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS ||
1150 STM.hasSGPRInitBug()) {
1151 unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
1152 if (ProgInfo.NumSGPR > MaxAddressableNumSGPRs) {
1153 // This can happen due to a compiler bug or when using inline asm to use
1154 // the registers which are usually reserved for vcc etc.
1155 LLVMContext &Ctx = MF.getFunction().getContext();
1156 DiagnosticInfoResourceLimit Diag(MF.getFunction(),
1157 "scalar registers",
1158 ProgInfo.NumSGPR, DS_Error,
1159 DK_ResourceLimit,
1160 MaxAddressableNumSGPRs);
1161 Ctx.diagnose(Diag);
1162 ProgInfo.NumSGPR = MaxAddressableNumSGPRs;
1163 ProgInfo.NumSGPRsForWavesPerEU = MaxAddressableNumSGPRs;
1164 }
1165 }
1166
1167 if (STM.hasSGPRInitBug()) {
1168 ProgInfo.NumSGPR =
1169 AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;
1170 ProgInfo.NumSGPRsForWavesPerEU =
1171 AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;
1172 }
1173
1174 if (MFI->getNumUserSGPRs() > STM.getMaxNumUserSGPRs()) {
1175 LLVMContext &Ctx = MF.getFunction().getContext();
1176 DiagnosticInfoResourceLimit Diag(MF.getFunction(), "user SGPRs",
1177 MFI->getNumUserSGPRs(), DS_Error);
1178 Ctx.diagnose(Diag);
1179 }
1180
1181 if (MFI->getLDSSize() > static_cast<unsigned>(STM.getLocalMemorySize())) {
1182 LLVMContext &Ctx = MF.getFunction().getContext();
1183 DiagnosticInfoResourceLimit Diag(MF.getFunction(), "local memory",
1184 MFI->getLDSSize(), DS_Error);
1185 Ctx.diagnose(Diag);
1186 }
1187
1188 ProgInfo.SGPRBlocks = IsaInfo::getNumSGPRBlocks(
1189 &STM, ProgInfo.NumSGPRsForWavesPerEU);
1190 ProgInfo.VGPRBlocks = IsaInfo::getNumVGPRBlocks(
1191 &STM, ProgInfo.NumVGPRsForWavesPerEU);
1192
1193 const SIModeRegisterDefaults Mode = MFI->getMode();
1194
1195 // Set the value to initialize FP_ROUND and FP_DENORM parts of the mode
1196 // register.
1197 ProgInfo.FloatMode = getFPMode(Mode);
1198
1199 ProgInfo.IEEEMode = Mode.IEEE;
1200
  // Make the clamp modifier on a NaN input return 0.
1202 ProgInfo.DX10Clamp = Mode.DX10Clamp;
1203
1204 unsigned LDSAlignShift;
1205 if (STM.getGeneration() < AMDGPUSubtarget::SEA_ISLANDS) {
1206 // LDS is allocated in 64 dword blocks.
1207 LDSAlignShift = 8;
1208 } else {
1209 // LDS is allocated in 128 dword blocks.
1210 LDSAlignShift = 9;
1211 }
1212
1213 unsigned LDSSpillSize =
1214 MFI->getLDSWaveSpillSize() * MFI->getMaxFlatWorkGroupSize();
1215
1216 ProgInfo.LDSSize = MFI->getLDSSize() + LDSSpillSize;
1217 ProgInfo.LDSBlocks =
1218 alignTo(ProgInfo.LDSSize, 1ULL << LDSAlignShift) >> LDSAlignShift;
1219
1220 // Scratch is allocated in 256 dword blocks.
1221 unsigned ScratchAlignShift = 10;
1222 // We need to program the hardware with the amount of scratch memory that
1223 // is used by the entire wave. ProgInfo.ScratchSize is the amount of
1224 // scratch memory used per thread.
1225 ProgInfo.ScratchBlocks =
1226 alignTo(ProgInfo.ScratchSize * STM.getWavefrontSize(),
1227 1ULL << ScratchAlignShift) >>
1228 ScratchAlignShift;
1229
1230 if (getIsaVersion(getGlobalSTI()->getCPU()).Major >= 10) {
1231 ProgInfo.WgpMode = STM.isCuModeEnabled() ? 0 : 1;
1232 ProgInfo.MemOrdered = 1;
1233 }
1234
1235 // 0 = X, 1 = XY, 2 = XYZ
1236 unsigned TIDIGCompCnt = 0;
1237 if (MFI->hasWorkItemIDZ())
1238 TIDIGCompCnt = 2;
1239 else if (MFI->hasWorkItemIDY())
1240 TIDIGCompCnt = 1;
1241
1242 ProgInfo.ComputePGMRSrc2 =
1243 S_00B84C_SCRATCH_EN(ProgInfo.ScratchBlocks > 0) |
1244 S_00B84C_USER_SGPR(MFI->getNumUserSGPRs()) |
1245 // For AMDHSA, TRAP_HANDLER must be zero, as it is populated by the CP.
1246 S_00B84C_TRAP_HANDLER(STM.isAmdHsaOS() ? 0 : STM.isTrapHandlerEnabled()) |
1247 S_00B84C_TGID_X_EN(MFI->hasWorkGroupIDX()) |
1248 S_00B84C_TGID_Y_EN(MFI->hasWorkGroupIDY()) |
1249 S_00B84C_TGID_Z_EN(MFI->hasWorkGroupIDZ()) |
1250 S_00B84C_TG_SIZE_EN(MFI->hasWorkGroupInfo()) |
1251 S_00B84C_TIDIG_COMP_CNT(TIDIGCompCnt) |
1252 S_00B84C_EXCP_EN_MSB(0) |
1253 // For AMDHSA, LDS_SIZE must be zero, as it is populated by the CP.
1254 S_00B84C_LDS_SIZE(STM.isAmdHsaOS() ? 0 : ProgInfo.LDSBlocks) |
1255 S_00B84C_EXCP_EN(0);
1256
1257 if (STM.hasGFX90AInsts()) {
1258 AMDHSA_BITS_SET(ProgInfo.ComputePGMRSrc3GFX90A,
1259 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET,
1260 ProgInfo.AccumOffset);
1261 AMDHSA_BITS_SET(ProgInfo.ComputePGMRSrc3GFX90A,
1262 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT,
1263 ProgInfo.TgSplit);
1264 }
1265
1266 ProgInfo.Occupancy = STM.computeOccupancy(MF.getFunction(), ProgInfo.LDSSize,
1267 ProgInfo.NumSGPRsForWavesPerEU,
1268 ProgInfo.NumVGPRsForWavesPerEU);
1269 }
1270
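// Map a calling convention to the RSRC1 register that configures the
// corresponding hardware stage.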
static unsigned getRsrcReg(CallingConv::ID CallConv) {
1272 switch (CallConv) {
1273 default: LLVM_FALLTHROUGH;
1274 case CallingConv::AMDGPU_CS: return R_00B848_COMPUTE_PGM_RSRC1;
1275 case CallingConv::AMDGPU_LS: return R_00B528_SPI_SHADER_PGM_RSRC1_LS;
1276 case CallingConv::AMDGPU_HS: return R_00B428_SPI_SHADER_PGM_RSRC1_HS;
1277 case CallingConv::AMDGPU_ES: return R_00B328_SPI_SHADER_PGM_RSRC1_ES;
1278 case CallingConv::AMDGPU_GS: return R_00B228_SPI_SHADER_PGM_RSRC1_GS;
1279 case CallingConv::AMDGPU_VS: return R_00B128_SPI_SHADER_PGM_RSRC1_VS;
1280 case CallingConv::AMDGPU_PS: return R_00B028_SPI_SHADER_PGM_RSRC1_PS;
1281 }
1282 }
1283
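// Emit the program info as a sequence of (register, value) pairs. This path is
// only used for Mesa (non-HSA, non-PAL) targets, where the pairs end up in the
// .AMDGPU.config section selected in runOnMachineFunction.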
void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
1285 const SIProgramInfo &CurrentProgramInfo) {
1286 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1287 unsigned RsrcReg = getRsrcReg(MF.getFunction().getCallingConv());
1288
1289 if (AMDGPU::isCompute(MF.getFunction().getCallingConv())) {
1290 OutStreamer->emitInt32(R_00B848_COMPUTE_PGM_RSRC1);
1291
1292 OutStreamer->emitInt32(CurrentProgramInfo.getComputePGMRSrc1());
1293
1294 OutStreamer->emitInt32(R_00B84C_COMPUTE_PGM_RSRC2);
1295 OutStreamer->emitInt32(CurrentProgramInfo.ComputePGMRSrc2);
1296
1297 OutStreamer->emitInt32(R_00B860_COMPUTE_TMPRING_SIZE);
1298 OutStreamer->emitInt32(S_00B860_WAVESIZE(CurrentProgramInfo.ScratchBlocks));
1299
1300 // TODO: Should probably note flat usage somewhere. SC emits a "FlatPtr32 =
1301 // 0" comment but I don't see a corresponding field in the register spec.
1302 } else {
1303 OutStreamer->emitInt32(RsrcReg);
1304 OutStreamer->emitIntValue(S_00B028_VGPRS(CurrentProgramInfo.VGPRBlocks) |
1305 S_00B028_SGPRS(CurrentProgramInfo.SGPRBlocks), 4);
1306 OutStreamer->emitInt32(R_0286E8_SPI_TMPRING_SIZE);
1307 OutStreamer->emitIntValue(
1308 S_0286E8_WAVESIZE(CurrentProgramInfo.ScratchBlocks), 4);
1309 }
1310
1311 if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS) {
1312 OutStreamer->emitInt32(R_00B02C_SPI_SHADER_PGM_RSRC2_PS);
1313 OutStreamer->emitInt32(
1314 S_00B02C_EXTRA_LDS_SIZE(CurrentProgramInfo.LDSBlocks));
1315 OutStreamer->emitInt32(R_0286CC_SPI_PS_INPUT_ENA);
1316 OutStreamer->emitInt32(MFI->getPSInputEnable());
1317 OutStreamer->emitInt32(R_0286D0_SPI_PS_INPUT_ADDR);
1318 OutStreamer->emitInt32(MFI->getPSInputAddr());
1319 }
1320
1321 OutStreamer->emitInt32(R_SPILLED_SGPRS);
1322 OutStreamer->emitInt32(MFI->getNumSpilledSGPRs());
1323 OutStreamer->emitInt32(R_SPILLED_VGPRS);
1324 OutStreamer->emitInt32(MFI->getNumSpilledVGPRs());
1325 }
1326
1327 // This is the equivalent of EmitProgramInfoSI above, but for when the OS type
1328 // is AMDPAL. It stores each compute/SPI register setting and other PAL
1329 // metadata items into the PALMD::Metadata, combining with any provided by the
1330 // frontend as LLVM metadata. Once all functions are written, the PAL metadata
1331 // is then written as a single block in the .note section.
void AMDGPUAsmPrinter::EmitPALMetadata(const MachineFunction &MF,
1333 const SIProgramInfo &CurrentProgramInfo) {
1334 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1335 auto CC = MF.getFunction().getCallingConv();
1336 auto MD = getTargetStreamer()->getPALMetadata();
1337
1338 MD->setEntryPoint(CC, MF.getFunction().getName());
1339 MD->setNumUsedVgprs(CC, CurrentProgramInfo.NumVGPRsForWavesPerEU);
1340 MD->setNumUsedSgprs(CC, CurrentProgramInfo.NumSGPRsForWavesPerEU);
1341 MD->setRsrc1(CC, CurrentProgramInfo.getPGMRSrc1(CC));
1342 if (AMDGPU::isCompute(CC)) {
1343 MD->setRsrc2(CC, CurrentProgramInfo.ComputePGMRSrc2);
1344 } else {
1345 if (CurrentProgramInfo.ScratchBlocks > 0)
1346 MD->setRsrc2(CC, S_00B84C_SCRATCH_EN(1));
1347 }
  // ScratchSize is in bytes, aligned to 16 bytes.
1349 MD->setScratchSize(CC, alignTo(CurrentProgramInfo.ScratchSize, 16));
1350 if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS) {
1351 MD->setRsrc2(CC, S_00B02C_EXTRA_LDS_SIZE(CurrentProgramInfo.LDSBlocks));
1352 MD->setSpiPsInputEna(MFI->getPSInputEnable());
1353 MD->setSpiPsInputAddr(MFI->getPSInputAddr());
1354 }
1355
1356 const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
1357 if (STM.isWave32())
1358 MD->setWave32(MF.getFunction().getCallingConv());
1359 }
1360
void AMDGPUAsmPrinter::emitPALFunctionMetadata(const MachineFunction &MF) {
1362 auto *MD = getTargetStreamer()->getPALMetadata();
1363 const MachineFrameInfo &MFI = MF.getFrameInfo();
1364 MD->setFunctionScratchSize(MF, MFI.getStackSize());
1365 // Set compute registers
1366 MD->setRsrc1(CallingConv::AMDGPU_CS,
1367 CurrentProgramInfo.getPGMRSrc1(CallingConv::AMDGPU_CS));
1368 MD->setRsrc2(CallingConv::AMDGPU_CS, CurrentProgramInfo.ComputePGMRSrc2);
1369 }
1370
1371 // This is supposed to be log2(Size)
static amd_element_byte_size_t getElementByteSizeValue(unsigned Size) {
1373 switch (Size) {
1374 case 4:
1375 return AMD_ELEMENT_4_BYTES;
1376 case 8:
1377 return AMD_ELEMENT_8_BYTES;
1378 case 16:
1379 return AMD_ELEMENT_16_BYTES;
1380 default:
1381 llvm_unreachable("invalid private_element_size");
1382 }
1383 }
1384
void AMDGPUAsmPrinter::getAmdKernelCode(amd_kernel_code_t &Out,
1386 const SIProgramInfo &CurrentProgramInfo,
1387 const MachineFunction &MF) const {
1388 const Function &F = MF.getFunction();
1389 assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
1390 F.getCallingConv() == CallingConv::SPIR_KERNEL);
1391
1392 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1393 const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
1394
1395 AMDGPU::initDefaultAMDKernelCodeT(Out, &STM);
1396
1397 Out.compute_pgm_resource_registers =
1398 CurrentProgramInfo.getComputePGMRSrc1() |
1399 (CurrentProgramInfo.ComputePGMRSrc2 << 32);
1400 Out.code_properties |= AMD_CODE_PROPERTY_IS_PTR64;
1401
1402 if (CurrentProgramInfo.DynamicCallStack)
1403 Out.code_properties |= AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK;
1404
1405 AMD_HSA_BITS_SET(Out.code_properties,
1406 AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE,
1407 getElementByteSizeValue(STM.getMaxPrivateElementSize(true)));
1408
1409 if (MFI->hasPrivateSegmentBuffer()) {
1410 Out.code_properties |=
1411 AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER;
1412 }
1413
1414 if (MFI->hasDispatchPtr())
1415 Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR;
1416
1417 if (MFI->hasQueuePtr())
1418 Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR;
1419
1420 if (MFI->hasKernargSegmentPtr())
1421 Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR;
1422
1423 if (MFI->hasDispatchID())
1424 Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID;
1425
1426 if (MFI->hasFlatScratchInit())
1427 Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT;
1428
1429 if (MFI->hasDispatchPtr())
1430 Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR;
1431
1432 if (STM.isXNACKEnabled())
1433 Out.code_properties |= AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED;
1434
1435 Align MaxKernArgAlign;
1436 Out.kernarg_segment_byte_size = STM.getKernArgSegmentSize(F, MaxKernArgAlign);
1437 Out.wavefront_sgpr_count = CurrentProgramInfo.NumSGPR;
1438 Out.workitem_vgpr_count = CurrentProgramInfo.NumVGPR;
1439 Out.workitem_private_segment_byte_size = CurrentProgramInfo.ScratchSize;
1440 Out.workgroup_group_segment_byte_size = CurrentProgramInfo.LDSSize;
1441
1442 // kernarg_segment_alignment is specified as log of the alignment.
1443 // The minimum alignment is 16.
1444 Out.kernarg_segment_alignment = Log2(std::max(Align(16), MaxKernArgAlign));
1445 }
1446
bool AMDGPUAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
1448 const char *ExtraCode, raw_ostream &O) {
1449 // First try the generic code, which knows about modifiers like 'c' and 'n'.
1450 if (!AsmPrinter::PrintAsmOperand(MI, OpNo, ExtraCode, O))
1451 return false;
1452
1453 if (ExtraCode && ExtraCode[0]) {
1454 if (ExtraCode[1] != 0)
1455 return true; // Unknown modifier.
1456
1457 switch (ExtraCode[0]) {
1458 case 'r':
1459 break;
1460 default:
1461 return true;
1462 }
1463 }
1464
1465 // TODO: Should be able to support other operand types like globals.
1466 const MachineOperand &MO = MI->getOperand(OpNo);
1467 if (MO.isReg()) {
1468 AMDGPUInstPrinter::printRegOperand(MO.getReg(), O,
1469 *MF->getSubtarget().getRegisterInfo());
1470 return false;
1471 } else if (MO.isImm()) {
1472 int64_t Val = MO.getImm();
1473 if (AMDGPU::isInlinableIntLiteral(Val)) {
1474 O << Val;
1475 } else if (isUInt<16>(Val)) {
1476 O << format("0x%" PRIx16, static_cast<uint16_t>(Val));
1477 } else if (isUInt<32>(Val)) {
1478 O << format("0x%" PRIx32, static_cast<uint32_t>(Val));
1479 } else {
1480 O << format("0x%" PRIx64, static_cast<uint64_t>(Val));
1481 }
1482 return false;
1483 }
1484 return true;
1485 }
1486