1 //===-- AMDGPUTargetMachine.cpp - TargetMachine for hw codegen targets-----===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// The AMDGPU target machine contains all of the hardware specific
11 /// information needed to emit code for SI+ GPUs.
12 //
13 //===----------------------------------------------------------------------===//
14
15 #include "AMDGPUTargetMachine.h"
16 #include "AMDGPU.h"
17 #include "AMDGPUAliasAnalysis.h"
18 #include "AMDGPUCtorDtorLowering.h"
19 #include "AMDGPUExportClustering.h"
20 #include "AMDGPUIGroupLP.h"
21 #include "AMDGPUMacroFusion.h"
22 #include "AMDGPURegBankSelect.h"
23 #include "AMDGPUTargetObjectFile.h"
24 #include "AMDGPUTargetTransformInfo.h"
25 #include "AMDGPUUnifyDivergentExitNodes.h"
26 #include "GCNIterativeScheduler.h"
27 #include "GCNSchedStrategy.h"
28 #include "GCNVOPDUtils.h"
29 #include "R600.h"
30 #include "R600MachineFunctionInfo.h"
31 #include "R600TargetMachine.h"
32 #include "SIMachineFunctionInfo.h"
33 #include "SIMachineScheduler.h"
34 #include "TargetInfo/AMDGPUTargetInfo.h"
35 #include "Utils/AMDGPUBaseInfo.h"
36 #include "llvm/Analysis/CGSCCPassManager.h"
37 #include "llvm/CodeGen/GlobalISel/CSEInfo.h"
38 #include "llvm/CodeGen/GlobalISel/IRTranslator.h"
39 #include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
40 #include "llvm/CodeGen/GlobalISel/Legalizer.h"
41 #include "llvm/CodeGen/GlobalISel/Localizer.h"
42 #include "llvm/CodeGen/GlobalISel/RegBankSelect.h"
43 #include "llvm/CodeGen/MIRParser/MIParser.h"
44 #include "llvm/CodeGen/Passes.h"
45 #include "llvm/CodeGen/RegAllocRegistry.h"
46 #include "llvm/CodeGen/TargetPassConfig.h"
47 #include "llvm/IR/IntrinsicsAMDGPU.h"
48 #include "llvm/IR/PassManager.h"
49 #include "llvm/IR/PatternMatch.h"
50 #include "llvm/InitializePasses.h"
51 #include "llvm/MC/TargetRegistry.h"
52 #include "llvm/Passes/PassBuilder.h"
53 #include "llvm/Transforms/HipStdPar/HipStdPar.h"
54 #include "llvm/Transforms/IPO.h"
55 #include "llvm/Transforms/IPO/AlwaysInliner.h"
56 #include "llvm/Transforms/IPO/GlobalDCE.h"
57 #include "llvm/Transforms/IPO/Internalize.h"
58 #include "llvm/Transforms/Scalar.h"
59 #include "llvm/Transforms/Scalar/GVN.h"
60 #include "llvm/Transforms/Scalar/InferAddressSpaces.h"
61 #include "llvm/Transforms/Utils.h"
62 #include "llvm/Transforms/Utils/SimplifyLibCalls.h"
63 #include "llvm/Transforms/Vectorize/LoadStoreVectorizer.h"
64 #include <optional>
65
66 using namespace llvm;
67 using namespace llvm::PatternMatch;
68
69 namespace {
70 class SGPRRegisterRegAlloc : public RegisterRegAllocBase<SGPRRegisterRegAlloc> {
71 public:
72 SGPRRegisterRegAlloc(const char *N, const char *D, FunctionPassCtor C)
73 : RegisterRegAllocBase(N, D, C) {}
74 };
75
76 class VGPRRegisterRegAlloc : public RegisterRegAllocBase<VGPRRegisterRegAlloc> {
77 public:
78 VGPRRegisterRegAlloc(const char *N, const char *D, FunctionPassCtor C)
79 : RegisterRegAllocBase(N, D, C) {}
80 };
81
82 static bool onlyAllocateSGPRs(const TargetRegisterInfo &TRI,
83 const TargetRegisterClass &RC) {
84 return static_cast<const SIRegisterInfo &>(TRI).isSGPRClass(&RC);
85 }
86
87 static bool onlyAllocateVGPRs(const TargetRegisterInfo &TRI,
88 const TargetRegisterClass &RC) {
89 return !static_cast<const SIRegisterInfo &>(TRI).isSGPRClass(&RC);
90 }
91
92
93 /// -{sgpr|vgpr}-regalloc=... command line option.
94 static FunctionPass *useDefaultRegisterAllocator() { return nullptr; }
95
96 /// A dummy default pass factory indicates whether the register allocator is
97 /// overridden on the command line.
98 static llvm::once_flag InitializeDefaultSGPRRegisterAllocatorFlag;
99 static llvm::once_flag InitializeDefaultVGPRRegisterAllocatorFlag;
100
101 static SGPRRegisterRegAlloc
102 defaultSGPRRegAlloc("default",
103 "pick SGPR register allocator based on -O option",
104 useDefaultRegisterAllocator);
105
106 static cl::opt<SGPRRegisterRegAlloc::FunctionPassCtor, false,
107 RegisterPassParser<SGPRRegisterRegAlloc>>
108 SGPRRegAlloc("sgpr-regalloc", cl::Hidden, cl::init(&useDefaultRegisterAllocator),
109 cl::desc("Register allocator to use for SGPRs"));
110
111 static cl::opt<VGPRRegisterRegAlloc::FunctionPassCtor, false,
112 RegisterPassParser<VGPRRegisterRegAlloc>>
113 VGPRRegAlloc("vgpr-regalloc", cl::Hidden, cl::init(&useDefaultRegisterAllocator),
114 cl::desc("Register allocator to use for VGPRs"));
115
116
117 static void initializeDefaultSGPRRegisterAllocatorOnce() {
118 RegisterRegAlloc::FunctionPassCtor Ctor = SGPRRegisterRegAlloc::getDefault();
119
120 if (!Ctor) {
121 Ctor = SGPRRegAlloc;
122 SGPRRegisterRegAlloc::setDefault(SGPRRegAlloc);
123 }
124 }
125
126 static void initializeDefaultVGPRRegisterAllocatorOnce() {
127 RegisterRegAlloc::FunctionPassCtor Ctor = VGPRRegisterRegAlloc::getDefault();
128
129 if (!Ctor) {
130 Ctor = VGPRRegAlloc;
131 VGPRRegisterRegAlloc::setDefault(VGPRRegAlloc);
132 }
133 }
134
135 static FunctionPass *createBasicSGPRRegisterAllocator() {
136 return createBasicRegisterAllocator(onlyAllocateSGPRs);
137 }
138
139 static FunctionPass *createGreedySGPRRegisterAllocator() {
140 return createGreedyRegisterAllocator(onlyAllocateSGPRs);
141 }
142
143 static FunctionPass *createFastSGPRRegisterAllocator() {
144 return createFastRegisterAllocator(onlyAllocateSGPRs, false);
145 }
146
147 static FunctionPass *createBasicVGPRRegisterAllocator() {
148 return createBasicRegisterAllocator(onlyAllocateVGPRs);
149 }
150
151 static FunctionPass *createGreedyVGPRRegisterAllocator() {
152 return createGreedyRegisterAllocator(onlyAllocateVGPRs);
153 }
154
155 static FunctionPass *createFastVGPRRegisterAllocator() {
156 return createFastRegisterAllocator(onlyAllocateVGPRs, true);
157 }
158
159 static SGPRRegisterRegAlloc basicRegAllocSGPR(
160 "basic", "basic register allocator", createBasicSGPRRegisterAllocator);
161 static SGPRRegisterRegAlloc greedyRegAllocSGPR(
162 "greedy", "greedy register allocator", createGreedySGPRRegisterAllocator);
163
164 static SGPRRegisterRegAlloc fastRegAllocSGPR(
165 "fast", "fast register allocator", createFastSGPRRegisterAllocator);
166
167
168 static VGPRRegisterRegAlloc basicRegAllocVGPR(
169 "basic", "basic register allocator", createBasicVGPRRegisterAllocator);
170 static VGPRRegisterRegAlloc greedyRegAllocVGPR(
171 "greedy", "greedy register allocator", createGreedyVGPRRegisterAllocator);
172
173 static VGPRRegisterRegAlloc fastRegAllocVGPR(
174 "fast", "fast register allocator", createFastVGPRRegisterAllocator);
175 }
176
177 static cl::opt<bool>
178 EnableEarlyIfConversion("amdgpu-early-ifcvt", cl::Hidden,
179 cl::desc("Run early if-conversion"),
180 cl::init(false));
181
182 static cl::opt<bool>
183 OptExecMaskPreRA("amdgpu-opt-exec-mask-pre-ra", cl::Hidden,
184 cl::desc("Run pre-RA exec mask optimizations"),
185 cl::init(true));
186
187 static cl::opt<bool>
188 LowerCtorDtor("amdgpu-lower-global-ctor-dtor",
189 cl::desc("Lower GPU ctor / dtors to globals on the device."),
190 cl::init(true), cl::Hidden);
191
192 // Option to disable vectorizer for tests.
193 static cl::opt<bool> EnableLoadStoreVectorizer(
194 "amdgpu-load-store-vectorizer",
195 cl::desc("Enable load store vectorizer"),
196 cl::init(true),
197 cl::Hidden);
198
199 // Option to control global loads scalarization
200 static cl::opt<bool> ScalarizeGlobal(
201 "amdgpu-scalarize-global-loads",
202 cl::desc("Enable global load scalarization"),
203 cl::init(true),
204 cl::Hidden);
205
206 // Option to run internalize pass.
207 static cl::opt<bool> InternalizeSymbols(
208 "amdgpu-internalize-symbols",
209 cl::desc("Enable elimination of non-kernel functions and unused globals"),
210 cl::init(false),
211 cl::Hidden);
212
213 // Option to inline all early.
214 static cl::opt<bool> EarlyInlineAll(
215 "amdgpu-early-inline-all",
216 cl::desc("Inline all functions early"),
217 cl::init(false),
218 cl::Hidden);
219
220 static cl::opt<bool> RemoveIncompatibleFunctions(
221 "amdgpu-enable-remove-incompatible-functions", cl::Hidden,
222 cl::desc("Enable removal of functions when they"
223 "use features not supported by the target GPU"),
224 cl::init(true));
225
226 static cl::opt<bool> EnableSDWAPeephole(
227 "amdgpu-sdwa-peephole",
228 cl::desc("Enable SDWA peepholer"),
229 cl::init(true));
230
231 static cl::opt<bool> EnableDPPCombine(
232 "amdgpu-dpp-combine",
233 cl::desc("Enable DPP combiner"),
234 cl::init(true));
235
236 // Enable address space based alias analysis
237 static cl::opt<bool> EnableAMDGPUAliasAnalysis("enable-amdgpu-aa", cl::Hidden,
238 cl::desc("Enable AMDGPU Alias Analysis"),
239 cl::init(true));
240
241 // Option to run late CFG structurizer
242 static cl::opt<bool, true> LateCFGStructurize(
243 "amdgpu-late-structurize",
244 cl::desc("Enable late CFG structurization"),
245 cl::location(AMDGPUTargetMachine::EnableLateStructurizeCFG),
246 cl::Hidden);
247
248 // Enable lib calls simplifications
249 static cl::opt<bool> EnableLibCallSimplify(
250 "amdgpu-simplify-libcall",
251 cl::desc("Enable amdgpu library simplifications"),
252 cl::init(true),
253 cl::Hidden);
254
255 static cl::opt<bool> EnableLowerKernelArguments(
256 "amdgpu-ir-lower-kernel-arguments",
257 cl::desc("Lower kernel argument loads in IR pass"),
258 cl::init(true),
259 cl::Hidden);
260
261 static cl::opt<bool> EnableRegReassign(
262 "amdgpu-reassign-regs",
263 cl::desc("Enable register reassign optimizations on gfx10+"),
264 cl::init(true),
265 cl::Hidden);
266
267 static cl::opt<bool> OptVGPRLiveRange(
268 "amdgpu-opt-vgpr-liverange",
269 cl::desc("Enable VGPR liverange optimizations for if-else structure"),
270 cl::init(true), cl::Hidden);
271
272 static cl::opt<ScanOptions> AMDGPUAtomicOptimizerStrategy(
273 "amdgpu-atomic-optimizer-strategy",
274 cl::desc("Select DPP or Iterative strategy for scan"),
275 cl::init(ScanOptions::Iterative),
276 cl::values(
277 clEnumValN(ScanOptions::DPP, "DPP", "Use DPP operations for scan"),
278 clEnumValN(ScanOptions::Iterative, "Iterative",
279 "Use Iterative approach for scan"),
280 clEnumValN(ScanOptions::None, "None", "Disable atomic optimizer")));
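// Illustrative usage (assumed invocation): the scan strategy can be forced
// with -amdgpu-atomic-optimizer-strategy=DPP, or the optimizer disabled
// entirely with -amdgpu-atomic-optimizer-strategy=None.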
281
282 // Enable Mode register optimization
283 static cl::opt<bool> EnableSIModeRegisterPass(
284 "amdgpu-mode-register",
285 cl::desc("Enable mode register pass"),
286 cl::init(true),
287 cl::Hidden);
288
289 // Enable GFX11.5+ s_singleuse_vdst insertion
290 static cl::opt<bool>
291 EnableInsertSingleUseVDST("amdgpu-enable-single-use-vdst",
292 cl::desc("Enable s_singleuse_vdst insertion"),
293 cl::init(false), cl::Hidden);
294
295 // Enable GFX11+ s_delay_alu insertion
296 static cl::opt<bool>
297 EnableInsertDelayAlu("amdgpu-enable-delay-alu",
298 cl::desc("Enable s_delay_alu insertion"),
299 cl::init(true), cl::Hidden);
300
301 // Enable GFX11+ VOPD
302 static cl::opt<bool>
303 EnableVOPD("amdgpu-enable-vopd",
304 cl::desc("Enable VOPD, dual issue of VALU in wave32"),
305 cl::init(true), cl::Hidden);
306
307 // This option is used in lit tests to prevent dead-code elimination of the patterns being inspected.
308 static cl::opt<bool>
309 EnableDCEInRA("amdgpu-dce-in-ra",
310 cl::init(true), cl::Hidden,
311 cl::desc("Enable machine DCE inside regalloc"));
312
313 static cl::opt<bool> EnableSetWavePriority("amdgpu-set-wave-priority",
314 cl::desc("Adjust wave priority"),
315 cl::init(false), cl::Hidden);
316
317 static cl::opt<bool> EnableScalarIRPasses(
318 "amdgpu-scalar-ir-passes",
319 cl::desc("Enable scalar IR passes"),
320 cl::init(true),
321 cl::Hidden);
322
323 static cl::opt<bool> EnableStructurizerWorkarounds(
324 "amdgpu-enable-structurizer-workarounds",
325 cl::desc("Enable workarounds for the StructurizeCFG pass"), cl::init(true),
326 cl::Hidden);
327
328 static cl::opt<bool, true> EnableLowerModuleLDS(
329 "amdgpu-enable-lower-module-lds", cl::desc("Enable lower module lds pass"),
330 cl::location(AMDGPUTargetMachine::EnableLowerModuleLDS), cl::init(true),
331 cl::Hidden);
332
333 static cl::opt<bool> EnablePreRAOptimizations(
334 "amdgpu-enable-pre-ra-optimizations",
335 cl::desc("Enable Pre-RA optimizations pass"), cl::init(true),
336 cl::Hidden);
337
338 static cl::opt<bool> EnablePromoteKernelArguments(
339 "amdgpu-enable-promote-kernel-arguments",
340 cl::desc("Enable promotion of flat kernel pointer arguments to global"),
341 cl::Hidden, cl::init(true));
342
343 static cl::opt<bool> EnableImageIntrinsicOptimizer(
344 "amdgpu-enable-image-intrinsic-optimizer",
345 cl::desc("Enable image intrinsic optimizer pass"), cl::init(true),
346 cl::Hidden);
347
348 static cl::opt<bool>
349 EnableLoopPrefetch("amdgpu-loop-prefetch",
350 cl::desc("Enable loop data prefetch on AMDGPU"),
351 cl::Hidden, cl::init(false));
352
353 static cl::opt<bool> EnableMaxIlpSchedStrategy(
354 "amdgpu-enable-max-ilp-scheduling-strategy",
355 cl::desc("Enable scheduling strategy to maximize ILP for a single wave."),
356 cl::Hidden, cl::init(false));
357
358 static cl::opt<bool> EnableRewritePartialRegUses(
359 "amdgpu-enable-rewrite-partial-reg-uses",
360 cl::desc("Enable rewrite partial reg uses pass"), cl::init(true),
361 cl::Hidden);
362
363 static cl::opt<bool> EnableHipStdPar(
364 "amdgpu-enable-hipstdpar",
365 cl::desc("Enable HIP Standard Parallelism Offload support"), cl::init(false),
366 cl::Hidden);
367
368 extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
369 // Register the target
370 RegisterTargetMachine<R600TargetMachine> X(getTheR600Target());
371 RegisterTargetMachine<GCNTargetMachine> Y(getTheGCNTarget());
372
373 PassRegistry *PR = PassRegistry::getPassRegistry();
374 initializeR600ClauseMergePassPass(*PR);
375 initializeR600ControlFlowFinalizerPass(*PR);
376 initializeR600PacketizerPass(*PR);
377 initializeR600ExpandSpecialInstrsPassPass(*PR);
378 initializeR600VectorRegMergerPass(*PR);
379 initializeGlobalISel(*PR);
380 initializeAMDGPUDAGToDAGISelPass(*PR);
381 initializeGCNDPPCombinePass(*PR);
382 initializeSILowerI1CopiesPass(*PR);
383 initializeAMDGPUGlobalISelDivergenceLoweringPass(*PR);
384 initializeSILowerWWMCopiesPass(*PR);
385 initializeAMDGPUMarkLastScratchLoadPass(*PR);
386 initializeSILowerSGPRSpillsPass(*PR);
387 initializeSIFixSGPRCopiesPass(*PR);
388 initializeSIFixVGPRCopiesPass(*PR);
389 initializeSIFoldOperandsPass(*PR);
390 initializeSIPeepholeSDWAPass(*PR);
391 initializeSIShrinkInstructionsPass(*PR);
392 initializeSIOptimizeExecMaskingPreRAPass(*PR);
393 initializeSIOptimizeVGPRLiveRangePass(*PR);
394 initializeSILoadStoreOptimizerPass(*PR);
395 initializeAMDGPUCtorDtorLoweringLegacyPass(*PR);
396 initializeAMDGPUAlwaysInlinePass(*PR);
397 initializeAMDGPUAttributorLegacyPass(*PR);
398 initializeAMDGPUAnnotateKernelFeaturesPass(*PR);
399 initializeAMDGPUAnnotateUniformValuesPass(*PR);
400 initializeAMDGPUArgumentUsageInfoPass(*PR);
401 initializeAMDGPUAtomicOptimizerPass(*PR);
402 initializeAMDGPULowerKernelArgumentsPass(*PR);
403 initializeAMDGPUPromoteKernelArgumentsPass(*PR);
404 initializeAMDGPULowerKernelAttributesPass(*PR);
405 initializeAMDGPUOpenCLEnqueuedBlockLoweringPass(*PR);
406 initializeAMDGPUPostLegalizerCombinerPass(*PR);
407 initializeAMDGPUPreLegalizerCombinerPass(*PR);
408 initializeAMDGPURegBankCombinerPass(*PR);
409 initializeAMDGPURegBankSelectPass(*PR);
410 initializeAMDGPUPromoteAllocaPass(*PR);
411 initializeAMDGPUPromoteAllocaToVectorPass(*PR);
412 initializeAMDGPUCodeGenPreparePass(*PR);
413 initializeAMDGPULateCodeGenPreparePass(*PR);
414 initializeAMDGPURemoveIncompatibleFunctionsPass(*PR);
415 initializeAMDGPULowerModuleLDSLegacyPass(*PR);
416 initializeAMDGPURewriteOutArgumentsPass(*PR);
417 initializeAMDGPURewriteUndefForPHILegacyPass(*PR);
418 initializeAMDGPUUnifyMetadataPass(*PR);
419 initializeSIAnnotateControlFlowPass(*PR);
420 initializeAMDGPUInsertSingleUseVDSTPass(*PR);
421 initializeAMDGPUInsertDelayAluPass(*PR);
422 initializeSIInsertHardClausesPass(*PR);
423 initializeSIInsertWaitcntsPass(*PR);
424 initializeSIModeRegisterPass(*PR);
425 initializeSIWholeQuadModePass(*PR);
426 initializeSILowerControlFlowPass(*PR);
427 initializeSIPreEmitPeepholePass(*PR);
428 initializeSILateBranchLoweringPass(*PR);
429 initializeSIMemoryLegalizerPass(*PR);
430 initializeSIOptimizeExecMaskingPass(*PR);
431 initializeSIPreAllocateWWMRegsPass(*PR);
432 initializeSIFormMemoryClausesPass(*PR);
433 initializeSIPostRABundlerPass(*PR);
434 initializeGCNCreateVOPDPass(*PR);
435 initializeAMDGPUUnifyDivergentExitNodesPass(*PR);
436 initializeAMDGPUAAWrapperPassPass(*PR);
437 initializeAMDGPUExternalAAWrapperPass(*PR);
438 initializeAMDGPUImageIntrinsicOptimizerPass(*PR);
439 initializeAMDGPUPrintfRuntimeBindingPass(*PR);
440 initializeAMDGPUResourceUsageAnalysisPass(*PR);
441 initializeGCNNSAReassignPass(*PR);
442 initializeGCNPreRAOptimizationsPass(*PR);
443 initializeGCNPreRALongBranchRegPass(*PR);
444 initializeGCNRewritePartialRegUsesPass(*PR);
445 initializeGCNRegPressurePrinterPass(*PR);
446 }
447
448 static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
449 return std::make_unique<AMDGPUTargetObjectFile>();
450 }
451
452 static ScheduleDAGInstrs *createSIMachineScheduler(MachineSchedContext *C) {
453 return new SIScheduleDAGMI(C);
454 }
455
456 static ScheduleDAGInstrs *
457 createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
458 const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
459 ScheduleDAGMILive *DAG =
460 new GCNScheduleDAGMILive(C, std::make_unique<GCNMaxOccupancySchedStrategy>(C));
461 DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
462 if (ST.shouldClusterStores())
463 DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
464 DAG->addMutation(createIGroupLPDAGMutation(/*IsPostRA=*/false));
465 DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
466 DAG->addMutation(createAMDGPUExportClusteringDAGMutation());
467 return DAG;
468 }
469
470 static ScheduleDAGInstrs *
471 createGCNMaxILPMachineScheduler(MachineSchedContext *C) {
472 ScheduleDAGMILive *DAG =
473 new GCNScheduleDAGMILive(C, std::make_unique<GCNMaxILPSchedStrategy>(C));
474 DAG->addMutation(createIGroupLPDAGMutation(/*IsPostRA=*/false));
475 return DAG;
476 }
477
478 static ScheduleDAGInstrs *
479 createIterativeGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
480 const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
481 auto DAG = new GCNIterativeScheduler(C,
482 GCNIterativeScheduler::SCHEDULE_LEGACYMAXOCCUPANCY);
483 DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
484 if (ST.shouldClusterStores())
485 DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
486 return DAG;
487 }
488
489 static ScheduleDAGInstrs *createMinRegScheduler(MachineSchedContext *C) {
490 return new GCNIterativeScheduler(C,
491 GCNIterativeScheduler::SCHEDULE_MINREGFORCED);
492 }
493
494 static ScheduleDAGInstrs *
495 createIterativeILPMachineScheduler(MachineSchedContext *C) {
496 const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
497 auto DAG = new GCNIterativeScheduler(C,
498 GCNIterativeScheduler::SCHEDULE_ILP);
499 DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
500 if (ST.shouldClusterStores())
501 DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
502 DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
503 return DAG;
504 }
505
506 static MachineSchedRegistry
507 SISchedRegistry("si", "Run SI's custom scheduler",
508 createSIMachineScheduler);
509
510 static MachineSchedRegistry
511 GCNMaxOccupancySchedRegistry("gcn-max-occupancy",
512 "Run GCN scheduler to maximize occupancy",
513 createGCNMaxOccupancyMachineScheduler);
514
515 static MachineSchedRegistry
516 GCNMaxILPSchedRegistry("gcn-max-ilp", "Run GCN scheduler to maximize ilp",
517 createGCNMaxILPMachineScheduler);
518
519 static MachineSchedRegistry IterativeGCNMaxOccupancySchedRegistry(
520 "gcn-iterative-max-occupancy-experimental",
521 "Run GCN scheduler to maximize occupancy (experimental)",
522 createIterativeGCNMaxOccupancyMachineScheduler);
523
524 static MachineSchedRegistry GCNMinRegSchedRegistry(
525 "gcn-iterative-minreg",
526 "Run GCN iterative scheduler for minimal register usage (experimental)",
527 createMinRegScheduler);
528
529 static MachineSchedRegistry GCNILPSchedRegistry(
530 "gcn-iterative-ilp",
531 "Run GCN iterative scheduler for ILP scheduling (experimental)",
532 createIterativeILPMachineScheduler);
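// Sketch of how these registries are exercised (assuming the standard
// -misched flag that MachineSchedRegistry entries plug into), e.g.:
//   llc -mtriple=amdgcn-amd-amdhsa -misched=gcn-iterative-ilp foo.ll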
533
534 static StringRef computeDataLayout(const Triple &TT) {
535 if (TT.getArch() == Triple::r600) {
536 // 32-bit pointers.
537 return "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
538 "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1";
539 }
540
541 // 32-bit private, local, and region pointers. 64-bit global, constant and
542 // flat. 160-bit non-integral fat buffer pointers that include a 128-bit
543 // buffer descriptor and a 32-bit offset, which are indexed by 32-bit values
544 // (address space 7), and 128-bit non-integral buffer resources (address
545 // space 8) which cannot be non-trivially accessed by LLVM memory operations
546 // like getelementptr.
547 return "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32"
548 "-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-"
549 "v32:32-v48:64-v96:"
550 "128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-"
551 "G1-ni:7:8:9";
552 }
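// For reference, a few components of the string above (restating the comment
// block that precedes it): "p5:32:32" gives 32-bit private pointers, "A5"
// selects address space 5 as the alloca address space, "G1" selects address
// space 1 as the default globals address space, and "ni:7:8:9" marks the
// buffer-related address spaces as non-integral.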
553
554 LLVM_READNONE
555 static StringRef getGPUOrDefault(const Triple &TT, StringRef GPU) {
556 if (!GPU.empty())
557 return GPU;
558
559 // Need to default to a target with flat support for HSA.
560 if (TT.getArch() == Triple::amdgcn)
561 return TT.getOS() == Triple::AMDHSA ? "generic-hsa" : "generic";
562
563 return "r600";
564 }
565
566 static Reloc::Model getEffectiveRelocModel(std::optional<Reloc::Model> RM) {
567 // The AMDGPU toolchain only supports generating shared objects, so we
568 // must always use PIC.
569 return Reloc::PIC_;
570 }
571
572 AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT,
573 StringRef CPU, StringRef FS,
574 TargetOptions Options,
575 std::optional<Reloc::Model> RM,
576 std::optional<CodeModel::Model> CM,
577 CodeGenOptLevel OptLevel)
578 : LLVMTargetMachine(T, computeDataLayout(TT), TT, getGPUOrDefault(TT, CPU),
579 FS, Options, getEffectiveRelocModel(RM),
580 getEffectiveCodeModel(CM, CodeModel::Small), OptLevel),
581 TLOF(createTLOF(getTargetTriple())) {
582 initAsmInfo();
583 if (TT.getArch() == Triple::amdgcn) {
584 if (getMCSubtargetInfo()->checkFeatures("+wavefrontsize64"))
585 MRI.reset(llvm::createGCNMCRegisterInfo(AMDGPUDwarfFlavour::Wave64));
586 else if (getMCSubtargetInfo()->checkFeatures("+wavefrontsize32"))
587 MRI.reset(llvm::createGCNMCRegisterInfo(AMDGPUDwarfFlavour::Wave32));
588 }
589 }
590
591 bool AMDGPUTargetMachine::EnableLateStructurizeCFG = false;
592 bool AMDGPUTargetMachine::EnableFunctionCalls = false;
593 bool AMDGPUTargetMachine::EnableLowerModuleLDS = true;
594
595 AMDGPUTargetMachine::~AMDGPUTargetMachine() = default;
596
597 StringRef AMDGPUTargetMachine::getGPUName(const Function &F) const {
598 Attribute GPUAttr = F.getFnAttribute("target-cpu");
599 return GPUAttr.isValid() ? GPUAttr.getValueAsString() : getTargetCPU();
600 }
601
602 StringRef AMDGPUTargetMachine::getFeatureString(const Function &F) const {
603 Attribute FSAttr = F.getFnAttribute("target-features");
604
605 return FSAttr.isValid() ? FSAttr.getValueAsString()
606 : getTargetFeatureString();
607 }
608
609 /// Predicate for Internalize pass.
610 static bool mustPreserveGV(const GlobalValue &GV) {
611 if (const Function *F = dyn_cast<Function>(&GV))
612 return F->isDeclaration() || F->getName().starts_with("__asan_") ||
613 F->getName().starts_with("__sanitizer_") ||
614 AMDGPU::isEntryFunctionCC(F->getCallingConv());
615
616 GV.removeDeadConstantUsers();
617 return !GV.use_empty();
618 }
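// Hypothetical example: under -amdgpu-internalize-symbols, a device-side
// helper function that is neither a kernel entry point nor an __asan_* /
// __sanitizer_* symbol is internalized by the InternalizePass registered
// below and, once it has no remaining uses after inlining, removed by
// GlobalDCE.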
619
620 void AMDGPUTargetMachine::registerDefaultAliasAnalyses(AAManager &AAM) {
621 AAM.registerFunctionAnalysis<AMDGPUAA>();
622 }
623
624 void AMDGPUTargetMachine::registerPassBuilderCallbacks(
625 PassBuilder &PB, bool PopulateClassToPassNames) {
626 PB.registerPipelineParsingCallback(
627 [this](StringRef PassName, ModulePassManager &PM,
628 ArrayRef<PassBuilder::PipelineElement>) {
629 if (PassName == "amdgpu-attributor") {
630 PM.addPass(AMDGPUAttributorPass(*this));
631 return true;
632 }
633 if (PassName == "amdgpu-unify-metadata") {
634 PM.addPass(AMDGPUUnifyMetadataPass());
635 return true;
636 }
637 if (PassName == "amdgpu-printf-runtime-binding") {
638 PM.addPass(AMDGPUPrintfRuntimeBindingPass());
639 return true;
640 }
641 if (PassName == "amdgpu-always-inline") {
642 PM.addPass(AMDGPUAlwaysInlinePass());
643 return true;
644 }
645 if (PassName == "amdgpu-lower-module-lds") {
646 PM.addPass(AMDGPULowerModuleLDSPass(*this));
647 return true;
648 }
649 if (PassName == "amdgpu-lower-ctor-dtor") {
650 PM.addPass(AMDGPUCtorDtorLoweringPass());
651 return true;
652 }
653 return false;
654 });
655 PB.registerPipelineParsingCallback(
656 [this](StringRef PassName, FunctionPassManager &PM,
657 ArrayRef<PassBuilder::PipelineElement>) {
658 if (PassName == "amdgpu-simplifylib") {
659 PM.addPass(AMDGPUSimplifyLibCallsPass());
660 return true;
661 }
662 if (PassName == "amdgpu-image-intrinsic-opt") {
663 PM.addPass(AMDGPUImageIntrinsicOptimizerPass(*this));
664 return true;
665 }
666 if (PassName == "amdgpu-usenative") {
667 PM.addPass(AMDGPUUseNativeCallsPass());
668 return true;
669 }
670 if (PassName == "amdgpu-promote-alloca") {
671 PM.addPass(AMDGPUPromoteAllocaPass(*this));
672 return true;
673 }
674 if (PassName == "amdgpu-promote-alloca-to-vector") {
675 PM.addPass(AMDGPUPromoteAllocaToVectorPass(*this));
676 return true;
677 }
678 if (PassName == "amdgpu-lower-kernel-attributes") {
679 PM.addPass(AMDGPULowerKernelAttributesPass());
680 return true;
681 }
682 if (PassName == "amdgpu-promote-kernel-arguments") {
683 PM.addPass(AMDGPUPromoteKernelArgumentsPass());
684 return true;
685 }
686 if (PassName == "amdgpu-unify-divergent-exit-nodes") {
687 PM.addPass(AMDGPUUnifyDivergentExitNodesPass());
688 return true;
689 }
690 if (PassName == "amdgpu-atomic-optimizer") {
691 PM.addPass(
692 AMDGPUAtomicOptimizerPass(*this, AMDGPUAtomicOptimizerStrategy));
693 return true;
694 }
695 if (PassName == "amdgpu-codegenprepare") {
696 PM.addPass(AMDGPUCodeGenPreparePass(*this));
697 return true;
698 }
699 if (PassName == "amdgpu-lower-kernel-arguments") {
700 PM.addPass(AMDGPULowerKernelArgumentsPass(*this));
701 return true;
702 }
703 if (PassName == "amdgpu-rewrite-undef-for-phi") {
704 PM.addPass(AMDGPURewriteUndefForPHIPass());
705 return true;
706 }
707 return false;
708 });
709
710 PB.registerAnalysisRegistrationCallback([](FunctionAnalysisManager &FAM) {
711 FAM.registerPass([&] { return AMDGPUAA(); });
712 });
713
714 PB.registerParseAACallback([](StringRef AAName, AAManager &AAM) {
715 if (AAName == "amdgpu-aa") {
716 AAM.registerFunctionAnalysis<AMDGPUAA>();
717 return true;
718 }
719 return false;
720 });
721
722 PB.registerPipelineStartEPCallback(
723 [](ModulePassManager &PM, OptimizationLevel Level) {
724 FunctionPassManager FPM;
725 FPM.addPass(AMDGPUUseNativeCallsPass());
726 if (EnableLibCallSimplify && Level != OptimizationLevel::O0)
727 FPM.addPass(AMDGPUSimplifyLibCallsPass());
728 PM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
729 if (EnableHipStdPar)
730 PM.addPass(HipStdParAcceleratorCodeSelectionPass());
731 });
732
733 PB.registerPipelineEarlySimplificationEPCallback(
734 [](ModulePassManager &PM, OptimizationLevel Level) {
735 PM.addPass(AMDGPUPrintfRuntimeBindingPass());
736
737 if (Level == OptimizationLevel::O0)
738 return;
739
740 PM.addPass(AMDGPUUnifyMetadataPass());
741
742 if (InternalizeSymbols) {
743 PM.addPass(InternalizePass(mustPreserveGV));
744 PM.addPass(GlobalDCEPass());
745 }
746
747 if (EarlyInlineAll && !EnableFunctionCalls)
748 PM.addPass(AMDGPUAlwaysInlinePass());
749 });
750
751 PB.registerCGSCCOptimizerLateEPCallback(
752 [this](CGSCCPassManager &PM, OptimizationLevel Level) {
753 if (Level == OptimizationLevel::O0)
754 return;
755
756 FunctionPassManager FPM;
757
758 // Add promote kernel arguments pass to the opt pipeline right before
759 // infer address spaces which is needed to do actual address space
760 // rewriting.
761 if (Level.getSpeedupLevel() > OptimizationLevel::O1.getSpeedupLevel() &&
762 EnablePromoteKernelArguments)
763 FPM.addPass(AMDGPUPromoteKernelArgumentsPass());
764
765 // Add infer address spaces pass to the opt pipeline after inlining
766 // but before SROA to increase SROA opportunities.
767 FPM.addPass(InferAddressSpacesPass());
768
769 // This should run after inlining to have any chance of doing
770 // anything, and before other cleanup optimizations.
771 FPM.addPass(AMDGPULowerKernelAttributesPass());
772
773 if (Level != OptimizationLevel::O0) {
774 // Promote alloca to vector before SROA and loop unroll. If we
775 // manage to eliminate allocas before unroll we may choose to unroll
776 // less.
777 FPM.addPass(AMDGPUPromoteAllocaToVectorPass(*this));
778 }
779
780 PM.addPass(createCGSCCToFunctionPassAdaptor(std::move(FPM)));
781 });
782 }
783
784 int64_t AMDGPUTargetMachine::getNullPointerValue(unsigned AddrSpace) {
785 return (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
786 AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ||
787 AddrSpace == AMDGPUAS::REGION_ADDRESS)
788 ? -1
789 : 0;
790 }
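// For example, assuming the usual AMDGPU address-space numbering, a null
// pointer in LDS (local) or scratch (private) memory is encoded as all-ones
// (-1), while a null flat or global pointer is encoded as 0.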
791
792 bool AMDGPUTargetMachine::isNoopAddrSpaceCast(unsigned SrcAS,
793 unsigned DestAS) const {
794 return AMDGPU::isFlatGlobalAddrSpace(SrcAS) &&
795 AMDGPU::isFlatGlobalAddrSpace(DestAS);
796 }
797
798 unsigned AMDGPUTargetMachine::getAssumedAddrSpace(const Value *V) const {
799 const auto *LD = dyn_cast<LoadInst>(V);
800 if (!LD)
801 return AMDGPUAS::UNKNOWN_ADDRESS_SPACE;
802
803 // The loaded value must be a generic (flat) pointer.
804 assert(V->getType()->isPointerTy() &&
805 V->getType()->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS);
806
807 const auto *Ptr = LD->getPointerOperand();
808 if (Ptr->getType()->getPointerAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS)
809 return AMDGPUAS::UNKNOWN_ADDRESS_SPACE;
810 // A generic pointer loaded from constant memory can be assumed to be a
811 // global pointer, since constant memory is only populated on the host side.
812 // As implied by the offload programming model, only global pointers can be
813 // referenced on the host side.
814 return AMDGPUAS::GLOBAL_ADDRESS;
815 }
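// Illustrative IR (assuming addrspace(4) is the constant address space):
//   %p = load ptr, ptr addrspace(4) %arg
// %p is a flat pointer loaded from constant memory, so this hook lets
// InferAddressSpaces treat it as a global (addrspace(1)) pointer.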
816
817 std::pair<const Value *, unsigned>
818 AMDGPUTargetMachine::getPredicatedAddrSpace(const Value *V) const {
819 if (auto *II = dyn_cast<IntrinsicInst>(V)) {
820 switch (II->getIntrinsicID()) {
821 case Intrinsic::amdgcn_is_shared:
822 return std::pair(II->getArgOperand(0), AMDGPUAS::LOCAL_ADDRESS);
823 case Intrinsic::amdgcn_is_private:
824 return std::pair(II->getArgOperand(0), AMDGPUAS::PRIVATE_ADDRESS);
825 default:
826 break;
827 }
828 return std::pair(nullptr, -1);
829 }
830 // Check the global pointer predication based on
831 // (!is_shared(p) && !is_private(p)). Note that logical 'and' is commutative and
832 // the order of 'is_shared' and 'is_private' is not significant.
833 Value *Ptr;
834 if (match(
835 const_cast<Value *>(V),
836 m_c_And(m_Not(m_Intrinsic<Intrinsic::amdgcn_is_shared>(m_Value(Ptr))),
837 m_Not(m_Intrinsic<Intrinsic::amdgcn_is_private>(
838 m_Deferred(Ptr))))))
839 return std::pair(Ptr, AMDGPUAS::GLOBAL_ADDRESS);
840
841 return std::pair(nullptr, -1);
842 }
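// Illustrative IR shape matched above (hand-written, not from a test):
//   %s  = call i1 @llvm.amdgcn.is.shared(ptr %p)
//   %v  = call i1 @llvm.amdgcn.is.private(ptr %p)
//   %ns = xor i1 %s, true
//   %nv = xor i1 %v, true
//   %c  = and i1 %ns, %nv
// Under %c, %p may be treated as a global pointer.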
843
844 unsigned
845 AMDGPUTargetMachine::getAddressSpaceForPseudoSourceKind(unsigned Kind) const {
846 switch (Kind) {
847 case PseudoSourceValue::Stack:
848 case PseudoSourceValue::FixedStack:
849 return AMDGPUAS::PRIVATE_ADDRESS;
850 case PseudoSourceValue::ConstantPool:
851 case PseudoSourceValue::GOT:
852 case PseudoSourceValue::JumpTable:
853 case PseudoSourceValue::GlobalValueCallEntry:
854 case PseudoSourceValue::ExternalSymbolCallEntry:
855 return AMDGPUAS::CONSTANT_ADDRESS;
856 }
857 return AMDGPUAS::FLAT_ADDRESS;
858 }
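// In effect: spill and fixed stack slots are treated as private (scratch)
// accesses, constant pools / jump tables / GOT / call-entry stubs as constant
// address space accesses, and anything else conservatively as flat.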
859
860 //===----------------------------------------------------------------------===//
861 // GCN Target Machine (SI+)
862 //===----------------------------------------------------------------------===//
863
864 GCNTargetMachine::GCNTargetMachine(const Target &T, const Triple &TT,
865 StringRef CPU, StringRef FS,
866 TargetOptions Options,
867 std::optional<Reloc::Model> RM,
868 std::optional<CodeModel::Model> CM,
869 CodeGenOptLevel OL, bool JIT)
870 : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {}
871
872 const TargetSubtargetInfo *
873 GCNTargetMachine::getSubtargetImpl(const Function &F) const {
874 StringRef GPU = getGPUName(F);
875 StringRef FS = getFeatureString(F);
876
877 SmallString<128> SubtargetKey(GPU);
878 SubtargetKey.append(FS);
879
880 auto &I = SubtargetMap[SubtargetKey];
881 if (!I) {
882 // This needs to be done before we create a new subtarget since any
883 // creation will depend on the TM and the code generation flags on the
884 // function that reside in TargetOptions.
885 resetTargetOptions(F);
886 I = std::make_unique<GCNSubtarget>(TargetTriple, GPU, FS, *this);
887 }
888
889 I->setScalarizeGlobalBehavior(ScalarizeGlobal);
890
891 return I.get();
892 }
893
894 TargetTransformInfo
895 GCNTargetMachine::getTargetTransformInfo(const Function &F) const {
896 return TargetTransformInfo(GCNTTIImpl(this, F));
897 }
898
899 //===----------------------------------------------------------------------===//
900 // AMDGPU Pass Setup
901 //===----------------------------------------------------------------------===//
902
903 std::unique_ptr<CSEConfigBase> llvm::AMDGPUPassConfig::getCSEConfig() const {
904 return getStandardCSEConfigForOpt(TM->getOptLevel());
905 }
906
907 namespace {
908
909 class GCNPassConfig final : public AMDGPUPassConfig {
910 public:
911 GCNPassConfig(LLVMTargetMachine &TM, PassManagerBase &PM)
912 : AMDGPUPassConfig(TM, PM) {
913 // It is necessary to know the register usage of the entire call graph. We
914 // allow calls without EnableAMDGPUFunctionCalls if they are marked
915 // noinline, so this is always required.
916 setRequiresCodeGenSCCOrder(true);
917 substitutePass(&PostRASchedulerID, &PostMachineSchedulerID);
918 }
919
920 GCNTargetMachine &getGCNTargetMachine() const {
921 return getTM<GCNTargetMachine>();
922 }
923
924 ScheduleDAGInstrs *
925 createMachineScheduler(MachineSchedContext *C) const override;
926
927 ScheduleDAGInstrs *
928 createPostMachineScheduler(MachineSchedContext *C) const override {
929 ScheduleDAGMI *DAG = new GCNPostScheduleDAGMILive(
930 C, std::make_unique<PostGenericScheduler>(C),
931 /*RemoveKillFlags=*/true);
932 const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
933 DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
934 if (ST.shouldClusterStores())
935 DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
936 DAG->addMutation(ST.createFillMFMAShadowMutation(DAG->TII));
937 DAG->addMutation(createIGroupLPDAGMutation(/*IsPostRA=*/true));
938 if (isPassEnabled(EnableVOPD, CodeGenOptLevel::Less))
939 DAG->addMutation(createVOPDPairingMutation());
940 return DAG;
941 }
942
943 bool addPreISel() override;
944 void addMachineSSAOptimization() override;
945 bool addILPOpts() override;
946 bool addInstSelector() override;
947 bool addIRTranslator() override;
948 void addPreLegalizeMachineIR() override;
949 bool addLegalizeMachineIR() override;
950 void addPreRegBankSelect() override;
951 bool addRegBankSelect() override;
952 void addPreGlobalInstructionSelect() override;
953 bool addGlobalInstructionSelect() override;
954 void addFastRegAlloc() override;
955 void addOptimizedRegAlloc() override;
956
957 FunctionPass *createSGPRAllocPass(bool Optimized);
958 FunctionPass *createVGPRAllocPass(bool Optimized);
959 FunctionPass *createRegAllocPass(bool Optimized) override;
960
961 bool addRegAssignAndRewriteFast() override;
962 bool addRegAssignAndRewriteOptimized() override;
963
964 void addPreRegAlloc() override;
965 bool addPreRewrite() override;
966 void addPostRegAlloc() override;
967 void addPreSched2() override;
968 void addPreEmitPass() override;
969 };
970
971 } // end anonymous namespace
972
973 AMDGPUPassConfig::AMDGPUPassConfig(LLVMTargetMachine &TM, PassManagerBase &PM)
974 : TargetPassConfig(TM, PM) {
975 // Exceptions and StackMaps are not supported, so these passes will never do
976 // anything.
977 disablePass(&StackMapLivenessID);
978 disablePass(&FuncletLayoutID);
979 // Garbage collection is not supported.
980 disablePass(&GCLoweringID);
981 disablePass(&ShadowStackGCLoweringID);
982 }
983
984 void AMDGPUPassConfig::addEarlyCSEOrGVNPass() {
985 if (getOptLevel() == CodeGenOptLevel::Aggressive)
986 addPass(createGVNPass());
987 else
988 addPass(createEarlyCSEPass());
989 }
990
991 void AMDGPUPassConfig::addStraightLineScalarOptimizationPasses() {
992 if (isPassEnabled(EnableLoopPrefetch, CodeGenOptLevel::Aggressive))
993 addPass(createLoopDataPrefetchPass());
994 addPass(createSeparateConstOffsetFromGEPPass());
995 // ReassociateGEPs exposes more opportunities for SLSR. See
996 // the example in reassociate-geps-and-slsr.ll.
997 addPass(createStraightLineStrengthReducePass());
998 // SeparateConstOffsetFromGEP and SLSR create common expressions which GVN or
999 // EarlyCSE can reuse.
1000 addEarlyCSEOrGVNPass();
1001 // Run NaryReassociate after EarlyCSE/GVN to be more effective.
1002 addPass(createNaryReassociatePass());
1003 // NaryReassociate on GEPs creates redundant common expressions, so run
1004 // EarlyCSE after it.
1005 addPass(createEarlyCSEPass());
1006 }
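// Rough illustration (not from a test) of what this sub-pipeline buys us:
// accesses like p[i + 1] and p[i + 2] are split by SeparateConstOffsetFromGEP
// into a shared "p + i" part plus constant offsets, which SLSR and GVN or
// EarlyCSE can then reuse instead of recomputing the address for each access.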
1007
1008 void AMDGPUPassConfig::addIRPasses() {
1009 const AMDGPUTargetMachine &TM = getAMDGPUTargetMachine();
1010
1011 Triple::ArchType Arch = TM.getTargetTriple().getArch();
1012 if (RemoveIncompatibleFunctions && Arch == Triple::amdgcn)
1013 addPass(createAMDGPURemoveIncompatibleFunctionsPass(&TM));
1014
1015 // There is no reason to run these.
1016 disablePass(&StackMapLivenessID);
1017 disablePass(&FuncletLayoutID);
1018 disablePass(&PatchableFunctionID);
1019
1020 addPass(createAMDGPUPrintfRuntimeBinding());
1021 if (LowerCtorDtor)
1022 addPass(createAMDGPUCtorDtorLoweringLegacyPass());
1023
1024 if (isPassEnabled(EnableImageIntrinsicOptimizer))
1025 addPass(createAMDGPUImageIntrinsicOptimizerPass(&TM));
1026
1027 // Function calls are not supported, so make sure we inline everything.
1028 addPass(createAMDGPUAlwaysInlinePass());
1029 addPass(createAlwaysInlinerLegacyPass());
1030
1031 // Handle uses of OpenCL image2d_t, image3d_t and sampler_t arguments.
1032 if (Arch == Triple::r600)
1033 addPass(createR600OpenCLImageTypeLoweringPass());
1034
1035 // Replace OpenCL enqueued block function pointers with global variables.
1036 addPass(createAMDGPUOpenCLEnqueuedBlockLoweringPass());
1037
1038 // Runs before PromoteAlloca so the latter can account for function uses
1039 if (EnableLowerModuleLDS) {
1040 addPass(createAMDGPULowerModuleLDSLegacyPass(&TM));
1041 }
1042
1043 // AMDGPUAttributor infers lack of llvm.amdgcn.lds.kernel.id calls, so run
1044 // after their introduction
1045 if (TM.getOptLevel() > CodeGenOptLevel::None)
1046 addPass(createAMDGPUAttributorLegacyPass());
1047
1048 if (TM.getOptLevel() > CodeGenOptLevel::None)
1049 addPass(createInferAddressSpacesPass());
1050
1051 // Run atomic optimizer before Atomic Expand
1052 if ((TM.getTargetTriple().getArch() == Triple::amdgcn) &&
1053 (TM.getOptLevel() >= CodeGenOptLevel::Less) &&
1054 (AMDGPUAtomicOptimizerStrategy != ScanOptions::None)) {
1055 addPass(createAMDGPUAtomicOptimizerPass(AMDGPUAtomicOptimizerStrategy));
1056 }
1057
1058 addPass(createAtomicExpandPass());
1059
1060 if (TM.getOptLevel() > CodeGenOptLevel::None) {
1061 addPass(createAMDGPUPromoteAlloca());
1062
1063 if (isPassEnabled(EnableScalarIRPasses))
1064 addStraightLineScalarOptimizationPasses();
1065
1066 if (EnableAMDGPUAliasAnalysis) {
1067 addPass(createAMDGPUAAWrapperPass());
1068 addPass(createExternalAAWrapperPass([](Pass &P, Function &,
1069 AAResults &AAR) {
1070 if (auto *WrapperPass = P.getAnalysisIfAvailable<AMDGPUAAWrapperPass>())
1071 AAR.addAAResult(WrapperPass->getResult());
1072 }));
1073 }
1074
1075 if (TM.getTargetTriple().getArch() == Triple::amdgcn) {
1076 // TODO: May want to move later or split into an early and late one.
1077 addPass(createAMDGPUCodeGenPreparePass());
1078 }
1079
1080 // Try to hoist loop invariant parts of divisions AMDGPUCodeGenPrepare may
1081 // have expanded.
1082 if (TM.getOptLevel() > CodeGenOptLevel::Less)
1083 addPass(createLICMPass());
1084 }
1085
1086 TargetPassConfig::addIRPasses();
1087
1088 // EarlyCSE is not always strong enough to clean up what LSR produces. For
1089 // example, GVN can combine
1090 //
1091 // %0 = add %a, %b
1092 // %1 = add %b, %a
1093 //
1094 // and
1095 //
1096 // %0 = shl nsw %a, 2
1097 // %1 = shl %a, 2
1098 //
1099 // but EarlyCSE can do neither of them.
1100 if (isPassEnabled(EnableScalarIRPasses))
1101 addEarlyCSEOrGVNPass();
1102 }
1103
1104 void AMDGPUPassConfig::addCodeGenPrepare() {
1105 if (TM->getTargetTriple().getArch() == Triple::amdgcn) {
1106 // FIXME: This pass adds 2 hacky attributes that can be replaced with an
1107 // analysis, and should be removed.
1108 addPass(createAMDGPUAnnotateKernelFeaturesPass());
1109 }
1110
1111 if (TM->getTargetTriple().getArch() == Triple::amdgcn &&
1112 EnableLowerKernelArguments)
1113 addPass(createAMDGPULowerKernelArgumentsPass());
1114
1115 TargetPassConfig::addCodeGenPrepare();
1116
1117 if (isPassEnabled(EnableLoadStoreVectorizer))
1118 addPass(createLoadStoreVectorizerPass());
1119
1120 // LowerSwitch pass may introduce unreachable blocks that can
1121 // cause unexpected behavior for subsequent passes. Placing it
1122 // here seems better that these blocks would get cleaned up by
1123 // UnreachableBlockElim inserted next in the pass flow.
1124 addPass(createLowerSwitchPass());
1125 }
1126
1127 bool AMDGPUPassConfig::addPreISel() {
1128 if (TM->getOptLevel() > CodeGenOptLevel::None)
1129 addPass(createFlattenCFGPass());
1130 return false;
1131 }
1132
1133 bool AMDGPUPassConfig::addInstSelector() {
1134 addPass(createAMDGPUISelDag(getAMDGPUTargetMachine(), getOptLevel()));
1135 return false;
1136 }
1137
1138 bool AMDGPUPassConfig::addGCPasses() {
1139 // Do nothing. GC is not supported.
1140 return false;
1141 }
1142
1143 llvm::ScheduleDAGInstrs *
1144 AMDGPUPassConfig::createMachineScheduler(MachineSchedContext *C) const {
1145 const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
1146 ScheduleDAGMILive *DAG = createGenericSchedLive(C);
1147 DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
1148 if (ST.shouldClusterStores())
1149 DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
1150 return DAG;
1151 }
1152
1153 MachineFunctionInfo *R600TargetMachine::createMachineFunctionInfo(
1154 BumpPtrAllocator &Allocator, const Function &F,
1155 const TargetSubtargetInfo *STI) const {
1156 return R600MachineFunctionInfo::create<R600MachineFunctionInfo>(
1157 Allocator, F, static_cast<const R600Subtarget *>(STI));
1158 }
1159
1160 //===----------------------------------------------------------------------===//
1161 // GCN Pass Setup
1162 //===----------------------------------------------------------------------===//
1163
1164 ScheduleDAGInstrs *GCNPassConfig::createMachineScheduler(
1165 MachineSchedContext *C) const {
1166 const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
1167 if (ST.enableSIScheduler())
1168 return createSIMachineScheduler(C);
1169
1170 if (EnableMaxIlpSchedStrategy)
1171 return createGCNMaxILPMachineScheduler(C);
1172
1173 return createGCNMaxOccupancyMachineScheduler(C);
1174 }
1175
1176 bool GCNPassConfig::addPreISel() {
1177 AMDGPUPassConfig::addPreISel();
1178
1179 if (TM->getOptLevel() > CodeGenOptLevel::None)
1180 addPass(createAMDGPULateCodeGenPreparePass());
1181
1182 if (TM->getOptLevel() > CodeGenOptLevel::None)
1183 addPass(createSinkingPass());
1184
1185 // Merge divergent exit nodes. StructurizeCFG won't recognize the multi-exit
1186 // regions formed by them.
1187 addPass(&AMDGPUUnifyDivergentExitNodesID);
1188 if (!LateCFGStructurize) {
1189 if (EnableStructurizerWorkarounds) {
1190 addPass(createFixIrreduciblePass());
1191 addPass(createUnifyLoopExitsPass());
1192 }
1193 addPass(createStructurizeCFGPass(false)); // true -> SkipUniformRegions
1194 }
1195 addPass(createAMDGPUAnnotateUniformValues());
1196 if (!LateCFGStructurize) {
1197 addPass(createSIAnnotateControlFlowPass());
1198 // TODO: Move this right after structurizeCFG to avoid extra divergence
1199 // analysis. This depends on stopping SIAnnotateControlFlow from making
1200 // control flow modifications.
1201 addPass(createAMDGPURewriteUndefForPHILegacyPass());
1202 }
1203 addPass(createLCSSAPass());
1204
1205 if (TM->getOptLevel() > CodeGenOptLevel::Less)
1206 addPass(&AMDGPUPerfHintAnalysisID);
1207
1208 return false;
1209 }
1210
1211 void GCNPassConfig::addMachineSSAOptimization() {
1212 TargetPassConfig::addMachineSSAOptimization();
1213
1214 // We want to fold operands after PeepholeOptimizer has run (or as part of
1215 // it), because it will eliminate extra copies making it easier to fold the
1216 // real source operand. We want to eliminate dead instructions after, so that
1217 // we see fewer uses of the copies. We then need to clean up the dead
1218 // instructions leftover after the operands are folded as well.
1219 //
1220 // XXX - Can we get away without running DeadMachineInstructionElim again?
1221 addPass(&SIFoldOperandsID);
1222 if (EnableDPPCombine)
1223 addPass(&GCNDPPCombineID);
1224 addPass(&SILoadStoreOptimizerID);
1225 if (isPassEnabled(EnableSDWAPeephole)) {
1226 addPass(&SIPeepholeSDWAID);
1227 addPass(&EarlyMachineLICMID);
1228 addPass(&MachineCSEID);
1229 addPass(&SIFoldOperandsID);
1230 }
1231 addPass(&DeadMachineInstructionElimID);
1232 addPass(createSIShrinkInstructionsPass());
1233 }
1234
1235 bool GCNPassConfig::addILPOpts() {
1236 if (EnableEarlyIfConversion)
1237 addPass(&EarlyIfConverterID);
1238
1239 TargetPassConfig::addILPOpts();
1240 return false;
1241 }
1242
1243 bool GCNPassConfig::addInstSelector() {
1244 AMDGPUPassConfig::addInstSelector();
1245 addPass(&SIFixSGPRCopiesID);
1246 addPass(createSILowerI1CopiesPass());
1247 return false;
1248 }
1249
1250 bool GCNPassConfig::addIRTranslator() {
1251 addPass(new IRTranslator(getOptLevel()));
1252 return false;
1253 }
1254
1255 void GCNPassConfig::addPreLegalizeMachineIR() {
1256 bool IsOptNone = getOptLevel() == CodeGenOptLevel::None;
1257 addPass(createAMDGPUPreLegalizeCombiner(IsOptNone));
1258 addPass(new Localizer());
1259 }
1260
1261 bool GCNPassConfig::addLegalizeMachineIR() {
1262 addPass(new Legalizer());
1263 return false;
1264 }
1265
1266 void GCNPassConfig::addPreRegBankSelect() {
1267 bool IsOptNone = getOptLevel() == CodeGenOptLevel::None;
1268 addPass(createAMDGPUPostLegalizeCombiner(IsOptNone));
1269 addPass(createAMDGPUGlobalISelDivergenceLoweringPass());
1270 }
1271
1272 bool GCNPassConfig::addRegBankSelect() {
1273 addPass(new AMDGPURegBankSelect());
1274 return false;
1275 }
1276
1277 void GCNPassConfig::addPreGlobalInstructionSelect() {
1278 bool IsOptNone = getOptLevel() == CodeGenOptLevel::None;
1279 addPass(createAMDGPURegBankCombiner(IsOptNone));
1280 }
1281
1282 bool GCNPassConfig::addGlobalInstructionSelect() {
1283 addPass(new InstructionSelect(getOptLevel()));
1284 return false;
1285 }
1286
1287 void GCNPassConfig::addPreRegAlloc() {
1288 if (LateCFGStructurize) {
1289 addPass(createAMDGPUMachineCFGStructurizerPass());
1290 }
1291 }
1292
1293 void GCNPassConfig::addFastRegAlloc() {
1294 // FIXME: We have to disable the verifier here because of PHIElimination +
1295 // TwoAddressInstructions disabling it.
1296
1297 // This must be run immediately after phi elimination and before
1298 // TwoAddressInstructions, otherwise the processing of the tied operand of
1299 // SI_ELSE will introduce a copy of the tied operand source after the else.
1300 insertPass(&PHIEliminationID, &SILowerControlFlowID);
1301
1302 insertPass(&TwoAddressInstructionPassID, &SIWholeQuadModeID);
1303
1304 TargetPassConfig::addFastRegAlloc();
1305 }
1306
1307 void GCNPassConfig::addOptimizedRegAlloc() {
1308 // Allow the scheduler to run before SIWholeQuadMode inserts exec manipulation
1309 // instructions that cause scheduling barriers.
1310 insertPass(&MachineSchedulerID, &SIWholeQuadModeID);
1311
1312 if (OptExecMaskPreRA)
1313 insertPass(&MachineSchedulerID, &SIOptimizeExecMaskingPreRAID);
1314
1315 if (EnableRewritePartialRegUses)
1316 insertPass(&RenameIndependentSubregsID, &GCNRewritePartialRegUsesID);
1317
1318 if (isPassEnabled(EnablePreRAOptimizations))
1319 insertPass(&RenameIndependentSubregsID, &GCNPreRAOptimizationsID);
1320
1321 // This is not an essential optimization and it has a noticeable impact on
1322 // compilation time, so we only enable it from O2.
1323 if (TM->getOptLevel() > CodeGenOptLevel::Less)
1324 insertPass(&MachineSchedulerID, &SIFormMemoryClausesID);
1325
1326 // FIXME: when an instruction has a killed operand and the instruction is
1327 // inside a bundle, it seems that only the BUNDLE instruction appears as the
1328 // kill of the register in LiveVariables. This triggers a verifier failure;
1329 // we should fix it and enable the verifier.
1330 if (OptVGPRLiveRange)
1331 insertPass(&LiveVariablesID, &SIOptimizeVGPRLiveRangeID);
1332 // This must be run immediately after phi elimination and before
1333 // TwoAddressInstructions, otherwise the processing of the tied operand of
1334 // SI_ELSE will introduce a copy of the tied operand source after the else.
1335 insertPass(&PHIEliminationID, &SILowerControlFlowID);
1336
1337 if (EnableDCEInRA)
1338 insertPass(&DetectDeadLanesID, &DeadMachineInstructionElimID);
1339
1340 TargetPassConfig::addOptimizedRegAlloc();
1341 }
1342
1343 bool GCNPassConfig::addPreRewrite() {
1344 addPass(&SILowerWWMCopiesID);
1345 if (EnableRegReassign)
1346 addPass(&GCNNSAReassignID);
1347 return true;
1348 }
1349
1350 FunctionPass *GCNPassConfig::createSGPRAllocPass(bool Optimized) {
1351 // Initialize the global default.
1352 llvm::call_once(InitializeDefaultSGPRRegisterAllocatorFlag,
1353 initializeDefaultSGPRRegisterAllocatorOnce);
1354
1355 RegisterRegAlloc::FunctionPassCtor Ctor = SGPRRegisterRegAlloc::getDefault();
1356 if (Ctor != useDefaultRegisterAllocator)
1357 return Ctor();
1358
1359 if (Optimized)
1360 return createGreedyRegisterAllocator(onlyAllocateSGPRs);
1361
1362 return createFastRegisterAllocator(onlyAllocateSGPRs, false);
1363 }
1364
1365 FunctionPass *GCNPassConfig::createVGPRAllocPass(bool Optimized) {
1366 // Initialize the global default.
1367 llvm::call_once(InitializeDefaultVGPRRegisterAllocatorFlag,
1368 initializeDefaultVGPRRegisterAllocatorOnce);
1369
1370 RegisterRegAlloc::FunctionPassCtor Ctor = VGPRRegisterRegAlloc::getDefault();
1371 if (Ctor != useDefaultRegisterAllocator)
1372 return Ctor();
1373
1374 if (Optimized)
1375 return createGreedyVGPRRegisterAllocator();
1376
1377 return createFastVGPRRegisterAllocator();
1378 }
1379
1380 FunctionPass *GCNPassConfig::createRegAllocPass(bool Optimized) {
1381 llvm_unreachable("should not be used");
1382 }
1383
1384 static const char RegAllocOptNotSupportedMessage[] =
1385 "-regalloc not supported with amdgcn. Use -sgpr-regalloc and -vgpr-regalloc";
1386
1387 bool GCNPassConfig::addRegAssignAndRewriteFast() {
1388 if (!usingDefaultRegAlloc())
1389 report_fatal_error(RegAllocOptNotSupportedMessage);
1390
1391 addPass(&GCNPreRALongBranchRegID);
1392
1393 addPass(createSGPRAllocPass(false));
1394
1395 // Equivalent of PEI for SGPRs.
1396 addPass(&SILowerSGPRSpillsID);
1397 addPass(&SIPreAllocateWWMRegsID);
1398
1399 addPass(createVGPRAllocPass(false));
1400
1401 addPass(&SILowerWWMCopiesID);
1402 return true;
1403 }
1404
1405 bool GCNPassConfig::addRegAssignAndRewriteOptimized() {
1406 if (!usingDefaultRegAlloc())
1407 report_fatal_error(RegAllocOptNotSupportedMessage);
1408
1409 addPass(&GCNPreRALongBranchRegID);
1410
1411 addPass(createSGPRAllocPass(true));
1412
1413 // Commit allocated register changes. This is mostly necessary because too
1414 // many things rely on the use lists of the physical registers, such as the
1415 // verifier. This is only necessary with allocators which use LiveIntervals,
1416 // since FastRegAlloc does the replacements itself.
1417 addPass(createVirtRegRewriter(false));
1418
1419 // Equivalent of PEI for SGPRs.
1420 addPass(&SILowerSGPRSpillsID);
1421 addPass(&SIPreAllocateWWMRegsID);
1422
1423 addPass(createVGPRAllocPass(true));
1424
1425 addPreRewrite();
1426 addPass(&VirtRegRewriterID);
1427
1428 addPass(&AMDGPUMarkLastScratchLoadID);
1429
1430 return true;
1431 }
1432
1433 void GCNPassConfig::addPostRegAlloc() {
1434 addPass(&SIFixVGPRCopiesID);
1435 if (getOptLevel() > CodeGenOptLevel::None)
1436 addPass(&SIOptimizeExecMaskingID);
1437 TargetPassConfig::addPostRegAlloc();
1438 }
1439
1440 void GCNPassConfig::addPreSched2() {
1441 if (TM->getOptLevel() > CodeGenOptLevel::None)
1442 addPass(createSIShrinkInstructionsPass());
1443 addPass(&SIPostRABundlerID);
1444 }
1445
1446 void GCNPassConfig::addPreEmitPass() {
1447 if (isPassEnabled(EnableVOPD, CodeGenOptLevel::Less))
1448 addPass(&GCNCreateVOPDID);
1449 addPass(createSIMemoryLegalizerPass());
1450 addPass(createSIInsertWaitcntsPass());
1451
1452 addPass(createSIModeRegisterPass());
1453
1454 if (getOptLevel() > CodeGenOptLevel::None)
1455 addPass(&SIInsertHardClausesID);
1456
1457 addPass(&SILateBranchLoweringPassID);
1458 if (isPassEnabled(EnableSetWavePriority, CodeGenOptLevel::Less))
1459 addPass(createAMDGPUSetWavePriorityPass());
1460 if (getOptLevel() > CodeGenOptLevel::None)
1461 addPass(&SIPreEmitPeepholeID);
  // The hazard recognizer that runs as part of the post-ra scheduler does not
  // guarantee to be able to handle all hazards correctly. This is because if
  // there are multiple scheduling regions in a basic block, the regions are
  // scheduled bottom up, so when we begin to schedule a region we don't know
  // what instructions were emitted directly before it.
  //
  // Here we add a stand-alone hazard recognizer pass which can handle all
  // cases.
  addPass(&PostRAHazardRecognizerID);

  if (isPassEnabled(EnableInsertSingleUseVDST, CodeGenOptLevel::Less))
    addPass(&AMDGPUInsertSingleUseVDSTID);

  if (isPassEnabled(EnableInsertDelayAlu, CodeGenOptLevel::Less))
    addPass(&AMDGPUInsertDelayAluID);

  addPass(&BranchRelaxationPassID);
}

TargetPassConfig *GCNTargetMachine::createPassConfig(PassManagerBase &PM) {
  return new GCNPassConfig(*this, PM);
}

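// Register SIMachineFunctionInfo as a MachineRegisterInfo delegate so it is
// notified of virtual register bookkeeping events for this function.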
void GCNTargetMachine::registerMachineRegisterInfoCallback(
    MachineFunction &MF) const {
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  MF.getRegInfo().addDelegate(MFI);
}

MachineFunctionInfo *GCNTargetMachine::createMachineFunctionInfo(
    BumpPtrAllocator &Allocator, const Function &F,
    const TargetSubtargetInfo *STI) const {
  return SIMachineFunctionInfo::create<SIMachineFunctionInfo>(
      Allocator, F, static_cast<const GCNSubtarget *>(STI));
}

yaml::MachineFunctionInfo *GCNTargetMachine::createDefaultFuncInfoYAML() const {
  return new yaml::SIMachineFunctionInfo();
}

yaml::MachineFunctionInfo *
GCNTargetMachine::convertFuncInfoToYAML(const MachineFunction &MF) const {
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return new yaml::SIMachineFunctionInfo(
      *MFI, *MF.getSubtarget<GCNSubtarget>().getRegisterInfo(), MF);
}

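// Deserialize the target-specific machineFunctionInfo block of a .mir file
// into SIMachineFunctionInfo. As an illustrative sketch only (the exact keys
// are defined by yaml::SIMachineFunctionInfo), the input looks like:
//
//   machineFunctionInfo:
//     scratchRSrcReg:    '$sgpr0_sgpr1_sgpr2_sgpr3'
//     frameOffsetReg:    '$sgpr33'
//     stackPtrOffsetReg: '$sgpr32'
//     argumentInfo:
//       privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
//       kernargSegmentPtr:    { reg: '$sgpr4_sgpr5' }
//       workItemIDX:          { reg: '$vgpr0' }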
bool GCNTargetMachine::parseMachineFunctionInfo(
    const yaml::MachineFunctionInfo &MFI_, PerFunctionMIParsingState &PFS,
    SMDiagnostic &Error, SMRange &SourceRange) const {
  const yaml::SIMachineFunctionInfo &YamlMFI =
      static_cast<const yaml::SIMachineFunctionInfo &>(MFI_);
  MachineFunction &MF = PFS.MF;
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();

  if (MFI->initializeBaseYamlFields(YamlMFI, MF, PFS, Error, SourceRange))
    return true;

  if (MFI->Occupancy == 0) {
    // Fix up the subtarget-dependent default value.
    MFI->Occupancy = ST.computeOccupancy(MF.getFunction(), MFI->getLDSSize());
  }

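  // Helpers for translating named register references from the YAML into
  // physical registers; on failure they record the offending source range so
  // the MIR parser can emit a located diagnostic.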
  auto parseRegister = [&](const yaml::StringValue &RegName, Register &RegVal) {
    Register TempReg;
    if (parseNamedRegisterReference(PFS, TempReg, RegName.Value, Error)) {
      SourceRange = RegName.SourceRange;
      return true;
    }
    RegVal = TempReg;

    return false;
  };

  auto parseOptionalRegister = [&](const yaml::StringValue &RegName,
                                   Register &RegVal) {
    return !RegName.Value.empty() && parseRegister(RegName, RegVal);
  };

  if (parseOptionalRegister(YamlMFI.VGPRForAGPRCopy, MFI->VGPRForAGPRCopy))
    return true;

  if (parseOptionalRegister(YamlMFI.SGPRForEXECCopy, MFI->SGPRForEXECCopy))
    return true;

  if (parseOptionalRegister(YamlMFI.LongBranchReservedReg,
                            MFI->LongBranchReservedReg))
    return true;

  auto diagnoseRegisterClass = [&](const yaml::StringValue &RegName) {
    // Create a diagnostic for the register string literal.
    const MemoryBuffer &Buffer =
        *PFS.SM->getMemoryBuffer(PFS.SM->getMainFileID());
    Error = SMDiagnostic(*PFS.SM, SMLoc(), Buffer.getBufferIdentifier(), 1,
                         RegName.Value.size(), SourceMgr::DK_Error,
                         "incorrect register class for field", RegName.Value,
                         std::nullopt, std::nullopt);
    SourceRange = RegName.SourceRange;
    return true;
  };

  if (parseRegister(YamlMFI.ScratchRSrcReg, MFI->ScratchRSrcReg) ||
      parseRegister(YamlMFI.FrameOffsetReg, MFI->FrameOffsetReg) ||
      parseRegister(YamlMFI.StackPtrOffsetReg, MFI->StackPtrOffsetReg))
    return true;

  if (MFI->ScratchRSrcReg != AMDGPU::PRIVATE_RSRC_REG &&
      !AMDGPU::SGPR_128RegClass.contains(MFI->ScratchRSrcReg)) {
    return diagnoseRegisterClass(YamlMFI.ScratchRSrcReg);
  }

  if (MFI->FrameOffsetReg != AMDGPU::FP_REG &&
      !AMDGPU::SGPR_32RegClass.contains(MFI->FrameOffsetReg)) {
    return diagnoseRegisterClass(YamlMFI.FrameOffsetReg);
  }

  if (MFI->StackPtrOffsetReg != AMDGPU::SP_REG &&
      !AMDGPU::SGPR_32RegClass.contains(MFI->StackPtrOffsetReg)) {
    return diagnoseRegisterClass(YamlMFI.StackPtrOffsetReg);
  }

  for (const auto &YamlReg : YamlMFI.WWMReservedRegs) {
    Register ParsedReg;
    if (parseRegister(YamlReg, ParsedReg))
      return true;

    MFI->reserveWWMRegister(ParsedReg);
  }

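  // Parse a single optional ABI argument descriptor: either a named register
  // (validated against the expected register class) or a stack offset, with an
  // optional mask, while accumulating the user/system SGPR counts it consumes.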
  auto parseAndCheckArgument = [&](const std::optional<yaml::SIArgument> &A,
                                   const TargetRegisterClass &RC,
                                   ArgDescriptor &Arg, unsigned UserSGPRs,
                                   unsigned SystemSGPRs) {
    // Skip parsing if it's not present.
    if (!A)
      return false;

    if (A->IsRegister) {
      Register Reg;
      if (parseNamedRegisterReference(PFS, Reg, A->RegisterName.Value, Error)) {
        SourceRange = A->RegisterName.SourceRange;
        return true;
      }
      if (!RC.contains(Reg))
        return diagnoseRegisterClass(A->RegisterName);
      Arg = ArgDescriptor::createRegister(Reg);
    } else
      Arg = ArgDescriptor::createStack(A->StackOffset);
    // Check and apply the optional mask.
    if (A->Mask)
      Arg = ArgDescriptor::createArg(Arg, *A->Mask);

    MFI->NumUserSGPRs += UserSGPRs;
    MFI->NumSystemSGPRs += SystemSGPRs;
    return false;
  };

  if (YamlMFI.ArgInfo &&
      (parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentBuffer,
                             AMDGPU::SGPR_128RegClass,
                             MFI->ArgInfo.PrivateSegmentBuffer, 4, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->DispatchPtr,
                             AMDGPU::SReg_64RegClass, MFI->ArgInfo.DispatchPtr,
                             2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->QueuePtr, AMDGPU::SReg_64RegClass,
                             MFI->ArgInfo.QueuePtr, 2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->KernargSegmentPtr,
                             AMDGPU::SReg_64RegClass,
                             MFI->ArgInfo.KernargSegmentPtr, 2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->DispatchID,
                             AMDGPU::SReg_64RegClass, MFI->ArgInfo.DispatchID,
                             2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->FlatScratchInit,
                             AMDGPU::SReg_64RegClass,
                             MFI->ArgInfo.FlatScratchInit, 2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentSize,
                             AMDGPU::SGPR_32RegClass,
                             MFI->ArgInfo.PrivateSegmentSize, 0, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->LDSKernelId,
                             AMDGPU::SGPR_32RegClass,
                             MFI->ArgInfo.LDSKernelId, 0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDX,
                             AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDX,
                             0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDY,
                             AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDY,
                             0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDZ,
                             AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDZ,
                             0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupInfo,
                             AMDGPU::SGPR_32RegClass,
                             MFI->ArgInfo.WorkGroupInfo, 0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentWaveByteOffset,
                             AMDGPU::SGPR_32RegClass,
                             MFI->ArgInfo.PrivateSegmentWaveByteOffset, 0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->ImplicitArgPtr,
                             AMDGPU::SReg_64RegClass,
                             MFI->ArgInfo.ImplicitArgPtr, 0, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->ImplicitBufferPtr,
                             AMDGPU::SReg_64RegClass,
                             MFI->ArgInfo.ImplicitBufferPtr, 2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDX,
                             AMDGPU::VGPR_32RegClass,
                             MFI->ArgInfo.WorkItemIDX, 0, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDY,
                             AMDGPU::VGPR_32RegClass,
                             MFI->ArgInfo.WorkItemIDY, 0, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDZ,
                             AMDGPU::VGPR_32RegClass,
                             MFI->ArgInfo.WorkItemIDZ, 0, 0)))
    return true;

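  // Restore the serialized floating-point mode, only honoring the IEEE and
  // DX10-clamp bits on subtargets that actually have them.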
  if (ST.hasIEEEMode())
    MFI->Mode.IEEE = YamlMFI.Mode.IEEE;
  if (ST.hasDX10ClampMode())
    MFI->Mode.DX10Clamp = YamlMFI.Mode.DX10Clamp;

  // FIXME: Move proper support for denormal-fp-math into base MachineFunction
  MFI->Mode.FP32Denormals.Input = YamlMFI.Mode.FP32InputDenormals
                                      ? DenormalMode::IEEE
                                      : DenormalMode::PreserveSign;
  MFI->Mode.FP32Denormals.Output = YamlMFI.Mode.FP32OutputDenormals
                                       ? DenormalMode::IEEE
                                       : DenormalMode::PreserveSign;

  MFI->Mode.FP64FP16Denormals.Input = YamlMFI.Mode.FP64FP16InputDenormals
                                          ? DenormalMode::IEEE
                                          : DenormalMode::PreserveSign;
  MFI->Mode.FP64FP16Denormals.Output = YamlMFI.Mode.FP64FP16OutputDenormals
                                           ? DenormalMode::IEEE
                                           : DenormalMode::PreserveSign;

  return false;
}
