1 //===-- AMDGPUTargetMachine.cpp - TargetMachine for hw codegen targets-----===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// The AMDGPU target machine contains all of the hardware specific
11 /// information needed to emit code for SI+ GPUs.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 #include "AMDGPUTargetMachine.h"
16 #include "AMDGPU.h"
17 #include "AMDGPUAliasAnalysis.h"
18 #include "AMDGPUCtorDtorLowering.h"
19 #include "AMDGPUExportClustering.h"
20 #include "AMDGPUIGroupLP.h"
21 #include "AMDGPUMacroFusion.h"
22 #include "AMDGPURegBankSelect.h"
23 #include "AMDGPUTargetObjectFile.h"
24 #include "AMDGPUTargetTransformInfo.h"
25 #include "AMDGPUUnifyDivergentExitNodes.h"
26 #include "GCNIterativeScheduler.h"
27 #include "GCNSchedStrategy.h"
28 #include "GCNVOPDUtils.h"
29 #include "R600.h"
30 #include "R600MachineFunctionInfo.h"
31 #include "R600TargetMachine.h"
32 #include "SIMachineFunctionInfo.h"
33 #include "SIMachineScheduler.h"
34 #include "TargetInfo/AMDGPUTargetInfo.h"
35 #include "Utils/AMDGPUBaseInfo.h"
36 #include "llvm/Analysis/CGSCCPassManager.h"
37 #include "llvm/CodeGen/GlobalISel/CSEInfo.h"
38 #include "llvm/CodeGen/GlobalISel/IRTranslator.h"
39 #include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
40 #include "llvm/CodeGen/GlobalISel/Legalizer.h"
41 #include "llvm/CodeGen/GlobalISel/Localizer.h"
42 #include "llvm/CodeGen/GlobalISel/RegBankSelect.h"
43 #include "llvm/CodeGen/MIRParser/MIParser.h"
44 #include "llvm/CodeGen/Passes.h"
45 #include "llvm/CodeGen/RegAllocRegistry.h"
46 #include "llvm/CodeGen/TargetPassConfig.h"
47 #include "llvm/IR/IntrinsicsAMDGPU.h"
48 #include "llvm/IR/PassManager.h"
49 #include "llvm/IR/PatternMatch.h"
50 #include "llvm/InitializePasses.h"
51 #include "llvm/MC/TargetRegistry.h"
52 #include "llvm/Passes/PassBuilder.h"
53 #include "llvm/Transforms/HipStdPar/HipStdPar.h"
54 #include "llvm/Transforms/IPO.h"
55 #include "llvm/Transforms/IPO/AlwaysInliner.h"
56 #include "llvm/Transforms/IPO/GlobalDCE.h"
57 #include "llvm/Transforms/IPO/Internalize.h"
58 #include "llvm/Transforms/Scalar.h"
59 #include "llvm/Transforms/Scalar/GVN.h"
60 #include "llvm/Transforms/Scalar/InferAddressSpaces.h"
61 #include "llvm/Transforms/Utils.h"
62 #include "llvm/Transforms/Utils/SimplifyLibCalls.h"
63 #include "llvm/Transforms/Vectorize/LoadStoreVectorizer.h"
64 #include <optional>
65 
66 using namespace llvm;
67 using namespace llvm::PatternMatch;
68 
69 namespace {
70 class SGPRRegisterRegAlloc : public RegisterRegAllocBase<SGPRRegisterRegAlloc> {
71 public:
72   SGPRRegisterRegAlloc(const char *N, const char *D, FunctionPassCtor C)
73     : RegisterRegAllocBase(N, D, C) {}
74 };
75 
76 class VGPRRegisterRegAlloc : public RegisterRegAllocBase<VGPRRegisterRegAlloc> {
77 public:
78   VGPRRegisterRegAlloc(const char *N, const char *D, FunctionPassCtor C)
79     : RegisterRegAllocBase(N, D, C) {}
80 };
81 
82 static bool onlyAllocateSGPRs(const TargetRegisterInfo &TRI,
83                               const TargetRegisterClass &RC) {
84   return static_cast<const SIRegisterInfo &>(TRI).isSGPRClass(&RC);
85 }
86 
87 static bool onlyAllocateVGPRs(const TargetRegisterInfo &TRI,
88                               const TargetRegisterClass &RC) {
89   return !static_cast<const SIRegisterInfo &>(TRI).isSGPRClass(&RC);
90 }
91 
92 
93 /// -{sgpr|vgpr}-regalloc=... command line option.
94 static FunctionPass *useDefaultRegisterAllocator() { return nullptr; }
95 
96 /// A dummy default pass factory used to detect whether the register
97 /// allocator has been overridden on the command line.
98 static llvm::once_flag InitializeDefaultSGPRRegisterAllocatorFlag;
99 static llvm::once_flag InitializeDefaultVGPRRegisterAllocatorFlag;
100 
101 static SGPRRegisterRegAlloc
102 defaultSGPRRegAlloc("default",
103                     "pick SGPR register allocator based on -O option",
104                     useDefaultRegisterAllocator);
105 
106 static cl::opt<SGPRRegisterRegAlloc::FunctionPassCtor, false,
107                RegisterPassParser<SGPRRegisterRegAlloc>>
108 SGPRRegAlloc("sgpr-regalloc", cl::Hidden, cl::init(&useDefaultRegisterAllocator),
109              cl::desc("Register allocator to use for SGPRs"));
110 
111 static cl::opt<VGPRRegisterRegAlloc::FunctionPassCtor, false,
112                RegisterPassParser<VGPRRegisterRegAlloc>>
113 VGPRRegAlloc("vgpr-regalloc", cl::Hidden, cl::init(&useDefaultRegisterAllocator),
114              cl::desc("Register allocator to use for VGPRs"));
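// Illustrative (hypothetical) invocation: the two allocators can be chosen
// independently, e.g. "llc -mtriple=amdgcn -sgpr-regalloc=greedy
// -vgpr-regalloc=fast"; the "default" value defers to the -O level.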
115 
116 
117 static void initializeDefaultSGPRRegisterAllocatorOnce() {
118   RegisterRegAlloc::FunctionPassCtor Ctor = SGPRRegisterRegAlloc::getDefault();
119 
120   if (!Ctor) {
121     Ctor = SGPRRegAlloc;
122     SGPRRegisterRegAlloc::setDefault(SGPRRegAlloc);
123   }
124 }
125 
126 static void initializeDefaultVGPRRegisterAllocatorOnce() {
127   RegisterRegAlloc::FunctionPassCtor Ctor = VGPRRegisterRegAlloc::getDefault();
128 
129   if (!Ctor) {
130     Ctor = VGPRRegAlloc;
131     VGPRRegisterRegAlloc::setDefault(VGPRRegAlloc);
132   }
133 }
134 
135 static FunctionPass *createBasicSGPRRegisterAllocator() {
136   return createBasicRegisterAllocator(onlyAllocateSGPRs);
137 }
138 
139 static FunctionPass *createGreedySGPRRegisterAllocator() {
140   return createGreedyRegisterAllocator(onlyAllocateSGPRs);
141 }
142 
143 static FunctionPass *createFastSGPRRegisterAllocator() {
144   return createFastRegisterAllocator(onlyAllocateSGPRs, false);
145 }
146 
147 static FunctionPass *createBasicVGPRRegisterAllocator() {
148   return createBasicRegisterAllocator(onlyAllocateVGPRs);
149 }
150 
151 static FunctionPass *createGreedyVGPRRegisterAllocator() {
152   return createGreedyRegisterAllocator(onlyAllocateVGPRs);
153 }
154 
155 static FunctionPass *createFastVGPRRegisterAllocator() {
156   return createFastRegisterAllocator(onlyAllocateVGPRs, true);
157 }
158 
159 static SGPRRegisterRegAlloc basicRegAllocSGPR(
160   "basic", "basic register allocator", createBasicSGPRRegisterAllocator);
161 static SGPRRegisterRegAlloc greedyRegAllocSGPR(
162   "greedy", "greedy register allocator", createGreedySGPRRegisterAllocator);
163 
164 static SGPRRegisterRegAlloc fastRegAllocSGPR(
165   "fast", "fast register allocator", createFastSGPRRegisterAllocator);
166 
167 
168 static VGPRRegisterRegAlloc basicRegAllocVGPR(
169   "basic", "basic register allocator", createBasicVGPRRegisterAllocator);
170 static VGPRRegisterRegAlloc greedyRegAllocVGPR(
171   "greedy", "greedy register allocator", createGreedyVGPRRegisterAllocator);
172 
173 static VGPRRegisterRegAlloc fastRegAllocVGPR(
174   "fast", "fast register allocator", createFastVGPRRegisterAllocator);
175 }
176 
177 static cl::opt<bool>
178 EnableEarlyIfConversion("amdgpu-early-ifcvt", cl::Hidden,
179                         cl::desc("Run early if-conversion"),
180                         cl::init(false));
181 
182 static cl::opt<bool>
183 OptExecMaskPreRA("amdgpu-opt-exec-mask-pre-ra", cl::Hidden,
184             cl::desc("Run pre-RA exec mask optimizations"),
185             cl::init(true));
186 
187 static cl::opt<bool>
188     LowerCtorDtor("amdgpu-lower-global-ctor-dtor",
189                   cl::desc("Lower GPU ctor / dtors to globals on the device."),
190                   cl::init(true), cl::Hidden);
191 
192 // Option to disable vectorizer for tests.
193 static cl::opt<bool> EnableLoadStoreVectorizer(
194   "amdgpu-load-store-vectorizer",
195   cl::desc("Enable load store vectorizer"),
196   cl::init(true),
197   cl::Hidden);
198 
199 // Option to control scalarization of global loads
200 static cl::opt<bool> ScalarizeGlobal(
201   "amdgpu-scalarize-global-loads",
202   cl::desc("Enable global load scalarization"),
203   cl::init(true),
204   cl::Hidden);
205 
206 // Option to run internalize pass.
207 static cl::opt<bool> InternalizeSymbols(
208   "amdgpu-internalize-symbols",
209   cl::desc("Enable elimination of non-kernel functions and unused globals"),
210   cl::init(false),
211   cl::Hidden);
212 
213 // Option to inline all early.
214 static cl::opt<bool> EarlyInlineAll(
215   "amdgpu-early-inline-all",
216   cl::desc("Inline all functions early"),
217   cl::init(false),
218   cl::Hidden);
219 
220 static cl::opt<bool> RemoveIncompatibleFunctions(
221     "amdgpu-enable-remove-incompatible-functions", cl::Hidden,
222     cl::desc("Enable removal of functions when they "
223              "use features not supported by the target GPU"),
224     cl::init(true));
225 
226 static cl::opt<bool> EnableSDWAPeephole(
227   "amdgpu-sdwa-peephole",
228   cl::desc("Enable SDWA peepholer"),
229   cl::init(true));
230 
231 static cl::opt<bool> EnableDPPCombine(
232   "amdgpu-dpp-combine",
233   cl::desc("Enable DPP combiner"),
234   cl::init(true));
235 
236 // Enable address space based alias analysis
237 static cl::opt<bool> EnableAMDGPUAliasAnalysis("enable-amdgpu-aa", cl::Hidden,
238   cl::desc("Enable AMDGPU Alias Analysis"),
239   cl::init(true));
240 
241 // Option to run late CFG structurizer
242 static cl::opt<bool, true> LateCFGStructurize(
243   "amdgpu-late-structurize",
244   cl::desc("Enable late CFG structurization"),
245   cl::location(AMDGPUTargetMachine::EnableLateStructurizeCFG),
246   cl::Hidden);
247 
248 // Enable library call simplifications
249 static cl::opt<bool> EnableLibCallSimplify(
250   "amdgpu-simplify-libcall",
251   cl::desc("Enable amdgpu library simplifications"),
252   cl::init(true),
253   cl::Hidden);
254 
255 static cl::opt<bool> EnableLowerKernelArguments(
256   "amdgpu-ir-lower-kernel-arguments",
257   cl::desc("Lower kernel argument loads in IR pass"),
258   cl::init(true),
259   cl::Hidden);
260 
261 static cl::opt<bool> EnableRegReassign(
262   "amdgpu-reassign-regs",
263   cl::desc("Enable register reassign optimizations on gfx10+"),
264   cl::init(true),
265   cl::Hidden);
266 
267 static cl::opt<bool> OptVGPRLiveRange(
268     "amdgpu-opt-vgpr-liverange",
269     cl::desc("Enable VGPR liverange optimizations for if-else structure"),
270     cl::init(true), cl::Hidden);
271 
272 static cl::opt<ScanOptions> AMDGPUAtomicOptimizerStrategy(
273     "amdgpu-atomic-optimizer-strategy",
274     cl::desc("Select DPP or Iterative strategy for scan"),
275     cl::init(ScanOptions::Iterative),
276     cl::values(
277         clEnumValN(ScanOptions::DPP, "DPP", "Use DPP operations for scan"),
278         clEnumValN(ScanOptions::Iterative, "Iterative",
279                    "Use Iterative approach for scan"),
280         clEnumValN(ScanOptions::None, "None", "Disable atomic optimizer")));
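// For example (hypothetical invocation), the scan implementation can be forced
// with "-amdgpu-atomic-optimizer-strategy=DPP", while "None" disables the
// atomic optimizer entirely.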
281 
282 // Enable Mode register optimization
283 static cl::opt<bool> EnableSIModeRegisterPass(
284   "amdgpu-mode-register",
285   cl::desc("Enable mode register pass"),
286   cl::init(true),
287   cl::Hidden);
288 
289 // Enable GFX11.5+ s_singleuse_vdst insertion
290 static cl::opt<bool>
291     EnableInsertSingleUseVDST("amdgpu-enable-single-use-vdst",
292                               cl::desc("Enable s_singleuse_vdst insertion"),
293                               cl::init(false), cl::Hidden);
294 
295 // Enable GFX11+ s_delay_alu insertion
296 static cl::opt<bool>
297     EnableInsertDelayAlu("amdgpu-enable-delay-alu",
298                          cl::desc("Enable s_delay_alu insertion"),
299                          cl::init(true), cl::Hidden);
300 
301 // Enable GFX11+ VOPD
302 static cl::opt<bool>
303     EnableVOPD("amdgpu-enable-vopd",
304                cl::desc("Enable VOPD, dual issue of VALU in wave32"),
305                cl::init(true), cl::Hidden);
306 
307 // Used in lit tests to prevent dead-code elimination of the inspected patterns.
308 static cl::opt<bool>
309 EnableDCEInRA("amdgpu-dce-in-ra",
310     cl::init(true), cl::Hidden,
311     cl::desc("Enable machine DCE inside regalloc"));
312 
313 static cl::opt<bool> EnableSetWavePriority("amdgpu-set-wave-priority",
314                                            cl::desc("Adjust wave priority"),
315                                            cl::init(false), cl::Hidden);
316 
317 static cl::opt<bool> EnableScalarIRPasses(
318   "amdgpu-scalar-ir-passes",
319   cl::desc("Enable scalar IR passes"),
320   cl::init(true),
321   cl::Hidden);
322 
323 static cl::opt<bool> EnableStructurizerWorkarounds(
324     "amdgpu-enable-structurizer-workarounds",
325     cl::desc("Enable workarounds for the StructurizeCFG pass"), cl::init(true),
326     cl::Hidden);
327 
328 static cl::opt<bool, true> EnableLowerModuleLDS(
329     "amdgpu-enable-lower-module-lds", cl::desc("Enable lower module lds pass"),
330     cl::location(AMDGPUTargetMachine::EnableLowerModuleLDS), cl::init(true),
331     cl::Hidden);
332 
333 static cl::opt<bool> EnablePreRAOptimizations(
334     "amdgpu-enable-pre-ra-optimizations",
335     cl::desc("Enable Pre-RA optimizations pass"), cl::init(true),
336     cl::Hidden);
337 
338 static cl::opt<bool> EnablePromoteKernelArguments(
339     "amdgpu-enable-promote-kernel-arguments",
340     cl::desc("Enable promotion of flat kernel pointer arguments to global"),
341     cl::Hidden, cl::init(true));
342 
343 static cl::opt<bool> EnableImageIntrinsicOptimizer(
344     "amdgpu-enable-image-intrinsic-optimizer",
345     cl::desc("Enable image intrinsic optimizer pass"), cl::init(true),
346     cl::Hidden);
347 
348 static cl::opt<bool>
349     EnableLoopPrefetch("amdgpu-loop-prefetch",
350                        cl::desc("Enable loop data prefetch on AMDGPU"),
351                        cl::Hidden, cl::init(false));
352 
353 static cl::opt<bool> EnableMaxIlpSchedStrategy(
354     "amdgpu-enable-max-ilp-scheduling-strategy",
355     cl::desc("Enable scheduling strategy to maximize ILP for a single wave."),
356     cl::Hidden, cl::init(false));
357 
358 static cl::opt<bool> EnableRewritePartialRegUses(
359     "amdgpu-enable-rewrite-partial-reg-uses",
360     cl::desc("Enable rewrite partial reg uses pass"), cl::init(true),
361     cl::Hidden);
362 
363 static cl::opt<bool> EnableHipStdPar(
364   "amdgpu-enable-hipstdpar",
365   cl::desc("Enable HIP Standard Parallelism Offload support"), cl::init(false),
366   cl::Hidden);
367 
368 extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
369   // Register the target
370   RegisterTargetMachine<R600TargetMachine> X(getTheR600Target());
371   RegisterTargetMachine<GCNTargetMachine> Y(getTheGCNTarget());
372 
373   PassRegistry *PR = PassRegistry::getPassRegistry();
374   initializeR600ClauseMergePassPass(*PR);
375   initializeR600ControlFlowFinalizerPass(*PR);
376   initializeR600PacketizerPass(*PR);
377   initializeR600ExpandSpecialInstrsPassPass(*PR);
378   initializeR600VectorRegMergerPass(*PR);
379   initializeGlobalISel(*PR);
380   initializeAMDGPUDAGToDAGISelPass(*PR);
381   initializeGCNDPPCombinePass(*PR);
382   initializeSILowerI1CopiesPass(*PR);
383   initializeAMDGPUGlobalISelDivergenceLoweringPass(*PR);
384   initializeSILowerWWMCopiesPass(*PR);
385   initializeAMDGPUMarkLastScratchLoadPass(*PR);
386   initializeSILowerSGPRSpillsPass(*PR);
387   initializeSIFixSGPRCopiesPass(*PR);
388   initializeSIFixVGPRCopiesPass(*PR);
389   initializeSIFoldOperandsPass(*PR);
390   initializeSIPeepholeSDWAPass(*PR);
391   initializeSIShrinkInstructionsPass(*PR);
392   initializeSIOptimizeExecMaskingPreRAPass(*PR);
393   initializeSIOptimizeVGPRLiveRangePass(*PR);
394   initializeSILoadStoreOptimizerPass(*PR);
395   initializeAMDGPUCtorDtorLoweringLegacyPass(*PR);
396   initializeAMDGPUAlwaysInlinePass(*PR);
397   initializeAMDGPUAttributorLegacyPass(*PR);
398   initializeAMDGPUAnnotateKernelFeaturesPass(*PR);
399   initializeAMDGPUAnnotateUniformValuesPass(*PR);
400   initializeAMDGPUArgumentUsageInfoPass(*PR);
401   initializeAMDGPUAtomicOptimizerPass(*PR);
402   initializeAMDGPULowerKernelArgumentsPass(*PR);
403   initializeAMDGPUPromoteKernelArgumentsPass(*PR);
404   initializeAMDGPULowerKernelAttributesPass(*PR);
405   initializeAMDGPUOpenCLEnqueuedBlockLoweringPass(*PR);
406   initializeAMDGPUPostLegalizerCombinerPass(*PR);
407   initializeAMDGPUPreLegalizerCombinerPass(*PR);
408   initializeAMDGPURegBankCombinerPass(*PR);
409   initializeAMDGPURegBankSelectPass(*PR);
410   initializeAMDGPUPromoteAllocaPass(*PR);
411   initializeAMDGPUPromoteAllocaToVectorPass(*PR);
412   initializeAMDGPUCodeGenPreparePass(*PR);
413   initializeAMDGPULateCodeGenPreparePass(*PR);
414   initializeAMDGPURemoveIncompatibleFunctionsPass(*PR);
415   initializeAMDGPULowerModuleLDSLegacyPass(*PR);
416   initializeAMDGPURewriteOutArgumentsPass(*PR);
417   initializeAMDGPURewriteUndefForPHILegacyPass(*PR);
418   initializeAMDGPUUnifyMetadataPass(*PR);
419   initializeSIAnnotateControlFlowPass(*PR);
420   initializeAMDGPUInsertSingleUseVDSTPass(*PR);
421   initializeAMDGPUInsertDelayAluPass(*PR);
422   initializeSIInsertHardClausesPass(*PR);
423   initializeSIInsertWaitcntsPass(*PR);
424   initializeSIModeRegisterPass(*PR);
425   initializeSIWholeQuadModePass(*PR);
426   initializeSILowerControlFlowPass(*PR);
427   initializeSIPreEmitPeepholePass(*PR);
428   initializeSILateBranchLoweringPass(*PR);
429   initializeSIMemoryLegalizerPass(*PR);
430   initializeSIOptimizeExecMaskingPass(*PR);
431   initializeSIPreAllocateWWMRegsPass(*PR);
432   initializeSIFormMemoryClausesPass(*PR);
433   initializeSIPostRABundlerPass(*PR);
434   initializeGCNCreateVOPDPass(*PR);
435   initializeAMDGPUUnifyDivergentExitNodesPass(*PR);
436   initializeAMDGPUAAWrapperPassPass(*PR);
437   initializeAMDGPUExternalAAWrapperPass(*PR);
438   initializeAMDGPUImageIntrinsicOptimizerPass(*PR);
439   initializeAMDGPUPrintfRuntimeBindingPass(*PR);
440   initializeAMDGPUResourceUsageAnalysisPass(*PR);
441   initializeGCNNSAReassignPass(*PR);
442   initializeGCNPreRAOptimizationsPass(*PR);
443   initializeGCNPreRALongBranchRegPass(*PR);
444   initializeGCNRewritePartialRegUsesPass(*PR);
445   initializeGCNRegPressurePrinterPass(*PR);
446 }
447 
448 static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
449   return std::make_unique<AMDGPUTargetObjectFile>();
450 }
451 
452 static ScheduleDAGInstrs *createSIMachineScheduler(MachineSchedContext *C) {
453   return new SIScheduleDAGMI(C);
454 }
455 
456 static ScheduleDAGInstrs *
457 createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
458   const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
459   ScheduleDAGMILive *DAG =
460     new GCNScheduleDAGMILive(C, std::make_unique<GCNMaxOccupancySchedStrategy>(C));
461   DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
462   if (ST.shouldClusterStores())
463     DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
464   DAG->addMutation(createIGroupLPDAGMutation(/*IsPostRA=*/false));
465   DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
466   DAG->addMutation(createAMDGPUExportClusteringDAGMutation());
467   return DAG;
468 }
469 
470 static ScheduleDAGInstrs *
471 createGCNMaxILPMachineScheduler(MachineSchedContext *C) {
472   ScheduleDAGMILive *DAG =
473       new GCNScheduleDAGMILive(C, std::make_unique<GCNMaxILPSchedStrategy>(C));
474   DAG->addMutation(createIGroupLPDAGMutation(/*IsPostRA=*/false));
475   return DAG;
476 }
477 
478 static ScheduleDAGInstrs *
479 createIterativeGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
480   const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
481   auto DAG = new GCNIterativeScheduler(C,
482     GCNIterativeScheduler::SCHEDULE_LEGACYMAXOCCUPANCY);
483   DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
484   if (ST.shouldClusterStores())
485     DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
486   return DAG;
487 }
488 
489 static ScheduleDAGInstrs *createMinRegScheduler(MachineSchedContext *C) {
490   return new GCNIterativeScheduler(C,
491     GCNIterativeScheduler::SCHEDULE_MINREGFORCED);
492 }
493 
494 static ScheduleDAGInstrs *
495 createIterativeILPMachineScheduler(MachineSchedContext *C) {
496   const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
497   auto DAG = new GCNIterativeScheduler(C,
498     GCNIterativeScheduler::SCHEDULE_ILP);
499   DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
500   if (ST.shouldClusterStores())
501     DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
502   DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
503   return DAG;
504 }
505 
506 static MachineSchedRegistry
507 SISchedRegistry("si", "Run SI's custom scheduler",
508                 createSIMachineScheduler);
509 
510 static MachineSchedRegistry
511 GCNMaxOccupancySchedRegistry("gcn-max-occupancy",
512                              "Run GCN scheduler to maximize occupancy",
513                              createGCNMaxOccupancyMachineScheduler);
514 
515 static MachineSchedRegistry
516     GCNMaxILPSchedRegistry("gcn-max-ilp", "Run GCN scheduler to maximize ilp",
517                            createGCNMaxILPMachineScheduler);
518 
519 static MachineSchedRegistry IterativeGCNMaxOccupancySchedRegistry(
520     "gcn-iterative-max-occupancy-experimental",
521     "Run GCN scheduler to maximize occupancy (experimental)",
522     createIterativeGCNMaxOccupancyMachineScheduler);
523 
524 static MachineSchedRegistry GCNMinRegSchedRegistry(
525     "gcn-iterative-minreg",
526     "Run GCN iterative scheduler for minimal register usage (experimental)",
527     createMinRegScheduler);
528 
529 static MachineSchedRegistry GCNILPSchedRegistry(
530     "gcn-iterative-ilp",
531     "Run GCN iterative scheduler for ILP scheduling (experimental)",
532     createIterativeILPMachineScheduler);
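// The names registered above are meant to be selectable through the generic
// machine-scheduler selection flag (e.g., assuming the usual llc interface,
// "-misched=gcn-max-ilp").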
533 
534 static StringRef computeDataLayout(const Triple &TT) {
535   if (TT.getArch() == Triple::r600) {
536     // 32-bit pointers.
537     return "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
538            "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1";
539   }
540 
541   // 32-bit private, local, and region pointers. 64-bit global, constant and
542   // flat. 160-bit non-integral fat buffer pointers that include a 128-bit
543   // buffer descriptor and a 32-bit offset, which are indexed by 32-bit values
544   // (address space 7), and 128-bit non-integral buffer resources (address
545   // space 8) which cannot be non-trivially accessed by LLVM memory operations
546   // like getelementptr.
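  // Reader aid, assuming the usual p<AS>:<size>:<abi>[:<pref>[:<idx>]] syntax:
  // "p7:160:256:256:32" declares 160-bit pointers in address space 7 with
  // 256-bit ABI and preferred alignment and a 32-bit index width.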
547   return "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32"
548          "-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-"
549          "v32:32-v48:64-v96:"
550          "128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-"
551          "G1-ni:7:8:9";
552 }
553 
554 LLVM_READNONE
555 static StringRef getGPUOrDefault(const Triple &TT, StringRef GPU) {
556   if (!GPU.empty())
557     return GPU;
558 
559   // Need to default to a target with flat support for HSA.
560   if (TT.getArch() == Triple::amdgcn)
561     return TT.getOS() == Triple::AMDHSA ? "generic-hsa" : "generic";
562 
563   return "r600";
564 }
565 
566 static Reloc::Model getEffectiveRelocModel(std::optional<Reloc::Model> RM) {
567   // The AMDGPU toolchain only supports generating shared objects, so we
568   // must always use PIC.
569   return Reloc::PIC_;
570 }
571 
572 AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT,
573                                          StringRef CPU, StringRef FS,
574                                          TargetOptions Options,
575                                          std::optional<Reloc::Model> RM,
576                                          std::optional<CodeModel::Model> CM,
577                                          CodeGenOptLevel OptLevel)
578     : LLVMTargetMachine(T, computeDataLayout(TT), TT, getGPUOrDefault(TT, CPU),
579                         FS, Options, getEffectiveRelocModel(RM),
580                         getEffectiveCodeModel(CM, CodeModel::Small), OptLevel),
581       TLOF(createTLOF(getTargetTriple())) {
582   initAsmInfo();
583   if (TT.getArch() == Triple::amdgcn) {
584     if (getMCSubtargetInfo()->checkFeatures("+wavefrontsize64"))
585       MRI.reset(llvm::createGCNMCRegisterInfo(AMDGPUDwarfFlavour::Wave64));
586     else if (getMCSubtargetInfo()->checkFeatures("+wavefrontsize32"))
587       MRI.reset(llvm::createGCNMCRegisterInfo(AMDGPUDwarfFlavour::Wave32));
588   }
589 }
590 
591 bool AMDGPUTargetMachine::EnableLateStructurizeCFG = false;
592 bool AMDGPUTargetMachine::EnableFunctionCalls = false;
593 bool AMDGPUTargetMachine::EnableLowerModuleLDS = true;
594 
595 AMDGPUTargetMachine::~AMDGPUTargetMachine() = default;
596 
597 StringRef AMDGPUTargetMachine::getGPUName(const Function &F) const {
598   Attribute GPUAttr = F.getFnAttribute("target-cpu");
599   return GPUAttr.isValid() ? GPUAttr.getValueAsString() : getTargetCPU();
600 }
601 
602 StringRef AMDGPUTargetMachine::getFeatureString(const Function &F) const {
603   Attribute FSAttr = F.getFnAttribute("target-features");
604 
605   return FSAttr.isValid() ? FSAttr.getValueAsString()
606                           : getTargetFeatureString();
607 }
608 
609 /// Predicate for Internalize pass.
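/// Used below as the preservation predicate for InternalizePass when
/// -amdgpu-internalize-symbols is enabled.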
610 static bool mustPreserveGV(const GlobalValue &GV) {
611   if (const Function *F = dyn_cast<Function>(&GV))
612     return F->isDeclaration() || F->getName().starts_with("__asan_") ||
613            F->getName().starts_with("__sanitizer_") ||
614            AMDGPU::isEntryFunctionCC(F->getCallingConv());
615 
616   GV.removeDeadConstantUsers();
617   return !GV.use_empty();
618 }
619 
620 void AMDGPUTargetMachine::registerDefaultAliasAnalyses(AAManager &AAM) {
621   AAM.registerFunctionAnalysis<AMDGPUAA>();
622 }
623 
624 void AMDGPUTargetMachine::registerPassBuilderCallbacks(
625     PassBuilder &PB, bool PopulateClassToPassNames) {
626   PB.registerPipelineParsingCallback(
627       [this](StringRef PassName, ModulePassManager &PM,
628              ArrayRef<PassBuilder::PipelineElement>) {
629         if (PassName == "amdgpu-attributor") {
630           PM.addPass(AMDGPUAttributorPass(*this));
631           return true;
632         }
633         if (PassName == "amdgpu-unify-metadata") {
634           PM.addPass(AMDGPUUnifyMetadataPass());
635           return true;
636         }
637         if (PassName == "amdgpu-printf-runtime-binding") {
638           PM.addPass(AMDGPUPrintfRuntimeBindingPass());
639           return true;
640         }
641         if (PassName == "amdgpu-always-inline") {
642           PM.addPass(AMDGPUAlwaysInlinePass());
643           return true;
644         }
645         if (PassName == "amdgpu-lower-module-lds") {
646           PM.addPass(AMDGPULowerModuleLDSPass(*this));
647           return true;
648         }
649         if (PassName == "amdgpu-lower-ctor-dtor") {
650           PM.addPass(AMDGPUCtorDtorLoweringPass());
651           return true;
652         }
653         return false;
654       });
655   PB.registerPipelineParsingCallback(
656       [this](StringRef PassName, FunctionPassManager &PM,
657              ArrayRef<PassBuilder::PipelineElement>) {
658         if (PassName == "amdgpu-simplifylib") {
659           PM.addPass(AMDGPUSimplifyLibCallsPass());
660           return true;
661         }
662         if (PassName == "amdgpu-image-intrinsic-opt") {
663           PM.addPass(AMDGPUImageIntrinsicOptimizerPass(*this));
664           return true;
665         }
666         if (PassName == "amdgpu-usenative") {
667           PM.addPass(AMDGPUUseNativeCallsPass());
668           return true;
669         }
670         if (PassName == "amdgpu-promote-alloca") {
671           PM.addPass(AMDGPUPromoteAllocaPass(*this));
672           return true;
673         }
674         if (PassName == "amdgpu-promote-alloca-to-vector") {
675           PM.addPass(AMDGPUPromoteAllocaToVectorPass(*this));
676           return true;
677         }
678         if (PassName == "amdgpu-lower-kernel-attributes") {
679           PM.addPass(AMDGPULowerKernelAttributesPass());
680           return true;
681         }
682         if (PassName == "amdgpu-promote-kernel-arguments") {
683           PM.addPass(AMDGPUPromoteKernelArgumentsPass());
684           return true;
685         }
686         if (PassName == "amdgpu-unify-divergent-exit-nodes") {
687           PM.addPass(AMDGPUUnifyDivergentExitNodesPass());
688           return true;
689         }
690         if (PassName == "amdgpu-atomic-optimizer") {
691           PM.addPass(
692               AMDGPUAtomicOptimizerPass(*this, AMDGPUAtomicOptimizerStrategy));
693           return true;
694         }
695         if (PassName == "amdgpu-codegenprepare") {
696           PM.addPass(AMDGPUCodeGenPreparePass(*this));
697           return true;
698         }
699         if (PassName == "amdgpu-lower-kernel-arguments") {
700           PM.addPass(AMDGPULowerKernelArgumentsPass(*this));
701           return true;
702         }
703         if (PassName == "amdgpu-rewrite-undef-for-phi") {
704           PM.addPass(AMDGPURewriteUndefForPHIPass());
705           return true;
706         }
707         return false;
708       });
709 
710   PB.registerAnalysisRegistrationCallback([](FunctionAnalysisManager &FAM) {
711     FAM.registerPass([&] { return AMDGPUAA(); });
712   });
713 
714   PB.registerParseAACallback([](StringRef AAName, AAManager &AAM) {
715     if (AAName == "amdgpu-aa") {
716       AAM.registerFunctionAnalysis<AMDGPUAA>();
717       return true;
718     }
719     return false;
720   });
721 
722   PB.registerPipelineStartEPCallback(
723       [](ModulePassManager &PM, OptimizationLevel Level) {
724         FunctionPassManager FPM;
725         FPM.addPass(AMDGPUUseNativeCallsPass());
726         if (EnableLibCallSimplify && Level != OptimizationLevel::O0)
727           FPM.addPass(AMDGPUSimplifyLibCallsPass());
728         PM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
729         if (EnableHipStdPar)
730           PM.addPass(HipStdParAcceleratorCodeSelectionPass());
731       });
732 
733   PB.registerPipelineEarlySimplificationEPCallback(
734       [](ModulePassManager &PM, OptimizationLevel Level) {
735         PM.addPass(AMDGPUPrintfRuntimeBindingPass());
736 
737         if (Level == OptimizationLevel::O0)
738           return;
739 
740         PM.addPass(AMDGPUUnifyMetadataPass());
741 
742         if (InternalizeSymbols) {
743           PM.addPass(InternalizePass(mustPreserveGV));
744           PM.addPass(GlobalDCEPass());
745         }
746 
747         if (EarlyInlineAll && !EnableFunctionCalls)
748           PM.addPass(AMDGPUAlwaysInlinePass());
749       });
750 
751   PB.registerCGSCCOptimizerLateEPCallback(
752       [this](CGSCCPassManager &PM, OptimizationLevel Level) {
753         if (Level == OptimizationLevel::O0)
754           return;
755 
756         FunctionPassManager FPM;
757 
758         // Add promote kernel arguments pass to the opt pipeline right before
759         // infer address spaces which is needed to do actual address space
760         // rewriting.
761         if (Level.getSpeedupLevel() > OptimizationLevel::O1.getSpeedupLevel() &&
762             EnablePromoteKernelArguments)
763           FPM.addPass(AMDGPUPromoteKernelArgumentsPass());
764 
765         // Add infer address spaces pass to the opt pipeline after inlining
766         // but before SROA to increase SROA opportunities.
767         FPM.addPass(InferAddressSpacesPass());
768 
769         // This should run after inlining to have any chance of doing
770         // anything, and before other cleanup optimizations.
771         FPM.addPass(AMDGPULowerKernelAttributesPass());
772 
773         if (Level != OptimizationLevel::O0) {
774           // Promote alloca to vector before SROA and loop unroll. If we
775           // manage to eliminate allocas before unroll we may choose to unroll
776           // less.
777           FPM.addPass(AMDGPUPromoteAllocaToVectorPass(*this));
778         }
779 
780         PM.addPass(createCGSCCToFunctionPassAdaptor(std::move(FPM)));
781       });
782 }
783 
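// Address spaces in which 0 is a valid, addressable location (local, private
// and region) use an all-ones sentinel as the null pointer value; all other
// address spaces use 0.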
784 int64_t AMDGPUTargetMachine::getNullPointerValue(unsigned AddrSpace) {
785   return (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
786           AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ||
787           AddrSpace == AMDGPUAS::REGION_ADDRESS)
788              ? -1
789              : 0;
790 }
791 
792 bool AMDGPUTargetMachine::isNoopAddrSpaceCast(unsigned SrcAS,
793                                               unsigned DestAS) const {
794   return AMDGPU::isFlatGlobalAddrSpace(SrcAS) &&
795          AMDGPU::isFlatGlobalAddrSpace(DestAS);
796 }
797 
798 unsigned AMDGPUTargetMachine::getAssumedAddrSpace(const Value *V) const {
799   const auto *LD = dyn_cast<LoadInst>(V);
800   if (!LD)
801     return AMDGPUAS::UNKNOWN_ADDRESS_SPACE;
802 
803   // The loaded value must be a generic (flat) pointer.
804   assert(V->getType()->isPointerTy() &&
805          V->getType()->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS);
806 
807   const auto *Ptr = LD->getPointerOperand();
808   if (Ptr->getType()->getPointerAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS)
809     return AMDGPUAS::UNKNOWN_ADDRESS_SPACE;
810   // A generic pointer loaded from constant memory can be assumed to be a
811   // global pointer, since constant memory is only populated on the host side
812   // and, as implied by the offload programming model, only global pointers
813   // can be referenced on the host side.
814   return AMDGPUAS::GLOBAL_ADDRESS;
815 }
816 
817 std::pair<const Value *, unsigned>
818 AMDGPUTargetMachine::getPredicatedAddrSpace(const Value *V) const {
819   if (auto *II = dyn_cast<IntrinsicInst>(V)) {
820     switch (II->getIntrinsicID()) {
821     case Intrinsic::amdgcn_is_shared:
822       return std::pair(II->getArgOperand(0), AMDGPUAS::LOCAL_ADDRESS);
823     case Intrinsic::amdgcn_is_private:
824       return std::pair(II->getArgOperand(0), AMDGPUAS::PRIVATE_ADDRESS);
825     default:
826       break;
827     }
828     return std::pair(nullptr, -1);
829   }
830   // Check the global pointer predication based on
831   // (!is_shared(p) && !is_private(p)). Note that logical 'and' is commutative
832   // and the order of 'is_shared' and 'is_private' is not significant.
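  // Illustrative IR shape being matched (value names are hypothetical):
  //   %s  = call i1 @llvm.amdgcn.is.shared(ptr %p)
  //   %pr = call i1 @llvm.amdgcn.is.private(ptr %p)
  //   %ns = xor i1 %s, true
  //   %np = xor i1 %pr, true
  //   %g  = and i1 %ns, %np   ; %p may then be treated as a global pointer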
833   Value *Ptr;
834   if (match(
835           const_cast<Value *>(V),
836           m_c_And(m_Not(m_Intrinsic<Intrinsic::amdgcn_is_shared>(m_Value(Ptr))),
837                   m_Not(m_Intrinsic<Intrinsic::amdgcn_is_private>(
838                       m_Deferred(Ptr))))))
839     return std::pair(Ptr, AMDGPUAS::GLOBAL_ADDRESS);
840 
841   return std::pair(nullptr, -1);
842 }
843 
844 unsigned
845 AMDGPUTargetMachine::getAddressSpaceForPseudoSourceKind(unsigned Kind) const {
846   switch (Kind) {
847   case PseudoSourceValue::Stack:
848   case PseudoSourceValue::FixedStack:
849     return AMDGPUAS::PRIVATE_ADDRESS;
850   case PseudoSourceValue::ConstantPool:
851   case PseudoSourceValue::GOT:
852   case PseudoSourceValue::JumpTable:
853   case PseudoSourceValue::GlobalValueCallEntry:
854   case PseudoSourceValue::ExternalSymbolCallEntry:
855     return AMDGPUAS::CONSTANT_ADDRESS;
856   }
857   return AMDGPUAS::FLAT_ADDRESS;
858 }
859 
860 //===----------------------------------------------------------------------===//
861 // GCN Target Machine (SI+)
862 //===----------------------------------------------------------------------===//
863 
864 GCNTargetMachine::GCNTargetMachine(const Target &T, const Triple &TT,
865                                    StringRef CPU, StringRef FS,
866                                    TargetOptions Options,
867                                    std::optional<Reloc::Model> RM,
868                                    std::optional<CodeModel::Model> CM,
869                                    CodeGenOptLevel OL, bool JIT)
870     : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {}
871 
872 const TargetSubtargetInfo *
873 GCNTargetMachine::getSubtargetImpl(const Function &F) const {
874   StringRef GPU = getGPUName(F);
875   StringRef FS = getFeatureString(F);
876 
877   SmallString<128> SubtargetKey(GPU);
878   SubtargetKey.append(FS);
879 
880   auto &I = SubtargetMap[SubtargetKey];
881   if (!I) {
882     // This needs to be done before we create a new subtarget since any
883     // creation will depend on the TM and the code generation flags on the
884     // function that reside in TargetOptions.
885     resetTargetOptions(F);
886     I = std::make_unique<GCNSubtarget>(TargetTriple, GPU, FS, *this);
887   }
888 
889   I->setScalarizeGlobalBehavior(ScalarizeGlobal);
890 
891   return I.get();
892 }
893 
894 TargetTransformInfo
895 GCNTargetMachine::getTargetTransformInfo(const Function &F) const {
896   return TargetTransformInfo(GCNTTIImpl(this, F));
897 }
898 
899 //===----------------------------------------------------------------------===//
900 // AMDGPU Pass Setup
901 //===----------------------------------------------------------------------===//
902 
903 std::unique_ptr<CSEConfigBase> llvm::AMDGPUPassConfig::getCSEConfig() const {
904   return getStandardCSEConfigForOpt(TM->getOptLevel());
905 }
906 
907 namespace {
908 
909 class GCNPassConfig final : public AMDGPUPassConfig {
910 public:
911   GCNPassConfig(LLVMTargetMachine &TM, PassManagerBase &PM)
912     : AMDGPUPassConfig(TM, PM) {
913     // It is necessary to know the register usage of the entire call graph. We
914     // allow calls without EnableAMDGPUFunctionCalls if they are marked
915     // noinline, so this is always required.
916     setRequiresCodeGenSCCOrder(true);
917     substitutePass(&PostRASchedulerID, &PostMachineSchedulerID);
918   }
919 
920   GCNTargetMachine &getGCNTargetMachine() const {
921     return getTM<GCNTargetMachine>();
922   }
923 
924   ScheduleDAGInstrs *
925   createMachineScheduler(MachineSchedContext *C) const override;
926 
927   ScheduleDAGInstrs *
928   createPostMachineScheduler(MachineSchedContext *C) const override {
929     ScheduleDAGMI *DAG = new GCNPostScheduleDAGMILive(
930         C, std::make_unique<PostGenericScheduler>(C),
931         /*RemoveKillFlags=*/true);
932     const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
933     DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
934     if (ST.shouldClusterStores())
935       DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
936     DAG->addMutation(ST.createFillMFMAShadowMutation(DAG->TII));
937     DAG->addMutation(createIGroupLPDAGMutation(/*IsPostRA=*/true));
938     if (isPassEnabled(EnableVOPD, CodeGenOptLevel::Less))
939       DAG->addMutation(createVOPDPairingMutation());
940     return DAG;
941   }
942 
943   bool addPreISel() override;
944   void addMachineSSAOptimization() override;
945   bool addILPOpts() override;
946   bool addInstSelector() override;
947   bool addIRTranslator() override;
948   void addPreLegalizeMachineIR() override;
949   bool addLegalizeMachineIR() override;
950   void addPreRegBankSelect() override;
951   bool addRegBankSelect() override;
952   void addPreGlobalInstructionSelect() override;
953   bool addGlobalInstructionSelect() override;
954   void addFastRegAlloc() override;
955   void addOptimizedRegAlloc() override;
956 
957   FunctionPass *createSGPRAllocPass(bool Optimized);
958   FunctionPass *createVGPRAllocPass(bool Optimized);
959   FunctionPass *createRegAllocPass(bool Optimized) override;
960 
961   bool addRegAssignAndRewriteFast() override;
962   bool addRegAssignAndRewriteOptimized() override;
963 
964   void addPreRegAlloc() override;
965   bool addPreRewrite() override;
966   void addPostRegAlloc() override;
967   void addPreSched2() override;
968   void addPreEmitPass() override;
969 };
970 
971 } // end anonymous namespace
972 
973 AMDGPUPassConfig::AMDGPUPassConfig(LLVMTargetMachine &TM, PassManagerBase &PM)
974     : TargetPassConfig(TM, PM) {
975   // Exceptions and StackMaps are not supported, so these passes will never do
976   // anything.
977   disablePass(&StackMapLivenessID);
978   disablePass(&FuncletLayoutID);
979   // Garbage collection is not supported.
980   disablePass(&GCLoweringID);
981   disablePass(&ShadowStackGCLoweringID);
982 }
983 
984 void AMDGPUPassConfig::addEarlyCSEOrGVNPass() {
985   if (getOptLevel() == CodeGenOptLevel::Aggressive)
986     addPass(createGVNPass());
987   else
988     addPass(createEarlyCSEPass());
989 }
990 
991 void AMDGPUPassConfig::addStraightLineScalarOptimizationPasses() {
992   if (isPassEnabled(EnableLoopPrefetch, CodeGenOptLevel::Aggressive))
993     addPass(createLoopDataPrefetchPass());
994   addPass(createSeparateConstOffsetFromGEPPass());
995   // ReassociateGEPs exposes more opportunities for SLSR. See
996   // the example in reassociate-geps-and-slsr.ll.
997   addPass(createStraightLineStrengthReducePass());
998   // SeparateConstOffsetFromGEP and SLSR create common expressions which GVN or
999   // EarlyCSE can reuse.
1000   addEarlyCSEOrGVNPass();
1001   // Run NaryReassociate after EarlyCSE/GVN to be more effective.
1002   addPass(createNaryReassociatePass());
1003   // NaryReassociate on GEPs creates redundant common expressions, so run
1004   // EarlyCSE after it.
1005   addPass(createEarlyCSEPass());
1006 }
1007 
1008 void AMDGPUPassConfig::addIRPasses() {
1009   const AMDGPUTargetMachine &TM = getAMDGPUTargetMachine();
1010 
1011   Triple::ArchType Arch = TM.getTargetTriple().getArch();
1012   if (RemoveIncompatibleFunctions && Arch == Triple::amdgcn)
1013     addPass(createAMDGPURemoveIncompatibleFunctionsPass(&TM));
1014 
1015   // There is no reason to run these.
1016   disablePass(&StackMapLivenessID);
1017   disablePass(&FuncletLayoutID);
1018   disablePass(&PatchableFunctionID);
1019 
1020   addPass(createAMDGPUPrintfRuntimeBinding());
1021   if (LowerCtorDtor)
1022     addPass(createAMDGPUCtorDtorLoweringLegacyPass());
1023 
1024   if (isPassEnabled(EnableImageIntrinsicOptimizer))
1025     addPass(createAMDGPUImageIntrinsicOptimizerPass(&TM));
1026 
1027   // Function calls are not supported, so make sure we inline everything.
1028   addPass(createAMDGPUAlwaysInlinePass());
1029   addPass(createAlwaysInlinerLegacyPass());
1030 
1031   // Handle uses of OpenCL image2d_t, image3d_t and sampler_t arguments.
1032   if (Arch == Triple::r600)
1033     addPass(createR600OpenCLImageTypeLoweringPass());
1034 
1035   // Replace OpenCL enqueued block function pointers with global variables.
1036   addPass(createAMDGPUOpenCLEnqueuedBlockLoweringPass());
1037 
1038   // Runs before PromoteAlloca so the latter can account for function uses
1039   if (EnableLowerModuleLDS) {
1040     addPass(createAMDGPULowerModuleLDSLegacyPass(&TM));
1041   }
1042 
1043   // AMDGPUAttributor infers the absence of llvm.amdgcn.lds.kernel.id calls,
1044   // so run it after they have been introduced.
1045   if (TM.getOptLevel() > CodeGenOptLevel::None)
1046     addPass(createAMDGPUAttributorLegacyPass());
1047 
1048   if (TM.getOptLevel() > CodeGenOptLevel::None)
1049     addPass(createInferAddressSpacesPass());
1050 
1051   // Run atomic optimizer before Atomic Expand
1052   if ((TM.getTargetTriple().getArch() == Triple::amdgcn) &&
1053       (TM.getOptLevel() >= CodeGenOptLevel::Less) &&
1054       (AMDGPUAtomicOptimizerStrategy != ScanOptions::None)) {
1055     addPass(createAMDGPUAtomicOptimizerPass(AMDGPUAtomicOptimizerStrategy));
1056   }
1057 
1058   addPass(createAtomicExpandPass());
1059 
1060   if (TM.getOptLevel() > CodeGenOptLevel::None) {
1061     addPass(createAMDGPUPromoteAlloca());
1062 
1063     if (isPassEnabled(EnableScalarIRPasses))
1064       addStraightLineScalarOptimizationPasses();
1065 
1066     if (EnableAMDGPUAliasAnalysis) {
1067       addPass(createAMDGPUAAWrapperPass());
1068       addPass(createExternalAAWrapperPass([](Pass &P, Function &,
1069                                              AAResults &AAR) {
1070         if (auto *WrapperPass = P.getAnalysisIfAvailable<AMDGPUAAWrapperPass>())
1071           AAR.addAAResult(WrapperPass->getResult());
1072         }));
1073     }
1074 
1075     if (TM.getTargetTriple().getArch() == Triple::amdgcn) {
1076       // TODO: May want to move later or split into an early and late one.
1077       addPass(createAMDGPUCodeGenPreparePass());
1078     }
1079 
1080     // Try to hoist loop invariant parts of divisions AMDGPUCodeGenPrepare may
1081     // have expanded.
1082     if (TM.getOptLevel() > CodeGenOptLevel::Less)
1083       addPass(createLICMPass());
1084   }
1085 
1086   TargetPassConfig::addIRPasses();
1087 
1088   // EarlyCSE is not always strong enough to clean up what LSR produces. For
1089   // example, GVN can combine
1090   //
1091   //   %0 = add %a, %b
1092   //   %1 = add %b, %a
1093   //
1094   // and
1095   //
1096   //   %0 = shl nsw %a, 2
1097   //   %1 = shl %a, 2
1098   //
1099   // but EarlyCSE can do neither of them.
1100   if (isPassEnabled(EnableScalarIRPasses))
1101     addEarlyCSEOrGVNPass();
1102 }
1103 
1104 void AMDGPUPassConfig::addCodeGenPrepare() {
1105   if (TM->getTargetTriple().getArch() == Triple::amdgcn) {
1106     // FIXME: This pass adds 2 hacky attributes that can be replaced with an
1107     // analysis, and should be removed.
1108     addPass(createAMDGPUAnnotateKernelFeaturesPass());
1109   }
1110 
1111   if (TM->getTargetTriple().getArch() == Triple::amdgcn &&
1112       EnableLowerKernelArguments)
1113     addPass(createAMDGPULowerKernelArgumentsPass());
1114 
1115   TargetPassConfig::addCodeGenPrepare();
1116 
1117   if (isPassEnabled(EnableLoadStoreVectorizer))
1118     addPass(createLoadStoreVectorizerPass());
1119 
1120   // The LowerSwitch pass may introduce unreachable blocks that can cause
1121   // unexpected behavior in subsequent passes. Placing it here ensures that
1122   // these blocks are cleaned up by UnreachableBlockElim, which is inserted
1123   // next in the pass flow.
1124   addPass(createLowerSwitchPass());
1125 }
1126 
1127 bool AMDGPUPassConfig::addPreISel() {
1128   if (TM->getOptLevel() > CodeGenOptLevel::None)
1129     addPass(createFlattenCFGPass());
1130   return false;
1131 }
1132 
1133 bool AMDGPUPassConfig::addInstSelector() {
1134   addPass(createAMDGPUISelDag(getAMDGPUTargetMachine(), getOptLevel()));
1135   return false;
1136 }
1137 
1138 bool AMDGPUPassConfig::addGCPasses() {
1139   // Do nothing. GC is not supported.
1140   return false;
1141 }
1142 
1143 llvm::ScheduleDAGInstrs *
1144 AMDGPUPassConfig::createMachineScheduler(MachineSchedContext *C) const {
1145   const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
1146   ScheduleDAGMILive *DAG = createGenericSchedLive(C);
1147   DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
1148   if (ST.shouldClusterStores())
1149     DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
1150   return DAG;
1151 }
1152 
1153 MachineFunctionInfo *R600TargetMachine::createMachineFunctionInfo(
1154     BumpPtrAllocator &Allocator, const Function &F,
1155     const TargetSubtargetInfo *STI) const {
1156   return R600MachineFunctionInfo::create<R600MachineFunctionInfo>(
1157       Allocator, F, static_cast<const R600Subtarget *>(STI));
1158 }
1159 
1160 //===----------------------------------------------------------------------===//
1161 // GCN Pass Setup
1162 //===----------------------------------------------------------------------===//
1163 
1164 ScheduleDAGInstrs *GCNPassConfig::createMachineScheduler(
1165   MachineSchedContext *C) const {
1166   const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
1167   if (ST.enableSIScheduler())
1168     return createSIMachineScheduler(C);
1169 
1170   if (EnableMaxIlpSchedStrategy)
1171     return createGCNMaxILPMachineScheduler(C);
1172 
1173   return createGCNMaxOccupancyMachineScheduler(C);
1174 }
1175 
1176 bool GCNPassConfig::addPreISel() {
1177   AMDGPUPassConfig::addPreISel();
1178 
1179   if (TM->getOptLevel() > CodeGenOptLevel::None)
1180     addPass(createAMDGPULateCodeGenPreparePass());
1181 
1182   if (TM->getOptLevel() > CodeGenOptLevel::None)
1183     addPass(createSinkingPass());
1184 
1185   // Merge divergent exit nodes. StructurizeCFG won't recognize the multi-exit
1186   // regions formed by them.
1187   addPass(&AMDGPUUnifyDivergentExitNodesID);
1188   if (!LateCFGStructurize) {
1189     if (EnableStructurizerWorkarounds) {
1190       addPass(createFixIrreduciblePass());
1191       addPass(createUnifyLoopExitsPass());
1192     }
1193     addPass(createStructurizeCFGPass(false)); // true -> SkipUniformRegions
1194   }
1195   addPass(createAMDGPUAnnotateUniformValues());
1196   if (!LateCFGStructurize) {
1197     addPass(createSIAnnotateControlFlowPass());
1198     // TODO: Move this right after structurizeCFG to avoid extra divergence
1199     // analysis. This depends on stopping SIAnnotateControlFlow from making
1200     // control flow modifications.
1201     addPass(createAMDGPURewriteUndefForPHILegacyPass());
1202   }
1203   addPass(createLCSSAPass());
1204 
1205   if (TM->getOptLevel() > CodeGenOptLevel::Less)
1206     addPass(&AMDGPUPerfHintAnalysisID);
1207 
1208   return false;
1209 }
1210 
1211 void GCNPassConfig::addMachineSSAOptimization() {
1212   TargetPassConfig::addMachineSSAOptimization();
1213 
1214   // We want to fold operands after PeepholeOptimizer has run (or as part of
1215   // it), because it will eliminate extra copies making it easier to fold the
1216   // real source operand. We want to eliminate dead instructions after, so that
1217   // we see fewer uses of the copies. We then need to clean up the dead
1218   // instructions leftover after the operands are folded as well.
1219   //
1220   // XXX - Can we get away without running DeadMachineInstructionElim again?
1221   addPass(&SIFoldOperandsID);
1222   if (EnableDPPCombine)
1223     addPass(&GCNDPPCombineID);
1224   addPass(&SILoadStoreOptimizerID);
1225   if (isPassEnabled(EnableSDWAPeephole)) {
1226     addPass(&SIPeepholeSDWAID);
1227     addPass(&EarlyMachineLICMID);
1228     addPass(&MachineCSEID);
1229     addPass(&SIFoldOperandsID);
1230   }
1231   addPass(&DeadMachineInstructionElimID);
1232   addPass(createSIShrinkInstructionsPass());
1233 }
1234 
1235 bool GCNPassConfig::addILPOpts() {
1236   if (EnableEarlyIfConversion)
1237     addPass(&EarlyIfConverterID);
1238 
1239   TargetPassConfig::addILPOpts();
1240   return false;
1241 }
1242 
1243 bool GCNPassConfig::addInstSelector() {
1244   AMDGPUPassConfig::addInstSelector();
1245   addPass(&SIFixSGPRCopiesID);
1246   addPass(createSILowerI1CopiesPass());
1247   return false;
1248 }
1249 
1250 bool GCNPassConfig::addIRTranslator() {
1251   addPass(new IRTranslator(getOptLevel()));
1252   return false;
1253 }
1254 
1255 void GCNPassConfig::addPreLegalizeMachineIR() {
1256   bool IsOptNone = getOptLevel() == CodeGenOptLevel::None;
1257   addPass(createAMDGPUPreLegalizeCombiner(IsOptNone));
1258   addPass(new Localizer());
1259 }
1260 
1261 bool GCNPassConfig::addLegalizeMachineIR() {
1262   addPass(new Legalizer());
1263   return false;
1264 }
1265 
1266 void GCNPassConfig::addPreRegBankSelect() {
1267   bool IsOptNone = getOptLevel() == CodeGenOptLevel::None;
1268   addPass(createAMDGPUPostLegalizeCombiner(IsOptNone));
1269   addPass(createAMDGPUGlobalISelDivergenceLoweringPass());
1270 }
1271 
1272 bool GCNPassConfig::addRegBankSelect() {
1273   addPass(new AMDGPURegBankSelect());
1274   return false;
1275 }
1276 
1277 void GCNPassConfig::addPreGlobalInstructionSelect() {
1278   bool IsOptNone = getOptLevel() == CodeGenOptLevel::None;
1279   addPass(createAMDGPURegBankCombiner(IsOptNone));
1280 }
1281 
1282 bool GCNPassConfig::addGlobalInstructionSelect() {
1283   addPass(new InstructionSelect(getOptLevel()));
1284   return false;
1285 }
1286 
1287 void GCNPassConfig::addPreRegAlloc() {
1288   if (LateCFGStructurize) {
1289     addPass(createAMDGPUMachineCFGStructurizerPass());
1290   }
1291 }
1292 
1293 void GCNPassConfig::addFastRegAlloc() {
1294   // FIXME: We have to disable the verifier here because of PHIElimination +
1295   // TwoAddressInstructions disabling it.
1296 
1297   // This must be run immediately after phi elimination and before
1298   // TwoAddressInstructions, otherwise the processing of the tied operand of
1299   // SI_ELSE will introduce a copy of the tied operand source after the else.
1300   insertPass(&PHIEliminationID, &SILowerControlFlowID);
1301 
1302   insertPass(&TwoAddressInstructionPassID, &SIWholeQuadModeID);
1303 
1304   TargetPassConfig::addFastRegAlloc();
1305 }
1306 
1307 void GCNPassConfig::addOptimizedRegAlloc() {
1308   // Allow the scheduler to run before SIWholeQuadMode inserts exec manipulation
1309   // instructions that cause scheduling barriers.
1310   insertPass(&MachineSchedulerID, &SIWholeQuadModeID);
1311 
1312   if (OptExecMaskPreRA)
1313     insertPass(&MachineSchedulerID, &SIOptimizeExecMaskingPreRAID);
1314 
1315   if (EnableRewritePartialRegUses)
1316     insertPass(&RenameIndependentSubregsID, &GCNRewritePartialRegUsesID);
1317 
1318   if (isPassEnabled(EnablePreRAOptimizations))
1319     insertPass(&RenameIndependentSubregsID, &GCNPreRAOptimizationsID);
1320 
1321   // This is not an essential optimization and it has a noticeable impact on
1322   // compilation time, so we only enable it from O2.
1323   if (TM->getOptLevel() > CodeGenOptLevel::Less)
1324     insertPass(&MachineSchedulerID, &SIFormMemoryClausesID);
1325 
1326   // FIXME: When an instruction inside a bundle has a killed operand, it seems
1327   // that only the BUNDLE instruction is recorded as the kill of the register
1328   // in LiveVariables. This triggers a verifier failure; we should fix it and
1329   // re-enable the verifier.
1330   if (OptVGPRLiveRange)
1331     insertPass(&LiveVariablesID, &SIOptimizeVGPRLiveRangeID);
1332   // This must be run immediately after phi elimination and before
1333   // TwoAddressInstructions, otherwise the processing of the tied operand of
1334   // SI_ELSE will introduce a copy of the tied operand source after the else.
1335   insertPass(&PHIEliminationID, &SILowerControlFlowID);
1336 
1337   if (EnableDCEInRA)
1338     insertPass(&DetectDeadLanesID, &DeadMachineInstructionElimID);
1339 
1340   TargetPassConfig::addOptimizedRegAlloc();
1341 }
1342 
1343 bool GCNPassConfig::addPreRewrite() {
1344   addPass(&SILowerWWMCopiesID);
1345   if (EnableRegReassign)
1346     addPass(&GCNNSAReassignID);
1347   return true;
1348 }
1349 
1350 FunctionPass *GCNPassConfig::createSGPRAllocPass(bool Optimized) {
1351   // Initialize the global default.
1352   llvm::call_once(InitializeDefaultSGPRRegisterAllocatorFlag,
1353                   initializeDefaultSGPRRegisterAllocatorOnce);
1354 
1355   RegisterRegAlloc::FunctionPassCtor Ctor = SGPRRegisterRegAlloc::getDefault();
1356   if (Ctor != useDefaultRegisterAllocator)
1357     return Ctor();
1358 
1359   if (Optimized)
1360     return createGreedyRegisterAllocator(onlyAllocateSGPRs);
1361 
1362   return createFastRegisterAllocator(onlyAllocateSGPRs, false);
1363 }
1364 
1365 FunctionPass *GCNPassConfig::createVGPRAllocPass(bool Optimized) {
1366   // Initialize the global default.
1367   llvm::call_once(InitializeDefaultVGPRRegisterAllocatorFlag,
1368                   initializeDefaultVGPRRegisterAllocatorOnce);
1369 
1370   RegisterRegAlloc::FunctionPassCtor Ctor = VGPRRegisterRegAlloc::getDefault();
1371   if (Ctor != useDefaultRegisterAllocator)
1372     return Ctor();
1373 
1374   if (Optimized)
1375     return createGreedyVGPRRegisterAllocator();
1376 
1377   return createFastVGPRRegisterAllocator();
1378 }
1379 
1380 FunctionPass *GCNPassConfig::createRegAllocPass(bool Optimized) {
1381   llvm_unreachable("should not be used");
1382 }
1383 
1384 static const char RegAllocOptNotSupportedMessage[] =
1385   "-regalloc not supported with amdgcn. Use -sgpr-regalloc and -vgpr-regalloc";
1386 
1387 bool GCNPassConfig::addRegAssignAndRewriteFast() {
1388   if (!usingDefaultRegAlloc())
1389     report_fatal_error(RegAllocOptNotSupportedMessage);
1390 
1391   addPass(&GCNPreRALongBranchRegID);
1392 
1393   addPass(createSGPRAllocPass(false));
1394 
1395   // Equivalent of PEI for SGPRs.
1396   addPass(&SILowerSGPRSpillsID);
1397   addPass(&SIPreAllocateWWMRegsID);
1398 
1399   addPass(createVGPRAllocPass(false));
1400 
1401   addPass(&SILowerWWMCopiesID);
1402   return true;
1403 }
1404 
1405 bool GCNPassConfig::addRegAssignAndRewriteOptimized() {
1406   if (!usingDefaultRegAlloc())
1407     report_fatal_error(RegAllocOptNotSupportedMessage);
1408 
1409   addPass(&GCNPreRALongBranchRegID);
1410 
1411   addPass(createSGPRAllocPass(true));
1412 
1413   // Commit allocated register changes. This is mostly necessary because too
1414   // many things rely on the use lists of the physical registers, such as the
1415   // verifier. This is only necessary with allocators which use LiveIntervals,
1416   // since FastRegAlloc does the replacements itself.
1417   addPass(createVirtRegRewriter(false));
1418 
1419   // Equivalent of PEI for SGPRs.
1420   addPass(&SILowerSGPRSpillsID);
1421   addPass(&SIPreAllocateWWMRegsID);
1422 
1423   addPass(createVGPRAllocPass(true));
1424 
1425   addPreRewrite();
1426   addPass(&VirtRegRewriterID);
1427 
1428   addPass(&AMDGPUMarkLastScratchLoadID);
1429 
1430   return true;
1431 }
1432 
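// Post-register-allocation fixups: SIFixVGPRCopies always, plus EXEC mask
// optimization when optimizing.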
1433 void GCNPassConfig::addPostRegAlloc() {
1434   addPass(&SIFixVGPRCopiesID);
1435   if (getOptLevel() > CodeGenOptLevel::None)
1436     addPass(&SIOptimizeExecMaskingID);
1437   TargetPassConfig::addPostRegAlloc();
1438 }
1439 
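// Runs before the post-RA scheduler: shrink instructions when optimizing and
// run the post-RA bundler.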
1440 void GCNPassConfig::addPreSched2() {
1441   if (TM->getOptLevel() > CodeGenOptLevel::None)
1442     addPass(createSIShrinkInstructionsPass());
1443   addPass(&SIPostRABundlerID);
1444 }
1445 
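// Late pre-emission passes: memory legalization, waitcnt insertion, mode
// register handling, hazard mitigation, and branch lowering/relaxation.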
1446 void GCNPassConfig::addPreEmitPass() {
1447   if (isPassEnabled(EnableVOPD, CodeGenOptLevel::Less))
1448     addPass(&GCNCreateVOPDID);
1449   addPass(createSIMemoryLegalizerPass());
1450   addPass(createSIInsertWaitcntsPass());
1451 
1452   addPass(createSIModeRegisterPass());
1453 
1454   if (getOptLevel() > CodeGenOptLevel::None)
1455     addPass(&SIInsertHardClausesID);
1456 
1457   addPass(&SILateBranchLoweringPassID);
1458   if (isPassEnabled(EnableSetWavePriority, CodeGenOptLevel::Less))
1459     addPass(createAMDGPUSetWavePriorityPass());
1460   if (getOptLevel() > CodeGenOptLevel::None)
1461     addPass(&SIPreEmitPeepholeID);
  // The hazard recognizer that runs as part of the post-RA scheduler is not
  // guaranteed to handle all hazards correctly. This is because if there are
  // multiple scheduling regions in a basic block, the regions are scheduled
  // bottom up, so when we begin to schedule a region we don't know what
  // instructions were emitted directly before it.
1467   //
1468   // Here we add a stand-alone hazard recognizer pass which can handle all
1469   // cases.
1470   addPass(&PostRAHazardRecognizerID);
1471 
1472   if (isPassEnabled(EnableInsertSingleUseVDST, CodeGenOptLevel::Less))
1473     addPass(&AMDGPUInsertSingleUseVDSTID);
1474 
1475   if (isPassEnabled(EnableInsertDelayAlu, CodeGenOptLevel::Less))
1476     addPass(&AMDGPUInsertDelayAluID);
1477 
1478   addPass(&BranchRelaxationPassID);
1479 }
1480 
1481 TargetPassConfig *GCNTargetMachine::createPassConfig(PassManagerBase &PM) {
1482   return new GCNPassConfig(*this, PM);
1483 }
1484 
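// Attach SIMachineFunctionInfo as a MachineRegisterInfo delegate so it is
// notified when virtual registers are created for this function.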
1485 void GCNTargetMachine::registerMachineRegisterInfoCallback(
1486     MachineFunction &MF) const {
1487   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1488   MF.getRegInfo().addDelegate(MFI);
1489 }
1490 
1491 MachineFunctionInfo *GCNTargetMachine::createMachineFunctionInfo(
1492     BumpPtrAllocator &Allocator, const Function &F,
1493     const TargetSubtargetInfo *STI) const {
1494   return SIMachineFunctionInfo::create<SIMachineFunctionInfo>(
1495       Allocator, F, static_cast<const GCNSubtarget *>(STI));
1496 }
1497 
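// MIR serialization hooks: produce the default YAML form of the SI machine
// function info and convert an existing function's info to YAML.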
1498 yaml::MachineFunctionInfo *GCNTargetMachine::createDefaultFuncInfoYAML() const {
1499   return new yaml::SIMachineFunctionInfo();
1500 }
1501 
1502 yaml::MachineFunctionInfo *
1503 GCNTargetMachine::convertFuncInfoToYAML(const MachineFunction &MF) const {
1504   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1505   return new yaml::SIMachineFunctionInfo(
1506       *MFI, *MF.getSubtarget<GCNSubtarget>().getRegisterInfo(), MF);
1507 }
1508 
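// MIR deserialization: populate SIMachineFunctionInfo from the parsed YAML,
// validating register names, register classes, and argument descriptors.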
1509 bool GCNTargetMachine::parseMachineFunctionInfo(
1510     const yaml::MachineFunctionInfo &MFI_, PerFunctionMIParsingState &PFS,
1511     SMDiagnostic &Error, SMRange &SourceRange) const {
1512   const yaml::SIMachineFunctionInfo &YamlMFI =
1513       static_cast<const yaml::SIMachineFunctionInfo &>(MFI_);
1514   MachineFunction &MF = PFS.MF;
1515   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1516   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1517 
1518   if (MFI->initializeBaseYamlFields(YamlMFI, MF, PFS, Error, SourceRange))
1519     return true;
1520 
1521   if (MFI->Occupancy == 0) {
    // Fix up the subtarget-dependent default value.
1523     MFI->Occupancy = ST.computeOccupancy(MF.getFunction(), MFI->getLDSSize());
1524   }
1525 
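  // Parse a named register reference; on failure, point the diagnostic at the
  // register's source range.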
1526   auto parseRegister = [&](const yaml::StringValue &RegName, Register &RegVal) {
1527     Register TempReg;
1528     if (parseNamedRegisterReference(PFS, TempReg, RegName.Value, Error)) {
1529       SourceRange = RegName.SourceRange;
1530       return true;
1531     }
1532     RegVal = TempReg;
1533 
1534     return false;
1535   };
1536 
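  // Same as parseRegister, but empty (unspecified) fields are left untouched.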
1537   auto parseOptionalRegister = [&](const yaml::StringValue &RegName,
1538                                    Register &RegVal) {
1539     return !RegName.Value.empty() && parseRegister(RegName, RegVal);
1540   };
1541 
1542   if (parseOptionalRegister(YamlMFI.VGPRForAGPRCopy, MFI->VGPRForAGPRCopy))
1543     return true;
1544 
1545   if (parseOptionalRegister(YamlMFI.SGPRForEXECCopy, MFI->SGPRForEXECCopy))
1546     return true;
1547 
1548   if (parseOptionalRegister(YamlMFI.LongBranchReservedReg,
1549                             MFI->LongBranchReservedReg))
1550     return true;
1551 
1552   auto diagnoseRegisterClass = [&](const yaml::StringValue &RegName) {
    // Create a diagnostic for the register string literal.
1554     const MemoryBuffer &Buffer =
1555         *PFS.SM->getMemoryBuffer(PFS.SM->getMainFileID());
1556     Error = SMDiagnostic(*PFS.SM, SMLoc(), Buffer.getBufferIdentifier(), 1,
1557                          RegName.Value.size(), SourceMgr::DK_Error,
1558                          "incorrect register class for field", RegName.Value,
1559                          std::nullopt, std::nullopt);
1560     SourceRange = RegName.SourceRange;
1561     return true;
1562   };
1563 
1564   if (parseRegister(YamlMFI.ScratchRSrcReg, MFI->ScratchRSrcReg) ||
1565       parseRegister(YamlMFI.FrameOffsetReg, MFI->FrameOffsetReg) ||
1566       parseRegister(YamlMFI.StackPtrOffsetReg, MFI->StackPtrOffsetReg))
1567     return true;
1568 
1569   if (MFI->ScratchRSrcReg != AMDGPU::PRIVATE_RSRC_REG &&
1570       !AMDGPU::SGPR_128RegClass.contains(MFI->ScratchRSrcReg)) {
1571     return diagnoseRegisterClass(YamlMFI.ScratchRSrcReg);
1572   }
1573 
1574   if (MFI->FrameOffsetReg != AMDGPU::FP_REG &&
1575       !AMDGPU::SGPR_32RegClass.contains(MFI->FrameOffsetReg)) {
1576     return diagnoseRegisterClass(YamlMFI.FrameOffsetReg);
1577   }
1578 
1579   if (MFI->StackPtrOffsetReg != AMDGPU::SP_REG &&
1580       !AMDGPU::SGPR_32RegClass.contains(MFI->StackPtrOffsetReg)) {
1581     return diagnoseRegisterClass(YamlMFI.StackPtrOffsetReg);
1582   }
1583 
1584   for (const auto &YamlReg : YamlMFI.WWMReservedRegs) {
1585     Register ParsedReg;
1586     if (parseRegister(YamlReg, ParsedReg))
1587       return true;
1588 
1589     MFI->reserveWWMRegister(ParsedReg);
1590   }
1591 
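  // Parse an optional argument descriptor, verify that a register argument
  // belongs to the expected register class, and account for the user/system
  // SGPRs it consumes.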
1592   auto parseAndCheckArgument = [&](const std::optional<yaml::SIArgument> &A,
1593                                    const TargetRegisterClass &RC,
1594                                    ArgDescriptor &Arg, unsigned UserSGPRs,
1595                                    unsigned SystemSGPRs) {
1596     // Skip parsing if it's not present.
1597     if (!A)
1598       return false;
1599 
1600     if (A->IsRegister) {
1601       Register Reg;
1602       if (parseNamedRegisterReference(PFS, Reg, A->RegisterName.Value, Error)) {
1603         SourceRange = A->RegisterName.SourceRange;
1604         return true;
1605       }
1606       if (!RC.contains(Reg))
1607         return diagnoseRegisterClass(A->RegisterName);
1608       Arg = ArgDescriptor::createRegister(Reg);
1609     } else
1610       Arg = ArgDescriptor::createStack(A->StackOffset);
1611     // Check and apply the optional mask.
1612     if (A->Mask)
1613       Arg = ArgDescriptor::createArg(Arg, *A->Mask);
1614 
1615     MFI->NumUserSGPRs += UserSGPRs;
1616     MFI->NumSystemSGPRs += SystemSGPRs;
1617     return false;
1618   };
1619 
1620   if (YamlMFI.ArgInfo &&
1621       (parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentBuffer,
1622                              AMDGPU::SGPR_128RegClass,
1623                              MFI->ArgInfo.PrivateSegmentBuffer, 4, 0) ||
1624        parseAndCheckArgument(YamlMFI.ArgInfo->DispatchPtr,
1625                              AMDGPU::SReg_64RegClass, MFI->ArgInfo.DispatchPtr,
1626                              2, 0) ||
1627        parseAndCheckArgument(YamlMFI.ArgInfo->QueuePtr, AMDGPU::SReg_64RegClass,
1628                              MFI->ArgInfo.QueuePtr, 2, 0) ||
1629        parseAndCheckArgument(YamlMFI.ArgInfo->KernargSegmentPtr,
1630                              AMDGPU::SReg_64RegClass,
1631                              MFI->ArgInfo.KernargSegmentPtr, 2, 0) ||
1632        parseAndCheckArgument(YamlMFI.ArgInfo->DispatchID,
1633                              AMDGPU::SReg_64RegClass, MFI->ArgInfo.DispatchID,
1634                              2, 0) ||
1635        parseAndCheckArgument(YamlMFI.ArgInfo->FlatScratchInit,
1636                              AMDGPU::SReg_64RegClass,
1637                              MFI->ArgInfo.FlatScratchInit, 2, 0) ||
1638        parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentSize,
1639                              AMDGPU::SGPR_32RegClass,
1640                              MFI->ArgInfo.PrivateSegmentSize, 0, 0) ||
1641        parseAndCheckArgument(YamlMFI.ArgInfo->LDSKernelId,
1642                              AMDGPU::SGPR_32RegClass,
1643                              MFI->ArgInfo.LDSKernelId, 0, 1) ||
1644        parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDX,
1645                              AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDX,
1646                              0, 1) ||
1647        parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDY,
1648                              AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDY,
1649                              0, 1) ||
1650        parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDZ,
1651                              AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDZ,
1652                              0, 1) ||
1653        parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupInfo,
1654                              AMDGPU::SGPR_32RegClass,
1655                              MFI->ArgInfo.WorkGroupInfo, 0, 1) ||
1656        parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentWaveByteOffset,
1657                              AMDGPU::SGPR_32RegClass,
1658                              MFI->ArgInfo.PrivateSegmentWaveByteOffset, 0, 1) ||
1659        parseAndCheckArgument(YamlMFI.ArgInfo->ImplicitArgPtr,
1660                              AMDGPU::SReg_64RegClass,
1661                              MFI->ArgInfo.ImplicitArgPtr, 0, 0) ||
1662        parseAndCheckArgument(YamlMFI.ArgInfo->ImplicitBufferPtr,
1663                              AMDGPU::SReg_64RegClass,
1664                              MFI->ArgInfo.ImplicitBufferPtr, 2, 0) ||
1665        parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDX,
1666                              AMDGPU::VGPR_32RegClass,
1667                              MFI->ArgInfo.WorkItemIDX, 0, 0) ||
1668        parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDY,
1669                              AMDGPU::VGPR_32RegClass,
1670                              MFI->ArgInfo.WorkItemIDY, 0, 0) ||
1671        parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDZ,
1672                              AMDGPU::VGPR_32RegClass,
1673                              MFI->ArgInfo.WorkItemIDZ, 0, 0)))
1674     return true;
1675 
1676   if (ST.hasIEEEMode())
1677     MFI->Mode.IEEE = YamlMFI.Mode.IEEE;
1678   if (ST.hasDX10ClampMode())
1679     MFI->Mode.DX10Clamp = YamlMFI.Mode.DX10Clamp;
1680 
1681   // FIXME: Move proper support for denormal-fp-math into base MachineFunction
1682   MFI->Mode.FP32Denormals.Input = YamlMFI.Mode.FP32InputDenormals
1683                                       ? DenormalMode::IEEE
1684                                       : DenormalMode::PreserveSign;
1685   MFI->Mode.FP32Denormals.Output = YamlMFI.Mode.FP32OutputDenormals
1686                                        ? DenormalMode::IEEE
1687                                        : DenormalMode::PreserveSign;
1688 
1689   MFI->Mode.FP64FP16Denormals.Input = YamlMFI.Mode.FP64FP16InputDenormals
1690                                           ? DenormalMode::IEEE
1691                                           : DenormalMode::PreserveSign;
1692   MFI->Mode.FP64FP16Denormals.Output = YamlMFI.Mode.FP64FP16OutputDenormals
1693                                            ? DenormalMode::IEEE
1694                                            : DenormalMode::PreserveSign;
1695 
1696   return false;
1697 }
1698