1 //===-- AMDGPUTargetMachine.cpp - TargetMachine for hw codegen targets-----===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// The AMDGPU target machine contains all of the hardware specific
11 /// information  needed to emit code for SI+ GPUs.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 #include "AMDGPUTargetMachine.h"
16 #include "AMDGPU.h"
17 #include "AMDGPUAliasAnalysis.h"
18 #include "AMDGPUCtorDtorLowering.h"
19 #include "AMDGPUExportClustering.h"
20 #include "AMDGPUIGroupLP.h"
21 #include "AMDGPUMacroFusion.h"
22 #include "AMDGPURegBankSelect.h"
23 #include "AMDGPUTargetObjectFile.h"
24 #include "AMDGPUTargetTransformInfo.h"
25 #include "AMDGPUUnifyDivergentExitNodes.h"
26 #include "GCNIterativeScheduler.h"
27 #include "GCNSchedStrategy.h"
28 #include "GCNVOPDUtils.h"
29 #include "R600.h"
30 #include "R600MachineFunctionInfo.h"
31 #include "R600TargetMachine.h"
32 #include "SIMachineFunctionInfo.h"
33 #include "SIMachineScheduler.h"
34 #include "TargetInfo/AMDGPUTargetInfo.h"
35 #include "Utils/AMDGPUBaseInfo.h"
36 #include "llvm/Analysis/CGSCCPassManager.h"
37 #include "llvm/CodeGen/GlobalISel/CSEInfo.h"
38 #include "llvm/CodeGen/GlobalISel/IRTranslator.h"
39 #include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
40 #include "llvm/CodeGen/GlobalISel/Legalizer.h"
41 #include "llvm/CodeGen/GlobalISel/Localizer.h"
42 #include "llvm/CodeGen/GlobalISel/RegBankSelect.h"
43 #include "llvm/CodeGen/MIRParser/MIParser.h"
44 #include "llvm/CodeGen/Passes.h"
45 #include "llvm/CodeGen/RegAllocRegistry.h"
46 #include "llvm/CodeGen/TargetPassConfig.h"
47 #include "llvm/IR/IntrinsicsAMDGPU.h"
48 #include "llvm/IR/PassManager.h"
49 #include "llvm/IR/PatternMatch.h"
50 #include "llvm/InitializePasses.h"
51 #include "llvm/MC/TargetRegistry.h"
52 #include "llvm/Passes/PassBuilder.h"
53 #include "llvm/Transforms/IPO.h"
54 #include "llvm/Transforms/IPO/AlwaysInliner.h"
55 #include "llvm/Transforms/IPO/GlobalDCE.h"
56 #include "llvm/Transforms/IPO/Internalize.h"
57 #include "llvm/Transforms/Scalar.h"
58 #include "llvm/Transforms/Scalar/GVN.h"
59 #include "llvm/Transforms/Scalar/InferAddressSpaces.h"
60 #include "llvm/Transforms/Utils.h"
61 #include "llvm/Transforms/Utils/SimplifyLibCalls.h"
62 #include "llvm/Transforms/Vectorize/LoadStoreVectorizer.h"
63 #include <optional>
64 
65 using namespace llvm;
66 using namespace llvm::PatternMatch;
67 
68 namespace {
69 class SGPRRegisterRegAlloc : public RegisterRegAllocBase<SGPRRegisterRegAlloc> {
70 public:
71   SGPRRegisterRegAlloc(const char *N, const char *D, FunctionPassCtor C)
72     : RegisterRegAllocBase(N, D, C) {}
73 };
74 
75 class VGPRRegisterRegAlloc : public RegisterRegAllocBase<VGPRRegisterRegAlloc> {
76 public:
77   VGPRRegisterRegAlloc(const char *N, const char *D, FunctionPassCtor C)
78     : RegisterRegAllocBase(N, D, C) {}
79 };
80 
81 static bool onlyAllocateSGPRs(const TargetRegisterInfo &TRI,
82                               const TargetRegisterClass &RC) {
83   return static_cast<const SIRegisterInfo &>(TRI).isSGPRClass(&RC);
84 }
85 
86 static bool onlyAllocateVGPRs(const TargetRegisterInfo &TRI,
87                               const TargetRegisterClass &RC) {
88   return !static_cast<const SIRegisterInfo &>(TRI).isSGPRClass(&RC);
89 }
90 
91 
92 /// -{sgpr|vgpr}-regalloc=... command line option.
93 static FunctionPass *useDefaultRegisterAllocator() { return nullptr; }
94 
95 /// A dummy default pass factory indicates whether the register allocator is
96 /// overridden on the command line.
97 static llvm::once_flag InitializeDefaultSGPRRegisterAllocatorFlag;
98 static llvm::once_flag InitializeDefaultVGPRRegisterAllocatorFlag;
99 
100 static SGPRRegisterRegAlloc
101 defaultSGPRRegAlloc("default",
102                     "pick SGPR register allocator based on -O option",
103                     useDefaultRegisterAllocator);
104 
105 static cl::opt<SGPRRegisterRegAlloc::FunctionPassCtor, false,
106                RegisterPassParser<SGPRRegisterRegAlloc>>
107 SGPRRegAlloc("sgpr-regalloc", cl::Hidden, cl::init(&useDefaultRegisterAllocator),
108              cl::desc("Register allocator to use for SGPRs"));
109 
110 static cl::opt<VGPRRegisterRegAlloc::FunctionPassCtor, false,
111                RegisterPassParser<VGPRRegisterRegAlloc>>
112 VGPRRegAlloc("vgpr-regalloc", cl::Hidden, cl::init(&useDefaultRegisterAllocator),
113              cl::desc("Register allocator to use for VGPRs"));
114 
115 
116 static void initializeDefaultSGPRRegisterAllocatorOnce() {
117   RegisterRegAlloc::FunctionPassCtor Ctor = SGPRRegisterRegAlloc::getDefault();
118 
119   if (!Ctor) {
120     Ctor = SGPRRegAlloc;
121     SGPRRegisterRegAlloc::setDefault(SGPRRegAlloc);
122   }
123 }
124 
125 static void initializeDefaultVGPRRegisterAllocatorOnce() {
126   RegisterRegAlloc::FunctionPassCtor Ctor = VGPRRegisterRegAlloc::getDefault();
127 
128   if (!Ctor) {
129     Ctor = VGPRRegAlloc;
130     VGPRRegisterRegAlloc::setDefault(VGPRRegAlloc);
131   }
132 }
133 
134 static FunctionPass *createBasicSGPRRegisterAllocator() {
135   return createBasicRegisterAllocator(onlyAllocateSGPRs);
136 }
137 
138 static FunctionPass *createGreedySGPRRegisterAllocator() {
139   return createGreedyRegisterAllocator(onlyAllocateSGPRs);
140 }
141 
142 static FunctionPass *createFastSGPRRegisterAllocator() {
143   return createFastRegisterAllocator(onlyAllocateSGPRs, false);
144 }
145 
146 static FunctionPass *createBasicVGPRRegisterAllocator() {
147   return createBasicRegisterAllocator(onlyAllocateVGPRs);
148 }
149 
150 static FunctionPass *createGreedyVGPRRegisterAllocator() {
151   return createGreedyRegisterAllocator(onlyAllocateVGPRs);
152 }
153 
154 static FunctionPass *createFastVGPRRegisterAllocator() {
155   return createFastRegisterAllocator(onlyAllocateVGPRs, true);
156 }
157 
158 static SGPRRegisterRegAlloc basicRegAllocSGPR(
159   "basic", "basic register allocator", createBasicSGPRRegisterAllocator);
160 static SGPRRegisterRegAlloc greedyRegAllocSGPR(
161   "greedy", "greedy register allocator", createGreedySGPRRegisterAllocator);
162 
163 static SGPRRegisterRegAlloc fastRegAllocSGPR(
164   "fast", "fast register allocator", createFastSGPRRegisterAllocator);
165 
166 
167 static VGPRRegisterRegAlloc basicRegAllocVGPR(
168   "basic", "basic register allocator", createBasicVGPRRegisterAllocator);
169 static VGPRRegisterRegAlloc greedyRegAllocVGPR(
170   "greedy", "greedy register allocator", createGreedyVGPRRegisterAllocator);
171 
172 static VGPRRegisterRegAlloc fastRegAllocVGPR(
173   "fast", "fast register allocator", createFastVGPRRegisterAllocator);
174 }
175 
176 static cl::opt<bool> EnableSROA(
177   "amdgpu-sroa",
178   cl::desc("Run SROA after promote alloca pass"),
179   cl::ReallyHidden,
180   cl::init(true));
181 
182 static cl::opt<bool>
183 EnableEarlyIfConversion("amdgpu-early-ifcvt", cl::Hidden,
184                         cl::desc("Run early if-conversion"),
185                         cl::init(false));
186 
187 static cl::opt<bool>
188 OptExecMaskPreRA("amdgpu-opt-exec-mask-pre-ra", cl::Hidden,
189             cl::desc("Run pre-RA exec mask optimizations"),
190             cl::init(true));
191 
192 static cl::opt<bool>
193     LowerCtorDtor("amdgpu-lower-global-ctor-dtor",
194                   cl::desc("Lower GPU ctor / dtors to globals on the device."),
195                   cl::init(true), cl::Hidden);
196 
197 // Option to disable vectorizer for tests.
198 static cl::opt<bool> EnableLoadStoreVectorizer(
199   "amdgpu-load-store-vectorizer",
200   cl::desc("Enable load store vectorizer"),
201   cl::init(true),
202   cl::Hidden);
203 
204 // Option to control global loads scalarization
205 static cl::opt<bool> ScalarizeGlobal(
206   "amdgpu-scalarize-global-loads",
207   cl::desc("Enable global load scalarization"),
208   cl::init(true),
209   cl::Hidden);
210 
211 // Option to run internalize pass.
212 static cl::opt<bool> InternalizeSymbols(
213   "amdgpu-internalize-symbols",
214   cl::desc("Enable elimination of non-kernel functions and unused globals"),
215   cl::init(false),
216   cl::Hidden);
217 
218 // Option to inline all early.
219 static cl::opt<bool> EarlyInlineAll(
220   "amdgpu-early-inline-all",
221   cl::desc("Inline all functions early"),
222   cl::init(false),
223   cl::Hidden);
224 
225 static cl::opt<bool> RemoveIncompatibleFunctions(
226     "amdgpu-enable-remove-incompatible-functions", cl::Hidden,
227     cl::desc("Enable removal of functions when they"
228              "use features not supported by the target GPU"),
229     cl::init(true));
230 
231 static cl::opt<bool> EnableSDWAPeephole(
232   "amdgpu-sdwa-peephole",
233   cl::desc("Enable SDWA peepholer"),
234   cl::init(true));
235 
236 static cl::opt<bool> EnableDPPCombine(
237   "amdgpu-dpp-combine",
238   cl::desc("Enable DPP combiner"),
239   cl::init(true));
240 
241 // Enable address space based alias analysis
242 static cl::opt<bool> EnableAMDGPUAliasAnalysis("enable-amdgpu-aa", cl::Hidden,
243   cl::desc("Enable AMDGPU Alias Analysis"),
244   cl::init(true));
245 
246 // Option to run late CFG structurizer
247 static cl::opt<bool, true> LateCFGStructurize(
248   "amdgpu-late-structurize",
249   cl::desc("Enable late CFG structurization"),
250   cl::location(AMDGPUTargetMachine::EnableLateStructurizeCFG),
251   cl::Hidden);
252 
253 // Enable lib calls simplifications
254 static cl::opt<bool> EnableLibCallSimplify(
255   "amdgpu-simplify-libcall",
256   cl::desc("Enable amdgpu library simplifications"),
257   cl::init(true),
258   cl::Hidden);
259 
260 static cl::opt<bool> EnableLowerKernelArguments(
261   "amdgpu-ir-lower-kernel-arguments",
262   cl::desc("Lower kernel argument loads in IR pass"),
263   cl::init(true),
264   cl::Hidden);
265 
266 static cl::opt<bool> EnableRegReassign(
267   "amdgpu-reassign-regs",
268   cl::desc("Enable register reassign optimizations on gfx10+"),
269   cl::init(true),
270   cl::Hidden);
271 
272 static cl::opt<bool> OptVGPRLiveRange(
273     "amdgpu-opt-vgpr-liverange",
274     cl::desc("Enable VGPR liverange optimizations for if-else structure"),
275     cl::init(true), cl::Hidden);
276 
277 static cl::opt<ScanOptions> AMDGPUAtomicOptimizerStrategy(
278     "amdgpu-atomic-optimizer-strategy",
279     cl::desc("Select DPP or Iterative strategy for scan"),
280     cl::init(ScanOptions::Iterative),
281     cl::values(
282         clEnumValN(ScanOptions::DPP, "DPP", "Use DPP operations for scan"),
283         clEnumValN(ScanOptions::Iterative, "Iterative",
284                    "Use Iterative approach for scan"),
285         clEnumValN(ScanOptions::None, "None", "Disable atomic optimizer")));
286 
287 // Enable Mode register optimization
288 static cl::opt<bool> EnableSIModeRegisterPass(
289   "amdgpu-mode-register",
290   cl::desc("Enable mode register pass"),
291   cl::init(true),
292   cl::Hidden);
293 
294 // Enable GFX11+ s_delay_alu insertion
295 static cl::opt<bool>
296     EnableInsertDelayAlu("amdgpu-enable-delay-alu",
297                          cl::desc("Enable s_delay_alu insertion"),
298                          cl::init(true), cl::Hidden);
299 
300 // Enable GFX11+ VOPD
301 static cl::opt<bool>
302     EnableVOPD("amdgpu-enable-vopd",
303                cl::desc("Enable VOPD, dual issue of VALU in wave32"),
304                cl::init(true), cl::Hidden);
305 
306 // Option is used in lit tests to prevent deadcoding of patterns inspected.
307 static cl::opt<bool>
308 EnableDCEInRA("amdgpu-dce-in-ra",
309     cl::init(true), cl::Hidden,
310     cl::desc("Enable machine DCE inside regalloc"));
311 
312 static cl::opt<bool> EnableSetWavePriority("amdgpu-set-wave-priority",
313                                            cl::desc("Adjust wave priority"),
314                                            cl::init(false), cl::Hidden);
315 
316 static cl::opt<bool> EnableScalarIRPasses(
317   "amdgpu-scalar-ir-passes",
318   cl::desc("Enable scalar IR passes"),
319   cl::init(true),
320   cl::Hidden);
321 
322 static cl::opt<bool> EnableStructurizerWorkarounds(
323     "amdgpu-enable-structurizer-workarounds",
324     cl::desc("Enable workarounds for the StructurizeCFG pass"), cl::init(true),
325     cl::Hidden);
326 
327 static cl::opt<bool, true> EnableLowerModuleLDS(
328     "amdgpu-enable-lower-module-lds", cl::desc("Enable lower module lds pass"),
329     cl::location(AMDGPUTargetMachine::EnableLowerModuleLDS), cl::init(true),
330     cl::Hidden);
331 
332 static cl::opt<bool> EnablePreRAOptimizations(
333     "amdgpu-enable-pre-ra-optimizations",
334     cl::desc("Enable Pre-RA optimizations pass"), cl::init(true),
335     cl::Hidden);
336 
337 static cl::opt<bool> EnablePromoteKernelArguments(
338     "amdgpu-enable-promote-kernel-arguments",
339     cl::desc("Enable promotion of flat kernel pointer arguments to global"),
340     cl::Hidden, cl::init(true));
341 
342 static cl::opt<bool> EnableMaxIlpSchedStrategy(
343     "amdgpu-enable-max-ilp-scheduling-strategy",
344     cl::desc("Enable scheduling strategy to maximize ILP for a single wave."),
345     cl::Hidden, cl::init(false));
346 
347 static cl::opt<bool> EnableRewritePartialRegUses(
348     "amdgpu-enable-rewrite-partial-reg-uses",
349     cl::desc("Enable rewrite partial reg uses pass"), cl::init(false),
350     cl::Hidden);
351 
352 extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
353   // Register the target
354   RegisterTargetMachine<R600TargetMachine> X(getTheR600Target());
355   RegisterTargetMachine<GCNTargetMachine> Y(getTheGCNTarget());
356 
357   PassRegistry *PR = PassRegistry::getPassRegistry();
358   initializeR600ClauseMergePassPass(*PR);
359   initializeR600ControlFlowFinalizerPass(*PR);
360   initializeR600PacketizerPass(*PR);
361   initializeR600ExpandSpecialInstrsPassPass(*PR);
362   initializeR600VectorRegMergerPass(*PR);
363   initializeGlobalISel(*PR);
364   initializeAMDGPUDAGToDAGISelPass(*PR);
365   initializeGCNDPPCombinePass(*PR);
366   initializeSILowerI1CopiesPass(*PR);
367   initializeSILowerSGPRSpillsPass(*PR);
368   initializeSIFixSGPRCopiesPass(*PR);
369   initializeSIFixVGPRCopiesPass(*PR);
370   initializeSIFoldOperandsPass(*PR);
371   initializeSIPeepholeSDWAPass(*PR);
372   initializeSIShrinkInstructionsPass(*PR);
373   initializeSIOptimizeExecMaskingPreRAPass(*PR);
374   initializeSIOptimizeVGPRLiveRangePass(*PR);
375   initializeSILoadStoreOptimizerPass(*PR);
376   initializeAMDGPUCtorDtorLoweringLegacyPass(*PR);
377   initializeAMDGPUAlwaysInlinePass(*PR);
378   initializeAMDGPUAttributorPass(*PR);
379   initializeAMDGPUAnnotateKernelFeaturesPass(*PR);
380   initializeAMDGPUAnnotateUniformValuesPass(*PR);
381   initializeAMDGPUArgumentUsageInfoPass(*PR);
382   initializeAMDGPUAtomicOptimizerPass(*PR);
383   initializeAMDGPULowerKernelArgumentsPass(*PR);
384   initializeAMDGPUPromoteKernelArgumentsPass(*PR);
385   initializeAMDGPULowerKernelAttributesPass(*PR);
386   initializeAMDGPUOpenCLEnqueuedBlockLoweringPass(*PR);
387   initializeAMDGPUPostLegalizerCombinerPass(*PR);
388   initializeAMDGPUPreLegalizerCombinerPass(*PR);
389   initializeAMDGPURegBankCombinerPass(*PR);
390   initializeAMDGPURegBankSelectPass(*PR);
391   initializeAMDGPUPromoteAllocaPass(*PR);
392   initializeAMDGPUPromoteAllocaToVectorPass(*PR);
393   initializeAMDGPUCodeGenPreparePass(*PR);
394   initializeAMDGPULateCodeGenPreparePass(*PR);
395   initializeAMDGPURemoveIncompatibleFunctionsPass(*PR);
396   initializeAMDGPULowerModuleLDSPass(*PR);
397   initializeAMDGPURewriteOutArgumentsPass(*PR);
398   initializeAMDGPURewriteUndefForPHIPass(*PR);
399   initializeAMDGPUUnifyMetadataPass(*PR);
400   initializeSIAnnotateControlFlowPass(*PR);
401   initializeAMDGPUInsertDelayAluPass(*PR);
402   initializeSIInsertHardClausesPass(*PR);
403   initializeSIInsertWaitcntsPass(*PR);
404   initializeSIModeRegisterPass(*PR);
405   initializeSIWholeQuadModePass(*PR);
406   initializeSILowerControlFlowPass(*PR);
407   initializeSIPreEmitPeepholePass(*PR);
408   initializeSILateBranchLoweringPass(*PR);
409   initializeSIMemoryLegalizerPass(*PR);
410   initializeSIOptimizeExecMaskingPass(*PR);
411   initializeSIPreAllocateWWMRegsPass(*PR);
412   initializeSIFormMemoryClausesPass(*PR);
413   initializeSIPostRABundlerPass(*PR);
414   initializeGCNCreateVOPDPass(*PR);
415   initializeAMDGPUUnifyDivergentExitNodesPass(*PR);
416   initializeAMDGPUAAWrapperPassPass(*PR);
417   initializeAMDGPUExternalAAWrapperPass(*PR);
418   initializeAMDGPUUseNativeCallsPass(*PR);
419   initializeAMDGPUSimplifyLibCallsPass(*PR);
420   initializeAMDGPUPrintfRuntimeBindingPass(*PR);
421   initializeAMDGPUResourceUsageAnalysisPass(*PR);
422   initializeGCNNSAReassignPass(*PR);
423   initializeGCNPreRAOptimizationsPass(*PR);
424   initializeGCNPreRALongBranchRegPass(*PR);
425   initializeGCNRewritePartialRegUsesPass(*PR);
426 }
427 
428 static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
429   return std::make_unique<AMDGPUTargetObjectFile>();
430 }
431 
432 static ScheduleDAGInstrs *createSIMachineScheduler(MachineSchedContext *C) {
433   return new SIScheduleDAGMI(C);
434 }
435 
436 static ScheduleDAGInstrs *
437 createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
438   const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
439   ScheduleDAGMILive *DAG =
440     new GCNScheduleDAGMILive(C, std::make_unique<GCNMaxOccupancySchedStrategy>(C));
441   DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
442   if (ST.shouldClusterStores())
443     DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
444   DAG->addMutation(createIGroupLPDAGMutation());
445   DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
446   DAG->addMutation(createAMDGPUExportClusteringDAGMutation());
447   return DAG;
448 }
449 
450 static ScheduleDAGInstrs *
451 createGCNMaxILPMachineScheduler(MachineSchedContext *C) {
452   ScheduleDAGMILive *DAG =
453       new GCNScheduleDAGMILive(C, std::make_unique<GCNMaxILPSchedStrategy>(C));
454   DAG->addMutation(createIGroupLPDAGMutation());
455   return DAG;
456 }
457 
458 static ScheduleDAGInstrs *
459 createIterativeGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
460   const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
461   auto DAG = new GCNIterativeScheduler(C,
462     GCNIterativeScheduler::SCHEDULE_LEGACYMAXOCCUPANCY);
463   DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
464   if (ST.shouldClusterStores())
465     DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
466   return DAG;
467 }
468 
469 static ScheduleDAGInstrs *createMinRegScheduler(MachineSchedContext *C) {
470   return new GCNIterativeScheduler(C,
471     GCNIterativeScheduler::SCHEDULE_MINREGFORCED);
472 }
473 
474 static ScheduleDAGInstrs *
475 createIterativeILPMachineScheduler(MachineSchedContext *C) {
476   const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
477   auto DAG = new GCNIterativeScheduler(C,
478     GCNIterativeScheduler::SCHEDULE_ILP);
479   DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
480   if (ST.shouldClusterStores())
481     DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
482   DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
483   return DAG;
484 }
485 
486 static MachineSchedRegistry
487 SISchedRegistry("si", "Run SI's custom scheduler",
488                 createSIMachineScheduler);
489 
490 static MachineSchedRegistry
491 GCNMaxOccupancySchedRegistry("gcn-max-occupancy",
492                              "Run GCN scheduler to maximize occupancy",
493                              createGCNMaxOccupancyMachineScheduler);
494 
495 static MachineSchedRegistry
496     GCNMaxILPSchedRegistry("gcn-max-ilp", "Run GCN scheduler to maximize ilp",
497                            createGCNMaxILPMachineScheduler);
498 
499 static MachineSchedRegistry IterativeGCNMaxOccupancySchedRegistry(
500     "gcn-iterative-max-occupancy-experimental",
501     "Run GCN scheduler to maximize occupancy (experimental)",
502     createIterativeGCNMaxOccupancyMachineScheduler);
503 
504 static MachineSchedRegistry GCNMinRegSchedRegistry(
505     "gcn-iterative-minreg",
506     "Run GCN iterative scheduler for minimal register usage (experimental)",
507     createMinRegScheduler);
508 
509 static MachineSchedRegistry GCNILPSchedRegistry(
510     "gcn-iterative-ilp",
511     "Run GCN iterative scheduler for ILP scheduling (experimental)",
512     createIterativeILPMachineScheduler);
513 
514 static StringRef computeDataLayout(const Triple &TT) {
515   if (TT.getArch() == Triple::r600) {
516     // 32-bit pointers.
517     return "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
518            "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1";
519   }
520 
521   // 32-bit private, local, and region pointers. 64-bit global, constant and
522   // flat. 160-bit non-integral fat buffer pointers that include a 128-bit
523   // buffer descriptor and a 32-bit offset, which are indexed by 32-bit values
524   // (address space 7), and 128-bit non-integral buffer resourcees (address
525   // space 8) which cannot be non-trivilally accessed by LLVM memory operations
526   // like getelementptr.
527   return "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32"
528          "-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:"
529          "128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-"
530          "G1-ni:7:8";
531 }
532 
533 LLVM_READNONE
534 static StringRef getGPUOrDefault(const Triple &TT, StringRef GPU) {
535   if (!GPU.empty())
536     return GPU;
537 
538   // Need to default to a target with flat support for HSA.
539   if (TT.getArch() == Triple::amdgcn)
540     return TT.getOS() == Triple::AMDHSA ? "generic-hsa" : "generic";
541 
542   return "r600";
543 }
544 
545 static Reloc::Model getEffectiveRelocModel(std::optional<Reloc::Model> RM) {
546   // The AMDGPU toolchain only supports generating shared objects, so we
547   // must always use PIC.
548   return Reloc::PIC_;
549 }
550 
551 AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT,
552                                          StringRef CPU, StringRef FS,
553                                          TargetOptions Options,
554                                          std::optional<Reloc::Model> RM,
555                                          std::optional<CodeModel::Model> CM,
556                                          CodeGenOpt::Level OptLevel)
557     : LLVMTargetMachine(T, computeDataLayout(TT), TT, getGPUOrDefault(TT, CPU),
558                         FS, Options, getEffectiveRelocModel(RM),
559                         getEffectiveCodeModel(CM, CodeModel::Small), OptLevel),
560       TLOF(createTLOF(getTargetTriple())) {
561   initAsmInfo();
562   if (TT.getArch() == Triple::amdgcn) {
563     if (getMCSubtargetInfo()->checkFeatures("+wavefrontsize64"))
564       MRI.reset(llvm::createGCNMCRegisterInfo(AMDGPUDwarfFlavour::Wave64));
565     else if (getMCSubtargetInfo()->checkFeatures("+wavefrontsize32"))
566       MRI.reset(llvm::createGCNMCRegisterInfo(AMDGPUDwarfFlavour::Wave32));
567   }
568 }
569 
570 bool AMDGPUTargetMachine::EnableLateStructurizeCFG = false;
571 bool AMDGPUTargetMachine::EnableFunctionCalls = false;
572 bool AMDGPUTargetMachine::EnableLowerModuleLDS = true;
573 
574 AMDGPUTargetMachine::~AMDGPUTargetMachine() = default;
575 
576 StringRef AMDGPUTargetMachine::getGPUName(const Function &F) const {
577   Attribute GPUAttr = F.getFnAttribute("target-cpu");
578   return GPUAttr.isValid() ? GPUAttr.getValueAsString() : getTargetCPU();
579 }
580 
581 StringRef AMDGPUTargetMachine::getFeatureString(const Function &F) const {
582   Attribute FSAttr = F.getFnAttribute("target-features");
583 
584   return FSAttr.isValid() ? FSAttr.getValueAsString()
585                           : getTargetFeatureString();
586 }
587 
588 /// Predicate for Internalize pass.
589 static bool mustPreserveGV(const GlobalValue &GV) {
590   if (const Function *F = dyn_cast<Function>(&GV))
591     return F->isDeclaration() || F->getName().startswith("__asan_") ||
592            F->getName().startswith("__sanitizer_") ||
593            AMDGPU::isEntryFunctionCC(F->getCallingConv());
594 
595   GV.removeDeadConstantUsers();
596   return !GV.use_empty();
597 }
598 
599 void AMDGPUTargetMachine::registerDefaultAliasAnalyses(AAManager &AAM) {
600   AAM.registerFunctionAnalysis<AMDGPUAA>();
601 }
602 
603 void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
604   PB.registerPipelineParsingCallback(
605       [](StringRef PassName, ModulePassManager &PM,
606          ArrayRef<PassBuilder::PipelineElement>) {
607         if (PassName == "amdgpu-unify-metadata") {
608           PM.addPass(AMDGPUUnifyMetadataPass());
609           return true;
610         }
611         if (PassName == "amdgpu-printf-runtime-binding") {
612           PM.addPass(AMDGPUPrintfRuntimeBindingPass());
613           return true;
614         }
615         if (PassName == "amdgpu-always-inline") {
616           PM.addPass(AMDGPUAlwaysInlinePass());
617           return true;
618         }
619         if (PassName == "amdgpu-lower-module-lds") {
620           PM.addPass(AMDGPULowerModuleLDSPass());
621           return true;
622         }
623         if (PassName == "amdgpu-lower-ctor-dtor") {
624           PM.addPass(AMDGPUCtorDtorLoweringPass());
625           return true;
626         }
627         return false;
628       });
629   PB.registerPipelineParsingCallback(
630       [this](StringRef PassName, FunctionPassManager &PM,
631              ArrayRef<PassBuilder::PipelineElement>) {
632         if (PassName == "amdgpu-simplifylib") {
633           PM.addPass(AMDGPUSimplifyLibCallsPass(*this));
634           return true;
635         }
636         if (PassName == "amdgpu-usenative") {
637           PM.addPass(AMDGPUUseNativeCallsPass());
638           return true;
639         }
640         if (PassName == "amdgpu-promote-alloca") {
641           PM.addPass(AMDGPUPromoteAllocaPass(*this));
642           return true;
643         }
644         if (PassName == "amdgpu-promote-alloca-to-vector") {
645           PM.addPass(AMDGPUPromoteAllocaToVectorPass(*this));
646           return true;
647         }
648         if (PassName == "amdgpu-lower-kernel-attributes") {
649           PM.addPass(AMDGPULowerKernelAttributesPass());
650           return true;
651         }
652         if (PassName == "amdgpu-promote-kernel-arguments") {
653           PM.addPass(AMDGPUPromoteKernelArgumentsPass());
654           return true;
655         }
656         if (PassName == "amdgpu-unify-divergent-exit-nodes") {
657           PM.addPass(AMDGPUUnifyDivergentExitNodesPass());
658           return true;
659         }
660         if (PassName == "amdgpu-atomic-optimizer") {
661           PM.addPass(
662               AMDGPUAtomicOptimizerPass(*this, AMDGPUAtomicOptimizerStrategy));
663           return true;
664         }
665         if (PassName == "amdgpu-codegenprepare") {
666           PM.addPass(AMDGPUCodeGenPreparePass(*this));
667           return true;
668         }
669         return false;
670       });
671 
672   PB.registerAnalysisRegistrationCallback([](FunctionAnalysisManager &FAM) {
673     FAM.registerPass([&] { return AMDGPUAA(); });
674   });
675 
676   PB.registerParseAACallback([](StringRef AAName, AAManager &AAM) {
677     if (AAName == "amdgpu-aa") {
678       AAM.registerFunctionAnalysis<AMDGPUAA>();
679       return true;
680     }
681     return false;
682   });
683 
684   PB.registerPipelineStartEPCallback(
685       [this](ModulePassManager &PM, OptimizationLevel Level) {
686         FunctionPassManager FPM;
687         FPM.addPass(AMDGPUUseNativeCallsPass());
688         if (EnableLibCallSimplify && Level != OptimizationLevel::O0)
689           FPM.addPass(AMDGPUSimplifyLibCallsPass(*this));
690         PM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
691       });
692 
693   PB.registerPipelineEarlySimplificationEPCallback(
694       [](ModulePassManager &PM, OptimizationLevel Level) {
695         PM.addPass(AMDGPUPrintfRuntimeBindingPass());
696 
697         if (Level == OptimizationLevel::O0)
698           return;
699 
700         PM.addPass(AMDGPUUnifyMetadataPass());
701 
702         if (InternalizeSymbols) {
703           PM.addPass(InternalizePass(mustPreserveGV));
704           PM.addPass(GlobalDCEPass());
705         }
706 
707         if (EarlyInlineAll && !EnableFunctionCalls)
708           PM.addPass(AMDGPUAlwaysInlinePass());
709       });
710 
711   PB.registerCGSCCOptimizerLateEPCallback(
712       [this](CGSCCPassManager &PM, OptimizationLevel Level) {
713         if (Level == OptimizationLevel::O0)
714           return;
715 
716         FunctionPassManager FPM;
717 
718         // Add promote kernel arguments pass to the opt pipeline right before
719         // infer address spaces which is needed to do actual address space
720         // rewriting.
721         if (Level.getSpeedupLevel() > OptimizationLevel::O1.getSpeedupLevel() &&
722             EnablePromoteKernelArguments)
723           FPM.addPass(AMDGPUPromoteKernelArgumentsPass());
724 
725         // Add infer address spaces pass to the opt pipeline after inlining
726         // but before SROA to increase SROA opportunities.
727         FPM.addPass(InferAddressSpacesPass());
728 
729         // This should run after inlining to have any chance of doing
730         // anything, and before other cleanup optimizations.
731         FPM.addPass(AMDGPULowerKernelAttributesPass());
732 
733         if (Level != OptimizationLevel::O0) {
734           // Promote alloca to vector before SROA and loop unroll. If we
735           // manage to eliminate allocas before unroll we may choose to unroll
736           // less.
737           FPM.addPass(AMDGPUPromoteAllocaToVectorPass(*this));
738         }
739 
740         PM.addPass(createCGSCCToFunctionPassAdaptor(std::move(FPM)));
741       });
742 }
743 
744 int64_t AMDGPUTargetMachine::getNullPointerValue(unsigned AddrSpace) {
745   return (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
746           AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ||
747           AddrSpace == AMDGPUAS::REGION_ADDRESS)
748              ? -1
749              : 0;
750 }
751 
752 bool AMDGPUTargetMachine::isNoopAddrSpaceCast(unsigned SrcAS,
753                                               unsigned DestAS) const {
754   return AMDGPU::isFlatGlobalAddrSpace(SrcAS) &&
755          AMDGPU::isFlatGlobalAddrSpace(DestAS);
756 }
757 
758 unsigned AMDGPUTargetMachine::getAssumedAddrSpace(const Value *V) const {
759   const auto *LD = dyn_cast<LoadInst>(V);
760   if (!LD)
761     return AMDGPUAS::UNKNOWN_ADDRESS_SPACE;
762 
763   // It must be a generic pointer loaded.
764   assert(V->getType()->isPointerTy() &&
765          V->getType()->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS);
766 
767   const auto *Ptr = LD->getPointerOperand();
768   if (Ptr->getType()->getPointerAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS)
769     return AMDGPUAS::UNKNOWN_ADDRESS_SPACE;
770   // For a generic pointer loaded from the constant memory, it could be assumed
771   // as a global pointer since the constant memory is only populated on the
772   // host side. As implied by the offload programming model, only global
773   // pointers could be referenced on the host side.
774   return AMDGPUAS::GLOBAL_ADDRESS;
775 }
776 
777 std::pair<const Value *, unsigned>
778 AMDGPUTargetMachine::getPredicatedAddrSpace(const Value *V) const {
779   if (auto *II = dyn_cast<IntrinsicInst>(V)) {
780     switch (II->getIntrinsicID()) {
781     case Intrinsic::amdgcn_is_shared:
782       return std::pair(II->getArgOperand(0), AMDGPUAS::LOCAL_ADDRESS);
783     case Intrinsic::amdgcn_is_private:
784       return std::pair(II->getArgOperand(0), AMDGPUAS::PRIVATE_ADDRESS);
785     default:
786       break;
787     }
788     return std::pair(nullptr, -1);
789   }
790   // Check the global pointer predication based on
791   // (!is_share(p) && !is_private(p)). Note that logic 'and' is commutative and
792   // the order of 'is_shared' and 'is_private' is not significant.
793   Value *Ptr;
794   if (match(
795           const_cast<Value *>(V),
796           m_c_And(m_Not(m_Intrinsic<Intrinsic::amdgcn_is_shared>(m_Value(Ptr))),
797                   m_Not(m_Intrinsic<Intrinsic::amdgcn_is_private>(
798                       m_Deferred(Ptr))))))
799     return std::pair(Ptr, AMDGPUAS::GLOBAL_ADDRESS);
800 
801   return std::pair(nullptr, -1);
802 }
803 
804 unsigned
805 AMDGPUTargetMachine::getAddressSpaceForPseudoSourceKind(unsigned Kind) const {
806   switch (Kind) {
807   case PseudoSourceValue::Stack:
808   case PseudoSourceValue::FixedStack:
809     return AMDGPUAS::PRIVATE_ADDRESS;
810   case PseudoSourceValue::ConstantPool:
811   case PseudoSourceValue::GOT:
812   case PseudoSourceValue::JumpTable:
813   case PseudoSourceValue::GlobalValueCallEntry:
814   case PseudoSourceValue::ExternalSymbolCallEntry:
815     return AMDGPUAS::CONSTANT_ADDRESS;
816   }
817   return AMDGPUAS::FLAT_ADDRESS;
818 }
819 
820 //===----------------------------------------------------------------------===//
821 // GCN Target Machine (SI+)
822 //===----------------------------------------------------------------------===//
823 
824 GCNTargetMachine::GCNTargetMachine(const Target &T, const Triple &TT,
825                                    StringRef CPU, StringRef FS,
826                                    TargetOptions Options,
827                                    std::optional<Reloc::Model> RM,
828                                    std::optional<CodeModel::Model> CM,
829                                    CodeGenOpt::Level OL, bool JIT)
830     : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {}
831 
832 const TargetSubtargetInfo *
833 GCNTargetMachine::getSubtargetImpl(const Function &F) const {
834   StringRef GPU = getGPUName(F);
835   StringRef FS = getFeatureString(F);
836 
837   SmallString<128> SubtargetKey(GPU);
838   SubtargetKey.append(FS);
839 
840   auto &I = SubtargetMap[SubtargetKey];
841   if (!I) {
842     // This needs to be done before we create a new subtarget since any
843     // creation will depend on the TM and the code generation flags on the
844     // function that reside in TargetOptions.
845     resetTargetOptions(F);
846     I = std::make_unique<GCNSubtarget>(TargetTriple, GPU, FS, *this);
847   }
848 
849   I->setScalarizeGlobalBehavior(ScalarizeGlobal);
850 
851   return I.get();
852 }
853 
854 TargetTransformInfo
855 GCNTargetMachine::getTargetTransformInfo(const Function &F) const {
856   return TargetTransformInfo(GCNTTIImpl(this, F));
857 }
858 
859 //===----------------------------------------------------------------------===//
860 // AMDGPU Pass Setup
861 //===----------------------------------------------------------------------===//
862 
863 std::unique_ptr<CSEConfigBase> llvm::AMDGPUPassConfig::getCSEConfig() const {
864   return getStandardCSEConfigForOpt(TM->getOptLevel());
865 }
866 
867 namespace {
868 
869 class GCNPassConfig final : public AMDGPUPassConfig {
870 public:
871   GCNPassConfig(LLVMTargetMachine &TM, PassManagerBase &PM)
872     : AMDGPUPassConfig(TM, PM) {
873     // It is necessary to know the register usage of the entire call graph.  We
874     // allow calls without EnableAMDGPUFunctionCalls if they are marked
875     // noinline, so this is always required.
876     setRequiresCodeGenSCCOrder(true);
877     substitutePass(&PostRASchedulerID, &PostMachineSchedulerID);
878   }
879 
880   GCNTargetMachine &getGCNTargetMachine() const {
881     return getTM<GCNTargetMachine>();
882   }
883 
884   ScheduleDAGInstrs *
885   createMachineScheduler(MachineSchedContext *C) const override;
886 
887   ScheduleDAGInstrs *
888   createPostMachineScheduler(MachineSchedContext *C) const override {
889     ScheduleDAGMI *DAG = new GCNPostScheduleDAGMILive(
890         C, std::make_unique<PostGenericScheduler>(C),
891         /*RemoveKillFlags=*/true);
892     const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
893     DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
894     if (ST.shouldClusterStores())
895       DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
896     DAG->addMutation(ST.createFillMFMAShadowMutation(DAG->TII));
897     DAG->addMutation(createIGroupLPDAGMutation());
898     if (isPassEnabled(EnableVOPD, CodeGenOpt::Less))
899       DAG->addMutation(createVOPDPairingMutation());
900     return DAG;
901   }
902 
903   bool addPreISel() override;
904   void addMachineSSAOptimization() override;
905   bool addILPOpts() override;
906   bool addInstSelector() override;
907   bool addIRTranslator() override;
908   void addPreLegalizeMachineIR() override;
909   bool addLegalizeMachineIR() override;
910   void addPreRegBankSelect() override;
911   bool addRegBankSelect() override;
912   void addPreGlobalInstructionSelect() override;
913   bool addGlobalInstructionSelect() override;
914   void addFastRegAlloc() override;
915   void addOptimizedRegAlloc() override;
916 
917   FunctionPass *createSGPRAllocPass(bool Optimized);
918   FunctionPass *createVGPRAllocPass(bool Optimized);
919   FunctionPass *createRegAllocPass(bool Optimized) override;
920 
921   bool addRegAssignAndRewriteFast() override;
922   bool addRegAssignAndRewriteOptimized() override;
923 
924   void addPreRegAlloc() override;
925   bool addPreRewrite() override;
926   void addPostRegAlloc() override;
927   void addPreSched2() override;
928   void addPreEmitPass() override;
929 };
930 
931 } // end anonymous namespace
932 
933 AMDGPUPassConfig::AMDGPUPassConfig(LLVMTargetMachine &TM, PassManagerBase &PM)
934     : TargetPassConfig(TM, PM) {
935   // Exceptions and StackMaps are not supported, so these passes will never do
936   // anything.
937   disablePass(&StackMapLivenessID);
938   disablePass(&FuncletLayoutID);
939   // Garbage collection is not supported.
940   disablePass(&GCLoweringID);
941   disablePass(&ShadowStackGCLoweringID);
942 }
943 
944 void AMDGPUPassConfig::addEarlyCSEOrGVNPass() {
945   if (getOptLevel() == CodeGenOpt::Aggressive)
946     addPass(createGVNPass());
947   else
948     addPass(createEarlyCSEPass());
949 }
950 
951 void AMDGPUPassConfig::addStraightLineScalarOptimizationPasses() {
952   addPass(createSeparateConstOffsetFromGEPPass());
953   // ReassociateGEPs exposes more opportunities for SLSR. See
954   // the example in reassociate-geps-and-slsr.ll.
955   addPass(createStraightLineStrengthReducePass());
956   // SeparateConstOffsetFromGEP and SLSR creates common expressions which GVN or
957   // EarlyCSE can reuse.
958   addEarlyCSEOrGVNPass();
959   // Run NaryReassociate after EarlyCSE/GVN to be more effective.
960   addPass(createNaryReassociatePass());
961   // NaryReassociate on GEPs creates redundant common expressions, so run
962   // EarlyCSE after it.
963   addPass(createEarlyCSEPass());
964 }
965 
966 void AMDGPUPassConfig::addIRPasses() {
967   const AMDGPUTargetMachine &TM = getAMDGPUTargetMachine();
968 
969   // There is no reason to run these.
970   disablePass(&StackMapLivenessID);
971   disablePass(&FuncletLayoutID);
972   disablePass(&PatchableFunctionID);
973 
974   addPass(createAMDGPUPrintfRuntimeBinding());
975   if (LowerCtorDtor)
976     addPass(createAMDGPUCtorDtorLoweringLegacyPass());
977 
978   // Function calls are not supported, so make sure we inline everything.
979   addPass(createAMDGPUAlwaysInlinePass());
980   addPass(createAlwaysInlinerLegacyPass());
981 
982   // Handle uses of OpenCL image2d_t, image3d_t and sampler_t arguments.
983   if (TM.getTargetTriple().getArch() == Triple::r600)
984     addPass(createR600OpenCLImageTypeLoweringPass());
985 
986   // Replace OpenCL enqueued block function pointers with global variables.
987   addPass(createAMDGPUOpenCLEnqueuedBlockLoweringPass());
988 
989   // Runs before PromoteAlloca so the latter can account for function uses
990   if (EnableLowerModuleLDS) {
991     addPass(createAMDGPULowerModuleLDSPass());
992   }
993 
994   // AMDGPUAttributor infers lack of llvm.amdgcn.lds.kernel.id calls, so run
995   // after their introduction
996   if (TM.getOptLevel() > CodeGenOpt::None)
997     addPass(createAMDGPUAttributorPass());
998 
999   if (TM.getOptLevel() > CodeGenOpt::None)
1000     addPass(createInferAddressSpacesPass());
1001 
1002   addPass(createAtomicExpandPass());
1003 
1004   if (TM.getOptLevel() > CodeGenOpt::None) {
1005     addPass(createAMDGPUPromoteAlloca());
1006 
1007     if (EnableSROA)
1008       addPass(createSROAPass());
1009     if (isPassEnabled(EnableScalarIRPasses))
1010       addStraightLineScalarOptimizationPasses();
1011 
1012     if (EnableAMDGPUAliasAnalysis) {
1013       addPass(createAMDGPUAAWrapperPass());
1014       addPass(createExternalAAWrapperPass([](Pass &P, Function &,
1015                                              AAResults &AAR) {
1016         if (auto *WrapperPass = P.getAnalysisIfAvailable<AMDGPUAAWrapperPass>())
1017           AAR.addAAResult(WrapperPass->getResult());
1018         }));
1019     }
1020 
1021     if (TM.getTargetTriple().getArch() == Triple::amdgcn) {
1022       // TODO: May want to move later or split into an early and late one.
1023       addPass(createAMDGPUCodeGenPreparePass());
1024     }
1025 
1026     // Try to hoist loop invariant parts of divisions AMDGPUCodeGenPrepare may
1027     // have expanded.
1028     if (TM.getOptLevel() > CodeGenOpt::Less)
1029       addPass(createLICMPass());
1030   }
1031 
1032   TargetPassConfig::addIRPasses();
1033 
1034   // EarlyCSE is not always strong enough to clean up what LSR produces. For
1035   // example, GVN can combine
1036   //
1037   //   %0 = add %a, %b
1038   //   %1 = add %b, %a
1039   //
1040   // and
1041   //
1042   //   %0 = shl nsw %a, 2
1043   //   %1 = shl %a, 2
1044   //
1045   // but EarlyCSE can do neither of them.
1046   if (isPassEnabled(EnableScalarIRPasses))
1047     addEarlyCSEOrGVNPass();
1048 }
1049 
1050 void AMDGPUPassConfig::addCodeGenPrepare() {
1051   if (TM->getTargetTriple().getArch() == Triple::amdgcn) {
1052     if (RemoveIncompatibleFunctions)
1053       addPass(createAMDGPURemoveIncompatibleFunctionsPass(TM));
1054 
1055     // FIXME: This pass adds 2 hacky attributes that can be replaced with an
1056     // analysis, and should be removed.
1057     addPass(createAMDGPUAnnotateKernelFeaturesPass());
1058   }
1059 
1060   if (TM->getTargetTriple().getArch() == Triple::amdgcn &&
1061       EnableLowerKernelArguments)
1062     addPass(createAMDGPULowerKernelArgumentsPass());
1063 
1064   TargetPassConfig::addCodeGenPrepare();
1065 
1066   if (isPassEnabled(EnableLoadStoreVectorizer))
1067     addPass(createLoadStoreVectorizerPass());
1068 
1069   // LowerSwitch pass may introduce unreachable blocks that can
1070   // cause unexpected behavior for subsequent passes. Placing it
1071   // here seems better that these blocks would get cleaned up by
1072   // UnreachableBlockElim inserted next in the pass flow.
1073   addPass(createLowerSwitchPass());
1074 }
1075 
1076 bool AMDGPUPassConfig::addPreISel() {
1077   if (TM->getOptLevel() > CodeGenOpt::None)
1078     addPass(createFlattenCFGPass());
1079   return false;
1080 }
1081 
1082 bool AMDGPUPassConfig::addInstSelector() {
1083   addPass(createAMDGPUISelDag(getAMDGPUTargetMachine(), getOptLevel()));
1084   return false;
1085 }
1086 
1087 bool AMDGPUPassConfig::addGCPasses() {
1088   // Do nothing. GC is not supported.
1089   return false;
1090 }
1091 
1092 llvm::ScheduleDAGInstrs *
1093 AMDGPUPassConfig::createMachineScheduler(MachineSchedContext *C) const {
1094   const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
1095   ScheduleDAGMILive *DAG = createGenericSchedLive(C);
1096   DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
1097   if (ST.shouldClusterStores())
1098     DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
1099   return DAG;
1100 }
1101 
1102 MachineFunctionInfo *R600TargetMachine::createMachineFunctionInfo(
1103     BumpPtrAllocator &Allocator, const Function &F,
1104     const TargetSubtargetInfo *STI) const {
1105   return R600MachineFunctionInfo::create<R600MachineFunctionInfo>(
1106       Allocator, F, static_cast<const R600Subtarget *>(STI));
1107 }
1108 
1109 //===----------------------------------------------------------------------===//
1110 // GCN Pass Setup
1111 //===----------------------------------------------------------------------===//
1112 
1113 ScheduleDAGInstrs *GCNPassConfig::createMachineScheduler(
1114   MachineSchedContext *C) const {
1115   const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
1116   if (ST.enableSIScheduler())
1117     return createSIMachineScheduler(C);
1118 
1119   if (EnableMaxIlpSchedStrategy)
1120     return createGCNMaxILPMachineScheduler(C);
1121 
1122   return createGCNMaxOccupancyMachineScheduler(C);
1123 }
1124 
1125 bool GCNPassConfig::addPreISel() {
1126   AMDGPUPassConfig::addPreISel();
1127 
1128   if (TM->getOptLevel() > CodeGenOpt::None)
1129     addPass(createAMDGPULateCodeGenPreparePass());
1130 
1131   if ((TM->getOptLevel() >= CodeGenOpt::Less) &&
1132       (AMDGPUAtomicOptimizerStrategy != ScanOptions::None)) {
1133     addPass(createAMDGPUAtomicOptimizerPass(AMDGPUAtomicOptimizerStrategy));
1134   }
1135 
1136   if (TM->getOptLevel() > CodeGenOpt::None)
1137     addPass(createSinkingPass());
1138 
1139   // Merge divergent exit nodes. StructurizeCFG won't recognize the multi-exit
1140   // regions formed by them.
1141   addPass(&AMDGPUUnifyDivergentExitNodesID);
1142   if (!LateCFGStructurize) {
1143     if (EnableStructurizerWorkarounds) {
1144       addPass(createFixIrreduciblePass());
1145       addPass(createUnifyLoopExitsPass());
1146     }
1147     addPass(createStructurizeCFGPass(false)); // true -> SkipUniformRegions
1148   }
1149   addPass(createAMDGPUAnnotateUniformValues());
1150   if (!LateCFGStructurize) {
1151     addPass(createSIAnnotateControlFlowPass());
1152     // TODO: Move this right after structurizeCFG to avoid extra divergence
1153     // analysis. This depends on stopping SIAnnotateControlFlow from making
1154     // control flow modifications.
1155     addPass(createAMDGPURewriteUndefForPHIPass());
1156   }
1157   addPass(createLCSSAPass());
1158 
1159   if (TM->getOptLevel() > CodeGenOpt::Less)
1160     addPass(&AMDGPUPerfHintAnalysisID);
1161 
1162   return false;
1163 }
1164 
1165 void GCNPassConfig::addMachineSSAOptimization() {
1166   TargetPassConfig::addMachineSSAOptimization();
1167 
1168   // We want to fold operands after PeepholeOptimizer has run (or as part of
1169   // it), because it will eliminate extra copies making it easier to fold the
1170   // real source operand. We want to eliminate dead instructions after, so that
1171   // we see fewer uses of the copies. We then need to clean up the dead
1172   // instructions leftover after the operands are folded as well.
1173   //
1174   // XXX - Can we get away without running DeadMachineInstructionElim again?
1175   addPass(&SIFoldOperandsID);
1176   if (EnableDPPCombine)
1177     addPass(&GCNDPPCombineID);
1178   addPass(&SILoadStoreOptimizerID);
1179   if (isPassEnabled(EnableSDWAPeephole)) {
1180     addPass(&SIPeepholeSDWAID);
1181     addPass(&EarlyMachineLICMID);
1182     addPass(&MachineCSEID);
1183     addPass(&SIFoldOperandsID);
1184   }
1185   addPass(&DeadMachineInstructionElimID);
1186   addPass(createSIShrinkInstructionsPass());
1187 }
1188 
1189 bool GCNPassConfig::addILPOpts() {
1190   if (EnableEarlyIfConversion)
1191     addPass(&EarlyIfConverterID);
1192 
1193   TargetPassConfig::addILPOpts();
1194   return false;
1195 }
1196 
1197 bool GCNPassConfig::addInstSelector() {
1198   AMDGPUPassConfig::addInstSelector();
1199   addPass(&SIFixSGPRCopiesID);
1200   addPass(createSILowerI1CopiesPass());
1201   return false;
1202 }
1203 
1204 bool GCNPassConfig::addIRTranslator() {
1205   addPass(new IRTranslator(getOptLevel()));
1206   return false;
1207 }
1208 
1209 void GCNPassConfig::addPreLegalizeMachineIR() {
1210   bool IsOptNone = getOptLevel() == CodeGenOpt::None;
1211   addPass(createAMDGPUPreLegalizeCombiner(IsOptNone));
1212   addPass(new Localizer());
1213 }
1214 
1215 bool GCNPassConfig::addLegalizeMachineIR() {
1216   addPass(new Legalizer());
1217   return false;
1218 }
1219 
1220 void GCNPassConfig::addPreRegBankSelect() {
1221   bool IsOptNone = getOptLevel() == CodeGenOpt::None;
1222   addPass(createAMDGPUPostLegalizeCombiner(IsOptNone));
1223 }
1224 
1225 bool GCNPassConfig::addRegBankSelect() {
1226   addPass(new AMDGPURegBankSelect());
1227   return false;
1228 }
1229 
1230 void GCNPassConfig::addPreGlobalInstructionSelect() {
1231   bool IsOptNone = getOptLevel() == CodeGenOpt::None;
1232   addPass(createAMDGPURegBankCombiner(IsOptNone));
1233 }
1234 
1235 bool GCNPassConfig::addGlobalInstructionSelect() {
1236   addPass(new InstructionSelect(getOptLevel()));
1237   return false;
1238 }
1239 
1240 void GCNPassConfig::addPreRegAlloc() {
1241   if (LateCFGStructurize) {
1242     addPass(createAMDGPUMachineCFGStructurizerPass());
1243   }
1244 }
1245 
1246 void GCNPassConfig::addFastRegAlloc() {
1247   // FIXME: We have to disable the verifier here because of PHIElimination +
1248   // TwoAddressInstructions disabling it.
1249 
1250   // This must be run immediately after phi elimination and before
1251   // TwoAddressInstructions, otherwise the processing of the tied operand of
1252   // SI_ELSE will introduce a copy of the tied operand source after the else.
1253   insertPass(&PHIEliminationID, &SILowerControlFlowID);
1254 
1255   insertPass(&TwoAddressInstructionPassID, &SIWholeQuadModeID);
1256   insertPass(&TwoAddressInstructionPassID, &SIPreAllocateWWMRegsID);
1257 
1258   TargetPassConfig::addFastRegAlloc();
1259 }
1260 
1261 void GCNPassConfig::addOptimizedRegAlloc() {
1262   // Allow the scheduler to run before SIWholeQuadMode inserts exec manipulation
1263   // instructions that cause scheduling barriers.
1264   insertPass(&MachineSchedulerID, &SIWholeQuadModeID);
1265   insertPass(&MachineSchedulerID, &SIPreAllocateWWMRegsID);
1266 
1267   if (OptExecMaskPreRA)
1268     insertPass(&MachineSchedulerID, &SIOptimizeExecMaskingPreRAID);
1269 
1270   if (EnableRewritePartialRegUses)
1271     insertPass(&RenameIndependentSubregsID, &GCNRewritePartialRegUsesID);
1272 
1273   if (isPassEnabled(EnablePreRAOptimizations))
1274     insertPass(&RenameIndependentSubregsID, &GCNPreRAOptimizationsID);
1275 
1276   // This is not an essential optimization and it has a noticeable impact on
1277   // compilation time, so we only enable it from O2.
1278   if (TM->getOptLevel() > CodeGenOpt::Less)
1279     insertPass(&MachineSchedulerID, &SIFormMemoryClausesID);
1280 
1281   // FIXME: when an instruction has a Killed operand, and the instruction is
1282   // inside a bundle, seems only the BUNDLE instruction appears as the Kills of
1283   // the register in LiveVariables, this would trigger a failure in verifier,
1284   // we should fix it and enable the verifier.
1285   if (OptVGPRLiveRange)
1286     insertPass(&LiveVariablesID, &SIOptimizeVGPRLiveRangeID);
1287   // This must be run immediately after phi elimination and before
1288   // TwoAddressInstructions, otherwise the processing of the tied operand of
1289   // SI_ELSE will introduce a copy of the tied operand source after the else.
1290   insertPass(&PHIEliminationID, &SILowerControlFlowID);
1291 
1292   if (EnableDCEInRA)
1293     insertPass(&DetectDeadLanesID, &DeadMachineInstructionElimID);
1294 
1295   TargetPassConfig::addOptimizedRegAlloc();
1296 }
1297 
1298 bool GCNPassConfig::addPreRewrite() {
1299   if (EnableRegReassign)
1300     addPass(&GCNNSAReassignID);
1301   return true;
1302 }
1303 
1304 FunctionPass *GCNPassConfig::createSGPRAllocPass(bool Optimized) {
1305   // Initialize the global default.
1306   llvm::call_once(InitializeDefaultSGPRRegisterAllocatorFlag,
1307                   initializeDefaultSGPRRegisterAllocatorOnce);
1308 
1309   RegisterRegAlloc::FunctionPassCtor Ctor = SGPRRegisterRegAlloc::getDefault();
1310   if (Ctor != useDefaultRegisterAllocator)
1311     return Ctor();
1312 
1313   if (Optimized)
1314     return createGreedyRegisterAllocator(onlyAllocateSGPRs);
1315 
1316   return createFastRegisterAllocator(onlyAllocateSGPRs, false);
1317 }
1318 
1319 FunctionPass *GCNPassConfig::createVGPRAllocPass(bool Optimized) {
1320   // Initialize the global default.
1321   llvm::call_once(InitializeDefaultVGPRRegisterAllocatorFlag,
1322                   initializeDefaultVGPRRegisterAllocatorOnce);
1323 
1324   RegisterRegAlloc::FunctionPassCtor Ctor = VGPRRegisterRegAlloc::getDefault();
1325   if (Ctor != useDefaultRegisterAllocator)
1326     return Ctor();
1327 
1328   if (Optimized)
1329     return createGreedyVGPRRegisterAllocator();
1330 
1331   return createFastVGPRRegisterAllocator();
1332 }
1333 
1334 FunctionPass *GCNPassConfig::createRegAllocPass(bool Optimized) {
1335   llvm_unreachable("should not be used");
1336 }
1337 
1338 static const char RegAllocOptNotSupportedMessage[] =
1339   "-regalloc not supported with amdgcn. Use -sgpr-regalloc and -vgpr-regalloc";
1340 
1341 bool GCNPassConfig::addRegAssignAndRewriteFast() {
1342   if (!usingDefaultRegAlloc())
1343     report_fatal_error(RegAllocOptNotSupportedMessage);
1344 
1345   addPass(&GCNPreRALongBranchRegID);
1346 
1347   addPass(createSGPRAllocPass(false));
1348 
1349   // Equivalent of PEI for SGPRs.
1350   addPass(&SILowerSGPRSpillsID);
1351 
1352   addPass(createVGPRAllocPass(false));
1353   return true;
1354 }
1355 
1356 bool GCNPassConfig::addRegAssignAndRewriteOptimized() {
1357   if (!usingDefaultRegAlloc())
1358     report_fatal_error(RegAllocOptNotSupportedMessage);
1359 
1360   addPass(&GCNPreRALongBranchRegID);
1361 
1362   addPass(createSGPRAllocPass(true));
1363 
1364   // Commit allocated register changes. This is mostly necessary because too
1365   // many things rely on the use lists of the physical registers, such as the
1366   // verifier. This is only necessary with allocators which use LiveIntervals,
1367   // since FastRegAlloc does the replacements itself.
1368   addPass(createVirtRegRewriter(false));
1369 
1370   // Equivalent of PEI for SGPRs.
1371   addPass(&SILowerSGPRSpillsID);
1372 
1373   addPass(createVGPRAllocPass(true));
1374 
1375   addPreRewrite();
1376   addPass(&VirtRegRewriterID);
1377 
1378   return true;
1379 }
1380 
1381 void GCNPassConfig::addPostRegAlloc() {
1382   addPass(&SIFixVGPRCopiesID);
1383   if (getOptLevel() > CodeGenOpt::None)
1384     addPass(&SIOptimizeExecMaskingID);
1385   TargetPassConfig::addPostRegAlloc();
1386 }
1387 
1388 void GCNPassConfig::addPreSched2() {
1389   if (TM->getOptLevel() > CodeGenOpt::None)
1390     addPass(createSIShrinkInstructionsPass());
1391   addPass(&SIPostRABundlerID);
1392 }
1393 
1394 void GCNPassConfig::addPreEmitPass() {
1395   if (isPassEnabled(EnableVOPD, CodeGenOpt::Less))
1396     addPass(&GCNCreateVOPDID);
1397   addPass(createSIMemoryLegalizerPass());
1398   addPass(createSIInsertWaitcntsPass());
1399 
1400   addPass(createSIModeRegisterPass());
1401 
1402   if (getOptLevel() > CodeGenOpt::None)
1403     addPass(&SIInsertHardClausesID);
1404 
1405   addPass(&SILateBranchLoweringPassID);
1406   if (isPassEnabled(EnableSetWavePriority, CodeGenOpt::Less))
1407     addPass(createAMDGPUSetWavePriorityPass());
1408   if (getOptLevel() > CodeGenOpt::None)
1409     addPass(&SIPreEmitPeepholeID);
1410   // The hazard recognizer that runs as part of the post-ra scheduler does not
1411   // guarantee to be able handle all hazards correctly. This is because if there
1412   // are multiple scheduling regions in a basic block, the regions are scheduled
1413   // bottom up, so when we begin to schedule a region we don't know what
1414   // instructions were emitted directly before it.
1415   //
1416   // Here we add a stand-alone hazard recognizer pass which can handle all
1417   // cases.
1418   addPass(&PostRAHazardRecognizerID);
1419 
1420   if (isPassEnabled(EnableInsertDelayAlu, CodeGenOpt::Less))
1421     addPass(&AMDGPUInsertDelayAluID);
1422 
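  // Branch relaxation must come after every pass above that may add or resize
  // instructions, since it depends on accurate block offsets to decide which
  // branches need expanding.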
1423   addPass(&BranchRelaxationPassID);
1424 }
1425 
1426 TargetPassConfig *GCNTargetMachine::createPassConfig(PassManagerBase &PM) {
1427   return new GCNPassConfig(*this, PM);
1428 }
1429 
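// Attach the SI function info as a MachineRegisterInfo delegate so that it is
// notified of register events such as the creation of new virtual registers.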
1430 void GCNTargetMachine::registerMachineRegisterInfoCallback(
1431     MachineFunction &MF) const {
1432   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1433   MF.getRegInfo().addDelegate(MFI);
1434 }
1435 
1436 MachineFunctionInfo *GCNTargetMachine::createMachineFunctionInfo(
1437     BumpPtrAllocator &Allocator, const Function &F,
1438     const TargetSubtargetInfo *STI) const {
1439   return SIMachineFunctionInfo::create<SIMachineFunctionInfo>(
1440       Allocator, F, static_cast<const GCNSubtarget *>(STI));
1441 }
1442 
1443 yaml::MachineFunctionInfo *GCNTargetMachine::createDefaultFuncInfoYAML() const {
1444   return new yaml::SIMachineFunctionInfo();
1445 }
1446 
1447 yaml::MachineFunctionInfo *
1448 GCNTargetMachine::convertFuncInfoToYAML(const MachineFunction &MF) const {
1449   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1450   return new yaml::SIMachineFunctionInfo(
1451       *MFI, *MF.getSubtarget<GCNSubtarget>().getRegisterInfo(), MF);
1452 }
1453 
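// Parse the target-specific machineFunctionInfo block of a .mir file back into
// SIMachineFunctionInfo. An illustrative fragment (the register choices below
// are only examples) might look like:
//
//   machineFunctionInfo:
//     scratchRSrcReg:    '$sgpr0_sgpr1_sgpr2_sgpr3'
//     frameOffsetReg:    '$sgpr33'
//     stackPtrOffsetReg: '$sgpr32'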
1454 bool GCNTargetMachine::parseMachineFunctionInfo(
1455     const yaml::MachineFunctionInfo &MFI_, PerFunctionMIParsingState &PFS,
1456     SMDiagnostic &Error, SMRange &SourceRange) const {
1457   const yaml::SIMachineFunctionInfo &YamlMFI =
1458       static_cast<const yaml::SIMachineFunctionInfo &>(MFI_);
1459   MachineFunction &MF = PFS.MF;
1460   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1461 
1462   if (MFI->initializeBaseYamlFields(YamlMFI, MF, PFS, Error, SourceRange))
1463     return true;
1464 
1465   if (MFI->Occupancy == 0) {
1466     // Fix up the subtarget-dependent default value.
1467     const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1468     MFI->Occupancy = ST.computeOccupancy(MF.getFunction(), MFI->getLDSSize());
1469   }
1470 
1471   auto parseRegister = [&](const yaml::StringValue &RegName, Register &RegVal) {
1472     Register TempReg;
1473     if (parseNamedRegisterReference(PFS, TempReg, RegName.Value, Error)) {
1474       SourceRange = RegName.SourceRange;
1475       return true;
1476     }
1477     RegVal = TempReg;
1478 
1479     return false;
1480   };
1481 
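  // Like parseRegister, but treat an empty string as "field not present"
  // rather than as a parse error.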
1482   auto parseOptionalRegister = [&](const yaml::StringValue &RegName,
1483                                    Register &RegVal) {
1484     return !RegName.Value.empty() && parseRegister(RegName, RegVal);
1485   };
1486 
1487   if (parseOptionalRegister(YamlMFI.VGPRForAGPRCopy, MFI->VGPRForAGPRCopy))
1488     return true;
1489 
1490   if (parseOptionalRegister(YamlMFI.SGPRForEXECCopy, MFI->SGPRForEXECCopy))
1491     return true;
1492 
1493   if (parseOptionalRegister(YamlMFI.LongBranchReservedReg,
1494                             MFI->LongBranchReservedReg))
1495     return true;
1496 
1497   auto diagnoseRegisterClass = [&](const yaml::StringValue &RegName) {
1498     // Create a diagnostic for the register string literal.
1499     const MemoryBuffer &Buffer =
1500         *PFS.SM->getMemoryBuffer(PFS.SM->getMainFileID());
1501     Error = SMDiagnostic(*PFS.SM, SMLoc(), Buffer.getBufferIdentifier(), 1,
1502                          RegName.Value.size(), SourceMgr::DK_Error,
1503                          "incorrect register class for field", RegName.Value,
1504                          std::nullopt, std::nullopt);
1505     SourceRange = RegName.SourceRange;
1506     return true;
1507   };
1508 
1509   if (parseRegister(YamlMFI.ScratchRSrcReg, MFI->ScratchRSrcReg) ||
1510       parseRegister(YamlMFI.FrameOffsetReg, MFI->FrameOffsetReg) ||
1511       parseRegister(YamlMFI.StackPtrOffsetReg, MFI->StackPtrOffsetReg))
1512     return true;
1513 
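  // Check that the parsed registers are in the expected register classes. The
  // comparisons against PRIVATE_RSRC_REG, FP_REG and SP_REG let the default
  // placeholder values through without a class check.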
1514   if (MFI->ScratchRSrcReg != AMDGPU::PRIVATE_RSRC_REG &&
1515       !AMDGPU::SGPR_128RegClass.contains(MFI->ScratchRSrcReg)) {
1516     return diagnoseRegisterClass(YamlMFI.ScratchRSrcReg);
1517   }
1518 
1519   if (MFI->FrameOffsetReg != AMDGPU::FP_REG &&
1520       !AMDGPU::SGPR_32RegClass.contains(MFI->FrameOffsetReg)) {
1521     return diagnoseRegisterClass(YamlMFI.FrameOffsetReg);
1522   }
1523 
1524   if (MFI->StackPtrOffsetReg != AMDGPU::SP_REG &&
1525       !AMDGPU::SGPR_32RegClass.contains(MFI->StackPtrOffsetReg)) {
1526     return diagnoseRegisterClass(YamlMFI.StackPtrOffsetReg);
1527   }
1528 
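  // Re-reserve any whole wave mode (WWM) registers recorded in the YAML.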
1529   for (const auto &YamlReg : YamlMFI.WWMReservedRegs) {
1530     Register ParsedReg;
1531     if (parseRegister(YamlReg, ParsedReg))
1532       return true;
1533 
1534     MFI->reserveWWMRegister(ParsedReg);
1535   }
1536 
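  // Translate a yaml::SIArgument into an ArgDescriptor (either a register of
  // the expected class or a stack offset) and account for the user/system
  // SGPRs it consumes.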
1537   auto parseAndCheckArgument = [&](const std::optional<yaml::SIArgument> &A,
1538                                    const TargetRegisterClass &RC,
1539                                    ArgDescriptor &Arg, unsigned UserSGPRs,
1540                                    unsigned SystemSGPRs) {
1541     // Skip parsing if it's not present.
1542     if (!A)
1543       return false;
1544 
1545     if (A->IsRegister) {
1546       Register Reg;
1547       if (parseNamedRegisterReference(PFS, Reg, A->RegisterName.Value, Error)) {
1548         SourceRange = A->RegisterName.SourceRange;
1549         return true;
1550       }
1551       if (!RC.contains(Reg))
1552         return diagnoseRegisterClass(A->RegisterName);
1553       Arg = ArgDescriptor::createRegister(Reg);
1554     } else
1555       Arg = ArgDescriptor::createStack(A->StackOffset);
1556     // Check and apply the optional mask.
1557     if (A->Mask)
1558       Arg = ArgDescriptor::createArg(Arg, *A->Mask);
1559 
1560     MFI->NumUserSGPRs += UserSGPRs;
1561     MFI->NumSystemSGPRs += SystemSGPRs;
1562     return false;
1563   };
1564 
1565   if (YamlMFI.ArgInfo &&
1566       (parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentBuffer,
1567                              AMDGPU::SGPR_128RegClass,
1568                              MFI->ArgInfo.PrivateSegmentBuffer, 4, 0) ||
1569        parseAndCheckArgument(YamlMFI.ArgInfo->DispatchPtr,
1570                              AMDGPU::SReg_64RegClass, MFI->ArgInfo.DispatchPtr,
1571                              2, 0) ||
1572        parseAndCheckArgument(YamlMFI.ArgInfo->QueuePtr, AMDGPU::SReg_64RegClass,
1573                              MFI->ArgInfo.QueuePtr, 2, 0) ||
1574        parseAndCheckArgument(YamlMFI.ArgInfo->KernargSegmentPtr,
1575                              AMDGPU::SReg_64RegClass,
1576                              MFI->ArgInfo.KernargSegmentPtr, 2, 0) ||
1577        parseAndCheckArgument(YamlMFI.ArgInfo->DispatchID,
1578                              AMDGPU::SReg_64RegClass, MFI->ArgInfo.DispatchID,
1579                              2, 0) ||
1580        parseAndCheckArgument(YamlMFI.ArgInfo->FlatScratchInit,
1581                              AMDGPU::SReg_64RegClass,
1582                              MFI->ArgInfo.FlatScratchInit, 2, 0) ||
1583        parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentSize,
1584                              AMDGPU::SGPR_32RegClass,
1585                              MFI->ArgInfo.PrivateSegmentSize, 0, 0) ||
1586        parseAndCheckArgument(YamlMFI.ArgInfo->LDSKernelId,
1587                              AMDGPU::SGPR_32RegClass,
1588                              MFI->ArgInfo.LDSKernelId, 0, 1) ||
1589        parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDX,
1590                              AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDX,
1591                              0, 1) ||
1592        parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDY,
1593                              AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDY,
1594                              0, 1) ||
1595        parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDZ,
1596                              AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDZ,
1597                              0, 1) ||
1598        parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupInfo,
1599                              AMDGPU::SGPR_32RegClass,
1600                              MFI->ArgInfo.WorkGroupInfo, 0, 1) ||
1601        parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentWaveByteOffset,
1602                              AMDGPU::SGPR_32RegClass,
1603                              MFI->ArgInfo.PrivateSegmentWaveByteOffset, 0, 1) ||
1604        parseAndCheckArgument(YamlMFI.ArgInfo->ImplicitArgPtr,
1605                              AMDGPU::SReg_64RegClass,
1606                              MFI->ArgInfo.ImplicitArgPtr, 0, 0) ||
1607        parseAndCheckArgument(YamlMFI.ArgInfo->ImplicitBufferPtr,
1608                              AMDGPU::SReg_64RegClass,
1609                              MFI->ArgInfo.ImplicitBufferPtr, 2, 0) ||
1610        parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDX,
1611                              AMDGPU::VGPR_32RegClass,
1612                              MFI->ArgInfo.WorkItemIDX, 0, 0) ||
1613        parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDY,
1614                              AMDGPU::VGPR_32RegClass,
1615                              MFI->ArgInfo.WorkItemIDY, 0, 0) ||
1616        parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDZ,
1617                              AMDGPU::VGPR_32RegClass,
1618                              MFI->ArgInfo.WorkItemIDZ, 0, 0)))
1619     return true;
1620 
1621   MFI->Mode.IEEE = YamlMFI.Mode.IEEE;
1622   MFI->Mode.DX10Clamp = YamlMFI.Mode.DX10Clamp;
1623 
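  // Each denormal control is encoded in the YAML as a boolean: true selects
  // full IEEE denormal handling, false flushes denormals while preserving the
  // sign.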
1624   // FIXME: Move proper support for denormal-fp-math into base MachineFunction
1625   MFI->Mode.FP32Denormals.Input = YamlMFI.Mode.FP32InputDenormals
1626                                       ? DenormalMode::IEEE
1627                                       : DenormalMode::PreserveSign;
1628   MFI->Mode.FP32Denormals.Output = YamlMFI.Mode.FP32OutputDenormals
1629                                        ? DenormalMode::IEEE
1630                                        : DenormalMode::PreserveSign;
1631 
1632   MFI->Mode.FP64FP16Denormals.Input = YamlMFI.Mode.FP64FP16InputDenormals
1633                                           ? DenormalMode::IEEE
1634                                           : DenormalMode::PreserveSign;
1635   MFI->Mode.FP64FP16Denormals.Output = YamlMFI.Mode.FP64FP16OutputDenormals
1636                                            ? DenormalMode::IEEE
1637                                            : DenormalMode::PreserveSign;
1638 
1639   return false;
1640 }
1641