//===-- AMDGPUTargetMachine.cpp - TargetMachine for hw codegen targets-----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// The AMDGPU target machine contains all of the hardware specific
/// information needed to emit code for R600 and SI GPUs.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUTargetMachine.h"
#include "AMDGPU.h"
#include "AMDGPUAliasAnalysis.h"
#include "AMDGPUExportClustering.h"
#include "AMDGPUMacroFusion.h"
#include "AMDGPUTargetObjectFile.h"
#include "AMDGPUTargetTransformInfo.h"
#include "GCNIterativeScheduler.h"
#include "GCNSchedStrategy.h"
#include "R600MachineScheduler.h"
#include "SIMachineFunctionInfo.h"
#include "SIMachineScheduler.h"
#include "TargetInfo/AMDGPUTargetInfo.h"
#include "llvm/Analysis/CGSCCPassManager.h"
#include "llvm/CodeGen/GlobalISel/IRTranslator.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
#include "llvm/CodeGen/GlobalISel/Legalizer.h"
#include "llvm/CodeGen/GlobalISel/Localizer.h"
#include "llvm/CodeGen/GlobalISel/RegBankSelect.h"
#include "llvm/CodeGen/MIRParser/MIParser.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/PassManager.h"
#include "llvm/InitializePasses.h"
#include "llvm/Passes/PassBuilder.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/IPO/AlwaysInliner.h"
#include "llvm/Transforms/IPO/GlobalDCE.h"
#include "llvm/Transforms/IPO/Internalize.h"
#include "llvm/Transforms/IPO/PassManagerBuilder.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Scalar/GVN.h"
#include "llvm/Transforms/Scalar/InferAddressSpaces.h"
#include "llvm/Transforms/Utils.h"
#include "llvm/Transforms/Utils/SimplifyLibCalls.h"
#include "llvm/Transforms/Vectorize.h"

using namespace llvm;

static cl::opt<bool> EnableR600StructurizeCFG(
    "r600-ir-structurize",
    cl::desc("Use StructurizeCFG IR pass"),
    cl::init(true));

static cl::opt<bool> EnableSROA(
    "amdgpu-sroa",
    cl::desc("Run SROA after promote alloca pass"),
    cl::ReallyHidden,
    cl::init(true));

static cl::opt<bool>
EnableEarlyIfConversion("amdgpu-early-ifcvt", cl::Hidden,
                        cl::desc("Run early if-conversion"),
                        cl::init(false));

static cl::opt<bool>
OptExecMaskPreRA("amdgpu-opt-exec-mask-pre-ra", cl::Hidden,
                 cl::desc("Run pre-RA exec mask optimizations"),
                 cl::init(true));

static cl::opt<bool> EnableR600IfConvert(
    "r600-if-convert",
    cl::desc("Use if conversion pass"),
    cl::ReallyHidden,
    cl::init(true));

// Option to disable the vectorizer for tests.
static cl::opt<bool> EnableLoadStoreVectorizer(
    "amdgpu-load-store-vectorizer",
    cl::desc("Enable load store vectorizer"),
    cl::init(true),
    cl::Hidden);

// Option to control global load scalarization.
static cl::opt<bool> ScalarizeGlobal(
    "amdgpu-scalarize-global-loads",
    cl::desc("Enable global load scalarization"),
    cl::init(true),
    cl::Hidden);

// Option to run the internalize pass.
static cl::opt<bool> InternalizeSymbols(
    "amdgpu-internalize-symbols",
    cl::desc("Enable elimination of non-kernel functions and unused globals"),
    cl::init(false),
    cl::Hidden);

// Option to inline all functions early.
static cl::opt<bool> EarlyInlineAll(
    "amdgpu-early-inline-all",
    cl::desc("Inline all functions early"),
    cl::init(false),
    cl::Hidden);

static cl::opt<bool> EnableSDWAPeephole(
    "amdgpu-sdwa-peephole",
    cl::desc("Enable SDWA peepholer"),
    cl::init(true));

static cl::opt<bool> EnableDPPCombine(
    "amdgpu-dpp-combine",
    cl::desc("Enable DPP combiner"),
    cl::init(true));

// Enable address space based alias analysis.
static cl::opt<bool> EnableAMDGPUAliasAnalysis("enable-amdgpu-aa", cl::Hidden,
    cl::desc("Enable AMDGPU Alias Analysis"),
    cl::init(true));

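// Options declared as cl::opt<bool, true> below use external storage:
// cl::location binds the parsed value to a static member of
// AMDGPUTargetMachine, so the rest of the backend can read the flag without
// referencing the option itself.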
// Option to run the late CFG structurizer.
static cl::opt<bool, true> LateCFGStructurize(
    "amdgpu-late-structurize",
    cl::desc("Enable late CFG structurization"),
    cl::location(AMDGPUTargetMachine::EnableLateStructurizeCFG),
    cl::Hidden);

static cl::opt<bool, true> EnableAMDGPUFunctionCallsOpt(
    "amdgpu-function-calls",
    cl::desc("Enable AMDGPU function call support"),
    cl::location(AMDGPUTargetMachine::EnableFunctionCalls),
    cl::init(true),
    cl::Hidden);

static cl::opt<bool, true> EnableAMDGPUFixedFunctionABIOpt(
    "amdgpu-fixed-function-abi",
    cl::desc("Enable all implicit function arguments"),
    cl::location(AMDGPUTargetMachine::EnableFixedFunctionABI),
    cl::init(false),
    cl::Hidden);

// Enable library call simplifications.
static cl::opt<bool> EnableLibCallSimplify(
    "amdgpu-simplify-libcall",
    cl::desc("Enable amdgpu library simplifications"),
    cl::init(true),
    cl::Hidden);

static cl::opt<bool> EnableLowerKernelArguments(
    "amdgpu-ir-lower-kernel-arguments",
    cl::desc("Lower kernel argument loads in IR pass"),
    cl::init(true),
    cl::Hidden);

static cl::opt<bool> EnableRegReassign(
    "amdgpu-reassign-regs",
    cl::desc("Enable register reassign optimizations on gfx10+"),
    cl::init(true),
    cl::Hidden);

// Enable atomic optimization.
static cl::opt<bool> EnableAtomicOptimizations(
    "amdgpu-atomic-optimizations",
    cl::desc("Enable atomic optimizations"),
    cl::init(false),
    cl::Hidden);

// Enable Mode register optimization.
static cl::opt<bool> EnableSIModeRegisterPass(
    "amdgpu-mode-register",
    cl::desc("Enable mode register pass"),
    cl::init(true),
    cl::Hidden);

// Option used in lit tests to prevent dead-coding of the patterns being
// inspected.
static cl::opt<bool>
EnableDCEInRA("amdgpu-dce-in-ra",
              cl::init(true), cl::Hidden,
              cl::desc("Enable machine DCE inside regalloc"));

static cl::opt<bool> EnableScalarIRPasses(
    "amdgpu-scalar-ir-passes",
    cl::desc("Enable scalar IR passes"),
    cl::init(true),
    cl::Hidden);

static cl::opt<bool> EnableStructurizerWorkarounds(
    "amdgpu-enable-structurizer-workarounds",
    cl::desc("Enable workarounds for the StructurizeCFG pass"), cl::init(true),
    cl::Hidden);

static cl::opt<bool, true> EnableLowerModuleLDS(
    "amdgpu-enable-lower-module-lds", cl::desc("Enable lower module lds pass"),
    cl::location(AMDGPUTargetMachine::EnableLowerModuleLDS), cl::init(true),
    cl::Hidden);

extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
  // Register the targets.
  RegisterTargetMachine<R600TargetMachine> X(getTheAMDGPUTarget());
  RegisterTargetMachine<GCNTargetMachine> Y(getTheGCNTarget());

  PassRegistry *PR = PassRegistry::getPassRegistry();
  initializeR600ClauseMergePassPass(*PR);
  initializeR600ControlFlowFinalizerPass(*PR);
  initializeR600PacketizerPass(*PR);
  initializeR600ExpandSpecialInstrsPassPass(*PR);
  initializeR600VectorRegMergerPass(*PR);
  initializeGlobalISel(*PR);
  initializeAMDGPUDAGToDAGISelPass(*PR);
  initializeGCNDPPCombinePass(*PR);
  initializeSILowerI1CopiesPass(*PR);
  initializeSILowerSGPRSpillsPass(*PR);
  initializeSIFixSGPRCopiesPass(*PR);
  initializeSIFixVGPRCopiesPass(*PR);
  initializeSIFoldOperandsPass(*PR);
  initializeSIPeepholeSDWAPass(*PR);
  initializeSIShrinkInstructionsPass(*PR);
  initializeSIOptimizeExecMaskingPreRAPass(*PR);
  initializeSILoadStoreOptimizerPass(*PR);
  initializeAMDGPUFixFunctionBitcastsPass(*PR);
  initializeAMDGPUAlwaysInlinePass(*PR);
  initializeAMDGPUAnnotateKernelFeaturesPass(*PR);
  initializeAMDGPUAnnotateUniformValuesPass(*PR);
  initializeAMDGPUArgumentUsageInfoPass(*PR);
  initializeAMDGPUAtomicOptimizerPass(*PR);
  initializeAMDGPULowerKernelArgumentsPass(*PR);
  initializeAMDGPULowerKernelAttributesPass(*PR);
  initializeAMDGPULowerIntrinsicsPass(*PR);
  initializeAMDGPUOpenCLEnqueuedBlockLoweringPass(*PR);
  initializeAMDGPUPostLegalizerCombinerPass(*PR);
  initializeAMDGPUPreLegalizerCombinerPass(*PR);
  initializeAMDGPURegBankCombinerPass(*PR);
  initializeAMDGPUPromoteAllocaPass(*PR);
  initializeAMDGPUPromoteAllocaToVectorPass(*PR);
  initializeAMDGPUCodeGenPreparePass(*PR);
  initializeAMDGPULateCodeGenPreparePass(*PR);
  initializeAMDGPUPropagateAttributesEarlyPass(*PR);
  initializeAMDGPUPropagateAttributesLatePass(*PR);
  initializeAMDGPULowerModuleLDSPass(*PR);
  initializeAMDGPURewriteOutArgumentsPass(*PR);
  initializeAMDGPUUnifyMetadataPass(*PR);
  initializeSIAnnotateControlFlowPass(*PR);
  initializeSIInsertHardClausesPass(*PR);
  initializeSIInsertWaitcntsPass(*PR);
  initializeSIModeRegisterPass(*PR);
  initializeSIWholeQuadModePass(*PR);
  initializeSILowerControlFlowPass(*PR);
  initializeSIPreEmitPeepholePass(*PR);
  initializeSILateBranchLoweringPass(*PR);
  initializeSIMemoryLegalizerPass(*PR);
  initializeSIOptimizeExecMaskingPass(*PR);
  initializeSIPreAllocateWWMRegsPass(*PR);
  initializeSIFormMemoryClausesPass(*PR);
  initializeSIPostRABundlerPass(*PR);
  initializeAMDGPUUnifyDivergentExitNodesPass(*PR);
  initializeAMDGPUAAWrapperPassPass(*PR);
  initializeAMDGPUExternalAAWrapperPass(*PR);
  initializeAMDGPUUseNativeCallsPass(*PR);
  initializeAMDGPUSimplifyLibCallsPass(*PR);
  initializeAMDGPUPrintfRuntimeBindingPass(*PR);
  initializeGCNNSAReassignPass(*PR);
}

static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
  return std::make_unique<AMDGPUTargetObjectFile>();
}

static ScheduleDAGInstrs *createR600MachineScheduler(MachineSchedContext *C) {
  return new ScheduleDAGMILive(C, std::make_unique<R600SchedStrategy>());
}

static ScheduleDAGInstrs *createSIMachineScheduler(MachineSchedContext *C) {
  return new SIScheduleDAGMI(C);
}

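// The GCN scheduler variants below attach DAG mutations, which adjust the
// scheduling DAG before scheduling starts: load clustering keeps loads to
// nearby addresses together, macro fusion keeps fusible instruction pairs
// back to back, and export clustering groups export instructions.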
static ScheduleDAGInstrs *
createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
  ScheduleDAGMILive *DAG =
      new GCNScheduleDAGMILive(C, std::make_unique<GCNMaxOccupancySchedStrategy>(C));
  DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
  DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
  DAG->addMutation(createAMDGPUExportClusteringDAGMutation());
  return DAG;
}

static ScheduleDAGInstrs *
createIterativeGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
  auto DAG = new GCNIterativeScheduler(C,
      GCNIterativeScheduler::SCHEDULE_LEGACYMAXOCCUPANCY);
  DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
  return DAG;
}

static ScheduleDAGInstrs *createMinRegScheduler(MachineSchedContext *C) {
  return new GCNIterativeScheduler(C,
      GCNIterativeScheduler::SCHEDULE_MINREGFORCED);
}

static ScheduleDAGInstrs *
createIterativeILPMachineScheduler(MachineSchedContext *C) {
  auto DAG = new GCNIterativeScheduler(C,
      GCNIterativeScheduler::SCHEDULE_ILP);
  DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
  DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
  return DAG;
}

static MachineSchedRegistry
R600SchedRegistry("r600", "Run R600's custom scheduler",
                  createR600MachineScheduler);

static MachineSchedRegistry
SISchedRegistry("si", "Run SI's custom scheduler",
                createSIMachineScheduler);

static MachineSchedRegistry
GCNMaxOccupancySchedRegistry("gcn-max-occupancy",
                             "Run GCN scheduler to maximize occupancy",
                             createGCNMaxOccupancyMachineScheduler);

static MachineSchedRegistry
IterativeGCNMaxOccupancySchedRegistry("gcn-max-occupancy-experimental",
    "Run GCN scheduler to maximize occupancy (experimental)",
    createIterativeGCNMaxOccupancyMachineScheduler);

static MachineSchedRegistry
GCNMinRegSchedRegistry("gcn-minreg",
    "Run GCN iterative scheduler for minimal register usage (experimental)",
    createMinRegScheduler);

static MachineSchedRegistry
GCNILPSchedRegistry("gcn-ilp",
    "Run GCN iterative scheduler for ILP scheduling (experimental)",
    createIterativeILPMachineScheduler);

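// A quick key to the less obvious data layout tokens below: pN:S:A gives the
// size and ABI alignment of pointers in address space N, A5 places allocas in
// address space 5 (private), G1 makes address space 1 (global) the default
// address space for globals, and ni:7 marks address space 7 (buffer fat
// pointers) as non-integral.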
static StringRef computeDataLayout(const Triple &TT) {
  if (TT.getArch() == Triple::r600) {
    // 32-bit pointers.
    return "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
           "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1";
  }

  // 32-bit private, local, and region pointers. 64-bit global, constant and
  // flat, non-integral buffer fat pointers.
  return "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32"
         "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
         "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1"
         "-ni:7";
}

LLVM_READNONE
static StringRef getGPUOrDefault(const Triple &TT, StringRef GPU) {
  if (!GPU.empty())
    return GPU;

  // Need to default to a target with flat support for HSA.
  if (TT.getArch() == Triple::amdgcn)
    return TT.getOS() == Triple::AMDHSA ? "generic-hsa" : "generic";

  return "r600";
}

static Reloc::Model getEffectiveRelocModel(Optional<Reloc::Model> RM) {
  // The AMDGPU toolchain only supports generating shared objects, so we
  // must always use PIC.
  return Reloc::PIC_;
}

AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT,
                                         StringRef CPU, StringRef FS,
                                         TargetOptions Options,
                                         Optional<Reloc::Model> RM,
                                         Optional<CodeModel::Model> CM,
                                         CodeGenOpt::Level OptLevel)
    : LLVMTargetMachine(T, computeDataLayout(TT), TT, getGPUOrDefault(TT, CPU),
                        FS, Options, getEffectiveRelocModel(RM),
                        getEffectiveCodeModel(CM, CodeModel::Small), OptLevel),
      TLOF(createTLOF(getTargetTriple())) {
  initAsmInfo();
  if (TT.getArch() == Triple::amdgcn) {
    if (getMCSubtargetInfo()->checkFeatures("+wavefrontsize64"))
      MRI.reset(llvm::createGCNMCRegisterInfo(AMDGPUDwarfFlavour::Wave64));
    else if (getMCSubtargetInfo()->checkFeatures("+wavefrontsize32"))
      MRI.reset(llvm::createGCNMCRegisterInfo(AMDGPUDwarfFlavour::Wave32));
  }
}

bool AMDGPUTargetMachine::EnableLateStructurizeCFG = false;
bool AMDGPUTargetMachine::EnableFunctionCalls = false;
bool AMDGPUTargetMachine::EnableFixedFunctionABI = false;
bool AMDGPUTargetMachine::EnableLowerModuleLDS = true;

AMDGPUTargetMachine::~AMDGPUTargetMachine() = default;

StringRef AMDGPUTargetMachine::getGPUName(const Function &F) const {
  Attribute GPUAttr = F.getFnAttribute("target-cpu");
  return GPUAttr.isValid() ? GPUAttr.getValueAsString() : getTargetCPU();
}

StringRef AMDGPUTargetMachine::getFeatureString(const Function &F) const {
  Attribute FSAttr = F.getFnAttribute("target-features");

  return FSAttr.isValid() ? FSAttr.getValueAsString()
                          : getTargetFeatureString();
}

/// Predicate for Internalize pass.
static bool mustPreserveGV(const GlobalValue &GV) {
  if (const Function *F = dyn_cast<Function>(&GV))
    return F->isDeclaration() || AMDGPU::isEntryFunctionCC(F->getCallingConv());

  return !GV.use_empty();
}

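// adjustPassManager hooks the AMDGPU IR passes into the legacy
// PassManagerBuilder extension points; registerPassBuilderCallbacks below is
// the new-pass-manager counterpart.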
void AMDGPUTargetMachine::adjustPassManager(PassManagerBuilder &Builder) {
  Builder.DivergentTarget = true;

  bool EnableOpt = getOptLevel() > CodeGenOpt::None;
  bool Internalize = InternalizeSymbols;
  bool EarlyInline = EarlyInlineAll && EnableOpt && !EnableFunctionCalls;
  bool AMDGPUAA = EnableAMDGPUAliasAnalysis && EnableOpt;
  bool LibCallSimplify = EnableLibCallSimplify && EnableOpt;

  if (EnableFunctionCalls) {
    delete Builder.Inliner;
    Builder.Inliner = createFunctionInliningPass();
  }

  Builder.addExtension(
      PassManagerBuilder::EP_ModuleOptimizerEarly,
      [Internalize, EarlyInline, AMDGPUAA, this](const PassManagerBuilder &,
                                                 legacy::PassManagerBase &PM) {
        if (AMDGPUAA) {
          PM.add(createAMDGPUAAWrapperPass());
          PM.add(createAMDGPUExternalAAWrapperPass());
        }
        PM.add(createAMDGPUUnifyMetadataPass());
        PM.add(createAMDGPUPrintfRuntimeBinding());
        if (Internalize)
          PM.add(createInternalizePass(mustPreserveGV));
        PM.add(createAMDGPUPropagateAttributesLatePass(this));
        if (Internalize)
          PM.add(createGlobalDCEPass());
        if (EarlyInline)
          PM.add(createAMDGPUAlwaysInlinePass(false));
      });

  Builder.addExtension(
      PassManagerBuilder::EP_EarlyAsPossible,
      [AMDGPUAA, LibCallSimplify, this](const PassManagerBuilder &,
                                        legacy::PassManagerBase &PM) {
        if (AMDGPUAA) {
          PM.add(createAMDGPUAAWrapperPass());
          PM.add(createAMDGPUExternalAAWrapperPass());
        }
        PM.add(llvm::createAMDGPUPropagateAttributesEarlyPass(this));
        PM.add(llvm::createAMDGPUUseNativeCallsPass());
        if (LibCallSimplify)
          PM.add(llvm::createAMDGPUSimplifyLibCallsPass(this));
      });

  Builder.addExtension(
      PassManagerBuilder::EP_CGSCCOptimizerLate,
      [EnableOpt](const PassManagerBuilder &, legacy::PassManagerBase &PM) {
        // Add infer address spaces pass to the opt pipeline after inlining
        // but before SROA to increase SROA opportunities.
        PM.add(createInferAddressSpacesPass());

        // This should run after inlining to have any chance of doing anything,
        // and before other cleanup optimizations.
        PM.add(createAMDGPULowerKernelAttributesPass());

        // Promote alloca to vector before SROA and loop unroll. If we manage
        // to eliminate allocas before unroll we may choose to unroll less.
        if (EnableOpt)
          PM.add(createAMDGPUPromoteAllocaToVector());
      });
}

void AMDGPUTargetMachine::registerDefaultAliasAnalyses(AAManager &AAM) {
  AAM.registerFunctionAnalysis<AMDGPUAA>();
}

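// New pass manager registration: the pipeline-parsing callbacks make the pass
// names below usable with 'opt -passes=...', and the extension-point callbacks
// splice the AMDGPU passes into the default optimization pipelines.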
void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
  PB.registerPipelineParsingCallback(
      [this](StringRef PassName, ModulePassManager &PM,
             ArrayRef<PassBuilder::PipelineElement>) {
        if (PassName == "amdgpu-propagate-attributes-late") {
          PM.addPass(AMDGPUPropagateAttributesLatePass(*this));
          return true;
        }
        if (PassName == "amdgpu-unify-metadata") {
          PM.addPass(AMDGPUUnifyMetadataPass());
          return true;
        }
        if (PassName == "amdgpu-printf-runtime-binding") {
          PM.addPass(AMDGPUPrintfRuntimeBindingPass());
          return true;
        }
        if (PassName == "amdgpu-always-inline") {
          PM.addPass(AMDGPUAlwaysInlinePass());
          return true;
        }
        if (PassName == "amdgpu-lower-module-lds") {
          PM.addPass(AMDGPULowerModuleLDSPass());
          return true;
        }
        return false;
      });
  PB.registerPipelineParsingCallback(
      [this](StringRef PassName, FunctionPassManager &PM,
             ArrayRef<PassBuilder::PipelineElement>) {
        if (PassName == "amdgpu-simplifylib") {
          PM.addPass(AMDGPUSimplifyLibCallsPass(*this));
          return true;
        }
        if (PassName == "amdgpu-usenative") {
          PM.addPass(AMDGPUUseNativeCallsPass());
          return true;
        }
        if (PassName == "amdgpu-promote-alloca") {
          PM.addPass(AMDGPUPromoteAllocaPass(*this));
          return true;
        }
        if (PassName == "amdgpu-promote-alloca-to-vector") {
          PM.addPass(AMDGPUPromoteAllocaToVectorPass(*this));
          return true;
        }
        if (PassName == "amdgpu-lower-kernel-attributes") {
          PM.addPass(AMDGPULowerKernelAttributesPass());
          return true;
        }
        if (PassName == "amdgpu-propagate-attributes-early") {
          PM.addPass(AMDGPUPropagateAttributesEarlyPass(*this));
          return true;
        }
        return false;
      });

  PB.registerAnalysisRegistrationCallback([](FunctionAnalysisManager &FAM) {
    FAM.registerPass([&] { return AMDGPUAA(); });
  });

  PB.registerParseAACallback([](StringRef AAName, AAManager &AAM) {
    if (AAName == "amdgpu-aa") {
      AAM.registerFunctionAnalysis<AMDGPUAA>();
      return true;
    }
    return false;
  });

  PB.registerPipelineStartEPCallback(
      [this](ModulePassManager &PM, PassBuilder::OptimizationLevel Level) {
        FunctionPassManager FPM;
        FPM.addPass(AMDGPUPropagateAttributesEarlyPass(*this));
        FPM.addPass(AMDGPUUseNativeCallsPass());
        if (EnableLibCallSimplify &&
            Level != PassBuilder::OptimizationLevel::O0)
          FPM.addPass(AMDGPUSimplifyLibCallsPass(*this));
        PM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
      });

  PB.registerPipelineEarlySimplificationEPCallback(
      [this](ModulePassManager &PM, PassBuilder::OptimizationLevel Level) {
        if (Level == PassBuilder::OptimizationLevel::O0)
          return;

        PM.addPass(AMDGPUUnifyMetadataPass());
        PM.addPass(AMDGPUPrintfRuntimeBindingPass());

        if (InternalizeSymbols) {
          // Global variables may have dead uses which need to be removed.
          // Otherwise these useless global variables will not get internalized.
          PM.addPass(GlobalDCEPass());
          PM.addPass(InternalizePass(mustPreserveGV));
        }
        PM.addPass(AMDGPUPropagateAttributesLatePass(*this));
        if (InternalizeSymbols) {
          PM.addPass(GlobalDCEPass());
        }
        if (EarlyInlineAll && !EnableFunctionCalls)
          PM.addPass(AMDGPUAlwaysInlinePass());
      });

  PB.registerCGSCCOptimizerLateEPCallback(
      [this](CGSCCPassManager &PM, PassBuilder::OptimizationLevel Level) {
        if (Level == PassBuilder::OptimizationLevel::O0)
          return;

        FunctionPassManager FPM;

        // Add infer address spaces pass to the opt pipeline after inlining
        // but before SROA to increase SROA opportunities.
        FPM.addPass(InferAddressSpacesPass());

        // This should run after inlining to have any chance of doing
        // anything, and before other cleanup optimizations.
        FPM.addPass(AMDGPULowerKernelAttributesPass());

        if (Level != PassBuilder::OptimizationLevel::O0) {
          // Promote alloca to vector before SROA and loop unroll. If we
          // manage to eliminate allocas before unroll we may choose to unroll
          // less.
          FPM.addPass(AMDGPUPromoteAllocaToVectorPass(*this));
        }

        PM.addPass(createCGSCCToFunctionPassAdaptor(std::move(FPM)));
      });
}

//===----------------------------------------------------------------------===//
// R600 Target Machine (R600 -> Cayman)
//===----------------------------------------------------------------------===//

R600TargetMachine::R600TargetMachine(const Target &T, const Triple &TT,
                                     StringRef CPU, StringRef FS,
                                     TargetOptions Options,
                                     Optional<Reloc::Model> RM,
                                     Optional<CodeModel::Model> CM,
                                     CodeGenOpt::Level OL, bool JIT)
    : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {
  setRequiresStructuredCFG(true);

  // Override the default since calls aren't supported for r600.
  if (EnableFunctionCalls &&
      EnableAMDGPUFunctionCallsOpt.getNumOccurrences() == 0)
    EnableFunctionCalls = false;
}

const R600Subtarget *R600TargetMachine::getSubtargetImpl(
    const Function &F) const {
  StringRef GPU = getGPUName(F);
  StringRef FS = getFeatureString(F);

  SmallString<128> SubtargetKey(GPU);
  SubtargetKey.append(FS);

  auto &I = SubtargetMap[SubtargetKey];
  if (!I) {
    // This needs to be done before we create a new subtarget since any
    // creation will depend on the TM and the code generation flags on the
    // function that reside in TargetOptions.
    resetTargetOptions(F);
    I = std::make_unique<R600Subtarget>(TargetTriple, GPU, FS, *this);
  }

  return I.get();
}

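// Address 0 is a valid address in the local, private, and region address
// spaces, so the null pointer is represented there as all ones (-1); the
// other address spaces use the conventional 0.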
int64_t AMDGPUTargetMachine::getNullPointerValue(unsigned AddrSpace) {
  return (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
          AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ||
          AddrSpace == AMDGPUAS::REGION_ADDRESS)
             ? -1
             : 0;
}

bool AMDGPUTargetMachine::isNoopAddrSpaceCast(unsigned SrcAS,
                                              unsigned DestAS) const {
  return AMDGPU::isFlatGlobalAddrSpace(SrcAS) &&
         AMDGPU::isFlatGlobalAddrSpace(DestAS);
}

unsigned AMDGPUTargetMachine::getAssumedAddrSpace(const Value *V) const {
  const auto *LD = dyn_cast<LoadInst>(V);
  if (!LD)
    return AMDGPUAS::UNKNOWN_ADDRESS_SPACE;

  // It must be a load of a generic pointer.
  assert(V->getType()->isPointerTy() &&
         V->getType()->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS);

  const auto *Ptr = LD->getPointerOperand();
  if (Ptr->getType()->getPointerAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS)
    return AMDGPUAS::UNKNOWN_ADDRESS_SPACE;
  // A generic pointer loaded from constant memory can be assumed to be a
  // global pointer, since constant memory is only populated on the host side.
  // As implied by the offload programming model, only global pointers can be
  // referenced on the host side.
  return AMDGPUAS::GLOBAL_ADDRESS;
}

TargetTransformInfo
R600TargetMachine::getTargetTransformInfo(const Function &F) {
  return TargetTransformInfo(R600TTIImpl(this, F));
}

//===----------------------------------------------------------------------===//
// GCN Target Machine (SI+)
//===----------------------------------------------------------------------===//

GCNTargetMachine::GCNTargetMachine(const Target &T, const Triple &TT,
                                   StringRef CPU, StringRef FS,
                                   TargetOptions Options,
                                   Optional<Reloc::Model> RM,
                                   Optional<CodeModel::Model> CM,
                                   CodeGenOpt::Level OL, bool JIT)
    : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {}

const GCNSubtarget *GCNTargetMachine::getSubtargetImpl(const Function &F) const {
  StringRef GPU = getGPUName(F);
  StringRef FS = getFeatureString(F);

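  // Subtargets are cached per (GPU, feature string) key, so functions with
  // identical target-cpu/target-features attributes share one GCNSubtarget.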
  SmallString<128> SubtargetKey(GPU);
  SubtargetKey.append(FS);

  auto &I = SubtargetMap[SubtargetKey];
  if (!I) {
    // This needs to be done before we create a new subtarget since any
    // creation will depend on the TM and the code generation flags on the
    // function that reside in TargetOptions.
    resetTargetOptions(F);
    I = std::make_unique<GCNSubtarget>(TargetTriple, GPU, FS, *this);
  }

  I->setScalarizeGlobalBehavior(ScalarizeGlobal);

  return I.get();
}

TargetTransformInfo
GCNTargetMachine::getTargetTransformInfo(const Function &F) {
  return TargetTransformInfo(GCNTTIImpl(this, F));
}

//===----------------------------------------------------------------------===//
// AMDGPU Pass Setup
//===----------------------------------------------------------------------===//

namespace {

class AMDGPUPassConfig : public TargetPassConfig {
public:
  AMDGPUPassConfig(LLVMTargetMachine &TM, PassManagerBase &PM)
      : TargetPassConfig(TM, PM) {
    // Exceptions and StackMaps are not supported, so these passes will never
    // do anything.
    disablePass(&StackMapLivenessID);
    disablePass(&FuncletLayoutID);
  }

  AMDGPUTargetMachine &getAMDGPUTargetMachine() const {
    return getTM<AMDGPUTargetMachine>();
  }

  ScheduleDAGInstrs *
  createMachineScheduler(MachineSchedContext *C) const override {
    ScheduleDAGMILive *DAG = createGenericSchedLive(C);
    DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
    return DAG;
  }

  void addEarlyCSEOrGVNPass();
  void addStraightLineScalarOptimizationPasses();
  void addIRPasses() override;
  void addCodeGenPrepare() override;
  bool addPreISel() override;
  bool addInstSelector() override;
  bool addGCPasses() override;

  std::unique_ptr<CSEConfigBase> getCSEConfig() const override;
};

std::unique_ptr<CSEConfigBase> AMDGPUPassConfig::getCSEConfig() const {
  return getStandardCSEConfigForOpt(TM->getOptLevel());
}

class R600PassConfig final : public AMDGPUPassConfig {
public:
  R600PassConfig(LLVMTargetMachine &TM, PassManagerBase &PM)
      : AMDGPUPassConfig(TM, PM) {}

  ScheduleDAGInstrs *createMachineScheduler(
      MachineSchedContext *C) const override {
    return createR600MachineScheduler(C);
  }

  bool addPreISel() override;
  bool addInstSelector() override;
  void addPreRegAlloc() override;
  void addPreSched2() override;
  void addPreEmitPass() override;
};

class GCNPassConfig final : public AMDGPUPassConfig {
public:
  GCNPassConfig(LLVMTargetMachine &TM, PassManagerBase &PM)
      : AMDGPUPassConfig(TM, PM) {
    // It is necessary to know the register usage of the entire call graph. We
    // allow calls without EnableAMDGPUFunctionCalls if they are marked
    // noinline, so this is always required.
    setRequiresCodeGenSCCOrder(true);
  }

  GCNTargetMachine &getGCNTargetMachine() const {
    return getTM<GCNTargetMachine>();
  }

  ScheduleDAGInstrs *
  createMachineScheduler(MachineSchedContext *C) const override;

  bool addPreISel() override;
  void addMachineSSAOptimization() override;
  bool addILPOpts() override;
  bool addInstSelector() override;
  bool addIRTranslator() override;
  void addPreLegalizeMachineIR() override;
  bool addLegalizeMachineIR() override;
  void addPreRegBankSelect() override;
  bool addRegBankSelect() override;
  void addPreGlobalInstructionSelect() override;
  bool addGlobalInstructionSelect() override;
  void addFastRegAlloc() override;
  void addOptimizedRegAlloc() override;
  void addPreRegAlloc() override;
  bool addPreRewrite() override;
  void addPostRegAlloc() override;
  void addPreSched2() override;
  void addPreEmitPass() override;
};

} // end anonymous namespace

void AMDGPUPassConfig::addEarlyCSEOrGVNPass() {
  if (getOptLevel() == CodeGenOpt::Aggressive)
    addPass(createGVNPass());
  else
    addPass(createEarlyCSEPass());
}

void AMDGPUPassConfig::addStraightLineScalarOptimizationPasses() {
  addPass(createLICMPass());
  addPass(createSeparateConstOffsetFromGEPPass());
  addPass(createSpeculativeExecutionPass());
  // ReassociateGEPs exposes more opportunities for SLSR. See
  // the example in reassociate-geps-and-slsr.ll.
  addPass(createStraightLineStrengthReducePass());
  // SeparateConstOffsetFromGEP and SLSR create common expressions which GVN
  // or EarlyCSE can reuse.
  addEarlyCSEOrGVNPass();
  // Run NaryReassociate after EarlyCSE/GVN to be more effective.
  addPass(createNaryReassociatePass());
  // NaryReassociate on GEPs creates redundant common expressions, so run
  // EarlyCSE after it.
  addPass(createEarlyCSEPass());
}

void AMDGPUPassConfig::addIRPasses() {
  const AMDGPUTargetMachine &TM = getAMDGPUTargetMachine();

  // There is no reason to run these.
  disablePass(&StackMapLivenessID);
  disablePass(&FuncletLayoutID);
  disablePass(&PatchableFunctionID);

  addPass(createAMDGPUPrintfRuntimeBinding());

  // This must occur before inlining, as the inliner will not look through
  // bitcast calls.
  addPass(createAMDGPUFixFunctionBitcastsPass());

  // Run the propagate-attributes pass in the backend in case opt was not run.
  addPass(createAMDGPUPropagateAttributesEarlyPass(&TM));

  addPass(createAtomicExpandPass());

  addPass(createAMDGPULowerIntrinsicsPass());

  // Function calls are not supported, so make sure we inline everything.
  addPass(createAMDGPUAlwaysInlinePass());
  addPass(createAlwaysInlinerLegacyPass());
  // We need to add the barrier noop pass, otherwise adding the function
  // inlining pass will cause all of the PassConfig's passes to be run
  // one function at a time, which means if we have a module with two
  // functions, then we will generate code for the first function
  // without ever running any passes on the second.
  addPass(createBarrierNoopPass());

  // Handle uses of OpenCL image2d_t, image3d_t and sampler_t arguments.
  if (TM.getTargetTriple().getArch() == Triple::r600)
    addPass(createR600OpenCLImageTypeLoweringPass());

  // Replace OpenCL enqueued block function pointers with global variables.
  addPass(createAMDGPUOpenCLEnqueuedBlockLoweringPass());

  // Can increase the LDS used by a kernel, so this runs before PromoteAlloca.
  if (EnableLowerModuleLDS)
    addPass(createAMDGPULowerModuleLDSPass());

  if (TM.getOptLevel() > CodeGenOpt::None) {
    addPass(createInferAddressSpacesPass());
    addPass(createAMDGPUPromoteAlloca());

    if (EnableSROA)
      addPass(createSROAPass());
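    // If -amdgpu-scalar-ir-passes was given explicitly, honor it; otherwise
    // enable the scalar IR passes only above -O1. getNumOccurrences()
    // distinguishes an explicit flag from its default value.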
    if (EnableScalarIRPasses.getNumOccurrences()
            ? EnableScalarIRPasses
            : TM.getOptLevel() > CodeGenOpt::Less)
      addStraightLineScalarOptimizationPasses();

    if (EnableAMDGPUAliasAnalysis) {
      addPass(createAMDGPUAAWrapperPass());
      addPass(createExternalAAWrapperPass([](Pass &P, Function &,
                                             AAResults &AAR) {
        if (auto *WrapperPass = P.getAnalysisIfAvailable<AMDGPUAAWrapperPass>())
          AAR.addAAResult(WrapperPass->getResult());
      }));
    }
  }

  if (TM.getTargetTriple().getArch() == Triple::amdgcn) {
    // TODO: May want to move later or split into an early and late one.
    addPass(createAMDGPUCodeGenPreparePass());
  }

  TargetPassConfig::addIRPasses();

  // EarlyCSE is not always strong enough to clean up what LSR produces. For
  // example, GVN can combine
  //
  //   %0 = add %a, %b
  //   %1 = add %b, %a
  //
  // and
  //
  //   %0 = shl nsw %a, 2
  //   %1 = shl %a, 2
  //
  // but EarlyCSE can do neither of them.
  if (EnableScalarIRPasses.getNumOccurrences()
          ? EnableScalarIRPasses
          : TM.getOptLevel() > CodeGenOpt::Less)
    addEarlyCSEOrGVNPass();
}

void AMDGPUPassConfig::addCodeGenPrepare() {
  if (TM->getTargetTriple().getArch() == Triple::amdgcn)
    addPass(createAMDGPUAnnotateKernelFeaturesPass());

  if (TM->getTargetTriple().getArch() == Triple::amdgcn &&
      EnableLowerKernelArguments)
    addPass(createAMDGPULowerKernelArgumentsPass());

  addPass(&AMDGPUPerfHintAnalysisID);

  TargetPassConfig::addCodeGenPrepare();

  if (EnableLoadStoreVectorizer.getNumOccurrences()
          ? EnableLoadStoreVectorizer
          : TM->getOptLevel() > CodeGenOpt::Less)
    addPass(createLoadStoreVectorizerPass());

  // The LowerSwitch pass may introduce unreachable blocks that can cause
  // unexpected behavior for subsequent passes. Placing it here means these
  // blocks get cleaned up by UnreachableBlockElim, which is inserted next in
  // the pass flow.
  addPass(createLowerSwitchPass());
}

bool AMDGPUPassConfig::addPreISel() {
  addPass(createFlattenCFGPass());
  return false;
}

bool AMDGPUPassConfig::addInstSelector() {
  // Defer the verifier until FinalizeISel.
  addPass(createAMDGPUISelDag(&getAMDGPUTargetMachine(), getOptLevel()), false);
  return false;
}

bool AMDGPUPassConfig::addGCPasses() {
  // Do nothing. GC is not supported.
  return false;
}

//===----------------------------------------------------------------------===//
// R600 Pass Setup
//===----------------------------------------------------------------------===//

bool R600PassConfig::addPreISel() {
  AMDGPUPassConfig::addPreISel();

  if (EnableR600StructurizeCFG)
    addPass(createStructurizeCFGPass());
  return false;
}

bool R600PassConfig::addInstSelector() {
  addPass(createR600ISelDag(&getAMDGPUTargetMachine(), getOptLevel()));
  return false;
}

void R600PassConfig::addPreRegAlloc() {
  addPass(createR600VectorRegMerger());
}

void R600PassConfig::addPreSched2() {
  addPass(createR600EmitClauseMarkers(), false);
  if (EnableR600IfConvert)
    addPass(&IfConverterID, false);
  addPass(createR600ClauseMergePass(), false);
}

void R600PassConfig::addPreEmitPass() {
  addPass(createAMDGPUCFGStructurizerPass(), false);
  addPass(createR600ExpandSpecialInstrsPass(), false);
  addPass(&FinalizeMachineBundlesID, false);
  addPass(createR600Packetizer(), false);
  addPass(createR600ControlFlowFinalizer(), false);
}

TargetPassConfig *R600TargetMachine::createPassConfig(PassManagerBase &PM) {
  return new R600PassConfig(*this, PM);
}

//===----------------------------------------------------------------------===//
// GCN Pass Setup
//===----------------------------------------------------------------------===//

ScheduleDAGInstrs *GCNPassConfig::createMachineScheduler(
    MachineSchedContext *C) const {
  const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
  if (ST.enableSIScheduler())
    return createSIMachineScheduler(C);
  return createGCNMaxOccupancyMachineScheduler(C);
}

bool GCNPassConfig::addPreISel() {
  AMDGPUPassConfig::addPreISel();

  addPass(createAMDGPULateCodeGenPreparePass());
  if (EnableAtomicOptimizations) {
    addPass(createAMDGPUAtomicOptimizerPass());
  }

  // FIXME: We need to run a pass to propagate the attributes when calls are
  // supported.

  addPass(createSinkingPass());
  // Merge divergent exit nodes. StructurizeCFG won't recognize the multi-exit
  // regions formed by them.
  addPass(&AMDGPUUnifyDivergentExitNodesID);
  if (!LateCFGStructurize) {
    if (EnableStructurizerWorkarounds) {
      addPass(createFixIrreduciblePass());
      addPass(createUnifyLoopExitsPass());
    }
    addPass(createStructurizeCFGPass(false)); // true -> SkipUniformRegions
  }
  addPass(createAMDGPUAnnotateUniformValues());
  if (!LateCFGStructurize) {
    addPass(createSIAnnotateControlFlowPass());
  }
  addPass(createLCSSAPass());

  return false;
}

void GCNPassConfig::addMachineSSAOptimization() {
  TargetPassConfig::addMachineSSAOptimization();

  // We want to fold operands after PeepholeOptimizer has run (or as part of
  // it), because it will eliminate extra copies making it easier to fold the
  // real source operand. We want to eliminate dead instructions after, so that
  // we see fewer uses of the copies. We then need to clean up the dead
  // instructions leftover after the operands are folded as well.
  //
  // XXX - Can we get away without running DeadMachineInstructionElim again?
  addPass(&SIFoldOperandsID);
  if (EnableDPPCombine)
    addPass(&GCNDPPCombineID);
  addPass(&SILoadStoreOptimizerID);
  if (EnableSDWAPeephole.getNumOccurrences()
          ? EnableSDWAPeephole
          : TM->getOptLevel() > CodeGenOpt::Less) {
    addPass(&SIPeepholeSDWAID);
    addPass(&EarlyMachineLICMID);
    addPass(&MachineCSEID);
    addPass(&SIFoldOperandsID);
  }
  addPass(&DeadMachineInstructionElimID);
  addPass(createSIShrinkInstructionsPass());
}

bool GCNPassConfig::addILPOpts() {
  if (EnableEarlyIfConversion)
    addPass(&EarlyIfConverterID);

  TargetPassConfig::addILPOpts();
  return false;
}

bool GCNPassConfig::addInstSelector() {
  AMDGPUPassConfig::addInstSelector();
  addPass(&SIFixSGPRCopiesID);
  addPass(createSILowerI1CopiesPass());
  return false;
}

bool GCNPassConfig::addIRTranslator() {
  addPass(new IRTranslator(getOptLevel()));
  return false;
}

void GCNPassConfig::addPreLegalizeMachineIR() {
  bool IsOptNone = getOptLevel() == CodeGenOpt::None;
  addPass(createAMDGPUPreLegalizeCombiner(IsOptNone));
  addPass(new Localizer());
}

bool GCNPassConfig::addLegalizeMachineIR() {
  addPass(new Legalizer());
  return false;
}

void GCNPassConfig::addPreRegBankSelect() {
  bool IsOptNone = getOptLevel() == CodeGenOpt::None;
  addPass(createAMDGPUPostLegalizeCombiner(IsOptNone));
}

bool GCNPassConfig::addRegBankSelect() {
  addPass(new RegBankSelect());
  return false;
}

void GCNPassConfig::addPreGlobalInstructionSelect() {
  bool IsOptNone = getOptLevel() == CodeGenOpt::None;
  addPass(createAMDGPURegBankCombiner(IsOptNone));
}

bool GCNPassConfig::addGlobalInstructionSelect() {
  addPass(new InstructionSelect(getOptLevel()));
  return false;
}

void GCNPassConfig::addPreRegAlloc() {
  if (LateCFGStructurize) {
    addPass(createAMDGPUMachineCFGStructurizerPass());
  }
}

void GCNPassConfig::addFastRegAlloc() {
  // FIXME: We have to disable the verifier here because of PHIElimination +
  // TwoAddressInstructions disabling it.

  // This must be run immediately after phi elimination and before
  // TwoAddressInstructions, otherwise the processing of the tied operand of
  // SI_ELSE will introduce a copy of the tied operand source after the else.
  insertPass(&PHIEliminationID, &SILowerControlFlowID, false);

  insertPass(&TwoAddressInstructionPassID, &SIWholeQuadModeID);
  insertPass(&TwoAddressInstructionPassID, &SIPreAllocateWWMRegsID);

  TargetPassConfig::addFastRegAlloc();
}

void GCNPassConfig::addOptimizedRegAlloc() {
  // Allow the scheduler to run before SIWholeQuadMode inserts exec manipulation
  // instructions that cause scheduling barriers.
  insertPass(&MachineSchedulerID, &SIWholeQuadModeID);
  insertPass(&MachineSchedulerID, &SIPreAllocateWWMRegsID);

  if (OptExecMaskPreRA)
    insertPass(&MachineSchedulerID, &SIOptimizeExecMaskingPreRAID);

  // This is not an essential optimization and it has a noticeable impact on
  // compilation time, so we only enable it from O2.
  if (TM->getOptLevel() > CodeGenOpt::Less)
    insertPass(&MachineSchedulerID, &SIFormMemoryClausesID);

  // This must be run immediately after phi elimination and before
  // TwoAddressInstructions, otherwise the processing of the tied operand of
  // SI_ELSE will introduce a copy of the tied operand source after the else.
  insertPass(&PHIEliminationID, &SILowerControlFlowID, false);

  if (EnableDCEInRA)
    insertPass(&DetectDeadLanesID, &DeadMachineInstructionElimID);

  TargetPassConfig::addOptimizedRegAlloc();
}

bool GCNPassConfig::addPreRewrite() {
  if (EnableRegReassign)
    addPass(&GCNNSAReassignID);
  return true;
}

void GCNPassConfig::addPostRegAlloc() {
  addPass(&SIFixVGPRCopiesID);
  if (getOptLevel() > CodeGenOpt::None)
    addPass(&SIOptimizeExecMaskingID);
  TargetPassConfig::addPostRegAlloc();

  // Equivalent of PEI for SGPRs.
  addPass(&SILowerSGPRSpillsID);
}

void GCNPassConfig::addPreSched2() {
  addPass(&SIPostRABundlerID);
}

void GCNPassConfig::addPreEmitPass() {
  addPass(createSIMemoryLegalizerPass());
  addPass(createSIInsertWaitcntsPass());
  addPass(createSIShrinkInstructionsPass());
  addPass(createSIModeRegisterPass());

  if (getOptLevel() > CodeGenOpt::None)
    addPass(&SIInsertHardClausesID);

  addPass(&SILateBranchLoweringPassID);
  if (getOptLevel() > CodeGenOpt::None)
    addPass(&SIPreEmitPeepholeID);
  // The hazard recognizer that runs as part of the post-ra scheduler is not
  // guaranteed to be able to handle all hazards correctly. This is because if
  // there are multiple scheduling regions in a basic block, the regions are
  // scheduled bottom up, so when we begin to schedule a region we don't know
  // what instructions were emitted directly before it.
  //
  // Here we add a stand-alone hazard recognizer pass which can handle all
  // cases.
  addPass(&PostRAHazardRecognizerID);
  addPass(&BranchRelaxationPassID);
}

TargetPassConfig *GCNTargetMachine::createPassConfig(PassManagerBase &PM) {
  return new GCNPassConfig(*this, PM);
}

yaml::MachineFunctionInfo *GCNTargetMachine::createDefaultFuncInfoYAML() const {
  return new yaml::SIMachineFunctionInfo();
}

yaml::MachineFunctionInfo *
GCNTargetMachine::convertFuncInfoToYAML(const MachineFunction &MF) const {
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return new yaml::SIMachineFunctionInfo(
      *MFI, *MF.getSubtarget().getRegisterInfo(), MF);
}

bool GCNTargetMachine::parseMachineFunctionInfo(
    const yaml::MachineFunctionInfo &MFI_, PerFunctionMIParsingState &PFS,
    SMDiagnostic &Error, SMRange &SourceRange) const {
  const yaml::SIMachineFunctionInfo &YamlMFI =
      reinterpret_cast<const yaml::SIMachineFunctionInfo &>(MFI_);
  MachineFunction &MF = PFS.MF;
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  if (MFI->initializeBaseYamlFields(YamlMFI, MF, PFS, Error, SourceRange))
    return true;

  if (MFI->Occupancy == 0) {
    // Fix up the subtarget-dependent default value.
    const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
    MFI->Occupancy = ST.computeOccupancy(MF.getFunction(), MFI->getLDSSize());
  }

  auto parseRegister = [&](const yaml::StringValue &RegName, Register &RegVal) {
    Register TempReg;
    if (parseNamedRegisterReference(PFS, TempReg, RegName.Value, Error)) {
      SourceRange = RegName.SourceRange;
      return true;
    }
    RegVal = TempReg;

    return false;
  };

  auto diagnoseRegisterClass = [&](const yaml::StringValue &RegName) {
    // Create a diagnostic for the register string literal.
    const MemoryBuffer &Buffer =
        *PFS.SM->getMemoryBuffer(PFS.SM->getMainFileID());
    Error = SMDiagnostic(*PFS.SM, SMLoc(), Buffer.getBufferIdentifier(), 1,
                         RegName.Value.size(), SourceMgr::DK_Error,
                         "incorrect register class for field", RegName.Value,
                         None, None);
    SourceRange = RegName.SourceRange;
    return true;
  };

  if (parseRegister(YamlMFI.ScratchRSrcReg, MFI->ScratchRSrcReg) ||
      parseRegister(YamlMFI.FrameOffsetReg, MFI->FrameOffsetReg) ||
      parseRegister(YamlMFI.StackPtrOffsetReg, MFI->StackPtrOffsetReg))
    return true;

  if (MFI->ScratchRSrcReg != AMDGPU::PRIVATE_RSRC_REG &&
      !AMDGPU::SGPR_128RegClass.contains(MFI->ScratchRSrcReg)) {
    return diagnoseRegisterClass(YamlMFI.ScratchRSrcReg);
  }

  if (MFI->FrameOffsetReg != AMDGPU::FP_REG &&
      !AMDGPU::SGPR_32RegClass.contains(MFI->FrameOffsetReg)) {
    return diagnoseRegisterClass(YamlMFI.FrameOffsetReg);
  }

  if (MFI->StackPtrOffsetReg != AMDGPU::SP_REG &&
      !AMDGPU::SGPR_32RegClass.contains(MFI->StackPtrOffsetReg)) {
    return diagnoseRegisterClass(YamlMFI.StackPtrOffsetReg);
  }

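  // Parse one optional entry of the MIR argument-info table: either a named
  // register (validated against the expected register class) or a stack
  // offset, plus an optional mask, accumulating the user/system SGPR counts
  // the argument implies.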
  auto parseAndCheckArgument = [&](const Optional<yaml::SIArgument> &A,
                                   const TargetRegisterClass &RC,
                                   ArgDescriptor &Arg, unsigned UserSGPRs,
                                   unsigned SystemSGPRs) {
    // Skip parsing if it's not present.
    if (!A)
      return false;

    if (A->IsRegister) {
      Register Reg;
      if (parseNamedRegisterReference(PFS, Reg, A->RegisterName.Value, Error)) {
        SourceRange = A->RegisterName.SourceRange;
        return true;
      }
      if (!RC.contains(Reg))
        return diagnoseRegisterClass(A->RegisterName);
      Arg = ArgDescriptor::createRegister(Reg);
    } else
      Arg = ArgDescriptor::createStack(A->StackOffset);
    // Check and apply the optional mask.
    if (A->Mask)
      Arg = ArgDescriptor::createArg(Arg, A->Mask.getValue());

    MFI->NumUserSGPRs += UserSGPRs;
    MFI->NumSystemSGPRs += SystemSGPRs;
    return false;
  };

  if (YamlMFI.ArgInfo &&
      (parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentBuffer,
                             AMDGPU::SGPR_128RegClass,
                             MFI->ArgInfo.PrivateSegmentBuffer, 4, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->DispatchPtr,
                             AMDGPU::SReg_64RegClass, MFI->ArgInfo.DispatchPtr,
                             2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->QueuePtr, AMDGPU::SReg_64RegClass,
                             MFI->ArgInfo.QueuePtr, 2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->KernargSegmentPtr,
                             AMDGPU::SReg_64RegClass,
                             MFI->ArgInfo.KernargSegmentPtr, 2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->DispatchID,
                             AMDGPU::SReg_64RegClass, MFI->ArgInfo.DispatchID,
                             2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->FlatScratchInit,
                             AMDGPU::SReg_64RegClass,
                             MFI->ArgInfo.FlatScratchInit, 2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentSize,
                             AMDGPU::SGPR_32RegClass,
                             MFI->ArgInfo.PrivateSegmentSize, 0, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDX,
                             AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDX,
                             0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDY,
                             AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDY,
                             0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDZ,
                             AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDZ,
                             0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupInfo,
                             AMDGPU::SGPR_32RegClass,
                             MFI->ArgInfo.WorkGroupInfo, 0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentWaveByteOffset,
                             AMDGPU::SGPR_32RegClass,
                             MFI->ArgInfo.PrivateSegmentWaveByteOffset, 0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->ImplicitArgPtr,
                             AMDGPU::SReg_64RegClass,
                             MFI->ArgInfo.ImplicitArgPtr, 0, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->ImplicitBufferPtr,
                             AMDGPU::SReg_64RegClass,
                             MFI->ArgInfo.ImplicitBufferPtr, 2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDX,
                             AMDGPU::VGPR_32RegClass,
                             MFI->ArgInfo.WorkItemIDX, 0, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDY,
                             AMDGPU::VGPR_32RegClass,
                             MFI->ArgInfo.WorkItemIDY, 0, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDZ,
                             AMDGPU::VGPR_32RegClass,
                             MFI->ArgInfo.WorkItemIDZ, 0, 0)))
    return true;

  MFI->Mode.IEEE = YamlMFI.Mode.IEEE;
  MFI->Mode.DX10Clamp = YamlMFI.Mode.DX10Clamp;
  MFI->Mode.FP32InputDenormals = YamlMFI.Mode.FP32InputDenormals;
  MFI->Mode.FP32OutputDenormals = YamlMFI.Mode.FP32OutputDenormals;
  MFI->Mode.FP64FP16InputDenormals = YamlMFI.Mode.FP64FP16InputDenormals;
  MFI->Mode.FP64FP16OutputDenormals = YamlMFI.Mode.FP64FP16OutputDenormals;

  return false;
}