1 //===-- NVPTXTargetMachine.cpp - Define TargetMachine for NVPTX -----------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // Top-level implementation for the NVPTX target.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "NVPTXTargetMachine.h"
14 #include "NVPTX.h"
15 #include "NVPTXAliasAnalysis.h"
16 #include "NVPTXAllocaHoisting.h"
17 #include "NVPTXAtomicLower.h"
18 #include "NVPTXCtorDtorLowering.h"
19 #include "NVPTXLowerAggrCopies.h"
20 #include "NVPTXMachineFunctionInfo.h"
21 #include "NVPTXTargetObjectFile.h"
22 #include "NVPTXTargetTransformInfo.h"
23 #include "TargetInfo/NVPTXTargetInfo.h"
24 #include "llvm/ADT/STLExtras.h"
25 #include "llvm/Analysis/TargetTransformInfo.h"
26 #include "llvm/CodeGen/Passes.h"
27 #include "llvm/CodeGen/TargetPassConfig.h"
28 #include "llvm/IR/IntrinsicsNVPTX.h"
29 #include "llvm/MC/TargetRegistry.h"
30 #include "llvm/Pass.h"
31 #include "llvm/Passes/PassBuilder.h"
32 #include "llvm/Support/CommandLine.h"
33 #include "llvm/Target/TargetMachine.h"
34 #include "llvm/Target/TargetOptions.h"
35 #include "llvm/TargetParser/Triple.h"
36 #include "llvm/Transforms/Scalar.h"
37 #include "llvm/Transforms/Scalar/GVN.h"
38 #include "llvm/Transforms/Vectorize/LoadStoreVectorizer.h"
39 #include <cassert>
40 #include <optional>
41 #include <string>
42 
43 using namespace llvm;
44 
// The LoadStoreVectorizer (LSV) is still relatively new; this switch lets us
// turn it off in case we encounter (or suspect) a bug.
47 static cl::opt<bool>
48     DisableLoadStoreVectorizer("disable-nvptx-load-store-vectorizer",
49                                cl::desc("Disable load/store vectorizer"),
50                                cl::init(false), cl::Hidden);
51 
52 // TODO: Remove this flag when we are confident with no regressions.
53 static cl::opt<bool> DisableRequireStructuredCFG(
54     "disable-nvptx-require-structured-cfg",
55     cl::desc("Transitional flag to turn off NVPTX's requirement on preserving "
56              "structured CFG. The requirement should be disabled only when "
57              "unexpected regressions happen."),
58     cl::init(false), cl::Hidden);
59 
60 static cl::opt<bool> UseShortPointersOpt(
61     "nvptx-short-ptr",
62     cl::desc(
63         "Use 32-bit pointers for accessing const/local/shared address spaces."),
64     cl::init(false), cl::Hidden);
65 
66 // FIXME: intended as a temporary debugging aid. Should be removed before it
67 // makes it into the LLVM-17 release.
68 static cl::opt<bool>
69     ExitOnUnreachable("nvptx-exit-on-unreachable",
70                       cl::desc("Lower 'unreachable' as 'exit' instruction."),
71                       cl::init(true), cl::Hidden);
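
// All of the switches above are hidden cl::opts: they show up under
// -help-hidden rather than -help, and can be passed to any tool that links in
// the NVPTX backend, for example:
//   llc -mtriple=nvptx64-nvidia-cuda -nvptx-short-ptr kernel.ll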
72 
73 namespace llvm {
74 
75 void initializeGenericToNVVMLegacyPassPass(PassRegistry &);
76 void initializeNVPTXAllocaHoistingPass(PassRegistry &);
77 void initializeNVPTXAssignValidGlobalNamesPass(PassRegistry &);
78 void initializeNVPTXAtomicLowerPass(PassRegistry &);
79 void initializeNVPTXCtorDtorLoweringLegacyPass(PassRegistry &);
80 void initializeNVPTXLowerAggrCopiesPass(PassRegistry &);
81 void initializeNVPTXLowerAllocaPass(PassRegistry &);
82 void initializeNVPTXLowerUnreachablePass(PassRegistry &);
84 void initializeNVPTXLowerArgsPass(PassRegistry &);
85 void initializeNVPTXProxyRegErasurePass(PassRegistry &);
86 void initializeNVVMIntrRangePass(PassRegistry &);
87 void initializeNVVMReflectPass(PassRegistry &);
88 void initializeNVPTXAAWrapperPassPass(PassRegistry &);
89 void initializeNVPTXExternalAAWrapperPass(PassRegistry &);
90 
91 } // end namespace llvm
92 
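// Called by clients (typically via InitializeAllTargets()) to register the
// 32- and 64-bit NVPTX target machines and to initialize the legacy passes
// this backend depends on.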
93 extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeNVPTXTarget() {
94   // Register the target.
95   RegisterTargetMachine<NVPTXTargetMachine32> X(getTheNVPTXTarget32());
96   RegisterTargetMachine<NVPTXTargetMachine64> Y(getTheNVPTXTarget64());
97 
98   PassRegistry &PR = *PassRegistry::getPassRegistry();
99   // FIXME: This pass is really intended to be invoked during IR optimization,
100   // but it's very NVPTX-specific.
101   initializeNVVMReflectPass(PR);
102   initializeNVVMIntrRangePass(PR);
103   initializeGenericToNVVMLegacyPassPass(PR);
104   initializeNVPTXAllocaHoistingPass(PR);
105   initializeNVPTXAssignValidGlobalNamesPass(PR);
106   initializeNVPTXAtomicLowerPass(PR);
107   initializeNVPTXLowerArgsPass(PR);
108   initializeNVPTXLowerAllocaPass(PR);
109   initializeNVPTXLowerUnreachablePass(PR);
110   initializeNVPTXCtorDtorLoweringLegacyPass(PR);
111   initializeNVPTXLowerAggrCopiesPass(PR);
112   initializeNVPTXProxyRegErasurePass(PR);
113   initializeNVPTXDAGToDAGISelPass(PR);
114   initializeNVPTXAAWrapperPassPass(PR);
115   initializeNVPTXExternalAAWrapperPass(PR);
116 }
117 
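// Build the DataLayout string for the target. For example, the default 64-bit
// configuration produces
//   "e-i64:64-i128:128-v16:16-v32:32-n16:32:64",
// a 32-bit target additionally gets "-p:32:32", and on 64-bit targets
// -nvptx-short-ptr narrows the shared/const/local (p3/p4/p5) pointers to
// 32 bits.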
118 static std::string computeDataLayout(bool is64Bit, bool UseShortPointers) {
119   std::string Ret = "e";
120 
121   if (!is64Bit)
122     Ret += "-p:32:32";
123   else if (UseShortPointers)
124     Ret += "-p3:32:32-p4:32:32-p5:32:32";
125 
126   Ret += "-i64:64-i128:128-v16:16-v32:32-n16:32:64";
127 
128   return Ret;
129 }
130 
131 NVPTXTargetMachine::NVPTXTargetMachine(const Target &T, const Triple &TT,
132                                        StringRef CPU, StringRef FS,
133                                        const TargetOptions &Options,
134                                        std::optional<Reloc::Model> RM,
135                                        std::optional<CodeModel::Model> CM,
136                                        CodeGenOpt::Level OL, bool is64bit)
    // The PIC relocation model is used regardless of what the client has
    // specified, as it is the only relocation model currently supported.
139     : LLVMTargetMachine(T, computeDataLayout(is64bit, UseShortPointersOpt), TT,
140                         CPU, FS, Options, Reloc::PIC_,
141                         getEffectiveCodeModel(CM, CodeModel::Small), OL),
142       is64bit(is64bit), UseShortPointers(UseShortPointersOpt),
143       TLOF(std::make_unique<NVPTXTargetObjectFile>()),
144       Subtarget(TT, std::string(CPU), std::string(FS), *this),
145       StrPool(StrAlloc) {
146   if (TT.getOS() == Triple::NVCL)
147     drvInterface = NVPTX::NVCL;
148   else
149     drvInterface = NVPTX::CUDA;
150   if (!DisableRequireStructuredCFG)
151     setRequiresStructuredCFG(true);
152   initAsmInfo();
153 }
154 
155 NVPTXTargetMachine::~NVPTXTargetMachine() = default;
156 
157 void NVPTXTargetMachine32::anchor() {}
158 
159 NVPTXTargetMachine32::NVPTXTargetMachine32(const Target &T, const Triple &TT,
160                                            StringRef CPU, StringRef FS,
161                                            const TargetOptions &Options,
162                                            std::optional<Reloc::Model> RM,
163                                            std::optional<CodeModel::Model> CM,
164                                            CodeGenOpt::Level OL, bool JIT)
165     : NVPTXTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, false) {}
166 
167 void NVPTXTargetMachine64::anchor() {}
168 
169 NVPTXTargetMachine64::NVPTXTargetMachine64(const Target &T, const Triple &TT,
170                                            StringRef CPU, StringRef FS,
171                                            const TargetOptions &Options,
172                                            std::optional<Reloc::Model> RM,
173                                            std::optional<CodeModel::Model> CM,
174                                            CodeGenOpt::Level OL, bool JIT)
175     : NVPTXTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, true) {}
176 
177 namespace {
178 
179 class NVPTXPassConfig : public TargetPassConfig {
180 public:
181   NVPTXPassConfig(NVPTXTargetMachine &TM, PassManagerBase &PM)
182       : TargetPassConfig(TM, PM) {}
183 
184   NVPTXTargetMachine &getNVPTXTargetMachine() const {
185     return getTM<NVPTXTargetMachine>();
186   }
187 
188   void addIRPasses() override;
189   bool addInstSelector() override;
190   void addPreRegAlloc() override;
191   void addPostRegAlloc() override;
192   void addMachineSSAOptimization() override;
193 
194   FunctionPass *createTargetRegisterAllocator(bool) override;
195   void addFastRegAlloc() override;
196   void addOptimizedRegAlloc() override;
197 
198   bool addRegAssignAndRewriteFast() override {
199     llvm_unreachable("should not be used");
200   }
201 
202   bool addRegAssignAndRewriteOptimized() override {
203     llvm_unreachable("should not be used");
204   }
205 
206 private:
207   // If the opt level is aggressive, add GVN; otherwise, add EarlyCSE. This
208   // function is only called in opt mode.
209   void addEarlyCSEOrGVNPass();
210 
211   // Add passes that propagate special memory spaces.
212   void addAddressSpaceInferencePasses();
213 
214   // Add passes that perform straight-line scalar optimizations.
215   void addStraightLineScalarOptimizationPasses();
216 };
217 
218 } // end anonymous namespace
219 
220 TargetPassConfig *NVPTXTargetMachine::createPassConfig(PassManagerBase &PM) {
221   return new NVPTXPassConfig(*this, PM);
222 }
223 
224 MachineFunctionInfo *NVPTXTargetMachine::createMachineFunctionInfo(
225     BumpPtrAllocator &Allocator, const Function &F,
226     const TargetSubtargetInfo *STI) const {
227   return NVPTXMachineFunctionInfo::create<NVPTXMachineFunctionInfo>(Allocator,
228                                                                     F, STI);
229 }
230 
231 void NVPTXTargetMachine::registerDefaultAliasAnalyses(AAManager &AAM) {
232   AAM.registerFunctionAnalysis<NVPTXAA>();
233 }
234 
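// Hook the NVPTX-specific passes into the new pass manager so that the textual
// pipeline names below are recognized by tools that use PassBuilder, e.g.
// (illustrative invocations):
//   opt -passes='generic-to-nvvm,nvptx-lower-ctor-dtor' ...
//   opt -passes='nvvm-reflect' -aa-pipeline='nvptx-aa' ...
// The pipeline-start callback also schedules NVVMReflect at the beginning of
// the default optimization pipelines.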
235 void NVPTXTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
236   PB.registerPipelineParsingCallback(
237       [](StringRef PassName, FunctionPassManager &PM,
238          ArrayRef<PassBuilder::PipelineElement>) {
239         if (PassName == "nvvm-reflect") {
240           PM.addPass(NVVMReflectPass());
241           return true;
242         }
243         if (PassName == "nvvm-intr-range") {
244           PM.addPass(NVVMIntrRangePass());
245           return true;
246         }
247         return false;
248       });
249 
250   PB.registerAnalysisRegistrationCallback([](FunctionAnalysisManager &FAM) {
251     FAM.registerPass([&] { return NVPTXAA(); });
252   });
253 
254   PB.registerParseAACallback([](StringRef AAName, AAManager &AAM) {
255     if (AAName == "nvptx-aa") {
256       AAM.registerFunctionAnalysis<NVPTXAA>();
257       return true;
258     }
259     return false;
260   });
261 
262   PB.registerPipelineParsingCallback(
263       [](StringRef PassName, ModulePassManager &PM,
264          ArrayRef<PassBuilder::PipelineElement>) {
265         if (PassName == "nvptx-lower-ctor-dtor") {
266           PM.addPass(NVPTXCtorDtorLoweringPass());
267           return true;
268         }
269         if (PassName == "generic-to-nvvm") {
270           PM.addPass(GenericToNVVMPass());
271           return true;
272         }
273         return false;
274       });
275 
276   PB.registerPipelineStartEPCallback(
277       [this](ModulePassManager &PM, OptimizationLevel Level) {
278         FunctionPassManager FPM;
279         FPM.addPass(NVVMReflectPass(Subtarget.getSmVersion()));
280         // FIXME: NVVMIntrRangePass is causing numerical discrepancies,
281         // investigate and re-enable.
282         // FPM.addPass(NVVMIntrRangePass(Subtarget.getSmVersion()));
283         PM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
284       });
285 }
286 
287 TargetTransformInfo
288 NVPTXTargetMachine::getTargetTransformInfo(const Function &F) const {
289   return TargetTransformInfo(NVPTXTTIImpl(this, F));
290 }
291 
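// If V is a call to one of the llvm.nvvm.isspacep.* intrinsics, report the
// address space its pointer operand is known to live in when the check holds.
// For example, given
//   %is.global = call i1 @llvm.nvvm.isspacep.global(ptr %p)
// this returns {%p, ADDRESS_SPACE_GLOBAL}, which InferAddressSpaces can use
// (e.g. via a dominating llvm.assume on the result) to refine %p's address
// space.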
292 std::pair<const Value *, unsigned>
293 NVPTXTargetMachine::getPredicatedAddrSpace(const Value *V) const {
294   if (auto *II = dyn_cast<IntrinsicInst>(V)) {
295     switch (II->getIntrinsicID()) {
296     case Intrinsic::nvvm_isspacep_const:
297       return std::make_pair(II->getArgOperand(0), llvm::ADDRESS_SPACE_CONST);
298     case Intrinsic::nvvm_isspacep_global:
299       return std::make_pair(II->getArgOperand(0), llvm::ADDRESS_SPACE_GLOBAL);
300     case Intrinsic::nvvm_isspacep_local:
301       return std::make_pair(II->getArgOperand(0), llvm::ADDRESS_SPACE_LOCAL);
302     case Intrinsic::nvvm_isspacep_shared:
303     case Intrinsic::nvvm_isspacep_shared_cluster:
304       return std::make_pair(II->getArgOperand(0), llvm::ADDRESS_SPACE_SHARED);
305     default:
306       break;
307     }
308   }
309   return std::make_pair(nullptr, -1);
310 }
311 
312 void NVPTXPassConfig::addEarlyCSEOrGVNPass() {
313   if (getOptLevel() == CodeGenOpt::Aggressive)
314     addPass(createGVNPass());
315   else
316     addPass(createEarlyCSEPass());
317 }
318 
319 void NVPTXPassConfig::addAddressSpaceInferencePasses() {
  // NVPTXLowerArgs emits allocas for byval parameters, which can often
  // be eliminated by SROA.
322   addPass(createSROAPass());
323   addPass(createNVPTXLowerAllocaPass());
324   addPass(createInferAddressSpacesPass());
325   addPass(createNVPTXAtomicLowerPass());
326 }
327 
328 void NVPTXPassConfig::addStraightLineScalarOptimizationPasses() {
329   addPass(createSeparateConstOffsetFromGEPPass());
330   addPass(createSpeculativeExecutionPass());
  // ReassociateGEPs exposes more opportunities for SLSR. See
  // the example in reassociate-geps-and-slsr.ll.
333   addPass(createStraightLineStrengthReducePass());
  // SeparateConstOffsetFromGEP and SLSR create common expressions which GVN or
  // EarlyCSE can reuse. GVN generates significantly better code than EarlyCSE
  // for some of our benchmarks.
337   addEarlyCSEOrGVNPass();
338   // Run NaryReassociate after EarlyCSE/GVN to be more effective.
339   addPass(createNaryReassociatePass());
340   // NaryReassociate on GEPs creates redundant common expressions, so run
341   // EarlyCSE after it.
342   addPass(createEarlyCSEPass());
343 }
344 
345 void NVPTXPassConfig::addIRPasses() {
346   // The following passes are known to not play well with virtual regs hanging
347   // around after register allocation (which in our case, is *all* registers).
348   // We explicitly disable them here.  We do, however, need some functionality
349   // of the PrologEpilogCodeInserter pass, so we emulate that behavior in the
350   // NVPTXPrologEpilog pass (see NVPTXPrologEpilogPass.cpp).
351   disablePass(&PrologEpilogCodeInserterID);
352   disablePass(&MachineLateInstrsCleanupID);
353   disablePass(&MachineCopyPropagationID);
354   disablePass(&TailDuplicateID);
355   disablePass(&StackMapLivenessID);
356   disablePass(&LiveDebugValuesID);
357   disablePass(&PostRAMachineSinkingID);
358   disablePass(&PostRASchedulerID);
359   disablePass(&FuncletLayoutID);
360   disablePass(&PatchableFunctionID);
361   disablePass(&ShrinkWrapID);
362 
363   addPass(createNVPTXAAWrapperPass());
364   addPass(createExternalAAWrapperPass([](Pass &P, Function &, AAResults &AAR) {
365     if (auto *WrapperPass = P.getAnalysisIfAvailable<NVPTXAAWrapperPass>())
366       AAR.addAAResult(WrapperPass->getResult());
367   }));
368 
369   // NVVMReflectPass is added in addEarlyAsPossiblePasses, so hopefully running
370   // it here does nothing.  But since we need it for correctness when lowering
371   // to NVPTX, run it here too, in case whoever built our pass pipeline didn't
372   // call addEarlyAsPossiblePasses.
373   const NVPTXSubtarget &ST = *getTM<NVPTXTargetMachine>().getSubtargetImpl();
374   addPass(createNVVMReflectPass(ST.getSmVersion()));
375 
376   if (getOptLevel() != CodeGenOpt::None)
377     addPass(createNVPTXImageOptimizerPass());
378   addPass(createNVPTXAssignValidGlobalNamesPass());
379   addPass(createGenericToNVVMLegacyPass());
380 
381   // NVPTXLowerArgs is required for correctness and should be run right
382   // before the address space inference passes.
383   addPass(createNVPTXLowerArgsPass());
384   if (getOptLevel() != CodeGenOpt::None) {
385     addAddressSpaceInferencePasses();
386     addStraightLineScalarOptimizationPasses();
387   }
388 
389   addPass(createAtomicExpandPass());
390   addPass(createNVPTXCtorDtorLoweringLegacyPass());
391 
392   // === LSR and other generic IR passes ===
393   TargetPassConfig::addIRPasses();
394   // EarlyCSE is not always strong enough to clean up what LSR produces. For
395   // example, GVN can combine
396   //
397   //   %0 = add %a, %b
398   //   %1 = add %b, %a
399   //
400   // and
401   //
402   //   %0 = shl nsw %a, 2
403   //   %1 = shl %a, 2
404   //
405   // but EarlyCSE can do neither of them.
406   if (getOptLevel() != CodeGenOpt::None) {
407     addEarlyCSEOrGVNPass();
408     if (!DisableLoadStoreVectorizer)
409       addPass(createLoadStoreVectorizerPass());
410     addPass(createSROAPass());
411   }
412 
413   if (ExitOnUnreachable)
414     addPass(createNVPTXLowerUnreachablePass());
415 }
416 
417 bool NVPTXPassConfig::addInstSelector() {
418   const NVPTXSubtarget &ST = *getTM<NVPTXTargetMachine>().getSubtargetImpl();
419 
420   addPass(createLowerAggrCopies());
421   addPass(createAllocaHoisting());
422   addPass(createNVPTXISelDag(getNVPTXTargetMachine(), getOptLevel()));
423 
424   if (!ST.hasImageHandles())
425     addPass(createNVPTXReplaceImageHandlesPass());
426 
427   return false;
428 }
429 
430 void NVPTXPassConfig::addPreRegAlloc() {
431   // Remove Proxy Register pseudo instructions used to keep `callseq_end` alive.
432   addPass(createNVPTXProxyRegErasurePass());
433 }
434 
435 void NVPTXPassConfig::addPostRegAlloc() {
436   addPass(createNVPTXPrologEpilogPass());
437   if (getOptLevel() != CodeGenOpt::None) {
    // NVPTXPrologEpilogPass calculates frame object offsets and replaces frame
    // indices with the VRFrame register. NVPTXPeephole needs to run after that
    // and will replace VRFrame with VRFrameLocal when possible.
441     addPass(createNVPTXPeephole());
442   }
443 }
444 
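// PTX is a virtual-register ISA: the real register allocation happens later in
// ptxas, so this backend runs no LLVM register allocator and keeps everything
// in virtual registers through to PTX emission.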
445 FunctionPass *NVPTXPassConfig::createTargetRegisterAllocator(bool) {
446   return nullptr; // No reg alloc
447 }
448 
449 void NVPTXPassConfig::addFastRegAlloc() {
450   addPass(&PHIEliminationID);
451   addPass(&TwoAddressInstructionPassID);
452 }
453 
454 void NVPTXPassConfig::addOptimizedRegAlloc() {
455   addPass(&ProcessImplicitDefsID);
456   addPass(&LiveVariablesID);
457   addPass(&MachineLoopInfoID);
458   addPass(&PHIEliminationID);
459 
460   addPass(&TwoAddressInstructionPassID);
461   addPass(&RegisterCoalescerID);
462 
463   // PreRA instruction scheduling.
464   if (addPass(&MachineSchedulerID))
465     printAndVerify("After Machine Scheduling");
466 
467   addPass(&StackSlotColoringID);
468 
469   // FIXME: Needs physical registers
470   // addPass(&MachineLICMID);
471 
472   printAndVerify("After StackSlotColoring");
473 }
474 
475 void NVPTXPassConfig::addMachineSSAOptimization() {
  // Pre-RA tail duplication.
477   if (addPass(&EarlyTailDuplicateID))
478     printAndVerify("After Pre-RegAlloc TailDuplicate");
479 
480   // Optimize PHIs before DCE: removing dead PHI cycles may make more
481   // instructions dead.
482   addPass(&OptimizePHIsID);
483 
484   // This pass merges large allocas. StackSlotColoring is a different pass
485   // which merges spill slots.
486   addPass(&StackColoringID);
487 
488   // If the target requests it, assign local variables to stack slots relative
489   // to one another and simplify frame index references where possible.
490   addPass(&LocalStackSlotAllocationID);
491 
492   // With optimization, dead code should already be eliminated. However
493   // there is one known exception: lowered code for arguments that are only
494   // used by tail calls, where the tail calls reuse the incoming stack
495   // arguments directly (see t11 in test/CodeGen/X86/sibcall.ll).
496   addPass(&DeadMachineInstructionElimID);
497   printAndVerify("After codegen DCE pass");
498 
499   // Allow targets to insert passes that improve instruction level parallelism,
500   // like if-conversion. Such passes will typically need dominator trees and
501   // loop info, just like LICM and CSE below.
502   if (addILPOpts())
503     printAndVerify("After ILP optimizations");
504 
505   addPass(&EarlyMachineLICMID);
506   addPass(&MachineCSEID);
507 
508   addPass(&MachineSinkingID);
509   printAndVerify("After Machine LICM, CSE and Sinking passes");
510 
511   addPass(&PeepholeOptimizerID);
512   printAndVerify("After codegen peephole optimization pass");
513 }
514