//===-- NVPTXTargetMachine.cpp - Define TargetMachine for NVPTX -----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Top-level implementation for the NVPTX target.
//
//===----------------------------------------------------------------------===//

#include "NVPTXTargetMachine.h"
#include "NVPTX.h"
#include "NVPTXAllocaHoisting.h"
#include "NVPTXAtomicLower.h"
#include "NVPTXLowerAggrCopies.h"
#include "NVPTXTargetObjectFile.h"
#include "NVPTXTargetTransformInfo.h"
#include "TargetInfo/NVPTXTargetInfo.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/Triple.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IntrinsicsNVPTX.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/MC/TargetRegistry.h"
#include "llvm/Pass.h"
#include "llvm/Passes/PassBuilder.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
#include "llvm/Transforms/IPO/PassManagerBuilder.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Scalar/GVN.h"
#include "llvm/Transforms/Vectorize.h"
#include <cassert>
#include <string>

using namespace llvm;

// LSV is still relatively new; this switch lets us turn it off in case we
// encounter (or suspect) a bug.
static cl::opt<bool>
    DisableLoadStoreVectorizer("disable-nvptx-load-store-vectorizer",
                               cl::desc("Disable load/store vectorizer"),
                               cl::init(false), cl::Hidden);

// TODO: Remove this flag when we are confident with no regressions.
static cl::opt<bool> DisableRequireStructuredCFG(
    "disable-nvptx-require-structured-cfg",
    cl::desc("Transitional flag to turn off NVPTX's requirement on preserving "
             "structured CFG. The requirement should be disabled only when "
             "unexpected regressions happen."),
    cl::init(false), cl::Hidden);

static cl::opt<bool> UseShortPointersOpt(
    "nvptx-short-ptr",
    cl::desc(
        "Use 32-bit pointers for accessing const/local/shared address spaces."),
    cl::init(false), cl::Hidden);
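// All three flags above are cl::Hidden: they are omitted from -help (but
// shown by -help-hidden) and can still be set on the llc command line or
// through clang's -mllvm option.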

namespace llvm {

void initializeNVVMIntrRangePass(PassRegistry &);
void initializeNVVMReflectPass(PassRegistry &);
void initializeGenericToNVVMPass(PassRegistry &);
void initializeNVPTXAllocaHoistingPass(PassRegistry &);
void initializeNVPTXAtomicLowerPass(PassRegistry &);
void initializeNVPTXAssignValidGlobalNamesPass(PassRegistry &);
void initializeNVPTXLowerAggrCopiesPass(PassRegistry &);
void initializeNVPTXLowerArgsPass(PassRegistry &);
void initializeNVPTXLowerAllocaPass(PassRegistry &);
void initializeNVPTXProxyRegErasurePass(PassRegistry &);

} // end namespace llvm

extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeNVPTXTarget() {
  // Register the target.
  RegisterTargetMachine<NVPTXTargetMachine32> X(getTheNVPTXTarget32());
  RegisterTargetMachine<NVPTXTargetMachine64> Y(getTheNVPTXTarget64());

  // FIXME: This pass is really intended to be invoked during IR optimization,
  // but it's very NVPTX-specific.
  PassRegistry &PR = *PassRegistry::getPassRegistry();
  initializeNVVMReflectPass(PR);
  initializeNVVMIntrRangePass(PR);
  initializeGenericToNVVMPass(PR);
  initializeNVPTXAllocaHoistingPass(PR);
  initializeNVPTXAssignValidGlobalNamesPass(PR);
  initializeNVPTXAtomicLowerPass(PR);
  initializeNVPTXLowerArgsPass(PR);
  initializeNVPTXLowerAllocaPass(PR);
  initializeNVPTXLowerAggrCopiesPass(PR);
  initializeNVPTXProxyRegErasurePass(PR);
}

static std::string computeDataLayout(bool is64Bit, bool UseShortPointers) {
  std::string Ret = "e";
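
  // A 32-bit target uses 32-bit pointers in every address space. On a
  // 64-bit target, -nvptx-short-ptr shrinks only pointers into the shared
  // (3), const (4), and local (5) address spaces to 32 bits; generic and
  // global pointers stay 64-bit. The final component pins i64/i128
  // alignment and declares 16/32/64 as the native integer widths.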
  if (!is64Bit)
    Ret += "-p:32:32";
  else if (UseShortPointers)
    Ret += "-p3:32:32-p4:32:32-p5:32:32";

  Ret += "-i64:64-i128:128-v16:16-v32:32-n16:32:64";

  return Ret;
}

NVPTXTargetMachine::NVPTXTargetMachine(const Target &T, const Triple &TT,
                                       StringRef CPU, StringRef FS,
                                       const TargetOptions &Options,
                                       Optional<Reloc::Model> RM,
                                       Optional<CodeModel::Model> CM,
                                       CodeGenOpt::Level OL, bool is64bit)
    // The PIC relocation model is used regardless of what the client has
    // specified, as it is the only relocation model currently supported.
    : LLVMTargetMachine(T, computeDataLayout(is64bit, UseShortPointersOpt), TT,
                        CPU, FS, Options, Reloc::PIC_,
                        getEffectiveCodeModel(CM, CodeModel::Small), OL),
      is64bit(is64bit), UseShortPointers(UseShortPointersOpt),
      TLOF(std::make_unique<NVPTXTargetObjectFile>()),
      Subtarget(TT, std::string(CPU), std::string(FS), *this) {
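  // Derive the driver interface from the triple's OS component: an
  // *-nvidia-nvcl triple selects the OpenCL (NVCL) conventions, anything
  // else defaults to CUDA.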
  if (TT.getOS() == Triple::NVCL)
    drvInterface = NVPTX::NVCL;
  else
    drvInterface = NVPTX::CUDA;
  if (!DisableRequireStructuredCFG)
    setRequiresStructuredCFG(true);
  initAsmInfo();
}

NVPTXTargetMachine::~NVPTXTargetMachine() = default;

void NVPTXTargetMachine32::anchor() {}

NVPTXTargetMachine32::NVPTXTargetMachine32(const Target &T, const Triple &TT,
                                           StringRef CPU, StringRef FS,
                                           const TargetOptions &Options,
                                           Optional<Reloc::Model> RM,
                                           Optional<CodeModel::Model> CM,
                                           CodeGenOpt::Level OL, bool JIT)
    : NVPTXTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, false) {}

void NVPTXTargetMachine64::anchor() {}

NVPTXTargetMachine64::NVPTXTargetMachine64(const Target &T, const Triple &TT,
                                           StringRef CPU, StringRef FS,
                                           const TargetOptions &Options,
                                           Optional<Reloc::Model> RM,
                                           Optional<CodeModel::Model> CM,
                                           CodeGenOpt::Level OL, bool JIT)
    : NVPTXTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, true) {}

namespace {
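
// Customizes the generic codegen pipeline for NVPTX. PTX is a virtual ISA
// with no physical registers, so several post-RA passes are disabled and no
// register allocator is run (see createTargetRegisterAllocator below).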
class NVPTXPassConfig : public TargetPassConfig {
public:
  NVPTXPassConfig(NVPTXTargetMachine &TM, PassManagerBase &PM)
      : TargetPassConfig(TM, PM) {}

  NVPTXTargetMachine &getNVPTXTargetMachine() const {
    return getTM<NVPTXTargetMachine>();
  }

  void addIRPasses() override;
  bool addInstSelector() override;
  void addPreRegAlloc() override;
  void addPostRegAlloc() override;
  void addMachineSSAOptimization() override;

  FunctionPass *createTargetRegisterAllocator(bool) override;
  void addFastRegAlloc() override;
  void addOptimizedRegAlloc() override;
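
  // No register allocator is created (see createTargetRegisterAllocator), so
  // the register-assignment hooks below must never be reached.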
  bool addRegAssignAndRewriteFast() override {
    llvm_unreachable("should not be used");
  }

  bool addRegAssignAndRewriteOptimized() override {
    llvm_unreachable("should not be used");
  }

private:
  // If the opt level is aggressive, add GVN; otherwise, add EarlyCSE. This
  // function is only called in opt mode.
  void addEarlyCSEOrGVNPass();

  // Add passes that propagate special memory spaces.
  void addAddressSpaceInferencePasses();

  // Add passes that perform straight-line scalar optimizations.
  void addStraightLineScalarOptimizationPasses();
};

} // end anonymous namespace

TargetPassConfig *NVPTXTargetMachine::createPassConfig(PassManagerBase &PM) {
  return new NVPTXPassConfig(*this, PM);
}

void NVPTXTargetMachine::adjustPassManager(PassManagerBuilder &Builder) {
  Builder.addExtension(
    PassManagerBuilder::EP_EarlyAsPossible,
    [&](const PassManagerBuilder &, legacy::PassManagerBase &PM) {
      PM.add(createNVVMReflectPass(Subtarget.getSmVersion()));
      PM.add(createNVVMIntrRangePass(Subtarget.getSmVersion()));
    });
}

void NVPTXTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
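  // Recognize the NVVM passes in textual pipeline descriptions, so they can
  // be invoked directly, e.g. `opt -passes=nvvm-reflect`.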
  PB.registerPipelineParsingCallback(
      [](StringRef PassName, FunctionPassManager &PM,
         ArrayRef<PassBuilder::PipelineElement>) {
        if (PassName == "nvvm-reflect") {
          PM.addPass(NVVMReflectPass());
          return true;
        }
        if (PassName == "nvvm-intr-range") {
          PM.addPass(NVVMIntrRangePass());
          return true;
        }
        return false;
      });

  PB.registerPipelineStartEPCallback(
      [this](ModulePassManager &PM, OptimizationLevel Level) {
        FunctionPassManager FPM;
        FPM.addPass(NVVMReflectPass(Subtarget.getSmVersion()));
        // FIXME: NVVMIntrRangePass is causing numerical discrepancies,
        // investigate and re-enable.
        // FPM.addPass(NVVMIntrRangePass(Subtarget.getSmVersion()));
        PM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
      });
}

TargetTransformInfo
NVPTXTargetMachine::getTargetTransformInfo(const Function &F) const {
  return TargetTransformInfo(NVPTXTTIImpl(this, F));
}
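
// The @llvm.nvvm.isspacep.* intrinsics test whether a generic pointer lies
// in a particular address space. Reporting the (pointer, space) pair here
// lets InferAddressSpaces refine the pointer wherever such a test is known
// to be true.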
std::pair<const Value *, unsigned>
NVPTXTargetMachine::getPredicatedAddrSpace(const Value *V) const {
  if (auto *II = dyn_cast<IntrinsicInst>(V)) {
    switch (II->getIntrinsicID()) {
    case Intrinsic::nvvm_isspacep_const:
      return std::make_pair(II->getArgOperand(0), llvm::ADDRESS_SPACE_CONST);
    case Intrinsic::nvvm_isspacep_global:
      return std::make_pair(II->getArgOperand(0), llvm::ADDRESS_SPACE_GLOBAL);
    case Intrinsic::nvvm_isspacep_local:
      return std::make_pair(II->getArgOperand(0), llvm::ADDRESS_SPACE_LOCAL);
    case Intrinsic::nvvm_isspacep_shared:
      return std::make_pair(II->getArgOperand(0), llvm::ADDRESS_SPACE_SHARED);
    default:
      break;
    }
  }
  return std::make_pair(nullptr, -1);
}

void NVPTXPassConfig::addEarlyCSEOrGVNPass() {
  if (getOptLevel() == CodeGenOpt::Aggressive)
    addPass(createGVNPass());
  else
    addPass(createEarlyCSEPass());
}

void NVPTXPassConfig::addAddressSpaceInferencePasses() {
  // NVPTXLowerArgs emits allocas for byval parameters, which SROA can
  // often eliminate.
  addPass(createSROAPass());
  addPass(createNVPTXLowerAllocaPass());
  addPass(createInferAddressSpacesPass());
  addPass(createNVPTXAtomicLowerPass());
}

void NVPTXPassConfig::addStraightLineScalarOptimizationPasses() {
  addPass(createSeparateConstOffsetFromGEPPass());
  addPass(createSpeculativeExecutionPass());
  // ReassociateGEPs exposes more opportunities for SLSR. See
  // the example in reassociate-geps-and-slsr.ll.
  addPass(createStraightLineStrengthReducePass());
  // SeparateConstOffsetFromGEP and SLSR create common expressions that GVN or
  // EarlyCSE can reuse. GVN generates significantly better code than EarlyCSE
  // for some of our benchmarks.
  addEarlyCSEOrGVNPass();
  // Run NaryReassociate after EarlyCSE/GVN to be more effective.
  addPass(createNaryReassociatePass());
  // NaryReassociate on GEPs creates redundant common expressions, so run
  // EarlyCSE after it.
  addPass(createEarlyCSEPass());
}

void NVPTXPassConfig::addIRPasses() {
  // The following passes are known to not play well with virtual regs hanging
  // around after register allocation (which, in our case, is *all* registers).
  // We explicitly disable them here. We do, however, need some functionality
  // of the PrologEpilogCodeInserter pass, so we emulate that behavior in the
  // NVPTXPrologEpilog pass (see NVPTXPrologEpilogPass.cpp).
  disablePass(&PrologEpilogCodeInserterID);
  disablePass(&MachineCopyPropagationID);
  disablePass(&TailDuplicateID);
  disablePass(&StackMapLivenessID);
  disablePass(&LiveDebugValuesID);
  disablePass(&PostRAMachineSinkingID);
  disablePass(&PostRASchedulerID);
  disablePass(&FuncletLayoutID);
  disablePass(&PatchableFunctionID);
  disablePass(&ShrinkWrapID);

  // NVVMReflectPass is added via adjustPassManager's EP_EarlyAsPossible
  // extension, so hopefully running it here does nothing. But since we need
  // it for correctness when lowering to NVPTX, run it here too, in case
  // whoever built our pass pipeline didn't call adjustPassManager.
  const NVPTXSubtarget &ST = *getTM<NVPTXTargetMachine>().getSubtargetImpl();
  addPass(createNVVMReflectPass(ST.getSmVersion()));

  if (getOptLevel() != CodeGenOpt::None)
    addPass(createNVPTXImageOptimizerPass());
  addPass(createNVPTXAssignValidGlobalNamesPass());
  addPass(createGenericToNVVMPass());

  // NVPTXLowerArgs is required for correctness and should be run right
  // before the address space inference passes.
  addPass(createNVPTXLowerArgsPass(&getNVPTXTargetMachine()));
  if (getOptLevel() != CodeGenOpt::None) {
    addAddressSpaceInferencePasses();
    addStraightLineScalarOptimizationPasses();
  }
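
  // Expand atomic operations the target cannot lower natively into simpler
  // forms, e.g. compare-and-swap loops.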
  addPass(createAtomicExpandPass());

  // === LSR and other generic IR passes ===
  TargetPassConfig::addIRPasses();
  // EarlyCSE is not always strong enough to clean up what LSR produces. For
  // example, GVN can combine
  //
  //   %0 = add %a, %b
  //   %1 = add %b, %a
  //
  // and
  //
  //   %0 = shl nsw %a, 2
  //   %1 = shl %a, 2
  //
  // but EarlyCSE can do neither of them.
  if (getOptLevel() != CodeGenOpt::None) {
    addEarlyCSEOrGVNPass();
    if (!DisableLoadStoreVectorizer)
      addPass(createLoadStoreVectorizerPass());
    addPass(createSROAPass());
  }
}

bool NVPTXPassConfig::addInstSelector() {
  const NVPTXSubtarget &ST = *getTM<NVPTXTargetMachine>().getSubtargetImpl();
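
  // Lower aggregate copies into explicit loops and hoist allocas to the
  // entry block before running instruction selection.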
  addPass(createLowerAggrCopies());
  addPass(createAllocaHoisting());
  addPass(createNVPTXISelDag(getNVPTXTargetMachine(), getOptLevel()));

  if (!ST.hasImageHandles())
    addPass(createNVPTXReplaceImageHandlesPass());

  return false;
}

void NVPTXPassConfig::addPreRegAlloc() {
  // Remove Proxy Register pseudo instructions used to keep `callseq_end` alive.
  addPass(createNVPTXProxyRegErasurePass());
}

void NVPTXPassConfig::addPostRegAlloc() {
  addPass(createNVPTXPrologEpilogPass());
  if (getOptLevel() != CodeGenOpt::None) {
    // NVPTXPrologEpilogPass calculates frame object offsets and replaces
    // frame indices with the VRFrame register. NVPTXPeephole needs to run
    // after that and replaces VRFrame with VRFrameLocal when possible.
    addPass(createNVPTXPeephole());
  }
}
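
// PTX is a virtual ISA: values stay in virtual registers all the way to the
// PTX printer, and ptxas performs the actual register allocation. Returning
// null here disables LLVM's register allocators entirely.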
FunctionPass *NVPTXPassConfig::createTargetRegisterAllocator(bool) {
  return nullptr; // No reg alloc
}

void NVPTXPassConfig::addFastRegAlloc() {
  addPass(&PHIEliminationID);
  addPass(&TwoAddressInstructionPassID);
}

void NVPTXPassConfig::addOptimizedRegAlloc() {
  addPass(&ProcessImplicitDefsID);
  addPass(&LiveVariablesID);
  addPass(&MachineLoopInfoID);
  addPass(&PHIEliminationID);

  addPass(&TwoAddressInstructionPassID);
  addPass(&RegisterCoalescerID);

  // PreRA instruction scheduling.
  if (addPass(&MachineSchedulerID))
    printAndVerify("After Machine Scheduling");

  addPass(&StackSlotColoringID);

  // FIXME: Needs physical registers
  //addPass(&MachineLICMID);

  printAndVerify("After StackSlotColoring");
}

void NVPTXPassConfig::addMachineSSAOptimization() {
  // Pre-RA tail duplication.
  if (addPass(&EarlyTailDuplicateID))
    printAndVerify("After Pre-RegAlloc TailDuplicate");

  // Optimize PHIs before DCE: removing dead PHI cycles may make more
  // instructions dead.
  addPass(&OptimizePHIsID);

  // This pass merges large allocas. StackSlotColoring is a different pass
  // which merges spill slots.
  addPass(&StackColoringID);

  // If the target requests it, assign local variables to stack slots relative
  // to one another and simplify frame index references where possible.
  addPass(&LocalStackSlotAllocationID);

  // With optimization, dead code should already be eliminated. However,
  // there is one known exception: lowered code for arguments that are only
  // used by tail calls, where the tail calls reuse the incoming stack
  // arguments directly (see t11 in test/CodeGen/X86/sibcall.ll).
  addPass(&DeadMachineInstructionElimID);
  printAndVerify("After codegen DCE pass");

  // Allow targets to insert passes that improve instruction level parallelism,
  // like if-conversion. Such passes will typically need dominator trees and
  // loop info, just like LICM and CSE below.
  if (addILPOpts())
    printAndVerify("After ILP optimizations");

  addPass(&EarlyMachineLICMID);
  addPass(&MachineCSEID);

  addPass(&MachineSinkingID);
  printAndVerify("After Machine LICM, CSE and Sinking passes");

  addPass(&PeepholeOptimizerID);
  printAndVerify("After codegen peephole optimization pass");
}