//===-- AMDGPULateCodeGenPrepare.cpp --------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// This pass does misc. AMDGPU optimizations on IR *just* before instruction
11 /// selection.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 #include "AMDGPU.h"
16 #include "AMDGPUTargetMachine.h"
17 #include "llvm/Analysis/AssumptionCache.h"
18 #include "llvm/Analysis/UniformityAnalysis.h"
19 #include "llvm/Analysis/ValueTracking.h"
20 #include "llvm/CodeGen/TargetPassConfig.h"
21 #include "llvm/IR/IRBuilder.h"
22 #include "llvm/IR/InstVisitor.h"
23 #include "llvm/InitializePasses.h"
24 #include "llvm/Support/CommandLine.h"
25 #include "llvm/Support/KnownBits.h"
26 #include "llvm/Transforms/Utils/Local.h"
27 
28 #define DEBUG_TYPE "amdgpu-late-codegenprepare"
29 
30 using namespace llvm;
31 
// Scalar load widening needs to run after the load-store vectorizer, as that
// pass doesn't handle overlapping cases. In addition, this pass extends the
// widening to scalar sub-DWORD loads that are only naturally aligned, not
// DWORD aligned.
36 static cl::opt<bool>
37     WidenLoads("amdgpu-late-codegenprepare-widen-constant-loads",
38                cl::desc("Widen sub-dword constant address space loads in "
39                         "AMDGPULateCodeGenPrepare"),
40                cl::ReallyHidden, cl::init(true));
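// For debugging, the widening can be turned off on the command line, e.g.:
//   llc -amdgpu-late-codegenprepare-widen-constant-loads=0 <input>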
41 
42 namespace {
43 
44 class AMDGPULateCodeGenPrepare
45     : public FunctionPass,
46       public InstVisitor<AMDGPULateCodeGenPrepare, bool> {
47   Module *Mod = nullptr;
48   const DataLayout *DL = nullptr;
49 
50   AssumptionCache *AC = nullptr;
51   UniformityInfo *UA = nullptr;
52 
53 public:
54   static char ID;
55 
56   AMDGPULateCodeGenPrepare() : FunctionPass(ID) {}
57 
58   StringRef getPassName() const override {
59     return "AMDGPU IR late optimizations";
60   }
61 
62   void getAnalysisUsage(AnalysisUsage &AU) const override {
63     AU.addRequired<TargetPassConfig>();
64     AU.addRequired<AssumptionCacheTracker>();
65     AU.addRequired<UniformityInfoWrapperPass>();
66     AU.setPreservesAll();
67   }
68 
69   bool doInitialization(Module &M) override;
70   bool runOnFunction(Function &F) override;
71 
72   bool visitInstruction(Instruction &) { return false; }
73 
74   // Check if the specified value is at least DWORD aligned.
75   bool isDWORDAligned(const Value *V) const {
76     KnownBits Known = computeKnownBits(V, *DL, 0, AC);
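    // At least two known-zero low bits mean the address is a multiple of 4.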
77     return Known.countMinTrailingZeros() >= 2;
78   }
79 
80   bool canWidenScalarExtLoad(LoadInst &LI) const;
81   bool visitLoadInst(LoadInst &LI);
82 };
83 
84 } // end anonymous namespace
85 
86 bool AMDGPULateCodeGenPrepare::doInitialization(Module &M) {
87   Mod = &M;
88   DL = &Mod->getDataLayout();
89   return false;
90 }
91 
92 bool AMDGPULateCodeGenPrepare::runOnFunction(Function &F) {
93   if (skipFunction(F))
94     return false;
95 
96   const TargetPassConfig &TPC = getAnalysis<TargetPassConfig>();
97   const TargetMachine &TM = TPC.getTM<TargetMachine>();
98   const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
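  // Subtargets whose scalar unit supports sub-DWORD loads directly don't need
  // this widening.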
99   if (ST.hasScalarSubwordLoads())
100     return false;
101 
102   AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
103   UA = &getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();
104 
105   bool Changed = false;
106   for (auto &BB : F)
107     for (Instruction &I : llvm::make_early_inc_range(BB))
108       Changed |= visit(I);
109 
110   return Changed;
111 }
112 
113 bool AMDGPULateCodeGenPrepare::canWidenScalarExtLoad(LoadInst &LI) const {
114   unsigned AS = LI.getPointerAddressSpace();
  // Skip non-constant address spaces.
116   if (AS != AMDGPUAS::CONSTANT_ADDRESS &&
117       AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT)
118     return false;
119   // Skip non-simple loads.
120   if (!LI.isSimple())
121     return false;
122   auto *Ty = LI.getType();
123   // Skip aggregate types.
124   if (Ty->isAggregateType())
125     return false;
126   unsigned TySize = DL->getTypeStoreSize(Ty);
127   // Only handle sub-DWORD loads.
128   if (TySize >= 4)
129     return false;
  // The load must be at least naturally aligned.
131   if (LI.getAlign() < DL->getABITypeAlign(Ty))
132     return false;
133   // It should be uniform, i.e. a scalar load.
134   return UA->isUniform(&LI);
135 }
136 
137 bool AMDGPULateCodeGenPrepare::visitLoadInst(LoadInst &LI) {
138   if (!WidenLoads)
139     return false;
140 
  // Skip loads that are already at least DWORD aligned, as those are handled
  // in SDAG.
143   if (LI.getAlign() >= 4)
144     return false;
145 
146   if (!canWidenScalarExtLoad(LI))
147     return false;
148 
149   int64_t Offset = 0;
150   auto *Base =
151       GetPointerBaseWithConstantOffset(LI.getPointerOperand(), Offset, *DL);
  // If the base is not DWORD aligned, it's not safe to perform the following
  // transform.
154   if (!isDWORDAligned(Base))
155     return false;
156 
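  // Adjust is the byte offset of the load within its containing DWORD.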
157   int64_t Adjust = Offset & 0x3;
158   if (Adjust == 0) {
    // With a zero adjust, the load is already DWORD aligned; just promote the
    // recorded alignment.
161     LI.setAlignment(Align(4));
162     return true;
163   }
164 
165   IRBuilder<> IRB(&LI);
166   IRB.SetCurrentDebugLocation(LI.getDebugLoc());
167 
168   unsigned LdBits = DL->getTypeStoreSizeInBits(LI.getType());
  auto *IntNTy = Type::getIntNTy(LI.getContext(), LdBits);
170 
171   auto *NewPtr = IRB.CreateConstGEP1_64(
172       IRB.getInt8Ty(),
173       IRB.CreateAddrSpaceCast(Base, LI.getPointerOperand()->getType()),
174       Offset - Adjust);
175 
176   LoadInst *NewLd = IRB.CreateAlignedLoad(IRB.getInt32Ty(), NewPtr, Align(4));
177   NewLd->copyMetadata(LI);
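  // Metadata copied from the narrow load may include !range, which describes
  // the original sub-DWORD value and is not valid for the widened i32 load,
  // so drop it.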
178   NewLd->setMetadata(LLVMContext::MD_range, nullptr);
179 
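  // Shift the addressed bytes down to the low end of the DWORD (AMDGPU is
  // little-endian), then truncate to the original load width.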
180   unsigned ShAmt = Adjust * 8;
181   auto *NewVal = IRB.CreateBitCast(
182       IRB.CreateTrunc(IRB.CreateLShr(NewLd, ShAmt), IntNTy), LI.getType());
183   LI.replaceAllUsesWith(NewVal);
184   RecursivelyDeleteTriviallyDeadInstructions(&LI);
185 
186   return true;
187 }
188 
189 INITIALIZE_PASS_BEGIN(AMDGPULateCodeGenPrepare, DEBUG_TYPE,
190                       "AMDGPU IR late optimizations", false, false)
191 INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
192 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
193 INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass)
194 INITIALIZE_PASS_END(AMDGPULateCodeGenPrepare, DEBUG_TYPE,
195                     "AMDGPU IR late optimizations", false, false)
196 
197 char AMDGPULateCodeGenPrepare::ID = 0;
198 
199 FunctionPass *llvm::createAMDGPULateCodeGenPreparePass() {
200   return new AMDGPULateCodeGenPrepare();
201 }
202