//===-- AArch64StackTaggingPreRA.cpp --- Stack Tagging for AArch64 -----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
9 
10 
11 #include "AArch64.h"
12 #include "AArch64MachineFunctionInfo.h"
13 #include "AArch64InstrInfo.h"
14 #include "llvm/ADT/DepthFirstIterator.h"
15 #include "llvm/ADT/SetVector.h"
16 #include "llvm/ADT/Statistic.h"
17 #include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
18 #include "llvm/CodeGen/MachineFrameInfo.h"
19 #include "llvm/CodeGen/MachineFunction.h"
20 #include "llvm/CodeGen/MachineFunctionPass.h"
21 #include "llvm/CodeGen/MachineInstrBuilder.h"
22 #include "llvm/CodeGen/MachineLoopInfo.h"
23 #include "llvm/CodeGen/MachineRegisterInfo.h"
24 #include "llvm/CodeGen/MachineTraceMetrics.h"
25 #include "llvm/CodeGen/Passes.h"
26 #include "llvm/CodeGen/TargetInstrInfo.h"
27 #include "llvm/CodeGen/TargetRegisterInfo.h"
28 #include "llvm/CodeGen/TargetSubtargetInfo.h"
29 #include "llvm/Support/CommandLine.h"
30 #include "llvm/Support/Debug.h"
31 #include "llvm/Support/raw_ostream.h"
32 
33 using namespace llvm;
34 
35 #define DEBUG_TYPE "aarch64-stack-tagging-pre-ra"
36 
37 enum UncheckedLdStMode { UncheckedNever, UncheckedSafe, UncheckedAlways };
38 
39 cl::opt<UncheckedLdStMode> ClUncheckedLdSt(
40     "stack-tagging-unchecked-ld-st", cl::Hidden,
41     cl::init(UncheckedSafe),
42     cl::desc(
43         "Unconditionally apply unchecked-ld-st optimization (even for large "
44         "stack frames, or in the presence of variable sized allocas)."),
45     cl::values(
46         clEnumValN(UncheckedNever, "never", "never apply unchecked-ld-st"),
47         clEnumValN(
48             UncheckedSafe, "safe",
49             "apply unchecked-ld-st when the target is definitely within range"),
50         clEnumValN(UncheckedAlways, "always", "always apply unchecked-ld-st")));
51 
// If set (default), try to pin one tagged slot to offset 0 from the tagged
// base pointer so its address is the IRG result itself and needs no ADDG;
// see findFirstSlotCandidate().
static cl::opt<bool>
    ClFirstSlot("stack-tagging-first-slot-opt", cl::Hidden, cl::init(true),
                cl::ZeroOrMore,
                cl::desc("Apply first slot optimization for stack tagging "
                         "(eliminate ADDG Rt, Rn, 0, 0)."));
57 
namespace {

// Pre-register-allocation pass for MTE stack tagging. It collects all
// TAGPstack instructions in the function, optionally rewrites loads/stores of
// tagged addresses to plain frame-index addressing (see
// uncheckLoadsAndStores), and optionally pins one tagged slot at offset 0
// from the tagged base pointer (see findFirstSlotCandidate).
class AArch64StackTaggingPreRA : public MachineFunctionPass {
  MachineFunction *MF;
  AArch64FunctionInfo *AFI;
  MachineFrameInfo *MFI;
  MachineRegisterInfo *MRI;
  const AArch64RegisterInfo *TRI;
  const AArch64InstrInfo *TII;

  // All TAGPstack instructions found in the current function; filled in
  // runOnMachineFunction and consumed by the optimizations below.
  SmallVector<MachineInstr*, 16> ReTags;

public:
  static char ID;
  AArch64StackTaggingPreRA() : MachineFunctionPass(ID) {
    initializeAArch64StackTaggingPreRAPass(*PassRegistry::getPassRegistry());
  }

  // True if loads/stores through tagged addresses may be rewritten to use the
  // frame index directly, per the ClUncheckedLdSt policy and frame layout.
  bool mayUseUncheckedLoadStore();
  // Rewrite the address operand of unchecked-capable loads/stores that use
  // TaggedReg (directly or through copies) to frame index FI.
  void uncheckUsesOf(unsigned TaggedReg, int FI);
  void uncheckLoadsAndStores();
  // Pick the slot to pin at offset 0 from the tagged base pointer, if any.
  Optional<int> findFirstSlotCandidate();

  bool runOnMachineFunction(MachineFunction &Func) override;
  StringRef getPassName() const override {
    return "AArch64 Stack Tagging PreRA";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    // Instructions are rewritten/erased in place; the CFG is never changed.
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};
} // end anonymous namespace
92 
char AArch64StackTaggingPreRA::ID = 0;

INITIALIZE_PASS_BEGIN(AArch64StackTaggingPreRA, "aarch64-stack-tagging-pre-ra",
                      "AArch64 Stack Tagging PreRA Pass", false, false)
INITIALIZE_PASS_END(AArch64StackTaggingPreRA, "aarch64-stack-tagging-pre-ra",
                    "AArch64 Stack Tagging PreRA Pass", false, false)

// Factory used by the AArch64 target to add this pass to the codegen
// pipeline; caller takes ownership.
FunctionPass *llvm::createAArch64StackTaggingPreRAPass() {
  return new AArch64StackTaggingPreRA();
}
103 
// Returns true if Opcode is a load/store whose base-address operand can be
// rewritten from a tagged register to a plain FrameIndex + immediate form by
// uncheckUsesOf(). These are the scaled unsigned-immediate ("ui") and
// immediate pair ("i") addressing forms.
static bool isUncheckedLoadOrStoreOpcode(unsigned Opcode) {
  switch (Opcode) {
  // Integer loads.
  case AArch64::LDRBBui:
  case AArch64::LDRHHui:
  case AArch64::LDRWui:
  case AArch64::LDRXui:

  // FP/SIMD loads.
  case AArch64::LDRBui:
  case AArch64::LDRHui:
  case AArch64::LDRSui:
  case AArch64::LDRDui:
  case AArch64::LDRQui:

  // Sign-extending loads.
  case AArch64::LDRSHWui:
  case AArch64::LDRSHXui:

  case AArch64::LDRSBWui:
  case AArch64::LDRSBXui:

  case AArch64::LDRSWui:

  // Integer stores.
  case AArch64::STRBBui:
  case AArch64::STRHHui:
  case AArch64::STRWui:
  case AArch64::STRXui:

  // FP/SIMD stores.
  case AArch64::STRBui:
  case AArch64::STRHui:
  case AArch64::STRSui:
  case AArch64::STRDui:
  case AArch64::STRQui:

  // Load pairs.
  case AArch64::LDPWi:
  case AArch64::LDPXi:
  case AArch64::LDPSi:
  case AArch64::LDPDi:
  case AArch64::LDPQi:

  case AArch64::LDPSWi:

  // Store pairs.
  case AArch64::STPWi:
  case AArch64::STPXi:
  case AArch64::STPSi:
  case AArch64::STPDi:
  case AArch64::STPQi:
    return true;
  default:
    return false;
  }
}
154 
// Decide whether it is acceptable to rewrite tagged loads/stores into
// frame-index-based (unchecked) addressing. In "safe" mode this requires a
// conservative guarantee that every slot will end up within the immediate
// range of the rewritten instructions.
bool AArch64StackTaggingPreRA::mayUseUncheckedLoadStore() {
  if (ClUncheckedLdSt == UncheckedNever)
    return false;
  else if (ClUncheckedLdSt == UncheckedAlways)
    return true;

  // This estimate can be improved if we had harder guarantees about stack frame
  // layout. With LocalStackAllocation we can estimate SP offset to any
  // preallocated slot. AArch64FrameLowering::orderFrameObjects could put tagged
  // objects ahead of non-tagged ones, but that's not always desirable.
  //
  // Underestimating SP offset here may require the use of LDG to materialize
  // the tagged address of the stack slot, along with a scratch register
  // allocation (post-regalloc!).
  //
  // For now we do the safe thing here and require that the entire stack frame
  // is within range of the shortest of the unchecked instructions.
  unsigned FrameSize = 0;
  for (unsigned i = 0, e = MFI->getObjectIndexEnd(); i != e; ++i)
    FrameSize += MFI->getObjectSize(i);
  // 0xf00 is a conservative bound leaving slack below the smallest immediate
  // range among the instructions accepted by isUncheckedLoadOrStoreOpcode.
  bool EntireFrameReachableFromSP = FrameSize < 0xf00;
  return !MFI->hasVarSizedObjects() && EntireFrameReachableFromSP;
}
178 
// Rewrite every unchecked-capable load/store that addresses memory through
// TaggedReg (directly, or through chains of virtual-register COPYs) to use
// frame index FI instead, tagging the operand with MO_TAGGED.
void AArch64StackTaggingPreRA::uncheckUsesOf(unsigned TaggedReg, int FI) {
  // Advance the use-iterator before touching the instruction:
  // ChangeToFrameIndex removes UseI from TaggedReg's use list, which would
  // otherwise invalidate the iterator.
  for (auto UI = MRI->use_instr_begin(TaggedReg), E = MRI->use_instr_end();
       UI != E;) {
    MachineInstr *UseI = &*(UI++);
    if (isUncheckedLoadOrStoreOpcode(UseI->getOpcode())) {
      // FI operand is always the one before the immediate offset.
      unsigned OpIdx = TII->getLoadStoreImmIdx(UseI->getOpcode()) - 1;
      if (UseI->getOperand(OpIdx).isReg() &&
          UseI->getOperand(OpIdx).getReg() == TaggedReg) {
        UseI->getOperand(OpIdx).ChangeToFrameIndex(FI);
        UseI->getOperand(OpIdx).setTargetFlags(AArch64II::MO_TAGGED);
      }
    } else if (UseI->isCopy() &&
               Register::isVirtualRegister(UseI->getOperand(0).getReg())) {
      // Follow virtual-register copies so that uses of copies of the tagged
      // address are rewritten as well.
      uncheckUsesOf(UseI->getOperand(0).getReg(), FI);
    }
  }
}
197 
uncheckLoadsAndStores()198 void AArch64StackTaggingPreRA::uncheckLoadsAndStores() {
199   for (auto *I : ReTags) {
200     unsigned TaggedReg = I->getOperand(0).getReg();
201     int FI = I->getOperand(1).getIndex();
202     uncheckUsesOf(TaggedReg, FI);
203   }
204 }
205 
// A (frame index, tag offset) pair identifying one tagged stack slot as set
// up by a TAGPstack instruction.
struct SlotWithTag {
  int FI;
  int Tag;
  SlotWithTag(int FI, int Tag) : FI(FI), Tag(Tag) {}
  // Extract the pair from a TAGPstack: operand 1 is the frame index, operand
  // 4 the tag offset immediate (same operands read in runOnMachineFunction).
  explicit SlotWithTag(const MachineInstr &MI)
      : FI(MI.getOperand(1).getIndex()), Tag(MI.getOperand(4).getImm()) {}
  bool operator==(const SlotWithTag &Other) const {
    return FI == Other.FI && Tag == Other.Tag;
  }
};
216 
namespace llvm {
// DenseMapInfo specialization so SlotWithTag can key the RetagScore map in
// findFirstSlotCandidate. Empty/tombstone keys use negative FI/Tag values
// that cannot occur for real tagged slots.
template <> struct DenseMapInfo<SlotWithTag> {
  static inline SlotWithTag getEmptyKey() { return {-2, -2}; }
  static inline SlotWithTag getTombstoneKey() { return {-3, -3}; }
  static unsigned getHashValue(const SlotWithTag &V) {
    return hash_combine(DenseMapInfo<int>::getHashValue(V.FI),
                        DenseMapInfo<int>::getHashValue(V.Tag));
  }
  static bool isEqual(const SlotWithTag &A, const SlotWithTag &B) {
    return A == B;
  }
};
} // namespace llvm
230 
isSlotPreAllocated(MachineFrameInfo * MFI,int FI)231 static bool isSlotPreAllocated(MachineFrameInfo *MFI, int FI) {
232   return MFI->getUseLocalStackAllocationBlock() &&
233          MFI->isObjectPreAllocated(FI);
234 }
235 
// Pin one of the tagged slots to offset 0 from the tagged base pointer.
// This would make its address available in a virtual register (IRG's def), as
// opposed to requiring an ADDG instruction to materialize. This effectively
// eliminates a vreg (by replacing it with direct uses of IRG, which is usually
// live almost everywhere anyway), and therefore needs to happen before
// regalloc.
Optional<int> AArch64StackTaggingPreRA::findFirstSlotCandidate() {
  // Find the best (FI, Tag) pair to pin to offset 0.
  // Looking at the possible uses of a tagged address, the advantage of pinning
  // is:
  // - COPY to physical register.
  //   Does not matter, this would trade a MOV instruction for an ADDG.
  // - ST*G matter, but those mostly appear near the function prologue where all
  //   the tagged addresses need to be materialized anyway; also, counting ST*G
  //   uses would overweight large allocas that require more than one ST*G
  //   instruction.
  // - Load/Store instructions in the address operand do not require a tagged
  //   pointer, so they also do not benefit. These operands have already been
  //   eliminated (see uncheckLoadsAndStores) so all remaining load/store
  //   instructions count.
  // - Any other instruction may benefit from being pinned to offset 0.
  LLVM_DEBUG(dbgs() << "AArch64StackTaggingPreRA::findFirstSlotCandidate\n");
  if (!ClFirstSlot)
    return None;

  // Score of each (FI, Tag) pair: number of non-ST*G, non-COPY uses reachable
  // from the TAGPstack def through virtual-register copies.
  DenseMap<SlotWithTag, int> RetagScore;
  SlotWithTag MaxScoreST{-1, -1};
  int MaxScore = -1;
  for (auto *I : ReTags) {
    SlotWithTag ST{*I};
    // Pre-allocated slots already have a fixed placement; skip them.
    if (isSlotPreAllocated(MFI, ST.FI))
      continue;

    Register RetagReg = I->getOperand(0).getReg();
    if (!Register::isVirtualRegister(RetagReg))
      continue;

    int Score = 0;
    // Worklist of vregs holding (copies of) this slot's tagged address.
    SmallVector<Register, 8> WorkList;
    WorkList.push_back(RetagReg);

    while (!WorkList.empty()) {
      Register UseReg = WorkList.back();
      WorkList.pop_back();
      for (auto &UseI : MRI->use_instructions(UseReg)) {
        unsigned Opcode = UseI.getOpcode();
        // Tag-store instructions do not benefit from pinning (see above).
        if (Opcode == AArch64::STGOffset || Opcode == AArch64::ST2GOffset ||
            Opcode == AArch64::STZGOffset || Opcode == AArch64::STZ2GOffset ||
            Opcode == AArch64::STGPi || Opcode == AArch64::STGloop ||
            Opcode == AArch64::STZGloop || Opcode == AArch64::STGloop_wback ||
            Opcode == AArch64::STZGloop_wback)
          continue;
        if (UseI.isCopy()) {
          // Chase copies instead of counting them.
          Register DstReg = UseI.getOperand(0).getReg();
          if (Register::isVirtualRegister(DstReg))
            WorkList.push_back(DstReg);
          continue;
        }
        LLVM_DEBUG(dbgs() << "[" << ST.FI << ":" << ST.Tag << "] use of %"
                          << Register::virtReg2Index(UseReg) << " in " << UseI
                          << "\n");
        Score++;
      }
    }

    // Several TAGPstack instructions may retag the same (FI, Tag); accumulate.
    // Ties are broken towards the larger frame index for determinism.
    int TotalScore = RetagScore[ST] += Score;
    if (TotalScore > MaxScore ||
        (TotalScore == MaxScore && ST.FI > MaxScoreST.FI)) {
      MaxScore = TotalScore;
      MaxScoreST = ST;
    }
  }

  if (MaxScoreST.FI < 0)
    return None;

  // If FI's tag is already 0, we are done.
  if (MaxScoreST.Tag == 0)
    return MaxScoreST.FI;

  // Otherwise, find a random victim pair (FI, Tag) where Tag == 0.
  SlotWithTag SwapST{-1, -1};
  for (auto *I : ReTags) {
    SlotWithTag ST{*I};
    if (ST.Tag == 0) {
      SwapST = ST;
      break;
    }
  }

  // Swap tags between the victim and the highest scoring pair.
  // If SwapWith is still (-1, -1), that's fine, too - we'll simply take tag for
  // the highest score slot without changing anything else.
  for (auto *&I : ReTags) {
    SlotWithTag ST{*I};
    MachineOperand &TagOp = I->getOperand(4);
    if (ST == MaxScoreST) {
      TagOp.setImm(0);
    } else if (ST == SwapST) {
      TagOp.setImm(MaxScoreST.Tag);
    }
  }
  return MaxScoreST.FI;
}
340 
// Pass entry point: collect TAGPstack instructions, then run the unchecked
// load/store rewrite and the first-slot optimization. Returns true iff the
// function was modified (i.e. at least one TAGPstack was present).
bool AArch64StackTaggingPreRA::runOnMachineFunction(MachineFunction &Func) {
  MF = &Func;
  MRI = &MF->getRegInfo();
  AFI = MF->getInfo<AArch64FunctionInfo>();
  TII = static_cast<const AArch64InstrInfo *>(MF->getSubtarget().getInstrInfo());
  TRI = static_cast<const AArch64RegisterInfo *>(
      MF->getSubtarget().getRegisterInfo());
  MFI = &MF->getFrameInfo();
  ReTags.clear();

  // This pass runs pre-RA; the use-list walks below rely on SSA form.
  assert(MRI->isSSA());

  LLVM_DEBUG(dbgs() << "********** AArch64 Stack Tagging PreRA **********\n"
                    << "********** Function: " << MF->getName() << '\n');

  SmallSetVector<int, 8> TaggedSlots;
  for (auto &BB : *MF) {
    for (auto &I : BB) {
      if (I.getOpcode() == AArch64::TAGPstack) {
        ReTags.push_back(&I);
        int FI = I.getOperand(1).getIndex();
        TaggedSlots.insert(FI);
        // There should be no offsets in TAGP yet.
        assert(I.getOperand(2).getImm() == 0);
      }
    }
  }

  // Take over from SSP. It does nothing for tagged slots, and should not really
  // have been enabled in the first place.
  for (int FI : TaggedSlots)
    MFI->setObjectSSPLayout(FI, MachineFrameInfo::SSPLK_None);

  if (ReTags.empty())
    return false;

  if (mayUseUncheckedLoadStore())
    uncheckLoadsAndStores();

  // Find a slot that is used with zero tag offset, like ADDG #fi, 0.
  // If the base tagged pointer is set up to the address of this slot,
  // the ADDG instruction can be eliminated.
  Optional<int> BaseSlot = findFirstSlotCandidate();
  if (BaseSlot)
    AFI->setTaggedBasePointerIndex(*BaseSlot);

  for (auto *I : ReTags) {
    int FI = I->getOperand(1).getIndex();
    int Tag = I->getOperand(4).getImm();
    Register Base = I->getOperand(3).getReg();
    if (Tag == 0 && FI == BaseSlot) {
      // The base-slot TAGPstack is a no-op (tag 0 at offset 0 from the tagged
      // base pointer); replace its def with a plain COPY of the base register.
      BuildMI(*I->getParent(), I, {}, TII->get(AArch64::COPY),
              I->getOperand(0).getReg())
          .addReg(Base);
      I->eraseFromParent();
    }
  }

  return true;
}
401