1 //===- AArch64FrameLowering.cpp - AArch64 Frame Lowering -------*- C++ -*-====//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file contains the AArch64 implementation of TargetFrameLowering class.
10 //
11 // On AArch64, stack frames are structured as follows:
12 //
13 // The stack grows downward.
14 //
15 // All of the individual frame areas on the frame below are optional, i.e. it's
16 // possible to create a function so that the particular area isn't present
17 // in the frame.
18 //
19 // At function entry, the "frame" looks as follows:
20 //
21 // |                                   | Higher address
22 // |-----------------------------------|
23 // |                                   |
24 // | arguments passed on the stack     |
25 // |                                   |
26 // |-----------------------------------| <- sp
27 // |                                   | Lower address
28 //
29 //
30 // After the prologue has run, the frame has the following general structure.
31 // Note that this doesn't depict the case where a red-zone is used. Also,
32 // technically the last frame area (VLAs) doesn't get created until the
33 // main function body runs, after the prologue. However, it's depicted here
34 // for completeness.
35 //
36 // |                                   | Higher address
37 // |-----------------------------------|
38 // |                                   |
39 // | arguments passed on the stack     |
40 // |                                   |
41 // |-----------------------------------|
42 // |                                   |
43 // | (Win64 only) varargs from reg     |
44 // |                                   |
45 // |-----------------------------------|
46 // |                                   |
47 // | callee-saved gpr registers        | <--.
48 // |                                   |    | On Darwin platforms these
49 // |- - - - - - - - - - - - - - - - - -|    | callee saves are swapped,
50 // | prev_lr                           |    | (frame record first)
51 // | prev_fp                           | <--'
52 // | async context if needed           |
53 // | (a.k.a. "frame record")           |
54 // |-----------------------------------| <- fp(=x29)
55 // |                                   |
56 // | callee-saved fp/simd/SVE regs     |
57 // |                                   |
58 // |-----------------------------------|
59 // |                                   |
60 // |        SVE stack objects          |
61 // |                                   |
62 // |-----------------------------------|
63 // |.empty.space.to.make.part.below....|
64 // |.aligned.in.case.it.needs.more.than| (size of this area is unknown at
65 // |.the.standard.16-byte.alignment....|  compile time; if present)
66 // |-----------------------------------|
67 // |                                   |
68 // | local variables of fixed size     |
69 // | including spill slots             |
70 // |-----------------------------------| <- bp(not defined by ABI,
71 // |.variable-sized.local.variables....|       LLVM chooses X19)
72 // |.(VLAs)............................| (size of this area is unknown at
73 // |...................................|  compile time)
74 // |-----------------------------------| <- sp
75 // |                                   | Lower address
76 //
77 //
78 // To access the data in a frame at compile time, a constant offset must be
79 // computable from one of the pointers (fp, bp, sp). The size
80 // of the areas with a dotted background cannot be computed at compile time
81 // if they are present, so all three of fp, bp and
82 // sp must be set up in order to access all contents in the frame areas,
83 // assuming all of the frame areas are non-empty.
84 //
85 // For most functions, some of the frame areas are empty. For those functions,
86 // it may not be necessary to set up fp or bp:
87 // * A base pointer is definitely needed when there are both VLAs and local
88 //   variables with more-than-default alignment requirements.
89 // * A frame pointer is definitely needed when there are local variables with
90 //   more-than-default alignment requirements.
91 //
92 // For Darwin platforms the frame-record (fp, lr) is stored at the top of the
93 // callee-saved area, since the unwind encoding does not allow for encoding
94 // this dynamically and existing tools depend on this layout. For other
95 // platforms, the frame-record is stored at the bottom of the (gpr) callee-saved
96 // area to allow SVE stack objects (allocated directly below the callee-saves,
97 // if available) to be accessed directly from the framepointer.
98 // The SVE spill/fill instructions have VL-scaled addressing modes such
99 // as:
100 //    ldr z8, [fp, #-7 mul vl]
101 // For SVE the size of the vector length (VL) is not known at compile-time, so
102 // '#-7 mul vl' is an offset that can only be evaluated at runtime. With this
103 // layout, we don't need to add an unscaled offset to the framepointer before
104 // accessing the SVE object in the frame.
105 //
106 // In some cases when a base pointer is not strictly needed, it is generated
107 // anyway when offsets from the frame pointer to access local variables become
108 // so large that the offset can't be encoded in the immediate fields of loads
109 // or stores.
110 //
111 // Outgoing function arguments must be at the bottom of the stack frame when
112 // calling another function. If we do not have variable-sized stack objects, we
113 // can allocate a "reserved call frame" area at the bottom of the local
114 // variable area, large enough for all outgoing calls. If we do have VLAs, then
115 // the stack pointer must be decremented and incremented around each call to
116 // make space for the arguments below the VLAs.
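// For example (illustrative numbers), if the largest call in a function needs
// 32 bytes of outgoing stack arguments and there are no VLAs, those 32 bytes
// can be folded into the prologue's single SP adjustment instead of being
// allocated and freed around every call.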
117 //
118 // FIXME: also explain the redzone concept.
119 //
120 //===----------------------------------------------------------------------===//
121 
122 #include "AArch64FrameLowering.h"
123 #include "AArch64InstrInfo.h"
124 #include "AArch64MachineFunctionInfo.h"
125 #include "AArch64RegisterInfo.h"
126 #include "AArch64Subtarget.h"
127 #include "AArch64TargetMachine.h"
128 #include "MCTargetDesc/AArch64AddressingModes.h"
129 #include "llvm/ADT/ScopeExit.h"
130 #include "llvm/ADT/SmallVector.h"
131 #include "llvm/ADT/Statistic.h"
132 #include "llvm/CodeGen/LivePhysRegs.h"
133 #include "llvm/CodeGen/MachineBasicBlock.h"
134 #include "llvm/CodeGen/MachineFrameInfo.h"
135 #include "llvm/CodeGen/MachineFunction.h"
136 #include "llvm/CodeGen/MachineInstr.h"
137 #include "llvm/CodeGen/MachineInstrBuilder.h"
138 #include "llvm/CodeGen/MachineMemOperand.h"
139 #include "llvm/CodeGen/MachineModuleInfo.h"
140 #include "llvm/CodeGen/MachineOperand.h"
141 #include "llvm/CodeGen/MachineRegisterInfo.h"
142 #include "llvm/CodeGen/RegisterScavenging.h"
143 #include "llvm/CodeGen/TargetInstrInfo.h"
144 #include "llvm/CodeGen/TargetRegisterInfo.h"
145 #include "llvm/CodeGen/TargetSubtargetInfo.h"
146 #include "llvm/CodeGen/WinEHFuncInfo.h"
147 #include "llvm/IR/Attributes.h"
148 #include "llvm/IR/CallingConv.h"
149 #include "llvm/IR/DataLayout.h"
150 #include "llvm/IR/DebugLoc.h"
151 #include "llvm/IR/Function.h"
152 #include "llvm/MC/MCAsmInfo.h"
153 #include "llvm/MC/MCDwarf.h"
154 #include "llvm/Support/CommandLine.h"
155 #include "llvm/Support/Debug.h"
156 #include "llvm/Support/ErrorHandling.h"
157 #include "llvm/Support/LEB128.h"
158 #include "llvm/Support/MathExtras.h"
159 #include "llvm/Support/raw_ostream.h"
160 #include "llvm/Target/TargetMachine.h"
161 #include "llvm/Target/TargetOptions.h"
162 #include <cassert>
163 #include <cstdint>
164 #include <iterator>
165 #include <vector>
166 
167 using namespace llvm;
168 
169 #define DEBUG_TYPE "frame-info"
170 
171 static cl::opt<bool> EnableRedZone("aarch64-redzone",
172                                    cl::desc("enable use of redzone on AArch64"),
173                                    cl::init(false), cl::Hidden);
174 
175 static cl::opt<bool>
176     ReverseCSRRestoreSeq("reverse-csr-restore-seq",
177                          cl::desc("reverse the CSR restore sequence"),
178                          cl::init(false), cl::Hidden);
179 
180 static cl::opt<bool> StackTaggingMergeSetTag(
181     "stack-tagging-merge-settag",
182     cl::desc("merge settag instruction in function epilog"), cl::init(true),
183     cl::Hidden);
184 
185 static cl::opt<bool> OrderFrameObjects("aarch64-order-frame-objects",
186                                        cl::desc("sort stack allocations"),
187                                        cl::init(true), cl::Hidden);
188 
189 cl::opt<bool> EnableHomogeneousPrologEpilog(
190     "homogeneous-prolog-epilog", cl::init(false), cl::ZeroOrMore, cl::Hidden,
191     cl::desc("Emit homogeneous prologue and epilogue for the size "
192              "optimization (default = off)"));
193 
194 STATISTIC(NumRedZoneFunctions, "Number of functions using red zone");
195 
196 /// Returns how much of the incoming argument stack area (in bytes) we should
197 /// clean up in an epilogue. For the C calling convention this will be 0, for
198 /// guaranteed tail call conventions it can be positive (a normal return or a
199 /// tail call to a function that uses less stack space for arguments) or
200 /// negative (for a tail call to a function that needs more stack space than us
201 /// for arguments).
202 static int64_t getArgumentStackToRestore(MachineFunction &MF,
203                                          MachineBasicBlock &MBB) {
204   MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
205   bool IsTailCallReturn = false;
206   if (MBB.end() != MBBI) {
207     unsigned RetOpcode = MBBI->getOpcode();
208     IsTailCallReturn = RetOpcode == AArch64::TCRETURNdi ||
209                        RetOpcode == AArch64::TCRETURNri ||
210                        RetOpcode == AArch64::TCRETURNriBTI;
211   }
212   AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
213 
214   int64_t ArgumentPopSize = 0;
215   if (IsTailCallReturn) {
216     MachineOperand &StackAdjust = MBBI->getOperand(1);
217 
218     // For a tail-call in a callee-pops-arguments environment, some or all of
219     // the stack may actually be in use for the call's arguments; this is
220     // calculated during LowerCall and consumed here...
221     ArgumentPopSize = StackAdjust.getImm();
222   } else {
223     // ... otherwise the amount to pop is *all* of the argument space,
224     // conveniently stored in the MachineFunctionInfo by
225     // LowerFormalArguments. This will, of course, be zero for the C calling
226     // convention.
227     ArgumentPopSize = AFI->getArgumentStackToRestore();
228   }
229 
230   return ArgumentPopSize;
231 }
232 
233 static bool produceCompactUnwindFrame(MachineFunction &MF);
234 static bool needsWinCFI(const MachineFunction &MF);
235 static StackOffset getSVEStackSize(const MachineFunction &MF);
236 
237 /// Returns true if homogeneous prolog or epilog code can be emitted
238 /// for the size optimization. If possible, a frame helper call is injected.
239 /// When an Exit block is given, this check is for the epilog.
240 bool AArch64FrameLowering::homogeneousPrologEpilog(
241     MachineFunction &MF, MachineBasicBlock *Exit) const {
242   if (!MF.getFunction().hasMinSize())
243     return false;
244   if (!EnableHomogeneousPrologEpilog)
245     return false;
246   if (ReverseCSRRestoreSeq)
247     return false;
248   if (EnableRedZone)
249     return false;
250 
251   // TODO: Windows is not supported yet.
252   if (needsWinCFI(MF))
253     return false;
254   // TODO: SVE is not supported yet.
255   if (getSVEStackSize(MF))
256     return false;
257 
258   // Bail on stack adjustment needed on return for simplicity.
259   const MachineFrameInfo &MFI = MF.getFrameInfo();
260   const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo();
261   if (MFI.hasVarSizedObjects() || RegInfo->hasStackRealignment(MF))
262     return false;
263   if (Exit && getArgumentStackToRestore(MF, *Exit))
264     return false;
265 
266   return true;
267 }
268 
269 /// Returns true if CSRs should be paired.
270 bool AArch64FrameLowering::producePairRegisters(MachineFunction &MF) const {
271   return produceCompactUnwindFrame(MF) || homogeneousPrologEpilog(MF);
272 }
273 
274 /// This is the biggest offset to the stack pointer we can encode in AArch64
275 /// instructions (without using a separate calculation and a temp register).
276 /// Note that the exceptions here are vector stores/loads, which cannot encode
277 /// any displacements (see estimateRSStackSizeLimit(), isAArch64FrameOffsetLegal()).
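/// (For reference, the unscaled LDUR/STUR addressing forms take a 9-bit signed
/// immediate, i.e. byte offsets in the range [-256, 255].)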
278 static const unsigned DefaultSafeSPDisplacement = 255;
279 
280 /// Look at each instruction that references stack frames and return the stack
281 /// size limit beyond which some of these instructions will require a scratch
282 /// register during their expansion later.
283 static unsigned estimateRSStackSizeLimit(MachineFunction &MF) {
284   // FIXME: For now, just conservatively guesstimate based on the unscaled
285   // indexing range. We'll often end up allocating an unnecessary spill slot, but
286   // realistically that's not a big deal at this stage of the game.
287   for (MachineBasicBlock &MBB : MF) {
288     for (MachineInstr &MI : MBB) {
289       if (MI.isDebugInstr() || MI.isPseudo() ||
290           MI.getOpcode() == AArch64::ADDXri ||
291           MI.getOpcode() == AArch64::ADDSXri)
292         continue;
293 
294       for (const MachineOperand &MO : MI.operands()) {
295         if (!MO.isFI())
296           continue;
297 
298         StackOffset Offset;
299         if (isAArch64FrameOffsetLegal(MI, Offset, nullptr, nullptr, nullptr) ==
300             AArch64FrameOffsetCannotUpdate)
301           return 0;
302       }
303     }
304   }
305   return DefaultSafeSPDisplacement;
306 }
307 
308 TargetStackID::Value
309 AArch64FrameLowering::getStackIDForScalableVectors() const {
310   return TargetStackID::ScalableVector;
311 }
312 
313 /// Returns the size of the fixed object area (allocated next to sp on entry).
314 /// On Win64 this may include a var args area and an UnwindHelp object for EH.
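/// For example, a Win64 vararg function that homes all eight GPR argument
/// registers (64 bytes, an illustrative assumption) and has EH funclets gets
/// alignTo(64 + 8, 16) = 80 bytes; outside Win64, only the tail-call reserved
/// stack is counted.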
315 static unsigned getFixedObjectSize(const MachineFunction &MF,
316                                    const AArch64FunctionInfo *AFI, bool IsWin64,
317                                    bool IsFunclet) {
318   if (!IsWin64 || IsFunclet) {
319     return AFI->getTailCallReservedStack();
320   } else {
321     if (AFI->getTailCallReservedStack() != 0)
322       report_fatal_error("cannot generate ABI-changing tail call for Win64");
323     // Var args are stored here in the primary function.
324     const unsigned VarArgsArea = AFI->getVarArgsGPRSize();
325     // To support EH funclets we allocate an UnwindHelp object
326     const unsigned UnwindHelpObject = (MF.hasEHFunclets() ? 8 : 0);
327     return alignTo(VarArgsArea + UnwindHelpObject, 16);
328   }
329 }
330 
331 /// Returns the size of the entire SVE stackframe (calleesaves + spills).
332 static StackOffset getSVEStackSize(const MachineFunction &MF) {
333   const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
334   return StackOffset::getScalable((int64_t)AFI->getStackSizeSVE());
335 }
336 
337 bool AArch64FrameLowering::canUseRedZone(const MachineFunction &MF) const {
338   if (!EnableRedZone)
339     return false;
340 
341   // Don't use the red zone if the function explicitly asks us not to.
342   // This is typically used for kernel code.
343   const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
344   const unsigned RedZoneSize =
345       Subtarget.getTargetLowering()->getRedZoneSize(MF.getFunction());
346   if (!RedZoneSize)
347     return false;
348 
349   const MachineFrameInfo &MFI = MF.getFrameInfo();
350   const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
351   uint64_t NumBytes = AFI->getLocalStackSize();
352 
353   return !(MFI.hasCalls() || hasFP(MF) || NumBytes > RedZoneSize ||
354            getSVEStackSize(MF));
355 }
356 
357 /// hasFP - Return true if the specified function should have a dedicated frame
358 /// pointer register.
359 bool AArch64FrameLowering::hasFP(const MachineFunction &MF) const {
360   const MachineFrameInfo &MFI = MF.getFrameInfo();
361   const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo();
362   // Win64 EH requires a frame pointer if funclets are present, as the locals
363   // are accessed off the frame pointer in both the parent function and the
364   // funclets.
365   if (MF.hasEHFunclets())
366     return true;
367   // Retain behavior of always omitting the FP for leaf functions when possible.
368   if (MF.getTarget().Options.DisableFramePointerElim(MF))
369     return true;
370   if (MFI.hasVarSizedObjects() || MFI.isFrameAddressTaken() ||
371       MFI.hasStackMap() || MFI.hasPatchPoint() ||
372       RegInfo->hasStackRealignment(MF))
373     return true;
374   // With large callframes around we may need to use FP to access the scavenging
375   // emergency spillslot.
376   //
377   // Unfortunately some calls to hasFP() like machine verifier ->
378   // getReservedReg() -> hasFP in the middle of global isel are too early
379   // to know the max call frame size. Hopefully conservatively returning "true"
380   // in those cases is fine.
381   // DefaultSafeSPDisplacement is fine as we only emergency spill GP regs.
382   if (!MFI.isMaxCallFrameSizeComputed() ||
383       MFI.getMaxCallFrameSize() > DefaultSafeSPDisplacement)
384     return true;
385 
386   return false;
387 }
388 
389 /// hasReservedCallFrame - Under normal circumstances, when a frame pointer is
390 /// not required, we reserve argument space for call sites in the function
391 /// immediately on entry to the current function.  This eliminates the need for
392 /// add/sub sp brackets around call sites.  Returns true if the call frame is
393 /// included as part of the stack frame.
394 bool
395 AArch64FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const {
396   return !MF.getFrameInfo().hasVarSizedObjects();
397 }
398 
399 MachineBasicBlock::iterator AArch64FrameLowering::eliminateCallFramePseudoInstr(
400     MachineFunction &MF, MachineBasicBlock &MBB,
401     MachineBasicBlock::iterator I) const {
402   const AArch64InstrInfo *TII =
403       static_cast<const AArch64InstrInfo *>(MF.getSubtarget().getInstrInfo());
404   DebugLoc DL = I->getDebugLoc();
405   unsigned Opc = I->getOpcode();
406   bool IsDestroy = Opc == TII->getCallFrameDestroyOpcode();
407   uint64_t CalleePopAmount = IsDestroy ? I->getOperand(1).getImm() : 0;
408 
409   if (!hasReservedCallFrame(MF)) {
410     int64_t Amount = I->getOperand(0).getImm();
411     Amount = alignTo(Amount, getStackAlign());
412     if (!IsDestroy)
413       Amount = -Amount;
414 
415     // N.b. if CalleePopAmount is valid but zero (i.e. callee would pop, but it
416     // doesn't have to pop anything), then the first operand will be zero too so
417     // this adjustment is a no-op.
418     if (CalleePopAmount == 0) {
419       // FIXME: in-function stack adjustment for calls is limited to 24-bits
420       // because there's no guaranteed temporary register available.
421       //
422       // ADD/SUB (immediate) has only LSL #0 and LSL #12 available.
423       // 1) For offsets that fit in 12 bits, we use LSL #0.
424       // 2) For offsets that need 13 to 24 bits, we use two instructions. One uses
425       // LSL #0, and the other uses LSL #12.
426       //
427       // Most call frames will be allocated at the start of a function so
428       // this is OK, but it is a limitation that needs dealing with.
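      // For example (illustrative values), a 0x12345-byte decrement can be
      // materialized as:
      //   sub sp, sp, #0x12, lsl #12   // 0x12000
      //   sub sp, sp, #0x345           // 0x345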
429       assert(Amount > -0xffffff && Amount < 0xffffff && "call frame too large");
430       emitFrameOffset(MBB, I, DL, AArch64::SP, AArch64::SP,
431                       StackOffset::getFixed(Amount), TII);
432     }
433   } else if (CalleePopAmount != 0) {
434     // If the calling convention demands that the callee pops arguments from the
435     // stack, we want to add it back if we have a reserved call frame.
436     assert(CalleePopAmount < 0xffffff && "call frame too large");
437     emitFrameOffset(MBB, I, DL, AArch64::SP, AArch64::SP,
438                     StackOffset::getFixed(-(int64_t)CalleePopAmount), TII);
439   }
440   return MBB.erase(I);
441 }
442 
443 // Convenience function to create a DWARF expression for
444 //   Expr + NumBytes + NumVGScaledBytes * AArch64::VG
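// For example, NumBytes=16 and NumVGScaledBytes=-2 append (conceptually):
//   DW_OP_consts 16, DW_OP_plus,
//   DW_OP_consts -2, DW_OP_bregx VG 0, DW_OP_mul, DW_OP_plus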
445 static void appendVGScaledOffsetExpr(SmallVectorImpl<char> &Expr,
446                                      int NumBytes, int NumVGScaledBytes, unsigned VG,
447                                      llvm::raw_string_ostream &Comment) {
448   uint8_t buffer[16];
449 
450   if (NumBytes) {
451     Expr.push_back(dwarf::DW_OP_consts);
452     Expr.append(buffer, buffer + encodeSLEB128(NumBytes, buffer));
453     Expr.push_back((uint8_t)dwarf::DW_OP_plus);
454     Comment << (NumBytes < 0 ? " - " : " + ") << std::abs(NumBytes);
455   }
456 
457   if (NumVGScaledBytes) {
458     Expr.push_back((uint8_t)dwarf::DW_OP_consts);
459     Expr.append(buffer, buffer + encodeSLEB128(NumVGScaledBytes, buffer));
460 
461     Expr.push_back((uint8_t)dwarf::DW_OP_bregx);
462     Expr.append(buffer, buffer + encodeULEB128(VG, buffer));
463     Expr.push_back(0);
464 
465     Expr.push_back((uint8_t)dwarf::DW_OP_mul);
466     Expr.push_back((uint8_t)dwarf::DW_OP_plus);
467 
468     Comment << (NumVGScaledBytes < 0 ? " - " : " + ")
469             << std::abs(NumVGScaledBytes) << " * VG";
470   }
471 }
472 
473 // Creates an MCCFIInstruction:
474 //    { DW_CFA_def_cfa_expression, ULEB128 (sizeof expr), expr }
475 MCCFIInstruction AArch64FrameLowering::createDefCFAExpressionFromSP(
476     const TargetRegisterInfo &TRI, const StackOffset &OffsetFromSP) const {
477   int64_t NumBytes, NumVGScaledBytes;
478   AArch64InstrInfo::decomposeStackOffsetForDwarfOffsets(OffsetFromSP, NumBytes,
479                                                         NumVGScaledBytes);
480 
481   std::string CommentBuffer = "sp";
482   llvm::raw_string_ostream Comment(CommentBuffer);
483 
484   // Build up the expression (SP + NumBytes + NumVGScaledBytes * AArch64::VG)
485   SmallString<64> Expr;
486   Expr.push_back((uint8_t)(dwarf::DW_OP_breg0 + /*SP*/ 31));
487   Expr.push_back(0);
488   appendVGScaledOffsetExpr(Expr, NumBytes, NumVGScaledBytes,
489                            TRI.getDwarfRegNum(AArch64::VG, true), Comment);
490 
491   // Wrap this into DW_CFA_def_cfa.
492   SmallString<64> DefCfaExpr;
493   DefCfaExpr.push_back(dwarf::DW_CFA_def_cfa_expression);
494   uint8_t buffer[16];
495   DefCfaExpr.append(buffer,
496                     buffer + encodeULEB128(Expr.size(), buffer));
497   DefCfaExpr.append(Expr.str());
498   return MCCFIInstruction::createEscape(nullptr, DefCfaExpr.str(),
499                                         Comment.str());
500 }
501 
502 MCCFIInstruction AArch64FrameLowering::createCfaOffset(
503     const TargetRegisterInfo &TRI, unsigned Reg,
504     const StackOffset &OffsetFromDefCFA) const {
505   int64_t NumBytes, NumVGScaledBytes;
506   AArch64InstrInfo::decomposeStackOffsetForDwarfOffsets(
507       OffsetFromDefCFA, NumBytes, NumVGScaledBytes);
508 
509   unsigned DwarfReg = TRI.getDwarfRegNum(Reg, true);
510 
511   // Non-scalable offsets can use DW_CFA_offset directly.
512   if (!NumVGScaledBytes)
513     return MCCFIInstruction::createOffset(nullptr, DwarfReg, NumBytes);
514 
515   std::string CommentBuffer;
516   llvm::raw_string_ostream Comment(CommentBuffer);
517   Comment << printReg(Reg, &TRI) << "  @ cfa";
518 
519   // Build up expression (NumBytes + NumVGScaledBytes * AArch64::VG)
520   SmallString<64> OffsetExpr;
521   appendVGScaledOffsetExpr(OffsetExpr, NumBytes, NumVGScaledBytes,
522                            TRI.getDwarfRegNum(AArch64::VG, true), Comment);
523 
524   // Wrap this into DW_CFA_expression
525   SmallString<64> CfaExpr;
526   CfaExpr.push_back(dwarf::DW_CFA_expression);
527   uint8_t buffer[16];
528   CfaExpr.append(buffer, buffer + encodeULEB128(DwarfReg, buffer));
529   CfaExpr.append(buffer, buffer + encodeULEB128(OffsetExpr.size(), buffer));
530   CfaExpr.append(OffsetExpr.str());
531 
532   return MCCFIInstruction::createEscape(nullptr, CfaExpr.str(), Comment.str());
533 }
534 
535 void AArch64FrameLowering::emitCalleeSavedFrameMoves(
536     MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) const {
537   MachineFunction &MF = *MBB.getParent();
538   MachineFrameInfo &MFI = MF.getFrameInfo();
539   const TargetSubtargetInfo &STI = MF.getSubtarget();
540   const TargetRegisterInfo *TRI = STI.getRegisterInfo();
541   const TargetInstrInfo *TII = STI.getInstrInfo();
542   DebugLoc DL = MBB.findDebugLoc(MBBI);
543 
544   // Add callee saved registers to move list.
545   const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo();
546   if (CSI.empty())
547     return;
548 
549   for (const auto &Info : CSI) {
550     unsigned Reg = Info.getReg();
551 
552     // Not all unwinders may know about SVE registers, so assume the lowest
553     // common denominator.
554     unsigned NewReg;
555     if (static_cast<const AArch64RegisterInfo *>(TRI)->regNeedsCFI(Reg, NewReg))
556       Reg = NewReg;
557     else
558       continue;
559 
560     StackOffset Offset;
561     if (MFI.getStackID(Info.getFrameIdx()) == TargetStackID::ScalableVector) {
562       AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
563       Offset =
564           StackOffset::getScalable(MFI.getObjectOffset(Info.getFrameIdx())) -
565           StackOffset::getFixed(AFI->getCalleeSavedStackSize(MFI));
566     } else {
567       Offset = StackOffset::getFixed(MFI.getObjectOffset(Info.getFrameIdx()) -
568                                      getOffsetOfLocalArea());
569     }
570     unsigned CFIIndex = MF.addFrameInst(createCfaOffset(*TRI, Reg, Offset));
571     BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
572         .addCFIIndex(CFIIndex)
573         .setMIFlags(MachineInstr::FrameSetup);
574   }
575 }
576 
577 // Find a scratch register that we can use at the start of the prologue to
578 // re-align the stack pointer.  We avoid using callee-save registers since they
579 // may appear to be free when this is called from canUseAsPrologue (during
580 // shrink wrapping), but then no longer be free when this is called from
581 // emitPrologue.
582 //
583 // FIXME: This is a bit conservative, since in the above case we could use one
584 // of the callee-save registers as a scratch temp to re-align the stack pointer,
585 // but we would then have to make sure that we were in fact saving at least one
586 // callee-save register in the prologue, which is additional complexity that
587 // doesn't seem worth the benefit.
588 static unsigned findScratchNonCalleeSaveRegister(MachineBasicBlock *MBB) {
589   MachineFunction *MF = MBB->getParent();
590 
591   // If MBB is an entry block, use X9 as the scratch register
592   if (&MF->front() == MBB)
593     return AArch64::X9;
594 
595   const AArch64Subtarget &Subtarget = MF->getSubtarget<AArch64Subtarget>();
596   const AArch64RegisterInfo &TRI = *Subtarget.getRegisterInfo();
597   LivePhysRegs LiveRegs(TRI);
598   LiveRegs.addLiveIns(*MBB);
599 
600   // Mark callee saved registers as used so we will not choose them.
601   const MCPhysReg *CSRegs = MF->getRegInfo().getCalleeSavedRegs();
602   for (unsigned i = 0; CSRegs[i]; ++i)
603     LiveRegs.addReg(CSRegs[i]);
604 
605   // Prefer X9 since it was historically used for the prologue scratch reg.
606   const MachineRegisterInfo &MRI = MF->getRegInfo();
607   if (LiveRegs.available(MRI, AArch64::X9))
608     return AArch64::X9;
609 
610   for (unsigned Reg : AArch64::GPR64RegClass) {
611     if (LiveRegs.available(MRI, Reg))
612       return Reg;
613   }
614   return AArch64::NoRegister;
615 }
616 
617 bool AArch64FrameLowering::canUseAsPrologue(
618     const MachineBasicBlock &MBB) const {
619   const MachineFunction *MF = MBB.getParent();
620   MachineBasicBlock *TmpMBB = const_cast<MachineBasicBlock *>(&MBB);
621   const AArch64Subtarget &Subtarget = MF->getSubtarget<AArch64Subtarget>();
622   const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
623 
624   // Don't need a scratch register if we're not going to re-align the stack.
625   if (!RegInfo->hasStackRealignment(*MF))
626     return true;
627   // Otherwise, we can use any block as long as it has a scratch register
628   // available.
629   return findScratchNonCalleeSaveRegister(TmpMBB) != AArch64::NoRegister;
630 }
631 
632 static bool windowsRequiresStackProbe(MachineFunction &MF,
633                                       uint64_t StackSizeInBytes) {
634   const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
635   if (!Subtarget.isTargetWindows())
636     return false;
637   const Function &F = MF.getFunction();
638   // TODO: When implementing stack protectors, take that into account
639   // for the probe threshold.
640   unsigned StackProbeSize = 4096;
641   if (F.hasFnAttribute("stack-probe-size"))
642     F.getFnAttribute("stack-probe-size")
643         .getValueAsString()
644         .getAsInteger(0, StackProbeSize);
645   return (StackSizeInBytes >= StackProbeSize) &&
646          !F.hasFnAttribute("no-stack-arg-probe");
647 }
648 
649 static bool needsWinCFI(const MachineFunction &MF) {
650   const Function &F = MF.getFunction();
651   return MF.getTarget().getMCAsmInfo()->usesWindowsCFI() &&
652          F.needsUnwindTableEntry();
653 }
654 
655 bool AArch64FrameLowering::shouldCombineCSRLocalStackBump(
656     MachineFunction &MF, uint64_t StackBumpBytes) const {
657   AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
658   const MachineFrameInfo &MFI = MF.getFrameInfo();
659   const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
660   const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
661   if (homogeneousPrologEpilog(MF))
662     return false;
663 
664   if (AFI->getLocalStackSize() == 0)
665     return false;
666 
667   // For WinCFI, if optimizing for size, prefer to not combine the stack bump
668   // (to force a stp with predecrement) to match the packed unwind format,
669   // provided that there actually are any callee saved registers to merge the
670   // decrement with.
671   // This is potentially marginally slower, but allows using the packed
672   // unwind format for functions that both have a local area and callee saved
673   // registers. Using the packed unwind format notably reduces the size of
674   // the unwind info.
675   if (needsWinCFI(MF) && AFI->getCalleeSavedStackSize() > 0 &&
676       MF.getFunction().hasOptSize())
677     return false;
678 
679   // 512 is the maximum immediate for stp/ldp that will be used for
680   // callee-save save/restores
681   if (StackBumpBytes >= 512 || windowsRequiresStackProbe(MF, StackBumpBytes))
682     return false;
683 
684   if (MFI.hasVarSizedObjects())
685     return false;
686 
687   if (RegInfo->hasStackRealignment(MF))
688     return false;
689 
690   // This isn't strictly necessary, but it simplifies things a bit since the
691   // current RedZone handling code assumes the SP is adjusted by the
692   // callee-save save/restore code.
693   if (canUseRedZone(MF))
694     return false;
695 
696   // When there is an SVE area on the stack, always allocate the
697   // callee-saves and spills/locals separately.
698   if (getSVEStackSize(MF))
699     return false;
700 
701   return true;
702 }
703 
704 bool AArch64FrameLowering::shouldCombineCSRLocalStackBumpInEpilogue(
705     MachineBasicBlock &MBB, unsigned StackBumpBytes) const {
706   if (!shouldCombineCSRLocalStackBump(*MBB.getParent(), StackBumpBytes))
707     return false;
708 
709   if (MBB.empty())
710     return true;
711 
712   // Disable combined SP bump if the last instruction is an MTE tag store. It
713   // is almost always better to merge SP adjustment into those instructions.
714   MachineBasicBlock::iterator LastI = MBB.getFirstTerminator();
715   MachineBasicBlock::iterator Begin = MBB.begin();
716   while (LastI != Begin) {
717     --LastI;
718     if (LastI->isTransient())
719       continue;
720     if (!LastI->getFlag(MachineInstr::FrameDestroy))
721       break;
722   }
723   switch (LastI->getOpcode()) {
724   case AArch64::STGloop:
725   case AArch64::STZGloop:
726   case AArch64::STGOffset:
727   case AArch64::STZGOffset:
728   case AArch64::ST2GOffset:
729   case AArch64::STZ2GOffset:
730     return false;
731   default:
732     return true;
733   }
734   llvm_unreachable("unreachable");
735 }
736 
737 // Given a load or a store instruction, generate an appropriate unwinding SEH
738 // code on Windows.
739 static MachineBasicBlock::iterator InsertSEH(MachineBasicBlock::iterator MBBI,
740                                              const TargetInstrInfo &TII,
741                                              MachineInstr::MIFlag Flag) {
742   unsigned Opc = MBBI->getOpcode();
743   MachineBasicBlock *MBB = MBBI->getParent();
744   MachineFunction &MF = *MBB->getParent();
745   DebugLoc DL = MBBI->getDebugLoc();
746   unsigned ImmIdx = MBBI->getNumOperands() - 1;
747   int Imm = MBBI->getOperand(ImmIdx).getImm();
748   MachineInstrBuilder MIB;
749   const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
750   const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
751 
752   switch (Opc) {
753   default:
754     llvm_unreachable("No SEH Opcode for this instruction");
755   case AArch64::LDPDpost:
756     Imm = -Imm;
757     LLVM_FALLTHROUGH;
758   case AArch64::STPDpre: {
759     unsigned Reg0 = RegInfo->getSEHRegNum(MBBI->getOperand(1).getReg());
760     unsigned Reg1 = RegInfo->getSEHRegNum(MBBI->getOperand(2).getReg());
761     MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveFRegP_X))
762               .addImm(Reg0)
763               .addImm(Reg1)
764               .addImm(Imm * 8)
765               .setMIFlag(Flag);
766     break;
767   }
768   case AArch64::LDPXpost:
769     Imm = -Imm;
770     LLVM_FALLTHROUGH;
771   case AArch64::STPXpre: {
772     Register Reg0 = MBBI->getOperand(1).getReg();
773     Register Reg1 = MBBI->getOperand(2).getReg();
774     if (Reg0 == AArch64::FP && Reg1 == AArch64::LR)
775       MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveFPLR_X))
776                 .addImm(Imm * 8)
777                 .setMIFlag(Flag);
778     else
779       MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveRegP_X))
780                 .addImm(RegInfo->getSEHRegNum(Reg0))
781                 .addImm(RegInfo->getSEHRegNum(Reg1))
782                 .addImm(Imm * 8)
783                 .setMIFlag(Flag);
784     break;
785   }
786   case AArch64::LDRDpost:
787     Imm = -Imm;
788     LLVM_FALLTHROUGH;
789   case AArch64::STRDpre: {
790     unsigned Reg = RegInfo->getSEHRegNum(MBBI->getOperand(1).getReg());
791     MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveFReg_X))
792               .addImm(Reg)
793               .addImm(Imm)
794               .setMIFlag(Flag);
795     break;
796   }
797   case AArch64::LDRXpost:
798     Imm = -Imm;
799     LLVM_FALLTHROUGH;
800   case AArch64::STRXpre: {
801     unsigned Reg =  RegInfo->getSEHRegNum(MBBI->getOperand(1).getReg());
802     MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveReg_X))
803               .addImm(Reg)
804               .addImm(Imm)
805               .setMIFlag(Flag);
806     break;
807   }
808   case AArch64::STPDi:
809   case AArch64::LDPDi: {
810     unsigned Reg0 =  RegInfo->getSEHRegNum(MBBI->getOperand(0).getReg());
811     unsigned Reg1 =  RegInfo->getSEHRegNum(MBBI->getOperand(1).getReg());
812     MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveFRegP))
813               .addImm(Reg0)
814               .addImm(Reg1)
815               .addImm(Imm * 8)
816               .setMIFlag(Flag);
817     break;
818   }
819   case AArch64::STPXi:
820   case AArch64::LDPXi: {
821     Register Reg0 = MBBI->getOperand(0).getReg();
822     Register Reg1 = MBBI->getOperand(1).getReg();
823     if (Reg0 == AArch64::FP && Reg1 == AArch64::LR)
824       MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveFPLR))
825                 .addImm(Imm * 8)
826                 .setMIFlag(Flag);
827     else
828       MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveRegP))
829                 .addImm(RegInfo->getSEHRegNum(Reg0))
830                 .addImm(RegInfo->getSEHRegNum(Reg1))
831                 .addImm(Imm * 8)
832                 .setMIFlag(Flag);
833     break;
834   }
835   case AArch64::STRXui:
836   case AArch64::LDRXui: {
837     int Reg = RegInfo->getSEHRegNum(MBBI->getOperand(0).getReg());
838     MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveReg))
839               .addImm(Reg)
840               .addImm(Imm * 8)
841               .setMIFlag(Flag);
842     break;
843   }
844   case AArch64::STRDui:
845   case AArch64::LDRDui: {
846     unsigned Reg = RegInfo->getSEHRegNum(MBBI->getOperand(0).getReg());
847     MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveFReg))
848               .addImm(Reg)
849               .addImm(Imm * 8)
850               .setMIFlag(Flag);
851     break;
852   }
853   }
854   auto I = MBB->insertAfter(MBBI, MIB);
855   return I;
856 }
857 
858 // Fix up the SEH opcode associated with the save/restore instruction.
859 static void fixupSEHOpcode(MachineBasicBlock::iterator MBBI,
860                            unsigned LocalStackSize) {
861   MachineOperand *ImmOpnd = nullptr;
862   unsigned ImmIdx = MBBI->getNumOperands() - 1;
863   switch (MBBI->getOpcode()) {
864   default:
865     llvm_unreachable("Fix the offset in the SEH instruction");
866   case AArch64::SEH_SaveFPLR:
867   case AArch64::SEH_SaveRegP:
868   case AArch64::SEH_SaveReg:
869   case AArch64::SEH_SaveFRegP:
870   case AArch64::SEH_SaveFReg:
871     ImmOpnd = &MBBI->getOperand(ImmIdx);
872     break;
873   }
874   if (ImmOpnd)
875     ImmOpnd->setImm(ImmOpnd->getImm() + LocalStackSize);
876 }
877 
878 // Convert callee-save register save/restore instruction to do stack pointer
879 // decrement/increment to allocate/deallocate the callee-save stack area by
880 // converting store/load to use pre/post increment version.
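// For example (illustrative registers), in a prologue with a 16-byte GPR
// callee-save area the first save becomes a pre-indexed store:
//   stp x29, x30, [sp, #0]   ==>   stp x29, x30, [sp, #-16]!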
881 static MachineBasicBlock::iterator convertCalleeSaveRestoreToSPPrePostIncDec(
882     MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
883     const DebugLoc &DL, const TargetInstrInfo *TII, int CSStackSizeInc,
884     bool NeedsWinCFI, bool *HasWinCFI, bool InProlog = true) {
885   // Ignore instructions that do not operate on SP, i.e. shadow call stack
886   // instructions and associated CFI instruction.
887   while (MBBI->getOpcode() == AArch64::STRXpost ||
888          MBBI->getOpcode() == AArch64::LDRXpre ||
889          MBBI->getOpcode() == AArch64::CFI_INSTRUCTION) {
890     if (MBBI->getOpcode() != AArch64::CFI_INSTRUCTION)
891       assert(MBBI->getOperand(0).getReg() != AArch64::SP);
892     ++MBBI;
893   }
894   unsigned NewOpc;
895   switch (MBBI->getOpcode()) {
896   default:
897     llvm_unreachable("Unexpected callee-save save/restore opcode!");
898   case AArch64::STPXi:
899     NewOpc = AArch64::STPXpre;
900     break;
901   case AArch64::STPDi:
902     NewOpc = AArch64::STPDpre;
903     break;
904   case AArch64::STPQi:
905     NewOpc = AArch64::STPQpre;
906     break;
907   case AArch64::STRXui:
908     NewOpc = AArch64::STRXpre;
909     break;
910   case AArch64::STRDui:
911     NewOpc = AArch64::STRDpre;
912     break;
913   case AArch64::STRQui:
914     NewOpc = AArch64::STRQpre;
915     break;
916   case AArch64::LDPXi:
917     NewOpc = AArch64::LDPXpost;
918     break;
919   case AArch64::LDPDi:
920     NewOpc = AArch64::LDPDpost;
921     break;
922   case AArch64::LDPQi:
923     NewOpc = AArch64::LDPQpost;
924     break;
925   case AArch64::LDRXui:
926     NewOpc = AArch64::LDRXpost;
927     break;
928   case AArch64::LDRDui:
929     NewOpc = AArch64::LDRDpost;
930     break;
931   case AArch64::LDRQui:
932     NewOpc = AArch64::LDRQpost;
933     break;
934   }
935   // Get rid of the SEH code associated with the old instruction.
936   if (NeedsWinCFI) {
937     auto SEH = std::next(MBBI);
938     if (AArch64InstrInfo::isSEHInstruction(*SEH))
939       SEH->eraseFromParent();
940   }
941 
942   TypeSize Scale = TypeSize::Fixed(1);
943   unsigned Width;
944   int64_t MinOffset, MaxOffset;
945   bool Success = static_cast<const AArch64InstrInfo *>(TII)->getMemOpInfo(
946       NewOpc, Scale, Width, MinOffset, MaxOffset);
947   (void)Success;
948   assert(Success && "unknown load/store opcode");
949 
950   // If the first store isn't right where we want SP then we can't fold the
951   // update in so create a normal arithmetic instruction instead.
952   if (MBBI->getOperand(MBBI->getNumOperands() - 1).getImm() != 0 ||
953       CSStackSizeInc < MinOffset || CSStackSizeInc > MaxOffset) {
954     emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP,
955                     StackOffset::getFixed(CSStackSizeInc), TII,
956                     InProlog ? MachineInstr::FrameSetup
957                              : MachineInstr::FrameDestroy);
958     return std::prev(MBBI);
959   }
960 
961   MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII->get(NewOpc));
962   MIB.addReg(AArch64::SP, RegState::Define);
963 
964   // Copy all operands other than the immediate offset.
965   unsigned OpndIdx = 0;
966   for (unsigned OpndEnd = MBBI->getNumOperands() - 1; OpndIdx < OpndEnd;
967        ++OpndIdx)
968     MIB.add(MBBI->getOperand(OpndIdx));
969 
970   assert(MBBI->getOperand(OpndIdx).getImm() == 0 &&
971          "Unexpected immediate offset in first/last callee-save save/restore "
972          "instruction!");
973   assert(MBBI->getOperand(OpndIdx - 1).getReg() == AArch64::SP &&
974          "Unexpected base register in callee-save save/restore instruction!");
975   assert(CSStackSizeInc % Scale == 0);
976   MIB.addImm(CSStackSizeInc / (int)Scale);
977 
978   MIB.setMIFlags(MBBI->getFlags());
979   MIB.setMemRefs(MBBI->memoperands());
980 
981   // Generate a new SEH code that corresponds to the new instruction.
982   if (NeedsWinCFI) {
983     *HasWinCFI = true;
984     InsertSEH(*MIB, *TII,
985               InProlog ? MachineInstr::FrameSetup : MachineInstr::FrameDestroy);
986   }
987 
988   return std::prev(MBB.erase(MBBI));
989 }
990 
991 // Fixup callee-save register save/restore instructions to take into account
992 // combined SP bump by adding the local stack size to the stack offsets.
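// For example (illustrative registers), with a 32-byte local area folded into
// the combined SP bump:
//   stp x22, x21, [sp, #16]   ==>   stp x22, x21, [sp, #48]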
993 static void fixupCalleeSaveRestoreStackOffset(MachineInstr &MI,
994                                               uint64_t LocalStackSize,
995                                               bool NeedsWinCFI,
996                                               bool *HasWinCFI) {
997   if (AArch64InstrInfo::isSEHInstruction(MI))
998     return;
999 
1000   unsigned Opc = MI.getOpcode();
1001 
1002   // Ignore instructions that do not operate on SP, i.e. shadow call stack
1003   // instructions and associated CFI instruction.
1004   if (Opc == AArch64::STRXpost || Opc == AArch64::LDRXpre ||
1005       Opc == AArch64::CFI_INSTRUCTION) {
1006     if (Opc != AArch64::CFI_INSTRUCTION)
1007       assert(MI.getOperand(0).getReg() != AArch64::SP);
1008     return;
1009   }
1010 
1011   unsigned Scale;
1012   switch (Opc) {
1013   case AArch64::STPXi:
1014   case AArch64::STRXui:
1015   case AArch64::STPDi:
1016   case AArch64::STRDui:
1017   case AArch64::LDPXi:
1018   case AArch64::LDRXui:
1019   case AArch64::LDPDi:
1020   case AArch64::LDRDui:
1021     Scale = 8;
1022     break;
1023   case AArch64::STPQi:
1024   case AArch64::STRQui:
1025   case AArch64::LDPQi:
1026   case AArch64::LDRQui:
1027     Scale = 16;
1028     break;
1029   default:
1030     llvm_unreachable("Unexpected callee-save save/restore opcode!");
1031   }
1032 
1033   unsigned OffsetIdx = MI.getNumExplicitOperands() - 1;
1034   assert(MI.getOperand(OffsetIdx - 1).getReg() == AArch64::SP &&
1035          "Unexpected base register in callee-save save/restore instruction!");
1036   // Last operand is immediate offset that needs fixing.
1037   MachineOperand &OffsetOpnd = MI.getOperand(OffsetIdx);
1038   // All generated opcodes have scaled offsets.
1039   assert(LocalStackSize % Scale == 0);
1040   OffsetOpnd.setImm(OffsetOpnd.getImm() + LocalStackSize / Scale);
1041 
1042   if (NeedsWinCFI) {
1043     *HasWinCFI = true;
1044     auto MBBI = std::next(MachineBasicBlock::iterator(MI));
1045     assert(MBBI != MI.getParent()->end() && "Expecting a valid instruction");
1046     assert(AArch64InstrInfo::isSEHInstruction(*MBBI) &&
1047            "Expecting a SEH instruction");
1048     fixupSEHOpcode(MBBI, LocalStackSize);
1049   }
1050 }
1051 
1052 static void adaptForLdStOpt(MachineBasicBlock &MBB,
1053                             MachineBasicBlock::iterator FirstSPPopI,
1054                             MachineBasicBlock::iterator LastPopI) {
1055   // Sometimes (when we restore in the same order as we save), we can end up
1056   // with code like this:
1057   //
1058   // ldp      x26, x25, [sp]
1059   // ldp      x24, x23, [sp, #16]
1060   // ldp      x22, x21, [sp, #32]
1061   // ldp      x20, x19, [sp, #48]
1062   // add      sp, sp, #64
1063   //
1064   // In this case, it is always better to put the first ldp at the end, so
1065   // that the load-store optimizer can run and merge the ldp and the add into
1066   // a post-index ldp.
1067   // If we managed to grab the first pop instruction, move it to the end.
1068   if (ReverseCSRRestoreSeq)
1069     MBB.splice(FirstSPPopI, &MBB, LastPopI);
1070   // We should end up with something like this now:
1071   //
1072   // ldp      x24, x23, [sp, #16]
1073   // ldp      x22, x21, [sp, #32]
1074   // ldp      x20, x19, [sp, #48]
1075   // ldp      x26, x25, [sp]
1076   // add      sp, sp, #64
1077   //
1078   // and the load-store optimizer can merge the last two instructions into:
1079   //
1080   // ldp      x26, x25, [sp], #64
1081   //
1082 }
1083 
1084 static bool isTargetWindows(const MachineFunction &MF) {
1085   return MF.getSubtarget<AArch64Subtarget>().isTargetWindows();
1086 }
1087 
1088 // Convenience function to determine whether I is an SVE callee save.
1089 static bool IsSVECalleeSave(MachineBasicBlock::iterator I) {
1090   switch (I->getOpcode()) {
1091   default:
1092     return false;
1093   case AArch64::STR_ZXI:
1094   case AArch64::STR_PXI:
1095   case AArch64::LDR_ZXI:
1096   case AArch64::LDR_PXI:
1097     return I->getFlag(MachineInstr::FrameSetup) ||
1098            I->getFlag(MachineInstr::FrameDestroy);
1099   }
1100 }
1101 
1102 void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
1103                                         MachineBasicBlock &MBB) const {
1104   MachineBasicBlock::iterator MBBI = MBB.begin();
1105   const MachineFrameInfo &MFI = MF.getFrameInfo();
1106   const Function &F = MF.getFunction();
1107   const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
1108   const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
1109   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
1110   MachineModuleInfo &MMI = MF.getMMI();
1111   AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
1112   bool needsFrameMoves =
1113       MF.needsFrameMoves() && !MF.getTarget().getMCAsmInfo()->usesWindowsCFI();
1114   bool HasFP = hasFP(MF);
1115   bool NeedsWinCFI = needsWinCFI(MF);
1116   bool HasWinCFI = false;
1117   auto Cleanup = make_scope_exit([&]() { MF.setHasWinCFI(HasWinCFI); });
1118 
1119   bool IsFunclet = MBB.isEHFuncletEntry();
1120 
1121   // At this point, we're going to decide whether or not the function uses a
1122   // redzone. In most cases, the function doesn't have a redzone so let's
1123   // assume that's false and set it to true in the case that there's a redzone.
1124   AFI->setHasRedZone(false);
1125 
1126   // Debug location must be unknown since the first debug location is used
1127   // to determine the end of the prologue.
1128   DebugLoc DL;
1129 
1130   const auto &MFnI = *MF.getInfo<AArch64FunctionInfo>();
1131   if (MFnI.shouldSignReturnAddress()) {
1132 
1133     unsigned PACI;
1134     if (MFnI.shouldSignWithBKey()) {
1135       BuildMI(MBB, MBBI, DL, TII->get(AArch64::EMITBKEY))
1136           .setMIFlag(MachineInstr::FrameSetup);
1137       PACI = Subtarget.hasPAuth() ? AArch64::PACIB : AArch64::PACIBSP;
1138     } else {
1139       PACI = Subtarget.hasPAuth() ? AArch64::PACIA : AArch64::PACIASP;
1140     }
1141 
1142     auto MI = BuildMI(MBB, MBBI, DL, TII->get(PACI));
1143     if (Subtarget.hasPAuth())
1144       MI.addReg(AArch64::LR, RegState::Define)
1145           .addReg(AArch64::LR)
1146           .addReg(AArch64::SP, RegState::InternalRead);
1147     MI.setMIFlag(MachineInstr::FrameSetup);
1148 
1149     unsigned CFIIndex =
1150         MF.addFrameInst(MCCFIInstruction::createNegateRAState(nullptr));
1151     BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
1152         .addCFIIndex(CFIIndex)
1153         .setMIFlags(MachineInstr::FrameSetup);
1154   }
1155 
1156   // We signal the presence of a Swift extended frame to external tools by
1157   // storing FP with 0b0001 in bits 63:60. In normal userland operation a simple
1158   // ORR is sufficient; it is assumed a Swift kernel would initialize the TBI
1159   // bits so that this still holds.
1160   if (HasFP && AFI->hasSwiftAsyncContext()) {
1161     // ORR x29, x29, #0x1000_0000_0000_0000
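    // (0x1100 is the encoded logical-immediate form of that constant:
    // N=1, immr=4, imms=0 selects the single bit at position 60.)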
1162     BuildMI(MBB, MBBI, DL, TII->get(AArch64::ORRXri), AArch64::FP)
1163         .addUse(AArch64::FP)
1164         .addImm(0x1100)
1165         .setMIFlag(MachineInstr::FrameSetup);
1166   }
1167 
1168   // All calls are tail calls in GHC calling conv, and functions have no
1169   // prologue/epilogue.
1170   if (MF.getFunction().getCallingConv() == CallingConv::GHC)
1171     return;
1172 
1173   // Set tagged base pointer to the requested stack slot.
1174   // Ideally it should match SP value after prologue.
1175   Optional<int> TBPI = AFI->getTaggedBasePointerIndex();
1176   if (TBPI)
1177     AFI->setTaggedBasePointerOffset(-MFI.getObjectOffset(*TBPI));
1178   else
1179     AFI->setTaggedBasePointerOffset(MFI.getStackSize());
1180 
1181   const StackOffset &SVEStackSize = getSVEStackSize(MF);
1182 
1183   // getStackSize() includes all the locals in its size calculation. We don't
1184   // include these locals when computing the stack size of a funclet, as they
1185   // are allocated in the parent's stack frame and accessed via the frame
1186   // pointer from the funclet.  We only save the callee saved registers in the
1187   // funclet, which are really the callee saved registers of the parent
1188   // function, including the funclet.
1189   int64_t NumBytes = IsFunclet ? getWinEHFuncletFrameSize(MF)
1190                                : MFI.getStackSize();
1191   if (!AFI->hasStackFrame() && !windowsRequiresStackProbe(MF, NumBytes)) {
1192     assert(!HasFP && "unexpected function without stack frame but with FP");
1193     assert(!SVEStackSize &&
1194            "unexpected function without stack frame but with SVE objects");
1195     // All of the stack allocation is for locals.
1196     AFI->setLocalStackSize(NumBytes);
1197     if (!NumBytes)
1198       return;
1199     // REDZONE: If the stack size is less than 128 bytes, we don't need
1200     // to actually allocate.
1201     if (canUseRedZone(MF)) {
1202       AFI->setHasRedZone(true);
1203       ++NumRedZoneFunctions;
1204     } else {
1205       emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP,
1206                       StackOffset::getFixed(-NumBytes), TII,
1207                       MachineInstr::FrameSetup, false, NeedsWinCFI, &HasWinCFI);
1208       if (!NeedsWinCFI && needsFrameMoves) {
1209         // Label used to tie together the PROLOG_LABEL and the MachineMoves.
1210         MCSymbol *FrameLabel = MMI.getContext().createTempSymbol();
1211         // Encode the stack size of the leaf function.
1212         unsigned CFIIndex = MF.addFrameInst(
1213             MCCFIInstruction::cfiDefCfaOffset(FrameLabel, NumBytes));
1214         BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
1215             .addCFIIndex(CFIIndex)
1216             .setMIFlags(MachineInstr::FrameSetup);
1217       }
1218     }
1219 
1220     if (NeedsWinCFI) {
1221       HasWinCFI = true;
1222       BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_PrologEnd))
1223           .setMIFlag(MachineInstr::FrameSetup);
1224     }
1225 
1226     return;
1227   }
1228 
1229   bool IsWin64 =
1230       Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv());
1231   unsigned FixedObject = getFixedObjectSize(MF, AFI, IsWin64, IsFunclet);
1232 
1233   auto PrologueSaveSize = AFI->getCalleeSavedStackSize() + FixedObject;
1234   // All of the remaining stack allocations are for locals.
1235   AFI->setLocalStackSize(NumBytes - PrologueSaveSize);
1236   bool CombineSPBump = shouldCombineCSRLocalStackBump(MF, NumBytes);
1237   bool HomPrologEpilog = homogeneousPrologEpilog(MF);
1238   if (CombineSPBump) {
1239     assert(!SVEStackSize && "Cannot combine SP bump with SVE");
1240     emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP,
1241                     StackOffset::getFixed(-NumBytes), TII,
1242                     MachineInstr::FrameSetup, false, NeedsWinCFI, &HasWinCFI);
1243     NumBytes = 0;
1244   } else if (HomPrologEpilog) {
1245     // Stack has been already adjusted.
1246     NumBytes -= PrologueSaveSize;
1247   } else if (PrologueSaveSize != 0) {
1248     MBBI = convertCalleeSaveRestoreToSPPrePostIncDec(
1249         MBB, MBBI, DL, TII, -PrologueSaveSize, NeedsWinCFI, &HasWinCFI);
1250     NumBytes -= PrologueSaveSize;
1251   }
1252   assert(NumBytes >= 0 && "Negative stack allocation size!?");
1253 
1254   // Move past the saves of the callee-saved registers, fixing up the offsets
1255   // and pre-inc if we decided to combine the callee-save and local stack
1256   // pointer bump above.
1257   MachineBasicBlock::iterator End = MBB.end();
1258   while (MBBI != End && MBBI->getFlag(MachineInstr::FrameSetup) &&
1259          !IsSVECalleeSave(MBBI)) {
1260     if (CombineSPBump)
1261       fixupCalleeSaveRestoreStackOffset(*MBBI, AFI->getLocalStackSize(),
1262                                         NeedsWinCFI, &HasWinCFI);
1263     ++MBBI;
1264   }
1265 
1266   // For funclets the FP belongs to the containing function.
1267   if (!IsFunclet && HasFP) {
1268     // Only set up FP if we actually need to.
1269     int64_t FPOffset = AFI->getCalleeSaveBaseToFrameRecordOffset();
1270 
1271     if (CombineSPBump)
1272       FPOffset += AFI->getLocalStackSize();
1273 
1274     if (AFI->hasSwiftAsyncContext()) {
1275       // Before we update the live FP we have to ensure there's a valid (or
1276       // null) asynchronous context in its slot just before FP in the frame
1277       // record, so store it now.
1278       const auto &Attrs = MF.getFunction().getAttributes();
1279       bool HaveInitialContext = Attrs.hasAttrSomewhere(Attribute::SwiftAsync);
1280       if (HaveInitialContext)
1281         MBB.addLiveIn(AArch64::X22);
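           // The context is stored at [SP + FPOffset - 8], i.e. in the eight
           // bytes immediately below where FP will point once it is set up
           // further down.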
1282       BuildMI(MBB, MBBI, DL, TII->get(AArch64::StoreSwiftAsyncContext))
1283           .addUse(HaveInitialContext ? AArch64::X22 : AArch64::XZR)
1284           .addUse(AArch64::SP)
1285           .addImm(FPOffset - 8)
1286           .setMIFlags(MachineInstr::FrameSetup);
1287     }
1288 
1289     if (HomPrologEpilog) {
1290       auto Prolog = MBBI;
1291       --Prolog;
1292       assert(Prolog->getOpcode() == AArch64::HOM_Prolog);
1293       Prolog->addOperand(MachineOperand::CreateImm(FPOffset));
1294     } else {
1295       // Issue    sub fp, sp, FPOffset or
1296       //          mov fp,sp          when FPOffset is zero.
1297       // Note: All stores of callee-saved registers are marked as "FrameSetup".
1298       // This code marks the instruction(s) that set the FP also.
1299       emitFrameOffset(MBB, MBBI, DL, AArch64::FP, AArch64::SP,
1300                       StackOffset::getFixed(FPOffset), TII,
1301                       MachineInstr::FrameSetup, false, NeedsWinCFI, &HasWinCFI);
1302     }
1303   }
1304 
1305   if (windowsRequiresStackProbe(MF, NumBytes)) {
1306     uint64_t NumWords = NumBytes >> 4;
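         // Note: __chkstk on Windows/AArch64 expects the allocation size in X15
         // as a count of 16-byte units; the SUB at the end of this block scales
         // it back up to bytes with a UXTX #4 extend.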
1307     if (NeedsWinCFI) {
1308       HasWinCFI = true;
1309       // alloc_l can hold at most 256MB, so assume that NumBytes doesn't
1310       // exceed this amount.  We need to move at most 2^24 - 1 into x15.
1311       // This is at most two instructions, MOVZ followed by MOVK.
1312       // TODO: Fix to use multiple stack alloc unwind codes for stacks
1313       // exceeding 256MB in size.
1314       if (NumBytes >= (1 << 28))
1315         report_fatal_error("Stack size cannot exceed 256MB for stack "
1316                             "unwinding purposes");
1317 
1318       uint32_t LowNumWords = NumWords & 0xFFFF;
1319       BuildMI(MBB, MBBI, DL, TII->get(AArch64::MOVZXi), AArch64::X15)
1320             .addImm(LowNumWords)
1321             .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0))
1322             .setMIFlag(MachineInstr::FrameSetup);
1323       BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
1324             .setMIFlag(MachineInstr::FrameSetup);
1325       if ((NumWords & 0xFFFF0000) != 0) {
1326           BuildMI(MBB, MBBI, DL, TII->get(AArch64::MOVKXi), AArch64::X15)
1327               .addReg(AArch64::X15)
1328               .addImm((NumWords & 0xFFFF0000) >> 16) // High half
1329               .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 16))
1330               .setMIFlag(MachineInstr::FrameSetup);
1331           BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
1332             .setMIFlag(MachineInstr::FrameSetup);
1333       }
1334     } else {
1335       BuildMI(MBB, MBBI, DL, TII->get(AArch64::MOVi64imm), AArch64::X15)
1336           .addImm(NumWords)
1337           .setMIFlags(MachineInstr::FrameSetup);
1338     }
1339 
1340     switch (MF.getTarget().getCodeModel()) {
1341     case CodeModel::Tiny:
1342     case CodeModel::Small:
1343     case CodeModel::Medium:
1344     case CodeModel::Kernel:
1345       BuildMI(MBB, MBBI, DL, TII->get(AArch64::BL))
1346           .addExternalSymbol("__chkstk")
1347           .addReg(AArch64::X15, RegState::Implicit)
1348           .addReg(AArch64::X16, RegState::Implicit | RegState::Define | RegState::Dead)
1349           .addReg(AArch64::X17, RegState::Implicit | RegState::Define | RegState::Dead)
1350           .addReg(AArch64::NZCV, RegState::Implicit | RegState::Define | RegState::Dead)
1351           .setMIFlags(MachineInstr::FrameSetup);
1352       if (NeedsWinCFI) {
1353         HasWinCFI = true;
1354         BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
1355             .setMIFlag(MachineInstr::FrameSetup);
1356       }
1357       break;
1358     case CodeModel::Large:
1359       BuildMI(MBB, MBBI, DL, TII->get(AArch64::MOVaddrEXT))
1360           .addReg(AArch64::X16, RegState::Define)
1361           .addExternalSymbol("__chkstk")
1362           .addExternalSymbol("__chkstk")
1363           .setMIFlags(MachineInstr::FrameSetup);
1364       if (NeedsWinCFI) {
1365         HasWinCFI = true;
1366         BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
1367             .setMIFlag(MachineInstr::FrameSetup);
1368       }
1369 
1370       BuildMI(MBB, MBBI, DL, TII->get(getBLRCallOpcode(MF)))
1371           .addReg(AArch64::X16, RegState::Kill)
1372           .addReg(AArch64::X15, RegState::Implicit | RegState::Define)
1373           .addReg(AArch64::X16, RegState::Implicit | RegState::Define | RegState::Dead)
1374           .addReg(AArch64::X17, RegState::Implicit | RegState::Define | RegState::Dead)
1375           .addReg(AArch64::NZCV, RegState::Implicit | RegState::Define | RegState::Dead)
1376           .setMIFlags(MachineInstr::FrameSetup);
1377       if (NeedsWinCFI) {
1378         HasWinCFI = true;
1379         BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
1380             .setMIFlag(MachineInstr::FrameSetup);
1381       }
1382       break;
1383     }
1384 
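         // Now carve out the probed area: SP = SP - (X15 << 4), i.e. NumBytes.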
1385     BuildMI(MBB, MBBI, DL, TII->get(AArch64::SUBXrx64), AArch64::SP)
1386         .addReg(AArch64::SP, RegState::Kill)
1387         .addReg(AArch64::X15, RegState::Kill)
1388         .addImm(AArch64_AM::getArithExtendImm(AArch64_AM::UXTX, 4))
1389         .setMIFlags(MachineInstr::FrameSetup);
1390     if (NeedsWinCFI) {
1391       HasWinCFI = true;
1392       BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_StackAlloc))
1393           .addImm(NumBytes)
1394           .setMIFlag(MachineInstr::FrameSetup);
1395     }
1396     NumBytes = 0;
1397   }
1398 
1399   StackOffset AllocateBefore = SVEStackSize, AllocateAfter = {};
1400   MachineBasicBlock::iterator CalleeSavesBegin = MBBI, CalleeSavesEnd = MBBI;
1401 
1402   // Process the SVE callee-saves to determine what space needs to be
1403   // allocated.
1404   if (int64_t CalleeSavedSize = AFI->getSVECalleeSavedStackSize()) {
1405     // Find callee save instructions in frame.
1406     CalleeSavesBegin = MBBI;
1407     assert(IsSVECalleeSave(CalleeSavesBegin) && "Unexpected instruction");
1408     while (IsSVECalleeSave(MBBI) && MBBI != MBB.getFirstTerminator())
1409       ++MBBI;
1410     CalleeSavesEnd = MBBI;
1411 
1412     AllocateBefore = StackOffset::getScalable(CalleeSavedSize);
1413     AllocateAfter = SVEStackSize - AllocateBefore;
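         // For example, with 32 scalable bytes of SVE callee-saves inside a
         // 48-scalable-byte SVE area, AllocateBefore is 32 (emitted before the
         // SVE CSR stores found above) and AllocateAfter is the remaining 16
         // for SVE locals.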
1414   }
1415 
1416   // Allocate space for the callee saves (if any).
1417   emitFrameOffset(MBB, CalleeSavesBegin, DL, AArch64::SP, AArch64::SP,
1418                   -AllocateBefore, TII,
1419                   MachineInstr::FrameSetup);
1420 
1421   // Finally allocate remaining SVE stack space.
1422   emitFrameOffset(MBB, CalleeSavesEnd, DL, AArch64::SP, AArch64::SP,
1423                   -AllocateAfter, TII,
1424                   MachineInstr::FrameSetup);
1425 
1426   // Allocate space for the rest of the frame.
1427   if (NumBytes) {
1428     // Alignment is required for the parent frame, not the funclet
1429     const bool NeedsRealignment =
1430         !IsFunclet && RegInfo->hasStackRealignment(MF);
1431     unsigned scratchSPReg = AArch64::SP;
1432 
1433     if (NeedsRealignment) {
1434       scratchSPReg = findScratchNonCalleeSaveRegister(&MBB);
1435       assert(scratchSPReg != AArch64::NoRegister);
1436     }
1437 
1438     // If we're a leaf function, try using the red zone.
1439     if (!canUseRedZone(MF))
1440       // FIXME: in the case of dynamic re-alignment, NumBytes doesn't have
1441       // the correct value here, as NumBytes also includes padding bytes,
1442       // which shouldn't be counted here.
1443       emitFrameOffset(MBB, MBBI, DL, scratchSPReg, AArch64::SP,
1444                       StackOffset::getFixed(-NumBytes), TII,
1445                       MachineInstr::FrameSetup, false, NeedsWinCFI, &HasWinCFI);
1446 
1447     if (NeedsRealignment) {
1448       const unsigned NrBitsToZero = Log2(MFI.getMaxAlign());
1449       assert(NrBitsToZero > 1);
1450       assert(scratchSPReg != AArch64::SP);
1451 
1452       // SUB X9, SP, NumBytes
1453       //   -- X9 is a temporary register, so it shouldn't contain any live data
1454       //   -- here and is free to use. This is already produced by emitFrameOffset above.
1455       // AND SP, X9, 0b11111...0000
1456       // The logical immediates have a non-trivial encoding. The following
1457       // formula computes the encoded immediate with all ones but
1458       // NrBitsToZero zero bits as least significant bits.
1459       uint32_t andMaskEncoded = (1 << 12)                         // = N
1460                                 | ((64 - NrBitsToZero) << 6)      // immr
1461                                 | ((64 - NrBitsToZero - 1) << 0); // imms
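           // For example, with MaxAlign == 32 (NrBitsToZero == 5) this gives
           // N = 1, immr = 59, imms = 58 (encoded value 0x1efa), which the AND
           // decodes as the mask 0xffff'ffff'ffff'ffe0.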
1462 
1463       BuildMI(MBB, MBBI, DL, TII->get(AArch64::ANDXri), AArch64::SP)
1464           .addReg(scratchSPReg, RegState::Kill)
1465           .addImm(andMaskEncoded);
1466       AFI->setStackRealigned(true);
1467       if (NeedsWinCFI) {
1468         HasWinCFI = true;
1469         BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_StackAlloc))
1470             .addImm(NumBytes & andMaskEncoded)
1471             .setMIFlag(MachineInstr::FrameSetup);
1472       }
1473     }
1474   }
1475 
1476   // If we need a base pointer, set it up here. It's whatever the value of the
1477   // stack pointer is at this point. Any variable size objects will be allocated
1478   // after this, so we can still use the base pointer to reference locals.
1479   //
1480   // FIXME: Clarify FrameSetup flags here.
1481   // Note: Use emitFrameOffset() like above for FP if the FrameSetup flag is
1482   // needed.
1483   // For funclets the BP belongs to the containing function.
1484   if (!IsFunclet && RegInfo->hasBasePointer(MF)) {
1485     TII->copyPhysReg(MBB, MBBI, DL, RegInfo->getBaseRegister(), AArch64::SP,
1486                      false);
1487     if (NeedsWinCFI) {
1488       HasWinCFI = true;
1489       BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
1490           .setMIFlag(MachineInstr::FrameSetup);
1491     }
1492   }
1493 
1494   // The very last FrameSetup instruction indicates the end of prologue. Emit a
1495   // SEH opcode indicating the prologue end.
1496   if (NeedsWinCFI && HasWinCFI) {
1497     BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_PrologEnd))
1498         .setMIFlag(MachineInstr::FrameSetup);
1499   }
1500 
1501   // SEH funclets are passed the frame pointer in X1.  If the parent
1502   // function uses the base register, then the base register is used
1503   // directly, and is not retrieved from X1.
1504   if (IsFunclet && F.hasPersonalityFn()) {
1505     EHPersonality Per = classifyEHPersonality(F.getPersonalityFn());
1506     if (isAsynchronousEHPersonality(Per)) {
1507       BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::COPY), AArch64::FP)
1508           .addReg(AArch64::X1)
1509           .setMIFlag(MachineInstr::FrameSetup);
1510       MBB.addLiveIn(AArch64::X1);
1511     }
1512   }
1513 
1514   if (needsFrameMoves) {
1515     // An example of the prologue:
1516     //
1517     //     .globl __foo
1518     //     .align 2
1519     //  __foo:
1520     // Ltmp0:
1521     //     .cfi_startproc
1522     //     .cfi_personality 155, ___gxx_personality_v0
1523     // Leh_func_begin:
1524     //     .cfi_lsda 16, Lexception33
1525     //
1526     //     stp  xa,bx, [sp, -#offset]!
1527     //     ...
1528     //     stp  x28, x27, [sp, #offset-32]
1529     //     stp  fp, lr, [sp, #offset-16]
1530     //     add  fp, sp, #offset - 16
1531     //     sub  sp, sp, #1360
1532     //
1533     // The Stack:
1534     //       +-------------------------------------------+
1535     // 10000 | ........ | ........ | ........ | ........ |
1536     // 10004 | ........ | ........ | ........ | ........ |
1537     //       +-------------------------------------------+
1538     // 10008 | ........ | ........ | ........ | ........ |
1539     // 1000c | ........ | ........ | ........ | ........ |
1540     //       +===========================================+
1541     // 10010 |                X28 Register               |
1542     // 10014 |                X28 Register               |
1543     //       +-------------------------------------------+
1544     // 10018 |                X27 Register               |
1545     // 1001c |                X27 Register               |
1546     //       +===========================================+
1547     // 10020 |                Frame Pointer              |
1548     // 10024 |                Frame Pointer              |
1549     //       +-------------------------------------------+
1550     // 10028 |                Link Register              |
1551     // 1002c |                Link Register              |
1552     //       +===========================================+
1553     // 10030 | ........ | ........ | ........ | ........ |
1554     // 10034 | ........ | ........ | ........ | ........ |
1555     //       +-------------------------------------------+
1556     // 10038 | ........ | ........ | ........ | ........ |
1557     // 1003c | ........ | ........ | ........ | ........ |
1558     //       +-------------------------------------------+
1559     //
1560     //     [sp] = 10030        ::    >>initial value<<
1561     //     sp = 10020          ::  stp fp, lr, [sp, #-16]!
1562     //     fp = sp == 10020    ::  mov fp, sp
1563     //     [sp] == 10020       ::  stp x28, x27, [sp, #-16]!
1564     //     sp == 10010         ::    >>final value<<
1565     //
1566     // The frame pointer (w29) points to address 10020. If we use an offset of
1567     // '16' from 'w29', we get the CFI offsets of -8 for w30, -16 for w29, -24
1568     // for w27, and -32 for w28:
1569     //
1570     //  Ltmp1:
1571     //     .cfi_def_cfa w29, 16
1572     //  Ltmp2:
1573     //     .cfi_offset w30, -8
1574     //  Ltmp3:
1575     //     .cfi_offset w29, -16
1576     //  Ltmp4:
1577     //     .cfi_offset w27, -24
1578     //  Ltmp5:
1579     //     .cfi_offset w28, -32
1580 
1581     if (HasFP) {
1582       const int OffsetToFirstCalleeSaveFromFP =
1583           AFI->getCalleeSaveBaseToFrameRecordOffset() -
1584           AFI->getCalleeSavedStackSize();
1585       Register FramePtr = RegInfo->getFrameRegister(MF);
1586 
1587       // Define the current CFA rule to use the provided FP.
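           // The CFA is FP + (FixedObject - OffsetToFirstCalleeSaveFromFP). For
           // example, with no Win64 varargs area and the frame record spilled at
           // the bottom of the CSR area this is FP + CalleeSavedStackSize; on
           // Darwin, where the frame record is spilled first, it works out to
           // FP + 16, matching the .cfi_def_cfa w29, 16 example above.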
1588       unsigned Reg = RegInfo->getDwarfRegNum(FramePtr, true);
1589       unsigned CFIIndex = MF.addFrameInst(
1590           MCCFIInstruction::cfiDefCfa(nullptr, Reg, FixedObject - OffsetToFirstCalleeSaveFromFP));
1591       BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
1592           .addCFIIndex(CFIIndex)
1593           .setMIFlags(MachineInstr::FrameSetup);
1594     } else {
1595       unsigned CFIIndex;
1596       if (SVEStackSize) {
1597         const TargetSubtargetInfo &STI = MF.getSubtarget();
1598         const TargetRegisterInfo &TRI = *STI.getRegisterInfo();
1599         StackOffset TotalSize =
1600             SVEStackSize + StackOffset::getFixed((int64_t)MFI.getStackSize());
1601         CFIIndex = MF.addFrameInst(createDefCFAExpressionFromSP(TRI, TotalSize));
1602       } else {
1603         // Encode the stack size of the leaf function.
1604         CFIIndex = MF.addFrameInst(
1605             MCCFIInstruction::cfiDefCfaOffset(nullptr, MFI.getStackSize()));
1606       }
1607       BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
1608           .addCFIIndex(CFIIndex)
1609           .setMIFlags(MachineInstr::FrameSetup);
1610     }
1611 
1612     // Now emit the moves for whatever callee saved regs we have (including FP,
1613     // LR if those are saved).
1614     emitCalleeSavedFrameMoves(MBB, MBBI);
1615   }
1616 }
1617 
1618 static void InsertReturnAddressAuth(MachineFunction &MF,
1619                                     MachineBasicBlock &MBB) {
1620   const auto &MFI = *MF.getInfo<AArch64FunctionInfo>();
1621   if (!MFI.shouldSignReturnAddress())
1622     return;
1623   const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
1624   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
1625 
1626   MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator();
1627   DebugLoc DL;
1628   if (MBBI != MBB.end())
1629     DL = MBBI->getDebugLoc();
1630 
1631   // The AUTIASP instruction assembles to a hint instruction before v8.3a so
1632   // this instruction can safely be used for any v8a architecture.
1633   // From v8.3a onwards there are optimised authenticate LR and return
1634   // instructions, namely RETA{A,B}, that can be used instead.
1635   if (Subtarget.hasPAuth() && MBBI != MBB.end() &&
1636       MBBI->getOpcode() == AArch64::RET_ReallyLR) {
1637     BuildMI(MBB, MBBI, DL,
1638             TII->get(MFI.shouldSignWithBKey() ? AArch64::RETAB : AArch64::RETAA))
1639         .copyImplicitOps(*MBBI);
1640     MBB.erase(MBBI);
1641   } else {
1642     BuildMI(
1643         MBB, MBBI, DL,
1644         TII->get(MFI.shouldSignWithBKey() ? AArch64::AUTIBSP : AArch64::AUTIASP))
1645         .setMIFlag(MachineInstr::FrameDestroy);
1646   }
1647 }
1648 
1649 static bool isFuncletReturnInstr(const MachineInstr &MI) {
1650   switch (MI.getOpcode()) {
1651   default:
1652     return false;
1653   case AArch64::CATCHRET:
1654   case AArch64::CLEANUPRET:
1655     return true;
1656   }
1657 }
1658 
1659 void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
1660                                         MachineBasicBlock &MBB) const {
1661   MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
1662   MachineFrameInfo &MFI = MF.getFrameInfo();
1663   const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
1664   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
1665   DebugLoc DL;
1666   bool NeedsWinCFI = needsWinCFI(MF);
1667   bool HasWinCFI = false;
1668   bool IsFunclet = false;
1669   auto WinCFI = make_scope_exit([&]() { assert(HasWinCFI == MF.hasWinCFI()); });
1670 
1671   if (MBB.end() != MBBI) {
1672     DL = MBBI->getDebugLoc();
1673     IsFunclet = isFuncletReturnInstr(*MBBI);
1674   }
1675 
1676   int64_t NumBytes = IsFunclet ? getWinEHFuncletFrameSize(MF)
1677                                : MFI.getStackSize();
1678   AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
1679 
1680   // All calls are tail calls in GHC calling conv, and functions have no
1681   // prologue/epilogue.
1682   if (MF.getFunction().getCallingConv() == CallingConv::GHC)
1683     return;
1684 
1685   // How much of the stack used by incoming arguments this function is expected
1686   // to restore in this particular epilogue.
1687   int64_t ArgumentStackToRestore = getArgumentStackToRestore(MF, MBB);
1688 
1689   // The stack frame should be like below,
1690   //
1691   //      ----------------------                     ---
1692   //      |                    |                      |
1693   //      | BytesInStackArgArea|              CalleeArgStackSize
1694   //      | (NumReusableBytes) |                (of tail call)
1695   //      |                    |                     ---
1696   //      |                    |                      |
1697   //      ---------------------|        ---           |
1698   //      |                    |         |            |
1699   //      |   CalleeSavedReg   |         |            |
1700   //      | (CalleeSavedStackSize)|      |            |
1701   //      |                    |         |            |
1702   //      ---------------------|         |         NumBytes
1703   //      |                    |     StackSize  (StackAdjustUp)
1704   //      |   LocalStackSize   |         |            |
1705   //      | (covering callee   |         |            |
1706   //      |       args)        |         |            |
1707   //      |                    |         |            |
1708   //      ----------------------        ---          ---
1709   //
1710   // So NumBytes = StackSize + BytesInStackArgArea - CalleeArgStackSize
1711   //             = StackSize + ArgumentPopSize
1712   //
1713   // AArch64TargetLowering::LowerCall figures out ArgumentPopSize and keeps
1714   // it as the 2nd argument of AArch64ISD::TC_RETURN.
1715 
1716   auto Cleanup = make_scope_exit([&] { InsertReturnAddressAuth(MF, MBB); });
1717 
1718   bool IsWin64 =
1719       Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv());
1720   unsigned FixedObject = getFixedObjectSize(MF, AFI, IsWin64, IsFunclet);
1721 
1722   int64_t AfterCSRPopSize = ArgumentStackToRestore;
1723   auto PrologueSaveSize = AFI->getCalleeSavedStackSize() + FixedObject;
1724   // We cannot rely on the local stack size set in emitPrologue if the function
1725   // has funclets, as funclets have different local stack size requirements, and
1726   // the current value set in emitPrologue may be that of the containing
1727   // function.
1728   if (MF.hasEHFunclets())
1729     AFI->setLocalStackSize(NumBytes - PrologueSaveSize);
1730   if (homogeneousPrologEpilog(MF, &MBB)) {
1731     assert(!NeedsWinCFI);
1732     auto LastPopI = MBB.getFirstTerminator();
1733     if (LastPopI != MBB.begin()) {
1734       auto HomogeneousEpilog = std::prev(LastPopI);
1735       if (HomogeneousEpilog->getOpcode() == AArch64::HOM_Epilog)
1736         LastPopI = HomogeneousEpilog;
1737     }
1738 
1739     // Adjust local stack
1740     emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP,
1741                     StackOffset::getFixed(AFI->getLocalStackSize()), TII,
1742                     MachineInstr::FrameDestroy, false, NeedsWinCFI);
1743 
1744     // SP has already been adjusted while restoring callee save regs.
1745     // We have already bailed out of this path if SP needs adjusting for arguments.
1746     assert(AfterCSRPopSize == 0);
1747     return;
1748   }
1749   bool CombineSPBump = shouldCombineCSRLocalStackBumpInEpilogue(MBB, NumBytes);
1750   // Assume we can't combine the last pop with the sp restore.
1751 
1752   if (!CombineSPBump && PrologueSaveSize != 0) {
1753     MachineBasicBlock::iterator Pop = std::prev(MBB.getFirstTerminator());
1754     while (AArch64InstrInfo::isSEHInstruction(*Pop))
1755       Pop = std::prev(Pop);
1756     // Converting the last ldp to a post-index ldp is valid only if the last
1757     // ldp's offset is 0.
1758     const MachineOperand &OffsetOp = Pop->getOperand(Pop->getNumOperands() - 1);
1759     // If the offset is 0 and the AfterCSR pop is not actually trying to
1760     // allocate more stack for arguments (in space that an untimely interrupt
1761     // may clobber), convert it to a post-index ldp.
1762     if (OffsetOp.getImm() == 0 && AfterCSRPopSize >= 0)
1763       convertCalleeSaveRestoreToSPPrePostIncDec(
1764           MBB, Pop, DL, TII, PrologueSaveSize, NeedsWinCFI, &HasWinCFI, false);
1765     else {
1766       // If not, make sure to emit an add after the last ldp.
1767       // We're doing this by transferring the size to be restored from the
1768       // adjustment *before* the CSR pops to the adjustment *after* the CSR
1769       // pops.
1770       AfterCSRPopSize += PrologueSaveSize;
1771     }
1772   }
1773 
1774   // Move past the restores of the callee-saved registers.
1775   // If we plan on combining the sp bump of the local stack size and the callee
1776   // save stack size, we might need to adjust the CSR save and restore offsets.
1777   MachineBasicBlock::iterator LastPopI = MBB.getFirstTerminator();
1778   MachineBasicBlock::iterator Begin = MBB.begin();
1779   while (LastPopI != Begin) {
1780     --LastPopI;
1781     if (!LastPopI->getFlag(MachineInstr::FrameDestroy) ||
1782         IsSVECalleeSave(LastPopI)) {
1783       ++LastPopI;
1784       break;
1785     } else if (CombineSPBump)
1786       fixupCalleeSaveRestoreStackOffset(*LastPopI, AFI->getLocalStackSize(),
1787                                         NeedsWinCFI, &HasWinCFI);
1788   }
1789 
1790   if (MF.hasWinCFI()) {
1791     // If the prologue didn't contain any SEH opcodes and didn't set the
1792     // MF.hasWinCFI() flag, assume the epilogue won't either, and skip the
1793     // EpilogStart - to avoid generating CFI for functions that don't need it.
1794     // (And as we didn't generate any prologue at all, it would be asymmetrical
1795     // to the epilogue.) By the end of the function, we assert that
1796     // HasWinCFI is equal to MF.hasWinCFI(), to verify this assumption.
1797     HasWinCFI = true;
1798     BuildMI(MBB, LastPopI, DL, TII->get(AArch64::SEH_EpilogStart))
1799         .setMIFlag(MachineInstr::FrameDestroy);
1800   }
1801 
1802   if (hasFP(MF) && AFI->hasSwiftAsyncContext()) {
1803     // We need to reset FP to its untagged state on return. Bit 60 is currently
1804     // used to show the presence of an extended frame.
1805 
1806     // BIC x29, x29, #0x1000_0000_0000_0000
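         // 0x10fe is the ANDXri logical-immediate encoding of ~(1ULL << 60)
         // (N = 1, immr = 3, imms = 62).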
1807     BuildMI(MBB, MBB.getFirstTerminator(), DL, TII->get(AArch64::ANDXri),
1808             AArch64::FP)
1809         .addUse(AArch64::FP)
1810         .addImm(0x10fe)
1811         .setMIFlag(MachineInstr::FrameDestroy);
1812   }
1813 
1814   const StackOffset &SVEStackSize = getSVEStackSize(MF);
1815 
1816   // If there is a single SP update, insert it before the ret and we're done.
1817   if (CombineSPBump) {
1818     assert(!SVEStackSize && "Cannot combine SP bump with SVE");
1819     emitFrameOffset(MBB, MBB.getFirstTerminator(), DL, AArch64::SP, AArch64::SP,
1820                     StackOffset::getFixed(NumBytes + (int64_t)AfterCSRPopSize),
1821                     TII, MachineInstr::FrameDestroy, false, NeedsWinCFI,
1822                     &HasWinCFI);
1823     if (HasWinCFI)
1824       BuildMI(MBB, MBB.getFirstTerminator(), DL,
1825               TII->get(AArch64::SEH_EpilogEnd))
1826           .setMIFlag(MachineInstr::FrameDestroy);
1827     return;
1828   }
1829 
1830   NumBytes -= PrologueSaveSize;
1831   assert(NumBytes >= 0 && "Negative stack allocation size!?");
1832 
1833   // Process the SVE callee-saves to determine what space needs to be
1834   // deallocated.
1835   StackOffset DeallocateBefore = {}, DeallocateAfter = SVEStackSize;
1836   MachineBasicBlock::iterator RestoreBegin = LastPopI, RestoreEnd = LastPopI;
1837   if (int64_t CalleeSavedSize = AFI->getSVECalleeSavedStackSize()) {
1838     RestoreBegin = std::prev(RestoreEnd);
1839     while (RestoreBegin != MBB.begin() &&
1840            IsSVECalleeSave(std::prev(RestoreBegin)))
1841       --RestoreBegin;
1842 
1843     assert(IsSVECalleeSave(RestoreBegin) &&
1844            IsSVECalleeSave(std::prev(RestoreEnd)) && "Unexpected instruction");
1845 
1846     StackOffset CalleeSavedSizeAsOffset =
1847         StackOffset::getScalable(CalleeSavedSize);
1848     DeallocateBefore = SVEStackSize - CalleeSavedSizeAsOffset;
1849     DeallocateAfter = CalleeSavedSizeAsOffset;
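         // For example, with 32 scalable bytes of SVE callee-saves inside a
         // 48-scalable-byte SVE area, DeallocateBefore is the 16 bytes of SVE
         // locals and DeallocateAfter is the 32 bytes of callee-saves,
         // mirroring the two allocations made in the prologue.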
1850   }
1851 
1852   // Deallocate the SVE area.
1853   if (SVEStackSize) {
1854     if (AFI->isStackRealigned()) {
1855       if (int64_t CalleeSavedSize = AFI->getSVECalleeSavedStackSize())
1856         // Set SP to start of SVE callee-save area from which they can
1857         // be reloaded. The code below will deallocate the stack space
1858         // by moving FP -> SP.
1859         emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, AArch64::FP,
1860                         StackOffset::getScalable(-CalleeSavedSize), TII,
1861                         MachineInstr::FrameDestroy);
1862     } else {
1863       if (AFI->getSVECalleeSavedStackSize()) {
1864         // Deallocate the non-SVE locals first before we can deallocate (and
1865         // restore callee saves) from the SVE area.
1866         emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, AArch64::SP,
1867                         StackOffset::getFixed(NumBytes), TII,
1868                         MachineInstr::FrameDestroy);
1869         NumBytes = 0;
1870       }
1871 
1872       emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, AArch64::SP,
1873                       DeallocateBefore, TII, MachineInstr::FrameDestroy);
1874 
1875       emitFrameOffset(MBB, RestoreEnd, DL, AArch64::SP, AArch64::SP,
1876                       DeallocateAfter, TII, MachineInstr::FrameDestroy);
1877     }
1878   }
1879 
1880   if (!hasFP(MF)) {
1881     bool RedZone = canUseRedZone(MF);
1882     // If this was a redzone leaf function, we don't need to restore the
1883     // stack pointer (but we may need to pop stack args for fastcc).
1884     if (RedZone && AfterCSRPopSize == 0)
1885       return;
1886 
1887     bool NoCalleeSaveRestore = PrologueSaveSize == 0;
1888     int64_t StackRestoreBytes = RedZone ? 0 : NumBytes;
1889     if (NoCalleeSaveRestore)
1890       StackRestoreBytes += AfterCSRPopSize;
1891 
1892     // If we were able to combine the local stack pop with the argument pop,
1893     // then we're done.
1894     bool Done = NoCalleeSaveRestore || AfterCSRPopSize == 0;
1895 
1896     // If we're done after this, make sure to help the load store optimizer.
1897     if (Done)
1898       adaptForLdStOpt(MBB, MBB.getFirstTerminator(), LastPopI);
1899 
1900     emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP,
1901                     StackOffset::getFixed(StackRestoreBytes), TII,
1902                     MachineInstr::FrameDestroy, false, NeedsWinCFI, &HasWinCFI);
1903     if (Done) {
1904       if (HasWinCFI) {
1905         BuildMI(MBB, MBB.getFirstTerminator(), DL,
1906                 TII->get(AArch64::SEH_EpilogEnd))
1907             .setMIFlag(MachineInstr::FrameDestroy);
1908       }
1909       return;
1910     }
1911 
1912     NumBytes = 0;
1913   }
1914 
1915   // Restore the original stack pointer.
1916   // FIXME: Rather than doing the math here, we should instead just use
1917   // non-post-indexed loads for the restores if we aren't actually going to
1918   // be able to save any instructions.
1919   if (!IsFunclet && (MFI.hasVarSizedObjects() || AFI->isStackRealigned())) {
1920     emitFrameOffset(
1921         MBB, LastPopI, DL, AArch64::SP, AArch64::FP,
1922         StackOffset::getFixed(-AFI->getCalleeSaveBaseToFrameRecordOffset()),
1923         TII, MachineInstr::FrameDestroy, false, NeedsWinCFI);
1924   } else if (NumBytes)
1925     emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP,
1926                     StackOffset::getFixed(NumBytes), TII,
1927                     MachineInstr::FrameDestroy, false, NeedsWinCFI);
1928 
1929   // This must be placed after the callee-save restore code because that code
1930   // assumes the SP is at the same location as it was after the callee-save spill
1931   // code in the prologue.
1932   if (AfterCSRPopSize) {
1933     assert(AfterCSRPopSize > 0 && "attempting to reallocate arg stack that an "
1934                                   "interrupt may have clobbered");
1935     // Find an insertion point for the first ldp so that it goes before the
1936     // shadow call stack epilog instruction. This ensures that the restore of
1937     // lr from x18 is placed after the restore from sp.
1938     auto FirstSPPopI = MBB.getFirstTerminator();
1939     while (FirstSPPopI != Begin) {
1940       auto Prev = std::prev(FirstSPPopI);
1941       if (Prev->getOpcode() != AArch64::LDRXpre ||
1942           Prev->getOperand(0).getReg() == AArch64::SP)
1943         break;
1944       FirstSPPopI = Prev;
1945     }
1946 
1947     adaptForLdStOpt(MBB, FirstSPPopI, LastPopI);
1948 
1949     emitFrameOffset(MBB, FirstSPPopI, DL, AArch64::SP, AArch64::SP,
1950                     StackOffset::getFixed(AfterCSRPopSize), TII,
1951                     MachineInstr::FrameDestroy, false, NeedsWinCFI, &HasWinCFI);
1952   }
1953   if (HasWinCFI)
1954     BuildMI(MBB, MBB.getFirstTerminator(), DL, TII->get(AArch64::SEH_EpilogEnd))
1955         .setMIFlag(MachineInstr::FrameDestroy);
1956 }
1957 
1958 /// getFrameIndexReference - Provide a base+offset reference to an FI slot for
1959 /// debug info.  It's the same as what we use for resolving the code-gen
1960 /// references for now.  FIXME: This can go wrong when references are
1961 /// SP-relative and simple call frames aren't used.
1962 StackOffset
1963 AArch64FrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI,
1964                                              Register &FrameReg) const {
1965   return resolveFrameIndexReference(
1966       MF, FI, FrameReg,
1967       /*PreferFP=*/
1968       MF.getFunction().hasFnAttribute(Attribute::SanitizeHWAddress),
1969       /*ForSimm=*/false);
1970 }
1971 
1972 StackOffset
1973 AArch64FrameLowering::getNonLocalFrameIndexReference(const MachineFunction &MF,
1974                                                      int FI) const {
1975   return StackOffset::getFixed(getSEHFrameIndexOffset(MF, FI));
1976 }
1977 
1978 static StackOffset getFPOffset(const MachineFunction &MF,
1979                                int64_t ObjectOffset) {
1980   const auto *AFI = MF.getInfo<AArch64FunctionInfo>();
1981   const auto &Subtarget = MF.getSubtarget<AArch64Subtarget>();
1982   bool IsWin64 =
1983       Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv());
1984   unsigned FixedObject =
1985       getFixedObjectSize(MF, AFI, IsWin64, /*IsFunclet=*/false);
1986   int64_t CalleeSaveSize = AFI->getCalleeSavedStackSize(MF.getFrameInfo());
1987   int64_t FPAdjust =
1988       CalleeSaveSize - AFI->getCalleeSaveBaseToFrameRecordOffset();
1989   return StackOffset::getFixed(ObjectOffset + FixedObject + FPAdjust);
1990 }
1991 
1992 static StackOffset getStackOffset(const MachineFunction &MF,
1993                                   int64_t ObjectOffset) {
1994   const auto &MFI = MF.getFrameInfo();
1995   return StackOffset::getFixed(ObjectOffset + (int64_t)MFI.getStackSize());
1996 }
1997 
1998 // TODO: This function currently does not work for scalable vectors.
1999 int AArch64FrameLowering::getSEHFrameIndexOffset(const MachineFunction &MF,
2000                                                  int FI) const {
2001   const auto *RegInfo = static_cast<const AArch64RegisterInfo *>(
2002       MF.getSubtarget().getRegisterInfo());
2003   int ObjectOffset = MF.getFrameInfo().getObjectOffset(FI);
2004   return RegInfo->getLocalAddressRegister(MF) == AArch64::FP
2005              ? getFPOffset(MF, ObjectOffset).getFixed()
2006              : getStackOffset(MF, ObjectOffset).getFixed();
2007 }
2008 
2009 StackOffset AArch64FrameLowering::resolveFrameIndexReference(
2010     const MachineFunction &MF, int FI, Register &FrameReg, bool PreferFP,
2011     bool ForSimm) const {
2012   const auto &MFI = MF.getFrameInfo();
2013   int64_t ObjectOffset = MFI.getObjectOffset(FI);
2014   bool isFixed = MFI.isFixedObjectIndex(FI);
2015   bool isSVE = MFI.getStackID(FI) == TargetStackID::ScalableVector;
2016   return resolveFrameOffsetReference(MF, ObjectOffset, isFixed, isSVE, FrameReg,
2017                                      PreferFP, ForSimm);
2018 }
2019 
2020 StackOffset AArch64FrameLowering::resolveFrameOffsetReference(
2021     const MachineFunction &MF, int64_t ObjectOffset, bool isFixed, bool isSVE,
2022     Register &FrameReg, bool PreferFP, bool ForSimm) const {
2023   const auto &MFI = MF.getFrameInfo();
2024   const auto *RegInfo = static_cast<const AArch64RegisterInfo *>(
2025       MF.getSubtarget().getRegisterInfo());
2026   const auto *AFI = MF.getInfo<AArch64FunctionInfo>();
2027   const auto &Subtarget = MF.getSubtarget<AArch64Subtarget>();
2028 
2029   int64_t FPOffset = getFPOffset(MF, ObjectOffset).getFixed();
2030   int64_t Offset = getStackOffset(MF, ObjectOffset).getFixed();
2031   bool isCSR =
2032       !isFixed && ObjectOffset >= -((int)AFI->getCalleeSavedStackSize(MFI));
2033 
2034   const StackOffset &SVEStackSize = getSVEStackSize(MF);
2035 
2036   // Use frame pointer to reference fixed objects. Use it for locals if
2037   // there are VLAs or a dynamically realigned SP (and thus the SP isn't
2038   // reliable as a base). Make sure useFPForScavengingIndex() does the
2039   // right thing for the emergency spill slot.
2040   bool UseFP = false;
2041   if (AFI->hasStackFrame() && !isSVE) {
2042     // We shouldn't prefer using the FP when there is an SVE area
2043     // in between the FP and the non-SVE locals/spills.
2044     PreferFP &= !SVEStackSize;
2045 
2046     // Note: Keeping the following as multiple 'if' statements rather than
2047     // merging to a single expression for readability.
2048     //
2049     // Argument access should always use the FP.
2050     if (isFixed) {
2051       UseFP = hasFP(MF);
2052     } else if (isCSR && RegInfo->hasStackRealignment(MF)) {
2053       // References to the CSR area must use FP if we're re-aligning the stack
2054       // since the dynamically-sized alignment padding is between the SP/BP and
2055       // the CSR area.
2056       assert(hasFP(MF) && "Re-aligned stack must have frame pointer");
2057       UseFP = true;
2058     } else if (hasFP(MF) && !RegInfo->hasStackRealignment(MF)) {
2059       // If the FPOffset is negative and we're producing a signed immediate, we
2060       // have to keep in mind that the available offset range for negative
2061       // offsets is smaller than for positive ones. If an offset is available
2062       // via the FP and the SP, use whichever is closest.
2063       bool FPOffsetFits = !ForSimm || FPOffset >= -256;
2064       PreferFP |= Offset > -FPOffset;
2065 
2066       if (MFI.hasVarSizedObjects()) {
2067         // If we have variable sized objects, we can use either FP or BP, as the
2068         // SP offset is unknown. We can use the base pointer if we have one and
2069         // FP is not preferred. If not, we're stuck with using FP.
2070         bool CanUseBP = RegInfo->hasBasePointer(MF);
2071         if (FPOffsetFits && CanUseBP) // Both are ok. Pick the best.
2072           UseFP = PreferFP;
2073         else if (!CanUseBP) // Can't use BP. Forced to use FP.
2074           UseFP = true;
2075         // else we can use BP and FP, but the offset from FP won't fit.
2076         // That will make us scavenge registers which we can probably avoid by
2077         // using BP. If it won't fit for BP either, we'll scavenge anyway.
2078       } else if (FPOffset >= 0) {
2079         // Use SP or FP, whichever gives us the best chance of the offset
2080         // being in range for direct access. If the FPOffset is positive,
2081         // that'll always be best, as the SP will be even further away.
2082         UseFP = true;
2083       } else if (MF.hasEHFunclets() && !RegInfo->hasBasePointer(MF)) {
2084         // Funclets access the locals contained in the parent's stack frame
2085         // via the frame pointer, so we have to use the FP in the parent
2086         // function.
2087         (void) Subtarget;
2088         assert(
2089             Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv()) &&
2090             "Funclets should only be present on Win64");
2091         UseFP = true;
2092       } else {
2093         // We have the choice between FP and (SP or BP).
2094         if (FPOffsetFits && PreferFP) // If FP is the best fit, use it.
2095           UseFP = true;
2096       }
2097     }
2098   }
2099 
2100   assert(
2101       ((isFixed || isCSR) || !RegInfo->hasStackRealignment(MF) || !UseFP) &&
2102       "In the presence of dynamic stack pointer realignment, "
2103       "non-argument/CSR objects cannot be accessed through the frame pointer");
2104 
2105   if (isSVE) {
2106     StackOffset FPOffset =
2107         StackOffset::get(-AFI->getCalleeSaveBaseToFrameRecordOffset(), ObjectOffset);
2108     StackOffset SPOffset =
2109         SVEStackSize +
2110         StackOffset::get(MFI.getStackSize() - AFI->getCalleeSavedStackSize(),
2111                          ObjectOffset);
2112     // Always use the FP for SVE spills if available and beneficial.
2113     if (hasFP(MF) && (SPOffset.getFixed() ||
2114                       FPOffset.getScalable() < SPOffset.getScalable() ||
2115                       RegInfo->hasStackRealignment(MF))) {
2116       FrameReg = RegInfo->getFrameRegister(MF);
2117       return FPOffset;
2118     }
2119 
2120     FrameReg = RegInfo->hasBasePointer(MF) ? RegInfo->getBaseRegister()
2121                                            : (unsigned)AArch64::SP;
2122     return SPOffset;
2123   }
2124 
2125   StackOffset ScalableOffset = {};
2126   if (UseFP && !(isFixed || isCSR))
2127     ScalableOffset = -SVEStackSize;
2128   if (!UseFP && (isFixed || isCSR))
2129     ScalableOffset = SVEStackSize;
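       // The FP sits above the SVE area while the SP/BP sit below it, so an
       // FP-relative access to a local must step down across the SVE area and
       // an SP/BP-relative access to a fixed or CSR object must step up
       // across it.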
2130 
2131   if (UseFP) {
2132     FrameReg = RegInfo->getFrameRegister(MF);
2133     return StackOffset::getFixed(FPOffset) + ScalableOffset;
2134   }
2135 
2136   // Use the base pointer if we have one.
2137   if (RegInfo->hasBasePointer(MF))
2138     FrameReg = RegInfo->getBaseRegister();
2139   else {
2140     assert(!MFI.hasVarSizedObjects() &&
2141            "Can't use SP when we have var sized objects.");
2142     FrameReg = AArch64::SP;
2143     // If we're using the red zone for this function, the SP won't actually
2144     // be adjusted, so the offsets will be negative. They're also all
2145     // within range of the signed 9-bit immediate instructions.
2146     if (canUseRedZone(MF))
2147       Offset -= AFI->getLocalStackSize();
2148   }
2149 
2150   return StackOffset::getFixed(Offset) + ScalableOffset;
2151 }
2152 
2153 static unsigned getPrologueDeath(MachineFunction &MF, unsigned Reg) {
2154   // Do not set a kill flag on values that are also marked as live-in. This
2155   // happens with the @llvm.returnaddress intrinsic and with arguments passed in
2156   // callee saved registers.
2157   // Omitting the kill flags is conservatively correct even if the live-in
2158   // is not used after all.
2159   bool IsLiveIn = MF.getRegInfo().isLiveIn(Reg);
2160   return getKillRegState(!IsLiveIn);
2161 }
2162 
2163 static bool produceCompactUnwindFrame(MachineFunction &MF) {
2164   const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
2165   AttributeList Attrs = MF.getFunction().getAttributes();
2166   return Subtarget.isTargetMachO() &&
2167          !(Subtarget.getTargetLowering()->supportSwiftError() &&
2168            Attrs.hasAttrSomewhere(Attribute::SwiftError)) &&
2169          MF.getFunction().getCallingConv() != CallingConv::SwiftTail;
2170 }
2171 
2172 static bool invalidateWindowsRegisterPairing(unsigned Reg1, unsigned Reg2,
2173                                              bool NeedsWinCFI, bool IsFirst) {
2174   // If we are generating register pairs for a Windows function that requires
2175   // EH support, then pair consecutive registers only.  There are no unwind
2176   // opcodes for saves/restores of non-consecutive register pairs.
2177   // The unwind opcodes are save_regp, save_regp_x, save_fregp, save_fregp_x,
2178   // save_lrpair.
2179   // https://docs.microsoft.com/en-us/cpp/build/arm64-exception-handling
2180 
2181   if (Reg2 == AArch64::FP)
2182     return true;
2183   if (!NeedsWinCFI)
2184     return false;
2185   if (Reg2 == Reg1 + 1)
2186     return false;
2187   // If pairing a GPR with LR, the pair can be described by the save_lrpair
2188   // opcode. If this is the first register pair, it would end up with a
2189   // predecrement, but there's no save_lrpair_x opcode, so we can only do this
2190   // if LR is paired with a register other than the first one in the list.
2191   // The save_lrpair opcode requires the first register to be an odd-numbered one.
2192   if (Reg1 >= AArch64::X19 && Reg1 <= AArch64::X27 &&
2193       (Reg1 - AArch64::X19) % 2 == 0 && Reg2 == AArch64::LR && !IsFirst)
2194     return false;
2195   return true;
2196 }
2197 
2198 /// Returns true if Reg1 and Reg2 cannot be paired using a ldp/stp instruction.
2199 /// WindowsCFI requires that only consecutive registers can be paired.
2200 /// LR and FP need to be allocated together when the frame needs to save
2201 /// the frame-record. This means any other register pairing with LR is invalid.
2202 static bool invalidateRegisterPairing(unsigned Reg1, unsigned Reg2,
2203                                       bool UsesWinAAPCS, bool NeedsWinCFI,
2204                                       bool NeedsFrameRecord, bool IsFirst) {
2205   if (UsesWinAAPCS)
2206     return invalidateWindowsRegisterPairing(Reg1, Reg2, NeedsWinCFI, IsFirst);
2207 
2208   // If we need to store the frame record, don't pair any register
2209   // with LR other than FP.
2210   if (NeedsFrameRecord)
2211     return Reg2 == AArch64::LR;
2212 
2213   return false;
2214 }
2215 
2216 namespace {
2217 
2218 struct RegPairInfo {
2219   unsigned Reg1 = AArch64::NoRegister;
2220   unsigned Reg2 = AArch64::NoRegister;
2221   int FrameIdx;
2222   int Offset;
2223   enum RegType { GPR, FPR64, FPR128, PPR, ZPR } Type;
2224 
2225   RegPairInfo() = default;
2226 
2227   bool isPaired() const { return Reg2 != AArch64::NoRegister; }
2228 
2229   unsigned getScale() const {
2230     switch (Type) {
2231     case PPR:
2232       return 2;
2233     case GPR:
2234     case FPR64:
2235       return 8;
2236     case ZPR:
2237     case FPR128:
2238       return 16;
2239     }
2240     llvm_unreachable("Unsupported type");
2241   }
2242 
2243   bool isScalable() const { return Type == PPR || Type == ZPR; }
2244 };
2245 
2246 } // end anonymous namespace
2247 
2248 static void computeCalleeSaveRegisterPairs(
2249     MachineFunction &MF, ArrayRef<CalleeSavedInfo> CSI,
2250     const TargetRegisterInfo *TRI, SmallVectorImpl<RegPairInfo> &RegPairs,
2251     bool &NeedShadowCallStackProlog, bool NeedsFrameRecord) {
2252 
2253   if (CSI.empty())
2254     return;
2255 
2256   bool IsWindows = isTargetWindows(MF);
2257   bool NeedsWinCFI = needsWinCFI(MF);
2258   AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
2259   MachineFrameInfo &MFI = MF.getFrameInfo();
2260   CallingConv::ID CC = MF.getFunction().getCallingConv();
2261   unsigned Count = CSI.size();
2262   (void)CC;
2263   // MachO's compact unwind format relies on all registers being stored in
2264   // pairs.
2265   assert((!produceCompactUnwindFrame(MF) ||
2266           CC == CallingConv::PreserveMost ||
2267           (Count & 1) == 0) &&
2268          "Odd number of callee-saved regs to spill!");
2269   int ByteOffset = AFI->getCalleeSavedStackSize();
2270   int StackFillDir = -1;
2271   int RegInc = 1;
2272   unsigned FirstReg = 0;
2273   if (NeedsWinCFI) {
2274     // For WinCFI, fill the stack from the bottom up.
2275     ByteOffset = 0;
2276     StackFillDir = 1;
2277     // As the CSI array is reversed to match PrologEpilogInserter, iterate
2278     // backwards, to pair up registers starting from lower numbered registers.
2279     RegInc = -1;
2280     FirstReg = Count - 1;
2281   }
2282   int ScalableByteOffset = AFI->getSVECalleeSavedStackSize();
2283   bool NeedGapToAlignStack = AFI->hasCalleeSaveStackFreeSpace();
2284 
2285   // When iterating backwards, the loop condition relies on unsigned wraparound.
2286   for (unsigned i = FirstReg; i < Count; i += RegInc) {
2287     RegPairInfo RPI;
2288     RPI.Reg1 = CSI[i].getReg();
2289 
2290     if (AArch64::GPR64RegClass.contains(RPI.Reg1))
2291       RPI.Type = RegPairInfo::GPR;
2292     else if (AArch64::FPR64RegClass.contains(RPI.Reg1))
2293       RPI.Type = RegPairInfo::FPR64;
2294     else if (AArch64::FPR128RegClass.contains(RPI.Reg1))
2295       RPI.Type = RegPairInfo::FPR128;
2296     else if (AArch64::ZPRRegClass.contains(RPI.Reg1))
2297       RPI.Type = RegPairInfo::ZPR;
2298     else if (AArch64::PPRRegClass.contains(RPI.Reg1))
2299       RPI.Type = RegPairInfo::PPR;
2300     else
2301       llvm_unreachable("Unsupported register class.");
2302 
2303     // Add the next reg to the pair if it is in the same register class.
2304     if (unsigned(i + RegInc) < Count) {
2305       unsigned NextReg = CSI[i + RegInc].getReg();
2306       bool IsFirst = i == FirstReg;
2307       switch (RPI.Type) {
2308       case RegPairInfo::GPR:
2309         if (AArch64::GPR64RegClass.contains(NextReg) &&
2310             !invalidateRegisterPairing(RPI.Reg1, NextReg, IsWindows,
2311                                        NeedsWinCFI, NeedsFrameRecord, IsFirst))
2312           RPI.Reg2 = NextReg;
2313         break;
2314       case RegPairInfo::FPR64:
2315         if (AArch64::FPR64RegClass.contains(NextReg) &&
2316             !invalidateWindowsRegisterPairing(RPI.Reg1, NextReg, NeedsWinCFI,
2317                                               IsFirst))
2318           RPI.Reg2 = NextReg;
2319         break;
2320       case RegPairInfo::FPR128:
2321         if (AArch64::FPR128RegClass.contains(NextReg))
2322           RPI.Reg2 = NextReg;
2323         break;
2324       case RegPairInfo::PPR:
2325       case RegPairInfo::ZPR:
2326         break;
2327       }
2328     }
2329 
2330     // If either of the registers to be saved is the lr register, it means that
2331     // we also need to save lr in the shadow call stack.
2332     if ((RPI.Reg1 == AArch64::LR || RPI.Reg2 == AArch64::LR) &&
2333         MF.getFunction().hasFnAttribute(Attribute::ShadowCallStack)) {
2334       if (!MF.getSubtarget<AArch64Subtarget>().isXRegisterReserved(18))
2335         report_fatal_error("Must reserve x18 to use shadow call stack");
2336       NeedShadowCallStackProlog = true;
2337     }
2338 
2339     // GPRs and FPRs are saved in pairs of 64-bit regs. We expect the CSI
2340     // list to come in sorted by frame index so that we can issue the store
2341     // pair instructions directly. Assert if we see anything otherwise.
2342     //
2343     // The order of the registers in the list is controlled by
2344     // getCalleeSavedRegs(), so they will always be in-order, as well.
2345     assert((!RPI.isPaired() ||
2346             (CSI[i].getFrameIdx() + RegInc == CSI[i + RegInc].getFrameIdx())) &&
2347            "Out of order callee saved regs!");
2348 
2349     assert((!RPI.isPaired() || !NeedsFrameRecord || RPI.Reg2 != AArch64::FP ||
2350             RPI.Reg1 == AArch64::LR) &&
2351            "FrameRecord must be allocated together with LR");
2352 
2353     // Windows AAPCS has FP and LR reversed.
2354     assert((!RPI.isPaired() || !NeedsFrameRecord || RPI.Reg1 != AArch64::FP ||
2355             RPI.Reg2 == AArch64::LR) &&
2356            "FrameRecord must be allocated together with LR");
2357 
2358     // MachO's compact unwind format relies on all registers being stored in
2359     // adjacent register pairs.
2360     assert((!produceCompactUnwindFrame(MF) ||
2361             CC == CallingConv::PreserveMost ||
2362             (RPI.isPaired() &&
2363              ((RPI.Reg1 == AArch64::LR && RPI.Reg2 == AArch64::FP) ||
2364               RPI.Reg1 + 1 == RPI.Reg2))) &&
2365            "Callee-save registers not saved as adjacent register pair!");
2366 
2367     RPI.FrameIdx = CSI[i].getFrameIdx();
2368     if (NeedsWinCFI &&
2369         RPI.isPaired()) // RPI.FrameIdx must be the lower index of the pair
2370       RPI.FrameIdx = CSI[i + RegInc].getFrameIdx();
2371 
2372     int Scale = RPI.getScale();
2373 
2374     int OffsetPre = RPI.isScalable() ? ScalableByteOffset : ByteOffset;
2375     assert(OffsetPre % Scale == 0);
2376 
2377     if (RPI.isScalable())
2378       ScalableByteOffset += StackFillDir * Scale;
2379     else
2380       ByteOffset += StackFillDir * (RPI.isPaired() ? 2 * Scale : Scale);
2381 
2382     // Swift's async context is directly before FP, so allocate an extra
2383     // 8 bytes for it.
2384     if (NeedsFrameRecord && AFI->hasSwiftAsyncContext() &&
2385         RPI.Reg2 == AArch64::FP)
2386       ByteOffset += StackFillDir * 8;
2387 
2388     assert(!(RPI.isScalable() && RPI.isPaired()) &&
2389            "Paired spill/fill instructions don't exist for SVE vectors");
2390 
2391     // Round up size of non-pair to pair size if we need to pad the
2392     // callee-save area to ensure 16-byte alignment.
2393     if (NeedGapToAlignStack && !NeedsWinCFI &&
2394         !RPI.isScalable() && RPI.Type != RegPairInfo::FPR128 &&
2395         !RPI.isPaired() && ByteOffset % 16 != 0) {
2396       ByteOffset += 8 * StackFillDir;
2397       assert(MFI.getObjectAlign(RPI.FrameIdx) <= Align(16));
2398       // A stack frame with a gap looks like this, bottom up:
2399       // d9, d8, x21, gap, x20, x19.
2400       // Set extra alignment on the x21 object to create the gap above it.
2401       MFI.setObjectAlignment(RPI.FrameIdx, Align(16));
2402       NeedGapToAlignStack = false;
2403     }
2404 
2405     int OffsetPost = RPI.isScalable() ? ScalableByteOffset : ByteOffset;
2406     assert(OffsetPost % Scale == 0);
2407     // If filling top down (default), we want the offset after incrementing it.
2408     // If filling bottom up (WinCFI) we need the original offset.
2409     int Offset = NeedsWinCFI ? OffsetPre : OffsetPost;
2410 
2411     // The FP, LR pair goes 8 bytes into our expanded 24-byte slot so that the
2412     // Swift context can directly precede FP.
2413     if (NeedsFrameRecord && AFI->hasSwiftAsyncContext() &&
2414         RPI.Reg2 == AArch64::FP)
2415       Offset += 8;
2416     RPI.Offset = Offset / Scale;
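         // The LDP/STP immediate is scaled by the access size, e.g. a GPR pair
         // at byte offset 16 is encoded as an immediate of +2.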
2417 
2418     assert(((!RPI.isScalable() && RPI.Offset >= -64 && RPI.Offset <= 63) ||
2419             (RPI.isScalable() && RPI.Offset >= -256 && RPI.Offset <= 255)) &&
2420            "Offset out of bounds for LDP/STP immediate");
2421 
2422     // Save the offset to frame record so that the FP register can point to the
2423     // innermost frame record (spilled FP and LR registers).
2424     if (NeedsFrameRecord && ((!IsWindows && RPI.Reg1 == AArch64::LR &&
2425                               RPI.Reg2 == AArch64::FP) ||
2426                              (IsWindows && RPI.Reg1 == AArch64::FP &&
2427                               RPI.Reg2 == AArch64::LR)))
2428       AFI->setCalleeSaveBaseToFrameRecordOffset(Offset);
2429 
2430     RegPairs.push_back(RPI);
2431     if (RPI.isPaired())
2432       i += RegInc;
2433   }
2434   if (NeedsWinCFI) {
2435     // If we need an alignment gap in the stack, align the topmost stack
2436     // object. A stack frame with a gap looks like this, bottom up:
2437     // x19, d8, d9, gap.
2438     // Set extra alignment on the topmost stack object (the first element in
2439     // CSI, which goes top down), to create the gap above it.
2440     if (AFI->hasCalleeSaveStackFreeSpace())
2441       MFI.setObjectAlignment(CSI[0].getFrameIdx(), Align(16));
2442     // We iterated bottom up over the registers; flip RegPairs back to top
2443     // down order.
2444     std::reverse(RegPairs.begin(), RegPairs.end());
2445   }
2446 }
2447 
2448 bool AArch64FrameLowering::spillCalleeSavedRegisters(
2449     MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
2450     ArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const {
2451   MachineFunction &MF = *MBB.getParent();
2452   const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
2453   bool NeedsWinCFI = needsWinCFI(MF);
2454   DebugLoc DL;
2455   SmallVector<RegPairInfo, 8> RegPairs;
2456 
2457   bool NeedShadowCallStackProlog = false;
2458   computeCalleeSaveRegisterPairs(MF, CSI, TRI, RegPairs,
2459                                  NeedShadowCallStackProlog, hasFP(MF));
2460   const MachineRegisterInfo &MRI = MF.getRegInfo();
2461 
2462   if (NeedShadowCallStackProlog) {
2463     // Shadow call stack prolog: str x30, [x18], #8
2464     BuildMI(MBB, MI, DL, TII.get(AArch64::STRXpost))
2465         .addReg(AArch64::X18, RegState::Define)
2466         .addReg(AArch64::LR)
2467         .addReg(AArch64::X18)
2468         .addImm(8)
2469         .setMIFlag(MachineInstr::FrameSetup);
2470 
2471     if (NeedsWinCFI)
2472       BuildMI(MBB, MI, DL, TII.get(AArch64::SEH_Nop))
2473           .setMIFlag(MachineInstr::FrameSetup);
2474 
2475     if (!MF.getFunction().hasFnAttribute(Attribute::NoUnwind)) {
2476       // Emit a CFI instruction that causes 8 to be subtracted from the value of
2477       // x18 when unwinding past this frame.
2478       static const char CFIInst[] = {
2479           dwarf::DW_CFA_val_expression,
2480           18, // register
2481           2,  // length
2482           static_cast<char>(unsigned(dwarf::DW_OP_breg18)),
2483           static_cast<char>(-8) & 0x7f, // addend (sleb128)
2484       };
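           // The escape bytes above read as "x18 = x18 + (-8)": DW_OP_breg18
           // followed by the addend -8 encoded as a single SLEB128 byte
           // (0x78).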
2485       unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createEscape(
2486           nullptr, StringRef(CFIInst, sizeof(CFIInst))));
2487       BuildMI(MBB, MI, DL, TII.get(AArch64::CFI_INSTRUCTION))
2488           .addCFIIndex(CFIIndex)
2489           .setMIFlag(MachineInstr::FrameSetup);
2490     }
2491 
2492     // This instruction also makes x18 live-in to the entry block.
2493     MBB.addLiveIn(AArch64::X18);
2494   }
2495 
2496   if (homogeneousPrologEpilog(MF)) {
2497     auto MIB = BuildMI(MBB, MI, DL, TII.get(AArch64::HOM_Prolog))
2498                    .setMIFlag(MachineInstr::FrameSetup);
2499 
2500     for (auto &RPI : RegPairs) {
2501       MIB.addReg(RPI.Reg1);
2502       MIB.addReg(RPI.Reg2);
2503 
2504       // Update register live in.
2505       if (!MRI.isReserved(RPI.Reg1))
2506         MBB.addLiveIn(RPI.Reg1);
2507       if (!MRI.isReserved(RPI.Reg2))
2508         MBB.addLiveIn(RPI.Reg2);
2509     }
2510     return true;
2511   }
2512   for (auto RPII = RegPairs.rbegin(), RPIE = RegPairs.rend(); RPII != RPIE;
2513        ++RPII) {
2514     RegPairInfo RPI = *RPII;
2515     unsigned Reg1 = RPI.Reg1;
2516     unsigned Reg2 = RPI.Reg2;
2517     unsigned StrOpc;
2518 
2519     // Issue sequence of spills for cs regs.  The first spill may be converted
2520     // to a pre-decrement store later by emitPrologue if the callee-save stack
2521     // area allocation can't be combined with the local stack area allocation.
2522     // For example:
2523     //    stp     x22, x21, [sp, #0]     // addImm(+0)
2524     //    stp     x20, x19, [sp, #16]    // addImm(+2)
2525     //    stp     fp, lr, [sp, #32]      // addImm(+4)
2526     // Rationale: This sequence saves uop updates compared to a sequence of
2527     // pre-increment spills like stp xi,xj,[sp,#-16]!
2528     // Note: Similar rationale and sequence for restores in epilog.
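         // For illustration only: if emitPrologue folds the allocation into
         // the first spill, the sequence above would instead begin with
         // something like
         //    stp     x22, x21, [sp, #-48]!  // pre-decrement by the CSR area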
2529     unsigned Size;
2530     Align Alignment;
2531     switch (RPI.Type) {
2532     case RegPairInfo::GPR:
2533        StrOpc = RPI.isPaired() ? AArch64::STPXi : AArch64::STRXui;
2534        Size = 8;
2535        Alignment = Align(8);
2536        break;
2537     case RegPairInfo::FPR64:
2538        StrOpc = RPI.isPaired() ? AArch64::STPDi : AArch64::STRDui;
2539        Size = 8;
2540        Alignment = Align(8);
2541        break;
2542     case RegPairInfo::FPR128:
2543        StrOpc = RPI.isPaired() ? AArch64::STPQi : AArch64::STRQui;
2544        Size = 16;
2545        Alignment = Align(16);
2546        break;
2547     case RegPairInfo::ZPR:
2548        StrOpc = AArch64::STR_ZXI;
2549        Size = 16;
2550        Alignment = Align(16);
2551        break;
2552     case RegPairInfo::PPR:
2553        StrOpc = AArch64::STR_PXI;
2554        Size = 2;
2555        Alignment = Align(2);
2556        break;
2557     }
2558     LLVM_DEBUG(dbgs() << "CSR spill: (" << printReg(Reg1, TRI);
2559                if (RPI.isPaired()) dbgs() << ", " << printReg(Reg2, TRI);
2560                dbgs() << ") -> fi#(" << RPI.FrameIdx;
2561                if (RPI.isPaired()) dbgs() << ", " << RPI.FrameIdx + 1;
2562                dbgs() << ")\n");
2563 
2564     assert((!NeedsWinCFI || !(Reg1 == AArch64::LR && Reg2 == AArch64::FP)) &&
2565            "Windows unwinding requires a consecutive (FP,LR) pair");
2566     // Windows unwind codes require consecutive registers if registers are
2567     // paired.  Make the switch here, so that the code below will save (x,x+1)
2568     // and not (x+1,x).
2569     unsigned FrameIdxReg1 = RPI.FrameIdx;
2570     unsigned FrameIdxReg2 = RPI.FrameIdx + 1;
2571     if (NeedsWinCFI && RPI.isPaired()) {
2572       std::swap(Reg1, Reg2);
2573       std::swap(FrameIdxReg1, FrameIdxReg2);
2574     }
2575     MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(StrOpc));
2576     if (!MRI.isReserved(Reg1))
2577       MBB.addLiveIn(Reg1);
2578     if (RPI.isPaired()) {
2579       if (!MRI.isReserved(Reg2))
2580         MBB.addLiveIn(Reg2);
2581       MIB.addReg(Reg2, getPrologueDeath(MF, Reg2));
2582       MIB.addMemOperand(MF.getMachineMemOperand(
2583           MachinePointerInfo::getFixedStack(MF, FrameIdxReg2),
2584           MachineMemOperand::MOStore, Size, Alignment));
2585     }
2586     MIB.addReg(Reg1, getPrologueDeath(MF, Reg1))
2587         .addReg(AArch64::SP)
2588         .addImm(RPI.Offset) // [sp, #offset*scale],
2589                             // where factor*scale is implicit
2590         .setMIFlag(MachineInstr::FrameSetup);
2591     MIB.addMemOperand(MF.getMachineMemOperand(
2592         MachinePointerInfo::getFixedStack(MF, FrameIdxReg1),
2593         MachineMemOperand::MOStore, Size, Alignment));
2594     if (NeedsWinCFI)
2595       InsertSEH(MIB, TII, MachineInstr::FrameSetup);
2596 
2597     // Update the StackIDs of the SVE stack slots.
2598     MachineFrameInfo &MFI = MF.getFrameInfo();
2599     if (RPI.Type == RegPairInfo::ZPR || RPI.Type == RegPairInfo::PPR)
2600       MFI.setStackID(RPI.FrameIdx, TargetStackID::ScalableVector);
2601 
2602   }
2603   return true;
2604 }
2605 
2606 bool AArch64FrameLowering::restoreCalleeSavedRegisters(
2607     MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
2608     MutableArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const {
2609   MachineFunction &MF = *MBB.getParent();
2610   const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
2611   DebugLoc DL;
2612   SmallVector<RegPairInfo, 8> RegPairs;
2613   bool NeedsWinCFI = needsWinCFI(MF);
2614 
2615   if (MI != MBB.end())
2616     DL = MI->getDebugLoc();
2617 
2618   bool NeedShadowCallStackProlog = false;
2619   computeCalleeSaveRegisterPairs(MF, CSI, TRI, RegPairs,
2620                                  NeedShadowCallStackProlog, hasFP(MF));
2621 
2622   auto EmitMI = [&](const RegPairInfo &RPI) {
2623     unsigned Reg1 = RPI.Reg1;
2624     unsigned Reg2 = RPI.Reg2;
2625 
2626     // Issue sequence of restores for cs regs. The last restore may be converted
2627     // to a post-increment load later by emitEpilogue if the callee-save stack
2628     // area allocation can't be combined with the local stack area allocation.
2629     // For example:
2630     //    ldp     fp, lr, [sp, #32]       // addImm(+4)
2631     //    ldp     x20, x19, [sp, #16]     // addImm(+2)
2632     //    ldp     x22, x21, [sp, #0]      // addImm(+0)
2633     // Note: see comment in spillCalleeSavedRegisters()
2634     unsigned LdrOpc;
2635     unsigned Size;
2636     Align Alignment;
2637     switch (RPI.Type) {
2638     case RegPairInfo::GPR:
2639        LdrOpc = RPI.isPaired() ? AArch64::LDPXi : AArch64::LDRXui;
2640        Size = 8;
2641        Alignment = Align(8);
2642        break;
2643     case RegPairInfo::FPR64:
2644        LdrOpc = RPI.isPaired() ? AArch64::LDPDi : AArch64::LDRDui;
2645        Size = 8;
2646        Alignment = Align(8);
2647        break;
2648     case RegPairInfo::FPR128:
2649        LdrOpc = RPI.isPaired() ? AArch64::LDPQi : AArch64::LDRQui;
2650        Size = 16;
2651        Alignment = Align(16);
2652        break;
2653     case RegPairInfo::ZPR:
2654        LdrOpc = AArch64::LDR_ZXI;
2655        Size = 16;
2656        Alignment = Align(16);
2657        break;
2658     case RegPairInfo::PPR:
2659        LdrOpc = AArch64::LDR_PXI;
2660        Size = 2;
2661        Alignment = Align(2);
2662        break;
2663     }
2664     LLVM_DEBUG(dbgs() << "CSR restore: (" << printReg(Reg1, TRI);
2665                if (RPI.isPaired()) dbgs() << ", " << printReg(Reg2, TRI);
2666                dbgs() << ") -> fi#(" << RPI.FrameIdx;
2667                if (RPI.isPaired()) dbgs() << ", " << RPI.FrameIdx + 1;
2668                dbgs() << ")\n");
2669 
2670     // Windows unwind codes require consecutive registers if registers are
2671     // paired.  Make the switch here, so that the code below will restore
2672     // (x,x+1) and not (x+1,x).
2673     unsigned FrameIdxReg1 = RPI.FrameIdx;
2674     unsigned FrameIdxReg2 = RPI.FrameIdx + 1;
2675     if (NeedsWinCFI && RPI.isPaired()) {
2676       std::swap(Reg1, Reg2);
2677       std::swap(FrameIdxReg1, FrameIdxReg2);
2678     }
2679     MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(LdrOpc));
2680     if (RPI.isPaired()) {
2681       MIB.addReg(Reg2, getDefRegState(true));
2682       MIB.addMemOperand(MF.getMachineMemOperand(
2683           MachinePointerInfo::getFixedStack(MF, FrameIdxReg2),
2684           MachineMemOperand::MOLoad, Size, Alignment));
2685     }
2686     MIB.addReg(Reg1, getDefRegState(true))
2687         .addReg(AArch64::SP)
2688         .addImm(RPI.Offset) // [sp, #offset*scale]
2689                             // where factor*scale is implicit
2690         .setMIFlag(MachineInstr::FrameDestroy);
2691     MIB.addMemOperand(MF.getMachineMemOperand(
2692         MachinePointerInfo::getFixedStack(MF, FrameIdxReg1),
2693         MachineMemOperand::MOLoad, Size, Alignment));
2694     if (NeedsWinCFI)
2695       InsertSEH(MIB, TII, MachineInstr::FrameDestroy);
2696   };
2697 
2698   // SVE objects are always restored in reverse order.
2699   for (const RegPairInfo &RPI : reverse(RegPairs))
2700     if (RPI.isScalable())
2701       EmitMI(RPI);
2702 
2703   if (ReverseCSRRestoreSeq) {
2704     for (const RegPairInfo &RPI : reverse(RegPairs))
2705       if (!RPI.isScalable())
2706         EmitMI(RPI);
2707   } else if (homogeneousPrologEpilog(MF, &MBB)) {
2708     auto MIB = BuildMI(MBB, MI, DL, TII.get(AArch64::HOM_Epilog))
2709                    .setMIFlag(MachineInstr::FrameDestroy);
2710     for (auto &RPI : RegPairs) {
2711       MIB.addReg(RPI.Reg1, RegState::Define);
2712       MIB.addReg(RPI.Reg2, RegState::Define);
2713     }
2714     return true;
2715   } else
2716     for (const RegPairInfo &RPI : RegPairs)
2717       if (!RPI.isScalable())
2718         EmitMI(RPI);
2719 
2720   if (NeedShadowCallStackProlog) {
2721     // Shadow call stack epilog: ldr x30, [x18, #-8]!
2722     BuildMI(MBB, MI, DL, TII.get(AArch64::LDRXpre))
2723         .addReg(AArch64::X18, RegState::Define)
2724         .addReg(AArch64::LR, RegState::Define)
2725         .addReg(AArch64::X18)
2726         .addImm(-8)
2727         .setMIFlag(MachineInstr::FrameDestroy);
2728   }
2729 
2730   return true;
2731 }
2732 
2733 void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
2734                                                 BitVector &SavedRegs,
2735                                                 RegScavenger *RS) const {
2736   // All calls are tail calls in GHC calling conv, and functions have no
2737   // prologue/epilogue.
2738   if (MF.getFunction().getCallingConv() == CallingConv::GHC)
2739     return;
2740 
2741   TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
2742   const AArch64RegisterInfo *RegInfo = static_cast<const AArch64RegisterInfo *>(
2743       MF.getSubtarget().getRegisterInfo());
2744   const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
2745   AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
2746   unsigned UnspilledCSGPR = AArch64::NoRegister;
2747   unsigned UnspilledCSGPRPaired = AArch64::NoRegister;
2748 
2749   MachineFrameInfo &MFI = MF.getFrameInfo();
2750   const MCPhysReg *CSRegs = MF.getRegInfo().getCalleeSavedRegs();
2751 
2752   unsigned BasePointerReg = RegInfo->hasBasePointer(MF)
2753                                 ? RegInfo->getBaseRegister()
2754                                 : (unsigned)AArch64::NoRegister;
2755 
2756   unsigned ExtraCSSpill = 0;
2757   // Figure out which callee-saved registers to save/restore.
2758   for (unsigned i = 0; CSRegs[i]; ++i) {
2759     const unsigned Reg = CSRegs[i];
2760 
2761     // Add the base pointer register to SavedRegs if it is callee-save.
2762     if (Reg == BasePointerReg)
2763       SavedRegs.set(Reg);
2764 
2765     bool RegUsed = SavedRegs.test(Reg);
2766     unsigned PairedReg = AArch64::NoRegister;
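         // The callee-saved register lists place pairable registers at
         // adjacent even/odd indices, so i ^ 1 selects the partner of the
         // current entry.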
2767     if (AArch64::GPR64RegClass.contains(Reg) ||
2768         AArch64::FPR64RegClass.contains(Reg) ||
2769         AArch64::FPR128RegClass.contains(Reg))
2770       PairedReg = CSRegs[i ^ 1];
2771 
2772     if (!RegUsed) {
2773       if (AArch64::GPR64RegClass.contains(Reg) &&
2774           !RegInfo->isReservedReg(MF, Reg)) {
2775         UnspilledCSGPR = Reg;
2776         UnspilledCSGPRPaired = PairedReg;
2777       }
2778       continue;
2779     }
2780 
2781     // MachO's compact unwind format relies on all registers being stored in
2782     // pairs.
2783     // FIXME: the usual format is actually better if unwinding isn't needed.
2784     if (producePairRegisters(MF) && PairedReg != AArch64::NoRegister &&
2785         !SavedRegs.test(PairedReg)) {
2786       SavedRegs.set(PairedReg);
2787       if (AArch64::GPR64RegClass.contains(PairedReg) &&
2788           !RegInfo->isReservedReg(MF, PairedReg))
2789         ExtraCSSpill = PairedReg;
2790     }
2791   }
2792 
2793   if (MF.getFunction().getCallingConv() == CallingConv::Win64 &&
2794       !Subtarget.isTargetWindows()) {
2795     // For Windows calling convention on a non-Windows OS, where X18 is treated
2796     // as reserved, back up X18 when entering non-Windows code (marked with the
2797     // Windows calling convention) and restore when returning regardless of
2798     // whether the individual function uses it - it might call other functions
2799     // that clobber it.
2800     SavedRegs.set(AArch64::X18);
2801   }
2802 
2803   // Calculates the callee saved stack size.
2804   unsigned CSStackSize = 0;
2805   unsigned SVECSStackSize = 0;
2806   const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
2807   const MachineRegisterInfo &MRI = MF.getRegInfo();
2808   for (unsigned Reg : SavedRegs.set_bits()) {
2809     auto RegSize = TRI->getRegSizeInBits(Reg, MRI) / 8;
2810     if (AArch64::PPRRegClass.contains(Reg) ||
2811         AArch64::ZPRRegClass.contains(Reg))
2812       SVECSStackSize += RegSize;
2813     else
2814       CSStackSize += RegSize;
2815   }
2816 
2817   // Save number of saved regs, so we can easily update CSStackSize later.
2818   unsigned NumSavedRegs = SavedRegs.count();
2819 
2820   // The frame record needs to be created by saving the appropriate registers
2821   uint64_t EstimatedStackSize = MFI.estimateStackSize(MF);
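       // The extra 16 bytes in the probe estimate conservatively cover the
       // FP/LR pair that may be added to the callee saves just below.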
2822   if (hasFP(MF) ||
2823       windowsRequiresStackProbe(MF, EstimatedStackSize + CSStackSize + 16)) {
2824     SavedRegs.set(AArch64::FP);
2825     SavedRegs.set(AArch64::LR);
2826   }
2827 
2828   LLVM_DEBUG(dbgs() << "*** determineCalleeSaves\nSaved CSRs:";
2829              for (unsigned Reg
2830                   : SavedRegs.set_bits()) dbgs()
2831              << ' ' << printReg(Reg, RegInfo);
2832              dbgs() << "\n";);
2833 
2834   // If any callee-saved registers are used, the frame cannot be eliminated.
2835   int64_t SVEStackSize =
2836       alignTo(SVECSStackSize + estimateSVEStackObjectOffsets(MFI), 16);
2837   bool CanEliminateFrame = (SavedRegs.count() == 0) && !SVEStackSize;
2838 
2839   // The CSR spill slots have not been allocated yet, so estimateStackSize
2840   // won't include them.
2841   unsigned EstimatedStackSizeLimit = estimateRSStackSizeLimit(MF);
2842 
2843   // Conservatively always assume BigStack when there are SVE spills.
2844   bool BigStack = SVEStackSize ||
2845                   (EstimatedStackSize + CSStackSize) > EstimatedStackSizeLimit;
2846   if (BigStack || !CanEliminateFrame || RegInfo->cannotEliminateFrame(MF))
2847     AFI->setHasStackFrame(true);
2848 
2849   // Estimate if we might need to scavenge a register at some point in order
2850   // to materialize a stack offset. If so, either spill one additional
2851   // callee-saved register or reserve a special spill slot to facilitate
2852   // register scavenging. If we already spilled an extra callee-saved register
2853   // above to keep the number of spills even, we don't need to do anything else
2854   // here.
2855   if (BigStack) {
2856     if (!ExtraCSSpill && UnspilledCSGPR != AArch64::NoRegister) {
2857       LLVM_DEBUG(dbgs() << "Spilling " << printReg(UnspilledCSGPR, RegInfo)
2858                         << " to get a scratch register.\n");
2859       SavedRegs.set(UnspilledCSGPR);
2860       // MachO's compact unwind format relies on all registers being stored in
2861       // pairs, so if we need to spill one extra for BigStack, then we need to
2862       // store the pair.
2863       if (producePairRegisters(MF))
2864         SavedRegs.set(UnspilledCSGPRPaired);
2865       ExtraCSSpill = UnspilledCSGPR;
2866     }
2867 
2868     // If we didn't find an extra callee-saved register to spill, create
2869     // an emergency spill slot.
2870     if (!ExtraCSSpill || MF.getRegInfo().isPhysRegUsed(ExtraCSSpill)) {
2871       const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
2872       const TargetRegisterClass &RC = AArch64::GPR64RegClass;
2873       unsigned Size = TRI->getSpillSize(RC);
2874       Align Alignment = TRI->getSpillAlign(RC);
2875       int FI = MFI.CreateStackObject(Size, Alignment, false);
2876       RS->addScavengingFrameIndex(FI);
2877       LLVM_DEBUG(dbgs() << "No available CS registers, allocated fi#" << FI
2878                         << " as the emergency spill slot.\n");
2879     }
2880   }
2881 
2882   // Adding the size of additional 64bit GPR saves.
2883   CSStackSize += 8 * (SavedRegs.count() - NumSavedRegs);
2884 
2885   // A Swift asynchronous context extends the frame record with a pointer
2886   // directly before FP.
2887   if (hasFP(MF) && AFI->hasSwiftAsyncContext())
2888     CSStackSize += 8;
2889 
2890   uint64_t AlignedCSStackSize = alignTo(CSStackSize, 16);
2891   LLVM_DEBUG(dbgs() << "Estimated stack frame size: "
2892                << EstimatedStackSize + AlignedCSStackSize
2893                << " bytes.\n");
2894 
2895   assert((!MFI.isCalleeSavedInfoValid() ||
2896           AFI->getCalleeSavedStackSize() == AlignedCSStackSize) &&
2897          "Should not invalidate callee saved info");
2898 
2899   // Round up to register pair alignment to avoid additional SP adjustment
2900   // instructions.
2901   AFI->setCalleeSavedStackSize(AlignedCSStackSize);
2902   AFI->setCalleeSaveStackHasFreeSpace(AlignedCSStackSize != CSStackSize);
2903   AFI->setSVECalleeSavedStackSize(alignTo(SVECSStackSize, 16));
2904 }
2905 
2906 bool AArch64FrameLowering::assignCalleeSavedSpillSlots(
2907     MachineFunction &MF, const TargetRegisterInfo *RegInfo,
2908     std::vector<CalleeSavedInfo> &CSI, unsigned &MinCSFrameIndex,
2909     unsigned &MaxCSFrameIndex) const {
2910   bool NeedsWinCFI = needsWinCFI(MF);
2911   // To match the canonical windows frame layout, reverse the list of
2912   // callee saved registers to get them laid out by PrologEpilogInserter
2913   // in the right order. (PrologEpilogInserter allocates stack objects top
2914   // down. Windows canonical prologs store higher numbered registers at
2915   // the top, thus have the CSI array start from the highest registers.)
2916   if (NeedsWinCFI)
2917     std::reverse(CSI.begin(), CSI.end());
2918 
2919   if (CSI.empty())
2920     return true; // Early exit if no callee saved registers are modified!
2921 
2922   // Now that we know which registers need to be saved and restored, allocate
2923   // stack slots for them.
2924   MachineFrameInfo &MFI = MF.getFrameInfo();
2925   auto *AFI = MF.getInfo<AArch64FunctionInfo>();
2926   for (auto &CS : CSI) {
2927     Register Reg = CS.getReg();
2928     const TargetRegisterClass *RC = RegInfo->getMinimalPhysRegClass(Reg);
2929 
2930     unsigned Size = RegInfo->getSpillSize(*RC);
2931     Align Alignment(RegInfo->getSpillAlign(*RC));
2932     int FrameIdx = MFI.CreateStackObject(Size, Alignment, true);
2933     CS.setFrameIdx(FrameIdx);
2934 
2935     if ((unsigned)FrameIdx < MinCSFrameIndex) MinCSFrameIndex = FrameIdx;
2936     if ((unsigned)FrameIdx > MaxCSFrameIndex) MaxCSFrameIndex = FrameIdx;
2937 
2938     // Grab 8 bytes below FP for the extended asynchronous frame info.
2939     if (hasFP(MF) && AFI->hasSwiftAsyncContext() && Reg == AArch64::FP) {
2940       FrameIdx = MFI.CreateStackObject(8, Alignment, true);
2941       AFI->setSwiftAsyncContextFrameIdx(FrameIdx);
2942       if ((unsigned)FrameIdx < MinCSFrameIndex) MinCSFrameIndex = FrameIdx;
2943       if ((unsigned)FrameIdx > MaxCSFrameIndex) MaxCSFrameIndex = FrameIdx;
2944     }
2945   }
2946   return true;
2947 }
2948 
2949 bool AArch64FrameLowering::enableStackSlotScavenging(
2950     const MachineFunction &MF) const {
2951   const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
2952   return AFI->hasCalleeSaveStackFreeSpace();
2953 }
2954 
2955 /// Returns true if there are any SVE callee saves.
2956 static bool getSVECalleeSaveSlotRange(const MachineFrameInfo &MFI,
2957                                       int &Min, int &Max) {
2958   Min = std::numeric_limits<int>::max();
2959   Max = std::numeric_limits<int>::min();
2960 
2961   if (!MFI.isCalleeSavedInfoValid())
2962     return false;
2963 
2964   const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo();
2965   for (auto &CS : CSI) {
2966     if (AArch64::ZPRRegClass.contains(CS.getReg()) ||
2967         AArch64::PPRRegClass.contains(CS.getReg())) {
2968       assert((Max == std::numeric_limits<int>::min() ||
2969               Max + 1 == CS.getFrameIdx()) &&
2970              "SVE CalleeSaves are not consecutive");
2971 
2972       Min = std::min(Min, CS.getFrameIdx());
2973       Max = std::max(Max, CS.getFrameIdx());
2974     }
2975   }
2976   return Min != std::numeric_limits<int>::max();
2977 }
2978 
2979 // Process all the SVE stack objects and determine offsets for each
2980 // object. If AssignOffsets is true, the offsets get assigned.
2981 // Fills in the first and last callee-saved frame indices into
2982 // Min/MaxCSFrameIndex, respectively.
2983 // Returns the total size of the SVE stack.
2984 static int64_t determineSVEStackObjectOffsets(MachineFrameInfo &MFI,
2985                                               int &MinCSFrameIndex,
2986                                               int &MaxCSFrameIndex,
2987                                               bool AssignOffsets) {
2988 #ifndef NDEBUG
2989   // First process all fixed stack objects.
2990   for (int I = MFI.getObjectIndexBegin(); I != 0; ++I)
2991     assert(MFI.getStackID(I) != TargetStackID::ScalableVector &&
2992            "SVE vectors should never be passed on the stack by value, only by "
2993            "reference.");
2994 #endif
2995 
2996   auto Assign = [&MFI](int FI, int64_t Offset) {
2997     LLVM_DEBUG(dbgs() << "alloc FI(" << FI << ") at SP[" << Offset << "]\n");
2998     MFI.setObjectOffset(FI, Offset);
2999   };
3000 
3001   int64_t Offset = 0;
3002 
3003   // Then process all callee saved slots.
3004   if (getSVECalleeSaveSlotRange(MFI, MinCSFrameIndex, MaxCSFrameIndex)) {
3005     // Assign offsets to the callee save slots.
3006     for (int I = MinCSFrameIndex; I <= MaxCSFrameIndex; ++I) {
3007       Offset += MFI.getObjectSize(I);
3008       Offset = alignTo(Offset, MFI.getObjectAlign(I));
3009       if (AssignOffsets)
3010         Assign(I, -Offset);
3011     }
3012   }
3013 
3014   // Ensure that the callee-save area is aligned to 16 bytes.
3015   Offset = alignTo(Offset, Align(16U));
3016 
3017   // Create a buffer of SVE objects to allocate.
3018   SmallVector<int, 8> ObjectsToAllocate;
3019   for (int I = 0, E = MFI.getObjectIndexEnd(); I != E; ++I) {
3020     unsigned StackID = MFI.getStackID(I);
3021     if (StackID != TargetStackID::ScalableVector)
3022       continue;
3023     if (MaxCSFrameIndex >= I && I >= MinCSFrameIndex)
3024       continue;
3025     if (MFI.isDeadObjectIndex(I))
3026       continue;
3027 
3028     ObjectsToAllocate.push_back(I);
3029   }
3030 
3031   // Allocate all SVE locals and spills
3032   for (unsigned FI : ObjectsToAllocate) {
3033     Align Alignment = MFI.getObjectAlign(FI);
3034     // FIXME: Given that the length of SVE vectors is not necessarily a power of
3035     // two, we'd need to align every object dynamically at runtime if the
3036     // alignment is larger than 16. This is not yet supported.
3037     if (Alignment > Align(16))
3038       report_fatal_error(
3039           "Alignment of scalable vectors > 16 bytes is not yet supported");
3040 
3041     Offset = alignTo(Offset + MFI.getObjectSize(FI), Alignment);
3042     if (AssignOffsets)
3043       Assign(FI, -Offset);
3044   }
3045 
3046   return Offset;
3047 }
3048 
3049 int64_t AArch64FrameLowering::estimateSVEStackObjectOffsets(
3050     MachineFrameInfo &MFI) const {
3051   int MinCSFrameIndex, MaxCSFrameIndex;
3052   return determineSVEStackObjectOffsets(MFI, MinCSFrameIndex, MaxCSFrameIndex, false);
3053 }
3054 
3055 int64_t AArch64FrameLowering::assignSVEStackObjectOffsets(
3056     MachineFrameInfo &MFI, int &MinCSFrameIndex, int &MaxCSFrameIndex) const {
3057   return determineSVEStackObjectOffsets(MFI, MinCSFrameIndex, MaxCSFrameIndex,
3058                                         true);
3059 }
3060 
3061 void AArch64FrameLowering::processFunctionBeforeFrameFinalized(
3062     MachineFunction &MF, RegScavenger *RS) const {
3063   MachineFrameInfo &MFI = MF.getFrameInfo();
3064 
3065   assert(getStackGrowthDirection() == TargetFrameLowering::StackGrowsDown &&
3066          "Upwards growing stack unsupported");
3067 
3068   int MinCSFrameIndex, MaxCSFrameIndex;
3069   int64_t SVEStackSize =
3070       assignSVEStackObjectOffsets(MFI, MinCSFrameIndex, MaxCSFrameIndex);
3071 
3072   AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
3073   AFI->setStackSizeSVE(alignTo(SVEStackSize, 16U));
3074   AFI->setMinMaxSVECSFrameIndex(MinCSFrameIndex, MaxCSFrameIndex);
3075 
3076   // If this function isn't doing Win64-style C++ EH, we don't need to do
3077   // anything.
3078   if (!MF.hasEHFunclets())
3079     return;
3080   const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
3081   WinEHFuncInfo &EHInfo = *MF.getWinEHFuncInfo();
3082 
3083   MachineBasicBlock &MBB = MF.front();
3084   auto MBBI = MBB.begin();
3085   while (MBBI != MBB.end() && MBBI->getFlag(MachineInstr::FrameSetup))
3086     ++MBBI;
3087 
3088   // Create an UnwindHelp object.
3089   // The UnwindHelp object is allocated at the start of the fixed object area
3090   int64_t FixedObject =
3091       getFixedObjectSize(MF, AFI, /*IsWin64*/ true, /*IsFunclet*/ false);
3092   int UnwindHelpFI = MFI.CreateFixedObject(/*Size*/ 8,
3093                                            /*SPOffset*/ -FixedObject,
3094                                            /*IsImmutable=*/false);
3095   EHInfo.UnwindHelpFrameIdx = UnwindHelpFI;
3096 
3097   // We need to store -2 into the UnwindHelp object at the start of the
3098   // function.
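       // (-2 is the initial EH state value the Windows runtime expects in
       // UnwindHelp; the X86 Win64 lowering stores the same sentinel.)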
3099   DebugLoc DL;
3100   RS->enterBasicBlockEnd(MBB);
3101   RS->backward(std::prev(MBBI));
3102   unsigned DstReg = RS->FindUnusedReg(&AArch64::GPR64commonRegClass);
3103   assert(DstReg && "There must be a free register after frame setup");
3104   BuildMI(MBB, MBBI, DL, TII.get(AArch64::MOVi64imm), DstReg).addImm(-2);
3105   BuildMI(MBB, MBBI, DL, TII.get(AArch64::STURXi))
3106       .addReg(DstReg, getKillRegState(true))
3107       .addFrameIndex(UnwindHelpFI)
3108       .addImm(0);
3109 }
3110 
3111 namespace {
3112 struct TagStoreInstr {
3113   MachineInstr *MI;
3114   int64_t Offset, Size;
3115   explicit TagStoreInstr(MachineInstr *MI, int64_t Offset, int64_t Size)
3116       : MI(MI), Offset(Offset), Size(Size) {}
3117 };
3118 
3119 class TagStoreEdit {
3120   MachineFunction *MF;
3121   MachineBasicBlock *MBB;
3122   MachineRegisterInfo *MRI;
3123   // Tag store instructions that are being replaced.
3124   SmallVector<TagStoreInstr, 8> TagStores;
3125   // Combined memref arguments of the above instructions.
3126   SmallVector<MachineMemOperand *, 8> CombinedMemRefs;
3127 
3128   // Replace allocation tags in [FrameReg + FrameRegOffset, FrameReg +
3129   // FrameRegOffset + Size) with the address tag of SP.
3130   Register FrameReg;
3131   StackOffset FrameRegOffset;
3132   int64_t Size;
3133   // If not None, move FrameReg to (FrameReg + FrameRegUpdate) at the end.
3134   Optional<int64_t> FrameRegUpdate;
3135   // MIFlags for any FrameReg updating instructions.
3136   unsigned FrameRegUpdateFlags;
3137 
3138   // Use zeroing instruction variants.
3139   bool ZeroData;
3140   DebugLoc DL;
3141 
3142   void emitUnrolled(MachineBasicBlock::iterator InsertI);
3143   void emitLoop(MachineBasicBlock::iterator InsertI);
3144 
3145 public:
3146   TagStoreEdit(MachineBasicBlock *MBB, bool ZeroData)
3147       : MBB(MBB), ZeroData(ZeroData) {
3148     MF = MBB->getParent();
3149     MRI = &MF->getRegInfo();
3150   }
3151   // Add an instruction to be replaced. Instructions must be added in the
3152   // ascending order of Offset, and have to be adjacent.
3153   void addInstruction(TagStoreInstr I) {
3154     assert((TagStores.empty() ||
3155             TagStores.back().Offset + TagStores.back().Size == I.Offset) &&
3156            "Non-adjacent tag store instructions.");
3157     TagStores.push_back(I);
3158   }
3159   void clear() { TagStores.clear(); }
3160   // Emit equivalent code at the given location, and erase the current set of
3161   // instructions. May skip if the replacement is not profitable. May invalidate
3162   // the input iterator and replace it with a valid one.
3163   void emitCode(MachineBasicBlock::iterator &InsertI,
3164                 const AArch64FrameLowering *TFI, bool IsLast);
3165 };
3166 
3167 void TagStoreEdit::emitUnrolled(MachineBasicBlock::iterator InsertI) {
3168   const AArch64InstrInfo *TII =
3169       MF->getSubtarget<AArch64Subtarget>().getInstrInfo();
3170 
3171   const int64_t kMinOffset = -256 * 16;
3172   const int64_t kMaxOffset = 255 * 16;
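       // STG/ST2G (and their zeroing variants) take a signed 9-bit immediate
       // scaled by 16, which bounds how far we can address from BaseReg.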
3173 
3174   Register BaseReg = FrameReg;
3175   int64_t BaseRegOffsetBytes = FrameRegOffset.getFixed();
3176   if (BaseRegOffsetBytes < kMinOffset ||
3177       BaseRegOffsetBytes + (Size - Size % 32) > kMaxOffset) {
3178     Register ScratchReg = MRI->createVirtualRegister(&AArch64::GPR64RegClass);
3179     emitFrameOffset(*MBB, InsertI, DL, ScratchReg, BaseReg,
3180                     StackOffset::getFixed(BaseRegOffsetBytes), TII);
3181     BaseReg = ScratchReg;
3182     BaseRegOffsetBytes = 0;
3183   }
3184 
3185   MachineInstr *LastI = nullptr;
3186   while (Size) {
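         // Tag granules are 16 bytes; use ST2G (two granules) while more than
         // one granule remains, and finish with a single STG otherwise.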
3187     int64_t InstrSize = (Size > 16) ? 32 : 16;
3188     unsigned Opcode =
3189         InstrSize == 16
3190             ? (ZeroData ? AArch64::STZGOffset : AArch64::STGOffset)
3191             : (ZeroData ? AArch64::STZ2GOffset : AArch64::ST2GOffset);
3192     MachineInstr *I = BuildMI(*MBB, InsertI, DL, TII->get(Opcode))
3193                           .addReg(AArch64::SP)
3194                           .addReg(BaseReg)
3195                           .addImm(BaseRegOffsetBytes / 16)
3196                           .setMemRefs(CombinedMemRefs);
3197     // A store to [BaseReg, #0] should go last for an opportunity to fold the
3198     // final SP adjustment in the epilogue.
3199     if (BaseRegOffsetBytes == 0)
3200       LastI = I;
3201     BaseRegOffsetBytes += InstrSize;
3202     Size -= InstrSize;
3203   }
3204 
3205   if (LastI)
3206     MBB->splice(InsertI, MBB, LastI);
3207 }
3208 
3209 void TagStoreEdit::emitLoop(MachineBasicBlock::iterator InsertI) {
3210   const AArch64InstrInfo *TII =
3211       MF->getSubtarget<AArch64Subtarget>().getInstrInfo();
3212 
3213   Register BaseReg = FrameRegUpdate
3214                          ? FrameReg
3215                          : MRI->createVirtualRegister(&AArch64::GPR64RegClass);
3216   Register SizeReg = MRI->createVirtualRegister(&AArch64::GPR64RegClass);
3217 
3218   emitFrameOffset(*MBB, InsertI, DL, BaseReg, FrameReg, FrameRegOffset, TII);
3219 
3220   int64_t LoopSize = Size;
3221   // If the loop size is not a multiple of 32, split off one 16-byte store at
3222   // the end to fold BaseReg update into.
3223   if (FrameRegUpdate && *FrameRegUpdate)
3224     LoopSize -= LoopSize % 32;
3225   MachineInstr *LoopI = BuildMI(*MBB, InsertI, DL,
3226                                 TII->get(ZeroData ? AArch64::STZGloop_wback
3227                                                   : AArch64::STGloop_wback))
3228                             .addDef(SizeReg)
3229                             .addDef(BaseReg)
3230                             .addImm(LoopSize)
3231                             .addReg(BaseReg)
3232                             .setMemRefs(CombinedMemRefs);
3233   if (FrameRegUpdate)
3234     LoopI->setFlags(FrameRegUpdateFlags);
3235 
3236   int64_t ExtraBaseRegUpdate =
3237       FrameRegUpdate ? (*FrameRegUpdate - FrameRegOffset.getFixed() - Size) : 0;
3238   if (LoopSize < Size) {
3239     assert(FrameRegUpdate);
3240     assert(Size - LoopSize == 16);
3241     // Tag 16 more bytes at BaseReg and update BaseReg.
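         // The post-index immediate is in 16-byte granules: one granule for
         // the bytes tagged here plus ExtraBaseRegUpdate / 16 for the folded
         // leftover adjustment of BaseReg.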
3242     BuildMI(*MBB, InsertI, DL,
3243             TII->get(ZeroData ? AArch64::STZGPostIndex : AArch64::STGPostIndex))
3244         .addDef(BaseReg)
3245         .addReg(BaseReg)
3246         .addReg(BaseReg)
3247         .addImm(1 + ExtraBaseRegUpdate / 16)
3248         .setMemRefs(CombinedMemRefs)
3249         .setMIFlags(FrameRegUpdateFlags);
3250   } else if (ExtraBaseRegUpdate) {
3251     // Update BaseReg.
3252     BuildMI(
3253         *MBB, InsertI, DL,
3254         TII->get(ExtraBaseRegUpdate > 0 ? AArch64::ADDXri : AArch64::SUBXri))
3255         .addDef(BaseReg)
3256         .addReg(BaseReg)
3257         .addImm(std::abs(ExtraBaseRegUpdate))
3258         .addImm(0)
3259         .setMIFlags(FrameRegUpdateFlags);
3260   }
3261 }
3262 
3263 // Check if *II is a register update that can be merged into STGloop that ends
3264 // at (Reg + Size). If it can, *TotalOffset is set to the update's offset; the
3265 // leftover adjustment (TotalOffset - Size) is then folded in after the loop.
3266 bool canMergeRegUpdate(MachineBasicBlock::iterator II, unsigned Reg,
3267                        int64_t Size, int64_t *TotalOffset) {
3268   MachineInstr &MI = *II;
3269   if ((MI.getOpcode() == AArch64::ADDXri ||
3270        MI.getOpcode() == AArch64::SUBXri) &&
3271       MI.getOperand(0).getReg() == Reg && MI.getOperand(1).getReg() == Reg) {
3272     unsigned Shift = AArch64_AM::getShiftValue(MI.getOperand(3).getImm());
3273     int64_t Offset = MI.getOperand(2).getImm() << Shift;
3274     if (MI.getOpcode() == AArch64::SUBXri)
3275       Offset = -Offset;
3276     int64_t AbsPostOffset = std::abs(Offset - Size);
3277     const int64_t kMaxOffset =
3278         0xFFF; // Max encoding for unshifted ADDXri / SUBXri
3279     if (AbsPostOffset <= kMaxOffset && AbsPostOffset % 16 == 0) {
3280       *TotalOffset = Offset;
3281       return true;
3282     }
3283   }
3284   return false;
3285 }
3286 
3287 void mergeMemRefs(const SmallVectorImpl<TagStoreInstr> &TSE,
3288                   SmallVectorImpl<MachineMemOperand *> &MemRefs) {
3289   MemRefs.clear();
3290   for (auto &TS : TSE) {
3291     MachineInstr *MI = TS.MI;
3292     // An instruction without memory operands may access anything. Be
3293     // conservative and return an empty list.
3294     if (MI->memoperands_empty()) {
3295       MemRefs.clear();
3296       return;
3297     }
3298     MemRefs.append(MI->memoperands_begin(), MI->memoperands_end());
3299   }
3300 }
3301 
3302 void TagStoreEdit::emitCode(MachineBasicBlock::iterator &InsertI,
3303                             const AArch64FrameLowering *TFI, bool IsLast) {
3304   if (TagStores.empty())
3305     return;
3306   TagStoreInstr &FirstTagStore = TagStores[0];
3307   TagStoreInstr &LastTagStore = TagStores[TagStores.size() - 1];
3308   Size = LastTagStore.Offset - FirstTagStore.Offset + LastTagStore.Size;
3309   DL = TagStores[0].MI->getDebugLoc();
3310 
3311   Register Reg;
3312   FrameRegOffset = TFI->resolveFrameOffsetReference(
3313       *MF, FirstTagStore.Offset, false /*isFixed*/, false /*isSVE*/, Reg,
3314       /*PreferFP=*/false, /*ForSimm=*/true);
3315   FrameReg = Reg;
3316   FrameRegUpdate = None;
3317 
3318   mergeMemRefs(TagStores, CombinedMemRefs);
3319 
3320   LLVM_DEBUG(dbgs() << "Replacing adjacent STG instructions:\n";
3321              for (const auto &Instr
3322                   : TagStores) { dbgs() << "  " << *Instr.MI; });
3323 
3324   // Size threshold where a loop becomes shorter than a linear sequence of
3325   // tagging instructions.
3326   const int kSetTagLoopThreshold = 176;
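       // (Heuristic: 176 bytes is 11 granules, roughly where an unrolled
       // STG/ST2G run stops being shorter than the expanded loop.)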
3327   if (Size < kSetTagLoopThreshold) {
3328     if (TagStores.size() < 2)
3329       return;
3330     emitUnrolled(InsertI);
3331   } else {
3332     MachineInstr *UpdateInstr = nullptr;
3333     int64_t TotalOffset;
3334     if (IsLast) {
3335       // See if we can merge base register update into the STGloop.
3336       // This is done in AArch64LoadStoreOptimizer for "normal" stores,
3337       // but STGloop is way too unusual for that, and also it only
3338       // realistically happens in function epilogue. Also, STGloop is expanded
3339       // before that pass.
3340       if (InsertI != MBB->end() &&
3341           canMergeRegUpdate(InsertI, FrameReg, FrameRegOffset.getFixed() + Size,
3342                             &TotalOffset)) {
3343         UpdateInstr = &*InsertI++;
3344         LLVM_DEBUG(dbgs() << "Folding SP update into loop:\n  "
3345                           << *UpdateInstr);
3346       }
3347     }
3348 
3349     if (!UpdateInstr && TagStores.size() < 2)
3350       return;
3351 
3352     if (UpdateInstr) {
3353       FrameRegUpdate = TotalOffset;
3354       FrameRegUpdateFlags = UpdateInstr->getFlags();
3355     }
3356     emitLoop(InsertI);
3357     if (UpdateInstr)
3358       UpdateInstr->eraseFromParent();
3359   }
3360 
3361   for (auto &TS : TagStores)
3362     TS.MI->eraseFromParent();
3363 }
3364 
3365 bool isMergeableStackTaggingInstruction(MachineInstr &MI, int64_t &Offset,
3366                                         int64_t &Size, bool &ZeroData) {
3367   MachineFunction &MF = *MI.getParent()->getParent();
3368   const MachineFrameInfo &MFI = MF.getFrameInfo();
3369 
3370   unsigned Opcode = MI.getOpcode();
3371   ZeroData = (Opcode == AArch64::STZGloop || Opcode == AArch64::STZGOffset ||
3372               Opcode == AArch64::STZ2GOffset);
3373 
3374   if (Opcode == AArch64::STGloop || Opcode == AArch64::STZGloop) {
3375     if (!MI.getOperand(0).isDead() || !MI.getOperand(1).isDead())
3376       return false;
3377     if (!MI.getOperand(2).isImm() || !MI.getOperand(3).isFI())
3378       return false;
3379     Offset = MFI.getObjectOffset(MI.getOperand(3).getIndex());
3380     Size = MI.getOperand(2).getImm();
3381     return true;
3382   }
3383 
3384   if (Opcode == AArch64::STGOffset || Opcode == AArch64::STZGOffset)
3385     Size = 16;
3386   else if (Opcode == AArch64::ST2GOffset || Opcode == AArch64::STZ2GOffset)
3387     Size = 32;
3388   else
3389     return false;
3390 
3391   if (MI.getOperand(0).getReg() != AArch64::SP || !MI.getOperand(1).isFI())
3392     return false;
3393 
3394   Offset = MFI.getObjectOffset(MI.getOperand(1).getIndex()) +
3395            16 * MI.getOperand(2).getImm();
3396   return true;
3397 }
3398 
3399 // Detect a run of memory tagging instructions for adjacent stack frame slots,
3400 // and replace them with a shorter instruction sequence:
3401 // * replace STG + STG with ST2G
3402 // * replace STGloop + STGloop with STGloop
3403 // This code needs to run when stack slot offsets are already known, but before
3404 // FrameIndex operands in STG instructions are eliminated.
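     // For example (operands illustrative), two STGs covering adjacent
     // 16-byte slots
     //   STG %sp, fi#1, 0 ; STG %sp, fi#2, 0
     // can be rewritten as the single instruction
     //   ST2G %sp, fi#1, 0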
3405 MachineBasicBlock::iterator tryMergeAdjacentSTG(MachineBasicBlock::iterator II,
3406                                                 const AArch64FrameLowering *TFI,
3407                                                 RegScavenger *RS) {
3408   bool FirstZeroData;
3409   int64_t Size, Offset;
3410   MachineInstr &MI = *II;
3411   MachineBasicBlock *MBB = MI.getParent();
3412   MachineBasicBlock::iterator NextI = ++II;
3413   if (&MI == &MBB->instr_back())
3414     return II;
3415   if (!isMergeableStackTaggingInstruction(MI, Offset, Size, FirstZeroData))
3416     return II;
3417 
3418   SmallVector<TagStoreInstr, 4> Instrs;
3419   Instrs.emplace_back(&MI, Offset, Size);
3420 
3421   constexpr int kScanLimit = 10;
3422   int Count = 0;
3423   for (MachineBasicBlock::iterator E = MBB->end();
3424        NextI != E && Count < kScanLimit; ++NextI) {
3425     MachineInstr &MI = *NextI;
3426     bool ZeroData;
3427     int64_t Size, Offset;
3428     // Collect instructions that update memory tags with a FrameIndex operand
3429     // and (when applicable) constant size, and whose output registers are dead
3430     // (the latter is almost always the case in practice). Since these
3431     // instructions effectively have no inputs or outputs, we are free to skip
3432     // any non-aliasing instructions in between without tracking used registers.
3433     if (isMergeableStackTaggingInstruction(MI, Offset, Size, ZeroData)) {
3434       if (ZeroData != FirstZeroData)
3435         break;
3436       Instrs.emplace_back(&MI, Offset, Size);
3437       continue;
3438     }
3439 
3440     // Only count non-transient, non-tagging instructions toward the scan
3441     // limit.
3442     if (!MI.isTransient())
3443       ++Count;
3444 
3445     // Just in case, stop before the epilogue code starts.
3446     if (MI.getFlag(MachineInstr::FrameSetup) ||
3447         MI.getFlag(MachineInstr::FrameDestroy))
3448       break;
3449 
3450     // Reject anything that may alias the collected instructions.
3451     if (MI.mayLoadOrStore() || MI.hasUnmodeledSideEffects())
3452       break;
3453   }
3454 
3455   // New code will be inserted after the last tagging instruction we've found.
3456   MachineBasicBlock::iterator InsertI = Instrs.back().MI;
3457   InsertI++;
3458 
3459   llvm::stable_sort(Instrs,
3460                     [](const TagStoreInstr &Left, const TagStoreInstr &Right) {
3461                       return Left.Offset < Right.Offset;
3462                     });
3463 
3464   // Make sure that we don't have any overlapping stores.
3465   int64_t CurOffset = Instrs[0].Offset;
3466   for (auto &Instr : Instrs) {
3467     if (CurOffset > Instr.Offset)
3468       return NextI;
3469     CurOffset = Instr.Offset + Instr.Size;
3470   }
3471 
3472   // Find contiguous runs of tagged memory and emit shorter instruction
3473   // sequences for them when possible.
3474   TagStoreEdit TSE(MBB, FirstZeroData);
3475   Optional<int64_t> EndOffset;
3476   for (auto &Instr : Instrs) {
3477     if (EndOffset && *EndOffset != Instr.Offset) {
3478       // Found a gap.
3479       TSE.emitCode(InsertI, TFI, /*IsLast = */ false);
3480       TSE.clear();
3481     }
3482 
3483     TSE.addInstruction(Instr);
3484     EndOffset = Instr.Offset + Instr.Size;
3485   }
3486 
3487   TSE.emitCode(InsertI, TFI, /*IsLast = */ true);
3488 
3489   return InsertI;
3490 }
3491 } // namespace
3492 
3493 void AArch64FrameLowering::processFunctionBeforeFrameIndicesReplaced(
3494     MachineFunction &MF, RegScavenger *RS = nullptr) const {
3495   if (StackTaggingMergeSetTag)
3496     for (auto &BB : MF)
3497       for (MachineBasicBlock::iterator II = BB.begin(); II != BB.end();)
3498         II = tryMergeAdjacentSTG(II, this, RS);
3499 }
3500 
3501 /// For Win64 AArch64 EH, the offset to the Unwind object is from the SP
3502 /// before the update.  This is easily retrieved as it is exactly the offset
3503 /// that is set in processFunctionBeforeFrameFinalized.
3504 StackOffset AArch64FrameLowering::getFrameIndexReferencePreferSP(
3505     const MachineFunction &MF, int FI, Register &FrameReg,
3506     bool IgnoreSPUpdates) const {
3507   const MachineFrameInfo &MFI = MF.getFrameInfo();
3508   if (IgnoreSPUpdates) {
3509     LLVM_DEBUG(dbgs() << "Offset from the SP for " << FI << " is "
3510                       << MFI.getObjectOffset(FI) << "\n");
3511     FrameReg = AArch64::SP;
3512     return StackOffset::getFixed(MFI.getObjectOffset(FI));
3513   }
3514 
3515   return getFrameIndexReference(MF, FI, FrameReg);
3516 }
3517 
3518 /// The parent frame offset (aka dispFrame) is only used on X86_64 to retrieve
3519 /// the parent's frame pointer
3520 unsigned AArch64FrameLowering::getWinEHParentFrameOffset(
3521     const MachineFunction &MF) const {
3522   return 0;
3523 }
3524 
3525 /// Funclets only need to account for space for the callee saved registers,
3526 /// as the locals are accounted for in the parent's stack frame.
3527 unsigned AArch64FrameLowering::getWinEHFuncletFrameSize(
3528     const MachineFunction &MF) const {
3529   // This is the size of the pushed CSRs.
3530   unsigned CSSize =
3531       MF.getInfo<AArch64FunctionInfo>()->getCalleeSavedStackSize();
3532   // This is the amount of stack a funclet needs to allocate.
3533   return alignTo(CSSize + MF.getFrameInfo().getMaxCallFrameSize(),
3534                  getStackAlign());
3535 }
3536 
3537 namespace {
3538 struct FrameObject {
3539   bool IsValid = false;
3540   // Index of the object in MFI.
3541   int ObjectIndex = 0;
3542   // Group ID this object belongs to.
3543   int GroupIndex = -1;
3544   // This object should be placed first (closest to SP).
3545   bool ObjectFirst = false;
3546   // This object's group (which always contains the object with
3547   // ObjectFirst==true) should be placed first.
3548   bool GroupFirst = false;
3549 };
3550 
3551 class GroupBuilder {
3552   SmallVector<int, 8> CurrentMembers;
3553   int NextGroupIndex = 0;
3554   std::vector<FrameObject> &Objects;
3555 
3556 public:
3557   GroupBuilder(std::vector<FrameObject> &Objects) : Objects(Objects) {}
3558   void AddMember(int Index) { CurrentMembers.push_back(Index); }
3559   void EndCurrentGroup() {
3560     if (CurrentMembers.size() > 1) {
3561       // Create a new group with the current member list. This might remove them
3562       // from their pre-existing groups. That's OK, dealing with overlapping
3563       // groups is too hard and unlikely to make a difference.
3564       LLVM_DEBUG(dbgs() << "group:");
3565       for (int Index : CurrentMembers) {
3566         Objects[Index].GroupIndex = NextGroupIndex;
3567         LLVM_DEBUG(dbgs() << " " << Index);
3568       }
3569       LLVM_DEBUG(dbgs() << "\n");
3570       NextGroupIndex++;
3571     }
3572     CurrentMembers.clear();
3573   }
3574 };
3575 
3576 bool FrameObjectCompare(const FrameObject &A, const FrameObject &B) {
3577   // Objects at a lower index are closer to FP; objects at a higher index are
3578   // closer to SP.
3579   //
3580   // For consistency in our comparison, all invalid objects are placed
3581   // at the end. This also allows us to stop walking when we hit the
3582   // first invalid item after it's all sorted.
3583   //
3584   // The "first" object goes first (closest to SP), followed by the members of
3585   // the "first" group.
3586   //
3587   // The rest are sorted by the group index to keep the groups together.
3588   // Higher numbered groups are more likely to be around longer (i.e. untagged
3589   // in the function epilogue and not at some earlier point). Place them closer
3590   // to SP.
3591   //
3592   // If all else equal, sort by the object index to keep the objects in the
3593   // original order.
3594   return std::make_tuple(!A.IsValid, A.ObjectFirst, A.GroupFirst, A.GroupIndex,
3595                          A.ObjectIndex) <
3596          std::make_tuple(!B.IsValid, B.ObjectFirst, B.GroupFirst, B.GroupIndex,
3597                          B.ObjectIndex);
3598 }
3599 } // namespace
3600 
3601 void AArch64FrameLowering::orderFrameObjects(
3602     const MachineFunction &MF, SmallVectorImpl<int> &ObjectsToAllocate) const {
3603   if (!OrderFrameObjects || ObjectsToAllocate.empty())
3604     return;
3605 
3606   const MachineFrameInfo &MFI = MF.getFrameInfo();
3607   std::vector<FrameObject> FrameObjects(MFI.getObjectIndexEnd());
3608   for (auto &Obj : ObjectsToAllocate) {
3609     FrameObjects[Obj].IsValid = true;
3610     FrameObjects[Obj].ObjectIndex = Obj;
3611   }
3612 
3613   // Identify stack slots that are tagged at the same time.
3614   GroupBuilder GB(FrameObjects);
3615   for (auto &MBB : MF) {
3616     for (auto &MI : MBB) {
3617       if (MI.isDebugInstr())
3618         continue;
3619       int OpIndex;
3620       switch (MI.getOpcode()) {
3621       case AArch64::STGloop:
3622       case AArch64::STZGloop:
3623         OpIndex = 3;
3624         break;
3625       case AArch64::STGOffset:
3626       case AArch64::STZGOffset:
3627       case AArch64::ST2GOffset:
3628       case AArch64::STZ2GOffset:
3629         OpIndex = 1;
3630         break;
3631       default:
3632         OpIndex = -1;
3633       }
3634 
3635       int TaggedFI = -1;
3636       if (OpIndex >= 0) {
3637         const MachineOperand &MO = MI.getOperand(OpIndex);
3638         if (MO.isFI()) {
3639           int FI = MO.getIndex();
3640           if (FI >= 0 && FI < MFI.getObjectIndexEnd() &&
3641               FrameObjects[FI].IsValid)
3642             TaggedFI = FI;
3643         }
3644       }
3645 
3646       // If this is a stack tagging instruction for a slot that is not part of a
3647       // group yet, either start a new group or add it to the current one.
3648       if (TaggedFI >= 0)
3649         GB.AddMember(TaggedFI);
3650       else
3651         GB.EndCurrentGroup();
3652     }
3653     // Groups should never span multiple basic blocks.
3654     GB.EndCurrentGroup();
3655   }
3656 
3657   // If the function's tagged base pointer is pinned to a stack slot, we want to
3658   // put that slot first when possible. This will likely place it at SP + 0,
3659   // and save one instruction when generating the base pointer because IRG does
3660   // not allow an immediate offset.
3661   const AArch64FunctionInfo &AFI = *MF.getInfo<AArch64FunctionInfo>();
3662   Optional<int> TBPI = AFI.getTaggedBasePointerIndex();
3663   if (TBPI) {
3664     FrameObjects[*TBPI].ObjectFirst = true;
3665     FrameObjects[*TBPI].GroupFirst = true;
3666     int FirstGroupIndex = FrameObjects[*TBPI].GroupIndex;
3667     if (FirstGroupIndex >= 0)
3668       for (FrameObject &Object : FrameObjects)
3669         if (Object.GroupIndex == FirstGroupIndex)
3670           Object.GroupFirst = true;
3671   }
3672 
3673   llvm::stable_sort(FrameObjects, FrameObjectCompare);
3674 
3675   int i = 0;
3676   for (auto &Obj : FrameObjects) {
3677     // All invalid items are sorted at the end, so it's safe to stop.
3678     if (!Obj.IsValid)
3679       break;
3680     ObjectsToAllocate[i++] = Obj.ObjectIndex;
3681   }
3682 
3683   LLVM_DEBUG(dbgs() << "Final frame order:\n"; for (auto &Obj
3684                                                     : FrameObjects) {
3685     if (!Obj.IsValid)
3686       break;
3687     dbgs() << "  " << Obj.ObjectIndex << ": group " << Obj.GroupIndex;
3688     if (Obj.ObjectFirst)
3689       dbgs() << ", first";
3690     if (Obj.GroupFirst)
3691       dbgs() << ", group-first";
3692     dbgs() << "\n";
3693   });
3694 }
3695