1 //===- AArch64FrameLowering.cpp - AArch64 Frame Lowering -------*- C++ -*-====//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file contains the AArch64 implementation of TargetFrameLowering class.
10 //
11 // On AArch64, stack frames are structured as follows:
12 //
13 // The stack grows downward.
14 //
15 // All of the individual frame areas on the frame below are optional, i.e. it's
16 // possible to create a function so that the particular area isn't present
17 // in the frame.
18 //
19 // At function entry, the "frame" looks as follows:
20 //
21 // |                                   | Higher address
22 // |-----------------------------------|
23 // |                                   |
24 // | arguments passed on the stack     |
25 // |                                   |
26 // |-----------------------------------| <- sp
27 // |                                   | Lower address
28 //
29 //
30 // After the prologue has run, the frame has the following general structure.
31 // Note that this doesn't depict the case where a red-zone is used. Also,
32 // technically the last frame area (VLAs) isn't created until the main
33 // function body runs, after the prologue. However, it's depicted here
34 // for completeness.
35 //
36 // |                                   | Higher address
37 // |-----------------------------------|
38 // |                                   |
39 // | arguments passed on the stack     |
40 // |                                   |
41 // |-----------------------------------|
42 // |                                   |
43 // | (Win64 only) varargs from reg     |
44 // |                                   |
45 // |-----------------------------------|
46 // |                                   |
47 // | callee-saved gpr registers        | <--.
48 // |                                   |    | On Darwin platforms these
49 // |- - - - - - - - - - - - - - - - - -|    | callee saves are swapped,
50 // | prev_lr                           |    | (frame record first)
51 // | prev_fp                           | <--'
52 // | async context if needed           |
53 // | (a.k.a. "frame record")           |
54 // |-----------------------------------| <- fp(=x29)
55 // |                                   |
56 // | callee-saved fp/simd/SVE regs     |
57 // |                                   |
58 // |-----------------------------------|
59 // |                                   |
60 // |        SVE stack objects          |
61 // |                                   |
62 // |-----------------------------------|
63 // |.empty.space.to.make.part.below....|
64 // |.aligned.in.case.it.needs.more.than| (size of this area is unknown at
65 // |.the.standard.16-byte.alignment....|  compile time; if present)
66 // |-----------------------------------|
67 // |                                   |
68 // | local variables of fixed size     |
69 // | including spill slots             |
70 // |-----------------------------------| <- bp(not defined by ABI,
71 // |.variable-sized.local.variables....|       LLVM chooses X19)
72 // |.(VLAs)............................| (size of this area is unknown at
73 // |...................................|  compile time)
74 // |-----------------------------------| <- sp
75 // |                                   | Lower address
76 //
77 //
78 // To access data in a frame, a constant offset from one of the pointers
79 // (fp, bp, sp) to that data must be computable at compile time. The sizes
80 // of the areas with a dotted background cannot be computed at compile time
81 // if those areas are present, so all three of fp, bp and sp must be set up
82 // to be able to access all contents of the frame areas, assuming all of
83 // the frame areas are non-empty.
84 //
85 // For most functions, some of the frame areas are empty. For those functions,
86 // it may not be necessary to set up fp or bp:
87 // * A base pointer is definitely needed when there are both VLAs and local
88 //   variables with more-than-default alignment requirements.
89 // * A frame pointer is definitely needed when there are local variables with
90 //   more-than-default alignment requirements.
91 //
92 // For Darwin platforms the frame-record (fp, lr) is stored at the top of the
93 // callee-saved area, since the unwind encoding does not allow for encoding
94 // this dynamically and existing tools depend on this layout. For other
95 // platforms, the frame-record is stored at the bottom of the (gpr) callee-saved
96 // area to allow SVE stack objects (allocated directly below the callee-saves,
97 // if available) to be accessed directly from the framepointer.
98 // The SVE spill/fill instructions have VL-scaled addressing modes such
99 // as:
100 //    ldr z8, [fp, #-7 mul vl]
101 // For SVE the size of the vector length (VL) is not known at compile-time, so
102 // '#-7 mul vl' is an offset that can only be evaluated at runtime. With this
103 // layout, we don't need to add an unscaled offset to the framepointer before
104 // accessing the SVE object in the frame.
105 //
106 // In some cases when a base pointer is not strictly needed, it is generated
107 // anyway when offsets from the frame pointer to access local variables become
108 // so large that the offset can't be encoded in the immediate fields of loads
109 // or stores.
110 //
111 // Outgoing function arguments must be at the bottom of the stack frame when
112 // calling another function. If we do not have variable-sized stack objects, we
113 // can allocate a "reserved call frame" area at the bottom of the local
114 // variable area, large enough for all outgoing calls. If we do have VLAs, then
115 // the stack pointer must be decremented and incremented around each call to
116 // make space for the arguments below the VLAs.
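//
// For illustration only (the exact instructions come from emitFrameOffset and
// call lowering), a call in a function with VLAs might be bracketed as:
//
//    sub sp, sp, #32      // make room for the outgoing arguments
//    bl  callee
//    add sp, sp, #32      // release the outgoing-argument area again
//
// whereas with a reserved call frame the prologue's single SP adjustment
// already covers the largest outgoing-argument area.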
117 //
118 // FIXME: also explain the redzone concept.
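// (Briefly, and as implemented in canUseRedZone() and the REDZONE handling in
// emitPrologue() below: with the aarch64-redzone option, a leaf function with
// no frame pointer whose locals fit in the 128 bytes below sp may leave sp
// completely unadjusted and address those locals at negative offsets from sp.)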
119 //
120 //===----------------------------------------------------------------------===//
121 
122 #include "AArch64FrameLowering.h"
123 #include "AArch64InstrInfo.h"
124 #include "AArch64MachineFunctionInfo.h"
125 #include "AArch64RegisterInfo.h"
126 #include "AArch64Subtarget.h"
127 #include "AArch64TargetMachine.h"
128 #include "MCTargetDesc/AArch64AddressingModes.h"
129 #include "llvm/ADT/ScopeExit.h"
130 #include "llvm/ADT/SmallVector.h"
131 #include "llvm/ADT/Statistic.h"
132 #include "llvm/CodeGen/LivePhysRegs.h"
133 #include "llvm/CodeGen/MachineBasicBlock.h"
134 #include "llvm/CodeGen/MachineFrameInfo.h"
135 #include "llvm/CodeGen/MachineFunction.h"
136 #include "llvm/CodeGen/MachineInstr.h"
137 #include "llvm/CodeGen/MachineInstrBuilder.h"
138 #include "llvm/CodeGen/MachineMemOperand.h"
139 #include "llvm/CodeGen/MachineModuleInfo.h"
140 #include "llvm/CodeGen/MachineOperand.h"
141 #include "llvm/CodeGen/MachineRegisterInfo.h"
142 #include "llvm/CodeGen/RegisterScavenging.h"
143 #include "llvm/CodeGen/TargetInstrInfo.h"
144 #include "llvm/CodeGen/TargetRegisterInfo.h"
145 #include "llvm/CodeGen/TargetSubtargetInfo.h"
146 #include "llvm/CodeGen/WinEHFuncInfo.h"
147 #include "llvm/IR/Attributes.h"
148 #include "llvm/IR/CallingConv.h"
149 #include "llvm/IR/DataLayout.h"
150 #include "llvm/IR/DebugLoc.h"
151 #include "llvm/IR/Function.h"
152 #include "llvm/MC/MCAsmInfo.h"
153 #include "llvm/MC/MCDwarf.h"
154 #include "llvm/Support/CommandLine.h"
155 #include "llvm/Support/Debug.h"
156 #include "llvm/Support/ErrorHandling.h"
157 #include "llvm/Support/LEB128.h"
158 #include "llvm/Support/MathExtras.h"
159 #include "llvm/Support/raw_ostream.h"
160 #include "llvm/Target/TargetMachine.h"
161 #include "llvm/Target/TargetOptions.h"
162 #include <cassert>
163 #include <cstdint>
164 #include <iterator>
165 #include <vector>
166 
167 using namespace llvm;
168 
169 #define DEBUG_TYPE "frame-info"
170 
171 static cl::opt<bool> EnableRedZone("aarch64-redzone",
172                                    cl::desc("enable use of redzone on AArch64"),
173                                    cl::init(false), cl::Hidden);
174 
175 static cl::opt<bool>
176     ReverseCSRRestoreSeq("reverse-csr-restore-seq",
177                          cl::desc("reverse the CSR restore sequence"),
178                          cl::init(false), cl::Hidden);
179 
180 static cl::opt<bool> StackTaggingMergeSetTag(
181     "stack-tagging-merge-settag",
182     cl::desc("merge settag instruction in function epilog"), cl::init(true),
183     cl::Hidden);
184 
185 static cl::opt<bool> OrderFrameObjects("aarch64-order-frame-objects",
186                                        cl::desc("sort stack allocations"),
187                                        cl::init(true), cl::Hidden);
188 
189 cl::opt<bool> EnableHomogeneousPrologEpilog(
190     "homogeneous-prolog-epilog", cl::init(false), cl::ZeroOrMore, cl::Hidden,
191     cl::desc("Emit homogeneous prologue and epilogue for the size "
192              "optimization (default = off)"));
193 
194 STATISTIC(NumRedZoneFunctions, "Number of functions using red zone");
195 
196 /// Returns how much of the incoming argument stack area (in bytes) we should
197 /// clean up in an epilogue. For the C calling convention this will be 0, for
198 /// guaranteed tail call conventions it can be positive (a normal return or a
199 /// tail call to a function that uses less stack space for arguments) or
200 /// negative (for a tail call to a function that needs more stack space than us
201 /// for arguments).
202 static int64_t getArgumentStackToRestore(MachineFunction &MF,
203                                          MachineBasicBlock &MBB) {
204   MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
205   bool IsTailCallReturn = false;
206   if (MBB.end() != MBBI) {
207     unsigned RetOpcode = MBBI->getOpcode();
208     IsTailCallReturn = RetOpcode == AArch64::TCRETURNdi ||
209                        RetOpcode == AArch64::TCRETURNri ||
210                        RetOpcode == AArch64::TCRETURNriBTI;
211   }
212   AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
213 
214   int64_t ArgumentPopSize = 0;
215   if (IsTailCallReturn) {
216     MachineOperand &StackAdjust = MBBI->getOperand(1);
217 
218     // For a tail-call in a callee-pops-arguments environment, some or all of
219     // the stack may actually be in use for the call's arguments; this is
220     // calculated during LowerCall and consumed here...
221     ArgumentPopSize = StackAdjust.getImm();
222   } else {
223     // ... otherwise the amount to pop is *all* of the argument space,
224     // conveniently stored in the MachineFunctionInfo by
225     // LowerFormalArguments. This will, of course, be zero for the C calling
226     // convention.
227     ArgumentPopSize = AFI->getArgumentStackToRestore();
228   }
229 
230   return ArgumentPopSize;
231 }
232 
233 static bool produceCompactUnwindFrame(MachineFunction &MF);
234 static bool needsWinCFI(const MachineFunction &MF);
235 static StackOffset getSVEStackSize(const MachineFunction &MF);
236 
237 /// Returns true if homogeneous prolog or epilog code can be emitted
238 /// for the size optimization. If possible, a frame helper call is injected.
239 /// When an Exit block is given, this check is for the epilog.
240 bool AArch64FrameLowering::homogeneousPrologEpilog(
241     MachineFunction &MF, MachineBasicBlock *Exit) const {
242   if (!MF.getFunction().hasMinSize())
243     return false;
244   if (!EnableHomogeneousPrologEpilog)
245     return false;
246   if (ReverseCSRRestoreSeq)
247     return false;
248   if (EnableRedZone)
249     return false;
250 
251   // TODO: Windows is not supported yet.
252   if (needsWinCFI(MF))
253     return false;
254   // TODO: SVE is not supported yet.
255   if (getSVEStackSize(MF))
256     return false;
257 
258   // Bail on stack adjustment needed on return for simplicity.
259   const MachineFrameInfo &MFI = MF.getFrameInfo();
260   const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo();
261   if (MFI.hasVarSizedObjects() || RegInfo->hasStackRealignment(MF))
262     return false;
263   if (Exit && getArgumentStackToRestore(MF, *Exit))
264     return false;
265 
266   return true;
267 }
268 
269 /// Returns true if CSRs should be paired.
270 bool AArch64FrameLowering::producePairRegisters(MachineFunction &MF) const {
271   return produceCompactUnwindFrame(MF) || homogeneousPrologEpilog(MF);
272 }
273 
274 /// This is the biggest offset to the stack pointer we can encode in aarch64
275 /// instructions (without using a separate calculation and a temp register).
276 /// Note that the exceptions here are vector stores/loads which cannot encode any
277 /// displacements (see estimateRSStackSizeLimit(), isAArch64FrameOffsetLegal()).
278 static const unsigned DefaultSafeSPDisplacement = 255;
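// (255 is the largest positive offset encodable in the signed 9-bit immediate
// of the unscaled LDUR/STUR addressing mode, whose range is [-256, 255].)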
279 
280 /// Look at each instruction that references stack frames and return the stack
281 /// size limit beyond which some of these instructions will require a scratch
282 /// register during their expansion later.
283 static unsigned estimateRSStackSizeLimit(MachineFunction &MF) {
284   // FIXME: For now, just conservatively guestimate based on unscaled indexing
285   // range. We'll end up allocating an unnecessary spill slot a lot, but
286   // realistically that's not a big deal at this stage of the game.
287   for (MachineBasicBlock &MBB : MF) {
288     for (MachineInstr &MI : MBB) {
289       if (MI.isDebugInstr() || MI.isPseudo() ||
290           MI.getOpcode() == AArch64::ADDXri ||
291           MI.getOpcode() == AArch64::ADDSXri)
292         continue;
293 
294       for (const MachineOperand &MO : MI.operands()) {
295         if (!MO.isFI())
296           continue;
297 
298         StackOffset Offset;
299         if (isAArch64FrameOffsetLegal(MI, Offset, nullptr, nullptr, nullptr) ==
300             AArch64FrameOffsetCannotUpdate)
301           return 0;
302       }
303     }
304   }
305   return DefaultSafeSPDisplacement;
306 }
307 
308 TargetStackID::Value
309 AArch64FrameLowering::getStackIDForScalableVectors() const {
310   return TargetStackID::ScalableVector;
311 }
312 
313 /// Returns the size of the fixed object area (allocated next to sp on entry).
314 /// On Win64 this may include a var args area and an UnwindHelp object for EH.
315 static unsigned getFixedObjectSize(const MachineFunction &MF,
316                                    const AArch64FunctionInfo *AFI, bool IsWin64,
317                                    bool IsFunclet) {
318   if (!IsWin64 || IsFunclet) {
319     return AFI->getTailCallReservedStack();
320   } else {
321     if (AFI->getTailCallReservedStack() != 0)
322       report_fatal_error("cannot generate ABI-changing tail call for Win64");
323     // Var args are stored here in the primary function.
324     const unsigned VarArgsArea = AFI->getVarArgsGPRSize();
325     // To support EH funclets we allocate an UnwindHelp object
326     const unsigned UnwindHelpObject = (MF.hasEHFunclets() ? 8 : 0);
327     return alignTo(VarArgsArea + UnwindHelpObject, 16);
328   }
329 }
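// For example, a Win64 vararg function with 32 bytes of saved GPR varargs and
// EH funclets would reserve alignTo(32 + 8, 16) = 48 bytes of fixed objects.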
330 
331 /// Returns the size of the entire SVE stackframe (calleesaves + spills).
332 static StackOffset getSVEStackSize(const MachineFunction &MF) {
333   const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
334   return StackOffset::getScalable((int64_t)AFI->getStackSizeSVE());
335 }
336 
337 bool AArch64FrameLowering::canUseRedZone(const MachineFunction &MF) const {
338   if (!EnableRedZone)
339     return false;
340 
341   // Don't use the red zone if the function explicitly asks us not to.
342   // This is typically used for kernel code.
343   const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
344   const unsigned RedZoneSize =
345       Subtarget.getTargetLowering()->getRedZoneSize(MF.getFunction());
346   if (!RedZoneSize)
347     return false;
348 
349   const MachineFrameInfo &MFI = MF.getFrameInfo();
350   const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
351   uint64_t NumBytes = AFI->getLocalStackSize();
352 
353   return !(MFI.hasCalls() || hasFP(MF) || NumBytes > RedZoneSize ||
354            getSVEStackSize(MF));
355 }
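// For example, with the aarch64-redzone option enabled, a function with 96
// bytes of locals, no calls, no frame pointer and no SVE objects passes this
// check and (see emitPrologue) keeps its locals in the red zone without any
// SP adjustment.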
356 
357 /// hasFP - Return true if the specified function should have a dedicated frame
358 /// pointer register.
359 bool AArch64FrameLowering::hasFP(const MachineFunction &MF) const {
360   const MachineFrameInfo &MFI = MF.getFrameInfo();
361   const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo();
362   // Win64 EH requires a frame pointer if funclets are present, as the locals
363   // are accessed off the frame pointer in both the parent function and the
364   // funclets.
365   if (MF.hasEHFunclets())
366     return true;
367   // Retain behavior of always omitting the FP for leaf functions when possible.
368   if (MF.getTarget().Options.DisableFramePointerElim(MF))
369     return true;
370   if (MFI.hasVarSizedObjects() || MFI.isFrameAddressTaken() ||
371       MFI.hasStackMap() || MFI.hasPatchPoint() ||
372       RegInfo->hasStackRealignment(MF))
373     return true;
374   // With large callframes around we may need to use FP to access the scavenging
375   // emergency spillslot.
376   //
377   // Unfortunately some calls to hasFP() like machine verifier ->
378   // getReservedReg() -> hasFP in the middle of global isel are too early
379   // to know the max call frame size. Hopefully conservatively returning "true"
380   // in those cases is fine.
381   // DefaultSafeSPDisplacement is fine as we only emergency spill GP regs.
382   if (!MFI.isMaxCallFrameSizeComputed() ||
383       MFI.getMaxCallFrameSize() > DefaultSafeSPDisplacement)
384     return true;
385 
386   return false;
387 }
388 
389 /// hasReservedCallFrame - Under normal circumstances, when a frame pointer is
390 /// not required, we reserve argument space for call sites in the function
391 /// immediately on entry to the current function.  This eliminates the need for
392 /// add/sub sp brackets around call sites.  Returns true if the call frame is
393 /// included as part of the stack frame.
394 bool
395 AArch64FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const {
396   return !MF.getFrameInfo().hasVarSizedObjects();
397 }
398 
399 MachineBasicBlock::iterator AArch64FrameLowering::eliminateCallFramePseudoInstr(
400     MachineFunction &MF, MachineBasicBlock &MBB,
401     MachineBasicBlock::iterator I) const {
402   const AArch64InstrInfo *TII =
403       static_cast<const AArch64InstrInfo *>(MF.getSubtarget().getInstrInfo());
404   DebugLoc DL = I->getDebugLoc();
405   unsigned Opc = I->getOpcode();
406   bool IsDestroy = Opc == TII->getCallFrameDestroyOpcode();
407   uint64_t CalleePopAmount = IsDestroy ? I->getOperand(1).getImm() : 0;
408 
409   if (!hasReservedCallFrame(MF)) {
410     int64_t Amount = I->getOperand(0).getImm();
411     Amount = alignTo(Amount, getStackAlign());
412     if (!IsDestroy)
413       Amount = -Amount;
414 
415     // N.b. if CalleePopAmount is valid but zero (i.e. callee would pop, but it
416     // doesn't have to pop anything), then the first operand will be zero too so
417     // this adjustment is a no-op.
418     if (CalleePopAmount == 0) {
419       // FIXME: in-function stack adjustment for calls is limited to 24-bits
420       // because there's no guaranteed temporary register available.
421       //
422       // ADD/SUB (immediate) has only LSL #0 and LSL #12 available.
423       // 1) For offset <= 12-bit, we use LSL #0
424       // 2) For 12-bit <= offset <= 24-bit, we use two instructions. One uses
425       // LSL #0, and the other uses LSL #12.
426       //
427       // Most call frames will be allocated at the start of a function so
428       // this is OK, but it is a limitation that needs dealing with.
429       assert(Amount > -0xffffff && Amount < 0xffffff && "call frame too large");
430       emitFrameOffset(MBB, I, DL, AArch64::SP, AArch64::SP,
431                       StackOffset::getFixed(Amount), TII);
432     }
433   } else if (CalleePopAmount != 0) {
434     // If the calling convention demands that the callee pops arguments from the
435     // stack, we want to add it back if we have a reserved call frame.
436     assert(CalleePopAmount < 0xffffff && "call frame too large");
437     emitFrameOffset(MBB, I, DL, AArch64::SP, AArch64::SP,
438                     StackOffset::getFixed(-(int64_t)CalleePopAmount), TII);
439   }
440   return MBB.erase(I);
441 }
442 
443 // Convenience function to create a DWARF expression for
444 //   Expr + NumBytes + NumVGScaledBytes * AArch64::VG
445 static void appendVGScaledOffsetExpr(SmallVectorImpl<char> &Expr,
446                                      int NumBytes, int NumVGScaledBytes, unsigned VG,
447                                      llvm::raw_string_ostream &Comment) {
448   uint8_t buffer[16];
449 
450   if (NumBytes) {
451     Expr.push_back(dwarf::DW_OP_consts);
452     Expr.append(buffer, buffer + encodeSLEB128(NumBytes, buffer));
453     Expr.push_back((uint8_t)dwarf::DW_OP_plus);
454     Comment << (NumBytes < 0 ? " - " : " + ") << std::abs(NumBytes);
455   }
456 
457   if (NumVGScaledBytes) {
458     Expr.push_back((uint8_t)dwarf::DW_OP_consts);
459     Expr.append(buffer, buffer + encodeSLEB128(NumVGScaledBytes, buffer));
460 
461     Expr.push_back((uint8_t)dwarf::DW_OP_bregx);
462     Expr.append(buffer, buffer + encodeULEB128(VG, buffer));
463     Expr.push_back(0);
464 
465     Expr.push_back((uint8_t)dwarf::DW_OP_mul);
466     Expr.push_back((uint8_t)dwarf::DW_OP_plus);
467 
468     Comment << (NumVGScaledBytes < 0 ? " - " : " + ")
469             << std::abs(NumVGScaledBytes) << " * VG";
470   }
471 }
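// For illustration, NumBytes = -16 and NumVGScaledBytes = -8 would append
// roughly:
//    DW_OP_consts -16, DW_OP_plus,
//    DW_OP_consts -8, DW_OP_bregx VG 0, DW_OP_mul, DW_OP_plus
// and Comment would read " - 16 - 8 * VG".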
472 
473 // Creates an MCCFIInstruction:
474 //    { DW_CFA_def_cfa_expression, ULEB128 (sizeof expr), expr }
475 MCCFIInstruction AArch64FrameLowering::createDefCFAExpressionFromSP(
476     const TargetRegisterInfo &TRI, const StackOffset &OffsetFromSP) const {
477   int64_t NumBytes, NumVGScaledBytes;
478   AArch64InstrInfo::decomposeStackOffsetForDwarfOffsets(OffsetFromSP, NumBytes,
479                                                         NumVGScaledBytes);
480 
481   std::string CommentBuffer = "sp";
482   llvm::raw_string_ostream Comment(CommentBuffer);
483 
484   // Build up the expression (SP + NumBytes + NumVGScaledBytes * AArch64::VG)
485   SmallString<64> Expr;
486   Expr.push_back((uint8_t)(dwarf::DW_OP_breg0 + /*SP*/ 31));
487   Expr.push_back(0);
488   appendVGScaledOffsetExpr(Expr, NumBytes, NumVGScaledBytes,
489                            TRI.getDwarfRegNum(AArch64::VG, true), Comment);
490 
491   // Wrap this into DW_CFA_def_cfa.
492   SmallString<64> DefCfaExpr;
493   DefCfaExpr.push_back(dwarf::DW_CFA_def_cfa_expression);
494   uint8_t buffer[16];
495   DefCfaExpr.append(buffer,
496                     buffer + encodeULEB128(Expr.size(), buffer));
497   DefCfaExpr.append(Expr.str());
498   return MCCFIInstruction::createEscape(nullptr, DefCfaExpr.str(),
499                                         Comment.str());
500 }
501 
502 MCCFIInstruction AArch64FrameLowering::createCfaOffset(
503     const TargetRegisterInfo &TRI, unsigned Reg,
504     const StackOffset &OffsetFromDefCFA) const {
505   int64_t NumBytes, NumVGScaledBytes;
506   AArch64InstrInfo::decomposeStackOffsetForDwarfOffsets(
507       OffsetFromDefCFA, NumBytes, NumVGScaledBytes);
508 
509   unsigned DwarfReg = TRI.getDwarfRegNum(Reg, true);
510 
511   // Non-scalable offsets can use DW_CFA_offset directly.
512   if (!NumVGScaledBytes)
513     return MCCFIInstruction::createOffset(nullptr, DwarfReg, NumBytes);
514 
515   std::string CommentBuffer;
516   llvm::raw_string_ostream Comment(CommentBuffer);
517   Comment << printReg(Reg, &TRI) << "  @ cfa";
518 
519   // Build up expression (NumBytes + NumVGScaledBytes * AArch64::VG)
520   SmallString<64> OffsetExpr;
521   appendVGScaledOffsetExpr(OffsetExpr, NumBytes, NumVGScaledBytes,
522                            TRI.getDwarfRegNum(AArch64::VG, true), Comment);
523 
524   // Wrap this into DW_CFA_expression
525   SmallString<64> CfaExpr;
526   CfaExpr.push_back(dwarf::DW_CFA_expression);
527   uint8_t buffer[16];
528   CfaExpr.append(buffer, buffer + encodeULEB128(DwarfReg, buffer));
529   CfaExpr.append(buffer, buffer + encodeULEB128(OffsetExpr.size(), buffer));
530   CfaExpr.append(OffsetExpr.str());
531 
532   return MCCFIInstruction::createEscape(nullptr, CfaExpr.str(), Comment.str());
533 }
534 
535 void AArch64FrameLowering::emitCalleeSavedFrameMoves(
536     MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) const {
537   MachineFunction &MF = *MBB.getParent();
538   MachineFrameInfo &MFI = MF.getFrameInfo();
539   const TargetSubtargetInfo &STI = MF.getSubtarget();
540   const TargetRegisterInfo *TRI = STI.getRegisterInfo();
541   const TargetInstrInfo *TII = STI.getInstrInfo();
542   DebugLoc DL = MBB.findDebugLoc(MBBI);
543 
544   // Add callee saved registers to move list.
545   const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo();
546   if (CSI.empty())
547     return;
548 
549   for (const auto &Info : CSI) {
550     unsigned Reg = Info.getReg();
551 
552     // Not all unwinders may know about SVE registers, so assume the lowest
553     // common denominator.
554     unsigned NewReg;
555     if (static_cast<const AArch64RegisterInfo *>(TRI)->regNeedsCFI(Reg, NewReg))
556       Reg = NewReg;
557     else
558       continue;
559 
560     StackOffset Offset;
561     if (MFI.getStackID(Info.getFrameIdx()) == TargetStackID::ScalableVector) {
562       AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
563       Offset =
564           StackOffset::getScalable(MFI.getObjectOffset(Info.getFrameIdx())) -
565           StackOffset::getFixed(AFI->getCalleeSavedStackSize(MFI));
566     } else {
567       Offset = StackOffset::getFixed(MFI.getObjectOffset(Info.getFrameIdx()) -
568                                      getOffsetOfLocalArea());
569     }
570     unsigned CFIIndex = MF.addFrameInst(createCfaOffset(*TRI, Reg, Offset));
571     BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
572         .addCFIIndex(CFIIndex)
573         .setMIFlags(MachineInstr::FrameSetup);
574   }
575 }
576 
577 // Find a scratch register that we can use at the start of the prologue to
578 // re-align the stack pointer.  We avoid using callee-save registers since they
579 // may appear to be free when this is called from canUseAsPrologue (during
580 // shrink wrapping), but then no longer be free when this is called from
581 // emitPrologue.
582 //
583 // FIXME: This is a bit conservative, since in the above case we could use one
584 // of the callee-save registers as a scratch temp to re-align the stack pointer,
585 // but we would then have to make sure that we were in fact saving at least one
586 // callee-save register in the prologue, which is additional complexity that
587 // doesn't seem worth the benefit.
588 static unsigned findScratchNonCalleeSaveRegister(MachineBasicBlock *MBB) {
589   MachineFunction *MF = MBB->getParent();
590 
591   // If MBB is an entry block, use X9 as the scratch register
592   if (&MF->front() == MBB)
593     return AArch64::X9;
594 
595   const AArch64Subtarget &Subtarget = MF->getSubtarget<AArch64Subtarget>();
596   const AArch64RegisterInfo &TRI = *Subtarget.getRegisterInfo();
597   LivePhysRegs LiveRegs(TRI);
598   LiveRegs.addLiveIns(*MBB);
599 
600   // Mark callee saved registers as used so we will not choose them.
601   const MCPhysReg *CSRegs = MF->getRegInfo().getCalleeSavedRegs();
602   for (unsigned i = 0; CSRegs[i]; ++i)
603     LiveRegs.addReg(CSRegs[i]);
604 
605   // Prefer X9 since it was historically used for the prologue scratch reg.
606   const MachineRegisterInfo &MRI = MF->getRegInfo();
607   if (LiveRegs.available(MRI, AArch64::X9))
608     return AArch64::X9;
609 
610   for (unsigned Reg : AArch64::GPR64RegClass) {
611     if (LiveRegs.available(MRI, Reg))
612       return Reg;
613   }
614   return AArch64::NoRegister;
615 }
616 
617 bool AArch64FrameLowering::canUseAsPrologue(
618     const MachineBasicBlock &MBB) const {
619   const MachineFunction *MF = MBB.getParent();
620   MachineBasicBlock *TmpMBB = const_cast<MachineBasicBlock *>(&MBB);
621   const AArch64Subtarget &Subtarget = MF->getSubtarget<AArch64Subtarget>();
622   const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
623 
624   // Don't need a scratch register if we're not going to re-align the stack.
625   if (!RegInfo->hasStackRealignment(*MF))
626     return true;
627   // Otherwise, we can use any block as long as it has a scratch register
628   // available.
629   return findScratchNonCalleeSaveRegister(TmpMBB) != AArch64::NoRegister;
630 }
631 
632 static bool windowsRequiresStackProbe(MachineFunction &MF,
633                                       uint64_t StackSizeInBytes) {
634   const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
635   if (!Subtarget.isTargetWindows())
636     return false;
637   const Function &F = MF.getFunction();
638   // TODO: When implementing stack protectors, take that into account
639   // for the probe threshold.
640   unsigned StackProbeSize = 4096;
641   if (F.hasFnAttribute("stack-probe-size"))
642     F.getFnAttribute("stack-probe-size")
643         .getValueAsString()
644         .getAsInteger(0, StackProbeSize);
645   return (StackSizeInBytes >= StackProbeSize) &&
646          !F.hasFnAttribute("no-stack-arg-probe");
647 }
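// A function can raise or lower the default 4096-byte threshold with an
// attribute such as "stack-probe-size"="8192", or opt out of probing entirely
// with "no-stack-arg-probe"; both attributes are read above.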
648 
649 static bool needsWinCFI(const MachineFunction &MF) {
650   const Function &F = MF.getFunction();
651   return MF.getTarget().getMCAsmInfo()->usesWindowsCFI() &&
652          F.needsUnwindTableEntry();
653 }
654 
655 bool AArch64FrameLowering::shouldCombineCSRLocalStackBump(
656     MachineFunction &MF, uint64_t StackBumpBytes) const {
657   AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
658   const MachineFrameInfo &MFI = MF.getFrameInfo();
659   const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
660   const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
661   if (homogeneousPrologEpilog(MF))
662     return false;
663 
664   if (AFI->getLocalStackSize() == 0)
665     return false;
666 
667   // For WinCFI, if optimizing for size, prefer to not combine the stack bump
668   // (to force a stp with predecrement) to match the packed unwind format,
669   // provided that there actually are any callee saved registers to merge the
670   // decrement with.
671   // This is potentially marginally slower, but allows using the packed
672   // unwind format for functions that both have a local area and callee saved
673   // registers. Using the packed unwind format notably reduces the size of
674   // the unwind info.
675   if (needsWinCFI(MF) && AFI->getCalleeSavedStackSize() > 0 &&
676       MF.getFunction().hasOptSize())
677     return false;
678 
679   // 512 is the maximum immediate for stp/ldp that will be used for
680   // callee-save save/restores
681   if (StackBumpBytes >= 512 || windowsRequiresStackProbe(MF, StackBumpBytes))
682     return false;
683 
684   if (MFI.hasVarSizedObjects())
685     return false;
686 
687   if (RegInfo->hasStackRealignment(MF))
688     return false;
689 
690   // This isn't strictly necessary, but it simplifies things a bit since the
691   // current RedZone handling code assumes the SP is adjusted by the
692   // callee-save save/restore code.
693   if (canUseRedZone(MF))
694     return false;
695 
696   // When there is an SVE area on the stack, always allocate the
697   // callee-saves and spills/locals separately.
698   if (getSVEStackSize(MF))
699     return false;
700 
701   return true;
702 }
703 
704 bool AArch64FrameLowering::shouldCombineCSRLocalStackBumpInEpilogue(
705     MachineBasicBlock &MBB, unsigned StackBumpBytes) const {
706   if (!shouldCombineCSRLocalStackBump(*MBB.getParent(), StackBumpBytes))
707     return false;
708 
709   if (MBB.empty())
710     return true;
711 
712   // Disable combined SP bump if the last instruction is an MTE tag store. It
713   // is almost always better to merge SP adjustment into those instructions.
714   MachineBasicBlock::iterator LastI = MBB.getFirstTerminator();
715   MachineBasicBlock::iterator Begin = MBB.begin();
716   while (LastI != Begin) {
717     --LastI;
718     if (LastI->isTransient())
719       continue;
720     if (!LastI->getFlag(MachineInstr::FrameDestroy))
721       break;
722   }
723   switch (LastI->getOpcode()) {
724   case AArch64::STGloop:
725   case AArch64::STZGloop:
726   case AArch64::STGOffset:
727   case AArch64::STZGOffset:
728   case AArch64::ST2GOffset:
729   case AArch64::STZ2GOffset:
730     return false;
731   default:
732     return true;
733   }
734   llvm_unreachable("unreachable");
735 }
736 
737 // Given a load or a store instruction, generate an appropriate unwinding SEH
738 // code on Windows.
739 static MachineBasicBlock::iterator InsertSEH(MachineBasicBlock::iterator MBBI,
740                                              const TargetInstrInfo &TII,
741                                              MachineInstr::MIFlag Flag) {
742   unsigned Opc = MBBI->getOpcode();
743   MachineBasicBlock *MBB = MBBI->getParent();
744   MachineFunction &MF = *MBB->getParent();
745   DebugLoc DL = MBBI->getDebugLoc();
746   unsigned ImmIdx = MBBI->getNumOperands() - 1;
747   int Imm = MBBI->getOperand(ImmIdx).getImm();
748   MachineInstrBuilder MIB;
749   const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
750   const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
751 
752   switch (Opc) {
753   default:
754     llvm_unreachable("No SEH Opcode for this instruction");
755   case AArch64::LDPDpost:
756     Imm = -Imm;
757     LLVM_FALLTHROUGH;
758   case AArch64::STPDpre: {
759     unsigned Reg0 = RegInfo->getSEHRegNum(MBBI->getOperand(1).getReg());
760     unsigned Reg1 = RegInfo->getSEHRegNum(MBBI->getOperand(2).getReg());
761     MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveFRegP_X))
762               .addImm(Reg0)
763               .addImm(Reg1)
764               .addImm(Imm * 8)
765               .setMIFlag(Flag);
766     break;
767   }
768   case AArch64::LDPXpost:
769     Imm = -Imm;
770     LLVM_FALLTHROUGH;
771   case AArch64::STPXpre: {
772     Register Reg0 = MBBI->getOperand(1).getReg();
773     Register Reg1 = MBBI->getOperand(2).getReg();
774     if (Reg0 == AArch64::FP && Reg1 == AArch64::LR)
775       MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveFPLR_X))
776                 .addImm(Imm * 8)
777                 .setMIFlag(Flag);
778     else
779       MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveRegP_X))
780                 .addImm(RegInfo->getSEHRegNum(Reg0))
781                 .addImm(RegInfo->getSEHRegNum(Reg1))
782                 .addImm(Imm * 8)
783                 .setMIFlag(Flag);
784     break;
785   }
786   case AArch64::LDRDpost:
787     Imm = -Imm;
788     LLVM_FALLTHROUGH;
789   case AArch64::STRDpre: {
790     unsigned Reg = RegInfo->getSEHRegNum(MBBI->getOperand(1).getReg());
791     MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveFReg_X))
792               .addImm(Reg)
793               .addImm(Imm)
794               .setMIFlag(Flag);
795     break;
796   }
797   case AArch64::LDRXpost:
798     Imm = -Imm;
799     LLVM_FALLTHROUGH;
800   case AArch64::STRXpre: {
801     unsigned Reg =  RegInfo->getSEHRegNum(MBBI->getOperand(1).getReg());
802     MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveReg_X))
803               .addImm(Reg)
804               .addImm(Imm)
805               .setMIFlag(Flag);
806     break;
807   }
808   case AArch64::STPDi:
809   case AArch64::LDPDi: {
810     unsigned Reg0 =  RegInfo->getSEHRegNum(MBBI->getOperand(0).getReg());
811     unsigned Reg1 =  RegInfo->getSEHRegNum(MBBI->getOperand(1).getReg());
812     MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveFRegP))
813               .addImm(Reg0)
814               .addImm(Reg1)
815               .addImm(Imm * 8)
816               .setMIFlag(Flag);
817     break;
818   }
819   case AArch64::STPXi:
820   case AArch64::LDPXi: {
821     Register Reg0 = MBBI->getOperand(0).getReg();
822     Register Reg1 = MBBI->getOperand(1).getReg();
823     if (Reg0 == AArch64::FP && Reg1 == AArch64::LR)
824       MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveFPLR))
825                 .addImm(Imm * 8)
826                 .setMIFlag(Flag);
827     else
828       MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveRegP))
829                 .addImm(RegInfo->getSEHRegNum(Reg0))
830                 .addImm(RegInfo->getSEHRegNum(Reg1))
831                 .addImm(Imm * 8)
832                 .setMIFlag(Flag);
833     break;
834   }
835   case AArch64::STRXui:
836   case AArch64::LDRXui: {
837     int Reg = RegInfo->getSEHRegNum(MBBI->getOperand(0).getReg());
838     MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveReg))
839               .addImm(Reg)
840               .addImm(Imm * 8)
841               .setMIFlag(Flag);
842     break;
843   }
844   case AArch64::STRDui:
845   case AArch64::LDRDui: {
846     unsigned Reg = RegInfo->getSEHRegNum(MBBI->getOperand(0).getReg());
847     MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveFReg))
848               .addImm(Reg)
849               .addImm(Imm * 8)
850               .setMIFlag(Flag);
851     break;
852   }
853   }
854   auto I = MBB->insertAfter(MBBI, MIB);
855   return I;
856 }
857 
858 // Fix up the SEH opcode associated with the save/restore instruction.
859 static void fixupSEHOpcode(MachineBasicBlock::iterator MBBI,
860                            unsigned LocalStackSize) {
861   MachineOperand *ImmOpnd = nullptr;
862   unsigned ImmIdx = MBBI->getNumOperands() - 1;
863   switch (MBBI->getOpcode()) {
864   default:
865     llvm_unreachable("Fix the offset in the SEH instruction");
866   case AArch64::SEH_SaveFPLR:
867   case AArch64::SEH_SaveRegP:
868   case AArch64::SEH_SaveReg:
869   case AArch64::SEH_SaveFRegP:
870   case AArch64::SEH_SaveFReg:
871     ImmOpnd = &MBBI->getOperand(ImmIdx);
872     break;
873   }
874   if (ImmOpnd)
875     ImmOpnd->setImm(ImmOpnd->getImm() + LocalStackSize);
876 }
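// For example, a SEH_SaveRegP carrying offset 16 in a function with a 48-byte
// local area is updated to carry offset 64 once the SP bumps are combined.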
877 
878 // Convert callee-save register save/restore instruction to do stack pointer
879 // decrement/increment to allocate/deallocate the callee-save stack area by
880 // converting store/load to use pre/post increment version.
881 static MachineBasicBlock::iterator convertCalleeSaveRestoreToSPPrePostIncDec(
882     MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
883     const DebugLoc &DL, const TargetInstrInfo *TII, int CSStackSizeInc,
884     bool NeedsWinCFI, bool *HasWinCFI, bool InProlog = true) {
885   // Ignore instructions that do not operate on SP, i.e. shadow call stack
886   // instructions and associated CFI instruction.
887   while (MBBI->getOpcode() == AArch64::STRXpost ||
888          MBBI->getOpcode() == AArch64::LDRXpre ||
889          MBBI->getOpcode() == AArch64::CFI_INSTRUCTION) {
890     if (MBBI->getOpcode() != AArch64::CFI_INSTRUCTION)
891       assert(MBBI->getOperand(0).getReg() != AArch64::SP);
892     ++MBBI;
893   }
894   unsigned NewOpc;
895   switch (MBBI->getOpcode()) {
896   default:
897     llvm_unreachable("Unexpected callee-save save/restore opcode!");
898   case AArch64::STPXi:
899     NewOpc = AArch64::STPXpre;
900     break;
901   case AArch64::STPDi:
902     NewOpc = AArch64::STPDpre;
903     break;
904   case AArch64::STPQi:
905     NewOpc = AArch64::STPQpre;
906     break;
907   case AArch64::STRXui:
908     NewOpc = AArch64::STRXpre;
909     break;
910   case AArch64::STRDui:
911     NewOpc = AArch64::STRDpre;
912     break;
913   case AArch64::STRQui:
914     NewOpc = AArch64::STRQpre;
915     break;
916   case AArch64::LDPXi:
917     NewOpc = AArch64::LDPXpost;
918     break;
919   case AArch64::LDPDi:
920     NewOpc = AArch64::LDPDpost;
921     break;
922   case AArch64::LDPQi:
923     NewOpc = AArch64::LDPQpost;
924     break;
925   case AArch64::LDRXui:
926     NewOpc = AArch64::LDRXpost;
927     break;
928   case AArch64::LDRDui:
929     NewOpc = AArch64::LDRDpost;
930     break;
931   case AArch64::LDRQui:
932     NewOpc = AArch64::LDRQpost;
933     break;
934   }
935   // Get rid of the SEH code associated with the old instruction.
936   if (NeedsWinCFI) {
937     auto SEH = std::next(MBBI);
938     if (AArch64InstrInfo::isSEHInstruction(*SEH))
939       SEH->eraseFromParent();
940   }
941 
942   TypeSize Scale = TypeSize::Fixed(1);
943   unsigned Width;
944   int64_t MinOffset, MaxOffset;
945   bool Success = static_cast<const AArch64InstrInfo *>(TII)->getMemOpInfo(
946       NewOpc, Scale, Width, MinOffset, MaxOffset);
947   (void)Success;
948   assert(Success && "unknown load/store opcode");
949 
950   // If the first store isn't right where we want SP then we can't fold the
951   // update in so create a normal arithmetic instruction instead.
952   if (MBBI->getOperand(MBBI->getNumOperands() - 1).getImm() != 0 ||
953       CSStackSizeInc < MinOffset || CSStackSizeInc > MaxOffset) {
954     emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP,
955                     StackOffset::getFixed(CSStackSizeInc), TII,
956                     InProlog ? MachineInstr::FrameSetup
957                              : MachineInstr::FrameDestroy);
958     return std::prev(MBBI);
959   }
960 
961   MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII->get(NewOpc));
962   MIB.addReg(AArch64::SP, RegState::Define);
963 
964   // Copy all operands other than the immediate offset.
965   unsigned OpndIdx = 0;
966   for (unsigned OpndEnd = MBBI->getNumOperands() - 1; OpndIdx < OpndEnd;
967        ++OpndIdx)
968     MIB.add(MBBI->getOperand(OpndIdx));
969 
970   assert(MBBI->getOperand(OpndIdx).getImm() == 0 &&
971          "Unexpected immediate offset in first/last callee-save save/restore "
972          "instruction!");
973   assert(MBBI->getOperand(OpndIdx - 1).getReg() == AArch64::SP &&
974          "Unexpected base register in callee-save save/restore instruction!");
975   assert(CSStackSizeInc % Scale == 0);
976   MIB.addImm(CSStackSizeInc / (int)Scale);
977 
978   MIB.setMIFlags(MBBI->getFlags());
979   MIB.setMemRefs(MBBI->memoperands());
980 
981   // Generate a new SEH code that corresponds to the new instruction.
982   if (NeedsWinCFI) {
983     *HasWinCFI = true;
984     InsertSEH(*MIB, *TII,
985               InProlog ? MachineInstr::FrameSetup : MachineInstr::FrameDestroy);
986   }
987 
988   return std::prev(MBB.erase(MBBI));
989 }
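// For illustration (not emitted verbatim), with CSStackSizeInc == -32 the
// first callee-save store
//    stp x19, x20, [sp]
// is rewritten above into the pre-indexed form
//    stp x19, x20, [sp, #-32]!
// which stores the pair and allocates the 32-byte callee-save area in one
// instruction; the matching epilogue load becomes a post-indexed ldp.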
990 
991 // Fixup callee-save register save/restore instructions to take into account
992 // combined SP bump by adding the local stack size to the stack offsets.
993 static void fixupCalleeSaveRestoreStackOffset(MachineInstr &MI,
994                                               uint64_t LocalStackSize,
995                                               bool NeedsWinCFI,
996                                               bool *HasWinCFI) {
997   if (AArch64InstrInfo::isSEHInstruction(MI))
998     return;
999 
1000   unsigned Opc = MI.getOpcode();
1001 
1002   // Ignore instructions that do not operate on SP, i.e. shadow call stack
1003   // instructions and associated CFI instruction.
1004   if (Opc == AArch64::STRXpost || Opc == AArch64::LDRXpre ||
1005       Opc == AArch64::CFI_INSTRUCTION) {
1006     if (Opc != AArch64::CFI_INSTRUCTION)
1007       assert(MI.getOperand(0).getReg() != AArch64::SP);
1008     return;
1009   }
1010 
1011   unsigned Scale;
1012   switch (Opc) {
1013   case AArch64::STPXi:
1014   case AArch64::STRXui:
1015   case AArch64::STPDi:
1016   case AArch64::STRDui:
1017   case AArch64::LDPXi:
1018   case AArch64::LDRXui:
1019   case AArch64::LDPDi:
1020   case AArch64::LDRDui:
1021     Scale = 8;
1022     break;
1023   case AArch64::STPQi:
1024   case AArch64::STRQui:
1025   case AArch64::LDPQi:
1026   case AArch64::LDRQui:
1027     Scale = 16;
1028     break;
1029   default:
1030     llvm_unreachable("Unexpected callee-save save/restore opcode!");
1031   }
1032 
1033   unsigned OffsetIdx = MI.getNumExplicitOperands() - 1;
1034   assert(MI.getOperand(OffsetIdx - 1).getReg() == AArch64::SP &&
1035          "Unexpected base register in callee-save save/restore instruction!");
1036   // Last operand is immediate offset that needs fixing.
1037   MachineOperand &OffsetOpnd = MI.getOperand(OffsetIdx);
1038   // All generated opcodes have scaled offsets.
1039   assert(LocalStackSize % Scale == 0);
1040   OffsetOpnd.setImm(OffsetOpnd.getImm() + LocalStackSize / Scale);
1041 
1042   if (NeedsWinCFI) {
1043     *HasWinCFI = true;
1044     auto MBBI = std::next(MachineBasicBlock::iterator(MI));
1045     assert(MBBI != MI.getParent()->end() && "Expecting a valid instruction");
1046     assert(AArch64InstrInfo::isSEHInstruction(*MBBI) &&
1047            "Expecting a SEH instruction");
1048     fixupSEHOpcode(MBBI, LocalStackSize);
1049   }
1050 }
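// For example, with a 64-byte local area folded into the callee-save bump,
// the save
//    stp x19, x20, [sp, #16]
// is rewritten as
//    stp x19, x20, [sp, #80]
// (its scaled immediate goes from 2 to 2 + 64/8 = 10).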
1051 
1052 static void adaptForLdStOpt(MachineBasicBlock &MBB,
1053                             MachineBasicBlock::iterator FirstSPPopI,
1054                             MachineBasicBlock::iterator LastPopI) {
1055   // Sometimes (when we restore in the same order as we save), we can end up
1056   // with code like this:
1057   //
1058   // ldp      x26, x25, [sp]
1059   // ldp      x24, x23, [sp, #16]
1060   // ldp      x22, x21, [sp, #32]
1061   // ldp      x20, x19, [sp, #48]
1062   // add      sp, sp, #64
1063   //
1064   // In this case, it is always better to put the first ldp at the end, so
1065   // that the load-store optimizer can run and merge the ldp and the add into
1066   // a post-index ldp.
1067   // If we managed to grab the first pop instruction, move it to the end.
1068   if (ReverseCSRRestoreSeq)
1069     MBB.splice(FirstSPPopI, &MBB, LastPopI);
1070   // We should end up with something like this now:
1071   //
1072   // ldp      x24, x23, [sp, #16]
1073   // ldp      x22, x21, [sp, #32]
1074   // ldp      x20, x19, [sp, #48]
1075   // ldp      x26, x25, [sp]
1076   // add      sp, sp, #64
1077   //
1078   // and the load-store optimizer can merge the last two instructions into:
1079   //
1080   // ldp      x26, x25, [sp], #64
1081   //
1082 }
1083 
1084 static bool isTargetWindows(const MachineFunction &MF) {
1085   return MF.getSubtarget<AArch64Subtarget>().isTargetWindows();
1086 }
1087 
1088 // Convenience function to determine whether I is an SVE callee save.
1089 static bool IsSVECalleeSave(MachineBasicBlock::iterator I) {
1090   switch (I->getOpcode()) {
1091   default:
1092     return false;
1093   case AArch64::STR_ZXI:
1094   case AArch64::STR_PXI:
1095   case AArch64::LDR_ZXI:
1096   case AArch64::LDR_PXI:
1097     return I->getFlag(MachineInstr::FrameSetup) ||
1098            I->getFlag(MachineInstr::FrameDestroy);
1099   }
1100 }
1101 
1102 void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
1103                                         MachineBasicBlock &MBB) const {
1104   MachineBasicBlock::iterator MBBI = MBB.begin();
1105   const MachineFrameInfo &MFI = MF.getFrameInfo();
1106   const Function &F = MF.getFunction();
1107   const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
1108   const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
1109   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
1110   MachineModuleInfo &MMI = MF.getMMI();
1111   AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
1112   bool needsFrameMoves =
1113       MF.needsFrameMoves() && !MF.getTarget().getMCAsmInfo()->usesWindowsCFI();
1114   bool HasFP = hasFP(MF);
1115   bool NeedsWinCFI = needsWinCFI(MF);
1116   bool HasWinCFI = false;
1117   auto Cleanup = make_scope_exit([&]() { MF.setHasWinCFI(HasWinCFI); });
1118 
1119   bool IsFunclet = MBB.isEHFuncletEntry();
1120 
1121   // At this point, we're going to decide whether or not the function uses a
1122   // redzone. In most cases, the function doesn't have a redzone so let's
1123   // assume that's false and set it to true in the case that there's a redzone.
1124   AFI->setHasRedZone(false);
1125 
1126   // Debug location must be unknown since the first debug location is used
1127   // to determine the end of the prologue.
1128   DebugLoc DL;
1129 
1130   const auto &MFnI = *MF.getInfo<AArch64FunctionInfo>();
1131   if (MFnI.shouldSignReturnAddress()) {
1132     if (MFnI.shouldSignWithBKey()) {
1133       BuildMI(MBB, MBBI, DL, TII->get(AArch64::EMITBKEY))
1134           .setMIFlag(MachineInstr::FrameSetup);
1135       BuildMI(MBB, MBBI, DL, TII->get(AArch64::PACIBSP))
1136           .setMIFlag(MachineInstr::FrameSetup);
1137     } else {
1138       BuildMI(MBB, MBBI, DL, TII->get(AArch64::PACIASP))
1139           .setMIFlag(MachineInstr::FrameSetup);
1140     }
1141 
1142     unsigned CFIIndex =
1143         MF.addFrameInst(MCCFIInstruction::createNegateRAState(nullptr));
1144     BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
1145         .addCFIIndex(CFIIndex)
1146         .setMIFlags(MachineInstr::FrameSetup);
1147   }
1148 
1149   // We signal the presence of a Swift extended frame to external tools by
1150   // storing FP with 0b0001 in bits 63:60. In normal userland operation a simple
1151   // ORR is sufficient; it is assumed a Swift kernel would initialize the TBI
1152   // bits so that is still true.
1153   if (HasFP && AFI->hasSwiftAsyncContext()) {
1154     // ORR x29, x29, #0x1000_0000_0000_0000
1155     BuildMI(MBB, MBBI, DL, TII->get(AArch64::ORRXri), AArch64::FP)
1156         .addUse(AArch64::FP)
1157         .addImm(0x1100)
1158         .setMIFlag(MachineInstr::FrameSetup);
1159   }
1160 
1161   // All calls are tail calls in GHC calling conv, and functions have no
1162   // prologue/epilogue.
1163   if (MF.getFunction().getCallingConv() == CallingConv::GHC)
1164     return;
1165 
1166   // Set tagged base pointer to the requested stack slot.
1167   // Ideally it should match SP value after prologue.
1168   Optional<int> TBPI = AFI->getTaggedBasePointerIndex();
1169   if (TBPI)
1170     AFI->setTaggedBasePointerOffset(-MFI.getObjectOffset(*TBPI));
1171   else
1172     AFI->setTaggedBasePointerOffset(MFI.getStackSize());
1173 
1174   const StackOffset &SVEStackSize = getSVEStackSize(MF);
1175 
1176   // getStackSize() includes all the locals in its size calculation. We don't
1177   // include these locals when computing the stack size of a funclet, as they
1178   // are allocated in the parent's stack frame and accessed via the frame
1179   // pointer from the funclet.  We only save the callee saved registers in the
1180   // funclet, which are really the callee saved registers of the parent
1181   // function, including the funclet.
1182   int64_t NumBytes = IsFunclet ? getWinEHFuncletFrameSize(MF)
1183                                : MFI.getStackSize();
1184   if (!AFI->hasStackFrame() && !windowsRequiresStackProbe(MF, NumBytes)) {
1185     assert(!HasFP && "unexpected function without stack frame but with FP");
1186     assert(!SVEStackSize &&
1187            "unexpected function without stack frame but with SVE objects");
1188     // All of the stack allocation is for locals.
1189     AFI->setLocalStackSize(NumBytes);
1190     if (!NumBytes)
1191       return;
1192     // REDZONE: If the stack size is less than 128 bytes, we don't need
1193     // to actually allocate.
1194     if (canUseRedZone(MF)) {
1195       AFI->setHasRedZone(true);
1196       ++NumRedZoneFunctions;
1197     } else {
1198       emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP,
1199                       StackOffset::getFixed(-NumBytes), TII,
1200                       MachineInstr::FrameSetup, false, NeedsWinCFI, &HasWinCFI);
1201       if (!NeedsWinCFI && needsFrameMoves) {
1202         // Label used to tie together the PROLOG_LABEL and the MachineMoves.
1203         MCSymbol *FrameLabel = MMI.getContext().createTempSymbol();
1204           // Encode the stack size of the leaf function.
1205         unsigned CFIIndex = MF.addFrameInst(
1206             MCCFIInstruction::cfiDefCfaOffset(FrameLabel, NumBytes));
1207         BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
1208             .addCFIIndex(CFIIndex)
1209             .setMIFlags(MachineInstr::FrameSetup);
1210       }
1211     }
1212 
1213     if (NeedsWinCFI) {
1214       HasWinCFI = true;
1215       BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_PrologEnd))
1216           .setMIFlag(MachineInstr::FrameSetup);
1217     }
1218 
1219     return;
1220   }
1221 
1222   bool IsWin64 =
1223       Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv());
1224   unsigned FixedObject = getFixedObjectSize(MF, AFI, IsWin64, IsFunclet);
1225 
1226   auto PrologueSaveSize = AFI->getCalleeSavedStackSize() + FixedObject;
1227   // All of the remaining stack allocations are for locals.
1228   AFI->setLocalStackSize(NumBytes - PrologueSaveSize);
1229   bool CombineSPBump = shouldCombineCSRLocalStackBump(MF, NumBytes);
1230   bool HomPrologEpilog = homogeneousPrologEpilog(MF);
1231   if (CombineSPBump) {
1232     assert(!SVEStackSize && "Cannot combine SP bump with SVE");
1233     emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP,
1234                     StackOffset::getFixed(-NumBytes), TII,
1235                     MachineInstr::FrameSetup, false, NeedsWinCFI, &HasWinCFI);
1236     NumBytes = 0;
1237   } else if (HomPrologEpilog) {
1238     // Stack has been already adjusted.
1239     NumBytes -= PrologueSaveSize;
1240   } else if (PrologueSaveSize != 0) {
1241     MBBI = convertCalleeSaveRestoreToSPPrePostIncDec(
1242         MBB, MBBI, DL, TII, -PrologueSaveSize, NeedsWinCFI, &HasWinCFI);
1243     NumBytes -= PrologueSaveSize;
1244   }
1245   assert(NumBytes >= 0 && "Negative stack allocation size!?");
1246 
1247   // Move past the saves of the callee-saved registers, fixing up the offsets
1248   // and pre-inc if we decided to combine the callee-save and local stack
1249   // pointer bump above.
1250   MachineBasicBlock::iterator End = MBB.end();
1251   while (MBBI != End && MBBI->getFlag(MachineInstr::FrameSetup) &&
1252          !IsSVECalleeSave(MBBI)) {
1253     if (CombineSPBump)
1254       fixupCalleeSaveRestoreStackOffset(*MBBI, AFI->getLocalStackSize(),
1255                                         NeedsWinCFI, &HasWinCFI);
1256     ++MBBI;
1257   }
1258 
1259   // For funclets the FP belongs to the containing function.
1260   if (!IsFunclet && HasFP) {
1261     // Only set up FP if we actually need to.
1262     int64_t FPOffset = AFI->getCalleeSaveBaseToFrameRecordOffset();
1263 
1264     if (CombineSPBump)
1265       FPOffset += AFI->getLocalStackSize();
1266 
1267     if (AFI->hasSwiftAsyncContext()) {
1268       // Before we update the live FP we have to ensure there's a valid (or
1269       // null) asynchronous context in its slot just before FP in the frame
1270       // record, so store it now.
1271       const auto &Attrs = MF.getFunction().getAttributes();
1272       bool HaveInitialContext = Attrs.hasAttrSomewhere(Attribute::SwiftAsync);
1273       if (HaveInitialContext)
1274         MBB.addLiveIn(AArch64::X22);
1275       BuildMI(MBB, MBBI, DL, TII->get(AArch64::StoreSwiftAsyncContext))
1276           .addUse(HaveInitialContext ? AArch64::X22 : AArch64::XZR)
1277           .addUse(AArch64::SP)
1278           .addImm(FPOffset - 8)
1279           .setMIFlags(MachineInstr::FrameSetup);
1280     }
1281 
1282     if (HomPrologEpilog) {
1283       auto Prolog = MBBI;
1284       --Prolog;
1285       assert(Prolog->getOpcode() == AArch64::HOM_Prolog);
1286       Prolog->addOperand(MachineOperand::CreateImm(FPOffset));
1287     } else {
1288       // Issue    sub fp, sp, FPOffset or
1289       //          mov fp,sp          when FPOffset is zero.
1290       // Note: All stores of callee-saved registers are marked as "FrameSetup".
1291       // This code marks the instruction(s) that set the FP also.
1292       emitFrameOffset(MBB, MBBI, DL, AArch64::FP, AArch64::SP,
1293                       StackOffset::getFixed(FPOffset), TII,
1294                       MachineInstr::FrameSetup, false, NeedsWinCFI, &HasWinCFI);
1295     }
1296   }
1297 
1298   if (windowsRequiresStackProbe(MF, NumBytes)) {
1299     uint64_t NumWords = NumBytes >> 4;
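    // Windows __chkstk expects the allocation size in x15 expressed in
    // 16-byte units; the SUB at the end of this block scales it back to
    // bytes with "uxtx #4".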
1300     if (NeedsWinCFI) {
1301       HasWinCFI = true;
1302       // alloc_l can hold at most 256MB, so assume that NumBytes doesn't
1303       // exceed this amount.  We need to move at most 2^24 - 1 into x15.
1304       // This is at most two instructions, MOVZ followed by MOVK.
1305       // TODO: Fix to use multiple stack alloc unwind codes for stacks
1306       // exceeding 256MB in size.
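      // For example, NumWords == 0x123450 would be materialized as
      //   movz x15, #0x3450
      //   movk x15, #0x12, lsl #16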
1307       if (NumBytes >= (1 << 28))
1308         report_fatal_error("Stack size cannot exceed 256MB for stack "
1309                             "unwinding purposes");
1310 
1311       uint32_t LowNumWords = NumWords & 0xFFFF;
1312       BuildMI(MBB, MBBI, DL, TII->get(AArch64::MOVZXi), AArch64::X15)
1313             .addImm(LowNumWords)
1314             .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0))
1315             .setMIFlag(MachineInstr::FrameSetup);
1316       BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
1317             .setMIFlag(MachineInstr::FrameSetup);
1318       if ((NumWords & 0xFFFF0000) != 0) {
1319           BuildMI(MBB, MBBI, DL, TII->get(AArch64::MOVKXi), AArch64::X15)
1320               .addReg(AArch64::X15)
1321               .addImm((NumWords & 0xFFFF0000) >> 16) // High half
1322               .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 16))
1323               .setMIFlag(MachineInstr::FrameSetup);
1324           BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
1325             .setMIFlag(MachineInstr::FrameSetup);
1326       }
1327     } else {
1328       BuildMI(MBB, MBBI, DL, TII->get(AArch64::MOVi64imm), AArch64::X15)
1329           .addImm(NumWords)
1330           .setMIFlags(MachineInstr::FrameSetup);
1331     }
1332 
1333     switch (MF.getTarget().getCodeModel()) {
1334     case CodeModel::Tiny:
1335     case CodeModel::Small:
1336     case CodeModel::Medium:
1337     case CodeModel::Kernel:
1338       BuildMI(MBB, MBBI, DL, TII->get(AArch64::BL))
1339           .addExternalSymbol("__chkstk")
1340           .addReg(AArch64::X15, RegState::Implicit)
1341           .addReg(AArch64::X16, RegState::Implicit | RegState::Define | RegState::Dead)
1342           .addReg(AArch64::X17, RegState::Implicit | RegState::Define | RegState::Dead)
1343           .addReg(AArch64::NZCV, RegState::Implicit | RegState::Define | RegState::Dead)
1344           .setMIFlags(MachineInstr::FrameSetup);
1345       if (NeedsWinCFI) {
1346         HasWinCFI = true;
1347         BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
1348             .setMIFlag(MachineInstr::FrameSetup);
1349       }
1350       break;
1351     case CodeModel::Large:
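      // MOVaddrEXT is a pseudo that carries separate hi/lo operands for the
      // same external symbol (hence __chkstk appearing twice below); it is
      // expanded into an address-materialization sequence later.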
1352       BuildMI(MBB, MBBI, DL, TII->get(AArch64::MOVaddrEXT))
1353           .addReg(AArch64::X16, RegState::Define)
1354           .addExternalSymbol("__chkstk")
1355           .addExternalSymbol("__chkstk")
1356           .setMIFlags(MachineInstr::FrameSetup);
1357       if (NeedsWinCFI) {
1358         HasWinCFI = true;
1359         BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
1360             .setMIFlag(MachineInstr::FrameSetup);
1361       }
1362 
1363       BuildMI(MBB, MBBI, DL, TII->get(getBLRCallOpcode(MF)))
1364           .addReg(AArch64::X16, RegState::Kill)
1365           .addReg(AArch64::X15, RegState::Implicit | RegState::Define)
1366           .addReg(AArch64::X16, RegState::Implicit | RegState::Define | RegState::Dead)
1367           .addReg(AArch64::X17, RegState::Implicit | RegState::Define | RegState::Dead)
1368           .addReg(AArch64::NZCV, RegState::Implicit | RegState::Define | RegState::Dead)
1369           .setMIFlags(MachineInstr::FrameSetup);
1370       if (NeedsWinCFI) {
1371         HasWinCFI = true;
1372         BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
1373             .setMIFlag(MachineInstr::FrameSetup);
1374       }
1375       break;
1376     }
1377 
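    // sub sp, sp, x15, uxtx #4: scale the 16-byte word count left in x15
    // back to bytes and move SP down below the probed area.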
1378     BuildMI(MBB, MBBI, DL, TII->get(AArch64::SUBXrx64), AArch64::SP)
1379         .addReg(AArch64::SP, RegState::Kill)
1380         .addReg(AArch64::X15, RegState::Kill)
1381         .addImm(AArch64_AM::getArithExtendImm(AArch64_AM::UXTX, 4))
1382         .setMIFlags(MachineInstr::FrameSetup);
1383     if (NeedsWinCFI) {
1384       HasWinCFI = true;
1385       BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_StackAlloc))
1386           .addImm(NumBytes)
1387           .setMIFlag(MachineInstr::FrameSetup);
1388     }
1389     NumBytes = 0;
1390   }
1391 
1392   StackOffset AllocateBefore = SVEStackSize, AllocateAfter = {};
1393   MachineBasicBlock::iterator CalleeSavesBegin = MBBI, CalleeSavesEnd = MBBI;
1394 
1395   // Process the SVE callee-saves to determine what space needs to be
1396   // allocated.
1397   if (int64_t CalleeSavedSize = AFI->getSVECalleeSavedStackSize()) {
1398     // Find callee save instructions in frame.
1399     CalleeSavesBegin = MBBI;
1400     assert(IsSVECalleeSave(CalleeSavesBegin) && "Unexpected instruction");
1401     while (IsSVECalleeSave(MBBI) && MBBI != MBB.getFirstTerminator())
1402       ++MBBI;
1403     CalleeSavesEnd = MBBI;
1404 
1405     AllocateBefore = StackOffset::getScalable(CalleeSavedSize);
1406     AllocateAfter = SVEStackSize - AllocateBefore;
1407   }
1408 
1409   // Allocate space for the callee saves (if any).
1410   emitFrameOffset(MBB, CalleeSavesBegin, DL, AArch64::SP, AArch64::SP,
1411                   -AllocateBefore, TII,
1412                   MachineInstr::FrameSetup);
1413 
1414   // Finally allocate remaining SVE stack space.
1415   emitFrameOffset(MBB, CalleeSavesEnd, DL, AArch64::SP, AArch64::SP,
1416                   -AllocateAfter, TII,
1417                   MachineInstr::FrameSetup);
1418 
1419   // Allocate space for the rest of the frame.
1420   if (NumBytes) {
1421     // Alignment is required for the parent frame, not the funclet
1422     const bool NeedsRealignment =
1423         !IsFunclet && RegInfo->hasStackRealignment(MF);
1424     unsigned scratchSPReg = AArch64::SP;
1425 
1426     if (NeedsRealignment) {
1427       scratchSPReg = findScratchNonCalleeSaveRegister(&MBB);
1428       assert(scratchSPReg != AArch64::NoRegister);
1429     }
1430 
1431     // If we're a leaf function, try using the red zone.
1432     if (!canUseRedZone(MF))
1433       // FIXME: in the case of dynamic re-alignment, NumBytes doesn't have
1434       // the correct value here, as NumBytes also includes padding bytes,
1435       // which shouldn't be counted here.
1436       emitFrameOffset(MBB, MBBI, DL, scratchSPReg, AArch64::SP,
1437                       StackOffset::getFixed(-NumBytes), TII,
1438                       MachineInstr::FrameSetup, false, NeedsWinCFI, &HasWinCFI);
1439 
1440     if (NeedsRealignment) {
1441       const unsigned NrBitsToZero = Log2(MFI.getMaxAlign());
1442       assert(NrBitsToZero > 1);
1443       assert(scratchSPReg != AArch64::SP);
1444 
1445       // SUB X9, SP, NumBytes
1446       //   -- X9 is a temporary register, so it shouldn't contain any live data
1447       //   -- here; it's free to use. This is already produced by emitFrameOffset above.
1448       // AND SP, X9, 0b11111...0000
1449       // The logical immediates have a non-trivial encoding. The following
1450       // formula computes the encoded immediate with all ones but
1451       // NrBitsToZero zero bits as least significant bits.
1452       uint32_t andMaskEncoded = (1 << 12)                         // = N
1453                                 | ((64 - NrBitsToZero) << 6)      // immr
1454                                 | ((64 - NrBitsToZero - 1) << 0); // imms
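      // For example, MFI.getMaxAlign() == 64 gives NrBitsToZero == 6, so
      // N=1, immr=58, imms=57, which decodes to the 64-bit logical immediate
      // 0xFFFFFFFFFFFFFFC0 and clears the low 6 bits of SP.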
1455 
1456       BuildMI(MBB, MBBI, DL, TII->get(AArch64::ANDXri), AArch64::SP)
1457           .addReg(scratchSPReg, RegState::Kill)
1458           .addImm(andMaskEncoded);
1459       AFI->setStackRealigned(true);
1460       if (NeedsWinCFI) {
1461         HasWinCFI = true;
1462         BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_StackAlloc))
1463             .addImm(NumBytes & andMaskEncoded)
1464             .setMIFlag(MachineInstr::FrameSetup);
1465       }
1466     }
1467   }
1468 
1469   // If we need a base pointer, set it up here. It's whatever the value of the
1470   // stack pointer is at this point. Any variable size objects will be allocated
1471   // after this, so we can still use the base pointer to reference locals.
1472   //
1473   // FIXME: Clarify FrameSetup flags here.
1474   // Note: Use emitFrameOffset() like above for FP if the FrameSetup flag is
1475   // needed.
1476   // For funclets the BP belongs to the containing function.
1477   if (!IsFunclet && RegInfo->hasBasePointer(MF)) {
1478     TII->copyPhysReg(MBB, MBBI, DL, RegInfo->getBaseRegister(), AArch64::SP,
1479                      false);
1480     if (NeedsWinCFI) {
1481       HasWinCFI = true;
1482       BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
1483           .setMIFlag(MachineInstr::FrameSetup);
1484     }
1485   }
1486 
1487   // The very last FrameSetup instruction indicates the end of prologue. Emit a
1488   // SEH opcode indicating the prologue end.
1489   if (NeedsWinCFI && HasWinCFI) {
1490     BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_PrologEnd))
1491         .setMIFlag(MachineInstr::FrameSetup);
1492   }
1493 
1494   // SEH funclets are passed the frame pointer in X1.  If the parent
1495   // function uses the base register, then the base register is used
1496   // directly, and is not retrieved from X1.
1497   if (IsFunclet && F.hasPersonalityFn()) {
1498     EHPersonality Per = classifyEHPersonality(F.getPersonalityFn());
1499     if (isAsynchronousEHPersonality(Per)) {
1500       BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::COPY), AArch64::FP)
1501           .addReg(AArch64::X1)
1502           .setMIFlag(MachineInstr::FrameSetup);
1503       MBB.addLiveIn(AArch64::X1);
1504     }
1505   }
1506 
1507   if (needsFrameMoves) {
1508     // An example of the prologue:
1509     //
1510     //     .globl __foo
1511     //     .align 2
1512     //  __foo:
1513     // Ltmp0:
1514     //     .cfi_startproc
1515     //     .cfi_personality 155, ___gxx_personality_v0
1516     // Leh_func_begin:
1517     //     .cfi_lsda 16, Lexception33
1518     //
1519     //     stp  xa,bx, [sp, -#offset]!
1520     //     ...
1521     //     stp  x28, x27, [sp, #offset-32]
1522     //     stp  fp, lr, [sp, #offset-16]
1523     //     add  fp, sp, #offset - 16
1524     //     sub  sp, sp, #1360
1525     //
1526     // The Stack:
1527     //       +-------------------------------------------+
1528     // 10000 | ........ | ........ | ........ | ........ |
1529     // 10004 | ........ | ........ | ........ | ........ |
1530     //       +-------------------------------------------+
1531     // 10008 | ........ | ........ | ........ | ........ |
1532     // 1000c | ........ | ........ | ........ | ........ |
1533     //       +===========================================+
1534     // 10010 |                X28 Register               |
1535     // 10014 |                X28 Register               |
1536     //       +-------------------------------------------+
1537     // 10018 |                X27 Register               |
1538     // 1001c |                X27 Register               |
1539     //       +===========================================+
1540     // 10020 |                Frame Pointer              |
1541     // 10024 |                Frame Pointer              |
1542     //       +-------------------------------------------+
1543     // 10028 |                Link Register              |
1544     // 1002c |                Link Register              |
1545     //       +===========================================+
1546     // 10030 | ........ | ........ | ........ | ........ |
1547     // 10034 | ........ | ........ | ........ | ........ |
1548     //       +-------------------------------------------+
1549     // 10038 | ........ | ........ | ........ | ........ |
1550     // 1003c | ........ | ........ | ........ | ........ |
1551     //       +-------------------------------------------+
1552     //
1553     //     [sp] = 10030        ::    >>initial value<<
1554     //     sp = 10020          ::  stp fp, lr, [sp, #-16]!
1555     //     fp = sp == 10020    ::  mov fp, sp
1556     //     [sp] == 10020       ::  stp x28, x27, [sp, #-16]!
1557     //     sp == 10010         ::    >>final value<<
1558     //
1559     // The frame pointer (w29) points to address 10020. If we use an offset of
1560     // '16' from 'w29', we get the CFI offsets of -8 for w30, -16 for w29, -24
1561     // for w27, and -32 for w28:
1562     //
1563     //  Ltmp1:
1564     //     .cfi_def_cfa w29, 16
1565     //  Ltmp2:
1566     //     .cfi_offset w30, -8
1567     //  Ltmp3:
1568     //     .cfi_offset w29, -16
1569     //  Ltmp4:
1570     //     .cfi_offset w27, -24
1571     //  Ltmp5:
1572     //     .cfi_offset w28, -32
1573 
1574     if (HasFP) {
1575       const int OffsetToFirstCalleeSaveFromFP =
1576           AFI->getCalleeSaveBaseToFrameRecordOffset() -
1577           AFI->getCalleeSavedStackSize();
1578       Register FramePtr = RegInfo->getFrameRegister(MF);
1579 
1580       // Define the current CFA rule to use the provided FP.
1581       unsigned Reg = RegInfo->getDwarfRegNum(FramePtr, true);
1582       unsigned CFIIndex = MF.addFrameInst(
1583           MCCFIInstruction::cfiDefCfa(nullptr, Reg, FixedObject - OffsetToFirstCalleeSaveFromFP));
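      // For a typical frame with no fixed objects and the frame record at the
      // top of the callee-save area, OffsetToFirstCalleeSaveFromFP is -16 and
      // this emits ".cfi_def_cfa w29, 16", as in the example above.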
1584       BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
1585           .addCFIIndex(CFIIndex)
1586           .setMIFlags(MachineInstr::FrameSetup);
1587     } else {
1588       unsigned CFIIndex;
1589       if (SVEStackSize) {
1590         const TargetSubtargetInfo &STI = MF.getSubtarget();
1591         const TargetRegisterInfo &TRI = *STI.getRegisterInfo();
1592         StackOffset TotalSize =
1593             SVEStackSize + StackOffset::getFixed((int64_t)MFI.getStackSize());
1594         CFIIndex = MF.addFrameInst(createDefCFAExpressionFromSP(TRI, TotalSize));
1595       } else {
1596         // Encode the stack size of the leaf function.
1597         CFIIndex = MF.addFrameInst(
1598             MCCFIInstruction::cfiDefCfaOffset(nullptr, MFI.getStackSize()));
1599       }
1600       BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
1601           .addCFIIndex(CFIIndex)
1602           .setMIFlags(MachineInstr::FrameSetup);
1603     }
1604 
1605     // Now emit the moves for whatever callee saved regs we have (including FP,
1606     // LR if those are saved).
1607     emitCalleeSavedFrameMoves(MBB, MBBI);
1608   }
1609 }
1610 
1611 static void InsertReturnAddressAuth(MachineFunction &MF,
1612                                     MachineBasicBlock &MBB) {
1613   const auto &MFI = *MF.getInfo<AArch64FunctionInfo>();
1614   if (!MFI.shouldSignReturnAddress())
1615     return;
1616   const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
1617   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
1618 
1619   MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator();
1620   DebugLoc DL;
1621   if (MBBI != MBB.end())
1622     DL = MBBI->getDebugLoc();
1623 
1624   // The AUTIASP instruction assembles to a hint instruction before v8.3a so
1625   // this instruction can be safely used for any v8a architecture.
1626   // From v8.3a onwards there are optimised authenticate LR and return
1627   // instructions, namely RETA{A,B}, that can be used instead.
1628   if (Subtarget.hasPAuth() && MBBI != MBB.end() &&
1629       MBBI->getOpcode() == AArch64::RET_ReallyLR) {
1630     BuildMI(MBB, MBBI, DL,
1631             TII->get(MFI.shouldSignWithBKey() ? AArch64::RETAB : AArch64::RETAA))
1632         .copyImplicitOps(*MBBI);
1633     MBB.erase(MBBI);
1634   } else {
1635     BuildMI(
1636         MBB, MBBI, DL,
1637         TII->get(MFI.shouldSignWithBKey() ? AArch64::AUTIBSP : AArch64::AUTIASP))
1638         .setMIFlag(MachineInstr::FrameDestroy);
1639   }
1640 }
1641 
1642 static bool isFuncletReturnInstr(const MachineInstr &MI) {
1643   switch (MI.getOpcode()) {
1644   default:
1645     return false;
1646   case AArch64::CATCHRET:
1647   case AArch64::CLEANUPRET:
1648     return true;
1649   }
1650 }
1651 
1652 void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
1653                                         MachineBasicBlock &MBB) const {
1654   MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
1655   MachineFrameInfo &MFI = MF.getFrameInfo();
1656   const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
1657   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
1658   DebugLoc DL;
1659   bool NeedsWinCFI = needsWinCFI(MF);
1660   bool HasWinCFI = false;
1661   bool IsFunclet = false;
1662   auto WinCFI = make_scope_exit([&]() { assert(HasWinCFI == MF.hasWinCFI()); });
1663 
1664   if (MBB.end() != MBBI) {
1665     DL = MBBI->getDebugLoc();
1666     IsFunclet = isFuncletReturnInstr(*MBBI);
1667   }
1668 
1669   int64_t NumBytes = IsFunclet ? getWinEHFuncletFrameSize(MF)
1670                                : MFI.getStackSize();
1671   AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
1672 
1673   // All calls are tail calls in GHC calling conv, and functions have no
1674   // prologue/epilogue.
1675   if (MF.getFunction().getCallingConv() == CallingConv::GHC)
1676     return;
1677 
1678   // How much of the stack used by incoming arguments this function is expected
1679   // to restore in this particular epilogue.
1680   int64_t ArgumentStackToRestore = getArgumentStackToRestore(MF, MBB);
1681 
1682   // The stack frame should be like below,
1683   //
1684   //      ----------------------                     ---
1685   //      |                    |                      |
1686   //      | BytesInStackArgArea|              CalleeArgStackSize
1687   //      | (NumReusableBytes) |                (of tail call)
1688   //      |                    |                     ---
1689   //      |                    |                      |
1690   //      ---------------------|        ---           |
1691   //      |                    |         |            |
1692   //      |   CalleeSavedReg   |         |            |
1693   //      | (CalleeSavedStackSize)|      |            |
1694   //      |                    |         |            |
1695   //      ---------------------|         |         NumBytes
1696   //      |                    |     StackSize  (StackAdjustUp)
1697   //      |   LocalStackSize   |         |            |
1698   //      | (covering callee   |         |            |
1699   //      |       args)        |         |            |
1700   //      |                    |         |            |
1701   //      ----------------------        ---          ---
1702   //
1703   // So NumBytes = StackSize + BytesInStackArgArea - CalleeArgStackSize
1704   //             = StackSize + ArgumentPopSize
1705   //
1706   // AArch64TargetLowering::LowerCall figures out ArgumentPopSize and keeps
1707   // it as the 2nd argument of AArch64ISD::TC_RETURN.
1708 
1709   auto Cleanup = make_scope_exit([&] { InsertReturnAddressAuth(MF, MBB); });
1710 
1711   bool IsWin64 =
1712       Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv());
1713   unsigned FixedObject = getFixedObjectSize(MF, AFI, IsWin64, IsFunclet);
1714 
1715   int64_t AfterCSRPopSize = ArgumentStackToRestore;
1716   auto PrologueSaveSize = AFI->getCalleeSavedStackSize() + FixedObject;
1717   // We cannot rely on the local stack size set in emitPrologue if the function
1718   // has funclets, as funclets have different local stack size requirements, and
1719   // the current value set in emitPrologue may be that of the containing
1720   // function.
1721   if (MF.hasEHFunclets())
1722     AFI->setLocalStackSize(NumBytes - PrologueSaveSize);
1723   if (homogeneousPrologEpilog(MF, &MBB)) {
1724     assert(!NeedsWinCFI);
1725     auto LastPopI = MBB.getFirstTerminator();
1726     if (LastPopI != MBB.begin()) {
1727       auto HomogeneousEpilog = std::prev(LastPopI);
1728       if (HomogeneousEpilog->getOpcode() == AArch64::HOM_Epilog)
1729         LastPopI = HomogeneousEpilog;
1730     }
1731 
1732     // Adjust local stack
1733     emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP,
1734                     StackOffset::getFixed(-AFI->getLocalStackSize()), TII,
1735                     MachineInstr::FrameDestroy, false, NeedsWinCFI);
1736 
1737     // SP has already been adjusted while restoring callee save regs.
1738     // We've already bailed out of the case that adjusts SP for arguments.
1739     assert(AfterCSRPopSize == 0);
1740     return;
1741   }
1742   bool CombineSPBump = shouldCombineCSRLocalStackBumpInEpilogue(MBB, NumBytes);
1743   // Assume we can't combine the last pop with the sp restore.
1744 
1745   if (!CombineSPBump && PrologueSaveSize != 0) {
1746     MachineBasicBlock::iterator Pop = std::prev(MBB.getFirstTerminator());
1747     while (AArch64InstrInfo::isSEHInstruction(*Pop))
1748       Pop = std::prev(Pop);
1749     // Converting the last ldp to a post-index ldp is valid only if the last
1750     // ldp's offset is 0.
1751     const MachineOperand &OffsetOp = Pop->getOperand(Pop->getNumOperands() - 1);
1752     // If the offset is 0 and the AfterCSR pop is not actually trying to
1753     // allocate more stack for arguments (in space that an untimely interrupt
1754     // may clobber), convert it to a post-index ldp.
1755     if (OffsetOp.getImm() == 0 && AfterCSRPopSize >= 0)
1756       convertCalleeSaveRestoreToSPPrePostIncDec(
1757           MBB, Pop, DL, TII, PrologueSaveSize, NeedsWinCFI, &HasWinCFI, false);
1758     else {
1759       // If not, make sure to emit an add after the last ldp.
1760       // We're doing this by transferring the size to be restored from the
1761       // adjustment *before* the CSR pops to the adjustment *after* the CSR
1762       // pops.
1763       AfterCSRPopSize += PrologueSaveSize;
1764     }
1765   }
1766 
1767   // Move past the restores of the callee-saved registers.
1768   // If we plan on combining the sp bump of the local stack size and the callee
1769   // save stack size, we might need to adjust the CSR save and restore offsets.
1770   MachineBasicBlock::iterator LastPopI = MBB.getFirstTerminator();
1771   MachineBasicBlock::iterator Begin = MBB.begin();
1772   while (LastPopI != Begin) {
1773     --LastPopI;
1774     if (!LastPopI->getFlag(MachineInstr::FrameDestroy) ||
1775         IsSVECalleeSave(LastPopI)) {
1776       ++LastPopI;
1777       break;
1778     } else if (CombineSPBump)
1779       fixupCalleeSaveRestoreStackOffset(*LastPopI, AFI->getLocalStackSize(),
1780                                         NeedsWinCFI, &HasWinCFI);
1781   }
1782 
1783   if (MF.hasWinCFI()) {
1784     // If the prologue didn't contain any SEH opcodes and didn't set the
1785     // MF.hasWinCFI() flag, assume the epilogue won't either, and skip the
1786     // EpilogStart - to avoid generating CFI for functions that don't need it.
1787     // (And as we didn't generate any prologue at all, it would be asymmetrical
1788     // to the epilogue.) By the end of the function, we assert that
1789     // HasWinCFI is equal to MF.hasWinCFI(), to verify this assumption.
1790     HasWinCFI = true;
1791     BuildMI(MBB, LastPopI, DL, TII->get(AArch64::SEH_EpilogStart))
1792         .setMIFlag(MachineInstr::FrameDestroy);
1793   }
1794 
1795   if (hasFP(MF) && AFI->hasSwiftAsyncContext()) {
1796     // We need to reset FP to its untagged state on return. Bit 60 is currently
1797     // used to show the presence of an extended frame.
1798 
1799     // BIC x29, x29, #0x1000_0000_0000_0000
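    // (0x10fe encodes N=1, immr=3, imms=62, i.e. the 64-bit logical immediate
    // ~(1ULL << 60), so the AND below clears only bit 60.)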
1800     BuildMI(MBB, MBB.getFirstTerminator(), DL, TII->get(AArch64::ANDXri),
1801             AArch64::FP)
1802         .addUse(AArch64::FP)
1803         .addImm(0x10fe)
1804         .setMIFlag(MachineInstr::FrameDestroy);
1805   }
1806 
1807   const StackOffset &SVEStackSize = getSVEStackSize(MF);
1808 
1809   // If there is a single SP update, insert it before the ret and we're done.
1810   if (CombineSPBump) {
1811     assert(!SVEStackSize && "Cannot combine SP bump with SVE");
1812     emitFrameOffset(MBB, MBB.getFirstTerminator(), DL, AArch64::SP, AArch64::SP,
1813                     StackOffset::getFixed(NumBytes + (int64_t)AfterCSRPopSize),
1814                     TII, MachineInstr::FrameDestroy, false, NeedsWinCFI,
1815                     &HasWinCFI);
1816     if (HasWinCFI)
1817       BuildMI(MBB, MBB.getFirstTerminator(), DL,
1818               TII->get(AArch64::SEH_EpilogEnd))
1819           .setMIFlag(MachineInstr::FrameDestroy);
1820     return;
1821   }
1822 
1823   NumBytes -= PrologueSaveSize;
1824   assert(NumBytes >= 0 && "Negative stack allocation size!?");
1825 
1826   // Process the SVE callee-saves to determine what space needs to be
1827   // deallocated.
1828   StackOffset DeallocateBefore = {}, DeallocateAfter = SVEStackSize;
1829   MachineBasicBlock::iterator RestoreBegin = LastPopI, RestoreEnd = LastPopI;
1830   if (int64_t CalleeSavedSize = AFI->getSVECalleeSavedStackSize()) {
1831     RestoreBegin = std::prev(RestoreEnd);
1832     while (RestoreBegin != MBB.begin() &&
1833            IsSVECalleeSave(std::prev(RestoreBegin)))
1834       --RestoreBegin;
1835 
1836     assert(IsSVECalleeSave(RestoreBegin) &&
1837            IsSVECalleeSave(std::prev(RestoreEnd)) && "Unexpected instruction");
1838 
1839     StackOffset CalleeSavedSizeAsOffset =
1840         StackOffset::getScalable(CalleeSavedSize);
1841     DeallocateBefore = SVEStackSize - CalleeSavedSizeAsOffset;
1842     DeallocateAfter = CalleeSavedSizeAsOffset;
1843   }
1844 
1845   // Deallocate the SVE area.
1846   if (SVEStackSize) {
1847     if (AFI->isStackRealigned()) {
1848       if (int64_t CalleeSavedSize = AFI->getSVECalleeSavedStackSize())
1849         // Set SP to the start of the SVE callee-save area from which they can
1850         // be reloaded. The code below will deallocate the stack space
1851         // by moving FP -> SP.
1852         emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, AArch64::FP,
1853                         StackOffset::getScalable(-CalleeSavedSize), TII,
1854                         MachineInstr::FrameDestroy);
1855     } else {
1856       if (AFI->getSVECalleeSavedStackSize()) {
1857         // Deallocate the non-SVE locals first before we can deallocate (and
1858         // restore callee saves) from the SVE area.
1859         emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, AArch64::SP,
1860                         StackOffset::getFixed(NumBytes), TII,
1861                         MachineInstr::FrameDestroy);
1862         NumBytes = 0;
1863       }
1864 
1865       emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, AArch64::SP,
1866                       DeallocateBefore, TII, MachineInstr::FrameDestroy);
1867 
1868       emitFrameOffset(MBB, RestoreEnd, DL, AArch64::SP, AArch64::SP,
1869                       DeallocateAfter, TII, MachineInstr::FrameDestroy);
1870     }
1871   }
1872 
1873   if (!hasFP(MF)) {
1874     bool RedZone = canUseRedZone(MF);
1875     // If this was a redzone leaf function, we don't need to restore the
1876     // stack pointer (but we may need to pop stack args for fastcc).
1877     if (RedZone && AfterCSRPopSize == 0)
1878       return;
1879 
1880     bool NoCalleeSaveRestore = PrologueSaveSize == 0;
1881     int64_t StackRestoreBytes = RedZone ? 0 : NumBytes;
1882     if (NoCalleeSaveRestore)
1883       StackRestoreBytes += AfterCSRPopSize;
1884 
1885     // If we were able to combine the local stack pop with the argument pop,
1886     // then we're done.
1887     bool Done = NoCalleeSaveRestore || AfterCSRPopSize == 0;
1888 
1889     // If we're done after this, make sure to help the load store optimizer.
1890     if (Done)
1891       adaptForLdStOpt(MBB, MBB.getFirstTerminator(), LastPopI);
1892 
1893     emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP,
1894                     StackOffset::getFixed(StackRestoreBytes), TII,
1895                     MachineInstr::FrameDestroy, false, NeedsWinCFI, &HasWinCFI);
1896     if (Done) {
1897       if (HasWinCFI) {
1898         BuildMI(MBB, MBB.getFirstTerminator(), DL,
1899                 TII->get(AArch64::SEH_EpilogEnd))
1900             .setMIFlag(MachineInstr::FrameDestroy);
1901       }
1902       return;
1903     }
1904 
1905     NumBytes = 0;
1906   }
1907 
1908   // Restore the original stack pointer.
1909   // FIXME: Rather than doing the math here, we should instead just use
1910   // non-post-indexed loads for the restores if we aren't actually going to
1911   // be able to save any instructions.
1912   if (!IsFunclet && (MFI.hasVarSizedObjects() || AFI->isStackRealigned())) {
1913     emitFrameOffset(
1914         MBB, LastPopI, DL, AArch64::SP, AArch64::FP,
1915         StackOffset::getFixed(-AFI->getCalleeSaveBaseToFrameRecordOffset()),
1916         TII, MachineInstr::FrameDestroy, false, NeedsWinCFI);
1917   } else if (NumBytes)
1918     emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP,
1919                     StackOffset::getFixed(NumBytes), TII,
1920                     MachineInstr::FrameDestroy, false, NeedsWinCFI);
1921 
1922   // This must be placed after the callee-save restore code because that code
1923   // assumes the SP is at the same location as it was after the callee-save save
1924   // code in the prologue.
1925   if (AfterCSRPopSize) {
1926     assert(AfterCSRPopSize > 0 && "attempting to reallocate arg stack that an "
1927                                   "interrupt may have clobbered");
1928     // Find an insertion point for the first ldp so that it goes before the
1929     // shadow call stack epilog instruction. This ensures that the restore of
1930     // lr from x18 is placed after the restore from sp.
1931     auto FirstSPPopI = MBB.getFirstTerminator();
1932     while (FirstSPPopI != Begin) {
1933       auto Prev = std::prev(FirstSPPopI);
1934       if (Prev->getOpcode() != AArch64::LDRXpre ||
1935           Prev->getOperand(0).getReg() == AArch64::SP)
1936         break;
1937       FirstSPPopI = Prev;
1938     }
1939 
1940     adaptForLdStOpt(MBB, FirstSPPopI, LastPopI);
1941 
1942     emitFrameOffset(MBB, FirstSPPopI, DL, AArch64::SP, AArch64::SP,
1943                     StackOffset::getFixed(AfterCSRPopSize), TII,
1944                     MachineInstr::FrameDestroy, false, NeedsWinCFI, &HasWinCFI);
1945   }
1946   if (HasWinCFI)
1947     BuildMI(MBB, MBB.getFirstTerminator(), DL, TII->get(AArch64::SEH_EpilogEnd))
1948         .setMIFlag(MachineInstr::FrameDestroy);
1949 }
1950 
1951 /// getFrameIndexReference - Provide a base+offset reference to an FI slot for
1952 /// debug info.  It's the same as what we use for resolving the code-gen
1953 /// references for now.  FIXME: This can go wrong when references are
1954 /// SP-relative and simple call frames aren't used.
1955 StackOffset
1956 AArch64FrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI,
1957                                              Register &FrameReg) const {
1958   return resolveFrameIndexReference(
1959       MF, FI, FrameReg,
1960       /*PreferFP=*/
1961       MF.getFunction().hasFnAttribute(Attribute::SanitizeHWAddress),
1962       /*ForSimm=*/false);
1963 }
1964 
1965 StackOffset
1966 AArch64FrameLowering::getNonLocalFrameIndexReference(const MachineFunction &MF,
1967                                                      int FI) const {
1968   return StackOffset::getFixed(getSEHFrameIndexOffset(MF, FI));
1969 }
1970 
1971 static StackOffset getFPOffset(const MachineFunction &MF,
1972                                int64_t ObjectOffset) {
1973   const auto *AFI = MF.getInfo<AArch64FunctionInfo>();
1974   const auto &Subtarget = MF.getSubtarget<AArch64Subtarget>();
1975   bool IsWin64 =
1976       Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv());
1977   unsigned FixedObject =
1978       getFixedObjectSize(MF, AFI, IsWin64, /*IsFunclet=*/false);
1979   int64_t CalleeSaveSize = AFI->getCalleeSavedStackSize(MF.getFrameInfo());
1980   int64_t FPAdjust =
1981       CalleeSaveSize - AFI->getCalleeSaveBaseToFrameRecordOffset();
1982   return StackOffset::getFixed(ObjectOffset + FixedObject + FPAdjust);
1983 }
1984 
1985 static StackOffset getStackOffset(const MachineFunction &MF,
1986                                   int64_t ObjectOffset) {
1987   const auto &MFI = MF.getFrameInfo();
1988   return StackOffset::getFixed(ObjectOffset + (int64_t)MFI.getStackSize());
1989 }
1990 
1991   // TODO: This function currently does not work for scalable vectors.
1992 int AArch64FrameLowering::getSEHFrameIndexOffset(const MachineFunction &MF,
1993                                                  int FI) const {
1994   const auto *RegInfo = static_cast<const AArch64RegisterInfo *>(
1995       MF.getSubtarget().getRegisterInfo());
1996   int ObjectOffset = MF.getFrameInfo().getObjectOffset(FI);
1997   return RegInfo->getLocalAddressRegister(MF) == AArch64::FP
1998              ? getFPOffset(MF, ObjectOffset).getFixed()
1999              : getStackOffset(MF, ObjectOffset).getFixed();
2000 }
2001 
2002 StackOffset AArch64FrameLowering::resolveFrameIndexReference(
2003     const MachineFunction &MF, int FI, Register &FrameReg, bool PreferFP,
2004     bool ForSimm) const {
2005   const auto &MFI = MF.getFrameInfo();
2006   int64_t ObjectOffset = MFI.getObjectOffset(FI);
2007   bool isFixed = MFI.isFixedObjectIndex(FI);
2008   bool isSVE = MFI.getStackID(FI) == TargetStackID::ScalableVector;
2009   return resolveFrameOffsetReference(MF, ObjectOffset, isFixed, isSVE, FrameReg,
2010                                      PreferFP, ForSimm);
2011 }
2012 
2013 StackOffset AArch64FrameLowering::resolveFrameOffsetReference(
2014     const MachineFunction &MF, int64_t ObjectOffset, bool isFixed, bool isSVE,
2015     Register &FrameReg, bool PreferFP, bool ForSimm) const {
2016   const auto &MFI = MF.getFrameInfo();
2017   const auto *RegInfo = static_cast<const AArch64RegisterInfo *>(
2018       MF.getSubtarget().getRegisterInfo());
2019   const auto *AFI = MF.getInfo<AArch64FunctionInfo>();
2020   const auto &Subtarget = MF.getSubtarget<AArch64Subtarget>();
2021 
2022   int64_t FPOffset = getFPOffset(MF, ObjectOffset).getFixed();
2023   int64_t Offset = getStackOffset(MF, ObjectOffset).getFixed();
2024   bool isCSR =
2025       !isFixed && ObjectOffset >= -((int)AFI->getCalleeSavedStackSize(MFI));
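  // A non-fixed object whose offset lies within the top CalleeSavedStackSize
  // bytes of the frame is treated as part of the callee-save (CSR) area.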
2026 
2027   const StackOffset &SVEStackSize = getSVEStackSize(MF);
2028 
2029   // Use frame pointer to reference fixed objects. Use it for locals if
2030   // there are VLAs or a dynamically realigned SP (and thus the SP isn't
2031   // reliable as a base). Make sure useFPForScavengingIndex() does the
2032   // right thing for the emergency spill slot.
2033   bool UseFP = false;
2034   if (AFI->hasStackFrame() && !isSVE) {
2035     // We shouldn't prefer using the FP when there is an SVE area
2036     // in between the FP and the non-SVE locals/spills.
2037     PreferFP &= !SVEStackSize;
2038 
2039     // Note: Keeping the following as multiple 'if' statements rather than
2040     // merging to a single expression for readability.
2041     //
2042     // Argument access should always use the FP.
2043     if (isFixed) {
2044       UseFP = hasFP(MF);
2045     } else if (isCSR && RegInfo->hasStackRealignment(MF)) {
2046       // References to the CSR area must use FP if we're re-aligning the stack
2047       // since the dynamically-sized alignment padding is between the SP/BP and
2048       // the CSR area.
2049       assert(hasFP(MF) && "Re-aligned stack must have frame pointer");
2050       UseFP = true;
2051     } else if (hasFP(MF) && !RegInfo->hasStackRealignment(MF)) {
2052       // If the FPOffset is negative and we're producing a signed immediate, we
2053       // have to keep in mind that the available offset range for negative
2054       // offsets is smaller than for positive ones. If an offset is available
2055       // via the FP and the SP, use whichever is closest.
2056       bool FPOffsetFits = !ForSimm || FPOffset >= -256;
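      // -256 is the lower bound of the signed 9-bit offset range used by
      // unscaled loads/stores (LDUR/STUR).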
2057       PreferFP |= Offset > -FPOffset;
2058 
2059       if (MFI.hasVarSizedObjects()) {
2060         // If we have variable sized objects, we can use either FP or BP, as the
2061         // SP offset is unknown. We can use the base pointer if we have one and
2062         // FP is not preferred. If not, we're stuck with using FP.
2063         bool CanUseBP = RegInfo->hasBasePointer(MF);
2064         if (FPOffsetFits && CanUseBP) // Both are ok. Pick the best.
2065           UseFP = PreferFP;
2066         else if (!CanUseBP) // Can't use BP. Forced to use FP.
2067           UseFP = true;
2068         // else we can use BP and FP, but the offset from FP won't fit.
2069         // That will make us scavenge registers which we can probably avoid by
2070         // using BP. If it won't fit for BP either, we'll scavenge anyway.
2071       } else if (FPOffset >= 0) {
2072         // Use SP or FP, whichever gives us the best chance of the offset
2073         // being in range for direct access. If the FPOffset is positive,
2074         // that'll always be best, as the SP will be even further away.
2075         UseFP = true;
2076       } else if (MF.hasEHFunclets() && !RegInfo->hasBasePointer(MF)) {
2077         // Funclets access the locals contained in the parent's stack frame
2078         // via the frame pointer, so we have to use the FP in the parent
2079         // function.
2080         (void) Subtarget;
2081         assert(
2082             Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv()) &&
2083             "Funclets should only be present on Win64");
2084         UseFP = true;
2085       } else {
2086         // We have the choice between FP and (SP or BP).
2087         if (FPOffsetFits && PreferFP) // If FP is the best fit, use it.
2088           UseFP = true;
2089       }
2090     }
2091   }
2092 
2093   assert(
2094       ((isFixed || isCSR) || !RegInfo->hasStackRealignment(MF) || !UseFP) &&
2095       "In the presence of dynamic stack pointer realignment, "
2096       "non-argument/CSR objects cannot be accessed through the frame pointer");
2097 
2098   if (isSVE) {
2099     StackOffset FPOffset =
2100         StackOffset::get(-AFI->getCalleeSaveBaseToFrameRecordOffset(), ObjectOffset);
2101     StackOffset SPOffset =
2102         SVEStackSize +
2103         StackOffset::get(MFI.getStackSize() - AFI->getCalleeSavedStackSize(),
2104                          ObjectOffset);
2105     // Always use the FP for SVE spills if available and beneficial.
2106     if (hasFP(MF) && (SPOffset.getFixed() ||
2107                       FPOffset.getScalable() < SPOffset.getScalable() ||
2108                       RegInfo->hasStackRealignment(MF))) {
2109       FrameReg = RegInfo->getFrameRegister(MF);
2110       return FPOffset;
2111     }
2112 
2113     FrameReg = RegInfo->hasBasePointer(MF) ? RegInfo->getBaseRegister()
2114                                            : (unsigned)AArch64::SP;
2115     return SPOffset;
2116   }
2117 
2118   StackOffset ScalableOffset = {};
2119   if (UseFP && !(isFixed || isCSR))
2120     ScalableOffset = -SVEStackSize;
2121   if (!UseFP && (isFixed || isCSR))
2122     ScalableOffset = SVEStackSize;
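  // The SVE area sits between the callee-save area and the ordinary locals,
  // and its size is only known in scalable units: FP-relative accesses to
  // locals must step down past it, while SP-relative accesses to fixed/CSR
  // objects must step up past it.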
2123 
2124   if (UseFP) {
2125     FrameReg = RegInfo->getFrameRegister(MF);
2126     return StackOffset::getFixed(FPOffset) + ScalableOffset;
2127   }
2128 
2129   // Use the base pointer if we have one.
2130   if (RegInfo->hasBasePointer(MF))
2131     FrameReg = RegInfo->getBaseRegister();
2132   else {
2133     assert(!MFI.hasVarSizedObjects() &&
2134            "Can't use SP when we have var sized objects.");
2135     FrameReg = AArch64::SP;
2136     // If we're using the red zone for this function, the SP won't actually
2137     // be adjusted, so the offsets will be negative. They're also all
2138     // within range of the signed 9-bit immediate instructions.
2139     if (canUseRedZone(MF))
2140       Offset -= AFI->getLocalStackSize();
2141   }
2142 
2143   return StackOffset::getFixed(Offset) + ScalableOffset;
2144 }
2145 
2146 static unsigned getPrologueDeath(MachineFunction &MF, unsigned Reg) {
2147   // Do not set a kill flag on values that are also marked as live-in. This
2148   // happens with the @llvm.returnaddress intrinsic and with arguments passed in
2149   // callee saved registers.
2150   // Omitting the kill flags is conservatively correct even if the live-in
2151   // is not used after all.
2152   bool IsLiveIn = MF.getRegInfo().isLiveIn(Reg);
2153   return getKillRegState(!IsLiveIn);
2154 }
2155 
2156 static bool produceCompactUnwindFrame(MachineFunction &MF) {
2157   const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
2158   AttributeList Attrs = MF.getFunction().getAttributes();
2159   return Subtarget.isTargetMachO() &&
2160          !(Subtarget.getTargetLowering()->supportSwiftError() &&
2161            Attrs.hasAttrSomewhere(Attribute::SwiftError)) &&
2162          MF.getFunction().getCallingConv() != CallingConv::SwiftTail;
2163 }
2164 
2165 static bool invalidateWindowsRegisterPairing(unsigned Reg1, unsigned Reg2,
2166                                              bool NeedsWinCFI, bool IsFirst) {
2167   // If we are generating register pairs for a Windows function that requires
2168   // EH support, then pair consecutive registers only.  There are no unwind
2169   // opcodes for saves/restores of non-consecutive register pairs.
2170   // The unwind opcodes are save_regp, save_regp_x, save_fregp, save_fregp_x,
2171   // save_lrpair.
2172   // https://docs.microsoft.com/en-us/cpp/build/arm64-exception-handling
2173 
2174   if (Reg2 == AArch64::FP)
2175     return true;
2176   if (!NeedsWinCFI)
2177     return false;
2178   if (Reg2 == Reg1 + 1)
2179     return false;
2180   // If pairing a GPR with LR, the pair can be described by the save_lrpair
2181   // opcode. If this is the first register pair, it would end up with a
2182   // predecrement, but there's no save_lrpair_x opcode, so we can only do this
2183   // if LR is paired with something else than the first register.
2184   // The save_lrpair opcode requires the first register to be an odd one.
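  // For example, a non-first (x21, lr) pair can use save_lrpair, whereas
  // (x20, lr) cannot because x20 is an even-numbered register.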
2185   if (Reg1 >= AArch64::X19 && Reg1 <= AArch64::X27 &&
2186       (Reg1 - AArch64::X19) % 2 == 0 && Reg2 == AArch64::LR && !IsFirst)
2187     return false;
2188   return true;
2189 }
2190 
2191 /// Returns true if Reg1 and Reg2 cannot be paired using a ldp/stp instruction.
2192 /// WindowsCFI requires that only consecutive registers can be paired.
2193 /// LR and FP need to be allocated together when the frame needs to save
2194 /// the frame-record. This means any other register pairing with LR is invalid.
2195 static bool invalidateRegisterPairing(unsigned Reg1, unsigned Reg2,
2196                                       bool UsesWinAAPCS, bool NeedsWinCFI,
2197                                       bool NeedsFrameRecord, bool IsFirst) {
2198   if (UsesWinAAPCS)
2199     return invalidateWindowsRegisterPairing(Reg1, Reg2, NeedsWinCFI, IsFirst);
2200 
2201   // If we need to store the frame record, don't pair any register
2202   // with LR other than FP.
2203   if (NeedsFrameRecord)
2204     return Reg2 == AArch64::LR;
2205 
2206   return false;
2207 }
2208 
2209 namespace {
2210 
2211 struct RegPairInfo {
2212   unsigned Reg1 = AArch64::NoRegister;
2213   unsigned Reg2 = AArch64::NoRegister;
2214   int FrameIdx;
2215   int Offset;
2216   enum RegType { GPR, FPR64, FPR128, PPR, ZPR } Type;
2217 
2218   RegPairInfo() = default;
2219 
2220   bool isPaired() const { return Reg2 != AArch64::NoRegister; }
2221 
2222   unsigned getScale() const {
2223     switch (Type) {
2224     case PPR:
2225       return 2;
2226     case GPR:
2227     case FPR64:
2228       return 8;
2229     case ZPR:
2230     case FPR128:
2231       return 16;
2232     }
2233     llvm_unreachable("Unsupported type");
2234   }
2235 
2236   bool isScalable() const { return Type == PPR || Type == ZPR; }
2237 };
2238 
2239 } // end anonymous namespace
2240 
2241 static void computeCalleeSaveRegisterPairs(
2242     MachineFunction &MF, ArrayRef<CalleeSavedInfo> CSI,
2243     const TargetRegisterInfo *TRI, SmallVectorImpl<RegPairInfo> &RegPairs,
2244     bool &NeedShadowCallStackProlog, bool NeedsFrameRecord) {
2245 
2246   if (CSI.empty())
2247     return;
2248 
2249   bool IsWindows = isTargetWindows(MF);
2250   bool NeedsWinCFI = needsWinCFI(MF);
2251   AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
2252   MachineFrameInfo &MFI = MF.getFrameInfo();
2253   CallingConv::ID CC = MF.getFunction().getCallingConv();
2254   unsigned Count = CSI.size();
2255   (void)CC;
2256   // MachO's compact unwind format relies on all registers being stored in
2257   // pairs.
2258   assert((!produceCompactUnwindFrame(MF) ||
2259           CC == CallingConv::PreserveMost ||
2260           (Count & 1) == 0) &&
2261          "Odd number of callee-saved regs to spill!");
2262   int ByteOffset = AFI->getCalleeSavedStackSize();
2263   int StackFillDir = -1;
2264   int RegInc = 1;
2265   unsigned FirstReg = 0;
2266   if (NeedsWinCFI) {
2267     // For WinCFI, fill the stack from the bottom up.
2268     ByteOffset = 0;
2269     StackFillDir = 1;
2270     // As the CSI array is reversed to match PrologEpilogInserter, iterate
2271     // backwards, to pair up registers starting from lower numbered registers.
2272     RegInc = -1;
2273     FirstReg = Count - 1;
2274   }
2275   int ScalableByteOffset = AFI->getSVECalleeSavedStackSize();
2276   bool NeedGapToAlignStack = AFI->hasCalleeSaveStackFreeSpace();
2277 
2278   // When iterating backwards, the loop condition relies on unsigned wraparound.
2279   for (unsigned i = FirstReg; i < Count; i += RegInc) {
2280     RegPairInfo RPI;
2281     RPI.Reg1 = CSI[i].getReg();
2282 
2283     if (AArch64::GPR64RegClass.contains(RPI.Reg1))
2284       RPI.Type = RegPairInfo::GPR;
2285     else if (AArch64::FPR64RegClass.contains(RPI.Reg1))
2286       RPI.Type = RegPairInfo::FPR64;
2287     else if (AArch64::FPR128RegClass.contains(RPI.Reg1))
2288       RPI.Type = RegPairInfo::FPR128;
2289     else if (AArch64::ZPRRegClass.contains(RPI.Reg1))
2290       RPI.Type = RegPairInfo::ZPR;
2291     else if (AArch64::PPRRegClass.contains(RPI.Reg1))
2292       RPI.Type = RegPairInfo::PPR;
2293     else
2294       llvm_unreachable("Unsupported register class.");
2295 
2296     // Add the next reg to the pair if it is in the same register class.
2297     if (unsigned(i + RegInc) < Count) {
2298       unsigned NextReg = CSI[i + RegInc].getReg();
2299       bool IsFirst = i == FirstReg;
2300       switch (RPI.Type) {
2301       case RegPairInfo::GPR:
2302         if (AArch64::GPR64RegClass.contains(NextReg) &&
2303             !invalidateRegisterPairing(RPI.Reg1, NextReg, IsWindows,
2304                                        NeedsWinCFI, NeedsFrameRecord, IsFirst))
2305           RPI.Reg2 = NextReg;
2306         break;
2307       case RegPairInfo::FPR64:
2308         if (AArch64::FPR64RegClass.contains(NextReg) &&
2309             !invalidateWindowsRegisterPairing(RPI.Reg1, NextReg, NeedsWinCFI,
2310                                               IsFirst))
2311           RPI.Reg2 = NextReg;
2312         break;
2313       case RegPairInfo::FPR128:
2314         if (AArch64::FPR128RegClass.contains(NextReg))
2315           RPI.Reg2 = NextReg;
2316         break;
2317       case RegPairInfo::PPR:
2318       case RegPairInfo::ZPR:
2319         break;
2320       }
2321     }
2322 
2323     // If either of the registers to be saved is the lr register, it means that
2324     // we also need to save lr in the shadow call stack.
2325     if ((RPI.Reg1 == AArch64::LR || RPI.Reg2 == AArch64::LR) &&
2326         MF.getFunction().hasFnAttribute(Attribute::ShadowCallStack)) {
2327       if (!MF.getSubtarget<AArch64Subtarget>().isXRegisterReserved(18))
2328         report_fatal_error("Must reserve x18 to use shadow call stack");
2329       NeedShadowCallStackProlog = true;
2330     }
2331 
2332     // GPRs and FPRs are saved in pairs of 64-bit regs. We expect the CSI
2333     // list to come in sorted by frame index so that we can issue the store
2334     // pair instructions directly. Assert if we see anything otherwise.
2335     //
2336     // The order of the registers in the list is controlled by
2337     // getCalleeSavedRegs(), so they will always be in-order, as well.
2338     assert((!RPI.isPaired() ||
2339             (CSI[i].getFrameIdx() + RegInc == CSI[i + RegInc].getFrameIdx())) &&
2340            "Out of order callee saved regs!");
2341 
2342     assert((!RPI.isPaired() || !NeedsFrameRecord || RPI.Reg2 != AArch64::FP ||
2343             RPI.Reg1 == AArch64::LR) &&
2344            "FrameRecord must be allocated together with LR");
2345 
2346     // Windows AAPCS has FP and LR reversed.
2347     assert((!RPI.isPaired() || !NeedsFrameRecord || RPI.Reg1 != AArch64::FP ||
2348             RPI.Reg2 == AArch64::LR) &&
2349            "FrameRecord must be allocated together with LR");
2350 
2351     // MachO's compact unwind format relies on all registers being stored in
2352     // adjacent register pairs.
2353     assert((!produceCompactUnwindFrame(MF) ||
2354             CC == CallingConv::PreserveMost ||
2355             (RPI.isPaired() &&
2356              ((RPI.Reg1 == AArch64::LR && RPI.Reg2 == AArch64::FP) ||
2357               RPI.Reg1 + 1 == RPI.Reg2))) &&
2358            "Callee-save registers not saved as adjacent register pair!");
2359 
2360     RPI.FrameIdx = CSI[i].getFrameIdx();
2361     if (NeedsWinCFI &&
2362         RPI.isPaired()) // RPI.FrameIdx must be the lower index of the pair
2363       RPI.FrameIdx = CSI[i + RegInc].getFrameIdx();
2364 
2365     int Scale = RPI.getScale();
2366 
2367     int OffsetPre = RPI.isScalable() ? ScalableByteOffset : ByteOffset;
2368     assert(OffsetPre % Scale == 0);
2369 
2370     if (RPI.isScalable())
2371       ScalableByteOffset += StackFillDir * Scale;
2372     else
2373       ByteOffset += StackFillDir * (RPI.isPaired() ? 2 * Scale : Scale);
2374 
2375     // Swift's async context is directly before FP, so allocate an extra
2376     // 8 bytes for it.
2377     if (NeedsFrameRecord && AFI->hasSwiftAsyncContext() &&
2378         RPI.Reg2 == AArch64::FP)
2379       ByteOffset += StackFillDir * 8;
2380 
2381     assert(!(RPI.isScalable() && RPI.isPaired()) &&
2382            "Paired spill/fill instructions don't exist for SVE vectors");
2383 
2384     // Round up size of non-pair to pair size if we need to pad the
2385     // callee-save area to ensure 16-byte alignment.
2386     if (NeedGapToAlignStack && !NeedsWinCFI &&
2387         !RPI.isScalable() && RPI.Type != RegPairInfo::FPR128 &&
2388         !RPI.isPaired() && ByteOffset % 16 != 0) {
2389       ByteOffset += 8 * StackFillDir;
2390       assert(MFI.getObjectAlign(RPI.FrameIdx) <= Align(16));
2391       // A stack frame with a gap looks like this, bottom up:
2392       // d9, d8, x21, gap, x20, x19.
2393       // Set extra alignment on the x21 object to create the gap above it.
2394       MFI.setObjectAlignment(RPI.FrameIdx, Align(16));
2395       NeedGapToAlignStack = false;
2396     }
2397 
2398     int OffsetPost = RPI.isScalable() ? ScalableByteOffset : ByteOffset;
2399     assert(OffsetPost % Scale == 0);
2400     // If filling top down (default), we want the offset after incrementing it.
2401     // If filling bottom up (WinCFI) we need the original offset.
2402     int Offset = NeedsWinCFI ? OffsetPre : OffsetPost;
2403 
2404     // The FP, LR pair goes 8 bytes into our expanded 24-byte slot so that the
2405     // Swift context can directly precede FP.
2406     if (NeedsFrameRecord && AFI->hasSwiftAsyncContext() &&
2407         RPI.Reg2 == AArch64::FP)
2408       Offset += 8;
2409     RPI.Offset = Offset / Scale;
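    // For example, a GPR pair stored at byte offset 32 gets RPI.Offset == 4,
    // matching the scaled imm7 form used by STP/LDP of X registers.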
2410 
2411     assert(((!RPI.isScalable() && RPI.Offset >= -64 && RPI.Offset <= 63) ||
2412             (RPI.isScalable() && RPI.Offset >= -256 && RPI.Offset <= 255)) &&
2413            "Offset out of bounds for LDP/STP immediate");
2414 
2415     // Save the offset to frame record so that the FP register can point to the
2416     // innermost frame record (spilled FP and LR registers).
2417     if (NeedsFrameRecord && ((!IsWindows && RPI.Reg1 == AArch64::LR &&
2418                               RPI.Reg2 == AArch64::FP) ||
2419                              (IsWindows && RPI.Reg1 == AArch64::FP &&
2420                               RPI.Reg2 == AArch64::LR)))
2421       AFI->setCalleeSaveBaseToFrameRecordOffset(Offset);
2422 
2423     RegPairs.push_back(RPI);
2424     if (RPI.isPaired())
2425       i += RegInc;
2426   }
2427   if (NeedsWinCFI) {
2428     // If we need an alignment gap in the stack, align the topmost stack
2429     // object. A stack frame with a gap looks like this, bottom up:
2430     // x19, d8, d9, gap.
2431     // Set extra alignment on the topmost stack object (the first element in
2432     // CSI, which goes top down), to create the gap above it.
2433     if (AFI->hasCalleeSaveStackFreeSpace())
2434       MFI.setObjectAlignment(CSI[0].getFrameIdx(), Align(16));
2435     // We iterated bottom up over the registers; flip RegPairs back to top
2436     // down order.
2437     std::reverse(RegPairs.begin(), RegPairs.end());
2438   }
2439 }
2440 
2441 bool AArch64FrameLowering::spillCalleeSavedRegisters(
2442     MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
2443     ArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const {
2444   MachineFunction &MF = *MBB.getParent();
2445   const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
2446   bool NeedsWinCFI = needsWinCFI(MF);
2447   DebugLoc DL;
2448   SmallVector<RegPairInfo, 8> RegPairs;
2449 
2450   bool NeedShadowCallStackProlog = false;
2451   computeCalleeSaveRegisterPairs(MF, CSI, TRI, RegPairs,
2452                                  NeedShadowCallStackProlog, hasFP(MF));
2453   const MachineRegisterInfo &MRI = MF.getRegInfo();
2454 
2455   if (NeedShadowCallStackProlog) {
2456     // Shadow call stack prolog: str x30, [x18], #8
2457     BuildMI(MBB, MI, DL, TII.get(AArch64::STRXpost))
2458         .addReg(AArch64::X18, RegState::Define)
2459         .addReg(AArch64::LR)
2460         .addReg(AArch64::X18)
2461         .addImm(8)
2462         .setMIFlag(MachineInstr::FrameSetup);
2463 
2464     if (NeedsWinCFI)
2465       BuildMI(MBB, MI, DL, TII.get(AArch64::SEH_Nop))
2466           .setMIFlag(MachineInstr::FrameSetup);
2467 
2468     if (!MF.getFunction().hasFnAttribute(Attribute::NoUnwind)) {
2469       // Emit a CFI instruction that causes 8 to be subtracted from the value of
2470       // x18 when unwinding past this frame.
2471       static const char CFIInst[] = {
2472           dwarf::DW_CFA_val_expression,
2473           18, // register
2474           2,  // length
2475           static_cast<char>(unsigned(dwarf::DW_OP_breg18)),
2476           static_cast<char>(-8) & 0x7f, // addend (sleb128)
2477       };
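      // Illustrative decoding of the escape bytes above (assuming standard
      // DWARF encodings): DW_CFA_val_expression, ULEB128 register 18, ULEB128
      // length 2, then the 2-byte expression DW_OP_breg18 with SLEB128 addend
      // -8 (the single byte 0x78, i.e. (-8) & 0x7f). The unwinder thus
      // recovers the previous x18 as the current x18 minus 8, undoing the
      // post-increment str above.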
2478       unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createEscape(
2479           nullptr, StringRef(CFIInst, sizeof(CFIInst))));
2480       BuildMI(MBB, MI, DL, TII.get(AArch64::CFI_INSTRUCTION))
2481           .addCFIIndex(CFIIndex)
2482           .setMIFlag(MachineInstr::FrameSetup);
2483     }
2484 
2485     // This instruction also makes x18 live-in to the entry block.
2486     MBB.addLiveIn(AArch64::X18);
2487   }
2488 
2489   if (homogeneousPrologEpilog(MF)) {
2490     auto MIB = BuildMI(MBB, MI, DL, TII.get(AArch64::HOM_Prolog))
2491                    .setMIFlag(MachineInstr::FrameSetup);
2492 
2493     for (auto &RPI : RegPairs) {
2494       MIB.addReg(RPI.Reg1);
2495       MIB.addReg(RPI.Reg2);
2496 
2497       // Update register live in.
2498       if (!MRI.isReserved(RPI.Reg1))
2499         MBB.addLiveIn(RPI.Reg1);
2500       if (!MRI.isReserved(RPI.Reg2))
2501         MBB.addLiveIn(RPI.Reg2);
2502     }
2503     return true;
2504   }
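  // Note (informational): HOM_Prolog is a pseudo that is expanded later by
  // the homogeneous prolog/epilog lowering into a call to a shared
  // register-save helper, trading a small runtime cost for code size.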
2505   for (auto RPII = RegPairs.rbegin(), RPIE = RegPairs.rend(); RPII != RPIE;
2506        ++RPII) {
2507     RegPairInfo RPI = *RPII;
2508     unsigned Reg1 = RPI.Reg1;
2509     unsigned Reg2 = RPI.Reg2;
2510     unsigned StrOpc;
2511 
2512     // Issue sequence of spills for cs regs.  The first spill may be converted
2513     // to a pre-decrement store later by emitPrologue if the callee-save stack
2514     // area allocation can't be combined with the local stack area allocation.
2515     // For example:
2516     //    stp     x22, x21, [sp, #0]     // addImm(+0)
2517     //    stp     x20, x19, [sp, #16]    // addImm(+2)
2518     //    stp     fp, lr, [sp, #32]      // addImm(+4)
2519     // Rationale: This sequence saves uop updates compared to a sequence of
2520     // pre-increment spills like stp xi,xj,[sp,#-16]!
2521     // Note: Similar rationale and sequence for restores in epilog.
2522     unsigned Size;
2523     Align Alignment;
2524     switch (RPI.Type) {
2525     case RegPairInfo::GPR:
2526        StrOpc = RPI.isPaired() ? AArch64::STPXi : AArch64::STRXui;
2527        Size = 8;
2528        Alignment = Align(8);
2529        break;
2530     case RegPairInfo::FPR64:
2531        StrOpc = RPI.isPaired() ? AArch64::STPDi : AArch64::STRDui;
2532        Size = 8;
2533        Alignment = Align(8);
2534        break;
2535     case RegPairInfo::FPR128:
2536        StrOpc = RPI.isPaired() ? AArch64::STPQi : AArch64::STRQui;
2537        Size = 16;
2538        Alignment = Align(16);
2539        break;
2540     case RegPairInfo::ZPR:
2541        StrOpc = AArch64::STR_ZXI;
2542        Size = 16;
2543        Alignment = Align(16);
2544        break;
2545     case RegPairInfo::PPR:
2546        StrOpc = AArch64::STR_PXI;
2547        Size = 2;
2548        Alignment = Align(2);
2549        break;
2550     }
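    // Note (illustrative): RPI.Offset is expressed in units of Size above,
    // and the store immediates are implicitly scaled the same way. E.g. a GPR
    // pair with RPI.Offset == 2 produces "stp x20, x19, [sp, #16]".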
2551     LLVM_DEBUG(dbgs() << "CSR spill: (" << printReg(Reg1, TRI);
2552                if (RPI.isPaired()) dbgs() << ", " << printReg(Reg2, TRI);
2553                dbgs() << ") -> fi#(" << RPI.FrameIdx;
2554                if (RPI.isPaired()) dbgs() << ", " << RPI.FrameIdx + 1;
2555                dbgs() << ")\n");
2556 
2557     assert((!NeedsWinCFI || !(Reg1 == AArch64::LR && Reg2 == AArch64::FP)) &&
2558            "Windows unwinding requires a consecutive (FP,LR) pair");
2559     // Windows unwind codes require consecutive registers if registers are
2560     // paired.  Make the switch here, so that the code below will save (x,x+1)
2561     // and not (x+1,x).
2562     unsigned FrameIdxReg1 = RPI.FrameIdx;
2563     unsigned FrameIdxReg2 = RPI.FrameIdx + 1;
2564     if (NeedsWinCFI && RPI.isPaired()) {
2565       std::swap(Reg1, Reg2);
2566       std::swap(FrameIdxReg1, FrameIdxReg2);
2567     }
2568     MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(StrOpc));
2569     if (!MRI.isReserved(Reg1))
2570       MBB.addLiveIn(Reg1);
2571     if (RPI.isPaired()) {
2572       if (!MRI.isReserved(Reg2))
2573         MBB.addLiveIn(Reg2);
2574       MIB.addReg(Reg2, getPrologueDeath(MF, Reg2));
2575       MIB.addMemOperand(MF.getMachineMemOperand(
2576           MachinePointerInfo::getFixedStack(MF, FrameIdxReg2),
2577           MachineMemOperand::MOStore, Size, Alignment));
2578     }
2579     MIB.addReg(Reg1, getPrologueDeath(MF, Reg1))
2580         .addReg(AArch64::SP)
2581         .addImm(RPI.Offset) // [sp, #offset*scale],
2582                             // where factor*scale is implicit
2583         .setMIFlag(MachineInstr::FrameSetup);
2584     MIB.addMemOperand(MF.getMachineMemOperand(
2585         MachinePointerInfo::getFixedStack(MF, FrameIdxReg1),
2586         MachineMemOperand::MOStore, Size, Alignment));
2587     if (NeedsWinCFI)
2588       InsertSEH(MIB, TII, MachineInstr::FrameSetup);
2589 
2590     // Update the StackIDs of the SVE stack slots.
2591     MachineFrameInfo &MFI = MF.getFrameInfo();
2592     if (RPI.Type == RegPairInfo::ZPR || RPI.Type == RegPairInfo::PPR)
2593       MFI.setStackID(RPI.FrameIdx, TargetStackID::ScalableVector);
2594 
2595   }
2596   return true;
2597 }
2598 
2599 bool AArch64FrameLowering::restoreCalleeSavedRegisters(
2600     MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
2601     MutableArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const {
2602   MachineFunction &MF = *MBB.getParent();
2603   const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
2604   DebugLoc DL;
2605   SmallVector<RegPairInfo, 8> RegPairs;
2606   bool NeedsWinCFI = needsWinCFI(MF);
2607 
2608   if (MI != MBB.end())
2609     DL = MI->getDebugLoc();
2610 
2611   bool NeedShadowCallStackProlog = false;
2612   computeCalleeSaveRegisterPairs(MF, CSI, TRI, RegPairs,
2613                                  NeedShadowCallStackProlog, hasFP(MF));
2614 
2615   auto EmitMI = [&](const RegPairInfo &RPI) {
2616     unsigned Reg1 = RPI.Reg1;
2617     unsigned Reg2 = RPI.Reg2;
2618 
2619     // Issue sequence of restores for cs regs. The last restore may be converted
2620     // to a post-increment load later by emitEpilogue if the callee-save stack
2621     // area allocation can't be combined with the local stack area allocation.
2622     // For example:
2623     //    ldp     fp, lr, [sp, #32]       // addImm(+4)
2624     //    ldp     x20, x19, [sp, #16]     // addImm(+2)
2625     //    ldp     x22, x21, [sp, #0]      // addImm(+0)
2626     // Note: see comment in spillCalleeSavedRegisters()
2627     unsigned LdrOpc;
2628     unsigned Size;
2629     Align Alignment;
2630     switch (RPI.Type) {
2631     case RegPairInfo::GPR:
2632        LdrOpc = RPI.isPaired() ? AArch64::LDPXi : AArch64::LDRXui;
2633        Size = 8;
2634        Alignment = Align(8);
2635        break;
2636     case RegPairInfo::FPR64:
2637        LdrOpc = RPI.isPaired() ? AArch64::LDPDi : AArch64::LDRDui;
2638        Size = 8;
2639        Alignment = Align(8);
2640        break;
2641     case RegPairInfo::FPR128:
2642        LdrOpc = RPI.isPaired() ? AArch64::LDPQi : AArch64::LDRQui;
2643        Size = 16;
2644        Alignment = Align(16);
2645        break;
2646     case RegPairInfo::ZPR:
2647        LdrOpc = AArch64::LDR_ZXI;
2648        Size = 16;
2649        Alignment = Align(16);
2650        break;
2651     case RegPairInfo::PPR:
2652        LdrOpc = AArch64::LDR_PXI;
2653        Size = 2;
2654        Alignment = Align(2);
2655        break;
2656     }
2657     LLVM_DEBUG(dbgs() << "CSR restore: (" << printReg(Reg1, TRI);
2658                if (RPI.isPaired()) dbgs() << ", " << printReg(Reg2, TRI);
2659                dbgs() << ") -> fi#(" << RPI.FrameIdx;
2660                if (RPI.isPaired()) dbgs() << ", " << RPI.FrameIdx + 1;
2661                dbgs() << ")\n");
2662 
2663     // Windows unwind codes require consecutive registers if registers are
2664     // paired.  Make the switch here, so that the code below will restore
2665     // (x, x+1) and not (x+1, x).
2666     unsigned FrameIdxReg1 = RPI.FrameIdx;
2667     unsigned FrameIdxReg2 = RPI.FrameIdx + 1;
2668     if (NeedsWinCFI && RPI.isPaired()) {
2669       std::swap(Reg1, Reg2);
2670       std::swap(FrameIdxReg1, FrameIdxReg2);
2671     }
2672     MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(LdrOpc));
2673     if (RPI.isPaired()) {
2674       MIB.addReg(Reg2, getDefRegState(true));
2675       MIB.addMemOperand(MF.getMachineMemOperand(
2676           MachinePointerInfo::getFixedStack(MF, FrameIdxReg2),
2677           MachineMemOperand::MOLoad, Size, Alignment));
2678     }
2679     MIB.addReg(Reg1, getDefRegState(true))
2680         .addReg(AArch64::SP)
2681         .addImm(RPI.Offset) // [sp, #offset*scale]
2682                             // where factor*scale is implicit
2683         .setMIFlag(MachineInstr::FrameDestroy);
2684     MIB.addMemOperand(MF.getMachineMemOperand(
2685         MachinePointerInfo::getFixedStack(MF, FrameIdxReg1),
2686         MachineMemOperand::MOLoad, Size, Alignment));
2687     if (NeedsWinCFI)
2688       InsertSEH(MIB, TII, MachineInstr::FrameDestroy);
2689   };
2690 
2691   // SVE objects are always restored in reverse order.
2692   for (const RegPairInfo &RPI : reverse(RegPairs))
2693     if (RPI.isScalable())
2694       EmitMI(RPI);
2695 
2696   if (ReverseCSRRestoreSeq) {
2697     for (const RegPairInfo &RPI : reverse(RegPairs))
2698       if (!RPI.isScalable())
2699         EmitMI(RPI);
2700   } else if (homogeneousPrologEpilog(MF, &MBB)) {
2701     auto MIB = BuildMI(MBB, MI, DL, TII.get(AArch64::HOM_Epilog))
2702                    .setMIFlag(MachineInstr::FrameDestroy);
2703     for (auto &RPI : RegPairs) {
2704       MIB.addReg(RPI.Reg1, RegState::Define);
2705       MIB.addReg(RPI.Reg2, RegState::Define);
2706     }
2707     return true;
2708   } else
2709     for (const RegPairInfo &RPI : RegPairs)
2710       if (!RPI.isScalable())
2711         EmitMI(RPI);
2712 
2713   if (NeedShadowCallStackProlog) {
2714     // Shadow call stack epilog: ldr x30, [x18, #-8]!
2715     BuildMI(MBB, MI, DL, TII.get(AArch64::LDRXpre))
2716         .addReg(AArch64::X18, RegState::Define)
2717         .addReg(AArch64::LR, RegState::Define)
2718         .addReg(AArch64::X18)
2719         .addImm(-8)
2720         .setMIFlag(MachineInstr::FrameDestroy);
2721   }
2722 
2723   return true;
2724 }
2725 
2726 void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
2727                                                 BitVector &SavedRegs,
2728                                                 RegScavenger *RS) const {
2729   // All calls are tail calls in GHC calling conv, and functions have no
2730   // prologue/epilogue.
2731   if (MF.getFunction().getCallingConv() == CallingConv::GHC)
2732     return;
2733 
2734   TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
2735   const AArch64RegisterInfo *RegInfo = static_cast<const AArch64RegisterInfo *>(
2736       MF.getSubtarget().getRegisterInfo());
2737   const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
2738   AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
2739   unsigned UnspilledCSGPR = AArch64::NoRegister;
2740   unsigned UnspilledCSGPRPaired = AArch64::NoRegister;
2741 
2742   MachineFrameInfo &MFI = MF.getFrameInfo();
2743   const MCPhysReg *CSRegs = MF.getRegInfo().getCalleeSavedRegs();
2744 
2745   unsigned BasePointerReg = RegInfo->hasBasePointer(MF)
2746                                 ? RegInfo->getBaseRegister()
2747                                 : (unsigned)AArch64::NoRegister;
2748 
2749   unsigned ExtraCSSpill = 0;
2750   // Figure out which callee-saved registers to save/restore.
2751   for (unsigned i = 0; CSRegs[i]; ++i) {
2752     const unsigned Reg = CSRegs[i];
2753 
2754     // Add the base pointer register to SavedRegs if it is callee-save.
2755     if (Reg == BasePointerReg)
2756       SavedRegs.set(Reg);
2757 
2758     bool RegUsed = SavedRegs.test(Reg);
2759     unsigned PairedReg = AArch64::NoRegister;
2760     if (AArch64::GPR64RegClass.contains(Reg) ||
2761         AArch64::FPR64RegClass.contains(Reg) ||
2762         AArch64::FPR128RegClass.contains(Reg))
2763       PairedReg = CSRegs[i ^ 1];
2764 
2765     if (!RegUsed) {
2766       if (AArch64::GPR64RegClass.contains(Reg) &&
2767           !RegInfo->isReservedReg(MF, Reg)) {
2768         UnspilledCSGPR = Reg;
2769         UnspilledCSGPRPaired = PairedReg;
2770       }
2771       continue;
2772     }
2773 
2774     // MachO's compact unwind format relies on all registers being stored in
2775     // pairs.
2776     // FIXME: the usual format is actually better if unwinding isn't needed.
2777     if (producePairRegisters(MF) && PairedReg != AArch64::NoRegister &&
2778         !SavedRegs.test(PairedReg)) {
2779       SavedRegs.set(PairedReg);
2780       if (AArch64::GPR64RegClass.contains(PairedReg) &&
2781           !RegInfo->isReservedReg(MF, PairedReg))
2782         ExtraCSSpill = PairedReg;
2783     }
2784   }
2785 
2786   if (MF.getFunction().getCallingConv() == CallingConv::Win64 &&
2787       !Subtarget.isTargetWindows()) {
2788     // For Windows calling convention on a non-windows OS, where X18 is treated
2789     // as reserved, back up X18 when entering non-windows code (marked with the
2790     // Windows calling convention) and restore when returning regardless of
2791     // whether the individual function uses it - it might call other functions
2792     // that clobber it.
2793     SavedRegs.set(AArch64::X18);
2794   }
2795 
2796   // Calculates the callee saved stack size.
2797   unsigned CSStackSize = 0;
2798   unsigned SVECSStackSize = 0;
2799   const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
2800   const MachineRegisterInfo &MRI = MF.getRegInfo();
2801   for (unsigned Reg : SavedRegs.set_bits()) {
2802     auto RegSize = TRI->getRegSizeInBits(Reg, MRI) / 8;
2803     if (AArch64::PPRRegClass.contains(Reg) ||
2804         AArch64::ZPRRegClass.contains(Reg))
2805       SVECSStackSize += RegSize;
2806     else
2807       CSStackSize += RegSize;
2808   }
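  // For example, a function saving x19, x20 and d8 accumulates CSStackSize =
  // 24 here; z8 and p4 would instead contribute 16 + 2 bytes to
  // SVECSStackSize (per-register sizes as above, before any alignment).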
2809 
2810   // Save number of saved regs, so we can easily update CSStackSize later.
2811   unsigned NumSavedRegs = SavedRegs.count();
2812 
2813   // The frame record needs to be created by saving the appropriate registers
2814   uint64_t EstimatedStackSize = MFI.estimateStackSize(MF);
2815   if (hasFP(MF) ||
2816       windowsRequiresStackProbe(MF, EstimatedStackSize + CSStackSize + 16)) {
2817     SavedRegs.set(AArch64::FP);
2818     SavedRegs.set(AArch64::LR);
2819   }
2820 
2821   LLVM_DEBUG(dbgs() << "*** determineCalleeSaves\nSaved CSRs:";
2822              for (unsigned Reg
2823                   : SavedRegs.set_bits()) dbgs()
2824              << ' ' << printReg(Reg, RegInfo);
2825              dbgs() << "\n";);
2826 
2827   // If any callee-saved registers are used, the frame cannot be eliminated.
2828   int64_t SVEStackSize =
2829       alignTo(SVECSStackSize + estimateSVEStackObjectOffsets(MFI), 16);
2830   bool CanEliminateFrame = (SavedRegs.count() == 0) && !SVEStackSize;
2831 
2832   // The CSR spill slots have not been allocated yet, so estimateStackSize
2833   // won't include them.
2834   unsigned EstimatedStackSizeLimit = estimateRSStackSizeLimit(MF);
2835 
2836   // Conservatively always assume BigStack when there are SVE spills.
2837   bool BigStack = SVEStackSize ||
2838                   (EstimatedStackSize + CSStackSize) > EstimatedStackSizeLimit;
2839   if (BigStack || !CanEliminateFrame || RegInfo->cannotEliminateFrame(MF))
2840     AFI->setHasStackFrame(true);
2841 
2842   // Estimate if we might need to scavenge a register at some point in order
2843   // to materialize a stack offset. If so, either spill one additional
2844   // callee-saved register or reserve a special spill slot to facilitate
2845   // register scavenging. If we already spilled an extra callee-saved register
2846   // above to keep the number of spills even, we don't need to do anything else
2847   // here.
2848   if (BigStack) {
2849     if (!ExtraCSSpill && UnspilledCSGPR != AArch64::NoRegister) {
2850       LLVM_DEBUG(dbgs() << "Spilling " << printReg(UnspilledCSGPR, RegInfo)
2851                         << " to get a scratch register.\n");
2852       SavedRegs.set(UnspilledCSGPR);
2853       // MachO's compact unwind format relies on all registers being stored in
2854       // pairs, so if we need to spill one extra for BigStack, then we need to
2855       // store the pair.
2856       if (producePairRegisters(MF))
2857         SavedRegs.set(UnspilledCSGPRPaired);
2858       ExtraCSSpill = UnspilledCSGPR;
2859     }
2860 
2861     // If we didn't find an extra callee-saved register to spill, create
2862     // an emergency spill slot.
2863     if (!ExtraCSSpill || MF.getRegInfo().isPhysRegUsed(ExtraCSSpill)) {
2864       const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
2865       const TargetRegisterClass &RC = AArch64::GPR64RegClass;
2866       unsigned Size = TRI->getSpillSize(RC);
2867       Align Alignment = TRI->getSpillAlign(RC);
2868       int FI = MFI.CreateStackObject(Size, Alignment, false);
2869       RS->addScavengingFrameIndex(FI);
2870       LLVM_DEBUG(dbgs() << "No available CS registers, allocated fi#" << FI
2871                         << " as the emergency spill slot.\n");
2872     }
2873   }
2874 
2875   // Add the size of any additional 64-bit GPR saves.
2876   CSStackSize += 8 * (SavedRegs.count() - NumSavedRegs);
2877 
2878   // A Swift asynchronous context extends the frame record with a pointer
2879   // directly before FP.
2880   if (hasFP(MF) && AFI->hasSwiftAsyncContext())
2881     CSStackSize += 8;
2882 
2883   uint64_t AlignedCSStackSize = alignTo(CSStackSize, 16);
2884   LLVM_DEBUG(dbgs() << "Estimated stack frame size: "
2885                << EstimatedStackSize + AlignedCSStackSize
2886                << " bytes.\n");
2887 
2888   assert((!MFI.isCalleeSavedInfoValid() ||
2889           AFI->getCalleeSavedStackSize() == AlignedCSStackSize) &&
2890          "Should not invalidate callee saved info");
2891 
2892   // Round up to register pair alignment to avoid additional SP adjustment
2893   // instructions.
2894   AFI->setCalleeSavedStackSize(AlignedCSStackSize);
2895   AFI->setCalleeSaveStackHasFreeSpace(AlignedCSStackSize != CSStackSize);
2896   AFI->setSVECalleeSavedStackSize(alignTo(SVECSStackSize, 16));
2897 }
2898 
2899 bool AArch64FrameLowering::assignCalleeSavedSpillSlots(
2900     MachineFunction &MF, const TargetRegisterInfo *RegInfo,
2901     std::vector<CalleeSavedInfo> &CSI, unsigned &MinCSFrameIndex,
2902     unsigned &MaxCSFrameIndex) const {
2903   bool NeedsWinCFI = needsWinCFI(MF);
2904   // To match the canonical windows frame layout, reverse the list of
2905   // callee saved registers to get them laid out by PrologEpilogInserter
2906   // in the right order. (PrologEpilogInserter allocates stack objects top
2907   // down. Windows canonical prologs store higher numbered registers at
2908   // the top, thus have the CSI array start from the highest registers.)
2909   if (NeedsWinCFI)
2910     std::reverse(CSI.begin(), CSI.end());
2911 
2912   if (CSI.empty())
2913     return true; // Early exit if no callee saved registers are modified!
2914 
2915   // Now that we know which registers need to be saved and restored, allocate
2916   // stack slots for them.
2917   MachineFrameInfo &MFI = MF.getFrameInfo();
2918   auto *AFI = MF.getInfo<AArch64FunctionInfo>();
2919   for (auto &CS : CSI) {
2920     Register Reg = CS.getReg();
2921     const TargetRegisterClass *RC = RegInfo->getMinimalPhysRegClass(Reg);
2922 
2923     unsigned Size = RegInfo->getSpillSize(*RC);
2924     Align Alignment(RegInfo->getSpillAlign(*RC));
2925     int FrameIdx = MFI.CreateStackObject(Size, Alignment, true);
2926     CS.setFrameIdx(FrameIdx);
2927 
2928     if ((unsigned)FrameIdx < MinCSFrameIndex) MinCSFrameIndex = FrameIdx;
2929     if ((unsigned)FrameIdx > MaxCSFrameIndex) MaxCSFrameIndex = FrameIdx;
2930 
2931     // Grab 8 bytes below FP for the extended asynchronous frame info.
2932     if (hasFP(MF) && AFI->hasSwiftAsyncContext() && Reg == AArch64::FP) {
2933       FrameIdx = MFI.CreateStackObject(8, Alignment, true);
2934       AFI->setSwiftAsyncContextFrameIdx(FrameIdx);
2935       if ((unsigned)FrameIdx < MinCSFrameIndex) MinCSFrameIndex = FrameIdx;
2936       if ((unsigned)FrameIdx > MaxCSFrameIndex) MaxCSFrameIndex = FrameIdx;
2937     }
2938   }
2939   return true;
2940 }
2941 
2942 bool AArch64FrameLowering::enableStackSlotScavenging(
2943     const MachineFunction &MF) const {
2944   const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
2945   return AFI->hasCalleeSaveStackFreeSpace();
2946 }
2947 
2948 /// Returns true if there are any SVE callee saves.
2949 static bool getSVECalleeSaveSlotRange(const MachineFrameInfo &MFI,
2950                                       int &Min, int &Max) {
2951   Min = std::numeric_limits<int>::max();
2952   Max = std::numeric_limits<int>::min();
2953 
2954   if (!MFI.isCalleeSavedInfoValid())
2955     return false;
2956 
2957   const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo();
2958   for (auto &CS : CSI) {
2959     if (AArch64::ZPRRegClass.contains(CS.getReg()) ||
2960         AArch64::PPRRegClass.contains(CS.getReg())) {
2961       assert((Max == std::numeric_limits<int>::min() ||
2962               Max + 1 == CS.getFrameIdx()) &&
2963              "SVE CalleeSaves are not consecutive");
2964 
2965       Min = std::min(Min, CS.getFrameIdx());
2966       Max = std::max(Max, CS.getFrameIdx());
2967     }
2968   }
2969   return Min != std::numeric_limits<int>::max();
2970 }
2971 
2972 // Process all the SVE stack objects and determine offsets for each
2973 // object. If AssignOffsets is true, the offsets get assigned.
2974 // Fills in the first and last callee-saved frame indices into
2975 // Min/MaxCSFrameIndex, respectively.
2976 // Returns the size of the stack.
2977 static int64_t determineSVEStackObjectOffsets(MachineFrameInfo &MFI,
2978                                               int &MinCSFrameIndex,
2979                                               int &MaxCSFrameIndex,
2980                                               bool AssignOffsets) {
2981 #ifndef NDEBUG
2982   // First process all fixed stack objects.
2983   for (int I = MFI.getObjectIndexBegin(); I != 0; ++I)
2984     assert(MFI.getStackID(I) != TargetStackID::ScalableVector &&
2985            "SVE vectors should never be passed on the stack by value, only by "
2986            "reference.");
2987 #endif
2988 
2989   auto Assign = [&MFI](int FI, int64_t Offset) {
2990     LLVM_DEBUG(dbgs() << "alloc FI(" << FI << ") at SP[" << Offset << "]\n");
2991     MFI.setObjectOffset(FI, Offset);
2992   };
2993 
2994   int64_t Offset = 0;
2995 
2996   // Then process all callee saved slots.
2997   if (getSVECalleeSaveSlotRange(MFI, MinCSFrameIndex, MaxCSFrameIndex)) {
2998     // Assign offsets to the callee save slots.
2999     for (int I = MinCSFrameIndex; I <= MaxCSFrameIndex; ++I) {
3000       Offset += MFI.getObjectSize(I);
3001       Offset = alignTo(Offset, MFI.getObjectAlign(I));
3002       if (AssignOffsets)
3003         Assign(I, -Offset);
3004     }
3005   }
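  // Illustrative example: with two ZPR callee saves of (scalable) size 16
  // each, the first slot is assigned offset -16 and the second -32; offsets
  // are in scalable bytes, growing downwards from the top of the SVE area.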
3006 
3007   // Ensure that the callee-save area is aligned to 16 bytes.
3008   Offset = alignTo(Offset, Align(16U));
3009 
3010   // Create a buffer of SVE objects to allocate and sort it.
3011   SmallVector<int, 8> ObjectsToAllocate;
3012   for (int I = 0, E = MFI.getObjectIndexEnd(); I != E; ++I) {
3013     unsigned StackID = MFI.getStackID(I);
3014     if (StackID != TargetStackID::ScalableVector)
3015       continue;
3016     if (MaxCSFrameIndex >= I && I >= MinCSFrameIndex)
3017       continue;
3018     if (MFI.isDeadObjectIndex(I))
3019       continue;
3020 
3021     ObjectsToAllocate.push_back(I);
3022   }
3023 
3024   // Allocate all SVE locals and spills
3025   for (unsigned FI : ObjectsToAllocate) {
3026     Align Alignment = MFI.getObjectAlign(FI);
3027     // FIXME: Given that the length of SVE vectors is not necessarily a power of
3028     // two, we'd need to align every object dynamically at runtime if the
3029     // alignment is larger than 16. This is not yet supported.
3030     if (Alignment > Align(16))
3031       report_fatal_error(
3032           "Alignment of scalable vectors > 16 bytes is not yet supported");
3033 
3034     Offset = alignTo(Offset + MFI.getObjectSize(FI), Alignment);
3035     if (AssignOffsets)
3036       Assign(FI, -Offset);
3037   }
3038 
3039   return Offset;
3040 }
3041 
3042 int64_t AArch64FrameLowering::estimateSVEStackObjectOffsets(
3043     MachineFrameInfo &MFI) const {
3044   int MinCSFrameIndex, MaxCSFrameIndex;
3045   return determineSVEStackObjectOffsets(MFI, MinCSFrameIndex, MaxCSFrameIndex, false);
3046 }
3047 
3048 int64_t AArch64FrameLowering::assignSVEStackObjectOffsets(
3049     MachineFrameInfo &MFI, int &MinCSFrameIndex, int &MaxCSFrameIndex) const {
3050   return determineSVEStackObjectOffsets(MFI, MinCSFrameIndex, MaxCSFrameIndex,
3051                                         true);
3052 }
3053 
3054 void AArch64FrameLowering::processFunctionBeforeFrameFinalized(
3055     MachineFunction &MF, RegScavenger *RS) const {
3056   MachineFrameInfo &MFI = MF.getFrameInfo();
3057 
3058   assert(getStackGrowthDirection() == TargetFrameLowering::StackGrowsDown &&
3059          "Upwards growing stack unsupported");
3060 
3061   int MinCSFrameIndex, MaxCSFrameIndex;
3062   int64_t SVEStackSize =
3063       assignSVEStackObjectOffsets(MFI, MinCSFrameIndex, MaxCSFrameIndex);
3064 
3065   AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
3066   AFI->setStackSizeSVE(alignTo(SVEStackSize, 16U));
3067   AFI->setMinMaxSVECSFrameIndex(MinCSFrameIndex, MaxCSFrameIndex);
3068 
3069   // If this function isn't doing Win64-style C++ EH, we don't need to do
3070   // anything.
3071   if (!MF.hasEHFunclets())
3072     return;
3073   const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
3074   WinEHFuncInfo &EHInfo = *MF.getWinEHFuncInfo();
3075 
3076   MachineBasicBlock &MBB = MF.front();
3077   auto MBBI = MBB.begin();
3078   while (MBBI != MBB.end() && MBBI->getFlag(MachineInstr::FrameSetup))
3079     ++MBBI;
3080 
3081   // Create an UnwindHelp object.
3082   // The UnwindHelp object is allocated at the start of the fixed object area
3083   int64_t FixedObject =
3084       getFixedObjectSize(MF, AFI, /*IsWin64*/ true, /*IsFunclet*/ false);
3085   int UnwindHelpFI = MFI.CreateFixedObject(/*Size*/ 8,
3086                                            /*SPOffset*/ -FixedObject,
3087                                            /*IsImmutable=*/false);
3088   EHInfo.UnwindHelpFrameIdx = UnwindHelpFI;
3089 
3090   // We need to store -2 into the UnwindHelp object at the start of the
3091   // function.
3092   DebugLoc DL;
3093   RS->enterBasicBlockEnd(MBB);
3094   RS->backward(std::prev(MBBI));
3095   unsigned DstReg = RS->FindUnusedReg(&AArch64::GPR64commonRegClass);
3096   assert(DstReg && "There must be a free register after frame setup");
3097   BuildMI(MBB, MBBI, DL, TII.get(AArch64::MOVi64imm), DstReg).addImm(-2);
3098   BuildMI(MBB, MBBI, DL, TII.get(AArch64::STURXi))
3099       .addReg(DstReg, getKillRegState(true))
3100       .addFrameIndex(UnwindHelpFI)
3101       .addImm(0);
3102 }
3103 
3104 namespace {
3105 struct TagStoreInstr {
3106   MachineInstr *MI;
3107   int64_t Offset, Size;
3108   explicit TagStoreInstr(MachineInstr *MI, int64_t Offset, int64_t Size)
3109       : MI(MI), Offset(Offset), Size(Size) {}
3110 };
3111 
3112 class TagStoreEdit {
3113   MachineFunction *MF;
3114   MachineBasicBlock *MBB;
3115   MachineRegisterInfo *MRI;
3116   // Tag store instructions that are being replaced.
3117   SmallVector<TagStoreInstr, 8> TagStores;
3118   // Combined memref arguments of the above instructions.
3119   SmallVector<MachineMemOperand *, 8> CombinedMemRefs;
3120 
3121   // Replace allocation tags in [FrameReg + FrameRegOffset, FrameReg +
3122   // FrameRegOffset + Size) with the address tag of SP.
3123   Register FrameReg;
3124   StackOffset FrameRegOffset;
3125   int64_t Size;
3126   // If not None, move FrameReg to (FrameReg + FrameRegUpdate) at the end.
3127   Optional<int64_t> FrameRegUpdate;
3128   // MIFlags for any FrameReg updating instructions.
3129   unsigned FrameRegUpdateFlags;
3130 
3131   // Use zeroing instruction variants.
3132   bool ZeroData;
3133   DebugLoc DL;
3134 
3135   void emitUnrolled(MachineBasicBlock::iterator InsertI);
3136   void emitLoop(MachineBasicBlock::iterator InsertI);
3137 
3138 public:
3139   TagStoreEdit(MachineBasicBlock *MBB, bool ZeroData)
3140       : MBB(MBB), ZeroData(ZeroData) {
3141     MF = MBB->getParent();
3142     MRI = &MF->getRegInfo();
3143   }
3144   // Add an instruction to be replaced. Instructions must be added in the
3145   // ascending order of Offset, and have to be adjacent.
3146   void addInstruction(TagStoreInstr I) {
3147     assert((TagStores.empty() ||
3148             TagStores.back().Offset + TagStores.back().Size == I.Offset) &&
3149            "Non-adjacent tag store instructions.");
3150     TagStores.push_back(I);
3151   }
3152   void clear() { TagStores.clear(); }
3153   // Emit equivalent code at the given location, and erase the current set of
3154   // instructions. May skip if the replacement is not profitable. May invalidate
3155   // the input iterator and replace it with a valid one.
3156   void emitCode(MachineBasicBlock::iterator &InsertI,
3157                 const AArch64FrameLowering *TFI, bool IsLast);
3158 };
3159 
3160 void TagStoreEdit::emitUnrolled(MachineBasicBlock::iterator InsertI) {
3161   const AArch64InstrInfo *TII =
3162       MF->getSubtarget<AArch64Subtarget>().getInstrInfo();
3163 
3164   const int64_t kMinOffset = -256 * 16;
3165   const int64_t kMaxOffset = 255 * 16;
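  // Informational note: STG/ST2G take a signed 9-bit immediate scaled by 16,
  // hence the [-256*16, 255*16] byte window above. With an in-range base,
  // e.g. a 48-byte region at offset 0 unrolls roughly to:
  //   st2g sp, [BaseReg, #0]
  //   stg  sp, [BaseReg, #32]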
3166 
3167   Register BaseReg = FrameReg;
3168   int64_t BaseRegOffsetBytes = FrameRegOffset.getFixed();
3169   if (BaseRegOffsetBytes < kMinOffset ||
3170       BaseRegOffsetBytes + (Size - Size % 32) > kMaxOffset) {
3171     Register ScratchReg = MRI->createVirtualRegister(&AArch64::GPR64RegClass);
3172     emitFrameOffset(*MBB, InsertI, DL, ScratchReg, BaseReg,
3173                     StackOffset::getFixed(BaseRegOffsetBytes), TII);
3174     BaseReg = ScratchReg;
3175     BaseRegOffsetBytes = 0;
3176   }
3177 
3178   MachineInstr *LastI = nullptr;
3179   while (Size) {
3180     int64_t InstrSize = (Size > 16) ? 32 : 16;
3181     unsigned Opcode =
3182         InstrSize == 16
3183             ? (ZeroData ? AArch64::STZGOffset : AArch64::STGOffset)
3184             : (ZeroData ? AArch64::STZ2GOffset : AArch64::ST2GOffset);
3185     MachineInstr *I = BuildMI(*MBB, InsertI, DL, TII->get(Opcode))
3186                           .addReg(AArch64::SP)
3187                           .addReg(BaseReg)
3188                           .addImm(BaseRegOffsetBytes / 16)
3189                           .setMemRefs(CombinedMemRefs);
3190     // A store to [BaseReg, #0] should go last for an opportunity to fold the
3191     // final SP adjustment in the epilogue.
3192     if (BaseRegOffsetBytes == 0)
3193       LastI = I;
3194     BaseRegOffsetBytes += InstrSize;
3195     Size -= InstrSize;
3196   }
3197 
3198   if (LastI)
3199     MBB->splice(InsertI, MBB, LastI);
3200 }
3201 
3202 void TagStoreEdit::emitLoop(MachineBasicBlock::iterator InsertI) {
3203   const AArch64InstrInfo *TII =
3204       MF->getSubtarget<AArch64Subtarget>().getInstrInfo();
3205 
3206   Register BaseReg = FrameRegUpdate
3207                          ? FrameReg
3208                          : MRI->createVirtualRegister(&AArch64::GPR64RegClass);
3209   Register SizeReg = MRI->createVirtualRegister(&AArch64::GPR64RegClass);
3210 
3211   emitFrameOffset(*MBB, InsertI, DL, BaseReg, FrameReg, FrameRegOffset, TII);
3212 
3213   int64_t LoopSize = Size;
3214   // If the loop size is not a multiple of 32, split off one 16-byte store at
3215   // the end to fold BaseReg update into.
3216   if (FrameRegUpdate && *FrameRegUpdate)
3217     LoopSize -= LoopSize % 32;
3218   MachineInstr *LoopI = BuildMI(*MBB, InsertI, DL,
3219                                 TII->get(ZeroData ? AArch64::STZGloop_wback
3220                                                   : AArch64::STGloop_wback))
3221                             .addDef(SizeReg)
3222                             .addDef(BaseReg)
3223                             .addImm(LoopSize)
3224                             .addReg(BaseReg)
3225                             .setMemRefs(CombinedMemRefs);
3226   if (FrameRegUpdate)
3227     LoopI->setFlags(FrameRegUpdateFlags);
3228 
3229   int64_t ExtraBaseRegUpdate =
3230       FrameRegUpdate ? (*FrameRegUpdate - FrameRegOffset.getFixed() - Size) : 0;
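  // Illustrative example (assuming FrameRegOffset == 0): tagging Size = 272
  // bytes with a requested FrameRegUpdate of 304 gives LoopSize = 256 and
  // ExtraBaseRegUpdate = 32, so the trailing STGPostIndex below writes the
  // last 16 bytes and post-increments BaseReg by (1 + 32/16) * 16 = 48 bytes.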
3231   if (LoopSize < Size) {
3232     assert(FrameRegUpdate);
3233     assert(Size - LoopSize == 16);
3234     // Tag 16 more bytes at BaseReg and update BaseReg.
3235     BuildMI(*MBB, InsertI, DL,
3236             TII->get(ZeroData ? AArch64::STZGPostIndex : AArch64::STGPostIndex))
3237         .addDef(BaseReg)
3238         .addReg(BaseReg)
3239         .addReg(BaseReg)
3240         .addImm(1 + ExtraBaseRegUpdate / 16)
3241         .setMemRefs(CombinedMemRefs)
3242         .setMIFlags(FrameRegUpdateFlags);
3243   } else if (ExtraBaseRegUpdate) {
3244     // Update BaseReg.
3245     BuildMI(
3246         *MBB, InsertI, DL,
3247         TII->get(ExtraBaseRegUpdate > 0 ? AArch64::ADDXri : AArch64::SUBXri))
3248         .addDef(BaseReg)
3249         .addReg(BaseReg)
3250         .addImm(std::abs(ExtraBaseRegUpdate))
3251         .addImm(0)
3252         .setMIFlags(FrameRegUpdateFlags);
3253   }
3254 }
3255 
3256 // Check if *II is a register update that can be merged into the STGloop that
3257 // ends at (Reg + Size). TotalOffset receives the full update applied to Reg;
3258 // the leftover past the loop end must be encodable and 16-byte aligned.
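// For instance (hypothetical values): an "add sp, sp, #80" following an
// STGloop that ends at sp+64 leaves a 16-byte remainder, so it can be folded
// and *TotalOffset is set to 80.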
3259 bool canMergeRegUpdate(MachineBasicBlock::iterator II, unsigned Reg,
3260                        int64_t Size, int64_t *TotalOffset) {
3261   MachineInstr &MI = *II;
3262   if ((MI.getOpcode() == AArch64::ADDXri ||
3263        MI.getOpcode() == AArch64::SUBXri) &&
3264       MI.getOperand(0).getReg() == Reg && MI.getOperand(1).getReg() == Reg) {
3265     unsigned Shift = AArch64_AM::getShiftValue(MI.getOperand(3).getImm());
3266     int64_t Offset = MI.getOperand(2).getImm() << Shift;
3267     if (MI.getOpcode() == AArch64::SUBXri)
3268       Offset = -Offset;
3269     int64_t AbsPostOffset = std::abs(Offset - Size);
3270     const int64_t kMaxOffset =
3271         0xFFF; // Max encoding for unshifted ADDXri / SUBXri
3272     if (AbsPostOffset <= kMaxOffset && AbsPostOffset % 16 == 0) {
3273       *TotalOffset = Offset;
3274       return true;
3275     }
3276   }
3277   return false;
3278 }
3279 
3280 void mergeMemRefs(const SmallVectorImpl<TagStoreInstr> &TSE,
3281                   SmallVectorImpl<MachineMemOperand *> &MemRefs) {
3282   MemRefs.clear();
3283   for (auto &TS : TSE) {
3284     MachineInstr *MI = TS.MI;
3285     // An instruction without memory operands may access anything. Be
3286     // conservative and return an empty list.
3287     if (MI->memoperands_empty()) {
3288       MemRefs.clear();
3289       return;
3290     }
3291     MemRefs.append(MI->memoperands_begin(), MI->memoperands_end());
3292   }
3293 }
3294 
3295 void TagStoreEdit::emitCode(MachineBasicBlock::iterator &InsertI,
3296                             const AArch64FrameLowering *TFI, bool IsLast) {
3297   if (TagStores.empty())
3298     return;
3299   TagStoreInstr &FirstTagStore = TagStores[0];
3300   TagStoreInstr &LastTagStore = TagStores[TagStores.size() - 1];
3301   Size = LastTagStore.Offset - FirstTagStore.Offset + LastTagStore.Size;
3302   DL = TagStores[0].MI->getDebugLoc();
3303 
3304   Register Reg;
3305   FrameRegOffset = TFI->resolveFrameOffsetReference(
3306       *MF, FirstTagStore.Offset, false /*isFixed*/, false /*isSVE*/, Reg,
3307       /*PreferFP=*/false, /*ForSimm=*/true);
3308   FrameReg = Reg;
3309   FrameRegUpdate = None;
3310 
3311   mergeMemRefs(TagStores, CombinedMemRefs);
3312 
3313   LLVM_DEBUG(dbgs() << "Replacing adjacent STG instructions:\n";
3314              for (const auto &Instr
3315                   : TagStores) { dbgs() << "  " << *Instr.MI; });
3316 
3317   // Size threshold where a loop becomes shorter than a linear sequence of
3318   // tagging instructions.
3319   const int kSetTagLoopThreshold = 176;
3320   if (Size < kSetTagLoopThreshold) {
3321     if (TagStores.size() < 2)
3322       return;
3323     emitUnrolled(InsertI);
3324   } else {
3325     MachineInstr *UpdateInstr = nullptr;
3326     int64_t TotalOffset;
3327     if (IsLast) {
3328       // See if we can merge base register update into the STGloop.
3329       // This is done in AArch64LoadStoreOptimizer for "normal" stores,
3330       // but STGloop is way too unusual for that, and also it only
3331       // realistically happens in function epilogue. Also, STGloop is expanded
3332       // before that pass.
3333       if (InsertI != MBB->end() &&
3334           canMergeRegUpdate(InsertI, FrameReg, FrameRegOffset.getFixed() + Size,
3335                             &TotalOffset)) {
3336         UpdateInstr = &*InsertI++;
3337         LLVM_DEBUG(dbgs() << "Folding SP update into loop:\n  "
3338                           << *UpdateInstr);
3339       }
3340     }
3341 
3342     if (!UpdateInstr && TagStores.size() < 2)
3343       return;
3344 
3345     if (UpdateInstr) {
3346       FrameRegUpdate = TotalOffset;
3347       FrameRegUpdateFlags = UpdateInstr->getFlags();
3348     }
3349     emitLoop(InsertI);
3350     if (UpdateInstr)
3351       UpdateInstr->eraseFromParent();
3352   }
3353 
3354   for (auto &TS : TagStores)
3355     TS.MI->eraseFromParent();
3356 }
3357 
3358 bool isMergeableStackTaggingInstruction(MachineInstr &MI, int64_t &Offset,
3359                                         int64_t &Size, bool &ZeroData) {
3360   MachineFunction &MF = *MI.getParent()->getParent();
3361   const MachineFrameInfo &MFI = MF.getFrameInfo();
3362 
3363   unsigned Opcode = MI.getOpcode();
3364   ZeroData = (Opcode == AArch64::STZGloop || Opcode == AArch64::STZGOffset ||
3365               Opcode == AArch64::STZ2GOffset);
3366 
3367   if (Opcode == AArch64::STGloop || Opcode == AArch64::STZGloop) {
3368     if (!MI.getOperand(0).isDead() || !MI.getOperand(1).isDead())
3369       return false;
3370     if (!MI.getOperand(2).isImm() || !MI.getOperand(3).isFI())
3371       return false;
3372     Offset = MFI.getObjectOffset(MI.getOperand(3).getIndex());
3373     Size = MI.getOperand(2).getImm();
3374     return true;
3375   }
3376 
3377   if (Opcode == AArch64::STGOffset || Opcode == AArch64::STZGOffset)
3378     Size = 16;
3379   else if (Opcode == AArch64::ST2GOffset || Opcode == AArch64::STZ2GOffset)
3380     Size = 32;
3381   else
3382     return false;
3383 
3384   if (MI.getOperand(0).getReg() != AArch64::SP || !MI.getOperand(1).isFI())
3385     return false;
3386 
3387   Offset = MFI.getObjectOffset(MI.getOperand(1).getIndex()) +
3388            16 * MI.getOperand(2).getImm();
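  // E.g. (illustrative MIR) "STGOffset $sp, %stack.3, 2" over a slot placed
  // at object offset -80 yields Offset = -80 + 16*2 = -48 and Size = 16.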
3389   return true;
3390 }
3391 
3392 // Detect a run of memory tagging instructions for adjacent stack frame slots,
3393 // and replace them with a shorter instruction sequence:
3394 // * replace STG + STG with ST2G
3395 // * replace STGloop + STGloop with STGloop
3396 // This code needs to run when stack slot offsets are already known, but before
3397 // FrameIndex operands in STG instructions are eliminated.
3398 MachineBasicBlock::iterator tryMergeAdjacentSTG(MachineBasicBlock::iterator II,
3399                                                 const AArch64FrameLowering *TFI,
3400                                                 RegScavenger *RS) {
3401   bool FirstZeroData;
3402   int64_t Size, Offset;
3403   MachineInstr &MI = *II;
3404   MachineBasicBlock *MBB = MI.getParent();
3405   MachineBasicBlock::iterator NextI = ++II;
3406   if (&MI == &MBB->instr_back())
3407     return II;
3408   if (!isMergeableStackTaggingInstruction(MI, Offset, Size, FirstZeroData))
3409     return II;
3410 
3411   SmallVector<TagStoreInstr, 4> Instrs;
3412   Instrs.emplace_back(&MI, Offset, Size);
3413 
3414   constexpr int kScanLimit = 10;
3415   int Count = 0;
3416   for (MachineBasicBlock::iterator E = MBB->end();
3417        NextI != E && Count < kScanLimit; ++NextI) {
3418     MachineInstr &MI = *NextI;
3419     bool ZeroData;
3420     int64_t Size, Offset;
3421     // Collect instructions that update memory tags with a FrameIndex operand
3422     // and (when applicable) constant size, and whose output registers are dead
3423     // (the latter is almost always the case in practice). Since these
3424     // instructions effectively have no inputs or outputs, we are free to skip
3425     // any non-aliasing instructions in between without tracking used registers.
3426     if (isMergeableStackTaggingInstruction(MI, Offset, Size, ZeroData)) {
3427       if (ZeroData != FirstZeroData)
3428         break;
3429       Instrs.emplace_back(&MI, Offset, Size);
3430       continue;
3431     }
3432 
3433     // Only count non-transient, non-tagging instructions toward the scan
3434     // limit.
3435     if (!MI.isTransient())
3436       ++Count;
3437 
3438     // Just in case, stop before the epilogue code starts.
3439     if (MI.getFlag(MachineInstr::FrameSetup) ||
3440         MI.getFlag(MachineInstr::FrameDestroy))
3441       break;
3442 
3443     // Reject anything that may alias the collected instructions.
3444     if (MI.mayLoadOrStore() || MI.hasUnmodeledSideEffects())
3445       break;
3446   }
3447 
3448   // New code will be inserted after the last tagging instruction we've found.
3449   MachineBasicBlock::iterator InsertI = Instrs.back().MI;
3450   InsertI++;
3451 
3452   llvm::stable_sort(Instrs,
3453                     [](const TagStoreInstr &Left, const TagStoreInstr &Right) {
3454                       return Left.Offset < Right.Offset;
3455                     });
3456 
3457   // Make sure that we don't have any overlapping stores.
3458   int64_t CurOffset = Instrs[0].Offset;
3459   for (auto &Instr : Instrs) {
3460     if (CurOffset > Instr.Offset)
3461       return NextI;
3462     CurOffset = Instr.Offset + Instr.Size;
3463   }
3464 
3465   // Find contiguous runs of tagged memory and emit shorter instruction
3466   // sequences for them when possible.
3467   TagStoreEdit TSE(MBB, FirstZeroData);
3468   Optional<int64_t> EndOffset;
3469   for (auto &Instr : Instrs) {
3470     if (EndOffset && *EndOffset != Instr.Offset) {
3471       // Found a gap.
3472       TSE.emitCode(InsertI, TFI, /*IsLast = */ false);
3473       TSE.clear();
3474     }
3475 
3476     TSE.addInstruction(Instr);
3477     EndOffset = Instr.Offset + Instr.Size;
3478   }
3479 
3480   TSE.emitCode(InsertI, TFI, /*IsLast = */ true);
3481 
3482   return InsertI;
3483 }
3484 } // namespace
3485 
3486 void AArch64FrameLowering::processFunctionBeforeFrameIndicesReplaced(
3487     MachineFunction &MF, RegScavenger *RS = nullptr) const {
3488   if (StackTaggingMergeSetTag)
3489     for (auto &BB : MF)
3490       for (MachineBasicBlock::iterator II = BB.begin(); II != BB.end();)
3491         II = tryMergeAdjacentSTG(II, this, RS);
3492 }
3493 
3494 /// For Win64 AArch64 EH, the offset to the Unwind object is from the SP
3495 /// before the update.  This is easily retrieved as it is exactly the offset
3496 /// that is set in processFunctionBeforeFrameFinalized.
3497 StackOffset AArch64FrameLowering::getFrameIndexReferencePreferSP(
3498     const MachineFunction &MF, int FI, Register &FrameReg,
3499     bool IgnoreSPUpdates) const {
3500   const MachineFrameInfo &MFI = MF.getFrameInfo();
3501   if (IgnoreSPUpdates) {
3502     LLVM_DEBUG(dbgs() << "Offset from the SP for " << FI << " is "
3503                       << MFI.getObjectOffset(FI) << "\n");
3504     FrameReg = AArch64::SP;
3505     return StackOffset::getFixed(MFI.getObjectOffset(FI));
3506   }
3507 
3508   return getFrameIndexReference(MF, FI, FrameReg);
3509 }
3510 
3511 /// The parent frame offset (aka dispFrame) is only used on X86_64 to retrieve
3512 /// the parent's frame pointer
3513 unsigned AArch64FrameLowering::getWinEHParentFrameOffset(
3514     const MachineFunction &MF) const {
3515   return 0;
3516 }
3517 
3518 /// Funclets only need to account for space for the callee saved registers,
3519 /// as the locals are accounted for in the parent's stack frame.
3520 unsigned AArch64FrameLowering::getWinEHFuncletFrameSize(
3521     const MachineFunction &MF) const {
3522   // This is the size of the pushed CSRs.
3523   unsigned CSSize =
3524       MF.getInfo<AArch64FunctionInfo>()->getCalleeSavedStackSize();
3525   // This is the amount of stack a funclet needs to allocate.
3526   return alignTo(CSSize + MF.getFrameInfo().getMaxCallFrameSize(),
3527                  getStackAlign());
3528 }
3529 
3530 namespace {
3531 struct FrameObject {
3532   bool IsValid = false;
3533   // Index of the object in MFI.
3534   int ObjectIndex = 0;
3535   // Group ID this object belongs to.
3536   int GroupIndex = -1;
3537   // This object should be placed first (closest to SP).
3538   bool ObjectFirst = false;
3539   // This object's group (which always contains the object with
3540   // ObjectFirst==true) should be placed first.
3541   bool GroupFirst = false;
3542 };
3543 
3544 class GroupBuilder {
3545   SmallVector<int, 8> CurrentMembers;
3546   int NextGroupIndex = 0;
3547   std::vector<FrameObject> &Objects;
3548 
3549 public:
3550   GroupBuilder(std::vector<FrameObject> &Objects) : Objects(Objects) {}
3551   void AddMember(int Index) { CurrentMembers.push_back(Index); }
3552   void EndCurrentGroup() {
3553     if (CurrentMembers.size() > 1) {
3554       // Create a new group with the current member list. This might remove them
3555       // from their pre-existing groups. That's OK, dealing with overlapping
3556       // groups is too hard and unlikely to make a difference.
3557       LLVM_DEBUG(dbgs() << "group:");
3558       for (int Index : CurrentMembers) {
3559         Objects[Index].GroupIndex = NextGroupIndex;
3560         LLVM_DEBUG(dbgs() << " " << Index);
3561       }
3562       LLVM_DEBUG(dbgs() << "\n");
3563       NextGroupIndex++;
3564     }
3565     CurrentMembers.clear();
3566   }
3567 };
3568 
3569 bool FrameObjectCompare(const FrameObject &A, const FrameObject &B) {
3570   // Objects at a lower index are closer to FP; objects at a higher index are
3571   // closer to SP.
3572   //
3573   // For consistency in our comparison, all invalid objects are placed
3574   // at the end. This also allows us to stop walking when we hit the
3575   // first invalid item after it's all sorted.
3576   //
3577   // The "first" object goes first (closest to SP), followed by the members of
3578   // the "first" group.
3579   //
3580   // The rest are sorted by the group index to keep the groups together.
3581   // Higher numbered groups are more likely to be around longer (i.e. untagged
3582   // in the function epilogue and not at some earlier point). Place them closer
3583   // to SP.
3584   //
3585   // If all else equal, sort by the object index to keep the objects in the
3586   // original order.
3587   return std::make_tuple(!A.IsValid, A.ObjectFirst, A.GroupFirst, A.GroupIndex,
3588                          A.ObjectIndex) <
3589          std::make_tuple(!B.IsValid, B.ObjectFirst, B.GroupFirst, B.GroupIndex,
3590                          B.ObjectIndex);
3591 }
3592 } // namespace
3593 
3594 void AArch64FrameLowering::orderFrameObjects(
3595     const MachineFunction &MF, SmallVectorImpl<int> &ObjectsToAllocate) const {
3596   if (!OrderFrameObjects || ObjectsToAllocate.empty())
3597     return;
3598 
3599   const MachineFrameInfo &MFI = MF.getFrameInfo();
3600   std::vector<FrameObject> FrameObjects(MFI.getObjectIndexEnd());
3601   for (auto &Obj : ObjectsToAllocate) {
3602     FrameObjects[Obj].IsValid = true;
3603     FrameObjects[Obj].ObjectIndex = Obj;
3604   }
3605 
3606   // Identify stack slots that are tagged at the same time.
3607   GroupBuilder GB(FrameObjects);
3608   for (auto &MBB : MF) {
3609     for (auto &MI : MBB) {
3610       if (MI.isDebugInstr())
3611         continue;
3612       int OpIndex;
3613       switch (MI.getOpcode()) {
3614       case AArch64::STGloop:
3615       case AArch64::STZGloop:
3616         OpIndex = 3;
3617         break;
3618       case AArch64::STGOffset:
3619       case AArch64::STZGOffset:
3620       case AArch64::ST2GOffset:
3621       case AArch64::STZ2GOffset:
3622         OpIndex = 1;
3623         break;
3624       default:
3625         OpIndex = -1;
3626       }
3627 
3628       int TaggedFI = -1;
3629       if (OpIndex >= 0) {
3630         const MachineOperand &MO = MI.getOperand(OpIndex);
3631         if (MO.isFI()) {
3632           int FI = MO.getIndex();
3633           if (FI >= 0 && FI < MFI.getObjectIndexEnd() &&
3634               FrameObjects[FI].IsValid)
3635             TaggedFI = FI;
3636         }
3637       }
3638 
3639       // If this is a stack tagging instruction for a slot that is not part of a
3640       // group yet, either start a new group or add it to the current one.
3641       if (TaggedFI >= 0)
3642         GB.AddMember(TaggedFI);
3643       else
3644         GB.EndCurrentGroup();
3645     }
3646     // Groups should never span multiple basic blocks.
3647     GB.EndCurrentGroup();
3648   }
3649 
3650   // If the function's tagged base pointer is pinned to a stack slot, we want to
3651   // put that slot first when possible. This will likely place it at SP + 0,
3652   // and save one instruction when generating the base pointer because IRG does
3653   // not allow an immediate offset.
3654   const AArch64FunctionInfo &AFI = *MF.getInfo<AArch64FunctionInfo>();
3655   Optional<int> TBPI = AFI.getTaggedBasePointerIndex();
3656   if (TBPI) {
3657     FrameObjects[*TBPI].ObjectFirst = true;
3658     FrameObjects[*TBPI].GroupFirst = true;
3659     int FirstGroupIndex = FrameObjects[*TBPI].GroupIndex;
3660     if (FirstGroupIndex >= 0)
3661       for (FrameObject &Object : FrameObjects)
3662         if (Object.GroupIndex == FirstGroupIndex)
3663           Object.GroupFirst = true;
3664   }
3665 
3666   llvm::stable_sort(FrameObjects, FrameObjectCompare);
3667 
3668   int i = 0;
3669   for (auto &Obj : FrameObjects) {
3670     // All invalid items are sorted at the end, so it's safe to stop.
3671     if (!Obj.IsValid)
3672       break;
3673     ObjectsToAllocate[i++] = Obj.ObjectIndex;
3674   }
3675 
3676   LLVM_DEBUG(dbgs() << "Final frame order:\n"; for (auto &Obj
3677                                                     : FrameObjects) {
3678     if (!Obj.IsValid)
3679       break;
3680     dbgs() << "  " << Obj.ObjectIndex << ": group " << Obj.GroupIndex;
3681     if (Obj.ObjectFirst)
3682       dbgs() << ", first";
3683     if (Obj.GroupFirst)
3684       dbgs() << ", group-first";
3685     dbgs() << "\n";
3686   });
3687 }
3688