1 //===- AArch64FrameLowering.cpp - AArch64 Frame Lowering -------*- C++ -*-====//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file contains the AArch64 implementation of TargetFrameLowering class.
10 //
11 // On AArch64, stack frames are structured as follows:
12 //
13 // The stack grows downward.
14 //
15 // All of the individual frame areas on the frame below are optional, i.e. it's
16 // possible to create a function so that the particular area isn't present
17 // in the frame.
18 //
19 // At function entry, the "frame" looks as follows:
20 //
21 // |                                   | Higher address
22 // |-----------------------------------|
23 // |                                   |
24 // | arguments passed on the stack     |
25 // |                                   |
26 // |-----------------------------------| <- sp
27 // |                                   | Lower address
28 //
29 //
30 // After the prologue has run, the frame has the following general structure.
31 // Note that this doesn't depict the case where a red-zone is used. Also,
32 // technically the last frame area (VLAs) doesn't get created until in the
33 // main function body, after the prologue is run. However, it's depicted here
34 // for completeness.
35 //
36 // |                                   | Higher address
37 // |-----------------------------------|
38 // |                                   |
39 // | arguments passed on the stack     |
40 // |                                   |
41 // |-----------------------------------|
42 // |                                   |
43 // | (Win64 only) varargs from reg     |
44 // |                                   |
45 // |-----------------------------------|
46 // |                                   |
47 // | callee-saved gpr registers        | <--.
48 // |                                   |    | On Darwin platforms these
49 // |- - - - - - - - - - - - - - - - - -|    | callee saves are swapped,
50 // |                                   |    | (frame record first)
51 // | prev_fp, prev_lr                  | <--'
52 // | (a.k.a. "frame record")           |
53 // |-----------------------------------| <- fp(=x29)
54 // |                                   |
55 // | callee-saved fp/simd/SVE regs     |
56 // |                                   |
57 // |-----------------------------------|
58 // |                                   |
59 // |        SVE stack objects          |
60 // |                                   |
61 // |-----------------------------------|
62 // |.empty.space.to.make.part.below....|
63 // |.aligned.in.case.it.needs.more.than| (size of this area is unknown at
64 // |.the.standard.16-byte.alignment....|  compile time; if present)
65 // |-----------------------------------|
66 // |                                   |
67 // | local variables of fixed size     |
68 // | including spill slots             |
69 // |-----------------------------------| <- bp(not defined by ABI,
70 // |.variable-sized.local.variables....|       LLVM chooses X19)
71 // |.(VLAs)............................| (size of this area is unknown at
72 // |...................................|  compile time)
73 // |-----------------------------------| <- sp
74 // |                                   | Lower address
75 //
76 //
77 // To access the data in a frame, at-compile time, a constant offset must be
78 // computable from one of the pointers (fp, bp, sp) to access it. The size
79 // of the areas with a dotted background cannot be computed at compile-time
80 // if they are present, making it required to have all three of fp, bp and
81 // sp to be set up to be able to access all contents in the frame areas,
82 // assuming all of the frame areas are non-empty.
83 //
84 // For most functions, some of the frame areas are empty. For those functions,
85 // it may not be necessary to set up fp or bp:
86 // * A base pointer is definitely needed when there are both VLAs and local
87 //   variables with more-than-default alignment requirements.
88 // * A frame pointer is definitely needed when there are local variables with
89 //   more-than-default alignment requirements.
90 //
91 // For Darwin platforms the frame-record (fp, lr) is stored at the top of the
92 // callee-saved area, since the unwind encoding does not allow for encoding
93 // this dynamically and existing tools depend on this layout. For other
94 // platforms, the frame-record is stored at the bottom of the (gpr) callee-saved
95 // area to allow SVE stack objects (allocated directly below the callee-saves,
96 // if available) to be accessed directly from the framepointer.
97 // The SVE spill/fill instructions have VL-scaled addressing modes such
98 // as:
99 //    ldr z8, [fp, #-7 mul vl]
100 // For SVE the size of the vector length (VL) is not known at compile-time, so
101 // '#-7 mul vl' is an offset that can only be evaluated at runtime. With this
102 // layout, we don't need to add an unscaled offset to the framepointer before
103 // accessing the SVE object in the frame.
104 //
105 // In some cases when a base pointer is not strictly needed, it is generated
106 // anyway when offsets from the frame pointer to access local variables become
107 // so large that the offset can't be encoded in the immediate fields of loads
108 // or stores.
109 //
110 // FIXME: also explain the redzone concept.
111 // FIXME: also explain the concept of reserved call frames.
112 //
113 //===----------------------------------------------------------------------===//
114 
115 #include "AArch64FrameLowering.h"
116 #include "AArch64InstrInfo.h"
117 #include "AArch64MachineFunctionInfo.h"
118 #include "AArch64RegisterInfo.h"
119 #include "AArch64Subtarget.h"
120 #include "AArch64TargetMachine.h"
121 #include "MCTargetDesc/AArch64AddressingModes.h"
122 #include "llvm/ADT/ScopeExit.h"
123 #include "llvm/ADT/SmallVector.h"
124 #include "llvm/ADT/Statistic.h"
125 #include "llvm/CodeGen/LivePhysRegs.h"
126 #include "llvm/CodeGen/MachineBasicBlock.h"
127 #include "llvm/CodeGen/MachineFrameInfo.h"
128 #include "llvm/CodeGen/MachineFunction.h"
129 #include "llvm/CodeGen/MachineInstr.h"
130 #include "llvm/CodeGen/MachineInstrBuilder.h"
131 #include "llvm/CodeGen/MachineMemOperand.h"
132 #include "llvm/CodeGen/MachineModuleInfo.h"
133 #include "llvm/CodeGen/MachineOperand.h"
134 #include "llvm/CodeGen/MachineRegisterInfo.h"
135 #include "llvm/CodeGen/RegisterScavenging.h"
136 #include "llvm/CodeGen/TargetInstrInfo.h"
137 #include "llvm/CodeGen/TargetRegisterInfo.h"
138 #include "llvm/CodeGen/TargetSubtargetInfo.h"
139 #include "llvm/CodeGen/WinEHFuncInfo.h"
140 #include "llvm/IR/Attributes.h"
141 #include "llvm/IR/CallingConv.h"
142 #include "llvm/IR/DataLayout.h"
143 #include "llvm/IR/DebugLoc.h"
144 #include "llvm/IR/Function.h"
145 #include "llvm/MC/MCAsmInfo.h"
146 #include "llvm/MC/MCDwarf.h"
147 #include "llvm/Support/CommandLine.h"
148 #include "llvm/Support/Debug.h"
149 #include "llvm/Support/ErrorHandling.h"
150 #include "llvm/Support/LEB128.h"
151 #include "llvm/Support/MathExtras.h"
152 #include "llvm/Support/raw_ostream.h"
153 #include "llvm/Target/TargetMachine.h"
154 #include "llvm/Target/TargetOptions.h"
155 #include <cassert>
156 #include <cstdint>
157 #include <iterator>
158 #include <vector>
159 
160 using namespace llvm;
161 
162 #define DEBUG_TYPE "frame-info"
163 
164 static cl::opt<bool> EnableRedZone("aarch64-redzone",
165                                    cl::desc("enable use of redzone on AArch64"),
166                                    cl::init(false), cl::Hidden);
167 
168 static cl::opt<bool>
169     ReverseCSRRestoreSeq("reverse-csr-restore-seq",
170                          cl::desc("reverse the CSR restore sequence"),
171                          cl::init(false), cl::Hidden);
172 
173 static cl::opt<bool> StackTaggingMergeSetTag(
174     "stack-tagging-merge-settag",
175     cl::desc("merge settag instruction in function epilog"), cl::init(true),
176     cl::Hidden);
177 
178 static cl::opt<bool> OrderFrameObjects("aarch64-order-frame-objects",
179                                        cl::desc("sort stack allocations"),
180                                        cl::init(true), cl::Hidden);
181 
182 STATISTIC(NumRedZoneFunctions, "Number of functions using red zone");
183 
184 /// Returns the argument pop size.
getArgumentPopSize(MachineFunction & MF,MachineBasicBlock & MBB)185 static uint64_t getArgumentPopSize(MachineFunction &MF,
186                                    MachineBasicBlock &MBB) {
187   MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
188   bool IsTailCallReturn = false;
189   if (MBB.end() != MBBI) {
190     unsigned RetOpcode = MBBI->getOpcode();
191     IsTailCallReturn = RetOpcode == AArch64::TCRETURNdi ||
192                        RetOpcode == AArch64::TCRETURNri ||
193                        RetOpcode == AArch64::TCRETURNriBTI;
194   }
195   AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
196 
197   uint64_t ArgumentPopSize = 0;
198   if (IsTailCallReturn) {
199     MachineOperand &StackAdjust = MBBI->getOperand(1);
200 
201     // For a tail-call in a callee-pops-arguments environment, some or all of
202     // the stack may actually be in use for the call's arguments, this is
203     // calculated during LowerCall and consumed here...
204     ArgumentPopSize = StackAdjust.getImm();
205   } else {
206     // ... otherwise the amount to pop is *all* of the argument space,
207     // conveniently stored in the MachineFunctionInfo by
208     // LowerFormalArguments. This will, of course, be zero for the C calling
209     // convention.
210     ArgumentPopSize = AFI->getArgumentStackToRestore();
211   }
212 
213   return ArgumentPopSize;
214 }
215 
216 /// This is the biggest offset to the stack pointer we can encode in aarch64
217 /// instructions (without using a separate calculation and a temp register).
218 /// Note that the exception here are vector stores/loads which cannot encode any
219 /// displacements (see estimateRSStackSizeLimit(), isAArch64FrameOffsetLegal()).
220 static const unsigned DefaultSafeSPDisplacement = 255;
221 
222 /// Look at each instruction that references stack frames and return the stack
223 /// size limit beyond which some of these instructions will require a scratch
224 /// register during their expansion later.
estimateRSStackSizeLimit(MachineFunction & MF)225 static unsigned estimateRSStackSizeLimit(MachineFunction &MF) {
226   // FIXME: For now, just conservatively guestimate based on unscaled indexing
227   // range. We'll end up allocating an unnecessary spill slot a lot, but
228   // realistically that's not a big deal at this stage of the game.
229   for (MachineBasicBlock &MBB : MF) {
230     for (MachineInstr &MI : MBB) {
231       if (MI.isDebugInstr() || MI.isPseudo() ||
232           MI.getOpcode() == AArch64::ADDXri ||
233           MI.getOpcode() == AArch64::ADDSXri)
234         continue;
235 
236       for (const MachineOperand &MO : MI.operands()) {
237         if (!MO.isFI())
238           continue;
239 
240         StackOffset Offset;
241         if (isAArch64FrameOffsetLegal(MI, Offset, nullptr, nullptr, nullptr) ==
242             AArch64FrameOffsetCannotUpdate)
243           return 0;
244       }
245     }
246   }
247   return DefaultSafeSPDisplacement;
248 }
249 
250 TargetStackID::Value
getStackIDForScalableVectors() const251 AArch64FrameLowering::getStackIDForScalableVectors() const {
252   return TargetStackID::ScalableVector;
253 }
254 
255 /// Returns the size of the fixed object area (allocated next to sp on entry)
256 /// On Win64 this may include a var args area and an UnwindHelp object for EH.
getFixedObjectSize(const MachineFunction & MF,const AArch64FunctionInfo * AFI,bool IsWin64,bool IsFunclet)257 static unsigned getFixedObjectSize(const MachineFunction &MF,
258                                    const AArch64FunctionInfo *AFI, bool IsWin64,
259                                    bool IsFunclet) {
260   if (!IsWin64 || IsFunclet) {
261     // Only Win64 uses fixed objects, and then only for the function (not
262     // funclets)
263     return 0;
264   } else {
265     // Var args are stored here in the primary function.
266     const unsigned VarArgsArea = AFI->getVarArgsGPRSize();
267     // To support EH funclets we allocate an UnwindHelp object
268     const unsigned UnwindHelpObject = (MF.hasEHFunclets() ? 8 : 0);
269     return alignTo(VarArgsArea + UnwindHelpObject, 16);
270   }
271 }
272 
273 /// Returns the size of the entire SVE stackframe (calleesaves + spills).
getSVEStackSize(const MachineFunction & MF)274 static StackOffset getSVEStackSize(const MachineFunction &MF) {
275   const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
276   return StackOffset::getScalable((int64_t)AFI->getStackSizeSVE());
277 }
278 
canUseRedZone(const MachineFunction & MF) const279 bool AArch64FrameLowering::canUseRedZone(const MachineFunction &MF) const {
280   if (!EnableRedZone)
281     return false;
282   // Don't use the red zone if the function explicitly asks us not to.
283   // This is typically used for kernel code.
284   if (MF.getFunction().hasFnAttribute(Attribute::NoRedZone))
285     return false;
286 
287   const MachineFrameInfo &MFI = MF.getFrameInfo();
288   const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
289   uint64_t NumBytes = AFI->getLocalStackSize();
290 
291   return !(MFI.hasCalls() || hasFP(MF) || NumBytes > 128 ||
292            getSVEStackSize(MF));
293 }
294 
295 /// hasFP - Return true if the specified function should have a dedicated frame
296 /// pointer register.
hasFP(const MachineFunction & MF) const297 bool AArch64FrameLowering::hasFP(const MachineFunction &MF) const {
298   const MachineFrameInfo &MFI = MF.getFrameInfo();
299   const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo();
300   // Win64 EH requires a frame pointer if funclets are present, as the locals
301   // are accessed off the frame pointer in both the parent function and the
302   // funclets.
303   if (MF.hasEHFunclets())
304     return true;
305   // Retain behavior of always omitting the FP for leaf functions when possible.
306   if (MF.getTarget().Options.DisableFramePointerElim(MF))
307     return true;
308   if (MFI.hasVarSizedObjects() || MFI.isFrameAddressTaken() ||
309       MFI.hasStackMap() || MFI.hasPatchPoint() ||
310       RegInfo->needsStackRealignment(MF))
311     return true;
312   // With large callframes around we may need to use FP to access the scavenging
313   // emergency spillslot.
314   //
315   // Unfortunately some calls to hasFP() like machine verifier ->
316   // getReservedReg() -> hasFP in the middle of global isel are too early
317   // to know the max call frame size. Hopefully conservatively returning "true"
318   // in those cases is fine.
319   // DefaultSafeSPDisplacement is fine as we only emergency spill GP regs.
320   if (!MFI.isMaxCallFrameSizeComputed() ||
321       MFI.getMaxCallFrameSize() > DefaultSafeSPDisplacement)
322     return true;
323 
324   return false;
325 }
326 
327 /// hasReservedCallFrame - Under normal circumstances, when a frame pointer is
328 /// not required, we reserve argument space for call sites in the function
329 /// immediately on entry to the current function.  This eliminates the need for
330 /// add/sub sp brackets around call sites.  Returns true if the call frame is
331 /// included as part of the stack frame.
332 bool
hasReservedCallFrame(const MachineFunction & MF) const333 AArch64FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const {
334   return !MF.getFrameInfo().hasVarSizedObjects();
335 }
336 
eliminateCallFramePseudoInstr(MachineFunction & MF,MachineBasicBlock & MBB,MachineBasicBlock::iterator I) const337 MachineBasicBlock::iterator AArch64FrameLowering::eliminateCallFramePseudoInstr(
338     MachineFunction &MF, MachineBasicBlock &MBB,
339     MachineBasicBlock::iterator I) const {
340   const AArch64InstrInfo *TII =
341       static_cast<const AArch64InstrInfo *>(MF.getSubtarget().getInstrInfo());
342   DebugLoc DL = I->getDebugLoc();
343   unsigned Opc = I->getOpcode();
344   bool IsDestroy = Opc == TII->getCallFrameDestroyOpcode();
345   uint64_t CalleePopAmount = IsDestroy ? I->getOperand(1).getImm() : 0;
346 
347   if (!hasReservedCallFrame(MF)) {
348     int64_t Amount = I->getOperand(0).getImm();
349     Amount = alignTo(Amount, getStackAlign());
350     if (!IsDestroy)
351       Amount = -Amount;
352 
353     // N.b. if CalleePopAmount is valid but zero (i.e. callee would pop, but it
354     // doesn't have to pop anything), then the first operand will be zero too so
355     // this adjustment is a no-op.
356     if (CalleePopAmount == 0) {
357       // FIXME: in-function stack adjustment for calls is limited to 24-bits
358       // because there's no guaranteed temporary register available.
359       //
360       // ADD/SUB (immediate) has only LSL #0 and LSL #12 available.
361       // 1) For offset <= 12-bit, we use LSL #0
362       // 2) For 12-bit <= offset <= 24-bit, we use two instructions. One uses
363       // LSL #0, and the other uses LSL #12.
364       //
365       // Most call frames will be allocated at the start of a function so
366       // this is OK, but it is a limitation that needs dealing with.
367       assert(Amount > -0xffffff && Amount < 0xffffff && "call frame too large");
368       emitFrameOffset(MBB, I, DL, AArch64::SP, AArch64::SP,
369                       StackOffset::getFixed(Amount), TII);
370     }
371   } else if (CalleePopAmount != 0) {
372     // If the calling convention demands that the callee pops arguments from the
373     // stack, we want to add it back if we have a reserved call frame.
374     assert(CalleePopAmount < 0xffffff && "call frame too large");
375     emitFrameOffset(MBB, I, DL, AArch64::SP, AArch64::SP,
376                     StackOffset::getFixed(-(int64_t)CalleePopAmount), TII);
377   }
378   return MBB.erase(I);
379 }
380 
381 // Convenience function to create a DWARF expression for
382 //   Expr + NumBytes + NumVGScaledBytes * AArch64::VG
appendVGScaledOffsetExpr(SmallVectorImpl<char> & Expr,int NumBytes,int NumVGScaledBytes,unsigned VG,llvm::raw_string_ostream & Comment)383 static void appendVGScaledOffsetExpr(SmallVectorImpl<char> &Expr,
384                                      int NumBytes, int NumVGScaledBytes, unsigned VG,
385                                      llvm::raw_string_ostream &Comment) {
386   uint8_t buffer[16];
387 
388   if (NumBytes) {
389     Expr.push_back(dwarf::DW_OP_consts);
390     Expr.append(buffer, buffer + encodeSLEB128(NumBytes, buffer));
391     Expr.push_back((uint8_t)dwarf::DW_OP_plus);
392     Comment << (NumBytes < 0 ? " - " : " + ") << std::abs(NumBytes);
393   }
394 
395   if (NumVGScaledBytes) {
396     Expr.push_back((uint8_t)dwarf::DW_OP_consts);
397     Expr.append(buffer, buffer + encodeSLEB128(NumVGScaledBytes, buffer));
398 
399     Expr.push_back((uint8_t)dwarf::DW_OP_bregx);
400     Expr.append(buffer, buffer + encodeULEB128(VG, buffer));
401     Expr.push_back(0);
402 
403     Expr.push_back((uint8_t)dwarf::DW_OP_mul);
404     Expr.push_back((uint8_t)dwarf::DW_OP_plus);
405 
406     Comment << (NumVGScaledBytes < 0 ? " - " : " + ")
407             << std::abs(NumVGScaledBytes) << " * VG";
408   }
409 }
410 
411 // Creates an MCCFIInstruction:
412 //    { DW_CFA_def_cfa_expression, ULEB128 (sizeof expr), expr }
createDefCFAExpressionFromSP(const TargetRegisterInfo & TRI,const StackOffset & OffsetFromSP) const413 MCCFIInstruction AArch64FrameLowering::createDefCFAExpressionFromSP(
414     const TargetRegisterInfo &TRI, const StackOffset &OffsetFromSP) const {
415   int64_t NumBytes, NumVGScaledBytes;
416   AArch64InstrInfo::decomposeStackOffsetForDwarfOffsets(OffsetFromSP, NumBytes,
417                                                         NumVGScaledBytes);
418 
419   std::string CommentBuffer = "sp";
420   llvm::raw_string_ostream Comment(CommentBuffer);
421 
422   // Build up the expression (SP + NumBytes + NumVGScaledBytes * AArch64::VG)
423   SmallString<64> Expr;
424   Expr.push_back((uint8_t)(dwarf::DW_OP_breg0 + /*SP*/ 31));
425   Expr.push_back(0);
426   appendVGScaledOffsetExpr(Expr, NumBytes, NumVGScaledBytes,
427                            TRI.getDwarfRegNum(AArch64::VG, true), Comment);
428 
429   // Wrap this into DW_CFA_def_cfa.
430   SmallString<64> DefCfaExpr;
431   DefCfaExpr.push_back(dwarf::DW_CFA_def_cfa_expression);
432   uint8_t buffer[16];
433   DefCfaExpr.append(buffer,
434                     buffer + encodeULEB128(Expr.size(), buffer));
435   DefCfaExpr.append(Expr.str());
436   return MCCFIInstruction::createEscape(nullptr, DefCfaExpr.str(),
437                                         Comment.str());
438 }
439 
createCfaOffset(const TargetRegisterInfo & TRI,unsigned Reg,const StackOffset & OffsetFromDefCFA) const440 MCCFIInstruction AArch64FrameLowering::createCfaOffset(
441     const TargetRegisterInfo &TRI, unsigned Reg,
442     const StackOffset &OffsetFromDefCFA) const {
443   int64_t NumBytes, NumVGScaledBytes;
444   AArch64InstrInfo::decomposeStackOffsetForDwarfOffsets(
445       OffsetFromDefCFA, NumBytes, NumVGScaledBytes);
446 
447   unsigned DwarfReg = TRI.getDwarfRegNum(Reg, true);
448 
449   // Non-scalable offsets can use DW_CFA_offset directly.
450   if (!NumVGScaledBytes)
451     return MCCFIInstruction::createOffset(nullptr, DwarfReg, NumBytes);
452 
453   std::string CommentBuffer;
454   llvm::raw_string_ostream Comment(CommentBuffer);
455   Comment << printReg(Reg, &TRI) << "  @ cfa";
456 
457   // Build up expression (NumBytes + NumVGScaledBytes * AArch64::VG)
458   SmallString<64> OffsetExpr;
459   appendVGScaledOffsetExpr(OffsetExpr, NumBytes, NumVGScaledBytes,
460                            TRI.getDwarfRegNum(AArch64::VG, true), Comment);
461 
462   // Wrap this into DW_CFA_expression
463   SmallString<64> CfaExpr;
464   CfaExpr.push_back(dwarf::DW_CFA_expression);
465   uint8_t buffer[16];
466   CfaExpr.append(buffer, buffer + encodeULEB128(DwarfReg, buffer));
467   CfaExpr.append(buffer, buffer + encodeULEB128(OffsetExpr.size(), buffer));
468   CfaExpr.append(OffsetExpr.str());
469 
470   return MCCFIInstruction::createEscape(nullptr, CfaExpr.str(), Comment.str());
471 }
472 
emitCalleeSavedFrameMoves(MachineBasicBlock & MBB,MachineBasicBlock::iterator MBBI) const473 void AArch64FrameLowering::emitCalleeSavedFrameMoves(
474     MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) const {
475   MachineFunction &MF = *MBB.getParent();
476   MachineFrameInfo &MFI = MF.getFrameInfo();
477   const TargetSubtargetInfo &STI = MF.getSubtarget();
478   const TargetRegisterInfo *TRI = STI.getRegisterInfo();
479   const TargetInstrInfo *TII = STI.getInstrInfo();
480   DebugLoc DL = MBB.findDebugLoc(MBBI);
481 
482   // Add callee saved registers to move list.
483   const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo();
484   if (CSI.empty())
485     return;
486 
487   for (const auto &Info : CSI) {
488     unsigned Reg = Info.getReg();
489 
490     // Not all unwinders may know about SVE registers, so assume the lowest
491     // common demoninator.
492     unsigned NewReg;
493     if (static_cast<const AArch64RegisterInfo *>(TRI)->regNeedsCFI(Reg, NewReg))
494       Reg = NewReg;
495     else
496       continue;
497 
498     StackOffset Offset;
499     if (MFI.getStackID(Info.getFrameIdx()) == TargetStackID::ScalableVector) {
500       AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
501       Offset =
502           StackOffset::getScalable(MFI.getObjectOffset(Info.getFrameIdx())) -
503           StackOffset::getFixed(AFI->getCalleeSavedStackSize(MFI));
504     } else {
505       Offset = StackOffset::getFixed(MFI.getObjectOffset(Info.getFrameIdx()) -
506                                      getOffsetOfLocalArea());
507     }
508     unsigned CFIIndex = MF.addFrameInst(createCfaOffset(*TRI, Reg, Offset));
509     BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
510         .addCFIIndex(CFIIndex)
511         .setMIFlags(MachineInstr::FrameSetup);
512   }
513 }
514 
515 // Find a scratch register that we can use at the start of the prologue to
516 // re-align the stack pointer.  We avoid using callee-save registers since they
517 // may appear to be free when this is called from canUseAsPrologue (during
518 // shrink wrapping), but then no longer be free when this is called from
519 // emitPrologue.
520 //
521 // FIXME: This is a bit conservative, since in the above case we could use one
522 // of the callee-save registers as a scratch temp to re-align the stack pointer,
523 // but we would then have to make sure that we were in fact saving at least one
524 // callee-save register in the prologue, which is additional complexity that
525 // doesn't seem worth the benefit.
findScratchNonCalleeSaveRegister(MachineBasicBlock * MBB)526 static unsigned findScratchNonCalleeSaveRegister(MachineBasicBlock *MBB) {
527   MachineFunction *MF = MBB->getParent();
528 
529   // If MBB is an entry block, use X9 as the scratch register
530   if (&MF->front() == MBB)
531     return AArch64::X9;
532 
533   const AArch64Subtarget &Subtarget = MF->getSubtarget<AArch64Subtarget>();
534   const AArch64RegisterInfo &TRI = *Subtarget.getRegisterInfo();
535   LivePhysRegs LiveRegs(TRI);
536   LiveRegs.addLiveIns(*MBB);
537 
538   // Mark callee saved registers as used so we will not choose them.
539   const MCPhysReg *CSRegs = MF->getRegInfo().getCalleeSavedRegs();
540   for (unsigned i = 0; CSRegs[i]; ++i)
541     LiveRegs.addReg(CSRegs[i]);
542 
543   // Prefer X9 since it was historically used for the prologue scratch reg.
544   const MachineRegisterInfo &MRI = MF->getRegInfo();
545   if (LiveRegs.available(MRI, AArch64::X9))
546     return AArch64::X9;
547 
548   for (unsigned Reg : AArch64::GPR64RegClass) {
549     if (LiveRegs.available(MRI, Reg))
550       return Reg;
551   }
552   return AArch64::NoRegister;
553 }
554 
canUseAsPrologue(const MachineBasicBlock & MBB) const555 bool AArch64FrameLowering::canUseAsPrologue(
556     const MachineBasicBlock &MBB) const {
557   const MachineFunction *MF = MBB.getParent();
558   MachineBasicBlock *TmpMBB = const_cast<MachineBasicBlock *>(&MBB);
559   const AArch64Subtarget &Subtarget = MF->getSubtarget<AArch64Subtarget>();
560   const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
561 
562   // Don't need a scratch register if we're not going to re-align the stack.
563   if (!RegInfo->needsStackRealignment(*MF))
564     return true;
565   // Otherwise, we can use any block as long as it has a scratch register
566   // available.
567   return findScratchNonCalleeSaveRegister(TmpMBB) != AArch64::NoRegister;
568 }
569 
windowsRequiresStackProbe(MachineFunction & MF,uint64_t StackSizeInBytes)570 static bool windowsRequiresStackProbe(MachineFunction &MF,
571                                       uint64_t StackSizeInBytes) {
572   const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
573   if (!Subtarget.isTargetWindows())
574     return false;
575   const Function &F = MF.getFunction();
576   // TODO: When implementing stack protectors, take that into account
577   // for the probe threshold.
578   unsigned StackProbeSize = 4096;
579   if (F.hasFnAttribute("stack-probe-size"))
580     F.getFnAttribute("stack-probe-size")
581         .getValueAsString()
582         .getAsInteger(0, StackProbeSize);
583   return (StackSizeInBytes >= StackProbeSize) &&
584          !F.hasFnAttribute("no-stack-arg-probe");
585 }
586 
needsWinCFI(const MachineFunction & MF)587 static bool needsWinCFI(const MachineFunction &MF) {
588   const Function &F = MF.getFunction();
589   return MF.getTarget().getMCAsmInfo()->usesWindowsCFI() &&
590          F.needsUnwindTableEntry();
591 }
592 
shouldCombineCSRLocalStackBump(MachineFunction & MF,uint64_t StackBumpBytes) const593 bool AArch64FrameLowering::shouldCombineCSRLocalStackBump(
594     MachineFunction &MF, uint64_t StackBumpBytes) const {
595   AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
596   const MachineFrameInfo &MFI = MF.getFrameInfo();
597   const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
598   const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
599 
600   if (AFI->getLocalStackSize() == 0)
601     return false;
602 
603   // For WinCFI, if optimizing for size, prefer to not combine the stack bump
604   // (to force a stp with predecrement) to match the packed unwind format,
605   // provided that there actually are any callee saved registers to merge the
606   // decrement with.
607   // This is potentially marginally slower, but allows using the packed
608   // unwind format for functions that both have a local area and callee saved
609   // registers. Using the packed unwind format notably reduces the size of
610   // the unwind info.
611   if (needsWinCFI(MF) && AFI->getCalleeSavedStackSize() > 0 &&
612       MF.getFunction().hasOptSize())
613     return false;
614 
615   // 512 is the maximum immediate for stp/ldp that will be used for
616   // callee-save save/restores
617   if (StackBumpBytes >= 512 || windowsRequiresStackProbe(MF, StackBumpBytes))
618     return false;
619 
620   if (MFI.hasVarSizedObjects())
621     return false;
622 
623   if (RegInfo->needsStackRealignment(MF))
624     return false;
625 
626   // This isn't strictly necessary, but it simplifies things a bit since the
627   // current RedZone handling code assumes the SP is adjusted by the
628   // callee-save save/restore code.
629   if (canUseRedZone(MF))
630     return false;
631 
632   // When there is an SVE area on the stack, always allocate the
633   // callee-saves and spills/locals separately.
634   if (getSVEStackSize(MF))
635     return false;
636 
637   return true;
638 }
639 
shouldCombineCSRLocalStackBumpInEpilogue(MachineBasicBlock & MBB,unsigned StackBumpBytes) const640 bool AArch64FrameLowering::shouldCombineCSRLocalStackBumpInEpilogue(
641     MachineBasicBlock &MBB, unsigned StackBumpBytes) const {
642   if (!shouldCombineCSRLocalStackBump(*MBB.getParent(), StackBumpBytes))
643     return false;
644 
645   if (MBB.empty())
646     return true;
647 
648   // Disable combined SP bump if the last instruction is an MTE tag store. It
649   // is almost always better to merge SP adjustment into those instructions.
650   MachineBasicBlock::iterator LastI = MBB.getFirstTerminator();
651   MachineBasicBlock::iterator Begin = MBB.begin();
652   while (LastI != Begin) {
653     --LastI;
654     if (LastI->isTransient())
655       continue;
656     if (!LastI->getFlag(MachineInstr::FrameDestroy))
657       break;
658   }
659   switch (LastI->getOpcode()) {
660   case AArch64::STGloop:
661   case AArch64::STZGloop:
662   case AArch64::STGOffset:
663   case AArch64::STZGOffset:
664   case AArch64::ST2GOffset:
665   case AArch64::STZ2GOffset:
666     return false;
667   default:
668     return true;
669   }
670   llvm_unreachable("unreachable");
671 }
672 
673 // Given a load or a store instruction, generate an appropriate unwinding SEH
674 // code on Windows.
InsertSEH(MachineBasicBlock::iterator MBBI,const TargetInstrInfo & TII,MachineInstr::MIFlag Flag)675 static MachineBasicBlock::iterator InsertSEH(MachineBasicBlock::iterator MBBI,
676                                              const TargetInstrInfo &TII,
677                                              MachineInstr::MIFlag Flag) {
678   unsigned Opc = MBBI->getOpcode();
679   MachineBasicBlock *MBB = MBBI->getParent();
680   MachineFunction &MF = *MBB->getParent();
681   DebugLoc DL = MBBI->getDebugLoc();
682   unsigned ImmIdx = MBBI->getNumOperands() - 1;
683   int Imm = MBBI->getOperand(ImmIdx).getImm();
684   MachineInstrBuilder MIB;
685   const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
686   const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
687 
688   switch (Opc) {
689   default:
690     llvm_unreachable("No SEH Opcode for this instruction");
691   case AArch64::LDPDpost:
692     Imm = -Imm;
693     LLVM_FALLTHROUGH;
694   case AArch64::STPDpre: {
695     unsigned Reg0 = RegInfo->getSEHRegNum(MBBI->getOperand(1).getReg());
696     unsigned Reg1 = RegInfo->getSEHRegNum(MBBI->getOperand(2).getReg());
697     MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveFRegP_X))
698               .addImm(Reg0)
699               .addImm(Reg1)
700               .addImm(Imm * 8)
701               .setMIFlag(Flag);
702     break;
703   }
704   case AArch64::LDPXpost:
705     Imm = -Imm;
706     LLVM_FALLTHROUGH;
707   case AArch64::STPXpre: {
708     Register Reg0 = MBBI->getOperand(1).getReg();
709     Register Reg1 = MBBI->getOperand(2).getReg();
710     if (Reg0 == AArch64::FP && Reg1 == AArch64::LR)
711       MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveFPLR_X))
712                 .addImm(Imm * 8)
713                 .setMIFlag(Flag);
714     else
715       MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveRegP_X))
716                 .addImm(RegInfo->getSEHRegNum(Reg0))
717                 .addImm(RegInfo->getSEHRegNum(Reg1))
718                 .addImm(Imm * 8)
719                 .setMIFlag(Flag);
720     break;
721   }
722   case AArch64::LDRDpost:
723     Imm = -Imm;
724     LLVM_FALLTHROUGH;
725   case AArch64::STRDpre: {
726     unsigned Reg = RegInfo->getSEHRegNum(MBBI->getOperand(1).getReg());
727     MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveFReg_X))
728               .addImm(Reg)
729               .addImm(Imm)
730               .setMIFlag(Flag);
731     break;
732   }
733   case AArch64::LDRXpost:
734     Imm = -Imm;
735     LLVM_FALLTHROUGH;
736   case AArch64::STRXpre: {
737     unsigned Reg =  RegInfo->getSEHRegNum(MBBI->getOperand(1).getReg());
738     MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveReg_X))
739               .addImm(Reg)
740               .addImm(Imm)
741               .setMIFlag(Flag);
742     break;
743   }
744   case AArch64::STPDi:
745   case AArch64::LDPDi: {
746     unsigned Reg0 =  RegInfo->getSEHRegNum(MBBI->getOperand(0).getReg());
747     unsigned Reg1 =  RegInfo->getSEHRegNum(MBBI->getOperand(1).getReg());
748     MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveFRegP))
749               .addImm(Reg0)
750               .addImm(Reg1)
751               .addImm(Imm * 8)
752               .setMIFlag(Flag);
753     break;
754   }
755   case AArch64::STPXi:
756   case AArch64::LDPXi: {
757     Register Reg0 = MBBI->getOperand(0).getReg();
758     Register Reg1 = MBBI->getOperand(1).getReg();
759     if (Reg0 == AArch64::FP && Reg1 == AArch64::LR)
760       MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveFPLR))
761                 .addImm(Imm * 8)
762                 .setMIFlag(Flag);
763     else
764       MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveRegP))
765                 .addImm(RegInfo->getSEHRegNum(Reg0))
766                 .addImm(RegInfo->getSEHRegNum(Reg1))
767                 .addImm(Imm * 8)
768                 .setMIFlag(Flag);
769     break;
770   }
771   case AArch64::STRXui:
772   case AArch64::LDRXui: {
773     int Reg = RegInfo->getSEHRegNum(MBBI->getOperand(0).getReg());
774     MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveReg))
775               .addImm(Reg)
776               .addImm(Imm * 8)
777               .setMIFlag(Flag);
778     break;
779   }
780   case AArch64::STRDui:
781   case AArch64::LDRDui: {
782     unsigned Reg = RegInfo->getSEHRegNum(MBBI->getOperand(0).getReg());
783     MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveFReg))
784               .addImm(Reg)
785               .addImm(Imm * 8)
786               .setMIFlag(Flag);
787     break;
788   }
789   }
790   auto I = MBB->insertAfter(MBBI, MIB);
791   return I;
792 }
793 
794 // Fix up the SEH opcode associated with the save/restore instruction.
fixupSEHOpcode(MachineBasicBlock::iterator MBBI,unsigned LocalStackSize)795 static void fixupSEHOpcode(MachineBasicBlock::iterator MBBI,
796                            unsigned LocalStackSize) {
797   MachineOperand *ImmOpnd = nullptr;
798   unsigned ImmIdx = MBBI->getNumOperands() - 1;
799   switch (MBBI->getOpcode()) {
800   default:
801     llvm_unreachable("Fix the offset in the SEH instruction");
802   case AArch64::SEH_SaveFPLR:
803   case AArch64::SEH_SaveRegP:
804   case AArch64::SEH_SaveReg:
805   case AArch64::SEH_SaveFRegP:
806   case AArch64::SEH_SaveFReg:
807     ImmOpnd = &MBBI->getOperand(ImmIdx);
808     break;
809   }
810   if (ImmOpnd)
811     ImmOpnd->setImm(ImmOpnd->getImm() + LocalStackSize);
812 }
813 
814 // Convert callee-save register save/restore instruction to do stack pointer
815 // decrement/increment to allocate/deallocate the callee-save stack area by
816 // converting store/load to use pre/post increment version.
convertCalleeSaveRestoreToSPPrePostIncDec(MachineBasicBlock & MBB,MachineBasicBlock::iterator MBBI,const DebugLoc & DL,const TargetInstrInfo * TII,int CSStackSizeInc,bool NeedsWinCFI,bool * HasWinCFI,bool InProlog=true)817 static MachineBasicBlock::iterator convertCalleeSaveRestoreToSPPrePostIncDec(
818     MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
819     const DebugLoc &DL, const TargetInstrInfo *TII, int CSStackSizeInc,
820     bool NeedsWinCFI, bool *HasWinCFI, bool InProlog = true) {
821   // Ignore instructions that do not operate on SP, i.e. shadow call stack
822   // instructions and associated CFI instruction.
823   while (MBBI->getOpcode() == AArch64::STRXpost ||
824          MBBI->getOpcode() == AArch64::LDRXpre ||
825          MBBI->getOpcode() == AArch64::CFI_INSTRUCTION) {
826     if (MBBI->getOpcode() != AArch64::CFI_INSTRUCTION)
827       assert(MBBI->getOperand(0).getReg() != AArch64::SP);
828     ++MBBI;
829   }
830   unsigned NewOpc;
831   int Scale = 1;
832   switch (MBBI->getOpcode()) {
833   default:
834     llvm_unreachable("Unexpected callee-save save/restore opcode!");
835   case AArch64::STPXi:
836     NewOpc = AArch64::STPXpre;
837     Scale = 8;
838     break;
839   case AArch64::STPDi:
840     NewOpc = AArch64::STPDpre;
841     Scale = 8;
842     break;
843   case AArch64::STPQi:
844     NewOpc = AArch64::STPQpre;
845     Scale = 16;
846     break;
847   case AArch64::STRXui:
848     NewOpc = AArch64::STRXpre;
849     break;
850   case AArch64::STRDui:
851     NewOpc = AArch64::STRDpre;
852     break;
853   case AArch64::STRQui:
854     NewOpc = AArch64::STRQpre;
855     break;
856   case AArch64::LDPXi:
857     NewOpc = AArch64::LDPXpost;
858     Scale = 8;
859     break;
860   case AArch64::LDPDi:
861     NewOpc = AArch64::LDPDpost;
862     Scale = 8;
863     break;
864   case AArch64::LDPQi:
865     NewOpc = AArch64::LDPQpost;
866     Scale = 16;
867     break;
868   case AArch64::LDRXui:
869     NewOpc = AArch64::LDRXpost;
870     break;
871   case AArch64::LDRDui:
872     NewOpc = AArch64::LDRDpost;
873     break;
874   case AArch64::LDRQui:
875     NewOpc = AArch64::LDRQpost;
876     break;
877   }
878   // Get rid of the SEH code associated with the old instruction.
879   if (NeedsWinCFI) {
880     auto SEH = std::next(MBBI);
881     if (AArch64InstrInfo::isSEHInstruction(*SEH))
882       SEH->eraseFromParent();
883   }
884 
885   MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII->get(NewOpc));
886   MIB.addReg(AArch64::SP, RegState::Define);
887 
888   // Copy all operands other than the immediate offset.
889   unsigned OpndIdx = 0;
890   for (unsigned OpndEnd = MBBI->getNumOperands() - 1; OpndIdx < OpndEnd;
891        ++OpndIdx)
892     MIB.add(MBBI->getOperand(OpndIdx));
893 
894   assert(MBBI->getOperand(OpndIdx).getImm() == 0 &&
895          "Unexpected immediate offset in first/last callee-save save/restore "
896          "instruction!");
897   assert(MBBI->getOperand(OpndIdx - 1).getReg() == AArch64::SP &&
898          "Unexpected base register in callee-save save/restore instruction!");
899   assert(CSStackSizeInc % Scale == 0);
900   MIB.addImm(CSStackSizeInc / Scale);
901 
902   MIB.setMIFlags(MBBI->getFlags());
903   MIB.setMemRefs(MBBI->memoperands());
904 
905   // Generate a new SEH code that corresponds to the new instruction.
906   if (NeedsWinCFI) {
907     *HasWinCFI = true;
908     InsertSEH(*MIB, *TII,
909               InProlog ? MachineInstr::FrameSetup : MachineInstr::FrameDestroy);
910   }
911 
912   return std::prev(MBB.erase(MBBI));
913 }
914 
915 // Fixup callee-save register save/restore instructions to take into account
916 // combined SP bump by adding the local stack size to the stack offsets.
fixupCalleeSaveRestoreStackOffset(MachineInstr & MI,uint64_t LocalStackSize,bool NeedsWinCFI,bool * HasWinCFI)917 static void fixupCalleeSaveRestoreStackOffset(MachineInstr &MI,
918                                               uint64_t LocalStackSize,
919                                               bool NeedsWinCFI,
920                                               bool *HasWinCFI) {
921   if (AArch64InstrInfo::isSEHInstruction(MI))
922     return;
923 
924   unsigned Opc = MI.getOpcode();
925 
926   // Ignore instructions that do not operate on SP, i.e. shadow call stack
927   // instructions and associated CFI instruction.
928   if (Opc == AArch64::STRXpost || Opc == AArch64::LDRXpre ||
929       Opc == AArch64::CFI_INSTRUCTION) {
930     if (Opc != AArch64::CFI_INSTRUCTION)
931       assert(MI.getOperand(0).getReg() != AArch64::SP);
932     return;
933   }
934 
935   unsigned Scale;
936   switch (Opc) {
937   case AArch64::STPXi:
938   case AArch64::STRXui:
939   case AArch64::STPDi:
940   case AArch64::STRDui:
941   case AArch64::LDPXi:
942   case AArch64::LDRXui:
943   case AArch64::LDPDi:
944   case AArch64::LDRDui:
945     Scale = 8;
946     break;
947   case AArch64::STPQi:
948   case AArch64::STRQui:
949   case AArch64::LDPQi:
950   case AArch64::LDRQui:
951     Scale = 16;
952     break;
953   default:
954     llvm_unreachable("Unexpected callee-save save/restore opcode!");
955   }
956 
957   unsigned OffsetIdx = MI.getNumExplicitOperands() - 1;
958   assert(MI.getOperand(OffsetIdx - 1).getReg() == AArch64::SP &&
959          "Unexpected base register in callee-save save/restore instruction!");
960   // Last operand is immediate offset that needs fixing.
961   MachineOperand &OffsetOpnd = MI.getOperand(OffsetIdx);
962   // All generated opcodes have scaled offsets.
963   assert(LocalStackSize % Scale == 0);
964   OffsetOpnd.setImm(OffsetOpnd.getImm() + LocalStackSize / Scale);
965 
966   if (NeedsWinCFI) {
967     *HasWinCFI = true;
968     auto MBBI = std::next(MachineBasicBlock::iterator(MI));
969     assert(MBBI != MI.getParent()->end() && "Expecting a valid instruction");
970     assert(AArch64InstrInfo::isSEHInstruction(*MBBI) &&
971            "Expecting a SEH instruction");
972     fixupSEHOpcode(MBBI, LocalStackSize);
973   }
974 }
975 
adaptForLdStOpt(MachineBasicBlock & MBB,MachineBasicBlock::iterator FirstSPPopI,MachineBasicBlock::iterator LastPopI)976 static void adaptForLdStOpt(MachineBasicBlock &MBB,
977                             MachineBasicBlock::iterator FirstSPPopI,
978                             MachineBasicBlock::iterator LastPopI) {
979   // Sometimes (when we restore in the same order as we save), we can end up
980   // with code like this:
981   //
982   // ldp      x26, x25, [sp]
983   // ldp      x24, x23, [sp, #16]
984   // ldp      x22, x21, [sp, #32]
985   // ldp      x20, x19, [sp, #48]
986   // add      sp, sp, #64
987   //
988   // In this case, it is always better to put the first ldp at the end, so
989   // that the load-store optimizer can run and merge the ldp and the add into
990   // a post-index ldp.
991   // If we managed to grab the first pop instruction, move it to the end.
992   if (ReverseCSRRestoreSeq)
993     MBB.splice(FirstSPPopI, &MBB, LastPopI);
994   // We should end up with something like this now:
995   //
996   // ldp      x24, x23, [sp, #16]
997   // ldp      x22, x21, [sp, #32]
998   // ldp      x20, x19, [sp, #48]
999   // ldp      x26, x25, [sp]
1000   // add      sp, sp, #64
1001   //
1002   // and the load-store optimizer can merge the last two instructions into:
1003   //
1004   // ldp      x26, x25, [sp], #64
1005   //
1006 }
1007 
isTargetWindows(const MachineFunction & MF)1008 static bool isTargetWindows(const MachineFunction &MF) {
1009   return MF.getSubtarget<AArch64Subtarget>().isTargetWindows();
1010 }
1011 
1012 // Convenience function to determine whether I is an SVE callee save.
IsSVECalleeSave(MachineBasicBlock::iterator I)1013 static bool IsSVECalleeSave(MachineBasicBlock::iterator I) {
1014   switch (I->getOpcode()) {
1015   default:
1016     return false;
1017   case AArch64::STR_ZXI:
1018   case AArch64::STR_PXI:
1019   case AArch64::LDR_ZXI:
1020   case AArch64::LDR_PXI:
1021     return I->getFlag(MachineInstr::FrameSetup) ||
1022            I->getFlag(MachineInstr::FrameDestroy);
1023   }
1024 }
1025 
emitPrologue(MachineFunction & MF,MachineBasicBlock & MBB) const1026 void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
1027                                         MachineBasicBlock &MBB) const {
1028   MachineBasicBlock::iterator MBBI = MBB.begin();
1029   const MachineFrameInfo &MFI = MF.getFrameInfo();
1030   const Function &F = MF.getFunction();
1031   const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
1032   const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
1033   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
1034   MachineModuleInfo &MMI = MF.getMMI();
1035   AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
1036   bool needsFrameMoves =
1037       MF.needsFrameMoves() && !MF.getTarget().getMCAsmInfo()->usesWindowsCFI();
1038   bool HasFP = hasFP(MF);
1039   bool NeedsWinCFI = needsWinCFI(MF);
1040   bool HasWinCFI = false;
1041   auto Cleanup = make_scope_exit([&]() { MF.setHasWinCFI(HasWinCFI); });
1042 
1043   bool IsFunclet = MBB.isEHFuncletEntry();
1044 
1045   // At this point, we're going to decide whether or not the function uses a
1046   // redzone. In most cases, the function doesn't have a redzone so let's
1047   // assume that's false and set it to true in the case that there's a redzone.
1048   AFI->setHasRedZone(false);
1049 
1050   // Debug location must be unknown since the first debug location is used
1051   // to determine the end of the prologue.
1052   DebugLoc DL;
1053 
1054   const auto &MFnI = *MF.getInfo<AArch64FunctionInfo>();
1055   if (MFnI.shouldSignReturnAddress()) {
1056     if (MFnI.shouldSignWithBKey()) {
1057       BuildMI(MBB, MBBI, DL, TII->get(AArch64::EMITBKEY))
1058           .setMIFlag(MachineInstr::FrameSetup);
1059       BuildMI(MBB, MBBI, DL, TII->get(AArch64::PACIBSP))
1060           .setMIFlag(MachineInstr::FrameSetup);
1061     } else {
1062       BuildMI(MBB, MBBI, DL, TII->get(AArch64::PACIASP))
1063           .setMIFlag(MachineInstr::FrameSetup);
1064     }
1065 
1066     unsigned CFIIndex =
1067         MF.addFrameInst(MCCFIInstruction::createNegateRAState(nullptr));
1068     BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
1069         .addCFIIndex(CFIIndex)
1070         .setMIFlags(MachineInstr::FrameSetup);
1071   }
1072 
1073   // All calls are tail calls in GHC calling conv, and functions have no
1074   // prologue/epilogue.
1075   if (MF.getFunction().getCallingConv() == CallingConv::GHC)
1076     return;
1077 
1078   // Set tagged base pointer to the requested stack slot.
1079   // Ideally it should match SP value after prologue.
1080   Optional<int> TBPI = AFI->getTaggedBasePointerIndex();
1081   if (TBPI)
1082     AFI->setTaggedBasePointerOffset(-MFI.getObjectOffset(*TBPI));
1083   else
1084     AFI->setTaggedBasePointerOffset(MFI.getStackSize());
1085 
1086   const StackOffset &SVEStackSize = getSVEStackSize(MF);
1087 
1088   // getStackSize() includes all the locals in its size calculation. We don't
1089   // include these locals when computing the stack size of a funclet, as they
1090   // are allocated in the parent's stack frame and accessed via the frame
1091   // pointer from the funclet.  We only save the callee saved registers in the
1092   // funclet, which are really the callee saved registers of the parent
1093   // function, including the funclet.
1094   int64_t NumBytes = IsFunclet ? getWinEHFuncletFrameSize(MF)
1095                                : MFI.getStackSize();
1096   if (!AFI->hasStackFrame() && !windowsRequiresStackProbe(MF, NumBytes)) {
1097     assert(!HasFP && "unexpected function without stack frame but with FP");
1098     assert(!SVEStackSize &&
1099            "unexpected function without stack frame but with SVE objects");
1100     // All of the stack allocation is for locals.
1101     AFI->setLocalStackSize(NumBytes);
1102     if (!NumBytes)
1103       return;
1104     // REDZONE: If the stack size is less than 128 bytes, we don't need
1105     // to actually allocate.
1106     if (canUseRedZone(MF)) {
1107       AFI->setHasRedZone(true);
1108       ++NumRedZoneFunctions;
1109     } else {
1110       emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP,
1111                       StackOffset::getFixed(-NumBytes), TII,
1112                       MachineInstr::FrameSetup, false, NeedsWinCFI, &HasWinCFI);
1113       if (!NeedsWinCFI && needsFrameMoves) {
1114         // Label used to tie together the PROLOG_LABEL and the MachineMoves.
1115         MCSymbol *FrameLabel = MMI.getContext().createTempSymbol();
1116           // Encode the stack size of the leaf function.
1117         unsigned CFIIndex = MF.addFrameInst(
1118             MCCFIInstruction::cfiDefCfaOffset(FrameLabel, NumBytes));
1119         BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
1120             .addCFIIndex(CFIIndex)
1121             .setMIFlags(MachineInstr::FrameSetup);
1122       }
1123     }
1124 
1125     if (NeedsWinCFI) {
1126       HasWinCFI = true;
1127       BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_PrologEnd))
1128           .setMIFlag(MachineInstr::FrameSetup);
1129     }
1130 
1131     return;
1132   }
1133 
1134   bool IsWin64 =
1135       Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv());
1136   unsigned FixedObject = getFixedObjectSize(MF, AFI, IsWin64, IsFunclet);
1137 
1138   auto PrologueSaveSize = AFI->getCalleeSavedStackSize() + FixedObject;
1139   // All of the remaining stack allocations are for locals.
1140   AFI->setLocalStackSize(NumBytes - PrologueSaveSize);
1141   bool CombineSPBump = shouldCombineCSRLocalStackBump(MF, NumBytes);
1142   if (CombineSPBump) {
1143     assert(!SVEStackSize && "Cannot combine SP bump with SVE");
1144     emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP,
1145                     StackOffset::getFixed(-NumBytes), TII,
1146                     MachineInstr::FrameSetup, false, NeedsWinCFI, &HasWinCFI);
1147     NumBytes = 0;
1148   } else if (PrologueSaveSize != 0) {
1149     MBBI = convertCalleeSaveRestoreToSPPrePostIncDec(
1150         MBB, MBBI, DL, TII, -PrologueSaveSize, NeedsWinCFI, &HasWinCFI);
1151     NumBytes -= PrologueSaveSize;
1152   }
1153   assert(NumBytes >= 0 && "Negative stack allocation size!?");
1154 
1155   // Move past the saves of the callee-saved registers, fixing up the offsets
1156   // and pre-inc if we decided to combine the callee-save and local stack
1157   // pointer bump above.
1158   MachineBasicBlock::iterator End = MBB.end();
1159   while (MBBI != End && MBBI->getFlag(MachineInstr::FrameSetup) &&
1160          !IsSVECalleeSave(MBBI)) {
1161     if (CombineSPBump)
1162       fixupCalleeSaveRestoreStackOffset(*MBBI, AFI->getLocalStackSize(),
1163                                         NeedsWinCFI, &HasWinCFI);
1164     ++MBBI;
1165   }
1166 
1167   // For funclets the FP belongs to the containing function.
1168   if (!IsFunclet && HasFP) {
1169     // Only set up FP if we actually need to.
1170     int64_t FPOffset = AFI->getCalleeSaveBaseToFrameRecordOffset();
1171 
1172     if (CombineSPBump)
1173       FPOffset += AFI->getLocalStackSize();
1174 
1175     // Issue    sub fp, sp, FPOffset or
1176     //          mov fp,sp          when FPOffset is zero.
1177     // Note: All stores of callee-saved registers are marked as "FrameSetup".
1178     // This code marks the instruction(s) that set the FP also.
1179     emitFrameOffset(MBB, MBBI, DL, AArch64::FP, AArch64::SP,
1180                     StackOffset::getFixed(FPOffset), TII,
1181                     MachineInstr::FrameSetup, false, NeedsWinCFI, &HasWinCFI);
1182   }
1183 
1184   if (windowsRequiresStackProbe(MF, NumBytes)) {
1185     uint64_t NumWords = NumBytes >> 4;
1186     if (NeedsWinCFI) {
1187       HasWinCFI = true;
1188       // alloc_l can hold at most 256MB, so assume that NumBytes doesn't
1189       // exceed this amount.  We need to move at most 2^24 - 1 into x15.
1190       // This is at most two instructions, MOVZ follwed by MOVK.
1191       // TODO: Fix to use multiple stack alloc unwind codes for stacks
1192       // exceeding 256MB in size.
1193       if (NumBytes >= (1 << 28))
1194         report_fatal_error("Stack size cannot exceed 256MB for stack "
1195                             "unwinding purposes");
1196 
1197       uint32_t LowNumWords = NumWords & 0xFFFF;
1198       BuildMI(MBB, MBBI, DL, TII->get(AArch64::MOVZXi), AArch64::X15)
1199             .addImm(LowNumWords)
1200             .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0))
1201             .setMIFlag(MachineInstr::FrameSetup);
1202       BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
1203             .setMIFlag(MachineInstr::FrameSetup);
1204       if ((NumWords & 0xFFFF0000) != 0) {
1205           BuildMI(MBB, MBBI, DL, TII->get(AArch64::MOVKXi), AArch64::X15)
1206               .addReg(AArch64::X15)
1207               .addImm((NumWords & 0xFFFF0000) >> 16) // High half
1208               .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 16))
1209               .setMIFlag(MachineInstr::FrameSetup);
1210           BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
1211             .setMIFlag(MachineInstr::FrameSetup);
1212       }
1213     } else {
1214       BuildMI(MBB, MBBI, DL, TII->get(AArch64::MOVi64imm), AArch64::X15)
1215           .addImm(NumWords)
1216           .setMIFlags(MachineInstr::FrameSetup);
1217     }
1218 
1219     switch (MF.getTarget().getCodeModel()) {
1220     case CodeModel::Tiny:
1221     case CodeModel::Small:
1222     case CodeModel::Medium:
1223     case CodeModel::Kernel:
1224       BuildMI(MBB, MBBI, DL, TII->get(AArch64::BL))
1225           .addExternalSymbol("__chkstk")
1226           .addReg(AArch64::X15, RegState::Implicit)
1227           .addReg(AArch64::X16, RegState::Implicit | RegState::Define | RegState::Dead)
1228           .addReg(AArch64::X17, RegState::Implicit | RegState::Define | RegState::Dead)
1229           .addReg(AArch64::NZCV, RegState::Implicit | RegState::Define | RegState::Dead)
1230           .setMIFlags(MachineInstr::FrameSetup);
1231       if (NeedsWinCFI) {
1232         HasWinCFI = true;
1233         BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
1234             .setMIFlag(MachineInstr::FrameSetup);
1235       }
1236       break;
1237     case CodeModel::Large:
1238       BuildMI(MBB, MBBI, DL, TII->get(AArch64::MOVaddrEXT))
1239           .addReg(AArch64::X16, RegState::Define)
1240           .addExternalSymbol("__chkstk")
1241           .addExternalSymbol("__chkstk")
1242           .setMIFlags(MachineInstr::FrameSetup);
1243       if (NeedsWinCFI) {
1244         HasWinCFI = true;
1245         BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
1246             .setMIFlag(MachineInstr::FrameSetup);
1247       }
1248 
1249       BuildMI(MBB, MBBI, DL, TII->get(getBLRCallOpcode(MF)))
1250           .addReg(AArch64::X16, RegState::Kill)
1251           .addReg(AArch64::X15, RegState::Implicit | RegState::Define)
1252           .addReg(AArch64::X16, RegState::Implicit | RegState::Define | RegState::Dead)
1253           .addReg(AArch64::X17, RegState::Implicit | RegState::Define | RegState::Dead)
1254           .addReg(AArch64::NZCV, RegState::Implicit | RegState::Define | RegState::Dead)
1255           .setMIFlags(MachineInstr::FrameSetup);
1256       if (NeedsWinCFI) {
1257         HasWinCFI = true;
1258         BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
1259             .setMIFlag(MachineInstr::FrameSetup);
1260       }
1261       break;
1262     }
1263 
1264     BuildMI(MBB, MBBI, DL, TII->get(AArch64::SUBXrx64), AArch64::SP)
1265         .addReg(AArch64::SP, RegState::Kill)
1266         .addReg(AArch64::X15, RegState::Kill)
1267         .addImm(AArch64_AM::getArithExtendImm(AArch64_AM::UXTX, 4))
1268         .setMIFlags(MachineInstr::FrameSetup);
1269     if (NeedsWinCFI) {
1270       HasWinCFI = true;
1271       BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_StackAlloc))
1272           .addImm(NumBytes)
1273           .setMIFlag(MachineInstr::FrameSetup);
1274     }
1275     NumBytes = 0;
1276   }
1277 
1278   StackOffset AllocateBefore = SVEStackSize, AllocateAfter = {};
1279   MachineBasicBlock::iterator CalleeSavesBegin = MBBI, CalleeSavesEnd = MBBI;
1280 
1281   // Process the SVE callee-saves to determine what space needs to be
1282   // allocated.
1283   if (int64_t CalleeSavedSize = AFI->getSVECalleeSavedStackSize()) {
1284     // Find callee save instructions in frame.
1285     CalleeSavesBegin = MBBI;
1286     assert(IsSVECalleeSave(CalleeSavesBegin) && "Unexpected instruction");
1287     while (IsSVECalleeSave(MBBI) && MBBI != MBB.getFirstTerminator())
1288       ++MBBI;
1289     CalleeSavesEnd = MBBI;
1290 
1291     AllocateBefore = StackOffset::getScalable(CalleeSavedSize);
1292     AllocateAfter = SVEStackSize - AllocateBefore;
1293   }
1294 
1295   // Allocate space for the callee saves (if any).
1296   emitFrameOffset(MBB, CalleeSavesBegin, DL, AArch64::SP, AArch64::SP,
1297                   -AllocateBefore, TII,
1298                   MachineInstr::FrameSetup);
1299 
1300   // Finally allocate remaining SVE stack space.
1301   emitFrameOffset(MBB, CalleeSavesEnd, DL, AArch64::SP, AArch64::SP,
1302                   -AllocateAfter, TII,
1303                   MachineInstr::FrameSetup);
1304 
1305   // Allocate space for the rest of the frame.
1306   if (NumBytes) {
1307     // Alignment is required for the parent frame, not the funclet
1308     const bool NeedsRealignment =
1309         !IsFunclet && RegInfo->needsStackRealignment(MF);
1310     unsigned scratchSPReg = AArch64::SP;
1311 
1312     if (NeedsRealignment) {
1313       scratchSPReg = findScratchNonCalleeSaveRegister(&MBB);
1314       assert(scratchSPReg != AArch64::NoRegister);
1315     }
1316 
1317     // If we're a leaf function, try using the red zone.
1318     if (!canUseRedZone(MF))
1319       // FIXME: in the case of dynamic re-alignment, NumBytes doesn't have
1320       // the correct value here, as NumBytes also includes padding bytes,
1321       // which shouldn't be counted here.
1322       emitFrameOffset(MBB, MBBI, DL, scratchSPReg, AArch64::SP,
1323                       StackOffset::getFixed(-NumBytes), TII,
1324                       MachineInstr::FrameSetup, false, NeedsWinCFI, &HasWinCFI);
1325 
1326     if (NeedsRealignment) {
1327       const unsigned NrBitsToZero = Log2(MFI.getMaxAlign());
1328       assert(NrBitsToZero > 1);
1329       assert(scratchSPReg != AArch64::SP);
1330 
1331       // SUB X9, SP, NumBytes
1332       //   -- X9 is temporary register, so shouldn't contain any live data here,
1333       //   -- free to use. This is already produced by emitFrameOffset above.
1334       // AND SP, X9, 0b11111...0000
1335       // The logical immediates have a non-trivial encoding. The following
1336       // formula computes the encoded immediate with all ones but
1337       // NrBitsToZero zero bits as least significant bits.
1338       uint32_t andMaskEncoded = (1 << 12)                         // = N
1339                                 | ((64 - NrBitsToZero) << 6)      // immr
1340                                 | ((64 - NrBitsToZero - 1) << 0); // imms
1341 
1342       BuildMI(MBB, MBBI, DL, TII->get(AArch64::ANDXri), AArch64::SP)
1343           .addReg(scratchSPReg, RegState::Kill)
1344           .addImm(andMaskEncoded);
1345       AFI->setStackRealigned(true);
1346       if (NeedsWinCFI) {
1347         HasWinCFI = true;
1348         BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_StackAlloc))
1349             .addImm(NumBytes & andMaskEncoded)
1350             .setMIFlag(MachineInstr::FrameSetup);
1351       }
1352     }
1353   }
1354 
1355   // If we need a base pointer, set it up here. It's whatever the value of the
1356   // stack pointer is at this point. Any variable size objects will be allocated
1357   // after this, so we can still use the base pointer to reference locals.
1358   //
1359   // FIXME: Clarify FrameSetup flags here.
1360   // Note: Use emitFrameOffset() like above for FP if the FrameSetup flag is
1361   // needed.
1362   // For funclets the BP belongs to the containing function.
1363   if (!IsFunclet && RegInfo->hasBasePointer(MF)) {
1364     TII->copyPhysReg(MBB, MBBI, DL, RegInfo->getBaseRegister(), AArch64::SP,
1365                      false);
1366     if (NeedsWinCFI) {
1367       HasWinCFI = true;
1368       BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
1369           .setMIFlag(MachineInstr::FrameSetup);
1370     }
1371   }
1372 
1373   // The very last FrameSetup instruction indicates the end of prologue. Emit a
1374   // SEH opcode indicating the prologue end.
1375   if (NeedsWinCFI && HasWinCFI) {
1376     BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_PrologEnd))
1377         .setMIFlag(MachineInstr::FrameSetup);
1378   }
1379 
1380   // SEH funclets are passed the frame pointer in X1.  If the parent
1381   // function uses the base register, then the base register is used
1382   // directly, and is not retrieved from X1.
1383   if (IsFunclet && F.hasPersonalityFn()) {
1384     EHPersonality Per = classifyEHPersonality(F.getPersonalityFn());
1385     if (isAsynchronousEHPersonality(Per)) {
1386       BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::COPY), AArch64::FP)
1387           .addReg(AArch64::X1)
1388           .setMIFlag(MachineInstr::FrameSetup);
1389       MBB.addLiveIn(AArch64::X1);
1390     }
1391   }
1392 
1393   if (needsFrameMoves) {
1394     // An example of the prologue:
1395     //
1396     //     .globl __foo
1397     //     .align 2
1398     //  __foo:
1399     // Ltmp0:
1400     //     .cfi_startproc
1401     //     .cfi_personality 155, ___gxx_personality_v0
1402     // Leh_func_begin:
1403     //     .cfi_lsda 16, Lexception33
1404     //
1405     //     stp  xa,bx, [sp, -#offset]!
1406     //     ...
1407     //     stp  x28, x27, [sp, #offset-32]
1408     //     stp  fp, lr, [sp, #offset-16]
1409     //     add  fp, sp, #offset - 16
1410     //     sub  sp, sp, #1360
1411     //
1412     // The Stack:
1413     //       +-------------------------------------------+
1414     // 10000 | ........ | ........ | ........ | ........ |
1415     // 10004 | ........ | ........ | ........ | ........ |
1416     //       +-------------------------------------------+
1417     // 10008 | ........ | ........ | ........ | ........ |
1418     // 1000c | ........ | ........ | ........ | ........ |
1419     //       +===========================================+
1420     // 10010 |                X28 Register               |
1421     // 10014 |                X28 Register               |
1422     //       +-------------------------------------------+
1423     // 10018 |                X27 Register               |
1424     // 1001c |                X27 Register               |
1425     //       +===========================================+
1426     // 10020 |                Frame Pointer              |
1427     // 10024 |                Frame Pointer              |
1428     //       +-------------------------------------------+
1429     // 10028 |                Link Register              |
1430     // 1002c |                Link Register              |
1431     //       +===========================================+
1432     // 10030 | ........ | ........ | ........ | ........ |
1433     // 10034 | ........ | ........ | ........ | ........ |
1434     //       +-------------------------------------------+
1435     // 10038 | ........ | ........ | ........ | ........ |
1436     // 1003c | ........ | ........ | ........ | ........ |
1437     //       +-------------------------------------------+
1438     //
1439     //     [sp] = 10030        ::    >>initial value<<
1440     //     sp = 10020          ::  stp fp, lr, [sp, #-16]!
1441     //     fp = sp == 10020    ::  mov fp, sp
1442     //     [sp] == 10020       ::  stp x28, x27, [sp, #-16]!
1443     //     sp == 10010         ::    >>final value<<
1444     //
1445     // The frame pointer (w29) points to address 10020. If we use an offset of
1446     // '16' from 'w29', we get the CFI offsets of -8 for w30, -16 for w29, -24
1447     // for w27, and -32 for w28:
1448     //
1449     //  Ltmp1:
1450     //     .cfi_def_cfa w29, 16
1451     //  Ltmp2:
1452     //     .cfi_offset w30, -8
1453     //  Ltmp3:
1454     //     .cfi_offset w29, -16
1455     //  Ltmp4:
1456     //     .cfi_offset w27, -24
1457     //  Ltmp5:
1458     //     .cfi_offset w28, -32
1459 
1460     if (HasFP) {
1461       const int OffsetToFirstCalleeSaveFromFP =
1462           AFI->getCalleeSaveBaseToFrameRecordOffset() -
1463           AFI->getCalleeSavedStackSize();
1464       Register FramePtr = RegInfo->getFrameRegister(MF);
1465 
1466       // Define the current CFA rule to use the provided FP.
1467       unsigned Reg = RegInfo->getDwarfRegNum(FramePtr, true);
1468       unsigned CFIIndex = MF.addFrameInst(
1469           MCCFIInstruction::cfiDefCfa(nullptr, Reg, FixedObject - OffsetToFirstCalleeSaveFromFP));
1470       BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
1471           .addCFIIndex(CFIIndex)
1472           .setMIFlags(MachineInstr::FrameSetup);
1473     } else {
1474       unsigned CFIIndex;
1475       if (SVEStackSize) {
1476         const TargetSubtargetInfo &STI = MF.getSubtarget();
1477         const TargetRegisterInfo &TRI = *STI.getRegisterInfo();
1478         StackOffset TotalSize =
1479             SVEStackSize + StackOffset::getFixed((int64_t)MFI.getStackSize());
1480         CFIIndex = MF.addFrameInst(createDefCFAExpressionFromSP(TRI, TotalSize));
1481       } else {
1482         // Encode the stack size of the leaf function.
1483         CFIIndex = MF.addFrameInst(
1484             MCCFIInstruction::cfiDefCfaOffset(nullptr, MFI.getStackSize()));
1485       }
1486       BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
1487           .addCFIIndex(CFIIndex)
1488           .setMIFlags(MachineInstr::FrameSetup);
1489     }
1490 
1491     // Now emit the moves for whatever callee saved regs we have (including FP,
1492     // LR if those are saved).
1493     emitCalleeSavedFrameMoves(MBB, MBBI);
1494   }
1495 }
1496 
InsertReturnAddressAuth(MachineFunction & MF,MachineBasicBlock & MBB)1497 static void InsertReturnAddressAuth(MachineFunction &MF,
1498                                     MachineBasicBlock &MBB) {
1499   const auto &MFI = *MF.getInfo<AArch64FunctionInfo>();
1500   if (!MFI.shouldSignReturnAddress())
1501     return;
1502   const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
1503   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
1504 
1505   MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator();
1506   DebugLoc DL;
1507   if (MBBI != MBB.end())
1508     DL = MBBI->getDebugLoc();
1509 
1510   // The AUTIASP instruction assembles to a hint instruction before v8.3a so
1511   // this instruction can safely used for any v8a architecture.
1512   // From v8.3a onwards there are optimised authenticate LR and return
1513   // instructions, namely RETA{A,B}, that can be used instead.
1514   if (Subtarget.hasPAuth() && MBBI != MBB.end() &&
1515       MBBI->getOpcode() == AArch64::RET_ReallyLR) {
1516     BuildMI(MBB, MBBI, DL,
1517             TII->get(MFI.shouldSignWithBKey() ? AArch64::RETAB : AArch64::RETAA))
1518         .copyImplicitOps(*MBBI);
1519     MBB.erase(MBBI);
1520   } else {
1521     BuildMI(
1522         MBB, MBBI, DL,
1523         TII->get(MFI.shouldSignWithBKey() ? AArch64::AUTIBSP : AArch64::AUTIASP))
1524         .setMIFlag(MachineInstr::FrameDestroy);
1525   }
1526 }
1527 
isFuncletReturnInstr(const MachineInstr & MI)1528 static bool isFuncletReturnInstr(const MachineInstr &MI) {
1529   switch (MI.getOpcode()) {
1530   default:
1531     return false;
1532   case AArch64::CATCHRET:
1533   case AArch64::CLEANUPRET:
1534     return true;
1535   }
1536 }
1537 
emitEpilogue(MachineFunction & MF,MachineBasicBlock & MBB) const1538 void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
1539                                         MachineBasicBlock &MBB) const {
1540   MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
1541   MachineFrameInfo &MFI = MF.getFrameInfo();
1542   const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
1543   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
1544   DebugLoc DL;
1545   bool NeedsWinCFI = needsWinCFI(MF);
1546   bool HasWinCFI = false;
1547   bool IsFunclet = false;
1548   auto WinCFI = make_scope_exit([&]() { assert(HasWinCFI == MF.hasWinCFI()); });
1549 
1550   if (MBB.end() != MBBI) {
1551     DL = MBBI->getDebugLoc();
1552     IsFunclet = isFuncletReturnInstr(*MBBI);
1553   }
1554 
1555   int64_t NumBytes = IsFunclet ? getWinEHFuncletFrameSize(MF)
1556                                : MFI.getStackSize();
1557   AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
1558 
1559   // All calls are tail calls in GHC calling conv, and functions have no
1560   // prologue/epilogue.
1561   if (MF.getFunction().getCallingConv() == CallingConv::GHC)
1562     return;
1563 
1564   // Initial and residual are named for consistency with the prologue. Note that
1565   // in the epilogue, the residual adjustment is executed first.
1566   uint64_t ArgumentPopSize = getArgumentPopSize(MF, MBB);
1567 
1568   // The stack frame should be like below,
1569   //
1570   //      ----------------------                     ---
1571   //      |                    |                      |
1572   //      | BytesInStackArgArea|              CalleeArgStackSize
1573   //      | (NumReusableBytes) |                (of tail call)
1574   //      |                    |                     ---
1575   //      |                    |                      |
1576   //      ---------------------|        ---           |
1577   //      |                    |         |            |
1578   //      |   CalleeSavedReg   |         |            |
1579   //      | (CalleeSavedStackSize)|      |            |
1580   //      |                    |         |            |
1581   //      ---------------------|         |         NumBytes
1582   //      |                    |     StackSize  (StackAdjustUp)
1583   //      |   LocalStackSize   |         |            |
1584   //      | (covering callee   |         |            |
1585   //      |       args)        |         |            |
1586   //      |                    |         |            |
1587   //      ----------------------        ---          ---
1588   //
1589   // So NumBytes = StackSize + BytesInStackArgArea - CalleeArgStackSize
1590   //             = StackSize + ArgumentPopSize
1591   //
1592   // AArch64TargetLowering::LowerCall figures out ArgumentPopSize and keeps
1593   // it as the 2nd argument of AArch64ISD::TC_RETURN.
1594 
1595   auto Cleanup = make_scope_exit([&] { InsertReturnAddressAuth(MF, MBB); });
1596 
1597   bool IsWin64 =
1598       Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv());
1599   unsigned FixedObject = getFixedObjectSize(MF, AFI, IsWin64, IsFunclet);
1600 
1601   uint64_t AfterCSRPopSize = ArgumentPopSize;
1602   auto PrologueSaveSize = AFI->getCalleeSavedStackSize() + FixedObject;
1603   // We cannot rely on the local stack size set in emitPrologue if the function
1604   // has funclets, as funclets have different local stack size requirements, and
1605   // the current value set in emitPrologue may be that of the containing
1606   // function.
1607   if (MF.hasEHFunclets())
1608     AFI->setLocalStackSize(NumBytes - PrologueSaveSize);
1609   bool CombineSPBump = shouldCombineCSRLocalStackBumpInEpilogue(MBB, NumBytes);
1610   // Assume we can't combine the last pop with the sp restore.
1611 
1612   if (!CombineSPBump && PrologueSaveSize != 0) {
1613     MachineBasicBlock::iterator Pop = std::prev(MBB.getFirstTerminator());
1614     while (AArch64InstrInfo::isSEHInstruction(*Pop))
1615       Pop = std::prev(Pop);
1616     // Converting the last ldp to a post-index ldp is valid only if the last
1617     // ldp's offset is 0.
1618     const MachineOperand &OffsetOp = Pop->getOperand(Pop->getNumOperands() - 1);
1619     // If the offset is 0, convert it to a post-index ldp.
1620     if (OffsetOp.getImm() == 0)
1621       convertCalleeSaveRestoreToSPPrePostIncDec(
1622           MBB, Pop, DL, TII, PrologueSaveSize, NeedsWinCFI, &HasWinCFI, false);
1623     else {
1624       // If not, make sure to emit an add after the last ldp.
1625       // We're doing this by transfering the size to be restored from the
1626       // adjustment *before* the CSR pops to the adjustment *after* the CSR
1627       // pops.
1628       AfterCSRPopSize += PrologueSaveSize;
1629     }
1630   }
1631 
1632   // Move past the restores of the callee-saved registers.
1633   // If we plan on combining the sp bump of the local stack size and the callee
1634   // save stack size, we might need to adjust the CSR save and restore offsets.
1635   MachineBasicBlock::iterator LastPopI = MBB.getFirstTerminator();
1636   MachineBasicBlock::iterator Begin = MBB.begin();
1637   while (LastPopI != Begin) {
1638     --LastPopI;
1639     if (!LastPopI->getFlag(MachineInstr::FrameDestroy) ||
1640         IsSVECalleeSave(LastPopI)) {
1641       ++LastPopI;
1642       break;
1643     } else if (CombineSPBump)
1644       fixupCalleeSaveRestoreStackOffset(*LastPopI, AFI->getLocalStackSize(),
1645                                         NeedsWinCFI, &HasWinCFI);
1646   }
1647 
1648   if (MF.hasWinCFI()) {
1649     // If the prologue didn't contain any SEH opcodes and didn't set the
1650     // MF.hasWinCFI() flag, assume the epilogue won't either, and skip the
1651     // EpilogStart - to avoid generating CFI for functions that don't need it.
1652     // (And as we didn't generate any prologue at all, it would be asymmetrical
1653     // to the epilogue.) By the end of the function, we assert that
1654     // HasWinCFI is equal to MF.hasWinCFI(), to verify this assumption.
1655     HasWinCFI = true;
1656     BuildMI(MBB, LastPopI, DL, TII->get(AArch64::SEH_EpilogStart))
1657         .setMIFlag(MachineInstr::FrameDestroy);
1658   }
1659 
1660   const StackOffset &SVEStackSize = getSVEStackSize(MF);
1661 
1662   // If there is a single SP update, insert it before the ret and we're done.
1663   if (CombineSPBump) {
1664     assert(!SVEStackSize && "Cannot combine SP bump with SVE");
1665     emitFrameOffset(MBB, MBB.getFirstTerminator(), DL, AArch64::SP, AArch64::SP,
1666                     StackOffset::getFixed(NumBytes + (int64_t)AfterCSRPopSize),
1667                     TII, MachineInstr::FrameDestroy, false, NeedsWinCFI,
1668                     &HasWinCFI);
1669     if (HasWinCFI)
1670       BuildMI(MBB, MBB.getFirstTerminator(), DL,
1671               TII->get(AArch64::SEH_EpilogEnd))
1672           .setMIFlag(MachineInstr::FrameDestroy);
1673     return;
1674   }
1675 
1676   NumBytes -= PrologueSaveSize;
1677   assert(NumBytes >= 0 && "Negative stack allocation size!?");
1678 
1679   // Process the SVE callee-saves to determine what space needs to be
1680   // deallocated.
1681   StackOffset DeallocateBefore = {}, DeallocateAfter = SVEStackSize;
1682   MachineBasicBlock::iterator RestoreBegin = LastPopI, RestoreEnd = LastPopI;
1683   if (int64_t CalleeSavedSize = AFI->getSVECalleeSavedStackSize()) {
1684     RestoreBegin = std::prev(RestoreEnd);
1685     while (RestoreBegin != MBB.begin() &&
1686            IsSVECalleeSave(std::prev(RestoreBegin)))
1687       --RestoreBegin;
1688 
1689     assert(IsSVECalleeSave(RestoreBegin) &&
1690            IsSVECalleeSave(std::prev(RestoreEnd)) && "Unexpected instruction");
1691 
1692     StackOffset CalleeSavedSizeAsOffset =
1693         StackOffset::getScalable(CalleeSavedSize);
1694     DeallocateBefore = SVEStackSize - CalleeSavedSizeAsOffset;
1695     DeallocateAfter = CalleeSavedSizeAsOffset;
1696   }
1697 
1698   // Deallocate the SVE area.
1699   if (SVEStackSize) {
1700     if (AFI->isStackRealigned()) {
1701       if (int64_t CalleeSavedSize = AFI->getSVECalleeSavedStackSize())
1702         // Set SP to start of SVE callee-save area from which they can
1703         // be reloaded. The code below will deallocate the stack space
1704         // space by moving FP -> SP.
1705         emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, AArch64::FP,
1706                         StackOffset::getScalable(-CalleeSavedSize), TII,
1707                         MachineInstr::FrameDestroy);
1708     } else {
1709       if (AFI->getSVECalleeSavedStackSize()) {
1710         // Deallocate the non-SVE locals first before we can deallocate (and
1711         // restore callee saves) from the SVE area.
1712         emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, AArch64::SP,
1713                         StackOffset::getFixed(NumBytes), TII,
1714                         MachineInstr::FrameDestroy);
1715         NumBytes = 0;
1716       }
1717 
1718       emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, AArch64::SP,
1719                       DeallocateBefore, TII, MachineInstr::FrameDestroy);
1720 
1721       emitFrameOffset(MBB, RestoreEnd, DL, AArch64::SP, AArch64::SP,
1722                       DeallocateAfter, TII, MachineInstr::FrameDestroy);
1723     }
1724   }
1725 
1726   if (!hasFP(MF)) {
1727     bool RedZone = canUseRedZone(MF);
1728     // If this was a redzone leaf function, we don't need to restore the
1729     // stack pointer (but we may need to pop stack args for fastcc).
1730     if (RedZone && AfterCSRPopSize == 0)
1731       return;
1732 
1733     bool NoCalleeSaveRestore = PrologueSaveSize == 0;
1734     int64_t StackRestoreBytes = RedZone ? 0 : NumBytes;
1735     if (NoCalleeSaveRestore)
1736       StackRestoreBytes += AfterCSRPopSize;
1737 
1738     // If we were able to combine the local stack pop with the argument pop,
1739     // then we're done.
1740     bool Done = NoCalleeSaveRestore || AfterCSRPopSize == 0;
1741 
1742     // If we're done after this, make sure to help the load store optimizer.
1743     if (Done)
1744       adaptForLdStOpt(MBB, MBB.getFirstTerminator(), LastPopI);
1745 
1746     emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP,
1747                     StackOffset::getFixed(StackRestoreBytes), TII,
1748                     MachineInstr::FrameDestroy, false, NeedsWinCFI, &HasWinCFI);
1749     if (Done) {
1750       if (HasWinCFI) {
1751         BuildMI(MBB, MBB.getFirstTerminator(), DL,
1752                 TII->get(AArch64::SEH_EpilogEnd))
1753             .setMIFlag(MachineInstr::FrameDestroy);
1754       }
1755       return;
1756     }
1757 
1758     NumBytes = 0;
1759   }
1760 
1761   // Restore the original stack pointer.
1762   // FIXME: Rather than doing the math here, we should instead just use
1763   // non-post-indexed loads for the restores if we aren't actually going to
1764   // be able to save any instructions.
1765   if (!IsFunclet && (MFI.hasVarSizedObjects() || AFI->isStackRealigned())) {
1766     emitFrameOffset(
1767         MBB, LastPopI, DL, AArch64::SP, AArch64::FP,
1768         StackOffset::getFixed(-AFI->getCalleeSaveBaseToFrameRecordOffset()),
1769         TII, MachineInstr::FrameDestroy, false, NeedsWinCFI);
1770   } else if (NumBytes)
1771     emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP,
1772                     StackOffset::getFixed(NumBytes), TII,
1773                     MachineInstr::FrameDestroy, false, NeedsWinCFI);
1774 
1775   // This must be placed after the callee-save restore code because that code
1776   // assumes the SP is at the same location as it was after the callee-save save
1777   // code in the prologue.
1778   if (AfterCSRPopSize) {
1779     // Find an insertion point for the first ldp so that it goes before the
1780     // shadow call stack epilog instruction. This ensures that the restore of
1781     // lr from x18 is placed after the restore from sp.
1782     auto FirstSPPopI = MBB.getFirstTerminator();
1783     while (FirstSPPopI != Begin) {
1784       auto Prev = std::prev(FirstSPPopI);
1785       if (Prev->getOpcode() != AArch64::LDRXpre ||
1786           Prev->getOperand(0).getReg() == AArch64::SP)
1787         break;
1788       FirstSPPopI = Prev;
1789     }
1790 
1791     adaptForLdStOpt(MBB, FirstSPPopI, LastPopI);
1792 
1793     emitFrameOffset(MBB, FirstSPPopI, DL, AArch64::SP, AArch64::SP,
1794                     StackOffset::getFixed((int64_t)AfterCSRPopSize), TII,
1795                     MachineInstr::FrameDestroy, false, NeedsWinCFI, &HasWinCFI);
1796   }
1797   if (HasWinCFI)
1798     BuildMI(MBB, MBB.getFirstTerminator(), DL, TII->get(AArch64::SEH_EpilogEnd))
1799         .setMIFlag(MachineInstr::FrameDestroy);
1800 }
1801 
1802 /// getFrameIndexReference - Provide a base+offset reference to an FI slot for
1803 /// debug info.  It's the same as what we use for resolving the code-gen
1804 /// references for now.  FIXME: This can go wrong when references are
1805 /// SP-relative and simple call frames aren't used.
1806 StackOffset
getFrameIndexReference(const MachineFunction & MF,int FI,Register & FrameReg) const1807 AArch64FrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI,
1808                                              Register &FrameReg) const {
1809   return resolveFrameIndexReference(
1810       MF, FI, FrameReg,
1811       /*PreferFP=*/
1812       MF.getFunction().hasFnAttribute(Attribute::SanitizeHWAddress),
1813       /*ForSimm=*/false);
1814 }
1815 
1816 StackOffset
getNonLocalFrameIndexReference(const MachineFunction & MF,int FI) const1817 AArch64FrameLowering::getNonLocalFrameIndexReference(const MachineFunction &MF,
1818                                                      int FI) const {
1819   return StackOffset::getFixed(getSEHFrameIndexOffset(MF, FI));
1820 }
1821 
getFPOffset(const MachineFunction & MF,int64_t ObjectOffset)1822 static StackOffset getFPOffset(const MachineFunction &MF,
1823                                int64_t ObjectOffset) {
1824   const auto *AFI = MF.getInfo<AArch64FunctionInfo>();
1825   const auto &Subtarget = MF.getSubtarget<AArch64Subtarget>();
1826   bool IsWin64 =
1827       Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv());
1828   unsigned FixedObject =
1829       getFixedObjectSize(MF, AFI, IsWin64, /*IsFunclet=*/false);
1830   int64_t CalleeSaveSize = AFI->getCalleeSavedStackSize(MF.getFrameInfo());
1831   int64_t FPAdjust =
1832       CalleeSaveSize - AFI->getCalleeSaveBaseToFrameRecordOffset();
1833   return StackOffset::getFixed(ObjectOffset + FixedObject + FPAdjust);
1834 }
1835 
getStackOffset(const MachineFunction & MF,int64_t ObjectOffset)1836 static StackOffset getStackOffset(const MachineFunction &MF,
1837                                   int64_t ObjectOffset) {
1838   const auto &MFI = MF.getFrameInfo();
1839   return StackOffset::getFixed(ObjectOffset + (int64_t)MFI.getStackSize());
1840 }
1841 
1842   // TODO: This function currently does not work for scalable vectors.
getSEHFrameIndexOffset(const MachineFunction & MF,int FI) const1843 int AArch64FrameLowering::getSEHFrameIndexOffset(const MachineFunction &MF,
1844                                                  int FI) const {
1845   const auto *RegInfo = static_cast<const AArch64RegisterInfo *>(
1846       MF.getSubtarget().getRegisterInfo());
1847   int ObjectOffset = MF.getFrameInfo().getObjectOffset(FI);
1848   return RegInfo->getLocalAddressRegister(MF) == AArch64::FP
1849              ? getFPOffset(MF, ObjectOffset).getFixed()
1850              : getStackOffset(MF, ObjectOffset).getFixed();
1851 }
1852 
resolveFrameIndexReference(const MachineFunction & MF,int FI,Register & FrameReg,bool PreferFP,bool ForSimm) const1853 StackOffset AArch64FrameLowering::resolveFrameIndexReference(
1854     const MachineFunction &MF, int FI, Register &FrameReg, bool PreferFP,
1855     bool ForSimm) const {
1856   const auto &MFI = MF.getFrameInfo();
1857   int64_t ObjectOffset = MFI.getObjectOffset(FI);
1858   bool isFixed = MFI.isFixedObjectIndex(FI);
1859   bool isSVE = MFI.getStackID(FI) == TargetStackID::ScalableVector;
1860   return resolveFrameOffsetReference(MF, ObjectOffset, isFixed, isSVE, FrameReg,
1861                                      PreferFP, ForSimm);
1862 }
1863 
resolveFrameOffsetReference(const MachineFunction & MF,int64_t ObjectOffset,bool isFixed,bool isSVE,Register & FrameReg,bool PreferFP,bool ForSimm) const1864 StackOffset AArch64FrameLowering::resolveFrameOffsetReference(
1865     const MachineFunction &MF, int64_t ObjectOffset, bool isFixed, bool isSVE,
1866     Register &FrameReg, bool PreferFP, bool ForSimm) const {
1867   const auto &MFI = MF.getFrameInfo();
1868   const auto *RegInfo = static_cast<const AArch64RegisterInfo *>(
1869       MF.getSubtarget().getRegisterInfo());
1870   const auto *AFI = MF.getInfo<AArch64FunctionInfo>();
1871   const auto &Subtarget = MF.getSubtarget<AArch64Subtarget>();
1872 
1873   int64_t FPOffset = getFPOffset(MF, ObjectOffset).getFixed();
1874   int64_t Offset = getStackOffset(MF, ObjectOffset).getFixed();
1875   bool isCSR =
1876       !isFixed && ObjectOffset >= -((int)AFI->getCalleeSavedStackSize(MFI));
1877 
1878   const StackOffset &SVEStackSize = getSVEStackSize(MF);
1879 
1880   // Use frame pointer to reference fixed objects. Use it for locals if
1881   // there are VLAs or a dynamically realigned SP (and thus the SP isn't
1882   // reliable as a base). Make sure useFPForScavengingIndex() does the
1883   // right thing for the emergency spill slot.
1884   bool UseFP = false;
1885   if (AFI->hasStackFrame() && !isSVE) {
1886     // We shouldn't prefer using the FP when there is an SVE area
1887     // in between the FP and the non-SVE locals/spills.
1888     PreferFP &= !SVEStackSize;
1889 
1890     // Note: Keeping the following as multiple 'if' statements rather than
1891     // merging to a single expression for readability.
1892     //
1893     // Argument access should always use the FP.
1894     if (isFixed) {
1895       UseFP = hasFP(MF);
1896     } else if (isCSR && RegInfo->needsStackRealignment(MF)) {
1897       // References to the CSR area must use FP if we're re-aligning the stack
1898       // since the dynamically-sized alignment padding is between the SP/BP and
1899       // the CSR area.
1900       assert(hasFP(MF) && "Re-aligned stack must have frame pointer");
1901       UseFP = true;
1902     } else if (hasFP(MF) && !RegInfo->needsStackRealignment(MF)) {
1903       // If the FPOffset is negative and we're producing a signed immediate, we
1904       // have to keep in mind that the available offset range for negative
1905       // offsets is smaller than for positive ones. If an offset is available
1906       // via the FP and the SP, use whichever is closest.
1907       bool FPOffsetFits = !ForSimm || FPOffset >= -256;
1908       PreferFP |= Offset > -FPOffset;
1909 
1910       if (MFI.hasVarSizedObjects()) {
1911         // If we have variable sized objects, we can use either FP or BP, as the
1912         // SP offset is unknown. We can use the base pointer if we have one and
1913         // FP is not preferred. If not, we're stuck with using FP.
1914         bool CanUseBP = RegInfo->hasBasePointer(MF);
1915         if (FPOffsetFits && CanUseBP) // Both are ok. Pick the best.
1916           UseFP = PreferFP;
1917         else if (!CanUseBP) // Can't use BP. Forced to use FP.
1918           UseFP = true;
1919         // else we can use BP and FP, but the offset from FP won't fit.
1920         // That will make us scavenge registers which we can probably avoid by
1921         // using BP. If it won't fit for BP either, we'll scavenge anyway.
1922       } else if (FPOffset >= 0) {
1923         // Use SP or FP, whichever gives us the best chance of the offset
1924         // being in range for direct access. If the FPOffset is positive,
1925         // that'll always be best, as the SP will be even further away.
1926         UseFP = true;
1927       } else if (MF.hasEHFunclets() && !RegInfo->hasBasePointer(MF)) {
1928         // Funclets access the locals contained in the parent's stack frame
1929         // via the frame pointer, so we have to use the FP in the parent
1930         // function.
1931         (void) Subtarget;
1932         assert(
1933             Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv()) &&
1934             "Funclets should only be present on Win64");
1935         UseFP = true;
1936       } else {
1937         // We have the choice between FP and (SP or BP).
1938         if (FPOffsetFits && PreferFP) // If FP is the best fit, use it.
1939           UseFP = true;
1940       }
1941     }
1942   }
1943 
1944   assert(((isFixed || isCSR) || !RegInfo->needsStackRealignment(MF) || !UseFP) &&
1945          "In the presence of dynamic stack pointer realignment, "
1946          "non-argument/CSR objects cannot be accessed through the frame pointer");
1947 
1948   if (isSVE) {
1949     StackOffset FPOffset =
1950         StackOffset::get(-AFI->getCalleeSaveBaseToFrameRecordOffset(), ObjectOffset);
1951     StackOffset SPOffset =
1952         SVEStackSize +
1953         StackOffset::get(MFI.getStackSize() - AFI->getCalleeSavedStackSize(),
1954                          ObjectOffset);
1955     // Always use the FP for SVE spills if available and beneficial.
1956     if (hasFP(MF) &&
1957         (SPOffset.getFixed() ||
1958          FPOffset.getScalable() < SPOffset.getScalable() ||
1959          RegInfo->needsStackRealignment(MF))) {
1960       FrameReg = RegInfo->getFrameRegister(MF);
1961       return FPOffset;
1962     }
1963 
1964     FrameReg = RegInfo->hasBasePointer(MF) ? RegInfo->getBaseRegister()
1965                                            : (unsigned)AArch64::SP;
1966     return SPOffset;
1967   }
1968 
1969   StackOffset ScalableOffset = {};
1970   if (UseFP && !(isFixed || isCSR))
1971     ScalableOffset = -SVEStackSize;
1972   if (!UseFP && (isFixed || isCSR))
1973     ScalableOffset = SVEStackSize;
1974 
1975   if (UseFP) {
1976     FrameReg = RegInfo->getFrameRegister(MF);
1977     return StackOffset::getFixed(FPOffset) + ScalableOffset;
1978   }
1979 
1980   // Use the base pointer if we have one.
1981   if (RegInfo->hasBasePointer(MF))
1982     FrameReg = RegInfo->getBaseRegister();
1983   else {
1984     assert(!MFI.hasVarSizedObjects() &&
1985            "Can't use SP when we have var sized objects.");
1986     FrameReg = AArch64::SP;
1987     // If we're using the red zone for this function, the SP won't actually
1988     // be adjusted, so the offsets will be negative. They're also all
1989     // within range of the signed 9-bit immediate instructions.
1990     if (canUseRedZone(MF))
1991       Offset -= AFI->getLocalStackSize();
1992   }
1993 
1994   return StackOffset::getFixed(Offset) + ScalableOffset;
1995 }
1996 
getPrologueDeath(MachineFunction & MF,unsigned Reg)1997 static unsigned getPrologueDeath(MachineFunction &MF, unsigned Reg) {
1998   // Do not set a kill flag on values that are also marked as live-in. This
1999   // happens with the @llvm-returnaddress intrinsic and with arguments passed in
2000   // callee saved registers.
2001   // Omitting the kill flags is conservatively correct even if the live-in
2002   // is not used after all.
2003   bool IsLiveIn = MF.getRegInfo().isLiveIn(Reg);
2004   return getKillRegState(!IsLiveIn);
2005 }
2006 
produceCompactUnwindFrame(MachineFunction & MF)2007 static bool produceCompactUnwindFrame(MachineFunction &MF) {
2008   const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
2009   AttributeList Attrs = MF.getFunction().getAttributes();
2010   return Subtarget.isTargetMachO() &&
2011          !(Subtarget.getTargetLowering()->supportSwiftError() &&
2012            Attrs.hasAttrSomewhere(Attribute::SwiftError));
2013 }
2014 
invalidateWindowsRegisterPairing(unsigned Reg1,unsigned Reg2,bool NeedsWinCFI,bool IsFirst)2015 static bool invalidateWindowsRegisterPairing(unsigned Reg1, unsigned Reg2,
2016                                              bool NeedsWinCFI, bool IsFirst) {
2017   // If we are generating register pairs for a Windows function that requires
2018   // EH support, then pair consecutive registers only.  There are no unwind
2019   // opcodes for saves/restores of non-consectuve register pairs.
2020   // The unwind opcodes are save_regp, save_regp_x, save_fregp, save_frepg_x,
2021   // save_lrpair.
2022   // https://docs.microsoft.com/en-us/cpp/build/arm64-exception-handling
2023 
2024   if (Reg2 == AArch64::FP)
2025     return true;
2026   if (!NeedsWinCFI)
2027     return false;
2028   if (Reg2 == Reg1 + 1)
2029     return false;
2030   // If pairing a GPR with LR, the pair can be described by the save_lrpair
2031   // opcode. If this is the first register pair, it would end up with a
2032   // predecrement, but there's no save_lrpair_x opcode, so we can only do this
2033   // if LR is paired with something else than the first register.
2034   // The save_lrpair opcode requires the first register to be an odd one.
2035   if (Reg1 >= AArch64::X19 && Reg1 <= AArch64::X27 &&
2036       (Reg1 - AArch64::X19) % 2 == 0 && Reg2 == AArch64::LR && !IsFirst)
2037     return false;
2038   return true;
2039 }
2040 
2041 /// Returns true if Reg1 and Reg2 cannot be paired using a ldp/stp instruction.
2042 /// WindowsCFI requires that only consecutive registers can be paired.
2043 /// LR and FP need to be allocated together when the frame needs to save
2044 /// the frame-record. This means any other register pairing with LR is invalid.
invalidateRegisterPairing(unsigned Reg1,unsigned Reg2,bool UsesWinAAPCS,bool NeedsWinCFI,bool NeedsFrameRecord,bool IsFirst)2045 static bool invalidateRegisterPairing(unsigned Reg1, unsigned Reg2,
2046                                       bool UsesWinAAPCS, bool NeedsWinCFI,
2047                                       bool NeedsFrameRecord, bool IsFirst) {
2048   if (UsesWinAAPCS)
2049     return invalidateWindowsRegisterPairing(Reg1, Reg2, NeedsWinCFI, IsFirst);
2050 
2051   // If we need to store the frame record, don't pair any register
2052   // with LR other than FP.
2053   if (NeedsFrameRecord)
2054     return Reg2 == AArch64::LR;
2055 
2056   return false;
2057 }
2058 
2059 namespace {
2060 
2061 struct RegPairInfo {
2062   unsigned Reg1 = AArch64::NoRegister;
2063   unsigned Reg2 = AArch64::NoRegister;
2064   int FrameIdx;
2065   int Offset;
2066   enum RegType { GPR, FPR64, FPR128, PPR, ZPR } Type;
2067 
2068   RegPairInfo() = default;
2069 
isPaired__anon36e7b1910411::RegPairInfo2070   bool isPaired() const { return Reg2 != AArch64::NoRegister; }
2071 
getScale__anon36e7b1910411::RegPairInfo2072   unsigned getScale() const {
2073     switch (Type) {
2074     case PPR:
2075       return 2;
2076     case GPR:
2077     case FPR64:
2078       return 8;
2079     case ZPR:
2080     case FPR128:
2081       return 16;
2082     }
2083     llvm_unreachable("Unsupported type");
2084   }
2085 
isScalable__anon36e7b1910411::RegPairInfo2086   bool isScalable() const { return Type == PPR || Type == ZPR; }
2087 };
2088 
2089 } // end anonymous namespace
2090 
computeCalleeSaveRegisterPairs(MachineFunction & MF,ArrayRef<CalleeSavedInfo> CSI,const TargetRegisterInfo * TRI,SmallVectorImpl<RegPairInfo> & RegPairs,bool & NeedShadowCallStackProlog,bool NeedsFrameRecord)2091 static void computeCalleeSaveRegisterPairs(
2092     MachineFunction &MF, ArrayRef<CalleeSavedInfo> CSI,
2093     const TargetRegisterInfo *TRI, SmallVectorImpl<RegPairInfo> &RegPairs,
2094     bool &NeedShadowCallStackProlog, bool NeedsFrameRecord) {
2095 
2096   if (CSI.empty())
2097     return;
2098 
2099   bool IsWindows = isTargetWindows(MF);
2100   bool NeedsWinCFI = needsWinCFI(MF);
2101   AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
2102   MachineFrameInfo &MFI = MF.getFrameInfo();
2103   CallingConv::ID CC = MF.getFunction().getCallingConv();
2104   unsigned Count = CSI.size();
2105   (void)CC;
2106   // MachO's compact unwind format relies on all registers being stored in
2107   // pairs.
2108   assert((!produceCompactUnwindFrame(MF) ||
2109           CC == CallingConv::PreserveMost ||
2110           (Count & 1) == 0) &&
2111          "Odd number of callee-saved regs to spill!");
2112   int ByteOffset = AFI->getCalleeSavedStackSize();
2113   int StackFillDir = -1;
2114   int RegInc = 1;
2115   unsigned FirstReg = 0;
2116   if (NeedsWinCFI) {
2117     // For WinCFI, fill the stack from the bottom up.
2118     ByteOffset = 0;
2119     StackFillDir = 1;
2120     // As the CSI array is reversed to match PrologEpilogInserter, iterate
2121     // backwards, to pair up registers starting from lower numbered registers.
2122     RegInc = -1;
2123     FirstReg = Count - 1;
2124   }
2125   int ScalableByteOffset = AFI->getSVECalleeSavedStackSize();
2126 
2127   // When iterating backwards, the loop condition relies on unsigned wraparound.
2128   for (unsigned i = FirstReg; i < Count; i += RegInc) {
2129     RegPairInfo RPI;
2130     RPI.Reg1 = CSI[i].getReg();
2131 
2132     if (AArch64::GPR64RegClass.contains(RPI.Reg1))
2133       RPI.Type = RegPairInfo::GPR;
2134     else if (AArch64::FPR64RegClass.contains(RPI.Reg1))
2135       RPI.Type = RegPairInfo::FPR64;
2136     else if (AArch64::FPR128RegClass.contains(RPI.Reg1))
2137       RPI.Type = RegPairInfo::FPR128;
2138     else if (AArch64::ZPRRegClass.contains(RPI.Reg1))
2139       RPI.Type = RegPairInfo::ZPR;
2140     else if (AArch64::PPRRegClass.contains(RPI.Reg1))
2141       RPI.Type = RegPairInfo::PPR;
2142     else
2143       llvm_unreachable("Unsupported register class.");
2144 
2145     // Add the next reg to the pair if it is in the same register class.
2146     if (unsigned(i + RegInc) < Count) {
2147       unsigned NextReg = CSI[i + RegInc].getReg();
2148       bool IsFirst = i == FirstReg;
2149       switch (RPI.Type) {
2150       case RegPairInfo::GPR:
2151         if (AArch64::GPR64RegClass.contains(NextReg) &&
2152             !invalidateRegisterPairing(RPI.Reg1, NextReg, IsWindows,
2153                                        NeedsWinCFI, NeedsFrameRecord, IsFirst))
2154           RPI.Reg2 = NextReg;
2155         break;
2156       case RegPairInfo::FPR64:
2157         if (AArch64::FPR64RegClass.contains(NextReg) &&
2158             !invalidateWindowsRegisterPairing(RPI.Reg1, NextReg, NeedsWinCFI,
2159                                               IsFirst))
2160           RPI.Reg2 = NextReg;
2161         break;
2162       case RegPairInfo::FPR128:
2163         if (AArch64::FPR128RegClass.contains(NextReg))
2164           RPI.Reg2 = NextReg;
2165         break;
2166       case RegPairInfo::PPR:
2167       case RegPairInfo::ZPR:
2168         break;
2169       }
2170     }
2171 
2172     // If either of the registers to be saved is the lr register, it means that
2173     // we also need to save lr in the shadow call stack.
2174     if ((RPI.Reg1 == AArch64::LR || RPI.Reg2 == AArch64::LR) &&
2175         MF.getFunction().hasFnAttribute(Attribute::ShadowCallStack)) {
2176       if (!MF.getSubtarget<AArch64Subtarget>().isXRegisterReserved(18))
2177         report_fatal_error("Must reserve x18 to use shadow call stack");
2178       NeedShadowCallStackProlog = true;
2179     }
2180 
2181     // GPRs and FPRs are saved in pairs of 64-bit regs. We expect the CSI
2182     // list to come in sorted by frame index so that we can issue the store
2183     // pair instructions directly. Assert if we see anything otherwise.
2184     //
2185     // The order of the registers in the list is controlled by
2186     // getCalleeSavedRegs(), so they will always be in-order, as well.
2187     assert((!RPI.isPaired() ||
2188             (CSI[i].getFrameIdx() + RegInc == CSI[i + RegInc].getFrameIdx())) &&
2189            "Out of order callee saved regs!");
2190 
2191     assert((!RPI.isPaired() || !NeedsFrameRecord || RPI.Reg2 != AArch64::FP ||
2192             RPI.Reg1 == AArch64::LR) &&
2193            "FrameRecord must be allocated together with LR");
2194 
2195     // Windows AAPCS has FP and LR reversed.
2196     assert((!RPI.isPaired() || !NeedsFrameRecord || RPI.Reg1 != AArch64::FP ||
2197             RPI.Reg2 == AArch64::LR) &&
2198            "FrameRecord must be allocated together with LR");
2199 
2200     // MachO's compact unwind format relies on all registers being stored in
2201     // adjacent register pairs.
2202     assert((!produceCompactUnwindFrame(MF) ||
2203             CC == CallingConv::PreserveMost ||
2204             (RPI.isPaired() &&
2205              ((RPI.Reg1 == AArch64::LR && RPI.Reg2 == AArch64::FP) ||
2206               RPI.Reg1 + 1 == RPI.Reg2))) &&
2207            "Callee-save registers not saved as adjacent register pair!");
2208 
2209     RPI.FrameIdx = CSI[i].getFrameIdx();
2210     if (NeedsWinCFI &&
2211         RPI.isPaired()) // RPI.FrameIdx must be the lower index of the pair
2212       RPI.FrameIdx = CSI[i + RegInc].getFrameIdx();
2213 
2214     int Scale = RPI.getScale();
2215 
2216     int OffsetPre = RPI.isScalable() ? ScalableByteOffset : ByteOffset;
2217     assert(OffsetPre % Scale == 0);
2218 
2219     if (RPI.isScalable())
2220       ScalableByteOffset += StackFillDir * Scale;
2221     else
2222       ByteOffset += StackFillDir * (RPI.isPaired() ? 2 * Scale : Scale);
2223 
2224     assert(!(RPI.isScalable() && RPI.isPaired()) &&
2225            "Paired spill/fill instructions don't exist for SVE vectors");
2226 
2227     // Round up size of non-pair to pair size if we need to pad the
2228     // callee-save area to ensure 16-byte alignment.
2229     if (AFI->hasCalleeSaveStackFreeSpace() && !NeedsWinCFI &&
2230         !RPI.isScalable() && RPI.Type != RegPairInfo::FPR128 &&
2231         !RPI.isPaired()) {
2232       ByteOffset += 8 * StackFillDir;
2233       assert(ByteOffset % 16 == 0);
2234       assert(MFI.getObjectAlign(RPI.FrameIdx) <= Align(16));
2235       // A stack frame with a gap looks like this, bottom up:
2236       // d9, d8. x21, gap, x20, x19.
2237       // Set extra alignment on the x21 object (the only unpaired register)
2238       // to create the gap above it.
2239       MFI.setObjectAlignment(RPI.FrameIdx, Align(16));
2240     }
2241 
2242     int OffsetPost = RPI.isScalable() ? ScalableByteOffset : ByteOffset;
2243     assert(OffsetPost % Scale == 0);
2244     // If filling top down (default), we want the offset after incrementing it.
2245     // If fillibg bootom up (WinCFI) we need the original offset.
2246     int Offset = NeedsWinCFI ? OffsetPre : OffsetPost;
2247     RPI.Offset = Offset / Scale;
2248 
2249     assert(((!RPI.isScalable() && RPI.Offset >= -64 && RPI.Offset <= 63) ||
2250             (RPI.isScalable() && RPI.Offset >= -256 && RPI.Offset <= 255)) &&
2251            "Offset out of bounds for LDP/STP immediate");
2252 
2253     // Save the offset to frame record so that the FP register can point to the
2254     // innermost frame record (spilled FP and LR registers).
2255     if (NeedsFrameRecord && ((!IsWindows && RPI.Reg1 == AArch64::LR &&
2256                               RPI.Reg2 == AArch64::FP) ||
2257                              (IsWindows && RPI.Reg1 == AArch64::FP &&
2258                               RPI.Reg2 == AArch64::LR)))
2259       AFI->setCalleeSaveBaseToFrameRecordOffset(Offset);
2260 
2261     RegPairs.push_back(RPI);
2262     if (RPI.isPaired())
2263       i += RegInc;
2264   }
2265   if (NeedsWinCFI) {
2266     // If we need an alignment gap in the stack, align the topmost stack
2267     // object. A stack frame with a gap looks like this, bottom up:
2268     // x19, d8. d9, gap.
2269     // Set extra alignment on the topmost stack object (the first element in
2270     // CSI, which goes top down), to create the gap above it.
2271     if (AFI->hasCalleeSaveStackFreeSpace())
2272       MFI.setObjectAlignment(CSI[0].getFrameIdx(), Align(16));
2273     // We iterated bottom up over the registers; flip RegPairs back to top
2274     // down order.
2275     std::reverse(RegPairs.begin(), RegPairs.end());
2276   }
2277 }
2278 
spillCalleeSavedRegisters(MachineBasicBlock & MBB,MachineBasicBlock::iterator MI,ArrayRef<CalleeSavedInfo> CSI,const TargetRegisterInfo * TRI) const2279 bool AArch64FrameLowering::spillCalleeSavedRegisters(
2280     MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
2281     ArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const {
2282   MachineFunction &MF = *MBB.getParent();
2283   const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
2284   bool NeedsWinCFI = needsWinCFI(MF);
2285   DebugLoc DL;
2286   SmallVector<RegPairInfo, 8> RegPairs;
2287 
2288   bool NeedShadowCallStackProlog = false;
2289   computeCalleeSaveRegisterPairs(MF, CSI, TRI, RegPairs,
2290                                  NeedShadowCallStackProlog, hasFP(MF));
2291   const MachineRegisterInfo &MRI = MF.getRegInfo();
2292 
2293   if (NeedShadowCallStackProlog) {
2294     // Shadow call stack prolog: str x30, [x18], #8
2295     BuildMI(MBB, MI, DL, TII.get(AArch64::STRXpost))
2296         .addReg(AArch64::X18, RegState::Define)
2297         .addReg(AArch64::LR)
2298         .addReg(AArch64::X18)
2299         .addImm(8)
2300         .setMIFlag(MachineInstr::FrameSetup);
2301 
2302     if (NeedsWinCFI)
2303       BuildMI(MBB, MI, DL, TII.get(AArch64::SEH_Nop))
2304           .setMIFlag(MachineInstr::FrameSetup);
2305 
2306     if (!MF.getFunction().hasFnAttribute(Attribute::NoUnwind)) {
2307       // Emit a CFI instruction that causes 8 to be subtracted from the value of
2308       // x18 when unwinding past this frame.
2309       static const char CFIInst[] = {
2310           dwarf::DW_CFA_val_expression,
2311           18, // register
2312           2,  // length
2313           static_cast<char>(unsigned(dwarf::DW_OP_breg18)),
2314           static_cast<char>(-8) & 0x7f, // addend (sleb128)
2315       };
2316       unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createEscape(
2317           nullptr, StringRef(CFIInst, sizeof(CFIInst))));
2318       BuildMI(MBB, MI, DL, TII.get(AArch64::CFI_INSTRUCTION))
2319           .addCFIIndex(CFIIndex)
2320           .setMIFlag(MachineInstr::FrameSetup);
2321     }
2322 
2323     // This instruction also makes x18 live-in to the entry block.
2324     MBB.addLiveIn(AArch64::X18);
2325   }
2326 
2327   for (auto RPII = RegPairs.rbegin(), RPIE = RegPairs.rend(); RPII != RPIE;
2328        ++RPII) {
2329     RegPairInfo RPI = *RPII;
2330     unsigned Reg1 = RPI.Reg1;
2331     unsigned Reg2 = RPI.Reg2;
2332     unsigned StrOpc;
2333 
2334     // Issue sequence of spills for cs regs.  The first spill may be converted
2335     // to a pre-decrement store later by emitPrologue if the callee-save stack
2336     // area allocation can't be combined with the local stack area allocation.
2337     // For example:
2338     //    stp     x22, x21, [sp, #0]     // addImm(+0)
2339     //    stp     x20, x19, [sp, #16]    // addImm(+2)
2340     //    stp     fp, lr, [sp, #32]      // addImm(+4)
2341     // Rationale: This sequence saves uop updates compared to a sequence of
2342     // pre-increment spills like stp xi,xj,[sp,#-16]!
2343     // Note: Similar rationale and sequence for restores in epilog.
2344     unsigned Size;
2345     Align Alignment;
2346     switch (RPI.Type) {
2347     case RegPairInfo::GPR:
2348        StrOpc = RPI.isPaired() ? AArch64::STPXi : AArch64::STRXui;
2349        Size = 8;
2350        Alignment = Align(8);
2351        break;
2352     case RegPairInfo::FPR64:
2353        StrOpc = RPI.isPaired() ? AArch64::STPDi : AArch64::STRDui;
2354        Size = 8;
2355        Alignment = Align(8);
2356        break;
2357     case RegPairInfo::FPR128:
2358        StrOpc = RPI.isPaired() ? AArch64::STPQi : AArch64::STRQui;
2359        Size = 16;
2360        Alignment = Align(16);
2361        break;
2362     case RegPairInfo::ZPR:
2363        StrOpc = AArch64::STR_ZXI;
2364        Size = 16;
2365        Alignment = Align(16);
2366        break;
2367     case RegPairInfo::PPR:
2368        StrOpc = AArch64::STR_PXI;
2369        Size = 2;
2370        Alignment = Align(2);
2371        break;
2372     }
2373     LLVM_DEBUG(dbgs() << "CSR spill: (" << printReg(Reg1, TRI);
2374                if (RPI.isPaired()) dbgs() << ", " << printReg(Reg2, TRI);
2375                dbgs() << ") -> fi#(" << RPI.FrameIdx;
2376                if (RPI.isPaired()) dbgs() << ", " << RPI.FrameIdx + 1;
2377                dbgs() << ")\n");
2378 
2379     assert((!NeedsWinCFI || !(Reg1 == AArch64::LR && Reg2 == AArch64::FP)) &&
2380            "Windows unwdinding requires a consecutive (FP,LR) pair");
2381     // Windows unwind codes require consecutive registers if registers are
2382     // paired.  Make the switch here, so that the code below will save (x,x+1)
2383     // and not (x+1,x).
2384     unsigned FrameIdxReg1 = RPI.FrameIdx;
2385     unsigned FrameIdxReg2 = RPI.FrameIdx + 1;
2386     if (NeedsWinCFI && RPI.isPaired()) {
2387       std::swap(Reg1, Reg2);
2388       std::swap(FrameIdxReg1, FrameIdxReg2);
2389     }
2390     MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(StrOpc));
2391     if (!MRI.isReserved(Reg1))
2392       MBB.addLiveIn(Reg1);
2393     if (RPI.isPaired()) {
2394       if (!MRI.isReserved(Reg2))
2395         MBB.addLiveIn(Reg2);
2396       MIB.addReg(Reg2, getPrologueDeath(MF, Reg2));
2397       MIB.addMemOperand(MF.getMachineMemOperand(
2398           MachinePointerInfo::getFixedStack(MF, FrameIdxReg2),
2399           MachineMemOperand::MOStore, Size, Alignment));
2400     }
2401     MIB.addReg(Reg1, getPrologueDeath(MF, Reg1))
2402         .addReg(AArch64::SP)
2403         .addImm(RPI.Offset) // [sp, #offset*scale],
2404                             // where factor*scale is implicit
2405         .setMIFlag(MachineInstr::FrameSetup);
2406     MIB.addMemOperand(MF.getMachineMemOperand(
2407         MachinePointerInfo::getFixedStack(MF, FrameIdxReg1),
2408         MachineMemOperand::MOStore, Size, Alignment));
2409     if (NeedsWinCFI)
2410       InsertSEH(MIB, TII, MachineInstr::FrameSetup);
2411 
2412     // Update the StackIDs of the SVE stack slots.
2413     MachineFrameInfo &MFI = MF.getFrameInfo();
2414     if (RPI.Type == RegPairInfo::ZPR || RPI.Type == RegPairInfo::PPR)
2415       MFI.setStackID(RPI.FrameIdx, TargetStackID::ScalableVector);
2416 
2417   }
2418   return true;
2419 }
2420 
restoreCalleeSavedRegisters(MachineBasicBlock & MBB,MachineBasicBlock::iterator MI,MutableArrayRef<CalleeSavedInfo> CSI,const TargetRegisterInfo * TRI) const2421 bool AArch64FrameLowering::restoreCalleeSavedRegisters(
2422     MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
2423     MutableArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const {
2424   MachineFunction &MF = *MBB.getParent();
2425   const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
2426   DebugLoc DL;
2427   SmallVector<RegPairInfo, 8> RegPairs;
2428   bool NeedsWinCFI = needsWinCFI(MF);
2429 
2430   if (MI != MBB.end())
2431     DL = MI->getDebugLoc();
2432 
2433   bool NeedShadowCallStackProlog = false;
2434   computeCalleeSaveRegisterPairs(MF, CSI, TRI, RegPairs,
2435                                  NeedShadowCallStackProlog, hasFP(MF));
2436 
2437   auto EmitMI = [&](const RegPairInfo &RPI) {
2438     unsigned Reg1 = RPI.Reg1;
2439     unsigned Reg2 = RPI.Reg2;
2440 
2441     // Issue sequence of restores for cs regs. The last restore may be converted
2442     // to a post-increment load later by emitEpilogue if the callee-save stack
2443     // area allocation can't be combined with the local stack area allocation.
2444     // For example:
2445     //    ldp     fp, lr, [sp, #32]       // addImm(+4)
2446     //    ldp     x20, x19, [sp, #16]     // addImm(+2)
2447     //    ldp     x22, x21, [sp, #0]      // addImm(+0)
2448     // Note: see comment in spillCalleeSavedRegisters()
2449     unsigned LdrOpc;
2450     unsigned Size;
2451     Align Alignment;
2452     switch (RPI.Type) {
2453     case RegPairInfo::GPR:
2454        LdrOpc = RPI.isPaired() ? AArch64::LDPXi : AArch64::LDRXui;
2455        Size = 8;
2456        Alignment = Align(8);
2457        break;
2458     case RegPairInfo::FPR64:
2459        LdrOpc = RPI.isPaired() ? AArch64::LDPDi : AArch64::LDRDui;
2460        Size = 8;
2461        Alignment = Align(8);
2462        break;
2463     case RegPairInfo::FPR128:
2464        LdrOpc = RPI.isPaired() ? AArch64::LDPQi : AArch64::LDRQui;
2465        Size = 16;
2466        Alignment = Align(16);
2467        break;
2468     case RegPairInfo::ZPR:
2469        LdrOpc = AArch64::LDR_ZXI;
2470        Size = 16;
2471        Alignment = Align(16);
2472        break;
2473     case RegPairInfo::PPR:
2474        LdrOpc = AArch64::LDR_PXI;
2475        Size = 2;
2476        Alignment = Align(2);
2477        break;
2478     }
2479     LLVM_DEBUG(dbgs() << "CSR restore: (" << printReg(Reg1, TRI);
2480                if (RPI.isPaired()) dbgs() << ", " << printReg(Reg2, TRI);
2481                dbgs() << ") -> fi#(" << RPI.FrameIdx;
2482                if (RPI.isPaired()) dbgs() << ", " << RPI.FrameIdx + 1;
2483                dbgs() << ")\n");
2484 
2485     // Windows unwind codes require consecutive registers if registers are
2486     // paired.  Make the switch here, so that the code below will save (x,x+1)
2487     // and not (x+1,x).
2488     unsigned FrameIdxReg1 = RPI.FrameIdx;
2489     unsigned FrameIdxReg2 = RPI.FrameIdx + 1;
2490     if (NeedsWinCFI && RPI.isPaired()) {
2491       std::swap(Reg1, Reg2);
2492       std::swap(FrameIdxReg1, FrameIdxReg2);
2493     }
2494     MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(LdrOpc));
2495     if (RPI.isPaired()) {
2496       MIB.addReg(Reg2, getDefRegState(true));
2497       MIB.addMemOperand(MF.getMachineMemOperand(
2498           MachinePointerInfo::getFixedStack(MF, FrameIdxReg2),
2499           MachineMemOperand::MOLoad, Size, Alignment));
2500     }
2501     MIB.addReg(Reg1, getDefRegState(true))
2502         .addReg(AArch64::SP)
2503         .addImm(RPI.Offset) // [sp, #offset*scale]
2504                             // where factor*scale is implicit
2505         .setMIFlag(MachineInstr::FrameDestroy);
2506     MIB.addMemOperand(MF.getMachineMemOperand(
2507         MachinePointerInfo::getFixedStack(MF, FrameIdxReg1),
2508         MachineMemOperand::MOLoad, Size, Alignment));
2509     if (NeedsWinCFI)
2510       InsertSEH(MIB, TII, MachineInstr::FrameDestroy);
2511   };
2512 
2513   // SVE objects are always restored in reverse order.
2514   for (const RegPairInfo &RPI : reverse(RegPairs))
2515     if (RPI.isScalable())
2516       EmitMI(RPI);
2517 
2518   if (ReverseCSRRestoreSeq) {
2519     for (const RegPairInfo &RPI : reverse(RegPairs))
2520       if (!RPI.isScalable())
2521         EmitMI(RPI);
2522   } else
2523     for (const RegPairInfo &RPI : RegPairs)
2524       if (!RPI.isScalable())
2525         EmitMI(RPI);
2526 
2527   if (NeedShadowCallStackProlog) {
2528     // Shadow call stack epilog: ldr x30, [x18, #-8]!
2529     BuildMI(MBB, MI, DL, TII.get(AArch64::LDRXpre))
2530         .addReg(AArch64::X18, RegState::Define)
2531         .addReg(AArch64::LR, RegState::Define)
2532         .addReg(AArch64::X18)
2533         .addImm(-8)
2534         .setMIFlag(MachineInstr::FrameDestroy);
2535   }
2536 
2537   return true;
2538 }
2539 
determineCalleeSaves(MachineFunction & MF,BitVector & SavedRegs,RegScavenger * RS) const2540 void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
2541                                                 BitVector &SavedRegs,
2542                                                 RegScavenger *RS) const {
2543   // All calls are tail calls in GHC calling conv, and functions have no
2544   // prologue/epilogue.
2545   if (MF.getFunction().getCallingConv() == CallingConv::GHC)
2546     return;
2547 
2548   TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
2549   const AArch64RegisterInfo *RegInfo = static_cast<const AArch64RegisterInfo *>(
2550       MF.getSubtarget().getRegisterInfo());
2551   const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
2552   AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
2553   unsigned UnspilledCSGPR = AArch64::NoRegister;
2554   unsigned UnspilledCSGPRPaired = AArch64::NoRegister;
2555 
2556   MachineFrameInfo &MFI = MF.getFrameInfo();
2557   const MCPhysReg *CSRegs = MF.getRegInfo().getCalleeSavedRegs();
2558 
2559   unsigned BasePointerReg = RegInfo->hasBasePointer(MF)
2560                                 ? RegInfo->getBaseRegister()
2561                                 : (unsigned)AArch64::NoRegister;
2562 
2563   unsigned ExtraCSSpill = 0;
2564   // Figure out which callee-saved registers to save/restore.
2565   for (unsigned i = 0; CSRegs[i]; ++i) {
2566     const unsigned Reg = CSRegs[i];
2567 
2568     // Add the base pointer register to SavedRegs if it is callee-save.
2569     if (Reg == BasePointerReg)
2570       SavedRegs.set(Reg);
2571 
2572     bool RegUsed = SavedRegs.test(Reg);
2573     unsigned PairedReg = AArch64::NoRegister;
2574     if (AArch64::GPR64RegClass.contains(Reg) ||
2575         AArch64::FPR64RegClass.contains(Reg) ||
2576         AArch64::FPR128RegClass.contains(Reg))
2577       PairedReg = CSRegs[i ^ 1];
2578 
2579     if (!RegUsed) {
2580       if (AArch64::GPR64RegClass.contains(Reg) &&
2581           !RegInfo->isReservedReg(MF, Reg)) {
2582         UnspilledCSGPR = Reg;
2583         UnspilledCSGPRPaired = PairedReg;
2584       }
2585       continue;
2586     }
2587 
2588     // MachO's compact unwind format relies on all registers being stored in
2589     // pairs.
2590     // FIXME: the usual format is actually better if unwinding isn't needed.
2591     if (produceCompactUnwindFrame(MF) && PairedReg != AArch64::NoRegister &&
2592         !SavedRegs.test(PairedReg)) {
2593       SavedRegs.set(PairedReg);
2594       if (AArch64::GPR64RegClass.contains(PairedReg) &&
2595           !RegInfo->isReservedReg(MF, PairedReg))
2596         ExtraCSSpill = PairedReg;
2597     }
2598   }
2599 
2600   if (MF.getFunction().getCallingConv() == CallingConv::Win64 &&
2601       !Subtarget.isTargetWindows()) {
2602     // For Windows calling convention on a non-windows OS, where X18 is treated
2603     // as reserved, back up X18 when entering non-windows code (marked with the
2604     // Windows calling convention) and restore when returning regardless of
2605     // whether the individual function uses it - it might call other functions
2606     // that clobber it.
2607     SavedRegs.set(AArch64::X18);
2608   }
2609 
2610   // Calculates the callee saved stack size.
2611   unsigned CSStackSize = 0;
2612   unsigned SVECSStackSize = 0;
2613   const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
2614   const MachineRegisterInfo &MRI = MF.getRegInfo();
2615   for (unsigned Reg : SavedRegs.set_bits()) {
2616     auto RegSize = TRI->getRegSizeInBits(Reg, MRI) / 8;
2617     if (AArch64::PPRRegClass.contains(Reg) ||
2618         AArch64::ZPRRegClass.contains(Reg))
2619       SVECSStackSize += RegSize;
2620     else
2621       CSStackSize += RegSize;
2622   }
2623 
2624   // Save number of saved regs, so we can easily update CSStackSize later.
2625   unsigned NumSavedRegs = SavedRegs.count();
2626 
2627   // The frame record needs to be created by saving the appropriate registers
2628   uint64_t EstimatedStackSize = MFI.estimateStackSize(MF);
2629   if (hasFP(MF) ||
2630       windowsRequiresStackProbe(MF, EstimatedStackSize + CSStackSize + 16)) {
2631     SavedRegs.set(AArch64::FP);
2632     SavedRegs.set(AArch64::LR);
2633   }
2634 
2635   LLVM_DEBUG(dbgs() << "*** determineCalleeSaves\nSaved CSRs:";
2636              for (unsigned Reg
2637                   : SavedRegs.set_bits()) dbgs()
2638              << ' ' << printReg(Reg, RegInfo);
2639              dbgs() << "\n";);
2640 
2641   // If any callee-saved registers are used, the frame cannot be eliminated.
2642   int64_t SVEStackSize =
2643       alignTo(SVECSStackSize + estimateSVEStackObjectOffsets(MFI), 16);
2644   bool CanEliminateFrame = (SavedRegs.count() == 0) && !SVEStackSize;
2645 
2646   // The CSR spill slots have not been allocated yet, so estimateStackSize
2647   // won't include them.
2648   unsigned EstimatedStackSizeLimit = estimateRSStackSizeLimit(MF);
2649 
2650   // Conservatively always assume BigStack when there are SVE spills.
2651   bool BigStack = SVEStackSize ||
2652                   (EstimatedStackSize + CSStackSize) > EstimatedStackSizeLimit;
2653   if (BigStack || !CanEliminateFrame || RegInfo->cannotEliminateFrame(MF))
2654     AFI->setHasStackFrame(true);
2655 
2656   // Estimate if we might need to scavenge a register at some point in order
2657   // to materialize a stack offset. If so, either spill one additional
2658   // callee-saved register or reserve a special spill slot to facilitate
2659   // register scavenging. If we already spilled an extra callee-saved register
2660   // above to keep the number of spills even, we don't need to do anything else
2661   // here.
2662   if (BigStack) {
2663     if (!ExtraCSSpill && UnspilledCSGPR != AArch64::NoRegister) {
2664       LLVM_DEBUG(dbgs() << "Spilling " << printReg(UnspilledCSGPR, RegInfo)
2665                         << " to get a scratch register.\n");
2666       SavedRegs.set(UnspilledCSGPR);
2667       // MachO's compact unwind format relies on all registers being stored in
2668       // pairs, so if we need to spill one extra for BigStack, then we need to
2669       // store the pair.
2670       if (produceCompactUnwindFrame(MF))
2671         SavedRegs.set(UnspilledCSGPRPaired);
2672       ExtraCSSpill = UnspilledCSGPR;
2673     }
2674 
2675     // If we didn't find an extra callee-saved register to spill, create
2676     // an emergency spill slot.
2677     if (!ExtraCSSpill || MF.getRegInfo().isPhysRegUsed(ExtraCSSpill)) {
2678       const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
2679       const TargetRegisterClass &RC = AArch64::GPR64RegClass;
2680       unsigned Size = TRI->getSpillSize(RC);
2681       Align Alignment = TRI->getSpillAlign(RC);
2682       int FI = MFI.CreateStackObject(Size, Alignment, false);
2683       RS->addScavengingFrameIndex(FI);
2684       LLVM_DEBUG(dbgs() << "No available CS registers, allocated fi#" << FI
2685                         << " as the emergency spill slot.\n");
2686     }
2687   }
2688 
2689   // Adding the size of additional 64bit GPR saves.
2690   CSStackSize += 8 * (SavedRegs.count() - NumSavedRegs);
2691   uint64_t AlignedCSStackSize = alignTo(CSStackSize, 16);
2692   LLVM_DEBUG(dbgs() << "Estimated stack frame size: "
2693                << EstimatedStackSize + AlignedCSStackSize
2694                << " bytes.\n");
2695 
2696   assert((!MFI.isCalleeSavedInfoValid() ||
2697           AFI->getCalleeSavedStackSize() == AlignedCSStackSize) &&
2698          "Should not invalidate callee saved info");
2699 
2700   // Round up to register pair alignment to avoid additional SP adjustment
2701   // instructions.
2702   AFI->setCalleeSavedStackSize(AlignedCSStackSize);
2703   AFI->setCalleeSaveStackHasFreeSpace(AlignedCSStackSize != CSStackSize);
2704   AFI->setSVECalleeSavedStackSize(alignTo(SVECSStackSize, 16));
2705 }
2706 
assignCalleeSavedSpillSlots(MachineFunction & MF,const TargetRegisterInfo * TRI,std::vector<CalleeSavedInfo> & CSI) const2707 bool AArch64FrameLowering::assignCalleeSavedSpillSlots(
2708     MachineFunction &MF, const TargetRegisterInfo *TRI,
2709     std::vector<CalleeSavedInfo> &CSI) const {
2710   bool NeedsWinCFI = needsWinCFI(MF);
2711   // To match the canonical windows frame layout, reverse the list of
2712   // callee saved registers to get them laid out by PrologEpilogInserter
2713   // in the right order. (PrologEpilogInserter allocates stack objects top
2714   // down. Windows canonical prologs store higher numbered registers at
2715   // the top, thus have the CSI array start from the highest registers.)
2716   if (NeedsWinCFI)
2717     std::reverse(CSI.begin(), CSI.end());
2718   // Let the generic code do the rest of the setup.
2719   return false;
2720 }
2721 
enableStackSlotScavenging(const MachineFunction & MF) const2722 bool AArch64FrameLowering::enableStackSlotScavenging(
2723     const MachineFunction &MF) const {
2724   const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
2725   return AFI->hasCalleeSaveStackFreeSpace();
2726 }
2727 
2728 /// returns true if there are any SVE callee saves.
getSVECalleeSaveSlotRange(const MachineFrameInfo & MFI,int & Min,int & Max)2729 static bool getSVECalleeSaveSlotRange(const MachineFrameInfo &MFI,
2730                                       int &Min, int &Max) {
2731   Min = std::numeric_limits<int>::max();
2732   Max = std::numeric_limits<int>::min();
2733 
2734   if (!MFI.isCalleeSavedInfoValid())
2735     return false;
2736 
2737   const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo();
2738   for (auto &CS : CSI) {
2739     if (AArch64::ZPRRegClass.contains(CS.getReg()) ||
2740         AArch64::PPRRegClass.contains(CS.getReg())) {
2741       assert((Max == std::numeric_limits<int>::min() ||
2742               Max + 1 == CS.getFrameIdx()) &&
2743              "SVE CalleeSaves are not consecutive");
2744 
2745       Min = std::min(Min, CS.getFrameIdx());
2746       Max = std::max(Max, CS.getFrameIdx());
2747     }
2748   }
2749   return Min != std::numeric_limits<int>::max();
2750 }
2751 
2752 // Process all the SVE stack objects and determine offsets for each
2753 // object. If AssignOffsets is true, the offsets get assigned.
2754 // Fills in the first and last callee-saved frame indices into
2755 // Min/MaxCSFrameIndex, respectively.
2756 // Returns the size of the stack.
determineSVEStackObjectOffsets(MachineFrameInfo & MFI,int & MinCSFrameIndex,int & MaxCSFrameIndex,bool AssignOffsets)2757 static int64_t determineSVEStackObjectOffsets(MachineFrameInfo &MFI,
2758                                               int &MinCSFrameIndex,
2759                                               int &MaxCSFrameIndex,
2760                                               bool AssignOffsets) {
2761 #ifndef NDEBUG
2762   // First process all fixed stack objects.
2763   for (int I = MFI.getObjectIndexBegin(); I != 0; ++I)
2764     assert(MFI.getStackID(I) != TargetStackID::ScalableVector &&
2765            "SVE vectors should never be passed on the stack by value, only by "
2766            "reference.");
2767 #endif
2768 
2769   auto Assign = [&MFI](int FI, int64_t Offset) {
2770     LLVM_DEBUG(dbgs() << "alloc FI(" << FI << ") at SP[" << Offset << "]\n");
2771     MFI.setObjectOffset(FI, Offset);
2772   };
2773 
2774   int64_t Offset = 0;
2775 
2776   // Then process all callee saved slots.
2777   if (getSVECalleeSaveSlotRange(MFI, MinCSFrameIndex, MaxCSFrameIndex)) {
2778     // Assign offsets to the callee save slots.
2779     for (int I = MinCSFrameIndex; I <= MaxCSFrameIndex; ++I) {
2780       Offset += MFI.getObjectSize(I);
2781       Offset = alignTo(Offset, MFI.getObjectAlign(I));
2782       if (AssignOffsets)
2783         Assign(I, -Offset);
2784     }
2785   }
2786 
2787   // Ensure that the Callee-save area is aligned to 16bytes.
2788   Offset = alignTo(Offset, Align(16U));
2789 
2790   // Create a buffer of SVE objects to allocate and sort it.
2791   SmallVector<int, 8> ObjectsToAllocate;
2792   for (int I = 0, E = MFI.getObjectIndexEnd(); I != E; ++I) {
2793     unsigned StackID = MFI.getStackID(I);
2794     if (StackID != TargetStackID::ScalableVector)
2795       continue;
2796     if (MaxCSFrameIndex >= I && I >= MinCSFrameIndex)
2797       continue;
2798     if (MFI.isDeadObjectIndex(I))
2799       continue;
2800 
2801     ObjectsToAllocate.push_back(I);
2802   }
2803 
2804   // Allocate all SVE locals and spills
2805   for (unsigned FI : ObjectsToAllocate) {
2806     Align Alignment = MFI.getObjectAlign(FI);
2807     // FIXME: Given that the length of SVE vectors is not necessarily a power of
2808     // two, we'd need to align every object dynamically at runtime if the
2809     // alignment is larger than 16. This is not yet supported.
2810     if (Alignment > Align(16))
2811       report_fatal_error(
2812           "Alignment of scalable vectors > 16 bytes is not yet supported");
2813 
2814     Offset = alignTo(Offset + MFI.getObjectSize(FI), Alignment);
2815     if (AssignOffsets)
2816       Assign(FI, -Offset);
2817   }
2818 
2819   return Offset;
2820 }
2821 
estimateSVEStackObjectOffsets(MachineFrameInfo & MFI) const2822 int64_t AArch64FrameLowering::estimateSVEStackObjectOffsets(
2823     MachineFrameInfo &MFI) const {
2824   int MinCSFrameIndex, MaxCSFrameIndex;
2825   return determineSVEStackObjectOffsets(MFI, MinCSFrameIndex, MaxCSFrameIndex, false);
2826 }
2827 
assignSVEStackObjectOffsets(MachineFrameInfo & MFI,int & MinCSFrameIndex,int & MaxCSFrameIndex) const2828 int64_t AArch64FrameLowering::assignSVEStackObjectOffsets(
2829     MachineFrameInfo &MFI, int &MinCSFrameIndex, int &MaxCSFrameIndex) const {
2830   return determineSVEStackObjectOffsets(MFI, MinCSFrameIndex, MaxCSFrameIndex,
2831                                         true);
2832 }
2833 
processFunctionBeforeFrameFinalized(MachineFunction & MF,RegScavenger * RS) const2834 void AArch64FrameLowering::processFunctionBeforeFrameFinalized(
2835     MachineFunction &MF, RegScavenger *RS) const {
2836   MachineFrameInfo &MFI = MF.getFrameInfo();
2837 
2838   assert(getStackGrowthDirection() == TargetFrameLowering::StackGrowsDown &&
2839          "Upwards growing stack unsupported");
2840 
2841   int MinCSFrameIndex, MaxCSFrameIndex;
2842   int64_t SVEStackSize =
2843       assignSVEStackObjectOffsets(MFI, MinCSFrameIndex, MaxCSFrameIndex);
2844 
2845   AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
2846   AFI->setStackSizeSVE(alignTo(SVEStackSize, 16U));
2847   AFI->setMinMaxSVECSFrameIndex(MinCSFrameIndex, MaxCSFrameIndex);
2848 
2849   // If this function isn't doing Win64-style C++ EH, we don't need to do
2850   // anything.
2851   if (!MF.hasEHFunclets())
2852     return;
2853   const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
2854   WinEHFuncInfo &EHInfo = *MF.getWinEHFuncInfo();
2855 
2856   MachineBasicBlock &MBB = MF.front();
2857   auto MBBI = MBB.begin();
2858   while (MBBI != MBB.end() && MBBI->getFlag(MachineInstr::FrameSetup))
2859     ++MBBI;
2860 
2861   // Create an UnwindHelp object.
2862   // The UnwindHelp object is allocated at the start of the fixed object area
2863   int64_t FixedObject =
2864       getFixedObjectSize(MF, AFI, /*IsWin64*/ true, /*IsFunclet*/ false);
2865   int UnwindHelpFI = MFI.CreateFixedObject(/*Size*/ 8,
2866                                            /*SPOffset*/ -FixedObject,
2867                                            /*IsImmutable=*/false);
2868   EHInfo.UnwindHelpFrameIdx = UnwindHelpFI;
2869 
2870   // We need to store -2 into the UnwindHelp object at the start of the
2871   // function.
2872   DebugLoc DL;
2873   RS->enterBasicBlockEnd(MBB);
2874   RS->backward(std::prev(MBBI));
2875   unsigned DstReg = RS->FindUnusedReg(&AArch64::GPR64commonRegClass);
2876   assert(DstReg && "There must be a free register after frame setup");
2877   BuildMI(MBB, MBBI, DL, TII.get(AArch64::MOVi64imm), DstReg).addImm(-2);
2878   BuildMI(MBB, MBBI, DL, TII.get(AArch64::STURXi))
2879       .addReg(DstReg, getKillRegState(true))
2880       .addFrameIndex(UnwindHelpFI)
2881       .addImm(0);
2882 }
2883 
2884 namespace {
2885 struct TagStoreInstr {
2886   MachineInstr *MI;
2887   int64_t Offset, Size;
TagStoreInstr__anon36e7b1910711::TagStoreInstr2888   explicit TagStoreInstr(MachineInstr *MI, int64_t Offset, int64_t Size)
2889       : MI(MI), Offset(Offset), Size(Size) {}
2890 };
2891 
2892 class TagStoreEdit {
2893   MachineFunction *MF;
2894   MachineBasicBlock *MBB;
2895   MachineRegisterInfo *MRI;
2896   // Tag store instructions that are being replaced.
2897   SmallVector<TagStoreInstr, 8> TagStores;
2898   // Combined memref arguments of the above instructions.
2899   SmallVector<MachineMemOperand *, 8> CombinedMemRefs;
2900 
2901   // Replace allocation tags in [FrameReg + FrameRegOffset, FrameReg +
2902   // FrameRegOffset + Size) with the address tag of SP.
2903   Register FrameReg;
2904   StackOffset FrameRegOffset;
2905   int64_t Size;
2906   // If not None, move FrameReg to (FrameReg + FrameRegUpdate) at the end.
2907   Optional<int64_t> FrameRegUpdate;
2908   // MIFlags for any FrameReg updating instructions.
2909   unsigned FrameRegUpdateFlags;
2910 
2911   // Use zeroing instruction variants.
2912   bool ZeroData;
2913   DebugLoc DL;
2914 
2915   void emitUnrolled(MachineBasicBlock::iterator InsertI);
2916   void emitLoop(MachineBasicBlock::iterator InsertI);
2917 
2918 public:
TagStoreEdit(MachineBasicBlock * MBB,bool ZeroData)2919   TagStoreEdit(MachineBasicBlock *MBB, bool ZeroData)
2920       : MBB(MBB), ZeroData(ZeroData) {
2921     MF = MBB->getParent();
2922     MRI = &MF->getRegInfo();
2923   }
2924   // Add an instruction to be replaced. Instructions must be added in the
2925   // ascending order of Offset, and have to be adjacent.
addInstruction(TagStoreInstr I)2926   void addInstruction(TagStoreInstr I) {
2927     assert((TagStores.empty() ||
2928             TagStores.back().Offset + TagStores.back().Size == I.Offset) &&
2929            "Non-adjacent tag store instructions.");
2930     TagStores.push_back(I);
2931   }
clear()2932   void clear() { TagStores.clear(); }
2933   // Emit equivalent code at the given location, and erase the current set of
2934   // instructions. May skip if the replacement is not profitable. May invalidate
2935   // the input iterator and replace it with a valid one.
2936   void emitCode(MachineBasicBlock::iterator &InsertI,
2937                 const AArch64FrameLowering *TFI, bool IsLast);
2938 };
2939 
emitUnrolled(MachineBasicBlock::iterator InsertI)2940 void TagStoreEdit::emitUnrolled(MachineBasicBlock::iterator InsertI) {
2941   const AArch64InstrInfo *TII =
2942       MF->getSubtarget<AArch64Subtarget>().getInstrInfo();
2943 
2944   const int64_t kMinOffset = -256 * 16;
2945   const int64_t kMaxOffset = 255 * 16;
2946 
2947   Register BaseReg = FrameReg;
2948   int64_t BaseRegOffsetBytes = FrameRegOffset.getFixed();
2949   if (BaseRegOffsetBytes < kMinOffset ||
2950       BaseRegOffsetBytes + (Size - Size % 32) > kMaxOffset) {
2951     Register ScratchReg = MRI->createVirtualRegister(&AArch64::GPR64RegClass);
2952     emitFrameOffset(*MBB, InsertI, DL, ScratchReg, BaseReg,
2953                     StackOffset::getFixed(BaseRegOffsetBytes), TII);
2954     BaseReg = ScratchReg;
2955     BaseRegOffsetBytes = 0;
2956   }
2957 
2958   MachineInstr *LastI = nullptr;
2959   while (Size) {
2960     int64_t InstrSize = (Size > 16) ? 32 : 16;
2961     unsigned Opcode =
2962         InstrSize == 16
2963             ? (ZeroData ? AArch64::STZGOffset : AArch64::STGOffset)
2964             : (ZeroData ? AArch64::STZ2GOffset : AArch64::ST2GOffset);
2965     MachineInstr *I = BuildMI(*MBB, InsertI, DL, TII->get(Opcode))
2966                           .addReg(AArch64::SP)
2967                           .addReg(BaseReg)
2968                           .addImm(BaseRegOffsetBytes / 16)
2969                           .setMemRefs(CombinedMemRefs);
2970     // A store to [BaseReg, #0] should go last for an opportunity to fold the
2971     // final SP adjustment in the epilogue.
2972     if (BaseRegOffsetBytes == 0)
2973       LastI = I;
2974     BaseRegOffsetBytes += InstrSize;
2975     Size -= InstrSize;
2976   }
2977 
2978   if (LastI)
2979     MBB->splice(InsertI, MBB, LastI);
2980 }
2981 
emitLoop(MachineBasicBlock::iterator InsertI)2982 void TagStoreEdit::emitLoop(MachineBasicBlock::iterator InsertI) {
2983   const AArch64InstrInfo *TII =
2984       MF->getSubtarget<AArch64Subtarget>().getInstrInfo();
2985 
2986   Register BaseReg = FrameRegUpdate
2987                          ? FrameReg
2988                          : MRI->createVirtualRegister(&AArch64::GPR64RegClass);
2989   Register SizeReg = MRI->createVirtualRegister(&AArch64::GPR64RegClass);
2990 
2991   emitFrameOffset(*MBB, InsertI, DL, BaseReg, FrameReg, FrameRegOffset, TII);
2992 
2993   int64_t LoopSize = Size;
2994   // If the loop size is not a multiple of 32, split off one 16-byte store at
2995   // the end to fold BaseReg update into.
2996   if (FrameRegUpdate && *FrameRegUpdate)
2997     LoopSize -= LoopSize % 32;
2998   MachineInstr *LoopI = BuildMI(*MBB, InsertI, DL,
2999                                 TII->get(ZeroData ? AArch64::STZGloop_wback
3000                                                   : AArch64::STGloop_wback))
3001                             .addDef(SizeReg)
3002                             .addDef(BaseReg)
3003                             .addImm(LoopSize)
3004                             .addReg(BaseReg)
3005                             .setMemRefs(CombinedMemRefs);
3006   if (FrameRegUpdate)
3007     LoopI->setFlags(FrameRegUpdateFlags);
3008 
3009   int64_t ExtraBaseRegUpdate =
3010       FrameRegUpdate ? (*FrameRegUpdate - FrameRegOffset.getFixed() - Size) : 0;
3011   if (LoopSize < Size) {
3012     assert(FrameRegUpdate);
3013     assert(Size - LoopSize == 16);
3014     // Tag 16 more bytes at BaseReg and update BaseReg.
3015     BuildMI(*MBB, InsertI, DL,
3016             TII->get(ZeroData ? AArch64::STZGPostIndex : AArch64::STGPostIndex))
3017         .addDef(BaseReg)
3018         .addReg(BaseReg)
3019         .addReg(BaseReg)
3020         .addImm(1 + ExtraBaseRegUpdate / 16)
3021         .setMemRefs(CombinedMemRefs)
3022         .setMIFlags(FrameRegUpdateFlags);
3023   } else if (ExtraBaseRegUpdate) {
3024     // Update BaseReg.
3025     BuildMI(
3026         *MBB, InsertI, DL,
3027         TII->get(ExtraBaseRegUpdate > 0 ? AArch64::ADDXri : AArch64::SUBXri))
3028         .addDef(BaseReg)
3029         .addReg(BaseReg)
3030         .addImm(std::abs(ExtraBaseRegUpdate))
3031         .addImm(0)
3032         .setMIFlags(FrameRegUpdateFlags);
3033   }
3034 }
3035 
3036 // Check if *II is a register update that can be merged into STGloop that ends
3037 // at (Reg + Size). RemainingOffset is the required adjustment to Reg after the
3038 // end of the loop.
canMergeRegUpdate(MachineBasicBlock::iterator II,unsigned Reg,int64_t Size,int64_t * TotalOffset)3039 bool canMergeRegUpdate(MachineBasicBlock::iterator II, unsigned Reg,
3040                        int64_t Size, int64_t *TotalOffset) {
3041   MachineInstr &MI = *II;
3042   if ((MI.getOpcode() == AArch64::ADDXri ||
3043        MI.getOpcode() == AArch64::SUBXri) &&
3044       MI.getOperand(0).getReg() == Reg && MI.getOperand(1).getReg() == Reg) {
3045     unsigned Shift = AArch64_AM::getShiftValue(MI.getOperand(3).getImm());
3046     int64_t Offset = MI.getOperand(2).getImm() << Shift;
3047     if (MI.getOpcode() == AArch64::SUBXri)
3048       Offset = -Offset;
3049     int64_t AbsPostOffset = std::abs(Offset - Size);
3050     const int64_t kMaxOffset =
3051         0xFFF; // Max encoding for unshifted ADDXri / SUBXri
3052     if (AbsPostOffset <= kMaxOffset && AbsPostOffset % 16 == 0) {
3053       *TotalOffset = Offset;
3054       return true;
3055     }
3056   }
3057   return false;
3058 }
3059 
mergeMemRefs(const SmallVectorImpl<TagStoreInstr> & TSE,SmallVectorImpl<MachineMemOperand * > & MemRefs)3060 void mergeMemRefs(const SmallVectorImpl<TagStoreInstr> &TSE,
3061                   SmallVectorImpl<MachineMemOperand *> &MemRefs) {
3062   MemRefs.clear();
3063   for (auto &TS : TSE) {
3064     MachineInstr *MI = TS.MI;
3065     // An instruction without memory operands may access anything. Be
3066     // conservative and return an empty list.
3067     if (MI->memoperands_empty()) {
3068       MemRefs.clear();
3069       return;
3070     }
3071     MemRefs.append(MI->memoperands_begin(), MI->memoperands_end());
3072   }
3073 }
3074 
emitCode(MachineBasicBlock::iterator & InsertI,const AArch64FrameLowering * TFI,bool IsLast)3075 void TagStoreEdit::emitCode(MachineBasicBlock::iterator &InsertI,
3076                             const AArch64FrameLowering *TFI, bool IsLast) {
3077   if (TagStores.empty())
3078     return;
3079   TagStoreInstr &FirstTagStore = TagStores[0];
3080   TagStoreInstr &LastTagStore = TagStores[TagStores.size() - 1];
3081   Size = LastTagStore.Offset - FirstTagStore.Offset + LastTagStore.Size;
3082   DL = TagStores[0].MI->getDebugLoc();
3083 
3084   Register Reg;
3085   FrameRegOffset = TFI->resolveFrameOffsetReference(
3086       *MF, FirstTagStore.Offset, false /*isFixed*/, false /*isSVE*/, Reg,
3087       /*PreferFP=*/false, /*ForSimm=*/true);
3088   FrameReg = Reg;
3089   FrameRegUpdate = None;
3090 
3091   mergeMemRefs(TagStores, CombinedMemRefs);
3092 
3093   LLVM_DEBUG(dbgs() << "Replacing adjacent STG instructions:\n";
3094              for (const auto &Instr
3095                   : TagStores) { dbgs() << "  " << *Instr.MI; });
3096 
3097   // Size threshold where a loop becomes shorter than a linear sequence of
3098   // tagging instructions.
3099   const int kSetTagLoopThreshold = 176;
3100   if (Size < kSetTagLoopThreshold) {
3101     if (TagStores.size() < 2)
3102       return;
3103     emitUnrolled(InsertI);
3104   } else {
3105     MachineInstr *UpdateInstr = nullptr;
3106     int64_t TotalOffset;
3107     if (IsLast) {
3108       // See if we can merge base register update into the STGloop.
3109       // This is done in AArch64LoadStoreOptimizer for "normal" stores,
3110       // but STGloop is way too unusual for that, and also it only
3111       // realistically happens in function epilogue. Also, STGloop is expanded
3112       // before that pass.
3113       if (InsertI != MBB->end() &&
3114           canMergeRegUpdate(InsertI, FrameReg, FrameRegOffset.getFixed() + Size,
3115                             &TotalOffset)) {
3116         UpdateInstr = &*InsertI++;
3117         LLVM_DEBUG(dbgs() << "Folding SP update into loop:\n  "
3118                           << *UpdateInstr);
3119       }
3120     }
3121 
3122     if (!UpdateInstr && TagStores.size() < 2)
3123       return;
3124 
3125     if (UpdateInstr) {
3126       FrameRegUpdate = TotalOffset;
3127       FrameRegUpdateFlags = UpdateInstr->getFlags();
3128     }
3129     emitLoop(InsertI);
3130     if (UpdateInstr)
3131       UpdateInstr->eraseFromParent();
3132   }
3133 
3134   for (auto &TS : TagStores)
3135     TS.MI->eraseFromParent();
3136 }
3137 
isMergeableStackTaggingInstruction(MachineInstr & MI,int64_t & Offset,int64_t & Size,bool & ZeroData)3138 bool isMergeableStackTaggingInstruction(MachineInstr &MI, int64_t &Offset,
3139                                         int64_t &Size, bool &ZeroData) {
3140   MachineFunction &MF = *MI.getParent()->getParent();
3141   const MachineFrameInfo &MFI = MF.getFrameInfo();
3142 
3143   unsigned Opcode = MI.getOpcode();
3144   ZeroData = (Opcode == AArch64::STZGloop || Opcode == AArch64::STZGOffset ||
3145               Opcode == AArch64::STZ2GOffset);
3146 
3147   if (Opcode == AArch64::STGloop || Opcode == AArch64::STZGloop) {
3148     if (!MI.getOperand(0).isDead() || !MI.getOperand(1).isDead())
3149       return false;
3150     if (!MI.getOperand(2).isImm() || !MI.getOperand(3).isFI())
3151       return false;
3152     Offset = MFI.getObjectOffset(MI.getOperand(3).getIndex());
3153     Size = MI.getOperand(2).getImm();
3154     return true;
3155   }
3156 
3157   if (Opcode == AArch64::STGOffset || Opcode == AArch64::STZGOffset)
3158     Size = 16;
3159   else if (Opcode == AArch64::ST2GOffset || Opcode == AArch64::STZ2GOffset)
3160     Size = 32;
3161   else
3162     return false;
3163 
3164   if (MI.getOperand(0).getReg() != AArch64::SP || !MI.getOperand(1).isFI())
3165     return false;
3166 
3167   Offset = MFI.getObjectOffset(MI.getOperand(1).getIndex()) +
3168            16 * MI.getOperand(2).getImm();
3169   return true;
3170 }
3171 
3172 // Detect a run of memory tagging instructions for adjacent stack frame slots,
3173 // and replace them with a shorter instruction sequence:
3174 // * replace STG + STG with ST2G
3175 // * replace STGloop + STGloop with STGloop
3176 // This code needs to run when stack slot offsets are already known, but before
3177 // FrameIndex operands in STG instructions are eliminated.
tryMergeAdjacentSTG(MachineBasicBlock::iterator II,const AArch64FrameLowering * TFI,RegScavenger * RS)3178 MachineBasicBlock::iterator tryMergeAdjacentSTG(MachineBasicBlock::iterator II,
3179                                                 const AArch64FrameLowering *TFI,
3180                                                 RegScavenger *RS) {
3181   bool FirstZeroData;
3182   int64_t Size, Offset;
3183   MachineInstr &MI = *II;
3184   MachineBasicBlock *MBB = MI.getParent();
3185   MachineBasicBlock::iterator NextI = ++II;
3186   if (&MI == &MBB->instr_back())
3187     return II;
3188   if (!isMergeableStackTaggingInstruction(MI, Offset, Size, FirstZeroData))
3189     return II;
3190 
3191   SmallVector<TagStoreInstr, 4> Instrs;
3192   Instrs.emplace_back(&MI, Offset, Size);
3193 
3194   constexpr int kScanLimit = 10;
3195   int Count = 0;
3196   for (MachineBasicBlock::iterator E = MBB->end();
3197        NextI != E && Count < kScanLimit; ++NextI) {
3198     MachineInstr &MI = *NextI;
3199     bool ZeroData;
3200     int64_t Size, Offset;
3201     // Collect instructions that update memory tags with a FrameIndex operand
3202     // and (when applicable) constant size, and whose output registers are dead
3203     // (the latter is almost always the case in practice). Since these
3204     // instructions effectively have no inputs or outputs, we are free to skip
3205     // any non-aliasing instructions in between without tracking used registers.
3206     if (isMergeableStackTaggingInstruction(MI, Offset, Size, ZeroData)) {
3207       if (ZeroData != FirstZeroData)
3208         break;
3209       Instrs.emplace_back(&MI, Offset, Size);
3210       continue;
3211     }
3212 
3213     // Only count non-transient, non-tagging instructions toward the scan
3214     // limit.
3215     if (!MI.isTransient())
3216       ++Count;
3217 
3218     // Just in case, stop before the epilogue code starts.
3219     if (MI.getFlag(MachineInstr::FrameSetup) ||
3220         MI.getFlag(MachineInstr::FrameDestroy))
3221       break;
3222 
3223     // Reject anything that may alias the collected instructions.
3224     if (MI.mayLoadOrStore() || MI.hasUnmodeledSideEffects())
3225       break;
3226   }
3227 
3228   // New code will be inserted after the last tagging instruction we've found.
3229   MachineBasicBlock::iterator InsertI = Instrs.back().MI;
3230   InsertI++;
3231 
3232   llvm::stable_sort(Instrs,
3233                     [](const TagStoreInstr &Left, const TagStoreInstr &Right) {
3234                       return Left.Offset < Right.Offset;
3235                     });
3236 
3237   // Make sure that we don't have any overlapping stores.
3238   int64_t CurOffset = Instrs[0].Offset;
3239   for (auto &Instr : Instrs) {
3240     if (CurOffset > Instr.Offset)
3241       return NextI;
3242     CurOffset = Instr.Offset + Instr.Size;
3243   }
3244 
3245   // Find contiguous runs of tagged memory and emit shorter instruction
3246   // sequencies for them when possible.
3247   TagStoreEdit TSE(MBB, FirstZeroData);
3248   Optional<int64_t> EndOffset;
3249   for (auto &Instr : Instrs) {
3250     if (EndOffset && *EndOffset != Instr.Offset) {
3251       // Found a gap.
3252       TSE.emitCode(InsertI, TFI, /*IsLast = */ false);
3253       TSE.clear();
3254     }
3255 
3256     TSE.addInstruction(Instr);
3257     EndOffset = Instr.Offset + Instr.Size;
3258   }
3259 
3260   TSE.emitCode(InsertI, TFI, /*IsLast = */ true);
3261 
3262   return InsertI;
3263 }
3264 } // namespace
3265 
processFunctionBeforeFrameIndicesReplaced(MachineFunction & MF,RegScavenger * RS=nullptr) const3266 void AArch64FrameLowering::processFunctionBeforeFrameIndicesReplaced(
3267     MachineFunction &MF, RegScavenger *RS = nullptr) const {
3268   if (StackTaggingMergeSetTag)
3269     for (auto &BB : MF)
3270       for (MachineBasicBlock::iterator II = BB.begin(); II != BB.end();)
3271         II = tryMergeAdjacentSTG(II, this, RS);
3272 }
3273 
3274 /// For Win64 AArch64 EH, the offset to the Unwind object is from the SP
3275 /// before the update.  This is easily retrieved as it is exactly the offset
3276 /// that is set in processFunctionBeforeFrameFinalized.
getFrameIndexReferencePreferSP(const MachineFunction & MF,int FI,Register & FrameReg,bool IgnoreSPUpdates) const3277 StackOffset AArch64FrameLowering::getFrameIndexReferencePreferSP(
3278     const MachineFunction &MF, int FI, Register &FrameReg,
3279     bool IgnoreSPUpdates) const {
3280   const MachineFrameInfo &MFI = MF.getFrameInfo();
3281   if (IgnoreSPUpdates) {
3282     LLVM_DEBUG(dbgs() << "Offset from the SP for " << FI << " is "
3283                       << MFI.getObjectOffset(FI) << "\n");
3284     FrameReg = AArch64::SP;
3285     return StackOffset::getFixed(MFI.getObjectOffset(FI));
3286   }
3287 
3288   return getFrameIndexReference(MF, FI, FrameReg);
3289 }
3290 
3291 /// The parent frame offset (aka dispFrame) is only used on X86_64 to retrieve
3292 /// the parent's frame pointer
getWinEHParentFrameOffset(const MachineFunction & MF) const3293 unsigned AArch64FrameLowering::getWinEHParentFrameOffset(
3294     const MachineFunction &MF) const {
3295   return 0;
3296 }
3297 
3298 /// Funclets only need to account for space for the callee saved registers,
3299 /// as the locals are accounted for in the parent's stack frame.
getWinEHFuncletFrameSize(const MachineFunction & MF) const3300 unsigned AArch64FrameLowering::getWinEHFuncletFrameSize(
3301     const MachineFunction &MF) const {
3302   // This is the size of the pushed CSRs.
3303   unsigned CSSize =
3304       MF.getInfo<AArch64FunctionInfo>()->getCalleeSavedStackSize();
3305   // This is the amount of stack a funclet needs to allocate.
3306   return alignTo(CSSize + MF.getFrameInfo().getMaxCallFrameSize(),
3307                  getStackAlign());
3308 }
3309 
3310 namespace {
3311 struct FrameObject {
3312   bool IsValid = false;
3313   // Index of the object in MFI.
3314   int ObjectIndex = 0;
3315   // Group ID this object belongs to.
3316   int GroupIndex = -1;
3317   // This object should be placed first (closest to SP).
3318   bool ObjectFirst = false;
3319   // This object's group (which always contains the object with
3320   // ObjectFirst==true) should be placed first.
3321   bool GroupFirst = false;
3322 };
3323 
3324 class GroupBuilder {
3325   SmallVector<int, 8> CurrentMembers;
3326   int NextGroupIndex = 0;
3327   std::vector<FrameObject> &Objects;
3328 
3329 public:
GroupBuilder(std::vector<FrameObject> & Objects)3330   GroupBuilder(std::vector<FrameObject> &Objects) : Objects(Objects) {}
AddMember(int Index)3331   void AddMember(int Index) { CurrentMembers.push_back(Index); }
EndCurrentGroup()3332   void EndCurrentGroup() {
3333     if (CurrentMembers.size() > 1) {
3334       // Create a new group with the current member list. This might remove them
3335       // from their pre-existing groups. That's OK, dealing with overlapping
3336       // groups is too hard and unlikely to make a difference.
3337       LLVM_DEBUG(dbgs() << "group:");
3338       for (int Index : CurrentMembers) {
3339         Objects[Index].GroupIndex = NextGroupIndex;
3340         LLVM_DEBUG(dbgs() << " " << Index);
3341       }
3342       LLVM_DEBUG(dbgs() << "\n");
3343       NextGroupIndex++;
3344     }
3345     CurrentMembers.clear();
3346   }
3347 };
3348 
FrameObjectCompare(const FrameObject & A,const FrameObject & B)3349 bool FrameObjectCompare(const FrameObject &A, const FrameObject &B) {
3350   // Objects at a lower index are closer to FP; objects at a higher index are
3351   // closer to SP.
3352   //
3353   // For consistency in our comparison, all invalid objects are placed
3354   // at the end. This also allows us to stop walking when we hit the
3355   // first invalid item after it's all sorted.
3356   //
3357   // The "first" object goes first (closest to SP), followed by the members of
3358   // the "first" group.
3359   //
3360   // The rest are sorted by the group index to keep the groups together.
3361   // Higher numbered groups are more likely to be around longer (i.e. untagged
3362   // in the function epilogue and not at some earlier point). Place them closer
3363   // to SP.
3364   //
3365   // If all else equal, sort by the object index to keep the objects in the
3366   // original order.
3367   return std::make_tuple(!A.IsValid, A.ObjectFirst, A.GroupFirst, A.GroupIndex,
3368                          A.ObjectIndex) <
3369          std::make_tuple(!B.IsValid, B.ObjectFirst, B.GroupFirst, B.GroupIndex,
3370                          B.ObjectIndex);
3371 }
3372 } // namespace
3373 
orderFrameObjects(const MachineFunction & MF,SmallVectorImpl<int> & ObjectsToAllocate) const3374 void AArch64FrameLowering::orderFrameObjects(
3375     const MachineFunction &MF, SmallVectorImpl<int> &ObjectsToAllocate) const {
3376   if (!OrderFrameObjects || ObjectsToAllocate.empty())
3377     return;
3378 
3379   const MachineFrameInfo &MFI = MF.getFrameInfo();
3380   std::vector<FrameObject> FrameObjects(MFI.getObjectIndexEnd());
3381   for (auto &Obj : ObjectsToAllocate) {
3382     FrameObjects[Obj].IsValid = true;
3383     FrameObjects[Obj].ObjectIndex = Obj;
3384   }
3385 
3386   // Identify stack slots that are tagged at the same time.
3387   GroupBuilder GB(FrameObjects);
3388   for (auto &MBB : MF) {
3389     for (auto &MI : MBB) {
3390       if (MI.isDebugInstr())
3391         continue;
3392       int OpIndex;
3393       switch (MI.getOpcode()) {
3394       case AArch64::STGloop:
3395       case AArch64::STZGloop:
3396         OpIndex = 3;
3397         break;
3398       case AArch64::STGOffset:
3399       case AArch64::STZGOffset:
3400       case AArch64::ST2GOffset:
3401       case AArch64::STZ2GOffset:
3402         OpIndex = 1;
3403         break;
3404       default:
3405         OpIndex = -1;
3406       }
3407 
3408       int TaggedFI = -1;
3409       if (OpIndex >= 0) {
3410         const MachineOperand &MO = MI.getOperand(OpIndex);
3411         if (MO.isFI()) {
3412           int FI = MO.getIndex();
3413           if (FI >= 0 && FI < MFI.getObjectIndexEnd() &&
3414               FrameObjects[FI].IsValid)
3415             TaggedFI = FI;
3416         }
3417       }
3418 
3419       // If this is a stack tagging instruction for a slot that is not part of a
3420       // group yet, either start a new group or add it to the current one.
3421       if (TaggedFI >= 0)
3422         GB.AddMember(TaggedFI);
3423       else
3424         GB.EndCurrentGroup();
3425     }
3426     // Groups should never span multiple basic blocks.
3427     GB.EndCurrentGroup();
3428   }
3429 
3430   // If the function's tagged base pointer is pinned to a stack slot, we want to
3431   // put that slot first when possible. This will likely place it at SP + 0,
3432   // and save one instruction when generating the base pointer because IRG does
3433   // not allow an immediate offset.
3434   const AArch64FunctionInfo &AFI = *MF.getInfo<AArch64FunctionInfo>();
3435   Optional<int> TBPI = AFI.getTaggedBasePointerIndex();
3436   if (TBPI) {
3437     FrameObjects[*TBPI].ObjectFirst = true;
3438     FrameObjects[*TBPI].GroupFirst = true;
3439     int FirstGroupIndex = FrameObjects[*TBPI].GroupIndex;
3440     if (FirstGroupIndex >= 0)
3441       for (FrameObject &Object : FrameObjects)
3442         if (Object.GroupIndex == FirstGroupIndex)
3443           Object.GroupFirst = true;
3444   }
3445 
3446   llvm::stable_sort(FrameObjects, FrameObjectCompare);
3447 
3448   int i = 0;
3449   for (auto &Obj : FrameObjects) {
3450     // All invalid items are sorted at the end, so it's safe to stop.
3451     if (!Obj.IsValid)
3452       break;
3453     ObjectsToAllocate[i++] = Obj.ObjectIndex;
3454   }
3455 
3456   LLVM_DEBUG(dbgs() << "Final frame order:\n"; for (auto &Obj
3457                                                     : FrameObjects) {
3458     if (!Obj.IsValid)
3459       break;
3460     dbgs() << "  " << Obj.ObjectIndex << ": group " << Obj.GroupIndex;
3461     if (Obj.ObjectFirst)
3462       dbgs() << ", first";
3463     if (Obj.GroupFirst)
3464       dbgs() << ", group-first";
3465     dbgs() << "\n";
3466   });
3467 }
3468