//===- AArch64FrameLowering.cpp - AArch64 Frame Lowering -------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file contains the AArch64 implementation of TargetFrameLowering class.
//
// On AArch64, stack frames are structured as follows:
//
// The stack grows downward.
//
// All of the individual frame areas on the frame below are optional, i.e. it's
// possible to create a function so that the particular area isn't present
// in the frame.
//
// At function entry, the "frame" looks as follows:
//
// |                                   | Higher address
// |-----------------------------------|
// |                                   |
// | arguments passed on the stack     |
// |                                   |
// |-----------------------------------| <- sp
// |                                   | Lower address
//
//
// After the prologue has run, the frame has the following general structure.
// Note that this doesn't depict the case where a red-zone is used. Also,
// technically the last frame area (VLAs) doesn't get created until the main
// function body, after the prologue has run. However, it's depicted here
// for completeness.
//
// |                                   | Higher address
// |-----------------------------------|
// |                                   |
// | arguments passed on the stack     |
// |                                   |
// |-----------------------------------|
// |                                   |
// | (Win64 only) varargs from reg     |
// |                                   |
// |-----------------------------------|
// |                                   |
// | callee-saved gpr registers        | <--.
// |                                   |    | On Darwin platforms these
// |- - - - - - - - - - - - - - - - - -|    | callee saves are swapped,
// | prev_lr                           |    | (frame record first)
// | prev_fp                           | <--'
// | async context if needed           |
// | (a.k.a. "frame record")           |
// |-----------------------------------| <- fp(=x29)
// |                                   |
// | callee-saved fp/simd/SVE regs     |
// |                                   |
// |-----------------------------------|
// |                                   |
// | SVE stack objects                 |
// |                                   |
// |-----------------------------------|
// |.empty.space.to.make.part.below....|
// |.aligned.in.case.it.needs.more.than| (size of this area is unknown at
// |.the.standard.16-byte.alignment....|  compile time; if present)
// |-----------------------------------|
// |                                   |
// | local variables of fixed size     |
// | including spill slots             |
// |-----------------------------------| <- bp(not defined by ABI,
// |.variable-sized.local.variables....|       LLVM chooses X19)
// |.(VLAs)............................| (size of this area is unknown at
// |...................................|  compile time)
// |-----------------------------------| <- sp
// |                                   | Lower address
//
//
// To access the data in a frame, a constant offset from one of the pointers
// (fp, bp, sp) must be computable at compile time. The size of the areas
// with a dotted background cannot be computed at compile time if they are
// present, so all three of fp, bp and sp may need to be set up in order to
// access all contents in the frame areas, assuming all of the frame areas
// are non-empty.
//
// For most functions, some of the frame areas are empty. For those functions,
// it may not be necessary to set up fp or bp:
// * A base pointer is definitely needed when there are both VLAs and local
//   variables with more-than-default alignment requirements.
// * A frame pointer is definitely needed when there are local variables with
//   more-than-default alignment requirements.
//
// For Darwin platforms the frame-record (fp, lr) is stored at the top of the
// callee-saved area, since the unwind encoding does not allow for encoding
// this dynamically and existing tools depend on this layout. For other
// platforms, the frame-record is stored at the bottom of the (gpr) callee-saved
// area to allow SVE stack objects (allocated directly below the callee-saves,
// if available) to be accessed directly from the framepointer.
// The SVE spill/fill instructions have VL-scaled addressing modes such
// as:
//    ldr z8, [fp, #-7 mul vl]
// For SVE the size of the vector length (VL) is not known at compile-time, so
// '#-7 mul vl' is an offset that can only be evaluated at runtime. With this
// layout, we don't need to add an unscaled offset to the framepointer before
// accessing the SVE object in the frame.
//
// In some cases when a base pointer is not strictly needed, it is generated
// anyway when offsets from the frame pointer to access local variables become
// so large that the offset can't be encoded in the immediate fields of loads
// or stores.
//
// Outgoing function arguments must be at the bottom of the stack frame when
// calling another function. If we do not have variable-sized stack objects, we
// can allocate a "reserved call frame" area at the bottom of the local
// variable area, large enough for all outgoing calls. If we do have VLAs, then
// the stack pointer must be decremented and incremented around each call to
// make space for the arguments below the VLAs.
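//
// For example (an illustrative sketch, not generated output), a call made
// below a VLA might make space for one stack argument and release it again:
//
//    sub sp, sp, #16
//    str x8, [sp]
//    bl  callee
//    add sp, sp, #16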
//
// FIXME: also explain the redzone concept.
//
//===----------------------------------------------------------------------===//

#include "AArch64FrameLowering.h"
#include "AArch64InstrInfo.h"
#include "AArch64MachineFunctionInfo.h"
#include "AArch64RegisterInfo.h"
#include "AArch64Subtarget.h"
#include "AArch64TargetMachine.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/LivePhysRegs.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RegisterScavenging.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/CodeGen/WinEHFuncInfo.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/Function.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCDwarf.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/LEB128.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
#include <cassert>
#include <cstdint>
#include <iterator>
#include <vector>

using namespace llvm;

#define DEBUG_TYPE "frame-info"

static cl::opt<bool> EnableRedZone("aarch64-redzone",
                                   cl::desc("enable use of redzone on AArch64"),
                                   cl::init(false), cl::Hidden);

static cl::opt<bool>
    ReverseCSRRestoreSeq("reverse-csr-restore-seq",
                         cl::desc("reverse the CSR restore sequence"),
                         cl::init(false), cl::Hidden);

static cl::opt<bool> StackTaggingMergeSetTag(
    "stack-tagging-merge-settag",
    cl::desc("merge settag instruction in function epilog"), cl::init(true),
    cl::Hidden);

static cl::opt<bool> OrderFrameObjects("aarch64-order-frame-objects",
                                       cl::desc("sort stack allocations"),
                                       cl::init(true), cl::Hidden);

cl::opt<bool> EnableHomogeneousPrologEpilog(
    "homogeneous-prolog-epilog", cl::init(false), cl::ZeroOrMore, cl::Hidden,
    cl::desc("Emit homogeneous prologue and epilogue for the size "
             "optimization (default = off)"));

STATISTIC(NumRedZoneFunctions, "Number of functions using red zone");

/// Returns how much of the incoming argument stack area (in bytes) we should
/// clean up in an epilogue. For the C calling convention this will be 0, for
/// guaranteed tail call conventions it can be positive (a normal return or a
/// tail call to a function that uses less stack space for arguments) or
/// negative (for a tail call to a function that needs more stack space than us
/// for arguments).
static int64_t getArgumentStackToRestore(MachineFunction &MF,
                                         MachineBasicBlock &MBB) {
  MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
  bool IsTailCallReturn = false;
  if (MBB.end() != MBBI) {
    unsigned RetOpcode = MBBI->getOpcode();
    IsTailCallReturn = RetOpcode == AArch64::TCRETURNdi ||
                       RetOpcode == AArch64::TCRETURNri ||
                       RetOpcode == AArch64::TCRETURNriBTI;
  }
  AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();

  int64_t ArgumentPopSize = 0;
  if (IsTailCallReturn) {
    MachineOperand &StackAdjust = MBBI->getOperand(1);

    // For a tail-call in a callee-pops-arguments environment, some or all of
    // the stack may actually be in use for the call's arguments, this is
    // calculated during LowerCall and consumed here...
    ArgumentPopSize = StackAdjust.getImm();
  } else {
    // ... otherwise the amount to pop is *all* of the argument space,
    // conveniently stored in the MachineFunctionInfo by
    // LowerFormalArguments. This will, of course, be zero for the C calling
    // convention.
    ArgumentPopSize = AFI->getArgumentStackToRestore();
  }

  return ArgumentPopSize;
}

static bool produceCompactUnwindFrame(MachineFunction &MF);
static bool needsWinCFI(const MachineFunction &MF);
static StackOffset getSVEStackSize(const MachineFunction &MF);

/// Returns true if homogeneous prolog or epilog code can be emitted
/// for the size optimization. If possible, a frame helper call is injected.
/// When an Exit block is given, this check is for the epilog.
bool AArch64FrameLowering::homogeneousPrologEpilog(
    MachineFunction &MF, MachineBasicBlock *Exit) const {
  if (!MF.getFunction().hasMinSize())
    return false;
  if (!EnableHomogeneousPrologEpilog)
    return false;
  if (ReverseCSRRestoreSeq)
    return false;
  if (EnableRedZone)
    return false;

  // TODO: Windows is not supported yet.
  if (needsWinCFI(MF))
    return false;
  // TODO: SVE is not supported yet.
  if (getSVEStackSize(MF))
    return false;

  // Bail on stack adjustment needed on return for simplicity.
  const MachineFrameInfo &MFI = MF.getFrameInfo();
  const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo();
  if (MFI.hasVarSizedObjects() || RegInfo->hasStackRealignment(MF))
    return false;
  if (Exit && getArgumentStackToRestore(MF, *Exit))
    return false;

  return true;
}

/// Returns true if CSRs should be paired.
bool AArch64FrameLowering::producePairRegisters(MachineFunction &MF) const {
  return produceCompactUnwindFrame(MF) || homogeneousPrologEpilog(MF);
}

/// This is the biggest offset to the stack pointer we can encode in aarch64
/// instructions (without using a separate calculation and a temp register).
/// Note that the exceptions here are vector stores/loads, which cannot encode
/// any displacements (see estimateRSStackSizeLimit(), isAArch64FrameOffsetLegal()).
static const unsigned DefaultSafeSPDisplacement = 255;
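// For example (illustrative only), 255 is the largest positive displacement
// that an unscaled load such as
//    ldur x0, [sp, #255]
// can encode, so frame objects beyond this displacement may need a scratch
// register during expansion.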

/// Look at each instruction that references stack frames and return the stack
/// size limit beyond which some of these instructions will require a scratch
/// register during their expansion later.
static unsigned estimateRSStackSizeLimit(MachineFunction &MF) {
  // FIXME: For now, just conservatively guesstimate based on unscaled indexing
  // range. We'll end up allocating an unnecessary spill slot a lot, but
  // realistically that's not a big deal at this stage of the game.
  for (MachineBasicBlock &MBB : MF) {
    for (MachineInstr &MI : MBB) {
      if (MI.isDebugInstr() || MI.isPseudo() ||
          MI.getOpcode() == AArch64::ADDXri ||
          MI.getOpcode() == AArch64::ADDSXri)
        continue;

      for (const MachineOperand &MO : MI.operands()) {
        if (!MO.isFI())
          continue;

        StackOffset Offset;
        if (isAArch64FrameOffsetLegal(MI, Offset, nullptr, nullptr, nullptr) ==
            AArch64FrameOffsetCannotUpdate)
          return 0;
      }
    }
  }
  return DefaultSafeSPDisplacement;
}

TargetStackID::Value
AArch64FrameLowering::getStackIDForScalableVectors() const {
  return TargetStackID::ScalableVector;
}

/// Returns the size of the fixed object area (allocated next to sp on entry).
/// On Win64 this may include a var args area and an UnwindHelp object for EH.
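/// For example (illustrative numbers), a Win64 function with 24 bytes of
/// register-passed varargs and EH funclets would get alignTo(24 + 8, 16) = 32
/// bytes of fixed object area.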
static unsigned getFixedObjectSize(const MachineFunction &MF,
                                   const AArch64FunctionInfo *AFI, bool IsWin64,
                                   bool IsFunclet) {
  if (!IsWin64 || IsFunclet) {
    return AFI->getTailCallReservedStack();
  } else {
    if (AFI->getTailCallReservedStack() != 0)
      report_fatal_error("cannot generate ABI-changing tail call for Win64");
    // Var args are stored here in the primary function.
    const unsigned VarArgsArea = AFI->getVarArgsGPRSize();
    // To support EH funclets we allocate an UnwindHelp object
    const unsigned UnwindHelpObject = (MF.hasEHFunclets() ? 8 : 0);
    return alignTo(VarArgsArea + UnwindHelpObject, 16);
  }
}

/// Returns the size of the entire SVE stackframe (calleesaves + spills).
static StackOffset getSVEStackSize(const MachineFunction &MF) {
  const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
  return StackOffset::getScalable((int64_t)AFI->getStackSizeSVE());
}

bool AArch64FrameLowering::canUseRedZone(const MachineFunction &MF) const {
  if (!EnableRedZone)
    return false;

  // Don't use the red zone if the function explicitly asks us not to.
  // This is typically used for kernel code.
  const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
  const unsigned RedZoneSize =
      Subtarget.getTargetLowering()->getRedZoneSize(MF.getFunction());
  if (!RedZoneSize)
    return false;

  const MachineFrameInfo &MFI = MF.getFrameInfo();
  const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
  uint64_t NumBytes = AFI->getLocalStackSize();

  return !(MFI.hasCalls() || hasFP(MF) || NumBytes > RedZoneSize ||
           getSVEStackSize(MF));
}

/// hasFP - Return true if the specified function should have a dedicated frame
/// pointer register.
bool AArch64FrameLowering::hasFP(const MachineFunction &MF) const {
  const MachineFrameInfo &MFI = MF.getFrameInfo();
  const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo();
  // Win64 EH requires a frame pointer if funclets are present, as the locals
  // are accessed off the frame pointer in both the parent function and the
  // funclets.
  if (MF.hasEHFunclets())
    return true;
  // Retain behavior of always omitting the FP for leaf functions when possible.
  if (MF.getTarget().Options.DisableFramePointerElim(MF))
    return true;
  if (MFI.hasVarSizedObjects() || MFI.isFrameAddressTaken() ||
      MFI.hasStackMap() || MFI.hasPatchPoint() ||
      RegInfo->hasStackRealignment(MF))
    return true;
  // With large callframes around we may need to use FP to access the scavenging
  // emergency spillslot.
  //
  // Unfortunately some calls to hasFP() like machine verifier ->
  // getReservedReg() -> hasFP in the middle of global isel are too early
  // to know the max call frame size. Hopefully conservatively returning "true"
  // in those cases is fine.
  // DefaultSafeSPDisplacement is fine as we only emergency spill GP regs.
  if (!MFI.isMaxCallFrameSizeComputed() ||
      MFI.getMaxCallFrameSize() > DefaultSafeSPDisplacement)
    return true;

  return false;
}

/// hasReservedCallFrame - Under normal circumstances, when a frame pointer is
/// not required, we reserve argument space for call sites in the function
/// immediately on entry to the current function. This eliminates the need for
/// add/sub sp brackets around call sites. Returns true if the call frame is
/// included as part of the stack frame.
bool
AArch64FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const {
  return !MF.getFrameInfo().hasVarSizedObjects();
}

MachineBasicBlock::iterator AArch64FrameLowering::eliminateCallFramePseudoInstr(
    MachineFunction &MF, MachineBasicBlock &MBB,
    MachineBasicBlock::iterator I) const {
  const AArch64InstrInfo *TII =
      static_cast<const AArch64InstrInfo *>(MF.getSubtarget().getInstrInfo());
  DebugLoc DL = I->getDebugLoc();
  unsigned Opc = I->getOpcode();
  bool IsDestroy = Opc == TII->getCallFrameDestroyOpcode();
  uint64_t CalleePopAmount = IsDestroy ? I->getOperand(1).getImm() : 0;

  if (!hasReservedCallFrame(MF)) {
    int64_t Amount = I->getOperand(0).getImm();
    Amount = alignTo(Amount, getStackAlign());
    if (!IsDestroy)
      Amount = -Amount;

    // N.b. if CalleePopAmount is valid but zero (i.e. callee would pop, but it
    // doesn't have to pop anything), then the first operand will be zero too so
    // this adjustment is a no-op.
    if (CalleePopAmount == 0) {
      // FIXME: in-function stack adjustment for calls is limited to 24-bits
      // because there's no guaranteed temporary register available.
      //
      // ADD/SUB (immediate) has only LSL #0 and LSL #12 available.
      // 1) For offset <= 12-bit, we use LSL #0
      // 2) For 12-bit <= offset <= 24-bit, we use two instructions. One uses
      //    LSL #0, and the other uses LSL #12.
      //
      // Most call frames will be allocated at the start of a function so
      // this is OK, but it is a limitation that needs dealing with.
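      //
      // For example (illustrative only), an adjustment of 0x12345 bytes could
      // be split across the two shift amounts:
      //    sub sp, sp, #0x12, lsl #12
      //    sub sp, sp, #0x345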
      assert(Amount > -0xffffff && Amount < 0xffffff && "call frame too large");
      emitFrameOffset(MBB, I, DL, AArch64::SP, AArch64::SP,
                      StackOffset::getFixed(Amount), TII);
    }
  } else if (CalleePopAmount != 0) {
    // If the calling convention demands that the callee pops arguments from the
    // stack, we want to add it back if we have a reserved call frame.
    assert(CalleePopAmount < 0xffffff && "call frame too large");
    emitFrameOffset(MBB, I, DL, AArch64::SP, AArch64::SP,
                    StackOffset::getFixed(-(int64_t)CalleePopAmount), TII);
  }
  return MBB.erase(I);
}

// Convenience function to create a DWARF expression for
//   Expr + NumBytes + NumVGScaledBytes * AArch64::VG
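//
// For example (a sketch, not the output of any particular build), NumBytes =
// -16 and NumVGScaledBytes = -8 would append the operations
//    DW_OP_consts -16, DW_OP_plus,
//    DW_OP_consts -8, DW_OP_bregx VG+0, DW_OP_mul, DW_OP_plus
// i.e. "Expr - 16 - 8 * VG".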
static void appendVGScaledOffsetExpr(SmallVectorImpl<char> &Expr,
                                     int NumBytes, int NumVGScaledBytes, unsigned VG,
                                     llvm::raw_string_ostream &Comment) {
  uint8_t buffer[16];

  if (NumBytes) {
    Expr.push_back(dwarf::DW_OP_consts);
    Expr.append(buffer, buffer + encodeSLEB128(NumBytes, buffer));
    Expr.push_back((uint8_t)dwarf::DW_OP_plus);
    Comment << (NumBytes < 0 ? " - " : " + ") << std::abs(NumBytes);
  }

  if (NumVGScaledBytes) {
    Expr.push_back((uint8_t)dwarf::DW_OP_consts);
    Expr.append(buffer, buffer + encodeSLEB128(NumVGScaledBytes, buffer));

    Expr.push_back((uint8_t)dwarf::DW_OP_bregx);
    Expr.append(buffer, buffer + encodeULEB128(VG, buffer));
    Expr.push_back(0);

    Expr.push_back((uint8_t)dwarf::DW_OP_mul);
    Expr.push_back((uint8_t)dwarf::DW_OP_plus);

    Comment << (NumVGScaledBytes < 0 ? " - " : " + ")
            << std::abs(NumVGScaledBytes) << " * VG";
  }
}

// Creates an MCCFIInstruction:
//    { DW_CFA_def_cfa_expression, ULEB128 (sizeof expr), expr }
MCCFIInstruction AArch64FrameLowering::createDefCFAExpressionFromSP(
    const TargetRegisterInfo &TRI, const StackOffset &OffsetFromSP) const {
  int64_t NumBytes, NumVGScaledBytes;
  AArch64InstrInfo::decomposeStackOffsetForDwarfOffsets(OffsetFromSP, NumBytes,
                                                        NumVGScaledBytes);

  std::string CommentBuffer = "sp";
  llvm::raw_string_ostream Comment(CommentBuffer);

  // Build up the expression (SP + NumBytes + NumVGScaledBytes * AArch64::VG)
  SmallString<64> Expr;
  Expr.push_back((uint8_t)(dwarf::DW_OP_breg0 + /*SP*/ 31));
  Expr.push_back(0);
  appendVGScaledOffsetExpr(Expr, NumBytes, NumVGScaledBytes,
                           TRI.getDwarfRegNum(AArch64::VG, true), Comment);

  // Wrap this into DW_CFA_def_cfa.
  SmallString<64> DefCfaExpr;
  DefCfaExpr.push_back(dwarf::DW_CFA_def_cfa_expression);
  uint8_t buffer[16];
  DefCfaExpr.append(buffer,
                    buffer + encodeULEB128(Expr.size(), buffer));
  DefCfaExpr.append(Expr.str());
  return MCCFIInstruction::createEscape(nullptr, DefCfaExpr.str(),
                                        Comment.str());
}

MCCFIInstruction AArch64FrameLowering::createCfaOffset(
    const TargetRegisterInfo &TRI, unsigned Reg,
    const StackOffset &OffsetFromDefCFA) const {
  int64_t NumBytes, NumVGScaledBytes;
  AArch64InstrInfo::decomposeStackOffsetForDwarfOffsets(
      OffsetFromDefCFA, NumBytes, NumVGScaledBytes);

  unsigned DwarfReg = TRI.getDwarfRegNum(Reg, true);

  // Non-scalable offsets can use DW_CFA_offset directly.
  if (!NumVGScaledBytes)
    return MCCFIInstruction::createOffset(nullptr, DwarfReg, NumBytes);

  std::string CommentBuffer;
  llvm::raw_string_ostream Comment(CommentBuffer);
  Comment << printReg(Reg, &TRI) << " @ cfa";

  // Build up expression (NumBytes + NumVGScaledBytes * AArch64::VG)
  SmallString<64> OffsetExpr;
  appendVGScaledOffsetExpr(OffsetExpr, NumBytes, NumVGScaledBytes,
                           TRI.getDwarfRegNum(AArch64::VG, true), Comment);

  // Wrap this into DW_CFA_expression
  SmallString<64> CfaExpr;
  CfaExpr.push_back(dwarf::DW_CFA_expression);
  uint8_t buffer[16];
  CfaExpr.append(buffer, buffer + encodeULEB128(DwarfReg, buffer));
  CfaExpr.append(buffer, buffer + encodeULEB128(OffsetExpr.size(), buffer));
  CfaExpr.append(OffsetExpr.str());

  return MCCFIInstruction::createEscape(nullptr, CfaExpr.str(), Comment.str());
}

void AArch64FrameLowering::emitCalleeSavedFrameMoves(
    MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) const {
  MachineFunction &MF = *MBB.getParent();
  MachineFrameInfo &MFI = MF.getFrameInfo();
  const TargetSubtargetInfo &STI = MF.getSubtarget();
  const TargetRegisterInfo *TRI = STI.getRegisterInfo();
  const TargetInstrInfo *TII = STI.getInstrInfo();
  DebugLoc DL = MBB.findDebugLoc(MBBI);

  // Add callee saved registers to move list.
  const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo();
  if (CSI.empty())
    return;

  for (const auto &Info : CSI) {
    unsigned Reg = Info.getReg();

    // Not all unwinders may know about SVE registers, so assume the lowest
    // common denominator.
    unsigned NewReg;
    if (static_cast<const AArch64RegisterInfo *>(TRI)->regNeedsCFI(Reg, NewReg))
      Reg = NewReg;
    else
      continue;

    StackOffset Offset;
    if (MFI.getStackID(Info.getFrameIdx()) == TargetStackID::ScalableVector) {
      AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
      Offset =
          StackOffset::getScalable(MFI.getObjectOffset(Info.getFrameIdx())) -
          StackOffset::getFixed(AFI->getCalleeSavedStackSize(MFI));
    } else {
      Offset = StackOffset::getFixed(MFI.getObjectOffset(Info.getFrameIdx()) -
                                     getOffsetOfLocalArea());
    }
    unsigned CFIIndex = MF.addFrameInst(createCfaOffset(*TRI, Reg, Offset));
    BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
        .addCFIIndex(CFIIndex)
        .setMIFlags(MachineInstr::FrameSetup);
  }
}

// Find a scratch register that we can use at the start of the prologue to
// re-align the stack pointer. We avoid using callee-save registers since they
// may appear to be free when this is called from canUseAsPrologue (during
// shrink wrapping), but then no longer be free when this is called from
// emitPrologue.
//
// FIXME: This is a bit conservative, since in the above case we could use one
// of the callee-save registers as a scratch temp to re-align the stack pointer,
// but we would then have to make sure that we were in fact saving at least one
// callee-save register in the prologue, which is additional complexity that
// doesn't seem worth the benefit.
static unsigned findScratchNonCalleeSaveRegister(MachineBasicBlock *MBB) {
  MachineFunction *MF = MBB->getParent();

  // If MBB is an entry block, use X9 as the scratch register
  if (&MF->front() == MBB)
    return AArch64::X9;

  const AArch64Subtarget &Subtarget = MF->getSubtarget<AArch64Subtarget>();
  const AArch64RegisterInfo &TRI = *Subtarget.getRegisterInfo();
  LivePhysRegs LiveRegs(TRI);
  LiveRegs.addLiveIns(*MBB);

  // Mark callee saved registers as used so we will not choose them.
  const MCPhysReg *CSRegs = MF->getRegInfo().getCalleeSavedRegs();
  for (unsigned i = 0; CSRegs[i]; ++i)
    LiveRegs.addReg(CSRegs[i]);

  // Prefer X9 since it was historically used for the prologue scratch reg.
  const MachineRegisterInfo &MRI = MF->getRegInfo();
  if (LiveRegs.available(MRI, AArch64::X9))
    return AArch64::X9;

  for (unsigned Reg : AArch64::GPR64RegClass) {
    if (LiveRegs.available(MRI, Reg))
      return Reg;
  }
  return AArch64::NoRegister;
}

bool AArch64FrameLowering::canUseAsPrologue(
    const MachineBasicBlock &MBB) const {
  const MachineFunction *MF = MBB.getParent();
  MachineBasicBlock *TmpMBB = const_cast<MachineBasicBlock *>(&MBB);
  const AArch64Subtarget &Subtarget = MF->getSubtarget<AArch64Subtarget>();
  const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo();

  // Don't need a scratch register if we're not going to re-align the stack.
  if (!RegInfo->hasStackRealignment(*MF))
    return true;
  // Otherwise, we can use any block as long as it has a scratch register
  // available.
  return findScratchNonCalleeSaveRegister(TmpMBB) != AArch64::NoRegister;
}

static bool windowsRequiresStackProbe(MachineFunction &MF,
                                      uint64_t StackSizeInBytes) {
  const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
  if (!Subtarget.isTargetWindows())
    return false;
  const Function &F = MF.getFunction();
  // TODO: When implementing stack protectors, take that into account
  // for the probe threshold.
  unsigned StackProbeSize = 4096;
  if (F.hasFnAttribute("stack-probe-size"))
    F.getFnAttribute("stack-probe-size")
        .getValueAsString()
        .getAsInteger(0, StackProbeSize);
  return (StackSizeInBytes >= StackProbeSize) &&
         !F.hasFnAttribute("no-stack-arg-probe");
}

static bool needsWinCFI(const MachineFunction &MF) {
  const Function &F = MF.getFunction();
  return MF.getTarget().getMCAsmInfo()->usesWindowsCFI() &&
         F.needsUnwindTableEntry();
}

bool AArch64FrameLowering::shouldCombineCSRLocalStackBump(
    MachineFunction &MF, uint64_t StackBumpBytes) const {
  AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
  const MachineFrameInfo &MFI = MF.getFrameInfo();
  const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
  const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
  if (homogeneousPrologEpilog(MF))
    return false;

  if (AFI->getLocalStackSize() == 0)
    return false;

  // For WinCFI, if optimizing for size, prefer to not combine the stack bump
  // (to force a stp with predecrement) to match the packed unwind format,
  // provided that there actually are any callee saved registers to merge the
  // decrement with.
  // This is potentially marginally slower, but allows using the packed
  // unwind format for functions that both have a local area and callee saved
  // registers. Using the packed unwind format notably reduces the size of
  // the unwind info.
  if (needsWinCFI(MF) && AFI->getCalleeSavedStackSize() > 0 &&
      MF.getFunction().hasOptSize())
    return false;

  // 512 is the maximum immediate for stp/ldp that will be used for
  // callee-save save/restores
  if (StackBumpBytes >= 512 || windowsRequiresStackProbe(MF, StackBumpBytes))
    return false;

  if (MFI.hasVarSizedObjects())
    return false;

  if (RegInfo->hasStackRealignment(MF))
    return false;

  // This isn't strictly necessary, but it simplifies things a bit since the
  // current RedZone handling code assumes the SP is adjusted by the
  // callee-save save/restore code.
  if (canUseRedZone(MF))
    return false;

  // When there is an SVE area on the stack, always allocate the
  // callee-saves and spills/locals separately.
  if (getSVEStackSize(MF))
    return false;

  return true;
}

bool AArch64FrameLowering::shouldCombineCSRLocalStackBumpInEpilogue(
    MachineBasicBlock &MBB, unsigned StackBumpBytes) const {
  if (!shouldCombineCSRLocalStackBump(*MBB.getParent(), StackBumpBytes))
    return false;

  if (MBB.empty())
    return true;

  // Disable combined SP bump if the last instruction is an MTE tag store. It
  // is almost always better to merge SP adjustment into those instructions.
  MachineBasicBlock::iterator LastI = MBB.getFirstTerminator();
  MachineBasicBlock::iterator Begin = MBB.begin();
  while (LastI != Begin) {
    --LastI;
    if (LastI->isTransient())
      continue;
    if (!LastI->getFlag(MachineInstr::FrameDestroy))
      break;
  }
  switch (LastI->getOpcode()) {
  case AArch64::STGloop:
  case AArch64::STZGloop:
  case AArch64::STGOffset:
  case AArch64::STZGOffset:
  case AArch64::ST2GOffset:
  case AArch64::STZ2GOffset:
    return false;
  default:
    return true;
  }
  llvm_unreachable("unreachable");
}

// Given a load or a store instruction, generate an appropriate unwinding SEH
// code on Windows.
static MachineBasicBlock::iterator InsertSEH(MachineBasicBlock::iterator MBBI,
                                             const TargetInstrInfo &TII,
                                             MachineInstr::MIFlag Flag) {
  unsigned Opc = MBBI->getOpcode();
  MachineBasicBlock *MBB = MBBI->getParent();
  MachineFunction &MF = *MBB->getParent();
  DebugLoc DL = MBBI->getDebugLoc();
  unsigned ImmIdx = MBBI->getNumOperands() - 1;
  int Imm = MBBI->getOperand(ImmIdx).getImm();
  MachineInstrBuilder MIB;
  const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
  const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo();

  switch (Opc) {
  default:
    llvm_unreachable("No SEH Opcode for this instruction");
  case AArch64::LDPDpost:
    Imm = -Imm;
    LLVM_FALLTHROUGH;
  case AArch64::STPDpre: {
    unsigned Reg0 = RegInfo->getSEHRegNum(MBBI->getOperand(1).getReg());
    unsigned Reg1 = RegInfo->getSEHRegNum(MBBI->getOperand(2).getReg());
    MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveFRegP_X))
              .addImm(Reg0)
              .addImm(Reg1)
              .addImm(Imm * 8)
              .setMIFlag(Flag);
    break;
  }
  case AArch64::LDPXpost:
    Imm = -Imm;
    LLVM_FALLTHROUGH;
  case AArch64::STPXpre: {
    Register Reg0 = MBBI->getOperand(1).getReg();
    Register Reg1 = MBBI->getOperand(2).getReg();
    if (Reg0 == AArch64::FP && Reg1 == AArch64::LR)
      MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveFPLR_X))
                .addImm(Imm * 8)
                .setMIFlag(Flag);
    else
      MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveRegP_X))
                .addImm(RegInfo->getSEHRegNum(Reg0))
                .addImm(RegInfo->getSEHRegNum(Reg1))
                .addImm(Imm * 8)
                .setMIFlag(Flag);
    break;
  }
  case AArch64::LDRDpost:
    Imm = -Imm;
    LLVM_FALLTHROUGH;
  case AArch64::STRDpre: {
    unsigned Reg = RegInfo->getSEHRegNum(MBBI->getOperand(1).getReg());
    MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveFReg_X))
              .addImm(Reg)
              .addImm(Imm)
              .setMIFlag(Flag);
    break;
  }
  case AArch64::LDRXpost:
    Imm = -Imm;
    LLVM_FALLTHROUGH;
  case AArch64::STRXpre: {
    unsigned Reg = RegInfo->getSEHRegNum(MBBI->getOperand(1).getReg());
    MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveReg_X))
              .addImm(Reg)
              .addImm(Imm)
              .setMIFlag(Flag);
    break;
  }
  case AArch64::STPDi:
  case AArch64::LDPDi: {
    unsigned Reg0 = RegInfo->getSEHRegNum(MBBI->getOperand(0).getReg());
    unsigned Reg1 = RegInfo->getSEHRegNum(MBBI->getOperand(1).getReg());
    MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveFRegP))
              .addImm(Reg0)
              .addImm(Reg1)
              .addImm(Imm * 8)
              .setMIFlag(Flag);
    break;
  }
  case AArch64::STPXi:
  case AArch64::LDPXi: {
    Register Reg0 = MBBI->getOperand(0).getReg();
    Register Reg1 = MBBI->getOperand(1).getReg();
    if (Reg0 == AArch64::FP && Reg1 == AArch64::LR)
      MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveFPLR))
                .addImm(Imm * 8)
                .setMIFlag(Flag);
    else
      MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveRegP))
                .addImm(RegInfo->getSEHRegNum(Reg0))
                .addImm(RegInfo->getSEHRegNum(Reg1))
                .addImm(Imm * 8)
                .setMIFlag(Flag);
    break;
  }
  case AArch64::STRXui:
  case AArch64::LDRXui: {
    int Reg = RegInfo->getSEHRegNum(MBBI->getOperand(0).getReg());
    MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveReg))
              .addImm(Reg)
              .addImm(Imm * 8)
              .setMIFlag(Flag);
    break;
  }
  case AArch64::STRDui:
  case AArch64::LDRDui: {
    unsigned Reg = RegInfo->getSEHRegNum(MBBI->getOperand(0).getReg());
    MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveFReg))
              .addImm(Reg)
              .addImm(Imm * 8)
              .setMIFlag(Flag);
    break;
  }
  }
  auto I = MBB->insertAfter(MBBI, MIB);
  return I;
}

// Fix up the SEH opcode associated with the save/restore instruction.
static void fixupSEHOpcode(MachineBasicBlock::iterator MBBI,
                           unsigned LocalStackSize) {
  MachineOperand *ImmOpnd = nullptr;
  unsigned ImmIdx = MBBI->getNumOperands() - 1;
  switch (MBBI->getOpcode()) {
  default:
    llvm_unreachable("Fix the offset in the SEH instruction");
  case AArch64::SEH_SaveFPLR:
  case AArch64::SEH_SaveRegP:
  case AArch64::SEH_SaveReg:
  case AArch64::SEH_SaveFRegP:
  case AArch64::SEH_SaveFReg:
    ImmOpnd = &MBBI->getOperand(ImmIdx);
    break;
  }
  if (ImmOpnd)
    ImmOpnd->setImm(ImmOpnd->getImm() + LocalStackSize);
}

// Convert callee-save register save/restore instruction to do stack pointer
// decrement/increment to allocate/deallocate the callee-save stack area by
// converting store/load to use pre/post increment version.
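//
// For example, with a 16-byte callee-save area, the first save in the
// prologue,
//    stp x29, x30, [sp]
// becomes a pre-decrementing store that also allocates the area:
//    stp x29, x30, [sp, #-16]!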
static MachineBasicBlock::iterator convertCalleeSaveRestoreToSPPrePostIncDec(
    MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
    const DebugLoc &DL, const TargetInstrInfo *TII, int CSStackSizeInc,
    bool NeedsWinCFI, bool *HasWinCFI, bool InProlog = true) {
  // Ignore instructions that do not operate on SP, i.e. shadow call stack
  // instructions and associated CFI instruction.
  while (MBBI->getOpcode() == AArch64::STRXpost ||
         MBBI->getOpcode() == AArch64::LDRXpre ||
         MBBI->getOpcode() == AArch64::CFI_INSTRUCTION) {
    if (MBBI->getOpcode() != AArch64::CFI_INSTRUCTION)
      assert(MBBI->getOperand(0).getReg() != AArch64::SP);
    ++MBBI;
  }
  unsigned NewOpc;
  switch (MBBI->getOpcode()) {
  default:
    llvm_unreachable("Unexpected callee-save save/restore opcode!");
  case AArch64::STPXi:
    NewOpc = AArch64::STPXpre;
    break;
  case AArch64::STPDi:
    NewOpc = AArch64::STPDpre;
    break;
  case AArch64::STPQi:
    NewOpc = AArch64::STPQpre;
    break;
  case AArch64::STRXui:
    NewOpc = AArch64::STRXpre;
    break;
  case AArch64::STRDui:
    NewOpc = AArch64::STRDpre;
    break;
  case AArch64::STRQui:
    NewOpc = AArch64::STRQpre;
    break;
  case AArch64::LDPXi:
    NewOpc = AArch64::LDPXpost;
    break;
  case AArch64::LDPDi:
    NewOpc = AArch64::LDPDpost;
    break;
  case AArch64::LDPQi:
    NewOpc = AArch64::LDPQpost;
    break;
  case AArch64::LDRXui:
    NewOpc = AArch64::LDRXpost;
    break;
  case AArch64::LDRDui:
    NewOpc = AArch64::LDRDpost;
    break;
  case AArch64::LDRQui:
    NewOpc = AArch64::LDRQpost;
    break;
  }
  // Get rid of the SEH code associated with the old instruction.
  if (NeedsWinCFI) {
    auto SEH = std::next(MBBI);
    if (AArch64InstrInfo::isSEHInstruction(*SEH))
      SEH->eraseFromParent();
  }

  TypeSize Scale = TypeSize::Fixed(1);
  unsigned Width;
  int64_t MinOffset, MaxOffset;
  bool Success = static_cast<const AArch64InstrInfo *>(TII)->getMemOpInfo(
      NewOpc, Scale, Width, MinOffset, MaxOffset);
  (void)Success;
  assert(Success && "unknown load/store opcode");

  // If the first store isn't right where we want SP then we can't fold the
  // update in so create a normal arithmetic instruction instead.
  if (MBBI->getOperand(MBBI->getNumOperands() - 1).getImm() != 0 ||
      CSStackSizeInc < MinOffset || CSStackSizeInc > MaxOffset) {
    emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP,
                    StackOffset::getFixed(CSStackSizeInc), TII,
                    InProlog ? MachineInstr::FrameSetup
                             : MachineInstr::FrameDestroy);
    return std::prev(MBBI);
  }

  MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII->get(NewOpc));
  MIB.addReg(AArch64::SP, RegState::Define);

  // Copy all operands other than the immediate offset.
  unsigned OpndIdx = 0;
  for (unsigned OpndEnd = MBBI->getNumOperands() - 1; OpndIdx < OpndEnd;
       ++OpndIdx)
    MIB.add(MBBI->getOperand(OpndIdx));

  assert(MBBI->getOperand(OpndIdx).getImm() == 0 &&
         "Unexpected immediate offset in first/last callee-save save/restore "
         "instruction!");
  assert(MBBI->getOperand(OpndIdx - 1).getReg() == AArch64::SP &&
         "Unexpected base register in callee-save save/restore instruction!");
  assert(CSStackSizeInc % Scale == 0);
  MIB.addImm(CSStackSizeInc / (int)Scale);

  MIB.setMIFlags(MBBI->getFlags());
  MIB.setMemRefs(MBBI->memoperands());

  // Generate a new SEH code that corresponds to the new instruction.
  if (NeedsWinCFI) {
    *HasWinCFI = true;
    InsertSEH(*MIB, *TII,
              InProlog ? MachineInstr::FrameSetup : MachineInstr::FrameDestroy);
  }

  return std::prev(MBB.erase(MBBI));
}

// Fixup callee-save register save/restore instructions to take into account
// combined SP bump by adding the local stack size to the stack offsets.
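//
// For example, with a combined SP bump and LocalStackSize = 32, a save such as
//    stp x20, x19, [sp, #16]
// is rewritten to be relative to the single, larger SP decrement:
//    stp x20, x19, [sp, #48]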
static void fixupCalleeSaveRestoreStackOffset(MachineInstr &MI,
                                              uint64_t LocalStackSize,
                                              bool NeedsWinCFI,
                                              bool *HasWinCFI) {
  if (AArch64InstrInfo::isSEHInstruction(MI))
    return;

  unsigned Opc = MI.getOpcode();

  // Ignore instructions that do not operate on SP, i.e. shadow call stack
  // instructions and associated CFI instruction.
  if (Opc == AArch64::STRXpost || Opc == AArch64::LDRXpre ||
      Opc == AArch64::CFI_INSTRUCTION) {
    if (Opc != AArch64::CFI_INSTRUCTION)
      assert(MI.getOperand(0).getReg() != AArch64::SP);
    return;
  }

  unsigned Scale;
  switch (Opc) {
  case AArch64::STPXi:
  case AArch64::STRXui:
  case AArch64::STPDi:
  case AArch64::STRDui:
  case AArch64::LDPXi:
  case AArch64::LDRXui:
  case AArch64::LDPDi:
  case AArch64::LDRDui:
    Scale = 8;
    break;
  case AArch64::STPQi:
  case AArch64::STRQui:
  case AArch64::LDPQi:
  case AArch64::LDRQui:
    Scale = 16;
    break;
  default:
    llvm_unreachable("Unexpected callee-save save/restore opcode!");
  }

  unsigned OffsetIdx = MI.getNumExplicitOperands() - 1;
  assert(MI.getOperand(OffsetIdx - 1).getReg() == AArch64::SP &&
         "Unexpected base register in callee-save save/restore instruction!");
  // Last operand is immediate offset that needs fixing.
  MachineOperand &OffsetOpnd = MI.getOperand(OffsetIdx);
  // All generated opcodes have scaled offsets.
  assert(LocalStackSize % Scale == 0);
  OffsetOpnd.setImm(OffsetOpnd.getImm() + LocalStackSize / Scale);

  if (NeedsWinCFI) {
    *HasWinCFI = true;
    auto MBBI = std::next(MachineBasicBlock::iterator(MI));
    assert(MBBI != MI.getParent()->end() && "Expecting a valid instruction");
    assert(AArch64InstrInfo::isSEHInstruction(*MBBI) &&
           "Expecting a SEH instruction");
    fixupSEHOpcode(MBBI, LocalStackSize);
  }
}

static void adaptForLdStOpt(MachineBasicBlock &MBB,
                            MachineBasicBlock::iterator FirstSPPopI,
                            MachineBasicBlock::iterator LastPopI) {
  // Sometimes (when we restore in the same order as we save), we can end up
  // with code like this:
  //
  // ldp x26, x25, [sp]
  // ldp x24, x23, [sp, #16]
  // ldp x22, x21, [sp, #32]
  // ldp x20, x19, [sp, #48]
  // add sp, sp, #64
  //
  // In this case, it is always better to put the first ldp at the end, so
  // that the load-store optimizer can run and merge the ldp and the add into
  // a post-index ldp.
  // If we managed to grab the first pop instruction, move it to the end.
  if (ReverseCSRRestoreSeq)
    MBB.splice(FirstSPPopI, &MBB, LastPopI);
  // We should end up with something like this now:
  //
  // ldp x24, x23, [sp, #16]
  // ldp x22, x21, [sp, #32]
  // ldp x20, x19, [sp, #48]
  // ldp x26, x25, [sp]
  // add sp, sp, #64
  //
  // and the load-store optimizer can merge the last two instructions into:
  //
  // ldp x26, x25, [sp], #64
  //
}

static bool isTargetWindows(const MachineFunction &MF) {
  return MF.getSubtarget<AArch64Subtarget>().isTargetWindows();
}

// Convenience function to determine whether I is an SVE callee save.
static bool IsSVECalleeSave(MachineBasicBlock::iterator I) {
  switch (I->getOpcode()) {
  default:
    return false;
  case AArch64::STR_ZXI:
  case AArch64::STR_PXI:
  case AArch64::LDR_ZXI:
  case AArch64::LDR_PXI:
    return I->getFlag(MachineInstr::FrameSetup) ||
           I->getFlag(MachineInstr::FrameDestroy);
  }
}

void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
                                        MachineBasicBlock &MBB) const {
  MachineBasicBlock::iterator MBBI = MBB.begin();
  const MachineFrameInfo &MFI = MF.getFrameInfo();
  const Function &F = MF.getFunction();
  const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
  const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
  MachineModuleInfo &MMI = MF.getMMI();
  AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
  bool needsFrameMoves =
      MF.needsFrameMoves() && !MF.getTarget().getMCAsmInfo()->usesWindowsCFI();
  bool HasFP = hasFP(MF);
  bool NeedsWinCFI = needsWinCFI(MF);
  bool HasWinCFI = false;
  auto Cleanup = make_scope_exit([&]() { MF.setHasWinCFI(HasWinCFI); });

  bool IsFunclet = MBB.isEHFuncletEntry();

  // At this point, we're going to decide whether or not the function uses a
  // redzone. In most cases, the function doesn't have a redzone so let's
  // assume that's false and set it to true in the case that there's a redzone.
  AFI->setHasRedZone(false);

  // Debug location must be unknown since the first debug location is used
  // to determine the end of the prologue.
  DebugLoc DL;

  const auto &MFnI = *MF.getInfo<AArch64FunctionInfo>();
  if (MFnI.shouldSignReturnAddress()) {
    if (MFnI.shouldSignWithBKey()) {
      BuildMI(MBB, MBBI, DL, TII->get(AArch64::EMITBKEY))
          .setMIFlag(MachineInstr::FrameSetup);
      BuildMI(MBB, MBBI, DL, TII->get(AArch64::PACIBSP))
          .setMIFlag(MachineInstr::FrameSetup);
    } else {
      BuildMI(MBB, MBBI, DL, TII->get(AArch64::PACIASP))
          .setMIFlag(MachineInstr::FrameSetup);
    }

    unsigned CFIIndex =
        MF.addFrameInst(MCCFIInstruction::createNegateRAState(nullptr));
    BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
        .addCFIIndex(CFIIndex)
        .setMIFlags(MachineInstr::FrameSetup);
  }

  // We signal the presence of a Swift extended frame to external tools by
  // storing FP with 0b0001 in bits 63:60. In normal userland operation a
  // simple ORR is sufficient; it is assumed a Swift kernel would initialize
  // the TBI bits so that this still holds.
  if (HasFP && AFI->hasSwiftAsyncContext()) {
    // ORR x29, x29, #0x1000_0000_0000_0000
    BuildMI(MBB, MBBI, DL, TII->get(AArch64::ORRXri), AArch64::FP)
        .addUse(AArch64::FP)
        .addImm(0x1100)
        .setMIFlag(MachineInstr::FrameSetup);
  }

  // All calls are tail calls in GHC calling conv, and functions have no
  // prologue/epilogue.
  if (MF.getFunction().getCallingConv() == CallingConv::GHC)
    return;

  // Set tagged base pointer to the requested stack slot.
  // Ideally it should match the SP value after the prologue.
  Optional<int> TBPI = AFI->getTaggedBasePointerIndex();
  if (TBPI)
    AFI->setTaggedBasePointerOffset(-MFI.getObjectOffset(*TBPI));
  else
    AFI->setTaggedBasePointerOffset(MFI.getStackSize());

  const StackOffset &SVEStackSize = getSVEStackSize(MF);

  // getStackSize() includes all the locals in its size calculation. We don't
  // include these locals when computing the stack size of a funclet, as they
  // are allocated in the parent's stack frame and accessed via the frame
  // pointer from the funclet. We only save the callee saved registers in the
  // funclet, which are really the callee saved registers of the parent
  // function, including the funclet.
  int64_t NumBytes = IsFunclet ? getWinEHFuncletFrameSize(MF)
                               : MFI.getStackSize();
  if (!AFI->hasStackFrame() && !windowsRequiresStackProbe(MF, NumBytes)) {
    assert(!HasFP && "unexpected function without stack frame but with FP");
    assert(!SVEStackSize &&
           "unexpected function without stack frame but with SVE objects");
    // All of the stack allocation is for locals.
    AFI->setLocalStackSize(NumBytes);
    if (!NumBytes)
      return;
    // REDZONE: If the stack size is less than 128 bytes, we don't need
    // to actually allocate.
    if (canUseRedZone(MF)) {
      AFI->setHasRedZone(true);
      ++NumRedZoneFunctions;
    } else {
      emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP,
                      StackOffset::getFixed(-NumBytes), TII,
                      MachineInstr::FrameSetup, false, NeedsWinCFI, &HasWinCFI);
      if (!NeedsWinCFI && needsFrameMoves) {
        // Label used to tie together the PROLOG_LABEL and the MachineMoves.
        MCSymbol *FrameLabel = MMI.getContext().createTempSymbol();
        // Encode the stack size of the leaf function.
        unsigned CFIIndex = MF.addFrameInst(
            MCCFIInstruction::cfiDefCfaOffset(FrameLabel, NumBytes));
        BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
            .addCFIIndex(CFIIndex)
            .setMIFlags(MachineInstr::FrameSetup);
      }
    }

    if (NeedsWinCFI) {
      HasWinCFI = true;
      BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_PrologEnd))
          .setMIFlag(MachineInstr::FrameSetup);
    }

    return;
  }

  bool IsWin64 =
      Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv());
  unsigned FixedObject = getFixedObjectSize(MF, AFI, IsWin64, IsFunclet);

  auto PrologueSaveSize = AFI->getCalleeSavedStackSize() + FixedObject;
  // All of the remaining stack allocations are for locals.
  AFI->setLocalStackSize(NumBytes - PrologueSaveSize);
  bool CombineSPBump = shouldCombineCSRLocalStackBump(MF, NumBytes);
  bool HomPrologEpilog = homogeneousPrologEpilog(MF);
  if (CombineSPBump) {
    assert(!SVEStackSize && "Cannot combine SP bump with SVE");
    emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP,
                    StackOffset::getFixed(-NumBytes), TII,
                    MachineInstr::FrameSetup, false, NeedsWinCFI, &HasWinCFI);
    NumBytes = 0;
  } else if (HomPrologEpilog) {
    // Stack has been already adjusted.
    NumBytes -= PrologueSaveSize;
  } else if (PrologueSaveSize != 0) {
    MBBI = convertCalleeSaveRestoreToSPPrePostIncDec(
        MBB, MBBI, DL, TII, -PrologueSaveSize, NeedsWinCFI, &HasWinCFI);
    NumBytes -= PrologueSaveSize;
  }
  assert(NumBytes >= 0 && "Negative stack allocation size!?");

  // Move past the saves of the callee-saved registers, fixing up the offsets
  // and pre-inc if we decided to combine the callee-save and local stack
  // pointer bump above.
  MachineBasicBlock::iterator End = MBB.end();
  while (MBBI != End && MBBI->getFlag(MachineInstr::FrameSetup) &&
         !IsSVECalleeSave(MBBI)) {
    if (CombineSPBump)
      fixupCalleeSaveRestoreStackOffset(*MBBI, AFI->getLocalStackSize(),
                                        NeedsWinCFI, &HasWinCFI);
    ++MBBI;
  }

  // For funclets the FP belongs to the containing function.
  if (!IsFunclet && HasFP) {
    // Only set up FP if we actually need to.
    int64_t FPOffset = AFI->getCalleeSaveBaseToFrameRecordOffset();

    if (CombineSPBump)
      FPOffset += AFI->getLocalStackSize();

    if (AFI->hasSwiftAsyncContext()) {
      // Before we update the live FP we have to ensure there's a valid (or
      // null) asynchronous context in its slot just before FP in the frame
      // record, so store it now.
      const auto &Attrs = MF.getFunction().getAttributes();
      bool HaveInitialContext = Attrs.hasAttrSomewhere(Attribute::SwiftAsync);
      if (HaveInitialContext)
        MBB.addLiveIn(AArch64::X22);
      BuildMI(MBB, MBBI, DL, TII->get(AArch64::StoreSwiftAsyncContext))
          .addUse(HaveInitialContext ? AArch64::X22 : AArch64::XZR)
          .addUse(AArch64::SP)
          .addImm(FPOffset - 8)
          .setMIFlags(MachineInstr::FrameSetup);
    }

    if (HomPrologEpilog) {
      auto Prolog = MBBI;
      --Prolog;
      assert(Prolog->getOpcode() == AArch64::HOM_Prolog);
      Prolog->addOperand(MachineOperand::CreateImm(FPOffset));
    } else {
      // Issue sub fp, sp, FPOffset or
      // mov fp, sp when FPOffset is zero.
      // Note: All stores of callee-saved registers are marked as "FrameSetup".
      // This code marks the instruction(s) that set the FP also.
      emitFrameOffset(MBB, MBBI, DL, AArch64::FP, AArch64::SP,
                      StackOffset::getFixed(FPOffset), TII,
                      MachineInstr::FrameSetup, false, NeedsWinCFI, &HasWinCFI);
    }
  }

  if (windowsRequiresStackProbe(MF, NumBytes)) {
    uint64_t NumWords = NumBytes >> 4;
    if (NeedsWinCFI) {
      HasWinCFI = true;
      // alloc_l can hold at most 256MB, so assume that NumBytes doesn't
      // exceed this amount. We need to move at most 2^24 - 1 into x15.
      // This is at most two instructions, MOVZ followed by MOVK.
      // TODO: Fix to use multiple stack alloc unwind codes for stacks
      // exceeding 256MB in size.
      if (NumBytes >= (1 << 28))
        report_fatal_error("Stack size cannot exceed 256MB for stack "
                           "unwinding purposes");

      uint32_t LowNumWords = NumWords & 0xFFFF;
      BuildMI(MBB, MBBI, DL, TII->get(AArch64::MOVZXi), AArch64::X15)
          .addImm(LowNumWords)
          .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0))
          .setMIFlag(MachineInstr::FrameSetup);
      BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
          .setMIFlag(MachineInstr::FrameSetup);
      if ((NumWords & 0xFFFF0000) != 0) {
        BuildMI(MBB, MBBI, DL, TII->get(AArch64::MOVKXi), AArch64::X15)
            .addReg(AArch64::X15)
            .addImm((NumWords & 0xFFFF0000) >> 16) // High half
            .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 16))
            .setMIFlag(MachineInstr::FrameSetup);
        BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
            .setMIFlag(MachineInstr::FrameSetup);
      }
    } else {
      BuildMI(MBB, MBBI, DL, TII->get(AArch64::MOVi64imm), AArch64::X15)
          .addImm(NumWords)
          .setMIFlags(MachineInstr::FrameSetup);
    }

    switch (MF.getTarget().getCodeModel()) {
    case CodeModel::Tiny:
    case CodeModel::Small:
    case CodeModel::Medium:
    case CodeModel::Kernel:
      BuildMI(MBB, MBBI, DL, TII->get(AArch64::BL))
          .addExternalSymbol("__chkstk")
          .addReg(AArch64::X15, RegState::Implicit)
          .addReg(AArch64::X16, RegState::Implicit | RegState::Define | RegState::Dead)
          .addReg(AArch64::X17, RegState::Implicit | RegState::Define | RegState::Dead)
          .addReg(AArch64::NZCV, RegState::Implicit | RegState::Define | RegState::Dead)
          .setMIFlags(MachineInstr::FrameSetup);
      if (NeedsWinCFI) {
        HasWinCFI = true;
        BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
            .setMIFlag(MachineInstr::FrameSetup);
      }
      break;
    case CodeModel::Large:
      BuildMI(MBB, MBBI, DL, TII->get(AArch64::MOVaddrEXT))
          .addReg(AArch64::X16, RegState::Define)
          .addExternalSymbol("__chkstk")
          .addExternalSymbol("__chkstk")
          .setMIFlags(MachineInstr::FrameSetup);
      if (NeedsWinCFI) {
        HasWinCFI = true;
        BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
            .setMIFlag(MachineInstr::FrameSetup);
      }

      BuildMI(MBB, MBBI, DL, TII->get(getBLRCallOpcode(MF)))
          .addReg(AArch64::X16, RegState::Kill)
          .addReg(AArch64::X15, RegState::Implicit | RegState::Define)
          .addReg(AArch64::X16, RegState::Implicit | RegState::Define | RegState::Dead)
          .addReg(AArch64::X17, RegState::Implicit | RegState::Define | RegState::Dead)
          .addReg(AArch64::NZCV, RegState::Implicit | RegState::Define | RegState::Dead)
          .setMIFlags(MachineInstr::FrameSetup);
      if (NeedsWinCFI) {
        HasWinCFI = true;
        BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
            .setMIFlag(MachineInstr::FrameSetup);
      }
      break;
    }

    BuildMI(MBB, MBBI, DL, TII->get(AArch64::SUBXrx64), AArch64::SP)
        .addReg(AArch64::SP, RegState::Kill)
        .addReg(AArch64::X15, RegState::Kill)
        .addImm(AArch64_AM::getArithExtendImm(AArch64_AM::UXTX, 4))
        .setMIFlags(MachineInstr::FrameSetup);
    if (NeedsWinCFI) {
      HasWinCFI = true;
      BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_StackAlloc))
          .addImm(NumBytes)
          .setMIFlag(MachineInstr::FrameSetup);
    }
    NumBytes = 0;
  }

  StackOffset AllocateBefore = SVEStackSize, AllocateAfter = {};
  MachineBasicBlock::iterator CalleeSavesBegin = MBBI, CalleeSavesEnd = MBBI;

  // Process the SVE callee-saves to determine what space needs to be
  // allocated.
  if (int64_t CalleeSavedSize = AFI->getSVECalleeSavedStackSize()) {
    // Find callee save instructions in frame.
    CalleeSavesBegin = MBBI;
    assert(IsSVECalleeSave(CalleeSavesBegin) && "Unexpected instruction");
    while (IsSVECalleeSave(MBBI) && MBBI != MBB.getFirstTerminator())
      ++MBBI;
    CalleeSavesEnd = MBBI;

    AllocateBefore = StackOffset::getScalable(CalleeSavedSize);
    AllocateAfter = SVEStackSize - AllocateBefore;
  }

  // Allocate space for the callee saves (if any).
  emitFrameOffset(MBB, CalleeSavesBegin, DL, AArch64::SP, AArch64::SP,
                  -AllocateBefore, TII,
                  MachineInstr::FrameSetup);

  // Finally allocate remaining SVE stack space.
  emitFrameOffset(MBB, CalleeSavesEnd, DL, AArch64::SP, AArch64::SP,
                  -AllocateAfter, TII,
                  MachineInstr::FrameSetup);

  // Allocate space for the rest of the frame.
  if (NumBytes) {
    // Alignment is required for the parent frame, not the funclet
    const bool NeedsRealignment =
        !IsFunclet && RegInfo->hasStackRealignment(MF);
    unsigned scratchSPReg = AArch64::SP;

    if (NeedsRealignment) {
      scratchSPReg = findScratchNonCalleeSaveRegister(&MBB);
      assert(scratchSPReg != AArch64::NoRegister);
    }

    // If we're a leaf function, try using the red zone.
    if (!canUseRedZone(MF))
      // FIXME: in the case of dynamic re-alignment, NumBytes doesn't have
      // the correct value here, as NumBytes also includes padding bytes,
      // which shouldn't be counted here.
      emitFrameOffset(MBB, MBBI, DL, scratchSPReg, AArch64::SP,
                      StackOffset::getFixed(-NumBytes), TII,
                      MachineInstr::FrameSetup, false, NeedsWinCFI, &HasWinCFI);

    if (NeedsRealignment) {
      const unsigned NrBitsToZero = Log2(MFI.getMaxAlign());
      assert(NrBitsToZero > 1);
      assert(scratchSPReg != AArch64::SP);

1445 // SUB X9, SP, NumBytes
1446 // -- X9 is a temporary register, so it shouldn't contain any live data here,
1447 // -- free to use. This is already produced by emitFrameOffset above.
1448 // AND SP, X9, 0b11111...0000
1449 // The logical immediates have a non-trivial encoding. The following
1450 // formula computes the encoded immediate with all ones but
1451 // NrBitsToZero zero bits as least significant bits.
1452 uint32_t andMaskEncoded = (1 << 12) // = N
1453 | ((64 - NrBitsToZero) << 6) // immr
1454 | ((64 - NrBitsToZero - 1) << 0); // imms
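// Worked example: for MaxAlign = 32, NrBitsToZero = 5, so andMaskEncoded =
// 0x1000 | (59 << 6) | 58 = 0x1EFA, which decodes (N=1, immr=59, imms=58)
// to the 64-bit mask 0xFFFFFFFFFFFFFFE0.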
1455
1456 BuildMI(MBB, MBBI, DL, TII->get(AArch64::ANDXri), AArch64::SP)
1457 .addReg(scratchSPReg, RegState::Kill)
1458 .addImm(andMaskEncoded);
1459 AFI->setStackRealigned(true);
1460 if (NeedsWinCFI) {
1461 HasWinCFI = true;
1462 BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_StackAlloc))
1463 .addImm(NumBytes & andMaskEncoded)
1464 .setMIFlag(MachineInstr::FrameSetup);
1465 }
1466 }
1467 }
1468
1469 // If we need a base pointer, set it up here. It's whatever the value of the
1470 // stack pointer is at this point. Any variable size objects will be allocated
1471 // after this, so we can still use the base pointer to reference locals.
1472 //
1473 // FIXME: Clarify FrameSetup flags here.
1474 // Note: Use emitFrameOffset() like above for FP if the FrameSetup flag is
1475 // needed.
1476 // For funclets the BP belongs to the containing function.
1477 if (!IsFunclet && RegInfo->hasBasePointer(MF)) {
1478 TII->copyPhysReg(MBB, MBBI, DL, RegInfo->getBaseRegister(), AArch64::SP,
1479 false);
1480 if (NeedsWinCFI) {
1481 HasWinCFI = true;
1482 BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
1483 .setMIFlag(MachineInstr::FrameSetup);
1484 }
1485 }
1486
1487 // The very last FrameSetup instruction indicates the end of prologue. Emit a
1488 // SEH opcode indicating the prologue end.
1489 if (NeedsWinCFI && HasWinCFI) {
1490 BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_PrologEnd))
1491 .setMIFlag(MachineInstr::FrameSetup);
1492 }
1493
1494 // SEH funclets are passed the frame pointer in X1. If the parent
1495 // function uses the base register, then the base register is used
1496 // directly, and is not retrieved from X1.
1497 if (IsFunclet && F.hasPersonalityFn()) {
1498 EHPersonality Per = classifyEHPersonality(F.getPersonalityFn());
1499 if (isAsynchronousEHPersonality(Per)) {
1500 BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::COPY), AArch64::FP)
1501 .addReg(AArch64::X1)
1502 .setMIFlag(MachineInstr::FrameSetup);
1503 MBB.addLiveIn(AArch64::X1);
1504 }
1505 }
1506
1507 if (needsFrameMoves) {
1508 // An example of the prologue:
1509 //
1510 // .globl __foo
1511 // .align 2
1512 // __foo:
1513 // Ltmp0:
1514 // .cfi_startproc
1515 // .cfi_personality 155, ___gxx_personality_v0
1516 // Leh_func_begin:
1517 // .cfi_lsda 16, Lexception33
1518 //
1519 // stp xA, xB, [sp, #-offset]!
1520 // ...
1521 // stp x28, x27, [sp, #offset-32]
1522 // stp fp, lr, [sp, #offset-16]
1523 // add fp, sp, #offset - 16
1524 // sub sp, sp, #1360
1525 //
1526 // The Stack:
1527 // +-------------------------------------------+
1528 // 10000 | ........ | ........ | ........ | ........ |
1529 // 10004 | ........ | ........ | ........ | ........ |
1530 // +-------------------------------------------+
1531 // 10008 | ........ | ........ | ........ | ........ |
1532 // 1000c | ........ | ........ | ........ | ........ |
1533 // +===========================================+
1534 // 10010 | X28 Register |
1535 // 10014 | X28 Register |
1536 // +-------------------------------------------+
1537 // 10018 | X27 Register |
1538 // 1001c | X27 Register |
1539 // +===========================================+
1540 // 10020 | Frame Pointer |
1541 // 10024 | Frame Pointer |
1542 // +-------------------------------------------+
1543 // 10028 | Link Register |
1544 // 1002c | Link Register |
1545 // +===========================================+
1546 // 10030 | ........ | ........ | ........ | ........ |
1547 // 10034 | ........ | ........ | ........ | ........ |
1548 // +-------------------------------------------+
1549 // 10038 | ........ | ........ | ........ | ........ |
1550 // 1003c | ........ | ........ | ........ | ........ |
1551 // +-------------------------------------------+
1552 //
1553 // [sp] = 10030 :: >>initial value<<
1554 // sp = 10020 :: stp fp, lr, [sp, #-16]!
1555 // fp = sp == 10020 :: mov fp, sp
1556 // [sp] == 10020 :: stp x28, x27, [sp, #-16]!
1557 // sp == 10010 :: >>final value<<
1558 //
1559 // The frame pointer (w29) points to address 10020. If we use an offset of
1560 // '16' from 'w29', we get the CFI offsets of -8 for w30, -16 for w29, -24
1561 // for w27, and -32 for w28:
1562 //
1563 // Ltmp1:
1564 // .cfi_def_cfa w29, 16
1565 // Ltmp2:
1566 // .cfi_offset w30, -8
1567 // Ltmp3:
1568 // .cfi_offset w29, -16
1569 // Ltmp4:
1570 // .cfi_offset w27, -24
1571 // Ltmp5:
1572 // .cfi_offset w28, -32
1573
1574 if (HasFP) {
1575 const int OffsetToFirstCalleeSaveFromFP =
1576 AFI->getCalleeSaveBaseToFrameRecordOffset() -
1577 AFI->getCalleeSavedStackSize();
1578 Register FramePtr = RegInfo->getFrameRegister(MF);
1579
1580 // Define the current CFA rule to use the provided FP.
1581 unsigned Reg = RegInfo->getDwarfRegNum(FramePtr, true);
1582 unsigned CFIIndex = MF.addFrameInst(
1583 MCCFIInstruction::cfiDefCfa(nullptr, Reg, FixedObject - OffsetToFirstCalleeSaveFromFP));
1584 BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
1585 .addCFIIndex(CFIIndex)
1586 .setMIFlags(MachineInstr::FrameSetup);
1587 } else {
1588 unsigned CFIIndex;
1589 if (SVEStackSize) {
1590 const TargetSubtargetInfo &STI = MF.getSubtarget();
1591 const TargetRegisterInfo &TRI = *STI.getRegisterInfo();
1592 StackOffset TotalSize =
1593 SVEStackSize + StackOffset::getFixed((int64_t)MFI.getStackSize());
1594 CFIIndex = MF.addFrameInst(createDefCFAExpressionFromSP(TRI, TotalSize));
1595 } else {
1596 // Encode the stack size of the leaf function.
1597 CFIIndex = MF.addFrameInst(
1598 MCCFIInstruction::cfiDefCfaOffset(nullptr, MFI.getStackSize()));
1599 }
1600 BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
1601 .addCFIIndex(CFIIndex)
1602 .setMIFlags(MachineInstr::FrameSetup);
1603 }
1604
1605 // Now emit the moves for whatever callee saved regs we have (including FP,
1606 // LR if those are saved).
1607 emitCalleeSavedFrameMoves(MBB, MBBI);
1608 }
1609 }
1610
1611 static void InsertReturnAddressAuth(MachineFunction &MF,
1612 MachineBasicBlock &MBB) {
1613 const auto &MFI = *MF.getInfo<AArch64FunctionInfo>();
1614 if (!MFI.shouldSignReturnAddress())
1615 return;
1616 const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
1617 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
1618
1619 MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator();
1620 DebugLoc DL;
1621 if (MBBI != MBB.end())
1622 DL = MBBI->getDebugLoc();
1623
1624 // The AUTIASP instruction assembles to a hint instruction before v8.3a so
1625 // this instruction can safely be used for any v8a architecture.
1626 // From v8.3a onwards there are optimised authenticate LR and return
1627 // instructions, namely RETA{A,B}, that can be used instead.
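// So a signed-return epilogue ends either in
//   retaa       (retab when signing with the B key)
// or, on pre-v8.3a targets where AUTI*SP executes as a hint, in
//   autiasp     (autibsp with the B key)
//   ret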
1628 if (Subtarget.hasPAuth() && MBBI != MBB.end() &&
1629 MBBI->getOpcode() == AArch64::RET_ReallyLR) {
1630 BuildMI(MBB, MBBI, DL,
1631 TII->get(MFI.shouldSignWithBKey() ? AArch64::RETAB : AArch64::RETAA))
1632 .copyImplicitOps(*MBBI);
1633 MBB.erase(MBBI);
1634 } else {
1635 BuildMI(
1636 MBB, MBBI, DL,
1637 TII->get(MFI.shouldSignWithBKey() ? AArch64::AUTIBSP : AArch64::AUTIASP))
1638 .setMIFlag(MachineInstr::FrameDestroy);
1639 }
1640 }
1641
1642 static bool isFuncletReturnInstr(const MachineInstr &MI) {
1643 switch (MI.getOpcode()) {
1644 default:
1645 return false;
1646 case AArch64::CATCHRET:
1647 case AArch64::CLEANUPRET:
1648 return true;
1649 }
1650 }
1651
1652 void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
1653 MachineBasicBlock &MBB) const {
1654 MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
1655 MachineFrameInfo &MFI = MF.getFrameInfo();
1656 const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
1657 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
1658 DebugLoc DL;
1659 bool NeedsWinCFI = needsWinCFI(MF);
1660 bool HasWinCFI = false;
1661 bool IsFunclet = false;
1662 auto WinCFI = make_scope_exit([&]() { assert(HasWinCFI == MF.hasWinCFI()); });
1663
1664 if (MBB.end() != MBBI) {
1665 DL = MBBI->getDebugLoc();
1666 IsFunclet = isFuncletReturnInstr(*MBBI);
1667 }
1668
1669 int64_t NumBytes = IsFunclet ? getWinEHFuncletFrameSize(MF)
1670 : MFI.getStackSize();
1671 AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
1672
1673 // All calls are tail calls in GHC calling conv, and functions have no
1674 // prologue/epilogue.
1675 if (MF.getFunction().getCallingConv() == CallingConv::GHC)
1676 return;
1677
1678 // How much of the stack used by incoming arguments this function is expected
1679 // to restore in this particular epilogue.
1680 int64_t ArgumentStackToRestore = getArgumentStackToRestore(MF, MBB);
1681
1682 // The stack frame should be like below,
1683 //
1684 // ---------------------- ---
1685 // | | |
1686 // | BytesInStackArgArea| CalleeArgStackSize
1687 // | (NumReusableBytes) | (of tail call)
1688 // | | ---
1689 // | | |
1690 // ---------------------| --- |
1691 // | | | |
1692 // | CalleeSavedReg | | |
1693 // | (CalleeSavedStackSize)| | |
1694 // | | | |
1695 // ---------------------| | NumBytes
1696 // | | StackSize (StackAdjustUp)
1697 // | LocalStackSize | | |
1698 // | (covering callee | | |
1699 // | args) | | |
1700 // | | | |
1701 // ---------------------- --- ---
1702 //
1703 // So NumBytes = StackSize + BytesInStackArgArea - CalleeArgStackSize
1704 // = StackSize + ArgumentPopSize
1705 //
1706 // AArch64TargetLowering::LowerCall figures out ArgumentPopSize and keeps
1707 // it as the 2nd argument of AArch64ISD::TC_RETURN.
1708
1709 auto Cleanup = make_scope_exit([&] { InsertReturnAddressAuth(MF, MBB); });
1710
1711 bool IsWin64 =
1712 Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv());
1713 unsigned FixedObject = getFixedObjectSize(MF, AFI, IsWin64, IsFunclet);
1714
1715 int64_t AfterCSRPopSize = ArgumentStackToRestore;
1716 auto PrologueSaveSize = AFI->getCalleeSavedStackSize() + FixedObject;
1717 // We cannot rely on the local stack size set in emitPrologue if the function
1718 // has funclets, as funclets have different local stack size requirements, and
1719 // the current value set in emitPrologue may be that of the containing
1720 // function.
1721 if (MF.hasEHFunclets())
1722 AFI->setLocalStackSize(NumBytes - PrologueSaveSize);
1723 if (homogeneousPrologEpilog(MF, &MBB)) {
1724 assert(!NeedsWinCFI);
1725 auto LastPopI = MBB.getFirstTerminator();
1726 if (LastPopI != MBB.begin()) {
1727 auto HomogeneousEpilog = std::prev(LastPopI);
1728 if (HomogeneousEpilog->getOpcode() == AArch64::HOM_Epilog)
1729 LastPopI = HomogeneousEpilog;
1730 }
1731
1732 // Adjust local stack
1733 emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP,
1734 StackOffset::getFixed(-AFI->getLocalStackSize()), TII,
1735 MachineInstr::FrameDestroy, false, NeedsWinCFI);
1736
1737 // SP has already been adjusted while restoring the callee-saved regs.
1738 // We've already bailed out of the case that adjusts SP for arguments.
1739 assert(AfterCSRPopSize == 0);
1740 return;
1741 }
1742 bool CombineSPBump = shouldCombineCSRLocalStackBumpInEpilogue(MBB, NumBytes);
1743 // Assume we can't combine the last pop with the sp restore.
1744
1745 if (!CombineSPBump && PrologueSaveSize != 0) {
1746 MachineBasicBlock::iterator Pop = std::prev(MBB.getFirstTerminator());
1747 while (AArch64InstrInfo::isSEHInstruction(*Pop))
1748 Pop = std::prev(Pop);
1749 // Converting the last ldp to a post-index ldp is valid only if the last
1750 // ldp's offset is 0.
1751 const MachineOperand &OffsetOp = Pop->getOperand(Pop->getNumOperands() - 1);
1752 // If the offset is 0 and the AfterCSR pop is not actually trying to
1753 // allocate more stack for arguments (in space that an untimely interrupt
1754 // may clobber), convert it to a post-index ldp.
1755 if (OffsetOp.getImm() == 0 && AfterCSRPopSize >= 0)
1756 convertCalleeSaveRestoreToSPPrePostIncDec(
1757 MBB, Pop, DL, TII, PrologueSaveSize, NeedsWinCFI, &HasWinCFI, false);
1758 else {
1759 // If not, make sure to emit an add after the last ldp.
1760 // We're doing this by transferring the size to be restored from the
1761 // adjustment *before* the CSR pops to the adjustment *after* the CSR
1762 // pops.
1763 AfterCSRPopSize += PrologueSaveSize;
1764 }
1765 }
1766
1767 // Move past the restores of the callee-saved registers.
1768 // If we plan on combining the sp bump of the local stack size and the callee
1769 // save stack size, we might need to adjust the CSR save and restore offsets.
1770 MachineBasicBlock::iterator LastPopI = MBB.getFirstTerminator();
1771 MachineBasicBlock::iterator Begin = MBB.begin();
1772 while (LastPopI != Begin) {
1773 --LastPopI;
1774 if (!LastPopI->getFlag(MachineInstr::FrameDestroy) ||
1775 IsSVECalleeSave(LastPopI)) {
1776 ++LastPopI;
1777 break;
1778 } else if (CombineSPBump)
1779 fixupCalleeSaveRestoreStackOffset(*LastPopI, AFI->getLocalStackSize(),
1780 NeedsWinCFI, &HasWinCFI);
1781 }
1782
1783 if (MF.hasWinCFI()) {
1784 // If the prologue didn't contain any SEH opcodes and didn't set the
1785 // MF.hasWinCFI() flag, assume the epilogue won't either, and skip the
1786 // EpilogStart - to avoid generating CFI for functions that don't need it.
1787 // (And as we didn't generate any prologue at all, it would be asymmetrical
1788 // to the epilogue.) By the end of the function, we assert that
1789 // HasWinCFI is equal to MF.hasWinCFI(), to verify this assumption.
1790 HasWinCFI = true;
1791 BuildMI(MBB, LastPopI, DL, TII->get(AArch64::SEH_EpilogStart))
1792 .setMIFlag(MachineInstr::FrameDestroy);
1793 }
1794
1795 if (hasFP(MF) && AFI->hasSwiftAsyncContext()) {
1796 // We need to reset FP to its untagged state on return. Bit 60 is currently
1797 // used to show the presence of an extended frame.
1798
1799 // BIC x29, x29, #0x1000_0000_0000_0000
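// 0x10fe is the ANDXri encoding (N=1, immr=3, imms=62) of the 64-bit
// logical immediate 0xEFFFFFFFFFFFFFFF, i.e. ~(1ULL << 60).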
1800 BuildMI(MBB, MBB.getFirstTerminator(), DL, TII->get(AArch64::ANDXri),
1801 AArch64::FP)
1802 .addUse(AArch64::FP)
1803 .addImm(0x10fe)
1804 .setMIFlag(MachineInstr::FrameDestroy);
1805 }
1806
1807 const StackOffset &SVEStackSize = getSVEStackSize(MF);
1808
1809 // If there is a single SP update, insert it before the ret and we're done.
1810 if (CombineSPBump) {
1811 assert(!SVEStackSize && "Cannot combine SP bump with SVE");
1812 emitFrameOffset(MBB, MBB.getFirstTerminator(), DL, AArch64::SP, AArch64::SP,
1813 StackOffset::getFixed(NumBytes + (int64_t)AfterCSRPopSize),
1814 TII, MachineInstr::FrameDestroy, false, NeedsWinCFI,
1815 &HasWinCFI);
1816 if (HasWinCFI)
1817 BuildMI(MBB, MBB.getFirstTerminator(), DL,
1818 TII->get(AArch64::SEH_EpilogEnd))
1819 .setMIFlag(MachineInstr::FrameDestroy);
1820 return;
1821 }
1822
1823 NumBytes -= PrologueSaveSize;
1824 assert(NumBytes >= 0 && "Negative stack allocation size!?");
1825
1826 // Process the SVE callee-saves to determine what space needs to be
1827 // deallocated.
1828 StackOffset DeallocateBefore = {}, DeallocateAfter = SVEStackSize;
1829 MachineBasicBlock::iterator RestoreBegin = LastPopI, RestoreEnd = LastPopI;
1830 if (int64_t CalleeSavedSize = AFI->getSVECalleeSavedStackSize()) {
1831 RestoreBegin = std::prev(RestoreEnd);
1832 while (RestoreBegin != MBB.begin() &&
1833 IsSVECalleeSave(std::prev(RestoreBegin)))
1834 --RestoreBegin;
1835
1836 assert(IsSVECalleeSave(RestoreBegin) &&
1837 IsSVECalleeSave(std::prev(RestoreEnd)) && "Unexpected instruction");
1838
1839 StackOffset CalleeSavedSizeAsOffset =
1840 StackOffset::getScalable(CalleeSavedSize);
1841 DeallocateBefore = SVEStackSize - CalleeSavedSizeAsOffset;
1842 DeallocateAfter = CalleeSavedSizeAsOffset;
1843 }
1844
1845 // Deallocate the SVE area.
1846 if (SVEStackSize) {
1847 if (AFI->isStackRealigned()) {
1848 if (int64_t CalleeSavedSize = AFI->getSVECalleeSavedStackSize())
1849 // Set SP to start of SVE callee-save area from which they can
1850 // be reloaded. The code below will deallocate the stack space
1851 // by moving FP -> SP.
1852 emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, AArch64::FP,
1853 StackOffset::getScalable(-CalleeSavedSize), TII,
1854 MachineInstr::FrameDestroy);
1855 } else {
1856 if (AFI->getSVECalleeSavedStackSize()) {
1857 // Deallocate the non-SVE locals first before we can deallocate (and
1858 // restore callee saves) from the SVE area.
1859 emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, AArch64::SP,
1860 StackOffset::getFixed(NumBytes), TII,
1861 MachineInstr::FrameDestroy);
1862 NumBytes = 0;
1863 }
1864
1865 emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, AArch64::SP,
1866 DeallocateBefore, TII, MachineInstr::FrameDestroy);
1867
1868 emitFrameOffset(MBB, RestoreEnd, DL, AArch64::SP, AArch64::SP,
1869 DeallocateAfter, TII, MachineInstr::FrameDestroy);
1870 }
1871 }
1872
1873 if (!hasFP(MF)) {
1874 bool RedZone = canUseRedZone(MF);
1875 // If this was a redzone leaf function, we don't need to restore the
1876 // stack pointer (but we may need to pop stack args for fastcc).
1877 if (RedZone && AfterCSRPopSize == 0)
1878 return;
1879
1880 bool NoCalleeSaveRestore = PrologueSaveSize == 0;
1881 int64_t StackRestoreBytes = RedZone ? 0 : NumBytes;
1882 if (NoCalleeSaveRestore)
1883 StackRestoreBytes += AfterCSRPopSize;
1884
1885 // If we were able to combine the local stack pop with the argument pop,
1886 // then we're done.
1887 bool Done = NoCalleeSaveRestore || AfterCSRPopSize == 0;
1888
1889 // If we're done after this, make sure to help the load store optimizer.
1890 if (Done)
1891 adaptForLdStOpt(MBB, MBB.getFirstTerminator(), LastPopI);
1892
1893 emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP,
1894 StackOffset::getFixed(StackRestoreBytes), TII,
1895 MachineInstr::FrameDestroy, false, NeedsWinCFI, &HasWinCFI);
1896 if (Done) {
1897 if (HasWinCFI) {
1898 BuildMI(MBB, MBB.getFirstTerminator(), DL,
1899 TII->get(AArch64::SEH_EpilogEnd))
1900 .setMIFlag(MachineInstr::FrameDestroy);
1901 }
1902 return;
1903 }
1904
1905 NumBytes = 0;
1906 }
1907
1908 // Restore the original stack pointer.
1909 // FIXME: Rather than doing the math here, we should instead just use
1910 // non-post-indexed loads for the restores if we aren't actually going to
1911 // be able to save any instructions.
1912 if (!IsFunclet && (MFI.hasVarSizedObjects() || AFI->isStackRealigned())) {
1913 emitFrameOffset(
1914 MBB, LastPopI, DL, AArch64::SP, AArch64::FP,
1915 StackOffset::getFixed(-AFI->getCalleeSaveBaseToFrameRecordOffset()),
1916 TII, MachineInstr::FrameDestroy, false, NeedsWinCFI);
1917 } else if (NumBytes)
1918 emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP,
1919 StackOffset::getFixed(NumBytes), TII,
1920 MachineInstr::FrameDestroy, false, NeedsWinCFI);
1921
1922 // This must be placed after the callee-save restore code because that code
1923 // assumes the SP is at the same location as it was after the callee-save save
1924 // code in the prologue.
1925 if (AfterCSRPopSize) {
1926 assert(AfterCSRPopSize > 0 && "attempting to reallocate arg stack that an "
1927 "interrupt may have clobbered");
1928 // Find an insertion point for the first ldp so that it goes before the
1929 // shadow call stack epilog instruction. This ensures that the restore of
1930 // lr from x18 is placed after the restore from sp.
1931 auto FirstSPPopI = MBB.getFirstTerminator();
1932 while (FirstSPPopI != Begin) {
1933 auto Prev = std::prev(FirstSPPopI);
1934 if (Prev->getOpcode() != AArch64::LDRXpre ||
1935 Prev->getOperand(0).getReg() == AArch64::SP)
1936 break;
1937 FirstSPPopI = Prev;
1938 }
1939
1940 adaptForLdStOpt(MBB, FirstSPPopI, LastPopI);
1941
1942 emitFrameOffset(MBB, FirstSPPopI, DL, AArch64::SP, AArch64::SP,
1943 StackOffset::getFixed(AfterCSRPopSize), TII,
1944 MachineInstr::FrameDestroy, false, NeedsWinCFI, &HasWinCFI);
1945 }
1946 if (HasWinCFI)
1947 BuildMI(MBB, MBB.getFirstTerminator(), DL, TII->get(AArch64::SEH_EpilogEnd))
1948 .setMIFlag(MachineInstr::FrameDestroy);
1949 }
1950
1951 /// getFrameIndexReference - Provide a base+offset reference to an FI slot for
1952 /// debug info. It's the same as what we use for resolving the code-gen
1953 /// references for now. FIXME: This can go wrong when references are
1954 /// SP-relative and simple call frames aren't used.
1955 StackOffset
1956 AArch64FrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI,
1957 Register &FrameReg) const {
1958 return resolveFrameIndexReference(
1959 MF, FI, FrameReg,
1960 /*PreferFP=*/
1961 MF.getFunction().hasFnAttribute(Attribute::SanitizeHWAddress),
1962 /*ForSimm=*/false);
1963 }
1964
1965 StackOffset
1966 AArch64FrameLowering::getNonLocalFrameIndexReference(const MachineFunction &MF,
1967 int FI) const {
1968 return StackOffset::getFixed(getSEHFrameIndexOffset(MF, FI));
1969 }
1970
1971 static StackOffset getFPOffset(const MachineFunction &MF,
1972 int64_t ObjectOffset) {
1973 const auto *AFI = MF.getInfo<AArch64FunctionInfo>();
1974 const auto &Subtarget = MF.getSubtarget<AArch64Subtarget>();
1975 bool IsWin64 =
1976 Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv());
1977 unsigned FixedObject =
1978 getFixedObjectSize(MF, AFI, IsWin64, /*IsFunclet=*/false);
1979 int64_t CalleeSaveSize = AFI->getCalleeSavedStackSize(MF.getFrameInfo());
1980 int64_t FPAdjust =
1981 CalleeSaveSize - AFI->getCalleeSaveBaseToFrameRecordOffset();
1982 return StackOffset::getFixed(ObjectOffset + FixedObject + FPAdjust);
1983 }
1984
1985 static StackOffset getStackOffset(const MachineFunction &MF,
1986 int64_t ObjectOffset) {
1987 const auto &MFI = MF.getFrameInfo();
1988 return StackOffset::getFixed(ObjectOffset + (int64_t)MFI.getStackSize());
1989 }
1990
1991 // TODO: This function currently does not work for scalable vectors.
1992 int AArch64FrameLowering::getSEHFrameIndexOffset(const MachineFunction &MF,
1993 int FI) const {
1994 const auto *RegInfo = static_cast<const AArch64RegisterInfo *>(
1995 MF.getSubtarget().getRegisterInfo());
1996 int ObjectOffset = MF.getFrameInfo().getObjectOffset(FI);
1997 return RegInfo->getLocalAddressRegister(MF) == AArch64::FP
1998 ? getFPOffset(MF, ObjectOffset).getFixed()
1999 : getStackOffset(MF, ObjectOffset).getFixed();
2000 }
2001
2002 StackOffset AArch64FrameLowering::resolveFrameIndexReference(
2003 const MachineFunction &MF, int FI, Register &FrameReg, bool PreferFP,
2004 bool ForSimm) const {
2005 const auto &MFI = MF.getFrameInfo();
2006 int64_t ObjectOffset = MFI.getObjectOffset(FI);
2007 bool isFixed = MFI.isFixedObjectIndex(FI);
2008 bool isSVE = MFI.getStackID(FI) == TargetStackID::ScalableVector;
2009 return resolveFrameOffsetReference(MF, ObjectOffset, isFixed, isSVE, FrameReg,
2010 PreferFP, ForSimm);
2011 }
2012
2013 StackOffset AArch64FrameLowering::resolveFrameOffsetReference(
2014 const MachineFunction &MF, int64_t ObjectOffset, bool isFixed, bool isSVE,
2015 Register &FrameReg, bool PreferFP, bool ForSimm) const {
2016 const auto &MFI = MF.getFrameInfo();
2017 const auto *RegInfo = static_cast<const AArch64RegisterInfo *>(
2018 MF.getSubtarget().getRegisterInfo());
2019 const auto *AFI = MF.getInfo<AArch64FunctionInfo>();
2020 const auto &Subtarget = MF.getSubtarget<AArch64Subtarget>();
2021
2022 int64_t FPOffset = getFPOffset(MF, ObjectOffset).getFixed();
2023 int64_t Offset = getStackOffset(MF, ObjectOffset).getFixed();
2024 bool isCSR =
2025 !isFixed && ObjectOffset >= -((int)AFI->getCalleeSavedStackSize(MFI));
2026
2027 const StackOffset &SVEStackSize = getSVEStackSize(MF);
2028
2029 // Use frame pointer to reference fixed objects. Use it for locals if
2030 // there are VLAs or a dynamically realigned SP (and thus the SP isn't
2031 // reliable as a base). Make sure useFPForScavengingIndex() does the
2032 // right thing for the emergency spill slot.
2033 bool UseFP = false;
2034 if (AFI->hasStackFrame() && !isSVE) {
2035 // We shouldn't prefer using the FP when there is an SVE area
2036 // in between the FP and the non-SVE locals/spills.
2037 PreferFP &= !SVEStackSize;
2038
2039 // Note: Keeping the following as multiple 'if' statements rather than
2040 // merging to a single expression for readability.
2041 //
2042 // Argument access should always use the FP.
2043 if (isFixed) {
2044 UseFP = hasFP(MF);
2045 } else if (isCSR && RegInfo->hasStackRealignment(MF)) {
2046 // References to the CSR area must use FP if we're re-aligning the stack
2047 // since the dynamically-sized alignment padding is between the SP/BP and
2048 // the CSR area.
2049 assert(hasFP(MF) && "Re-aligned stack must have frame pointer");
2050 UseFP = true;
2051 } else if (hasFP(MF) && !RegInfo->hasStackRealignment(MF)) {
2052 // If the FPOffset is negative and we're producing a signed immediate, we
2053 // have to keep in mind that the available offset range for negative
2054 // offsets is smaller than for positive ones. If an offset is available
2055 // via the FP and the SP, use whichever is closest.
2056 bool FPOffsetFits = !ForSimm || FPOffset >= -256;
2057 PreferFP |= Offset > -FPOffset;
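// For example (assuming PreferFP was not already set): with FPOffset =
// -200 and Offset = 250 the FP-relative form is closer, so FP is
// preferred; with Offset = 150 the SP form wins.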
2058
2059 if (MFI.hasVarSizedObjects()) {
2060 // If we have variable sized objects, we can use either FP or BP, as the
2061 // SP offset is unknown. We can use the base pointer if we have one and
2062 // FP is not preferred. If not, we're stuck with using FP.
2063 bool CanUseBP = RegInfo->hasBasePointer(MF);
2064 if (FPOffsetFits && CanUseBP) // Both are ok. Pick the best.
2065 UseFP = PreferFP;
2066 else if (!CanUseBP) // Can't use BP. Forced to use FP.
2067 UseFP = true;
2068 // else we can use BP and FP, but the offset from FP won't fit.
2069 // That will make us scavenge registers which we can probably avoid by
2070 // using BP. If it won't fit for BP either, we'll scavenge anyway.
2071 } else if (FPOffset >= 0) {
2072 // Use SP or FP, whichever gives us the best chance of the offset
2073 // being in range for direct access. If the FPOffset is positive,
2074 // that'll always be best, as the SP will be even further away.
2075 UseFP = true;
2076 } else if (MF.hasEHFunclets() && !RegInfo->hasBasePointer(MF)) {
2077 // Funclets access the locals contained in the parent's stack frame
2078 // via the frame pointer, so we have to use the FP in the parent
2079 // function.
2080 (void) Subtarget;
2081 assert(
2082 Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv()) &&
2083 "Funclets should only be present on Win64");
2084 UseFP = true;
2085 } else {
2086 // We have the choice between FP and (SP or BP).
2087 if (FPOffsetFits && PreferFP) // If FP is the best fit, use it.
2088 UseFP = true;
2089 }
2090 }
2091 }
2092
2093 assert(
2094 ((isFixed || isCSR) || !RegInfo->hasStackRealignment(MF) || !UseFP) &&
2095 "In the presence of dynamic stack pointer realignment, "
2096 "non-argument/CSR objects cannot be accessed through the frame pointer");
2097
2098 if (isSVE) {
2099 StackOffset FPOffset =
2100 StackOffset::get(-AFI->getCalleeSaveBaseToFrameRecordOffset(), ObjectOffset);
2101 StackOffset SPOffset =
2102 SVEStackSize +
2103 StackOffset::get(MFI.getStackSize() - AFI->getCalleeSavedStackSize(),
2104 ObjectOffset);
2105 // Always use the FP for SVE spills if available and beneficial.
2106 if (hasFP(MF) && (SPOffset.getFixed() ||
2107 FPOffset.getScalable() < SPOffset.getScalable() ||
2108 RegInfo->hasStackRealignment(MF))) {
2109 FrameReg = RegInfo->getFrameRegister(MF);
2110 return FPOffset;
2111 }
2112
2113 FrameReg = RegInfo->hasBasePointer(MF) ? RegInfo->getBaseRegister()
2114 : (unsigned)AArch64::SP;
2115 return SPOffset;
2116 }
2117
2118 StackOffset ScalableOffset = {};
2119 if (UseFP && !(isFixed || isCSR))
2120 ScalableOffset = -SVEStackSize;
2121 if (!UseFP && (isFixed || isCSR))
2122 ScalableOffset = SVEStackSize;
2123
2124 if (UseFP) {
2125 FrameReg = RegInfo->getFrameRegister(MF);
2126 return StackOffset::getFixed(FPOffset) + ScalableOffset;
2127 }
2128
2129 // Use the base pointer if we have one.
2130 if (RegInfo->hasBasePointer(MF))
2131 FrameReg = RegInfo->getBaseRegister();
2132 else {
2133 assert(!MFI.hasVarSizedObjects() &&
2134 "Can't use SP when we have var sized objects.");
2135 FrameReg = AArch64::SP;
2136 // If we're using the red zone for this function, the SP won't actually
2137 // be adjusted, so the offsets will be negative. They're also all
2138 // within range of the signed 9-bit immediate instructions.
2139 if (canUseRedZone(MF))
2140 Offset -= AFI->getLocalStackSize();
2141 }
2142
2143 return StackOffset::getFixed(Offset) + ScalableOffset;
2144 }
2145
2146 static unsigned getPrologueDeath(MachineFunction &MF, unsigned Reg) {
2147 // Do not set a kill flag on values that are also marked as live-in. This
2148 // happens with the @llvm.returnaddress intrinsic and with arguments passed in
2149 // callee saved registers.
2150 // Omitting the kill flags is conservatively correct even if the live-in
2151 // is not used after all.
2152 bool IsLiveIn = MF.getRegInfo().isLiveIn(Reg);
2153 return getKillRegState(!IsLiveIn);
2154 }
2155
2156 static bool produceCompactUnwindFrame(MachineFunction &MF) {
2157 const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
2158 AttributeList Attrs = MF.getFunction().getAttributes();
2159 return Subtarget.isTargetMachO() &&
2160 !(Subtarget.getTargetLowering()->supportSwiftError() &&
2161 Attrs.hasAttrSomewhere(Attribute::SwiftError)) &&
2162 MF.getFunction().getCallingConv() != CallingConv::SwiftTail;
2163 }
2164
2165 static bool invalidateWindowsRegisterPairing(unsigned Reg1, unsigned Reg2,
2166 bool NeedsWinCFI, bool IsFirst) {
2167 // If we are generating register pairs for a Windows function that requires
2168 // EH support, then pair consecutive registers only. There are no unwind
2169 // opcodes for saves/restores of non-consecutive register pairs.
2170 // The unwind opcodes are save_regp, save_regp_x, save_fregp, save_fregp_x,
2171 // save_lrpair.
2172 // https://docs.microsoft.com/en-us/cpp/build/arm64-exception-handling
2173
2174 if (Reg2 == AArch64::FP)
2175 return true;
2176 if (!NeedsWinCFI)
2177 return false;
2178 if (Reg2 == Reg1 + 1)
2179 return false;
2180 // If pairing a GPR with LR, the pair can be described by the save_lrpair
2181 // opcode. If this is the first register pair, it would end up with a
2182 // predecrement, but there's no save_lrpair_x opcode, so we can only do this
2183 // if LR is paired with something else than the first register.
2184 // The save_lrpair opcode requires the first register to be an odd one.
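// E.g. (x21, lr) can be paired via save_lrpair, but (x20, lr) cannot, and
// neither can (x19, lr) when it is the first, pre-decremented pair, since
// there is no save_lrpair_x.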
2185 if (Reg1 >= AArch64::X19 && Reg1 <= AArch64::X27 &&
2186 (Reg1 - AArch64::X19) % 2 == 0 && Reg2 == AArch64::LR && !IsFirst)
2187 return false;
2188 return true;
2189 }
2190
2191 /// Returns true if Reg1 and Reg2 cannot be paired using a ldp/stp instruction.
2192 /// WindowsCFI requires that only consecutive registers can be paired.
2193 /// LR and FP need to be allocated together when the frame needs to save
2194 /// the frame-record. This means any other register pairing with LR is invalid.
2195 static bool invalidateRegisterPairing(unsigned Reg1, unsigned Reg2,
2196 bool UsesWinAAPCS, bool NeedsWinCFI,
2197 bool NeedsFrameRecord, bool IsFirst) {
2198 if (UsesWinAAPCS)
2199 return invalidateWindowsRegisterPairing(Reg1, Reg2, NeedsWinCFI, IsFirst);
2200
2201 // If we need to store the frame record, don't pair any register
2202 // with LR other than FP.
2203 if (NeedsFrameRecord)
2204 return Reg2 == AArch64::LR;
2205
2206 return false;
2207 }
2208
2209 namespace {
2210
2211 struct RegPairInfo {
2212 unsigned Reg1 = AArch64::NoRegister;
2213 unsigned Reg2 = AArch64::NoRegister;
2214 int FrameIdx;
2215 int Offset;
2216 enum RegType { GPR, FPR64, FPR128, PPR, ZPR } Type;
2217
2218 RegPairInfo() = default;
2219
2220 bool isPaired() const { return Reg2 != AArch64::NoRegister; }
2221
2222 unsigned getScale() const {
2223 switch (Type) {
2224 case PPR:
2225 return 2;
2226 case GPR:
2227 case FPR64:
2228 return 8;
2229 case ZPR:
2230 case FPR128:
2231 return 16;
2232 }
2233 llvm_unreachable("Unsupported type");
2234 }
2235
2236 bool isScalable() const { return Type == PPR || Type == ZPR; }
2237 };
2238
2239 } // end anonymous namespace
2240
2241 static void computeCalleeSaveRegisterPairs(
2242 MachineFunction &MF, ArrayRef<CalleeSavedInfo> CSI,
2243 const TargetRegisterInfo *TRI, SmallVectorImpl<RegPairInfo> &RegPairs,
2244 bool &NeedShadowCallStackProlog, bool NeedsFrameRecord) {
2245
2246 if (CSI.empty())
2247 return;
2248
2249 bool IsWindows = isTargetWindows(MF);
2250 bool NeedsWinCFI = needsWinCFI(MF);
2251 AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
2252 MachineFrameInfo &MFI = MF.getFrameInfo();
2253 CallingConv::ID CC = MF.getFunction().getCallingConv();
2254 unsigned Count = CSI.size();
2255 (void)CC;
2256 // MachO's compact unwind format relies on all registers being stored in
2257 // pairs.
2258 assert((!produceCompactUnwindFrame(MF) ||
2259 CC == CallingConv::PreserveMost ||
2260 (Count & 1) == 0) &&
2261 "Odd number of callee-saved regs to spill!");
2262 int ByteOffset = AFI->getCalleeSavedStackSize();
2263 int StackFillDir = -1;
2264 int RegInc = 1;
2265 unsigned FirstReg = 0;
2266 if (NeedsWinCFI) {
2267 // For WinCFI, fill the stack from the bottom up.
2268 ByteOffset = 0;
2269 StackFillDir = 1;
2270 // As the CSI array is reversed to match PrologEpilogInserter, iterate
2271 // backwards, to pair up registers starting from lower numbered registers.
2272 RegInc = -1;
2273 FirstReg = Count - 1;
2274 }
2275 int ScalableByteOffset = AFI->getSVECalleeSavedStackSize();
2276 bool NeedGapToAlignStack = AFI->hasCalleeSaveStackFreeSpace();
2277
2278 // When iterating backwards, the loop condition relies on unsigned wraparound.
2279 for (unsigned i = FirstReg; i < Count; i += RegInc) {
2280 RegPairInfo RPI;
2281 RPI.Reg1 = CSI[i].getReg();
2282
2283 if (AArch64::GPR64RegClass.contains(RPI.Reg1))
2284 RPI.Type = RegPairInfo::GPR;
2285 else if (AArch64::FPR64RegClass.contains(RPI.Reg1))
2286 RPI.Type = RegPairInfo::FPR64;
2287 else if (AArch64::FPR128RegClass.contains(RPI.Reg1))
2288 RPI.Type = RegPairInfo::FPR128;
2289 else if (AArch64::ZPRRegClass.contains(RPI.Reg1))
2290 RPI.Type = RegPairInfo::ZPR;
2291 else if (AArch64::PPRRegClass.contains(RPI.Reg1))
2292 RPI.Type = RegPairInfo::PPR;
2293 else
2294 llvm_unreachable("Unsupported register class.");
2295
2296 // Add the next reg to the pair if it is in the same register class.
2297 if (unsigned(i + RegInc) < Count) {
2298 unsigned NextReg = CSI[i + RegInc].getReg();
2299 bool IsFirst = i == FirstReg;
2300 switch (RPI.Type) {
2301 case RegPairInfo::GPR:
2302 if (AArch64::GPR64RegClass.contains(NextReg) &&
2303 !invalidateRegisterPairing(RPI.Reg1, NextReg, IsWindows,
2304 NeedsWinCFI, NeedsFrameRecord, IsFirst))
2305 RPI.Reg2 = NextReg;
2306 break;
2307 case RegPairInfo::FPR64:
2308 if (AArch64::FPR64RegClass.contains(NextReg) &&
2309 !invalidateWindowsRegisterPairing(RPI.Reg1, NextReg, NeedsWinCFI,
2310 IsFirst))
2311 RPI.Reg2 = NextReg;
2312 break;
2313 case RegPairInfo::FPR128:
2314 if (AArch64::FPR128RegClass.contains(NextReg))
2315 RPI.Reg2 = NextReg;
2316 break;
2317 case RegPairInfo::PPR:
2318 case RegPairInfo::ZPR:
2319 break;
2320 }
2321 }
2322
2323 // If either of the registers to be saved is the lr register, it means that
2324 // we also need to save lr in the shadow call stack.
2325 if ((RPI.Reg1 == AArch64::LR || RPI.Reg2 == AArch64::LR) &&
2326 MF.getFunction().hasFnAttribute(Attribute::ShadowCallStack)) {
2327 if (!MF.getSubtarget<AArch64Subtarget>().isXRegisterReserved(18))
2328 report_fatal_error("Must reserve x18 to use shadow call stack");
2329 NeedShadowCallStackProlog = true;
2330 }
2331
2332 // GPRs and FPRs are saved in pairs of 64-bit regs. We expect the CSI
2333 // list to come in sorted by frame index so that we can issue the store
2334 // pair instructions directly. Assert if we see anything otherwise.
2335 //
2336 // The order of the registers in the list is controlled by
2337 // getCalleeSavedRegs(), so they will always be in-order, as well.
2338 assert((!RPI.isPaired() ||
2339 (CSI[i].getFrameIdx() + RegInc == CSI[i + RegInc].getFrameIdx())) &&
2340 "Out of order callee saved regs!");
2341
2342 assert((!RPI.isPaired() || !NeedsFrameRecord || RPI.Reg2 != AArch64::FP ||
2343 RPI.Reg1 == AArch64::LR) &&
2344 "FrameRecord must be allocated together with LR");
2345
2346 // Windows AAPCS has FP and LR reversed.
2347 assert((!RPI.isPaired() || !NeedsFrameRecord || RPI.Reg1 != AArch64::FP ||
2348 RPI.Reg2 == AArch64::LR) &&
2349 "FrameRecord must be allocated together with LR");
2350
2351 // MachO's compact unwind format relies on all registers being stored in
2352 // adjacent register pairs.
2353 assert((!produceCompactUnwindFrame(MF) ||
2354 CC == CallingConv::PreserveMost ||
2355 (RPI.isPaired() &&
2356 ((RPI.Reg1 == AArch64::LR && RPI.Reg2 == AArch64::FP) ||
2357 RPI.Reg1 + 1 == RPI.Reg2))) &&
2358 "Callee-save registers not saved as adjacent register pair!");
2359
2360 RPI.FrameIdx = CSI[i].getFrameIdx();
2361 if (NeedsWinCFI &&
2362 RPI.isPaired()) // RPI.FrameIdx must be the lower index of the pair
2363 RPI.FrameIdx = CSI[i + RegInc].getFrameIdx();
2364
2365 int Scale = RPI.getScale();
2366
2367 int OffsetPre = RPI.isScalable() ? ScalableByteOffset : ByteOffset;
2368 assert(OffsetPre % Scale == 0);
2369
2370 if (RPI.isScalable())
2371 ScalableByteOffset += StackFillDir * Scale;
2372 else
2373 ByteOffset += StackFillDir * (RPI.isPaired() ? 2 * Scale : Scale);
2374
2375 // Swift's async context is directly before FP, so allocate an extra
2376 // 8 bytes for it.
2377 if (NeedsFrameRecord && AFI->hasSwiftAsyncContext() &&
2378 RPI.Reg2 == AArch64::FP)
2379 ByteOffset += StackFillDir * 8;
2380
2381 assert(!(RPI.isScalable() && RPI.isPaired()) &&
2382 "Paired spill/fill instructions don't exist for SVE vectors");
2383
2384 // Round up size of non-pair to pair size if we need to pad the
2385 // callee-save area to ensure 16-byte alignment.
2386 if (NeedGapToAlignStack && !NeedsWinCFI &&
2387 !RPI.isScalable() && RPI.Type != RegPairInfo::FPR128 &&
2388 !RPI.isPaired() && ByteOffset % 16 != 0) {
2389 ByteOffset += 8 * StackFillDir;
2390 assert(MFI.getObjectAlign(RPI.FrameIdx) <= Align(16));
2391 // A stack frame with a gap looks like this, bottom up:
2392 // d9, d8. x21, gap, x20, x19.
2393 // Set extra alignment on the x21 object to create the gap above it.
2394 MFI.setObjectAlignment(RPI.FrameIdx, Align(16));
2395 NeedGapToAlignStack = false;
2396 }
2397
2398 int OffsetPost = RPI.isScalable() ? ScalableByteOffset : ByteOffset;
2399 assert(OffsetPost % Scale == 0);
2400 // If filling top down (default), we want the offset after incrementing it.
2401 // If filling bottom up (WinCFI) we need the original offset.
2402 int Offset = NeedsWinCFI ? OffsetPre : OffsetPost;
2403
2404 // The FP, LR pair goes 8 bytes into our expanded 24-byte slot so that the
2405 // Swift context can directly precede FP.
2406 if (NeedsFrameRecord && AFI->hasSwiftAsyncContext() &&
2407 RPI.Reg2 == AArch64::FP)
2408 Offset += 8;
2409 RPI.Offset = Offset / Scale;
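// E.g. a GPR pair at ByteOffset 32 gets RPI.Offset = 32 / 8 = 4; the
// STPXi/LDPXi immediate is scaled back up by 8, yielding [sp, #32].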
2410
2411 assert(((!RPI.isScalable() && RPI.Offset >= -64 && RPI.Offset <= 63) ||
2412 (RPI.isScalable() && RPI.Offset >= -256 && RPI.Offset <= 255)) &&
2413 "Offset out of bounds for LDP/STP immediate");
2414
2415 // Save the offset to frame record so that the FP register can point to the
2416 // innermost frame record (spilled FP and LR registers).
2417 if (NeedsFrameRecord && ((!IsWindows && RPI.Reg1 == AArch64::LR &&
2418 RPI.Reg2 == AArch64::FP) ||
2419 (IsWindows && RPI.Reg1 == AArch64::FP &&
2420 RPI.Reg2 == AArch64::LR)))
2421 AFI->setCalleeSaveBaseToFrameRecordOffset(Offset);
2422
2423 RegPairs.push_back(RPI);
2424 if (RPI.isPaired())
2425 i += RegInc;
2426 }
2427 if (NeedsWinCFI) {
2428 // If we need an alignment gap in the stack, align the topmost stack
2429 // object. A stack frame with a gap looks like this, bottom up:
2430 // x19, d8. d9, gap.
2431 // Set extra alignment on the topmost stack object (the first element in
2432 // CSI, which goes top down), to create the gap above it.
2433 if (AFI->hasCalleeSaveStackFreeSpace())
2434 MFI.setObjectAlignment(CSI[0].getFrameIdx(), Align(16));
2435 // We iterated bottom up over the registers; flip RegPairs back to top
2436 // down order.
2437 std::reverse(RegPairs.begin(), RegPairs.end());
2438 }
2439 }
2440
2441 bool AArch64FrameLowering::spillCalleeSavedRegisters(
2442 MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
2443 ArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const {
2444 MachineFunction &MF = *MBB.getParent();
2445 const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
2446 bool NeedsWinCFI = needsWinCFI(MF);
2447 DebugLoc DL;
2448 SmallVector<RegPairInfo, 8> RegPairs;
2449
2450 bool NeedShadowCallStackProlog = false;
2451 computeCalleeSaveRegisterPairs(MF, CSI, TRI, RegPairs,
2452 NeedShadowCallStackProlog, hasFP(MF));
2453 const MachineRegisterInfo &MRI = MF.getRegInfo();
2454
2455 if (NeedShadowCallStackProlog) {
2456 // Shadow call stack prolog: str x30, [x18], #8
2457 BuildMI(MBB, MI, DL, TII.get(AArch64::STRXpost))
2458 .addReg(AArch64::X18, RegState::Define)
2459 .addReg(AArch64::LR)
2460 .addReg(AArch64::X18)
2461 .addImm(8)
2462 .setMIFlag(MachineInstr::FrameSetup);
2463
2464 if (NeedsWinCFI)
2465 BuildMI(MBB, MI, DL, TII.get(AArch64::SEH_Nop))
2466 .setMIFlag(MachineInstr::FrameSetup);
2467
2468 if (!MF.getFunction().hasFnAttribute(Attribute::NoUnwind)) {
2469 // Emit a CFI instruction that causes 8 to be subtracted from the value of
2470 // x18 when unwinding past this frame.
2471 static const char CFIInst[] = {
2472 dwarf::DW_CFA_val_expression,
2473 18, // register
2474 2, // length
2475 static_cast<char>(unsigned(dwarf::DW_OP_breg18)),
2476 static_cast<char>(-8) & 0x7f, // addend (sleb128)
2477 };
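// I.e. DW_CFA_val_expression(reg 18, {DW_OP_breg18, sleb128(-8)}): the
// caller's x18 is recovered as the current x18 minus 8.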
2478 unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createEscape(
2479 nullptr, StringRef(CFIInst, sizeof(CFIInst))));
2480 BuildMI(MBB, MI, DL, TII.get(AArch64::CFI_INSTRUCTION))
2481 .addCFIIndex(CFIIndex)
2482 .setMIFlag(MachineInstr::FrameSetup);
2483 }
2484
2485 // This instruction also makes x18 live-in to the entry block.
2486 MBB.addLiveIn(AArch64::X18);
2487 }
2488
2489 if (homogeneousPrologEpilog(MF)) {
2490 auto MIB = BuildMI(MBB, MI, DL, TII.get(AArch64::HOM_Prolog))
2491 .setMIFlag(MachineInstr::FrameSetup);
2492
2493 for (auto &RPI : RegPairs) {
2494 MIB.addReg(RPI.Reg1);
2495 MIB.addReg(RPI.Reg2);
2496
2497 // Update register live in.
2498 if (!MRI.isReserved(RPI.Reg1))
2499 MBB.addLiveIn(RPI.Reg1);
2500 if (!MRI.isReserved(RPI.Reg2))
2501 MBB.addLiveIn(RPI.Reg2);
2502 }
2503 return true;
2504 }
2505 for (auto RPII = RegPairs.rbegin(), RPIE = RegPairs.rend(); RPII != RPIE;
2506 ++RPII) {
2507 RegPairInfo RPI = *RPII;
2508 unsigned Reg1 = RPI.Reg1;
2509 unsigned Reg2 = RPI.Reg2;
2510 unsigned StrOpc;
2511
2512 // Issue sequence of spills for cs regs. The first spill may be converted
2513 // to a pre-decrement store later by emitPrologue if the callee-save stack
2514 // area allocation can't be combined with the local stack area allocation.
2515 // For example:
2516 // stp x22, x21, [sp, #0] // addImm(+0)
2517 // stp x20, x19, [sp, #16] // addImm(+2)
2518 // stp fp, lr, [sp, #32] // addImm(+4)
2519 // Rationale: This sequence saves uop updates compared to a sequence of
2520 // pre-increment spills like stp xi,xj,[sp,#-16]!
2521 // Note: Similar rationale and sequence for restores in epilog.
2522 unsigned Size;
2523 Align Alignment;
2524 switch (RPI.Type) {
2525 case RegPairInfo::GPR:
2526 StrOpc = RPI.isPaired() ? AArch64::STPXi : AArch64::STRXui;
2527 Size = 8;
2528 Alignment = Align(8);
2529 break;
2530 case RegPairInfo::FPR64:
2531 StrOpc = RPI.isPaired() ? AArch64::STPDi : AArch64::STRDui;
2532 Size = 8;
2533 Alignment = Align(8);
2534 break;
2535 case RegPairInfo::FPR128:
2536 StrOpc = RPI.isPaired() ? AArch64::STPQi : AArch64::STRQui;
2537 Size = 16;
2538 Alignment = Align(16);
2539 break;
2540 case RegPairInfo::ZPR:
2541 StrOpc = AArch64::STR_ZXI;
2542 Size = 16;
2543 Alignment = Align(16);
2544 break;
2545 case RegPairInfo::PPR:
2546 StrOpc = AArch64::STR_PXI;
2547 Size = 2;
2548 Alignment = Align(2);
2549 break;
2550 }
2551 LLVM_DEBUG(dbgs() << "CSR spill: (" << printReg(Reg1, TRI);
2552 if (RPI.isPaired()) dbgs() << ", " << printReg(Reg2, TRI);
2553 dbgs() << ") -> fi#(" << RPI.FrameIdx;
2554 if (RPI.isPaired()) dbgs() << ", " << RPI.FrameIdx + 1;
2555 dbgs() << ")\n");
2556
2557 assert((!NeedsWinCFI || !(Reg1 == AArch64::LR && Reg2 == AArch64::FP)) &&
2558 "Windows unwdinding requires a consecutive (FP,LR) pair");
2559 // Windows unwind codes require consecutive registers if registers are
2560 // paired. Make the switch here, so that the code below will save (x,x+1)
2561 // and not (x+1,x).
2562 unsigned FrameIdxReg1 = RPI.FrameIdx;
2563 unsigned FrameIdxReg2 = RPI.FrameIdx + 1;
2564 if (NeedsWinCFI && RPI.isPaired()) {
2565 std::swap(Reg1, Reg2);
2566 std::swap(FrameIdxReg1, FrameIdxReg2);
2567 }
2568 MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(StrOpc));
2569 if (!MRI.isReserved(Reg1))
2570 MBB.addLiveIn(Reg1);
2571 if (RPI.isPaired()) {
2572 if (!MRI.isReserved(Reg2))
2573 MBB.addLiveIn(Reg2);
2574 MIB.addReg(Reg2, getPrologueDeath(MF, Reg2));
2575 MIB.addMemOperand(MF.getMachineMemOperand(
2576 MachinePointerInfo::getFixedStack(MF, FrameIdxReg2),
2577 MachineMemOperand::MOStore, Size, Alignment));
2578 }
2579 MIB.addReg(Reg1, getPrologueDeath(MF, Reg1))
2580 .addReg(AArch64::SP)
2581 .addImm(RPI.Offset) // [sp, #offset*scale],
2582 // where factor*scale is implicit
2583 .setMIFlag(MachineInstr::FrameSetup);
2584 MIB.addMemOperand(MF.getMachineMemOperand(
2585 MachinePointerInfo::getFixedStack(MF, FrameIdxReg1),
2586 MachineMemOperand::MOStore, Size, Alignment));
2587 if (NeedsWinCFI)
2588 InsertSEH(MIB, TII, MachineInstr::FrameSetup);
2589
2590 // Update the StackIDs of the SVE stack slots.
2591 MachineFrameInfo &MFI = MF.getFrameInfo();
2592 if (RPI.Type == RegPairInfo::ZPR || RPI.Type == RegPairInfo::PPR)
2593 MFI.setStackID(RPI.FrameIdx, TargetStackID::ScalableVector);
2594
2595 }
2596 return true;
2597 }
2598
2599 bool AArch64FrameLowering::restoreCalleeSavedRegisters(
2600 MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
2601 MutableArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const {
2602 MachineFunction &MF = *MBB.getParent();
2603 const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
2604 DebugLoc DL;
2605 SmallVector<RegPairInfo, 8> RegPairs;
2606 bool NeedsWinCFI = needsWinCFI(MF);
2607
2608 if (MI != MBB.end())
2609 DL = MI->getDebugLoc();
2610
2611 bool NeedShadowCallStackProlog = false;
2612 computeCalleeSaveRegisterPairs(MF, CSI, TRI, RegPairs,
2613 NeedShadowCallStackProlog, hasFP(MF));
2614
2615 auto EmitMI = [&](const RegPairInfo &RPI) {
2616 unsigned Reg1 = RPI.Reg1;
2617 unsigned Reg2 = RPI.Reg2;
2618
2619 // Issue sequence of restores for cs regs. The last restore may be converted
2620 // to a post-increment load later by emitEpilogue if the callee-save stack
2621 // area allocation can't be combined with the local stack area allocation.
2622 // For example:
2623 // ldp fp, lr, [sp, #32] // addImm(+4)
2624 // ldp x20, x19, [sp, #16] // addImm(+2)
2625 // ldp x22, x21, [sp, #0] // addImm(+0)
2626 // Note: see comment in spillCalleeSavedRegisters()
2627 unsigned LdrOpc;
2628 unsigned Size;
2629 Align Alignment;
2630 switch (RPI.Type) {
2631 case RegPairInfo::GPR:
2632 LdrOpc = RPI.isPaired() ? AArch64::LDPXi : AArch64::LDRXui;
2633 Size = 8;
2634 Alignment = Align(8);
2635 break;
2636 case RegPairInfo::FPR64:
2637 LdrOpc = RPI.isPaired() ? AArch64::LDPDi : AArch64::LDRDui;
2638 Size = 8;
2639 Alignment = Align(8);
2640 break;
2641 case RegPairInfo::FPR128:
2642 LdrOpc = RPI.isPaired() ? AArch64::LDPQi : AArch64::LDRQui;
2643 Size = 16;
2644 Alignment = Align(16);
2645 break;
2646 case RegPairInfo::ZPR:
2647 LdrOpc = AArch64::LDR_ZXI;
2648 Size = 16;
2649 Alignment = Align(16);
2650 break;
2651 case RegPairInfo::PPR:
2652 LdrOpc = AArch64::LDR_PXI;
2653 Size = 2;
2654 Alignment = Align(2);
2655 break;
2656 }
2657 LLVM_DEBUG(dbgs() << "CSR restore: (" << printReg(Reg1, TRI);
2658 if (RPI.isPaired()) dbgs() << ", " << printReg(Reg2, TRI);
2659 dbgs() << ") -> fi#(" << RPI.FrameIdx;
2660 if (RPI.isPaired()) dbgs() << ", " << RPI.FrameIdx + 1;
2661 dbgs() << ")\n");
2662
2663 // Windows unwind codes require consecutive registers if registers are
2664 // paired. Make the switch here, so that the code below will restore (x,x+1)
2665 // and not (x+1,x).
2666 unsigned FrameIdxReg1 = RPI.FrameIdx;
2667 unsigned FrameIdxReg2 = RPI.FrameIdx + 1;
2668 if (NeedsWinCFI && RPI.isPaired()) {
2669 std::swap(Reg1, Reg2);
2670 std::swap(FrameIdxReg1, FrameIdxReg2);
2671 }
2672 MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(LdrOpc));
2673 if (RPI.isPaired()) {
2674 MIB.addReg(Reg2, getDefRegState(true));
2675 MIB.addMemOperand(MF.getMachineMemOperand(
2676 MachinePointerInfo::getFixedStack(MF, FrameIdxReg2),
2677 MachineMemOperand::MOLoad, Size, Alignment));
2678 }
2679 MIB.addReg(Reg1, getDefRegState(true))
2680 .addReg(AArch64::SP)
2681 .addImm(RPI.Offset) // [sp, #offset*scale]
2682 // where factor*scale is implicit
2683 .setMIFlag(MachineInstr::FrameDestroy);
2684 MIB.addMemOperand(MF.getMachineMemOperand(
2685 MachinePointerInfo::getFixedStack(MF, FrameIdxReg1),
2686 MachineMemOperand::MOLoad, Size, Alignment));
2687 if (NeedsWinCFI)
2688 InsertSEH(MIB, TII, MachineInstr::FrameDestroy);
2689 };
2690
2691 // SVE objects are always restored in reverse order.
2692 for (const RegPairInfo &RPI : reverse(RegPairs))
2693 if (RPI.isScalable())
2694 EmitMI(RPI);
2695
2696 if (ReverseCSRRestoreSeq) {
2697 for (const RegPairInfo &RPI : reverse(RegPairs))
2698 if (!RPI.isScalable())
2699 EmitMI(RPI);
2700 } else if (homogeneousPrologEpilog(MF, &MBB)) {
2701 auto MIB = BuildMI(MBB, MI, DL, TII.get(AArch64::HOM_Epilog))
2702 .setMIFlag(MachineInstr::FrameDestroy);
2703 for (auto &RPI : RegPairs) {
2704 MIB.addReg(RPI.Reg1, RegState::Define);
2705 MIB.addReg(RPI.Reg2, RegState::Define);
2706 }
2707 return true;
2708 } else
2709 for (const RegPairInfo &RPI : RegPairs)
2710 if (!RPI.isScalable())
2711 EmitMI(RPI);
2712
2713 if (NeedShadowCallStackProlog) {
2714 // Shadow call stack epilog: ldr x30, [x18, #-8]!
2715 BuildMI(MBB, MI, DL, TII.get(AArch64::LDRXpre))
2716 .addReg(AArch64::X18, RegState::Define)
2717 .addReg(AArch64::LR, RegState::Define)
2718 .addReg(AArch64::X18)
2719 .addImm(-8)
2720 .setMIFlag(MachineInstr::FrameDestroy);
2721 }
2722
2723 return true;
2724 }
2725
2726 void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
2727 BitVector &SavedRegs,
2728 RegScavenger *RS) const {
2729 // All calls are tail calls in GHC calling conv, and functions have no
2730 // prologue/epilogue.
2731 if (MF.getFunction().getCallingConv() == CallingConv::GHC)
2732 return;
2733
2734 TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
2735 const AArch64RegisterInfo *RegInfo = static_cast<const AArch64RegisterInfo *>(
2736 MF.getSubtarget().getRegisterInfo());
2737 const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
2738 AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
2739 unsigned UnspilledCSGPR = AArch64::NoRegister;
2740 unsigned UnspilledCSGPRPaired = AArch64::NoRegister;
2741
2742 MachineFrameInfo &MFI = MF.getFrameInfo();
2743 const MCPhysReg *CSRegs = MF.getRegInfo().getCalleeSavedRegs();
2744
2745 unsigned BasePointerReg = RegInfo->hasBasePointer(MF)
2746 ? RegInfo->getBaseRegister()
2747 : (unsigned)AArch64::NoRegister;
2748
2749 unsigned ExtraCSSpill = 0;
2750 // Figure out which callee-saved registers to save/restore.
2751 for (unsigned i = 0; CSRegs[i]; ++i) {
2752 const unsigned Reg = CSRegs[i];
2753
2754 // Add the base pointer register to SavedRegs if it is callee-save.
2755 if (Reg == BasePointerReg)
2756 SavedRegs.set(Reg);
2757
2758 bool RegUsed = SavedRegs.test(Reg);
2759 unsigned PairedReg = AArch64::NoRegister;
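// The CSR list is laid out in register pairs, so flipping the low bit of
// the index (i ^ 1) selects the partner register of the current one.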
2760 if (AArch64::GPR64RegClass.contains(Reg) ||
2761 AArch64::FPR64RegClass.contains(Reg) ||
2762 AArch64::FPR128RegClass.contains(Reg))
2763 PairedReg = CSRegs[i ^ 1];
2764
2765 if (!RegUsed) {
2766 if (AArch64::GPR64RegClass.contains(Reg) &&
2767 !RegInfo->isReservedReg(MF, Reg)) {
2768 UnspilledCSGPR = Reg;
2769 UnspilledCSGPRPaired = PairedReg;
2770 }
2771 continue;
2772 }
2773
2774 // MachO's compact unwind format relies on all registers being stored in
2775 // pairs.
2776 // FIXME: the usual format is actually better if unwinding isn't needed.
    if (producePairRegisters(MF) && PairedReg != AArch64::NoRegister &&
        !SavedRegs.test(PairedReg)) {
      SavedRegs.set(PairedReg);
      if (AArch64::GPR64RegClass.contains(PairedReg) &&
          !RegInfo->isReservedReg(MF, PairedReg))
        ExtraCSSpill = PairedReg;
    }
  }

  if (MF.getFunction().getCallingConv() == CallingConv::Win64 &&
      !Subtarget.isTargetWindows()) {
    // For the Windows calling convention on a non-Windows OS, where X18 is
    // treated as reserved, back up X18 when entering non-Windows code (marked
    // with the Windows calling convention) and restore it when returning,
    // regardless of whether the individual function uses it - it might call
    // other functions that clobber it.
    SavedRegs.set(AArch64::X18);
  }

  // Calculate the callee-saved stack size.
  unsigned CSStackSize = 0;
  unsigned SVECSStackSize = 0;
  const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  for (unsigned Reg : SavedRegs.set_bits()) {
    auto RegSize = TRI->getRegSizeInBits(Reg, MRI) / 8;
    if (AArch64::PPRRegClass.contains(Reg) ||
        AArch64::ZPRRegClass.contains(Reg))
      SVECSStackSize += RegSize;
    else
      CSStackSize += RegSize;
  }

  // Save the number of saved regs, so we can easily update CSStackSize later.
  unsigned NumSavedRegs = SavedRegs.count();

  // The frame record needs to be created by saving the appropriate registers.
  uint64_t EstimatedStackSize = MFI.estimateStackSize(MF);
  if (hasFP(MF) ||
      windowsRequiresStackProbe(MF, EstimatedStackSize + CSStackSize + 16)) {
    SavedRegs.set(AArch64::FP);
    SavedRegs.set(AArch64::LR);
  }

  LLVM_DEBUG(dbgs() << "*** determineCalleeSaves\nSaved CSRs:";
             for (unsigned Reg : SavedRegs.set_bits())
               dbgs() << ' ' << printReg(Reg, RegInfo);
             dbgs() << "\n";);

  // If any callee-saved registers are used, the frame cannot be eliminated.
  int64_t SVEStackSize =
      alignTo(SVECSStackSize + estimateSVEStackObjectOffsets(MFI), 16);
  bool CanEliminateFrame = (SavedRegs.count() == 0) && !SVEStackSize;

  // The CSR spill slots have not been allocated yet, so estimateStackSize
  // won't include them.
  unsigned EstimatedStackSizeLimit = estimateRSStackSizeLimit(MF);

  // Conservatively always assume BigStack when there are SVE spills.
  bool BigStack = SVEStackSize || (EstimatedStackSize + CSStackSize) >
                                      EstimatedStackSizeLimit;
  if (BigStack || !CanEliminateFrame || RegInfo->cannotEliminateFrame(MF))
    AFI->setHasStackFrame(true);

  // Estimate if we might need to scavenge a register at some point in order
  // to materialize a stack offset. If so, either spill one additional
  // callee-saved register or reserve a special spill slot to facilitate
  // register scavenging. If we already spilled an extra callee-saved register
  // above to keep the number of spills even, we don't need to do anything else
  // here.
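  // For example (illustrative, not from the source): with a very large frame
  // an access such as "ldr x0, [sp, #0x40000]" cannot be encoded directly;
  // the offset must first be materialized into a scratch register, roughly:
  //   mov x16, #0x40000
  //   ldr x0, [sp, x16]
  // which is why a spare callee-saved register or an emergency spill slot
  // must be available to the register scavenger here.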
  if (BigStack) {
    if (!ExtraCSSpill && UnspilledCSGPR != AArch64::NoRegister) {
      LLVM_DEBUG(dbgs() << "Spilling " << printReg(UnspilledCSGPR, RegInfo)
                        << " to get a scratch register.\n");
      SavedRegs.set(UnspilledCSGPR);
      // MachO's compact unwind format relies on all registers being stored in
      // pairs, so if we need to spill one extra for BigStack, then we need to
      // store the pair.
      if (producePairRegisters(MF))
        SavedRegs.set(UnspilledCSGPRPaired);
      ExtraCSSpill = UnspilledCSGPR;
    }

    // If we didn't find an extra callee-saved register to spill, create
    // an emergency spill slot.
    if (!ExtraCSSpill || MF.getRegInfo().isPhysRegUsed(ExtraCSSpill)) {
      const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
      const TargetRegisterClass &RC = AArch64::GPR64RegClass;
      unsigned Size = TRI->getSpillSize(RC);
      Align Alignment = TRI->getSpillAlign(RC);
      int FI = MFI.CreateStackObject(Size, Alignment, false);
      RS->addScavengingFrameIndex(FI);
      LLVM_DEBUG(dbgs() << "No available CS registers, allocated fi#" << FI
                        << " as the emergency spill slot.\n");
    }
  }

  // Add the size of the additional 64-bit GPR saves.
  CSStackSize += 8 * (SavedRegs.count() - NumSavedRegs);

  // A Swift asynchronous context extends the frame record with a pointer
  // directly before FP.
  if (hasFP(MF) && AFI->hasSwiftAsyncContext())
    CSStackSize += 8;

  uint64_t AlignedCSStackSize = alignTo(CSStackSize, 16);
  LLVM_DEBUG(dbgs() << "Estimated stack frame size: "
                    << EstimatedStackSize + AlignedCSStackSize
                    << " bytes.\n");

  assert((!MFI.isCalleeSavedInfoValid() ||
          AFI->getCalleeSavedStackSize() == AlignedCSStackSize) &&
         "Should not invalidate callee saved info");

  // Round up to register pair alignment to avoid additional SP adjustment
  // instructions.
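  // A small worked example (illustrative): saving x19..x23 gives
  // CSStackSize == 40, which rounds up to AlignedCSStackSize == 48; the
  // 8 spare bytes are recorded via setCalleeSaveStackHasFreeSpace below and
  // may later be reused as a scavenging spill slot (see
  // enableStackSlotScavenging).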
  AFI->setCalleeSavedStackSize(AlignedCSStackSize);
  AFI->setCalleeSaveStackHasFreeSpace(AlignedCSStackSize != CSStackSize);
  AFI->setSVECalleeSavedStackSize(alignTo(SVECSStackSize, 16));
}

bool AArch64FrameLowering::assignCalleeSavedSpillSlots(
    MachineFunction &MF, const TargetRegisterInfo *RegInfo,
    std::vector<CalleeSavedInfo> &CSI, unsigned &MinCSFrameIndex,
    unsigned &MaxCSFrameIndex) const {
  bool NeedsWinCFI = needsWinCFI(MF);
  // To match the canonical Windows frame layout, reverse the list of
  // callee-saved registers to get them laid out by PrologEpilogInserter
  // in the right order. (PrologEpilogInserter allocates stack objects top
  // down. Windows canonical prologs store higher numbered registers at
  // the top, thus have the CSI array start from the highest registers.)
  if (NeedsWinCFI)
    std::reverse(CSI.begin(), CSI.end());

  if (CSI.empty())
    return true; // Early exit if no callee saved registers are modified!

  // Now that we know which registers need to be saved and restored, allocate
  // stack slots for them.
  MachineFrameInfo &MFI = MF.getFrameInfo();
  auto *AFI = MF.getInfo<AArch64FunctionInfo>();
  for (auto &CS : CSI) {
    Register Reg = CS.getReg();
    const TargetRegisterClass *RC = RegInfo->getMinimalPhysRegClass(Reg);

    unsigned Size = RegInfo->getSpillSize(*RC);
    Align Alignment(RegInfo->getSpillAlign(*RC));
    int FrameIdx = MFI.CreateStackObject(Size, Alignment, true);
    CS.setFrameIdx(FrameIdx);

    if ((unsigned)FrameIdx < MinCSFrameIndex) MinCSFrameIndex = FrameIdx;
    if ((unsigned)FrameIdx > MaxCSFrameIndex) MaxCSFrameIndex = FrameIdx;

    // Grab 8 bytes below FP for the extended asynchronous frame info.
    if (hasFP(MF) && AFI->hasSwiftAsyncContext() && Reg == AArch64::FP) {
      FrameIdx = MFI.CreateStackObject(8, Alignment, true);
      AFI->setSwiftAsyncContextFrameIdx(FrameIdx);
      if ((unsigned)FrameIdx < MinCSFrameIndex) MinCSFrameIndex = FrameIdx;
      if ((unsigned)FrameIdx > MaxCSFrameIndex) MaxCSFrameIndex = FrameIdx;
    }
  }
  return true;
}

bool AArch64FrameLowering::enableStackSlotScavenging(
    const MachineFunction &MF) const {
  const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
  return AFI->hasCalleeSaveStackFreeSpace();
}

/// Returns true if there are any SVE callee saves.
static bool getSVECalleeSaveSlotRange(const MachineFrameInfo &MFI,
                                      int &Min, int &Max) {
  Min = std::numeric_limits<int>::max();
  Max = std::numeric_limits<int>::min();

  if (!MFI.isCalleeSavedInfoValid())
    return false;

  const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo();
  for (auto &CS : CSI) {
    if (AArch64::ZPRRegClass.contains(CS.getReg()) ||
        AArch64::PPRRegClass.contains(CS.getReg())) {
      assert((Max == std::numeric_limits<int>::min() ||
              Max + 1 == CS.getFrameIdx()) &&
             "SVE CalleeSaves are not consecutive");

      Min = std::min(Min, CS.getFrameIdx());
      Max = std::max(Max, CS.getFrameIdx());
    }
  }
  return Min != std::numeric_limits<int>::max();
}

// Process all the SVE stack objects and determine offsets for each
// object. If AssignOffsets is true, the offsets get assigned.
// Fills in the first and last callee-saved frame indices into
// Min/MaxCSFrameIndex, respectively.
// Returns the size of the stack.
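// For example (illustrative): with two ZPR callee saves of 16 scalable
// bytes each, the CS slots are assigned offsets -16 and -32 (scaled by
// vscale at runtime), the callee-save area rounds up to 32, and any ZPR
// locals are then laid out below that.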
static int64_t determineSVEStackObjectOffsets(MachineFrameInfo &MFI,
                                              int &MinCSFrameIndex,
                                              int &MaxCSFrameIndex,
                                              bool AssignOffsets) {
#ifndef NDEBUG
  // First process all fixed stack objects.
  for (int I = MFI.getObjectIndexBegin(); I != 0; ++I)
    assert(MFI.getStackID(I) != TargetStackID::ScalableVector &&
           "SVE vectors should never be passed on the stack by value, only by "
           "reference.");
#endif

  auto Assign = [&MFI](int FI, int64_t Offset) {
    LLVM_DEBUG(dbgs() << "alloc FI(" << FI << ") at SP[" << Offset << "]\n");
    MFI.setObjectOffset(FI, Offset);
  };

  int64_t Offset = 0;

  // Then process all callee saved slots.
  if (getSVECalleeSaveSlotRange(MFI, MinCSFrameIndex, MaxCSFrameIndex)) {
    // Assign offsets to the callee save slots.
    for (int I = MinCSFrameIndex; I <= MaxCSFrameIndex; ++I) {
      Offset += MFI.getObjectSize(I);
      Offset = alignTo(Offset, MFI.getObjectAlign(I));
      if (AssignOffsets)
        Assign(I, -Offset);
    }
  }

  // Ensure that the callee-save area is aligned to 16 bytes.
  Offset = alignTo(Offset, Align(16U));

  // Create a buffer of SVE objects to allocate and sort it.
  SmallVector<int, 8> ObjectsToAllocate;
  for (int I = 0, E = MFI.getObjectIndexEnd(); I != E; ++I) {
    unsigned StackID = MFI.getStackID(I);
    if (StackID != TargetStackID::ScalableVector)
      continue;
    if (MaxCSFrameIndex >= I && I >= MinCSFrameIndex)
      continue;
    if (MFI.isDeadObjectIndex(I))
      continue;

    ObjectsToAllocate.push_back(I);
  }

  // Allocate all SVE locals and spills.
  for (unsigned FI : ObjectsToAllocate) {
    Align Alignment = MFI.getObjectAlign(FI);
    // FIXME: Given that the length of SVE vectors is not necessarily a power
    // of two, we'd need to align every object dynamically at runtime if the
    // alignment is larger than 16. This is not yet supported.
    if (Alignment > Align(16))
      report_fatal_error(
          "Alignment of scalable vectors > 16 bytes is not yet supported");

    Offset = alignTo(Offset + MFI.getObjectSize(FI), Alignment);
    if (AssignOffsets)
      Assign(FI, -Offset);
  }

  return Offset;
}

int64_t AArch64FrameLowering::estimateSVEStackObjectOffsets(
    MachineFrameInfo &MFI) const {
  int MinCSFrameIndex, MaxCSFrameIndex;
  return determineSVEStackObjectOffsets(MFI, MinCSFrameIndex, MaxCSFrameIndex,
                                        false);
}

int64_t AArch64FrameLowering::assignSVEStackObjectOffsets(
    MachineFrameInfo &MFI, int &MinCSFrameIndex, int &MaxCSFrameIndex) const {
  return determineSVEStackObjectOffsets(MFI, MinCSFrameIndex, MaxCSFrameIndex,
                                        true);
}

void AArch64FrameLowering::processFunctionBeforeFrameFinalized(
    MachineFunction &MF, RegScavenger *RS) const {
  MachineFrameInfo &MFI = MF.getFrameInfo();

  assert(getStackGrowthDirection() == TargetFrameLowering::StackGrowsDown &&
         "Upwards growing stack unsupported");

  int MinCSFrameIndex, MaxCSFrameIndex;
  int64_t SVEStackSize =
      assignSVEStackObjectOffsets(MFI, MinCSFrameIndex, MaxCSFrameIndex);

  AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
  AFI->setStackSizeSVE(alignTo(SVEStackSize, 16U));
  AFI->setMinMaxSVECSFrameIndex(MinCSFrameIndex, MaxCSFrameIndex);

  // If this function isn't doing Win64-style C++ EH, we don't need to do
  // anything.
  if (!MF.hasEHFunclets())
    return;
  const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
  WinEHFuncInfo &EHInfo = *MF.getWinEHFuncInfo();

  MachineBasicBlock &MBB = MF.front();
  auto MBBI = MBB.begin();
  while (MBBI != MBB.end() && MBBI->getFlag(MachineInstr::FrameSetup))
    ++MBBI;

  // Create an UnwindHelp object.
  // The UnwindHelp object is allocated at the start of the fixed object area.
  int64_t FixedObject =
      getFixedObjectSize(MF, AFI, /*IsWin64*/ true, /*IsFunclet*/ false);
  int UnwindHelpFI = MFI.CreateFixedObject(/*Size*/ 8,
                                           /*SPOffset*/ -FixedObject,
                                           /*IsImmutable=*/false);
  EHInfo.UnwindHelpFrameIdx = UnwindHelpFI;

  // We need to store -2 into the UnwindHelp object at the start of the
  // function.
  DebugLoc DL;
  RS->enterBasicBlockEnd(MBB);
  RS->backward(std::prev(MBBI));
  unsigned DstReg = RS->FindUnusedReg(&AArch64::GPR64commonRegClass);
  assert(DstReg && "There must be a free register after frame setup");
  BuildMI(MBB, MBBI, DL, TII.get(AArch64::MOVi64imm), DstReg).addImm(-2);
  BuildMI(MBB, MBBI, DL, TII.get(AArch64::STURXi))
      .addReg(DstReg, getKillRegState(true))
      .addFrameIndex(UnwindHelpFI)
      .addImm(0);
}

namespace {
struct TagStoreInstr {
  MachineInstr *MI;
  int64_t Offset, Size;
  explicit TagStoreInstr(MachineInstr *MI, int64_t Offset, int64_t Size)
      : MI(MI), Offset(Offset), Size(Size) {}
};

class TagStoreEdit {
  MachineFunction *MF;
  MachineBasicBlock *MBB;
  MachineRegisterInfo *MRI;
  // Tag store instructions that are being replaced.
  SmallVector<TagStoreInstr, 8> TagStores;
  // Combined memref arguments of the above instructions.
  SmallVector<MachineMemOperand *, 8> CombinedMemRefs;

  // Replace allocation tags in [FrameReg + FrameRegOffset, FrameReg +
  // FrameRegOffset + Size) with the address tag of SP.
  Register FrameReg;
  StackOffset FrameRegOffset;
  int64_t Size;
  // If not None, move FrameReg to (FrameReg + FrameRegUpdate) at the end.
  Optional<int64_t> FrameRegUpdate;
  // MIFlags for any FrameReg updating instructions.
  unsigned FrameRegUpdateFlags;

  // Use zeroing instruction variants.
  bool ZeroData;
  DebugLoc DL;

  void emitUnrolled(MachineBasicBlock::iterator InsertI);
  void emitLoop(MachineBasicBlock::iterator InsertI);

public:
  TagStoreEdit(MachineBasicBlock *MBB, bool ZeroData)
      : MBB(MBB), ZeroData(ZeroData) {
    MF = MBB->getParent();
    MRI = &MF->getRegInfo();
  }
  // Add an instruction to be replaced. Instructions must be added in
  // ascending order of Offset and must be adjacent.
  void addInstruction(TagStoreInstr I) {
    assert((TagStores.empty() ||
            TagStores.back().Offset + TagStores.back().Size == I.Offset) &&
           "Non-adjacent tag store instructions.");
    TagStores.push_back(I);
  }
  void clear() { TagStores.clear(); }
  // Emit equivalent code at the given location, and erase the current set of
  // instructions. May skip if the replacement is not profitable. May
  // invalidate the input iterator and replace it with a valid one.
  void emitCode(MachineBasicBlock::iterator &InsertI,
                const AArch64FrameLowering *TFI, bool IsLast);
};

void TagStoreEdit::emitUnrolled(MachineBasicBlock::iterator InsertI) {
  const AArch64InstrInfo *TII =
      MF->getSubtarget<AArch64Subtarget>().getInstrInfo();

  const int64_t kMinOffset = -256 * 16;
  const int64_t kMaxOffset = 255 * 16;
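  // These bounds correspond to STG/ST2G's signed 9-bit immediate, scaled by
  // 16: byte offsets in [-4096, 4080] can be encoded directly; anything
  // outside that range goes through the scratch register below.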

  Register BaseReg = FrameReg;
  int64_t BaseRegOffsetBytes = FrameRegOffset.getFixed();
  if (BaseRegOffsetBytes < kMinOffset ||
      BaseRegOffsetBytes + (Size - Size % 32) > kMaxOffset) {
    Register ScratchReg = MRI->createVirtualRegister(&AArch64::GPR64RegClass);
    emitFrameOffset(*MBB, InsertI, DL, ScratchReg, BaseReg,
                    StackOffset::getFixed(BaseRegOffsetBytes), TII);
    BaseReg = ScratchReg;
    BaseRegOffsetBytes = 0;
  }

  MachineInstr *LastI = nullptr;
  while (Size) {
    int64_t InstrSize = (Size > 16) ? 32 : 16;
    unsigned Opcode =
        InstrSize == 16
            ? (ZeroData ? AArch64::STZGOffset : AArch64::STGOffset)
            : (ZeroData ? AArch64::STZ2GOffset : AArch64::ST2GOffset);
    MachineInstr *I = BuildMI(*MBB, InsertI, DL, TII->get(Opcode))
                          .addReg(AArch64::SP)
                          .addReg(BaseReg)
                          .addImm(BaseRegOffsetBytes / 16)
                          .setMemRefs(CombinedMemRefs);
    // A store to [BaseReg, #0] should go last for an opportunity to fold the
    // final SP adjustment in the epilogue.
    if (BaseRegOffsetBytes == 0)
      LastI = I;
    BaseRegOffsetBytes += InstrSize;
    Size -= InstrSize;
  }

  if (LastI)
    MBB->splice(InsertI, MBB, LastI);
}
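
// A rough sketch of the output (registers illustrative): for Size == 48
// starting at offset 0, emitUnrolled produces
//   stg  sp, [x29, #32]  ; tag 16 bytes
//   st2g sp, [x29, #0]   ; tag 32 bytes, spliced last (see LastI above)
// so that a following SP adjustment has a chance to be folded into the
// [BaseReg, #0] store.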

void TagStoreEdit::emitLoop(MachineBasicBlock::iterator InsertI) {
  const AArch64InstrInfo *TII =
      MF->getSubtarget<AArch64Subtarget>().getInstrInfo();

  Register BaseReg = FrameRegUpdate
                         ? FrameReg
                         : MRI->createVirtualRegister(&AArch64::GPR64RegClass);
  Register SizeReg = MRI->createVirtualRegister(&AArch64::GPR64RegClass);

  emitFrameOffset(*MBB, InsertI, DL, BaseReg, FrameReg, FrameRegOffset, TII);

  int64_t LoopSize = Size;
  // If the loop size is not a multiple of 32, split off one 16-byte store at
  // the end to fold the BaseReg update into.
  if (FrameRegUpdate && *FrameRegUpdate)
    LoopSize -= LoopSize % 32;
  MachineInstr *LoopI = BuildMI(*MBB, InsertI, DL,
                                TII->get(ZeroData ? AArch64::STZGloop_wback
                                                  : AArch64::STGloop_wback))
                            .addDef(SizeReg)
                            .addDef(BaseReg)
                            .addImm(LoopSize)
                            .addReg(BaseReg)
                            .setMemRefs(CombinedMemRefs);
  if (FrameRegUpdate)
    LoopI->setFlags(FrameRegUpdateFlags);

  int64_t ExtraBaseRegUpdate =
      FrameRegUpdate ? (*FrameRegUpdate - FrameRegOffset.getFixed() - Size) : 0;
  if (LoopSize < Size) {
    assert(FrameRegUpdate);
    assert(Size - LoopSize == 16);
    // Tag 16 more bytes at BaseReg and update BaseReg.
    BuildMI(*MBB, InsertI, DL,
            TII->get(ZeroData ? AArch64::STZGPostIndex : AArch64::STGPostIndex))
        .addDef(BaseReg)
        .addReg(BaseReg)
        .addReg(BaseReg)
        .addImm(1 + ExtraBaseRegUpdate / 16)
        .setMemRefs(CombinedMemRefs)
        .setMIFlags(FrameRegUpdateFlags);
  } else if (ExtraBaseRegUpdate) {
    // Update BaseReg.
    BuildMI(
        *MBB, InsertI, DL,
        TII->get(ExtraBaseRegUpdate > 0 ? AArch64::ADDXri : AArch64::SUBXri))
        .addDef(BaseReg)
        .addReg(BaseReg)
        .addImm(std::abs(ExtraBaseRegUpdate))
        .addImm(0)
        .setMIFlags(FrameRegUpdateFlags);
  }
}
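
// Conceptually (a sketch; the pseudo is expanded after register allocation),
// STGloop_wback over LoopSize bytes behaves like:
//   mov  Xsize, #LoopSize
// loop:
//   st2g Xbase, [Xbase], #32
//   subs Xsize, Xsize, #32
//   b.ne loop
// optionally followed by the single post-indexed STG emitted above when Size
// is not a multiple of 32.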

// Check if *II is a register update (an ADDXri/SUBXri of Reg to itself) that
// can be merged into the STGloop that ends at (Reg + Size). On success, write
// the update's total offset to *TotalOffset; the leftover adjustment after
// the loop, (TotalOffset - Size), is applied by the caller via a post-indexed
// store or a small ADD/SUB.
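// For example (a sketch, assuming the tagged region starts at SP): if the
// loop tags 464 bytes and is followed by "add sp, sp, #464", the update can
// be merged: the loop body advances SP by 448, and the final post-indexed
// store ("stg sp, [sp], #16") both tags the last 16 bytes and applies the
// remaining adjustment.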
bool canMergeRegUpdate(MachineBasicBlock::iterator II, unsigned Reg,
                       int64_t Size, int64_t *TotalOffset) {
  MachineInstr &MI = *II;
  if ((MI.getOpcode() == AArch64::ADDXri ||
       MI.getOpcode() == AArch64::SUBXri) &&
      MI.getOperand(0).getReg() == Reg && MI.getOperand(1).getReg() == Reg) {
    unsigned Shift = AArch64_AM::getShiftValue(MI.getOperand(3).getImm());
    int64_t Offset = MI.getOperand(2).getImm() << Shift;
    if (MI.getOpcode() == AArch64::SUBXri)
      Offset = -Offset;
    int64_t AbsPostOffset = std::abs(Offset - Size);
    const int64_t kMaxOffset =
        0xFFF; // Max encoding for unshifted ADDXri / SUBXri
    if (AbsPostOffset <= kMaxOffset && AbsPostOffset % 16 == 0) {
      *TotalOffset = Offset;
      return true;
    }
  }
  return false;
}

void mergeMemRefs(const SmallVectorImpl<TagStoreInstr> &TSE,
                  SmallVectorImpl<MachineMemOperand *> &MemRefs) {
  MemRefs.clear();
  for (auto &TS : TSE) {
    MachineInstr *MI = TS.MI;
    // An instruction without memory operands may access anything. Be
    // conservative and return an empty list.
    if (MI->memoperands_empty()) {
      MemRefs.clear();
      return;
    }
    MemRefs.append(MI->memoperands_begin(), MI->memoperands_end());
  }
}

void TagStoreEdit::emitCode(MachineBasicBlock::iterator &InsertI,
                            const AArch64FrameLowering *TFI, bool IsLast) {
  if (TagStores.empty())
    return;
  TagStoreInstr &FirstTagStore = TagStores[0];
  TagStoreInstr &LastTagStore = TagStores[TagStores.size() - 1];
  Size = LastTagStore.Offset - FirstTagStore.Offset + LastTagStore.Size;
  DL = TagStores[0].MI->getDebugLoc();

  Register Reg;
  FrameRegOffset = TFI->resolveFrameOffsetReference(
      *MF, FirstTagStore.Offset, false /*isFixed*/, false /*isSVE*/, Reg,
      /*PreferFP=*/false, /*ForSimm=*/true);
  FrameReg = Reg;
  FrameRegUpdate = None;

  mergeMemRefs(TagStores, CombinedMemRefs);

  LLVM_DEBUG(dbgs() << "Replacing adjacent STG instructions:\n";
             for (const auto &Instr : TagStores) {
               dbgs() << "  " << *Instr.MI;
             });

  // Size threshold where a loop becomes shorter than a linear sequence of
  // tagging instructions.
  const int kSetTagLoopThreshold = 176;
  if (Size < kSetTagLoopThreshold) {
    if (TagStores.size() < 2)
      return;
    emitUnrolled(InsertI);
  } else {
    MachineInstr *UpdateInstr = nullptr;
    int64_t TotalOffset;
    if (IsLast) {
      // See if we can merge the base register update into the STGloop.
      // This is done in AArch64LoadStoreOptimizer for "normal" stores,
      // but STGloop is way too unusual for that, and it also only
      // realistically happens in the function epilogue. Also, STGloop is
      // expanded before that pass.
      if (InsertI != MBB->end() &&
          canMergeRegUpdate(InsertI, FrameReg, FrameRegOffset.getFixed() + Size,
                            &TotalOffset)) {
        UpdateInstr = &*InsertI++;
        LLVM_DEBUG(dbgs() << "Folding SP update into loop:\n  "
                          << *UpdateInstr);
      }
    }

    if (!UpdateInstr && TagStores.size() < 2)
      return;

    if (UpdateInstr) {
      FrameRegUpdate = TotalOffset;
      FrameRegUpdateFlags = UpdateInstr->getFlags();
    }
    emitLoop(InsertI);
    if (UpdateInstr)
      UpdateInstr->eraseFromParent();
  }

  for (auto &TS : TagStores)
    TS.MI->eraseFromParent();
}

bool isMergeableStackTaggingInstruction(MachineInstr &MI, int64_t &Offset,
                                        int64_t &Size, bool &ZeroData) {
  MachineFunction &MF = *MI.getParent()->getParent();
  const MachineFrameInfo &MFI = MF.getFrameInfo();

  unsigned Opcode = MI.getOpcode();
  ZeroData = (Opcode == AArch64::STZGloop || Opcode == AArch64::STZGOffset ||
              Opcode == AArch64::STZ2GOffset);

  if (Opcode == AArch64::STGloop || Opcode == AArch64::STZGloop) {
    if (!MI.getOperand(0).isDead() || !MI.getOperand(1).isDead())
      return false;
    if (!MI.getOperand(2).isImm() || !MI.getOperand(3).isFI())
      return false;
    Offset = MFI.getObjectOffset(MI.getOperand(3).getIndex());
    Size = MI.getOperand(2).getImm();
    return true;
  }

  if (Opcode == AArch64::STGOffset || Opcode == AArch64::STZGOffset)
    Size = 16;
  else if (Opcode == AArch64::ST2GOffset || Opcode == AArch64::STZ2GOffset)
    Size = 32;
  else
    return false;

  if (MI.getOperand(0).getReg() != AArch64::SP || !MI.getOperand(1).isFI())
    return false;

  Offset = MFI.getObjectOffset(MI.getOperand(1).getIndex()) +
           16 * MI.getOperand(2).getImm();
  return true;
}

// Detect a run of memory tagging instructions for adjacent stack frame slots,
// and replace them with a shorter instruction sequence:
// * replace STG + STG with ST2G
// * replace STGloop + STGloop with STGloop
// This code needs to run when stack slot offsets are already known, but before
// FrameIndex operands in STG instructions are eliminated.
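// For instance (illustrative offsets), two 16-byte tag stores on adjacent
// slots,
//   stg sp, [sp, #16]
//   stg sp, [sp, #32]
// become a single 32-byte store:
//   st2g sp, [sp, #16]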
MachineBasicBlock::iterator tryMergeAdjacentSTG(MachineBasicBlock::iterator II,
                                                const AArch64FrameLowering *TFI,
                                                RegScavenger *RS) {
  bool FirstZeroData;
  int64_t Size, Offset;
  MachineInstr &MI = *II;
  MachineBasicBlock *MBB = MI.getParent();
  MachineBasicBlock::iterator NextI = ++II;
  if (&MI == &MBB->instr_back())
    return II;
  if (!isMergeableStackTaggingInstruction(MI, Offset, Size, FirstZeroData))
    return II;

  SmallVector<TagStoreInstr, 4> Instrs;
  Instrs.emplace_back(&MI, Offset, Size);

  constexpr int kScanLimit = 10;
  int Count = 0;
  for (MachineBasicBlock::iterator E = MBB->end();
       NextI != E && Count < kScanLimit; ++NextI) {
    MachineInstr &MI = *NextI;
    bool ZeroData;
    int64_t Size, Offset;
    // Collect instructions that update memory tags with a FrameIndex operand
    // and (when applicable) constant size, and whose output registers are dead
    // (the latter is almost always the case in practice). Since these
    // instructions effectively have no inputs or outputs, we are free to skip
    // any non-aliasing instructions in between without tracking used
    // registers.
    if (isMergeableStackTaggingInstruction(MI, Offset, Size, ZeroData)) {
      if (ZeroData != FirstZeroData)
        break;
      Instrs.emplace_back(&MI, Offset, Size);
      continue;
    }

    // Only count non-transient, non-tagging instructions toward the scan
    // limit.
    if (!MI.isTransient())
      ++Count;

    // Just in case, stop before the epilogue code starts.
    if (MI.getFlag(MachineInstr::FrameSetup) ||
        MI.getFlag(MachineInstr::FrameDestroy))
      break;

    // Reject anything that may alias the collected instructions.
    if (MI.mayLoadOrStore() || MI.hasUnmodeledSideEffects())
      break;
  }

  // New code will be inserted after the last tagging instruction we've found.
  MachineBasicBlock::iterator InsertI = Instrs.back().MI;
  InsertI++;

  llvm::stable_sort(Instrs,
                    [](const TagStoreInstr &Left, const TagStoreInstr &Right) {
                      return Left.Offset < Right.Offset;
                    });

  // Make sure that we don't have any overlapping stores.
  int64_t CurOffset = Instrs[0].Offset;
  for (auto &Instr : Instrs) {
    if (CurOffset > Instr.Offset)
      return NextI;
    CurOffset = Instr.Offset + Instr.Size;
  }

  // Find contiguous runs of tagged memory and emit shorter instruction
  // sequences for them when possible.
  TagStoreEdit TSE(MBB, FirstZeroData);
  Optional<int64_t> EndOffset;
  for (auto &Instr : Instrs) {
    if (EndOffset && *EndOffset != Instr.Offset) {
      // Found a gap.
      TSE.emitCode(InsertI, TFI, /*IsLast=*/false);
      TSE.clear();
    }

    TSE.addInstruction(Instr);
    EndOffset = Instr.Offset + Instr.Size;
  }

  TSE.emitCode(InsertI, TFI, /*IsLast=*/true);

  return InsertI;
}
} // namespace

void AArch64FrameLowering::processFunctionBeforeFrameIndicesReplaced(
    MachineFunction &MF, RegScavenger *RS = nullptr) const {
  if (StackTaggingMergeSetTag)
    for (auto &BB : MF)
      for (MachineBasicBlock::iterator II = BB.begin(); II != BB.end();)
        II = tryMergeAdjacentSTG(II, this, RS);
}

/// For Win64 AArch64 EH, the offset to the Unwind object is from the SP
/// before the update. This is easily retrieved as it is exactly the offset
/// that is set in processFunctionBeforeFrameFinalized.
StackOffset AArch64FrameLowering::getFrameIndexReferencePreferSP(
    const MachineFunction &MF, int FI, Register &FrameReg,
    bool IgnoreSPUpdates) const {
  const MachineFrameInfo &MFI = MF.getFrameInfo();
  if (IgnoreSPUpdates) {
    LLVM_DEBUG(dbgs() << "Offset from the SP for " << FI << " is "
                      << MFI.getObjectOffset(FI) << "\n");
    FrameReg = AArch64::SP;
    return StackOffset::getFixed(MFI.getObjectOffset(FI));
  }

  return getFrameIndexReference(MF, FI, FrameReg);
}

/// The parent frame offset (aka dispFrame) is only used on X86_64 to retrieve
/// the parent's frame pointer.
unsigned AArch64FrameLowering::getWinEHParentFrameOffset(
    const MachineFunction &MF) const {
  return 0;
}

/// Funclets only need to account for space for the callee saved registers,
/// as the locals are accounted for in the parent's stack frame.
unsigned AArch64FrameLowering::getWinEHFuncletFrameSize(
    const MachineFunction &MF) const {
  // This is the size of the pushed CSRs.
  unsigned CSSize =
      MF.getInfo<AArch64FunctionInfo>()->getCalleeSavedStackSize();
  // This is the amount of stack a funclet needs to allocate.
  return alignTo(CSSize + MF.getFrameInfo().getMaxCallFrameSize(),
                 getStackAlign());
}

namespace {
struct FrameObject {
  bool IsValid = false;
  // Index of the object in MFI.
  int ObjectIndex = 0;
  // Group ID this object belongs to.
  int GroupIndex = -1;
  // This object should be placed first (closest to SP).
  bool ObjectFirst = false;
  // This object's group (which always contains the object with
  // ObjectFirst==true) should be placed first.
  bool GroupFirst = false;
};

class GroupBuilder {
  SmallVector<int, 8> CurrentMembers;
  int NextGroupIndex = 0;
  std::vector<FrameObject> &Objects;

public:
  GroupBuilder(std::vector<FrameObject> &Objects) : Objects(Objects) {}
  void AddMember(int Index) { CurrentMembers.push_back(Index); }
  void EndCurrentGroup() {
    if (CurrentMembers.size() > 1) {
      // Create a new group with the current member list. This might remove
      // them from their pre-existing groups. That's OK, dealing with
      // overlapping groups is too hard and unlikely to make a difference.
      LLVM_DEBUG(dbgs() << "group:");
      for (int Index : CurrentMembers) {
        Objects[Index].GroupIndex = NextGroupIndex;
        LLVM_DEBUG(dbgs() << " " << Index);
      }
      LLVM_DEBUG(dbgs() << "\n");
      NextGroupIndex++;
    }
    CurrentMembers.clear();
  }
};

bool FrameObjectCompare(const FrameObject &A, const FrameObject &B) {
  // Objects at a lower index are closer to FP; objects at a higher index are
  // closer to SP.
  //
  // For consistency in our comparison, all invalid objects are placed
  // at the end. This also allows us to stop walking when we hit the
  // first invalid item after it's all sorted.
  //
  // The "first" object goes first (closest to SP), followed by the members of
  // the "first" group.
  //
  // The rest are sorted by the group index to keep the groups together.
  // Higher numbered groups are more likely to be around longer (i.e. untagged
  // in the function epilogue and not at some earlier point). Place them closer
  // to SP.
  //
  // If all else equal, sort by the object index to keep the objects in the
  // original order.
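  //
  // A worked example (illustrative): given A{ObjectFirst, GroupFirst,
  // group 0}, B{GroupFirst, group 0} and C{group 1}, the ascending sort
  // yields C, B, A; since higher positions end up closer to SP, A lands
  // nearest SP with its group member B adjacent to it.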
  return std::make_tuple(!A.IsValid, A.ObjectFirst, A.GroupFirst, A.GroupIndex,
                         A.ObjectIndex) <
         std::make_tuple(!B.IsValid, B.ObjectFirst, B.GroupFirst, B.GroupIndex,
                         B.ObjectIndex);
}
} // namespace

void AArch64FrameLowering::orderFrameObjects(
    const MachineFunction &MF, SmallVectorImpl<int> &ObjectsToAllocate) const {
  if (!OrderFrameObjects || ObjectsToAllocate.empty())
    return;

  const MachineFrameInfo &MFI = MF.getFrameInfo();
  std::vector<FrameObject> FrameObjects(MFI.getObjectIndexEnd());
  for (auto &Obj : ObjectsToAllocate) {
    FrameObjects[Obj].IsValid = true;
    FrameObjects[Obj].ObjectIndex = Obj;
  }

  // Identify stack slots that are tagged at the same time.
  GroupBuilder GB(FrameObjects);
  for (auto &MBB : MF) {
    for (auto &MI : MBB) {
      if (MI.isDebugInstr())
        continue;
      int OpIndex;
      switch (MI.getOpcode()) {
      case AArch64::STGloop:
      case AArch64::STZGloop:
        OpIndex = 3;
        break;
      case AArch64::STGOffset:
      case AArch64::STZGOffset:
      case AArch64::ST2GOffset:
      case AArch64::STZ2GOffset:
        OpIndex = 1;
        break;
      default:
        OpIndex = -1;
      }

      int TaggedFI = -1;
      if (OpIndex >= 0) {
        const MachineOperand &MO = MI.getOperand(OpIndex);
        if (MO.isFI()) {
          int FI = MO.getIndex();
          if (FI >= 0 && FI < MFI.getObjectIndexEnd() &&
              FrameObjects[FI].IsValid)
            TaggedFI = FI;
        }
      }

      // If this is a stack tagging instruction for a slot that is not part of
      // a group yet, either start a new group or add it to the current one.
      if (TaggedFI >= 0)
        GB.AddMember(TaggedFI);
      else
        GB.EndCurrentGroup();
    }
    // Groups should never span multiple basic blocks.
    GB.EndCurrentGroup();
  }

  // If the function's tagged base pointer is pinned to a stack slot, we want
  // to put that slot first when possible. This will likely place it at SP + 0,
  // and save one instruction when generating the base pointer because IRG does
  // not allow an immediate offset.
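  // E.g. (illustrative): with the slot at SP + 0 the tagged base pointer is
  // just
  //   irg x0, sp
  // whereas a slot at SP + 16 would need an extra instruction first:
  //   add x0, sp, #16
  //   irg x0, x0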
  const AArch64FunctionInfo &AFI = *MF.getInfo<AArch64FunctionInfo>();
  Optional<int> TBPI = AFI.getTaggedBasePointerIndex();
  if (TBPI) {
    FrameObjects[*TBPI].ObjectFirst = true;
    FrameObjects[*TBPI].GroupFirst = true;
    int FirstGroupIndex = FrameObjects[*TBPI].GroupIndex;
    if (FirstGroupIndex >= 0)
      for (FrameObject &Object : FrameObjects)
        if (Object.GroupIndex == FirstGroupIndex)
          Object.GroupFirst = true;
  }

  llvm::stable_sort(FrameObjects, FrameObjectCompare);

  int i = 0;
  for (auto &Obj : FrameObjects) {
    // All invalid items are sorted at the end, so it's safe to stop.
    if (!Obj.IsValid)
      break;
    ObjectsToAllocate[i++] = Obj.ObjectIndex;
  }

  LLVM_DEBUG(dbgs() << "Final frame order:\n";
             for (auto &Obj : FrameObjects) {
               if (!Obj.IsValid)
                 break;
               dbgs() << "  " << Obj.ObjectIndex << ": group "
                      << Obj.GroupIndex;
               if (Obj.ObjectFirst)
                 dbgs() << ", first";
               if (Obj.GroupFirst)
                 dbgs() << ", group-first";
               dbgs() << "\n";
             });
}