//===- AArch64FrameLowering.cpp - AArch64 Frame Lowering -------*- C++ -*-====//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file contains the AArch64 implementation of TargetFrameLowering class.
//
// On AArch64, stack frames are structured as follows:
//
// The stack grows downward.
//
// All of the individual frame areas on the frame below are optional, i.e. it's
// possible to create a function so that the particular area isn't present
// in the frame.
//
// At function entry, the "frame" looks as follows:
//
// |                                   | Higher address
// |-----------------------------------|
// |                                   |
// | arguments passed on the stack     |
// |                                   |
// |-----------------------------------| <- sp
// |                                   | Lower address
//
//
// After the prologue has run, the frame has the following general structure.
// Note that this doesn't depict the case where a red-zone is used. Also,
// technically the last frame area (VLAs) doesn't get created until the main
// function body, after the prologue has run. However, it's depicted here for
// completeness.
//
// |                                   | Higher address
// |-----------------------------------|
// |                                   |
// | arguments passed on the stack     |
// |                                   |
// |-----------------------------------|
// |                                   |
// | (Win64 only) varargs from reg     |
// |                                   |
// |-----------------------------------|
// |                                   |
// | callee-saved gpr registers        | <--.
// |                                   |    | On Darwin platforms these
// |- - - - - - - - - - - - - - - - - -|    | callee saves are swapped,
// | prev_lr                           |    | (frame record first)
// | prev_fp                           | <--'
// | async context if needed           |
// | (a.k.a. "frame record")           |
// |-----------------------------------| <- fp(=x29)
// |                                   |
// | callee-saved fp/simd/SVE regs     |
// |                                   |
// |-----------------------------------|
// |                                   |
// |        SVE stack objects          |
// |                                   |
// |-----------------------------------|
// |.empty.space.to.make.part.below....|
// |.aligned.in.case.it.needs.more.than| (size of this area is unknown at
// |.the.standard.16-byte.alignment....|  compile time; if present)
// |-----------------------------------|
// |                                   |
// | local variables of fixed size     |
// | including spill slots             |
// |-----------------------------------| <- bp(not defined by ABI,
// |.variable-sized.local.variables....|    LLVM chooses X19)
// |.(VLAs)............................| (size of this area is unknown at
// |...................................|  compile time)
// |-----------------------------------| <- sp
// |                                   | Lower address
//
//
// To access data in a frame, a constant offset from one of the pointers (fp,
// bp, sp) must be computable at compile time. The sizes of the areas with a
// dotted background cannot be computed at compile time if they are present,
// so all three of fp, bp and sp must be set up to be able to access all of
// the frame areas, assuming all of the frame areas are non-empty.
//
// For most functions, some of the frame areas are empty. For those functions,
// it may not be necessary to set up fp or bp:
// * A base pointer is definitely needed when there are both VLAs and local
//   variables with more-than-default alignment requirements.
// * A frame pointer is definitely needed when there are local variables with
//   more-than-default alignment requirements.
//
// For Darwin platforms the frame-record (fp, lr) is stored at the top of the
// callee-saved area, since the unwind encoding does not allow for encoding
// this dynamically and existing tools depend on this layout. For other
// platforms, the frame-record is stored at the bottom of the (gpr)
// callee-saved area to allow SVE stack objects (allocated directly below the
// callee-saves, if available) to be accessed directly from the frame pointer.
// The SVE spill/fill instructions have VL-scaled addressing modes such
// as:
//     ldr z8, [fp, #-7 mul vl]
// For SVE the size of the vector length (VL) is not known at compile-time, so
// '#-7 mul vl' is an offset that can only be evaluated at runtime. With this
// layout, we don't need to add an unscaled offset to the frame pointer before
// accessing the SVE object in the frame.
//
// In some cases when a base pointer is not strictly needed, it is generated
// anyway when offsets from the frame pointer to access local variables become
// so large that the offset can't be encoded in the immediate fields of loads
// or stores.
//
// Outgoing function arguments must be at the bottom of the stack frame when
// calling another function. If we do not have variable-sized stack objects, we
// can allocate a "reserved call frame" area at the bottom of the local
// variable area, large enough for all outgoing calls. If we do have VLAs, then
// the stack pointer must be decremented and incremented around each call to
// make space for the arguments below the VLAs.
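//
// For example (an illustrative sketch, not code emitted verbatim): with a
// reserved call frame, the prologue does a single allocation such as
//     sub sp, sp, #48        ; locals plus the maximum outgoing argument area
// and each call site just stores its arguments at [sp, #0..]. With VLAs
// present, each call site instead brackets the call:
//     sub sp, sp, #32        ; make space for this call's stack arguments
//     bl  callee
//     add sp, sp, #32        ; release it again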
//
// FIXME: also explain the redzone concept.
//
//===----------------------------------------------------------------------===//

#include "AArch64FrameLowering.h"
#include "AArch64InstrInfo.h"
#include "AArch64MachineFunctionInfo.h"
#include "AArch64RegisterInfo.h"
#include "AArch64Subtarget.h"
#include "AArch64TargetMachine.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/LivePhysRegs.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RegisterScavenging.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/CodeGen/WinEHFuncInfo.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/Function.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCDwarf.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/LEB128.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
#include <cassert>
#include <cstdint>
#include <iterator>
#include <vector>

using namespace llvm;

#define DEBUG_TYPE "frame-info"

static cl::opt<bool> EnableRedZone("aarch64-redzone",
                                   cl::desc("enable use of redzone on AArch64"),
                                   cl::init(false), cl::Hidden);

static cl::opt<bool>
    ReverseCSRRestoreSeq("reverse-csr-restore-seq",
                         cl::desc("reverse the CSR restore sequence"),
                         cl::init(false), cl::Hidden);

static cl::opt<bool> StackTaggingMergeSetTag(
    "stack-tagging-merge-settag",
    cl::desc("merge settag instruction in function epilog"), cl::init(true),
    cl::Hidden);

static cl::opt<bool> OrderFrameObjects("aarch64-order-frame-objects",
                                       cl::desc("sort stack allocations"),
                                       cl::init(true), cl::Hidden);

cl::opt<bool> EnableHomogeneousPrologEpilog(
    "homogeneous-prolog-epilog", cl::init(false), cl::ZeroOrMore, cl::Hidden,
    cl::desc("Emit homogeneous prologue and epilogue for the size "
             "optimization (default = off)"));

STATISTIC(NumRedZoneFunctions, "Number of functions using red zone");

/// Returns how much of the incoming argument stack area (in bytes) we should
/// clean up in an epilogue. For the C calling convention this will be 0, for
/// guaranteed tail call conventions it can be positive (a normal return or a
/// tail call to a function that uses less stack space for arguments) or
/// negative (for a tail call to a function that needs more stack space than we
/// do for arguments).
static int64_t getArgumentStackToRestore(MachineFunction &MF,
                                         MachineBasicBlock &MBB) {
  MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
  bool IsTailCallReturn = false;
  if (MBB.end() != MBBI) {
    unsigned RetOpcode = MBBI->getOpcode();
    IsTailCallReturn = RetOpcode == AArch64::TCRETURNdi ||
                       RetOpcode == AArch64::TCRETURNri ||
                       RetOpcode == AArch64::TCRETURNriBTI;
  }
  AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();

  int64_t ArgumentPopSize = 0;
  if (IsTailCallReturn) {
    MachineOperand &StackAdjust = MBBI->getOperand(1);

    // For a tail-call in a callee-pops-arguments environment, some or all of
    // the stack may actually be in use for the call's arguments; this is
    // calculated during LowerCall and consumed here...
    ArgumentPopSize = StackAdjust.getImm();
  } else {
    // ... otherwise the amount to pop is *all* of the argument space,
    // conveniently stored in the MachineFunctionInfo by
    // LowerFormalArguments. This will, of course, be zero for the C calling
    // convention.
    ArgumentPopSize = AFI->getArgumentStackToRestore();
  }

  return ArgumentPopSize;
}

static bool produceCompactUnwindFrame(MachineFunction &MF);
static bool needsWinCFI(const MachineFunction &MF);
static StackOffset getSVEStackSize(const MachineFunction &MF);

/// Returns true if homogeneous prolog or epilog code can be emitted
/// for the size optimization. If possible, a frame helper call is injected.
/// When an Exit block is given, this check is for the epilog.
bool AArch64FrameLowering::homogeneousPrologEpilog(
    MachineFunction &MF, MachineBasicBlock *Exit) const {
  if (!MF.getFunction().hasMinSize())
    return false;
  if (!EnableHomogeneousPrologEpilog)
    return false;
  if (ReverseCSRRestoreSeq)
    return false;
  if (EnableRedZone)
    return false;

  // TODO: Windows is not supported yet.
  if (needsWinCFI(MF))
    return false;
  // TODO: SVE is not supported yet.
  if (getSVEStackSize(MF))
    return false;

  // Bail on stack adjustment needed on return for simplicity.
  const MachineFrameInfo &MFI = MF.getFrameInfo();
  const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo();
  if (MFI.hasVarSizedObjects() || RegInfo->hasStackRealignment(MF))
    return false;
  if (Exit && getArgumentStackToRestore(MF, *Exit))
    return false;

  return true;
}

/// Returns true if CSRs should be paired.
bool AArch64FrameLowering::producePairRegisters(MachineFunction &MF) const {
  return produceCompactUnwindFrame(MF) || homogeneousPrologEpilog(MF);
}

/// This is the biggest offset to the stack pointer we can encode in aarch64
/// instructions (without using a separate calculation and a temp register).
/// Note that the exceptions here are vector stores/loads which cannot encode
/// any displacements (see estimateRSStackSizeLimit(),
/// isAArch64FrameOffsetLegal()).
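/// The value 255 corresponds to the unscaled LDUR/STUR addressing mode, whose
/// immediate is a signed 9-bit byte offset, i.e. [-256, 255]: for instance,
/// "ldur x0, [sp, #255]" encodes directly, while #256 would need a separate
/// offset computation in a scratch register.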
static const unsigned DefaultSafeSPDisplacement = 255;

/// Look at each instruction that references stack frames and return the stack
/// size limit beyond which some of these instructions will require a scratch
/// register during their expansion later.
static unsigned estimateRSStackSizeLimit(MachineFunction &MF) {
  // FIXME: For now, just conservatively guesstimate based on unscaled indexing
  // range. We'll end up allocating an unnecessary spill slot a lot, but
  // realistically that's not a big deal at this stage of the game.
  for (MachineBasicBlock &MBB : MF) {
    for (MachineInstr &MI : MBB) {
      if (MI.isDebugInstr() || MI.isPseudo() ||
          MI.getOpcode() == AArch64::ADDXri ||
          MI.getOpcode() == AArch64::ADDSXri)
        continue;

      for (const MachineOperand &MO : MI.operands()) {
        if (!MO.isFI())
          continue;

        StackOffset Offset;
        if (isAArch64FrameOffsetLegal(MI, Offset, nullptr, nullptr, nullptr) ==
            AArch64FrameOffsetCannotUpdate)
          return 0;
      }
    }
  }
  return DefaultSafeSPDisplacement;
}

TargetStackID::Value
AArch64FrameLowering::getStackIDForScalableVectors() const {
  return TargetStackID::ScalableVector;
}

/// Returns the size of the fixed object area (allocated next to sp on entry)
/// On Win64 this may include a var args area and an UnwindHelp object for EH.
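/// For illustration (assuming the usual Win64 convention of spilling the
/// unnamed x0-x7 remainder): a Win64 vararg function with three named GPR
/// arguments has a 5 * 8 = 40 byte varargs area; with EH funclets an 8-byte
/// UnwindHelp slot is added, and alignTo(48, 16) yields a 48-byte fixed area.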
static unsigned getFixedObjectSize(const MachineFunction &MF,
                                   const AArch64FunctionInfo *AFI, bool IsWin64,
                                   bool IsFunclet) {
  if (!IsWin64 || IsFunclet) {
    return AFI->getTailCallReservedStack();
  } else {
    if (AFI->getTailCallReservedStack() != 0)
      report_fatal_error("cannot generate ABI-changing tail call for Win64");
    // Var args are stored here in the primary function.
    const unsigned VarArgsArea = AFI->getVarArgsGPRSize();
    // To support EH funclets we allocate an UnwindHelp object
    const unsigned UnwindHelpObject = (MF.hasEHFunclets() ? 8 : 0);
    return alignTo(VarArgsArea + UnwindHelpObject, 16);
  }
}

/// Returns the size of the entire SVE stackframe (calleesaves + spills).
static StackOffset getSVEStackSize(const MachineFunction &MF) {
  const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
  return StackOffset::getScalable((int64_t)AFI->getStackSizeSVE());
}

bool AArch64FrameLowering::canUseRedZone(const MachineFunction &MF) const {
  if (!EnableRedZone)
    return false;

  // Don't use the red zone if the function explicitly asks us not to.
  // This is typically used for kernel code.
  const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
  const unsigned RedZoneSize =
      Subtarget.getTargetLowering()->getRedZoneSize(MF.getFunction());
  if (!RedZoneSize)
    return false;

  const MachineFrameInfo &MFI = MF.getFrameInfo();
  const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
  uint64_t NumBytes = AFI->getLocalStackSize();

  return !(MFI.hasCalls() || hasFP(MF) || NumBytes > RedZoneSize ||
           getSVEStackSize(MF));
}

/// hasFP - Return true if the specified function should have a dedicated frame
/// pointer register.
bool AArch64FrameLowering::hasFP(const MachineFunction &MF) const {
  const MachineFrameInfo &MFI = MF.getFrameInfo();
  const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo();
  // Win64 EH requires a frame pointer if funclets are present, as the locals
  // are accessed off the frame pointer in both the parent function and the
  // funclets.
  if (MF.hasEHFunclets())
    return true;
  // Retain behavior of always omitting the FP for leaf functions when
  // possible.
  if (MF.getTarget().Options.DisableFramePointerElim(MF))
    return true;
  if (MFI.hasVarSizedObjects() || MFI.isFrameAddressTaken() ||
      MFI.hasStackMap() || MFI.hasPatchPoint() ||
      RegInfo->hasStackRealignment(MF))
    return true;
  // With large callframes around we may need to use FP to access the
  // scavenging emergency spillslot.
  //
  // Unfortunately some calls to hasFP() like machine verifier ->
  // getReservedReg() -> hasFP in the middle of global isel are too early
  // to know the max call frame size. Hopefully conservatively returning "true"
  // in those cases is fine.
  // DefaultSafeSPDisplacement is fine as we only emergency spill GP regs.
  if (!MFI.isMaxCallFrameSizeComputed() ||
      MFI.getMaxCallFrameSize() > DefaultSafeSPDisplacement)
    return true;

  return false;
}

/// hasReservedCallFrame - Under normal circumstances, when a frame pointer is
/// not required, we reserve argument space for call sites in the function
/// immediately on entry to the current function. This eliminates the need for
/// add/sub sp brackets around call sites. Returns true if the call frame is
/// included as part of the stack frame.
bool
AArch64FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const {
  return !MF.getFrameInfo().hasVarSizedObjects();
}

MachineBasicBlock::iterator AArch64FrameLowering::eliminateCallFramePseudoInstr(
    MachineFunction &MF, MachineBasicBlock &MBB,
    MachineBasicBlock::iterator I) const {
  const AArch64InstrInfo *TII =
      static_cast<const AArch64InstrInfo *>(MF.getSubtarget().getInstrInfo());
  DebugLoc DL = I->getDebugLoc();
  unsigned Opc = I->getOpcode();
  bool IsDestroy = Opc == TII->getCallFrameDestroyOpcode();
  uint64_t CalleePopAmount = IsDestroy ? I->getOperand(1).getImm() : 0;

  if (!hasReservedCallFrame(MF)) {
    int64_t Amount = I->getOperand(0).getImm();
    Amount = alignTo(Amount, getStackAlign());
    if (!IsDestroy)
      Amount = -Amount;

    // N.b. if CalleePopAmount is valid but zero (i.e. callee would pop, but it
    // doesn't have to pop anything), then the first operand will be zero too
    // so this adjustment is a no-op.
    if (CalleePopAmount == 0) {
      // FIXME: in-function stack adjustment for calls is limited to 24-bits
      // because there's no guaranteed temporary register available.
      //
      // ADD/SUB (immediate) has only LSL #0 and LSL #12 available.
      // 1) For offset <= 12-bit, we use LSL #0
      // 2) For 12-bit <= offset <= 24-bit, we use two instructions. One uses
      //    LSL #0, and the other uses LSL #12.
      //
      // Most call frames will be allocated at the start of a function so
      // this is OK, but it is a limitation that needs dealing with.
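      //
      // For example (an illustrative sketch): a 0x12345-byte adjustment can
      // be emitted as the pair
      //     sub sp, sp, #0x12, lsl #12   ; 0x12000
      //     sub sp, sp, #0x345           ; remaining 0x345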
      assert(Amount > -0xffffff && Amount < 0xffffff && "call frame too large");
      emitFrameOffset(MBB, I, DL, AArch64::SP, AArch64::SP,
                      StackOffset::getFixed(Amount), TII);
    }
  } else if (CalleePopAmount != 0) {
    // If the calling convention demands that the callee pops arguments from
    // the stack, we want to add it back if we have a reserved call frame.
    assert(CalleePopAmount < 0xffffff && "call frame too large");
    emitFrameOffset(MBB, I, DL, AArch64::SP, AArch64::SP,
                    StackOffset::getFixed(-(int64_t)CalleePopAmount), TII);
  }
  return MBB.erase(I);
}

// Convenience function to create a DWARF expression for
//   Expr + NumBytes + NumVGScaledBytes * AArch64::VG
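//
// For instance (a sketch of the appended operator sequence, not the exact
// encoded bytes): NumBytes = 16 and NumVGScaledBytes = -2 appends
//   DW_OP_consts 16, DW_OP_plus,
//   DW_OP_consts -2, DW_OP_bregx VG+0, DW_OP_mul, DW_OP_plus
// and the comment stream reads "... + 16 - 2 * VG".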
static void appendVGScaledOffsetExpr(SmallVectorImpl<char> &Expr,
                                     int NumBytes, int NumVGScaledBytes,
                                     unsigned VG,
                                     llvm::raw_string_ostream &Comment) {
  uint8_t buffer[16];

  if (NumBytes) {
    Expr.push_back(dwarf::DW_OP_consts);
    Expr.append(buffer, buffer + encodeSLEB128(NumBytes, buffer));
    Expr.push_back((uint8_t)dwarf::DW_OP_plus);
    Comment << (NumBytes < 0 ? " - " : " + ") << std::abs(NumBytes);
  }

  if (NumVGScaledBytes) {
    Expr.push_back((uint8_t)dwarf::DW_OP_consts);
    Expr.append(buffer, buffer + encodeSLEB128(NumVGScaledBytes, buffer));

    Expr.push_back((uint8_t)dwarf::DW_OP_bregx);
    Expr.append(buffer, buffer + encodeULEB128(VG, buffer));
    Expr.push_back(0);

    Expr.push_back((uint8_t)dwarf::DW_OP_mul);
    Expr.push_back((uint8_t)dwarf::DW_OP_plus);

    Comment << (NumVGScaledBytes < 0 ? " - " : " + ")
            << std::abs(NumVGScaledBytes) << " * VG";
  }
}

// Creates an MCCFIInstruction:
//    { DW_CFA_def_cfa_expression, ULEB128 (sizeof expr), expr }
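// The expression itself describes the CFA as
//    SP + NumBytes + NumVGScaledBytes * VG
// so, sketching an example, an offset of 16 fixed bytes plus 16 VG-scaled
// bytes produces an escape whose comment reads "sp + 16 + 16 * VG".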
MCCFIInstruction AArch64FrameLowering::createDefCFAExpressionFromSP(
    const TargetRegisterInfo &TRI, const StackOffset &OffsetFromSP) const {
  int64_t NumBytes, NumVGScaledBytes;
  AArch64InstrInfo::decomposeStackOffsetForDwarfOffsets(OffsetFromSP, NumBytes,
                                                        NumVGScaledBytes);

  std::string CommentBuffer = "sp";
  llvm::raw_string_ostream Comment(CommentBuffer);

  // Build up the expression (SP + NumBytes + NumVGScaledBytes * AArch64::VG)
  SmallString<64> Expr;
  Expr.push_back((uint8_t)(dwarf::DW_OP_breg0 + /*SP*/ 31));
  Expr.push_back(0);
  appendVGScaledOffsetExpr(Expr, NumBytes, NumVGScaledBytes,
                           TRI.getDwarfRegNum(AArch64::VG, true), Comment);

  // Wrap this into DW_CFA_def_cfa.
  SmallString<64> DefCfaExpr;
  DefCfaExpr.push_back(dwarf::DW_CFA_def_cfa_expression);
  uint8_t buffer[16];
  DefCfaExpr.append(buffer, buffer + encodeULEB128(Expr.size(), buffer));
  DefCfaExpr.append(Expr.str());
  return MCCFIInstruction::createEscape(nullptr, DefCfaExpr.str(),
                                        Comment.str());
}

MCCFIInstruction AArch64FrameLowering::createCfaOffset(
    const TargetRegisterInfo &TRI, unsigned Reg,
    const StackOffset &OffsetFromDefCFA) const {
  int64_t NumBytes, NumVGScaledBytes;
  AArch64InstrInfo::decomposeStackOffsetForDwarfOffsets(
      OffsetFromDefCFA, NumBytes, NumVGScaledBytes);

  unsigned DwarfReg = TRI.getDwarfRegNum(Reg, true);

  // Non-scalable offsets can use DW_CFA_offset directly.
  if (!NumVGScaledBytes)
    return MCCFIInstruction::createOffset(nullptr, DwarfReg, NumBytes);

  std::string CommentBuffer;
  llvm::raw_string_ostream Comment(CommentBuffer);
  Comment << printReg(Reg, &TRI) << " @ cfa";

  // Build up expression (NumBytes + NumVGScaledBytes * AArch64::VG)
  SmallString<64> OffsetExpr;
  appendVGScaledOffsetExpr(OffsetExpr, NumBytes, NumVGScaledBytes,
                           TRI.getDwarfRegNum(AArch64::VG, true), Comment);

  // Wrap this into DW_CFA_expression
  SmallString<64> CfaExpr;
  CfaExpr.push_back(dwarf::DW_CFA_expression);
  uint8_t buffer[16];
  CfaExpr.append(buffer, buffer + encodeULEB128(DwarfReg, buffer));
  CfaExpr.append(buffer, buffer + encodeULEB128(OffsetExpr.size(), buffer));
  CfaExpr.append(OffsetExpr.str());

  return MCCFIInstruction::createEscape(nullptr, CfaExpr.str(), Comment.str());
}

void AArch64FrameLowering::emitCalleeSavedFrameMoves(
    MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) const {
  MachineFunction &MF = *MBB.getParent();
  MachineFrameInfo &MFI = MF.getFrameInfo();
  const TargetSubtargetInfo &STI = MF.getSubtarget();
  const TargetRegisterInfo *TRI = STI.getRegisterInfo();
  const TargetInstrInfo *TII = STI.getInstrInfo();
  DebugLoc DL = MBB.findDebugLoc(MBBI);

  // Add callee saved registers to move list.
  const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo();
  if (CSI.empty())
    return;

  for (const auto &Info : CSI) {
    unsigned Reg = Info.getReg();

    // Not all unwinders may know about SVE registers, so assume the lowest
    // common denominator.
    unsigned NewReg;
    if (static_cast<const AArch64RegisterInfo *>(TRI)->regNeedsCFI(Reg, NewReg))
      Reg = NewReg;
    else
      continue;

    StackOffset Offset;
    if (MFI.getStackID(Info.getFrameIdx()) == TargetStackID::ScalableVector) {
      AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
      Offset =
          StackOffset::getScalable(MFI.getObjectOffset(Info.getFrameIdx())) -
          StackOffset::getFixed(AFI->getCalleeSavedStackSize(MFI));
    } else {
      Offset = StackOffset::getFixed(MFI.getObjectOffset(Info.getFrameIdx()) -
                                     getOffsetOfLocalArea());
    }
    unsigned CFIIndex = MF.addFrameInst(createCfaOffset(*TRI, Reg, Offset));
    BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
        .addCFIIndex(CFIIndex)
        .setMIFlags(MachineInstr::FrameSetup);
  }
}

// Find a scratch register that we can use at the start of the prologue to
// re-align the stack pointer. We avoid using callee-save registers since they
// may appear to be free when this is called from canUseAsPrologue (during
// shrink wrapping), but then no longer be free when this is called from
// emitPrologue.
//
// FIXME: This is a bit conservative, since in the above case we could use one
// of the callee-save registers as a scratch temp to re-align the stack
// pointer, but we would then have to make sure that we were in fact saving at
// least one callee-save register in the prologue, which is additional
// complexity that doesn't seem worth the benefit.
static unsigned findScratchNonCalleeSaveRegister(MachineBasicBlock *MBB) {
  MachineFunction *MF = MBB->getParent();

  // If MBB is an entry block, use X9 as the scratch register
  if (&MF->front() == MBB)
    return AArch64::X9;

  const AArch64Subtarget &Subtarget = MF->getSubtarget<AArch64Subtarget>();
  const AArch64RegisterInfo &TRI = *Subtarget.getRegisterInfo();
  LivePhysRegs LiveRegs(TRI);
  LiveRegs.addLiveIns(*MBB);

  // Mark callee saved registers as used so we will not choose them.
  const MCPhysReg *CSRegs = MF->getRegInfo().getCalleeSavedRegs();
  for (unsigned i = 0; CSRegs[i]; ++i)
    LiveRegs.addReg(CSRegs[i]);

  // Prefer X9 since it was historically used for the prologue scratch reg.
  const MachineRegisterInfo &MRI = MF->getRegInfo();
  if (LiveRegs.available(MRI, AArch64::X9))
    return AArch64::X9;

  for (unsigned Reg : AArch64::GPR64RegClass) {
    if (LiveRegs.available(MRI, Reg))
      return Reg;
  }
  return AArch64::NoRegister;
}

bool AArch64FrameLowering::canUseAsPrologue(
    const MachineBasicBlock &MBB) const {
  const MachineFunction *MF = MBB.getParent();
  MachineBasicBlock *TmpMBB = const_cast<MachineBasicBlock *>(&MBB);
  const AArch64Subtarget &Subtarget = MF->getSubtarget<AArch64Subtarget>();
  const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo();

  // Don't need a scratch register if we're not going to re-align the stack.
  if (!RegInfo->hasStackRealignment(*MF))
    return true;
  // Otherwise, we can use any block as long as it has a scratch register
  // available.
  return findScratchNonCalleeSaveRegister(TmpMBB) != AArch64::NoRegister;
}

static bool windowsRequiresStackProbe(MachineFunction &MF,
                                      uint64_t StackSizeInBytes) {
  const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
  if (!Subtarget.isTargetWindows())
    return false;
  const Function &F = MF.getFunction();
  // TODO: When implementing stack protectors, take that into account
  // for the probe threshold.
  unsigned StackProbeSize = 4096;
  if (F.hasFnAttribute("stack-probe-size"))
    F.getFnAttribute("stack-probe-size")
        .getValueAsString()
        .getAsInteger(0, StackProbeSize);
  return (StackSizeInBytes >= StackProbeSize) &&
         !F.hasFnAttribute("no-stack-arg-probe");
}

static bool needsWinCFI(const MachineFunction &MF) {
  const Function &F = MF.getFunction();
  return MF.getTarget().getMCAsmInfo()->usesWindowsCFI() &&
         F.needsUnwindTableEntry();
}

bool AArch64FrameLowering::shouldCombineCSRLocalStackBump(
    MachineFunction &MF, uint64_t StackBumpBytes) const {
  AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
  const MachineFrameInfo &MFI = MF.getFrameInfo();
  const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
  const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
  if (homogeneousPrologEpilog(MF))
    return false;

  if (AFI->getLocalStackSize() == 0)
    return false;

  // For WinCFI, if optimizing for size, prefer to not combine the stack bump
  // (to force a stp with predecrement) to match the packed unwind format,
  // provided that there actually are any callee saved registers to merge the
  // decrement with.
  // This is potentially marginally slower, but allows using the packed
  // unwind format for functions that both have a local area and callee saved
  // registers. Using the packed unwind format notably reduces the size of
  // the unwind info.
  if (needsWinCFI(MF) && AFI->getCalleeSavedStackSize() > 0 &&
      MF.getFunction().hasOptSize())
    return false;

  // 512 is the maximum immediate for stp/ldp that will be used for
  // callee-save save/restores
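  // (the pre/post-indexed forms take a signed 7-bit immediate scaled by 8, so
  // an X-register pair can reach offsets in [-512, 504]; e.g.
  // "stp x19, x20, [sp, #-512]!" is encodable, while #-528 is not).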
  if (StackBumpBytes >= 512 || windowsRequiresStackProbe(MF, StackBumpBytes))
    return false;

  if (MFI.hasVarSizedObjects())
    return false;

  if (RegInfo->hasStackRealignment(MF))
    return false;

  // This isn't strictly necessary, but it simplifies things a bit since the
  // current RedZone handling code assumes the SP is adjusted by the
  // callee-save save/restore code.
  if (canUseRedZone(MF))
    return false;

  // When there is an SVE area on the stack, always allocate the
  // callee-saves and spills/locals separately.
  if (getSVEStackSize(MF))
    return false;

  return true;
}

bool AArch64FrameLowering::shouldCombineCSRLocalStackBumpInEpilogue(
    MachineBasicBlock &MBB, unsigned StackBumpBytes) const {
  if (!shouldCombineCSRLocalStackBump(*MBB.getParent(), StackBumpBytes))
    return false;

  if (MBB.empty())
    return true;

  // Disable combined SP bump if the last instruction is an MTE tag store. It
  // is almost always better to merge SP adjustment into those instructions.
  MachineBasicBlock::iterator LastI = MBB.getFirstTerminator();
  MachineBasicBlock::iterator Begin = MBB.begin();
  while (LastI != Begin) {
    --LastI;
    if (LastI->isTransient())
      continue;
    if (!LastI->getFlag(MachineInstr::FrameDestroy))
      break;
  }
  switch (LastI->getOpcode()) {
  case AArch64::STGloop:
  case AArch64::STZGloop:
  case AArch64::STGOffset:
  case AArch64::STZGOffset:
  case AArch64::ST2GOffset:
  case AArch64::STZ2GOffset:
    return false;
  default:
    return true;
  }
  llvm_unreachable("unreachable");
}

// Given a load or a store instruction, generate an appropriate unwinding SEH
// code on Windows.
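// For example (a sketch; the exact directive spelling depends on the
// assembler): the prologue store "stp x19, x20, [sp, #-32]!" is paired with
// an SEH_SaveRegP_X pseudo that prints as ".seh_save_regp_x x19, 32".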
static MachineBasicBlock::iterator InsertSEH(MachineBasicBlock::iterator MBBI,
                                             const TargetInstrInfo &TII,
                                             MachineInstr::MIFlag Flag) {
  unsigned Opc = MBBI->getOpcode();
  MachineBasicBlock *MBB = MBBI->getParent();
  MachineFunction &MF = *MBB->getParent();
  DebugLoc DL = MBBI->getDebugLoc();
  unsigned ImmIdx = MBBI->getNumOperands() - 1;
  int Imm = MBBI->getOperand(ImmIdx).getImm();
  MachineInstrBuilder MIB;
  const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
  const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo();

  switch (Opc) {
  default:
    llvm_unreachable("No SEH Opcode for this instruction");
  case AArch64::LDPDpost:
    Imm = -Imm;
    LLVM_FALLTHROUGH;
  case AArch64::STPDpre: {
    unsigned Reg0 = RegInfo->getSEHRegNum(MBBI->getOperand(1).getReg());
    unsigned Reg1 = RegInfo->getSEHRegNum(MBBI->getOperand(2).getReg());
    MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveFRegP_X))
              .addImm(Reg0)
              .addImm(Reg1)
              .addImm(Imm * 8)
              .setMIFlag(Flag);
    break;
  }
  case AArch64::LDPXpost:
    Imm = -Imm;
    LLVM_FALLTHROUGH;
  case AArch64::STPXpre: {
    Register Reg0 = MBBI->getOperand(1).getReg();
    Register Reg1 = MBBI->getOperand(2).getReg();
    if (Reg0 == AArch64::FP && Reg1 == AArch64::LR)
      MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveFPLR_X))
                .addImm(Imm * 8)
                .setMIFlag(Flag);
    else
      MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveRegP_X))
                .addImm(RegInfo->getSEHRegNum(Reg0))
                .addImm(RegInfo->getSEHRegNum(Reg1))
                .addImm(Imm * 8)
                .setMIFlag(Flag);
    break;
  }
  case AArch64::LDRDpost:
    Imm = -Imm;
    LLVM_FALLTHROUGH;
  case AArch64::STRDpre: {
    unsigned Reg = RegInfo->getSEHRegNum(MBBI->getOperand(1).getReg());
    MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveFReg_X))
              .addImm(Reg)
              .addImm(Imm)
              .setMIFlag(Flag);
    break;
  }
  case AArch64::LDRXpost:
    Imm = -Imm;
    LLVM_FALLTHROUGH;
  case AArch64::STRXpre: {
    unsigned Reg = RegInfo->getSEHRegNum(MBBI->getOperand(1).getReg());
    MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveReg_X))
              .addImm(Reg)
              .addImm(Imm)
              .setMIFlag(Flag);
    break;
  }
  case AArch64::STPDi:
  case AArch64::LDPDi: {
    unsigned Reg0 = RegInfo->getSEHRegNum(MBBI->getOperand(0).getReg());
    unsigned Reg1 = RegInfo->getSEHRegNum(MBBI->getOperand(1).getReg());
    MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveFRegP))
              .addImm(Reg0)
              .addImm(Reg1)
              .addImm(Imm * 8)
              .setMIFlag(Flag);
    break;
  }
  case AArch64::STPXi:
  case AArch64::LDPXi: {
    Register Reg0 = MBBI->getOperand(0).getReg();
    Register Reg1 = MBBI->getOperand(1).getReg();
    if (Reg0 == AArch64::FP && Reg1 == AArch64::LR)
      MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveFPLR))
                .addImm(Imm * 8)
                .setMIFlag(Flag);
    else
      MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveRegP))
                .addImm(RegInfo->getSEHRegNum(Reg0))
                .addImm(RegInfo->getSEHRegNum(Reg1))
                .addImm(Imm * 8)
                .setMIFlag(Flag);
    break;
  }
  case AArch64::STRXui:
  case AArch64::LDRXui: {
    int Reg = RegInfo->getSEHRegNum(MBBI->getOperand(0).getReg());
    MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveReg))
              .addImm(Reg)
              .addImm(Imm * 8)
              .setMIFlag(Flag);
    break;
  }
  case AArch64::STRDui:
  case AArch64::LDRDui: {
    unsigned Reg = RegInfo->getSEHRegNum(MBBI->getOperand(0).getReg());
    MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveFReg))
              .addImm(Reg)
              .addImm(Imm * 8)
              .setMIFlag(Flag);
    break;
  }
  }
  auto I = MBB->insertAfter(MBBI, MIB);
  return I;
}

// Fix up the SEH opcode associated with the save/restore instruction.
static void fixupSEHOpcode(MachineBasicBlock::iterator MBBI,
                           unsigned LocalStackSize) {
  MachineOperand *ImmOpnd = nullptr;
  unsigned ImmIdx = MBBI->getNumOperands() - 1;
  switch (MBBI->getOpcode()) {
  default:
    llvm_unreachable("Fix the offset in the SEH instruction");
  case AArch64::SEH_SaveFPLR:
  case AArch64::SEH_SaveRegP:
  case AArch64::SEH_SaveReg:
  case AArch64::SEH_SaveFRegP:
  case AArch64::SEH_SaveFReg:
    ImmOpnd = &MBBI->getOperand(ImmIdx);
    break;
  }
  if (ImmOpnd)
    ImmOpnd->setImm(ImmOpnd->getImm() + LocalStackSize);
}

// Convert callee-save register save/restore instruction to do stack pointer
// decrement/increment to allocate/deallocate the callee-save stack area by
// converting store/load to use pre/post increment version.
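// For example (illustrative): the first callee-save store
//     stp x29, x30, [sp, #0]
// becomes, with CSStackSizeInc = -16,
//     stp x29, x30, [sp, #-16]!
// so the store itself performs the stack allocation.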
static MachineBasicBlock::iterator convertCalleeSaveRestoreToSPPrePostIncDec(
    MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
    const DebugLoc &DL, const TargetInstrInfo *TII, int CSStackSizeInc,
    bool NeedsWinCFI, bool *HasWinCFI, bool InProlog = true) {
  // Ignore instructions that do not operate on SP, i.e. shadow call stack
  // instructions and associated CFI instruction.
  while (MBBI->getOpcode() == AArch64::STRXpost ||
         MBBI->getOpcode() == AArch64::LDRXpre ||
         MBBI->getOpcode() == AArch64::CFI_INSTRUCTION) {
    if (MBBI->getOpcode() != AArch64::CFI_INSTRUCTION)
      assert(MBBI->getOperand(0).getReg() != AArch64::SP);
    ++MBBI;
  }
  unsigned NewOpc;
  switch (MBBI->getOpcode()) {
  default:
    llvm_unreachable("Unexpected callee-save save/restore opcode!");
  case AArch64::STPXi:
    NewOpc = AArch64::STPXpre;
    break;
  case AArch64::STPDi:
    NewOpc = AArch64::STPDpre;
    break;
  case AArch64::STPQi:
    NewOpc = AArch64::STPQpre;
    break;
  case AArch64::STRXui:
    NewOpc = AArch64::STRXpre;
    break;
  case AArch64::STRDui:
    NewOpc = AArch64::STRDpre;
    break;
  case AArch64::STRQui:
    NewOpc = AArch64::STRQpre;
    break;
  case AArch64::LDPXi:
    NewOpc = AArch64::LDPXpost;
    break;
  case AArch64::LDPDi:
    NewOpc = AArch64::LDPDpost;
    break;
  case AArch64::LDPQi:
    NewOpc = AArch64::LDPQpost;
    break;
  case AArch64::LDRXui:
    NewOpc = AArch64::LDRXpost;
    break;
  case AArch64::LDRDui:
    NewOpc = AArch64::LDRDpost;
    break;
  case AArch64::LDRQui:
    NewOpc = AArch64::LDRQpost;
    break;
  }
  // Get rid of the SEH code associated with the old instruction.
  if (NeedsWinCFI) {
    auto SEH = std::next(MBBI);
    if (AArch64InstrInfo::isSEHInstruction(*SEH))
      SEH->eraseFromParent();
  }

  TypeSize Scale = TypeSize::Fixed(1);
  unsigned Width;
  int64_t MinOffset, MaxOffset;
  bool Success = static_cast<const AArch64InstrInfo *>(TII)->getMemOpInfo(
      NewOpc, Scale, Width, MinOffset, MaxOffset);
  (void)Success;
  assert(Success && "unknown load/store opcode");

  // If the first store isn't right where we want SP then we can't fold the
  // update in so create a normal arithmetic instruction instead.
  if (MBBI->getOperand(MBBI->getNumOperands() - 1).getImm() != 0 ||
      CSStackSizeInc < MinOffset || CSStackSizeInc > MaxOffset) {
    emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP,
                    StackOffset::getFixed(CSStackSizeInc), TII,
                    InProlog ? MachineInstr::FrameSetup
                             : MachineInstr::FrameDestroy);
    return std::prev(MBBI);
  }

  MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII->get(NewOpc));
  MIB.addReg(AArch64::SP, RegState::Define);

  // Copy all operands other than the immediate offset.
  unsigned OpndIdx = 0;
  for (unsigned OpndEnd = MBBI->getNumOperands() - 1; OpndIdx < OpndEnd;
       ++OpndIdx)
    MIB.add(MBBI->getOperand(OpndIdx));

  assert(MBBI->getOperand(OpndIdx).getImm() == 0 &&
         "Unexpected immediate offset in first/last callee-save save/restore "
         "instruction!");
  assert(MBBI->getOperand(OpndIdx - 1).getReg() == AArch64::SP &&
         "Unexpected base register in callee-save save/restore instruction!");
  assert(CSStackSizeInc % Scale == 0);
  MIB.addImm(CSStackSizeInc / (int)Scale);

  MIB.setMIFlags(MBBI->getFlags());
  MIB.setMemRefs(MBBI->memoperands());

  // Generate a new SEH code that corresponds to the new instruction.
  if (NeedsWinCFI) {
    *HasWinCFI = true;
    InsertSEH(*MIB, *TII,
              InProlog ? MachineInstr::FrameSetup : MachineInstr::FrameDestroy);
  }

  return std::prev(MBB.erase(MBBI));
}

// Fixup callee-save register save/restore instructions to take into account
// combined SP bump by adding the local stack size to the stack offsets.
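// For example (illustrative): with a 32-byte local area folded into the
// initial SP bump, the save
//     stp x19, x20, [sp, #16]
// is rewritten as
//     stp x19, x20, [sp, #48]
// (the scaled immediate is increased by LocalStackSize / Scale = 4).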
static void fixupCalleeSaveRestoreStackOffset(MachineInstr &MI,
                                              uint64_t LocalStackSize,
                                              bool NeedsWinCFI,
                                              bool *HasWinCFI) {
  if (AArch64InstrInfo::isSEHInstruction(MI))
    return;

  unsigned Opc = MI.getOpcode();

  // Ignore instructions that do not operate on SP, i.e. shadow call stack
  // instructions and associated CFI instruction.
  if (Opc == AArch64::STRXpost || Opc == AArch64::LDRXpre ||
      Opc == AArch64::CFI_INSTRUCTION) {
    if (Opc != AArch64::CFI_INSTRUCTION)
      assert(MI.getOperand(0).getReg() != AArch64::SP);
    return;
  }

  unsigned Scale;
  switch (Opc) {
  case AArch64::STPXi:
  case AArch64::STRXui:
  case AArch64::STPDi:
  case AArch64::STRDui:
  case AArch64::LDPXi:
  case AArch64::LDRXui:
  case AArch64::LDPDi:
  case AArch64::LDRDui:
    Scale = 8;
    break;
  case AArch64::STPQi:
  case AArch64::STRQui:
  case AArch64::LDPQi:
  case AArch64::LDRQui:
    Scale = 16;
    break;
  default:
    llvm_unreachable("Unexpected callee-save save/restore opcode!");
  }

  unsigned OffsetIdx = MI.getNumExplicitOperands() - 1;
  assert(MI.getOperand(OffsetIdx - 1).getReg() == AArch64::SP &&
         "Unexpected base register in callee-save save/restore instruction!");
  // Last operand is immediate offset that needs fixing.
  MachineOperand &OffsetOpnd = MI.getOperand(OffsetIdx);
  // All generated opcodes have scaled offsets.
  assert(LocalStackSize % Scale == 0);
  OffsetOpnd.setImm(OffsetOpnd.getImm() + LocalStackSize / Scale);

  if (NeedsWinCFI) {
    *HasWinCFI = true;
    auto MBBI = std::next(MachineBasicBlock::iterator(MI));
    assert(MBBI != MI.getParent()->end() && "Expecting a valid instruction");
    assert(AArch64InstrInfo::isSEHInstruction(*MBBI) &&
           "Expecting a SEH instruction");
    fixupSEHOpcode(MBBI, LocalStackSize);
  }
}

static void adaptForLdStOpt(MachineBasicBlock &MBB,
                            MachineBasicBlock::iterator FirstSPPopI,
                            MachineBasicBlock::iterator LastPopI) {
  // Sometimes (when we restore in the same order as we save), we can end up
  // with code like this:
  //
  // ldp      x26, x25, [sp]
  // ldp      x24, x23, [sp, #16]
  // ldp      x22, x21, [sp, #32]
  // ldp      x20, x19, [sp, #48]
  // add      sp, sp, #64
  //
  // In this case, it is always better to put the first ldp at the end, so
  // that the load-store optimizer can run and merge the ldp and the add into
  // a post-index ldp.
  // If we managed to grab the first pop instruction, move it to the end.
  if (ReverseCSRRestoreSeq)
    MBB.splice(FirstSPPopI, &MBB, LastPopI);
  // We should end up with something like this now:
  //
  // ldp      x24, x23, [sp, #16]
  // ldp      x22, x21, [sp, #32]
  // ldp      x20, x19, [sp, #48]
  // ldp      x26, x25, [sp]
  // add      sp, sp, #64
  //
  // and the load-store optimizer can merge the last two instructions into:
  //
  // ldp      x26, x25, [sp], #64
  //
}

static bool isTargetWindows(const MachineFunction &MF) {
  return MF.getSubtarget<AArch64Subtarget>().isTargetWindows();
}

// Convenience function to determine whether I is an SVE callee save.
static bool IsSVECalleeSave(MachineBasicBlock::iterator I) {
  switch (I->getOpcode()) {
  default:
    return false;
  case AArch64::STR_ZXI:
  case AArch64::STR_PXI:
  case AArch64::LDR_ZXI:
  case AArch64::LDR_PXI:
    return I->getFlag(MachineInstr::FrameSetup) ||
           I->getFlag(MachineInstr::FrameDestroy);
  }
}

void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
                                        MachineBasicBlock &MBB) const {
  MachineBasicBlock::iterator MBBI = MBB.begin();
  const MachineFrameInfo &MFI = MF.getFrameInfo();
  const Function &F = MF.getFunction();
  const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
  const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
  MachineModuleInfo &MMI = MF.getMMI();
  AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
  bool needsFrameMoves =
      MF.needsFrameMoves() && !MF.getTarget().getMCAsmInfo()->usesWindowsCFI();
  bool HasFP = hasFP(MF);
  bool NeedsWinCFI = needsWinCFI(MF);
  bool HasWinCFI = false;
  auto Cleanup = make_scope_exit([&]() { MF.setHasWinCFI(HasWinCFI); });

  bool IsFunclet = MBB.isEHFuncletEntry();

  // At this point, we're going to decide whether or not the function uses a
  // redzone. In most cases, the function doesn't have a redzone so let's
  // assume that's false and set it to true in the case that there's a redzone.
  AFI->setHasRedZone(false);

  // Debug location must be unknown since the first debug location is used
  // to determine the end of the prologue.
  DebugLoc DL;

  const auto &MFnI = *MF.getInfo<AArch64FunctionInfo>();
  if (MFnI.shouldSignReturnAddress()) {

    unsigned PACI;
    if (MFnI.shouldSignWithBKey()) {
      BuildMI(MBB, MBBI, DL, TII->get(AArch64::EMITBKEY))
          .setMIFlag(MachineInstr::FrameSetup);
      PACI = Subtarget.hasPAuth() ? AArch64::PACIB : AArch64::PACIBSP;
    } else {
      PACI = Subtarget.hasPAuth() ? AArch64::PACIA : AArch64::PACIASP;
    }

    auto MI = BuildMI(MBB, MBBI, DL, TII->get(PACI));
    if (Subtarget.hasPAuth())
      MI.addReg(AArch64::LR, RegState::Define)
          .addReg(AArch64::LR)
          .addReg(AArch64::SP, RegState::InternalRead);
    MI.setMIFlag(MachineInstr::FrameSetup);

    unsigned CFIIndex =
        MF.addFrameInst(MCCFIInstruction::createNegateRAState(nullptr));
    BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
        .addCFIIndex(CFIIndex)
        .setMIFlags(MachineInstr::FrameSetup);
  }

  // We signal the presence of a Swift extended frame to external tools by
  // storing FP with 0b0001 in bits 63:60. In normal userland operation a
  // simple ORR is sufficient; it is assumed a Swift kernel would initialize
  // the TBI bits so that is still true.
  if (HasFP && AFI->hasSwiftAsyncContext()) {
    // ORR x29, x29, #0x1000_0000_0000_0000
    BuildMI(MBB, MBBI, DL, TII->get(AArch64::ORRXri), AArch64::FP)
        .addUse(AArch64::FP)
        .addImm(0x1100)
        .setMIFlag(MachineInstr::FrameSetup);
  }

  // All calls are tail calls in GHC calling conv, and functions have no
  // prologue/epilogue.
  if (MF.getFunction().getCallingConv() == CallingConv::GHC)
    return;

  // Set tagged base pointer to the requested stack slot.
  // Ideally it should match the SP value after the prologue.
  Optional<int> TBPI = AFI->getTaggedBasePointerIndex();
  if (TBPI)
    AFI->setTaggedBasePointerOffset(-MFI.getObjectOffset(*TBPI));
  else
    AFI->setTaggedBasePointerOffset(MFI.getStackSize());

  const StackOffset &SVEStackSize = getSVEStackSize(MF);

  // getStackSize() includes all the locals in its size calculation. We don't
  // include these locals when computing the stack size of a funclet, as they
  // are allocated in the parent's stack frame and accessed via the frame
  // pointer from the funclet. We only save the callee saved registers in the
  // funclet, which are really the callee saved registers of the parent
  // function, including the funclet.
  int64_t NumBytes = IsFunclet ? getWinEHFuncletFrameSize(MF)
                               : MFI.getStackSize();
  if (!AFI->hasStackFrame() && !windowsRequiresStackProbe(MF, NumBytes)) {
    assert(!HasFP && "unexpected function without stack frame but with FP");
    assert(!SVEStackSize &&
           "unexpected function without stack frame but with SVE objects");
    // All of the stack allocation is for locals.
    AFI->setLocalStackSize(NumBytes);
    if (!NumBytes)
      return;
    // REDZONE: If the stack size is less than 128 bytes, we don't need
    // to actually allocate.
    if (canUseRedZone(MF)) {
      AFI->setHasRedZone(true);
      ++NumRedZoneFunctions;
    } else {
      emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP,
                      StackOffset::getFixed(-NumBytes), TII,
                      MachineInstr::FrameSetup, false, NeedsWinCFI, &HasWinCFI);
      if (!NeedsWinCFI && needsFrameMoves) {
        // Label used to tie together the PROLOG_LABEL and the MachineMoves.
        MCSymbol *FrameLabel = MMI.getContext().createTempSymbol();
        // Encode the stack size of the leaf function.
        unsigned CFIIndex = MF.addFrameInst(
            MCCFIInstruction::cfiDefCfaOffset(FrameLabel, NumBytes));
        BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
            .addCFIIndex(CFIIndex)
            .setMIFlags(MachineInstr::FrameSetup);
      }
    }

    if (NeedsWinCFI) {
      HasWinCFI = true;
      BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_PrologEnd))
          .setMIFlag(MachineInstr::FrameSetup);
    }

    return;
  }

  bool IsWin64 =
      Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv());
  unsigned FixedObject = getFixedObjectSize(MF, AFI, IsWin64, IsFunclet);

  auto PrologueSaveSize = AFI->getCalleeSavedStackSize() + FixedObject;
  // All of the remaining stack allocations are for locals.
  AFI->setLocalStackSize(NumBytes - PrologueSaveSize);
  bool CombineSPBump = shouldCombineCSRLocalStackBump(MF, NumBytes);
  bool HomPrologEpilog = homogeneousPrologEpilog(MF);
  if (CombineSPBump) {
    assert(!SVEStackSize && "Cannot combine SP bump with SVE");
    emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP,
                    StackOffset::getFixed(-NumBytes), TII,
                    MachineInstr::FrameSetup, false, NeedsWinCFI, &HasWinCFI);
    NumBytes = 0;
  } else if (HomPrologEpilog) {
    // Stack has been already adjusted.
    NumBytes -= PrologueSaveSize;
  } else if (PrologueSaveSize != 0) {
    MBBI = convertCalleeSaveRestoreToSPPrePostIncDec(
        MBB, MBBI, DL, TII, -PrologueSaveSize, NeedsWinCFI, &HasWinCFI);
    NumBytes -= PrologueSaveSize;
  }
  assert(NumBytes >= 0 && "Negative stack allocation size!?");

  // Move past the saves of the callee-saved registers, fixing up the offsets
  // and pre-inc if we decided to combine the callee-save and local stack
  // pointer bump above.
  MachineBasicBlock::iterator End = MBB.end();
  while (MBBI != End && MBBI->getFlag(MachineInstr::FrameSetup) &&
         !IsSVECalleeSave(MBBI)) {
    if (CombineSPBump)
      fixupCalleeSaveRestoreStackOffset(*MBBI, AFI->getLocalStackSize(),
                                        NeedsWinCFI, &HasWinCFI);
    ++MBBI;
  }

  // For funclets the FP belongs to the containing function.
  if (!IsFunclet && HasFP) {
    // Only set up FP if we actually need to.
    int64_t FPOffset = AFI->getCalleeSaveBaseToFrameRecordOffset();

    if (CombineSPBump)
      FPOffset += AFI->getLocalStackSize();

    if (AFI->hasSwiftAsyncContext()) {
      // Before we update the live FP we have to ensure there's a valid (or
      // null) asynchronous context in its slot just before FP in the frame
      // record, so store it now.
      const auto &Attrs = MF.getFunction().getAttributes();
      bool HaveInitialContext = Attrs.hasAttrSomewhere(Attribute::SwiftAsync);
      if (HaveInitialContext)
        MBB.addLiveIn(AArch64::X22);
      BuildMI(MBB, MBBI, DL, TII->get(AArch64::StoreSwiftAsyncContext))
          .addUse(HaveInitialContext ? AArch64::X22 : AArch64::XZR)
          .addUse(AArch64::SP)
          .addImm(FPOffset - 8)
          .setMIFlags(MachineInstr::FrameSetup);
    }

    if (HomPrologEpilog) {
      auto Prolog = MBBI;
      --Prolog;
      assert(Prolog->getOpcode() == AArch64::HOM_Prolog);
      Prolog->addOperand(MachineOperand::CreateImm(FPOffset));
    } else {
      // Issue    sub fp, sp, FPOffset or
      //          mov fp, sp    when FPOffset is zero.
      // Note: All stores of callee-saved registers are marked as "FrameSetup".
      // This code marks the instruction(s) that set the FP also.
      emitFrameOffset(MBB, MBBI, DL, AArch64::FP, AArch64::SP,
                      StackOffset::getFixed(FPOffset), TII,
                      MachineInstr::FrameSetup, false, NeedsWinCFI, &HasWinCFI);
    }
  }

  if (windowsRequiresStackProbe(MF, NumBytes)) {
    uint64_t NumWords = NumBytes >> 4;
    if (NeedsWinCFI) {
      HasWinCFI = true;
      // alloc_l can hold at most 256MB, so assume that NumBytes doesn't
      // exceed this amount. We need to move at most 2^24 - 1 into x15.
      // This is at most two instructions, MOVZ followed by MOVK.
      // TODO: Fix to use multiple stack alloc unwind codes for stacks
      // exceeding 256MB in size.
      if (NumBytes >= (1 << 28))
        report_fatal_error("Stack size cannot exceed 256MB for stack "
                           "unwinding purposes");
1317
1318 uint32_t LowNumWords = NumWords & 0xFFFF;
1319 BuildMI(MBB, MBBI, DL, TII->get(AArch64::MOVZXi), AArch64::X15)
1320 .addImm(LowNumWords)
1321 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0))
1322 .setMIFlag(MachineInstr::FrameSetup);
1323 BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
1324 .setMIFlag(MachineInstr::FrameSetup);
1325 if ((NumWords & 0xFFFF0000) != 0) {
1326 BuildMI(MBB, MBBI, DL, TII->get(AArch64::MOVKXi), AArch64::X15)
1327 .addReg(AArch64::X15)
1328 .addImm((NumWords & 0xFFFF0000) >> 16) // High half
1329 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 16))
1330 .setMIFlag(MachineInstr::FrameSetup);
1331 BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
1332 .setMIFlag(MachineInstr::FrameSetup);
1333 }
1334 } else {
1335 BuildMI(MBB, MBBI, DL, TII->get(AArch64::MOVi64imm), AArch64::X15)
1336 .addImm(NumWords)
1337 .setMIFlags(MachineInstr::FrameSetup);
1338 }
1339
1340 switch (MF.getTarget().getCodeModel()) {
1341 case CodeModel::Tiny:
1342 case CodeModel::Small:
1343 case CodeModel::Medium:
1344 case CodeModel::Kernel:
1345 BuildMI(MBB, MBBI, DL, TII->get(AArch64::BL))
1346 .addExternalSymbol("__chkstk")
1347 .addReg(AArch64::X15, RegState::Implicit)
1348 .addReg(AArch64::X16, RegState::Implicit | RegState::Define | RegState::Dead)
1349 .addReg(AArch64::X17, RegState::Implicit | RegState::Define | RegState::Dead)
1350 .addReg(AArch64::NZCV, RegState::Implicit | RegState::Define | RegState::Dead)
1351 .setMIFlags(MachineInstr::FrameSetup);
1352 if (NeedsWinCFI) {
1353 HasWinCFI = true;
1354 BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
1355 .setMIFlag(MachineInstr::FrameSetup);
1356 }
1357 break;
1358 case CodeModel::Large:
1359 BuildMI(MBB, MBBI, DL, TII->get(AArch64::MOVaddrEXT))
1360 .addReg(AArch64::X16, RegState::Define)
1361 .addExternalSymbol("__chkstk")
1362 .addExternalSymbol("__chkstk")
1363 .setMIFlags(MachineInstr::FrameSetup);
1364 if (NeedsWinCFI) {
1365 HasWinCFI = true;
1366 BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
1367 .setMIFlag(MachineInstr::FrameSetup);
1368 }
1369
1370 BuildMI(MBB, MBBI, DL, TII->get(getBLRCallOpcode(MF)))
1371 .addReg(AArch64::X16, RegState::Kill)
1372 .addReg(AArch64::X15, RegState::Implicit | RegState::Define)
1373 .addReg(AArch64::X16, RegState::Implicit | RegState::Define | RegState::Dead)
1374 .addReg(AArch64::X17, RegState::Implicit | RegState::Define | RegState::Dead)
1375 .addReg(AArch64::NZCV, RegState::Implicit | RegState::Define | RegState::Dead)
1376 .setMIFlags(MachineInstr::FrameSetup);
1377 if (NeedsWinCFI) {
1378 HasWinCFI = true;
1379 BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
1380 .setMIFlag(MachineInstr::FrameSetup);
1381 }
1382 break;
1383 }
1384
1385 BuildMI(MBB, MBBI, DL, TII->get(AArch64::SUBXrx64), AArch64::SP)
1386 .addReg(AArch64::SP, RegState::Kill)
1387 .addReg(AArch64::X15, RegState::Kill)
1388 .addImm(AArch64_AM::getArithExtendImm(AArch64_AM::UXTX, 4))
1389 .setMIFlags(MachineInstr::FrameSetup);
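    // The UXTX #4 extend scales X15 by 16 (SP -= NumWords * 16), undoing the
    // NumBytes >> 4 conversion above; e.g. NumWords = 0x10000 frees the full
    // 0x100000 bytes.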
1390 if (NeedsWinCFI) {
1391 HasWinCFI = true;
1392 BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_StackAlloc))
1393 .addImm(NumBytes)
1394 .setMIFlag(MachineInstr::FrameSetup);
1395 }
1396 NumBytes = 0;
1397 }
1398
1399 StackOffset AllocateBefore = SVEStackSize, AllocateAfter = {};
1400 MachineBasicBlock::iterator CalleeSavesBegin = MBBI, CalleeSavesEnd = MBBI;
1401
1402 // Process the SVE callee-saves to determine what space needs to be
1403 // allocated.
1404 if (int64_t CalleeSavedSize = AFI->getSVECalleeSavedStackSize()) {
1405 // Find callee save instructions in frame.
1406 CalleeSavesBegin = MBBI;
1407 assert(IsSVECalleeSave(CalleeSavesBegin) && "Unexpected instruction");
1408 while (IsSVECalleeSave(MBBI) && MBBI != MBB.getFirstTerminator())
1409 ++MBBI;
1410 CalleeSavesEnd = MBBI;
1411
1412 AllocateBefore = StackOffset::getScalable(CalleeSavedSize);
1413 AllocateAfter = SVEStackSize - AllocateBefore;
1414 }
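  // E.g. (illustrative sizes) with an SVE area of 48 scalable bytes of which
  // 32 hold SVE callee-saves, AllocateBefore covers those 32 bytes and
  // AllocateAfter the remaining 16 bytes of SVE locals.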
1415
1416 // Allocate space for the callee saves (if any).
1417 emitFrameOffset(MBB, CalleeSavesBegin, DL, AArch64::SP, AArch64::SP,
1418 -AllocateBefore, TII,
1419 MachineInstr::FrameSetup);
1420
1421 // Finally allocate remaining SVE stack space.
1422 emitFrameOffset(MBB, CalleeSavesEnd, DL, AArch64::SP, AArch64::SP,
1423 -AllocateAfter, TII,
1424 MachineInstr::FrameSetup);
1425
1426 // Allocate space for the rest of the frame.
1427 if (NumBytes) {
1428 // Alignment is required for the parent frame, not the funclet
1429 const bool NeedsRealignment =
1430 !IsFunclet && RegInfo->hasStackRealignment(MF);
1431 unsigned scratchSPReg = AArch64::SP;
1432
1433 if (NeedsRealignment) {
1434 scratchSPReg = findScratchNonCalleeSaveRegister(&MBB);
1435 assert(scratchSPReg != AArch64::NoRegister);
1436 }
1437
1438 // If we're a leaf function, try using the red zone.
1439 if (!canUseRedZone(MF))
1440 // FIXME: in the case of dynamic re-alignment, NumBytes doesn't have
1441 // the correct value here, as NumBytes also includes padding bytes,
1442 // which shouldn't be counted here.
1443 emitFrameOffset(MBB, MBBI, DL, scratchSPReg, AArch64::SP,
1444 StackOffset::getFixed(-NumBytes), TII,
1445 MachineInstr::FrameSetup, false, NeedsWinCFI, &HasWinCFI);
1446
1447 if (NeedsRealignment) {
1448 const unsigned NrBitsToZero = Log2(MFI.getMaxAlign());
1449 assert(NrBitsToZero > 1);
1450 assert(scratchSPReg != AArch64::SP);
1451
1452 // SUB X9, SP, NumBytes
1453     // -- X9 is a temporary register, so it shouldn't contain any live data
1454     // -- here and is free to use. This is already produced by emitFrameOffset above.
1455 // AND SP, X9, 0b11111...0000
1456 // The logical immediates have a non-trivial encoding. The following
1457 // formula computes the encoded immediate with all ones but
1458 // NrBitsToZero zero bits as least significant bits.
1459 uint32_t andMaskEncoded = (1 << 12) // = N
1460 | ((64 - NrBitsToZero) << 6) // immr
1461 | ((64 - NrBitsToZero - 1) << 0); // imms
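      // Worked example: for 32-byte realignment NrBitsToZero is 5, giving
      // immr = 59 and imms = 58; that decodes to 59 ones rotated so the low
      // 5 bits are clear, i.e. the mask 0xFFFFFFFFFFFFFFE0.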
1462
1463 BuildMI(MBB, MBBI, DL, TII->get(AArch64::ANDXri), AArch64::SP)
1464 .addReg(scratchSPReg, RegState::Kill)
1465 .addImm(andMaskEncoded);
1466 AFI->setStackRealigned(true);
1467 if (NeedsWinCFI) {
1468 HasWinCFI = true;
1469 BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_StackAlloc))
1470 .addImm(NumBytes & andMaskEncoded)
1471 .setMIFlag(MachineInstr::FrameSetup);
1472 }
1473 }
1474 }
1475
1476 // If we need a base pointer, set it up here. It's whatever the value of the
1477 // stack pointer is at this point. Any variable size objects will be allocated
1478 // after this, so we can still use the base pointer to reference locals.
1479 //
1480 // FIXME: Clarify FrameSetup flags here.
1481 // Note: Use emitFrameOffset() like above for FP if the FrameSetup flag is
1482 // needed.
1483 // For funclets the BP belongs to the containing function.
1484 if (!IsFunclet && RegInfo->hasBasePointer(MF)) {
1485 TII->copyPhysReg(MBB, MBBI, DL, RegInfo->getBaseRegister(), AArch64::SP,
1486 false);
1487 if (NeedsWinCFI) {
1488 HasWinCFI = true;
1489 BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
1490 .setMIFlag(MachineInstr::FrameSetup);
1491 }
1492 }
1493
1494 // The very last FrameSetup instruction indicates the end of prologue. Emit a
1495 // SEH opcode indicating the prologue end.
1496 if (NeedsWinCFI && HasWinCFI) {
1497 BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_PrologEnd))
1498 .setMIFlag(MachineInstr::FrameSetup);
1499 }
1500
1501 // SEH funclets are passed the frame pointer in X1. If the parent
1502 // function uses the base register, then the base register is used
1503 // directly, and is not retrieved from X1.
1504 if (IsFunclet && F.hasPersonalityFn()) {
1505 EHPersonality Per = classifyEHPersonality(F.getPersonalityFn());
1506 if (isAsynchronousEHPersonality(Per)) {
1507 BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::COPY), AArch64::FP)
1508 .addReg(AArch64::X1)
1509 .setMIFlag(MachineInstr::FrameSetup);
1510 MBB.addLiveIn(AArch64::X1);
1511 }
1512 }
1513
1514 if (needsFrameMoves) {
1515 // An example of the prologue:
1516 //
1517 // .globl __foo
1518 // .align 2
1519 // __foo:
1520 // Ltmp0:
1521 // .cfi_startproc
1522 // .cfi_personality 155, ___gxx_personality_v0
1523 // Leh_func_begin:
1524 // .cfi_lsda 16, Lexception33
1525 //
1526 // stp xa,bx, [sp, -#offset]!
1527 // ...
1528 // stp x28, x27, [sp, #offset-32]
1529 // stp fp, lr, [sp, #offset-16]
1530 // add fp, sp, #offset - 16
1531 // sub sp, sp, #1360
1532 //
1533 // The Stack:
1534 // +-------------------------------------------+
1535 // 10000 | ........ | ........ | ........ | ........ |
1536 // 10004 | ........ | ........ | ........ | ........ |
1537 // +-------------------------------------------+
1538 // 10008 | ........ | ........ | ........ | ........ |
1539 // 1000c | ........ | ........ | ........ | ........ |
1540 // +===========================================+
1541 // 10010 | X28 Register |
1542 // 10014 | X28 Register |
1543 // +-------------------------------------------+
1544 // 10018 | X27 Register |
1545 // 1001c | X27 Register |
1546 // +===========================================+
1547 // 10020 | Frame Pointer |
1548 // 10024 | Frame Pointer |
1549 // +-------------------------------------------+
1550 // 10028 | Link Register |
1551 // 1002c | Link Register |
1552 // +===========================================+
1553 // 10030 | ........ | ........ | ........ | ........ |
1554 // 10034 | ........ | ........ | ........ | ........ |
1555 // +-------------------------------------------+
1556 // 10038 | ........ | ........ | ........ | ........ |
1557 // 1003c | ........ | ........ | ........ | ........ |
1558 // +-------------------------------------------+
1559 //
1560 // [sp] = 10030 :: >>initial value<<
1561 // sp = 10020 :: stp fp, lr, [sp, #-16]!
1562 // fp = sp == 10020 :: mov fp, sp
1563 // [sp] == 10020 :: stp x28, x27, [sp, #-16]!
1564 // sp == 10010 :: >>final value<<
1565 //
1566 // The frame pointer (w29) points to address 10020. If we use an offset of
1567 // '16' from 'w29', we get the CFI offsets of -8 for w30, -16 for w29, -24
1568 // for w27, and -32 for w28:
1569 //
1570 // Ltmp1:
1571 // .cfi_def_cfa w29, 16
1572 // Ltmp2:
1573 // .cfi_offset w30, -8
1574 // Ltmp3:
1575 // .cfi_offset w29, -16
1576 // Ltmp4:
1577 // .cfi_offset w27, -24
1578 // Ltmp5:
1579 // .cfi_offset w28, -32
1580
1581 if (HasFP) {
1582 const int OffsetToFirstCalleeSaveFromFP =
1583 AFI->getCalleeSaveBaseToFrameRecordOffset() -
1584 AFI->getCalleeSavedStackSize();
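      // Illustrative numbers matching the diagram above: a 32-byte
      // callee-save area with the frame record 16 bytes into it gives
      // 16 - 32 = -16 here, so (with FixedObject == 0) the CFA below is
      // FP + 16, i.e. the ".cfi_def_cfa w29, 16" shown earlier.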
1585 Register FramePtr = RegInfo->getFrameRegister(MF);
1586
1587 // Define the current CFA rule to use the provided FP.
1588 unsigned Reg = RegInfo->getDwarfRegNum(FramePtr, true);
1589 unsigned CFIIndex = MF.addFrameInst(
1590 MCCFIInstruction::cfiDefCfa(nullptr, Reg, FixedObject - OffsetToFirstCalleeSaveFromFP));
1591 BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
1592 .addCFIIndex(CFIIndex)
1593 .setMIFlags(MachineInstr::FrameSetup);
1594 } else {
1595 unsigned CFIIndex;
1596 if (SVEStackSize) {
1597 const TargetSubtargetInfo &STI = MF.getSubtarget();
1598 const TargetRegisterInfo &TRI = *STI.getRegisterInfo();
1599 StackOffset TotalSize =
1600 SVEStackSize + StackOffset::getFixed((int64_t)MFI.getStackSize());
1601 CFIIndex = MF.addFrameInst(createDefCFAExpressionFromSP(TRI, TotalSize));
1602 } else {
1603 // Encode the stack size of the leaf function.
1604 CFIIndex = MF.addFrameInst(
1605 MCCFIInstruction::cfiDefCfaOffset(nullptr, MFI.getStackSize()));
1606 }
1607 BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
1608 .addCFIIndex(CFIIndex)
1609 .setMIFlags(MachineInstr::FrameSetup);
1610 }
1611
1612 // Now emit the moves for whatever callee saved regs we have (including FP,
1613 // LR if those are saved).
1614 emitCalleeSavedFrameMoves(MBB, MBBI);
1615 }
1616 }
1617
1618 static void InsertReturnAddressAuth(MachineFunction &MF,
1619 MachineBasicBlock &MBB) {
1620 const auto &MFI = *MF.getInfo<AArch64FunctionInfo>();
1621 if (!MFI.shouldSignReturnAddress())
1622 return;
1623 const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
1624 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
1625
1626 MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator();
1627 DebugLoc DL;
1628 if (MBBI != MBB.end())
1629 DL = MBBI->getDebugLoc();
1630
1631 // The AUTIASP instruction assembles to a hint instruction before v8.3a so
1632   // this instruction can safely be used for any v8a architecture.
1633 // From v8.3a onwards there are optimised authenticate LR and return
1634 // instructions, namely RETA{A,B}, that can be used instead.
1635 if (Subtarget.hasPAuth() && MBBI != MBB.end() &&
1636 MBBI->getOpcode() == AArch64::RET_ReallyLR) {
1637 BuildMI(MBB, MBBI, DL,
1638 TII->get(MFI.shouldSignWithBKey() ? AArch64::RETAB : AArch64::RETAA))
1639 .copyImplicitOps(*MBBI);
1640 MBB.erase(MBBI);
1641 } else {
1642 BuildMI(
1643 MBB, MBBI, DL,
1644 TII->get(MFI.shouldSignWithBKey() ? AArch64::AUTIBSP : AArch64::AUTIASP))
1645 .setMIFlag(MachineInstr::FrameDestroy);
1646 }
1647 }
1648
1649 static bool isFuncletReturnInstr(const MachineInstr &MI) {
1650 switch (MI.getOpcode()) {
1651 default:
1652 return false;
1653 case AArch64::CATCHRET:
1654 case AArch64::CLEANUPRET:
1655 return true;
1656 }
1657 }
1658
1659 void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
1660 MachineBasicBlock &MBB) const {
1661 MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
1662 MachineFrameInfo &MFI = MF.getFrameInfo();
1663 const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
1664 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
1665 DebugLoc DL;
1666 bool NeedsWinCFI = needsWinCFI(MF);
1667 bool HasWinCFI = false;
1668 bool IsFunclet = false;
1669 auto WinCFI = make_scope_exit([&]() { assert(HasWinCFI == MF.hasWinCFI()); });
1670
1671 if (MBB.end() != MBBI) {
1672 DL = MBBI->getDebugLoc();
1673 IsFunclet = isFuncletReturnInstr(*MBBI);
1674 }
1675
1676 int64_t NumBytes = IsFunclet ? getWinEHFuncletFrameSize(MF)
1677 : MFI.getStackSize();
1678 AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
1679
1680 // All calls are tail calls in GHC calling conv, and functions have no
1681 // prologue/epilogue.
1682 if (MF.getFunction().getCallingConv() == CallingConv::GHC)
1683 return;
1684
1685 // How much of the stack used by incoming arguments this function is expected
1686 // to restore in this particular epilogue.
1687 int64_t ArgumentStackToRestore = getArgumentStackToRestore(MF, MBB);
1688
1689 // The stack frame should be like below,
1690 //
1691 // ---------------------- ---
1692 // | | |
1693 // | BytesInStackArgArea| CalleeArgStackSize
1694 // | (NumReusableBytes) | (of tail call)
1695 // | | ---
1696 // | | |
1697 // ---------------------| --- |
1698 // | | | |
1699 // | CalleeSavedReg | | |
1700 // | (CalleeSavedStackSize)| | |
1701 // | | | |
1702 // ---------------------| | NumBytes
1703 // | | StackSize (StackAdjustUp)
1704 // | LocalStackSize | | |
1705 // | (covering callee | | |
1706 // | args) | | |
1707 // | | | |
1708 // ---------------------- --- ---
1709 //
1710 // So NumBytes = StackSize + BytesInStackArgArea - CalleeArgStackSize
1711 // = StackSize + ArgumentPopSize
1712 //
1713 // AArch64TargetLowering::LowerCall figures out ArgumentPopSize and keeps
1714 // it as the 2nd argument of AArch64ISD::TC_RETURN.
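  //
  // Illustrative case: if this function received 32 bytes of stack arguments
  // and a tail call reuses only 16 of them, ArgumentPopSize is 16 and the
  // epilogue frees StackSize + 16 bytes in total.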
1715
1716 auto Cleanup = make_scope_exit([&] { InsertReturnAddressAuth(MF, MBB); });
1717
1718 bool IsWin64 =
1719 Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv());
1720 unsigned FixedObject = getFixedObjectSize(MF, AFI, IsWin64, IsFunclet);
1721
1722 int64_t AfterCSRPopSize = ArgumentStackToRestore;
1723 auto PrologueSaveSize = AFI->getCalleeSavedStackSize() + FixedObject;
1724 // We cannot rely on the local stack size set in emitPrologue if the function
1725 // has funclets, as funclets have different local stack size requirements, and
1726 // the current value set in emitPrologue may be that of the containing
1727 // function.
1728 if (MF.hasEHFunclets())
1729 AFI->setLocalStackSize(NumBytes - PrologueSaveSize);
1730 if (homogeneousPrologEpilog(MF, &MBB)) {
1731 assert(!NeedsWinCFI);
1732 auto LastPopI = MBB.getFirstTerminator();
1733 if (LastPopI != MBB.begin()) {
1734 auto HomogeneousEpilog = std::prev(LastPopI);
1735 if (HomogeneousEpilog->getOpcode() == AArch64::HOM_Epilog)
1736 LastPopI = HomogeneousEpilog;
1737 }
1738
1739 // Adjust local stack
1740 emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP,
1741 StackOffset::getFixed(AFI->getLocalStackSize()), TII,
1742 MachineInstr::FrameDestroy, false, NeedsWinCFI);
1743
1744     // SP has already been adjusted while restoring callee save regs.
1745     // We've already bailed out of the case that adjusts SP for arguments.
1746 assert(AfterCSRPopSize == 0);
1747 return;
1748 }
1749 bool CombineSPBump = shouldCombineCSRLocalStackBumpInEpilogue(MBB, NumBytes);
1750 // Assume we can't combine the last pop with the sp restore.
1751
1752 if (!CombineSPBump && PrologueSaveSize != 0) {
1753 MachineBasicBlock::iterator Pop = std::prev(MBB.getFirstTerminator());
1754 while (AArch64InstrInfo::isSEHInstruction(*Pop))
1755 Pop = std::prev(Pop);
1756 // Converting the last ldp to a post-index ldp is valid only if the last
1757 // ldp's offset is 0.
1758 const MachineOperand &OffsetOp = Pop->getOperand(Pop->getNumOperands() - 1);
1759 // If the offset is 0 and the AfterCSR pop is not actually trying to
1760 // allocate more stack for arguments (in space that an untimely interrupt
1761 // may clobber), convert it to a post-index ldp.
1762 if (OffsetOp.getImm() == 0 && AfterCSRPopSize >= 0)
1763 convertCalleeSaveRestoreToSPPrePostIncDec(
1764 MBB, Pop, DL, TII, PrologueSaveSize, NeedsWinCFI, &HasWinCFI, false);
1765 else {
1766 // If not, make sure to emit an add after the last ldp.
1767       // We're doing this by transferring the size to be restored from the
1768 // adjustment *before* the CSR pops to the adjustment *after* the CSR
1769 // pops.
1770 AfterCSRPopSize += PrologueSaveSize;
1771 }
1772 }
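  // Sketch of the two outcomes above: either the final "ldp x29, x30, [sp]"
  // becomes the post-indexed "ldp x29, x30, [sp], #PrologueSaveSize", or it
  // is left untouched and the bump is folded into AfterCSRPopSize, emitted
  // as a separate SP adjustment after the pops.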
1773
1774 // Move past the restores of the callee-saved registers.
1775 // If we plan on combining the sp bump of the local stack size and the callee
1776 // save stack size, we might need to adjust the CSR save and restore offsets.
1777 MachineBasicBlock::iterator LastPopI = MBB.getFirstTerminator();
1778 MachineBasicBlock::iterator Begin = MBB.begin();
1779 while (LastPopI != Begin) {
1780 --LastPopI;
1781 if (!LastPopI->getFlag(MachineInstr::FrameDestroy) ||
1782 IsSVECalleeSave(LastPopI)) {
1783 ++LastPopI;
1784 break;
1785 } else if (CombineSPBump)
1786 fixupCalleeSaveRestoreStackOffset(*LastPopI, AFI->getLocalStackSize(),
1787 NeedsWinCFI, &HasWinCFI);
1788 }
1789
1790 if (MF.hasWinCFI()) {
1791 // If the prologue didn't contain any SEH opcodes and didn't set the
1792 // MF.hasWinCFI() flag, assume the epilogue won't either, and skip the
1793 // EpilogStart - to avoid generating CFI for functions that don't need it.
1794 // (And as we didn't generate any prologue at all, it would be asymmetrical
1795 // to the epilogue.) By the end of the function, we assert that
1796 // HasWinCFI is equal to MF.hasWinCFI(), to verify this assumption.
1797 HasWinCFI = true;
1798 BuildMI(MBB, LastPopI, DL, TII->get(AArch64::SEH_EpilogStart))
1799 .setMIFlag(MachineInstr::FrameDestroy);
1800 }
1801
1802 if (hasFP(MF) && AFI->hasSwiftAsyncContext()) {
1803 // We need to reset FP to its untagged state on return. Bit 60 is currently
1804 // used to show the presence of an extended frame.
1805
1806 // BIC x29, x29, #0x1000_0000_0000_0000
1807 BuildMI(MBB, MBB.getFirstTerminator(), DL, TII->get(AArch64::ANDXri),
1808 AArch64::FP)
1809 .addUse(AArch64::FP)
1810 .addImm(0x10fe)
1811 .setMIFlag(MachineInstr::FrameDestroy);
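    // The encoded logical immediate 0x10fe (N = 1, immr = 3, imms = 62)
    // decodes to a mask with every bit set except bit 60, so this AND is
    // exactly the BIC described above: it clears only the extended-frame bit.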
1812 }
1813
1814 const StackOffset &SVEStackSize = getSVEStackSize(MF);
1815
1816 // If there is a single SP update, insert it before the ret and we're done.
1817 if (CombineSPBump) {
1818 assert(!SVEStackSize && "Cannot combine SP bump with SVE");
1819 emitFrameOffset(MBB, MBB.getFirstTerminator(), DL, AArch64::SP, AArch64::SP,
1820 StackOffset::getFixed(NumBytes + (int64_t)AfterCSRPopSize),
1821 TII, MachineInstr::FrameDestroy, false, NeedsWinCFI,
1822 &HasWinCFI);
1823 if (HasWinCFI)
1824 BuildMI(MBB, MBB.getFirstTerminator(), DL,
1825 TII->get(AArch64::SEH_EpilogEnd))
1826 .setMIFlag(MachineInstr::FrameDestroy);
1827 return;
1828 }
1829
1830 NumBytes -= PrologueSaveSize;
1831 assert(NumBytes >= 0 && "Negative stack allocation size!?");
1832
1833 // Process the SVE callee-saves to determine what space needs to be
1834 // deallocated.
1835 StackOffset DeallocateBefore = {}, DeallocateAfter = SVEStackSize;
1836 MachineBasicBlock::iterator RestoreBegin = LastPopI, RestoreEnd = LastPopI;
1837 if (int64_t CalleeSavedSize = AFI->getSVECalleeSavedStackSize()) {
1838 RestoreBegin = std::prev(RestoreEnd);
1839 while (RestoreBegin != MBB.begin() &&
1840 IsSVECalleeSave(std::prev(RestoreBegin)))
1841 --RestoreBegin;
1842
1843 assert(IsSVECalleeSave(RestoreBegin) &&
1844 IsSVECalleeSave(std::prev(RestoreEnd)) && "Unexpected instruction");
1845
1846 StackOffset CalleeSavedSizeAsOffset =
1847 StackOffset::getScalable(CalleeSavedSize);
1848 DeallocateBefore = SVEStackSize - CalleeSavedSizeAsOffset;
1849 DeallocateAfter = CalleeSavedSizeAsOffset;
1850 }
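  // Mirroring the prologue split: with the illustrative 48 scalable bytes of
  // SVE area and 32 bytes of SVE callee-saves, DeallocateBefore frees the 16
  // bytes of SVE locals and DeallocateAfter the 32 callee-save bytes.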
1851
1852 // Deallocate the SVE area.
1853 if (SVEStackSize) {
1854 if (AFI->isStackRealigned()) {
1855 if (int64_t CalleeSavedSize = AFI->getSVECalleeSavedStackSize())
1856         // Set SP to the start of the SVE callee-save area from which they
1857         // can be reloaded. The code below will deallocate the stack space
1858         // by moving FP -> SP.
1859 emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, AArch64::FP,
1860 StackOffset::getScalable(-CalleeSavedSize), TII,
1861 MachineInstr::FrameDestroy);
1862 } else {
1863 if (AFI->getSVECalleeSavedStackSize()) {
1864 // Deallocate the non-SVE locals first before we can deallocate (and
1865 // restore callee saves) from the SVE area.
1866 emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, AArch64::SP,
1867 StackOffset::getFixed(NumBytes), TII,
1868 MachineInstr::FrameDestroy);
1869 NumBytes = 0;
1870 }
1871
1872 emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, AArch64::SP,
1873 DeallocateBefore, TII, MachineInstr::FrameDestroy);
1874
1875 emitFrameOffset(MBB, RestoreEnd, DL, AArch64::SP, AArch64::SP,
1876 DeallocateAfter, TII, MachineInstr::FrameDestroy);
1877 }
1878 }
1879
1880 if (!hasFP(MF)) {
1881 bool RedZone = canUseRedZone(MF);
1882 // If this was a redzone leaf function, we don't need to restore the
1883 // stack pointer (but we may need to pop stack args for fastcc).
1884 if (RedZone && AfterCSRPopSize == 0)
1885 return;
1886
1887 bool NoCalleeSaveRestore = PrologueSaveSize == 0;
1888 int64_t StackRestoreBytes = RedZone ? 0 : NumBytes;
1889 if (NoCalleeSaveRestore)
1890 StackRestoreBytes += AfterCSRPopSize;
1891
1892 // If we were able to combine the local stack pop with the argument pop,
1893 // then we're done.
1894 bool Done = NoCalleeSaveRestore || AfterCSRPopSize == 0;
1895
1896 // If we're done after this, make sure to help the load store optimizer.
1897 if (Done)
1898 adaptForLdStOpt(MBB, MBB.getFirstTerminator(), LastPopI);
1899
1900 emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP,
1901 StackOffset::getFixed(StackRestoreBytes), TII,
1902 MachineInstr::FrameDestroy, false, NeedsWinCFI, &HasWinCFI);
1903 if (Done) {
1904 if (HasWinCFI) {
1905 BuildMI(MBB, MBB.getFirstTerminator(), DL,
1906 TII->get(AArch64::SEH_EpilogEnd))
1907 .setMIFlag(MachineInstr::FrameDestroy);
1908 }
1909 return;
1910 }
1911
1912 NumBytes = 0;
1913 }
1914
1915 // Restore the original stack pointer.
1916 // FIXME: Rather than doing the math here, we should instead just use
1917 // non-post-indexed loads for the restores if we aren't actually going to
1918 // be able to save any instructions.
1919 if (!IsFunclet && (MFI.hasVarSizedObjects() || AFI->isStackRealigned())) {
1920 emitFrameOffset(
1921 MBB, LastPopI, DL, AArch64::SP, AArch64::FP,
1922 StackOffset::getFixed(-AFI->getCalleeSaveBaseToFrameRecordOffset()),
1923 TII, MachineInstr::FrameDestroy, false, NeedsWinCFI);
1924 } else if (NumBytes)
1925 emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP,
1926 StackOffset::getFixed(NumBytes), TII,
1927 MachineInstr::FrameDestroy, false, NeedsWinCFI);
1928
1929 // This must be placed after the callee-save restore code because that code
1930   // assumes the SP is at the same location as it was after the callee-save
1931   // spill code in the prologue.
1932 if (AfterCSRPopSize) {
1933 assert(AfterCSRPopSize > 0 && "attempting to reallocate arg stack that an "
1934 "interrupt may have clobbered");
1935 // Find an insertion point for the first ldp so that it goes before the
1936 // shadow call stack epilog instruction. This ensures that the restore of
1937 // lr from x18 is placed after the restore from sp.
1938 auto FirstSPPopI = MBB.getFirstTerminator();
1939 while (FirstSPPopI != Begin) {
1940 auto Prev = std::prev(FirstSPPopI);
1941 if (Prev->getOpcode() != AArch64::LDRXpre ||
1942 Prev->getOperand(0).getReg() == AArch64::SP)
1943 break;
1944 FirstSPPopI = Prev;
1945 }
1946
1947 adaptForLdStOpt(MBB, FirstSPPopI, LastPopI);
1948
1949 emitFrameOffset(MBB, FirstSPPopI, DL, AArch64::SP, AArch64::SP,
1950 StackOffset::getFixed(AfterCSRPopSize), TII,
1951 MachineInstr::FrameDestroy, false, NeedsWinCFI, &HasWinCFI);
1952 }
1953 if (HasWinCFI)
1954 BuildMI(MBB, MBB.getFirstTerminator(), DL, TII->get(AArch64::SEH_EpilogEnd))
1955 .setMIFlag(MachineInstr::FrameDestroy);
1956 }
1957
1958 /// getFrameIndexReference - Provide a base+offset reference to an FI slot for
1959 /// debug info. It's the same as what we use for resolving the code-gen
1960 /// references for now. FIXME: This can go wrong when references are
1961 /// SP-relative and simple call frames aren't used.
1962 StackOffset
1963 AArch64FrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI,
1964 Register &FrameReg) const {
1965 return resolveFrameIndexReference(
1966 MF, FI, FrameReg,
1967 /*PreferFP=*/
1968 MF.getFunction().hasFnAttribute(Attribute::SanitizeHWAddress),
1969 /*ForSimm=*/false);
1970 }
1971
1972 StackOffset
1973 AArch64FrameLowering::getNonLocalFrameIndexReference(const MachineFunction &MF,
1974 int FI) const {
1975 return StackOffset::getFixed(getSEHFrameIndexOffset(MF, FI));
1976 }
1977
1978 static StackOffset getFPOffset(const MachineFunction &MF,
1979 int64_t ObjectOffset) {
1980 const auto *AFI = MF.getInfo<AArch64FunctionInfo>();
1981 const auto &Subtarget = MF.getSubtarget<AArch64Subtarget>();
1982 bool IsWin64 =
1983 Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv());
1984 unsigned FixedObject =
1985 getFixedObjectSize(MF, AFI, IsWin64, /*IsFunclet=*/false);
1986 int64_t CalleeSaveSize = AFI->getCalleeSavedStackSize(MF.getFrameInfo());
1987 int64_t FPAdjust =
1988 CalleeSaveSize - AFI->getCalleeSaveBaseToFrameRecordOffset();
1989 return StackOffset::getFixed(ObjectOffset + FixedObject + FPAdjust);
1990 }
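// Continuing the illustrative numbers from the prologue (32-byte callee-save
// area, frame record 16 bytes into it, FixedObject == 0): FPAdjust is 16, so
// an incoming argument at ObjectOffset 0 is addressed as [fp, #16].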
1991
1992 static StackOffset getStackOffset(const MachineFunction &MF,
1993 int64_t ObjectOffset) {
1994 const auto &MFI = MF.getFrameInfo();
1995 return StackOffset::getFixed(ObjectOffset + (int64_t)MFI.getStackSize());
1996 }
1997
1998 // TODO: This function currently does not work for scalable vectors.
1999 int AArch64FrameLowering::getSEHFrameIndexOffset(const MachineFunction &MF,
2000 int FI) const {
2001 const auto *RegInfo = static_cast<const AArch64RegisterInfo *>(
2002 MF.getSubtarget().getRegisterInfo());
2003 int ObjectOffset = MF.getFrameInfo().getObjectOffset(FI);
2004 return RegInfo->getLocalAddressRegister(MF) == AArch64::FP
2005 ? getFPOffset(MF, ObjectOffset).getFixed()
2006 : getStackOffset(MF, ObjectOffset).getFixed();
2007 }
2008
2009 StackOffset AArch64FrameLowering::resolveFrameIndexReference(
2010 const MachineFunction &MF, int FI, Register &FrameReg, bool PreferFP,
2011 bool ForSimm) const {
2012 const auto &MFI = MF.getFrameInfo();
2013 int64_t ObjectOffset = MFI.getObjectOffset(FI);
2014 bool isFixed = MFI.isFixedObjectIndex(FI);
2015 bool isSVE = MFI.getStackID(FI) == TargetStackID::ScalableVector;
2016 return resolveFrameOffsetReference(MF, ObjectOffset, isFixed, isSVE, FrameReg,
2017 PreferFP, ForSimm);
2018 }
2019
2020 StackOffset AArch64FrameLowering::resolveFrameOffsetReference(
2021 const MachineFunction &MF, int64_t ObjectOffset, bool isFixed, bool isSVE,
2022 Register &FrameReg, bool PreferFP, bool ForSimm) const {
2023 const auto &MFI = MF.getFrameInfo();
2024 const auto *RegInfo = static_cast<const AArch64RegisterInfo *>(
2025 MF.getSubtarget().getRegisterInfo());
2026 const auto *AFI = MF.getInfo<AArch64FunctionInfo>();
2027 const auto &Subtarget = MF.getSubtarget<AArch64Subtarget>();
2028
2029 int64_t FPOffset = getFPOffset(MF, ObjectOffset).getFixed();
2030 int64_t Offset = getStackOffset(MF, ObjectOffset).getFixed();
2031 bool isCSR =
2032 !isFixed && ObjectOffset >= -((int)AFI->getCalleeSavedStackSize(MFI));
2033
2034 const StackOffset &SVEStackSize = getSVEStackSize(MF);
2035
2036 // Use frame pointer to reference fixed objects. Use it for locals if
2037 // there are VLAs or a dynamically realigned SP (and thus the SP isn't
2038 // reliable as a base). Make sure useFPForScavengingIndex() does the
2039 // right thing for the emergency spill slot.
2040 bool UseFP = false;
2041 if (AFI->hasStackFrame() && !isSVE) {
2042 // We shouldn't prefer using the FP when there is an SVE area
2043 // in between the FP and the non-SVE locals/spills.
2044 PreferFP &= !SVEStackSize;
2045
2046 // Note: Keeping the following as multiple 'if' statements rather than
2047 // merging to a single expression for readability.
2048 //
2049 // Argument access should always use the FP.
2050 if (isFixed) {
2051 UseFP = hasFP(MF);
2052 } else if (isCSR && RegInfo->hasStackRealignment(MF)) {
2053 // References to the CSR area must use FP if we're re-aligning the stack
2054 // since the dynamically-sized alignment padding is between the SP/BP and
2055 // the CSR area.
2056 assert(hasFP(MF) && "Re-aligned stack must have frame pointer");
2057 UseFP = true;
2058 } else if (hasFP(MF) && !RegInfo->hasStackRealignment(MF)) {
2059 // If the FPOffset is negative and we're producing a signed immediate, we
2060 // have to keep in mind that the available offset range for negative
2061 // offsets is smaller than for positive ones. If an offset is available
2062 // via the FP and the SP, use whichever is closest.
2063 bool FPOffsetFits = !ForSimm || FPOffset >= -256;
2064 PreferFP |= Offset > -FPOffset;
2065
2066 if (MFI.hasVarSizedObjects()) {
2067 // If we have variable sized objects, we can use either FP or BP, as the
2068 // SP offset is unknown. We can use the base pointer if we have one and
2069 // FP is not preferred. If not, we're stuck with using FP.
2070 bool CanUseBP = RegInfo->hasBasePointer(MF);
2071 if (FPOffsetFits && CanUseBP) // Both are ok. Pick the best.
2072 UseFP = PreferFP;
2073 else if (!CanUseBP) // Can't use BP. Forced to use FP.
2074 UseFP = true;
2075 // else we can use BP and FP, but the offset from FP won't fit.
2076 // That will make us scavenge registers which we can probably avoid by
2077 // using BP. If it won't fit for BP either, we'll scavenge anyway.
2078 } else if (FPOffset >= 0) {
2079 // Use SP or FP, whichever gives us the best chance of the offset
2080 // being in range for direct access. If the FPOffset is positive,
2081 // that'll always be best, as the SP will be even further away.
2082 UseFP = true;
2083 } else if (MF.hasEHFunclets() && !RegInfo->hasBasePointer(MF)) {
2084 // Funclets access the locals contained in the parent's stack frame
2085 // via the frame pointer, so we have to use the FP in the parent
2086 // function.
2087 (void) Subtarget;
2088 assert(
2089 Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv()) &&
2090 "Funclets should only be present on Win64");
2091 UseFP = true;
2092 } else {
2093 // We have the choice between FP and (SP or BP).
2094 if (FPOffsetFits && PreferFP) // If FP is the best fit, use it.
2095 UseFP = true;
2096 }
2097 }
2098 }
2099
2100 assert(
2101 ((isFixed || isCSR) || !RegInfo->hasStackRealignment(MF) || !UseFP) &&
2102 "In the presence of dynamic stack pointer realignment, "
2103 "non-argument/CSR objects cannot be accessed through the frame pointer");
2104
2105 if (isSVE) {
2106 StackOffset FPOffset =
2107 StackOffset::get(-AFI->getCalleeSaveBaseToFrameRecordOffset(), ObjectOffset);
2108 StackOffset SPOffset =
2109 SVEStackSize +
2110 StackOffset::get(MFI.getStackSize() - AFI->getCalleeSavedStackSize(),
2111 ObjectOffset);
2112 // Always use the FP for SVE spills if available and beneficial.
2113 if (hasFP(MF) && (SPOffset.getFixed() ||
2114 FPOffset.getScalable() < SPOffset.getScalable() ||
2115 RegInfo->hasStackRealignment(MF))) {
2116 FrameReg = RegInfo->getFrameRegister(MF);
2117 return FPOffset;
2118 }
2119
2120 FrameReg = RegInfo->hasBasePointer(MF) ? RegInfo->getBaseRegister()
2121 : (unsigned)AArch64::SP;
2122 return SPOffset;
2123 }
2124
2125 StackOffset ScalableOffset = {};
2126 if (UseFP && !(isFixed || isCSR))
2127 ScalableOffset = -SVEStackSize;
2128 if (!UseFP && (isFixed || isCSR))
2129 ScalableOffset = SVEStackSize;
2130
2131 if (UseFP) {
2132 FrameReg = RegInfo->getFrameRegister(MF);
2133 return StackOffset::getFixed(FPOffset) + ScalableOffset;
2134 }
2135
2136 // Use the base pointer if we have one.
2137 if (RegInfo->hasBasePointer(MF))
2138 FrameReg = RegInfo->getBaseRegister();
2139 else {
2140 assert(!MFI.hasVarSizedObjects() &&
2141 "Can't use SP when we have var sized objects.");
2142 FrameReg = AArch64::SP;
2143 // If we're using the red zone for this function, the SP won't actually
2144 // be adjusted, so the offsets will be negative. They're also all
2145 // within range of the signed 9-bit immediate instructions.
2146 if (canUseRedZone(MF))
2147 Offset -= AFI->getLocalStackSize();
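    // Illustration: with a 32-byte local area kept in the red zone, a local at
    // ObjectOffset -8 resolves to [sp, #-8] rather than [sp, #24], because SP
    // was never actually decremented.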
2148 }
2149
2150 return StackOffset::getFixed(Offset) + ScalableOffset;
2151 }
2152
2153 static unsigned getPrologueDeath(MachineFunction &MF, unsigned Reg) {
2154 // Do not set a kill flag on values that are also marked as live-in. This
2155   // happens with the @llvm.returnaddress intrinsic and with arguments passed in
2156 // callee saved registers.
2157 // Omitting the kill flags is conservatively correct even if the live-in
2158 // is not used after all.
2159 bool IsLiveIn = MF.getRegInfo().isLiveIn(Reg);
2160 return getKillRegState(!IsLiveIn);
2161 }
2162
2163 static bool produceCompactUnwindFrame(MachineFunction &MF) {
2164 const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
2165 AttributeList Attrs = MF.getFunction().getAttributes();
2166 return Subtarget.isTargetMachO() &&
2167 !(Subtarget.getTargetLowering()->supportSwiftError() &&
2168 Attrs.hasAttrSomewhere(Attribute::SwiftError)) &&
2169 MF.getFunction().getCallingConv() != CallingConv::SwiftTail;
2170 }
2171
2172 static bool invalidateWindowsRegisterPairing(unsigned Reg1, unsigned Reg2,
2173 bool NeedsWinCFI, bool IsFirst) {
2174 // If we are generating register pairs for a Windows function that requires
2175 // EH support, then pair consecutive registers only. There are no unwind
2176   // opcodes for saves/restores of non-consecutive register pairs.
2177   // The unwind opcodes are save_regp, save_regp_x, save_fregp, save_fregp_x,
2178 // save_lrpair.
2179 // https://docs.microsoft.com/en-us/cpp/build/arm64-exception-handling
2180
2181 if (Reg2 == AArch64::FP)
2182 return true;
2183 if (!NeedsWinCFI)
2184 return false;
2185 if (Reg2 == Reg1 + 1)
2186 return false;
2187 // If pairing a GPR with LR, the pair can be described by the save_lrpair
2188 // opcode. If this is the first register pair, it would end up with a
2189 // predecrement, but there's no save_lrpair_x opcode, so we can only do this
2190   // if LR is paired with a register other than the first one.
2191 // The save_lrpair opcode requires the first register to be an odd one.
2192 if (Reg1 >= AArch64::X19 && Reg1 <= AArch64::X27 &&
2193 (Reg1 - AArch64::X19) % 2 == 0 && Reg2 == AArch64::LR && !IsFirst)
2194 return false;
2195 return true;
2196 }
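// For example, (x19, lr) as the very first pair is still rejected above (it
// would need a save_lrpair_x predecrement form that doesn't exist), while a
// later (x21, lr) pair is allowed since x21 is an odd-numbered register.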
2197
2198 /// Returns true if Reg1 and Reg2 cannot be paired using a ldp/stp instruction.
2199 /// WindowsCFI requires that only consecutive registers can be paired.
2200 /// LR and FP need to be allocated together when the frame needs to save
2201 /// the frame-record. This means any other register pairing with LR is invalid.
2202 static bool invalidateRegisterPairing(unsigned Reg1, unsigned Reg2,
2203 bool UsesWinAAPCS, bool NeedsWinCFI,
2204 bool NeedsFrameRecord, bool IsFirst) {
2205 if (UsesWinAAPCS)
2206 return invalidateWindowsRegisterPairing(Reg1, Reg2, NeedsWinCFI, IsFirst);
2207
2208 // If we need to store the frame record, don't pair any register
2209 // with LR other than FP.
2210 if (NeedsFrameRecord)
2211 return Reg2 == AArch64::LR;
2212
2213 return false;
2214 }
2215
2216 namespace {
2217
2218 struct RegPairInfo {
2219 unsigned Reg1 = AArch64::NoRegister;
2220 unsigned Reg2 = AArch64::NoRegister;
2221 int FrameIdx;
2222 int Offset;
2223 enum RegType { GPR, FPR64, FPR128, PPR, ZPR } Type;
2224
2225 RegPairInfo() = default;
2226
2227   bool isPaired() const { return Reg2 != AArch64::NoRegister; }
2228
2229   unsigned getScale() const {
2230 switch (Type) {
2231 case PPR:
2232 return 2;
2233 case GPR:
2234 case FPR64:
2235 return 8;
2236 case ZPR:
2237 case FPR128:
2238 return 16;
2239 }
2240 llvm_unreachable("Unsupported type");
2241 }
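  // RPI.Offset below is stored pre-divided by this scale, so it maps directly
  // onto the scaled immediate of the chosen store/load form; e.g. a GPR pair
  // at byte offset 32 becomes Offset 4 (matching "stp ..., [sp, #32]").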
2242
2243   bool isScalable() const { return Type == PPR || Type == ZPR; }
2244 };
2245
2246 } // end anonymous namespace
2247
2248 static void computeCalleeSaveRegisterPairs(
2249 MachineFunction &MF, ArrayRef<CalleeSavedInfo> CSI,
2250 const TargetRegisterInfo *TRI, SmallVectorImpl<RegPairInfo> &RegPairs,
2251 bool &NeedShadowCallStackProlog, bool NeedsFrameRecord) {
2252
2253 if (CSI.empty())
2254 return;
2255
2256 bool IsWindows = isTargetWindows(MF);
2257 bool NeedsWinCFI = needsWinCFI(MF);
2258 AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
2259 MachineFrameInfo &MFI = MF.getFrameInfo();
2260 CallingConv::ID CC = MF.getFunction().getCallingConv();
2261 unsigned Count = CSI.size();
2262 (void)CC;
2263 // MachO's compact unwind format relies on all registers being stored in
2264 // pairs.
2265 assert((!produceCompactUnwindFrame(MF) ||
2266 CC == CallingConv::PreserveMost ||
2267 (Count & 1) == 0) &&
2268 "Odd number of callee-saved regs to spill!");
2269 int ByteOffset = AFI->getCalleeSavedStackSize();
2270 int StackFillDir = -1;
2271 int RegInc = 1;
2272 unsigned FirstReg = 0;
2273 if (NeedsWinCFI) {
2274 // For WinCFI, fill the stack from the bottom up.
2275 ByteOffset = 0;
2276 StackFillDir = 1;
2277 // As the CSI array is reversed to match PrologEpilogInserter, iterate
2278 // backwards, to pair up registers starting from lower numbered registers.
2279 RegInc = -1;
2280 FirstReg = Count - 1;
2281 }
2282 int ScalableByteOffset = AFI->getSVECalleeSavedStackSize();
2283 bool NeedGapToAlignStack = AFI->hasCalleeSaveStackFreeSpace();
2284
2285 // When iterating backwards, the loop condition relies on unsigned wraparound.
2286 for (unsigned i = FirstReg; i < Count; i += RegInc) {
2287 RegPairInfo RPI;
2288 RPI.Reg1 = CSI[i].getReg();
2289
2290 if (AArch64::GPR64RegClass.contains(RPI.Reg1))
2291 RPI.Type = RegPairInfo::GPR;
2292 else if (AArch64::FPR64RegClass.contains(RPI.Reg1))
2293 RPI.Type = RegPairInfo::FPR64;
2294 else if (AArch64::FPR128RegClass.contains(RPI.Reg1))
2295 RPI.Type = RegPairInfo::FPR128;
2296 else if (AArch64::ZPRRegClass.contains(RPI.Reg1))
2297 RPI.Type = RegPairInfo::ZPR;
2298 else if (AArch64::PPRRegClass.contains(RPI.Reg1))
2299 RPI.Type = RegPairInfo::PPR;
2300 else
2301 llvm_unreachable("Unsupported register class.");
2302
2303 // Add the next reg to the pair if it is in the same register class.
2304 if (unsigned(i + RegInc) < Count) {
2305 unsigned NextReg = CSI[i + RegInc].getReg();
2306 bool IsFirst = i == FirstReg;
2307 switch (RPI.Type) {
2308 case RegPairInfo::GPR:
2309 if (AArch64::GPR64RegClass.contains(NextReg) &&
2310 !invalidateRegisterPairing(RPI.Reg1, NextReg, IsWindows,
2311 NeedsWinCFI, NeedsFrameRecord, IsFirst))
2312 RPI.Reg2 = NextReg;
2313 break;
2314 case RegPairInfo::FPR64:
2315 if (AArch64::FPR64RegClass.contains(NextReg) &&
2316 !invalidateWindowsRegisterPairing(RPI.Reg1, NextReg, NeedsWinCFI,
2317 IsFirst))
2318 RPI.Reg2 = NextReg;
2319 break;
2320 case RegPairInfo::FPR128:
2321 if (AArch64::FPR128RegClass.contains(NextReg))
2322 RPI.Reg2 = NextReg;
2323 break;
2324 case RegPairInfo::PPR:
2325 case RegPairInfo::ZPR:
2326 break;
2327 }
2328 }
2329
2330 // If either of the registers to be saved is the lr register, it means that
2331 // we also need to save lr in the shadow call stack.
2332 if ((RPI.Reg1 == AArch64::LR || RPI.Reg2 == AArch64::LR) &&
2333 MF.getFunction().hasFnAttribute(Attribute::ShadowCallStack)) {
2334 if (!MF.getSubtarget<AArch64Subtarget>().isXRegisterReserved(18))
2335 report_fatal_error("Must reserve x18 to use shadow call stack");
2336 NeedShadowCallStackProlog = true;
2337 }
2338
2339 // GPRs and FPRs are saved in pairs of 64-bit regs. We expect the CSI
2340 // list to come in sorted by frame index so that we can issue the store
2341 // pair instructions directly. Assert if we see anything otherwise.
2342 //
2343 // The order of the registers in the list is controlled by
2344 // getCalleeSavedRegs(), so they will always be in-order, as well.
2345 assert((!RPI.isPaired() ||
2346 (CSI[i].getFrameIdx() + RegInc == CSI[i + RegInc].getFrameIdx())) &&
2347 "Out of order callee saved regs!");
2348
2349 assert((!RPI.isPaired() || !NeedsFrameRecord || RPI.Reg2 != AArch64::FP ||
2350 RPI.Reg1 == AArch64::LR) &&
2351 "FrameRecord must be allocated together with LR");
2352
2353 // Windows AAPCS has FP and LR reversed.
2354 assert((!RPI.isPaired() || !NeedsFrameRecord || RPI.Reg1 != AArch64::FP ||
2355 RPI.Reg2 == AArch64::LR) &&
2356 "FrameRecord must be allocated together with LR");
2357
2358 // MachO's compact unwind format relies on all registers being stored in
2359 // adjacent register pairs.
2360 assert((!produceCompactUnwindFrame(MF) ||
2361 CC == CallingConv::PreserveMost ||
2362 (RPI.isPaired() &&
2363 ((RPI.Reg1 == AArch64::LR && RPI.Reg2 == AArch64::FP) ||
2364 RPI.Reg1 + 1 == RPI.Reg2))) &&
2365 "Callee-save registers not saved as adjacent register pair!");
2366
2367 RPI.FrameIdx = CSI[i].getFrameIdx();
2368 if (NeedsWinCFI &&
2369 RPI.isPaired()) // RPI.FrameIdx must be the lower index of the pair
2370 RPI.FrameIdx = CSI[i + RegInc].getFrameIdx();
2371
2372 int Scale = RPI.getScale();
2373
2374 int OffsetPre = RPI.isScalable() ? ScalableByteOffset : ByteOffset;
2375 assert(OffsetPre % Scale == 0);
2376
2377 if (RPI.isScalable())
2378 ScalableByteOffset += StackFillDir * Scale;
2379 else
2380 ByteOffset += StackFillDir * (RPI.isPaired() ? 2 * Scale : Scale);
2381
2382 // Swift's async context is directly before FP, so allocate an extra
2383 // 8 bytes for it.
2384 if (NeedsFrameRecord && AFI->hasSwiftAsyncContext() &&
2385 RPI.Reg2 == AArch64::FP)
2386 ByteOffset += StackFillDir * 8;
2387
2388 assert(!(RPI.isScalable() && RPI.isPaired()) &&
2389 "Paired spill/fill instructions don't exist for SVE vectors");
2390
2391 // Round up size of non-pair to pair size if we need to pad the
2392 // callee-save area to ensure 16-byte alignment.
2393 if (NeedGapToAlignStack && !NeedsWinCFI &&
2394 !RPI.isScalable() && RPI.Type != RegPairInfo::FPR128 &&
2395 !RPI.isPaired() && ByteOffset % 16 != 0) {
2396 ByteOffset += 8 * StackFillDir;
2397 assert(MFI.getObjectAlign(RPI.FrameIdx) <= Align(16));
2398 // A stack frame with a gap looks like this, bottom up:
2399 // d9, d8. x21, gap, x20, x19.
2400 // Set extra alignment on the x21 object to create the gap above it.
2401 MFI.setObjectAlignment(RPI.FrameIdx, Align(16));
2402 NeedGapToAlignStack = false;
2403 }
2404
2405 int OffsetPost = RPI.isScalable() ? ScalableByteOffset : ByteOffset;
2406 assert(OffsetPost % Scale == 0);
2407 // If filling top down (default), we want the offset after incrementing it.
2408     // If filling bottom up (WinCFI), we need the original offset.
2409 int Offset = NeedsWinCFI ? OffsetPre : OffsetPost;
2410
2411 // The FP, LR pair goes 8 bytes into our expanded 24-byte slot so that the
2412 // Swift context can directly precede FP.
2413 if (NeedsFrameRecord && AFI->hasSwiftAsyncContext() &&
2414 RPI.Reg2 == AArch64::FP)
2415 Offset += 8;
2416 RPI.Offset = Offset / Scale;
2417
2418 assert(((!RPI.isScalable() && RPI.Offset >= -64 && RPI.Offset <= 63) ||
2419 (RPI.isScalable() && RPI.Offset >= -256 && RPI.Offset <= 255)) &&
2420 "Offset out of bounds for LDP/STP immediate");
2421
2422 // Save the offset to frame record so that the FP register can point to the
2423 // innermost frame record (spilled FP and LR registers).
2424 if (NeedsFrameRecord && ((!IsWindows && RPI.Reg1 == AArch64::LR &&
2425 RPI.Reg2 == AArch64::FP) ||
2426 (IsWindows && RPI.Reg1 == AArch64::FP &&
2427 RPI.Reg2 == AArch64::LR)))
2428 AFI->setCalleeSaveBaseToFrameRecordOffset(Offset);
2429
2430 RegPairs.push_back(RPI);
2431 if (RPI.isPaired())
2432 i += RegInc;
2433 }
2434 if (NeedsWinCFI) {
2435 // If we need an alignment gap in the stack, align the topmost stack
2436 // object. A stack frame with a gap looks like this, bottom up:
2437 // x19, d8. d9, gap.
2438 // Set extra alignment on the topmost stack object (the first element in
2439 // CSI, which goes top down), to create the gap above it.
2440 if (AFI->hasCalleeSaveStackFreeSpace())
2441 MFI.setObjectAlignment(CSI[0].getFrameIdx(), Align(16));
2442 // We iterated bottom up over the registers; flip RegPairs back to top
2443 // down order.
2444 std::reverse(RegPairs.begin(), RegPairs.end());
2445 }
2446 }
2447
2448 bool AArch64FrameLowering::spillCalleeSavedRegisters(
2449 MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
2450 ArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const {
2451 MachineFunction &MF = *MBB.getParent();
2452 const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
2453 bool NeedsWinCFI = needsWinCFI(MF);
2454 DebugLoc DL;
2455 SmallVector<RegPairInfo, 8> RegPairs;
2456
2457 bool NeedShadowCallStackProlog = false;
2458 computeCalleeSaveRegisterPairs(MF, CSI, TRI, RegPairs,
2459 NeedShadowCallStackProlog, hasFP(MF));
2460 const MachineRegisterInfo &MRI = MF.getRegInfo();
2461
2462 if (NeedShadowCallStackProlog) {
2463 // Shadow call stack prolog: str x30, [x18], #8
2464 BuildMI(MBB, MI, DL, TII.get(AArch64::STRXpost))
2465 .addReg(AArch64::X18, RegState::Define)
2466 .addReg(AArch64::LR)
2467 .addReg(AArch64::X18)
2468 .addImm(8)
2469 .setMIFlag(MachineInstr::FrameSetup);
2470
2471 if (NeedsWinCFI)
2472 BuildMI(MBB, MI, DL, TII.get(AArch64::SEH_Nop))
2473 .setMIFlag(MachineInstr::FrameSetup);
2474
2475 if (!MF.getFunction().hasFnAttribute(Attribute::NoUnwind)) {
2476 // Emit a CFI instruction that causes 8 to be subtracted from the value of
2477 // x18 when unwinding past this frame.
2478 static const char CFIInst[] = {
2479 dwarf::DW_CFA_val_expression,
2480 18, // register
2481 2, // length
2482 static_cast<char>(unsigned(dwarf::DW_OP_breg18)),
2483 static_cast<char>(-8) & 0x7f, // addend (sleb128)
2484 };
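      // Reading the expression: the caller's x18 is DW_OP_breg18 with an
      // sleb128 addend of -8 (encoded as the single byte 0x78), which undoes
      // the post-increment of the store emitted above.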
2485 unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createEscape(
2486 nullptr, StringRef(CFIInst, sizeof(CFIInst))));
2487 BuildMI(MBB, MI, DL, TII.get(AArch64::CFI_INSTRUCTION))
2488 .addCFIIndex(CFIIndex)
2489 .setMIFlag(MachineInstr::FrameSetup);
2490 }
2491
2492 // This instruction also makes x18 live-in to the entry block.
2493 MBB.addLiveIn(AArch64::X18);
2494 }
2495
2496 if (homogeneousPrologEpilog(MF)) {
2497 auto MIB = BuildMI(MBB, MI, DL, TII.get(AArch64::HOM_Prolog))
2498 .setMIFlag(MachineInstr::FrameSetup);
2499
2500 for (auto &RPI : RegPairs) {
2501 MIB.addReg(RPI.Reg1);
2502 MIB.addReg(RPI.Reg2);
2503
2504 // Update register live in.
2505 if (!MRI.isReserved(RPI.Reg1))
2506 MBB.addLiveIn(RPI.Reg1);
2507 if (!MRI.isReserved(RPI.Reg2))
2508 MBB.addLiveIn(RPI.Reg2);
2509 }
2510 return true;
2511 }
2512 for (auto RPII = RegPairs.rbegin(), RPIE = RegPairs.rend(); RPII != RPIE;
2513 ++RPII) {
2514 RegPairInfo RPI = *RPII;
2515 unsigned Reg1 = RPI.Reg1;
2516 unsigned Reg2 = RPI.Reg2;
2517 unsigned StrOpc;
2518
2519 // Issue sequence of spills for cs regs. The first spill may be converted
2520 // to a pre-decrement store later by emitPrologue if the callee-save stack
2521 // area allocation can't be combined with the local stack area allocation.
2522 // For example:
2523 // stp x22, x21, [sp, #0] // addImm(+0)
2524 // stp x20, x19, [sp, #16] // addImm(+2)
2525 // stp fp, lr, [sp, #32] // addImm(+4)
2526 // Rationale: This sequence saves uop updates compared to a sequence of
2527 // pre-increment spills like stp xi,xj,[sp,#-16]!
2528 // Note: Similar rationale and sequence for restores in epilog.
2529 unsigned Size;
2530 Align Alignment;
2531 switch (RPI.Type) {
2532 case RegPairInfo::GPR:
2533 StrOpc = RPI.isPaired() ? AArch64::STPXi : AArch64::STRXui;
2534 Size = 8;
2535 Alignment = Align(8);
2536 break;
2537 case RegPairInfo::FPR64:
2538 StrOpc = RPI.isPaired() ? AArch64::STPDi : AArch64::STRDui;
2539 Size = 8;
2540 Alignment = Align(8);
2541 break;
2542 case RegPairInfo::FPR128:
2543 StrOpc = RPI.isPaired() ? AArch64::STPQi : AArch64::STRQui;
2544 Size = 16;
2545 Alignment = Align(16);
2546 break;
2547 case RegPairInfo::ZPR:
2548 StrOpc = AArch64::STR_ZXI;
2549 Size = 16;
2550 Alignment = Align(16);
2551 break;
2552 case RegPairInfo::PPR:
2553 StrOpc = AArch64::STR_PXI;
2554 Size = 2;
2555 Alignment = Align(2);
2556 break;
2557 }
2558 LLVM_DEBUG(dbgs() << "CSR spill: (" << printReg(Reg1, TRI);
2559 if (RPI.isPaired()) dbgs() << ", " << printReg(Reg2, TRI);
2560 dbgs() << ") -> fi#(" << RPI.FrameIdx;
2561 if (RPI.isPaired()) dbgs() << ", " << RPI.FrameIdx + 1;
2562 dbgs() << ")\n");
2563
2564 assert((!NeedsWinCFI || !(Reg1 == AArch64::LR && Reg2 == AArch64::FP)) &&
2565            "Windows unwinding requires a consecutive (FP, LR) pair");
2566 // Windows unwind codes require consecutive registers if registers are
2567 // paired. Make the switch here, so that the code below will save (x,x+1)
2568 // and not (x+1,x).
2569 unsigned FrameIdxReg1 = RPI.FrameIdx;
2570 unsigned FrameIdxReg2 = RPI.FrameIdx + 1;
2571 if (NeedsWinCFI && RPI.isPaired()) {
2572 std::swap(Reg1, Reg2);
2573 std::swap(FrameIdxReg1, FrameIdxReg2);
2574 }
2575 MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(StrOpc));
2576 if (!MRI.isReserved(Reg1))
2577 MBB.addLiveIn(Reg1);
2578 if (RPI.isPaired()) {
2579 if (!MRI.isReserved(Reg2))
2580 MBB.addLiveIn(Reg2);
2581 MIB.addReg(Reg2, getPrologueDeath(MF, Reg2));
2582 MIB.addMemOperand(MF.getMachineMemOperand(
2583 MachinePointerInfo::getFixedStack(MF, FrameIdxReg2),
2584 MachineMemOperand::MOStore, Size, Alignment));
2585 }
2586 MIB.addReg(Reg1, getPrologueDeath(MF, Reg1))
2587 .addReg(AArch64::SP)
2588 .addImm(RPI.Offset) // [sp, #offset*scale],
2589 // where factor*scale is implicit
2590 .setMIFlag(MachineInstr::FrameSetup);
2591 MIB.addMemOperand(MF.getMachineMemOperand(
2592 MachinePointerInfo::getFixedStack(MF, FrameIdxReg1),
2593 MachineMemOperand::MOStore, Size, Alignment));
2594 if (NeedsWinCFI)
2595 InsertSEH(MIB, TII, MachineInstr::FrameSetup);
2596
2597 // Update the StackIDs of the SVE stack slots.
2598 MachineFrameInfo &MFI = MF.getFrameInfo();
2599 if (RPI.Type == RegPairInfo::ZPR || RPI.Type == RegPairInfo::PPR)
2600 MFI.setStackID(RPI.FrameIdx, TargetStackID::ScalableVector);
2601
2602 }
2603 return true;
2604 }
2605
2606 bool AArch64FrameLowering::restoreCalleeSavedRegisters(
2607 MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
2608 MutableArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const {
2609 MachineFunction &MF = *MBB.getParent();
2610 const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
2611 DebugLoc DL;
2612 SmallVector<RegPairInfo, 8> RegPairs;
2613 bool NeedsWinCFI = needsWinCFI(MF);
2614
2615 if (MI != MBB.end())
2616 DL = MI->getDebugLoc();
2617
2618 bool NeedShadowCallStackProlog = false;
2619 computeCalleeSaveRegisterPairs(MF, CSI, TRI, RegPairs,
2620 NeedShadowCallStackProlog, hasFP(MF));
2621
2622 auto EmitMI = [&](const RegPairInfo &RPI) {
2623 unsigned Reg1 = RPI.Reg1;
2624 unsigned Reg2 = RPI.Reg2;
2625
2626 // Issue sequence of restores for cs regs. The last restore may be converted
2627 // to a post-increment load later by emitEpilogue if the callee-save stack
2628 // area allocation can't be combined with the local stack area allocation.
2629 // For example:
2630 // ldp fp, lr, [sp, #32] // addImm(+4)
2631 // ldp x20, x19, [sp, #16] // addImm(+2)
2632 // ldp x22, x21, [sp, #0] // addImm(+0)
2633 // Note: see comment in spillCalleeSavedRegisters()
2634 unsigned LdrOpc;
2635 unsigned Size;
2636 Align Alignment;
2637 switch (RPI.Type) {
2638 case RegPairInfo::GPR:
2639 LdrOpc = RPI.isPaired() ? AArch64::LDPXi : AArch64::LDRXui;
2640 Size = 8;
2641 Alignment = Align(8);
2642 break;
2643 case RegPairInfo::FPR64:
2644 LdrOpc = RPI.isPaired() ? AArch64::LDPDi : AArch64::LDRDui;
2645 Size = 8;
2646 Alignment = Align(8);
2647 break;
2648 case RegPairInfo::FPR128:
2649 LdrOpc = RPI.isPaired() ? AArch64::LDPQi : AArch64::LDRQui;
2650 Size = 16;
2651 Alignment = Align(16);
2652 break;
2653 case RegPairInfo::ZPR:
2654 LdrOpc = AArch64::LDR_ZXI;
2655 Size = 16;
2656 Alignment = Align(16);
2657 break;
2658 case RegPairInfo::PPR:
2659 LdrOpc = AArch64::LDR_PXI;
2660 Size = 2;
2661 Alignment = Align(2);
2662 break;
2663 }
2664 LLVM_DEBUG(dbgs() << "CSR restore: (" << printReg(Reg1, TRI);
2665 if (RPI.isPaired()) dbgs() << ", " << printReg(Reg2, TRI);
2666 dbgs() << ") -> fi#(" << RPI.FrameIdx;
2667 if (RPI.isPaired()) dbgs() << ", " << RPI.FrameIdx + 1;
2668 dbgs() << ")\n");
2669
2670 // Windows unwind codes require consecutive registers if registers are
2671 // paired. Make the switch here, so that the code below will save (x,x+1)
2672 // and not (x+1,x).
2673 unsigned FrameIdxReg1 = RPI.FrameIdx;
2674 unsigned FrameIdxReg2 = RPI.FrameIdx + 1;
2675 if (NeedsWinCFI && RPI.isPaired()) {
2676 std::swap(Reg1, Reg2);
2677 std::swap(FrameIdxReg1, FrameIdxReg2);
2678 }
2679 MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(LdrOpc));
2680 if (RPI.isPaired()) {
2681 MIB.addReg(Reg2, getDefRegState(true));
2682 MIB.addMemOperand(MF.getMachineMemOperand(
2683 MachinePointerInfo::getFixedStack(MF, FrameIdxReg2),
2684 MachineMemOperand::MOLoad, Size, Alignment));
2685 }
2686 MIB.addReg(Reg1, getDefRegState(true))
2687 .addReg(AArch64::SP)
2688 .addImm(RPI.Offset) // [sp, #offset*scale]
2689 // where factor*scale is implicit
2690 .setMIFlag(MachineInstr::FrameDestroy);
2691 MIB.addMemOperand(MF.getMachineMemOperand(
2692 MachinePointerInfo::getFixedStack(MF, FrameIdxReg1),
2693 MachineMemOperand::MOLoad, Size, Alignment));
2694 if (NeedsWinCFI)
2695 InsertSEH(MIB, TII, MachineInstr::FrameDestroy);
2696 };
2697
2698 // SVE objects are always restored in reverse order.
2699 for (const RegPairInfo &RPI : reverse(RegPairs))
2700 if (RPI.isScalable())
2701 EmitMI(RPI);
2702
2703 if (ReverseCSRRestoreSeq) {
2704 for (const RegPairInfo &RPI : reverse(RegPairs))
2705 if (!RPI.isScalable())
2706 EmitMI(RPI);
2707 } else if (homogeneousPrologEpilog(MF, &MBB)) {
2708 auto MIB = BuildMI(MBB, MI, DL, TII.get(AArch64::HOM_Epilog))
2709 .setMIFlag(MachineInstr::FrameDestroy);
2710 for (auto &RPI : RegPairs) {
2711 MIB.addReg(RPI.Reg1, RegState::Define);
2712 MIB.addReg(RPI.Reg2, RegState::Define);
2713 }
2714 return true;
2715 } else
2716 for (const RegPairInfo &RPI : RegPairs)
2717 if (!RPI.isScalable())
2718 EmitMI(RPI);
2719
2720 if (NeedShadowCallStackProlog) {
2721 // Shadow call stack epilog: ldr x30, [x18, #-8]!
2722 BuildMI(MBB, MI, DL, TII.get(AArch64::LDRXpre))
2723 .addReg(AArch64::X18, RegState::Define)
2724 .addReg(AArch64::LR, RegState::Define)
2725 .addReg(AArch64::X18)
2726 .addImm(-8)
2727 .setMIFlag(MachineInstr::FrameDestroy);
2728 }
2729
2730 return true;
2731 }
2732
2733 void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
2734 BitVector &SavedRegs,
2735 RegScavenger *RS) const {
2736 // All calls are tail calls in GHC calling conv, and functions have no
2737 // prologue/epilogue.
2738 if (MF.getFunction().getCallingConv() == CallingConv::GHC)
2739 return;
2740
2741 TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
2742 const AArch64RegisterInfo *RegInfo = static_cast<const AArch64RegisterInfo *>(
2743 MF.getSubtarget().getRegisterInfo());
2744 const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
2745 AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
2746 unsigned UnspilledCSGPR = AArch64::NoRegister;
2747 unsigned UnspilledCSGPRPaired = AArch64::NoRegister;
2748
2749 MachineFrameInfo &MFI = MF.getFrameInfo();
2750 const MCPhysReg *CSRegs = MF.getRegInfo().getCalleeSavedRegs();
2751
2752 unsigned BasePointerReg = RegInfo->hasBasePointer(MF)
2753 ? RegInfo->getBaseRegister()
2754 : (unsigned)AArch64::NoRegister;
2755
2756 unsigned ExtraCSSpill = 0;
2757 // Figure out which callee-saved registers to save/restore.
2758 for (unsigned i = 0; CSRegs[i]; ++i) {
2759 const unsigned Reg = CSRegs[i];
2760
2761 // Add the base pointer register to SavedRegs if it is callee-save.
2762 if (Reg == BasePointerReg)
2763 SavedRegs.set(Reg);
2764
2765 bool RegUsed = SavedRegs.test(Reg);
2766 unsigned PairedReg = AArch64::NoRegister;
    if (AArch64::GPR64RegClass.contains(Reg) ||
        AArch64::FPR64RegClass.contains(Reg) ||
        AArch64::FPR128RegClass.contains(Reg))
      PairedReg = CSRegs[i ^ 1];

    if (!RegUsed) {
      if (AArch64::GPR64RegClass.contains(Reg) &&
          !RegInfo->isReservedReg(MF, Reg)) {
        UnspilledCSGPR = Reg;
        UnspilledCSGPRPaired = PairedReg;
      }
      continue;
    }

    // MachO's compact unwind format relies on all registers being stored in
    // pairs.
    // FIXME: the usual format is actually better if unwinding isn't needed.
    if (producePairRegisters(MF) && PairedReg != AArch64::NoRegister &&
        !SavedRegs.test(PairedReg)) {
      SavedRegs.set(PairedReg);
      if (AArch64::GPR64RegClass.contains(PairedReg) &&
          !RegInfo->isReservedReg(MF, PairedReg))
        ExtraCSSpill = PairedReg;
    }
  }

  if (MF.getFunction().getCallingConv() == CallingConv::Win64 &&
      !Subtarget.isTargetWindows()) {
    // For the Windows calling convention on a non-Windows OS, where X18 is
    // treated as reserved, back up X18 when entering non-Windows code (marked
    // with the Windows calling convention) and restore it when returning,
    // regardless of whether the individual function uses it - it might call
    // other functions that clobber it.
    SavedRegs.set(AArch64::X18);
  }

  // Calculate the callee-saved stack size.
  unsigned CSStackSize = 0;
  unsigned SVECSStackSize = 0;
  const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  for (unsigned Reg : SavedRegs.set_bits()) {
    auto RegSize = TRI->getRegSizeInBits(Reg, MRI) / 8;
    if (AArch64::PPRRegClass.contains(Reg) ||
        AArch64::ZPRRegClass.contains(Reg))
      SVECSStackSize += RegSize;
    else
      CSStackSize += RegSize;
  }

  // Save number of saved regs, so we can easily update CSStackSize later.
  unsigned NumSavedRegs = SavedRegs.count();

  // The frame record needs to be created by saving the appropriate registers.
  uint64_t EstimatedStackSize = MFI.estimateStackSize(MF);
  if (hasFP(MF) ||
      windowsRequiresStackProbe(MF, EstimatedStackSize + CSStackSize + 16)) {
    SavedRegs.set(AArch64::FP);
    SavedRegs.set(AArch64::LR);
  }

  LLVM_DEBUG(dbgs() << "*** determineCalleeSaves\nSaved CSRs:";
             for (unsigned Reg
                  : SavedRegs.set_bits()) dbgs()
             << ' ' << printReg(Reg, RegInfo);
             dbgs() << "\n";);

  // If any callee-saved registers are used, the frame cannot be eliminated.
  int64_t SVEStackSize =
      alignTo(SVECSStackSize + estimateSVEStackObjectOffsets(MFI), 16);
  bool CanEliminateFrame = (SavedRegs.count() == 0) && !SVEStackSize;

  // The CSR spill slots have not been allocated yet, so estimateStackSize
  // won't include them.
  unsigned EstimatedStackSizeLimit = estimateRSStackSizeLimit(MF);

  // Conservatively always assume BigStack when there are SVE spills.
  bool BigStack = SVEStackSize ||
                  (EstimatedStackSize + CSStackSize) > EstimatedStackSizeLimit;
  if (BigStack || !CanEliminateFrame || RegInfo->cannotEliminateFrame(MF))
    AFI->setHasStackFrame(true);

  // Estimate if we might need to scavenge a register at some point in order
  // to materialize a stack offset. If so, either spill one additional
  // callee-saved register or reserve a special spill slot to facilitate
  // register scavenging. If we already spilled an extra callee-saved register
  // above to keep the number of spills even, we don't need to do anything else
  // here.
  if (BigStack) {
    if (!ExtraCSSpill && UnspilledCSGPR != AArch64::NoRegister) {
      LLVM_DEBUG(dbgs() << "Spilling " << printReg(UnspilledCSGPR, RegInfo)
                        << " to get a scratch register.\n");
      SavedRegs.set(UnspilledCSGPR);
      // MachO's compact unwind format relies on all registers being stored in
      // pairs, so if we need to spill one extra for BigStack, then we need to
      // store the pair.
      if (producePairRegisters(MF))
        SavedRegs.set(UnspilledCSGPRPaired);
      ExtraCSSpill = UnspilledCSGPR;
    }

    // If we didn't find an extra callee-saved register to spill, create
    // an emergency spill slot.
    if (!ExtraCSSpill || MF.getRegInfo().isPhysRegUsed(ExtraCSSpill)) {
      const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
      const TargetRegisterClass &RC = AArch64::GPR64RegClass;
      unsigned Size = TRI->getSpillSize(RC);
      Align Alignment = TRI->getSpillAlign(RC);
      int FI = MFI.CreateStackObject(Size, Alignment, false);
      RS->addScavengingFrameIndex(FI);
      LLVM_DEBUG(dbgs() << "No available CS registers, allocated fi#" << FI
                        << " as the emergency spill slot.\n");
    }
  }

  // Add the size of the additional 64-bit GPR saves requested above.
  CSStackSize += 8 * (SavedRegs.count() - NumSavedRegs);

  // A Swift asynchronous context extends the frame record with a pointer
  // directly before FP.
  if (hasFP(MF) && AFI->hasSwiftAsyncContext())
    CSStackSize += 8;

  uint64_t AlignedCSStackSize = alignTo(CSStackSize, 16);
  LLVM_DEBUG(dbgs() << "Estimated stack frame size: "
                    << EstimatedStackSize + AlignedCSStackSize
                    << " bytes.\n");

  assert((!MFI.isCalleeSavedInfoValid() ||
          AFI->getCalleeSavedStackSize() == AlignedCSStackSize) &&
         "Should not invalidate callee saved info");

  // Round up to register pair alignment to avoid additional SP adjustment
  // instructions.
  AFI->setCalleeSavedStackSize(AlignedCSStackSize);
  AFI->setCalleeSaveStackHasFreeSpace(AlignedCSStackSize != CSStackSize);
  AFI->setSVECalleeSavedStackSize(alignTo(SVECSStackSize, 16));
}

bool AArch64FrameLowering::assignCalleeSavedSpillSlots(
    MachineFunction &MF, const TargetRegisterInfo *RegInfo,
    std::vector<CalleeSavedInfo> &CSI, unsigned &MinCSFrameIndex,
    unsigned &MaxCSFrameIndex) const {
  bool NeedsWinCFI = needsWinCFI(MF);
  // To match the canonical windows frame layout, reverse the list of
  // callee saved registers to get them laid out by PrologEpilogInserter
  // in the right order. (PrologEpilogInserter allocates stack objects top
  // down. Windows canonical prologs store higher numbered registers at
  // the top, thus have the CSI array start from the highest registers.)
  if (NeedsWinCFI)
    std::reverse(CSI.begin(), CSI.end());

  if (CSI.empty())
    return true; // Early exit if no callee saved registers are modified!

  // Now that we know which registers need to be saved and restored, allocate
  // stack slots for them.
  MachineFrameInfo &MFI = MF.getFrameInfo();
  auto *AFI = MF.getInfo<AArch64FunctionInfo>();
  for (auto &CS : CSI) {
    Register Reg = CS.getReg();
    const TargetRegisterClass *RC = RegInfo->getMinimalPhysRegClass(Reg);

    unsigned Size = RegInfo->getSpillSize(*RC);
    Align Alignment(RegInfo->getSpillAlign(*RC));
    int FrameIdx = MFI.CreateStackObject(Size, Alignment, true);
    CS.setFrameIdx(FrameIdx);

    if ((unsigned)FrameIdx < MinCSFrameIndex) MinCSFrameIndex = FrameIdx;
    if ((unsigned)FrameIdx > MaxCSFrameIndex) MaxCSFrameIndex = FrameIdx;

    // Grab 8 bytes below FP for the extended asynchronous frame info.
    if (hasFP(MF) && AFI->hasSwiftAsyncContext() && Reg == AArch64::FP) {
      FrameIdx = MFI.CreateStackObject(8, Alignment, true);
      AFI->setSwiftAsyncContextFrameIdx(FrameIdx);
      if ((unsigned)FrameIdx < MinCSFrameIndex) MinCSFrameIndex = FrameIdx;
      if ((unsigned)FrameIdx > MaxCSFrameIndex) MaxCSFrameIndex = FrameIdx;
    }
  }
  return true;
}

bool AArch64FrameLowering::enableStackSlotScavenging(
    const MachineFunction &MF) const {
  const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
  return AFI->hasCalleeSaveStackFreeSpace();
}

/// Returns true if there are any SVE callee saves.
static bool getSVECalleeSaveSlotRange(const MachineFrameInfo &MFI,
                                      int &Min, int &Max) {
  Min = std::numeric_limits<int>::max();
  Max = std::numeric_limits<int>::min();

  if (!MFI.isCalleeSavedInfoValid())
    return false;

  const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo();
  for (auto &CS : CSI) {
    if (AArch64::ZPRRegClass.contains(CS.getReg()) ||
        AArch64::PPRRegClass.contains(CS.getReg())) {
      assert((Max == std::numeric_limits<int>::min() ||
              Max + 1 == CS.getFrameIdx()) &&
             "SVE CalleeSaves are not consecutive");

      Min = std::min(Min, CS.getFrameIdx());
      Max = std::max(Max, CS.getFrameIdx());
    }
  }
  return Min != std::numeric_limits<int>::max();
}

// Process all the SVE stack objects and determine offsets for each object.
// If AssignOffsets is true, the offsets get assigned; otherwise this is a
// dry run used only to estimate the size.
// Fills in the first and last callee-saved frame indices into
// Min/MaxCSFrameIndex, respectively.
// Returns the size of the SVE stack area.
static int64_t determineSVEStackObjectOffsets(MachineFrameInfo &MFI,
                                              int &MinCSFrameIndex,
                                              int &MaxCSFrameIndex,
                                              bool AssignOffsets) {
#ifndef NDEBUG
  // First process all fixed stack objects.
  for (int I = MFI.getObjectIndexBegin(); I != 0; ++I)
    assert(MFI.getStackID(I) != TargetStackID::ScalableVector &&
           "SVE vectors should never be passed on the stack by value, only by "
           "reference.");
#endif

  auto Assign = [&MFI](int FI, int64_t Offset) {
    LLVM_DEBUG(dbgs() << "alloc FI(" << FI << ") at SP[" << Offset << "]\n");
    MFI.setObjectOffset(FI, Offset);
  };

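  // SVE offsets are allocated top-down: each object is recorded as a
  // negative, vscale-scaled displacement from the base of the SVE area
  // (hence the -Offset in the Assign calls below).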
  int64_t Offset = 0;

  // Then process all callee saved slots.
  if (getSVECalleeSaveSlotRange(MFI, MinCSFrameIndex, MaxCSFrameIndex)) {
    // Assign offsets to the callee save slots.
    for (int I = MinCSFrameIndex; I <= MaxCSFrameIndex; ++I) {
      Offset += MFI.getObjectSize(I);
      Offset = alignTo(Offset, MFI.getObjectAlign(I));
      if (AssignOffsets)
        Assign(I, -Offset);
    }
  }

  // Ensure that the callee-save area is aligned to 16 bytes.
  Offset = alignTo(Offset, Align(16U));

  // Create a buffer of SVE objects to allocate and sort it.
  SmallVector<int, 8> ObjectsToAllocate;
  for (int I = 0, E = MFI.getObjectIndexEnd(); I != E; ++I) {
    unsigned StackID = MFI.getStackID(I);
    if (StackID != TargetStackID::ScalableVector)
      continue;
    if (MaxCSFrameIndex >= I && I >= MinCSFrameIndex)
      continue;
    if (MFI.isDeadObjectIndex(I))
      continue;

    ObjectsToAllocate.push_back(I);
  }

  // Allocate all SVE locals and spills
  for (unsigned FI : ObjectsToAllocate) {
    Align Alignment = MFI.getObjectAlign(FI);
    // FIXME: Given that the length of SVE vectors is not necessarily a power of
    // two, we'd need to align every object dynamically at runtime if the
    // alignment is larger than 16. This is not yet supported.
    if (Alignment > Align(16))
      report_fatal_error(
          "Alignment of scalable vectors > 16 bytes is not yet supported");

    Offset = alignTo(Offset + MFI.getObjectSize(FI), Alignment);
    if (AssignOffsets)
      Assign(FI, -Offset);
  }

  return Offset;
}

int64_t AArch64FrameLowering::estimateSVEStackObjectOffsets(
    MachineFrameInfo &MFI) const {
  int MinCSFrameIndex, MaxCSFrameIndex;
  return determineSVEStackObjectOffsets(MFI, MinCSFrameIndex, MaxCSFrameIndex,
                                        false);
}

int64_t AArch64FrameLowering::assignSVEStackObjectOffsets(
    MachineFrameInfo &MFI, int &MinCSFrameIndex, int &MaxCSFrameIndex) const {
  return determineSVEStackObjectOffsets(MFI, MinCSFrameIndex, MaxCSFrameIndex,
                                        true);
}

void AArch64FrameLowering::processFunctionBeforeFrameFinalized(
    MachineFunction &MF, RegScavenger *RS) const {
  MachineFrameInfo &MFI = MF.getFrameInfo();

  assert(getStackGrowthDirection() == TargetFrameLowering::StackGrowsDown &&
         "Upwards growing stack unsupported");

  int MinCSFrameIndex, MaxCSFrameIndex;
  int64_t SVEStackSize =
      assignSVEStackObjectOffsets(MFI, MinCSFrameIndex, MaxCSFrameIndex);

  AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
  AFI->setStackSizeSVE(alignTo(SVEStackSize, 16U));
  AFI->setMinMaxSVECSFrameIndex(MinCSFrameIndex, MaxCSFrameIndex);

  // If this function isn't doing Win64-style C++ EH, we don't need to do
  // anything.
  if (!MF.hasEHFunclets())
    return;
  const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
  WinEHFuncInfo &EHInfo = *MF.getWinEHFuncInfo();

  MachineBasicBlock &MBB = MF.front();
  auto MBBI = MBB.begin();
  while (MBBI != MBB.end() && MBBI->getFlag(MachineInstr::FrameSetup))
    ++MBBI;

  // Create an UnwindHelp object.
  // The UnwindHelp object is allocated at the start of the fixed object area.
  int64_t FixedObject =
      getFixedObjectSize(MF, AFI, /*IsWin64*/ true, /*IsFunclet*/ false);
  int UnwindHelpFI = MFI.CreateFixedObject(/*Size*/ 8,
                                           /*SPOffset*/ -FixedObject,
                                           /*IsImmutable=*/false);
  EHInfo.UnwindHelpFrameIdx = UnwindHelpFI;

  // We need to store -2 into the UnwindHelp object at the start of the
  // function.
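  // (-2 appears to be the initial "no handler active" sentinel that the
  // Windows C++ EH runtime expects to find in UnwindHelp.)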
  DebugLoc DL;
  RS->enterBasicBlockEnd(MBB);
  RS->backward(std::prev(MBBI));
  unsigned DstReg = RS->FindUnusedReg(&AArch64::GPR64commonRegClass);
  assert(DstReg && "There must be a free register after frame setup");
  BuildMI(MBB, MBBI, DL, TII.get(AArch64::MOVi64imm), DstReg).addImm(-2);
  BuildMI(MBB, MBBI, DL, TII.get(AArch64::STURXi))
      .addReg(DstReg, getKillRegState(true))
      .addFrameIndex(UnwindHelpFI)
      .addImm(0);
}

namespace {
struct TagStoreInstr {
  MachineInstr *MI;
  int64_t Offset, Size;
  explicit TagStoreInstr(MachineInstr *MI, int64_t Offset, int64_t Size)
      : MI(MI), Offset(Offset), Size(Size) {}
};

class TagStoreEdit {
  MachineFunction *MF;
  MachineBasicBlock *MBB;
  MachineRegisterInfo *MRI;
  // Tag store instructions that are being replaced.
  SmallVector<TagStoreInstr, 8> TagStores;
  // Combined memref arguments of the above instructions.
  SmallVector<MachineMemOperand *, 8> CombinedMemRefs;

  // Replace allocation tags in [FrameReg + FrameRegOffset, FrameReg +
  // FrameRegOffset + Size) with the address tag of SP.
  Register FrameReg;
  StackOffset FrameRegOffset;
  int64_t Size;
  // If not None, move FrameReg to (FrameReg + FrameRegUpdate) at the end.
  Optional<int64_t> FrameRegUpdate;
  // MIFlags for any FrameReg updating instructions.
  unsigned FrameRegUpdateFlags;

  // Use zeroing instruction variants.
  bool ZeroData;
  DebugLoc DL;

  void emitUnrolled(MachineBasicBlock::iterator InsertI);
  void emitLoop(MachineBasicBlock::iterator InsertI);

public:
  TagStoreEdit(MachineBasicBlock *MBB, bool ZeroData)
      : MBB(MBB), ZeroData(ZeroData) {
    MF = MBB->getParent();
    MRI = &MF->getRegInfo();
  }
  // Add an instruction to be replaced. Instructions must be added in
  // ascending order of Offset and must be adjacent.
  void addInstruction(TagStoreInstr I) {
    assert((TagStores.empty() ||
            TagStores.back().Offset + TagStores.back().Size == I.Offset) &&
           "Non-adjacent tag store instructions.");
    TagStores.push_back(I);
  }
  void clear() { TagStores.clear(); }
  // Emit equivalent code at the given location, and erase the current set of
  // instructions. May skip if the replacement is not profitable. May invalidate
  // the input iterator and replace it with a valid one.
  void emitCode(MachineBasicBlock::iterator &InsertI,
                const AArch64FrameLowering *TFI, bool IsLast);
};

void TagStoreEdit::emitUnrolled(MachineBasicBlock::iterator InsertI) {
  const AArch64InstrInfo *TII =
      MF->getSubtarget<AArch64Subtarget>().getInstrInfo();

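  // STG/ST2G take a signed 9-bit immediate scaled by the 16-byte tag granule,
  // so the directly addressable range from a base register is
  // [-256 * 16, 255 * 16] bytes.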
  const int64_t kMinOffset = -256 * 16;
  const int64_t kMaxOffset = 255 * 16;

  Register BaseReg = FrameReg;
  int64_t BaseRegOffsetBytes = FrameRegOffset.getFixed();
  if (BaseRegOffsetBytes < kMinOffset ||
      BaseRegOffsetBytes + (Size - Size % 32) > kMaxOffset) {
    Register ScratchReg = MRI->createVirtualRegister(&AArch64::GPR64RegClass);
    emitFrameOffset(*MBB, InsertI, DL, ScratchReg, BaseReg,
                    StackOffset::getFixed(BaseRegOffsetBytes), TII);
    BaseReg = ScratchReg;
    BaseRegOffsetBytes = 0;
  }

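  // Emit ST2G for each full 32-byte chunk, falling back to a single STG when
  // only one 16-byte granule remains.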
  MachineInstr *LastI = nullptr;
  while (Size) {
    int64_t InstrSize = (Size > 16) ? 32 : 16;
    unsigned Opcode =
        InstrSize == 16
            ? (ZeroData ? AArch64::STZGOffset : AArch64::STGOffset)
            : (ZeroData ? AArch64::STZ2GOffset : AArch64::ST2GOffset);
    MachineInstr *I = BuildMI(*MBB, InsertI, DL, TII->get(Opcode))
                          .addReg(AArch64::SP)
                          .addReg(BaseReg)
                          .addImm(BaseRegOffsetBytes / 16)
                          .setMemRefs(CombinedMemRefs);
    // A store to [BaseReg, #0] should go last for an opportunity to fold the
    // final SP adjustment in the epilogue.
    if (BaseRegOffsetBytes == 0)
      LastI = I;
    BaseRegOffsetBytes += InstrSize;
    Size -= InstrSize;
  }

  if (LastI)
    MBB->splice(InsertI, MBB, LastI);
}

void TagStoreEdit::emitLoop(MachineBasicBlock::iterator InsertI) {
  const AArch64InstrInfo *TII =
      MF->getSubtarget<AArch64Subtarget>().getInstrInfo();

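  // STGloop_wback/STZGloop_wback are pseudos that later expand into a loop
  // tagging LoopSize bytes upwards from BaseReg, leaving BaseReg pointing
  // just past the tagged region (hence the writeback suffix).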
  Register BaseReg = FrameRegUpdate
                         ? FrameReg
                         : MRI->createVirtualRegister(&AArch64::GPR64RegClass);
  Register SizeReg = MRI->createVirtualRegister(&AArch64::GPR64RegClass);

  emitFrameOffset(*MBB, InsertI, DL, BaseReg, FrameReg, FrameRegOffset, TII);

  int64_t LoopSize = Size;
  // If the loop size is not a multiple of 32, split off one 16-byte store at
  // the end to fold BaseReg update into.
  if (FrameRegUpdate && *FrameRegUpdate)
    LoopSize -= LoopSize % 32;
  MachineInstr *LoopI = BuildMI(*MBB, InsertI, DL,
                                TII->get(ZeroData ? AArch64::STZGloop_wback
                                                  : AArch64::STGloop_wback))
                            .addDef(SizeReg)
                            .addDef(BaseReg)
                            .addImm(LoopSize)
                            .addReg(BaseReg)
                            .setMemRefs(CombinedMemRefs);
  if (FrameRegUpdate)
    LoopI->setFlags(FrameRegUpdateFlags);

  int64_t ExtraBaseRegUpdate =
      FrameRegUpdate ? (*FrameRegUpdate - FrameRegOffset.getFixed() - Size) : 0;
  if (LoopSize < Size) {
    assert(FrameRegUpdate);
    assert(Size - LoopSize == 16);
    // Tag 16 more bytes at BaseReg and update BaseReg.
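    // The post-index immediate counts 16-byte granules: one for the final
    // granule stored here, plus whatever extra adjustment was folded in from
    // the merged base register update.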
    BuildMI(*MBB, InsertI, DL,
            TII->get(ZeroData ? AArch64::STZGPostIndex : AArch64::STGPostIndex))
        .addDef(BaseReg)
        .addReg(BaseReg)
        .addReg(BaseReg)
        .addImm(1 + ExtraBaseRegUpdate / 16)
        .setMemRefs(CombinedMemRefs)
        .setMIFlags(FrameRegUpdateFlags);
  } else if (ExtraBaseRegUpdate) {
    // Update BaseReg.
    BuildMI(
        *MBB, InsertI, DL,
        TII->get(ExtraBaseRegUpdate > 0 ? AArch64::ADDXri : AArch64::SUBXri))
        .addDef(BaseReg)
        .addReg(BaseReg)
        .addImm(std::abs(ExtraBaseRegUpdate))
        .addImm(0)
        .setMIFlags(FrameRegUpdateFlags);
  }
}

// Check if *II is a register update that can be merged into the STGloop that
// ends at (Reg + Size). If so, *TotalOffset is set to the offset of the
// update instruction; the adjustment still required after the loop,
// (*TotalOffset - Size), must be 16-byte aligned and encodable.
bool canMergeRegUpdate(MachineBasicBlock::iterator II, unsigned Reg,
                       int64_t Size, int64_t *TotalOffset) {
  MachineInstr &MI = *II;
  if ((MI.getOpcode() == AArch64::ADDXri ||
       MI.getOpcode() == AArch64::SUBXri) &&
      MI.getOperand(0).getReg() == Reg && MI.getOperand(1).getReg() == Reg) {
    unsigned Shift = AArch64_AM::getShiftValue(MI.getOperand(3).getImm());
    int64_t Offset = MI.getOperand(2).getImm() << Shift;
    if (MI.getOpcode() == AArch64::SUBXri)
      Offset = -Offset;
    int64_t AbsPostOffset = std::abs(Offset - Size);
    const int64_t kMaxOffset =
        0xFFF; // Max encoding for unshifted ADDXri / SUBXri
    if (AbsPostOffset <= kMaxOffset && AbsPostOffset % 16 == 0) {
      *TotalOffset = Offset;
      return true;
    }
  }
  return false;
}

void mergeMemRefs(const SmallVectorImpl<TagStoreInstr> &TSE,
                  SmallVectorImpl<MachineMemOperand *> &MemRefs) {
  MemRefs.clear();
  for (auto &TS : TSE) {
    MachineInstr *MI = TS.MI;
    // An instruction without memory operands may access anything. Be
    // conservative and return an empty list.
    if (MI->memoperands_empty()) {
      MemRefs.clear();
      return;
    }
    MemRefs.append(MI->memoperands_begin(), MI->memoperands_end());
  }
}

void TagStoreEdit::emitCode(MachineBasicBlock::iterator &InsertI,
                            const AArch64FrameLowering *TFI, bool IsLast) {
  if (TagStores.empty())
    return;
  TagStoreInstr &FirstTagStore = TagStores[0];
  TagStoreInstr &LastTagStore = TagStores[TagStores.size() - 1];
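  // Total number of bytes covered by the collected (adjacent) tag stores.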
  Size = LastTagStore.Offset - FirstTagStore.Offset + LastTagStore.Size;
  DL = TagStores[0].MI->getDebugLoc();

  Register Reg;
  FrameRegOffset = TFI->resolveFrameOffsetReference(
      *MF, FirstTagStore.Offset, false /*isFixed*/, false /*isSVE*/, Reg,
      /*PreferFP=*/false, /*ForSimm=*/true);
  FrameReg = Reg;
  FrameRegUpdate = None;

  mergeMemRefs(TagStores, CombinedMemRefs);

  LLVM_DEBUG(dbgs() << "Replacing adjacent STG instructions:\n";
             for (const auto &Instr
                  : TagStores) { dbgs() << " " << *Instr.MI; });

  // Size threshold where a loop becomes shorter than a linear sequence of
  // tagging instructions.
  const int kSetTagLoopThreshold = 176;
  if (Size < kSetTagLoopThreshold) {
    if (TagStores.size() < 2)
      return;
    emitUnrolled(InsertI);
  } else {
    MachineInstr *UpdateInstr = nullptr;
    int64_t TotalOffset;
    if (IsLast) {
      // See if we can merge base register update into the STGloop.
      // This is done in AArch64LoadStoreOptimizer for "normal" stores,
      // but STGloop is way too unusual for that, and also it only
      // realistically happens in function epilogue. Also, STGloop is expanded
      // before that pass.
      if (InsertI != MBB->end() &&
          canMergeRegUpdate(InsertI, FrameReg, FrameRegOffset.getFixed() + Size,
                            &TotalOffset)) {
        UpdateInstr = &*InsertI++;
        LLVM_DEBUG(dbgs() << "Folding SP update into loop:\n "
                          << *UpdateInstr);
      }
    }

    if (!UpdateInstr && TagStores.size() < 2)
      return;

    if (UpdateInstr) {
      FrameRegUpdate = TotalOffset;
      FrameRegUpdateFlags = UpdateInstr->getFlags();
    }
    emitLoop(InsertI);
    if (UpdateInstr)
      UpdateInstr->eraseFromParent();
  }

  for (auto &TS : TagStores)
    TS.MI->eraseFromParent();
}

bool isMergeableStackTaggingInstruction(MachineInstr &MI, int64_t &Offset,
                                        int64_t &Size, bool &ZeroData) {
  MachineFunction &MF = *MI.getParent()->getParent();
  const MachineFrameInfo &MFI = MF.getFrameInfo();

  unsigned Opcode = MI.getOpcode();
  ZeroData = (Opcode == AArch64::STZGloop || Opcode == AArch64::STZGOffset ||
              Opcode == AArch64::STZ2GOffset);

  if (Opcode == AArch64::STGloop || Opcode == AArch64::STZGloop) {
    if (!MI.getOperand(0).isDead() || !MI.getOperand(1).isDead())
      return false;
    if (!MI.getOperand(2).isImm() || !MI.getOperand(3).isFI())
      return false;
    Offset = MFI.getObjectOffset(MI.getOperand(3).getIndex());
    Size = MI.getOperand(2).getImm();
    return true;
  }

  if (Opcode == AArch64::STGOffset || Opcode == AArch64::STZGOffset)
    Size = 16;
  else if (Opcode == AArch64::ST2GOffset || Opcode == AArch64::STZ2GOffset)
    Size = 32;
  else
    return false;

  if (MI.getOperand(0).getReg() != AArch64::SP || !MI.getOperand(1).isFI())
    return false;

  Offset = MFI.getObjectOffset(MI.getOperand(1).getIndex()) +
           16 * MI.getOperand(2).getImm();
  return true;
}

// Detect a run of memory tagging instructions for adjacent stack frame slots,
// and replace them with a shorter instruction sequence:
// * replace STG + STG with ST2G
// * replace STGloop + STGloop with STGloop
// This code needs to run when stack slot offsets are already known, but before
// FrameIndex operands in STG instructions are eliminated.
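// For example (illustrative only; exact operands depend on the frame layout):
//   STGOffset $sp, %stack.0, 0   ; tag the 16 bytes of slot 0
//   STGOffset $sp, %stack.1, 0   ; tag the adjacent 16 bytes of slot 1
// becomes, once the slots are known to be contiguous:
//   ST2GOffset $sp, %stack.0, 0  ; tag both granules in one instruction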
MachineBasicBlock::iterator tryMergeAdjacentSTG(MachineBasicBlock::iterator II,
                                                const AArch64FrameLowering *TFI,
                                                RegScavenger *RS) {
  bool FirstZeroData;
  int64_t Size, Offset;
  MachineInstr &MI = *II;
  MachineBasicBlock *MBB = MI.getParent();
  MachineBasicBlock::iterator NextI = ++II;
  if (&MI == &MBB->instr_back())
    return II;
  if (!isMergeableStackTaggingInstruction(MI, Offset, Size, FirstZeroData))
    return II;

  SmallVector<TagStoreInstr, 4> Instrs;
  Instrs.emplace_back(&MI, Offset, Size);

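  // Bound the forward scan for further mergeable tag stores; only
  // non-transient, non-tagging instructions count toward this limit.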
  constexpr int kScanLimit = 10;
  int Count = 0;
  for (MachineBasicBlock::iterator E = MBB->end();
       NextI != E && Count < kScanLimit; ++NextI) {
    MachineInstr &MI = *NextI;
    bool ZeroData;
    int64_t Size, Offset;
    // Collect instructions that update memory tags with a FrameIndex operand
    // and (when applicable) constant size, and whose output registers are dead
    // (the latter is almost always the case in practice). Since these
    // instructions effectively have no inputs or outputs, we are free to skip
    // any non-aliasing instructions in between without tracking used registers.
    if (isMergeableStackTaggingInstruction(MI, Offset, Size, ZeroData)) {
      if (ZeroData != FirstZeroData)
        break;
      Instrs.emplace_back(&MI, Offset, Size);
      continue;
    }

    // Only count non-transient, non-tagging instructions toward the scan
    // limit.
    if (!MI.isTransient())
      ++Count;

    // Just in case, stop before the epilogue code starts.
    if (MI.getFlag(MachineInstr::FrameSetup) ||
        MI.getFlag(MachineInstr::FrameDestroy))
      break;

    // Reject anything that may alias the collected instructions.
    if (MI.mayLoadOrStore() || MI.hasUnmodeledSideEffects())
      break;
  }

  // New code will be inserted after the last tagging instruction we've found.
  MachineBasicBlock::iterator InsertI = Instrs.back().MI;
  InsertI++;

  llvm::stable_sort(Instrs,
                    [](const TagStoreInstr &Left, const TagStoreInstr &Right) {
                      return Left.Offset < Right.Offset;
                    });

  // Make sure that we don't have any overlapping stores.
  int64_t CurOffset = Instrs[0].Offset;
  for (auto &Instr : Instrs) {
    if (CurOffset > Instr.Offset)
      return NextI;
    CurOffset = Instr.Offset + Instr.Size;
  }

  // Find contiguous runs of tagged memory and emit shorter instruction
  // sequences for them when possible.
  TagStoreEdit TSE(MBB, FirstZeroData);
  Optional<int64_t> EndOffset;
  for (auto &Instr : Instrs) {
    if (EndOffset && *EndOffset != Instr.Offset) {
      // Found a gap.
      TSE.emitCode(InsertI, TFI, /*IsLast = */ false);
      TSE.clear();
    }

    TSE.addInstruction(Instr);
    EndOffset = Instr.Offset + Instr.Size;
  }

  TSE.emitCode(InsertI, TFI, /*IsLast = */ true);

  return InsertI;
}
} // namespace

void AArch64FrameLowering::processFunctionBeforeFrameIndicesReplaced(
    MachineFunction &MF, RegScavenger *RS = nullptr) const {
  if (StackTaggingMergeSetTag)
    for (auto &BB : MF)
      for (MachineBasicBlock::iterator II = BB.begin(); II != BB.end();)
        II = tryMergeAdjacentSTG(II, this, RS);
}

/// For Win64 AArch64 EH, the offset to the Unwind object is from the SP
/// before the update. This is easily retrieved as it is exactly the offset
/// that is set in processFunctionBeforeFrameFinalized.
StackOffset AArch64FrameLowering::getFrameIndexReferencePreferSP(
    const MachineFunction &MF, int FI, Register &FrameReg,
    bool IgnoreSPUpdates) const {
  const MachineFrameInfo &MFI = MF.getFrameInfo();
  if (IgnoreSPUpdates) {
    LLVM_DEBUG(dbgs() << "Offset from the SP for " << FI << " is "
                      << MFI.getObjectOffset(FI) << "\n");
    FrameReg = AArch64::SP;
    return StackOffset::getFixed(MFI.getObjectOffset(FI));
  }

  return getFrameIndexReference(MF, FI, FrameReg);
}

/// The parent frame offset (aka dispFrame) is only used on X86_64 to retrieve
/// the parent's frame pointer.
unsigned AArch64FrameLowering::getWinEHParentFrameOffset(
    const MachineFunction &MF) const {
  return 0;
}

/// Funclets only need to account for space for the callee saved registers,
/// as the locals are accounted for in the parent's stack frame.
unsigned AArch64FrameLowering::getWinEHFuncletFrameSize(
    const MachineFunction &MF) const {
  // This is the size of the pushed CSRs.
  unsigned CSSize =
      MF.getInfo<AArch64FunctionInfo>()->getCalleeSavedStackSize();
  // This is the amount of stack a funclet needs to allocate.
  return alignTo(CSSize + MF.getFrameInfo().getMaxCallFrameSize(),
                 getStackAlign());
}

namespace {
struct FrameObject {
  bool IsValid = false;
  // Index of the object in MFI.
  int ObjectIndex = 0;
  // Group ID this object belongs to.
  int GroupIndex = -1;
  // This object should be placed first (closest to SP).
  bool ObjectFirst = false;
  // This object's group (which always contains the object with
  // ObjectFirst==true) should be placed first.
  bool GroupFirst = false;
};

class GroupBuilder {
  SmallVector<int, 8> CurrentMembers;
  int NextGroupIndex = 0;
  std::vector<FrameObject> &Objects;

public:
  GroupBuilder(std::vector<FrameObject> &Objects) : Objects(Objects) {}
  void AddMember(int Index) { CurrentMembers.push_back(Index); }
  void EndCurrentGroup() {
    if (CurrentMembers.size() > 1) {
      // Create a new group with the current member list. This might remove them
      // from their pre-existing groups. That's OK, dealing with overlapping
      // groups is too hard and unlikely to make a difference.
      LLVM_DEBUG(dbgs() << "group:");
      for (int Index : CurrentMembers) {
        Objects[Index].GroupIndex = NextGroupIndex;
        LLVM_DEBUG(dbgs() << " " << Index);
      }
      LLVM_DEBUG(dbgs() << "\n");
      NextGroupIndex++;
    }
    CurrentMembers.clear();
  }
};

bool FrameObjectCompare(const FrameObject &A, const FrameObject &B) {
  // Objects at a lower index are closer to FP; objects at a higher index are
  // closer to SP.
  //
  // For consistency in our comparison, all invalid objects are placed
  // at the end. This also allows us to stop walking when we hit the
  // first invalid item after it's all sorted.
  //
  // The "first" object goes first (closest to SP), followed by the members of
  // the "first" group.
  //
  // The rest are sorted by the group index to keep the groups together.
  // Higher numbered groups are more likely to be around longer (i.e. untagged
  // in the function epilogue and not at some earlier point). Place them closer
  // to SP.
  //
  // If all else equal, sort by the object index to keep the objects in the
  // original order.
  return std::make_tuple(!A.IsValid, A.ObjectFirst, A.GroupFirst, A.GroupIndex,
                         A.ObjectIndex) <
         std::make_tuple(!B.IsValid, B.ObjectFirst, B.GroupFirst, B.GroupIndex,
                         B.ObjectIndex);
}
} // namespace

void AArch64FrameLowering::orderFrameObjects(
    const MachineFunction &MF, SmallVectorImpl<int> &ObjectsToAllocate) const {
  if (!OrderFrameObjects || ObjectsToAllocate.empty())
    return;

  const MachineFrameInfo &MFI = MF.getFrameInfo();
  std::vector<FrameObject> FrameObjects(MFI.getObjectIndexEnd());
  for (auto &Obj : ObjectsToAllocate) {
    FrameObjects[Obj].IsValid = true;
    FrameObjects[Obj].ObjectIndex = Obj;
  }

  // Identify stack slots that are tagged at the same time.
  GroupBuilder GB(FrameObjects);
  for (auto &MBB : MF) {
    for (auto &MI : MBB) {
      if (MI.isDebugInstr())
        continue;
      int OpIndex;
      switch (MI.getOpcode()) {
      case AArch64::STGloop:
      case AArch64::STZGloop:
        OpIndex = 3;
        break;
      case AArch64::STGOffset:
      case AArch64::STZGOffset:
      case AArch64::ST2GOffset:
      case AArch64::STZ2GOffset:
        OpIndex = 1;
        break;
      default:
        OpIndex = -1;
      }

      int TaggedFI = -1;
      if (OpIndex >= 0) {
        const MachineOperand &MO = MI.getOperand(OpIndex);
        if (MO.isFI()) {
          int FI = MO.getIndex();
          if (FI >= 0 && FI < MFI.getObjectIndexEnd() &&
              FrameObjects[FI].IsValid)
            TaggedFI = FI;
        }
      }

      // If this is a stack tagging instruction for a slot that is not part of a
      // group yet, either start a new group or add it to the current one.
      if (TaggedFI >= 0)
        GB.AddMember(TaggedFI);
      else
        GB.EndCurrentGroup();
    }
    // Groups should never span multiple basic blocks.
    GB.EndCurrentGroup();
  }

  // If the function's tagged base pointer is pinned to a stack slot, we want to
  // put that slot first when possible. This will likely place it at SP + 0,
  // and save one instruction when generating the base pointer because IRG does
  // not allow an immediate offset.
  const AArch64FunctionInfo &AFI = *MF.getInfo<AArch64FunctionInfo>();
  Optional<int> TBPI = AFI.getTaggedBasePointerIndex();
  if (TBPI) {
    FrameObjects[*TBPI].ObjectFirst = true;
    FrameObjects[*TBPI].GroupFirst = true;
    int FirstGroupIndex = FrameObjects[*TBPI].GroupIndex;
    if (FirstGroupIndex >= 0)
      for (FrameObject &Object : FrameObjects)
        if (Object.GroupIndex == FirstGroupIndex)
          Object.GroupFirst = true;
  }

  llvm::stable_sort(FrameObjects, FrameObjectCompare);

  int i = 0;
  for (auto &Obj : FrameObjects) {
    // All invalid items are sorted at the end, so it's safe to stop.
    if (!Obj.IsValid)
      break;
    ObjectsToAllocate[i++] = Obj.ObjectIndex;
  }

  LLVM_DEBUG(dbgs() << "Final frame order:\n"; for (auto &Obj
                                                    : FrameObjects) {
    if (!Obj.IsValid)
      break;
    dbgs() << " " << Obj.ObjectIndex << ": group " << Obj.GroupIndex;
    if (Obj.ObjectFirst)
      dbgs() << ", first";
    if (Obj.GroupFirst)
      dbgs() << ", group-first";
    dbgs() << "\n";
  });
}