//===- AArch64FrameLowering.cpp - AArch64 Frame Lowering -------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file contains the AArch64 implementation of TargetFrameLowering class.
//
// On AArch64, stack frames are structured as follows:
//
// The stack grows downward.
//
// All of the individual frame areas on the frame below are optional, i.e. it's
// possible to create a function so that the particular area isn't present
// in the frame.
//
// At function entry, the "frame" looks as follows:
//
// |                                   | Higher address
// |-----------------------------------|
// |                                   |
// | arguments passed on the stack     |
// |                                   |
// |-----------------------------------| <- sp
// |                                   | Lower address
//
//
// After the prologue has run, the frame has the following general structure.
// Note that this doesn't depict the case where a red-zone is used. Also,
// technically the last frame area (VLAs) doesn't get created until the
// main function body runs, after the prologue; it is depicted here for
// completeness.
//
// |                                   | Higher address
// |-----------------------------------|
// |                                   |
// | arguments passed on the stack     |
// |                                   |
// |-----------------------------------|
// |                                   |
// | (Win64 only) varargs from reg     |
// |                                   |
// |-----------------------------------|
// |                                   |
// | callee-saved gpr registers        | <--.
// |                                   |    | On Darwin platforms these
// |- - - - - - - - - - - - - - - - - -|    | callee saves are swapped,
// |                                   |    |      (frame record first)
// | prev_fp, prev_lr                  | <--'
// | (a.k.a. "frame record")           |
// |-----------------------------------| <- fp(=x29)
// |                                   |
// | callee-saved fp/simd/SVE regs     |
// |                                   |
// |-----------------------------------|
// |                                   |
// |        SVE stack objects          |
// |                                   |
// |-----------------------------------|
// |.empty.space.to.make.part.below....|
// |.aligned.in.case.it.needs.more.than| (size of this area is unknown at
// |.the.standard.16-byte.alignment....|  compile time; if present)
// |-----------------------------------|
// |                                   |
// | local variables of fixed size     |
// | including spill slots             |
// |-----------------------------------| <- bp(not defined by ABI,
// |.variable-sized.local.variables....|       LLVM chooses X19)
// |.(VLAs)............................| (size of this area is unknown at
// |...................................|  compile time)
// |-----------------------------------| <- sp
// |                                   | Lower address
//
//
// To access the data in a frame, a constant offset must be computable at
// compile time from one of the pointers (fp, bp, sp). The size of the areas
// with a dotted background cannot be computed at compile-time if they are
// present, so all three of fp, bp and sp must be set up in order to access
// all contents in the frame areas, assuming all of the frame areas are
// non-empty.
//
// For most functions, some of the frame areas are empty. For those functions,
// it may not be necessary to set up fp or bp:
// * A base pointer is definitely needed when there are both VLAs and local
//   variables with more-than-default alignment requirements.
// * A frame pointer is definitely needed when there are local variables with
//   more-than-default alignment requirements.
//
// For Darwin platforms the frame-record (fp, lr) is stored at the top of the
// callee-saved area, since the unwind encoding does not allow for encoding
// this dynamically and existing tools depend on this layout. For other
// platforms, the frame-record is stored at the bottom of the (gpr) callee-saved
// area to allow SVE stack objects (allocated directly below the callee-saves,
// if available) to be accessed directly from the framepointer.
// The SVE spill/fill instructions have VL-scaled addressing modes such
// as:
//    ldr z8, [fp, #-7 mul vl]
// For SVE the size of the vector length (VL) is not known at compile-time, so
// '#-7 mul vl' is an offset that can only be evaluated at runtime. With this
// layout, we don't need to add an unscaled offset to the framepointer before
// accessing the SVE object in the frame.
//
// In some cases when a base pointer is not strictly needed, it is generated
// anyway when offsets from the frame pointer to access local variables become
// so large that the offset can't be encoded in the immediate fields of loads
// or stores.
//
// FIXME: also explain the redzone concept.
// FIXME: also explain the concept of reserved call frames.
//
//===----------------------------------------------------------------------===//

#include "AArch64FrameLowering.h"
#include "AArch64InstrInfo.h"
#include "AArch64MachineFunctionInfo.h"
#include "AArch64RegisterInfo.h"
#include "AArch64Subtarget.h"
#include "AArch64TargetMachine.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/LivePhysRegs.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RegisterScavenging.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/CodeGen/WinEHFuncInfo.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/Function.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCDwarf.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/LEB128.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
#include <cassert>
#include <cstdint>
#include <iterator>
#include <vector>

using namespace llvm;

#define DEBUG_TYPE "frame-info"

static cl::opt<bool> EnableRedZone("aarch64-redzone",
                                   cl::desc("enable use of redzone on AArch64"),
                                   cl::init(false), cl::Hidden);

static cl::opt<bool>
    ReverseCSRRestoreSeq("reverse-csr-restore-seq",
                         cl::desc("reverse the CSR restore sequence"),
                         cl::init(false), cl::Hidden);

static cl::opt<bool> StackTaggingMergeSetTag(
    "stack-tagging-merge-settag",
    cl::desc("merge settag instruction in function epilog"), cl::init(true),
    cl::Hidden);

static cl::opt<bool> OrderFrameObjects("aarch64-order-frame-objects",
                                       cl::desc("sort stack allocations"),
                                       cl::init(true), cl::Hidden);

STATISTIC(NumRedZoneFunctions, "Number of functions using red zone");
/// Returns the argument pop size.
static uint64_t getArgumentPopSize(MachineFunction &MF,
                                   MachineBasicBlock &MBB) {
  MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
  bool IsTailCallReturn = false;
  if (MBB.end() != MBBI) {
    unsigned RetOpcode = MBBI->getOpcode();
    IsTailCallReturn = RetOpcode == AArch64::TCRETURNdi ||
                       RetOpcode == AArch64::TCRETURNri ||
                       RetOpcode == AArch64::TCRETURNriBTI;
  }
  AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();

  uint64_t ArgumentPopSize = 0;
  if (IsTailCallReturn) {
    MachineOperand &StackAdjust = MBBI->getOperand(1);

    // For a tail-call in a callee-pops-arguments environment, some or all of
    // the stack may actually be in use for the call's arguments; this is
    // calculated during LowerCall and consumed here...
    ArgumentPopSize = StackAdjust.getImm();
  } else {
    // ... otherwise the amount to pop is *all* of the argument space,
    // conveniently stored in the MachineFunctionInfo by
    // LowerFormalArguments. This will, of course, be zero for the C calling
    // convention.
    ArgumentPopSize = AFI->getArgumentStackToRestore();
  }

  return ArgumentPopSize;
}

/// This is the biggest offset to the stack pointer we can encode in aarch64
/// instructions (without using a separate calculation and a temp register).
/// Note that the exceptions here are vector stores/loads, which cannot encode
/// any displacements (see estimateRSStackSizeLimit(), isAArch64FrameOffsetLegal()).
static const unsigned DefaultSafeSPDisplacement = 255;
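// As a rough illustration (not tied to any one instruction selected here),
// 255 is the upper bound of the signed 9-bit unscaled-offset addressing
// forms, e.g.:
//   ldur x0, [sp, #255]
// Frame accesses further away than this may need the offset materialized in
// a scratch register first.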

/// Look at each instruction that references stack frames and return the stack
/// size limit beyond which some of these instructions will require a scratch
/// register during their expansion later.
static unsigned estimateRSStackSizeLimit(MachineFunction &MF) {
  // FIXME: For now, just conservatively guesstimate based on unscaled indexing
  // range. We'll end up allocating an unnecessary spill slot a lot, but
  // realistically that's not a big deal at this stage of the game.
  for (MachineBasicBlock &MBB : MF) {
    for (MachineInstr &MI : MBB) {
      if (MI.isDebugInstr() || MI.isPseudo() ||
          MI.getOpcode() == AArch64::ADDXri ||
          MI.getOpcode() == AArch64::ADDSXri)
        continue;

      for (const MachineOperand &MO : MI.operands()) {
        if (!MO.isFI())
          continue;

        StackOffset Offset;
        if (isAArch64FrameOffsetLegal(MI, Offset, nullptr, nullptr, nullptr) ==
            AArch64FrameOffsetCannotUpdate)
          return 0;
      }
    }
  }
  return DefaultSafeSPDisplacement;
}

TargetStackID::Value
AArch64FrameLowering::getStackIDForScalableVectors() const {
  return TargetStackID::ScalableVector;
}

/// Returns the size of the fixed object area (allocated next to sp on entry).
/// On Win64 this may include a var args area and an UnwindHelp object for EH.
static unsigned getFixedObjectSize(const MachineFunction &MF,
                                   const AArch64FunctionInfo *AFI, bool IsWin64,
                                   bool IsFunclet) {
  if (!IsWin64 || IsFunclet) {
    // Only Win64 uses fixed objects, and then only for the function (not
    // funclets)
    return 0;
  } else {
    // Var args are stored here in the primary function.
    const unsigned VarArgsArea = AFI->getVarArgsGPRSize();
    // To support EH funclets we allocate an UnwindHelp object
    const unsigned UnwindHelpObject = (MF.hasEHFunclets() ? 8 : 0);
    return alignTo(VarArgsArea + UnwindHelpObject, 16);
  }
}

/// Returns the size of the entire SVE stackframe (calleesaves + spills).
static StackOffset getSVEStackSize(const MachineFunction &MF) {
  const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
  return StackOffset::getScalable((int64_t)AFI->getStackSizeSVE());
}

bool AArch64FrameLowering::canUseRedZone(const MachineFunction &MF) const {
  if (!EnableRedZone)
    return false;
  // Don't use the red zone if the function explicitly asks us not to.
  // This is typically used for kernel code.
  if (MF.getFunction().hasFnAttribute(Attribute::NoRedZone))
    return false;

  const MachineFrameInfo &MFI = MF.getFrameInfo();
  const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
  uint64_t NumBytes = AFI->getLocalStackSize();

  return !(MFI.hasCalls() || hasFP(MF) || NumBytes > 128 ||
           getSVEStackSize(MF));
}
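
// A minimal sketch of what the red zone buys us, assuming a hypothetical
// leaf function with a single 8-byte local: since nothing here can clobber
// the 128 bytes below sp, the prologue/epilogue SP adjustment can be elided
// entirely and the local stored at a negative offset:
//   stur x0, [sp, #-8]    // local lives in the red zone
// instead of:
//   sub  sp, sp, #16
//   str  x0, [sp, #8]
//   ...
//   add  sp, sp, #16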

/// hasFP - Return true if the specified function should have a dedicated frame
/// pointer register.
bool AArch64FrameLowering::hasFP(const MachineFunction &MF) const {
  const MachineFrameInfo &MFI = MF.getFrameInfo();
  const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo();
  // Win64 EH requires a frame pointer if funclets are present, as the locals
  // are accessed off the frame pointer in both the parent function and the
  // funclets.
  if (MF.hasEHFunclets())
    return true;
  // Retain behavior of always omitting the FP for leaf functions when
  // possible.
  if (MF.getTarget().Options.DisableFramePointerElim(MF))
    return true;
  if (MFI.hasVarSizedObjects() || MFI.isFrameAddressTaken() ||
      MFI.hasStackMap() || MFI.hasPatchPoint() ||
      RegInfo->needsStackRealignment(MF))
    return true;
  // With large callframes around we may need to use FP to access the
  // scavenging emergency spillslot.
  //
  // Unfortunately some calls to hasFP() like machine verifier ->
  // getReservedReg() -> hasFP in the middle of global isel are too early
  // to know the max call frame size. Hopefully conservatively returning "true"
  // in those cases is fine.
  // DefaultSafeSPDisplacement is fine as we only emergency spill GP regs.
  if (!MFI.isMaxCallFrameSizeComputed() ||
      MFI.getMaxCallFrameSize() > DefaultSafeSPDisplacement)
    return true;

  return false;
}

/// hasReservedCallFrame - Under normal circumstances, when a frame pointer is
/// not required, we reserve argument space for call sites in the function
/// immediately on entry to the current function. This eliminates the need for
/// add/sub sp brackets around call sites. Returns true if the call frame is
/// included as part of the stack frame.
bool
AArch64FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const {
  return !MF.getFrameInfo().hasVarSizedObjects();
}

MachineBasicBlock::iterator AArch64FrameLowering::eliminateCallFramePseudoInstr(
    MachineFunction &MF, MachineBasicBlock &MBB,
    MachineBasicBlock::iterator I) const {
  const AArch64InstrInfo *TII =
      static_cast<const AArch64InstrInfo *>(MF.getSubtarget().getInstrInfo());
  DebugLoc DL = I->getDebugLoc();
  unsigned Opc = I->getOpcode();
  bool IsDestroy = Opc == TII->getCallFrameDestroyOpcode();
  uint64_t CalleePopAmount = IsDestroy ? I->getOperand(1).getImm() : 0;

  if (!hasReservedCallFrame(MF)) {
    int64_t Amount = I->getOperand(0).getImm();
    Amount = alignTo(Amount, getStackAlign());
    if (!IsDestroy)
      Amount = -Amount;

    // N.b. if CalleePopAmount is valid but zero (i.e. callee would pop, but it
    // doesn't have to pop anything), then the first operand will be zero too
    // so this adjustment is a no-op.
    if (CalleePopAmount == 0) {
      // FIXME: in-function stack adjustment for calls is limited to 24-bits
      // because there's no guaranteed temporary register available.
      //
      // ADD/SUB (immediate) has only LSL #0 and LSL #12 available.
      // 1) For offset <= 12-bit, we use LSL #0
      // 2) For 12-bit <= offset <= 24-bit, we use two instructions. One uses
      //    LSL #0, and the other uses LSL #12.
      //
      // Most call frames will be allocated at the start of a function so
      // this is OK, but it is a limitation that needs dealing with.
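      //
      // As a quick illustration with a hypothetical adjustment of 0x12345
      // bytes, emitFrameOffset() below would split it into two subtractions:
      //   sub sp, sp, #0x12, lsl #12   // 0x12000 bytes
      //   sub sp, sp, #0x345           // remaining 0x345 bytes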
      assert(Amount > -0xffffff && Amount < 0xffffff && "call frame too large");
      emitFrameOffset(MBB, I, DL, AArch64::SP, AArch64::SP,
                      StackOffset::getFixed(Amount), TII);
    }
  } else if (CalleePopAmount != 0) {
    // If the calling convention demands that the callee pops arguments from
    // the stack, we want to add it back if we have a reserved call frame.
    assert(CalleePopAmount < 0xffffff && "call frame too large");
    emitFrameOffset(MBB, I, DL, AArch64::SP, AArch64::SP,
                    StackOffset::getFixed(-(int64_t)CalleePopAmount), TII);
  }
  return MBB.erase(I);
}

// Convenience function to create a DWARF expression for
//   Expr + NumBytes + NumVGScaledBytes * AArch64::VG
static void appendVGScaledOffsetExpr(SmallVectorImpl<char> &Expr, int NumBytes,
                                     int NumVGScaledBytes, unsigned VG,
                                     llvm::raw_string_ostream &Comment) {
  uint8_t buffer[16];

  if (NumBytes) {
    Expr.push_back(dwarf::DW_OP_consts);
    Expr.append(buffer, buffer + encodeSLEB128(NumBytes, buffer));
    Expr.push_back((uint8_t)dwarf::DW_OP_plus);
    Comment << (NumBytes < 0 ? " - " : " + ") << std::abs(NumBytes);
  }

  if (NumVGScaledBytes) {
    Expr.push_back((uint8_t)dwarf::DW_OP_consts);
    Expr.append(buffer, buffer + encodeSLEB128(NumVGScaledBytes, buffer));

    Expr.push_back((uint8_t)dwarf::DW_OP_bregx);
    Expr.append(buffer, buffer + encodeULEB128(VG, buffer));
    Expr.push_back(0);

    Expr.push_back((uint8_t)dwarf::DW_OP_mul);
    Expr.push_back((uint8_t)dwarf::DW_OP_plus);

    Comment << (NumVGScaledBytes < 0 ? " - " : " + ")
            << std::abs(NumVGScaledBytes) << " * VG";
  }
}
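
// For a sense of the output, assuming hypothetical inputs NumBytes = -16 and
// NumVGScaledBytes = -8, the bytes appended above spell out the DWARF
// expression:
//   DW_OP_consts -16, DW_OP_plus,
//   DW_OP_consts -8, DW_OP_bregx VG+0, DW_OP_mul, DW_OP_plus
// i.e. "Expr - 16 - 8 * VG", and the Comment stream reads " - 16 - 8 * VG".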

// Creates an MCCFIInstruction:
//   { DW_CFA_def_cfa_expression, ULEB128 (sizeof expr), expr }
MCCFIInstruction AArch64FrameLowering::createDefCFAExpressionFromSP(
    const TargetRegisterInfo &TRI, const StackOffset &OffsetFromSP) const {
  int64_t NumBytes, NumVGScaledBytes;
  AArch64InstrInfo::decomposeStackOffsetForDwarfOffsets(OffsetFromSP, NumBytes,
                                                        NumVGScaledBytes);

  std::string CommentBuffer = "sp";
  llvm::raw_string_ostream Comment(CommentBuffer);

  // Build up the expression (SP + NumBytes + NumVGScaledBytes * AArch64::VG)
  SmallString<64> Expr;
  Expr.push_back((uint8_t)(dwarf::DW_OP_breg0 + /*SP*/ 31));
  Expr.push_back(0);
  appendVGScaledOffsetExpr(Expr, NumBytes, NumVGScaledBytes,
                           TRI.getDwarfRegNum(AArch64::VG, true), Comment);

  // Wrap this into DW_CFA_def_cfa.
  SmallString<64> DefCfaExpr;
  DefCfaExpr.push_back(dwarf::DW_CFA_def_cfa_expression);
  uint8_t buffer[16];
  DefCfaExpr.append(buffer, buffer + encodeULEB128(Expr.size(), buffer));
  DefCfaExpr.append(Expr.str());
  return MCCFIInstruction::createEscape(nullptr, DefCfaExpr.str(),
                                        Comment.str());
}

MCCFIInstruction AArch64FrameLowering::createCfaOffset(
    const TargetRegisterInfo &TRI, unsigned Reg,
    const StackOffset &OffsetFromDefCFA) const {
  int64_t NumBytes, NumVGScaledBytes;
  AArch64InstrInfo::decomposeStackOffsetForDwarfOffsets(
      OffsetFromDefCFA, NumBytes, NumVGScaledBytes);

  unsigned DwarfReg = TRI.getDwarfRegNum(Reg, true);

  // Non-scalable offsets can use DW_CFA_offset directly.
  if (!NumVGScaledBytes)
    return MCCFIInstruction::createOffset(nullptr, DwarfReg, NumBytes);

  std::string CommentBuffer;
  llvm::raw_string_ostream Comment(CommentBuffer);
  Comment << printReg(Reg, &TRI) << " @ cfa";

  // Build up expression (NumBytes + NumVGScaledBytes * AArch64::VG)
  SmallString<64> OffsetExpr;
  appendVGScaledOffsetExpr(OffsetExpr, NumBytes, NumVGScaledBytes,
                           TRI.getDwarfRegNum(AArch64::VG, true), Comment);

  // Wrap this into DW_CFA_expression
  SmallString<64> CfaExpr;
  CfaExpr.push_back(dwarf::DW_CFA_expression);
  uint8_t buffer[16];
  CfaExpr.append(buffer, buffer + encodeULEB128(DwarfReg, buffer));
  CfaExpr.append(buffer, buffer + encodeULEB128(OffsetExpr.size(), buffer));
  CfaExpr.append(OffsetExpr.str());

  return MCCFIInstruction::createEscape(nullptr, CfaExpr.str(), Comment.str());
}

void AArch64FrameLowering::emitCalleeSavedFrameMoves(
    MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) const {
  MachineFunction &MF = *MBB.getParent();
  MachineFrameInfo &MFI = MF.getFrameInfo();
  const TargetSubtargetInfo &STI = MF.getSubtarget();
  const TargetRegisterInfo *TRI = STI.getRegisterInfo();
  const TargetInstrInfo *TII = STI.getInstrInfo();
  DebugLoc DL = MBB.findDebugLoc(MBBI);

  // Add callee saved registers to move list.
  const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo();
  if (CSI.empty())
    return;

  for (const auto &Info : CSI) {
    unsigned Reg = Info.getReg();

    // Not all unwinders may know about SVE registers, so assume the lowest
    // common denominator.
    unsigned NewReg;
    if (static_cast<const AArch64RegisterInfo *>(TRI)->regNeedsCFI(Reg, NewReg))
      Reg = NewReg;
    else
      continue;

    StackOffset Offset;
    if (MFI.getStackID(Info.getFrameIdx()) == TargetStackID::ScalableVector) {
      AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
      Offset =
          StackOffset::getScalable(MFI.getObjectOffset(Info.getFrameIdx())) -
          StackOffset::getFixed(AFI->getCalleeSavedStackSize(MFI));
    } else {
      Offset = StackOffset::getFixed(MFI.getObjectOffset(Info.getFrameIdx()) -
                                     getOffsetOfLocalArea());
    }
    unsigned CFIIndex = MF.addFrameInst(createCfaOffset(*TRI, Reg, Offset));
    BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
        .addCFIIndex(CFIIndex)
        .setMIFlags(MachineInstr::FrameSetup);
  }
}

// Find a scratch register that we can use at the start of the prologue to
// re-align the stack pointer. We avoid using callee-save registers since they
// may appear to be free when this is called from canUseAsPrologue (during
// shrink wrapping), but then no longer be free when this is called from
// emitPrologue.
//
// FIXME: This is a bit conservative, since in the above case we could use one
// of the callee-save registers as a scratch temp to re-align the stack
// pointer, but we would then have to make sure that we were in fact saving at
// least one callee-save register in the prologue, which is additional
// complexity that doesn't seem worth the benefit.
static unsigned findScratchNonCalleeSaveRegister(MachineBasicBlock *MBB) {
  MachineFunction *MF = MBB->getParent();

  // If MBB is an entry block, use X9 as the scratch register
  if (&MF->front() == MBB)
    return AArch64::X9;

  const AArch64Subtarget &Subtarget = MF->getSubtarget<AArch64Subtarget>();
  const AArch64RegisterInfo &TRI = *Subtarget.getRegisterInfo();
  LivePhysRegs LiveRegs(TRI);
  LiveRegs.addLiveIns(*MBB);

  // Mark callee saved registers as used so we will not choose them.
  const MCPhysReg *CSRegs = MF->getRegInfo().getCalleeSavedRegs();
  for (unsigned i = 0; CSRegs[i]; ++i)
    LiveRegs.addReg(CSRegs[i]);

  // Prefer X9 since it was historically used for the prologue scratch reg.
  const MachineRegisterInfo &MRI = MF->getRegInfo();
  if (LiveRegs.available(MRI, AArch64::X9))
    return AArch64::X9;

  for (unsigned Reg : AArch64::GPR64RegClass) {
    if (LiveRegs.available(MRI, Reg))
      return Reg;
  }
  return AArch64::NoRegister;
}

bool AArch64FrameLowering::canUseAsPrologue(
    const MachineBasicBlock &MBB) const {
  const MachineFunction *MF = MBB.getParent();
  MachineBasicBlock *TmpMBB = const_cast<MachineBasicBlock *>(&MBB);
  const AArch64Subtarget &Subtarget = MF->getSubtarget<AArch64Subtarget>();
  const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo();

  // Don't need a scratch register if we're not going to re-align the stack.
  if (!RegInfo->needsStackRealignment(*MF))
    return true;
  // Otherwise, we can use any block as long as it has a scratch register
  // available.
  return findScratchNonCalleeSaveRegister(TmpMBB) != AArch64::NoRegister;
}

static bool windowsRequiresStackProbe(MachineFunction &MF,
                                      uint64_t StackSizeInBytes) {
  const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
  if (!Subtarget.isTargetWindows())
    return false;
  const Function &F = MF.getFunction();
  // TODO: When implementing stack protectors, take that into account
  // for the probe threshold.
  unsigned StackProbeSize = 4096;
  if (F.hasFnAttribute("stack-probe-size"))
    F.getFnAttribute("stack-probe-size")
        .getValueAsString()
        .getAsInteger(0, StackProbeSize);
  return (StackSizeInBytes >= StackProbeSize) &&
         !F.hasFnAttribute("no-stack-arg-probe");
}
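
// The threshold override arrives as a plain string function attribute in the
// IR. As a hypothetical example, a function carrying
//   attributes #0 = { "stack-probe-size"="8192" }
// would only get stack probes once its frame reaches 8 KiB, while
// "no-stack-arg-probe" suppresses the probe entirely.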

static bool needsWinCFI(const MachineFunction &MF) {
  const Function &F = MF.getFunction();
  return MF.getTarget().getMCAsmInfo()->usesWindowsCFI() &&
         F.needsUnwindTableEntry();
}

bool AArch64FrameLowering::shouldCombineCSRLocalStackBump(
    MachineFunction &MF, uint64_t StackBumpBytes) const {
  AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
  const MachineFrameInfo &MFI = MF.getFrameInfo();
  const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
  const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo();

  if (AFI->getLocalStackSize() == 0)
    return false;

  // For WinCFI, if optimizing for size, prefer to not combine the stack bump
  // (to force a stp with predecrement) to match the packed unwind format,
  // provided that there actually are any callee saved registers to merge the
  // decrement with.
  // This is potentially marginally slower, but allows using the packed
  // unwind format for functions that both have a local area and callee saved
  // registers. Using the packed unwind format notably reduces the size of
  // the unwind info.
  if (needsWinCFI(MF) && AFI->getCalleeSavedStackSize() > 0 &&
      MF.getFunction().hasOptSize())
    return false;

  // 512 is the maximum immediate for stp/ldp that will be used for
  // callee-save save/restores
  if (StackBumpBytes >= 512 || windowsRequiresStackProbe(MF, StackBumpBytes))
    return false;

  if (MFI.hasVarSizedObjects())
    return false;

  if (RegInfo->needsStackRealignment(MF))
    return false;

  // This isn't strictly necessary, but it simplifies things a bit since the
  // current RedZone handling code assumes the SP is adjusted by the
  // callee-save save/restore code.
  if (canUseRedZone(MF))
    return false;

  // When there is an SVE area on the stack, always allocate the
  // callee-saves and spills/locals separately.
  if (getSVEStackSize(MF))
    return false;

  return true;
}

bool AArch64FrameLowering::shouldCombineCSRLocalStackBumpInEpilogue(
    MachineBasicBlock &MBB, unsigned StackBumpBytes) const {
  if (!shouldCombineCSRLocalStackBump(*MBB.getParent(), StackBumpBytes))
    return false;

  if (MBB.empty())
    return true;

  // Disable combined SP bump if the last instruction is an MTE tag store. It
  // is almost always better to merge SP adjustment into those instructions.
  MachineBasicBlock::iterator LastI = MBB.getFirstTerminator();
  MachineBasicBlock::iterator Begin = MBB.begin();
  while (LastI != Begin) {
    --LastI;
    if (LastI->isTransient())
      continue;
    if (!LastI->getFlag(MachineInstr::FrameDestroy))
      break;
  }
  switch (LastI->getOpcode()) {
  case AArch64::STGloop:
  case AArch64::STZGloop:
  case AArch64::STGOffset:
  case AArch64::STZGOffset:
  case AArch64::ST2GOffset:
  case AArch64::STZ2GOffset:
    return false;
  default:
    return true;
  }
  llvm_unreachable("unreachable");
}

// Given a load or a store instruction, generate an appropriate unwinding SEH
// code on Windows.
static MachineBasicBlock::iterator InsertSEH(MachineBasicBlock::iterator MBBI,
                                             const TargetInstrInfo &TII,
                                             MachineInstr::MIFlag Flag) {
  unsigned Opc = MBBI->getOpcode();
  MachineBasicBlock *MBB = MBBI->getParent();
  MachineFunction &MF = *MBB->getParent();
  DebugLoc DL = MBBI->getDebugLoc();
  unsigned ImmIdx = MBBI->getNumOperands() - 1;
  int Imm = MBBI->getOperand(ImmIdx).getImm();
  MachineInstrBuilder MIB;
  const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
  const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo();

  switch (Opc) {
  default:
    llvm_unreachable("No SEH Opcode for this instruction");
  case AArch64::LDPDpost:
    Imm = -Imm;
    LLVM_FALLTHROUGH;
  case AArch64::STPDpre: {
    unsigned Reg0 = RegInfo->getSEHRegNum(MBBI->getOperand(1).getReg());
    unsigned Reg1 = RegInfo->getSEHRegNum(MBBI->getOperand(2).getReg());
    MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveFRegP_X))
              .addImm(Reg0)
              .addImm(Reg1)
              .addImm(Imm * 8)
              .setMIFlag(Flag);
    break;
  }
  case AArch64::LDPXpost:
    Imm = -Imm;
    LLVM_FALLTHROUGH;
  case AArch64::STPXpre: {
    Register Reg0 = MBBI->getOperand(1).getReg();
    Register Reg1 = MBBI->getOperand(2).getReg();
    if (Reg0 == AArch64::FP && Reg1 == AArch64::LR)
      MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveFPLR_X))
                .addImm(Imm * 8)
                .setMIFlag(Flag);
    else
      MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveRegP_X))
                .addImm(RegInfo->getSEHRegNum(Reg0))
                .addImm(RegInfo->getSEHRegNum(Reg1))
                .addImm(Imm * 8)
                .setMIFlag(Flag);
    break;
  }
  case AArch64::LDRDpost:
    Imm = -Imm;
    LLVM_FALLTHROUGH;
  case AArch64::STRDpre: {
    unsigned Reg = RegInfo->getSEHRegNum(MBBI->getOperand(1).getReg());
    MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveFReg_X))
              .addImm(Reg)
              .addImm(Imm)
              .setMIFlag(Flag);
    break;
  }
  case AArch64::LDRXpost:
    Imm = -Imm;
    LLVM_FALLTHROUGH;
  case AArch64::STRXpre: {
    unsigned Reg = RegInfo->getSEHRegNum(MBBI->getOperand(1).getReg());
    MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveReg_X))
              .addImm(Reg)
              .addImm(Imm)
              .setMIFlag(Flag);
    break;
  }
  case AArch64::STPDi:
  case AArch64::LDPDi: {
    unsigned Reg0 = RegInfo->getSEHRegNum(MBBI->getOperand(0).getReg());
    unsigned Reg1 = RegInfo->getSEHRegNum(MBBI->getOperand(1).getReg());
    MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveFRegP))
              .addImm(Reg0)
              .addImm(Reg1)
              .addImm(Imm * 8)
              .setMIFlag(Flag);
    break;
  }
  case AArch64::STPXi:
  case AArch64::LDPXi: {
    Register Reg0 = MBBI->getOperand(0).getReg();
    Register Reg1 = MBBI->getOperand(1).getReg();
    if (Reg0 == AArch64::FP && Reg1 == AArch64::LR)
      MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveFPLR))
                .addImm(Imm * 8)
                .setMIFlag(Flag);
    else
      MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveRegP))
                .addImm(RegInfo->getSEHRegNum(Reg0))
                .addImm(RegInfo->getSEHRegNum(Reg1))
                .addImm(Imm * 8)
                .setMIFlag(Flag);
    break;
  }
  case AArch64::STRXui:
  case AArch64::LDRXui: {
    int Reg = RegInfo->getSEHRegNum(MBBI->getOperand(0).getReg());
    MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveReg))
              .addImm(Reg)
              .addImm(Imm * 8)
              .setMIFlag(Flag);
    break;
  }
  case AArch64::STRDui:
  case AArch64::LDRDui: {
    unsigned Reg = RegInfo->getSEHRegNum(MBBI->getOperand(0).getReg());
    MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveFReg))
              .addImm(Reg)
              .addImm(Imm * 8)
              .setMIFlag(Flag);
    break;
  }
  }
  auto I = MBB->insertAfter(MBBI, MIB);
  return I;
}

// Fix up the SEH opcode associated with the save/restore instruction.
static void fixupSEHOpcode(MachineBasicBlock::iterator MBBI,
                           unsigned LocalStackSize) {
  MachineOperand *ImmOpnd = nullptr;
  unsigned ImmIdx = MBBI->getNumOperands() - 1;
  switch (MBBI->getOpcode()) {
  default:
    llvm_unreachable("Fix the offset in the SEH instruction");
  case AArch64::SEH_SaveFPLR:
  case AArch64::SEH_SaveRegP:
  case AArch64::SEH_SaveReg:
  case AArch64::SEH_SaveFRegP:
  case AArch64::SEH_SaveFReg:
    ImmOpnd = &MBBI->getOperand(ImmIdx);
    break;
  }
  if (ImmOpnd)
    ImmOpnd->setImm(ImmOpnd->getImm() + LocalStackSize);
}

// Convert callee-save register save/restore instruction to do stack pointer
// decrement/increment to allocate/deallocate the callee-save stack area by
// converting store/load to use pre/post increment version.
static MachineBasicBlock::iterator convertCalleeSaveRestoreToSPPrePostIncDec(
    MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
    const DebugLoc &DL, const TargetInstrInfo *TII, int CSStackSizeInc,
    bool NeedsWinCFI, bool *HasWinCFI, bool InProlog = true) {
  // Ignore instructions that do not operate on SP, i.e. shadow call stack
  // instructions and associated CFI instruction.
  while (MBBI->getOpcode() == AArch64::STRXpost ||
         MBBI->getOpcode() == AArch64::LDRXpre ||
         MBBI->getOpcode() == AArch64::CFI_INSTRUCTION) {
    if (MBBI->getOpcode() != AArch64::CFI_INSTRUCTION)
      assert(MBBI->getOperand(0).getReg() != AArch64::SP);
    ++MBBI;
  }
  unsigned NewOpc;
  int Scale = 1;
  switch (MBBI->getOpcode()) {
  default:
    llvm_unreachable("Unexpected callee-save save/restore opcode!");
  case AArch64::STPXi:
    NewOpc = AArch64::STPXpre;
    Scale = 8;
    break;
  case AArch64::STPDi:
    NewOpc = AArch64::STPDpre;
    Scale = 8;
    break;
  case AArch64::STPQi:
    NewOpc = AArch64::STPQpre;
    Scale = 16;
    break;
  case AArch64::STRXui:
    NewOpc = AArch64::STRXpre;
    break;
  case AArch64::STRDui:
    NewOpc = AArch64::STRDpre;
    break;
  case AArch64::STRQui:
    NewOpc = AArch64::STRQpre;
    break;
  case AArch64::LDPXi:
    NewOpc = AArch64::LDPXpost;
    Scale = 8;
    break;
  case AArch64::LDPDi:
    NewOpc = AArch64::LDPDpost;
    Scale = 8;
    break;
  case AArch64::LDPQi:
    NewOpc = AArch64::LDPQpost;
    Scale = 16;
    break;
  case AArch64::LDRXui:
    NewOpc = AArch64::LDRXpost;
    break;
  case AArch64::LDRDui:
    NewOpc = AArch64::LDRDpost;
    break;
  case AArch64::LDRQui:
    NewOpc = AArch64::LDRQpost;
    break;
  }
  // Get rid of the SEH code associated with the old instruction.
  if (NeedsWinCFI) {
    auto SEH = std::next(MBBI);
    if (AArch64InstrInfo::isSEHInstruction(*SEH))
      SEH->eraseFromParent();
  }

  MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII->get(NewOpc));
  MIB.addReg(AArch64::SP, RegState::Define);

  // Copy all operands other than the immediate offset.
  unsigned OpndIdx = 0;
  for (unsigned OpndEnd = MBBI->getNumOperands() - 1; OpndIdx < OpndEnd;
       ++OpndIdx)
    MIB.add(MBBI->getOperand(OpndIdx));

  assert(MBBI->getOperand(OpndIdx).getImm() == 0 &&
         "Unexpected immediate offset in first/last callee-save save/restore "
         "instruction!");
  assert(MBBI->getOperand(OpndIdx - 1).getReg() == AArch64::SP &&
         "Unexpected base register in callee-save save/restore instruction!");
  assert(CSStackSizeInc % Scale == 0);
  MIB.addImm(CSStackSizeInc / Scale);

  MIB.setMIFlags(MBBI->getFlags());
  MIB.setMemRefs(MBBI->memoperands());

  // Generate a new SEH code that corresponds to the new instruction.
  if (NeedsWinCFI) {
    *HasWinCFI = true;
    InsertSEH(*MIB, *TII,
              InProlog ? MachineInstr::FrameSetup : MachineInstr::FrameDestroy);
  }

  return std::prev(MBB.erase(MBBI));
}
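
// As a rough sketch of the transformation above: with a hypothetical 16-byte
// callee-save area (CSStackSizeInc = -16 in the prologue), the first
// callee-save store
//   stp x29, x30, [sp]
// is rewritten into the SP-allocating pre-increment form
//   stp x29, x30, [sp, #-16]!
// so no separate "sub sp, sp, #16" is needed for the callee-save area.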

// Fixup callee-save register save/restore instructions to take into account
// combined SP bump by adding the local stack size to the stack offsets.
static void fixupCalleeSaveRestoreStackOffset(MachineInstr &MI,
                                              uint64_t LocalStackSize,
                                              bool NeedsWinCFI,
                                              bool *HasWinCFI) {
  if (AArch64InstrInfo::isSEHInstruction(MI))
    return;

  unsigned Opc = MI.getOpcode();

  // Ignore instructions that do not operate on SP, i.e. shadow call stack
  // instructions and associated CFI instruction.
  if (Opc == AArch64::STRXpost || Opc == AArch64::LDRXpre ||
      Opc == AArch64::CFI_INSTRUCTION) {
    if (Opc != AArch64::CFI_INSTRUCTION)
      assert(MI.getOperand(0).getReg() != AArch64::SP);
    return;
  }

  unsigned Scale;
  switch (Opc) {
  case AArch64::STPXi:
  case AArch64::STRXui:
  case AArch64::STPDi:
  case AArch64::STRDui:
  case AArch64::LDPXi:
  case AArch64::LDRXui:
  case AArch64::LDPDi:
  case AArch64::LDRDui:
    Scale = 8;
    break;
  case AArch64::STPQi:
  case AArch64::STRQui:
  case AArch64::LDPQi:
  case AArch64::LDRQui:
    Scale = 16;
    break;
  default:
    llvm_unreachable("Unexpected callee-save save/restore opcode!");
  }

  unsigned OffsetIdx = MI.getNumExplicitOperands() - 1;
  assert(MI.getOperand(OffsetIdx - 1).getReg() == AArch64::SP &&
         "Unexpected base register in callee-save save/restore instruction!");
  // Last operand is immediate offset that needs fixing.
  MachineOperand &OffsetOpnd = MI.getOperand(OffsetIdx);
  // All generated opcodes have scaled offsets.
  assert(LocalStackSize % Scale == 0);
  OffsetOpnd.setImm(OffsetOpnd.getImm() + LocalStackSize / Scale);

  if (NeedsWinCFI) {
    *HasWinCFI = true;
    auto MBBI = std::next(MachineBasicBlock::iterator(MI));
    assert(MBBI != MI.getParent()->end() && "Expecting a valid instruction");
    assert(AArch64InstrInfo::isSEHInstruction(*MBBI) &&
           "Expecting a SEH instruction");
    fixupSEHOpcode(MBBI, LocalStackSize);
  }
}
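
// Continuing the sketch above: if a hypothetical 32-byte local area is folded
// into the callee-save SP bump, a later callee-save store such as
//   stp x20, x19, [sp, #16]
// must be rewritten as
//   stp x20, x19, [sp, #48]
// because the combined decrement placed the callee-save area 32 bytes higher
// relative to the new SP.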

static void adaptForLdStOpt(MachineBasicBlock &MBB,
                            MachineBasicBlock::iterator FirstSPPopI,
                            MachineBasicBlock::iterator LastPopI) {
  // Sometimes (when we restore in the same order as we save), we can end up
  // with code like this:
  //
  // ldp x26, x25, [sp]
  // ldp x24, x23, [sp, #16]
  // ldp x22, x21, [sp, #32]
  // ldp x20, x19, [sp, #48]
  // add sp, sp, #64
  //
  // In this case, it is always better to put the first ldp at the end, so
  // that the load-store optimizer can run and merge the ldp and the add into
  // a post-index ldp.
  // If we managed to grab the first pop instruction, move it to the end.
  if (ReverseCSRRestoreSeq)
    MBB.splice(FirstSPPopI, &MBB, LastPopI);
  // We should end up with something like this now:
  //
  // ldp x24, x23, [sp, #16]
  // ldp x22, x21, [sp, #32]
  // ldp x20, x19, [sp, #48]
  // ldp x26, x25, [sp]
  // add sp, sp, #64
  //
  // and the load-store optimizer can merge the last two instructions into:
  //
  // ldp x26, x25, [sp], #64
  //
}

static bool isTargetWindows(const MachineFunction &MF) {
  return MF.getSubtarget<AArch64Subtarget>().isTargetWindows();
}

// Convenience function to determine whether I is an SVE callee save.
static bool IsSVECalleeSave(MachineBasicBlock::iterator I) {
  switch (I->getOpcode()) {
  default:
    return false;
  case AArch64::STR_ZXI:
  case AArch64::STR_PXI:
  case AArch64::LDR_ZXI:
  case AArch64::LDR_PXI:
    return I->getFlag(MachineInstr::FrameSetup) ||
           I->getFlag(MachineInstr::FrameDestroy);
  }
}

void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
                                        MachineBasicBlock &MBB) const {
  MachineBasicBlock::iterator MBBI = MBB.begin();
  const MachineFrameInfo &MFI = MF.getFrameInfo();
  const Function &F = MF.getFunction();
  const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
  const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
  MachineModuleInfo &MMI = MF.getMMI();
  AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
  bool needsFrameMoves =
      MF.needsFrameMoves() && !MF.getTarget().getMCAsmInfo()->usesWindowsCFI();
  bool HasFP = hasFP(MF);
  bool NeedsWinCFI = needsWinCFI(MF);
  bool HasWinCFI = false;
  auto Cleanup = make_scope_exit([&]() { MF.setHasWinCFI(HasWinCFI); });

  bool IsFunclet = MBB.isEHFuncletEntry();

  // At this point, we're going to decide whether or not the function uses a
  // redzone. In most cases, the function doesn't have a redzone so let's
  // assume that's false and set it to true in the case that there's a redzone.
  AFI->setHasRedZone(false);

  // Debug location must be unknown since the first debug location is used
  // to determine the end of the prologue.
  DebugLoc DL;

  const auto &MFnI = *MF.getInfo<AArch64FunctionInfo>();
  if (MFnI.shouldSignReturnAddress()) {
    if (MFnI.shouldSignWithBKey()) {
      BuildMI(MBB, MBBI, DL, TII->get(AArch64::EMITBKEY))
          .setMIFlag(MachineInstr::FrameSetup);
      BuildMI(MBB, MBBI, DL, TII->get(AArch64::PACIBSP))
          .setMIFlag(MachineInstr::FrameSetup);
    } else {
      BuildMI(MBB, MBBI, DL, TII->get(AArch64::PACIASP))
          .setMIFlag(MachineInstr::FrameSetup);
    }

    unsigned CFIIndex =
        MF.addFrameInst(MCCFIInstruction::createNegateRAState(nullptr));
    BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
        .addCFIIndex(CFIIndex)
        .setMIFlags(MachineInstr::FrameSetup);
  }

  // All calls are tail calls in GHC calling conv, and functions have no
  // prologue/epilogue.
  if (MF.getFunction().getCallingConv() == CallingConv::GHC)
    return;

  // Set tagged base pointer to the requested stack slot.
  // Ideally it should match SP value after prologue.
  Optional<int> TBPI = AFI->getTaggedBasePointerIndex();
  if (TBPI)
    AFI->setTaggedBasePointerOffset(-MFI.getObjectOffset(*TBPI));
  else
    AFI->setTaggedBasePointerOffset(MFI.getStackSize());

  const StackOffset &SVEStackSize = getSVEStackSize(MF);

  // getStackSize() includes all the locals in its size calculation. We don't
  // include these locals when computing the stack size of a funclet, as they
  // are allocated in the parent's stack frame and accessed via the frame
  // pointer from the funclet. We only save the callee saved registers in the
  // funclet, which are really the callee saved registers of the parent
  // function, including the funclet.
  int64_t NumBytes = IsFunclet ? getWinEHFuncletFrameSize(MF)
                               : MFI.getStackSize();
  if (!AFI->hasStackFrame() && !windowsRequiresStackProbe(MF, NumBytes)) {
    assert(!HasFP && "unexpected function without stack frame but with FP");
    assert(!SVEStackSize &&
           "unexpected function without stack frame but with SVE objects");
    // All of the stack allocation is for locals.
    AFI->setLocalStackSize(NumBytes);
    if (!NumBytes)
      return;
    // REDZONE: If the stack size is less than 128 bytes, we don't need
    // to actually allocate.
    if (canUseRedZone(MF)) {
      AFI->setHasRedZone(true);
      ++NumRedZoneFunctions;
    } else {
      emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP,
                      StackOffset::getFixed(-NumBytes), TII,
                      MachineInstr::FrameSetup, false, NeedsWinCFI, &HasWinCFI);
      if (!NeedsWinCFI && needsFrameMoves) {
        // Label used to tie together the PROLOG_LABEL and the MachineMoves.
        MCSymbol *FrameLabel = MMI.getContext().createTempSymbol();
        // Encode the stack size of the leaf function.
        unsigned CFIIndex = MF.addFrameInst(
            MCCFIInstruction::cfiDefCfaOffset(FrameLabel, NumBytes));
        BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
            .addCFIIndex(CFIIndex)
            .setMIFlags(MachineInstr::FrameSetup);
      }
    }

    if (NeedsWinCFI) {
      HasWinCFI = true;
      BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_PrologEnd))
          .setMIFlag(MachineInstr::FrameSetup);
    }

    return;
  }

  bool IsWin64 =
      Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv());
  unsigned FixedObject = getFixedObjectSize(MF, AFI, IsWin64, IsFunclet);

  auto PrologueSaveSize = AFI->getCalleeSavedStackSize() + FixedObject;
  // All of the remaining stack allocations are for locals.
  AFI->setLocalStackSize(NumBytes - PrologueSaveSize);
  bool CombineSPBump = shouldCombineCSRLocalStackBump(MF, NumBytes);
  if (CombineSPBump) {
    assert(!SVEStackSize && "Cannot combine SP bump with SVE");
    emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP,
                    StackOffset::getFixed(-NumBytes), TII,
                    MachineInstr::FrameSetup, false, NeedsWinCFI, &HasWinCFI);
    NumBytes = 0;
  } else if (PrologueSaveSize != 0) {
    MBBI = convertCalleeSaveRestoreToSPPrePostIncDec(
        MBB, MBBI, DL, TII, -PrologueSaveSize, NeedsWinCFI, &HasWinCFI);
    NumBytes -= PrologueSaveSize;
  }
  assert(NumBytes >= 0 && "Negative stack allocation size!?");

  // Move past the saves of the callee-saved registers, fixing up the offsets
  // and pre-inc if we decided to combine the callee-save and local stack
  // pointer bump above.
  MachineBasicBlock::iterator End = MBB.end();
  while (MBBI != End && MBBI->getFlag(MachineInstr::FrameSetup) &&
         !IsSVECalleeSave(MBBI)) {
    if (CombineSPBump)
      fixupCalleeSaveRestoreStackOffset(*MBBI, AFI->getLocalStackSize(),
                                        NeedsWinCFI, &HasWinCFI);
    ++MBBI;
  }

  // For funclets the FP belongs to the containing function.
  if (!IsFunclet && HasFP) {
    // Only set up FP if we actually need to.
    int64_t FPOffset = AFI->getCalleeSaveBaseToFrameRecordOffset();

    if (CombineSPBump)
      FPOffset += AFI->getLocalStackSize();

    // Issue    sub fp, sp, FPOffset   or
    //          mov fp, sp             when FPOffset is zero.
    // Note: All stores of callee-saved registers are marked as "FrameSetup".
    // This code marks the instruction(s) that set the FP also.
    emitFrameOffset(MBB, MBBI, DL, AArch64::FP, AArch64::SP,
                    StackOffset::getFixed(FPOffset), TII,
                    MachineInstr::FrameSetup, false, NeedsWinCFI, &HasWinCFI);
  }

  if (windowsRequiresStackProbe(MF, NumBytes)) {
    uint64_t NumWords = NumBytes >> 4;
    if (NeedsWinCFI) {
      HasWinCFI = true;
      // alloc_l can hold at most 256MB, so assume that NumBytes doesn't
      // exceed this amount. We need to move at most 2^24 - 1 into x15.
      // This is at most two instructions, MOVZ followed by MOVK.
      // TODO: Fix to use multiple stack alloc unwind codes for stacks
      // exceeding 256MB in size.
      if (NumBytes >= (1 << 28))
        report_fatal_error("Stack size cannot exceed 256MB for stack "
                           "unwinding purposes");

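      // As a worked example with a hypothetical 0x1234560-byte frame:
      // NumWords = 0x123456, so the sequence below materializes x15 as
      //   movz x15, #0x3456
      //   movk x15, #0x12, lsl #16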
      uint32_t LowNumWords = NumWords & 0xFFFF;
      BuildMI(MBB, MBBI, DL, TII->get(AArch64::MOVZXi), AArch64::X15)
          .addImm(LowNumWords)
          .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0))
          .setMIFlag(MachineInstr::FrameSetup);
      BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
          .setMIFlag(MachineInstr::FrameSetup);
      if ((NumWords & 0xFFFF0000) != 0) {
        BuildMI(MBB, MBBI, DL, TII->get(AArch64::MOVKXi), AArch64::X15)
            .addReg(AArch64::X15)
            .addImm((NumWords & 0xFFFF0000) >> 16) // High half
            .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 16))
            .setMIFlag(MachineInstr::FrameSetup);
        BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
            .setMIFlag(MachineInstr::FrameSetup);
      }
    } else {
      BuildMI(MBB, MBBI, DL, TII->get(AArch64::MOVi64imm), AArch64::X15)
          .addImm(NumWords)
          .setMIFlags(MachineInstr::FrameSetup);
    }

    switch (MF.getTarget().getCodeModel()) {
    case CodeModel::Tiny:
    case CodeModel::Small:
    case CodeModel::Medium:
    case CodeModel::Kernel:
      BuildMI(MBB, MBBI, DL, TII->get(AArch64::BL))
          .addExternalSymbol("__chkstk")
          .addReg(AArch64::X15, RegState::Implicit)
          .addReg(AArch64::X16,
                  RegState::Implicit | RegState::Define | RegState::Dead)
          .addReg(AArch64::X17,
                  RegState::Implicit | RegState::Define | RegState::Dead)
          .addReg(AArch64::NZCV,
                  RegState::Implicit | RegState::Define | RegState::Dead)
          .setMIFlags(MachineInstr::FrameSetup);
      if (NeedsWinCFI) {
        HasWinCFI = true;
        BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
            .setMIFlag(MachineInstr::FrameSetup);
      }
      break;
    case CodeModel::Large:
      BuildMI(MBB, MBBI, DL, TII->get(AArch64::MOVaddrEXT))
          .addReg(AArch64::X16, RegState::Define)
          .addExternalSymbol("__chkstk")
          .addExternalSymbol("__chkstk")
          .setMIFlags(MachineInstr::FrameSetup);
      if (NeedsWinCFI) {
        HasWinCFI = true;
        BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
            .setMIFlag(MachineInstr::FrameSetup);
      }

      BuildMI(MBB, MBBI, DL, TII->get(getBLRCallOpcode(MF)))
          .addReg(AArch64::X16, RegState::Kill)
          .addReg(AArch64::X15, RegState::Implicit | RegState::Define)
          .addReg(AArch64::X16,
                  RegState::Implicit | RegState::Define | RegState::Dead)
          .addReg(AArch64::X17,
                  RegState::Implicit | RegState::Define | RegState::Dead)
          .addReg(AArch64::NZCV,
                  RegState::Implicit | RegState::Define | RegState::Dead)
          .setMIFlags(MachineInstr::FrameSetup);
      if (NeedsWinCFI) {
        HasWinCFI = true;
        BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
            .setMIFlag(MachineInstr::FrameSetup);
      }
      break;
    }

    BuildMI(MBB, MBBI, DL, TII->get(AArch64::SUBXrx64), AArch64::SP)
        .addReg(AArch64::SP, RegState::Kill)
        .addReg(AArch64::X15, RegState::Kill)
        .addImm(AArch64_AM::getArithExtendImm(AArch64_AM::UXTX, 4))
        .setMIFlags(MachineInstr::FrameSetup);
    if (NeedsWinCFI) {
      HasWinCFI = true;
      BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_StackAlloc))
          .addImm(NumBytes)
          .setMIFlag(MachineInstr::FrameSetup);
    }
    NumBytes = 0;
  }

  StackOffset AllocateBefore = SVEStackSize, AllocateAfter = {};
  MachineBasicBlock::iterator CalleeSavesBegin = MBBI, CalleeSavesEnd = MBBI;

  // Process the SVE callee-saves to determine what space needs to be
  // allocated.
  if (int64_t CalleeSavedSize = AFI->getSVECalleeSavedStackSize()) {
    // Find callee save instructions in frame.
    CalleeSavesBegin = MBBI;
    assert(IsSVECalleeSave(CalleeSavesBegin) && "Unexpected instruction");
    while (IsSVECalleeSave(MBBI) && MBBI != MBB.getFirstTerminator())
      ++MBBI;
    CalleeSavesEnd = MBBI;

    AllocateBefore = StackOffset::getScalable(CalleeSavedSize);
    AllocateAfter = SVEStackSize - AllocateBefore;
  }

  // Allocate space for the callee saves (if any).
  emitFrameOffset(MBB, CalleeSavesBegin, DL, AArch64::SP, AArch64::SP,
                  -AllocateBefore, TII,
                  MachineInstr::FrameSetup);

  // Finally allocate remaining SVE stack space.
  emitFrameOffset(MBB, CalleeSavesEnd, DL, AArch64::SP, AArch64::SP,
                  -AllocateAfter, TII,
                  MachineInstr::FrameSetup);

  // Allocate space for the rest of the frame.
  if (NumBytes) {
    // Alignment is required for the parent frame, not the funclet
    const bool NeedsRealignment =
        !IsFunclet && RegInfo->needsStackRealignment(MF);
    unsigned scratchSPReg = AArch64::SP;

    if (NeedsRealignment) {
      scratchSPReg = findScratchNonCalleeSaveRegister(&MBB);
      assert(scratchSPReg != AArch64::NoRegister);
    }

    // If we're a leaf function, try using the red zone.
    if (!canUseRedZone(MF))
      // FIXME: in the case of dynamic re-alignment, NumBytes doesn't have
      // the correct value here, as NumBytes also includes padding bytes,
      // which shouldn't be counted here.
      emitFrameOffset(MBB, MBBI, DL, scratchSPReg, AArch64::SP,
                      StackOffset::getFixed(-NumBytes), TII,
                      MachineInstr::FrameSetup, false, NeedsWinCFI, &HasWinCFI);

    if (NeedsRealignment) {
      const unsigned NrBitsToZero = Log2(MFI.getMaxAlign());
      assert(NrBitsToZero > 1);
      assert(scratchSPReg != AArch64::SP);

      // SUB X9, SP, NumBytes
      //   -- X9 is temporary register, so shouldn't contain any live data
      //   -- here, free to use. This is already produced by emitFrameOffset
      //   -- above.
      // AND SP, X9, 0b11111...0000
      // The logical immediates have a non-trivial encoding. The following
      // formula computes the encoded immediate with all ones but
      // NrBitsToZero zero bits as least significant bits.
      uint32_t andMaskEncoded = (1 << 12)                         // = N
                                | ((64 - NrBitsToZero) << 6)      // immr
                                | ((64 - NrBitsToZero - 1) << 0); // imms
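      // As a worked example, a hypothetical 32-byte realignment
      // (NrBitsToZero == 5) gives (1 << 12) | (59 << 6) | 58 == 0x1efa,
      // which decodes to the 64-bit logical immediate 0xffffffffffffffe0,
      // i.e. all ones with the low five bits cleared.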

      BuildMI(MBB, MBBI, DL, TII->get(AArch64::ANDXri), AArch64::SP)
          .addReg(scratchSPReg, RegState::Kill)
          .addImm(andMaskEncoded);
      AFI->setStackRealigned(true);
      if (NeedsWinCFI) {
        HasWinCFI = true;
        BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_StackAlloc))
            .addImm(NumBytes & andMaskEncoded)
            .setMIFlag(MachineInstr::FrameSetup);
      }
    }
  }

  // If we need a base pointer, set it up here. It's whatever the value of the
  // stack pointer is at this point. Any variable size objects will be
  // allocated after this, so we can still use the base pointer to reference
  // locals.
  //
  // FIXME: Clarify FrameSetup flags here.
  // Note: Use emitFrameOffset() like above for FP if the FrameSetup flag is
  // needed.
  // For funclets the BP belongs to the containing function.
  if (!IsFunclet && RegInfo->hasBasePointer(MF)) {
    TII->copyPhysReg(MBB, MBBI, DL, RegInfo->getBaseRegister(), AArch64::SP,
                     false);
    if (NeedsWinCFI) {
      HasWinCFI = true;
      BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
          .setMIFlag(MachineInstr::FrameSetup);
    }
  }

  // The very last FrameSetup instruction indicates the end of prologue. Emit
  // a SEH opcode indicating the prologue end.
  if (NeedsWinCFI && HasWinCFI) {
    BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_PrologEnd))
        .setMIFlag(MachineInstr::FrameSetup);
  }

  // SEH funclets are passed the frame pointer in X1. If the parent
  // function uses the base register, then the base register is used
  // directly, and is not retrieved from X1.
  if (IsFunclet && F.hasPersonalityFn()) {
    EHPersonality Per = classifyEHPersonality(F.getPersonalityFn());
    if (isAsynchronousEHPersonality(Per)) {
      BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::COPY), AArch64::FP)
          .addReg(AArch64::X1)
          .setMIFlag(MachineInstr::FrameSetup);
      MBB.addLiveIn(AArch64::X1);
    }
  }

  if (needsFrameMoves) {
    // An example of the prologue:
    //
    //     .globl __foo
    //     .align 2
    //  __foo:
    // Ltmp0:
    //     .cfi_startproc
    //     .cfi_personality 155, ___gxx_personality_v0
    // Leh_func_begin:
    //     .cfi_lsda 16, Lexception33
    //
    //     stp  xa,bx, [sp, -#offset]!
    //     ...
    //     stp  x28, x27, [sp, #offset-32]
    //     stp  fp, lr, [sp, #offset-16]
    //     add  fp, sp, #offset - 16
    //     sub  sp, sp, #1360
    //
    // The Stack:
    //       +-------------------------------------------+
    // 10000 | ........ | ........ | ........ | ........ |
    // 10004 | ........ | ........ | ........ | ........ |
    //       +-------------------------------------------+
    // 10008 | ........ | ........ | ........ | ........ |
    // 1000c | ........ | ........ | ........ | ........ |
    //       +===========================================+
    // 10010 |                X28 Register               |
    // 10014 |                X28 Register               |
    //       +-------------------------------------------+
    // 10018 |                X27 Register               |
    // 1001c |                X27 Register               |
    //       +===========================================+
    // 10020 |               Frame Pointer               |
    // 10024 |               Frame Pointer               |
    //       +-------------------------------------------+
    // 10028 |               Link Register               |
    // 1002c |               Link Register               |
    //       +===========================================+
    // 10030 | ........ | ........ | ........ | ........ |
    // 10034 | ........ | ........ | ........ | ........ |
    //       +-------------------------------------------+
    // 10038 | ........ | ........ | ........ | ........ |
    // 1003c | ........ | ........ | ........ | ........ |
    //       +-------------------------------------------+
    //
    // [sp] = 10030        ::    >>initial value<<
    // sp = 10020          ::  stp fp, lr, [sp, #-16]!
    // fp = sp == 10020    ::  mov fp, sp
    // [sp] == 10020       ::  stp x28, x27, [sp, #-16]!
    // sp == 10010         ::    >>final value<<
    //
    // The frame pointer (w29) points to address 10020. If we use an offset of
    // '16' from 'w29', we get the CFI offsets of -8 for w30, -16 for w29, -24
    // for w27, and -32 for w28:
    //
    // Ltmp1:
    //     .cfi_def_cfa w29, 16
    // Ltmp2:
    //     .cfi_offset w30, -8
    // Ltmp3:
    //     .cfi_offset w29, -16
    // Ltmp4:
    //     .cfi_offset w27, -24
    // Ltmp5:
    //     .cfi_offset w28, -32
1459
1460 if (HasFP) {
1461 const int OffsetToFirstCalleeSaveFromFP =
1462 AFI->getCalleeSaveBaseToFrameRecordOffset() -
1463 AFI->getCalleeSavedStackSize();
1464 Register FramePtr = RegInfo->getFrameRegister(MF);
1465
1466 // Define the current CFA rule to use the provided FP.
1467 unsigned Reg = RegInfo->getDwarfRegNum(FramePtr, true);
1468 unsigned CFIIndex = MF.addFrameInst(
1469 MCCFIInstruction::cfiDefCfa(nullptr, Reg, FixedObject - OffsetToFirstCalleeSaveFromFP));
1470 BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
1471 .addCFIIndex(CFIIndex)
1472 .setMIFlags(MachineInstr::FrameSetup);
1473 } else {
1474 unsigned CFIIndex;
1475 if (SVEStackSize) {
1476 const TargetSubtargetInfo &STI = MF.getSubtarget();
1477 const TargetRegisterInfo &TRI = *STI.getRegisterInfo();
1478 StackOffset TotalSize =
1479 SVEStackSize + StackOffset::getFixed((int64_t)MFI.getStackSize());
1480 CFIIndex = MF.addFrameInst(createDefCFAExpressionFromSP(TRI, TotalSize));
1481 } else {
1482 // Encode the stack size of the leaf function.
1483 CFIIndex = MF.addFrameInst(
1484 MCCFIInstruction::cfiDefCfaOffset(nullptr, MFI.getStackSize()));
1485 }
1486 BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
1487 .addCFIIndex(CFIIndex)
1488 .setMIFlags(MachineInstr::FrameSetup);
1489 }
1490
1491 // Now emit the moves for whatever callee saved regs we have (including FP,
1492 // LR if those are saved).
1493 emitCalleeSavedFrameMoves(MBB, MBBI);
1494 }
1495 }
1496
1497 static void InsertReturnAddressAuth(MachineFunction &MF,
1498 MachineBasicBlock &MBB) {
1499 const auto &MFI = *MF.getInfo<AArch64FunctionInfo>();
1500 if (!MFI.shouldSignReturnAddress())
1501 return;
1502 const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
1503 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
1504
1505 MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator();
1506 DebugLoc DL;
1507 if (MBBI != MBB.end())
1508 DL = MBBI->getDebugLoc();
1509
1510 // The AUTIASP instruction assembles to a hint instruction before v8.3a so
1511 // it can safely be used on any v8-A architecture.
1512 // From v8.3a onwards there are optimised authenticate LR and return
1513 // instructions, namely RETA{A,B}, that can be used instead.
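 // For illustration, the two epilogue shapes selected below are roughly
 // (a sketch, assuming the terminator is a plain RET_ReallyLR):
 //   with PAuth (v8.3a+):   retaa           ; authenticate LR and return
 //   baseline v8-A:         autiasp         ; hint-space, a NOP before v8.3a
 //                          ret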
1514 if (Subtarget.hasPAuth() && MBBI != MBB.end() &&
1515 MBBI->getOpcode() == AArch64::RET_ReallyLR) {
1516 BuildMI(MBB, MBBI, DL,
1517 TII->get(MFI.shouldSignWithBKey() ? AArch64::RETAB : AArch64::RETAA))
1518 .copyImplicitOps(*MBBI);
1519 MBB.erase(MBBI);
1520 } else {
1521 BuildMI(
1522 MBB, MBBI, DL,
1523 TII->get(MFI.shouldSignWithBKey() ? AArch64::AUTIBSP : AArch64::AUTIASP))
1524 .setMIFlag(MachineInstr::FrameDestroy);
1525 }
1526 }
1527
1528 static bool isFuncletReturnInstr(const MachineInstr &MI) {
1529 switch (MI.getOpcode()) {
1530 default:
1531 return false;
1532 case AArch64::CATCHRET:
1533 case AArch64::CLEANUPRET:
1534 return true;
1535 }
1536 }
1537
1538 void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
1539 MachineBasicBlock &MBB) const {
1540 MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
1541 MachineFrameInfo &MFI = MF.getFrameInfo();
1542 const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
1543 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
1544 DebugLoc DL;
1545 bool NeedsWinCFI = needsWinCFI(MF);
1546 bool HasWinCFI = false;
1547 bool IsFunclet = false;
1548 auto WinCFI = make_scope_exit([&]() { assert(HasWinCFI == MF.hasWinCFI()); });
1549
1550 if (MBB.end() != MBBI) {
1551 DL = MBBI->getDebugLoc();
1552 IsFunclet = isFuncletReturnInstr(*MBBI);
1553 }
1554
1555 int64_t NumBytes = IsFunclet ? getWinEHFuncletFrameSize(MF)
1556 : MFI.getStackSize();
1557 AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
1558
1559 // All calls are tail calls in GHC calling conv, and functions have no
1560 // prologue/epilogue.
1561 if (MF.getFunction().getCallingConv() == CallingConv::GHC)
1562 return;
1563
1564 // Initial and residual are named for consistency with the prologue. Note that
1565 // in the epilogue, the residual adjustment is executed first.
1566 uint64_t ArgumentPopSize = getArgumentPopSize(MF, MBB);
1567
1568 // The stack frame should be like below,
1569 //
1570 // ---------------------- ---
1571 // | | |
1572 // | BytesInStackArgArea| CalleeArgStackSize
1573 // | (NumReusableBytes) | (of tail call)
1574 // | | ---
1575 // | | |
1576 // ---------------------| --- |
1577 // | | | |
1578 // | CalleeSavedReg | | |
1579 // | (CalleeSavedStackSize)| | |
1580 // | | | |
1581 // ---------------------| | NumBytes
1582 // | | StackSize (StackAdjustUp)
1583 // | LocalStackSize | | |
1584 // | (covering callee | | |
1585 // | args) | | |
1586 // | | | |
1587 // ---------------------- --- ---
1588 //
1589 // So NumBytes = StackSize + BytesInStackArgArea - CalleeArgStackSize
1590 // = StackSize + ArgumentPopSize
1591 //
1592 // AArch64TargetLowering::LowerCall figures out ArgumentPopSize and keeps
1593 // it as the 2nd argument of AArch64ISD::TC_RETURN.
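 // For illustration with hypothetical numbers: if StackSize is 80 bytes,
 // BytesInStackArgArea is 16 and the tail call reuses none of it
 // (CalleeArgStackSize = 0), then ArgumentPopSize = 16 and the epilogue
 // adjusts SP by NumBytes = 80 + 16 = 96 bytes in total.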
1594
1595 auto Cleanup = make_scope_exit([&] { InsertReturnAddressAuth(MF, MBB); });
1596
1597 bool IsWin64 =
1598 Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv());
1599 unsigned FixedObject = getFixedObjectSize(MF, AFI, IsWin64, IsFunclet);
1600
1601 uint64_t AfterCSRPopSize = ArgumentPopSize;
1602 auto PrologueSaveSize = AFI->getCalleeSavedStackSize() + FixedObject;
1603 // We cannot rely on the local stack size set in emitPrologue if the function
1604 // has funclets, as funclets have different local stack size requirements, and
1605 // the current value set in emitPrologue may be that of the containing
1606 // function.
1607 if (MF.hasEHFunclets())
1608 AFI->setLocalStackSize(NumBytes - PrologueSaveSize);
1609 bool CombineSPBump = shouldCombineCSRLocalStackBumpInEpilogue(MBB, NumBytes);
1610 // Assume we can't combine the last pop with the sp restore.
1611
1612 if (!CombineSPBump && PrologueSaveSize != 0) {
1613 MachineBasicBlock::iterator Pop = std::prev(MBB.getFirstTerminator());
1614 while (AArch64InstrInfo::isSEHInstruction(*Pop))
1615 Pop = std::prev(Pop);
1616 // Converting the last ldp to a post-index ldp is valid only if the last
1617 // ldp's offset is 0.
1618 const MachineOperand &OffsetOp = Pop->getOperand(Pop->getNumOperands() - 1);
1619 // If the offset is 0, convert it to a post-index ldp.
1620 if (OffsetOp.getImm() == 0)
1621 convertCalleeSaveRestoreToSPPrePostIncDec(
1622 MBB, Pop, DL, TII, PrologueSaveSize, NeedsWinCFI, &HasWinCFI, false);
1623 else {
1624 // If not, make sure to emit an add after the last ldp.
1625 // We're doing this by transferring the size to be restored from the
1626 // adjustment *before* the CSR pops to the adjustment *after* the CSR
1627 // pops.
1628 AfterCSRPopSize += PrologueSaveSize;
1629 }
1630 }
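 // For illustration (a sketch with a hypothetical PrologueSaveSize of 16):
 // when the last pop sits at offset 0, the conversion above turns
 //   ldp x29, x30, [sp]
 //   add sp, sp, #16
 // into the single post-indexed form
 //   ldp x29, x30, [sp], #16
 // while a non-zero offset instead defers the 16 bytes to AfterCSRPopSize.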
1631
1632 // Move past the restores of the callee-saved registers.
1633 // If we plan on combining the sp bump of the local stack size and the callee
1634 // save stack size, we might need to adjust the CSR save and restore offsets.
1635 MachineBasicBlock::iterator LastPopI = MBB.getFirstTerminator();
1636 MachineBasicBlock::iterator Begin = MBB.begin();
1637 while (LastPopI != Begin) {
1638 --LastPopI;
1639 if (!LastPopI->getFlag(MachineInstr::FrameDestroy) ||
1640 IsSVECalleeSave(LastPopI)) {
1641 ++LastPopI;
1642 break;
1643 } else if (CombineSPBump)
1644 fixupCalleeSaveRestoreStackOffset(*LastPopI, AFI->getLocalStackSize(),
1645 NeedsWinCFI, &HasWinCFI);
1646 }
1647
1648 if (MF.hasWinCFI()) {
1649 // If the prologue didn't contain any SEH opcodes and didn't set the
1650 // MF.hasWinCFI() flag, assume the epilogue won't either, and skip the
1651 // EpilogStart - to avoid generating CFI for functions that don't need it.
1652 // (And as we didn't generate any prologue at all, it would be asymmetrical
1653 // to the epilogue.) By the end of the function, we assert that
1654 // HasWinCFI is equal to MF.hasWinCFI(), to verify this assumption.
1655 HasWinCFI = true;
1656 BuildMI(MBB, LastPopI, DL, TII->get(AArch64::SEH_EpilogStart))
1657 .setMIFlag(MachineInstr::FrameDestroy);
1658 }
1659
1660 const StackOffset &SVEStackSize = getSVEStackSize(MF);
1661
1662 // If there is a single SP update, insert it before the ret and we're done.
1663 if (CombineSPBump) {
1664 assert(!SVEStackSize && "Cannot combine SP bump with SVE");
1665 emitFrameOffset(MBB, MBB.getFirstTerminator(), DL, AArch64::SP, AArch64::SP,
1666 StackOffset::getFixed(NumBytes + (int64_t)AfterCSRPopSize),
1667 TII, MachineInstr::FrameDestroy, false, NeedsWinCFI,
1668 &HasWinCFI);
1669 if (HasWinCFI)
1670 BuildMI(MBB, MBB.getFirstTerminator(), DL,
1671 TII->get(AArch64::SEH_EpilogEnd))
1672 .setMIFlag(MachineInstr::FrameDestroy);
1673 return;
1674 }
1675
1676 NumBytes -= PrologueSaveSize;
1677 assert(NumBytes >= 0 && "Negative stack allocation size!?");
1678
1679 // Process the SVE callee-saves to determine what space needs to be
1680 // deallocated.
1681 StackOffset DeallocateBefore = {}, DeallocateAfter = SVEStackSize;
1682 MachineBasicBlock::iterator RestoreBegin = LastPopI, RestoreEnd = LastPopI;
1683 if (int64_t CalleeSavedSize = AFI->getSVECalleeSavedStackSize()) {
1684 RestoreBegin = std::prev(RestoreEnd);
1685 while (RestoreBegin != MBB.begin() &&
1686 IsSVECalleeSave(std::prev(RestoreBegin)))
1687 --RestoreBegin;
1688
1689 assert(IsSVECalleeSave(RestoreBegin) &&
1690 IsSVECalleeSave(std::prev(RestoreEnd)) && "Unexpected instruction");
1691
1692 StackOffset CalleeSavedSizeAsOffset =
1693 StackOffset::getScalable(CalleeSavedSize);
1694 DeallocateBefore = SVEStackSize - CalleeSavedSizeAsOffset;
1695 DeallocateAfter = CalleeSavedSizeAsOffset;
1696 }
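 // For illustration with hypothetical sizes: an SVE area of 48 scalable
 // bytes containing 16 scalable bytes of callee saves splits into
 // DeallocateBefore = 32 (SVE locals, freed before the reloads) and
 // DeallocateAfter = 16 (callee saves, freed after they are reloaded).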
1697
1698 // Deallocate the SVE area.
1699 if (SVEStackSize) {
1700 if (AFI->isStackRealigned()) {
1701 if (int64_t CalleeSavedSize = AFI->getSVECalleeSavedStackSize())
1702 // Set SP to start of SVE callee-save area from which they can
1703 // be reloaded. The code below will deallocate the stack space
1704 // by moving FP -> SP.
1705 emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, AArch64::FP,
1706 StackOffset::getScalable(-CalleeSavedSize), TII,
1707 MachineInstr::FrameDestroy);
1708 } else {
1709 if (AFI->getSVECalleeSavedStackSize()) {
1710 // Deallocate the non-SVE locals first before we can deallocate (and
1711 // restore callee saves) from the SVE area.
1712 emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, AArch64::SP,
1713 StackOffset::getFixed(NumBytes), TII,
1714 MachineInstr::FrameDestroy);
1715 NumBytes = 0;
1716 }
1717
1718 emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, AArch64::SP,
1719 DeallocateBefore, TII, MachineInstr::FrameDestroy);
1720
1721 emitFrameOffset(MBB, RestoreEnd, DL, AArch64::SP, AArch64::SP,
1722 DeallocateAfter, TII, MachineInstr::FrameDestroy);
1723 }
1724 }
1725
1726 if (!hasFP(MF)) {
1727 bool RedZone = canUseRedZone(MF);
1728 // If this was a redzone leaf function, we don't need to restore the
1729 // stack pointer (but we may need to pop stack args for fastcc).
1730 if (RedZone && AfterCSRPopSize == 0)
1731 return;
1732
1733 bool NoCalleeSaveRestore = PrologueSaveSize == 0;
1734 int64_t StackRestoreBytes = RedZone ? 0 : NumBytes;
1735 if (NoCalleeSaveRestore)
1736 StackRestoreBytes += AfterCSRPopSize;
1737
1738 // If we were able to combine the local stack pop with the argument pop,
1739 // then we're done.
1740 bool Done = NoCalleeSaveRestore || AfterCSRPopSize == 0;
1741
1742 // If we're done after this, make sure to help the load store optimizer.
1743 if (Done)
1744 adaptForLdStOpt(MBB, MBB.getFirstTerminator(), LastPopI);
1745
1746 emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP,
1747 StackOffset::getFixed(StackRestoreBytes), TII,
1748 MachineInstr::FrameDestroy, false, NeedsWinCFI, &HasWinCFI);
1749 if (Done) {
1750 if (HasWinCFI) {
1751 BuildMI(MBB, MBB.getFirstTerminator(), DL,
1752 TII->get(AArch64::SEH_EpilogEnd))
1753 .setMIFlag(MachineInstr::FrameDestroy);
1754 }
1755 return;
1756 }
1757
1758 NumBytes = 0;
1759 }
1760
1761 // Restore the original stack pointer.
1762 // FIXME: Rather than doing the math here, we should instead just use
1763 // non-post-indexed loads for the restores if we aren't actually going to
1764 // be able to save any instructions.
1765 if (!IsFunclet && (MFI.hasVarSizedObjects() || AFI->isStackRealigned())) {
1766 emitFrameOffset(
1767 MBB, LastPopI, DL, AArch64::SP, AArch64::FP,
1768 StackOffset::getFixed(-AFI->getCalleeSaveBaseToFrameRecordOffset()),
1769 TII, MachineInstr::FrameDestroy, false, NeedsWinCFI);
1770 } else if (NumBytes)
1771 emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP,
1772 StackOffset::getFixed(NumBytes), TII,
1773 MachineInstr::FrameDestroy, false, NeedsWinCFI);
1774
1775 // This must be placed after the callee-save restore code because that code
1776 // assumes the SP is at the same location as it was after the callee-save save
1777 // code in the prologue.
1778 if (AfterCSRPopSize) {
1779 // Find an insertion point for the first ldp so that it goes before the
1780 // shadow call stack epilog instruction. This ensures that the restore of
1781 // lr from x18 is placed after the restore from sp.
1782 auto FirstSPPopI = MBB.getFirstTerminator();
1783 while (FirstSPPopI != Begin) {
1784 auto Prev = std::prev(FirstSPPopI);
1785 if (Prev->getOpcode() != AArch64::LDRXpre ||
1786 Prev->getOperand(0).getReg() == AArch64::SP)
1787 break;
1788 FirstSPPopI = Prev;
1789 }
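     // For illustration: a shadow call stack epilogue ends with
     //   ldr x30, [x18, #-8]!
     // and the scan above walks back past it, so the SP pop below lands
     // before that reload and lr is restored from x18 last.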
1790
1791 adaptForLdStOpt(MBB, FirstSPPopI, LastPopI);
1792
1793 emitFrameOffset(MBB, FirstSPPopI, DL, AArch64::SP, AArch64::SP,
1794 StackOffset::getFixed((int64_t)AfterCSRPopSize), TII,
1795 MachineInstr::FrameDestroy, false, NeedsWinCFI, &HasWinCFI);
1796 }
1797 if (HasWinCFI)
1798 BuildMI(MBB, MBB.getFirstTerminator(), DL, TII->get(AArch64::SEH_EpilogEnd))
1799 .setMIFlag(MachineInstr::FrameDestroy);
1800 }
1801
1802 /// getFrameIndexReference - Provide a base+offset reference to an FI slot for
1803 /// debug info. It's the same as what we use for resolving the code-gen
1804 /// references for now. FIXME: This can go wrong when references are
1805 /// SP-relative and simple call frames aren't used.
1806 StackOffset
1807 AArch64FrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI,
1808 Register &FrameReg) const {
1809 return resolveFrameIndexReference(
1810 MF, FI, FrameReg,
1811 /*PreferFP=*/
1812 MF.getFunction().hasFnAttribute(Attribute::SanitizeHWAddress),
1813 /*ForSimm=*/false);
1814 }
1815
1816 StackOffset
1817 AArch64FrameLowering::getNonLocalFrameIndexReference(const MachineFunction &MF,
1818 int FI) const {
1819 return StackOffset::getFixed(getSEHFrameIndexOffset(MF, FI));
1820 }
1821
1822 static StackOffset getFPOffset(const MachineFunction &MF,
1823 int64_t ObjectOffset) {
1824 const auto *AFI = MF.getInfo<AArch64FunctionInfo>();
1825 const auto &Subtarget = MF.getSubtarget<AArch64Subtarget>();
1826 bool IsWin64 =
1827 Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv());
1828 unsigned FixedObject =
1829 getFixedObjectSize(MF, AFI, IsWin64, /*IsFunclet=*/false);
1830 int64_t CalleeSaveSize = AFI->getCalleeSavedStackSize(MF.getFrameInfo());
1831 int64_t FPAdjust =
1832 CalleeSaveSize - AFI->getCalleeSaveBaseToFrameRecordOffset();
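   // For illustration (hypothetical sizes, no fixed object area): a 32-byte
   // callee-save area whose frame record sits 16 bytes above its base gives
   // FPAdjust = 32 - 16 = 16, so a local at ObjectOffset -48 resolves to
   // -48 + 0 + 16 = -32, i.e. [fp, #-32].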
1833 return StackOffset::getFixed(ObjectOffset + FixedObject + FPAdjust);
1834 }
1835
1836 static StackOffset getStackOffset(const MachineFunction &MF,
1837 int64_t ObjectOffset) {
1838 const auto &MFI = MF.getFrameInfo();
1839 return StackOffset::getFixed(ObjectOffset + (int64_t)MFI.getStackSize());
1840 }
1841
1842 // TODO: This function currently does not work for scalable vectors.
1843 int AArch64FrameLowering::getSEHFrameIndexOffset(const MachineFunction &MF,
1844 int FI) const {
1845 const auto *RegInfo = static_cast<const AArch64RegisterInfo *>(
1846 MF.getSubtarget().getRegisterInfo());
1847 int ObjectOffset = MF.getFrameInfo().getObjectOffset(FI);
1848 return RegInfo->getLocalAddressRegister(MF) == AArch64::FP
1849 ? getFPOffset(MF, ObjectOffset).getFixed()
1850 : getStackOffset(MF, ObjectOffset).getFixed();
1851 }
1852
1853 StackOffset AArch64FrameLowering::resolveFrameIndexReference(
1854 const MachineFunction &MF, int FI, Register &FrameReg, bool PreferFP,
1855 bool ForSimm) const {
1856 const auto &MFI = MF.getFrameInfo();
1857 int64_t ObjectOffset = MFI.getObjectOffset(FI);
1858 bool isFixed = MFI.isFixedObjectIndex(FI);
1859 bool isSVE = MFI.getStackID(FI) == TargetStackID::ScalableVector;
1860 return resolveFrameOffsetReference(MF, ObjectOffset, isFixed, isSVE, FrameReg,
1861 PreferFP, ForSimm);
1862 }
1863
1864 StackOffset AArch64FrameLowering::resolveFrameOffsetReference(
1865 const MachineFunction &MF, int64_t ObjectOffset, bool isFixed, bool isSVE,
1866 Register &FrameReg, bool PreferFP, bool ForSimm) const {
1867 const auto &MFI = MF.getFrameInfo();
1868 const auto *RegInfo = static_cast<const AArch64RegisterInfo *>(
1869 MF.getSubtarget().getRegisterInfo());
1870 const auto *AFI = MF.getInfo<AArch64FunctionInfo>();
1871 const auto &Subtarget = MF.getSubtarget<AArch64Subtarget>();
1872
1873 int64_t FPOffset = getFPOffset(MF, ObjectOffset).getFixed();
1874 int64_t Offset = getStackOffset(MF, ObjectOffset).getFixed();
1875 bool isCSR =
1876 !isFixed && ObjectOffset >= -((int)AFI->getCalleeSavedStackSize(MFI));
1877
1878 const StackOffset &SVEStackSize = getSVEStackSize(MF);
1879
1880 // Use frame pointer to reference fixed objects. Use it for locals if
1881 // there are VLAs or a dynamically realigned SP (and thus the SP isn't
1882 // reliable as a base). Make sure useFPForScavengingIndex() does the
1883 // right thing for the emergency spill slot.
1884 bool UseFP = false;
1885 if (AFI->hasStackFrame() && !isSVE) {
1886 // We shouldn't prefer using the FP when there is an SVE area
1887 // in between the FP and the non-SVE locals/spills.
1888 PreferFP &= !SVEStackSize;
1889
1890 // Note: Keeping the following as multiple 'if' statements rather than
1891 // merging to a single expression for readability.
1892 //
1893 // Argument access should always use the FP.
1894 if (isFixed) {
1895 UseFP = hasFP(MF);
1896 } else if (isCSR && RegInfo->needsStackRealignment(MF)) {
1897 // References to the CSR area must use FP if we're re-aligning the stack
1898 // since the dynamically-sized alignment padding is between the SP/BP and
1899 // the CSR area.
1900 assert(hasFP(MF) && "Re-aligned stack must have frame pointer");
1901 UseFP = true;
1902 } else if (hasFP(MF) && !RegInfo->needsStackRealignment(MF)) {
1903 // If the FPOffset is negative and we're producing a signed immediate, we
1904 // have to keep in mind that the available offset range for negative
1905 // offsets is smaller than for positive ones. If an offset is available
1906 // via the FP and the SP, use whichever is closest.
1907 bool FPOffsetFits = !ForSimm || FPOffset >= -256;
1908 PreferFP |= Offset > -FPOffset;
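       // For illustration (hypothetical offsets): if FPOffset is -24 while
       // the same object sits 112 bytes above SP, then 112 > 24 and the FP is
       // the closer base, giving the smaller (more encodable) immediate.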
1909
1910 if (MFI.hasVarSizedObjects()) {
1911 // If we have variable sized objects, we can use either FP or BP, as the
1912 // SP offset is unknown. We can use the base pointer if we have one and
1913 // FP is not preferred. If not, we're stuck with using FP.
1914 bool CanUseBP = RegInfo->hasBasePointer(MF);
1915 if (FPOffsetFits && CanUseBP) // Both are ok. Pick the best.
1916 UseFP = PreferFP;
1917 else if (!CanUseBP) // Can't use BP. Forced to use FP.
1918 UseFP = true;
1919 // else we can use BP and FP, but the offset from FP won't fit.
1920 // That will make us scavenge registers which we can probably avoid by
1921 // using BP. If it won't fit for BP either, we'll scavenge anyway.
1922 } else if (FPOffset >= 0) {
1923 // Use SP or FP, whichever gives us the best chance of the offset
1924 // being in range for direct access. If the FPOffset is positive,
1925 // that'll always be best, as the SP will be even further away.
1926 UseFP = true;
1927 } else if (MF.hasEHFunclets() && !RegInfo->hasBasePointer(MF)) {
1928 // Funclets access the locals contained in the parent's stack frame
1929 // via the frame pointer, so we have to use the FP in the parent
1930 // function.
1931 (void) Subtarget;
1932 assert(
1933 Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv()) &&
1934 "Funclets should only be present on Win64");
1935 UseFP = true;
1936 } else {
1937 // We have the choice between FP and (SP or BP).
1938 if (FPOffsetFits && PreferFP) // If FP is the best fit, use it.
1939 UseFP = true;
1940 }
1941 }
1942 }
1943
1944 assert(((isFixed || isCSR) || !RegInfo->needsStackRealignment(MF) || !UseFP) &&
1945 "In the presence of dynamic stack pointer realignment, "
1946 "non-argument/CSR objects cannot be accessed through the frame pointer");
1947
1948 if (isSVE) {
1949 StackOffset FPOffset =
1950 StackOffset::get(-AFI->getCalleeSaveBaseToFrameRecordOffset(), ObjectOffset);
1951 StackOffset SPOffset =
1952 SVEStackSize +
1953 StackOffset::get(MFI.getStackSize() - AFI->getCalleeSavedStackSize(),
1954 ObjectOffset);
1955 // Always use the FP for SVE spills if available and beneficial.
1956 if (hasFP(MF) &&
1957 (SPOffset.getFixed() ||
1958 FPOffset.getScalable() < SPOffset.getScalable() ||
1959 RegInfo->needsStackRealignment(MF))) {
1960 FrameReg = RegInfo->getFrameRegister(MF);
1961 return FPOffset;
1962 }
1963
1964 FrameReg = RegInfo->hasBasePointer(MF) ? RegInfo->getBaseRegister()
1965 : (unsigned)AArch64::SP;
1966 return SPOffset;
1967 }
1968
1969 StackOffset ScalableOffset = {};
1970 if (UseFP && !(isFixed || isCSR))
1971 ScalableOffset = -SVEStackSize;
1972 if (!UseFP && (isFixed || isCSR))
1973 ScalableOffset = SVEStackSize;
1974
1975 if (UseFP) {
1976 FrameReg = RegInfo->getFrameRegister(MF);
1977 return StackOffset::getFixed(FPOffset) + ScalableOffset;
1978 }
1979
1980 // Use the base pointer if we have one.
1981 if (RegInfo->hasBasePointer(MF))
1982 FrameReg = RegInfo->getBaseRegister();
1983 else {
1984 assert(!MFI.hasVarSizedObjects() &&
1985 "Can't use SP when we have var sized objects.");
1986 FrameReg = AArch64::SP;
1987 // If we're using the red zone for this function, the SP won't actually
1988 // be adjusted, so the offsets will be negative. They're also all
1989 // within range of the signed 9-bit immediate instructions.
1990 if (canUseRedZone(MF))
1991 Offset -= AFI->getLocalStackSize();
1992 }
1993
1994 return StackOffset::getFixed(Offset) + ScalableOffset;
1995 }
1996
1997 static unsigned getPrologueDeath(MachineFunction &MF, unsigned Reg) {
1998 // Do not set a kill flag on values that are also marked as live-in. This
1999 // happens with the @llvm.returnaddress intrinsic and with arguments passed in
2000 // callee saved registers.
2001 // Omitting the kill flags is conservatively correct even if the live-in
2002 // is not used after all.
2003 bool IsLiveIn = MF.getRegInfo().isLiveIn(Reg);
2004 return getKillRegState(!IsLiveIn);
2005 }
2006
2007 static bool produceCompactUnwindFrame(MachineFunction &MF) {
2008 const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
2009 AttributeList Attrs = MF.getFunction().getAttributes();
2010 return Subtarget.isTargetMachO() &&
2011 !(Subtarget.getTargetLowering()->supportSwiftError() &&
2012 Attrs.hasAttrSomewhere(Attribute::SwiftError));
2013 }
2014
2015 static bool invalidateWindowsRegisterPairing(unsigned Reg1, unsigned Reg2,
2016 bool NeedsWinCFI, bool IsFirst) {
2017 // If we are generating register pairs for a Windows function that requires
2018 // EH support, then pair consecutive registers only. There are no unwind
2019 // opcodes for saves/restores of non-consecutive register pairs.
2020 // The unwind opcodes are save_regp, save_regp_x, save_fregp, save_fregp_x,
2021 // save_lrpair.
2022 // https://docs.microsoft.com/en-us/cpp/build/arm64-exception-handling
2023
2024 if (Reg2 == AArch64::FP)
2025 return true;
2026 if (!NeedsWinCFI)
2027 return false;
2028 if (Reg2 == Reg1 + 1)
2029 return false;
2030 // If pairing a GPR with LR, the pair can be described by the save_lrpair
2031 // opcode. If this is the first register pair, it would end up with a
2032 // predecrement, but there's no save_lrpair_x opcode, so we can only do this
2033 // if LR is paired with something other than the first register.
2034 // The save_lrpair opcode requires the first register to be an odd one.
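 // For illustration: "stp x21, lr, [sp, #off]" is describable as save_lrpair
 // (x21 is an odd-numbered GPR and the pair is not the first, predecremented
 // one), whereas "stp x22, lr, ..." or a first pair "stp x19, lr, [sp, #-N]!"
 // is not, so such pairings are rejected below.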
2035 if (Reg1 >= AArch64::X19 && Reg1 <= AArch64::X27 &&
2036 (Reg1 - AArch64::X19) % 2 == 0 && Reg2 == AArch64::LR && !IsFirst)
2037 return false;
2038 return true;
2039 }
2040
2041 /// Returns true if Reg1 and Reg2 cannot be paired using a ldp/stp instruction.
2042 /// WindowsCFI requires that only consecutive registers can be paired.
2043 /// LR and FP need to be allocated together when the frame needs to save
2044 /// the frame-record. This means any other register pairing with LR is invalid.
2045 static bool invalidateRegisterPairing(unsigned Reg1, unsigned Reg2,
2046 bool UsesWinAAPCS, bool NeedsWinCFI,
2047 bool NeedsFrameRecord, bool IsFirst) {
2048 if (UsesWinAAPCS)
2049 return invalidateWindowsRegisterPairing(Reg1, Reg2, NeedsWinCFI, IsFirst);
2050
2051 // If we need to store the frame record, don't pair any register
2052 // with LR other than FP.
2053 if (NeedsFrameRecord)
2054 return Reg2 == AArch64::LR;
2055
2056 return false;
2057 }
2058
2059 namespace {
2060
2061 struct RegPairInfo {
2062 unsigned Reg1 = AArch64::NoRegister;
2063 unsigned Reg2 = AArch64::NoRegister;
2064 int FrameIdx;
2065 int Offset;
2066 enum RegType { GPR, FPR64, FPR128, PPR, ZPR } Type;
2067
2068 RegPairInfo() = default;
2069
2070   bool isPaired() const { return Reg2 != AArch64::NoRegister; }
2071
2072   unsigned getScale() const {
2073 switch (Type) {
2074 case PPR:
2075 return 2;
2076 case GPR:
2077 case FPR64:
2078 return 8;
2079 case ZPR:
2080 case FPR128:
2081 return 16;
2082 }
2083 llvm_unreachable("Unsupported type");
2084 }
2085
2086   bool isScalable() const { return Type == PPR || Type == ZPR; }
2087 };
2088
2089 } // end anonymous namespace
2090
2091 static void computeCalleeSaveRegisterPairs(
2092 MachineFunction &MF, ArrayRef<CalleeSavedInfo> CSI,
2093 const TargetRegisterInfo *TRI, SmallVectorImpl<RegPairInfo> &RegPairs,
2094 bool &NeedShadowCallStackProlog, bool NeedsFrameRecord) {
2095
2096 if (CSI.empty())
2097 return;
2098
2099 bool IsWindows = isTargetWindows(MF);
2100 bool NeedsWinCFI = needsWinCFI(MF);
2101 AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
2102 MachineFrameInfo &MFI = MF.getFrameInfo();
2103 CallingConv::ID CC = MF.getFunction().getCallingConv();
2104 unsigned Count = CSI.size();
2105 (void)CC;
2106 // MachO's compact unwind format relies on all registers being stored in
2107 // pairs.
2108 assert((!produceCompactUnwindFrame(MF) ||
2109 CC == CallingConv::PreserveMost ||
2110 (Count & 1) == 0) &&
2111 "Odd number of callee-saved regs to spill!");
2112 int ByteOffset = AFI->getCalleeSavedStackSize();
2113 int StackFillDir = -1;
2114 int RegInc = 1;
2115 unsigned FirstReg = 0;
2116 if (NeedsWinCFI) {
2117 // For WinCFI, fill the stack from the bottom up.
2118 ByteOffset = 0;
2119 StackFillDir = 1;
2120 // As the CSI array is reversed to match PrologEpilogInserter, iterate
2121 // backwards, to pair up registers starting from lower numbered registers.
2122 RegInc = -1;
2123 FirstReg = Count - 1;
2124 }
2125 int ScalableByteOffset = AFI->getSVECalleeSavedStackSize();
2126
2127 // When iterating backwards, the loop condition relies on unsigned wraparound.
2128 for (unsigned i = FirstReg; i < Count; i += RegInc) {
2129 RegPairInfo RPI;
2130 RPI.Reg1 = CSI[i].getReg();
2131
2132 if (AArch64::GPR64RegClass.contains(RPI.Reg1))
2133 RPI.Type = RegPairInfo::GPR;
2134 else if (AArch64::FPR64RegClass.contains(RPI.Reg1))
2135 RPI.Type = RegPairInfo::FPR64;
2136 else if (AArch64::FPR128RegClass.contains(RPI.Reg1))
2137 RPI.Type = RegPairInfo::FPR128;
2138 else if (AArch64::ZPRRegClass.contains(RPI.Reg1))
2139 RPI.Type = RegPairInfo::ZPR;
2140 else if (AArch64::PPRRegClass.contains(RPI.Reg1))
2141 RPI.Type = RegPairInfo::PPR;
2142 else
2143 llvm_unreachable("Unsupported register class.");
2144
2145 // Add the next reg to the pair if it is in the same register class.
2146 if (unsigned(i + RegInc) < Count) {
2147 unsigned NextReg = CSI[i + RegInc].getReg();
2148 bool IsFirst = i == FirstReg;
2149 switch (RPI.Type) {
2150 case RegPairInfo::GPR:
2151 if (AArch64::GPR64RegClass.contains(NextReg) &&
2152 !invalidateRegisterPairing(RPI.Reg1, NextReg, IsWindows,
2153 NeedsWinCFI, NeedsFrameRecord, IsFirst))
2154 RPI.Reg2 = NextReg;
2155 break;
2156 case RegPairInfo::FPR64:
2157 if (AArch64::FPR64RegClass.contains(NextReg) &&
2158 !invalidateWindowsRegisterPairing(RPI.Reg1, NextReg, NeedsWinCFI,
2159 IsFirst))
2160 RPI.Reg2 = NextReg;
2161 break;
2162 case RegPairInfo::FPR128:
2163 if (AArch64::FPR128RegClass.contains(NextReg))
2164 RPI.Reg2 = NextReg;
2165 break;
2166 case RegPairInfo::PPR:
2167 case RegPairInfo::ZPR:
2168 break;
2169 }
2170 }
2171
2172 // If either of the registers to be saved is the lr register, it means that
2173 // we also need to save lr in the shadow call stack.
2174 if ((RPI.Reg1 == AArch64::LR || RPI.Reg2 == AArch64::LR) &&
2175 MF.getFunction().hasFnAttribute(Attribute::ShadowCallStack)) {
2176 if (!MF.getSubtarget<AArch64Subtarget>().isXRegisterReserved(18))
2177 report_fatal_error("Must reserve x18 to use shadow call stack");
2178 NeedShadowCallStackProlog = true;
2179 }
2180
2181 // GPRs and FPRs are saved in pairs of 64-bit regs. We expect the CSI
2182 // list to come in sorted by frame index so that we can issue the store
2183 // pair instructions directly. Assert if we see anything otherwise.
2184 //
2185 // The order of the registers in the list is controlled by
2186 // getCalleeSavedRegs(), so they will always be in-order, as well.
2187 assert((!RPI.isPaired() ||
2188 (CSI[i].getFrameIdx() + RegInc == CSI[i + RegInc].getFrameIdx())) &&
2189 "Out of order callee saved regs!");
2190
2191 assert((!RPI.isPaired() || !NeedsFrameRecord || RPI.Reg2 != AArch64::FP ||
2192 RPI.Reg1 == AArch64::LR) &&
2193 "FrameRecord must be allocated together with LR");
2194
2195 // Windows AAPCS has FP and LR reversed.
2196 assert((!RPI.isPaired() || !NeedsFrameRecord || RPI.Reg1 != AArch64::FP ||
2197 RPI.Reg2 == AArch64::LR) &&
2198 "FrameRecord must be allocated together with LR");
2199
2200 // MachO's compact unwind format relies on all registers being stored in
2201 // adjacent register pairs.
2202 assert((!produceCompactUnwindFrame(MF) ||
2203 CC == CallingConv::PreserveMost ||
2204 (RPI.isPaired() &&
2205 ((RPI.Reg1 == AArch64::LR && RPI.Reg2 == AArch64::FP) ||
2206 RPI.Reg1 + 1 == RPI.Reg2))) &&
2207 "Callee-save registers not saved as adjacent register pair!");
2208
2209 RPI.FrameIdx = CSI[i].getFrameIdx();
2210 if (NeedsWinCFI &&
2211 RPI.isPaired()) // RPI.FrameIdx must be the lower index of the pair
2212 RPI.FrameIdx = CSI[i + RegInc].getFrameIdx();
2213
2214 int Scale = RPI.getScale();
2215
2216 int OffsetPre = RPI.isScalable() ? ScalableByteOffset : ByteOffset;
2217 assert(OffsetPre % Scale == 0);
2218
2219 if (RPI.isScalable())
2220 ScalableByteOffset += StackFillDir * Scale;
2221 else
2222 ByteOffset += StackFillDir * (RPI.isPaired() ? 2 * Scale : Scale);
2223
2224 assert(!(RPI.isScalable() && RPI.isPaired()) &&
2225 "Paired spill/fill instructions don't exist for SVE vectors");
2226
2227 // Round up size of non-pair to pair size if we need to pad the
2228 // callee-save area to ensure 16-byte alignment.
2229 if (AFI->hasCalleeSaveStackFreeSpace() && !NeedsWinCFI &&
2230 !RPI.isScalable() && RPI.Type != RegPairInfo::FPR128 &&
2231 !RPI.isPaired()) {
2232 ByteOffset += 8 * StackFillDir;
2233 assert(ByteOffset % 16 == 0);
2234 assert(MFI.getObjectAlign(RPI.FrameIdx) <= Align(16));
2235 // A stack frame with a gap looks like this, bottom up:
2236 // d9, d8. x21, gap, x20, x19.
2237 // Set extra alignment on the x21 object (the only unpaired register)
2238 // to create the gap above it.
2239 MFI.setObjectAlignment(RPI.FrameIdx, Align(16));
2240 }
2241
2242 int OffsetPost = RPI.isScalable() ? ScalableByteOffset : ByteOffset;
2243 assert(OffsetPost % Scale == 0);
2244 // If filling top down (default), we want the offset after incrementing it.
2245 // If filling bottom up (WinCFI), we need the original offset.
2246 int Offset = NeedsWinCFI ? OffsetPre : OffsetPost;
2247 RPI.Offset = Offset / Scale;
2248
2249 assert(((!RPI.isScalable() && RPI.Offset >= -64 && RPI.Offset <= 63) ||
2250 (RPI.isScalable() && RPI.Offset >= -256 && RPI.Offset <= 255)) &&
2251 "Offset out of bounds for LDP/STP immediate");
2252
2253 // Save the offset to frame record so that the FP register can point to the
2254 // innermost frame record (spilled FP and LR registers).
2255 if (NeedsFrameRecord && ((!IsWindows && RPI.Reg1 == AArch64::LR &&
2256 RPI.Reg2 == AArch64::FP) ||
2257 (IsWindows && RPI.Reg1 == AArch64::FP &&
2258 RPI.Reg2 == AArch64::LR)))
2259 AFI->setCalleeSaveBaseToFrameRecordOffset(Offset);
2260
2261 RegPairs.push_back(RPI);
2262 if (RPI.isPaired())
2263 i += RegInc;
2264 }
2265 if (NeedsWinCFI) {
2266 // If we need an alignment gap in the stack, align the topmost stack
2267 // object. A stack frame with a gap looks like this, bottom up:
2268 // x19, d8. d9, gap.
2269 // Set extra alignment on the topmost stack object (the first element in
2270 // CSI, which goes top down), to create the gap above it.
2271 if (AFI->hasCalleeSaveStackFreeSpace())
2272 MFI.setObjectAlignment(CSI[0].getFrameIdx(), Align(16));
2273 // We iterated bottom up over the registers; flip RegPairs back to top
2274 // down order.
2275 std::reverse(RegPairs.begin(), RegPairs.end());
2276 }
2277 }
2278
2279 bool AArch64FrameLowering::spillCalleeSavedRegisters(
2280 MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
2281 ArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const {
2282 MachineFunction &MF = *MBB.getParent();
2283 const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
2284 bool NeedsWinCFI = needsWinCFI(MF);
2285 DebugLoc DL;
2286 SmallVector<RegPairInfo, 8> RegPairs;
2287
2288 bool NeedShadowCallStackProlog = false;
2289 computeCalleeSaveRegisterPairs(MF, CSI, TRI, RegPairs,
2290 NeedShadowCallStackProlog, hasFP(MF));
2291 const MachineRegisterInfo &MRI = MF.getRegInfo();
2292
2293 if (NeedShadowCallStackProlog) {
2294 // Shadow call stack prolog: str x30, [x18], #8
2295 BuildMI(MBB, MI, DL, TII.get(AArch64::STRXpost))
2296 .addReg(AArch64::X18, RegState::Define)
2297 .addReg(AArch64::LR)
2298 .addReg(AArch64::X18)
2299 .addImm(8)
2300 .setMIFlag(MachineInstr::FrameSetup);
2301
2302 if (NeedsWinCFI)
2303 BuildMI(MBB, MI, DL, TII.get(AArch64::SEH_Nop))
2304 .setMIFlag(MachineInstr::FrameSetup);
2305
2306 if (!MF.getFunction().hasFnAttribute(Attribute::NoUnwind)) {
2307 // Emit a CFI instruction that causes 8 to be subtracted from the value of
2308 // x18 when unwinding past this frame.
2309 static const char CFIInst[] = {
2310 dwarf::DW_CFA_val_expression,
2311 18, // register
2312 2, // length
2313 static_cast<char>(unsigned(dwarf::DW_OP_breg18)),
2314 static_cast<char>(-8) & 0x7f, // addend (sleb128)
2315 };
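       // For illustration, these escape bytes decode as the DWARF expression
       //   DW_CFA_val_expression 18, { DW_OP_breg18, -8 }
       // i.e. the unwound x18 is recomputed as x18 - 8 (0x78 is the one-byte
       // sleb128 encoding of -8), undoing the post-increment store above.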
2316 unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createEscape(
2317 nullptr, StringRef(CFIInst, sizeof(CFIInst))));
2318 BuildMI(MBB, MI, DL, TII.get(AArch64::CFI_INSTRUCTION))
2319 .addCFIIndex(CFIIndex)
2320 .setMIFlag(MachineInstr::FrameSetup);
2321 }
2322
2323 // This instruction also makes x18 live-in to the entry block.
2324 MBB.addLiveIn(AArch64::X18);
2325 }
2326
2327 for (auto RPII = RegPairs.rbegin(), RPIE = RegPairs.rend(); RPII != RPIE;
2328 ++RPII) {
2329 RegPairInfo RPI = *RPII;
2330 unsigned Reg1 = RPI.Reg1;
2331 unsigned Reg2 = RPI.Reg2;
2332 unsigned StrOpc;
2333
2334 // Issue sequence of spills for cs regs. The first spill may be converted
2335 // to a pre-decrement store later by emitPrologue if the callee-save stack
2336 // area allocation can't be combined with the local stack area allocation.
2337 // For example:
2338 // stp x22, x21, [sp, #0] // addImm(+0)
2339 // stp x20, x19, [sp, #16] // addImm(+2)
2340 // stp fp, lr, [sp, #32] // addImm(+4)
2341 // Rationale: This sequence saves uop updates compared to a sequence of
2342 // pre-increment spills like stp xi,xj,[sp,#-16]!
2343 // Note: Similar rationale and sequence for restores in epilog.
2344 unsigned Size;
2345 Align Alignment;
2346 switch (RPI.Type) {
2347 case RegPairInfo::GPR:
2348 StrOpc = RPI.isPaired() ? AArch64::STPXi : AArch64::STRXui;
2349 Size = 8;
2350 Alignment = Align(8);
2351 break;
2352 case RegPairInfo::FPR64:
2353 StrOpc = RPI.isPaired() ? AArch64::STPDi : AArch64::STRDui;
2354 Size = 8;
2355 Alignment = Align(8);
2356 break;
2357 case RegPairInfo::FPR128:
2358 StrOpc = RPI.isPaired() ? AArch64::STPQi : AArch64::STRQui;
2359 Size = 16;
2360 Alignment = Align(16);
2361 break;
2362 case RegPairInfo::ZPR:
2363 StrOpc = AArch64::STR_ZXI;
2364 Size = 16;
2365 Alignment = Align(16);
2366 break;
2367 case RegPairInfo::PPR:
2368 StrOpc = AArch64::STR_PXI;
2369 Size = 2;
2370 Alignment = Align(2);
2371 break;
2372 }
2373 LLVM_DEBUG(dbgs() << "CSR spill: (" << printReg(Reg1, TRI);
2374 if (RPI.isPaired()) dbgs() << ", " << printReg(Reg2, TRI);
2375 dbgs() << ") -> fi#(" << RPI.FrameIdx;
2376 if (RPI.isPaired()) dbgs() << ", " << RPI.FrameIdx + 1;
2377 dbgs() << ")\n");
2378
2379 assert((!NeedsWinCFI || !(Reg1 == AArch64::LR && Reg2 == AArch64::FP)) &&
2380 "Windows unwdinding requires a consecutive (FP,LR) pair");
2381 // Windows unwind codes require consecutive registers if registers are
2382 // paired. Make the switch here, so that the code below will save (x,x+1)
2383 // and not (x+1,x).
2384 unsigned FrameIdxReg1 = RPI.FrameIdx;
2385 unsigned FrameIdxReg2 = RPI.FrameIdx + 1;
2386 if (NeedsWinCFI && RPI.isPaired()) {
2387 std::swap(Reg1, Reg2);
2388 std::swap(FrameIdxReg1, FrameIdxReg2);
2389 }
2390 MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(StrOpc));
2391 if (!MRI.isReserved(Reg1))
2392 MBB.addLiveIn(Reg1);
2393 if (RPI.isPaired()) {
2394 if (!MRI.isReserved(Reg2))
2395 MBB.addLiveIn(Reg2);
2396 MIB.addReg(Reg2, getPrologueDeath(MF, Reg2));
2397 MIB.addMemOperand(MF.getMachineMemOperand(
2398 MachinePointerInfo::getFixedStack(MF, FrameIdxReg2),
2399 MachineMemOperand::MOStore, Size, Alignment));
2400 }
2401 MIB.addReg(Reg1, getPrologueDeath(MF, Reg1))
2402 .addReg(AArch64::SP)
2403 .addImm(RPI.Offset) // [sp, #offset*scale],
2404 // where factor*scale is implicit
2405 .setMIFlag(MachineInstr::FrameSetup);
2406 MIB.addMemOperand(MF.getMachineMemOperand(
2407 MachinePointerInfo::getFixedStack(MF, FrameIdxReg1),
2408 MachineMemOperand::MOStore, Size, Alignment));
2409 if (NeedsWinCFI)
2410 InsertSEH(MIB, TII, MachineInstr::FrameSetup);
2411
2412 // Update the StackIDs of the SVE stack slots.
2413 MachineFrameInfo &MFI = MF.getFrameInfo();
2414 if (RPI.Type == RegPairInfo::ZPR || RPI.Type == RegPairInfo::PPR)
2415 MFI.setStackID(RPI.FrameIdx, TargetStackID::ScalableVector);
2416
2417 }
2418 return true;
2419 }
2420
2421 bool AArch64FrameLowering::restoreCalleeSavedRegisters(
2422 MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
2423 MutableArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const {
2424 MachineFunction &MF = *MBB.getParent();
2425 const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
2426 DebugLoc DL;
2427 SmallVector<RegPairInfo, 8> RegPairs;
2428 bool NeedsWinCFI = needsWinCFI(MF);
2429
2430 if (MI != MBB.end())
2431 DL = MI->getDebugLoc();
2432
2433 bool NeedShadowCallStackProlog = false;
2434 computeCalleeSaveRegisterPairs(MF, CSI, TRI, RegPairs,
2435 NeedShadowCallStackProlog, hasFP(MF));
2436
2437 auto EmitMI = [&](const RegPairInfo &RPI) {
2438 unsigned Reg1 = RPI.Reg1;
2439 unsigned Reg2 = RPI.Reg2;
2440
2441 // Issue sequence of restores for cs regs. The last restore may be converted
2442 // to a post-increment load later by emitEpilogue if the callee-save stack
2443 // area allocation can't be combined with the local stack area allocation.
2444 // For example:
2445 // ldp fp, lr, [sp, #32] // addImm(+4)
2446 // ldp x20, x19, [sp, #16] // addImm(+2)
2447 // ldp x22, x21, [sp, #0] // addImm(+0)
2448 // Note: see comment in spillCalleeSavedRegisters()
2449 unsigned LdrOpc;
2450 unsigned Size;
2451 Align Alignment;
2452 switch (RPI.Type) {
2453 case RegPairInfo::GPR:
2454 LdrOpc = RPI.isPaired() ? AArch64::LDPXi : AArch64::LDRXui;
2455 Size = 8;
2456 Alignment = Align(8);
2457 break;
2458 case RegPairInfo::FPR64:
2459 LdrOpc = RPI.isPaired() ? AArch64::LDPDi : AArch64::LDRDui;
2460 Size = 8;
2461 Alignment = Align(8);
2462 break;
2463 case RegPairInfo::FPR128:
2464 LdrOpc = RPI.isPaired() ? AArch64::LDPQi : AArch64::LDRQui;
2465 Size = 16;
2466 Alignment = Align(16);
2467 break;
2468 case RegPairInfo::ZPR:
2469 LdrOpc = AArch64::LDR_ZXI;
2470 Size = 16;
2471 Alignment = Align(16);
2472 break;
2473 case RegPairInfo::PPR:
2474 LdrOpc = AArch64::LDR_PXI;
2475 Size = 2;
2476 Alignment = Align(2);
2477 break;
2478 }
2479 LLVM_DEBUG(dbgs() << "CSR restore: (" << printReg(Reg1, TRI);
2480 if (RPI.isPaired()) dbgs() << ", " << printReg(Reg2, TRI);
2481 dbgs() << ") -> fi#(" << RPI.FrameIdx;
2482 if (RPI.isPaired()) dbgs() << ", " << RPI.FrameIdx + 1;
2483 dbgs() << ")\n");
2484
2485 // Windows unwind codes require consecutive registers if registers are
2486 // paired. Make the switch here, so that the code below will save (x,x+1)
2487 // and not (x+1,x).
2488 unsigned FrameIdxReg1 = RPI.FrameIdx;
2489 unsigned FrameIdxReg2 = RPI.FrameIdx + 1;
2490 if (NeedsWinCFI && RPI.isPaired()) {
2491 std::swap(Reg1, Reg2);
2492 std::swap(FrameIdxReg1, FrameIdxReg2);
2493 }
2494 MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(LdrOpc));
2495 if (RPI.isPaired()) {
2496 MIB.addReg(Reg2, getDefRegState(true));
2497 MIB.addMemOperand(MF.getMachineMemOperand(
2498 MachinePointerInfo::getFixedStack(MF, FrameIdxReg2),
2499 MachineMemOperand::MOLoad, Size, Alignment));
2500 }
2501 MIB.addReg(Reg1, getDefRegState(true))
2502 .addReg(AArch64::SP)
2503 .addImm(RPI.Offset) // [sp, #offset*scale]
2504 // where factor*scale is implicit
2505 .setMIFlag(MachineInstr::FrameDestroy);
2506 MIB.addMemOperand(MF.getMachineMemOperand(
2507 MachinePointerInfo::getFixedStack(MF, FrameIdxReg1),
2508 MachineMemOperand::MOLoad, Size, Alignment));
2509 if (NeedsWinCFI)
2510 InsertSEH(MIB, TII, MachineInstr::FrameDestroy);
2511 };
2512
2513 // SVE objects are always restored in reverse order.
2514 for (const RegPairInfo &RPI : reverse(RegPairs))
2515 if (RPI.isScalable())
2516 EmitMI(RPI);
2517
2518 if (ReverseCSRRestoreSeq) {
2519 for (const RegPairInfo &RPI : reverse(RegPairs))
2520 if (!RPI.isScalable())
2521 EmitMI(RPI);
2522 } else
2523 for (const RegPairInfo &RPI : RegPairs)
2524 if (!RPI.isScalable())
2525 EmitMI(RPI);
2526
2527 if (NeedShadowCallStackProlog) {
2528 // Shadow call stack epilog: ldr x30, [x18, #-8]!
2529 BuildMI(MBB, MI, DL, TII.get(AArch64::LDRXpre))
2530 .addReg(AArch64::X18, RegState::Define)
2531 .addReg(AArch64::LR, RegState::Define)
2532 .addReg(AArch64::X18)
2533 .addImm(-8)
2534 .setMIFlag(MachineInstr::FrameDestroy);
2535 }
2536
2537 return true;
2538 }
2539
2540 void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
2541 BitVector &SavedRegs,
2542 RegScavenger *RS) const {
2543 // All calls are tail calls in GHC calling conv, and functions have no
2544 // prologue/epilogue.
2545 if (MF.getFunction().getCallingConv() == CallingConv::GHC)
2546 return;
2547
2548 TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
2549 const AArch64RegisterInfo *RegInfo = static_cast<const AArch64RegisterInfo *>(
2550 MF.getSubtarget().getRegisterInfo());
2551 const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
2552 AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
2553 unsigned UnspilledCSGPR = AArch64::NoRegister;
2554 unsigned UnspilledCSGPRPaired = AArch64::NoRegister;
2555
2556 MachineFrameInfo &MFI = MF.getFrameInfo();
2557 const MCPhysReg *CSRegs = MF.getRegInfo().getCalleeSavedRegs();
2558
2559 unsigned BasePointerReg = RegInfo->hasBasePointer(MF)
2560 ? RegInfo->getBaseRegister()
2561 : (unsigned)AArch64::NoRegister;
2562
2563 unsigned ExtraCSSpill = 0;
2564 // Figure out which callee-saved registers to save/restore.
2565 for (unsigned i = 0; CSRegs[i]; ++i) {
2566 const unsigned Reg = CSRegs[i];
2567
2568 // Add the base pointer register to SavedRegs if it is callee-save.
2569 if (Reg == BasePointerReg)
2570 SavedRegs.set(Reg);
2571
2572 bool RegUsed = SavedRegs.test(Reg);
2573 unsigned PairedReg = AArch64::NoRegister;
2574 if (AArch64::GPR64RegClass.contains(Reg) ||
2575 AArch64::FPR64RegClass.contains(Reg) ||
2576 AArch64::FPR128RegClass.contains(Reg))
2577 PairedReg = CSRegs[i ^ 1];
2578
2579 if (!RegUsed) {
2580 if (AArch64::GPR64RegClass.contains(Reg) &&
2581 !RegInfo->isReservedReg(MF, Reg)) {
2582 UnspilledCSGPR = Reg;
2583 UnspilledCSGPRPaired = PairedReg;
2584 }
2585 continue;
2586 }
2587
2588 // MachO's compact unwind format relies on all registers being stored in
2589 // pairs.
2590 // FIXME: the usual format is actually better if unwinding isn't needed.
2591 if (produceCompactUnwindFrame(MF) && PairedReg != AArch64::NoRegister &&
2592 !SavedRegs.test(PairedReg)) {
2593 SavedRegs.set(PairedReg);
2594 if (AArch64::GPR64RegClass.contains(PairedReg) &&
2595 !RegInfo->isReservedReg(MF, PairedReg))
2596 ExtraCSSpill = PairedReg;
2597 }
2598 }
2599
2600 if (MF.getFunction().getCallingConv() == CallingConv::Win64 &&
2601 !Subtarget.isTargetWindows()) {
2602 // For Windows calling convention on a non-windows OS, where X18 is treated
2603 // as reserved, back up X18 when entering non-windows code (marked with the
2604 // Windows calling convention) and restore when returning regardless of
2605 // whether the individual function uses it - it might call other functions
2606 // that clobber it.
2607 SavedRegs.set(AArch64::X18);
2608 }
2609
2610 // Calculates the callee saved stack size.
2611 unsigned CSStackSize = 0;
2612 unsigned SVECSStackSize = 0;
2613 const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
2614 const MachineRegisterInfo &MRI = MF.getRegInfo();
2615 for (unsigned Reg : SavedRegs.set_bits()) {
2616 auto RegSize = TRI->getRegSizeInBits(Reg, MRI) / 8;
2617 if (AArch64::PPRRegClass.contains(Reg) ||
2618 AArch64::ZPRRegClass.contains(Reg))
2619 SVECSStackSize += RegSize;
2620 else
2621 CSStackSize += RegSize;
2622 }
2623
2624 // Save number of saved regs, so we can easily update CSStackSize later.
2625 unsigned NumSavedRegs = SavedRegs.count();
2626
2627 // The frame record needs to be created by saving the appropriate registers.
2628 uint64_t EstimatedStackSize = MFI.estimateStackSize(MF);
2629 if (hasFP(MF) ||
2630 windowsRequiresStackProbe(MF, EstimatedStackSize + CSStackSize + 16)) {
2631 SavedRegs.set(AArch64::FP);
2632 SavedRegs.set(AArch64::LR);
2633 }
2634
2635 LLVM_DEBUG(dbgs() << "*** determineCalleeSaves\nSaved CSRs:";
2636 for (unsigned Reg
2637 : SavedRegs.set_bits()) dbgs()
2638 << ' ' << printReg(Reg, RegInfo);
2639 dbgs() << "\n";);
2640
2641 // If any callee-saved registers are used, the frame cannot be eliminated.
2642 int64_t SVEStackSize =
2643 alignTo(SVECSStackSize + estimateSVEStackObjectOffsets(MFI), 16);
2644 bool CanEliminateFrame = (SavedRegs.count() == 0) && !SVEStackSize;
2645
2646 // The CSR spill slots have not been allocated yet, so estimateStackSize
2647 // won't include them.
2648 unsigned EstimatedStackSizeLimit = estimateRSStackSizeLimit(MF);
2649
2650 // Conservatively always assume BigStack when there are SVE spills.
2651 bool BigStack = SVEStackSize ||
2652 (EstimatedStackSize + CSStackSize) > EstimatedStackSizeLimit;
2653 if (BigStack || !CanEliminateFrame || RegInfo->cannotEliminateFrame(MF))
2654 AFI->setHasStackFrame(true);
2655
2656 // Estimate if we might need to scavenge a register at some point in order
2657 // to materialize a stack offset. If so, either spill one additional
2658 // callee-saved register or reserve a special spill slot to facilitate
2659 // register scavenging. If we already spilled an extra callee-saved register
2660 // above to keep the number of spills even, we don't need to do anything else
2661 // here.
2662 if (BigStack) {
2663 if (!ExtraCSSpill && UnspilledCSGPR != AArch64::NoRegister) {
2664 LLVM_DEBUG(dbgs() << "Spilling " << printReg(UnspilledCSGPR, RegInfo)
2665 << " to get a scratch register.\n");
2666 SavedRegs.set(UnspilledCSGPR);
2667 // MachO's compact unwind format relies on all registers being stored in
2668 // pairs, so if we need to spill one extra for BigStack, then we need to
2669 // store the pair.
2670 if (produceCompactUnwindFrame(MF))
2671 SavedRegs.set(UnspilledCSGPRPaired);
2672 ExtraCSSpill = UnspilledCSGPR;
2673 }
2674
2675 // If we didn't find an extra callee-saved register to spill, create
2676 // an emergency spill slot.
2677 if (!ExtraCSSpill || MF.getRegInfo().isPhysRegUsed(ExtraCSSpill)) {
2678 const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
2679 const TargetRegisterClass &RC = AArch64::GPR64RegClass;
2680 unsigned Size = TRI->getSpillSize(RC);
2681 Align Alignment = TRI->getSpillAlign(RC);
2682 int FI = MFI.CreateStackObject(Size, Alignment, false);
2683 RS->addScavengingFrameIndex(FI);
2684 LLVM_DEBUG(dbgs() << "No available CS registers, allocated fi#" << FI
2685 << " as the emergency spill slot.\n");
2686 }
2687 }
2688
2689 // Adding the size of additional 64bit GPR saves.
2690 CSStackSize += 8 * (SavedRegs.count() - NumSavedRegs);
2691 uint64_t AlignedCSStackSize = alignTo(CSStackSize, 16);
2692 LLVM_DEBUG(dbgs() << "Estimated stack frame size: "
2693 << EstimatedStackSize + AlignedCSStackSize
2694 << " bytes.\n");
2695
2696 assert((!MFI.isCalleeSavedInfoValid() ||
2697 AFI->getCalleeSavedStackSize() == AlignedCSStackSize) &&
2698 "Should not invalidate callee saved info");
2699
2700 // Round up to register pair alignment to avoid additional SP adjustment
2701 // instructions.
2702 AFI->setCalleeSavedStackSize(AlignedCSStackSize);
2703 AFI->setCalleeSaveStackHasFreeSpace(AlignedCSStackSize != CSStackSize);
2704 AFI->setSVECalleeSavedStackSize(alignTo(SVECSStackSize, 16));
2705 }
2706
2707 bool AArch64FrameLowering::assignCalleeSavedSpillSlots(
2708 MachineFunction &MF, const TargetRegisterInfo *TRI,
2709 std::vector<CalleeSavedInfo> &CSI) const {
2710 bool NeedsWinCFI = needsWinCFI(MF);
2711 // To match the canonical windows frame layout, reverse the list of
2712 // callee saved registers to get them laid out by PrologEpilogInserter
2713 // in the right order. (PrologEpilogInserter allocates stack objects top
2714 // down. Windows canonical prologs store higher numbered registers at
2715 // the top, thus have the CSI array start from the highest registers.)
2716 if (NeedsWinCFI)
2717 std::reverse(CSI.begin(), CSI.end());
2718 // Let the generic code do the rest of the setup.
2719 return false;
2720 }
2721
2722 bool AArch64FrameLowering::enableStackSlotScavenging(
2723 const MachineFunction &MF) const {
2724 const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
2725 return AFI->hasCalleeSaveStackFreeSpace();
2726 }
2727
2728 /// returns true if there are any SVE callee saves.
2729 static bool getSVECalleeSaveSlotRange(const MachineFrameInfo &MFI,
2730 int &Min, int &Max) {
2731 Min = std::numeric_limits<int>::max();
2732 Max = std::numeric_limits<int>::min();
2733
2734 if (!MFI.isCalleeSavedInfoValid())
2735 return false;
2736
2737 const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo();
2738 for (auto &CS : CSI) {
2739 if (AArch64::ZPRRegClass.contains(CS.getReg()) ||
2740 AArch64::PPRRegClass.contains(CS.getReg())) {
2741 assert((Max == std::numeric_limits<int>::min() ||
2742 Max + 1 == CS.getFrameIdx()) &&
2743 "SVE CalleeSaves are not consecutive");
2744
2745 Min = std::min(Min, CS.getFrameIdx());
2746 Max = std::max(Max, CS.getFrameIdx());
2747 }
2748 }
2749 return Min != std::numeric_limits<int>::max();
2750 }
2751
2752 // Process all the SVE stack objects and determine offsets for each
2753 // object. If AssignOffsets is true, the offsets get assigned.
2754 // Fills in the first and last callee-saved frame indices into
2755 // Min/MaxCSFrameIndex, respectively.
2756 // Returns the size of the stack.
2757 static int64_t determineSVEStackObjectOffsets(MachineFrameInfo &MFI,
2758 int &MinCSFrameIndex,
2759 int &MaxCSFrameIndex,
2760 bool AssignOffsets) {
2761 #ifndef NDEBUG
2762 // First process all fixed stack objects.
2763 for (int I = MFI.getObjectIndexBegin(); I != 0; ++I)
2764 assert(MFI.getStackID(I) != TargetStackID::ScalableVector &&
2765 "SVE vectors should never be passed on the stack by value, only by "
2766 "reference.");
2767 #endif
2768
2769 auto Assign = [&MFI](int FI, int64_t Offset) {
2770 LLVM_DEBUG(dbgs() << "alloc FI(" << FI << ") at SP[" << Offset << "]\n");
2771 MFI.setObjectOffset(FI, Offset);
2772 };
2773
2774 int64_t Offset = 0;
2775
2776 // Then process all callee saved slots.
2777 if (getSVECalleeSaveSlotRange(MFI, MinCSFrameIndex, MaxCSFrameIndex)) {
2778 // Assign offsets to the callee save slots.
2779 for (int I = MinCSFrameIndex; I <= MaxCSFrameIndex; ++I) {
2780 Offset += MFI.getObjectSize(I);
2781 Offset = alignTo(Offset, MFI.getObjectAlign(I));
2782 if (AssignOffsets)
2783 Assign(I, -Offset);
2784 }
2785 }
2786
2787 // Ensure that the Callee-save area is aligned to 16bytes.
2788 Offset = alignTo(Offset, Align(16U));
2789
2790 // Create a buffer of SVE objects to allocate and sort it.
2791 SmallVector<int, 8> ObjectsToAllocate;
2792 for (int I = 0, E = MFI.getObjectIndexEnd(); I != E; ++I) {
2793 unsigned StackID = MFI.getStackID(I);
2794 if (StackID != TargetStackID::ScalableVector)
2795 continue;
2796 if (MaxCSFrameIndex >= I && I >= MinCSFrameIndex)
2797 continue;
2798 if (MFI.isDeadObjectIndex(I))
2799 continue;
2800
2801 ObjectsToAllocate.push_back(I);
2802 }
2803
2804 // Allocate all SVE locals and spills
2805 for (unsigned FI : ObjectsToAllocate) {
2806 Align Alignment = MFI.getObjectAlign(FI);
2807 // FIXME: Given that the length of SVE vectors is not necessarily a power of
2808 // two, we'd need to align every object dynamically at runtime if the
2809 // alignment is larger than 16. This is not yet supported.
2810 if (Alignment > Align(16))
2811 report_fatal_error(
2812 "Alignment of scalable vectors > 16 bytes is not yet supported");
2813
2814 Offset = alignTo(Offset + MFI.getObjectSize(FI), Alignment);
2815 if (AssignOffsets)
2816 Assign(FI, -Offset);
2817 }
2818
2819 return Offset;
2820 }
2821
estimateSVEStackObjectOffsets(MachineFrameInfo & MFI) const2822 int64_t AArch64FrameLowering::estimateSVEStackObjectOffsets(
2823 MachineFrameInfo &MFI) const {
2824 int MinCSFrameIndex, MaxCSFrameIndex;
2825 return determineSVEStackObjectOffsets(MFI, MinCSFrameIndex, MaxCSFrameIndex, false);
2826 }
2827
assignSVEStackObjectOffsets(MachineFrameInfo & MFI,int & MinCSFrameIndex,int & MaxCSFrameIndex) const2828 int64_t AArch64FrameLowering::assignSVEStackObjectOffsets(
2829 MachineFrameInfo &MFI, int &MinCSFrameIndex, int &MaxCSFrameIndex) const {
2830 return determineSVEStackObjectOffsets(MFI, MinCSFrameIndex, MaxCSFrameIndex,
2831 true);
2832 }
2833
processFunctionBeforeFrameFinalized(MachineFunction & MF,RegScavenger * RS) const2834 void AArch64FrameLowering::processFunctionBeforeFrameFinalized(
2835 MachineFunction &MF, RegScavenger *RS) const {
2836 MachineFrameInfo &MFI = MF.getFrameInfo();
2837
2838 assert(getStackGrowthDirection() == TargetFrameLowering::StackGrowsDown &&
2839 "Upwards growing stack unsupported");
2840
2841 int MinCSFrameIndex, MaxCSFrameIndex;
2842 int64_t SVEStackSize =
2843 assignSVEStackObjectOffsets(MFI, MinCSFrameIndex, MaxCSFrameIndex);
2844
2845 AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
2846 AFI->setStackSizeSVE(alignTo(SVEStackSize, 16U));
2847 AFI->setMinMaxSVECSFrameIndex(MinCSFrameIndex, MaxCSFrameIndex);
2848
2849 // If this function isn't doing Win64-style C++ EH, we don't need to do
2850 // anything.
2851 if (!MF.hasEHFunclets())
2852 return;
2853 const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
2854 WinEHFuncInfo &EHInfo = *MF.getWinEHFuncInfo();
2855
2856 MachineBasicBlock &MBB = MF.front();
2857 auto MBBI = MBB.begin();
2858 while (MBBI != MBB.end() && MBBI->getFlag(MachineInstr::FrameSetup))
2859 ++MBBI;
2860
2861 // Create an UnwindHelp object.
2862 // The UnwindHelp object is allocated at the start of the fixed object area
2863 int64_t FixedObject =
2864 getFixedObjectSize(MF, AFI, /*IsWin64*/ true, /*IsFunclet*/ false);
2865 int UnwindHelpFI = MFI.CreateFixedObject(/*Size*/ 8,
2866 /*SPOffset*/ -FixedObject,
2867 /*IsImmutable=*/false);
2868 EHInfo.UnwindHelpFrameIdx = UnwindHelpFI;
2869
2870 // We need to store -2 into the UnwindHelp object at the start of the
2871 // function.
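  // (The value -2 appears to be the initial UnwindHelp state the MSVC C++ EH
  // runtime expects; the X86 Win64 EH lowering stores the same value.)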
  DebugLoc DL;
  RS->enterBasicBlockEnd(MBB);
  RS->backward(std::prev(MBBI));
  unsigned DstReg = RS->FindUnusedReg(&AArch64::GPR64commonRegClass);
  assert(DstReg && "There must be a free register after frame setup");
  BuildMI(MBB, MBBI, DL, TII.get(AArch64::MOVi64imm), DstReg).addImm(-2);
  BuildMI(MBB, MBBI, DL, TII.get(AArch64::STURXi))
      .addReg(DstReg, getKillRegState(true))
      .addFrameIndex(UnwindHelpFI)
      .addImm(0);
}

namespace {
struct TagStoreInstr {
  MachineInstr *MI;
  int64_t Offset, Size;
  explicit TagStoreInstr(MachineInstr *MI, int64_t Offset, int64_t Size)
      : MI(MI), Offset(Offset), Size(Size) {}
};

class TagStoreEdit {
  MachineFunction *MF;
  MachineBasicBlock *MBB;
  MachineRegisterInfo *MRI;
  // Tag store instructions that are being replaced.
  SmallVector<TagStoreInstr, 8> TagStores;
  // Combined memref arguments of the above instructions.
  SmallVector<MachineMemOperand *, 8> CombinedMemRefs;

  // Replace allocation tags in [FrameReg + FrameRegOffset, FrameReg +
  // FrameRegOffset + Size) with the address tag of SP.
  Register FrameReg;
  StackOffset FrameRegOffset;
  int64_t Size;
  // If not None, move FrameReg to (FrameReg + FrameRegUpdate) at the end.
  Optional<int64_t> FrameRegUpdate;
  // MIFlags for any FrameReg updating instructions.
  unsigned FrameRegUpdateFlags;

  // Use zeroing instruction variants.
  bool ZeroData;
  DebugLoc DL;

  void emitUnrolled(MachineBasicBlock::iterator InsertI);
  void emitLoop(MachineBasicBlock::iterator InsertI);

public:
  TagStoreEdit(MachineBasicBlock *MBB, bool ZeroData)
      : MBB(MBB), ZeroData(ZeroData) {
    MF = MBB->getParent();
    MRI = &MF->getRegInfo();
  }
  // Add an instruction to be replaced. Instructions must be added in
  // ascending order of Offset and must be adjacent.
  void addInstruction(TagStoreInstr I) {
    assert((TagStores.empty() ||
            TagStores.back().Offset + TagStores.back().Size == I.Offset) &&
           "Non-adjacent tag store instructions.");
    TagStores.push_back(I);
  }
  void clear() { TagStores.clear(); }
  // Emit equivalent code at the given location, and erase the current set of
  // instructions. May skip the replacement if it is not profitable. May
  // invalidate the input iterator, replacing it with a valid one.
  void emitCode(MachineBasicBlock::iterator &InsertI,
                const AArch64FrameLowering *TFI, bool IsLast);
};

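// Emit a linear sequence of STG/ST2G (or their zeroing variants) covering
// [FrameReg + FrameRegOffset, FrameReg + FrameRegOffset + Size). As a purely
// illustrative example (register choice is hypothetical), a 48-byte region at
// offset 0 with ZeroData == false becomes:
//   stg  sp, [x8, #32]
//   st2g sp, [x8, #0]  // the store at offset 0 is deliberately placed last,
//                      // so a following SP adjustment can be folded into it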
void TagStoreEdit::emitUnrolled(MachineBasicBlock::iterator InsertI) {
  const AArch64InstrInfo *TII =
      MF->getSubtarget<AArch64Subtarget>().getInstrInfo();

  const int64_t kMinOffset = -256 * 16;
  const int64_t kMaxOffset = 255 * 16;
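  // These bounds follow from the STG/ST2G addressing mode: a signed 9-bit
  // immediate scaled by 16 bytes, i.e. [-4096, 4080].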

  Register BaseReg = FrameReg;
  int64_t BaseRegOffsetBytes = FrameRegOffset.getFixed();
  if (BaseRegOffsetBytes < kMinOffset ||
      BaseRegOffsetBytes + (Size - Size % 32) > kMaxOffset) {
    Register ScratchReg = MRI->createVirtualRegister(&AArch64::GPR64RegClass);
    emitFrameOffset(*MBB, InsertI, DL, ScratchReg, BaseReg,
                    StackOffset::getFixed(BaseRegOffsetBytes), TII);
    BaseReg = ScratchReg;
    BaseRegOffsetBytes = 0;
  }

  MachineInstr *LastI = nullptr;
  while (Size) {
    int64_t InstrSize = (Size > 16) ? 32 : 16;
    unsigned Opcode =
        InstrSize == 16
            ? (ZeroData ? AArch64::STZGOffset : AArch64::STGOffset)
            : (ZeroData ? AArch64::STZ2GOffset : AArch64::ST2GOffset);
    MachineInstr *I = BuildMI(*MBB, InsertI, DL, TII->get(Opcode))
                          .addReg(AArch64::SP)
                          .addReg(BaseReg)
                          .addImm(BaseRegOffsetBytes / 16)
                          .setMemRefs(CombinedMemRefs);
    // A store to [BaseReg, #0] should go last for an opportunity to fold the
    // final SP adjustment in the epilogue.
    if (BaseRegOffsetBytes == 0)
      LastI = I;
    BaseRegOffsetBytes += InstrSize;
    Size -= InstrSize;
  }

  if (LastI)
    MBB->splice(InsertI, MBB, LastI);
}

void TagStoreEdit::emitLoop(MachineBasicBlock::iterator InsertI) {
  const AArch64InstrInfo *TII =
      MF->getSubtarget<AArch64Subtarget>().getInstrInfo();

  Register BaseReg = FrameRegUpdate
                         ? FrameReg
                         : MRI->createVirtualRegister(&AArch64::GPR64RegClass);
  Register SizeReg = MRI->createVirtualRegister(&AArch64::GPR64RegClass);

  emitFrameOffset(*MBB, InsertI, DL, BaseReg, FrameReg, FrameRegOffset, TII);

  int64_t LoopSize = Size;
  // If the loop size is not a multiple of 32, split off one 16-byte store at
  // the end to fold the BaseReg update into.
  if (FrameRegUpdate && *FrameRegUpdate)
    LoopSize -= LoopSize % 32;
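  // Illustrative example with hypothetical values: Size == 48 with a pending
  // base register update gives LoopSize == 32, so the loop tags the first 32
  // bytes and the post-indexed store below tags the remaining 16 while also
  // applying the update.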
  MachineInstr *LoopI = BuildMI(*MBB, InsertI, DL,
                                TII->get(ZeroData ? AArch64::STZGloop_wback
                                                  : AArch64::STGloop_wback))
                            .addDef(SizeReg)
                            .addDef(BaseReg)
                            .addImm(LoopSize)
                            .addReg(BaseReg)
                            .setMemRefs(CombinedMemRefs);
  if (FrameRegUpdate)
    LoopI->setFlags(FrameRegUpdateFlags);

  int64_t ExtraBaseRegUpdate =
      FrameRegUpdate ? (*FrameRegUpdate - FrameRegOffset.getFixed() - Size) : 0;
  if (LoopSize < Size) {
    assert(FrameRegUpdate);
    assert(Size - LoopSize == 16);
    // Tag 16 more bytes at BaseReg and update BaseReg.
    BuildMI(*MBB, InsertI, DL,
            TII->get(ZeroData ? AArch64::STZGPostIndex : AArch64::STGPostIndex))
        .addDef(BaseReg)
        .addReg(BaseReg)
        .addReg(BaseReg)
        .addImm(1 + ExtraBaseRegUpdate / 16)
        .setMemRefs(CombinedMemRefs)
        .setMIFlags(FrameRegUpdateFlags);
  } else if (ExtraBaseRegUpdate) {
    // Update BaseReg.
    BuildMI(
        *MBB, InsertI, DL,
        TII->get(ExtraBaseRegUpdate > 0 ? AArch64::ADDXri : AArch64::SUBXri))
        .addDef(BaseReg)
        .addReg(BaseReg)
        .addImm(std::abs(ExtraBaseRegUpdate))
        .addImm(0)
        .setMIFlags(FrameRegUpdateFlags);
  }
}

// Check if *II is a register update that can be merged into the STGloop that
// ends at (Reg + Size). If so, *TotalOffset is set to the update's full
// adjustment of Reg, which the merged loop applies through its write-back
// plus a small post-loop correction.
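// Illustrative example with hypothetical operands: if the loop ends at
// SP + 80 and *II is "add sp, sp, #96", the leftover adjustment after the
// loop's write-back is 96 - 80 == 16 bytes, which is 16-byte aligned and
// encodable, so the update can be merged and *TotalOffset is set to 96.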
bool canMergeRegUpdate(MachineBasicBlock::iterator II, unsigned Reg,
                       int64_t Size, int64_t *TotalOffset) {
  MachineInstr &MI = *II;
  if ((MI.getOpcode() == AArch64::ADDXri ||
       MI.getOpcode() == AArch64::SUBXri) &&
      MI.getOperand(0).getReg() == Reg && MI.getOperand(1).getReg() == Reg) {
    unsigned Shift = AArch64_AM::getShiftValue(MI.getOperand(3).getImm());
    int64_t Offset = MI.getOperand(2).getImm() << Shift;
    if (MI.getOpcode() == AArch64::SUBXri)
      Offset = -Offset;
    int64_t AbsPostOffset = std::abs(Offset - Size);
    const int64_t kMaxOffset =
        0xFFF; // Max encoding for unshifted ADDXri / SUBXri
    if (AbsPostOffset <= kMaxOffset && AbsPostOffset % 16 == 0) {
      *TotalOffset = Offset;
      return true;
    }
  }
  return false;
}

void mergeMemRefs(const SmallVectorImpl<TagStoreInstr> &TSE,
                  SmallVectorImpl<MachineMemOperand *> &MemRefs) {
  MemRefs.clear();
  for (auto &TS : TSE) {
    MachineInstr *MI = TS.MI;
    // An instruction without memory operands may access anything. Be
    // conservative and return an empty list.
    if (MI->memoperands_empty()) {
      MemRefs.clear();
      return;
    }
    MemRefs.append(MI->memoperands_begin(), MI->memoperands_end());
  }
}

void TagStoreEdit::emitCode(MachineBasicBlock::iterator &InsertI,
                            const AArch64FrameLowering *TFI, bool IsLast) {
  if (TagStores.empty())
    return;
  TagStoreInstr &FirstTagStore = TagStores[0];
  TagStoreInstr &LastTagStore = TagStores[TagStores.size() - 1];
  Size = LastTagStore.Offset - FirstTagStore.Offset + LastTagStore.Size;
  DL = TagStores[0].MI->getDebugLoc();

  Register Reg;
  FrameRegOffset = TFI->resolveFrameOffsetReference(
      *MF, FirstTagStore.Offset, false /*isFixed*/, false /*isSVE*/, Reg,
      /*PreferFP=*/false, /*ForSimm=*/true);
  FrameReg = Reg;
  FrameRegUpdate = None;

  mergeMemRefs(TagStores, CombinedMemRefs);

  LLVM_DEBUG(dbgs() << "Replacing adjacent STG instructions:\n";
             for (const auto &Instr : TagStores) {
               dbgs() << "  " << *Instr.MI;
             });

  // Size threshold where a loop becomes shorter than a linear sequence of
  // tagging instructions.
  const int kSetTagLoopThreshold = 176;
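  // At 176 bytes the unrolled form would need six stores (five ST2G plus one
  // STG); past this point the loop form, including its setup, is presumably
  // no larger. The exact threshold looks empirically chosen.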
  if (Size < kSetTagLoopThreshold) {
    if (TagStores.size() < 2)
      return;
    emitUnrolled(InsertI);
  } else {
    MachineInstr *UpdateInstr = nullptr;
    int64_t TotalOffset;
    if (IsLast) {
      // See if we can merge the base register update into the STGloop.
      // This is done in AArch64LoadStoreOptimizer for "normal" stores,
      // but STGloop is way too unusual for that, and it only realistically
      // happens in the function epilogue. Moreover, STGloop is expanded
      // before that pass runs.
      if (InsertI != MBB->end() &&
          canMergeRegUpdate(InsertI, FrameReg, FrameRegOffset.getFixed() + Size,
                            &TotalOffset)) {
        UpdateInstr = &*InsertI++;
        LLVM_DEBUG(dbgs() << "Folding SP update into loop:\n  "
                          << *UpdateInstr);
      }
    }

    if (!UpdateInstr && TagStores.size() < 2)
      return;

    if (UpdateInstr) {
      FrameRegUpdate = TotalOffset;
      FrameRegUpdateFlags = UpdateInstr->getFlags();
    }
    emitLoop(InsertI);
    if (UpdateInstr)
      UpdateInstr->eraseFromParent();
  }

  for (auto &TS : TagStores)
    TS.MI->eraseFromParent();
}

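// Returns true if MI is one of the tag store instructions (STG/STZG,
// ST2G/STZ2G, or the STGloop/STZGloop pseudos) operating on a known stack
// slot, and computes the slot-relative Offset and the Size in bytes that it
// tags. Worked example with hypothetical numbers: an STGOffset whose frame
// index has object offset -32 and whose immediate is 1 yields
// Offset == -32 + 16 * 1 == -16 and Size == 16.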
bool isMergeableStackTaggingInstruction(MachineInstr &MI, int64_t &Offset,
                                        int64_t &Size, bool &ZeroData) {
  MachineFunction &MF = *MI.getParent()->getParent();
  const MachineFrameInfo &MFI = MF.getFrameInfo();

  unsigned Opcode = MI.getOpcode();
  ZeroData = (Opcode == AArch64::STZGloop || Opcode == AArch64::STZGOffset ||
              Opcode == AArch64::STZ2GOffset);

  if (Opcode == AArch64::STGloop || Opcode == AArch64::STZGloop) {
    if (!MI.getOperand(0).isDead() || !MI.getOperand(1).isDead())
      return false;
    if (!MI.getOperand(2).isImm() || !MI.getOperand(3).isFI())
      return false;
    Offset = MFI.getObjectOffset(MI.getOperand(3).getIndex());
    Size = MI.getOperand(2).getImm();
    return true;
  }

  if (Opcode == AArch64::STGOffset || Opcode == AArch64::STZGOffset)
    Size = 16;
  else if (Opcode == AArch64::ST2GOffset || Opcode == AArch64::STZ2GOffset)
    Size = 32;
  else
    return false;

  if (MI.getOperand(0).getReg() != AArch64::SP || !MI.getOperand(1).isFI())
    return false;

  Offset = MFI.getObjectOffset(MI.getOperand(1).getIndex()) +
           16 * MI.getOperand(2).getImm();
  return true;
}

// Detect a run of memory tagging instructions for adjacent stack frame slots,
// and replace them with a shorter instruction sequence:
// * replace STG + STG with ST2G
// * replace STGloop + STGloop with STGloop
// This code needs to run when stack slot offsets are already known, but before
// FrameIndex operands in STG instructions are eliminated.
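// Illustrative before/after (hypothetical frame indices): two STGs covering
// adjacent 16-byte slots,
//   STGOffset $sp, %stack.0, 0
//   STGOffset $sp, %stack.1, 0
// can be replaced by a single 32-byte
//   ST2GOffset $sp, %stack.0, 0
// when %stack.1 sits immediately above %stack.0 in memory.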
MachineBasicBlock::iterator tryMergeAdjacentSTG(MachineBasicBlock::iterator II,
                                                const AArch64FrameLowering *TFI,
                                                RegScavenger *RS) {
  bool FirstZeroData;
  int64_t Size, Offset;
  MachineInstr &MI = *II;
  MachineBasicBlock *MBB = MI.getParent();
  MachineBasicBlock::iterator NextI = ++II;
  if (&MI == &MBB->instr_back())
    return II;
  if (!isMergeableStackTaggingInstruction(MI, Offset, Size, FirstZeroData))
    return II;

  SmallVector<TagStoreInstr, 4> Instrs;
  Instrs.emplace_back(&MI, Offset, Size);

  constexpr int kScanLimit = 10;
  int Count = 0;
  for (MachineBasicBlock::iterator E = MBB->end();
       NextI != E && Count < kScanLimit; ++NextI) {
    MachineInstr &MI = *NextI;
    bool ZeroData;
    int64_t Size, Offset;
    // Collect instructions that update memory tags with a FrameIndex operand
    // and (when applicable) constant size, and whose output registers are dead
    // (the latter is almost always the case in practice). Since these
    // instructions effectively have no inputs or outputs, we are free to skip
    // any non-aliasing instructions in between without tracking used
    // registers.
    if (isMergeableStackTaggingInstruction(MI, Offset, Size, ZeroData)) {
      if (ZeroData != FirstZeroData)
        break;
      Instrs.emplace_back(&MI, Offset, Size);
      continue;
    }

    // Only count non-transient, non-tagging instructions toward the scan
    // limit.
    if (!MI.isTransient())
      ++Count;

    // Just in case, stop before the epilogue code starts.
    if (MI.getFlag(MachineInstr::FrameSetup) ||
        MI.getFlag(MachineInstr::FrameDestroy))
      break;

    // Reject anything that may alias the collected instructions.
    if (MI.mayLoadOrStore() || MI.hasUnmodeledSideEffects())
      break;
  }

  // New code will be inserted after the last tagging instruction we've found.
  MachineBasicBlock::iterator InsertI = Instrs.back().MI;
  InsertI++;

  llvm::stable_sort(Instrs,
                    [](const TagStoreInstr &Left, const TagStoreInstr &Right) {
                      return Left.Offset < Right.Offset;
                    });

  // Make sure that we don't have any overlapping stores.
  int64_t CurOffset = Instrs[0].Offset;
  for (auto &Instr : Instrs) {
    if (CurOffset > Instr.Offset)
      return NextI;
    CurOffset = Instr.Offset + Instr.Size;
  }

  // Find contiguous runs of tagged memory and emit shorter instruction
  // sequences for them when possible.
  TagStoreEdit TSE(MBB, FirstZeroData);
  Optional<int64_t> EndOffset;
  for (auto &Instr : Instrs) {
    if (EndOffset && *EndOffset != Instr.Offset) {
      // Found a gap.
      TSE.emitCode(InsertI, TFI, /*IsLast = */ false);
      TSE.clear();
    }

    TSE.addInstruction(Instr);
    EndOffset = Instr.Offset + Instr.Size;
  }

  TSE.emitCode(InsertI, TFI, /*IsLast = */ true);

  return InsertI;
}
} // namespace

void AArch64FrameLowering::processFunctionBeforeFrameIndicesReplaced(
    MachineFunction &MF, RegScavenger *RS = nullptr) const {
  if (StackTaggingMergeSetTag)
    for (auto &BB : MF)
      for (MachineBasicBlock::iterator II = BB.begin(); II != BB.end();)
        II = tryMergeAdjacentSTG(II, this, RS);
}

/// For Win64 AArch64 EH, the offset to the Unwind object is from the SP
/// before the update. This is easily retrieved as it is exactly the offset
/// that is set in processFunctionBeforeFrameFinalized.
StackOffset AArch64FrameLowering::getFrameIndexReferencePreferSP(
    const MachineFunction &MF, int FI, Register &FrameReg,
    bool IgnoreSPUpdates) const {
  const MachineFrameInfo &MFI = MF.getFrameInfo();
  if (IgnoreSPUpdates) {
    LLVM_DEBUG(dbgs() << "Offset from the SP for " << FI << " is "
                      << MFI.getObjectOffset(FI) << "\n");
    FrameReg = AArch64::SP;
    return StackOffset::getFixed(MFI.getObjectOffset(FI));
  }

  return getFrameIndexReference(MF, FI, FrameReg);
}

/// The parent frame offset (aka dispFrame) is only used on X86_64 to retrieve
/// the parent's frame pointer.
unsigned AArch64FrameLowering::getWinEHParentFrameOffset(
    const MachineFunction &MF) const {
  return 0;
}

/// Funclets only need to account for space for the callee saved registers,
/// as the locals are accounted for in the parent's stack frame.
unsigned AArch64FrameLowering::getWinEHFuncletFrameSize(
    const MachineFunction &MF) const {
  // This is the size of the pushed CSRs.
  unsigned CSSize =
      MF.getInfo<AArch64FunctionInfo>()->getCalleeSavedStackSize();
  // This is the amount of stack a funclet needs to allocate.
  return alignTo(CSSize + MF.getFrameInfo().getMaxCallFrameSize(),
                 getStackAlign());
}

namespace {
struct FrameObject {
  bool IsValid = false;
  // Index of the object in MFI.
  int ObjectIndex = 0;
  // Group ID this object belongs to.
  int GroupIndex = -1;
  // This object should be placed first (closest to SP).
  bool ObjectFirst = false;
  // This object's group (which always contains the object with
  // ObjectFirst==true) should be placed first.
  bool GroupFirst = false;
};

class GroupBuilder {
  SmallVector<int, 8> CurrentMembers;
  int NextGroupIndex = 0;
  std::vector<FrameObject> &Objects;

public:
  GroupBuilder(std::vector<FrameObject> &Objects) : Objects(Objects) {}
  void AddMember(int Index) { CurrentMembers.push_back(Index); }
  void EndCurrentGroup() {
    if (CurrentMembers.size() > 1) {
      // Create a new group with the current member list. This might remove
      // them from their pre-existing groups. That's OK, dealing with
      // overlapping groups is too hard and unlikely to make a difference.
      LLVM_DEBUG(dbgs() << "group:");
      for (int Index : CurrentMembers) {
        Objects[Index].GroupIndex = NextGroupIndex;
        LLVM_DEBUG(dbgs() << " " << Index);
      }
      LLVM_DEBUG(dbgs() << "\n");
      NextGroupIndex++;
    }
    CurrentMembers.clear();
  }
};

bool FrameObjectCompare(const FrameObject &A, const FrameObject &B) {
  // Objects at a lower index are closer to FP; objects at a higher index are
  // closer to SP.
  //
  // For consistency in our comparison, all invalid objects are placed
  // at the end. This also allows us to stop walking when we hit the
  // first invalid item after it's all sorted.
  //
  // The "first" object goes first (closest to SP), followed by the members of
  // the "first" group.
  //
  // The rest are sorted by the group index to keep the groups together.
  // Higher numbered groups are more likely to be around longer (i.e. untagged
  // in the function epilogue and not at some earlier point). Place them closer
  // to SP.
  //
  // If all else is equal, sort by the object index to keep the objects in
  // their original order.
  return std::make_tuple(!A.IsValid, A.ObjectFirst, A.GroupFirst, A.GroupIndex,
                         A.ObjectIndex) <
         std::make_tuple(!B.IsValid, B.ObjectFirst, B.GroupFirst, B.GroupIndex,
                         B.ObjectIndex);
}
} // namespace

void AArch64FrameLowering::orderFrameObjects(
    const MachineFunction &MF, SmallVectorImpl<int> &ObjectsToAllocate) const {
  if (!OrderFrameObjects || ObjectsToAllocate.empty())
    return;

  const MachineFrameInfo &MFI = MF.getFrameInfo();
  std::vector<FrameObject> FrameObjects(MFI.getObjectIndexEnd());
  for (auto &Obj : ObjectsToAllocate) {
    FrameObjects[Obj].IsValid = true;
    FrameObjects[Obj].ObjectIndex = Obj;
  }

  // Identify stack slots that are tagged at the same time.
  GroupBuilder GB(FrameObjects);
  for (auto &MBB : MF) {
    for (auto &MI : MBB) {
      if (MI.isDebugInstr())
        continue;
      int OpIndex;
      switch (MI.getOpcode()) {
      case AArch64::STGloop:
      case AArch64::STZGloop:
        OpIndex = 3;
        break;
      case AArch64::STGOffset:
      case AArch64::STZGOffset:
      case AArch64::ST2GOffset:
      case AArch64::STZ2GOffset:
        OpIndex = 1;
        break;
      default:
        OpIndex = -1;
      }

      int TaggedFI = -1;
      if (OpIndex >= 0) {
        const MachineOperand &MO = MI.getOperand(OpIndex);
        if (MO.isFI()) {
          int FI = MO.getIndex();
          if (FI >= 0 && FI < MFI.getObjectIndexEnd() &&
              FrameObjects[FI].IsValid)
            TaggedFI = FI;
        }
      }

      // If this is a stack tagging instruction for a slot that is not part of
      // a group yet, either start a new group or add it to the current one.
      if (TaggedFI >= 0)
        GB.AddMember(TaggedFI);
      else
        GB.EndCurrentGroup();
    }
    // Groups should never span multiple basic blocks.
    GB.EndCurrentGroup();
  }

  // If the function's tagged base pointer is pinned to a stack slot, we want
  // to put that slot first when possible. This will likely place it at SP + 0,
  // and save one instruction when generating the base pointer, because IRG
  // does not allow an immediate offset.
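  // For example (illustrative): with the slot at SP + 0 the base pointer can
  // be materialized with a single "irg x0, sp", whereas a slot at SP + 16
  // would need an extra "add x0, sp, #16" first.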
  const AArch64FunctionInfo &AFI = *MF.getInfo<AArch64FunctionInfo>();
  Optional<int> TBPI = AFI.getTaggedBasePointerIndex();
  if (TBPI) {
    FrameObjects[*TBPI].ObjectFirst = true;
    FrameObjects[*TBPI].GroupFirst = true;
    int FirstGroupIndex = FrameObjects[*TBPI].GroupIndex;
    if (FirstGroupIndex >= 0)
      for (FrameObject &Object : FrameObjects)
        if (Object.GroupIndex == FirstGroupIndex)
          Object.GroupFirst = true;
  }

  llvm::stable_sort(FrameObjects, FrameObjectCompare);

  int i = 0;
  for (auto &Obj : FrameObjects) {
    // All invalid items are sorted at the end, so it's safe to stop.
    if (!Obj.IsValid)
      break;
    ObjectsToAllocate[i++] = Obj.ObjectIndex;
  }

  LLVM_DEBUG(dbgs() << "Final frame order:\n"; for (auto &Obj : FrameObjects) {
    if (!Obj.IsValid)
      break;
    dbgs() << "  " << Obj.ObjectIndex << ": group " << Obj.GroupIndex;
    if (Obj.ObjectFirst)
      dbgs() << ", first";
    if (Obj.GroupFirst)
      dbgs() << ", group-first";
    dbgs() << "\n";
  });
}