//===- AArch64InstrInfo.cpp - AArch64 Instruction Information ------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file contains the AArch64 implementation of the TargetInstrInfo class.
//
//===----------------------------------------------------------------------===//

#include "AArch64InstrInfo.h"
#include "AArch64ExpandImm.h"
#include "AArch64FrameLowering.h"
#include "AArch64MachineFunctionInfo.h"
#include "AArch64PointerAuth.h"
#include "AArch64Subtarget.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "Utils/AArch64BaseInfo.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/LivePhysRegs.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineCombinerPattern.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RegisterScavenging.h"
#include "llvm/CodeGen/StackMaps.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCInstBuilder.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/LEB128.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
#include <cassert>
#include <cstdint>
#include <iterator>
#include <utility>

using namespace llvm;

#define GET_INSTRINFO_CTOR_DTOR
#include "AArch64GenInstrInfo.inc"

static cl::opt<unsigned> TBZDisplacementBits(
    "aarch64-tbz-offset-bits", cl::Hidden, cl::init(14),
    cl::desc("Restrict range of TB[N]Z instructions (DEBUG)"));

static cl::opt<unsigned> CBZDisplacementBits(
    "aarch64-cbz-offset-bits", cl::Hidden, cl::init(19),
    cl::desc("Restrict range of CB[N]Z instructions (DEBUG)"));

static cl::opt<unsigned>
    BCCDisplacementBits("aarch64-bcc-offset-bits", cl::Hidden, cl::init(19),
                        cl::desc("Restrict range of Bcc instructions (DEBUG)"));

static cl::opt<unsigned>
    BDisplacementBits("aarch64-b-offset-bits", cl::Hidden, cl::init(26),
                      cl::desc("Restrict range of B instructions (DEBUG)"));

AArch64InstrInfo::AArch64InstrInfo(const AArch64Subtarget &STI)
    : AArch64GenInstrInfo(AArch64::ADJCALLSTACKDOWN, AArch64::ADJCALLSTACKUP,
                          AArch64::CATCHRET),
      RI(STI.getTargetTriple()), Subtarget(STI) {}

/// GetInstSize - Return the number of bytes of code the specified
/// instruction may occupy. This returns the maximum number of bytes.
unsigned AArch64InstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
  const MachineBasicBlock &MBB = *MI.getParent();
  const MachineFunction *MF = MBB.getParent();
  const Function &F = MF->getFunction();
  const MCAsmInfo *MAI = MF->getTarget().getMCAsmInfo();

  {
    auto Op = MI.getOpcode();
    if (Op == AArch64::INLINEASM || Op == AArch64::INLINEASM_BR)
      return getInlineAsmLength(MI.getOperand(0).getSymbolName(), *MAI);
  }

  // Meta-instructions emit no code.
  if (MI.isMetaInstruction())
    return 0;

  // FIXME: We currently only handle pseudoinstructions that don't get expanded
  // before the assembly printer.
  unsigned NumBytes = 0;
  const MCInstrDesc &Desc = MI.getDesc();

  // The size should preferably be set in
  // llvm/lib/Target/AArch64/AArch64InstrInfo.td (the default case below).
  // The specific cases here handle instructions of variable size.
  switch (Desc.getOpcode()) {
  default:
    if (Desc.getSize())
      return Desc.getSize();

    // Anything not explicitly designated otherwise (i.e. pseudo-instructions
    // with fixed constant size but not specified in .td file) is a normal
    // 4-byte insn.
    NumBytes = 4;
    break;
  case TargetOpcode::STACKMAP:
    // The upper bound for a stackmap intrinsic is the full length of its
    // shadow.
    NumBytes = StackMapOpers(&MI).getNumPatchBytes();
    assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
    break;
  case TargetOpcode::PATCHPOINT:
    // The size of the patchpoint intrinsic is the number of bytes requested.
    NumBytes = PatchPointOpers(&MI).getNumPatchBytes();
    assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
    break;
  case TargetOpcode::STATEPOINT:
    NumBytes = StatepointOpers(&MI).getNumPatchBytes();
    assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
    // No patch bytes means a normal call inst is emitted.
    if (NumBytes == 0)
      NumBytes = 4;
    break;
  case TargetOpcode::PATCHABLE_FUNCTION_ENTER:
    // If `patchable-function-entry` is set, PATCHABLE_FUNCTION_ENTER
    // instructions are expanded to the specified number of NOPs. Otherwise,
    // they are expanded to 36-byte XRay sleds.
    NumBytes =
        F.getFnAttributeAsParsedInteger("patchable-function-entry", 9) * 4;
    break;
  case TargetOpcode::PATCHABLE_FUNCTION_EXIT:
  case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL:
    // An XRay sled can be 4 bytes of alignment plus a 32-byte block.
    NumBytes = 36;
    break;
  case TargetOpcode::PATCHABLE_EVENT_CALL:
    // EVENT_CALL XRay sleds are exactly 6 instructions long (no alignment).
    NumBytes = 24;
    break;

  case AArch64::SPACE:
    NumBytes = MI.getOperand(1).getImm();
    break;
  case TargetOpcode::BUNDLE:
    NumBytes = getInstBundleLength(MI);
    break;
  }

  return NumBytes;
}

unsigned AArch64InstrInfo::getInstBundleLength(const MachineInstr &MI) const {
  unsigned Size = 0;
  MachineBasicBlock::const_instr_iterator I = MI.getIterator();
  MachineBasicBlock::const_instr_iterator E = MI.getParent()->instr_end();
  while (++I != E && I->isInsideBundle()) {
    assert(!I->isBundle() && "No nested bundle!");
    Size += getInstSizeInBytes(*I);
  }
  return Size;
}

static void parseCondBranch(MachineInstr *LastInst, MachineBasicBlock *&Target,
                            SmallVectorImpl<MachineOperand> &Cond) {
  // Block ends with fall-through condbranch.
  switch (LastInst->getOpcode()) {
  default:
    llvm_unreachable("Unknown branch instruction?");
  case AArch64::Bcc:
    Target = LastInst->getOperand(1).getMBB();
    Cond.push_back(LastInst->getOperand(0));
    break;
  case AArch64::CBZW:
  case AArch64::CBZX:
  case AArch64::CBNZW:
  case AArch64::CBNZX:
    Target = LastInst->getOperand(1).getMBB();
    Cond.push_back(MachineOperand::CreateImm(-1));
    Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode()));
    Cond.push_back(LastInst->getOperand(0));
    break;
  case AArch64::TBZW:
  case AArch64::TBZX:
  case AArch64::TBNZW:
  case AArch64::TBNZX:
    Target = LastInst->getOperand(2).getMBB();
    Cond.push_back(MachineOperand::CreateImm(-1));
    Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode()));
    Cond.push_back(LastInst->getOperand(0));
    Cond.push_back(LastInst->getOperand(1));
  }
}
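
// As an illustration of the encoding above, a block ending in
//   tbz w0, #3, %bb.target
// yields Target = %bb.target and Cond = { -1, TBZW, w0, 3 }, whereas a plain
//   b.ge %bb.target
// yields Cond = { GE }. The leading -1 marks a folded compare-and-branch so
// consumers of Cond can tell the two encodings apart.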

static unsigned getBranchDisplacementBits(unsigned Opc) {
  switch (Opc) {
  default:
    llvm_unreachable("unexpected opcode!");
  case AArch64::B:
    return BDisplacementBits;
  case AArch64::TBNZW:
  case AArch64::TBZW:
  case AArch64::TBNZX:
  case AArch64::TBZX:
    return TBZDisplacementBits;
  case AArch64::CBNZW:
  case AArch64::CBZW:
  case AArch64::CBNZX:
  case AArch64::CBZX:
    return CBZDisplacementBits;
  case AArch64::Bcc:
    return BCCDisplacementBits;
  }
}

bool AArch64InstrInfo::isBranchOffsetInRange(unsigned BranchOp,
                                             int64_t BrOffset) const {
  unsigned Bits = getBranchDisplacementBits(BranchOp);
  assert(Bits >= 3 && "max branch displacement must be enough to jump "
                      "over conditional branch expansion");
  return isIntN(Bits, BrOffset / 4);
}
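
// With the default widths above this matches the architectural ranges: B has
// 26 offset bits (+/-128 MiB), Bcc and CB(N)Z have 19 bits (+/-1 MiB), and
// TB(N)Z has 14 bits (+/-32 KiB), with all offsets scaled by the 4-byte
// instruction size. For example, isBranchOffsetInRange(AArch64::TBZW, 40000)
// is false, since 40000 / 4 = 10000 does not fit in a signed 14-bit field.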

MachineBasicBlock *
AArch64InstrInfo::getBranchDestBlock(const MachineInstr &MI) const {
  switch (MI.getOpcode()) {
  default:
    llvm_unreachable("unexpected opcode!");
  case AArch64::B:
    return MI.getOperand(0).getMBB();
  case AArch64::TBZW:
  case AArch64::TBNZW:
  case AArch64::TBZX:
  case AArch64::TBNZX:
    return MI.getOperand(2).getMBB();
  case AArch64::CBZW:
  case AArch64::CBNZW:
  case AArch64::CBZX:
  case AArch64::CBNZX:
  case AArch64::Bcc:
    return MI.getOperand(1).getMBB();
  }
}

void AArch64InstrInfo::insertIndirectBranch(MachineBasicBlock &MBB,
                                            MachineBasicBlock &NewDestBB,
                                            MachineBasicBlock &RestoreBB,
                                            const DebugLoc &DL,
                                            int64_t BrOffset,
                                            RegScavenger *RS) const {
  assert(RS && "RegScavenger required for long branching");
  assert(MBB.empty() &&
         "new block should be inserted for expanding unconditional branch");
  assert(MBB.pred_size() == 1);
  assert(RestoreBB.empty() &&
         "restore block should be inserted for restoring clobbered registers");

  auto buildIndirectBranch = [&](Register Reg, MachineBasicBlock &DestBB) {
    // Offsets outside of the signed 33-bit range are not supported for ADRP +
    // ADD.
    if (!isInt<33>(BrOffset))
      report_fatal_error(
          "Branch offsets outside of the signed 33-bit range not supported");

    BuildMI(MBB, MBB.end(), DL, get(AArch64::ADRP), Reg)
        .addSym(DestBB.getSymbol(), AArch64II::MO_PAGE);
    BuildMI(MBB, MBB.end(), DL, get(AArch64::ADDXri), Reg)
        .addReg(Reg)
        .addSym(DestBB.getSymbol(), AArch64II::MO_PAGEOFF | AArch64II::MO_NC)
        .addImm(0);
    BuildMI(MBB, MBB.end(), DL, get(AArch64::BR)).addReg(Reg);
  };

  RS->enterBasicBlockEnd(MBB);
  // If X16 is unused, we can rely on the linker to insert a range extension
  // thunk if NewDestBB is out of range of a single B instruction.
  constexpr Register Reg = AArch64::X16;
  if (!RS->isRegUsed(Reg)) {
    insertUnconditionalBranch(MBB, &NewDestBB, DL);
    RS->setRegUsed(Reg);
    return;
  }

  // If there's a free register and it's worth inflating the code size,
  // manually insert the indirect branch.
  Register Scavenged = RS->FindUnusedReg(&AArch64::GPR64RegClass);
  if (Scavenged != AArch64::NoRegister &&
      MBB.getSectionID() == MBBSectionID::ColdSectionID) {
    buildIndirectBranch(Scavenged, NewDestBB);
    RS->setRegUsed(Scavenged);
    return;
  }

  // Note: Spilling X16 briefly moves the stack pointer, making it incompatible
  // with red zones.
  AArch64FunctionInfo *AFI = MBB.getParent()->getInfo<AArch64FunctionInfo>();
  if (!AFI || AFI->hasRedZone().value_or(true))
    report_fatal_error(
        "Unable to insert indirect branch inside function that has red zone");

  // Otherwise, spill X16 and defer range extension to the linker.
  BuildMI(MBB, MBB.end(), DL, get(AArch64::STRXpre))
      .addReg(AArch64::SP, RegState::Define)
      .addReg(Reg)
      .addReg(AArch64::SP)
      .addImm(-16);

  BuildMI(MBB, MBB.end(), DL, get(AArch64::B)).addMBB(&RestoreBB);

  BuildMI(RestoreBB, RestoreBB.end(), DL, get(AArch64::LDRXpost))
      .addReg(AArch64::SP, RegState::Define)
      .addReg(Reg, RegState::Define)
      .addReg(AArch64::SP)
      .addImm(16);
}

// Branch analysis.
bool AArch64InstrInfo::analyzeBranch(MachineBasicBlock &MBB,
                                     MachineBasicBlock *&TBB,
                                     MachineBasicBlock *&FBB,
                                     SmallVectorImpl<MachineOperand> &Cond,
                                     bool AllowModify) const {
  // If the block has no terminators, it just falls into the block after it.
  MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
  if (I == MBB.end())
    return false;

  // Skip over SpeculationBarrierEndBB terminators.
  if (I->getOpcode() == AArch64::SpeculationBarrierISBDSBEndBB ||
      I->getOpcode() == AArch64::SpeculationBarrierSBEndBB) {
    --I;
  }

  if (!isUnpredicatedTerminator(*I))
    return false;

  // Get the last instruction in the block.
  MachineInstr *LastInst = &*I;

  // If there is only one terminator instruction, process it.
  unsigned LastOpc = LastInst->getOpcode();
  if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
    if (isUncondBranchOpcode(LastOpc)) {
      TBB = LastInst->getOperand(0).getMBB();
      return false;
    }
    if (isCondBranchOpcode(LastOpc)) {
      // Block ends with fall-through condbranch.
      parseCondBranch(LastInst, TBB, Cond);
      return false;
    }
    return true; // Can't handle indirect branch.
  }

  // Get the instruction before it if it is a terminator.
  MachineInstr *SecondLastInst = &*I;
  unsigned SecondLastOpc = SecondLastInst->getOpcode();

  // If AllowModify is true and the block ends with two or more unconditional
  // branches, delete all but the first unconditional branch.
  if (AllowModify && isUncondBranchOpcode(LastOpc)) {
    while (isUncondBranchOpcode(SecondLastOpc)) {
      LastInst->eraseFromParent();
      LastInst = SecondLastInst;
      LastOpc = LastInst->getOpcode();
      if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
        // Return now; the only terminator is an unconditional branch.
        TBB = LastInst->getOperand(0).getMBB();
        return false;
      }
      SecondLastInst = &*I;
      SecondLastOpc = SecondLastInst->getOpcode();
    }
  }

  // If we're allowed to modify and the block ends in an unconditional branch
  // which could simply fall through, remove the branch. (Note: This case only
  // matters when we can't understand the whole sequence, otherwise it's also
  // handled by BranchFolding.cpp.)
  if (AllowModify && isUncondBranchOpcode(LastOpc) &&
      MBB.isLayoutSuccessor(getBranchDestBlock(*LastInst))) {
    LastInst->eraseFromParent();
    LastInst = SecondLastInst;
    LastOpc = LastInst->getOpcode();
    if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
      assert(!isUncondBranchOpcode(LastOpc) &&
             "unreachable unconditional branches removed above");

      if (isCondBranchOpcode(LastOpc)) {
        // Block ends with fall-through condbranch.
        parseCondBranch(LastInst, TBB, Cond);
        return false;
      }
      return true; // Can't handle indirect branch.
    }
    SecondLastInst = &*I;
    SecondLastOpc = SecondLastInst->getOpcode();
  }

  // If there are three terminators, we don't know what sort of block this is.
  if (SecondLastInst && I != MBB.begin() && isUnpredicatedTerminator(*--I))
    return true;

  // If the block ends with a B and a Bcc, handle it.
  if (isCondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
    parseCondBranch(SecondLastInst, TBB, Cond);
    FBB = LastInst->getOperand(0).getMBB();
    return false;
  }

  // If the block ends with two unconditional branches, handle it. The second
  // one is not executed, so remove it.
  if (isUncondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
    TBB = SecondLastInst->getOperand(0).getMBB();
    I = LastInst;
    if (AllowModify)
      I->eraseFromParent();
    return false;
  }

  // ...likewise if it ends with an indirect branch followed by an
  // unconditional branch.
  if (isIndirectBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
    I = LastInst;
    if (AllowModify)
      I->eraseFromParent();
    return true;
  }

  // Otherwise, can't handle this.
  return true;
}

bool AArch64InstrInfo::analyzeBranchPredicate(MachineBasicBlock &MBB,
                                              MachineBranchPredicate &MBP,
                                              bool AllowModify) const {
  // For the moment, handle only a block which ends with a cb(n)zx followed by
  // a fallthrough. Why this? Because it is a common form.
  // TODO: Should we handle b.cc?

  MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
  if (I == MBB.end())
    return true;

  // Skip over SpeculationBarrierEndBB terminators.
  if (I->getOpcode() == AArch64::SpeculationBarrierISBDSBEndBB ||
      I->getOpcode() == AArch64::SpeculationBarrierSBEndBB) {
    --I;
  }

  if (!isUnpredicatedTerminator(*I))
    return true;

  // Get the last instruction in the block.
  MachineInstr *LastInst = &*I;
  unsigned LastOpc = LastInst->getOpcode();
  if (!isCondBranchOpcode(LastOpc))
    return true;

  switch (LastOpc) {
  default:
    return true;
  case AArch64::CBZW:
  case AArch64::CBZX:
  case AArch64::CBNZW:
  case AArch64::CBNZX:
    break;
  }

  MBP.TrueDest = LastInst->getOperand(1).getMBB();
  assert(MBP.TrueDest && "expected!");
  MBP.FalseDest = MBB.getNextNode();

  MBP.ConditionDef = nullptr;
  MBP.SingleUseCondition = false;

  MBP.LHS = LastInst->getOperand(0);
  MBP.RHS = MachineOperand::CreateImm(0);
  MBP.Predicate = LastOpc == AArch64::CBNZX ? MachineBranchPredicate::PRED_NE
                                            : MachineBranchPredicate::PRED_EQ;
  return false;
}

bool AArch64InstrInfo::reverseBranchCondition(
    SmallVectorImpl<MachineOperand> &Cond) const {
  if (Cond[0].getImm() != -1) {
    // Regular Bcc.
    AArch64CC::CondCode CC = (AArch64CC::CondCode)(int)Cond[0].getImm();
    Cond[0].setImm(AArch64CC::getInvertedCondCode(CC));
  } else {
    // Folded compare-and-branch.
    switch (Cond[1].getImm()) {
    default:
      llvm_unreachable("Unknown conditional branch!");
    case AArch64::CBZW:
      Cond[1].setImm(AArch64::CBNZW);
      break;
    case AArch64::CBNZW:
      Cond[1].setImm(AArch64::CBZW);
      break;
    case AArch64::CBZX:
      Cond[1].setImm(AArch64::CBNZX);
      break;
    case AArch64::CBNZX:
      Cond[1].setImm(AArch64::CBZX);
      break;
    case AArch64::TBZW:
      Cond[1].setImm(AArch64::TBNZW);
      break;
    case AArch64::TBNZW:
      Cond[1].setImm(AArch64::TBZW);
      break;
    case AArch64::TBZX:
      Cond[1].setImm(AArch64::TBNZX);
      break;
    case AArch64::TBNZX:
      Cond[1].setImm(AArch64::TBZX);
      break;
    }
  }

  return false;
}
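
// For example, Cond = { GE } (from b.ge) becomes Cond = { LT }, and
// Cond = { -1, TBZW, w0, 3 } becomes Cond = { -1, TBNZW, w0, 3 }; the
// register and bit operands are left untouched.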

unsigned AArch64InstrInfo::removeBranch(MachineBasicBlock &MBB,
                                        int *BytesRemoved) const {
  MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
  if (I == MBB.end())
    return 0;

  if (!isUncondBranchOpcode(I->getOpcode()) &&
      !isCondBranchOpcode(I->getOpcode()))
    return 0;

  // Remove the branch.
  I->eraseFromParent();

  I = MBB.end();

  if (I == MBB.begin()) {
    if (BytesRemoved)
      *BytesRemoved = 4;
    return 1;
  }
  --I;
  if (!isCondBranchOpcode(I->getOpcode())) {
    if (BytesRemoved)
      *BytesRemoved = 4;
    return 1;
  }

  // Remove the branch.
  I->eraseFromParent();
  if (BytesRemoved)
    *BytesRemoved = 8;

  return 2;
}

void AArch64InstrInfo::instantiateCondBranch(
    MachineBasicBlock &MBB, const DebugLoc &DL, MachineBasicBlock *TBB,
    ArrayRef<MachineOperand> Cond) const {
  if (Cond[0].getImm() != -1) {
    // Regular Bcc.
    BuildMI(&MBB, DL, get(AArch64::Bcc)).addImm(Cond[0].getImm()).addMBB(TBB);
  } else {
    // Folded compare-and-branch.
    // Note that we use MachineInstrBuilder::add instead of addReg to keep the
    // register operand's flags.
    const MachineInstrBuilder MIB =
        BuildMI(&MBB, DL, get(Cond[1].getImm())).add(Cond[2]);
    if (Cond.size() > 3)
      MIB.addImm(Cond[3].getImm());
    MIB.addMBB(TBB);
  }
}

unsigned AArch64InstrInfo::insertBranch(
    MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB,
    ArrayRef<MachineOperand> Cond, const DebugLoc &DL, int *BytesAdded) const {
  // Shouldn't be a fall through.
  assert(TBB && "insertBranch must not be told to insert a fallthrough");

  if (!FBB) {
    if (Cond.empty()) // Unconditional branch?
      BuildMI(&MBB, DL, get(AArch64::B)).addMBB(TBB);
    else
      instantiateCondBranch(MBB, DL, TBB, Cond);

    if (BytesAdded)
      *BytesAdded = 4;

    return 1;
  }

  // Two-way conditional branch.
  instantiateCondBranch(MBB, DL, TBB, Cond);
  BuildMI(&MBB, DL, get(AArch64::B)).addMBB(FBB);

  if (BytesAdded)
    *BytesAdded = 8;

  return 2;
}

// Find the original register that VReg is copied from.
static unsigned removeCopies(const MachineRegisterInfo &MRI, unsigned VReg) {
  while (Register::isVirtualRegister(VReg)) {
    const MachineInstr *DefMI = MRI.getVRegDef(VReg);
    if (!DefMI->isFullCopy())
      return VReg;
    VReg = DefMI->getOperand(1).getReg();
  }
  return VReg;
}

// Determine if VReg is defined by an instruction that can be folded into a
// csel instruction. If so, return the folded opcode, and the replacement
// register.
static unsigned canFoldIntoCSel(const MachineRegisterInfo &MRI, unsigned VReg,
                                unsigned *NewVReg = nullptr) {
  VReg = removeCopies(MRI, VReg);
  if (!Register::isVirtualRegister(VReg))
    return 0;

  bool Is64Bit = AArch64::GPR64allRegClass.hasSubClassEq(MRI.getRegClass(VReg));
  const MachineInstr *DefMI = MRI.getVRegDef(VReg);
  unsigned Opc = 0;
  unsigned SrcOpNum = 0;
  switch (DefMI->getOpcode()) {
  case AArch64::ADDSXri:
  case AArch64::ADDSWri:
    // If NZCV is used, do not fold.
    if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, true) == -1)
      return 0;
    // Fall through to ADDXri and ADDWri.
    [[fallthrough]];
  case AArch64::ADDXri:
  case AArch64::ADDWri:
    // add x, 1 -> csinc.
    if (!DefMI->getOperand(2).isImm() || DefMI->getOperand(2).getImm() != 1 ||
        DefMI->getOperand(3).getImm() != 0)
      return 0;
    SrcOpNum = 1;
    Opc = Is64Bit ? AArch64::CSINCXr : AArch64::CSINCWr;
    break;

  case AArch64::ORNXrr:
  case AArch64::ORNWrr: {
    // not x -> csinv, represented as orn dst, xzr, src.
    unsigned ZReg = removeCopies(MRI, DefMI->getOperand(1).getReg());
    if (ZReg != AArch64::XZR && ZReg != AArch64::WZR)
      return 0;
    SrcOpNum = 2;
    Opc = Is64Bit ? AArch64::CSINVXr : AArch64::CSINVWr;
    break;
  }

  case AArch64::SUBSXrr:
  case AArch64::SUBSWrr:
    // If NZCV is used, do not fold.
    if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, true) == -1)
      return 0;
    // Fall through to SUBXrr and SUBWrr.
    [[fallthrough]];
  case AArch64::SUBXrr:
  case AArch64::SUBWrr: {
    // neg x -> csneg, represented as sub dst, xzr, src.
    unsigned ZReg = removeCopies(MRI, DefMI->getOperand(1).getReg());
    if (ZReg != AArch64::XZR && ZReg != AArch64::WZR)
      return 0;
    SrcOpNum = 2;
    Opc = Is64Bit ? AArch64::CSNEGXr : AArch64::CSNEGWr;
    break;
  }
  default:
    return 0;
  }
  assert(Opc && SrcOpNum && "Missing parameters");

  if (NewVReg)
    *NewVReg = DefMI->getOperand(SrcOpNum).getReg();
  return Opc;
}
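
// For example, if VReg is defined by "%v = ADDWri %x, 1, 0" (i.e. x + 1) and
// the flags are otherwise unused, this returns CSINCWr with *NewVReg = %x,
// letting a caller such as insertSelect() below emit a single
// "csinc dst, ..., %x, cc" instead of a separate add followed by a csel.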

bool AArch64InstrInfo::canInsertSelect(const MachineBasicBlock &MBB,
                                       ArrayRef<MachineOperand> Cond,
                                       Register DstReg, Register TrueReg,
                                       Register FalseReg, int &CondCycles,
                                       int &TrueCycles,
                                       int &FalseCycles) const {
  // Check register classes.
  const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
  const TargetRegisterClass *RC =
      RI.getCommonSubClass(MRI.getRegClass(TrueReg), MRI.getRegClass(FalseReg));
  if (!RC)
    return false;

  // Also need to check the dest regclass, in case we're trying to optimize
  // something like:
  //   %1(gpr) = PHI %2(fpr), bb1, %3(fpr), bb2
  if (!RI.getCommonSubClass(RC, MRI.getRegClass(DstReg)))
    return false;

  // Expanding cbz/tbz requires an extra cycle of latency on the condition.
  unsigned ExtraCondLat = Cond.size() != 1;

  // GPRs are handled by csel.
  // FIXME: Fold in x+1, -x, and ~x when applicable.
  if (AArch64::GPR64allRegClass.hasSubClassEq(RC) ||
      AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
    // Single-cycle csel, csinc, csinv, and csneg.
    CondCycles = 1 + ExtraCondLat;
    TrueCycles = FalseCycles = 1;
    if (canFoldIntoCSel(MRI, TrueReg))
      TrueCycles = 0;
    else if (canFoldIntoCSel(MRI, FalseReg))
      FalseCycles = 0;
    return true;
  }

  // Scalar floating point is handled by fcsel.
  // FIXME: Form fabs, fmin, and fmax when applicable.
  if (AArch64::FPR64RegClass.hasSubClassEq(RC) ||
      AArch64::FPR32RegClass.hasSubClassEq(RC)) {
    CondCycles = 5 + ExtraCondLat;
    TrueCycles = FalseCycles = 2;
    return true;
  }

  // Can't do vectors.
  return false;
}

void AArch64InstrInfo::insertSelect(MachineBasicBlock &MBB,
                                    MachineBasicBlock::iterator I,
                                    const DebugLoc &DL, Register DstReg,
                                    ArrayRef<MachineOperand> Cond,
                                    Register TrueReg, Register FalseReg) const {
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();

  // Parse the condition code, see parseCondBranch() above.
  AArch64CC::CondCode CC;
  switch (Cond.size()) {
  default:
    llvm_unreachable("Unknown condition opcode in Cond");
  case 1: // b.cc
    CC = AArch64CC::CondCode(Cond[0].getImm());
    break;
  case 3: { // cbz/cbnz
    // We must insert a compare against 0.
    bool Is64Bit;
    switch (Cond[1].getImm()) {
    default:
      llvm_unreachable("Unknown branch opcode in Cond");
    case AArch64::CBZW:
      Is64Bit = false;
      CC = AArch64CC::EQ;
      break;
    case AArch64::CBZX:
      Is64Bit = true;
      CC = AArch64CC::EQ;
      break;
    case AArch64::CBNZW:
      Is64Bit = false;
      CC = AArch64CC::NE;
      break;
    case AArch64::CBNZX:
      Is64Bit = true;
      CC = AArch64CC::NE;
      break;
    }
    Register SrcReg = Cond[2].getReg();
    if (Is64Bit) {
      // cmp reg, #0 is actually subs xzr, reg, #0.
      MRI.constrainRegClass(SrcReg, &AArch64::GPR64spRegClass);
      BuildMI(MBB, I, DL, get(AArch64::SUBSXri), AArch64::XZR)
          .addReg(SrcReg)
          .addImm(0)
          .addImm(0);
    } else {
      MRI.constrainRegClass(SrcReg, &AArch64::GPR32spRegClass);
      BuildMI(MBB, I, DL, get(AArch64::SUBSWri), AArch64::WZR)
          .addReg(SrcReg)
          .addImm(0)
          .addImm(0);
    }
    break;
  }
  case 4: { // tbz/tbnz
    // We must insert a tst instruction.
    switch (Cond[1].getImm()) {
    default:
      llvm_unreachable("Unknown branch opcode in Cond");
    case AArch64::TBZW:
    case AArch64::TBZX:
      CC = AArch64CC::EQ;
      break;
    case AArch64::TBNZW:
    case AArch64::TBNZX:
      CC = AArch64CC::NE;
      break;
    }
    // tst reg, #(1 << foo) is actually ands xzr, reg, #(1 << foo).
    if (Cond[1].getImm() == AArch64::TBZW || Cond[1].getImm() == AArch64::TBNZW)
      BuildMI(MBB, I, DL, get(AArch64::ANDSWri), AArch64::WZR)
          .addReg(Cond[2].getReg())
          .addImm(
              AArch64_AM::encodeLogicalImmediate(1ull << Cond[3].getImm(), 32));
    else
      BuildMI(MBB, I, DL, get(AArch64::ANDSXri), AArch64::XZR)
          .addReg(Cond[2].getReg())
          .addImm(
              AArch64_AM::encodeLogicalImmediate(1ull << Cond[3].getImm(), 64));
    break;
  }
  }

  unsigned Opc = 0;
  const TargetRegisterClass *RC = nullptr;
  bool TryFold = false;
  if (MRI.constrainRegClass(DstReg, &AArch64::GPR64RegClass)) {
    RC = &AArch64::GPR64RegClass;
    Opc = AArch64::CSELXr;
    TryFold = true;
  } else if (MRI.constrainRegClass(DstReg, &AArch64::GPR32RegClass)) {
    RC = &AArch64::GPR32RegClass;
    Opc = AArch64::CSELWr;
    TryFold = true;
  } else if (MRI.constrainRegClass(DstReg, &AArch64::FPR64RegClass)) {
    RC = &AArch64::FPR64RegClass;
    Opc = AArch64::FCSELDrrr;
  } else if (MRI.constrainRegClass(DstReg, &AArch64::FPR32RegClass)) {
    RC = &AArch64::FPR32RegClass;
    Opc = AArch64::FCSELSrrr;
  }
  assert(RC && "Unsupported regclass");

  // Try folding simple instructions into the csel.
  if (TryFold) {
    unsigned NewVReg = 0;
    unsigned FoldedOpc = canFoldIntoCSel(MRI, TrueReg, &NewVReg);
    if (FoldedOpc) {
      // The folded opcodes csinc, csinv and csneg apply the operation to
      // FalseReg, so we need to invert the condition.
      CC = AArch64CC::getInvertedCondCode(CC);
      TrueReg = FalseReg;
    } else
      FoldedOpc = canFoldIntoCSel(MRI, FalseReg, &NewVReg);

    // Fold the operation. Leave any dead instructions for DCE to clean up.
    if (FoldedOpc) {
      FalseReg = NewVReg;
      Opc = FoldedOpc;
      // This extends the live range of NewVReg.
      MRI.clearKillFlags(NewVReg);
    }
  }

  // Pull all virtual registers into the appropriate class.
  MRI.constrainRegClass(TrueReg, RC);
  MRI.constrainRegClass(FalseReg, RC);

  // Insert the csel.
  BuildMI(MBB, I, DL, get(Opc), DstReg)
      .addReg(TrueReg)
      .addReg(FalseReg)
      .addImm(CC);
}

// Return true if Imm can be loaded into a register by a "cheap" sequence of
// instructions. For now, "cheap" means at most two instructions.
static bool isCheapImmediate(const MachineInstr &MI, unsigned BitSize) {
  if (BitSize == 32)
    return true;

  assert(BitSize == 64 && "Only bit sizes of 32 or 64 allowed");
  uint64_t Imm = static_cast<uint64_t>(MI.getOperand(1).getImm());
  SmallVector<AArch64_IMM::ImmInsnModel, 4> Is;
  AArch64_IMM::expandMOVImm(Imm, BitSize, Is);

  return Is.size() <= 2;
}
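
// For example, 0x0000000012345678 expands to MOVZ + MOVK (two instructions)
// and is considered cheap, whereas 0x0000123456789ABC needs
// MOVZ + MOVK + MOVK and is not.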

// FIXME: this implementation should be micro-architecture dependent, so a
// micro-architecture target hook should be introduced here in future.
bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr &MI) const {
  if (Subtarget.hasExynosCheapAsMoveHandling()) {
    if (isExynosCheapAsMove(MI))
      return true;
    return MI.isAsCheapAsAMove();
  }

  switch (MI.getOpcode()) {
  default:
    return MI.isAsCheapAsAMove();

  case AArch64::ADDWrs:
  case AArch64::ADDXrs:
  case AArch64::SUBWrs:
  case AArch64::SUBXrs:
    return Subtarget.hasALULSLFast() && MI.getOperand(3).getImm() <= 4;

  // If MOVi32imm or MOVi64imm can be expanded into ORRWri or
  // ORRXri, it is as cheap as MOV.
  // Likewise if it can be expanded to MOVZ/MOVN/MOVK.
  case AArch64::MOVi32imm:
    return isCheapImmediate(MI, 32);
  case AArch64::MOVi64imm:
    return isCheapImmediate(MI, 64);
  }
}

bool AArch64InstrInfo::isFalkorShiftExtFast(const MachineInstr &MI) {
  switch (MI.getOpcode()) {
  default:
    return false;

  case AArch64::ADDWrs:
  case AArch64::ADDXrs:
  case AArch64::ADDSWrs:
  case AArch64::ADDSXrs: {
    unsigned Imm = MI.getOperand(3).getImm();
    unsigned ShiftVal = AArch64_AM::getShiftValue(Imm);
    if (ShiftVal == 0)
      return true;
    return AArch64_AM::getShiftType(Imm) == AArch64_AM::LSL && ShiftVal <= 5;
  }

  case AArch64::ADDWrx:
  case AArch64::ADDXrx:
  case AArch64::ADDXrx64:
  case AArch64::ADDSWrx:
  case AArch64::ADDSXrx:
  case AArch64::ADDSXrx64: {
    unsigned Imm = MI.getOperand(3).getImm();
    switch (AArch64_AM::getArithExtendType(Imm)) {
    default:
      return false;
    case AArch64_AM::UXTB:
    case AArch64_AM::UXTH:
    case AArch64_AM::UXTW:
    case AArch64_AM::UXTX:
      return AArch64_AM::getArithShiftValue(Imm) <= 4;
    }
  }

  case AArch64::SUBWrs:
  case AArch64::SUBSWrs: {
    unsigned Imm = MI.getOperand(3).getImm();
    unsigned ShiftVal = AArch64_AM::getShiftValue(Imm);
    return ShiftVal == 0 ||
           (AArch64_AM::getShiftType(Imm) == AArch64_AM::ASR && ShiftVal == 31);
  }

  case AArch64::SUBXrs:
  case AArch64::SUBSXrs: {
    unsigned Imm = MI.getOperand(3).getImm();
    unsigned ShiftVal = AArch64_AM::getShiftValue(Imm);
    return ShiftVal == 0 ||
           (AArch64_AM::getShiftType(Imm) == AArch64_AM::ASR && ShiftVal == 63);
  }

  case AArch64::SUBWrx:
  case AArch64::SUBXrx:
  case AArch64::SUBXrx64:
  case AArch64::SUBSWrx:
  case AArch64::SUBSXrx:
  case AArch64::SUBSXrx64: {
    unsigned Imm = MI.getOperand(3).getImm();
    switch (AArch64_AM::getArithExtendType(Imm)) {
    default:
      return false;
    case AArch64_AM::UXTB:
    case AArch64_AM::UXTH:
    case AArch64_AM::UXTW:
    case AArch64_AM::UXTX:
      return AArch64_AM::getArithShiftValue(Imm) == 0;
    }
  }

  case AArch64::LDRBBroW:
  case AArch64::LDRBBroX:
  case AArch64::LDRBroW:
  case AArch64::LDRBroX:
  case AArch64::LDRDroW:
  case AArch64::LDRDroX:
  case AArch64::LDRHHroW:
  case AArch64::LDRHHroX:
  case AArch64::LDRHroW:
  case AArch64::LDRHroX:
  case AArch64::LDRQroW:
  case AArch64::LDRQroX:
  case AArch64::LDRSBWroW:
  case AArch64::LDRSBWroX:
  case AArch64::LDRSBXroW:
  case AArch64::LDRSBXroX:
  case AArch64::LDRSHWroW:
  case AArch64::LDRSHWroX:
  case AArch64::LDRSHXroW:
  case AArch64::LDRSHXroX:
  case AArch64::LDRSWroW:
  case AArch64::LDRSWroX:
  case AArch64::LDRSroW:
  case AArch64::LDRSroX:
  case AArch64::LDRWroW:
  case AArch64::LDRWroX:
  case AArch64::LDRXroW:
  case AArch64::LDRXroX:
  case AArch64::PRFMroW:
  case AArch64::PRFMroX:
  case AArch64::STRBBroW:
  case AArch64::STRBBroX:
  case AArch64::STRBroW:
  case AArch64::STRBroX:
  case AArch64::STRDroW:
  case AArch64::STRDroX:
  case AArch64::STRHHroW:
  case AArch64::STRHHroX:
  case AArch64::STRHroW:
  case AArch64::STRHroX:
  case AArch64::STRQroW:
  case AArch64::STRQroX:
  case AArch64::STRSroW:
  case AArch64::STRSroX:
  case AArch64::STRWroW:
  case AArch64::STRWroX:
  case AArch64::STRXroW:
  case AArch64::STRXroX: {
    unsigned IsSigned = MI.getOperand(3).getImm();
    return !IsSigned;
  }
  }
}
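
// For example, "add x0, x1, x2, lsl #3" is classified as fast (LSL amount
// <= 5), as is the register-offset load "ldr x0, [x1, x2]", whereas
// "ldr x0, [x1, w2, sxtw]" is not, because its offset uses a signed extend.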

bool AArch64InstrInfo::isSEHInstruction(const MachineInstr &MI) {
  unsigned Opc = MI.getOpcode();
  switch (Opc) {
  default:
    return false;
  case AArch64::SEH_StackAlloc:
  case AArch64::SEH_SaveFPLR:
  case AArch64::SEH_SaveFPLR_X:
  case AArch64::SEH_SaveReg:
  case AArch64::SEH_SaveReg_X:
  case AArch64::SEH_SaveRegP:
  case AArch64::SEH_SaveRegP_X:
  case AArch64::SEH_SaveFReg:
  case AArch64::SEH_SaveFReg_X:
  case AArch64::SEH_SaveFRegP:
  case AArch64::SEH_SaveFRegP_X:
  case AArch64::SEH_SetFP:
  case AArch64::SEH_AddFP:
  case AArch64::SEH_Nop:
  case AArch64::SEH_PrologEnd:
  case AArch64::SEH_EpilogStart:
  case AArch64::SEH_EpilogEnd:
  case AArch64::SEH_PACSignLR:
  case AArch64::SEH_SaveAnyRegQP:
  case AArch64::SEH_SaveAnyRegQPX:
    return true;
  }
}

bool AArch64InstrInfo::isCoalescableExtInstr(const MachineInstr &MI,
                                             Register &SrcReg, Register &DstReg,
                                             unsigned &SubIdx) const {
  switch (MI.getOpcode()) {
  default:
    return false;
  case AArch64::SBFMXri: // aka sxtw
  case AArch64::UBFMXri: // aka uxtw
    // Check for the 32 -> 64 bit extension case, these instructions can do
    // much more.
    if (MI.getOperand(2).getImm() != 0 || MI.getOperand(3).getImm() != 31)
      return false;
    // This is a signed or unsigned 32 -> 64 bit extension.
    SrcReg = MI.getOperand(1).getReg();
    DstReg = MI.getOperand(0).getReg();
    SubIdx = AArch64::sub_32;
    return true;
  }
}

bool AArch64InstrInfo::areMemAccessesTriviallyDisjoint(
    const MachineInstr &MIa, const MachineInstr &MIb) const {
  const TargetRegisterInfo *TRI = &getRegisterInfo();
  const MachineOperand *BaseOpA = nullptr, *BaseOpB = nullptr;
  int64_t OffsetA = 0, OffsetB = 0;
  TypeSize WidthA(0, false), WidthB(0, false);
  bool OffsetAIsScalable = false, OffsetBIsScalable = false;

  assert(MIa.mayLoadOrStore() && "MIa must be a load or store.");
  assert(MIb.mayLoadOrStore() && "MIb must be a load or store.");

  if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects() ||
      MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())
    return false;

  // Retrieve the base, the offset from the base, and the width. The width is
  // the size of memory that is being loaded/stored (e.g. 1, 2, 4, 8). If the
  // bases are identical, and the offset of the lower memory access plus its
  // width does not overlap the offset of the higher memory access, then the
  // memory accesses are disjoint.
  // If OffsetAIsScalable and OffsetBIsScalable are both true, they are assumed
  // to have the same scale (vscale).
  if (getMemOperandWithOffsetWidth(MIa, BaseOpA, OffsetA, OffsetAIsScalable,
                                   WidthA, TRI) &&
      getMemOperandWithOffsetWidth(MIb, BaseOpB, OffsetB, OffsetBIsScalable,
                                   WidthB, TRI)) {
    if (BaseOpA->isIdenticalTo(*BaseOpB) &&
        OffsetAIsScalable == OffsetBIsScalable) {
      int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
      int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
      TypeSize LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
      if (LowWidth.isScalable() == OffsetAIsScalable &&
          LowOffset + (int)LowWidth.getKnownMinValue() <= HighOffset)
        return true;
    }
  }
  return false;
}
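
// For example, "str x0, [x1]" and "str x2, [x1, #8]" share the base x1; the
// lower access covers bytes [0, 8) and the higher one starts at offset 8, so
// the two stores are reported as disjoint. Accesses whose base operands
// differ are conservatively treated as potentially aliasing.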

bool AArch64InstrInfo::isSchedulingBoundary(const MachineInstr &MI,
                                            const MachineBasicBlock *MBB,
                                            const MachineFunction &MF) const {
  if (TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF))
    return true;

  // Do not move an instruction that can be recognized as a branch target.
  if (hasBTISemantics(MI))
    return true;

  switch (MI.getOpcode()) {
  case AArch64::HINT:
    // CSDB hints are scheduling barriers.
    if (MI.getOperand(0).getImm() == 0x14)
      return true;
    break;
  case AArch64::DSB:
  case AArch64::ISB:
    // DSB and ISB also are scheduling barriers.
    return true;
  case AArch64::MSRpstatesvcrImm1:
    // SMSTART and SMSTOP are also scheduling barriers.
    return true;
  default:
    break;
  }
  if (isSEHInstruction(MI))
    return true;
  auto Next = std::next(MI.getIterator());
  return Next != MBB->end() && Next->isCFIInstruction();
}

/// analyzeCompare - For a comparison instruction, return the source registers
/// in SrcReg and SrcReg2, and the value it compares against in CmpValue.
/// Return true if the comparison instruction can be analyzed.
bool AArch64InstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg,
                                      Register &SrcReg2, int64_t &CmpMask,
                                      int64_t &CmpValue) const {
  // The first operand can be a frame index where we'd normally expect a
  // register.
  assert(MI.getNumOperands() >= 2 && "All AArch64 cmps should have 2 operands");
  if (!MI.getOperand(1).isReg())
    return false;

  switch (MI.getOpcode()) {
  default:
    break;
  case AArch64::PTEST_PP:
  case AArch64::PTEST_PP_ANY:
    SrcReg = MI.getOperand(0).getReg();
    SrcReg2 = MI.getOperand(1).getReg();
    // Not sure about the mask and value for now...
    CmpMask = ~0;
    CmpValue = 0;
    return true;
  case AArch64::SUBSWrr:
  case AArch64::SUBSWrs:
  case AArch64::SUBSWrx:
  case AArch64::SUBSXrr:
  case AArch64::SUBSXrs:
  case AArch64::SUBSXrx:
  case AArch64::ADDSWrr:
  case AArch64::ADDSWrs:
  case AArch64::ADDSWrx:
  case AArch64::ADDSXrr:
  case AArch64::ADDSXrs:
  case AArch64::ADDSXrx:
    // Replace SUBSWrr with SUBWrr if NZCV is not used.
    SrcReg = MI.getOperand(1).getReg();
    SrcReg2 = MI.getOperand(2).getReg();
    CmpMask = ~0;
    CmpValue = 0;
    return true;
  case AArch64::SUBSWri:
  case AArch64::ADDSWri:
  case AArch64::SUBSXri:
  case AArch64::ADDSXri:
    SrcReg = MI.getOperand(1).getReg();
    SrcReg2 = 0;
    CmpMask = ~0;
    CmpValue = MI.getOperand(2).getImm();
    return true;
  case AArch64::ANDSWri:
  case AArch64::ANDSXri:
    // ANDS does not use the same encoding scheme as the other flag-setting
    // (xxxS) instructions.
    SrcReg = MI.getOperand(1).getReg();
    SrcReg2 = 0;
    CmpMask = ~0;
    CmpValue = AArch64_AM::decodeLogicalImmediate(
        MI.getOperand(2).getImm(),
        MI.getOpcode() == AArch64::ANDSWri ? 32 : 64);
    return true;
  }

  return false;
}
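
// For example, "subs w8, w9, #42" yields SrcReg = w9, SrcReg2 = 0 and
// CmpValue = 42, while the register form "subs w8, w9, w10" yields
// SrcReg = w9, SrcReg2 = w10 and CmpValue = 0.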

static bool UpdateOperandRegClass(MachineInstr &Instr) {
  MachineBasicBlock *MBB = Instr.getParent();
  assert(MBB && "Can't get MachineBasicBlock here");
  MachineFunction *MF = MBB->getParent();
  assert(MF && "Can't get MachineFunction here");
  const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
  const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
  MachineRegisterInfo *MRI = &MF->getRegInfo();

  for (unsigned OpIdx = 0, EndIdx = Instr.getNumOperands(); OpIdx < EndIdx;
       ++OpIdx) {
    MachineOperand &MO = Instr.getOperand(OpIdx);
    const TargetRegisterClass *OpRegCstraints =
        Instr.getRegClassConstraint(OpIdx, TII, TRI);

    // If there's no constraint, there's nothing to do.
    if (!OpRegCstraints)
      continue;
    // If the operand is a frame index, there's nothing to do here.
    // A frame index operand will resolve correctly during PEI.
    if (MO.isFI())
      continue;

    assert(MO.isReg() &&
           "Operand has register constraints without being a register!");

    Register Reg = MO.getReg();
    if (Reg.isPhysical()) {
      if (!OpRegCstraints->contains(Reg))
        return false;
    } else if (!OpRegCstraints->hasSubClassEq(MRI->getRegClass(Reg)) &&
               !MRI->constrainRegClass(Reg, OpRegCstraints))
      return false;
  }

  return true;
}

/// Return the opcode that does not set flags when possible - otherwise
/// return the original opcode. The caller is responsible for doing the actual
/// substitution and legality checking.
static unsigned convertToNonFlagSettingOpc(const MachineInstr &MI) {
  // Don't convert all compare instructions, because for some the zero register
  // encoding becomes the SP register.
  bool MIDefinesZeroReg = false;
  if (MI.definesRegister(AArch64::WZR) || MI.definesRegister(AArch64::XZR))
    MIDefinesZeroReg = true;

  switch (MI.getOpcode()) {
  default:
    return MI.getOpcode();
  case AArch64::ADDSWrr:
    return AArch64::ADDWrr;
  case AArch64::ADDSWri:
    return MIDefinesZeroReg ? AArch64::ADDSWri : AArch64::ADDWri;
  case AArch64::ADDSWrs:
    return MIDefinesZeroReg ? AArch64::ADDSWrs : AArch64::ADDWrs;
  case AArch64::ADDSWrx:
    return AArch64::ADDWrx;
  case AArch64::ADDSXrr:
    return AArch64::ADDXrr;
  case AArch64::ADDSXri:
    return MIDefinesZeroReg ? AArch64::ADDSXri : AArch64::ADDXri;
  case AArch64::ADDSXrs:
    return MIDefinesZeroReg ? AArch64::ADDSXrs : AArch64::ADDXrs;
  case AArch64::ADDSXrx:
    return AArch64::ADDXrx;
  case AArch64::SUBSWrr:
    return AArch64::SUBWrr;
  case AArch64::SUBSWri:
    return MIDefinesZeroReg ? AArch64::SUBSWri : AArch64::SUBWri;
  case AArch64::SUBSWrs:
    return MIDefinesZeroReg ? AArch64::SUBSWrs : AArch64::SUBWrs;
  case AArch64::SUBSWrx:
    return AArch64::SUBWrx;
  case AArch64::SUBSXrr:
    return AArch64::SUBXrr;
  case AArch64::SUBSXri:
    return MIDefinesZeroReg ? AArch64::SUBSXri : AArch64::SUBXri;
  case AArch64::SUBSXrs:
    return MIDefinesZeroReg ? AArch64::SUBSXrs : AArch64::SUBXrs;
  case AArch64::SUBSXrx:
    return AArch64::SUBXrx;
  }
}
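
// The zero-register check above matters because register number 31 encodes
// WZR/XZR in the flag-setting forms but WSP/SP in several of the
// non-flag-setting immediate and shifted-register forms. Rewriting
// "adds wzr, w1, #1" (a cmn) as ADDWri would silently retarget the
// destination to wsp, so such instructions keep their original opcode.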

enum AccessKind { AK_Write = 0x01, AK_Read = 0x10, AK_All = 0x11 };

/// True when condition flags are accessed (either by writing or reading)
/// on the instruction trace starting at From and ending at To.
///
/// Note: If From and To are from different blocks it's assumed CC are accessed
/// on the path.
static bool areCFlagsAccessedBetweenInstrs(
    MachineBasicBlock::iterator From, MachineBasicBlock::iterator To,
    const TargetRegisterInfo *TRI, const AccessKind AccessToCheck = AK_All) {
  // Early exit if To is at the beginning of the BB.
  if (To == To->getParent()->begin())
    return true;

  // Check whether the instructions are in the same basic block.
  // If not, assume the condition flags might get modified somewhere.
  if (To->getParent() != From->getParent())
    return true;

  // From must be above To.
  assert(std::any_of(
      ++To.getReverse(), To->getParent()->rend(),
      [From](MachineInstr &MI) { return MI.getIterator() == From; }));

  // We iterate backward starting at \p To until we hit \p From.
  for (const MachineInstr &Instr :
       instructionsWithoutDebug(++To.getReverse(), From.getReverse())) {
    if (((AccessToCheck & AK_Write) &&
         Instr.modifiesRegister(AArch64::NZCV, TRI)) ||
        ((AccessToCheck & AK_Read) && Instr.readsRegister(AArch64::NZCV, TRI)))
      return true;
  }
  return false;
}

/// optimizePTestInstr - Attempt to remove a ptest of a predicate-generating
/// operation which could set the flags in an identical manner.
bool AArch64InstrInfo::optimizePTestInstr(
    MachineInstr *PTest, unsigned MaskReg, unsigned PredReg,
    const MachineRegisterInfo *MRI) const {
  auto *Mask = MRI->getUniqueVRegDef(MaskReg);
  auto *Pred = MRI->getUniqueVRegDef(PredReg);
  auto NewOp = Pred->getOpcode();
  bool OpChanged = false;

  unsigned MaskOpcode = Mask->getOpcode();
  unsigned PredOpcode = Pred->getOpcode();
  bool PredIsPTestLike = isPTestLikeOpcode(PredOpcode);
  bool PredIsWhileLike = isWhileOpcode(PredOpcode);

  if (isPTrueOpcode(MaskOpcode) && (PredIsPTestLike || PredIsWhileLike) &&
      getElementSizeForOpcode(MaskOpcode) ==
          getElementSizeForOpcode(PredOpcode) &&
      Mask->getOperand(1).getImm() == 31) {
    // For PTEST(PTRUE_ALL, WHILE), if the element size matches, the PTEST is
    // redundant since WHILE performs an implicit PTEST with an all active
    // mask. Must be an all active predicate of matching element size.

    // For PTEST(PTRUE_ALL, PTEST_LIKE), the PTEST is redundant if the
    // PTEST_LIKE instruction uses the same all active mask and the element
    // size matches. If the PTEST has a condition of any then it is always
    // redundant.
    if (PredIsPTestLike) {
      auto PTestLikeMask = MRI->getUniqueVRegDef(Pred->getOperand(1).getReg());
      if (Mask != PTestLikeMask && PTest->getOpcode() != AArch64::PTEST_PP_ANY)
        return false;
    }

    // Fall through to simply remove the PTEST.
  } else if ((Mask == Pred) && (PredIsPTestLike || PredIsWhileLike) &&
             PTest->getOpcode() == AArch64::PTEST_PP_ANY) {
    // For PTEST(PG, PG), PTEST is redundant when PG is the result of an
    // instruction that sets the flags as PTEST would. This is only valid when
    // the condition is any.

    // Fall through to simply remove the PTEST.
  } else if (PredIsPTestLike) {
    // For PTEST(PG, PTEST_LIKE(PG, ...)), the PTEST is redundant since the
    // flags are set based on the same mask 'PG', but PTEST_LIKE must operate
    // on 8-bit predicates like the PTEST. Otherwise, for instructions like
    // compare that also support 16/32/64-bit predicates, the implicit PTEST
    // performed by the compare could consider fewer lanes for these element
    // sizes.
    //
    // For example, consider
    //
    //   ptrue p0.b                    ; P0=1111-1111-1111-1111
    //   index z0.s, #0, #1            ; Z0=<0,1,2,3>
    //   index z1.s, #1, #1            ; Z1=<1,2,3,4>
    //   cmphi p1.s, p0/z, z1.s, z0.s  ; P1=0001-0001-0001-0001
    //                                 ;       ^ last active
    //   ptest p0, p1.b                ; P1=0001-0001-0001-0001
    //                                 ;     ^ last active
    //
    // where the compare generates a canonical all active 32-bit predicate
    // (equivalent to 'ptrue p1.s, all'). The implicit PTEST sets the last
    // active flag, whereas the PTEST instruction with the same mask doesn't.
    // For PTEST_ANY this doesn't apply as the flags in this case would be
    // identical regardless of element size.
    auto PTestLikeMask = MRI->getUniqueVRegDef(Pred->getOperand(1).getReg());
    uint64_t PredElementSize = getElementSizeForOpcode(PredOpcode);
    if ((Mask != PTestLikeMask) ||
        (PredElementSize != AArch64::ElementSizeB &&
         PTest->getOpcode() != AArch64::PTEST_PP_ANY))
      return false;

    // Fall through to simply remove the PTEST.
  } else {
    // If OP in PTEST(PG, OP(PG, ...)) has a flag-setting variant change the
    // opcode so the PTEST becomes redundant.
    switch (PredOpcode) {
    case AArch64::AND_PPzPP:
    case AArch64::BIC_PPzPP:
    case AArch64::EOR_PPzPP:
    case AArch64::NAND_PPzPP:
    case AArch64::NOR_PPzPP:
    case AArch64::ORN_PPzPP:
    case AArch64::ORR_PPzPP:
    case AArch64::BRKA_PPzP:
    case AArch64::BRKPA_PPzPP:
    case AArch64::BRKB_PPzP:
    case AArch64::BRKPB_PPzPP:
    case AArch64::RDFFR_PPz: {
      // Check to see if our mask is the same. If not the resulting flag bits
      // may be different and we can't remove the ptest.
      auto *PredMask = MRI->getUniqueVRegDef(Pred->getOperand(1).getReg());
      if (Mask != PredMask)
        return false;
      break;
    }
    case AArch64::BRKN_PPzP: {
      // BRKN uses an all active implicit mask to set flags unlike the other
      // flag-setting instructions.
      // PTEST(PTRUE_B(31), BRKN(PG, A, B)) -> BRKNS(PG, A, B).
      if ((MaskOpcode != AArch64::PTRUE_B) ||
          (Mask->getOperand(1).getImm() != 31))
        return false;
      break;
    }
    case AArch64::PTRUE_B:
      // PTEST(OP=PTRUE_B(A), OP) -> PTRUES_B(A)
      break;
    default:
      // Bail out if we don't recognize the input.
      return false;
    }

    NewOp = convertToFlagSettingOpc(PredOpcode);
    OpChanged = true;
  }

  const TargetRegisterInfo *TRI = &getRegisterInfo();

  // If another instruction between Pred and PTest accesses flags, don't remove
  // the ptest or update the earlier instruction to modify them.
  if (areCFlagsAccessedBetweenInstrs(Pred, PTest, TRI))
    return false;

  // If we pass all the checks, it's safe to remove the PTEST and use the flags
  // as they are prior to PTEST. Sometimes this requires the tested PTEST
  // operand to be replaced with an equivalent instruction that also sets the
  // flags.
  Pred->setDesc(get(NewOp));
  PTest->eraseFromParent();
  if (OpChanged) {
    bool succeeded = UpdateOperandRegClass(*Pred);
    (void)succeeded;
    assert(succeeded && "Operands have incompatible register classes!");
    Pred->addRegisterDefined(AArch64::NZCV, TRI);
  }

  // Ensure that the flags def is live.
  if (Pred->registerDefIsDead(AArch64::NZCV, TRI)) {
    unsigned i = 0, e = Pred->getNumOperands();
    for (; i != e; ++i) {
      MachineOperand &MO = Pred->getOperand(i);
      if (MO.isReg() && MO.isDef() && MO.getReg() == AArch64::NZCV) {
        MO.setIsDead(false);
        break;
      }
    }
  }
  return true;
}
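
// For example, in the sequence
//   %mask = PTRUE_B 31          ; all active mask
//   %pred = <a WHILE-family op> ; implicitly ptests against an all active mask
//   PTEST_PP %mask, %pred
// the PTEST is deleted outright because the WHILE already set NZCV the same
// way, while in the PTEST(PG, AND_PPzPP(PG, ...)) case the AND is rewritten
// to its flag-setting form and the PTEST is then removed.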

/// Try to optimize a compare instruction. A compare instruction is an
/// instruction which produces AArch64::NZCV. It is only a true compare
/// instruction when there are no uses of its destination register.
///
/// The following steps are tried in order:
/// 1. Convert CmpInstr into an unconditional version.
/// 2. Remove CmpInstr if above there is an instruction producing a needed
///    condition code or an instruction which can be converted into such an
///    instruction.
/// Only comparison with zero is supported.
bool AArch64InstrInfo::optimizeCompareInstr(
    MachineInstr &CmpInstr, Register SrcReg, Register SrcReg2, int64_t CmpMask,
    int64_t CmpValue, const MachineRegisterInfo *MRI) const {
  assert(CmpInstr.getParent());
  assert(MRI);

  // Replace SUBSWrr with SUBWrr if NZCV is not used.
  int DeadNZCVIdx = CmpInstr.findRegisterDefOperandIdx(AArch64::NZCV, true);
  if (DeadNZCVIdx != -1) {
    if (CmpInstr.definesRegister(AArch64::WZR) ||
        CmpInstr.definesRegister(AArch64::XZR)) {
      CmpInstr.eraseFromParent();
      return true;
    }
    unsigned Opc = CmpInstr.getOpcode();
    unsigned NewOpc = convertToNonFlagSettingOpc(CmpInstr);
    if (NewOpc == Opc)
      return false;
    const MCInstrDesc &MCID = get(NewOpc);
    CmpInstr.setDesc(MCID);
    CmpInstr.removeOperand(DeadNZCVIdx);
    bool succeeded = UpdateOperandRegClass(CmpInstr);
    (void)succeeded;
    assert(succeeded && "Some operands have incompatible register classes!");
    return true;
  }

  if (CmpInstr.getOpcode() == AArch64::PTEST_PP ||
      CmpInstr.getOpcode() == AArch64::PTEST_PP_ANY)
    return optimizePTestInstr(&CmpInstr, SrcReg, SrcReg2, MRI);

  if (SrcReg2 != 0)
    return false;

  // CmpInstr is a compare instruction if its destination register is not used.
  if (!MRI->use_nodbg_empty(CmpInstr.getOperand(0).getReg()))
    return false;

  if (CmpValue == 0 && substituteCmpToZero(CmpInstr, SrcReg, *MRI))
    return true;
  return (CmpValue == 0 || CmpValue == 1) &&
         removeCmpToZeroOrOne(CmpInstr, SrcReg, CmpValue, *MRI);
}

/// Get the opcode of the S (flag-setting) version of Instr.
/// If Instr is already an S version, its opcode is returned.
/// AArch64::INSTRUCTION_LIST_END is returned if Instr does not have an S
/// version or we are not interested in it.
static unsigned sForm(MachineInstr &Instr) {
  switch (Instr.getOpcode()) {
  default:
    return AArch64::INSTRUCTION_LIST_END;

  case AArch64::ADDSWrr:
  case AArch64::ADDSWri:
  case AArch64::ADDSXrr:
  case AArch64::ADDSXri:
  case AArch64::SUBSWrr:
  case AArch64::SUBSWri:
  case AArch64::SUBSXrr:
  case AArch64::SUBSXri:
    return Instr.getOpcode();

  case AArch64::ADDWrr:
    return AArch64::ADDSWrr;
  case AArch64::ADDWri:
    return AArch64::ADDSWri;
  case AArch64::ADDXrr:
    return AArch64::ADDSXrr;
  case AArch64::ADDXri:
    return AArch64::ADDSXri;
  case AArch64::ADCWr:
    return AArch64::ADCSWr;
  case AArch64::ADCXr:
    return AArch64::ADCSXr;
  case AArch64::SUBWrr:
    return AArch64::SUBSWrr;
  case AArch64::SUBWri:
    return AArch64::SUBSWri;
  case AArch64::SUBXrr:
    return AArch64::SUBSXrr;
  case AArch64::SUBXri:
    return AArch64::SUBSXri;
  case AArch64::SBCWr:
    return AArch64::SBCSWr;
  case AArch64::SBCXr:
    return AArch64::SBCSXr;
  case AArch64::ANDWri:
    return AArch64::ANDSWri;
  case AArch64::ANDXri:
    return AArch64::ANDSXri;
  }
}
1608
1609 /// Check if AArch64::NZCV should be alive in successors of MBB.
1610 static bool areCFlagsAliveInSuccessors(const MachineBasicBlock *MBB) {
1611 for (auto *BB : MBB->successors())
1612 if (BB->isLiveIn(AArch64::NZCV))
1613 return true;
1614 return false;
1615 }
1616
1617 /// \returns The condition code operand index for \p Instr if it is a branch
1618 /// or select and -1 otherwise.
1619 static int
1620 findCondCodeUseOperandIdxForBranchOrSelect(const MachineInstr &Instr) {
1621 switch (Instr.getOpcode()) {
1622 default:
1623 return -1;
1624
1625 case AArch64::Bcc: {
1626 int Idx = Instr.findRegisterUseOperandIdx(AArch64::NZCV);
1627 assert(Idx >= 2);
1628 return Idx - 2;
1629 }
1630
1631 case AArch64::CSINVWr:
1632 case AArch64::CSINVXr:
1633 case AArch64::CSINCWr:
1634 case AArch64::CSINCXr:
1635 case AArch64::CSELWr:
1636 case AArch64::CSELXr:
1637 case AArch64::CSNEGWr:
1638 case AArch64::CSNEGXr:
1639 case AArch64::FCSELSrrr:
1640 case AArch64::FCSELDrrr: {
1641 int Idx = Instr.findRegisterUseOperandIdx(AArch64::NZCV);
1642 assert(Idx >= 1);
1643 return Idx - 1;
1644 }
1645 }
1646 }
1647
1648 /// Find a condition code used by the instruction.
1649 /// Returns AArch64CC::Invalid if either the instruction does not use condition
1650 /// codes or we don't optimize CmpInstr in the presence of such instructions.
1651 static AArch64CC::CondCode findCondCodeUsedByInstr(const MachineInstr &Instr) {
1652 int CCIdx = findCondCodeUseOperandIdxForBranchOrSelect(Instr);
1653 return CCIdx >= 0 ? static_cast<AArch64CC::CondCode>(
1654 Instr.getOperand(CCIdx).getImm())
1655 : AArch64CC::Invalid;
1656 }
1657
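/// Map a condition code to the set of NZCV flags it reads. For example,
/// AArch64CC::HI ("unsigned higher") tests that C is set and Z is clear, so
/// both the C and Z flags are reported as used.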
1658 static UsedNZCV getUsedNZCV(AArch64CC::CondCode CC) {
1659 assert(CC != AArch64CC::Invalid);
1660 UsedNZCV UsedFlags;
1661 switch (CC) {
1662 default:
1663 break;
1664
1665 case AArch64CC::EQ: // Z set
1666 case AArch64CC::NE: // Z clear
1667 UsedFlags.Z = true;
1668 break;
1669
1670 case AArch64CC::HI: // Z clear and C set
1671 case AArch64CC::LS: // Z set or C clear
1672 UsedFlags.Z = true;
1673 [[fallthrough]];
1674 case AArch64CC::HS: // C set
1675 case AArch64CC::LO: // C clear
1676 UsedFlags.C = true;
1677 break;
1678
1679 case AArch64CC::MI: // N set
1680 case AArch64CC::PL: // N clear
1681 UsedFlags.N = true;
1682 break;
1683
1684 case AArch64CC::VS: // V set
1685 case AArch64CC::VC: // V clear
1686 UsedFlags.V = true;
1687 break;
1688
1689 case AArch64CC::GT: // Z clear, N and V the same
1690 case AArch64CC::LE: // Z set, N and V differ
1691 UsedFlags.Z = true;
1692 [[fallthrough]];
1693 case AArch64CC::GE: // N and V the same
1694 case AArch64CC::LT: // N and V differ
1695 UsedFlags.N = true;
1696 UsedFlags.V = true;
1697 break;
1698 }
1699 return UsedFlags;
1700 }
1701
1702 /// \returns The condition flags used after \p CmpInstr in its MachineBB if the
1703 /// NZCV flags are not alive in the successors of the block containing both
1704 /// \p CmpInstr and \p MI; \returns std::nullopt otherwise.
1705 ///
1706 /// Collects the instructions using those flags in \p CCUseInstrs if provided.
1707 std::optional<UsedNZCV>
1708 llvm::examineCFlagsUse(MachineInstr &MI, MachineInstr &CmpInstr,
1709 const TargetRegisterInfo &TRI,
1710 SmallVectorImpl<MachineInstr *> *CCUseInstrs) {
1711 MachineBasicBlock *CmpParent = CmpInstr.getParent();
1712 if (MI.getParent() != CmpParent)
1713 return std::nullopt;
1714
1715 if (areCFlagsAliveInSuccessors(CmpParent))
1716 return std::nullopt;
1717
1718 UsedNZCV NZCVUsedAfterCmp;
1719 for (MachineInstr &Instr : instructionsWithoutDebug(
1720 std::next(CmpInstr.getIterator()), CmpParent->instr_end())) {
1721 if (Instr.readsRegister(AArch64::NZCV, &TRI)) {
1722 AArch64CC::CondCode CC = findCondCodeUsedByInstr(Instr);
1723 if (CC == AArch64CC::Invalid) // Unsupported conditional instruction
1724 return std::nullopt;
1725 NZCVUsedAfterCmp |= getUsedNZCV(CC);
1726 if (CCUseInstrs)
1727 CCUseInstrs->push_back(&Instr);
1728 }
1729 if (Instr.modifiesRegister(AArch64::NZCV, &TRI))
1730 break;
1731 }
1732 return NZCVUsedAfterCmp;
1733 }
1734
1735 static bool isADDSRegImm(unsigned Opcode) {
1736 return Opcode == AArch64::ADDSWri || Opcode == AArch64::ADDSXri;
1737 }
1738
1739 static bool isSUBSRegImm(unsigned Opcode) {
1740 return Opcode == AArch64::SUBSWri || Opcode == AArch64::SUBSXri;
1741 }
1742
1743 /// Check if CmpInstr can be substituted by MI.
1744 ///
1745 /// CmpInstr can be substituted:
1746 /// - CmpInstr is either 'ADDS %vreg, 0' or 'SUBS %vreg, 0'
1747 /// - and, MI and CmpInstr are from the same MachineBB
1748 /// - and, condition flags are not alive in successors of the CmpInstr parent
1749 /// - and, if MI opcode is the S form there must be no defs of flags between
1750 /// MI and CmpInstr
1751 /// or if MI opcode is not the S form there must be neither defs of flags
1752 /// nor uses of flags between MI and CmpInstr.
1753 /// - and, the C flag is not used after CmpInstr,
1754 ///        and if the V flag is used, MI produces a poison value when signed
1755 ///        overflow occurs (i.e. it has the no-signed-wrap flag).
1756 static bool canInstrSubstituteCmpInstr(MachineInstr &MI, MachineInstr &CmpInstr,
1757 const TargetRegisterInfo &TRI) {
1758 // NOTE: this assertion guarantees that MI.getOpcode() is an add or a
1759 // subtraction that may or may not set flags.
1760 assert(sForm(MI) != AArch64::INSTRUCTION_LIST_END);
1761
1762 const unsigned CmpOpcode = CmpInstr.getOpcode();
1763 if (!isADDSRegImm(CmpOpcode) && !isSUBSRegImm(CmpOpcode))
1764 return false;
1765
1766 assert((CmpInstr.getOperand(2).isImm() &&
1767 CmpInstr.getOperand(2).getImm() == 0) &&
1768 "Caller guarantees that CmpInstr compares with constant 0");
1769
1770 std::optional<UsedNZCV> NZVCUsed = examineCFlagsUse(MI, CmpInstr, TRI);
1771 if (!NZVCUsed || NZVCUsed->C)
1772 return false;
1773
1774 // CmpInstr is either 'ADDS %vreg, 0' or 'SUBS %vreg, 0', and MI is either
1775 // '%vreg = add ...' or '%vreg = sub ...'.
1776 // Condition flag V is used to indicate signed overflow.
1777 // 1) MI and CmpInstr set N and V to the same value.
1778 // 2) If MI is add/sub with no-signed-wrap, it produces a poison value when
1779 // signed overflow occurs, so CmpInstr could still be simplified away.
1780 if (NZVCUsed->V && !MI.getFlag(MachineInstr::NoSWrap))
1781 return false;
1782
1783 AccessKind AccessToCheck = AK_Write;
1784 if (sForm(MI) != MI.getOpcode())
1785 AccessToCheck = AK_All;
1786 return !areCFlagsAccessedBetweenInstrs(&MI, &CmpInstr, &TRI, AccessToCheck);
1787 }
1788
1789 /// Substitute an instruction comparing to zero with another instruction
1790 /// which produces needed condition flags.
1791 ///
1792 /// Return true on success.
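///
/// For example (illustrative registers), this rewrites
/// \code
///   add w8, w0, w1
///   cmp w8, #0
///   b.eq label
/// \endcode
/// to
/// \code
///   adds w8, w0, w1
///   b.eq label
/// \endcode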
1793 bool AArch64InstrInfo::substituteCmpToZero(
1794 MachineInstr &CmpInstr, unsigned SrcReg,
1795 const MachineRegisterInfo &MRI) const {
1796 // Get the unique definition of SrcReg.
1797 MachineInstr *MI = MRI.getUniqueVRegDef(SrcReg);
1798 if (!MI)
1799 return false;
1800
1801 const TargetRegisterInfo &TRI = getRegisterInfo();
1802
1803 unsigned NewOpc = sForm(*MI);
1804 if (NewOpc == AArch64::INSTRUCTION_LIST_END)
1805 return false;
1806
1807 if (!canInstrSubstituteCmpInstr(*MI, CmpInstr, TRI))
1808 return false;
1809
1810 // Update the instruction to set NZCV.
1811 MI->setDesc(get(NewOpc));
1812 CmpInstr.eraseFromParent();
1813 bool succeeded = UpdateOperandRegClass(*MI);
1814 (void)succeeded;
1815 assert(succeeded && "Some operands reg class are incompatible!");
1816 MI->addRegisterDefined(AArch64::NZCV, &TRI);
1817 return true;
1818 }
1819
1820 /// \returns True if \p CmpInstr can be removed.
1821 ///
1822 /// \p IsInvertCC is true if, after removing \p CmpInstr, condition
1823 /// codes used in \p CCUseInstrs must be inverted.
1824 static bool canCmpInstrBeRemoved(MachineInstr &MI, MachineInstr &CmpInstr,
1825 int CmpValue, const TargetRegisterInfo &TRI,
1826 SmallVectorImpl<MachineInstr *> &CCUseInstrs,
1827 bool &IsInvertCC) {
1828 assert((CmpValue == 0 || CmpValue == 1) &&
1829 "Only comparisons to 0 or 1 considered for removal!");
1830
1831 // MI is 'CSINCWr %vreg, wzr, wzr, <cc>' or 'CSINCXr %vreg, xzr, xzr, <cc>'
1832 unsigned MIOpc = MI.getOpcode();
1833 if (MIOpc == AArch64::CSINCWr) {
1834 if (MI.getOperand(1).getReg() != AArch64::WZR ||
1835 MI.getOperand(2).getReg() != AArch64::WZR)
1836 return false;
1837 } else if (MIOpc == AArch64::CSINCXr) {
1838 if (MI.getOperand(1).getReg() != AArch64::XZR ||
1839 MI.getOperand(2).getReg() != AArch64::XZR)
1840 return false;
1841 } else {
1842 return false;
1843 }
1844 AArch64CC::CondCode MICC = findCondCodeUsedByInstr(MI);
1845 if (MICC == AArch64CC::Invalid)
1846 return false;
1847
1848 // Bail if MI has a dead def of NZCV.
1849 if (MI.findRegisterDefOperandIdx(AArch64::NZCV, true) != -1)
1850 return false;
1851
1852 // CmpInstr is 'ADDS %vreg, 0' or 'SUBS %vreg, 0' or 'SUBS %vreg, 1'
1853 const unsigned CmpOpcode = CmpInstr.getOpcode();
1854 bool IsSubsRegImm = isSUBSRegImm(CmpOpcode);
1855 if (CmpValue && !IsSubsRegImm)
1856 return false;
1857 if (!CmpValue && !IsSubsRegImm && !isADDSRegImm(CmpOpcode))
1858 return false;
1859
1860 // MI conditions allowed: eq, ne, mi, pl
1861 UsedNZCV MIUsedNZCV = getUsedNZCV(MICC);
1862 if (MIUsedNZCV.C || MIUsedNZCV.V)
1863 return false;
1864
1865 std::optional<UsedNZCV> NZCVUsedAfterCmp =
1866 examineCFlagsUse(MI, CmpInstr, TRI, &CCUseInstrs);
1867 // Condition flags are not used in CmpInstr basic block successors, and only
1868 // the Z or N flags are allowed to be used after CmpInstr within its basic block.
1869 if (!NZCVUsedAfterCmp || NZCVUsedAfterCmp->C || NZCVUsedAfterCmp->V)
1870 return false;
1871 // Z or N flag used after CmpInstr must correspond to the flag used in MI
1872 if ((MIUsedNZCV.Z && NZCVUsedAfterCmp->N) ||
1873 (MIUsedNZCV.N && NZCVUsedAfterCmp->Z))
1874 return false;
1875 // If CmpInstr is a comparison to zero, MI conditions are limited to eq, ne.
1876 if (MIUsedNZCV.N && !CmpValue)
1877 return false;
1878
1879 // There must be no defs of flags between MI and CmpInstr
1880 if (areCFlagsAccessedBetweenInstrs(&MI, &CmpInstr, &TRI, AK_Write))
1881 return false;
1882
1883 // Condition code is inverted in the following cases:
1884 // 1. MI condition is ne; CmpInstr is 'ADDS %vreg, 0' or 'SUBS %vreg, 0'
1885 // 2. MI condition is eq, pl; CmpInstr is 'SUBS %vreg, 1'
1886 IsInvertCC = (CmpValue && (MICC == AArch64CC::EQ || MICC == AArch64CC::PL)) ||
1887 (!CmpValue && MICC == AArch64CC::NE);
1888 return true;
1889 }
1890
1891 /// Remove comparison in csinc-cmp sequence
1892 ///
1893 /// Examples:
1894 /// 1. \code
1895 /// csinc w9, wzr, wzr, ne
1896 /// cmp w9, #0
1897 /// b.eq
1898 /// \endcode
1899 /// to
1900 /// \code
1901 /// csinc w9, wzr, wzr, ne
1902 /// b.ne
1903 /// \endcode
1904 ///
1905 /// 2. \code
1906 /// csinc x2, xzr, xzr, mi
1907 /// cmp x2, #1
1908 /// b.pl
1909 /// \endcode
1910 /// to
1911 /// \code
1912 /// csinc x2, xzr, xzr, mi
1913 /// b.pl
1914 /// \endcode
1915 ///
1916 /// \param CmpInstr comparison instruction
1917 /// \return True when comparison removed
1918 bool AArch64InstrInfo::removeCmpToZeroOrOne(
1919 MachineInstr &CmpInstr, unsigned SrcReg, int CmpValue,
1920 const MachineRegisterInfo &MRI) const {
1921 MachineInstr *MI = MRI.getUniqueVRegDef(SrcReg);
1922 if (!MI)
1923 return false;
1924 const TargetRegisterInfo &TRI = getRegisterInfo();
1925 SmallVector<MachineInstr *, 4> CCUseInstrs;
1926 bool IsInvertCC = false;
1927 if (!canCmpInstrBeRemoved(*MI, CmpInstr, CmpValue, TRI, CCUseInstrs,
1928 IsInvertCC))
1929 return false;
1930 // Make transformation
1931 CmpInstr.eraseFromParent();
1932 if (IsInvertCC) {
1933 // Invert condition codes in CmpInstr CC users
1934 for (MachineInstr *CCUseInstr : CCUseInstrs) {
1935 int Idx = findCondCodeUseOperandIdxForBranchOrSelect(*CCUseInstr);
1936 assert(Idx >= 0 && "Unexpected instruction using CC.");
1937 MachineOperand &CCOperand = CCUseInstr->getOperand(Idx);
1938 AArch64CC::CondCode CCUse = AArch64CC::getInvertedCondCode(
1939 static_cast<AArch64CC::CondCode>(CCOperand.getImm()));
1940 CCOperand.setImm(CCUse);
1941 }
1942 }
1943 return true;
1944 }
1945
1946 bool AArch64InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
1947 if (MI.getOpcode() != TargetOpcode::LOAD_STACK_GUARD &&
1948 MI.getOpcode() != AArch64::CATCHRET)
1949 return false;
1950
1951 MachineBasicBlock &MBB = *MI.getParent();
1952 auto &Subtarget = MBB.getParent()->getSubtarget<AArch64Subtarget>();
1953 auto TRI = Subtarget.getRegisterInfo();
1954 DebugLoc DL = MI.getDebugLoc();
1955
1956 if (MI.getOpcode() == AArch64::CATCHRET) {
1957 // Skip to the first instruction before the epilog.
1958 const TargetInstrInfo *TII =
1959 MBB.getParent()->getSubtarget().getInstrInfo();
1960 MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB();
1961 auto MBBI = MachineBasicBlock::iterator(MI);
1962 MachineBasicBlock::iterator FirstEpilogSEH = std::prev(MBBI);
1963 while (FirstEpilogSEH->getFlag(MachineInstr::FrameDestroy) &&
1964 FirstEpilogSEH != MBB.begin())
1965 FirstEpilogSEH = std::prev(FirstEpilogSEH);
1966 if (FirstEpilogSEH != MBB.begin())
1967 FirstEpilogSEH = std::next(FirstEpilogSEH);
1968 BuildMI(MBB, FirstEpilogSEH, DL, TII->get(AArch64::ADRP))
1969 .addReg(AArch64::X0, RegState::Define)
1970 .addMBB(TargetMBB);
1971 BuildMI(MBB, FirstEpilogSEH, DL, TII->get(AArch64::ADDXri))
1972 .addReg(AArch64::X0, RegState::Define)
1973 .addReg(AArch64::X0)
1974 .addMBB(TargetMBB)
1975 .addImm(0);
1976 return true;
1977 }
1978
1979 Register Reg = MI.getOperand(0).getReg();
1980 Module &M = *MBB.getParent()->getFunction().getParent();
1981 if (M.getStackProtectorGuard() == "sysreg") {
1982 const AArch64SysReg::SysReg *SrcReg =
1983 AArch64SysReg::lookupSysRegByName(M.getStackProtectorGuardReg());
1984 if (!SrcReg)
1985 report_fatal_error("Unknown SysReg for Stack Protector Guard Register");
1986
1987 // mrs xN, sysreg
1988 BuildMI(MBB, MI, DL, get(AArch64::MRS))
1989 .addDef(Reg, RegState::Renamable)
1990 .addImm(SrcReg->Encoding);
1991 int Offset = M.getStackProtectorGuardOffset();
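// A sketch of the sequences emitted below (offsets are illustrative):
//   offset 1032 (8-aligned, in [0, 32760]):  ldr  xN, [xN, #1032]
//   offset -200 (in [-256, 255]):            ldur xN, [xN, #-200]
//   offset 1037 (in [-4095, 4095]):          add  xN, xN, #1037
//                                            ldr  xN, [xN]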
1992 if (Offset >= 0 && Offset <= 32760 && Offset % 8 == 0) {
1993 // ldr xN, [xN, #offset]
1994 BuildMI(MBB, MI, DL, get(AArch64::LDRXui))
1995 .addDef(Reg)
1996 .addUse(Reg, RegState::Kill)
1997 .addImm(Offset / 8);
1998 } else if (Offset >= -256 && Offset <= 255) {
1999 // ldur xN, [xN, #offset]
2000 BuildMI(MBB, MI, DL, get(AArch64::LDURXi))
2001 .addDef(Reg)
2002 .addUse(Reg, RegState::Kill)
2003 .addImm(Offset);
2004 } else if (Offset >= -4095 && Offset <= 4095) {
2005 if (Offset > 0) {
2006 // add xN, xN, #offset
2007 BuildMI(MBB, MI, DL, get(AArch64::ADDXri))
2008 .addDef(Reg)
2009 .addUse(Reg, RegState::Kill)
2010 .addImm(Offset)
2011 .addImm(0);
2012 } else {
2013 // sub xN, xN, #offset
2014 BuildMI(MBB, MI, DL, get(AArch64::SUBXri))
2015 .addDef(Reg)
2016 .addUse(Reg, RegState::Kill)
2017 .addImm(-Offset)
2018 .addImm(0);
2019 }
2020 // ldr xN, [xN]
2021 BuildMI(MBB, MI, DL, get(AArch64::LDRXui))
2022 .addDef(Reg)
2023 .addUse(Reg, RegState::Kill)
2024 .addImm(0);
2025 } else {
2026 // Cases that are larger than +/- 4095 and not a multiple of 8, or larger
2027 // than 32760.
2028 // It might be nice to use AArch64::MOVi32imm here, which would get
2029 // expanded in PreSched2 after PostRA, but our lone scratch Reg already
2030 // contains the MRS result. findScratchNonCalleeSaveRegister() in
2031 // AArch64FrameLowering might help us find such a scratch register
2032 // though. If we failed to find a scratch register, we could emit a
2033 // stream of add instructions to build up the immediate. Or, we could try
2034 // to insert a AArch64::MOVi32imm before register allocation so that we
2035 // didn't need to scavenge for a scratch register.
2036 report_fatal_error("Unable to encode Stack Protector Guard Offset");
2037 }
2038 MBB.erase(MI);
2039 return true;
2040 }
2041
2042 const GlobalValue *GV =
2043 cast<GlobalValue>((*MI.memoperands_begin())->getValue());
2044 const TargetMachine &TM = MBB.getParent()->getTarget();
2045 unsigned OpFlags = Subtarget.ClassifyGlobalReference(GV, TM);
2046 const unsigned char MO_NC = AArch64II::MO_NC;
2047
2048 if ((OpFlags & AArch64II::MO_GOT) != 0) {
2049 BuildMI(MBB, MI, DL, get(AArch64::LOADgot), Reg)
2050 .addGlobalAddress(GV, 0, OpFlags);
2051 if (Subtarget.isTargetILP32()) {
2052 unsigned Reg32 = TRI->getSubReg(Reg, AArch64::sub_32);
2053 BuildMI(MBB, MI, DL, get(AArch64::LDRWui))
2054 .addDef(Reg32, RegState::Dead)
2055 .addUse(Reg, RegState::Kill)
2056 .addImm(0)
2057 .addMemOperand(*MI.memoperands_begin())
2058 .addDef(Reg, RegState::Implicit);
2059 } else {
2060 BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
2061 .addReg(Reg, RegState::Kill)
2062 .addImm(0)
2063 .addMemOperand(*MI.memoperands_begin());
2064 }
2065 } else if (TM.getCodeModel() == CodeModel::Large) {
2066 assert(!Subtarget.isTargetILP32() && "how can large exist in ILP32?");
2067 BuildMI(MBB, MI, DL, get(AArch64::MOVZXi), Reg)
2068 .addGlobalAddress(GV, 0, AArch64II::MO_G0 | MO_NC)
2069 .addImm(0);
2070 BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg)
2071 .addReg(Reg, RegState::Kill)
2072 .addGlobalAddress(GV, 0, AArch64II::MO_G1 | MO_NC)
2073 .addImm(16);
2074 BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg)
2075 .addReg(Reg, RegState::Kill)
2076 .addGlobalAddress(GV, 0, AArch64II::MO_G2 | MO_NC)
2077 .addImm(32);
2078 BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg)
2079 .addReg(Reg, RegState::Kill)
2080 .addGlobalAddress(GV, 0, AArch64II::MO_G3)
2081 .addImm(48);
2082 BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
2083 .addReg(Reg, RegState::Kill)
2084 .addImm(0)
2085 .addMemOperand(*MI.memoperands_begin());
2086 } else if (TM.getCodeModel() == CodeModel::Tiny) {
2087 BuildMI(MBB, MI, DL, get(AArch64::ADR), Reg)
2088 .addGlobalAddress(GV, 0, OpFlags);
2089 } else {
2090 BuildMI(MBB, MI, DL, get(AArch64::ADRP), Reg)
2091 .addGlobalAddress(GV, 0, OpFlags | AArch64II::MO_PAGE);
2092 unsigned char LoFlags = OpFlags | AArch64II::MO_PAGEOFF | MO_NC;
2093 if (Subtarget.isTargetILP32()) {
2094 unsigned Reg32 = TRI->getSubReg(Reg, AArch64::sub_32);
2095 BuildMI(MBB, MI, DL, get(AArch64::LDRWui))
2096 .addDef(Reg32, RegState::Dead)
2097 .addUse(Reg, RegState::Kill)
2098 .addGlobalAddress(GV, 0, LoFlags)
2099 .addMemOperand(*MI.memoperands_begin())
2100 .addDef(Reg, RegState::Implicit);
2101 } else {
2102 BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
2103 .addReg(Reg, RegState::Kill)
2104 .addGlobalAddress(GV, 0, LoFlags)
2105 .addMemOperand(*MI.memoperands_begin());
2106 }
2107 }
2108
2109 MBB.erase(MI);
2110
2111 return true;
2112 }
2113
2114 // Return true if this instruction simply sets its single destination register
2115 // to zero. This is equivalent to a register rename of the zero-register.
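// For example, "movz w0, #0" and "and w0, wzr, #0xff" both match, while
// "movz w0, #1" does not.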
2116 bool AArch64InstrInfo::isGPRZero(const MachineInstr &MI) {
2117 switch (MI.getOpcode()) {
2118 default:
2119 break;
2120 case AArch64::MOVZWi:
2121 case AArch64::MOVZXi: // movz Rd, #0 (LSL #0)
2122 if (MI.getOperand(1).isImm() && MI.getOperand(1).getImm() == 0) {
2123 assert(MI.getDesc().getNumOperands() == 3 &&
2124 MI.getOperand(2).getImm() == 0 && "invalid MOVZi operands");
2125 return true;
2126 }
2127 break;
2128 case AArch64::ANDWri: // and Rd, Rzr, #imm
2129 return MI.getOperand(1).getReg() == AArch64::WZR;
2130 case AArch64::ANDXri:
2131 return MI.getOperand(1).getReg() == AArch64::XZR;
2132 case TargetOpcode::COPY:
2133 return MI.getOperand(1).getReg() == AArch64::WZR;
2134 }
2135 return false;
2136 }
2137
2138 // Return true if this instruction simply renames a general register without
2139 // modifying bits.
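// For example, "orr x0, xzr, x1" and "add x0, x1, #0" are both plain renames
// of x1 into x0.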
2140 bool AArch64InstrInfo::isGPRCopy(const MachineInstr &MI) {
2141 switch (MI.getOpcode()) {
2142 default:
2143 break;
2144 case TargetOpcode::COPY: {
2145 // GPR32 copies will be lowered to ORRXrs
2146 Register DstReg = MI.getOperand(0).getReg();
2147 return (AArch64::GPR32RegClass.contains(DstReg) ||
2148 AArch64::GPR64RegClass.contains(DstReg));
2149 }
2150 case AArch64::ORRXrs: // orr Xd, Xzr, Xm (LSL #0)
2151 if (MI.getOperand(1).getReg() == AArch64::XZR) {
2152 assert(MI.getDesc().getNumOperands() == 4 &&
2153 MI.getOperand(3).getImm() == 0 && "invalid ORRrs operands");
2154 return true;
2155 }
2156 break;
2157 case AArch64::ADDXri: // add Xd, Xn, #0 (LSL #0)
2158 if (MI.getOperand(2).getImm() == 0) {
2159 assert(MI.getDesc().getNumOperands() == 4 &&
2160 MI.getOperand(3).getImm() == 0 && "invalid ADDXri operands");
2161 return true;
2162 }
2163 break;
2164 }
2165 return false;
2166 }
2167
2168 // Return true if this instruction simply renames a general register without
2169 // modifying bits.
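// For example, "mov v0.16b, v1.16b" (an alias of "orr v0.16b, v1.16b,
// v1.16b") renames q1 into q0.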
2170 bool AArch64InstrInfo::isFPRCopy(const MachineInstr &MI) {
2171 switch (MI.getOpcode()) {
2172 default:
2173 break;
2174 case TargetOpcode::COPY: {
2175 Register DstReg = MI.getOperand(0).getReg();
2176 return AArch64::FPR128RegClass.contains(DstReg);
2177 }
2178 case AArch64::ORRv16i8:
2179 if (MI.getOperand(1).getReg() == MI.getOperand(2).getReg()) {
2180 assert(MI.getDesc().getNumOperands() == 3 && MI.getOperand(0).isReg() &&
2181 "invalid ORRv16i8 operands");
2182 return true;
2183 }
2184 break;
2185 }
2186 return false;
2187 }
2188
2189 unsigned AArch64InstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
2190 int &FrameIndex) const {
2191 switch (MI.getOpcode()) {
2192 default:
2193 break;
2194 case AArch64::LDRWui:
2195 case AArch64::LDRXui:
2196 case AArch64::LDRBui:
2197 case AArch64::LDRHui:
2198 case AArch64::LDRSui:
2199 case AArch64::LDRDui:
2200 case AArch64::LDRQui:
2201 case AArch64::LDR_PXI:
2202 if (MI.getOperand(0).getSubReg() == 0 && MI.getOperand(1).isFI() &&
2203 MI.getOperand(2).isImm() && MI.getOperand(2).getImm() == 0) {
2204 FrameIndex = MI.getOperand(1).getIndex();
2205 return MI.getOperand(0).getReg();
2206 }
2207 break;
2208 }
2209
2210 return 0;
2211 }
2212
2213 unsigned AArch64InstrInfo::isStoreToStackSlot(const MachineInstr &MI,
2214 int &FrameIndex) const {
2215 switch (MI.getOpcode()) {
2216 default:
2217 break;
2218 case AArch64::STRWui:
2219 case AArch64::STRXui:
2220 case AArch64::STRBui:
2221 case AArch64::STRHui:
2222 case AArch64::STRSui:
2223 case AArch64::STRDui:
2224 case AArch64::STRQui:
2225 case AArch64::STR_PXI:
2226 if (MI.getOperand(0).getSubReg() == 0 && MI.getOperand(1).isFI() &&
2227 MI.getOperand(2).isImm() && MI.getOperand(2).getImm() == 0) {
2228 FrameIndex = MI.getOperand(1).getIndex();
2229 return MI.getOperand(0).getReg();
2230 }
2231 break;
2232 }
2233 return 0;
2234 }
2235
2236 /// Check all MachineMemOperands for a hint to suppress pairing.
2237 bool AArch64InstrInfo::isLdStPairSuppressed(const MachineInstr &MI) {
2238 return llvm::any_of(MI.memoperands(), [](MachineMemOperand *MMO) {
2239 return MMO->getFlags() & MOSuppressPair;
2240 });
2241 }
2242
2243 /// Set a flag on the first MachineMemOperand to suppress pairing.
2244 void AArch64InstrInfo::suppressLdStPair(MachineInstr &MI) {
2245 if (MI.memoperands_empty())
2246 return;
2247 (*MI.memoperands_begin())->setFlags(MOSuppressPair);
2248 }
2249
2250 /// Check all MachineMemOperands for a hint that the load/store is strided.
2251 bool AArch64InstrInfo::isStridedAccess(const MachineInstr &MI) {
2252 return llvm::any_of(MI.memoperands(), [](MachineMemOperand *MMO) {
2253 return MMO->getFlags() & MOStridedAccess;
2254 });
2255 }
2256
2257 bool AArch64InstrInfo::hasUnscaledLdStOffset(unsigned Opc) {
2258 switch (Opc) {
2259 default:
2260 return false;
2261 case AArch64::STURSi:
2262 case AArch64::STRSpre:
2263 case AArch64::STURDi:
2264 case AArch64::STRDpre:
2265 case AArch64::STURQi:
2266 case AArch64::STRQpre:
2267 case AArch64::STURBBi:
2268 case AArch64::STURHHi:
2269 case AArch64::STURWi:
2270 case AArch64::STRWpre:
2271 case AArch64::STURXi:
2272 case AArch64::STRXpre:
2273 case AArch64::LDURSi:
2274 case AArch64::LDRSpre:
2275 case AArch64::LDURDi:
2276 case AArch64::LDRDpre:
2277 case AArch64::LDURQi:
2278 case AArch64::LDRQpre:
2279 case AArch64::LDURWi:
2280 case AArch64::LDRWpre:
2281 case AArch64::LDURXi:
2282 case AArch64::LDRXpre:
2283 case AArch64::LDRSWpre:
2284 case AArch64::LDURSWi:
2285 case AArch64::LDURHHi:
2286 case AArch64::LDURBBi:
2287 case AArch64::LDURSBWi:
2288 case AArch64::LDURSHWi:
2289 return true;
2290 }
2291 }
2292
2293 std::optional<unsigned> AArch64InstrInfo::getUnscaledLdSt(unsigned Opc) {
2294 switch (Opc) {
2295 default: return {};
2296 case AArch64::PRFMui: return AArch64::PRFUMi;
2297 case AArch64::LDRXui: return AArch64::LDURXi;
2298 case AArch64::LDRWui: return AArch64::LDURWi;
2299 case AArch64::LDRBui: return AArch64::LDURBi;
2300 case AArch64::LDRHui: return AArch64::LDURHi;
2301 case AArch64::LDRSui: return AArch64::LDURSi;
2302 case AArch64::LDRDui: return AArch64::LDURDi;
2303 case AArch64::LDRQui: return AArch64::LDURQi;
2304 case AArch64::LDRBBui: return AArch64::LDURBBi;
2305 case AArch64::LDRHHui: return AArch64::LDURHHi;
2306 case AArch64::LDRSBXui: return AArch64::LDURSBXi;
2307 case AArch64::LDRSBWui: return AArch64::LDURSBWi;
2308 case AArch64::LDRSHXui: return AArch64::LDURSHXi;
2309 case AArch64::LDRSHWui: return AArch64::LDURSHWi;
2310 case AArch64::LDRSWui: return AArch64::LDURSWi;
2311 case AArch64::STRXui: return AArch64::STURXi;
2312 case AArch64::STRWui: return AArch64::STURWi;
2313 case AArch64::STRBui: return AArch64::STURBi;
2314 case AArch64::STRHui: return AArch64::STURHi;
2315 case AArch64::STRSui: return AArch64::STURSi;
2316 case AArch64::STRDui: return AArch64::STURDi;
2317 case AArch64::STRQui: return AArch64::STURQi;
2318 case AArch64::STRBBui: return AArch64::STURBBi;
2319 case AArch64::STRHHui: return AArch64::STURHHi;
2320 }
2321 }
2322
2323 unsigned AArch64InstrInfo::getLoadStoreImmIdx(unsigned Opc) {
2324 switch (Opc) {
2325 default:
2326 return 2;
2327 case AArch64::LDPXi:
2328 case AArch64::LDPDi:
2329 case AArch64::STPXi:
2330 case AArch64::STPDi:
2331 case AArch64::LDNPXi:
2332 case AArch64::LDNPDi:
2333 case AArch64::STNPXi:
2334 case AArch64::STNPDi:
2335 case AArch64::LDPQi:
2336 case AArch64::STPQi:
2337 case AArch64::LDNPQi:
2338 case AArch64::STNPQi:
2339 case AArch64::LDPWi:
2340 case AArch64::LDPSi:
2341 case AArch64::STPWi:
2342 case AArch64::STPSi:
2343 case AArch64::LDNPWi:
2344 case AArch64::LDNPSi:
2345 case AArch64::STNPWi:
2346 case AArch64::STNPSi:
2347 case AArch64::LDG:
2348 case AArch64::STGPi:
2349
2350 case AArch64::LD1B_IMM:
2351 case AArch64::LD1B_H_IMM:
2352 case AArch64::LD1B_S_IMM:
2353 case AArch64::LD1B_D_IMM:
2354 case AArch64::LD1SB_H_IMM:
2355 case AArch64::LD1SB_S_IMM:
2356 case AArch64::LD1SB_D_IMM:
2357 case AArch64::LD1H_IMM:
2358 case AArch64::LD1H_S_IMM:
2359 case AArch64::LD1H_D_IMM:
2360 case AArch64::LD1SH_S_IMM:
2361 case AArch64::LD1SH_D_IMM:
2362 case AArch64::LD1W_IMM:
2363 case AArch64::LD1W_D_IMM:
2364 case AArch64::LD1SW_D_IMM:
2365 case AArch64::LD1D_IMM:
2366
2367 case AArch64::LD2B_IMM:
2368 case AArch64::LD2H_IMM:
2369 case AArch64::LD2W_IMM:
2370 case AArch64::LD2D_IMM:
2371 case AArch64::LD3B_IMM:
2372 case AArch64::LD3H_IMM:
2373 case AArch64::LD3W_IMM:
2374 case AArch64::LD3D_IMM:
2375 case AArch64::LD4B_IMM:
2376 case AArch64::LD4H_IMM:
2377 case AArch64::LD4W_IMM:
2378 case AArch64::LD4D_IMM:
2379
2380 case AArch64::ST1B_IMM:
2381 case AArch64::ST1B_H_IMM:
2382 case AArch64::ST1B_S_IMM:
2383 case AArch64::ST1B_D_IMM:
2384 case AArch64::ST1H_IMM:
2385 case AArch64::ST1H_S_IMM:
2386 case AArch64::ST1H_D_IMM:
2387 case AArch64::ST1W_IMM:
2388 case AArch64::ST1W_D_IMM:
2389 case AArch64::ST1D_IMM:
2390
2391 case AArch64::ST2B_IMM:
2392 case AArch64::ST2H_IMM:
2393 case AArch64::ST2W_IMM:
2394 case AArch64::ST2D_IMM:
2395 case AArch64::ST3B_IMM:
2396 case AArch64::ST3H_IMM:
2397 case AArch64::ST3W_IMM:
2398 case AArch64::ST3D_IMM:
2399 case AArch64::ST4B_IMM:
2400 case AArch64::ST4H_IMM:
2401 case AArch64::ST4W_IMM:
2402 case AArch64::ST4D_IMM:
2403
2404 case AArch64::LD1RB_IMM:
2405 case AArch64::LD1RB_H_IMM:
2406 case AArch64::LD1RB_S_IMM:
2407 case AArch64::LD1RB_D_IMM:
2408 case AArch64::LD1RSB_H_IMM:
2409 case AArch64::LD1RSB_S_IMM:
2410 case AArch64::LD1RSB_D_IMM:
2411 case AArch64::LD1RH_IMM:
2412 case AArch64::LD1RH_S_IMM:
2413 case AArch64::LD1RH_D_IMM:
2414 case AArch64::LD1RSH_S_IMM:
2415 case AArch64::LD1RSH_D_IMM:
2416 case AArch64::LD1RW_IMM:
2417 case AArch64::LD1RW_D_IMM:
2418 case AArch64::LD1RSW_IMM:
2419 case AArch64::LD1RD_IMM:
2420
2421 case AArch64::LDNT1B_ZRI:
2422 case AArch64::LDNT1H_ZRI:
2423 case AArch64::LDNT1W_ZRI:
2424 case AArch64::LDNT1D_ZRI:
2425 case AArch64::STNT1B_ZRI:
2426 case AArch64::STNT1H_ZRI:
2427 case AArch64::STNT1W_ZRI:
2428 case AArch64::STNT1D_ZRI:
2429
2430 case AArch64::LDNF1B_IMM:
2431 case AArch64::LDNF1B_H_IMM:
2432 case AArch64::LDNF1B_S_IMM:
2433 case AArch64::LDNF1B_D_IMM:
2434 case AArch64::LDNF1SB_H_IMM:
2435 case AArch64::LDNF1SB_S_IMM:
2436 case AArch64::LDNF1SB_D_IMM:
2437 case AArch64::LDNF1H_IMM:
2438 case AArch64::LDNF1H_S_IMM:
2439 case AArch64::LDNF1H_D_IMM:
2440 case AArch64::LDNF1SH_S_IMM:
2441 case AArch64::LDNF1SH_D_IMM:
2442 case AArch64::LDNF1W_IMM:
2443 case AArch64::LDNF1W_D_IMM:
2444 case AArch64::LDNF1SW_D_IMM:
2445 case AArch64::LDNF1D_IMM:
2446 return 3;
2447 case AArch64::ADDG:
2448 case AArch64::STGi:
2449 case AArch64::LDR_PXI:
2450 case AArch64::STR_PXI:
2451 return 2;
2452 }
2453 }
2454
2455 bool AArch64InstrInfo::isPairableLdStInst(const MachineInstr &MI) {
2456 switch (MI.getOpcode()) {
2457 default:
2458 return false;
2459 // Scaled instructions.
2460 case AArch64::STRSui:
2461 case AArch64::STRDui:
2462 case AArch64::STRQui:
2463 case AArch64::STRXui:
2464 case AArch64::STRWui:
2465 case AArch64::LDRSui:
2466 case AArch64::LDRDui:
2467 case AArch64::LDRQui:
2468 case AArch64::LDRXui:
2469 case AArch64::LDRWui:
2470 case AArch64::LDRSWui:
2471 // Unscaled instructions.
2472 case AArch64::STURSi:
2473 case AArch64::STRSpre:
2474 case AArch64::STURDi:
2475 case AArch64::STRDpre:
2476 case AArch64::STURQi:
2477 case AArch64::STRQpre:
2478 case AArch64::STURWi:
2479 case AArch64::STRWpre:
2480 case AArch64::STURXi:
2481 case AArch64::STRXpre:
2482 case AArch64::LDURSi:
2483 case AArch64::LDRSpre:
2484 case AArch64::LDURDi:
2485 case AArch64::LDRDpre:
2486 case AArch64::LDURQi:
2487 case AArch64::LDRQpre:
2488 case AArch64::LDURWi:
2489 case AArch64::LDRWpre:
2490 case AArch64::LDURXi:
2491 case AArch64::LDRXpre:
2492 case AArch64::LDURSWi:
2493 case AArch64::LDRSWpre:
2494 return true;
2495 }
2496 }
2497
2498 bool AArch64InstrInfo::isTailCallReturnInst(const MachineInstr &MI) {
2499 switch (MI.getOpcode()) {
2500 default:
2501 assert((!MI.isCall() || !MI.isReturn()) &&
2502 "Unexpected instruction - was a new tail call opcode introduced?");
2503 return false;
2504 case AArch64::TCRETURNdi:
2505 case AArch64::TCRETURNri:
2506 case AArch64::TCRETURNriBTI:
2507 case AArch64::TCRETURNriALL:
2508 return true;
2509 }
2510 }
2511
2512 unsigned AArch64InstrInfo::convertToFlagSettingOpc(unsigned Opc) {
2513 switch (Opc) {
2514 default:
2515 llvm_unreachable("Opcode has no flag setting equivalent!");
2516 // 32-bit cases:
2517 case AArch64::ADDWri:
2518 return AArch64::ADDSWri;
2519 case AArch64::ADDWrr:
2520 return AArch64::ADDSWrr;
2521 case AArch64::ADDWrs:
2522 return AArch64::ADDSWrs;
2523 case AArch64::ADDWrx:
2524 return AArch64::ADDSWrx;
2525 case AArch64::ANDWri:
2526 return AArch64::ANDSWri;
2527 case AArch64::ANDWrr:
2528 return AArch64::ANDSWrr;
2529 case AArch64::ANDWrs:
2530 return AArch64::ANDSWrs;
2531 case AArch64::BICWrr:
2532 return AArch64::BICSWrr;
2533 case AArch64::BICWrs:
2534 return AArch64::BICSWrs;
2535 case AArch64::SUBWri:
2536 return AArch64::SUBSWri;
2537 case AArch64::SUBWrr:
2538 return AArch64::SUBSWrr;
2539 case AArch64::SUBWrs:
2540 return AArch64::SUBSWrs;
2541 case AArch64::SUBWrx:
2542 return AArch64::SUBSWrx;
2543 // 64-bit cases:
2544 case AArch64::ADDXri:
2545 return AArch64::ADDSXri;
2546 case AArch64::ADDXrr:
2547 return AArch64::ADDSXrr;
2548 case AArch64::ADDXrs:
2549 return AArch64::ADDSXrs;
2550 case AArch64::ADDXrx:
2551 return AArch64::ADDSXrx;
2552 case AArch64::ANDXri:
2553 return AArch64::ANDSXri;
2554 case AArch64::ANDXrr:
2555 return AArch64::ANDSXrr;
2556 case AArch64::ANDXrs:
2557 return AArch64::ANDSXrs;
2558 case AArch64::BICXrr:
2559 return AArch64::BICSXrr;
2560 case AArch64::BICXrs:
2561 return AArch64::BICSXrs;
2562 case AArch64::SUBXri:
2563 return AArch64::SUBSXri;
2564 case AArch64::SUBXrr:
2565 return AArch64::SUBSXrr;
2566 case AArch64::SUBXrs:
2567 return AArch64::SUBSXrs;
2568 case AArch64::SUBXrx:
2569 return AArch64::SUBSXrx;
2570 // SVE instructions:
2571 case AArch64::AND_PPzPP:
2572 return AArch64::ANDS_PPzPP;
2573 case AArch64::BIC_PPzPP:
2574 return AArch64::BICS_PPzPP;
2575 case AArch64::EOR_PPzPP:
2576 return AArch64::EORS_PPzPP;
2577 case AArch64::NAND_PPzPP:
2578 return AArch64::NANDS_PPzPP;
2579 case AArch64::NOR_PPzPP:
2580 return AArch64::NORS_PPzPP;
2581 case AArch64::ORN_PPzPP:
2582 return AArch64::ORNS_PPzPP;
2583 case AArch64::ORR_PPzPP:
2584 return AArch64::ORRS_PPzPP;
2585 case AArch64::BRKA_PPzP:
2586 return AArch64::BRKAS_PPzP;
2587 case AArch64::BRKPA_PPzPP:
2588 return AArch64::BRKPAS_PPzPP;
2589 case AArch64::BRKB_PPzP:
2590 return AArch64::BRKBS_PPzP;
2591 case AArch64::BRKPB_PPzPP:
2592 return AArch64::BRKPBS_PPzPP;
2593 case AArch64::BRKN_PPzP:
2594 return AArch64::BRKNS_PPzP;
2595 case AArch64::RDFFR_PPz:
2596 return AArch64::RDFFRS_PPz;
2597 case AArch64::PTRUE_B:
2598 return AArch64::PTRUES_B;
2599 }
2600 }
2601
2602 // Is this a candidate for ld/st merging or pairing? For example, we don't
2603 // touch volatiles or load/stores that have a hint to avoid pair formation.
2604 bool AArch64InstrInfo::isCandidateToMergeOrPair(const MachineInstr &MI) const {
2605
2606 bool IsPreLdSt = isPreLdSt(MI);
2607
2608 // If this is a volatile load/store, don't mess with it.
2609 if (MI.hasOrderedMemoryRef())
2610 return false;
2611
2612 // Make sure this is a reg/fi+imm (as opposed to an address reloc).
2613 // For Pre-inc LD/ST, the operand is shifted by one.
2614 assert((MI.getOperand(IsPreLdSt ? 2 : 1).isReg() ||
2615 MI.getOperand(IsPreLdSt ? 2 : 1).isFI()) &&
2616 "Expected a reg or frame index operand.");
2617
2618 // For Pre-indexed addressing quadword instructions, the third operand is the
2619 // immediate value.
2620 bool IsImmPreLdSt = IsPreLdSt && MI.getOperand(3).isImm();
2621
2622 if (!MI.getOperand(2).isImm() && !IsImmPreLdSt)
2623 return false;
2624
2625 // Can't merge/pair if the instruction modifies the base register.
2626 // e.g., ldr x0, [x0]
2627 // This case will never occur with an FI base.
2628 // However, if the instruction is an LDR<S,D,Q,W,X,SW>pre or
2629 // STR<S,D,Q,W,X>pre, it can be merged.
2630 // For example:
2631 // ldr q0, [x11, #32]!
2632 // ldr q1, [x11, #16]
2633 // to
2634 // ldp q0, q1, [x11, #32]!
2635 if (MI.getOperand(1).isReg() && !IsPreLdSt) {
2636 Register BaseReg = MI.getOperand(1).getReg();
2637 const TargetRegisterInfo *TRI = &getRegisterInfo();
2638 if (MI.modifiesRegister(BaseReg, TRI))
2639 return false;
2640 }
2641
2642 // Check if this load/store has a hint to avoid pair formation.
2643 // MachineMemOperands hints are set by the AArch64StorePairSuppress pass.
2644 if (isLdStPairSuppressed(MI))
2645 return false;
2646
2647 // Do not pair any callee-save store/reload instructions in the
2648 // prologue/epilogue if the CFI information encoded the operations as separate
2649 // instructions, as that will cause the size of the actual prologue to mismatch
2650 // with the prologue size recorded in the Windows CFI.
2651 const MCAsmInfo *MAI = MI.getMF()->getTarget().getMCAsmInfo();
2652 bool NeedsWinCFI = MAI->usesWindowsCFI() &&
2653 MI.getMF()->getFunction().needsUnwindTableEntry();
2654 if (NeedsWinCFI && (MI.getFlag(MachineInstr::FrameSetup) ||
2655 MI.getFlag(MachineInstr::FrameDestroy)))
2656 return false;
2657
2658 // On some CPUs quad load/store pairs are slower than two single load/stores.
2659 if (Subtarget.isPaired128Slow()) {
2660 switch (MI.getOpcode()) {
2661 default:
2662 break;
2663 case AArch64::LDURQi:
2664 case AArch64::STURQi:
2665 case AArch64::LDRQui:
2666 case AArch64::STRQui:
2667 return false;
2668 }
2669 }
2670
2671 return true;
2672 }
2673
2674 bool AArch64InstrInfo::getMemOperandsWithOffsetWidth(
2675 const MachineInstr &LdSt, SmallVectorImpl<const MachineOperand *> &BaseOps,
2676 int64_t &Offset, bool &OffsetIsScalable, unsigned &Width,
2677 const TargetRegisterInfo *TRI) const {
2678 if (!LdSt.mayLoadOrStore())
2679 return false;
2680
2681 const MachineOperand *BaseOp;
2682 TypeSize WidthN(0, false);
2683 if (!getMemOperandWithOffsetWidth(LdSt, BaseOp, Offset, OffsetIsScalable,
2684 WidthN, TRI))
2685 return false;
2686 // The maximum vscale is 16 under AArch64; return the maximal extent for the
2687 // vector.
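// For example (assuming SVEMaxBitsPerVector = 2048 and SVEBitsPerBlock =
// 128), a known-min width of 16 scalable bytes reports 16 * 2048 / 128 =
// 256 bytes.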
2688 Width = WidthN.isScalable()
2689 ? WidthN.getKnownMinValue() * AArch64::SVEMaxBitsPerVector /
2690 AArch64::SVEBitsPerBlock
2691 : WidthN.getKnownMinValue();
2692 BaseOps.push_back(BaseOp);
2693 return true;
2694 }
2695
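/// For example (illustrative), "ldr x0, [x1, #16]" yields an ExtAddrMode with
/// BaseReg = x1, Displacement = 16, and no scaled register.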
2696 std::optional<ExtAddrMode>
2697 AArch64InstrInfo::getAddrModeFromMemoryOp(const MachineInstr &MemI,
2698 const TargetRegisterInfo *TRI) const {
2699 const MachineOperand *Base; // Filled with the base operand of MI.
2700 int64_t Offset; // Filled with the offset of MI.
2701 bool OffsetIsScalable;
2702 if (!getMemOperandWithOffset(MemI, Base, Offset, OffsetIsScalable, TRI))
2703 return std::nullopt;
2704
2705 if (!Base->isReg())
2706 return std::nullopt;
2707 ExtAddrMode AM;
2708 AM.BaseReg = Base->getReg();
2709 AM.Displacement = Offset;
2710 AM.ScaledReg = 0;
2711 AM.Scale = 0;
2712 return AM;
2713 }
2714
2715 bool AArch64InstrInfo::canFoldIntoAddrMode(const MachineInstr &MemI,
2716 Register Reg,
2717 const MachineInstr &AddrI,
2718 ExtAddrMode &AM) const {
2719 // Filter out instructions into which we cannot fold.
2720 unsigned NumBytes;
2721 int64_t OffsetScale = 1;
2722 switch (MemI.getOpcode()) {
2723 default:
2724 return false;
2725
2726 case AArch64::LDURQi:
2727 case AArch64::STURQi:
2728 NumBytes = 16;
2729 break;
2730
2731 case AArch64::LDURDi:
2732 case AArch64::STURDi:
2733 case AArch64::LDURXi:
2734 case AArch64::STURXi:
2735 NumBytes = 8;
2736 break;
2737
2738 case AArch64::LDURWi:
2739 case AArch64::LDURSWi:
2740 case AArch64::STURWi:
2741 NumBytes = 4;
2742 break;
2743
2744 case AArch64::LDURHi:
2745 case AArch64::STURHi:
2746 case AArch64::LDURHHi:
2747 case AArch64::STURHHi:
2748 case AArch64::LDURSHXi:
2749 case AArch64::LDURSHWi:
2750 NumBytes = 2;
2751 break;
2752
2753 case AArch64::LDRBroX:
2754 case AArch64::LDRBBroX:
2755 case AArch64::LDRSBXroX:
2756 case AArch64::LDRSBWroX:
2757 case AArch64::STRBroX:
2758 case AArch64::STRBBroX:
2759 case AArch64::LDURBi:
2760 case AArch64::LDURBBi:
2761 case AArch64::LDURSBXi:
2762 case AArch64::LDURSBWi:
2763 case AArch64::STURBi:
2764 case AArch64::STURBBi:
2765 case AArch64::LDRBui:
2766 case AArch64::LDRBBui:
2767 case AArch64::LDRSBXui:
2768 case AArch64::LDRSBWui:
2769 case AArch64::STRBui:
2770 case AArch64::STRBBui:
2771 NumBytes = 1;
2772 break;
2773
2774 case AArch64::LDRQroX:
2775 case AArch64::STRQroX:
2776 case AArch64::LDRQui:
2777 case AArch64::STRQui:
2778 NumBytes = 16;
2779 OffsetScale = 16;
2780 break;
2781
2782 case AArch64::LDRDroX:
2783 case AArch64::STRDroX:
2784 case AArch64::LDRXroX:
2785 case AArch64::STRXroX:
2786 case AArch64::LDRDui:
2787 case AArch64::STRDui:
2788 case AArch64::LDRXui:
2789 case AArch64::STRXui:
2790 NumBytes = 8;
2791 OffsetScale = 8;
2792 break;
2793
2794 case AArch64::LDRWroX:
2795 case AArch64::LDRSWroX:
2796 case AArch64::STRWroX:
2797 case AArch64::LDRWui:
2798 case AArch64::LDRSWui:
2799 case AArch64::STRWui:
2800 NumBytes = 4;
2801 OffsetScale = 4;
2802 break;
2803
2804 case AArch64::LDRHroX:
2805 case AArch64::STRHroX:
2806 case AArch64::LDRHHroX:
2807 case AArch64::STRHHroX:
2808 case AArch64::LDRSHXroX:
2809 case AArch64::LDRSHWroX:
2810 case AArch64::LDRHui:
2811 case AArch64::STRHui:
2812 case AArch64::LDRHHui:
2813 case AArch64::STRHHui:
2814 case AArch64::LDRSHXui:
2815 case AArch64::LDRSHWui:
2816 NumBytes = 2;
2817 OffsetScale = 2;
2818 break;
2819 }
2820
2821 // Check the fold operand is not the loaded/stored value.
2822 const MachineOperand &BaseRegOp = MemI.getOperand(0);
2823 if (BaseRegOp.isReg() && BaseRegOp.getReg() == Reg)
2824 return false;
2825
2826 // Handle memory instructions with a [Reg, Reg] addressing mode.
2827 if (MemI.getOperand(2).isReg()) {
2828 // Bail if the addressing mode already includes extension of the offset
2829 // register.
2830 if (MemI.getOperand(3).getImm())
2831 return false;
2832
2833 // Check if we actually have a scaled offset.
2834 if (MemI.getOperand(4).getImm() == 0)
2835 OffsetScale = 1;
2836
2837 // If the address instruction is folded into the base register, then the
2838 // addressing mode must not have a scale, so that we can swap the base and
2839 // the scaled registers.
2840 if (MemI.getOperand(1).getReg() == Reg && OffsetScale != 1)
2841 return false;
2842
2843 switch (AddrI.getOpcode()) {
2844 default:
2845 return false;
2846
2847 case AArch64::SBFMXri:
2848 // sxtw Xa, Wm
2849 // ldr Xd, [Xn, Xa, lsl #N]
2850 // ->
2851 // ldr Xd, [Xn, Wm, sxtw #N]
2852 if (AddrI.getOperand(2).getImm() != 0 ||
2853 AddrI.getOperand(3).getImm() != 31)
2854 return false;
2855
2856 AM.BaseReg = MemI.getOperand(1).getReg();
2857 if (AM.BaseReg == Reg)
2858 AM.BaseReg = MemI.getOperand(2).getReg();
2859 AM.ScaledReg = AddrI.getOperand(1).getReg();
2860 AM.Scale = OffsetScale;
2861 AM.Displacement = 0;
2862 AM.Form = ExtAddrMode::Formula::SExtScaledReg;
2863 return true;
2864
2865 case TargetOpcode::SUBREG_TO_REG: {
2866 // mov Wa, Wm
2867 // ldr Xd, [Xn, Xa, lsl #N]
2868 // ->
2869 // ldr Xd, [Xn, Wm, uxtw #N]
2870
2871 // Zero-extension looks like an ORRWrs followed by a SUBREG_TO_REG.
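// Roughly, in MIR (illustrative virtual registers):
//   %1:gpr32 = ORRWrs $wzr, %0, 0
//   %2:gpr64 = SUBREG_TO_REG 0, %1, %subreg.sub_32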
2872 if (AddrI.getOperand(1).getImm() != 0 ||
2873 AddrI.getOperand(3).getImm() != AArch64::sub_32)
2874 return false;
2875
2876 const MachineRegisterInfo &MRI = AddrI.getMF()->getRegInfo();
2877 Register OffsetReg = AddrI.getOperand(2).getReg();
2878 if (!OffsetReg.isVirtual() || !MRI.hasOneNonDBGUse(OffsetReg))
2879 return false;
2880
2881 const MachineInstr &DefMI = *MRI.getVRegDef(OffsetReg);
2882 if (DefMI.getOpcode() != AArch64::ORRWrs ||
2883 DefMI.getOperand(1).getReg() != AArch64::WZR ||
2884 DefMI.getOperand(3).getImm() != 0)
2885 return false;
2886
2887 AM.BaseReg = MemI.getOperand(1).getReg();
2888 if (AM.BaseReg == Reg)
2889 AM.BaseReg = MemI.getOperand(2).getReg();
2890 AM.ScaledReg = DefMI.getOperand(2).getReg();
2891 AM.Scale = OffsetScale;
2892 AM.Displacement = 0;
2893 AM.Form = ExtAddrMode::Formula::ZExtScaledReg;
2894 return true;
2895 }
2896 }
2897 }
2898
2899 // Handle memory instructions with a [Reg, #Imm] addressing mode.
2900
2901 // Check we are not breaking a potential conversion to an LDP.
2902 auto validateOffsetForLDP = [](unsigned NumBytes, int64_t OldOffset,
2903 int64_t NewOffset) -> bool {
2904 int64_t MinOffset, MaxOffset;
2905 switch (NumBytes) {
2906 default:
2907 return true;
2908 case 4:
2909 MinOffset = -256;
2910 MaxOffset = 252;
2911 break;
2912 case 8:
2913 MinOffset = -512;
2914 MaxOffset = 504;
2915 break;
2916 case 16:
2917 MinOffset = -1024;
2918 MaxOffset = 1008;
2919 break;
2920 }
2921 return OldOffset < MinOffset || OldOffset > MaxOffset ||
2922 (NewOffset >= MinOffset && NewOffset <= MaxOffset);
2923 };
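// For example, for an 8-byte access (LDP imm7 scaled by 8, offsets in
// [-512, 504]): an old offset of 500 with displacement +8 gives a new
// offset of 508, which no longer fits an LDP, so the fold is rejected.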
2924 auto canFoldAddSubImmIntoAddrMode = [&](int64_t Disp) -> bool {
2925 int64_t OldOffset = MemI.getOperand(2).getImm() * OffsetScale;
2926 int64_t NewOffset = OldOffset + Disp;
2927 if (!isLegalAddressingMode(NumBytes, NewOffset, /* Scale */ 0))
2928 return false;
2929 // If the old offset would fit into an LDP, but the new offset wouldn't,
2930 // bail out.
2931 if (!validateOffsetForLDP(NumBytes, OldOffset, NewOffset))
2932 return false;
2933 AM.BaseReg = AddrI.getOperand(1).getReg();
2934 AM.ScaledReg = 0;
2935 AM.Scale = 0;
2936 AM.Displacement = NewOffset;
2937 AM.Form = ExtAddrMode::Formula::Basic;
2938 return true;
2939 };
2940
2941 auto canFoldAddRegIntoAddrMode =
2942 [&](int64_t Scale,
2943 ExtAddrMode::Formula Form = ExtAddrMode::Formula::Basic) -> bool {
2944 if (MemI.getOperand(2).getImm() != 0)
2945 return false;
2946 if (!isLegalAddressingMode(NumBytes, /* Offset */ 0, Scale))
2947 return false;
2948 AM.BaseReg = AddrI.getOperand(1).getReg();
2949 AM.ScaledReg = AddrI.getOperand(2).getReg();
2950 AM.Scale = Scale;
2951 AM.Displacement = 0;
2952 AM.Form = Form;
2953 return true;
2954 };
2955
2956 auto avoidSlowSTRQ = [&](const MachineInstr &MemI) {
2957 unsigned Opcode = MemI.getOpcode();
2958 return (Opcode == AArch64::STURQi || Opcode == AArch64::STRQui) &&
2959 Subtarget.isSTRQroSlow();
2960 };
2961
2962 int64_t Disp = 0;
2963 const bool OptSize = MemI.getMF()->getFunction().hasOptSize();
2964 switch (AddrI.getOpcode()) {
2965 default:
2966 return false;
2967
2968 case AArch64::ADDXri:
2969 // add Xa, Xn, #N
2970 // ldr Xd, [Xa, #M]
2971 // ->
2972 // ldr Xd, [Xn, #N'+M]
2973 Disp = AddrI.getOperand(2).getImm() << AddrI.getOperand(3).getImm();
2974 return canFoldAddSubImmIntoAddrMode(Disp);
2975
2976 case AArch64::SUBXri:
2977 // sub Xa, Xn, #N
2978 // ldr Xd, [Xa, #M]
2979 // ->
2980 // ldr Xd, [Xn, #N'+M]
2981 Disp = AddrI.getOperand(2).getImm() << AddrI.getOperand(3).getImm();
2982 return canFoldAddSubImmIntoAddrMode(-Disp);
2983
2984 case AArch64::ADDXrs: {
2985 // add Xa, Xn, Xm, lsl #N
2986 // ldr Xd, [Xa]
2987 // ->
2988 // ldr Xd, [Xn, Xm, lsl #N]
2989
2990 // Don't fold the add if the result would be slower, unless optimising for
2991 // size.
2992 unsigned Shift = static_cast<unsigned>(AddrI.getOperand(3).getImm());
2993 if (AArch64_AM::getShiftType(Shift) != AArch64_AM::ShiftExtendType::LSL)
2994 return false;
2995 Shift = AArch64_AM::getShiftValue(Shift);
2996 if (!OptSize) {
2997 if ((Shift != 2 && Shift != 3) || !Subtarget.hasAddrLSLFast())
2998 return false;
2999 if (avoidSlowSTRQ(MemI))
3000 return false;
3001 }
3002 return canFoldAddRegIntoAddrMode(1ULL << Shift);
3003 }
3004
3005 case AArch64::ADDXrr:
3006 // add Xa, Xn, Xm
3007 // ldr Xd, [Xa]
3008 // ->
3009 // ldr Xd, [Xn, Xm, lsl #0]
3010
3011 // Don't fold the add if the result would be slower, unless optimising for
3012 // size.
3013 if (!OptSize && avoidSlowSTRQ(MemI))
3014 return false;
3015 return canFoldAddRegIntoAddrMode(1);
3016
3017 case AArch64::ADDXrx:
3018 // add Xa, Xn, Wm, {s,u}xtw #N
3019 // ldr Xd, [Xa]
3020 // ->
3021 // ldr Xd, [Xn, Wm, {s,u}xtw #N]
3022
3023 // Don't fold the add if the result would be slower, unless optimising for
3024 // size.
3025 if (!OptSize && avoidSlowSTRQ(MemI))
3026 return false;
3027
3028 // Can fold only sign-/zero-extend of a word.
3029 unsigned Imm = static_cast<unsigned>(AddrI.getOperand(3).getImm());
3030 AArch64_AM::ShiftExtendType Extend = AArch64_AM::getArithExtendType(Imm);
3031 if (Extend != AArch64_AM::UXTW && Extend != AArch64_AM::SXTW)
3032 return false;
3033
3034 return canFoldAddRegIntoAddrMode(
3035 1ULL << AArch64_AM::getArithShiftValue(Imm),
3036 (Extend == AArch64_AM::SXTW) ? ExtAddrMode::Formula::SExtScaledReg
3037 : ExtAddrMode::Formula::ZExtScaledReg);
3038 }
3039 }
3040
3041 // Given an opcode for an instruction with a [Reg, #Imm] addressing mode,
3042 // return the opcode of an instruction performing the same operation, but using
3043 // the [Reg, Reg] addressing mode.
3044 static unsigned regOffsetOpcode(unsigned Opcode) {
3045 switch (Opcode) {
3046 default:
3047 llvm_unreachable("Address folding not implemented for instruction");
3048
3049 case AArch64::LDURQi:
3050 case AArch64::LDRQui:
3051 return AArch64::LDRQroX;
3052 case AArch64::STURQi:
3053 case AArch64::STRQui:
3054 return AArch64::STRQroX;
3055 case AArch64::LDURDi:
3056 case AArch64::LDRDui:
3057 return AArch64::LDRDroX;
3058 case AArch64::STURDi:
3059 case AArch64::STRDui:
3060 return AArch64::STRDroX;
3061 case AArch64::LDURXi:
3062 case AArch64::LDRXui:
3063 return AArch64::LDRXroX;
3064 case AArch64::STURXi:
3065 case AArch64::STRXui:
3066 return AArch64::STRXroX;
3067 case AArch64::LDURWi:
3068 case AArch64::LDRWui:
3069 return AArch64::LDRWroX;
3070 case AArch64::LDURSWi:
3071 case AArch64::LDRSWui:
3072 return AArch64::LDRSWroX;
3073 case AArch64::STURWi:
3074 case AArch64::STRWui:
3075 return AArch64::STRWroX;
3076 case AArch64::LDURHi:
3077 case AArch64::LDRHui:
3078 return AArch64::LDRHroX;
3079 case AArch64::STURHi:
3080 case AArch64::STRHui:
3081 return AArch64::STRHroX;
3082 case AArch64::LDURHHi:
3083 case AArch64::LDRHHui:
3084 return AArch64::LDRHHroX;
3085 case AArch64::STURHHi:
3086 case AArch64::STRHHui:
3087 return AArch64::STRHHroX;
3088 case AArch64::LDURSHXi:
3089 case AArch64::LDRSHXui:
3090 return AArch64::LDRSHXroX;
3091 case AArch64::LDURSHWi:
3092 case AArch64::LDRSHWui:
3093 return AArch64::LDRSHWroX;
3094 case AArch64::LDURBi:
3095 case AArch64::LDRBui:
3096 return AArch64::LDRBroX;
3097 case AArch64::LDURBBi:
3098 case AArch64::LDRBBui:
3099 return AArch64::LDRBBroX;
3100 case AArch64::LDURSBXi:
3101 case AArch64::LDRSBXui:
3102 return AArch64::LDRSBXroX;
3103 case AArch64::LDURSBWi:
3104 case AArch64::LDRSBWui:
3105 return AArch64::LDRSBWroX;
3106 case AArch64::STURBi:
3107 case AArch64::STRBui:
3108 return AArch64::STRBroX;
3109 case AArch64::STURBBi:
3110 case AArch64::STRBBui:
3111 return AArch64::STRBBroX;
3112 }
3113 }
3114
3115 // Given an opcode for an instruction with a [Reg, #Imm] addressing mode, return
3116 // the opcode of an instruction performing the same operation, but using the
3117 // [Reg, #Imm] addressing mode with scaled offset.
3118 unsigned scaledOffsetOpcode(unsigned Opcode, unsigned &Scale) {
3119 switch (Opcode) {
3120 default:
3121 llvm_unreachable("Address folding not implemented for instruction");
3122
3123 case AArch64::LDURQi:
3124 Scale = 16;
3125 return AArch64::LDRQui;
3126 case AArch64::STURQi:
3127 Scale = 16;
3128 return AArch64::STRQui;
3129 case AArch64::LDURDi:
3130 Scale = 8;
3131 return AArch64::LDRDui;
3132 case AArch64::STURDi:
3133 Scale = 8;
3134 return AArch64::STRDui;
3135 case AArch64::LDURXi:
3136 Scale = 8;
3137 return AArch64::LDRXui;
3138 case AArch64::STURXi:
3139 Scale = 8;
3140 return AArch64::STRXui;
3141 case AArch64::LDURWi:
3142 Scale = 4;
3143 return AArch64::LDRWui;
3144 case AArch64::LDURSWi:
3145 Scale = 4;
3146 return AArch64::LDRSWui;
3147 case AArch64::STURWi:
3148 Scale = 4;
3149 return AArch64::STRWui;
3150 case AArch64::LDURHi:
3151 Scale = 2;
3152 return AArch64::LDRHui;
3153 case AArch64::STURHi:
3154 Scale = 2;
3155 return AArch64::STRHui;
3156 case AArch64::LDURHHi:
3157 Scale = 2;
3158 return AArch64::LDRHHui;
3159 case AArch64::STURHHi:
3160 Scale = 2;
3161 return AArch64::STRHHui;
3162 case AArch64::LDURSHXi:
3163 Scale = 2;
3164 return AArch64::LDRSHXui;
3165 case AArch64::LDURSHWi:
3166 Scale = 2;
3167 return AArch64::LDRSHWui;
3168 case AArch64::LDURBi:
3169 Scale = 1;
3170 return AArch64::LDRBui;
3171 case AArch64::LDURBBi:
3172 Scale = 1;
3173 return AArch64::LDRBBui;
3174 case AArch64::LDURSBXi:
3175 Scale = 1;
3176 return AArch64::LDRSBXui;
3177 case AArch64::LDURSBWi:
3178 Scale = 1;
3179 return AArch64::LDRSBWui;
3180 case AArch64::STURBi:
3181 Scale = 1;
3182 return AArch64::STRBui;
3183 case AArch64::STURBBi:
3184 Scale = 1;
3185 return AArch64::STRBBui;
3186 case AArch64::LDRQui:
3187 case AArch64::STRQui:
3188 Scale = 16;
3189 return Opcode;
3190 case AArch64::LDRDui:
3191 case AArch64::STRDui:
3192 case AArch64::LDRXui:
3193 case AArch64::STRXui:
3194 Scale = 8;
3195 return Opcode;
3196 case AArch64::LDRWui:
3197 case AArch64::LDRSWui:
3198 case AArch64::STRWui:
3199 Scale = 4;
3200 return Opcode;
3201 case AArch64::LDRHui:
3202 case AArch64::STRHui:
3203 case AArch64::LDRHHui:
3204 case AArch64::STRHHui:
3205 case AArch64::LDRSHXui:
3206 case AArch64::LDRSHWui:
3207 Scale = 2;
3208 return Opcode;
3209 case AArch64::LDRBui:
3210 case AArch64::LDRBBui:
3211 case AArch64::LDRSBXui:
3212 case AArch64::LDRSBWui:
3213 case AArch64::STRBui:
3214 case AArch64::STRBBui:
3215 Scale = 1;
3216 return Opcode;
3217 }
3218 }
3219
3220 // Given an opcode for an instruction with a [Reg, #Imm] addressing mode, return
3221 // the opcode of an instruction performing the same operation, but using the
3222 // [Reg, #Imm] addressing mode with unscaled offset.
3223 unsigned unscaledOffsetOpcode(unsigned Opcode) {
3224 switch (Opcode) {
3225 default:
3226 llvm_unreachable("Address folding not implemented for instruction");
3227
3228 case AArch64::LDURQi:
3229 case AArch64::STURQi:
3230 case AArch64::LDURDi:
3231 case AArch64::STURDi:
3232 case AArch64::LDURXi:
3233 case AArch64::STURXi:
3234 case AArch64::LDURWi:
3235 case AArch64::LDURSWi:
3236 case AArch64::STURWi:
3237 case AArch64::LDURHi:
3238 case AArch64::STURHi:
3239 case AArch64::LDURHHi:
3240 case AArch64::STURHHi:
3241 case AArch64::LDURSHXi:
3242 case AArch64::LDURSHWi:
3243 case AArch64::LDURBi:
3244 case AArch64::STURBi:
3245 case AArch64::LDURBBi:
3246 case AArch64::STURBBi:
3247 case AArch64::LDURSBWi:
3248 case AArch64::LDURSBXi:
3249 return Opcode;
3250 case AArch64::LDRQui:
3251 return AArch64::LDURQi;
3252 case AArch64::STRQui:
3253 return AArch64::STURQi;
3254 case AArch64::LDRDui:
3255 return AArch64::LDURDi;
3256 case AArch64::STRDui:
3257 return AArch64::STURDi;
3258 case AArch64::LDRXui:
3259 return AArch64::LDURXi;
3260 case AArch64::STRXui:
3261 return AArch64::STURXi;
3262 case AArch64::LDRWui:
3263 return AArch64::LDURWi;
3264 case AArch64::LDRSWui:
3265 return AArch64::LDURSWi;
3266 case AArch64::STRWui:
3267 return AArch64::STURWi;
3268 case AArch64::LDRHui:
3269 return AArch64::LDURHi;
3270 case AArch64::STRHui:
3271 return AArch64::STURHi;
3272 case AArch64::LDRHHui:
3273 return AArch64::LDURHHi;
3274 case AArch64::STRHHui:
3275 return AArch64::STURHHi;
3276 case AArch64::LDRSHXui:
3277 return AArch64::LDURSHXi;
3278 case AArch64::LDRSHWui:
3279 return AArch64::LDURSHWi;
3280 case AArch64::LDRBBui:
3281 return AArch64::LDURBBi;
3282 case AArch64::LDRBui:
3283 return AArch64::LDURBi;
3284 case AArch64::STRBBui:
3285 return AArch64::STURBBi;
3286 case AArch64::STRBui:
3287 return AArch64::STURBi;
3288 case AArch64::LDRSBWui:
3289 return AArch64::LDURSBWi;
3290 case AArch64::LDRSBXui:
3291 return AArch64::LDURSBXi;
3292 }
3293 }
3294
3295 // Given the opcode of a memory load/store instruction, return the opcode of an
3296 // instruction performing the same operation, but using
3297 // the [Reg, Reg, {s,u}xtw #N] addressing mode with sign-/zero-extend of the
3298 // offset register.
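// For example (illustrative), AArch64::LDRXroX ("ldr x0, [x1, x2]") maps to
// AArch64::LDRXroW ("ldr x0, [x1, w2, {s,u}xtw]"), as do its immediate-offset
// forms AArch64::LDURXi and AArch64::LDRXui.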
3299 static unsigned offsetExtendOpcode(unsigned Opcode) {
3300 switch (Opcode) {
3301 default:
3302 llvm_unreachable("Address folding not implemented for instruction");
3303
3304 case AArch64::LDRQroX:
3305 case AArch64::LDURQi:
3306 case AArch64::LDRQui:
3307 return AArch64::LDRQroW;
3308 case AArch64::STRQroX:
3309 case AArch64::STURQi:
3310 case AArch64::STRQui:
3311 return AArch64::STRQroW;
3312 case AArch64::LDRDroX:
3313 case AArch64::LDURDi:
3314 case AArch64::LDRDui:
3315 return AArch64::LDRDroW;
3316 case AArch64::STRDroX:
3317 case AArch64::STURDi:
3318 case AArch64::STRDui:
3319 return AArch64::STRDroW;
3320 case AArch64::LDRXroX:
3321 case AArch64::LDURXi:
3322 case AArch64::LDRXui:
3323 return AArch64::LDRXroW;
3324 case AArch64::STRXroX:
3325 case AArch64::STURXi:
3326 case AArch64::STRXui:
3327 return AArch64::STRXroW;
3328 case AArch64::LDRWroX:
3329 case AArch64::LDURWi:
3330 case AArch64::LDRWui:
3331 return AArch64::LDRWroW;
3332 case AArch64::LDRSWroX:
3333 case AArch64::LDURSWi:
3334 case AArch64::LDRSWui:
3335 return AArch64::LDRSWroW;
3336 case AArch64::STRWroX:
3337 case AArch64::STURWi:
3338 case AArch64::STRWui:
3339 return AArch64::STRWroW;
3340 case AArch64::LDRHroX:
3341 case AArch64::LDURHi:
3342 case AArch64::LDRHui:
3343 return AArch64::LDRHroW;
3344 case AArch64::STRHroX:
3345 case AArch64::STURHi:
3346 case AArch64::STRHui:
3347 return AArch64::STRHroW;
3348 case AArch64::LDRHHroX:
3349 case AArch64::LDURHHi:
3350 case AArch64::LDRHHui:
3351 return AArch64::LDRHHroW;
3352 case AArch64::STRHHroX:
3353 case AArch64::STURHHi:
3354 case AArch64::STRHHui:
3355 return AArch64::STRHHroW;
3356 case AArch64::LDRSHXroX:
3357 case AArch64::LDURSHXi:
3358 case AArch64::LDRSHXui:
3359 return AArch64::LDRSHXroW;
3360 case AArch64::LDRSHWroX:
3361 case AArch64::LDURSHWi:
3362 case AArch64::LDRSHWui:
3363 return AArch64::LDRSHWroW;
3364 case AArch64::LDRBroX:
3365 case AArch64::LDURBi:
3366 case AArch64::LDRBui:
3367 return AArch64::LDRBroW;
3368 case AArch64::LDRBBroX:
3369 case AArch64::LDURBBi:
3370 case AArch64::LDRBBui:
3371 return AArch64::LDRBBroW;
3372 case AArch64::LDRSBXroX:
3373 case AArch64::LDURSBXi:
3374 case AArch64::LDRSBXui:
3375 return AArch64::LDRSBXroW;
3376 case AArch64::LDRSBWroX:
3377 case AArch64::LDURSBWi:
3378 case AArch64::LDRSBWui:
3379 return AArch64::LDRSBWroW;
3380 case AArch64::STRBroX:
3381 case AArch64::STURBi:
3382 case AArch64::STRBui:
3383 return AArch64::STRBroW;
3384 case AArch64::STRBBroX:
3385 case AArch64::STURBBi:
3386 case AArch64::STRBBui:
3387 return AArch64::STRBBroW;
3388 }
3389 }
3390
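// Emit a load/store equivalent to MemI, but using the addressing mode AM, and
// return the new instruction. For example (illustrative), a caller folding an
// address computation such as
//   add x1, x0, x2, lsl #3
//   ldr x3, [x1]
// would pass an AM describing [x0, x2, lsl #3], and the instruction emitted
// here would be
//   ldr x3, [x0, x2, lsl #3]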
3391 MachineInstr *AArch64InstrInfo::emitLdStWithAddr(MachineInstr &MemI,
3392 const ExtAddrMode &AM) const {
3393
3394 const DebugLoc &DL = MemI.getDebugLoc();
3395 MachineBasicBlock &MBB = *MemI.getParent();
3396 MachineRegisterInfo &MRI = MemI.getMF()->getRegInfo();
3397
3398 if (AM.Form == ExtAddrMode::Formula::Basic) {
3399 if (AM.ScaledReg) {
3400 // The new instruction will be in the form `ldr Rt, [Xn, Xm, lsl #imm]`.
3401 unsigned Opcode = regOffsetOpcode(MemI.getOpcode());
3402 MRI.constrainRegClass(AM.BaseReg, &AArch64::GPR64spRegClass);
3403 auto B = BuildMI(MBB, MemI, DL, get(Opcode))
3404 .addReg(MemI.getOperand(0).getReg(),
3405 MemI.mayLoad() ? RegState::Define : 0)
3406 .addReg(AM.BaseReg)
3407 .addReg(AM.ScaledReg)
3408 .addImm(0)
3409 .addImm(AM.Scale > 1)
3410 .setMemRefs(MemI.memoperands())
3411 .setMIFlags(MemI.getFlags());
3412 return B.getInstr();
3413 }
3414
3415 assert(AM.ScaledReg == 0 && AM.Scale == 0 &&
3416 "Addressing mode not supported for folding");
3417
3418 // The new instruction will be in the form `ld[u]r Rt, [Xn, #imm]`.
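    // A displacement that fits in a signed 9-bit field can use the unscaled
    // LDUR/STUR encoding directly; larger displacements must use the scaled,
    // unsigned 12-bit encoding, which is why the displacement is divided by
    // Scale below (e.g. a displacement of 24 with an 8-byte access encodes as
    // an immediate of 3).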
3419 unsigned Scale = 1;
3420 unsigned Opcode = MemI.getOpcode();
3421 if (isInt<9>(AM.Displacement))
3422 Opcode = unscaledOffsetOpcode(Opcode);
3423 else
3424 Opcode = scaledOffsetOpcode(Opcode, Scale);
3425
3426 auto B = BuildMI(MBB, MemI, DL, get(Opcode))
3427 .addReg(MemI.getOperand(0).getReg(),
3428 MemI.mayLoad() ? RegState::Define : 0)
3429 .addReg(AM.BaseReg)
3430 .addImm(AM.Displacement / Scale)
3431 .setMemRefs(MemI.memoperands())
3432 .setMIFlags(MemI.getFlags());
3433 return B.getInstr();
3434 }
3435
3436 if (AM.Form == ExtAddrMode::Formula::SExtScaledReg ||
3437 AM.Form == ExtAddrMode::Formula::ZExtScaledReg) {
3438 // The new instruction will be in the form `ldr Rt, [Xn, Wm, {s,u}xtw #N]`.
3439 assert(AM.ScaledReg && !AM.Displacement &&
3440 "Address offset can be a register or an immediate, but not both");
3441 unsigned Opcode = offsetExtendOpcode(MemI.getOpcode());
3442 MRI.constrainRegClass(AM.BaseReg, &AArch64::GPR64spRegClass);
3443 // Make sure the offset register is in the correct register class.
3444 Register OffsetReg = AM.ScaledReg;
3445 const TargetRegisterClass *RC = MRI.getRegClass(OffsetReg);
3446 if (RC->hasSuperClassEq(&AArch64::GPR64RegClass)) {
3447 OffsetReg = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
3448 BuildMI(MBB, MemI, DL, get(TargetOpcode::COPY), OffsetReg)
3449 .addReg(AM.ScaledReg, 0, AArch64::sub_32);
3450 }
3451 auto B = BuildMI(MBB, MemI, DL, get(Opcode))
3452 .addReg(MemI.getOperand(0).getReg(),
3453 MemI.mayLoad() ? RegState::Define : 0)
3454 .addReg(AM.BaseReg)
3455 .addReg(OffsetReg)
3456 .addImm(AM.Form == ExtAddrMode::Formula::SExtScaledReg)
3457 .addImm(AM.Scale != 1)
3458 .setMemRefs(MemI.memoperands())
3459 .setMIFlags(MemI.getFlags());
3460
3461 return B.getInstr();
3462 }
3463
3464 llvm_unreachable(
3465 "Function must not be called with an addressing mode it can't handle");
3466 }
3467
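// Decompose a load/store into a base operand and a byte offset. For example,
// "ldr x1, [x0, #16]" (an LDRXui with immediate 2 and scale 8) reports
// BaseOp == x0 and Offset == 16.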
3468 bool AArch64InstrInfo::getMemOperandWithOffsetWidth(
3469 const MachineInstr &LdSt, const MachineOperand *&BaseOp, int64_t &Offset,
3470 bool &OffsetIsScalable, TypeSize &Width,
3471 const TargetRegisterInfo *TRI) const {
3472 assert(LdSt.mayLoadOrStore() && "Expected a memory operation.");
3473 // Handle only loads/stores with base register followed by immediate offset.
3474 if (LdSt.getNumExplicitOperands() == 3) {
3475 // Non-paired instruction (e.g., ldr x1, [x0, #8]).
3476 if ((!LdSt.getOperand(1).isReg() && !LdSt.getOperand(1).isFI()) ||
3477 !LdSt.getOperand(2).isImm())
3478 return false;
3479 } else if (LdSt.getNumExplicitOperands() == 4) {
3480 // Paired instruction (e.g., ldp x1, x2, [x0, #8]).
3481 if (!LdSt.getOperand(1).isReg() ||
3482 (!LdSt.getOperand(2).isReg() && !LdSt.getOperand(2).isFI()) ||
3483 !LdSt.getOperand(3).isImm())
3484 return false;
3485 } else
3486 return false;
3487
3488 // Get the scaling factor for the instruction and set the width for the
3489 // instruction.
3490 TypeSize Scale(0U, false);
3491 int64_t Dummy1, Dummy2;
3492
3493 // If this returns false, then it's an instruction we don't want to handle.
3494 if (!getMemOpInfo(LdSt.getOpcode(), Scale, Width, Dummy1, Dummy2))
3495 return false;
3496
3497 // Compute the offset. Offset is calculated as the immediate operand
3498 // multiplied by the scaling factor. Unscaled instructions have scaling factor
3499 // set to 1.
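  // e.g. an LDRXui with immediate operand 2 yields a byte offset of 2 * 8 = 16.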
3500 if (LdSt.getNumExplicitOperands() == 3) {
3501 BaseOp = &LdSt.getOperand(1);
3502 Offset = LdSt.getOperand(2).getImm() * Scale.getKnownMinValue();
3503 } else {
3504 assert(LdSt.getNumExplicitOperands() == 4 && "invalid number of operands");
3505 BaseOp = &LdSt.getOperand(2);
3506 Offset = LdSt.getOperand(3).getImm() * Scale.getKnownMinValue();
3507 }
3508 OffsetIsScalable = Scale.isScalable();
3509
3510 if (!BaseOp->isReg() && !BaseOp->isFI())
3511 return false;
3512
3513 return true;
3514 }
3515
3516 MachineOperand &
3517 AArch64InstrInfo::getMemOpBaseRegImmOfsOffsetOperand(MachineInstr &LdSt) const {
3518 assert(LdSt.mayLoadOrStore() && "Expected a memory operation.");
3519 MachineOperand &OfsOp = LdSt.getOperand(LdSt.getNumExplicitOperands() - 1);
3520 assert(OfsOp.isImm() && "Offset operand wasn't immediate.");
3521 return OfsOp;
3522 }
3523
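// Return the scale, the memory width and the signed immediate offset range for
// a load/store opcode. For example, AArch64::LDRXui has Scale == Width == 8
// and an immediate range of [0, 4095] (byte offsets 0..32760 in steps of 8),
// while its unscaled twin AArch64::LDURXi has Scale == 1 and range [-256, 255].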
3524 bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, TypeSize &Scale,
3525 TypeSize &Width, int64_t &MinOffset,
3526 int64_t &MaxOffset) {
3527 switch (Opcode) {
3528 // Not a memory operation, or not one we want to handle.
3529 default:
3530 Scale = TypeSize::getFixed(0);
3531 Width = TypeSize::getFixed(0);
3532 MinOffset = MaxOffset = 0;
3533 return false;
3534 case AArch64::STRWpost:
3535 case AArch64::LDRWpost:
3536 Width = TypeSize::getFixed(32);
3537 Scale = TypeSize::getFixed(4);
3538 MinOffset = -256;
3539 MaxOffset = 255;
3540 break;
3541 case AArch64::LDURQi:
3542 case AArch64::STURQi:
3543 Width = TypeSize::getFixed(16);
3544 Scale = TypeSize::getFixed(1);
3545 MinOffset = -256;
3546 MaxOffset = 255;
3547 break;
3548 case AArch64::PRFUMi:
3549 case AArch64::LDURXi:
3550 case AArch64::LDURDi:
3551 case AArch64::LDAPURXi:
3552 case AArch64::STURXi:
3553 case AArch64::STURDi:
3554 case AArch64::STLURXi:
3555 Width = TypeSize::getFixed(8);
3556 Scale = TypeSize::getFixed(1);
3557 MinOffset = -256;
3558 MaxOffset = 255;
3559 break;
3560 case AArch64::LDURWi:
3561 case AArch64::LDURSi:
3562 case AArch64::LDURSWi:
3563 case AArch64::LDAPURi:
3564 case AArch64::LDAPURSWi:
3565 case AArch64::STURWi:
3566 case AArch64::STURSi:
3567 case AArch64::STLURWi:
3568 Width = TypeSize::getFixed(4);
3569 Scale = TypeSize::getFixed(1);
3570 MinOffset = -256;
3571 MaxOffset = 255;
3572 break;
3573 case AArch64::LDURHi:
3574 case AArch64::LDURHHi:
3575 case AArch64::LDURSHXi:
3576 case AArch64::LDURSHWi:
3577 case AArch64::LDAPURHi:
3578 case AArch64::LDAPURSHWi:
3579 case AArch64::LDAPURSHXi:
3580 case AArch64::STURHi:
3581 case AArch64::STURHHi:
3582 case AArch64::STLURHi:
3583 Width = TypeSize::getFixed(2);
3584 Scale = TypeSize::getFixed(1);
3585 MinOffset = -256;
3586 MaxOffset = 255;
3587 break;
3588 case AArch64::LDURBi:
3589 case AArch64::LDURBBi:
3590 case AArch64::LDURSBXi:
3591 case AArch64::LDURSBWi:
3592 case AArch64::LDAPURBi:
3593 case AArch64::LDAPURSBWi:
3594 case AArch64::LDAPURSBXi:
3595 case AArch64::STURBi:
3596 case AArch64::STURBBi:
3597 case AArch64::STLURBi:
3598 Width = TypeSize::getFixed(1);
3599 Scale = TypeSize::getFixed(1);
3600 MinOffset = -256;
3601 MaxOffset = 255;
3602 break;
3603 case AArch64::LDPQi:
3604 case AArch64::LDNPQi:
3605 case AArch64::STPQi:
3606 case AArch64::STNPQi:
3607 Scale = TypeSize::getFixed(16);
3608 Width = TypeSize::getFixed(32);
3609 MinOffset = -64;
3610 MaxOffset = 63;
3611 break;
3612 case AArch64::LDRQui:
3613 case AArch64::STRQui:
3614 Scale = TypeSize::getFixed(16);
3615 Width = TypeSize::getFixed(16);
3616 MinOffset = 0;
3617 MaxOffset = 4095;
3618 break;
3619 case AArch64::LDPXi:
3620 case AArch64::LDPDi:
3621 case AArch64::LDNPXi:
3622 case AArch64::LDNPDi:
3623 case AArch64::STPXi:
3624 case AArch64::STPDi:
3625 case AArch64::STNPXi:
3626 case AArch64::STNPDi:
3627 Scale = TypeSize::getFixed(8);
3628 Width = TypeSize::getFixed(16);
3629 MinOffset = -64;
3630 MaxOffset = 63;
3631 break;
3632 case AArch64::PRFMui:
3633 case AArch64::LDRXui:
3634 case AArch64::LDRDui:
3635 case AArch64::STRXui:
3636 case AArch64::STRDui:
3637 Scale = TypeSize::getFixed(8);
3638 Width = TypeSize::getFixed(8);
3639 MinOffset = 0;
3640 MaxOffset = 4095;
3641 break;
3642 case AArch64::StoreSwiftAsyncContext:
3643 // Store is an STRXui, but there might be an ADDXri in the expansion too.
3644 Scale = TypeSize::getFixed(1);
3645 Width = TypeSize::getFixed(8);
3646 MinOffset = 0;
3647 MaxOffset = 4095;
3648 break;
3649 case AArch64::LDPWi:
3650 case AArch64::LDPSi:
3651 case AArch64::LDNPWi:
3652 case AArch64::LDNPSi:
3653 case AArch64::STPWi:
3654 case AArch64::STPSi:
3655 case AArch64::STNPWi:
3656 case AArch64::STNPSi:
3657 Scale = TypeSize::getFixed(4);
3658 Width = TypeSize::getFixed(8);
3659 MinOffset = -64;
3660 MaxOffset = 63;
3661 break;
3662 case AArch64::LDRWui:
3663 case AArch64::LDRSui:
3664 case AArch64::LDRSWui:
3665 case AArch64::STRWui:
3666 case AArch64::STRSui:
3667 Scale = TypeSize::getFixed(4);
3668 Width = TypeSize::getFixed(4);
3669 MinOffset = 0;
3670 MaxOffset = 4095;
3671 break;
3672 case AArch64::LDRHui:
3673 case AArch64::LDRHHui:
3674 case AArch64::LDRSHWui:
3675 case AArch64::LDRSHXui:
3676 case AArch64::STRHui:
3677 case AArch64::STRHHui:
3678 Scale = TypeSize::getFixed(2);
3679 Width = TypeSize::getFixed(2);
3680 MinOffset = 0;
3681 MaxOffset = 4095;
3682 break;
3683 case AArch64::LDRBui:
3684 case AArch64::LDRBBui:
3685 case AArch64::LDRSBWui:
3686 case AArch64::LDRSBXui:
3687 case AArch64::STRBui:
3688 case AArch64::STRBBui:
3689 Scale = TypeSize::getFixed(1);
3690 Width = TypeSize::getFixed(1);
3691 MinOffset = 0;
3692 MaxOffset = 4095;
3693 break;
3694 case AArch64::STPXpre:
3695 case AArch64::LDPXpost:
3696 case AArch64::STPDpre:
3697 case AArch64::LDPDpost:
3698 Scale = TypeSize::getFixed(8);
3699 Width = TypeSize::getFixed(8);
3700 MinOffset = -512;
3701 MaxOffset = 504;
3702 break;
3703 case AArch64::STPQpre:
3704 case AArch64::LDPQpost:
3705 Scale = TypeSize::getFixed(16);
3706 Width = TypeSize::getFixed(16);
3707 MinOffset = -1024;
3708 MaxOffset = 1008;
3709 break;
3710 case AArch64::STRXpre:
3711 case AArch64::STRDpre:
3712 case AArch64::LDRXpost:
3713 case AArch64::LDRDpost:
3714 Scale = TypeSize::getFixed(1);
3715 Width = TypeSize::getFixed(8);
3716 MinOffset = -256;
3717 MaxOffset = 255;
3718 break;
3719 case AArch64::STRQpre:
3720 case AArch64::LDRQpost:
3721 Scale = TypeSize::getFixed(1);
3722 Width = TypeSize::getFixed(16);
3723 MinOffset = -256;
3724 MaxOffset = 255;
3725 break;
3726 case AArch64::ADDG:
3727 Scale = TypeSize::getFixed(16);
3728 Width = TypeSize::getFixed(0);
3729 MinOffset = 0;
3730 MaxOffset = 63;
3731 break;
3732 case AArch64::TAGPstack:
3733 Scale = TypeSize::getFixed(16);
3734 Width = TypeSize::getFixed(0);
3735 // TAGP with a negative offset turns into SUBP, which has a maximum offset
3736 // of 63 (not 64!).
3737 MinOffset = -63;
3738 MaxOffset = 63;
3739 break;
3740 case AArch64::LDG:
3741 case AArch64::STGi:
3742 case AArch64::STZGi:
3743 Scale = TypeSize::getFixed(16);
3744 Width = TypeSize::getFixed(16);
3745 MinOffset = -256;
3746 MaxOffset = 255;
3747 break;
3748 case AArch64::STR_ZZZZXI:
3749 case AArch64::LDR_ZZZZXI:
3750 Scale = TypeSize::getScalable(16);
3751 Width = TypeSize::getScalable(16 * 4);
3752 MinOffset = -256;
3753 MaxOffset = 252;
3754 break;
3755 case AArch64::STR_ZZZXI:
3756 case AArch64::LDR_ZZZXI:
3757 Scale = TypeSize::getScalable(16);
3758 Width = TypeSize::getScalable(16 * 3);
3759 MinOffset = -256;
3760 MaxOffset = 253;
3761 break;
3762 case AArch64::STR_ZZXI:
3763 case AArch64::LDR_ZZXI:
3764 Scale = TypeSize::getScalable(16);
3765 Width = TypeSize::getScalable(16 * 2);
3766 MinOffset = -256;
3767 MaxOffset = 254;
3768 break;
3769 case AArch64::LDR_PXI:
3770 case AArch64::STR_PXI:
3771 Scale = TypeSize::getScalable(2);
3772 Width = TypeSize::getScalable(2);
3773 MinOffset = -256;
3774 MaxOffset = 255;
3775 break;
3776 case AArch64::LDR_PPXI:
3777 case AArch64::STR_PPXI:
3778 Scale = TypeSize::getScalable(2);
3779 Width = TypeSize::getScalable(2 * 2);
3780 MinOffset = -256;
3781 MaxOffset = 254;
3782 break;
3783 case AArch64::LDR_ZXI:
3784 case AArch64::STR_ZXI:
3785 Scale = TypeSize::getScalable(16);
3786 Width = TypeSize::getScalable(16);
3787 MinOffset = -256;
3788 MaxOffset = 255;
3789 break;
3790 case AArch64::LD1B_IMM:
3791 case AArch64::LD1H_IMM:
3792 case AArch64::LD1W_IMM:
3793 case AArch64::LD1D_IMM:
3794 case AArch64::LDNT1B_ZRI:
3795 case AArch64::LDNT1H_ZRI:
3796 case AArch64::LDNT1W_ZRI:
3797 case AArch64::LDNT1D_ZRI:
3798 case AArch64::ST1B_IMM:
3799 case AArch64::ST1H_IMM:
3800 case AArch64::ST1W_IMM:
3801 case AArch64::ST1D_IMM:
3802 case AArch64::STNT1B_ZRI:
3803 case AArch64::STNT1H_ZRI:
3804 case AArch64::STNT1W_ZRI:
3805 case AArch64::STNT1D_ZRI:
3806 case AArch64::LDNF1B_IMM:
3807 case AArch64::LDNF1H_IMM:
3808 case AArch64::LDNF1W_IMM:
3809 case AArch64::LDNF1D_IMM:
3810 // A full vector's worth of data
3811 // Width = mbytes * elements
3812 Scale = TypeSize::getScalable(16);
3813 Width = TypeSize::getScalable(16);
3814 MinOffset = -8;
3815 MaxOffset = 7;
3816 break;
3817 case AArch64::LD2B_IMM:
3818 case AArch64::LD2H_IMM:
3819 case AArch64::LD2W_IMM:
3820 case AArch64::LD2D_IMM:
3821 case AArch64::ST2B_IMM:
3822 case AArch64::ST2H_IMM:
3823 case AArch64::ST2W_IMM:
3824 case AArch64::ST2D_IMM:
3825 Scale = TypeSize::getScalable(32);
3826 Width = TypeSize::getScalable(16 * 2);
3827 MinOffset = -8;
3828 MaxOffset = 7;
3829 break;
3830 case AArch64::LD3B_IMM:
3831 case AArch64::LD3H_IMM:
3832 case AArch64::LD3W_IMM:
3833 case AArch64::LD3D_IMM:
3834 case AArch64::ST3B_IMM:
3835 case AArch64::ST3H_IMM:
3836 case AArch64::ST3W_IMM:
3837 case AArch64::ST3D_IMM:
3838 Scale = TypeSize::getScalable(48);
3839 Width = TypeSize::getScalable(16 * 3);
3840 MinOffset = -8;
3841 MaxOffset = 7;
3842 break;
3843 case AArch64::LD4B_IMM:
3844 case AArch64::LD4H_IMM:
3845 case AArch64::LD4W_IMM:
3846 case AArch64::LD4D_IMM:
3847 case AArch64::ST4B_IMM:
3848 case AArch64::ST4H_IMM:
3849 case AArch64::ST4W_IMM:
3850 case AArch64::ST4D_IMM:
3851 Scale = TypeSize::getScalable(64);
3852 Width = TypeSize::getScalable(16 * 4);
3853 MinOffset = -8;
3854 MaxOffset = 7;
3855 break;
3856 case AArch64::LD1B_H_IMM:
3857 case AArch64::LD1SB_H_IMM:
3858 case AArch64::LD1H_S_IMM:
3859 case AArch64::LD1SH_S_IMM:
3860 case AArch64::LD1W_D_IMM:
3861 case AArch64::LD1SW_D_IMM:
3862 case AArch64::ST1B_H_IMM:
3863 case AArch64::ST1H_S_IMM:
3864 case AArch64::ST1W_D_IMM:
3865 case AArch64::LDNF1B_H_IMM:
3866 case AArch64::LDNF1SB_H_IMM:
3867 case AArch64::LDNF1H_S_IMM:
3868 case AArch64::LDNF1SH_S_IMM:
3869 case AArch64::LDNF1W_D_IMM:
3870 case AArch64::LDNF1SW_D_IMM:
3871 // A half vector's worth of data
3872 // Width = mbytes * elements
3873 Scale = TypeSize::getScalable(8);
3874 Width = TypeSize::getScalable(8);
3875 MinOffset = -8;
3876 MaxOffset = 7;
3877 break;
3878 case AArch64::LD1B_S_IMM:
3879 case AArch64::LD1SB_S_IMM:
3880 case AArch64::LD1H_D_IMM:
3881 case AArch64::LD1SH_D_IMM:
3882 case AArch64::ST1B_S_IMM:
3883 case AArch64::ST1H_D_IMM:
3884 case AArch64::LDNF1B_S_IMM:
3885 case AArch64::LDNF1SB_S_IMM:
3886 case AArch64::LDNF1H_D_IMM:
3887 case AArch64::LDNF1SH_D_IMM:
3888 // A quarter vector's worth of data
3889 // Width = mbytes * elements
3890 Scale = TypeSize::getScalable(4);
3891 Width = TypeSize::getScalable(4);
3892 MinOffset = -8;
3893 MaxOffset = 7;
3894 break;
3895 case AArch64::LD1B_D_IMM:
3896 case AArch64::LD1SB_D_IMM:
3897 case AArch64::ST1B_D_IMM:
3898 case AArch64::LDNF1B_D_IMM:
3899 case AArch64::LDNF1SB_D_IMM:
3900 // An eighth vector's worth of data
3901 // Width = mbytes * elements
3902 Scale = TypeSize::getScalable(2);
3903 Width = TypeSize::getScalable(2);
3904 MinOffset = -8;
3905 MaxOffset = 7;
3906 break;
3907 case AArch64::ST2Gi:
3908 case AArch64::STZ2Gi:
3909 Scale = TypeSize::getFixed(16);
3910 Width = TypeSize::getFixed(32);
3911 MinOffset = -256;
3912 MaxOffset = 255;
3913 break;
3914 case AArch64::STGPi:
3915 Scale = TypeSize::getFixed(16);
3916 Width = TypeSize::getFixed(16);
3917 MinOffset = -64;
3918 MaxOffset = 63;
3919 break;
3920 case AArch64::LD1RB_IMM:
3921 case AArch64::LD1RB_H_IMM:
3922 case AArch64::LD1RB_S_IMM:
3923 case AArch64::LD1RB_D_IMM:
3924 case AArch64::LD1RSB_H_IMM:
3925 case AArch64::LD1RSB_S_IMM:
3926 case AArch64::LD1RSB_D_IMM:
3927 Scale = TypeSize::getFixed(1);
3928 Width = TypeSize::getFixed(1);
3929 MinOffset = 0;
3930 MaxOffset = 63;
3931 break;
3932 case AArch64::LD1RH_IMM:
3933 case AArch64::LD1RH_S_IMM:
3934 case AArch64::LD1RH_D_IMM:
3935 case AArch64::LD1RSH_S_IMM:
3936 case AArch64::LD1RSH_D_IMM:
3937 Scale = TypeSize::getFixed(2);
3938 Width = TypeSize::getFixed(2);
3939 MinOffset = 0;
3940 MaxOffset = 63;
3941 break;
3942 case AArch64::LD1RW_IMM:
3943 case AArch64::LD1RW_D_IMM:
3944 case AArch64::LD1RSW_IMM:
3945 Scale = TypeSize::getFixed(4);
3946 Width = TypeSize::getFixed(4);
3947 MinOffset = 0;
3948 MaxOffset = 63;
3949 break;
3950 case AArch64::LD1RD_IMM:
3951 Scale = TypeSize::getFixed(8);
3952 Width = TypeSize::getFixed(8);
3953 MinOffset = 0;
3954 MaxOffset = 63;
3955 break;
3956 }
3957
3958 return true;
3959 }
3960
3961 // Scaling factor (bytes per element) for a scaled or unscaled load or store.
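// e.g. getMemScale(AArch64::LDURXi) == 8, and getMemScale(AArch64::LDPXi) == 8
// per element of the pair.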
3962 int AArch64InstrInfo::getMemScale(unsigned Opc) {
3963 switch (Opc) {
3964 default:
3965 llvm_unreachable("Opcode has unknown scale!");
3966 case AArch64::LDRBBui:
3967 case AArch64::LDURBBi:
3968 case AArch64::LDRSBWui:
3969 case AArch64::LDURSBWi:
3970 case AArch64::STRBBui:
3971 case AArch64::STURBBi:
3972 return 1;
3973 case AArch64::LDRHHui:
3974 case AArch64::LDURHHi:
3975 case AArch64::LDRSHWui:
3976 case AArch64::LDURSHWi:
3977 case AArch64::STRHHui:
3978 case AArch64::STURHHi:
3979 return 2;
3980 case AArch64::LDRSui:
3981 case AArch64::LDURSi:
3982 case AArch64::LDRSpre:
3983 case AArch64::LDRSWui:
3984 case AArch64::LDURSWi:
3985 case AArch64::LDRSWpre:
3986 case AArch64::LDRWpre:
3987 case AArch64::LDRWui:
3988 case AArch64::LDURWi:
3989 case AArch64::STRSui:
3990 case AArch64::STURSi:
3991 case AArch64::STRSpre:
3992 case AArch64::STRWui:
3993 case AArch64::STURWi:
3994 case AArch64::STRWpre:
3995 case AArch64::LDPSi:
3996 case AArch64::LDPSWi:
3997 case AArch64::LDPWi:
3998 case AArch64::STPSi:
3999 case AArch64::STPWi:
4000 return 4;
4001 case AArch64::LDRDui:
4002 case AArch64::LDURDi:
4003 case AArch64::LDRDpre:
4004 case AArch64::LDRXui:
4005 case AArch64::LDURXi:
4006 case AArch64::LDRXpre:
4007 case AArch64::STRDui:
4008 case AArch64::STURDi:
4009 case AArch64::STRDpre:
4010 case AArch64::STRXui:
4011 case AArch64::STURXi:
4012 case AArch64::STRXpre:
4013 case AArch64::LDPDi:
4014 case AArch64::LDPXi:
4015 case AArch64::STPDi:
4016 case AArch64::STPXi:
4017 return 8;
4018 case AArch64::LDRQui:
4019 case AArch64::LDURQi:
4020 case AArch64::STRQui:
4021 case AArch64::STURQi:
4022 case AArch64::STRQpre:
4023 case AArch64::LDPQi:
4024 case AArch64::LDRQpre:
4025 case AArch64::STPQi:
4026 case AArch64::STGi:
4027 case AArch64::STZGi:
4028 case AArch64::ST2Gi:
4029 case AArch64::STZ2Gi:
4030 case AArch64::STGPi:
4031 return 16;
4032 }
4033 }
4034
4035 bool AArch64InstrInfo::isPreLd(const MachineInstr &MI) {
4036 switch (MI.getOpcode()) {
4037 default:
4038 return false;
4039 case AArch64::LDRWpre:
4040 case AArch64::LDRXpre:
4041 case AArch64::LDRSWpre:
4042 case AArch64::LDRSpre:
4043 case AArch64::LDRDpre:
4044 case AArch64::LDRQpre:
4045 return true;
4046 }
4047 }
4048
4049 bool AArch64InstrInfo::isPreSt(const MachineInstr &MI) {
4050 switch (MI.getOpcode()) {
4051 default:
4052 return false;
4053 case AArch64::STRWpre:
4054 case AArch64::STRXpre:
4055 case AArch64::STRSpre:
4056 case AArch64::STRDpre:
4057 case AArch64::STRQpre:
4058 return true;
4059 }
4060 }
4061
4062 bool AArch64InstrInfo::isPreLdSt(const MachineInstr &MI) {
4063 return isPreLd(MI) || isPreSt(MI);
4064 }
4065
4066 bool AArch64InstrInfo::isPairedLdSt(const MachineInstr &MI) {
4067 switch (MI.getOpcode()) {
4068 default:
4069 return false;
4070 case AArch64::LDPSi:
4071 case AArch64::LDPSWi:
4072 case AArch64::LDPDi:
4073 case AArch64::LDPQi:
4074 case AArch64::LDPWi:
4075 case AArch64::LDPXi:
4076 case AArch64::STPSi:
4077 case AArch64::STPDi:
4078 case AArch64::STPQi:
4079 case AArch64::STPWi:
4080 case AArch64::STPXi:
4081 case AArch64::STGPi:
4082 return true;
4083 }
4084 }
4085
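// Return the base register operand of a load/store. Paired and pre-indexed
// instructions carry an extra register operand before the base (a second data
// register or the writeback register), so their base lives at operand index 2
// rather than 1; e.g. in "ldp x0, x1, [x2, #16]" the base x2 is operand 2 and
// the immediate is operand 3.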
4086 const MachineOperand &AArch64InstrInfo::getLdStBaseOp(const MachineInstr &MI) {
4087 unsigned Idx =
4088 AArch64InstrInfo::isPairedLdSt(MI) || AArch64InstrInfo::isPreLdSt(MI) ? 2
4089 : 1;
4090 return MI.getOperand(Idx);
4091 }
4092
4093 const MachineOperand &
4094 AArch64InstrInfo::getLdStOffsetOp(const MachineInstr &MI) {
4095 unsigned Idx =
4096 AArch64InstrInfo::isPairedLdSt(MI) || AArch64InstrInfo::isPreLdSt(MI) ? 3
4097 : 2;
4098 return MI.getOperand(Idx);
4099 }
4100
4101 static const TargetRegisterClass *getRegClass(const MachineInstr &MI,
4102 Register Reg) {
4103 if (MI.getParent() == nullptr)
4104 return nullptr;
4105 const MachineFunction *MF = MI.getParent()->getParent();
4106 return MF ? MF->getRegInfo().getRegClassOrNull(Reg) : nullptr;
4107 }
4108
4109 bool AArch64InstrInfo::isHForm(const MachineInstr &MI) {
4110 auto IsHFPR = [&](const MachineOperand &Op) {
4111 if (!Op.isReg())
4112 return false;
4113 auto Reg = Op.getReg();
4114 if (Reg.isPhysical())
4115 return AArch64::FPR16RegClass.contains(Reg);
4116 const TargetRegisterClass *TRC = ::getRegClass(MI, Reg);
4117 return TRC == &AArch64::FPR16RegClass ||
4118 TRC == &AArch64::FPR16_loRegClass;
4119 };
4120 return llvm::any_of(MI.operands(), IsHFPR);
4121 }
4122
4123 bool AArch64InstrInfo::isQForm(const MachineInstr &MI) {
4124 auto IsQFPR = [&](const MachineOperand &Op) {
4125 if (!Op.isReg())
4126 return false;
4127 auto Reg = Op.getReg();
4128 if (Reg.isPhysical())
4129 return AArch64::FPR128RegClass.contains(Reg);
4130 const TargetRegisterClass *TRC = ::getRegClass(MI, Reg);
4131 return TRC == &AArch64::FPR128RegClass ||
4132 TRC == &AArch64::FPR128_loRegClass;
4133 };
4134 return llvm::any_of(MI.operands(), IsQFPR);
4135 }
4136
4137 bool AArch64InstrInfo::hasBTISemantics(const MachineInstr &MI) {
4138 switch (MI.getOpcode()) {
4139 case AArch64::BRK:
4140 case AArch64::HLT:
4141 case AArch64::PACIASP:
4142 case AArch64::PACIBSP:
4143 // Implicit BTI behavior.
4144 return true;
4145 case AArch64::PAUTH_PROLOGUE:
4146 // PAUTH_PROLOGUE expands to PACI(A|B)SP.
4147 return true;
4148 case AArch64::HINT: {
4149 unsigned Imm = MI.getOperand(0).getImm();
4150 // Explicit BTI instruction.
4151 if (Imm == 32 || Imm == 34 || Imm == 36 || Imm == 38)
4152 return true;
4153 // PACI(A|B)SP instructions.
4154 if (Imm == 25 || Imm == 27)
4155 return true;
4156 return false;
4157 }
4158 default:
4159 return false;
4160 }
4161 }
4162
4163 bool AArch64InstrInfo::isFpOrNEON(const MachineInstr &MI) {
4164 auto IsFPR = [&](const MachineOperand &Op) {
4165 if (!Op.isReg())
4166 return false;
4167 auto Reg = Op.getReg();
4168 if (Reg.isPhysical())
4169 return AArch64::FPR128RegClass.contains(Reg) ||
4170 AArch64::FPR64RegClass.contains(Reg) ||
4171 AArch64::FPR32RegClass.contains(Reg) ||
4172 AArch64::FPR16RegClass.contains(Reg) ||
4173 AArch64::FPR8RegClass.contains(Reg);
4174
4175 const TargetRegisterClass *TRC = ::getRegClass(MI, Reg);
4176 return TRC == &AArch64::FPR128RegClass ||
4177 TRC == &AArch64::FPR128_loRegClass ||
4178 TRC == &AArch64::FPR64RegClass ||
4179 TRC == &AArch64::FPR64_loRegClass ||
4180 TRC == &AArch64::FPR32RegClass || TRC == &AArch64::FPR16RegClass ||
4181 TRC == &AArch64::FPR8RegClass;
4182 };
4183 return llvm::any_of(MI.operands(), IsFPR);
4184 }
4185
4186 // Scale a byte offset down to an element offset. Returns false if the byte
4187 // offset is not a multiple of the access size and so cannot be scaled.
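// e.g. for Opc == AArch64::LDURXi (stride 8) an offset of 24 scales to 3,
// while an offset of 20 cannot be scaled and is rejected.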
4188 static bool scaleOffset(unsigned Opc, int64_t &Offset) {
4189 int Scale = AArch64InstrInfo::getMemScale(Opc);
4190
4191 // If the byte-offset isn't a multiple of the stride, we can't scale this
4192 // offset.
4193 if (Offset % Scale != 0)
4194 return false;
4195
4196 // Convert the byte-offset used by unscaled into an "element" offset used
4197 // by the scaled pair load/store instructions.
4198 Offset /= Scale;
4199 return true;
4200 }
4201
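// Returns true if the two opcodes may form a single paired load/store. For
// example, canPairLdStOpc(AArch64::LDRWui, AArch64::LDRSWui) is true (zero-
// and sign-extending 32-bit loads can pair), while loads of different sizes
// never pair.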
4202 static bool canPairLdStOpc(unsigned FirstOpc, unsigned SecondOpc) {
4203 if (FirstOpc == SecondOpc)
4204 return true;
4205 // We can also pair sign-ext and zero-ext instructions.
4206 switch (FirstOpc) {
4207 default:
4208 return false;
4209 case AArch64::LDRQui:
4210 case AArch64::LDURQi:
4211 return SecondOpc == AArch64::LDRQui || SecondOpc == AArch64::LDURQi;
4212 case AArch64::LDRWui:
4213 case AArch64::LDURWi:
4214 return SecondOpc == AArch64::LDRSWui || SecondOpc == AArch64::LDURSWi;
4215 case AArch64::LDRSWui:
4216 case AArch64::LDURSWi:
4217 return SecondOpc == AArch64::LDRWui || SecondOpc == AArch64::LDURWi;
4218 }
4219 // These instructions can't be paired based on their opcodes.
4220 return false;
4221 }
4222
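// Decide whether two frame-index based memory operations may be clustered.
// For fixed stack objects the object offsets are folded in: e.g. two 8-byte
// accesses with object offsets 0 and 8 and instruction offsets of 0 scale to
// element offsets 0 and 1, which counts as adjacent.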
4223 static bool shouldClusterFI(const MachineFrameInfo &MFI, int FI1,
4224 int64_t Offset1, unsigned Opcode1, int FI2,
4225 int64_t Offset2, unsigned Opcode2) {
4226 // Accesses through fixed stack object frame indices may access a different
4227 // fixed stack slot; check that the combined object+instruction offsets match.
4228 if (MFI.isFixedObjectIndex(FI1) && MFI.isFixedObjectIndex(FI2)) {
4229 int64_t ObjectOffset1 = MFI.getObjectOffset(FI1);
4230 int64_t ObjectOffset2 = MFI.getObjectOffset(FI2);
4231 assert(ObjectOffset1 <= ObjectOffset2 && "Object offsets are not ordered.");
4232 // Convert to scaled object offsets.
4233 int Scale1 = AArch64InstrInfo::getMemScale(Opcode1);
4234 if (ObjectOffset1 % Scale1 != 0)
4235 return false;
4236 ObjectOffset1 /= Scale1;
4237 int Scale2 = AArch64InstrInfo::getMemScale(Opcode2);
4238 if (ObjectOffset2 % Scale2 != 0)
4239 return false;
4240 ObjectOffset2 /= Scale2;
4241 ObjectOffset1 += Offset1;
4242 ObjectOffset2 += Offset2;
4243 return ObjectOffset1 + 1 == ObjectOffset2;
4244 }
4245
4246 return FI1 == FI2;
4247 }
4248
4249 /// Detect opportunities for ldp/stp formation.
4250 ///
4251 /// Only called for LdSt for which getMemOperandWithOffset returns true.
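/// For example (illustrative), "ldr x0, [x2]" and "ldr x1, [x2, #8]" have
/// element offsets 0 and 1 and so are accepted as candidates to later become
/// "ldp x0, x1, [x2]".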
4252 bool AArch64InstrInfo::shouldClusterMemOps(
4253 ArrayRef<const MachineOperand *> BaseOps1, int64_t OpOffset1,
4254 bool OffsetIsScalable1, ArrayRef<const MachineOperand *> BaseOps2,
4255 int64_t OpOffset2, bool OffsetIsScalable2, unsigned ClusterSize,
4256 unsigned NumBytes) const {
4257 assert(BaseOps1.size() == 1 && BaseOps2.size() == 1);
4258 const MachineOperand &BaseOp1 = *BaseOps1.front();
4259 const MachineOperand &BaseOp2 = *BaseOps2.front();
4260 const MachineInstr &FirstLdSt = *BaseOp1.getParent();
4261 const MachineInstr &SecondLdSt = *BaseOp2.getParent();
4262 if (BaseOp1.getType() != BaseOp2.getType())
4263 return false;
4264
4265 assert((BaseOp1.isReg() || BaseOp1.isFI()) &&
4266 "Only base registers and frame indices are supported.");
4267
4268 // Check for both base regs and base FI.
4269 if (BaseOp1.isReg() && BaseOp1.getReg() != BaseOp2.getReg())
4270 return false;
4271
4272 // Only cluster up to a single pair.
4273 if (ClusterSize > 2)
4274 return false;
4275
4276 if (!isPairableLdStInst(FirstLdSt) || !isPairableLdStInst(SecondLdSt))
4277 return false;
4278
4279 // Can we pair these instructions based on their opcodes?
4280 unsigned FirstOpc = FirstLdSt.getOpcode();
4281 unsigned SecondOpc = SecondLdSt.getOpcode();
4282 if (!canPairLdStOpc(FirstOpc, SecondOpc))
4283 return false;
4284
4285 // Can't merge volatiles or load/stores that have a hint to avoid pair
4286 // formation, for example.
4287 if (!isCandidateToMergeOrPair(FirstLdSt) ||
4288 !isCandidateToMergeOrPair(SecondLdSt))
4289 return false;
4290
4291 // isCandidateToMergeOrPair guarantees that operand 2 is an immediate.
4292 int64_t Offset1 = FirstLdSt.getOperand(2).getImm();
4293 if (hasUnscaledLdStOffset(FirstOpc) && !scaleOffset(FirstOpc, Offset1))
4294 return false;
4295
4296 int64_t Offset2 = SecondLdSt.getOperand(2).getImm();
4297 if (hasUnscaledLdStOffset(SecondOpc) && !scaleOffset(SecondOpc, Offset2))
4298 return false;
4299
4300 // Pairwise instructions have a 7-bit signed offset field.
4301 if (Offset1 > 63 || Offset1 < -64)
4302 return false;
4303
4304 // The caller should already have ordered First/SecondLdSt by offset.
4305 // Note: this need not hold when the bases are distinct frame indices.
4306 if (BaseOp1.isFI()) {
4307 assert((!BaseOp1.isIdenticalTo(BaseOp2) || Offset1 <= Offset2) &&
4308 "Caller should have ordered offsets.");
4309
4310 const MachineFrameInfo &MFI =
4311 FirstLdSt.getParent()->getParent()->getFrameInfo();
4312 return shouldClusterFI(MFI, BaseOp1.getIndex(), Offset1, FirstOpc,
4313 BaseOp2.getIndex(), Offset2, SecondOpc);
4314 }
4315
4316 assert(Offset1 <= Offset2 && "Caller should have ordered offsets.");
4317
4318 return Offset1 + 1 == Offset2;
4319 }
4320
4321 static const MachineInstrBuilder &AddSubReg(const MachineInstrBuilder &MIB,
4322 unsigned Reg, unsigned SubIdx,
4323 unsigned State,
4324 const TargetRegisterInfo *TRI) {
4325 if (!SubIdx)
4326 return MIB.addReg(Reg, State);
4327
4328 if (Register::isPhysicalRegister(Reg))
4329 return MIB.addReg(TRI->getSubReg(Reg, SubIdx), State);
4330 return MIB.addReg(Reg, State, SubIdx);
4331 }
4332
4333 static bool forwardCopyWillClobberTuple(unsigned DestReg, unsigned SrcReg,
4334 unsigned NumRegs) {
4335 // We really want the positive remainder mod 32 here, which happens to be
4336 // easily obtainable with a mask.
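  // e.g. copying D0_D1_D2 to D1_D2_D3 sub-register by sub-register in forward
  // order would clobber D1 and D2 before they are read, so the caller copies
  // such overlapping tuples in reverse.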
4337 return ((DestReg - SrcReg) & 0x1f) < NumRegs;
4338 }
4339
4340 void AArch64InstrInfo::copyPhysRegTuple(MachineBasicBlock &MBB,
4341 MachineBasicBlock::iterator I,
4342 const DebugLoc &DL, MCRegister DestReg,
4343 MCRegister SrcReg, bool KillSrc,
4344 unsigned Opcode,
4345 ArrayRef<unsigned> Indices) const {
4346 assert(Subtarget.hasNEON() && "Unexpected register copy without NEON");
4347 const TargetRegisterInfo *TRI = &getRegisterInfo();
4348 uint16_t DestEncoding = TRI->getEncodingValue(DestReg);
4349 uint16_t SrcEncoding = TRI->getEncodingValue(SrcReg);
4350 unsigned NumRegs = Indices.size();
4351
4352 int SubReg = 0, End = NumRegs, Incr = 1;
4353 if (forwardCopyWillClobberTuple(DestEncoding, SrcEncoding, NumRegs)) {
4354 SubReg = NumRegs - 1;
4355 End = -1;
4356 Incr = -1;
4357 }
4358
4359 for (; SubReg != End; SubReg += Incr) {
4360 const MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(Opcode));
4361 AddSubReg(MIB, DestReg, Indices[SubReg], RegState::Define, TRI);
4362 AddSubReg(MIB, SrcReg, Indices[SubReg], 0, TRI);
4363 AddSubReg(MIB, SrcReg, Indices[SubReg], getKillRegState(KillSrc), TRI);
4364 }
4365 }
4366
4367 void AArch64InstrInfo::copyGPRRegTuple(MachineBasicBlock &MBB,
4368 MachineBasicBlock::iterator I,
4369 DebugLoc DL, unsigned DestReg,
4370 unsigned SrcReg, bool KillSrc,
4371 unsigned Opcode, unsigned ZeroReg,
4372 llvm::ArrayRef<unsigned> Indices) const {
4373 const TargetRegisterInfo *TRI = &getRegisterInfo();
4374 unsigned NumRegs = Indices.size();
4375
4376 #ifndef NDEBUG
4377 uint16_t DestEncoding = TRI->getEncodingValue(DestReg);
4378 uint16_t SrcEncoding = TRI->getEncodingValue(SrcReg);
4379 assert(DestEncoding % NumRegs == 0 && SrcEncoding % NumRegs == 0 &&
4380 "GPR reg sequences should not be able to overlap");
4381 #endif
4382
4383 for (unsigned SubReg = 0; SubReg != NumRegs; ++SubReg) {
4384 const MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(Opcode));
4385 AddSubReg(MIB, DestReg, Indices[SubReg], RegState::Define, TRI);
4386 MIB.addReg(ZeroReg);
4387 AddSubReg(MIB, SrcReg, Indices[SubReg], getKillRegState(KillSrc), TRI);
4388 MIB.addImm(0);
4389 }
4390 }
4391
4392 void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
4393 MachineBasicBlock::iterator I,
4394 const DebugLoc &DL, MCRegister DestReg,
4395 MCRegister SrcReg, bool KillSrc) const {
4396 if (AArch64::GPR32spRegClass.contains(DestReg) &&
4397 (AArch64::GPR32spRegClass.contains(SrcReg) || SrcReg == AArch64::WZR)) {
4398 const TargetRegisterInfo *TRI = &getRegisterInfo();
4399
4400 if (DestReg == AArch64::WSP || SrcReg == AArch64::WSP) {
4401 // If either operand is WSP, expand to ADD #0.
4402 if (Subtarget.hasZeroCycleRegMove()) {
4403 // Cyclone recognizes "ADD Xd, Xn, #0" as a zero-cycle register move.
4404 MCRegister DestRegX = TRI->getMatchingSuperReg(
4405 DestReg, AArch64::sub_32, &AArch64::GPR64spRegClass);
4406 MCRegister SrcRegX = TRI->getMatchingSuperReg(
4407 SrcReg, AArch64::sub_32, &AArch64::GPR64spRegClass);
4408 // This instruction is reading and writing X registers. This may upset
4409 // the register scavenger and machine verifier, so we need to indicate
4410 // that we are reading an undefined value from SrcRegX, but a proper
4411 // value from SrcReg.
4412 BuildMI(MBB, I, DL, get(AArch64::ADDXri), DestRegX)
4413 .addReg(SrcRegX, RegState::Undef)
4414 .addImm(0)
4415 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0))
4416 .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
4417 } else {
4418 BuildMI(MBB, I, DL, get(AArch64::ADDWri), DestReg)
4419 .addReg(SrcReg, getKillRegState(KillSrc))
4420 .addImm(0)
4421 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
4422 }
4423 } else if (SrcReg == AArch64::WZR && Subtarget.hasZeroCycleZeroingGP()) {
4424 BuildMI(MBB, I, DL, get(AArch64::MOVZWi), DestReg)
4425 .addImm(0)
4426 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
4427 } else {
4428 if (Subtarget.hasZeroCycleRegMove()) {
4429 // Cyclone recognizes "ORR Xd, XZR, Xm" as a zero-cycle register move.
4430 MCRegister DestRegX = TRI->getMatchingSuperReg(
4431 DestReg, AArch64::sub_32, &AArch64::GPR64spRegClass);
4432 MCRegister SrcRegX = TRI->getMatchingSuperReg(
4433 SrcReg, AArch64::sub_32, &AArch64::GPR64spRegClass);
4434 // This instruction is reading and writing X registers. This may upset
4435 // the register scavenger and machine verifier, so we need to indicate
4436 // that we are reading an undefined value from SrcRegX, but a proper
4437 // value from SrcReg.
4438 BuildMI(MBB, I, DL, get(AArch64::ORRXrr), DestRegX)
4439 .addReg(AArch64::XZR)
4440 .addReg(SrcRegX, RegState::Undef)
4441 .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
4442 } else {
4443 // Otherwise, expand to ORR WZR.
4444 BuildMI(MBB, I, DL, get(AArch64::ORRWrr), DestReg)
4445 .addReg(AArch64::WZR)
4446 .addReg(SrcReg, getKillRegState(KillSrc));
4447 }
4448 }
4449 return;
4450 }
4451
4452 // Copy a Predicate register by ORRing with itself.
4453 if (AArch64::PPRRegClass.contains(DestReg) &&
4454 AArch64::PPRRegClass.contains(SrcReg)) {
4455 assert(Subtarget.hasSVEorSME() && "Unexpected SVE register.");
4456 BuildMI(MBB, I, DL, get(AArch64::ORR_PPzPP), DestReg)
4457 .addReg(SrcReg) // Pg
4458 .addReg(SrcReg)
4459 .addReg(SrcReg, getKillRegState(KillSrc));
4460 return;
4461 }
4462
4463 // Copy a predicate-as-counter register by ORRing with itself as if it
4464 // were a regular predicate (mask) register.
4465 bool DestIsPNR = AArch64::PNRRegClass.contains(DestReg);
4466 bool SrcIsPNR = AArch64::PNRRegClass.contains(SrcReg);
4467 if (DestIsPNR || SrcIsPNR) {
4468 assert((Subtarget.hasSVE2p1() || Subtarget.hasSME2()) &&
4469 "Unexpected predicate-as-counter register.");
4470 auto ToPPR = [](MCRegister R) -> MCRegister {
4471 return (R - AArch64::PN0) + AArch64::P0;
4472 };
4473 MCRegister PPRSrcReg = SrcIsPNR ? ToPPR(SrcReg) : SrcReg;
4474 MCRegister PPRDestReg = DestIsPNR ? ToPPR(DestReg) : DestReg;
4475
4476 if (PPRSrcReg != PPRDestReg) {
4477 auto NewMI = BuildMI(MBB, I, DL, get(AArch64::ORR_PPzPP), PPRDestReg)
4478 .addReg(PPRSrcReg) // Pg
4479 .addReg(PPRSrcReg)
4480 .addReg(PPRSrcReg, getKillRegState(KillSrc));
4481 if (DestIsPNR)
4482 NewMI.addDef(DestReg, RegState::Implicit);
4483 }
4484 return;
4485 }
4486
4487 // Copy a Z register by ORRing with itself.
4488 if (AArch64::ZPRRegClass.contains(DestReg) &&
4489 AArch64::ZPRRegClass.contains(SrcReg)) {
4490 assert(Subtarget.hasSVEorSME() && "Unexpected SVE register.");
4491 BuildMI(MBB, I, DL, get(AArch64::ORR_ZZZ), DestReg)
4492 .addReg(SrcReg)
4493 .addReg(SrcReg, getKillRegState(KillSrc));
4494 return;
4495 }
4496
4497 // Copy a Z register pair by copying the individual sub-registers.
4498 if ((AArch64::ZPR2RegClass.contains(DestReg) ||
4499 AArch64::ZPR2StridedOrContiguousRegClass.contains(DestReg)) &&
4500 (AArch64::ZPR2RegClass.contains(SrcReg) ||
4501 AArch64::ZPR2StridedOrContiguousRegClass.contains(SrcReg))) {
4502 assert(Subtarget.hasSVEorSME() && "Unexpected SVE register.");
4503 static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1};
4504 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ,
4505 Indices);
4506 return;
4507 }
4508
4509 // Copy a Z register triple by copying the individual sub-registers.
4510 if (AArch64::ZPR3RegClass.contains(DestReg) &&
4511 AArch64::ZPR3RegClass.contains(SrcReg)) {
4512 assert(Subtarget.hasSVEorSME() && "Unexpected SVE register.");
4513 static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1,
4514 AArch64::zsub2};
4515 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ,
4516 Indices);
4517 return;
4518 }
4519
4520 // Copy a Z register quad by copying the individual sub-registers.
4521 if ((AArch64::ZPR4RegClass.contains(DestReg) ||
4522 AArch64::ZPR4StridedOrContiguousRegClass.contains(DestReg)) &&
4523 (AArch64::ZPR4RegClass.contains(SrcReg) ||
4524 AArch64::ZPR4StridedOrContiguousRegClass.contains(SrcReg))) {
4525 assert(Subtarget.hasSVEorSME() && "Unexpected SVE register.");
4526 static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1,
4527 AArch64::zsub2, AArch64::zsub3};
4528 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ,
4529 Indices);
4530 return;
4531 }
4532
4533 if (AArch64::GPR64spRegClass.contains(DestReg) &&
4534 (AArch64::GPR64spRegClass.contains(SrcReg) || SrcReg == AArch64::XZR)) {
4535 if (DestReg == AArch64::SP || SrcReg == AArch64::SP) {
4536 // If either operand is SP, expand to ADD #0.
4537 BuildMI(MBB, I, DL, get(AArch64::ADDXri), DestReg)
4538 .addReg(SrcReg, getKillRegState(KillSrc))
4539 .addImm(0)
4540 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
4541 } else if (SrcReg == AArch64::XZR && Subtarget.hasZeroCycleZeroingGP()) {
4542 BuildMI(MBB, I, DL, get(AArch64::MOVZXi), DestReg)
4543 .addImm(0)
4544 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
4545 } else {
4546 // Otherwise, expand to ORR XZR.
4547 BuildMI(MBB, I, DL, get(AArch64::ORRXrr), DestReg)
4548 .addReg(AArch64::XZR)
4549 .addReg(SrcReg, getKillRegState(KillSrc));
4550 }
4551 return;
4552 }
4553
4554 // Copy a DDDD register quad by copying the individual sub-registers.
4555 if (AArch64::DDDDRegClass.contains(DestReg) &&
4556 AArch64::DDDDRegClass.contains(SrcReg)) {
4557 static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1,
4558 AArch64::dsub2, AArch64::dsub3};
4559 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8,
4560 Indices);
4561 return;
4562 }
4563
4564 // Copy a DDD register triple by copying the individual sub-registers.
4565 if (AArch64::DDDRegClass.contains(DestReg) &&
4566 AArch64::DDDRegClass.contains(SrcReg)) {
4567 static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1,
4568 AArch64::dsub2};
4569 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8,
4570 Indices);
4571 return;
4572 }
4573
4574 // Copy a DD register pair by copying the individual sub-registers.
4575 if (AArch64::DDRegClass.contains(DestReg) &&
4576 AArch64::DDRegClass.contains(SrcReg)) {
4577 static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1};
4578 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8,
4579 Indices);
4580 return;
4581 }
4582
4583 // Copy a QQQQ register quad by copying the individual sub-registers.
4584 if (AArch64::QQQQRegClass.contains(DestReg) &&
4585 AArch64::QQQQRegClass.contains(SrcReg)) {
4586 static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1,
4587 AArch64::qsub2, AArch64::qsub3};
4588 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8,
4589 Indices);
4590 return;
4591 }
4592
4593 // Copy a QQQ register triple by copying the individual sub-registers.
4594 if (AArch64::QQQRegClass.contains(DestReg) &&
4595 AArch64::QQQRegClass.contains(SrcReg)) {
4596 static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1,
4597 AArch64::qsub2};
4598 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8,
4599 Indices);
4600 return;
4601 }
4602
4603 // Copy a QQ register pair by copying the individual sub-registers.
4604 if (AArch64::QQRegClass.contains(DestReg) &&
4605 AArch64::QQRegClass.contains(SrcReg)) {
4606 static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1};
4607 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8,
4608 Indices);
4609 return;
4610 }
4611
4612 if (AArch64::XSeqPairsClassRegClass.contains(DestReg) &&
4613 AArch64::XSeqPairsClassRegClass.contains(SrcReg)) {
4614 static const unsigned Indices[] = {AArch64::sube64, AArch64::subo64};
4615 copyGPRRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRXrs,
4616 AArch64::XZR, Indices);
4617 return;
4618 }
4619
4620 if (AArch64::WSeqPairsClassRegClass.contains(DestReg) &&
4621 AArch64::WSeqPairsClassRegClass.contains(SrcReg)) {
4622 static const unsigned Indices[] = {AArch64::sube32, AArch64::subo32};
4623 copyGPRRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRWrs,
4624 AArch64::WZR, Indices);
4625 return;
4626 }
4627
4628 if (AArch64::FPR128RegClass.contains(DestReg) &&
4629 AArch64::FPR128RegClass.contains(SrcReg)) {
4630 if (Subtarget.hasSVEorSME() && !Subtarget.isNeonAvailable())
4631 BuildMI(MBB, I, DL, get(AArch64::ORR_ZZZ))
4632 .addReg(AArch64::Z0 + (DestReg - AArch64::Q0), RegState::Define)
4633 .addReg(AArch64::Z0 + (SrcReg - AArch64::Q0))
4634 .addReg(AArch64::Z0 + (SrcReg - AArch64::Q0));
4635 else if (Subtarget.hasNEON())
4636 BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg)
4637 .addReg(SrcReg)
4638 .addReg(SrcReg, getKillRegState(KillSrc));
4639 else {
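      // No vector ORR is available, so bounce the 16-byte value through the
      // stack: "str qSrc, [sp, #-16]!" followed by "ldr qDst, [sp], #16".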
4640 BuildMI(MBB, I, DL, get(AArch64::STRQpre))
4641 .addReg(AArch64::SP, RegState::Define)
4642 .addReg(SrcReg, getKillRegState(KillSrc))
4643 .addReg(AArch64::SP)
4644 .addImm(-16);
4645       BuildMI(MBB, I, DL, get(AArch64::LDRQpost))
4646 .addReg(AArch64::SP, RegState::Define)
4647 .addReg(DestReg, RegState::Define)
4648 .addReg(AArch64::SP)
4649 .addImm(16);
4650 }
4651 return;
4652 }
4653
4654 if (AArch64::FPR64RegClass.contains(DestReg) &&
4655 AArch64::FPR64RegClass.contains(SrcReg)) {
4656 BuildMI(MBB, I, DL, get(AArch64::FMOVDr), DestReg)
4657 .addReg(SrcReg, getKillRegState(KillSrc));
4658 return;
4659 }
4660
4661 if (AArch64::FPR32RegClass.contains(DestReg) &&
4662 AArch64::FPR32RegClass.contains(SrcReg)) {
4663 BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
4664 .addReg(SrcReg, getKillRegState(KillSrc));
4665 return;
4666 }
4667
4668 if (AArch64::FPR16RegClass.contains(DestReg) &&
4669 AArch64::FPR16RegClass.contains(SrcReg)) {
4670 DestReg =
4671 RI.getMatchingSuperReg(DestReg, AArch64::hsub, &AArch64::FPR32RegClass);
4672 SrcReg =
4673 RI.getMatchingSuperReg(SrcReg, AArch64::hsub, &AArch64::FPR32RegClass);
4674 BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
4675 .addReg(SrcReg, getKillRegState(KillSrc));
4676 return;
4677 }
4678
4679 if (AArch64::FPR8RegClass.contains(DestReg) &&
4680 AArch64::FPR8RegClass.contains(SrcReg)) {
4681 DestReg =
4682 RI.getMatchingSuperReg(DestReg, AArch64::bsub, &AArch64::FPR32RegClass);
4683 SrcReg =
4684 RI.getMatchingSuperReg(SrcReg, AArch64::bsub, &AArch64::FPR32RegClass);
4685 BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
4686 .addReg(SrcReg, getKillRegState(KillSrc));
4687 return;
4688 }
4689
4690 // Copies between GPR64 and FPR64.
4691 if (AArch64::FPR64RegClass.contains(DestReg) &&
4692 AArch64::GPR64RegClass.contains(SrcReg)) {
4693 BuildMI(MBB, I, DL, get(AArch64::FMOVXDr), DestReg)
4694 .addReg(SrcReg, getKillRegState(KillSrc));
4695 return;
4696 }
4697 if (AArch64::GPR64RegClass.contains(DestReg) &&
4698 AArch64::FPR64RegClass.contains(SrcReg)) {
4699 BuildMI(MBB, I, DL, get(AArch64::FMOVDXr), DestReg)
4700 .addReg(SrcReg, getKillRegState(KillSrc));
4701 return;
4702 }
4703 // Copies between GPR32 and FPR32.
4704 if (AArch64::FPR32RegClass.contains(DestReg) &&
4705 AArch64::GPR32RegClass.contains(SrcReg)) {
4706 BuildMI(MBB, I, DL, get(AArch64::FMOVWSr), DestReg)
4707 .addReg(SrcReg, getKillRegState(KillSrc));
4708 return;
4709 }
4710 if (AArch64::GPR32RegClass.contains(DestReg) &&
4711 AArch64::FPR32RegClass.contains(SrcReg)) {
4712 BuildMI(MBB, I, DL, get(AArch64::FMOVSWr), DestReg)
4713 .addReg(SrcReg, getKillRegState(KillSrc));
4714 return;
4715 }
4716
4717 if (DestReg == AArch64::NZCV) {
4718 assert(AArch64::GPR64RegClass.contains(SrcReg) && "Invalid NZCV copy");
4719 BuildMI(MBB, I, DL, get(AArch64::MSR))
4720 .addImm(AArch64SysReg::NZCV)
4721 .addReg(SrcReg, getKillRegState(KillSrc))
4722 .addReg(AArch64::NZCV, RegState::Implicit | RegState::Define);
4723 return;
4724 }
4725
4726 if (SrcReg == AArch64::NZCV) {
4727 assert(AArch64::GPR64RegClass.contains(DestReg) && "Invalid NZCV copy");
4728 BuildMI(MBB, I, DL, get(AArch64::MRS), DestReg)
4729 .addImm(AArch64SysReg::NZCV)
4730 .addReg(AArch64::NZCV, RegState::Implicit | getKillRegState(KillSrc));
4731 return;
4732 }
4733
4734 #ifndef NDEBUG
4735 const TargetRegisterInfo &TRI = getRegisterInfo();
4736 errs() << TRI.getRegAsmName(DestReg) << " = COPY "
4737 << TRI.getRegAsmName(SrcReg) << "\n";
4738 #endif
4739 llvm_unreachable("unimplemented reg-to-reg copy");
4740 }
4741
4742 static void storeRegPairToStackSlot(const TargetRegisterInfo &TRI,
4743 MachineBasicBlock &MBB,
4744 MachineBasicBlock::iterator InsertBefore,
4745 const MCInstrDesc &MCID,
4746 Register SrcReg, bool IsKill,
4747 unsigned SubIdx0, unsigned SubIdx1, int FI,
4748 MachineMemOperand *MMO) {
4749 Register SrcReg0 = SrcReg;
4750 Register SrcReg1 = SrcReg;
4751 if (SrcReg.isPhysical()) {
4752 SrcReg0 = TRI.getSubReg(SrcReg, SubIdx0);
4753 SubIdx0 = 0;
4754 SrcReg1 = TRI.getSubReg(SrcReg, SubIdx1);
4755 SubIdx1 = 0;
4756 }
4757 BuildMI(MBB, InsertBefore, DebugLoc(), MCID)
4758 .addReg(SrcReg0, getKillRegState(IsKill), SubIdx0)
4759 .addReg(SrcReg1, getKillRegState(IsKill), SubIdx1)
4760 .addFrameIndex(FI)
4761 .addImm(0)
4762 .addMemOperand(MMO);
4763 }
4764
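// Spill SrcReg to stack slot FI, selecting the store opcode from the spill
// size of RC; e.g. a 16-byte FPR128 spill uses STRQui, while a 16-byte ZPR
// spill uses STR_ZXI and marks the slot as a scalable-vector stack object.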
4765 void AArch64InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
4766 MachineBasicBlock::iterator MBBI,
4767 Register SrcReg, bool isKill, int FI,
4768 const TargetRegisterClass *RC,
4769 const TargetRegisterInfo *TRI,
4770 Register VReg) const {
4771 MachineFunction &MF = *MBB.getParent();
4772 MachineFrameInfo &MFI = MF.getFrameInfo();
4773
4774 MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
4775 MachineMemOperand *MMO =
4776 MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore,
4777 MFI.getObjectSize(FI), MFI.getObjectAlign(FI));
4778 unsigned Opc = 0;
4779 bool Offset = true;
4780 MCRegister PNRReg = MCRegister::NoRegister;
4781 unsigned StackID = TargetStackID::Default;
4782 switch (TRI->getSpillSize(*RC)) {
4783 case 1:
4784 if (AArch64::FPR8RegClass.hasSubClassEq(RC))
4785 Opc = AArch64::STRBui;
4786 break;
4787 case 2:
4788 if (AArch64::FPR16RegClass.hasSubClassEq(RC))
4789 Opc = AArch64::STRHui;
4790 else if (AArch64::PPRRegClass.hasSubClassEq(RC)) {
4791 assert(Subtarget.hasSVEorSME() &&
4792 "Unexpected register store without SVE store instructions");
4793 Opc = AArch64::STR_PXI;
4794 StackID = TargetStackID::ScalableVector;
4795 } else if (AArch64::PNRRegClass.hasSubClassEq(RC)) {
4796 assert((Subtarget.hasSVE2p1() || Subtarget.hasSME2()) &&
4797 "Unexpected register store without SVE2p1 or SME2");
4798 if (SrcReg.isVirtual()) {
4799 auto NewSrcReg =
4800 MF.getRegInfo().createVirtualRegister(&AArch64::PPRRegClass);
4801 BuildMI(MBB, MBBI, DebugLoc(), get(TargetOpcode::COPY), NewSrcReg)
4802 .addReg(SrcReg);
4803 SrcReg = NewSrcReg;
4804 } else
4805 SrcReg = (SrcReg - AArch64::PN0) + AArch64::P0;
4806 Opc = AArch64::STR_PXI;
4807 StackID = TargetStackID::ScalableVector;
4808 }
4809 break;
4810 case 4:
4811 if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
4812 Opc = AArch64::STRWui;
4813 if (SrcReg.isVirtual())
4814 MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR32RegClass);
4815 else
4816 assert(SrcReg != AArch64::WSP);
4817 } else if (AArch64::FPR32RegClass.hasSubClassEq(RC))
4818 Opc = AArch64::STRSui;
4819 else if (AArch64::PPR2RegClass.hasSubClassEq(RC)) {
4820 Opc = AArch64::STR_PPXI;
4821 StackID = TargetStackID::ScalableVector;
4822 }
4823 break;
4824 case 8:
4825 if (AArch64::GPR64allRegClass.hasSubClassEq(RC)) {
4826 Opc = AArch64::STRXui;
4827 if (SrcReg.isVirtual())
4828 MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR64RegClass);
4829 else
4830 assert(SrcReg != AArch64::SP);
4831 } else if (AArch64::FPR64RegClass.hasSubClassEq(RC)) {
4832 Opc = AArch64::STRDui;
4833 } else if (AArch64::WSeqPairsClassRegClass.hasSubClassEq(RC)) {
4834 storeRegPairToStackSlot(getRegisterInfo(), MBB, MBBI,
4835 get(AArch64::STPWi), SrcReg, isKill,
4836 AArch64::sube32, AArch64::subo32, FI, MMO);
4837 return;
4838 }
4839 break;
4840 case 16:
4841 if (AArch64::FPR128RegClass.hasSubClassEq(RC))
4842 Opc = AArch64::STRQui;
4843 else if (AArch64::DDRegClass.hasSubClassEq(RC)) {
4844 assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
4845 Opc = AArch64::ST1Twov1d;
4846 Offset = false;
4847 } else if (AArch64::XSeqPairsClassRegClass.hasSubClassEq(RC)) {
4848 storeRegPairToStackSlot(getRegisterInfo(), MBB, MBBI,
4849 get(AArch64::STPXi), SrcReg, isKill,
4850 AArch64::sube64, AArch64::subo64, FI, MMO);
4851 return;
4852 } else if (AArch64::ZPRRegClass.hasSubClassEq(RC)) {
4853 assert(Subtarget.hasSVEorSME() &&
4854 "Unexpected register store without SVE store instructions");
4855 Opc = AArch64::STR_ZXI;
4856 StackID = TargetStackID::ScalableVector;
4857 }
4858 break;
4859 case 24:
4860 if (AArch64::DDDRegClass.hasSubClassEq(RC)) {
4861 assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
4862 Opc = AArch64::ST1Threev1d;
4863 Offset = false;
4864 }
4865 break;
4866 case 32:
4867 if (AArch64::DDDDRegClass.hasSubClassEq(RC)) {
4868 assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
4869 Opc = AArch64::ST1Fourv1d;
4870 Offset = false;
4871 } else if (AArch64::QQRegClass.hasSubClassEq(RC)) {
4872 assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
4873 Opc = AArch64::ST1Twov2d;
4874 Offset = false;
4875 } else if (AArch64::ZPR2RegClass.hasSubClassEq(RC) ||
4876 AArch64::ZPR2StridedOrContiguousRegClass.hasSubClassEq(RC)) {
4877 assert(Subtarget.hasSVEorSME() &&
4878 "Unexpected register store without SVE store instructions");
4879 Opc = AArch64::STR_ZZXI;
4880 StackID = TargetStackID::ScalableVector;
4881 }
4882 break;
4883 case 48:
4884 if (AArch64::QQQRegClass.hasSubClassEq(RC)) {
4885 assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
4886 Opc = AArch64::ST1Threev2d;
4887 Offset = false;
4888 } else if (AArch64::ZPR3RegClass.hasSubClassEq(RC)) {
4889 assert(Subtarget.hasSVEorSME() &&
4890 "Unexpected register store without SVE store instructions");
4891 Opc = AArch64::STR_ZZZXI;
4892 StackID = TargetStackID::ScalableVector;
4893 }
4894 break;
4895 case 64:
4896 if (AArch64::QQQQRegClass.hasSubClassEq(RC)) {
4897 assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
4898 Opc = AArch64::ST1Fourv2d;
4899 Offset = false;
4900 } else if (AArch64::ZPR4RegClass.hasSubClassEq(RC) ||
4901 AArch64::ZPR4StridedOrContiguousRegClass.hasSubClassEq(RC)) {
4902 assert(Subtarget.hasSVEorSME() &&
4903 "Unexpected register store without SVE store instructions");
4904 Opc = AArch64::STR_ZZZZXI;
4905 StackID = TargetStackID::ScalableVector;
4906 }
4907 break;
4908 }
4909 assert(Opc && "Unknown register class");
4910 MFI.setStackID(FI, StackID);
4911
4912 const MachineInstrBuilder MI = BuildMI(MBB, MBBI, DebugLoc(), get(Opc))
4913 .addReg(SrcReg, getKillRegState(isKill))
4914 .addFrameIndex(FI);
4915
4916 if (Offset)
4917 MI.addImm(0);
4918 if (PNRReg.isValid())
4919 MI.addDef(PNRReg, RegState::Implicit);
4920 MI.addMemOperand(MMO);
4921 }
4922
4923 static void loadRegPairFromStackSlot(const TargetRegisterInfo &TRI,
4924 MachineBasicBlock &MBB,
4925 MachineBasicBlock::iterator InsertBefore,
4926 const MCInstrDesc &MCID,
4927 Register DestReg, unsigned SubIdx0,
4928 unsigned SubIdx1, int FI,
4929 MachineMemOperand *MMO) {
4930 Register DestReg0 = DestReg;
4931 Register DestReg1 = DestReg;
4932 bool IsUndef = true;
4933 if (DestReg.isPhysical()) {
4934 DestReg0 = TRI.getSubReg(DestReg, SubIdx0);
4935 SubIdx0 = 0;
4936 DestReg1 = TRI.getSubReg(DestReg, SubIdx1);
4937 SubIdx1 = 0;
4938 IsUndef = false;
4939 }
4940 BuildMI(MBB, InsertBefore, DebugLoc(), MCID)
4941 .addReg(DestReg0, RegState::Define | getUndefRegState(IsUndef), SubIdx0)
4942 .addReg(DestReg1, RegState::Define | getUndefRegState(IsUndef), SubIdx1)
4943 .addFrameIndex(FI)
4944 .addImm(0)
4945 .addMemOperand(MMO);
4946 }
4947
4948 void AArch64InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
4949 MachineBasicBlock::iterator MBBI,
4950 Register DestReg, int FI,
4951 const TargetRegisterClass *RC,
4952 const TargetRegisterInfo *TRI,
4953 Register VReg) const {
4954 MachineFunction &MF = *MBB.getParent();
4955 MachineFrameInfo &MFI = MF.getFrameInfo();
4956 MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
4957 MachineMemOperand *MMO =
4958 MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOLoad,
4959 MFI.getObjectSize(FI), MFI.getObjectAlign(FI));
4960
4961 unsigned Opc = 0;
4962 bool Offset = true;
4963 unsigned StackID = TargetStackID::Default;
4964 Register PNRReg = MCRegister::NoRegister;
4965 switch (TRI->getSpillSize(*RC)) {
4966 case 1:
4967 if (AArch64::FPR8RegClass.hasSubClassEq(RC))
4968 Opc = AArch64::LDRBui;
4969 break;
4970 case 2:
4971 if (AArch64::FPR16RegClass.hasSubClassEq(RC))
4972 Opc = AArch64::LDRHui;
4973 else if (AArch64::PPRRegClass.hasSubClassEq(RC)) {
4974 assert(Subtarget.hasSVEorSME() &&
4975 "Unexpected register load without SVE load instructions");
4976 Opc = AArch64::LDR_PXI;
4977 StackID = TargetStackID::ScalableVector;
4978 } else if (AArch64::PNRRegClass.hasSubClassEq(RC)) {
4979 assert((Subtarget.hasSVE2p1() || Subtarget.hasSME2()) &&
4980 "Unexpected register load without SVE2p1 or SME2");
4981 PNRReg = DestReg;
4982 if (DestReg.isVirtual())
4983 DestReg = MF.getRegInfo().createVirtualRegister(&AArch64::PPRRegClass);
4984 else
4985 DestReg = (DestReg - AArch64::PN0) + AArch64::P0;
4986 Opc = AArch64::LDR_PXI;
4987 StackID = TargetStackID::ScalableVector;
4988 }
4989 break;
4990 case 4:
4991 if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
4992 Opc = AArch64::LDRWui;
4993 if (DestReg.isVirtual())
4994 MF.getRegInfo().constrainRegClass(DestReg, &AArch64::GPR32RegClass);
4995 else
4996 assert(DestReg != AArch64::WSP);
4997 } else if (AArch64::FPR32RegClass.hasSubClassEq(RC))
4998 Opc = AArch64::LDRSui;
4999 else if (AArch64::PPR2RegClass.hasSubClassEq(RC)) {
5000 Opc = AArch64::LDR_PPXI;
5001 StackID = TargetStackID::ScalableVector;
5002 }
5003 break;
5004 case 8:
5005 if (AArch64::GPR64allRegClass.hasSubClassEq(RC)) {
5006 Opc = AArch64::LDRXui;
5007 if (DestReg.isVirtual())
5008 MF.getRegInfo().constrainRegClass(DestReg, &AArch64::GPR64RegClass);
5009 else
5010 assert(DestReg != AArch64::SP);
5011 } else if (AArch64::FPR64RegClass.hasSubClassEq(RC)) {
5012 Opc = AArch64::LDRDui;
5013 } else if (AArch64::WSeqPairsClassRegClass.hasSubClassEq(RC)) {
5014 loadRegPairFromStackSlot(getRegisterInfo(), MBB, MBBI,
5015 get(AArch64::LDPWi), DestReg, AArch64::sube32,
5016 AArch64::subo32, FI, MMO);
5017 return;
5018 }
5019 break;
5020 case 16:
5021 if (AArch64::FPR128RegClass.hasSubClassEq(RC))
5022 Opc = AArch64::LDRQui;
5023 else if (AArch64::DDRegClass.hasSubClassEq(RC)) {
5024 assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
5025 Opc = AArch64::LD1Twov1d;
5026 Offset = false;
5027 } else if (AArch64::XSeqPairsClassRegClass.hasSubClassEq(RC)) {
5028 loadRegPairFromStackSlot(getRegisterInfo(), MBB, MBBI,
5029 get(AArch64::LDPXi), DestReg, AArch64::sube64,
5030 AArch64::subo64, FI, MMO);
5031 return;
5032 } else if (AArch64::ZPRRegClass.hasSubClassEq(RC)) {
5033 assert(Subtarget.hasSVEorSME() &&
5034 "Unexpected register load without SVE load instructions");
5035 Opc = AArch64::LDR_ZXI;
5036 StackID = TargetStackID::ScalableVector;
5037 }
5038 break;
5039 case 24:
5040 if (AArch64::DDDRegClass.hasSubClassEq(RC)) {
5041 assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
5042 Opc = AArch64::LD1Threev1d;
5043 Offset = false;
5044 }
5045 break;
5046 case 32:
5047 if (AArch64::DDDDRegClass.hasSubClassEq(RC)) {
5048 assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
5049 Opc = AArch64::LD1Fourv1d;
5050 Offset = false;
5051 } else if (AArch64::QQRegClass.hasSubClassEq(RC)) {
5052 assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
5053 Opc = AArch64::LD1Twov2d;
5054 Offset = false;
5055 } else if (AArch64::ZPR2RegClass.hasSubClassEq(RC) ||
5056 AArch64::ZPR2StridedOrContiguousRegClass.hasSubClassEq(RC)) {
5057 assert(Subtarget.hasSVEorSME() &&
5058 "Unexpected register load without SVE load instructions");
5059 Opc = AArch64::LDR_ZZXI;
5060 StackID = TargetStackID::ScalableVector;
5061 }
5062 break;
5063 case 48:
5064 if (AArch64::QQQRegClass.hasSubClassEq(RC)) {
5065 assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
5066 Opc = AArch64::LD1Threev2d;
5067 Offset = false;
5068 } else if (AArch64::ZPR3RegClass.hasSubClassEq(RC)) {
5069 assert(Subtarget.hasSVEorSME() &&
5070 "Unexpected register load without SVE load instructions");
5071 Opc = AArch64::LDR_ZZZXI;
5072 StackID = TargetStackID::ScalableVector;
5073 }
5074 break;
5075 case 64:
5076 if (AArch64::QQQQRegClass.hasSubClassEq(RC)) {
5077 assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
5078 Opc = AArch64::LD1Fourv2d;
5079 Offset = false;
5080 } else if (AArch64::ZPR4RegClass.hasSubClassEq(RC) ||
5081 AArch64::ZPR4StridedOrContiguousRegClass.hasSubClassEq(RC)) {
5082 assert(Subtarget.hasSVEorSME() &&
5083 "Unexpected register load without SVE load instructions");
5084 Opc = AArch64::LDR_ZZZZXI;
5085 StackID = TargetStackID::ScalableVector;
5086 }
5087 break;
5088 }
5089
5090 assert(Opc && "Unknown register class");
5091 MFI.setStackID(FI, StackID);
5092
5093 const MachineInstrBuilder MI = BuildMI(MBB, MBBI, DebugLoc(), get(Opc))
5094 .addReg(DestReg, getDefRegState(true))
5095 .addFrameIndex(FI);
5096 if (Offset)
5097 MI.addImm(0);
5098 if (PNRReg.isValid() && !PNRReg.isVirtual())
5099 MI.addDef(PNRReg, RegState::Implicit);
5100 MI.addMemOperand(MMO);
5101
5102 if (PNRReg.isValid() && PNRReg.isVirtual())
5103 BuildMI(MBB, MBBI, DebugLoc(), get(TargetOpcode::COPY), PNRReg)
5104 .addReg(DestReg);
5105 }
5106
5107 bool llvm::isNZCVTouchedInInstructionRange(const MachineInstr &DefMI,
5108 const MachineInstr &UseMI,
5109 const TargetRegisterInfo *TRI) {
5110 return any_of(instructionsWithoutDebug(std::next(DefMI.getIterator()),
5111 UseMI.getIterator()),
5112 [TRI](const MachineInstr &I) {
5113 return I.modifiesRegister(AArch64::NZCV, TRI) ||
5114 I.readsRegister(AArch64::NZCV, TRI);
5115 });
5116 }
5117
5118 void AArch64InstrInfo::decomposeStackOffsetForDwarfOffsets(
5119 const StackOffset &Offset, int64_t &ByteSized, int64_t &VGSized) {
5120   // The smallest scalable elements supported by scaled SVE addressing
5121 // modes are predicates, which are 2 scalable bytes in size. So the scalable
5122 // byte offset must always be a multiple of 2.
5123 assert(Offset.getScalable() % 2 == 0 && "Invalid frame offset");
5124
5125   // VGSized offsets are divided by '2', because the VG register is the
5126   // number of 64-bit granules as opposed to 128-bit vector chunks,
5127 // which is how the 'n' in e.g. MVT::nxv1i8 is modelled.
5128 // So, for a stack offset of 16 MVT::nxv1i8's, the size is n x 16 bytes.
5129 // VG = n * 2 and the dwarf offset must be VG * 8 bytes.
5130 ByteSized = Offset.getFixed();
5131 VGSized = Offset.getScalable() / 2;
5132 }
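
// Illustrative example (values assumed, not taken from any particular
// function): a StackOffset of 16 fixed bytes plus 32 scalable bytes
// decomposes into ByteSized = 16 and VGSized = 32 / 2 = 16, i.e. the DWARF
// offset reads "16 + 16 * VG", with VG holding the number of 64-bit granules
// (2 * vscale).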
5133
5134 /// Returns, via its out-parameters, the parts into which this frame offset
5135 /// can be decomposed for the purpose of describing a frame offset.
5136 /// For non-scalable offsets this is simply the byte size.
5137 void AArch64InstrInfo::decomposeStackOffsetForFrameOffsets(
5138 const StackOffset &Offset, int64_t &NumBytes, int64_t &NumPredicateVectors,
5139 int64_t &NumDataVectors) {
5140   // The smallest scalable elements supported by scaled SVE addressing
5141 // modes are predicates, which are 2 scalable bytes in size. So the scalable
5142 // byte offset must always be a multiple of 2.
5143 assert(Offset.getScalable() % 2 == 0 && "Invalid frame offset");
5144
5145 NumBytes = Offset.getFixed();
5146 NumDataVectors = 0;
5147 NumPredicateVectors = Offset.getScalable() / 2;
5148   // This method computes the offsets used when adjusting the frame offset.
5149 // If the function requires ADDPL to be used and needs more than two ADDPL
5150 // instructions, part of the offset is folded into NumDataVectors so that it
5151 // uses ADDVL for part of it, reducing the number of ADDPL instructions.
5152 if (NumPredicateVectors % 8 == 0 || NumPredicateVectors < -64 ||
5153 NumPredicateVectors > 62) {
5154 NumDataVectors = NumPredicateVectors / 8;
5155 NumPredicateVectors -= NumDataVectors * 8;
5156 }
5157 }
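
// Worked example under assumed values: Offset.getScalable() == 144 gives 72
// predicate vectors. Since 72 % 8 == 0, the split above yields
// NumDataVectors = 9 and NumPredicateVectors = 0, so a single ADDVL #9 can
// replace the three ADDPLs that 72 would otherwise require.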
5158
5159 // Convenience function to create a DWARF expression for
5160 // Expr + NumBytes + NumVGScaledBytes * AArch64::VG
5161 static void appendVGScaledOffsetExpr(SmallVectorImpl<char> &Expr, int NumBytes,
5162 int NumVGScaledBytes, unsigned VG,
5163 llvm::raw_string_ostream &Comment) {
5164 uint8_t buffer[16];
5165
5166 if (NumBytes) {
5167 Expr.push_back(dwarf::DW_OP_consts);
5168 Expr.append(buffer, buffer + encodeSLEB128(NumBytes, buffer));
5169 Expr.push_back((uint8_t)dwarf::DW_OP_plus);
5170 Comment << (NumBytes < 0 ? " - " : " + ") << std::abs(NumBytes);
5171 }
5172
5173 if (NumVGScaledBytes) {
5174 Expr.push_back((uint8_t)dwarf::DW_OP_consts);
5175 Expr.append(buffer, buffer + encodeSLEB128(NumVGScaledBytes, buffer));
5176
5177 Expr.push_back((uint8_t)dwarf::DW_OP_bregx);
5178 Expr.append(buffer, buffer + encodeULEB128(VG, buffer));
5179 Expr.push_back(0);
5180
5181 Expr.push_back((uint8_t)dwarf::DW_OP_mul);
5182 Expr.push_back((uint8_t)dwarf::DW_OP_plus);
5183
5184 Comment << (NumVGScaledBytes < 0 ? " - " : " + ")
5185 << std::abs(NumVGScaledBytes) << " * VG";
5186 }
5187 }
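
// For example (illustrative operands), NumBytes = 16 and NumVGScaledBytes = 8
// append the operations
//   DW_OP_consts 16, DW_OP_plus,
//   DW_OP_consts 8, DW_OP_bregx VG 0, DW_OP_mul, DW_OP_plus
// and extend the comment with " + 16 + 8 * VG".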
5188
5189 // Creates an MCCFIInstruction:
5190 // { DW_CFA_def_cfa_expression, ULEB128 (sizeof expr), expr }
5191 static MCCFIInstruction createDefCFAExpression(const TargetRegisterInfo &TRI,
5192 unsigned Reg,
5193 const StackOffset &Offset) {
5194 int64_t NumBytes, NumVGScaledBytes;
5195 AArch64InstrInfo::decomposeStackOffsetForDwarfOffsets(Offset, NumBytes,
5196 NumVGScaledBytes);
5197 std::string CommentBuffer;
5198 llvm::raw_string_ostream Comment(CommentBuffer);
5199
5200 if (Reg == AArch64::SP)
5201 Comment << "sp";
5202 else if (Reg == AArch64::FP)
5203 Comment << "fp";
5204 else
5205 Comment << printReg(Reg, &TRI);
5206
5207 // Build up the expression (Reg + NumBytes + NumVGScaledBytes * AArch64::VG)
5208 SmallString<64> Expr;
5209 unsigned DwarfReg = TRI.getDwarfRegNum(Reg, true);
5210 Expr.push_back((uint8_t)(dwarf::DW_OP_breg0 + DwarfReg));
5211 Expr.push_back(0);
5212 appendVGScaledOffsetExpr(Expr, NumBytes, NumVGScaledBytes,
5213 TRI.getDwarfRegNum(AArch64::VG, true), Comment);
5214
5215 // Wrap this into DW_CFA_def_cfa.
5216 SmallString<64> DefCfaExpr;
5217 DefCfaExpr.push_back(dwarf::DW_CFA_def_cfa_expression);
5218 uint8_t buffer[16];
5219 DefCfaExpr.append(buffer, buffer + encodeULEB128(Expr.size(), buffer));
5220 DefCfaExpr.append(Expr.str());
5221 return MCCFIInstruction::createEscape(nullptr, DefCfaExpr.str(), SMLoc(),
5222 Comment.str());
5223 }
5224
5225 MCCFIInstruction llvm::createDefCFA(const TargetRegisterInfo &TRI,
5226 unsigned FrameReg, unsigned Reg,
5227 const StackOffset &Offset,
5228 bool LastAdjustmentWasScalable) {
5229 if (Offset.getScalable())
5230 return createDefCFAExpression(TRI, Reg, Offset);
5231
5232 if (FrameReg == Reg && !LastAdjustmentWasScalable)
5233 return MCCFIInstruction::cfiDefCfaOffset(nullptr, int(Offset.getFixed()));
5234
5235 unsigned DwarfReg = TRI.getDwarfRegNum(Reg, true);
5236 return MCCFIInstruction::cfiDefCfa(nullptr, DwarfReg, (int)Offset.getFixed());
5237 }
5238
5239 MCCFIInstruction llvm::createCFAOffset(const TargetRegisterInfo &TRI,
5240 unsigned Reg,
5241 const StackOffset &OffsetFromDefCFA) {
5242 int64_t NumBytes, NumVGScaledBytes;
5243 AArch64InstrInfo::decomposeStackOffsetForDwarfOffsets(
5244 OffsetFromDefCFA, NumBytes, NumVGScaledBytes);
5245
5246 unsigned DwarfReg = TRI.getDwarfRegNum(Reg, true);
5247
5248 // Non-scalable offsets can use DW_CFA_offset directly.
5249 if (!NumVGScaledBytes)
5250 return MCCFIInstruction::createOffset(nullptr, DwarfReg, NumBytes);
5251
5252 std::string CommentBuffer;
5253 llvm::raw_string_ostream Comment(CommentBuffer);
5254 Comment << printReg(Reg, &TRI) << " @ cfa";
5255
5256 // Build up expression (NumBytes + NumVGScaledBytes * AArch64::VG)
5257 SmallString<64> OffsetExpr;
5258 appendVGScaledOffsetExpr(OffsetExpr, NumBytes, NumVGScaledBytes,
5259 TRI.getDwarfRegNum(AArch64::VG, true), Comment);
5260
5261 // Wrap this into DW_CFA_expression
5262 SmallString<64> CfaExpr;
5263 CfaExpr.push_back(dwarf::DW_CFA_expression);
5264 uint8_t buffer[16];
5265 CfaExpr.append(buffer, buffer + encodeULEB128(DwarfReg, buffer));
5266 CfaExpr.append(buffer, buffer + encodeULEB128(OffsetExpr.size(), buffer));
5267 CfaExpr.append(OffsetExpr.str());
5268
5269 return MCCFIInstruction::createEscape(nullptr, CfaExpr.str(), SMLoc(),
5270 Comment.str());
5271 }
5272
5273 // Helper function to emit a frame offset adjustment from a given
5274 // pointer (SrcReg), stored into DestReg. This function is explicit
5275 // in that the caller must supply the opcode to use.
5276 static void emitFrameOffsetAdj(MachineBasicBlock &MBB,
5277 MachineBasicBlock::iterator MBBI,
5278 const DebugLoc &DL, unsigned DestReg,
5279 unsigned SrcReg, int64_t Offset, unsigned Opc,
5280 const TargetInstrInfo *TII,
5281 MachineInstr::MIFlag Flag, bool NeedsWinCFI,
5282 bool *HasWinCFI, bool EmitCFAOffset,
5283 StackOffset CFAOffset, unsigned FrameReg) {
5284 int Sign = 1;
5285 unsigned MaxEncoding, ShiftSize;
5286 switch (Opc) {
5287 case AArch64::ADDXri:
5288 case AArch64::ADDSXri:
5289 case AArch64::SUBXri:
5290 case AArch64::SUBSXri:
5291 MaxEncoding = 0xfff;
5292 ShiftSize = 12;
5293 break;
5294 case AArch64::ADDVL_XXI:
5295 case AArch64::ADDPL_XXI:
5296 case AArch64::ADDSVL_XXI:
5297 case AArch64::ADDSPL_XXI:
5298 MaxEncoding = 31;
5299 ShiftSize = 0;
5300 if (Offset < 0) {
5301 MaxEncoding = 32;
5302 Sign = -1;
5303 Offset = -Offset;
5304 }
5305 break;
5306 default:
5307 llvm_unreachable("Unsupported opcode");
5308 }
5309
5310 // `Offset` can be in bytes or in "scalable bytes".
5311 int VScale = 1;
5312 if (Opc == AArch64::ADDVL_XXI || Opc == AArch64::ADDSVL_XXI)
5313 VScale = 16;
5314 else if (Opc == AArch64::ADDPL_XXI || Opc == AArch64::ADDSPL_XXI)
5315 VScale = 2;
5316
5317 // FIXME: If the offset won't fit in 24-bits, compute the offset into a
5318 // scratch register. If DestReg is a virtual register, use it as the
5319 // scratch register; otherwise, create a new virtual register (to be
5320 // replaced by the scavenger at the end of PEI). That case can be optimized
5321 // slightly if DestReg is SP which is always 16-byte aligned, so the scratch
5322 // register can be loaded with offset%8 and the add/sub can use an extending
5323 // instruction with LSL#3.
5324 // Currently the function handles any offsets but generates a poor sequence
5325 // of code.
5326 // assert(Offset < (1 << 24) && "unimplemented reg plus immediate");
5327
5328 const unsigned MaxEncodableValue = MaxEncoding << ShiftSize;
5329 Register TmpReg = DestReg;
5330 if (TmpReg == AArch64::XZR)
5331 TmpReg = MBB.getParent()->getRegInfo().createVirtualRegister(
5332 &AArch64::GPR64RegClass);
5333 do {
5334 uint64_t ThisVal = std::min<uint64_t>(Offset, MaxEncodableValue);
5335 unsigned LocalShiftSize = 0;
5336 if (ThisVal > MaxEncoding) {
5337 ThisVal = ThisVal >> ShiftSize;
5338 LocalShiftSize = ShiftSize;
5339 }
5340 assert((ThisVal >> ShiftSize) <= MaxEncoding &&
5341 "Encoding cannot handle value that big");
5342
5343 Offset -= ThisVal << LocalShiftSize;
5344 if (Offset == 0)
5345 TmpReg = DestReg;
5346 auto MBI = BuildMI(MBB, MBBI, DL, TII->get(Opc), TmpReg)
5347 .addReg(SrcReg)
5348 .addImm(Sign * (int)ThisVal);
5349 if (ShiftSize)
5350 MBI = MBI.addImm(
5351 AArch64_AM::getShifterImm(AArch64_AM::LSL, LocalShiftSize));
5352 MBI = MBI.setMIFlag(Flag);
5353
5354 auto Change =
5355 VScale == 1
5356 ? StackOffset::getFixed(ThisVal << LocalShiftSize)
5357 : StackOffset::getScalable(VScale * (ThisVal << LocalShiftSize));
5358 if (Sign == -1 || Opc == AArch64::SUBXri || Opc == AArch64::SUBSXri)
5359 CFAOffset += Change;
5360 else
5361 CFAOffset -= Change;
5362 if (EmitCFAOffset && DestReg == TmpReg) {
5363 MachineFunction &MF = *MBB.getParent();
5364 const TargetSubtargetInfo &STI = MF.getSubtarget();
5365 const TargetRegisterInfo &TRI = *STI.getRegisterInfo();
5366
5367 unsigned CFIIndex = MF.addFrameInst(
5368 createDefCFA(TRI, FrameReg, DestReg, CFAOffset, VScale != 1));
5369 BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
5370 .addCFIIndex(CFIIndex)
5371 .setMIFlags(Flag);
5372 }
5373
5374 if (NeedsWinCFI) {
5375 assert(Sign == 1 && "SEH directives should always have a positive sign");
5376 int Imm = (int)(ThisVal << LocalShiftSize);
5377 if ((DestReg == AArch64::FP && SrcReg == AArch64::SP) ||
5378 (SrcReg == AArch64::FP && DestReg == AArch64::SP)) {
5379 if (HasWinCFI)
5380 *HasWinCFI = true;
5381 if (Imm == 0)
5382 BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_SetFP)).setMIFlag(Flag);
5383 else
5384 BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_AddFP))
5385 .addImm(Imm)
5386 .setMIFlag(Flag);
5387 assert(Offset == 0 && "Expected remaining offset to be zero to "
5388 "emit a single SEH directive");
5389 } else if (DestReg == AArch64::SP) {
5390 if (HasWinCFI)
5391 *HasWinCFI = true;
5392 assert(SrcReg == AArch64::SP && "Unexpected SrcReg for SEH_StackAlloc");
5393 BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_StackAlloc))
5394 .addImm(Imm)
5395 .setMIFlag(Flag);
5396 }
5397 }
5398
5399 SrcReg = TmpReg;
5400 } while (Offset);
5401 }
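
// As an example of the chunking above (operands assumed): materializing
// DestReg = SrcReg + 4097 with ADDXri takes two iterations, emitting
//   ADD Xd, Xn, #1, LSL #12   // adds 4096
//   ADD Xd, Xd, #1            // adds the remaining 1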
5402
5403 void llvm::emitFrameOffset(MachineBasicBlock &MBB,
5404 MachineBasicBlock::iterator MBBI, const DebugLoc &DL,
5405 unsigned DestReg, unsigned SrcReg,
5406 StackOffset Offset, const TargetInstrInfo *TII,
5407 MachineInstr::MIFlag Flag, bool SetNZCV,
5408 bool NeedsWinCFI, bool *HasWinCFI,
5409 bool EmitCFAOffset, StackOffset CFAOffset,
5410 unsigned FrameReg) {
5411 // If a function is marked as arm_locally_streaming, then the runtime value of
5412   // vscale in the prologue/epilogue is different from the runtime value of vscale
5413 // in the function's body. To avoid having to consider multiple vscales,
5414 // we can use `addsvl` to allocate any scalable stack-slots, which under
5415 // most circumstances will be only locals, not callee-save slots.
5416 const Function &F = MBB.getParent()->getFunction();
5417 bool UseSVL = F.hasFnAttribute("aarch64_pstate_sm_body");
5418
5419 int64_t Bytes, NumPredicateVectors, NumDataVectors;
5420 AArch64InstrInfo::decomposeStackOffsetForFrameOffsets(
5421 Offset, Bytes, NumPredicateVectors, NumDataVectors);
5422
5423 // First emit non-scalable frame offsets, or a simple 'mov'.
5424 if (Bytes || (!Offset && SrcReg != DestReg)) {
5425 assert((DestReg != AArch64::SP || Bytes % 8 == 0) &&
5426 "SP increment/decrement not 8-byte aligned");
5427 unsigned Opc = SetNZCV ? AArch64::ADDSXri : AArch64::ADDXri;
5428 if (Bytes < 0) {
5429 Bytes = -Bytes;
5430 Opc = SetNZCV ? AArch64::SUBSXri : AArch64::SUBXri;
5431 }
5432 emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, Bytes, Opc, TII, Flag,
5433 NeedsWinCFI, HasWinCFI, EmitCFAOffset, CFAOffset,
5434 FrameReg);
5435 CFAOffset += (Opc == AArch64::ADDXri || Opc == AArch64::ADDSXri)
5436 ? StackOffset::getFixed(-Bytes)
5437 : StackOffset::getFixed(Bytes);
5438 SrcReg = DestReg;
5439 FrameReg = DestReg;
5440 }
5441
5442 assert(!(SetNZCV && (NumPredicateVectors || NumDataVectors)) &&
5443 "SetNZCV not supported with SVE vectors");
5444 assert(!(NeedsWinCFI && (NumPredicateVectors || NumDataVectors)) &&
5445 "WinCFI not supported with SVE vectors");
5446
5447 if (NumDataVectors) {
5448 emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, NumDataVectors,
5449 UseSVL ? AArch64::ADDSVL_XXI : AArch64::ADDVL_XXI,
5450 TII, Flag, NeedsWinCFI, nullptr, EmitCFAOffset,
5451 CFAOffset, FrameReg);
5452 CFAOffset += StackOffset::getScalable(-NumDataVectors * 16);
5453 SrcReg = DestReg;
5454 }
5455
5456 if (NumPredicateVectors) {
5457 assert(DestReg != AArch64::SP && "Unaligned access to SP");
5458 emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, NumPredicateVectors,
5459 UseSVL ? AArch64::ADDSPL_XXI : AArch64::ADDPL_XXI,
5460 TII, Flag, NeedsWinCFI, nullptr, EmitCFAOffset,
5461 CFAOffset, FrameReg);
5462 }
5463 }
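
// Illustrative decomposition (values assumed): an Offset yielding Bytes = 16,
// NumDataVectors = 2 and NumPredicateVectors = 1 is emitted in that order as
//   ADD   Xd, Xn, #16   // fixed part
//   ADDVL Xd, Xd, #2    // 2 whole SVE data vectors
//   ADDPL Xd, Xd, #1    // 1 predicate vector (VL/8 bytes)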
5464
5465 MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl(
5466 MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,
5467 MachineBasicBlock::iterator InsertPt, int FrameIndex,
5468 LiveIntervals *LIS, VirtRegMap *VRM) const {
5469 // This is a bit of a hack. Consider this instruction:
5470 //
5471 // %0 = COPY %sp; GPR64all:%0
5472 //
5473 // We explicitly chose GPR64all for the virtual register so such a copy might
5474 // be eliminated by RegisterCoalescer. However, that may not be possible, and
5475 // %0 may even spill. We can't spill %sp, and since it is in the GPR64all
5476 // register class, TargetInstrInfo::foldMemoryOperand() is going to try.
5477 //
5478 // To prevent that, we are going to constrain the %0 register class here.
5479 if (MI.isFullCopy()) {
5480 Register DstReg = MI.getOperand(0).getReg();
5481 Register SrcReg = MI.getOperand(1).getReg();
5482 if (SrcReg == AArch64::SP && DstReg.isVirtual()) {
5483 MF.getRegInfo().constrainRegClass(DstReg, &AArch64::GPR64RegClass);
5484 return nullptr;
5485 }
5486 if (DstReg == AArch64::SP && SrcReg.isVirtual()) {
5487 MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR64RegClass);
5488 return nullptr;
5489 }
5490     // Nothing can be folded with a copy from/to NZCV.
5491 if (SrcReg == AArch64::NZCV || DstReg == AArch64::NZCV)
5492 return nullptr;
5493 }
5494
5495 // Handle the case where a copy is being spilled or filled but the source
5496   // and destination register classes don't match. For example:
5497 //
5498 // %0 = COPY %xzr; GPR64common:%0
5499 //
5500 // In this case we can still safely fold away the COPY and generate the
5501 // following spill code:
5502 //
5503 // STRXui %xzr, %stack.0
5504 //
5505 // This also eliminates spilled cross register class COPYs (e.g. between x and
5506 // d regs) of the same size. For example:
5507 //
5508 // %0 = COPY %1; GPR64:%0, FPR64:%1
5509 //
5510 // will be filled as
5511 //
5512 // LDRDui %0, fi<#0>
5513 //
5514 // instead of
5515 //
5516 // LDRXui %Temp, fi<#0>
5517 // %0 = FMOV %Temp
5518 //
5519 if (MI.isCopy() && Ops.size() == 1 &&
5520 // Make sure we're only folding the explicit COPY defs/uses.
5521 (Ops[0] == 0 || Ops[0] == 1)) {
5522 bool IsSpill = Ops[0] == 0;
5523 bool IsFill = !IsSpill;
5524 const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
5525 const MachineRegisterInfo &MRI = MF.getRegInfo();
5526 MachineBasicBlock &MBB = *MI.getParent();
5527 const MachineOperand &DstMO = MI.getOperand(0);
5528 const MachineOperand &SrcMO = MI.getOperand(1);
5529 Register DstReg = DstMO.getReg();
5530 Register SrcReg = SrcMO.getReg();
5531 // This is slightly expensive to compute for physical regs since
5532 // getMinimalPhysRegClass is slow.
5533 auto getRegClass = [&](unsigned Reg) {
5534 return Register::isVirtualRegister(Reg) ? MRI.getRegClass(Reg)
5535 : TRI.getMinimalPhysRegClass(Reg);
5536 };
5537
5538 if (DstMO.getSubReg() == 0 && SrcMO.getSubReg() == 0) {
5539 assert(TRI.getRegSizeInBits(*getRegClass(DstReg)) ==
5540 TRI.getRegSizeInBits(*getRegClass(SrcReg)) &&
5541 "Mismatched register size in non subreg COPY");
5542 if (IsSpill)
5543 storeRegToStackSlot(MBB, InsertPt, SrcReg, SrcMO.isKill(), FrameIndex,
5544 getRegClass(SrcReg), &TRI, Register());
5545 else
5546 loadRegFromStackSlot(MBB, InsertPt, DstReg, FrameIndex,
5547 getRegClass(DstReg), &TRI, Register());
5548 return &*--InsertPt;
5549 }
5550
5551 // Handle cases like spilling def of:
5552 //
5553 // %0:sub_32<def,read-undef> = COPY %wzr; GPR64common:%0
5554 //
5555 // where the physical register source can be widened and stored to the full
5556 // virtual reg destination stack slot, in this case producing:
5557 //
5558 // STRXui %xzr, %stack.0
5559 //
5560 if (IsSpill && DstMO.isUndef() && SrcReg == AArch64::WZR &&
5561 TRI.getRegSizeInBits(*getRegClass(DstReg)) == 64) {
5562 assert(SrcMO.getSubReg() == 0 &&
5563 "Unexpected subreg on physical register");
5564 storeRegToStackSlot(MBB, InsertPt, AArch64::XZR, SrcMO.isKill(),
5565 FrameIndex, &AArch64::GPR64RegClass, &TRI,
5566 Register());
5567 return &*--InsertPt;
5568 }
5569
5570 // Handle cases like filling use of:
5571 //
5572 // %0:sub_32<def,read-undef> = COPY %1; GPR64:%0, GPR32:%1
5573 //
5574     // where we can load the full virtual reg source stack slot into the subreg
5575 // destination, in this case producing:
5576 //
5577 // LDRWui %0:sub_32<def,read-undef>, %stack.0
5578 //
5579 if (IsFill && SrcMO.getSubReg() == 0 && DstMO.isUndef()) {
5580 const TargetRegisterClass *FillRC;
5581 switch (DstMO.getSubReg()) {
5582 default:
5583 FillRC = nullptr;
5584 break;
5585 case AArch64::sub_32:
5586 FillRC = &AArch64::GPR32RegClass;
5587 break;
5588 case AArch64::ssub:
5589 FillRC = &AArch64::FPR32RegClass;
5590 break;
5591 case AArch64::dsub:
5592 FillRC = &AArch64::FPR64RegClass;
5593 break;
5594 }
5595
5596 if (FillRC) {
5597 assert(TRI.getRegSizeInBits(*getRegClass(SrcReg)) ==
5598 TRI.getRegSizeInBits(*FillRC) &&
5599 "Mismatched regclass size on folded subreg COPY");
5600 loadRegFromStackSlot(MBB, InsertPt, DstReg, FrameIndex, FillRC, &TRI,
5601 Register());
5602 MachineInstr &LoadMI = *--InsertPt;
5603 MachineOperand &LoadDst = LoadMI.getOperand(0);
5604 assert(LoadDst.getSubReg() == 0 && "unexpected subreg on fill load");
5605 LoadDst.setSubReg(DstMO.getSubReg());
5606 LoadDst.setIsUndef();
5607 return &LoadMI;
5608 }
5609 }
5610 }
5611
5612 // Cannot fold.
5613 return nullptr;
5614 }
5615
5616 int llvm::isAArch64FrameOffsetLegal(const MachineInstr &MI,
5617 StackOffset &SOffset,
5618 bool *OutUseUnscaledOp,
5619 unsigned *OutUnscaledOp,
5620 int64_t *EmittableOffset) {
5621 // Set output values in case of early exit.
5622 if (EmittableOffset)
5623 *EmittableOffset = 0;
5624 if (OutUseUnscaledOp)
5625 *OutUseUnscaledOp = false;
5626 if (OutUnscaledOp)
5627 *OutUnscaledOp = 0;
5628
5629 // Exit early for structured vector spills/fills as they can't take an
5630 // immediate offset.
5631 switch (MI.getOpcode()) {
5632 default:
5633 break;
5634 case AArch64::LD1Rv1d:
5635 case AArch64::LD1Rv2s:
5636 case AArch64::LD1Rv2d:
5637 case AArch64::LD1Rv4h:
5638 case AArch64::LD1Rv4s:
5639 case AArch64::LD1Rv8b:
5640 case AArch64::LD1Rv8h:
5641 case AArch64::LD1Rv16b:
5642 case AArch64::LD1Twov2d:
5643 case AArch64::LD1Threev2d:
5644 case AArch64::LD1Fourv2d:
5645 case AArch64::LD1Twov1d:
5646 case AArch64::LD1Threev1d:
5647 case AArch64::LD1Fourv1d:
5648 case AArch64::ST1Twov2d:
5649 case AArch64::ST1Threev2d:
5650 case AArch64::ST1Fourv2d:
5651 case AArch64::ST1Twov1d:
5652 case AArch64::ST1Threev1d:
5653 case AArch64::ST1Fourv1d:
5654 case AArch64::ST1i8:
5655 case AArch64::ST1i16:
5656 case AArch64::ST1i32:
5657 case AArch64::ST1i64:
5658 case AArch64::IRG:
5659 case AArch64::IRGstack:
5660 case AArch64::STGloop:
5661 case AArch64::STZGloop:
5662 return AArch64FrameOffsetCannotUpdate;
5663 }
5664
5665 // Get the min/max offset and the scale.
5666 TypeSize ScaleValue(0U, false), Width(0U, false);
5667 int64_t MinOff, MaxOff;
5668 if (!AArch64InstrInfo::getMemOpInfo(MI.getOpcode(), ScaleValue, Width, MinOff,
5669 MaxOff))
5670 llvm_unreachable("unhandled opcode in isAArch64FrameOffsetLegal");
5671
5672 // Construct the complete offset.
5673 bool IsMulVL = ScaleValue.isScalable();
5674 unsigned Scale = ScaleValue.getKnownMinValue();
5675 int64_t Offset = IsMulVL ? SOffset.getScalable() : SOffset.getFixed();
5676
5677 const MachineOperand &ImmOpnd =
5678 MI.getOperand(AArch64InstrInfo::getLoadStoreImmIdx(MI.getOpcode()));
5679 Offset += ImmOpnd.getImm() * Scale;
5680
5681 // If the offset doesn't match the scale, we rewrite the instruction to
5682   // use the unscaled instruction instead. We do likewise if we have a
5683   // negative offset and there is an unscaled op to use.
5684 std::optional<unsigned> UnscaledOp =
5685 AArch64InstrInfo::getUnscaledLdSt(MI.getOpcode());
5686 bool useUnscaledOp = UnscaledOp && (Offset % Scale || Offset < 0);
5687 if (useUnscaledOp &&
5688 !AArch64InstrInfo::getMemOpInfo(*UnscaledOp, ScaleValue, Width, MinOff,
5689 MaxOff))
5690 llvm_unreachable("unhandled opcode in isAArch64FrameOffsetLegal");
5691
5692 Scale = ScaleValue.getKnownMinValue();
5693 assert(IsMulVL == ScaleValue.isScalable() &&
5694 "Unscaled opcode has different value for scalable");
5695
5696 int64_t Remainder = Offset % Scale;
5697 assert(!(Remainder && useUnscaledOp) &&
5698 "Cannot have remainder when using unscaled op");
5699
5700 assert(MinOff < MaxOff && "Unexpected Min/Max offsets");
5701 int64_t NewOffset = Offset / Scale;
5702 if (MinOff <= NewOffset && NewOffset <= MaxOff)
5703 Offset = Remainder;
5704 else {
5705 NewOffset = NewOffset < 0 ? MinOff : MaxOff;
5706 Offset = Offset - NewOffset * Scale;
5707 }
5708
5709 if (EmittableOffset)
5710 *EmittableOffset = NewOffset;
5711 if (OutUseUnscaledOp)
5712 *OutUseUnscaledOp = useUnscaledOp;
5713 if (OutUnscaledOp && UnscaledOp)
5714 *OutUnscaledOp = *UnscaledOp;
5715
5716 if (IsMulVL)
5717 SOffset = StackOffset::get(SOffset.getFixed(), Offset);
5718 else
5719 SOffset = StackOffset::get(Offset, SOffset.getScalable());
5720 return AArch64FrameOffsetCanUpdate |
5721 (SOffset ? 0 : AArch64FrameOffsetIsLegal);
5722 }
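
// Example (assumed instruction): for a STRXui (scale 8) whose combined offset
// is 4, the offset is not a multiple of the scale, so the unscaled STURXi
// (scale 1) is selected instead; NewOffset becomes 4 with no remainder, and
// the offset is reported as legal after rewriting.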
5723
5724 bool llvm::rewriteAArch64FrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
5725 unsigned FrameReg, StackOffset &Offset,
5726 const AArch64InstrInfo *TII) {
5727 unsigned Opcode = MI.getOpcode();
5728 unsigned ImmIdx = FrameRegIdx + 1;
5729
5730 if (Opcode == AArch64::ADDSXri || Opcode == AArch64::ADDXri) {
5731 Offset += StackOffset::getFixed(MI.getOperand(ImmIdx).getImm());
5732 emitFrameOffset(*MI.getParent(), MI, MI.getDebugLoc(),
5733 MI.getOperand(0).getReg(), FrameReg, Offset, TII,
5734 MachineInstr::NoFlags, (Opcode == AArch64::ADDSXri));
5735 MI.eraseFromParent();
5736 Offset = StackOffset();
5737 return true;
5738 }
5739
5740 int64_t NewOffset;
5741 unsigned UnscaledOp;
5742 bool UseUnscaledOp;
5743 int Status = isAArch64FrameOffsetLegal(MI, Offset, &UseUnscaledOp,
5744 &UnscaledOp, &NewOffset);
5745 if (Status & AArch64FrameOffsetCanUpdate) {
5746 if (Status & AArch64FrameOffsetIsLegal)
5747 // Replace the FrameIndex with FrameReg.
5748 MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false);
5749 if (UseUnscaledOp)
5750 MI.setDesc(TII->get(UnscaledOp));
5751
5752 MI.getOperand(ImmIdx).ChangeToImmediate(NewOffset);
5753 return !Offset;
5754 }
5755
5756 return false;
5757 }
5758
5759 void AArch64InstrInfo::insertNoop(MachineBasicBlock &MBB,
5760 MachineBasicBlock::iterator MI) const {
5761 DebugLoc DL;
5762 BuildMI(MBB, MI, DL, get(AArch64::HINT)).addImm(0);
5763 }
5764
5765 MCInst AArch64InstrInfo::getNop() const {
5766 return MCInstBuilder(AArch64::HINT).addImm(0);
5767 }
5768
5769 // AArch64 supports MachineCombiner.
5770 bool AArch64InstrInfo::useMachineCombiner() const { return true; }
5771
5772 // True when Opc sets the NZCV flags.
5773 static bool isCombineInstrSettingFlag(unsigned Opc) {
5774 switch (Opc) {
5775 case AArch64::ADDSWrr:
5776 case AArch64::ADDSWri:
5777 case AArch64::ADDSXrr:
5778 case AArch64::ADDSXri:
5779 case AArch64::SUBSWrr:
5780 case AArch64::SUBSXrr:
5781 // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi.
5782 case AArch64::SUBSWri:
5783 case AArch64::SUBSXri:
5784 return true;
5785 default:
5786 break;
5787 }
5788 return false;
5789 }
5790
5791 // 32b Opcodes that can be combined with a MUL
5792 static bool isCombineInstrCandidate32(unsigned Opc) {
5793 switch (Opc) {
5794 case AArch64::ADDWrr:
5795 case AArch64::ADDWri:
5796 case AArch64::SUBWrr:
5797 case AArch64::ADDSWrr:
5798 case AArch64::ADDSWri:
5799 case AArch64::SUBSWrr:
5800 // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi.
5801 case AArch64::SUBWri:
5802 case AArch64::SUBSWri:
5803 return true;
5804 default:
5805 break;
5806 }
5807 return false;
5808 }
5809
5810 // 64b Opcodes that can be combined with a MUL
5811 static bool isCombineInstrCandidate64(unsigned Opc) {
5812 switch (Opc) {
5813 case AArch64::ADDXrr:
5814 case AArch64::ADDXri:
5815 case AArch64::SUBXrr:
5816 case AArch64::ADDSXrr:
5817 case AArch64::ADDSXri:
5818 case AArch64::SUBSXrr:
5819 // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi.
5820 case AArch64::SUBXri:
5821 case AArch64::SUBSXri:
5822 case AArch64::ADDv8i8:
5823 case AArch64::ADDv16i8:
5824 case AArch64::ADDv4i16:
5825 case AArch64::ADDv8i16:
5826 case AArch64::ADDv2i32:
5827 case AArch64::ADDv4i32:
5828 case AArch64::SUBv8i8:
5829 case AArch64::SUBv16i8:
5830 case AArch64::SUBv4i16:
5831 case AArch64::SUBv8i16:
5832 case AArch64::SUBv2i32:
5833 case AArch64::SUBv4i32:
5834 return true;
5835 default:
5836 break;
5837 }
5838 return false;
5839 }
5840
5841 // FP Opcodes that can be combined with a FMUL.
5842 static bool isCombineInstrCandidateFP(const MachineInstr &Inst) {
5843 switch (Inst.getOpcode()) {
5844 default:
5845 break;
5846 case AArch64::FADDHrr:
5847 case AArch64::FADDSrr:
5848 case AArch64::FADDDrr:
5849 case AArch64::FADDv4f16:
5850 case AArch64::FADDv8f16:
5851 case AArch64::FADDv2f32:
5852 case AArch64::FADDv2f64:
5853 case AArch64::FADDv4f32:
5854 case AArch64::FSUBHrr:
5855 case AArch64::FSUBSrr:
5856 case AArch64::FSUBDrr:
5857 case AArch64::FSUBv4f16:
5858 case AArch64::FSUBv8f16:
5859 case AArch64::FSUBv2f32:
5860 case AArch64::FSUBv2f64:
5861 case AArch64::FSUBv4f32:
5862 TargetOptions Options = Inst.getParent()->getParent()->getTarget().Options;
5863 // We can fuse FADD/FSUB with FMUL, if fusion is either allowed globally by
5864 // the target options or if FADD/FSUB has the contract fast-math flag.
5865 return Options.UnsafeFPMath ||
5866 Options.AllowFPOpFusion == FPOpFusion::Fast ||
5867 Inst.getFlag(MachineInstr::FmContract);
5869 }
5870 return false;
5871 }
5872
5873 // Opcodes that can be combined with a MUL
5874 static bool isCombineInstrCandidate(unsigned Opc) {
5875 return (isCombineInstrCandidate32(Opc) || isCombineInstrCandidate64(Opc));
5876 }
5877
5878 //
5879 // Utility routine that checks if \param MO is defined by an
5880 // \param CombineOpc instruction in the basic block \param MBB
5881 static bool canCombine(MachineBasicBlock &MBB, MachineOperand &MO,
5882 unsigned CombineOpc, unsigned ZeroReg = 0,
5883 bool CheckZeroReg = false) {
5884 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
5885 MachineInstr *MI = nullptr;
5886
5887 if (MO.isReg() && MO.getReg().isVirtual())
5888 MI = MRI.getUniqueVRegDef(MO.getReg());
5889 // And it needs to be in the trace (otherwise, it won't have a depth).
5890 if (!MI || MI->getParent() != &MBB || (unsigned)MI->getOpcode() != CombineOpc)
5891 return false;
5892   // Must be used only by the instruction we combine with.
5893 if (!MRI.hasOneNonDBGUse(MI->getOperand(0).getReg()))
5894 return false;
5895
5896 if (CheckZeroReg) {
5897 assert(MI->getNumOperands() >= 4 && MI->getOperand(0).isReg() &&
5898 MI->getOperand(1).isReg() && MI->getOperand(2).isReg() &&
5899            MI->getOperand(3).isReg() && "MAdd/MSub must have at least 4 regs");
5900 // The third input reg must be zero.
5901 if (MI->getOperand(3).getReg() != ZeroReg)
5902 return false;
5903 }
5904
5905 if (isCombineInstrSettingFlag(CombineOpc) &&
5906 MI->findRegisterDefOperandIdx(AArch64::NZCV, true) == -1)
5907 return false;
5908
5909 return true;
5910 }
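
// On AArch64, MUL is an alias of MADD with the zero register as the addend
// (e.g. "mul w0, w1, w2" == "madd w0, w1, w2, wzr"), which is why callers
// matching a multiply pass MADD[WX]rrr together with WZR/XZR as ZeroReg.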
5911
5912 //
5913 // Is \param MO defined by an integer multiply and can be combined?
5914 static bool canCombineWithMUL(MachineBasicBlock &MBB, MachineOperand &MO,
5915 unsigned MulOpc, unsigned ZeroReg) {
5916 return canCombine(MBB, MO, MulOpc, ZeroReg, true);
5917 }
5918
5919 //
5920 // Is \param MO defined by a floating-point multiply and can be combined?
5921 static bool canCombineWithFMUL(MachineBasicBlock &MBB, MachineOperand &MO,
5922 unsigned MulOpc) {
5923 return canCombine(MBB, MO, MulOpc);
5924 }
5925
5926 // TODO: There are many more machine instruction opcodes to match:
5927 // 1. Other data types (integer, vectors)
5928 // 2. Other math / logic operations (xor, or)
5929 // 3. Other forms of the same operation (intrinsics and other variants)
5930 bool AArch64InstrInfo::isAssociativeAndCommutative(const MachineInstr &Inst,
5931 bool Invert) const {
5932 if (Invert)
5933 return false;
5934 switch (Inst.getOpcode()) {
5935 // == Floating-point types ==
5936 // -- Floating-point instructions --
5937 case AArch64::FADDHrr:
5938 case AArch64::FADDSrr:
5939 case AArch64::FADDDrr:
5940 case AArch64::FMULHrr:
5941 case AArch64::FMULSrr:
5942 case AArch64::FMULDrr:
5943 case AArch64::FMULX16:
5944 case AArch64::FMULX32:
5945 case AArch64::FMULX64:
5946 // -- Advanced SIMD instructions --
5947 case AArch64::FADDv4f16:
5948 case AArch64::FADDv8f16:
5949 case AArch64::FADDv2f32:
5950 case AArch64::FADDv4f32:
5951 case AArch64::FADDv2f64:
5952 case AArch64::FMULv4f16:
5953 case AArch64::FMULv8f16:
5954 case AArch64::FMULv2f32:
5955 case AArch64::FMULv4f32:
5956 case AArch64::FMULv2f64:
5957 case AArch64::FMULXv4f16:
5958 case AArch64::FMULXv8f16:
5959 case AArch64::FMULXv2f32:
5960 case AArch64::FMULXv4f32:
5961 case AArch64::FMULXv2f64:
5962 // -- SVE instructions --
5963 // Opcodes FMULX_ZZZ_? don't exist because there is no unpredicated FMULX
5964 // in the SVE instruction set (though there are predicated ones).
5965 case AArch64::FADD_ZZZ_H:
5966 case AArch64::FADD_ZZZ_S:
5967 case AArch64::FADD_ZZZ_D:
5968 case AArch64::FMUL_ZZZ_H:
5969 case AArch64::FMUL_ZZZ_S:
5970 case AArch64::FMUL_ZZZ_D:
5971 return Inst.getParent()->getParent()->getTarget().Options.UnsafeFPMath ||
5972 (Inst.getFlag(MachineInstr::MIFlag::FmReassoc) &&
5973 Inst.getFlag(MachineInstr::MIFlag::FmNsz));
5974
5975 // == Integer types ==
5976 // -- Base instructions --
5977 // Opcodes MULWrr and MULXrr don't exist because
5978 // `MUL <Wd>, <Wn>, <Wm>` and `MUL <Xd>, <Xn>, <Xm>` are aliases of
5979 // `MADD <Wd>, <Wn>, <Wm>, WZR` and `MADD <Xd>, <Xn>, <Xm>, XZR` respectively.
5980   // The machine-combiner does not support three-source-operand machine
5981   // instructions, so we cannot reassociate MULs.
5982 case AArch64::ADDWrr:
5983 case AArch64::ADDXrr:
5984 case AArch64::ANDWrr:
5985 case AArch64::ANDXrr:
5986 case AArch64::ORRWrr:
5987 case AArch64::ORRXrr:
5988 case AArch64::EORWrr:
5989 case AArch64::EORXrr:
5990 case AArch64::EONWrr:
5991 case AArch64::EONXrr:
5992 // -- Advanced SIMD instructions --
5993 // Opcodes MULv1i64 and MULv2i64 don't exist because there is no 64-bit MUL
5994 // in the Advanced SIMD instruction set.
5995 case AArch64::ADDv8i8:
5996 case AArch64::ADDv16i8:
5997 case AArch64::ADDv4i16:
5998 case AArch64::ADDv8i16:
5999 case AArch64::ADDv2i32:
6000 case AArch64::ADDv4i32:
6001 case AArch64::ADDv1i64:
6002 case AArch64::ADDv2i64:
6003 case AArch64::MULv8i8:
6004 case AArch64::MULv16i8:
6005 case AArch64::MULv4i16:
6006 case AArch64::MULv8i16:
6007 case AArch64::MULv2i32:
6008 case AArch64::MULv4i32:
6009 case AArch64::ANDv8i8:
6010 case AArch64::ANDv16i8:
6011 case AArch64::ORRv8i8:
6012 case AArch64::ORRv16i8:
6013 case AArch64::EORv8i8:
6014 case AArch64::EORv16i8:
6015 // -- SVE instructions --
6016 case AArch64::ADD_ZZZ_B:
6017 case AArch64::ADD_ZZZ_H:
6018 case AArch64::ADD_ZZZ_S:
6019 case AArch64::ADD_ZZZ_D:
6020 case AArch64::MUL_ZZZ_B:
6021 case AArch64::MUL_ZZZ_H:
6022 case AArch64::MUL_ZZZ_S:
6023 case AArch64::MUL_ZZZ_D:
6024 case AArch64::AND_ZZZ:
6025 case AArch64::ORR_ZZZ:
6026 case AArch64::EOR_ZZZ:
6027 return true;
6028
6029 default:
6030 return false;
6031 }
6032 }
6033
6034 /// Find instructions that can be turned into madd.
6035 static bool getMaddPatterns(MachineInstr &Root,
6036 SmallVectorImpl<MachineCombinerPattern> &Patterns) {
6037 unsigned Opc = Root.getOpcode();
6038 MachineBasicBlock &MBB = *Root.getParent();
6039 bool Found = false;
6040
6041 if (!isCombineInstrCandidate(Opc))
6042 return false;
6043 if (isCombineInstrSettingFlag(Opc)) {
6044 int Cmp_NZCV = Root.findRegisterDefOperandIdx(AArch64::NZCV, true);
6045     // When NZCV is live, bail out.
6046 if (Cmp_NZCV == -1)
6047 return false;
6048 unsigned NewOpc = convertToNonFlagSettingOpc(Root);
6049     // When the opcode can't change, bail out.
6050 // CHECKME: do we miss any cases for opcode conversion?
6051 if (NewOpc == Opc)
6052 return false;
6053 Opc = NewOpc;
6054 }
6055
6056 auto setFound = [&](int Opcode, int Operand, unsigned ZeroReg,
6057 MachineCombinerPattern Pattern) {
6058 if (canCombineWithMUL(MBB, Root.getOperand(Operand), Opcode, ZeroReg)) {
6059 Patterns.push_back(Pattern);
6060 Found = true;
6061 }
6062 };
6063
6064 auto setVFound = [&](int Opcode, int Operand, MachineCombinerPattern Pattern) {
6065 if (canCombine(MBB, Root.getOperand(Operand), Opcode)) {
6066 Patterns.push_back(Pattern);
6067 Found = true;
6068 }
6069 };
6070
6071 typedef MachineCombinerPattern MCP;
6072
6073 switch (Opc) {
6074 default:
6075 break;
6076 case AArch64::ADDWrr:
6077 assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() &&
6078 "ADDWrr does not have register operands");
6079 setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULADDW_OP1);
6080 setFound(AArch64::MADDWrrr, 2, AArch64::WZR, MCP::MULADDW_OP2);
6081 break;
6082 case AArch64::ADDXrr:
6083 setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULADDX_OP1);
6084 setFound(AArch64::MADDXrrr, 2, AArch64::XZR, MCP::MULADDX_OP2);
6085 break;
6086 case AArch64::SUBWrr:
6087 setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULSUBW_OP1);
6088 setFound(AArch64::MADDWrrr, 2, AArch64::WZR, MCP::MULSUBW_OP2);
6089 break;
6090 case AArch64::SUBXrr:
6091 setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULSUBX_OP1);
6092 setFound(AArch64::MADDXrrr, 2, AArch64::XZR, MCP::MULSUBX_OP2);
6093 break;
6094 case AArch64::ADDWri:
6095 setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULADDWI_OP1);
6096 break;
6097 case AArch64::ADDXri:
6098 setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULADDXI_OP1);
6099 break;
6100 case AArch64::SUBWri:
6101 setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULSUBWI_OP1);
6102 break;
6103 case AArch64::SUBXri:
6104 setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULSUBXI_OP1);
6105 break;
6106 case AArch64::ADDv8i8:
6107 setVFound(AArch64::MULv8i8, 1, MCP::MULADDv8i8_OP1);
6108 setVFound(AArch64::MULv8i8, 2, MCP::MULADDv8i8_OP2);
6109 break;
6110 case AArch64::ADDv16i8:
6111 setVFound(AArch64::MULv16i8, 1, MCP::MULADDv16i8_OP1);
6112 setVFound(AArch64::MULv16i8, 2, MCP::MULADDv16i8_OP2);
6113 break;
6114 case AArch64::ADDv4i16:
6115 setVFound(AArch64::MULv4i16, 1, MCP::MULADDv4i16_OP1);
6116 setVFound(AArch64::MULv4i16, 2, MCP::MULADDv4i16_OP2);
6117 setVFound(AArch64::MULv4i16_indexed, 1, MCP::MULADDv4i16_indexed_OP1);
6118 setVFound(AArch64::MULv4i16_indexed, 2, MCP::MULADDv4i16_indexed_OP2);
6119 break;
6120 case AArch64::ADDv8i16:
6121 setVFound(AArch64::MULv8i16, 1, MCP::MULADDv8i16_OP1);
6122 setVFound(AArch64::MULv8i16, 2, MCP::MULADDv8i16_OP2);
6123 setVFound(AArch64::MULv8i16_indexed, 1, MCP::MULADDv8i16_indexed_OP1);
6124 setVFound(AArch64::MULv8i16_indexed, 2, MCP::MULADDv8i16_indexed_OP2);
6125 break;
6126 case AArch64::ADDv2i32:
6127 setVFound(AArch64::MULv2i32, 1, MCP::MULADDv2i32_OP1);
6128 setVFound(AArch64::MULv2i32, 2, MCP::MULADDv2i32_OP2);
6129 setVFound(AArch64::MULv2i32_indexed, 1, MCP::MULADDv2i32_indexed_OP1);
6130 setVFound(AArch64::MULv2i32_indexed, 2, MCP::MULADDv2i32_indexed_OP2);
6131 break;
6132 case AArch64::ADDv4i32:
6133 setVFound(AArch64::MULv4i32, 1, MCP::MULADDv4i32_OP1);
6134 setVFound(AArch64::MULv4i32, 2, MCP::MULADDv4i32_OP2);
6135 setVFound(AArch64::MULv4i32_indexed, 1, MCP::MULADDv4i32_indexed_OP1);
6136 setVFound(AArch64::MULv4i32_indexed, 2, MCP::MULADDv4i32_indexed_OP2);
6137 break;
6138 case AArch64::SUBv8i8:
6139 setVFound(AArch64::MULv8i8, 1, MCP::MULSUBv8i8_OP1);
6140 setVFound(AArch64::MULv8i8, 2, MCP::MULSUBv8i8_OP2);
6141 break;
6142 case AArch64::SUBv16i8:
6143 setVFound(AArch64::MULv16i8, 1, MCP::MULSUBv16i8_OP1);
6144 setVFound(AArch64::MULv16i8, 2, MCP::MULSUBv16i8_OP2);
6145 break;
6146 case AArch64::SUBv4i16:
6147 setVFound(AArch64::MULv4i16, 1, MCP::MULSUBv4i16_OP1);
6148 setVFound(AArch64::MULv4i16, 2, MCP::MULSUBv4i16_OP2);
6149 setVFound(AArch64::MULv4i16_indexed, 1, MCP::MULSUBv4i16_indexed_OP1);
6150 setVFound(AArch64::MULv4i16_indexed, 2, MCP::MULSUBv4i16_indexed_OP2);
6151 break;
6152 case AArch64::SUBv8i16:
6153 setVFound(AArch64::MULv8i16, 1, MCP::MULSUBv8i16_OP1);
6154 setVFound(AArch64::MULv8i16, 2, MCP::MULSUBv8i16_OP2);
6155 setVFound(AArch64::MULv8i16_indexed, 1, MCP::MULSUBv8i16_indexed_OP1);
6156 setVFound(AArch64::MULv8i16_indexed, 2, MCP::MULSUBv8i16_indexed_OP2);
6157 break;
6158 case AArch64::SUBv2i32:
6159 setVFound(AArch64::MULv2i32, 1, MCP::MULSUBv2i32_OP1);
6160 setVFound(AArch64::MULv2i32, 2, MCP::MULSUBv2i32_OP2);
6161 setVFound(AArch64::MULv2i32_indexed, 1, MCP::MULSUBv2i32_indexed_OP1);
6162 setVFound(AArch64::MULv2i32_indexed, 2, MCP::MULSUBv2i32_indexed_OP2);
6163 break;
6164 case AArch64::SUBv4i32:
6165 setVFound(AArch64::MULv4i32, 1, MCP::MULSUBv4i32_OP1);
6166 setVFound(AArch64::MULv4i32, 2, MCP::MULSUBv4i32_OP2);
6167 setVFound(AArch64::MULv4i32_indexed, 1, MCP::MULSUBv4i32_indexed_OP1);
6168 setVFound(AArch64::MULv4i32_indexed, 2, MCP::MULSUBv4i32_indexed_OP2);
6169 break;
6170 }
6171 return Found;
6172 }
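
// For instance (virtual register numbers assumed), the sequence
//   %3:gpr32 = MADDWrrr %1, %2, $wzr   ; a MUL in disguise
//   %4:gpr32 = ADDWrr %3, %0
// matches MULADDW_OP1, allowing the combiner to rewrite it as
//   %4:gpr32 = MADDWrrr %1, %2, %0
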
6173 /// Floating-Point Support
6174
6175 /// Find instructions that can be turned into fmadd/fmsub.
6176 static bool getFMAPatterns(MachineInstr &Root,
6177 SmallVectorImpl<MachineCombinerPattern> &Patterns) {
6178
6179 if (!isCombineInstrCandidateFP(Root))
6180 return false;
6181
6182 MachineBasicBlock &MBB = *Root.getParent();
6183 bool Found = false;
6184
6185 auto Match = [&](int Opcode, int Operand,
6186 MachineCombinerPattern Pattern) -> bool {
6187 if (canCombineWithFMUL(MBB, Root.getOperand(Operand), Opcode)) {
6188 Patterns.push_back(Pattern);
6189 return true;
6190 }
6191 return false;
6192 };
6193
6194 typedef MachineCombinerPattern MCP;
6195
6196 switch (Root.getOpcode()) {
6197 default:
6198 assert(false && "Unsupported FP instruction in combiner\n");
6199 break;
6200 case AArch64::FADDHrr:
6201 assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() &&
6202 "FADDHrr does not have register operands");
6203
6204 Found = Match(AArch64::FMULHrr, 1, MCP::FMULADDH_OP1);
6205 Found |= Match(AArch64::FMULHrr, 2, MCP::FMULADDH_OP2);
6206 break;
6207 case AArch64::FADDSrr:
6208 assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() &&
6209 "FADDSrr does not have register operands");
6210
6211 Found |= Match(AArch64::FMULSrr, 1, MCP::FMULADDS_OP1) ||
6212 Match(AArch64::FMULv1i32_indexed, 1, MCP::FMLAv1i32_indexed_OP1);
6213
6214 Found |= Match(AArch64::FMULSrr, 2, MCP::FMULADDS_OP2) ||
6215 Match(AArch64::FMULv1i32_indexed, 2, MCP::FMLAv1i32_indexed_OP2);
6216 break;
6217 case AArch64::FADDDrr:
6218 Found |= Match(AArch64::FMULDrr, 1, MCP::FMULADDD_OP1) ||
6219 Match(AArch64::FMULv1i64_indexed, 1, MCP::FMLAv1i64_indexed_OP1);
6220
6221 Found |= Match(AArch64::FMULDrr, 2, MCP::FMULADDD_OP2) ||
6222 Match(AArch64::FMULv1i64_indexed, 2, MCP::FMLAv1i64_indexed_OP2);
6223 break;
6224 case AArch64::FADDv4f16:
6225 Found |= Match(AArch64::FMULv4i16_indexed, 1, MCP::FMLAv4i16_indexed_OP1) ||
6226 Match(AArch64::FMULv4f16, 1, MCP::FMLAv4f16_OP1);
6227
6228 Found |= Match(AArch64::FMULv4i16_indexed, 2, MCP::FMLAv4i16_indexed_OP2) ||
6229 Match(AArch64::FMULv4f16, 2, MCP::FMLAv4f16_OP2);
6230 break;
6231 case AArch64::FADDv8f16:
6232 Found |= Match(AArch64::FMULv8i16_indexed, 1, MCP::FMLAv8i16_indexed_OP1) ||
             Match(AArch64::FMULv8f16, 1, MCP::FMLAv8f16_OP1);

    Found |= Match(AArch64::FMULv8i16_indexed, 2, MCP::FMLAv8i16_indexed_OP2) ||
             Match(AArch64::FMULv8f16, 2, MCP::FMLAv8f16_OP2);
    break;
  case AArch64::FADDv2f32:
    Found |= Match(AArch64::FMULv2i32_indexed, 1, MCP::FMLAv2i32_indexed_OP1) ||
             Match(AArch64::FMULv2f32, 1, MCP::FMLAv2f32_OP1);

    Found |= Match(AArch64::FMULv2i32_indexed, 2, MCP::FMLAv2i32_indexed_OP2) ||
             Match(AArch64::FMULv2f32, 2, MCP::FMLAv2f32_OP2);
    break;
  case AArch64::FADDv2f64:
    Found |= Match(AArch64::FMULv2i64_indexed, 1, MCP::FMLAv2i64_indexed_OP1) ||
             Match(AArch64::FMULv2f64, 1, MCP::FMLAv2f64_OP1);

    Found |= Match(AArch64::FMULv2i64_indexed, 2, MCP::FMLAv2i64_indexed_OP2) ||
             Match(AArch64::FMULv2f64, 2, MCP::FMLAv2f64_OP2);
    break;
  case AArch64::FADDv4f32:
    Found |= Match(AArch64::FMULv4i32_indexed, 1, MCP::FMLAv4i32_indexed_OP1) ||
             Match(AArch64::FMULv4f32, 1, MCP::FMLAv4f32_OP1);

    Found |= Match(AArch64::FMULv4i32_indexed, 2, MCP::FMLAv4i32_indexed_OP2) ||
             Match(AArch64::FMULv4f32, 2, MCP::FMLAv4f32_OP2);
    break;
  case AArch64::FSUBHrr:
    Found = Match(AArch64::FMULHrr, 1, MCP::FMULSUBH_OP1);
    Found |= Match(AArch64::FMULHrr, 2, MCP::FMULSUBH_OP2);
    Found |= Match(AArch64::FNMULHrr, 1, MCP::FNMULSUBH_OP1);
    break;
  case AArch64::FSUBSrr:
    Found = Match(AArch64::FMULSrr, 1, MCP::FMULSUBS_OP1);

    Found |= Match(AArch64::FMULSrr, 2, MCP::FMULSUBS_OP2) ||
             Match(AArch64::FMULv1i32_indexed, 2, MCP::FMLSv1i32_indexed_OP2);

    Found |= Match(AArch64::FNMULSrr, 1, MCP::FNMULSUBS_OP1);
    break;
  case AArch64::FSUBDrr:
    Found = Match(AArch64::FMULDrr, 1, MCP::FMULSUBD_OP1);

    Found |= Match(AArch64::FMULDrr, 2, MCP::FMULSUBD_OP2) ||
             Match(AArch64::FMULv1i64_indexed, 2, MCP::FMLSv1i64_indexed_OP2);

    Found |= Match(AArch64::FNMULDrr, 1, MCP::FNMULSUBD_OP1);
    break;
  case AArch64::FSUBv4f16:
    Found |= Match(AArch64::FMULv4i16_indexed, 2, MCP::FMLSv4i16_indexed_OP2) ||
             Match(AArch64::FMULv4f16, 2, MCP::FMLSv4f16_OP2);

    Found |= Match(AArch64::FMULv4i16_indexed, 1, MCP::FMLSv4i16_indexed_OP1) ||
             Match(AArch64::FMULv4f16, 1, MCP::FMLSv4f16_OP1);
    break;
  case AArch64::FSUBv8f16:
    Found |= Match(AArch64::FMULv8i16_indexed, 2, MCP::FMLSv8i16_indexed_OP2) ||
             Match(AArch64::FMULv8f16, 2, MCP::FMLSv8f16_OP2);

    Found |= Match(AArch64::FMULv8i16_indexed, 1, MCP::FMLSv8i16_indexed_OP1) ||
             Match(AArch64::FMULv8f16, 1, MCP::FMLSv8f16_OP1);
    break;
  case AArch64::FSUBv2f32:
    Found |= Match(AArch64::FMULv2i32_indexed, 2, MCP::FMLSv2i32_indexed_OP2) ||
             Match(AArch64::FMULv2f32, 2, MCP::FMLSv2f32_OP2);

    Found |= Match(AArch64::FMULv2i32_indexed, 1, MCP::FMLSv2i32_indexed_OP1) ||
             Match(AArch64::FMULv2f32, 1, MCP::FMLSv2f32_OP1);
    break;
  case AArch64::FSUBv2f64:
    Found |= Match(AArch64::FMULv2i64_indexed, 2, MCP::FMLSv2i64_indexed_OP2) ||
             Match(AArch64::FMULv2f64, 2, MCP::FMLSv2f64_OP2);

    Found |= Match(AArch64::FMULv2i64_indexed, 1, MCP::FMLSv2i64_indexed_OP1) ||
             Match(AArch64::FMULv2f64, 1, MCP::FMLSv2f64_OP1);
    break;
  case AArch64::FSUBv4f32:
    Found |= Match(AArch64::FMULv4i32_indexed, 2, MCP::FMLSv4i32_indexed_OP2) ||
             Match(AArch64::FMULv4f32, 2, MCP::FMLSv4f32_OP2);

    Found |= Match(AArch64::FMULv4i32_indexed, 1, MCP::FMLSv4i32_indexed_OP1) ||
             Match(AArch64::FMULv4f32, 1, MCP::FMLSv4f32_OP1);
    break;
  }
  return Found;
}

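/// Find FMUL instructions that multiply by a DUP of a vector lane; these can
/// be rewritten as indexed FMULs. A sketch of the rewrite (performed later by
/// genIndexedMultiply), in pseudo-MIR with hypothetical vregs:
///   %dup = DUPv2i32lane %vec, lane
///   %r   = FMULv2f32 %a, %dup
///   ==>  %r = FMULv2i32_indexed %a, %vec, lane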
static bool getFMULPatterns(MachineInstr &Root,
                            SmallVectorImpl<MachineCombinerPattern> &Patterns) {
  MachineBasicBlock &MBB = *Root.getParent();
  bool Found = false;

  auto Match = [&](unsigned Opcode, int Operand,
                   MachineCombinerPattern Pattern) -> bool {
    MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
    MachineOperand &MO = Root.getOperand(Operand);
    MachineInstr *MI = nullptr;
    if (MO.isReg() && MO.getReg().isVirtual())
      MI = MRI.getUniqueVRegDef(MO.getReg());
    // Ignore no-op COPYs in FMUL(COPY(DUP(..))).
    if (MI && MI->getOpcode() == TargetOpcode::COPY &&
        MI->getOperand(1).getReg().isVirtual())
      MI = MRI.getUniqueVRegDef(MI->getOperand(1).getReg());
    if (MI && MI->getOpcode() == Opcode) {
      Patterns.push_back(Pattern);
      return true;
    }
    return false;
  };

  typedef MachineCombinerPattern MCP;

  switch (Root.getOpcode()) {
  default:
    return false;
  case AArch64::FMULv2f32:
    Found = Match(AArch64::DUPv2i32lane, 1, MCP::FMULv2i32_indexed_OP1);
    Found |= Match(AArch64::DUPv2i32lane, 2, MCP::FMULv2i32_indexed_OP2);
    break;
  case AArch64::FMULv2f64:
    Found = Match(AArch64::DUPv2i64lane, 1, MCP::FMULv2i64_indexed_OP1);
    Found |= Match(AArch64::DUPv2i64lane, 2, MCP::FMULv2i64_indexed_OP2);
    break;
  case AArch64::FMULv4f16:
    Found = Match(AArch64::DUPv4i16lane, 1, MCP::FMULv4i16_indexed_OP1);
    Found |= Match(AArch64::DUPv4i16lane, 2, MCP::FMULv4i16_indexed_OP2);
    break;
  case AArch64::FMULv4f32:
    Found = Match(AArch64::DUPv4i32lane, 1, MCP::FMULv4i32_indexed_OP1);
    Found |= Match(AArch64::DUPv4i32lane, 2, MCP::FMULv4i32_indexed_OP2);
    break;
  case AArch64::FMULv8f16:
    Found = Match(AArch64::DUPv8i16lane, 1, MCP::FMULv8i16_indexed_OP1);
    Found |= Match(AArch64::DUPv8i16lane, 2, MCP::FMULv8i16_indexed_OP2);
    break;
  }

  return Found;
}

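/// Find an FNEG whose source is a single-use scalar FMADD; the pair can be
/// combined into FNMADD. Both instructions must carry the 'contract' and
/// 'nsz' fast-math flags for the rewrite to be sound.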
static bool getFNEGPatterns(MachineInstr &Root,
                            SmallVectorImpl<MachineCombinerPattern> &Patterns) {
  unsigned Opc = Root.getOpcode();
  MachineBasicBlock &MBB = *Root.getParent();
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();

  auto Match = [&](unsigned Opcode, MachineCombinerPattern Pattern) -> bool {
    MachineOperand &MO = Root.getOperand(1);
    MachineInstr *MI = MRI.getUniqueVRegDef(MO.getReg());
    if (MI != nullptr && (MI->getOpcode() == Opcode) &&
        MRI.hasOneNonDBGUse(MI->getOperand(0).getReg()) &&
        Root.getFlag(MachineInstr::MIFlag::FmContract) &&
        Root.getFlag(MachineInstr::MIFlag::FmNsz) &&
        MI->getFlag(MachineInstr::MIFlag::FmContract) &&
        MI->getFlag(MachineInstr::MIFlag::FmNsz)) {
      Patterns.push_back(Pattern);
      return true;
    }
    return false;
  };

  switch (Opc) {
  default:
    break;
  case AArch64::FNEGDr:
    return Match(AArch64::FMADDDrrr, MachineCombinerPattern::FNMADD);
  case AArch64::FNEGSr:
    return Match(AArch64::FMADDSrrr, MachineCombinerPattern::FNMADD);
  }

  return false;
}

/// Return true when a code sequence can improve throughput. It
/// should be called only for instructions in loops.
/// \param Pattern - combiner pattern
bool AArch64InstrInfo::isThroughputPattern(
    MachineCombinerPattern Pattern) const {
  switch (Pattern) {
  default:
    break;
  case MachineCombinerPattern::FMULADDH_OP1:
  case MachineCombinerPattern::FMULADDH_OP2:
  case MachineCombinerPattern::FMULSUBH_OP1:
  case MachineCombinerPattern::FMULSUBH_OP2:
  case MachineCombinerPattern::FMULADDS_OP1:
  case MachineCombinerPattern::FMULADDS_OP2:
  case MachineCombinerPattern::FMULSUBS_OP1:
  case MachineCombinerPattern::FMULSUBS_OP2:
  case MachineCombinerPattern::FMULADDD_OP1:
  case MachineCombinerPattern::FMULADDD_OP2:
  case MachineCombinerPattern::FMULSUBD_OP1:
  case MachineCombinerPattern::FMULSUBD_OP2:
  case MachineCombinerPattern::FNMULSUBH_OP1:
  case MachineCombinerPattern::FNMULSUBS_OP1:
  case MachineCombinerPattern::FNMULSUBD_OP1:
  case MachineCombinerPattern::FMLAv4i16_indexed_OP1:
  case MachineCombinerPattern::FMLAv4i16_indexed_OP2:
  case MachineCombinerPattern::FMLAv8i16_indexed_OP1:
  case MachineCombinerPattern::FMLAv8i16_indexed_OP2:
  case MachineCombinerPattern::FMLAv1i32_indexed_OP1:
  case MachineCombinerPattern::FMLAv1i32_indexed_OP2:
  case MachineCombinerPattern::FMLAv1i64_indexed_OP1:
  case MachineCombinerPattern::FMLAv1i64_indexed_OP2:
  case MachineCombinerPattern::FMLAv4f16_OP2:
  case MachineCombinerPattern::FMLAv4f16_OP1:
  case MachineCombinerPattern::FMLAv8f16_OP1:
  case MachineCombinerPattern::FMLAv8f16_OP2:
  case MachineCombinerPattern::FMLAv2f32_OP2:
  case MachineCombinerPattern::FMLAv2f32_OP1:
  case MachineCombinerPattern::FMLAv2f64_OP1:
  case MachineCombinerPattern::FMLAv2f64_OP2:
  case MachineCombinerPattern::FMLAv2i32_indexed_OP1:
  case MachineCombinerPattern::FMLAv2i32_indexed_OP2:
  case MachineCombinerPattern::FMLAv2i64_indexed_OP1:
  case MachineCombinerPattern::FMLAv2i64_indexed_OP2:
  case MachineCombinerPattern::FMLAv4f32_OP1:
  case MachineCombinerPattern::FMLAv4f32_OP2:
  case MachineCombinerPattern::FMLAv4i32_indexed_OP1:
  case MachineCombinerPattern::FMLAv4i32_indexed_OP2:
  case MachineCombinerPattern::FMLSv4i16_indexed_OP1:
  case MachineCombinerPattern::FMLSv4i16_indexed_OP2:
  case MachineCombinerPattern::FMLSv8i16_indexed_OP1:
  case MachineCombinerPattern::FMLSv8i16_indexed_OP2:
  case MachineCombinerPattern::FMLSv1i32_indexed_OP2:
  case MachineCombinerPattern::FMLSv1i64_indexed_OP2:
  case MachineCombinerPattern::FMLSv2i32_indexed_OP2:
  case MachineCombinerPattern::FMLSv2i64_indexed_OP2:
  case MachineCombinerPattern::FMLSv4f16_OP1:
  case MachineCombinerPattern::FMLSv4f16_OP2:
  case MachineCombinerPattern::FMLSv8f16_OP1:
  case MachineCombinerPattern::FMLSv8f16_OP2:
  case MachineCombinerPattern::FMLSv2f32_OP2:
  case MachineCombinerPattern::FMLSv2f64_OP2:
  case MachineCombinerPattern::FMLSv4i32_indexed_OP2:
  case MachineCombinerPattern::FMLSv4f32_OP2:
  case MachineCombinerPattern::FMULv2i32_indexed_OP1:
  case MachineCombinerPattern::FMULv2i32_indexed_OP2:
  case MachineCombinerPattern::FMULv2i64_indexed_OP1:
  case MachineCombinerPattern::FMULv2i64_indexed_OP2:
  case MachineCombinerPattern::FMULv4i16_indexed_OP1:
  case MachineCombinerPattern::FMULv4i16_indexed_OP2:
  case MachineCombinerPattern::FMULv4i32_indexed_OP1:
  case MachineCombinerPattern::FMULv4i32_indexed_OP2:
  case MachineCombinerPattern::FMULv8i16_indexed_OP1:
  case MachineCombinerPattern::FMULv8i16_indexed_OP2:
  case MachineCombinerPattern::MULADDv8i8_OP1:
  case MachineCombinerPattern::MULADDv8i8_OP2:
  case MachineCombinerPattern::MULADDv16i8_OP1:
  case MachineCombinerPattern::MULADDv16i8_OP2:
  case MachineCombinerPattern::MULADDv4i16_OP1:
  case MachineCombinerPattern::MULADDv4i16_OP2:
  case MachineCombinerPattern::MULADDv8i16_OP1:
  case MachineCombinerPattern::MULADDv8i16_OP2:
  case MachineCombinerPattern::MULADDv2i32_OP1:
  case MachineCombinerPattern::MULADDv2i32_OP2:
  case MachineCombinerPattern::MULADDv4i32_OP1:
  case MachineCombinerPattern::MULADDv4i32_OP2:
  case MachineCombinerPattern::MULSUBv8i8_OP1:
  case MachineCombinerPattern::MULSUBv8i8_OP2:
  case MachineCombinerPattern::MULSUBv16i8_OP1:
  case MachineCombinerPattern::MULSUBv16i8_OP2:
  case MachineCombinerPattern::MULSUBv4i16_OP1:
  case MachineCombinerPattern::MULSUBv4i16_OP2:
  case MachineCombinerPattern::MULSUBv8i16_OP1:
  case MachineCombinerPattern::MULSUBv8i16_OP2:
  case MachineCombinerPattern::MULSUBv2i32_OP1:
  case MachineCombinerPattern::MULSUBv2i32_OP2:
  case MachineCombinerPattern::MULSUBv4i32_OP1:
  case MachineCombinerPattern::MULSUBv4i32_OP2:
  case MachineCombinerPattern::MULADDv4i16_indexed_OP1:
  case MachineCombinerPattern::MULADDv4i16_indexed_OP2:
  case MachineCombinerPattern::MULADDv8i16_indexed_OP1:
  case MachineCombinerPattern::MULADDv8i16_indexed_OP2:
  case MachineCombinerPattern::MULADDv2i32_indexed_OP1:
  case MachineCombinerPattern::MULADDv2i32_indexed_OP2:
  case MachineCombinerPattern::MULADDv4i32_indexed_OP1:
  case MachineCombinerPattern::MULADDv4i32_indexed_OP2:
  case MachineCombinerPattern::MULSUBv4i16_indexed_OP1:
  case MachineCombinerPattern::MULSUBv4i16_indexed_OP2:
  case MachineCombinerPattern::MULSUBv8i16_indexed_OP1:
  case MachineCombinerPattern::MULSUBv8i16_indexed_OP2:
  case MachineCombinerPattern::MULSUBv2i32_indexed_OP1:
  case MachineCombinerPattern::MULSUBv2i32_indexed_OP2:
  case MachineCombinerPattern::MULSUBv4i32_indexed_OP1:
  case MachineCombinerPattern::MULSUBv4i32_indexed_OP2:
    return true;
  } // end switch (Pattern)
  return false;
}

/// Find other MI combine patterns.
static bool getMiscPatterns(MachineInstr &Root,
                            SmallVectorImpl<MachineCombinerPattern> &Patterns) {
  // A - (B + C)  ==>  (A - B) - C  or  (A - C) - B
  unsigned Opc = Root.getOpcode();
  MachineBasicBlock &MBB = *Root.getParent();

  switch (Opc) {
  case AArch64::SUBWrr:
  case AArch64::SUBSWrr:
  case AArch64::SUBXrr:
  case AArch64::SUBSXrr:
    // Found candidate root.
    break;
  default:
    return false;
  }

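  // If the flag-setting variant's NZCV def is live (no dead def is found),
  // splitting the subtract would change the flags, so bail out.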
  if (isCombineInstrSettingFlag(Opc) &&
      Root.findRegisterDefOperandIdx(AArch64::NZCV, true) == -1)
    return false;

  if (canCombine(MBB, Root.getOperand(2), AArch64::ADDWrr) ||
      canCombine(MBB, Root.getOperand(2), AArch64::ADDSWrr) ||
      canCombine(MBB, Root.getOperand(2), AArch64::ADDXrr) ||
      canCombine(MBB, Root.getOperand(2), AArch64::ADDSXrr)) {
    Patterns.push_back(MachineCombinerPattern::SUBADD_OP1);
    Patterns.push_back(MachineCombinerPattern::SUBADD_OP2);
    return true;
  }

  return false;
}

/// Return true when there is potentially a faster code sequence for an
/// instruction chain ending in \p Root. All potential patterns are listed in
/// the \p Patterns vector. Patterns should be sorted in priority order, since
/// the pattern evaluator stops checking as soon as it finds a faster sequence.

bool AArch64InstrInfo::getMachineCombinerPatterns(
    MachineInstr &Root, SmallVectorImpl<MachineCombinerPattern> &Patterns,
    bool DoRegPressureReduce) const {
  // Integer patterns
  if (getMaddPatterns(Root, Patterns))
    return true;
  // Floating point patterns
  if (getFMULPatterns(Root, Patterns))
    return true;
  if (getFMAPatterns(Root, Patterns))
    return true;
  if (getFNEGPatterns(Root, Patterns))
    return true;

  // Other patterns
  if (getMiscPatterns(Root, Patterns))
    return true;

  return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns,
                                                     DoRegPressureReduce);
}

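// FMAInstKind selects the operand order of the generated instruction:
//   Default     - scalar madd form: (mul op1, mul op2, addend)
//   Indexed     - vector lane form: (accumulator, mul op1, mul op2, lane imm)
//   Accumulator - vector form:      (accumulator, mul op1, mul op2)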
enum class FMAInstKind { Default, Indexed, Accumulator };
/// genFusedMultiply - Generate fused multiply instructions.
/// This function supports both integer and floating point instructions.
/// A typical example:
///  F|MUL I=A,B,0
///  F|ADD R,I,C
///  ==> F|MADD R,A,B,C
/// \param MF Containing MachineFunction
/// \param MRI Register information
/// \param TII Target information
/// \param Root is the F|ADD instruction
/// \param [out] InsInstrs is a vector of machine instructions and will
/// contain the generated madd instruction
/// \param IdxMulOpd is index of operand in Root that is the result of
/// the F|MUL. In the example above IdxMulOpd is 1.
/// \param MaddOpc the opcode of the f|madd instruction
/// \param RC Register class of operands
/// \param kind The kind of FMA instruction (addressing mode) to generate
/// \param ReplacedAddend is the result register from the instruction
/// replacing the non-combined operand, if any.
static MachineInstr *
genFusedMultiply(MachineFunction &MF, MachineRegisterInfo &MRI,
                 const TargetInstrInfo *TII, MachineInstr &Root,
                 SmallVectorImpl<MachineInstr *> &InsInstrs, unsigned IdxMulOpd,
                 unsigned MaddOpc, const TargetRegisterClass *RC,
                 FMAInstKind kind = FMAInstKind::Default,
                 const Register *ReplacedAddend = nullptr) {
  assert(IdxMulOpd == 1 || IdxMulOpd == 2);

  unsigned IdxOtherOpd = IdxMulOpd == 1 ? 2 : 1;
  MachineInstr *MUL = MRI.getUniqueVRegDef(Root.getOperand(IdxMulOpd).getReg());
  Register ResultReg = Root.getOperand(0).getReg();
  Register SrcReg0 = MUL->getOperand(1).getReg();
  bool Src0IsKill = MUL->getOperand(1).isKill();
  Register SrcReg1 = MUL->getOperand(2).getReg();
  bool Src1IsKill = MUL->getOperand(2).isKill();

  Register SrcReg2;
  bool Src2IsKill;
  if (ReplacedAddend) {
    // If we just generated a new addend, we must be its only use.
    SrcReg2 = *ReplacedAddend;
    Src2IsKill = true;
  } else {
    SrcReg2 = Root.getOperand(IdxOtherOpd).getReg();
    Src2IsKill = Root.getOperand(IdxOtherOpd).isKill();
  }

  if (ResultReg.isVirtual())
    MRI.constrainRegClass(ResultReg, RC);
  if (SrcReg0.isVirtual())
    MRI.constrainRegClass(SrcReg0, RC);
  if (SrcReg1.isVirtual())
    MRI.constrainRegClass(SrcReg1, RC);
  if (SrcReg2.isVirtual())
    MRI.constrainRegClass(SrcReg2, RC);

  MachineInstrBuilder MIB;
  if (kind == FMAInstKind::Default)
    MIB = BuildMI(MF, MIMetadata(Root), TII->get(MaddOpc), ResultReg)
              .addReg(SrcReg0, getKillRegState(Src0IsKill))
              .addReg(SrcReg1, getKillRegState(Src1IsKill))
              .addReg(SrcReg2, getKillRegState(Src2IsKill));
  else if (kind == FMAInstKind::Indexed)
    MIB = BuildMI(MF, MIMetadata(Root), TII->get(MaddOpc), ResultReg)
              .addReg(SrcReg2, getKillRegState(Src2IsKill))
              .addReg(SrcReg0, getKillRegState(Src0IsKill))
              .addReg(SrcReg1, getKillRegState(Src1IsKill))
              .addImm(MUL->getOperand(3).getImm());
  else if (kind == FMAInstKind::Accumulator)
    MIB = BuildMI(MF, MIMetadata(Root), TII->get(MaddOpc), ResultReg)
              .addReg(SrcReg2, getKillRegState(Src2IsKill))
              .addReg(SrcReg0, getKillRegState(Src0IsKill))
              .addReg(SrcReg1, getKillRegState(Src1IsKill));
  else
    assert(false && "Invalid FMA instruction kind");
  // Insert the MADD (MADD, FMA, FMS, FMLA, FMLS).
  InsInstrs.push_back(MIB);
  return MUL;
}

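/// genFNegatedMAD - Combine FNEG(FMADD A, B, C) into a single FNMADD, i.e.
/// compute -(A * B + C) in one instruction. Returns the FMADD that becomes
/// dead, or nullptr for register classes other than FPR32/FPR64.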
static MachineInstr *
genFNegatedMAD(MachineFunction &MF, MachineRegisterInfo &MRI,
               const TargetInstrInfo *TII, MachineInstr &Root,
               SmallVectorImpl<MachineInstr *> &InsInstrs) {
  MachineInstr *MAD = MRI.getUniqueVRegDef(Root.getOperand(1).getReg());

  unsigned Opc = 0;
  const TargetRegisterClass *RC = MRI.getRegClass(MAD->getOperand(0).getReg());
  if (AArch64::FPR32RegClass.hasSubClassEq(RC))
    Opc = AArch64::FNMADDSrrr;
  else if (AArch64::FPR64RegClass.hasSubClassEq(RC))
    Opc = AArch64::FNMADDDrrr;
  else
    return nullptr;

  Register ResultReg = Root.getOperand(0).getReg();
  Register SrcReg0 = MAD->getOperand(1).getReg();
  Register SrcReg1 = MAD->getOperand(2).getReg();
  Register SrcReg2 = MAD->getOperand(3).getReg();
  bool Src0IsKill = MAD->getOperand(1).isKill();
  bool Src1IsKill = MAD->getOperand(2).isKill();
  bool Src2IsKill = MAD->getOperand(3).isKill();
  if (ResultReg.isVirtual())
    MRI.constrainRegClass(ResultReg, RC);
  if (SrcReg0.isVirtual())
    MRI.constrainRegClass(SrcReg0, RC);
  if (SrcReg1.isVirtual())
    MRI.constrainRegClass(SrcReg1, RC);
  if (SrcReg2.isVirtual())
    MRI.constrainRegClass(SrcReg2, RC);

  MachineInstrBuilder MIB =
      BuildMI(MF, MIMetadata(Root), TII->get(Opc), ResultReg)
          .addReg(SrcReg0, getKillRegState(Src0IsKill))
          .addReg(SrcReg1, getKillRegState(Src1IsKill))
          .addReg(SrcReg2, getKillRegState(Src2IsKill));
  InsInstrs.push_back(MIB);

  return MAD;
}

/// Fold (FMUL x (DUP y lane)) into (FMUL_indexed x y lane).
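/// For example (pseudo-MIR, hypothetical vregs):
///   %d = DUPv2i32lane %y, 1
///   %r = FMULv2f32 %x, %d
///   ==>  %r = FMULv2i32_indexed %x, %y, 1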
static MachineInstr *
genIndexedMultiply(MachineInstr &Root,
                   SmallVectorImpl<MachineInstr *> &InsInstrs,
                   unsigned IdxDupOp, unsigned MulOpc,
                   const TargetRegisterClass *RC, MachineRegisterInfo &MRI) {
  assert(((IdxDupOp == 1) || (IdxDupOp == 2)) &&
         "Invalid index of FMUL operand");

  MachineFunction &MF = *Root.getMF();
  const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();

  MachineInstr *Dup =
      MF.getRegInfo().getUniqueVRegDef(Root.getOperand(IdxDupOp).getReg());

  if (Dup->getOpcode() == TargetOpcode::COPY)
    Dup = MRI.getUniqueVRegDef(Dup->getOperand(1).getReg());

  Register DupSrcReg = Dup->getOperand(1).getReg();
  MRI.clearKillFlags(DupSrcReg);
  MRI.constrainRegClass(DupSrcReg, RC);

  unsigned DupSrcLane = Dup->getOperand(2).getImm();

  unsigned IdxMulOp = IdxDupOp == 1 ? 2 : 1;
  MachineOperand &MulOp = Root.getOperand(IdxMulOp);

  Register ResultReg = Root.getOperand(0).getReg();

  MachineInstrBuilder MIB;
  MIB = BuildMI(MF, MIMetadata(Root), TII->get(MulOpc), ResultReg)
            .add(MulOp)
            .addReg(DupSrcReg)
            .addImm(DupSrcLane);

  InsInstrs.push_back(MIB);
  return &Root;
}

/// genFusedMultiplyAcc - Helper to generate fused multiply accumulate
/// instructions.
///
/// \see genFusedMultiply
static MachineInstr *genFusedMultiplyAcc(
    MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII,
    MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs,
    unsigned IdxMulOpd, unsigned MaddOpc, const TargetRegisterClass *RC) {
  return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
                          FMAInstKind::Accumulator);
}

/// genNeg - Helper to generate an intermediate negation of the second operand
/// of Root.
static Register genNeg(MachineFunction &MF, MachineRegisterInfo &MRI,
                       const TargetInstrInfo *TII, MachineInstr &Root,
                       SmallVectorImpl<MachineInstr *> &InsInstrs,
                       DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
                       unsigned MnegOpc, const TargetRegisterClass *RC) {
  Register NewVR = MRI.createVirtualRegister(RC);
  MachineInstrBuilder MIB =
      BuildMI(MF, MIMetadata(Root), TII->get(MnegOpc), NewVR)
          .add(Root.getOperand(2));
  InsInstrs.push_back(MIB);

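  // Record that NewVR is defined by the instruction at index 0 of InsInstrs,
  // so the machine combiner can track the new virtual register.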
  assert(InstrIdxForVirtReg.empty());
  InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));

  return NewVR;
}

/// genFusedMultiplyAccNeg - Helper to generate fused multiply accumulate
/// instructions with an additional negation of the accumulator.
static MachineInstr *genFusedMultiplyAccNeg(
    MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII,
    MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs,
    DenseMap<unsigned, unsigned> &InstrIdxForVirtReg, unsigned IdxMulOpd,
    unsigned MaddOpc, unsigned MnegOpc, const TargetRegisterClass *RC) {
  assert(IdxMulOpd == 1);

  Register NewVR =
      genNeg(MF, MRI, TII, Root, InsInstrs, InstrIdxForVirtReg, MnegOpc, RC);
  return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
                          FMAInstKind::Accumulator, &NewVR);
}

/// genFusedMultiplyIdx - Helper to generate fused multiply accumulate
/// instructions.
///
/// \see genFusedMultiply
static MachineInstr *genFusedMultiplyIdx(
    MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII,
    MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs,
    unsigned IdxMulOpd, unsigned MaddOpc, const TargetRegisterClass *RC) {
  return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
                          FMAInstKind::Indexed);
}

/// genFusedMultiplyIdxNeg - Helper to generate fused multiply accumulate
/// (indexed) instructions with an additional negation of the accumulator.
static MachineInstr *genFusedMultiplyIdxNeg(
    MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII,
    MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs,
    DenseMap<unsigned, unsigned> &InstrIdxForVirtReg, unsigned IdxMulOpd,
    unsigned MaddOpc, unsigned MnegOpc, const TargetRegisterClass *RC) {
  assert(IdxMulOpd == 1);

  Register NewVR =
      genNeg(MF, MRI, TII, Root, InsInstrs, InstrIdxForVirtReg, MnegOpc, RC);

  return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
                          FMAInstKind::Indexed, &NewVR);
}

/// genMaddR - Generate madd instruction and combine mul and add using
/// an extra virtual register.
/// Example - an ADD intermediate needs to be stored in a register:
///   MUL I=A,B,0
///   ADD R,I,Imm
///   ==> ORR  V, ZR, Imm
///   ==> MADD R,A,B,V
/// \param MF Containing MachineFunction
/// \param MRI Register information
/// \param TII Target information
/// \param Root is the ADD instruction
/// \param [out] InsInstrs is a vector of machine instructions and will
/// contain the generated madd instruction
/// \param IdxMulOpd is index of operand in Root that is the result of
/// the MUL. In the example above IdxMulOpd is 1.
/// \param MaddOpc the opcode of the madd instruction
/// \param VR is a virtual register that holds the value of an ADD operand
/// (V in the example above).
/// \param RC Register class of operands
static MachineInstr *genMaddR(MachineFunction &MF, MachineRegisterInfo &MRI,
                              const TargetInstrInfo *TII, MachineInstr &Root,
                              SmallVectorImpl<MachineInstr *> &InsInstrs,
                              unsigned IdxMulOpd, unsigned MaddOpc, unsigned VR,
                              const TargetRegisterClass *RC) {
  assert(IdxMulOpd == 1 || IdxMulOpd == 2);

  MachineInstr *MUL = MRI.getUniqueVRegDef(Root.getOperand(IdxMulOpd).getReg());
  Register ResultReg = Root.getOperand(0).getReg();
  Register SrcReg0 = MUL->getOperand(1).getReg();
  bool Src0IsKill = MUL->getOperand(1).isKill();
  Register SrcReg1 = MUL->getOperand(2).getReg();
  bool Src1IsKill = MUL->getOperand(2).isKill();

  if (ResultReg.isVirtual())
    MRI.constrainRegClass(ResultReg, RC);
  if (SrcReg0.isVirtual())
    MRI.constrainRegClass(SrcReg0, RC);
  if (SrcReg1.isVirtual())
    MRI.constrainRegClass(SrcReg1, RC);
  if (Register::isVirtualRegister(VR))
    MRI.constrainRegClass(VR, RC);

  MachineInstrBuilder MIB =
      BuildMI(MF, MIMetadata(Root), TII->get(MaddOpc), ResultReg)
          .addReg(SrcReg0, getKillRegState(Src0IsKill))
          .addReg(SrcReg1, getKillRegState(Src1IsKill))
          .addReg(VR);
  // Insert the MADD.
  InsInstrs.push_back(MIB);
  return MUL;
}

/// Do the following transformation
/// A - (B + C)  ==>  (A - B) - C
/// A - (B + C)  ==>  (A - C) - B
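/// Exposing both orders of the inner add lets the machine combiner pick
/// whichever variant better shortens the critical path for the operands'
/// actual availability.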
static void
genSubAdd2SubSub(MachineFunction &MF, MachineRegisterInfo &MRI,
                 const TargetInstrInfo *TII, MachineInstr &Root,
                 SmallVectorImpl<MachineInstr *> &InsInstrs,
                 SmallVectorImpl<MachineInstr *> &DelInstrs,
                 unsigned IdxOpd1,
                 DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) {
  assert(IdxOpd1 == 1 || IdxOpd1 == 2);
  unsigned IdxOtherOpd = IdxOpd1 == 1 ? 2 : 1;
  MachineInstr *AddMI = MRI.getUniqueVRegDef(Root.getOperand(2).getReg());

  Register ResultReg = Root.getOperand(0).getReg();
  Register RegA = Root.getOperand(1).getReg();
  bool RegAIsKill = Root.getOperand(1).isKill();
  Register RegB = AddMI->getOperand(IdxOpd1).getReg();
  bool RegBIsKill = AddMI->getOperand(IdxOpd1).isKill();
  Register RegC = AddMI->getOperand(IdxOtherOpd).getReg();
  bool RegCIsKill = AddMI->getOperand(IdxOtherOpd).isKill();
  Register NewVR = MRI.createVirtualRegister(MRI.getRegClass(RegA));

  unsigned Opcode = Root.getOpcode();
  if (Opcode == AArch64::SUBSWrr)
    Opcode = AArch64::SUBWrr;
  else if (Opcode == AArch64::SUBSXrr)
    Opcode = AArch64::SUBXrr;
  else
    assert((Opcode == AArch64::SUBWrr || Opcode == AArch64::SUBXrr) &&
           "Unexpected instruction opcode.");

  MachineInstrBuilder MIB1 =
      BuildMI(MF, MIMetadata(Root), TII->get(Opcode), NewVR)
          .addReg(RegA, getKillRegState(RegAIsKill))
          .addReg(RegB, getKillRegState(RegBIsKill));
  MachineInstrBuilder MIB2 =
      BuildMI(MF, MIMetadata(Root), TII->get(Opcode), ResultReg)
          .addReg(NewVR, getKillRegState(true))
          .addReg(RegC, getKillRegState(RegCIsKill));

  InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
  InsInstrs.push_back(MIB1);
  InsInstrs.push_back(MIB2);
  DelInstrs.push_back(AddMI);
}

/// When getMachineCombinerPatterns() finds potential patterns,
/// this function generates the instructions that could replace the
/// original code sequence.
void AArch64InstrInfo::genAlternativeCodeSequence(
    MachineInstr &Root, MachineCombinerPattern Pattern,
    SmallVectorImpl<MachineInstr *> &InsInstrs,
    SmallVectorImpl<MachineInstr *> &DelInstrs,
    DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) const {
  MachineBasicBlock &MBB = *Root.getParent();
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
  MachineFunction &MF = *MBB.getParent();
  const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();

  MachineInstr *MUL = nullptr;
  const TargetRegisterClass *RC;
  unsigned Opc;
  switch (Pattern) {
  default:
    // Reassociate instructions.
    TargetInstrInfo::genAlternativeCodeSequence(Root, Pattern, InsInstrs,
                                                DelInstrs, InstrIdxForVirtReg);
    return;
  case MachineCombinerPattern::SUBADD_OP1:
    // A - (B + C)
    // ==> (A - B) - C
    genSubAdd2SubSub(MF, MRI, TII, Root, InsInstrs, DelInstrs, 1,
                     InstrIdxForVirtReg);
    break;
  case MachineCombinerPattern::SUBADD_OP2:
    // A - (B + C)
    // ==> (A - C) - B
    genSubAdd2SubSub(MF, MRI, TII, Root, InsInstrs, DelInstrs, 2,
                     InstrIdxForVirtReg);
    break;
  case MachineCombinerPattern::MULADDW_OP1:
  case MachineCombinerPattern::MULADDX_OP1:
    // MUL I=A,B,0
    // ADD R,I,C
    // ==> MADD R,A,B,C
    // --- Create(MADD);
    if (Pattern == MachineCombinerPattern::MULADDW_OP1) {
      Opc = AArch64::MADDWrrr;
      RC = &AArch64::GPR32RegClass;
    } else {
      Opc = AArch64::MADDXrrr;
      RC = &AArch64::GPR64RegClass;
    }
    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
    break;
  case MachineCombinerPattern::MULADDW_OP2:
  case MachineCombinerPattern::MULADDX_OP2:
    // MUL I=A,B,0
    // ADD R,C,I
    // ==> MADD R,A,B,C
    // --- Create(MADD);
    if (Pattern == MachineCombinerPattern::MULADDW_OP2) {
      Opc = AArch64::MADDWrrr;
      RC = &AArch64::GPR32RegClass;
    } else {
      Opc = AArch64::MADDXrrr;
      RC = &AArch64::GPR64RegClass;
    }
    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
    break;
  case MachineCombinerPattern::MULADDWI_OP1:
  case MachineCombinerPattern::MULADDXI_OP1: {
    // MUL I=A,B,0
    // ADD R,I,Imm
    // ==> MOV  V, Imm
    // ==> MADD R,A,B,V
    // --- Create(MADD);
    const TargetRegisterClass *OrrRC;
    unsigned BitSize, OrrOpc, ZeroReg;
    if (Pattern == MachineCombinerPattern::MULADDWI_OP1) {
      OrrOpc = AArch64::ORRWri;
      OrrRC = &AArch64::GPR32spRegClass;
      BitSize = 32;
      ZeroReg = AArch64::WZR;
      Opc = AArch64::MADDWrrr;
      RC = &AArch64::GPR32RegClass;
    } else {
      OrrOpc = AArch64::ORRXri;
      OrrRC = &AArch64::GPR64spRegClass;
      BitSize = 64;
      ZeroReg = AArch64::XZR;
      Opc = AArch64::MADDXrrr;
      RC = &AArch64::GPR64RegClass;
    }
    Register NewVR = MRI.createVirtualRegister(OrrRC);
    uint64_t Imm = Root.getOperand(2).getImm();

    if (Root.getOperand(3).isImm()) {
      unsigned Val = Root.getOperand(3).getImm();
      Imm = Imm << Val;
    }
    uint64_t UImm = SignExtend64(Imm, BitSize);
    // Bail out if the immediate cannot be composed via a single instruction.
    SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
    AArch64_IMM::expandMOVImm(UImm, BitSize, Insn);
    if (Insn.size() != 1)
      return;
    auto MovI = Insn.begin();
    MachineInstrBuilder MIB1;
    // MOV is an alias for one of three instructions: movz, movn, and orr.
    if (MovI->Opcode == OrrOpc)
      MIB1 = BuildMI(MF, MIMetadata(Root), TII->get(OrrOpc), NewVR)
                 .addReg(ZeroReg)
                 .addImm(MovI->Op2);
    else {
      if (BitSize == 32)
        assert((MovI->Opcode == AArch64::MOVNWi ||
                MovI->Opcode == AArch64::MOVZWi) &&
               "Expected opcode");
      else
        assert((MovI->Opcode == AArch64::MOVNXi ||
                MovI->Opcode == AArch64::MOVZXi) &&
               "Expected opcode");
      MIB1 = BuildMI(MF, MIMetadata(Root), TII->get(MovI->Opcode), NewVR)
                 .addImm(MovI->Op1)
                 .addImm(MovI->Op2);
    }
    InsInstrs.push_back(MIB1);
    InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
    MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC);
    break;
  }
  case MachineCombinerPattern::MULSUBW_OP1:
  case MachineCombinerPattern::MULSUBX_OP1: {
    // MUL I=A,B,0
    // SUB R,I, C
    // ==> SUB  V, 0, C
    // ==> MADD R,A,B,V // = -C + A*B
    // --- Create(MADD);
    const TargetRegisterClass *SubRC;
    unsigned SubOpc, ZeroReg;
    if (Pattern == MachineCombinerPattern::MULSUBW_OP1) {
      SubOpc = AArch64::SUBWrr;
      SubRC = &AArch64::GPR32spRegClass;
      ZeroReg = AArch64::WZR;
      Opc = AArch64::MADDWrrr;
      RC = &AArch64::GPR32RegClass;
    } else {
      SubOpc = AArch64::SUBXrr;
      SubRC = &AArch64::GPR64spRegClass;
      ZeroReg = AArch64::XZR;
      Opc = AArch64::MADDXrrr;
      RC = &AArch64::GPR64RegClass;
    }
    Register NewVR = MRI.createVirtualRegister(SubRC);
    // SUB NewVR, 0, C
    MachineInstrBuilder MIB1 =
        BuildMI(MF, MIMetadata(Root), TII->get(SubOpc), NewVR)
            .addReg(ZeroReg)
            .add(Root.getOperand(2));
    InsInstrs.push_back(MIB1);
    InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
    MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC);
    break;
  }
  case MachineCombinerPattern::MULSUBW_OP2:
  case MachineCombinerPattern::MULSUBX_OP2:
    // MUL I=A,B,0
    // SUB R,C,I
    // ==> MSUB R,A,B,C (computes C - A*B)
    // --- Create(MSUB);
    if (Pattern == MachineCombinerPattern::MULSUBW_OP2) {
      Opc = AArch64::MSUBWrrr;
      RC = &AArch64::GPR32RegClass;
    } else {
      Opc = AArch64::MSUBXrrr;
      RC = &AArch64::GPR64RegClass;
    }
    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
    break;
  case MachineCombinerPattern::MULSUBWI_OP1:
  case MachineCombinerPattern::MULSUBXI_OP1: {
    // MUL I=A,B,0
    // SUB R,I, Imm
    // ==> MOV  V, -Imm
    // ==> MADD R,A,B,V // = -Imm + A*B
    // --- Create(MADD);
    const TargetRegisterClass *OrrRC;
    unsigned BitSize, OrrOpc, ZeroReg;
    if (Pattern == MachineCombinerPattern::MULSUBWI_OP1) {
      OrrOpc = AArch64::ORRWri;
      OrrRC = &AArch64::GPR32spRegClass;
      BitSize = 32;
      ZeroReg = AArch64::WZR;
      Opc = AArch64::MADDWrrr;
      RC = &AArch64::GPR32RegClass;
    } else {
      OrrOpc = AArch64::ORRXri;
      OrrRC = &AArch64::GPR64spRegClass;
      BitSize = 64;
      ZeroReg = AArch64::XZR;
      Opc = AArch64::MADDXrrr;
      RC = &AArch64::GPR64RegClass;
    }
    Register NewVR = MRI.createVirtualRegister(OrrRC);
    uint64_t Imm = Root.getOperand(2).getImm();
    if (Root.getOperand(3).isImm()) {
      unsigned Val = Root.getOperand(3).getImm();
      Imm = Imm << Val;
    }
    uint64_t UImm = SignExtend64(-Imm, BitSize);
    // Bail out if the immediate cannot be composed via a single instruction.
    SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
    AArch64_IMM::expandMOVImm(UImm, BitSize, Insn);
    if (Insn.size() != 1)
      return;
    auto MovI = Insn.begin();
    MachineInstrBuilder MIB1;
    // MOV is an alias for one of three instructions: movz, movn, and orr.
    if (MovI->Opcode == OrrOpc)
      MIB1 = BuildMI(MF, MIMetadata(Root), TII->get(OrrOpc), NewVR)
                 .addReg(ZeroReg)
                 .addImm(MovI->Op2);
    else {
      if (BitSize == 32)
        assert((MovI->Opcode == AArch64::MOVNWi ||
                MovI->Opcode == AArch64::MOVZWi) &&
               "Expected opcode");
      else
        assert((MovI->Opcode == AArch64::MOVNXi ||
                MovI->Opcode == AArch64::MOVZXi) &&
               "Expected opcode");
      MIB1 = BuildMI(MF, MIMetadata(Root), TII->get(MovI->Opcode), NewVR)
                 .addImm(MovI->Op1)
                 .addImm(MovI->Op2);
    }
    InsInstrs.push_back(MIB1);
    InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
    MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC);
    break;
  }

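  // Vector integer multiply-accumulate patterns: a MUL feeding an ADD becomes
  // MLA; a MUL feeding a SUB becomes MLS when the MUL is the subtrahend
  // (_OP2), or a NEG of the accumulator followed by MLA when it is not (_OP1).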
  case MachineCombinerPattern::MULADDv8i8_OP1:
    Opc = AArch64::MLAv8i8;
    RC = &AArch64::FPR64RegClass;
    MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
    break;
  case MachineCombinerPattern::MULADDv8i8_OP2:
    Opc = AArch64::MLAv8i8;
    RC = &AArch64::FPR64RegClass;
    MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
    break;
  case MachineCombinerPattern::MULADDv16i8_OP1:
    Opc = AArch64::MLAv16i8;
    RC = &AArch64::FPR128RegClass;
    MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
    break;
  case MachineCombinerPattern::MULADDv16i8_OP2:
    Opc = AArch64::MLAv16i8;
    RC = &AArch64::FPR128RegClass;
    MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
    break;
  case MachineCombinerPattern::MULADDv4i16_OP1:
    Opc = AArch64::MLAv4i16;
    RC = &AArch64::FPR64RegClass;
    MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
    break;
  case MachineCombinerPattern::MULADDv4i16_OP2:
    Opc = AArch64::MLAv4i16;
    RC = &AArch64::FPR64RegClass;
    MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
    break;
  case MachineCombinerPattern::MULADDv8i16_OP1:
    Opc = AArch64::MLAv8i16;
    RC = &AArch64::FPR128RegClass;
    MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
    break;
  case MachineCombinerPattern::MULADDv8i16_OP2:
    Opc = AArch64::MLAv8i16;
    RC = &AArch64::FPR128RegClass;
    MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
    break;
  case MachineCombinerPattern::MULADDv2i32_OP1:
    Opc = AArch64::MLAv2i32;
    RC = &AArch64::FPR64RegClass;
    MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
    break;
  case MachineCombinerPattern::MULADDv2i32_OP2:
    Opc = AArch64::MLAv2i32;
    RC = &AArch64::FPR64RegClass;
    MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
    break;
  case MachineCombinerPattern::MULADDv4i32_OP1:
    Opc = AArch64::MLAv4i32;
    RC = &AArch64::FPR128RegClass;
    MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
    break;
  case MachineCombinerPattern::MULADDv4i32_OP2:
    Opc = AArch64::MLAv4i32;
    RC = &AArch64::FPR128RegClass;
    MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
    break;

  case MachineCombinerPattern::MULSUBv8i8_OP1:
    Opc = AArch64::MLAv8i8;
    RC = &AArch64::FPR64RegClass;
    MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
                                 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv8i8,
                                 RC);
    break;
  case MachineCombinerPattern::MULSUBv8i8_OP2:
    Opc = AArch64::MLSv8i8;
    RC = &AArch64::FPR64RegClass;
    MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
    break;
  case MachineCombinerPattern::MULSUBv16i8_OP1:
    Opc = AArch64::MLAv16i8;
    RC = &AArch64::FPR128RegClass;
    MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
                                 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv16i8,
                                 RC);
    break;
  case MachineCombinerPattern::MULSUBv16i8_OP2:
    Opc = AArch64::MLSv16i8;
    RC = &AArch64::FPR128RegClass;
    MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
    break;
  case MachineCombinerPattern::MULSUBv4i16_OP1:
    Opc = AArch64::MLAv4i16;
    RC = &AArch64::FPR64RegClass;
    MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
                                 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i16,
                                 RC);
    break;
  case MachineCombinerPattern::MULSUBv4i16_OP2:
    Opc = AArch64::MLSv4i16;
    RC = &AArch64::FPR64RegClass;
    MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
    break;
  case MachineCombinerPattern::MULSUBv8i16_OP1:
    Opc = AArch64::MLAv8i16;
    RC = &AArch64::FPR128RegClass;
    MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
                                 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv8i16,
                                 RC);
    break;
  case MachineCombinerPattern::MULSUBv8i16_OP2:
    Opc = AArch64::MLSv8i16;
    RC = &AArch64::FPR128RegClass;
    MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
    break;
  case MachineCombinerPattern::MULSUBv2i32_OP1:
    Opc = AArch64::MLAv2i32;
    RC = &AArch64::FPR64RegClass;
    MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
                                 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv2i32,
                                 RC);
    break;
  case MachineCombinerPattern::MULSUBv2i32_OP2:
    Opc = AArch64::MLSv2i32;
    RC = &AArch64::FPR64RegClass;
    MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
    break;
  case MachineCombinerPattern::MULSUBv4i32_OP1:
    Opc = AArch64::MLAv4i32;
    RC = &AArch64::FPR128RegClass;
    MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
                                 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i32,
                                 RC);
    break;
  case MachineCombinerPattern::MULSUBv4i32_OP2:
    Opc = AArch64::MLSv4i32;
    RC = &AArch64::FPR128RegClass;
    MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
    break;

  case MachineCombinerPattern::MULADDv4i16_indexed_OP1:
    Opc = AArch64::MLAv4i16_indexed;
    RC = &AArch64::FPR64RegClass;
    MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
    break;
  case MachineCombinerPattern::MULADDv4i16_indexed_OP2:
    Opc = AArch64::MLAv4i16_indexed;
    RC = &AArch64::FPR64RegClass;
    MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
    break;
  case MachineCombinerPattern::MULADDv8i16_indexed_OP1:
    Opc = AArch64::MLAv8i16_indexed;
    RC = &AArch64::FPR128RegClass;
    MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
    break;
  case MachineCombinerPattern::MULADDv8i16_indexed_OP2:
    Opc = AArch64::MLAv8i16_indexed;
    RC = &AArch64::FPR128RegClass;
    MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
    break;
  case MachineCombinerPattern::MULADDv2i32_indexed_OP1:
    Opc = AArch64::MLAv2i32_indexed;
    RC = &AArch64::FPR64RegClass;
    MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
    break;
  case MachineCombinerPattern::MULADDv2i32_indexed_OP2:
    Opc = AArch64::MLAv2i32_indexed;
    RC = &AArch64::FPR64RegClass;
    MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
    break;
  case MachineCombinerPattern::MULADDv4i32_indexed_OP1:
    Opc = AArch64::MLAv4i32_indexed;
    RC = &AArch64::FPR128RegClass;
    MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
    break;
  case MachineCombinerPattern::MULADDv4i32_indexed_OP2:
    Opc = AArch64::MLAv4i32_indexed;
    RC = &AArch64::FPR128RegClass;
    MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
    break;

  case MachineCombinerPattern::MULSUBv4i16_indexed_OP1:
    Opc = AArch64::MLAv4i16_indexed;
    RC = &AArch64::FPR64RegClass;
    MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs,
                                 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i16,
                                 RC);
    break;
  case MachineCombinerPattern::MULSUBv4i16_indexed_OP2:
    Opc = AArch64::MLSv4i16_indexed;
    RC = &AArch64::FPR64RegClass;
    MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
    break;
  case MachineCombinerPattern::MULSUBv8i16_indexed_OP1:
    Opc = AArch64::MLAv8i16_indexed;
    RC = &AArch64::FPR128RegClass;
    MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs,
                                 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv8i16,
                                 RC);
    break;
  case MachineCombinerPattern::MULSUBv8i16_indexed_OP2:
    Opc = AArch64::MLSv8i16_indexed;
    RC = &AArch64::FPR128RegClass;
    MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
    break;
  case MachineCombinerPattern::MULSUBv2i32_indexed_OP1:
    Opc = AArch64::MLAv2i32_indexed;
    RC = &AArch64::FPR64RegClass;
    MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs,
                                 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv2i32,
                                 RC);
    break;
  case MachineCombinerPattern::MULSUBv2i32_indexed_OP2:
    Opc = AArch64::MLSv2i32_indexed;
    RC = &AArch64::FPR64RegClass;
    MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
    break;
  case MachineCombinerPattern::MULSUBv4i32_indexed_OP1:
    Opc = AArch64::MLAv4i32_indexed;
    RC = &AArch64::FPR128RegClass;
    MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs,
                                 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i32,
                                 RC);
    break;
  case MachineCombinerPattern::MULSUBv4i32_indexed_OP2:
    Opc = AArch64::MLSv4i32_indexed;
    RC = &AArch64::FPR128RegClass;
    MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
    break;

  // Floating Point Support
  case MachineCombinerPattern::FMULADDH_OP1:
    Opc = AArch64::FMADDHrrr;
    RC = &AArch64::FPR16RegClass;
    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
    break;
  case MachineCombinerPattern::FMULADDS_OP1:
    Opc = AArch64::FMADDSrrr;
    RC = &AArch64::FPR32RegClass;
    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
    break;
  case MachineCombinerPattern::FMULADDD_OP1:
    Opc = AArch64::FMADDDrrr;
    RC = &AArch64::FPR64RegClass;
    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
    break;

  case MachineCombinerPattern::FMULADDH_OP2:
    Opc = AArch64::FMADDHrrr;
    RC = &AArch64::FPR16RegClass;
    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
    break;
  case MachineCombinerPattern::FMULADDS_OP2:
    Opc = AArch64::FMADDSrrr;
    RC = &AArch64::FPR32RegClass;
    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
    break;
  case MachineCombinerPattern::FMULADDD_OP2:
    Opc = AArch64::FMADDDrrr;
    RC = &AArch64::FPR64RegClass;
    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
    break;

  case MachineCombinerPattern::FMLAv1i32_indexed_OP1:
    Opc = AArch64::FMLAv1i32_indexed;
    RC = &AArch64::FPR32RegClass;
    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
                           FMAInstKind::Indexed);
    break;
  case MachineCombinerPattern::FMLAv1i32_indexed_OP2:
    Opc = AArch64::FMLAv1i32_indexed;
    RC = &AArch64::FPR32RegClass;
    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
                           FMAInstKind::Indexed);
    break;

  case MachineCombinerPattern::FMLAv1i64_indexed_OP1:
    Opc = AArch64::FMLAv1i64_indexed;
    RC = &AArch64::FPR64RegClass;
    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
                           FMAInstKind::Indexed);
    break;
  case MachineCombinerPattern::FMLAv1i64_indexed_OP2:
    Opc = AArch64::FMLAv1i64_indexed;
    RC = &AArch64::FPR64RegClass;
    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
                           FMAInstKind::Indexed);
    break;

  case MachineCombinerPattern::FMLAv4i16_indexed_OP1:
    RC = &AArch64::FPR64RegClass;
    Opc = AArch64::FMLAv4i16_indexed;
    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
                           FMAInstKind::Indexed);
    break;
  case MachineCombinerPattern::FMLAv4f16_OP1:
    RC = &AArch64::FPR64RegClass;
    Opc = AArch64::FMLAv4f16;
    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
                           FMAInstKind::Accumulator);
    break;
  case MachineCombinerPattern::FMLAv4i16_indexed_OP2:
    RC = &AArch64::FPR64RegClass;
    Opc = AArch64::FMLAv4i16_indexed;
    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
                           FMAInstKind::Indexed);
    break;
  case MachineCombinerPattern::FMLAv4f16_OP2:
    RC = &AArch64::FPR64RegClass;
    Opc = AArch64::FMLAv4f16;
    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
                           FMAInstKind::Accumulator);
    break;

  case MachineCombinerPattern::FMLAv2i32_indexed_OP1:
  case MachineCombinerPattern::FMLAv2f32_OP1:
    RC = &AArch64::FPR64RegClass;
    if (Pattern == MachineCombinerPattern::FMLAv2i32_indexed_OP1) {
      Opc = AArch64::FMLAv2i32_indexed;
      MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
                             FMAInstKind::Indexed);
    } else {
      Opc = AArch64::FMLAv2f32;
      MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
                             FMAInstKind::Accumulator);
    }
    break;
  case MachineCombinerPattern::FMLAv2i32_indexed_OP2:
  case MachineCombinerPattern::FMLAv2f32_OP2:
    RC = &AArch64::FPR64RegClass;
    if (Pattern == MachineCombinerPattern::FMLAv2i32_indexed_OP2) {
      Opc = AArch64::FMLAv2i32_indexed;
      MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
                             FMAInstKind::Indexed);
    } else {
      Opc = AArch64::FMLAv2f32;
      MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
                             FMAInstKind::Accumulator);
    }
    break;

  case MachineCombinerPattern::FMLAv8i16_indexed_OP1:
    RC = &AArch64::FPR128RegClass;
    Opc = AArch64::FMLAv8i16_indexed;
    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
                           FMAInstKind::Indexed);
    break;
  case MachineCombinerPattern::FMLAv8f16_OP1:
    RC = &AArch64::FPR128RegClass;
    Opc = AArch64::FMLAv8f16;
    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
                           FMAInstKind::Accumulator);
    break;
  case MachineCombinerPattern::FMLAv8i16_indexed_OP2:
    RC = &AArch64::FPR128RegClass;
    Opc = AArch64::FMLAv8i16_indexed;
    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
                           FMAInstKind::Indexed);
    break;
  case MachineCombinerPattern::FMLAv8f16_OP2:
    RC = &AArch64::FPR128RegClass;
    Opc = AArch64::FMLAv8f16;
    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
                           FMAInstKind::Accumulator);
    break;

  case MachineCombinerPattern::FMLAv2i64_indexed_OP1:
  case MachineCombinerPattern::FMLAv2f64_OP1:
    RC = &AArch64::FPR128RegClass;
    if (Pattern == MachineCombinerPattern::FMLAv2i64_indexed_OP1) {
      Opc = AArch64::FMLAv2i64_indexed;
      MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
                             FMAInstKind::Indexed);
    } else {
      Opc = AArch64::FMLAv2f64;
      MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
                             FMAInstKind::Accumulator);
    }
    break;
  case MachineCombinerPattern::FMLAv2i64_indexed_OP2:
  case MachineCombinerPattern::FMLAv2f64_OP2:
    RC = &AArch64::FPR128RegClass;
    if (Pattern == MachineCombinerPattern::FMLAv2i64_indexed_OP2) {
      Opc = AArch64::FMLAv2i64_indexed;
      MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
                             FMAInstKind::Indexed);
    } else {
      Opc = AArch64::FMLAv2f64;
      MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
                             FMAInstKind::Accumulator);
    }
    break;

  case MachineCombinerPattern::FMLAv4i32_indexed_OP1:
  case MachineCombinerPattern::FMLAv4f32_OP1:
    RC = &AArch64::FPR128RegClass;
    if (Pattern == MachineCombinerPattern::FMLAv4i32_indexed_OP1) {
      Opc = AArch64::FMLAv4i32_indexed;
      MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
                             FMAInstKind::Indexed);
    } else {
      Opc = AArch64::FMLAv4f32;
      MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
                             FMAInstKind::Accumulator);
    }
    break;

  case MachineCombinerPattern::FMLAv4i32_indexed_OP2:
  case MachineCombinerPattern::FMLAv4f32_OP2:
    RC = &AArch64::FPR128RegClass;
    if (Pattern == MachineCombinerPattern::FMLAv4i32_indexed_OP2) {
      Opc = AArch64::FMLAv4i32_indexed;
      MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
                             FMAInstKind::Indexed);
    } else {
      Opc = AArch64::FMLAv4f32;
      MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
                             FMAInstKind::Accumulator);
    }
    break;

  case MachineCombinerPattern::FMULSUBH_OP1:
    Opc = AArch64::FNMSUBHrrr;
    RC = &AArch64::FPR16RegClass;
    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
    break;
  case MachineCombinerPattern::FMULSUBS_OP1:
    Opc = AArch64::FNMSUBSrrr;
    RC = &AArch64::FPR32RegClass;
    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
    break;
  case MachineCombinerPattern::FMULSUBD_OP1:
    Opc = AArch64::FNMSUBDrrr;
    RC = &AArch64::FPR64RegClass;
    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
    break;

  case MachineCombinerPattern::FNMULSUBH_OP1:
    Opc = AArch64::FNMADDHrrr;
    RC = &AArch64::FPR16RegClass;
    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
    break;
  case MachineCombinerPattern::FNMULSUBS_OP1:
    Opc = AArch64::FNMADDSrrr;
    RC = &AArch64::FPR32RegClass;
    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
    break;
  case MachineCombinerPattern::FNMULSUBD_OP1:
    Opc = AArch64::FNMADDDrrr;
    RC = &AArch64::FPR64RegClass;
    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
    break;

  case MachineCombinerPattern::FMULSUBH_OP2:
    Opc = AArch64::FMSUBHrrr;
    RC = &AArch64::FPR16RegClass;
    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
    break;
  case MachineCombinerPattern::FMULSUBS_OP2:
    Opc = AArch64::FMSUBSrrr;
    RC = &AArch64::FPR32RegClass;
    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
    break;
  case MachineCombinerPattern::FMULSUBD_OP2:
    Opc = AArch64::FMSUBDrrr;
    RC = &AArch64::FPR64RegClass;
    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
    break;

  case MachineCombinerPattern::FMLSv1i32_indexed_OP2:
    Opc = AArch64::FMLSv1i32_indexed;
    RC = &AArch64::FPR32RegClass;
    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
                           FMAInstKind::Indexed);
    break;

  case MachineCombinerPattern::FMLSv1i64_indexed_OP2:
    Opc = AArch64::FMLSv1i64_indexed;
    RC = &AArch64::FPR64RegClass;
    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
                           FMAInstKind::Indexed);
    break;

  case MachineCombinerPattern::FMLSv4f16_OP1:
  case MachineCombinerPattern::FMLSv4i16_indexed_OP1: {
    RC = &AArch64::FPR64RegClass;
    Register NewVR = MRI.createVirtualRegister(RC);
    MachineInstrBuilder MIB1 =
        BuildMI(MF, MIMetadata(Root), TII->get(AArch64::FNEGv4f16), NewVR)
            .add(Root.getOperand(2));
    InsInstrs.push_back(MIB1);
    InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
    if (Pattern == MachineCombinerPattern::FMLSv4f16_OP1) {
      Opc = AArch64::FMLAv4f16;
      MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
                             FMAInstKind::Accumulator, &NewVR);
    } else {
      Opc = AArch64::FMLAv4i16_indexed;
      MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
                             FMAInstKind::Indexed, &NewVR);
    }
    break;
  }
  case MachineCombinerPattern::FMLSv4f16_OP2:
    RC = &AArch64::FPR64RegClass;
    Opc = AArch64::FMLSv4f16;
    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
                           FMAInstKind::Accumulator);
    break;
  case MachineCombinerPattern::FMLSv4i16_indexed_OP2:
    RC = &AArch64::FPR64RegClass;
    Opc = AArch64::FMLSv4i16_indexed;
    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
                           FMAInstKind::Indexed);
    break;

  case MachineCombinerPattern::FMLSv2f32_OP2:
  case MachineCombinerPattern::FMLSv2i32_indexed_OP2:
    RC = &AArch64::FPR64RegClass;
    if (Pattern == MachineCombinerPattern::FMLSv2i32_indexed_OP2) {
      Opc = AArch64::FMLSv2i32_indexed;
      MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
                             FMAInstKind::Indexed);
    } else {
      Opc = AArch64::FMLSv2f32;
      MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
                             FMAInstKind::Accumulator);
    }
    break;

  case MachineCombinerPattern::FMLSv8f16_OP1:
  case MachineCombinerPattern::FMLSv8i16_indexed_OP1: {
    RC = &AArch64::FPR128RegClass;
    Register NewVR = MRI.createVirtualRegister(RC);
    MachineInstrBuilder MIB1 =
        BuildMI(MF, MIMetadata(Root), TII->get(AArch64::FNEGv8f16), NewVR)
            .add(Root.getOperand(2));
    InsInstrs.push_back(MIB1);
    InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
    if (Pattern == MachineCombinerPattern::FMLSv8f16_OP1) {
      Opc = AArch64::FMLAv8f16;
      MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
                             FMAInstKind::Accumulator, &NewVR);
    } else {
      Opc = AArch64::FMLAv8i16_indexed;
      MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
                             FMAInstKind::Indexed, &NewVR);
    }
    break;
  }
  case MachineCombinerPattern::FMLSv8f16_OP2:
    RC = &AArch64::FPR128RegClass;
    Opc = AArch64::FMLSv8f16;
    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
                           FMAInstKind::Accumulator);
    break;
  case MachineCombinerPattern::FMLSv8i16_indexed_OP2:
    RC = &AArch64::FPR128RegClass;
    Opc = AArch64::FMLSv8i16_indexed;
    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
                           FMAInstKind::Indexed);
    break;

  case MachineCombinerPattern::FMLSv2f64_OP2:
  case MachineCombinerPattern::FMLSv2i64_indexed_OP2:
    RC = &AArch64::FPR128RegClass;
    if (Pattern == MachineCombinerPattern::FMLSv2i64_indexed_OP2) {
      Opc = AArch64::FMLSv2i64_indexed;
      MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
                             FMAInstKind::Indexed);
    } else {
      Opc = AArch64::FMLSv2f64;
      MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
                             FMAInstKind::Accumulator);
    }
    break;

  case MachineCombinerPattern::FMLSv4f32_OP2:
  case MachineCombinerPattern::FMLSv4i32_indexed_OP2:
    RC = &AArch64::FPR128RegClass;
    if (Pattern == MachineCombinerPattern::FMLSv4i32_indexed_OP2) {
      Opc = AArch64::FMLSv4i32_indexed;
      MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
                             FMAInstKind::Indexed);
    } else {
      Opc = AArch64::FMLSv4f32;
      MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
                             FMAInstKind::Accumulator);
    }
    break;
  case MachineCombinerPattern::FMLSv2f32_OP1:
  case MachineCombinerPattern::FMLSv2i32_indexed_OP1: {
    RC = &AArch64::FPR64RegClass;
    Register NewVR = MRI.createVirtualRegister(RC);
    MachineInstrBuilder MIB1 =
        BuildMI(MF, MIMetadata(Root), TII->get(AArch64::FNEGv2f32), NewVR)
            .add(Root.getOperand(2));
    InsInstrs.push_back(MIB1);
    InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
    if (Pattern == MachineCombinerPattern::FMLSv2i32_indexed_OP1) {
      Opc = AArch64::FMLAv2i32_indexed;
      MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
                             FMAInstKind::Indexed, &NewVR);
    } else {
      Opc = AArch64::FMLAv2f32;
      MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
                             FMAInstKind::Accumulator, &NewVR);
    }
    break;
  }
  case MachineCombinerPattern::FMLSv4f32_OP1:
  case MachineCombinerPattern::FMLSv4i32_indexed_OP1: {
    RC = &AArch64::FPR128RegClass;
    Register NewVR = MRI.createVirtualRegister(RC);
    MachineInstrBuilder MIB1 =
        BuildMI(MF, MIMetadata(Root), TII->get(AArch64::FNEGv4f32), NewVR)
            .add(Root.getOperand(2));
    InsInstrs.push_back(MIB1);
    InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
    if (Pattern == MachineCombinerPattern::FMLSv4i32_indexed_OP1) {
      Opc = AArch64::FMLAv4i32_indexed;
      MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
                             FMAInstKind::Indexed, &NewVR);
    } else {
7773 Opc = AArch64::FMLAv4f32;
7774 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
7775 FMAInstKind::Accumulator, &NewVR);
7776 }
7777 break;
7778 }
7779 case MachineCombinerPattern::FMLSv2f64_OP1:
7780 case MachineCombinerPattern::FMLSv2i64_indexed_OP1: {
7781 RC = &AArch64::FPR128RegClass;
7782 Register NewVR = MRI.createVirtualRegister(RC);
7783 MachineInstrBuilder MIB1 =
7784 BuildMI(MF, MIMetadata(Root), TII->get(AArch64::FNEGv2f64), NewVR)
7785 .add(Root.getOperand(2));
7786 InsInstrs.push_back(MIB1);
7787 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
7788 if (Pattern == MachineCombinerPattern::FMLSv2i64_indexed_OP1) {
7789 Opc = AArch64::FMLAv2i64_indexed;
7790 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
7791 FMAInstKind::Indexed, &NewVR);
7792 } else {
7793 Opc = AArch64::FMLAv2f64;
7794 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
7795 FMAInstKind::Accumulator, &NewVR);
7796 }
7797 break;
7798 }
7799 case MachineCombinerPattern::FMULv2i32_indexed_OP1:
7800 case MachineCombinerPattern::FMULv2i32_indexed_OP2: {
7801 unsigned IdxDupOp =
7802 (Pattern == MachineCombinerPattern::FMULv2i32_indexed_OP1) ? 1 : 2;
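// One operand of the multiply is a DUP of a single lane; IdxDupOp records
// which one, so genIndexedMultiply can fold fmul(x, dup(y, lane)) into an
// indexed multiply on y[lane].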
7803 genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv2i32_indexed,
7804 &AArch64::FPR128RegClass, MRI);
7805 break;
7806 }
7807 case MachineCombinerPattern::FMULv2i64_indexed_OP1:
7808 case MachineCombinerPattern::FMULv2i64_indexed_OP2: {
7809 unsigned IdxDupOp =
7810 (Pattern == MachineCombinerPattern::FMULv2i64_indexed_OP1) ? 1 : 2;
7811 genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv2i64_indexed,
7812 &AArch64::FPR128RegClass, MRI);
7813 break;
7814 }
7815 case MachineCombinerPattern::FMULv4i16_indexed_OP1:
7816 case MachineCombinerPattern::FMULv4i16_indexed_OP2: {
7817 unsigned IdxDupOp =
7818 (Pattern == MachineCombinerPattern::FMULv4i16_indexed_OP1) ? 1 : 2;
7819 genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv4i16_indexed,
7820 &AArch64::FPR128_loRegClass, MRI);
7821 break;
7822 }
7823 case MachineCombinerPattern::FMULv4i32_indexed_OP1:
7824 case MachineCombinerPattern::FMULv4i32_indexed_OP2: {
7825 unsigned IdxDupOp =
7826 (Pattern == MachineCombinerPattern::FMULv4i32_indexed_OP1) ? 1 : 2;
7827 genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv4i32_indexed,
7828 &AArch64::FPR128RegClass, MRI);
7829 break;
7830 }
7831 case MachineCombinerPattern::FMULv8i16_indexed_OP1:
7832 case MachineCombinerPattern::FMULv8i16_indexed_OP2: {
7833 unsigned IdxDupOp =
7834 (Pattern == MachineCombinerPattern::FMULv8i16_indexed_OP1) ? 1 : 2;
7835 genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv8i16_indexed,
7836 &AArch64::FPR128_loRegClass, MRI);
7837 break;
7838 }
7839 case MachineCombinerPattern::FNMADD: {
7840 MUL = genFNegatedMAD(MF, MRI, TII, Root, InsInstrs);
7841 break;
7842 }
7843
7844 } // end switch (Pattern)
7845 // Record MUL and ADD/SUB for deletion
7846 if (MUL)
7847 DelInstrs.push_back(MUL);
7848 DelInstrs.push_back(&Root);
7849
7850 // Set the flags on the inserted instructions to be the merged flags of the
7851 // instructions that we have combined.
7852 uint32_t Flags = Root.getFlags();
7853 if (MUL)
7854 Flags = Root.mergeFlagsWith(*MUL);
7855 for (auto *MI : InsInstrs)
7856 MI->setFlags(Flags);
7857 }
7858
7859 /// Replace csinc-branch sequences by a simple conditional branch
7860 ///
7861 /// Examples:
7862 /// 1. \code
7863 /// csinc w9, wzr, wzr, <condition code>
7864 /// tbnz w9, #0, 0x44
7865 /// \endcode
7866 /// to
7867 /// \code
7868 /// b.<inverted condition code>
7869 /// \endcode
7870 ///
7871 /// 2. \code
7872 /// csinc w9, wzr, wzr, <condition code>
7873 /// tbz w9, #0, 0x44
7874 /// \endcode
7875 /// to
7876 /// \code
7877 /// b.<condition code>
7878 /// \endcode
7879 ///
7880 /// Replace compare and branch sequence by TBZ/TBNZ instruction when the
7881 /// compare's constant operand is a power of 2.
7882 ///
7883 /// Examples:
7884 /// \code
7885 /// and w8, w8, #0x400
7886 /// cbnz w8, L1
7887 /// \endcode
7888 /// to
7889 /// \code
7890 /// tbnz w8, #10, L1
7891 /// \endcode
7892 ///
7893 /// \param MI Conditional Branch
7894 /// \return True when the simple conditional branch is generated
7895 ///
7896 bool AArch64InstrInfo::optimizeCondBranch(MachineInstr &MI) const {
7897 bool IsNegativeBranch = false;
7898 bool IsTestAndBranch = false;
7899 unsigned TargetBBInMI = 0;
7900 switch (MI.getOpcode()) {
7901 default:
7902 llvm_unreachable("Unknown branch instruction?");
7903 case AArch64::Bcc:
7904 return false;
7905 case AArch64::CBZW:
7906 case AArch64::CBZX:
7907 TargetBBInMI = 1;
7908 break;
7909 case AArch64::CBNZW:
7910 case AArch64::CBNZX:
7911 TargetBBInMI = 1;
7912 IsNegativeBranch = true;
7913 break;
7914 case AArch64::TBZW:
7915 case AArch64::TBZX:
7916 TargetBBInMI = 2;
7917 IsTestAndBranch = true;
7918 break;
7919 case AArch64::TBNZW:
7920 case AArch64::TBNZX:
7921 TargetBBInMI = 2;
7922 IsNegativeBranch = true;
7923 IsTestAndBranch = true;
7924 break;
7925 }
7926 // So we increment a zero register and test for bits other
7927 // than bit 0? Conservatively bail out in case the verifier
7928 // missed this case.
7929 if (IsTestAndBranch && MI.getOperand(1).getImm())
7930 return false;
7931
7932 // Find Definition.
7933 assert(MI.getParent() && "Incomplete machine instruction");
7934 MachineBasicBlock *MBB = MI.getParent();
7935 MachineFunction *MF = MBB->getParent();
7936 MachineRegisterInfo *MRI = &MF->getRegInfo();
7937 Register VReg = MI.getOperand(0).getReg();
7938 if (!VReg.isVirtual())
7939 return false;
7940
7941 MachineInstr *DefMI = MRI->getVRegDef(VReg);
7942
7943 // Look through COPY instructions to find definition.
7944 while (DefMI->isCopy()) {
7945 Register CopyVReg = DefMI->getOperand(1).getReg();
7946 if (!MRI->hasOneNonDBGUse(CopyVReg))
7947 return false;
7948 if (!MRI->hasOneDef(CopyVReg))
7949 return false;
7950 DefMI = MRI->getVRegDef(CopyVReg);
7951 }
7952
7953 switch (DefMI->getOpcode()) {
7954 default:
7955 return false;
7956 // Fold AND into a TBZ/TBNZ if constant operand is power of 2.
7957 case AArch64::ANDWri:
7958 case AArch64::ANDXri: {
7959 if (IsTestAndBranch)
7960 return false;
7961 if (DefMI->getParent() != MBB)
7962 return false;
7963 if (!MRI->hasOneNonDBGUse(VReg))
7964 return false;
7965
7966 bool Is32Bit = (DefMI->getOpcode() == AArch64::ANDWri);
7967 uint64_t Mask = AArch64_AM::decodeLogicalImmediate(
7968 DefMI->getOperand(2).getImm(), Is32Bit ? 32 : 64);
7969 if (!isPowerOf2_64(Mask))
7970 return false;
7971
7972 MachineOperand &MO = DefMI->getOperand(1);
7973 Register NewReg = MO.getReg();
7974 if (!NewReg.isVirtual())
7975 return false;
7976
7977 assert(!MRI->def_empty(NewReg) && "Register must be defined.");
7978
7979 MachineBasicBlock &RefToMBB = *MBB;
7980 MachineBasicBlock *TBB = MI.getOperand(1).getMBB();
7981 DebugLoc DL = MI.getDebugLoc();
7982 unsigned Imm = Log2_64(Mask);
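// E.g. "and w8, w8, #0x400" gives Mask == 0x400 and Imm == 10, so the
// branch becomes "tbnz w8, #10, L1" as in the header comment above.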
7983 unsigned Opc = (Imm < 32)
7984 ? (IsNegativeBranch ? AArch64::TBNZW : AArch64::TBZW)
7985 : (IsNegativeBranch ? AArch64::TBNZX : AArch64::TBZX);
7986 MachineInstr *NewMI = BuildMI(RefToMBB, MI, DL, get(Opc))
7987 .addReg(NewReg)
7988 .addImm(Imm)
7989 .addMBB(TBB);
7990 // Register lives on to the TBZ/TBNZ now.
7991 MO.setIsKill(false);
7992
7993 // For immediates smaller than 32, we need to use the 32-bit
7994 // variant (W) in all cases, because the 64-bit variant cannot
7995 // encode them.
7996 // Therefore, if the input register is 64-bit, we need to take its
7997 // 32-bit sub-register.
7998 if (!Is32Bit && Imm < 32)
7999 NewMI->getOperand(0).setSubReg(AArch64::sub_32);
8000 MI.eraseFromParent();
8001 return true;
8002 }
8003 // Look for CSINC
8004 case AArch64::CSINCWr:
8005 case AArch64::CSINCXr: {
8006 if (!(DefMI->getOperand(1).getReg() == AArch64::WZR &&
8007 DefMI->getOperand(2).getReg() == AArch64::WZR) &&
8008 !(DefMI->getOperand(1).getReg() == AArch64::XZR &&
8009 DefMI->getOperand(2).getReg() == AArch64::XZR))
8010 return false;
8011
8012 if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, true) != -1)
8013 return false;
8014
8015 AArch64CC::CondCode CC = (AArch64CC::CondCode)DefMI->getOperand(3).getImm();
8016 // Convert only when the condition code is not modified between
8017 // the CSINC and the branch. The CC may be used by other
8018 // instructions in between.
8019 if (areCFlagsAccessedBetweenInstrs(DefMI, MI, &getRegisterInfo(), AK_Write))
8020 return false;
8021 MachineBasicBlock &RefToMBB = *MBB;
8022 MachineBasicBlock *TBB = MI.getOperand(TargetBBInMI).getMBB();
8023 DebugLoc DL = MI.getDebugLoc();
8024 if (IsNegativeBranch)
8025 CC = AArch64CC::getInvertedCondCode(CC);
8026 BuildMI(RefToMBB, MI, DL, get(AArch64::Bcc)).addImm(CC).addMBB(TBB);
8027 MI.eraseFromParent();
8028 return true;
8029 }
8030 }
8031 }
8032
8033 std::pair<unsigned, unsigned>
8034 AArch64InstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const {
8035 const unsigned Mask = AArch64II::MO_FRAGMENT;
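// E.g. TF == (MO_PAGEOFF | MO_GOT) decomposes into the direct fragment
// MO_PAGEOFF and the bitmask remainder MO_GOT.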
8036 return std::make_pair(TF & Mask, TF & ~Mask);
8037 }
8038
8039 ArrayRef<std::pair<unsigned, const char *>>
8040 AArch64InstrInfo::getSerializableDirectMachineOperandTargetFlags() const {
8041 using namespace AArch64II;
8042
8043 static const std::pair<unsigned, const char *> TargetFlags[] = {
8044 {MO_PAGE, "aarch64-page"}, {MO_PAGEOFF, "aarch64-pageoff"},
8045 {MO_G3, "aarch64-g3"}, {MO_G2, "aarch64-g2"},
8046 {MO_G1, "aarch64-g1"}, {MO_G0, "aarch64-g0"},
8047 {MO_HI12, "aarch64-hi12"}};
8048 return ArrayRef(TargetFlags);
8049 }
8050
8051 ArrayRef<std::pair<unsigned, const char *>>
8052 AArch64InstrInfo::getSerializableBitmaskMachineOperandTargetFlags() const {
8053 using namespace AArch64II;
8054
8055 static const std::pair<unsigned, const char *> TargetFlags[] = {
8056 {MO_COFFSTUB, "aarch64-coffstub"},
8057 {MO_GOT, "aarch64-got"},
8058 {MO_NC, "aarch64-nc"},
8059 {MO_S, "aarch64-s"},
8060 {MO_TLS, "aarch64-tls"},
8061 {MO_DLLIMPORT, "aarch64-dllimport"},
8062 {MO_PREL, "aarch64-prel"},
8063 {MO_TAGGED, "aarch64-tagged"},
8064 {MO_ARM64EC_CALLMANGLE, "aarch64-arm64ec-callmangle"},
8065 };
8066 return ArrayRef(TargetFlags);
8067 }
8068
8069 ArrayRef<std::pair<MachineMemOperand::Flags, const char *>>
8070 AArch64InstrInfo::getSerializableMachineMemOperandTargetFlags() const {
8071 static const std::pair<MachineMemOperand::Flags, const char *> TargetFlags[] =
8072 {{MOSuppressPair, "aarch64-suppress-pair"},
8073 {MOStridedAccess, "aarch64-strided-access"}};
8074 return ArrayRef(TargetFlags);
8075 }
8076
8077 /// Constants defining how certain sequences should be outlined.
8078 /// This encompasses how an outlined function should be called, and what kind of
8079 /// frame should be emitted for that outlined function.
8080 ///
8081 /// \p MachineOutlinerDefault implies that the function should be called with
8082 /// a save and restore of LR to the stack.
8083 ///
8084 /// That is,
8085 ///
8086 /// I1 Save LR OUTLINED_FUNCTION:
8087 /// I2 --> BL OUTLINED_FUNCTION I1
8088 /// I3 Restore LR I2
8089 /// I3
8090 /// RET
8091 ///
8092 /// * Call construction overhead: 3 (save + BL + restore)
8093 /// * Frame construction overhead: 1 (ret)
8094 /// * Requires stack fixups? Yes
8095 ///
8096 /// \p MachineOutlinerTailCall implies that the function is being created from
8097 /// a sequence of instructions ending in a return.
8098 ///
8099 /// That is,
8100 ///
8101 /// I1 OUTLINED_FUNCTION:
8102 /// I2 --> B OUTLINED_FUNCTION I1
8103 /// RET I2
8104 /// RET
8105 ///
8106 /// * Call construction overhead: 1 (B)
8107 /// * Frame construction overhead: 0 (Return included in sequence)
8108 /// * Requires stack fixups? No
8109 ///
8110 /// \p MachineOutlinerNoLRSave implies that the function should be called using
8111 /// a BL instruction, but doesn't require LR to be saved and restored. This
8112 /// happens when LR is known to be dead.
8113 ///
8114 /// That is,
8115 ///
8116 /// I1 OUTLINED_FUNCTION:
8117 /// I2 --> BL OUTLINED_FUNCTION I1
8118 /// I3 I2
8119 /// I3
8120 /// RET
8121 ///
8122 /// * Call construction overhead: 1 (BL)
8123 /// * Frame construction overhead: 1 (RET)
8124 /// * Requires stack fixups? No
8125 ///
8126 /// \p MachineOutlinerThunk implies that the function is being created from
8127 /// a sequence of instructions ending in a call. The outlined function is
8128 /// called with a BL instruction, and the outlined function tail-calls the
8129 /// original call destination.
8130 ///
8131 /// That is,
8132 ///
8133 /// I1 OUTLINED_FUNCTION:
8134 /// I2 --> BL OUTLINED_FUNCTION I1
8135 /// BL f I2
8136 /// B f
8137 /// * Call construction overhead: 1 (BL)
8138 /// * Frame construction overhead: 0
8139 /// * Requires stack fixups? No
8140 ///
8141 /// \p MachineOutlinerRegSave implies that the function should be called with a
8142 /// save and restore of LR to an available register. This allows us to avoid
8143 /// stack fixups. Note that this outlining variant is compatible with the
8144 /// NoLRSave case.
8145 ///
8146 /// That is,
8147 ///
8148 /// I1 Save LR OUTLINED_FUNCTION:
8149 /// I2 --> BL OUTLINED_FUNCTION I1
8150 /// I3 Restore LR I2
8151 /// I3
8152 /// RET
8153 ///
8154 /// * Call construction overhead: 3 (save + BL + restore)
8155 /// * Frame construction overhead: 1 (ret)
8156 /// * Requires stack fixups? No
8157 enum MachineOutlinerClass {
8158 MachineOutlinerDefault, /// Emit a save, restore, call, and return.
8159 MachineOutlinerTailCall, /// Only emit a branch.
8160 MachineOutlinerNoLRSave, /// Emit a call and return.
8161 MachineOutlinerThunk, /// Emit a call and tail-call.
8162 MachineOutlinerRegSave /// Same as default, but save to a register.
8163 };
8164
8165 enum MachineOutlinerMBBFlags {
8166 LRUnavailableSomewhere = 0x2,
8167 HasCalls = 0x4,
8168 UnsafeRegsDead = 0x8
8169 };
8170
8171 Register
8172 AArch64InstrInfo::findRegisterToSaveLRTo(outliner::Candidate &C) const {
8173 MachineFunction *MF = C.getMF();
8174 const TargetRegisterInfo &TRI = *MF->getSubtarget().getRegisterInfo();
8175 const AArch64RegisterInfo *ARI =
8176 static_cast<const AArch64RegisterInfo *>(&TRI);
8177 // Check if there is an available register across the sequence that we can
8178 // use.
8179 for (unsigned Reg : AArch64::GPR64RegClass) {
8180 if (!ARI->isReservedReg(*MF, Reg) &&
8181 Reg != AArch64::LR && // LR is not reserved, but don't use it.
8182 Reg != AArch64::X16 && // X16 is not guaranteed to be preserved.
8183 Reg != AArch64::X17 && // Ditto for X17.
8184 C.isAvailableAcrossAndOutOfSeq(Reg, TRI) &&
8185 C.isAvailableInsideSeq(Reg, TRI))
8186 return Reg;
8187 }
8188 return Register();
8189 }
8190
8191 static bool
8192 outliningCandidatesSigningScopeConsensus(const outliner::Candidate &a,
8193 const outliner::Candidate &b) {
8194 const auto &MFIa = a.getMF()->getInfo<AArch64FunctionInfo>();
8195 const auto &MFIb = b.getMF()->getInfo<AArch64FunctionInfo>();
8196
8197 return MFIa->shouldSignReturnAddress(false) == MFIb->shouldSignReturnAddress(false) &&
8198 MFIa->shouldSignReturnAddress(true) == MFIb->shouldSignReturnAddress(true);
8199 }
8200
8201 static bool
8202 outliningCandidatesSigningKeyConsensus(const outliner::Candidate &a,
8203 const outliner::Candidate &b) {
8204 const auto &MFIa = a.getMF()->getInfo<AArch64FunctionInfo>();
8205 const auto &MFIb = b.getMF()->getInfo<AArch64FunctionInfo>();
8206
8207 return MFIa->shouldSignWithBKey() == MFIb->shouldSignWithBKey();
8208 }
8209
8210 static bool outliningCandidatesV8_3OpsConsensus(const outliner::Candidate &a,
8211 const outliner::Candidate &b) {
8212 const AArch64Subtarget &SubtargetA =
8213 a.getMF()->getSubtarget<AArch64Subtarget>();
8214 const AArch64Subtarget &SubtargetB =
8215 b.getMF()->getSubtarget<AArch64Subtarget>();
8216 return SubtargetA.hasV8_3aOps() == SubtargetB.hasV8_3aOps();
8217 }
8218
8219 std::optional<outliner::OutlinedFunction>
8220 AArch64InstrInfo::getOutliningCandidateInfo(
8221 std::vector<outliner::Candidate> &RepeatedSequenceLocs) const {
8222 outliner::Candidate &FirstCand = RepeatedSequenceLocs[0];
8223
8224 unsigned SequenceSize = 0;
8225 for (auto &MI : FirstCand)
8226 SequenceSize += getInstSizeInBytes(MI);
8227
8228 unsigned NumBytesToCreateFrame = 0;
8229
8230 // We only allow outlining for functions having exactly matching return
8231 // address signing attributes, i.e., all share the same value for the
8232 // attribute "sign-return-address" and all share the same type of key they
8233 // are signed with.
8234 // Additionally, we require all functions to simultaneously either support
8235 // v8.3a features or not. Otherwise an outlined function could get signed
8236 // using dedicated v8.3 instructions and a call from a function that doesn't
8237 // support v8.3 instructions would therefore be invalid.
8238 if (std::adjacent_find(
8239 RepeatedSequenceLocs.begin(), RepeatedSequenceLocs.end(),
8240 [](const outliner::Candidate &a, const outliner::Candidate &b) {
8241 // Return true if a and b are non-equal w.r.t. return address
8242 // signing or support of v8.3a features
8243 if (outliningCandidatesSigningScopeConsensus(a, b) &&
8244 outliningCandidatesSigningKeyConsensus(a, b) &&
8245 outliningCandidatesV8_3OpsConsensus(a, b)) {
8246 return false;
8247 }
8248 return true;
8249 }) != RepeatedSequenceLocs.end()) {
8250 return std::nullopt;
8251 }
8252
8253 // Since at this point all candidates agree on their return address signing,
8254 // picking just one is fine. If the candidate functions potentially sign their
8255 // return addresses, the outlined function should do the same. Note that in
8256 // the case of "sign-return-address"="non-leaf" this is an assumption: It is
8257 // not certainly true that the outlined function will have to sign its return
8258 // address but this decision is made later, when the decision to outline
8259 // has already been made.
8260 // The same holds for the number of additional instructions we need: On
8261 // v8.3a RET can be replaced by RETAA/RETAB and no AUT instruction is
8262 // necessary. However, at this point we don't know if the outlined function
8263 // will have a RET instruction so we assume the worst.
8264 const TargetRegisterInfo &TRI = getRegisterInfo();
8265 // Performing a tail call may require extra checks when PAuth is enabled.
8266 // If PAuth is disabled, set it to zero for uniformity.
8267 unsigned NumBytesToCheckLRInTCEpilogue = 0;
8268 if (FirstCand.getMF()
8269 ->getInfo<AArch64FunctionInfo>()
8270 ->shouldSignReturnAddress(true)) {
8271 // One PAC and one AUT instruction.
8272 NumBytesToCreateFrame += 8;
8273
8274 // PAuth is enabled - set extra tail call cost, if any.
8275 auto LRCheckMethod = Subtarget.getAuthenticatedLRCheckMethod();
8276 NumBytesToCheckLRInTCEpilogue =
8277 AArch64PAuth::getCheckerSizeInBytes(LRCheckMethod);
8278 // Checking the authenticated LR value may significantly impact
8279 // SequenceSize, so account for it for more precise results.
8280 if (isTailCallReturnInst(RepeatedSequenceLocs[0].back()))
8281 SequenceSize += NumBytesToCheckLRInTCEpilogue;
8282
8283 // We have to check if SP-modifying instructions would get outlined.
8284 // If so, we only allow outlining if SP is unchanged overall: matching
8285 // sub and add instructions are okay to outline, but all other SP
8286 // modifications are not.
8287 auto hasIllegalSPModification = [&TRI](outliner::Candidate &C) {
8288 int SPValue = 0;
8289 for (auto &MI : C) {
8290 if (MI.modifiesRegister(AArch64::SP, &TRI)) {
8291 switch (MI.getOpcode()) {
8292 case AArch64::ADDXri:
8293 case AArch64::ADDWri:
8294 assert(MI.getNumOperands() == 4 && "Wrong number of operands");
8295 assert(MI.getOperand(2).isImm() &&
8296 "Expected operand to be immediate");
8297 assert(MI.getOperand(1).isReg() &&
8298 "Expected operand to be a register");
8299 // Check if the add just increments sp. If so, we search for
8300 // matching sub instructions that decrement sp. If not, the
8301 // modification is illegal.
8302 if (MI.getOperand(1).getReg() == AArch64::SP)
8303 SPValue += MI.getOperand(2).getImm();
8304 else
8305 return true;
8306 break;
8307 case AArch64::SUBXri:
8308 case AArch64::SUBWri:
8309 assert(MI.getNumOperands() == 4 && "Wrong number of operands");
8310 assert(MI.getOperand(2).isImm() &&
8311 "Expected operand to be immediate");
8312 assert(MI.getOperand(1).isReg() &&
8313 "Expected operand to be a register");
8314 // Check if the sub just decrements sp. If so, we search for
8315 // matching add instructions that increment sp. If not, the
8316 // modification is illegal.
8317 if (MI.getOperand(1).getReg() == AArch64::SP)
8318 SPValue -= MI.getOperand(2).getImm();
8319 else
8320 return true;
8321 break;
8322 default:
8323 return true;
8324 }
8325 }
8326 }
8327 if (SPValue)
8328 return true;
8329 return false;
8330 };
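// E.g. a candidate containing "sub sp, sp, #16 ... add sp, sp, #16" nets
// SPValue == 0 and remains outlinable, while an unmatched "sub sp, sp, #16"
// leaves SPValue != 0 and disqualifies it.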
8331 // Remove candidates with illegal stack modifying instructions
8332 llvm::erase_if(RepeatedSequenceLocs, hasIllegalSPModification);
8333
8334 // If the sequence doesn't have enough candidates left, then we're done.
8335 if (RepeatedSequenceLocs.size() < 2)
8336 return std::nullopt;
8337 }
8338
8339 // Properties about candidate MBBs that hold for all of them.
8340 unsigned FlagsSetInAll = 0xF;
8341
8342 // Compute liveness information for each candidate, and set FlagsSetInAll.
8343 for (outliner::Candidate &C : RepeatedSequenceLocs)
8344 FlagsSetInAll &= C.Flags;
8345
8346 unsigned LastInstrOpcode = RepeatedSequenceLocs[0].back().getOpcode();
8347
8348 // Helper lambda which sets call information for every candidate.
8349 auto SetCandidateCallInfo =
8350 [&RepeatedSequenceLocs](unsigned CallID, unsigned NumBytesForCall) {
8351 for (outliner::Candidate &C : RepeatedSequenceLocs)
8352 C.setCallInfo(CallID, NumBytesForCall);
8353 };
8354
8355 unsigned FrameID = MachineOutlinerDefault;
8356 NumBytesToCreateFrame += 4;
8357
8358 bool HasBTI = any_of(RepeatedSequenceLocs, [](outliner::Candidate &C) {
8359 return C.getMF()->getInfo<AArch64FunctionInfo>()->branchTargetEnforcement();
8360 });
8361
8362 // We check to see if CFI Instructions are present, and if they are
8363 // we find the number of CFI Instructions in the candidates.
8364 unsigned CFICount = 0;
8365 for (auto &I : RepeatedSequenceLocs[0]) {
8366 if (I.isCFIInstruction())
8367 CFICount++;
8368 }
8369
8370 // We compare the number of found CFI Instructions to the number of CFI
8371 // instructions in the parent function for each candidate. We must check this
8372 // since if we outline one of the CFI instructions in a function, we have to
8373 // outline them all for correctness. If we do not, the address offsets will be
8374 // incorrect between the two sections of the program.
8375 for (outliner::Candidate &C : RepeatedSequenceLocs) {
8376 std::vector<MCCFIInstruction> CFIInstructions =
8377 C.getMF()->getFrameInstructions();
8378
8379 if (CFICount > 0 && CFICount != CFIInstructions.size())
8380 return std::nullopt;
8381 }
8382
8383 // Returns true if an instruction is safe to fix up, false otherwise.
8384 auto IsSafeToFixup = [this, &TRI](MachineInstr &MI) {
8385 if (MI.isCall())
8386 return true;
8387
8388 if (!MI.modifiesRegister(AArch64::SP, &TRI) &&
8389 !MI.readsRegister(AArch64::SP, &TRI))
8390 return true;
8391
8392 // Any modification of SP will break our code to save/restore LR.
8393 // FIXME: We could handle some instructions which add a constant
8394 // offset to SP, with a bit more work.
8395 if (MI.modifiesRegister(AArch64::SP, &TRI))
8396 return false;
8397
8398 // At this point, we have a stack instruction that we might need to
8399 // fix up. We'll handle it if it's a load or store.
8400 if (MI.mayLoadOrStore()) {
8401 const MachineOperand *Base; // Filled with the base operand of MI.
8402 int64_t Offset; // Filled with the offset of MI.
8403 bool OffsetIsScalable;
8404
8405 // Does it allow us to offset the base operand and is the base the
8406 // register SP?
8407 if (!getMemOperandWithOffset(MI, Base, Offset, OffsetIsScalable, &TRI) ||
8408 !Base->isReg() || Base->getReg() != AArch64::SP)
8409 return false;
8410
8411 // Fix-up code below assumes byte offsets.
8412 if (OffsetIsScalable)
8413 return false;
8414
8415 // Find the minimum/maximum offset for this instruction and check
8416 // if fixing it up would be in range.
8417 int64_t MinOffset,
8418 MaxOffset; // Unscaled offsets for the instruction.
8419 // The scale to multiply the offsets by.
8420 TypeSize Scale(0U, false), DummyWidth(0U, false);
8421 getMemOpInfo(MI.getOpcode(), Scale, DummyWidth, MinOffset, MaxOffset);
8422
8423 Offset += 16; // Update the offset to what it would be if we outlined.
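// E.g. an LDRXui from [sp, #Offset] (Scale == 8, offsets 0..4095 * 8) can
// only be outlined while Offset + 16 still fits that range.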
8424 if (Offset < MinOffset * (int64_t)Scale.getFixedValue() ||
8425 Offset > MaxOffset * (int64_t)Scale.getFixedValue())
8426 return false;
8427
8428 // It's in range, so we can outline it.
8429 return true;
8430 }
8431
8432 // FIXME: Add handling for instructions like "add x0, sp, #8".
8433
8434 // We can't fix it up, so don't outline it.
8435 return false;
8436 };
8437
8438 // True if it's possible to fix up each stack instruction in this sequence.
8439 // Important for frames/call variants that modify the stack.
8440 bool AllStackInstrsSafe = llvm::all_of(FirstCand, IsSafeToFixup);
8441
8442 // If the last instruction in any candidate is a terminator, then we should
8443 // tail call all of the candidates.
8444 if (RepeatedSequenceLocs[0].back().isTerminator()) {
8445 FrameID = MachineOutlinerTailCall;
8446 NumBytesToCreateFrame = 0;
8447 unsigned NumBytesForCall = 4 + NumBytesToCheckLRInTCEpilogue;
8448 SetCandidateCallInfo(MachineOutlinerTailCall, NumBytesForCall);
8449 }
8450
8451 else if (LastInstrOpcode == AArch64::BL ||
8452 ((LastInstrOpcode == AArch64::BLR ||
8453 LastInstrOpcode == AArch64::BLRNoIP) &&
8454 !HasBTI)) {
8455 // FIXME: Do we need to check if the code after this uses the value of LR?
8456 FrameID = MachineOutlinerThunk;
8457 NumBytesToCreateFrame = NumBytesToCheckLRInTCEpilogue;
8458 SetCandidateCallInfo(MachineOutlinerThunk, 4);
8459 }
8460
8461 else {
8462 // We need to decide how to emit calls + frames. We can always emit the same
8463 // frame if we don't need to save to the stack. If we have to save to the
8464 // stack, then we need a different frame.
8465 unsigned NumBytesNoStackCalls = 0;
8466 std::vector<outliner::Candidate> CandidatesWithoutStackFixups;
8467
8468 // Check if we have to save LR.
8469 for (outliner::Candidate &C : RepeatedSequenceLocs) {
8470 bool LRAvailable =
8471 (C.Flags & MachineOutlinerMBBFlags::LRUnavailableSomewhere)
8472 ? C.isAvailableAcrossAndOutOfSeq(AArch64::LR, TRI)
8473 : true;
8474 // If we have a noreturn caller, then we're going to be conservative and
8475 // say that we have to save LR. If we don't have a ret at the end of the
8476 // block, then we can't reason about liveness accurately.
8477 //
8478 // FIXME: We can probably do better than always disabling this in
8479 // noreturn functions by fixing up the liveness info.
8480 bool IsNoReturn =
8481 C.getMF()->getFunction().hasFnAttribute(Attribute::NoReturn);
8482
8483 // Is LR available? If so, we don't need a save.
8484 if (LRAvailable && !IsNoReturn) {
8485 NumBytesNoStackCalls += 4;
8486 C.setCallInfo(MachineOutlinerNoLRSave, 4);
8487 CandidatesWithoutStackFixups.push_back(C);
8488 }
8489
8490 // Is an unused register available? If so, we won't modify the stack, so
8491 // we can outline with the same frame type as those that don't save LR.
8492 else if (findRegisterToSaveLRTo(C)) {
8493 NumBytesNoStackCalls += 12;
8494 C.setCallInfo(MachineOutlinerRegSave, 12);
8495 CandidatesWithoutStackFixups.push_back(C);
8496 }
8497
8498 // Is SP used in the sequence at all? If not, we don't have to modify
8499 // the stack, so we are guaranteed to get the same frame.
8500 else if (C.isAvailableInsideSeq(AArch64::SP, TRI)) {
8501 NumBytesNoStackCalls += 12;
8502 C.setCallInfo(MachineOutlinerDefault, 12);
8503 CandidatesWithoutStackFixups.push_back(C);
8504 }
8505
8506 // If we outline this, we need to modify the stack. Pretend we don't
8507 // outline this by saving all of its bytes.
8508 else {
8509 NumBytesNoStackCalls += SequenceSize;
8510 }
8511 }
8512
8513 // If there are no places where we have to save LR, then note that we
8514 // don't have to update the stack. Otherwise, give every candidate the
8515 // default call type, as long as it's safe to do so.
8516 if (!AllStackInstrsSafe ||
8517 NumBytesNoStackCalls <= RepeatedSequenceLocs.size() * 12) {
8518 RepeatedSequenceLocs = CandidatesWithoutStackFixups;
8519 FrameID = MachineOutlinerNoLRSave;
8520 } else {
8521 SetCandidateCallInfo(MachineOutlinerDefault, 12);
8522
8523 // Bugzilla ID: 46767
8524 // TODO: Check if fixing up the stack more than once is safe so we can
8525 // outline these.
8526 //
8527 // An outline resulting in a caller that requires stack fixups at the
8528 // callsite to a callee that also requires stack fixups can happen when
8529 // there are no available registers at the candidate callsite for a
8530 // candidate that itself also has calls.
8531 //
8532 // In other words, if function_containing_sequence in the following pseudo
8533 // assembly requires that we save LR at the point of the call, but there
8534 // are no available registers, we save using SP, and as a
8535 // result the SP offsets require stack fixups by multiples of 16.
8536 //
8537 // function_containing_sequence:
8538 // ...
8539 // save LR to SP <- Requires stack instr fixups in OUTLINED_FUNCTION_N
8540 // call OUTLINED_FUNCTION_N
8541 // restore LR from SP
8542 // ...
8543 //
8544 // OUTLINED_FUNCTION_N:
8545 // save LR to SP <- Requires stack instr fixups in OUTLINED_FUNCTION_N
8546 // ...
8547 // bl foo
8548 // restore LR from SP
8549 // ret
8550 //
8551 // Because the code to handle more than one stack fixup does not
8552 // currently have the proper checks for legality, these cases will assert
8553 // in the AArch64 MachineOutliner. This is because the code to do this
8554 // needs more hardening, testing, better checks that generated code is
8555 // legal, etc., and because it is only verified to handle a single pass of
8556 // stack fixup.
8557 //
8558 // The assert happens in AArch64InstrInfo::buildOutlinedFrame to catch
8559 // these cases until they are known to be handled. Bugzilla 46767 is
8560 // referenced in comments at the assert site.
8561 //
8562 // To avoid asserting (or generating non-legal code on noassert builds)
8563 // we remove all candidates which would need more than one stack fixup by
8564 // pruning the cases where the candidate has calls while also having no
8565 // available LR and having no available general purpose registers to copy
8566 // LR to (i.e., one extra stack save/restore).
8567 //
8568 if (FlagsSetInAll & MachineOutlinerMBBFlags::HasCalls) {
8569 erase_if(RepeatedSequenceLocs, [this, &TRI](outliner::Candidate &C) {
8570 auto IsCall = [](const MachineInstr &MI) { return MI.isCall(); };
8571 return (llvm::any_of(C, IsCall)) &&
8572 (!C.isAvailableAcrossAndOutOfSeq(AArch64::LR, TRI) ||
8573 !findRegisterToSaveLRTo(C));
8574 });
8575 }
8576 }
8577
8578 // If we dropped all of the candidates, bail out here.
8579 if (RepeatedSequenceLocs.size() < 2) {
8580 RepeatedSequenceLocs.clear();
8581 return std::nullopt;
8582 }
8583 }
8584
8585 // Does every candidate's MBB contain a call? If so, then we might have a call
8586 // in the range.
8587 if (FlagsSetInAll & MachineOutlinerMBBFlags::HasCalls) {
8588 // Check if the range contains a call. These require a save + restore of the
8589 // link register.
8590 bool ModStackToSaveLR = false;
8591 if (std::any_of(FirstCand.begin(), std::prev(FirstCand.end()),
8592 [](const MachineInstr &MI) { return MI.isCall(); }))
8593 ModStackToSaveLR = true;
8594
8595 // Handle the last instruction separately. If this is a tail call, then the
8596 // last instruction is a call. We don't want to save + restore in this case.
8597 // However, it could be possible that the last instruction is a call without
8598 // it being valid to tail call this sequence. We should consider this as
8599 // well.
8600 else if (FrameID != MachineOutlinerThunk &&
8601 FrameID != MachineOutlinerTailCall && FirstCand.back().isCall())
8602 ModStackToSaveLR = true;
8603
8604 if (ModStackToSaveLR) {
8605 // We can't fix up the stack. Bail out.
8606 if (!AllStackInstrsSafe) {
8607 RepeatedSequenceLocs.clear();
8608 return std::nullopt;
8609 }
8610
8611 // Save + restore LR.
8612 NumBytesToCreateFrame += 8;
8613 }
8614 }
8615
8616 // If we have CFI instructions, we can only outline if the outlined section
8617 // can be a tail call.
8618 if (FrameID != MachineOutlinerTailCall && CFICount > 0)
8619 return std::nullopt;
8620
8621 return outliner::OutlinedFunction(RepeatedSequenceLocs, SequenceSize,
8622 NumBytesToCreateFrame, FrameID);
8623 }
8624
8625 void AArch64InstrInfo::mergeOutliningCandidateAttributes(
8626 Function &F, std::vector<outliner::Candidate> &Candidates) const {
8627 // If a bunch of candidates reach this point they must agree on their return
8628 // address signing. It is therefore enough to just consider the signing
8629 // behaviour of one of them
8630 const auto &CFn = Candidates.front().getMF()->getFunction();
8631
8632 // Since all candidates belong to the same module, just copy the
8633 // function-level attributes of an arbitrary function.
8634 if (CFn.hasFnAttribute("sign-return-address"))
8635 F.addFnAttr(CFn.getFnAttribute("sign-return-address"));
8636 if (CFn.hasFnAttribute("sign-return-address-key"))
8637 F.addFnAttr(CFn.getFnAttribute("sign-return-address-key"));
8638
8639 AArch64GenInstrInfo::mergeOutliningCandidateAttributes(F, Candidates);
8640 }
8641
8642 bool AArch64InstrInfo::isFunctionSafeToOutlineFrom(
8643 MachineFunction &MF, bool OutlineFromLinkOnceODRs) const {
8644 const Function &F = MF.getFunction();
8645
8646 // Can F be deduplicated by the linker? If it can, don't outline from it.
8647 if (!OutlineFromLinkOnceODRs && F.hasLinkOnceODRLinkage())
8648 return false;
8649
8650 // Don't outline from functions with section markings; the program could
8651 // expect that all the code is in the named section.
8652 // FIXME: Allow outlining from multiple functions with the same section
8653 // marking.
8654 if (F.hasSection())
8655 return false;
8656
8657 // Outlining from functions with redzones is unsafe since the outliner may
8658 // modify the stack. Check if hasRedZone is true or unknown; if yes, don't
8659 // outline from it.
8660 AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
8661 if (!AFI || AFI->hasRedZone().value_or(true))
8662 return false;
8663
8664 // FIXME: Teach the outliner to generate/handle Windows unwind info.
8665 if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI())
8666 return false;
8667
8668 // It's safe to outline from MF.
8669 return true;
8670 }
8671
8672 SmallVector<std::pair<MachineBasicBlock::iterator, MachineBasicBlock::iterator>>
8673 AArch64InstrInfo::getOutlinableRanges(MachineBasicBlock &MBB,
8674 unsigned &Flags) const {
8675 assert(MBB.getParent()->getRegInfo().tracksLiveness() &&
8676 "Must track liveness!");
8677 SmallVector<
8678 std::pair<MachineBasicBlock::iterator, MachineBasicBlock::iterator>>
8679 Ranges;
8680 // According to the AArch64 Procedure Call Standard, the following are
8681 // undefined on entry/exit from a function call:
8682 //
8683 // * Registers x16, x17, (and thus w16, w17)
8684 // * Condition codes (and thus the NZCV register)
8685 //
8686 // If any of these registers are used inside or live across an outlined
8687 // function, then they may be modified later, either by the compiler or
8688 // some other tool (like the linker).
8689 //
8690 // To avoid outlining in these situations, partition each block into ranges
8691 // where these registers are dead. We will only outline from those ranges.
8692 LiveRegUnits LRU(getRegisterInfo());
8693 auto AreAllUnsafeRegsDead = [&LRU]() {
8694 return LRU.available(AArch64::W16) && LRU.available(AArch64::W17) &&
8695 LRU.available(AArch64::NZCV);
8696 };
8697
8698 // We need to know if LR is live across an outlining boundary later on in
8699 // order to decide how we'll create the outlined call, frame, etc.
8700 //
8701 // It's pretty expensive to check this for *every candidate* within a block.
8702 // That's some potentially n^2 behaviour, since in the worst case, we'd need
8703 // to compute liveness from the end of the block for O(n) candidates within
8704 // the block.
8705 //
8706 // So, to improve the average case, let's keep track of liveness from the end
8707 // of the block to the beginning of *every outlinable range*. If we know that
8708 // LR is available in every range we could outline from, then we know that
8709 // we don't need to check liveness for any candidate within that range.
8710 bool LRAvailableEverywhere = true;
8711 // Compute liveness bottom-up.
8712 LRU.addLiveOuts(MBB);
8713 // Update flags that require info about the entire MBB.
8714 auto UpdateWholeMBBFlags = [&Flags](const MachineInstr &MI) {
8715 if (MI.isCall() && !MI.isTerminator())
8716 Flags |= MachineOutlinerMBBFlags::HasCalls;
8717 };
8718 // Range: [RangeBegin, RangeEnd)
8719 MachineBasicBlock::instr_iterator RangeBegin, RangeEnd;
8720 unsigned RangeLen;
8721 auto CreateNewRangeStartingAt =
8722 [&RangeBegin, &RangeEnd,
8723 &RangeLen](MachineBasicBlock::instr_iterator NewBegin) {
8724 RangeBegin = NewBegin;
8725 RangeEnd = std::next(RangeBegin);
8726 RangeLen = 0;
8727 };
8728 auto SaveRangeIfNonEmpty = [&RangeLen, &Ranges, &RangeBegin, &RangeEnd]() {
8729 // At least one unsafe register is not dead. We do not want to outline at
8730 // this point. If it is long enough to outline from, save the range
8731 // [RangeBegin, RangeEnd).
8732 if (RangeLen > 1)
8733 Ranges.push_back(std::make_pair(RangeBegin, RangeEnd));
8734 };
8735 // Find the first point where all unsafe registers are dead.
8736 // FIND: <safe instr> <-- end of first potential range
8737 // SKIP: <unsafe def>
8738 // SKIP: ... everything between ...
8739 // SKIP: <unsafe use>
8740 auto FirstPossibleEndPt = MBB.instr_rbegin();
8741 for (; FirstPossibleEndPt != MBB.instr_rend(); ++FirstPossibleEndPt) {
8742 LRU.stepBackward(*FirstPossibleEndPt);
8743 // Update flags that impact how we outline across the entire block,
8744 // regardless of safety.
8745 UpdateWholeMBBFlags(*FirstPossibleEndPt);
8746 if (AreAllUnsafeRegsDead())
8747 break;
8748 }
8749 // If we exhausted the entire block, we have no safe ranges to outline.
8750 if (FirstPossibleEndPt == MBB.instr_rend())
8751 return Ranges;
8752 // Current range.
8753 CreateNewRangeStartingAt(FirstPossibleEndPt->getIterator());
8754 // FirstPossibleEndPt points to the first place where all unsafe registers
8755 // are dead (if there is any such point). Begin partitioning the MBB into
8756 // ranges.
8757 for (auto &MI : make_range(FirstPossibleEndPt, MBB.instr_rend())) {
8758 LRU.stepBackward(MI);
8759 UpdateWholeMBBFlags(MI);
8760 if (!AreAllUnsafeRegsDead()) {
8761 SaveRangeIfNonEmpty();
8762 CreateNewRangeStartingAt(MI.getIterator());
8763 continue;
8764 }
8765 LRAvailableEverywhere &= LRU.available(AArch64::LR);
8766 RangeBegin = MI.getIterator();
8767 ++RangeLen;
8768 }
8769 // The loop above misses the last (or only) range. If we are still safe, then
8770 // let's save the range.
8771 if (AreAllUnsafeRegsDead())
8772 SaveRangeIfNonEmpty();
8773 if (Ranges.empty())
8774 return Ranges;
8775 // We found the ranges bottom-up, but the mapping expects them top-down,
8776 // so reverse the order.
8777 std::reverse(Ranges.begin(), Ranges.end());
8778 // If there is at least one outlinable range where LR is unavailable
8779 // somewhere, remember that.
8780 if (!LRAvailableEverywhere)
8781 Flags |= MachineOutlinerMBBFlags::LRUnavailableSomewhere;
8782 return Ranges;
8783 }
8784
8785 outliner::InstrType
8786 AArch64InstrInfo::getOutliningTypeImpl(MachineBasicBlock::iterator &MIT,
8787 unsigned Flags) const {
8788 MachineInstr &MI = *MIT;
8789 MachineBasicBlock *MBB = MI.getParent();
8790 MachineFunction *MF = MBB->getParent();
8791 AArch64FunctionInfo *FuncInfo = MF->getInfo<AArch64FunctionInfo>();
8792
8793 // Don't outline anything used for return address signing. The outlined
8794 // function will get signed later if needed.
8795 switch (MI.getOpcode()) {
8796 case AArch64::PACM:
8797 case AArch64::PACIASP:
8798 case AArch64::PACIBSP:
8799 case AArch64::PACIASPPC:
8800 case AArch64::PACIBSPPC:
8801 case AArch64::AUTIASP:
8802 case AArch64::AUTIBSP:
8803 case AArch64::AUTIASPPCi:
8804 case AArch64::AUTIASPPCr:
8805 case AArch64::AUTIBSPPCi:
8806 case AArch64::AUTIBSPPCr:
8807 case AArch64::RETAA:
8808 case AArch64::RETAB:
8809 case AArch64::RETAASPPCi:
8810 case AArch64::RETAASPPCr:
8811 case AArch64::RETABSPPCi:
8812 case AArch64::RETABSPPCr:
8813 case AArch64::EMITBKEY:
8814 case AArch64::PAUTH_PROLOGUE:
8815 case AArch64::PAUTH_EPILOGUE:
8816 return outliner::InstrType::Illegal;
8817 }
8818
8819 // Don't outline LOHs.
8820 if (FuncInfo->getLOHRelated().count(&MI))
8821 return outliner::InstrType::Illegal;
8822
8823 // We can only outline these if we will tail call the outlined function, or
8824 // fix up the CFI offsets. Currently, CFI instructions are outlined only
8825 // when they are part of a tail call.
8826 //
8827 // FIXME: If the proper fixups for the offset are implemented, this should be
8828 // possible.
8829 if (MI.isCFIInstruction())
8830 return outliner::InstrType::Legal;
8831
8832 // Is this a terminator for a basic block?
8833 if (MI.isTerminator())
8834 // TargetInstrInfo::getOutliningType has already filtered out anything
8835 // that would break this, so we can allow it here.
8836 return outliner::InstrType::Legal;
8837
8838 // Make sure none of the operands are un-outlinable.
8839 for (const MachineOperand &MOP : MI.operands()) {
8840 // A check preventing CFI indices was here before, but only CFI
8841 // instructions should have those.
8842 assert(!MOP.isCFIIndex());
8843
8844 // If it uses LR or W30 explicitly, then don't touch it.
8845 if (MOP.isReg() && !MOP.isImplicit() &&
8846 (MOP.getReg() == AArch64::LR || MOP.getReg() == AArch64::W30))
8847 return outliner::InstrType::Illegal;
8848 }
8849
8850 // Special cases for instructions that can always be outlined, but will fail
8851 // the later tests, e.g. ADRPs, which are PC-relative and use LR, but can
8852 // always be outlined because they don't require a *specific* value in LR.
8853 if (MI.getOpcode() == AArch64::ADRP)
8854 return outliner::InstrType::Legal;
8855
8856 // If MI is a call we might be able to outline it. We don't want to outline
8857 // any calls that rely on the position of items on the stack. When we outline
8858 // something containing a call, we have to emit a save and restore of LR in
8859 // the outlined function. Currently, this always happens by saving LR to the
8860 // stack. Thus, if we outline, say, half the parameters for a function call
8861 // plus the call, then we'll break the callee's expectations for the layout
8862 // of the stack.
8863 //
8864 // FIXME: Allow calls to functions which construct a stack frame, as long
8865 // as they don't access arguments on the stack.
8866 // FIXME: Figure out some way to analyze functions defined in other modules.
8867 // We should be able to compute the memory usage based on the IR calling
8868 // convention, even if we can't see the definition.
8869 if (MI.isCall()) {
8870 // Get the function associated with the call. Look at each operand and find
8871 // the one that represents the callee and get its name.
8872 const Function *Callee = nullptr;
8873 for (const MachineOperand &MOP : MI.operands()) {
8874 if (MOP.isGlobal()) {
8875 Callee = dyn_cast<Function>(MOP.getGlobal());
8876 break;
8877 }
8878 }
8879
8880 // Never outline calls to mcount. There isn't any rule that would require
8881 // this, but the Linux kernel's "ftrace" feature depends on it.
8882 if (Callee && Callee->getName() == "\01_mcount")
8883 return outliner::InstrType::Illegal;
8884
8885 // If we don't know anything about the callee, assume it depends on the
8886 // stack layout of the caller. In that case, it's only legal to outline
8887 // as a tail-call. Explicitly list the call instructions we know about so we
8888 // don't get unexpected results with call pseudo-instructions.
8889 auto UnknownCallOutlineType = outliner::InstrType::Illegal;
8890 if (MI.getOpcode() == AArch64::BLR ||
8891 MI.getOpcode() == AArch64::BLRNoIP || MI.getOpcode() == AArch64::BL)
8892 UnknownCallOutlineType = outliner::InstrType::LegalTerminator;
8893
8894 if (!Callee)
8895 return UnknownCallOutlineType;
8896
8897 // We have a function we have information about. Check if it's something
8898 // we can safely outline.
8899 MachineFunction *CalleeMF = MF->getMMI().getMachineFunction(*Callee);
8900
8901 // We don't know what's going on with the callee at all. Don't touch it.
8902 if (!CalleeMF)
8903 return UnknownCallOutlineType;
8904
8905 // Check if we know anything about the callee saves on the function. If we
8906 // don't, then don't touch it, since that implies that we haven't
8907 // computed anything about its stack frame yet.
8908 MachineFrameInfo &MFI = CalleeMF->getFrameInfo();
8909 if (!MFI.isCalleeSavedInfoValid() || MFI.getStackSize() > 0 ||
8910 MFI.getNumObjects() > 0)
8911 return UnknownCallOutlineType;
8912
8913 // At this point, we can say that CalleeMF ought to not pass anything on the
8914 // stack. Therefore, we can outline it.
8915 return outliner::InstrType::Legal;
8916 }
8917
8918 // Don't touch the link register or W30.
8919 if (MI.readsRegister(AArch64::W30, &getRegisterInfo()) ||
8920 MI.modifiesRegister(AArch64::W30, &getRegisterInfo()))
8921 return outliner::InstrType::Illegal;
8922
8923 // Don't outline BTI instructions, because that will prevent the outlining
8924 // site from being indirectly callable.
8925 if (hasBTISemantics(MI))
8926 return outliner::InstrType::Illegal;
8927
8928 return outliner::InstrType::Legal;
8929 }
8930
8931 void AArch64InstrInfo::fixupPostOutline(MachineBasicBlock &MBB) const {
8932 for (MachineInstr &MI : MBB) {
8933 const MachineOperand *Base;
8934 TypeSize Width(0, false);
8935 int64_t Offset;
8936 bool OffsetIsScalable;
8937
8938 // Is this a load or store with an immediate offset with SP as the base?
8939 if (!MI.mayLoadOrStore() ||
8940 !getMemOperandWithOffsetWidth(MI, Base, Offset, OffsetIsScalable, Width,
8941 &RI) ||
8942 (Base->isReg() && Base->getReg() != AArch64::SP))
8943 continue;
8944
8945 // It is, so we have to fix it up.
8946 TypeSize Scale(0U, false);
8947 int64_t Dummy1, Dummy2;
8948
8949 MachineOperand &StackOffsetOperand = getMemOpBaseRegImmOfsOffsetOperand(MI);
8950 assert(StackOffsetOperand.isImm() && "Stack offset wasn't immediate!");
8951 getMemOpInfo(MI.getOpcode(), Scale, Width, Dummy1, Dummy2);
8952 assert(Scale != 0 && "Unexpected opcode!");
8953 assert(!OffsetIsScalable && "Expected offset to be a byte offset");
8954
8955 // We've pushed the return address to the stack, so add 16 to the offset.
8956 // This is safe, since we already checked if it would overflow when we
8957 // checked if this instruction was legal to outline.
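// E.g. with Scale == 8, "ldr x0, [sp, #8]" has Offset == 8 and gets
// NewImm == 3, i.e. "ldr x0, [sp, #24]".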
8958 int64_t NewImm = (Offset + 16) / (int64_t)Scale.getFixedValue();
8959 StackOffsetOperand.setImm(NewImm);
8960 }
8961 }
8962
8963 static void signOutlinedFunction(MachineFunction &MF, MachineBasicBlock &MBB,
8964 const AArch64InstrInfo *TII,
8965 bool ShouldSignReturnAddr) {
8966 if (!ShouldSignReturnAddr)
8967 return;
8968
8969 BuildMI(MBB, MBB.begin(), DebugLoc(), TII->get(AArch64::PAUTH_PROLOGUE))
8970 .setMIFlag(MachineInstr::FrameSetup);
8971 BuildMI(MBB, MBB.getFirstInstrTerminator(), DebugLoc(),
8972 TII->get(AArch64::PAUTH_EPILOGUE))
8973 .setMIFlag(MachineInstr::FrameDestroy);
8974 }
8975
8976 void AArch64InstrInfo::buildOutlinedFrame(
8977 MachineBasicBlock &MBB, MachineFunction &MF,
8978 const outliner::OutlinedFunction &OF) const {
8979
8980 AArch64FunctionInfo *FI = MF.getInfo<AArch64FunctionInfo>();
8981
8982 if (OF.FrameConstructionID == MachineOutlinerTailCall)
8983 FI->setOutliningStyle("Tail Call");
8984 else if (OF.FrameConstructionID == MachineOutlinerThunk) {
8985 // For thunk outlining, rewrite the last instruction from a call to a
8986 // tail-call.
8987 MachineInstr *Call = &*--MBB.instr_end();
8988 unsigned TailOpcode;
8989 if (Call->getOpcode() == AArch64::BL) {
8990 TailOpcode = AArch64::TCRETURNdi;
8991 } else {
8992 assert(Call->getOpcode() == AArch64::BLR ||
8993 Call->getOpcode() == AArch64::BLRNoIP);
8994 TailOpcode = AArch64::TCRETURNriALL;
8995 }
8996 MachineInstr *TC = BuildMI(MF, DebugLoc(), get(TailOpcode))
8997 .add(Call->getOperand(0))
8998 .addImm(0);
8999 MBB.insert(MBB.end(), TC);
9000 Call->eraseFromParent();
9001
9002 FI->setOutliningStyle("Thunk");
9003 }
9004
9005 bool IsLeafFunction = true;
9006
9007 // Is there a call in the outlined range?
9008 auto IsNonTailCall = [](const MachineInstr &MI) {
9009 return MI.isCall() && !MI.isReturn();
9010 };
9011
9012 if (llvm::any_of(MBB.instrs(), IsNonTailCall)) {
9013 // Fix up the instructions in the range, since we're going to modify the
9014 // stack.
9015
9016 // Bugzilla ID: 46767
9017 // TODO: Check if fixing up twice is safe so we can outline these.
9018 assert(OF.FrameConstructionID != MachineOutlinerDefault &&
9019 "Can only fix up stack references once");
9020 fixupPostOutline(MBB);
9021
9022 IsLeafFunction = false;
9023
9024 // LR has to be a live in so that we can save it.
9025 if (!MBB.isLiveIn(AArch64::LR))
9026 MBB.addLiveIn(AArch64::LR);
9027
9028 MachineBasicBlock::iterator It = MBB.begin();
9029 MachineBasicBlock::iterator Et = MBB.end();
9030
9031 if (OF.FrameConstructionID == MachineOutlinerTailCall ||
9032 OF.FrameConstructionID == MachineOutlinerThunk)
9033 Et = std::prev(MBB.end());
9034
9035 // Insert a save before the outlined region
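// This is equivalent to "str x30, [sp, #-16]!".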
9036 MachineInstr *STRXpre = BuildMI(MF, DebugLoc(), get(AArch64::STRXpre))
9037 .addReg(AArch64::SP, RegState::Define)
9038 .addReg(AArch64::LR)
9039 .addReg(AArch64::SP)
9040 .addImm(-16);
9041 It = MBB.insert(It, STRXpre);
9042
9043 if (MF.getInfo<AArch64FunctionInfo>()->needsDwarfUnwindInfo(MF)) {
9044 const TargetSubtargetInfo &STI = MF.getSubtarget();
9045 const MCRegisterInfo *MRI = STI.getRegisterInfo();
9046 unsigned DwarfReg = MRI->getDwarfRegNum(AArch64::LR, true);
9047
9048 // Add a CFI saying the stack was moved 16 B down.
9049 int64_t StackPosEntry =
9050 MF.addFrameInst(MCCFIInstruction::cfiDefCfaOffset(nullptr, 16));
9051 BuildMI(MBB, It, DebugLoc(), get(AArch64::CFI_INSTRUCTION))
9052 .addCFIIndex(StackPosEntry)
9053 .setMIFlags(MachineInstr::FrameSetup);
9054
9055 // Add a CFI saying that the LR that we want to find is now 16 B higher
9056 // than before.
9057 int64_t LRPosEntry = MF.addFrameInst(
9058 MCCFIInstruction::createOffset(nullptr, DwarfReg, -16));
9059 BuildMI(MBB, It, DebugLoc(), get(AArch64::CFI_INSTRUCTION))
9060 .addCFIIndex(LRPosEntry)
9061 .setMIFlags(MachineInstr::FrameSetup);
9062 }
9063
9064 // Insert a restore before the terminator for the function.
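// This is equivalent to "ldr x30, [sp], #16".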
    MachineInstr *LDRXpost = BuildMI(MF, DebugLoc(), get(AArch64::LDRXpost))
                                 .addReg(AArch64::SP, RegState::Define)
                                 .addReg(AArch64::LR, RegState::Define)
                                 .addReg(AArch64::SP)
                                 .addImm(16);
    Et = MBB.insert(Et, LDRXpost);
  }

  bool ShouldSignReturnAddr = FI->shouldSignReturnAddress(!IsLeafFunction);

  // If this is a tail call outlined function, then there's already a return.
  if (OF.FrameConstructionID == MachineOutlinerTailCall ||
      OF.FrameConstructionID == MachineOutlinerThunk) {
    signOutlinedFunction(MF, MBB, this, ShouldSignReturnAddr);
    return;
  }

  // It's not a tail call, so we have to insert the return ourselves.

  // LR has to be a live in so that we can return to it.
  if (!MBB.isLiveIn(AArch64::LR))
    MBB.addLiveIn(AArch64::LR);

  MachineInstr *ret = BuildMI(MF, DebugLoc(), get(AArch64::RET))
                          .addReg(AArch64::LR);
  MBB.insert(MBB.end(), ret);

  signOutlinedFunction(MF, MBB, this, ShouldSignReturnAddr);

  FI->setOutliningStyle("Function");

  // Did we have to modify the stack by saving the link register?
  if (OF.FrameConstructionID != MachineOutlinerDefault)
    return;

  // We modified the stack.
  // Walk over the basic block and fix up all the stack accesses.
  fixupPostOutline(MBB);
}

MachineBasicBlock::iterator AArch64InstrInfo::insertOutlinedCall(
    Module &M, MachineBasicBlock &MBB, MachineBasicBlock::iterator &It,
    MachineFunction &MF, outliner::Candidate &C) const {

  // Are we tail calling?
  if (C.CallConstructionID == MachineOutlinerTailCall) {
    // If yes, then we can just branch to the label.
    It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::TCRETURNdi))
                            .addGlobalAddress(M.getNamedValue(MF.getName()))
                            .addImm(0));
    return It;
  }

  // Are we saving the link register?
  if (C.CallConstructionID == MachineOutlinerNoLRSave ||
      C.CallConstructionID == MachineOutlinerThunk) {
    // No, so just insert the call.
    It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::BL))
                            .addGlobalAddress(M.getNamedValue(MF.getName())));
    return It;
  }

  // We want to return the spot where we inserted the call.
  MachineBasicBlock::iterator CallPt;

  // Instructions for saving and restoring LR around the call instruction we're
  // going to insert.
  MachineInstr *Save;
  MachineInstr *Restore;
  // Can we save to a register?
  if (C.CallConstructionID == MachineOutlinerRegSave) {
    // FIXME: This logic should be sunk into a target-specific interface so
    // that we don't have to recompute the register.
    Register Reg = findRegisterToSaveLRTo(C);
    assert(Reg && "No callee-saved register available?");

    // LR has to be a live in so that we can save it.
    if (!MBB.isLiveIn(AArch64::LR))
      MBB.addLiveIn(AArch64::LR);

    // Save and restore LR from Reg.
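    // "ORR Xd, XZR, Xm, LSL #0" is the canonical encoding of the
    // "mov Xd, Xm" alias, so these two instructions simply move LR into Reg
    // and back again around the call.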
    Save = BuildMI(MF, DebugLoc(), get(AArch64::ORRXrs), Reg)
               .addReg(AArch64::XZR)
               .addReg(AArch64::LR)
               .addImm(0);
    Restore = BuildMI(MF, DebugLoc(), get(AArch64::ORRXrs), AArch64::LR)
                  .addReg(AArch64::XZR)
                  .addReg(Reg)
                  .addImm(0);
  } else {
    // We have the default case. Save and restore from SP.
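    // That is, "str x30, [sp, #-16]!" before the call and
    // "ldr x30, [sp], #16" after it, the same save/restore pair that
    // buildOutlinedFrame emits.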
    Save = BuildMI(MF, DebugLoc(), get(AArch64::STRXpre))
               .addReg(AArch64::SP, RegState::Define)
               .addReg(AArch64::LR)
               .addReg(AArch64::SP)
               .addImm(-16);
    Restore = BuildMI(MF, DebugLoc(), get(AArch64::LDRXpost))
                  .addReg(AArch64::SP, RegState::Define)
                  .addReg(AArch64::LR, RegState::Define)
                  .addReg(AArch64::SP)
                  .addImm(16);
  }

  It = MBB.insert(It, Save);
  It++;

  // Insert the call.
  It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::BL))
                          .addGlobalAddress(M.getNamedValue(MF.getName())));
  CallPt = It;
  It++;

  It = MBB.insert(It, Restore);
  return CallPt;
}

bool AArch64InstrInfo::shouldOutlineFromFunctionByDefault(
    MachineFunction &MF) const {
  return MF.getFunction().hasMinSize();
}

void AArch64InstrInfo::buildClearRegister(Register Reg, MachineBasicBlock &MBB,
                                          MachineBasicBlock::iterator Iter,
                                          DebugLoc &DL,
                                          bool AllowSideEffects) const {
  const MachineFunction &MF = *MBB.getParent();
  const AArch64Subtarget &STI = MF.getSubtarget<AArch64Subtarget>();
  const AArch64RegisterInfo &TRI = *STI.getRegisterInfo();

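  // Pick the cheapest zeroing idiom for the register class: MOVZ for
  // general-purpose registers, an SVE DUP of #0 (which clears the whole
  // scalable vector) when SVE is available, and a NEON MOVI of #0 otherwise.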
  if (TRI.isGeneralPurposeRegister(MF, Reg)) {
    BuildMI(MBB, Iter, DL, get(AArch64::MOVZXi), Reg).addImm(0).addImm(0);
  } else if (STI.hasSVE()) {
    BuildMI(MBB, Iter, DL, get(AArch64::DUP_ZI_D), Reg)
        .addImm(0)
        .addImm(0);
  } else {
    BuildMI(MBB, Iter, DL, get(AArch64::MOVIv2d_ns), Reg)
        .addImm(0);
  }
}

std::optional<DestSourcePair>
AArch64InstrInfo::isCopyInstrImpl(const MachineInstr &MI) const {

  // AArch64::ORRWrs and AArch64::ORRXrs with WZR/XZR as the first source
  // register and a zero shift immediate are used as aliases for the mov
  // instruction.
  if (MI.getOpcode() == AArch64::ORRWrs &&
      MI.getOperand(1).getReg() == AArch64::WZR &&
      MI.getOperand(3).getImm() == 0x0 &&
      // Check that the w->w move is not a zero-extending w->x mov.
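      // For a physical W register, this checks that the instruction does not
      // also (implicitly) define the corresponding X register, which would
      // make it a zero-extension rather than a plain 32-bit copy; the X
      // register is computed by index arithmetic on the register enum.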
      (!MI.getOperand(0).getReg().isVirtual() ||
       MI.getOperand(0).getSubReg() == 0) &&
      (!MI.getOperand(0).getReg().isPhysical() ||
       MI.findRegisterDefOperandIdx(MI.getOperand(0).getReg() - AArch64::W0 +
                                    AArch64::X0) == -1))
    return DestSourcePair{MI.getOperand(0), MI.getOperand(2)};

  if (MI.getOpcode() == AArch64::ORRXrs &&
      MI.getOperand(1).getReg() == AArch64::XZR &&
      MI.getOperand(3).getImm() == 0x0)
    return DestSourcePair{MI.getOperand(0), MI.getOperand(2)};

  return std::nullopt;
}

std::optional<DestSourcePair>
AArch64InstrInfo::isCopyLikeInstrImpl(const MachineInstr &MI) const {
  if (MI.getOpcode() == AArch64::ORRWrs &&
      MI.getOperand(1).getReg() == AArch64::WZR &&
      MI.getOperand(3).getImm() == 0x0)
    return DestSourcePair{MI.getOperand(0), MI.getOperand(2)};
  return std::nullopt;
}

std::optional<RegImmPair>
AArch64InstrInfo::isAddImmediate(const MachineInstr &MI, Register Reg) const {
  int Sign = 1;
  int64_t Offset = 0;

  // TODO: Handle cases where Reg is a super- or sub-register of the
  // destination register.
  const MachineOperand &Op0 = MI.getOperand(0);
  if (!Op0.isReg() || Reg != Op0.getReg())
    return std::nullopt;

  switch (MI.getOpcode()) {
  default:
    return std::nullopt;
  case AArch64::SUBWri:
  case AArch64::SUBXri:
  case AArch64::SUBSWri:
  case AArch64::SUBSXri:
    Sign *= -1;
    [[fallthrough]];
  case AArch64::ADDSWri:
  case AArch64::ADDSXri:
  case AArch64::ADDWri:
  case AArch64::ADDXri: {
    // TODO: The third operand can also be a global address (usually some
    // string).
    if (!MI.getOperand(0).isReg() || !MI.getOperand(1).isReg() ||
        !MI.getOperand(2).isImm())
      return std::nullopt;
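    // The immediate may carry an optional left shift by 12; e.g.
    // "add x0, x1, #1, lsl #12" adds 4096 to x1.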
    int Shift = MI.getOperand(3).getImm();
    assert((Shift == 0 || Shift == 12) && "Shift can be either 0 or 12");
    Offset = Sign * (MI.getOperand(2).getImm() << Shift);
  }
  }
  return RegImmPair{MI.getOperand(1).getReg(), Offset};
}

/// If the given ORR instruction is a copy, and \p DescribedReg overlaps with
/// the destination register then, if possible, describe the value in terms of
/// the source register.
static std::optional<ParamLoadedValue>
describeORRLoadedValue(const MachineInstr &MI, Register DescribedReg,
                       const TargetInstrInfo *TII,
                       const TargetRegisterInfo *TRI) {
  auto DestSrc = TII->isCopyLikeInstr(MI);
  if (!DestSrc)
    return std::nullopt;

  Register DestReg = DestSrc->Destination->getReg();
  Register SrcReg = DestSrc->Source->getReg();

  auto Expr = DIExpression::get(MI.getMF()->getFunction().getContext(), {});

  // If the described register is the destination, just return the source.
  if (DestReg == DescribedReg)
    return ParamLoadedValue(MachineOperand::CreateReg(SrcReg, false), Expr);

  // A 32-bit copy via ORRWrs implicitly zero-extends to 64 bits, so the copy
  // can also describe the 64-bit super-register.
  if (MI.getOpcode() == AArch64::ORRWrs &&
      TRI->isSuperRegister(DestReg, DescribedReg))
    return ParamLoadedValue(MachineOperand::CreateReg(SrcReg, false), Expr);

  // We may need to describe the lower part of an ORRXrs move.
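  // (E.g. given "mov x1, x0", the value of w1 is described by w0, the low
  // 32 bits of the source.)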
  if (MI.getOpcode() == AArch64::ORRXrs &&
      TRI->isSubRegister(DestReg, DescribedReg)) {
    Register SrcSubReg = TRI->getSubReg(SrcReg, AArch64::sub_32);
    return ParamLoadedValue(MachineOperand::CreateReg(SrcSubReg, false), Expr);
  }

  assert(!TRI->isSuperOrSubRegisterEq(DestReg, DescribedReg) &&
         "Unhandled ORR[XW]rs copy case");

  return std::nullopt;
}

bool AArch64InstrInfo::isFunctionSafeToSplit(const MachineFunction &MF) const {
  // Functions cannot be split to different sections on AArch64 if they have
  // a red zone. This is because relaxing a cross-section branch may require
  // incrementing the stack pointer to spill a register, which would overwrite
  // the red zone.
  if (MF.getInfo<AArch64FunctionInfo>()->hasRedZone().value_or(true))
    return false;

  return TargetInstrInfo::isFunctionSafeToSplit(MF);
}

bool AArch64InstrInfo::isMBBSafeToSplitToCold(
    const MachineBasicBlock &MBB) const {
  // Asm Goto blocks can contain conditional branches to goto labels, which can
  // get moved out of range of the branch instruction.
  auto isAsmGoto = [](const MachineInstr &MI) {
    return MI.getOpcode() == AArch64::INLINEASM_BR;
  };
  if (llvm::any_of(MBB, isAsmGoto) || MBB.isInlineAsmBrIndirectTarget())
    return false;

  // Because jump tables are label-relative instead of table-relative, they all
  // must be in the same section or relocation fixup handling will fail.

  // Check if MBB is a jump table target
  const MachineJumpTableInfo *MJTI = MBB.getParent()->getJumpTableInfo();
  auto containsMBB = [&MBB](const MachineJumpTableEntry &JTE) {
    return llvm::is_contained(JTE.MBBs, &MBB);
  };
  if (MJTI != nullptr && llvm::any_of(MJTI->getJumpTables(), containsMBB))
    return false;

  // Check if MBB contains a jump table lookup
  for (const MachineInstr &MI : MBB) {
    switch (MI.getOpcode()) {
    case TargetOpcode::G_BRJT:
    case AArch64::JumpTableDest32:
    case AArch64::JumpTableDest16:
    case AArch64::JumpTableDest8:
      return false;
    default:
      continue;
    }
  }

  // MBB isn't a special case, so it's safe to split it into the cold section.
  return true;
}

std::optional<ParamLoadedValue>
AArch64InstrInfo::describeLoadedValue(const MachineInstr &MI,
                                      Register Reg) const {
  const MachineFunction *MF = MI.getMF();
  const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
  switch (MI.getOpcode()) {
  case AArch64::MOVZWi:
  case AArch64::MOVZXi: {
    // MOVZWi may be used for producing zero-extended 32-bit immediates in
    // 64-bit parameters, so we need to consider super-registers.
    if (!TRI->isSuperRegisterEq(MI.getOperand(0).getReg(), Reg))
      return std::nullopt;

    if (!MI.getOperand(1).isImm())
      return std::nullopt;
    int64_t Immediate = MI.getOperand(1).getImm();
    int Shift = MI.getOperand(2).getImm();
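    // E.g. "movz w0, #42, lsl #16" loads the value 42 << 16.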
    return ParamLoadedValue(MachineOperand::CreateImm(Immediate << Shift),
                            nullptr);
  }
  case AArch64::ORRWrs:
  case AArch64::ORRXrs:
    return describeORRLoadedValue(MI, Reg, this, TRI);
  }

  return TargetInstrInfo::describeLoadedValue(MI, Reg);
}

bool AArch64InstrInfo::isExtendLikelyToBeFolded(
    MachineInstr &ExtMI, MachineRegisterInfo &MRI) const {
  assert(ExtMI.getOpcode() == TargetOpcode::G_SEXT ||
         ExtMI.getOpcode() == TargetOpcode::G_ZEXT ||
         ExtMI.getOpcode() == TargetOpcode::G_ANYEXT);

  // Anyexts are nops.
  if (ExtMI.getOpcode() == TargetOpcode::G_ANYEXT)
    return true;

  Register DefReg = ExtMI.getOperand(0).getReg();
  if (!MRI.hasOneNonDBGUse(DefReg))
    return false;

  // It's likely that a sext/zext as a G_PTR_ADD offset will be folded into an
  // addressing mode.
  auto *UserMI = &*MRI.use_instr_nodbg_begin(DefReg);
  return UserMI->getOpcode() == TargetOpcode::G_PTR_ADD;
}

uint64_t AArch64InstrInfo::getElementSizeForOpcode(unsigned Opc) const {
  return get(Opc).TSFlags & AArch64::ElementSizeMask;
}

bool AArch64InstrInfo::isPTestLikeOpcode(unsigned Opc) const {
  return get(Opc).TSFlags & AArch64::InstrFlagIsPTestLike;
}

bool AArch64InstrInfo::isWhileOpcode(unsigned Opc) const {
  return get(Opc).TSFlags & AArch64::InstrFlagIsWhile;
}

unsigned int
AArch64InstrInfo::getTailDuplicateSize(CodeGenOptLevel OptLevel) const {
  return OptLevel >= CodeGenOptLevel::Aggressive ? 6 : 2;
}

bool AArch64InstrInfo::isLegalAddressingMode(unsigned NumBytes, int64_t Offset,
                                             unsigned Scale) const {
  if (Offset && Scale)
    return false;

  // Check Reg + Imm
  if (!Scale) {
    // 9-bit signed offset
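    // (The unscaled LDUR/STUR-style form, reaching -256 .. 255 bytes.)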
    if (isInt<9>(Offset))
      return true;

    // 12-bit unsigned offset
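    // The 12-bit field is scaled by the access size, so the reachable byte
    // offsets are 0 .. 4095 * NumBytes, in multiples of NumBytes.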
    unsigned Shift = Log2_64(NumBytes);
    if (NumBytes && Offset > 0 && (Offset / NumBytes) <= (1LL << 12) - 1 &&
        // Must be a multiple of NumBytes (NumBytes is a power of 2)
        (Offset >> Shift) << Shift == Offset)
      return true;
    return false;
  }

  // Check reg1 + SIZE_IN_BYTES * reg2 and reg1 + reg2
  return Scale == 1 || (Scale > 0 && Scale == NumBytes);
}

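// When straight-line-speculation (SLS) hardening of BLR is enabled, indirect
// calls are later rewritten into calls to per-register thunks; BLRNoIP
// excludes x16/x17 as the call target, since a linker-inserted
// range-extension veneer on the way to the thunk could clobber them.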
unsigned llvm::getBLRCallOpcode(const MachineFunction &MF) {
  if (MF.getSubtarget<AArch64Subtarget>().hardenSlsBlr())
    return AArch64::BLRNoIP;
  else
    return AArch64::BLR;
}

bool AArch64InstrInfo::isReallyTriviallyReMaterializable(
    const MachineInstr &MI) const {
  const MachineFunction &MF = *MI.getMF();
  const AArch64FunctionInfo &AFI = *MF.getInfo<AArch64FunctionInfo>();

  // If the function contains changes to streaming mode, then there
  // is a danger that rematerialized instructions end up between
  // instruction sequences (e.g. call sequences, or prolog/epilogue)
  // where the streaming-SVE mode is temporarily changed.
  if (AFI.hasStreamingModeChanges()) {
    // Avoid rematerializing instructions that use or define scalable values,
    // such as 'pfalse' or 'ptrue', which produce different results when the
    // runtime vector length differs.
    const MachineRegisterInfo &MRI = MF.getRegInfo();
    const MachineFrameInfo &MFI = MF.getFrameInfo();
    if (any_of(MI.operands(), [&MRI, &MFI](const MachineOperand &MO) {
          if (MO.isFI() &&
              MFI.getStackID(MO.getIndex()) == TargetStackID::ScalableVector)
            return true;
          if (!MO.isReg())
            return false;

          if (MO.getReg().isVirtual()) {
            const TargetRegisterClass *RC = MRI.getRegClass(MO.getReg());
            return AArch64::ZPRRegClass.hasSubClassEq(RC) ||
                   AArch64::PPRRegClass.hasSubClassEq(RC);
          }
          return AArch64::ZPRRegClass.contains(MO.getReg()) ||
                 AArch64::PPRRegClass.contains(MO.getReg());
        }))
      return false;

    // Avoid rematerializing instructions that return a value that is
    // different depending on vector length, even when it is not returned
    // in a scalable vector/predicate register.
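    // E.g. CNTB materializes the vector length in bytes, which may differ
    // between streaming and non-streaming mode.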
    switch (MI.getOpcode()) {
    default:
      break;
    case AArch64::RDVLI_XI:
    case AArch64::ADDVL_XXI:
    case AArch64::ADDPL_XXI:
    case AArch64::CNTB_XPiI:
    case AArch64::CNTH_XPiI:
    case AArch64::CNTW_XPiI:
    case AArch64::CNTD_XPiI:
      return false;
    }
  }

  return TargetInstrInfo::isReallyTriviallyReMaterializable(MI);
}

MachineBasicBlock::iterator
AArch64InstrInfo::probedStackAlloc(MachineBasicBlock::iterator MBBI,
                                   Register TargetReg, bool FrameSetup) const {
  assert(TargetReg != AArch64::SP &&
         "New top of stack cannot already be in SP");

  MachineBasicBlock &MBB = *MBBI->getParent();
  MachineFunction &MF = *MBB.getParent();
  const AArch64InstrInfo *TII =
      MF.getSubtarget<AArch64Subtarget>().getInstrInfo();
  int64_t ProbeSize = MF.getInfo<AArch64FunctionInfo>()->getStackProbeSize();
  DebugLoc DL = MBB.findDebugLoc(MBBI);

  MachineFunction::iterator MBBInsertPoint = std::next(MBB.getIterator());
  MachineBasicBlock *LoopTestMBB =
      MF.CreateMachineBasicBlock(MBB.getBasicBlock());
  MF.insert(MBBInsertPoint, LoopTestMBB);
  MachineBasicBlock *LoopBodyMBB =
      MF.CreateMachineBasicBlock(MBB.getBasicBlock());
  MF.insert(MBBInsertPoint, LoopBodyMBB);
  MachineBasicBlock *ExitMBB = MF.CreateMachineBasicBlock(MBB.getBasicBlock());
  MF.insert(MBBInsertPoint, ExitMBB);
  MachineInstr::MIFlag Flags =
      FrameSetup ? MachineInstr::FrameSetup : MachineInstr::NoFlags;
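
  // Emit a probing loop rather than dropping SP to TargetReg in one step:
  // each iteration moves SP down by ProbeSize and, while SP is still above
  // TargetReg, stores XZR at the new top of stack. This touches every page
  // of the allocated region, so a guard page cannot be jumped over. The
  // final "LDR XZR, [SP]" probes the page SP lands on once it is set to
  // TargetReg. (This assumes ProbeSize does not exceed the guard size.)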

  // LoopTest:
  //   SUB SP, SP, #ProbeSize
  emitFrameOffset(*LoopTestMBB, LoopTestMBB->end(), DL, AArch64::SP,
                  AArch64::SP, StackOffset::getFixed(-ProbeSize), TII, Flags);

  //   CMP SP, TargetReg
  BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL, TII->get(AArch64::SUBSXrx64),
          AArch64::XZR)
      .addReg(AArch64::SP)
      .addReg(TargetReg)
      .addImm(AArch64_AM::getArithExtendImm(AArch64_AM::UXTX, 0))
      .setMIFlags(Flags);

  //   B.<Cond> LoopExit
  BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL, TII->get(AArch64::Bcc))
      .addImm(AArch64CC::LE)
      .addMBB(ExitMBB)
      .setMIFlags(Flags);

  //   STR XZR, [SP]
  BuildMI(*LoopBodyMBB, LoopBodyMBB->end(), DL, TII->get(AArch64::STRXui))
      .addReg(AArch64::XZR)
      .addReg(AArch64::SP)
      .addImm(0)
      .setMIFlags(Flags);

  //   B loop
  BuildMI(*LoopBodyMBB, LoopBodyMBB->end(), DL, TII->get(AArch64::B))
      .addMBB(LoopTestMBB)
      .setMIFlags(Flags);

  // LoopExit:
  //   MOV SP, TargetReg
  BuildMI(*ExitMBB, ExitMBB->end(), DL, TII->get(AArch64::ADDXri), AArch64::SP)
      .addReg(TargetReg)
      .addImm(0)
      .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0))
      .setMIFlags(Flags);

  //   LDR XZR, [SP]
  BuildMI(*ExitMBB, ExitMBB->end(), DL, TII->get(AArch64::LDRXui))
      .addReg(AArch64::XZR, RegState::Define)
      .addReg(AArch64::SP)
      .addImm(0)
      .setMIFlags(Flags);

  ExitMBB->splice(ExitMBB->end(), &MBB, std::next(MBBI), MBB.end());
  ExitMBB->transferSuccessorsAndUpdatePHIs(&MBB);

  LoopTestMBB->addSuccessor(ExitMBB);
  LoopTestMBB->addSuccessor(LoopBodyMBB);
  LoopBodyMBB->addSuccessor(LoopTestMBB);
  MBB.addSuccessor(LoopTestMBB);

  // Update liveins.
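  // The three new blocks form a cycle, so their live-in sets depend on each
  // other; iterate until recomputeLiveIns reaches a fixed point.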
  if (MF.getRegInfo().reservedRegsFrozen()) {
    bool anyChange = false;
    do {
      anyChange = recomputeLiveIns(*ExitMBB) ||
                  recomputeLiveIns(*LoopBodyMBB) ||
                  recomputeLiveIns(*LoopTestMBB);
    } while (anyChange);
  }

  return ExitMBB->begin();
}

#define GET_INSTRINFO_HELPERS
#define GET_INSTRMAP_INFO
#include "AArch64GenInstrInfo.inc"