1 //===- AArch64InstrInfo.cpp - AArch64 Instruction Information -------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file contains the AArch64 implementation of the TargetInstrInfo class.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "AArch64InstrInfo.h"
14 #include "AArch64ExpandImm.h"
15 #include "AArch64FrameLowering.h"
16 #include "AArch64MachineFunctionInfo.h"
17 #include "AArch64PointerAuth.h"
18 #include "AArch64Subtarget.h"
19 #include "MCTargetDesc/AArch64AddressingModes.h"
20 #include "Utils/AArch64BaseInfo.h"
21 #include "llvm/ADT/ArrayRef.h"
22 #include "llvm/ADT/STLExtras.h"
23 #include "llvm/ADT/SmallVector.h"
24 #include "llvm/CodeGen/LivePhysRegs.h"
25 #include "llvm/CodeGen/MachineBasicBlock.h"
26 #include "llvm/CodeGen/MachineCombinerPattern.h"
27 #include "llvm/CodeGen/MachineFrameInfo.h"
28 #include "llvm/CodeGen/MachineFunction.h"
29 #include "llvm/CodeGen/MachineInstr.h"
30 #include "llvm/CodeGen/MachineInstrBuilder.h"
31 #include "llvm/CodeGen/MachineMemOperand.h"
32 #include "llvm/CodeGen/MachineModuleInfo.h"
33 #include "llvm/CodeGen/MachineOperand.h"
34 #include "llvm/CodeGen/MachineRegisterInfo.h"
35 #include "llvm/CodeGen/RegisterScavenging.h"
36 #include "llvm/CodeGen/StackMaps.h"
37 #include "llvm/CodeGen/TargetRegisterInfo.h"
38 #include "llvm/CodeGen/TargetSubtargetInfo.h"
39 #include "llvm/IR/DebugInfoMetadata.h"
40 #include "llvm/IR/DebugLoc.h"
41 #include "llvm/IR/GlobalValue.h"
42 #include "llvm/MC/MCAsmInfo.h"
43 #include "llvm/MC/MCInst.h"
44 #include "llvm/MC/MCInstBuilder.h"
45 #include "llvm/MC/MCInstrDesc.h"
46 #include "llvm/Support/Casting.h"
47 #include "llvm/Support/CodeGen.h"
48 #include "llvm/Support/CommandLine.h"
49 #include "llvm/Support/ErrorHandling.h"
50 #include "llvm/Support/LEB128.h"
51 #include "llvm/Support/MathExtras.h"
52 #include "llvm/Target/TargetMachine.h"
53 #include "llvm/Target/TargetOptions.h"
54 #include <cassert>
55 #include <cstdint>
56 #include <iterator>
57 #include <utility>
58 
59 using namespace llvm;
60 
61 #define GET_INSTRINFO_CTOR_DTOR
62 #include "AArch64GenInstrInfo.inc"
63 
64 static cl::opt<unsigned> TBZDisplacementBits(
65     "aarch64-tbz-offset-bits", cl::Hidden, cl::init(14),
66     cl::desc("Restrict range of TB[N]Z instructions (DEBUG)"));
67 
68 static cl::opt<unsigned> CBZDisplacementBits(
69     "aarch64-cbz-offset-bits", cl::Hidden, cl::init(19),
70     cl::desc("Restrict range of CB[N]Z instructions (DEBUG)"));
71 
72 static cl::opt<unsigned>
73     BCCDisplacementBits("aarch64-bcc-offset-bits", cl::Hidden, cl::init(19),
74                         cl::desc("Restrict range of Bcc instructions (DEBUG)"));
75 
76 static cl::opt<unsigned>
77     BDisplacementBits("aarch64-b-offset-bits", cl::Hidden, cl::init(26),
78                       cl::desc("Restrict range of B instructions (DEBUG)"));
79 
80 AArch64InstrInfo::AArch64InstrInfo(const AArch64Subtarget &STI)
81     : AArch64GenInstrInfo(AArch64::ADJCALLSTACKDOWN, AArch64::ADJCALLSTACKUP,
82                           AArch64::CATCHRET),
83       RI(STI.getTargetTriple()), Subtarget(STI) {}
84 
85 /// getInstSizeInBytes - Return the number of bytes of code the specified
86 /// instruction may be.  This returns the maximum number of bytes.
87 unsigned AArch64InstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
88   const MachineBasicBlock &MBB = *MI.getParent();
89   const MachineFunction *MF = MBB.getParent();
90   const Function &F = MF->getFunction();
91   const MCAsmInfo *MAI = MF->getTarget().getMCAsmInfo();
92 
93   {
94     auto Op = MI.getOpcode();
95     if (Op == AArch64::INLINEASM || Op == AArch64::INLINEASM_BR)
96       return getInlineAsmLength(MI.getOperand(0).getSymbolName(), *MAI);
97   }
98 
99   // Meta-instructions emit no code.
100   if (MI.isMetaInstruction())
101     return 0;
102 
103   // FIXME: We currently only handle pseudoinstructions that don't get expanded
104   //        before the assembly printer.
105   unsigned NumBytes = 0;
106   const MCInstrDesc &Desc = MI.getDesc();
107 
108   // The size should preferably be set in
109   // llvm/lib/Target/AArch64/AArch64InstrInfo.td (default case).
110   // Specific cases below handle instructions of variable size.
111   switch (Desc.getOpcode()) {
112   default:
113     if (Desc.getSize())
114       return Desc.getSize();
115 
116     // Anything not explicitly designated otherwise (i.e. pseudo-instructions
117     // with fixed constant size but not specified in .td file) is a normal
118     // 4-byte insn.
119     NumBytes = 4;
120     break;
121   case TargetOpcode::STACKMAP:
122     // The upper bound for a stackmap intrinsic is the full length of its shadow
123     NumBytes = StackMapOpers(&MI).getNumPatchBytes();
124     assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
125     break;
126   case TargetOpcode::PATCHPOINT:
127     // The size of the patchpoint intrinsic is the number of bytes requested
128     NumBytes = PatchPointOpers(&MI).getNumPatchBytes();
129     assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
130     break;
131   case TargetOpcode::STATEPOINT:
132     NumBytes = StatepointOpers(&MI).getNumPatchBytes();
133     assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
134     // No patch bytes means a normal call inst is emitted
135     if (NumBytes == 0)
136       NumBytes = 4;
137     break;
138   case TargetOpcode::PATCHABLE_FUNCTION_ENTER:
139     // If `patchable-function-entry` is set, PATCHABLE_FUNCTION_ENTER
140     // instructions are expanded to the specified number of NOPs. Otherwise,
141     // they are expanded to 36-byte XRay sleds.
142     NumBytes =
143         F.getFnAttributeAsParsedInteger("patchable-function-entry", 9) * 4;
144     break;
145   case TargetOpcode::PATCHABLE_FUNCTION_EXIT:
146   case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL:
147     // An XRay sled can be 4 bytes of alignment plus a 32-byte block.
148     NumBytes = 36;
149     break;
150   case TargetOpcode::PATCHABLE_EVENT_CALL:
151     // EVENT_CALL XRay sleds are exactly 6 instructions long (no alignment).
152     NumBytes = 24;
153     break;
154 
155   case AArch64::SPACE:
156     NumBytes = MI.getOperand(1).getImm();
157     break;
158   case TargetOpcode::BUNDLE:
159     NumBytes = getInstBundleLength(MI);
160     break;
161   }
162 
163   return NumBytes;
164 }
165 
166 unsigned AArch64InstrInfo::getInstBundleLength(const MachineInstr &MI) const {
167   unsigned Size = 0;
168   MachineBasicBlock::const_instr_iterator I = MI.getIterator();
169   MachineBasicBlock::const_instr_iterator E = MI.getParent()->instr_end();
170   while (++I != E && I->isInsideBundle()) {
171     assert(!I->isBundle() && "No nested bundle!");
172     Size += getInstSizeInBytes(*I);
173   }
174   return Size;
175 }
176 
177 static void parseCondBranch(MachineInstr *LastInst, MachineBasicBlock *&Target,
178                             SmallVectorImpl<MachineOperand> &Cond) {
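  // Cond is encoded as produced below and consumed by instantiateCondBranch,
  // insertSelect and reverseBranchCondition:
  //   Bcc:    Cond[0] = condition code immediate.
  //   CB(N)Z: Cond[0] = -1, Cond[1] = opcode, Cond[2] = register.
  //   TB(N)Z: Cond[0] = -1, Cond[1] = opcode, Cond[2] = register, Cond[3] = bit.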
179   // Block ends with fall-through condbranch.
180   switch (LastInst->getOpcode()) {
181   default:
182     llvm_unreachable("Unknown branch instruction?");
183   case AArch64::Bcc:
184     Target = LastInst->getOperand(1).getMBB();
185     Cond.push_back(LastInst->getOperand(0));
186     break;
187   case AArch64::CBZW:
188   case AArch64::CBZX:
189   case AArch64::CBNZW:
190   case AArch64::CBNZX:
191     Target = LastInst->getOperand(1).getMBB();
192     Cond.push_back(MachineOperand::CreateImm(-1));
193     Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode()));
194     Cond.push_back(LastInst->getOperand(0));
195     break;
196   case AArch64::TBZW:
197   case AArch64::TBZX:
198   case AArch64::TBNZW:
199   case AArch64::TBNZX:
200     Target = LastInst->getOperand(2).getMBB();
201     Cond.push_back(MachineOperand::CreateImm(-1));
202     Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode()));
203     Cond.push_back(LastInst->getOperand(0));
204     Cond.push_back(LastInst->getOperand(1));
205   }
206 }
207 
208 static unsigned getBranchDisplacementBits(unsigned Opc) {
209   switch (Opc) {
210   default:
211     llvm_unreachable("unexpected opcode!");
212   case AArch64::B:
213     return BDisplacementBits;
214   case AArch64::TBNZW:
215   case AArch64::TBZW:
216   case AArch64::TBNZX:
217   case AArch64::TBZX:
218     return TBZDisplacementBits;
219   case AArch64::CBNZW:
220   case AArch64::CBZW:
221   case AArch64::CBNZX:
222   case AArch64::CBZX:
223     return CBZDisplacementBits;
224   case AArch64::Bcc:
225     return BCCDisplacementBits;
226   }
227 }
228 
229 bool AArch64InstrInfo::isBranchOffsetInRange(unsigned BranchOp,
230                                              int64_t BrOffset) const {
231   unsigned Bits = getBranchDisplacementBits(BranchOp);
232   assert(Bits >= 3 && "max branch displacement must be enough to jump "
233                       "over conditional branch expansion");
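  // Branch displacements are encoded as a multiple of the 4-byte instruction
  // size, hence the division by 4.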
234   return isIntN(Bits, BrOffset / 4);
235 }
236 
237 MachineBasicBlock *
238 AArch64InstrInfo::getBranchDestBlock(const MachineInstr &MI) const {
239   switch (MI.getOpcode()) {
240   default:
241     llvm_unreachable("unexpected opcode!");
242   case AArch64::B:
243     return MI.getOperand(0).getMBB();
244   case AArch64::TBZW:
245   case AArch64::TBNZW:
246   case AArch64::TBZX:
247   case AArch64::TBNZX:
248     return MI.getOperand(2).getMBB();
249   case AArch64::CBZW:
250   case AArch64::CBNZW:
251   case AArch64::CBZX:
252   case AArch64::CBNZX:
253   case AArch64::Bcc:
254     return MI.getOperand(1).getMBB();
255   }
256 }
257 
258 void AArch64InstrInfo::insertIndirectBranch(MachineBasicBlock &MBB,
259                                             MachineBasicBlock &NewDestBB,
260                                             MachineBasicBlock &RestoreBB,
261                                             const DebugLoc &DL,
262                                             int64_t BrOffset,
263                                             RegScavenger *RS) const {
264   assert(RS && "RegScavenger required for long branching");
265   assert(MBB.empty() &&
266          "new block should be inserted for expanding unconditional branch");
267   assert(MBB.pred_size() == 1);
268   assert(RestoreBB.empty() &&
269          "restore block should be inserted for restoring clobbered registers");
270 
271   auto buildIndirectBranch = [&](Register Reg, MachineBasicBlock &DestBB) {
272     // Offsets outside of the signed 33-bit range are not supported for ADRP +
273     // ADD.
274     if (!isInt<33>(BrOffset))
275       report_fatal_error(
276           "Branch offsets outside of the signed 33-bit range not supported");
277 
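    // Materialize the destination address with "adrp Reg, Dest@PAGE" followed
    // by "add Reg, Reg, Dest@PAGEOFF", then branch through the register.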
278     BuildMI(MBB, MBB.end(), DL, get(AArch64::ADRP), Reg)
279         .addSym(DestBB.getSymbol(), AArch64II::MO_PAGE);
280     BuildMI(MBB, MBB.end(), DL, get(AArch64::ADDXri), Reg)
281         .addReg(Reg)
282         .addSym(DestBB.getSymbol(), AArch64II::MO_PAGEOFF | AArch64II::MO_NC)
283         .addImm(0);
284     BuildMI(MBB, MBB.end(), DL, get(AArch64::BR)).addReg(Reg);
285   };
286 
287   RS->enterBasicBlockEnd(MBB);
288   // If X16 is unused, we can rely on the linker to insert a range extension
289   // thunk if NewDestBB is out of range of a single B instruction.
290   constexpr Register Reg = AArch64::X16;
291   if (!RS->isRegUsed(Reg)) {
292     insertUnconditionalBranch(MBB, &NewDestBB, DL);
293     RS->setRegUsed(Reg);
294     return;
295   }
296 
297   // If there's a free register and it's worth inflating the code size,
298   // manually insert the indirect branch.
299   Register Scavenged = RS->FindUnusedReg(&AArch64::GPR64RegClass);
300   if (Scavenged != AArch64::NoRegister &&
301       MBB.getSectionID() == MBBSectionID::ColdSectionID) {
302     buildIndirectBranch(Scavenged, NewDestBB);
303     RS->setRegUsed(Scavenged);
304     return;
305   }
306 
307   // Note: Spilling X16 briefly moves the stack pointer, making it incompatible
308   // with red zones.
309   AArch64FunctionInfo *AFI = MBB.getParent()->getInfo<AArch64FunctionInfo>();
310   if (!AFI || AFI->hasRedZone().value_or(true))
311     report_fatal_error(
312         "Unable to insert indirect branch inside function that has red zone");
313 
314   // Otherwise, spill X16 and defer range extension to the linker.
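  // This emits "str x16, [sp, #-16]!", a pre-indexed store that also
  // decrements SP by 16 bytes.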
315   BuildMI(MBB, MBB.end(), DL, get(AArch64::STRXpre))
316       .addReg(AArch64::SP, RegState::Define)
317       .addReg(Reg)
318       .addReg(AArch64::SP)
319       .addImm(-16);
320 
321   BuildMI(MBB, MBB.end(), DL, get(AArch64::B)).addMBB(&RestoreBB);
322 
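  // The restore block reloads X16 with "ldr x16, [sp], #16".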
323   BuildMI(RestoreBB, RestoreBB.end(), DL, get(AArch64::LDRXpost))
324       .addReg(AArch64::SP, RegState::Define)
325       .addReg(Reg, RegState::Define)
326       .addReg(AArch64::SP)
327       .addImm(16);
328 }
329 
330 // Branch analysis.
331 bool AArch64InstrInfo::analyzeBranch(MachineBasicBlock &MBB,
332                                      MachineBasicBlock *&TBB,
333                                      MachineBasicBlock *&FBB,
334                                      SmallVectorImpl<MachineOperand> &Cond,
335                                      bool AllowModify) const {
336   // If the block has no terminators, it just falls into the block after it.
337   MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
338   if (I == MBB.end())
339     return false;
340 
341   // Skip over SpeculationBarrierEndBB terminators
342   if (I->getOpcode() == AArch64::SpeculationBarrierISBDSBEndBB ||
343       I->getOpcode() == AArch64::SpeculationBarrierSBEndBB) {
344     --I;
345   }
346 
347   if (!isUnpredicatedTerminator(*I))
348     return false;
349 
350   // Get the last instruction in the block.
351   MachineInstr *LastInst = &*I;
352 
353   // If there is only one terminator instruction, process it.
354   unsigned LastOpc = LastInst->getOpcode();
355   if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
356     if (isUncondBranchOpcode(LastOpc)) {
357       TBB = LastInst->getOperand(0).getMBB();
358       return false;
359     }
360     if (isCondBranchOpcode(LastOpc)) {
361       // Block ends with fall-through condbranch.
362       parseCondBranch(LastInst, TBB, Cond);
363       return false;
364     }
365     return true; // Can't handle indirect branch.
366   }
367 
368   // Get the instruction before it if it is a terminator.
369   MachineInstr *SecondLastInst = &*I;
370   unsigned SecondLastOpc = SecondLastInst->getOpcode();
371 
372   // If AllowModify is true and the block ends with two or more unconditional
373   // branches, delete all but the first unconditional branch.
374   if (AllowModify && isUncondBranchOpcode(LastOpc)) {
375     while (isUncondBranchOpcode(SecondLastOpc)) {
376       LastInst->eraseFromParent();
377       LastInst = SecondLastInst;
378       LastOpc = LastInst->getOpcode();
379       if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
380         // Return now; the only terminator is an unconditional branch.
381         TBB = LastInst->getOperand(0).getMBB();
382         return false;
383       }
384       SecondLastInst = &*I;
385       SecondLastOpc = SecondLastInst->getOpcode();
386     }
387   }
388 
389   // If we're allowed to modify and the block ends in an unconditional branch
390   // which could simply fallthrough, remove the branch.  (Note: This case only
391   // matters when we can't understand the whole sequence, otherwise it's also
392   // handled by BranchFolding.cpp.)
393   if (AllowModify && isUncondBranchOpcode(LastOpc) &&
394       MBB.isLayoutSuccessor(getBranchDestBlock(*LastInst))) {
395     LastInst->eraseFromParent();
396     LastInst = SecondLastInst;
397     LastOpc = LastInst->getOpcode();
398     if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
399       assert(!isUncondBranchOpcode(LastOpc) &&
400              "unreachable unconditional branches removed above");
401 
402       if (isCondBranchOpcode(LastOpc)) {
403         // Block ends with fall-through condbranch.
404         parseCondBranch(LastInst, TBB, Cond);
405         return false;
406       }
407       return true; // Can't handle indirect branch.
408     }
409     SecondLastInst = &*I;
410     SecondLastOpc = SecondLastInst->getOpcode();
411   }
412 
413   // If there are three terminators, we don't know what sort of block this is.
414   if (SecondLastInst && I != MBB.begin() && isUnpredicatedTerminator(*--I))
415     return true;
416 
417   // If the block ends with a B and a Bcc, handle it.
418   if (isCondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
419     parseCondBranch(SecondLastInst, TBB, Cond);
420     FBB = LastInst->getOperand(0).getMBB();
421     return false;
422   }
423 
424   // If the block ends with two unconditional branches, handle it.  The second
425   // one is not executed, so remove it.
426   if (isUncondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
427     TBB = SecondLastInst->getOperand(0).getMBB();
428     I = LastInst;
429     if (AllowModify)
430       I->eraseFromParent();
431     return false;
432   }
433 
434   // ...likewise if it ends with an indirect branch followed by an unconditional
435   // branch.
436   if (isIndirectBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
437     I = LastInst;
438     if (AllowModify)
439       I->eraseFromParent();
440     return true;
441   }
442 
443   // Otherwise, can't handle this.
444   return true;
445 }
446 
447 bool AArch64InstrInfo::analyzeBranchPredicate(MachineBasicBlock &MBB,
448                                               MachineBranchPredicate &MBP,
449                                               bool AllowModify) const {
450   // For the moment, handle only a block which ends with a cb(n)zx followed by
451   // a fallthrough.  Why this?  Because it is a common form.
452   // TODO: Should we handle b.cc?
453 
454   MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
455   if (I == MBB.end())
456     return true;
457 
458   // Skip over SpeculationBarrierEndBB terminators
459   if (I->getOpcode() == AArch64::SpeculationBarrierISBDSBEndBB ||
460       I->getOpcode() == AArch64::SpeculationBarrierSBEndBB) {
461     --I;
462   }
463 
464   if (!isUnpredicatedTerminator(*I))
465     return true;
466 
467   // Get the last instruction in the block.
468   MachineInstr *LastInst = &*I;
469   unsigned LastOpc = LastInst->getOpcode();
470   if (!isCondBranchOpcode(LastOpc))
471     return true;
472 
473   switch (LastOpc) {
474   default:
475     return true;
476   case AArch64::CBZW:
477   case AArch64::CBZX:
478   case AArch64::CBNZW:
479   case AArch64::CBNZX:
480     break;
481   };
482 
483   MBP.TrueDest = LastInst->getOperand(1).getMBB();
484   assert(MBP.TrueDest && "expected!");
485   MBP.FalseDest = MBB.getNextNode();
486 
487   MBP.ConditionDef = nullptr;
488   MBP.SingleUseCondition = false;
489 
490   MBP.LHS = LastInst->getOperand(0);
491   MBP.RHS = MachineOperand::CreateImm(0);
492   MBP.Predicate = LastOpc == AArch64::CBNZX ? MachineBranchPredicate::PRED_NE
493                                             : MachineBranchPredicate::PRED_EQ;
494   return false;
495 }
496 
497 bool AArch64InstrInfo::reverseBranchCondition(
498     SmallVectorImpl<MachineOperand> &Cond) const {
499   if (Cond[0].getImm() != -1) {
500     // Regular Bcc
501     AArch64CC::CondCode CC = (AArch64CC::CondCode)(int)Cond[0].getImm();
502     Cond[0].setImm(AArch64CC::getInvertedCondCode(CC));
503   } else {
504     // Folded compare-and-branch
505     switch (Cond[1].getImm()) {
506     default:
507       llvm_unreachable("Unknown conditional branch!");
508     case AArch64::CBZW:
509       Cond[1].setImm(AArch64::CBNZW);
510       break;
511     case AArch64::CBNZW:
512       Cond[1].setImm(AArch64::CBZW);
513       break;
514     case AArch64::CBZX:
515       Cond[1].setImm(AArch64::CBNZX);
516       break;
517     case AArch64::CBNZX:
518       Cond[1].setImm(AArch64::CBZX);
519       break;
520     case AArch64::TBZW:
521       Cond[1].setImm(AArch64::TBNZW);
522       break;
523     case AArch64::TBNZW:
524       Cond[1].setImm(AArch64::TBZW);
525       break;
526     case AArch64::TBZX:
527       Cond[1].setImm(AArch64::TBNZX);
528       break;
529     case AArch64::TBNZX:
530       Cond[1].setImm(AArch64::TBZX);
531       break;
532     }
533   }
534 
535   return false;
536 }
537 
538 unsigned AArch64InstrInfo::removeBranch(MachineBasicBlock &MBB,
539                                         int *BytesRemoved) const {
540   MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
541   if (I == MBB.end())
542     return 0;
543 
544   if (!isUncondBranchOpcode(I->getOpcode()) &&
545       !isCondBranchOpcode(I->getOpcode()))
546     return 0;
547 
548   // Remove the branch.
549   I->eraseFromParent();
550 
551   I = MBB.end();
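  // If the removed branch was the unconditional part of a conditional +
  // unconditional pair, a conditional branch may still terminate the block;
  // remove it as well.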
552 
553   if (I == MBB.begin()) {
554     if (BytesRemoved)
555       *BytesRemoved = 4;
556     return 1;
557   }
558   --I;
559   if (!isCondBranchOpcode(I->getOpcode())) {
560     if (BytesRemoved)
561       *BytesRemoved = 4;
562     return 1;
563   }
564 
565   // Remove the branch.
566   I->eraseFromParent();
567   if (BytesRemoved)
568     *BytesRemoved = 8;
569 
570   return 2;
571 }
572 
573 void AArch64InstrInfo::instantiateCondBranch(
574     MachineBasicBlock &MBB, const DebugLoc &DL, MachineBasicBlock *TBB,
575     ArrayRef<MachineOperand> Cond) const {
576   if (Cond[0].getImm() != -1) {
577     // Regular Bcc
578     BuildMI(&MBB, DL, get(AArch64::Bcc)).addImm(Cond[0].getImm()).addMBB(TBB);
579   } else {
580     // Folded compare-and-branch
581     // Note that we use addOperand instead of addReg to keep the flags.
582     const MachineInstrBuilder MIB =
583         BuildMI(&MBB, DL, get(Cond[1].getImm())).add(Cond[2]);
584     if (Cond.size() > 3)
585       MIB.addImm(Cond[3].getImm());
586     MIB.addMBB(TBB);
587   }
588 }
589 
590 unsigned AArch64InstrInfo::insertBranch(
591     MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB,
592     ArrayRef<MachineOperand> Cond, const DebugLoc &DL, int *BytesAdded) const {
593   // Shouldn't be a fall through.
594   assert(TBB && "insertBranch must not be told to insert a fallthrough");
595 
596   if (!FBB) {
597     if (Cond.empty()) // Unconditional branch?
598       BuildMI(&MBB, DL, get(AArch64::B)).addMBB(TBB);
599     else
600       instantiateCondBranch(MBB, DL, TBB, Cond);
601 
602     if (BytesAdded)
603       *BytesAdded = 4;
604 
605     return 1;
606   }
607 
608   // Two-way conditional branch.
609   instantiateCondBranch(MBB, DL, TBB, Cond);
610   BuildMI(&MBB, DL, get(AArch64::B)).addMBB(FBB);
611 
612   if (BytesAdded)
613     *BytesAdded = 8;
614 
615   return 2;
616 }
617 
618 // Find the original register that VReg is copied from.
619 static unsigned removeCopies(const MachineRegisterInfo &MRI, unsigned VReg) {
620   while (Register::isVirtualRegister(VReg)) {
621     const MachineInstr *DefMI = MRI.getVRegDef(VReg);
622     if (!DefMI->isFullCopy())
623       return VReg;
624     VReg = DefMI->getOperand(1).getReg();
625   }
626   return VReg;
627 }
628 
629 // Determine if VReg is defined by an instruction that can be folded into a
630 // csel instruction. If so, return the folded opcode, and the replacement
631 // register.
632 static unsigned canFoldIntoCSel(const MachineRegisterInfo &MRI, unsigned VReg,
633                                 unsigned *NewVReg = nullptr) {
634   VReg = removeCopies(MRI, VReg);
635   if (!Register::isVirtualRegister(VReg))
636     return 0;
637 
638   bool Is64Bit = AArch64::GPR64allRegClass.hasSubClassEq(MRI.getRegClass(VReg));
639   const MachineInstr *DefMI = MRI.getVRegDef(VReg);
640   unsigned Opc = 0;
641   unsigned SrcOpNum = 0;
642   switch (DefMI->getOpcode()) {
643   case AArch64::ADDSXri:
644   case AArch64::ADDSWri:
645     // if NZCV is used, do not fold.
646     if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, true) == -1)
647       return 0;
648     // fall-through to ADDXri and ADDWri.
649     [[fallthrough]];
650   case AArch64::ADDXri:
651   case AArch64::ADDWri:
652     // add x, 1 -> csinc.
653     if (!DefMI->getOperand(2).isImm() || DefMI->getOperand(2).getImm() != 1 ||
654         DefMI->getOperand(3).getImm() != 0)
655       return 0;
656     SrcOpNum = 1;
657     Opc = Is64Bit ? AArch64::CSINCXr : AArch64::CSINCWr;
658     break;
659 
660   case AArch64::ORNXrr:
661   case AArch64::ORNWrr: {
662     // not x -> csinv, represented as orn dst, xzr, src.
663     unsigned ZReg = removeCopies(MRI, DefMI->getOperand(1).getReg());
664     if (ZReg != AArch64::XZR && ZReg != AArch64::WZR)
665       return 0;
666     SrcOpNum = 2;
667     Opc = Is64Bit ? AArch64::CSINVXr : AArch64::CSINVWr;
668     break;
669   }
670 
671   case AArch64::SUBSXrr:
672   case AArch64::SUBSWrr:
673     // if NZCV is used, do not fold.
674     if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, true) == -1)
675       return 0;
676     // fall-through to SUBXrr and SUBWrr.
677     [[fallthrough]];
678   case AArch64::SUBXrr:
679   case AArch64::SUBWrr: {
680     // neg x -> csneg, represented as sub dst, xzr, src.
681     unsigned ZReg = removeCopies(MRI, DefMI->getOperand(1).getReg());
682     if (ZReg != AArch64::XZR && ZReg != AArch64::WZR)
683       return 0;
684     SrcOpNum = 2;
685     Opc = Is64Bit ? AArch64::CSNEGXr : AArch64::CSNEGWr;
686     break;
687   }
688   default:
689     return 0;
690   }
691   assert(Opc && SrcOpNum && "Missing parameters");
692 
693   if (NewVReg)
694     *NewVReg = DefMI->getOperand(SrcOpNum).getReg();
695   return Opc;
696 }
697 
698 bool AArch64InstrInfo::canInsertSelect(const MachineBasicBlock &MBB,
699                                        ArrayRef<MachineOperand> Cond,
700                                        Register DstReg, Register TrueReg,
701                                        Register FalseReg, int &CondCycles,
702                                        int &TrueCycles,
703                                        int &FalseCycles) const {
704   // Check register classes.
705   const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
706   const TargetRegisterClass *RC =
707       RI.getCommonSubClass(MRI.getRegClass(TrueReg), MRI.getRegClass(FalseReg));
708   if (!RC)
709     return false;
710 
711   // Also need to check the dest regclass, in case we're trying to optimize
712   // something like:
713   // %1(gpr) = PHI %2(fpr), bb1, %(fpr), bb2
714   if (!RI.getCommonSubClass(RC, MRI.getRegClass(DstReg)))
715     return false;
716 
717   // Expanding cbz/tbz requires an extra cycle of latency on the condition.
718   unsigned ExtraCondLat = Cond.size() != 1;
719 
720   // GPRs are handled by csel.
721   // FIXME: Fold in x+1, -x, and ~x when applicable.
722   if (AArch64::GPR64allRegClass.hasSubClassEq(RC) ||
723       AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
724     // Single-cycle csel, csinc, csinv, and csneg.
725     CondCycles = 1 + ExtraCondLat;
726     TrueCycles = FalseCycles = 1;
727     if (canFoldIntoCSel(MRI, TrueReg))
728       TrueCycles = 0;
729     else if (canFoldIntoCSel(MRI, FalseReg))
730       FalseCycles = 0;
731     return true;
732   }
733 
734   // Scalar floating point is handled by fcsel.
735   // FIXME: Form fabs, fmin, and fmax when applicable.
736   if (AArch64::FPR64RegClass.hasSubClassEq(RC) ||
737       AArch64::FPR32RegClass.hasSubClassEq(RC)) {
738     CondCycles = 5 + ExtraCondLat;
739     TrueCycles = FalseCycles = 2;
740     return true;
741   }
742 
743   // Can't do vectors.
744   return false;
745 }
746 
747 void AArch64InstrInfo::insertSelect(MachineBasicBlock &MBB,
748                                     MachineBasicBlock::iterator I,
749                                     const DebugLoc &DL, Register DstReg,
750                                     ArrayRef<MachineOperand> Cond,
751                                     Register TrueReg, Register FalseReg) const {
752   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
753 
754   // Parse the condition code, see parseCondBranch() above.
755   AArch64CC::CondCode CC;
756   switch (Cond.size()) {
757   default:
758     llvm_unreachable("Unknown condition opcode in Cond");
759   case 1: // b.cc
760     CC = AArch64CC::CondCode(Cond[0].getImm());
761     break;
762   case 3: { // cbz/cbnz
763     // We must insert a compare against 0.
764     bool Is64Bit;
765     switch (Cond[1].getImm()) {
766     default:
767       llvm_unreachable("Unknown branch opcode in Cond");
768     case AArch64::CBZW:
769       Is64Bit = false;
770       CC = AArch64CC::EQ;
771       break;
772     case AArch64::CBZX:
773       Is64Bit = true;
774       CC = AArch64CC::EQ;
775       break;
776     case AArch64::CBNZW:
777       Is64Bit = false;
778       CC = AArch64CC::NE;
779       break;
780     case AArch64::CBNZX:
781       Is64Bit = true;
782       CC = AArch64CC::NE;
783       break;
784     }
785     Register SrcReg = Cond[2].getReg();
786     if (Is64Bit) {
787       // cmp reg, #0 is actually subs xzr, reg, #0.
788       MRI.constrainRegClass(SrcReg, &AArch64::GPR64spRegClass);
789       BuildMI(MBB, I, DL, get(AArch64::SUBSXri), AArch64::XZR)
790           .addReg(SrcReg)
791           .addImm(0)
792           .addImm(0);
793     } else {
794       MRI.constrainRegClass(SrcReg, &AArch64::GPR32spRegClass);
795       BuildMI(MBB, I, DL, get(AArch64::SUBSWri), AArch64::WZR)
796           .addReg(SrcReg)
797           .addImm(0)
798           .addImm(0);
799     }
800     break;
801   }
802   case 4: { // tbz/tbnz
803     // We must insert a tst instruction.
804     switch (Cond[1].getImm()) {
805     default:
806       llvm_unreachable("Unknown branch opcode in Cond");
807     case AArch64::TBZW:
808     case AArch64::TBZX:
809       CC = AArch64CC::EQ;
810       break;
811     case AArch64::TBNZW:
812     case AArch64::TBNZX:
813       CC = AArch64CC::NE;
814       break;
815     }
816     // cmp reg, #foo is actually ands xzr, reg, #1<<foo.
817     if (Cond[1].getImm() == AArch64::TBZW || Cond[1].getImm() == AArch64::TBNZW)
818       BuildMI(MBB, I, DL, get(AArch64::ANDSWri), AArch64::WZR)
819           .addReg(Cond[2].getReg())
820           .addImm(
821               AArch64_AM::encodeLogicalImmediate(1ull << Cond[3].getImm(), 32));
822     else
823       BuildMI(MBB, I, DL, get(AArch64::ANDSXri), AArch64::XZR)
824           .addReg(Cond[2].getReg())
825           .addImm(
826               AArch64_AM::encodeLogicalImmediate(1ull << Cond[3].getImm(), 64));
827     break;
828   }
829   }
830 
831   unsigned Opc = 0;
832   const TargetRegisterClass *RC = nullptr;
833   bool TryFold = false;
834   if (MRI.constrainRegClass(DstReg, &AArch64::GPR64RegClass)) {
835     RC = &AArch64::GPR64RegClass;
836     Opc = AArch64::CSELXr;
837     TryFold = true;
838   } else if (MRI.constrainRegClass(DstReg, &AArch64::GPR32RegClass)) {
839     RC = &AArch64::GPR32RegClass;
840     Opc = AArch64::CSELWr;
841     TryFold = true;
842   } else if (MRI.constrainRegClass(DstReg, &AArch64::FPR64RegClass)) {
843     RC = &AArch64::FPR64RegClass;
844     Opc = AArch64::FCSELDrrr;
845   } else if (MRI.constrainRegClass(DstReg, &AArch64::FPR32RegClass)) {
846     RC = &AArch64::FPR32RegClass;
847     Opc = AArch64::FCSELSrrr;
848   }
849   assert(RC && "Unsupported regclass");
850 
851   // Try folding simple instructions into the csel.
852   if (TryFold) {
853     unsigned NewVReg = 0;
854     unsigned FoldedOpc = canFoldIntoCSel(MRI, TrueReg, &NewVReg);
855     if (FoldedOpc) {
856       // The folded opcodes csinc, csinv and csneg apply the operation to
857       // FalseReg, so we need to invert the condition.
858       CC = AArch64CC::getInvertedCondCode(CC);
859       TrueReg = FalseReg;
860     } else
861       FoldedOpc = canFoldIntoCSel(MRI, FalseReg, &NewVReg);
862 
863     // Fold the operation. Leave any dead instructions for DCE to clean up.
864     if (FoldedOpc) {
865       FalseReg = NewVReg;
866       Opc = FoldedOpc;
867       // This extends the live range of NewVReg.
868       MRI.clearKillFlags(NewVReg);
869     }
870   }
871 
872   // Pull all virtual registers into the appropriate class.
873   MRI.constrainRegClass(TrueReg, RC);
874   MRI.constrainRegClass(FalseReg, RC);
875 
876   // Insert the csel.
877   BuildMI(MBB, I, DL, get(Opc), DstReg)
878       .addReg(TrueReg)
879       .addReg(FalseReg)
880       .addImm(CC);
881 }
882 
883 // Return true if Imm can be loaded into a register by a "cheap" sequence of
884 // instructions. For now, "cheap" means at most two instructions.
885 static bool isCheapImmediate(const MachineInstr &MI, unsigned BitSize) {
886   if (BitSize == 32)
887     return true;
888 
889   assert(BitSize == 64 && "Only bit sizes of 32 or 64 allowed");
890   uint64_t Imm = static_cast<uint64_t>(MI.getOperand(1).getImm());
891   SmallVector<AArch64_IMM::ImmInsnModel, 4> Is;
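  // expandMOVImm computes the MOVZ/MOVN/MOVK (or ORR-immediate) sequence that
  // would be emitted to materialize Imm.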
892   AArch64_IMM::expandMOVImm(Imm, BitSize, Is);
893 
894   return Is.size() <= 2;
895 }
896 
897 // FIXME: this implementation should be micro-architecture dependent, so a
898 // micro-architecture target hook should be introduced here in future.
899 bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr &MI) const {
900   if (Subtarget.hasExynosCheapAsMoveHandling()) {
901     if (isExynosCheapAsMove(MI))
902       return true;
903     return MI.isAsCheapAsAMove();
904   }
905 
906   switch (MI.getOpcode()) {
907   default:
908     return MI.isAsCheapAsAMove();
909 
910   case AArch64::ADDWrs:
911   case AArch64::ADDXrs:
912   case AArch64::SUBWrs:
913   case AArch64::SUBXrs:
914     return Subtarget.hasALULSLFast() && MI.getOperand(3).getImm() <= 4;
915 
916   // If MOVi32imm or MOVi64imm can be expanded into ORRWri or
917   // ORRXri, it is as cheap as MOV.
918   // Likewise if it can be expanded to MOVZ/MOVN/MOVK.
919   case AArch64::MOVi32imm:
920     return isCheapImmediate(MI, 32);
921   case AArch64::MOVi64imm:
922     return isCheapImmediate(MI, 64);
923   }
924 }
925 
926 bool AArch64InstrInfo::isFalkorShiftExtFast(const MachineInstr &MI) {
927   switch (MI.getOpcode()) {
928   default:
929     return false;
930 
931   case AArch64::ADDWrs:
932   case AArch64::ADDXrs:
933   case AArch64::ADDSWrs:
934   case AArch64::ADDSXrs: {
935     unsigned Imm = MI.getOperand(3).getImm();
936     unsigned ShiftVal = AArch64_AM::getShiftValue(Imm);
937     if (ShiftVal == 0)
938       return true;
939     return AArch64_AM::getShiftType(Imm) == AArch64_AM::LSL && ShiftVal <= 5;
940   }
941 
942   case AArch64::ADDWrx:
943   case AArch64::ADDXrx:
944   case AArch64::ADDXrx64:
945   case AArch64::ADDSWrx:
946   case AArch64::ADDSXrx:
947   case AArch64::ADDSXrx64: {
948     unsigned Imm = MI.getOperand(3).getImm();
949     switch (AArch64_AM::getArithExtendType(Imm)) {
950     default:
951       return false;
952     case AArch64_AM::UXTB:
953     case AArch64_AM::UXTH:
954     case AArch64_AM::UXTW:
955     case AArch64_AM::UXTX:
956       return AArch64_AM::getArithShiftValue(Imm) <= 4;
957     }
958   }
959 
960   case AArch64::SUBWrs:
961   case AArch64::SUBSWrs: {
962     unsigned Imm = MI.getOperand(3).getImm();
963     unsigned ShiftVal = AArch64_AM::getShiftValue(Imm);
964     return ShiftVal == 0 ||
965            (AArch64_AM::getShiftType(Imm) == AArch64_AM::ASR && ShiftVal == 31);
966   }
967 
968   case AArch64::SUBXrs:
969   case AArch64::SUBSXrs: {
970     unsigned Imm = MI.getOperand(3).getImm();
971     unsigned ShiftVal = AArch64_AM::getShiftValue(Imm);
972     return ShiftVal == 0 ||
973            (AArch64_AM::getShiftType(Imm) == AArch64_AM::ASR && ShiftVal == 63);
974   }
975 
976   case AArch64::SUBWrx:
977   case AArch64::SUBXrx:
978   case AArch64::SUBXrx64:
979   case AArch64::SUBSWrx:
980   case AArch64::SUBSXrx:
981   case AArch64::SUBSXrx64: {
982     unsigned Imm = MI.getOperand(3).getImm();
983     switch (AArch64_AM::getArithExtendType(Imm)) {
984     default:
985       return false;
986     case AArch64_AM::UXTB:
987     case AArch64_AM::UXTH:
988     case AArch64_AM::UXTW:
989     case AArch64_AM::UXTX:
990       return AArch64_AM::getArithShiftValue(Imm) == 0;
991     }
992   }
993 
994   case AArch64::LDRBBroW:
995   case AArch64::LDRBBroX:
996   case AArch64::LDRBroW:
997   case AArch64::LDRBroX:
998   case AArch64::LDRDroW:
999   case AArch64::LDRDroX:
1000   case AArch64::LDRHHroW:
1001   case AArch64::LDRHHroX:
1002   case AArch64::LDRHroW:
1003   case AArch64::LDRHroX:
1004   case AArch64::LDRQroW:
1005   case AArch64::LDRQroX:
1006   case AArch64::LDRSBWroW:
1007   case AArch64::LDRSBWroX:
1008   case AArch64::LDRSBXroW:
1009   case AArch64::LDRSBXroX:
1010   case AArch64::LDRSHWroW:
1011   case AArch64::LDRSHWroX:
1012   case AArch64::LDRSHXroW:
1013   case AArch64::LDRSHXroX:
1014   case AArch64::LDRSWroW:
1015   case AArch64::LDRSWroX:
1016   case AArch64::LDRSroW:
1017   case AArch64::LDRSroX:
1018   case AArch64::LDRWroW:
1019   case AArch64::LDRWroX:
1020   case AArch64::LDRXroW:
1021   case AArch64::LDRXroX:
1022   case AArch64::PRFMroW:
1023   case AArch64::PRFMroX:
1024   case AArch64::STRBBroW:
1025   case AArch64::STRBBroX:
1026   case AArch64::STRBroW:
1027   case AArch64::STRBroX:
1028   case AArch64::STRDroW:
1029   case AArch64::STRDroX:
1030   case AArch64::STRHHroW:
1031   case AArch64::STRHHroX:
1032   case AArch64::STRHroW:
1033   case AArch64::STRHroX:
1034   case AArch64::STRQroW:
1035   case AArch64::STRQroX:
1036   case AArch64::STRSroW:
1037   case AArch64::STRSroX:
1038   case AArch64::STRWroW:
1039   case AArch64::STRWroX:
1040   case AArch64::STRXroW:
1041   case AArch64::STRXroX: {
1042     unsigned IsSigned = MI.getOperand(3).getImm();
1043     return !IsSigned;
1044   }
1045   }
1046 }
1047 
1048 bool AArch64InstrInfo::isSEHInstruction(const MachineInstr &MI) {
1049   unsigned Opc = MI.getOpcode();
1050   switch (Opc) {
1051     default:
1052       return false;
1053     case AArch64::SEH_StackAlloc:
1054     case AArch64::SEH_SaveFPLR:
1055     case AArch64::SEH_SaveFPLR_X:
1056     case AArch64::SEH_SaveReg:
1057     case AArch64::SEH_SaveReg_X:
1058     case AArch64::SEH_SaveRegP:
1059     case AArch64::SEH_SaveRegP_X:
1060     case AArch64::SEH_SaveFReg:
1061     case AArch64::SEH_SaveFReg_X:
1062     case AArch64::SEH_SaveFRegP:
1063     case AArch64::SEH_SaveFRegP_X:
1064     case AArch64::SEH_SetFP:
1065     case AArch64::SEH_AddFP:
1066     case AArch64::SEH_Nop:
1067     case AArch64::SEH_PrologEnd:
1068     case AArch64::SEH_EpilogStart:
1069     case AArch64::SEH_EpilogEnd:
1070     case AArch64::SEH_PACSignLR:
1071     case AArch64::SEH_SaveAnyRegQP:
1072     case AArch64::SEH_SaveAnyRegQPX:
1073       return true;
1074   }
1075 }
1076 
1077 bool AArch64InstrInfo::isCoalescableExtInstr(const MachineInstr &MI,
1078                                              Register &SrcReg, Register &DstReg,
1079                                              unsigned &SubIdx) const {
1080   switch (MI.getOpcode()) {
1081   default:
1082     return false;
1083   case AArch64::SBFMXri: // aka sxtw
1084   case AArch64::UBFMXri: // aka uxtw
1085     // Check for the 32 -> 64 bit extension case, these instructions can do
1086     // much more.
1087     if (MI.getOperand(2).getImm() != 0 || MI.getOperand(3).getImm() != 31)
1088       return false;
1089     // This is a signed or unsigned 32 -> 64 bit extension.
1090     SrcReg = MI.getOperand(1).getReg();
1091     DstReg = MI.getOperand(0).getReg();
1092     SubIdx = AArch64::sub_32;
1093     return true;
1094   }
1095 }
1096 
1097 bool AArch64InstrInfo::areMemAccessesTriviallyDisjoint(
1098     const MachineInstr &MIa, const MachineInstr &MIb) const {
1099   const TargetRegisterInfo *TRI = &getRegisterInfo();
1100   const MachineOperand *BaseOpA = nullptr, *BaseOpB = nullptr;
1101   int64_t OffsetA = 0, OffsetB = 0;
1102   TypeSize WidthA(0, false), WidthB(0, false);
1103   bool OffsetAIsScalable = false, OffsetBIsScalable = false;
1104 
1105   assert(MIa.mayLoadOrStore() && "MIa must be a load or store.");
1106   assert(MIb.mayLoadOrStore() && "MIb must be a load or store.");
1107 
1108   if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects() ||
1109       MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())
1110     return false;
1111 
1112   // Retrieve the base, the offset from the base, and the width. Width is the
1113   // size of memory that is being loaded/stored (e.g. 1, 2, 4, 8). If the
1114   // bases are identical, and the offset of the lower memory access plus its
1115   // width does not overlap the offset of the higher memory access, then the
1116   // memory accesses are disjoint.
1117   // If OffsetAIsScalable and OffsetBIsScalable are both true, they
1118   // are assumed to have the same scale (vscale).
1119   if (getMemOperandWithOffsetWidth(MIa, BaseOpA, OffsetA, OffsetAIsScalable,
1120                                    WidthA, TRI) &&
1121       getMemOperandWithOffsetWidth(MIb, BaseOpB, OffsetB, OffsetBIsScalable,
1122                                    WidthB, TRI)) {
1123     if (BaseOpA->isIdenticalTo(*BaseOpB) &&
1124         OffsetAIsScalable == OffsetBIsScalable) {
1125       int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
1126       int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
1127       TypeSize LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
1128       if (LowWidth.isScalable() == OffsetAIsScalable &&
1129           LowOffset + (int)LowWidth.getKnownMinValue() <= HighOffset)
1130         return true;
1131     }
1132   }
1133   return false;
1134 }
1135 
1136 bool AArch64InstrInfo::isSchedulingBoundary(const MachineInstr &MI,
1137                                             const MachineBasicBlock *MBB,
1138                                             const MachineFunction &MF) const {
1139   if (TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF))
1140     return true;
1141 
1142   // Do not move an instruction that can be recognized as a branch target.
1143   if (hasBTISemantics(MI))
1144     return true;
1145 
1146   switch (MI.getOpcode()) {
1147   case AArch64::HINT:
1148     // CSDB hints are scheduling barriers.
1149     if (MI.getOperand(0).getImm() == 0x14)
1150       return true;
1151     break;
1152   case AArch64::DSB:
1153   case AArch64::ISB:
1154     // DSB and ISB also are scheduling barriers.
1155     return true;
1156   case AArch64::MSRpstatesvcrImm1:
1157     // SMSTART and SMSTOP are also scheduling barriers.
1158     return true;
1159   default:;
1160   }
1161   if (isSEHInstruction(MI))
1162     return true;
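  // An instruction immediately followed by a CFI instruction is also treated
  // as a scheduling boundary so the CFI directive stays with it.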
1163   auto Next = std::next(MI.getIterator());
1164   return Next != MBB->end() && Next->isCFIInstruction();
1165 }
1166 
1167 /// analyzeCompare - For a comparison instruction, return the source registers
1168 /// in SrcReg and SrcReg2, and the value it compares against in CmpValue.
1169 /// Return true if the comparison instruction can be analyzed.
1170 bool AArch64InstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg,
1171                                       Register &SrcReg2, int64_t &CmpMask,
1172                                       int64_t &CmpValue) const {
1173   // The first operand can be a frame index where we'd normally expect a
1174   // register.
1175   assert(MI.getNumOperands() >= 2 && "All AArch64 cmps should have 2 operands");
1176   if (!MI.getOperand(1).isReg())
1177     return false;
1178 
1179   switch (MI.getOpcode()) {
1180   default:
1181     break;
1182   case AArch64::PTEST_PP:
1183   case AArch64::PTEST_PP_ANY:
1184     SrcReg = MI.getOperand(0).getReg();
1185     SrcReg2 = MI.getOperand(1).getReg();
1186     // Not sure about the mask and value for now...
1187     CmpMask = ~0;
1188     CmpValue = 0;
1189     return true;
1190   case AArch64::SUBSWrr:
1191   case AArch64::SUBSWrs:
1192   case AArch64::SUBSWrx:
1193   case AArch64::SUBSXrr:
1194   case AArch64::SUBSXrs:
1195   case AArch64::SUBSXrx:
1196   case AArch64::ADDSWrr:
1197   case AArch64::ADDSWrs:
1198   case AArch64::ADDSWrx:
1199   case AArch64::ADDSXrr:
1200   case AArch64::ADDSXrs:
1201   case AArch64::ADDSXrx:
1202     // Replace SUBSWrr with SUBWrr if NZCV is not used.
1203     SrcReg = MI.getOperand(1).getReg();
1204     SrcReg2 = MI.getOperand(2).getReg();
1205     CmpMask = ~0;
1206     CmpValue = 0;
1207     return true;
1208   case AArch64::SUBSWri:
1209   case AArch64::ADDSWri:
1210   case AArch64::SUBSXri:
1211   case AArch64::ADDSXri:
1212     SrcReg = MI.getOperand(1).getReg();
1213     SrcReg2 = 0;
1214     CmpMask = ~0;
1215     CmpValue = MI.getOperand(2).getImm();
1216     return true;
1217   case AArch64::ANDSWri:
1218   case AArch64::ANDSXri:
1219     // ANDS does not use the same encoding scheme as the other xxxS
1220     // instructions.
1221     SrcReg = MI.getOperand(1).getReg();
1222     SrcReg2 = 0;
1223     CmpMask = ~0;
1224     CmpValue = AArch64_AM::decodeLogicalImmediate(
1225                    MI.getOperand(2).getImm(),
1226                    MI.getOpcode() == AArch64::ANDSWri ? 32 : 64);
1227     return true;
1228   }
1229 
1230   return false;
1231 }
1232 
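/// Re-constrain the register operands of \p Instr to the register classes
/// required by its opcode. Returns false if a physical register operand does
/// not fit its constraint; virtual registers are constrained in place.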
1233 static bool UpdateOperandRegClass(MachineInstr &Instr) {
1234   MachineBasicBlock *MBB = Instr.getParent();
1235   assert(MBB && "Can't get MachineBasicBlock here");
1236   MachineFunction *MF = MBB->getParent();
1237   assert(MF && "Can't get MachineFunction here");
1238   const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
1239   const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
1240   MachineRegisterInfo *MRI = &MF->getRegInfo();
1241 
1242   for (unsigned OpIdx = 0, EndIdx = Instr.getNumOperands(); OpIdx < EndIdx;
1243        ++OpIdx) {
1244     MachineOperand &MO = Instr.getOperand(OpIdx);
1245     const TargetRegisterClass *OpRegCstraints =
1246         Instr.getRegClassConstraint(OpIdx, TII, TRI);
1247 
1248     // If there's no constraint, there's nothing to do.
1249     if (!OpRegCstraints)
1250       continue;
1251     // If the operand is a frame index, there's nothing to do here.
1252     // A frame index operand will resolve correctly during PEI.
1253     if (MO.isFI())
1254       continue;
1255 
1256     assert(MO.isReg() &&
1257            "Operand has register constraints without being a register!");
1258 
1259     Register Reg = MO.getReg();
1260     if (Reg.isPhysical()) {
1261       if (!OpRegCstraints->contains(Reg))
1262         return false;
1263     } else if (!OpRegCstraints->hasSubClassEq(MRI->getRegClass(Reg)) &&
1264                !MRI->constrainRegClass(Reg, OpRegCstraints))
1265       return false;
1266   }
1267 
1268   return true;
1269 }
1270 
1271 /// Return the opcode that does not set flags when possible - otherwise
1272 /// return the original opcode. The caller is responsible for doing the actual
1273 /// substitution and legality checking.
1274 static unsigned convertToNonFlagSettingOpc(const MachineInstr &MI) {
1275   // Don't convert all compare instructions, because for some the zero register
1276   // encoding becomes the sp register.
1277   bool MIDefinesZeroReg = false;
1278   if (MI.definesRegister(AArch64::WZR) || MI.definesRegister(AArch64::XZR))
1279     MIDefinesZeroReg = true;
1280 
1281   switch (MI.getOpcode()) {
1282   default:
1283     return MI.getOpcode();
1284   case AArch64::ADDSWrr:
1285     return AArch64::ADDWrr;
1286   case AArch64::ADDSWri:
1287     return MIDefinesZeroReg ? AArch64::ADDSWri : AArch64::ADDWri;
1288   case AArch64::ADDSWrs:
1289     return MIDefinesZeroReg ? AArch64::ADDSWrs : AArch64::ADDWrs;
1290   case AArch64::ADDSWrx:
1291     return AArch64::ADDWrx;
1292   case AArch64::ADDSXrr:
1293     return AArch64::ADDXrr;
1294   case AArch64::ADDSXri:
1295     return MIDefinesZeroReg ? AArch64::ADDSXri : AArch64::ADDXri;
1296   case AArch64::ADDSXrs:
1297     return MIDefinesZeroReg ? AArch64::ADDSXrs : AArch64::ADDXrs;
1298   case AArch64::ADDSXrx:
1299     return AArch64::ADDXrx;
1300   case AArch64::SUBSWrr:
1301     return AArch64::SUBWrr;
1302   case AArch64::SUBSWri:
1303     return MIDefinesZeroReg ? AArch64::SUBSWri : AArch64::SUBWri;
1304   case AArch64::SUBSWrs:
1305     return MIDefinesZeroReg ? AArch64::SUBSWrs : AArch64::SUBWrs;
1306   case AArch64::SUBSWrx:
1307     return AArch64::SUBWrx;
1308   case AArch64::SUBSXrr:
1309     return AArch64::SUBXrr;
1310   case AArch64::SUBSXri:
1311     return MIDefinesZeroReg ? AArch64::SUBSXri : AArch64::SUBXri;
1312   case AArch64::SUBSXrs:
1313     return MIDefinesZeroReg ? AArch64::SUBSXrs : AArch64::SUBXrs;
1314   case AArch64::SUBSXrx:
1315     return AArch64::SUBXrx;
1316   }
1317 }
1318 
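// Bitmask describing which kinds of NZCV accesses to check for; AK_All matches
// both reads and writes.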
1319 enum AccessKind { AK_Write = 0x01, AK_Read = 0x10, AK_All = 0x11 };
1320 
1321 /// True when condition flags are accessed (either by writing or reading)
1322 /// on the instruction trace starting at From and ending at To.
1323 ///
1324 /// Note: If From and To are in different blocks, the condition flags are
1325 ///       assumed to be accessed on the path.
1326 static bool areCFlagsAccessedBetweenInstrs(
1327     MachineBasicBlock::iterator From, MachineBasicBlock::iterator To,
1328     const TargetRegisterInfo *TRI, const AccessKind AccessToCheck = AK_All) {
1329   // Early exit if To is at the beginning of the BB.
1330   if (To == To->getParent()->begin())
1331     return true;
1332 
1333   // Check whether the instructions are in the same basic block
1334   // If not, assume the condition flags might get modified somewhere.
1335   if (To->getParent() != From->getParent())
1336     return true;
1337 
1338   // From must be above To.
1339   assert(std::any_of(
1340       ++To.getReverse(), To->getParent()->rend(),
1341       [From](MachineInstr &MI) { return MI.getIterator() == From; }));
1342 
1343   // We iterate backward starting at \p To until we hit \p From.
1344   for (const MachineInstr &Instr :
1345        instructionsWithoutDebug(++To.getReverse(), From.getReverse())) {
1346     if (((AccessToCheck & AK_Write) &&
1347          Instr.modifiesRegister(AArch64::NZCV, TRI)) ||
1348         ((AccessToCheck & AK_Read) && Instr.readsRegister(AArch64::NZCV, TRI)))
1349       return true;
1350   }
1351   return false;
1352 }
1353 
1354 /// optimizePTestInstr - Attempt to remove a ptest of a predicate-generating
1355 /// operation which could set the flags in an identical manner
1356 bool AArch64InstrInfo::optimizePTestInstr(
1357     MachineInstr *PTest, unsigned MaskReg, unsigned PredReg,
1358     const MachineRegisterInfo *MRI) const {
1359   auto *Mask = MRI->getUniqueVRegDef(MaskReg);
1360   auto *Pred = MRI->getUniqueVRegDef(PredReg);
1361   auto NewOp = Pred->getOpcode();
1362   bool OpChanged = false;
1363 
1364   unsigned MaskOpcode = Mask->getOpcode();
1365   unsigned PredOpcode = Pred->getOpcode();
1366   bool PredIsPTestLike = isPTestLikeOpcode(PredOpcode);
1367   bool PredIsWhileLike = isWhileOpcode(PredOpcode);
1368 
1369   if (isPTrueOpcode(MaskOpcode) && (PredIsPTestLike || PredIsWhileLike) &&
1370       getElementSizeForOpcode(MaskOpcode) ==
1371           getElementSizeForOpcode(PredOpcode) &&
1372       Mask->getOperand(1).getImm() == 31) {
1373     // For PTEST(PTRUE_ALL, WHILE), if the element size matches, the PTEST is
1374     // redundant since WHILE performs an implicit PTEST with an all active
1375     // mask. Must be an all active predicate of matching element size.
1376 
1377     // For PTEST(PTRUE_ALL, PTEST_LIKE), the PTEST is redundant if the
1378     // PTEST_LIKE instruction uses the same all active mask and the element
1379     // size matches. If the PTEST has a condition of any then it is always
1380     // redundant.
1381     if (PredIsPTestLike) {
1382       auto PTestLikeMask = MRI->getUniqueVRegDef(Pred->getOperand(1).getReg());
1383       if (Mask != PTestLikeMask && PTest->getOpcode() != AArch64::PTEST_PP_ANY)
1384         return false;
1385     }
1386 
1387     // Fallthrough to simply remove the PTEST.
1388   } else if ((Mask == Pred) && (PredIsPTestLike || PredIsWhileLike) &&
1389              PTest->getOpcode() == AArch64::PTEST_PP_ANY) {
1390     // For PTEST(PG, PG), PTEST is redundant when PG is the result of an
1391     // instruction that sets the flags as PTEST would. This is only valid when
1392     // the condition is any.
1393 
1394     // Fallthrough to simply remove the PTEST.
1395   } else if (PredIsPTestLike) {
1396     // For PTEST(PG, PTEST_LIKE(PG, ...)), the PTEST is redundant since the
1397     // flags are set based on the same mask 'PG', but PTEST_LIKE must operate
1398     // on 8-bit predicates like the PTEST.  Otherwise, for instructions like
1399     // compare that also support 16/32/64-bit predicates, the implicit PTEST
1400     // performed by the compare could consider fewer lanes for these element
1401     // sizes.
1402     //
1403     // For example, consider
1404     //
1405     //   ptrue p0.b                    ; P0=1111-1111-1111-1111
1406     //   index z0.s, #0, #1            ; Z0=<0,1,2,3>
1407     //   index z1.s, #1, #1            ; Z1=<1,2,3,4>
1408     //   cmphi p1.s, p0/z, z1.s, z0.s  ; P1=0001-0001-0001-0001
1409     //                                 ;       ^ last active
1410     //   ptest p0, p1.b                ; P1=0001-0001-0001-0001
1411     //                                 ;     ^ last active
1412     //
1413     // where the compare generates a canonical all active 32-bit predicate
1414     // (equivalent to 'ptrue p1.s, all'). The implicit PTEST sets the last
1415     // active flag, whereas the PTEST instruction with the same mask doesn't.
1416     // For PTEST_ANY this doesn't apply as the flags in this case would be
1417     // identical regardless of element size.
1418     auto PTestLikeMask = MRI->getUniqueVRegDef(Pred->getOperand(1).getReg());
1419     uint64_t PredElementSize = getElementSizeForOpcode(PredOpcode);
1420     if ((Mask != PTestLikeMask) ||
1421         (PredElementSize != AArch64::ElementSizeB &&
1422          PTest->getOpcode() != AArch64::PTEST_PP_ANY))
1423       return false;
1424 
    // Fallthrough to simply remove the PTEST.
1426   } else {
1427     // If OP in PTEST(PG, OP(PG, ...)) has a flag-setting variant change the
1428     // opcode so the PTEST becomes redundant.
1429     switch (PredOpcode) {
1430     case AArch64::AND_PPzPP:
1431     case AArch64::BIC_PPzPP:
1432     case AArch64::EOR_PPzPP:
1433     case AArch64::NAND_PPzPP:
1434     case AArch64::NOR_PPzPP:
1435     case AArch64::ORN_PPzPP:
1436     case AArch64::ORR_PPzPP:
1437     case AArch64::BRKA_PPzP:
1438     case AArch64::BRKPA_PPzPP:
1439     case AArch64::BRKB_PPzP:
1440     case AArch64::BRKPB_PPzPP:
1441     case AArch64::RDFFR_PPz: {
      // Check to see if our mask is the same. If not, the resulting flag bits
      // may be different and we can't remove the ptest.
1444       auto *PredMask = MRI->getUniqueVRegDef(Pred->getOperand(1).getReg());
1445       if (Mask != PredMask)
1446         return false;
1447       break;
1448     }
1449     case AArch64::BRKN_PPzP: {
1450       // BRKN uses an all active implicit mask to set flags unlike the other
1451       // flag-setting instructions.
1452       // PTEST(PTRUE_B(31), BRKN(PG, A, B)) -> BRKNS(PG, A, B).
1453       if ((MaskOpcode != AArch64::PTRUE_B) ||
1454           (Mask->getOperand(1).getImm() != 31))
1455         return false;
1456       break;
1457     }
1458     case AArch64::PTRUE_B:
1459       // PTEST(OP=PTRUE_B(A), OP) -> PTRUES_B(A)
1460       break;
1461     default:
1462       // Bail out if we don't recognize the input
1463       return false;
1464     }
1465 
1466     NewOp = convertToFlagSettingOpc(PredOpcode);
1467     OpChanged = true;
1468   }
1469 
1470   const TargetRegisterInfo *TRI = &getRegisterInfo();
1471 
1472   // If another instruction between Pred and PTest accesses flags, don't remove
1473   // the ptest or update the earlier instruction to modify them.
1474   if (areCFlagsAccessedBetweenInstrs(Pred, PTest, TRI))
1475     return false;
1476 
1477   // If we pass all the checks, it's safe to remove the PTEST and use the flags
1478   // as they are prior to PTEST. Sometimes this requires the tested PTEST
1479   // operand to be replaced with an equivalent instruction that also sets the
1480   // flags.
1481   Pred->setDesc(get(NewOp));
1482   PTest->eraseFromParent();
1483   if (OpChanged) {
1484     bool succeeded = UpdateOperandRegClass(*Pred);
1485     (void)succeeded;
1486     assert(succeeded && "Operands have incompatible register classes!");
1487     Pred->addRegisterDefined(AArch64::NZCV, TRI);
1488   }
1489 
1490   // Ensure that the flags def is live.
1491   if (Pred->registerDefIsDead(AArch64::NZCV, TRI)) {
1492     unsigned i = 0, e = Pred->getNumOperands();
1493     for (; i != e; ++i) {
1494       MachineOperand &MO = Pred->getOperand(i);
1495       if (MO.isReg() && MO.isDef() && MO.getReg() == AArch64::NZCV) {
1496         MO.setIsDead(false);
1497         break;
1498       }
1499     }
1500   }
1501   return true;
1502 }
1503 
/// Try to optimize a compare instruction. A compare instruction is an
/// instruction which produces AArch64::NZCV. It is truly a compare
/// instruction only when there are no uses of its destination register.
1508 ///
1509 /// The following steps are tried in order:
/// 1. Convert CmpInstr into an unconditional version if NZCV is not used.
/// 2. Remove CmpInstr if there is an earlier instruction producing a needed
///    condition code, or an instruction which can be converted into such an
///    instruction.
///    Only comparisons with zero or one are supported.
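///
/// A sketch of step 1 (register names are illustrative only):
///    \code
///   subs w8, w0, w1   ; NZCV has no users
///    \endcode
/// becomes
///    \code
///   sub  w8, w0, w1
///    \endcode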
bool AArch64InstrInfo::optimizeCompareInstr(
1516     MachineInstr &CmpInstr, Register SrcReg, Register SrcReg2, int64_t CmpMask,
1517     int64_t CmpValue, const MachineRegisterInfo *MRI) const {
1518   assert(CmpInstr.getParent());
1519   assert(MRI);
1520 
1521   // Replace SUBSWrr with SUBWrr if NZCV is not used.
1522   int DeadNZCVIdx = CmpInstr.findRegisterDefOperandIdx(AArch64::NZCV, true);
1523   if (DeadNZCVIdx != -1) {
1524     if (CmpInstr.definesRegister(AArch64::WZR) ||
1525         CmpInstr.definesRegister(AArch64::XZR)) {
1526       CmpInstr.eraseFromParent();
1527       return true;
1528     }
1529     unsigned Opc = CmpInstr.getOpcode();
1530     unsigned NewOpc = convertToNonFlagSettingOpc(CmpInstr);
1531     if (NewOpc == Opc)
1532       return false;
1533     const MCInstrDesc &MCID = get(NewOpc);
1534     CmpInstr.setDesc(MCID);
1535     CmpInstr.removeOperand(DeadNZCVIdx);
1536     bool succeeded = UpdateOperandRegClass(CmpInstr);
1537     (void)succeeded;
1538     assert(succeeded && "Some operands reg class are incompatible!");
1539     return true;
1540   }
1541 
1542   if (CmpInstr.getOpcode() == AArch64::PTEST_PP ||
1543       CmpInstr.getOpcode() == AArch64::PTEST_PP_ANY)
1544     return optimizePTestInstr(&CmpInstr, SrcReg, SrcReg2, MRI);
1545 
1546   if (SrcReg2 != 0)
1547     return false;
1548 
  // CmpInstr is a compare instruction if its destination register is not used.
1550   if (!MRI->use_nodbg_empty(CmpInstr.getOperand(0).getReg()))
1551     return false;
1552 
1553   if (CmpValue == 0 && substituteCmpToZero(CmpInstr, SrcReg, *MRI))
1554     return true;
1555   return (CmpValue == 0 || CmpValue == 1) &&
1556          removeCmpToZeroOrOne(CmpInstr, SrcReg, CmpValue, *MRI);
1557 }
1558 
/// Get the opcode of the S (flag-setting) version of Instr.
/// If Instr is already an S version, its opcode is returned.
/// AArch64::INSTRUCTION_LIST_END is returned if Instr does not have an S
/// version or we are not interested in it.
static unsigned sForm(MachineInstr &Instr) {
1564   switch (Instr.getOpcode()) {
1565   default:
1566     return AArch64::INSTRUCTION_LIST_END;
1567 
1568   case AArch64::ADDSWrr:
1569   case AArch64::ADDSWri:
1570   case AArch64::ADDSXrr:
1571   case AArch64::ADDSXri:
1572   case AArch64::SUBSWrr:
1573   case AArch64::SUBSWri:
1574   case AArch64::SUBSXrr:
1575   case AArch64::SUBSXri:
1576     return Instr.getOpcode();
1577 
1578   case AArch64::ADDWrr:
1579     return AArch64::ADDSWrr;
1580   case AArch64::ADDWri:
1581     return AArch64::ADDSWri;
1582   case AArch64::ADDXrr:
1583     return AArch64::ADDSXrr;
1584   case AArch64::ADDXri:
1585     return AArch64::ADDSXri;
1586   case AArch64::ADCWr:
1587     return AArch64::ADCSWr;
1588   case AArch64::ADCXr:
1589     return AArch64::ADCSXr;
1590   case AArch64::SUBWrr:
1591     return AArch64::SUBSWrr;
1592   case AArch64::SUBWri:
1593     return AArch64::SUBSWri;
1594   case AArch64::SUBXrr:
1595     return AArch64::SUBSXrr;
1596   case AArch64::SUBXri:
1597     return AArch64::SUBSXri;
1598   case AArch64::SBCWr:
1599     return AArch64::SBCSWr;
1600   case AArch64::SBCXr:
1601     return AArch64::SBCSXr;
1602   case AArch64::ANDWri:
1603     return AArch64::ANDSWri;
1604   case AArch64::ANDXri:
1605     return AArch64::ANDSXri;
1606   }
1607 }
1608 
1609 /// Check if AArch64::NZCV should be alive in successors of MBB.
static bool areCFlagsAliveInSuccessors(const MachineBasicBlock *MBB) {
1611   for (auto *BB : MBB->successors())
1612     if (BB->isLiveIn(AArch64::NZCV))
1613       return true;
1614   return false;
1615 }
1616 
1617 /// \returns The condition code operand index for \p Instr if it is a branch
1618 /// or select and -1 otherwise.
1619 static int
findCondCodeUseOperandIdxForBranchOrSelect(const MachineInstr &Instr) {
1621   switch (Instr.getOpcode()) {
1622   default:
1623     return -1;
1624 
1625   case AArch64::Bcc: {
1626     int Idx = Instr.findRegisterUseOperandIdx(AArch64::NZCV);
1627     assert(Idx >= 2);
1628     return Idx - 2;
1629   }
1630 
1631   case AArch64::CSINVWr:
1632   case AArch64::CSINVXr:
1633   case AArch64::CSINCWr:
1634   case AArch64::CSINCXr:
1635   case AArch64::CSELWr:
1636   case AArch64::CSELXr:
1637   case AArch64::CSNEGWr:
1638   case AArch64::CSNEGXr:
1639   case AArch64::FCSELSrrr:
1640   case AArch64::FCSELDrrr: {
1641     int Idx = Instr.findRegisterUseOperandIdx(AArch64::NZCV);
1642     assert(Idx >= 1);
1643     return Idx - 1;
1644   }
1645   }
1646 }
1647 
1648 /// Find a condition code used by the instruction.
1649 /// Returns AArch64CC::Invalid if either the instruction does not use condition
1650 /// codes or we don't optimize CmpInstr in the presence of such instructions.
static AArch64CC::CondCode findCondCodeUsedByInstr(const MachineInstr &Instr) {
1652   int CCIdx = findCondCodeUseOperandIdxForBranchOrSelect(Instr);
1653   return CCIdx >= 0 ? static_cast<AArch64CC::CondCode>(
1654                           Instr.getOperand(CCIdx).getImm())
1655                     : AArch64CC::Invalid;
1656 }
1657 
static UsedNZCV getUsedNZCV(AArch64CC::CondCode CC) {
1659   assert(CC != AArch64CC::Invalid);
1660   UsedNZCV UsedFlags;
1661   switch (CC) {
1662   default:
1663     break;
1664 
1665   case AArch64CC::EQ: // Z set
1666   case AArch64CC::NE: // Z clear
1667     UsedFlags.Z = true;
1668     break;
1669 
1670   case AArch64CC::HI: // Z clear and C set
1671   case AArch64CC::LS: // Z set   or  C clear
1672     UsedFlags.Z = true;
1673     [[fallthrough]];
1674   case AArch64CC::HS: // C set
1675   case AArch64CC::LO: // C clear
1676     UsedFlags.C = true;
1677     break;
1678 
1679   case AArch64CC::MI: // N set
1680   case AArch64CC::PL: // N clear
1681     UsedFlags.N = true;
1682     break;
1683 
1684   case AArch64CC::VS: // V set
1685   case AArch64CC::VC: // V clear
1686     UsedFlags.V = true;
1687     break;
1688 
1689   case AArch64CC::GT: // Z clear, N and V the same
1690   case AArch64CC::LE: // Z set,   N and V differ
1691     UsedFlags.Z = true;
1692     [[fallthrough]];
1693   case AArch64CC::GE: // N and V the same
1694   case AArch64CC::LT: // N and V differ
1695     UsedFlags.N = true;
1696     UsedFlags.V = true;
1697     break;
1698   }
1699   return UsedFlags;
1700 }
1701 
/// \returns Condition flags used after \p CmpInstr in its MachineBB if NZCV
/// flags are not alive in the successors of the block containing both
/// \p CmpInstr and \p MI.
/// \returns std::nullopt otherwise.
///
/// Collects instructions using those flags in \p CCUseInstrs if provided.
1707 std::optional<UsedNZCV>
llvm::examineCFlagsUse(MachineInstr &MI, MachineInstr &CmpInstr,
1709                        const TargetRegisterInfo &TRI,
1710                        SmallVectorImpl<MachineInstr *> *CCUseInstrs) {
1711   MachineBasicBlock *CmpParent = CmpInstr.getParent();
1712   if (MI.getParent() != CmpParent)
1713     return std::nullopt;
1714 
1715   if (areCFlagsAliveInSuccessors(CmpParent))
1716     return std::nullopt;
1717 
1718   UsedNZCV NZCVUsedAfterCmp;
1719   for (MachineInstr &Instr : instructionsWithoutDebug(
1720            std::next(CmpInstr.getIterator()), CmpParent->instr_end())) {
1721     if (Instr.readsRegister(AArch64::NZCV, &TRI)) {
1722       AArch64CC::CondCode CC = findCondCodeUsedByInstr(Instr);
1723       if (CC == AArch64CC::Invalid) // Unsupported conditional instruction
1724         return std::nullopt;
1725       NZCVUsedAfterCmp |= getUsedNZCV(CC);
1726       if (CCUseInstrs)
1727         CCUseInstrs->push_back(&Instr);
1728     }
1729     if (Instr.modifiesRegister(AArch64::NZCV, &TRI))
1730       break;
1731   }
1732   return NZCVUsedAfterCmp;
1733 }
1734 
static bool isADDSRegImm(unsigned Opcode) {
1736   return Opcode == AArch64::ADDSWri || Opcode == AArch64::ADDSXri;
1737 }
1738 
static bool isSUBSRegImm(unsigned Opcode) {
1740   return Opcode == AArch64::SUBSWri || Opcode == AArch64::SUBSXri;
1741 }
1742 
1743 /// Check if CmpInstr can be substituted by MI.
1744 ///
1745 /// CmpInstr can be substituted:
1746 /// - CmpInstr is either 'ADDS %vreg, 0' or 'SUBS %vreg, 0'
1747 /// - and, MI and CmpInstr are from the same MachineBB
1748 /// - and, condition flags are not alive in successors of the CmpInstr parent
1749 /// - and, if MI opcode is the S form there must be no defs of flags between
1750 ///        MI and CmpInstr
1751 ///        or if MI opcode is not the S form there must be neither defs of flags
1752 ///        nor uses of flags between MI and CmpInstr.
/// - and, the C flag is not used after CmpInstr
/// - and, if the V flag is used after CmpInstr, MI produces a poison value
///        when signed overflow occurs (i.e. it has the no-signed-wrap flag).
static bool canInstrSubstituteCmpInstr(MachineInstr &MI, MachineInstr &CmpInstr,
1757                                        const TargetRegisterInfo &TRI) {
  // NOTE: this assertion guarantees that MI.getOpcode() is an add or subtract
  // that may or may not set flags.
1760   assert(sForm(MI) != AArch64::INSTRUCTION_LIST_END);
1761 
1762   const unsigned CmpOpcode = CmpInstr.getOpcode();
1763   if (!isADDSRegImm(CmpOpcode) && !isSUBSRegImm(CmpOpcode))
1764     return false;
1765 
1766   assert((CmpInstr.getOperand(2).isImm() &&
1767           CmpInstr.getOperand(2).getImm() == 0) &&
1768          "Caller guarantees that CmpInstr compares with constant 0");
1769 
1770   std::optional<UsedNZCV> NZVCUsed = examineCFlagsUse(MI, CmpInstr, TRI);
1771   if (!NZVCUsed || NZVCUsed->C)
1772     return false;
1773 
1774   // CmpInstr is either 'ADDS %vreg, 0' or 'SUBS %vreg, 0', and MI is either
1775   // '%vreg = add ...' or '%vreg = sub ...'.
1776   // Condition flag V is used to indicate signed overflow.
1777   // 1) MI and CmpInstr set N and V to the same value.
1778   // 2) If MI is add/sub with no-signed-wrap, it produces a poison value when
1779   //    signed overflow occurs, so CmpInstr could still be simplified away.
1780   if (NZVCUsed->V && !MI.getFlag(MachineInstr::NoSWrap))
1781     return false;
1782 
1783   AccessKind AccessToCheck = AK_Write;
1784   if (sForm(MI) != MI.getOpcode())
1785     AccessToCheck = AK_All;
1786   return !areCFlagsAccessedBetweenInstrs(&MI, &CmpInstr, &TRI, AccessToCheck);
1787 }
1788 
1789 /// Substitute an instruction comparing to zero with another instruction
1790 /// which produces needed condition flags.
1791 ///
1792 /// Return true on success.
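///
/// A sketch (register names are illustrative; the compare is against zero):
///    \code
///   add  w8, w0, w1
///   subs wzr, w8, #0
///   b.eq <target>
///    \endcode
/// becomes
///    \code
///   adds w8, w0, w1
///   b.eq <target>
///    \endcode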
bool AArch64InstrInfo::substituteCmpToZero(
1794     MachineInstr &CmpInstr, unsigned SrcReg,
1795     const MachineRegisterInfo &MRI) const {
1796   // Get the unique definition of SrcReg.
1797   MachineInstr *MI = MRI.getUniqueVRegDef(SrcReg);
1798   if (!MI)
1799     return false;
1800 
1801   const TargetRegisterInfo &TRI = getRegisterInfo();
1802 
1803   unsigned NewOpc = sForm(*MI);
1804   if (NewOpc == AArch64::INSTRUCTION_LIST_END)
1805     return false;
1806 
1807   if (!canInstrSubstituteCmpInstr(*MI, CmpInstr, TRI))
1808     return false;
1809 
1810   // Update the instruction to set NZCV.
1811   MI->setDesc(get(NewOpc));
1812   CmpInstr.eraseFromParent();
1813   bool succeeded = UpdateOperandRegClass(*MI);
1814   (void)succeeded;
1815   assert(succeeded && "Some operands reg class are incompatible!");
1816   MI->addRegisterDefined(AArch64::NZCV, &TRI);
1817   return true;
1818 }
1819 
1820 /// \returns True if \p CmpInstr can be removed.
1821 ///
1822 /// \p IsInvertCC is true if, after removing \p CmpInstr, condition
1823 /// codes used in \p CCUseInstrs must be inverted.
static bool canCmpInstrBeRemoved(MachineInstr &MI, MachineInstr &CmpInstr,
1825                                  int CmpValue, const TargetRegisterInfo &TRI,
1826                                  SmallVectorImpl<MachineInstr *> &CCUseInstrs,
1827                                  bool &IsInvertCC) {
1828   assert((CmpValue == 0 || CmpValue == 1) &&
1829          "Only comparisons to 0 or 1 considered for removal!");
1830 
1831   // MI is 'CSINCWr %vreg, wzr, wzr, <cc>' or 'CSINCXr %vreg, xzr, xzr, <cc>'
1832   unsigned MIOpc = MI.getOpcode();
1833   if (MIOpc == AArch64::CSINCWr) {
1834     if (MI.getOperand(1).getReg() != AArch64::WZR ||
1835         MI.getOperand(2).getReg() != AArch64::WZR)
1836       return false;
1837   } else if (MIOpc == AArch64::CSINCXr) {
1838     if (MI.getOperand(1).getReg() != AArch64::XZR ||
1839         MI.getOperand(2).getReg() != AArch64::XZR)
1840       return false;
1841   } else {
1842     return false;
1843   }
1844   AArch64CC::CondCode MICC = findCondCodeUsedByInstr(MI);
1845   if (MICC == AArch64CC::Invalid)
1846     return false;
1847 
1848   // NZCV needs to be defined
1849   if (MI.findRegisterDefOperandIdx(AArch64::NZCV, true) != -1)
1850     return false;
1851 
1852   // CmpInstr is 'ADDS %vreg, 0' or 'SUBS %vreg, 0' or 'SUBS %vreg, 1'
1853   const unsigned CmpOpcode = CmpInstr.getOpcode();
1854   bool IsSubsRegImm = isSUBSRegImm(CmpOpcode);
1855   if (CmpValue && !IsSubsRegImm)
1856     return false;
1857   if (!CmpValue && !IsSubsRegImm && !isADDSRegImm(CmpOpcode))
1858     return false;
1859 
1860   // MI conditions allowed: eq, ne, mi, pl
1861   UsedNZCV MIUsedNZCV = getUsedNZCV(MICC);
1862   if (MIUsedNZCV.C || MIUsedNZCV.V)
1863     return false;
1864 
1865   std::optional<UsedNZCV> NZCVUsedAfterCmp =
1866       examineCFlagsUse(MI, CmpInstr, TRI, &CCUseInstrs);
  // Condition flags are not used in CmpInstr basic block successors, and only
  // the Z or N flags are allowed to be used after CmpInstr within its basic
  // block.
1869   if (!NZCVUsedAfterCmp || NZCVUsedAfterCmp->C || NZCVUsedAfterCmp->V)
1870     return false;
1871   // Z or N flag used after CmpInstr must correspond to the flag used in MI
1872   if ((MIUsedNZCV.Z && NZCVUsedAfterCmp->N) ||
1873       (MIUsedNZCV.N && NZCVUsedAfterCmp->Z))
1874     return false;
  // If CmpInstr is a comparison to zero, MI conditions are limited to eq, ne.
1876   if (MIUsedNZCV.N && !CmpValue)
1877     return false;
1878 
1879   // There must be no defs of flags between MI and CmpInstr
1880   if (areCFlagsAccessedBetweenInstrs(&MI, &CmpInstr, &TRI, AK_Write))
1881     return false;
1882 
1883   // Condition code is inverted in the following cases:
1884   // 1. MI condition is ne; CmpInstr is 'ADDS %vreg, 0' or 'SUBS %vreg, 0'
1885   // 2. MI condition is eq, pl; CmpInstr is 'SUBS %vreg, 1'
1886   IsInvertCC = (CmpValue && (MICC == AArch64CC::EQ || MICC == AArch64CC::PL)) ||
1887                (!CmpValue && MICC == AArch64CC::NE);
1888   return true;
1889 }
1890 
1891 /// Remove comparison in csinc-cmp sequence
1892 ///
1893 /// Examples:
1894 /// 1. \code
1895 ///   csinc w9, wzr, wzr, ne
1896 ///   cmp   w9, #0
1897 ///   b.eq
1898 ///    \endcode
1899 /// to
1900 ///    \code
1901 ///   csinc w9, wzr, wzr, ne
1902 ///   b.ne
1903 ///    \endcode
1904 ///
1905 /// 2. \code
1906 ///   csinc x2, xzr, xzr, mi
1907 ///   cmp   x2, #1
1908 ///   b.pl
1909 ///    \endcode
1910 /// to
1911 ///    \code
1912 ///   csinc x2, xzr, xzr, mi
1913 ///   b.pl
1914 ///    \endcode
1915 ///
1916 /// \param  CmpInstr comparison instruction
1917 /// \return True when comparison removed
bool AArch64InstrInfo::removeCmpToZeroOrOne(
1919     MachineInstr &CmpInstr, unsigned SrcReg, int CmpValue,
1920     const MachineRegisterInfo &MRI) const {
1921   MachineInstr *MI = MRI.getUniqueVRegDef(SrcReg);
1922   if (!MI)
1923     return false;
1924   const TargetRegisterInfo &TRI = getRegisterInfo();
1925   SmallVector<MachineInstr *, 4> CCUseInstrs;
1926   bool IsInvertCC = false;
1927   if (!canCmpInstrBeRemoved(*MI, CmpInstr, CmpValue, TRI, CCUseInstrs,
1928                             IsInvertCC))
1929     return false;
1930   // Make transformation
1931   CmpInstr.eraseFromParent();
1932   if (IsInvertCC) {
1933     // Invert condition codes in CmpInstr CC users
1934     for (MachineInstr *CCUseInstr : CCUseInstrs) {
1935       int Idx = findCondCodeUseOperandIdxForBranchOrSelect(*CCUseInstr);
1936       assert(Idx >= 0 && "Unexpected instruction using CC.");
1937       MachineOperand &CCOperand = CCUseInstr->getOperand(Idx);
1938       AArch64CC::CondCode CCUse = AArch64CC::getInvertedCondCode(
1939           static_cast<AArch64CC::CondCode>(CCOperand.getImm()));
1940       CCOperand.setImm(CCUse);
1941     }
1942   }
1943   return true;
1944 }
1945 
bool AArch64InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
1947   if (MI.getOpcode() != TargetOpcode::LOAD_STACK_GUARD &&
1948       MI.getOpcode() != AArch64::CATCHRET)
1949     return false;
1950 
1951   MachineBasicBlock &MBB = *MI.getParent();
1952   auto &Subtarget = MBB.getParent()->getSubtarget<AArch64Subtarget>();
1953   auto TRI = Subtarget.getRegisterInfo();
1954   DebugLoc DL = MI.getDebugLoc();
1955 
1956   if (MI.getOpcode() == AArch64::CATCHRET) {
1957     // Skip to the first instruction before the epilog.
1958     const TargetInstrInfo *TII =
1959       MBB.getParent()->getSubtarget().getInstrInfo();
1960     MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB();
1961     auto MBBI = MachineBasicBlock::iterator(MI);
1962     MachineBasicBlock::iterator FirstEpilogSEH = std::prev(MBBI);
1963     while (FirstEpilogSEH->getFlag(MachineInstr::FrameDestroy) &&
1964            FirstEpilogSEH != MBB.begin())
1965       FirstEpilogSEH = std::prev(FirstEpilogSEH);
1966     if (FirstEpilogSEH != MBB.begin())
1967       FirstEpilogSEH = std::next(FirstEpilogSEH);
1968     BuildMI(MBB, FirstEpilogSEH, DL, TII->get(AArch64::ADRP))
1969         .addReg(AArch64::X0, RegState::Define)
1970         .addMBB(TargetMBB);
1971     BuildMI(MBB, FirstEpilogSEH, DL, TII->get(AArch64::ADDXri))
1972         .addReg(AArch64::X0, RegState::Define)
1973         .addReg(AArch64::X0)
1974         .addMBB(TargetMBB)
1975         .addImm(0);
1976     return true;
1977   }
1978 
1979   Register Reg = MI.getOperand(0).getReg();
1980   Module &M = *MBB.getParent()->getFunction().getParent();
1981   if (M.getStackProtectorGuard() == "sysreg") {
1982     const AArch64SysReg::SysReg *SrcReg =
1983         AArch64SysReg::lookupSysRegByName(M.getStackProtectorGuardReg());
1984     if (!SrcReg)
1985       report_fatal_error("Unknown SysReg for Stack Protector Guard Register");
1986 
1987     // mrs xN, sysreg
1988     BuildMI(MBB, MI, DL, get(AArch64::MRS))
1989         .addDef(Reg, RegState::Renamable)
1990         .addImm(SrcReg->Encoding);
1991     int Offset = M.getStackProtectorGuardOffset();
1992     if (Offset >= 0 && Offset <= 32760 && Offset % 8 == 0) {
1993       // ldr xN, [xN, #offset]
1994       BuildMI(MBB, MI, DL, get(AArch64::LDRXui))
1995           .addDef(Reg)
1996           .addUse(Reg, RegState::Kill)
1997           .addImm(Offset / 8);
1998     } else if (Offset >= -256 && Offset <= 255) {
1999       // ldur xN, [xN, #offset]
2000       BuildMI(MBB, MI, DL, get(AArch64::LDURXi))
2001           .addDef(Reg)
2002           .addUse(Reg, RegState::Kill)
2003           .addImm(Offset);
2004     } else if (Offset >= -4095 && Offset <= 4095) {
2005       if (Offset > 0) {
2006         // add xN, xN, #offset
2007         BuildMI(MBB, MI, DL, get(AArch64::ADDXri))
2008             .addDef(Reg)
2009             .addUse(Reg, RegState::Kill)
2010             .addImm(Offset)
2011             .addImm(0);
2012       } else {
2013         // sub xN, xN, #offset
2014         BuildMI(MBB, MI, DL, get(AArch64::SUBXri))
2015             .addDef(Reg)
2016             .addUse(Reg, RegState::Kill)
2017             .addImm(-Offset)
2018             .addImm(0);
2019       }
2020       // ldr xN, [xN]
2021       BuildMI(MBB, MI, DL, get(AArch64::LDRXui))
2022           .addDef(Reg)
2023           .addUse(Reg, RegState::Kill)
2024           .addImm(0);
2025     } else {
      // Cases that are larger than +/- 4095 and not a multiple of 8, or larger
      // than 32760.
2028       // It might be nice to use AArch64::MOVi32imm here, which would get
2029       // expanded in PreSched2 after PostRA, but our lone scratch Reg already
2030       // contains the MRS result. findScratchNonCalleeSaveRegister() in
2031       // AArch64FrameLowering might help us find such a scratch register
2032       // though. If we failed to find a scratch register, we could emit a
2033       // stream of add instructions to build up the immediate. Or, we could try
2034       // to insert a AArch64::MOVi32imm before register allocation so that we
2035       // didn't need to scavenge for a scratch register.
2036       report_fatal_error("Unable to encode Stack Protector Guard Offset");
2037     }
2038     MBB.erase(MI);
2039     return true;
2040   }
2041 
2042   const GlobalValue *GV =
2043       cast<GlobalValue>((*MI.memoperands_begin())->getValue());
2044   const TargetMachine &TM = MBB.getParent()->getTarget();
2045   unsigned OpFlags = Subtarget.ClassifyGlobalReference(GV, TM);
2046   const unsigned char MO_NC = AArch64II::MO_NC;
2047 
2048   if ((OpFlags & AArch64II::MO_GOT) != 0) {
2049     BuildMI(MBB, MI, DL, get(AArch64::LOADgot), Reg)
2050         .addGlobalAddress(GV, 0, OpFlags);
2051     if (Subtarget.isTargetILP32()) {
2052       unsigned Reg32 = TRI->getSubReg(Reg, AArch64::sub_32);
2053       BuildMI(MBB, MI, DL, get(AArch64::LDRWui))
2054           .addDef(Reg32, RegState::Dead)
2055           .addUse(Reg, RegState::Kill)
2056           .addImm(0)
2057           .addMemOperand(*MI.memoperands_begin())
2058           .addDef(Reg, RegState::Implicit);
2059     } else {
2060       BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
2061           .addReg(Reg, RegState::Kill)
2062           .addImm(0)
2063           .addMemOperand(*MI.memoperands_begin());
2064     }
2065   } else if (TM.getCodeModel() == CodeModel::Large) {
2066     assert(!Subtarget.isTargetILP32() && "how can large exist in ILP32?");
2067     BuildMI(MBB, MI, DL, get(AArch64::MOVZXi), Reg)
2068         .addGlobalAddress(GV, 0, AArch64II::MO_G0 | MO_NC)
2069         .addImm(0);
2070     BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg)
2071         .addReg(Reg, RegState::Kill)
2072         .addGlobalAddress(GV, 0, AArch64II::MO_G1 | MO_NC)
2073         .addImm(16);
2074     BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg)
2075         .addReg(Reg, RegState::Kill)
2076         .addGlobalAddress(GV, 0, AArch64II::MO_G2 | MO_NC)
2077         .addImm(32);
2078     BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg)
2079         .addReg(Reg, RegState::Kill)
2080         .addGlobalAddress(GV, 0, AArch64II::MO_G3)
2081         .addImm(48);
2082     BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
2083         .addReg(Reg, RegState::Kill)
2084         .addImm(0)
2085         .addMemOperand(*MI.memoperands_begin());
2086   } else if (TM.getCodeModel() == CodeModel::Tiny) {
2087     BuildMI(MBB, MI, DL, get(AArch64::ADR), Reg)
2088         .addGlobalAddress(GV, 0, OpFlags);
2089   } else {
2090     BuildMI(MBB, MI, DL, get(AArch64::ADRP), Reg)
2091         .addGlobalAddress(GV, 0, OpFlags | AArch64II::MO_PAGE);
2092     unsigned char LoFlags = OpFlags | AArch64II::MO_PAGEOFF | MO_NC;
2093     if (Subtarget.isTargetILP32()) {
2094       unsigned Reg32 = TRI->getSubReg(Reg, AArch64::sub_32);
2095       BuildMI(MBB, MI, DL, get(AArch64::LDRWui))
2096           .addDef(Reg32, RegState::Dead)
2097           .addUse(Reg, RegState::Kill)
2098           .addGlobalAddress(GV, 0, LoFlags)
2099           .addMemOperand(*MI.memoperands_begin())
2100           .addDef(Reg, RegState::Implicit);
2101     } else {
2102       BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
2103           .addReg(Reg, RegState::Kill)
2104           .addGlobalAddress(GV, 0, LoFlags)
2105           .addMemOperand(*MI.memoperands_begin());
2106     }
2107   }
2108 
2109   MBB.erase(MI);
2110 
2111   return true;
2112 }
2113 
2114 // Return true if this instruction simply sets its single destination register
2115 // to zero. This is equivalent to a register rename of the zero-register.
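// For example (illustrative encodings): "movz w0, #0", "and w0, wzr, #0xff",
// and a COPY from WZR all qualify.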
bool AArch64InstrInfo::isGPRZero(const MachineInstr &MI) {
2117   switch (MI.getOpcode()) {
2118   default:
2119     break;
2120   case AArch64::MOVZWi:
2121   case AArch64::MOVZXi: // movz Rd, #0 (LSL #0)
2122     if (MI.getOperand(1).isImm() && MI.getOperand(1).getImm() == 0) {
2123       assert(MI.getDesc().getNumOperands() == 3 &&
2124              MI.getOperand(2).getImm() == 0 && "invalid MOVZi operands");
2125       return true;
2126     }
2127     break;
2128   case AArch64::ANDWri: // and Rd, Rzr, #imm
2129     return MI.getOperand(1).getReg() == AArch64::WZR;
2130   case AArch64::ANDXri:
2131     return MI.getOperand(1).getReg() == AArch64::XZR;
2132   case TargetOpcode::COPY:
2133     return MI.getOperand(1).getReg() == AArch64::WZR;
2134   }
2135   return false;
2136 }
2137 
2138 // Return true if this instruction simply renames a general register without
2139 // modifying bits.
bool AArch64InstrInfo::isGPRCopy(const MachineInstr &MI) {
2141   switch (MI.getOpcode()) {
2142   default:
2143     break;
2144   case TargetOpcode::COPY: {
    // GPR32 copies will be lowered to ORRXrs
2146     Register DstReg = MI.getOperand(0).getReg();
2147     return (AArch64::GPR32RegClass.contains(DstReg) ||
2148             AArch64::GPR64RegClass.contains(DstReg));
2149   }
2150   case AArch64::ORRXrs: // orr Xd, Xzr, Xm (LSL #0)
2151     if (MI.getOperand(1).getReg() == AArch64::XZR) {
2152       assert(MI.getDesc().getNumOperands() == 4 &&
2153              MI.getOperand(3).getImm() == 0 && "invalid ORRrs operands");
2154       return true;
2155     }
2156     break;
2157   case AArch64::ADDXri: // add Xd, Xn, #0 (LSL #0)
2158     if (MI.getOperand(2).getImm() == 0) {
2159       assert(MI.getDesc().getNumOperands() == 4 &&
2160              MI.getOperand(3).getImm() == 0 && "invalid ADDXri operands");
2161       return true;
2162     }
2163     break;
2164   }
2165   return false;
2166 }
2167 
2168 // Return true if this instruction simply renames a general register without
2169 // modifying bits.
bool AArch64InstrInfo::isFPRCopy(const MachineInstr &MI) {
2171   switch (MI.getOpcode()) {
2172   default:
2173     break;
2174   case TargetOpcode::COPY: {
2175     Register DstReg = MI.getOperand(0).getReg();
2176     return AArch64::FPR128RegClass.contains(DstReg);
2177   }
2178   case AArch64::ORRv16i8:
2179     if (MI.getOperand(1).getReg() == MI.getOperand(2).getReg()) {
2180       assert(MI.getDesc().getNumOperands() == 3 && MI.getOperand(0).isReg() &&
2181              "invalid ORRv16i8 operands");
2182       return true;
2183     }
2184     break;
2185   }
2186   return false;
2187 }
2188 
unsigned AArch64InstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
2190                                                int &FrameIndex) const {
2191   switch (MI.getOpcode()) {
2192   default:
2193     break;
2194   case AArch64::LDRWui:
2195   case AArch64::LDRXui:
2196   case AArch64::LDRBui:
2197   case AArch64::LDRHui:
2198   case AArch64::LDRSui:
2199   case AArch64::LDRDui:
2200   case AArch64::LDRQui:
2201   case AArch64::LDR_PXI:
2202     if (MI.getOperand(0).getSubReg() == 0 && MI.getOperand(1).isFI() &&
2203         MI.getOperand(2).isImm() && MI.getOperand(2).getImm() == 0) {
2204       FrameIndex = MI.getOperand(1).getIndex();
2205       return MI.getOperand(0).getReg();
2206     }
2207     break;
2208   }
2209 
2210   return 0;
2211 }
2212 
unsigned AArch64InstrInfo::isStoreToStackSlot(const MachineInstr &MI,
2214                                               int &FrameIndex) const {
2215   switch (MI.getOpcode()) {
2216   default:
2217     break;
2218   case AArch64::STRWui:
2219   case AArch64::STRXui:
2220   case AArch64::STRBui:
2221   case AArch64::STRHui:
2222   case AArch64::STRSui:
2223   case AArch64::STRDui:
2224   case AArch64::STRQui:
2225   case AArch64::STR_PXI:
2226     if (MI.getOperand(0).getSubReg() == 0 && MI.getOperand(1).isFI() &&
2227         MI.getOperand(2).isImm() && MI.getOperand(2).getImm() == 0) {
2228       FrameIndex = MI.getOperand(1).getIndex();
2229       return MI.getOperand(0).getReg();
2230     }
2231     break;
2232   }
2233   return 0;
2234 }
2235 
2236 /// Check all MachineMemOperands for a hint to suppress pairing.
bool AArch64InstrInfo::isLdStPairSuppressed(const MachineInstr &MI) {
2238   return llvm::any_of(MI.memoperands(), [](MachineMemOperand *MMO) {
2239     return MMO->getFlags() & MOSuppressPair;
2240   });
2241 }
2242 
2243 /// Set a flag on the first MachineMemOperand to suppress pairing.
void AArch64InstrInfo::suppressLdStPair(MachineInstr &MI) {
2245   if (MI.memoperands_empty())
2246     return;
2247   (*MI.memoperands_begin())->setFlags(MOSuppressPair);
2248 }
2249 
2250 /// Check all MachineMemOperands for a hint that the load/store is strided.
bool AArch64InstrInfo::isStridedAccess(const MachineInstr &MI) {
2252   return llvm::any_of(MI.memoperands(), [](MachineMemOperand *MMO) {
2253     return MMO->getFlags() & MOStridedAccess;
2254   });
2255 }
2256 
bool AArch64InstrInfo::hasUnscaledLdStOffset(unsigned Opc) {
2258   switch (Opc) {
2259   default:
2260     return false;
2261   case AArch64::STURSi:
2262   case AArch64::STRSpre:
2263   case AArch64::STURDi:
2264   case AArch64::STRDpre:
2265   case AArch64::STURQi:
2266   case AArch64::STRQpre:
2267   case AArch64::STURBBi:
2268   case AArch64::STURHHi:
2269   case AArch64::STURWi:
2270   case AArch64::STRWpre:
2271   case AArch64::STURXi:
2272   case AArch64::STRXpre:
2273   case AArch64::LDURSi:
2274   case AArch64::LDRSpre:
2275   case AArch64::LDURDi:
2276   case AArch64::LDRDpre:
2277   case AArch64::LDURQi:
2278   case AArch64::LDRQpre:
2279   case AArch64::LDURWi:
2280   case AArch64::LDRWpre:
2281   case AArch64::LDURXi:
2282   case AArch64::LDRXpre:
2283   case AArch64::LDRSWpre:
2284   case AArch64::LDURSWi:
2285   case AArch64::LDURHHi:
2286   case AArch64::LDURBBi:
2287   case AArch64::LDURSBWi:
2288   case AArch64::LDURSHWi:
2289     return true;
2290   }
2291 }
2292 
std::optional<unsigned> AArch64InstrInfo::getUnscaledLdSt(unsigned Opc) {
2294   switch (Opc) {
2295   default: return {};
2296   case AArch64::PRFMui: return AArch64::PRFUMi;
2297   case AArch64::LDRXui: return AArch64::LDURXi;
2298   case AArch64::LDRWui: return AArch64::LDURWi;
2299   case AArch64::LDRBui: return AArch64::LDURBi;
2300   case AArch64::LDRHui: return AArch64::LDURHi;
2301   case AArch64::LDRSui: return AArch64::LDURSi;
2302   case AArch64::LDRDui: return AArch64::LDURDi;
2303   case AArch64::LDRQui: return AArch64::LDURQi;
2304   case AArch64::LDRBBui: return AArch64::LDURBBi;
2305   case AArch64::LDRHHui: return AArch64::LDURHHi;
2306   case AArch64::LDRSBXui: return AArch64::LDURSBXi;
2307   case AArch64::LDRSBWui: return AArch64::LDURSBWi;
2308   case AArch64::LDRSHXui: return AArch64::LDURSHXi;
2309   case AArch64::LDRSHWui: return AArch64::LDURSHWi;
2310   case AArch64::LDRSWui: return AArch64::LDURSWi;
2311   case AArch64::STRXui: return AArch64::STURXi;
2312   case AArch64::STRWui: return AArch64::STURWi;
2313   case AArch64::STRBui: return AArch64::STURBi;
2314   case AArch64::STRHui: return AArch64::STURHi;
2315   case AArch64::STRSui: return AArch64::STURSi;
2316   case AArch64::STRDui: return AArch64::STURDi;
2317   case AArch64::STRQui: return AArch64::STURQi;
2318   case AArch64::STRBBui: return AArch64::STURBBi;
2319   case AArch64::STRHHui: return AArch64::STURHHi;
2320   }
2321 }
2322 
unsigned AArch64InstrInfo::getLoadStoreImmIdx(unsigned Opc) {
2324   switch (Opc) {
2325   default:
2326     return 2;
2327   case AArch64::LDPXi:
2328   case AArch64::LDPDi:
2329   case AArch64::STPXi:
2330   case AArch64::STPDi:
2331   case AArch64::LDNPXi:
2332   case AArch64::LDNPDi:
2333   case AArch64::STNPXi:
2334   case AArch64::STNPDi:
2335   case AArch64::LDPQi:
2336   case AArch64::STPQi:
2337   case AArch64::LDNPQi:
2338   case AArch64::STNPQi:
2339   case AArch64::LDPWi:
2340   case AArch64::LDPSi:
2341   case AArch64::STPWi:
2342   case AArch64::STPSi:
2343   case AArch64::LDNPWi:
2344   case AArch64::LDNPSi:
2345   case AArch64::STNPWi:
2346   case AArch64::STNPSi:
2347   case AArch64::LDG:
2348   case AArch64::STGPi:
2349 
2350   case AArch64::LD1B_IMM:
2351   case AArch64::LD1B_H_IMM:
2352   case AArch64::LD1B_S_IMM:
2353   case AArch64::LD1B_D_IMM:
2354   case AArch64::LD1SB_H_IMM:
2355   case AArch64::LD1SB_S_IMM:
2356   case AArch64::LD1SB_D_IMM:
2357   case AArch64::LD1H_IMM:
2358   case AArch64::LD1H_S_IMM:
2359   case AArch64::LD1H_D_IMM:
2360   case AArch64::LD1SH_S_IMM:
2361   case AArch64::LD1SH_D_IMM:
2362   case AArch64::LD1W_IMM:
2363   case AArch64::LD1W_D_IMM:
2364   case AArch64::LD1SW_D_IMM:
2365   case AArch64::LD1D_IMM:
2366 
2367   case AArch64::LD2B_IMM:
2368   case AArch64::LD2H_IMM:
2369   case AArch64::LD2W_IMM:
2370   case AArch64::LD2D_IMM:
2371   case AArch64::LD3B_IMM:
2372   case AArch64::LD3H_IMM:
2373   case AArch64::LD3W_IMM:
2374   case AArch64::LD3D_IMM:
2375   case AArch64::LD4B_IMM:
2376   case AArch64::LD4H_IMM:
2377   case AArch64::LD4W_IMM:
2378   case AArch64::LD4D_IMM:
2379 
2380   case AArch64::ST1B_IMM:
2381   case AArch64::ST1B_H_IMM:
2382   case AArch64::ST1B_S_IMM:
2383   case AArch64::ST1B_D_IMM:
2384   case AArch64::ST1H_IMM:
2385   case AArch64::ST1H_S_IMM:
2386   case AArch64::ST1H_D_IMM:
2387   case AArch64::ST1W_IMM:
2388   case AArch64::ST1W_D_IMM:
2389   case AArch64::ST1D_IMM:
2390 
2391   case AArch64::ST2B_IMM:
2392   case AArch64::ST2H_IMM:
2393   case AArch64::ST2W_IMM:
2394   case AArch64::ST2D_IMM:
2395   case AArch64::ST3B_IMM:
2396   case AArch64::ST3H_IMM:
2397   case AArch64::ST3W_IMM:
2398   case AArch64::ST3D_IMM:
2399   case AArch64::ST4B_IMM:
2400   case AArch64::ST4H_IMM:
2401   case AArch64::ST4W_IMM:
2402   case AArch64::ST4D_IMM:
2403 
2404   case AArch64::LD1RB_IMM:
2405   case AArch64::LD1RB_H_IMM:
2406   case AArch64::LD1RB_S_IMM:
2407   case AArch64::LD1RB_D_IMM:
2408   case AArch64::LD1RSB_H_IMM:
2409   case AArch64::LD1RSB_S_IMM:
2410   case AArch64::LD1RSB_D_IMM:
2411   case AArch64::LD1RH_IMM:
2412   case AArch64::LD1RH_S_IMM:
2413   case AArch64::LD1RH_D_IMM:
2414   case AArch64::LD1RSH_S_IMM:
2415   case AArch64::LD1RSH_D_IMM:
2416   case AArch64::LD1RW_IMM:
2417   case AArch64::LD1RW_D_IMM:
2418   case AArch64::LD1RSW_IMM:
2419   case AArch64::LD1RD_IMM:
2420 
2421   case AArch64::LDNT1B_ZRI:
2422   case AArch64::LDNT1H_ZRI:
2423   case AArch64::LDNT1W_ZRI:
2424   case AArch64::LDNT1D_ZRI:
2425   case AArch64::STNT1B_ZRI:
2426   case AArch64::STNT1H_ZRI:
2427   case AArch64::STNT1W_ZRI:
2428   case AArch64::STNT1D_ZRI:
2429 
2430   case AArch64::LDNF1B_IMM:
2431   case AArch64::LDNF1B_H_IMM:
2432   case AArch64::LDNF1B_S_IMM:
2433   case AArch64::LDNF1B_D_IMM:
2434   case AArch64::LDNF1SB_H_IMM:
2435   case AArch64::LDNF1SB_S_IMM:
2436   case AArch64::LDNF1SB_D_IMM:
2437   case AArch64::LDNF1H_IMM:
2438   case AArch64::LDNF1H_S_IMM:
2439   case AArch64::LDNF1H_D_IMM:
2440   case AArch64::LDNF1SH_S_IMM:
2441   case AArch64::LDNF1SH_D_IMM:
2442   case AArch64::LDNF1W_IMM:
2443   case AArch64::LDNF1W_D_IMM:
2444   case AArch64::LDNF1SW_D_IMM:
2445   case AArch64::LDNF1D_IMM:
2446     return 3;
2447   case AArch64::ADDG:
2448   case AArch64::STGi:
2449   case AArch64::LDR_PXI:
2450   case AArch64::STR_PXI:
2451     return 2;
2452   }
2453 }
2454 
bool AArch64InstrInfo::isPairableLdStInst(const MachineInstr &MI) {
2456   switch (MI.getOpcode()) {
2457   default:
2458     return false;
2459   // Scaled instructions.
2460   case AArch64::STRSui:
2461   case AArch64::STRDui:
2462   case AArch64::STRQui:
2463   case AArch64::STRXui:
2464   case AArch64::STRWui:
2465   case AArch64::LDRSui:
2466   case AArch64::LDRDui:
2467   case AArch64::LDRQui:
2468   case AArch64::LDRXui:
2469   case AArch64::LDRWui:
2470   case AArch64::LDRSWui:
2471   // Unscaled instructions.
2472   case AArch64::STURSi:
2473   case AArch64::STRSpre:
2474   case AArch64::STURDi:
2475   case AArch64::STRDpre:
2476   case AArch64::STURQi:
2477   case AArch64::STRQpre:
2478   case AArch64::STURWi:
2479   case AArch64::STRWpre:
2480   case AArch64::STURXi:
2481   case AArch64::STRXpre:
2482   case AArch64::LDURSi:
2483   case AArch64::LDRSpre:
2484   case AArch64::LDURDi:
2485   case AArch64::LDRDpre:
2486   case AArch64::LDURQi:
2487   case AArch64::LDRQpre:
2488   case AArch64::LDURWi:
2489   case AArch64::LDRWpre:
2490   case AArch64::LDURXi:
2491   case AArch64::LDRXpre:
2492   case AArch64::LDURSWi:
2493   case AArch64::LDRSWpre:
2494     return true;
2495   }
2496 }
2497 
bool AArch64InstrInfo::isTailCallReturnInst(const MachineInstr &MI) {
2499   switch (MI.getOpcode()) {
2500   default:
2501     assert((!MI.isCall() || !MI.isReturn()) &&
2502            "Unexpected instruction - was a new tail call opcode introduced?");
2503     return false;
2504   case AArch64::TCRETURNdi:
2505   case AArch64::TCRETURNri:
2506   case AArch64::TCRETURNriBTI:
2507   case AArch64::TCRETURNriALL:
2508     return true;
2509   }
2510 }
2511 
unsigned AArch64InstrInfo::convertToFlagSettingOpc(unsigned Opc) {
2513   switch (Opc) {
2514   default:
2515     llvm_unreachable("Opcode has no flag setting equivalent!");
2516   // 32-bit cases:
2517   case AArch64::ADDWri:
2518     return AArch64::ADDSWri;
2519   case AArch64::ADDWrr:
2520     return AArch64::ADDSWrr;
2521   case AArch64::ADDWrs:
2522     return AArch64::ADDSWrs;
2523   case AArch64::ADDWrx:
2524     return AArch64::ADDSWrx;
2525   case AArch64::ANDWri:
2526     return AArch64::ANDSWri;
2527   case AArch64::ANDWrr:
2528     return AArch64::ANDSWrr;
2529   case AArch64::ANDWrs:
2530     return AArch64::ANDSWrs;
2531   case AArch64::BICWrr:
2532     return AArch64::BICSWrr;
2533   case AArch64::BICWrs:
2534     return AArch64::BICSWrs;
2535   case AArch64::SUBWri:
2536     return AArch64::SUBSWri;
2537   case AArch64::SUBWrr:
2538     return AArch64::SUBSWrr;
2539   case AArch64::SUBWrs:
2540     return AArch64::SUBSWrs;
2541   case AArch64::SUBWrx:
2542     return AArch64::SUBSWrx;
2543   // 64-bit cases:
2544   case AArch64::ADDXri:
2545     return AArch64::ADDSXri;
2546   case AArch64::ADDXrr:
2547     return AArch64::ADDSXrr;
2548   case AArch64::ADDXrs:
2549     return AArch64::ADDSXrs;
2550   case AArch64::ADDXrx:
2551     return AArch64::ADDSXrx;
2552   case AArch64::ANDXri:
2553     return AArch64::ANDSXri;
2554   case AArch64::ANDXrr:
2555     return AArch64::ANDSXrr;
2556   case AArch64::ANDXrs:
2557     return AArch64::ANDSXrs;
2558   case AArch64::BICXrr:
2559     return AArch64::BICSXrr;
2560   case AArch64::BICXrs:
2561     return AArch64::BICSXrs;
2562   case AArch64::SUBXri:
2563     return AArch64::SUBSXri;
2564   case AArch64::SUBXrr:
2565     return AArch64::SUBSXrr;
2566   case AArch64::SUBXrs:
2567     return AArch64::SUBSXrs;
2568   case AArch64::SUBXrx:
2569     return AArch64::SUBSXrx;
2570   // SVE instructions:
2571   case AArch64::AND_PPzPP:
2572     return AArch64::ANDS_PPzPP;
2573   case AArch64::BIC_PPzPP:
2574     return AArch64::BICS_PPzPP;
2575   case AArch64::EOR_PPzPP:
2576     return AArch64::EORS_PPzPP;
2577   case AArch64::NAND_PPzPP:
2578     return AArch64::NANDS_PPzPP;
2579   case AArch64::NOR_PPzPP:
2580     return AArch64::NORS_PPzPP;
2581   case AArch64::ORN_PPzPP:
2582     return AArch64::ORNS_PPzPP;
2583   case AArch64::ORR_PPzPP:
2584     return AArch64::ORRS_PPzPP;
2585   case AArch64::BRKA_PPzP:
2586     return AArch64::BRKAS_PPzP;
2587   case AArch64::BRKPA_PPzPP:
2588     return AArch64::BRKPAS_PPzPP;
2589   case AArch64::BRKB_PPzP:
2590     return AArch64::BRKBS_PPzP;
2591   case AArch64::BRKPB_PPzPP:
2592     return AArch64::BRKPBS_PPzPP;
2593   case AArch64::BRKN_PPzP:
2594     return AArch64::BRKNS_PPzP;
2595   case AArch64::RDFFR_PPz:
2596     return AArch64::RDFFRS_PPz;
2597   case AArch64::PTRUE_B:
2598     return AArch64::PTRUES_B;
2599   }
2600 }
2601 
2602 // Is this a candidate for ld/st merging or pairing?  For example, we don't
2603 // touch volatiles or load/stores that have a hint to avoid pair formation.
bool AArch64InstrInfo::isCandidateToMergeOrPair(const MachineInstr &MI) const {
2605 
2606   bool IsPreLdSt = isPreLdSt(MI);
2607 
2608   // If this is a volatile load/store, don't mess with it.
2609   if (MI.hasOrderedMemoryRef())
2610     return false;
2611 
2612   // Make sure this is a reg/fi+imm (as opposed to an address reloc).
2613   // For Pre-inc LD/ST, the operand is shifted by one.
2614   assert((MI.getOperand(IsPreLdSt ? 2 : 1).isReg() ||
2615           MI.getOperand(IsPreLdSt ? 2 : 1).isFI()) &&
2616          "Expected a reg or frame index operand.");
2617 
2618   // For Pre-indexed addressing quadword instructions, the third operand is the
2619   // immediate value.
2620   bool IsImmPreLdSt = IsPreLdSt && MI.getOperand(3).isImm();
2621 
2622   if (!MI.getOperand(2).isImm() && !IsImmPreLdSt)
2623     return false;
2624 
2625   // Can't merge/pair if the instruction modifies the base register.
2626   // e.g., ldr x0, [x0]
2627   // This case will never occur with an FI base.
2628   // However, if the instruction is an LDR<S,D,Q,W,X,SW>pre or
2629   // STR<S,D,Q,W,X>pre, it can be merged.
2630   // For example:
2631   //   ldr q0, [x11, #32]!
2632   //   ldr q1, [x11, #16]
2633   //   to
2634   //   ldp q0, q1, [x11, #32]!
2635   if (MI.getOperand(1).isReg() && !IsPreLdSt) {
2636     Register BaseReg = MI.getOperand(1).getReg();
2637     const TargetRegisterInfo *TRI = &getRegisterInfo();
2638     if (MI.modifiesRegister(BaseReg, TRI))
2639       return false;
2640   }
2641 
2642   // Check if this load/store has a hint to avoid pair formation.
2643   // MachineMemOperands hints are set by the AArch64StorePairSuppress pass.
2644   if (isLdStPairSuppressed(MI))
2645     return false;
2646 
2647   // Do not pair any callee-save store/reload instructions in the
2648   // prologue/epilogue if the CFI information encoded the operations as separate
2649   // instructions, as that will cause the size of the actual prologue to mismatch
2650   // with the prologue size recorded in the Windows CFI.
2651   const MCAsmInfo *MAI = MI.getMF()->getTarget().getMCAsmInfo();
2652   bool NeedsWinCFI = MAI->usesWindowsCFI() &&
2653                      MI.getMF()->getFunction().needsUnwindTableEntry();
2654   if (NeedsWinCFI && (MI.getFlag(MachineInstr::FrameSetup) ||
2655                       MI.getFlag(MachineInstr::FrameDestroy)))
2656     return false;
2657 
2658   // On some CPUs quad load/store pairs are slower than two single load/stores.
2659   if (Subtarget.isPaired128Slow()) {
2660     switch (MI.getOpcode()) {
2661     default:
2662       break;
2663     case AArch64::LDURQi:
2664     case AArch64::STURQi:
2665     case AArch64::LDRQui:
2666     case AArch64::STRQui:
2667       return false;
2668     }
2669   }
2670 
2671   return true;
2672 }
2673 
bool AArch64InstrInfo::getMemOperandsWithOffsetWidth(
2675     const MachineInstr &LdSt, SmallVectorImpl<const MachineOperand *> &BaseOps,
2676     int64_t &Offset, bool &OffsetIsScalable, unsigned &Width,
2677     const TargetRegisterInfo *TRI) const {
2678   if (!LdSt.mayLoadOrStore())
2679     return false;
2680 
2681   const MachineOperand *BaseOp;
2682   TypeSize WidthN(0, false);
2683   if (!getMemOperandWithOffsetWidth(LdSt, BaseOp, Offset, OffsetIsScalable,
2684                                     WidthN, TRI))
2685     return false;
  // The maximum vscale is 16 under AArch64, so return the maximal extent for
  // the vector.
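  // (For example, a scalable access with a known-min width of 16 bytes is
  // reported as 16 * 2048 / 128 = 256 bytes.)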
2688   Width = WidthN.isScalable()
2689               ? WidthN.getKnownMinValue() * AArch64::SVEMaxBitsPerVector /
2690                     AArch64::SVEBitsPerBlock
2691               : WidthN.getKnownMinValue();
2692   BaseOps.push_back(BaseOp);
2693   return true;
2694 }
2695 
2696 std::optional<ExtAddrMode>
AArch64InstrInfo::getAddrModeFromMemoryOp(const MachineInstr &MemI,
2698                                           const TargetRegisterInfo *TRI) const {
2699   const MachineOperand *Base; // Filled with the base operand of MI.
2700   int64_t Offset;             // Filled with the offset of MI.
2701   bool OffsetIsScalable;
2702   if (!getMemOperandWithOffset(MemI, Base, Offset, OffsetIsScalable, TRI))
2703     return std::nullopt;
2704 
2705   if (!Base->isReg())
2706     return std::nullopt;
2707   ExtAddrMode AM;
2708   AM.BaseReg = Base->getReg();
2709   AM.Displacement = Offset;
2710   AM.ScaledReg = 0;
2711   AM.Scale = 0;
2712   return AM;
2713 }
2714 
bool AArch64InstrInfo::canFoldIntoAddrMode(const MachineInstr &MemI,
2716                                            Register Reg,
2717                                            const MachineInstr &AddrI,
2718                                            ExtAddrMode &AM) const {
2719   // Filter out instructions into which we cannot fold.
2720   unsigned NumBytes;
2721   int64_t OffsetScale = 1;
2722   switch (MemI.getOpcode()) {
2723   default:
2724     return false;
2725 
2726   case AArch64::LDURQi:
2727   case AArch64::STURQi:
2728     NumBytes = 16;
2729     break;
2730 
2731   case AArch64::LDURDi:
2732   case AArch64::STURDi:
2733   case AArch64::LDURXi:
2734   case AArch64::STURXi:
2735     NumBytes = 8;
2736     break;
2737 
2738   case AArch64::LDURWi:
2739   case AArch64::LDURSWi:
2740   case AArch64::STURWi:
2741     NumBytes = 4;
2742     break;
2743 
2744   case AArch64::LDURHi:
2745   case AArch64::STURHi:
2746   case AArch64::LDURHHi:
2747   case AArch64::STURHHi:
2748   case AArch64::LDURSHXi:
2749   case AArch64::LDURSHWi:
2750     NumBytes = 2;
2751     break;
2752 
2753   case AArch64::LDRBroX:
2754   case AArch64::LDRBBroX:
2755   case AArch64::LDRSBXroX:
2756   case AArch64::LDRSBWroX:
2757   case AArch64::STRBroX:
2758   case AArch64::STRBBroX:
2759   case AArch64::LDURBi:
2760   case AArch64::LDURBBi:
2761   case AArch64::LDURSBXi:
2762   case AArch64::LDURSBWi:
2763   case AArch64::STURBi:
2764   case AArch64::STURBBi:
2765   case AArch64::LDRBui:
2766   case AArch64::LDRBBui:
2767   case AArch64::LDRSBXui:
2768   case AArch64::LDRSBWui:
2769   case AArch64::STRBui:
2770   case AArch64::STRBBui:
2771     NumBytes = 1;
2772     break;
2773 
2774   case AArch64::LDRQroX:
2775   case AArch64::STRQroX:
2776   case AArch64::LDRQui:
2777   case AArch64::STRQui:
2778     NumBytes = 16;
2779     OffsetScale = 16;
2780     break;
2781 
2782   case AArch64::LDRDroX:
2783   case AArch64::STRDroX:
2784   case AArch64::LDRXroX:
2785   case AArch64::STRXroX:
2786   case AArch64::LDRDui:
2787   case AArch64::STRDui:
2788   case AArch64::LDRXui:
2789   case AArch64::STRXui:
2790     NumBytes = 8;
2791     OffsetScale = 8;
2792     break;
2793 
2794   case AArch64::LDRWroX:
2795   case AArch64::LDRSWroX:
2796   case AArch64::STRWroX:
2797   case AArch64::LDRWui:
2798   case AArch64::LDRSWui:
2799   case AArch64::STRWui:
2800     NumBytes = 4;
2801     OffsetScale = 4;
2802     break;
2803 
2804   case AArch64::LDRHroX:
2805   case AArch64::STRHroX:
2806   case AArch64::LDRHHroX:
2807   case AArch64::STRHHroX:
2808   case AArch64::LDRSHXroX:
2809   case AArch64::LDRSHWroX:
2810   case AArch64::LDRHui:
2811   case AArch64::STRHui:
2812   case AArch64::LDRHHui:
2813   case AArch64::STRHHui:
2814   case AArch64::LDRSHXui:
2815   case AArch64::LDRSHWui:
2816     NumBytes = 2;
2817     OffsetScale = 2;
2818     break;
2819   }
2820 
2821   // Check the fold operand is not the loaded/stored value.
2822   const MachineOperand &BaseRegOp = MemI.getOperand(0);
2823   if (BaseRegOp.isReg() && BaseRegOp.getReg() == Reg)
2824     return false;
2825 
2826   // Handle memory instructions with a [Reg, Reg] addressing mode.
2827   if (MemI.getOperand(2).isReg()) {
2828     // Bail if the addressing mode already includes extension of the offset
2829     // register.
2830     if (MemI.getOperand(3).getImm())
2831       return false;
2832 
2833     // Check if we actually have a scaled offset.
2834     if (MemI.getOperand(4).getImm() == 0)
2835       OffsetScale = 1;
2836 
    // If the address instruction is folded into the base register, then the
    // addressing mode must not have a scale. Then we can swap the base and the
    // scaled registers.
2840     if (MemI.getOperand(1).getReg() == Reg && OffsetScale != 1)
2841       return false;
2842 
2843     switch (AddrI.getOpcode()) {
2844     default:
2845       return false;
2846 
2847     case AArch64::SBFMXri:
2848       // sxtw Xa, Wm
2849       // ldr Xd, [Xn, Xa, lsl #N]
2850       // ->
2851       // ldr Xd, [Xn, Wm, sxtw #N]
2852       if (AddrI.getOperand(2).getImm() != 0 ||
2853           AddrI.getOperand(3).getImm() != 31)
2854         return false;
2855 
2856       AM.BaseReg = MemI.getOperand(1).getReg();
2857       if (AM.BaseReg == Reg)
2858         AM.BaseReg = MemI.getOperand(2).getReg();
2859       AM.ScaledReg = AddrI.getOperand(1).getReg();
2860       AM.Scale = OffsetScale;
2861       AM.Displacement = 0;
2862       AM.Form = ExtAddrMode::Formula::SExtScaledReg;
2863       return true;
2864 
2865     case TargetOpcode::SUBREG_TO_REG: {
2866       // mov Wa, Wm
2867       // ldr Xd, [Xn, Xa, lsl #N]
2868       // ->
2869       // ldr Xd, [Xn, Wm, uxtw #N]
2870 
2871       // Zero-extension looks like an ORRWrs followed by a SUBREG_TO_REG.
2872       if (AddrI.getOperand(1).getImm() != 0 ||
2873           AddrI.getOperand(3).getImm() != AArch64::sub_32)
2874         return false;
2875 
2876       const MachineRegisterInfo &MRI = AddrI.getMF()->getRegInfo();
2877       Register OffsetReg = AddrI.getOperand(2).getReg();
2878       if (!OffsetReg.isVirtual() || !MRI.hasOneNonDBGUse(OffsetReg))
2879         return false;
2880 
2881       const MachineInstr &DefMI = *MRI.getVRegDef(OffsetReg);
2882       if (DefMI.getOpcode() != AArch64::ORRWrs ||
2883           DefMI.getOperand(1).getReg() != AArch64::WZR ||
2884           DefMI.getOperand(3).getImm() != 0)
2885         return false;
2886 
2887       AM.BaseReg = MemI.getOperand(1).getReg();
2888       if (AM.BaseReg == Reg)
2889         AM.BaseReg = MemI.getOperand(2).getReg();
2890       AM.ScaledReg = DefMI.getOperand(2).getReg();
2891       AM.Scale = OffsetScale;
2892       AM.Displacement = 0;
2893       AM.Form = ExtAddrMode::Formula::ZExtScaledReg;
2894       return true;
2895     }
2896     }
2897   }
2898 
2899   // Handle memory instructions with a [Reg, #Imm] addressing mode.
2900 
2901   // Check we are not breaking a potential conversion to an LDP.
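  // The ranges below mirror LDP's signed 7-bit scaled immediate, e.g. for
  // 8-byte accesses: -64 * 8 = -512 up to 63 * 8 = 504.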
2902   auto validateOffsetForLDP = [](unsigned NumBytes, int64_t OldOffset,
2903                                  int64_t NewOffset) -> bool {
2904     int64_t MinOffset, MaxOffset;
2905     switch (NumBytes) {
2906     default:
2907       return true;
2908     case 4:
2909       MinOffset = -256;
2910       MaxOffset = 252;
2911       break;
2912     case 8:
2913       MinOffset = -512;
2914       MaxOffset = 504;
2915       break;
2916     case 16:
2917       MinOffset = -1024;
2918       MaxOffset = 1008;
2919       break;
2920     }
2921     return OldOffset < MinOffset || OldOffset > MaxOffset ||
2922            (NewOffset >= MinOffset && NewOffset <= MaxOffset);
2923   };
2924   auto canFoldAddSubImmIntoAddrMode = [&](int64_t Disp) -> bool {
2925     int64_t OldOffset = MemI.getOperand(2).getImm() * OffsetScale;
2926     int64_t NewOffset = OldOffset + Disp;
2927     if (!isLegalAddressingMode(NumBytes, NewOffset, /* Scale */ 0))
2928       return false;
2929     // If the old offset would fit into an LDP, but the new offset wouldn't,
2930     // bail out.
2931     if (!validateOffsetForLDP(NumBytes, OldOffset, NewOffset))
2932       return false;
2933     AM.BaseReg = AddrI.getOperand(1).getReg();
2934     AM.ScaledReg = 0;
2935     AM.Scale = 0;
2936     AM.Displacement = NewOffset;
2937     AM.Form = ExtAddrMode::Formula::Basic;
2938     return true;
2939   };
2940 
2941   auto canFoldAddRegIntoAddrMode =
2942       [&](int64_t Scale,
2943           ExtAddrMode::Formula Form = ExtAddrMode::Formula::Basic) -> bool {
2944     if (MemI.getOperand(2).getImm() != 0)
2945       return false;
2946     if (!isLegalAddressingMode(NumBytes, /* Offset */ 0, Scale))
2947       return false;
2948     AM.BaseReg = AddrI.getOperand(1).getReg();
2949     AM.ScaledReg = AddrI.getOperand(2).getReg();
2950     AM.Scale = Scale;
2951     AM.Displacement = 0;
2952     AM.Form = Form;
2953     return true;
2954   };
2955 
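  // On subtargets where a Q-register store with a register-offset addressing
  // mode is slow, avoid introducing one by folding an address computation.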
2956   auto avoidSlowSTRQ = [&](const MachineInstr &MemI) {
2957     unsigned Opcode = MemI.getOpcode();
2958     return (Opcode == AArch64::STURQi || Opcode == AArch64::STRQui) &&
2959            Subtarget.isSTRQroSlow();
2960   };
2961 
2962   int64_t Disp = 0;
2963   const bool OptSize = MemI.getMF()->getFunction().hasOptSize();
2964   switch (AddrI.getOpcode()) {
2965   default:
2966     return false;
2967 
2968   case AArch64::ADDXri:
2969     // add Xa, Xn, #N
2970     // ldr Xd, [Xa, #M]
2971     // ->
2972     // ldr Xd, [Xn, #N'+M]
2973     Disp = AddrI.getOperand(2).getImm() << AddrI.getOperand(3).getImm();
2974     return canFoldAddSubImmIntoAddrMode(Disp);
2975 
2976   case AArch64::SUBXri:
2977     // sub Xa, Xn, #N
2978     // ldr Xd, [Xa, #M]
2979     // ->
2980     // ldr Xd, [Xn, #N'+M]
2981     Disp = AddrI.getOperand(2).getImm() << AddrI.getOperand(3).getImm();
2982     return canFoldAddSubImmIntoAddrMode(-Disp);
2983 
2984   case AArch64::ADDXrs: {
2985     // add Xa, Xn, Xm, lsl #N
2986     // ldr Xd, [Xa]
2987     // ->
2988     // ldr Xd, [Xn, Xm, lsl #N]
2989 
2990     // Don't fold the add if the result would be slower, unless optimising for
2991     // size.
2992     unsigned Shift = static_cast<unsigned>(AddrI.getOperand(3).getImm());
2993     if (AArch64_AM::getShiftType(Shift) != AArch64_AM::ShiftExtendType::LSL)
2994       return false;
2995     Shift = AArch64_AM::getShiftValue(Shift);
2996     if (!OptSize) {
2997       if ((Shift != 2 && Shift != 3) || !Subtarget.hasAddrLSLFast())
2998         return false;
2999       if (avoidSlowSTRQ(MemI))
3000         return false;
3001     }
3002     return canFoldAddRegIntoAddrMode(1ULL << Shift);
3003   }
3004 
3005   case AArch64::ADDXrr:
3006     // add Xa, Xn, Xm
3007     // ldr Xd, [Xa]
3008     // ->
3009     // ldr Xd, [Xn, Xm, lsl #0]
3010 
3011     // Don't fold the add if the result would be slower, unless optimising for
3012     // size.
3013     if (!OptSize && avoidSlowSTRQ(MemI))
3014       return false;
3015     return canFoldAddRegIntoAddrMode(1);
3016 
3017   case AArch64::ADDXrx:
3018     // add Xa, Xn, Wm, {s,u}xtw #N
3019     // ldr Xd, [Xa]
3020     // ->
3021     // ldr Xd, [Xn, Wm, {s,u}xtw #N]
3022 
3023     // Don't fold the add if the result would be slower, unless optimising for
3024     // size.
3025     if (!OptSize && avoidSlowSTRQ(MemI))
3026       return false;
3027 
3028     // Can fold only sign-/zero-extend of a word.
3029     unsigned Imm = static_cast<unsigned>(AddrI.getOperand(3).getImm());
3030     AArch64_AM::ShiftExtendType Extend = AArch64_AM::getArithExtendType(Imm);
3031     if (Extend != AArch64_AM::UXTW && Extend != AArch64_AM::SXTW)
3032       return false;
3033 
3034     return canFoldAddRegIntoAddrMode(
3035         1ULL << AArch64_AM::getArithShiftValue(Imm),
3036         (Extend == AArch64_AM::SXTW) ? ExtAddrMode::Formula::SExtScaledReg
3037                                      : ExtAddrMode::Formula::ZExtScaledReg);
3038   }
3039 }
3040 
3041 // Given an opcode for an instruction with a [Reg, #Imm] addressing mode,
3042 // return the opcode of an instruction performing the same operation, but using
3043 // the [Reg, Reg] addressing mode.
3044 static unsigned regOffsetOpcode(unsigned Opcode) {
3045   switch (Opcode) {
3046   default:
3047     llvm_unreachable("Address folding not implemented for instruction");
3048 
3049   case AArch64::LDURQi:
3050   case AArch64::LDRQui:
3051     return AArch64::LDRQroX;
3052   case AArch64::STURQi:
3053   case AArch64::STRQui:
3054     return AArch64::STRQroX;
3055   case AArch64::LDURDi:
3056   case AArch64::LDRDui:
3057     return AArch64::LDRDroX;
3058   case AArch64::STURDi:
3059   case AArch64::STRDui:
3060     return AArch64::STRDroX;
3061   case AArch64::LDURXi:
3062   case AArch64::LDRXui:
3063     return AArch64::LDRXroX;
3064   case AArch64::STURXi:
3065   case AArch64::STRXui:
3066     return AArch64::STRXroX;
3067   case AArch64::LDURWi:
3068   case AArch64::LDRWui:
3069     return AArch64::LDRWroX;
3070   case AArch64::LDURSWi:
3071   case AArch64::LDRSWui:
3072     return AArch64::LDRSWroX;
3073   case AArch64::STURWi:
3074   case AArch64::STRWui:
3075     return AArch64::STRWroX;
3076   case AArch64::LDURHi:
3077   case AArch64::LDRHui:
3078     return AArch64::LDRHroX;
3079   case AArch64::STURHi:
3080   case AArch64::STRHui:
3081     return AArch64::STRHroX;
3082   case AArch64::LDURHHi:
3083   case AArch64::LDRHHui:
3084     return AArch64::LDRHHroX;
3085   case AArch64::STURHHi:
3086   case AArch64::STRHHui:
3087     return AArch64::STRHHroX;
3088   case AArch64::LDURSHXi:
3089   case AArch64::LDRSHXui:
3090     return AArch64::LDRSHXroX;
3091   case AArch64::LDURSHWi:
3092   case AArch64::LDRSHWui:
3093     return AArch64::LDRSHWroX;
3094   case AArch64::LDURBi:
3095   case AArch64::LDRBui:
3096     return AArch64::LDRBroX;
3097   case AArch64::LDURBBi:
3098   case AArch64::LDRBBui:
3099     return AArch64::LDRBBroX;
3100   case AArch64::LDURSBXi:
3101   case AArch64::LDRSBXui:
3102     return AArch64::LDRSBXroX;
3103   case AArch64::LDURSBWi:
3104   case AArch64::LDRSBWui:
3105     return AArch64::LDRSBWroX;
3106   case AArch64::STURBi:
3107   case AArch64::STRBui:
3108     return AArch64::STRBroX;
3109   case AArch64::STURBBi:
3110   case AArch64::STRBBui:
3111     return AArch64::STRBBroX;
3112   }
3113 }
3114 
3115 // Given an opcode for an instruction with a [Reg, #Imm] addressing mode, return
3116 // the opcode of an instruction performing the same operation, but using the
3117 // [Reg, #Imm] addressing mode with scaled offset.
3118 static unsigned scaledOffsetOpcode(unsigned Opcode, unsigned &Scale) {
3119   switch (Opcode) {
3120   default:
3121     llvm_unreachable("Address folding not implemented for instruction");
3122 
3123   case AArch64::LDURQi:
3124     Scale = 16;
3125     return AArch64::LDRQui;
3126   case AArch64::STURQi:
3127     Scale = 16;
3128     return AArch64::STRQui;
3129   case AArch64::LDURDi:
3130     Scale = 8;
3131     return AArch64::LDRDui;
3132   case AArch64::STURDi:
3133     Scale = 8;
3134     return AArch64::STRDui;
3135   case AArch64::LDURXi:
3136     Scale = 8;
3137     return AArch64::LDRXui;
3138   case AArch64::STURXi:
3139     Scale = 8;
3140     return AArch64::STRXui;
3141   case AArch64::LDURWi:
3142     Scale = 4;
3143     return AArch64::LDRWui;
3144   case AArch64::LDURSWi:
3145     Scale = 4;
3146     return AArch64::LDRSWui;
3147   case AArch64::STURWi:
3148     Scale = 4;
3149     return AArch64::STRWui;
3150   case AArch64::LDURHi:
3151     Scale = 2;
3152     return AArch64::LDRHui;
3153   case AArch64::STURHi:
3154     Scale = 2;
3155     return AArch64::STRHui;
3156   case AArch64::LDURHHi:
3157     Scale = 2;
3158     return AArch64::LDRHHui;
3159   case AArch64::STURHHi:
3160     Scale = 2;
3161     return AArch64::STRHHui;
3162   case AArch64::LDURSHXi:
3163     Scale = 2;
3164     return AArch64::LDRSHXui;
3165   case AArch64::LDURSHWi:
3166     Scale = 2;
3167     return AArch64::LDRSHWui;
3168   case AArch64::LDURBi:
3169     Scale = 1;
3170     return AArch64::LDRBui;
3171   case AArch64::LDURBBi:
3172     Scale = 1;
3173     return AArch64::LDRBBui;
3174   case AArch64::LDURSBXi:
3175     Scale = 1;
3176     return AArch64::LDRSBXui;
3177   case AArch64::LDURSBWi:
3178     Scale = 1;
3179     return AArch64::LDRSBWui;
3180   case AArch64::STURBi:
3181     Scale = 1;
3182     return AArch64::STRBui;
3183   case AArch64::STURBBi:
3184     Scale = 1;
3185     return AArch64::STRBBui;
3186   case AArch64::LDRQui:
3187   case AArch64::STRQui:
3188     Scale = 16;
3189     return Opcode;
3190   case AArch64::LDRDui:
3191   case AArch64::STRDui:
3192   case AArch64::LDRXui:
3193   case AArch64::STRXui:
3194     Scale = 8;
3195     return Opcode;
3196   case AArch64::LDRWui:
3197   case AArch64::LDRSWui:
3198   case AArch64::STRWui:
3199     Scale = 4;
3200     return Opcode;
3201   case AArch64::LDRHui:
3202   case AArch64::STRHui:
3203   case AArch64::LDRHHui:
3204   case AArch64::STRHHui:
3205   case AArch64::LDRSHXui:
3206   case AArch64::LDRSHWui:
3207     Scale = 2;
3208     return Opcode;
3209   case AArch64::LDRBui:
3210   case AArch64::LDRBBui:
3211   case AArch64::LDRSBXui:
3212   case AArch64::LDRSBWui:
3213   case AArch64::STRBui:
3214   case AArch64::STRBBui:
3215     Scale = 1;
3216     return Opcode;
3217   }
3218 }
3219 
3220 // Given an opcode for an instruction with a [Reg, #Imm] addressing mode, return
3221 // the opcode of an instruction performing the same operation, but using the
3222 // [Reg, #Imm] addressing mode with unscaled offset.
3223 static unsigned unscaledOffsetOpcode(unsigned Opcode) {
3224   switch (Opcode) {
3225   default:
3226     llvm_unreachable("Address folding not implemented for instruction");
3227 
3228   case AArch64::LDURQi:
3229   case AArch64::STURQi:
3230   case AArch64::LDURDi:
3231   case AArch64::STURDi:
3232   case AArch64::LDURXi:
3233   case AArch64::STURXi:
3234   case AArch64::LDURWi:
3235   case AArch64::LDURSWi:
3236   case AArch64::STURWi:
3237   case AArch64::LDURHi:
3238   case AArch64::STURHi:
3239   case AArch64::LDURHHi:
3240   case AArch64::STURHHi:
3241   case AArch64::LDURSHXi:
3242   case AArch64::LDURSHWi:
3243   case AArch64::LDURBi:
3244   case AArch64::STURBi:
3245   case AArch64::LDURBBi:
3246   case AArch64::STURBBi:
3247   case AArch64::LDURSBWi:
3248   case AArch64::LDURSBXi:
3249     return Opcode;
3250   case AArch64::LDRQui:
3251     return AArch64::LDURQi;
3252   case AArch64::STRQui:
3253     return AArch64::STURQi;
3254   case AArch64::LDRDui:
3255     return AArch64::LDURDi;
3256   case AArch64::STRDui:
3257     return AArch64::STURDi;
3258   case AArch64::LDRXui:
3259     return AArch64::LDURXi;
3260   case AArch64::STRXui:
3261     return AArch64::STURXi;
3262   case AArch64::LDRWui:
3263     return AArch64::LDURWi;
3264   case AArch64::LDRSWui:
3265     return AArch64::LDURSWi;
3266   case AArch64::STRWui:
3267     return AArch64::STURWi;
3268   case AArch64::LDRHui:
3269     return AArch64::LDURHi;
3270   case AArch64::STRHui:
3271     return AArch64::STURHi;
3272   case AArch64::LDRHHui:
3273     return AArch64::LDURHHi;
3274   case AArch64::STRHHui:
3275     return AArch64::STURHHi;
3276   case AArch64::LDRSHXui:
3277     return AArch64::LDURSHXi;
3278   case AArch64::LDRSHWui:
3279     return AArch64::LDURSHWi;
3280   case AArch64::LDRBBui:
3281     return AArch64::LDURBBi;
3282   case AArch64::LDRBui:
3283     return AArch64::LDURBi;
3284   case AArch64::STRBBui:
3285     return AArch64::STURBBi;
3286   case AArch64::STRBui:
3287     return AArch64::STURBi;
3288   case AArch64::LDRSBWui:
3289     return AArch64::LDURSBWi;
3290   case AArch64::LDRSBXui:
3291     return AArch64::LDURSBXi;
3292   }
3293 }
3294 
3295 // Given the opcode of a memory load/store instruction, return the opcode of an
3296 // instruction performing the same operation, but using
3297 // the [Reg, Reg, {s,u}xtw #N] addressing mode with sign-/zero-extend of the
3298 // offset register.
3299 static unsigned offsetExtendOpcode(unsigned Opcode) {
3300   switch (Opcode) {
3301   default:
3302     llvm_unreachable("Address folding not implemented for instruction");
3303 
3304   case AArch64::LDRQroX:
3305   case AArch64::LDURQi:
3306   case AArch64::LDRQui:
3307     return AArch64::LDRQroW;
3308   case AArch64::STRQroX:
3309   case AArch64::STURQi:
3310   case AArch64::STRQui:
3311     return AArch64::STRQroW;
3312   case AArch64::LDRDroX:
3313   case AArch64::LDURDi:
3314   case AArch64::LDRDui:
3315     return AArch64::LDRDroW;
3316   case AArch64::STRDroX:
3317   case AArch64::STURDi:
3318   case AArch64::STRDui:
3319     return AArch64::STRDroW;
3320   case AArch64::LDRXroX:
3321   case AArch64::LDURXi:
3322   case AArch64::LDRXui:
3323     return AArch64::LDRXroW;
3324   case AArch64::STRXroX:
3325   case AArch64::STURXi:
3326   case AArch64::STRXui:
3327     return AArch64::STRXroW;
3328   case AArch64::LDRWroX:
3329   case AArch64::LDURWi:
3330   case AArch64::LDRWui:
3331     return AArch64::LDRWroW;
3332   case AArch64::LDRSWroX:
3333   case AArch64::LDURSWi:
3334   case AArch64::LDRSWui:
3335     return AArch64::LDRSWroW;
3336   case AArch64::STRWroX:
3337   case AArch64::STURWi:
3338   case AArch64::STRWui:
3339     return AArch64::STRWroW;
3340   case AArch64::LDRHroX:
3341   case AArch64::LDURHi:
3342   case AArch64::LDRHui:
3343     return AArch64::LDRHroW;
3344   case AArch64::STRHroX:
3345   case AArch64::STURHi:
3346   case AArch64::STRHui:
3347     return AArch64::STRHroW;
3348   case AArch64::LDRHHroX:
3349   case AArch64::LDURHHi:
3350   case AArch64::LDRHHui:
3351     return AArch64::LDRHHroW;
3352   case AArch64::STRHHroX:
3353   case AArch64::STURHHi:
3354   case AArch64::STRHHui:
3355     return AArch64::STRHHroW;
3356   case AArch64::LDRSHXroX:
3357   case AArch64::LDURSHXi:
3358   case AArch64::LDRSHXui:
3359     return AArch64::LDRSHXroW;
3360   case AArch64::LDRSHWroX:
3361   case AArch64::LDURSHWi:
3362   case AArch64::LDRSHWui:
3363     return AArch64::LDRSHWroW;
3364   case AArch64::LDRBroX:
3365   case AArch64::LDURBi:
3366   case AArch64::LDRBui:
3367     return AArch64::LDRBroW;
3368   case AArch64::LDRBBroX:
3369   case AArch64::LDURBBi:
3370   case AArch64::LDRBBui:
3371     return AArch64::LDRBBroW;
3372   case AArch64::LDRSBXroX:
3373   case AArch64::LDURSBXi:
3374   case AArch64::LDRSBXui:
3375     return AArch64::LDRSBXroW;
3376   case AArch64::LDRSBWroX:
3377   case AArch64::LDURSBWi:
3378   case AArch64::LDRSBWui:
3379     return AArch64::LDRSBWroW;
3380   case AArch64::STRBroX:
3381   case AArch64::STURBi:
3382   case AArch64::STRBui:
3383     return AArch64::STRBroW;
3384   case AArch64::STRBBroX:
3385   case AArch64::STURBBi:
3386   case AArch64::STRBBui:
3387     return AArch64::STRBBroW;
3388   }
3389 }
3390 
3391 MachineInstr *AArch64InstrInfo::emitLdStWithAddr(MachineInstr &MemI,
3392                                                  const ExtAddrMode &AM) const {
3393 
3394   const DebugLoc &DL = MemI.getDebugLoc();
3395   MachineBasicBlock &MBB = *MemI.getParent();
3396   MachineRegisterInfo &MRI = MemI.getMF()->getRegInfo();
3397 
3398   if (AM.Form == ExtAddrMode::Formula::Basic) {
3399     if (AM.ScaledReg) {
3400       // The new instruction will be in the form `ldr Rt, [Xn, Xm, lsl #imm]`.
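      // The two trailing immediates are the extend kind (0, i.e. a plain LSL)
      // and whether the offset register is shifted by the access size.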
3401       unsigned Opcode = regOffsetOpcode(MemI.getOpcode());
3402       MRI.constrainRegClass(AM.BaseReg, &AArch64::GPR64spRegClass);
3403       auto B = BuildMI(MBB, MemI, DL, get(Opcode))
3404                    .addReg(MemI.getOperand(0).getReg(),
3405                            MemI.mayLoad() ? RegState::Define : 0)
3406                    .addReg(AM.BaseReg)
3407                    .addReg(AM.ScaledReg)
3408                    .addImm(0)
3409                    .addImm(AM.Scale > 1)
3410                    .setMemRefs(MemI.memoperands())
3411                    .setMIFlags(MemI.getFlags());
3412       return B.getInstr();
3413     }
3414 
3415     assert(AM.ScaledReg == 0 && AM.Scale == 0 &&
3416            "Addressing mode not supported for folding");
3417 
3418     // The new instruction will be in the form `ld[u]r Rt, [Xn, #imm]`.
3419     unsigned Scale = 1;
3420     unsigned Opcode = MemI.getOpcode();
3421     if (isInt<9>(AM.Displacement))
3422       Opcode = unscaledOffsetOpcode(Opcode);
3423     else
3424       Opcode = scaledOffsetOpcode(Opcode, Scale);
3425 
3426     auto B = BuildMI(MBB, MemI, DL, get(Opcode))
3427                  .addReg(MemI.getOperand(0).getReg(),
3428                          MemI.mayLoad() ? RegState::Define : 0)
3429                  .addReg(AM.BaseReg)
3430                  .addImm(AM.Displacement / Scale)
3431                  .setMemRefs(MemI.memoperands())
3432                  .setMIFlags(MemI.getFlags());
3433     return B.getInstr();
3434   }
3435 
3436   if (AM.Form == ExtAddrMode::Formula::SExtScaledReg ||
3437       AM.Form == ExtAddrMode::Formula::ZExtScaledReg) {
3438     // The new instruction will be in the form `ldr Rt, [Xn, Wm, {s,u}xtw #N]`.
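    // For the register-offset (roW) form the trailing immediates select sxtw
    // vs. uxtw and whether the extended offset is shifted by the access size.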
3439     assert(AM.ScaledReg && !AM.Displacement &&
3440            "Address offset can be a register or an immediate, but not both");
3441     unsigned Opcode = offsetExtendOpcode(MemI.getOpcode());
3442     MRI.constrainRegClass(AM.BaseReg, &AArch64::GPR64spRegClass);
3443     // Make sure the offset register is in the correct register class.
3444     Register OffsetReg = AM.ScaledReg;
3445     const TargetRegisterClass *RC = MRI.getRegClass(OffsetReg);
3446     if (RC->hasSuperClassEq(&AArch64::GPR64RegClass)) {
3447       OffsetReg = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
3448       BuildMI(MBB, MemI, DL, get(TargetOpcode::COPY), OffsetReg)
3449           .addReg(AM.ScaledReg, 0, AArch64::sub_32);
3450     }
3451     auto B = BuildMI(MBB, MemI, DL, get(Opcode))
3452                  .addReg(MemI.getOperand(0).getReg(),
3453                          MemI.mayLoad() ? RegState::Define : 0)
3454                  .addReg(AM.BaseReg)
3455                  .addReg(OffsetReg)
3456                  .addImm(AM.Form == ExtAddrMode::Formula::SExtScaledReg)
3457                  .addImm(AM.Scale != 1)
3458                  .setMemRefs(MemI.memoperands())
3459                  .setMIFlags(MemI.getFlags());
3460 
3461     return B.getInstr();
3462   }
3463 
3464   llvm_unreachable(
3465       "Function must not be called with an addressing mode it can't handle");
3466 }
3467 
3468 bool AArch64InstrInfo::getMemOperandWithOffsetWidth(
3469     const MachineInstr &LdSt, const MachineOperand *&BaseOp, int64_t &Offset,
3470     bool &OffsetIsScalable, TypeSize &Width,
3471     const TargetRegisterInfo *TRI) const {
3472   assert(LdSt.mayLoadOrStore() && "Expected a memory operation.");
3473   // Handle only loads/stores with base register followed by immediate offset.
3474   if (LdSt.getNumExplicitOperands() == 3) {
3475     // Non-paired instruction (e.g., ldr x1, [x0, #8]).
3476     if ((!LdSt.getOperand(1).isReg() && !LdSt.getOperand(1).isFI()) ||
3477         !LdSt.getOperand(2).isImm())
3478       return false;
3479   } else if (LdSt.getNumExplicitOperands() == 4) {
3480     // Paired instruction (e.g., ldp x1, x2, [x0, #8]).
3481     if (!LdSt.getOperand(1).isReg() ||
3482         (!LdSt.getOperand(2).isReg() && !LdSt.getOperand(2).isFI()) ||
3483         !LdSt.getOperand(3).isImm())
3484       return false;
3485   } else
3486     return false;
3487 
3488   // Get the scaling factor for the instruction and set the width for the
3489   // instruction.
3490   TypeSize Scale(0U, false);
3491   int64_t Dummy1, Dummy2;
3492 
3493   // If this returns false, then it's an instruction we don't want to handle.
3494   if (!getMemOpInfo(LdSt.getOpcode(), Scale, Width, Dummy1, Dummy2))
3495     return false;
3496 
3497   // Compute the offset. Offset is calculated as the immediate operand
3498   // multiplied by the scaling factor. Unscaled instructions have scaling factor
3499   // set to 1.
3500   if (LdSt.getNumExplicitOperands() == 3) {
3501     BaseOp = &LdSt.getOperand(1);
3502     Offset = LdSt.getOperand(2).getImm() * Scale.getKnownMinValue();
3503   } else {
3504     assert(LdSt.getNumExplicitOperands() == 4 && "invalid number of operands");
3505     BaseOp = &LdSt.getOperand(2);
3506     Offset = LdSt.getOperand(3).getImm() * Scale.getKnownMinValue();
3507   }
3508   OffsetIsScalable = Scale.isScalable();
3509 
3510   if (!BaseOp->isReg() && !BaseOp->isFI())
3511     return false;
3512 
3513   return true;
3514 }
3515 
3516 MachineOperand &
3517 AArch64InstrInfo::getMemOpBaseRegImmOfsOffsetOperand(MachineInstr &LdSt) const {
3518   assert(LdSt.mayLoadOrStore() && "Expected a memory operation.");
3519   MachineOperand &OfsOp = LdSt.getOperand(LdSt.getNumExplicitOperands() - 1);
3520   assert(OfsOp.isImm() && "Offset operand wasn't immediate.");
3521   return OfsOp;
3522 }
3523 
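// Return the scale, the access width and the minimum/maximum immediate offset
// (in units of Scale) supported by the given load/store opcode, or false if
// the opcode is not a recognised memory operation.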
3524 bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, TypeSize &Scale,
3525                                     TypeSize &Width, int64_t &MinOffset,
3526                                     int64_t &MaxOffset) {
3527   switch (Opcode) {
3528   // Not a memory operation or something we want to handle.
3529   default:
3530     Scale = TypeSize::getFixed(0);
3531     Width = TypeSize::getFixed(0);
3532     MinOffset = MaxOffset = 0;
3533     return false;
3534   case AArch64::STRWpost:
3535   case AArch64::LDRWpost:
3536     Width = TypeSize::getFixed(32);
3537     Scale = TypeSize::getFixed(4);
3538     MinOffset = -256;
3539     MaxOffset = 255;
3540     break;
3541   case AArch64::LDURQi:
3542   case AArch64::STURQi:
3543     Width = TypeSize::getFixed(16);
3544     Scale = TypeSize::getFixed(1);
3545     MinOffset = -256;
3546     MaxOffset = 255;
3547     break;
3548   case AArch64::PRFUMi:
3549   case AArch64::LDURXi:
3550   case AArch64::LDURDi:
3551   case AArch64::LDAPURXi:
3552   case AArch64::STURXi:
3553   case AArch64::STURDi:
3554   case AArch64::STLURXi:
3555     Width = TypeSize::getFixed(8);
3556     Scale = TypeSize::getFixed(1);
3557     MinOffset = -256;
3558     MaxOffset = 255;
3559     break;
3560   case AArch64::LDURWi:
3561   case AArch64::LDURSi:
3562   case AArch64::LDURSWi:
3563   case AArch64::LDAPURi:
3564   case AArch64::LDAPURSWi:
3565   case AArch64::STURWi:
3566   case AArch64::STURSi:
3567   case AArch64::STLURWi:
3568     Width = TypeSize::getFixed(4);
3569     Scale = TypeSize::getFixed(1);
3570     MinOffset = -256;
3571     MaxOffset = 255;
3572     break;
3573   case AArch64::LDURHi:
3574   case AArch64::LDURHHi:
3575   case AArch64::LDURSHXi:
3576   case AArch64::LDURSHWi:
3577   case AArch64::LDAPURHi:
3578   case AArch64::LDAPURSHWi:
3579   case AArch64::LDAPURSHXi:
3580   case AArch64::STURHi:
3581   case AArch64::STURHHi:
3582   case AArch64::STLURHi:
3583     Width = TypeSize::getFixed(2);
3584     Scale = TypeSize::getFixed(1);
3585     MinOffset = -256;
3586     MaxOffset = 255;
3587     break;
3588   case AArch64::LDURBi:
3589   case AArch64::LDURBBi:
3590   case AArch64::LDURSBXi:
3591   case AArch64::LDURSBWi:
3592   case AArch64::LDAPURBi:
3593   case AArch64::LDAPURSBWi:
3594   case AArch64::LDAPURSBXi:
3595   case AArch64::STURBi:
3596   case AArch64::STURBBi:
3597   case AArch64::STLURBi:
3598     Width = TypeSize::getFixed(1);
3599     Scale = TypeSize::getFixed(1);
3600     MinOffset = -256;
3601     MaxOffset = 255;
3602     break;
3603   case AArch64::LDPQi:
3604   case AArch64::LDNPQi:
3605   case AArch64::STPQi:
3606   case AArch64::STNPQi:
3607     Scale = TypeSize::getFixed(16);
3608     Width = TypeSize::getFixed(32);
3609     MinOffset = -64;
3610     MaxOffset = 63;
3611     break;
3612   case AArch64::LDRQui:
3613   case AArch64::STRQui:
3614     Scale = TypeSize::getFixed(16);
3615     Width = TypeSize::getFixed(16);
3616     MinOffset = 0;
3617     MaxOffset = 4095;
3618     break;
3619   case AArch64::LDPXi:
3620   case AArch64::LDPDi:
3621   case AArch64::LDNPXi:
3622   case AArch64::LDNPDi:
3623   case AArch64::STPXi:
3624   case AArch64::STPDi:
3625   case AArch64::STNPXi:
3626   case AArch64::STNPDi:
3627     Scale = TypeSize::getFixed(8);
3628     Width = TypeSize::getFixed(16);
3629     MinOffset = -64;
3630     MaxOffset = 63;
3631     break;
3632   case AArch64::PRFMui:
3633   case AArch64::LDRXui:
3634   case AArch64::LDRDui:
3635   case AArch64::STRXui:
3636   case AArch64::STRDui:
3637     Scale = TypeSize::getFixed(8);
3638     Width = TypeSize::getFixed(8);
3639     MinOffset = 0;
3640     MaxOffset = 4095;
3641     break;
3642   case AArch64::StoreSwiftAsyncContext:
3643     // Store is an STRXui, but there might be an ADDXri in the expansion too.
3644     Scale = TypeSize::getFixed(1);
3645     Width = TypeSize::getFixed(8);
3646     MinOffset = 0;
3647     MaxOffset = 4095;
3648     break;
3649   case AArch64::LDPWi:
3650   case AArch64::LDPSi:
3651   case AArch64::LDNPWi:
3652   case AArch64::LDNPSi:
3653   case AArch64::STPWi:
3654   case AArch64::STPSi:
3655   case AArch64::STNPWi:
3656   case AArch64::STNPSi:
3657     Scale = TypeSize::getFixed(4);
3658     Width = TypeSize::getFixed(8);
3659     MinOffset = -64;
3660     MaxOffset = 63;
3661     break;
3662   case AArch64::LDRWui:
3663   case AArch64::LDRSui:
3664   case AArch64::LDRSWui:
3665   case AArch64::STRWui:
3666   case AArch64::STRSui:
3667     Scale = TypeSize::getFixed(4);
3668     Width = TypeSize::getFixed(4);
3669     MinOffset = 0;
3670     MaxOffset = 4095;
3671     break;
3672   case AArch64::LDRHui:
3673   case AArch64::LDRHHui:
3674   case AArch64::LDRSHWui:
3675   case AArch64::LDRSHXui:
3676   case AArch64::STRHui:
3677   case AArch64::STRHHui:
3678     Scale = TypeSize::getFixed(2);
3679     Width = TypeSize::getFixed(2);
3680     MinOffset = 0;
3681     MaxOffset = 4095;
3682     break;
3683   case AArch64::LDRBui:
3684   case AArch64::LDRBBui:
3685   case AArch64::LDRSBWui:
3686   case AArch64::LDRSBXui:
3687   case AArch64::STRBui:
3688   case AArch64::STRBBui:
3689     Scale = TypeSize::getFixed(1);
3690     Width = TypeSize::getFixed(1);
3691     MinOffset = 0;
3692     MaxOffset = 4095;
3693     break;
3694   case AArch64::STPXpre:
3695   case AArch64::LDPXpost:
3696   case AArch64::STPDpre:
3697   case AArch64::LDPDpost:
3698     Scale = TypeSize::getFixed(8);
3699     Width = TypeSize::getFixed(8);
3700     MinOffset = -512;
3701     MaxOffset = 504;
3702     break;
3703   case AArch64::STPQpre:
3704   case AArch64::LDPQpost:
3705     Scale = TypeSize::getFixed(16);
3706     Width = TypeSize::getFixed(16);
3707     MinOffset = -1024;
3708     MaxOffset = 1008;
3709     break;
3710   case AArch64::STRXpre:
3711   case AArch64::STRDpre:
3712   case AArch64::LDRXpost:
3713   case AArch64::LDRDpost:
3714     Scale = TypeSize::getFixed(1);
3715     Width = TypeSize::getFixed(8);
3716     MinOffset = -256;
3717     MaxOffset = 255;
3718     break;
3719   case AArch64::STRQpre:
3720   case AArch64::LDRQpost:
3721     Scale = TypeSize::getFixed(1);
3722     Width = TypeSize::getFixed(16);
3723     MinOffset = -256;
3724     MaxOffset = 255;
3725     break;
3726   case AArch64::ADDG:
3727     Scale = TypeSize::getFixed(16);
3728     Width = TypeSize::getFixed(0);
3729     MinOffset = 0;
3730     MaxOffset = 63;
3731     break;
3732   case AArch64::TAGPstack:
3733     Scale = TypeSize::getFixed(16);
3734     Width = TypeSize::getFixed(0);
3735     // TAGP with a negative offset turns into SUBP, which has a maximum offset
3736     // of 63 (not 64!).
3737     MinOffset = -63;
3738     MaxOffset = 63;
3739     break;
3740   case AArch64::LDG:
3741   case AArch64::STGi:
3742   case AArch64::STZGi:
3743     Scale = TypeSize::getFixed(16);
3744     Width = TypeSize::getFixed(16);
3745     MinOffset = -256;
3746     MaxOffset = 255;
3747     break;
3748   case AArch64::STR_ZZZZXI:
3749   case AArch64::LDR_ZZZZXI:
3750     Scale = TypeSize::getScalable(16);
3751     Width = TypeSize::getScalable(16 * 4);
3752     MinOffset = -256;
3753     MaxOffset = 252;
3754     break;
3755   case AArch64::STR_ZZZXI:
3756   case AArch64::LDR_ZZZXI:
3757     Scale = TypeSize::getScalable(16);
3758     Width = TypeSize::getScalable(16 * 3);
3759     MinOffset = -256;
3760     MaxOffset = 253;
3761     break;
3762   case AArch64::STR_ZZXI:
3763   case AArch64::LDR_ZZXI:
3764     Scale = TypeSize::getScalable(16);
3765     Width = TypeSize::getScalable(16 * 2);
3766     MinOffset = -256;
3767     MaxOffset = 254;
3768     break;
3769   case AArch64::LDR_PXI:
3770   case AArch64::STR_PXI:
3771     Scale = TypeSize::getScalable(2);
3772     Width = TypeSize::getScalable(2);
3773     MinOffset = -256;
3774     MaxOffset = 255;
3775     break;
3776   case AArch64::LDR_PPXI:
3777   case AArch64::STR_PPXI:
3778     Scale = TypeSize::getScalable(2);
3779     Width = TypeSize::getScalable(2 * 2);
3780     MinOffset = -256;
3781     MaxOffset = 254;
3782     break;
3783   case AArch64::LDR_ZXI:
3784   case AArch64::STR_ZXI:
3785     Scale = TypeSize::getScalable(16);
3786     Width = TypeSize::getScalable(16);
3787     MinOffset = -256;
3788     MaxOffset = 255;
3789     break;
3790   case AArch64::LD1B_IMM:
3791   case AArch64::LD1H_IMM:
3792   case AArch64::LD1W_IMM:
3793   case AArch64::LD1D_IMM:
3794   case AArch64::LDNT1B_ZRI:
3795   case AArch64::LDNT1H_ZRI:
3796   case AArch64::LDNT1W_ZRI:
3797   case AArch64::LDNT1D_ZRI:
3798   case AArch64::ST1B_IMM:
3799   case AArch64::ST1H_IMM:
3800   case AArch64::ST1W_IMM:
3801   case AArch64::ST1D_IMM:
3802   case AArch64::STNT1B_ZRI:
3803   case AArch64::STNT1H_ZRI:
3804   case AArch64::STNT1W_ZRI:
3805   case AArch64::STNT1D_ZRI:
3806   case AArch64::LDNF1B_IMM:
3807   case AArch64::LDNF1H_IMM:
3808   case AArch64::LDNF1W_IMM:
3809   case AArch64::LDNF1D_IMM:
3810     // A full vector's worth of data
3811     // Width = mbytes * elements
3812     Scale = TypeSize::getScalable(16);
3813     Width = TypeSize::getScalable(16);
3814     MinOffset = -8;
3815     MaxOffset = 7;
3816     break;
3817   case AArch64::LD2B_IMM:
3818   case AArch64::LD2H_IMM:
3819   case AArch64::LD2W_IMM:
3820   case AArch64::LD2D_IMM:
3821   case AArch64::ST2B_IMM:
3822   case AArch64::ST2H_IMM:
3823   case AArch64::ST2W_IMM:
3824   case AArch64::ST2D_IMM:
3825     Scale = TypeSize::getScalable(32);
3826     Width = TypeSize::getScalable(16 * 2);
3827     MinOffset = -8;
3828     MaxOffset = 7;
3829     break;
3830   case AArch64::LD3B_IMM:
3831   case AArch64::LD3H_IMM:
3832   case AArch64::LD3W_IMM:
3833   case AArch64::LD3D_IMM:
3834   case AArch64::ST3B_IMM:
3835   case AArch64::ST3H_IMM:
3836   case AArch64::ST3W_IMM:
3837   case AArch64::ST3D_IMM:
3838     Scale = TypeSize::getScalable(48);
3839     Width = TypeSize::getScalable(16 * 3);
3840     MinOffset = -8;
3841     MaxOffset = 7;
3842     break;
3843   case AArch64::LD4B_IMM:
3844   case AArch64::LD4H_IMM:
3845   case AArch64::LD4W_IMM:
3846   case AArch64::LD4D_IMM:
3847   case AArch64::ST4B_IMM:
3848   case AArch64::ST4H_IMM:
3849   case AArch64::ST4W_IMM:
3850   case AArch64::ST4D_IMM:
3851     Scale = TypeSize::getScalable(64);
3852     Width = TypeSize::getScalable(16 * 4);
3853     MinOffset = -8;
3854     MaxOffset = 7;
3855     break;
3856   case AArch64::LD1B_H_IMM:
3857   case AArch64::LD1SB_H_IMM:
3858   case AArch64::LD1H_S_IMM:
3859   case AArch64::LD1SH_S_IMM:
3860   case AArch64::LD1W_D_IMM:
3861   case AArch64::LD1SW_D_IMM:
3862   case AArch64::ST1B_H_IMM:
3863   case AArch64::ST1H_S_IMM:
3864   case AArch64::ST1W_D_IMM:
3865   case AArch64::LDNF1B_H_IMM:
3866   case AArch64::LDNF1SB_H_IMM:
3867   case AArch64::LDNF1H_S_IMM:
3868   case AArch64::LDNF1SH_S_IMM:
3869   case AArch64::LDNF1W_D_IMM:
3870   case AArch64::LDNF1SW_D_IMM:
3871     // A half vector's worth of data
3872     // Width = mbytes * elements
3873     Scale = TypeSize::getScalable(8);
3874     Width = TypeSize::getScalable(8);
3875     MinOffset = -8;
3876     MaxOffset = 7;
3877     break;
3878   case AArch64::LD1B_S_IMM:
3879   case AArch64::LD1SB_S_IMM:
3880   case AArch64::LD1H_D_IMM:
3881   case AArch64::LD1SH_D_IMM:
3882   case AArch64::ST1B_S_IMM:
3883   case AArch64::ST1H_D_IMM:
3884   case AArch64::LDNF1B_S_IMM:
3885   case AArch64::LDNF1SB_S_IMM:
3886   case AArch64::LDNF1H_D_IMM:
3887   case AArch64::LDNF1SH_D_IMM:
3888     // A quarter vector's worth of data
3889     // Width = mbytes * elements
3890     Scale = TypeSize::getScalable(4);
3891     Width = TypeSize::getScalable(4);
3892     MinOffset = -8;
3893     MaxOffset = 7;
3894     break;
3895   case AArch64::LD1B_D_IMM:
3896   case AArch64::LD1SB_D_IMM:
3897   case AArch64::ST1B_D_IMM:
3898   case AArch64::LDNF1B_D_IMM:
3899   case AArch64::LDNF1SB_D_IMM:
3900     // An eighth vector's worth of data
3901     // Width = mbytes * elements
3902     Scale = TypeSize::getScalable(2);
3903     Width = TypeSize::getScalable(2);
3904     MinOffset = -8;
3905     MaxOffset = 7;
3906     break;
3907   case AArch64::ST2Gi:
3908   case AArch64::STZ2Gi:
3909     Scale = TypeSize::getFixed(16);
3910     Width = TypeSize::getFixed(32);
3911     MinOffset = -256;
3912     MaxOffset = 255;
3913     break;
3914   case AArch64::STGPi:
3915     Scale = TypeSize::getFixed(16);
3916     Width = TypeSize::getFixed(16);
3917     MinOffset = -64;
3918     MaxOffset = 63;
3919     break;
3920   case AArch64::LD1RB_IMM:
3921   case AArch64::LD1RB_H_IMM:
3922   case AArch64::LD1RB_S_IMM:
3923   case AArch64::LD1RB_D_IMM:
3924   case AArch64::LD1RSB_H_IMM:
3925   case AArch64::LD1RSB_S_IMM:
3926   case AArch64::LD1RSB_D_IMM:
3927     Scale = TypeSize::getFixed(1);
3928     Width = TypeSize::getFixed(1);
3929     MinOffset = 0;
3930     MaxOffset = 63;
3931     break;
3932   case AArch64::LD1RH_IMM:
3933   case AArch64::LD1RH_S_IMM:
3934   case AArch64::LD1RH_D_IMM:
3935   case AArch64::LD1RSH_S_IMM:
3936   case AArch64::LD1RSH_D_IMM:
3937     Scale = TypeSize::getFixed(2);
3938     Width = TypeSize::getFixed(2);
3939     MinOffset = 0;
3940     MaxOffset = 63;
3941     break;
3942   case AArch64::LD1RW_IMM:
3943   case AArch64::LD1RW_D_IMM:
3944   case AArch64::LD1RSW_IMM:
3945     Scale = TypeSize::getFixed(4);
3946     Width = TypeSize::getFixed(4);
3947     MinOffset = 0;
3948     MaxOffset = 63;
3949     break;
3950   case AArch64::LD1RD_IMM:
3951     Scale = TypeSize::getFixed(8);
3952     Width = TypeSize::getFixed(8);
3953     MinOffset = 0;
3954     MaxOffset = 63;
3955     break;
3956   }
3957 
3958   return true;
3959 }
3960 
3961 // Scaling factor for an unscaled load or store, i.e. the access size in bytes.
3962 int AArch64InstrInfo::getMemScale(unsigned Opc) {
3963   switch (Opc) {
3964   default:
3965     llvm_unreachable("Opcode has unknown scale!");
3966   case AArch64::LDRBBui:
3967   case AArch64::LDURBBi:
3968   case AArch64::LDRSBWui:
3969   case AArch64::LDURSBWi:
3970   case AArch64::STRBBui:
3971   case AArch64::STURBBi:
3972     return 1;
3973   case AArch64::LDRHHui:
3974   case AArch64::LDURHHi:
3975   case AArch64::LDRSHWui:
3976   case AArch64::LDURSHWi:
3977   case AArch64::STRHHui:
3978   case AArch64::STURHHi:
3979     return 2;
3980   case AArch64::LDRSui:
3981   case AArch64::LDURSi:
3982   case AArch64::LDRSpre:
3983   case AArch64::LDRSWui:
3984   case AArch64::LDURSWi:
3985   case AArch64::LDRSWpre:
3986   case AArch64::LDRWpre:
3987   case AArch64::LDRWui:
3988   case AArch64::LDURWi:
3989   case AArch64::STRSui:
3990   case AArch64::STURSi:
3991   case AArch64::STRSpre:
3992   case AArch64::STRWui:
3993   case AArch64::STURWi:
3994   case AArch64::STRWpre:
3995   case AArch64::LDPSi:
3996   case AArch64::LDPSWi:
3997   case AArch64::LDPWi:
3998   case AArch64::STPSi:
3999   case AArch64::STPWi:
4000     return 4;
4001   case AArch64::LDRDui:
4002   case AArch64::LDURDi:
4003   case AArch64::LDRDpre:
4004   case AArch64::LDRXui:
4005   case AArch64::LDURXi:
4006   case AArch64::LDRXpre:
4007   case AArch64::STRDui:
4008   case AArch64::STURDi:
4009   case AArch64::STRDpre:
4010   case AArch64::STRXui:
4011   case AArch64::STURXi:
4012   case AArch64::STRXpre:
4013   case AArch64::LDPDi:
4014   case AArch64::LDPXi:
4015   case AArch64::STPDi:
4016   case AArch64::STPXi:
4017     return 8;
4018   case AArch64::LDRQui:
4019   case AArch64::LDURQi:
4020   case AArch64::STRQui:
4021   case AArch64::STURQi:
4022   case AArch64::STRQpre:
4023   case AArch64::LDPQi:
4024   case AArch64::LDRQpre:
4025   case AArch64::STPQi:
4026   case AArch64::STGi:
4027   case AArch64::STZGi:
4028   case AArch64::ST2Gi:
4029   case AArch64::STZ2Gi:
4030   case AArch64::STGPi:
4031     return 16;
4032   }
4033 }
4034 
4035 bool AArch64InstrInfo::isPreLd(const MachineInstr &MI) {
4036   switch (MI.getOpcode()) {
4037   default:
4038     return false;
4039   case AArch64::LDRWpre:
4040   case AArch64::LDRXpre:
4041   case AArch64::LDRSWpre:
4042   case AArch64::LDRSpre:
4043   case AArch64::LDRDpre:
4044   case AArch64::LDRQpre:
4045     return true;
4046   }
4047 }
4048 
4049 bool AArch64InstrInfo::isPreSt(const MachineInstr &MI) {
4050   switch (MI.getOpcode()) {
4051   default:
4052     return false;
4053   case AArch64::STRWpre:
4054   case AArch64::STRXpre:
4055   case AArch64::STRSpre:
4056   case AArch64::STRDpre:
4057   case AArch64::STRQpre:
4058     return true;
4059   }
4060 }
4061 
4062 bool AArch64InstrInfo::isPreLdSt(const MachineInstr &MI) {
4063   return isPreLd(MI) || isPreSt(MI);
4064 }
4065 
4066 bool AArch64InstrInfo::isPairedLdSt(const MachineInstr &MI) {
4067   switch (MI.getOpcode()) {
4068   default:
4069     return false;
4070   case AArch64::LDPSi:
4071   case AArch64::LDPSWi:
4072   case AArch64::LDPDi:
4073   case AArch64::LDPQi:
4074   case AArch64::LDPWi:
4075   case AArch64::LDPXi:
4076   case AArch64::STPSi:
4077   case AArch64::STPDi:
4078   case AArch64::STPQi:
4079   case AArch64::STPWi:
4080   case AArch64::STPXi:
4081   case AArch64::STGPi:
4082     return true;
4083   }
4084 }
4085 
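// For paired and pre-indexed instructions the base register is operand 2 and
// the offset operand 3 (operands 0/1 hold the transfer registers or the
// write-back def); for plain [Reg, #Imm] forms they are operands 1 and 2.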
4086 const MachineOperand &AArch64InstrInfo::getLdStBaseOp(const MachineInstr &MI) {
4087   unsigned Idx =
4088       AArch64InstrInfo::isPairedLdSt(MI) || AArch64InstrInfo::isPreLdSt(MI) ? 2
4089                                                                             : 1;
4090   return MI.getOperand(Idx);
4091 }
4092 
4093 const MachineOperand &
4094 AArch64InstrInfo::getLdStOffsetOp(const MachineInstr &MI) {
4095   unsigned Idx =
4096       AArch64InstrInfo::isPairedLdSt(MI) || AArch64InstrInfo::isPreLdSt(MI) ? 3
4097                                                                             : 2;
4098   return MI.getOperand(Idx);
4099 }
4100 
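// Return the register class recorded for Reg in the function's register info,
// or nullptr if unavailable (e.g. the instruction is not inserted in a block).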
4101 static const TargetRegisterClass *getRegClass(const MachineInstr &MI,
4102                                               Register Reg) {
4103   if (MI.getParent() == nullptr)
4104     return nullptr;
4105   const MachineFunction *MF = MI.getParent()->getParent();
4106   return MF ? MF->getRegInfo().getRegClassOrNull(Reg) : nullptr;
4107 }
4108 
4109 bool AArch64InstrInfo::isHForm(const MachineInstr &MI) {
4110   auto IsHFPR = [&](const MachineOperand &Op) {
4111     if (!Op.isReg())
4112       return false;
4113     auto Reg = Op.getReg();
4114     if (Reg.isPhysical())
4115       return AArch64::FPR16RegClass.contains(Reg);
4116     const TargetRegisterClass *TRC = ::getRegClass(MI, Reg);
4117     return TRC == &AArch64::FPR16RegClass ||
4118            TRC == &AArch64::FPR16_loRegClass;
4119   };
4120   return llvm::any_of(MI.operands(), IsHFPR);
4121 }
4122 
4123 bool AArch64InstrInfo::isQForm(const MachineInstr &MI) {
4124   auto IsQFPR = [&](const MachineOperand &Op) {
4125     if (!Op.isReg())
4126       return false;
4127     auto Reg = Op.getReg();
4128     if (Reg.isPhysical())
4129       return AArch64::FPR128RegClass.contains(Reg);
4130     const TargetRegisterClass *TRC = ::getRegClass(MI, Reg);
4131     return TRC == &AArch64::FPR128RegClass ||
4132            TRC == &AArch64::FPR128_loRegClass;
4133   };
4134   return llvm::any_of(MI.operands(), IsQFPR);
4135 }
4136 
4137 bool AArch64InstrInfo::hasBTISemantics(const MachineInstr &MI) {
4138   switch (MI.getOpcode()) {
4139   case AArch64::BRK:
4140   case AArch64::HLT:
4141   case AArch64::PACIASP:
4142   case AArch64::PACIBSP:
4143     // Implicit BTI behavior.
4144     return true;
4145   case AArch64::PAUTH_PROLOGUE:
4146     // PAUTH_PROLOGUE expands to PACI(A|B)SP.
4147     return true;
4148   case AArch64::HINT: {
4149     unsigned Imm = MI.getOperand(0).getImm();
4150     // Explicit BTI instruction.
4151     if (Imm == 32 || Imm == 34 || Imm == 36 || Imm == 38)
4152       return true;
4153     // PACI(A|B)SP instructions.
4154     if (Imm == 25 || Imm == 27)
4155       return true;
4156     return false;
4157   }
4158   default:
4159     return false;
4160   }
4161 }
4162 
4163 bool AArch64InstrInfo::isFpOrNEON(const MachineInstr &MI) {
4164   auto IsFPR = [&](const MachineOperand &Op) {
4165     if (!Op.isReg())
4166       return false;
4167     auto Reg = Op.getReg();
4168     if (Reg.isPhysical())
4169       return AArch64::FPR128RegClass.contains(Reg) ||
4170              AArch64::FPR64RegClass.contains(Reg) ||
4171              AArch64::FPR32RegClass.contains(Reg) ||
4172              AArch64::FPR16RegClass.contains(Reg) ||
4173              AArch64::FPR8RegClass.contains(Reg);
4174 
4175     const TargetRegisterClass *TRC = ::getRegClass(MI, Reg);
4176     return TRC == &AArch64::FPR128RegClass ||
4177            TRC == &AArch64::FPR128_loRegClass ||
4178            TRC == &AArch64::FPR64RegClass ||
4179            TRC == &AArch64::FPR64_loRegClass ||
4180            TRC == &AArch64::FPR32RegClass || TRC == &AArch64::FPR16RegClass ||
4181            TRC == &AArch64::FPR8RegClass;
4182   };
4183   return llvm::any_of(MI.operands(), IsFPR);
4184 }
4185 
4186 // Scale the unscaled offsets.  Returns false if the unscaled offset can't be
4187 // scaled.
4188 static bool scaleOffset(unsigned Opc, int64_t &Offset) {
4189   int Scale = AArch64InstrInfo::getMemScale(Opc);
4190 
4191   // If the byte-offset isn't a multiple of the stride, we can't scale this
4192   // offset.
4193   if (Offset % Scale != 0)
4194     return false;
4195 
4196   // Convert the byte-offset used by unscaled into an "element" offset used
4197   // by the scaled pair load/store instructions.
4198   Offset /= Scale;
4199   return true;
4200 }
4201 
4202 static bool canPairLdStOpc(unsigned FirstOpc, unsigned SecondOpc) {
4203   if (FirstOpc == SecondOpc)
4204     return true;
4205   // We can also pair sign-ext and zero-ext instructions.
4206   switch (FirstOpc) {
4207   default:
4208     return false;
4209   case AArch64::LDRQui:
4210   case AArch64::LDURQi:
4211     return SecondOpc == AArch64::LDRQui || SecondOpc == AArch64::LDURQi;
4212   case AArch64::LDRWui:
4213   case AArch64::LDURWi:
4214     return SecondOpc == AArch64::LDRSWui || SecondOpc == AArch64::LDURSWi;
4215   case AArch64::LDRSWui:
4216   case AArch64::LDURSWi:
4217     return SecondOpc == AArch64::LDRWui || SecondOpc == AArch64::LDURWi;
4218   }
4219   // These instructions can't be paired based on their opcodes.
4220   return false;
4221 }
4222 
4223 static bool shouldClusterFI(const MachineFrameInfo &MFI, int FI1,
4224                             int64_t Offset1, unsigned Opcode1, int FI2,
4225                             int64_t Offset2, unsigned Opcode2) {
4226   // Accesses through fixed stack object frame indices may access a different
4227   // fixed stack slot. Check that the object offsets + offsets match.
4228   if (MFI.isFixedObjectIndex(FI1) && MFI.isFixedObjectIndex(FI2)) {
4229     int64_t ObjectOffset1 = MFI.getObjectOffset(FI1);
4230     int64_t ObjectOffset2 = MFI.getObjectOffset(FI2);
4231     assert(ObjectOffset1 <= ObjectOffset2 && "Object offsets are not ordered.");
4232     // Convert to scaled object offsets.
4233     int Scale1 = AArch64InstrInfo::getMemScale(Opcode1);
4234     if (ObjectOffset1 % Scale1 != 0)
4235       return false;
4236     ObjectOffset1 /= Scale1;
4237     int Scale2 = AArch64InstrInfo::getMemScale(Opcode2);
4238     if (ObjectOffset2 % Scale2 != 0)
4239       return false;
4240     ObjectOffset2 /= Scale2;
4241     ObjectOffset1 += Offset1;
4242     ObjectOffset2 += Offset2;
4243     return ObjectOffset1 + 1 == ObjectOffset2;
4244   }
4245 
4246   return FI1 == FI2;
4247 }
4248 
4249 /// Detect opportunities for ldp/stp formation.
4250 ///
4251 /// Only called for LdSt for which getMemOperandWithOffset returns true.
4252 bool AArch64InstrInfo::shouldClusterMemOps(
4253     ArrayRef<const MachineOperand *> BaseOps1, int64_t OpOffset1,
4254     bool OffsetIsScalable1, ArrayRef<const MachineOperand *> BaseOps2,
4255     int64_t OpOffset2, bool OffsetIsScalable2, unsigned ClusterSize,
4256     unsigned NumBytes) const {
4257   assert(BaseOps1.size() == 1 && BaseOps2.size() == 1);
4258   const MachineOperand &BaseOp1 = *BaseOps1.front();
4259   const MachineOperand &BaseOp2 = *BaseOps2.front();
4260   const MachineInstr &FirstLdSt = *BaseOp1.getParent();
4261   const MachineInstr &SecondLdSt = *BaseOp2.getParent();
4262   if (BaseOp1.getType() != BaseOp2.getType())
4263     return false;
4264 
4265   assert((BaseOp1.isReg() || BaseOp1.isFI()) &&
4266          "Only base registers and frame indices are supported.");
4267 
4268   // Check for both base regs and base FI.
4269   if (BaseOp1.isReg() && BaseOp1.getReg() != BaseOp2.getReg())
4270     return false;
4271 
4272   // Only cluster up to a single pair.
4273   if (ClusterSize > 2)
4274     return false;
4275 
4276   if (!isPairableLdStInst(FirstLdSt) || !isPairableLdStInst(SecondLdSt))
4277     return false;
4278 
4279   // Can we pair these instructions based on their opcodes?
4280   unsigned FirstOpc = FirstLdSt.getOpcode();
4281   unsigned SecondOpc = SecondLdSt.getOpcode();
4282   if (!canPairLdStOpc(FirstOpc, SecondOpc))
4283     return false;
4284 
4285   // Can't merge volatiles or load/stores that have a hint to avoid pair
4286   // formation, for example.
4287   if (!isCandidateToMergeOrPair(FirstLdSt) ||
4288       !isCandidateToMergeOrPair(SecondLdSt))
4289     return false;
4290 
4291   // isCandidateToMergeOrPair guarantees that operand 2 is an immediate.
4292   int64_t Offset1 = FirstLdSt.getOperand(2).getImm();
4293   if (hasUnscaledLdStOffset(FirstOpc) && !scaleOffset(FirstOpc, Offset1))
4294     return false;
4295 
4296   int64_t Offset2 = SecondLdSt.getOperand(2).getImm();
4297   if (hasUnscaledLdStOffset(SecondOpc) && !scaleOffset(SecondOpc, Offset2))
4298     return false;
4299 
4300   // Pairwise instructions have a 7-bit signed offset field.
4301   if (Offset1 > 63 || Offset1 < -64)
4302     return false;
4303 
4304   // The caller should already have ordered First/SecondLdSt by offset.
4305   // Note: except for non-equal frame index bases
4306   if (BaseOp1.isFI()) {
4307     assert((!BaseOp1.isIdenticalTo(BaseOp2) || Offset1 <= Offset2) &&
4308            "Caller should have ordered offsets.");
4309 
4310     const MachineFrameInfo &MFI =
4311         FirstLdSt.getParent()->getParent()->getFrameInfo();
4312     return shouldClusterFI(MFI, BaseOp1.getIndex(), Offset1, FirstOpc,
4313                            BaseOp2.getIndex(), Offset2, SecondOpc);
4314   }
4315 
4316   assert(Offset1 <= Offset2 && "Caller should have ordered offsets.");
4317 
4318   return Offset1 + 1 == Offset2;
4319 }
4320 
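// Add Reg's sub-register SubIdx (or Reg itself when SubIdx is zero) to MIB,
// resolving the sub-register explicitly when Reg is a physical register.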
4321 static const MachineInstrBuilder &AddSubReg(const MachineInstrBuilder &MIB,
4322                                             unsigned Reg, unsigned SubIdx,
4323                                             unsigned State,
4324                                             const TargetRegisterInfo *TRI) {
4325   if (!SubIdx)
4326     return MIB.addReg(Reg, State);
4327 
4328   if (Register::isPhysicalRegister(Reg))
4329     return MIB.addReg(TRI->getSubReg(Reg, SubIdx), State);
4330   return MIB.addReg(Reg, State, SubIdx);
4331 }
4332 
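// Return true if copying a register tuple from SrcReg to DestReg in ascending
// sub-register order would overwrite part of the source before it has been
// read, i.e. the first destination register lies within the source tuple
// (register encodings wrap modulo 32).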
4333 static bool forwardCopyWillClobberTuple(unsigned DestReg, unsigned SrcReg,
4334                                         unsigned NumRegs) {
4335   // We really want the positive remainder mod 32 here, that happens to be
4336   // easily obtainable with a mask.
4337   return ((DestReg - SrcReg) & 0x1f) < NumRegs;
4338 }
4339 
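// Copy a tuple of registers (e.g. a D/Q/Z multi-register sequence) by copying
// each sub-register in turn, iterating in reverse when the destination
// overlaps the source so that no element is clobbered before it is read.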
4340 void AArch64InstrInfo::copyPhysRegTuple(MachineBasicBlock &MBB,
4341                                         MachineBasicBlock::iterator I,
4342                                         const DebugLoc &DL, MCRegister DestReg,
4343                                         MCRegister SrcReg, bool KillSrc,
4344                                         unsigned Opcode,
4345                                         ArrayRef<unsigned> Indices) const {
4346   assert(Subtarget.hasNEON() && "Unexpected register copy without NEON");
4347   const TargetRegisterInfo *TRI = &getRegisterInfo();
4348   uint16_t DestEncoding = TRI->getEncodingValue(DestReg);
4349   uint16_t SrcEncoding = TRI->getEncodingValue(SrcReg);
4350   unsigned NumRegs = Indices.size();
4351 
4352   int SubReg = 0, End = NumRegs, Incr = 1;
4353   if (forwardCopyWillClobberTuple(DestEncoding, SrcEncoding, NumRegs)) {
4354     SubReg = NumRegs - 1;
4355     End = -1;
4356     Incr = -1;
4357   }
4358 
4359   for (; SubReg != End; SubReg += Incr) {
4360     const MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(Opcode));
4361     AddSubReg(MIB, DestReg, Indices[SubReg], RegState::Define, TRI);
4362     AddSubReg(MIB, SrcReg, Indices[SubReg], 0, TRI);
4363     AddSubReg(MIB, SrcReg, Indices[SubReg], getKillRegState(KillSrc), TRI);
4364   }
4365 }
4366 
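// Copy a tuple of GPRs sub-register by sub-register using an ORR with the
// given zero register; GPR tuples are aligned to the tuple size, so the
// source and destination sequences cannot partially overlap.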
4367 void AArch64InstrInfo::copyGPRRegTuple(MachineBasicBlock &MBB,
4368                                        MachineBasicBlock::iterator I,
4369                                        DebugLoc DL, unsigned DestReg,
4370                                        unsigned SrcReg, bool KillSrc,
4371                                        unsigned Opcode, unsigned ZeroReg,
4372                                        llvm::ArrayRef<unsigned> Indices) const {
4373   const TargetRegisterInfo *TRI = &getRegisterInfo();
4374   unsigned NumRegs = Indices.size();
4375 
4376 #ifndef NDEBUG
4377   uint16_t DestEncoding = TRI->getEncodingValue(DestReg);
4378   uint16_t SrcEncoding = TRI->getEncodingValue(SrcReg);
4379   assert(DestEncoding % NumRegs == 0 && SrcEncoding % NumRegs == 0 &&
4380          "GPR reg sequences should not be able to overlap");
4381 #endif
4382 
4383   for (unsigned SubReg = 0; SubReg != NumRegs; ++SubReg) {
4384     const MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(Opcode));
4385     AddSubReg(MIB, DestReg, Indices[SubReg], RegState::Define, TRI);
4386     MIB.addReg(ZeroReg);
4387     AddSubReg(MIB, SrcReg, Indices[SubReg], getKillRegState(KillSrc), TRI);
4388     MIB.addImm(0);
4389   }
4390 }
4391 
4392 void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
4393                                    MachineBasicBlock::iterator I,
4394                                    const DebugLoc &DL, MCRegister DestReg,
4395                                    MCRegister SrcReg, bool KillSrc) const {
4396   if (AArch64::GPR32spRegClass.contains(DestReg) &&
4397       (AArch64::GPR32spRegClass.contains(SrcReg) || SrcReg == AArch64::WZR)) {
4398     const TargetRegisterInfo *TRI = &getRegisterInfo();
4399 
4400     if (DestReg == AArch64::WSP || SrcReg == AArch64::WSP) {
4401       // If either operand is WSP, expand to ADD #0.
4402       if (Subtarget.hasZeroCycleRegMove()) {
4403         // Cyclone recognizes "ADD Xd, Xn, #0" as a zero-cycle register move.
4404         MCRegister DestRegX = TRI->getMatchingSuperReg(
4405             DestReg, AArch64::sub_32, &AArch64::GPR64spRegClass);
4406         MCRegister SrcRegX = TRI->getMatchingSuperReg(
4407             SrcReg, AArch64::sub_32, &AArch64::GPR64spRegClass);
4408         // This instruction is reading and writing X registers.  This may upset
4409         // the register scavenger and machine verifier, so we need to indicate
4410         // that we are reading an undefined value from SrcRegX, but a proper
4411         // value from SrcReg.
4412         BuildMI(MBB, I, DL, get(AArch64::ADDXri), DestRegX)
4413             .addReg(SrcRegX, RegState::Undef)
4414             .addImm(0)
4415             .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0))
4416             .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
4417       } else {
4418         BuildMI(MBB, I, DL, get(AArch64::ADDWri), DestReg)
4419             .addReg(SrcReg, getKillRegState(KillSrc))
4420             .addImm(0)
4421             .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
4422       }
4423     } else if (SrcReg == AArch64::WZR && Subtarget.hasZeroCycleZeroingGP()) {
4424       BuildMI(MBB, I, DL, get(AArch64::MOVZWi), DestReg)
4425           .addImm(0)
4426           .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
4427     } else {
4428       if (Subtarget.hasZeroCycleRegMove()) {
4429         // Cyclone recognizes "ORR Xd, XZR, Xm" as a zero-cycle register move.
4430         MCRegister DestRegX = TRI->getMatchingSuperReg(
4431             DestReg, AArch64::sub_32, &AArch64::GPR64spRegClass);
4432         MCRegister SrcRegX = TRI->getMatchingSuperReg(
4433             SrcReg, AArch64::sub_32, &AArch64::GPR64spRegClass);
4434         // This instruction is reading and writing X registers.  This may upset
4435         // the register scavenger and machine verifier, so we need to indicate
4436         // that we are reading an undefined value from SrcRegX, but a proper
4437         // value from SrcReg.
4438         BuildMI(MBB, I, DL, get(AArch64::ORRXrr), DestRegX)
4439             .addReg(AArch64::XZR)
4440             .addReg(SrcRegX, RegState::Undef)
4441             .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
4442       } else {
4443         // Otherwise, expand to ORR WZR.
4444         BuildMI(MBB, I, DL, get(AArch64::ORRWrr), DestReg)
4445             .addReg(AArch64::WZR)
4446             .addReg(SrcReg, getKillRegState(KillSrc));
4447       }
4448     }
4449     return;
4450   }
4451 
4452   // Copy a Predicate register by ORRing with itself.
4453   if (AArch64::PPRRegClass.contains(DestReg) &&
4454       AArch64::PPRRegClass.contains(SrcReg)) {
4455     assert(Subtarget.hasSVEorSME() && "Unexpected SVE register.");
4456     BuildMI(MBB, I, DL, get(AArch64::ORR_PPzPP), DestReg)
4457       .addReg(SrcReg) // Pg
4458       .addReg(SrcReg)
4459       .addReg(SrcReg, getKillRegState(KillSrc));
4460     return;
4461   }
4462 
4463   // Copy a predicate-as-counter register by ORRing with itself as if it
4464   // were a regular predicate (mask) register.
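  // PN0-PN15 alias P0-P15, so the predicate-as-counter operand is simply
  // remapped below to the P register with the same number before the ORR is
  // emitted.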
4465   bool DestIsPNR = AArch64::PNRRegClass.contains(DestReg);
4466   bool SrcIsPNR = AArch64::PNRRegClass.contains(SrcReg);
4467   if (DestIsPNR || SrcIsPNR) {
4468     assert((Subtarget.hasSVE2p1() || Subtarget.hasSME2()) &&
4469            "Unexpected predicate-as-counter register.");
4470     auto ToPPR = [](MCRegister R) -> MCRegister {
4471       return (R - AArch64::PN0) + AArch64::P0;
4472     };
4473     MCRegister PPRSrcReg = SrcIsPNR ? ToPPR(SrcReg) : SrcReg;
4474     MCRegister PPRDestReg = DestIsPNR ? ToPPR(DestReg) : DestReg;
4475 
4476     if (PPRSrcReg != PPRDestReg) {
4477       auto NewMI = BuildMI(MBB, I, DL, get(AArch64::ORR_PPzPP), PPRDestReg)
4478                        .addReg(PPRSrcReg) // Pg
4479                        .addReg(PPRSrcReg)
4480                        .addReg(PPRSrcReg, getKillRegState(KillSrc));
4481       if (DestIsPNR)
4482         NewMI.addDef(DestReg, RegState::Implicit);
4483     }
4484     return;
4485   }
4486 
4487   // Copy a Z register by ORRing with itself.
4488   if (AArch64::ZPRRegClass.contains(DestReg) &&
4489       AArch64::ZPRRegClass.contains(SrcReg)) {
4490     assert(Subtarget.hasSVEorSME() && "Unexpected SVE register.");
4491     BuildMI(MBB, I, DL, get(AArch64::ORR_ZZZ), DestReg)
4492       .addReg(SrcReg)
4493       .addReg(SrcReg, getKillRegState(KillSrc));
4494     return;
4495   }
4496 
4497   // Copy a Z register pair by copying the individual sub-registers.
4498   if ((AArch64::ZPR2RegClass.contains(DestReg) ||
4499        AArch64::ZPR2StridedOrContiguousRegClass.contains(DestReg)) &&
4500       (AArch64::ZPR2RegClass.contains(SrcReg) ||
4501        AArch64::ZPR2StridedOrContiguousRegClass.contains(SrcReg))) {
4502     assert(Subtarget.hasSVEorSME() && "Unexpected SVE register.");
4503     static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1};
4504     copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ,
4505                      Indices);
4506     return;
4507   }
4508 
4509   // Copy a Z register triple by copying the individual sub-registers.
4510   if (AArch64::ZPR3RegClass.contains(DestReg) &&
4511       AArch64::ZPR3RegClass.contains(SrcReg)) {
4512     assert(Subtarget.hasSVEorSME() && "Unexpected SVE register.");
4513     static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1,
4514                                        AArch64::zsub2};
4515     copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ,
4516                      Indices);
4517     return;
4518   }
4519 
4520   // Copy a Z register quad by copying the individual sub-registers.
4521   if ((AArch64::ZPR4RegClass.contains(DestReg) ||
4522        AArch64::ZPR4StridedOrContiguousRegClass.contains(DestReg)) &&
4523       (AArch64::ZPR4RegClass.contains(SrcReg) ||
4524        AArch64::ZPR4StridedOrContiguousRegClass.contains(SrcReg))) {
4525     assert(Subtarget.hasSVEorSME() && "Unexpected SVE register.");
4526     static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1,
4527                                        AArch64::zsub2, AArch64::zsub3};
4528     copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ,
4529                      Indices);
4530     return;
4531   }
4532 
4533   if (AArch64::GPR64spRegClass.contains(DestReg) &&
4534       (AArch64::GPR64spRegClass.contains(SrcReg) || SrcReg == AArch64::XZR)) {
4535     if (DestReg == AArch64::SP || SrcReg == AArch64::SP) {
4536       // If either operand is SP, expand to ADD #0.
4537       BuildMI(MBB, I, DL, get(AArch64::ADDXri), DestReg)
4538           .addReg(SrcReg, getKillRegState(KillSrc))
4539           .addImm(0)
4540           .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
4541     } else if (SrcReg == AArch64::XZR && Subtarget.hasZeroCycleZeroingGP()) {
4542       BuildMI(MBB, I, DL, get(AArch64::MOVZXi), DestReg)
4543           .addImm(0)
4544           .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
4545     } else {
4546       // Otherwise, expand to ORR XZR.
4547       BuildMI(MBB, I, DL, get(AArch64::ORRXrr), DestReg)
4548           .addReg(AArch64::XZR)
4549           .addReg(SrcReg, getKillRegState(KillSrc));
4550     }
4551     return;
4552   }
4553 
4554   // Copy a DDDD register quad by copying the individual sub-registers.
4555   if (AArch64::DDDDRegClass.contains(DestReg) &&
4556       AArch64::DDDDRegClass.contains(SrcReg)) {
4557     static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1,
4558                                        AArch64::dsub2, AArch64::dsub3};
4559     copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8,
4560                      Indices);
4561     return;
4562   }
4563 
4564   // Copy a DDD register triple by copying the individual sub-registers.
4565   if (AArch64::DDDRegClass.contains(DestReg) &&
4566       AArch64::DDDRegClass.contains(SrcReg)) {
4567     static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1,
4568                                        AArch64::dsub2};
4569     copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8,
4570                      Indices);
4571     return;
4572   }
4573 
4574   // Copy a DD register pair by copying the individual sub-registers.
4575   if (AArch64::DDRegClass.contains(DestReg) &&
4576       AArch64::DDRegClass.contains(SrcReg)) {
4577     static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1};
4578     copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8,
4579                      Indices);
4580     return;
4581   }
4582 
4583   // Copy a QQQQ register quad by copying the individual sub-registers.
4584   if (AArch64::QQQQRegClass.contains(DestReg) &&
4585       AArch64::QQQQRegClass.contains(SrcReg)) {
4586     static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1,
4587                                        AArch64::qsub2, AArch64::qsub3};
4588     copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8,
4589                      Indices);
4590     return;
4591   }
4592 
4593   // Copy a QQQ register triple by copying the individual sub-registers.
4594   if (AArch64::QQQRegClass.contains(DestReg) &&
4595       AArch64::QQQRegClass.contains(SrcReg)) {
4596     static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1,
4597                                        AArch64::qsub2};
4598     copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8,
4599                      Indices);
4600     return;
4601   }
4602 
4603   // Copy a QQ register pair by copying the individual sub-registers.
4604   if (AArch64::QQRegClass.contains(DestReg) &&
4605       AArch64::QQRegClass.contains(SrcReg)) {
4606     static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1};
4607     copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8,
4608                      Indices);
4609     return;
4610   }
4611 
4612   if (AArch64::XSeqPairsClassRegClass.contains(DestReg) &&
4613       AArch64::XSeqPairsClassRegClass.contains(SrcReg)) {
4614     static const unsigned Indices[] = {AArch64::sube64, AArch64::subo64};
4615     copyGPRRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRXrs,
4616                     AArch64::XZR, Indices);
4617     return;
4618   }
4619 
4620   if (AArch64::WSeqPairsClassRegClass.contains(DestReg) &&
4621       AArch64::WSeqPairsClassRegClass.contains(SrcReg)) {
4622     static const unsigned Indices[] = {AArch64::sube32, AArch64::subo32};
4623     copyGPRRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRWrs,
4624                     AArch64::WZR, Indices);
4625     return;
4626   }
4627 
4628   if (AArch64::FPR128RegClass.contains(DestReg) &&
4629       AArch64::FPR128RegClass.contains(SrcReg)) {
4630     if (Subtarget.hasSVEorSME() && !Subtarget.isNeonAvailable())
4631       BuildMI(MBB, I, DL, get(AArch64::ORR_ZZZ))
4632           .addReg(AArch64::Z0 + (DestReg - AArch64::Q0), RegState::Define)
4633           .addReg(AArch64::Z0 + (SrcReg - AArch64::Q0))
4634           .addReg(AArch64::Z0 + (SrcReg - AArch64::Q0));
4635     else if (Subtarget.hasNEON())
4636       BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg)
4637           .addReg(SrcReg)
4638           .addReg(SrcReg, getKillRegState(KillSrc));
4639     else {
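      // Neither NEON nor SVE is usable here, so bounce the value through the
      // stack: a pre-indexed store of the source Q register followed by a
      // pre-indexed load into the destination.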
4640       BuildMI(MBB, I, DL, get(AArch64::STRQpre))
4641           .addReg(AArch64::SP, RegState::Define)
4642           .addReg(SrcReg, getKillRegState(KillSrc))
4643           .addReg(AArch64::SP)
4644           .addImm(-16);
4645       BuildMI(MBB, I, DL, get(AArch64::LDRQpre))
4646           .addReg(AArch64::SP, RegState::Define)
4647           .addReg(DestReg, RegState::Define)
4648           .addReg(AArch64::SP)
4649           .addImm(16);
4650     }
4651     return;
4652   }
4653 
4654   if (AArch64::FPR64RegClass.contains(DestReg) &&
4655       AArch64::FPR64RegClass.contains(SrcReg)) {
4656     BuildMI(MBB, I, DL, get(AArch64::FMOVDr), DestReg)
4657         .addReg(SrcReg, getKillRegState(KillSrc));
4658     return;
4659   }
4660 
4661   if (AArch64::FPR32RegClass.contains(DestReg) &&
4662       AArch64::FPR32RegClass.contains(SrcReg)) {
4663     BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
4664         .addReg(SrcReg, getKillRegState(KillSrc));
4665     return;
4666   }
4667 
4668   if (AArch64::FPR16RegClass.contains(DestReg) &&
4669       AArch64::FPR16RegClass.contains(SrcReg)) {
4670     DestReg =
4671         RI.getMatchingSuperReg(DestReg, AArch64::hsub, &AArch64::FPR32RegClass);
4672     SrcReg =
4673         RI.getMatchingSuperReg(SrcReg, AArch64::hsub, &AArch64::FPR32RegClass);
4674     BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
4675         .addReg(SrcReg, getKillRegState(KillSrc));
4676     return;
4677   }
4678 
4679   if (AArch64::FPR8RegClass.contains(DestReg) &&
4680       AArch64::FPR8RegClass.contains(SrcReg)) {
4681     DestReg =
4682         RI.getMatchingSuperReg(DestReg, AArch64::bsub, &AArch64::FPR32RegClass);
4683     SrcReg =
4684         RI.getMatchingSuperReg(SrcReg, AArch64::bsub, &AArch64::FPR32RegClass);
4685     BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
4686         .addReg(SrcReg, getKillRegState(KillSrc));
4687     return;
4688   }
4689 
4690   // Copies between GPR64 and FPR64.
4691   if (AArch64::FPR64RegClass.contains(DestReg) &&
4692       AArch64::GPR64RegClass.contains(SrcReg)) {
4693     BuildMI(MBB, I, DL, get(AArch64::FMOVXDr), DestReg)
4694         .addReg(SrcReg, getKillRegState(KillSrc));
4695     return;
4696   }
4697   if (AArch64::GPR64RegClass.contains(DestReg) &&
4698       AArch64::FPR64RegClass.contains(SrcReg)) {
4699     BuildMI(MBB, I, DL, get(AArch64::FMOVDXr), DestReg)
4700         .addReg(SrcReg, getKillRegState(KillSrc));
4701     return;
4702   }
4703   // Copies between GPR32 and FPR32.
4704   if (AArch64::FPR32RegClass.contains(DestReg) &&
4705       AArch64::GPR32RegClass.contains(SrcReg)) {
4706     BuildMI(MBB, I, DL, get(AArch64::FMOVWSr), DestReg)
4707         .addReg(SrcReg, getKillRegState(KillSrc));
4708     return;
4709   }
4710   if (AArch64::GPR32RegClass.contains(DestReg) &&
4711       AArch64::FPR32RegClass.contains(SrcReg)) {
4712     BuildMI(MBB, I, DL, get(AArch64::FMOVSWr), DestReg)
4713         .addReg(SrcReg, getKillRegState(KillSrc));
4714     return;
4715   }
4716 
4717   if (DestReg == AArch64::NZCV) {
4718     assert(AArch64::GPR64RegClass.contains(SrcReg) && "Invalid NZCV copy");
4719     BuildMI(MBB, I, DL, get(AArch64::MSR))
4720         .addImm(AArch64SysReg::NZCV)
4721         .addReg(SrcReg, getKillRegState(KillSrc))
4722         .addReg(AArch64::NZCV, RegState::Implicit | RegState::Define);
4723     return;
4724   }
4725 
4726   if (SrcReg == AArch64::NZCV) {
4727     assert(AArch64::GPR64RegClass.contains(DestReg) && "Invalid NZCV copy");
4728     BuildMI(MBB, I, DL, get(AArch64::MRS), DestReg)
4729         .addImm(AArch64SysReg::NZCV)
4730         .addReg(AArch64::NZCV, RegState::Implicit | getKillRegState(KillSrc));
4731     return;
4732   }
4733 
4734 #ifndef NDEBUG
4735   const TargetRegisterInfo &TRI = getRegisterInfo();
4736   errs() << TRI.getRegAsmName(DestReg) << " = COPY "
4737          << TRI.getRegAsmName(SrcReg) << "\n";
4738 #endif
4739   llvm_unreachable("unimplemented reg-to-reg copy");
4740 }
4741 
4742 static void storeRegPairToStackSlot(const TargetRegisterInfo &TRI,
4743                                     MachineBasicBlock &MBB,
4744                                     MachineBasicBlock::iterator InsertBefore,
4745                                     const MCInstrDesc &MCID,
4746                                     Register SrcReg, bool IsKill,
4747                                     unsigned SubIdx0, unsigned SubIdx1, int FI,
4748                                     MachineMemOperand *MMO) {
4749   Register SrcReg0 = SrcReg;
4750   Register SrcReg1 = SrcReg;
4751   if (SrcReg.isPhysical()) {
4752     SrcReg0 = TRI.getSubReg(SrcReg, SubIdx0);
4753     SubIdx0 = 0;
4754     SrcReg1 = TRI.getSubReg(SrcReg, SubIdx1);
4755     SubIdx1 = 0;
4756   }
4757   BuildMI(MBB, InsertBefore, DebugLoc(), MCID)
4758       .addReg(SrcReg0, getKillRegState(IsKill), SubIdx0)
4759       .addReg(SrcReg1, getKillRegState(IsKill), SubIdx1)
4760       .addFrameIndex(FI)
4761       .addImm(0)
4762       .addMemOperand(MMO);
4763 }
4764 
4765 void AArch64InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
4766                                            MachineBasicBlock::iterator MBBI,
4767                                            Register SrcReg, bool isKill, int FI,
4768                                            const TargetRegisterClass *RC,
4769                                            const TargetRegisterInfo *TRI,
4770                                            Register VReg) const {
4771   MachineFunction &MF = *MBB.getParent();
4772   MachineFrameInfo &MFI = MF.getFrameInfo();
4773 
4774   MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
4775   MachineMemOperand *MMO =
4776       MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore,
4777                               MFI.getObjectSize(FI), MFI.getObjectAlign(FI));
4778   unsigned Opc = 0;
4779   bool Offset = true;
4780   MCRegister PNRReg = MCRegister::NoRegister;
4781   unsigned StackID = TargetStackID::Default;
4782   switch (TRI->getSpillSize(*RC)) {
4783   case 1:
4784     if (AArch64::FPR8RegClass.hasSubClassEq(RC))
4785       Opc = AArch64::STRBui;
4786     break;
4787   case 2:
4788     if (AArch64::FPR16RegClass.hasSubClassEq(RC))
4789       Opc = AArch64::STRHui;
4790     else if (AArch64::PPRRegClass.hasSubClassEq(RC)) {
4791       assert(Subtarget.hasSVEorSME() &&
4792              "Unexpected register store without SVE store instructions");
4793       Opc = AArch64::STR_PXI;
4794       StackID = TargetStackID::ScalableVector;
4795     } else if (AArch64::PNRRegClass.hasSubClassEq(RC)) {
4796       assert((Subtarget.hasSVE2p1() || Subtarget.hasSME2()) &&
4797              "Unexpected register store without SVE2p1 or SME2");
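      // A predicate-as-counter register is spilled through its aliased P
      // register: for a virtual register, copy into a PPR vreg first; for a
      // physical register, remap PN<k> to P<k> and use the normal SVE
      // predicate store.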
4798       if (SrcReg.isVirtual()) {
4799         auto NewSrcReg =
4800             MF.getRegInfo().createVirtualRegister(&AArch64::PPRRegClass);
4801         BuildMI(MBB, MBBI, DebugLoc(), get(TargetOpcode::COPY), NewSrcReg)
4802             .addReg(SrcReg);
4803         SrcReg = NewSrcReg;
4804       } else
4805         SrcReg = (SrcReg - AArch64::PN0) + AArch64::P0;
4806       Opc = AArch64::STR_PXI;
4807       StackID = TargetStackID::ScalableVector;
4808     }
4809     break;
4810   case 4:
4811     if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
4812       Opc = AArch64::STRWui;
4813       if (SrcReg.isVirtual())
4814         MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR32RegClass);
4815       else
4816         assert(SrcReg != AArch64::WSP);
4817     } else if (AArch64::FPR32RegClass.hasSubClassEq(RC))
4818       Opc = AArch64::STRSui;
4819     else if (AArch64::PPR2RegClass.hasSubClassEq(RC)) {
4820       Opc = AArch64::STR_PPXI;
4821       StackID = TargetStackID::ScalableVector;
4822     }
4823     break;
4824   case 8:
4825     if (AArch64::GPR64allRegClass.hasSubClassEq(RC)) {
4826       Opc = AArch64::STRXui;
4827       if (SrcReg.isVirtual())
4828         MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR64RegClass);
4829       else
4830         assert(SrcReg != AArch64::SP);
4831     } else if (AArch64::FPR64RegClass.hasSubClassEq(RC)) {
4832       Opc = AArch64::STRDui;
4833     } else if (AArch64::WSeqPairsClassRegClass.hasSubClassEq(RC)) {
4834       storeRegPairToStackSlot(getRegisterInfo(), MBB, MBBI,
4835                               get(AArch64::STPWi), SrcReg, isKill,
4836                               AArch64::sube32, AArch64::subo32, FI, MMO);
4837       return;
4838     }
4839     break;
4840   case 16:
4841     if (AArch64::FPR128RegClass.hasSubClassEq(RC))
4842       Opc = AArch64::STRQui;
4843     else if (AArch64::DDRegClass.hasSubClassEq(RC)) {
4844       assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
4845       Opc = AArch64::ST1Twov1d;
4846       Offset = false;
4847     } else if (AArch64::XSeqPairsClassRegClass.hasSubClassEq(RC)) {
4848       storeRegPairToStackSlot(getRegisterInfo(), MBB, MBBI,
4849                               get(AArch64::STPXi), SrcReg, isKill,
4850                               AArch64::sube64, AArch64::subo64, FI, MMO);
4851       return;
4852     } else if (AArch64::ZPRRegClass.hasSubClassEq(RC)) {
4853       assert(Subtarget.hasSVEorSME() &&
4854              "Unexpected register store without SVE store instructions");
4855       Opc = AArch64::STR_ZXI;
4856       StackID = TargetStackID::ScalableVector;
4857     }
4858     break;
4859   case 24:
4860     if (AArch64::DDDRegClass.hasSubClassEq(RC)) {
4861       assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
4862       Opc = AArch64::ST1Threev1d;
4863       Offset = false;
4864     }
4865     break;
4866   case 32:
4867     if (AArch64::DDDDRegClass.hasSubClassEq(RC)) {
4868       assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
4869       Opc = AArch64::ST1Fourv1d;
4870       Offset = false;
4871     } else if (AArch64::QQRegClass.hasSubClassEq(RC)) {
4872       assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
4873       Opc = AArch64::ST1Twov2d;
4874       Offset = false;
4875     } else if (AArch64::ZPR2RegClass.hasSubClassEq(RC) ||
4876                AArch64::ZPR2StridedOrContiguousRegClass.hasSubClassEq(RC)) {
4877       assert(Subtarget.hasSVEorSME() &&
4878              "Unexpected register store without SVE store instructions");
4879       Opc = AArch64::STR_ZZXI;
4880       StackID = TargetStackID::ScalableVector;
4881     }
4882     break;
4883   case 48:
4884     if (AArch64::QQQRegClass.hasSubClassEq(RC)) {
4885       assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
4886       Opc = AArch64::ST1Threev2d;
4887       Offset = false;
4888     } else if (AArch64::ZPR3RegClass.hasSubClassEq(RC)) {
4889       assert(Subtarget.hasSVEorSME() &&
4890              "Unexpected register store without SVE store instructions");
4891       Opc = AArch64::STR_ZZZXI;
4892       StackID = TargetStackID::ScalableVector;
4893     }
4894     break;
4895   case 64:
4896     if (AArch64::QQQQRegClass.hasSubClassEq(RC)) {
4897       assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
4898       Opc = AArch64::ST1Fourv2d;
4899       Offset = false;
4900     } else if (AArch64::ZPR4RegClass.hasSubClassEq(RC) ||
4901                AArch64::ZPR4StridedOrContiguousRegClass.hasSubClassEq(RC)) {
4902       assert(Subtarget.hasSVEorSME() &&
4903              "Unexpected register store without SVE store instructions");
4904       Opc = AArch64::STR_ZZZZXI;
4905       StackID = TargetStackID::ScalableVector;
4906     }
4907     break;
4908   }
4909   assert(Opc && "Unknown register class");
4910   MFI.setStackID(FI, StackID);
4911 
4912   const MachineInstrBuilder MI = BuildMI(MBB, MBBI, DebugLoc(), get(Opc))
4913                                      .addReg(SrcReg, getKillRegState(isKill))
4914                                      .addFrameIndex(FI);
4915 
4916   if (Offset)
4917     MI.addImm(0);
4918   if (PNRReg.isValid())
4919     MI.addDef(PNRReg, RegState::Implicit);
4920   MI.addMemOperand(MMO);
4921 }
4922 
4923 static void loadRegPairFromStackSlot(const TargetRegisterInfo &TRI,
4924                                      MachineBasicBlock &MBB,
4925                                      MachineBasicBlock::iterator InsertBefore,
4926                                      const MCInstrDesc &MCID,
4927                                      Register DestReg, unsigned SubIdx0,
4928                                      unsigned SubIdx1, int FI,
4929                                      MachineMemOperand *MMO) {
4930   Register DestReg0 = DestReg;
4931   Register DestReg1 = DestReg;
4932   bool IsUndef = true;
4933   if (DestReg.isPhysical()) {
4934     DestReg0 = TRI.getSubReg(DestReg, SubIdx0);
4935     SubIdx0 = 0;
4936     DestReg1 = TRI.getSubReg(DestReg, SubIdx1);
4937     SubIdx1 = 0;
4938     IsUndef = false;
4939   }
4940   BuildMI(MBB, InsertBefore, DebugLoc(), MCID)
4941       .addReg(DestReg0, RegState::Define | getUndefRegState(IsUndef), SubIdx0)
4942       .addReg(DestReg1, RegState::Define | getUndefRegState(IsUndef), SubIdx1)
4943       .addFrameIndex(FI)
4944       .addImm(0)
4945       .addMemOperand(MMO);
4946 }
4947 
4948 void AArch64InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
4949                                             MachineBasicBlock::iterator MBBI,
4950                                             Register DestReg, int FI,
4951                                             const TargetRegisterClass *RC,
4952                                             const TargetRegisterInfo *TRI,
4953                                             Register VReg) const {
4954   MachineFunction &MF = *MBB.getParent();
4955   MachineFrameInfo &MFI = MF.getFrameInfo();
4956   MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
4957   MachineMemOperand *MMO =
4958       MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOLoad,
4959                               MFI.getObjectSize(FI), MFI.getObjectAlign(FI));
4960 
4961   unsigned Opc = 0;
4962   bool Offset = true;
4963   unsigned StackID = TargetStackID::Default;
4964   Register PNRReg = MCRegister::NoRegister;
4965   switch (TRI->getSpillSize(*RC)) {
4966   case 1:
4967     if (AArch64::FPR8RegClass.hasSubClassEq(RC))
4968       Opc = AArch64::LDRBui;
4969     break;
4970   case 2:
4971     if (AArch64::FPR16RegClass.hasSubClassEq(RC))
4972       Opc = AArch64::LDRHui;
4973     else if (AArch64::PPRRegClass.hasSubClassEq(RC)) {
4974       assert(Subtarget.hasSVEorSME() &&
4975              "Unexpected register load without SVE load instructions");
4976       Opc = AArch64::LDR_PXI;
4977       StackID = TargetStackID::ScalableVector;
4978     } else if (AArch64::PNRRegClass.hasSubClassEq(RC)) {
4979       assert((Subtarget.hasSVE2p1() || Subtarget.hasSME2()) &&
4980              "Unexpected register load without SVE2p1 or SME2");
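      // Reload through the aliased P register. For a virtual destination the
      // value is copied back into the predicate-as-counter register by the
      // trailing COPY emitted at the end of this function; a physical
      // destination is remapped to P<k> and gets an implicit def of PN<k>.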
4981       PNRReg = DestReg;
4982       if (DestReg.isVirtual())
4983         DestReg = MF.getRegInfo().createVirtualRegister(&AArch64::PPRRegClass);
4984       else
4985         DestReg = (DestReg - AArch64::PN0) + AArch64::P0;
4986       Opc = AArch64::LDR_PXI;
4987       StackID = TargetStackID::ScalableVector;
4988     }
4989     break;
4990   case 4:
4991     if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
4992       Opc = AArch64::LDRWui;
4993       if (DestReg.isVirtual())
4994         MF.getRegInfo().constrainRegClass(DestReg, &AArch64::GPR32RegClass);
4995       else
4996         assert(DestReg != AArch64::WSP);
4997     } else if (AArch64::FPR32RegClass.hasSubClassEq(RC))
4998       Opc = AArch64::LDRSui;
4999     else if (AArch64::PPR2RegClass.hasSubClassEq(RC)) {
5000       Opc = AArch64::LDR_PPXI;
5001       StackID = TargetStackID::ScalableVector;
5002     }
5003     break;
5004   case 8:
5005     if (AArch64::GPR64allRegClass.hasSubClassEq(RC)) {
5006       Opc = AArch64::LDRXui;
5007       if (DestReg.isVirtual())
5008         MF.getRegInfo().constrainRegClass(DestReg, &AArch64::GPR64RegClass);
5009       else
5010         assert(DestReg != AArch64::SP);
5011     } else if (AArch64::FPR64RegClass.hasSubClassEq(RC)) {
5012       Opc = AArch64::LDRDui;
5013     } else if (AArch64::WSeqPairsClassRegClass.hasSubClassEq(RC)) {
5014       loadRegPairFromStackSlot(getRegisterInfo(), MBB, MBBI,
5015                                get(AArch64::LDPWi), DestReg, AArch64::sube32,
5016                                AArch64::subo32, FI, MMO);
5017       return;
5018     }
5019     break;
5020   case 16:
5021     if (AArch64::FPR128RegClass.hasSubClassEq(RC))
5022       Opc = AArch64::LDRQui;
5023     else if (AArch64::DDRegClass.hasSubClassEq(RC)) {
5024       assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
5025       Opc = AArch64::LD1Twov1d;
5026       Offset = false;
5027     } else if (AArch64::XSeqPairsClassRegClass.hasSubClassEq(RC)) {
5028       loadRegPairFromStackSlot(getRegisterInfo(), MBB, MBBI,
5029                                get(AArch64::LDPXi), DestReg, AArch64::sube64,
5030                                AArch64::subo64, FI, MMO);
5031       return;
5032     } else if (AArch64::ZPRRegClass.hasSubClassEq(RC)) {
5033       assert(Subtarget.hasSVEorSME() &&
5034              "Unexpected register load without SVE load instructions");
5035       Opc = AArch64::LDR_ZXI;
5036       StackID = TargetStackID::ScalableVector;
5037     }
5038     break;
5039   case 24:
5040     if (AArch64::DDDRegClass.hasSubClassEq(RC)) {
5041       assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
5042       Opc = AArch64::LD1Threev1d;
5043       Offset = false;
5044     }
5045     break;
5046   case 32:
5047     if (AArch64::DDDDRegClass.hasSubClassEq(RC)) {
5048       assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
5049       Opc = AArch64::LD1Fourv1d;
5050       Offset = false;
5051     } else if (AArch64::QQRegClass.hasSubClassEq(RC)) {
5052       assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
5053       Opc = AArch64::LD1Twov2d;
5054       Offset = false;
5055     } else if (AArch64::ZPR2RegClass.hasSubClassEq(RC) ||
5056                AArch64::ZPR2StridedOrContiguousRegClass.hasSubClassEq(RC)) {
5057       assert(Subtarget.hasSVEorSME() &&
5058              "Unexpected register load without SVE load instructions");
5059       Opc = AArch64::LDR_ZZXI;
5060       StackID = TargetStackID::ScalableVector;
5061     }
5062     break;
5063   case 48:
5064     if (AArch64::QQQRegClass.hasSubClassEq(RC)) {
5065       assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
5066       Opc = AArch64::LD1Threev2d;
5067       Offset = false;
5068     } else if (AArch64::ZPR3RegClass.hasSubClassEq(RC)) {
5069       assert(Subtarget.hasSVEorSME() &&
5070              "Unexpected register load without SVE load instructions");
5071       Opc = AArch64::LDR_ZZZXI;
5072       StackID = TargetStackID::ScalableVector;
5073     }
5074     break;
5075   case 64:
5076     if (AArch64::QQQQRegClass.hasSubClassEq(RC)) {
5077       assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
5078       Opc = AArch64::LD1Fourv2d;
5079       Offset = false;
5080     } else if (AArch64::ZPR4RegClass.hasSubClassEq(RC) ||
5081                AArch64::ZPR4StridedOrContiguousRegClass.hasSubClassEq(RC)) {
5082       assert(Subtarget.hasSVEorSME() &&
5083              "Unexpected register load without SVE load instructions");
5084       Opc = AArch64::LDR_ZZZZXI;
5085       StackID = TargetStackID::ScalableVector;
5086     }
5087     break;
5088   }
5089 
5090   assert(Opc && "Unknown register class");
5091   MFI.setStackID(FI, StackID);
5092 
5093   const MachineInstrBuilder MI = BuildMI(MBB, MBBI, DebugLoc(), get(Opc))
5094                                      .addReg(DestReg, getDefRegState(true))
5095                                      .addFrameIndex(FI);
5096   if (Offset)
5097     MI.addImm(0);
5098   if (PNRReg.isValid() && !PNRReg.isVirtual())
5099     MI.addDef(PNRReg, RegState::Implicit);
5100   MI.addMemOperand(MMO);
5101 
5102   if (PNRReg.isValid() && PNRReg.isVirtual())
5103     BuildMI(MBB, MBBI, DebugLoc(), get(TargetOpcode::COPY), PNRReg)
5104         .addReg(DestReg);
5105 }
5106 
5107 bool llvm::isNZCVTouchedInInstructionRange(const MachineInstr &DefMI,
5108                                            const MachineInstr &UseMI,
5109                                            const TargetRegisterInfo *TRI) {
5110   return any_of(instructionsWithoutDebug(std::next(DefMI.getIterator()),
5111                                          UseMI.getIterator()),
5112                 [TRI](const MachineInstr &I) {
5113                   return I.modifiesRegister(AArch64::NZCV, TRI) ||
5114                          I.readsRegister(AArch64::NZCV, TRI);
5115                 });
5116 }
5117 
5118 void AArch64InstrInfo::decomposeStackOffsetForDwarfOffsets(
5119     const StackOffset &Offset, int64_t &ByteSized, int64_t &VGSized) {
5120   // The smallest scalable element supported by scaled SVE addressing
5121   // modes is a predicate, which is 2 scalable bytes in size. So the scalable
5122   // byte offset must always be a multiple of 2.
5123   assert(Offset.getScalable() % 2 == 0 && "Invalid frame offset");
5124 
5125   // VGSized offsets are divided by '2', because the VG register is the
5126   // number of 64-bit granules, as opposed to the 128-bit vector chunks
5127   // by which the 'n' in e.g. MVT::nxv1i8 is modelled.
5128   // So, for a stack offset of 16 MVT::nxv1i8's, the size is n x 16 bytes.
5129   // VG = n * 2 and the dwarf offset must be VG * 8 bytes.
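  //
  // For example (illustrative): a StackOffset of 16 fixed bytes plus 32
  // scalable bytes yields ByteSized = 16 and VGSized = 16, i.e. a DWARF
  // offset of 16 + 16 * VG bytes.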
5130   ByteSized = Offset.getFixed();
5131   VGSized = Offset.getScalable() / 2;
5132 }
5133 
5134 /// Returns the parts into which this frame offset can be decomposed for
5135 /// the purpose of describing a frame offset.
5136 /// For a non-scalable offset this is simply its byte size.
5137 void AArch64InstrInfo::decomposeStackOffsetForFrameOffsets(
5138     const StackOffset &Offset, int64_t &NumBytes, int64_t &NumPredicateVectors,
5139     int64_t &NumDataVectors) {
5140   // The smallest scalable element supported by scaled SVE addressing
5141   // modes is a predicate, which is 2 scalable bytes in size. So the scalable
5142   // byte offset must always be a multiple of 2.
5143   assert(Offset.getScalable() % 2 == 0 && "Invalid frame offset");
5144 
5145   NumBytes = Offset.getFixed();
5146   NumDataVectors = 0;
5147   NumPredicateVectors = Offset.getScalable() / 2;
5148   // This method is used to get the offsets needed to adjust the frame offset.
5149   // If the offset requires ADDPL and would need more than two ADDPL
5150   // instructions, part of the offset is folded into NumDataVectors so that
5151   // ADDVL can be used for that part, reducing the number of ADDPL instructions.
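  // For example (illustrative): a scalable offset of 32 bytes gives
  // NumPredicateVectors = 16, a multiple of 8, so it is folded into
  // NumDataVectors = 2 (a single ADDVL) leaving NumPredicateVectors = 0.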
5152   if (NumPredicateVectors % 8 == 0 || NumPredicateVectors < -64 ||
5153       NumPredicateVectors > 62) {
5154     NumDataVectors = NumPredicateVectors / 8;
5155     NumPredicateVectors -= NumDataVectors * 8;
5156   }
5157 }
5158 
5159 // Convenience function to create a DWARF expression for
5160 //   Expr + NumBytes + NumVGScaledBytes * AArch64::VG
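// For example (illustrative), NumBytes = 16 and NumVGScaledBytes = 8 append
//   DW_OP_consts 16, DW_OP_plus,
//   DW_OP_consts 8, DW_OP_bregx VG 0, DW_OP_mul, DW_OP_plus
// and extend the comment with " + 16 + 8 * VG".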
5161 static void appendVGScaledOffsetExpr(SmallVectorImpl<char> &Expr, int NumBytes,
5162                                      int NumVGScaledBytes, unsigned VG,
5163                                      llvm::raw_string_ostream &Comment) {
5164   uint8_t buffer[16];
5165 
5166   if (NumBytes) {
5167     Expr.push_back(dwarf::DW_OP_consts);
5168     Expr.append(buffer, buffer + encodeSLEB128(NumBytes, buffer));
5169     Expr.push_back((uint8_t)dwarf::DW_OP_plus);
5170     Comment << (NumBytes < 0 ? " - " : " + ") << std::abs(NumBytes);
5171   }
5172 
5173   if (NumVGScaledBytes) {
5174     Expr.push_back((uint8_t)dwarf::DW_OP_consts);
5175     Expr.append(buffer, buffer + encodeSLEB128(NumVGScaledBytes, buffer));
5176 
5177     Expr.push_back((uint8_t)dwarf::DW_OP_bregx);
5178     Expr.append(buffer, buffer + encodeULEB128(VG, buffer));
5179     Expr.push_back(0);
5180 
5181     Expr.push_back((uint8_t)dwarf::DW_OP_mul);
5182     Expr.push_back((uint8_t)dwarf::DW_OP_plus);
5183 
5184     Comment << (NumVGScaledBytes < 0 ? " - " : " + ")
5185             << std::abs(NumVGScaledBytes) << " * VG";
5186   }
5187 }
5188 
5189 // Creates an MCCFIInstruction:
5190 //    { DW_CFA_def_cfa_expression, ULEB128 (sizeof expr), expr }
5191 static MCCFIInstruction createDefCFAExpression(const TargetRegisterInfo &TRI,
5192                                                unsigned Reg,
5193                                                const StackOffset &Offset) {
5194   int64_t NumBytes, NumVGScaledBytes;
5195   AArch64InstrInfo::decomposeStackOffsetForDwarfOffsets(Offset, NumBytes,
5196                                                         NumVGScaledBytes);
5197   std::string CommentBuffer;
5198   llvm::raw_string_ostream Comment(CommentBuffer);
5199 
5200   if (Reg == AArch64::SP)
5201     Comment << "sp";
5202   else if (Reg == AArch64::FP)
5203     Comment << "fp";
5204   else
5205     Comment << printReg(Reg, &TRI);
5206 
5207   // Build up the expression (Reg + NumBytes + NumVGScaledBytes * AArch64::VG)
5208   SmallString<64> Expr;
5209   unsigned DwarfReg = TRI.getDwarfRegNum(Reg, true);
5210   Expr.push_back((uint8_t)(dwarf::DW_OP_breg0 + DwarfReg));
5211   Expr.push_back(0);
5212   appendVGScaledOffsetExpr(Expr, NumBytes, NumVGScaledBytes,
5213                            TRI.getDwarfRegNum(AArch64::VG, true), Comment);
5214 
5215   // Wrap this into DW_CFA_def_cfa.
5216   SmallString<64> DefCfaExpr;
5217   DefCfaExpr.push_back(dwarf::DW_CFA_def_cfa_expression);
5218   uint8_t buffer[16];
5219   DefCfaExpr.append(buffer, buffer + encodeULEB128(Expr.size(), buffer));
5220   DefCfaExpr.append(Expr.str());
5221   return MCCFIInstruction::createEscape(nullptr, DefCfaExpr.str(), SMLoc(),
5222                                         Comment.str());
5223 }
5224 
5225 MCCFIInstruction llvm::createDefCFA(const TargetRegisterInfo &TRI,
5226                                     unsigned FrameReg, unsigned Reg,
5227                                     const StackOffset &Offset,
5228                                     bool LastAdjustmentWasScalable) {
5229   if (Offset.getScalable())
5230     return createDefCFAExpression(TRI, Reg, Offset);
5231 
5232   if (FrameReg == Reg && !LastAdjustmentWasScalable)
5233     return MCCFIInstruction::cfiDefCfaOffset(nullptr, int(Offset.getFixed()));
5234 
5235   unsigned DwarfReg = TRI.getDwarfRegNum(Reg, true);
5236   return MCCFIInstruction::cfiDefCfa(nullptr, DwarfReg, (int)Offset.getFixed());
5237 }
5238 
5239 MCCFIInstruction llvm::createCFAOffset(const TargetRegisterInfo &TRI,
5240                                        unsigned Reg,
5241                                        const StackOffset &OffsetFromDefCFA) {
5242   int64_t NumBytes, NumVGScaledBytes;
5243   AArch64InstrInfo::decomposeStackOffsetForDwarfOffsets(
5244       OffsetFromDefCFA, NumBytes, NumVGScaledBytes);
5245 
5246   unsigned DwarfReg = TRI.getDwarfRegNum(Reg, true);
5247 
5248   // Non-scalable offsets can use DW_CFA_offset directly.
5249   if (!NumVGScaledBytes)
5250     return MCCFIInstruction::createOffset(nullptr, DwarfReg, NumBytes);
5251 
5252   std::string CommentBuffer;
5253   llvm::raw_string_ostream Comment(CommentBuffer);
5254   Comment << printReg(Reg, &TRI) << "  @ cfa";
5255 
5256   // Build up expression (NumBytes + NumVGScaledBytes * AArch64::VG)
5257   SmallString<64> OffsetExpr;
5258   appendVGScaledOffsetExpr(OffsetExpr, NumBytes, NumVGScaledBytes,
5259                            TRI.getDwarfRegNum(AArch64::VG, true), Comment);
5260 
5261   // Wrap this into DW_CFA_expression
5262   SmallString<64> CfaExpr;
5263   CfaExpr.push_back(dwarf::DW_CFA_expression);
5264   uint8_t buffer[16];
5265   CfaExpr.append(buffer, buffer + encodeULEB128(DwarfReg, buffer));
5266   CfaExpr.append(buffer, buffer + encodeULEB128(OffsetExpr.size(), buffer));
5267   CfaExpr.append(OffsetExpr.str());
5268 
5269   return MCCFIInstruction::createEscape(nullptr, CfaExpr.str(), SMLoc(),
5270                                         Comment.str());
5271 }
5272 
5273 // Helper function to emit a frame offset adjustment from a given
5274 // pointer (SrcReg), stored into DestReg. This function is explicit
5275 // in that the caller must supply the opcode to use.
5276 static void emitFrameOffsetAdj(MachineBasicBlock &MBB,
5277                                MachineBasicBlock::iterator MBBI,
5278                                const DebugLoc &DL, unsigned DestReg,
5279                                unsigned SrcReg, int64_t Offset, unsigned Opc,
5280                                const TargetInstrInfo *TII,
5281                                MachineInstr::MIFlag Flag, bool NeedsWinCFI,
5282                                bool *HasWinCFI, bool EmitCFAOffset,
5283                                StackOffset CFAOffset, unsigned FrameReg) {
5284   int Sign = 1;
5285   unsigned MaxEncoding, ShiftSize;
5286   switch (Opc) {
5287   case AArch64::ADDXri:
5288   case AArch64::ADDSXri:
5289   case AArch64::SUBXri:
5290   case AArch64::SUBSXri:
5291     MaxEncoding = 0xfff;
5292     ShiftSize = 12;
5293     break;
5294   case AArch64::ADDVL_XXI:
5295   case AArch64::ADDPL_XXI:
5296   case AArch64::ADDSVL_XXI:
5297   case AArch64::ADDSPL_XXI:
5298     MaxEncoding = 31;
5299     ShiftSize = 0;
5300     if (Offset < 0) {
5301       MaxEncoding = 32;
5302       Sign = -1;
5303       Offset = -Offset;
5304     }
5305     break;
5306   default:
5307     llvm_unreachable("Unsupported opcode");
5308   }
5309 
5310   // `Offset` can be in bytes or in "scalable bytes".
5311   int VScale = 1;
5312   if (Opc == AArch64::ADDVL_XXI || Opc == AArch64::ADDSVL_XXI)
5313     VScale = 16;
5314   else if (Opc == AArch64::ADDPL_XXI || Opc == AArch64::ADDSPL_XXI)
5315     VScale = 2;
5316 
5317   // FIXME: If the offset won't fit in 24 bits, compute the offset into a
5318   // scratch register.  If DestReg is a virtual register, use it as the
5319   // scratch register; otherwise, create a new virtual register (to be
5320   // replaced by the scavenger at the end of PEI).  That case can be optimized
5321   // slightly if DestReg is SP which is always 16-byte aligned, so the scratch
5322   // register can be loaded with offset%8 and the add/sub can use an extending
5323   // instruction with LSL#3.
5324   // Currently the function handles any offsets but generates a poor sequence
5325   // of code.
5326   //  assert(Offset < (1 << 24) && "unimplemented reg plus immediate");
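  //
  // Worked example (illustrative) for Opc == ADDXri and Offset == 0x45678:
  // the loop below first emits "ADD Tmp, SrcReg, #0x45, lsl #12" (covering
  // 0x45000) and then "ADD DestReg, Tmp, #0x678" for the remainder, since
  // each add encodes at most 12 bits, optionally shifted left by 12.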
5327 
5328   const unsigned MaxEncodableValue = MaxEncoding << ShiftSize;
5329   Register TmpReg = DestReg;
5330   if (TmpReg == AArch64::XZR)
5331     TmpReg = MBB.getParent()->getRegInfo().createVirtualRegister(
5332         &AArch64::GPR64RegClass);
5333   do {
5334     uint64_t ThisVal = std::min<uint64_t>(Offset, MaxEncodableValue);
5335     unsigned LocalShiftSize = 0;
5336     if (ThisVal > MaxEncoding) {
5337       ThisVal = ThisVal >> ShiftSize;
5338       LocalShiftSize = ShiftSize;
5339     }
5340     assert((ThisVal >> ShiftSize) <= MaxEncoding &&
5341            "Encoding cannot handle value that big");
5342 
5343     Offset -= ThisVal << LocalShiftSize;
5344     if (Offset == 0)
5345       TmpReg = DestReg;
5346     auto MBI = BuildMI(MBB, MBBI, DL, TII->get(Opc), TmpReg)
5347                    .addReg(SrcReg)
5348                    .addImm(Sign * (int)ThisVal);
5349     if (ShiftSize)
5350       MBI = MBI.addImm(
5351           AArch64_AM::getShifterImm(AArch64_AM::LSL, LocalShiftSize));
5352     MBI = MBI.setMIFlag(Flag);
5353 
5354     auto Change =
5355         VScale == 1
5356             ? StackOffset::getFixed(ThisVal << LocalShiftSize)
5357             : StackOffset::getScalable(VScale * (ThisVal << LocalShiftSize));
5358     if (Sign == -1 || Opc == AArch64::SUBXri || Opc == AArch64::SUBSXri)
5359       CFAOffset += Change;
5360     else
5361       CFAOffset -= Change;
5362     if (EmitCFAOffset && DestReg == TmpReg) {
5363       MachineFunction &MF = *MBB.getParent();
5364       const TargetSubtargetInfo &STI = MF.getSubtarget();
5365       const TargetRegisterInfo &TRI = *STI.getRegisterInfo();
5366 
5367       unsigned CFIIndex = MF.addFrameInst(
5368           createDefCFA(TRI, FrameReg, DestReg, CFAOffset, VScale != 1));
5369       BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
5370           .addCFIIndex(CFIIndex)
5371           .setMIFlags(Flag);
5372     }
5373 
5374     if (NeedsWinCFI) {
5375       assert(Sign == 1 && "SEH directives should always have a positive sign");
5376       int Imm = (int)(ThisVal << LocalShiftSize);
5377       if ((DestReg == AArch64::FP && SrcReg == AArch64::SP) ||
5378           (SrcReg == AArch64::FP && DestReg == AArch64::SP)) {
5379         if (HasWinCFI)
5380           *HasWinCFI = true;
5381         if (Imm == 0)
5382           BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_SetFP)).setMIFlag(Flag);
5383         else
5384           BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_AddFP))
5385               .addImm(Imm)
5386               .setMIFlag(Flag);
5387         assert(Offset == 0 && "Expected remaining offset to be zero to "
5388                               "emit a single SEH directive");
5389       } else if (DestReg == AArch64::SP) {
5390         if (HasWinCFI)
5391           *HasWinCFI = true;
5392         assert(SrcReg == AArch64::SP && "Unexpected SrcReg for SEH_StackAlloc");
5393         BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_StackAlloc))
5394             .addImm(Imm)
5395             .setMIFlag(Flag);
5396       }
5397     }
5398 
5399     SrcReg = TmpReg;
5400   } while (Offset);
5401 }
5402 
5403 void llvm::emitFrameOffset(MachineBasicBlock &MBB,
5404                            MachineBasicBlock::iterator MBBI, const DebugLoc &DL,
5405                            unsigned DestReg, unsigned SrcReg,
5406                            StackOffset Offset, const TargetInstrInfo *TII,
5407                            MachineInstr::MIFlag Flag, bool SetNZCV,
5408                            bool NeedsWinCFI, bool *HasWinCFI,
5409                            bool EmitCFAOffset, StackOffset CFAOffset,
5410                            unsigned FrameReg) {
5411   // If a function is marked as arm_locally_streaming, then the runtime value
5412   // of vscale in the prologue/epilogue differs from the runtime value of
5413   // vscale in the function's body. To avoid having to consider multiple vscales,
5414   // we can use `addsvl` to allocate any scalable stack-slots, which under
5415   // most circumstances will be only locals, not callee-save slots.
5416   const Function &F = MBB.getParent()->getFunction();
5417   bool UseSVL = F.hasFnAttribute("aarch64_pstate_sm_body");
5418 
5419   int64_t Bytes, NumPredicateVectors, NumDataVectors;
5420   AArch64InstrInfo::decomposeStackOffsetForFrameOffsets(
5421       Offset, Bytes, NumPredicateVectors, NumDataVectors);
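  // For example (illustrative): an Offset of 32 fixed + 32 scalable bytes
  // decomposes into Bytes = 32, NumDataVectors = 2, NumPredicateVectors = 0,
  // and is emitted below as an ADD #32 followed by an ADDVL #2 (ADDSVL #2 for
  // locally-streaming functions).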
5422 
5423   // First emit non-scalable frame offsets, or a simple 'mov'.
5424   if (Bytes || (!Offset && SrcReg != DestReg)) {
5425     assert((DestReg != AArch64::SP || Bytes % 8 == 0) &&
5426            "SP increment/decrement not 8-byte aligned");
5427     unsigned Opc = SetNZCV ? AArch64::ADDSXri : AArch64::ADDXri;
5428     if (Bytes < 0) {
5429       Bytes = -Bytes;
5430       Opc = SetNZCV ? AArch64::SUBSXri : AArch64::SUBXri;
5431     }
5432     emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, Bytes, Opc, TII, Flag,
5433                        NeedsWinCFI, HasWinCFI, EmitCFAOffset, CFAOffset,
5434                        FrameReg);
5435     CFAOffset += (Opc == AArch64::ADDXri || Opc == AArch64::ADDSXri)
5436                      ? StackOffset::getFixed(-Bytes)
5437                      : StackOffset::getFixed(Bytes);
5438     SrcReg = DestReg;
5439     FrameReg = DestReg;
5440   }
5441 
5442   assert(!(SetNZCV && (NumPredicateVectors || NumDataVectors)) &&
5443          "SetNZCV not supported with SVE vectors");
5444   assert(!(NeedsWinCFI && (NumPredicateVectors || NumDataVectors)) &&
5445          "WinCFI not supported with SVE vectors");
5446 
5447   if (NumDataVectors) {
5448     emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, NumDataVectors,
5449                        UseSVL ? AArch64::ADDSVL_XXI : AArch64::ADDVL_XXI,
5450                        TII, Flag, NeedsWinCFI, nullptr, EmitCFAOffset,
5451                        CFAOffset, FrameReg);
5452     CFAOffset += StackOffset::getScalable(-NumDataVectors * 16);
5453     SrcReg = DestReg;
5454   }
5455 
5456   if (NumPredicateVectors) {
5457     assert(DestReg != AArch64::SP && "Unaligned access to SP");
5458     emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, NumPredicateVectors,
5459                        UseSVL ? AArch64::ADDSPL_XXI : AArch64::ADDPL_XXI,
5460                        TII, Flag, NeedsWinCFI, nullptr, EmitCFAOffset,
5461                        CFAOffset, FrameReg);
5462   }
5463 }
5464 
5465 MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl(
5466     MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,
5467     MachineBasicBlock::iterator InsertPt, int FrameIndex,
5468     LiveIntervals *LIS, VirtRegMap *VRM) const {
5469   // This is a bit of a hack. Consider this instruction:
5470   //
5471   //   %0 = COPY %sp; GPR64all:%0
5472   //
5473   // We explicitly chose GPR64all for the virtual register so such a copy might
5474   // be eliminated by RegisterCoalescer. However, that may not be possible, and
5475   // %0 may even spill. We can't spill %sp, and since it is in the GPR64all
5476   // register class, TargetInstrInfo::foldMemoryOperand() is going to try.
5477   //
5478   // To prevent that, we are going to constrain the %0 register class here.
5479   if (MI.isFullCopy()) {
5480     Register DstReg = MI.getOperand(0).getReg();
5481     Register SrcReg = MI.getOperand(1).getReg();
5482     if (SrcReg == AArch64::SP && DstReg.isVirtual()) {
5483       MF.getRegInfo().constrainRegClass(DstReg, &AArch64::GPR64RegClass);
5484       return nullptr;
5485     }
5486     if (DstReg == AArch64::SP && SrcReg.isVirtual()) {
5487       MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR64RegClass);
5488       return nullptr;
5489     }
5490     // Nothing can be folded with a copy from/to NZCV.
5491     if (SrcReg == AArch64::NZCV || DstReg == AArch64::NZCV)
5492       return nullptr;
5493   }
5494 
5495   // Handle the case where a copy is being spilled or filled but the source
5496   // and destination register class don't match.  For example:
5497   //
5498   //   %0 = COPY %xzr; GPR64common:%0
5499   //
5500   // In this case we can still safely fold away the COPY and generate the
5501   // following spill code:
5502   //
5503   //   STRXui %xzr, %stack.0
5504   //
5505   // This also eliminates spilled cross register class COPYs (e.g. between x and
5506   // d regs) of the same size.  For example:
5507   //
5508   //   %0 = COPY %1; GPR64:%0, FPR64:%1
5509   //
5510   // will be filled as
5511   //
5512   //   LDRDui %0, fi<#0>
5513   //
5514   // instead of
5515   //
5516   //   LDRXui %Temp, fi<#0>
5517   //   %0 = FMOV %Temp
5518   //
5519   if (MI.isCopy() && Ops.size() == 1 &&
5520       // Make sure we're only folding the explicit COPY defs/uses.
5521       (Ops[0] == 0 || Ops[0] == 1)) {
5522     bool IsSpill = Ops[0] == 0;
5523     bool IsFill = !IsSpill;
5524     const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
5525     const MachineRegisterInfo &MRI = MF.getRegInfo();
5526     MachineBasicBlock &MBB = *MI.getParent();
5527     const MachineOperand &DstMO = MI.getOperand(0);
5528     const MachineOperand &SrcMO = MI.getOperand(1);
5529     Register DstReg = DstMO.getReg();
5530     Register SrcReg = SrcMO.getReg();
5531     // This is slightly expensive to compute for physical regs since
5532     // getMinimalPhysRegClass is slow.
5533     auto getRegClass = [&](unsigned Reg) {
5534       return Register::isVirtualRegister(Reg) ? MRI.getRegClass(Reg)
5535                                               : TRI.getMinimalPhysRegClass(Reg);
5536     };
5537 
5538     if (DstMO.getSubReg() == 0 && SrcMO.getSubReg() == 0) {
5539       assert(TRI.getRegSizeInBits(*getRegClass(DstReg)) ==
5540                  TRI.getRegSizeInBits(*getRegClass(SrcReg)) &&
5541              "Mismatched register size in non subreg COPY");
5542       if (IsSpill)
5543         storeRegToStackSlot(MBB, InsertPt, SrcReg, SrcMO.isKill(), FrameIndex,
5544                             getRegClass(SrcReg), &TRI, Register());
5545       else
5546         loadRegFromStackSlot(MBB, InsertPt, DstReg, FrameIndex,
5547                              getRegClass(DstReg), &TRI, Register());
5548       return &*--InsertPt;
5549     }
5550 
5551     // Handle cases like spilling def of:
5552     //
5553     //   %0:sub_32<def,read-undef> = COPY %wzr; GPR64common:%0
5554     //
5555     // where the physical register source can be widened and stored to the full
5556     // virtual reg destination stack slot, in this case producing:
5557     //
5558     //   STRXui %xzr, %stack.0
5559     //
5560     if (IsSpill && DstMO.isUndef() && SrcReg == AArch64::WZR &&
5561         TRI.getRegSizeInBits(*getRegClass(DstReg)) == 64) {
5562       assert(SrcMO.getSubReg() == 0 &&
5563              "Unexpected subreg on physical register");
5564       storeRegToStackSlot(MBB, InsertPt, AArch64::XZR, SrcMO.isKill(),
5565                           FrameIndex, &AArch64::GPR64RegClass, &TRI,
5566                           Register());
5567       return &*--InsertPt;
5568     }
5569 
5570     // Handle cases like filling use of:
5571     //
5572     //   %0:sub_32<def,read-undef> = COPY %1; GPR64:%0, GPR32:%1
5573     //
5574     // where we can load the full virtual reg source stack slot, into the subreg
5575     // destination, in this case producing:
5576     //
5577     //   LDRWui %0:sub_32<def,read-undef>, %stack.0
5578     //
5579     if (IsFill && SrcMO.getSubReg() == 0 && DstMO.isUndef()) {
5580       const TargetRegisterClass *FillRC;
5581       switch (DstMO.getSubReg()) {
5582       default:
5583         FillRC = nullptr;
5584         break;
5585       case AArch64::sub_32:
5586         FillRC = &AArch64::GPR32RegClass;
5587         break;
5588       case AArch64::ssub:
5589         FillRC = &AArch64::FPR32RegClass;
5590         break;
5591       case AArch64::dsub:
5592         FillRC = &AArch64::FPR64RegClass;
5593         break;
5594       }
5595 
5596       if (FillRC) {
5597         assert(TRI.getRegSizeInBits(*getRegClass(SrcReg)) ==
5598                    TRI.getRegSizeInBits(*FillRC) &&
5599                "Mismatched regclass size on folded subreg COPY");
5600         loadRegFromStackSlot(MBB, InsertPt, DstReg, FrameIndex, FillRC, &TRI,
5601                              Register());
5602         MachineInstr &LoadMI = *--InsertPt;
5603         MachineOperand &LoadDst = LoadMI.getOperand(0);
5604         assert(LoadDst.getSubReg() == 0 && "unexpected subreg on fill load");
5605         LoadDst.setSubReg(DstMO.getSubReg());
5606         LoadDst.setIsUndef();
5607         return &LoadMI;
5608       }
5609     }
5610   }
5611 
5612   // Cannot fold.
5613   return nullptr;
5614 }
5615 
5616 int llvm::isAArch64FrameOffsetLegal(const MachineInstr &MI,
5617                                     StackOffset &SOffset,
5618                                     bool *OutUseUnscaledOp,
5619                                     unsigned *OutUnscaledOp,
5620                                     int64_t *EmittableOffset) {
5621   // Set output values in case of early exit.
5622   if (EmittableOffset)
5623     *EmittableOffset = 0;
5624   if (OutUseUnscaledOp)
5625     *OutUseUnscaledOp = false;
5626   if (OutUnscaledOp)
5627     *OutUnscaledOp = 0;
5628 
5629   // Exit early for structured vector spills/fills as they can't take an
5630   // immediate offset.
5631   switch (MI.getOpcode()) {
5632   default:
5633     break;
5634   case AArch64::LD1Rv1d:
5635   case AArch64::LD1Rv2s:
5636   case AArch64::LD1Rv2d:
5637   case AArch64::LD1Rv4h:
5638   case AArch64::LD1Rv4s:
5639   case AArch64::LD1Rv8b:
5640   case AArch64::LD1Rv8h:
5641   case AArch64::LD1Rv16b:
5642   case AArch64::LD1Twov2d:
5643   case AArch64::LD1Threev2d:
5644   case AArch64::LD1Fourv2d:
5645   case AArch64::LD1Twov1d:
5646   case AArch64::LD1Threev1d:
5647   case AArch64::LD1Fourv1d:
5648   case AArch64::ST1Twov2d:
5649   case AArch64::ST1Threev2d:
5650   case AArch64::ST1Fourv2d:
5651   case AArch64::ST1Twov1d:
5652   case AArch64::ST1Threev1d:
5653   case AArch64::ST1Fourv1d:
5654   case AArch64::ST1i8:
5655   case AArch64::ST1i16:
5656   case AArch64::ST1i32:
5657   case AArch64::ST1i64:
5658   case AArch64::IRG:
5659   case AArch64::IRGstack:
5660   case AArch64::STGloop:
5661   case AArch64::STZGloop:
5662     return AArch64FrameOffsetCannotUpdate;
5663   }
5664 
5665   // Get the min/max offset and the scale.
5666   TypeSize ScaleValue(0U, false), Width(0U, false);
5667   int64_t MinOff, MaxOff;
5668   if (!AArch64InstrInfo::getMemOpInfo(MI.getOpcode(), ScaleValue, Width, MinOff,
5669                                       MaxOff))
5670     llvm_unreachable("unhandled opcode in isAArch64FrameOffsetLegal");
5671 
5672   // Construct the complete offset.
5673   bool IsMulVL = ScaleValue.isScalable();
5674   unsigned Scale = ScaleValue.getKnownMinValue();
5675   int64_t Offset = IsMulVL ? SOffset.getScalable() : SOffset.getFixed();
5676 
5677   const MachineOperand &ImmOpnd =
5678       MI.getOperand(AArch64InstrInfo::getLoadStoreImmIdx(MI.getOpcode()));
5679   Offset += ImmOpnd.getImm() * Scale;
5680 
5681   // If the offset doesn't match the scale, we rewrite the instruction to
5682   // use the unscaled instruction instead. We do the same when the offset is
5683   // negative and an unscaled op is available.
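  // For example, LDRXui scales its immediate by 8, so a byte offset of 12
  // cannot be encoded directly, but the unscaled LDURXi form (signed 9-bit
  // byte offset) can encode it.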
5684   std::optional<unsigned> UnscaledOp =
5685       AArch64InstrInfo::getUnscaledLdSt(MI.getOpcode());
5686   bool useUnscaledOp = UnscaledOp && (Offset % Scale || Offset < 0);
5687   if (useUnscaledOp &&
5688       !AArch64InstrInfo::getMemOpInfo(*UnscaledOp, ScaleValue, Width, MinOff,
5689                                       MaxOff))
5690     llvm_unreachable("unhandled opcode in isAArch64FrameOffsetLegal");
5691 
5692   Scale = ScaleValue.getKnownMinValue();
5693   assert(IsMulVL == ScaleValue.isScalable() &&
5694          "Unscaled opcode has different value for scalable");
5695 
5696   int64_t Remainder = Offset % Scale;
5697   assert(!(Remainder && useUnscaledOp) &&
5698          "Cannot have remainder when using unscaled op");
5699 
5700   assert(MinOff < MaxOff && "Unexpected Min/Max offsets");
5701   int64_t NewOffset = Offset / Scale;
5702   if (MinOff <= NewOffset && NewOffset <= MaxOff)
5703     Offset = Remainder;
5704   else {
5705     NewOffset = NewOffset < 0 ? MinOff : MaxOff;
5706     Offset = Offset - NewOffset * Scale;
5707   }
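  // At this point NewOffset is the portion that fits in the instruction's
  // immediate field (in units of Scale) and Offset is the remainder that is
  // handed back to the caller through SOffset.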
5708 
5709   if (EmittableOffset)
5710     *EmittableOffset = NewOffset;
5711   if (OutUseUnscaledOp)
5712     *OutUseUnscaledOp = useUnscaledOp;
5713   if (OutUnscaledOp && UnscaledOp)
5714     *OutUnscaledOp = *UnscaledOp;
5715 
5716   if (IsMulVL)
5717     SOffset = StackOffset::get(SOffset.getFixed(), Offset);
5718   else
5719     SOffset = StackOffset::get(Offset, SOffset.getScalable());
5720   return AArch64FrameOffsetCanUpdate |
5721          (SOffset ? 0 : AArch64FrameOffsetIsLegal);
5722 }
5723 
5724 bool llvm::rewriteAArch64FrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
5725                                     unsigned FrameReg, StackOffset &Offset,
5726                                     const AArch64InstrInfo *TII) {
5727   unsigned Opcode = MI.getOpcode();
5728   unsigned ImmIdx = FrameRegIdx + 1;
5729 
5730   if (Opcode == AArch64::ADDSXri || Opcode == AArch64::ADDXri) {
5731     Offset += StackOffset::getFixed(MI.getOperand(ImmIdx).getImm());
5732     emitFrameOffset(*MI.getParent(), MI, MI.getDebugLoc(),
5733                     MI.getOperand(0).getReg(), FrameReg, Offset, TII,
5734                     MachineInstr::NoFlags, (Opcode == AArch64::ADDSXri));
5735     MI.eraseFromParent();
5736     Offset = StackOffset();
5737     return true;
5738   }
5739 
5740   int64_t NewOffset;
5741   unsigned UnscaledOp;
5742   bool UseUnscaledOp;
5743   int Status = isAArch64FrameOffsetLegal(MI, Offset, &UseUnscaledOp,
5744                                          &UnscaledOp, &NewOffset);
5745   if (Status & AArch64FrameOffsetCanUpdate) {
5746     if (Status & AArch64FrameOffsetIsLegal)
5747       // Replace the FrameIndex with FrameReg.
5748       MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false);
5749     if (UseUnscaledOp)
5750       MI.setDesc(TII->get(UnscaledOp));
5751 
5752     MI.getOperand(ImmIdx).ChangeToImmediate(NewOffset);
5753     return !Offset;
5754   }
5755 
5756   return false;
5757 }
5758 
5759 void AArch64InstrInfo::insertNoop(MachineBasicBlock &MBB,
5760                                   MachineBasicBlock::iterator MI) const {
5761   DebugLoc DL;
5762   BuildMI(MBB, MI, DL, get(AArch64::HINT)).addImm(0);
5763 }
5764 
5765 MCInst AArch64InstrInfo::getNop() const {
5766   return MCInstBuilder(AArch64::HINT).addImm(0);
5767 }
5768 
5769 // AArch64 supports MachineCombiner.
5770 bool AArch64InstrInfo::useMachineCombiner() const { return true; }
5771 
5772 // True when Opc is a flag-setting opcode (i.e. it defines NZCV).
5773 static bool isCombineInstrSettingFlag(unsigned Opc) {
5774   switch (Opc) {
5775   case AArch64::ADDSWrr:
5776   case AArch64::ADDSWri:
5777   case AArch64::ADDSXrr:
5778   case AArch64::ADDSXri:
5779   case AArch64::SUBSWrr:
5780   case AArch64::SUBSXrr:
5781   // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi.
5782   case AArch64::SUBSWri:
5783   case AArch64::SUBSXri:
5784     return true;
5785   default:
5786     break;
5787   }
5788   return false;
5789 }
5790 
5791 // 32b Opcodes that can be combined with a MUL
5792 static bool isCombineInstrCandidate32(unsigned Opc) {
5793   switch (Opc) {
5794   case AArch64::ADDWrr:
5795   case AArch64::ADDWri:
5796   case AArch64::SUBWrr:
5797   case AArch64::ADDSWrr:
5798   case AArch64::ADDSWri:
5799   case AArch64::SUBSWrr:
5800   // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi.
5801   case AArch64::SUBWri:
5802   case AArch64::SUBSWri:
5803     return true;
5804   default:
5805     break;
5806   }
5807   return false;
5808 }
5809 
5810 // 64b Opcodes that can be combined with a MUL
5811 static bool isCombineInstrCandidate64(unsigned Opc) {
5812   switch (Opc) {
5813   case AArch64::ADDXrr:
5814   case AArch64::ADDXri:
5815   case AArch64::SUBXrr:
5816   case AArch64::ADDSXrr:
5817   case AArch64::ADDSXri:
5818   case AArch64::SUBSXrr:
5819   // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi.
5820   case AArch64::SUBXri:
5821   case AArch64::SUBSXri:
5822   case AArch64::ADDv8i8:
5823   case AArch64::ADDv16i8:
5824   case AArch64::ADDv4i16:
5825   case AArch64::ADDv8i16:
5826   case AArch64::ADDv2i32:
5827   case AArch64::ADDv4i32:
5828   case AArch64::SUBv8i8:
5829   case AArch64::SUBv16i8:
5830   case AArch64::SUBv4i16:
5831   case AArch64::SUBv8i16:
5832   case AArch64::SUBv2i32:
5833   case AArch64::SUBv4i32:
5834     return true;
5835   default:
5836     break;
5837   }
5838   return false;
5839 }
5840 
5841 // FP Opcodes that can be combined with a FMUL.
5842 static bool isCombineInstrCandidateFP(const MachineInstr &Inst) {
5843   switch (Inst.getOpcode()) {
5844   default:
5845     break;
5846   case AArch64::FADDHrr:
5847   case AArch64::FADDSrr:
5848   case AArch64::FADDDrr:
5849   case AArch64::FADDv4f16:
5850   case AArch64::FADDv8f16:
5851   case AArch64::FADDv2f32:
5852   case AArch64::FADDv2f64:
5853   case AArch64::FADDv4f32:
5854   case AArch64::FSUBHrr:
5855   case AArch64::FSUBSrr:
5856   case AArch64::FSUBDrr:
5857   case AArch64::FSUBv4f16:
5858   case AArch64::FSUBv8f16:
5859   case AArch64::FSUBv2f32:
5860   case AArch64::FSUBv2f64:
5861   case AArch64::FSUBv4f32:
5862     TargetOptions Options = Inst.getParent()->getParent()->getTarget().Options;
5863     // We can fuse FADD/FSUB with FMUL, if fusion is either allowed globally by
5864     // the target options or if FADD/FSUB has the contract fast-math flag.
5865     return Options.UnsafeFPMath ||
5866            Options.AllowFPOpFusion == FPOpFusion::Fast ||
5867            Inst.getFlag(MachineInstr::FmContract);
5869   }
5870   return false;
5871 }
5872 
5873 // Opcodes that can be combined with a MUL
5874 static bool isCombineInstrCandidate(unsigned Opc) {
5875   return (isCombineInstrCandidate32(Opc) || isCombineInstrCandidate64(Opc));
5876 }
5877 
5878 //
5879 // Utility routine that checks if \param MO is defined by a
5880 // \param CombineOpc instruction in the basic block \param MBB
5881 static bool canCombine(MachineBasicBlock &MBB, MachineOperand &MO,
5882                        unsigned CombineOpc, unsigned ZeroReg = 0,
5883                        bool CheckZeroReg = false) {
5884   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
5885   MachineInstr *MI = nullptr;
5886 
5887   if (MO.isReg() && MO.getReg().isVirtual())
5888     MI = MRI.getUniqueVRegDef(MO.getReg());
5889   // And it needs to be in the trace (otherwise, it won't have a depth).
5890   if (!MI || MI->getParent() != &MBB || (unsigned)MI->getOpcode() != CombineOpc)
5891     return false;
5892   // It must only be used by the instruction we combine it with.
5893   if (!MRI.hasOneNonDBGUse(MI->getOperand(0).getReg()))
5894     return false;
5895 
5896   if (CheckZeroReg) {
5897     assert(MI->getNumOperands() >= 4 && MI->getOperand(0).isReg() &&
5898            MI->getOperand(1).isReg() && MI->getOperand(2).isReg() &&
5899            MI->getOperand(3).isReg() && "MAdd/MSub must have at least 4 regs");
5900     // The third input reg must be zero.
5901     if (MI->getOperand(3).getReg() != ZeroReg)
5902       return false;
5903   }
5904 
5905   if (isCombineInstrSettingFlag(CombineOpc) &&
5906       MI->findRegisterDefOperandIdx(AArch64::NZCV, true) == -1)
5907     return false;
5908 
5909   return true;
5910 }
5911 
5912 //
5913 // Is \param MO defined by an integer multiply, and can it be combined?
5914 static bool canCombineWithMUL(MachineBasicBlock &MBB, MachineOperand &MO,
5915                               unsigned MulOpc, unsigned ZeroReg) {
5916   return canCombine(MBB, MO, MulOpc, ZeroReg, true);
5917 }
5918 
5919 //
5920 // Is \param MO defined by a floating-point multiply, and can it be combined?
5921 static bool canCombineWithFMUL(MachineBasicBlock &MBB, MachineOperand &MO,
5922                                unsigned MulOpc) {
5923   return canCombine(MBB, MO, MulOpc);
5924 }
5925 
5926 // TODO: There are many more machine instruction opcodes to match:
5927 //       1. Other data types (integer, vectors)
5928 //       2. Other math / logic operations (xor, or)
5929 //       3. Other forms of the same operation (intrinsics and other variants)
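// Reporting an opcode as associative and commutative lets the generic
// machine-combiner reassociate a long dependence chain, e.g. rewriting
//   ((a + b) + c) + d   into   (a + b) + (c + d)
// so the two inner operations can execute in parallel.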
5930 bool AArch64InstrInfo::isAssociativeAndCommutative(const MachineInstr &Inst,
5931                                                    bool Invert) const {
5932   if (Invert)
5933     return false;
5934   switch (Inst.getOpcode()) {
5935   // == Floating-point types ==
5936   // -- Floating-point instructions --
5937   case AArch64::FADDHrr:
5938   case AArch64::FADDSrr:
5939   case AArch64::FADDDrr:
5940   case AArch64::FMULHrr:
5941   case AArch64::FMULSrr:
5942   case AArch64::FMULDrr:
5943   case AArch64::FMULX16:
5944   case AArch64::FMULX32:
5945   case AArch64::FMULX64:
5946   // -- Advanced SIMD instructions --
5947   case AArch64::FADDv4f16:
5948   case AArch64::FADDv8f16:
5949   case AArch64::FADDv2f32:
5950   case AArch64::FADDv4f32:
5951   case AArch64::FADDv2f64:
5952   case AArch64::FMULv4f16:
5953   case AArch64::FMULv8f16:
5954   case AArch64::FMULv2f32:
5955   case AArch64::FMULv4f32:
5956   case AArch64::FMULv2f64:
5957   case AArch64::FMULXv4f16:
5958   case AArch64::FMULXv8f16:
5959   case AArch64::FMULXv2f32:
5960   case AArch64::FMULXv4f32:
5961   case AArch64::FMULXv2f64:
5962   // -- SVE instructions --
5963   // Opcodes FMULX_ZZZ_? don't exist because there is no unpredicated FMULX
5964   // in the SVE instruction set (though there are predicated ones).
5965   case AArch64::FADD_ZZZ_H:
5966   case AArch64::FADD_ZZZ_S:
5967   case AArch64::FADD_ZZZ_D:
5968   case AArch64::FMUL_ZZZ_H:
5969   case AArch64::FMUL_ZZZ_S:
5970   case AArch64::FMUL_ZZZ_D:
5971     return Inst.getParent()->getParent()->getTarget().Options.UnsafeFPMath ||
5972            (Inst.getFlag(MachineInstr::MIFlag::FmReassoc) &&
5973             Inst.getFlag(MachineInstr::MIFlag::FmNsz));
5974 
5975   // == Integer types ==
5976   // -- Base instructions --
5977   // Opcodes MULWrr and MULXrr don't exist because
5978   // `MUL <Wd>, <Wn>, <Wm>` and `MUL <Xd>, <Xn>, <Xm>` are aliases of
5979   // `MADD <Wd>, <Wn>, <Wm>, WZR` and `MADD <Xd>, <Xn>, <Xm>, XZR` respectively.
5980   // The machine-combiner does not support three-source-operand machine
5981   // instructions, so we cannot reassociate MULs.
5982   case AArch64::ADDWrr:
5983   case AArch64::ADDXrr:
5984   case AArch64::ANDWrr:
5985   case AArch64::ANDXrr:
5986   case AArch64::ORRWrr:
5987   case AArch64::ORRXrr:
5988   case AArch64::EORWrr:
5989   case AArch64::EORXrr:
5990   case AArch64::EONWrr:
5991   case AArch64::EONXrr:
5992   // -- Advanced SIMD instructions --
5993   // Opcodes MULv1i64 and MULv2i64 don't exist because there is no 64-bit MUL
5994   // in the Advanced SIMD instruction set.
5995   case AArch64::ADDv8i8:
5996   case AArch64::ADDv16i8:
5997   case AArch64::ADDv4i16:
5998   case AArch64::ADDv8i16:
5999   case AArch64::ADDv2i32:
6000   case AArch64::ADDv4i32:
6001   case AArch64::ADDv1i64:
6002   case AArch64::ADDv2i64:
6003   case AArch64::MULv8i8:
6004   case AArch64::MULv16i8:
6005   case AArch64::MULv4i16:
6006   case AArch64::MULv8i16:
6007   case AArch64::MULv2i32:
6008   case AArch64::MULv4i32:
6009   case AArch64::ANDv8i8:
6010   case AArch64::ANDv16i8:
6011   case AArch64::ORRv8i8:
6012   case AArch64::ORRv16i8:
6013   case AArch64::EORv8i8:
6014   case AArch64::EORv16i8:
6015   // -- SVE instructions --
6016   case AArch64::ADD_ZZZ_B:
6017   case AArch64::ADD_ZZZ_H:
6018   case AArch64::ADD_ZZZ_S:
6019   case AArch64::ADD_ZZZ_D:
6020   case AArch64::MUL_ZZZ_B:
6021   case AArch64::MUL_ZZZ_H:
6022   case AArch64::MUL_ZZZ_S:
6023   case AArch64::MUL_ZZZ_D:
6024   case AArch64::AND_ZZZ:
6025   case AArch64::ORR_ZZZ:
6026   case AArch64::EOR_ZZZ:
6027     return true;
6028 
6029   default:
6030     return false;
6031   }
6032 }
6033 
6034 /// Find instructions that can be turned into madd.
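/// For example (MUL is an alias of MADD with the zero register):
///   %3:gpr32 = MADDWrrr %1, %2, $wzr
///   %4:gpr32 = ADDWrr %0, %3
/// can be rewritten as a single MADDWrrr %1, %2, %0.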
6035 static bool getMaddPatterns(MachineInstr &Root,
6036                             SmallVectorImpl<MachineCombinerPattern> &Patterns) {
6037   unsigned Opc = Root.getOpcode();
6038   MachineBasicBlock &MBB = *Root.getParent();
6039   bool Found = false;
6040 
6041   if (!isCombineInstrCandidate(Opc))
6042     return false;
6043   if (isCombineInstrSettingFlag(Opc)) {
6044     int Cmp_NZCV = Root.findRegisterDefOperandIdx(AArch64::NZCV, true);
6045     // When NZCV is live, bail out.
6046     if (Cmp_NZCV == -1)
6047       return false;
6048     unsigned NewOpc = convertToNonFlagSettingOpc(Root);
6049     // When the opcode can't be changed, bail out.
6050     // CHECKME: do we miss any cases for opcode conversion?
6051     if (NewOpc == Opc)
6052       return false;
6053     Opc = NewOpc;
6054   }
6055 
6056   auto setFound = [&](int Opcode, int Operand, unsigned ZeroReg,
6057                       MachineCombinerPattern Pattern) {
6058     if (canCombineWithMUL(MBB, Root.getOperand(Operand), Opcode, ZeroReg)) {
6059       Patterns.push_back(Pattern);
6060       Found = true;
6061     }
6062   };
6063 
6064   auto setVFound = [&](int Opcode, int Operand, MachineCombinerPattern Pattern) {
6065     if (canCombine(MBB, Root.getOperand(Operand), Opcode)) {
6066       Patterns.push_back(Pattern);
6067       Found = true;
6068     }
6069   };
6070 
6071   typedef MachineCombinerPattern MCP;
6072 
6073   switch (Opc) {
6074   default:
6075     break;
6076   case AArch64::ADDWrr:
6077     assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() &&
6078            "ADDWrr does not have register operands");
6079     setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULADDW_OP1);
6080     setFound(AArch64::MADDWrrr, 2, AArch64::WZR, MCP::MULADDW_OP2);
6081     break;
6082   case AArch64::ADDXrr:
6083     setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULADDX_OP1);
6084     setFound(AArch64::MADDXrrr, 2, AArch64::XZR, MCP::MULADDX_OP2);
6085     break;
6086   case AArch64::SUBWrr:
6087     setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULSUBW_OP1);
6088     setFound(AArch64::MADDWrrr, 2, AArch64::WZR, MCP::MULSUBW_OP2);
6089     break;
6090   case AArch64::SUBXrr:
6091     setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULSUBX_OP1);
6092     setFound(AArch64::MADDXrrr, 2, AArch64::XZR, MCP::MULSUBX_OP2);
6093     break;
6094   case AArch64::ADDWri:
6095     setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULADDWI_OP1);
6096     break;
6097   case AArch64::ADDXri:
6098     setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULADDXI_OP1);
6099     break;
6100   case AArch64::SUBWri:
6101     setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULSUBWI_OP1);
6102     break;
6103   case AArch64::SUBXri:
6104     setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULSUBXI_OP1);
6105     break;
6106   case AArch64::ADDv8i8:
6107     setVFound(AArch64::MULv8i8, 1, MCP::MULADDv8i8_OP1);
6108     setVFound(AArch64::MULv8i8, 2, MCP::MULADDv8i8_OP2);
6109     break;
6110   case AArch64::ADDv16i8:
6111     setVFound(AArch64::MULv16i8, 1, MCP::MULADDv16i8_OP1);
6112     setVFound(AArch64::MULv16i8, 2, MCP::MULADDv16i8_OP2);
6113     break;
6114   case AArch64::ADDv4i16:
6115     setVFound(AArch64::MULv4i16, 1, MCP::MULADDv4i16_OP1);
6116     setVFound(AArch64::MULv4i16, 2, MCP::MULADDv4i16_OP2);
6117     setVFound(AArch64::MULv4i16_indexed, 1, MCP::MULADDv4i16_indexed_OP1);
6118     setVFound(AArch64::MULv4i16_indexed, 2, MCP::MULADDv4i16_indexed_OP2);
6119     break;
6120   case AArch64::ADDv8i16:
6121     setVFound(AArch64::MULv8i16, 1, MCP::MULADDv8i16_OP1);
6122     setVFound(AArch64::MULv8i16, 2, MCP::MULADDv8i16_OP2);
6123     setVFound(AArch64::MULv8i16_indexed, 1, MCP::MULADDv8i16_indexed_OP1);
6124     setVFound(AArch64::MULv8i16_indexed, 2, MCP::MULADDv8i16_indexed_OP2);
6125     break;
6126   case AArch64::ADDv2i32:
6127     setVFound(AArch64::MULv2i32, 1, MCP::MULADDv2i32_OP1);
6128     setVFound(AArch64::MULv2i32, 2, MCP::MULADDv2i32_OP2);
6129     setVFound(AArch64::MULv2i32_indexed, 1, MCP::MULADDv2i32_indexed_OP1);
6130     setVFound(AArch64::MULv2i32_indexed, 2, MCP::MULADDv2i32_indexed_OP2);
6131     break;
6132   case AArch64::ADDv4i32:
6133     setVFound(AArch64::MULv4i32, 1, MCP::MULADDv4i32_OP1);
6134     setVFound(AArch64::MULv4i32, 2, MCP::MULADDv4i32_OP2);
6135     setVFound(AArch64::MULv4i32_indexed, 1, MCP::MULADDv4i32_indexed_OP1);
6136     setVFound(AArch64::MULv4i32_indexed, 2, MCP::MULADDv4i32_indexed_OP2);
6137     break;
6138   case AArch64::SUBv8i8:
6139     setVFound(AArch64::MULv8i8, 1, MCP::MULSUBv8i8_OP1);
6140     setVFound(AArch64::MULv8i8, 2, MCP::MULSUBv8i8_OP2);
6141     break;
6142   case AArch64::SUBv16i8:
6143     setVFound(AArch64::MULv16i8, 1, MCP::MULSUBv16i8_OP1);
6144     setVFound(AArch64::MULv16i8, 2, MCP::MULSUBv16i8_OP2);
6145     break;
6146   case AArch64::SUBv4i16:
6147     setVFound(AArch64::MULv4i16, 1, MCP::MULSUBv4i16_OP1);
6148     setVFound(AArch64::MULv4i16, 2, MCP::MULSUBv4i16_OP2);
6149     setVFound(AArch64::MULv4i16_indexed, 1, MCP::MULSUBv4i16_indexed_OP1);
6150     setVFound(AArch64::MULv4i16_indexed, 2, MCP::MULSUBv4i16_indexed_OP2);
6151     break;
6152   case AArch64::SUBv8i16:
6153     setVFound(AArch64::MULv8i16, 1, MCP::MULSUBv8i16_OP1);
6154     setVFound(AArch64::MULv8i16, 2, MCP::MULSUBv8i16_OP2);
6155     setVFound(AArch64::MULv8i16_indexed, 1, MCP::MULSUBv8i16_indexed_OP1);
6156     setVFound(AArch64::MULv8i16_indexed, 2, MCP::MULSUBv8i16_indexed_OP2);
6157     break;
6158   case AArch64::SUBv2i32:
6159     setVFound(AArch64::MULv2i32, 1, MCP::MULSUBv2i32_OP1);
6160     setVFound(AArch64::MULv2i32, 2, MCP::MULSUBv2i32_OP2);
6161     setVFound(AArch64::MULv2i32_indexed, 1, MCP::MULSUBv2i32_indexed_OP1);
6162     setVFound(AArch64::MULv2i32_indexed, 2, MCP::MULSUBv2i32_indexed_OP2);
6163     break;
6164   case AArch64::SUBv4i32:
6165     setVFound(AArch64::MULv4i32, 1, MCP::MULSUBv4i32_OP1);
6166     setVFound(AArch64::MULv4i32, 2, MCP::MULSUBv4i32_OP2);
6167     setVFound(AArch64::MULv4i32_indexed, 1, MCP::MULSUBv4i32_indexed_OP1);
6168     setVFound(AArch64::MULv4i32_indexed, 2, MCP::MULSUBv4i32_indexed_OP2);
6169     break;
6170   }
6171   return Found;
6172 }
6173 /// Floating-Point Support
6174 
6175 /// Find instructions that can be turned into fmadd/fmsub (or FMLA/FMLS).
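/// For example, when fusion is allowed:
///   %3:fpr32 = FMULSrr %1, %2
///   %4:fpr32 = FADDSrr %0, %3
/// can be rewritten as FMADDSrrr %1, %2, %0.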
6176 static bool getFMAPatterns(MachineInstr &Root,
6177                            SmallVectorImpl<MachineCombinerPattern> &Patterns) {
6178 
6179   if (!isCombineInstrCandidateFP(Root))
6180     return false;
6181 
6182   MachineBasicBlock &MBB = *Root.getParent();
6183   bool Found = false;
6184 
6185   auto Match = [&](int Opcode, int Operand,
6186                    MachineCombinerPattern Pattern) -> bool {
6187     if (canCombineWithFMUL(MBB, Root.getOperand(Operand), Opcode)) {
6188       Patterns.push_back(Pattern);
6189       return true;
6190     }
6191     return false;
6192   };
6193 
6194   typedef MachineCombinerPattern MCP;
6195 
6196   switch (Root.getOpcode()) {
6197   default:
6198     assert(false && "Unsupported FP instruction in combiner\n");
6199     break;
6200   case AArch64::FADDHrr:
6201     assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() &&
6202            "FADDHrr does not have register operands");
6203 
6204     Found  = Match(AArch64::FMULHrr, 1, MCP::FMULADDH_OP1);
6205     Found |= Match(AArch64::FMULHrr, 2, MCP::FMULADDH_OP2);
6206     break;
6207   case AArch64::FADDSrr:
6208     assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() &&
6209            "FADDSrr does not have register operands");
6210 
6211     Found |= Match(AArch64::FMULSrr, 1, MCP::FMULADDS_OP1) ||
6212              Match(AArch64::FMULv1i32_indexed, 1, MCP::FMLAv1i32_indexed_OP1);
6213 
6214     Found |= Match(AArch64::FMULSrr, 2, MCP::FMULADDS_OP2) ||
6215              Match(AArch64::FMULv1i32_indexed, 2, MCP::FMLAv1i32_indexed_OP2);
6216     break;
6217   case AArch64::FADDDrr:
6218     Found |= Match(AArch64::FMULDrr, 1, MCP::FMULADDD_OP1) ||
6219              Match(AArch64::FMULv1i64_indexed, 1, MCP::FMLAv1i64_indexed_OP1);
6220 
6221     Found |= Match(AArch64::FMULDrr, 2, MCP::FMULADDD_OP2) ||
6222              Match(AArch64::FMULv1i64_indexed, 2, MCP::FMLAv1i64_indexed_OP2);
6223     break;
6224   case AArch64::FADDv4f16:
6225     Found |= Match(AArch64::FMULv4i16_indexed, 1, MCP::FMLAv4i16_indexed_OP1) ||
6226              Match(AArch64::FMULv4f16, 1, MCP::FMLAv4f16_OP1);
6227 
6228     Found |= Match(AArch64::FMULv4i16_indexed, 2, MCP::FMLAv4i16_indexed_OP2) ||
6229              Match(AArch64::FMULv4f16, 2, MCP::FMLAv4f16_OP2);
6230     break;
6231   case AArch64::FADDv8f16:
6232     Found |= Match(AArch64::FMULv8i16_indexed, 1, MCP::FMLAv8i16_indexed_OP1) ||
6233              Match(AArch64::FMULv8f16, 1, MCP::FMLAv8f16_OP1);
6234 
6235     Found |= Match(AArch64::FMULv8i16_indexed, 2, MCP::FMLAv8i16_indexed_OP2) ||
6236              Match(AArch64::FMULv8f16, 2, MCP::FMLAv8f16_OP2);
6237     break;
6238   case AArch64::FADDv2f32:
6239     Found |= Match(AArch64::FMULv2i32_indexed, 1, MCP::FMLAv2i32_indexed_OP1) ||
6240              Match(AArch64::FMULv2f32, 1, MCP::FMLAv2f32_OP1);
6241 
6242     Found |= Match(AArch64::FMULv2i32_indexed, 2, MCP::FMLAv2i32_indexed_OP2) ||
6243              Match(AArch64::FMULv2f32, 2, MCP::FMLAv2f32_OP2);
6244     break;
6245   case AArch64::FADDv2f64:
6246     Found |= Match(AArch64::FMULv2i64_indexed, 1, MCP::FMLAv2i64_indexed_OP1) ||
6247              Match(AArch64::FMULv2f64, 1, MCP::FMLAv2f64_OP1);
6248 
6249     Found |= Match(AArch64::FMULv2i64_indexed, 2, MCP::FMLAv2i64_indexed_OP2) ||
6250              Match(AArch64::FMULv2f64, 2, MCP::FMLAv2f64_OP2);
6251     break;
6252   case AArch64::FADDv4f32:
6253     Found |= Match(AArch64::FMULv4i32_indexed, 1, MCP::FMLAv4i32_indexed_OP1) ||
6254              Match(AArch64::FMULv4f32, 1, MCP::FMLAv4f32_OP1);
6255 
6256     Found |= Match(AArch64::FMULv4i32_indexed, 2, MCP::FMLAv4i32_indexed_OP2) ||
6257              Match(AArch64::FMULv4f32, 2, MCP::FMLAv4f32_OP2);
6258     break;
6259   case AArch64::FSUBHrr:
6260     Found  = Match(AArch64::FMULHrr, 1, MCP::FMULSUBH_OP1);
6261     Found |= Match(AArch64::FMULHrr, 2, MCP::FMULSUBH_OP2);
6262     Found |= Match(AArch64::FNMULHrr, 1, MCP::FNMULSUBH_OP1);
6263     break;
6264   case AArch64::FSUBSrr:
6265     Found = Match(AArch64::FMULSrr, 1, MCP::FMULSUBS_OP1);
6266 
6267     Found |= Match(AArch64::FMULSrr, 2, MCP::FMULSUBS_OP2) ||
6268              Match(AArch64::FMULv1i32_indexed, 2, MCP::FMLSv1i32_indexed_OP2);
6269 
6270     Found |= Match(AArch64::FNMULSrr, 1, MCP::FNMULSUBS_OP1);
6271     break;
6272   case AArch64::FSUBDrr:
6273     Found = Match(AArch64::FMULDrr, 1, MCP::FMULSUBD_OP1);
6274 
6275     Found |= Match(AArch64::FMULDrr, 2, MCP::FMULSUBD_OP2) ||
6276              Match(AArch64::FMULv1i64_indexed, 2, MCP::FMLSv1i64_indexed_OP2);
6277 
6278     Found |= Match(AArch64::FNMULDrr, 1, MCP::FNMULSUBD_OP1);
6279     break;
6280   case AArch64::FSUBv4f16:
6281     Found |= Match(AArch64::FMULv4i16_indexed, 2, MCP::FMLSv4i16_indexed_OP2) ||
6282              Match(AArch64::FMULv4f16, 2, MCP::FMLSv4f16_OP2);
6283 
6284     Found |= Match(AArch64::FMULv4i16_indexed, 1, MCP::FMLSv4i16_indexed_OP1) ||
6285              Match(AArch64::FMULv4f16, 1, MCP::FMLSv4f16_OP1);
6286     break;
6287   case AArch64::FSUBv8f16:
6288     Found |= Match(AArch64::FMULv8i16_indexed, 2, MCP::FMLSv8i16_indexed_OP2) ||
6289              Match(AArch64::FMULv8f16, 2, MCP::FMLSv8f16_OP2);
6290 
6291     Found |= Match(AArch64::FMULv8i16_indexed, 1, MCP::FMLSv8i16_indexed_OP1) ||
6292              Match(AArch64::FMULv8f16, 1, MCP::FMLSv8f16_OP1);
6293     break;
6294   case AArch64::FSUBv2f32:
6295     Found |= Match(AArch64::FMULv2i32_indexed, 2, MCP::FMLSv2i32_indexed_OP2) ||
6296              Match(AArch64::FMULv2f32, 2, MCP::FMLSv2f32_OP2);
6297 
6298     Found |= Match(AArch64::FMULv2i32_indexed, 1, MCP::FMLSv2i32_indexed_OP1) ||
6299              Match(AArch64::FMULv2f32, 1, MCP::FMLSv2f32_OP1);
6300     break;
6301   case AArch64::FSUBv2f64:
6302     Found |= Match(AArch64::FMULv2i64_indexed, 2, MCP::FMLSv2i64_indexed_OP2) ||
6303              Match(AArch64::FMULv2f64, 2, MCP::FMLSv2f64_OP2);
6304 
6305     Found |= Match(AArch64::FMULv2i64_indexed, 1, MCP::FMLSv2i64_indexed_OP1) ||
6306              Match(AArch64::FMULv2f64, 1, MCP::FMLSv2f64_OP1);
6307     break;
6308   case AArch64::FSUBv4f32:
6309     Found |= Match(AArch64::FMULv4i32_indexed, 2, MCP::FMLSv4i32_indexed_OP2) ||
6310              Match(AArch64::FMULv4f32, 2, MCP::FMLSv4f32_OP2);
6311 
6312     Found |= Match(AArch64::FMULv4i32_indexed, 1, MCP::FMLSv4i32_indexed_OP1) ||
6313              Match(AArch64::FMULv4f32, 1, MCP::FMLSv4f32_OP1);
6314     break;
6315   }
6316   return Found;
6317 }
6318 
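/// Find an FMUL whose operand is a lane DUP (possibly through a no-op COPY)
/// and request the by-element (indexed) FMUL form instead, e.g.
///   %2 = DUPv2i32lane %1, 1
///   %3 = FMULv2f32 %0, %2
/// can be rewritten as FMULv2i32_indexed %0, %1, 1.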
6319 static bool getFMULPatterns(MachineInstr &Root,
6320                             SmallVectorImpl<MachineCombinerPattern> &Patterns) {
6321   MachineBasicBlock &MBB = *Root.getParent();
6322   bool Found = false;
6323 
6324   auto Match = [&](unsigned Opcode, int Operand,
6325                    MachineCombinerPattern Pattern) -> bool {
6326     MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
6327     MachineOperand &MO = Root.getOperand(Operand);
6328     MachineInstr *MI = nullptr;
6329     if (MO.isReg() && MO.getReg().isVirtual())
6330       MI = MRI.getUniqueVRegDef(MO.getReg());
6331     // Ignore No-op COPYs in FMUL(COPY(DUP(..)))
6332     if (MI && MI->getOpcode() == TargetOpcode::COPY &&
6333         MI->getOperand(1).getReg().isVirtual())
6334       MI = MRI.getUniqueVRegDef(MI->getOperand(1).getReg());
6335     if (MI && MI->getOpcode() == Opcode) {
6336       Patterns.push_back(Pattern);
6337       return true;
6338     }
6339     return false;
6340   };
6341 
6342   typedef MachineCombinerPattern MCP;
6343 
6344   switch (Root.getOpcode()) {
6345   default:
6346     return false;
6347   case AArch64::FMULv2f32:
6348     Found = Match(AArch64::DUPv2i32lane, 1, MCP::FMULv2i32_indexed_OP1);
6349     Found |= Match(AArch64::DUPv2i32lane, 2, MCP::FMULv2i32_indexed_OP2);
6350     break;
6351   case AArch64::FMULv2f64:
6352     Found = Match(AArch64::DUPv2i64lane, 1, MCP::FMULv2i64_indexed_OP1);
6353     Found |= Match(AArch64::DUPv2i64lane, 2, MCP::FMULv2i64_indexed_OP2);
6354     break;
6355   case AArch64::FMULv4f16:
6356     Found = Match(AArch64::DUPv4i16lane, 1, MCP::FMULv4i16_indexed_OP1);
6357     Found |= Match(AArch64::DUPv4i16lane, 2, MCP::FMULv4i16_indexed_OP2);
6358     break;
6359   case AArch64::FMULv4f32:
6360     Found = Match(AArch64::DUPv4i32lane, 1, MCP::FMULv4i32_indexed_OP1);
6361     Found |= Match(AArch64::DUPv4i32lane, 2, MCP::FMULv4i32_indexed_OP2);
6362     break;
6363   case AArch64::FMULv8f16:
6364     Found = Match(AArch64::DUPv8i16lane, 1, MCP::FMULv8i16_indexed_OP1);
6365     Found |= Match(AArch64::DUPv8i16lane, 2, MCP::FMULv8i16_indexed_OP2);
6366     break;
6367   }
6368 
6369   return Found;
6370 }
6371 
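/// Find an FNEG of a single-use FMADD that can be folded into an FNMADD:
///   fneg(fma(a, b, c))  ==>  fnmadd a, b, c
/// Both instructions must carry the contract and nsz fast-math flags.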
6372 static bool getFNEGPatterns(MachineInstr &Root,
6373                             SmallVectorImpl<MachineCombinerPattern> &Patterns) {
6374   unsigned Opc = Root.getOpcode();
6375   MachineBasicBlock &MBB = *Root.getParent();
6376   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
6377 
6378   auto Match = [&](unsigned Opcode, MachineCombinerPattern Pattern) -> bool {
6379     MachineOperand &MO = Root.getOperand(1);
6380     MachineInstr *MI = MRI.getUniqueVRegDef(MO.getReg());
6381     if (MI != nullptr && (MI->getOpcode() == Opcode) &&
6382         MRI.hasOneNonDBGUse(MI->getOperand(0).getReg()) &&
6383         Root.getFlag(MachineInstr::MIFlag::FmContract) &&
6384         Root.getFlag(MachineInstr::MIFlag::FmNsz) &&
6385         MI->getFlag(MachineInstr::MIFlag::FmContract) &&
6386         MI->getFlag(MachineInstr::MIFlag::FmNsz)) {
6387       Patterns.push_back(Pattern);
6388       return true;
6389     }
6390     return false;
6391   };
6392 
6393   switch (Opc) {
6394   default:
6395     break;
6396   case AArch64::FNEGDr:
6397     return Match(AArch64::FMADDDrrr, MachineCombinerPattern::FNMADD);
6398   case AArch64::FNEGSr:
6399     return Match(AArch64::FMADDSrrr, MachineCombinerPattern::FNMADD);
6400   }
6401 
6402   return false;
6403 }
6404 
6405 /// Return true when a code sequence can improve throughput. It
6406 /// should be called only for instructions in loops.
6407 /// \param Pattern - combiner pattern
6408 bool AArch64InstrInfo::isThroughputPattern(
6409     MachineCombinerPattern Pattern) const {
6410   switch (Pattern) {
6411   default:
6412     break;
6413   case MachineCombinerPattern::FMULADDH_OP1:
6414   case MachineCombinerPattern::FMULADDH_OP2:
6415   case MachineCombinerPattern::FMULSUBH_OP1:
6416   case MachineCombinerPattern::FMULSUBH_OP2:
6417   case MachineCombinerPattern::FMULADDS_OP1:
6418   case MachineCombinerPattern::FMULADDS_OP2:
6419   case MachineCombinerPattern::FMULSUBS_OP1:
6420   case MachineCombinerPattern::FMULSUBS_OP2:
6421   case MachineCombinerPattern::FMULADDD_OP1:
6422   case MachineCombinerPattern::FMULADDD_OP2:
6423   case MachineCombinerPattern::FMULSUBD_OP1:
6424   case MachineCombinerPattern::FMULSUBD_OP2:
6425   case MachineCombinerPattern::FNMULSUBH_OP1:
6426   case MachineCombinerPattern::FNMULSUBS_OP1:
6427   case MachineCombinerPattern::FNMULSUBD_OP1:
6428   case MachineCombinerPattern::FMLAv4i16_indexed_OP1:
6429   case MachineCombinerPattern::FMLAv4i16_indexed_OP2:
6430   case MachineCombinerPattern::FMLAv8i16_indexed_OP1:
6431   case MachineCombinerPattern::FMLAv8i16_indexed_OP2:
6432   case MachineCombinerPattern::FMLAv1i32_indexed_OP1:
6433   case MachineCombinerPattern::FMLAv1i32_indexed_OP2:
6434   case MachineCombinerPattern::FMLAv1i64_indexed_OP1:
6435   case MachineCombinerPattern::FMLAv1i64_indexed_OP2:
6436   case MachineCombinerPattern::FMLAv4f16_OP2:
6437   case MachineCombinerPattern::FMLAv4f16_OP1:
6438   case MachineCombinerPattern::FMLAv8f16_OP1:
6439   case MachineCombinerPattern::FMLAv8f16_OP2:
6440   case MachineCombinerPattern::FMLAv2f32_OP2:
6441   case MachineCombinerPattern::FMLAv2f32_OP1:
6442   case MachineCombinerPattern::FMLAv2f64_OP1:
6443   case MachineCombinerPattern::FMLAv2f64_OP2:
6444   case MachineCombinerPattern::FMLAv2i32_indexed_OP1:
6445   case MachineCombinerPattern::FMLAv2i32_indexed_OP2:
6446   case MachineCombinerPattern::FMLAv2i64_indexed_OP1:
6447   case MachineCombinerPattern::FMLAv2i64_indexed_OP2:
6448   case MachineCombinerPattern::FMLAv4f32_OP1:
6449   case MachineCombinerPattern::FMLAv4f32_OP2:
6450   case MachineCombinerPattern::FMLAv4i32_indexed_OP1:
6451   case MachineCombinerPattern::FMLAv4i32_indexed_OP2:
6452   case MachineCombinerPattern::FMLSv4i16_indexed_OP1:
6453   case MachineCombinerPattern::FMLSv4i16_indexed_OP2:
6454   case MachineCombinerPattern::FMLSv8i16_indexed_OP1:
6455   case MachineCombinerPattern::FMLSv8i16_indexed_OP2:
6456   case MachineCombinerPattern::FMLSv1i32_indexed_OP2:
6457   case MachineCombinerPattern::FMLSv1i64_indexed_OP2:
6458   case MachineCombinerPattern::FMLSv2i32_indexed_OP2:
6459   case MachineCombinerPattern::FMLSv2i64_indexed_OP2:
6460   case MachineCombinerPattern::FMLSv4f16_OP1:
6461   case MachineCombinerPattern::FMLSv4f16_OP2:
6462   case MachineCombinerPattern::FMLSv8f16_OP1:
6463   case MachineCombinerPattern::FMLSv8f16_OP2:
6464   case MachineCombinerPattern::FMLSv2f32_OP2:
6465   case MachineCombinerPattern::FMLSv2f64_OP2:
6466   case MachineCombinerPattern::FMLSv4i32_indexed_OP2:
6467   case MachineCombinerPattern::FMLSv4f32_OP2:
6468   case MachineCombinerPattern::FMULv2i32_indexed_OP1:
6469   case MachineCombinerPattern::FMULv2i32_indexed_OP2:
6470   case MachineCombinerPattern::FMULv2i64_indexed_OP1:
6471   case MachineCombinerPattern::FMULv2i64_indexed_OP2:
6472   case MachineCombinerPattern::FMULv4i16_indexed_OP1:
6473   case MachineCombinerPattern::FMULv4i16_indexed_OP2:
6474   case MachineCombinerPattern::FMULv4i32_indexed_OP1:
6475   case MachineCombinerPattern::FMULv4i32_indexed_OP2:
6476   case MachineCombinerPattern::FMULv8i16_indexed_OP1:
6477   case MachineCombinerPattern::FMULv8i16_indexed_OP2:
6478   case MachineCombinerPattern::MULADDv8i8_OP1:
6479   case MachineCombinerPattern::MULADDv8i8_OP2:
6480   case MachineCombinerPattern::MULADDv16i8_OP1:
6481   case MachineCombinerPattern::MULADDv16i8_OP2:
6482   case MachineCombinerPattern::MULADDv4i16_OP1:
6483   case MachineCombinerPattern::MULADDv4i16_OP2:
6484   case MachineCombinerPattern::MULADDv8i16_OP1:
6485   case MachineCombinerPattern::MULADDv8i16_OP2:
6486   case MachineCombinerPattern::MULADDv2i32_OP1:
6487   case MachineCombinerPattern::MULADDv2i32_OP2:
6488   case MachineCombinerPattern::MULADDv4i32_OP1:
6489   case MachineCombinerPattern::MULADDv4i32_OP2:
6490   case MachineCombinerPattern::MULSUBv8i8_OP1:
6491   case MachineCombinerPattern::MULSUBv8i8_OP2:
6492   case MachineCombinerPattern::MULSUBv16i8_OP1:
6493   case MachineCombinerPattern::MULSUBv16i8_OP2:
6494   case MachineCombinerPattern::MULSUBv4i16_OP1:
6495   case MachineCombinerPattern::MULSUBv4i16_OP2:
6496   case MachineCombinerPattern::MULSUBv8i16_OP1:
6497   case MachineCombinerPattern::MULSUBv8i16_OP2:
6498   case MachineCombinerPattern::MULSUBv2i32_OP1:
6499   case MachineCombinerPattern::MULSUBv2i32_OP2:
6500   case MachineCombinerPattern::MULSUBv4i32_OP1:
6501   case MachineCombinerPattern::MULSUBv4i32_OP2:
6502   case MachineCombinerPattern::MULADDv4i16_indexed_OP1:
6503   case MachineCombinerPattern::MULADDv4i16_indexed_OP2:
6504   case MachineCombinerPattern::MULADDv8i16_indexed_OP1:
6505   case MachineCombinerPattern::MULADDv8i16_indexed_OP2:
6506   case MachineCombinerPattern::MULADDv2i32_indexed_OP1:
6507   case MachineCombinerPattern::MULADDv2i32_indexed_OP2:
6508   case MachineCombinerPattern::MULADDv4i32_indexed_OP1:
6509   case MachineCombinerPattern::MULADDv4i32_indexed_OP2:
6510   case MachineCombinerPattern::MULSUBv4i16_indexed_OP1:
6511   case MachineCombinerPattern::MULSUBv4i16_indexed_OP2:
6512   case MachineCombinerPattern::MULSUBv8i16_indexed_OP1:
6513   case MachineCombinerPattern::MULSUBv8i16_indexed_OP2:
6514   case MachineCombinerPattern::MULSUBv2i32_indexed_OP1:
6515   case MachineCombinerPattern::MULSUBv2i32_indexed_OP2:
6516   case MachineCombinerPattern::MULSUBv4i32_indexed_OP1:
6517   case MachineCombinerPattern::MULSUBv4i32_indexed_OP2:
6518     return true;
6519   } // end switch (Pattern)
6520   return false;
6521 }
6522 
6523 /// Find other MI combine patterns.
6524 static bool getMiscPatterns(MachineInstr &Root,
6525                             SmallVectorImpl<MachineCombinerPattern> &Patterns)
6526 {
6527   // A - (B + C)  ==>   (A - B) - C  or  (A - C) - B
6528   unsigned Opc = Root.getOpcode();
6529   MachineBasicBlock &MBB = *Root.getParent();
6530 
6531   switch (Opc) {
6532   case AArch64::SUBWrr:
6533   case AArch64::SUBSWrr:
6534   case AArch64::SUBXrr:
6535   case AArch64::SUBSXrr:
6536     // Found candidate root.
6537     break;
6538   default:
6539     return false;
6540   }
6541 
6542   if (isCombineInstrSettingFlag(Opc) &&
6543       Root.findRegisterDefOperandIdx(AArch64::NZCV, true) == -1)
6544     return false;
6545 
6546   if (canCombine(MBB, Root.getOperand(2), AArch64::ADDWrr) ||
6547       canCombine(MBB, Root.getOperand(2), AArch64::ADDSWrr) ||
6548       canCombine(MBB, Root.getOperand(2), AArch64::ADDXrr) ||
6549       canCombine(MBB, Root.getOperand(2), AArch64::ADDSXrr)) {
6550     Patterns.push_back(MachineCombinerPattern::SUBADD_OP1);
6551     Patterns.push_back(MachineCombinerPattern::SUBADD_OP2);
6552     return true;
6553   }
6554 
6555   return false;
6556 }
6557 
6558 /// Return true when there is potentially a faster code sequence for an
6559 /// instruction chain ending in \p Root. All potential patterns are listed in
6560 /// the \p Pattern vector. Pattern should be sorted in priority order since the
6561 /// pattern evaluator stops checking as soon as it finds a faster sequence.
6562 
6563 bool AArch64InstrInfo::getMachineCombinerPatterns(
6564     MachineInstr &Root, SmallVectorImpl<MachineCombinerPattern> &Patterns,
6565     bool DoRegPressureReduce) const {
6566   // Integer patterns
6567   if (getMaddPatterns(Root, Patterns))
6568     return true;
6569   // Floating point patterns
6570   if (getFMULPatterns(Root, Patterns))
6571     return true;
6572   if (getFMAPatterns(Root, Patterns))
6573     return true;
6574   if (getFNEGPatterns(Root, Patterns))
6575     return true;
6576 
6577   // Other patterns
6578   if (getMiscPatterns(Root, Patterns))
6579     return true;
6580 
6581   return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns,
6582                                                      DoRegPressureReduce);
6583 }
6584 
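// FMAInstKind selects the operand order of the fused instruction being built:
// Default is the scalar MADD/FMADD order (mul0, mul1, addend); Indexed and
// Accumulator place the accumulator first, with Indexed also carrying the lane
// immediate of the by-element multiply.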
6585 enum class FMAInstKind { Default, Indexed, Accumulator };
6586 /// genFusedMultiply - Generate fused multiply instructions.
6587 /// This function supports both integer and floating point instructions.
6588 /// A typical example:
6589 ///  F|MUL I=A,B,0
6590 ///  F|ADD R,I,C
6591 ///  ==> F|MADD R,A,B,C
6592 /// \param MF Containing MachineFunction
6593 /// \param MRI Register information
6594 /// \param TII Target information
6595 /// \param Root is the F|ADD instruction
6596 /// \param [out] InsInstrs is a vector of machine instructions and will
6597 /// contain the generated madd instruction
6598 /// \param IdxMulOpd is index of operand in Root that is the result of
6599 /// the F|MUL. In the example above IdxMulOpd is 1.
6600 /// \param MaddOpc the opcode of the f|madd instruction
6601 /// \param RC Register class of operands
6602 /// \param kind The kind of FMA instruction (addressing mode) to be generated
6603 /// \param ReplacedAddend is the result register from the instruction
6604 /// replacing the non-combined operand, if any.
6605 static MachineInstr *
6606 genFusedMultiply(MachineFunction &MF, MachineRegisterInfo &MRI,
6607                  const TargetInstrInfo *TII, MachineInstr &Root,
6608                  SmallVectorImpl<MachineInstr *> &InsInstrs, unsigned IdxMulOpd,
6609                  unsigned MaddOpc, const TargetRegisterClass *RC,
6610                  FMAInstKind kind = FMAInstKind::Default,
6611                  const Register *ReplacedAddend = nullptr) {
6612   assert(IdxMulOpd == 1 || IdxMulOpd == 2);
6613 
6614   unsigned IdxOtherOpd = IdxMulOpd == 1 ? 2 : 1;
6615   MachineInstr *MUL = MRI.getUniqueVRegDef(Root.getOperand(IdxMulOpd).getReg());
6616   Register ResultReg = Root.getOperand(0).getReg();
6617   Register SrcReg0 = MUL->getOperand(1).getReg();
6618   bool Src0IsKill = MUL->getOperand(1).isKill();
6619   Register SrcReg1 = MUL->getOperand(2).getReg();
6620   bool Src1IsKill = MUL->getOperand(2).isKill();
6621 
6622   Register SrcReg2;
6623   bool Src2IsKill;
6624   if (ReplacedAddend) {
6625     // If we just generated a new addend, we must be its only use.
6626     SrcReg2 = *ReplacedAddend;
6627     Src2IsKill = true;
6628   } else {
6629     SrcReg2 = Root.getOperand(IdxOtherOpd).getReg();
6630     Src2IsKill = Root.getOperand(IdxOtherOpd).isKill();
6631   }
6632 
6633   if (ResultReg.isVirtual())
6634     MRI.constrainRegClass(ResultReg, RC);
6635   if (SrcReg0.isVirtual())
6636     MRI.constrainRegClass(SrcReg0, RC);
6637   if (SrcReg1.isVirtual())
6638     MRI.constrainRegClass(SrcReg1, RC);
6639   if (SrcReg2.isVirtual())
6640     MRI.constrainRegClass(SrcReg2, RC);
6641 
6642   MachineInstrBuilder MIB;
6643   if (kind == FMAInstKind::Default)
6644     MIB = BuildMI(MF, MIMetadata(Root), TII->get(MaddOpc), ResultReg)
6645               .addReg(SrcReg0, getKillRegState(Src0IsKill))
6646               .addReg(SrcReg1, getKillRegState(Src1IsKill))
6647               .addReg(SrcReg2, getKillRegState(Src2IsKill));
6648   else if (kind == FMAInstKind::Indexed)
6649     MIB = BuildMI(MF, MIMetadata(Root), TII->get(MaddOpc), ResultReg)
6650               .addReg(SrcReg2, getKillRegState(Src2IsKill))
6651               .addReg(SrcReg0, getKillRegState(Src0IsKill))
6652               .addReg(SrcReg1, getKillRegState(Src1IsKill))
6653               .addImm(MUL->getOperand(3).getImm());
6654   else if (kind == FMAInstKind::Accumulator)
6655     MIB = BuildMI(MF, MIMetadata(Root), TII->get(MaddOpc), ResultReg)
6656               .addReg(SrcReg2, getKillRegState(Src2IsKill))
6657               .addReg(SrcReg0, getKillRegState(Src0IsKill))
6658               .addReg(SrcReg1, getKillRegState(Src1IsKill));
6659   else
6660     assert(false && "Invalid FMA instruction kind \n");
6661   // Insert the MADD (MADD, FMA, FMS, FMLA, FMLS)
6662   InsInstrs.push_back(MIB);
6663   return MUL;
6664 }
6665 
6666 static MachineInstr *
6667 genFNegatedMAD(MachineFunction &MF, MachineRegisterInfo &MRI,
6668                const TargetInstrInfo *TII, MachineInstr &Root,
6669                SmallVectorImpl<MachineInstr *> &InsInstrs) {
6670   MachineInstr *MAD = MRI.getUniqueVRegDef(Root.getOperand(1).getReg());
6671 
6672   unsigned Opc = 0;
6673   const TargetRegisterClass *RC = MRI.getRegClass(MAD->getOperand(0).getReg());
6674   if (AArch64::FPR32RegClass.hasSubClassEq(RC))
6675     Opc = AArch64::FNMADDSrrr;
6676   else if (AArch64::FPR64RegClass.hasSubClassEq(RC))
6677     Opc = AArch64::FNMADDDrrr;
6678   else
6679     return nullptr;
6680 
6681   Register ResultReg = Root.getOperand(0).getReg();
6682   Register SrcReg0 = MAD->getOperand(1).getReg();
6683   Register SrcReg1 = MAD->getOperand(2).getReg();
6684   Register SrcReg2 = MAD->getOperand(3).getReg();
6685   bool Src0IsKill = MAD->getOperand(1).isKill();
6686   bool Src1IsKill = MAD->getOperand(2).isKill();
6687   bool Src2IsKill = MAD->getOperand(3).isKill();
6688   if (ResultReg.isVirtual())
6689     MRI.constrainRegClass(ResultReg, RC);
6690   if (SrcReg0.isVirtual())
6691     MRI.constrainRegClass(SrcReg0, RC);
6692   if (SrcReg1.isVirtual())
6693     MRI.constrainRegClass(SrcReg1, RC);
6694   if (SrcReg2.isVirtual())
6695     MRI.constrainRegClass(SrcReg2, RC);
6696 
6697   MachineInstrBuilder MIB =
6698       BuildMI(MF, MIMetadata(Root), TII->get(Opc), ResultReg)
6699           .addReg(SrcReg0, getKillRegState(Src0IsKill))
6700           .addReg(SrcReg1, getKillRegState(Src1IsKill))
6701           .addReg(SrcReg2, getKillRegState(Src2IsKill));
6702   InsInstrs.push_back(MIB);
6703 
6704   return MAD;
6705 }
6706 
6707 /// Fold (FMUL x (DUP y lane)) into (FMUL_indexed x y lane)
6708 static MachineInstr *
6709 genIndexedMultiply(MachineInstr &Root,
6710                    SmallVectorImpl<MachineInstr *> &InsInstrs,
6711                    unsigned IdxDupOp, unsigned MulOpc,
6712                    const TargetRegisterClass *RC, MachineRegisterInfo &MRI) {
6713   assert(((IdxDupOp == 1) || (IdxDupOp == 2)) &&
6714          "Invalid index of FMUL operand");
6715 
6716   MachineFunction &MF = *Root.getMF();
6717   const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
6718 
6719   MachineInstr *Dup =
6720       MF.getRegInfo().getUniqueVRegDef(Root.getOperand(IdxDupOp).getReg());
6721 
6722   if (Dup->getOpcode() == TargetOpcode::COPY)
6723     Dup = MRI.getUniqueVRegDef(Dup->getOperand(1).getReg());
6724 
6725   Register DupSrcReg = Dup->getOperand(1).getReg();
6726   MRI.clearKillFlags(DupSrcReg);
6727   MRI.constrainRegClass(DupSrcReg, RC);
6728 
6729   unsigned DupSrcLane = Dup->getOperand(2).getImm();
6730 
6731   unsigned IdxMulOp = IdxDupOp == 1 ? 2 : 1;
6732   MachineOperand &MulOp = Root.getOperand(IdxMulOp);
6733 
6734   Register ResultReg = Root.getOperand(0).getReg();
6735 
6736   MachineInstrBuilder MIB;
6737   MIB = BuildMI(MF, MIMetadata(Root), TII->get(MulOpc), ResultReg)
6738             .add(MulOp)
6739             .addReg(DupSrcReg)
6740             .addImm(DupSrcLane);
6741 
6742   InsInstrs.push_back(MIB);
6743   return &Root;
6744 }
6745 
6746 /// genFusedMultiplyAcc - Helper to generate fused multiply accumulate
6747 /// instructions.
6748 ///
6749 /// \see genFusedMultiply
6750 static MachineInstr *genFusedMultiplyAcc(
6751     MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII,
6752     MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs,
6753     unsigned IdxMulOpd, unsigned MaddOpc, const TargetRegisterClass *RC) {
6754   return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
6755                           FMAInstKind::Accumulator);
6756 }
6757 
6758 /// genNeg - Helper to generate an intermediate negation of the second operand
6759 /// of Root
6760 static Register genNeg(MachineFunction &MF, MachineRegisterInfo &MRI,
6761                        const TargetInstrInfo *TII, MachineInstr &Root,
6762                        SmallVectorImpl<MachineInstr *> &InsInstrs,
6763                        DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
6764                        unsigned MnegOpc, const TargetRegisterClass *RC) {
6765   Register NewVR = MRI.createVirtualRegister(RC);
6766   MachineInstrBuilder MIB =
6767       BuildMI(MF, MIMetadata(Root), TII->get(MnegOpc), NewVR)
6768           .add(Root.getOperand(2));
6769   InsInstrs.push_back(MIB);
6770 
6771   assert(InstrIdxForVirtReg.empty());
6772   InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
6773 
6774   return NewVR;
6775 }
6776 
6777 /// genFusedMultiplyAccNeg - Helper to generate fused multiply accumulate
6778 /// instructions with an additional negation of the accumulator
6779 static MachineInstr *genFusedMultiplyAccNeg(
6780     MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII,
6781     MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs,
6782     DenseMap<unsigned, unsigned> &InstrIdxForVirtReg, unsigned IdxMulOpd,
6783     unsigned MaddOpc, unsigned MnegOpc, const TargetRegisterClass *RC) {
6784   assert(IdxMulOpd == 1);
6785 
6786   Register NewVR =
6787       genNeg(MF, MRI, TII, Root, InsInstrs, InstrIdxForVirtReg, MnegOpc, RC);
6788   return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
6789                           FMAInstKind::Accumulator, &NewVR);
6790 }
6791 
6792 /// genFusedMultiplyIdx - Helper to generate fused multiply accumulate
6793 /// instructions.
6794 ///
6795 /// \see genFusedMultiply
6796 static MachineInstr *genFusedMultiplyIdx(
6797     MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII,
6798     MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs,
6799     unsigned IdxMulOpd, unsigned MaddOpc, const TargetRegisterClass *RC) {
6800   return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
6801                           FMAInstKind::Indexed);
6802 }
6803 
6804 /// genFusedMultiplyIdxNeg - Helper to generate fused multiply accumulate
6805 /// instructions with an additional negation of the accumulator
6806 static MachineInstr *genFusedMultiplyIdxNeg(
6807     MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII,
6808     MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs,
6809     DenseMap<unsigned, unsigned> &InstrIdxForVirtReg, unsigned IdxMulOpd,
6810     unsigned MaddOpc, unsigned MnegOpc, const TargetRegisterClass *RC) {
6811   assert(IdxMulOpd == 1);
6812 
6813   Register NewVR =
6814       genNeg(MF, MRI, TII, Root, InsInstrs, InstrIdxForVirtReg, MnegOpc, RC);
6815 
6816   return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
6817                           FMAInstKind::Indexed, &NewVR);
6818 }
6819 
6820 /// genMaddR - Generate madd instruction and combine mul and add using
6821 /// an extra virtual register
6822 /// Example - an ADD intermediate needs to be stored in a register:
6823 ///   MUL I=A,B,0
6824 ///   ADD R,I,Imm
6825 ///   ==> ORR  V, ZR, Imm
6826 ///   ==> MADD R,A,B,V
6827 /// \param MF Containing MachineFunction
6828 /// \param MRI Register information
6829 /// \param TII Target information
6830 /// \param Root is the ADD instruction
6831 /// \param [out] InsInstrs is a vector of machine instructions and will
6832 /// contain the generated madd instruction
6833 /// \param IdxMulOpd is index of operand in Root that is the result of
6834 /// the MUL. In the example above IdxMulOpd is 1.
6835 /// \param MaddOpc the opcode of the madd instruction
6836 /// \param VR is a virtual register that holds the value of an ADD operand
6837 /// (V in the example above).
6838 /// \param RC Register class of operands
6839 static MachineInstr *genMaddR(MachineFunction &MF, MachineRegisterInfo &MRI,
6840                               const TargetInstrInfo *TII, MachineInstr &Root,
6841                               SmallVectorImpl<MachineInstr *> &InsInstrs,
6842                               unsigned IdxMulOpd, unsigned MaddOpc, unsigned VR,
6843                               const TargetRegisterClass *RC) {
6844   assert(IdxMulOpd == 1 || IdxMulOpd == 2);
6845 
6846   MachineInstr *MUL = MRI.getUniqueVRegDef(Root.getOperand(IdxMulOpd).getReg());
6847   Register ResultReg = Root.getOperand(0).getReg();
6848   Register SrcReg0 = MUL->getOperand(1).getReg();
6849   bool Src0IsKill = MUL->getOperand(1).isKill();
6850   Register SrcReg1 = MUL->getOperand(2).getReg();
6851   bool Src1IsKill = MUL->getOperand(2).isKill();
6852 
6853   if (ResultReg.isVirtual())
6854     MRI.constrainRegClass(ResultReg, RC);
6855   if (SrcReg0.isVirtual())
6856     MRI.constrainRegClass(SrcReg0, RC);
6857   if (SrcReg1.isVirtual())
6858     MRI.constrainRegClass(SrcReg1, RC);
6859   if (Register::isVirtualRegister(VR))
6860     MRI.constrainRegClass(VR, RC);
6861 
6862   MachineInstrBuilder MIB =
6863       BuildMI(MF, MIMetadata(Root), TII->get(MaddOpc), ResultReg)
6864           .addReg(SrcReg0, getKillRegState(Src0IsKill))
6865           .addReg(SrcReg1, getKillRegState(Src1IsKill))
6866           .addReg(VR);
6867   // Insert the MADD
6868   InsInstrs.push_back(MIB);
6869   return MUL;
6870 }
6871 
6872 /// Do the following transformation
6873 /// A - (B + C)  ==>   (A - B) - C
6874 /// A - (B + C)  ==>   (A - C) - B
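/// Which of B and C is subtracted first is selected by \p IdxOpd1, so the
/// combiner can defer the operand that becomes available later to the second,
/// final subtraction.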
6875 static void
6876 genSubAdd2SubSub(MachineFunction &MF, MachineRegisterInfo &MRI,
6877                  const TargetInstrInfo *TII, MachineInstr &Root,
6878                  SmallVectorImpl<MachineInstr *> &InsInstrs,
6879                  SmallVectorImpl<MachineInstr *> &DelInstrs,
6880                  unsigned IdxOpd1,
6881                  DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) {
6882   assert(IdxOpd1 == 1 || IdxOpd1 == 2);
6883   unsigned IdxOtherOpd = IdxOpd1 == 1 ? 2 : 1;
6884   MachineInstr *AddMI = MRI.getUniqueVRegDef(Root.getOperand(2).getReg());
6885 
6886   Register ResultReg = Root.getOperand(0).getReg();
6887   Register RegA = Root.getOperand(1).getReg();
6888   bool RegAIsKill = Root.getOperand(1).isKill();
6889   Register RegB = AddMI->getOperand(IdxOpd1).getReg();
6890   bool RegBIsKill = AddMI->getOperand(IdxOpd1).isKill();
6891   Register RegC = AddMI->getOperand(IdxOtherOpd).getReg();
6892   bool RegCIsKill = AddMI->getOperand(IdxOtherOpd).isKill();
6893   Register NewVR = MRI.createVirtualRegister(MRI.getRegClass(RegA));
6894 
6895   unsigned Opcode = Root.getOpcode();
6896   if (Opcode == AArch64::SUBSWrr)
6897     Opcode = AArch64::SUBWrr;
6898   else if (Opcode == AArch64::SUBSXrr)
6899     Opcode = AArch64::SUBXrr;
6900   else
6901     assert((Opcode == AArch64::SUBWrr || Opcode == AArch64::SUBXrr) &&
6902            "Unexpected instruction opcode.");
6903 
6904   MachineInstrBuilder MIB1 =
6905       BuildMI(MF, MIMetadata(Root), TII->get(Opcode), NewVR)
6906           .addReg(RegA, getKillRegState(RegAIsKill))
6907           .addReg(RegB, getKillRegState(RegBIsKill));
6908   MachineInstrBuilder MIB2 =
6909       BuildMI(MF, MIMetadata(Root), TII->get(Opcode), ResultReg)
6910           .addReg(NewVR, getKillRegState(true))
6911           .addReg(RegC, getKillRegState(RegCIsKill));
6912 
6913   InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
6914   InsInstrs.push_back(MIB1);
6915   InsInstrs.push_back(MIB2);
6916   DelInstrs.push_back(AddMI);
6917 }
6918 
6919 /// When getMachineCombinerPatterns() finds potential patterns,
6920 /// this function generates the instructions that could replace the
6921 /// original code sequence.
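///
/// For example (an illustrative sketch for the MULADD patterns handled
/// below), a sequence such as
/// \code
///   mul w8, w0, w1
///   add w0, w8, w2
/// \endcode
/// is replaced by
/// \code
///   madd w0, w0, w1, w2
/// \endcode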
6922 void AArch64InstrInfo::genAlternativeCodeSequence(
6923     MachineInstr &Root, MachineCombinerPattern Pattern,
6924     SmallVectorImpl<MachineInstr *> &InsInstrs,
6925     SmallVectorImpl<MachineInstr *> &DelInstrs,
6926     DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) const {
6927   MachineBasicBlock &MBB = *Root.getParent();
6928   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
6929   MachineFunction &MF = *MBB.getParent();
6930   const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
6931 
6932   MachineInstr *MUL = nullptr;
6933   const TargetRegisterClass *RC;
6934   unsigned Opc;
6935   switch (Pattern) {
6936   default:
6937     // Reassociate instructions.
6938     TargetInstrInfo::genAlternativeCodeSequence(Root, Pattern, InsInstrs,
6939                                                 DelInstrs, InstrIdxForVirtReg);
6940     return;
6941   case MachineCombinerPattern::SUBADD_OP1:
6942     // A - (B + C)
6943     // ==> (A - B) - C
6944     genSubAdd2SubSub(MF, MRI, TII, Root, InsInstrs, DelInstrs, 1,
6945                      InstrIdxForVirtReg);
6946     break;
6947   case MachineCombinerPattern::SUBADD_OP2:
6948     // A - (B + C)
6949     // ==> (A - C) - B
6950     genSubAdd2SubSub(MF, MRI, TII, Root, InsInstrs, DelInstrs, 2,
6951                      InstrIdxForVirtReg);
6952     break;
6953   case MachineCombinerPattern::MULADDW_OP1:
6954   case MachineCombinerPattern::MULADDX_OP1:
6955     // MUL I=A,B,0
6956     // ADD R,I,C
6957     // ==> MADD R,A,B,C
6958     // --- Create(MADD);
6959     if (Pattern == MachineCombinerPattern::MULADDW_OP1) {
6960       Opc = AArch64::MADDWrrr;
6961       RC = &AArch64::GPR32RegClass;
6962     } else {
6963       Opc = AArch64::MADDXrrr;
6964       RC = &AArch64::GPR64RegClass;
6965     }
6966     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
6967     break;
6968   case MachineCombinerPattern::MULADDW_OP2:
6969   case MachineCombinerPattern::MULADDX_OP2:
6970     // MUL I=A,B,0
6971     // ADD R,C,I
6972     // ==> MADD R,A,B,C
6973     // --- Create(MADD);
6974     if (Pattern == MachineCombinerPattern::MULADDW_OP2) {
6975       Opc = AArch64::MADDWrrr;
6976       RC = &AArch64::GPR32RegClass;
6977     } else {
6978       Opc = AArch64::MADDXrrr;
6979       RC = &AArch64::GPR64RegClass;
6980     }
6981     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
6982     break;
6983   case MachineCombinerPattern::MULADDWI_OP1:
6984   case MachineCombinerPattern::MULADDXI_OP1: {
6985     // MUL I=A,B,0
6986     // ADD R,I,Imm
6987     // ==> MOV V, Imm
6988     // ==> MADD R,A,B,V
6989     // --- Create(MADD);
6990     const TargetRegisterClass *OrrRC;
6991     unsigned BitSize, OrrOpc, ZeroReg;
6992     if (Pattern == MachineCombinerPattern::MULADDWI_OP1) {
6993       OrrOpc = AArch64::ORRWri;
6994       OrrRC = &AArch64::GPR32spRegClass;
6995       BitSize = 32;
6996       ZeroReg = AArch64::WZR;
6997       Opc = AArch64::MADDWrrr;
6998       RC = &AArch64::GPR32RegClass;
6999     } else {
7000       OrrOpc = AArch64::ORRXri;
7001       OrrRC = &AArch64::GPR64spRegClass;
7002       BitSize = 64;
7003       ZeroReg = AArch64::XZR;
7004       Opc = AArch64::MADDXrrr;
7005       RC = &AArch64::GPR64RegClass;
7006     }
7007     Register NewVR = MRI.createVirtualRegister(OrrRC);
7008     uint64_t Imm = Root.getOperand(2).getImm();
7009 
7010     if (Root.getOperand(3).isImm()) {
7011       unsigned Val = Root.getOperand(3).getImm();
7012       Imm = Imm << Val;
7013     }
7014     uint64_t UImm = SignExtend64(Imm, BitSize);
7015     // Bail out unless the immediate can be composed via a single instruction.
7016     SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
7017     AArch64_IMM::expandMOVImm(UImm, BitSize, Insn);
7018     if (Insn.size() != 1)
7019       return;
7020     auto MovI = Insn.begin();
7021     MachineInstrBuilder MIB1;
7022     // MOV is an alias for one of three instructions: movz, movn, and orr.
7023     if (MovI->Opcode == OrrOpc)
7024       MIB1 = BuildMI(MF, MIMetadata(Root), TII->get(OrrOpc), NewVR)
7025                  .addReg(ZeroReg)
7026                  .addImm(MovI->Op2);
7027     else {
7028       if (BitSize == 32)
7029         assert((MovI->Opcode == AArch64::MOVNWi ||
7030                 MovI->Opcode == AArch64::MOVZWi) &&
7031                "Expected opcode");
7032       else
7033         assert((MovI->Opcode == AArch64::MOVNXi ||
7034                 MovI->Opcode == AArch64::MOVZXi) &&
7035                "Expected opcode");
7036       MIB1 = BuildMI(MF, MIMetadata(Root), TII->get(MovI->Opcode), NewVR)
7037                  .addImm(MovI->Op1)
7038                  .addImm(MovI->Op2);
7039     }
7040     InsInstrs.push_back(MIB1);
7041     InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
7042     MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC);
7043     break;
7044   }
7045   case MachineCombinerPattern::MULSUBW_OP1:
7046   case MachineCombinerPattern::MULSUBX_OP1: {
7047     // MUL I=A,B,0
7048     // SUB R,I, C
7049     // ==> SUB  V, 0, C
7050     // ==> MADD R,A,B,V // = -C + A*B
7051     // --- Create(MADD);
7052     const TargetRegisterClass *SubRC;
7053     unsigned SubOpc, ZeroReg;
7054     if (Pattern == MachineCombinerPattern::MULSUBW_OP1) {
7055       SubOpc = AArch64::SUBWrr;
7056       SubRC = &AArch64::GPR32spRegClass;
7057       ZeroReg = AArch64::WZR;
7058       Opc = AArch64::MADDWrrr;
7059       RC = &AArch64::GPR32RegClass;
7060     } else {
7061       SubOpc = AArch64::SUBXrr;
7062       SubRC = &AArch64::GPR64spRegClass;
7063       ZeroReg = AArch64::XZR;
7064       Opc = AArch64::MADDXrrr;
7065       RC = &AArch64::GPR64RegClass;
7066     }
7067     Register NewVR = MRI.createVirtualRegister(SubRC);
7068     // SUB NewVR, 0, C
7069     MachineInstrBuilder MIB1 =
7070         BuildMI(MF, MIMetadata(Root), TII->get(SubOpc), NewVR)
7071             .addReg(ZeroReg)
7072             .add(Root.getOperand(2));
7073     InsInstrs.push_back(MIB1);
7074     InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
7075     MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC);
7076     break;
7077   }
7078   case MachineCombinerPattern::MULSUBW_OP2:
7079   case MachineCombinerPattern::MULSUBX_OP2:
7080     // MUL I=A,B,0
7081     // SUB R,C,I
7082     // ==> MSUB R,A,B,C (computes C - A*B)
7083     // --- Create(MSUB);
7084     if (Pattern == MachineCombinerPattern::MULSUBW_OP2) {
7085       Opc = AArch64::MSUBWrrr;
7086       RC = &AArch64::GPR32RegClass;
7087     } else {
7088       Opc = AArch64::MSUBXrrr;
7089       RC = &AArch64::GPR64RegClass;
7090     }
7091     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7092     break;
7093   case MachineCombinerPattern::MULSUBWI_OP1:
7094   case MachineCombinerPattern::MULSUBXI_OP1: {
7095     // MUL I=A,B,0
7096     // SUB R,I, Imm
7097     // ==> MOV  V, -Imm
7098     // ==> MADD R,A,B,V // = -Imm + A*B
7099     // --- Create(MADD);
7100     const TargetRegisterClass *OrrRC;
7101     unsigned BitSize, OrrOpc, ZeroReg;
7102     if (Pattern == MachineCombinerPattern::MULSUBWI_OP1) {
7103       OrrOpc = AArch64::ORRWri;
7104       OrrRC = &AArch64::GPR32spRegClass;
7105       BitSize = 32;
7106       ZeroReg = AArch64::WZR;
7107       Opc = AArch64::MADDWrrr;
7108       RC = &AArch64::GPR32RegClass;
7109     } else {
7110       OrrOpc = AArch64::ORRXri;
7111       OrrRC = &AArch64::GPR64spRegClass;
7112       BitSize = 64;
7113       ZeroReg = AArch64::XZR;
7114       Opc = AArch64::MADDXrrr;
7115       RC = &AArch64::GPR64RegClass;
7116     }
7117     Register NewVR = MRI.createVirtualRegister(OrrRC);
7118     uint64_t Imm = Root.getOperand(2).getImm();
7119     if (Root.getOperand(3).isImm()) {
7120       unsigned Val = Root.getOperand(3).getImm();
7121       Imm = Imm << Val;
7122     }
7123     uint64_t UImm = SignExtend64(-Imm, BitSize);
7124     // Bail out unless the immediate can be composed via a single instruction.
7125     SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
7126     AArch64_IMM::expandMOVImm(UImm, BitSize, Insn);
7127     if (Insn.size() != 1)
7128       return;
7129     auto MovI = Insn.begin();
7130     MachineInstrBuilder MIB1;
7131     // MOV is an alias for one of three instructions: movz, movn, and orr.
7132     if (MovI->Opcode == OrrOpc)
7133       MIB1 = BuildMI(MF, MIMetadata(Root), TII->get(OrrOpc), NewVR)
7134                  .addReg(ZeroReg)
7135                  .addImm(MovI->Op2);
7136     else {
7137       if (BitSize == 32)
7138         assert((MovI->Opcode == AArch64::MOVNWi ||
7139                 MovI->Opcode == AArch64::MOVZWi) &&
7140                "Expected opcode");
7141       else
7142         assert((MovI->Opcode == AArch64::MOVNXi ||
7143                 MovI->Opcode == AArch64::MOVZXi) &&
7144                "Expected opcode");
7145       MIB1 = BuildMI(MF, MIMetadata(Root), TII->get(MovI->Opcode), NewVR)
7146                  .addImm(MovI->Op1)
7147                  .addImm(MovI->Op2);
7148     }
7149     InsInstrs.push_back(MIB1);
7150     InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
7151     MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC);
7152     break;
7153   }
7154 
7155   case MachineCombinerPattern::MULADDv8i8_OP1:
7156     Opc = AArch64::MLAv8i8;
7157     RC = &AArch64::FPR64RegClass;
7158     MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
7159     break;
7160   case MachineCombinerPattern::MULADDv8i8_OP2:
7161     Opc = AArch64::MLAv8i8;
7162     RC = &AArch64::FPR64RegClass;
7163     MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7164     break;
7165   case MachineCombinerPattern::MULADDv16i8_OP1:
7166     Opc = AArch64::MLAv16i8;
7167     RC = &AArch64::FPR128RegClass;
7168     MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
7169     break;
7170   case MachineCombinerPattern::MULADDv16i8_OP2:
7171     Opc = AArch64::MLAv16i8;
7172     RC = &AArch64::FPR128RegClass;
7173     MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7174     break;
7175   case MachineCombinerPattern::MULADDv4i16_OP1:
7176     Opc = AArch64::MLAv4i16;
7177     RC = &AArch64::FPR64RegClass;
7178     MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
7179     break;
7180   case MachineCombinerPattern::MULADDv4i16_OP2:
7181     Opc = AArch64::MLAv4i16;
7182     RC = &AArch64::FPR64RegClass;
7183     MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7184     break;
7185   case MachineCombinerPattern::MULADDv8i16_OP1:
7186     Opc = AArch64::MLAv8i16;
7187     RC = &AArch64::FPR128RegClass;
7188     MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
7189     break;
7190   case MachineCombinerPattern::MULADDv8i16_OP2:
7191     Opc = AArch64::MLAv8i16;
7192     RC = &AArch64::FPR128RegClass;
7193     MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7194     break;
7195   case MachineCombinerPattern::MULADDv2i32_OP1:
7196     Opc = AArch64::MLAv2i32;
7197     RC = &AArch64::FPR64RegClass;
7198     MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
7199     break;
7200   case MachineCombinerPattern::MULADDv2i32_OP2:
7201     Opc = AArch64::MLAv2i32;
7202     RC = &AArch64::FPR64RegClass;
7203     MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7204     break;
7205   case MachineCombinerPattern::MULADDv4i32_OP1:
7206     Opc = AArch64::MLAv4i32;
7207     RC = &AArch64::FPR128RegClass;
7208     MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
7209     break;
7210   case MachineCombinerPattern::MULADDv4i32_OP2:
7211     Opc = AArch64::MLAv4i32;
7212     RC = &AArch64::FPR128RegClass;
7213     MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7214     break;
7215 
7216   case MachineCombinerPattern::MULSUBv8i8_OP1:
7217     Opc = AArch64::MLAv8i8;
7218     RC = &AArch64::FPR64RegClass;
7219     MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
7220                                  InstrIdxForVirtReg, 1, Opc, AArch64::NEGv8i8,
7221                                  RC);
7222     break;
7223   case MachineCombinerPattern::MULSUBv8i8_OP2:
7224     Opc = AArch64::MLSv8i8;
7225     RC = &AArch64::FPR64RegClass;
7226     MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7227     break;
7228   case MachineCombinerPattern::MULSUBv16i8_OP1:
7229     Opc = AArch64::MLAv16i8;
7230     RC = &AArch64::FPR128RegClass;
7231     MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
7232                                  InstrIdxForVirtReg, 1, Opc, AArch64::NEGv16i8,
7233                                  RC);
7234     break;
7235   case MachineCombinerPattern::MULSUBv16i8_OP2:
7236     Opc = AArch64::MLSv16i8;
7237     RC = &AArch64::FPR128RegClass;
7238     MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7239     break;
7240   case MachineCombinerPattern::MULSUBv4i16_OP1:
7241     Opc = AArch64::MLAv4i16;
7242     RC = &AArch64::FPR64RegClass;
7243     MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
7244                                  InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i16,
7245                                  RC);
7246     break;
7247   case MachineCombinerPattern::MULSUBv4i16_OP2:
7248     Opc = AArch64::MLSv4i16;
7249     RC = &AArch64::FPR64RegClass;
7250     MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7251     break;
7252   case MachineCombinerPattern::MULSUBv8i16_OP1:
7253     Opc = AArch64::MLAv8i16;
7254     RC = &AArch64::FPR128RegClass;
7255     MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
7256                                  InstrIdxForVirtReg, 1, Opc, AArch64::NEGv8i16,
7257                                  RC);
7258     break;
7259   case MachineCombinerPattern::MULSUBv8i16_OP2:
7260     Opc = AArch64::MLSv8i16;
7261     RC = &AArch64::FPR128RegClass;
7262     MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7263     break;
7264   case MachineCombinerPattern::MULSUBv2i32_OP1:
7265     Opc = AArch64::MLAv2i32;
7266     RC = &AArch64::FPR64RegClass;
7267     MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
7268                                  InstrIdxForVirtReg, 1, Opc, AArch64::NEGv2i32,
7269                                  RC);
7270     break;
7271   case MachineCombinerPattern::MULSUBv2i32_OP2:
7272     Opc = AArch64::MLSv2i32;
7273     RC = &AArch64::FPR64RegClass;
7274     MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7275     break;
7276   case MachineCombinerPattern::MULSUBv4i32_OP1:
7277     Opc = AArch64::MLAv4i32;
7278     RC = &AArch64::FPR128RegClass;
7279     MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
7280                                  InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i32,
7281                                  RC);
7282     break;
7283   case MachineCombinerPattern::MULSUBv4i32_OP2:
7284     Opc = AArch64::MLSv4i32;
7285     RC = &AArch64::FPR128RegClass;
7286     MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7287     break;
7288 
7289   case MachineCombinerPattern::MULADDv4i16_indexed_OP1:
7290     Opc = AArch64::MLAv4i16_indexed;
7291     RC = &AArch64::FPR64RegClass;
7292     MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
7293     break;
7294   case MachineCombinerPattern::MULADDv4i16_indexed_OP2:
7295     Opc = AArch64::MLAv4i16_indexed;
7296     RC = &AArch64::FPR64RegClass;
7297     MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7298     break;
7299   case MachineCombinerPattern::MULADDv8i16_indexed_OP1:
7300     Opc = AArch64::MLAv8i16_indexed;
7301     RC = &AArch64::FPR128RegClass;
7302     MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
7303     break;
7304   case MachineCombinerPattern::MULADDv8i16_indexed_OP2:
7305     Opc = AArch64::MLAv8i16_indexed;
7306     RC = &AArch64::FPR128RegClass;
7307     MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7308     break;
7309   case MachineCombinerPattern::MULADDv2i32_indexed_OP1:
7310     Opc = AArch64::MLAv2i32_indexed;
7311     RC = &AArch64::FPR64RegClass;
7312     MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
7313     break;
7314   case MachineCombinerPattern::MULADDv2i32_indexed_OP2:
7315     Opc = AArch64::MLAv2i32_indexed;
7316     RC = &AArch64::FPR64RegClass;
7317     MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7318     break;
7319   case MachineCombinerPattern::MULADDv4i32_indexed_OP1:
7320     Opc = AArch64::MLAv4i32_indexed;
7321     RC = &AArch64::FPR128RegClass;
7322     MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
7323     break;
7324   case MachineCombinerPattern::MULADDv4i32_indexed_OP2:
7325     Opc = AArch64::MLAv4i32_indexed;
7326     RC = &AArch64::FPR128RegClass;
7327     MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7328     break;
7329 
7330   case MachineCombinerPattern::MULSUBv4i16_indexed_OP1:
7331     Opc = AArch64::MLAv4i16_indexed;
7332     RC = &AArch64::FPR64RegClass;
7333     MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs,
7334                                  InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i16,
7335                                  RC);
7336     break;
7337   case MachineCombinerPattern::MULSUBv4i16_indexed_OP2:
7338     Opc = AArch64::MLSv4i16_indexed;
7339     RC = &AArch64::FPR64RegClass;
7340     MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7341     break;
7342   case MachineCombinerPattern::MULSUBv8i16_indexed_OP1:
7343     Opc = AArch64::MLAv8i16_indexed;
7344     RC = &AArch64::FPR128RegClass;
7345     MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs,
7346                                  InstrIdxForVirtReg, 1, Opc, AArch64::NEGv8i16,
7347                                  RC);
7348     break;
7349   case MachineCombinerPattern::MULSUBv8i16_indexed_OP2:
7350     Opc = AArch64::MLSv8i16_indexed;
7351     RC = &AArch64::FPR128RegClass;
7352     MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7353     break;
7354   case MachineCombinerPattern::MULSUBv2i32_indexed_OP1:
7355     Opc = AArch64::MLAv2i32_indexed;
7356     RC = &AArch64::FPR64RegClass;
7357     MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs,
7358                                  InstrIdxForVirtReg, 1, Opc, AArch64::NEGv2i32,
7359                                  RC);
7360     break;
7361   case MachineCombinerPattern::MULSUBv2i32_indexed_OP2:
7362     Opc = AArch64::MLSv2i32_indexed;
7363     RC = &AArch64::FPR64RegClass;
7364     MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7365     break;
7366   case MachineCombinerPattern::MULSUBv4i32_indexed_OP1:
7367     Opc = AArch64::MLAv4i32_indexed;
7368     RC = &AArch64::FPR128RegClass;
7369     MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs,
7370                                  InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i32,
7371                                  RC);
7372     break;
7373   case MachineCombinerPattern::MULSUBv4i32_indexed_OP2:
7374     Opc = AArch64::MLSv4i32_indexed;
7375     RC = &AArch64::FPR128RegClass;
7376     MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7377     break;
7378 
7379   // Floating Point Support
7380   case MachineCombinerPattern::FMULADDH_OP1:
7381     Opc = AArch64::FMADDHrrr;
7382     RC = &AArch64::FPR16RegClass;
7383     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
7384     break;
7385   case MachineCombinerPattern::FMULADDS_OP1:
7386     Opc = AArch64::FMADDSrrr;
7387     RC = &AArch64::FPR32RegClass;
7388     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
7389     break;
7390   case MachineCombinerPattern::FMULADDD_OP1:
7391     Opc = AArch64::FMADDDrrr;
7392     RC = &AArch64::FPR64RegClass;
7393     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
7394     break;
7395 
7396   case MachineCombinerPattern::FMULADDH_OP2:
7397     Opc = AArch64::FMADDHrrr;
7398     RC = &AArch64::FPR16RegClass;
7399     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7400     break;
7401   case MachineCombinerPattern::FMULADDS_OP2:
7402     Opc = AArch64::FMADDSrrr;
7403     RC = &AArch64::FPR32RegClass;
7404     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7405     break;
7406   case MachineCombinerPattern::FMULADDD_OP2:
7407     Opc = AArch64::FMADDDrrr;
7408     RC = &AArch64::FPR64RegClass;
7409     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7410     break;
7411 
7412   case MachineCombinerPattern::FMLAv1i32_indexed_OP1:
7413     Opc = AArch64::FMLAv1i32_indexed;
7414     RC = &AArch64::FPR32RegClass;
7415     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
7416                            FMAInstKind::Indexed);
7417     break;
7418   case MachineCombinerPattern::FMLAv1i32_indexed_OP2:
7419     Opc = AArch64::FMLAv1i32_indexed;
7420     RC = &AArch64::FPR32RegClass;
7421     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
7422                            FMAInstKind::Indexed);
7423     break;
7424 
7425   case MachineCombinerPattern::FMLAv1i64_indexed_OP1:
7426     Opc = AArch64::FMLAv1i64_indexed;
7427     RC = &AArch64::FPR64RegClass;
7428     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
7429                            FMAInstKind::Indexed);
7430     break;
7431   case MachineCombinerPattern::FMLAv1i64_indexed_OP2:
7432     Opc = AArch64::FMLAv1i64_indexed;
7433     RC = &AArch64::FPR64RegClass;
7434     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
7435                            FMAInstKind::Indexed);
7436     break;
7437 
7438   case MachineCombinerPattern::FMLAv4i16_indexed_OP1:
7439     RC = &AArch64::FPR64RegClass;
7440     Opc = AArch64::FMLAv4i16_indexed;
7441     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
7442                            FMAInstKind::Indexed);
7443     break;
7444   case MachineCombinerPattern::FMLAv4f16_OP1:
7445     RC = &AArch64::FPR64RegClass;
7446     Opc = AArch64::FMLAv4f16;
7447     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
7448                            FMAInstKind::Accumulator);
7449     break;
7450   case MachineCombinerPattern::FMLAv4i16_indexed_OP2:
7451     RC = &AArch64::FPR64RegClass;
7452     Opc = AArch64::FMLAv4i16_indexed;
7453     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
7454                            FMAInstKind::Indexed);
7455     break;
7456   case MachineCombinerPattern::FMLAv4f16_OP2:
7457     RC = &AArch64::FPR64RegClass;
7458     Opc = AArch64::FMLAv4f16;
7459     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
7460                            FMAInstKind::Accumulator);
7461     break;
7462 
7463   case MachineCombinerPattern::FMLAv2i32_indexed_OP1:
7464   case MachineCombinerPattern::FMLAv2f32_OP1:
7465     RC = &AArch64::FPR64RegClass;
7466     if (Pattern == MachineCombinerPattern::FMLAv2i32_indexed_OP1) {
7467       Opc = AArch64::FMLAv2i32_indexed;
7468       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
7469                              FMAInstKind::Indexed);
7470     } else {
7471       Opc = AArch64::FMLAv2f32;
7472       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
7473                              FMAInstKind::Accumulator);
7474     }
7475     break;
7476   case MachineCombinerPattern::FMLAv2i32_indexed_OP2:
7477   case MachineCombinerPattern::FMLAv2f32_OP2:
7478     RC = &AArch64::FPR64RegClass;
7479     if (Pattern == MachineCombinerPattern::FMLAv2i32_indexed_OP2) {
7480       Opc = AArch64::FMLAv2i32_indexed;
7481       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
7482                              FMAInstKind::Indexed);
7483     } else {
7484       Opc = AArch64::FMLAv2f32;
7485       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
7486                              FMAInstKind::Accumulator);
7487     }
7488     break;
7489 
7490   case MachineCombinerPattern::FMLAv8i16_indexed_OP1:
7491     RC = &AArch64::FPR128RegClass;
7492     Opc = AArch64::FMLAv8i16_indexed;
7493     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
7494                            FMAInstKind::Indexed);
7495     break;
7496   case MachineCombinerPattern::FMLAv8f16_OP1:
7497     RC = &AArch64::FPR128RegClass;
7498     Opc = AArch64::FMLAv8f16;
7499     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
7500                            FMAInstKind::Accumulator);
7501     break;
7502   case MachineCombinerPattern::FMLAv8i16_indexed_OP2:
7503     RC = &AArch64::FPR128RegClass;
7504     Opc = AArch64::FMLAv8i16_indexed;
7505     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
7506                            FMAInstKind::Indexed);
7507     break;
7508   case MachineCombinerPattern::FMLAv8f16_OP2:
7509     RC = &AArch64::FPR128RegClass;
7510     Opc = AArch64::FMLAv8f16;
7511     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
7512                            FMAInstKind::Accumulator);
7513     break;
7514 
7515   case MachineCombinerPattern::FMLAv2i64_indexed_OP1:
7516   case MachineCombinerPattern::FMLAv2f64_OP1:
7517     RC = &AArch64::FPR128RegClass;
7518     if (Pattern == MachineCombinerPattern::FMLAv2i64_indexed_OP1) {
7519       Opc = AArch64::FMLAv2i64_indexed;
7520       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
7521                              FMAInstKind::Indexed);
7522     } else {
7523       Opc = AArch64::FMLAv2f64;
7524       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
7525                              FMAInstKind::Accumulator);
7526     }
7527     break;
7528   case MachineCombinerPattern::FMLAv2i64_indexed_OP2:
7529   case MachineCombinerPattern::FMLAv2f64_OP2:
7530     RC = &AArch64::FPR128RegClass;
7531     if (Pattern == MachineCombinerPattern::FMLAv2i64_indexed_OP2) {
7532       Opc = AArch64::FMLAv2i64_indexed;
7533       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
7534                              FMAInstKind::Indexed);
7535     } else {
7536       Opc = AArch64::FMLAv2f64;
7537       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
7538                              FMAInstKind::Accumulator);
7539     }
7540     break;
7541 
7542   case MachineCombinerPattern::FMLAv4i32_indexed_OP1:
7543   case MachineCombinerPattern::FMLAv4f32_OP1:
7544     RC = &AArch64::FPR128RegClass;
7545     if (Pattern == MachineCombinerPattern::FMLAv4i32_indexed_OP1) {
7546       Opc = AArch64::FMLAv4i32_indexed;
7547       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
7548                              FMAInstKind::Indexed);
7549     } else {
7550       Opc = AArch64::FMLAv4f32;
7551       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
7552                              FMAInstKind::Accumulator);
7553     }
7554     break;
7555 
7556   case MachineCombinerPattern::FMLAv4i32_indexed_OP2:
7557   case MachineCombinerPattern::FMLAv4f32_OP2:
7558     RC = &AArch64::FPR128RegClass;
7559     if (Pattern == MachineCombinerPattern::FMLAv4i32_indexed_OP2) {
7560       Opc = AArch64::FMLAv4i32_indexed;
7561       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
7562                              FMAInstKind::Indexed);
7563     } else {
7564       Opc = AArch64::FMLAv4f32;
7565       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
7566                              FMAInstKind::Accumulator);
7567     }
7568     break;
7569 
7570   case MachineCombinerPattern::FMULSUBH_OP1:
7571     Opc = AArch64::FNMSUBHrrr;
7572     RC = &AArch64::FPR16RegClass;
7573     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
7574     break;
7575   case MachineCombinerPattern::FMULSUBS_OP1:
7576     Opc = AArch64::FNMSUBSrrr;
7577     RC = &AArch64::FPR32RegClass;
7578     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
7579     break;
7580   case MachineCombinerPattern::FMULSUBD_OP1:
7581     Opc = AArch64::FNMSUBDrrr;
7582     RC = &AArch64::FPR64RegClass;
7583     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
7584     break;
7585 
7586   case MachineCombinerPattern::FNMULSUBH_OP1:
7587     Opc = AArch64::FNMADDHrrr;
7588     RC = &AArch64::FPR16RegClass;
7589     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
7590     break;
7591   case MachineCombinerPattern::FNMULSUBS_OP1:
7592     Opc = AArch64::FNMADDSrrr;
7593     RC = &AArch64::FPR32RegClass;
7594     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
7595     break;
7596   case MachineCombinerPattern::FNMULSUBD_OP1:
7597     Opc = AArch64::FNMADDDrrr;
7598     RC = &AArch64::FPR64RegClass;
7599     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
7600     break;
7601 
7602   case MachineCombinerPattern::FMULSUBH_OP2:
7603     Opc = AArch64::FMSUBHrrr;
7604     RC = &AArch64::FPR16RegClass;
7605     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7606     break;
7607   case MachineCombinerPattern::FMULSUBS_OP2:
7608     Opc = AArch64::FMSUBSrrr;
7609     RC = &AArch64::FPR32RegClass;
7610     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7611     break;
7612   case MachineCombinerPattern::FMULSUBD_OP2:
7613     Opc = AArch64::FMSUBDrrr;
7614     RC = &AArch64::FPR64RegClass;
7615     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7616     break;
7617 
7618   case MachineCombinerPattern::FMLSv1i32_indexed_OP2:
7619     Opc = AArch64::FMLSv1i32_indexed;
7620     RC = &AArch64::FPR32RegClass;
7621     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
7622                            FMAInstKind::Indexed);
7623     break;
7624 
7625   case MachineCombinerPattern::FMLSv1i64_indexed_OP2:
7626     Opc = AArch64::FMLSv1i64_indexed;
7627     RC = &AArch64::FPR64RegClass;
7628     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
7629                            FMAInstKind::Indexed);
7630     break;
7631 
7632   case MachineCombinerPattern::FMLSv4f16_OP1:
7633   case MachineCombinerPattern::FMLSv4i16_indexed_OP1: {
7634     RC = &AArch64::FPR64RegClass;
7635     Register NewVR = MRI.createVirtualRegister(RC);
7636     MachineInstrBuilder MIB1 =
7637         BuildMI(MF, MIMetadata(Root), TII->get(AArch64::FNEGv4f16), NewVR)
7638             .add(Root.getOperand(2));
7639     InsInstrs.push_back(MIB1);
7640     InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
7641     if (Pattern == MachineCombinerPattern::FMLSv4f16_OP1) {
7642       Opc = AArch64::FMLAv4f16;
7643       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
7644                              FMAInstKind::Accumulator, &NewVR);
7645     } else {
7646       Opc = AArch64::FMLAv4i16_indexed;
7647       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
7648                              FMAInstKind::Indexed, &NewVR);
7649     }
7650     break;
7651   }
7652   case MachineCombinerPattern::FMLSv4f16_OP2:
7653     RC = &AArch64::FPR64RegClass;
7654     Opc = AArch64::FMLSv4f16;
7655     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
7656                            FMAInstKind::Accumulator);
7657     break;
7658   case MachineCombinerPattern::FMLSv4i16_indexed_OP2:
7659     RC = &AArch64::FPR64RegClass;
7660     Opc = AArch64::FMLSv4i16_indexed;
7661     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
7662                            FMAInstKind::Indexed);
7663     break;
7664 
7665   case MachineCombinerPattern::FMLSv2f32_OP2:
7666   case MachineCombinerPattern::FMLSv2i32_indexed_OP2:
7667     RC = &AArch64::FPR64RegClass;
7668     if (Pattern == MachineCombinerPattern::FMLSv2i32_indexed_OP2) {
7669       Opc = AArch64::FMLSv2i32_indexed;
7670       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
7671                              FMAInstKind::Indexed);
7672     } else {
7673       Opc = AArch64::FMLSv2f32;
7674       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
7675                              FMAInstKind::Accumulator);
7676     }
7677     break;
7678 
7679   case MachineCombinerPattern::FMLSv8f16_OP1:
7680   case MachineCombinerPattern::FMLSv8i16_indexed_OP1: {
7681     RC = &AArch64::FPR128RegClass;
7682     Register NewVR = MRI.createVirtualRegister(RC);
7683     MachineInstrBuilder MIB1 =
7684         BuildMI(MF, MIMetadata(Root), TII->get(AArch64::FNEGv8f16), NewVR)
7685             .add(Root.getOperand(2));
7686     InsInstrs.push_back(MIB1);
7687     InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
7688     if (Pattern == MachineCombinerPattern::FMLSv8f16_OP1) {
7689       Opc = AArch64::FMLAv8f16;
7690       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
7691                              FMAInstKind::Accumulator, &NewVR);
7692     } else {
7693       Opc = AArch64::FMLAv8i16_indexed;
7694       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
7695                              FMAInstKind::Indexed, &NewVR);
7696     }
7697     break;
7698   }
7699   case MachineCombinerPattern::FMLSv8f16_OP2:
7700     RC = &AArch64::FPR128RegClass;
7701     Opc = AArch64::FMLSv8f16;
7702     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
7703                            FMAInstKind::Accumulator);
7704     break;
7705   case MachineCombinerPattern::FMLSv8i16_indexed_OP2:
7706     RC = &AArch64::FPR128RegClass;
7707     Opc = AArch64::FMLSv8i16_indexed;
7708     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
7709                            FMAInstKind::Indexed);
7710     break;
7711 
7712   case MachineCombinerPattern::FMLSv2f64_OP2:
7713   case MachineCombinerPattern::FMLSv2i64_indexed_OP2:
7714     RC = &AArch64::FPR128RegClass;
7715     if (Pattern == MachineCombinerPattern::FMLSv2i64_indexed_OP2) {
7716       Opc = AArch64::FMLSv2i64_indexed;
7717       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
7718                              FMAInstKind::Indexed);
7719     } else {
7720       Opc = AArch64::FMLSv2f64;
7721       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
7722                              FMAInstKind::Accumulator);
7723     }
7724     break;
7725 
7726   case MachineCombinerPattern::FMLSv4f32_OP2:
7727   case MachineCombinerPattern::FMLSv4i32_indexed_OP2:
7728     RC = &AArch64::FPR128RegClass;
7729     if (Pattern == MachineCombinerPattern::FMLSv4i32_indexed_OP2) {
7730       Opc = AArch64::FMLSv4i32_indexed;
7731       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
7732                              FMAInstKind::Indexed);
7733     } else {
7734       Opc = AArch64::FMLSv4f32;
7735       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
7736                              FMAInstKind::Accumulator);
7737     }
7738     break;
7739   case MachineCombinerPattern::FMLSv2f32_OP1:
7740   case MachineCombinerPattern::FMLSv2i32_indexed_OP1: {
7741     RC = &AArch64::FPR64RegClass;
7742     Register NewVR = MRI.createVirtualRegister(RC);
7743     MachineInstrBuilder MIB1 =
7744         BuildMI(MF, MIMetadata(Root), TII->get(AArch64::FNEGv2f32), NewVR)
7745             .add(Root.getOperand(2));
7746     InsInstrs.push_back(MIB1);
7747     InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
7748     if (Pattern == MachineCombinerPattern::FMLSv2i32_indexed_OP1) {
7749       Opc = AArch64::FMLAv2i32_indexed;
7750       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
7751                              FMAInstKind::Indexed, &NewVR);
7752     } else {
7753       Opc = AArch64::FMLAv2f32;
7754       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
7755                              FMAInstKind::Accumulator, &NewVR);
7756     }
7757     break;
7758   }
7759   case MachineCombinerPattern::FMLSv4f32_OP1:
7760   case MachineCombinerPattern::FMLSv4i32_indexed_OP1: {
7761     RC = &AArch64::FPR128RegClass;
7762     Register NewVR = MRI.createVirtualRegister(RC);
7763     MachineInstrBuilder MIB1 =
7764         BuildMI(MF, MIMetadata(Root), TII->get(AArch64::FNEGv4f32), NewVR)
7765             .add(Root.getOperand(2));
7766     InsInstrs.push_back(MIB1);
7767     InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
7768     if (Pattern == MachineCombinerPattern::FMLSv4i32_indexed_OP1) {
7769       Opc = AArch64::FMLAv4i32_indexed;
7770       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
7771                              FMAInstKind::Indexed, &NewVR);
7772     } else {
7773       Opc = AArch64::FMLAv4f32;
7774       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
7775                              FMAInstKind::Accumulator, &NewVR);
7776     }
7777     break;
7778   }
7779   case MachineCombinerPattern::FMLSv2f64_OP1:
7780   case MachineCombinerPattern::FMLSv2i64_indexed_OP1: {
7781     RC = &AArch64::FPR128RegClass;
7782     Register NewVR = MRI.createVirtualRegister(RC);
7783     MachineInstrBuilder MIB1 =
7784         BuildMI(MF, MIMetadata(Root), TII->get(AArch64::FNEGv2f64), NewVR)
7785             .add(Root.getOperand(2));
7786     InsInstrs.push_back(MIB1);
7787     InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
7788     if (Pattern == MachineCombinerPattern::FMLSv2i64_indexed_OP1) {
7789       Opc = AArch64::FMLAv2i64_indexed;
7790       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
7791                              FMAInstKind::Indexed, &NewVR);
7792     } else {
7793       Opc = AArch64::FMLAv2f64;
7794       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
7795                              FMAInstKind::Accumulator, &NewVR);
7796     }
7797     break;
7798   }
7799   case MachineCombinerPattern::FMULv2i32_indexed_OP1:
7800   case MachineCombinerPattern::FMULv2i32_indexed_OP2: {
7801     unsigned IdxDupOp =
7802         (Pattern == MachineCombinerPattern::FMULv2i32_indexed_OP1) ? 1 : 2;
7803     genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv2i32_indexed,
7804                        &AArch64::FPR128RegClass, MRI);
7805     break;
7806   }
7807   case MachineCombinerPattern::FMULv2i64_indexed_OP1:
7808   case MachineCombinerPattern::FMULv2i64_indexed_OP2: {
7809     unsigned IdxDupOp =
7810         (Pattern == MachineCombinerPattern::FMULv2i64_indexed_OP1) ? 1 : 2;
7811     genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv2i64_indexed,
7812                        &AArch64::FPR128RegClass, MRI);
7813     break;
7814   }
7815   case MachineCombinerPattern::FMULv4i16_indexed_OP1:
7816   case MachineCombinerPattern::FMULv4i16_indexed_OP2: {
7817     unsigned IdxDupOp =
7818         (Pattern == MachineCombinerPattern::FMULv4i16_indexed_OP1) ? 1 : 2;
7819     genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv4i16_indexed,
7820                        &AArch64::FPR128_loRegClass, MRI);
7821     break;
7822   }
7823   case MachineCombinerPattern::FMULv4i32_indexed_OP1:
7824   case MachineCombinerPattern::FMULv4i32_indexed_OP2: {
7825     unsigned IdxDupOp =
7826         (Pattern == MachineCombinerPattern::FMULv4i32_indexed_OP1) ? 1 : 2;
7827     genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv4i32_indexed,
7828                        &AArch64::FPR128RegClass, MRI);
7829     break;
7830   }
7831   case MachineCombinerPattern::FMULv8i16_indexed_OP1:
7832   case MachineCombinerPattern::FMULv8i16_indexed_OP2: {
7833     unsigned IdxDupOp =
7834         (Pattern == MachineCombinerPattern::FMULv8i16_indexed_OP1) ? 1 : 2;
7835     genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv8i16_indexed,
7836                        &AArch64::FPR128_loRegClass, MRI);
7837     break;
7838   }
7839   case MachineCombinerPattern::FNMADD: {
7840     MUL = genFNegatedMAD(MF, MRI, TII, Root, InsInstrs);
7841     break;
7842   }
7843 
7844   } // end switch (Pattern)
7845   // Record MUL and ADD/SUB for deletion
7846   if (MUL)
7847     DelInstrs.push_back(MUL);
7848   DelInstrs.push_back(&Root);
7849 
7850   // Set the flags on the inserted instructions to be the merged flags of the
7851   // instructions that we have combined.
7852   uint32_t Flags = Root.getFlags();
7853   if (MUL)
7854     Flags = Root.mergeFlagsWith(*MUL);
7855   for (auto *MI : InsInstrs)
7856     MI->setFlags(Flags);
7857 }
7858 
7859 /// Replace a csinc-branch sequence with a simple conditional branch.
7860 ///
7861 /// Examples:
7862 /// 1. \code
7863 ///   csinc  w9, wzr, wzr, <condition code>
7864 ///   tbnz   w9, #0, 0x44
7865 ///    \endcode
7866 /// to
7867 ///    \code
7868 ///   b.<inverted condition code>
7869 ///    \endcode
7870 ///
7871 /// 2. \code
7872 ///   csinc w9, wzr, wzr, <condition code>
7873 ///   tbz   w9, #0, 0x44
7874 ///    \endcode
7875 /// to
7876 ///    \code
7877 ///   b.<condition code>
7878 ///    \endcode
7879 ///
7880 /// Replace a compare-and-branch sequence with a TBZ/TBNZ instruction when
7881 /// the compare's constant operand is a power of 2.
7882 ///
7883 /// Examples:
7884 ///    \code
7885 ///   and  w8, w8, #0x400
7886 ///   cbnz w8, L1
7887 ///    \endcode
7888 /// to
7889 ///    \code
7890 ///   tbnz w8, #10, L1
7891 ///    \endcode
7892 ///
7893 /// \param  MI Conditional Branch
7894 /// \return True when the simple conditional branch is generated
7895 ///
7896 bool AArch64InstrInfo::optimizeCondBranch(MachineInstr &MI) const {
7897   bool IsNegativeBranch = false;
7898   bool IsTestAndBranch = false;
7899   unsigned TargetBBInMI = 0;
7900   switch (MI.getOpcode()) {
7901   default:
7902     llvm_unreachable("Unknown branch instruction?");
7903   case AArch64::Bcc:
7904     return false;
7905   case AArch64::CBZW:
7906   case AArch64::CBZX:
7907     TargetBBInMI = 1;
7908     break;
7909   case AArch64::CBNZW:
7910   case AArch64::CBNZX:
7911     TargetBBInMI = 1;
7912     IsNegativeBranch = true;
7913     break;
7914   case AArch64::TBZW:
7915   case AArch64::TBZX:
7916     TargetBBInMI = 2;
7917     IsTestAndBranch = true;
7918     break;
7919   case AArch64::TBNZW:
7920   case AArch64::TBNZX:
7921     TargetBBInMI = 2;
7922     IsNegativeBranch = true;
7923     IsTestAndBranch = true;
7924     break;
7925   }
7926   // So we increment a zero register and test for bits other
7927   // than bit 0? Conservatively bail out in case the verifier
7928   // missed this case.
7929   if (IsTestAndBranch && MI.getOperand(1).getImm())
7930     return false;
7931 
7932   // Find Definition.
7933   assert(MI.getParent() && "Incomplete machine instruction\n");
7934   MachineBasicBlock *MBB = MI.getParent();
7935   MachineFunction *MF = MBB->getParent();
7936   MachineRegisterInfo *MRI = &MF->getRegInfo();
7937   Register VReg = MI.getOperand(0).getReg();
7938   if (!VReg.isVirtual())
7939     return false;
7940 
7941   MachineInstr *DefMI = MRI->getVRegDef(VReg);
7942 
7943   // Look through COPY instructions to find definition.
7944   while (DefMI->isCopy()) {
7945     Register CopyVReg = DefMI->getOperand(1).getReg();
7946     if (!MRI->hasOneNonDBGUse(CopyVReg))
7947       return false;
7948     if (!MRI->hasOneDef(CopyVReg))
7949       return false;
7950     DefMI = MRI->getVRegDef(CopyVReg);
7951   }
7952 
7953   switch (DefMI->getOpcode()) {
7954   default:
7955     return false;
7956   // Fold AND into a TBZ/TBNZ if constant operand is power of 2.
7957   case AArch64::ANDWri:
7958   case AArch64::ANDXri: {
7959     if (IsTestAndBranch)
7960       return false;
7961     if (DefMI->getParent() != MBB)
7962       return false;
7963     if (!MRI->hasOneNonDBGUse(VReg))
7964       return false;
7965 
7966     bool Is32Bit = (DefMI->getOpcode() == AArch64::ANDWri);
7967     uint64_t Mask = AArch64_AM::decodeLogicalImmediate(
7968         DefMI->getOperand(2).getImm(), Is32Bit ? 32 : 64);
7969     if (!isPowerOf2_64(Mask))
7970       return false;
7971 
7972     MachineOperand &MO = DefMI->getOperand(1);
7973     Register NewReg = MO.getReg();
7974     if (!NewReg.isVirtual())
7975       return false;
7976 
7977     assert(!MRI->def_empty(NewReg) && "Register must be defined.");
7978 
7979     MachineBasicBlock &RefToMBB = *MBB;
7980     MachineBasicBlock *TBB = MI.getOperand(1).getMBB();
7981     DebugLoc DL = MI.getDebugLoc();
7982     unsigned Imm = Log2_64(Mask);
7983     unsigned Opc = (Imm < 32)
7984                        ? (IsNegativeBranch ? AArch64::TBNZW : AArch64::TBZW)
7985                        : (IsNegativeBranch ? AArch64::TBNZX : AArch64::TBZX);
7986     MachineInstr *NewMI = BuildMI(RefToMBB, MI, DL, get(Opc))
7987                               .addReg(NewReg)
7988                               .addImm(Imm)
7989                               .addMBB(TBB);
7990     // Register lives on to the TBZ/TBNZ now.
7991     MO.setIsKill(false);
7992 
7993     // For an immediate smaller than 32, we must use the 32-bit (W) variant
7994     // in all cases, because the 64-bit (X) variant cannot encode such bit
7995     // positions.
7996     // Therefore, if the input register is 64-bit, we need to take its
7997     // 32-bit sub-register.
7998     if (!Is32Bit && Imm < 32)
7999       NewMI->getOperand(0).setSubReg(AArch64::sub_32);
8000     MI.eraseFromParent();
8001     return true;
8002   }
8003   // Look for CSINC
8004   case AArch64::CSINCWr:
8005   case AArch64::CSINCXr: {
8006     if (!(DefMI->getOperand(1).getReg() == AArch64::WZR &&
8007           DefMI->getOperand(2).getReg() == AArch64::WZR) &&
8008         !(DefMI->getOperand(1).getReg() == AArch64::XZR &&
8009           DefMI->getOperand(2).getReg() == AArch64::XZR))
8010       return false;
8011 
8012     if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, true) != -1)
8013       return false;
8014 
8015     AArch64CC::CondCode CC = (AArch64CC::CondCode)DefMI->getOperand(3).getImm();
8016     // Convert only when the condition code is not modified between
8017     // the CSINC and the branch. The CC may be used by other
8018     // instructions in between.
8019     if (areCFlagsAccessedBetweenInstrs(DefMI, MI, &getRegisterInfo(), AK_Write))
8020       return false;
8021     MachineBasicBlock &RefToMBB = *MBB;
8022     MachineBasicBlock *TBB = MI.getOperand(TargetBBInMI).getMBB();
8023     DebugLoc DL = MI.getDebugLoc();
8024     if (IsNegativeBranch)
8025       CC = AArch64CC::getInvertedCondCode(CC);
8026     BuildMI(RefToMBB, MI, DL, get(AArch64::Bcc)).addImm(CC).addMBB(TBB);
8027     MI.eraseFromParent();
8028     return true;
8029   }
8030   }
8031 }
8032 
8033 std::pair<unsigned, unsigned>
8034 AArch64InstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const {
8035   const unsigned Mask = AArch64II::MO_FRAGMENT;
8036   return std::make_pair(TF & Mask, TF & ~Mask);
8037 }
8038 
8039 ArrayRef<std::pair<unsigned, const char *>>
8040 AArch64InstrInfo::getSerializableDirectMachineOperandTargetFlags() const {
8041   using namespace AArch64II;
8042 
8043   static const std::pair<unsigned, const char *> TargetFlags[] = {
8044       {MO_PAGE, "aarch64-page"}, {MO_PAGEOFF, "aarch64-pageoff"},
8045       {MO_G3, "aarch64-g3"},     {MO_G2, "aarch64-g2"},
8046       {MO_G1, "aarch64-g1"},     {MO_G0, "aarch64-g0"},
8047       {MO_HI12, "aarch64-hi12"}};
8048   return ArrayRef(TargetFlags);
8049 }
8050 
8051 ArrayRef<std::pair<unsigned, const char *>>
8052 AArch64InstrInfo::getSerializableBitmaskMachineOperandTargetFlags() const {
8053   using namespace AArch64II;
8054 
8055   static const std::pair<unsigned, const char *> TargetFlags[] = {
8056       {MO_COFFSTUB, "aarch64-coffstub"},
8057       {MO_GOT, "aarch64-got"},
8058       {MO_NC, "aarch64-nc"},
8059       {MO_S, "aarch64-s"},
8060       {MO_TLS, "aarch64-tls"},
8061       {MO_DLLIMPORT, "aarch64-dllimport"},
8062       {MO_PREL, "aarch64-prel"},
8063       {MO_TAGGED, "aarch64-tagged"},
8064       {MO_ARM64EC_CALLMANGLE, "aarch64-arm64ec-callmangle"},
8065   };
8066   return ArrayRef(TargetFlags);
8067 }
8068 
8069 ArrayRef<std::pair<MachineMemOperand::Flags, const char *>>
8070 AArch64InstrInfo::getSerializableMachineMemOperandTargetFlags() const {
8071   static const std::pair<MachineMemOperand::Flags, const char *> TargetFlags[] =
8072       {{MOSuppressPair, "aarch64-suppress-pair"},
8073        {MOStridedAccess, "aarch64-strided-access"}};
8074   return ArrayRef(TargetFlags);
8075 }
8076 
8077 /// Constants defining how certain sequences should be outlined.
8078 /// This encompasses how an outlined function should be called, and what kind of
8079 /// frame should be emitted for that outlined function.
8080 ///
8081 /// \p MachineOutlinerDefault implies that the function should be called with
8082 /// a save and restore of LR to the stack.
8083 ///
8084 /// That is,
8085 ///
8086 /// I1     Save LR                    OUTLINED_FUNCTION:
8087 /// I2 --> BL OUTLINED_FUNCTION       I1
8088 /// I3     Restore LR                 I2
8089 ///                                   I3
8090 ///                                   RET
8091 ///
8092 /// * Call construction overhead: 3 (save + BL + restore)
8093 /// * Frame construction overhead: 1 (ret)
8094 /// * Requires stack fixups? Yes
8095 ///
8096 /// \p MachineOutlinerTailCall implies that the function is being created from
8097 /// a sequence of instructions ending in a return.
8098 ///
8099 /// That is,
8100 ///
8101 /// I1                             OUTLINED_FUNCTION:
8102 /// I2 --> B OUTLINED_FUNCTION     I1
8103 /// RET                            I2
8104 ///                                RET
8105 ///
8106 /// * Call construction overhead: 1 (B)
8107 /// * Frame construction overhead: 0 (Return included in sequence)
8108 /// * Requires stack fixups? No
8109 ///
8110 /// \p MachineOutlinerNoLRSave implies that the function should be called using
8111 /// a BL instruction, but doesn't require LR to be saved and restored. This
8112 /// happens when LR is known to be dead.
8113 ///
8114 /// That is,
8115 ///
8116 /// I1                                OUTLINED_FUNCTION:
8117 /// I2 --> BL OUTLINED_FUNCTION       I1
8118 /// I3                                I2
8119 ///                                   I3
8120 ///                                   RET
8121 ///
8122 /// * Call construction overhead: 1 (BL)
8123 /// * Frame construction overhead: 1 (RET)
8124 /// * Requires stack fixups? No
8125 ///
8126 /// \p MachineOutlinerThunk implies that the function is being created from
8127 /// a sequence of instructions ending in a call. The outlined function is
8128 /// called with a BL instruction, and the outlined function tail-calls the
8129 /// original call destination.
8130 ///
8131 /// That is,
8132 ///
8133 /// I1                                OUTLINED_FUNCTION:
8134 /// I2 --> BL OUTLINED_FUNCTION       I1
8135 /// BL f                              I2
8136 ///                                   B f
8137 /// * Call construction overhead: 1 (BL)
8138 /// * Frame construction overhead: 0
8139 /// * Requires stack fixups? No
8140 ///
8141 /// \p MachineOutlinerRegSave implies that the function should be called with a
8142 /// save and restore of LR to an available register. This allows us to avoid
8143 /// stack fixups. Note that this outlining variant is compatible with the
8144 /// NoLRSave case.
8145 ///
8146 /// That is,
8147 ///
8148 /// I1     Save LR                    OUTLINED_FUNCTION:
8149 /// I2 --> BL OUTLINED_FUNCTION       I1
8150 /// I3     Restore LR                 I2
8151 ///                                   I3
8152 ///                                   RET
8153 ///
8154 /// * Call construction overhead: 3 (save + BL + restore)
8155 /// * Frame construction overhead: 1 (ret)
8156 /// * Requires stack fixups? No
8157 enum MachineOutlinerClass {
8158   MachineOutlinerDefault,  /// Emit a save, restore, call, and return.
8159   MachineOutlinerTailCall, /// Only emit a branch.
8160   MachineOutlinerNoLRSave, /// Emit a call and return.
8161   MachineOutlinerThunk,    /// Emit a call and tail-call.
8162   MachineOutlinerRegSave   /// Same as default, but save to a register.
8163 };
8164 
8165 enum MachineOutlinerMBBFlags {
8166   LRUnavailableSomewhere = 0x2,
8167   HasCalls = 0x4,
8168   UnsafeRegsDead = 0x8
8169 };
8170 
8171 Register
8172 AArch64InstrInfo::findRegisterToSaveLRTo(outliner::Candidate &C) const {
8173   MachineFunction *MF = C.getMF();
8174   const TargetRegisterInfo &TRI = *MF->getSubtarget().getRegisterInfo();
8175   const AArch64RegisterInfo *ARI =
8176       static_cast<const AArch64RegisterInfo *>(&TRI);
8177   // Check if there is an available register across the sequence that we can
8178   // use.
8179   for (unsigned Reg : AArch64::GPR64RegClass) {
8180     if (!ARI->isReservedReg(*MF, Reg) &&
8181         Reg != AArch64::LR &&  // LR is not reserved, but don't use it.
8182         Reg != AArch64::X16 && // X16 is not guaranteed to be preserved.
8183         Reg != AArch64::X17 && // Ditto for X17.
8184         C.isAvailableAcrossAndOutOfSeq(Reg, TRI) &&
8185         C.isAvailableInsideSeq(Reg, TRI))
8186       return Reg;
8187   }
8188   return Register();
8189 }
8190 
8191 static bool
8192 outliningCandidatesSigningScopeConsensus(const outliner::Candidate &a,
8193                                          const outliner::Candidate &b) {
8194   const auto &MFIa = a.getMF()->getInfo<AArch64FunctionInfo>();
8195   const auto &MFIb = b.getMF()->getInfo<AArch64FunctionInfo>();
8196 
8197   return MFIa->shouldSignReturnAddress(false) == MFIb->shouldSignReturnAddress(false) &&
8198          MFIa->shouldSignReturnAddress(true) == MFIb->shouldSignReturnAddress(true);
8199 }
8200 
8201 static bool
8202 outliningCandidatesSigningKeyConsensus(const outliner::Candidate &a,
8203                                        const outliner::Candidate &b) {
8204   const auto &MFIa = a.getMF()->getInfo<AArch64FunctionInfo>();
8205   const auto &MFIb = b.getMF()->getInfo<AArch64FunctionInfo>();
8206 
8207   return MFIa->shouldSignWithBKey() == MFIb->shouldSignWithBKey();
8208 }
8209 
8210 static bool outliningCandidatesV8_3OpsConsensus(const outliner::Candidate &a,
8211                                                 const outliner::Candidate &b) {
8212   const AArch64Subtarget &SubtargetA =
8213       a.getMF()->getSubtarget<AArch64Subtarget>();
8214   const AArch64Subtarget &SubtargetB =
8215       b.getMF()->getSubtarget<AArch64Subtarget>();
8216   return SubtargetA.hasV8_3aOps() == SubtargetB.hasV8_3aOps();
8217 }
8218 
8219 std::optional<outliner::OutlinedFunction>
8220 AArch64InstrInfo::getOutliningCandidateInfo(
8221     std::vector<outliner::Candidate> &RepeatedSequenceLocs) const {
8222   outliner::Candidate &FirstCand = RepeatedSequenceLocs[0];
8223 
8224   unsigned SequenceSize = 0;
8225   for (auto &MI : FirstCand)
8226     SequenceSize += getInstSizeInBytes(MI);
8227 
8228   unsigned NumBytesToCreateFrame = 0;
8229 
8230   // We only allow outlining for functions having exactly matching return
8231   // address signing attributes, i.e., all share the same value for the
8232   // attribute "sign-return-address" and all share the same type of key they
8233   // are signed with.
8234   // Additionally we require all functions to simultaneously either support
8235   // v8.3a features or not. Otherwise an outlined function could get signed
8236   // using dedicated v8.3 instructions and a call from a function that doesn't
8237   // support v8.3 instructions would therefore be invalid.
8238   if (std::adjacent_find(
8239           RepeatedSequenceLocs.begin(), RepeatedSequenceLocs.end(),
8240           [](const outliner::Candidate &a, const outliner::Candidate &b) {
8241             // Return true if a and b are non-equal w.r.t. return address
8242             // signing or support of v8.3a features
8243             if (outliningCandidatesSigningScopeConsensus(a, b) &&
8244                 outliningCandidatesSigningKeyConsensus(a, b) &&
8245                 outliningCandidatesV8_3OpsConsensus(a, b)) {
8246               return false;
8247             }
8248             return true;
8249           }) != RepeatedSequenceLocs.end()) {
8250     return std::nullopt;
8251   }
8252 
8253   // Since at this point all candidates agree on their return address signing
8254   // picking just one is fine. If the candidate functions potentially sign their
8255   // return addresses, the outlined function should do the same. Note that in
8256   // the case of "sign-return-address"="non-leaf" this is an assumption: It is
8257   // not certainly true that the outlined function will have to sign its return
8258   // address but this decision is made later, when the decision to outline
8259   // has already been made.
8260   // The same holds for the number of additional instructions we need: On
8261   // v8.3a RET can be replaced by RETAA/RETAB and no AUT instruction is
8262   // necessary. However, at this point we don't know if the outlined function
8263   // will have a RET instruction so we assume the worst.
8264   const TargetRegisterInfo &TRI = getRegisterInfo();
8265   // Performing a tail call may require extra checks when PAuth is enabled.
8266   // If PAuth is disabled, set it to zero for uniformity.
8267   unsigned NumBytesToCheckLRInTCEpilogue = 0;
8268   if (FirstCand.getMF()
8269           ->getInfo<AArch64FunctionInfo>()
8270           ->shouldSignReturnAddress(true)) {
8271     // One PAC and one AUT instructions
8272     NumBytesToCreateFrame += 8;
8273 
8274     // PAuth is enabled - set extra tail call cost, if any.
8275     auto LRCheckMethod = Subtarget.getAuthenticatedLRCheckMethod();
8276     NumBytesToCheckLRInTCEpilogue =
8277         AArch64PAuth::getCheckerSizeInBytes(LRCheckMethod);
8278     // Checking the authenticated LR value may significantly impact
8279     // SequenceSize, so account for it for more precise results.
8280     if (isTailCallReturnInst(RepeatedSequenceLocs[0].back()))
8281       SequenceSize += NumBytesToCheckLRInTCEpilogue;
8282 
8283     // We have to check if sp modifying instructions would get outlined.
8284     // If so we only allow outlining if sp is unchanged overall, so matching
8285     // sub and add instructions are okay to outline, all other sp modifications
8286     // are not
8287     auto hasIllegalSPModification = [&TRI](outliner::Candidate &C) {
8288       int SPValue = 0;
8289       for (auto &MI : C) {
8290         if (MI.modifiesRegister(AArch64::SP, &TRI)) {
8291           switch (MI.getOpcode()) {
8292           case AArch64::ADDXri:
8293           case AArch64::ADDWri:
8294             assert(MI.getNumOperands() == 4 && "Wrong number of operands");
8295             assert(MI.getOperand(2).isImm() &&
8296                    "Expected operand to be immediate");
8297             assert(MI.getOperand(1).isReg() &&
8298                    "Expected operand to be a register");
8299             // Check if the add just increments sp. If so, we search for
8300             // matching sub instructions that decrement sp. If not, the
8301             // modification is illegal
8302             if (MI.getOperand(1).getReg() == AArch64::SP)
8303               SPValue += MI.getOperand(2).getImm();
8304             else
8305               return true;
8306             break;
8307           case AArch64::SUBXri:
8308           case AArch64::SUBWri:
8309             assert(MI.getNumOperands() == 4 && "Wrong number of operands");
8310             assert(MI.getOperand(2).isImm() &&
8311                    "Expected operand to be immediate");
8312             assert(MI.getOperand(1).isReg() &&
8313                    "Expected operand to be a register");
8314             // Check if the sub just decrements sp. If so, we search for
8315             // matching add instructions that increment sp. If not, the
8316             // modification is illegal
8317             if (MI.getOperand(1).getReg() == AArch64::SP)
8318               SPValue -= MI.getOperand(2).getImm();
8319             else
8320               return true;
8321             break;
8322           default:
8323             return true;
8324           }
8325         }
8326       }
8327       if (SPValue)
8328         return true;
8329       return false;
8330     };
8331     // Remove candidates with illegal stack modifying instructions
8332     llvm::erase_if(RepeatedSequenceLocs, hasIllegalSPModification);
8333 
8334     // If the sequence doesn't have enough candidates left, then we're done.
8335     if (RepeatedSequenceLocs.size() < 2)
8336       return std::nullopt;
8337   }
8338 
8339   // Properties about candidate MBBs that hold for all of them.
8340   unsigned FlagsSetInAll = 0xF;
8341 
8342   // Compute liveness information for each candidate, and set FlagsSetInAll.
8343   for (outliner::Candidate &C : RepeatedSequenceLocs)
8344     FlagsSetInAll &= C.Flags;
8345 
8346   unsigned LastInstrOpcode = RepeatedSequenceLocs[0].back().getOpcode();
8347 
8348   // Helper lambda which sets call information for every candidate.
8349   auto SetCandidateCallInfo =
8350       [&RepeatedSequenceLocs](unsigned CallID, unsigned NumBytesForCall) {
8351         for (outliner::Candidate &C : RepeatedSequenceLocs)
8352           C.setCallInfo(CallID, NumBytesForCall);
8353       };
8354 
8355   unsigned FrameID = MachineOutlinerDefault;
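  // A default outlined frame ends in a RET, so start by accounting for those
  // 4 bytes; the tail-call and thunk cases below overwrite this value.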
8356   NumBytesToCreateFrame += 4;
8357 
8358   bool HasBTI = any_of(RepeatedSequenceLocs, [](outliner::Candidate &C) {
8359     return C.getMF()->getInfo<AArch64FunctionInfo>()->branchTargetEnforcement();
8360   });
8361 
8362   // We check to see if CFI Instructions are present, and if they are
8363   // we find the number of CFI Instructions in the candidates.
8364   unsigned CFICount = 0;
8365   for (auto &I : RepeatedSequenceLocs[0]) {
8366     if (I.isCFIInstruction())
8367       CFICount++;
8368   }
8369 
8370   // We compare the number of found CFI Instructions to the number of CFI
8371   // instructions in the parent function for each candidate.  We must check this
8372   // since if we outline one of the CFI instructions in a function, we have to
8373   // outline them all for correctness. If we do not, the address offsets will be
8374   // incorrect between the two sections of the program.
8375   for (outliner::Candidate &C : RepeatedSequenceLocs) {
8376     std::vector<MCCFIInstruction> CFIInstructions =
8377         C.getMF()->getFrameInstructions();
8378 
8379     if (CFICount > 0 && CFICount != CFIInstructions.size())
8380       return std::nullopt;
8381   }
8382 
8383   // Returns true if an instruction is safe to fix up, false otherwise.
8384   auto IsSafeToFixup = [this, &TRI](MachineInstr &MI) {
8385     if (MI.isCall())
8386       return true;
8387 
8388     if (!MI.modifiesRegister(AArch64::SP, &TRI) &&
8389         !MI.readsRegister(AArch64::SP, &TRI))
8390       return true;
8391 
8392     // Any modification of SP will break our code to save/restore LR.
8393     // FIXME: We could handle some instructions which add a constant
8394     // offset to SP, with a bit more work.
8395     if (MI.modifiesRegister(AArch64::SP, &TRI))
8396       return false;
8397 
8398     // At this point, we have a stack instruction that we might need to
8399     // fix up. We'll handle it if it's a load or store.
8400     if (MI.mayLoadOrStore()) {
8401       const MachineOperand *Base; // Filled with the base operand of MI.
8402       int64_t Offset;             // Filled with the offset of MI.
8403       bool OffsetIsScalable;
8404 
8405       // Does it allow us to offset the base operand and is the base the
8406       // register SP?
8407       if (!getMemOperandWithOffset(MI, Base, Offset, OffsetIsScalable, &TRI) ||
8408           !Base->isReg() || Base->getReg() != AArch64::SP)
8409         return false;
8410 
8411       // Fix-up code below assumes bytes.
8412       if (OffsetIsScalable)
8413         return false;
8414 
8415       // Find the minimum/maximum offset for this instruction and check
8416       // if fixing it up would be in range.
8417       int64_t MinOffset,
8418           MaxOffset;  // Unscaled offsets for the instruction.
8419       // The scale to multiply the offsets by.
8420       TypeSize Scale(0U, false), DummyWidth(0U, false);
8421       getMemOpInfo(MI.getOpcode(), Scale, DummyWidth, MinOffset, MaxOffset);
8422 
8423       Offset += 16; // Update the offset to what it would be if we outlined.
8424       if (Offset < MinOffset * (int64_t)Scale.getFixedValue() ||
8425           Offset > MaxOffset * (int64_t)Scale.getFixedValue())
8426         return false;
8427 
8428       // It's in range, so we can outline it.
8429       return true;
8430     }
8431 
8432     // FIXME: Add handling for instructions like "add x0, sp, #8".
8433 
8434     // We can't fix it up, so don't outline it.
8435     return false;
8436   };
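  // For illustration: with the default frame, LR is spilled with a 16-byte
  // pre-decrement of SP, so an outlined "ldr x0, [sp, #8]" must be rewritten
  // as "ldr x0, [sp, #24]". IsSafeToFixup checks that the adjusted offset
  // still fits the instruction's immediate range.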
8437 
8438   // True if it's possible to fix up each stack instruction in this sequence.
8439   // Important for frames/call variants that modify the stack.
8440   bool AllStackInstrsSafe = llvm::all_of(FirstCand, IsSafeToFixup);
8441 
8442   // If the last instruction in any candidate is a terminator, then we should
8443   // tail call all of the candidates.
8444   if (RepeatedSequenceLocs[0].back().isTerminator()) {
8445     FrameID = MachineOutlinerTailCall;
8446     NumBytesToCreateFrame = 0;
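    // A tail call is lowered to a single B (4 bytes), plus any code needed to
    // check the authenticated LR value before the branch when PAuth is on.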
8447     unsigned NumBytesForCall = 4 + NumBytesToCheckLRInTCEpilogue;
8448     SetCandidateCallInfo(MachineOutlinerTailCall, NumBytesForCall);
8449   }
8450 
8451   else if (LastInstrOpcode == AArch64::BL ||
8452            ((LastInstrOpcode == AArch64::BLR ||
8453              LastInstrOpcode == AArch64::BLRNoIP) &&
8454             !HasBTI)) {
8455     // FIXME: Do we need to check if the code after this uses the value of LR?
8456     FrameID = MachineOutlinerThunk;
8457     NumBytesToCreateFrame = NumBytesToCheckLRInTCEpilogue;
8458     SetCandidateCallInfo(MachineOutlinerThunk, 4);
8459   }
8460 
8461   else {
8462     // We need to decide how to emit calls + frames. We can always emit the same
8463     // frame if we don't need to save to the stack. If we have to save to the
8464     // stack, then we need a different frame.
8465     unsigned NumBytesNoStackCalls = 0;
8466     std::vector<outliner::Candidate> CandidatesWithoutStackFixups;
8467 
8468     // Check if we have to save LR.
8469     for (outliner::Candidate &C : RepeatedSequenceLocs) {
8470       bool LRAvailable =
8471           (C.Flags & MachineOutlinerMBBFlags::LRUnavailableSomewhere)
8472               ? C.isAvailableAcrossAndOutOfSeq(AArch64::LR, TRI)
8473               : true;
8474       // If we have a noreturn caller, then we're going to be conservative and
8475       // say that we have to save LR. If we don't have a ret at the end of the
8476       // block, then we can't reason about liveness accurately.
8477       //
8478       // FIXME: We can probably do better than always disabling this in
8479       // noreturn functions by fixing up the liveness info.
8480       bool IsNoReturn =
8481           C.getMF()->getFunction().hasFnAttribute(Attribute::NoReturn);
8482 
8483       // Is LR available? If so, we don't need a save.
8484       if (LRAvailable && !IsNoReturn) {
8485         NumBytesNoStackCalls += 4;
8486         C.setCallInfo(MachineOutlinerNoLRSave, 4);
8487         CandidatesWithoutStackFixups.push_back(C);
8488       }
8489 
8490       // Is an unused register available? If so, we won't modify the stack, so
8491       // we can outline with the same frame type as those that don't save LR.
8492       else if (findRegisterToSaveLRTo(C)) {
8493         NumBytesNoStackCalls += 12;
8494         C.setCallInfo(MachineOutlinerRegSave, 12);
8495         CandidatesWithoutStackFixups.push_back(C);
8496       }
8497 
8498       // Is SP used in the sequence at all? If not, we don't have to modify
8499       // the stack, so we are guaranteed to get the same frame.
8500       else if (C.isAvailableInsideSeq(AArch64::SP, TRI)) {
8501         NumBytesNoStackCalls += 12;
8502         C.setCallInfo(MachineOutlinerDefault, 12);
8503         CandidatesWithoutStackFixups.push_back(C);
8504       }
8505 
8506       // If we outline this, we need to modify the stack. Pretend we don't
8507       // outline this by saving all of its bytes.
8508       else {
8509         NumBytesNoStackCalls += SequenceSize;
8510       }
8511     }
8512 
8513     // If there are no places where we have to save LR, then note that we
8514     // don't have to update the stack. Otherwise, give every candidate the
8515     // default call type, as long as it's safe to do so.
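    // Each default call costs 12 bytes (save LR, BL, restore LR), so prefer
    // the no-stack-fixup variants whenever they are no more expensive in
    // total than giving every candidate the default treatment.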
8516     if (!AllStackInstrsSafe ||
8517         NumBytesNoStackCalls <= RepeatedSequenceLocs.size() * 12) {
8518       RepeatedSequenceLocs = CandidatesWithoutStackFixups;
8519       FrameID = MachineOutlinerNoLRSave;
8520     } else {
8521       SetCandidateCallInfo(MachineOutlinerDefault, 12);
8522 
8523       // Bugzilla ID: 46767
8524       // TODO: Check if fixing up the stack more than once is safe so we can
8525       // outline these.
8526       //
8527       // An outline resulting in a caller that requires stack fixups at the
8528       // callsite to a callee that also requires stack fixups can happen when
8529       // there are no available registers at the candidate callsite for a
8530       // candidate that itself also has calls.
8531       //
8532       // In other words if function_containing_sequence in the following pseudo
8533       // assembly requires that we save LR at the point of the call, but there
8534       // are no available registers: in this case we save using SP and as a
8535       //   result the SP offsets require stack fixups by multiples of 16.
8536       //
8537       // function_containing_sequence:
8538       //   ...
8539       //   save LR to SP <- Requires stack instr fixups in OUTLINED_FUNCTION_N
8540       //   call OUTLINED_FUNCTION_N
8541       //   restore LR from SP
8542       //   ...
8543       //
8544       // OUTLINED_FUNCTION_N:
8545       //   save LR to SP <- Requires stack instr fixups in OUTLINED_FUNCTION_N
8546       //   ...
8547       //   bl foo
8548       //   restore LR from SP
8549       //   ret
8550       //
8551       // Because the code to handle more than one stack fixup does not
8552       // currently have the proper checks for legality, these cases will assert
8553       // in the AArch64 MachineOutliner. This is because the code to do this
8554       // needs more hardening, testing, better checks that generated code is
8555       // legal, etc and because it is only verified to handle a single pass of
8556       // stack fixup.
8557       //
8558       // The assert happens in AArch64InstrInfo::buildOutlinedFrame to catch
8559       // these cases until they are known to be handled. Bugzilla 46767 is
8560       // referenced in comments at the assert site.
8561       //
8562       // To avoid asserting (or generating non-legal code on noassert builds)
8563       // we remove all candidates which would need more than one stack fixup by
8564       // pruning the cases where the candidate has calls while also having no
8565       // available LR and having no available general purpose registers to copy
8566       // LR to (i.e. one extra stack save/restore).
8567       //
8568       if (FlagsSetInAll & MachineOutlinerMBBFlags::HasCalls) {
8569         erase_if(RepeatedSequenceLocs, [this, &TRI](outliner::Candidate &C) {
8570           auto IsCall = [](const MachineInstr &MI) { return MI.isCall(); };
8571           return (llvm::any_of(C, IsCall)) &&
8572                  (!C.isAvailableAcrossAndOutOfSeq(AArch64::LR, TRI) ||
8573                   !findRegisterToSaveLRTo(C));
8574         });
8575       }
8576     }
8577 
8578     // If we dropped all of the candidates, bail out here.
8579     if (RepeatedSequenceLocs.size() < 2) {
8580       RepeatedSequenceLocs.clear();
8581       return std::nullopt;
8582     }
8583   }
8584 
8585   // Does every candidate's MBB contain a call? If so, then we might have a call
8586   // in the range.
8587   if (FlagsSetInAll & MachineOutlinerMBBFlags::HasCalls) {
8588     // Check if the range contains a call. These require a save + restore of the
8589     // link register.
8590     bool ModStackToSaveLR = false;
8591     if (std::any_of(FirstCand.begin(), std::prev(FirstCand.end()),
8592                     [](const MachineInstr &MI) { return MI.isCall(); }))
8593       ModStackToSaveLR = true;
8594 
8595     // Handle the last instruction separately. If this is a tail call, then the
8596     // last instruction is a call. We don't want to save + restore in this case.
8597     // However, it could be possible that the last instruction is a call without
8598     // it being valid to tail call this sequence. We should consider this as
8599     // well.
8600     else if (FrameID != MachineOutlinerThunk &&
8601              FrameID != MachineOutlinerTailCall && FirstCand.back().isCall())
8602       ModStackToSaveLR = true;
8603 
8604     if (ModStackToSaveLR) {
8605       // We can't fix up the stack. Bail out.
8606       if (!AllStackInstrsSafe) {
8607         RepeatedSequenceLocs.clear();
8608         return std::nullopt;
8609       }
8610 
8611       // Save + restore LR.
8612       NumBytesToCreateFrame += 8;
8613     }
8614   }
8615 
8616   // If we have CFI instructions, we can only outline if the outlined section
8617   // can be a tail call
8618   if (FrameID != MachineOutlinerTailCall && CFICount > 0)
8619     return std::nullopt;
8620 
8621   return outliner::OutlinedFunction(RepeatedSequenceLocs, SequenceSize,
8622                                     NumBytesToCreateFrame, FrameID);
8623 }
8624 
8625 void AArch64InstrInfo::mergeOutliningCandidateAttributes(
8626     Function &F, std::vector<outliner::Candidate> &Candidates) const {
8627   // If a bunch of candidates reach this point they must agree on their return
8628   // address signing. It is therefore enough to just consider the signing
8629   // behaviour of one of them
8630   const auto &CFn = Candidates.front().getMF()->getFunction();
8631 
8632   // Since all candidates belong to the same module, just copy the
8633   // function-level attributes of an arbitrary function.
8634   if (CFn.hasFnAttribute("sign-return-address"))
8635     F.addFnAttr(CFn.getFnAttribute("sign-return-address"));
8636   if (CFn.hasFnAttribute("sign-return-address-key"))
8637     F.addFnAttr(CFn.getFnAttribute("sign-return-address-key"));
8638 
8639   AArch64GenInstrInfo::mergeOutliningCandidateAttributes(F, Candidates);
8640 }
8641 
8642 bool AArch64InstrInfo::isFunctionSafeToOutlineFrom(
8643     MachineFunction &MF, bool OutlineFromLinkOnceODRs) const {
8644   const Function &F = MF.getFunction();
8645 
8646   // Can F be deduplicated by the linker? If it can, don't outline from it.
8647   if (!OutlineFromLinkOnceODRs && F.hasLinkOnceODRLinkage())
8648     return false;
8649 
8650   // Don't outline from functions with section markings; the program could
8651   // expect that all the code is in the named section.
8652   // FIXME: Allow outlining from multiple functions with the same section
8653   // marking.
8654   if (F.hasSection())
8655     return false;
8656 
8657   // Outlining from functions with redzones is unsafe since the outliner may
8658   // modify the stack. Check if hasRedZone is true or unknown; if yes, don't
8659   // outline from it.
8660   AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
8661   if (!AFI || AFI->hasRedZone().value_or(true))
8662     return false;
8663 
8664   // FIXME: Teach the outliner to generate/handle Windows unwind info.
8665   if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI())
8666     return false;
8667 
8668   // It's safe to outline from MF.
8669   return true;
8670 }
8671 
8672 SmallVector<std::pair<MachineBasicBlock::iterator, MachineBasicBlock::iterator>>
8673 AArch64InstrInfo::getOutlinableRanges(MachineBasicBlock &MBB,
8674                                       unsigned &Flags) const {
8675   assert(MBB.getParent()->getRegInfo().tracksLiveness() &&
8676          "Must track liveness!");
8677   SmallVector<
8678       std::pair<MachineBasicBlock::iterator, MachineBasicBlock::iterator>>
8679       Ranges;
8680   // According to the AArch64 Procedure Call Standard, the following are
8681   // undefined on entry/exit from a function call:
8682   //
8683   // * Registers x16, x17, (and thus w16, w17)
8684   // * Condition codes (and thus the NZCV register)
8685   //
8686   // If any of these registers are used inside or live across an outlined
8687   // function, then they may be modified later, either by the compiler or
8688   // some other tool (like the linker).
8689   //
8690   // To avoid outlining in these situations, partition each block into ranges
8691   // where these registers are dead. We will only outline from those ranges.
8692   LiveRegUnits LRU(getRegisterInfo());
8693   auto AreAllUnsafeRegsDead = [&LRU]() {
8694     return LRU.available(AArch64::W16) && LRU.available(AArch64::W17) &&
8695            LRU.available(AArch64::NZCV);
8696   };
8697 
8698   // We need to know if LR is live across an outlining boundary later on in
8699   // order to decide how we'll create the outlined call, frame, etc.
8700   //
8701   // It's pretty expensive to check this for *every candidate* within a block.
8702   // That's some potentially n^2 behaviour, since in the worst case, we'd need
8703   // to compute liveness from the end of the block for O(n) candidates within
8704   // the block.
8705   //
8706   // So, to improve the average case, let's keep track of liveness from the end
8707   // of the block to the beginning of *every outlinable range*. If we know that
8708   // LR is available in every range we could outline from, then we know that
8709   // we don't need to check liveness for any candidate within that range.
8710   bool LRAvailableEverywhere = true;
8711   // Compute liveness bottom-up.
8712   LRU.addLiveOuts(MBB);
8713   // Update flags that require info about the entire MBB.
8714   auto UpdateWholeMBBFlags = [&Flags](const MachineInstr &MI) {
8715     if (MI.isCall() && !MI.isTerminator())
8716       Flags |= MachineOutlinerMBBFlags::HasCalls;
8717   };
8718   // Range: [RangeBegin, RangeEnd)
8719   MachineBasicBlock::instr_iterator RangeBegin, RangeEnd;
8720   unsigned RangeLen;
8721   auto CreateNewRangeStartingAt =
8722       [&RangeBegin, &RangeEnd,
8723        &RangeLen](MachineBasicBlock::instr_iterator NewBegin) {
8724         RangeBegin = NewBegin;
8725         RangeEnd = std::next(RangeBegin);
8726         RangeLen = 0;
8727       };
8728   auto SaveRangeIfNonEmpty = [&RangeLen, &Ranges, &RangeBegin, &RangeEnd]() {
8729     // At least one unsafe register is not dead. We do not want to outline at
8730     // this point. If it is long enough to outline from, save the range
8731     // [RangeBegin, RangeEnd).
8732     if (RangeLen > 1)
8733       Ranges.push_back(std::make_pair(RangeBegin, RangeEnd));
8734   };
8735   // Find the first point where all unsafe registers are dead.
8736   // FIND: <safe instr> <-- end of first potential range
8737   // SKIP: <unsafe def>
8738   // SKIP: ... everything between ...
8739   // SKIP: <unsafe use>
8740   auto FirstPossibleEndPt = MBB.instr_rbegin();
8741   for (; FirstPossibleEndPt != MBB.instr_rend(); ++FirstPossibleEndPt) {
8742     LRU.stepBackward(*FirstPossibleEndPt);
8743     // Update flags that impact how we outline across the entire block,
8744     // regardless of safety.
8745     UpdateWholeMBBFlags(*FirstPossibleEndPt);
8746     if (AreAllUnsafeRegsDead())
8747       break;
8748   }
8749   // If we exhausted the entire block, we have no safe ranges to outline.
8750   if (FirstPossibleEndPt == MBB.instr_rend())
8751     return Ranges;
8752   // Current range.
8753   CreateNewRangeStartingAt(FirstPossibleEndPt->getIterator());
8754   // FirstPossibleEndPt points to the first place where all unsafe registers
8755   // are dead (if there is any such point). Begin partitioning the MBB into
8756   // ranges.
8757   for (auto &MI : make_range(FirstPossibleEndPt, MBB.instr_rend())) {
8758     LRU.stepBackward(MI);
8759     UpdateWholeMBBFlags(MI);
8760     if (!AreAllUnsafeRegsDead()) {
8761       SaveRangeIfNonEmpty();
8762       CreateNewRangeStartingAt(MI.getIterator());
8763       continue;
8764     }
8765     LRAvailableEverywhere &= LRU.available(AArch64::LR);
8766     RangeBegin = MI.getIterator();
8767     ++RangeLen;
8768   }
8769   // Above loop misses the last (or only) range. If we are still safe, then
8770   // let's save the range.
8771   if (AreAllUnsafeRegsDead())
8772     SaveRangeIfNonEmpty();
8773   if (Ranges.empty())
8774     return Ranges;
8775   // We found the ranges bottom-up, but the mapping expects them top-down.
8776   // Reverse the order.
8777   std::reverse(Ranges.begin(), Ranges.end());
8778   // If there is at least one outlinable range where LR is unavailable
8779   // somewhere, remember that.
8780   if (!LRAvailableEverywhere)
8781     Flags |= MachineOutlinerMBBFlags::LRUnavailableSomewhere;
8782   return Ranges;
8783 }
8784 
8785 outliner::InstrType
8786 AArch64InstrInfo::getOutliningTypeImpl(MachineBasicBlock::iterator &MIT,
8787                                    unsigned Flags) const {
8788   MachineInstr &MI = *MIT;
8789   MachineBasicBlock *MBB = MI.getParent();
8790   MachineFunction *MF = MBB->getParent();
8791   AArch64FunctionInfo *FuncInfo = MF->getInfo<AArch64FunctionInfo>();
8792 
8793   // Don't outline anything used for return address signing. The outlined
8794   // function will get signed later if needed
8795   switch (MI.getOpcode()) {
8796   case AArch64::PACM:
8797   case AArch64::PACIASP:
8798   case AArch64::PACIBSP:
8799   case AArch64::PACIASPPC:
8800   case AArch64::PACIBSPPC:
8801   case AArch64::AUTIASP:
8802   case AArch64::AUTIBSP:
8803   case AArch64::AUTIASPPCi:
8804   case AArch64::AUTIASPPCr:
8805   case AArch64::AUTIBSPPCi:
8806   case AArch64::AUTIBSPPCr:
8807   case AArch64::RETAA:
8808   case AArch64::RETAB:
8809   case AArch64::RETAASPPCi:
8810   case AArch64::RETAASPPCr:
8811   case AArch64::RETABSPPCi:
8812   case AArch64::RETABSPPCr:
8813   case AArch64::EMITBKEY:
8814   case AArch64::PAUTH_PROLOGUE:
8815   case AArch64::PAUTH_EPILOGUE:
8816     return outliner::InstrType::Illegal;
8817   }
8818 
8819   // Don't outline LOHs.
8820   if (FuncInfo->getLOHRelated().count(&MI))
8821     return outliner::InstrType::Illegal;
8822 
8823   // We can only outline these if we will tail call the outlined function, or
8824   // fix up the CFI offsets. Currently, CFI instructions are outlined only if
8825   // in a tail call.
8826   //
8827   // FIXME: If the proper fixups for the offset are implemented, this should be
8828   // possible.
8829   if (MI.isCFIInstruction())
8830     return outliner::InstrType::Legal;
8831 
8832   // Is this a terminator for a basic block?
8833   if (MI.isTerminator())
8834     // TargetInstrInfo::getOutliningType has already filtered out anything
8835     // that would break this, so we can allow it here.
8836     return outliner::InstrType::Legal;
8837 
8838   // Make sure none of the operands are un-outlinable.
8839   for (const MachineOperand &MOP : MI.operands()) {
8840     // A check preventing CFI indices was here before, but only CFI
8841     // instructions should have those.
8842     assert(!MOP.isCFIIndex());
8843 
8844     // If it uses LR or W30 explicitly, then don't touch it.
8845     if (MOP.isReg() && !MOP.isImplicit() &&
8846         (MOP.getReg() == AArch64::LR || MOP.getReg() == AArch64::W30))
8847       return outliner::InstrType::Illegal;
8848   }
8849 
8850   // Special cases for instructions that can always be outlined, but will fail
8851   // the later tests. E.g. ADRPs, which are PC-relative, but can always
8852   // be outlined because they don't require a *specific* value to be in LR.
8853   if (MI.getOpcode() == AArch64::ADRP)
8854     return outliner::InstrType::Legal;
8855 
8856   // If MI is a call we might be able to outline it. We don't want to outline
8857   // any calls that rely on the position of items on the stack. When we outline
8858   // something containing a call, we have to emit a save and restore of LR in
8859   // the outlined function. Currently, this always happens by saving LR to the
8860   // stack. Thus, if we outline, say, half the parameters for a function call
8861   // plus the call, then we'll break the callee's expectations for the layout
8862   // of the stack.
8863   //
8864   // FIXME: Allow calls to functions which construct a stack frame, as long
8865   // as they don't access arguments on the stack.
8866   // FIXME: Figure out some way to analyze functions defined in other modules.
8867   // We should be able to compute the memory usage based on the IR calling
8868   // convention, even if we can't see the definition.
8869   if (MI.isCall()) {
8870     // Get the function associated with the call. Look at each operand and find
8871     // the one that represents the callee and get its name.
8872     const Function *Callee = nullptr;
8873     for (const MachineOperand &MOP : MI.operands()) {
8874       if (MOP.isGlobal()) {
8875         Callee = dyn_cast<Function>(MOP.getGlobal());
8876         break;
8877       }
8878     }
8879 
8880     // Never outline calls to mcount.  There isn't any rule that would require
8881     // this, but the Linux kernel's "ftrace" feature depends on it.
8882     if (Callee && Callee->getName() == "\01_mcount")
8883       return outliner::InstrType::Illegal;
8884 
8885     // If we don't know anything about the callee, assume it depends on the
8886     // stack layout of the caller. In that case, it's only legal to outline
8887     // as a tail-call. Explicitly list the call instructions we know about so we
8888     // don't get unexpected results with call pseudo-instructions.
8889     auto UnknownCallOutlineType = outliner::InstrType::Illegal;
8890     if (MI.getOpcode() == AArch64::BLR ||
8891         MI.getOpcode() == AArch64::BLRNoIP || MI.getOpcode() == AArch64::BL)
8892       UnknownCallOutlineType = outliner::InstrType::LegalTerminator;
8893 
8894     if (!Callee)
8895       return UnknownCallOutlineType;
8896 
8897     // We have a function we have information about. Check if it's something
8898     // we can safely outline.
8899     MachineFunction *CalleeMF = MF->getMMI().getMachineFunction(*Callee);
8900 
8901     // We don't know what's going on with the callee at all. Don't touch it.
8902     if (!CalleeMF)
8903       return UnknownCallOutlineType;
8904 
8905     // Check if we know anything about the callee saves on the function. If we
8906     // don't, then don't touch it, since that implies that we haven't
8907     // computed anything about its stack frame yet.
8908     MachineFrameInfo &MFI = CalleeMF->getFrameInfo();
8909     if (!MFI.isCalleeSavedInfoValid() || MFI.getStackSize() > 0 ||
8910         MFI.getNumObjects() > 0)
8911       return UnknownCallOutlineType;
8912 
8913     // At this point, we can say that CalleeMF ought to not pass anything on the
8914     // stack. Therefore, we can outline it.
8915     return outliner::InstrType::Legal;
8916   }
8917 
8918   // Don't touch the link register or W30.
8919   if (MI.readsRegister(AArch64::W30, &getRegisterInfo()) ||
8920       MI.modifiesRegister(AArch64::W30, &getRegisterInfo()))
8921     return outliner::InstrType::Illegal;
8922 
8923   // Don't outline BTI instructions, because that will prevent the outlining
8924   // site from being indirectly callable.
8925   if (hasBTISemantics(MI))
8926     return outliner::InstrType::Illegal;
8927 
8928   return outliner::InstrType::Legal;
8929 }
8930 
8931 void AArch64InstrInfo::fixupPostOutline(MachineBasicBlock &MBB) const {
8932   for (MachineInstr &MI : MBB) {
8933     const MachineOperand *Base;
8934     TypeSize Width(0, false);
8935     int64_t Offset;
8936     bool OffsetIsScalable;
8937 
8938     // Is this a load or store with an immediate offset with SP as the base?
8939     if (!MI.mayLoadOrStore() ||
8940         !getMemOperandWithOffsetWidth(MI, Base, Offset, OffsetIsScalable, Width,
8941                                       &RI) ||
8942         (Base->isReg() && Base->getReg() != AArch64::SP))
8943       continue;
8944 
8945     // It is, so we have to fix it up.
8946     TypeSize Scale(0U, false);
8947     int64_t Dummy1, Dummy2;
8948 
8949     MachineOperand &StackOffsetOperand = getMemOpBaseRegImmOfsOffsetOperand(MI);
8950     assert(StackOffsetOperand.isImm() && "Stack offset wasn't immediate!");
8951     getMemOpInfo(MI.getOpcode(), Scale, Width, Dummy1, Dummy2);
8952     assert(Scale != 0 && "Unexpected opcode!");
8953     assert(!OffsetIsScalable && "Expected offset to be a byte offset");
8954 
8955     // We've pushed the return address to the stack, so add 16 to the offset.
8956     // This is safe, since we already checked if it would overflow when we
8957     // checked if this instruction was legal to outline.
8958     int64_t NewImm = (Offset + 16) / (int64_t)Scale.getFixedValue();
8959     StackOffsetOperand.setImm(NewImm);
8960   }
8961 }
8962 
8963 static void signOutlinedFunction(MachineFunction &MF, MachineBasicBlock &MBB,
8964                                  const AArch64InstrInfo *TII,
8965                                  bool ShouldSignReturnAddr) {
8966   if (!ShouldSignReturnAddr)
8967     return;
8968 
8969   BuildMI(MBB, MBB.begin(), DebugLoc(), TII->get(AArch64::PAUTH_PROLOGUE))
8970       .setMIFlag(MachineInstr::FrameSetup);
8971   BuildMI(MBB, MBB.getFirstInstrTerminator(), DebugLoc(),
8972           TII->get(AArch64::PAUTH_EPILOGUE))
8973       .setMIFlag(MachineInstr::FrameDestroy);
8974 }
8975 
8976 void AArch64InstrInfo::buildOutlinedFrame(
8977     MachineBasicBlock &MBB, MachineFunction &MF,
8978     const outliner::OutlinedFunction &OF) const {
8979 
8980   AArch64FunctionInfo *FI = MF.getInfo<AArch64FunctionInfo>();
8981 
8982   if (OF.FrameConstructionID == MachineOutlinerTailCall)
8983     FI->setOutliningStyle("Tail Call");
8984   else if (OF.FrameConstructionID == MachineOutlinerThunk) {
8985     // For thunk outlining, rewrite the last instruction from a call to a
8986     // tail-call.
8987     MachineInstr *Call = &*--MBB.instr_end();
8988     unsigned TailOpcode;
8989     if (Call->getOpcode() == AArch64::BL) {
8990       TailOpcode = AArch64::TCRETURNdi;
8991     } else {
8992       assert(Call->getOpcode() == AArch64::BLR ||
8993              Call->getOpcode() == AArch64::BLRNoIP);
8994       TailOpcode = AArch64::TCRETURNriALL;
8995     }
8996     MachineInstr *TC = BuildMI(MF, DebugLoc(), get(TailOpcode))
8997                            .add(Call->getOperand(0))
8998                            .addImm(0);
8999     MBB.insert(MBB.end(), TC);
9000     Call->eraseFromParent();
9001 
9002     FI->setOutliningStyle("Thunk");
9003   }
9004 
9005   bool IsLeafFunction = true;
9006 
9007   // Is there a call in the outlined range?
9008   auto IsNonTailCall = [](const MachineInstr &MI) {
9009     return MI.isCall() && !MI.isReturn();
9010   };
9011 
9012   if (llvm::any_of(MBB.instrs(), IsNonTailCall)) {
9013     // Fix up the instructions in the range, since we're going to modify the
9014     // stack.
9015 
9016     // Bugzilla ID: 46767
9017     // TODO: Check if fixing up twice is safe so we can outline these.
9018     assert(OF.FrameConstructionID != MachineOutlinerDefault &&
9019            "Can only fix up stack references once");
9020     fixupPostOutline(MBB);
9021 
9022     IsLeafFunction = false;
9023 
9024     // LR has to be a live in so that we can save it.
9025     if (!MBB.isLiveIn(AArch64::LR))
9026       MBB.addLiveIn(AArch64::LR);
9027 
9028     MachineBasicBlock::iterator It = MBB.begin();
9029     MachineBasicBlock::iterator Et = MBB.end();
9030 
9031     if (OF.FrameConstructionID == MachineOutlinerTailCall ||
9032         OF.FrameConstructionID == MachineOutlinerThunk)
9033       Et = std::prev(MBB.end());
9034 
9035     // Insert a save before the outlined region
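    // (The STRXpre below is equivalent to "str x30, [sp, #-16]!".)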
9036     MachineInstr *STRXpre = BuildMI(MF, DebugLoc(), get(AArch64::STRXpre))
9037                                 .addReg(AArch64::SP, RegState::Define)
9038                                 .addReg(AArch64::LR)
9039                                 .addReg(AArch64::SP)
9040                                 .addImm(-16);
9041     It = MBB.insert(It, STRXpre);
9042 
9043     if (MF.getInfo<AArch64FunctionInfo>()->needsDwarfUnwindInfo(MF)) {
9044       const TargetSubtargetInfo &STI = MF.getSubtarget();
9045       const MCRegisterInfo *MRI = STI.getRegisterInfo();
9046       unsigned DwarfReg = MRI->getDwarfRegNum(AArch64::LR, true);
9047 
9048       // Add a CFI saying the stack was moved 16 B down.
9049       int64_t StackPosEntry =
9050           MF.addFrameInst(MCCFIInstruction::cfiDefCfaOffset(nullptr, 16));
9051       BuildMI(MBB, It, DebugLoc(), get(AArch64::CFI_INSTRUCTION))
9052           .addCFIIndex(StackPosEntry)
9053           .setMIFlags(MachineInstr::FrameSetup);
9054 
9055       // Add a CFI saying that the LR that we want to find is now 16 B higher
9056       // than before.
9057       int64_t LRPosEntry = MF.addFrameInst(
9058           MCCFIInstruction::createOffset(nullptr, DwarfReg, -16));
9059       BuildMI(MBB, It, DebugLoc(), get(AArch64::CFI_INSTRUCTION))
9060           .addCFIIndex(LRPosEntry)
9061           .setMIFlags(MachineInstr::FrameSetup);
9062     }
9063 
9064     // Insert a restore before the terminator for the function.
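    // (The LDRXpost below is equivalent to "ldr x30, [sp], #16".)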
9065     MachineInstr *LDRXpost = BuildMI(MF, DebugLoc(), get(AArch64::LDRXpost))
9066                                  .addReg(AArch64::SP, RegState::Define)
9067                                  .addReg(AArch64::LR, RegState::Define)
9068                                  .addReg(AArch64::SP)
9069                                  .addImm(16);
9070     Et = MBB.insert(Et, LDRXpost);
9071   }
9072 
9073   bool ShouldSignReturnAddr = FI->shouldSignReturnAddress(!IsLeafFunction);
9074 
9075   // If this is a tail call outlined function, then there's already a return.
9076   if (OF.FrameConstructionID == MachineOutlinerTailCall ||
9077       OF.FrameConstructionID == MachineOutlinerThunk) {
9078     signOutlinedFunction(MF, MBB, this, ShouldSignReturnAddr);
9079     return;
9080   }
9081 
9082   // It's not a tail call, so we have to insert the return ourselves.
9083 
9084   // LR has to be a live in so that we can return to it.
9085   if (!MBB.isLiveIn(AArch64::LR))
9086     MBB.addLiveIn(AArch64::LR);
9087 
9088   MachineInstr *ret = BuildMI(MF, DebugLoc(), get(AArch64::RET))
9089                           .addReg(AArch64::LR);
9090   MBB.insert(MBB.end(), ret);
9091 
9092   signOutlinedFunction(MF, MBB, this, ShouldSignReturnAddr);
9093 
9094   FI->setOutliningStyle("Function");
9095 
9096   // Did we have to modify the stack by saving the link register?
9097   if (OF.FrameConstructionID != MachineOutlinerDefault)
9098     return;
9099 
9100   // We modified the stack.
9101   // Walk over the basic block and fix up all the stack accesses.
9102   fixupPostOutline(MBB);
9103 }
9104 
9105 MachineBasicBlock::iterator AArch64InstrInfo::insertOutlinedCall(
9106     Module &M, MachineBasicBlock &MBB, MachineBasicBlock::iterator &It,
9107     MachineFunction &MF, outliner::Candidate &C) const {
9108 
9109   // Are we tail calling?
9110   if (C.CallConstructionID == MachineOutlinerTailCall) {
9111     // If yes, then we can just branch to the label.
9112     It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::TCRETURNdi))
9113                             .addGlobalAddress(M.getNamedValue(MF.getName()))
9114                             .addImm(0));
9115     return It;
9116   }
9117 
9118   // Are we saving the link register?
9119   if (C.CallConstructionID == MachineOutlinerNoLRSave ||
9120       C.CallConstructionID == MachineOutlinerThunk) {
9121     // No, so just insert the call.
9122     It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::BL))
9123                             .addGlobalAddress(M.getNamedValue(MF.getName())));
9124     return It;
9125   }
9126 
9127   // We want to return the spot where we inserted the call.
9128   MachineBasicBlock::iterator CallPt;
9129 
9130   // Instructions for saving and restoring LR around the call instruction we're
9131   // going to insert.
9132   MachineInstr *Save;
9133   MachineInstr *Restore;
9134   // Can we save to a register?
9135   if (C.CallConstructionID == MachineOutlinerRegSave) {
9136     // FIXME: This logic should be sunk into a target-specific interface so that
9137     // we don't have to recompute the register.
9138     Register Reg = findRegisterToSaveLRTo(C);
9139     assert(Reg && "No callee-saved register available?");
9140 
9141     // LR has to be a live in so that we can save it.
9142     if (!MBB.isLiveIn(AArch64::LR))
9143       MBB.addLiveIn(AArch64::LR);
9144 
9145     // Save and restore LR from Reg.
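    // ORRXrs with an XZR operand is the canonical register move, so this is
    // effectively "mov xN, x30" before the call and "mov x30, xN" after it.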
9146     Save = BuildMI(MF, DebugLoc(), get(AArch64::ORRXrs), Reg)
9147                .addReg(AArch64::XZR)
9148                .addReg(AArch64::LR)
9149                .addImm(0);
9150     Restore = BuildMI(MF, DebugLoc(), get(AArch64::ORRXrs), AArch64::LR)
9151                 .addReg(AArch64::XZR)
9152                 .addReg(Reg)
9153                 .addImm(0);
9154   } else {
9155     // We have the default case. Save and restore from SP.
9156     Save = BuildMI(MF, DebugLoc(), get(AArch64::STRXpre))
9157                .addReg(AArch64::SP, RegState::Define)
9158                .addReg(AArch64::LR)
9159                .addReg(AArch64::SP)
9160                .addImm(-16);
9161     Restore = BuildMI(MF, DebugLoc(), get(AArch64::LDRXpost))
9162                   .addReg(AArch64::SP, RegState::Define)
9163                   .addReg(AArch64::LR, RegState::Define)
9164                   .addReg(AArch64::SP)
9165                   .addImm(16);
9166   }
9167 
9168   It = MBB.insert(It, Save);
9169   It++;
9170 
9171   // Insert the call.
9172   It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::BL))
9173                           .addGlobalAddress(M.getNamedValue(MF.getName())));
9174   CallPt = It;
9175   It++;
9176 
9177   It = MBB.insert(It, Restore);
9178   return CallPt;
9179 }
9180 
9181 bool AArch64InstrInfo::shouldOutlineFromFunctionByDefault(
9182   MachineFunction &MF) const {
9183   return MF.getFunction().hasMinSize();
9184 }
9185 
9186 void AArch64InstrInfo::buildClearRegister(Register Reg, MachineBasicBlock &MBB,
9187                                           MachineBasicBlock::iterator Iter,
9188                                           DebugLoc &DL,
9189                                           bool AllowSideEffects) const {
9190   const MachineFunction &MF = *MBB.getParent();
9191   const AArch64Subtarget &STI = MF.getSubtarget<AArch64Subtarget>();
9192   const AArch64RegisterInfo &TRI = *STI.getRegisterInfo();
9193 
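  // Zero a GPR with a MOVZ; for vector registers, zero the full scalable
  // register with a DUP when SVE is available, otherwise zero the 128-bit
  // vector with a MOVI.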
9194   if (TRI.isGeneralPurposeRegister(MF, Reg)) {
9195     BuildMI(MBB, Iter, DL, get(AArch64::MOVZXi), Reg).addImm(0).addImm(0);
9196   } else if (STI.hasSVE()) {
9197     BuildMI(MBB, Iter, DL, get(AArch64::DUP_ZI_D), Reg)
9198       .addImm(0)
9199       .addImm(0);
9200   } else {
9201     BuildMI(MBB, Iter, DL, get(AArch64::MOVIv2d_ns), Reg)
9202       .addImm(0);
9203   }
9204 }
9205 
9206 std::optional<DestSourcePair>
9207 AArch64InstrInfo::isCopyInstrImpl(const MachineInstr &MI) const {
9208 
9209   // AArch64::ORRWrs and AArch64::ORRXrs with a WZR/XZR source register and a
9210   // zero shift immediate are used as aliases for the mov instruction.
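  // For example, "orr w0, wzr, w1" is the canonical encoding of "mov w0, w1".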
9211   if (MI.getOpcode() == AArch64::ORRWrs &&
9212       MI.getOperand(1).getReg() == AArch64::WZR &&
9213       MI.getOperand(3).getImm() == 0x0 &&
9214       // Check that the w->w move is not a zero-extending w->x mov.
9215       (!MI.getOperand(0).getReg().isVirtual() ||
9216        MI.getOperand(0).getSubReg() == 0) &&
9217       (!MI.getOperand(0).getReg().isPhysical() ||
9218        MI.findRegisterDefOperandIdx(MI.getOperand(0).getReg() - AArch64::W0 +
9219                                     AArch64::X0) == -1))
9220     return DestSourcePair{MI.getOperand(0), MI.getOperand(2)};
9221 
9222   if (MI.getOpcode() == AArch64::ORRXrs &&
9223       MI.getOperand(1).getReg() == AArch64::XZR &&
9224       MI.getOperand(3).getImm() == 0x0)
9225     return DestSourcePair{MI.getOperand(0), MI.getOperand(2)};
9226 
9227   return std::nullopt;
9228 }
9229 
9230 std::optional<DestSourcePair>
9231 AArch64InstrInfo::isCopyLikeInstrImpl(const MachineInstr &MI) const {
9232   if (MI.getOpcode() == AArch64::ORRWrs &&
9233       MI.getOperand(1).getReg() == AArch64::WZR &&
9234       MI.getOperand(3).getImm() == 0x0)
9235     return DestSourcePair{MI.getOperand(0), MI.getOperand(2)};
9236   return std::nullopt;
9237 }
9238 
9239 std::optional<RegImmPair>
9240 AArch64InstrInfo::isAddImmediate(const MachineInstr &MI, Register Reg) const {
9241   int Sign = 1;
9242   int64_t Offset = 0;
9243 
9244   // TODO: Handle cases where Reg is a super- or sub-register of the
9245   // destination register.
9246   const MachineOperand &Op0 = MI.getOperand(0);
9247   if (!Op0.isReg() || Reg != Op0.getReg())
9248     return std::nullopt;
9249 
9250   switch (MI.getOpcode()) {
9251   default:
9252     return std::nullopt;
9253   case AArch64::SUBWri:
9254   case AArch64::SUBXri:
9255   case AArch64::SUBSWri:
9256   case AArch64::SUBSXri:
9257     Sign *= -1;
9258     [[fallthrough]];
9259   case AArch64::ADDSWri:
9260   case AArch64::ADDSXri:
9261   case AArch64::ADDWri:
9262   case AArch64::ADDXri: {
9263     // TODO: Third operand can be global address (usually some string).
9264     if (!MI.getOperand(0).isReg() || !MI.getOperand(1).isReg() ||
9265         !MI.getOperand(2).isImm())
9266       return std::nullopt;
9267     int Shift = MI.getOperand(3).getImm();
9268     assert((Shift == 0 || Shift == 12) && "Shift can be either 0 or 12");
9269     Offset = Sign * (MI.getOperand(2).getImm() << Shift);
9270   }
9271   }
9272   return RegImmPair{MI.getOperand(1).getReg(), Offset};
9273 }
9274 
9275 /// If the given ORR instruction is a copy, and \p DescribedReg overlaps with
9276 /// the destination register then, if possible, describe the value in terms of
9277 /// the source register.
9278 static std::optional<ParamLoadedValue>
9279 describeORRLoadedValue(const MachineInstr &MI, Register DescribedReg,
9280                        const TargetInstrInfo *TII,
9281                        const TargetRegisterInfo *TRI) {
9282   auto DestSrc = TII->isCopyLikeInstr(MI);
9283   if (!DestSrc)
9284     return std::nullopt;
9285 
9286   Register DestReg = DestSrc->Destination->getReg();
9287   Register SrcReg = DestSrc->Source->getReg();
9288 
9289   auto Expr = DIExpression::get(MI.getMF()->getFunction().getContext(), {});
9290 
9291   // If the described register is the destination, just return the source.
9292   if (DestReg == DescribedReg)
9293     return ParamLoadedValue(MachineOperand::CreateReg(SrcReg, false), Expr);
9294 
9295   // ORRWrs zero-extends to 64-bits, so we need to consider such cases.
9296   if (MI.getOpcode() == AArch64::ORRWrs &&
9297       TRI->isSuperRegister(DestReg, DescribedReg))
9298     return ParamLoadedValue(MachineOperand::CreateReg(SrcReg, false), Expr);
9299 
9300   // We may need to describe the lower part of a ORRXrs move.
9301   if (MI.getOpcode() == AArch64::ORRXrs &&
9302       TRI->isSubRegister(DestReg, DescribedReg)) {
9303     Register SrcSubReg = TRI->getSubReg(SrcReg, AArch64::sub_32);
9304     return ParamLoadedValue(MachineOperand::CreateReg(SrcSubReg, false), Expr);
9305   }
9306 
9307   assert(!TRI->isSuperOrSubRegisterEq(DestReg, DescribedReg) &&
9308          "Unhandled ORR[XW]rs copy case");
9309 
9310   return std::nullopt;
9311 }
9312 
9313 bool AArch64InstrInfo::isFunctionSafeToSplit(const MachineFunction &MF) const {
9314   // Functions cannot be split to different sections on AArch64 if they have
9315   // a red zone. This is because relaxing a cross-section branch may require
9316   // incrementing the stack pointer to spill a register, which would overwrite
9317   // the red zone.
9318   if (MF.getInfo<AArch64FunctionInfo>()->hasRedZone().value_or(true))
9319     return false;
9320 
9321   return TargetInstrInfo::isFunctionSafeToSplit(MF);
9322 }
9323 
9324 bool AArch64InstrInfo::isMBBSafeToSplitToCold(
9325     const MachineBasicBlock &MBB) const {
9326   // Asm Goto blocks can contain conditional branches to goto labels, which can
9327   // get moved out of range of the branch instruction.
9328   auto isAsmGoto = [](const MachineInstr &MI) {
9329     return MI.getOpcode() == AArch64::INLINEASM_BR;
9330   };
9331   if (llvm::any_of(MBB, isAsmGoto) || MBB.isInlineAsmBrIndirectTarget())
9332     return false;
9333 
9334   // Because jump tables are label-relative instead of table-relative, they all
9335   // must be in the same section or relocation fixup handling will fail.
9336 
9337   // Check if MBB is a jump table target
9338   const MachineJumpTableInfo *MJTI = MBB.getParent()->getJumpTableInfo();
9339   auto containsMBB = [&MBB](const MachineJumpTableEntry &JTE) {
9340     return llvm::is_contained(JTE.MBBs, &MBB);
9341   };
9342   if (MJTI != nullptr && llvm::any_of(MJTI->getJumpTables(), containsMBB))
9343     return false;
9344 
9345   // Check if MBB contains a jump table lookup
9346   for (const MachineInstr &MI : MBB) {
9347     switch (MI.getOpcode()) {
9348     case TargetOpcode::G_BRJT:
9349     case AArch64::JumpTableDest32:
9350     case AArch64::JumpTableDest16:
9351     case AArch64::JumpTableDest8:
9352       return false;
9353     default:
9354       continue;
9355     }
9356   }
9357 
9358   // MBB isn't a special case, so it's safe to be split to the cold section.
9359   return true;
9360 }
9361 
9362 std::optional<ParamLoadedValue>
9363 AArch64InstrInfo::describeLoadedValue(const MachineInstr &MI,
9364                                       Register Reg) const {
9365   const MachineFunction *MF = MI.getMF();
9366   const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
9367   switch (MI.getOpcode()) {
9368   case AArch64::MOVZWi:
9369   case AArch64::MOVZXi: {
9370     // MOVZWi may be used for producing zero-extended 32-bit immediates in
9371     // 64-bit parameters, so we need to consider super-registers.
9372     if (!TRI->isSuperRegisterEq(MI.getOperand(0).getReg(), Reg))
9373       return std::nullopt;
9374 
9375     if (!MI.getOperand(1).isImm())
9376       return std::nullopt;
9377     int64_t Immediate = MI.getOperand(1).getImm();
9378     int Shift = MI.getOperand(2).getImm();
9379     return ParamLoadedValue(MachineOperand::CreateImm(Immediate << Shift),
9380                             nullptr);
9381   }
9382   case AArch64::ORRWrs:
9383   case AArch64::ORRXrs:
9384     return describeORRLoadedValue(MI, Reg, this, TRI);
9385   }
9386 
9387   return TargetInstrInfo::describeLoadedValue(MI, Reg);
9388 }
9389 
9390 bool AArch64InstrInfo::isExtendLikelyToBeFolded(
9391     MachineInstr &ExtMI, MachineRegisterInfo &MRI) const {
9392   assert(ExtMI.getOpcode() == TargetOpcode::G_SEXT ||
9393          ExtMI.getOpcode() == TargetOpcode::G_ZEXT ||
9394          ExtMI.getOpcode() == TargetOpcode::G_ANYEXT);
9395 
9396   // Anyexts are nops.
9397   if (ExtMI.getOpcode() == TargetOpcode::G_ANYEXT)
9398     return true;
9399 
9400   Register DefReg = ExtMI.getOperand(0).getReg();
9401   if (!MRI.hasOneNonDBGUse(DefReg))
9402     return false;
9403 
9404   // It's likely that a sext/zext as a G_PTR_ADD offset will be folded into an
9405   // addressing mode.
9406   auto *UserMI = &*MRI.use_instr_nodbg_begin(DefReg);
9407   return UserMI->getOpcode() == TargetOpcode::G_PTR_ADD;
9408 }
9409 
9410 uint64_t AArch64InstrInfo::getElementSizeForOpcode(unsigned Opc) const {
9411   return get(Opc).TSFlags & AArch64::ElementSizeMask;
9412 }
9413 
9414 bool AArch64InstrInfo::isPTestLikeOpcode(unsigned Opc) const {
9415   return get(Opc).TSFlags & AArch64::InstrFlagIsPTestLike;
9416 }
9417 
9418 bool AArch64InstrInfo::isWhileOpcode(unsigned Opc) const {
9419   return get(Opc).TSFlags & AArch64::InstrFlagIsWhile;
9420 }
9421 
9422 unsigned int
9423 AArch64InstrInfo::getTailDuplicateSize(CodeGenOptLevel OptLevel) const {
9424   return OptLevel >= CodeGenOptLevel::Aggressive ? 6 : 2;
9425 }
9426 
9427 bool AArch64InstrInfo::isLegalAddressingMode(unsigned NumBytes, int64_t Offset,
9428                                              unsigned Scale) const {
9429   if (Offset && Scale)
9430     return false;
9431 
9432   // Check Reg + Imm
9433   if (!Scale) {
9434     // 9-bit signed offset
9435     if (isInt<9>(Offset))
9436       return true;
9437 
9438     // 12-bit unsigned offset
9439     unsigned Shift = Log2_64(NumBytes);
9440     if (NumBytes && Offset > 0 && (Offset / NumBytes) <= (1LL << 12) - 1 &&
9441         // Must be a multiple of NumBytes (NumBytes is a power of 2)
9442         (Offset >> Shift) << Shift == Offset)
9443       return true;
9444     return false;
9445   }
9446 
9447   // Check reg1 + SIZE_IN_BYTES * reg2 and reg1 + reg2
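  // E.g. "ldr x0, [x1, x2]" corresponds to Scale == 1, while
  // "ldr x0, [x1, x2, lsl #3]" requires Scale == NumBytes == 8.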
9448   return Scale == 1 || (Scale > 0 && Scale == NumBytes);
9449 }
9450 
9451 unsigned llvm::getBLRCallOpcode(const MachineFunction &MF) {
  if (MF.getSubtarget<AArch64Subtarget>().hardenSlsBlr())
    return AArch64::BLRNoIP;
  else
    return AArch64::BLR;
}

bool AArch64InstrInfo::isReallyTriviallyReMaterializable(
    const MachineInstr &MI) const {
  const MachineFunction &MF = *MI.getMF();
  const AArch64FunctionInfo &AFI = *MF.getInfo<AArch64FunctionInfo>();

  // If the function contains changes to streaming mode, then there
  // is a danger that rematerialized instructions end up between
  // instruction sequences (e.g. call sequences, or prolog/epilogue)
  // where the streaming-SVE mode is temporarily changed.
  if (AFI.hasStreamingModeChanges()) {
    // Avoid rematerializing instructions that use/define scalable values,
    // such as 'pfalse' or 'ptrue', which produce different results when the
    // runtime vector length is different.
    const MachineRegisterInfo &MRI = MF.getRegInfo();
    const MachineFrameInfo &MFI = MF.getFrameInfo();
    if (any_of(MI.operands(), [&MRI, &MFI](const MachineOperand &MO) {
          if (MO.isFI() &&
              MFI.getStackID(MO.getIndex()) == TargetStackID::ScalableVector)
            return true;
          if (!MO.isReg())
            return false;

          if (MO.getReg().isVirtual()) {
            const TargetRegisterClass *RC = MRI.getRegClass(MO.getReg());
            return AArch64::ZPRRegClass.hasSubClassEq(RC) ||
                   AArch64::PPRRegClass.hasSubClassEq(RC);
          }
          return AArch64::ZPRRegClass.contains(MO.getReg()) ||
                 AArch64::PPRRegClass.contains(MO.getReg());
        }))
      return false;

    // Avoid rematerializing instructions that return a value that is
    // different depending on vector length, even when it is not returned
    // in a scalable vector/predicate register.
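    // For example (illustrative), CNTD_XPiI materializes the number of 64-bit
    // elements in a Z register; the streaming and non-streaming vector lengths
    // can differ, so re-executing it on the other side of an SMSTART/SMSTOP
    // would give a different value.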
    switch (MI.getOpcode()) {
    default:
      break;
    case AArch64::RDVLI_XI:
    case AArch64::ADDVL_XXI:
    case AArch64::ADDPL_XXI:
    case AArch64::CNTB_XPiI:
    case AArch64::CNTH_XPiI:
    case AArch64::CNTW_XPiI:
    case AArch64::CNTD_XPiI:
      return false;
    }
  }

  return TargetInstrInfo::isReallyTriviallyReMaterializable(MI);
}

MachineBasicBlock::iterator
AArch64InstrInfo::probedStackAlloc(MachineBasicBlock::iterator MBBI,
                                   Register TargetReg, bool FrameSetup) const {
  assert(TargetReg != AArch64::SP && "New top of stack cannot already be in SP");

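  // Overview of what is emitted below (sketch):
  //
  //   LoopTest:
  //     sub  sp, sp, #ProbeSize
  //     cmp  sp, TargetReg
  //     b.le LoopExit
  //   LoopBody:
  //     str  xzr, [sp]
  //     b    LoopTest
  //   LoopExit:
  //     mov  sp, TargetReg
  //     ldr  xzr, [sp]
  //
  // Each ProbeSize-sized step of the newly allocated region is touched before
  // SP finally moves down to TargetReg, with one trailing probe at the new SP.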
  MachineBasicBlock &MBB = *MBBI->getParent();
  MachineFunction &MF = *MBB.getParent();
  const AArch64InstrInfo *TII =
      MF.getSubtarget<AArch64Subtarget>().getInstrInfo();
  int64_t ProbeSize = MF.getInfo<AArch64FunctionInfo>()->getStackProbeSize();
  DebugLoc DL = MBB.findDebugLoc(MBBI);

  MachineFunction::iterator MBBInsertPoint = std::next(MBB.getIterator());
  MachineBasicBlock *LoopTestMBB =
      MF.CreateMachineBasicBlock(MBB.getBasicBlock());
  MF.insert(MBBInsertPoint, LoopTestMBB);
  MachineBasicBlock *LoopBodyMBB =
      MF.CreateMachineBasicBlock(MBB.getBasicBlock());
  MF.insert(MBBInsertPoint, LoopBodyMBB);
  MachineBasicBlock *ExitMBB = MF.CreateMachineBasicBlock(MBB.getBasicBlock());
  MF.insert(MBBInsertPoint, ExitMBB);
  MachineInstr::MIFlag Flags =
      FrameSetup ? MachineInstr::FrameSetup : MachineInstr::NoFlags;

  // LoopTest:
  //   SUB SP, SP, #ProbeSize
  emitFrameOffset(*LoopTestMBB, LoopTestMBB->end(), DL, AArch64::SP,
                  AArch64::SP, StackOffset::getFixed(-ProbeSize), TII, Flags);

  //   CMP SP, TargetReg
  BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL, TII->get(AArch64::SUBSXrx64),
          AArch64::XZR)
      .addReg(AArch64::SP)
      .addReg(TargetReg)
      .addImm(AArch64_AM::getArithExtendImm(AArch64_AM::UXTX, 0))
      .setMIFlags(Flags);

  //   B.<Cond> LoopExit
  BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL, TII->get(AArch64::Bcc))
      .addImm(AArch64CC::LE)
      .addMBB(ExitMBB)
      .setMIFlags(Flags);

  //   STR XZR, [SP]
  BuildMI(*LoopBodyMBB, LoopBodyMBB->end(), DL, TII->get(AArch64::STRXui))
      .addReg(AArch64::XZR)
      .addReg(AArch64::SP)
      .addImm(0)
      .setMIFlags(Flags);

  //   B loop
  BuildMI(*LoopBodyMBB, LoopBodyMBB->end(), DL, TII->get(AArch64::B))
      .addMBB(LoopTestMBB)
      .setMIFlags(Flags);

  // LoopExit:
  //   MOV SP, TargetReg
  BuildMI(*ExitMBB, ExitMBB->end(), DL, TII->get(AArch64::ADDXri), AArch64::SP)
      .addReg(TargetReg)
      .addImm(0)
      .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0))
      .setMIFlags(Flags);

  //   LDR XZR, [SP]
  BuildMI(*ExitMBB, ExitMBB->end(), DL, TII->get(AArch64::LDRXui))
      .addReg(AArch64::XZR, RegState::Define)
      .addReg(AArch64::SP)
      .addImm(0)
      .setMIFlags(Flags);

  ExitMBB->splice(ExitMBB->end(), &MBB, std::next(MBBI), MBB.end());
  ExitMBB->transferSuccessorsAndUpdatePHIs(&MBB);

  LoopTestMBB->addSuccessor(ExitMBB);
  LoopTestMBB->addSuccessor(LoopBodyMBB);
  LoopBodyMBB->addSuccessor(LoopTestMBB);
  MBB.addSuccessor(LoopTestMBB);

  // Update liveins: iterate until the live-in sets converge, since the three
  // new blocks form a cycle.
  if (MF.getRegInfo().reservedRegsFrozen()) {
    bool anyChange = false;
    do {
      anyChange = recomputeLiveIns(*ExitMBB) ||
                  recomputeLiveIns(*LoopBodyMBB) ||
                  recomputeLiveIns(*LoopTestMBB);
    } while (anyChange);
  }

  return ExitMBB->begin();
}

#define GET_INSTRINFO_HELPERS
#define GET_INSTRMAP_INFO
#include "AArch64GenInstrInfo.inc"