1 //===- AArch64InstrInfo.cpp - AArch64 Instruction Information -------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file contains the AArch64 implementation of the TargetInstrInfo class.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "AArch64InstrInfo.h"
14 #include "AArch64MachineFunctionInfo.h"
15 #include "AArch64Subtarget.h"
16 #include "MCTargetDesc/AArch64AddressingModes.h"
17 #include "Utils/AArch64BaseInfo.h"
18 #include "llvm/ADT/ArrayRef.h"
19 #include "llvm/ADT/STLExtras.h"
20 #include "llvm/ADT/SmallVector.h"
21 #include "llvm/CodeGen/MachineBasicBlock.h"
22 #include "llvm/CodeGen/MachineFrameInfo.h"
23 #include "llvm/CodeGen/MachineFunction.h"
24 #include "llvm/CodeGen/MachineInstr.h"
25 #include "llvm/CodeGen/MachineInstrBuilder.h"
26 #include "llvm/CodeGen/MachineMemOperand.h"
27 #include "llvm/CodeGen/MachineModuleInfo.h"
28 #include "llvm/CodeGen/MachineOperand.h"
29 #include "llvm/CodeGen/MachineRegisterInfo.h"
30 #include "llvm/CodeGen/StackMaps.h"
31 #include "llvm/CodeGen/TargetRegisterInfo.h"
32 #include "llvm/CodeGen/TargetSubtargetInfo.h"
33 #include "llvm/IR/DebugInfoMetadata.h"
34 #include "llvm/IR/DebugLoc.h"
35 #include "llvm/IR/GlobalValue.h"
36 #include "llvm/MC/MCAsmInfo.h"
37 #include "llvm/MC/MCInst.h"
38 #include "llvm/MC/MCInstrDesc.h"
39 #include "llvm/Support/Casting.h"
40 #include "llvm/Support/CodeGen.h"
41 #include "llvm/Support/CommandLine.h"
42 #include "llvm/Support/Compiler.h"
43 #include "llvm/Support/ErrorHandling.h"
44 #include "llvm/Support/MathExtras.h"
45 #include "llvm/Target/TargetMachine.h"
46 #include "llvm/Target/TargetOptions.h"
47 #include <cassert>
48 #include <cstdint>
49 #include <iterator>
50 #include <utility>
51 
52 using namespace llvm;
53 
54 #define GET_INSTRINFO_CTOR_DTOR
55 #include "AArch64GenInstrInfo.inc"
56 
57 static cl::opt<unsigned> TBZDisplacementBits(
58     "aarch64-tbz-offset-bits", cl::Hidden, cl::init(14),
59     cl::desc("Restrict range of TB[N]Z instructions (DEBUG)"));
60 
61 static cl::opt<unsigned> CBZDisplacementBits(
62     "aarch64-cbz-offset-bits", cl::Hidden, cl::init(19),
63     cl::desc("Restrict range of CB[N]Z instructions (DEBUG)"));
64 
65 static cl::opt<unsigned>
66     BCCDisplacementBits("aarch64-bcc-offset-bits", cl::Hidden, cl::init(19),
67                         cl::desc("Restrict range of Bcc instructions (DEBUG)"));
68 
AArch64InstrInfo::AArch64InstrInfo(const AArch64Subtarget &STI)
70     : AArch64GenInstrInfo(AArch64::ADJCALLSTACKDOWN, AArch64::ADJCALLSTACKUP,
71                           AArch64::CATCHRET),
72       RI(STI.getTargetTriple()), Subtarget(STI) {}
73 
/// GetInstSize - Return the number of bytes of code the specified
/// instruction may occupy. This returns the maximum number of bytes.
unsigned AArch64InstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
77   const MachineBasicBlock &MBB = *MI.getParent();
78   const MachineFunction *MF = MBB.getParent();
79   const MCAsmInfo *MAI = MF->getTarget().getMCAsmInfo();
80 
81   {
82     auto Op = MI.getOpcode();
83     if (Op == AArch64::INLINEASM || Op == AArch64::INLINEASM_BR)
84       return getInlineAsmLength(MI.getOperand(0).getSymbolName(), *MAI);
85   }
86 
87   // Meta-instructions emit no code.
88   if (MI.isMetaInstruction())
89     return 0;
90 
91   // FIXME: We currently only handle pseudoinstructions that don't get expanded
92   //        before the assembly printer.
93   unsigned NumBytes = 0;
94   const MCInstrDesc &Desc = MI.getDesc();
95   switch (Desc.getOpcode()) {
96   default:
97     // Anything not explicitly designated otherwise is a normal 4-byte insn.
98     NumBytes = 4;
99     break;
100   case TargetOpcode::STACKMAP:
101     // The upper bound for a stackmap intrinsic is the full length of its shadow
102     NumBytes = StackMapOpers(&MI).getNumPatchBytes();
103     assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
104     break;
105   case TargetOpcode::PATCHPOINT:
106     // The size of the patchpoint intrinsic is the number of bytes requested
107     NumBytes = PatchPointOpers(&MI).getNumPatchBytes();
108     assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
109     break;
110   case AArch64::TLSDESC_CALLSEQ:
111     // This gets lowered to an instruction sequence which takes 16 bytes
112     NumBytes = 16;
113     break;
114   case AArch64::SpeculationBarrierISBDSBEndBB:
115     // This gets lowered to 2 4-byte instructions.
116     NumBytes = 8;
117     break;
118   case AArch64::SpeculationBarrierSBEndBB:
    // This gets lowered to a single 4-byte instruction.
120     NumBytes = 4;
121     break;
122   case AArch64::JumpTableDest32:
123   case AArch64::JumpTableDest16:
124   case AArch64::JumpTableDest8:
125     NumBytes = 12;
126     break;
127   case AArch64::SPACE:
128     NumBytes = MI.getOperand(1).getImm();
129     break;
130   case TargetOpcode::BUNDLE:
131     NumBytes = getInstBundleLength(MI);
132     break;
133   }
134 
135   return NumBytes;
136 }
137 
unsigned AArch64InstrInfo::getInstBundleLength(const MachineInstr &MI) const {
139   unsigned Size = 0;
140   MachineBasicBlock::const_instr_iterator I = MI.getIterator();
141   MachineBasicBlock::const_instr_iterator E = MI.getParent()->instr_end();
142   while (++I != E && I->isInsideBundle()) {
143     assert(!I->isBundle() && "No nested bundle!");
144     Size += getInstSizeInBytes(*I);
145   }
146   return Size;
147 }
148 
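// Describe the conditional branch in LastInst with a target-independent
// encoding in Cond:
//   Bcc        -> Cond = { condition code }
//   CBZ/CBNZ   -> Cond = { -1, opcode, register }
//   TBZ/TBNZ   -> Cond = { -1, opcode, register, bit number }
// The -1 sentinel distinguishes the compare-and-branch forms from a plain Bcc.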
static void parseCondBranch(MachineInstr *LastInst, MachineBasicBlock *&Target,
150                             SmallVectorImpl<MachineOperand> &Cond) {
151   // Block ends with fall-through condbranch.
152   switch (LastInst->getOpcode()) {
153   default:
154     llvm_unreachable("Unknown branch instruction?");
155   case AArch64::Bcc:
156     Target = LastInst->getOperand(1).getMBB();
157     Cond.push_back(LastInst->getOperand(0));
158     break;
159   case AArch64::CBZW:
160   case AArch64::CBZX:
161   case AArch64::CBNZW:
162   case AArch64::CBNZX:
163     Target = LastInst->getOperand(1).getMBB();
164     Cond.push_back(MachineOperand::CreateImm(-1));
165     Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode()));
166     Cond.push_back(LastInst->getOperand(0));
167     break;
168   case AArch64::TBZW:
169   case AArch64::TBZX:
170   case AArch64::TBNZW:
171   case AArch64::TBNZX:
172     Target = LastInst->getOperand(2).getMBB();
173     Cond.push_back(MachineOperand::CreateImm(-1));
174     Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode()));
175     Cond.push_back(LastInst->getOperand(0));
176     Cond.push_back(LastInst->getOperand(1));
177   }
178 }
179 
static unsigned getBranchDisplacementBits(unsigned Opc) {
181   switch (Opc) {
182   default:
183     llvm_unreachable("unexpected opcode!");
184   case AArch64::B:
185     return 64;
186   case AArch64::TBNZW:
187   case AArch64::TBZW:
188   case AArch64::TBNZX:
189   case AArch64::TBZX:
190     return TBZDisplacementBits;
191   case AArch64::CBNZW:
192   case AArch64::CBZW:
193   case AArch64::CBNZX:
194   case AArch64::CBZX:
195     return CBZDisplacementBits;
196   case AArch64::Bcc:
197     return BCCDisplacementBits;
198   }
199 }
200 
bool AArch64InstrInfo::isBranchOffsetInRange(unsigned BranchOp,
202                                              int64_t BrOffset) const {
203   unsigned Bits = getBranchDisplacementBits(BranchOp);
  assert(Bits >= 3 && "max branch displacement must be enough to jump "
                      "over conditional branch expansion");
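  // Branch displacements are encoded in units of 4-byte instructions, so an
  // N-bit field covers +/-2^(N-1) instructions: the 19-bit Bcc/CB[N]Z field
  // reaches roughly +/-1 MiB and the 14-bit TB[N]Z field roughly +/-32 KiB.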
206   return isIntN(Bits, BrOffset / 4);
207 }
208 
209 MachineBasicBlock *
AArch64InstrInfo::getBranchDestBlock(const MachineInstr &MI) const {
211   switch (MI.getOpcode()) {
212   default:
213     llvm_unreachable("unexpected opcode!");
214   case AArch64::B:
215     return MI.getOperand(0).getMBB();
216   case AArch64::TBZW:
217   case AArch64::TBNZW:
218   case AArch64::TBZX:
219   case AArch64::TBNZX:
220     return MI.getOperand(2).getMBB();
221   case AArch64::CBZW:
222   case AArch64::CBNZW:
223   case AArch64::CBZX:
224   case AArch64::CBNZX:
225   case AArch64::Bcc:
226     return MI.getOperand(1).getMBB();
227   }
228 }
229 
230 // Branch analysis.
bool AArch64InstrInfo::analyzeBranch(MachineBasicBlock &MBB,
232                                      MachineBasicBlock *&TBB,
233                                      MachineBasicBlock *&FBB,
234                                      SmallVectorImpl<MachineOperand> &Cond,
235                                      bool AllowModify) const {
236   // If the block has no terminators, it just falls into the block after it.
237   MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
238   if (I == MBB.end())
239     return false;
240 
241   // Skip over SpeculationBarrierEndBB terminators
242   if (I->getOpcode() == AArch64::SpeculationBarrierISBDSBEndBB ||
243       I->getOpcode() == AArch64::SpeculationBarrierSBEndBB) {
244     --I;
245   }
246 
247   if (!isUnpredicatedTerminator(*I))
248     return false;
249 
250   // Get the last instruction in the block.
251   MachineInstr *LastInst = &*I;
252 
253   // If there is only one terminator instruction, process it.
254   unsigned LastOpc = LastInst->getOpcode();
255   if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
256     if (isUncondBranchOpcode(LastOpc)) {
257       TBB = LastInst->getOperand(0).getMBB();
258       return false;
259     }
260     if (isCondBranchOpcode(LastOpc)) {
261       // Block ends with fall-through condbranch.
262       parseCondBranch(LastInst, TBB, Cond);
263       return false;
264     }
265     return true; // Can't handle indirect branch.
266   }
267 
268   // Get the instruction before it if it is a terminator.
269   MachineInstr *SecondLastInst = &*I;
270   unsigned SecondLastOpc = SecondLastInst->getOpcode();
271 
272   // If AllowModify is true and the block ends with two or more unconditional
273   // branches, delete all but the first unconditional branch.
274   if (AllowModify && isUncondBranchOpcode(LastOpc)) {
275     while (isUncondBranchOpcode(SecondLastOpc)) {
276       LastInst->eraseFromParent();
277       LastInst = SecondLastInst;
278       LastOpc = LastInst->getOpcode();
279       if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
        // Return now that the only terminator is an unconditional branch.
281         TBB = LastInst->getOperand(0).getMBB();
282         return false;
283       } else {
284         SecondLastInst = &*I;
285         SecondLastOpc = SecondLastInst->getOpcode();
286       }
287     }
288   }
289 
290   // If there are three terminators, we don't know what sort of block this is.
291   if (SecondLastInst && I != MBB.begin() && isUnpredicatedTerminator(*--I))
292     return true;
293 
294   // If the block ends with a B and a Bcc, handle it.
295   if (isCondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
296     parseCondBranch(SecondLastInst, TBB, Cond);
297     FBB = LastInst->getOperand(0).getMBB();
298     return false;
299   }
300 
301   // If the block ends with two unconditional branches, handle it.  The second
302   // one is not executed, so remove it.
303   if (isUncondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
304     TBB = SecondLastInst->getOperand(0).getMBB();
305     I = LastInst;
306     if (AllowModify)
307       I->eraseFromParent();
308     return false;
309   }
310 
311   // ...likewise if it ends with an indirect branch followed by an unconditional
312   // branch.
313   if (isIndirectBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
314     I = LastInst;
315     if (AllowModify)
316       I->eraseFromParent();
317     return true;
318   }
319 
320   // Otherwise, can't handle this.
321   return true;
322 }
323 
bool AArch64InstrInfo::reverseBranchCondition(
325     SmallVectorImpl<MachineOperand> &Cond) const {
326   if (Cond[0].getImm() != -1) {
327     // Regular Bcc
328     AArch64CC::CondCode CC = (AArch64CC::CondCode)(int)Cond[0].getImm();
329     Cond[0].setImm(AArch64CC::getInvertedCondCode(CC));
330   } else {
331     // Folded compare-and-branch
332     switch (Cond[1].getImm()) {
333     default:
334       llvm_unreachable("Unknown conditional branch!");
335     case AArch64::CBZW:
336       Cond[1].setImm(AArch64::CBNZW);
337       break;
338     case AArch64::CBNZW:
339       Cond[1].setImm(AArch64::CBZW);
340       break;
341     case AArch64::CBZX:
342       Cond[1].setImm(AArch64::CBNZX);
343       break;
344     case AArch64::CBNZX:
345       Cond[1].setImm(AArch64::CBZX);
346       break;
347     case AArch64::TBZW:
348       Cond[1].setImm(AArch64::TBNZW);
349       break;
350     case AArch64::TBNZW:
351       Cond[1].setImm(AArch64::TBZW);
352       break;
353     case AArch64::TBZX:
354       Cond[1].setImm(AArch64::TBNZX);
355       break;
356     case AArch64::TBNZX:
357       Cond[1].setImm(AArch64::TBZX);
358       break;
359     }
360   }
361 
362   return false;
363 }
364 
unsigned AArch64InstrInfo::removeBranch(MachineBasicBlock &MBB,
366                                         int *BytesRemoved) const {
367   MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
368   if (I == MBB.end())
369     return 0;
370 
371   if (!isUncondBranchOpcode(I->getOpcode()) &&
372       !isCondBranchOpcode(I->getOpcode()))
373     return 0;
374 
375   // Remove the branch.
376   I->eraseFromParent();
377 
378   I = MBB.end();
379 
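  // If the instruction now at the end of the block is a conditional branch,
  // it formed a conditional/unconditional pair with the branch just removed;
  // erase it as well and report two removed branches.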
380   if (I == MBB.begin()) {
381     if (BytesRemoved)
382       *BytesRemoved = 4;
383     return 1;
384   }
385   --I;
386   if (!isCondBranchOpcode(I->getOpcode())) {
387     if (BytesRemoved)
388       *BytesRemoved = 4;
389     return 1;
390   }
391 
392   // Remove the branch.
393   I->eraseFromParent();
394   if (BytesRemoved)
395     *BytesRemoved = 8;
396 
397   return 2;
398 }
399 
void AArch64InstrInfo::instantiateCondBranch(
401     MachineBasicBlock &MBB, const DebugLoc &DL, MachineBasicBlock *TBB,
402     ArrayRef<MachineOperand> Cond) const {
403   if (Cond[0].getImm() != -1) {
404     // Regular Bcc
405     BuildMI(&MBB, DL, get(AArch64::Bcc)).addImm(Cond[0].getImm()).addMBB(TBB);
406   } else {
407     // Folded compare-and-branch
408     // Note that we use addOperand instead of addReg to keep the flags.
409     const MachineInstrBuilder MIB =
410         BuildMI(&MBB, DL, get(Cond[1].getImm())).add(Cond[2]);
411     if (Cond.size() > 3)
412       MIB.addImm(Cond[3].getImm());
413     MIB.addMBB(TBB);
414   }
415 }
416 
unsigned AArch64InstrInfo::insertBranch(
418     MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB,
419     ArrayRef<MachineOperand> Cond, const DebugLoc &DL, int *BytesAdded) const {
420   // Shouldn't be a fall through.
421   assert(TBB && "insertBranch must not be told to insert a fallthrough");
422 
423   if (!FBB) {
424     if (Cond.empty()) // Unconditional branch?
425       BuildMI(&MBB, DL, get(AArch64::B)).addMBB(TBB);
426     else
427       instantiateCondBranch(MBB, DL, TBB, Cond);
428 
429     if (BytesAdded)
430       *BytesAdded = 4;
431 
432     return 1;
433   }
434 
435   // Two-way conditional branch.
436   instantiateCondBranch(MBB, DL, TBB, Cond);
437   BuildMI(&MBB, DL, get(AArch64::B)).addMBB(FBB);
438 
439   if (BytesAdded)
440     *BytesAdded = 8;
441 
442   return 2;
443 }
444 
445 // Find the original register that VReg is copied from.
static unsigned removeCopies(const MachineRegisterInfo &MRI, unsigned VReg) {
447   while (Register::isVirtualRegister(VReg)) {
448     const MachineInstr *DefMI = MRI.getVRegDef(VReg);
449     if (!DefMI->isFullCopy())
450       return VReg;
451     VReg = DefMI->getOperand(1).getReg();
452   }
453   return VReg;
454 }
455 
456 // Determine if VReg is defined by an instruction that can be folded into a
457 // csel instruction. If so, return the folded opcode, and the replacement
458 // register.
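// For example, "add w1, w2, #1" feeding the select can be folded into CSINC,
// "orn w1, wzr, w2" (a bitwise NOT) into CSINV, and "sub w1, wzr, w2" (a
// negation) into CSNEG.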
static unsigned canFoldIntoCSel(const MachineRegisterInfo &MRI, unsigned VReg,
460                                 unsigned *NewVReg = nullptr) {
461   VReg = removeCopies(MRI, VReg);
462   if (!Register::isVirtualRegister(VReg))
463     return 0;
464 
465   bool Is64Bit = AArch64::GPR64allRegClass.hasSubClassEq(MRI.getRegClass(VReg));
466   const MachineInstr *DefMI = MRI.getVRegDef(VReg);
467   unsigned Opc = 0;
468   unsigned SrcOpNum = 0;
469   switch (DefMI->getOpcode()) {
470   case AArch64::ADDSXri:
471   case AArch64::ADDSWri:
472     // if NZCV is used, do not fold.
473     if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, true) == -1)
474       return 0;
475     // fall-through to ADDXri and ADDWri.
476     LLVM_FALLTHROUGH;
477   case AArch64::ADDXri:
478   case AArch64::ADDWri:
479     // add x, 1 -> csinc.
480     if (!DefMI->getOperand(2).isImm() || DefMI->getOperand(2).getImm() != 1 ||
481         DefMI->getOperand(3).getImm() != 0)
482       return 0;
483     SrcOpNum = 1;
484     Opc = Is64Bit ? AArch64::CSINCXr : AArch64::CSINCWr;
485     break;
486 
487   case AArch64::ORNXrr:
488   case AArch64::ORNWrr: {
489     // not x -> csinv, represented as orn dst, xzr, src.
490     unsigned ZReg = removeCopies(MRI, DefMI->getOperand(1).getReg());
491     if (ZReg != AArch64::XZR && ZReg != AArch64::WZR)
492       return 0;
493     SrcOpNum = 2;
494     Opc = Is64Bit ? AArch64::CSINVXr : AArch64::CSINVWr;
495     break;
496   }
497 
498   case AArch64::SUBSXrr:
499   case AArch64::SUBSWrr:
500     // if NZCV is used, do not fold.
501     if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, true) == -1)
502       return 0;
503     // fall-through to SUBXrr and SUBWrr.
504     LLVM_FALLTHROUGH;
505   case AArch64::SUBXrr:
506   case AArch64::SUBWrr: {
507     // neg x -> csneg, represented as sub dst, xzr, src.
508     unsigned ZReg = removeCopies(MRI, DefMI->getOperand(1).getReg());
509     if (ZReg != AArch64::XZR && ZReg != AArch64::WZR)
510       return 0;
511     SrcOpNum = 2;
512     Opc = Is64Bit ? AArch64::CSNEGXr : AArch64::CSNEGWr;
513     break;
514   }
515   default:
516     return 0;
517   }
518   assert(Opc && SrcOpNum && "Missing parameters");
519 
520   if (NewVReg)
521     *NewVReg = DefMI->getOperand(SrcOpNum).getReg();
522   return Opc;
523 }
524 
bool AArch64InstrInfo::canInsertSelect(const MachineBasicBlock &MBB,
526                                        ArrayRef<MachineOperand> Cond,
527                                        Register DstReg, Register TrueReg,
528                                        Register FalseReg, int &CondCycles,
529                                        int &TrueCycles,
530                                        int &FalseCycles) const {
531   // Check register classes.
532   const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
533   const TargetRegisterClass *RC =
534       RI.getCommonSubClass(MRI.getRegClass(TrueReg), MRI.getRegClass(FalseReg));
535   if (!RC)
536     return false;
537 
538   // Also need to check the dest regclass, in case we're trying to optimize
539   // something like:
540   // %1(gpr) = PHI %2(fpr), bb1, %(fpr), bb2
541   if (!RI.getCommonSubClass(RC, MRI.getRegClass(DstReg)))
542     return false;
543 
544   // Expanding cbz/tbz requires an extra cycle of latency on the condition.
545   unsigned ExtraCondLat = Cond.size() != 1;
546 
547   // GPRs are handled by csel.
548   // FIXME: Fold in x+1, -x, and ~x when applicable.
549   if (AArch64::GPR64allRegClass.hasSubClassEq(RC) ||
550       AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
551     // Single-cycle csel, csinc, csinv, and csneg.
552     CondCycles = 1 + ExtraCondLat;
553     TrueCycles = FalseCycles = 1;
554     if (canFoldIntoCSel(MRI, TrueReg))
555       TrueCycles = 0;
556     else if (canFoldIntoCSel(MRI, FalseReg))
557       FalseCycles = 0;
558     return true;
559   }
560 
561   // Scalar floating point is handled by fcsel.
562   // FIXME: Form fabs, fmin, and fmax when applicable.
563   if (AArch64::FPR64RegClass.hasSubClassEq(RC) ||
564       AArch64::FPR32RegClass.hasSubClassEq(RC)) {
565     CondCycles = 5 + ExtraCondLat;
566     TrueCycles = FalseCycles = 2;
567     return true;
568   }
569 
570   // Can't do vectors.
571   return false;
572 }
573 
void AArch64InstrInfo::insertSelect(MachineBasicBlock &MBB,
575                                     MachineBasicBlock::iterator I,
576                                     const DebugLoc &DL, Register DstReg,
577                                     ArrayRef<MachineOperand> Cond,
578                                     Register TrueReg, Register FalseReg) const {
579   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
580 
581   // Parse the condition code, see parseCondBranch() above.
582   AArch64CC::CondCode CC;
583   switch (Cond.size()) {
584   default:
585     llvm_unreachable("Unknown condition opcode in Cond");
586   case 1: // b.cc
587     CC = AArch64CC::CondCode(Cond[0].getImm());
588     break;
589   case 3: { // cbz/cbnz
590     // We must insert a compare against 0.
591     bool Is64Bit;
592     switch (Cond[1].getImm()) {
593     default:
594       llvm_unreachable("Unknown branch opcode in Cond");
595     case AArch64::CBZW:
596       Is64Bit = false;
597       CC = AArch64CC::EQ;
598       break;
599     case AArch64::CBZX:
600       Is64Bit = true;
601       CC = AArch64CC::EQ;
602       break;
603     case AArch64::CBNZW:
604       Is64Bit = false;
605       CC = AArch64CC::NE;
606       break;
607     case AArch64::CBNZX:
608       Is64Bit = true;
609       CC = AArch64CC::NE;
610       break;
611     }
612     Register SrcReg = Cond[2].getReg();
613     if (Is64Bit) {
614       // cmp reg, #0 is actually subs xzr, reg, #0.
615       MRI.constrainRegClass(SrcReg, &AArch64::GPR64spRegClass);
616       BuildMI(MBB, I, DL, get(AArch64::SUBSXri), AArch64::XZR)
617           .addReg(SrcReg)
618           .addImm(0)
619           .addImm(0);
620     } else {
621       MRI.constrainRegClass(SrcReg, &AArch64::GPR32spRegClass);
622       BuildMI(MBB, I, DL, get(AArch64::SUBSWri), AArch64::WZR)
623           .addReg(SrcReg)
624           .addImm(0)
625           .addImm(0);
626     }
627     break;
628   }
629   case 4: { // tbz/tbnz
630     // We must insert a tst instruction.
631     switch (Cond[1].getImm()) {
632     default:
633       llvm_unreachable("Unknown branch opcode in Cond");
634     case AArch64::TBZW:
635     case AArch64::TBZX:
636       CC = AArch64CC::EQ;
637       break;
638     case AArch64::TBNZW:
639     case AArch64::TBNZX:
640       CC = AArch64CC::NE;
641       break;
642     }
    // tst reg, #(1 << foo) is actually ands xzr, reg, #(1 << foo).
644     if (Cond[1].getImm() == AArch64::TBZW || Cond[1].getImm() == AArch64::TBNZW)
645       BuildMI(MBB, I, DL, get(AArch64::ANDSWri), AArch64::WZR)
646           .addReg(Cond[2].getReg())
647           .addImm(
648               AArch64_AM::encodeLogicalImmediate(1ull << Cond[3].getImm(), 32));
649     else
650       BuildMI(MBB, I, DL, get(AArch64::ANDSXri), AArch64::XZR)
651           .addReg(Cond[2].getReg())
652           .addImm(
653               AArch64_AM::encodeLogicalImmediate(1ull << Cond[3].getImm(), 64));
654     break;
655   }
656   }
657 
658   unsigned Opc = 0;
659   const TargetRegisterClass *RC = nullptr;
660   bool TryFold = false;
661   if (MRI.constrainRegClass(DstReg, &AArch64::GPR64RegClass)) {
662     RC = &AArch64::GPR64RegClass;
663     Opc = AArch64::CSELXr;
664     TryFold = true;
665   } else if (MRI.constrainRegClass(DstReg, &AArch64::GPR32RegClass)) {
666     RC = &AArch64::GPR32RegClass;
667     Opc = AArch64::CSELWr;
668     TryFold = true;
669   } else if (MRI.constrainRegClass(DstReg, &AArch64::FPR64RegClass)) {
670     RC = &AArch64::FPR64RegClass;
671     Opc = AArch64::FCSELDrrr;
672   } else if (MRI.constrainRegClass(DstReg, &AArch64::FPR32RegClass)) {
673     RC = &AArch64::FPR32RegClass;
674     Opc = AArch64::FCSELSrrr;
675   }
676   assert(RC && "Unsupported regclass");
677 
678   // Try folding simple instructions into the csel.
679   if (TryFold) {
680     unsigned NewVReg = 0;
681     unsigned FoldedOpc = canFoldIntoCSel(MRI, TrueReg, &NewVReg);
682     if (FoldedOpc) {
      // The folded opcodes csinc, csinv and csneg apply the operation to
684       // FalseReg, so we need to invert the condition.
685       CC = AArch64CC::getInvertedCondCode(CC);
686       TrueReg = FalseReg;
687     } else
688       FoldedOpc = canFoldIntoCSel(MRI, FalseReg, &NewVReg);
689 
690     // Fold the operation. Leave any dead instructions for DCE to clean up.
691     if (FoldedOpc) {
692       FalseReg = NewVReg;
693       Opc = FoldedOpc;
      // This extends the live range of NewVReg.
695       MRI.clearKillFlags(NewVReg);
696     }
697   }
698 
  // Pull all virtual registers into the appropriate class.
700   MRI.constrainRegClass(TrueReg, RC);
701   MRI.constrainRegClass(FalseReg, RC);
702 
703   // Insert the csel.
704   BuildMI(MBB, I, DL, get(Opc), DstReg)
705       .addReg(TrueReg)
706       .addReg(FalseReg)
707       .addImm(CC);
708 }
709 
/// Returns true if a MOVi32imm or MOVi64imm can be expanded into an ORRWri or
/// ORRXri.
static bool canBeExpandedToORR(const MachineInstr &MI, unsigned BitSize) {
712   uint64_t Imm = MI.getOperand(1).getImm();
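  // Truncate the immediate to the low BitSize bits before asking whether it
  // is encodable as a logical (bitmask) immediate.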
713   uint64_t UImm = Imm << (64 - BitSize) >> (64 - BitSize);
714   uint64_t Encoding;
715   return AArch64_AM::processLogicalImmediate(UImm, BitSize, Encoding);
716 }
717 
718 // FIXME: this implementation should be micro-architecture dependent, so a
719 // micro-architecture target hook should be introduced here in future.
bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr &MI) const {
721   if (!Subtarget.hasCustomCheapAsMoveHandling())
722     return MI.isAsCheapAsAMove();
723 
724   const unsigned Opcode = MI.getOpcode();
725 
726   // Firstly, check cases gated by features.
727 
728   if (Subtarget.hasZeroCycleZeroingFP()) {
729     if (Opcode == AArch64::FMOVH0 ||
730         Opcode == AArch64::FMOVS0 ||
731         Opcode == AArch64::FMOVD0)
732       return true;
733   }
734 
735   if (Subtarget.hasZeroCycleZeroingGP()) {
736     if (Opcode == TargetOpcode::COPY &&
737         (MI.getOperand(1).getReg() == AArch64::WZR ||
738          MI.getOperand(1).getReg() == AArch64::XZR))
739       return true;
740   }
741 
742   // Secondly, check cases specific to sub-targets.
743 
744   if (Subtarget.hasExynosCheapAsMoveHandling()) {
745     if (isExynosCheapAsMove(MI))
746       return true;
747 
748     return MI.isAsCheapAsAMove();
749   }
750 
751   // Finally, check generic cases.
752 
753   switch (Opcode) {
754   default:
755     return false;
756 
757   // add/sub on register without shift
758   case AArch64::ADDWri:
759   case AArch64::ADDXri:
760   case AArch64::SUBWri:
761   case AArch64::SUBXri:
762     return (MI.getOperand(3).getImm() == 0);
763 
764   // logical ops on immediate
765   case AArch64::ANDWri:
766   case AArch64::ANDXri:
767   case AArch64::EORWri:
768   case AArch64::EORXri:
769   case AArch64::ORRWri:
770   case AArch64::ORRXri:
771     return true;
772 
773   // logical ops on register without shift
774   case AArch64::ANDWrr:
775   case AArch64::ANDXrr:
776   case AArch64::BICWrr:
777   case AArch64::BICXrr:
778   case AArch64::EONWrr:
779   case AArch64::EONXrr:
780   case AArch64::EORWrr:
781   case AArch64::EORXrr:
782   case AArch64::ORNWrr:
783   case AArch64::ORNXrr:
784   case AArch64::ORRWrr:
785   case AArch64::ORRXrr:
786     return true;
787 
788   // If MOVi32imm or MOVi64imm can be expanded into ORRWri or
789   // ORRXri, it is as cheap as MOV
790   case AArch64::MOVi32imm:
791     return canBeExpandedToORR(MI, 32);
792   case AArch64::MOVi64imm:
793     return canBeExpandedToORR(MI, 64);
794   }
795 
796   llvm_unreachable("Unknown opcode to check as cheap as a move!");
797 }
798 
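// Return true if the shift or extend folded into MI is fast on Falkor, e.g.
// zero or small LSL amounts on add/sub, unsigned extends with small shift
// amounts, and register-offset loads/stores whose offset register is not
// sign-extended.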
bool AArch64InstrInfo::isFalkorShiftExtFast(const MachineInstr &MI) {
800   switch (MI.getOpcode()) {
801   default:
802     return false;
803 
804   case AArch64::ADDWrs:
805   case AArch64::ADDXrs:
806   case AArch64::ADDSWrs:
807   case AArch64::ADDSXrs: {
808     unsigned Imm = MI.getOperand(3).getImm();
809     unsigned ShiftVal = AArch64_AM::getShiftValue(Imm);
810     if (ShiftVal == 0)
811       return true;
812     return AArch64_AM::getShiftType(Imm) == AArch64_AM::LSL && ShiftVal <= 5;
813   }
814 
815   case AArch64::ADDWrx:
816   case AArch64::ADDXrx:
817   case AArch64::ADDXrx64:
818   case AArch64::ADDSWrx:
819   case AArch64::ADDSXrx:
820   case AArch64::ADDSXrx64: {
821     unsigned Imm = MI.getOperand(3).getImm();
822     switch (AArch64_AM::getArithExtendType(Imm)) {
823     default:
824       return false;
825     case AArch64_AM::UXTB:
826     case AArch64_AM::UXTH:
827     case AArch64_AM::UXTW:
828     case AArch64_AM::UXTX:
829       return AArch64_AM::getArithShiftValue(Imm) <= 4;
830     }
831   }
832 
833   case AArch64::SUBWrs:
834   case AArch64::SUBSWrs: {
835     unsigned Imm = MI.getOperand(3).getImm();
836     unsigned ShiftVal = AArch64_AM::getShiftValue(Imm);
837     return ShiftVal == 0 ||
838            (AArch64_AM::getShiftType(Imm) == AArch64_AM::ASR && ShiftVal == 31);
839   }
840 
841   case AArch64::SUBXrs:
842   case AArch64::SUBSXrs: {
843     unsigned Imm = MI.getOperand(3).getImm();
844     unsigned ShiftVal = AArch64_AM::getShiftValue(Imm);
845     return ShiftVal == 0 ||
846            (AArch64_AM::getShiftType(Imm) == AArch64_AM::ASR && ShiftVal == 63);
847   }
848 
849   case AArch64::SUBWrx:
850   case AArch64::SUBXrx:
851   case AArch64::SUBXrx64:
852   case AArch64::SUBSWrx:
853   case AArch64::SUBSXrx:
854   case AArch64::SUBSXrx64: {
855     unsigned Imm = MI.getOperand(3).getImm();
856     switch (AArch64_AM::getArithExtendType(Imm)) {
857     default:
858       return false;
859     case AArch64_AM::UXTB:
860     case AArch64_AM::UXTH:
861     case AArch64_AM::UXTW:
862     case AArch64_AM::UXTX:
863       return AArch64_AM::getArithShiftValue(Imm) == 0;
864     }
865   }
866 
867   case AArch64::LDRBBroW:
868   case AArch64::LDRBBroX:
869   case AArch64::LDRBroW:
870   case AArch64::LDRBroX:
871   case AArch64::LDRDroW:
872   case AArch64::LDRDroX:
873   case AArch64::LDRHHroW:
874   case AArch64::LDRHHroX:
875   case AArch64::LDRHroW:
876   case AArch64::LDRHroX:
877   case AArch64::LDRQroW:
878   case AArch64::LDRQroX:
879   case AArch64::LDRSBWroW:
880   case AArch64::LDRSBWroX:
881   case AArch64::LDRSBXroW:
882   case AArch64::LDRSBXroX:
883   case AArch64::LDRSHWroW:
884   case AArch64::LDRSHWroX:
885   case AArch64::LDRSHXroW:
886   case AArch64::LDRSHXroX:
887   case AArch64::LDRSWroW:
888   case AArch64::LDRSWroX:
889   case AArch64::LDRSroW:
890   case AArch64::LDRSroX:
891   case AArch64::LDRWroW:
892   case AArch64::LDRWroX:
893   case AArch64::LDRXroW:
894   case AArch64::LDRXroX:
895   case AArch64::PRFMroW:
896   case AArch64::PRFMroX:
897   case AArch64::STRBBroW:
898   case AArch64::STRBBroX:
899   case AArch64::STRBroW:
900   case AArch64::STRBroX:
901   case AArch64::STRDroW:
902   case AArch64::STRDroX:
903   case AArch64::STRHHroW:
904   case AArch64::STRHHroX:
905   case AArch64::STRHroW:
906   case AArch64::STRHroX:
907   case AArch64::STRQroW:
908   case AArch64::STRQroX:
909   case AArch64::STRSroW:
910   case AArch64::STRSroX:
911   case AArch64::STRWroW:
912   case AArch64::STRWroX:
913   case AArch64::STRXroW:
914   case AArch64::STRXroX: {
915     unsigned IsSigned = MI.getOperand(3).getImm();
916     return !IsSigned;
917   }
918   }
919 }
920 
bool AArch64InstrInfo::isSEHInstruction(const MachineInstr &MI) {
922   unsigned Opc = MI.getOpcode();
923   switch (Opc) {
924     default:
925       return false;
926     case AArch64::SEH_StackAlloc:
927     case AArch64::SEH_SaveFPLR:
928     case AArch64::SEH_SaveFPLR_X:
929     case AArch64::SEH_SaveReg:
930     case AArch64::SEH_SaveReg_X:
931     case AArch64::SEH_SaveRegP:
932     case AArch64::SEH_SaveRegP_X:
933     case AArch64::SEH_SaveFReg:
934     case AArch64::SEH_SaveFReg_X:
935     case AArch64::SEH_SaveFRegP:
936     case AArch64::SEH_SaveFRegP_X:
937     case AArch64::SEH_SetFP:
938     case AArch64::SEH_AddFP:
939     case AArch64::SEH_Nop:
940     case AArch64::SEH_PrologEnd:
941     case AArch64::SEH_EpilogStart:
942     case AArch64::SEH_EpilogEnd:
943       return true;
944   }
945 }
946 
bool AArch64InstrInfo::isCoalescableExtInstr(const MachineInstr &MI,
948                                              Register &SrcReg, Register &DstReg,
949                                              unsigned &SubIdx) const {
950   switch (MI.getOpcode()) {
951   default:
952     return false;
953   case AArch64::SBFMXri: // aka sxtw
954   case AArch64::UBFMXri: // aka uxtw
955     // Check for the 32 -> 64 bit extension case, these instructions can do
956     // much more.
957     if (MI.getOperand(2).getImm() != 0 || MI.getOperand(3).getImm() != 31)
958       return false;
959     // This is a signed or unsigned 32 -> 64 bit extension.
960     SrcReg = MI.getOperand(1).getReg();
961     DstReg = MI.getOperand(0).getReg();
962     SubIdx = AArch64::sub_32;
963     return true;
964   }
965 }
966 
bool AArch64InstrInfo::areMemAccessesTriviallyDisjoint(
968     const MachineInstr &MIa, const MachineInstr &MIb) const {
969   const TargetRegisterInfo *TRI = &getRegisterInfo();
970   const MachineOperand *BaseOpA = nullptr, *BaseOpB = nullptr;
971   int64_t OffsetA = 0, OffsetB = 0;
972   unsigned WidthA = 0, WidthB = 0;
973   bool OffsetAIsScalable = false, OffsetBIsScalable = false;
974 
975   assert(MIa.mayLoadOrStore() && "MIa must be a load or store.");
976   assert(MIb.mayLoadOrStore() && "MIb must be a load or store.");
977 
978   if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects() ||
979       MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())
980     return false;
981 
  // Retrieve the base, the offset from the base, and the width. Width is the
  // size of memory that is being loaded/stored (e.g. 1, 2, 4, 8). If the
  // bases are identical, and the offset of the lower memory access plus its
  // width does not reach the offset of the higher memory access, then the
  // memory accesses are disjoint.
  // If OffsetAIsScalable and OffsetBIsScalable are both true, they
  // are assumed to have the same scale (vscale).
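  // For example, an 8-byte store at [x1, #0] and an 8-byte load at [x1, #8]
  // share a base and satisfy 0 + 8 <= 8, so they cannot alias.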
989   if (getMemOperandWithOffsetWidth(MIa, BaseOpA, OffsetA, OffsetAIsScalable,
990                                    WidthA, TRI) &&
991       getMemOperandWithOffsetWidth(MIb, BaseOpB, OffsetB, OffsetBIsScalable,
992                                    WidthB, TRI)) {
993     if (BaseOpA->isIdenticalTo(*BaseOpB) &&
994         OffsetAIsScalable == OffsetBIsScalable) {
995       int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
996       int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
997       int LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
998       if (LowOffset + LowWidth <= HighOffset)
999         return true;
1000     }
1001   }
1002   return false;
1003 }
1004 
bool AArch64InstrInfo::isSchedulingBoundary(const MachineInstr &MI,
1006                                             const MachineBasicBlock *MBB,
1007                                             const MachineFunction &MF) const {
1008   if (TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF))
1009     return true;
1010   switch (MI.getOpcode()) {
1011   case AArch64::HINT:
1012     // CSDB hints are scheduling barriers.
1013     if (MI.getOperand(0).getImm() == 0x14)
1014       return true;
1015     break;
1016   case AArch64::DSB:
1017   case AArch64::ISB:
1018     // DSB and ISB also are scheduling barriers.
1019     return true;
1020   default:;
1021   }
1022   return isSEHInstruction(MI);
1023 }
1024 
1025 /// analyzeCompare - For a comparison instruction, return the source registers
1026 /// in SrcReg and SrcReg2, and the value it compares against in CmpValue.
1027 /// Return true if the comparison instruction can be analyzed.
bool AArch64InstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg,
1029                                       Register &SrcReg2, int &CmpMask,
1030                                       int &CmpValue) const {
1031   // The first operand can be a frame index where we'd normally expect a
1032   // register.
1033   assert(MI.getNumOperands() >= 2 && "All AArch64 cmps should have 2 operands");
1034   if (!MI.getOperand(1).isReg())
1035     return false;
1036 
1037   switch (MI.getOpcode()) {
1038   default:
1039     break;
1040   case AArch64::SUBSWrr:
1041   case AArch64::SUBSWrs:
1042   case AArch64::SUBSWrx:
1043   case AArch64::SUBSXrr:
1044   case AArch64::SUBSXrs:
1045   case AArch64::SUBSXrx:
1046   case AArch64::ADDSWrr:
1047   case AArch64::ADDSWrs:
1048   case AArch64::ADDSWrx:
1049   case AArch64::ADDSXrr:
1050   case AArch64::ADDSXrs:
1051   case AArch64::ADDSXrx:
    // Register-register forms; optimizeCompareInstr may later rewrite these
    // (e.g. SUBSWrr -> SUBWrr) if NZCV turns out to be unused.
1053     SrcReg = MI.getOperand(1).getReg();
1054     SrcReg2 = MI.getOperand(2).getReg();
1055     CmpMask = ~0;
1056     CmpValue = 0;
1057     return true;
1058   case AArch64::SUBSWri:
1059   case AArch64::ADDSWri:
1060   case AArch64::SUBSXri:
1061   case AArch64::ADDSXri:
1062     SrcReg = MI.getOperand(1).getReg();
1063     SrcReg2 = 0;
1064     CmpMask = ~0;
    // FIXME: CmpValue is only tracked as 0 (zero) or 1 (non-zero) here.
1066     CmpValue = MI.getOperand(2).getImm() != 0;
1067     return true;
1068   case AArch64::ANDSWri:
1069   case AArch64::ANDSXri:
    // ANDS does not use the same encoding scheme as the other xxxS
1071     // instructions.
1072     SrcReg = MI.getOperand(1).getReg();
1073     SrcReg2 = 0;
1074     CmpMask = ~0;
    // FIXME: decodeLogicalImmediate returns a uint64_t while CmpValue is an
    // int, so a straight conversion would lose the high 32 bits (this caused
    // a miscompile in spec2006-483.xalancbmk). CmpValue is only compared
    // against zero in optimizeCompareInstr, so just record whether the
    // decoded immediate is non-zero.
1080     CmpValue = AArch64_AM::decodeLogicalImmediate(
1081                    MI.getOperand(2).getImm(),
1082                    MI.getOpcode() == AArch64::ANDSWri ? 32 : 64) != 0;
1083     return true;
1084   }
1085 
1086   return false;
1087 }
1088 
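// Re-constrain each register operand of Instr to the register class required
// by its (possibly just rewritten) opcode. Returns false if any operand
// cannot satisfy its constraint.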
static bool UpdateOperandRegClass(MachineInstr &Instr) {
1090   MachineBasicBlock *MBB = Instr.getParent();
1091   assert(MBB && "Can't get MachineBasicBlock here");
1092   MachineFunction *MF = MBB->getParent();
1093   assert(MF && "Can't get MachineFunction here");
1094   const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
1095   const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
1096   MachineRegisterInfo *MRI = &MF->getRegInfo();
1097 
1098   for (unsigned OpIdx = 0, EndIdx = Instr.getNumOperands(); OpIdx < EndIdx;
1099        ++OpIdx) {
1100     MachineOperand &MO = Instr.getOperand(OpIdx);
1101     const TargetRegisterClass *OpRegCstraints =
1102         Instr.getRegClassConstraint(OpIdx, TII, TRI);
1103 
1104     // If there's no constraint, there's nothing to do.
1105     if (!OpRegCstraints)
1106       continue;
1107     // If the operand is a frame index, there's nothing to do here.
1108     // A frame index operand will resolve correctly during PEI.
1109     if (MO.isFI())
1110       continue;
1111 
1112     assert(MO.isReg() &&
1113            "Operand has register constraints without being a register!");
1114 
1115     Register Reg = MO.getReg();
1116     if (Register::isPhysicalRegister(Reg)) {
1117       if (!OpRegCstraints->contains(Reg))
1118         return false;
1119     } else if (!OpRegCstraints->hasSubClassEq(MRI->getRegClass(Reg)) &&
1120                !MRI->constrainRegClass(Reg, OpRegCstraints))
1121       return false;
1122   }
1123 
1124   return true;
1125 }
1126 
1127 /// Return the opcode that does not set flags when possible - otherwise
1128 /// return the original opcode. The caller is responsible to do the actual
1129 /// substitution and legality checking.
static unsigned convertToNonFlagSettingOpc(const MachineInstr &MI) {
1131   // Don't convert all compare instructions, because for some the zero register
1132   // encoding becomes the sp register.
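  // For example, "cmp w0, #1" is "subs wzr, w0, #1"; rewriting it to SUBWri
  // would reinterpret register 31 as WSP rather than WZR, changing the
  // instruction's meaning.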
1133   bool MIDefinesZeroReg = false;
1134   if (MI.definesRegister(AArch64::WZR) || MI.definesRegister(AArch64::XZR))
1135     MIDefinesZeroReg = true;
1136 
1137   switch (MI.getOpcode()) {
1138   default:
1139     return MI.getOpcode();
1140   case AArch64::ADDSWrr:
1141     return AArch64::ADDWrr;
1142   case AArch64::ADDSWri:
1143     return MIDefinesZeroReg ? AArch64::ADDSWri : AArch64::ADDWri;
1144   case AArch64::ADDSWrs:
1145     return MIDefinesZeroReg ? AArch64::ADDSWrs : AArch64::ADDWrs;
1146   case AArch64::ADDSWrx:
1147     return AArch64::ADDWrx;
1148   case AArch64::ADDSXrr:
1149     return AArch64::ADDXrr;
1150   case AArch64::ADDSXri:
1151     return MIDefinesZeroReg ? AArch64::ADDSXri : AArch64::ADDXri;
1152   case AArch64::ADDSXrs:
1153     return MIDefinesZeroReg ? AArch64::ADDSXrs : AArch64::ADDXrs;
1154   case AArch64::ADDSXrx:
1155     return AArch64::ADDXrx;
1156   case AArch64::SUBSWrr:
1157     return AArch64::SUBWrr;
1158   case AArch64::SUBSWri:
1159     return MIDefinesZeroReg ? AArch64::SUBSWri : AArch64::SUBWri;
1160   case AArch64::SUBSWrs:
1161     return MIDefinesZeroReg ? AArch64::SUBSWrs : AArch64::SUBWrs;
1162   case AArch64::SUBSWrx:
1163     return AArch64::SUBWrx;
1164   case AArch64::SUBSXrr:
1165     return AArch64::SUBXrr;
1166   case AArch64::SUBSXri:
1167     return MIDefinesZeroReg ? AArch64::SUBSXri : AArch64::SUBXri;
1168   case AArch64::SUBSXrs:
1169     return MIDefinesZeroReg ? AArch64::SUBSXrs : AArch64::SUBXrs;
1170   case AArch64::SUBSXrx:
1171     return AArch64::SUBXrx;
1172   }
1173 }
1174 
1175 enum AccessKind { AK_Write = 0x01, AK_Read = 0x10, AK_All = 0x11 };
1176 
1177 /// True when condition flags are accessed (either by writing or reading)
1178 /// on the instruction trace starting at From and ending at To.
1179 ///
1180 /// Note: If From and To are from different blocks it's assumed CC are accessed
1181 ///       on the path.
static bool areCFlagsAccessedBetweenInstrs(
1183     MachineBasicBlock::iterator From, MachineBasicBlock::iterator To,
1184     const TargetRegisterInfo *TRI, const AccessKind AccessToCheck = AK_All) {
1185   // Early exit if To is at the beginning of the BB.
1186   if (To == To->getParent()->begin())
1187     return true;
1188 
1189   // Check whether the instructions are in the same basic block
1190   // If not, assume the condition flags might get modified somewhere.
1191   if (To->getParent() != From->getParent())
1192     return true;
1193 
1194   // From must be above To.
1195   assert(std::find_if(++To.getReverse(), To->getParent()->rend(),
1196                       [From](MachineInstr &MI) {
1197                         return MI.getIterator() == From;
1198                       }) != To->getParent()->rend());
1199 
1200   // We iterate backward starting at \p To until we hit \p From.
1201   for (const MachineInstr &Instr :
1202        instructionsWithoutDebug(++To.getReverse(), From.getReverse())) {
1203     if (((AccessToCheck & AK_Write) &&
1204          Instr.modifiesRegister(AArch64::NZCV, TRI)) ||
1205         ((AccessToCheck & AK_Read) && Instr.readsRegister(AArch64::NZCV, TRI)))
1206       return true;
1207   }
1208   return false;
1209 }
1210 
/// Try to optimize a compare instruction. A compare instruction is an
/// instruction which produces AArch64::NZCV. It acts as a pure compare when
/// there are no uses of its destination register.
1215 ///
1216 /// The following steps are tried in order:
1217 /// 1. Convert CmpInstr into an unconditional version.
1218 /// 2. Remove CmpInstr if above there is an instruction producing a needed
1219 ///    condition code or an instruction which can be converted into such an
1220 ///    instruction.
1221 ///    Only comparison with zero is supported.
bool AArch64InstrInfo::optimizeCompareInstr(
1223     MachineInstr &CmpInstr, Register SrcReg, Register SrcReg2, int CmpMask,
1224     int CmpValue, const MachineRegisterInfo *MRI) const {
1225   assert(CmpInstr.getParent());
1226   assert(MRI);
1227 
1228   // Replace SUBSWrr with SUBWrr if NZCV is not used.
1229   int DeadNZCVIdx = CmpInstr.findRegisterDefOperandIdx(AArch64::NZCV, true);
1230   if (DeadNZCVIdx != -1) {
1231     if (CmpInstr.definesRegister(AArch64::WZR) ||
1232         CmpInstr.definesRegister(AArch64::XZR)) {
1233       CmpInstr.eraseFromParent();
1234       return true;
1235     }
1236     unsigned Opc = CmpInstr.getOpcode();
1237     unsigned NewOpc = convertToNonFlagSettingOpc(CmpInstr);
1238     if (NewOpc == Opc)
1239       return false;
1240     const MCInstrDesc &MCID = get(NewOpc);
1241     CmpInstr.setDesc(MCID);
1242     CmpInstr.RemoveOperand(DeadNZCVIdx);
1243     bool succeeded = UpdateOperandRegClass(CmpInstr);
1244     (void)succeeded;
1245     assert(succeeded && "Some operands reg class are incompatible!");
1246     return true;
1247   }
1248 
  // Continue only if we have an "ri" form whose immediate is zero.
  // FIXME: CmpValue has already been collapsed to 0 or 1 by analyzeCompare().
1252   assert((CmpValue == 0 || CmpValue == 1) && "CmpValue must be 0 or 1!");
1253   if (CmpValue != 0 || SrcReg2 != 0)
1254     return false;
1255 
1256   // CmpInstr is a Compare instruction if destination register is not used.
1257   if (!MRI->use_nodbg_empty(CmpInstr.getOperand(0).getReg()))
1258     return false;
1259 
1260   return substituteCmpToZero(CmpInstr, SrcReg, MRI);
1261 }
1262 
1263 /// Get opcode of S version of Instr.
1264 /// If Instr is S version its opcode is returned.
1265 /// AArch64::INSTRUCTION_LIST_END is returned if Instr does not have S version
1266 /// or we are not interested in it.
static unsigned sForm(MachineInstr &Instr) {
1268   switch (Instr.getOpcode()) {
1269   default:
1270     return AArch64::INSTRUCTION_LIST_END;
1271 
1272   case AArch64::ADDSWrr:
1273   case AArch64::ADDSWri:
1274   case AArch64::ADDSXrr:
1275   case AArch64::ADDSXri:
1276   case AArch64::SUBSWrr:
1277   case AArch64::SUBSWri:
1278   case AArch64::SUBSXrr:
1279   case AArch64::SUBSXri:
1280     return Instr.getOpcode();
1281 
1282   case AArch64::ADDWrr:
1283     return AArch64::ADDSWrr;
1284   case AArch64::ADDWri:
1285     return AArch64::ADDSWri;
1286   case AArch64::ADDXrr:
1287     return AArch64::ADDSXrr;
1288   case AArch64::ADDXri:
1289     return AArch64::ADDSXri;
1290   case AArch64::ADCWr:
1291     return AArch64::ADCSWr;
1292   case AArch64::ADCXr:
1293     return AArch64::ADCSXr;
1294   case AArch64::SUBWrr:
1295     return AArch64::SUBSWrr;
1296   case AArch64::SUBWri:
1297     return AArch64::SUBSWri;
1298   case AArch64::SUBXrr:
1299     return AArch64::SUBSXrr;
1300   case AArch64::SUBXri:
1301     return AArch64::SUBSXri;
1302   case AArch64::SBCWr:
1303     return AArch64::SBCSWr;
1304   case AArch64::SBCXr:
1305     return AArch64::SBCSXr;
1306   case AArch64::ANDWri:
1307     return AArch64::ANDSWri;
1308   case AArch64::ANDXri:
1309     return AArch64::ANDSXri;
1310   }
1311 }
1312 
1313 /// Check if AArch64::NZCV should be alive in successors of MBB.
static bool areCFlagsAliveInSuccessors(MachineBasicBlock *MBB) {
1315   for (auto *BB : MBB->successors())
1316     if (BB->isLiveIn(AArch64::NZCV))
1317       return true;
1318   return false;
1319 }
1320 
1321 namespace {
1322 
1323 struct UsedNZCV {
1324   bool N = false;
1325   bool Z = false;
1326   bool C = false;
1327   bool V = false;
1328 
1329   UsedNZCV() = default;
1330 
  UsedNZCV &operator|=(const UsedNZCV &UsedFlags) {
1332     this->N |= UsedFlags.N;
1333     this->Z |= UsedFlags.Z;
1334     this->C |= UsedFlags.C;
1335     this->V |= UsedFlags.V;
1336     return *this;
1337   }
1338 };
1339 
1340 } // end anonymous namespace
1341 
1342 /// Find a condition code used by the instruction.
1343 /// Returns AArch64CC::Invalid if either the instruction does not use condition
1344 /// codes or we don't optimize CmpInstr in the presence of such instructions.
static AArch64CC::CondCode findCondCodeUsedByInstr(const MachineInstr &Instr) {
1346   switch (Instr.getOpcode()) {
1347   default:
1348     return AArch64CC::Invalid;
1349 
1350   case AArch64::Bcc: {
1351     int Idx = Instr.findRegisterUseOperandIdx(AArch64::NZCV);
1352     assert(Idx >= 2);
1353     return static_cast<AArch64CC::CondCode>(Instr.getOperand(Idx - 2).getImm());
1354   }
1355 
1356   case AArch64::CSINVWr:
1357   case AArch64::CSINVXr:
1358   case AArch64::CSINCWr:
1359   case AArch64::CSINCXr:
1360   case AArch64::CSELWr:
1361   case AArch64::CSELXr:
1362   case AArch64::CSNEGWr:
1363   case AArch64::CSNEGXr:
1364   case AArch64::FCSELSrrr:
1365   case AArch64::FCSELDrrr: {
1366     int Idx = Instr.findRegisterUseOperandIdx(AArch64::NZCV);
1367     assert(Idx >= 1);
1368     return static_cast<AArch64CC::CondCode>(Instr.getOperand(Idx - 1).getImm());
1369   }
1370   }
1371 }
1372 
static UsedNZCV getUsedNZCV(AArch64CC::CondCode CC) {
1374   assert(CC != AArch64CC::Invalid);
1375   UsedNZCV UsedFlags;
1376   switch (CC) {
1377   default:
1378     break;
1379 
1380   case AArch64CC::EQ: // Z set
1381   case AArch64CC::NE: // Z clear
1382     UsedFlags.Z = true;
1383     break;
1384 
1385   case AArch64CC::HI: // Z clear and C set
1386   case AArch64CC::LS: // Z set   or  C clear
1387     UsedFlags.Z = true;
1388     LLVM_FALLTHROUGH;
1389   case AArch64CC::HS: // C set
1390   case AArch64CC::LO: // C clear
1391     UsedFlags.C = true;
1392     break;
1393 
1394   case AArch64CC::MI: // N set
1395   case AArch64CC::PL: // N clear
1396     UsedFlags.N = true;
1397     break;
1398 
1399   case AArch64CC::VS: // V set
1400   case AArch64CC::VC: // V clear
1401     UsedFlags.V = true;
1402     break;
1403 
1404   case AArch64CC::GT: // Z clear, N and V the same
1405   case AArch64CC::LE: // Z set,   N and V differ
1406     UsedFlags.Z = true;
1407     LLVM_FALLTHROUGH;
1408   case AArch64CC::GE: // N and V the same
1409   case AArch64CC::LT: // N and V differ
1410     UsedFlags.N = true;
1411     UsedFlags.V = true;
1412     break;
1413   }
1414   return UsedFlags;
1415 }
1416 
static bool isADDSRegImm(unsigned Opcode) {
1418   return Opcode == AArch64::ADDSWri || Opcode == AArch64::ADDSXri;
1419 }
1420 
static bool isSUBSRegImm(unsigned Opcode) {
1422   return Opcode == AArch64::SUBSWri || Opcode == AArch64::SUBSXri;
1423 }
1424 
1425 /// Check if CmpInstr can be substituted by MI.
1426 ///
1427 /// CmpInstr can be substituted:
1428 /// - CmpInstr is either 'ADDS %vreg, 0' or 'SUBS %vreg, 0'
1429 /// - and, MI and CmpInstr are from the same MachineBB
1430 /// - and, condition flags are not alive in successors of the CmpInstr parent
1431 /// - and, if MI opcode is the S form there must be no defs of flags between
1432 ///        MI and CmpInstr
1433 ///        or if MI opcode is not the S form there must be neither defs of flags
1434 ///        nor uses of flags between MI and CmpInstr.
1435 /// - and  C/V flags are not used after CmpInstr
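/// For example, in "adds w0, w1, w2; ...; subs wzr, w0, #0; b.eq L" the SUBS
/// (a compare against zero) can be removed because the ADDS already set Z the
/// same way, as long as nothing after the compare reads the C or V flags.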
static bool canInstrSubstituteCmpInstr(MachineInstr *MI, MachineInstr *CmpInstr,
1437                                        const TargetRegisterInfo *TRI) {
1438   assert(MI);
1439   assert(sForm(*MI) != AArch64::INSTRUCTION_LIST_END);
1440   assert(CmpInstr);
1441 
1442   const unsigned CmpOpcode = CmpInstr->getOpcode();
1443   if (!isADDSRegImm(CmpOpcode) && !isSUBSRegImm(CmpOpcode))
1444     return false;
1445 
1446   if (MI->getParent() != CmpInstr->getParent())
1447     return false;
1448 
1449   if (areCFlagsAliveInSuccessors(CmpInstr->getParent()))
1450     return false;
1451 
1452   AccessKind AccessToCheck = AK_Write;
1453   if (sForm(*MI) != MI->getOpcode())
1454     AccessToCheck = AK_All;
1455   if (areCFlagsAccessedBetweenInstrs(MI, CmpInstr, TRI, AccessToCheck))
1456     return false;
1457 
1458   UsedNZCV NZCVUsedAfterCmp;
1459   for (const MachineInstr &Instr :
1460        instructionsWithoutDebug(std::next(CmpInstr->getIterator()),
1461                                 CmpInstr->getParent()->instr_end())) {
1462     if (Instr.readsRegister(AArch64::NZCV, TRI)) {
1463       AArch64CC::CondCode CC = findCondCodeUsedByInstr(Instr);
1464       if (CC == AArch64CC::Invalid) // Unsupported conditional instruction
1465         return false;
1466       NZCVUsedAfterCmp |= getUsedNZCV(CC);
1467     }
1468 
1469     if (Instr.modifiesRegister(AArch64::NZCV, TRI))
1470       break;
1471   }
1472 
1473   return !NZCVUsedAfterCmp.C && !NZCVUsedAfterCmp.V;
1474 }
1475 
1476 /// Substitute an instruction comparing to zero with another instruction
1477 /// which produces needed condition flags.
1478 ///
1479 /// Return true on success.
bool AArch64InstrInfo::substituteCmpToZero(
1481     MachineInstr &CmpInstr, unsigned SrcReg,
1482     const MachineRegisterInfo *MRI) const {
1483   assert(MRI);
1484   // Get the unique definition of SrcReg.
1485   MachineInstr *MI = MRI->getUniqueVRegDef(SrcReg);
1486   if (!MI)
1487     return false;
1488 
1489   const TargetRegisterInfo *TRI = &getRegisterInfo();
1490 
1491   unsigned NewOpc = sForm(*MI);
1492   if (NewOpc == AArch64::INSTRUCTION_LIST_END)
1493     return false;
1494 
1495   if (!canInstrSubstituteCmpInstr(MI, &CmpInstr, TRI))
1496     return false;
1497 
1498   // Update the instruction to set NZCV.
1499   MI->setDesc(get(NewOpc));
1500   CmpInstr.eraseFromParent();
1501   bool succeeded = UpdateOperandRegClass(*MI);
1502   (void)succeeded;
1503   assert(succeeded && "Some operands reg class are incompatible!");
1504   MI->addRegisterDefined(AArch64::NZCV, TRI);
1505   return true;
1506 }
1507 
1508 bool AArch64InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
1509   if (MI.getOpcode() != TargetOpcode::LOAD_STACK_GUARD &&
1510       MI.getOpcode() != AArch64::CATCHRET)
1511     return false;
1512 
1513   MachineBasicBlock &MBB = *MI.getParent();
1514   auto &Subtarget = MBB.getParent()->getSubtarget<AArch64Subtarget>();
1515   auto TRI = Subtarget.getRegisterInfo();
1516   DebugLoc DL = MI.getDebugLoc();
1517 
1518   if (MI.getOpcode() == AArch64::CATCHRET) {
1519     // Skip to the first instruction before the epilog.
1520     const TargetInstrInfo *TII =
1521       MBB.getParent()->getSubtarget().getInstrInfo();
1522     MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB();
1523     auto MBBI = MachineBasicBlock::iterator(MI);
1524     MachineBasicBlock::iterator FirstEpilogSEH = std::prev(MBBI);
1525     while (FirstEpilogSEH->getFlag(MachineInstr::FrameDestroy) &&
1526            FirstEpilogSEH != MBB.begin())
1527       FirstEpilogSEH = std::prev(FirstEpilogSEH);
1528     if (FirstEpilogSEH != MBB.begin())
1529       FirstEpilogSEH = std::next(FirstEpilogSEH);
1530     BuildMI(MBB, FirstEpilogSEH, DL, TII->get(AArch64::ADRP))
1531         .addReg(AArch64::X0, RegState::Define)
1532         .addMBB(TargetMBB);
1533     BuildMI(MBB, FirstEpilogSEH, DL, TII->get(AArch64::ADDXri))
1534         .addReg(AArch64::X0, RegState::Define)
1535         .addReg(AArch64::X0)
1536         .addMBB(TargetMBB)
1537         .addImm(0);
1538     return true;
1539   }
1540 
1541   Register Reg = MI.getOperand(0).getReg();
1542   const GlobalValue *GV =
1543       cast<GlobalValue>((*MI.memoperands_begin())->getValue());
1544   const TargetMachine &TM = MBB.getParent()->getTarget();
1545   unsigned OpFlags = Subtarget.ClassifyGlobalReference(GV, TM);
1546   const unsigned char MO_NC = AArch64II::MO_NC;
1547 
1548   if ((OpFlags & AArch64II::MO_GOT) != 0) {
1549     BuildMI(MBB, MI, DL, get(AArch64::LOADgot), Reg)
1550         .addGlobalAddress(GV, 0, OpFlags);
1551     if (Subtarget.isTargetILP32()) {
1552       unsigned Reg32 = TRI->getSubReg(Reg, AArch64::sub_32);
1553       BuildMI(MBB, MI, DL, get(AArch64::LDRWui))
1554           .addDef(Reg32, RegState::Dead)
1555           .addUse(Reg, RegState::Kill)
1556           .addImm(0)
1557           .addMemOperand(*MI.memoperands_begin())
1558           .addDef(Reg, RegState::Implicit);
1559     } else {
1560       BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
1561           .addReg(Reg, RegState::Kill)
1562           .addImm(0)
1563           .addMemOperand(*MI.memoperands_begin());
1564     }
1565   } else if (TM.getCodeModel() == CodeModel::Large) {
1566     assert(!Subtarget.isTargetILP32() && "how can large exist in ILP32?");
1567     BuildMI(MBB, MI, DL, get(AArch64::MOVZXi), Reg)
1568         .addGlobalAddress(GV, 0, AArch64II::MO_G0 | MO_NC)
1569         .addImm(0);
1570     BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg)
1571         .addReg(Reg, RegState::Kill)
1572         .addGlobalAddress(GV, 0, AArch64II::MO_G1 | MO_NC)
1573         .addImm(16);
1574     BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg)
1575         .addReg(Reg, RegState::Kill)
1576         .addGlobalAddress(GV, 0, AArch64II::MO_G2 | MO_NC)
1577         .addImm(32);
1578     BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg)
1579         .addReg(Reg, RegState::Kill)
1580         .addGlobalAddress(GV, 0, AArch64II::MO_G3)
1581         .addImm(48);
1582     BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
1583         .addReg(Reg, RegState::Kill)
1584         .addImm(0)
1585         .addMemOperand(*MI.memoperands_begin());
1586   } else if (TM.getCodeModel() == CodeModel::Tiny) {
1587     BuildMI(MBB, MI, DL, get(AArch64::ADR), Reg)
1588         .addGlobalAddress(GV, 0, OpFlags);
1589   } else {
1590     BuildMI(MBB, MI, DL, get(AArch64::ADRP), Reg)
1591         .addGlobalAddress(GV, 0, OpFlags | AArch64II::MO_PAGE);
1592     unsigned char LoFlags = OpFlags | AArch64II::MO_PAGEOFF | MO_NC;
1593     if (Subtarget.isTargetILP32()) {
1594       unsigned Reg32 = TRI->getSubReg(Reg, AArch64::sub_32);
1595       BuildMI(MBB, MI, DL, get(AArch64::LDRWui))
1596           .addDef(Reg32, RegState::Dead)
1597           .addUse(Reg, RegState::Kill)
1598           .addGlobalAddress(GV, 0, LoFlags)
1599           .addMemOperand(*MI.memoperands_begin())
1600           .addDef(Reg, RegState::Implicit);
1601     } else {
1602       BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
1603           .addReg(Reg, RegState::Kill)
1604           .addGlobalAddress(GV, 0, LoFlags)
1605           .addMemOperand(*MI.memoperands_begin());
1606     }
1607   }
1608 
1609   MBB.erase(MI);
1610 
1611   return true;
1612 }
1613 
1614 // Return true if this instruction simply sets its single destination register
1615 // to zero. This is equivalent to a register rename of the zero-register.
1616 bool AArch64InstrInfo::isGPRZero(const MachineInstr &MI) {
1617   switch (MI.getOpcode()) {
1618   default:
1619     break;
1620   case AArch64::MOVZWi:
1621   case AArch64::MOVZXi: // movz Rd, #0 (LSL #0)
1622     if (MI.getOperand(1).isImm() && MI.getOperand(1).getImm() == 0) {
1623       assert(MI.getDesc().getNumOperands() == 3 &&
1624              MI.getOperand(2).getImm() == 0 && "invalid MOVZi operands");
1625       return true;
1626     }
1627     break;
1628   case AArch64::ANDWri: // and Rd, Rzr, #imm
1629     return MI.getOperand(1).getReg() == AArch64::WZR;
1630   case AArch64::ANDXri:
1631     return MI.getOperand(1).getReg() == AArch64::XZR;
1632   case TargetOpcode::COPY:
1633     return MI.getOperand(1).getReg() == AArch64::WZR;
1634   }
1635   return false;
1636 }
1637 
1638 // Return true if this instruction simply renames a general register without
1639 // modifying bits.
1640 bool AArch64InstrInfo::isGPRCopy(const MachineInstr &MI) {
1641   switch (MI.getOpcode()) {
1642   default:
1643     break;
1644   case TargetOpcode::COPY: {
1645     // GPR32 copies will be lowered to ORRXrs
1646     Register DstReg = MI.getOperand(0).getReg();
1647     return (AArch64::GPR32RegClass.contains(DstReg) ||
1648             AArch64::GPR64RegClass.contains(DstReg));
1649   }
1650   case AArch64::ORRXrs: // orr Xd, Xzr, Xm (LSL #0)
1651     if (MI.getOperand(1).getReg() == AArch64::XZR) {
1652       assert(MI.getDesc().getNumOperands() == 4 &&
1653              MI.getOperand(3).getImm() == 0 && "invalid ORRrs operands");
1654       return true;
1655     }
1656     break;
1657   case AArch64::ADDXri: // add Xd, Xn, #0 (LSL #0)
1658     if (MI.getOperand(2).getImm() == 0) {
1659       assert(MI.getDesc().getNumOperands() == 4 &&
1660              MI.getOperand(3).getImm() == 0 && "invalid ADDXri operands");
1661       return true;
1662     }
1663     break;
1664   }
1665   return false;
1666 }
1667 
1668 // Return true if this instruction simply renames a general register without
1669 // modifying bits.
1670 bool AArch64InstrInfo::isFPRCopy(const MachineInstr &MI) {
1671   switch (MI.getOpcode()) {
1672   default:
1673     break;
1674   case TargetOpcode::COPY: {
1675     // FPR64 copies will be lowered to ORR.16b
1676     Register DstReg = MI.getOperand(0).getReg();
1677     return (AArch64::FPR64RegClass.contains(DstReg) ||
1678             AArch64::FPR128RegClass.contains(DstReg));
1679   }
1680   case AArch64::ORRv16i8:
1681     if (MI.getOperand(1).getReg() == MI.getOperand(2).getReg()) {
1682       assert(MI.getDesc().getNumOperands() == 3 && MI.getOperand(0).isReg() &&
1683              "invalid ORRv16i8 operands");
1684       return true;
1685     }
1686     break;
1687   }
1688   return false;
1689 }
1690 
1691 unsigned AArch64InstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
1692                                                int &FrameIndex) const {
1693   switch (MI.getOpcode()) {
1694   default:
1695     break;
1696   case AArch64::LDRWui:
1697   case AArch64::LDRXui:
1698   case AArch64::LDRBui:
1699   case AArch64::LDRHui:
1700   case AArch64::LDRSui:
1701   case AArch64::LDRDui:
1702   case AArch64::LDRQui:
1703     if (MI.getOperand(0).getSubReg() == 0 && MI.getOperand(1).isFI() &&
1704         MI.getOperand(2).isImm() && MI.getOperand(2).getImm() == 0) {
1705       FrameIndex = MI.getOperand(1).getIndex();
1706       return MI.getOperand(0).getReg();
1707     }
1708     break;
1709   }
1710 
1711   return 0;
1712 }
1713 
1714 unsigned AArch64InstrInfo::isStoreToStackSlot(const MachineInstr &MI,
1715                                               int &FrameIndex) const {
1716   switch (MI.getOpcode()) {
1717   default:
1718     break;
1719   case AArch64::STRWui:
1720   case AArch64::STRXui:
1721   case AArch64::STRBui:
1722   case AArch64::STRHui:
1723   case AArch64::STRSui:
1724   case AArch64::STRDui:
1725   case AArch64::STRQui:
1726   case AArch64::LDR_PXI:
1727   case AArch64::STR_PXI:
1728     if (MI.getOperand(0).getSubReg() == 0 && MI.getOperand(1).isFI() &&
1729         MI.getOperand(2).isImm() && MI.getOperand(2).getImm() == 0) {
1730       FrameIndex = MI.getOperand(1).getIndex();
1731       return MI.getOperand(0).getReg();
1732     }
1733     break;
1734   }
1735   return 0;
1736 }
1737 
1738 /// Check all MachineMemOperands for a hint to suppress pairing.
1739 bool AArch64InstrInfo::isLdStPairSuppressed(const MachineInstr &MI) {
1740   return llvm::any_of(MI.memoperands(), [](MachineMemOperand *MMO) {
1741     return MMO->getFlags() & MOSuppressPair;
1742   });
1743 }
1744 
1745 /// Set a flag on the first MachineMemOperand to suppress pairing.
1746 void AArch64InstrInfo::suppressLdStPair(MachineInstr &MI) {
1747   if (MI.memoperands_empty())
1748     return;
1749   (*MI.memoperands_begin())->setFlags(MOSuppressPair);
1750 }
1751 
1752 /// Check all MachineMemOperands for a hint that the load/store is strided.
1753 bool AArch64InstrInfo::isStridedAccess(const MachineInstr &MI) {
1754   return llvm::any_of(MI.memoperands(), [](MachineMemOperand *MMO) {
1755     return MMO->getFlags() & MOStridedAccess;
1756   });
1757 }
1758 
1759 bool AArch64InstrInfo::isUnscaledLdSt(unsigned Opc) {
1760   switch (Opc) {
1761   default:
1762     return false;
1763   case AArch64::STURSi:
1764   case AArch64::STURDi:
1765   case AArch64::STURQi:
1766   case AArch64::STURBBi:
1767   case AArch64::STURHHi:
1768   case AArch64::STURWi:
1769   case AArch64::STURXi:
1770   case AArch64::LDURSi:
1771   case AArch64::LDURDi:
1772   case AArch64::LDURQi:
1773   case AArch64::LDURWi:
1774   case AArch64::LDURXi:
1775   case AArch64::LDURSWi:
1776   case AArch64::LDURHHi:
1777   case AArch64::LDURBBi:
1778   case AArch64::LDURSBWi:
1779   case AArch64::LDURSHWi:
1780     return true;
1781   }
1782 }
1783 
1784 Optional<unsigned> AArch64InstrInfo::getUnscaledLdSt(unsigned Opc) {
1785   switch (Opc) {
1786   default: return {};
1787   case AArch64::PRFMui: return AArch64::PRFUMi;
1788   case AArch64::LDRXui: return AArch64::LDURXi;
1789   case AArch64::LDRWui: return AArch64::LDURWi;
1790   case AArch64::LDRBui: return AArch64::LDURBi;
1791   case AArch64::LDRHui: return AArch64::LDURHi;
1792   case AArch64::LDRSui: return AArch64::LDURSi;
1793   case AArch64::LDRDui: return AArch64::LDURDi;
1794   case AArch64::LDRQui: return AArch64::LDURQi;
1795   case AArch64::LDRBBui: return AArch64::LDURBBi;
1796   case AArch64::LDRHHui: return AArch64::LDURHHi;
1797   case AArch64::LDRSBXui: return AArch64::LDURSBXi;
1798   case AArch64::LDRSBWui: return AArch64::LDURSBWi;
1799   case AArch64::LDRSHXui: return AArch64::LDURSHXi;
1800   case AArch64::LDRSHWui: return AArch64::LDURSHWi;
1801   case AArch64::LDRSWui: return AArch64::LDURSWi;
1802   case AArch64::STRXui: return AArch64::STURXi;
1803   case AArch64::STRWui: return AArch64::STURWi;
1804   case AArch64::STRBui: return AArch64::STURBi;
1805   case AArch64::STRHui: return AArch64::STURHi;
1806   case AArch64::STRSui: return AArch64::STURSi;
1807   case AArch64::STRDui: return AArch64::STURDi;
1808   case AArch64::STRQui: return AArch64::STURQi;
1809   case AArch64::STRBBui: return AArch64::STURBBi;
1810   case AArch64::STRHHui: return AArch64::STURHHi;
1811   }
1812 }
1813 
1814 unsigned AArch64InstrInfo::getLoadStoreImmIdx(unsigned Opc) {
1815   switch (Opc) {
1816   default:
1817     return 2;
1818   case AArch64::LDPXi:
1819   case AArch64::LDPDi:
1820   case AArch64::STPXi:
1821   case AArch64::STPDi:
1822   case AArch64::LDNPXi:
1823   case AArch64::LDNPDi:
1824   case AArch64::STNPXi:
1825   case AArch64::STNPDi:
1826   case AArch64::LDPQi:
1827   case AArch64::STPQi:
1828   case AArch64::LDNPQi:
1829   case AArch64::STNPQi:
1830   case AArch64::LDPWi:
1831   case AArch64::LDPSi:
1832   case AArch64::STPWi:
1833   case AArch64::STPSi:
1834   case AArch64::LDNPWi:
1835   case AArch64::LDNPSi:
1836   case AArch64::STNPWi:
1837   case AArch64::STNPSi:
1838   case AArch64::LDG:
1839   case AArch64::STGPi:
1840   case AArch64::LD1B_IMM:
1841   case AArch64::LD1H_IMM:
1842   case AArch64::LD1W_IMM:
1843   case AArch64::LD1D_IMM:
1844   case AArch64::ST1B_IMM:
1845   case AArch64::ST1H_IMM:
1846   case AArch64::ST1W_IMM:
1847   case AArch64::ST1D_IMM:
1848   case AArch64::LD1B_H_IMM:
1849   case AArch64::LD1SB_H_IMM:
1850   case AArch64::LD1H_S_IMM:
1851   case AArch64::LD1SH_S_IMM:
1852   case AArch64::LD1W_D_IMM:
1853   case AArch64::LD1SW_D_IMM:
1854   case AArch64::ST1B_H_IMM:
1855   case AArch64::ST1H_S_IMM:
1856   case AArch64::ST1W_D_IMM:
1857   case AArch64::LD1B_S_IMM:
1858   case AArch64::LD1SB_S_IMM:
1859   case AArch64::LD1H_D_IMM:
1860   case AArch64::LD1SH_D_IMM:
1861   case AArch64::ST1B_S_IMM:
1862   case AArch64::ST1H_D_IMM:
1863   case AArch64::LD1B_D_IMM:
1864   case AArch64::LD1SB_D_IMM:
1865   case AArch64::ST1B_D_IMM:
1866     return 3;
1867   case AArch64::ADDG:
1868   case AArch64::STGOffset:
1869   case AArch64::LDR_PXI:
1870   case AArch64::STR_PXI:
1871     return 2;
1872   }
1873 }
1874 
1875 bool AArch64InstrInfo::isPairableLdStInst(const MachineInstr &MI) {
1876   switch (MI.getOpcode()) {
1877   default:
1878     return false;
1879   // Scaled instructions.
1880   case AArch64::STRSui:
1881   case AArch64::STRDui:
1882   case AArch64::STRQui:
1883   case AArch64::STRXui:
1884   case AArch64::STRWui:
1885   case AArch64::LDRSui:
1886   case AArch64::LDRDui:
1887   case AArch64::LDRQui:
1888   case AArch64::LDRXui:
1889   case AArch64::LDRWui:
1890   case AArch64::LDRSWui:
1891   // Unscaled instructions.
1892   case AArch64::STURSi:
1893   case AArch64::STURDi:
1894   case AArch64::STURQi:
1895   case AArch64::STURWi:
1896   case AArch64::STURXi:
1897   case AArch64::LDURSi:
1898   case AArch64::LDURDi:
1899   case AArch64::LDURQi:
1900   case AArch64::LDURWi:
1901   case AArch64::LDURXi:
1902   case AArch64::LDURSWi:
1903     return true;
1904   }
1905 }
1906 
1907 unsigned AArch64InstrInfo::convertToFlagSettingOpc(unsigned Opc,
1908                                                    bool &Is64Bit) {
1909   switch (Opc) {
1910   default:
1911     llvm_unreachable("Opcode has no flag setting equivalent!");
1912   // 32-bit cases:
1913   case AArch64::ADDWri:
1914     Is64Bit = false;
1915     return AArch64::ADDSWri;
1916   case AArch64::ADDWrr:
1917     Is64Bit = false;
1918     return AArch64::ADDSWrr;
1919   case AArch64::ADDWrs:
1920     Is64Bit = false;
1921     return AArch64::ADDSWrs;
1922   case AArch64::ADDWrx:
1923     Is64Bit = false;
1924     return AArch64::ADDSWrx;
1925   case AArch64::ANDWri:
1926     Is64Bit = false;
1927     return AArch64::ANDSWri;
1928   case AArch64::ANDWrr:
1929     Is64Bit = false;
1930     return AArch64::ANDSWrr;
1931   case AArch64::ANDWrs:
1932     Is64Bit = false;
1933     return AArch64::ANDSWrs;
1934   case AArch64::BICWrr:
1935     Is64Bit = false;
1936     return AArch64::BICSWrr;
1937   case AArch64::BICWrs:
1938     Is64Bit = false;
1939     return AArch64::BICSWrs;
1940   case AArch64::SUBWri:
1941     Is64Bit = false;
1942     return AArch64::SUBSWri;
1943   case AArch64::SUBWrr:
1944     Is64Bit = false;
1945     return AArch64::SUBSWrr;
1946   case AArch64::SUBWrs:
1947     Is64Bit = false;
1948     return AArch64::SUBSWrs;
1949   case AArch64::SUBWrx:
1950     Is64Bit = false;
1951     return AArch64::SUBSWrx;
1952   // 64-bit cases:
1953   case AArch64::ADDXri:
1954     Is64Bit = true;
1955     return AArch64::ADDSXri;
1956   case AArch64::ADDXrr:
1957     Is64Bit = true;
1958     return AArch64::ADDSXrr;
1959   case AArch64::ADDXrs:
1960     Is64Bit = true;
1961     return AArch64::ADDSXrs;
1962   case AArch64::ADDXrx:
1963     Is64Bit = true;
1964     return AArch64::ADDSXrx;
1965   case AArch64::ANDXri:
1966     Is64Bit = true;
1967     return AArch64::ANDSXri;
1968   case AArch64::ANDXrr:
1969     Is64Bit = true;
1970     return AArch64::ANDSXrr;
1971   case AArch64::ANDXrs:
1972     Is64Bit = true;
1973     return AArch64::ANDSXrs;
1974   case AArch64::BICXrr:
1975     Is64Bit = true;
1976     return AArch64::BICSXrr;
1977   case AArch64::BICXrs:
1978     Is64Bit = true;
1979     return AArch64::BICSXrs;
1980   case AArch64::SUBXri:
1981     Is64Bit = true;
1982     return AArch64::SUBSXri;
1983   case AArch64::SUBXrr:
1984     Is64Bit = true;
1985     return AArch64::SUBSXrr;
1986   case AArch64::SUBXrs:
1987     Is64Bit = true;
1988     return AArch64::SUBSXrs;
1989   case AArch64::SUBXrx:
1990     Is64Bit = true;
1991     return AArch64::SUBSXrx;
1992   }
1993 }
1994 
1995 // Is this a candidate for ld/st merging or pairing?  For example, we don't
1996 // touch volatiles or load/stores that have a hint to avoid pair formation.
1997 bool AArch64InstrInfo::isCandidateToMergeOrPair(const MachineInstr &MI) const {
1998   // If this is a volatile load/store, don't mess with it.
1999   if (MI.hasOrderedMemoryRef())
2000     return false;
2001 
2002   // Make sure this is a reg/fi+imm (as opposed to an address reloc).
2003   assert((MI.getOperand(1).isReg() || MI.getOperand(1).isFI()) &&
2004          "Expected a reg or frame index operand.");
2005   if (!MI.getOperand(2).isImm())
2006     return false;
2007 
2008   // Can't merge/pair if the instruction modifies the base register.
2009   // e.g., ldr x0, [x0]
2010   // This case will never occur with an FI base.
2011   if (MI.getOperand(1).isReg()) {
2012     Register BaseReg = MI.getOperand(1).getReg();
2013     const TargetRegisterInfo *TRI = &getRegisterInfo();
2014     if (MI.modifiesRegister(BaseReg, TRI))
2015       return false;
2016   }
2017 
2018   // Check if this load/store has a hint to avoid pair formation.
2019   // MachineMemOperands hints are set by the AArch64StorePairSuppress pass.
2020   if (isLdStPairSuppressed(MI))
2021     return false;
2022 
2023   // Do not pair any callee-save store/reload instructions in the
2024   // prologue/epilogue if the CFI information encoded the operations as separate
2025   // instructions, as that will cause the size of the actual prologue not to
2026   // match the prologue size recorded in the Windows CFI.
2027   const MCAsmInfo *MAI = MI.getMF()->getTarget().getMCAsmInfo();
2028   bool NeedsWinCFI = MAI->usesWindowsCFI() &&
2029                      MI.getMF()->getFunction().needsUnwindTableEntry();
2030   if (NeedsWinCFI && (MI.getFlag(MachineInstr::FrameSetup) ||
2031                       MI.getFlag(MachineInstr::FrameDestroy)))
2032     return false;
2033 
2034   // On some CPUs quad load/store pairs are slower than two single load/stores.
2035   if (Subtarget.isPaired128Slow()) {
2036     switch (MI.getOpcode()) {
2037     default:
2038       break;
2039     case AArch64::LDURQi:
2040     case AArch64::STURQi:
2041     case AArch64::LDRQui:
2042     case AArch64::STRQui:
2043       return false;
2044     }
2045   }
2046 
2047   return true;
2048 }
2049 
2050 bool AArch64InstrInfo::getMemOperandsWithOffsetWidth(
2051     const MachineInstr &LdSt, SmallVectorImpl<const MachineOperand *> &BaseOps,
2052     int64_t &Offset, bool &OffsetIsScalable, unsigned &Width,
2053     const TargetRegisterInfo *TRI) const {
2054   if (!LdSt.mayLoadOrStore())
2055     return false;
2056 
2057   const MachineOperand *BaseOp;
2058   if (!getMemOperandWithOffsetWidth(LdSt, BaseOp, Offset, OffsetIsScalable,
2059                                     Width, TRI))
2060     return false;
2061   BaseOps.push_back(BaseOp);
2062   return true;
2063 }
2064 
2065 bool AArch64InstrInfo::getMemOperandWithOffsetWidth(
2066     const MachineInstr &LdSt, const MachineOperand *&BaseOp, int64_t &Offset,
2067     bool &OffsetIsScalable, unsigned &Width,
2068     const TargetRegisterInfo *TRI) const {
2069   assert(LdSt.mayLoadOrStore() && "Expected a memory operation.");
2070   // Handle only loads/stores with base register followed by immediate offset.
2071   if (LdSt.getNumExplicitOperands() == 3) {
2072     // Non-paired instruction (e.g., ldr x1, [x0, #8]).
2073     if ((!LdSt.getOperand(1).isReg() && !LdSt.getOperand(1).isFI()) ||
2074         !LdSt.getOperand(2).isImm())
2075       return false;
2076   } else if (LdSt.getNumExplicitOperands() == 4) {
2077     // Paired instruction (e.g., ldp x1, x2, [x0, #8]).
2078     if (!LdSt.getOperand(1).isReg() ||
2079         (!LdSt.getOperand(2).isReg() && !LdSt.getOperand(2).isFI()) ||
2080         !LdSt.getOperand(3).isImm())
2081       return false;
2082   } else
2083     return false;
2084 
2085   // Get the scaling factor for the instruction and set the width for the
2086   // instruction.
2087   TypeSize Scale(0U, false);
2088   int64_t Dummy1, Dummy2;
2089 
2090   // If this returns false, then it's an instruction we don't want to handle.
2091   if (!getMemOpInfo(LdSt.getOpcode(), Scale, Width, Dummy1, Dummy2))
2092     return false;
2093 
2094   // Compute the offset. Offset is calculated as the immediate operand
2095   // multiplied by the scaling factor. Unscaled instructions have scaling factor
2096   // set to 1.
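  // For example, an LDRXui with immediate operand 2 addresses [base, #16]
  // because its Scale is 8, while an LDURXi with immediate operand 16
  // addresses the same location with a Scale of 1.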
2097   if (LdSt.getNumExplicitOperands() == 3) {
2098     BaseOp = &LdSt.getOperand(1);
2099     Offset = LdSt.getOperand(2).getImm() * Scale.getKnownMinSize();
2100   } else {
2101     assert(LdSt.getNumExplicitOperands() == 4 && "invalid number of operands");
2102     BaseOp = &LdSt.getOperand(2);
2103     Offset = LdSt.getOperand(3).getImm() * Scale.getKnownMinSize();
2104   }
2105   OffsetIsScalable = Scale.isScalable();
2106 
2107   if (!BaseOp->isReg() && !BaseOp->isFI())
2108     return false;
2109 
2110   return true;
2111 }
2112 
2113 MachineOperand &
2114 AArch64InstrInfo::getMemOpBaseRegImmOfsOffsetOperand(MachineInstr &LdSt) const {
2115   assert(LdSt.mayLoadOrStore() && "Expected a memory operation.");
2116   MachineOperand &OfsOp = LdSt.getOperand(LdSt.getNumExplicitOperands() - 1);
2117   assert(OfsOp.isImm() && "Offset operand wasn't immediate.");
2118   return OfsOp;
2119 }
2120 
2121 bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, TypeSize &Scale,
2122                                     unsigned &Width, int64_t &MinOffset,
2123                                     int64_t &MaxOffset) {
2124   const unsigned SVEMaxBytesPerVector = AArch64::SVEMaxBitsPerVector / 8;
2125   switch (Opcode) {
2126   // Not a memory operation or something we want to handle.
2127   default:
2128     Scale = TypeSize::Fixed(0);
2129     Width = 0;
2130     MinOffset = MaxOffset = 0;
2131     return false;
2132   case AArch64::STRWpost:
2133   case AArch64::LDRWpost:
2134     Width = 32;
2135     Scale = TypeSize::Fixed(4);
2136     MinOffset = -256;
2137     MaxOffset = 255;
2138     break;
2139   case AArch64::LDURQi:
2140   case AArch64::STURQi:
2141     Width = 16;
2142     Scale = TypeSize::Fixed(1);
2143     MinOffset = -256;
2144     MaxOffset = 255;
2145     break;
2146   case AArch64::PRFUMi:
2147   case AArch64::LDURXi:
2148   case AArch64::LDURDi:
2149   case AArch64::STURXi:
2150   case AArch64::STURDi:
2151     Width = 8;
2152     Scale = TypeSize::Fixed(1);
2153     MinOffset = -256;
2154     MaxOffset = 255;
2155     break;
2156   case AArch64::LDURWi:
2157   case AArch64::LDURSi:
2158   case AArch64::LDURSWi:
2159   case AArch64::STURWi:
2160   case AArch64::STURSi:
2161     Width = 4;
2162     Scale = TypeSize::Fixed(1);
2163     MinOffset = -256;
2164     MaxOffset = 255;
2165     break;
2166   case AArch64::LDURHi:
2167   case AArch64::LDURHHi:
2168   case AArch64::LDURSHXi:
2169   case AArch64::LDURSHWi:
2170   case AArch64::STURHi:
2171   case AArch64::STURHHi:
2172     Width = 2;
2173     Scale = TypeSize::Fixed(1);
2174     MinOffset = -256;
2175     MaxOffset = 255;
2176     break;
2177   case AArch64::LDURBi:
2178   case AArch64::LDURBBi:
2179   case AArch64::LDURSBXi:
2180   case AArch64::LDURSBWi:
2181   case AArch64::STURBi:
2182   case AArch64::STURBBi:
2183     Width = 1;
2184     Scale = TypeSize::Fixed(1);
2185     MinOffset = -256;
2186     MaxOffset = 255;
2187     break;
2188   case AArch64::LDPQi:
2189   case AArch64::LDNPQi:
2190   case AArch64::STPQi:
2191   case AArch64::STNPQi:
2192     Scale = TypeSize::Fixed(16);
2193     Width = 32;
2194     MinOffset = -64;
2195     MaxOffset = 63;
2196     break;
2197   case AArch64::LDRQui:
2198   case AArch64::STRQui:
2199     Scale = TypeSize::Fixed(16);
2200     Width = 16;
2201     MinOffset = 0;
2202     MaxOffset = 4095;
2203     break;
2204   case AArch64::LDPXi:
2205   case AArch64::LDPDi:
2206   case AArch64::LDNPXi:
2207   case AArch64::LDNPDi:
2208   case AArch64::STPXi:
2209   case AArch64::STPDi:
2210   case AArch64::STNPXi:
2211   case AArch64::STNPDi:
2212     Scale = TypeSize::Fixed(8);
2213     Width = 16;
2214     MinOffset = -64;
2215     MaxOffset = 63;
2216     break;
2217   case AArch64::PRFMui:
2218   case AArch64::LDRXui:
2219   case AArch64::LDRDui:
2220   case AArch64::STRXui:
2221   case AArch64::STRDui:
2222     Scale = TypeSize::Fixed(8);
2223     Width = 8;
2224     MinOffset = 0;
2225     MaxOffset = 4095;
2226     break;
2227   case AArch64::LDPWi:
2228   case AArch64::LDPSi:
2229   case AArch64::LDNPWi:
2230   case AArch64::LDNPSi:
2231   case AArch64::STPWi:
2232   case AArch64::STPSi:
2233   case AArch64::STNPWi:
2234   case AArch64::STNPSi:
2235     Scale = TypeSize::Fixed(4);
2236     Width = 8;
2237     MinOffset = -64;
2238     MaxOffset = 63;
2239     break;
2240   case AArch64::LDRWui:
2241   case AArch64::LDRSui:
2242   case AArch64::LDRSWui:
2243   case AArch64::STRWui:
2244   case AArch64::STRSui:
2245     Scale = TypeSize::Fixed(4);
2246     Width = 4;
2247     MinOffset = 0;
2248     MaxOffset = 4095;
2249     break;
2250   case AArch64::LDRHui:
2251   case AArch64::LDRHHui:
2252   case AArch64::LDRSHWui:
2253   case AArch64::LDRSHXui:
2254   case AArch64::STRHui:
2255   case AArch64::STRHHui:
2256     Scale = TypeSize::Fixed(2);
2257     Width = 2;
2258     MinOffset = 0;
2259     MaxOffset = 4095;
2260     break;
2261   case AArch64::LDRBui:
2262   case AArch64::LDRBBui:
2263   case AArch64::LDRSBWui:
2264   case AArch64::LDRSBXui:
2265   case AArch64::STRBui:
2266   case AArch64::STRBBui:
2267     Scale = TypeSize::Fixed(1);
2268     Width = 1;
2269     MinOffset = 0;
2270     MaxOffset = 4095;
2271     break;
2272   case AArch64::ADDG:
2273     Scale = TypeSize::Fixed(16);
2274     Width = 0;
2275     MinOffset = 0;
2276     MaxOffset = 63;
2277     break;
2278   case AArch64::TAGPstack:
2279     Scale = TypeSize::Fixed(16);
2280     Width = 0;
2281     // TAGP with a negative offset turns into SUBP, which has a maximum offset
2282     // of 63 (not 64!).
2283     MinOffset = -63;
2284     MaxOffset = 63;
2285     break;
2286   case AArch64::LDG:
2287   case AArch64::STGOffset:
2288   case AArch64::STZGOffset:
2289     Scale = TypeSize::Fixed(16);
2290     Width = 16;
2291     MinOffset = -256;
2292     MaxOffset = 255;
2293     break;
2294   case AArch64::STR_ZZZZXI:
2295   case AArch64::LDR_ZZZZXI:
2296     Scale = TypeSize::Scalable(16);
2297     Width = SVEMaxBytesPerVector * 4;
2298     MinOffset = -256;
2299     MaxOffset = 252;
2300     break;
2301   case AArch64::STR_ZZZXI:
2302   case AArch64::LDR_ZZZXI:
2303     Scale = TypeSize::Scalable(16);
2304     Width = SVEMaxBytesPerVector * 3;
2305     MinOffset = -256;
2306     MaxOffset = 253;
2307     break;
2308   case AArch64::STR_ZZXI:
2309   case AArch64::LDR_ZZXI:
2310     Scale = TypeSize::Scalable(16);
2311     Width = SVEMaxBytesPerVector * 2;
2312     MinOffset = -256;
2313     MaxOffset = 254;
2314     break;
2315   case AArch64::LDR_PXI:
2316   case AArch64::STR_PXI:
2317     Scale = TypeSize::Scalable(2);
2318     Width = SVEMaxBytesPerVector / 8;
2319     MinOffset = -256;
2320     MaxOffset = 255;
2321     break;
2322   case AArch64::LDR_ZXI:
2323   case AArch64::STR_ZXI:
2324     Scale = TypeSize::Scalable(16);
2325     Width = SVEMaxBytesPerVector;
2326     MinOffset = -256;
2327     MaxOffset = 255;
2328     break;
2329   case AArch64::LD1B_IMM:
2330   case AArch64::LD1H_IMM:
2331   case AArch64::LD1W_IMM:
2332   case AArch64::LD1D_IMM:
2333   case AArch64::ST1B_IMM:
2334   case AArch64::ST1H_IMM:
2335   case AArch64::ST1W_IMM:
2336   case AArch64::ST1D_IMM:
2337     // A full vector's worth of data
2338     // Width = mbytes * elements
2339     Scale = TypeSize::Scalable(16);
2340     Width = SVEMaxBytesPerVector;
2341     MinOffset = -8;
2342     MaxOffset = 7;
2343     break;
2344   case AArch64::LD1B_H_IMM:
2345   case AArch64::LD1SB_H_IMM:
2346   case AArch64::LD1H_S_IMM:
2347   case AArch64::LD1SH_S_IMM:
2348   case AArch64::LD1W_D_IMM:
2349   case AArch64::LD1SW_D_IMM:
2350   case AArch64::ST1B_H_IMM:
2351   case AArch64::ST1H_S_IMM:
2352   case AArch64::ST1W_D_IMM:
2353     // A half vector's worth of data
2354     // Width = mbytes * elements
2355     Scale = TypeSize::Scalable(8);
2356     Width = SVEMaxBytesPerVector / 2;
2357     MinOffset = -8;
2358     MaxOffset = 7;
2359     break;
2360   case AArch64::LD1B_S_IMM:
2361   case AArch64::LD1SB_S_IMM:
2362   case AArch64::LD1H_D_IMM:
2363   case AArch64::LD1SH_D_IMM:
2364   case AArch64::ST1B_S_IMM:
2365   case AArch64::ST1H_D_IMM:
2366     // A quarter vector's worth of data
2367     // Width = mbytes * elements
2368     Scale = TypeSize::Scalable(4);
2369     Width = SVEMaxBytesPerVector / 4;
2370     MinOffset = -8;
2371     MaxOffset = 7;
2372     break;
2373   case AArch64::LD1B_D_IMM:
2374   case AArch64::LD1SB_D_IMM:
2375   case AArch64::ST1B_D_IMM:
2376     // An eighth vector's worth of data
2377     // Width = mbytes * elements
2378     Scale = TypeSize::Scalable(2);
2379     Width = SVEMaxBytesPerVector / 8;
2380     MinOffset = -8;
2381     MaxOffset = 7;
2382     break;
2383   case AArch64::ST2GOffset:
2384   case AArch64::STZ2GOffset:
2385     Scale = TypeSize::Fixed(16);
2386     Width = 32;
2387     MinOffset = -256;
2388     MaxOffset = 255;
2389     break;
2390   case AArch64::STGPi:
2391     Scale = TypeSize::Fixed(16);
2392     Width = 16;
2393     MinOffset = -64;
2394     MaxOffset = 63;
2395     break;
2396   }
2397 
2398   return true;
2399 }
2400 
2401 // Scaling factor for unscaled load or store.
2402 int AArch64InstrInfo::getMemScale(unsigned Opc) {
2403   switch (Opc) {
2404   default:
2405     llvm_unreachable("Opcode has unknown scale!");
2406   case AArch64::LDRBBui:
2407   case AArch64::LDURBBi:
2408   case AArch64::LDRSBWui:
2409   case AArch64::LDURSBWi:
2410   case AArch64::STRBBui:
2411   case AArch64::STURBBi:
2412     return 1;
2413   case AArch64::LDRHHui:
2414   case AArch64::LDURHHi:
2415   case AArch64::LDRSHWui:
2416   case AArch64::LDURSHWi:
2417   case AArch64::STRHHui:
2418   case AArch64::STURHHi:
2419     return 2;
2420   case AArch64::LDRSui:
2421   case AArch64::LDURSi:
2422   case AArch64::LDRSWui:
2423   case AArch64::LDURSWi:
2424   case AArch64::LDRWui:
2425   case AArch64::LDURWi:
2426   case AArch64::STRSui:
2427   case AArch64::STURSi:
2428   case AArch64::STRWui:
2429   case AArch64::STURWi:
2430   case AArch64::LDPSi:
2431   case AArch64::LDPSWi:
2432   case AArch64::LDPWi:
2433   case AArch64::STPSi:
2434   case AArch64::STPWi:
2435     return 4;
2436   case AArch64::LDRDui:
2437   case AArch64::LDURDi:
2438   case AArch64::LDRXui:
2439   case AArch64::LDURXi:
2440   case AArch64::STRDui:
2441   case AArch64::STURDi:
2442   case AArch64::STRXui:
2443   case AArch64::STURXi:
2444   case AArch64::LDPDi:
2445   case AArch64::LDPXi:
2446   case AArch64::STPDi:
2447   case AArch64::STPXi:
2448     return 8;
2449   case AArch64::LDRQui:
2450   case AArch64::LDURQi:
2451   case AArch64::STRQui:
2452   case AArch64::STURQi:
2453   case AArch64::LDPQi:
2454   case AArch64::STPQi:
2455   case AArch64::STGOffset:
2456   case AArch64::STZGOffset:
2457   case AArch64::ST2GOffset:
2458   case AArch64::STZ2GOffset:
2459   case AArch64::STGPi:
2460     return 16;
2461   }
2462 }
2463 
2464 // Scale the unscaled offsets.  Returns false if the unscaled offset can't be
2465 // scaled.
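// For example, a byte offset of 16 on an LDURXi (stride 8) scales to an
// element offset of 2, matching what LDRXui/LDPXi would encode; a byte offset
// of 12 is not a multiple of the stride and cannot be scaled.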
2466 static bool scaleOffset(unsigned Opc, int64_t &Offset) {
2467   int Scale = AArch64InstrInfo::getMemScale(Opc);
2468 
2469   // If the byte-offset isn't a multiple of the stride, we can't scale this
2470   // offset.
2471   if (Offset % Scale != 0)
2472     return false;
2473 
2474   // Convert the byte-offset used by unscaled into an "element" offset used
2475   // by the scaled pair load/store instructions.
2476   Offset /= Scale;
2477   return true;
2478 }
2479 
2480 static bool canPairLdStOpc(unsigned FirstOpc, unsigned SecondOpc) {
2481   if (FirstOpc == SecondOpc)
2482     return true;
2483   // We can also pair sign-ext and zero-ext instructions.
2484   switch (FirstOpc) {
2485   default:
2486     return false;
2487   case AArch64::LDRWui:
2488   case AArch64::LDURWi:
2489     return SecondOpc == AArch64::LDRSWui || SecondOpc == AArch64::LDURSWi;
2490   case AArch64::LDRSWui:
2491   case AArch64::LDURSWi:
2492     return SecondOpc == AArch64::LDRWui || SecondOpc == AArch64::LDURWi;
2493   }
2494   // These instructions can't be paired based on their opcodes.
2495   return false;
2496 }
2497 
2498 static bool shouldClusterFI(const MachineFrameInfo &MFI, int FI1,
2499                             int64_t Offset1, unsigned Opcode1, int FI2,
2500                             int64_t Offset2, unsigned Opcode2) {
2501   // Accesses through fixed stack object frame indices may access a different
2502   // fixed stack slot. Check that the object offsets + offsets match.
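  // For example, two 8-byte fixed objects at offsets 0 and 8, each accessed
  // with an instruction offset of 0, scale to consecutive element offsets
  // 0 and 1 and may therefore be clustered.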
2503   if (MFI.isFixedObjectIndex(FI1) && MFI.isFixedObjectIndex(FI2)) {
2504     int64_t ObjectOffset1 = MFI.getObjectOffset(FI1);
2505     int64_t ObjectOffset2 = MFI.getObjectOffset(FI2);
2506     assert(ObjectOffset1 <= ObjectOffset2 && "Object offsets are not ordered.");
2507     // Convert to scaled object offsets.
2508     int Scale1 = AArch64InstrInfo::getMemScale(Opcode1);
2509     if (ObjectOffset1 % Scale1 != 0)
2510       return false;
2511     ObjectOffset1 /= Scale1;
2512     int Scale2 = AArch64InstrInfo::getMemScale(Opcode2);
2513     if (ObjectOffset2 % Scale2 != 0)
2514       return false;
2515     ObjectOffset2 /= Scale2;
2516     ObjectOffset1 += Offset1;
2517     ObjectOffset2 += Offset2;
2518     return ObjectOffset1 + 1 == ObjectOffset2;
2519   }
2520 
2521   return FI1 == FI2;
2522 }
2523 
2524 /// Detect opportunities for ldp/stp formation.
2525 ///
2526 /// Only called for LdSt for which getMemOperandWithOffset returns true.
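///
/// For instance, 'ldr x0, [x2]' followed by 'ldr x1, [x2, #8]' should be
/// scheduled back to back so that the load/store optimizer can later rewrite
/// them as 'ldp x0, x1, [x2]'.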
2527 bool AArch64InstrInfo::shouldClusterMemOps(
2528     ArrayRef<const MachineOperand *> BaseOps1,
2529     ArrayRef<const MachineOperand *> BaseOps2, unsigned NumLoads,
2530     unsigned NumBytes) const {
2531   assert(BaseOps1.size() == 1 && BaseOps2.size() == 1);
2532   const MachineOperand &BaseOp1 = *BaseOps1.front();
2533   const MachineOperand &BaseOp2 = *BaseOps2.front();
2534   const MachineInstr &FirstLdSt = *BaseOp1.getParent();
2535   const MachineInstr &SecondLdSt = *BaseOp2.getParent();
2536   if (BaseOp1.getType() != BaseOp2.getType())
2537     return false;
2538 
2539   assert((BaseOp1.isReg() || BaseOp1.isFI()) &&
2540          "Only base registers and frame indices are supported.");
2541 
2542   // Check for both base regs and base FI.
2543   if (BaseOp1.isReg() && BaseOp1.getReg() != BaseOp2.getReg())
2544     return false;
2545 
2546   // Only cluster up to a single pair.
2547   if (NumLoads > 2)
2548     return false;
2549 
2550   if (!isPairableLdStInst(FirstLdSt) || !isPairableLdStInst(SecondLdSt))
2551     return false;
2552 
2553   // Can we pair these instructions based on their opcodes?
2554   unsigned FirstOpc = FirstLdSt.getOpcode();
2555   unsigned SecondOpc = SecondLdSt.getOpcode();
2556   if (!canPairLdStOpc(FirstOpc, SecondOpc))
2557     return false;
2558 
2559   // Can't merge volatiles or load/stores that have a hint to avoid pair
2560   // formation, for example.
2561   if (!isCandidateToMergeOrPair(FirstLdSt) ||
2562       !isCandidateToMergeOrPair(SecondLdSt))
2563     return false;
2564 
2565   // isCandidateToMergeOrPair guarantees that operand 2 is an immediate.
2566   int64_t Offset1 = FirstLdSt.getOperand(2).getImm();
2567   if (isUnscaledLdSt(FirstOpc) && !scaleOffset(FirstOpc, Offset1))
2568     return false;
2569 
2570   int64_t Offset2 = SecondLdSt.getOperand(2).getImm();
2571   if (isUnscaledLdSt(SecondOpc) && !scaleOffset(SecondOpc, Offset2))
2572     return false;
2573 
2574   // Pairwise instructions have a 7-bit signed offset field.
2575   if (Offset1 > 63 || Offset1 < -64)
2576     return false;
2577 
2578   // The caller should already have ordered First/SecondLdSt by offset.
2579   // Note: except for non-equal frame index bases
2580   if (BaseOp1.isFI()) {
2581     assert((!BaseOp1.isIdenticalTo(BaseOp2) || Offset1 <= Offset2) &&
2582            "Caller should have ordered offsets.");
2583 
2584     const MachineFrameInfo &MFI =
2585         FirstLdSt.getParent()->getParent()->getFrameInfo();
2586     return shouldClusterFI(MFI, BaseOp1.getIndex(), Offset1, FirstOpc,
2587                            BaseOp2.getIndex(), Offset2, SecondOpc);
2588   }
2589 
2590   assert(Offset1 <= Offset2 && "Caller should have ordered offsets.");
2591 
2592   return Offset1 + 1 == Offset2;
2593 }
2594 
2595 static const MachineInstrBuilder &AddSubReg(const MachineInstrBuilder &MIB,
2596                                             unsigned Reg, unsigned SubIdx,
2597                                             unsigned State,
2598                                             const TargetRegisterInfo *TRI) {
2599   if (!SubIdx)
2600     return MIB.addReg(Reg, State);
2601 
2602   if (Register::isPhysicalRegister(Reg))
2603     return MIB.addReg(TRI->getSubReg(Reg, SubIdx), State);
2604   return MIB.addReg(Reg, State, SubIdx);
2605 }
2606 
2607 static bool forwardCopyWillClobberTuple(unsigned DestReg, unsigned SrcReg,
2608                                         unsigned NumRegs) {
2609   // We really want the positive remainder mod 32 here; that happens to be
2610   // easily obtainable with a mask.
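  // For example, copying the tuple D1_D2 into D2_D3 in ascending sub-register
  // order would overwrite D2 before it is read as a source, so the caller
  // walks the sub-registers backwards in that case.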
2611   return ((DestReg - SrcReg) & 0x1f) < NumRegs;
2612 }
2613 
2614 void AArch64InstrInfo::copyPhysRegTuple(MachineBasicBlock &MBB,
2615                                         MachineBasicBlock::iterator I,
2616                                         const DebugLoc &DL, MCRegister DestReg,
2617                                         MCRegister SrcReg, bool KillSrc,
2618                                         unsigned Opcode,
2619                                         ArrayRef<unsigned> Indices) const {
2620   assert(Subtarget.hasNEON() && "Unexpected register copy without NEON");
2621   const TargetRegisterInfo *TRI = &getRegisterInfo();
2622   uint16_t DestEncoding = TRI->getEncodingValue(DestReg);
2623   uint16_t SrcEncoding = TRI->getEncodingValue(SrcReg);
2624   unsigned NumRegs = Indices.size();
2625 
2626   int SubReg = 0, End = NumRegs, Incr = 1;
2627   if (forwardCopyWillClobberTuple(DestEncoding, SrcEncoding, NumRegs)) {
2628     SubReg = NumRegs - 1;
2629     End = -1;
2630     Incr = -1;
2631   }
2632 
2633   for (; SubReg != End; SubReg += Incr) {
2634     const MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(Opcode));
2635     AddSubReg(MIB, DestReg, Indices[SubReg], RegState::Define, TRI);
2636     AddSubReg(MIB, SrcReg, Indices[SubReg], 0, TRI);
2637     AddSubReg(MIB, SrcReg, Indices[SubReg], getKillRegState(KillSrc), TRI);
2638   }
2639 }
2640 
2641 void AArch64InstrInfo::copyGPRRegTuple(MachineBasicBlock &MBB,
2642                                        MachineBasicBlock::iterator I,
2643                                        DebugLoc DL, unsigned DestReg,
2644                                        unsigned SrcReg, bool KillSrc,
2645                                        unsigned Opcode, unsigned ZeroReg,
2646                                        llvm::ArrayRef<unsigned> Indices) const {
2647   const TargetRegisterInfo *TRI = &getRegisterInfo();
2648   unsigned NumRegs = Indices.size();
2649 
2650 #ifndef NDEBUG
2651   uint16_t DestEncoding = TRI->getEncodingValue(DestReg);
2652   uint16_t SrcEncoding = TRI->getEncodingValue(SrcReg);
2653   assert(DestEncoding % NumRegs == 0 && SrcEncoding % NumRegs == 0 &&
2654          "GPR reg sequences should not be able to overlap");
2655 #endif
2656 
2657   for (unsigned SubReg = 0; SubReg != NumRegs; ++SubReg) {
2658     const MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(Opcode));
2659     AddSubReg(MIB, DestReg, Indices[SubReg], RegState::Define, TRI);
2660     MIB.addReg(ZeroReg);
2661     AddSubReg(MIB, SrcReg, Indices[SubReg], getKillRegState(KillSrc), TRI);
2662     MIB.addImm(0);
2663   }
2664 }
2665 
2666 void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
2667                                    MachineBasicBlock::iterator I,
2668                                    const DebugLoc &DL, MCRegister DestReg,
2669                                    MCRegister SrcReg, bool KillSrc) const {
2670   if (AArch64::GPR32spRegClass.contains(DestReg) &&
2671       (AArch64::GPR32spRegClass.contains(SrcReg) || SrcReg == AArch64::WZR)) {
2672     const TargetRegisterInfo *TRI = &getRegisterInfo();
2673 
2674     if (DestReg == AArch64::WSP || SrcReg == AArch64::WSP) {
2675       // If either operand is WSP, expand to ADD #0.
2676       if (Subtarget.hasZeroCycleRegMove()) {
2677         // Cyclone recognizes "ADD Xd, Xn, #0" as a zero-cycle register move.
2678         MCRegister DestRegX = TRI->getMatchingSuperReg(
2679             DestReg, AArch64::sub_32, &AArch64::GPR64spRegClass);
2680         MCRegister SrcRegX = TRI->getMatchingSuperReg(
2681             SrcReg, AArch64::sub_32, &AArch64::GPR64spRegClass);
2682         // This instruction is reading and writing X registers.  This may upset
2683         // the register scavenger and machine verifier, so we need to indicate
2684         // that we are reading an undefined value from SrcRegX, but a proper
2685         // value from SrcReg.
2686         BuildMI(MBB, I, DL, get(AArch64::ADDXri), DestRegX)
2687             .addReg(SrcRegX, RegState::Undef)
2688             .addImm(0)
2689             .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0))
2690             .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
2691       } else {
2692         BuildMI(MBB, I, DL, get(AArch64::ADDWri), DestReg)
2693             .addReg(SrcReg, getKillRegState(KillSrc))
2694             .addImm(0)
2695             .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
2696       }
2697     } else if (SrcReg == AArch64::WZR && Subtarget.hasZeroCycleZeroingGP()) {
2698       BuildMI(MBB, I, DL, get(AArch64::MOVZWi), DestReg)
2699           .addImm(0)
2700           .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
2701     } else {
2702       if (Subtarget.hasZeroCycleRegMove()) {
2703         // Cyclone recognizes "ORR Xd, XZR, Xm" as a zero-cycle register move.
2704         MCRegister DestRegX = TRI->getMatchingSuperReg(
2705             DestReg, AArch64::sub_32, &AArch64::GPR64spRegClass);
2706         MCRegister SrcRegX = TRI->getMatchingSuperReg(
2707             SrcReg, AArch64::sub_32, &AArch64::GPR64spRegClass);
2708         // This instruction is reading and writing X registers.  This may upset
2709         // the register scavenger and machine verifier, so we need to indicate
2710         // that we are reading an undefined value from SrcRegX, but a proper
2711         // value from SrcReg.
2712         BuildMI(MBB, I, DL, get(AArch64::ORRXrr), DestRegX)
2713             .addReg(AArch64::XZR)
2714             .addReg(SrcRegX, RegState::Undef)
2715             .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
2716       } else {
2717         // Otherwise, expand to ORR WZR.
2718         BuildMI(MBB, I, DL, get(AArch64::ORRWrr), DestReg)
2719             .addReg(AArch64::WZR)
2720             .addReg(SrcReg, getKillRegState(KillSrc));
2721       }
2722     }
2723     return;
2724   }
2725 
2726   // Copy a Predicate register by ORRing with itself.
2727   if (AArch64::PPRRegClass.contains(DestReg) &&
2728       AArch64::PPRRegClass.contains(SrcReg)) {
2729     assert(Subtarget.hasSVE() && "Unexpected SVE register.");
2730     BuildMI(MBB, I, DL, get(AArch64::ORR_PPzPP), DestReg)
2731       .addReg(SrcReg) // Pg
2732       .addReg(SrcReg)
2733       .addReg(SrcReg, getKillRegState(KillSrc));
2734     return;
2735   }
2736 
2737   // Copy a Z register by ORRing with itself.
2738   if (AArch64::ZPRRegClass.contains(DestReg) &&
2739       AArch64::ZPRRegClass.contains(SrcReg)) {
2740     assert(Subtarget.hasSVE() && "Unexpected SVE register.");
2741     BuildMI(MBB, I, DL, get(AArch64::ORR_ZZZ), DestReg)
2742       .addReg(SrcReg)
2743       .addReg(SrcReg, getKillRegState(KillSrc));
2744     return;
2745   }
2746 
2747   // Copy a Z register pair by copying the individual sub-registers.
2748   if (AArch64::ZPR2RegClass.contains(DestReg) &&
2749       AArch64::ZPR2RegClass.contains(SrcReg)) {
2750     static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1};
2751     copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ,
2752                      Indices);
2753     return;
2754   }
2755 
2756   // Copy a Z register triple by copying the individual sub-registers.
2757   if (AArch64::ZPR3RegClass.contains(DestReg) &&
2758       AArch64::ZPR3RegClass.contains(SrcReg)) {
2759     static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1,
2760                                        AArch64::zsub2};
2761     copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ,
2762                      Indices);
2763     return;
2764   }
2765 
2766   // Copy a Z register quad by copying the individual sub-registers.
2767   if (AArch64::ZPR4RegClass.contains(DestReg) &&
2768       AArch64::ZPR4RegClass.contains(SrcReg)) {
2769     static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1,
2770                                        AArch64::zsub2, AArch64::zsub3};
2771     copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ,
2772                      Indices);
2773     return;
2774   }
2775 
2776   if (AArch64::GPR64spRegClass.contains(DestReg) &&
2777       (AArch64::GPR64spRegClass.contains(SrcReg) || SrcReg == AArch64::XZR)) {
2778     if (DestReg == AArch64::SP || SrcReg == AArch64::SP) {
2779       // If either operand is SP, expand to ADD #0.
2780       BuildMI(MBB, I, DL, get(AArch64::ADDXri), DestReg)
2781           .addReg(SrcReg, getKillRegState(KillSrc))
2782           .addImm(0)
2783           .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
2784     } else if (SrcReg == AArch64::XZR && Subtarget.hasZeroCycleZeroingGP()) {
2785       BuildMI(MBB, I, DL, get(AArch64::MOVZXi), DestReg)
2786           .addImm(0)
2787           .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
2788     } else {
2789       // Otherwise, expand to ORR XZR.
2790       BuildMI(MBB, I, DL, get(AArch64::ORRXrr), DestReg)
2791           .addReg(AArch64::XZR)
2792           .addReg(SrcReg, getKillRegState(KillSrc));
2793     }
2794     return;
2795   }
2796 
2797   // Copy a DDDD register quad by copying the individual sub-registers.
2798   if (AArch64::DDDDRegClass.contains(DestReg) &&
2799       AArch64::DDDDRegClass.contains(SrcReg)) {
2800     static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1,
2801                                        AArch64::dsub2, AArch64::dsub3};
2802     copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8,
2803                      Indices);
2804     return;
2805   }
2806 
2807   // Copy a DDD register triple by copying the individual sub-registers.
2808   if (AArch64::DDDRegClass.contains(DestReg) &&
2809       AArch64::DDDRegClass.contains(SrcReg)) {
2810     static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1,
2811                                        AArch64::dsub2};
2812     copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8,
2813                      Indices);
2814     return;
2815   }
2816 
2817   // Copy a DD register pair by copying the individual sub-registers.
2818   if (AArch64::DDRegClass.contains(DestReg) &&
2819       AArch64::DDRegClass.contains(SrcReg)) {
2820     static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1};
2821     copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8,
2822                      Indices);
2823     return;
2824   }
2825 
2826   // Copy a QQQQ register quad by copying the individual sub-registers.
2827   if (AArch64::QQQQRegClass.contains(DestReg) &&
2828       AArch64::QQQQRegClass.contains(SrcReg)) {
2829     static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1,
2830                                        AArch64::qsub2, AArch64::qsub3};
2831     copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8,
2832                      Indices);
2833     return;
2834   }
2835 
2836   // Copy a QQQ register triple by copying the individual sub-registers.
2837   if (AArch64::QQQRegClass.contains(DestReg) &&
2838       AArch64::QQQRegClass.contains(SrcReg)) {
2839     static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1,
2840                                        AArch64::qsub2};
2841     copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8,
2842                      Indices);
2843     return;
2844   }
2845 
2846   // Copy a QQ register pair by copying the individual sub-registers.
2847   if (AArch64::QQRegClass.contains(DestReg) &&
2848       AArch64::QQRegClass.contains(SrcReg)) {
2849     static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1};
2850     copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8,
2851                      Indices);
2852     return;
2853   }
2854 
2855   if (AArch64::XSeqPairsClassRegClass.contains(DestReg) &&
2856       AArch64::XSeqPairsClassRegClass.contains(SrcReg)) {
2857     static const unsigned Indices[] = {AArch64::sube64, AArch64::subo64};
2858     copyGPRRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRXrs,
2859                     AArch64::XZR, Indices);
2860     return;
2861   }
2862 
2863   if (AArch64::WSeqPairsClassRegClass.contains(DestReg) &&
2864       AArch64::WSeqPairsClassRegClass.contains(SrcReg)) {
2865     static const unsigned Indices[] = {AArch64::sube32, AArch64::subo32};
2866     copyGPRRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRWrs,
2867                     AArch64::WZR, Indices);
2868     return;
2869   }
2870 
2871   if (AArch64::FPR128RegClass.contains(DestReg) &&
2872       AArch64::FPR128RegClass.contains(SrcReg)) {
2873     if (Subtarget.hasNEON()) {
2874       BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg)
2875           .addReg(SrcReg)
2876           .addReg(SrcReg, getKillRegState(KillSrc));
2877     } else {
2878       BuildMI(MBB, I, DL, get(AArch64::STRQpre))
2879           .addReg(AArch64::SP, RegState::Define)
2880           .addReg(SrcReg, getKillRegState(KillSrc))
2881           .addReg(AArch64::SP)
2882           .addImm(-16);
2883       BuildMI(MBB, I, DL, get(AArch64::LDRQpre))
2884           .addReg(AArch64::SP, RegState::Define)
2885           .addReg(DestReg, RegState::Define)
2886           .addReg(AArch64::SP)
2887           .addImm(16);
2888     }
2889     return;
2890   }
2891 
2892   if (AArch64::FPR64RegClass.contains(DestReg) &&
2893       AArch64::FPR64RegClass.contains(SrcReg)) {
2894     if (Subtarget.hasNEON()) {
2895       DestReg = RI.getMatchingSuperReg(DestReg, AArch64::dsub,
2896                                        &AArch64::FPR128RegClass);
2897       SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::dsub,
2898                                       &AArch64::FPR128RegClass);
2899       BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg)
2900           .addReg(SrcReg)
2901           .addReg(SrcReg, getKillRegState(KillSrc));
2902     } else {
2903       BuildMI(MBB, I, DL, get(AArch64::FMOVDr), DestReg)
2904           .addReg(SrcReg, getKillRegState(KillSrc));
2905     }
2906     return;
2907   }
2908 
2909   if (AArch64::FPR32RegClass.contains(DestReg) &&
2910       AArch64::FPR32RegClass.contains(SrcReg)) {
2911     if (Subtarget.hasNEON()) {
2912       DestReg = RI.getMatchingSuperReg(DestReg, AArch64::ssub,
2913                                        &AArch64::FPR128RegClass);
2914       SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::ssub,
2915                                       &AArch64::FPR128RegClass);
2916       BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg)
2917           .addReg(SrcReg)
2918           .addReg(SrcReg, getKillRegState(KillSrc));
2919     } else {
2920       BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
2921           .addReg(SrcReg, getKillRegState(KillSrc));
2922     }
2923     return;
2924   }
2925 
2926   if (AArch64::FPR16RegClass.contains(DestReg) &&
2927       AArch64::FPR16RegClass.contains(SrcReg)) {
2928     if (Subtarget.hasNEON()) {
2929       DestReg = RI.getMatchingSuperReg(DestReg, AArch64::hsub,
2930                                        &AArch64::FPR128RegClass);
2931       SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::hsub,
2932                                       &AArch64::FPR128RegClass);
2933       BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg)
2934           .addReg(SrcReg)
2935           .addReg(SrcReg, getKillRegState(KillSrc));
2936     } else {
2937       DestReg = RI.getMatchingSuperReg(DestReg, AArch64::hsub,
2938                                        &AArch64::FPR32RegClass);
2939       SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::hsub,
2940                                       &AArch64::FPR32RegClass);
2941       BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
2942           .addReg(SrcReg, getKillRegState(KillSrc));
2943     }
2944     return;
2945   }
2946 
2947   if (AArch64::FPR8RegClass.contains(DestReg) &&
2948       AArch64::FPR8RegClass.contains(SrcReg)) {
2949     if (Subtarget.hasNEON()) {
2950       DestReg = RI.getMatchingSuperReg(DestReg, AArch64::bsub,
2951                                        &AArch64::FPR128RegClass);
2952       SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::bsub,
2953                                       &AArch64::FPR128RegClass);
2954       BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg)
2955           .addReg(SrcReg)
2956           .addReg(SrcReg, getKillRegState(KillSrc));
2957     } else {
2958       DestReg = RI.getMatchingSuperReg(DestReg, AArch64::bsub,
2959                                        &AArch64::FPR32RegClass);
2960       SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::bsub,
2961                                       &AArch64::FPR32RegClass);
2962       BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
2963           .addReg(SrcReg, getKillRegState(KillSrc));
2964     }
2965     return;
2966   }
2967 
2968   // Copies between GPR64 and FPR64.
2969   if (AArch64::FPR64RegClass.contains(DestReg) &&
2970       AArch64::GPR64RegClass.contains(SrcReg)) {
2971     BuildMI(MBB, I, DL, get(AArch64::FMOVXDr), DestReg)
2972         .addReg(SrcReg, getKillRegState(KillSrc));
2973     return;
2974   }
2975   if (AArch64::GPR64RegClass.contains(DestReg) &&
2976       AArch64::FPR64RegClass.contains(SrcReg)) {
2977     BuildMI(MBB, I, DL, get(AArch64::FMOVDXr), DestReg)
2978         .addReg(SrcReg, getKillRegState(KillSrc));
2979     return;
2980   }
2981   // Copies between GPR32 and FPR32.
2982   if (AArch64::FPR32RegClass.contains(DestReg) &&
2983       AArch64::GPR32RegClass.contains(SrcReg)) {
2984     BuildMI(MBB, I, DL, get(AArch64::FMOVWSr), DestReg)
2985         .addReg(SrcReg, getKillRegState(KillSrc));
2986     return;
2987   }
2988   if (AArch64::GPR32RegClass.contains(DestReg) &&
2989       AArch64::FPR32RegClass.contains(SrcReg)) {
2990     BuildMI(MBB, I, DL, get(AArch64::FMOVSWr), DestReg)
2991         .addReg(SrcReg, getKillRegState(KillSrc));
2992     return;
2993   }
2994 
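  // Copies to and from the NZCV flags go through the NZCV system register
  // using MSR/MRS.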
2995   if (DestReg == AArch64::NZCV) {
2996     assert(AArch64::GPR64RegClass.contains(SrcReg) && "Invalid NZCV copy");
2997     BuildMI(MBB, I, DL, get(AArch64::MSR))
2998         .addImm(AArch64SysReg::NZCV)
2999         .addReg(SrcReg, getKillRegState(KillSrc))
3000         .addReg(AArch64::NZCV, RegState::Implicit | RegState::Define);
3001     return;
3002   }
3003 
3004   if (SrcReg == AArch64::NZCV) {
3005     assert(AArch64::GPR64RegClass.contains(DestReg) && "Invalid NZCV copy");
3006     BuildMI(MBB, I, DL, get(AArch64::MRS), DestReg)
3007         .addImm(AArch64SysReg::NZCV)
3008         .addReg(AArch64::NZCV, RegState::Implicit | getKillRegState(KillSrc));
3009     return;
3010   }
3011 
3012   llvm_unreachable("unimplemented reg-to-reg copy");
3013 }
3014 
3015 static void storeRegPairToStackSlot(const TargetRegisterInfo &TRI,
3016                                     MachineBasicBlock &MBB,
3017                                     MachineBasicBlock::iterator InsertBefore,
3018                                     const MCInstrDesc &MCID,
3019                                     Register SrcReg, bool IsKill,
3020                                     unsigned SubIdx0, unsigned SubIdx1, int FI,
3021                                     MachineMemOperand *MMO) {
3022   Register SrcReg0 = SrcReg;
3023   Register SrcReg1 = SrcReg;
3024   if (Register::isPhysicalRegister(SrcReg)) {
3025     SrcReg0 = TRI.getSubReg(SrcReg, SubIdx0);
3026     SubIdx0 = 0;
3027     SrcReg1 = TRI.getSubReg(SrcReg, SubIdx1);
3028     SubIdx1 = 0;
3029   }
3030   BuildMI(MBB, InsertBefore, DebugLoc(), MCID)
3031       .addReg(SrcReg0, getKillRegState(IsKill), SubIdx0)
3032       .addReg(SrcReg1, getKillRegState(IsKill), SubIdx1)
3033       .addFrameIndex(FI)
3034       .addImm(0)
3035       .addMemOperand(MMO);
3036 }
3037 
3038 void AArch64InstrInfo::storeRegToStackSlot(
3039     MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register SrcReg,
3040     bool isKill, int FI, const TargetRegisterClass *RC,
3041     const TargetRegisterInfo *TRI) const {
3042   MachineFunction &MF = *MBB.getParent();
3043   MachineFrameInfo &MFI = MF.getFrameInfo();
3044 
3045   MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
3046   MachineMemOperand *MMO =
3047       MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore,
3048                               MFI.getObjectSize(FI), MFI.getObjectAlign(FI));
3049   unsigned Opc = 0;
3050   bool Offset = true;
3051   unsigned StackID = TargetStackID::Default;
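  // Pick the store opcode from the register class's spill size. Structured
  // NEON stores (ST1*) take no immediate offset, and SVE spills live on the
  // scalable-vector stack.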
3052   switch (TRI->getSpillSize(*RC)) {
3053   case 1:
3054     if (AArch64::FPR8RegClass.hasSubClassEq(RC))
3055       Opc = AArch64::STRBui;
3056     break;
3057   case 2:
3058     if (AArch64::FPR16RegClass.hasSubClassEq(RC))
3059       Opc = AArch64::STRHui;
3060     else if (AArch64::PPRRegClass.hasSubClassEq(RC)) {
3061       assert(Subtarget.hasSVE() && "Unexpected register store without SVE");
3062       Opc = AArch64::STR_PXI;
3063       StackID = TargetStackID::SVEVector;
3064     }
3065     break;
3066   case 4:
3067     if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
3068       Opc = AArch64::STRWui;
3069       if (Register::isVirtualRegister(SrcReg))
3070         MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR32RegClass);
3071       else
3072         assert(SrcReg != AArch64::WSP);
3073     } else if (AArch64::FPR32RegClass.hasSubClassEq(RC))
3074       Opc = AArch64::STRSui;
3075     break;
3076   case 8:
3077     if (AArch64::GPR64allRegClass.hasSubClassEq(RC)) {
3078       Opc = AArch64::STRXui;
3079       if (Register::isVirtualRegister(SrcReg))
3080         MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR64RegClass);
3081       else
3082         assert(SrcReg != AArch64::SP);
3083     } else if (AArch64::FPR64RegClass.hasSubClassEq(RC)) {
3084       Opc = AArch64::STRDui;
3085     } else if (AArch64::WSeqPairsClassRegClass.hasSubClassEq(RC)) {
3086       storeRegPairToStackSlot(getRegisterInfo(), MBB, MBBI,
3087                               get(AArch64::STPWi), SrcReg, isKill,
3088                               AArch64::sube32, AArch64::subo32, FI, MMO);
3089       return;
3090     }
3091     break;
3092   case 16:
3093     if (AArch64::FPR128RegClass.hasSubClassEq(RC))
3094       Opc = AArch64::STRQui;
3095     else if (AArch64::DDRegClass.hasSubClassEq(RC)) {
3096       assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
3097       Opc = AArch64::ST1Twov1d;
3098       Offset = false;
3099     } else if (AArch64::XSeqPairsClassRegClass.hasSubClassEq(RC)) {
3100       storeRegPairToStackSlot(getRegisterInfo(), MBB, MBBI,
3101                               get(AArch64::STPXi), SrcReg, isKill,
3102                               AArch64::sube64, AArch64::subo64, FI, MMO);
3103       return;
3104     } else if (AArch64::ZPRRegClass.hasSubClassEq(RC)) {
3105       assert(Subtarget.hasSVE() && "Unexpected register store without SVE");
3106       Opc = AArch64::STR_ZXI;
3107       StackID = TargetStackID::SVEVector;
3108     }
3109     break;
3110   case 24:
3111     if (AArch64::DDDRegClass.hasSubClassEq(RC)) {
3112       assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
3113       Opc = AArch64::ST1Threev1d;
3114       Offset = false;
3115     }
3116     break;
3117   case 32:
3118     if (AArch64::DDDDRegClass.hasSubClassEq(RC)) {
3119       assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
3120       Opc = AArch64::ST1Fourv1d;
3121       Offset = false;
3122     } else if (AArch64::QQRegClass.hasSubClassEq(RC)) {
3123       assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
3124       Opc = AArch64::ST1Twov2d;
3125       Offset = false;
3126     } else if (AArch64::ZPR2RegClass.hasSubClassEq(RC)) {
3127       assert(Subtarget.hasSVE() && "Unexpected register store without SVE");
3128       Opc = AArch64::STR_ZZXI;
3129       StackID = TargetStackID::SVEVector;
3130     }
3131     break;
3132   case 48:
3133     if (AArch64::QQQRegClass.hasSubClassEq(RC)) {
3134       assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
3135       Opc = AArch64::ST1Threev2d;
3136       Offset = false;
3137     } else if (AArch64::ZPR3RegClass.hasSubClassEq(RC)) {
3138       assert(Subtarget.hasSVE() && "Unexpected register store without SVE");
3139       Opc = AArch64::STR_ZZZXI;
3140       StackID = TargetStackID::SVEVector;
3141     }
3142     break;
3143   case 64:
3144     if (AArch64::QQQQRegClass.hasSubClassEq(RC)) {
3145       assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
3146       Opc = AArch64::ST1Fourv2d;
3147       Offset = false;
3148     } else if (AArch64::ZPR4RegClass.hasSubClassEq(RC)) {
3149       assert(Subtarget.hasSVE() && "Unexpected register store without SVE");
3150       Opc = AArch64::STR_ZZZZXI;
3151       StackID = TargetStackID::SVEVector;
3152     }
3153     break;
3154   }
3155   assert(Opc && "Unknown register class");
3156   MFI.setStackID(FI, StackID);
3157 
3158   const MachineInstrBuilder MI = BuildMI(MBB, MBBI, DebugLoc(), get(Opc))
3159                                      .addReg(SrcReg, getKillRegState(isKill))
3160                                      .addFrameIndex(FI);
3161 
3162   if (Offset)
3163     MI.addImm(0);
3164   MI.addMemOperand(MMO);
3165 }
3166 
3167 static void loadRegPairFromStackSlot(const TargetRegisterInfo &TRI,
3168                                      MachineBasicBlock &MBB,
3169                                      MachineBasicBlock::iterator InsertBefore,
3170                                      const MCInstrDesc &MCID,
3171                                      Register DestReg, unsigned SubIdx0,
3172                                      unsigned SubIdx1, int FI,
3173                                      MachineMemOperand *MMO) {
3174   Register DestReg0 = DestReg;
3175   Register DestReg1 = DestReg;
3176   bool IsUndef = true;
3177   if (Register::isPhysicalRegister(DestReg)) {
3178     DestReg0 = TRI.getSubReg(DestReg, SubIdx0);
3179     SubIdx0 = 0;
3180     DestReg1 = TRI.getSubReg(DestReg, SubIdx1);
3181     SubIdx1 = 0;
3182     IsUndef = false;
3183   }
3184   BuildMI(MBB, InsertBefore, DebugLoc(), MCID)
3185       .addReg(DestReg0, RegState::Define | getUndefRegState(IsUndef), SubIdx0)
3186       .addReg(DestReg1, RegState::Define | getUndefRegState(IsUndef), SubIdx1)
3187       .addFrameIndex(FI)
3188       .addImm(0)
3189       .addMemOperand(MMO);
3190 }
3191 
3192 void AArch64InstrInfo::loadRegFromStackSlot(
3193     MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register DestReg,
3194     int FI, const TargetRegisterClass *RC,
3195     const TargetRegisterInfo *TRI) const {
3196   MachineFunction &MF = *MBB.getParent();
3197   MachineFrameInfo &MFI = MF.getFrameInfo();
3198   MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
3199   MachineMemOperand *MMO =
3200       MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOLoad,
3201                               MFI.getObjectSize(FI), MFI.getObjectAlign(FI));
3202 
3203   unsigned Opc = 0;
3204   bool Offset = true;
3205   unsigned StackID = TargetStackID::Default;
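  // Pick the reload opcode from the register class's spill size. Structured
  // NEON loads (LD1*) take no immediate offset, and SVE fills live on the
  // scalable-vector stack.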
3206   switch (TRI->getSpillSize(*RC)) {
3207   case 1:
3208     if (AArch64::FPR8RegClass.hasSubClassEq(RC))
3209       Opc = AArch64::LDRBui;
3210     break;
3211   case 2:
3212     if (AArch64::FPR16RegClass.hasSubClassEq(RC))
3213       Opc = AArch64::LDRHui;
3214     else if (AArch64::PPRRegClass.hasSubClassEq(RC)) {
3215       assert(Subtarget.hasSVE() && "Unexpected register load without SVE");
3216       Opc = AArch64::LDR_PXI;
3217       StackID = TargetStackID::SVEVector;
3218     }
3219     break;
3220   case 4:
3221     if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
3222       Opc = AArch64::LDRWui;
3223       if (Register::isVirtualRegister(DestReg))
3224         MF.getRegInfo().constrainRegClass(DestReg, &AArch64::GPR32RegClass);
3225       else
3226         assert(DestReg != AArch64::WSP);
3227     } else if (AArch64::FPR32RegClass.hasSubClassEq(RC))
3228       Opc = AArch64::LDRSui;
3229     break;
3230   case 8:
3231     if (AArch64::GPR64allRegClass.hasSubClassEq(RC)) {
3232       Opc = AArch64::LDRXui;
3233       if (Register::isVirtualRegister(DestReg))
3234         MF.getRegInfo().constrainRegClass(DestReg, &AArch64::GPR64RegClass);
3235       else
3236         assert(DestReg != AArch64::SP);
3237     } else if (AArch64::FPR64RegClass.hasSubClassEq(RC)) {
3238       Opc = AArch64::LDRDui;
3239     } else if (AArch64::WSeqPairsClassRegClass.hasSubClassEq(RC)) {
3240       loadRegPairFromStackSlot(getRegisterInfo(), MBB, MBBI,
3241                                get(AArch64::LDPWi), DestReg, AArch64::sube32,
3242                                AArch64::subo32, FI, MMO);
3243       return;
3244     }
3245     break;
3246   case 16:
3247     if (AArch64::FPR128RegClass.hasSubClassEq(RC))
3248       Opc = AArch64::LDRQui;
3249     else if (AArch64::DDRegClass.hasSubClassEq(RC)) {
3250       assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
3251       Opc = AArch64::LD1Twov1d;
3252       Offset = false;
3253     } else if (AArch64::XSeqPairsClassRegClass.hasSubClassEq(RC)) {
3254       loadRegPairFromStackSlot(getRegisterInfo(), MBB, MBBI,
3255                                get(AArch64::LDPXi), DestReg, AArch64::sube64,
3256                                AArch64::subo64, FI, MMO);
3257       return;
3258     } else if (AArch64::ZPRRegClass.hasSubClassEq(RC)) {
3259       assert(Subtarget.hasSVE() && "Unexpected register load without SVE");
3260       Opc = AArch64::LDR_ZXI;
3261       StackID = TargetStackID::SVEVector;
3262     }
3263     break;
3264   case 24:
3265     if (AArch64::DDDRegClass.hasSubClassEq(RC)) {
3266       assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
3267       Opc = AArch64::LD1Threev1d;
3268       Offset = false;
3269     }
3270     break;
3271   case 32:
3272     if (AArch64::DDDDRegClass.hasSubClassEq(RC)) {
3273       assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
3274       Opc = AArch64::LD1Fourv1d;
3275       Offset = false;
3276     } else if (AArch64::QQRegClass.hasSubClassEq(RC)) {
3277       assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
3278       Opc = AArch64::LD1Twov2d;
3279       Offset = false;
3280     } else if (AArch64::ZPR2RegClass.hasSubClassEq(RC)) {
3281       assert(Subtarget.hasSVE() && "Unexpected register load without SVE");
3282       Opc = AArch64::LDR_ZZXI;
3283       StackID = TargetStackID::SVEVector;
3284     }
3285     break;
3286   case 48:
3287     if (AArch64::QQQRegClass.hasSubClassEq(RC)) {
3288       assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
3289       Opc = AArch64::LD1Threev2d;
3290       Offset = false;
3291     } else if (AArch64::ZPR3RegClass.hasSubClassEq(RC)) {
3292       assert(Subtarget.hasSVE() && "Unexpected register load without SVE");
3293       Opc = AArch64::LDR_ZZZXI;
3294       StackID = TargetStackID::SVEVector;
3295     }
3296     break;
3297   case 64:
3298     if (AArch64::QQQQRegClass.hasSubClassEq(RC)) {
3299       assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
3300       Opc = AArch64::LD1Fourv2d;
3301       Offset = false;
3302     } else if (AArch64::ZPR4RegClass.hasSubClassEq(RC)) {
3303       assert(Subtarget.hasSVE() && "Unexpected register load without SVE");
3304       Opc = AArch64::LDR_ZZZZXI;
3305       StackID = TargetStackID::SVEVector;
3306     }
3307     break;
3308   }
3309 
3310   assert(Opc && "Unknown register class");
3311   MFI.setStackID(FI, StackID);
3312 
3313   const MachineInstrBuilder MI = BuildMI(MBB, MBBI, DebugLoc(), get(Opc))
3314                                      .addReg(DestReg, getDefRegState(true))
3315                                      .addFrameIndex(FI);
3316   if (Offset)
3317     MI.addImm(0);
3318   MI.addMemOperand(MMO);
3319 }
3320 
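// Return true if any instruction strictly between DefMI and UseMI (ignoring
// debug instructions) reads or modifies NZCV.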
3321 bool llvm::isNZCVTouchedInInstructionRange(const MachineInstr &DefMI,
3322                                            const MachineInstr &UseMI,
3323                                            const TargetRegisterInfo *TRI) {
3324   return any_of(instructionsWithoutDebug(std::next(DefMI.getIterator()),
3325                                          UseMI.getIterator()),
3326                 [TRI](const MachineInstr &I) {
3327                   return I.modifiesRegister(AArch64::NZCV, TRI) ||
3328                          I.readsRegister(AArch64::NZCV, TRI);
3329                 });
3330 }
3331 
3332 // Helper function to emit a frame offset adjustment from a given
3333 // pointer (SrcReg), writing the result into DestReg. The caller must
3334 // supply the add/sub opcode explicitly.
3335 static void emitFrameOffsetAdj(MachineBasicBlock &MBB,
3336                                MachineBasicBlock::iterator MBBI,
3337                                const DebugLoc &DL, unsigned DestReg,
3338                                unsigned SrcReg, int64_t Offset, unsigned Opc,
3339                                const TargetInstrInfo *TII,
3340                                MachineInstr::MIFlag Flag, bool NeedsWinCFI,
3341                                bool *HasWinCFI) {
3342   int Sign = 1;
3343   unsigned MaxEncoding, ShiftSize;
3344   switch (Opc) {
3345   case AArch64::ADDXri:
3346   case AArch64::ADDSXri:
3347   case AArch64::SUBXri:
3348   case AArch64::SUBSXri:
3349     MaxEncoding = 0xfff;
3350     ShiftSize = 12;
3351     break;
3352   case AArch64::ADDVL_XXI:
3353   case AArch64::ADDPL_XXI:
3354     MaxEncoding = 31;
3355     ShiftSize = 0;
3356     if (Offset < 0) {
3357       MaxEncoding = 32;
3358       Sign = -1;
3359       Offset = -Offset;
3360     }
3361     break;
3362   default:
3363     llvm_unreachable("Unsupported opcode");
3364   }
3365 
3366   // FIXME: If the offset won't fit in 24-bits, compute the offset into a
3367   // scratch register.  If DestReg is a virtual register, use it as the
3368   // scratch register; otherwise, create a new virtual register (to be
3369   // replaced by the scavenger at the end of PEI).  That case can be optimized
3370   // slightly if DestReg is SP which is always 16-byte aligned, so the scratch
3371   // register can be loaded with offset%8 and the add/sub can use an extending
3372   // instruction with LSL#3.
3373   // Currently the function handles any offsets but generates a poor sequence
3374   // of code.
3375   //  assert(Offset < (1 << 24) && "unimplemented reg plus immediate");
3376 
3377   const unsigned MaxEncodableValue = MaxEncoding << ShiftSize;
3378   Register TmpReg = DestReg;
3379   if (TmpReg == AArch64::XZR)
3380     TmpReg = MBB.getParent()->getRegInfo().createVirtualRegister(
3381         &AArch64::GPR64RegClass);
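  // Emit the adjustment in chunks: each iteration materializes at most
  // MaxEncodableValue, using the shifted immediate form (LSL #12 for ADD/SUB)
  // when the chunk does not fit in the unshifted immediate field.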
3382   do {
3383     uint64_t ThisVal = std::min<uint64_t>(Offset, MaxEncodableValue);
3384     unsigned LocalShiftSize = 0;
3385     if (ThisVal > MaxEncoding) {
3386       ThisVal = ThisVal >> ShiftSize;
3387       LocalShiftSize = ShiftSize;
3388     }
3389     assert((ThisVal >> ShiftSize) <= MaxEncoding &&
3390            "Encoding cannot handle value that big");
3391 
3392     Offset -= ThisVal << LocalShiftSize;
3393     if (Offset == 0)
3394       TmpReg = DestReg;
3395     auto MBI = BuildMI(MBB, MBBI, DL, TII->get(Opc), TmpReg)
3396                    .addReg(SrcReg)
3397                    .addImm(Sign * (int)ThisVal);
3398     if (ShiftSize)
3399       MBI = MBI.addImm(
3400           AArch64_AM::getShifterImm(AArch64_AM::LSL, LocalShiftSize));
3401     MBI = MBI.setMIFlag(Flag);
3402 
3403     if (NeedsWinCFI) {
3404       assert(Sign == 1 && "SEH directives should always have a positive sign");
3405       int Imm = (int)(ThisVal << LocalShiftSize);
3406       if ((DestReg == AArch64::FP && SrcReg == AArch64::SP) ||
3407           (SrcReg == AArch64::FP && DestReg == AArch64::SP)) {
3408         if (HasWinCFI)
3409           *HasWinCFI = true;
3410         if (Imm == 0)
3411           BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_SetFP)).setMIFlag(Flag);
3412         else
3413           BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_AddFP))
3414               .addImm(Imm)
3415               .setMIFlag(Flag);
3416         assert(Offset == 0 && "Expected remaining offset to be zero to "
3417                               "emit a single SEH directive");
3418       } else if (DestReg == AArch64::SP) {
3419         if (HasWinCFI)
3420           *HasWinCFI = true;
3421         assert(SrcReg == AArch64::SP && "Unexpected SrcReg for SEH_StackAlloc");
3422         BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_StackAlloc))
3423             .addImm(Imm)
3424             .setMIFlag(Flag);
3425       }
3426       if (HasWinCFI)
3427         *HasWinCFI = true;
3428     }
3429 
3430     SrcReg = TmpReg;
3431   } while (Offset);
3432 }
3433 
3434 void llvm::emitFrameOffset(MachineBasicBlock &MBB,
3435                            MachineBasicBlock::iterator MBBI, const DebugLoc &DL,
3436                            unsigned DestReg, unsigned SrcReg,
3437                            StackOffset Offset, const TargetInstrInfo *TII,
3438                            MachineInstr::MIFlag Flag, bool SetNZCV,
3439                            bool NeedsWinCFI, bool *HasWinCFI) {
3440   int64_t Bytes, NumPredicateVectors, NumDataVectors;
3441   Offset.getForFrameOffset(Bytes, NumPredicateVectors, NumDataVectors);
3442 
3443   // First emit non-scalable frame offsets, or a simple 'mov'.
3444   if (Bytes || (!Offset && SrcReg != DestReg)) {
3445     assert((DestReg != AArch64::SP || Bytes % 16 == 0) &&
3446            "SP increment/decrement not 16-byte aligned");
3447     unsigned Opc = SetNZCV ? AArch64::ADDSXri : AArch64::ADDXri;
3448     if (Bytes < 0) {
3449       Bytes = -Bytes;
3450       Opc = SetNZCV ? AArch64::SUBSXri : AArch64::SUBXri;
3451     }
3452     emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, Bytes, Opc, TII, Flag,
3453                        NeedsWinCFI, HasWinCFI);
3454     SrcReg = DestReg;
3455   }
3456 
3457   assert(!(SetNZCV && (NumPredicateVectors || NumDataVectors)) &&
3458          "SetNZCV not supported with SVE vectors");
3459   assert(!(NeedsWinCFI && (NumPredicateVectors || NumDataVectors)) &&
3460          "WinCFI not supported with SVE vectors");
3461 
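  // Then handle the scalable part of the offset: whole SVE vectors with
  // ADDVL and predicate-sized (VL/8) increments with ADDPL.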
3462   if (NumDataVectors) {
3463     emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, NumDataVectors,
3464                        AArch64::ADDVL_XXI, TII, Flag, NeedsWinCFI, nullptr);
3465     SrcReg = DestReg;
3466   }
3467 
3468   if (NumPredicateVectors) {
3469     assert(DestReg != AArch64::SP && "Unaligned access to SP");
3470     emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, NumPredicateVectors,
3471                        AArch64::ADDPL_XXI, TII, Flag, NeedsWinCFI, nullptr);
3472   }
3473 }
3474 
3475 MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl(
3476     MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,
3477     MachineBasicBlock::iterator InsertPt, int FrameIndex,
3478     LiveIntervals *LIS, VirtRegMap *VRM) const {
3479   // This is a bit of a hack. Consider this instruction:
3480   //
3481   //   %0 = COPY %sp; GPR64all:%0
3482   //
3483   // We explicitly chose GPR64all for the virtual register so such a copy might
3484   // be eliminated by RegisterCoalescer. However, that may not be possible, and
3485   // %0 may even spill. We can't spill %sp, and since it is in the GPR64all
3486   // register class, TargetInstrInfo::foldMemoryOperand() is going to try.
3487   //
3488   // To prevent that, we are going to constrain the %0 register class here.
3489   //
3490   // <rdar://problem/11522048>
3491   //
3492   if (MI.isFullCopy()) {
3493     Register DstReg = MI.getOperand(0).getReg();
3494     Register SrcReg = MI.getOperand(1).getReg();
3495     if (SrcReg == AArch64::SP && Register::isVirtualRegister(DstReg)) {
3496       MF.getRegInfo().constrainRegClass(DstReg, &AArch64::GPR64RegClass);
3497       return nullptr;
3498     }
3499     if (DstReg == AArch64::SP && Register::isVirtualRegister(SrcReg)) {
3500       MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR64RegClass);
3501       return nullptr;
3502     }
3503   }
3504 
3505   // Handle the case where a copy is being spilled or filled but the source
3506   // and destination register class don't match.  For example:
3507   //
3508   //   %0 = COPY %xzr; GPR64common:%0
3509   //
3510   // In this case we can still safely fold away the COPY and generate the
3511   // following spill code:
3512   //
3513   //   STRXui %xzr, %stack.0
3514   //
3515   // This also eliminates spilled cross register class COPYs (e.g. between x and
3516   // d regs) of the same size.  For example:
3517   //
3518   //   %0 = COPY %1; GPR64:%0, FPR64:%1
3519   //
3520   // will be filled as
3521   //
3522   //   LDRDui %0, fi<#0>
3523   //
3524   // instead of
3525   //
3526   //   LDRXui %Temp, fi<#0>
3527   //   %0 = FMOV %Temp
3528   //
3529   if (MI.isCopy() && Ops.size() == 1 &&
3530       // Make sure we're only folding the explicit COPY defs/uses.
3531       (Ops[0] == 0 || Ops[0] == 1)) {
3532     bool IsSpill = Ops[0] == 0;
3533     bool IsFill = !IsSpill;
3534     const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
3535     const MachineRegisterInfo &MRI = MF.getRegInfo();
3536     MachineBasicBlock &MBB = *MI.getParent();
3537     const MachineOperand &DstMO = MI.getOperand(0);
3538     const MachineOperand &SrcMO = MI.getOperand(1);
3539     Register DstReg = DstMO.getReg();
3540     Register SrcReg = SrcMO.getReg();
3541     // This is slightly expensive to compute for physical regs since
3542     // getMinimalPhysRegClass is slow.
3543     auto getRegClass = [&](unsigned Reg) {
3544       return Register::isVirtualRegister(Reg) ? MRI.getRegClass(Reg)
3545                                               : TRI.getMinimalPhysRegClass(Reg);
3546     };
3547 
3548     if (DstMO.getSubReg() == 0 && SrcMO.getSubReg() == 0) {
3549       assert(TRI.getRegSizeInBits(*getRegClass(DstReg)) ==
3550                  TRI.getRegSizeInBits(*getRegClass(SrcReg)) &&
3551              "Mismatched register size in non subreg COPY");
3552       if (IsSpill)
3553         storeRegToStackSlot(MBB, InsertPt, SrcReg, SrcMO.isKill(), FrameIndex,
3554                             getRegClass(SrcReg), &TRI);
3555       else
3556         loadRegFromStackSlot(MBB, InsertPt, DstReg, FrameIndex,
3557                              getRegClass(DstReg), &TRI);
3558       return &*--InsertPt;
3559     }
3560 
3561     // Handle cases like spilling def of:
3562     //
3563     //   %0:sub_32<def,read-undef> = COPY %wzr; GPR64common:%0
3564     //
3565     // where the physical register source can be widened and stored to the full
3566     // virtual reg destination stack slot, in this case producing:
3567     //
3568     //   STRXui %xzr, %stack.0
3569     //
3570     if (IsSpill && DstMO.isUndef() && Register::isPhysicalRegister(SrcReg)) {
3571       assert(SrcMO.getSubReg() == 0 &&
3572              "Unexpected subreg on physical register");
3573       const TargetRegisterClass *SpillRC;
3574       unsigned SpillSubreg;
3575       switch (DstMO.getSubReg()) {
3576       default:
3577         SpillRC = nullptr;
3578         break;
3579       case AArch64::sub_32:
3580       case AArch64::ssub:
3581         if (AArch64::GPR32RegClass.contains(SrcReg)) {
3582           SpillRC = &AArch64::GPR64RegClass;
3583           SpillSubreg = AArch64::sub_32;
3584         } else if (AArch64::FPR32RegClass.contains(SrcReg)) {
3585           SpillRC = &AArch64::FPR64RegClass;
3586           SpillSubreg = AArch64::ssub;
3587         } else
3588           SpillRC = nullptr;
3589         break;
3590       case AArch64::dsub:
3591         if (AArch64::FPR64RegClass.contains(SrcReg)) {
3592           SpillRC = &AArch64::FPR128RegClass;
3593           SpillSubreg = AArch64::dsub;
3594         } else
3595           SpillRC = nullptr;
3596         break;
3597       }
3598 
3599       if (SpillRC)
3600         if (unsigned WidenedSrcReg =
3601                 TRI.getMatchingSuperReg(SrcReg, SpillSubreg, SpillRC)) {
3602           storeRegToStackSlot(MBB, InsertPt, WidenedSrcReg, SrcMO.isKill(),
3603                               FrameIndex, SpillRC, &TRI);
3604           return &*--InsertPt;
3605         }
3606     }
3607 
3608     // Handle cases like filling use of:
3609     //
3610     //   %0:sub_32<def,read-undef> = COPY %1; GPR64:%0, GPR32:%1
3611     //
3612     // where we can load the full virtual reg source stack slot, into the subreg
3613     // destination, in this case producing:
3614     //
3615     //   LDRWui %0:sub_32<def,read-undef>, %stack.0
3616     //
3617     if (IsFill && SrcMO.getSubReg() == 0 && DstMO.isUndef()) {
3618       const TargetRegisterClass *FillRC;
3619       switch (DstMO.getSubReg()) {
3620       default:
3621         FillRC = nullptr;
3622         break;
3623       case AArch64::sub_32:
3624         FillRC = &AArch64::GPR32RegClass;
3625         break;
3626       case AArch64::ssub:
3627         FillRC = &AArch64::FPR32RegClass;
3628         break;
3629       case AArch64::dsub:
3630         FillRC = &AArch64::FPR64RegClass;
3631         break;
3632       }
3633 
3634       if (FillRC) {
3635         assert(TRI.getRegSizeInBits(*getRegClass(SrcReg)) ==
3636                    TRI.getRegSizeInBits(*FillRC) &&
3637                "Mismatched regclass size on folded subreg COPY");
3638         loadRegFromStackSlot(MBB, InsertPt, DstReg, FrameIndex, FillRC, &TRI);
3639         MachineInstr &LoadMI = *--InsertPt;
3640         MachineOperand &LoadDst = LoadMI.getOperand(0);
3641         assert(LoadDst.getSubReg() == 0 && "unexpected subreg on fill load");
3642         LoadDst.setSubReg(DstMO.getSubReg());
3643         LoadDst.setIsUndef();
3644         return &LoadMI;
3645       }
3646     }
3647   }
3648 
3649   // Cannot fold.
3650   return nullptr;
3651 }
3652 
3653 int llvm::isAArch64FrameOffsetLegal(const MachineInstr &MI,
3654                                     StackOffset &SOffset,
3655                                     bool *OutUseUnscaledOp,
3656                                     unsigned *OutUnscaledOp,
3657                                     int64_t *EmittableOffset) {
3658   // Set output values in case of early exit.
3659   if (EmittableOffset)
3660     *EmittableOffset = 0;
3661   if (OutUseUnscaledOp)
3662     *OutUseUnscaledOp = false;
3663   if (OutUnscaledOp)
3664     *OutUnscaledOp = 0;
3665 
3666   // Exit early for structured vector spills/fills as they can't take an
3667   // immediate offset.
3668   switch (MI.getOpcode()) {
3669   default:
3670     break;
3671   case AArch64::LD1Twov2d:
3672   case AArch64::LD1Threev2d:
3673   case AArch64::LD1Fourv2d:
3674   case AArch64::LD1Twov1d:
3675   case AArch64::LD1Threev1d:
3676   case AArch64::LD1Fourv1d:
3677   case AArch64::ST1Twov2d:
3678   case AArch64::ST1Threev2d:
3679   case AArch64::ST1Fourv2d:
3680   case AArch64::ST1Twov1d:
3681   case AArch64::ST1Threev1d:
3682   case AArch64::ST1Fourv1d:
3683   case AArch64::IRG:
3684   case AArch64::IRGstack:
3685   case AArch64::STGloop:
3686   case AArch64::STZGloop:
3687     return AArch64FrameOffsetCannotUpdate;
3688   }
3689 
3690   // Get the min/max offset and the scale.
3691   TypeSize ScaleValue(0U, false);
3692   unsigned Width;
3693   int64_t MinOff, MaxOff;
3694   if (!AArch64InstrInfo::getMemOpInfo(MI.getOpcode(), ScaleValue, Width, MinOff,
3695                                       MaxOff))
3696     llvm_unreachable("unhandled opcode in isAArch64FrameOffsetLegal");
3697 
3698   // Construct the complete offset.
3699   bool IsMulVL = ScaleValue.isScalable();
3700   unsigned Scale = ScaleValue.getKnownMinSize();
3701   int64_t Offset = IsMulVL ? SOffset.getScalableBytes() : SOffset.getBytes();
3702 
3703   const MachineOperand &ImmOpnd =
3704       MI.getOperand(AArch64InstrInfo::getLoadStoreImmIdx(MI.getOpcode()));
3705   Offset += ImmOpnd.getImm() * Scale;
3706 
3707   // If the offset doesn't match the scale, we rewrite the instruction to
3708   // use the unscaled instruction instead. Likewise, if we have a negative
3709   // offset and there is an unscaled op to use.
3710   Optional<unsigned> UnscaledOp =
3711       AArch64InstrInfo::getUnscaledLdSt(MI.getOpcode());
3712   bool useUnscaledOp = UnscaledOp && (Offset % Scale || Offset < 0);
3713   if (useUnscaledOp &&
3714       !AArch64InstrInfo::getMemOpInfo(*UnscaledOp, ScaleValue, Width, MinOff,
3715                                       MaxOff))
3716     llvm_unreachable("unhandled opcode in isAArch64FrameOffsetLegal");
3717 
3718   Scale = ScaleValue.getKnownMinSize();
3719   assert(IsMulVL == ScaleValue.isScalable() &&
3720          "Unscaled opcode has different value for scalable");
3721 
3722   int64_t Remainder = Offset % Scale;
3723   assert(!(Remainder && useUnscaledOp) &&
3724          "Cannot have remainder when using unscaled op");
3725 
3726   assert(MinOff < MaxOff && "Unexpected Min/Max offsets");
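  // Fold as much of the offset into the immediate as the encoding allows;
  // whatever cannot be encoded stays in SOffset for the caller to
  // materialize separately.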
3727   int64_t NewOffset = Offset / Scale;
3728   if (MinOff <= NewOffset && NewOffset <= MaxOff)
3729     Offset = Remainder;
3730   else {
3731     NewOffset = NewOffset < 0 ? MinOff : MaxOff;
3732     Offset = Offset - NewOffset * Scale + Remainder;
3733   }
3734 
3735   if (EmittableOffset)
3736     *EmittableOffset = NewOffset;
3737   if (OutUseUnscaledOp)
3738     *OutUseUnscaledOp = useUnscaledOp;
3739   if (OutUnscaledOp && UnscaledOp)
3740     *OutUnscaledOp = *UnscaledOp;
3741 
3742   if (IsMulVL)
3743     SOffset = StackOffset(Offset, MVT::nxv1i8) +
3744               StackOffset(SOffset.getBytes(), MVT::i8);
3745   else
3746     SOffset = StackOffset(Offset, MVT::i8) +
3747               StackOffset(SOffset.getScalableBytes(), MVT::nxv1i8);
3748   return AArch64FrameOffsetCanUpdate |
3749          (SOffset ? 0 : AArch64FrameOffsetIsLegal);
3750 }
3751 
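// Rewrite the frame-index operand of MI to use FrameReg plus Offset. Plain
// ADD(S)Xri is lowered directly through emitFrameOffset; other instructions
// have their immediate updated in place, switching to the unscaled variant
// when necessary. Returns true when the whole offset has been absorbed.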
3752 bool llvm::rewriteAArch64FrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
3753                                     unsigned FrameReg, StackOffset &Offset,
3754                                     const AArch64InstrInfo *TII) {
3755   unsigned Opcode = MI.getOpcode();
3756   unsigned ImmIdx = FrameRegIdx + 1;
3757 
3758   if (Opcode == AArch64::ADDSXri || Opcode == AArch64::ADDXri) {
3759     Offset += StackOffset(MI.getOperand(ImmIdx).getImm(), MVT::i8);
3760     emitFrameOffset(*MI.getParent(), MI, MI.getDebugLoc(),
3761                     MI.getOperand(0).getReg(), FrameReg, Offset, TII,
3762                     MachineInstr::NoFlags, (Opcode == AArch64::ADDSXri));
3763     MI.eraseFromParent();
3764     Offset = StackOffset();
3765     return true;
3766   }
3767 
3768   int64_t NewOffset;
3769   unsigned UnscaledOp;
3770   bool UseUnscaledOp;
3771   int Status = isAArch64FrameOffsetLegal(MI, Offset, &UseUnscaledOp,
3772                                          &UnscaledOp, &NewOffset);
3773   if (Status & AArch64FrameOffsetCanUpdate) {
3774     if (Status & AArch64FrameOffsetIsLegal)
3775       // Replace the FrameIndex with FrameReg.
3776       MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false);
3777     if (UseUnscaledOp)
3778       MI.setDesc(TII->get(UnscaledOp));
3779 
3780     MI.getOperand(ImmIdx).ChangeToImmediate(NewOffset);
3781     return !Offset;
3782   }
3783 
3784   return false;
3785 }
3786 
3787 void AArch64InstrInfo::getNoop(MCInst &NopInst) const {
3788   NopInst.setOpcode(AArch64::HINT);
3789   NopInst.addOperand(MCOperand::createImm(0));
3790 }
3791 
3792 // AArch64 supports MachineCombiner.
3793 bool AArch64InstrInfo::useMachineCombiner() const { return true; }
3794 
3795 // True when Opc sets the NZCV flags
3796 static bool isCombineInstrSettingFlag(unsigned Opc) {
3797   switch (Opc) {
3798   case AArch64::ADDSWrr:
3799   case AArch64::ADDSWri:
3800   case AArch64::ADDSXrr:
3801   case AArch64::ADDSXri:
3802   case AArch64::SUBSWrr:
3803   case AArch64::SUBSXrr:
3804   // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi.
3805   case AArch64::SUBSWri:
3806   case AArch64::SUBSXri:
3807     return true;
3808   default:
3809     break;
3810   }
3811   return false;
3812 }
3813 
3814 // 32b Opcodes that can be combined with a MUL
3815 static bool isCombineInstrCandidate32(unsigned Opc) {
3816   switch (Opc) {
3817   case AArch64::ADDWrr:
3818   case AArch64::ADDWri:
3819   case AArch64::SUBWrr:
3820   case AArch64::ADDSWrr:
3821   case AArch64::ADDSWri:
3822   case AArch64::SUBSWrr:
3823   // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi.
3824   case AArch64::SUBWri:
3825   case AArch64::SUBSWri:
3826     return true;
3827   default:
3828     break;
3829   }
3830   return false;
3831 }
3832 
3833 // 64b Opcodes that can be combined with a MUL
3834 static bool isCombineInstrCandidate64(unsigned Opc) {
3835   switch (Opc) {
3836   case AArch64::ADDXrr:
3837   case AArch64::ADDXri:
3838   case AArch64::SUBXrr:
3839   case AArch64::ADDSXrr:
3840   case AArch64::ADDSXri:
3841   case AArch64::SUBSXrr:
3842   // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi.
3843   case AArch64::SUBXri:
3844   case AArch64::SUBSXri:
3845   case AArch64::ADDv8i8:
3846   case AArch64::ADDv16i8:
3847   case AArch64::ADDv4i16:
3848   case AArch64::ADDv8i16:
3849   case AArch64::ADDv2i32:
3850   case AArch64::ADDv4i32:
3851   case AArch64::SUBv8i8:
3852   case AArch64::SUBv16i8:
3853   case AArch64::SUBv4i16:
3854   case AArch64::SUBv8i16:
3855   case AArch64::SUBv2i32:
3856   case AArch64::SUBv4i32:
3857     return true;
3858   default:
3859     break;
3860   }
3861   return false;
3862 }
3863 
3864 // FP Opcodes that can be combined with a FMUL
3865 static bool isCombineInstrCandidateFP(const MachineInstr &Inst) {
3866   switch (Inst.getOpcode()) {
3867   default:
3868     break;
3869   case AArch64::FADDHrr:
3870   case AArch64::FADDSrr:
3871   case AArch64::FADDDrr:
3872   case AArch64::FADDv4f16:
3873   case AArch64::FADDv8f16:
3874   case AArch64::FADDv2f32:
3875   case AArch64::FADDv2f64:
3876   case AArch64::FADDv4f32:
3877   case AArch64::FSUBHrr:
3878   case AArch64::FSUBSrr:
3879   case AArch64::FSUBDrr:
3880   case AArch64::FSUBv4f16:
3881   case AArch64::FSUBv8f16:
3882   case AArch64::FSUBv2f32:
3883   case AArch64::FSUBv2f64:
3884   case AArch64::FSUBv4f32:
3885     TargetOptions Options = Inst.getParent()->getParent()->getTarget().Options;
3886     return (Options.UnsafeFPMath ||
3887             Options.AllowFPOpFusion == FPOpFusion::Fast);
3888   }
3889   return false;
3890 }
3891 
3892 // Opcodes that can be combined with a MUL
3893 static bool isCombineInstrCandidate(unsigned Opc) {
3894   return (isCombineInstrCandidate32(Opc) || isCombineInstrCandidate64(Opc));
3895 }
3896 
3897 //
3898 // Utility routine that checks if \param MO is defined by an
3899 // \param CombineOpc instruction in the basic block \param MBB
3900 static bool canCombine(MachineBasicBlock &MBB, MachineOperand &MO,
3901                        unsigned CombineOpc, unsigned ZeroReg = 0,
3902                        bool CheckZeroReg = false) {
3903   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3904   MachineInstr *MI = nullptr;
3905 
3906   if (MO.isReg() && Register::isVirtualRegister(MO.getReg()))
3907     MI = MRI.getUniqueVRegDef(MO.getReg());
3908   // And it needs to be in the trace (otherwise, it won't have a depth).
3909   if (!MI || MI->getParent() != &MBB || (unsigned)MI->getOpcode() != CombineOpc)
3910     return false;
3911   // Must only be used by the user we combine with.
3912   if (!MRI.hasOneNonDBGUse(MI->getOperand(0).getReg()))
3913     return false;
3914 
3915   if (CheckZeroReg) {
3916     assert(MI->getNumOperands() >= 4 && MI->getOperand(0).isReg() &&
3917            MI->getOperand(1).isReg() && MI->getOperand(2).isReg() &&
3918            MI->getOperand(3).isReg() && "MAdd/MSub must have at least 4 regs");
3919     // The third input reg must be zero.
3920     if (MI->getOperand(3).getReg() != ZeroReg)
3921       return false;
3922   }
3923 
3924   return true;
3925 }
3926 
3927 //
3928 // Is \param MO defined by an integer multiply and can be combined?
3929 static bool canCombineWithMUL(MachineBasicBlock &MBB, MachineOperand &MO,
3930                               unsigned MulOpc, unsigned ZeroReg) {
3931   return canCombine(MBB, MO, MulOpc, ZeroReg, true);
3932 }
3933 
3934 //
3935 // Is \param MO defined by a floating-point multiply and can be combined?
3936 static bool canCombineWithFMUL(MachineBasicBlock &MBB, MachineOperand &MO,
3937                                unsigned MulOpc) {
3938   return canCombine(MBB, MO, MulOpc);
3939 }
3940 
3941 // TODO: There are many more machine instruction opcodes to match:
3942 //       1. Other data types (integer, vectors)
3943 //       2. Other math / logic operations (xor, or)
3944 //       3. Other forms of the same operation (intrinsics and other variants)
3945 bool AArch64InstrInfo::isAssociativeAndCommutative(
3946     const MachineInstr &Inst) const {
3947   switch (Inst.getOpcode()) {
3948   case AArch64::FADDDrr:
3949   case AArch64::FADDSrr:
3950   case AArch64::FADDv2f32:
3951   case AArch64::FADDv2f64:
3952   case AArch64::FADDv4f32:
3953   case AArch64::FMULDrr:
3954   case AArch64::FMULSrr:
3955   case AArch64::FMULX32:
3956   case AArch64::FMULX64:
3957   case AArch64::FMULXv2f32:
3958   case AArch64::FMULXv2f64:
3959   case AArch64::FMULXv4f32:
3960   case AArch64::FMULv2f32:
3961   case AArch64::FMULv2f64:
3962   case AArch64::FMULv4f32:
3963     return Inst.getParent()->getParent()->getTarget().Options.UnsafeFPMath;
3964   default:
3965     return false;
3966   }
3967 }
3968 
3969 /// Find instructions that can be turned into madd.
3970 static bool getMaddPatterns(MachineInstr &Root,
3971                             SmallVectorImpl<MachineCombinerPattern> &Patterns) {
3972   unsigned Opc = Root.getOpcode();
3973   MachineBasicBlock &MBB = *Root.getParent();
3974   bool Found = false;
3975 
3976   if (!isCombineInstrCandidate(Opc))
3977     return false;
3978   if (isCombineInstrSettingFlag(Opc)) {
3979     int Cmp_NZCV = Root.findRegisterDefOperandIdx(AArch64::NZCV, true);
3980     // When NZCV is live, bail out.
3981     if (Cmp_NZCV == -1)
3982       return false;
3983     unsigned NewOpc = convertToNonFlagSettingOpc(Root);
3984     // When the opcode can't be changed, bail out.
3985     // CHECKME: do we miss any cases for opcode conversion?
3986     if (NewOpc == Opc)
3987       return false;
3988     Opc = NewOpc;
3989   }
3990 
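  // A scalar multiply is canonicalized as MADD with the zero register as the
  // addend, so setFound checks for that form; setVFound matches the plain
  // vector MUL opcodes.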
3991   auto setFound = [&](int Opcode, int Operand, unsigned ZeroReg,
3992                       MachineCombinerPattern Pattern) {
3993     if (canCombineWithMUL(MBB, Root.getOperand(Operand), Opcode, ZeroReg)) {
3994       Patterns.push_back(Pattern);
3995       Found = true;
3996     }
3997   };
3998 
3999   auto setVFound = [&](int Opcode, int Operand, MachineCombinerPattern Pattern) {
4000     if (canCombine(MBB, Root.getOperand(Operand), Opcode)) {
4001       Patterns.push_back(Pattern);
4002       Found = true;
4003     }
4004   };
4005 
4006   typedef MachineCombinerPattern MCP;
4007 
4008   switch (Opc) {
4009   default:
4010     break;
4011   case AArch64::ADDWrr:
4012     assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() &&
4013            "ADDWrr does not have register operands");
4014     setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULADDW_OP1);
4015     setFound(AArch64::MADDWrrr, 2, AArch64::WZR, MCP::MULADDW_OP2);
4016     break;
4017   case AArch64::ADDXrr:
4018     setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULADDX_OP1);
4019     setFound(AArch64::MADDXrrr, 2, AArch64::XZR, MCP::MULADDX_OP2);
4020     break;
4021   case AArch64::SUBWrr:
4022     setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULSUBW_OP1);
4023     setFound(AArch64::MADDWrrr, 2, AArch64::WZR, MCP::MULSUBW_OP2);
4024     break;
4025   case AArch64::SUBXrr:
4026     setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULSUBX_OP1);
4027     setFound(AArch64::MADDXrrr, 2, AArch64::XZR, MCP::MULSUBX_OP2);
4028     break;
4029   case AArch64::ADDWri:
4030     setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULADDWI_OP1);
4031     break;
4032   case AArch64::ADDXri:
4033     setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULADDXI_OP1);
4034     break;
4035   case AArch64::SUBWri:
4036     setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULSUBWI_OP1);
4037     break;
4038   case AArch64::SUBXri:
4039     setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULSUBXI_OP1);
4040     break;
4041   case AArch64::ADDv8i8:
4042     setVFound(AArch64::MULv8i8, 1, MCP::MULADDv8i8_OP1);
4043     setVFound(AArch64::MULv8i8, 2, MCP::MULADDv8i8_OP2);
4044     break;
4045   case AArch64::ADDv16i8:
4046     setVFound(AArch64::MULv16i8, 1, MCP::MULADDv16i8_OP1);
4047     setVFound(AArch64::MULv16i8, 2, MCP::MULADDv16i8_OP2);
4048     break;
4049   case AArch64::ADDv4i16:
4050     setVFound(AArch64::MULv4i16, 1, MCP::MULADDv4i16_OP1);
4051     setVFound(AArch64::MULv4i16, 2, MCP::MULADDv4i16_OP2);
4052     setVFound(AArch64::MULv4i16_indexed, 1, MCP::MULADDv4i16_indexed_OP1);
4053     setVFound(AArch64::MULv4i16_indexed, 2, MCP::MULADDv4i16_indexed_OP2);
4054     break;
4055   case AArch64::ADDv8i16:
4056     setVFound(AArch64::MULv8i16, 1, MCP::MULADDv8i16_OP1);
4057     setVFound(AArch64::MULv8i16, 2, MCP::MULADDv8i16_OP2);
4058     setVFound(AArch64::MULv8i16_indexed, 1, MCP::MULADDv8i16_indexed_OP1);
4059     setVFound(AArch64::MULv8i16_indexed, 2, MCP::MULADDv8i16_indexed_OP2);
4060     break;
4061   case AArch64::ADDv2i32:
4062     setVFound(AArch64::MULv2i32, 1, MCP::MULADDv2i32_OP1);
4063     setVFound(AArch64::MULv2i32, 2, MCP::MULADDv2i32_OP2);
4064     setVFound(AArch64::MULv2i32_indexed, 1, MCP::MULADDv2i32_indexed_OP1);
4065     setVFound(AArch64::MULv2i32_indexed, 2, MCP::MULADDv2i32_indexed_OP2);
4066     break;
4067   case AArch64::ADDv4i32:
4068     setVFound(AArch64::MULv4i32, 1, MCP::MULADDv4i32_OP1);
4069     setVFound(AArch64::MULv4i32, 2, MCP::MULADDv4i32_OP2);
4070     setVFound(AArch64::MULv4i32_indexed, 1, MCP::MULADDv4i32_indexed_OP1);
4071     setVFound(AArch64::MULv4i32_indexed, 2, MCP::MULADDv4i32_indexed_OP2);
4072     break;
4073   case AArch64::SUBv8i8:
4074     setVFound(AArch64::MULv8i8, 1, MCP::MULSUBv8i8_OP1);
4075     setVFound(AArch64::MULv8i8, 2, MCP::MULSUBv8i8_OP2);
4076     break;
4077   case AArch64::SUBv16i8:
4078     setVFound(AArch64::MULv16i8, 1, MCP::MULSUBv16i8_OP1);
4079     setVFound(AArch64::MULv16i8, 2, MCP::MULSUBv16i8_OP2);
4080     break;
4081   case AArch64::SUBv4i16:
4082     setVFound(AArch64::MULv4i16, 1, MCP::MULSUBv4i16_OP1);
4083     setVFound(AArch64::MULv4i16, 2, MCP::MULSUBv4i16_OP2);
4084     setVFound(AArch64::MULv4i16_indexed, 1, MCP::MULSUBv4i16_indexed_OP1);
4085     setVFound(AArch64::MULv4i16_indexed, 2, MCP::MULSUBv4i16_indexed_OP2);
4086     break;
4087   case AArch64::SUBv8i16:
4088     setVFound(AArch64::MULv8i16, 1, MCP::MULSUBv8i16_OP1);
4089     setVFound(AArch64::MULv8i16, 2, MCP::MULSUBv8i16_OP2);
4090     setVFound(AArch64::MULv8i16_indexed, 1, MCP::MULSUBv8i16_indexed_OP1);
4091     setVFound(AArch64::MULv8i16_indexed, 2, MCP::MULSUBv8i16_indexed_OP2);
4092     break;
4093   case AArch64::SUBv2i32:
4094     setVFound(AArch64::MULv2i32, 1, MCP::MULSUBv2i32_OP1);
4095     setVFound(AArch64::MULv2i32, 2, MCP::MULSUBv2i32_OP2);
4096     setVFound(AArch64::MULv2i32_indexed, 1, MCP::MULSUBv2i32_indexed_OP1);
4097     setVFound(AArch64::MULv2i32_indexed, 2, MCP::MULSUBv2i32_indexed_OP2);
4098     break;
4099   case AArch64::SUBv4i32:
4100     setVFound(AArch64::MULv4i32, 1, MCP::MULSUBv4i32_OP1);
4101     setVFound(AArch64::MULv4i32, 2, MCP::MULSUBv4i32_OP2);
4102     setVFound(AArch64::MULv4i32_indexed, 1, MCP::MULSUBv4i32_indexed_OP1);
4103     setVFound(AArch64::MULv4i32_indexed, 2, MCP::MULSUBv4i32_indexed_OP2);
4104     break;
4105   }
4106   return Found;
4107 }
4108 /// Floating-Point Support
4109 
4110 /// Find floating-point instructions that can be combined into an FMA.
4111 static bool getFMAPatterns(MachineInstr &Root,
4112                            SmallVectorImpl<MachineCombinerPattern> &Patterns) {
4113 
4114   if (!isCombineInstrCandidateFP(Root))
4115     return false;
4116 
4117   MachineBasicBlock &MBB = *Root.getParent();
4118   bool Found = false;
4119 
4120   auto Match = [&](int Opcode, int Operand,
4121                    MachineCombinerPattern Pattern) -> bool {
4122     if (canCombineWithFMUL(MBB, Root.getOperand(Operand), Opcode)) {
4123       Patterns.push_back(Pattern);
4124       return true;
4125     }
4126     return false;
4127   };
4128 
4129   typedef MachineCombinerPattern MCP;
4130 
4131   switch (Root.getOpcode()) {
4132   default:
4133     assert(false && "Unsupported FP instruction in combiner\n");
4134     break;
4135   case AArch64::FADDHrr:
4136     assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() &&
4137            "FADDHrr does not have register operands");
4138 
4139     Found  = Match(AArch64::FMULHrr, 1, MCP::FMULADDH_OP1);
4140     Found |= Match(AArch64::FMULHrr, 2, MCP::FMULADDH_OP2);
4141     break;
4142   case AArch64::FADDSrr:
4143     assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() &&
4144            "FADDSrr does not have register operands");
4145 
4146     Found |= Match(AArch64::FMULSrr, 1, MCP::FMULADDS_OP1) ||
4147              Match(AArch64::FMULv1i32_indexed, 1, MCP::FMLAv1i32_indexed_OP1);
4148 
4149     Found |= Match(AArch64::FMULSrr, 2, MCP::FMULADDS_OP2) ||
4150              Match(AArch64::FMULv1i32_indexed, 2, MCP::FMLAv1i32_indexed_OP2);
4151     break;
4152   case AArch64::FADDDrr:
4153     Found |= Match(AArch64::FMULDrr, 1, MCP::FMULADDD_OP1) ||
4154              Match(AArch64::FMULv1i64_indexed, 1, MCP::FMLAv1i64_indexed_OP1);
4155 
4156     Found |= Match(AArch64::FMULDrr, 2, MCP::FMULADDD_OP2) ||
4157              Match(AArch64::FMULv1i64_indexed, 2, MCP::FMLAv1i64_indexed_OP2);
4158     break;
4159   case AArch64::FADDv4f16:
4160     Found |= Match(AArch64::FMULv4i16_indexed, 1, MCP::FMLAv4i16_indexed_OP1) ||
4161              Match(AArch64::FMULv4f16, 1, MCP::FMLAv4f16_OP1);
4162 
4163     Found |= Match(AArch64::FMULv4i16_indexed, 2, MCP::FMLAv4i16_indexed_OP2) ||
4164              Match(AArch64::FMULv4f16, 2, MCP::FMLAv4f16_OP2);
4165     break;
4166   case AArch64::FADDv8f16:
4167     Found |= Match(AArch64::FMULv8i16_indexed, 1, MCP::FMLAv8i16_indexed_OP1) ||
4168              Match(AArch64::FMULv8f16, 1, MCP::FMLAv8f16_OP1);
4169 
4170     Found |= Match(AArch64::FMULv8i16_indexed, 2, MCP::FMLAv8i16_indexed_OP2) ||
4171              Match(AArch64::FMULv8f16, 2, MCP::FMLAv8f16_OP2);
4172     break;
4173   case AArch64::FADDv2f32:
4174     Found |= Match(AArch64::FMULv2i32_indexed, 1, MCP::FMLAv2i32_indexed_OP1) ||
4175              Match(AArch64::FMULv2f32, 1, MCP::FMLAv2f32_OP1);
4176 
4177     Found |= Match(AArch64::FMULv2i32_indexed, 2, MCP::FMLAv2i32_indexed_OP2) ||
4178              Match(AArch64::FMULv2f32, 2, MCP::FMLAv2f32_OP2);
4179     break;
4180   case AArch64::FADDv2f64:
4181     Found |= Match(AArch64::FMULv2i64_indexed, 1, MCP::FMLAv2i64_indexed_OP1) ||
4182              Match(AArch64::FMULv2f64, 1, MCP::FMLAv2f64_OP1);
4183 
4184     Found |= Match(AArch64::FMULv2i64_indexed, 2, MCP::FMLAv2i64_indexed_OP2) ||
4185              Match(AArch64::FMULv2f64, 2, MCP::FMLAv2f64_OP2);
4186     break;
4187   case AArch64::FADDv4f32:
4188     Found |= Match(AArch64::FMULv4i32_indexed, 1, MCP::FMLAv4i32_indexed_OP1) ||
4189              Match(AArch64::FMULv4f32, 1, MCP::FMLAv4f32_OP1);
4190 
4191     Found |= Match(AArch64::FMULv4i32_indexed, 2, MCP::FMLAv4i32_indexed_OP2) ||
4192              Match(AArch64::FMULv4f32, 2, MCP::FMLAv4f32_OP2);
4193     break;
4194   case AArch64::FSUBHrr:
4195     Found  = Match(AArch64::FMULHrr, 1, MCP::FMULSUBH_OP1);
4196     Found |= Match(AArch64::FMULHrr, 2, MCP::FMULSUBH_OP2);
4197     Found |= Match(AArch64::FNMULHrr, 1, MCP::FNMULSUBH_OP1);
4198     break;
4199   case AArch64::FSUBSrr:
4200     Found = Match(AArch64::FMULSrr, 1, MCP::FMULSUBS_OP1);
4201 
4202     Found |= Match(AArch64::FMULSrr, 2, MCP::FMULSUBS_OP2) ||
4203              Match(AArch64::FMULv1i32_indexed, 2, MCP::FMLSv1i32_indexed_OP2);
4204 
4205     Found |= Match(AArch64::FNMULSrr, 1, MCP::FNMULSUBS_OP1);
4206     break;
4207   case AArch64::FSUBDrr:
4208     Found = Match(AArch64::FMULDrr, 1, MCP::FMULSUBD_OP1);
4209 
4210     Found |= Match(AArch64::FMULDrr, 2, MCP::FMULSUBD_OP2) ||
4211              Match(AArch64::FMULv1i64_indexed, 2, MCP::FMLSv1i64_indexed_OP2);
4212 
4213     Found |= Match(AArch64::FNMULDrr, 1, MCP::FNMULSUBD_OP1);
4214     break;
4215   case AArch64::FSUBv4f16:
4216     Found |= Match(AArch64::FMULv4i16_indexed, 2, MCP::FMLSv4i16_indexed_OP2) ||
4217              Match(AArch64::FMULv4f16, 2, MCP::FMLSv4f16_OP2);
4218 
4219     Found |= Match(AArch64::FMULv4i16_indexed, 1, MCP::FMLSv4i16_indexed_OP1) ||
4220              Match(AArch64::FMULv4f16, 1, MCP::FMLSv4f16_OP1);
4221     break;
4222   case AArch64::FSUBv8f16:
4223     Found |= Match(AArch64::FMULv8i16_indexed, 2, MCP::FMLSv8i16_indexed_OP2) ||
4224              Match(AArch64::FMULv8f16, 2, MCP::FMLSv8f16_OP2);
4225 
4226     Found |= Match(AArch64::FMULv8i16_indexed, 1, MCP::FMLSv8i16_indexed_OP1) ||
4227              Match(AArch64::FMULv8f16, 1, MCP::FMLSv8f16_OP1);
4228     break;
4229   case AArch64::FSUBv2f32:
4230     Found |= Match(AArch64::FMULv2i32_indexed, 2, MCP::FMLSv2i32_indexed_OP2) ||
4231              Match(AArch64::FMULv2f32, 2, MCP::FMLSv2f32_OP2);
4232 
4233     Found |= Match(AArch64::FMULv2i32_indexed, 1, MCP::FMLSv2i32_indexed_OP1) ||
4234              Match(AArch64::FMULv2f32, 1, MCP::FMLSv2f32_OP1);
4235     break;
4236   case AArch64::FSUBv2f64:
4237     Found |= Match(AArch64::FMULv2i64_indexed, 2, MCP::FMLSv2i64_indexed_OP2) ||
4238              Match(AArch64::FMULv2f64, 2, MCP::FMLSv2f64_OP2);
4239 
4240     Found |= Match(AArch64::FMULv2i64_indexed, 1, MCP::FMLSv2i64_indexed_OP1) ||
4241              Match(AArch64::FMULv2f64, 1, MCP::FMLSv2f64_OP1);
4242     break;
4243   case AArch64::FSUBv4f32:
4244     Found |= Match(AArch64::FMULv4i32_indexed, 2, MCP::FMLSv4i32_indexed_OP2) ||
4245              Match(AArch64::FMULv4f32, 2, MCP::FMLSv4f32_OP2);
4246 
4247     Found |= Match(AArch64::FMULv4i32_indexed, 1, MCP::FMLSv4i32_indexed_OP1) ||
4248              Match(AArch64::FMULv4f32, 1, MCP::FMLSv4f32_OP1);
4249     break;
4250   }
4251   return Found;
4252 }
4253 
4254 /// Return true when a code sequence can improve throughput. It
4255 /// should be called only for instructions in loops.
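/// For example (an illustrative sketch, not tied to a particular subtarget
/// scheduling model): inside a loop,
///   fmul v0.4s, v1.4s, v2.4s
///   fadd v3.4s, v3.4s, v0.4s
/// combined into
///   fmla v3.4s, v1.4s, v2.4s
/// issues one instruction per iteration instead of two, which can improve
/// throughput even when the critical-path latency does not shrink.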
4256 /// \param Pattern - combiner pattern
4257 bool AArch64InstrInfo::isThroughputPattern(
4258     MachineCombinerPattern Pattern) const {
4259   switch (Pattern) {
4260   default:
4261     break;
4262   case MachineCombinerPattern::FMULADDH_OP1:
4263   case MachineCombinerPattern::FMULADDH_OP2:
4264   case MachineCombinerPattern::FMULSUBH_OP1:
4265   case MachineCombinerPattern::FMULSUBH_OP2:
4266   case MachineCombinerPattern::FMULADDS_OP1:
4267   case MachineCombinerPattern::FMULADDS_OP2:
4268   case MachineCombinerPattern::FMULSUBS_OP1:
4269   case MachineCombinerPattern::FMULSUBS_OP2:
4270   case MachineCombinerPattern::FMULADDD_OP1:
4271   case MachineCombinerPattern::FMULADDD_OP2:
4272   case MachineCombinerPattern::FMULSUBD_OP1:
4273   case MachineCombinerPattern::FMULSUBD_OP2:
4274   case MachineCombinerPattern::FNMULSUBH_OP1:
4275   case MachineCombinerPattern::FNMULSUBS_OP1:
4276   case MachineCombinerPattern::FNMULSUBD_OP1:
4277   case MachineCombinerPattern::FMLAv4i16_indexed_OP1:
4278   case MachineCombinerPattern::FMLAv4i16_indexed_OP2:
4279   case MachineCombinerPattern::FMLAv8i16_indexed_OP1:
4280   case MachineCombinerPattern::FMLAv8i16_indexed_OP2:
4281   case MachineCombinerPattern::FMLAv1i32_indexed_OP1:
4282   case MachineCombinerPattern::FMLAv1i32_indexed_OP2:
4283   case MachineCombinerPattern::FMLAv1i64_indexed_OP1:
4284   case MachineCombinerPattern::FMLAv1i64_indexed_OP2:
4285   case MachineCombinerPattern::FMLAv4f16_OP2:
4286   case MachineCombinerPattern::FMLAv4f16_OP1:
4287   case MachineCombinerPattern::FMLAv8f16_OP1:
4288   case MachineCombinerPattern::FMLAv8f16_OP2:
4289   case MachineCombinerPattern::FMLAv2f32_OP2:
4290   case MachineCombinerPattern::FMLAv2f32_OP1:
4291   case MachineCombinerPattern::FMLAv2f64_OP1:
4292   case MachineCombinerPattern::FMLAv2f64_OP2:
4293   case MachineCombinerPattern::FMLAv2i32_indexed_OP1:
4294   case MachineCombinerPattern::FMLAv2i32_indexed_OP2:
4295   case MachineCombinerPattern::FMLAv2i64_indexed_OP1:
4296   case MachineCombinerPattern::FMLAv2i64_indexed_OP2:
4297   case MachineCombinerPattern::FMLAv4f32_OP1:
4298   case MachineCombinerPattern::FMLAv4f32_OP2:
4299   case MachineCombinerPattern::FMLAv4i32_indexed_OP1:
4300   case MachineCombinerPattern::FMLAv4i32_indexed_OP2:
4301   case MachineCombinerPattern::FMLSv4i16_indexed_OP1:
4302   case MachineCombinerPattern::FMLSv4i16_indexed_OP2:
4303   case MachineCombinerPattern::FMLSv8i16_indexed_OP1:
4304   case MachineCombinerPattern::FMLSv8i16_indexed_OP2:
4305   case MachineCombinerPattern::FMLSv1i32_indexed_OP2:
4306   case MachineCombinerPattern::FMLSv1i64_indexed_OP2:
4307   case MachineCombinerPattern::FMLSv2i32_indexed_OP2:
4308   case MachineCombinerPattern::FMLSv2i64_indexed_OP2:
4309   case MachineCombinerPattern::FMLSv4f16_OP1:
4310   case MachineCombinerPattern::FMLSv4f16_OP2:
4311   case MachineCombinerPattern::FMLSv8f16_OP1:
4312   case MachineCombinerPattern::FMLSv8f16_OP2:
4313   case MachineCombinerPattern::FMLSv2f32_OP2:
4314   case MachineCombinerPattern::FMLSv2f64_OP2:
4315   case MachineCombinerPattern::FMLSv4i32_indexed_OP2:
4316   case MachineCombinerPattern::FMLSv4f32_OP2:
4317   case MachineCombinerPattern::MULADDv8i8_OP1:
4318   case MachineCombinerPattern::MULADDv8i8_OP2:
4319   case MachineCombinerPattern::MULADDv16i8_OP1:
4320   case MachineCombinerPattern::MULADDv16i8_OP2:
4321   case MachineCombinerPattern::MULADDv4i16_OP1:
4322   case MachineCombinerPattern::MULADDv4i16_OP2:
4323   case MachineCombinerPattern::MULADDv8i16_OP1:
4324   case MachineCombinerPattern::MULADDv8i16_OP2:
4325   case MachineCombinerPattern::MULADDv2i32_OP1:
4326   case MachineCombinerPattern::MULADDv2i32_OP2:
4327   case MachineCombinerPattern::MULADDv4i32_OP1:
4328   case MachineCombinerPattern::MULADDv4i32_OP2:
4329   case MachineCombinerPattern::MULSUBv8i8_OP1:
4330   case MachineCombinerPattern::MULSUBv8i8_OP2:
4331   case MachineCombinerPattern::MULSUBv16i8_OP1:
4332   case MachineCombinerPattern::MULSUBv16i8_OP2:
4333   case MachineCombinerPattern::MULSUBv4i16_OP1:
4334   case MachineCombinerPattern::MULSUBv4i16_OP2:
4335   case MachineCombinerPattern::MULSUBv8i16_OP1:
4336   case MachineCombinerPattern::MULSUBv8i16_OP2:
4337   case MachineCombinerPattern::MULSUBv2i32_OP1:
4338   case MachineCombinerPattern::MULSUBv2i32_OP2:
4339   case MachineCombinerPattern::MULSUBv4i32_OP1:
4340   case MachineCombinerPattern::MULSUBv4i32_OP2:
4341   case MachineCombinerPattern::MULADDv4i16_indexed_OP1:
4342   case MachineCombinerPattern::MULADDv4i16_indexed_OP2:
4343   case MachineCombinerPattern::MULADDv8i16_indexed_OP1:
4344   case MachineCombinerPattern::MULADDv8i16_indexed_OP2:
4345   case MachineCombinerPattern::MULADDv2i32_indexed_OP1:
4346   case MachineCombinerPattern::MULADDv2i32_indexed_OP2:
4347   case MachineCombinerPattern::MULADDv4i32_indexed_OP1:
4348   case MachineCombinerPattern::MULADDv4i32_indexed_OP2:
4349   case MachineCombinerPattern::MULSUBv4i16_indexed_OP1:
4350   case MachineCombinerPattern::MULSUBv4i16_indexed_OP2:
4351   case MachineCombinerPattern::MULSUBv8i16_indexed_OP1:
4352   case MachineCombinerPattern::MULSUBv8i16_indexed_OP2:
4353   case MachineCombinerPattern::MULSUBv2i32_indexed_OP1:
4354   case MachineCombinerPattern::MULSUBv2i32_indexed_OP2:
4355   case MachineCombinerPattern::MULSUBv4i32_indexed_OP1:
4356   case MachineCombinerPattern::MULSUBv4i32_indexed_OP2:
4357     return true;
4358   } // end switch (Pattern)
4359   return false;
4360 }
4361 /// Return true when there is potentially a faster code sequence for an
4362 /// instruction chain ending in \p Root. All potential patterns are listed in
4363 /// the \p Patterns vector. Patterns should be sorted in priority order since
4364 /// the pattern evaluator stops checking as soon as it finds a faster sequence.
4365 
4366 bool AArch64InstrInfo::getMachineCombinerPatterns(
4367     MachineInstr &Root,
4368     SmallVectorImpl<MachineCombinerPattern> &Patterns) const {
4369   // Integer patterns
4370   if (getMaddPatterns(Root, Patterns))
4371     return true;
4372   // Floating point patterns
4373   if (getFMAPatterns(Root, Patterns))
4374     return true;
4375 
4376   return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns);
4377 }
4378 
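// FMAInstKind describes the operand layout of the fused instruction that
// genFusedMultiply emits below: Default is the scalar MADD/FMADD layout
// (multiply operands first, accumulator last), while Indexed and Accumulator
// are the vector layouts (accumulator first); Indexed additionally copies the
// lane immediate from the original F|MUL.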
4379 enum class FMAInstKind { Default, Indexed, Accumulator };
4380 /// genFusedMultiply - Generate fused multiply instructions.
4381 /// This function supports both integer and floating point instructions.
4382 /// A typical example:
4383 ///  F|MUL I=A,B,0
4384 ///  F|ADD R,I,C
4385 ///  ==> F|MADD R,A,B,C
4386 /// \param MF Containing MachineFunction
4387 /// \param MRI Register information
4388 /// \param TII Target information
4389 /// \param Root is the F|ADD instruction
4390 /// \param [out] InsInstrs is a vector of machine instructions and will
4391 /// contain the generated madd instruction
4392 /// \param IdxMulOpd is index of operand in Root that is the result of
4393 /// the F|MUL. In the example above IdxMulOpd is 1.
4394 /// \param MaddOpc the opcode of the f|madd instruction
4395 /// \param RC Register class of operands
4396 /// \param kind the kind of FMA instruction (addressing mode) to be generated
4397 /// \param ReplacedAddend is the result register from the instruction
4398 /// replacing the non-combined operand, if any.
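///
/// A sketch of the FMAInstKind::Indexed case (register names are illustrative
/// only): with IdxMulOpd == 2,
///   %mul = FMULv4i32_indexed %a, %b, lane
///   %r   = FADDv4f32 %acc, %mul
///   ==>  %r = FMLAv4i32_indexed %acc, %a, %b, lane
/// i.e. the accumulator is placed first and the lane immediate of the F|MUL
/// is carried over unchanged.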
4399 static MachineInstr *
4400 genFusedMultiply(MachineFunction &MF, MachineRegisterInfo &MRI,
4401                  const TargetInstrInfo *TII, MachineInstr &Root,
4402                  SmallVectorImpl<MachineInstr *> &InsInstrs, unsigned IdxMulOpd,
4403                  unsigned MaddOpc, const TargetRegisterClass *RC,
4404                  FMAInstKind kind = FMAInstKind::Default,
4405                  const Register *ReplacedAddend = nullptr) {
4406   assert(IdxMulOpd == 1 || IdxMulOpd == 2);
4407 
4408   unsigned IdxOtherOpd = IdxMulOpd == 1 ? 2 : 1;
4409   MachineInstr *MUL = MRI.getUniqueVRegDef(Root.getOperand(IdxMulOpd).getReg());
4410   Register ResultReg = Root.getOperand(0).getReg();
4411   Register SrcReg0 = MUL->getOperand(1).getReg();
4412   bool Src0IsKill = MUL->getOperand(1).isKill();
4413   Register SrcReg1 = MUL->getOperand(2).getReg();
4414   bool Src1IsKill = MUL->getOperand(2).isKill();
4415 
4416   unsigned SrcReg2;
4417   bool Src2IsKill;
4418   if (ReplacedAddend) {
4419     // If we just generated a new addend, this instruction must be its only use.
4420     SrcReg2 = *ReplacedAddend;
4421     Src2IsKill = true;
4422   } else {
4423     SrcReg2 = Root.getOperand(IdxOtherOpd).getReg();
4424     Src2IsKill = Root.getOperand(IdxOtherOpd).isKill();
4425   }
4426 
4427   if (Register::isVirtualRegister(ResultReg))
4428     MRI.constrainRegClass(ResultReg, RC);
4429   if (Register::isVirtualRegister(SrcReg0))
4430     MRI.constrainRegClass(SrcReg0, RC);
4431   if (Register::isVirtualRegister(SrcReg1))
4432     MRI.constrainRegClass(SrcReg1, RC);
4433   if (Register::isVirtualRegister(SrcReg2))
4434     MRI.constrainRegClass(SrcReg2, RC);
4435 
4436   MachineInstrBuilder MIB;
4437   if (kind == FMAInstKind::Default)
4438     MIB = BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc), ResultReg)
4439               .addReg(SrcReg0, getKillRegState(Src0IsKill))
4440               .addReg(SrcReg1, getKillRegState(Src1IsKill))
4441               .addReg(SrcReg2, getKillRegState(Src2IsKill));
4442   else if (kind == FMAInstKind::Indexed)
4443     MIB = BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc), ResultReg)
4444               .addReg(SrcReg2, getKillRegState(Src2IsKill))
4445               .addReg(SrcReg0, getKillRegState(Src0IsKill))
4446               .addReg(SrcReg1, getKillRegState(Src1IsKill))
4447               .addImm(MUL->getOperand(3).getImm());
4448   else if (kind == FMAInstKind::Accumulator)
4449     MIB = BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc), ResultReg)
4450               .addReg(SrcReg2, getKillRegState(Src2IsKill))
4451               .addReg(SrcReg0, getKillRegState(Src0IsKill))
4452               .addReg(SrcReg1, getKillRegState(Src1IsKill));
4453   else
4454     llvm_unreachable("Invalid FMA instruction kind");
4455   // Insert the fused instruction (MADD, FMADD, FMSUB, FMLA, FMLS)
4456   InsInstrs.push_back(MIB);
4457   return MUL;
4458 }
4459 
4460 /// genFusedMultiplyAcc - Helper to generate fused multiply accumulate
4461 /// instructions.
4462 ///
4463 /// \see genFusedMultiply
4464 static MachineInstr *genFusedMultiplyAcc(
4465     MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII,
4466     MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs,
4467     unsigned IdxMulOpd, unsigned MaddOpc, const TargetRegisterClass *RC) {
4468   return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
4469                           FMAInstKind::Accumulator);
4470 }
4471 
4472 /// genNeg - Helper to generate an intermediate negation of the second operand
4473 /// of Root
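///
/// A sketch of how this is used for a vector MULSUB..._OP1 pattern (register
/// names are illustrative only):
///   %mul = MULv4i16 %a, %b
///   %r   = SUBv4i16 %mul, %c
///   ==>  %neg = NEGv4i16 %c
///        %r   = MLAv4i16 %neg, %a, %b    // %r = -%c + %a * %b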
4474 static Register genNeg(MachineFunction &MF, MachineRegisterInfo &MRI,
4475                        const TargetInstrInfo *TII, MachineInstr &Root,
4476                        SmallVectorImpl<MachineInstr *> &InsInstrs,
4477                        DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
4478                        unsigned MnegOpc, const TargetRegisterClass *RC) {
4479   Register NewVR = MRI.createVirtualRegister(RC);
4480   MachineInstrBuilder MIB =
4481       BuildMI(MF, Root.getDebugLoc(), TII->get(MnegOpc), NewVR)
4482           .add(Root.getOperand(2));
4483   InsInstrs.push_back(MIB);
4484 
4485   assert(InstrIdxForVirtReg.empty());
4486   InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
4487 
4488   return NewVR;
4489 }
4490 
4491 /// genFusedMultiplyAccNeg - Helper to generate fused multiply accumulate
4492 /// instructions with an additional negation of the accumulator
4493 static MachineInstr *genFusedMultiplyAccNeg(
4494     MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII,
4495     MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs,
4496     DenseMap<unsigned, unsigned> &InstrIdxForVirtReg, unsigned IdxMulOpd,
4497     unsigned MaddOpc, unsigned MnegOpc, const TargetRegisterClass *RC) {
4498   assert(IdxMulOpd == 1);
4499 
4500   Register NewVR =
4501       genNeg(MF, MRI, TII, Root, InsInstrs, InstrIdxForVirtReg, MnegOpc, RC);
4502   return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
4503                           FMAInstKind::Accumulator, &NewVR);
4504 }
4505 
4506 /// genFusedMultiplyIdx - Helper to generate fused multiply (indexed)
4507 /// instructions.
4508 ///
4509 /// \see genFusedMultiply
4510 static MachineInstr *genFusedMultiplyIdx(
4511     MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII,
4512     MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs,
4513     unsigned IdxMulOpd, unsigned MaddOpc, const TargetRegisterClass *RC) {
4514   return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
4515                           FMAInstKind::Indexed);
4516 }
4517 
4518 /// genFusedMultiplyIdxNeg - Helper to generate fused multiply (indexed)
4519 /// instructions with an additional negation of the accumulator
4520 static MachineInstr *genFusedMultiplyIdxNeg(
4521     MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII,
4522     MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs,
4523     DenseMap<unsigned, unsigned> &InstrIdxForVirtReg, unsigned IdxMulOpd,
4524     unsigned MaddOpc, unsigned MnegOpc, const TargetRegisterClass *RC) {
4525   assert(IdxMulOpd == 1);
4526 
4527   Register NewVR =
4528       genNeg(MF, MRI, TII, Root, InsInstrs, InstrIdxForVirtReg, MnegOpc, RC);
4529 
4530   return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
4531                           FMAInstKind::Indexed, &NewVR);
4532 }
4533 
4534 /// genMaddR - Generate madd instruction and combine mul and add using
4535 /// an extra virtual register
4536 /// Example - an ADD intermediate needs to be stored in a register:
4537 ///   MUL I=A,B,0
4538 ///   ADD R,I,Imm
4539 ///   ==> ORR  V, ZR, Imm
4540 ///   ==> MADD R,A,B,V
4541 /// \param MF Containing MachineFunction
4542 /// \param MRI Register information
4543 /// \param TII Target information
4544 /// \param Root is the ADD instruction
4545 /// \param [out] InsInstrs is a vector of machine instructions and will
4546 /// contain the generated madd instruction
4547 /// \param IdxMulOpd is index of operand in Root that is the result of
4548 /// the MUL. In the example above IdxMulOpd is 1.
4549 /// \param MaddOpc the opcode of the madd instruction
4550 /// \param VR is a virtual register that holds the value of an ADD operand
4551 /// (V in the example above).
4552 /// \param RC Register class of operands
4553 static MachineInstr *genMaddR(MachineFunction &MF, MachineRegisterInfo &MRI,
4554                               const TargetInstrInfo *TII, MachineInstr &Root,
4555                               SmallVectorImpl<MachineInstr *> &InsInstrs,
4556                               unsigned IdxMulOpd, unsigned MaddOpc, unsigned VR,
4557                               const TargetRegisterClass *RC) {
4558   assert(IdxMulOpd == 1 || IdxMulOpd == 2);
4559 
4560   MachineInstr *MUL = MRI.getUniqueVRegDef(Root.getOperand(IdxMulOpd).getReg());
4561   Register ResultReg = Root.getOperand(0).getReg();
4562   Register SrcReg0 = MUL->getOperand(1).getReg();
4563   bool Src0IsKill = MUL->getOperand(1).isKill();
4564   Register SrcReg1 = MUL->getOperand(2).getReg();
4565   bool Src1IsKill = MUL->getOperand(2).isKill();
4566 
4567   if (Register::isVirtualRegister(ResultReg))
4568     MRI.constrainRegClass(ResultReg, RC);
4569   if (Register::isVirtualRegister(SrcReg0))
4570     MRI.constrainRegClass(SrcReg0, RC);
4571   if (Register::isVirtualRegister(SrcReg1))
4572     MRI.constrainRegClass(SrcReg1, RC);
4573   if (Register::isVirtualRegister(VR))
4574     MRI.constrainRegClass(VR, RC);
4575 
4576   MachineInstrBuilder MIB =
4577       BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc), ResultReg)
4578           .addReg(SrcReg0, getKillRegState(Src0IsKill))
4579           .addReg(SrcReg1, getKillRegState(Src1IsKill))
4580           .addReg(VR);
4581   // Insert the MADD
4582   InsInstrs.push_back(MIB);
4583   return MUL;
4584 }
4585 
4586 /// When getMachineCombinerPatterns() finds potential patterns,
4587 /// this function generates the instructions that could replace the
4588 /// original code sequence.
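///
/// For example, for MULADDW_OP1 (a sketch; register names are illustrative):
///   %i = MADDWrrr %a, %b, $wzr     ; a MUL is a MADD with a WZR addend
///   %r = ADDWrr %i, %c
/// InsInstrs receives the single replacement MADDWrrr %r, %a, %b, %c while
/// DelInstrs receives the multiply and the add, and the MachineCombiner
/// substitutes the new sequence if its cost model agrees.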
4589 void AArch64InstrInfo::genAlternativeCodeSequence(
4590     MachineInstr &Root, MachineCombinerPattern Pattern,
4591     SmallVectorImpl<MachineInstr *> &InsInstrs,
4592     SmallVectorImpl<MachineInstr *> &DelInstrs,
4593     DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) const {
4594   MachineBasicBlock &MBB = *Root.getParent();
4595   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
4596   MachineFunction &MF = *MBB.getParent();
4597   const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
4598 
4599   MachineInstr *MUL;
4600   const TargetRegisterClass *RC;
4601   unsigned Opc;
4602   switch (Pattern) {
4603   default:
4604     // Reassociate instructions.
4605     TargetInstrInfo::genAlternativeCodeSequence(Root, Pattern, InsInstrs,
4606                                                 DelInstrs, InstrIdxForVirtReg);
4607     return;
4608   case MachineCombinerPattern::MULADDW_OP1:
4609   case MachineCombinerPattern::MULADDX_OP1:
4610     // MUL I=A,B,0
4611     // ADD R,I,C
4612     // ==> MADD R,A,B,C
4613     // --- Create(MADD);
4614     if (Pattern == MachineCombinerPattern::MULADDW_OP1) {
4615       Opc = AArch64::MADDWrrr;
4616       RC = &AArch64::GPR32RegClass;
4617     } else {
4618       Opc = AArch64::MADDXrrr;
4619       RC = &AArch64::GPR64RegClass;
4620     }
4621     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
4622     break;
4623   case MachineCombinerPattern::MULADDW_OP2:
4624   case MachineCombinerPattern::MULADDX_OP2:
4625     // MUL I=A,B,0
4626     // ADD R,C,I
4627     // ==> MADD R,A,B,C
4628     // --- Create(MADD);
4629     if (Pattern == MachineCombinerPattern::MULADDW_OP2) {
4630       Opc = AArch64::MADDWrrr;
4631       RC = &AArch64::GPR32RegClass;
4632     } else {
4633       Opc = AArch64::MADDXrrr;
4634       RC = &AArch64::GPR64RegClass;
4635     }
4636     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
4637     break;
4638   case MachineCombinerPattern::MULADDWI_OP1:
4639   case MachineCombinerPattern::MULADDXI_OP1: {
4640     // MUL I=A,B,0
4641     // ADD R,I,Imm
4642     // ==> ORR  V, ZR, Imm
4643     // ==> MADD R,A,B,V
4644     // --- Create(MADD);
4645     const TargetRegisterClass *OrrRC;
4646     unsigned BitSize, OrrOpc, ZeroReg;
4647     if (Pattern == MachineCombinerPattern::MULADDWI_OP1) {
4648       OrrOpc = AArch64::ORRWri;
4649       OrrRC = &AArch64::GPR32spRegClass;
4650       BitSize = 32;
4651       ZeroReg = AArch64::WZR;
4652       Opc = AArch64::MADDWrrr;
4653       RC = &AArch64::GPR32RegClass;
4654     } else {
4655       OrrOpc = AArch64::ORRXri;
4656       OrrRC = &AArch64::GPR64spRegClass;
4657       BitSize = 64;
4658       ZeroReg = AArch64::XZR;
4659       Opc = AArch64::MADDXrrr;
4660       RC = &AArch64::GPR64RegClass;
4661     }
4662     Register NewVR = MRI.createVirtualRegister(OrrRC);
4663     uint64_t Imm = Root.getOperand(2).getImm();
4664 
4665     if (Root.getOperand(3).isImm()) {
4666       unsigned Val = Root.getOperand(3).getImm();
4667       Imm = Imm << Val;
4668     }
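    // E.g. a shifted immediate "add w0, w1, #1, lsl #12" has Imm == 1 and a
    // shift operand of 12, so the constant to materialize below is 0x1000.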
4669     uint64_t UImm = SignExtend64(Imm, BitSize);
4670     uint64_t Encoding;
4671     if (AArch64_AM::processLogicalImmediate(UImm, BitSize, Encoding)) {
4672       MachineInstrBuilder MIB1 =
4673           BuildMI(MF, Root.getDebugLoc(), TII->get(OrrOpc), NewVR)
4674               .addReg(ZeroReg)
4675               .addImm(Encoding);
4676       InsInstrs.push_back(MIB1);
4677       InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
4678       MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC);
4679     }
4680     break;
4681   }
4682   case MachineCombinerPattern::MULSUBW_OP1:
4683   case MachineCombinerPattern::MULSUBX_OP1: {
4684     // MUL I=A,B,0
4685     // SUB R,I, C
4686     // ==> SUB  V, 0, C
4687     // ==> MADD R,A,B,V // = -C + A*B
4688     // --- Create(MADD);
4689     const TargetRegisterClass *SubRC;
4690     unsigned SubOpc, ZeroReg;
4691     if (Pattern == MachineCombinerPattern::MULSUBW_OP1) {
4692       SubOpc = AArch64::SUBWrr;
4693       SubRC = &AArch64::GPR32spRegClass;
4694       ZeroReg = AArch64::WZR;
4695       Opc = AArch64::MADDWrrr;
4696       RC = &AArch64::GPR32RegClass;
4697     } else {
4698       SubOpc = AArch64::SUBXrr;
4699       SubRC = &AArch64::GPR64spRegClass;
4700       ZeroReg = AArch64::XZR;
4701       Opc = AArch64::MADDXrrr;
4702       RC = &AArch64::GPR64RegClass;
4703     }
4704     Register NewVR = MRI.createVirtualRegister(SubRC);
4705     // SUB NewVR, 0, C
4706     MachineInstrBuilder MIB1 =
4707         BuildMI(MF, Root.getDebugLoc(), TII->get(SubOpc), NewVR)
4708             .addReg(ZeroReg)
4709             .add(Root.getOperand(2));
4710     InsInstrs.push_back(MIB1);
4711     InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
4712     MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC);
4713     break;
4714   }
4715   case MachineCombinerPattern::MULSUBW_OP2:
4716   case MachineCombinerPattern::MULSUBX_OP2:
4717     // MUL I=A,B,0
4718     // SUB R,C,I
4719     // ==> MSUB R,A,B,C (computes C - A*B)
4720     // --- Create(MSUB);
4721     if (Pattern == MachineCombinerPattern::MULSUBW_OP2) {
4722       Opc = AArch64::MSUBWrrr;
4723       RC = &AArch64::GPR32RegClass;
4724     } else {
4725       Opc = AArch64::MSUBXrrr;
4726       RC = &AArch64::GPR64RegClass;
4727     }
4728     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
4729     break;
4730   case MachineCombinerPattern::MULSUBWI_OP1:
4731   case MachineCombinerPattern::MULSUBXI_OP1: {
4732     // MUL I=A,B,0
4733     // SUB R,I, Imm
4734     // ==> ORR  V, ZR, -Imm
4735     // ==> MADD R,A,B,V // = -Imm + A*B
4736     // --- Create(MADD);
4737     const TargetRegisterClass *OrrRC;
4738     unsigned BitSize, OrrOpc, ZeroReg;
4739     if (Pattern == MachineCombinerPattern::MULSUBWI_OP1) {
4740       OrrOpc = AArch64::ORRWri;
4741       OrrRC = &AArch64::GPR32spRegClass;
4742       BitSize = 32;
4743       ZeroReg = AArch64::WZR;
4744       Opc = AArch64::MADDWrrr;
4745       RC = &AArch64::GPR32RegClass;
4746     } else {
4747       OrrOpc = AArch64::ORRXri;
4748       OrrRC = &AArch64::GPR64spRegClass;
4749       BitSize = 64;
4750       ZeroReg = AArch64::XZR;
4751       Opc = AArch64::MADDXrrr;
4752       RC = &AArch64::GPR64RegClass;
4753     }
4754     Register NewVR = MRI.createVirtualRegister(OrrRC);
4755     uint64_t Imm = Root.getOperand(2).getImm();
4756     if (Root.getOperand(3).isImm()) {
4757       unsigned Val = Root.getOperand(3).getImm();
4758       Imm = Imm << Val;
4759     }
4760     uint64_t UImm = SignExtend64(-Imm, BitSize);
4761     uint64_t Encoding;
4762     if (AArch64_AM::processLogicalImmediate(UImm, BitSize, Encoding)) {
4763       MachineInstrBuilder MIB1 =
4764           BuildMI(MF, Root.getDebugLoc(), TII->get(OrrOpc), NewVR)
4765               .addReg(ZeroReg)
4766               .addImm(Encoding);
4767       InsInstrs.push_back(MIB1);
4768       InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
4769       MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC);
4770     }
4771     break;
4772   }
4773 
4774   case MachineCombinerPattern::MULADDv8i8_OP1:
4775     Opc = AArch64::MLAv8i8;
4776     RC = &AArch64::FPR64RegClass;
4777     MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
4778     break;
4779   case MachineCombinerPattern::MULADDv8i8_OP2:
4780     Opc = AArch64::MLAv8i8;
4781     RC = &AArch64::FPR64RegClass;
4782     MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
4783     break;
4784   case MachineCombinerPattern::MULADDv16i8_OP1:
4785     Opc = AArch64::MLAv16i8;
4786     RC = &AArch64::FPR128RegClass;
4787     MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
4788     break;
4789   case MachineCombinerPattern::MULADDv16i8_OP2:
4790     Opc = AArch64::MLAv16i8;
4791     RC = &AArch64::FPR128RegClass;
4792     MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
4793     break;
4794   case MachineCombinerPattern::MULADDv4i16_OP1:
4795     Opc = AArch64::MLAv4i16;
4796     RC = &AArch64::FPR64RegClass;
4797     MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
4798     break;
4799   case MachineCombinerPattern::MULADDv4i16_OP2:
4800     Opc = AArch64::MLAv4i16;
4801     RC = &AArch64::FPR64RegClass;
4802     MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
4803     break;
4804   case MachineCombinerPattern::MULADDv8i16_OP1:
4805     Opc = AArch64::MLAv8i16;
4806     RC = &AArch64::FPR128RegClass;
4807     MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
4808     break;
4809   case MachineCombinerPattern::MULADDv8i16_OP2:
4810     Opc = AArch64::MLAv8i16;
4811     RC = &AArch64::FPR128RegClass;
4812     MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
4813     break;
4814   case MachineCombinerPattern::MULADDv2i32_OP1:
4815     Opc = AArch64::MLAv2i32;
4816     RC = &AArch64::FPR64RegClass;
4817     MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
4818     break;
4819   case MachineCombinerPattern::MULADDv2i32_OP2:
4820     Opc = AArch64::MLAv2i32;
4821     RC = &AArch64::FPR64RegClass;
4822     MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
4823     break;
4824   case MachineCombinerPattern::MULADDv4i32_OP1:
4825     Opc = AArch64::MLAv4i32;
4826     RC = &AArch64::FPR128RegClass;
4827     MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
4828     break;
4829   case MachineCombinerPattern::MULADDv4i32_OP2:
4830     Opc = AArch64::MLAv4i32;
4831     RC = &AArch64::FPR128RegClass;
4832     MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
4833     break;
4834 
4835   case MachineCombinerPattern::MULSUBv8i8_OP1:
4836     Opc = AArch64::MLAv8i8;
4837     RC = &AArch64::FPR64RegClass;
4838     MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
4839                                  InstrIdxForVirtReg, 1, Opc, AArch64::NEGv8i8,
4840                                  RC);
4841     break;
4842   case MachineCombinerPattern::MULSUBv8i8_OP2:
4843     Opc = AArch64::MLSv8i8;
4844     RC = &AArch64::FPR64RegClass;
4845     MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
4846     break;
4847   case MachineCombinerPattern::MULSUBv16i8_OP1:
4848     Opc = AArch64::MLAv16i8;
4849     RC = &AArch64::FPR128RegClass;
4850     MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
4851                                  InstrIdxForVirtReg, 1, Opc, AArch64::NEGv16i8,
4852                                  RC);
4853     break;
4854   case MachineCombinerPattern::MULSUBv16i8_OP2:
4855     Opc = AArch64::MLSv16i8;
4856     RC = &AArch64::FPR128RegClass;
4857     MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
4858     break;
4859   case MachineCombinerPattern::MULSUBv4i16_OP1:
4860     Opc = AArch64::MLAv4i16;
4861     RC = &AArch64::FPR64RegClass;
4862     MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
4863                                  InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i16,
4864                                  RC);
4865     break;
4866   case MachineCombinerPattern::MULSUBv4i16_OP2:
4867     Opc = AArch64::MLSv4i16;
4868     RC = &AArch64::FPR64RegClass;
4869     MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
4870     break;
4871   case MachineCombinerPattern::MULSUBv8i16_OP1:
4872     Opc = AArch64::MLAv8i16;
4873     RC = &AArch64::FPR128RegClass;
4874     MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
4875                                  InstrIdxForVirtReg, 1, Opc, AArch64::NEGv8i16,
4876                                  RC);
4877     break;
4878   case MachineCombinerPattern::MULSUBv8i16_OP2:
4879     Opc = AArch64::MLSv8i16;
4880     RC = &AArch64::FPR128RegClass;
4881     MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
4882     break;
4883   case MachineCombinerPattern::MULSUBv2i32_OP1:
4884     Opc = AArch64::MLAv2i32;
4885     RC = &AArch64::FPR64RegClass;
4886     MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
4887                                  InstrIdxForVirtReg, 1, Opc, AArch64::NEGv2i32,
4888                                  RC);
4889     break;
4890   case MachineCombinerPattern::MULSUBv2i32_OP2:
4891     Opc = AArch64::MLSv2i32;
4892     RC = &AArch64::FPR64RegClass;
4893     MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
4894     break;
4895   case MachineCombinerPattern::MULSUBv4i32_OP1:
4896     Opc = AArch64::MLAv4i32;
4897     RC = &AArch64::FPR128RegClass;
4898     MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
4899                                  InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i32,
4900                                  RC);
4901     break;
4902   case MachineCombinerPattern::MULSUBv4i32_OP2:
4903     Opc = AArch64::MLSv4i32;
4904     RC = &AArch64::FPR128RegClass;
4905     MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
4906     break;
4907 
4908   case MachineCombinerPattern::MULADDv4i16_indexed_OP1:
4909     Opc = AArch64::MLAv4i16_indexed;
4910     RC = &AArch64::FPR64RegClass;
4911     MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
4912     break;
4913   case MachineCombinerPattern::MULADDv4i16_indexed_OP2:
4914     Opc = AArch64::MLAv4i16_indexed;
4915     RC = &AArch64::FPR64RegClass;
4916     MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
4917     break;
4918   case MachineCombinerPattern::MULADDv8i16_indexed_OP1:
4919     Opc = AArch64::MLAv8i16_indexed;
4920     RC = &AArch64::FPR128RegClass;
4921     MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
4922     break;
4923   case MachineCombinerPattern::MULADDv8i16_indexed_OP2:
4924     Opc = AArch64::MLAv8i16_indexed;
4925     RC = &AArch64::FPR128RegClass;
4926     MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
4927     break;
4928   case MachineCombinerPattern::MULADDv2i32_indexed_OP1:
4929     Opc = AArch64::MLAv2i32_indexed;
4930     RC = &AArch64::FPR64RegClass;
4931     MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
4932     break;
4933   case MachineCombinerPattern::MULADDv2i32_indexed_OP2:
4934     Opc = AArch64::MLAv2i32_indexed;
4935     RC = &AArch64::FPR64RegClass;
4936     MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
4937     break;
4938   case MachineCombinerPattern::MULADDv4i32_indexed_OP1:
4939     Opc = AArch64::MLAv4i32_indexed;
4940     RC = &AArch64::FPR128RegClass;
4941     MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
4942     break;
4943   case MachineCombinerPattern::MULADDv4i32_indexed_OP2:
4944     Opc = AArch64::MLAv4i32_indexed;
4945     RC = &AArch64::FPR128RegClass;
4946     MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
4947     break;
4948 
4949   case MachineCombinerPattern::MULSUBv4i16_indexed_OP1:
4950     Opc = AArch64::MLAv4i16_indexed;
4951     RC = &AArch64::FPR64RegClass;
4952     MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs,
4953                                  InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i16,
4954                                  RC);
4955     break;
4956   case MachineCombinerPattern::MULSUBv4i16_indexed_OP2:
4957     Opc = AArch64::MLSv4i16_indexed;
4958     RC = &AArch64::FPR64RegClass;
4959     MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
4960     break;
4961   case MachineCombinerPattern::MULSUBv8i16_indexed_OP1:
4962     Opc = AArch64::MLAv8i16_indexed;
4963     RC = &AArch64::FPR128RegClass;
4964     MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs,
4965                                  InstrIdxForVirtReg, 1, Opc, AArch64::NEGv8i16,
4966                                  RC);
4967     break;
4968   case MachineCombinerPattern::MULSUBv8i16_indexed_OP2:
4969     Opc = AArch64::MLSv8i16_indexed;
4970     RC = &AArch64::FPR128RegClass;
4971     MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
4972     break;
4973   case MachineCombinerPattern::MULSUBv2i32_indexed_OP1:
4974     Opc = AArch64::MLAv2i32_indexed;
4975     RC = &AArch64::FPR64RegClass;
4976     MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs,
4977                                  InstrIdxForVirtReg, 1, Opc, AArch64::NEGv2i32,
4978                                  RC);
4979     break;
4980   case MachineCombinerPattern::MULSUBv2i32_indexed_OP2:
4981     Opc = AArch64::MLSv2i32_indexed;
4982     RC = &AArch64::FPR64RegClass;
4983     MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
4984     break;
4985   case MachineCombinerPattern::MULSUBv4i32_indexed_OP1:
4986     Opc = AArch64::MLAv4i32_indexed;
4987     RC = &AArch64::FPR128RegClass;
4988     MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs,
4989                                  InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i32,
4990                                  RC);
4991     break;
4992   case MachineCombinerPattern::MULSUBv4i32_indexed_OP2:
4993     Opc = AArch64::MLSv4i32_indexed;
4994     RC = &AArch64::FPR128RegClass;
4995     MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
4996     break;
4997 
4998   // Floating Point Support
4999   case MachineCombinerPattern::FMULADDH_OP1:
5000     Opc = AArch64::FMADDHrrr;
5001     RC = &AArch64::FPR16RegClass;
5002     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
5003     break;
5004   case MachineCombinerPattern::FMULADDS_OP1:
5005     Opc = AArch64::FMADDSrrr;
5006     RC = &AArch64::FPR32RegClass;
5007     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
5008     break;
5009   case MachineCombinerPattern::FMULADDD_OP1:
5010     Opc = AArch64::FMADDDrrr;
5011     RC = &AArch64::FPR64RegClass;
5012     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
5013     break;
5014 
5015   case MachineCombinerPattern::FMULADDH_OP2:
5016     Opc = AArch64::FMADDHrrr;
5017     RC = &AArch64::FPR16RegClass;
5018     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
5019     break;
5020   case MachineCombinerPattern::FMULADDS_OP2:
5021     Opc = AArch64::FMADDSrrr;
5022     RC = &AArch64::FPR32RegClass;
5023     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
5024     break;
5025   case MachineCombinerPattern::FMULADDD_OP2:
5026     Opc = AArch64::FMADDDrrr;
5027     RC = &AArch64::FPR64RegClass;
5028     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
5029     break;
5030 
5031   case MachineCombinerPattern::FMLAv1i32_indexed_OP1:
5032     Opc = AArch64::FMLAv1i32_indexed;
5033     RC = &AArch64::FPR32RegClass;
5034     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
5035                            FMAInstKind::Indexed);
5036     break;
5037   case MachineCombinerPattern::FMLAv1i32_indexed_OP2:
5038     Opc = AArch64::FMLAv1i32_indexed;
5039     RC = &AArch64::FPR32RegClass;
5040     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
5041                            FMAInstKind::Indexed);
5042     break;
5043 
5044   case MachineCombinerPattern::FMLAv1i64_indexed_OP1:
5045     Opc = AArch64::FMLAv1i64_indexed;
5046     RC = &AArch64::FPR64RegClass;
5047     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
5048                            FMAInstKind::Indexed);
5049     break;
5050   case MachineCombinerPattern::FMLAv1i64_indexed_OP2:
5051     Opc = AArch64::FMLAv1i64_indexed;
5052     RC = &AArch64::FPR64RegClass;
5053     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
5054                            FMAInstKind::Indexed);
5055     break;
5056 
5057   case MachineCombinerPattern::FMLAv4i16_indexed_OP1:
5058     RC = &AArch64::FPR64RegClass;
5059     Opc = AArch64::FMLAv4i16_indexed;
5060     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
5061                            FMAInstKind::Indexed);
5062     break;
5063   case MachineCombinerPattern::FMLAv4f16_OP1:
5064     RC = &AArch64::FPR64RegClass;
5065     Opc = AArch64::FMLAv4f16;
5066     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
5067                            FMAInstKind::Accumulator);
5068     break;
5069   case MachineCombinerPattern::FMLAv4i16_indexed_OP2:
5070     RC = &AArch64::FPR64RegClass;
5071     Opc = AArch64::FMLAv4i16_indexed;
5072     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
5073                            FMAInstKind::Indexed);
5074     break;
5075   case MachineCombinerPattern::FMLAv4f16_OP2:
5076     RC = &AArch64::FPR64RegClass;
5077     Opc = AArch64::FMLAv4f16;
5078     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
5079                            FMAInstKind::Accumulator);
5080     break;
5081 
5082   case MachineCombinerPattern::FMLAv2i32_indexed_OP1:
5083   case MachineCombinerPattern::FMLAv2f32_OP1:
5084     RC = &AArch64::FPR64RegClass;
5085     if (Pattern == MachineCombinerPattern::FMLAv2i32_indexed_OP1) {
5086       Opc = AArch64::FMLAv2i32_indexed;
5087       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
5088                              FMAInstKind::Indexed);
5089     } else {
5090       Opc = AArch64::FMLAv2f32;
5091       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
5092                              FMAInstKind::Accumulator);
5093     }
5094     break;
5095   case MachineCombinerPattern::FMLAv2i32_indexed_OP2:
5096   case MachineCombinerPattern::FMLAv2f32_OP2:
5097     RC = &AArch64::FPR64RegClass;
5098     if (Pattern == MachineCombinerPattern::FMLAv2i32_indexed_OP2) {
5099       Opc = AArch64::FMLAv2i32_indexed;
5100       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
5101                              FMAInstKind::Indexed);
5102     } else {
5103       Opc = AArch64::FMLAv2f32;
5104       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
5105                              FMAInstKind::Accumulator);
5106     }
5107     break;
5108 
5109   case MachineCombinerPattern::FMLAv8i16_indexed_OP1:
5110     RC = &AArch64::FPR128RegClass;
5111     Opc = AArch64::FMLAv8i16_indexed;
5112     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
5113                            FMAInstKind::Indexed);
5114     break;
5115   case MachineCombinerPattern::FMLAv8f16_OP1:
5116     RC = &AArch64::FPR128RegClass;
5117     Opc = AArch64::FMLAv8f16;
5118     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
5119                            FMAInstKind::Accumulator);
5120     break;
5121   case MachineCombinerPattern::FMLAv8i16_indexed_OP2:
5122     RC = &AArch64::FPR128RegClass;
5123     Opc = AArch64::FMLAv8i16_indexed;
5124     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
5125                            FMAInstKind::Indexed);
5126     break;
5127   case MachineCombinerPattern::FMLAv8f16_OP2:
5128     RC = &AArch64::FPR128RegClass;
5129     Opc = AArch64::FMLAv8f16;
5130     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
5131                            FMAInstKind::Accumulator);
5132     break;
5133 
5134   case MachineCombinerPattern::FMLAv2i64_indexed_OP1:
5135   case MachineCombinerPattern::FMLAv2f64_OP1:
5136     RC = &AArch64::FPR128RegClass;
5137     if (Pattern == MachineCombinerPattern::FMLAv2i64_indexed_OP1) {
5138       Opc = AArch64::FMLAv2i64_indexed;
5139       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
5140                              FMAInstKind::Indexed);
5141     } else {
5142       Opc = AArch64::FMLAv2f64;
5143       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
5144                              FMAInstKind::Accumulator);
5145     }
5146     break;
5147   case MachineCombinerPattern::FMLAv2i64_indexed_OP2:
5148   case MachineCombinerPattern::FMLAv2f64_OP2:
5149     RC = &AArch64::FPR128RegClass;
5150     if (Pattern == MachineCombinerPattern::FMLAv2i64_indexed_OP2) {
5151       Opc = AArch64::FMLAv2i64_indexed;
5152       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
5153                              FMAInstKind::Indexed);
5154     } else {
5155       Opc = AArch64::FMLAv2f64;
5156       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
5157                              FMAInstKind::Accumulator);
5158     }
5159     break;
5160 
5161   case MachineCombinerPattern::FMLAv4i32_indexed_OP1:
5162   case MachineCombinerPattern::FMLAv4f32_OP1:
5163     RC = &AArch64::FPR128RegClass;
5164     if (Pattern == MachineCombinerPattern::FMLAv4i32_indexed_OP1) {
5165       Opc = AArch64::FMLAv4i32_indexed;
5166       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
5167                              FMAInstKind::Indexed);
5168     } else {
5169       Opc = AArch64::FMLAv4f32;
5170       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
5171                              FMAInstKind::Accumulator);
5172     }
5173     break;
5174 
5175   case MachineCombinerPattern::FMLAv4i32_indexed_OP2:
5176   case MachineCombinerPattern::FMLAv4f32_OP2:
5177     RC = &AArch64::FPR128RegClass;
5178     if (Pattern == MachineCombinerPattern::FMLAv4i32_indexed_OP2) {
5179       Opc = AArch64::FMLAv4i32_indexed;
5180       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
5181                              FMAInstKind::Indexed);
5182     } else {
5183       Opc = AArch64::FMLAv4f32;
5184       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
5185                              FMAInstKind::Accumulator);
5186     }
5187     break;
5188 
5189   case MachineCombinerPattern::FMULSUBH_OP1:
5190     Opc = AArch64::FNMSUBHrrr;
5191     RC = &AArch64::FPR16RegClass;
5192     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
5193     break;
5194   case MachineCombinerPattern::FMULSUBS_OP1:
5195     Opc = AArch64::FNMSUBSrrr;
5196     RC = &AArch64::FPR32RegClass;
5197     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
5198     break;
5199   case MachineCombinerPattern::FMULSUBD_OP1:
5200     Opc = AArch64::FNMSUBDrrr;
5201     RC = &AArch64::FPR64RegClass;
5202     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
5203     break;
5204 
5205   case MachineCombinerPattern::FNMULSUBH_OP1:
5206     Opc = AArch64::FNMADDHrrr;
5207     RC = &AArch64::FPR16RegClass;
5208     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
5209     break;
5210   case MachineCombinerPattern::FNMULSUBS_OP1:
5211     Opc = AArch64::FNMADDSrrr;
5212     RC = &AArch64::FPR32RegClass;
5213     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
5214     break;
5215   case MachineCombinerPattern::FNMULSUBD_OP1:
5216     Opc = AArch64::FNMADDDrrr;
5217     RC = &AArch64::FPR64RegClass;
5218     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
5219     break;
5220 
5221   case MachineCombinerPattern::FMULSUBH_OP2:
5222     Opc = AArch64::FMSUBHrrr;
5223     RC = &AArch64::FPR16RegClass;
5224     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
5225     break;
5226   case MachineCombinerPattern::FMULSUBS_OP2:
5227     Opc = AArch64::FMSUBSrrr;
5228     RC = &AArch64::FPR32RegClass;
5229     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
5230     break;
5231   case MachineCombinerPattern::FMULSUBD_OP2:
5232     Opc = AArch64::FMSUBDrrr;
5233     RC = &AArch64::FPR64RegClass;
5234     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
5235     break;
5236 
5237   case MachineCombinerPattern::FMLSv1i32_indexed_OP2:
5238     Opc = AArch64::FMLSv1i32_indexed;
5239     RC = &AArch64::FPR32RegClass;
5240     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
5241                            FMAInstKind::Indexed);
5242     break;
5243 
5244   case MachineCombinerPattern::FMLSv1i64_indexed_OP2:
5245     Opc = AArch64::FMLSv1i64_indexed;
5246     RC = &AArch64::FPR64RegClass;
5247     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
5248                            FMAInstKind::Indexed);
5249     break;
5250 
5251   case MachineCombinerPattern::FMLSv4f16_OP1:
5252   case MachineCombinerPattern::FMLSv4i16_indexed_OP1: {
5253     RC = &AArch64::FPR64RegClass;
5254     Register NewVR = MRI.createVirtualRegister(RC);
5255     MachineInstrBuilder MIB1 =
5256         BuildMI(MF, Root.getDebugLoc(), TII->get(AArch64::FNEGv4f16), NewVR)
5257             .add(Root.getOperand(2));
5258     InsInstrs.push_back(MIB1);
5259     InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
5260     if (Pattern == MachineCombinerPattern::FMLSv4f16_OP1) {
5261       Opc = AArch64::FMLAv4f16;
5262       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
5263                              FMAInstKind::Accumulator, &NewVR);
5264     } else {
5265       Opc = AArch64::FMLAv4i16_indexed;
5266       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
5267                              FMAInstKind::Indexed, &NewVR);
5268     }
5269     break;
5270   }
5271   case MachineCombinerPattern::FMLSv4f16_OP2:
5272     RC = &AArch64::FPR64RegClass;
5273     Opc = AArch64::FMLSv4f16;
5274     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
5275                            FMAInstKind::Accumulator);
5276     break;
5277   case MachineCombinerPattern::FMLSv4i16_indexed_OP2:
5278     RC = &AArch64::FPR64RegClass;
5279     Opc = AArch64::FMLSv4i16_indexed;
5280     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
5281                            FMAInstKind::Indexed);
5282     break;
5283 
5284   case MachineCombinerPattern::FMLSv2f32_OP2:
5285   case MachineCombinerPattern::FMLSv2i32_indexed_OP2:
5286     RC = &AArch64::FPR64RegClass;
5287     if (Pattern == MachineCombinerPattern::FMLSv2i32_indexed_OP2) {
5288       Opc = AArch64::FMLSv2i32_indexed;
5289       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
5290                              FMAInstKind::Indexed);
5291     } else {
5292       Opc = AArch64::FMLSv2f32;
5293       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
5294                              FMAInstKind::Accumulator);
5295     }
5296     break;
5297 
5298   case MachineCombinerPattern::FMLSv8f16_OP1:
5299   case MachineCombinerPattern::FMLSv8i16_indexed_OP1: {
5300     RC = &AArch64::FPR128RegClass;
5301     Register NewVR = MRI.createVirtualRegister(RC);
5302     MachineInstrBuilder MIB1 =
5303         BuildMI(MF, Root.getDebugLoc(), TII->get(AArch64::FNEGv8f16), NewVR)
5304             .add(Root.getOperand(2));
5305     InsInstrs.push_back(MIB1);
5306     InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
5307     if (Pattern == MachineCombinerPattern::FMLSv8f16_OP1) {
5308       Opc = AArch64::FMLAv8f16;
5309       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
5310                              FMAInstKind::Accumulator, &NewVR);
5311     } else {
5312       Opc = AArch64::FMLAv8i16_indexed;
5313       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
5314                              FMAInstKind::Indexed, &NewVR);
5315     }
5316     break;
5317   }
5318   case MachineCombinerPattern::FMLSv8f16_OP2:
5319     RC = &AArch64::FPR128RegClass;
5320     Opc = AArch64::FMLSv8f16;
5321     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
5322                            FMAInstKind::Accumulator);
5323     break;
5324   case MachineCombinerPattern::FMLSv8i16_indexed_OP2:
5325     RC = &AArch64::FPR128RegClass;
5326     Opc = AArch64::FMLSv8i16_indexed;
5327     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
5328                            FMAInstKind::Indexed);
5329     break;
5330 
5331   case MachineCombinerPattern::FMLSv2f64_OP2:
5332   case MachineCombinerPattern::FMLSv2i64_indexed_OP2:
5333     RC = &AArch64::FPR128RegClass;
5334     if (Pattern == MachineCombinerPattern::FMLSv2i64_indexed_OP2) {
5335       Opc = AArch64::FMLSv2i64_indexed;
5336       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
5337                              FMAInstKind::Indexed);
5338     } else {
5339       Opc = AArch64::FMLSv2f64;
5340       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
5341                              FMAInstKind::Accumulator);
5342     }
5343     break;
5344 
5345   case MachineCombinerPattern::FMLSv4f32_OP2:
5346   case MachineCombinerPattern::FMLSv4i32_indexed_OP2:
5347     RC = &AArch64::FPR128RegClass;
5348     if (Pattern == MachineCombinerPattern::FMLSv4i32_indexed_OP2) {
5349       Opc = AArch64::FMLSv4i32_indexed;
5350       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
5351                              FMAInstKind::Indexed);
5352     } else {
5353       Opc = AArch64::FMLSv4f32;
5354       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
5355                              FMAInstKind::Accumulator);
5356     }
5357     break;
5358   case MachineCombinerPattern::FMLSv2f32_OP1:
5359   case MachineCombinerPattern::FMLSv2i32_indexed_OP1: {
5360     RC = &AArch64::FPR64RegClass;
5361     Register NewVR = MRI.createVirtualRegister(RC);
5362     MachineInstrBuilder MIB1 =
5363         BuildMI(MF, Root.getDebugLoc(), TII->get(AArch64::FNEGv2f32), NewVR)
5364             .add(Root.getOperand(2));
5365     InsInstrs.push_back(MIB1);
5366     InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
5367     if (Pattern == MachineCombinerPattern::FMLSv2i32_indexed_OP1) {
5368       Opc = AArch64::FMLAv2i32_indexed;
5369       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
5370                              FMAInstKind::Indexed, &NewVR);
5371     } else {
5372       Opc = AArch64::FMLAv2f32;
5373       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
5374                              FMAInstKind::Accumulator, &NewVR);
5375     }
5376     break;
5377   }
5378   case MachineCombinerPattern::FMLSv4f32_OP1:
5379   case MachineCombinerPattern::FMLSv4i32_indexed_OP1: {
5380     RC = &AArch64::FPR128RegClass;
5381     Register NewVR = MRI.createVirtualRegister(RC);
5382     MachineInstrBuilder MIB1 =
5383         BuildMI(MF, Root.getDebugLoc(), TII->get(AArch64::FNEGv4f32), NewVR)
5384             .add(Root.getOperand(2));
5385     InsInstrs.push_back(MIB1);
5386     InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
5387     if (Pattern == MachineCombinerPattern::FMLSv4i32_indexed_OP1) {
5388       Opc = AArch64::FMLAv4i32_indexed;
5389       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
5390                              FMAInstKind::Indexed, &NewVR);
5391     } else {
5392       Opc = AArch64::FMLAv4f32;
5393       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
5394                              FMAInstKind::Accumulator, &NewVR);
5395     }
5396     break;
5397   }
5398   case MachineCombinerPattern::FMLSv2f64_OP1:
5399   case MachineCombinerPattern::FMLSv2i64_indexed_OP1: {
5400     RC = &AArch64::FPR128RegClass;
5401     Register NewVR = MRI.createVirtualRegister(RC);
5402     MachineInstrBuilder MIB1 =
5403         BuildMI(MF, Root.getDebugLoc(), TII->get(AArch64::FNEGv2f64), NewVR)
5404             .add(Root.getOperand(2));
5405     InsInstrs.push_back(MIB1);
5406     InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
5407     if (Pattern == MachineCombinerPattern::FMLSv2i64_indexed_OP1) {
5408       Opc = AArch64::FMLAv2i64_indexed;
5409       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
5410                              FMAInstKind::Indexed, &NewVR);
5411     } else {
5412       Opc = AArch64::FMLAv2f64;
5413       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
5414                              FMAInstKind::Accumulator, &NewVR);
5415     }
5416     break;
5417   }
5418   } // end switch (Pattern)
5419   // Record MUL and ADD/SUB for deletion
5420   DelInstrs.push_back(MUL);
5421   DelInstrs.push_back(&Root);
5422 }
5423 
5424 /// Replace a csinc-branch sequence with a simple conditional branch
5425 ///
5426 /// Examples:
5427 /// 1. \code
5428 ///   csinc  w9, wzr, wzr, <condition code>
5429 ///   tbnz   w9, #0, 0x44
5430 ///    \endcode
5431 /// to
5432 ///    \code
5433 ///   b.<inverted condition code>
5434 ///    \endcode
5435 ///
5436 /// 2. \code
5437 ///   csinc w9, wzr, wzr, <condition code>
5438 ///   tbz   w9, #0, 0x44
5439 ///    \endcode
5440 /// to
5441 ///    \code
5442 ///   b.<condition code>
5443 ///    \endcode
5444 ///
/// Replace a compare-and-branch sequence with a TBZ/TBNZ instruction when the
/// compare's constant operand is a power of 2.
5447 ///
5448 /// Examples:
5449 ///    \code
5450 ///   and  w8, w8, #0x400
5451 ///   cbnz w8, L1
5452 ///    \endcode
5453 /// to
5454 ///    \code
5455 ///   tbnz w8, #10, L1
5456 ///    \endcode
5457 ///
5458 /// \param  MI Conditional Branch
5459 /// \return True when the simple conditional branch is generated
5460 ///
bool AArch64InstrInfo::optimizeCondBranch(MachineInstr &MI) const {
5462   bool IsNegativeBranch = false;
5463   bool IsTestAndBranch = false;
5464   unsigned TargetBBInMI = 0;
5465   switch (MI.getOpcode()) {
5466   default:
5467     llvm_unreachable("Unknown branch instruction?");
5468   case AArch64::Bcc:
5469     return false;
5470   case AArch64::CBZW:
5471   case AArch64::CBZX:
5472     TargetBBInMI = 1;
5473     break;
5474   case AArch64::CBNZW:
5475   case AArch64::CBNZX:
5476     TargetBBInMI = 1;
5477     IsNegativeBranch = true;
5478     break;
5479   case AArch64::TBZW:
5480   case AArch64::TBZX:
5481     TargetBBInMI = 2;
5482     IsTestAndBranch = true;
5483     break;
5484   case AArch64::TBNZW:
5485   case AArch64::TBNZX:
5486     TargetBBInMI = 2;
5487     IsNegativeBranch = true;
5488     IsTestAndBranch = true;
5489     break;
5490   }
5491   // So we increment a zero register and test for bits other
5492   // than bit 0? Conservatively bail out in case the verifier
5493   // missed this case.
5494   if (IsTestAndBranch && MI.getOperand(1).getImm())
5495     return false;
5496 
5497   // Find Definition.
  assert(MI.getParent() && "Incomplete machine instruction\n");
5499   MachineBasicBlock *MBB = MI.getParent();
5500   MachineFunction *MF = MBB->getParent();
5501   MachineRegisterInfo *MRI = &MF->getRegInfo();
5502   Register VReg = MI.getOperand(0).getReg();
5503   if (!Register::isVirtualRegister(VReg))
5504     return false;
5505 
5506   MachineInstr *DefMI = MRI->getVRegDef(VReg);
5507 
5508   // Look through COPY instructions to find definition.
5509   while (DefMI->isCopy()) {
5510     Register CopyVReg = DefMI->getOperand(1).getReg();
5511     if (!MRI->hasOneNonDBGUse(CopyVReg))
5512       return false;
5513     if (!MRI->hasOneDef(CopyVReg))
5514       return false;
5515     DefMI = MRI->getVRegDef(CopyVReg);
5516   }
5517 
5518   switch (DefMI->getOpcode()) {
5519   default:
5520     return false;
5521   // Fold AND into a TBZ/TBNZ if constant operand is power of 2.
5522   case AArch64::ANDWri:
5523   case AArch64::ANDXri: {
5524     if (IsTestAndBranch)
5525       return false;
5526     if (DefMI->getParent() != MBB)
5527       return false;
5528     if (!MRI->hasOneNonDBGUse(VReg))
5529       return false;
5530 
5531     bool Is32Bit = (DefMI->getOpcode() == AArch64::ANDWri);
5532     uint64_t Mask = AArch64_AM::decodeLogicalImmediate(
5533         DefMI->getOperand(2).getImm(), Is32Bit ? 32 : 64);
5534     if (!isPowerOf2_64(Mask))
5535       return false;
5536 
5537     MachineOperand &MO = DefMI->getOperand(1);
5538     Register NewReg = MO.getReg();
5539     if (!Register::isVirtualRegister(NewReg))
5540       return false;
5541 
5542     assert(!MRI->def_empty(NewReg) && "Register must be defined.");
5543 
5544     MachineBasicBlock &RefToMBB = *MBB;
5545     MachineBasicBlock *TBB = MI.getOperand(1).getMBB();
5546     DebugLoc DL = MI.getDebugLoc();
5547     unsigned Imm = Log2_64(Mask);
5548     unsigned Opc = (Imm < 32)
5549                        ? (IsNegativeBranch ? AArch64::TBNZW : AArch64::TBZW)
5550                        : (IsNegativeBranch ? AArch64::TBNZX : AArch64::TBZX);
5551     MachineInstr *NewMI = BuildMI(RefToMBB, MI, DL, get(Opc))
5552                               .addReg(NewReg)
5553                               .addImm(Imm)
5554                               .addMBB(TBB);
    // Register lives on to the TB(N)Z now.
5556     MO.setIsKill(false);
5557 
    // Immediates smaller than 32 can only be encoded by the 32-bit (W)
    // variant of TB(N)Z; the 64-bit variant cannot encode them. Therefore,
    // if the input register is 64-bit, we refer to its 32-bit sub-register.
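    // For example, `and x8, x9, #0x4` feeding a CBNZ becomes
    // `tbnz w9, #2, <target>`, testing bit 2 of the 32-bit sub-register of x9.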
5563     if (!Is32Bit && Imm < 32)
5564       NewMI->getOperand(0).setSubReg(AArch64::sub_32);
5565     MI.eraseFromParent();
5566     return true;
5567   }
5568   // Look for CSINC
5569   case AArch64::CSINCWr:
5570   case AArch64::CSINCXr: {
5571     if (!(DefMI->getOperand(1).getReg() == AArch64::WZR &&
5572           DefMI->getOperand(2).getReg() == AArch64::WZR) &&
5573         !(DefMI->getOperand(1).getReg() == AArch64::XZR &&
5574           DefMI->getOperand(2).getReg() == AArch64::XZR))
5575       return false;
5576 
5577     if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, true) != -1)
5578       return false;
5579 
5580     AArch64CC::CondCode CC = (AArch64CC::CondCode)DefMI->getOperand(3).getImm();
5581     // Convert only when the condition code is not modified between
5582     // the CSINC and the branch. The CC may be used by other
5583     // instructions in between.
5584     if (areCFlagsAccessedBetweenInstrs(DefMI, MI, &getRegisterInfo(), AK_Write))
5585       return false;
5586     MachineBasicBlock &RefToMBB = *MBB;
5587     MachineBasicBlock *TBB = MI.getOperand(TargetBBInMI).getMBB();
5588     DebugLoc DL = MI.getDebugLoc();
5589     if (IsNegativeBranch)
5590       CC = AArch64CC::getInvertedCondCode(CC);
5591     BuildMI(RefToMBB, MI, DL, get(AArch64::Bcc)).addImm(CC).addMBB(TBB);
5592     MI.eraseFromParent();
5593     return true;
5594   }
5595   }
5596 }
5597 
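// A target-flag word combines one MO_FRAGMENT value (MO_PAGE, MO_PAGEOFF,
// MO_G0-MO_G3 or MO_HI12) with independent bitmask flags such as MO_GOT or
// MO_NC. For example, the flags on an ADRP of a GOT entry, MO_PAGE | MO_GOT,
// decompose into the direct flag MO_PAGE and the bitmask flag MO_GOT.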
5598 std::pair<unsigned, unsigned>
AArch64InstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const {
5600   const unsigned Mask = AArch64II::MO_FRAGMENT;
5601   return std::make_pair(TF & Mask, TF & ~Mask);
5602 }
5603 
5604 ArrayRef<std::pair<unsigned, const char *>>
AArch64InstrInfo::getSerializableDirectMachineOperandTargetFlags() const {
5606   using namespace AArch64II;
5607 
5608   static const std::pair<unsigned, const char *> TargetFlags[] = {
5609       {MO_PAGE, "aarch64-page"}, {MO_PAGEOFF, "aarch64-pageoff"},
5610       {MO_G3, "aarch64-g3"},     {MO_G2, "aarch64-g2"},
5611       {MO_G1, "aarch64-g1"},     {MO_G0, "aarch64-g0"},
5612       {MO_HI12, "aarch64-hi12"}};
5613   return makeArrayRef(TargetFlags);
5614 }
5615 
5616 ArrayRef<std::pair<unsigned, const char *>>
AArch64InstrInfo::getSerializableBitmaskMachineOperandTargetFlags() const {
5618   using namespace AArch64II;
5619 
5620   static const std::pair<unsigned, const char *> TargetFlags[] = {
5621       {MO_COFFSTUB, "aarch64-coffstub"},
5622       {MO_GOT, "aarch64-got"},
5623       {MO_NC, "aarch64-nc"},
5624       {MO_S, "aarch64-s"},
5625       {MO_TLS, "aarch64-tls"},
5626       {MO_DLLIMPORT, "aarch64-dllimport"},
5627       {MO_PREL, "aarch64-prel"},
5628       {MO_TAGGED, "aarch64-tagged"}};
5629   return makeArrayRef(TargetFlags);
5630 }
5631 
5632 ArrayRef<std::pair<MachineMemOperand::Flags, const char *>>
AArch64InstrInfo::getSerializableMachineMemOperandTargetFlags() const {
5634   static const std::pair<MachineMemOperand::Flags, const char *> TargetFlags[] =
5635       {{MOSuppressPair, "aarch64-suppress-pair"},
5636        {MOStridedAccess, "aarch64-strided-access"}};
5637   return makeArrayRef(TargetFlags);
5638 }
5639 
5640 /// Constants defining how certain sequences should be outlined.
5641 /// This encompasses how an outlined function should be called, and what kind of
5642 /// frame should be emitted for that outlined function.
5643 ///
5644 /// \p MachineOutlinerDefault implies that the function should be called with
5645 /// a save and restore of LR to the stack.
5646 ///
5647 /// That is,
5648 ///
5649 /// I1     Save LR                    OUTLINED_FUNCTION:
5650 /// I2 --> BL OUTLINED_FUNCTION       I1
5651 /// I3     Restore LR                 I2
5652 ///                                   I3
5653 ///                                   RET
5654 ///
5655 /// * Call construction overhead: 3 (save + BL + restore)
5656 /// * Frame construction overhead: 1 (ret)
5657 /// * Requires stack fixups? Yes
5658 ///
5659 /// \p MachineOutlinerTailCall implies that the function is being created from
5660 /// a sequence of instructions ending in a return.
5661 ///
5662 /// That is,
5663 ///
5664 /// I1                             OUTLINED_FUNCTION:
5665 /// I2 --> B OUTLINED_FUNCTION     I1
5666 /// RET                            I2
5667 ///                                RET
5668 ///
5669 /// * Call construction overhead: 1 (B)
5670 /// * Frame construction overhead: 0 (Return included in sequence)
5671 /// * Requires stack fixups? No
5672 ///
5673 /// \p MachineOutlinerNoLRSave implies that the function should be called using
5674 /// a BL instruction, but doesn't require LR to be saved and restored. This
5675 /// happens when LR is known to be dead.
5676 ///
5677 /// That is,
5678 ///
5679 /// I1                                OUTLINED_FUNCTION:
5680 /// I2 --> BL OUTLINED_FUNCTION       I1
5681 /// I3                                I2
5682 ///                                   I3
5683 ///                                   RET
5684 ///
5685 /// * Call construction overhead: 1 (BL)
5686 /// * Frame construction overhead: 1 (RET)
5687 /// * Requires stack fixups? No
5688 ///
5689 /// \p MachineOutlinerThunk implies that the function is being created from
5690 /// a sequence of instructions ending in a call. The outlined function is
5691 /// called with a BL instruction, and the outlined function tail-calls the
5692 /// original call destination.
5693 ///
5694 /// That is,
5695 ///
5696 /// I1                                OUTLINED_FUNCTION:
5697 /// I2 --> BL OUTLINED_FUNCTION       I1
5698 /// BL f                              I2
5699 ///                                   B f
5700 /// * Call construction overhead: 1 (BL)
5701 /// * Frame construction overhead: 0
5702 /// * Requires stack fixups? No
5703 ///
5704 /// \p MachineOutlinerRegSave implies that the function should be called with a
5705 /// save and restore of LR to an available register. This allows us to avoid
5706 /// stack fixups. Note that this outlining variant is compatible with the
5707 /// NoLRSave case.
5708 ///
5709 /// That is,
5710 ///
5711 /// I1     Save LR                    OUTLINED_FUNCTION:
5712 /// I2 --> BL OUTLINED_FUNCTION       I1
5713 /// I3     Restore LR                 I2
5714 ///                                   I3
5715 ///                                   RET
5716 ///
5717 /// * Call construction overhead: 3 (save + BL + restore)
5718 /// * Frame construction overhead: 1 (ret)
5719 /// * Requires stack fixups? No
5720 enum MachineOutlinerClass {
5721   MachineOutlinerDefault,  /// Emit a save, restore, call, and return.
5722   MachineOutlinerTailCall, /// Only emit a branch.
5723   MachineOutlinerNoLRSave, /// Emit a call and return.
5724   MachineOutlinerThunk,    /// Emit a call and tail-call.
5725   MachineOutlinerRegSave   /// Same as default, but save to a register.
5726 };
5727 
5728 enum MachineOutlinerMBBFlags {
5729   LRUnavailableSomewhere = 0x2,
5730   HasCalls = 0x4,
5731   UnsafeRegsDead = 0x8
5732 };
5733 
5734 unsigned
AArch64InstrInfo::findRegisterToSaveLRTo(const outliner::Candidate &C) const {
5736   assert(C.LRUWasSet && "LRU wasn't set?");
5737   MachineFunction *MF = C.getMF();
5738   const AArch64RegisterInfo *ARI = static_cast<const AArch64RegisterInfo *>(
5739       MF->getSubtarget().getRegisterInfo());
5740 
5741   // Check if there is an available register across the sequence that we can
5742   // use.
5743   for (unsigned Reg : AArch64::GPR64RegClass) {
5744     if (!ARI->isReservedReg(*MF, Reg) &&
5745         Reg != AArch64::LR &&  // LR is not reserved, but don't use it.
5746         Reg != AArch64::X16 && // X16 is not guaranteed to be preserved.
5747         Reg != AArch64::X17 && // Ditto for X17.
5748         C.LRU.available(Reg) && C.UsedInSequence.available(Reg))
5749       return Reg;
5750   }
5751 
5752   // No suitable register. Return 0.
5753   return 0u;
5754 }
5755 
5756 static bool
outliningCandidatesSigningScopeConsensus(const outliner::Candidate &a,
5758                                          const outliner::Candidate &b) {
5759   const Function &Fa = a.getMF()->getFunction();
5760   const Function &Fb = b.getMF()->getFunction();
5761 
  // If neither function has the "sign-return-address" attribute, their
  // signing behaviour is equal.
5764   if (!Fa.hasFnAttribute("sign-return-address") &&
5765       !Fb.hasFnAttribute("sign-return-address")) {
5766     return true;
5767   }
5768 
  // If both functions have the "sign-return-address" attribute, their signing
  // behaviour is equal if the values of the attribute are equal.
5771   if (Fa.hasFnAttribute("sign-return-address") &&
5772       Fb.hasFnAttribute("sign-return-address")) {
5773     StringRef ScopeA =
5774         Fa.getFnAttribute("sign-return-address").getValueAsString();
5775     StringRef ScopeB =
5776         Fb.getFnAttribute("sign-return-address").getValueAsString();
5777     return ScopeA.equals(ScopeB);
5778   }
5779 
5780   // If function B doesn't have the "sign-return-address" attribute but A does,
5781   // the functions' signing behaviour is equal if A's value for
5782   // "sign-return-address" is "none" and vice versa.
5783   if (Fa.hasFnAttribute("sign-return-address")) {
5784     StringRef ScopeA =
5785         Fa.getFnAttribute("sign-return-address").getValueAsString();
5786     return ScopeA.equals("none");
5787   }
5788 
5789   if (Fb.hasFnAttribute("sign-return-address")) {
5790     StringRef ScopeB =
5791         Fb.getFnAttribute("sign-return-address").getValueAsString();
5792     return ScopeB.equals("none");
5793   }
5794 
  llvm_unreachable("Unknown combination of sign-return-address attributes");
5796 }
5797 
5798 static bool
outliningCandidatesSigningKeyConsensus(const outliner::Candidate &a,
5800                                        const outliner::Candidate &b) {
5801   const Function &Fa = a.getMF()->getFunction();
5802   const Function &Fb = b.getMF()->getFunction();
5803 
  // If neither function has the "sign-return-address-key" attribute, their
  // keys are equal.
5806   if (!Fa.hasFnAttribute("sign-return-address-key") &&
5807       !Fb.hasFnAttribute("sign-return-address-key")) {
5808     return true;
5809   }
5810 
  // If both functions have the "sign-return-address-key" attribute, their
  // keys are equal if the values of "sign-return-address-key" are equal.
5813   if (Fa.hasFnAttribute("sign-return-address-key") &&
5814       Fb.hasFnAttribute("sign-return-address-key")) {
5815     StringRef KeyA =
5816         Fa.getFnAttribute("sign-return-address-key").getValueAsString();
5817     StringRef KeyB =
5818         Fb.getFnAttribute("sign-return-address-key").getValueAsString();
5819     return KeyA.equals(KeyB);
5820   }
5821 
  // If B doesn't have the "sign-return-address-key" attribute, both keys are
  // equal if function A uses the default key (a_key).
5824   if (Fa.hasFnAttribute("sign-return-address-key")) {
5825     StringRef KeyA =
5826         Fa.getFnAttribute("sign-return-address-key").getValueAsString();
5827     return KeyA.equals_lower("a_key");
5828   }
5829 
5830   if (Fb.hasFnAttribute("sign-return-address-key")) {
5831     StringRef KeyB =
5832         Fb.getFnAttribute("sign-return-address-key").getValueAsString();
5833     return KeyB.equals_lower("a_key");
5834   }
5835 
  llvm_unreachable("Unknown combination of sign-return-address-key attributes");
5837 }
5838 
static bool outliningCandidatesV8_3OpsConsensus(const outliner::Candidate &a,
5840                                                 const outliner::Candidate &b) {
5841   const AArch64Subtarget &SubtargetA =
5842       a.getMF()->getSubtarget<AArch64Subtarget>();
5843   const AArch64Subtarget &SubtargetB =
5844       b.getMF()->getSubtarget<AArch64Subtarget>();
5845   return SubtargetA.hasV8_3aOps() == SubtargetB.hasV8_3aOps();
5846 }
5847 
outliner::OutlinedFunction AArch64InstrInfo::getOutliningCandidateInfo(
5849     std::vector<outliner::Candidate> &RepeatedSequenceLocs) const {
5850   outliner::Candidate &FirstCand = RepeatedSequenceLocs[0];
5851   unsigned SequenceSize =
5852       std::accumulate(FirstCand.front(), std::next(FirstCand.back()), 0,
5853                       [this](unsigned Sum, const MachineInstr &MI) {
5854                         return Sum + getInstSizeInBytes(MI);
5855                       });
5856   unsigned NumBytesToCreateFrame = 0;
5857 
5858   // We only allow outlining for functions having exactly matching return
5859   // address signing attributes, i.e., all share the same value for the
5860   // attribute "sign-return-address" and all share the same type of key they
5861   // are signed with.
  // Additionally we require all functions to simultaneously either support
  // v8.3a features or not. Otherwise an outlined function could get signed
  // using dedicated v8.3 instructions and a call from a function that doesn't
  // support v8.3 instructions would therefore be invalid.
5866   if (std::adjacent_find(
5867           RepeatedSequenceLocs.begin(), RepeatedSequenceLocs.end(),
5868           [](const outliner::Candidate &a, const outliner::Candidate &b) {
5869             // Return true if a and b are non-equal w.r.t. return address
5870             // signing or support of v8.3a features
5871             if (outliningCandidatesSigningScopeConsensus(a, b) &&
5872                 outliningCandidatesSigningKeyConsensus(a, b) &&
5873                 outliningCandidatesV8_3OpsConsensus(a, b)) {
5874               return false;
5875             }
5876             return true;
5877           }) != RepeatedSequenceLocs.end()) {
5878     return outliner::OutlinedFunction();
5879   }
5880 
5881   // Since at this point all candidates agree on their return address signing
5882   // picking just one is fine. If the candidate functions potentially sign their
5883   // return addresses, the outlined function should do the same. Note that in
5884   // the case of "sign-return-address"="non-leaf" this is an assumption: It is
5885   // not certainly true that the outlined function will have to sign its return
5886   // address but this decision is made later, when the decision to outline
5887   // has already been made.
5888   // The same holds for the number of additional instructions we need: On
5889   // v8.3a RET can be replaced by RETAA/RETAB and no AUT instruction is
5890   // necessary. However, at this point we don't know if the outlined function
5891   // will have a RET instruction so we assume the worst.
5892   const Function &FCF = FirstCand.getMF()->getFunction();
5893   const TargetRegisterInfo &TRI = getRegisterInfo();
5894   if (FCF.hasFnAttribute("sign-return-address")) {
    // One PAC and one AUT instruction.
5896     NumBytesToCreateFrame += 8;
5897 
    // We have to check whether sp-modifying instructions would get outlined.
    // If so, we only allow outlining if sp is unchanged overall, so matching
    // sub and add instructions are okay to outline; all other sp modifications
    // are not.
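    // For example, a candidate containing a matched
    //   sub sp, sp, #16 ... add sp, sp, #16
    // pair nets out to zero and may be outlined, whereas an unmatched
    // adjustment or any other write to sp disqualifies it.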
5902     auto hasIllegalSPModification = [&TRI](outliner::Candidate &C) {
5903       int SPValue = 0;
5904       MachineBasicBlock::iterator MBBI = C.front();
5905       for (;;) {
5906         if (MBBI->modifiesRegister(AArch64::SP, &TRI)) {
5907           switch (MBBI->getOpcode()) {
5908           case AArch64::ADDXri:
5909           case AArch64::ADDWri:
5910             assert(MBBI->getNumOperands() == 4 && "Wrong number of operands");
5911             assert(MBBI->getOperand(2).isImm() &&
5912                    "Expected operand to be immediate");
5913             assert(MBBI->getOperand(1).isReg() &&
5914                    "Expected operand to be a register");
5915             // Check if the add just increments sp. If so, we search for
5916             // matching sub instructions that decrement sp. If not, the
5917             // modification is illegal
5918             if (MBBI->getOperand(1).getReg() == AArch64::SP)
5919               SPValue += MBBI->getOperand(2).getImm();
5920             else
5921               return true;
5922             break;
5923           case AArch64::SUBXri:
5924           case AArch64::SUBWri:
5925             assert(MBBI->getNumOperands() == 4 && "Wrong number of operands");
5926             assert(MBBI->getOperand(2).isImm() &&
5927                    "Expected operand to be immediate");
5928             assert(MBBI->getOperand(1).isReg() &&
5929                    "Expected operand to be a register");
5930             // Check if the sub just decrements sp. If so, we search for
5931             // matching add instructions that increment sp. If not, the
5932             // modification is illegal
5933             if (MBBI->getOperand(1).getReg() == AArch64::SP)
5934               SPValue -= MBBI->getOperand(2).getImm();
5935             else
5936               return true;
5937             break;
5938           default:
5939             return true;
5940           }
5941         }
5942         if (MBBI == C.back())
5943           break;
5944         ++MBBI;
5945       }
5946       if (SPValue)
5947         return true;
5948       return false;
5949     };
5950     // Remove candidates with illegal stack modifying instructions
5951     RepeatedSequenceLocs.erase(std::remove_if(RepeatedSequenceLocs.begin(),
5952                                               RepeatedSequenceLocs.end(),
5953                                               hasIllegalSPModification),
5954                                RepeatedSequenceLocs.end());
5955 
5956     // If the sequence doesn't have enough candidates left, then we're done.
5957     if (RepeatedSequenceLocs.size() < 2)
5958       return outliner::OutlinedFunction();
5959   }
5960 
5961   // Properties about candidate MBBs that hold for all of them.
5962   unsigned FlagsSetInAll = 0xF;
5963 
5964   // Compute liveness information for each candidate, and set FlagsSetInAll.
5965   std::for_each(RepeatedSequenceLocs.begin(), RepeatedSequenceLocs.end(),
5966                 [&FlagsSetInAll](outliner::Candidate &C) {
5967                   FlagsSetInAll &= C.Flags;
5968                 });
5969 
5970   // According to the AArch64 Procedure Call Standard, the following are
5971   // undefined on entry/exit from a function call:
5972   //
5973   // * Registers x16, x17, (and thus w16, w17)
5974   // * Condition codes (and thus the NZCV register)
5975   //
  // Because of this, we can't outline any sequence of instructions where one
  // of these registers is live into/across it. Thus, we need to delete those
  // candidates.
5981   auto CantGuaranteeValueAcrossCall = [&TRI](outliner::Candidate &C) {
5982     // If the unsafe registers in this block are all dead, then we don't need
5983     // to compute liveness here.
5984     if (C.Flags & UnsafeRegsDead)
5985       return false;
5986     C.initLRU(TRI);
5987     LiveRegUnits LRU = C.LRU;
5988     return (!LRU.available(AArch64::W16) || !LRU.available(AArch64::W17) ||
5989             !LRU.available(AArch64::NZCV));
5990   };
5991 
5992   // Are there any candidates where those registers are live?
5993   if (!(FlagsSetInAll & UnsafeRegsDead)) {
    // Erase every candidate that violates the restrictions above. (It could be
    // true that we have viable candidates, so it's not worth bailing out in
    // the case that, say, 1 out of 20 candidates violate the restrictions.)
5997     RepeatedSequenceLocs.erase(std::remove_if(RepeatedSequenceLocs.begin(),
5998                                               RepeatedSequenceLocs.end(),
5999                                               CantGuaranteeValueAcrossCall),
6000                                RepeatedSequenceLocs.end());
6001 
6002     // If the sequence doesn't have enough candidates left, then we're done.
6003     if (RepeatedSequenceLocs.size() < 2)
6004       return outliner::OutlinedFunction();
6005   }
6006 
6007   // At this point, we have only "safe" candidates to outline. Figure out
6008   // frame + call instruction information.
6009 
6010   unsigned LastInstrOpcode = RepeatedSequenceLocs[0].back()->getOpcode();
6011 
6012   // Helper lambda which sets call information for every candidate.
6013   auto SetCandidateCallInfo =
6014       [&RepeatedSequenceLocs](unsigned CallID, unsigned NumBytesForCall) {
6015         for (outliner::Candidate &C : RepeatedSequenceLocs)
6016           C.setCallInfo(CallID, NumBytesForCall);
6017       };
6018 
6019   unsigned FrameID = MachineOutlinerDefault;
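  // Conservatively assume the outlined frame ends in a 4-byte RET.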
6020   NumBytesToCreateFrame += 4;
6021 
6022   bool HasBTI = any_of(RepeatedSequenceLocs, [](outliner::Candidate &C) {
6023     return C.getMF()->getFunction().hasFnAttribute("branch-target-enforcement");
6024   });
6025 
  // We check whether CFI instructions are present and, if so, count how many
  // appear in the candidate sequence.
6028   unsigned CFICount = 0;
6029   MachineBasicBlock::iterator MBBI = RepeatedSequenceLocs[0].front();
6030   for (unsigned Loc = RepeatedSequenceLocs[0].getStartIdx();
6031        Loc < RepeatedSequenceLocs[0].getEndIdx() + 1; Loc++) {
    if (MBBI->isCFIInstruction())
      CFICount++;
    MBBI++;
  }
6041 
  // We compare the number of found CFI instructions to the number of CFI
  // instructions in the parent function for each candidate. We must check this
  // since if we outline one of the CFI instructions in a function, we have to
  // outline them all for correctness. If we do not, the address offsets will
  // be incorrect between the two sections of the program.
6047   for (outliner::Candidate &C : RepeatedSequenceLocs) {
6048     std::vector<MCCFIInstruction> CFIInstructions =
6049         C.getMF()->getFrameInstructions();
6050 
6051     if (CFICount > 0 && CFICount != CFIInstructions.size())
6052       return outliner::OutlinedFunction();
6053   }
6054 
  // Returns true if an instruction is safe to fix up, false otherwise.
6056   auto IsSafeToFixup = [this, &TRI](MachineInstr &MI) {
6057     if (MI.isCall())
6058       return true;
6059 
6060     if (!MI.modifiesRegister(AArch64::SP, &TRI) &&
6061         !MI.readsRegister(AArch64::SP, &TRI))
6062       return true;
6063 
6064     // Any modification of SP will break our code to save/restore LR.
6065     // FIXME: We could handle some instructions which add a constant
6066     // offset to SP, with a bit more work.
6067     if (MI.modifiesRegister(AArch64::SP, &TRI))
6068       return false;
6069 
6070     // At this point, we have a stack instruction that we might need to
6071     // fix up. We'll handle it if it's a load or store.
6072     if (MI.mayLoadOrStore()) {
6073       const MachineOperand *Base; // Filled with the base operand of MI.
6074       int64_t Offset;             // Filled with the offset of MI.
6075       bool OffsetIsScalable;
6076 
6077       // Does it allow us to offset the base operand and is the base the
6078       // register SP?
6079       if (!getMemOperandWithOffset(MI, Base, Offset, OffsetIsScalable, &TRI) ||
6080           !Base->isReg() || Base->getReg() != AArch64::SP)
6081         return false;
6082 
      // Fix-up code below assumes byte offsets.
6084       if (OffsetIsScalable)
6085         return false;
6086 
6087       // Find the minimum/maximum offset for this instruction and check
6088       // if fixing it up would be in range.
6089       int64_t MinOffset,
6090           MaxOffset;  // Unscaled offsets for the instruction.
6091       TypeSize Scale(0U, false); // The scale to multiply the offsets by.
6092       unsigned DummyWidth;
6093       getMemOpInfo(MI.getOpcode(), Scale, DummyWidth, MinOffset, MaxOffset);
6094 
6095       Offset += 16; // Update the offset to what it would be if we outlined.
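      // For example, an `ldr x0, [sp, #8]` in the candidate becomes
      // `ldr x0, [sp, #24]` in the outlined body once LR has been pushed,
      // so the adjusted offset must still be encodable.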
6096       if (Offset < MinOffset * (int64_t)Scale.getFixedSize() ||
6097           Offset > MaxOffset * (int64_t)Scale.getFixedSize())
6098         return false;
6099 
6100       // It's in range, so we can outline it.
6101       return true;
6102     }
6103 
6104     // FIXME: Add handling for instructions like "add x0, sp, #8".
6105 
6106     // We can't fix it up, so don't outline it.
6107     return false;
6108   };
6109 
6110   // True if it's possible to fix up each stack instruction in this sequence.
6111   // Important for frames/call variants that modify the stack.
6112   bool AllStackInstrsSafe = std::all_of(
6113       FirstCand.front(), std::next(FirstCand.back()), IsSafeToFixup);
6114 
6115   // If the last instruction in any candidate is a terminator, then we should
6116   // tail call all of the candidates.
6117   if (RepeatedSequenceLocs[0].back()->isTerminator()) {
6118     FrameID = MachineOutlinerTailCall;
6119     NumBytesToCreateFrame = 0;
6120     SetCandidateCallInfo(MachineOutlinerTailCall, 4);
6121   }
6122 
6123   else if (LastInstrOpcode == AArch64::BL ||
6124            ((LastInstrOpcode == AArch64::BLR ||
6125              LastInstrOpcode == AArch64::BLRNoIP) &&
6126             !HasBTI)) {
6127     // FIXME: Do we need to check if the code after this uses the value of LR?
6128     FrameID = MachineOutlinerThunk;
6129     NumBytesToCreateFrame = 0;
6130     SetCandidateCallInfo(MachineOutlinerThunk, 4);
6131   }
6132 
6133   else {
6134     // We need to decide how to emit calls + frames. We can always emit the same
6135     // frame if we don't need to save to the stack. If we have to save to the
6136     // stack, then we need a different frame.
6137     unsigned NumBytesNoStackCalls = 0;
6138     std::vector<outliner::Candidate> CandidatesWithoutStackFixups;
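    // Per-candidate cost: 4 bytes for a bare BL, 12 bytes when LR has to be
    // saved and restored around the call (in a register or on the stack).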
6139 
6140     // Check if we have to save LR.
6141     for (outliner::Candidate &C : RepeatedSequenceLocs) {
6142       C.initLRU(TRI);
6143 
6144       // If we have a noreturn caller, then we're going to be conservative and
6145       // say that we have to save LR. If we don't have a ret at the end of the
6146       // block, then we can't reason about liveness accurately.
6147       //
6148       // FIXME: We can probably do better than always disabling this in
6149       // noreturn functions by fixing up the liveness info.
6150       bool IsNoReturn =
6151           C.getMF()->getFunction().hasFnAttribute(Attribute::NoReturn);
6152 
6153       // Is LR available? If so, we don't need a save.
6154       if (C.LRU.available(AArch64::LR) && !IsNoReturn) {
6155         NumBytesNoStackCalls += 4;
6156         C.setCallInfo(MachineOutlinerNoLRSave, 4);
6157         CandidatesWithoutStackFixups.push_back(C);
6158       }
6159 
6160       // Is an unused register available? If so, we won't modify the stack, so
6161       // we can outline with the same frame type as those that don't save LR.
6162       else if (findRegisterToSaveLRTo(C)) {
6163         NumBytesNoStackCalls += 12;
6164         C.setCallInfo(MachineOutlinerRegSave, 12);
6165         CandidatesWithoutStackFixups.push_back(C);
6166       }
6167 
6168       // Is SP used in the sequence at all? If not, we don't have to modify
6169       // the stack, so we are guaranteed to get the same frame.
6170       else if (C.UsedInSequence.available(AArch64::SP)) {
6171         NumBytesNoStackCalls += 12;
6172         C.setCallInfo(MachineOutlinerDefault, 12);
6173         CandidatesWithoutStackFixups.push_back(C);
6174       }
6175 
6176       // If we outline this, we need to modify the stack. Pretend we don't
6177       // outline this by saving all of its bytes.
6178       else {
6179         NumBytesNoStackCalls += SequenceSize;
6180       }
6181     }
6182 
6183     // If there are no places where we have to save LR, then note that we
6184     // don't have to update the stack. Otherwise, give every candidate the
6185     // default call type, as long as it's safe to do so.
6186     if (!AllStackInstrsSafe ||
6187         NumBytesNoStackCalls <= RepeatedSequenceLocs.size() * 12) {
6188       RepeatedSequenceLocs = CandidatesWithoutStackFixups;
6189       FrameID = MachineOutlinerNoLRSave;
6190     } else {
6191       SetCandidateCallInfo(MachineOutlinerDefault, 12);
6192     }
6193 
6194     // If we dropped all of the candidates, bail out here.
6195     if (RepeatedSequenceLocs.size() < 2) {
6196       RepeatedSequenceLocs.clear();
6197       return outliner::OutlinedFunction();
6198     }
6199   }
6200 
6201   // Does every candidate's MBB contain a call? If so, then we might have a call
6202   // in the range.
6203   if (FlagsSetInAll & MachineOutlinerMBBFlags::HasCalls) {
6204     // Check if the range contains a call. These require a save + restore of the
6205     // link register.
6206     bool ModStackToSaveLR = false;
6207     if (std::any_of(FirstCand.front(), FirstCand.back(),
6208                     [](const MachineInstr &MI) { return MI.isCall(); }))
6209       ModStackToSaveLR = true;
6210 
6211     // Handle the last instruction separately. If this is a tail call, then the
6212     // last instruction is a call. We don't want to save + restore in this case.
6213     // However, it could be possible that the last instruction is a call without
6214     // it being valid to tail call this sequence. We should consider this as
6215     // well.
6216     else if (FrameID != MachineOutlinerThunk &&
6217              FrameID != MachineOutlinerTailCall && FirstCand.back()->isCall())
6218       ModStackToSaveLR = true;
6219 
6220     if (ModStackToSaveLR) {
6221       // We can't fix up the stack. Bail out.
6222       if (!AllStackInstrsSafe) {
6223         RepeatedSequenceLocs.clear();
6224         return outliner::OutlinedFunction();
6225       }
6226 
6227       // Save + restore LR.
6228       NumBytesToCreateFrame += 8;
6229     }
6230   }
6231 
6232   // If we have CFI instructions, we can only outline if the outlined section
6233   // can be a tail call
6234   if (FrameID != MachineOutlinerTailCall && CFICount > 0)
6235     return outliner::OutlinedFunction();
6236 
6237   return outliner::OutlinedFunction(RepeatedSequenceLocs, SequenceSize,
6238                                     NumBytesToCreateFrame, FrameID);
6239 }
6240 
bool AArch64InstrInfo::isFunctionSafeToOutlineFrom(
6242     MachineFunction &MF, bool OutlineFromLinkOnceODRs) const {
6243   const Function &F = MF.getFunction();
6244 
6245   // Can F be deduplicated by the linker? If it can, don't outline from it.
6246   if (!OutlineFromLinkOnceODRs && F.hasLinkOnceODRLinkage())
6247     return false;
6248 
6249   // Don't outline from functions with section markings; the program could
6250   // expect that all the code is in the named section.
6251   // FIXME: Allow outlining from multiple functions with the same section
6252   // marking.
6253   if (F.hasSection())
6254     return false;
6255 
6256   // Outlining from functions with redzones is unsafe since the outliner may
6257   // modify the stack. Check if hasRedZone is true or unknown; if yes, don't
6258   // outline from it.
6259   AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
6260   if (!AFI || AFI->hasRedZone().getValueOr(true))
6261     return false;
6262 
6263   // FIXME: Teach the outliner to generate/handle Windows unwind info.
6264   if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI())
6265     return false;
6266 
6267   // It's safe to outline from MF.
6268   return true;
6269 }
6270 
bool AArch64InstrInfo::isMBBSafeToOutlineFrom(MachineBasicBlock &MBB,
6272                                               unsigned &Flags) const {
6273   // Check if LR is available through all of the MBB. If it's not, then set
6274   // a flag.
6275   assert(MBB.getParent()->getRegInfo().tracksLiveness() &&
6276          "Suitable Machine Function for outlining must track liveness");
6277   LiveRegUnits LRU(getRegisterInfo());
6278 
6279   std::for_each(MBB.rbegin(), MBB.rend(),
6280                 [&LRU](MachineInstr &MI) { LRU.accumulate(MI); });
6281 
6282   // Check if each of the unsafe registers are available...
6283   bool W16AvailableInBlock = LRU.available(AArch64::W16);
6284   bool W17AvailableInBlock = LRU.available(AArch64::W17);
6285   bool NZCVAvailableInBlock = LRU.available(AArch64::NZCV);
6286 
6287   // If all of these are dead (and not live out), we know we don't have to check
6288   // them later.
6289   if (W16AvailableInBlock && W17AvailableInBlock && NZCVAvailableInBlock)
6290     Flags |= MachineOutlinerMBBFlags::UnsafeRegsDead;
6291 
6292   // Now, add the live outs to the set.
6293   LRU.addLiveOuts(MBB);
6294 
6295   // If any of these registers is available in the MBB, but also a live out of
6296   // the block, then we know outlining is unsafe.
6297   if (W16AvailableInBlock && !LRU.available(AArch64::W16))
6298     return false;
6299   if (W17AvailableInBlock && !LRU.available(AArch64::W17))
6300     return false;
6301   if (NZCVAvailableInBlock && !LRU.available(AArch64::NZCV))
6302     return false;
6303 
6304   // Check if there's a call inside this MachineBasicBlock. If there is, then
6305   // set a flag.
6306   if (any_of(MBB, [](MachineInstr &MI) { return MI.isCall(); }))
6307     Flags |= MachineOutlinerMBBFlags::HasCalls;
6308 
6309   MachineFunction *MF = MBB.getParent();
6310 
6311   // In the event that we outline, we may have to save LR. If there is an
6312   // available register in the MBB, then we'll always save LR there. Check if
6313   // this is true.
6314   bool CanSaveLR = false;
6315   const AArch64RegisterInfo *ARI = static_cast<const AArch64RegisterInfo *>(
6316       MF->getSubtarget().getRegisterInfo());
6317 
6318   // Check if there is an available register across the sequence that we can
6319   // use.
6320   for (unsigned Reg : AArch64::GPR64RegClass) {
6321     if (!ARI->isReservedReg(*MF, Reg) && Reg != AArch64::LR &&
6322         Reg != AArch64::X16 && Reg != AArch64::X17 && LRU.available(Reg)) {
6323       CanSaveLR = true;
6324       break;
6325     }
6326   }
6327 
6328   // Check if we have a register we can save LR to, and if LR was used
6329   // somewhere. If both of those things are true, then we need to evaluate the
6330   // safety of outlining stack instructions later.
6331   if (!CanSaveLR && !LRU.available(AArch64::LR))
6332     Flags |= MachineOutlinerMBBFlags::LRUnavailableSomewhere;
6333 
6334   return true;
6335 }
6336 
6337 outliner::InstrType
AArch64InstrInfo::getOutliningType(MachineBasicBlock::iterator &MIT,
6339                                    unsigned Flags) const {
6340   MachineInstr &MI = *MIT;
6341   MachineBasicBlock *MBB = MI.getParent();
6342   MachineFunction *MF = MBB->getParent();
6343   AArch64FunctionInfo *FuncInfo = MF->getInfo<AArch64FunctionInfo>();
6344 
6345   // Don't outline anything used for return address signing. The outlined
6346   // function will get signed later if needed
6347   switch (MI.getOpcode()) {
6348   case AArch64::PACIASP:
6349   case AArch64::PACIBSP:
6350   case AArch64::AUTIASP:
6351   case AArch64::AUTIBSP:
6352   case AArch64::RETAA:
6353   case AArch64::RETAB:
6354   case AArch64::EMITBKEY:
6355     return outliner::InstrType::Illegal;
6356   }
6357 
6358   // Don't outline LOHs.
6359   if (FuncInfo->getLOHRelated().count(&MI))
6360     return outliner::InstrType::Illegal;
6361 
  // We can only outline these if we will tail call the outlined function, or
  // fix up the CFI offsets. Currently, CFI instructions are outlined only
  // when the outlined section is a tail call.
6365   //
6366   // FIXME: If the proper fixups for the offset are implemented, this should be
6367   // possible.
6368   if (MI.isCFIInstruction())
6369     return outliner::InstrType::Legal;
6370 
6371   // Don't allow debug values to impact outlining type.
6372   if (MI.isDebugInstr() || MI.isIndirectDebugValue())
6373     return outliner::InstrType::Invisible;
6374 
6375   // At this point, KILL instructions don't really tell us much so we can go
6376   // ahead and skip over them.
6377   if (MI.isKill())
6378     return outliner::InstrType::Invisible;
6379 
6380   // Is this a terminator for a basic block?
6381   if (MI.isTerminator()) {
6382 
6383     // Is this the end of a function?
6384     if (MI.getParent()->succ_empty())
6385       return outliner::InstrType::Legal;
6386 
6387     // It's not, so don't outline it.
6388     return outliner::InstrType::Illegal;
6389   }
6390 
6391   // Make sure none of the operands are un-outlinable.
6392   for (const MachineOperand &MOP : MI.operands()) {
6393     if (MOP.isCPI() || MOP.isJTI() || MOP.isCFIIndex() || MOP.isFI() ||
6394         MOP.isTargetIndex())
6395       return outliner::InstrType::Illegal;
6396 
6397     // If it uses LR or W30 explicitly, then don't touch it.
6398     if (MOP.isReg() && !MOP.isImplicit() &&
6399         (MOP.getReg() == AArch64::LR || MOP.getReg() == AArch64::W30))
6400       return outliner::InstrType::Illegal;
6401   }
6402 
  // Special cases for instructions that can always be outlined, but will fail
  // the later tests. E.g. ADRPs, which are PC-relative, can always be outlined
  // because they don't require a *specific* value to be in LR.
6406   if (MI.getOpcode() == AArch64::ADRP)
6407     return outliner::InstrType::Legal;
6408 
6409   // If MI is a call we might be able to outline it. We don't want to outline
6410   // any calls that rely on the position of items on the stack. When we outline
6411   // something containing a call, we have to emit a save and restore of LR in
6412   // the outlined function. Currently, this always happens by saving LR to the
6413   // stack. Thus, if we outline, say, half the parameters for a function call
6414   // plus the call, then we'll break the callee's expectations for the layout
6415   // of the stack.
6416   //
6417   // FIXME: Allow calls to functions which construct a stack frame, as long
6418   // as they don't access arguments on the stack.
6419   // FIXME: Figure out some way to analyze functions defined in other modules.
6420   // We should be able to compute the memory usage based on the IR calling
6421   // convention, even if we can't see the definition.
6422   if (MI.isCall()) {
6423     // Get the function associated with the call. Look at each operand and find
6424     // the one that represents the callee and get its name.
6425     const Function *Callee = nullptr;
6426     for (const MachineOperand &MOP : MI.operands()) {
6427       if (MOP.isGlobal()) {
6428         Callee = dyn_cast<Function>(MOP.getGlobal());
6429         break;
6430       }
6431     }
6432 
6433     // Never outline calls to mcount.  There isn't any rule that would require
6434     // this, but the Linux kernel's "ftrace" feature depends on it.
6435     if (Callee && Callee->getName() == "\01_mcount")
6436       return outliner::InstrType::Illegal;
6437 
6438     // If we don't know anything about the callee, assume it depends on the
6439     // stack layout of the caller. In that case, it's only legal to outline
6440     // as a tail-call. Explicitly list the call instructions we know about so we
6441     // don't get unexpected results with call pseudo-instructions.
6442     auto UnknownCallOutlineType = outliner::InstrType::Illegal;
6443     if (MI.getOpcode() == AArch64::BLR ||
6444         MI.getOpcode() == AArch64::BLRNoIP || MI.getOpcode() == AArch64::BL)
6445       UnknownCallOutlineType = outliner::InstrType::LegalTerminator;
6446 
6447     if (!Callee)
6448       return UnknownCallOutlineType;
6449 
    // We have a function we have information about. Check if it's something
    // we can safely outline.
6452     MachineFunction *CalleeMF = MF->getMMI().getMachineFunction(*Callee);
6453 
6454     // We don't know what's going on with the callee at all. Don't touch it.
6455     if (!CalleeMF)
6456       return UnknownCallOutlineType;
6457 
6458     // Check if we know anything about the callee saves on the function. If we
6459     // don't, then don't touch it, since that implies that we haven't
6460     // computed anything about its stack frame yet.
6461     MachineFrameInfo &MFI = CalleeMF->getFrameInfo();
6462     if (!MFI.isCalleeSavedInfoValid() || MFI.getStackSize() > 0 ||
6463         MFI.getNumObjects() > 0)
6464       return UnknownCallOutlineType;
6465 
6466     // At this point, we can say that CalleeMF ought to not pass anything on the
6467     // stack. Therefore, we can outline it.
6468     return outliner::InstrType::Legal;
6469   }
6470 
6471   // Don't outline positions.
6472   if (MI.isPosition())
6473     return outliner::InstrType::Illegal;
6474 
6475   // Don't touch the link register or W30.
6476   if (MI.readsRegister(AArch64::W30, &getRegisterInfo()) ||
6477       MI.modifiesRegister(AArch64::W30, &getRegisterInfo()))
6478     return outliner::InstrType::Illegal;
6479 
6480   // Don't outline BTI instructions, because that will prevent the outlining
6481   // site from being indirectly callable.
6482   if (MI.getOpcode() == AArch64::HINT) {
6483     int64_t Imm = MI.getOperand(0).getImm();
6484     if (Imm == 32 || Imm == 34 || Imm == 36 || Imm == 38)
6485       return outliner::InstrType::Illegal;
6486   }
6487 
6488   return outliner::InstrType::Legal;
6489 }
6490 
void AArch64InstrInfo::fixupPostOutline(MachineBasicBlock &MBB) const {
6492   for (MachineInstr &MI : MBB) {
6493     const MachineOperand *Base;
6494     unsigned Width;
6495     int64_t Offset;
6496     bool OffsetIsScalable;
6497 
6498     // Is this a load or store with an immediate offset with SP as the base?
6499     if (!MI.mayLoadOrStore() ||
6500         !getMemOperandWithOffsetWidth(MI, Base, Offset, OffsetIsScalable, Width,
6501                                       &RI) ||
6502         (Base->isReg() && Base->getReg() != AArch64::SP))
6503       continue;
6504 
6505     // It is, so we have to fix it up.
6506     TypeSize Scale(0U, false);
6507     int64_t Dummy1, Dummy2;
6508 
6509     MachineOperand &StackOffsetOperand = getMemOpBaseRegImmOfsOffsetOperand(MI);
6510     assert(StackOffsetOperand.isImm() && "Stack offset wasn't immediate!");
6511     getMemOpInfo(MI.getOpcode(), Scale, Width, Dummy1, Dummy2);
6512     assert(Scale != 0 && "Unexpected opcode!");
6513     assert(!OffsetIsScalable && "Expected offset to be a byte offset");
6514 
6515     // We've pushed the return address to the stack, so add 16 to the offset.
6516     // This is safe, since we already checked if it would overflow when we
6517     // checked if this instruction was legal to outline.
6518     int64_t NewImm = (Offset + 16) / (int64_t)Scale.getFixedSize();
6519     StackOffsetOperand.setImm(NewImm);
6520   }
6521 }
6522 
static void signOutlinedFunction(MachineFunction &MF, MachineBasicBlock &MBB,
6524                                  bool ShouldSignReturnAddr,
6525                                  bool ShouldSignReturnAddrWithAKey) {
6526   if (ShouldSignReturnAddr) {
6527     MachineBasicBlock::iterator MBBPAC = MBB.begin();
6528     MachineBasicBlock::iterator MBBAUT = MBB.getFirstTerminator();
6529     const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
6530     const TargetInstrInfo *TII = Subtarget.getInstrInfo();
6531     DebugLoc DL;
6532 
6533     if (MBBAUT != MBB.end())
6534       DL = MBBAUT->getDebugLoc();
6535 
6536     // At the very beginning of the basic block we insert the following
6537     // depending on the key type
6538     //
6539     // a_key:                   b_key:
6540     //    PACIASP                   EMITBKEY
6541     //    CFI_INSTRUCTION           PACIBSP
6542     //                              CFI_INSTRUCTION
6543     if (ShouldSignReturnAddrWithAKey) {
6544       BuildMI(MBB, MBBPAC, DebugLoc(), TII->get(AArch64::PACIASP))
6545           .setMIFlag(MachineInstr::FrameSetup);
6546     } else {
6547       BuildMI(MBB, MBBPAC, DebugLoc(), TII->get(AArch64::EMITBKEY))
6548           .setMIFlag(MachineInstr::FrameSetup);
6549       BuildMI(MBB, MBBPAC, DebugLoc(), TII->get(AArch64::PACIBSP))
6550           .setMIFlag(MachineInstr::FrameSetup);
6551     }
6552     unsigned CFIIndex =
6553         MF.addFrameInst(MCCFIInstruction::createNegateRAState(nullptr));
6554     BuildMI(MBB, MBBPAC, DebugLoc(), TII->get(AArch64::CFI_INSTRUCTION))
6555         .addCFIIndex(CFIIndex)
6556         .setMIFlags(MachineInstr::FrameSetup);
6557 
    // If v8.3a features are available, we can replace a RET instruction with
    // RETAA or RETAB and omit the AUT instructions.
6560     if (Subtarget.hasV8_3aOps() && MBBAUT != MBB.end() &&
6561         MBBAUT->getOpcode() == AArch64::RET) {
6562       BuildMI(MBB, MBBAUT, DL,
6563               TII->get(ShouldSignReturnAddrWithAKey ? AArch64::RETAA
6564                                                     : AArch64::RETAB))
6565           .copyImplicitOps(*MBBAUT);
6566       MBB.erase(MBBAUT);
6567     } else {
6568       BuildMI(MBB, MBBAUT, DL,
6569               TII->get(ShouldSignReturnAddrWithAKey ? AArch64::AUTIASP
6570                                                     : AArch64::AUTIBSP))
6571           .setMIFlag(MachineInstr::FrameDestroy);
6572     }
6573   }
6574 }
6575 
void AArch64InstrInfo::buildOutlinedFrame(
6577     MachineBasicBlock &MBB, MachineFunction &MF,
6578     const outliner::OutlinedFunction &OF) const {
6579 
6580   AArch64FunctionInfo *FI = MF.getInfo<AArch64FunctionInfo>();
6581 
6582   if (OF.FrameConstructionID == MachineOutlinerTailCall)
6583     FI->setOutliningStyle("Tail Call");
6584   else if (OF.FrameConstructionID == MachineOutlinerThunk) {
6585     // For thunk outlining, rewrite the last instruction from a call to a
6586     // tail-call.
6587     MachineInstr *Call = &*--MBB.instr_end();
6588     unsigned TailOpcode;
6589     if (Call->getOpcode() == AArch64::BL) {
6590       TailOpcode = AArch64::TCRETURNdi;
6591     } else {
6592       assert(Call->getOpcode() == AArch64::BLR ||
6593              Call->getOpcode() == AArch64::BLRNoIP);
6594       TailOpcode = AArch64::TCRETURNriALL;
6595     }
6596     MachineInstr *TC = BuildMI(MF, DebugLoc(), get(TailOpcode))
6597                            .add(Call->getOperand(0))
6598                            .addImm(0);
6599     MBB.insert(MBB.end(), TC);
6600     Call->eraseFromParent();
6601 
6602     FI->setOutliningStyle("Thunk");
6603   }
6604 
6605   bool IsLeafFunction = true;
6606 
6607   // Is there a call in the outlined range?
6608   auto IsNonTailCall = [](const MachineInstr &MI) {
6609     return MI.isCall() && !MI.isReturn();
6610   };
6611 
6612   if (std::any_of(MBB.instr_begin(), MBB.instr_end(), IsNonTailCall)) {
6613     // Fix up the instructions in the range, since we're going to modify the
6614     // stack.
6615     assert(OF.FrameConstructionID != MachineOutlinerDefault &&
6616            "Can only fix up stack references once");
6617     fixupPostOutline(MBB);
6618 
6619     IsLeafFunction = false;
6620 
6621     // LR has to be a live in so that we can save it.
6622     if (!MBB.isLiveIn(AArch64::LR))
6623       MBB.addLiveIn(AArch64::LR);
6624 
6625     MachineBasicBlock::iterator It = MBB.begin();
6626     MachineBasicBlock::iterator Et = MBB.end();
6627 
6628     if (OF.FrameConstructionID == MachineOutlinerTailCall ||
6629         OF.FrameConstructionID == MachineOutlinerThunk)
6630       Et = std::prev(MBB.end());
6631 
6632     // Insert a save before the outlined region
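    // by pushing LR with a pre-indexed store, i.e. `str lr, [sp, #-16]!`.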
6633     MachineInstr *STRXpre = BuildMI(MF, DebugLoc(), get(AArch64::STRXpre))
6634                                 .addReg(AArch64::SP, RegState::Define)
6635                                 .addReg(AArch64::LR)
6636                                 .addReg(AArch64::SP)
6637                                 .addImm(-16);
6638     It = MBB.insert(It, STRXpre);
6639 
6640     const TargetSubtargetInfo &STI = MF.getSubtarget();
6641     const MCRegisterInfo *MRI = STI.getRegisterInfo();
6642     unsigned DwarfReg = MRI->getDwarfRegNum(AArch64::LR, true);
6643 
6644     // Add a CFI saying the stack was moved 16 B down.
6645     int64_t StackPosEntry =
6646         MF.addFrameInst(MCCFIInstruction::cfiDefCfaOffset(nullptr, 16));
6647     BuildMI(MBB, It, DebugLoc(), get(AArch64::CFI_INSTRUCTION))
6648         .addCFIIndex(StackPosEntry)
6649         .setMIFlags(MachineInstr::FrameSetup);
6650 
6651     // Add a CFI saying that the LR that we want to find is now 16 B higher than
6652     // before.
6653     int64_t LRPosEntry =
6654         MF.addFrameInst(MCCFIInstruction::createOffset(nullptr, DwarfReg, -16));
6655     BuildMI(MBB, It, DebugLoc(), get(AArch64::CFI_INSTRUCTION))
6656         .addCFIIndex(LRPosEntry)
6657         .setMIFlags(MachineInstr::FrameSetup);
6658 
6659     // Insert a restore before the terminator for the function.
6660     MachineInstr *LDRXpost = BuildMI(MF, DebugLoc(), get(AArch64::LDRXpost))
6661                                  .addReg(AArch64::SP, RegState::Define)
6662                                  .addReg(AArch64::LR, RegState::Define)
6663                                  .addReg(AArch64::SP)
6664                                  .addImm(16);
6665     Et = MBB.insert(Et, LDRXpost);
6666   }
6667 
  // If a bunch of candidates reach this point, they must agree on their return
  // address signing. It is therefore enough to just consider the signing
  // behaviour of one of them.
6671   const Function &CF = OF.Candidates.front().getMF()->getFunction();
6672   bool ShouldSignReturnAddr = false;
6673   if (CF.hasFnAttribute("sign-return-address")) {
6674     StringRef Scope =
6675         CF.getFnAttribute("sign-return-address").getValueAsString();
6676     if (Scope.equals("all"))
6677       ShouldSignReturnAddr = true;
6678     else if (Scope.equals("non-leaf") && !IsLeafFunction)
6679       ShouldSignReturnAddr = true;
6680   }
6681 
6682   // a_key is the default
  bool ShouldSignReturnAddrWithAKey = true;
  if (CF.hasFnAttribute("sign-return-address-key")) {
    const StringRef Key =
        CF.getFnAttribute("sign-return-address-key").getValueAsString();
    // Key can either be a_key or b_key
    assert((Key.equals_lower("a_key") || Key.equals_lower("b_key")) &&
           "Return address signing key must be either a_key or b_key");
    ShouldSignReturnAddrWithAKey = Key.equals_lower("a_key");
  }

  // If this is a tail call outlined function, then there's already a return.
  if (OF.FrameConstructionID == MachineOutlinerTailCall ||
      OF.FrameConstructionID == MachineOutlinerThunk) {
    signOutlinedFunction(MF, MBB, ShouldSignReturnAddr,
                         ShouldSignReturnAddrWithAKey);
    return;
  }

  // It's not a tail call, so we have to insert the return ourselves.

  // LR has to be a live in so that we can return to it.
  if (!MBB.isLiveIn(AArch64::LR))
    MBB.addLiveIn(AArch64::LR);

  MachineInstr *ret = BuildMI(MF, DebugLoc(), get(AArch64::RET))
                          .addReg(AArch64::LR);
  MBB.insert(MBB.end(), ret);

  signOutlinedFunction(MF, MBB, ShouldSignReturnAddr,
                       ShouldSignReturnAddrWithAKey);

  FI->setOutliningStyle("Function");

  // Did we have to modify the stack by saving the link register?
  if (OF.FrameConstructionID != MachineOutlinerDefault)
    return;

  // We modified the stack.
  // Walk over the basic block and fix up all the stack accesses.
  fixupPostOutline(MBB);
}

MachineBasicBlock::iterator AArch64InstrInfo::insertOutlinedCall(
    Module &M, MachineBasicBlock &MBB, MachineBasicBlock::iterator &It,
    MachineFunction &MF, const outliner::Candidate &C) const {

  // Are we tail calling?
  if (C.CallConstructionID == MachineOutlinerTailCall) {
    // If yes, then we can just branch to the label.
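    // TCRETURNdi is a tail-call pseudo that is later expanded into a plain
    // direct branch to the outlined function, so control returns straight to
    // this function's caller.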
    It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::TCRETURNdi))
                            .addGlobalAddress(M.getNamedValue(MF.getName()))
                            .addImm(0));
    return It;
  }

  // Are we saving the link register?
  if (C.CallConstructionID == MachineOutlinerNoLRSave ||
      C.CallConstructionID == MachineOutlinerThunk) {
    // No, so just insert the call.
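    // BL clobbers LR, but that is acceptable here: a thunk frame tail-calls
    // onwards, and the no-LR-save case was only chosen when LR did not need
    // to be preserved across the candidate.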
    It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::BL))
                            .addGlobalAddress(M.getNamedValue(MF.getName())));
    return It;
  }

  // We want to return the spot where we inserted the call.
  MachineBasicBlock::iterator CallPt;

  // Instructions for saving and restoring LR around the call instruction we're
  // going to insert.
  MachineInstr *Save;
  MachineInstr *Restore;
  // Can we save to a register?
  if (C.CallConstructionID == MachineOutlinerRegSave) {
    // FIXME: This logic should be sunk into a target-specific interface so that
    // we don't have to recompute the register.
    unsigned Reg = findRegisterToSaveLRTo(C);
    assert(Reg != 0 && "No callee-saved register available?");

    // Save and restore LR from that register.
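    // ORRXrs Rd, XZR, Rm, #0 is the canonical encoding of "mov Rd, Rm", so
    // these two instructions are "mov Reg, x30" and "mov x30, Reg".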
    Save = BuildMI(MF, DebugLoc(), get(AArch64::ORRXrs), Reg)
               .addReg(AArch64::XZR)
               .addReg(AArch64::LR)
               .addImm(0);
    Restore = BuildMI(MF, DebugLoc(), get(AArch64::ORRXrs), AArch64::LR)
                  .addReg(AArch64::XZR)
                  .addReg(Reg)
                  .addImm(0);
  } else {
    // We have the default case. Save and restore from SP.
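    // Same push/pop pair as in the outlined frame itself:
    // "str x30, [sp, #-16]!" to spill LR and "ldr x30, [sp], #16" to reload
    // it, keeping SP 16-byte aligned.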
    Save = BuildMI(MF, DebugLoc(), get(AArch64::STRXpre))
               .addReg(AArch64::SP, RegState::Define)
               .addReg(AArch64::LR)
               .addReg(AArch64::SP)
               .addImm(-16);
    Restore = BuildMI(MF, DebugLoc(), get(AArch64::LDRXpost))
                  .addReg(AArch64::SP, RegState::Define)
                  .addReg(AArch64::LR, RegState::Define)
                  .addReg(AArch64::SP)
                  .addImm(16);
  }

  It = MBB.insert(It, Save);
  It++;

  // Insert the call.
  It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::BL))
                          .addGlobalAddress(M.getNamedValue(MF.getName())));
  CallPt = It;
  It++;

  It = MBB.insert(It, Restore);
  return CallPt;
}

bool AArch64InstrInfo::shouldOutlineFromFunctionByDefault(
    MachineFunction &MF) const {
  return MF.getFunction().hasMinSize();
}

Optional<DestSourcePair>
AArch64InstrInfo::isCopyInstrImpl(const MachineInstr &MI) const {

  // AArch64::ORRWrs and AArch64::ORRXrs with a WZR/XZR source register and a
  // zero shift immediate are used as aliases for the mov instruction.
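  // For example, "orr w0, wzr, w1" with shift #0 prints as "mov w0, w1".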
  if (MI.getOpcode() == AArch64::ORRWrs &&
      MI.getOperand(1).getReg() == AArch64::WZR &&
      MI.getOperand(3).getImm() == 0x0) {
    return DestSourcePair{MI.getOperand(0), MI.getOperand(2)};
  }

  if (MI.getOpcode() == AArch64::ORRXrs &&
      MI.getOperand(1).getReg() == AArch64::XZR &&
      MI.getOperand(3).getImm() == 0x0) {
    return DestSourcePair{MI.getOperand(0), MI.getOperand(2)};
  }

  return None;
}

Optional<RegImmPair> AArch64InstrInfo::isAddImmediate(const MachineInstr &MI,
                                                      Register Reg) const {
  int Sign = 1;
  int64_t Offset = 0;

  // TODO: Handle cases where Reg is a super- or sub-register of the
  // destination register.
  const MachineOperand &Op0 = MI.getOperand(0);
  if (!Op0.isReg() || Reg != Op0.getReg())
    return None;

  switch (MI.getOpcode()) {
  default:
    return None;
  case AArch64::SUBWri:
  case AArch64::SUBXri:
  case AArch64::SUBSWri:
  case AArch64::SUBSXri:
    Sign *= -1;
    LLVM_FALLTHROUGH;
  case AArch64::ADDSWri:
  case AArch64::ADDSXri:
  case AArch64::ADDWri:
  case AArch64::ADDXri: {
    // TODO: Third operand can be global address (usually some string).
    if (!MI.getOperand(0).isReg() || !MI.getOperand(1).isReg() ||
        !MI.getOperand(2).isImm())
      return None;
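    // The ADD/SUB immediate forms encode a 12-bit unsigned immediate with an
    // optional "LSL #12", so the effective offset is imm << shift.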
    Offset = MI.getOperand(2).getImm() * Sign;
    int Shift = MI.getOperand(3).getImm();
    assert((Shift == 0 || Shift == 12) && "Shift can be either 0 or 12");
    Offset = Offset << Shift;
  }
  }
  return RegImmPair{MI.getOperand(1).getReg(), Offset};
}

/// If the given ORR instruction is a copy, and \p DescribedReg overlaps with
/// the destination register, then, if possible, describe the value in terms
/// of the source register.
static Optional<ParamLoadedValue>
describeORRLoadedValue(const MachineInstr &MI, Register DescribedReg,
                       const TargetInstrInfo *TII,
                       const TargetRegisterInfo *TRI) {
  auto DestSrc = TII->isCopyInstr(MI);
  if (!DestSrc)
    return None;

  Register DestReg = DestSrc->Destination->getReg();
  Register SrcReg = DestSrc->Source->getReg();

  auto Expr = DIExpression::get(MI.getMF()->getFunction().getContext(), {});

  // If the described register is the destination, just return the source.
  if (DestReg == DescribedReg)
    return ParamLoadedValue(MachineOperand::CreateReg(SrcReg, false), Expr);

  // ORRWrs zero-extends to 64 bits, so we need to consider such cases.
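  // E.g. "mov w0, w1" also zeroes the upper half of x0, so a query about x0
  // can still be answered in terms of w1.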
  if (MI.getOpcode() == AArch64::ORRWrs &&
      TRI->isSuperRegister(DestReg, DescribedReg))
    return ParamLoadedValue(MachineOperand::CreateReg(SrcReg, false), Expr);

  // We may need to describe the lower part of an ORRXrs move.
  if (MI.getOpcode() == AArch64::ORRXrs &&
      TRI->isSubRegister(DestReg, DescribedReg)) {
    Register SrcSubReg = TRI->getSubReg(SrcReg, AArch64::sub_32);
    return ParamLoadedValue(MachineOperand::CreateReg(SrcSubReg, false), Expr);
  }

  assert(!TRI->isSuperOrSubRegisterEq(DestReg, DescribedReg) &&
         "Unhandled ORR[XW]rs copy case");

  return None;
}

Optional<ParamLoadedValue>
AArch64InstrInfo::describeLoadedValue(const MachineInstr &MI,
                                      Register Reg) const {
  const MachineFunction *MF = MI.getMF();
  const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
  switch (MI.getOpcode()) {
  case AArch64::MOVZWi:
  case AArch64::MOVZXi: {
    // MOVZWi may be used for producing zero-extended 32-bit immediates in
    // 64-bit parameters, so we need to consider super-registers.
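    // (MOVZ materializes a 16-bit immediate placed at one of the halfword
    // positions; the shift operand below gives that placement in bits, so
    // the loaded value is Immediate << Shift.)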
    if (!TRI->isSuperRegisterEq(MI.getOperand(0).getReg(), Reg))
      return None;

    if (!MI.getOperand(1).isImm())
      return None;
    int64_t Immediate = MI.getOperand(1).getImm();
    int Shift = MI.getOperand(2).getImm();
    return ParamLoadedValue(MachineOperand::CreateImm(Immediate << Shift),
                            nullptr);
  }
  case AArch64::ORRWrs:
  case AArch64::ORRXrs:
    return describeORRLoadedValue(MI, Reg, this, TRI);
  }

  return TargetInstrInfo::describeLoadedValue(MI, Reg);
}

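// The SVE element size of an instruction (B, H, S or D) is encoded in its
// TSFlags; ElementSizeMask extracts just that field (ElementSizeNone for
// opcodes that do not carry one).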
uint64_t AArch64InstrInfo::getElementSizeForOpcode(unsigned Opc) const {
  return get(Opc).TSFlags & AArch64::ElementSizeMask;
}

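// When straight-line speculation (SLS) hardening of indirect calls is
// enabled, BLRNoIP is used instead of BLR: it is a BLR variant whose target
// register class excludes the intra-procedure-call registers X16/X17, which
// the SLS mitigation's thunk-based call sequence may need for itself.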
unsigned llvm::getBLRCallOpcode(const MachineFunction &MF) {
  if (MF.getSubtarget<AArch64Subtarget>().hardenSlsBlr())
    return AArch64::BLRNoIP;
  else
    return AArch64::BLR;
}

#define GET_INSTRINFO_HELPERS
#define GET_INSTRMAP_INFO
#include "AArch64GenInstrInfo.inc"