1 //===- AArch64InstrInfo.cpp - AArch64 Instruction Information -------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file contains the AArch64 implementation of the TargetInstrInfo class.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "AArch64InstrInfo.h"
14 #include "AArch64MachineFunctionInfo.h"
15 #include "AArch64Subtarget.h"
16 #include "MCTargetDesc/AArch64AddressingModes.h"
17 #include "Utils/AArch64BaseInfo.h"
18 #include "llvm/ADT/ArrayRef.h"
19 #include "llvm/ADT/STLExtras.h"
20 #include "llvm/ADT/SmallVector.h"
21 #include "llvm/CodeGen/MachineBasicBlock.h"
22 #include "llvm/CodeGen/MachineFrameInfo.h"
23 #include "llvm/CodeGen/MachineFunction.h"
24 #include "llvm/CodeGen/MachineInstr.h"
25 #include "llvm/CodeGen/MachineInstrBuilder.h"
26 #include "llvm/CodeGen/MachineMemOperand.h"
27 #include "llvm/CodeGen/MachineModuleInfo.h"
28 #include "llvm/CodeGen/MachineOperand.h"
29 #include "llvm/CodeGen/MachineRegisterInfo.h"
30 #include "llvm/CodeGen/StackMaps.h"
31 #include "llvm/CodeGen/TargetRegisterInfo.h"
32 #include "llvm/CodeGen/TargetSubtargetInfo.h"
33 #include "llvm/IR/DebugInfoMetadata.h"
34 #include "llvm/IR/DebugLoc.h"
35 #include "llvm/IR/GlobalValue.h"
36 #include "llvm/MC/MCAsmInfo.h"
37 #include "llvm/MC/MCInst.h"
38 #include "llvm/MC/MCInstrDesc.h"
39 #include "llvm/Support/Casting.h"
40 #include "llvm/Support/CodeGen.h"
41 #include "llvm/Support/CommandLine.h"
42 #include "llvm/Support/Compiler.h"
43 #include "llvm/Support/ErrorHandling.h"
44 #include "llvm/Support/MathExtras.h"
45 #include "llvm/Target/TargetMachine.h"
46 #include "llvm/Target/TargetOptions.h"
47 #include <cassert>
48 #include <cstdint>
49 #include <iterator>
50 #include <utility>
51 
52 using namespace llvm;
53 
54 #define GET_INSTRINFO_CTOR_DTOR
55 #include "AArch64GenInstrInfo.inc"
56 
57 static cl::opt<unsigned> TBZDisplacementBits(
58     "aarch64-tbz-offset-bits", cl::Hidden, cl::init(14),
59     cl::desc("Restrict range of TB[N]Z instructions (DEBUG)"));
60 
61 static cl::opt<unsigned> CBZDisplacementBits(
62     "aarch64-cbz-offset-bits", cl::Hidden, cl::init(19),
63     cl::desc("Restrict range of CB[N]Z instructions (DEBUG)"));
64 
65 static cl::opt<unsigned>
66     BCCDisplacementBits("aarch64-bcc-offset-bits", cl::Hidden, cl::init(19),
67                         cl::desc("Restrict range of Bcc instructions (DEBUG)"));
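// Note: the defaults above match the architectural branch ranges (a 14-bit
// word offset for TB[N]Z, a 19-bit word offset for CB[N]Z and Bcc); these
// options only exist so the ranges can be narrowed for testing.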
68 
AArch64InstrInfo::AArch64InstrInfo(const AArch64Subtarget &STI)
70     : AArch64GenInstrInfo(AArch64::ADJCALLSTACKDOWN, AArch64::ADJCALLSTACKUP,
71                           AArch64::CATCHRET),
72       RI(STI.getTargetTriple()), Subtarget(STI) {}
73 
74 /// GetInstSize - Return the number of bytes of code the specified
75 /// instruction may be.  This returns the maximum number of bytes.
unsigned AArch64InstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
77   const MachineBasicBlock &MBB = *MI.getParent();
78   const MachineFunction *MF = MBB.getParent();
79   const MCAsmInfo *MAI = MF->getTarget().getMCAsmInfo();
80 
81   {
82     auto Op = MI.getOpcode();
83     if (Op == AArch64::INLINEASM || Op == AArch64::INLINEASM_BR)
84       return getInlineAsmLength(MI.getOperand(0).getSymbolName(), *MAI);
85   }
86 
87   // Meta-instructions emit no code.
88   if (MI.isMetaInstruction())
89     return 0;
90 
91   // FIXME: We currently only handle pseudoinstructions that don't get expanded
92   //        before the assembly printer.
93   unsigned NumBytes = 0;
94   const MCInstrDesc &Desc = MI.getDesc();
95   switch (Desc.getOpcode()) {
96   default:
97     // Anything not explicitly designated otherwise is a normal 4-byte insn.
98     NumBytes = 4;
99     break;
100   case TargetOpcode::STACKMAP:
101     // The upper bound for a stackmap intrinsic is the full length of its shadow
102     NumBytes = StackMapOpers(&MI).getNumPatchBytes();
103     assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
104     break;
105   case TargetOpcode::PATCHPOINT:
106     // The size of the patchpoint intrinsic is the number of bytes requested
107     NumBytes = PatchPointOpers(&MI).getNumPatchBytes();
108     assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
109     break;
110   case TargetOpcode::STATEPOINT:
111     NumBytes = StatepointOpers(&MI).getNumPatchBytes();
112     assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
113     // No patch bytes means a normal call inst is emitted
114     if (NumBytes == 0)
115       NumBytes = 4;
116     break;
117   case AArch64::TLSDESC_CALLSEQ:
118     // This gets lowered to an instruction sequence which takes 16 bytes
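    // (roughly an adrp + ldr + add + blr sequence, i.e. four 4-byte
    // instructions).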
119     NumBytes = 16;
120     break;
121   case AArch64::SpeculationBarrierISBDSBEndBB:
122     // This gets lowered to 2 4-byte instructions.
123     NumBytes = 8;
124     break;
125   case AArch64::SpeculationBarrierSBEndBB:
    // This gets lowered to 1 4-byte instruction.
127     NumBytes = 4;
128     break;
129   case AArch64::JumpTableDest32:
130   case AArch64::JumpTableDest16:
131   case AArch64::JumpTableDest8:
132     NumBytes = 12;
133     break;
134   case AArch64::SPACE:
135     NumBytes = MI.getOperand(1).getImm();
136     break;
137   case TargetOpcode::BUNDLE:
138     NumBytes = getInstBundleLength(MI);
139     break;
140   }
141 
142   return NumBytes;
143 }
144 
unsigned AArch64InstrInfo::getInstBundleLength(const MachineInstr &MI) const {
146   unsigned Size = 0;
147   MachineBasicBlock::const_instr_iterator I = MI.getIterator();
148   MachineBasicBlock::const_instr_iterator E = MI.getParent()->instr_end();
149   while (++I != E && I->isInsideBundle()) {
150     assert(!I->isBundle() && "No nested bundle!");
151     Size += getInstSizeInBytes(*I);
152   }
153   return Size;
154 }
155 
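// The helpers below encode the branch condition in a Cond operand vector: for
// a plain Bcc, Cond[0] is the AArch64CC condition code; for the folded
// compare-and-branch forms, Cond[0] is a -1 sentinel, Cond[1] is the branch
// opcode, Cond[2] is the register being tested and, for TB[N]Z only, Cond[3]
// is the bit number.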
static void parseCondBranch(MachineInstr *LastInst, MachineBasicBlock *&Target,
157                             SmallVectorImpl<MachineOperand> &Cond) {
158   // Block ends with fall-through condbranch.
159   switch (LastInst->getOpcode()) {
160   default:
161     llvm_unreachable("Unknown branch instruction?");
162   case AArch64::Bcc:
163     Target = LastInst->getOperand(1).getMBB();
164     Cond.push_back(LastInst->getOperand(0));
165     break;
166   case AArch64::CBZW:
167   case AArch64::CBZX:
168   case AArch64::CBNZW:
169   case AArch64::CBNZX:
170     Target = LastInst->getOperand(1).getMBB();
171     Cond.push_back(MachineOperand::CreateImm(-1));
172     Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode()));
173     Cond.push_back(LastInst->getOperand(0));
174     break;
175   case AArch64::TBZW:
176   case AArch64::TBZX:
177   case AArch64::TBNZW:
178   case AArch64::TBNZX:
179     Target = LastInst->getOperand(2).getMBB();
180     Cond.push_back(MachineOperand::CreateImm(-1));
181     Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode()));
182     Cond.push_back(LastInst->getOperand(0));
183     Cond.push_back(LastInst->getOperand(1));
184   }
185 }
186 
static unsigned getBranchDisplacementBits(unsigned Opc) {
188   switch (Opc) {
189   default:
190     llvm_unreachable("unexpected opcode!");
191   case AArch64::B:
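    // An unconditional B encodes a 26-bit signed word offset (+/-128 MiB);
    // returning 64 here means it is never treated as out of range.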
192     return 64;
193   case AArch64::TBNZW:
194   case AArch64::TBZW:
195   case AArch64::TBNZX:
196   case AArch64::TBZX:
197     return TBZDisplacementBits;
198   case AArch64::CBNZW:
199   case AArch64::CBZW:
200   case AArch64::CBNZX:
201   case AArch64::CBZX:
202     return CBZDisplacementBits;
203   case AArch64::Bcc:
204     return BCCDisplacementBits;
205   }
206 }
207 
bool AArch64InstrInfo::isBranchOffsetInRange(unsigned BranchOp,
209                                              int64_t BrOffset) const {
210   unsigned Bits = getBranchDisplacementBits(BranchOp);
  assert(Bits >= 3 && "max branch displacement must be enough to jump "
                      "over conditional branch expansion");
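  // Branch offsets are encoded as a signed number of 4-byte instructions, so
  // scale the byte offset before checking against the displacement width.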
213   return isIntN(Bits, BrOffset / 4);
214 }
215 
216 MachineBasicBlock *
AArch64InstrInfo::getBranchDestBlock(const MachineInstr &MI) const {
218   switch (MI.getOpcode()) {
219   default:
220     llvm_unreachable("unexpected opcode!");
221   case AArch64::B:
222     return MI.getOperand(0).getMBB();
223   case AArch64::TBZW:
224   case AArch64::TBNZW:
225   case AArch64::TBZX:
226   case AArch64::TBNZX:
227     return MI.getOperand(2).getMBB();
228   case AArch64::CBZW:
229   case AArch64::CBNZW:
230   case AArch64::CBZX:
231   case AArch64::CBNZX:
232   case AArch64::Bcc:
233     return MI.getOperand(1).getMBB();
234   }
235 }
236 
237 // Branch analysis.
bool AArch64InstrInfo::analyzeBranch(MachineBasicBlock &MBB,
239                                      MachineBasicBlock *&TBB,
240                                      MachineBasicBlock *&FBB,
241                                      SmallVectorImpl<MachineOperand> &Cond,
242                                      bool AllowModify) const {
243   // If the block has no terminators, it just falls into the block after it.
244   MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
245   if (I == MBB.end())
246     return false;
247 
248   // Skip over SpeculationBarrierEndBB terminators
249   if (I->getOpcode() == AArch64::SpeculationBarrierISBDSBEndBB ||
250       I->getOpcode() == AArch64::SpeculationBarrierSBEndBB) {
251     --I;
252   }
253 
254   if (!isUnpredicatedTerminator(*I))
255     return false;
256 
257   // Get the last instruction in the block.
258   MachineInstr *LastInst = &*I;
259 
260   // If there is only one terminator instruction, process it.
261   unsigned LastOpc = LastInst->getOpcode();
262   if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
263     if (isUncondBranchOpcode(LastOpc)) {
264       TBB = LastInst->getOperand(0).getMBB();
265       return false;
266     }
267     if (isCondBranchOpcode(LastOpc)) {
268       // Block ends with fall-through condbranch.
269       parseCondBranch(LastInst, TBB, Cond);
270       return false;
271     }
272     return true; // Can't handle indirect branch.
273   }
274 
275   // Get the instruction before it if it is a terminator.
276   MachineInstr *SecondLastInst = &*I;
277   unsigned SecondLastOpc = SecondLastInst->getOpcode();
278 
279   // If AllowModify is true and the block ends with two or more unconditional
280   // branches, delete all but the first unconditional branch.
281   if (AllowModify && isUncondBranchOpcode(LastOpc)) {
282     while (isUncondBranchOpcode(SecondLastOpc)) {
283       LastInst->eraseFromParent();
284       LastInst = SecondLastInst;
285       LastOpc = LastInst->getOpcode();
286       if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
        // Return now; the only remaining terminator is an unconditional branch.
288         TBB = LastInst->getOperand(0).getMBB();
289         return false;
290       } else {
291         SecondLastInst = &*I;
292         SecondLastOpc = SecondLastInst->getOpcode();
293       }
294     }
295   }
296 
  // If we're allowed to modify and the block ends in an unconditional branch
298   // which could simply fallthrough, remove the branch.  (Note: This case only
299   // matters when we can't understand the whole sequence, otherwise it's also
300   // handled by BranchFolding.cpp.)
301   if (AllowModify && isUncondBranchOpcode(LastOpc) &&
302       MBB.isLayoutSuccessor(getBranchDestBlock(*LastInst))) {
303     LastInst->eraseFromParent();
304     LastInst = SecondLastInst;
305     LastOpc = LastInst->getOpcode();
306     if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
307       assert(!isUncondBranchOpcode(LastOpc) &&
308              "unreachable unconditional branches removed above");
309 
310       if (isCondBranchOpcode(LastOpc)) {
311         // Block ends with fall-through condbranch.
312         parseCondBranch(LastInst, TBB, Cond);
313         return false;
314       }
315       return true; // Can't handle indirect branch.
316     } else {
317       SecondLastInst = &*I;
318       SecondLastOpc = SecondLastInst->getOpcode();
319     }
320   }
321 
322   // If there are three terminators, we don't know what sort of block this is.
323   if (SecondLastInst && I != MBB.begin() && isUnpredicatedTerminator(*--I))
324     return true;
325 
326   // If the block ends with a B and a Bcc, handle it.
327   if (isCondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
328     parseCondBranch(SecondLastInst, TBB, Cond);
329     FBB = LastInst->getOperand(0).getMBB();
330     return false;
331   }
332 
333   // If the block ends with two unconditional branches, handle it.  The second
334   // one is not executed, so remove it.
335   if (isUncondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
336     TBB = SecondLastInst->getOperand(0).getMBB();
337     I = LastInst;
338     if (AllowModify)
339       I->eraseFromParent();
340     return false;
341   }
342 
343   // ...likewise if it ends with an indirect branch followed by an unconditional
344   // branch.
345   if (isIndirectBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
346     I = LastInst;
347     if (AllowModify)
348       I->eraseFromParent();
349     return true;
350   }
351 
352   // Otherwise, can't handle this.
353   return true;
354 }
355 
bool AArch64InstrInfo::analyzeBranchPredicate(MachineBasicBlock &MBB,
357                                               MachineBranchPredicate &MBP,
358                                               bool AllowModify) const {
359   // For the moment, handle only a block which ends with a cb(n)zx followed by
360   // a fallthrough.  Why this?  Because it is a common form.
361   // TODO: Should we handle b.cc?
362 
363   MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
364   if (I == MBB.end())
365     return true;
366 
367   // Skip over SpeculationBarrierEndBB terminators
368   if (I->getOpcode() == AArch64::SpeculationBarrierISBDSBEndBB ||
369       I->getOpcode() == AArch64::SpeculationBarrierSBEndBB) {
370     --I;
371   }
372 
373   if (!isUnpredicatedTerminator(*I))
374     return true;
375 
376   // Get the last instruction in the block.
377   MachineInstr *LastInst = &*I;
378   unsigned LastOpc = LastInst->getOpcode();
379   if (!isCondBranchOpcode(LastOpc))
380     return true;
381 
382   switch (LastOpc) {
383   default:
384     return true;
385   case AArch64::CBZW:
386   case AArch64::CBZX:
387   case AArch64::CBNZW:
388   case AArch64::CBNZX:
389     break;
390   };
391 
392   MBP.TrueDest = LastInst->getOperand(1).getMBB();
393   assert(MBP.TrueDest && "expected!");
394   MBP.FalseDest = MBB.getNextNode();
395 
396   MBP.ConditionDef = nullptr;
397   MBP.SingleUseCondition = false;
398 
399   MBP.LHS = LastInst->getOperand(0);
400   MBP.RHS = MachineOperand::CreateImm(0);
401   MBP.Predicate = LastOpc == AArch64::CBNZX ? MachineBranchPredicate::PRED_NE
402                                             : MachineBranchPredicate::PRED_EQ;
403   return false;
404 }
405 
bool AArch64InstrInfo::reverseBranchCondition(
407     SmallVectorImpl<MachineOperand> &Cond) const {
408   if (Cond[0].getImm() != -1) {
409     // Regular Bcc
410     AArch64CC::CondCode CC = (AArch64CC::CondCode)(int)Cond[0].getImm();
411     Cond[0].setImm(AArch64CC::getInvertedCondCode(CC));
412   } else {
413     // Folded compare-and-branch
414     switch (Cond[1].getImm()) {
415     default:
416       llvm_unreachable("Unknown conditional branch!");
417     case AArch64::CBZW:
418       Cond[1].setImm(AArch64::CBNZW);
419       break;
420     case AArch64::CBNZW:
421       Cond[1].setImm(AArch64::CBZW);
422       break;
423     case AArch64::CBZX:
424       Cond[1].setImm(AArch64::CBNZX);
425       break;
426     case AArch64::CBNZX:
427       Cond[1].setImm(AArch64::CBZX);
428       break;
429     case AArch64::TBZW:
430       Cond[1].setImm(AArch64::TBNZW);
431       break;
432     case AArch64::TBNZW:
433       Cond[1].setImm(AArch64::TBZW);
434       break;
435     case AArch64::TBZX:
436       Cond[1].setImm(AArch64::TBNZX);
437       break;
438     case AArch64::TBNZX:
439       Cond[1].setImm(AArch64::TBZX);
440       break;
441     }
442   }
443 
444   return false;
445 }
446 
unsigned AArch64InstrInfo::removeBranch(MachineBasicBlock &MBB,
448                                         int *BytesRemoved) const {
449   MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
450   if (I == MBB.end())
451     return 0;
452 
453   if (!isUncondBranchOpcode(I->getOpcode()) &&
454       !isCondBranchOpcode(I->getOpcode()))
455     return 0;
456 
457   // Remove the branch.
458   I->eraseFromParent();
459 
460   I = MBB.end();
461 
462   if (I == MBB.begin()) {
463     if (BytesRemoved)
464       *BytesRemoved = 4;
465     return 1;
466   }
467   --I;
468   if (!isCondBranchOpcode(I->getOpcode())) {
469     if (BytesRemoved)
470       *BytesRemoved = 4;
471     return 1;
472   }
473 
474   // Remove the branch.
475   I->eraseFromParent();
476   if (BytesRemoved)
477     *BytesRemoved = 8;
478 
479   return 2;
480 }
481 
void AArch64InstrInfo::instantiateCondBranch(
483     MachineBasicBlock &MBB, const DebugLoc &DL, MachineBasicBlock *TBB,
484     ArrayRef<MachineOperand> Cond) const {
485   if (Cond[0].getImm() != -1) {
486     // Regular Bcc
487     BuildMI(&MBB, DL, get(AArch64::Bcc)).addImm(Cond[0].getImm()).addMBB(TBB);
488   } else {
489     // Folded compare-and-branch
    // Note that we use add (not addReg) so that the operand flags on Cond[2]
    // are kept.
491     const MachineInstrBuilder MIB =
492         BuildMI(&MBB, DL, get(Cond[1].getImm())).add(Cond[2]);
493     if (Cond.size() > 3)
494       MIB.addImm(Cond[3].getImm());
495     MIB.addMBB(TBB);
496   }
497 }
498 
unsigned AArch64InstrInfo::insertBranch(
500     MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB,
501     ArrayRef<MachineOperand> Cond, const DebugLoc &DL, int *BytesAdded) const {
502   // Shouldn't be a fall through.
503   assert(TBB && "insertBranch must not be told to insert a fallthrough");
504 
505   if (!FBB) {
506     if (Cond.empty()) // Unconditional branch?
507       BuildMI(&MBB, DL, get(AArch64::B)).addMBB(TBB);
508     else
509       instantiateCondBranch(MBB, DL, TBB, Cond);
510 
511     if (BytesAdded)
512       *BytesAdded = 4;
513 
514     return 1;
515   }
516 
517   // Two-way conditional branch.
518   instantiateCondBranch(MBB, DL, TBB, Cond);
519   BuildMI(&MBB, DL, get(AArch64::B)).addMBB(FBB);
520 
521   if (BytesAdded)
522     *BytesAdded = 8;
523 
524   return 2;
525 }
526 
527 // Find the original register that VReg is copied from.
static unsigned removeCopies(const MachineRegisterInfo &MRI, unsigned VReg) {
529   while (Register::isVirtualRegister(VReg)) {
530     const MachineInstr *DefMI = MRI.getVRegDef(VReg);
531     if (!DefMI->isFullCopy())
532       return VReg;
533     VReg = DefMI->getOperand(1).getReg();
534   }
535   return VReg;
536 }
537 
538 // Determine if VReg is defined by an instruction that can be folded into a
539 // csel instruction. If so, return the folded opcode, and the replacement
540 // register.
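// For example (a sketch): if the true operand of a csel is defined by
// 'ADDWri %src, 1, 0', the select can instead use CSINCWr with %src as the
// false operand, since csinc adds 1 to that operand; insertSelect() inverts
// the condition when it folds the true side.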
static unsigned canFoldIntoCSel(const MachineRegisterInfo &MRI, unsigned VReg,
542                                 unsigned *NewVReg = nullptr) {
543   VReg = removeCopies(MRI, VReg);
544   if (!Register::isVirtualRegister(VReg))
545     return 0;
546 
547   bool Is64Bit = AArch64::GPR64allRegClass.hasSubClassEq(MRI.getRegClass(VReg));
548   const MachineInstr *DefMI = MRI.getVRegDef(VReg);
549   unsigned Opc = 0;
550   unsigned SrcOpNum = 0;
551   switch (DefMI->getOpcode()) {
552   case AArch64::ADDSXri:
553   case AArch64::ADDSWri:
554     // if NZCV is used, do not fold.
555     if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, true) == -1)
556       return 0;
557     // fall-through to ADDXri and ADDWri.
558     LLVM_FALLTHROUGH;
559   case AArch64::ADDXri:
560   case AArch64::ADDWri:
561     // add x, 1 -> csinc.
562     if (!DefMI->getOperand(2).isImm() || DefMI->getOperand(2).getImm() != 1 ||
563         DefMI->getOperand(3).getImm() != 0)
564       return 0;
565     SrcOpNum = 1;
566     Opc = Is64Bit ? AArch64::CSINCXr : AArch64::CSINCWr;
567     break;
568 
569   case AArch64::ORNXrr:
570   case AArch64::ORNWrr: {
571     // not x -> csinv, represented as orn dst, xzr, src.
572     unsigned ZReg = removeCopies(MRI, DefMI->getOperand(1).getReg());
573     if (ZReg != AArch64::XZR && ZReg != AArch64::WZR)
574       return 0;
575     SrcOpNum = 2;
576     Opc = Is64Bit ? AArch64::CSINVXr : AArch64::CSINVWr;
577     break;
578   }
579 
580   case AArch64::SUBSXrr:
581   case AArch64::SUBSWrr:
582     // if NZCV is used, do not fold.
583     if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, true) == -1)
584       return 0;
585     // fall-through to SUBXrr and SUBWrr.
586     LLVM_FALLTHROUGH;
587   case AArch64::SUBXrr:
588   case AArch64::SUBWrr: {
589     // neg x -> csneg, represented as sub dst, xzr, src.
590     unsigned ZReg = removeCopies(MRI, DefMI->getOperand(1).getReg());
591     if (ZReg != AArch64::XZR && ZReg != AArch64::WZR)
592       return 0;
593     SrcOpNum = 2;
594     Opc = Is64Bit ? AArch64::CSNEGXr : AArch64::CSNEGWr;
595     break;
596   }
597   default:
598     return 0;
599   }
600   assert(Opc && SrcOpNum && "Missing parameters");
601 
602   if (NewVReg)
603     *NewVReg = DefMI->getOperand(SrcOpNum).getReg();
604   return Opc;
605 }
606 
bool AArch64InstrInfo::canInsertSelect(const MachineBasicBlock &MBB,
608                                        ArrayRef<MachineOperand> Cond,
609                                        Register DstReg, Register TrueReg,
610                                        Register FalseReg, int &CondCycles,
611                                        int &TrueCycles,
612                                        int &FalseCycles) const {
613   // Check register classes.
614   const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
615   const TargetRegisterClass *RC =
616       RI.getCommonSubClass(MRI.getRegClass(TrueReg), MRI.getRegClass(FalseReg));
617   if (!RC)
618     return false;
619 
620   // Also need to check the dest regclass, in case we're trying to optimize
621   // something like:
622   // %1(gpr) = PHI %2(fpr), bb1, %(fpr), bb2
623   if (!RI.getCommonSubClass(RC, MRI.getRegClass(DstReg)))
624     return false;
625 
626   // Expanding cbz/tbz requires an extra cycle of latency on the condition.
627   unsigned ExtraCondLat = Cond.size() != 1;
628 
629   // GPRs are handled by csel.
630   // FIXME: Fold in x+1, -x, and ~x when applicable.
631   if (AArch64::GPR64allRegClass.hasSubClassEq(RC) ||
632       AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
633     // Single-cycle csel, csinc, csinv, and csneg.
634     CondCycles = 1 + ExtraCondLat;
635     TrueCycles = FalseCycles = 1;
636     if (canFoldIntoCSel(MRI, TrueReg))
637       TrueCycles = 0;
638     else if (canFoldIntoCSel(MRI, FalseReg))
639       FalseCycles = 0;
640     return true;
641   }
642 
643   // Scalar floating point is handled by fcsel.
644   // FIXME: Form fabs, fmin, and fmax when applicable.
645   if (AArch64::FPR64RegClass.hasSubClassEq(RC) ||
646       AArch64::FPR32RegClass.hasSubClassEq(RC)) {
647     CondCycles = 5 + ExtraCondLat;
648     TrueCycles = FalseCycles = 2;
649     return true;
650   }
651 
652   // Can't do vectors.
653   return false;
654 }
655 
void AArch64InstrInfo::insertSelect(MachineBasicBlock &MBB,
657                                     MachineBasicBlock::iterator I,
658                                     const DebugLoc &DL, Register DstReg,
659                                     ArrayRef<MachineOperand> Cond,
660                                     Register TrueReg, Register FalseReg) const {
661   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
662 
663   // Parse the condition code, see parseCondBranch() above.
664   AArch64CC::CondCode CC;
665   switch (Cond.size()) {
666   default:
667     llvm_unreachable("Unknown condition opcode in Cond");
668   case 1: // b.cc
669     CC = AArch64CC::CondCode(Cond[0].getImm());
670     break;
671   case 3: { // cbz/cbnz
672     // We must insert a compare against 0.
673     bool Is64Bit;
674     switch (Cond[1].getImm()) {
675     default:
676       llvm_unreachable("Unknown branch opcode in Cond");
677     case AArch64::CBZW:
678       Is64Bit = false;
679       CC = AArch64CC::EQ;
680       break;
681     case AArch64::CBZX:
682       Is64Bit = true;
683       CC = AArch64CC::EQ;
684       break;
685     case AArch64::CBNZW:
686       Is64Bit = false;
687       CC = AArch64CC::NE;
688       break;
689     case AArch64::CBNZX:
690       Is64Bit = true;
691       CC = AArch64CC::NE;
692       break;
693     }
694     Register SrcReg = Cond[2].getReg();
695     if (Is64Bit) {
696       // cmp reg, #0 is actually subs xzr, reg, #0.
697       MRI.constrainRegClass(SrcReg, &AArch64::GPR64spRegClass);
698       BuildMI(MBB, I, DL, get(AArch64::SUBSXri), AArch64::XZR)
699           .addReg(SrcReg)
700           .addImm(0)
701           .addImm(0);
702     } else {
703       MRI.constrainRegClass(SrcReg, &AArch64::GPR32spRegClass);
704       BuildMI(MBB, I, DL, get(AArch64::SUBSWri), AArch64::WZR)
705           .addReg(SrcReg)
706           .addImm(0)
707           .addImm(0);
708     }
709     break;
710   }
711   case 4: { // tbz/tbnz
712     // We must insert a tst instruction.
713     switch (Cond[1].getImm()) {
714     default:
715       llvm_unreachable("Unknown branch opcode in Cond");
716     case AArch64::TBZW:
717     case AArch64::TBZX:
718       CC = AArch64CC::EQ;
719       break;
720     case AArch64::TBNZW:
721     case AArch64::TBNZX:
722       CC = AArch64CC::NE;
723       break;
724     }
    // tst reg, #foo is actually ands xzr, reg, #(1 << foo).
726     if (Cond[1].getImm() == AArch64::TBZW || Cond[1].getImm() == AArch64::TBNZW)
727       BuildMI(MBB, I, DL, get(AArch64::ANDSWri), AArch64::WZR)
728           .addReg(Cond[2].getReg())
729           .addImm(
730               AArch64_AM::encodeLogicalImmediate(1ull << Cond[3].getImm(), 32));
731     else
732       BuildMI(MBB, I, DL, get(AArch64::ANDSXri), AArch64::XZR)
733           .addReg(Cond[2].getReg())
734           .addImm(
735               AArch64_AM::encodeLogicalImmediate(1ull << Cond[3].getImm(), 64));
736     break;
737   }
738   }
739 
740   unsigned Opc = 0;
741   const TargetRegisterClass *RC = nullptr;
742   bool TryFold = false;
743   if (MRI.constrainRegClass(DstReg, &AArch64::GPR64RegClass)) {
744     RC = &AArch64::GPR64RegClass;
745     Opc = AArch64::CSELXr;
746     TryFold = true;
747   } else if (MRI.constrainRegClass(DstReg, &AArch64::GPR32RegClass)) {
748     RC = &AArch64::GPR32RegClass;
749     Opc = AArch64::CSELWr;
750     TryFold = true;
751   } else if (MRI.constrainRegClass(DstReg, &AArch64::FPR64RegClass)) {
752     RC = &AArch64::FPR64RegClass;
753     Opc = AArch64::FCSELDrrr;
754   } else if (MRI.constrainRegClass(DstReg, &AArch64::FPR32RegClass)) {
755     RC = &AArch64::FPR32RegClass;
756     Opc = AArch64::FCSELSrrr;
757   }
758   assert(RC && "Unsupported regclass");
759 
760   // Try folding simple instructions into the csel.
761   if (TryFold) {
762     unsigned NewVReg = 0;
763     unsigned FoldedOpc = canFoldIntoCSel(MRI, TrueReg, &NewVReg);
764     if (FoldedOpc) {
      // The folded opcodes csinc, csinv and csneg apply the operation to
      // FalseReg, so we need to invert the condition.
767       CC = AArch64CC::getInvertedCondCode(CC);
768       TrueReg = FalseReg;
769     } else
770       FoldedOpc = canFoldIntoCSel(MRI, FalseReg, &NewVReg);
771 
772     // Fold the operation. Leave any dead instructions for DCE to clean up.
773     if (FoldedOpc) {
774       FalseReg = NewVReg;
775       Opc = FoldedOpc;
      // This extends the live range of NewVReg.
777       MRI.clearKillFlags(NewVReg);
778     }
779   }
780 
  // Pull all virtual registers into the appropriate class.
782   MRI.constrainRegClass(TrueReg, RC);
783   MRI.constrainRegClass(FalseReg, RC);
784 
785   // Insert the csel.
786   BuildMI(MBB, I, DL, get(Opc), DstReg)
787       .addReg(TrueReg)
788       .addReg(FalseReg)
789       .addImm(CC);
790 }
791 
/// Returns true if a MOVi32imm or MOVi64imm can be expanded to an ORRxx.
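/// For example, MOVi32imm #0x00ff00ff is a repeating 16-bit pattern and thus
/// a valid logical immediate, so it can be materialized with a single ORRWri
/// from WZR.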
static bool canBeExpandedToORR(const MachineInstr &MI, unsigned BitSize) {
794   uint64_t Imm = MI.getOperand(1).getImm();
795   uint64_t UImm = Imm << (64 - BitSize) >> (64 - BitSize);
796   uint64_t Encoding;
797   return AArch64_AM::processLogicalImmediate(UImm, BitSize, Encoding);
798 }
799 
800 // FIXME: this implementation should be micro-architecture dependent, so a
801 // micro-architecture target hook should be introduced here in future.
bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr &MI) const {
803   if (!Subtarget.hasCustomCheapAsMoveHandling())
804     return MI.isAsCheapAsAMove();
805 
806   const unsigned Opcode = MI.getOpcode();
807 
808   // Firstly, check cases gated by features.
809 
810   if (Subtarget.hasZeroCycleZeroingFP()) {
811     if (Opcode == AArch64::FMOVH0 ||
812         Opcode == AArch64::FMOVS0 ||
813         Opcode == AArch64::FMOVD0)
814       return true;
815   }
816 
817   if (Subtarget.hasZeroCycleZeroingGP()) {
818     if (Opcode == TargetOpcode::COPY &&
819         (MI.getOperand(1).getReg() == AArch64::WZR ||
820          MI.getOperand(1).getReg() == AArch64::XZR))
821       return true;
822   }
823 
824   // Secondly, check cases specific to sub-targets.
825 
826   if (Subtarget.hasExynosCheapAsMoveHandling()) {
827     if (isExynosCheapAsMove(MI))
828       return true;
829 
830     return MI.isAsCheapAsAMove();
831   }
832 
833   // Finally, check generic cases.
834 
835   switch (Opcode) {
836   default:
837     return false;
838 
  // add/sub with an immediate and no shift
840   case AArch64::ADDWri:
841   case AArch64::ADDXri:
842   case AArch64::SUBWri:
843   case AArch64::SUBXri:
844     return (MI.getOperand(3).getImm() == 0);
845 
846   // logical ops on immediate
847   case AArch64::ANDWri:
848   case AArch64::ANDXri:
849   case AArch64::EORWri:
850   case AArch64::EORXri:
851   case AArch64::ORRWri:
852   case AArch64::ORRXri:
853     return true;
854 
855   // logical ops on register without shift
856   case AArch64::ANDWrr:
857   case AArch64::ANDXrr:
858   case AArch64::BICWrr:
859   case AArch64::BICXrr:
860   case AArch64::EONWrr:
861   case AArch64::EONXrr:
862   case AArch64::EORWrr:
863   case AArch64::EORXrr:
864   case AArch64::ORNWrr:
865   case AArch64::ORNXrr:
866   case AArch64::ORRWrr:
867   case AArch64::ORRXrr:
868     return true;
869 
870   // If MOVi32imm or MOVi64imm can be expanded into ORRWri or
871   // ORRXri, it is as cheap as MOV
872   case AArch64::MOVi32imm:
873     return canBeExpandedToORR(MI, 32);
874   case AArch64::MOVi64imm:
875     return canBeExpandedToORR(MI, 64);
876   }
877 
878   llvm_unreachable("Unknown opcode to check as cheap as a move!");
879 }
880 
bool AArch64InstrInfo::isFalkorShiftExtFast(const MachineInstr &MI) {
882   switch (MI.getOpcode()) {
883   default:
884     return false;
885 
886   case AArch64::ADDWrs:
887   case AArch64::ADDXrs:
888   case AArch64::ADDSWrs:
889   case AArch64::ADDSXrs: {
890     unsigned Imm = MI.getOperand(3).getImm();
891     unsigned ShiftVal = AArch64_AM::getShiftValue(Imm);
892     if (ShiftVal == 0)
893       return true;
894     return AArch64_AM::getShiftType(Imm) == AArch64_AM::LSL && ShiftVal <= 5;
895   }
896 
897   case AArch64::ADDWrx:
898   case AArch64::ADDXrx:
899   case AArch64::ADDXrx64:
900   case AArch64::ADDSWrx:
901   case AArch64::ADDSXrx:
902   case AArch64::ADDSXrx64: {
903     unsigned Imm = MI.getOperand(3).getImm();
904     switch (AArch64_AM::getArithExtendType(Imm)) {
905     default:
906       return false;
907     case AArch64_AM::UXTB:
908     case AArch64_AM::UXTH:
909     case AArch64_AM::UXTW:
910     case AArch64_AM::UXTX:
911       return AArch64_AM::getArithShiftValue(Imm) <= 4;
912     }
913   }
914 
915   case AArch64::SUBWrs:
916   case AArch64::SUBSWrs: {
917     unsigned Imm = MI.getOperand(3).getImm();
918     unsigned ShiftVal = AArch64_AM::getShiftValue(Imm);
919     return ShiftVal == 0 ||
920            (AArch64_AM::getShiftType(Imm) == AArch64_AM::ASR && ShiftVal == 31);
921   }
922 
923   case AArch64::SUBXrs:
924   case AArch64::SUBSXrs: {
925     unsigned Imm = MI.getOperand(3).getImm();
926     unsigned ShiftVal = AArch64_AM::getShiftValue(Imm);
927     return ShiftVal == 0 ||
928            (AArch64_AM::getShiftType(Imm) == AArch64_AM::ASR && ShiftVal == 63);
929   }
930 
931   case AArch64::SUBWrx:
932   case AArch64::SUBXrx:
933   case AArch64::SUBXrx64:
934   case AArch64::SUBSWrx:
935   case AArch64::SUBSXrx:
936   case AArch64::SUBSXrx64: {
937     unsigned Imm = MI.getOperand(3).getImm();
938     switch (AArch64_AM::getArithExtendType(Imm)) {
939     default:
940       return false;
941     case AArch64_AM::UXTB:
942     case AArch64_AM::UXTH:
943     case AArch64_AM::UXTW:
944     case AArch64_AM::UXTX:
945       return AArch64_AM::getArithShiftValue(Imm) == 0;
946     }
947   }
948 
949   case AArch64::LDRBBroW:
950   case AArch64::LDRBBroX:
951   case AArch64::LDRBroW:
952   case AArch64::LDRBroX:
953   case AArch64::LDRDroW:
954   case AArch64::LDRDroX:
955   case AArch64::LDRHHroW:
956   case AArch64::LDRHHroX:
957   case AArch64::LDRHroW:
958   case AArch64::LDRHroX:
959   case AArch64::LDRQroW:
960   case AArch64::LDRQroX:
961   case AArch64::LDRSBWroW:
962   case AArch64::LDRSBWroX:
963   case AArch64::LDRSBXroW:
964   case AArch64::LDRSBXroX:
965   case AArch64::LDRSHWroW:
966   case AArch64::LDRSHWroX:
967   case AArch64::LDRSHXroW:
968   case AArch64::LDRSHXroX:
969   case AArch64::LDRSWroW:
970   case AArch64::LDRSWroX:
971   case AArch64::LDRSroW:
972   case AArch64::LDRSroX:
973   case AArch64::LDRWroW:
974   case AArch64::LDRWroX:
975   case AArch64::LDRXroW:
976   case AArch64::LDRXroX:
977   case AArch64::PRFMroW:
978   case AArch64::PRFMroX:
979   case AArch64::STRBBroW:
980   case AArch64::STRBBroX:
981   case AArch64::STRBroW:
982   case AArch64::STRBroX:
983   case AArch64::STRDroW:
984   case AArch64::STRDroX:
985   case AArch64::STRHHroW:
986   case AArch64::STRHHroX:
987   case AArch64::STRHroW:
988   case AArch64::STRHroX:
989   case AArch64::STRQroW:
990   case AArch64::STRQroX:
991   case AArch64::STRSroW:
992   case AArch64::STRSroX:
993   case AArch64::STRWroW:
994   case AArch64::STRWroX:
995   case AArch64::STRXroW:
996   case AArch64::STRXroX: {
997     unsigned IsSigned = MI.getOperand(3).getImm();
998     return !IsSigned;
999   }
1000   }
1001 }
1002 
bool AArch64InstrInfo::isSEHInstruction(const MachineInstr &MI) {
1004   unsigned Opc = MI.getOpcode();
1005   switch (Opc) {
1006     default:
1007       return false;
1008     case AArch64::SEH_StackAlloc:
1009     case AArch64::SEH_SaveFPLR:
1010     case AArch64::SEH_SaveFPLR_X:
1011     case AArch64::SEH_SaveReg:
1012     case AArch64::SEH_SaveReg_X:
1013     case AArch64::SEH_SaveRegP:
1014     case AArch64::SEH_SaveRegP_X:
1015     case AArch64::SEH_SaveFReg:
1016     case AArch64::SEH_SaveFReg_X:
1017     case AArch64::SEH_SaveFRegP:
1018     case AArch64::SEH_SaveFRegP_X:
1019     case AArch64::SEH_SetFP:
1020     case AArch64::SEH_AddFP:
1021     case AArch64::SEH_Nop:
1022     case AArch64::SEH_PrologEnd:
1023     case AArch64::SEH_EpilogStart:
1024     case AArch64::SEH_EpilogEnd:
1025       return true;
1026   }
1027 }
1028 
bool AArch64InstrInfo::isCoalescableExtInstr(const MachineInstr &MI,
1030                                              Register &SrcReg, Register &DstReg,
1031                                              unsigned &SubIdx) const {
1032   switch (MI.getOpcode()) {
1033   default:
1034     return false;
1035   case AArch64::SBFMXri: // aka sxtw
1036   case AArch64::UBFMXri: // aka uxtw
1037     // Check for the 32 -> 64 bit extension case, these instructions can do
1038     // much more.
1039     if (MI.getOperand(2).getImm() != 0 || MI.getOperand(3).getImm() != 31)
1040       return false;
1041     // This is a signed or unsigned 32 -> 64 bit extension.
1042     SrcReg = MI.getOperand(1).getReg();
1043     DstReg = MI.getOperand(0).getReg();
1044     SubIdx = AArch64::sub_32;
1045     return true;
1046   }
1047 }
1048 
bool AArch64InstrInfo::areMemAccessesTriviallyDisjoint(
1050     const MachineInstr &MIa, const MachineInstr &MIb) const {
1051   const TargetRegisterInfo *TRI = &getRegisterInfo();
1052   const MachineOperand *BaseOpA = nullptr, *BaseOpB = nullptr;
1053   int64_t OffsetA = 0, OffsetB = 0;
1054   unsigned WidthA = 0, WidthB = 0;
1055   bool OffsetAIsScalable = false, OffsetBIsScalable = false;
1056 
1057   assert(MIa.mayLoadOrStore() && "MIa must be a load or store.");
1058   assert(MIb.mayLoadOrStore() && "MIb must be a load or store.");
1059 
1060   if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects() ||
1061       MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())
1062     return false;
1063 
  // Retrieve the base, the offset from the base, and the width. Width is the
  // size of memory that is being loaded/stored (e.g. 1, 2, 4, 8). If the
  // bases are identical, and the offset of the lower access plus its width
  // does not overlap the offset of the higher access, then the memory
  // accesses are disjoint.
1069   // If OffsetAIsScalable and OffsetBIsScalable are both true, they
1070   // are assumed to have the same scale (vscale).
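  // For example, two 8-byte accesses off the same base at offsets #0 and #8
  // are disjoint because 0 + 8 <= 8.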
1071   if (getMemOperandWithOffsetWidth(MIa, BaseOpA, OffsetA, OffsetAIsScalable,
1072                                    WidthA, TRI) &&
1073       getMemOperandWithOffsetWidth(MIb, BaseOpB, OffsetB, OffsetBIsScalable,
1074                                    WidthB, TRI)) {
1075     if (BaseOpA->isIdenticalTo(*BaseOpB) &&
1076         OffsetAIsScalable == OffsetBIsScalable) {
1077       int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
1078       int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
1079       int LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
1080       if (LowOffset + LowWidth <= HighOffset)
1081         return true;
1082     }
1083   }
1084   return false;
1085 }
1086 
bool AArch64InstrInfo::isSchedulingBoundary(const MachineInstr &MI,
1088                                             const MachineBasicBlock *MBB,
1089                                             const MachineFunction &MF) const {
1090   if (TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF))
1091     return true;
1092   switch (MI.getOpcode()) {
1093   case AArch64::HINT:
1094     // CSDB hints are scheduling barriers.
1095     if (MI.getOperand(0).getImm() == 0x14)
1096       return true;
1097     break;
1098   case AArch64::DSB:
1099   case AArch64::ISB:
1100     // DSB and ISB also are scheduling barriers.
1101     return true;
1102   default:;
1103   }
1104   return isSEHInstruction(MI);
1105 }
1106 
1107 /// analyzeCompare - For a comparison instruction, return the source registers
1108 /// in SrcReg and SrcReg2, and the value it compares against in CmpValue.
1109 /// Return true if the comparison instruction can be analyzed.
bool AArch64InstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg,
1111                                       Register &SrcReg2, int &CmpMask,
1112                                       int &CmpValue) const {
1113   // The first operand can be a frame index where we'd normally expect a
1114   // register.
1115   assert(MI.getNumOperands() >= 2 && "All AArch64 cmps should have 2 operands");
1116   if (!MI.getOperand(1).isReg())
1117     return false;
1118 
1119   switch (MI.getOpcode()) {
1120   default:
1121     break;
1122   case AArch64::SUBSWrr:
1123   case AArch64::SUBSWrs:
1124   case AArch64::SUBSWrx:
1125   case AArch64::SUBSXrr:
1126   case AArch64::SUBSXrs:
1127   case AArch64::SUBSXrx:
1128   case AArch64::ADDSWrr:
1129   case AArch64::ADDSWrs:
1130   case AArch64::ADDSWrx:
1131   case AArch64::ADDSXrr:
1132   case AArch64::ADDSXrs:
1133   case AArch64::ADDSXrx:
1134     // Replace SUBSWrr with SUBWrr if NZCV is not used.
1135     SrcReg = MI.getOperand(1).getReg();
1136     SrcReg2 = MI.getOperand(2).getReg();
1137     CmpMask = ~0;
1138     CmpValue = 0;
1139     return true;
1140   case AArch64::SUBSWri:
1141   case AArch64::ADDSWri:
1142   case AArch64::SUBSXri:
1143   case AArch64::ADDSXri:
1144     SrcReg = MI.getOperand(1).getReg();
1145     SrcReg2 = 0;
1146     CmpMask = ~0;
    // FIXME: CmpValue is collapsed to 0 or 1 here; callers only ever compare
    // it against zero.
1148     CmpValue = MI.getOperand(2).getImm() != 0;
1149     return true;
1150   case AArch64::ANDSWri:
1151   case AArch64::ANDSXri:
    // ANDS does not use the same encoding scheme as the other xxxS
    // instructions.
1154     SrcReg = MI.getOperand(1).getReg();
1155     SrcReg2 = 0;
1156     CmpMask = ~0;
    // FIXME: The return value type of decodeLogicalImmediate is uint64_t,
    // while the type of CmpValue is int. When converting uint64_t to int,
    // the high 32 bits of uint64_t will be lost.
    // In fact it causes a bug in spec2006-483.xalancbmk.
    // CmpValue is only used to compare with zero in optimizeCompareInstr.
1162     CmpValue = AArch64_AM::decodeLogicalImmediate(
1163                    MI.getOperand(2).getImm(),
1164                    MI.getOpcode() == AArch64::ANDSWri ? 32 : 64) != 0;
1165     return true;
1166   }
1167 
1168   return false;
1169 }
1170 
static bool UpdateOperandRegClass(MachineInstr &Instr) {
1172   MachineBasicBlock *MBB = Instr.getParent();
1173   assert(MBB && "Can't get MachineBasicBlock here");
1174   MachineFunction *MF = MBB->getParent();
1175   assert(MF && "Can't get MachineFunction here");
1176   const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
1177   const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
1178   MachineRegisterInfo *MRI = &MF->getRegInfo();
1179 
1180   for (unsigned OpIdx = 0, EndIdx = Instr.getNumOperands(); OpIdx < EndIdx;
1181        ++OpIdx) {
1182     MachineOperand &MO = Instr.getOperand(OpIdx);
1183     const TargetRegisterClass *OpRegCstraints =
1184         Instr.getRegClassConstraint(OpIdx, TII, TRI);
1185 
1186     // If there's no constraint, there's nothing to do.
1187     if (!OpRegCstraints)
1188       continue;
1189     // If the operand is a frame index, there's nothing to do here.
1190     // A frame index operand will resolve correctly during PEI.
1191     if (MO.isFI())
1192       continue;
1193 
1194     assert(MO.isReg() &&
1195            "Operand has register constraints without being a register!");
1196 
1197     Register Reg = MO.getReg();
1198     if (Register::isPhysicalRegister(Reg)) {
1199       if (!OpRegCstraints->contains(Reg))
1200         return false;
1201     } else if (!OpRegCstraints->hasSubClassEq(MRI->getRegClass(Reg)) &&
1202                !MRI->constrainRegClass(Reg, OpRegCstraints))
1203       return false;
1204   }
1205 
1206   return true;
1207 }
1208 
1209 /// Return the opcode that does not set flags when possible - otherwise
/// return the original opcode. The caller is responsible for doing the actual
1211 /// substitution and legality checking.
static unsigned convertToNonFlagSettingOpc(const MachineInstr &MI) {
1213   // Don't convert all compare instructions, because for some the zero register
1214   // encoding becomes the sp register.
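  // (In the non-flag-setting ri/rs forms, register 31 encodes WSP/SP rather
  // than WZR/XZR, so an instruction that defines the zero register must keep
  // its flag-setting form.)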
1215   bool MIDefinesZeroReg = false;
1216   if (MI.definesRegister(AArch64::WZR) || MI.definesRegister(AArch64::XZR))
1217     MIDefinesZeroReg = true;
1218 
1219   switch (MI.getOpcode()) {
1220   default:
1221     return MI.getOpcode();
1222   case AArch64::ADDSWrr:
1223     return AArch64::ADDWrr;
1224   case AArch64::ADDSWri:
1225     return MIDefinesZeroReg ? AArch64::ADDSWri : AArch64::ADDWri;
1226   case AArch64::ADDSWrs:
1227     return MIDefinesZeroReg ? AArch64::ADDSWrs : AArch64::ADDWrs;
1228   case AArch64::ADDSWrx:
1229     return AArch64::ADDWrx;
1230   case AArch64::ADDSXrr:
1231     return AArch64::ADDXrr;
1232   case AArch64::ADDSXri:
1233     return MIDefinesZeroReg ? AArch64::ADDSXri : AArch64::ADDXri;
1234   case AArch64::ADDSXrs:
1235     return MIDefinesZeroReg ? AArch64::ADDSXrs : AArch64::ADDXrs;
1236   case AArch64::ADDSXrx:
1237     return AArch64::ADDXrx;
1238   case AArch64::SUBSWrr:
1239     return AArch64::SUBWrr;
1240   case AArch64::SUBSWri:
1241     return MIDefinesZeroReg ? AArch64::SUBSWri : AArch64::SUBWri;
1242   case AArch64::SUBSWrs:
1243     return MIDefinesZeroReg ? AArch64::SUBSWrs : AArch64::SUBWrs;
1244   case AArch64::SUBSWrx:
1245     return AArch64::SUBWrx;
1246   case AArch64::SUBSXrr:
1247     return AArch64::SUBXrr;
1248   case AArch64::SUBSXri:
1249     return MIDefinesZeroReg ? AArch64::SUBSXri : AArch64::SUBXri;
1250   case AArch64::SUBSXrs:
1251     return MIDefinesZeroReg ? AArch64::SUBSXrs : AArch64::SUBXrs;
1252   case AArch64::SUBSXrx:
1253     return AArch64::SUBXrx;
1254   }
1255 }
1256 
1257 enum AccessKind { AK_Write = 0x01, AK_Read = 0x10, AK_All = 0x11 };
1258 
1259 /// True when condition flags are accessed (either by writing or reading)
1260 /// on the instruction trace starting at From and ending at To.
1261 ///
/// Note: If From and To are from different blocks it's assumed the condition
/// flags are accessed on the path.
static bool areCFlagsAccessedBetweenInstrs(
1265     MachineBasicBlock::iterator From, MachineBasicBlock::iterator To,
1266     const TargetRegisterInfo *TRI, const AccessKind AccessToCheck = AK_All) {
1267   // Early exit if To is at the beginning of the BB.
1268   if (To == To->getParent()->begin())
1269     return true;
1270 
1271   // Check whether the instructions are in the same basic block
1272   // If not, assume the condition flags might get modified somewhere.
1273   if (To->getParent() != From->getParent())
1274     return true;
1275 
1276   // From must be above To.
1277   assert(std::find_if(++To.getReverse(), To->getParent()->rend(),
1278                       [From](MachineInstr &MI) {
1279                         return MI.getIterator() == From;
1280                       }) != To->getParent()->rend());
1281 
1282   // We iterate backward starting at \p To until we hit \p From.
1283   for (const MachineInstr &Instr :
1284        instructionsWithoutDebug(++To.getReverse(), From.getReverse())) {
1285     if (((AccessToCheck & AK_Write) &&
1286          Instr.modifiesRegister(AArch64::NZCV, TRI)) ||
1287         ((AccessToCheck & AK_Read) && Instr.readsRegister(AArch64::NZCV, TRI)))
1288       return true;
1289   }
1290   return false;
1291 }
1292 
1293 /// Try to optimize a compare instruction. A compare instruction is an
/// instruction which produces AArch64::NZCV. It can be treated as a true
/// compare instruction when there are no uses of its destination register.
1297 ///
1298 /// The following steps are tried in order:
1299 /// 1. Convert CmpInstr into an unconditional version.
1300 /// 2. Remove CmpInstr if above there is an instruction producing a needed
1301 ///    condition code or an instruction which can be converted into such an
1302 ///    instruction.
1303 ///    Only comparison with zero is supported.
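/// For example (a sketch): '%w8 = SUBWrr %w9, %w10' followed by
/// 'cmp %w8, #0' can become '%w8 = SUBSWrr %w9, %w10' with the compare
/// erased, provided only the N and Z flags are consumed afterwards.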
bool AArch64InstrInfo::optimizeCompareInstr(
1305     MachineInstr &CmpInstr, Register SrcReg, Register SrcReg2, int CmpMask,
1306     int CmpValue, const MachineRegisterInfo *MRI) const {
1307   assert(CmpInstr.getParent());
1308   assert(MRI);
1309 
1310   // Replace SUBSWrr with SUBWrr if NZCV is not used.
1311   int DeadNZCVIdx = CmpInstr.findRegisterDefOperandIdx(AArch64::NZCV, true);
1312   if (DeadNZCVIdx != -1) {
1313     if (CmpInstr.definesRegister(AArch64::WZR) ||
1314         CmpInstr.definesRegister(AArch64::XZR)) {
1315       CmpInstr.eraseFromParent();
1316       return true;
1317     }
1318     unsigned Opc = CmpInstr.getOpcode();
1319     unsigned NewOpc = convertToNonFlagSettingOpc(CmpInstr);
1320     if (NewOpc == Opc)
1321       return false;
1322     const MCInstrDesc &MCID = get(NewOpc);
1323     CmpInstr.setDesc(MCID);
1324     CmpInstr.RemoveOperand(DeadNZCVIdx);
1325     bool succeeded = UpdateOperandRegClass(CmpInstr);
1326     (void)succeeded;
1327     assert(succeeded && "Some operands reg class are incompatible!");
1328     return true;
1329   }
1330 
  // Continue only if we have an "ri" form whose immediate is zero.
  // FIXME: CmpValue has already been converted to 0 or 1 in the
  // analyzeCompare function.
1334   assert((CmpValue == 0 || CmpValue == 1) && "CmpValue must be 0 or 1!");
1335   if (CmpValue != 0 || SrcReg2 != 0)
1336     return false;
1337 
  // CmpInstr is a compare instruction if its destination register is not used.
1339   if (!MRI->use_nodbg_empty(CmpInstr.getOperand(0).getReg()))
1340     return false;
1341 
1342   return substituteCmpToZero(CmpInstr, SrcReg, MRI);
1343 }
1344 
/// Get the opcode of the S (flag-setting) version of Instr.
/// If Instr is already an S form, its opcode is returned.
/// AArch64::INSTRUCTION_LIST_END is returned if Instr does not have an S
/// version or we are not interested in it.
static unsigned sForm(MachineInstr &Instr) {
1350   switch (Instr.getOpcode()) {
1351   default:
1352     return AArch64::INSTRUCTION_LIST_END;
1353 
1354   case AArch64::ADDSWrr:
1355   case AArch64::ADDSWri:
1356   case AArch64::ADDSXrr:
1357   case AArch64::ADDSXri:
1358   case AArch64::SUBSWrr:
1359   case AArch64::SUBSWri:
1360   case AArch64::SUBSXrr:
1361   case AArch64::SUBSXri:
1362     return Instr.getOpcode();
1363 
1364   case AArch64::ADDWrr:
1365     return AArch64::ADDSWrr;
1366   case AArch64::ADDWri:
1367     return AArch64::ADDSWri;
1368   case AArch64::ADDXrr:
1369     return AArch64::ADDSXrr;
1370   case AArch64::ADDXri:
1371     return AArch64::ADDSXri;
1372   case AArch64::ADCWr:
1373     return AArch64::ADCSWr;
1374   case AArch64::ADCXr:
1375     return AArch64::ADCSXr;
1376   case AArch64::SUBWrr:
1377     return AArch64::SUBSWrr;
1378   case AArch64::SUBWri:
1379     return AArch64::SUBSWri;
1380   case AArch64::SUBXrr:
1381     return AArch64::SUBSXrr;
1382   case AArch64::SUBXri:
1383     return AArch64::SUBSXri;
1384   case AArch64::SBCWr:
1385     return AArch64::SBCSWr;
1386   case AArch64::SBCXr:
1387     return AArch64::SBCSXr;
1388   case AArch64::ANDWri:
1389     return AArch64::ANDSWri;
1390   case AArch64::ANDXri:
1391     return AArch64::ANDSXri;
1392   }
1393 }
1394 
1395 /// Check if AArch64::NZCV should be alive in successors of MBB.
static bool areCFlagsAliveInSuccessors(MachineBasicBlock *MBB) {
1397   for (auto *BB : MBB->successors())
1398     if (BB->isLiveIn(AArch64::NZCV))
1399       return true;
1400   return false;
1401 }
1402 
1403 namespace {
1404 
1405 struct UsedNZCV {
1406   bool N = false;
1407   bool Z = false;
1408   bool C = false;
1409   bool V = false;
1410 
1411   UsedNZCV() = default;
1412 
  UsedNZCV &operator|=(const UsedNZCV &UsedFlags) {
1414     this->N |= UsedFlags.N;
1415     this->Z |= UsedFlags.Z;
1416     this->C |= UsedFlags.C;
1417     this->V |= UsedFlags.V;
1418     return *this;
1419   }
1420 };
1421 
1422 } // end anonymous namespace
1423 
1424 /// Find a condition code used by the instruction.
1425 /// Returns AArch64CC::Invalid if either the instruction does not use condition
1426 /// codes or we don't optimize CmpInstr in the presence of such instructions.
static AArch64CC::CondCode findCondCodeUsedByInstr(const MachineInstr &Instr) {
1428   switch (Instr.getOpcode()) {
1429   default:
1430     return AArch64CC::Invalid;
1431 
1432   case AArch64::Bcc: {
1433     int Idx = Instr.findRegisterUseOperandIdx(AArch64::NZCV);
1434     assert(Idx >= 2);
1435     return static_cast<AArch64CC::CondCode>(Instr.getOperand(Idx - 2).getImm());
1436   }
1437 
1438   case AArch64::CSINVWr:
1439   case AArch64::CSINVXr:
1440   case AArch64::CSINCWr:
1441   case AArch64::CSINCXr:
1442   case AArch64::CSELWr:
1443   case AArch64::CSELXr:
1444   case AArch64::CSNEGWr:
1445   case AArch64::CSNEGXr:
1446   case AArch64::FCSELSrrr:
1447   case AArch64::FCSELDrrr: {
1448     int Idx = Instr.findRegisterUseOperandIdx(AArch64::NZCV);
1449     assert(Idx >= 1);
1450     return static_cast<AArch64CC::CondCode>(Instr.getOperand(Idx - 1).getImm());
1451   }
1452   }
1453 }
1454 
static UsedNZCV getUsedNZCV(AArch64CC::CondCode CC) {
1456   assert(CC != AArch64CC::Invalid);
1457   UsedNZCV UsedFlags;
1458   switch (CC) {
1459   default:
1460     break;
1461 
1462   case AArch64CC::EQ: // Z set
1463   case AArch64CC::NE: // Z clear
1464     UsedFlags.Z = true;
1465     break;
1466 
1467   case AArch64CC::HI: // Z clear and C set
1468   case AArch64CC::LS: // Z set   or  C clear
1469     UsedFlags.Z = true;
1470     LLVM_FALLTHROUGH;
1471   case AArch64CC::HS: // C set
1472   case AArch64CC::LO: // C clear
1473     UsedFlags.C = true;
1474     break;
1475 
1476   case AArch64CC::MI: // N set
1477   case AArch64CC::PL: // N clear
1478     UsedFlags.N = true;
1479     break;
1480 
1481   case AArch64CC::VS: // V set
1482   case AArch64CC::VC: // V clear
1483     UsedFlags.V = true;
1484     break;
1485 
1486   case AArch64CC::GT: // Z clear, N and V the same
1487   case AArch64CC::LE: // Z set,   N and V differ
1488     UsedFlags.Z = true;
1489     LLVM_FALLTHROUGH;
1490   case AArch64CC::GE: // N and V the same
1491   case AArch64CC::LT: // N and V differ
1492     UsedFlags.N = true;
1493     UsedFlags.V = true;
1494     break;
1495   }
1496   return UsedFlags;
1497 }
1498 
static bool isADDSRegImm(unsigned Opcode) {
1500   return Opcode == AArch64::ADDSWri || Opcode == AArch64::ADDSXri;
1501 }
1502 
1503 static bool isSUBSRegImm(unsigned Opcode) {
1504   return Opcode == AArch64::SUBSWri || Opcode == AArch64::SUBSXri;
1505 }
1506 
1507 /// Check if CmpInstr can be substituted by MI.
1508 ///
1509 /// CmpInstr can be substituted:
1510 /// - CmpInstr is either 'ADDS %vreg, 0' or 'SUBS %vreg, 0'
1511 /// - and, MI and CmpInstr are from the same MachineBB
1512 /// - and, condition flags are not alive in successors of the CmpInstr parent
1513 /// - and, if MI opcode is the S form there must be no defs of flags between
1514 ///        MI and CmpInstr
1515 ///        or if MI opcode is not the S form there must be neither defs of flags
1516 ///        nor uses of flags between MI and CmpInstr.
1517 /// - and  C/V flags are not used after CmpInstr
1518 static bool canInstrSubstituteCmpInstr(MachineInstr *MI, MachineInstr *CmpInstr,
1519                                        const TargetRegisterInfo *TRI) {
1520   assert(MI);
1521   assert(sForm(*MI) != AArch64::INSTRUCTION_LIST_END);
1522   assert(CmpInstr);
1523 
1524   const unsigned CmpOpcode = CmpInstr->getOpcode();
1525   if (!isADDSRegImm(CmpOpcode) && !isSUBSRegImm(CmpOpcode))
1526     return false;
1527 
1528   if (MI->getParent() != CmpInstr->getParent())
1529     return false;
1530 
1531   if (areCFlagsAliveInSuccessors(CmpInstr->getParent()))
1532     return false;
1533 
1534   AccessKind AccessToCheck = AK_Write;
1535   if (sForm(*MI) != MI->getOpcode())
1536     AccessToCheck = AK_All;
1537   if (areCFlagsAccessedBetweenInstrs(MI, CmpInstr, TRI, AccessToCheck))
1538     return false;
1539 
1540   UsedNZCV NZCVUsedAfterCmp;
1541   for (const MachineInstr &Instr :
1542        instructionsWithoutDebug(std::next(CmpInstr->getIterator()),
1543                                 CmpInstr->getParent()->instr_end())) {
1544     if (Instr.readsRegister(AArch64::NZCV, TRI)) {
1545       AArch64CC::CondCode CC = findCondCodeUsedByInstr(Instr);
1546       if (CC == AArch64CC::Invalid) // Unsupported conditional instruction
1547         return false;
1548       NZCVUsedAfterCmp |= getUsedNZCV(CC);
1549     }
1550 
1551     if (Instr.modifiesRegister(AArch64::NZCV, TRI))
1552       break;
1553   }
1554 
1555   return !NZCVUsedAfterCmp.C && !NZCVUsedAfterCmp.V;
1556 }
1557 
1558 /// Substitute an instruction comparing to zero with another instruction
1559 /// which produces the needed condition flags.
1560 ///
1561 /// Return true on success.
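///
/// For example (an illustrative MIR-style sketch, not taken from a test):
///   %2 = SUBWrr %0, %1
///   %3 = SUBSWri %2, 0, 0, implicit-def $nzcv   ; CmpInstr, compare to zero
///   Bcc 1, %bb.2, implicit $nzcv                ; NE, reads only Z
/// can become
///   %2 = SUBSWrr %0, %1, implicit-def $nzcv     ; MI rewritten to its S form
///   Bcc 1, %bb.2, implicit $nzcv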
1562 bool AArch64InstrInfo::substituteCmpToZero(
1563     MachineInstr &CmpInstr, unsigned SrcReg,
1564     const MachineRegisterInfo *MRI) const {
1565   assert(MRI);
1566   // Get the unique definition of SrcReg.
1567   MachineInstr *MI = MRI->getUniqueVRegDef(SrcReg);
1568   if (!MI)
1569     return false;
1570 
1571   const TargetRegisterInfo *TRI = &getRegisterInfo();
1572 
1573   unsigned NewOpc = sForm(*MI);
1574   if (NewOpc == AArch64::INSTRUCTION_LIST_END)
1575     return false;
1576 
1577   if (!canInstrSubstituteCmpInstr(MI, &CmpInstr, TRI))
1578     return false;
1579 
1580   // Update the instruction to set NZCV.
1581   MI->setDesc(get(NewOpc));
1582   CmpInstr.eraseFromParent();
1583   bool succeeded = UpdateOperandRegClass(*MI);
1584   (void)succeeded;
1585   assert(succeeded && "Some operands reg class are incompatible!");
1586   MI->addRegisterDefined(AArch64::NZCV, TRI);
1587   return true;
1588 }
1589 
1590 bool AArch64InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
1591   if (MI.getOpcode() != TargetOpcode::LOAD_STACK_GUARD &&
1592       MI.getOpcode() != AArch64::CATCHRET)
1593     return false;
1594 
1595   MachineBasicBlock &MBB = *MI.getParent();
1596   auto &Subtarget = MBB.getParent()->getSubtarget<AArch64Subtarget>();
1597   auto TRI = Subtarget.getRegisterInfo();
1598   DebugLoc DL = MI.getDebugLoc();
1599 
1600   if (MI.getOpcode() == AArch64::CATCHRET) {
1601     // Skip to the first instruction before the epilog.
1602     const TargetInstrInfo *TII =
1603       MBB.getParent()->getSubtarget().getInstrInfo();
1604     MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB();
1605     auto MBBI = MachineBasicBlock::iterator(MI);
1606     MachineBasicBlock::iterator FirstEpilogSEH = std::prev(MBBI);
1607     while (FirstEpilogSEH->getFlag(MachineInstr::FrameDestroy) &&
1608            FirstEpilogSEH != MBB.begin())
1609       FirstEpilogSEH = std::prev(FirstEpilogSEH);
1610     if (FirstEpilogSEH != MBB.begin())
1611       FirstEpilogSEH = std::next(FirstEpilogSEH);
1612     BuildMI(MBB, FirstEpilogSEH, DL, TII->get(AArch64::ADRP))
1613         .addReg(AArch64::X0, RegState::Define)
1614         .addMBB(TargetMBB);
1615     BuildMI(MBB, FirstEpilogSEH, DL, TII->get(AArch64::ADDXri))
1616         .addReg(AArch64::X0, RegState::Define)
1617         .addReg(AArch64::X0)
1618         .addMBB(TargetMBB)
1619         .addImm(0);
1620     return true;
1621   }
1622 
1623   Register Reg = MI.getOperand(0).getReg();
1624   const GlobalValue *GV =
1625       cast<GlobalValue>((*MI.memoperands_begin())->getValue());
1626   const TargetMachine &TM = MBB.getParent()->getTarget();
1627   unsigned OpFlags = Subtarget.ClassifyGlobalReference(GV, TM);
1628   const unsigned char MO_NC = AArch64II::MO_NC;
1629 
1630   if ((OpFlags & AArch64II::MO_GOT) != 0) {
1631     BuildMI(MBB, MI, DL, get(AArch64::LOADgot), Reg)
1632         .addGlobalAddress(GV, 0, OpFlags);
1633     if (Subtarget.isTargetILP32()) {
1634       unsigned Reg32 = TRI->getSubReg(Reg, AArch64::sub_32);
1635       BuildMI(MBB, MI, DL, get(AArch64::LDRWui))
1636           .addDef(Reg32, RegState::Dead)
1637           .addUse(Reg, RegState::Kill)
1638           .addImm(0)
1639           .addMemOperand(*MI.memoperands_begin())
1640           .addDef(Reg, RegState::Implicit);
1641     } else {
1642       BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
1643           .addReg(Reg, RegState::Kill)
1644           .addImm(0)
1645           .addMemOperand(*MI.memoperands_begin());
1646     }
1647   } else if (TM.getCodeModel() == CodeModel::Large) {
1648     assert(!Subtarget.isTargetILP32() && "how can large exist in ILP32?");
1649     BuildMI(MBB, MI, DL, get(AArch64::MOVZXi), Reg)
1650         .addGlobalAddress(GV, 0, AArch64II::MO_G0 | MO_NC)
1651         .addImm(0);
1652     BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg)
1653         .addReg(Reg, RegState::Kill)
1654         .addGlobalAddress(GV, 0, AArch64II::MO_G1 | MO_NC)
1655         .addImm(16);
1656     BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg)
1657         .addReg(Reg, RegState::Kill)
1658         .addGlobalAddress(GV, 0, AArch64II::MO_G2 | MO_NC)
1659         .addImm(32);
1660     BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg)
1661         .addReg(Reg, RegState::Kill)
1662         .addGlobalAddress(GV, 0, AArch64II::MO_G3)
1663         .addImm(48);
1664     BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
1665         .addReg(Reg, RegState::Kill)
1666         .addImm(0)
1667         .addMemOperand(*MI.memoperands_begin());
1668   } else if (TM.getCodeModel() == CodeModel::Tiny) {
1669     BuildMI(MBB, MI, DL, get(AArch64::ADR), Reg)
1670         .addGlobalAddress(GV, 0, OpFlags);
1671   } else {
1672     BuildMI(MBB, MI, DL, get(AArch64::ADRP), Reg)
1673         .addGlobalAddress(GV, 0, OpFlags | AArch64II::MO_PAGE);
1674     unsigned char LoFlags = OpFlags | AArch64II::MO_PAGEOFF | MO_NC;
1675     if (Subtarget.isTargetILP32()) {
1676       unsigned Reg32 = TRI->getSubReg(Reg, AArch64::sub_32);
1677       BuildMI(MBB, MI, DL, get(AArch64::LDRWui))
1678           .addDef(Reg32, RegState::Dead)
1679           .addUse(Reg, RegState::Kill)
1680           .addGlobalAddress(GV, 0, LoFlags)
1681           .addMemOperand(*MI.memoperands_begin())
1682           .addDef(Reg, RegState::Implicit);
1683     } else {
1684       BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
1685           .addReg(Reg, RegState::Kill)
1686           .addGlobalAddress(GV, 0, LoFlags)
1687           .addMemOperand(*MI.memoperands_begin());
1688     }
1689   }
1690 
1691   MBB.erase(MI);
1692 
1693   return true;
1694 }
1695 
1696 // Return true if this instruction simply sets its single destination register
1697 // to zero. This is equivalent to a register rename of the zero-register.
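// For example, "movz w0, #0", "and w0, wzr, #imm", or a COPY from WZR all
// qualify (an illustrative summary of the cases handled below).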
1698 bool AArch64InstrInfo::isGPRZero(const MachineInstr &MI) {
1699   switch (MI.getOpcode()) {
1700   default:
1701     break;
1702   case AArch64::MOVZWi:
1703   case AArch64::MOVZXi: // movz Rd, #0 (LSL #0)
1704     if (MI.getOperand(1).isImm() && MI.getOperand(1).getImm() == 0) {
1705       assert(MI.getDesc().getNumOperands() == 3 &&
1706              MI.getOperand(2).getImm() == 0 && "invalid MOVZi operands");
1707       return true;
1708     }
1709     break;
1710   case AArch64::ANDWri: // and Rd, Rzr, #imm
1711     return MI.getOperand(1).getReg() == AArch64::WZR;
1712   case AArch64::ANDXri:
1713     return MI.getOperand(1).getReg() == AArch64::XZR;
1714   case TargetOpcode::COPY:
1715     return MI.getOperand(1).getReg() == AArch64::WZR;
1716   }
1717   return false;
1718 }
1719 
1720 // Return true if this instruction simply renames a general register without
1721 // modifying bits.
1722 bool AArch64InstrInfo::isGPRCopy(const MachineInstr &MI) {
1723   switch (MI.getOpcode()) {
1724   default:
1725     break;
1726   case TargetOpcode::COPY: {
1727     // GPR32 copies will be lowered to ORRXrs
1728     Register DstReg = MI.getOperand(0).getReg();
1729     return (AArch64::GPR32RegClass.contains(DstReg) ||
1730             AArch64::GPR64RegClass.contains(DstReg));
1731   }
1732   case AArch64::ORRXrs: // orr Xd, Xzr, Xm (LSL #0)
1733     if (MI.getOperand(1).getReg() == AArch64::XZR) {
1734       assert(MI.getDesc().getNumOperands() == 4 &&
1735              MI.getOperand(3).getImm() == 0 && "invalid ORRrs operands");
1736       return true;
1737     }
1738     break;
1739   case AArch64::ADDXri: // add Xd, Xn, #0 (LSL #0)
1740     if (MI.getOperand(2).getImm() == 0) {
1741       assert(MI.getDesc().getNumOperands() == 4 &&
1742              MI.getOperand(3).getImm() == 0 && "invalid ADDXri operands");
1743       return true;
1744     }
1745     break;
1746   }
1747   return false;
1748 }
1749 
1750 // Return true if this instruction simply renames a general register without
1751 // modifying bits.
1752 bool AArch64InstrInfo::isFPRCopy(const MachineInstr &MI) {
1753   switch (MI.getOpcode()) {
1754   default:
1755     break;
1756   case TargetOpcode::COPY: {
1757     // FPR64 copies will be lowered to ORR.16b
1758     Register DstReg = MI.getOperand(0).getReg();
1759     return (AArch64::FPR64RegClass.contains(DstReg) ||
1760             AArch64::FPR128RegClass.contains(DstReg));
1761   }
1762   case AArch64::ORRv16i8:
1763     if (MI.getOperand(1).getReg() == MI.getOperand(2).getReg()) {
1764       assert(MI.getDesc().getNumOperands() == 3 && MI.getOperand(0).isReg() &&
1765              "invalid ORRv16i8 operands");
1766       return true;
1767     }
1768     break;
1769   }
1770   return false;
1771 }
1772 
1773 unsigned AArch64InstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
1774                                                int &FrameIndex) const {
1775   switch (MI.getOpcode()) {
1776   default:
1777     break;
1778   case AArch64::LDRWui:
1779   case AArch64::LDRXui:
1780   case AArch64::LDRBui:
1781   case AArch64::LDRHui:
1782   case AArch64::LDRSui:
1783   case AArch64::LDRDui:
1784   case AArch64::LDRQui:
1785     if (MI.getOperand(0).getSubReg() == 0 && MI.getOperand(1).isFI() &&
1786         MI.getOperand(2).isImm() && MI.getOperand(2).getImm() == 0) {
1787       FrameIndex = MI.getOperand(1).getIndex();
1788       return MI.getOperand(0).getReg();
1789     }
1790     break;
1791   }
1792 
1793   return 0;
1794 }
1795 
1796 unsigned AArch64InstrInfo::isStoreToStackSlot(const MachineInstr &MI,
1797                                               int &FrameIndex) const {
1798   switch (MI.getOpcode()) {
1799   default:
1800     break;
1801   case AArch64::STRWui:
1802   case AArch64::STRXui:
1803   case AArch64::STRBui:
1804   case AArch64::STRHui:
1805   case AArch64::STRSui:
1806   case AArch64::STRDui:
1807   case AArch64::STRQui:
1808   case AArch64::LDR_PXI:
1809   case AArch64::STR_PXI:
1810     if (MI.getOperand(0).getSubReg() == 0 && MI.getOperand(1).isFI() &&
1811         MI.getOperand(2).isImm() && MI.getOperand(2).getImm() == 0) {
1812       FrameIndex = MI.getOperand(1).getIndex();
1813       return MI.getOperand(0).getReg();
1814     }
1815     break;
1816   }
1817   return 0;
1818 }
1819 
1820 /// Check all MachineMemOperands for a hint to suppress pairing.
1821 bool AArch64InstrInfo::isLdStPairSuppressed(const MachineInstr &MI) {
1822   return llvm::any_of(MI.memoperands(), [](MachineMemOperand *MMO) {
1823     return MMO->getFlags() & MOSuppressPair;
1824   });
1825 }
1826 
1827 /// Set a flag on the first MachineMemOperand to suppress pairing.
1828 void AArch64InstrInfo::suppressLdStPair(MachineInstr &MI) {
1829   if (MI.memoperands_empty())
1830     return;
1831   (*MI.memoperands_begin())->setFlags(MOSuppressPair);
1832 }
1833 
1834 /// Check all MachineMemOperands for a hint that the load/store is strided.
1835 bool AArch64InstrInfo::isStridedAccess(const MachineInstr &MI) {
1836   return llvm::any_of(MI.memoperands(), [](MachineMemOperand *MMO) {
1837     return MMO->getFlags() & MOStridedAccess;
1838   });
1839 }
1840 
1841 bool AArch64InstrInfo::isUnscaledLdSt(unsigned Opc) {
1842   switch (Opc) {
1843   default:
1844     return false;
1845   case AArch64::STURSi:
1846   case AArch64::STURDi:
1847   case AArch64::STURQi:
1848   case AArch64::STURBBi:
1849   case AArch64::STURHHi:
1850   case AArch64::STURWi:
1851   case AArch64::STURXi:
1852   case AArch64::LDURSi:
1853   case AArch64::LDURDi:
1854   case AArch64::LDURQi:
1855   case AArch64::LDURWi:
1856   case AArch64::LDURXi:
1857   case AArch64::LDURSWi:
1858   case AArch64::LDURHHi:
1859   case AArch64::LDURBBi:
1860   case AArch64::LDURSBWi:
1861   case AArch64::LDURSHWi:
1862     return true;
1863   }
1864 }
1865 
1866 Optional<unsigned> AArch64InstrInfo::getUnscaledLdSt(unsigned Opc) {
1867   switch (Opc) {
1868   default: return {};
1869   case AArch64::PRFMui: return AArch64::PRFUMi;
1870   case AArch64::LDRXui: return AArch64::LDURXi;
1871   case AArch64::LDRWui: return AArch64::LDURWi;
1872   case AArch64::LDRBui: return AArch64::LDURBi;
1873   case AArch64::LDRHui: return AArch64::LDURHi;
1874   case AArch64::LDRSui: return AArch64::LDURSi;
1875   case AArch64::LDRDui: return AArch64::LDURDi;
1876   case AArch64::LDRQui: return AArch64::LDURQi;
1877   case AArch64::LDRBBui: return AArch64::LDURBBi;
1878   case AArch64::LDRHHui: return AArch64::LDURHHi;
1879   case AArch64::LDRSBXui: return AArch64::LDURSBXi;
1880   case AArch64::LDRSBWui: return AArch64::LDURSBWi;
1881   case AArch64::LDRSHXui: return AArch64::LDURSHXi;
1882   case AArch64::LDRSHWui: return AArch64::LDURSHWi;
1883   case AArch64::LDRSWui: return AArch64::LDURSWi;
1884   case AArch64::STRXui: return AArch64::STURXi;
1885   case AArch64::STRWui: return AArch64::STURWi;
1886   case AArch64::STRBui: return AArch64::STURBi;
1887   case AArch64::STRHui: return AArch64::STURHi;
1888   case AArch64::STRSui: return AArch64::STURSi;
1889   case AArch64::STRDui: return AArch64::STURDi;
1890   case AArch64::STRQui: return AArch64::STURQi;
1891   case AArch64::STRBBui: return AArch64::STURBBi;
1892   case AArch64::STRHHui: return AArch64::STURHHi;
1893   }
1894 }
1895 
1896 unsigned AArch64InstrInfo::getLoadStoreImmIdx(unsigned Opc) {
1897   switch (Opc) {
1898   default:
1899     return 2;
1900   case AArch64::LDPXi:
1901   case AArch64::LDPDi:
1902   case AArch64::STPXi:
1903   case AArch64::STPDi:
1904   case AArch64::LDNPXi:
1905   case AArch64::LDNPDi:
1906   case AArch64::STNPXi:
1907   case AArch64::STNPDi:
1908   case AArch64::LDPQi:
1909   case AArch64::STPQi:
1910   case AArch64::LDNPQi:
1911   case AArch64::STNPQi:
1912   case AArch64::LDPWi:
1913   case AArch64::LDPSi:
1914   case AArch64::STPWi:
1915   case AArch64::STPSi:
1916   case AArch64::LDNPWi:
1917   case AArch64::LDNPSi:
1918   case AArch64::STNPWi:
1919   case AArch64::STNPSi:
1920   case AArch64::LDG:
1921   case AArch64::STGPi:
1922   case AArch64::LD1B_IMM:
1923   case AArch64::LD1H_IMM:
1924   case AArch64::LD1W_IMM:
1925   case AArch64::LD1D_IMM:
1926   case AArch64::ST1B_IMM:
1927   case AArch64::ST1H_IMM:
1928   case AArch64::ST1W_IMM:
1929   case AArch64::ST1D_IMM:
1930   case AArch64::LD1B_H_IMM:
1931   case AArch64::LD1SB_H_IMM:
1932   case AArch64::LD1H_S_IMM:
1933   case AArch64::LD1SH_S_IMM:
1934   case AArch64::LD1W_D_IMM:
1935   case AArch64::LD1SW_D_IMM:
1936   case AArch64::ST1B_H_IMM:
1937   case AArch64::ST1H_S_IMM:
1938   case AArch64::ST1W_D_IMM:
1939   case AArch64::LD1B_S_IMM:
1940   case AArch64::LD1SB_S_IMM:
1941   case AArch64::LD1H_D_IMM:
1942   case AArch64::LD1SH_D_IMM:
1943   case AArch64::ST1B_S_IMM:
1944   case AArch64::ST1H_D_IMM:
1945   case AArch64::LD1B_D_IMM:
1946   case AArch64::LD1SB_D_IMM:
1947   case AArch64::ST1B_D_IMM:
1948     return 3;
1949   case AArch64::ADDG:
1950   case AArch64::STGOffset:
1951   case AArch64::LDR_PXI:
1952   case AArch64::STR_PXI:
1953     return 2;
1954   }
1955 }
1956 
1957 bool AArch64InstrInfo::isPairableLdStInst(const MachineInstr &MI) {
1958   switch (MI.getOpcode()) {
1959   default:
1960     return false;
1961   // Scaled instructions.
1962   case AArch64::STRSui:
1963   case AArch64::STRDui:
1964   case AArch64::STRQui:
1965   case AArch64::STRXui:
1966   case AArch64::STRWui:
1967   case AArch64::LDRSui:
1968   case AArch64::LDRDui:
1969   case AArch64::LDRQui:
1970   case AArch64::LDRXui:
1971   case AArch64::LDRWui:
1972   case AArch64::LDRSWui:
1973   // Unscaled instructions.
1974   case AArch64::STURSi:
1975   case AArch64::STURDi:
1976   case AArch64::STURQi:
1977   case AArch64::STURWi:
1978   case AArch64::STURXi:
1979   case AArch64::LDURSi:
1980   case AArch64::LDURDi:
1981   case AArch64::LDURQi:
1982   case AArch64::LDURWi:
1983   case AArch64::LDURXi:
1984   case AArch64::LDURSWi:
1985     return true;
1986   }
1987 }
1988 
1989 unsigned AArch64InstrInfo::convertToFlagSettingOpc(unsigned Opc,
1990                                                    bool &Is64Bit) {
1991   switch (Opc) {
1992   default:
1993     llvm_unreachable("Opcode has no flag setting equivalent!");
1994   // 32-bit cases:
1995   case AArch64::ADDWri:
1996     Is64Bit = false;
1997     return AArch64::ADDSWri;
1998   case AArch64::ADDWrr:
1999     Is64Bit = false;
2000     return AArch64::ADDSWrr;
2001   case AArch64::ADDWrs:
2002     Is64Bit = false;
2003     return AArch64::ADDSWrs;
2004   case AArch64::ADDWrx:
2005     Is64Bit = false;
2006     return AArch64::ADDSWrx;
2007   case AArch64::ANDWri:
2008     Is64Bit = false;
2009     return AArch64::ANDSWri;
2010   case AArch64::ANDWrr:
2011     Is64Bit = false;
2012     return AArch64::ANDSWrr;
2013   case AArch64::ANDWrs:
2014     Is64Bit = false;
2015     return AArch64::ANDSWrs;
2016   case AArch64::BICWrr:
2017     Is64Bit = false;
2018     return AArch64::BICSWrr;
2019   case AArch64::BICWrs:
2020     Is64Bit = false;
2021     return AArch64::BICSWrs;
2022   case AArch64::SUBWri:
2023     Is64Bit = false;
2024     return AArch64::SUBSWri;
2025   case AArch64::SUBWrr:
2026     Is64Bit = false;
2027     return AArch64::SUBSWrr;
2028   case AArch64::SUBWrs:
2029     Is64Bit = false;
2030     return AArch64::SUBSWrs;
2031   case AArch64::SUBWrx:
2032     Is64Bit = false;
2033     return AArch64::SUBSWrx;
2034   // 64-bit cases:
2035   case AArch64::ADDXri:
2036     Is64Bit = true;
2037     return AArch64::ADDSXri;
2038   case AArch64::ADDXrr:
2039     Is64Bit = true;
2040     return AArch64::ADDSXrr;
2041   case AArch64::ADDXrs:
2042     Is64Bit = true;
2043     return AArch64::ADDSXrs;
2044   case AArch64::ADDXrx:
2045     Is64Bit = true;
2046     return AArch64::ADDSXrx;
2047   case AArch64::ANDXri:
2048     Is64Bit = true;
2049     return AArch64::ANDSXri;
2050   case AArch64::ANDXrr:
2051     Is64Bit = true;
2052     return AArch64::ANDSXrr;
2053   case AArch64::ANDXrs:
2054     Is64Bit = true;
2055     return AArch64::ANDSXrs;
2056   case AArch64::BICXrr:
2057     Is64Bit = true;
2058     return AArch64::BICSXrr;
2059   case AArch64::BICXrs:
2060     Is64Bit = true;
2061     return AArch64::BICSXrs;
2062   case AArch64::SUBXri:
2063     Is64Bit = true;
2064     return AArch64::SUBSXri;
2065   case AArch64::SUBXrr:
2066     Is64Bit = true;
2067     return AArch64::SUBSXrr;
2068   case AArch64::SUBXrs:
2069     Is64Bit = true;
2070     return AArch64::SUBSXrs;
2071   case AArch64::SUBXrx:
2072     Is64Bit = true;
2073     return AArch64::SUBSXrx;
2074   }
2075 }
2076 
2077 // Is this a candidate for ld/st merging or pairing?  For example, we don't
2078 // touch volatiles or load/stores that have a hint to avoid pair formation.
2079 bool AArch64InstrInfo::isCandidateToMergeOrPair(const MachineInstr &MI) const {
2080   // If this is a volatile load/store, don't mess with it.
2081   if (MI.hasOrderedMemoryRef())
2082     return false;
2083 
2084   // Make sure this is a reg/fi+imm (as opposed to an address reloc).
2085   assert((MI.getOperand(1).isReg() || MI.getOperand(1).isFI()) &&
2086          "Expected a reg or frame index operand.");
2087   if (!MI.getOperand(2).isImm())
2088     return false;
2089 
2090   // Can't merge/pair if the instruction modifies the base register.
2091   // e.g., ldr x0, [x0]
2092   // This case will never occur with an FI base.
2093   if (MI.getOperand(1).isReg()) {
2094     Register BaseReg = MI.getOperand(1).getReg();
2095     const TargetRegisterInfo *TRI = &getRegisterInfo();
2096     if (MI.modifiesRegister(BaseReg, TRI))
2097       return false;
2098   }
2099 
2100   // Check if this load/store has a hint to avoid pair formation.
2101   // MachineMemOperands hints are set by the AArch64StorePairSuppress pass.
2102   if (isLdStPairSuppressed(MI))
2103     return false;
2104 
2105   // Do not pair any callee-save store/reload instructions in the
2106   // prologue/epilogue if the CFI information encoded the operations as separate
2107   // instructions, as that will cause the size of the actual prologue to mismatch
2108   // with the prologue size recorded in the Windows CFI.
2109   const MCAsmInfo *MAI = MI.getMF()->getTarget().getMCAsmInfo();
2110   bool NeedsWinCFI = MAI->usesWindowsCFI() &&
2111                      MI.getMF()->getFunction().needsUnwindTableEntry();
2112   if (NeedsWinCFI && (MI.getFlag(MachineInstr::FrameSetup) ||
2113                       MI.getFlag(MachineInstr::FrameDestroy)))
2114     return false;
2115 
2116   // On some CPUs quad load/store pairs are slower than two single load/stores.
2117   if (Subtarget.isPaired128Slow()) {
2118     switch (MI.getOpcode()) {
2119     default:
2120       break;
2121     case AArch64::LDURQi:
2122     case AArch64::STURQi:
2123     case AArch64::LDRQui:
2124     case AArch64::STRQui:
2125       return false;
2126     }
2127   }
2128 
2129   return true;
2130 }
2131 
2132 bool AArch64InstrInfo::getMemOperandsWithOffsetWidth(
2133     const MachineInstr &LdSt, SmallVectorImpl<const MachineOperand *> &BaseOps,
2134     int64_t &Offset, bool &OffsetIsScalable, unsigned &Width,
2135     const TargetRegisterInfo *TRI) const {
2136   if (!LdSt.mayLoadOrStore())
2137     return false;
2138 
2139   const MachineOperand *BaseOp;
2140   if (!getMemOperandWithOffsetWidth(LdSt, BaseOp, Offset, OffsetIsScalable,
2141                                     Width, TRI))
2142     return false;
2143   BaseOps.push_back(BaseOp);
2144   return true;
2145 }
2146 
2147 Optional<ExtAddrMode>
2148 AArch64InstrInfo::getAddrModeFromMemoryOp(const MachineInstr &MemI,
2149                                           const TargetRegisterInfo *TRI) const {
2150   const MachineOperand *Base; // Filled with the base operand of MI.
2151   int64_t Offset;             // Filled with the offset of MI.
2152   bool OffsetIsScalable;
2153   if (!getMemOperandWithOffset(MemI, Base, Offset, OffsetIsScalable, TRI))
2154     return None;
2155 
2156   if (!Base->isReg())
2157     return None;
2158   ExtAddrMode AM;
2159   AM.BaseReg = Base->getReg();
2160   AM.Displacement = Offset;
2161   AM.ScaledReg = 0;
2162   return AM;
2163 }
2164 
2165 bool AArch64InstrInfo::getMemOperandWithOffsetWidth(
2166     const MachineInstr &LdSt, const MachineOperand *&BaseOp, int64_t &Offset,
2167     bool &OffsetIsScalable, unsigned &Width,
2168     const TargetRegisterInfo *TRI) const {
2169   assert(LdSt.mayLoadOrStore() && "Expected a memory operation.");
2170   // Handle only loads/stores with base register followed by immediate offset.
2171   if (LdSt.getNumExplicitOperands() == 3) {
2172     // Non-paired instruction (e.g., ldr x1, [x0, #8]).
2173     if ((!LdSt.getOperand(1).isReg() && !LdSt.getOperand(1).isFI()) ||
2174         !LdSt.getOperand(2).isImm())
2175       return false;
2176   } else if (LdSt.getNumExplicitOperands() == 4) {
2177     // Paired instruction (e.g., ldp x1, x2, [x0, #8]).
2178     if (!LdSt.getOperand(1).isReg() ||
2179         (!LdSt.getOperand(2).isReg() && !LdSt.getOperand(2).isFI()) ||
2180         !LdSt.getOperand(3).isImm())
2181       return false;
2182   } else
2183     return false;
2184 
2185   // Get the scaling factor for the instruction and set the width for the
2186   // instruction.
2187   TypeSize Scale(0U, false);
2188   int64_t Dummy1, Dummy2;
2189 
2190   // If this returns false, then it's an instruction we don't want to handle.
2191   if (!getMemOpInfo(LdSt.getOpcode(), Scale, Width, Dummy1, Dummy2))
2192     return false;
2193 
2194   // Compute the offset. Offset is calculated as the immediate operand
2195   // multiplied by the scaling factor. Unscaled instructions have scaling factor
2196   // set to 1.
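  // For example (illustrative), "ldr x1, [x0, #16]" is an LDRXui whose
  // immediate operand is 2; with Scale == 8 the reported Offset is 16 bytes.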
2197   if (LdSt.getNumExplicitOperands() == 3) {
2198     BaseOp = &LdSt.getOperand(1);
2199     Offset = LdSt.getOperand(2).getImm() * Scale.getKnownMinSize();
2200   } else {
2201     assert(LdSt.getNumExplicitOperands() == 4 && "invalid number of operands");
2202     BaseOp = &LdSt.getOperand(2);
2203     Offset = LdSt.getOperand(3).getImm() * Scale.getKnownMinSize();
2204   }
2205   OffsetIsScalable = Scale.isScalable();
2206 
2207   if (!BaseOp->isReg() && !BaseOp->isFI())
2208     return false;
2209 
2210   return true;
2211 }
2212 
2213 MachineOperand &
2214 AArch64InstrInfo::getMemOpBaseRegImmOfsOffsetOperand(MachineInstr &LdSt) const {
2215   assert(LdSt.mayLoadOrStore() && "Expected a memory operation.");
2216   MachineOperand &OfsOp = LdSt.getOperand(LdSt.getNumExplicitOperands() - 1);
2217   assert(OfsOp.isImm() && "Offset operand wasn't immediate.");
2218   return OfsOp;
2219 }
2220 
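// For example (illustrative), AArch64::LDRXui is reported below with
// Scale = 8, Width = 8 and an immediate range of [0, 4095], i.e. byte
// offsets 0..32760 in multiples of 8.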
2221 bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, TypeSize &Scale,
2222                                     unsigned &Width, int64_t &MinOffset,
2223                                     int64_t &MaxOffset) {
2224   const unsigned SVEMaxBytesPerVector = AArch64::SVEMaxBitsPerVector / 8;
2225   switch (Opcode) {
2226   // Not a memory operation or something we want to handle.
2227   default:
2228     Scale = TypeSize::Fixed(0);
2229     Width = 0;
2230     MinOffset = MaxOffset = 0;
2231     return false;
2232   case AArch64::STRWpost:
2233   case AArch64::LDRWpost:
2234     Width = 32;
2235     Scale = TypeSize::Fixed(4);
2236     MinOffset = -256;
2237     MaxOffset = 255;
2238     break;
2239   case AArch64::LDURQi:
2240   case AArch64::STURQi:
2241     Width = 16;
2242     Scale = TypeSize::Fixed(1);
2243     MinOffset = -256;
2244     MaxOffset = 255;
2245     break;
2246   case AArch64::PRFUMi:
2247   case AArch64::LDURXi:
2248   case AArch64::LDURDi:
2249   case AArch64::STURXi:
2250   case AArch64::STURDi:
2251     Width = 8;
2252     Scale = TypeSize::Fixed(1);
2253     MinOffset = -256;
2254     MaxOffset = 255;
2255     break;
2256   case AArch64::LDURWi:
2257   case AArch64::LDURSi:
2258   case AArch64::LDURSWi:
2259   case AArch64::STURWi:
2260   case AArch64::STURSi:
2261     Width = 4;
2262     Scale = TypeSize::Fixed(1);
2263     MinOffset = -256;
2264     MaxOffset = 255;
2265     break;
2266   case AArch64::LDURHi:
2267   case AArch64::LDURHHi:
2268   case AArch64::LDURSHXi:
2269   case AArch64::LDURSHWi:
2270   case AArch64::STURHi:
2271   case AArch64::STURHHi:
2272     Width = 2;
2273     Scale = TypeSize::Fixed(1);
2274     MinOffset = -256;
2275     MaxOffset = 255;
2276     break;
2277   case AArch64::LDURBi:
2278   case AArch64::LDURBBi:
2279   case AArch64::LDURSBXi:
2280   case AArch64::LDURSBWi:
2281   case AArch64::STURBi:
2282   case AArch64::STURBBi:
2283     Width = 1;
2284     Scale = TypeSize::Fixed(1);
2285     MinOffset = -256;
2286     MaxOffset = 255;
2287     break;
2288   case AArch64::LDPQi:
2289   case AArch64::LDNPQi:
2290   case AArch64::STPQi:
2291   case AArch64::STNPQi:
2292     Scale = TypeSize::Fixed(16);
2293     Width = 32;
2294     MinOffset = -64;
2295     MaxOffset = 63;
2296     break;
2297   case AArch64::LDRQui:
2298   case AArch64::STRQui:
2299     Scale = TypeSize::Fixed(16);
2300     Width = 16;
2301     MinOffset = 0;
2302     MaxOffset = 4095;
2303     break;
2304   case AArch64::LDPXi:
2305   case AArch64::LDPDi:
2306   case AArch64::LDNPXi:
2307   case AArch64::LDNPDi:
2308   case AArch64::STPXi:
2309   case AArch64::STPDi:
2310   case AArch64::STNPXi:
2311   case AArch64::STNPDi:
2312     Scale = TypeSize::Fixed(8);
2313     Width = 16;
2314     MinOffset = -64;
2315     MaxOffset = 63;
2316     break;
2317   case AArch64::PRFMui:
2318   case AArch64::LDRXui:
2319   case AArch64::LDRDui:
2320   case AArch64::STRXui:
2321   case AArch64::STRDui:
2322     Scale = TypeSize::Fixed(8);
2323     Width = 8;
2324     MinOffset = 0;
2325     MaxOffset = 4095;
2326     break;
2327   case AArch64::LDPWi:
2328   case AArch64::LDPSi:
2329   case AArch64::LDNPWi:
2330   case AArch64::LDNPSi:
2331   case AArch64::STPWi:
2332   case AArch64::STPSi:
2333   case AArch64::STNPWi:
2334   case AArch64::STNPSi:
2335     Scale = TypeSize::Fixed(4);
2336     Width = 8;
2337     MinOffset = -64;
2338     MaxOffset = 63;
2339     break;
2340   case AArch64::LDRWui:
2341   case AArch64::LDRSui:
2342   case AArch64::LDRSWui:
2343   case AArch64::STRWui:
2344   case AArch64::STRSui:
2345     Scale = TypeSize::Fixed(4);
2346     Width = 4;
2347     MinOffset = 0;
2348     MaxOffset = 4095;
2349     break;
2350   case AArch64::LDRHui:
2351   case AArch64::LDRHHui:
2352   case AArch64::LDRSHWui:
2353   case AArch64::LDRSHXui:
2354   case AArch64::STRHui:
2355   case AArch64::STRHHui:
2356     Scale = TypeSize::Fixed(2);
2357     Width = 2;
2358     MinOffset = 0;
2359     MaxOffset = 4095;
2360     break;
2361   case AArch64::LDRBui:
2362   case AArch64::LDRBBui:
2363   case AArch64::LDRSBWui:
2364   case AArch64::LDRSBXui:
2365   case AArch64::STRBui:
2366   case AArch64::STRBBui:
2367     Scale = TypeSize::Fixed(1);
2368     Width = 1;
2369     MinOffset = 0;
2370     MaxOffset = 4095;
2371     break;
2372   case AArch64::ADDG:
2373     Scale = TypeSize::Fixed(16);
2374     Width = 0;
2375     MinOffset = 0;
2376     MaxOffset = 63;
2377     break;
2378   case AArch64::TAGPstack:
2379     Scale = TypeSize::Fixed(16);
2380     Width = 0;
2381     // TAGP with a negative offset turns into SUBP, which has a maximum offset
2382     // of 63 (not 64!).
2383     MinOffset = -63;
2384     MaxOffset = 63;
2385     break;
2386   case AArch64::LDG:
2387   case AArch64::STGOffset:
2388   case AArch64::STZGOffset:
2389     Scale = TypeSize::Fixed(16);
2390     Width = 16;
2391     MinOffset = -256;
2392     MaxOffset = 255;
2393     break;
2394   case AArch64::STR_ZZZZXI:
2395   case AArch64::LDR_ZZZZXI:
2396     Scale = TypeSize::Scalable(16);
2397     Width = SVEMaxBytesPerVector * 4;
2398     MinOffset = -256;
2399     MaxOffset = 252;
2400     break;
2401   case AArch64::STR_ZZZXI:
2402   case AArch64::LDR_ZZZXI:
2403     Scale = TypeSize::Scalable(16);
2404     Width = SVEMaxBytesPerVector * 3;
2405     MinOffset = -256;
2406     MaxOffset = 253;
2407     break;
2408   case AArch64::STR_ZZXI:
2409   case AArch64::LDR_ZZXI:
2410     Scale = TypeSize::Scalable(16);
2411     Width = SVEMaxBytesPerVector * 2;
2412     MinOffset = -256;
2413     MaxOffset = 254;
2414     break;
2415   case AArch64::LDR_PXI:
2416   case AArch64::STR_PXI:
2417     Scale = TypeSize::Scalable(2);
2418     Width = SVEMaxBytesPerVector / 8;
2419     MinOffset = -256;
2420     MaxOffset = 255;
2421     break;
2422   case AArch64::LDR_ZXI:
2423   case AArch64::STR_ZXI:
2424     Scale = TypeSize::Scalable(16);
2425     Width = SVEMaxBytesPerVector;
2426     MinOffset = -256;
2427     MaxOffset = 255;
2428     break;
2429   case AArch64::LD1B_IMM:
2430   case AArch64::LD1H_IMM:
2431   case AArch64::LD1W_IMM:
2432   case AArch64::LD1D_IMM:
2433   case AArch64::ST1B_IMM:
2434   case AArch64::ST1H_IMM:
2435   case AArch64::ST1W_IMM:
2436   case AArch64::ST1D_IMM:
2437     // A full vector's worth of data
2438     // Width = mbytes * elements
2439     Scale = TypeSize::Scalable(16);
2440     Width = SVEMaxBytesPerVector;
2441     MinOffset = -8;
2442     MaxOffset = 7;
2443     break;
2444   case AArch64::LD1B_H_IMM:
2445   case AArch64::LD1SB_H_IMM:
2446   case AArch64::LD1H_S_IMM:
2447   case AArch64::LD1SH_S_IMM:
2448   case AArch64::LD1W_D_IMM:
2449   case AArch64::LD1SW_D_IMM:
2450   case AArch64::ST1B_H_IMM:
2451   case AArch64::ST1H_S_IMM:
2452   case AArch64::ST1W_D_IMM:
2453     // A half vector's worth of data
2454     // Width = mbytes * elements
2455     Scale = TypeSize::Scalable(8);
2456     Width = SVEMaxBytesPerVector / 2;
2457     MinOffset = -8;
2458     MaxOffset = 7;
2459     break;
2460   case AArch64::LD1B_S_IMM:
2461   case AArch64::LD1SB_S_IMM:
2462   case AArch64::LD1H_D_IMM:
2463   case AArch64::LD1SH_D_IMM:
2464   case AArch64::ST1B_S_IMM:
2465   case AArch64::ST1H_D_IMM:
2466     // A quarter vector's worth of data
2467     // Width = mbytes * elements
2468     Scale = TypeSize::Scalable(4);
2469     Width = SVEMaxBytesPerVector / 4;
2470     MinOffset = -8;
2471     MaxOffset = 7;
2472     break;
2473   case AArch64::LD1B_D_IMM:
2474   case AArch64::LD1SB_D_IMM:
2475   case AArch64::ST1B_D_IMM:
2476     // An eighth vector's worth of data
2477     // Width = mbytes * elements
2478     Scale = TypeSize::Scalable(2);
2479     Width = SVEMaxBytesPerVector / 8;
2480     MinOffset = -8;
2481     MaxOffset = 7;
2482     break;
2483   case AArch64::ST2GOffset:
2484   case AArch64::STZ2GOffset:
2485     Scale = TypeSize::Fixed(16);
2486     Width = 32;
2487     MinOffset = -256;
2488     MaxOffset = 255;
2489     break;
2490   case AArch64::STGPi:
2491     Scale = TypeSize::Fixed(16);
2492     Width = 16;
2493     MinOffset = -64;
2494     MaxOffset = 63;
2495     break;
2496   }
2497 
2498   return true;
2499 }
2500 
2501 // Scaling factor for unscaled load or store.
2502 int AArch64InstrInfo::getMemScale(unsigned Opc) {
2503   switch (Opc) {
2504   default:
2505     llvm_unreachable("Opcode has unknown scale!");
2506   case AArch64::LDRBBui:
2507   case AArch64::LDURBBi:
2508   case AArch64::LDRSBWui:
2509   case AArch64::LDURSBWi:
2510   case AArch64::STRBBui:
2511   case AArch64::STURBBi:
2512     return 1;
2513   case AArch64::LDRHHui:
2514   case AArch64::LDURHHi:
2515   case AArch64::LDRSHWui:
2516   case AArch64::LDURSHWi:
2517   case AArch64::STRHHui:
2518   case AArch64::STURHHi:
2519     return 2;
2520   case AArch64::LDRSui:
2521   case AArch64::LDURSi:
2522   case AArch64::LDRSWui:
2523   case AArch64::LDURSWi:
2524   case AArch64::LDRWui:
2525   case AArch64::LDURWi:
2526   case AArch64::STRSui:
2527   case AArch64::STURSi:
2528   case AArch64::STRWui:
2529   case AArch64::STURWi:
2530   case AArch64::LDPSi:
2531   case AArch64::LDPSWi:
2532   case AArch64::LDPWi:
2533   case AArch64::STPSi:
2534   case AArch64::STPWi:
2535     return 4;
2536   case AArch64::LDRDui:
2537   case AArch64::LDURDi:
2538   case AArch64::LDRXui:
2539   case AArch64::LDURXi:
2540   case AArch64::STRDui:
2541   case AArch64::STURDi:
2542   case AArch64::STRXui:
2543   case AArch64::STURXi:
2544   case AArch64::LDPDi:
2545   case AArch64::LDPXi:
2546   case AArch64::STPDi:
2547   case AArch64::STPXi:
2548     return 8;
2549   case AArch64::LDRQui:
2550   case AArch64::LDURQi:
2551   case AArch64::STRQui:
2552   case AArch64::STURQi:
2553   case AArch64::LDPQi:
2554   case AArch64::STPQi:
2555   case AArch64::STGOffset:
2556   case AArch64::STZGOffset:
2557   case AArch64::ST2GOffset:
2558   case AArch64::STZ2GOffset:
2559   case AArch64::STGPi:
2560     return 16;
2561   }
2562 }
2563 
2564 // Scale the unscaled offsets.  Returns false if the unscaled offset can't be
2565 // scaled.
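// For example, an LDURXi byte offset of 16 scales to an element offset of 2
// (stride 8), whereas a byte offset of 12 is not a multiple of the stride and
// is rejected (illustrative of the logic below).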
2566 static bool scaleOffset(unsigned Opc, int64_t &Offset) {
2567   int Scale = AArch64InstrInfo::getMemScale(Opc);
2568 
2569   // If the byte-offset isn't a multiple of the stride, we can't scale this
2570   // offset.
2571   if (Offset % Scale != 0)
2572     return false;
2573 
2574   // Convert the byte-offset used by unscaled into an "element" offset used
2575   // by the scaled pair load/store instructions.
2576   Offset /= Scale;
2577   return true;
2578 }
2579 
2580 static bool canPairLdStOpc(unsigned FirstOpc, unsigned SecondOpc) {
2581   if (FirstOpc == SecondOpc)
2582     return true;
2583   // We can also pair sign-ext and zero-ext instructions.
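  // For example, an LDRWui may pair with an LDRSWui of the adjacent word
  // (illustrative of the cases below).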
2584   switch (FirstOpc) {
2585   default:
2586     return false;
2587   case AArch64::LDRWui:
2588   case AArch64::LDURWi:
2589     return SecondOpc == AArch64::LDRSWui || SecondOpc == AArch64::LDURSWi;
2590   case AArch64::LDRSWui:
2591   case AArch64::LDURSWi:
2592     return SecondOpc == AArch64::LDRWui || SecondOpc == AArch64::LDURWi;
2593   }
2594   // These instructions can't be paired based on their opcodes.
2595   return false;
2596 }
2597 
2598 static bool shouldClusterFI(const MachineFrameInfo &MFI, int FI1,
2599                             int64_t Offset1, unsigned Opcode1, int FI2,
2600                             int64_t Offset2, unsigned Opcode2) {
2601   // Accesses through fixed stack object frame indices may access a different
2602   // fixed stack slot. Check that the object offsets + offsets match.
2603   if (MFI.isFixedObjectIndex(FI1) && MFI.isFixedObjectIndex(FI2)) {
2604     int64_t ObjectOffset1 = MFI.getObjectOffset(FI1);
2605     int64_t ObjectOffset2 = MFI.getObjectOffset(FI2);
2606     assert(ObjectOffset1 <= ObjectOffset2 && "Object offsets are not ordered.");
2607     // Convert to scaled object offsets.
2608     int Scale1 = AArch64InstrInfo::getMemScale(Opcode1);
2609     if (ObjectOffset1 % Scale1 != 0)
2610       return false;
2611     ObjectOffset1 /= Scale1;
2612     int Scale2 = AArch64InstrInfo::getMemScale(Opcode2);
2613     if (ObjectOffset2 % Scale2 != 0)
2614       return false;
2615     ObjectOffset2 /= Scale2;
2616     ObjectOffset1 += Offset1;
2617     ObjectOffset2 += Offset2;
2618     return ObjectOffset1 + 1 == ObjectOffset2;
2619   }
2620 
2621   return FI1 == FI2;
2622 }
2623 
2624 /// Detect opportunities for ldp/stp formation.
2625 ///
2626 /// Only called for LdSt for which getMemOperandWithOffset returns true.
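///
/// For example (illustrative), "ldr x0, [sp, #8]" and "ldr x1, [sp, #16]"
/// have consecutive scaled offsets (1 and 2), so they may be clustered and
/// later combined into "ldp x0, x1, [sp, #8]" by the load/store optimizer.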
2627 bool AArch64InstrInfo::shouldClusterMemOps(
2628     ArrayRef<const MachineOperand *> BaseOps1,
2629     ArrayRef<const MachineOperand *> BaseOps2, unsigned NumLoads,
2630     unsigned NumBytes) const {
2631   assert(BaseOps1.size() == 1 && BaseOps2.size() == 1);
2632   const MachineOperand &BaseOp1 = *BaseOps1.front();
2633   const MachineOperand &BaseOp2 = *BaseOps2.front();
2634   const MachineInstr &FirstLdSt = *BaseOp1.getParent();
2635   const MachineInstr &SecondLdSt = *BaseOp2.getParent();
2636   if (BaseOp1.getType() != BaseOp2.getType())
2637     return false;
2638 
2639   assert((BaseOp1.isReg() || BaseOp1.isFI()) &&
2640          "Only base registers and frame indices are supported.");
2641 
2642   // Check for both base regs and base FI.
2643   if (BaseOp1.isReg() && BaseOp1.getReg() != BaseOp2.getReg())
2644     return false;
2645 
2646   // Only cluster up to a single pair.
2647   if (NumLoads > 2)
2648     return false;
2649 
2650   if (!isPairableLdStInst(FirstLdSt) || !isPairableLdStInst(SecondLdSt))
2651     return false;
2652 
2653   // Can we pair these instructions based on their opcodes?
2654   unsigned FirstOpc = FirstLdSt.getOpcode();
2655   unsigned SecondOpc = SecondLdSt.getOpcode();
2656   if (!canPairLdStOpc(FirstOpc, SecondOpc))
2657     return false;
2658 
2659   // Can't merge volatiles or load/stores that have a hint to avoid pair
2660   // formation, for example.
2661   if (!isCandidateToMergeOrPair(FirstLdSt) ||
2662       !isCandidateToMergeOrPair(SecondLdSt))
2663     return false;
2664 
2665   // isCandidateToMergeOrPair guarantees that operand 2 is an immediate.
2666   int64_t Offset1 = FirstLdSt.getOperand(2).getImm();
2667   if (isUnscaledLdSt(FirstOpc) && !scaleOffset(FirstOpc, Offset1))
2668     return false;
2669 
2670   int64_t Offset2 = SecondLdSt.getOperand(2).getImm();
2671   if (isUnscaledLdSt(SecondOpc) && !scaleOffset(SecondOpc, Offset2))
2672     return false;
2673 
2674   // Pairwise instructions have a 7-bit signed offset field.
2675   if (Offset1 > 63 || Offset1 < -64)
2676     return false;
2677 
2678   // The caller should already have ordered First/SecondLdSt by offset.
2679   // Note: except for non-equal frame index bases
2680   if (BaseOp1.isFI()) {
2681     assert((!BaseOp1.isIdenticalTo(BaseOp2) || Offset1 <= Offset2) &&
2682            "Caller should have ordered offsets.");
2683 
2684     const MachineFrameInfo &MFI =
2685         FirstLdSt.getParent()->getParent()->getFrameInfo();
2686     return shouldClusterFI(MFI, BaseOp1.getIndex(), Offset1, FirstOpc,
2687                            BaseOp2.getIndex(), Offset2, SecondOpc);
2688   }
2689 
2690   assert(Offset1 <= Offset2 && "Caller should have ordered offsets.");
2691 
2692   return Offset1 + 1 == Offset2;
2693 }
2694 
2695 static const MachineInstrBuilder &AddSubReg(const MachineInstrBuilder &MIB,
2696                                             unsigned Reg, unsigned SubIdx,
2697                                             unsigned State,
2698                                             const TargetRegisterInfo *TRI) {
2699   if (!SubIdx)
2700     return MIB.addReg(Reg, State);
2701 
2702   if (Register::isPhysicalRegister(Reg))
2703     return MIB.addReg(TRI->getSubReg(Reg, SubIdx), State);
2704   return MIB.addReg(Reg, State, SubIdx);
2705 }
2706 
2707 static bool forwardCopyWillClobberTuple(unsigned DestReg, unsigned SrcReg,
2708                                         unsigned NumRegs) {
2709   // We really want the positive remainder mod 32 here; that happens to be
2710   // easily obtainable with a mask.
2711   return ((DestReg - SrcReg) & 0x1f) < NumRegs;
2712 }
2713 
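// For example, copying the tuple Q1_Q2_Q3 into Q2_Q3_Q4 overlaps such that a
// forward sub-register copy would clobber its own source, so the loop below
// copies the sub-registers in reverse order in that case (illustrative).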
2714 void AArch64InstrInfo::copyPhysRegTuple(MachineBasicBlock &MBB,
2715                                         MachineBasicBlock::iterator I,
2716                                         const DebugLoc &DL, MCRegister DestReg,
2717                                         MCRegister SrcReg, bool KillSrc,
2718                                         unsigned Opcode,
2719                                         ArrayRef<unsigned> Indices) const {
2720   assert(Subtarget.hasNEON() && "Unexpected register copy without NEON");
2721   const TargetRegisterInfo *TRI = &getRegisterInfo();
2722   uint16_t DestEncoding = TRI->getEncodingValue(DestReg);
2723   uint16_t SrcEncoding = TRI->getEncodingValue(SrcReg);
2724   unsigned NumRegs = Indices.size();
2725 
2726   int SubReg = 0, End = NumRegs, Incr = 1;
2727   if (forwardCopyWillClobberTuple(DestEncoding, SrcEncoding, NumRegs)) {
2728     SubReg = NumRegs - 1;
2729     End = -1;
2730     Incr = -1;
2731   }
2732 
2733   for (; SubReg != End; SubReg += Incr) {
2734     const MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(Opcode));
2735     AddSubReg(MIB, DestReg, Indices[SubReg], RegState::Define, TRI);
2736     AddSubReg(MIB, SrcReg, Indices[SubReg], 0, TRI);
2737     AddSubReg(MIB, SrcReg, Indices[SubReg], getKillRegState(KillSrc), TRI);
2738   }
2739 }
2740 
2741 void AArch64InstrInfo::copyGPRRegTuple(MachineBasicBlock &MBB,
2742                                        MachineBasicBlock::iterator I,
2743                                        DebugLoc DL, unsigned DestReg,
2744                                        unsigned SrcReg, bool KillSrc,
2745                                        unsigned Opcode, unsigned ZeroReg,
2746                                        llvm::ArrayRef<unsigned> Indices) const {
2747   const TargetRegisterInfo *TRI = &getRegisterInfo();
2748   unsigned NumRegs = Indices.size();
2749 
2750 #ifndef NDEBUG
2751   uint16_t DestEncoding = TRI->getEncodingValue(DestReg);
2752   uint16_t SrcEncoding = TRI->getEncodingValue(SrcReg);
2753   assert(DestEncoding % NumRegs == 0 && SrcEncoding % NumRegs == 0 &&
2754          "GPR reg sequences should not be able to overlap");
2755 #endif
2756 
2757   for (unsigned SubReg = 0; SubReg != NumRegs; ++SubReg) {
2758     const MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(Opcode));
2759     AddSubReg(MIB, DestReg, Indices[SubReg], RegState::Define, TRI);
2760     MIB.addReg(ZeroReg);
2761     AddSubReg(MIB, SrcReg, Indices[SubReg], getKillRegState(KillSrc), TRI);
2762     MIB.addImm(0);
2763   }
2764 }
2765 
2766 void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
2767                                    MachineBasicBlock::iterator I,
2768                                    const DebugLoc &DL, MCRegister DestReg,
2769                                    MCRegister SrcReg, bool KillSrc) const {
2770   if (AArch64::GPR32spRegClass.contains(DestReg) &&
2771       (AArch64::GPR32spRegClass.contains(SrcReg) || SrcReg == AArch64::WZR)) {
2772     const TargetRegisterInfo *TRI = &getRegisterInfo();
2773 
2774     if (DestReg == AArch64::WSP || SrcReg == AArch64::WSP) {
2775       // If either operand is WSP, expand to ADD #0.
2776       if (Subtarget.hasZeroCycleRegMove()) {
2777         // Cyclone recognizes "ADD Xd, Xn, #0" as a zero-cycle register move.
2778         MCRegister DestRegX = TRI->getMatchingSuperReg(
2779             DestReg, AArch64::sub_32, &AArch64::GPR64spRegClass);
2780         MCRegister SrcRegX = TRI->getMatchingSuperReg(
2781             SrcReg, AArch64::sub_32, &AArch64::GPR64spRegClass);
2782         // This instruction is reading and writing X registers.  This may upset
2783         // the register scavenger and machine verifier, so we need to indicate
2784         // that we are reading an undefined value from SrcRegX, but a proper
2785         // value from SrcReg.
2786         BuildMI(MBB, I, DL, get(AArch64::ADDXri), DestRegX)
2787             .addReg(SrcRegX, RegState::Undef)
2788             .addImm(0)
2789             .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0))
2790             .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
2791       } else {
2792         BuildMI(MBB, I, DL, get(AArch64::ADDWri), DestReg)
2793             .addReg(SrcReg, getKillRegState(KillSrc))
2794             .addImm(0)
2795             .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
2796       }
2797     } else if (SrcReg == AArch64::WZR && Subtarget.hasZeroCycleZeroingGP()) {
2798       BuildMI(MBB, I, DL, get(AArch64::MOVZWi), DestReg)
2799           .addImm(0)
2800           .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
2801     } else {
2802       if (Subtarget.hasZeroCycleRegMove()) {
2803         // Cyclone recognizes "ORR Xd, XZR, Xm" as a zero-cycle register move.
2804         MCRegister DestRegX = TRI->getMatchingSuperReg(
2805             DestReg, AArch64::sub_32, &AArch64::GPR64spRegClass);
2806         MCRegister SrcRegX = TRI->getMatchingSuperReg(
2807             SrcReg, AArch64::sub_32, &AArch64::GPR64spRegClass);
2808         // This instruction is reading and writing X registers.  This may upset
2809         // the register scavenger and machine verifier, so we need to indicate
2810         // that we are reading an undefined value from SrcRegX, but a proper
2811         // value from SrcReg.
2812         BuildMI(MBB, I, DL, get(AArch64::ORRXrr), DestRegX)
2813             .addReg(AArch64::XZR)
2814             .addReg(SrcRegX, RegState::Undef)
2815             .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
2816       } else {
2817         // Otherwise, expand to ORR WZR.
2818         BuildMI(MBB, I, DL, get(AArch64::ORRWrr), DestReg)
2819             .addReg(AArch64::WZR)
2820             .addReg(SrcReg, getKillRegState(KillSrc));
2821       }
2822     }
2823     return;
2824   }
2825 
2826   // Copy a Predicate register by ORRing with itself.
2827   if (AArch64::PPRRegClass.contains(DestReg) &&
2828       AArch64::PPRRegClass.contains(SrcReg)) {
2829     assert(Subtarget.hasSVE() && "Unexpected SVE register.");
2830     BuildMI(MBB, I, DL, get(AArch64::ORR_PPzPP), DestReg)
2831       .addReg(SrcReg) // Pg
2832       .addReg(SrcReg)
2833       .addReg(SrcReg, getKillRegState(KillSrc));
2834     return;
2835   }
2836 
2837   // Copy a Z register by ORRing with itself.
2838   if (AArch64::ZPRRegClass.contains(DestReg) &&
2839       AArch64::ZPRRegClass.contains(SrcReg)) {
2840     assert(Subtarget.hasSVE() && "Unexpected SVE register.");
2841     BuildMI(MBB, I, DL, get(AArch64::ORR_ZZZ), DestReg)
2842       .addReg(SrcReg)
2843       .addReg(SrcReg, getKillRegState(KillSrc));
2844     return;
2845   }
2846 
2847   // Copy a Z register pair by copying the individual sub-registers.
2848   if (AArch64::ZPR2RegClass.contains(DestReg) &&
2849       AArch64::ZPR2RegClass.contains(SrcReg)) {
2850     static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1};
2851     copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ,
2852                      Indices);
2853     return;
2854   }
2855 
2856   // Copy a Z register triple by copying the individual sub-registers.
2857   if (AArch64::ZPR3RegClass.contains(DestReg) &&
2858       AArch64::ZPR3RegClass.contains(SrcReg)) {
2859     static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1,
2860                                        AArch64::zsub2};
2861     copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ,
2862                      Indices);
2863     return;
2864   }
2865 
2866   // Copy a Z register quad by copying the individual sub-registers.
2867   if (AArch64::ZPR4RegClass.contains(DestReg) &&
2868       AArch64::ZPR4RegClass.contains(SrcReg)) {
2869     static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1,
2870                                        AArch64::zsub2, AArch64::zsub3};
2871     copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ,
2872                      Indices);
2873     return;
2874   }
2875 
2876   if (AArch64::GPR64spRegClass.contains(DestReg) &&
2877       (AArch64::GPR64spRegClass.contains(SrcReg) || SrcReg == AArch64::XZR)) {
2878     if (DestReg == AArch64::SP || SrcReg == AArch64::SP) {
2879       // If either operand is SP, expand to ADD #0.
2880       BuildMI(MBB, I, DL, get(AArch64::ADDXri), DestReg)
2881           .addReg(SrcReg, getKillRegState(KillSrc))
2882           .addImm(0)
2883           .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
2884     } else if (SrcReg == AArch64::XZR && Subtarget.hasZeroCycleZeroingGP()) {
2885       BuildMI(MBB, I, DL, get(AArch64::MOVZXi), DestReg)
2886           .addImm(0)
2887           .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
2888     } else {
2889       // Otherwise, expand to ORR XZR.
2890       BuildMI(MBB, I, DL, get(AArch64::ORRXrr), DestReg)
2891           .addReg(AArch64::XZR)
2892           .addReg(SrcReg, getKillRegState(KillSrc));
2893     }
2894     return;
2895   }
2896 
2897   // Copy a DDDD register quad by copying the individual sub-registers.
2898   if (AArch64::DDDDRegClass.contains(DestReg) &&
2899       AArch64::DDDDRegClass.contains(SrcReg)) {
2900     static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1,
2901                                        AArch64::dsub2, AArch64::dsub3};
2902     copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8,
2903                      Indices);
2904     return;
2905   }
2906 
2907   // Copy a DDD register triple by copying the individual sub-registers.
2908   if (AArch64::DDDRegClass.contains(DestReg) &&
2909       AArch64::DDDRegClass.contains(SrcReg)) {
2910     static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1,
2911                                        AArch64::dsub2};
2912     copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8,
2913                      Indices);
2914     return;
2915   }
2916 
2917   // Copy a DD register pair by copying the individual sub-registers.
2918   if (AArch64::DDRegClass.contains(DestReg) &&
2919       AArch64::DDRegClass.contains(SrcReg)) {
2920     static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1};
2921     copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8,
2922                      Indices);
2923     return;
2924   }
2925 
2926   // Copy a QQQQ register quad by copying the individual sub-registers.
2927   if (AArch64::QQQQRegClass.contains(DestReg) &&
2928       AArch64::QQQQRegClass.contains(SrcReg)) {
2929     static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1,
2930                                        AArch64::qsub2, AArch64::qsub3};
2931     copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8,
2932                      Indices);
2933     return;
2934   }
2935 
2936   // Copy a QQQ register triple by copying the individual sub-registers.
2937   if (AArch64::QQQRegClass.contains(DestReg) &&
2938       AArch64::QQQRegClass.contains(SrcReg)) {
2939     static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1,
2940                                        AArch64::qsub2};
2941     copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8,
2942                      Indices);
2943     return;
2944   }
2945 
2946   // Copy a QQ register pair by copying the individual sub-registers.
2947   if (AArch64::QQRegClass.contains(DestReg) &&
2948       AArch64::QQRegClass.contains(SrcReg)) {
2949     static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1};
2950     copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8,
2951                      Indices);
2952     return;
2953   }
2954 
2955   if (AArch64::XSeqPairsClassRegClass.contains(DestReg) &&
2956       AArch64::XSeqPairsClassRegClass.contains(SrcReg)) {
2957     static const unsigned Indices[] = {AArch64::sube64, AArch64::subo64};
2958     copyGPRRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRXrs,
2959                     AArch64::XZR, Indices);
2960     return;
2961   }
2962 
2963   if (AArch64::WSeqPairsClassRegClass.contains(DestReg) &&
2964       AArch64::WSeqPairsClassRegClass.contains(SrcReg)) {
2965     static const unsigned Indices[] = {AArch64::sube32, AArch64::subo32};
2966     copyGPRRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRWrs,
2967                     AArch64::WZR, Indices);
2968     return;
2969   }
2970 
2971   if (AArch64::FPR128RegClass.contains(DestReg) &&
2972       AArch64::FPR128RegClass.contains(SrcReg)) {
2973     if (Subtarget.hasNEON()) {
2974       BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg)
2975           .addReg(SrcReg)
2976           .addReg(SrcReg, getKillRegState(KillSrc));
2977     } else {
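      // Without NEON there is no full-vector ORR, so bounce the value through
      // the stack; the pre-indexed store/load pair below leaves SP unchanged.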
2978       BuildMI(MBB, I, DL, get(AArch64::STRQpre))
2979           .addReg(AArch64::SP, RegState::Define)
2980           .addReg(SrcReg, getKillRegState(KillSrc))
2981           .addReg(AArch64::SP)
2982           .addImm(-16);
2983       BuildMI(MBB, I, DL, get(AArch64::LDRQpre))
2984           .addReg(AArch64::SP, RegState::Define)
2985           .addReg(DestReg, RegState::Define)
2986           .addReg(AArch64::SP)
2987           .addImm(16);
2988     }
2989     return;
2990   }
2991 
2992   if (AArch64::FPR64RegClass.contains(DestReg) &&
2993       AArch64::FPR64RegClass.contains(SrcReg)) {
2994     if (Subtarget.hasNEON()) {
2995       DestReg = RI.getMatchingSuperReg(DestReg, AArch64::dsub,
2996                                        &AArch64::FPR128RegClass);
2997       SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::dsub,
2998                                       &AArch64::FPR128RegClass);
2999       BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg)
3000           .addReg(SrcReg)
3001           .addReg(SrcReg, getKillRegState(KillSrc));
3002     } else {
3003       BuildMI(MBB, I, DL, get(AArch64::FMOVDr), DestReg)
3004           .addReg(SrcReg, getKillRegState(KillSrc));
3005     }
3006     return;
3007   }
3008 
3009   if (AArch64::FPR32RegClass.contains(DestReg) &&
3010       AArch64::FPR32RegClass.contains(SrcReg)) {
3011     if (Subtarget.hasNEON()) {
3012       DestReg = RI.getMatchingSuperReg(DestReg, AArch64::ssub,
3013                                        &AArch64::FPR128RegClass);
3014       SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::ssub,
3015                                       &AArch64::FPR128RegClass);
3016       BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg)
3017           .addReg(SrcReg)
3018           .addReg(SrcReg, getKillRegState(KillSrc));
3019     } else {
3020       BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
3021           .addReg(SrcReg, getKillRegState(KillSrc));
3022     }
3023     return;
3024   }
3025 
3026   if (AArch64::FPR16RegClass.contains(DestReg) &&
3027       AArch64::FPR16RegClass.contains(SrcReg)) {
3028     if (Subtarget.hasNEON()) {
3029       DestReg = RI.getMatchingSuperReg(DestReg, AArch64::hsub,
3030                                        &AArch64::FPR128RegClass);
3031       SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::hsub,
3032                                       &AArch64::FPR128RegClass);
3033       BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg)
3034           .addReg(SrcReg)
3035           .addReg(SrcReg, getKillRegState(KillSrc));
3036     } else {
3037       DestReg = RI.getMatchingSuperReg(DestReg, AArch64::hsub,
3038                                        &AArch64::FPR32RegClass);
3039       SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::hsub,
3040                                       &AArch64::FPR32RegClass);
3041       BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
3042           .addReg(SrcReg, getKillRegState(KillSrc));
3043     }
3044     return;
3045   }
3046 
3047   if (AArch64::FPR8RegClass.contains(DestReg) &&
3048       AArch64::FPR8RegClass.contains(SrcReg)) {
3049     if (Subtarget.hasNEON()) {
3050       DestReg = RI.getMatchingSuperReg(DestReg, AArch64::bsub,
3051                                        &AArch64::FPR128RegClass);
3052       SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::bsub,
3053                                       &AArch64::FPR128RegClass);
3054       BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg)
3055           .addReg(SrcReg)
3056           .addReg(SrcReg, getKillRegState(KillSrc));
3057     } else {
3058       DestReg = RI.getMatchingSuperReg(DestReg, AArch64::bsub,
3059                                        &AArch64::FPR32RegClass);
3060       SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::bsub,
3061                                       &AArch64::FPR32RegClass);
3062       BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
3063           .addReg(SrcReg, getKillRegState(KillSrc));
3064     }
3065     return;
3066   }
3067 
3068   // Copies between GPR64 and FPR64.
3069   if (AArch64::FPR64RegClass.contains(DestReg) &&
3070       AArch64::GPR64RegClass.contains(SrcReg)) {
3071     BuildMI(MBB, I, DL, get(AArch64::FMOVXDr), DestReg)
3072         .addReg(SrcReg, getKillRegState(KillSrc));
3073     return;
3074   }
3075   if (AArch64::GPR64RegClass.contains(DestReg) &&
3076       AArch64::FPR64RegClass.contains(SrcReg)) {
3077     BuildMI(MBB, I, DL, get(AArch64::FMOVDXr), DestReg)
3078         .addReg(SrcReg, getKillRegState(KillSrc));
3079     return;
3080   }
3081   // Copies between GPR32 and FPR32.
3082   if (AArch64::FPR32RegClass.contains(DestReg) &&
3083       AArch64::GPR32RegClass.contains(SrcReg)) {
3084     BuildMI(MBB, I, DL, get(AArch64::FMOVWSr), DestReg)
3085         .addReg(SrcReg, getKillRegState(KillSrc));
3086     return;
3087   }
3088   if (AArch64::GPR32RegClass.contains(DestReg) &&
3089       AArch64::FPR32RegClass.contains(SrcReg)) {
3090     BuildMI(MBB, I, DL, get(AArch64::FMOVSWr), DestReg)
3091         .addReg(SrcReg, getKillRegState(KillSrc));
3092     return;
3093   }
3094 
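  // NZCV has no plain register-to-register move; copies to and from it go
  // through the NZCV system register using MSR/MRS.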
3095   if (DestReg == AArch64::NZCV) {
3096     assert(AArch64::GPR64RegClass.contains(SrcReg) && "Invalid NZCV copy");
3097     BuildMI(MBB, I, DL, get(AArch64::MSR))
3098         .addImm(AArch64SysReg::NZCV)
3099         .addReg(SrcReg, getKillRegState(KillSrc))
3100         .addReg(AArch64::NZCV, RegState::Implicit | RegState::Define);
3101     return;
3102   }
3103 
3104   if (SrcReg == AArch64::NZCV) {
3105     assert(AArch64::GPR64RegClass.contains(DestReg) && "Invalid NZCV copy");
3106     BuildMI(MBB, I, DL, get(AArch64::MRS), DestReg)
3107         .addImm(AArch64SysReg::NZCV)
3108         .addReg(AArch64::NZCV, RegState::Implicit | getKillRegState(KillSrc));
3109     return;
3110   }
3111 
3112   llvm_unreachable("unimplemented reg-to-reg copy");
3113 }
3114 
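/// Store a register pair (e.g. a W/X sequential register tuple) to a stack
/// slot with a single store-pair instruction, splitting a physical register
/// into its two sub-registers first.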
3115 static void storeRegPairToStackSlot(const TargetRegisterInfo &TRI,
3116                                     MachineBasicBlock &MBB,
3117                                     MachineBasicBlock::iterator InsertBefore,
3118                                     const MCInstrDesc &MCID,
3119                                     Register SrcReg, bool IsKill,
3120                                     unsigned SubIdx0, unsigned SubIdx1, int FI,
3121                                     MachineMemOperand *MMO) {
3122   Register SrcReg0 = SrcReg;
3123   Register SrcReg1 = SrcReg;
3124   if (Register::isPhysicalRegister(SrcReg)) {
3125     SrcReg0 = TRI.getSubReg(SrcReg, SubIdx0);
3126     SubIdx0 = 0;
3127     SrcReg1 = TRI.getSubReg(SrcReg, SubIdx1);
3128     SubIdx1 = 0;
3129   }
3130   BuildMI(MBB, InsertBefore, DebugLoc(), MCID)
3131       .addReg(SrcReg0, getKillRegState(IsKill), SubIdx0)
3132       .addReg(SrcReg1, getKillRegState(IsKill), SubIdx1)
3133       .addFrameIndex(FI)
3134       .addImm(0)
3135       .addMemOperand(MMO);
3136 }
3137 
3138 void AArch64InstrInfo::storeRegToStackSlot(
3139     MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register SrcReg,
3140     bool isKill, int FI, const TargetRegisterClass *RC,
3141     const TargetRegisterInfo *TRI) const {
3142   MachineFunction &MF = *MBB.getParent();
3143   MachineFrameInfo &MFI = MF.getFrameInfo();
3144 
3145   MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
3146   MachineMemOperand *MMO =
3147       MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore,
3148                               MFI.getObjectSize(FI), MFI.getObjectAlign(FI));
3149   unsigned Opc = 0;
3150   bool Offset = true;
3151   unsigned StackID = TargetStackID::Default;
3152   switch (TRI->getSpillSize(*RC)) {
3153   case 1:
3154     if (AArch64::FPR8RegClass.hasSubClassEq(RC))
3155       Opc = AArch64::STRBui;
3156     break;
3157   case 2:
3158     if (AArch64::FPR16RegClass.hasSubClassEq(RC))
3159       Opc = AArch64::STRHui;
3160     else if (AArch64::PPRRegClass.hasSubClassEq(RC)) {
3161       assert(Subtarget.hasSVE() && "Unexpected register store without SVE");
3162       Opc = AArch64::STR_PXI;
3163       StackID = TargetStackID::SVEVector;
3164     }
3165     break;
3166   case 4:
3167     if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
3168       Opc = AArch64::STRWui;
3169       if (Register::isVirtualRegister(SrcReg))
3170         MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR32RegClass);
3171       else
3172         assert(SrcReg != AArch64::WSP);
3173     } else if (AArch64::FPR32RegClass.hasSubClassEq(RC))
3174       Opc = AArch64::STRSui;
3175     break;
3176   case 8:
3177     if (AArch64::GPR64allRegClass.hasSubClassEq(RC)) {
3178       Opc = AArch64::STRXui;
3179       if (Register::isVirtualRegister(SrcReg))
3180         MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR64RegClass);
3181       else
3182         assert(SrcReg != AArch64::SP);
3183     } else if (AArch64::FPR64RegClass.hasSubClassEq(RC)) {
3184       Opc = AArch64::STRDui;
3185     } else if (AArch64::WSeqPairsClassRegClass.hasSubClassEq(RC)) {
3186       storeRegPairToStackSlot(getRegisterInfo(), MBB, MBBI,
3187                               get(AArch64::STPWi), SrcReg, isKill,
3188                               AArch64::sube32, AArch64::subo32, FI, MMO);
3189       return;
3190     }
3191     break;
3192   case 16:
3193     if (AArch64::FPR128RegClass.hasSubClassEq(RC))
3194       Opc = AArch64::STRQui;
3195     else if (AArch64::DDRegClass.hasSubClassEq(RC)) {
3196       assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
3197       Opc = AArch64::ST1Twov1d;
3198       Offset = false;
3199     } else if (AArch64::XSeqPairsClassRegClass.hasSubClassEq(RC)) {
3200       storeRegPairToStackSlot(getRegisterInfo(), MBB, MBBI,
3201                               get(AArch64::STPXi), SrcReg, isKill,
3202                               AArch64::sube64, AArch64::subo64, FI, MMO);
3203       return;
3204     } else if (AArch64::ZPRRegClass.hasSubClassEq(RC)) {
3205       assert(Subtarget.hasSVE() && "Unexpected register store without SVE");
3206       Opc = AArch64::STR_ZXI;
3207       StackID = TargetStackID::SVEVector;
3208     }
3209     break;
3210   case 24:
3211     if (AArch64::DDDRegClass.hasSubClassEq(RC)) {
3212       assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
3213       Opc = AArch64::ST1Threev1d;
3214       Offset = false;
3215     }
3216     break;
3217   case 32:
3218     if (AArch64::DDDDRegClass.hasSubClassEq(RC)) {
3219       assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
3220       Opc = AArch64::ST1Fourv1d;
3221       Offset = false;
3222     } else if (AArch64::QQRegClass.hasSubClassEq(RC)) {
3223       assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
3224       Opc = AArch64::ST1Twov2d;
3225       Offset = false;
3226     } else if (AArch64::ZPR2RegClass.hasSubClassEq(RC)) {
3227       assert(Subtarget.hasSVE() && "Unexpected register store without SVE");
3228       Opc = AArch64::STR_ZZXI;
3229       StackID = TargetStackID::SVEVector;
3230     }
3231     break;
3232   case 48:
3233     if (AArch64::QQQRegClass.hasSubClassEq(RC)) {
3234       assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
3235       Opc = AArch64::ST1Threev2d;
3236       Offset = false;
3237     } else if (AArch64::ZPR3RegClass.hasSubClassEq(RC)) {
3238       assert(Subtarget.hasSVE() && "Unexpected register store without SVE");
3239       Opc = AArch64::STR_ZZZXI;
3240       StackID = TargetStackID::SVEVector;
3241     }
3242     break;
3243   case 64:
3244     if (AArch64::QQQQRegClass.hasSubClassEq(RC)) {
3245       assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
3246       Opc = AArch64::ST1Fourv2d;
3247       Offset = false;
3248     } else if (AArch64::ZPR4RegClass.hasSubClassEq(RC)) {
3249       assert(Subtarget.hasSVE() && "Unexpected register store without SVE");
3250       Opc = AArch64::STR_ZZZZXI;
3251       StackID = TargetStackID::SVEVector;
3252     }
3253     break;
3254   }
3255   assert(Opc && "Unknown register class");
3256   MFI.setStackID(FI, StackID);
3257 
3258   const MachineInstrBuilder MI = BuildMI(MBB, MBBI, DebugLoc(), get(Opc))
3259                                      .addReg(SrcReg, getKillRegState(isKill))
3260                                      .addFrameIndex(FI);
3261 
3262   if (Offset)
3263     MI.addImm(0);
3264   MI.addMemOperand(MMO);
3265 }
3266 
3267 static void loadRegPairFromStackSlot(const TargetRegisterInfo &TRI,
3268                                      MachineBasicBlock &MBB,
3269                                      MachineBasicBlock::iterator InsertBefore,
3270                                      const MCInstrDesc &MCID,
3271                                      Register DestReg, unsigned SubIdx0,
3272                                      unsigned SubIdx1, int FI,
3273                                      MachineMemOperand *MMO) {
3274   Register DestReg0 = DestReg;
3275   Register DestReg1 = DestReg;
3276   bool IsUndef = true;
3277   if (Register::isPhysicalRegister(DestReg)) {
3278     DestReg0 = TRI.getSubReg(DestReg, SubIdx0);
3279     SubIdx0 = 0;
3280     DestReg1 = TRI.getSubReg(DestReg, SubIdx1);
3281     SubIdx1 = 0;
3282     IsUndef = false;
3283   }
3284   BuildMI(MBB, InsertBefore, DebugLoc(), MCID)
3285       .addReg(DestReg0, RegState::Define | getUndefRegState(IsUndef), SubIdx0)
3286       .addReg(DestReg1, RegState::Define | getUndefRegState(IsUndef), SubIdx1)
3287       .addFrameIndex(FI)
3288       .addImm(0)
3289       .addMemOperand(MMO);
3290 }
3291 
3292 void AArch64InstrInfo::loadRegFromStackSlot(
3293     MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register DestReg,
3294     int FI, const TargetRegisterClass *RC,
3295     const TargetRegisterInfo *TRI) const {
3296   MachineFunction &MF = *MBB.getParent();
3297   MachineFrameInfo &MFI = MF.getFrameInfo();
3298   MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
3299   MachineMemOperand *MMO =
3300       MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOLoad,
3301                               MFI.getObjectSize(FI), MFI.getObjectAlign(FI));
3302 
3303   unsigned Opc = 0;
3304   bool Offset = true;
3305   unsigned StackID = TargetStackID::Default;
3306   switch (TRI->getSpillSize(*RC)) {
3307   case 1:
3308     if (AArch64::FPR8RegClass.hasSubClassEq(RC))
3309       Opc = AArch64::LDRBui;
3310     break;
3311   case 2:
3312     if (AArch64::FPR16RegClass.hasSubClassEq(RC))
3313       Opc = AArch64::LDRHui;
3314     else if (AArch64::PPRRegClass.hasSubClassEq(RC)) {
3315       assert(Subtarget.hasSVE() && "Unexpected register load without SVE");
3316       Opc = AArch64::LDR_PXI;
3317       StackID = TargetStackID::SVEVector;
3318     }
3319     break;
3320   case 4:
3321     if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
3322       Opc = AArch64::LDRWui;
3323       if (Register::isVirtualRegister(DestReg))
3324         MF.getRegInfo().constrainRegClass(DestReg, &AArch64::GPR32RegClass);
3325       else
3326         assert(DestReg != AArch64::WSP);
3327     } else if (AArch64::FPR32RegClass.hasSubClassEq(RC))
3328       Opc = AArch64::LDRSui;
3329     break;
3330   case 8:
3331     if (AArch64::GPR64allRegClass.hasSubClassEq(RC)) {
3332       Opc = AArch64::LDRXui;
3333       if (Register::isVirtualRegister(DestReg))
3334         MF.getRegInfo().constrainRegClass(DestReg, &AArch64::GPR64RegClass);
3335       else
3336         assert(DestReg != AArch64::SP);
3337     } else if (AArch64::FPR64RegClass.hasSubClassEq(RC)) {
3338       Opc = AArch64::LDRDui;
3339     } else if (AArch64::WSeqPairsClassRegClass.hasSubClassEq(RC)) {
3340       loadRegPairFromStackSlot(getRegisterInfo(), MBB, MBBI,
3341                                get(AArch64::LDPWi), DestReg, AArch64::sube32,
3342                                AArch64::subo32, FI, MMO);
3343       return;
3344     }
3345     break;
3346   case 16:
3347     if (AArch64::FPR128RegClass.hasSubClassEq(RC))
3348       Opc = AArch64::LDRQui;
3349     else if (AArch64::DDRegClass.hasSubClassEq(RC)) {
3350       assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
3351       Opc = AArch64::LD1Twov1d;
3352       Offset = false;
3353     } else if (AArch64::XSeqPairsClassRegClass.hasSubClassEq(RC)) {
3354       loadRegPairFromStackSlot(getRegisterInfo(), MBB, MBBI,
3355                                get(AArch64::LDPXi), DestReg, AArch64::sube64,
3356                                AArch64::subo64, FI, MMO);
3357       return;
3358     } else if (AArch64::ZPRRegClass.hasSubClassEq(RC)) {
3359       assert(Subtarget.hasSVE() && "Unexpected register load without SVE");
3360       Opc = AArch64::LDR_ZXI;
3361       StackID = TargetStackID::SVEVector;
3362     }
3363     break;
3364   case 24:
3365     if (AArch64::DDDRegClass.hasSubClassEq(RC)) {
3366       assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
3367       Opc = AArch64::LD1Threev1d;
3368       Offset = false;
3369     }
3370     break;
3371   case 32:
3372     if (AArch64::DDDDRegClass.hasSubClassEq(RC)) {
3373       assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
3374       Opc = AArch64::LD1Fourv1d;
3375       Offset = false;
3376     } else if (AArch64::QQRegClass.hasSubClassEq(RC)) {
3377       assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
3378       Opc = AArch64::LD1Twov2d;
3379       Offset = false;
3380     } else if (AArch64::ZPR2RegClass.hasSubClassEq(RC)) {
3381       assert(Subtarget.hasSVE() && "Unexpected register load without SVE");
3382       Opc = AArch64::LDR_ZZXI;
3383       StackID = TargetStackID::SVEVector;
3384     }
3385     break;
3386   case 48:
3387     if (AArch64::QQQRegClass.hasSubClassEq(RC)) {
3388       assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
3389       Opc = AArch64::LD1Threev2d;
3390       Offset = false;
3391     } else if (AArch64::ZPR3RegClass.hasSubClassEq(RC)) {
3392       assert(Subtarget.hasSVE() && "Unexpected register load without SVE");
3393       Opc = AArch64::LDR_ZZZXI;
3394       StackID = TargetStackID::SVEVector;
3395     }
3396     break;
3397   case 64:
3398     if (AArch64::QQQQRegClass.hasSubClassEq(RC)) {
3399       assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
3400       Opc = AArch64::LD1Fourv2d;
3401       Offset = false;
3402     } else if (AArch64::ZPR4RegClass.hasSubClassEq(RC)) {
3403       assert(Subtarget.hasSVE() && "Unexpected register load without SVE");
3404       Opc = AArch64::LDR_ZZZZXI;
3405       StackID = TargetStackID::SVEVector;
3406     }
3407     break;
3408   }
3409 
3410   assert(Opc && "Unknown register class");
3411   MFI.setStackID(FI, StackID);
3412 
3413   const MachineInstrBuilder MI = BuildMI(MBB, MBBI, DebugLoc(), get(Opc))
3414                                      .addReg(DestReg, getDefRegState(true))
3415                                      .addFrameIndex(FI);
3416   if (Offset)
3417     MI.addImm(0);
3418   MI.addMemOperand(MMO);
3419 }
3420 
3421 bool llvm::isNZCVTouchedInInstructionRange(const MachineInstr &DefMI,
3422                                            const MachineInstr &UseMI,
3423                                            const TargetRegisterInfo *TRI) {
3424   return any_of(instructionsWithoutDebug(std::next(DefMI.getIterator()),
3425                                          UseMI.getIterator()),
3426                 [TRI](const MachineInstr &I) {
3427                   return I.modifiesRegister(AArch64::NZCV, TRI) ||
3428                          I.readsRegister(AArch64::NZCV, TRI);
3429                 });
3430 }
3431 
3432 void AArch64InstrInfo::decomposeStackOffsetForDwarfOffsets(
3433     const StackOffset &Offset, int64_t &ByteSized, int64_t &VGSized) {
3434   // The smallest scalable elements supported by scaled SVE addressing
3435   // modes are predicates, which are 2 scalable bytes in size. So the scalable
3436   // byte offset must always be a multiple of 2.
3437   assert(Offset.getScalable() % 2 == 0 && "Invalid frame offset");
3438 
3439   // VGSized offsets are divided by '2', because the VG register is the
3440   // number of 64bit granules as opposed to 128bit vector chunks,
3441   // which is how the 'n' in e.g. MVT::nxv1i8 is modelled.
3442   // So, for a stack offset of 16 MVT::nxv1i8's, the size is n x 16 bytes.
3443   // VG = n * 2 and the dwarf offset must be VG * 8 bytes.
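  // For example, a StackOffset of 16 fixed bytes and 6 scalable bytes
  // decomposes into ByteSized = 16 and VGSized = 3.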
3444   ByteSized = Offset.getFixed();
3445   VGSized = Offset.getScalable() / 2;
3446 }
3447 
3448 /// Decomposes the given frame offset into the parts needed to materialize
3449 /// it: a fixed byte offset plus counts of SVE predicate and data vectors.
3450 /// For non-scalable offsets only the byte part is non-zero.
3451 void AArch64InstrInfo::decomposeStackOffsetForFrameOffsets(
3452     const StackOffset &Offset, int64_t &NumBytes, int64_t &NumPredicateVectors,
3453     int64_t &NumDataVectors) {
3454   // The smallest scalable elements supported by scaled SVE addressing
3455   // modes are predicates, which are 2 scalable bytes in size. So the scalable
3456   // byte offset must always be a multiple of 2.
3457   assert(Offset.getScalable() % 2 == 0 && "Invalid frame offset");
3458 
3459   NumBytes = Offset.getFixed();
3460   NumDataVectors = 0;
3461   NumPredicateVectors = Offset.getScalable() / 2;
3462   // This method is used to get the offsets to adjust the frame offset.
3463   // If the function requires ADDPL to be used and needs more than two ADDPL
3464   // instructions, part of the offset is folded into NumDataVectors so that it
3465   // uses ADDVL for part of it, reducing the number of ADDPL instructions.
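  // For example, a scalable offset of 64 bytes yields NumPredicateVectors = 32,
  // which is folded into NumDataVectors = 4 so that a single ADDVL is emitted
  // instead of several ADDPLs.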
3466   if (NumPredicateVectors % 8 == 0 || NumPredicateVectors < -64 ||
3467       NumPredicateVectors > 62) {
3468     NumDataVectors = NumPredicateVectors / 8;
3469     NumPredicateVectors -= NumDataVectors * 8;
3470   }
3471 }
3472 
3473 // Helper function to emit a frame offset adjustment from a given
3474 // pointer (SrcReg) into DestReg. Unlike emitFrameOffset below, the caller
3475 // must supply the exact opcode to use.
3476 static void emitFrameOffsetAdj(MachineBasicBlock &MBB,
3477                                MachineBasicBlock::iterator MBBI,
3478                                const DebugLoc &DL, unsigned DestReg,
3479                                unsigned SrcReg, int64_t Offset, unsigned Opc,
3480                                const TargetInstrInfo *TII,
3481                                MachineInstr::MIFlag Flag, bool NeedsWinCFI,
3482                                bool *HasWinCFI) {
3483   int Sign = 1;
3484   unsigned MaxEncoding, ShiftSize;
3485   switch (Opc) {
3486   case AArch64::ADDXri:
3487   case AArch64::ADDSXri:
3488   case AArch64::SUBXri:
3489   case AArch64::SUBSXri:
3490     MaxEncoding = 0xfff;
3491     ShiftSize = 12;
3492     break;
3493   case AArch64::ADDVL_XXI:
3494   case AArch64::ADDPL_XXI:
3495     MaxEncoding = 31;
3496     ShiftSize = 0;
3497     if (Offset < 0) {
3498       MaxEncoding = 32;
3499       Sign = -1;
3500       Offset = -Offset;
3501     }
3502     break;
3503   default:
3504     llvm_unreachable("Unsupported opcode");
3505   }
3506 
3507   // FIXME: If the offset won't fit in 24-bits, compute the offset into a
3508   // scratch register.  If DestReg is a virtual register, use it as the
3509   // scratch register; otherwise, create a new virtual register (to be
3510   // replaced by the scavenger at the end of PEI).  That case can be optimized
3511   // slightly if DestReg is SP which is always 16-byte aligned, so the scratch
3512   // register can be loaded with offset%8 and the add/sub can use an extending
3513   // instruction with LSL#3.
3514   // Currently the function handles any offsets but generates a poor sequence
3515   // of code.
3516   //  assert(Offset < (1 << 24) && "unimplemented reg plus immediate");
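  // For example, with ADDXri an offset of 0x1001 does not fit in a single
  // 12-bit immediate, so the loop below emits ADD #1 (LSL #12) followed by
  // ADD #1.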
3517 
3518   const unsigned MaxEncodableValue = MaxEncoding << ShiftSize;
3519   Register TmpReg = DestReg;
3520   if (TmpReg == AArch64::XZR)
3521     TmpReg = MBB.getParent()->getRegInfo().createVirtualRegister(
3522         &AArch64::GPR64RegClass);
3523   do {
3524     uint64_t ThisVal = std::min<uint64_t>(Offset, MaxEncodableValue);
3525     unsigned LocalShiftSize = 0;
3526     if (ThisVal > MaxEncoding) {
3527       ThisVal = ThisVal >> ShiftSize;
3528       LocalShiftSize = ShiftSize;
3529     }
3530     assert((ThisVal >> ShiftSize) <= MaxEncoding &&
3531            "Encoding cannot handle value that big");
3532 
3533     Offset -= ThisVal << LocalShiftSize;
3534     if (Offset == 0)
3535       TmpReg = DestReg;
3536     auto MBI = BuildMI(MBB, MBBI, DL, TII->get(Opc), TmpReg)
3537                    .addReg(SrcReg)
3538                    .addImm(Sign * (int)ThisVal);
3539     if (ShiftSize)
3540       MBI = MBI.addImm(
3541           AArch64_AM::getShifterImm(AArch64_AM::LSL, LocalShiftSize));
3542     MBI = MBI.setMIFlag(Flag);
3543 
3544     if (NeedsWinCFI) {
3545       assert(Sign == 1 && "SEH directives should always have a positive sign");
3546       int Imm = (int)(ThisVal << LocalShiftSize);
3547       if ((DestReg == AArch64::FP && SrcReg == AArch64::SP) ||
3548           (SrcReg == AArch64::FP && DestReg == AArch64::SP)) {
3549         if (HasWinCFI)
3550           *HasWinCFI = true;
3551         if (Imm == 0)
3552           BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_SetFP)).setMIFlag(Flag);
3553         else
3554           BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_AddFP))
3555               .addImm(Imm)
3556               .setMIFlag(Flag);
3557         assert(Offset == 0 && "Expected remaining offset to be zero to "
3558                               "emit a single SEH directive");
3559       } else if (DestReg == AArch64::SP) {
3560         if (HasWinCFI)
3561           *HasWinCFI = true;
3562         assert(SrcReg == AArch64::SP && "Unexpected SrcReg for SEH_StackAlloc");
3563         BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_StackAlloc))
3564             .addImm(Imm)
3565             .setMIFlag(Flag);
3566       }
3567       if (HasWinCFI)
3568         *HasWinCFI = true;
3569     }
3570 
3571     SrcReg = TmpReg;
3572   } while (Offset);
3573 }
3574 
3575 void llvm::emitFrameOffset(MachineBasicBlock &MBB,
3576                            MachineBasicBlock::iterator MBBI, const DebugLoc &DL,
3577                            unsigned DestReg, unsigned SrcReg,
3578                            StackOffset Offset, const TargetInstrInfo *TII,
3579                            MachineInstr::MIFlag Flag, bool SetNZCV,
3580                            bool NeedsWinCFI, bool *HasWinCFI) {
3581   int64_t Bytes, NumPredicateVectors, NumDataVectors;
3582   AArch64InstrInfo::decomposeStackOffsetForFrameOffsets(
3583       Offset, Bytes, NumPredicateVectors, NumDataVectors);
3584 
3585   // First emit non-scalable frame offsets, or a simple 'mov'.
3586   if (Bytes || (!Offset && SrcReg != DestReg)) {
3587     assert((DestReg != AArch64::SP || Bytes % 8 == 0) &&
3588            "SP increment/decrement not 8-byte aligned");
3589     unsigned Opc = SetNZCV ? AArch64::ADDSXri : AArch64::ADDXri;
3590     if (Bytes < 0) {
3591       Bytes = -Bytes;
3592       Opc = SetNZCV ? AArch64::SUBSXri : AArch64::SUBXri;
3593     }
3594     emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, Bytes, Opc, TII, Flag,
3595                        NeedsWinCFI, HasWinCFI);
3596     SrcReg = DestReg;
3597   }
3598 
3599   assert(!(SetNZCV && (NumPredicateVectors || NumDataVectors)) &&
3600          "SetNZCV not supported with SVE vectors");
3601   assert(!(NeedsWinCFI && (NumPredicateVectors || NumDataVectors)) &&
3602          "WinCFI not supported with SVE vectors");
3603 
3604   if (NumDataVectors) {
3605     emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, NumDataVectors,
3606                        AArch64::ADDVL_XXI, TII, Flag, NeedsWinCFI, nullptr);
3607     SrcReg = DestReg;
3608   }
3609 
3610   if (NumPredicateVectors) {
3611     assert(DestReg != AArch64::SP && "Unaligned access to SP");
3612     emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, NumPredicateVectors,
3613                        AArch64::ADDPL_XXI, TII, Flag, NeedsWinCFI, nullptr);
3614   }
3615 }
3616 
3617 MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl(
3618     MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,
3619     MachineBasicBlock::iterator InsertPt, int FrameIndex,
3620     LiveIntervals *LIS, VirtRegMap *VRM) const {
3621   // This is a bit of a hack. Consider this instruction:
3622   //
3623   //   %0 = COPY %sp; GPR64all:%0
3624   //
3625   // We explicitly chose GPR64all for the virtual register so such a copy might
3626   // be eliminated by RegisterCoalescer. However, that may not be possible, and
3627   // %0 may even spill. We can't spill %sp, and since it is in the GPR64all
3628   // register class, TargetInstrInfo::foldMemoryOperand() is going to try.
3629   //
3630   // To prevent that, we are going to constrain the %0 register class here.
3631   //
3632   // <rdar://problem/11522048>
3633   //
3634   if (MI.isFullCopy()) {
3635     Register DstReg = MI.getOperand(0).getReg();
3636     Register SrcReg = MI.getOperand(1).getReg();
3637     if (SrcReg == AArch64::SP && Register::isVirtualRegister(DstReg)) {
3638       MF.getRegInfo().constrainRegClass(DstReg, &AArch64::GPR64RegClass);
3639       return nullptr;
3640     }
3641     if (DstReg == AArch64::SP && Register::isVirtualRegister(SrcReg)) {
3642       MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR64RegClass);
3643       return nullptr;
3644     }
3645   }
3646 
3647   // Handle the case where a copy is being spilled or filled but the source
3648   // and destination register class don't match.  For example:
3649   //
3650   //   %0 = COPY %xzr; GPR64common:%0
3651   //
3652   // In this case we can still safely fold away the COPY and generate the
3653   // following spill code:
3654   //
3655   //   STRXui %xzr, %stack.0
3656   //
3657   // This also eliminates spilled cross register class COPYs (e.g. between x and
3658   // d regs) of the same size.  For example:
3659   //
3660   //   %0 = COPY %1; GPR64:%0, FPR64:%1
3661   //
3662   // will be filled as
3663   //
3664   //   LDRDui %0, fi<#0>
3665   //
3666   // instead of
3667   //
3668   //   LDRXui %Temp, fi<#0>
3669   //   %0 = FMOV %Temp
3670   //
3671   if (MI.isCopy() && Ops.size() == 1 &&
3672       // Make sure we're only folding the explicit COPY defs/uses.
3673       (Ops[0] == 0 || Ops[0] == 1)) {
3674     bool IsSpill = Ops[0] == 0;
3675     bool IsFill = !IsSpill;
3676     const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
3677     const MachineRegisterInfo &MRI = MF.getRegInfo();
3678     MachineBasicBlock &MBB = *MI.getParent();
3679     const MachineOperand &DstMO = MI.getOperand(0);
3680     const MachineOperand &SrcMO = MI.getOperand(1);
3681     Register DstReg = DstMO.getReg();
3682     Register SrcReg = SrcMO.getReg();
3683     // This is slightly expensive to compute for physical regs since
3684     // getMinimalPhysRegClass is slow.
3685     auto getRegClass = [&](unsigned Reg) {
3686       return Register::isVirtualRegister(Reg) ? MRI.getRegClass(Reg)
3687                                               : TRI.getMinimalPhysRegClass(Reg);
3688     };
3689 
3690     if (DstMO.getSubReg() == 0 && SrcMO.getSubReg() == 0) {
3691       assert(TRI.getRegSizeInBits(*getRegClass(DstReg)) ==
3692                  TRI.getRegSizeInBits(*getRegClass(SrcReg)) &&
3693              "Mismatched register size in non subreg COPY");
3694       if (IsSpill)
3695         storeRegToStackSlot(MBB, InsertPt, SrcReg, SrcMO.isKill(), FrameIndex,
3696                             getRegClass(SrcReg), &TRI);
3697       else
3698         loadRegFromStackSlot(MBB, InsertPt, DstReg, FrameIndex,
3699                              getRegClass(DstReg), &TRI);
3700       return &*--InsertPt;
3701     }
3702 
3703     // Handle cases like spilling def of:
3704     //
3705     //   %0:sub_32<def,read-undef> = COPY %wzr; GPR64common:%0
3706     //
3707     // where the physical register source can be widened and stored to the full
3708     // virtual reg destination stack slot, in this case producing:
3709     //
3710     //   STRXui %xzr, %stack.0
3711     //
3712     if (IsSpill && DstMO.isUndef() && Register::isPhysicalRegister(SrcReg)) {
3713       assert(SrcMO.getSubReg() == 0 &&
3714              "Unexpected subreg on physical register");
3715       const TargetRegisterClass *SpillRC;
3716       unsigned SpillSubreg;
3717       switch (DstMO.getSubReg()) {
3718       default:
3719         SpillRC = nullptr;
3720         break;
3721       case AArch64::sub_32:
3722       case AArch64::ssub:
3723         if (AArch64::GPR32RegClass.contains(SrcReg)) {
3724           SpillRC = &AArch64::GPR64RegClass;
3725           SpillSubreg = AArch64::sub_32;
3726         } else if (AArch64::FPR32RegClass.contains(SrcReg)) {
3727           SpillRC = &AArch64::FPR64RegClass;
3728           SpillSubreg = AArch64::ssub;
3729         } else
3730           SpillRC = nullptr;
3731         break;
3732       case AArch64::dsub:
3733         if (AArch64::FPR64RegClass.contains(SrcReg)) {
3734           SpillRC = &AArch64::FPR128RegClass;
3735           SpillSubreg = AArch64::dsub;
3736         } else
3737           SpillRC = nullptr;
3738         break;
3739       }
3740 
3741       if (SpillRC)
3742         if (unsigned WidenedSrcReg =
3743                 TRI.getMatchingSuperReg(SrcReg, SpillSubreg, SpillRC)) {
3744           storeRegToStackSlot(MBB, InsertPt, WidenedSrcReg, SrcMO.isKill(),
3745                               FrameIndex, SpillRC, &TRI);
3746           return &*--InsertPt;
3747         }
3748     }
3749 
3750     // Handle cases like filling use of:
3751     //
3752     //   %0:sub_32<def,read-undef> = COPY %1; GPR64:%0, GPR32:%1
3753     //
3754     // where we can load the full virtual reg source stack slot, into the subreg
3755     // destination, in this case producing:
3756     //
3757     //   LDRWui %0:sub_32<def,read-undef>, %stack.0
3758     //
3759     if (IsFill && SrcMO.getSubReg() == 0 && DstMO.isUndef()) {
3760       const TargetRegisterClass *FillRC;
3761       switch (DstMO.getSubReg()) {
3762       default:
3763         FillRC = nullptr;
3764         break;
3765       case AArch64::sub_32:
3766         FillRC = &AArch64::GPR32RegClass;
3767         break;
3768       case AArch64::ssub:
3769         FillRC = &AArch64::FPR32RegClass;
3770         break;
3771       case AArch64::dsub:
3772         FillRC = &AArch64::FPR64RegClass;
3773         break;
3774       }
3775 
3776       if (FillRC) {
3777         assert(TRI.getRegSizeInBits(*getRegClass(SrcReg)) ==
3778                    TRI.getRegSizeInBits(*FillRC) &&
3779                "Mismatched regclass size on folded subreg COPY");
3780         loadRegFromStackSlot(MBB, InsertPt, DstReg, FrameIndex, FillRC, &TRI);
3781         MachineInstr &LoadMI = *--InsertPt;
3782         MachineOperand &LoadDst = LoadMI.getOperand(0);
3783         assert(LoadDst.getSubReg() == 0 && "unexpected subreg on fill load");
3784         LoadDst.setSubReg(DstMO.getSubReg());
3785         LoadDst.setIsUndef();
3786         return &LoadMI;
3787       }
3788     }
3789   }
3790 
3791   // Cannot fold.
3792   return nullptr;
3793 }
3794 
3795 int llvm::isAArch64FrameOffsetLegal(const MachineInstr &MI,
3796                                     StackOffset &SOffset,
3797                                     bool *OutUseUnscaledOp,
3798                                     unsigned *OutUnscaledOp,
3799                                     int64_t *EmittableOffset) {
3800   // Set output values in case of early exit.
3801   if (EmittableOffset)
3802     *EmittableOffset = 0;
3803   if (OutUseUnscaledOp)
3804     *OutUseUnscaledOp = false;
3805   if (OutUnscaledOp)
3806     *OutUnscaledOp = 0;
3807 
3808   // Exit early for structured vector spills/fills as they can't take an
3809   // immediate offset.
3810   switch (MI.getOpcode()) {
3811   default:
3812     break;
3813   case AArch64::LD1Twov2d:
3814   case AArch64::LD1Threev2d:
3815   case AArch64::LD1Fourv2d:
3816   case AArch64::LD1Twov1d:
3817   case AArch64::LD1Threev1d:
3818   case AArch64::LD1Fourv1d:
3819   case AArch64::ST1Twov2d:
3820   case AArch64::ST1Threev2d:
3821   case AArch64::ST1Fourv2d:
3822   case AArch64::ST1Twov1d:
3823   case AArch64::ST1Threev1d:
3824   case AArch64::ST1Fourv1d:
3825   case AArch64::IRG:
3826   case AArch64::IRGstack:
3827   case AArch64::STGloop:
3828   case AArch64::STZGloop:
3829     return AArch64FrameOffsetCannotUpdate;
3830   }
3831 
3832   // Get the min/max offset and the scale.
3833   TypeSize ScaleValue(0U, false);
3834   unsigned Width;
3835   int64_t MinOff, MaxOff;
3836   if (!AArch64InstrInfo::getMemOpInfo(MI.getOpcode(), ScaleValue, Width, MinOff,
3837                                       MaxOff))
3838     llvm_unreachable("unhandled opcode in isAArch64FrameOffsetLegal");
3839 
3840   // Construct the complete offset.
3841   bool IsMulVL = ScaleValue.isScalable();
3842   unsigned Scale = ScaleValue.getKnownMinSize();
3843   int64_t Offset = IsMulVL ? SOffset.getScalable() : SOffset.getFixed();
3844 
3845   const MachineOperand &ImmOpnd =
3846       MI.getOperand(AArch64InstrInfo::getLoadStoreImmIdx(MI.getOpcode()));
3847   Offset += ImmOpnd.getImm() * Scale;
3848 
3849   // If the offset doesn't match the scale, we rewrite the instruction to
3850   // use the unscaled instruction instead. Likewise, if we have a negative
3851   // offset and there is an unscaled op to use.
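  // For example, LDRXui scales its immediate by 8: a byte offset of 40 is
  // encoded as 5, while an offset of 33 must fall back to the unscaled LDURXi
  // form (when available) with an immediate of 33.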
3852   Optional<unsigned> UnscaledOp =
3853       AArch64InstrInfo::getUnscaledLdSt(MI.getOpcode());
3854   bool useUnscaledOp = UnscaledOp && (Offset % Scale || Offset < 0);
3855   if (useUnscaledOp &&
3856       !AArch64InstrInfo::getMemOpInfo(*UnscaledOp, ScaleValue, Width, MinOff,
3857                                       MaxOff))
3858     llvm_unreachable("unhandled opcode in isAArch64FrameOffsetLegal");
3859 
3860   Scale = ScaleValue.getKnownMinSize();
3861   assert(IsMulVL == ScaleValue.isScalable() &&
3862          "Unscaled opcode has different value for scalable");
3863 
3864   int64_t Remainder = Offset % Scale;
3865   assert(!(Remainder && useUnscaledOp) &&
3866          "Cannot have remainder when using unscaled op");
3867 
3868   assert(MinOff < MaxOff && "Unexpected Min/Max offsets");
3869   int64_t NewOffset = Offset / Scale;
3870   if (MinOff <= NewOffset && NewOffset <= MaxOff)
3871     Offset = Remainder;
3872   else {
3873     NewOffset = NewOffset < 0 ? MinOff : MaxOff;
3874     Offset = Offset - NewOffset * Scale + Remainder;
3875   }
3876 
3877   if (EmittableOffset)
3878     *EmittableOffset = NewOffset;
3879   if (OutUseUnscaledOp)
3880     *OutUseUnscaledOp = useUnscaledOp;
3881   if (OutUnscaledOp && UnscaledOp)
3882     *OutUnscaledOp = *UnscaledOp;
3883 
3884   if (IsMulVL)
3885     SOffset = StackOffset::get(SOffset.getFixed(), Offset);
3886   else
3887     SOffset = StackOffset::get(Offset, SOffset.getScalable());
3888   return AArch64FrameOffsetCanUpdate |
3889          (SOffset ? 0 : AArch64FrameOffsetIsLegal);
3890 }
3891 
3892 bool llvm::rewriteAArch64FrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
3893                                     unsigned FrameReg, StackOffset &Offset,
3894                                     const AArch64InstrInfo *TII) {
3895   unsigned Opcode = MI.getOpcode();
3896   unsigned ImmIdx = FrameRegIdx + 1;
3897 
3898   if (Opcode == AArch64::ADDSXri || Opcode == AArch64::ADDXri) {
3899     Offset += StackOffset::getFixed(MI.getOperand(ImmIdx).getImm());
3900     emitFrameOffset(*MI.getParent(), MI, MI.getDebugLoc(),
3901                     MI.getOperand(0).getReg(), FrameReg, Offset, TII,
3902                     MachineInstr::NoFlags, (Opcode == AArch64::ADDSXri));
3903     MI.eraseFromParent();
3904     Offset = StackOffset();
3905     return true;
3906   }
3907 
3908   int64_t NewOffset;
3909   unsigned UnscaledOp;
3910   bool UseUnscaledOp;
3911   int Status = isAArch64FrameOffsetLegal(MI, Offset, &UseUnscaledOp,
3912                                          &UnscaledOp, &NewOffset);
3913   if (Status & AArch64FrameOffsetCanUpdate) {
3914     if (Status & AArch64FrameOffsetIsLegal)
3915       // Replace the FrameIndex with FrameReg.
3916       MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false);
3917     if (UseUnscaledOp)
3918       MI.setDesc(TII->get(UnscaledOp));
3919 
3920     MI.getOperand(ImmIdx).ChangeToImmediate(NewOffset);
3921     return !Offset;
3922   }
3923 
3924   return false;
3925 }
3926 
3927 void AArch64InstrInfo::getNoop(MCInst &NopInst) const {
3928   NopInst.setOpcode(AArch64::HINT);
3929   NopInst.addOperand(MCOperand::createImm(0));
3930 }
3931 
3932 // AArch64 supports MachineCombiner.
3933 bool AArch64InstrInfo::useMachineCombiner() const { return true; }
3934 
3935 // True when Opc sets flag
3936 static bool isCombineInstrSettingFlag(unsigned Opc) {
3937   switch (Opc) {
3938   case AArch64::ADDSWrr:
3939   case AArch64::ADDSWri:
3940   case AArch64::ADDSXrr:
3941   case AArch64::ADDSXri:
3942   case AArch64::SUBSWrr:
3943   case AArch64::SUBSXrr:
3944   // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi.
3945   case AArch64::SUBSWri:
3946   case AArch64::SUBSXri:
3947     return true;
3948   default:
3949     break;
3950   }
3951   return false;
3952 }
3953 
3954 // 32b Opcodes that can be combined with a MUL
3955 static bool isCombineInstrCandidate32(unsigned Opc) {
3956   switch (Opc) {
3957   case AArch64::ADDWrr:
3958   case AArch64::ADDWri:
3959   case AArch64::SUBWrr:
3960   case AArch64::ADDSWrr:
3961   case AArch64::ADDSWri:
3962   case AArch64::SUBSWrr:
3963   // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi.
3964   case AArch64::SUBWri:
3965   case AArch64::SUBSWri:
3966     return true;
3967   default:
3968     break;
3969   }
3970   return false;
3971 }
3972 
3973 // 64b Opcodes that can be combined with a MUL
3974 static bool isCombineInstrCandidate64(unsigned Opc) {
3975   switch (Opc) {
3976   case AArch64::ADDXrr:
3977   case AArch64::ADDXri:
3978   case AArch64::SUBXrr:
3979   case AArch64::ADDSXrr:
3980   case AArch64::ADDSXri:
3981   case AArch64::SUBSXrr:
3982   // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi.
3983   case AArch64::SUBXri:
3984   case AArch64::SUBSXri:
3985   case AArch64::ADDv8i8:
3986   case AArch64::ADDv16i8:
3987   case AArch64::ADDv4i16:
3988   case AArch64::ADDv8i16:
3989   case AArch64::ADDv2i32:
3990   case AArch64::ADDv4i32:
3991   case AArch64::SUBv8i8:
3992   case AArch64::SUBv16i8:
3993   case AArch64::SUBv4i16:
3994   case AArch64::SUBv8i16:
3995   case AArch64::SUBv2i32:
3996   case AArch64::SUBv4i32:
3997     return true;
3998   default:
3999     break;
4000   }
4001   return false;
4002 }
4003 
4004 // FP Opcodes that can be combined with a FMUL.
4005 static bool isCombineInstrCandidateFP(const MachineInstr &Inst) {
4006   switch (Inst.getOpcode()) {
4007   default:
4008     break;
4009   case AArch64::FADDHrr:
4010   case AArch64::FADDSrr:
4011   case AArch64::FADDDrr:
4012   case AArch64::FADDv4f16:
4013   case AArch64::FADDv8f16:
4014   case AArch64::FADDv2f32:
4015   case AArch64::FADDv2f64:
4016   case AArch64::FADDv4f32:
4017   case AArch64::FSUBHrr:
4018   case AArch64::FSUBSrr:
4019   case AArch64::FSUBDrr:
4020   case AArch64::FSUBv4f16:
4021   case AArch64::FSUBv8f16:
4022   case AArch64::FSUBv2f32:
4023   case AArch64::FSUBv2f64:
4024   case AArch64::FSUBv4f32:
4025     TargetOptions Options = Inst.getParent()->getParent()->getTarget().Options;
4026     // We can fuse FADD/FSUB with FMUL, if fusion is either allowed globally by
4027     // the target options or if FADD/FSUB has the contract fast-math flag.
4028     return Options.UnsafeFPMath ||
4029            Options.AllowFPOpFusion == FPOpFusion::Fast ||
4030            Inst.getFlag(MachineInstr::FmContract);
4032   }
4033   return false;
4034 }
4035 
4036 // Opcodes that can be combined with a MUL
4037 static bool isCombineInstrCandidate(unsigned Opc) {
4038   return (isCombineInstrCandidate32(Opc) || isCombineInstrCandidate64(Opc));
4039 }
4040 
4041 //
4042 // Utility routine that checks if \param MO is defined by an
4043 // \param CombineOpc instruction in the basic block \param MBB
4044 static bool canCombine(MachineBasicBlock &MBB, MachineOperand &MO,
4045                        unsigned CombineOpc, unsigned ZeroReg = 0,
4046                        bool CheckZeroReg = false) {
4047   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
4048   MachineInstr *MI = nullptr;
4049 
4050   if (MO.isReg() && Register::isVirtualRegister(MO.getReg()))
4051     MI = MRI.getUniqueVRegDef(MO.getReg());
4052   // And it needs to be in the trace (otherwise, it won't have a depth).
4053   if (!MI || MI->getParent() != &MBB || (unsigned)MI->getOpcode() != CombineOpc)
4054     return false;
4055   // Must only be used by the user we combine with.
4056   if (!MRI.hasOneNonDBGUse(MI->getOperand(0).getReg()))
4057     return false;
4058 
4059   if (CheckZeroReg) {
4060     assert(MI->getNumOperands() >= 4 && MI->getOperand(0).isReg() &&
4061            MI->getOperand(1).isReg() && MI->getOperand(2).isReg() &&
4062            MI->getOperand(3).isReg() && "MAdd/MSub must have at least 4 regs");
4063     // The third input reg must be zero.
4064     if (MI->getOperand(3).getReg() != ZeroReg)
4065       return false;
4066   }
4067 
4068   return true;
4069 }
4070 
4071 //
4072 // Is \param MO defined by an integer multiply and can be combined?
4073 static bool canCombineWithMUL(MachineBasicBlock &MBB, MachineOperand &MO,
4074                               unsigned MulOpc, unsigned ZeroReg) {
4075   return canCombine(MBB, MO, MulOpc, ZeroReg, true);
4076 }
4077 
4078 //
4079 // Is \param MO defined by a floating-point multiply and can be combined?
4080 static bool canCombineWithFMUL(MachineBasicBlock &MBB, MachineOperand &MO,
4081                                unsigned MulOpc) {
4082   return canCombine(MBB, MO, MulOpc);
4083 }
4084 
4085 // TODO: There are many more machine instruction opcodes to match:
4086 //       1. Other data types (integer, vectors)
4087 //       2. Other math / logic operations (xor, or)
4088 //       3. Other forms of the same operation (intrinsics and other variants)
4089 bool AArch64InstrInfo::isAssociativeAndCommutative(
4090     const MachineInstr &Inst) const {
4091   switch (Inst.getOpcode()) {
4092   case AArch64::FADDDrr:
4093   case AArch64::FADDSrr:
4094   case AArch64::FADDv2f32:
4095   case AArch64::FADDv2f64:
4096   case AArch64::FADDv4f32:
4097   case AArch64::FMULDrr:
4098   case AArch64::FMULSrr:
4099   case AArch64::FMULX32:
4100   case AArch64::FMULX64:
4101   case AArch64::FMULXv2f32:
4102   case AArch64::FMULXv2f64:
4103   case AArch64::FMULXv4f32:
4104   case AArch64::FMULv2f32:
4105   case AArch64::FMULv2f64:
4106   case AArch64::FMULv4f32:
4107     return Inst.getParent()->getParent()->getTarget().Options.UnsafeFPMath;
4108   default:
4109     return false;
4110   }
4111 }
4112 
4113 /// Find instructions that can be turned into madd.
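/// For example, "mul w8, w0, w1" followed by "add w2, w8, w3" can be
/// rewritten as "madd w2, w0, w1, w3".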
4114 static bool getMaddPatterns(MachineInstr &Root,
4115                             SmallVectorImpl<MachineCombinerPattern> &Patterns) {
4116   unsigned Opc = Root.getOpcode();
4117   MachineBasicBlock &MBB = *Root.getParent();
4118   bool Found = false;
4119 
4120   if (!isCombineInstrCandidate(Opc))
4121     return false;
4122   if (isCombineInstrSettingFlag(Opc)) {
4123     int Cmp_NZCV = Root.findRegisterDefOperandIdx(AArch64::NZCV, true);
4124     // When NZCV is live, bail out.
4125     if (Cmp_NZCV == -1)
4126       return false;
4127     unsigned NewOpc = convertToNonFlagSettingOpc(Root);
4128     // When the opcode can't change, bail out.
4129     // CHECKME: do we miss any cases for opcode conversion?
4130     if (NewOpc == Opc)
4131       return false;
4132     Opc = NewOpc;
4133   }
4134 
4135   auto setFound = [&](int Opcode, int Operand, unsigned ZeroReg,
4136                       MachineCombinerPattern Pattern) {
4137     if (canCombineWithMUL(MBB, Root.getOperand(Operand), Opcode, ZeroReg)) {
4138       Patterns.push_back(Pattern);
4139       Found = true;
4140     }
4141   };
4142 
4143   auto setVFound = [&](int Opcode, int Operand, MachineCombinerPattern Pattern) {
4144     if (canCombine(MBB, Root.getOperand(Operand), Opcode)) {
4145       Patterns.push_back(Pattern);
4146       Found = true;
4147     }
4148   };
4149 
4150   typedef MachineCombinerPattern MCP;
4151 
4152   switch (Opc) {
4153   default:
4154     break;
4155   case AArch64::ADDWrr:
4156     assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() &&
4157            "ADDWrr does not have register operands");
4158     setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULADDW_OP1);
4159     setFound(AArch64::MADDWrrr, 2, AArch64::WZR, MCP::MULADDW_OP2);
4160     break;
4161   case AArch64::ADDXrr:
4162     setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULADDX_OP1);
4163     setFound(AArch64::MADDXrrr, 2, AArch64::XZR, MCP::MULADDX_OP2);
4164     break;
4165   case AArch64::SUBWrr:
4166     setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULSUBW_OP1);
4167     setFound(AArch64::MADDWrrr, 2, AArch64::WZR, MCP::MULSUBW_OP2);
4168     break;
4169   case AArch64::SUBXrr:
4170     setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULSUBX_OP1);
4171     setFound(AArch64::MADDXrrr, 2, AArch64::XZR, MCP::MULSUBX_OP2);
4172     break;
4173   case AArch64::ADDWri:
4174     setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULADDWI_OP1);
4175     break;
4176   case AArch64::ADDXri:
4177     setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULADDXI_OP1);
4178     break;
4179   case AArch64::SUBWri:
4180     setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULSUBWI_OP1);
4181     break;
4182   case AArch64::SUBXri:
4183     setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULSUBXI_OP1);
4184     break;
4185   case AArch64::ADDv8i8:
4186     setVFound(AArch64::MULv8i8, 1, MCP::MULADDv8i8_OP1);
4187     setVFound(AArch64::MULv8i8, 2, MCP::MULADDv8i8_OP2);
4188     break;
4189   case AArch64::ADDv16i8:
4190     setVFound(AArch64::MULv16i8, 1, MCP::MULADDv16i8_OP1);
4191     setVFound(AArch64::MULv16i8, 2, MCP::MULADDv16i8_OP2);
4192     break;
4193   case AArch64::ADDv4i16:
4194     setVFound(AArch64::MULv4i16, 1, MCP::MULADDv4i16_OP1);
4195     setVFound(AArch64::MULv4i16, 2, MCP::MULADDv4i16_OP2);
4196     setVFound(AArch64::MULv4i16_indexed, 1, MCP::MULADDv4i16_indexed_OP1);
4197     setVFound(AArch64::MULv4i16_indexed, 2, MCP::MULADDv4i16_indexed_OP2);
4198     break;
4199   case AArch64::ADDv8i16:
4200     setVFound(AArch64::MULv8i16, 1, MCP::MULADDv8i16_OP1);
4201     setVFound(AArch64::MULv8i16, 2, MCP::MULADDv8i16_OP2);
4202     setVFound(AArch64::MULv8i16_indexed, 1, MCP::MULADDv8i16_indexed_OP1);
4203     setVFound(AArch64::MULv8i16_indexed, 2, MCP::MULADDv8i16_indexed_OP2);
4204     break;
4205   case AArch64::ADDv2i32:
4206     setVFound(AArch64::MULv2i32, 1, MCP::MULADDv2i32_OP1);
4207     setVFound(AArch64::MULv2i32, 2, MCP::MULADDv2i32_OP2);
4208     setVFound(AArch64::MULv2i32_indexed, 1, MCP::MULADDv2i32_indexed_OP1);
4209     setVFound(AArch64::MULv2i32_indexed, 2, MCP::MULADDv2i32_indexed_OP2);
4210     break;
4211   case AArch64::ADDv4i32:
4212     setVFound(AArch64::MULv4i32, 1, MCP::MULADDv4i32_OP1);
4213     setVFound(AArch64::MULv4i32, 2, MCP::MULADDv4i32_OP2);
4214     setVFound(AArch64::MULv4i32_indexed, 1, MCP::MULADDv4i32_indexed_OP1);
4215     setVFound(AArch64::MULv4i32_indexed, 2, MCP::MULADDv4i32_indexed_OP2);
4216     break;
4217   case AArch64::SUBv8i8:
4218     setVFound(AArch64::MULv8i8, 1, MCP::MULSUBv8i8_OP1);
4219     setVFound(AArch64::MULv8i8, 2, MCP::MULSUBv8i8_OP2);
4220     break;
4221   case AArch64::SUBv16i8:
4222     setVFound(AArch64::MULv16i8, 1, MCP::MULSUBv16i8_OP1);
4223     setVFound(AArch64::MULv16i8, 2, MCP::MULSUBv16i8_OP2);
4224     break;
4225   case AArch64::SUBv4i16:
4226     setVFound(AArch64::MULv4i16, 1, MCP::MULSUBv4i16_OP1);
4227     setVFound(AArch64::MULv4i16, 2, MCP::MULSUBv4i16_OP2);
4228     setVFound(AArch64::MULv4i16_indexed, 1, MCP::MULSUBv4i16_indexed_OP1);
4229     setVFound(AArch64::MULv4i16_indexed, 2, MCP::MULSUBv4i16_indexed_OP2);
4230     break;
4231   case AArch64::SUBv8i16:
4232     setVFound(AArch64::MULv8i16, 1, MCP::MULSUBv8i16_OP1);
4233     setVFound(AArch64::MULv8i16, 2, MCP::MULSUBv8i16_OP2);
4234     setVFound(AArch64::MULv8i16_indexed, 1, MCP::MULSUBv8i16_indexed_OP1);
4235     setVFound(AArch64::MULv8i16_indexed, 2, MCP::MULSUBv8i16_indexed_OP2);
4236     break;
4237   case AArch64::SUBv2i32:
4238     setVFound(AArch64::MULv2i32, 1, MCP::MULSUBv2i32_OP1);
4239     setVFound(AArch64::MULv2i32, 2, MCP::MULSUBv2i32_OP2);
4240     setVFound(AArch64::MULv2i32_indexed, 1, MCP::MULSUBv2i32_indexed_OP1);
4241     setVFound(AArch64::MULv2i32_indexed, 2, MCP::MULSUBv2i32_indexed_OP2);
4242     break;
4243   case AArch64::SUBv4i32:
4244     setVFound(AArch64::MULv4i32, 1, MCP::MULSUBv4i32_OP1);
4245     setVFound(AArch64::MULv4i32, 2, MCP::MULSUBv4i32_OP2);
4246     setVFound(AArch64::MULv4i32_indexed, 1, MCP::MULSUBv4i32_indexed_OP1);
4247     setVFound(AArch64::MULv4i32_indexed, 2, MCP::MULSUBv4i32_indexed_OP2);
4248     break;
4249   }
4250   return Found;
4251 }
4252 /// Floating-Point Support
4253 
4254 /// Find instructions that can be combined into a fused multiply-add/sub (FMA).
4255 static bool getFMAPatterns(MachineInstr &Root,
4256                            SmallVectorImpl<MachineCombinerPattern> &Patterns) {
4257 
4258   if (!isCombineInstrCandidateFP(Root))
4259     return false;
4260 
4261   MachineBasicBlock &MBB = *Root.getParent();
4262   bool Found = false;
4263 
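  // Match records Pattern and returns true when the given operand of Root is
  // defined in this block by a combinable FMUL with the given opcode.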
4264   auto Match = [&](int Opcode, int Operand,
4265                    MachineCombinerPattern Pattern) -> bool {
4266     if (canCombineWithFMUL(MBB, Root.getOperand(Operand), Opcode)) {
4267       Patterns.push_back(Pattern);
4268       return true;
4269     }
4270     return false;
4271   };
4272 
4273   typedef MachineCombinerPattern MCP;
4274 
4275   switch (Root.getOpcode()) {
4276   default:
4277     assert(false && "Unsupported FP instruction in combiner\n");
4278     break;
4279   case AArch64::FADDHrr:
4280     assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() &&
4281            "FADDHrr does not have register operands");
4282 
4283     Found  = Match(AArch64::FMULHrr, 1, MCP::FMULADDH_OP1);
4284     Found |= Match(AArch64::FMULHrr, 2, MCP::FMULADDH_OP2);
4285     break;
4286   case AArch64::FADDSrr:
4287     assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() &&
4288            "FADDSrr does not have register operands");
4289 
4290     Found |= Match(AArch64::FMULSrr, 1, MCP::FMULADDS_OP1) ||
4291              Match(AArch64::FMULv1i32_indexed, 1, MCP::FMLAv1i32_indexed_OP1);
4292 
4293     Found |= Match(AArch64::FMULSrr, 2, MCP::FMULADDS_OP2) ||
4294              Match(AArch64::FMULv1i32_indexed, 2, MCP::FMLAv1i32_indexed_OP2);
4295     break;
4296   case AArch64::FADDDrr:
4297     Found |= Match(AArch64::FMULDrr, 1, MCP::FMULADDD_OP1) ||
4298              Match(AArch64::FMULv1i64_indexed, 1, MCP::FMLAv1i64_indexed_OP1);
4299 
4300     Found |= Match(AArch64::FMULDrr, 2, MCP::FMULADDD_OP2) ||
4301              Match(AArch64::FMULv1i64_indexed, 2, MCP::FMLAv1i64_indexed_OP2);
4302     break;
4303   case AArch64::FADDv4f16:
4304     Found |= Match(AArch64::FMULv4i16_indexed, 1, MCP::FMLAv4i16_indexed_OP1) ||
4305              Match(AArch64::FMULv4f16, 1, MCP::FMLAv4f16_OP1);
4306 
4307     Found |= Match(AArch64::FMULv4i16_indexed, 2, MCP::FMLAv4i16_indexed_OP2) ||
4308              Match(AArch64::FMULv4f16, 2, MCP::FMLAv4f16_OP2);
4309     break;
4310   case AArch64::FADDv8f16:
4311     Found |= Match(AArch64::FMULv8i16_indexed, 1, MCP::FMLAv8i16_indexed_OP1) ||
4312              Match(AArch64::FMULv8f16, 1, MCP::FMLAv8f16_OP1);
4313 
4314     Found |= Match(AArch64::FMULv8i16_indexed, 2, MCP::FMLAv8i16_indexed_OP2) ||
4315              Match(AArch64::FMULv8f16, 2, MCP::FMLAv8f16_OP2);
4316     break;
4317   case AArch64::FADDv2f32:
4318     Found |= Match(AArch64::FMULv2i32_indexed, 1, MCP::FMLAv2i32_indexed_OP1) ||
4319              Match(AArch64::FMULv2f32, 1, MCP::FMLAv2f32_OP1);
4320 
4321     Found |= Match(AArch64::FMULv2i32_indexed, 2, MCP::FMLAv2i32_indexed_OP2) ||
4322              Match(AArch64::FMULv2f32, 2, MCP::FMLAv2f32_OP2);
4323     break;
4324   case AArch64::FADDv2f64:
4325     Found |= Match(AArch64::FMULv2i64_indexed, 1, MCP::FMLAv2i64_indexed_OP1) ||
4326              Match(AArch64::FMULv2f64, 1, MCP::FMLAv2f64_OP1);
4327 
4328     Found |= Match(AArch64::FMULv2i64_indexed, 2, MCP::FMLAv2i64_indexed_OP2) ||
4329              Match(AArch64::FMULv2f64, 2, MCP::FMLAv2f64_OP2);
4330     break;
4331   case AArch64::FADDv4f32:
4332     Found |= Match(AArch64::FMULv4i32_indexed, 1, MCP::FMLAv4i32_indexed_OP1) ||
4333              Match(AArch64::FMULv4f32, 1, MCP::FMLAv4f32_OP1);
4334 
4335     Found |= Match(AArch64::FMULv4i32_indexed, 2, MCP::FMLAv4i32_indexed_OP2) ||
4336              Match(AArch64::FMULv4f32, 2, MCP::FMLAv4f32_OP2);
4337     break;
4338   case AArch64::FSUBHrr:
4339     Found  = Match(AArch64::FMULHrr, 1, MCP::FMULSUBH_OP1);
4340     Found |= Match(AArch64::FMULHrr, 2, MCP::FMULSUBH_OP2);
4341     Found |= Match(AArch64::FNMULHrr, 1, MCP::FNMULSUBH_OP1);
4342     break;
4343   case AArch64::FSUBSrr:
4344     Found = Match(AArch64::FMULSrr, 1, MCP::FMULSUBS_OP1);
4345 
4346     Found |= Match(AArch64::FMULSrr, 2, MCP::FMULSUBS_OP2) ||
4347              Match(AArch64::FMULv1i32_indexed, 2, MCP::FMLSv1i32_indexed_OP2);
4348 
4349     Found |= Match(AArch64::FNMULSrr, 1, MCP::FNMULSUBS_OP1);
4350     break;
4351   case AArch64::FSUBDrr:
4352     Found = Match(AArch64::FMULDrr, 1, MCP::FMULSUBD_OP1);
4353 
4354     Found |= Match(AArch64::FMULDrr, 2, MCP::FMULSUBD_OP2) ||
4355              Match(AArch64::FMULv1i64_indexed, 2, MCP::FMLSv1i64_indexed_OP2);
4356 
4357     Found |= Match(AArch64::FNMULDrr, 1, MCP::FNMULSUBD_OP1);
4358     break;
4359   case AArch64::FSUBv4f16:
4360     Found |= Match(AArch64::FMULv4i16_indexed, 2, MCP::FMLSv4i16_indexed_OP2) ||
4361              Match(AArch64::FMULv4f16, 2, MCP::FMLSv4f16_OP2);
4362 
4363     Found |= Match(AArch64::FMULv4i16_indexed, 1, MCP::FMLSv4i16_indexed_OP1) ||
4364              Match(AArch64::FMULv4f16, 1, MCP::FMLSv4f16_OP1);
4365     break;
4366   case AArch64::FSUBv8f16:
4367     Found |= Match(AArch64::FMULv8i16_indexed, 2, MCP::FMLSv8i16_indexed_OP2) ||
4368              Match(AArch64::FMULv8f16, 2, MCP::FMLSv8f16_OP2);
4369 
4370     Found |= Match(AArch64::FMULv8i16_indexed, 1, MCP::FMLSv8i16_indexed_OP1) ||
4371              Match(AArch64::FMULv8f16, 1, MCP::FMLSv8f16_OP1);
4372     break;
4373   case AArch64::FSUBv2f32:
4374     Found |= Match(AArch64::FMULv2i32_indexed, 2, MCP::FMLSv2i32_indexed_OP2) ||
4375              Match(AArch64::FMULv2f32, 2, MCP::FMLSv2f32_OP2);
4376 
4377     Found |= Match(AArch64::FMULv2i32_indexed, 1, MCP::FMLSv2i32_indexed_OP1) ||
4378              Match(AArch64::FMULv2f32, 1, MCP::FMLSv2f32_OP1);
4379     break;
4380   case AArch64::FSUBv2f64:
4381     Found |= Match(AArch64::FMULv2i64_indexed, 2, MCP::FMLSv2i64_indexed_OP2) ||
4382              Match(AArch64::FMULv2f64, 2, MCP::FMLSv2f64_OP2);
4383 
4384     Found |= Match(AArch64::FMULv2i64_indexed, 1, MCP::FMLSv2i64_indexed_OP1) ||
4385              Match(AArch64::FMULv2f64, 1, MCP::FMLSv2f64_OP1);
4386     break;
4387   case AArch64::FSUBv4f32:
4388     Found |= Match(AArch64::FMULv4i32_indexed, 2, MCP::FMLSv4i32_indexed_OP2) ||
4389              Match(AArch64::FMULv4f32, 2, MCP::FMLSv4f32_OP2);
4390 
4391     Found |= Match(AArch64::FMULv4i32_indexed, 1, MCP::FMLSv4i32_indexed_OP1) ||
4392              Match(AArch64::FMULv4f32, 1, MCP::FMLSv4f32_OP1);
4393     break;
4394   }
4395   return Found;
4396 }
4397 
4398 /// Return true when a code sequence can improve throughput. It
4399 /// should be called only for instructions in loops.
4400 /// \param Pattern - combiner pattern
4401 bool AArch64InstrInfo::isThroughputPattern(
4402     MachineCombinerPattern Pattern) const {
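  // Every multiply-accumulate fusion below replaces two instructions with one,
  // so it can improve throughput in a loop even when it does not shorten the
  // critical path.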
4403   switch (Pattern) {
4404   default:
4405     break;
4406   case MachineCombinerPattern::FMULADDH_OP1:
4407   case MachineCombinerPattern::FMULADDH_OP2:
4408   case MachineCombinerPattern::FMULSUBH_OP1:
4409   case MachineCombinerPattern::FMULSUBH_OP2:
4410   case MachineCombinerPattern::FMULADDS_OP1:
4411   case MachineCombinerPattern::FMULADDS_OP2:
4412   case MachineCombinerPattern::FMULSUBS_OP1:
4413   case MachineCombinerPattern::FMULSUBS_OP2:
4414   case MachineCombinerPattern::FMULADDD_OP1:
4415   case MachineCombinerPattern::FMULADDD_OP2:
4416   case MachineCombinerPattern::FMULSUBD_OP1:
4417   case MachineCombinerPattern::FMULSUBD_OP2:
4418   case MachineCombinerPattern::FNMULSUBH_OP1:
4419   case MachineCombinerPattern::FNMULSUBS_OP1:
4420   case MachineCombinerPattern::FNMULSUBD_OP1:
4421   case MachineCombinerPattern::FMLAv4i16_indexed_OP1:
4422   case MachineCombinerPattern::FMLAv4i16_indexed_OP2:
4423   case MachineCombinerPattern::FMLAv8i16_indexed_OP1:
4424   case MachineCombinerPattern::FMLAv8i16_indexed_OP2:
4425   case MachineCombinerPattern::FMLAv1i32_indexed_OP1:
4426   case MachineCombinerPattern::FMLAv1i32_indexed_OP2:
4427   case MachineCombinerPattern::FMLAv1i64_indexed_OP1:
4428   case MachineCombinerPattern::FMLAv1i64_indexed_OP2:
4429   case MachineCombinerPattern::FMLAv4f16_OP2:
4430   case MachineCombinerPattern::FMLAv4f16_OP1:
4431   case MachineCombinerPattern::FMLAv8f16_OP1:
4432   case MachineCombinerPattern::FMLAv8f16_OP2:
4433   case MachineCombinerPattern::FMLAv2f32_OP2:
4434   case MachineCombinerPattern::FMLAv2f32_OP1:
4435   case MachineCombinerPattern::FMLAv2f64_OP1:
4436   case MachineCombinerPattern::FMLAv2f64_OP2:
4437   case MachineCombinerPattern::FMLAv2i32_indexed_OP1:
4438   case MachineCombinerPattern::FMLAv2i32_indexed_OP2:
4439   case MachineCombinerPattern::FMLAv2i64_indexed_OP1:
4440   case MachineCombinerPattern::FMLAv2i64_indexed_OP2:
4441   case MachineCombinerPattern::FMLAv4f32_OP1:
4442   case MachineCombinerPattern::FMLAv4f32_OP2:
4443   case MachineCombinerPattern::FMLAv4i32_indexed_OP1:
4444   case MachineCombinerPattern::FMLAv4i32_indexed_OP2:
4445   case MachineCombinerPattern::FMLSv4i16_indexed_OP1:
4446   case MachineCombinerPattern::FMLSv4i16_indexed_OP2:
4447   case MachineCombinerPattern::FMLSv8i16_indexed_OP1:
4448   case MachineCombinerPattern::FMLSv8i16_indexed_OP2:
4449   case MachineCombinerPattern::FMLSv1i32_indexed_OP2:
4450   case MachineCombinerPattern::FMLSv1i64_indexed_OP2:
4451   case MachineCombinerPattern::FMLSv2i32_indexed_OP2:
4452   case MachineCombinerPattern::FMLSv2i64_indexed_OP2:
4453   case MachineCombinerPattern::FMLSv4f16_OP1:
4454   case MachineCombinerPattern::FMLSv4f16_OP2:
4455   case MachineCombinerPattern::FMLSv8f16_OP1:
4456   case MachineCombinerPattern::FMLSv8f16_OP2:
4457   case MachineCombinerPattern::FMLSv2f32_OP2:
4458   case MachineCombinerPattern::FMLSv2f64_OP2:
4459   case MachineCombinerPattern::FMLSv4i32_indexed_OP2:
4460   case MachineCombinerPattern::FMLSv4f32_OP2:
4461   case MachineCombinerPattern::MULADDv8i8_OP1:
4462   case MachineCombinerPattern::MULADDv8i8_OP2:
4463   case MachineCombinerPattern::MULADDv16i8_OP1:
4464   case MachineCombinerPattern::MULADDv16i8_OP2:
4465   case MachineCombinerPattern::MULADDv4i16_OP1:
4466   case MachineCombinerPattern::MULADDv4i16_OP2:
4467   case MachineCombinerPattern::MULADDv8i16_OP1:
4468   case MachineCombinerPattern::MULADDv8i16_OP2:
4469   case MachineCombinerPattern::MULADDv2i32_OP1:
4470   case MachineCombinerPattern::MULADDv2i32_OP2:
4471   case MachineCombinerPattern::MULADDv4i32_OP1:
4472   case MachineCombinerPattern::MULADDv4i32_OP2:
4473   case MachineCombinerPattern::MULSUBv8i8_OP1:
4474   case MachineCombinerPattern::MULSUBv8i8_OP2:
4475   case MachineCombinerPattern::MULSUBv16i8_OP1:
4476   case MachineCombinerPattern::MULSUBv16i8_OP2:
4477   case MachineCombinerPattern::MULSUBv4i16_OP1:
4478   case MachineCombinerPattern::MULSUBv4i16_OP2:
4479   case MachineCombinerPattern::MULSUBv8i16_OP1:
4480   case MachineCombinerPattern::MULSUBv8i16_OP2:
4481   case MachineCombinerPattern::MULSUBv2i32_OP1:
4482   case MachineCombinerPattern::MULSUBv2i32_OP2:
4483   case MachineCombinerPattern::MULSUBv4i32_OP1:
4484   case MachineCombinerPattern::MULSUBv4i32_OP2:
4485   case MachineCombinerPattern::MULADDv4i16_indexed_OP1:
4486   case MachineCombinerPattern::MULADDv4i16_indexed_OP2:
4487   case MachineCombinerPattern::MULADDv8i16_indexed_OP1:
4488   case MachineCombinerPattern::MULADDv8i16_indexed_OP2:
4489   case MachineCombinerPattern::MULADDv2i32_indexed_OP1:
4490   case MachineCombinerPattern::MULADDv2i32_indexed_OP2:
4491   case MachineCombinerPattern::MULADDv4i32_indexed_OP1:
4492   case MachineCombinerPattern::MULADDv4i32_indexed_OP2:
4493   case MachineCombinerPattern::MULSUBv4i16_indexed_OP1:
4494   case MachineCombinerPattern::MULSUBv4i16_indexed_OP2:
4495   case MachineCombinerPattern::MULSUBv8i16_indexed_OP1:
4496   case MachineCombinerPattern::MULSUBv8i16_indexed_OP2:
4497   case MachineCombinerPattern::MULSUBv2i32_indexed_OP1:
4498   case MachineCombinerPattern::MULSUBv2i32_indexed_OP2:
4499   case MachineCombinerPattern::MULSUBv4i32_indexed_OP1:
4500   case MachineCombinerPattern::MULSUBv4i32_indexed_OP2:
4501     return true;
4502   } // end switch (Pattern)
4503   return false;
4504 }
4505 /// Return true when there is potentially a faster code sequence for an
4506 /// instruction chain ending in \p Root. All potential patterns are listed in
4507 /// the \p Patterns vector. Patterns should be sorted in priority order since the
4508 /// pattern evaluator stops checking as soon as it finds a faster sequence.
4509 
4510 bool AArch64InstrInfo::getMachineCombinerPatterns(
4511     MachineInstr &Root,
4512     SmallVectorImpl<MachineCombinerPattern> &Patterns) const {
4513   // Integer patterns
4514   if (getMaddPatterns(Root, Patterns))
4515     return true;
4516   // Floating point patterns
4517   if (getFMAPatterns(Root, Patterns))
4518     return true;
4519 
4520   return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns);
4521 }
4522 
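// Kind of fused-multiply instruction to emit: Default is the three-source
// MADD/FMADD form (multiplicands first, addend last), Indexed is the
// by-element vector form that carries a lane immediate, and Accumulator is
// the plain vector MLA/FMLA form whose addend comes first.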
4523 enum class FMAInstKind { Default, Indexed, Accumulator };
4524 /// genFusedMultiply - Generate fused multiply instructions.
4525 /// This function supports both integer and floating point instructions.
4526 /// A typical example:
4527 ///  F|MUL I=A,B,0
4528 ///  F|ADD R,I,C
4529 ///  ==> F|MADD R,A,B,C
4530 /// \param MF Containing MachineFunction
4531 /// \param MRI Register information
4532 /// \param TII Target information
4533 /// \param Root is the F|ADD instruction
4534 /// \param [out] InsInstrs is a vector of machine instructions and will
4535 /// contain the generated madd instruction
4536 /// \param IdxMulOpd is index of operand in Root that is the result of
4537 /// the F|MUL. In the example above IdxMulOpd is 1.
4538 /// \param MaddOpc the opcode of the f|madd instruction
4539 /// \param RC Register class of operands
4540 /// \param kind Kind of fma instruction (addressing mode) to be generated
4541 /// \param ReplacedAddend is the result register from the instruction
4542 /// replacing the non-combined operand, if any.
4543 static MachineInstr *
4544 genFusedMultiply(MachineFunction &MF, MachineRegisterInfo &MRI,
4545                  const TargetInstrInfo *TII, MachineInstr &Root,
4546                  SmallVectorImpl<MachineInstr *> &InsInstrs, unsigned IdxMulOpd,
4547                  unsigned MaddOpc, const TargetRegisterClass *RC,
4548                  FMAInstKind kind = FMAInstKind::Default,
4549                  const Register *ReplacedAddend = nullptr) {
4550   assert(IdxMulOpd == 1 || IdxMulOpd == 2);
4551 
4552   unsigned IdxOtherOpd = IdxMulOpd == 1 ? 2 : 1;
4553   MachineInstr *MUL = MRI.getUniqueVRegDef(Root.getOperand(IdxMulOpd).getReg());
4554   Register ResultReg = Root.getOperand(0).getReg();
4555   Register SrcReg0 = MUL->getOperand(1).getReg();
4556   bool Src0IsKill = MUL->getOperand(1).isKill();
4557   Register SrcReg1 = MUL->getOperand(2).getReg();
4558   bool Src1IsKill = MUL->getOperand(2).isKill();
4559 
4560   unsigned SrcReg2;
4561   bool Src2IsKill;
4562   if (ReplacedAddend) {
4563     // If we just generated a new addend, we must be its only use.
4564     SrcReg2 = *ReplacedAddend;
4565     Src2IsKill = true;
4566   } else {
4567     SrcReg2 = Root.getOperand(IdxOtherOpd).getReg();
4568     Src2IsKill = Root.getOperand(IdxOtherOpd).isKill();
4569   }
4570 
4571   if (Register::isVirtualRegister(ResultReg))
4572     MRI.constrainRegClass(ResultReg, RC);
4573   if (Register::isVirtualRegister(SrcReg0))
4574     MRI.constrainRegClass(SrcReg0, RC);
4575   if (Register::isVirtualRegister(SrcReg1))
4576     MRI.constrainRegClass(SrcReg1, RC);
4577   if (Register::isVirtualRegister(SrcReg2))
4578     MRI.constrainRegClass(SrcReg2, RC);
4579 
4580   MachineInstrBuilder MIB;
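  // Operand order depends on the opcode family: Default (MADD/FMADD) takes the
  // two multiplicands followed by the addend; Indexed and Accumulator
  // (FMLA/MLA) take the addend first, and the Indexed form also copies the
  // lane immediate from the original MUL.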
4581   if (kind == FMAInstKind::Default)
4582     MIB = BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc), ResultReg)
4583               .addReg(SrcReg0, getKillRegState(Src0IsKill))
4584               .addReg(SrcReg1, getKillRegState(Src1IsKill))
4585               .addReg(SrcReg2, getKillRegState(Src2IsKill));
4586   else if (kind == FMAInstKind::Indexed)
4587     MIB = BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc), ResultReg)
4588               .addReg(SrcReg2, getKillRegState(Src2IsKill))
4589               .addReg(SrcReg0, getKillRegState(Src0IsKill))
4590               .addReg(SrcReg1, getKillRegState(Src1IsKill))
4591               .addImm(MUL->getOperand(3).getImm());
4592   else if (kind == FMAInstKind::Accumulator)
4593     MIB = BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc), ResultReg)
4594               .addReg(SrcReg2, getKillRegState(Src2IsKill))
4595               .addReg(SrcReg0, getKillRegState(Src0IsKill))
4596               .addReg(SrcReg1, getKillRegState(Src1IsKill));
4597   else
4598     assert(false && "Invalid FMA instruction kind \n");
4599   // Insert the MADD (MADD, FMA, FMS, FMLA, FMLS)
4600   InsInstrs.push_back(MIB);
4601   return MUL;
4602 }
4603 
4604 /// genFusedMultiplyAcc - Helper to generate fused multiply accumulate
4605 /// instructions.
4606 ///
4607 /// \see genFusedMultiply
4608 static MachineInstr *genFusedMultiplyAcc(
4609     MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII,
4610     MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs,
4611     unsigned IdxMulOpd, unsigned MaddOpc, const TargetRegisterClass *RC) {
4612   return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
4613                           FMAInstKind::Accumulator);
4614 }
4615 
4616 /// genNeg - Helper to generate an intermediate negation of the second operand
4617 /// of Root
4618 static Register genNeg(MachineFunction &MF, MachineRegisterInfo &MRI,
4619                        const TargetInstrInfo *TII, MachineInstr &Root,
4620                        SmallVectorImpl<MachineInstr *> &InsInstrs,
4621                        DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
4622                        unsigned MnegOpc, const TargetRegisterClass *RC) {
4623   Register NewVR = MRI.createVirtualRegister(RC);
4624   MachineInstrBuilder MIB =
4625       BuildMI(MF, Root.getDebugLoc(), TII->get(MnegOpc), NewVR)
4626           .add(Root.getOperand(2));
4627   InsInstrs.push_back(MIB);
4628 
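  // The negation must be the only instruction inserted so far: the combiner
  // finds the instruction defining NewVR through its index (0) in InsInstrs.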
4629   assert(InstrIdxForVirtReg.empty());
4630   InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
4631 
4632   return NewVR;
4633 }
4634 
4635 /// genFusedMultiplyAccNeg - Helper to generate fused multiply accumulate
4636 /// instructions with an additional negation of the accumulator
4637 static MachineInstr *genFusedMultiplyAccNeg(
4638     MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII,
4639     MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs,
4640     DenseMap<unsigned, unsigned> &InstrIdxForVirtReg, unsigned IdxMulOpd,
4641     unsigned MaddOpc, unsigned MnegOpc, const TargetRegisterClass *RC) {
4642   assert(IdxMulOpd == 1);
4643 
4644   Register NewVR =
4645       genNeg(MF, MRI, TII, Root, InsInstrs, InstrIdxForVirtReg, MnegOpc, RC);
4646   return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
4647                           FMAInstKind::Accumulator, &NewVR);
4648 }
4649 
4650 /// genFusedMultiplyIdx - Helper to generate fused multiply accumulate
4651 /// instructions with an indexed (by-element) multiplicand.
4652 ///
4653 /// \see genFusedMultiply
4654 static MachineInstr *genFusedMultiplyIdx(
4655     MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII,
4656     MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs,
4657     unsigned IdxMulOpd, unsigned MaddOpc, const TargetRegisterClass *RC) {
4658   return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
4659                           FMAInstKind::Indexed);
4660 }
4661 
4662 /// genFusedMultiplyIdxNeg - Helper to generate indexed fused multiply
4663 /// accumulate instructions with an additional negation of the accumulator.
4664 static MachineInstr *genFusedMultiplyIdxNeg(
4665     MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII,
4666     MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs,
4667     DenseMap<unsigned, unsigned> &InstrIdxForVirtReg, unsigned IdxMulOpd,
4668     unsigned MaddOpc, unsigned MnegOpc, const TargetRegisterClass *RC) {
4669   assert(IdxMulOpd == 1);
4670 
4671   Register NewVR =
4672       genNeg(MF, MRI, TII, Root, InsInstrs, InstrIdxForVirtReg, MnegOpc, RC);
4673 
4674   return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
4675                           FMAInstKind::Indexed, &NewVR);
4676 }
4677 
4678 /// genMaddR - Generate madd instruction and combine mul and add using
4679 /// an extra virtual register
4680 /// Example - an ADD intermediate needs to be stored in a register:
4681 ///   MUL I=A,B,0
4682 ///   ADD R,I,Imm
4683 ///   ==> ORR  V, ZR, Imm
4684 ///   ==> MADD R,A,B,V
4685 /// \param MF Containing MachineFunction
4686 /// \param MRI Register information
4687 /// \param TII Target information
4688 /// \param Root is the ADD instruction
4689 /// \param [out] InsInstrs is a vector of machine instructions and will
4690 /// contain the generated madd instruction
4691 /// \param IdxMulOpd is index of operand in Root that is the result of
4692 /// the MUL. In the example above IdxMulOpd is 1.
4693 /// \param MaddOpc the opcode of the madd instruction
4694 /// \param VR is a virtual register that holds the value of an ADD operand
4695 /// (V in the example above).
4696 /// \param RC Register class of operands
4697 static MachineInstr *genMaddR(MachineFunction &MF, MachineRegisterInfo &MRI,
4698                               const TargetInstrInfo *TII, MachineInstr &Root,
4699                               SmallVectorImpl<MachineInstr *> &InsInstrs,
4700                               unsigned IdxMulOpd, unsigned MaddOpc, unsigned VR,
4701                               const TargetRegisterClass *RC) {
4702   assert(IdxMulOpd == 1 || IdxMulOpd == 2);
4703 
4704   MachineInstr *MUL = MRI.getUniqueVRegDef(Root.getOperand(IdxMulOpd).getReg());
4705   Register ResultReg = Root.getOperand(0).getReg();
4706   Register SrcReg0 = MUL->getOperand(1).getReg();
4707   bool Src0IsKill = MUL->getOperand(1).isKill();
4708   Register SrcReg1 = MUL->getOperand(2).getReg();
4709   bool Src1IsKill = MUL->getOperand(2).isKill();
4710 
4711   if (Register::isVirtualRegister(ResultReg))
4712     MRI.constrainRegClass(ResultReg, RC);
4713   if (Register::isVirtualRegister(SrcReg0))
4714     MRI.constrainRegClass(SrcReg0, RC);
4715   if (Register::isVirtualRegister(SrcReg1))
4716     MRI.constrainRegClass(SrcReg1, RC);
4717   if (Register::isVirtualRegister(VR))
4718     MRI.constrainRegClass(VR, RC);
4719 
4720   MachineInstrBuilder MIB =
4721       BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc), ResultReg)
4722           .addReg(SrcReg0, getKillRegState(Src0IsKill))
4723           .addReg(SrcReg1, getKillRegState(Src1IsKill))
4724           .addReg(VR);
4725   // Insert the MADD
4726   InsInstrs.push_back(MIB);
4727   return MUL;
4728 }
4729 
4730 /// When getMachineCombinerPatterns() finds potential patterns,
4731 /// this function generates the instructions that could replace the
4732 /// original code sequence.
4733 void AArch64InstrInfo::genAlternativeCodeSequence(
4734     MachineInstr &Root, MachineCombinerPattern Pattern,
4735     SmallVectorImpl<MachineInstr *> &InsInstrs,
4736     SmallVectorImpl<MachineInstr *> &DelInstrs,
4737     DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) const {
4738   MachineBasicBlock &MBB = *Root.getParent();
4739   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
4740   MachineFunction &MF = *MBB.getParent();
4741   const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
4742 
4743   MachineInstr *MUL;
4744   const TargetRegisterClass *RC;
4745   unsigned Opc;
4746   switch (Pattern) {
4747   default:
4748     // Reassociate instructions.
4749     TargetInstrInfo::genAlternativeCodeSequence(Root, Pattern, InsInstrs,
4750                                                 DelInstrs, InstrIdxForVirtReg);
4751     return;
4752   case MachineCombinerPattern::MULADDW_OP1:
4753   case MachineCombinerPattern::MULADDX_OP1:
4754     // MUL I=A,B,0
4755     // ADD R,I,C
4756     // ==> MADD R,A,B,C
4757     // --- Create(MADD);
4758     if (Pattern == MachineCombinerPattern::MULADDW_OP1) {
4759       Opc = AArch64::MADDWrrr;
4760       RC = &AArch64::GPR32RegClass;
4761     } else {
4762       Opc = AArch64::MADDXrrr;
4763       RC = &AArch64::GPR64RegClass;
4764     }
4765     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
4766     break;
4767   case MachineCombinerPattern::MULADDW_OP2:
4768   case MachineCombinerPattern::MULADDX_OP2:
4769     // MUL I=A,B,0
4770     // ADD R,C,I
4771     // ==> MADD R,A,B,C
4772     // --- Create(MADD);
4773     if (Pattern == MachineCombinerPattern::MULADDW_OP2) {
4774       Opc = AArch64::MADDWrrr;
4775       RC = &AArch64::GPR32RegClass;
4776     } else {
4777       Opc = AArch64::MADDXrrr;
4778       RC = &AArch64::GPR64RegClass;
4779     }
4780     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
4781     break;
4782   case MachineCombinerPattern::MULADDWI_OP1:
4783   case MachineCombinerPattern::MULADDXI_OP1: {
4784     // MUL I=A,B,0
4785     // ADD R,I,Imm
4786     // ==> ORR  V, ZR, Imm
4787     // ==> MADD R,A,B,V
4788     // --- Create(MADD);
4789     const TargetRegisterClass *OrrRC;
4790     unsigned BitSize, OrrOpc, ZeroReg;
4791     if (Pattern == MachineCombinerPattern::MULADDWI_OP1) {
4792       OrrOpc = AArch64::ORRWri;
4793       OrrRC = &AArch64::GPR32spRegClass;
4794       BitSize = 32;
4795       ZeroReg = AArch64::WZR;
4796       Opc = AArch64::MADDWrrr;
4797       RC = &AArch64::GPR32RegClass;
4798     } else {
4799       OrrOpc = AArch64::ORRXri;
4800       OrrRC = &AArch64::GPR64spRegClass;
4801       BitSize = 64;
4802       ZeroReg = AArch64::XZR;
4803       Opc = AArch64::MADDXrrr;
4804       RC = &AArch64::GPR64RegClass;
4805     }
4806     Register NewVR = MRI.createVirtualRegister(OrrRC);
4807     uint64_t Imm = Root.getOperand(2).getImm();
4808 
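    // ADD[WX]ri may shift its immediate (operand 3 is the shift amount), so
    // fold the shift into the constant before encoding it for the ORR.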
4809     if (Root.getOperand(3).isImm()) {
4810       unsigned Val = Root.getOperand(3).getImm();
4811       Imm = Imm << Val;
4812     }
4813     uint64_t UImm = SignExtend64(Imm, BitSize);
4814     uint64_t Encoding;
4815     if (AArch64_AM::processLogicalImmediate(UImm, BitSize, Encoding)) {
4816       MachineInstrBuilder MIB1 =
4817           BuildMI(MF, Root.getDebugLoc(), TII->get(OrrOpc), NewVR)
4818               .addReg(ZeroReg)
4819               .addImm(Encoding);
4820       InsInstrs.push_back(MIB1);
4821       InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
4822       MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC);
4823     }
4824     break;
4825   }
4826   case MachineCombinerPattern::MULSUBW_OP1:
4827   case MachineCombinerPattern::MULSUBX_OP1: {
4828     // MUL I=A,B,0
4829     // SUB R,I, C
4830     // ==> SUB  V, 0, C
4831     // ==> MADD R,A,B,V // = -C + A*B
4832     // --- Create(MADD);
4833     const TargetRegisterClass *SubRC;
4834     unsigned SubOpc, ZeroReg;
4835     if (Pattern == MachineCombinerPattern::MULSUBW_OP1) {
4836       SubOpc = AArch64::SUBWrr;
4837       SubRC = &AArch64::GPR32spRegClass;
4838       ZeroReg = AArch64::WZR;
4839       Opc = AArch64::MADDWrrr;
4840       RC = &AArch64::GPR32RegClass;
4841     } else {
4842       SubOpc = AArch64::SUBXrr;
4843       SubRC = &AArch64::GPR64spRegClass;
4844       ZeroReg = AArch64::XZR;
4845       Opc = AArch64::MADDXrrr;
4846       RC = &AArch64::GPR64RegClass;
4847     }
4848     Register NewVR = MRI.createVirtualRegister(SubRC);
4849     // SUB NewVR, 0, C
4850     MachineInstrBuilder MIB1 =
4851         BuildMI(MF, Root.getDebugLoc(), TII->get(SubOpc), NewVR)
4852             .addReg(ZeroReg)
4853             .add(Root.getOperand(2));
4854     InsInstrs.push_back(MIB1);
4855     InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
4856     MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC);
4857     break;
4858   }
4859   case MachineCombinerPattern::MULSUBW_OP2:
4860   case MachineCombinerPattern::MULSUBX_OP2:
4861     // MUL I=A,B,0
4862     // SUB R,C,I
4863     // ==> MSUB R,A,B,C (computes C - A*B)
4864     // --- Create(MSUB);
4865     if (Pattern == MachineCombinerPattern::MULSUBW_OP2) {
4866       Opc = AArch64::MSUBWrrr;
4867       RC = &AArch64::GPR32RegClass;
4868     } else {
4869       Opc = AArch64::MSUBXrrr;
4870       RC = &AArch64::GPR64RegClass;
4871     }
4872     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
4873     break;
4874   case MachineCombinerPattern::MULSUBWI_OP1:
4875   case MachineCombinerPattern::MULSUBXI_OP1: {
4876     // MUL I=A,B,0
4877     // SUB R,I, Imm
4878     // ==> ORR  V, ZR, -Imm
4879     // ==> MADD R,A,B,V // = -Imm + A*B
4880     // --- Create(MADD);
4881     const TargetRegisterClass *OrrRC;
4882     unsigned BitSize, OrrOpc, ZeroReg;
4883     if (Pattern == MachineCombinerPattern::MULSUBWI_OP1) {
4884       OrrOpc = AArch64::ORRWri;
4885       OrrRC = &AArch64::GPR32spRegClass;
4886       BitSize = 32;
4887       ZeroReg = AArch64::WZR;
4888       Opc = AArch64::MADDWrrr;
4889       RC = &AArch64::GPR32RegClass;
4890     } else {
4891       OrrOpc = AArch64::ORRXri;
4892       OrrRC = &AArch64::GPR64spRegClass;
4893       BitSize = 64;
4894       ZeroReg = AArch64::XZR;
4895       Opc = AArch64::MADDXrrr;
4896       RC = &AArch64::GPR64RegClass;
4897     }
4898     Register NewVR = MRI.createVirtualRegister(OrrRC);
4899     uint64_t Imm = Root.getOperand(2).getImm();
4900     if (Root.getOperand(3).isImm()) {
4901       unsigned Val = Root.getOperand(3).getImm();
4902       Imm = Imm << Val;
4903     }
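    // Materialize -Imm so the MADD computes A*B + (-Imm), i.e. A*B - Imm.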
4904     uint64_t UImm = SignExtend64(-Imm, BitSize);
4905     uint64_t Encoding;
4906     if (AArch64_AM::processLogicalImmediate(UImm, BitSize, Encoding)) {
4907       MachineInstrBuilder MIB1 =
4908           BuildMI(MF, Root.getDebugLoc(), TII->get(OrrOpc), NewVR)
4909               .addReg(ZeroReg)
4910               .addImm(Encoding);
4911       InsInstrs.push_back(MIB1);
4912       InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
4913       MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC);
4914     }
4915     break;
4916   }
4917 
4918   case MachineCombinerPattern::MULADDv8i8_OP1:
4919     Opc = AArch64::MLAv8i8;
4920     RC = &AArch64::FPR64RegClass;
4921     MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
4922     break;
4923   case MachineCombinerPattern::MULADDv8i8_OP2:
4924     Opc = AArch64::MLAv8i8;
4925     RC = &AArch64::FPR64RegClass;
4926     MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
4927     break;
4928   case MachineCombinerPattern::MULADDv16i8_OP1:
4929     Opc = AArch64::MLAv16i8;
4930     RC = &AArch64::FPR128RegClass;
4931     MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
4932     break;
4933   case MachineCombinerPattern::MULADDv16i8_OP2:
4934     Opc = AArch64::MLAv16i8;
4935     RC = &AArch64::FPR128RegClass;
4936     MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
4937     break;
4938   case MachineCombinerPattern::MULADDv4i16_OP1:
4939     Opc = AArch64::MLAv4i16;
4940     RC = &AArch64::FPR64RegClass;
4941     MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
4942     break;
4943   case MachineCombinerPattern::MULADDv4i16_OP2:
4944     Opc = AArch64::MLAv4i16;
4945     RC = &AArch64::FPR64RegClass;
4946     MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
4947     break;
4948   case MachineCombinerPattern::MULADDv8i16_OP1:
4949     Opc = AArch64::MLAv8i16;
4950     RC = &AArch64::FPR128RegClass;
4951     MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
4952     break;
4953   case MachineCombinerPattern::MULADDv8i16_OP2:
4954     Opc = AArch64::MLAv8i16;
4955     RC = &AArch64::FPR128RegClass;
4956     MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
4957     break;
4958   case MachineCombinerPattern::MULADDv2i32_OP1:
4959     Opc = AArch64::MLAv2i32;
4960     RC = &AArch64::FPR64RegClass;
4961     MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
4962     break;
4963   case MachineCombinerPattern::MULADDv2i32_OP2:
4964     Opc = AArch64::MLAv2i32;
4965     RC = &AArch64::FPR64RegClass;
4966     MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
4967     break;
4968   case MachineCombinerPattern::MULADDv4i32_OP1:
4969     Opc = AArch64::MLAv4i32;
4970     RC = &AArch64::FPR128RegClass;
4971     MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
4972     break;
4973   case MachineCombinerPattern::MULADDv4i32_OP2:
4974     Opc = AArch64::MLAv4i32;
4975     RC = &AArch64::FPR128RegClass;
4976     MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
4977     break;
4978 
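  // When the MUL feeds the first operand of a vector SUB (A*B - C) there is no
  // matching MLS, so negate C and use MLA: A*B + (-C). The OP2 cases (C - A*B)
  // map directly onto MLS.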
4979   case MachineCombinerPattern::MULSUBv8i8_OP1:
4980     Opc = AArch64::MLAv8i8;
4981     RC = &AArch64::FPR64RegClass;
4982     MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
4983                                  InstrIdxForVirtReg, 1, Opc, AArch64::NEGv8i8,
4984                                  RC);
4985     break;
4986   case MachineCombinerPattern::MULSUBv8i8_OP2:
4987     Opc = AArch64::MLSv8i8;
4988     RC = &AArch64::FPR64RegClass;
4989     MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
4990     break;
4991   case MachineCombinerPattern::MULSUBv16i8_OP1:
4992     Opc = AArch64::MLAv16i8;
4993     RC = &AArch64::FPR128RegClass;
4994     MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
4995                                  InstrIdxForVirtReg, 1, Opc, AArch64::NEGv16i8,
4996                                  RC);
4997     break;
4998   case MachineCombinerPattern::MULSUBv16i8_OP2:
4999     Opc = AArch64::MLSv16i8;
5000     RC = &AArch64::FPR128RegClass;
5001     MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
5002     break;
5003   case MachineCombinerPattern::MULSUBv4i16_OP1:
5004     Opc = AArch64::MLAv4i16;
5005     RC = &AArch64::FPR64RegClass;
5006     MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
5007                                  InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i16,
5008                                  RC);
5009     break;
5010   case MachineCombinerPattern::MULSUBv4i16_OP2:
5011     Opc = AArch64::MLSv4i16;
5012     RC = &AArch64::FPR64RegClass;
5013     MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
5014     break;
5015   case MachineCombinerPattern::MULSUBv8i16_OP1:
5016     Opc = AArch64::MLAv8i16;
5017     RC = &AArch64::FPR128RegClass;
5018     MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
5019                                  InstrIdxForVirtReg, 1, Opc, AArch64::NEGv8i16,
5020                                  RC);
5021     break;
5022   case MachineCombinerPattern::MULSUBv8i16_OP2:
5023     Opc = AArch64::MLSv8i16;
5024     RC = &AArch64::FPR128RegClass;
5025     MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
5026     break;
5027   case MachineCombinerPattern::MULSUBv2i32_OP1:
5028     Opc = AArch64::MLAv2i32;
5029     RC = &AArch64::FPR64RegClass;
5030     MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
5031                                  InstrIdxForVirtReg, 1, Opc, AArch64::NEGv2i32,
5032                                  RC);
5033     break;
5034   case MachineCombinerPattern::MULSUBv2i32_OP2:
5035     Opc = AArch64::MLSv2i32;
5036     RC = &AArch64::FPR64RegClass;
5037     MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
5038     break;
5039   case MachineCombinerPattern::MULSUBv4i32_OP1:
5040     Opc = AArch64::MLAv4i32;
5041     RC = &AArch64::FPR128RegClass;
5042     MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
5043                                  InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i32,
5044                                  RC);
5045     break;
5046   case MachineCombinerPattern::MULSUBv4i32_OP2:
5047     Opc = AArch64::MLSv4i32;
5048     RC = &AArch64::FPR128RegClass;
5049     MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
5050     break;
5051 
5052   case MachineCombinerPattern::MULADDv4i16_indexed_OP1:
5053     Opc = AArch64::MLAv4i16_indexed;
5054     RC = &AArch64::FPR64RegClass;
5055     MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
5056     break;
5057   case MachineCombinerPattern::MULADDv4i16_indexed_OP2:
5058     Opc = AArch64::MLAv4i16_indexed;
5059     RC = &AArch64::FPR64RegClass;
5060     MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
5061     break;
5062   case MachineCombinerPattern::MULADDv8i16_indexed_OP1:
5063     Opc = AArch64::MLAv8i16_indexed;
5064     RC = &AArch64::FPR128RegClass;
5065     MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
5066     break;
5067   case MachineCombinerPattern::MULADDv8i16_indexed_OP2:
5068     Opc = AArch64::MLAv8i16_indexed;
5069     RC = &AArch64::FPR128RegClass;
5070     MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
5071     break;
5072   case MachineCombinerPattern::MULADDv2i32_indexed_OP1:
5073     Opc = AArch64::MLAv2i32_indexed;
5074     RC = &AArch64::FPR64RegClass;
5075     MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
5076     break;
5077   case MachineCombinerPattern::MULADDv2i32_indexed_OP2:
5078     Opc = AArch64::MLAv2i32_indexed;
5079     RC = &AArch64::FPR64RegClass;
5080     MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
5081     break;
5082   case MachineCombinerPattern::MULADDv4i32_indexed_OP1:
5083     Opc = AArch64::MLAv4i32_indexed;
5084     RC = &AArch64::FPR128RegClass;
5085     MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
5086     break;
5087   case MachineCombinerPattern::MULADDv4i32_indexed_OP2:
5088     Opc = AArch64::MLAv4i32_indexed;
5089     RC = &AArch64::FPR128RegClass;
5090     MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
5091     break;
5092 
5093   case MachineCombinerPattern::MULSUBv4i16_indexed_OP1:
5094     Opc = AArch64::MLAv4i16_indexed;
5095     RC = &AArch64::FPR64RegClass;
5096     MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs,
5097                                  InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i16,
5098                                  RC);
5099     break;
5100   case MachineCombinerPattern::MULSUBv4i16_indexed_OP2:
5101     Opc = AArch64::MLSv4i16_indexed;
5102     RC = &AArch64::FPR64RegClass;
5103     MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
5104     break;
5105   case MachineCombinerPattern::MULSUBv8i16_indexed_OP1:
5106     Opc = AArch64::MLAv8i16_indexed;
5107     RC = &AArch64::FPR128RegClass;
5108     MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs,
5109                                  InstrIdxForVirtReg, 1, Opc, AArch64::NEGv8i16,
5110                                  RC);
5111     break;
5112   case MachineCombinerPattern::MULSUBv8i16_indexed_OP2:
5113     Opc = AArch64::MLSv8i16_indexed;
5114     RC = &AArch64::FPR128RegClass;
5115     MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
5116     break;
5117   case MachineCombinerPattern::MULSUBv2i32_indexed_OP1:
5118     Opc = AArch64::MLAv2i32_indexed;
5119     RC = &AArch64::FPR64RegClass;
5120     MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs,
5121                                  InstrIdxForVirtReg, 1, Opc, AArch64::NEGv2i32,
5122                                  RC);
5123     break;
5124   case MachineCombinerPattern::MULSUBv2i32_indexed_OP2:
5125     Opc = AArch64::MLSv2i32_indexed;
5126     RC = &AArch64::FPR64RegClass;
5127     MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
5128     break;
5129   case MachineCombinerPattern::MULSUBv4i32_indexed_OP1:
5130     Opc = AArch64::MLAv4i32_indexed;
5131     RC = &AArch64::FPR128RegClass;
5132     MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs,
5133                                  InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i32,
5134                                  RC);
5135     break;
5136   case MachineCombinerPattern::MULSUBv4i32_indexed_OP2:
5137     Opc = AArch64::MLSv4i32_indexed;
5138     RC = &AArch64::FPR128RegClass;
5139     MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
5140     break;
5141 
5142   // Floating Point Support
5143   case MachineCombinerPattern::FMULADDH_OP1:
5144     Opc = AArch64::FMADDHrrr;
5145     RC = &AArch64::FPR16RegClass;
5146     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
5147     break;
5148   case MachineCombinerPattern::FMULADDS_OP1:
5149     Opc = AArch64::FMADDSrrr;
5150     RC = &AArch64::FPR32RegClass;
5151     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
5152     break;
5153   case MachineCombinerPattern::FMULADDD_OP1:
5154     Opc = AArch64::FMADDDrrr;
5155     RC = &AArch64::FPR64RegClass;
5156     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
5157     break;
5158 
5159   case MachineCombinerPattern::FMULADDH_OP2:
5160     Opc = AArch64::FMADDHrrr;
5161     RC = &AArch64::FPR16RegClass;
5162     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
5163     break;
5164   case MachineCombinerPattern::FMULADDS_OP2:
5165     Opc = AArch64::FMADDSrrr;
5166     RC = &AArch64::FPR32RegClass;
5167     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
5168     break;
5169   case MachineCombinerPattern::FMULADDD_OP2:
5170     Opc = AArch64::FMADDDrrr;
5171     RC = &AArch64::FPR64RegClass;
5172     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
5173     break;
5174 
5175   case MachineCombinerPattern::FMLAv1i32_indexed_OP1:
5176     Opc = AArch64::FMLAv1i32_indexed;
5177     RC = &AArch64::FPR32RegClass;
5178     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
5179                            FMAInstKind::Indexed);
5180     break;
5181   case MachineCombinerPattern::FMLAv1i32_indexed_OP2:
5182     Opc = AArch64::FMLAv1i32_indexed;
5183     RC = &AArch64::FPR32RegClass;
5184     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
5185                            FMAInstKind::Indexed);
5186     break;
5187 
5188   case MachineCombinerPattern::FMLAv1i64_indexed_OP1:
5189     Opc = AArch64::FMLAv1i64_indexed;
5190     RC = &AArch64::FPR64RegClass;
5191     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
5192                            FMAInstKind::Indexed);
5193     break;
5194   case MachineCombinerPattern::FMLAv1i64_indexed_OP2:
5195     Opc = AArch64::FMLAv1i64_indexed;
5196     RC = &AArch64::FPR64RegClass;
5197     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
5198                            FMAInstKind::Indexed);
5199     break;
5200 
5201   case MachineCombinerPattern::FMLAv4i16_indexed_OP1:
5202     RC = &AArch64::FPR64RegClass;
5203     Opc = AArch64::FMLAv4i16_indexed;
5204     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
5205                            FMAInstKind::Indexed);
5206     break;
5207   case MachineCombinerPattern::FMLAv4f16_OP1:
5208     RC = &AArch64::FPR64RegClass;
5209     Opc = AArch64::FMLAv4f16;
5210     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
5211                            FMAInstKind::Accumulator);
5212     break;
5213   case MachineCombinerPattern::FMLAv4i16_indexed_OP2:
5214     RC = &AArch64::FPR64RegClass;
5215     Opc = AArch64::FMLAv4i16_indexed;
5216     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
5217                            FMAInstKind::Indexed);
5218     break;
5219   case MachineCombinerPattern::FMLAv4f16_OP2:
5220     RC = &AArch64::FPR64RegClass;
5221     Opc = AArch64::FMLAv4f16;
5222     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
5223                            FMAInstKind::Accumulator);
5224     break;
5225 
5226   case MachineCombinerPattern::FMLAv2i32_indexed_OP1:
5227   case MachineCombinerPattern::FMLAv2f32_OP1:
5228     RC = &AArch64::FPR64RegClass;
5229     if (Pattern == MachineCombinerPattern::FMLAv2i32_indexed_OP1) {
5230       Opc = AArch64::FMLAv2i32_indexed;
5231       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
5232                              FMAInstKind::Indexed);
5233     } else {
5234       Opc = AArch64::FMLAv2f32;
5235       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
5236                              FMAInstKind::Accumulator);
5237     }
5238     break;
5239   case MachineCombinerPattern::FMLAv2i32_indexed_OP2:
5240   case MachineCombinerPattern::FMLAv2f32_OP2:
5241     RC = &AArch64::FPR64RegClass;
5242     if (Pattern == MachineCombinerPattern::FMLAv2i32_indexed_OP2) {
5243       Opc = AArch64::FMLAv2i32_indexed;
5244       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
5245                              FMAInstKind::Indexed);
5246     } else {
5247       Opc = AArch64::FMLAv2f32;
5248       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
5249                              FMAInstKind::Accumulator);
5250     }
5251     break;
5252 
5253   case MachineCombinerPattern::FMLAv8i16_indexed_OP1:
5254     RC = &AArch64::FPR128RegClass;
5255     Opc = AArch64::FMLAv8i16_indexed;
5256     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
5257                            FMAInstKind::Indexed);
5258     break;
5259   case MachineCombinerPattern::FMLAv8f16_OP1:
5260     RC = &AArch64::FPR128RegClass;
5261     Opc = AArch64::FMLAv8f16;
5262     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
5263                            FMAInstKind::Accumulator);
5264     break;
5265   case MachineCombinerPattern::FMLAv8i16_indexed_OP2:
5266     RC = &AArch64::FPR128RegClass;
5267     Opc = AArch64::FMLAv8i16_indexed;
5268     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
5269                            FMAInstKind::Indexed);
5270     break;
5271   case MachineCombinerPattern::FMLAv8f16_OP2:
5272     RC = &AArch64::FPR128RegClass;
5273     Opc = AArch64::FMLAv8f16;
5274     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
5275                            FMAInstKind::Accumulator);
5276     break;
5277 
5278   case MachineCombinerPattern::FMLAv2i64_indexed_OP1:
5279   case MachineCombinerPattern::FMLAv2f64_OP1:
5280     RC = &AArch64::FPR128RegClass;
5281     if (Pattern == MachineCombinerPattern::FMLAv2i64_indexed_OP1) {
5282       Opc = AArch64::FMLAv2i64_indexed;
5283       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
5284                              FMAInstKind::Indexed);
5285     } else {
5286       Opc = AArch64::FMLAv2f64;
5287       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
5288                              FMAInstKind::Accumulator);
5289     }
5290     break;
5291   case MachineCombinerPattern::FMLAv2i64_indexed_OP2:
5292   case MachineCombinerPattern::FMLAv2f64_OP2:
5293     RC = &AArch64::FPR128RegClass;
5294     if (Pattern == MachineCombinerPattern::FMLAv2i64_indexed_OP2) {
5295       Opc = AArch64::FMLAv2i64_indexed;
5296       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
5297                              FMAInstKind::Indexed);
5298     } else {
5299       Opc = AArch64::FMLAv2f64;
5300       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
5301                              FMAInstKind::Accumulator);
5302     }
5303     break;
5304 
5305   case MachineCombinerPattern::FMLAv4i32_indexed_OP1:
5306   case MachineCombinerPattern::FMLAv4f32_OP1:
5307     RC = &AArch64::FPR128RegClass;
5308     if (Pattern == MachineCombinerPattern::FMLAv4i32_indexed_OP1) {
5309       Opc = AArch64::FMLAv4i32_indexed;
5310       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
5311                              FMAInstKind::Indexed);
5312     } else {
5313       Opc = AArch64::FMLAv4f32;
5314       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
5315                              FMAInstKind::Accumulator);
5316     }
5317     break;
5318 
5319   case MachineCombinerPattern::FMLAv4i32_indexed_OP2:
5320   case MachineCombinerPattern::FMLAv4f32_OP2:
5321     RC = &AArch64::FPR128RegClass;
5322     if (Pattern == MachineCombinerPattern::FMLAv4i32_indexed_OP2) {
5323       Opc = AArch64::FMLAv4i32_indexed;
5324       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
5325                              FMAInstKind::Indexed);
5326     } else {
5327       Opc = AArch64::FMLAv4f32;
5328       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
5329                              FMAInstKind::Accumulator);
5330     }
5331     break;
5332 
5333   case MachineCombinerPattern::FMULSUBH_OP1:
5334     Opc = AArch64::FNMSUBHrrr;
5335     RC = &AArch64::FPR16RegClass;
5336     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
5337     break;
5338   case MachineCombinerPattern::FMULSUBS_OP1:
5339     Opc = AArch64::FNMSUBSrrr;
5340     RC = &AArch64::FPR32RegClass;
5341     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
5342     break;
5343   case MachineCombinerPattern::FMULSUBD_OP1:
5344     Opc = AArch64::FNMSUBDrrr;
5345     RC = &AArch64::FPR64RegClass;
5346     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
5347     break;
5348 
5349   case MachineCombinerPattern::FNMULSUBH_OP1:
5350     Opc = AArch64::FNMADDHrrr;
5351     RC = &AArch64::FPR16RegClass;
5352     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
5353     break;
5354   case MachineCombinerPattern::FNMULSUBS_OP1:
5355     Opc = AArch64::FNMADDSrrr;
5356     RC = &AArch64::FPR32RegClass;
5357     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
5358     break;
5359   case MachineCombinerPattern::FNMULSUBD_OP1:
5360     Opc = AArch64::FNMADDDrrr;
5361     RC = &AArch64::FPR64RegClass;
5362     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
5363     break;
5364 
5365   case MachineCombinerPattern::FMULSUBH_OP2:
5366     Opc = AArch64::FMSUBHrrr;
5367     RC = &AArch64::FPR16RegClass;
5368     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
5369     break;
5370   case MachineCombinerPattern::FMULSUBS_OP2:
5371     Opc = AArch64::FMSUBSrrr;
5372     RC = &AArch64::FPR32RegClass;
5373     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
5374     break;
5375   case MachineCombinerPattern::FMULSUBD_OP2:
5376     Opc = AArch64::FMSUBDrrr;
5377     RC = &AArch64::FPR64RegClass;
5378     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
5379     break;
5380 
5381   case MachineCombinerPattern::FMLSv1i32_indexed_OP2:
5382     Opc = AArch64::FMLSv1i32_indexed;
5383     RC = &AArch64::FPR32RegClass;
5384     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
5385                            FMAInstKind::Indexed);
5386     break;
5387 
5388   case MachineCombinerPattern::FMLSv1i64_indexed_OP2:
5389     Opc = AArch64::FMLSv1i64_indexed;
5390     RC = &AArch64::FPR64RegClass;
5391     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
5392                            FMAInstKind::Indexed);
5393     break;
5394 
5395   case MachineCombinerPattern::FMLSv4f16_OP1:
5396   case MachineCombinerPattern::FMLSv4i16_indexed_OP1: {
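    // FSUB with the multiply in operand 1 computes A*B - C; no FMLS form
    // matches that operand order, so negate C into a fresh vreg and emit
    // FMLA: (-C) + A*B.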
5397     RC = &AArch64::FPR64RegClass;
5398     Register NewVR = MRI.createVirtualRegister(RC);
5399     MachineInstrBuilder MIB1 =
5400         BuildMI(MF, Root.getDebugLoc(), TII->get(AArch64::FNEGv4f16), NewVR)
5401             .add(Root.getOperand(2));
5402     InsInstrs.push_back(MIB1);
5403     InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
5404     if (Pattern == MachineCombinerPattern::FMLSv4f16_OP1) {
5405       Opc = AArch64::FMLAv4f16;
5406       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
5407                              FMAInstKind::Accumulator, &NewVR);
5408     } else {
5409       Opc = AArch64::FMLAv4i16_indexed;
5410       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
5411                              FMAInstKind::Indexed, &NewVR);
5412     }
5413     break;
5414   }
5415   case MachineCombinerPattern::FMLSv4f16_OP2:
5416     RC = &AArch64::FPR64RegClass;
5417     Opc = AArch64::FMLSv4f16;
5418     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
5419                            FMAInstKind::Accumulator);
5420     break;
5421   case MachineCombinerPattern::FMLSv4i16_indexed_OP2:
5422     RC = &AArch64::FPR64RegClass;
5423     Opc = AArch64::FMLSv4i16_indexed;
5424     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
5425                            FMAInstKind::Indexed);
5426     break;
5427 
5428   case MachineCombinerPattern::FMLSv2f32_OP2:
5429   case MachineCombinerPattern::FMLSv2i32_indexed_OP2:
5430     RC = &AArch64::FPR64RegClass;
5431     if (Pattern == MachineCombinerPattern::FMLSv2i32_indexed_OP2) {
5432       Opc = AArch64::FMLSv2i32_indexed;
5433       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
5434                              FMAInstKind::Indexed);
5435     } else {
5436       Opc = AArch64::FMLSv2f32;
5437       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
5438                              FMAInstKind::Accumulator);
5439     }
5440     break;
5441 
5442   case MachineCombinerPattern::FMLSv8f16_OP1:
5443   case MachineCombinerPattern::FMLSv8i16_indexed_OP1: {
5444     RC = &AArch64::FPR128RegClass;
5445     Register NewVR = MRI.createVirtualRegister(RC);
5446     MachineInstrBuilder MIB1 =
5447         BuildMI(MF, Root.getDebugLoc(), TII->get(AArch64::FNEGv8f16), NewVR)
5448             .add(Root.getOperand(2));
5449     InsInstrs.push_back(MIB1);
5450     InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
5451     if (Pattern == MachineCombinerPattern::FMLSv8f16_OP1) {
5452       Opc = AArch64::FMLAv8f16;
5453       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
5454                              FMAInstKind::Accumulator, &NewVR);
5455     } else {
5456       Opc = AArch64::FMLAv8i16_indexed;
5457       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
5458                              FMAInstKind::Indexed, &NewVR);
5459     }
5460     break;
5461   }
5462   case MachineCombinerPattern::FMLSv8f16_OP2:
5463     RC = &AArch64::FPR128RegClass;
5464     Opc = AArch64::FMLSv8f16;
5465     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
5466                            FMAInstKind::Accumulator);
5467     break;
5468   case MachineCombinerPattern::FMLSv8i16_indexed_OP2:
5469     RC = &AArch64::FPR128RegClass;
5470     Opc = AArch64::FMLSv8i16_indexed;
5471     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
5472                            FMAInstKind::Indexed);
5473     break;
5474 
5475   case MachineCombinerPattern::FMLSv2f64_OP2:
5476   case MachineCombinerPattern::FMLSv2i64_indexed_OP2:
5477     RC = &AArch64::FPR128RegClass;
5478     if (Pattern == MachineCombinerPattern::FMLSv2i64_indexed_OP2) {
5479       Opc = AArch64::FMLSv2i64_indexed;
5480       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
5481                              FMAInstKind::Indexed);
5482     } else {
5483       Opc = AArch64::FMLSv2f64;
5484       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
5485                              FMAInstKind::Accumulator);
5486     }
5487     break;
5488 
5489   case MachineCombinerPattern::FMLSv4f32_OP2:
5490   case MachineCombinerPattern::FMLSv4i32_indexed_OP2:
5491     RC = &AArch64::FPR128RegClass;
5492     if (Pattern == MachineCombinerPattern::FMLSv4i32_indexed_OP2) {
5493       Opc = AArch64::FMLSv4i32_indexed;
5494       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
5495                              FMAInstKind::Indexed);
5496     } else {
5497       Opc = AArch64::FMLSv4f32;
5498       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
5499                              FMAInstKind::Accumulator);
5500     }
5501     break;
5502   case MachineCombinerPattern::FMLSv2f32_OP1:
5503   case MachineCombinerPattern::FMLSv2i32_indexed_OP1: {
5504     RC = &AArch64::FPR64RegClass;
5505     Register NewVR = MRI.createVirtualRegister(RC);
5506     MachineInstrBuilder MIB1 =
5507         BuildMI(MF, Root.getDebugLoc(), TII->get(AArch64::FNEGv2f32), NewVR)
5508             .add(Root.getOperand(2));
5509     InsInstrs.push_back(MIB1);
5510     InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
5511     if (Pattern == MachineCombinerPattern::FMLSv2i32_indexed_OP1) {
5512       Opc = AArch64::FMLAv2i32_indexed;
5513       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
5514                              FMAInstKind::Indexed, &NewVR);
5515     } else {
5516       Opc = AArch64::FMLAv2f32;
5517       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
5518                              FMAInstKind::Accumulator, &NewVR);
5519     }
5520     break;
5521   }
5522   case MachineCombinerPattern::FMLSv4f32_OP1:
5523   case MachineCombinerPattern::FMLSv4i32_indexed_OP1: {
5524     RC = &AArch64::FPR128RegClass;
5525     Register NewVR = MRI.createVirtualRegister(RC);
5526     MachineInstrBuilder MIB1 =
5527         BuildMI(MF, Root.getDebugLoc(), TII->get(AArch64::FNEGv4f32), NewVR)
5528             .add(Root.getOperand(2));
5529     InsInstrs.push_back(MIB1);
5530     InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
5531     if (Pattern == MachineCombinerPattern::FMLSv4i32_indexed_OP1) {
5532       Opc = AArch64::FMLAv4i32_indexed;
5533       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
5534                              FMAInstKind::Indexed, &NewVR);
5535     } else {
5536       Opc = AArch64::FMLAv4f32;
5537       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
5538                              FMAInstKind::Accumulator, &NewVR);
5539     }
5540     break;
5541   }
5542   case MachineCombinerPattern::FMLSv2f64_OP1:
5543   case MachineCombinerPattern::FMLSv2i64_indexed_OP1: {
5544     RC = &AArch64::FPR128RegClass;
5545     Register NewVR = MRI.createVirtualRegister(RC);
5546     MachineInstrBuilder MIB1 =
5547         BuildMI(MF, Root.getDebugLoc(), TII->get(AArch64::FNEGv2f64), NewVR)
5548             .add(Root.getOperand(2));
5549     InsInstrs.push_back(MIB1);
5550     InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
5551     if (Pattern == MachineCombinerPattern::FMLSv2i64_indexed_OP1) {
5552       Opc = AArch64::FMLAv2i64_indexed;
5553       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
5554                              FMAInstKind::Indexed, &NewVR);
5555     } else {
5556       Opc = AArch64::FMLAv2f64;
5557       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
5558                              FMAInstKind::Accumulator, &NewVR);
5559     }
5560     break;
5561   }
5562   } // end switch (Pattern)
5563   // Record MUL and ADD/SUB for deletion
5564   DelInstrs.push_back(MUL);
5565   DelInstrs.push_back(&Root);
5566 }
5567 
5568 /// Replace csincr-branch sequence by simple conditional branch
5569 ///
5570 /// Examples:
5571 /// 1. \code
5572 ///   csinc  w9, wzr, wzr, <condition code>
5573 ///   tbnz   w9, #0, 0x44
5574 ///    \endcode
5575 /// to
5576 ///    \code
5577 ///   b.<inverted condition code>
5578 ///    \endcode
5579 ///
5580 /// 2. \code
5581 ///   csinc w9, wzr, wzr, <condition code>
5582 ///   tbz   w9, #0, 0x44
5583 ///    \endcode
5584 /// to
5585 ///    \code
5586 ///   b.<condition code>
5587 ///    \endcode
5588 ///
5589 /// Replace compare and branch sequence by TBZ/TBNZ instruction when the
5590 /// compare's constant operand is power of 2.
5591 ///
5592 /// Examples:
5593 ///    \code
5594 ///   and  w8, w8, #0x400
5595 ///   cbnz w8, L1
5596 ///    \endcode
5597 /// to
5598 ///    \code
5599 ///   tbnz w8, #10, L1
5600 ///    \endcode
5601 ///
5602 /// \param  MI Conditional Branch
5603 /// \return True when the simple conditional branch is generated
5604 ///
5605 bool AArch64InstrInfo::optimizeCondBranch(MachineInstr &MI) const {
5606   bool IsNegativeBranch = false;
5607   bool IsTestAndBranch = false;
5608   unsigned TargetBBInMI = 0;
5609   switch (MI.getOpcode()) {
5610   default:
5611     llvm_unreachable("Unknown branch instruction?");
5612   case AArch64::Bcc:
5613     return false;
5614   case AArch64::CBZW:
5615   case AArch64::CBZX:
5616     TargetBBInMI = 1;
5617     break;
5618   case AArch64::CBNZW:
5619   case AArch64::CBNZX:
5620     TargetBBInMI = 1;
5621     IsNegativeBranch = true;
5622     break;
5623   case AArch64::TBZW:
5624   case AArch64::TBZX:
5625     TargetBBInMI = 2;
5626     IsTestAndBranch = true;
5627     break;
5628   case AArch64::TBNZW:
5629   case AArch64::TBNZX:
5630     TargetBBInMI = 2;
5631     IsNegativeBranch = true;
5632     IsTestAndBranch = true;
5633     break;
5634   }
5635   // So we increment a zero register and test for bits other
5636   // than bit 0? Conservatively bail out in case the verifier
5637   // missed this case.
5638   if (IsTestAndBranch && MI.getOperand(1).getImm())
5639     return false;
5640 
5641   // Find Definition.
5642   assert(MI.getParent() && "Incomplete machine instruction\n");
5643   MachineBasicBlock *MBB = MI.getParent();
5644   MachineFunction *MF = MBB->getParent();
5645   MachineRegisterInfo *MRI = &MF->getRegInfo();
5646   Register VReg = MI.getOperand(0).getReg();
5647   if (!Register::isVirtualRegister(VReg))
5648     return false;
5649 
5650   MachineInstr *DefMI = MRI->getVRegDef(VReg);
5651 
5652   // Look through COPY instructions to find definition.
5653   while (DefMI->isCopy()) {
5654     Register CopyVReg = DefMI->getOperand(1).getReg();
5655     if (!MRI->hasOneNonDBGUse(CopyVReg))
5656       return false;
5657     if (!MRI->hasOneDef(CopyVReg))
5658       return false;
5659     DefMI = MRI->getVRegDef(CopyVReg);
5660   }
5661 
5662   switch (DefMI->getOpcode()) {
5663   default:
5664     return false;
5665   // Fold AND into a TBZ/TBNZ if constant operand is power of 2.
5666   case AArch64::ANDWri:
5667   case AArch64::ANDXri: {
5668     if (IsTestAndBranch)
5669       return false;
5670     if (DefMI->getParent() != MBB)
5671       return false;
5672     if (!MRI->hasOneNonDBGUse(VReg))
5673       return false;
5674 
5675     bool Is32Bit = (DefMI->getOpcode() == AArch64::ANDWri);
5676     uint64_t Mask = AArch64_AM::decodeLogicalImmediate(
5677         DefMI->getOperand(2).getImm(), Is32Bit ? 32 : 64);
5678     if (!isPowerOf2_64(Mask))
5679       return false;
5680 
5681     MachineOperand &MO = DefMI->getOperand(1);
5682     Register NewReg = MO.getReg();
5683     if (!Register::isVirtualRegister(NewReg))
5684       return false;
5685 
5686     assert(!MRI->def_empty(NewReg) && "Register must be defined.");
5687 
5688     MachineBasicBlock &RefToMBB = *MBB;
5689     MachineBasicBlock *TBB = MI.getOperand(1).getMBB();
5690     DebugLoc DL = MI.getDebugLoc();
5691     unsigned Imm = Log2_64(Mask);
5692     unsigned Opc = (Imm < 32)
5693                        ? (IsNegativeBranch ? AArch64::TBNZW : AArch64::TBZW)
5694                        : (IsNegativeBranch ? AArch64::TBNZX : AArch64::TBZX);
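    // Worked example, matching the doc comment above: Mask = 0x400 gives
    // Imm = Log2_64(0x400) = 10, so "and w8, w8, #0x400; cbnz w8, L1"
    // becomes "tbnz w8, #10, L1".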
5695     MachineInstr *NewMI = BuildMI(RefToMBB, MI, DL, get(Opc))
5696                               .addReg(NewReg)
5697                               .addImm(Imm)
5698                               .addMBB(TBB);
5699     // Register lives on to the new TBZ/TBNZ now.
5700     MO.setIsKill(false);
5701 
5702     // For an immediate smaller than 32, we must use the 32-bit (W)
5703     // variant in all cases, because the 64-bit variant cannot encode
5704     // those bit numbers.
5705     // Therefore, if the input register is 64-bit, we need to use its
5706     // 32-bit sub-register.
5707     if (!Is32Bit && Imm < 32)
5708       NewMI->getOperand(0).setSubReg(AArch64::sub_32);
5709     MI.eraseFromParent();
5710     return true;
5711   }
5712   // Look for CSINC
5713   case AArch64::CSINCWr:
5714   case AArch64::CSINCXr: {
5715     if (!(DefMI->getOperand(1).getReg() == AArch64::WZR &&
5716           DefMI->getOperand(2).getReg() == AArch64::WZR) &&
5717         !(DefMI->getOperand(1).getReg() == AArch64::XZR &&
5718           DefMI->getOperand(2).getReg() == AArch64::XZR))
5719       return false;
5720 
5721     if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, true) != -1)
5722       return false;
5723 
5724     AArch64CC::CondCode CC = (AArch64CC::CondCode)DefMI->getOperand(3).getImm();
5725     // Convert only when the condition code is not modified between
5726     // the CSINC and the branch. The CC may be used by other
5727     // instructions in between.
5728     if (areCFlagsAccessedBetweenInstrs(DefMI, MI, &getRegisterInfo(), AK_Write))
5729       return false;
5730     MachineBasicBlock &RefToMBB = *MBB;
5731     MachineBasicBlock *TBB = MI.getOperand(TargetBBInMI).getMBB();
5732     DebugLoc DL = MI.getDebugLoc();
5733     if (IsNegativeBranch)
5734       CC = AArch64CC::getInvertedCondCode(CC);
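    // Worked example (case 1 of the doc comment): "csinc w9, wzr, wzr, eq"
    // sets w9 to 1 exactly when EQ is false, so "tbnz w9, #0, L1" is the
    // same as "b.ne L1"; the inversion above produces that branch directly.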
5735     BuildMI(RefToMBB, MI, DL, get(AArch64::Bcc)).addImm(CC).addMBB(TBB);
5736     MI.eraseFromParent();
5737     return true;
5738   }
5739   }
5740 }
5741 
5742 std::pair<unsigned, unsigned>
5743 AArch64InstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const {
5744   const unsigned Mask = AArch64II::MO_FRAGMENT;
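  // For example (assuming the usual AArch64II flag layout), a flags word of
  // MO_PAGEOFF | MO_NC splits into the direct part MO_PAGEOFF and the
  // bitmask part MO_NC.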
5745   return std::make_pair(TF & Mask, TF & ~Mask);
5746 }
5747 
5748 ArrayRef<std::pair<unsigned, const char *>>
5749 AArch64InstrInfo::getSerializableDirectMachineOperandTargetFlags() const {
5750   using namespace AArch64II;
5751 
5752   static const std::pair<unsigned, const char *> TargetFlags[] = {
5753       {MO_PAGE, "aarch64-page"}, {MO_PAGEOFF, "aarch64-pageoff"},
5754       {MO_G3, "aarch64-g3"},     {MO_G2, "aarch64-g2"},
5755       {MO_G1, "aarch64-g1"},     {MO_G0, "aarch64-g0"},
5756       {MO_HI12, "aarch64-hi12"}};
5757   return makeArrayRef(TargetFlags);
5758 }
5759 
5760 ArrayRef<std::pair<unsigned, const char *>>
5761 AArch64InstrInfo::getSerializableBitmaskMachineOperandTargetFlags() const {
5762   using namespace AArch64II;
5763 
5764   static const std::pair<unsigned, const char *> TargetFlags[] = {
5765       {MO_COFFSTUB, "aarch64-coffstub"},
5766       {MO_GOT, "aarch64-got"},
5767       {MO_NC, "aarch64-nc"},
5768       {MO_S, "aarch64-s"},
5769       {MO_TLS, "aarch64-tls"},
5770       {MO_DLLIMPORT, "aarch64-dllimport"},
5771       {MO_PREL, "aarch64-prel"},
5772       {MO_TAGGED, "aarch64-tagged"}};
5773   return makeArrayRef(TargetFlags);
5774 }
5775 
5776 ArrayRef<std::pair<MachineMemOperand::Flags, const char *>>
5777 AArch64InstrInfo::getSerializableMachineMemOperandTargetFlags() const {
5778   static const std::pair<MachineMemOperand::Flags, const char *> TargetFlags[] =
5779       {{MOSuppressPair, "aarch64-suppress-pair"},
5780        {MOStridedAccess, "aarch64-strided-access"}};
5781   return makeArrayRef(TargetFlags);
5782 }
5783 
5784 /// Constants defining how certain sequences should be outlined.
5785 /// This encompasses how an outlined function should be called, and what kind of
5786 /// frame should be emitted for that outlined function.
5787 ///
5788 /// \p MachineOutlinerDefault implies that the function should be called with
5789 /// a save and restore of LR to the stack.
5790 ///
5791 /// That is,
5792 ///
5793 /// I1     Save LR                    OUTLINED_FUNCTION:
5794 /// I2 --> BL OUTLINED_FUNCTION       I1
5795 /// I3     Restore LR                 I2
5796 ///                                   I3
5797 ///                                   RET
5798 ///
5799 /// * Call construction overhead: 3 (save + BL + restore)
5800 /// * Frame construction overhead: 1 (ret)
5801 /// * Requires stack fixups? Yes
5802 ///
5803 /// \p MachineOutlinerTailCall implies that the function is being created from
5804 /// a sequence of instructions ending in a return.
5805 ///
5806 /// That is,
5807 ///
5808 /// I1                             OUTLINED_FUNCTION:
5809 /// I2 --> B OUTLINED_FUNCTION     I1
5810 /// RET                            I2
5811 ///                                RET
5812 ///
5813 /// * Call construction overhead: 1 (B)
5814 /// * Frame construction overhead: 0 (Return included in sequence)
5815 /// * Requires stack fixups? No
5816 ///
5817 /// \p MachineOutlinerNoLRSave implies that the function should be called using
5818 /// a BL instruction, but doesn't require LR to be saved and restored. This
5819 /// happens when LR is known to be dead.
5820 ///
5821 /// That is,
5822 ///
5823 /// I1                                OUTLINED_FUNCTION:
5824 /// I2 --> BL OUTLINED_FUNCTION       I1
5825 /// I3                                I2
5826 ///                                   I3
5827 ///                                   RET
5828 ///
5829 /// * Call construction overhead: 1 (BL)
5830 /// * Frame construction overhead: 1 (RET)
5831 /// * Requires stack fixups? No
5832 ///
5833 /// \p MachineOutlinerThunk implies that the function is being created from
5834 /// a sequence of instructions ending in a call. The outlined function is
5835 /// called with a BL instruction, and the outlined function tail-calls the
5836 /// original call destination.
5837 ///
5838 /// That is,
5839 ///
5840 /// I1                                OUTLINED_FUNCTION:
5841 /// I2 --> BL OUTLINED_FUNCTION       I1
5842 /// BL f                              I2
5843 ///                                   B f
5844 /// * Call construction overhead: 1 (BL)
5845 /// * Frame construction overhead: 0
5846 /// * Requires stack fixups? No
5847 ///
5848 /// \p MachineOutlinerRegSave implies that the function should be called with a
5849 /// save and restore of LR to an available register. This allows us to avoid
5850 /// stack fixups. Note that this outlining variant is compatible with the
5851 /// NoLRSave case.
5852 ///
5853 /// That is,
5854 ///
5855 /// I1     Save LR                    OUTLINED_FUNCTION:
5856 /// I2 --> BL OUTLINED_FUNCTION       I1
5857 /// I3     Restore LR                 I2
5858 ///                                   I3
5859 ///                                   RET
5860 ///
5861 /// * Call construction overhead: 3 (save + BL + restore)
5862 /// * Frame construction overhead: 1 (ret)
5863 /// * Requires stack fixups? No
5864 enum MachineOutlinerClass {
5865   MachineOutlinerDefault,  /// Emit a save, restore, call, and return.
5866   MachineOutlinerTailCall, /// Only emit a branch.
5867   MachineOutlinerNoLRSave, /// Emit a call and return.
5868   MachineOutlinerThunk,    /// Emit a call and tail-call.
5869   MachineOutlinerRegSave   /// Same as default, but save to a register.
5870 };
5871 
5872 enum MachineOutlinerMBBFlags {
5873   LRUnavailableSomewhere = 0x2,
5874   HasCalls = 0x4,
5875   UnsafeRegsDead = 0x8
5876 };
5877 
5878 unsigned
5879 AArch64InstrInfo::findRegisterToSaveLRTo(const outliner::Candidate &C) const {
5880   assert(C.LRUWasSet && "LRU wasn't set?");
5881   MachineFunction *MF = C.getMF();
5882   const AArch64RegisterInfo *ARI = static_cast<const AArch64RegisterInfo *>(
5883       MF->getSubtarget().getRegisterInfo());
5884 
5885   // Check if there is an available register across the sequence that we can
5886   // use.
5887   for (unsigned Reg : AArch64::GPR64RegClass) {
5888     if (!ARI->isReservedReg(*MF, Reg) &&
5889         Reg != AArch64::LR &&  // LR is not reserved, but don't use it.
5890         Reg != AArch64::X16 && // X16 is not guaranteed to be preserved.
5891         Reg != AArch64::X17 && // Ditto for X17.
5892         C.LRU.available(Reg) && C.UsedInSequence.available(Reg))
5893       return Reg;
5894   }
5895 
5896   // No suitable register. Return 0.
5897   return 0u;
5898 }
5899 
5900 static bool
5901 outliningCandidatesSigningScopeConsensus(const outliner::Candidate &a,
5902                                          const outliner::Candidate &b) {
5903   const auto &MFIa = a.getMF()->getInfo<AArch64FunctionInfo>();
5904   const auto &MFIb = b.getMF()->getInfo<AArch64FunctionInfo>();
5905 
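  // Querying shouldSignReturnAddress for both the non-spilling (false) and
  // the LR-spilling (true) case pins down the signing scope implied by the
  // "sign-return-address" attribute, so two candidates only match if they
  // agree on both answers.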
5906   return MFIa->shouldSignReturnAddress(false) == MFIb->shouldSignReturnAddress(false) &&
5907          MFIa->shouldSignReturnAddress(true) == MFIb->shouldSignReturnAddress(true);
5908 }
5909 
5910 static bool
5911 outliningCandidatesSigningKeyConsensus(const outliner::Candidate &a,
5912                                        const outliner::Candidate &b) {
5913   const auto &MFIa = a.getMF()->getInfo<AArch64FunctionInfo>();
5914   const auto &MFIb = b.getMF()->getInfo<AArch64FunctionInfo>();
5915 
5916   return MFIa->shouldSignWithBKey() == MFIb->shouldSignWithBKey();
5917 }
5918 
5919 static bool outliningCandidatesV8_3OpsConsensus(const outliner::Candidate &a,
5920                                                 const outliner::Candidate &b) {
5921   const AArch64Subtarget &SubtargetA =
5922       a.getMF()->getSubtarget<AArch64Subtarget>();
5923   const AArch64Subtarget &SubtargetB =
5924       b.getMF()->getSubtarget<AArch64Subtarget>();
5925   return SubtargetA.hasV8_3aOps() == SubtargetB.hasV8_3aOps();
5926 }
5927 
5928 outliner::OutlinedFunction AArch64InstrInfo::getOutliningCandidateInfo(
5929     std::vector<outliner::Candidate> &RepeatedSequenceLocs) const {
5930   outliner::Candidate &FirstCand = RepeatedSequenceLocs[0];
5931   unsigned SequenceSize =
5932       std::accumulate(FirstCand.front(), std::next(FirstCand.back()), 0,
5933                       [this](unsigned Sum, const MachineInstr &MI) {
5934                         return Sum + getInstSizeInBytes(MI);
5935                       });
5936   unsigned NumBytesToCreateFrame = 0;
5937 
5938   // We only allow outlining for functions having exactly matching return
5939   // address signing attributes, i.e., all share the same value for the
5940   // attribute "sign-return-address" and all share the same type of key they
5941   // are signed with.
5942   // Additionally we require all functions to simultaneously either support
5943   // v8.3a features or not. Otherwise an outlined function could get signed
5944   // using dedicated v8.3 instructions and a call from a function that doesn't
5945   // support v8.3 instructions would therefore be invalid.
5946   if (std::adjacent_find(
5947           RepeatedSequenceLocs.begin(), RepeatedSequenceLocs.end(),
5948           [](const outliner::Candidate &a, const outliner::Candidate &b) {
5949             // Return true if a and b are non-equal w.r.t. return address
5950             // signing or support of v8.3a features
5951             if (outliningCandidatesSigningScopeConsensus(a, b) &&
5952                 outliningCandidatesSigningKeyConsensus(a, b) &&
5953                 outliningCandidatesV8_3OpsConsensus(a, b)) {
5954               return false;
5955             }
5956             return true;
5957           }) != RepeatedSequenceLocs.end()) {
5958     return outliner::OutlinedFunction();
5959   }
5960 
5961   // Since at this point all candidates agree on their return address signing
5962   // picking just one is fine. If the candidate functions potentially sign their
5963   // return addresses, the outlined function should do the same. Note that in
5964   // the case of "sign-return-address"="non-leaf" this is an assumption: It is
5965   // not certain that the outlined function will have to sign its return
5966   // address, but this decision is made later, when the decision to outline
5967   // has already been made.
5968   // The same holds for the number of additional instructions we need: On
5969   // v8.3a RET can be replaced by RETAA/RETAB and no AUT instruction is
5970   // necessary. However, at this point we don't know if the outlined function
5971   // will have a RET instruction so we assume the worst.
5972   const TargetRegisterInfo &TRI = getRegisterInfo();
5973   if (FirstCand.getMF()
5974           ->getInfo<AArch64FunctionInfo>()
5975           ->shouldSignReturnAddress(true)) {
5976     // One PAC and one AUT instructions
5977     NumBytesToCreateFrame += 8;
5978 
5979     // We have to check if sp modifying instructions would get outlined.
5980     // If so we only allow outlining if sp is unchanged overall, so matching
5981     // sub and add instructions are okay to outline, all other sp modifications
5982     // are not
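    // For example, a matched pair "sub sp, sp, #16 ... add sp, sp, #16"
    // nets out to zero and may be outlined, while an unmatched
    // "add sp, sp, #16" (or any other kind of write to sp) is rejected.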
5983     auto hasIllegalSPModification = [&TRI](outliner::Candidate &C) {
5984       int SPValue = 0;
5985       MachineBasicBlock::iterator MBBI = C.front();
5986       for (;;) {
5987         if (MBBI->modifiesRegister(AArch64::SP, &TRI)) {
5988           switch (MBBI->getOpcode()) {
5989           case AArch64::ADDXri:
5990           case AArch64::ADDWri:
5991             assert(MBBI->getNumOperands() == 4 && "Wrong number of operands");
5992             assert(MBBI->getOperand(2).isImm() &&
5993                    "Expected operand to be immediate");
5994             assert(MBBI->getOperand(1).isReg() &&
5995                    "Expected operand to be a register");
5996             // Check if the add just increments sp. If so, we search for
5997             // matching sub instructions that decrement sp. If not, the
5998             // modification is illegal
5999             if (MBBI->getOperand(1).getReg() == AArch64::SP)
6000               SPValue += MBBI->getOperand(2).getImm();
6001             else
6002               return true;
6003             break;
6004           case AArch64::SUBXri:
6005           case AArch64::SUBWri:
6006             assert(MBBI->getNumOperands() == 4 && "Wrong number of operands");
6007             assert(MBBI->getOperand(2).isImm() &&
6008                    "Expected operand to be immediate");
6009             assert(MBBI->getOperand(1).isReg() &&
6010                    "Expected operand to be a register");
6011             // Check if the sub just decrements sp. If so, we search for
6012             // matching add instructions that increment sp. If not, the
6013             // modification is illegal
6014             if (MBBI->getOperand(1).getReg() == AArch64::SP)
6015               SPValue -= MBBI->getOperand(2).getImm();
6016             else
6017               return true;
6018             break;
6019           default:
6020             return true;
6021           }
6022         }
6023         if (MBBI == C.back())
6024           break;
6025         ++MBBI;
6026       }
6027       if (SPValue)
6028         return true;
6029       return false;
6030     };
6031     // Remove candidates with illegal stack modifying instructions
6032     RepeatedSequenceLocs.erase(std::remove_if(RepeatedSequenceLocs.begin(),
6033                                               RepeatedSequenceLocs.end(),
6034                                               hasIllegalSPModification),
6035                                RepeatedSequenceLocs.end());
6036 
6037     // If the sequence doesn't have enough candidates left, then we're done.
6038     if (RepeatedSequenceLocs.size() < 2)
6039       return outliner::OutlinedFunction();
6040   }
6041 
6042   // Properties about candidate MBBs that hold for all of them.
6043   unsigned FlagsSetInAll = 0xF;
6044 
6045   // Compute liveness information for each candidate, and set FlagsSetInAll.
6046   std::for_each(RepeatedSequenceLocs.begin(), RepeatedSequenceLocs.end(),
6047                 [&FlagsSetInAll](outliner::Candidate &C) {
6048                   FlagsSetInAll &= C.Flags;
6049                 });
6050 
6051   // According to the AArch64 Procedure Call Standard, the following are
6052   // undefined on entry/exit from a function call:
6053   //
6054   // * Registers x16, x17, (and thus w16, w17)
6055   // * Condition codes (and thus the NZCV register)
6056   //
6057   // Because of this, we can't outline any sequence of instructions where
6058   // one of these registers is live into/across it. Thus, we need to delete
6059   // those candidates.
6062   auto CantGuaranteeValueAcrossCall = [&TRI](outliner::Candidate &C) {
6063     // If the unsafe registers in this block are all dead, then we don't need
6064     // to compute liveness here.
6065     if (C.Flags & UnsafeRegsDead)
6066       return false;
6067     C.initLRU(TRI);
6068     LiveRegUnits LRU = C.LRU;
6069     return (!LRU.available(AArch64::W16) || !LRU.available(AArch64::W17) ||
6070             !LRU.available(AArch64::NZCV));
6071   };
6072 
6073   // Are there any candidates where those registers are live?
6074   if (!(FlagsSetInAll & UnsafeRegsDead)) {
6075     // Erase every candidate that violates the restrictions above. (It could be
6076     // true that we have viable candidates, so it's not worth bailing out in
6077     // the case that, say, 1 out of 20 candidates violate the restrictions.)
6078     RepeatedSequenceLocs.erase(std::remove_if(RepeatedSequenceLocs.begin(),
6079                                               RepeatedSequenceLocs.end(),
6080                                               CantGuaranteeValueAcrossCall),
6081                                RepeatedSequenceLocs.end());
6082 
6083     // If the sequence doesn't have enough candidates left, then we're done.
6084     if (RepeatedSequenceLocs.size() < 2)
6085       return outliner::OutlinedFunction();
6086   }
6087 
6088   // At this point, we have only "safe" candidates to outline. Figure out
6089   // frame + call instruction information.
6090 
6091   unsigned LastInstrOpcode = RepeatedSequenceLocs[0].back()->getOpcode();
6092 
6093   // Helper lambda which sets call information for every candidate.
6094   auto SetCandidateCallInfo =
6095       [&RepeatedSequenceLocs](unsigned CallID, unsigned NumBytesForCall) {
6096         for (outliner::Candidate &C : RepeatedSequenceLocs)
6097           C.setCallInfo(CallID, NumBytesForCall);
6098       };
6099 
6100   unsigned FrameID = MachineOutlinerDefault;
6101   NumBytesToCreateFrame += 4;
6102 
6103   bool HasBTI = any_of(RepeatedSequenceLocs, [](outliner::Candidate &C) {
6104     return C.getMF()->getInfo<AArch64FunctionInfo>()->branchTargetEnforcement();
6105   });
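  // HasBTI is consulted when the frame type is chosen below: a candidate
  // ending in BLR/BLRNoIP only gets the thunk treatment when none of the
  // containing functions enforce branch targets.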
6106 
6107   // We check to see if CFI Instructions are present, and if they are
6108   // we find the number of CFI Instructions in the candidates.
6109   unsigned CFICount = 0;
6110   MachineBasicBlock::iterator MBBI = RepeatedSequenceLocs[0].front();
6111   for (unsigned Loc = RepeatedSequenceLocs[0].getStartIdx();
6112        Loc < RepeatedSequenceLocs[0].getEndIdx() + 1; Loc++) {
6113     const std::vector<MCCFIInstruction> &CFIInstructions =
6114         RepeatedSequenceLocs[0].getMF()->getFrameInstructions();
6115     if (MBBI->isCFIInstruction()) {
6116       unsigned CFIIndex = MBBI->getOperand(0).getCFIIndex();
6117       MCCFIInstruction CFI = CFIInstructions[CFIIndex];
6118       CFICount++;
6119     }
6120     MBBI++;
6121   }
6122 
6123   // We compare the number of found CFI Instructions to  the number of CFI
6124   // instructions in the parent function for each candidate.  We must check this
6125   // since if we outline one of the CFI instructions in a function, we have to
6126   // outline them all for correctness. If we do not, the address offsets will be
6127   // incorrect between the two sections of the program.
6128   for (outliner::Candidate &C : RepeatedSequenceLocs) {
6129     std::vector<MCCFIInstruction> CFIInstructions =
6130         C.getMF()->getFrameInstructions();
6131 
6132     if (CFICount > 0 && CFICount != CFIInstructions.size())
6133       return outliner::OutlinedFunction();
6134   }
6135 
6136   // Returns true if an instructions is safe to fix up, false otherwise.
6137   auto IsSafeToFixup = [this, &TRI](MachineInstr &MI) {
6138     if (MI.isCall())
6139       return true;
6140 
6141     if (!MI.modifiesRegister(AArch64::SP, &TRI) &&
6142         !MI.readsRegister(AArch64::SP, &TRI))
6143       return true;
6144 
6145     // Any modification of SP will break our code to save/restore LR.
6146     // FIXME: We could handle some instructions which add a constant
6147     // offset to SP, with a bit more work.
6148     if (MI.modifiesRegister(AArch64::SP, &TRI))
6149       return false;
6150 
6151     // At this point, we have a stack instruction that we might need to
6152     // fix up. We'll handle it if it's a load or store.
6153     if (MI.mayLoadOrStore()) {
6154       const MachineOperand *Base; // Filled with the base operand of MI.
6155       int64_t Offset;             // Filled with the offset of MI.
6156       bool OffsetIsScalable;
6157 
6158       // Does it allow us to offset the base operand and is the base the
6159       // register SP?
6160       if (!getMemOperandWithOffset(MI, Base, Offset, OffsetIsScalable, &TRI) ||
6161           !Base->isReg() || Base->getReg() != AArch64::SP)
6162         return false;
6163 
6164       // Fix-up code below assumes bytes.
6165       if (OffsetIsScalable)
6166         return false;
6167 
6168       // Find the minimum/maximum offset for this instruction and check
6169       // if fixing it up would be in range.
6170       int64_t MinOffset,
6171           MaxOffset;  // Unscaled offsets for the instruction.
6172       TypeSize Scale(0U, false); // The scale to multiply the offsets by.
6173       unsigned DummyWidth;
6174       getMemOpInfo(MI.getOpcode(), Scale, DummyWidth, MinOffset, MaxOffset);
6175 
6176       Offset += 16; // Update the offset to what it would be if we outlined.
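      // Worked example (assuming an LDRXui-style encoding with Scale = 8 and
      // a scaled range of 0..4095): "ldr x0, [sp, #8]" becomes
      // "ldr x0, [sp, #24]" after the adjustment, which is still in range;
      // an access already at the top of the range fails the check below.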
6177       if (Offset < MinOffset * (int64_t)Scale.getFixedSize() ||
6178           Offset > MaxOffset * (int64_t)Scale.getFixedSize())
6179         return false;
6180 
6181       // It's in range, so we can outline it.
6182       return true;
6183     }
6184 
6185     // FIXME: Add handling for instructions like "add x0, sp, #8".
6186 
6187     // We can't fix it up, so don't outline it.
6188     return false;
6189   };
6190 
6191   // True if it's possible to fix up each stack instruction in this sequence.
6192   // Important for frames/call variants that modify the stack.
6193   bool AllStackInstrsSafe = std::all_of(
6194       FirstCand.front(), std::next(FirstCand.back()), IsSafeToFixup);
6195 
6196   // If the last instruction in any candidate is a terminator, then we should
6197   // tail call all of the candidates.
6198   if (RepeatedSequenceLocs[0].back()->isTerminator()) {
6199     FrameID = MachineOutlinerTailCall;
6200     NumBytesToCreateFrame = 0;
6201     SetCandidateCallInfo(MachineOutlinerTailCall, 4);
6202   }
6203 
6204   else if (LastInstrOpcode == AArch64::BL ||
6205            ((LastInstrOpcode == AArch64::BLR ||
6206              LastInstrOpcode == AArch64::BLRNoIP) &&
6207             !HasBTI)) {
6208     // FIXME: Do we need to check if the code after this uses the value of LR?
6209     FrameID = MachineOutlinerThunk;
6210     NumBytesToCreateFrame = 0;
6211     SetCandidateCallInfo(MachineOutlinerThunk, 4);
6212   }
6213 
6214   else {
6215     // We need to decide how to emit calls + frames. We can always emit the same
6216     // frame if we don't need to save to the stack. If we have to save to the
6217     // stack, then we need a different frame.
6218     unsigned NumBytesNoStackCalls = 0;
6219     std::vector<outliner::Candidate> CandidatesWithoutStackFixups;
6220 
6221     // Check if we have to save LR.
6222     for (outliner::Candidate &C : RepeatedSequenceLocs) {
6223       C.initLRU(TRI);
6224 
6225       // If we have a noreturn caller, then we're going to be conservative and
6226       // say that we have to save LR. If we don't have a ret at the end of the
6227       // block, then we can't reason about liveness accurately.
6228       //
6229       // FIXME: We can probably do better than always disabling this in
6230       // noreturn functions by fixing up the liveness info.
6231       bool IsNoReturn =
6232           C.getMF()->getFunction().hasFnAttribute(Attribute::NoReturn);
6233 
6234       // Is LR available? If so, we don't need a save.
6235       if (C.LRU.available(AArch64::LR) && !IsNoReturn) {
6236         NumBytesNoStackCalls += 4;
6237         C.setCallInfo(MachineOutlinerNoLRSave, 4);
6238         CandidatesWithoutStackFixups.push_back(C);
6239       }
6240 
6241       // Is an unused register available? If so, we won't modify the stack, so
6242       // we can outline with the same frame type as those that don't save LR.
6243       else if (findRegisterToSaveLRTo(C)) {
6244         NumBytesNoStackCalls += 12;
6245         C.setCallInfo(MachineOutlinerRegSave, 12);
6246         CandidatesWithoutStackFixups.push_back(C);
6247       }
6248 
6249       // Is SP used in the sequence at all? If not, we don't have to modify
6250       // the stack, so we are guaranteed to get the same frame.
6251       else if (C.UsedInSequence.available(AArch64::SP)) {
6252         NumBytesNoStackCalls += 12;
6253         C.setCallInfo(MachineOutlinerDefault, 12);
6254         CandidatesWithoutStackFixups.push_back(C);
6255       }
6256 
6257       // If we outline this, we need to modify the stack. Pretend we don't
6258       // outline this by saving all of its bytes.
6259       else {
6260         NumBytesNoStackCalls += SequenceSize;
6261       }
6262     }
6263 
6264     // If there are no places where we have to save LR, then note that we
6265     // don't have to update the stack. Otherwise, give every candidate the
6266     // default call type, as long as it's safe to do so.
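    // Cost sketch: the default frame charges 12 bytes per call site
    // (save LR + bl + restore LR), so RepeatedSequenceLocs.size() * 12 is
    // what outlining would cost if every candidate used the default call.
    // If the cheaper variants counted above already fit in that budget (or
    // the stack can't be fixed up safely), keep only those candidates.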
6267     if (!AllStackInstrsSafe ||
6268         NumBytesNoStackCalls <= RepeatedSequenceLocs.size() * 12) {
6269       RepeatedSequenceLocs = CandidatesWithoutStackFixups;
6270       FrameID = MachineOutlinerNoLRSave;
6271     } else {
6272       SetCandidateCallInfo(MachineOutlinerDefault, 12);
6273 
6274       // Bugzilla ID: 46767
6275       // TODO: Check if fixing up the stack more than once is safe so we can
6276       // outline these.
6277       //
6278       // An outline resulting in a caller that requires stack fixups at the
6279       // callsite to a callee that also requires stack fixups can happen when
6280       // there are no available registers at the candidate callsite for a
6281       // candidate that itself also has calls.
6282       //
6283       // In other words if function_containing_sequence in the following pseudo
6284       // assembly requires that we save LR at the point of the call, but there
6285       // are no available registers: in this case we save using SP and as a
6286       // result the SP offsets require stack fixups by multiples of 16.
6287       //
6288       // function_containing_sequence:
6289       //   ...
6290       //   save LR to SP <- Requires stack instr fixups in OUTLINED_FUNCTION_N
6291       //   call OUTLINED_FUNCTION_N
6292       //   restore LR from SP
6293       //   ...
6294       //
6295       // OUTLINED_FUNCTION_N:
6296       //   save LR to SP <- Requires stack instr fixups in OUTLINED_FUNCTION_N
6297       //   ...
6298       //   bl foo
6299       //   restore LR from SP
6300       //   ret
6301       //
6302       // Because the code to handle more than one stack fixup does not
6303       // currently have the proper checks for legality, these cases will assert
6304       // in the AArch64 MachineOutliner. This is because the code to do this
6305       // needs more hardening, testing, better checks that generated code is
6306       // legal, etc., and because it is only verified to handle a single pass of
6307       // stack fixup.
6308       //
6309       // The assert happens in AArch64InstrInfo::buildOutlinedFrame to catch
6310       // these cases until they are known to be handled. Bugzilla 46767 is
6311       // referenced in comments at the assert site.
6312       //
6313       // To avoid asserting (or generating non-legal code on noassert builds)
6314       // we remove all candidates which would need more than one stack fixup by
6315       // pruning the cases where the candidate has calls while also having no
6316       // available LR and having no available general purpose registers to copy
6317       // LR to (i.e. one extra stack save/restore).
6318       //
6319       if (FlagsSetInAll & MachineOutlinerMBBFlags::HasCalls) {
6320         erase_if(RepeatedSequenceLocs, [this](outliner::Candidate &C) {
6321           return (std::any_of(
6322                      C.front(), std::next(C.back()),
6323                      [](const MachineInstr &MI) { return MI.isCall(); })) &&
6324                  (!C.LRU.available(AArch64::LR) || !findRegisterToSaveLRTo(C));
6325         });
6326       }
6327     }
6328 
6329     // If we dropped all of the candidates, bail out here.
6330     if (RepeatedSequenceLocs.size() < 2) {
6331       RepeatedSequenceLocs.clear();
6332       return outliner::OutlinedFunction();
6333     }
6334   }
6335 
6336   // Does every candidate's MBB contain a call? If so, then we might have a call
6337   // in the range.
6338   if (FlagsSetInAll & MachineOutlinerMBBFlags::HasCalls) {
6339     // Check if the range contains a call. These require a save + restore of the
6340     // link register.
6341     bool ModStackToSaveLR = false;
6342     if (std::any_of(FirstCand.front(), FirstCand.back(),
6343                     [](const MachineInstr &MI) { return MI.isCall(); }))
6344       ModStackToSaveLR = true;
6345 
6346     // Handle the last instruction separately. If this is a tail call, then the
6347     // last instruction is a call. We don't want to save + restore in this case.
6348     // However, it is possible that the last instruction is a call without
6349     // it being valid to tail call this sequence. We should consider this as
6350     // well.
6351     else if (FrameID != MachineOutlinerThunk &&
6352              FrameID != MachineOutlinerTailCall && FirstCand.back()->isCall())
6353       ModStackToSaveLR = true;
6354 
6355     if (ModStackToSaveLR) {
6356       // We can't fix up the stack. Bail out.
6357       if (!AllStackInstrsSafe) {
6358         RepeatedSequenceLocs.clear();
6359         return outliner::OutlinedFunction();
6360       }
6361 
6362       // Save + restore LR.
6363       NumBytesToCreateFrame += 8;
6364     }
6365   }
6366 
6367   // If we have CFI instructions, we can only outline if the outlined section
6368   // can be a tail call
6369   if (FrameID != MachineOutlinerTailCall && CFICount > 0)
6370     return outliner::OutlinedFunction();
6371 
6372   return outliner::OutlinedFunction(RepeatedSequenceLocs, SequenceSize,
6373                                     NumBytesToCreateFrame, FrameID);
6374 }
6375 
6376 bool AArch64InstrInfo::isFunctionSafeToOutlineFrom(
6377     MachineFunction &MF, bool OutlineFromLinkOnceODRs) const {
6378   const Function &F = MF.getFunction();
6379 
6380   // Can F be deduplicated by the linker? If it can, don't outline from it.
6381   if (!OutlineFromLinkOnceODRs && F.hasLinkOnceODRLinkage())
6382     return false;
6383 
6384   // Don't outline from functions with section markings; the program could
6385   // expect that all the code is in the named section.
6386   // FIXME: Allow outlining from multiple functions with the same section
6387   // marking.
6388   if (F.hasSection())
6389     return false;
6390 
6391   // Outlining from functions with redzones is unsafe since the outliner may
6392   // modify the stack. Check if hasRedZone is true or unknown; if yes, don't
6393   // outline from it.
6394   AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
6395   if (!AFI || AFI->hasRedZone().getValueOr(true))
6396     return false;
6397 
6398   // FIXME: Teach the outliner to generate/handle Windows unwind info.
6399   if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI())
6400     return false;
6401 
6402   // It's safe to outline from MF.
6403   return true;
6404 }
6405 
6406 bool AArch64InstrInfo::isMBBSafeToOutlineFrom(MachineBasicBlock &MBB,
6407                                               unsigned &Flags) const {
6408   // Check if LR is available through all of the MBB. If it's not, then set
6409   // a flag.
6410   assert(MBB.getParent()->getRegInfo().tracksLiveness() &&
6411          "Suitable Machine Function for outlining must track liveness");
6412   LiveRegUnits LRU(getRegisterInfo());
6413 
6414   std::for_each(MBB.rbegin(), MBB.rend(),
6415                 [&LRU](MachineInstr &MI) { LRU.accumulate(MI); });
6416 
6417   // Check if each of the unsafe registers are available...
6418   bool W16AvailableInBlock = LRU.available(AArch64::W16);
6419   bool W17AvailableInBlock = LRU.available(AArch64::W17);
6420   bool NZCVAvailableInBlock = LRU.available(AArch64::NZCV);
6421 
6422   // If all of these are dead (and not live out), we know we don't have to check
6423   // them later.
6424   if (W16AvailableInBlock && W17AvailableInBlock && NZCVAvailableInBlock)
6425     Flags |= MachineOutlinerMBBFlags::UnsafeRegsDead;
6426 
6427   // Now, add the live outs to the set.
6428   LRU.addLiveOuts(MBB);
6429 
6430   // If any of these registers is available in the MBB, but also a live out of
6431   // the block, then we know outlining is unsafe.
6432   if (W16AvailableInBlock && !LRU.available(AArch64::W16))
6433     return false;
6434   if (W17AvailableInBlock && !LRU.available(AArch64::W17))
6435     return false;
6436   if (NZCVAvailableInBlock && !LRU.available(AArch64::NZCV))
6437     return false;
6438 
6439   // Check if there's a call inside this MachineBasicBlock. If there is, then
6440   // set a flag.
6441   if (any_of(MBB, [](MachineInstr &MI) { return MI.isCall(); }))
6442     Flags |= MachineOutlinerMBBFlags::HasCalls;
6443 
6444   MachineFunction *MF = MBB.getParent();
6445 
6446   // In the event that we outline, we may have to save LR. If there is an
6447   // available register in the MBB, then we'll always save LR there. Check if
6448   // this is true.
6449   bool CanSaveLR = false;
6450   const AArch64RegisterInfo *ARI = static_cast<const AArch64RegisterInfo *>(
6451       MF->getSubtarget().getRegisterInfo());
6452 
6453   // Check if there is an available register across the sequence that we can
6454   // use.
6455   for (unsigned Reg : AArch64::GPR64RegClass) {
6456     if (!ARI->isReservedReg(*MF, Reg) && Reg != AArch64::LR &&
6457         Reg != AArch64::X16 && Reg != AArch64::X17 && LRU.available(Reg)) {
6458       CanSaveLR = true;
6459       break;
6460     }
6461   }
6462 
6463   // Check if we have a register we can save LR to, and if LR was used
6464   // somewhere. If both of those things are true, then we need to evaluate the
6465   // safety of outlining stack instructions later.
6466   if (!CanSaveLR && !LRU.available(AArch64::LR))
6467     Flags |= MachineOutlinerMBBFlags::LRUnavailableSomewhere;
6468 
6469   return true;
6470 }
6471 
6472 outliner::InstrType
6473 AArch64InstrInfo::getOutliningType(MachineBasicBlock::iterator &MIT,
6474                                    unsigned Flags) const {
6475   MachineInstr &MI = *MIT;
6476   MachineBasicBlock *MBB = MI.getParent();
6477   MachineFunction *MF = MBB->getParent();
6478   AArch64FunctionInfo *FuncInfo = MF->getInfo<AArch64FunctionInfo>();
6479 
6480   // Don't outline anything used for return address signing. The outlined
6481   // function will get signed later if needed
6482   switch (MI.getOpcode()) {
6483   case AArch64::PACIASP:
6484   case AArch64::PACIBSP:
6485   case AArch64::AUTIASP:
6486   case AArch64::AUTIBSP:
6487   case AArch64::RETAA:
6488   case AArch64::RETAB:
6489   case AArch64::EMITBKEY:
6490     return outliner::InstrType::Illegal;
6491   }
6492 
6493   // Don't outline LOHs.
6494   if (FuncInfo->getLOHRelated().count(&MI))
6495     return outliner::InstrType::Illegal;
6496 
6497   // We can only outline these if we will tail call the outlined function, or
6498   // fix up the CFI offsets. Currently, CFI instructions are outlined only
6499   // when the outlined section ends up being tail called.
6500   //
6501   // FIXME: If the proper fixups for the offset are implemented, this should be
6502   // possible.
6503   if (MI.isCFIInstruction())
6504     return outliner::InstrType::Legal;
6505 
6506   // Don't allow debug values to impact outlining type.
6507   if (MI.isDebugInstr() || MI.isIndirectDebugValue())
6508     return outliner::InstrType::Invisible;
6509 
6510   // At this point, KILL instructions don't really tell us much so we can go
6511   // ahead and skip over them.
6512   if (MI.isKill())
6513     return outliner::InstrType::Invisible;
6514 
6515   // Is this a terminator for a basic block?
6516   if (MI.isTerminator()) {
6517 
6518     // Is this the end of a function?
6519     if (MI.getParent()->succ_empty())
6520       return outliner::InstrType::Legal;
6521 
6522     // It's not, so don't outline it.
6523     return outliner::InstrType::Illegal;
6524   }
6525 
6526   // Make sure none of the operands are un-outlinable.
6527   for (const MachineOperand &MOP : MI.operands()) {
6528     if (MOP.isCPI() || MOP.isJTI() || MOP.isCFIIndex() || MOP.isFI() ||
6529         MOP.isTargetIndex())
6530       return outliner::InstrType::Illegal;
6531 
6532     // If it uses LR or W30 explicitly, then don't touch it.
6533     if (MOP.isReg() && !MOP.isImplicit() &&
6534         (MOP.getReg() == AArch64::LR || MOP.getReg() == AArch64::W30))
6535       return outliner::InstrType::Illegal;
6536   }
6537 
6538   // Special cases for instructions that can always be outlined, but will fail
6539   // the later tests. e.g. ADRPs, which are PC-relative use LR, but can always
6540   // be outlined because they don't require a *specific* value to be in LR.
6541   if (MI.getOpcode() == AArch64::ADRP)
6542     return outliner::InstrType::Legal;
6543 
6544   // If MI is a call we might be able to outline it. We don't want to outline
6545   // any calls that rely on the position of items on the stack. When we outline
6546   // something containing a call, we have to emit a save and restore of LR in
6547   // the outlined function. Currently, this always happens by saving LR to the
6548   // stack. Thus, if we outline, say, half the parameters for a function call
6549   // plus the call, then we'll break the callee's expectations for the layout
6550   // of the stack.
6551   //
6552   // FIXME: Allow calls to functions which construct a stack frame, as long
6553   // as they don't access arguments on the stack.
6554   // FIXME: Figure out some way to analyze functions defined in other modules.
6555   // We should be able to compute the memory usage based on the IR calling
6556   // convention, even if we can't see the definition.
6557   if (MI.isCall()) {
6558     // Get the function associated with the call. Look at each operand and find
6559     // the one that represents the callee and get its name.
6560     const Function *Callee = nullptr;
6561     for (const MachineOperand &MOP : MI.operands()) {
6562       if (MOP.isGlobal()) {
6563         Callee = dyn_cast<Function>(MOP.getGlobal());
6564         break;
6565       }
6566     }
6567 
6568     // Never outline calls to mcount.  There isn't any rule that would require
6569     // this, but the Linux kernel's "ftrace" feature depends on it.
6570     if (Callee && Callee->getName() == "\01_mcount")
6571       return outliner::InstrType::Illegal;
6572 
6573     // If we don't know anything about the callee, assume it depends on the
6574     // stack layout of the caller. In that case, it's only legal to outline
6575     // as a tail-call. Explicitly list the call instructions we know about so we
6576     // don't get unexpected results with call pseudo-instructions.
6577     auto UnknownCallOutlineType = outliner::InstrType::Illegal;
6578     if (MI.getOpcode() == AArch64::BLR ||
6579         MI.getOpcode() == AArch64::BLRNoIP || MI.getOpcode() == AArch64::BL)
6580       UnknownCallOutlineType = outliner::InstrType::LegalTerminator;
6581 
6582     if (!Callee)
6583       return UnknownCallOutlineType;
6584 
6585     // We have a function we have information about. Check if it's something
6586     // we can safely outline.
6587     MachineFunction *CalleeMF = MF->getMMI().getMachineFunction(*Callee);
6588 
6589     // We don't know what's going on with the callee at all. Don't touch it.
6590     if (!CalleeMF)
6591       return UnknownCallOutlineType;
6592 
6593     // Check if we know anything about the callee saves on the function. If we
6594     // don't, then don't touch it, since that implies that we haven't
6595     // computed anything about its stack frame yet.
6596     MachineFrameInfo &MFI = CalleeMF->getFrameInfo();
6597     if (!MFI.isCalleeSavedInfoValid() || MFI.getStackSize() > 0 ||
6598         MFI.getNumObjects() > 0)
6599       return UnknownCallOutlineType;
6600 
6601     // At this point, we can say that CalleeMF ought to not pass anything on the
6602     // stack. Therefore, we can outline it.
6603     return outliner::InstrType::Legal;
6604   }
6605 
6606   // Don't outline positions.
6607   if (MI.isPosition())
6608     return outliner::InstrType::Illegal;
6609 
6610   // Don't touch the link register or W30.
6611   if (MI.readsRegister(AArch64::W30, &getRegisterInfo()) ||
6612       MI.modifiesRegister(AArch64::W30, &getRegisterInfo()))
6613     return outliner::InstrType::Illegal;
6614 
6615   // Don't outline BTI instructions, because that will prevent the outlining
6616   // site from being indirectly callable.
6617   if (MI.getOpcode() == AArch64::HINT) {
6618     int64_t Imm = MI.getOperand(0).getImm();
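    // HINT immediates 32, 34, 36 and 38 encode BTI, BTI c, BTI j and BTI jc
    // respectively.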
6619     if (Imm == 32 || Imm == 34 || Imm == 36 || Imm == 38)
6620       return outliner::InstrType::Illegal;
6621   }
6622 
6623   return outliner::InstrType::Legal;
6624 }
6625 
6626 void AArch64InstrInfo::fixupPostOutline(MachineBasicBlock &MBB) const {
6627   for (MachineInstr &MI : MBB) {
6628     const MachineOperand *Base;
6629     unsigned Width;
6630     int64_t Offset;
6631     bool OffsetIsScalable;
6632 
6633     // Is this a load or store with an immediate offset with SP as the base?
6634     if (!MI.mayLoadOrStore() ||
6635         !getMemOperandWithOffsetWidth(MI, Base, Offset, OffsetIsScalable, Width,
6636                                       &RI) ||
6637         (Base->isReg() && Base->getReg() != AArch64::SP))
6638       continue;
6639 
6640     // It is, so we have to fix it up.
6641     TypeSize Scale(0U, false);
6642     int64_t Dummy1, Dummy2;
6643 
6644     MachineOperand &StackOffsetOperand = getMemOpBaseRegImmOfsOffsetOperand(MI);
6645     assert(StackOffsetOperand.isImm() && "Stack offset wasn't immediate!");
6646     getMemOpInfo(MI.getOpcode(), Scale, Width, Dummy1, Dummy2);
6647     assert(Scale != 0 && "Unexpected opcode!");
6648     assert(!OffsetIsScalable && "Expected offset to be a byte offset");
6649 
6650     // We've pushed the return address to the stack, so add 16 to the offset.
6651     // This is safe, since we already checked if it would overflow when we
6652     // checked if this instruction was legal to outline.
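    // Worked example: for "str x0, [sp, #8]" (Offset = 8, Scale = 8) the new
    // immediate is (8 + 16) / 8 = 3, so the rewritten instruction addresses
    // [sp, #24] inside the outlined function.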
6653     int64_t NewImm = (Offset + 16) / (int64_t)Scale.getFixedSize();
6654     StackOffsetOperand.setImm(NewImm);
6655   }
6656 }
6657 
6658 static void signOutlinedFunction(MachineFunction &MF, MachineBasicBlock &MBB,
6659                                  bool ShouldSignReturnAddr,
6660                                  bool ShouldSignReturnAddrWithAKey) {
6661   if (ShouldSignReturnAddr) {
6662     MachineBasicBlock::iterator MBBPAC = MBB.begin();
6663     MachineBasicBlock::iterator MBBAUT = MBB.getFirstTerminator();
6664     const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
6665     const TargetInstrInfo *TII = Subtarget.getInstrInfo();
6666     DebugLoc DL;
6667 
6668     if (MBBAUT != MBB.end())
6669       DL = MBBAUT->getDebugLoc();
6670 
6671     // At the very beginning of the basic block we insert the following
6672     // depending on the key type
6673     //
6674     // a_key:                   b_key:
6675     //    PACIASP                   EMITBKEY
6676     //    CFI_INSTRUCTION           PACIBSP
6677     //                              CFI_INSTRUCTION
6678     if (ShouldSignReturnAddrWithAKey) {
6679       BuildMI(MBB, MBBPAC, DebugLoc(), TII->get(AArch64::PACIASP))
6680           .setMIFlag(MachineInstr::FrameSetup);
6681     } else {
6682       BuildMI(MBB, MBBPAC, DebugLoc(), TII->get(AArch64::EMITBKEY))
6683           .setMIFlag(MachineInstr::FrameSetup);
6684       BuildMI(MBB, MBBPAC, DebugLoc(), TII->get(AArch64::PACIBSP))
6685           .setMIFlag(MachineInstr::FrameSetup);
6686     }
6687     unsigned CFIIndex =
6688         MF.addFrameInst(MCCFIInstruction::createNegateRAState(nullptr));
6689     BuildMI(MBB, MBBPAC, DebugLoc(), TII->get(AArch64::CFI_INSTRUCTION))
6690         .addCFIIndex(CFIIndex)
6691         .setMIFlags(MachineInstr::FrameSetup);
6692 
6693     // If v8.3a features are available we can replace a RET instruction by
6694     // RETAA or RETAB and omit the AUT instructions
6695     if (Subtarget.hasV8_3aOps() && MBBAUT != MBB.end() &&
6696         MBBAUT->getOpcode() == AArch64::RET) {
6697       BuildMI(MBB, MBBAUT, DL,
6698               TII->get(ShouldSignReturnAddrWithAKey ? AArch64::RETAA
6699                                                     : AArch64::RETAB))
6700           .copyImplicitOps(*MBBAUT);
6701       MBB.erase(MBBAUT);
6702     } else {
6703       BuildMI(MBB, MBBAUT, DL,
6704               TII->get(ShouldSignReturnAddrWithAKey ? AArch64::AUTIASP
6705                                                     : AArch64::AUTIBSP))
6706           .setMIFlag(MachineInstr::FrameDestroy);
6707     }
6708   }
6709 }
6710 
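// Build the frame of the outlined function: rewrite a trailing call into a
// tail call for thunks, spill/reload LR (with CFI) around bodies that contain
// calls, append a RET when the frame is not itself a tail call, and apply
// return-address signing according to the candidates' signing scheme.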
void AArch64InstrInfo::buildOutlinedFrame(
    MachineBasicBlock &MBB, MachineFunction &MF,
    const outliner::OutlinedFunction &OF) const {

  AArch64FunctionInfo *FI = MF.getInfo<AArch64FunctionInfo>();

  if (OF.FrameConstructionID == MachineOutlinerTailCall)
    FI->setOutliningStyle("Tail Call");
  else if (OF.FrameConstructionID == MachineOutlinerThunk) {
    // For thunk outlining, rewrite the last instruction from a call to a
    // tail-call.
    MachineInstr *Call = &*--MBB.instr_end();
    unsigned TailOpcode;
    if (Call->getOpcode() == AArch64::BL) {
      TailOpcode = AArch64::TCRETURNdi;
    } else {
      assert(Call->getOpcode() == AArch64::BLR ||
             Call->getOpcode() == AArch64::BLRNoIP);
      TailOpcode = AArch64::TCRETURNriALL;
    }
    MachineInstr *TC = BuildMI(MF, DebugLoc(), get(TailOpcode))
                           .add(Call->getOperand(0))
                           .addImm(0);
    MBB.insert(MBB.end(), TC);
    Call->eraseFromParent();

    FI->setOutliningStyle("Thunk");
  }

  bool IsLeafFunction = true;

  // Is there a call in the outlined range?
  auto IsNonTailCall = [](const MachineInstr &MI) {
    return MI.isCall() && !MI.isReturn();
  };

  if (std::any_of(MBB.instr_begin(), MBB.instr_end(), IsNonTailCall)) {
    // Fix up the instructions in the range, since we're going to modify the
    // stack.

    // Bugzilla ID: 46767
    // TODO: Check if fixing up twice is safe so we can outline these.
    assert(OF.FrameConstructionID != MachineOutlinerDefault &&
           "Can only fix up stack references once");
    fixupPostOutline(MBB);

    IsLeafFunction = false;

    // LR has to be a live in so that we can save it.
    if (!MBB.isLiveIn(AArch64::LR))
      MBB.addLiveIn(AArch64::LR);

    MachineBasicBlock::iterator It = MBB.begin();
    MachineBasicBlock::iterator Et = MBB.end();

    if (OF.FrameConstructionID == MachineOutlinerTailCall ||
        OF.FrameConstructionID == MachineOutlinerThunk)
      Et = std::prev(MBB.end());

    // Insert a save before the outlined region
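    // STRXpre here is "str LR, [SP, #-16]!": it allocates the 16-byte slot and
    // saves LR in a single pre-indexed store.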
    MachineInstr *STRXpre = BuildMI(MF, DebugLoc(), get(AArch64::STRXpre))
                                .addReg(AArch64::SP, RegState::Define)
                                .addReg(AArch64::LR)
                                .addReg(AArch64::SP)
                                .addImm(-16);
    It = MBB.insert(It, STRXpre);

    const TargetSubtargetInfo &STI = MF.getSubtarget();
    const MCRegisterInfo *MRI = STI.getRegisterInfo();
    unsigned DwarfReg = MRI->getDwarfRegNum(AArch64::LR, true);

    // Add a CFI saying the stack was moved 16 B down.
    int64_t StackPosEntry =
        MF.addFrameInst(MCCFIInstruction::cfiDefCfaOffset(nullptr, 16));
    BuildMI(MBB, It, DebugLoc(), get(AArch64::CFI_INSTRUCTION))
        .addCFIIndex(StackPosEntry)
        .setMIFlags(MachineInstr::FrameSetup);

    // Add a CFI saying that the LR that we want to find is now 16 B higher than
    // before.
    int64_t LRPosEntry =
        MF.addFrameInst(MCCFIInstruction::createOffset(nullptr, DwarfReg, -16));
    BuildMI(MBB, It, DebugLoc(), get(AArch64::CFI_INSTRUCTION))
        .addCFIIndex(LRPosEntry)
        .setMIFlags(MachineInstr::FrameSetup);

    // Insert a restore before the terminator for the function.
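    // LDRXpost is "ldr LR, [SP], #16": it reloads LR and deallocates the slot
    // with a single post-indexed load.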
    MachineInstr *LDRXpost = BuildMI(MF, DebugLoc(), get(AArch64::LDRXpost))
                                 .addReg(AArch64::SP, RegState::Define)
                                 .addReg(AArch64::LR, RegState::Define)
                                 .addReg(AArch64::SP)
                                 .addImm(16);
    Et = MBB.insert(Et, LDRXpost);
  }

  // All candidates that reach this point must agree on their return-address
  // signing behaviour, so it is enough to consult the signing scheme of one
  // of them.
  const auto &MFI =
      *OF.Candidates.front().getMF()->getInfo<AArch64FunctionInfo>();
  bool ShouldSignReturnAddr = MFI.shouldSignReturnAddress(!IsLeafFunction);

  // a_key is the default.
  bool ShouldSignReturnAddrWithAKey = !MFI.shouldSignWithBKey();

  // If this is a tail call outlined function, then there's already a return.
  if (OF.FrameConstructionID == MachineOutlinerTailCall ||
      OF.FrameConstructionID == MachineOutlinerThunk) {
    signOutlinedFunction(MF, MBB, ShouldSignReturnAddr,
                         ShouldSignReturnAddrWithAKey);
    return;
  }

  // It's not a tail call, so we have to insert the return ourselves.

  // LR has to be a live in so that we can return to it.
  if (!MBB.isLiveIn(AArch64::LR))
    MBB.addLiveIn(AArch64::LR);

  MachineInstr *ret = BuildMI(MF, DebugLoc(), get(AArch64::RET))
                          .addReg(AArch64::LR);
  MBB.insert(MBB.end(), ret);

  signOutlinedFunction(MF, MBB, ShouldSignReturnAddr,
                       ShouldSignReturnAddrWithAKey);

  FI->setOutliningStyle("Function");

  // Did we have to modify the stack by saving the link register?
  if (OF.FrameConstructionID != MachineOutlinerDefault)
    return;

  // We modified the stack.
  // Walk over the basic block and fix up all the stack accesses.
  fixupPostOutline(MBB);
}

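// Emit the call to the outlined function at the candidate site. Depending on
// the candidate this is a tail-call branch, a bare BL, or a BL wrapped in a
// save/restore of LR (either into a free register or onto the stack).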
MachineBasicBlock::iterator AArch64InstrInfo::insertOutlinedCall(
    Module &M, MachineBasicBlock &MBB, MachineBasicBlock::iterator &It,
    MachineFunction &MF, const outliner::Candidate &C) const {

  // Are we tail calling?
  if (C.CallConstructionID == MachineOutlinerTailCall) {
    // If yes, then we can just branch to the label.
    It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::TCRETURNdi))
                            .addGlobalAddress(M.getNamedValue(MF.getName()))
                            .addImm(0));
    return It;
  }

  // Are we saving the link register?
  if (C.CallConstructionID == MachineOutlinerNoLRSave ||
      C.CallConstructionID == MachineOutlinerThunk) {
    // No, so just insert the call.
    It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::BL))
                            .addGlobalAddress(M.getNamedValue(MF.getName())));
    return It;
  }

  // We want to return the spot where we inserted the call.
  MachineBasicBlock::iterator CallPt;

  // Instructions for saving and restoring LR around the call instruction we're
  // going to insert.
  MachineInstr *Save;
  MachineInstr *Restore;
  // Can we save to a register?
  if (C.CallConstructionID == MachineOutlinerRegSave) {
    // FIXME: This logic should be sunk into a target-specific interface so that
    // we don't have to recompute the register.
    unsigned Reg = findRegisterToSaveLRTo(C);
    assert(Reg != 0 && "No callee-saved register available?");

    // Save and restore LR from that register.
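    // MOV Xd, Xn has no encoding of its own; it is the alias ORR Xd, XZR, Xn
    // with a zero shift, which is what ORRXrs expresses here.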
    Save = BuildMI(MF, DebugLoc(), get(AArch64::ORRXrs), Reg)
               .addReg(AArch64::XZR)
               .addReg(AArch64::LR)
               .addImm(0);
    Restore = BuildMI(MF, DebugLoc(), get(AArch64::ORRXrs), AArch64::LR)
                  .addReg(AArch64::XZR)
                  .addReg(Reg)
                  .addImm(0);
  } else {
    // We have the default case. Save and restore from SP.
    Save = BuildMI(MF, DebugLoc(), get(AArch64::STRXpre))
               .addReg(AArch64::SP, RegState::Define)
               .addReg(AArch64::LR)
               .addReg(AArch64::SP)
               .addImm(-16);
    Restore = BuildMI(MF, DebugLoc(), get(AArch64::LDRXpost))
                  .addReg(AArch64::SP, RegState::Define)
                  .addReg(AArch64::LR, RegState::Define)
                  .addReg(AArch64::SP)
                  .addImm(16);
  }

  It = MBB.insert(It, Save);
  It++;

  // Insert the call.
  It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::BL))
                          .addGlobalAddress(M.getNamedValue(MF.getName())));
  CallPt = It;
  It++;

  It = MBB.insert(It, Restore);
  return CallPt;
}

bool AArch64InstrInfo::shouldOutlineFromFunctionByDefault(
  MachineFunction &MF) const {
  return MF.getFunction().hasMinSize();
}

Optional<DestSourcePair>
AArch64InstrInfo::isCopyInstrImpl(const MachineInstr &MI) const {

  // AArch64::ORRWrs and AArch64::ORRXrs with WZR/XZR as the first source
  // register and a zero shift immediate are used as aliases for the MOV
  // instruction.
  if (MI.getOpcode() == AArch64::ORRWrs &&
      MI.getOperand(1).getReg() == AArch64::WZR &&
      MI.getOperand(3).getImm() == 0x0) {
    return DestSourcePair{MI.getOperand(0), MI.getOperand(2)};
  }

  if (MI.getOpcode() == AArch64::ORRXrs &&
      MI.getOperand(1).getReg() == AArch64::XZR &&
      MI.getOperand(3).getImm() == 0x0) {
    return DestSourcePair{MI.getOperand(0), MI.getOperand(2)};
  }

  return None;
}

Optional<RegImmPair> AArch64InstrInfo::isAddImmediate(const MachineInstr &MI,
                                                      Register Reg) const {
  int Sign = 1;
  int64_t Offset = 0;

  // TODO: Handle cases where Reg is a super- or sub-register of the
  // destination register.
  const MachineOperand &Op0 = MI.getOperand(0);
  if (!Op0.isReg() || Reg != Op0.getReg())
    return None;

  switch (MI.getOpcode()) {
  default:
    return None;
  case AArch64::SUBWri:
  case AArch64::SUBXri:
  case AArch64::SUBSWri:
  case AArch64::SUBSXri:
    Sign *= -1;
    LLVM_FALLTHROUGH;
  case AArch64::ADDSWri:
  case AArch64::ADDSXri:
  case AArch64::ADDWri:
  case AArch64::ADDXri: {
    // TODO: Third operand can be global address (usually some string).
    if (!MI.getOperand(0).isReg() || !MI.getOperand(1).isReg() ||
        !MI.getOperand(2).isImm())
      return None;
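    // ADD/SUB (immediate) encodes a 12-bit immediate that is optionally
    // shifted left by 12, so apply the shift before accumulating the offset.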
    int Shift = MI.getOperand(3).getImm();
    assert((Shift == 0 || Shift == 12) && "Shift can be either 0 or 12");
    Offset = Sign * (MI.getOperand(2).getImm() << Shift);
  }
  }
  return RegImmPair{MI.getOperand(1).getReg(), Offset};
}

/// If the given ORR instruction is a copy, and \p DescribedReg overlaps with
/// the destination register then, if possible, describe the value in terms of
/// the source register.
static Optional<ParamLoadedValue>
describeORRLoadedValue(const MachineInstr &MI, Register DescribedReg,
                       const TargetInstrInfo *TII,
                       const TargetRegisterInfo *TRI) {
  auto DestSrc = TII->isCopyInstr(MI);
  if (!DestSrc)
    return None;

  Register DestReg = DestSrc->Destination->getReg();
  Register SrcReg = DestSrc->Source->getReg();

  auto Expr = DIExpression::get(MI.getMF()->getFunction().getContext(), {});

  // If the described register is the destination, just return the source.
  if (DestReg == DescribedReg)
    return ParamLoadedValue(MachineOperand::CreateReg(SrcReg, false), Expr);

  // ORRWrs zero-extends to 64 bits, so we need to consider such cases.
  if (MI.getOpcode() == AArch64::ORRWrs &&
      TRI->isSuperRegister(DestReg, DescribedReg))
    return ParamLoadedValue(MachineOperand::CreateReg(SrcReg, false), Expr);

  // We may need to describe the lower part of an ORRXrs move.
  if (MI.getOpcode() == AArch64::ORRXrs &&
      TRI->isSubRegister(DestReg, DescribedReg)) {
    Register SrcSubReg = TRI->getSubReg(SrcReg, AArch64::sub_32);
    return ParamLoadedValue(MachineOperand::CreateReg(SrcSubReg, false), Expr);
  }

  assert(!TRI->isSuperOrSubRegisterEq(DestReg, DescribedReg) &&
         "Unhandled ORR[XW]rs copy case");

  return None;
}

Optional<ParamLoadedValue>
AArch64InstrInfo::describeLoadedValue(const MachineInstr &MI,
                                      Register Reg) const {
  const MachineFunction *MF = MI.getMF();
  const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
  switch (MI.getOpcode()) {
  case AArch64::MOVZWi:
  case AArch64::MOVZXi: {
    // MOVZWi may be used for producing zero-extended 32-bit immediates in
    // 64-bit parameters, so we need to consider super-registers.
    if (!TRI->isSuperRegisterEq(MI.getOperand(0).getReg(), Reg))
      return None;

    if (!MI.getOperand(1).isImm())
      return None;
    int64_t Immediate = MI.getOperand(1).getImm();
    int Shift = MI.getOperand(2).getImm();
    return ParamLoadedValue(MachineOperand::CreateImm(Immediate << Shift),
                            nullptr);
  }
  case AArch64::ORRWrs:
  case AArch64::ORRXrs:
    return describeORRLoadedValue(MI, Reg, this, TRI);
  }

  return TargetInstrInfo::describeLoadedValue(MI, Reg);
}

uint64_t AArch64InstrInfo::getElementSizeForOpcode(unsigned Opc) const {
  return get(Opc).TSFlags & AArch64::ElementSizeMask;
}

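// Callers use this to pick the opcode for indirect calls: when SLS
// (straight-line speculation) hardening of BLR is requested, the BLRNoIP
// pseudo is used so the hardening pass can handle the call; otherwise a plain
// BLR is used.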
unsigned llvm::getBLRCallOpcode(const MachineFunction &MF) {
  if (MF.getSubtarget<AArch64Subtarget>().hardenSlsBlr())
    return AArch64::BLRNoIP;
  else
    return AArch64::BLR;
}

#define GET_INSTRINFO_HELPERS
#define GET_INSTRMAP_INFO
#include "AArch64GenInstrInfo.inc"