//===- AArch64InstrInfo.cpp - AArch64 Instruction Information -------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file contains the AArch64 implementation of the TargetInstrInfo class.
//
//===----------------------------------------------------------------------===//

#include "AArch64InstrInfo.h"
#include "AArch64MachineFunctionInfo.h"
#include "AArch64Subtarget.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "Utils/AArch64BaseInfo.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/StackMaps.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
#include <cassert>
#include <cstdint>
#include <iterator>
#include <utility>

using namespace llvm;

#define GET_INSTRINFO_CTOR_DTOR
#include "AArch64GenInstrInfo.inc"

static cl::opt<unsigned> TBZDisplacementBits(
    "aarch64-tbz-offset-bits", cl::Hidden, cl::init(14),
    cl::desc("Restrict range of TB[N]Z instructions (DEBUG)"));

static cl::opt<unsigned> CBZDisplacementBits(
    "aarch64-cbz-offset-bits", cl::Hidden, cl::init(19),
    cl::desc("Restrict range of CB[N]Z instructions (DEBUG)"));

static cl::opt<unsigned>
    BCCDisplacementBits("aarch64-bcc-offset-bits", cl::Hidden, cl::init(19),
                        cl::desc("Restrict range of Bcc instructions (DEBUG)"));

AArch64InstrInfo::AArch64InstrInfo(const AArch64Subtarget &STI)
    : AArch64GenInstrInfo(AArch64::ADJCALLSTACKDOWN, AArch64::ADJCALLSTACKUP,
                          AArch64::CATCHRET),
      RI(STI.getTargetTriple()), Subtarget(STI) {}

/// GetInstSize - Return the number of bytes of code the specified
/// instruction may be. This returns the maximum number of bytes.
unsigned AArch64InstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
  const MachineBasicBlock &MBB = *MI.getParent();
  const MachineFunction *MF = MBB.getParent();
  const MCAsmInfo *MAI = MF->getTarget().getMCAsmInfo();

  if (MI.getOpcode() == AArch64::INLINEASM)
    return getInlineAsmLength(MI.getOperand(0).getSymbolName(), *MAI);

  // FIXME: We currently only handle pseudoinstructions that don't get expanded
  //        before the assembly printer.
  unsigned NumBytes = 0;
  const MCInstrDesc &Desc = MI.getDesc();
  switch (Desc.getOpcode()) {
  default:
    // Anything not explicitly designated otherwise is a normal 4-byte insn.
    NumBytes = 4;
    break;
  case TargetOpcode::DBG_VALUE:
  case TargetOpcode::EH_LABEL:
  case TargetOpcode::IMPLICIT_DEF:
  case TargetOpcode::KILL:
    NumBytes = 0;
    break;
  case TargetOpcode::STACKMAP:
    // The upper bound for a stackmap intrinsic is the full length of its
    // shadow.
    NumBytes = StackMapOpers(&MI).getNumPatchBytes();
    assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
    break;
  case TargetOpcode::PATCHPOINT:
    // The size of the patchpoint intrinsic is the number of bytes requested.
    NumBytes = PatchPointOpers(&MI).getNumPatchBytes();
    assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
    break;
  case AArch64::TLSDESC_CALLSEQ:
    // This gets lowered to an instruction sequence which takes 16 bytes.
    NumBytes = 16;
    break;
  case AArch64::JumpTableDest32:
  case AArch64::JumpTableDest16:
  case AArch64::JumpTableDest8:
    NumBytes = 12;
    break;
  case AArch64::SPACE:
    NumBytes = MI.getOperand(1).getImm();
    break;
  }

  return NumBytes;
}

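// Note on the Cond encoding produced below and consumed by insertBranch() and
// insertSelect():
//   Bcc:    Cond = { CondCode }
//   CB[N]Z: Cond = { -1, Opcode, Reg }
//   TB[N]Z: Cond = { -1, Opcode, Reg, BitNumber }
// A leading immediate of -1 marks a folded compare-and-branch as opposed to a
// plain conditional branch.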
static void parseCondBranch(MachineInstr *LastInst, MachineBasicBlock *&Target,
                            SmallVectorImpl<MachineOperand> &Cond) {
  // Block ends with fall-through condbranch.
  switch (LastInst->getOpcode()) {
  default:
    llvm_unreachable("Unknown branch instruction?");
  case AArch64::Bcc:
    Target = LastInst->getOperand(1).getMBB();
    Cond.push_back(LastInst->getOperand(0));
    break;
  case AArch64::CBZW:
  case AArch64::CBZX:
  case AArch64::CBNZW:
  case AArch64::CBNZX:
    Target = LastInst->getOperand(1).getMBB();
    Cond.push_back(MachineOperand::CreateImm(-1));
    Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode()));
    Cond.push_back(LastInst->getOperand(0));
    break;
  case AArch64::TBZW:
  case AArch64::TBZX:
  case AArch64::TBNZW:
  case AArch64::TBNZX:
    Target = LastInst->getOperand(2).getMBB();
    Cond.push_back(MachineOperand::CreateImm(-1));
    Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode()));
    Cond.push_back(LastInst->getOperand(0));
    Cond.push_back(LastInst->getOperand(1));
  }
}

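// The displacement fields are signed and scaled by the 4-byte instruction
// size: TB[N]Z encodes 14 bits, CB[N]Z and Bcc encode 19 bits. An
// unconditional B is reported as 64 bits, i.e. effectively unlimited. The
// cl::opts above can artificially narrow these ranges for testing.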
static unsigned getBranchDisplacementBits(unsigned Opc) {
  switch (Opc) {
  default:
    llvm_unreachable("unexpected opcode!");
  case AArch64::B:
    return 64;
  case AArch64::TBNZW:
  case AArch64::TBZW:
  case AArch64::TBNZX:
  case AArch64::TBZX:
    return TBZDisplacementBits;
  case AArch64::CBNZW:
  case AArch64::CBZW:
  case AArch64::CBNZX:
  case AArch64::CBZX:
    return CBZDisplacementBits;
  case AArch64::Bcc:
    return BCCDisplacementBits;
  }
}

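// Branch offsets arrive here in bytes but are encoded in units of
// instructions, so divide by 4 before checking that the displacement fits in
// the signed field.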
bool AArch64InstrInfo::isBranchOffsetInRange(unsigned BranchOp,
                                             int64_t BrOffset) const {
  unsigned Bits = getBranchDisplacementBits(BranchOp);
  assert(Bits >= 3 && "max branch displacement must be enough to jump "
                      "over conditional branch expansion");
  return isIntN(Bits, BrOffset / 4);
}

MachineBasicBlock *
AArch64InstrInfo::getBranchDestBlock(const MachineInstr &MI) const {
  switch (MI.getOpcode()) {
  default:
    llvm_unreachable("unexpected opcode!");
  case AArch64::B:
    return MI.getOperand(0).getMBB();
  case AArch64::TBZW:
  case AArch64::TBNZW:
  case AArch64::TBZX:
  case AArch64::TBNZX:
    return MI.getOperand(2).getMBB();
  case AArch64::CBZW:
  case AArch64::CBNZW:
  case AArch64::CBZX:
  case AArch64::CBNZX:
  case AArch64::Bcc:
    return MI.getOperand(1).getMBB();
  }
}

// Branch analysis.
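// analyzeBranch returns false when the terminators were understood (filling
// in TBB, FBB and Cond) and true when the control flow at the end of the
// block cannot be analyzed, e.g. for indirect branches.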
bool AArch64InstrInfo::analyzeBranch(MachineBasicBlock &MBB,
                                     MachineBasicBlock *&TBB,
                                     MachineBasicBlock *&FBB,
                                     SmallVectorImpl<MachineOperand> &Cond,
                                     bool AllowModify) const {
  // If the block has no terminators, it just falls into the block after it.
  MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
  if (I == MBB.end())
    return false;

  if (!isUnpredicatedTerminator(*I))
    return false;

  // Get the last instruction in the block.
  MachineInstr *LastInst = &*I;

  // If there is only one terminator instruction, process it.
  unsigned LastOpc = LastInst->getOpcode();
  if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
    if (isUncondBranchOpcode(LastOpc)) {
      TBB = LastInst->getOperand(0).getMBB();
      return false;
    }
    if (isCondBranchOpcode(LastOpc)) {
      // Block ends with fall-through condbranch.
      parseCondBranch(LastInst, TBB, Cond);
      return false;
    }
    return true; // Can't handle indirect branch.
  }

  // Get the instruction before it if it is a terminator.
  MachineInstr *SecondLastInst = &*I;
  unsigned SecondLastOpc = SecondLastInst->getOpcode();

  // If AllowModify is true and the block ends with two or more unconditional
  // branches, delete all but the first unconditional branch.
  if (AllowModify && isUncondBranchOpcode(LastOpc)) {
    while (isUncondBranchOpcode(SecondLastOpc)) {
      LastInst->eraseFromParent();
      LastInst = SecondLastInst;
      LastOpc = LastInst->getOpcode();
      if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
        // Return now; the only remaining terminator is an unconditional
        // branch.
        TBB = LastInst->getOperand(0).getMBB();
        return false;
      } else {
        SecondLastInst = &*I;
        SecondLastOpc = SecondLastInst->getOpcode();
      }
    }
  }

  // If there are three terminators, we don't know what sort of block this is.
  if (SecondLastInst && I != MBB.begin() && isUnpredicatedTerminator(*--I))
    return true;

  // If the block ends with a B and a Bcc, handle it.
  if (isCondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
    parseCondBranch(SecondLastInst, TBB, Cond);
    FBB = LastInst->getOperand(0).getMBB();
    return false;
  }

  // If the block ends with two unconditional branches, handle it.  The second
  // one is not executed, so remove it.
  if (isUncondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
    TBB = SecondLastInst->getOperand(0).getMBB();
    I = LastInst;
    if (AllowModify)
      I->eraseFromParent();
    return false;
  }

  // ...likewise if it ends with an indirect branch followed by an
  // unconditional branch.
  if (isIndirectBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
    I = LastInst;
    if (AllowModify)
      I->eraseFromParent();
    return true;
  }

  // Otherwise, can't handle this.
  return true;
}

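// For a folded compare-and-branch, reversing the condition is a pure opcode
// swap (CBZ <-> CBNZ, TBZ <-> TBNZ); a plain Bcc instead gets its condition
// code inverted in place.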
bool AArch64InstrInfo::reverseBranchCondition(
    SmallVectorImpl<MachineOperand> &Cond) const {
  if (Cond[0].getImm() != -1) {
    // Regular Bcc
    AArch64CC::CondCode CC = (AArch64CC::CondCode)(int)Cond[0].getImm();
    Cond[0].setImm(AArch64CC::getInvertedCondCode(CC));
  } else {
    // Folded compare-and-branch
    switch (Cond[1].getImm()) {
    default:
      llvm_unreachable("Unknown conditional branch!");
    case AArch64::CBZW:
      Cond[1].setImm(AArch64::CBNZW);
      break;
    case AArch64::CBNZW:
      Cond[1].setImm(AArch64::CBZW);
      break;
    case AArch64::CBZX:
      Cond[1].setImm(AArch64::CBNZX);
      break;
    case AArch64::CBNZX:
      Cond[1].setImm(AArch64::CBZX);
      break;
    case AArch64::TBZW:
      Cond[1].setImm(AArch64::TBNZW);
      break;
    case AArch64::TBNZW:
      Cond[1].setImm(AArch64::TBZW);
      break;
    case AArch64::TBZX:
      Cond[1].setImm(AArch64::TBNZX);
      break;
    case AArch64::TBNZX:
      Cond[1].setImm(AArch64::TBZX);
      break;
    }
  }

  return false;
}

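// removeBranch deletes at most two terminating branches: the trailing
// unconditional branch and, if one precedes it, a conditional branch. It
// returns the number of instructions removed, reporting 4 bytes per branch
// through BytesRemoved.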
unsigned AArch64InstrInfo::removeBranch(MachineBasicBlock &MBB,
                                        int *BytesRemoved) const {
  MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
  if (I == MBB.end())
    return 0;

  if (!isUncondBranchOpcode(I->getOpcode()) &&
      !isCondBranchOpcode(I->getOpcode()))
    return 0;

  // Remove the branch.
  I->eraseFromParent();

  I = MBB.end();

  if (I == MBB.begin()) {
    if (BytesRemoved)
      *BytesRemoved = 4;
    return 1;
  }
  --I;
  if (!isCondBranchOpcode(I->getOpcode())) {
    if (BytesRemoved)
      *BytesRemoved = 4;
    return 1;
  }

  // Remove the branch.
  I->eraseFromParent();
  if (BytesRemoved)
    *BytesRemoved = 8;

  return 2;
}

void AArch64InstrInfo::instantiateCondBranch(
    MachineBasicBlock &MBB, const DebugLoc &DL, MachineBasicBlock *TBB,
    ArrayRef<MachineOperand> Cond) const {
  if (Cond[0].getImm() != -1) {
    // Regular Bcc
    BuildMI(&MBB, DL, get(AArch64::Bcc)).addImm(Cond[0].getImm()).addMBB(TBB);
  } else {
    // Folded compare-and-branch
    // Note that we use add() instead of addReg() to keep the register flags.
    const MachineInstrBuilder MIB =
        BuildMI(&MBB, DL, get(Cond[1].getImm())).add(Cond[2]);
    if (Cond.size() > 3)
      MIB.addImm(Cond[3].getImm());
    MIB.addMBB(TBB);
  }
}

unsigned AArch64InstrInfo::insertBranch(
    MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB,
    ArrayRef<MachineOperand> Cond, const DebugLoc &DL, int *BytesAdded) const {
  // Shouldn't be a fall through.
  assert(TBB && "insertBranch must not be told to insert a fallthrough");

  if (!FBB) {
    if (Cond.empty()) // Unconditional branch?
      BuildMI(&MBB, DL, get(AArch64::B)).addMBB(TBB);
    else
      instantiateCondBranch(MBB, DL, TBB, Cond);

    if (BytesAdded)
      *BytesAdded = 4;

    return 1;
  }

  // Two-way conditional branch.
  instantiateCondBranch(MBB, DL, TBB, Cond);
  BuildMI(&MBB, DL, get(AArch64::B)).addMBB(FBB);

  if (BytesAdded)
    *BytesAdded = 8;

  return 2;
}

// Find the original register that VReg is copied from.
static unsigned removeCopies(const MachineRegisterInfo &MRI, unsigned VReg) {
  while (TargetRegisterInfo::isVirtualRegister(VReg)) {
    const MachineInstr *DefMI = MRI.getVRegDef(VReg);
    if (!DefMI->isFullCopy())
      return VReg;
    VReg = DefMI->getOperand(1).getReg();
  }
  return VReg;
}

// Determine if VReg is defined by an instruction that can be folded into a
// csel instruction. If so, return the folded opcode, and the replacement
// register.
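// For example, if VReg is defined by 'add w1, w2, #1', a select of the form
// 'dst = cc ? w1 : w3' can become 'csinc dst, w3, w2, invert(cc)', since
// CSINC yields its second source plus one when the condition is false.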
static unsigned canFoldIntoCSel(const MachineRegisterInfo &MRI, unsigned VReg,
                                unsigned *NewVReg = nullptr) {
  VReg = removeCopies(MRI, VReg);
  if (!TargetRegisterInfo::isVirtualRegister(VReg))
    return 0;

  bool Is64Bit = AArch64::GPR64allRegClass.hasSubClassEq(MRI.getRegClass(VReg));
  const MachineInstr *DefMI = MRI.getVRegDef(VReg);
  unsigned Opc = 0;
  unsigned SrcOpNum = 0;
  switch (DefMI->getOpcode()) {
  case AArch64::ADDSXri:
  case AArch64::ADDSWri:
    // if NZCV is used, do not fold.
    if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, true) == -1)
      return 0;
    // fall-through to ADDXri and ADDWri.
    LLVM_FALLTHROUGH;
  case AArch64::ADDXri:
  case AArch64::ADDWri:
    // add x, 1 -> csinc.
    if (!DefMI->getOperand(2).isImm() || DefMI->getOperand(2).getImm() != 1 ||
        DefMI->getOperand(3).getImm() != 0)
      return 0;
    SrcOpNum = 1;
    Opc = Is64Bit ? AArch64::CSINCXr : AArch64::CSINCWr;
    break;

  case AArch64::ORNXrr:
  case AArch64::ORNWrr: {
    // not x -> csinv, represented as orn dst, xzr, src.
    unsigned ZReg = removeCopies(MRI, DefMI->getOperand(1).getReg());
    if (ZReg != AArch64::XZR && ZReg != AArch64::WZR)
      return 0;
    SrcOpNum = 2;
    Opc = Is64Bit ? AArch64::CSINVXr : AArch64::CSINVWr;
    break;
  }

  case AArch64::SUBSXrr:
  case AArch64::SUBSWrr:
    // if NZCV is used, do not fold.
    if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, true) == -1)
      return 0;
    // fall-through to SUBXrr and SUBWrr.
    LLVM_FALLTHROUGH;
  case AArch64::SUBXrr:
  case AArch64::SUBWrr: {
    // neg x -> csneg, represented as sub dst, xzr, src.
    unsigned ZReg = removeCopies(MRI, DefMI->getOperand(1).getReg());
    if (ZReg != AArch64::XZR && ZReg != AArch64::WZR)
      return 0;
    SrcOpNum = 2;
    Opc = Is64Bit ? AArch64::CSNEGXr : AArch64::CSNEGWr;
    break;
  }
  default:
    return 0;
  }
  assert(Opc && SrcOpNum && "Missing parameters");

  if (NewVReg)
    *NewVReg = DefMI->getOperand(SrcOpNum).getReg();
  return Opc;
}

bool AArch64InstrInfo::canInsertSelect(const MachineBasicBlock &MBB,
                                       ArrayRef<MachineOperand> Cond,
                                       unsigned TrueReg, unsigned FalseReg,
                                       int &CondCycles, int &TrueCycles,
                                       int &FalseCycles) const {
  // Check register classes.
  const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
  const TargetRegisterClass *RC =
      RI.getCommonSubClass(MRI.getRegClass(TrueReg), MRI.getRegClass(FalseReg));
  if (!RC)
    return false;

  // Expanding cbz/tbz requires an extra cycle of latency on the condition.
  unsigned ExtraCondLat = Cond.size() != 1;

  // GPRs are handled by csel.
  // FIXME: Fold in x+1, -x, and ~x when applicable.
  if (AArch64::GPR64allRegClass.hasSubClassEq(RC) ||
      AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
    // Single-cycle csel, csinc, csinv, and csneg.
    CondCycles = 1 + ExtraCondLat;
    TrueCycles = FalseCycles = 1;
    if (canFoldIntoCSel(MRI, TrueReg))
      TrueCycles = 0;
    else if (canFoldIntoCSel(MRI, FalseReg))
      FalseCycles = 0;
    return true;
  }

  // Scalar floating point is handled by fcsel.
  // FIXME: Form fabs, fmin, and fmax when applicable.
  if (AArch64::FPR64RegClass.hasSubClassEq(RC) ||
      AArch64::FPR32RegClass.hasSubClassEq(RC)) {
    CondCycles = 5 + ExtraCondLat;
    TrueCycles = FalseCycles = 2;
    return true;
  }

  // Can't do vectors.
  return false;
}

void AArch64InstrInfo::insertSelect(MachineBasicBlock &MBB,
                                    MachineBasicBlock::iterator I,
                                    const DebugLoc &DL, unsigned DstReg,
                                    ArrayRef<MachineOperand> Cond,
                                    unsigned TrueReg, unsigned FalseReg) const {
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();

  // Parse the condition code, see parseCondBranch() above.
  AArch64CC::CondCode CC;
  switch (Cond.size()) {
  default:
    llvm_unreachable("Unknown condition opcode in Cond");
  case 1: // b.cc
    CC = AArch64CC::CondCode(Cond[0].getImm());
    break;
  case 3: { // cbz/cbnz
    // We must insert a compare against 0.
    bool Is64Bit;
    switch (Cond[1].getImm()) {
    default:
      llvm_unreachable("Unknown branch opcode in Cond");
    case AArch64::CBZW:
      Is64Bit = false;
      CC = AArch64CC::EQ;
      break;
    case AArch64::CBZX:
      Is64Bit = true;
      CC = AArch64CC::EQ;
      break;
    case AArch64::CBNZW:
      Is64Bit = false;
      CC = AArch64CC::NE;
      break;
    case AArch64::CBNZX:
      Is64Bit = true;
      CC = AArch64CC::NE;
      break;
    }
    unsigned SrcReg = Cond[2].getReg();
    if (Is64Bit) {
      // cmp reg, #0 is actually subs xzr, reg, #0.
      MRI.constrainRegClass(SrcReg, &AArch64::GPR64spRegClass);
      BuildMI(MBB, I, DL, get(AArch64::SUBSXri), AArch64::XZR)
          .addReg(SrcReg)
          .addImm(0)
          .addImm(0);
    } else {
      MRI.constrainRegClass(SrcReg, &AArch64::GPR32spRegClass);
      BuildMI(MBB, I, DL, get(AArch64::SUBSWri), AArch64::WZR)
          .addReg(SrcReg)
          .addImm(0)
          .addImm(0);
    }
    break;
  }
  case 4: { // tbz/tbnz
    // We must insert a tst instruction.
    switch (Cond[1].getImm()) {
    default:
      llvm_unreachable("Unknown branch opcode in Cond");
    case AArch64::TBZW:
    case AArch64::TBZX:
      CC = AArch64CC::EQ;
      break;
    case AArch64::TBNZW:
    case AArch64::TBNZX:
      CC = AArch64CC::NE;
      break;
    }
    // cmp reg, #foo is actually ands xzr, reg, #1<<foo.
    if (Cond[1].getImm() == AArch64::TBZW || Cond[1].getImm() == AArch64::TBNZW)
      BuildMI(MBB, I, DL, get(AArch64::ANDSWri), AArch64::WZR)
          .addReg(Cond[2].getReg())
          .addImm(
              AArch64_AM::encodeLogicalImmediate(1ull << Cond[3].getImm(), 32));
    else
      BuildMI(MBB, I, DL, get(AArch64::ANDSXri), AArch64::XZR)
          .addReg(Cond[2].getReg())
          .addImm(
              AArch64_AM::encodeLogicalImmediate(1ull << Cond[3].getImm(), 64));
    break;
  }
  }

  unsigned Opc = 0;
  const TargetRegisterClass *RC = nullptr;
  bool TryFold = false;
  if (MRI.constrainRegClass(DstReg, &AArch64::GPR64RegClass)) {
    RC = &AArch64::GPR64RegClass;
    Opc = AArch64::CSELXr;
    TryFold = true;
  } else if (MRI.constrainRegClass(DstReg, &AArch64::GPR32RegClass)) {
    RC = &AArch64::GPR32RegClass;
    Opc = AArch64::CSELWr;
    TryFold = true;
  } else if (MRI.constrainRegClass(DstReg, &AArch64::FPR64RegClass)) {
    RC = &AArch64::FPR64RegClass;
    Opc = AArch64::FCSELDrrr;
  } else if (MRI.constrainRegClass(DstReg, &AArch64::FPR32RegClass)) {
    RC = &AArch64::FPR32RegClass;
    Opc = AArch64::FCSELSrrr;
  }
  assert(RC && "Unsupported regclass");

  // Try folding simple instructions into the csel.
  if (TryFold) {
    unsigned NewVReg = 0;
    unsigned FoldedOpc = canFoldIntoCSel(MRI, TrueReg, &NewVReg);
    if (FoldedOpc) {
      // The folded opcodes csinc, csinv and csneg apply the operation to
      // FalseReg, so we need to invert the condition.
      CC = AArch64CC::getInvertedCondCode(CC);
      TrueReg = FalseReg;
    } else
      FoldedOpc = canFoldIntoCSel(MRI, FalseReg, &NewVReg);

    // Fold the operation. Leave any dead instructions for DCE to clean up.
    if (FoldedOpc) {
      FalseReg = NewVReg;
      Opc = FoldedOpc;
      // This extends the live range of NewVReg.
      MRI.clearKillFlags(NewVReg);
    }
  }

  // Pull all virtual registers into the appropriate class.
  MRI.constrainRegClass(TrueReg, RC);
  MRI.constrainRegClass(FalseReg, RC);

  // Insert the csel.
  BuildMI(MBB, I, DL, get(Opc), DstReg)
      .addReg(TrueReg)
      .addReg(FalseReg)
      .addImm(CC);
}

/// Returns true if a MOVi32imm or MOVi64imm can be expanded to an ORRxx.
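/// For example, #0xffff00ff is a rotated run of ones and thus a valid logical
/// immediate, so 'mov w0, #0xffff00ff' can become 'orr w0, wzr, #0xffff00ff',
/// whereas a value like #0x12345678 is not encodable and keeps its MOVZ/MOVK
/// expansion.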
static bool canBeExpandedToORR(const MachineInstr &MI, unsigned BitSize) {
  uint64_t Imm = MI.getOperand(1).getImm();
  uint64_t UImm = Imm << (64 - BitSize) >> (64 - BitSize);
  uint64_t Encoding;
  return AArch64_AM::processLogicalImmediate(UImm, BitSize, Encoding);
}

// FIXME: this implementation should be micro-architecture dependent, so a
// micro-architecture target hook should be introduced here in future.
bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr &MI) const {
  if (!Subtarget.hasCustomCheapAsMoveHandling())
    return MI.isAsCheapAsAMove();

  const unsigned Opcode = MI.getOpcode();

  // Firstly, check cases gated by features.

  if (Subtarget.hasZeroCycleZeroingFP()) {
    if (Opcode == AArch64::FMOVH0 ||
        Opcode == AArch64::FMOVS0 ||
        Opcode == AArch64::FMOVD0)
      return true;
  }

  if (Subtarget.hasZeroCycleZeroingGP()) {
    if (Opcode == TargetOpcode::COPY &&
        (MI.getOperand(1).getReg() == AArch64::WZR ||
         MI.getOperand(1).getReg() == AArch64::XZR))
      return true;
  }

  // Secondly, check cases specific to sub-targets.

  if (Subtarget.hasExynosCheapAsMoveHandling()) {
    if (isExynosCheapAsMove(MI))
      return true;

    return MI.isAsCheapAsAMove();
  }

  // Finally, check generic cases.

  switch (Opcode) {
  default:
    return false;

  // add/sub on register without shift
  case AArch64::ADDWri:
  case AArch64::ADDXri:
  case AArch64::SUBWri:
  case AArch64::SUBXri:
    return (MI.getOperand(3).getImm() == 0);

  // logical ops on immediate
  case AArch64::ANDWri:
  case AArch64::ANDXri:
  case AArch64::EORWri:
  case AArch64::EORXri:
  case AArch64::ORRWri:
  case AArch64::ORRXri:
    return true;

  // logical ops on register without shift
  case AArch64::ANDWrr:
  case AArch64::ANDXrr:
  case AArch64::BICWrr:
  case AArch64::BICXrr:
  case AArch64::EONWrr:
  case AArch64::EONXrr:
  case AArch64::EORWrr:
  case AArch64::EORXrr:
  case AArch64::ORNWrr:
  case AArch64::ORNXrr:
  case AArch64::ORRWrr:
  case AArch64::ORRXrr:
    return true;

  // If MOVi32imm or MOVi64imm can be expanded into ORRWri or
  // ORRXri, it is as cheap as MOV
  case AArch64::MOVi32imm:
    return canBeExpandedToORR(MI, 32);
  case AArch64::MOVi64imm:
    return canBeExpandedToORR(MI, 64);
  }

  llvm_unreachable("Unknown opcode to check as cheap as a move!");
}

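// Returns true when the shifted- or extended-register form of an arithmetic
// instruction, or a register-offset memory access, is known to be as fast as
// the plain register form on Falkor.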
bool AArch64InstrInfo::isFalkorShiftExtFast(const MachineInstr &MI) {
  switch (MI.getOpcode()) {
  default:
    return false;

  case AArch64::ADDWrs:
  case AArch64::ADDXrs:
  case AArch64::ADDSWrs:
  case AArch64::ADDSXrs: {
    unsigned Imm = MI.getOperand(3).getImm();
    unsigned ShiftVal = AArch64_AM::getShiftValue(Imm);
    if (ShiftVal == 0)
      return true;
    return AArch64_AM::getShiftType(Imm) == AArch64_AM::LSL && ShiftVal <= 5;
  }

  case AArch64::ADDWrx:
  case AArch64::ADDXrx:
  case AArch64::ADDXrx64:
  case AArch64::ADDSWrx:
  case AArch64::ADDSXrx:
  case AArch64::ADDSXrx64: {
    unsigned Imm = MI.getOperand(3).getImm();
    switch (AArch64_AM::getArithExtendType(Imm)) {
    default:
      return false;
    case AArch64_AM::UXTB:
    case AArch64_AM::UXTH:
    case AArch64_AM::UXTW:
    case AArch64_AM::UXTX:
      return AArch64_AM::getArithShiftValue(Imm) <= 4;
    }
  }

  case AArch64::SUBWrs:
  case AArch64::SUBSWrs: {
    unsigned Imm = MI.getOperand(3).getImm();
    unsigned ShiftVal = AArch64_AM::getShiftValue(Imm);
    return ShiftVal == 0 ||
           (AArch64_AM::getShiftType(Imm) == AArch64_AM::ASR && ShiftVal == 31);
  }

  case AArch64::SUBXrs:
  case AArch64::SUBSXrs: {
    unsigned Imm = MI.getOperand(3).getImm();
    unsigned ShiftVal = AArch64_AM::getShiftValue(Imm);
    return ShiftVal == 0 ||
           (AArch64_AM::getShiftType(Imm) == AArch64_AM::ASR && ShiftVal == 63);
  }

  case AArch64::SUBWrx:
  case AArch64::SUBXrx:
  case AArch64::SUBXrx64:
  case AArch64::SUBSWrx:
  case AArch64::SUBSXrx:
  case AArch64::SUBSXrx64: {
    unsigned Imm = MI.getOperand(3).getImm();
    switch (AArch64_AM::getArithExtendType(Imm)) {
    default:
      return false;
    case AArch64_AM::UXTB:
    case AArch64_AM::UXTH:
    case AArch64_AM::UXTW:
    case AArch64_AM::UXTX:
      return AArch64_AM::getArithShiftValue(Imm) == 0;
    }
  }

  case AArch64::LDRBBroW:
  case AArch64::LDRBBroX:
  case AArch64::LDRBroW:
  case AArch64::LDRBroX:
  case AArch64::LDRDroW:
  case AArch64::LDRDroX:
  case AArch64::LDRHHroW:
  case AArch64::LDRHHroX:
  case AArch64::LDRHroW:
  case AArch64::LDRHroX:
  case AArch64::LDRQroW:
  case AArch64::LDRQroX:
  case AArch64::LDRSBWroW:
  case AArch64::LDRSBWroX:
  case AArch64::LDRSBXroW:
  case AArch64::LDRSBXroX:
  case AArch64::LDRSHWroW:
  case AArch64::LDRSHWroX:
  case AArch64::LDRSHXroW:
  case AArch64::LDRSHXroX:
  case AArch64::LDRSWroW:
  case AArch64::LDRSWroX:
  case AArch64::LDRSroW:
  case AArch64::LDRSroX:
  case AArch64::LDRWroW:
  case AArch64::LDRWroX:
  case AArch64::LDRXroW:
  case AArch64::LDRXroX:
  case AArch64::PRFMroW:
  case AArch64::PRFMroX:
  case AArch64::STRBBroW:
  case AArch64::STRBBroX:
  case AArch64::STRBroW:
  case AArch64::STRBroX:
  case AArch64::STRDroW:
  case AArch64::STRDroX:
  case AArch64::STRHHroW:
  case AArch64::STRHHroX:
  case AArch64::STRHroW:
  case AArch64::STRHroX:
  case AArch64::STRQroW:
  case AArch64::STRQroX:
  case AArch64::STRSroW:
  case AArch64::STRSroX:
  case AArch64::STRWroW:
  case AArch64::STRWroX:
  case AArch64::STRXroW:
  case AArch64::STRXroX: {
    unsigned IsSigned = MI.getOperand(3).getImm();
    return !IsSigned;
  }
  }
}

bool AArch64InstrInfo::isSEHInstruction(const MachineInstr &MI) {
  unsigned Opc = MI.getOpcode();
  switch (Opc) {
    default:
      return false;
    case AArch64::SEH_StackAlloc:
    case AArch64::SEH_SaveFPLR:
    case AArch64::SEH_SaveFPLR_X:
    case AArch64::SEH_SaveReg:
    case AArch64::SEH_SaveReg_X:
    case AArch64::SEH_SaveRegP:
    case AArch64::SEH_SaveRegP_X:
    case AArch64::SEH_SaveFReg:
    case AArch64::SEH_SaveFReg_X:
    case AArch64::SEH_SaveFRegP:
    case AArch64::SEH_SaveFRegP_X:
    case AArch64::SEH_SetFP:
    case AArch64::SEH_AddFP:
    case AArch64::SEH_Nop:
    case AArch64::SEH_PrologEnd:
    case AArch64::SEH_EpilogStart:
    case AArch64::SEH_EpilogEnd:
      return true;
  }
}

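// A 32 -> 64 bit sign extension 'sxtw xD, wS' is encoded as
// 'SBFMXri xD, xS, #0, #31' (and uxtw as UBFMXri with the same immediates),
// which is why the check below looks for immr == 0 and imms == 31.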
bool AArch64InstrInfo::isCoalescableExtInstr(const MachineInstr &MI,
                                             unsigned &SrcReg, unsigned &DstReg,
                                             unsigned &SubIdx) const {
  switch (MI.getOpcode()) {
  default:
    return false;
  case AArch64::SBFMXri: // aka sxtw
  case AArch64::UBFMXri: // aka uxtw
    // Check for the 32 -> 64 bit extension case, these instructions can do
    // much more.
    if (MI.getOperand(2).getImm() != 0 || MI.getOperand(3).getImm() != 31)
      return false;
    // This is a signed or unsigned 32 -> 64 bit extension.
    SrcReg = MI.getOperand(1).getReg();
    DstReg = MI.getOperand(0).getReg();
    SubIdx = AArch64::sub_32;
    return true;
  }
}

bool AArch64InstrInfo::areMemAccessesTriviallyDisjoint(
    MachineInstr &MIa, MachineInstr &MIb, AliasAnalysis *AA) const {
  const TargetRegisterInfo *TRI = &getRegisterInfo();
  MachineOperand *BaseOpA = nullptr, *BaseOpB = nullptr;
  int64_t OffsetA = 0, OffsetB = 0;
  unsigned WidthA = 0, WidthB = 0;

  assert(MIa.mayLoadOrStore() && "MIa must be a load or store.");
  assert(MIb.mayLoadOrStore() && "MIb must be a load or store.");

  if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects() ||
      MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())
    return false;

  // Retrieve the base register, offset from the base and width. Width is the
  // size of memory that is being loaded/stored (e.g. 1, 2, 4, 8). If the bases
  // are identical, and the offset of the lower memory access plus its width
  // does not overlap the offset of the higher memory access, then the memory
  // accesses are disjoint.
  if (getMemOperandWithOffsetWidth(MIa, BaseOpA, OffsetA, WidthA, TRI) &&
      getMemOperandWithOffsetWidth(MIb, BaseOpB, OffsetB, WidthB, TRI)) {
    if (BaseOpA->isIdenticalTo(*BaseOpB)) {
      int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
      int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
      int LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
      if (LowOffset + LowWidth <= HighOffset)
        return true;
    }
  }
  return false;
}

bool AArch64InstrInfo::isSchedulingBoundary(const MachineInstr &MI,
                                            const MachineBasicBlock *MBB,
                                            const MachineFunction &MF) const {
  if (TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF))
    return true;
  switch (MI.getOpcode()) {
  case AArch64::HINT:
    // CSDB hints are scheduling barriers.
    if (MI.getOperand(0).getImm() == 0x14)
      return true;
    break;
  case AArch64::DSB:
  case AArch64::ISB:
    // DSB and ISB also are scheduling barriers.
    return true;
  default:;
  }
  return isSEHInstruction(MI);
}

/// analyzeCompare - For a comparison instruction, return the source registers
/// in SrcReg and SrcReg2, and the value it compares against in CmpValue.
/// Return true if the comparison instruction can be analyzed.
bool AArch64InstrInfo::analyzeCompare(const MachineInstr &MI, unsigned &SrcReg,
                                      unsigned &SrcReg2, int &CmpMask,
                                      int &CmpValue) const {
  // The first operand can be a frame index where we'd normally expect a
  // register.
  assert(MI.getNumOperands() >= 2 && "All AArch64 cmps should have 2 operands");
  if (!MI.getOperand(1).isReg())
    return false;

  switch (MI.getOpcode()) {
  default:
    break;
  case AArch64::SUBSWrr:
  case AArch64::SUBSWrs:
  case AArch64::SUBSWrx:
  case AArch64::SUBSXrr:
  case AArch64::SUBSXrs:
  case AArch64::SUBSXrx:
  case AArch64::ADDSWrr:
  case AArch64::ADDSWrs:
  case AArch64::ADDSWrx:
  case AArch64::ADDSXrr:
  case AArch64::ADDSXrs:
  case AArch64::ADDSXrx:
    // Replace SUBSWrr with SUBWrr if NZCV is not used.
    SrcReg = MI.getOperand(1).getReg();
    SrcReg2 = MI.getOperand(2).getReg();
    CmpMask = ~0;
    CmpValue = 0;
    return true;
  case AArch64::SUBSWri:
  case AArch64::ADDSWri:
  case AArch64::SUBSXri:
  case AArch64::ADDSXri:
    SrcReg = MI.getOperand(1).getReg();
    SrcReg2 = 0;
    CmpMask = ~0;
    // FIXME: Collapse CmpValue to 0 or 1, since it is only compared against
    // zero downstream.
    CmpValue = MI.getOperand(2).getImm() != 0;
    return true;
  case AArch64::ANDSWri:
  case AArch64::ANDSXri:
    // ANDS does not use the same encoding scheme as the other xxxS
    // instructions.
    SrcReg = MI.getOperand(1).getReg();
    SrcReg2 = 0;
    CmpMask = ~0;
    // FIXME: The return value type of decodeLogicalImmediate is uint64_t,
    // while the type of CmpValue is int. When converting uint64_t to int,
    // the high 32 bits of uint64_t will be lost.
    // In fact it caused a bug in spec2006-483.xalancbmk.
    // CmpValue is only used to compare with zero in optimizeCompareInstr.
    CmpValue = AArch64_AM::decodeLogicalImmediate(
                   MI.getOperand(2).getImm(),
                   MI.getOpcode() == AArch64::ANDSWri ? 32 : 64) != 0;
    return true;
  }

  return false;
}

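// Re-constrain the register classes of Instr's operands after its opcode has
// been mutated (e.g. SUBSWri -> SUBWri). Returns false if some operand cannot
// be made to satisfy the constraints of the new opcode.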
static bool UpdateOperandRegClass(MachineInstr &Instr) {
  MachineBasicBlock *MBB = Instr.getParent();
  assert(MBB && "Can't get MachineBasicBlock here");
  MachineFunction *MF = MBB->getParent();
  assert(MF && "Can't get MachineFunction here");
  const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
  const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
  MachineRegisterInfo *MRI = &MF->getRegInfo();

  for (unsigned OpIdx = 0, EndIdx = Instr.getNumOperands(); OpIdx < EndIdx;
       ++OpIdx) {
    MachineOperand &MO = Instr.getOperand(OpIdx);
    const TargetRegisterClass *OpRegCstraints =
        Instr.getRegClassConstraint(OpIdx, TII, TRI);

    // If there's no constraint, there's nothing to do.
    if (!OpRegCstraints)
      continue;
    // If the operand is a frame index, there's nothing to do here.
    // A frame index operand will resolve correctly during PEI.
    if (MO.isFI())
      continue;

    assert(MO.isReg() &&
           "Operand has register constraints without being a register!");

    unsigned Reg = MO.getReg();
    if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
      if (!OpRegCstraints->contains(Reg))
        return false;
    } else if (!OpRegCstraints->hasSubClassEq(MRI->getRegClass(Reg)) &&
               !MRI->constrainRegClass(Reg, OpRegCstraints))
      return false;
  }

  return true;
}

/// Return the opcode that does not set flags when possible - otherwise
/// return the original opcode. The caller is responsible for doing the actual
/// substitution and legality checking.
static unsigned convertToNonFlagSettingOpc(const MachineInstr &MI) {
  // Don't convert all compare instructions, because for some the zero register
  // encoding becomes the sp register.
  bool MIDefinesZeroReg = false;
  if (MI.definesRegister(AArch64::WZR) || MI.definesRegister(AArch64::XZR))
    MIDefinesZeroReg = true;

  switch (MI.getOpcode()) {
  default:
    return MI.getOpcode();
  case AArch64::ADDSWrr:
    return AArch64::ADDWrr;
  case AArch64::ADDSWri:
    return MIDefinesZeroReg ? AArch64::ADDSWri : AArch64::ADDWri;
  case AArch64::ADDSWrs:
    return MIDefinesZeroReg ? AArch64::ADDSWrs : AArch64::ADDWrs;
  case AArch64::ADDSWrx:
    return AArch64::ADDWrx;
  case AArch64::ADDSXrr:
    return AArch64::ADDXrr;
  case AArch64::ADDSXri:
    return MIDefinesZeroReg ? AArch64::ADDSXri : AArch64::ADDXri;
  case AArch64::ADDSXrs:
    return MIDefinesZeroReg ? AArch64::ADDSXrs : AArch64::ADDXrs;
  case AArch64::ADDSXrx:
    return AArch64::ADDXrx;
  case AArch64::SUBSWrr:
    return AArch64::SUBWrr;
  case AArch64::SUBSWri:
    return MIDefinesZeroReg ? AArch64::SUBSWri : AArch64::SUBWri;
  case AArch64::SUBSWrs:
    return MIDefinesZeroReg ? AArch64::SUBSWrs : AArch64::SUBWrs;
  case AArch64::SUBSWrx:
    return AArch64::SUBWrx;
  case AArch64::SUBSXrr:
    return AArch64::SUBXrr;
  case AArch64::SUBSXri:
    return MIDefinesZeroReg ? AArch64::SUBSXri : AArch64::SUBXri;
  case AArch64::SUBSXrs:
    return MIDefinesZeroReg ? AArch64::SUBSXrs : AArch64::SUBXrs;
  case AArch64::SUBSXrx:
    return AArch64::SUBXrx;
  }
}

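// The enumerators form a bit mask, so AK_All matches both the AK_Write and
// AK_Read tests in areCFlagsAccessedBetweenInstrs() below.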
enum AccessKind { AK_Write = 0x01, AK_Read = 0x10, AK_All = 0x11 };

/// True when condition flags are accessed (either by writing or reading)
/// on the instruction trace starting at From and ending at To.
///
/// Note: If From and To are in different blocks, the condition flags are
///       assumed to be accessed on the path.
static bool areCFlagsAccessedBetweenInstrs(
    MachineBasicBlock::iterator From, MachineBasicBlock::iterator To,
    const TargetRegisterInfo *TRI, const AccessKind AccessToCheck = AK_All) {
  // Early exit if To is at the beginning of the BB.
  if (To == To->getParent()->begin())
    return true;

  // Check whether the instructions are in the same basic block.
  // If not, assume the condition flags might get modified somewhere.
  if (To->getParent() != From->getParent())
    return true;

  // From must be above To.
  assert(std::find_if(++To.getReverse(), To->getParent()->rend(),
                      [From](MachineInstr &MI) {
                        return MI.getIterator() == From;
                      }) != To->getParent()->rend());

  // We iterate backward starting at \p To until we hit \p From.
  for (--To; To != From; --To) {
    const MachineInstr &Instr = *To;

    if (((AccessToCheck & AK_Write) &&
         Instr.modifiesRegister(AArch64::NZCV, TRI)) ||
        ((AccessToCheck & AK_Read) && Instr.readsRegister(AArch64::NZCV, TRI)))
      return true;
  }
  return false;
}

/// Try to optimize a compare instruction. A compare instruction is an
/// instruction which produces AArch64::NZCV. It is a true compare instruction
/// only when there are no uses of its destination register.
///
/// The following steps are tried in order:
/// 1. Convert CmpInstr into an unconditional version.
/// 2. Remove CmpInstr if above there is an instruction producing a needed
///    condition code or an instruction which can be converted into such an
///    instruction.
///    Only comparison with zero is supported.
bool AArch64InstrInfo::optimizeCompareInstr(
    MachineInstr &CmpInstr, unsigned SrcReg, unsigned SrcReg2, int CmpMask,
    int CmpValue, const MachineRegisterInfo *MRI) const {
  assert(CmpInstr.getParent());
  assert(MRI);

  // Replace SUBSWrr with SUBWrr if NZCV is not used.
  int DeadNZCVIdx = CmpInstr.findRegisterDefOperandIdx(AArch64::NZCV, true);
  if (DeadNZCVIdx != -1) {
    if (CmpInstr.definesRegister(AArch64::WZR) ||
        CmpInstr.definesRegister(AArch64::XZR)) {
      CmpInstr.eraseFromParent();
      return true;
    }
    unsigned Opc = CmpInstr.getOpcode();
    unsigned NewOpc = convertToNonFlagSettingOpc(CmpInstr);
    if (NewOpc == Opc)
      return false;
    const MCInstrDesc &MCID = get(NewOpc);
    CmpInstr.setDesc(MCID);
    CmpInstr.RemoveOperand(DeadNZCVIdx);
    bool succeeded = UpdateOperandRegClass(CmpInstr);
    (void)succeeded;
    assert(succeeded && "Some operands reg class are incompatible!");
    return true;
  }

  // Continue only if we have an "ri" form whose immediate is zero.
  // FIXME: CmpValue has already been converted to 0 or 1 in analyzeCompare().
  assert((CmpValue == 0 || CmpValue == 1) && "CmpValue must be 0 or 1!");
  if (CmpValue != 0 || SrcReg2 != 0)
    return false;

  // CmpInstr is a Compare instruction if destination register is not used.
  if (!MRI->use_nodbg_empty(CmpInstr.getOperand(0).getReg()))
    return false;

  return substituteCmpToZero(CmpInstr, SrcReg, MRI);
}

/// Get opcode of S version of Instr.
/// If Instr is already the S version, its opcode is returned.
/// AArch64::INSTRUCTION_LIST_END is returned if Instr does not have an S
/// version or we are not interested in it.
static unsigned sForm(MachineInstr &Instr) {
  switch (Instr.getOpcode()) {
  default:
    return AArch64::INSTRUCTION_LIST_END;

  case AArch64::ADDSWrr:
  case AArch64::ADDSWri:
  case AArch64::ADDSXrr:
  case AArch64::ADDSXri:
  case AArch64::SUBSWrr:
  case AArch64::SUBSWri:
  case AArch64::SUBSXrr:
  case AArch64::SUBSXri:
    return Instr.getOpcode();

  case AArch64::ADDWrr:
    return AArch64::ADDSWrr;
  case AArch64::ADDWri:
    return AArch64::ADDSWri;
  case AArch64::ADDXrr:
    return AArch64::ADDSXrr;
  case AArch64::ADDXri:
    return AArch64::ADDSXri;
  case AArch64::ADCWr:
    return AArch64::ADCSWr;
  case AArch64::ADCXr:
    return AArch64::ADCSXr;
  case AArch64::SUBWrr:
    return AArch64::SUBSWrr;
  case AArch64::SUBWri:
    return AArch64::SUBSWri;
  case AArch64::SUBXrr:
    return AArch64::SUBSXrr;
  case AArch64::SUBXri:
    return AArch64::SUBSXri;
  case AArch64::SBCWr:
    return AArch64::SBCSWr;
  case AArch64::SBCXr:
    return AArch64::SBCSXr;
  case AArch64::ANDWri:
    return AArch64::ANDSWri;
  case AArch64::ANDXri:
    return AArch64::ANDSXri;
  }
}

/// Check if AArch64::NZCV should be alive in successors of MBB.
static bool areCFlagsAliveInSuccessors(MachineBasicBlock *MBB) {
  for (auto *BB : MBB->successors())
    if (BB->isLiveIn(AArch64::NZCV))
      return true;
  return false;
}

namespace {

struct UsedNZCV {
  bool N = false;
  bool Z = false;
  bool C = false;
  bool V = false;

  UsedNZCV() = default;

  UsedNZCV &operator|=(const UsedNZCV &UsedFlags) {
    this->N |= UsedFlags.N;
    this->Z |= UsedFlags.Z;
    this->C |= UsedFlags.C;
    this->V |= UsedFlags.V;
    return *this;
  }
};

} // end anonymous namespace

/// Find a condition code used by the instruction.
/// Returns AArch64CC::Invalid if either the instruction does not use condition
/// codes or we don't optimize CmpInstr in the presence of such instructions.
static AArch64CC::CondCode findCondCodeUsedByInstr(const MachineInstr &Instr) {
  switch (Instr.getOpcode()) {
  default:
    return AArch64CC::Invalid;

  case AArch64::Bcc: {
    int Idx = Instr.findRegisterUseOperandIdx(AArch64::NZCV);
    assert(Idx >= 2);
    return static_cast<AArch64CC::CondCode>(Instr.getOperand(Idx - 2).getImm());
  }

  case AArch64::CSINVWr:
  case AArch64::CSINVXr:
  case AArch64::CSINCWr:
  case AArch64::CSINCXr:
  case AArch64::CSELWr:
  case AArch64::CSELXr:
  case AArch64::CSNEGWr:
  case AArch64::CSNEGXr:
  case AArch64::FCSELSrrr:
  case AArch64::FCSELDrrr: {
    int Idx = Instr.findRegisterUseOperandIdx(AArch64::NZCV);
    assert(Idx >= 1);
    return static_cast<AArch64CC::CondCode>(Instr.getOperand(Idx - 1).getImm());
  }
  }
}

static UsedNZCV getUsedNZCV(AArch64CC::CondCode CC) {
  assert(CC != AArch64CC::Invalid);
  UsedNZCV UsedFlags;
  switch (CC) {
  default:
    break;

  case AArch64CC::EQ: // Z set
  case AArch64CC::NE: // Z clear
    UsedFlags.Z = true;
    break;

  case AArch64CC::HI: // Z clear and C set
  case AArch64CC::LS: // Z set   or  C clear
    UsedFlags.Z = true;
    LLVM_FALLTHROUGH;
  case AArch64CC::HS: // C set
  case AArch64CC::LO: // C clear
    UsedFlags.C = true;
    break;

  case AArch64CC::MI: // N set
  case AArch64CC::PL: // N clear
    UsedFlags.N = true;
    break;

  case AArch64CC::VS: // V set
  case AArch64CC::VC: // V clear
    UsedFlags.V = true;
    break;

  case AArch64CC::GT: // Z clear, N and V the same
  case AArch64CC::LE: // Z set,   N and V differ
    UsedFlags.Z = true;
    LLVM_FALLTHROUGH;
  case AArch64CC::GE: // N and V the same
  case AArch64CC::LT: // N and V differ
    UsedFlags.N = true;
    UsedFlags.V = true;
    break;
  }
  return UsedFlags;
}

static bool isADDSRegImm(unsigned Opcode) {
  return Opcode == AArch64::ADDSWri || Opcode == AArch64::ADDSXri;
}

static bool isSUBSRegImm(unsigned Opcode) {
  return Opcode == AArch64::SUBSWri || Opcode == AArch64::SUBSXri;
}

/// Check if CmpInstr can be substituted by MI.
///
/// CmpInstr can be substituted:
/// - CmpInstr is either 'ADDS %vreg, 0' or 'SUBS %vreg, 0'
/// - and, MI and CmpInstr are from the same MachineBB
/// - and, condition flags are not alive in successors of the CmpInstr parent
/// - and, if MI opcode is the S form, there must be no defs of flags between
///        MI and CmpInstr; or if MI opcode is not the S form, there must be
///        neither defs nor uses of flags between MI and CmpInstr
/// - and, the C and V flags are not used after CmpInstr
static bool canInstrSubstituteCmpInstr(MachineInstr *MI, MachineInstr *CmpInstr,
                                       const TargetRegisterInfo *TRI) {
  assert(MI);
  assert(sForm(*MI) != AArch64::INSTRUCTION_LIST_END);
  assert(CmpInstr);

  const unsigned CmpOpcode = CmpInstr->getOpcode();
  if (!isADDSRegImm(CmpOpcode) && !isSUBSRegImm(CmpOpcode))
    return false;

  if (MI->getParent() != CmpInstr->getParent())
    return false;

  if (areCFlagsAliveInSuccessors(CmpInstr->getParent()))
    return false;

  AccessKind AccessToCheck = AK_Write;
  if (sForm(*MI) != MI->getOpcode())
    AccessToCheck = AK_All;
  if (areCFlagsAccessedBetweenInstrs(MI, CmpInstr, TRI, AccessToCheck))
    return false;

  UsedNZCV NZCVUsedAfterCmp;
  for (auto I = std::next(CmpInstr->getIterator()),
            E = CmpInstr->getParent()->instr_end();
       I != E; ++I) {
    const MachineInstr &Instr = *I;
    if (Instr.readsRegister(AArch64::NZCV, TRI)) {
      AArch64CC::CondCode CC = findCondCodeUsedByInstr(Instr);
      if (CC == AArch64CC::Invalid) // Unsupported conditional instruction
        return false;
      NZCVUsedAfterCmp |= getUsedNZCV(CC);
    }

    if (Instr.modifiesRegister(AArch64::NZCV, TRI))
      break;
  }

  return !NZCVUsedAfterCmp.C && !NZCVUsedAfterCmp.V;
}

/// Substitute an instruction comparing to zero with another instruction
/// which produces needed condition flags.
///
/// Return true on success.
bool AArch64InstrInfo::substituteCmpToZero(
    MachineInstr &CmpInstr, unsigned SrcReg,
    const MachineRegisterInfo *MRI) const {
  assert(MRI);
  // Get the unique definition of SrcReg.
  MachineInstr *MI = MRI->getUniqueVRegDef(SrcReg);
  if (!MI)
    return false;

  const TargetRegisterInfo *TRI = &getRegisterInfo();

  unsigned NewOpc = sForm(*MI);
  if (NewOpc == AArch64::INSTRUCTION_LIST_END)
    return false;

  if (!canInstrSubstituteCmpInstr(MI, &CmpInstr, TRI))
    return false;

  // Update the instruction to set NZCV.
  MI->setDesc(get(NewOpc));
  CmpInstr.eraseFromParent();
  bool succeeded = UpdateOperandRegClass(*MI);
  (void)succeeded;
  assert(succeeded && "Some operands reg class are incompatible!");
  MI->addRegisterDefined(AArch64::NZCV, TRI);
  return true;
}

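// Expands the two pseudos handled here after register allocation: CATCHRET,
// which materializes the address of its target block with an ADRP/ADD pair
// placed before the epilogue's SEH markers, and LOAD_STACK_GUARD, which is
// lowered to the address-generation sequence appropriate for the code model
// and the global's classification (GOT load, MOVZ/MOVK literal chain, ADR, or
// ADRP plus a page-offset load).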
expandPostRAPseudo(MachineInstr & MI) const1467 bool AArch64InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
1468   if (MI.getOpcode() != TargetOpcode::LOAD_STACK_GUARD &&
1469       MI.getOpcode() != AArch64::CATCHRET)
1470     return false;
1471 
1472   MachineBasicBlock &MBB = *MI.getParent();
1473   DebugLoc DL = MI.getDebugLoc();
1474 
1475   if (MI.getOpcode() == AArch64::CATCHRET) {
1476     // Skip to the first instruction before the epilog.
1477     const TargetInstrInfo *TII =
1478       MBB.getParent()->getSubtarget().getInstrInfo();
1479     MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB();
1480     auto MBBI = MachineBasicBlock::iterator(MI);
1481     MachineBasicBlock::iterator FirstEpilogSEH = std::prev(MBBI);
1482     while (FirstEpilogSEH->getFlag(MachineInstr::FrameDestroy) &&
1483            FirstEpilogSEH != MBB.begin())
1484       FirstEpilogSEH = std::prev(FirstEpilogSEH);
1485     if (FirstEpilogSEH != MBB.begin())
1486       FirstEpilogSEH = std::next(FirstEpilogSEH);
1487     BuildMI(MBB, FirstEpilogSEH, DL, TII->get(AArch64::ADRP))
1488         .addReg(AArch64::X0, RegState::Define)
1489         .addMBB(TargetMBB);
1490     BuildMI(MBB, FirstEpilogSEH, DL, TII->get(AArch64::ADDXri))
1491         .addReg(AArch64::X0, RegState::Define)
1492         .addReg(AArch64::X0)
1493         .addMBB(TargetMBB)
1494         .addImm(0);
1495     return true;
1496   }
1497 
1498   unsigned Reg = MI.getOperand(0).getReg();
1499   const GlobalValue *GV =
1500       cast<GlobalValue>((*MI.memoperands_begin())->getValue());
1501   const TargetMachine &TM = MBB.getParent()->getTarget();
1502   unsigned char OpFlags = Subtarget.ClassifyGlobalReference(GV, TM);
1503   const unsigned char MO_NC = AArch64II::MO_NC;
1504 
1505   if ((OpFlags & AArch64II::MO_GOT) != 0) {
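    // Load the guard variable's address from the GOT, then load the guard
    // value through it.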
1506     BuildMI(MBB, MI, DL, get(AArch64::LOADgot), Reg)
1507         .addGlobalAddress(GV, 0, OpFlags);
1508     BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
1509         .addReg(Reg, RegState::Kill)
1510         .addImm(0)
1511         .addMemOperand(*MI.memoperands_begin());
1512   } else if (TM.getCodeModel() == CodeModel::Large) {
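    // CodeModel::Large: build the full 64-bit address of the guard variable
    // with a MOVZ + MOVK sequence (16 bits at a time), then load through it.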
1513     BuildMI(MBB, MI, DL, get(AArch64::MOVZXi), Reg)
1514         .addGlobalAddress(GV, 0, AArch64II::MO_G0 | MO_NC)
1515         .addImm(0);
1516     BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg)
1517         .addReg(Reg, RegState::Kill)
1518         .addGlobalAddress(GV, 0, AArch64II::MO_G1 | MO_NC)
1519         .addImm(16);
1520     BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg)
1521         .addReg(Reg, RegState::Kill)
1522         .addGlobalAddress(GV, 0, AArch64II::MO_G2 | MO_NC)
1523         .addImm(32);
1524     BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg)
1525         .addReg(Reg, RegState::Kill)
1526         .addGlobalAddress(GV, 0, AArch64II::MO_G3)
1527         .addImm(48);
1528     BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
1529         .addReg(Reg, RegState::Kill)
1530         .addImm(0)
1531         .addMemOperand(*MI.memoperands_begin());
1532   } else if (TM.getCodeModel() == CodeModel::Tiny) {
1533     BuildMI(MBB, MI, DL, get(AArch64::ADR), Reg)
1534         .addGlobalAddress(GV, 0, OpFlags);
1535   } else {
1536     BuildMI(MBB, MI, DL, get(AArch64::ADRP), Reg)
1537         .addGlobalAddress(GV, 0, OpFlags | AArch64II::MO_PAGE);
1538     unsigned char LoFlags = OpFlags | AArch64II::MO_PAGEOFF | MO_NC;
1539     BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
1540         .addReg(Reg, RegState::Kill)
1541         .addGlobalAddress(GV, 0, LoFlags)
1542         .addMemOperand(*MI.memoperands_begin());
1543   }
1544 
1545   MBB.erase(MI);
1546 
1547   return true;
1548 }
1549 
1550 // Return true if this instruction simply sets its single destination register
1551 // to zero. This is equivalent to a register rename of the zero-register.
1552 bool AArch64InstrInfo::isGPRZero(const MachineInstr &MI) {
1553   switch (MI.getOpcode()) {
1554   default:
1555     break;
1556   case AArch64::MOVZWi:
1557   case AArch64::MOVZXi: // movz Rd, #0 (LSL #0)
1558     if (MI.getOperand(1).isImm() && MI.getOperand(1).getImm() == 0) {
1559       assert(MI.getDesc().getNumOperands() == 3 &&
1560              MI.getOperand(2).getImm() == 0 && "invalid MOVZi operands");
1561       return true;
1562     }
1563     break;
1564   case AArch64::ANDWri: // and Rd, Rzr, #imm
1565     return MI.getOperand(1).getReg() == AArch64::WZR;
1566   case AArch64::ANDXri:
1567     return MI.getOperand(1).getReg() == AArch64::XZR;
1568   case TargetOpcode::COPY:
1569     return MI.getOperand(1).getReg() == AArch64::WZR;
1570   }
1571   return false;
1572 }
1573 
1574 // Return true if this instruction simply renames a general register without
1575 // modifying bits.
1576 bool AArch64InstrInfo::isGPRCopy(const MachineInstr &MI) {
1577   switch (MI.getOpcode()) {
1578   default:
1579     break;
1580   case TargetOpcode::COPY: {
1581     // GPR32 copies will be lowered to ORRXrs
1582     unsigned DstReg = MI.getOperand(0).getReg();
1583     return (AArch64::GPR32RegClass.contains(DstReg) ||
1584             AArch64::GPR64RegClass.contains(DstReg));
1585   }
1586   case AArch64::ORRXrs: // orr Xd, Xzr, Xm (LSL #0)
1587     if (MI.getOperand(1).getReg() == AArch64::XZR) {
1588       assert(MI.getDesc().getNumOperands() == 4 &&
1589              MI.getOperand(3).getImm() == 0 && "invalid ORRrs operands");
1590       return true;
1591     }
1592     break;
1593   case AArch64::ADDXri: // add Xd, Xn, #0 (LSL #0)
1594     if (MI.getOperand(2).getImm() == 0) {
1595       assert(MI.getDesc().getNumOperands() == 4 &&
1596              MI.getOperand(3).getImm() == 0 && "invalid ADDXri operands");
1597       return true;
1598     }
1599     break;
1600   }
1601   return false;
1602 }
1603 
1604 // Return true if this instruction simply renames a general register without
1605 // modifying bits.
1606 bool AArch64InstrInfo::isFPRCopy(const MachineInstr &MI) {
1607   switch (MI.getOpcode()) {
1608   default:
1609     break;
1610   case TargetOpcode::COPY: {
1611     // FPR64 copies will be lowered to ORR.16b
1612     unsigned DstReg = MI.getOperand(0).getReg();
1613     return (AArch64::FPR64RegClass.contains(DstReg) ||
1614             AArch64::FPR128RegClass.contains(DstReg));
1615   }
1616   case AArch64::ORRv16i8:
1617     if (MI.getOperand(1).getReg() == MI.getOperand(2).getReg()) {
1618       assert(MI.getDesc().getNumOperands() == 3 && MI.getOperand(0).isReg() &&
1619              "invalid ORRv16i8 operands");
1620       return true;
1621     }
1622     break;
1623   }
1624   return false;
1625 }
1626 
1627 unsigned AArch64InstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
1628                                                int &FrameIndex) const {
1629   switch (MI.getOpcode()) {
1630   default:
1631     break;
1632   case AArch64::LDRWui:
1633   case AArch64::LDRXui:
1634   case AArch64::LDRBui:
1635   case AArch64::LDRHui:
1636   case AArch64::LDRSui:
1637   case AArch64::LDRDui:
1638   case AArch64::LDRQui:
1639     if (MI.getOperand(0).getSubReg() == 0 && MI.getOperand(1).isFI() &&
1640         MI.getOperand(2).isImm() && MI.getOperand(2).getImm() == 0) {
1641       FrameIndex = MI.getOperand(1).getIndex();
1642       return MI.getOperand(0).getReg();
1643     }
1644     break;
1645   }
1646 
1647   return 0;
1648 }
1649 
1650 unsigned AArch64InstrInfo::isStoreToStackSlot(const MachineInstr &MI,
1651                                               int &FrameIndex) const {
1652   switch (MI.getOpcode()) {
1653   default:
1654     break;
1655   case AArch64::STRWui:
1656   case AArch64::STRXui:
1657   case AArch64::STRBui:
1658   case AArch64::STRHui:
1659   case AArch64::STRSui:
1660   case AArch64::STRDui:
1661   case AArch64::STRQui:
1662     if (MI.getOperand(0).getSubReg() == 0 && MI.getOperand(1).isFI() &&
1663         MI.getOperand(2).isImm() && MI.getOperand(2).getImm() == 0) {
1664       FrameIndex = MI.getOperand(1).getIndex();
1665       return MI.getOperand(0).getReg();
1666     }
1667     break;
1668   }
1669   return 0;
1670 }
1671 
1672 /// Check all MachineMemOperands for a hint to suppress pairing.
1673 bool AArch64InstrInfo::isLdStPairSuppressed(const MachineInstr &MI) {
1674   return llvm::any_of(MI.memoperands(), [](MachineMemOperand *MMO) {
1675     return MMO->getFlags() & MOSuppressPair;
1676   });
1677 }
1678 
1679 /// Set a flag on the first MachineMemOperand to suppress pairing.
1680 void AArch64InstrInfo::suppressLdStPair(MachineInstr &MI) {
1681   if (MI.memoperands_empty())
1682     return;
1683   (*MI.memoperands_begin())->setFlags(MOSuppressPair);
1684 }
1685 
1686 /// Check all MachineMemOperands for a hint that the load/store is strided.
1687 bool AArch64InstrInfo::isStridedAccess(const MachineInstr &MI) {
1688   return llvm::any_of(MI.memoperands(), [](MachineMemOperand *MMO) {
1689     return MMO->getFlags() & MOStridedAccess;
1690   });
1691 }
1692 
1693 bool AArch64InstrInfo::isUnscaledLdSt(unsigned Opc) {
1694   switch (Opc) {
1695   default:
1696     return false;
1697   case AArch64::STURSi:
1698   case AArch64::STURDi:
1699   case AArch64::STURQi:
1700   case AArch64::STURBBi:
1701   case AArch64::STURHHi:
1702   case AArch64::STURWi:
1703   case AArch64::STURXi:
1704   case AArch64::LDURSi:
1705   case AArch64::LDURDi:
1706   case AArch64::LDURQi:
1707   case AArch64::LDURWi:
1708   case AArch64::LDURXi:
1709   case AArch64::LDURSWi:
1710   case AArch64::LDURHHi:
1711   case AArch64::LDURBBi:
1712   case AArch64::LDURSBWi:
1713   case AArch64::LDURSHWi:
1714     return true;
1715   }
1716 }
1717 
1718 bool AArch64InstrInfo::isPairableLdStInst(const MachineInstr &MI) {
1719   switch (MI.getOpcode()) {
1720   default:
1721     return false;
1722   // Scaled instructions.
1723   case AArch64::STRSui:
1724   case AArch64::STRDui:
1725   case AArch64::STRQui:
1726   case AArch64::STRXui:
1727   case AArch64::STRWui:
1728   case AArch64::LDRSui:
1729   case AArch64::LDRDui:
1730   case AArch64::LDRQui:
1731   case AArch64::LDRXui:
1732   case AArch64::LDRWui:
1733   case AArch64::LDRSWui:
1734   // Unscaled instructions.
1735   case AArch64::STURSi:
1736   case AArch64::STURDi:
1737   case AArch64::STURQi:
1738   case AArch64::STURWi:
1739   case AArch64::STURXi:
1740   case AArch64::LDURSi:
1741   case AArch64::LDURDi:
1742   case AArch64::LDURQi:
1743   case AArch64::LDURWi:
1744   case AArch64::LDURXi:
1745   case AArch64::LDURSWi:
1746     return true;
1747   }
1748 }
1749 
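/// Map an add/sub/and/bic opcode to its flag-setting form (for example,
/// ADDWri -> ADDSWri) and report whether the operation is 64-bit via Is64Bit.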
1750 unsigned AArch64InstrInfo::convertToFlagSettingOpc(unsigned Opc,
1751                                                    bool &Is64Bit) {
1752   switch (Opc) {
1753   default:
1754     llvm_unreachable("Opcode has no flag setting equivalent!");
1755   // 32-bit cases:
1756   case AArch64::ADDWri:
1757     Is64Bit = false;
1758     return AArch64::ADDSWri;
1759   case AArch64::ADDWrr:
1760     Is64Bit = false;
1761     return AArch64::ADDSWrr;
1762   case AArch64::ADDWrs:
1763     Is64Bit = false;
1764     return AArch64::ADDSWrs;
1765   case AArch64::ADDWrx:
1766     Is64Bit = false;
1767     return AArch64::ADDSWrx;
1768   case AArch64::ANDWri:
1769     Is64Bit = false;
1770     return AArch64::ANDSWri;
1771   case AArch64::ANDWrr:
1772     Is64Bit = false;
1773     return AArch64::ANDSWrr;
1774   case AArch64::ANDWrs:
1775     Is64Bit = false;
1776     return AArch64::ANDSWrs;
1777   case AArch64::BICWrr:
1778     Is64Bit = false;
1779     return AArch64::BICSWrr;
1780   case AArch64::BICWrs:
1781     Is64Bit = false;
1782     return AArch64::BICSWrs;
1783   case AArch64::SUBWri:
1784     Is64Bit = false;
1785     return AArch64::SUBSWri;
1786   case AArch64::SUBWrr:
1787     Is64Bit = false;
1788     return AArch64::SUBSWrr;
1789   case AArch64::SUBWrs:
1790     Is64Bit = false;
1791     return AArch64::SUBSWrs;
1792   case AArch64::SUBWrx:
1793     Is64Bit = false;
1794     return AArch64::SUBSWrx;
1795   // 64-bit cases:
1796   case AArch64::ADDXri:
1797     Is64Bit = true;
1798     return AArch64::ADDSXri;
1799   case AArch64::ADDXrr:
1800     Is64Bit = true;
1801     return AArch64::ADDSXrr;
1802   case AArch64::ADDXrs:
1803     Is64Bit = true;
1804     return AArch64::ADDSXrs;
1805   case AArch64::ADDXrx:
1806     Is64Bit = true;
1807     return AArch64::ADDSXrx;
1808   case AArch64::ANDXri:
1809     Is64Bit = true;
1810     return AArch64::ANDSXri;
1811   case AArch64::ANDXrr:
1812     Is64Bit = true;
1813     return AArch64::ANDSXrr;
1814   case AArch64::ANDXrs:
1815     Is64Bit = true;
1816     return AArch64::ANDSXrs;
1817   case AArch64::BICXrr:
1818     Is64Bit = true;
1819     return AArch64::BICSXrr;
1820   case AArch64::BICXrs:
1821     Is64Bit = true;
1822     return AArch64::BICSXrs;
1823   case AArch64::SUBXri:
1824     Is64Bit = true;
1825     return AArch64::SUBSXri;
1826   case AArch64::SUBXrr:
1827     Is64Bit = true;
1828     return AArch64::SUBSXrr;
1829   case AArch64::SUBXrs:
1830     Is64Bit = true;
1831     return AArch64::SUBSXrs;
1832   case AArch64::SUBXrx:
1833     Is64Bit = true;
1834     return AArch64::SUBSXrx;
1835   }
1836 }
1837 
1838 // Is this a candidate for ld/st merging or pairing?  For example, we don't
1839 // touch volatiles or load/stores that have a hint to avoid pair formation.
1840 bool AArch64InstrInfo::isCandidateToMergeOrPair(MachineInstr &MI) const {
1841   // If this is a volatile load/store, don't mess with it.
1842   if (MI.hasOrderedMemoryRef())
1843     return false;
1844 
1845   // Make sure this is a reg/fi+imm (as opposed to an address reloc).
1846   assert((MI.getOperand(1).isReg() || MI.getOperand(1).isFI()) &&
1847          "Expected a reg or frame index operand.");
1848   if (!MI.getOperand(2).isImm())
1849     return false;
1850 
1851   // Can't merge/pair if the instruction modifies the base register.
1852   // e.g., ldr x0, [x0]
1853   // This case will never occur with an FI base.
1854   if (MI.getOperand(1).isReg()) {
1855     unsigned BaseReg = MI.getOperand(1).getReg();
1856     const TargetRegisterInfo *TRI = &getRegisterInfo();
1857     if (MI.modifiesRegister(BaseReg, TRI))
1858       return false;
1859   }
1860 
1861   // Check if this load/store has a hint to avoid pair formation.
1862   // MachineMemOperands hints are set by the AArch64StorePairSuppress pass.
1863   if (isLdStPairSuppressed(MI))
1864     return false;
1865 
1866   // On some CPUs quad load/store pairs are slower than two single load/stores.
1867   if (Subtarget.isPaired128Slow()) {
1868     switch (MI.getOpcode()) {
1869     default:
1870       break;
1871     case AArch64::LDURQi:
1872     case AArch64::STURQi:
1873     case AArch64::LDRQui:
1874     case AArch64::STRQui:
1875       return false;
1876     }
1877   }
1878 
1879   return true;
1880 }
1881 
1882 bool AArch64InstrInfo::getMemOperandWithOffset(MachineInstr &LdSt,
1883                                           MachineOperand *&BaseOp,
1884                                           int64_t &Offset,
1885                                           const TargetRegisterInfo *TRI) const {
1886   unsigned Width;
1887   return getMemOperandWithOffsetWidth(LdSt, BaseOp, Offset, Width, TRI);
1888 }
1889 
1890 bool AArch64InstrInfo::getMemOperandWithOffsetWidth(
1891     MachineInstr &LdSt, MachineOperand *&BaseOp, int64_t &Offset,
1892     unsigned &Width, const TargetRegisterInfo *TRI) const {
1893   assert(LdSt.mayLoadOrStore() && "Expected a memory operation.");
1894   // Handle only loads/stores with base register followed by immediate offset.
1895   if (LdSt.getNumExplicitOperands() == 3) {
1896     // Non-paired instruction (e.g., ldr x1, [x0, #8]).
1897     if ((!LdSt.getOperand(1).isReg() && !LdSt.getOperand(1).isFI()) ||
1898         !LdSt.getOperand(2).isImm())
1899       return false;
1900   } else if (LdSt.getNumExplicitOperands() == 4) {
1901     // Paired instruction (e.g., ldp x1, x2, [x0, #8]).
1902     if (!LdSt.getOperand(1).isReg() ||
1903         (!LdSt.getOperand(2).isReg() && !LdSt.getOperand(2).isFI()) ||
1904         !LdSt.getOperand(3).isImm())
1905       return false;
1906   } else
1907     return false;
1908 
1909   // Get the scaling factor for the instruction and set the width for the
1910   // instruction.
1911   unsigned Scale = 0;
1912   int64_t Dummy1, Dummy2;
1913 
1914   // If this returns false, then it's an instruction we don't want to handle.
1915   if (!getMemOpInfo(LdSt.getOpcode(), Scale, Width, Dummy1, Dummy2))
1916     return false;
1917 
1918   // Compute the offset. Offset is calculated as the immediate operand
1919   // multiplied by the scaling factor. Unscaled instructions have scaling factor
1920   // set to 1.
1921   if (LdSt.getNumExplicitOperands() == 3) {
1922     BaseOp = &LdSt.getOperand(1);
1923     Offset = LdSt.getOperand(2).getImm() * Scale;
1924   } else {
1925     assert(LdSt.getNumExplicitOperands() == 4 && "invalid number of operands");
1926     BaseOp = &LdSt.getOperand(2);
1927     Offset = LdSt.getOperand(3).getImm() * Scale;
1928   }
1929 
1930   assert((BaseOp->isReg() || BaseOp->isFI()) &&
1931          "getMemOperandWithOffset only supports base "
1932          "operands of type register or frame index.");
1933 
1934   return true;
1935 }
1936 
1937 MachineOperand &
1938 AArch64InstrInfo::getMemOpBaseRegImmOfsOffsetOperand(MachineInstr &LdSt) const {
1939   assert(LdSt.mayLoadOrStore() && "Expected a memory operation.");
1940   MachineOperand &OfsOp = LdSt.getOperand(LdSt.getNumExplicitOperands() - 1);
1941   assert(OfsOp.isImm() && "Offset operand wasn't immediate.");
1942   return OfsOp;
1943 }
1944 
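// Describe the memory access performed by Opcode: Scale is the factor applied
// to the immediate operand to form a byte offset, Width is the access size in
// bytes, and [MinOffset, MaxOffset] is the legal immediate range in scaled
// units.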
1945 bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, unsigned &Scale,
1946                                     unsigned &Width, int64_t &MinOffset,
1947                                     int64_t &MaxOffset) const {
1948   switch (Opcode) {
1949   // Not a memory operation, or not one we want to handle.
1950   default:
1951     Scale = Width = 0;
1952     MinOffset = MaxOffset = 0;
1953     return false;
1954   case AArch64::STRWpost:
1955   case AArch64::LDRWpost:
1956     Width = 32;
1957     Scale = 4;
1958     MinOffset = -256;
1959     MaxOffset = 255;
1960     break;
1961   case AArch64::LDURQi:
1962   case AArch64::STURQi:
1963     Width = 16;
1964     Scale = 1;
1965     MinOffset = -256;
1966     MaxOffset = 255;
1967     break;
1968   case AArch64::LDURXi:
1969   case AArch64::LDURDi:
1970   case AArch64::STURXi:
1971   case AArch64::STURDi:
1972     Width = 8;
1973     Scale = 1;
1974     MinOffset = -256;
1975     MaxOffset = 255;
1976     break;
1977   case AArch64::LDURWi:
1978   case AArch64::LDURSi:
1979   case AArch64::LDURSWi:
1980   case AArch64::STURWi:
1981   case AArch64::STURSi:
1982     Width = 4;
1983     Scale = 1;
1984     MinOffset = -256;
1985     MaxOffset = 255;
1986     break;
1987   case AArch64::LDURHi:
1988   case AArch64::LDURHHi:
1989   case AArch64::LDURSHXi:
1990   case AArch64::LDURSHWi:
1991   case AArch64::STURHi:
1992   case AArch64::STURHHi:
1993     Width = 2;
1994     Scale = 1;
1995     MinOffset = -256;
1996     MaxOffset = 255;
1997     break;
1998   case AArch64::LDURBi:
1999   case AArch64::LDURBBi:
2000   case AArch64::LDURSBXi:
2001   case AArch64::LDURSBWi:
2002   case AArch64::STURBi:
2003   case AArch64::STURBBi:
2004     Width = 1;
2005     Scale = 1;
2006     MinOffset = -256;
2007     MaxOffset = 255;
2008     break;
2009   case AArch64::LDPQi:
2010   case AArch64::LDNPQi:
2011   case AArch64::STPQi:
2012   case AArch64::STNPQi:
2013     Scale = 16;
2014     Width = 32;
2015     MinOffset = -64;
2016     MaxOffset = 63;
2017     break;
2018   case AArch64::LDRQui:
2019   case AArch64::STRQui:
2020     Scale = Width = 16;
2021     MinOffset = 0;
2022     MaxOffset = 4095;
2023     break;
2024   case AArch64::LDPXi:
2025   case AArch64::LDPDi:
2026   case AArch64::LDNPXi:
2027   case AArch64::LDNPDi:
2028   case AArch64::STPXi:
2029   case AArch64::STPDi:
2030   case AArch64::STNPXi:
2031   case AArch64::STNPDi:
2032     Scale = 8;
2033     Width = 16;
2034     MinOffset = -64;
2035     MaxOffset = 63;
2036     break;
2037   case AArch64::LDRXui:
2038   case AArch64::LDRDui:
2039   case AArch64::STRXui:
2040   case AArch64::STRDui:
2041     Scale = Width = 8;
2042     MinOffset = 0;
2043     MaxOffset = 4095;
2044     break;
2045   case AArch64::LDPWi:
2046   case AArch64::LDPSi:
2047   case AArch64::LDNPWi:
2048   case AArch64::LDNPSi:
2049   case AArch64::STPWi:
2050   case AArch64::STPSi:
2051   case AArch64::STNPWi:
2052   case AArch64::STNPSi:
2053     Scale = 4;
2054     Width = 8;
2055     MinOffset = -64;
2056     MaxOffset = 63;
2057     break;
2058   case AArch64::LDRWui:
2059   case AArch64::LDRSui:
2060   case AArch64::LDRSWui:
2061   case AArch64::STRWui:
2062   case AArch64::STRSui:
2063     Scale = Width = 4;
2064     MinOffset = 0;
2065     MaxOffset = 4095;
2066     break;
2067   case AArch64::LDRHui:
2068   case AArch64::LDRHHui:
2069   case AArch64::STRHui:
2070   case AArch64::STRHHui:
2071     Scale = Width = 2;
2072     MinOffset = 0;
2073     MaxOffset = 4095;
2074     break;
2075   case AArch64::LDRBui:
2076   case AArch64::LDRBBui:
2077   case AArch64::STRBui:
2078   case AArch64::STRBBui:
2079     Scale = Width = 1;
2080     MinOffset = 0;
2081     MaxOffset = 4095;
2082     break;
2083   }
2084 
2085   return true;
2086 }
2087 
2088 static unsigned getOffsetStride(unsigned Opc) {
2089   switch (Opc) {
2090   default:
2091     return 0;
2092   case AArch64::LDURQi:
2093   case AArch64::STURQi:
2094     return 16;
2095   case AArch64::LDURXi:
2096   case AArch64::LDURDi:
2097   case AArch64::STURXi:
2098   case AArch64::STURDi:
2099     return 8;
2100   case AArch64::LDURWi:
2101   case AArch64::LDURSi:
2102   case AArch64::LDURSWi:
2103   case AArch64::STURWi:
2104   case AArch64::STURSi:
2105     return 4;
2106   }
2107 }
2108 
2109 // Scale the unscaled offset.  Returns false if the unscaled offset can't be
2110 // scaled.
2111 static bool scaleOffset(unsigned Opc, int64_t &Offset) {
2112   unsigned OffsetStride = getOffsetStride(Opc);
2113   if (OffsetStride == 0)
2114     return false;
2115   // If the byte-offset isn't a multiple of the stride, we can't scale this
2116   // offset.
2117   if (Offset % OffsetStride != 0)
2118     return false;
2119 
2120   // Convert the byte-offset used by unscaled instructions into an "element"
2121   // offset used by the scaled pair load/store instructions.
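  // For example, an STURXi byte offset of 16 becomes an element offset of 2.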
2122   Offset /= OffsetStride;
2123   return true;
2124 }
2125 
2126 // Unscale the scaled offset. Returns false if the scaled offset can't be
2127 // unscaled.
2128 static bool unscaleOffset(unsigned Opc, int64_t &Offset) {
2129   unsigned OffsetStride = getOffsetStride(Opc);
2130   if (OffsetStride == 0)
2131     return false;
2132 
2133   // Convert the "element" offset used by scaled pair load/store instructions
2134   // into the byte-offset used by unscaled instructions.
2135   Offset *= OffsetStride;
2136   return true;
2137 }
2138 
2139 static bool canPairLdStOpc(unsigned FirstOpc, unsigned SecondOpc) {
2140   if (FirstOpc == SecondOpc)
2141     return true;
2142   // We can also pair sign-ext and zero-ext instructions.
2143   switch (FirstOpc) {
2144   default:
2145     return false;
2146   case AArch64::LDRWui:
2147   case AArch64::LDURWi:
2148     return SecondOpc == AArch64::LDRSWui || SecondOpc == AArch64::LDURSWi;
2149   case AArch64::LDRSWui:
2150   case AArch64::LDURSWi:
2151     return SecondOpc == AArch64::LDRWui || SecondOpc == AArch64::LDURWi;
2152   }
2153   // These instructions can't be paired based on their opcodes.
2154   return false;
2155 }
2156 
2157 static bool shouldClusterFI(const MachineFrameInfo &MFI, int FI1,
2158                             int64_t Offset1, unsigned Opcode1, int FI2,
2159                             int64_t Offset2, unsigned Opcode2) {
2160   // Accesses through fixed stack object frame indices may access a different
2161   // fixed stack slot. Check that the object offsets + offsets match.
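  // For example, two LDURXi accesses to adjacent 8-byte fixed objects at byte
  // offsets 0 and 8 resolve to element indices 0 and 1 and may be clustered.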
2162   if (MFI.isFixedObjectIndex(FI1) && MFI.isFixedObjectIndex(FI2)) {
2163     int64_t ObjectOffset1 = MFI.getObjectOffset(FI1);
2164     int64_t ObjectOffset2 = MFI.getObjectOffset(FI2);
2165     assert(ObjectOffset1 <= ObjectOffset2 && "Object offsets are not ordered.");
2166     // Get the byte-offset from the object offset.
2167     if (!unscaleOffset(Opcode1, Offset1) || !unscaleOffset(Opcode2, Offset2))
2168       return false;
2169     ObjectOffset1 += Offset1;
2170     ObjectOffset2 += Offset2;
2171     // Get the "element" index in the object.
2172     if (!scaleOffset(Opcode1, ObjectOffset1) ||
2173         !scaleOffset(Opcode2, ObjectOffset2))
2174       return false;
2175     return ObjectOffset1 + 1 == ObjectOffset2;
2176   }
2177 
2178   return FI1 == FI2;
2179 }
2180 
2181 /// Detect opportunities for ldp/stp formation.
2182 ///
2183 /// Only called for LdSt for which getMemOperandWithOffset returns true.
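/// For example, "ldr x1, [x0, #8]" followed by "ldr x2, [x0, #16]" is a
/// candidate for "ldp x1, x2, [x0, #8]".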
2184 bool AArch64InstrInfo::shouldClusterMemOps(MachineOperand &BaseOp1,
2185                                            MachineOperand &BaseOp2,
2186                                            unsigned NumLoads) const {
2187   MachineInstr &FirstLdSt = *BaseOp1.getParent();
2188   MachineInstr &SecondLdSt = *BaseOp2.getParent();
2189   if (BaseOp1.getType() != BaseOp2.getType())
2190     return false;
2191 
2192   assert((BaseOp1.isReg() || BaseOp1.isFI()) &&
2193          "Only base registers and frame indices are supported.");
2194 
2195   // Check for both base regs and base FI.
2196   if (BaseOp1.isReg() && BaseOp1.getReg() != BaseOp2.getReg())
2197     return false;
2198 
2199   // Only cluster up to a single pair.
2200   if (NumLoads > 1)
2201     return false;
2202 
2203   if (!isPairableLdStInst(FirstLdSt) || !isPairableLdStInst(SecondLdSt))
2204     return false;
2205 
2206   // Can we pair these instructions based on their opcodes?
2207   unsigned FirstOpc = FirstLdSt.getOpcode();
2208   unsigned SecondOpc = SecondLdSt.getOpcode();
2209   if (!canPairLdStOpc(FirstOpc, SecondOpc))
2210     return false;
2211 
2212   // Can't merge volatiles or load/stores that have a hint to avoid pair
2213   // formation, for example.
2214   if (!isCandidateToMergeOrPair(FirstLdSt) ||
2215       !isCandidateToMergeOrPair(SecondLdSt))
2216     return false;
2217 
2218   // isCandidateToMergeOrPair guarantees that operand 2 is an immediate.
2219   int64_t Offset1 = FirstLdSt.getOperand(2).getImm();
2220   if (isUnscaledLdSt(FirstOpc) && !scaleOffset(FirstOpc, Offset1))
2221     return false;
2222 
2223   int64_t Offset2 = SecondLdSt.getOperand(2).getImm();
2224   if (isUnscaledLdSt(SecondOpc) && !scaleOffset(SecondOpc, Offset2))
2225     return false;
2226 
2227   // Pairwise instructions have a 7-bit signed offset field.
2228   if (Offset1 > 63 || Offset1 < -64)
2229     return false;
2230 
2231   // The caller should already have ordered First/SecondLdSt by offset.
2232   // Note: this need not hold for non-equal frame-index bases.
2233   if (BaseOp1.isFI()) {
2234     assert((!BaseOp1.isIdenticalTo(BaseOp2) || Offset1 >= Offset2) &&
2235            "Caller should have ordered offsets.");
2236 
2237     const MachineFrameInfo &MFI =
2238         FirstLdSt.getParent()->getParent()->getFrameInfo();
2239     return shouldClusterFI(MFI, BaseOp1.getIndex(), Offset1, FirstOpc,
2240                            BaseOp2.getIndex(), Offset2, SecondOpc);
2241   }
2242 
2243   assert((!BaseOp1.isIdenticalTo(BaseOp2) || Offset1 <= Offset2) &&
2244          "Caller should have ordered offsets.");
2245 
2246   return Offset1 + 1 == Offset2;
2247 }
2248 
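// Append Reg (or its SubIdx sub-register) to MIB. Physical registers are
// resolved to the concrete sub-register here; virtual registers keep the
// sub-register index on the operand instead.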
2249 static const MachineInstrBuilder &AddSubReg(const MachineInstrBuilder &MIB,
2250                                             unsigned Reg, unsigned SubIdx,
2251                                             unsigned State,
2252                                             const TargetRegisterInfo *TRI) {
2253   if (!SubIdx)
2254     return MIB.addReg(Reg, State);
2255 
2256   if (TargetRegisterInfo::isPhysicalRegister(Reg))
2257     return MIB.addReg(TRI->getSubReg(Reg, SubIdx), State);
2258   return MIB.addReg(Reg, State, SubIdx);
2259 }
2260 
2261 static bool forwardCopyWillClobberTuple(unsigned DestReg, unsigned SrcReg,
2262                                         unsigned NumRegs) {
2263   // We really want the positive remainder mod 32 here; that happens to be
2264   // easily obtainable with a mask.
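  // For example, copying D0_D1 into D1_D2 gives (1 - 0) & 0x1f == 1 < 2: a
  // forward sub-register copy would overwrite D1 before it is read, so the
  // caller must copy the tuple in reverse order.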
2265   return ((DestReg - SrcReg) & 0x1f) < NumRegs;
2266 }
2267 
2268 void AArch64InstrInfo::copyPhysRegTuple(MachineBasicBlock &MBB,
2269                                         MachineBasicBlock::iterator I,
2270                                         const DebugLoc &DL, unsigned DestReg,
2271                                         unsigned SrcReg, bool KillSrc,
2272                                         unsigned Opcode,
2273                                         ArrayRef<unsigned> Indices) const {
2274   assert(Subtarget.hasNEON() && "Unexpected register copy without NEON");
2275   const TargetRegisterInfo *TRI = &getRegisterInfo();
2276   uint16_t DestEncoding = TRI->getEncodingValue(DestReg);
2277   uint16_t SrcEncoding = TRI->getEncodingValue(SrcReg);
2278   unsigned NumRegs = Indices.size();
2279 
2280   int SubReg = 0, End = NumRegs, Incr = 1;
2281   if (forwardCopyWillClobberTuple(DestEncoding, SrcEncoding, NumRegs)) {
2282     SubReg = NumRegs - 1;
2283     End = -1;
2284     Incr = -1;
2285   }
2286 
2287   for (; SubReg != End; SubReg += Incr) {
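    // Each element is moved with "orr dst, src, src", a plain vector
    // register-to-register move (the source operand appears twice).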
2288     const MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(Opcode));
2289     AddSubReg(MIB, DestReg, Indices[SubReg], RegState::Define, TRI);
2290     AddSubReg(MIB, SrcReg, Indices[SubReg], 0, TRI);
2291     AddSubReg(MIB, SrcReg, Indices[SubReg], getKillRegState(KillSrc), TRI);
2292   }
2293 }
2294 
2295 void AArch64InstrInfo::copyGPRRegTuple(MachineBasicBlock &MBB,
2296                                        MachineBasicBlock::iterator I,
2297                                        DebugLoc DL, unsigned DestReg,
2298                                        unsigned SrcReg, bool KillSrc,
2299                                        unsigned Opcode, unsigned ZeroReg,
2300                                        llvm::ArrayRef<unsigned> Indices) const {
2301   const TargetRegisterInfo *TRI = &getRegisterInfo();
2302   unsigned NumRegs = Indices.size();
2303 
2304 #ifndef NDEBUG
2305   uint16_t DestEncoding = TRI->getEncodingValue(DestReg);
2306   uint16_t SrcEncoding = TRI->getEncodingValue(SrcReg);
2307   assert(DestEncoding % NumRegs == 0 && SrcEncoding % NumRegs == 0 &&
2308          "GPR reg sequences should not be able to overlap");
2309 #endif
2310 
2311   for (unsigned SubReg = 0; SubReg != NumRegs; ++SubReg) {
2312     const MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(Opcode));
2313     AddSubReg(MIB, DestReg, Indices[SubReg], RegState::Define, TRI);
2314     MIB.addReg(ZeroReg);
2315     AddSubReg(MIB, SrcReg, Indices[SubReg], getKillRegState(KillSrc), TRI);
2316     MIB.addImm(0);
2317   }
2318 }
2319 
2320 void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
2321                                    MachineBasicBlock::iterator I,
2322                                    const DebugLoc &DL, unsigned DestReg,
2323                                    unsigned SrcReg, bool KillSrc) const {
2324   if (AArch64::GPR32spRegClass.contains(DestReg) &&
2325       (AArch64::GPR32spRegClass.contains(SrcReg) || SrcReg == AArch64::WZR)) {
2326     const TargetRegisterInfo *TRI = &getRegisterInfo();
2327 
2328     if (DestReg == AArch64::WSP || SrcReg == AArch64::WSP) {
2329       // If either operand is WSP, expand to ADD #0.
2330       if (Subtarget.hasZeroCycleRegMove()) {
2331         // Cyclone recognizes "ADD Xd, Xn, #0" as a zero-cycle register move.
2332         unsigned DestRegX = TRI->getMatchingSuperReg(DestReg, AArch64::sub_32,
2333                                                      &AArch64::GPR64spRegClass);
2334         unsigned SrcRegX = TRI->getMatchingSuperReg(SrcReg, AArch64::sub_32,
2335                                                     &AArch64::GPR64spRegClass);
2336         // This instruction is reading and writing X registers.  This may upset
2337         // the register scavenger and machine verifier, so we need to indicate
2338         // that we are reading an undefined value from SrcRegX, but a proper
2339         // value from SrcReg.
2340         BuildMI(MBB, I, DL, get(AArch64::ADDXri), DestRegX)
2341             .addReg(SrcRegX, RegState::Undef)
2342             .addImm(0)
2343             .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0))
2344             .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
2345       } else {
2346         BuildMI(MBB, I, DL, get(AArch64::ADDWri), DestReg)
2347             .addReg(SrcReg, getKillRegState(KillSrc))
2348             .addImm(0)
2349             .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
2350       }
2351     } else if (SrcReg == AArch64::WZR && Subtarget.hasZeroCycleZeroingGP()) {
2352       BuildMI(MBB, I, DL, get(AArch64::MOVZWi), DestReg)
2353           .addImm(0)
2354           .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
2355     } else {
2356       if (Subtarget.hasZeroCycleRegMove()) {
2357         // Cyclone recognizes "ORR Xd, XZR, Xm" as a zero-cycle register move.
2358         unsigned DestRegX = TRI->getMatchingSuperReg(DestReg, AArch64::sub_32,
2359                                                      &AArch64::GPR64spRegClass);
2360         unsigned SrcRegX = TRI->getMatchingSuperReg(SrcReg, AArch64::sub_32,
2361                                                     &AArch64::GPR64spRegClass);
2362         // This instruction is reading and writing X registers.  This may upset
2363         // the register scavenger and machine verifier, so we need to indicate
2364         // that we are reading an undefined value from SrcRegX, but a proper
2365         // value from SrcReg.
2366         BuildMI(MBB, I, DL, get(AArch64::ORRXrr), DestRegX)
2367             .addReg(AArch64::XZR)
2368             .addReg(SrcRegX, RegState::Undef)
2369             .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
2370       } else {
2371         // Otherwise, expand to ORR WZR.
2372         BuildMI(MBB, I, DL, get(AArch64::ORRWrr), DestReg)
2373             .addReg(AArch64::WZR)
2374             .addReg(SrcReg, getKillRegState(KillSrc));
2375       }
2376     }
2377     return;
2378   }
2379 
2380   if (AArch64::GPR64spRegClass.contains(DestReg) &&
2381       (AArch64::GPR64spRegClass.contains(SrcReg) || SrcReg == AArch64::XZR)) {
2382     if (DestReg == AArch64::SP || SrcReg == AArch64::SP) {
2383       // If either operand is SP, expand to ADD #0.
2384       BuildMI(MBB, I, DL, get(AArch64::ADDXri), DestReg)
2385           .addReg(SrcReg, getKillRegState(KillSrc))
2386           .addImm(0)
2387           .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
2388     } else if (SrcReg == AArch64::XZR && Subtarget.hasZeroCycleZeroingGP()) {
2389       BuildMI(MBB, I, DL, get(AArch64::MOVZXi), DestReg)
2390           .addImm(0)
2391           .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
2392     } else {
2393       // Otherwise, expand to ORR XZR.
2394       BuildMI(MBB, I, DL, get(AArch64::ORRXrr), DestReg)
2395           .addReg(AArch64::XZR)
2396           .addReg(SrcReg, getKillRegState(KillSrc));
2397     }
2398     return;
2399   }
2400 
2401   // Copy a DDDD register quad by copying the individual sub-registers.
2402   if (AArch64::DDDDRegClass.contains(DestReg) &&
2403       AArch64::DDDDRegClass.contains(SrcReg)) {
2404     static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1,
2405                                        AArch64::dsub2, AArch64::dsub3};
2406     copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8,
2407                      Indices);
2408     return;
2409   }
2410 
2411   // Copy a DDD register triple by copying the individual sub-registers.
2412   if (AArch64::DDDRegClass.contains(DestReg) &&
2413       AArch64::DDDRegClass.contains(SrcReg)) {
2414     static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1,
2415                                        AArch64::dsub2};
2416     copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8,
2417                      Indices);
2418     return;
2419   }
2420 
2421   // Copy a DD register pair by copying the individual sub-registers.
2422   if (AArch64::DDRegClass.contains(DestReg) &&
2423       AArch64::DDRegClass.contains(SrcReg)) {
2424     static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1};
2425     copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8,
2426                      Indices);
2427     return;
2428   }
2429 
2430   // Copy a QQQQ register quad by copying the individual sub-registers.
2431   if (AArch64::QQQQRegClass.contains(DestReg) &&
2432       AArch64::QQQQRegClass.contains(SrcReg)) {
2433     static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1,
2434                                        AArch64::qsub2, AArch64::qsub3};
2435     copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8,
2436                      Indices);
2437     return;
2438   }
2439 
2440   // Copy a QQQ register triple by copying the individual sub-registers.
2441   if (AArch64::QQQRegClass.contains(DestReg) &&
2442       AArch64::QQQRegClass.contains(SrcReg)) {
2443     static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1,
2444                                        AArch64::qsub2};
2445     copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8,
2446                      Indices);
2447     return;
2448   }
2449 
2450   // Copy a QQ register pair by copying the individual sub-registers.
2451   if (AArch64::QQRegClass.contains(DestReg) &&
2452       AArch64::QQRegClass.contains(SrcReg)) {
2453     static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1};
2454     copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8,
2455                      Indices);
2456     return;
2457   }
2458 
2459   if (AArch64::XSeqPairsClassRegClass.contains(DestReg) &&
2460       AArch64::XSeqPairsClassRegClass.contains(SrcReg)) {
2461     static const unsigned Indices[] = {AArch64::sube64, AArch64::subo64};
2462     copyGPRRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRXrs,
2463                     AArch64::XZR, Indices);
2464     return;
2465   }
2466 
2467   if (AArch64::WSeqPairsClassRegClass.contains(DestReg) &&
2468       AArch64::WSeqPairsClassRegClass.contains(SrcReg)) {
2469     static const unsigned Indices[] = {AArch64::sube32, AArch64::subo32};
2470     copyGPRRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRWrs,
2471                     AArch64::WZR, Indices);
2472     return;
2473   }
2474 
2475   if (AArch64::FPR128RegClass.contains(DestReg) &&
2476       AArch64::FPR128RegClass.contains(SrcReg)) {
2477     if (Subtarget.hasNEON()) {
2478       BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg)
2479           .addReg(SrcReg)
2480           .addReg(SrcReg, getKillRegState(KillSrc));
2481     } else {
2482       BuildMI(MBB, I, DL, get(AArch64::STRQpre))
2483           .addReg(AArch64::SP, RegState::Define)
2484           .addReg(SrcReg, getKillRegState(KillSrc))
2485           .addReg(AArch64::SP)
2486           .addImm(-16);
2487       BuildMI(MBB, I, DL, get(AArch64::LDRQpre))
2488           .addReg(AArch64::SP, RegState::Define)
2489           .addReg(DestReg, RegState::Define)
2490           .addReg(AArch64::SP)
2491           .addImm(16);
2492     }
2493     return;
2494   }
2495 
2496   if (AArch64::FPR64RegClass.contains(DestReg) &&
2497       AArch64::FPR64RegClass.contains(SrcReg)) {
2498     if (Subtarget.hasNEON()) {
2499       DestReg = RI.getMatchingSuperReg(DestReg, AArch64::dsub,
2500                                        &AArch64::FPR128RegClass);
2501       SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::dsub,
2502                                       &AArch64::FPR128RegClass);
2503       BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg)
2504           .addReg(SrcReg)
2505           .addReg(SrcReg, getKillRegState(KillSrc));
2506     } else {
2507       BuildMI(MBB, I, DL, get(AArch64::FMOVDr), DestReg)
2508           .addReg(SrcReg, getKillRegState(KillSrc));
2509     }
2510     return;
2511   }
2512 
2513   if (AArch64::FPR32RegClass.contains(DestReg) &&
2514       AArch64::FPR32RegClass.contains(SrcReg)) {
2515     if (Subtarget.hasNEON()) {
2516       DestReg = RI.getMatchingSuperReg(DestReg, AArch64::ssub,
2517                                        &AArch64::FPR128RegClass);
2518       SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::ssub,
2519                                       &AArch64::FPR128RegClass);
2520       BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg)
2521           .addReg(SrcReg)
2522           .addReg(SrcReg, getKillRegState(KillSrc));
2523     } else {
2524       BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
2525           .addReg(SrcReg, getKillRegState(KillSrc));
2526     }
2527     return;
2528   }
2529 
2530   if (AArch64::FPR16RegClass.contains(DestReg) &&
2531       AArch64::FPR16RegClass.contains(SrcReg)) {
2532     if (Subtarget.hasNEON()) {
2533       DestReg = RI.getMatchingSuperReg(DestReg, AArch64::hsub,
2534                                        &AArch64::FPR128RegClass);
2535       SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::hsub,
2536                                       &AArch64::FPR128RegClass);
2537       BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg)
2538           .addReg(SrcReg)
2539           .addReg(SrcReg, getKillRegState(KillSrc));
2540     } else {
2541       DestReg = RI.getMatchingSuperReg(DestReg, AArch64::hsub,
2542                                        &AArch64::FPR32RegClass);
2543       SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::hsub,
2544                                       &AArch64::FPR32RegClass);
2545       BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
2546           .addReg(SrcReg, getKillRegState(KillSrc));
2547     }
2548     return;
2549   }
2550 
2551   if (AArch64::FPR8RegClass.contains(DestReg) &&
2552       AArch64::FPR8RegClass.contains(SrcReg)) {
2553     if (Subtarget.hasNEON()) {
2554       DestReg = RI.getMatchingSuperReg(DestReg, AArch64::bsub,
2555                                        &AArch64::FPR128RegClass);
2556       SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::bsub,
2557                                       &AArch64::FPR128RegClass);
2558       BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg)
2559           .addReg(SrcReg)
2560           .addReg(SrcReg, getKillRegState(KillSrc));
2561     } else {
2562       DestReg = RI.getMatchingSuperReg(DestReg, AArch64::bsub,
2563                                        &AArch64::FPR32RegClass);
2564       SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::bsub,
2565                                       &AArch64::FPR32RegClass);
2566       BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
2567           .addReg(SrcReg, getKillRegState(KillSrc));
2568     }
2569     return;
2570   }
2571 
2572   // Copies between GPR64 and FPR64.
2573   if (AArch64::FPR64RegClass.contains(DestReg) &&
2574       AArch64::GPR64RegClass.contains(SrcReg)) {
2575     BuildMI(MBB, I, DL, get(AArch64::FMOVXDr), DestReg)
2576         .addReg(SrcReg, getKillRegState(KillSrc));
2577     return;
2578   }
2579   if (AArch64::GPR64RegClass.contains(DestReg) &&
2580       AArch64::FPR64RegClass.contains(SrcReg)) {
2581     BuildMI(MBB, I, DL, get(AArch64::FMOVDXr), DestReg)
2582         .addReg(SrcReg, getKillRegState(KillSrc));
2583     return;
2584   }
2585   // Copies between GPR32 and FPR32.
2586   if (AArch64::FPR32RegClass.contains(DestReg) &&
2587       AArch64::GPR32RegClass.contains(SrcReg)) {
2588     BuildMI(MBB, I, DL, get(AArch64::FMOVWSr), DestReg)
2589         .addReg(SrcReg, getKillRegState(KillSrc));
2590     return;
2591   }
2592   if (AArch64::GPR32RegClass.contains(DestReg) &&
2593       AArch64::FPR32RegClass.contains(SrcReg)) {
2594     BuildMI(MBB, I, DL, get(AArch64::FMOVSWr), DestReg)
2595         .addReg(SrcReg, getKillRegState(KillSrc));
2596     return;
2597   }
2598 
2599   if (DestReg == AArch64::NZCV) {
2600     assert(AArch64::GPR64RegClass.contains(SrcReg) && "Invalid NZCV copy");
2601     BuildMI(MBB, I, DL, get(AArch64::MSR))
2602         .addImm(AArch64SysReg::NZCV)
2603         .addReg(SrcReg, getKillRegState(KillSrc))
2604         .addReg(AArch64::NZCV, RegState::Implicit | RegState::Define);
2605     return;
2606   }
2607 
2608   if (SrcReg == AArch64::NZCV) {
2609     assert(AArch64::GPR64RegClass.contains(DestReg) && "Invalid NZCV copy");
2610     BuildMI(MBB, I, DL, get(AArch64::MRS), DestReg)
2611         .addImm(AArch64SysReg::NZCV)
2612         .addReg(AArch64::NZCV, RegState::Implicit | getKillRegState(KillSrc));
2613     return;
2614   }
2615 
2616   llvm_unreachable("unimplemented reg-to-reg copy");
2617 }
2618 
2619 static void storeRegPairToStackSlot(const TargetRegisterInfo &TRI,
2620                                     MachineBasicBlock &MBB,
2621                                     MachineBasicBlock::iterator InsertBefore,
2622                                     const MCInstrDesc &MCID,
2623                                     unsigned SrcReg, bool IsKill,
2624                                     unsigned SubIdx0, unsigned SubIdx1, int FI,
2625                                     MachineMemOperand *MMO) {
2626   unsigned SrcReg0 = SrcReg;
2627   unsigned SrcReg1 = SrcReg;
2628   if (TargetRegisterInfo::isPhysicalRegister(SrcReg)) {
2629     SrcReg0 = TRI.getSubReg(SrcReg, SubIdx0);
2630     SubIdx0 = 0;
2631     SrcReg1 = TRI.getSubReg(SrcReg, SubIdx1);
2632     SubIdx1 = 0;
2633   }
2634   BuildMI(MBB, InsertBefore, DebugLoc(), MCID)
2635       .addReg(SrcReg0, getKillRegState(IsKill), SubIdx0)
2636       .addReg(SrcReg1, getKillRegState(IsKill), SubIdx1)
2637       .addFrameIndex(FI)
2638       .addImm(0)
2639       .addMemOperand(MMO);
2640 }
2641 
2642 void AArch64InstrInfo::storeRegToStackSlot(
2643     MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, unsigned SrcReg,
2644     bool isKill, int FI, const TargetRegisterClass *RC,
2645     const TargetRegisterInfo *TRI) const {
2646   MachineFunction &MF = *MBB.getParent();
2647   MachineFrameInfo &MFI = MF.getFrameInfo();
2648   unsigned Align = MFI.getObjectAlignment(FI);
2649 
2650   MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
2651   MachineMemOperand *MMO = MF.getMachineMemOperand(
2652       PtrInfo, MachineMemOperand::MOStore, MFI.getObjectSize(FI), Align);
2653   unsigned Opc = 0;
2654   bool Offset = true;
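  // Choose the store opcode from the spill size of RC. NEON register tuples
  // are spilled with ST1 variants, which take no immediate offset (tracked by
  // Offset = false); sequential GPR pairs are spilled with STP instead.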
2655   switch (TRI->getSpillSize(*RC)) {
2656   case 1:
2657     if (AArch64::FPR8RegClass.hasSubClassEq(RC))
2658       Opc = AArch64::STRBui;
2659     break;
2660   case 2:
2661     if (AArch64::FPR16RegClass.hasSubClassEq(RC))
2662       Opc = AArch64::STRHui;
2663     break;
2664   case 4:
2665     if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
2666       Opc = AArch64::STRWui;
2667       if (TargetRegisterInfo::isVirtualRegister(SrcReg))
2668         MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR32RegClass);
2669       else
2670         assert(SrcReg != AArch64::WSP);
2671     } else if (AArch64::FPR32RegClass.hasSubClassEq(RC))
2672       Opc = AArch64::STRSui;
2673     break;
2674   case 8:
2675     if (AArch64::GPR64allRegClass.hasSubClassEq(RC)) {
2676       Opc = AArch64::STRXui;
2677       if (TargetRegisterInfo::isVirtualRegister(SrcReg))
2678         MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR64RegClass);
2679       else
2680         assert(SrcReg != AArch64::SP);
2681     } else if (AArch64::FPR64RegClass.hasSubClassEq(RC)) {
2682       Opc = AArch64::STRDui;
2683     } else if (AArch64::WSeqPairsClassRegClass.hasSubClassEq(RC)) {
2684       storeRegPairToStackSlot(getRegisterInfo(), MBB, MBBI,
2685                               get(AArch64::STPWi), SrcReg, isKill,
2686                               AArch64::sube32, AArch64::subo32, FI, MMO);
2687       return;
2688     }
2689     break;
2690   case 16:
2691     if (AArch64::FPR128RegClass.hasSubClassEq(RC))
2692       Opc = AArch64::STRQui;
2693     else if (AArch64::DDRegClass.hasSubClassEq(RC)) {
2694       assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
2695       Opc = AArch64::ST1Twov1d;
2696       Offset = false;
2697     } else if (AArch64::XSeqPairsClassRegClass.hasSubClassEq(RC)) {
2698       storeRegPairToStackSlot(getRegisterInfo(), MBB, MBBI,
2699                               get(AArch64::STPXi), SrcReg, isKill,
2700                               AArch64::sube64, AArch64::subo64, FI, MMO);
2701       return;
2702     }
2703     break;
2704   case 24:
2705     if (AArch64::DDDRegClass.hasSubClassEq(RC)) {
2706       assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
2707       Opc = AArch64::ST1Threev1d;
2708       Offset = false;
2709     }
2710     break;
2711   case 32:
2712     if (AArch64::DDDDRegClass.hasSubClassEq(RC)) {
2713       assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
2714       Opc = AArch64::ST1Fourv1d;
2715       Offset = false;
2716     } else if (AArch64::QQRegClass.hasSubClassEq(RC)) {
2717       assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
2718       Opc = AArch64::ST1Twov2d;
2719       Offset = false;
2720     }
2721     break;
2722   case 48:
2723     if (AArch64::QQQRegClass.hasSubClassEq(RC)) {
2724       assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
2725       Opc = AArch64::ST1Threev2d;
2726       Offset = false;
2727     }
2728     break;
2729   case 64:
2730     if (AArch64::QQQQRegClass.hasSubClassEq(RC)) {
2731       assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
2732       Opc = AArch64::ST1Fourv2d;
2733       Offset = false;
2734     }
2735     break;
2736   }
2737   assert(Opc && "Unknown register class");
2738 
2739   const MachineInstrBuilder MI = BuildMI(MBB, MBBI, DebugLoc(), get(Opc))
2740                                      .addReg(SrcReg, getKillRegState(isKill))
2741                                      .addFrameIndex(FI);
2742 
2743   if (Offset)
2744     MI.addImm(0);
2745   MI.addMemOperand(MMO);
2746 }
2747 
2748 static void loadRegPairFromStackSlot(const TargetRegisterInfo &TRI,
2749                                      MachineBasicBlock &MBB,
2750                                      MachineBasicBlock::iterator InsertBefore,
2751                                      const MCInstrDesc &MCID,
2752                                      unsigned DestReg, unsigned SubIdx0,
2753                                      unsigned SubIdx1, int FI,
2754                                      MachineMemOperand *MMO) {
2755   unsigned DestReg0 = DestReg;
2756   unsigned DestReg1 = DestReg;
2757   bool IsUndef = true;
2758   if (TargetRegisterInfo::isPhysicalRegister(DestReg)) {
2759     DestReg0 = TRI.getSubReg(DestReg, SubIdx0);
2760     SubIdx0 = 0;
2761     DestReg1 = TRI.getSubReg(DestReg, SubIdx1);
2762     SubIdx1 = 0;
2763     IsUndef = false;
2764   }
2765   BuildMI(MBB, InsertBefore, DebugLoc(), MCID)
2766       .addReg(DestReg0, RegState::Define | getUndefRegState(IsUndef), SubIdx0)
2767       .addReg(DestReg1, RegState::Define | getUndefRegState(IsUndef), SubIdx1)
2768       .addFrameIndex(FI)
2769       .addImm(0)
2770       .addMemOperand(MMO);
2771 }
2772 
2773 void AArch64InstrInfo::loadRegFromStackSlot(
2774     MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, unsigned DestReg,
2775     int FI, const TargetRegisterClass *RC,
2776     const TargetRegisterInfo *TRI) const {
2777   MachineFunction &MF = *MBB.getParent();
2778   MachineFrameInfo &MFI = MF.getFrameInfo();
2779   unsigned Align = MFI.getObjectAlignment(FI);
2780   MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
2781   MachineMemOperand *MMO = MF.getMachineMemOperand(
2782       PtrInfo, MachineMemOperand::MOLoad, MFI.getObjectSize(FI), Align);
2783 
2784   unsigned Opc = 0;
2785   bool Offset = true;
2786   switch (TRI->getSpillSize(*RC)) {
2787   case 1:
2788     if (AArch64::FPR8RegClass.hasSubClassEq(RC))
2789       Opc = AArch64::LDRBui;
2790     break;
2791   case 2:
2792     if (AArch64::FPR16RegClass.hasSubClassEq(RC))
2793       Opc = AArch64::LDRHui;
2794     break;
2795   case 4:
2796     if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
2797       Opc = AArch64::LDRWui;
2798       if (TargetRegisterInfo::isVirtualRegister(DestReg))
2799         MF.getRegInfo().constrainRegClass(DestReg, &AArch64::GPR32RegClass);
2800       else
2801         assert(DestReg != AArch64::WSP);
2802     } else if (AArch64::FPR32RegClass.hasSubClassEq(RC))
2803       Opc = AArch64::LDRSui;
2804     break;
2805   case 8:
2806     if (AArch64::GPR64allRegClass.hasSubClassEq(RC)) {
2807       Opc = AArch64::LDRXui;
2808       if (TargetRegisterInfo::isVirtualRegister(DestReg))
2809         MF.getRegInfo().constrainRegClass(DestReg, &AArch64::GPR64RegClass);
2810       else
2811         assert(DestReg != AArch64::SP);
2812     } else if (AArch64::FPR64RegClass.hasSubClassEq(RC)) {
2813       Opc = AArch64::LDRDui;
2814     } else if (AArch64::WSeqPairsClassRegClass.hasSubClassEq(RC)) {
2815       loadRegPairFromStackSlot(getRegisterInfo(), MBB, MBBI,
2816                                get(AArch64::LDPWi), DestReg, AArch64::sube32,
2817                                AArch64::subo32, FI, MMO);
2818       return;
2819     }
2820     break;
2821   case 16:
2822     if (AArch64::FPR128RegClass.hasSubClassEq(RC))
2823       Opc = AArch64::LDRQui;
2824     else if (AArch64::DDRegClass.hasSubClassEq(RC)) {
2825       assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
2826       Opc = AArch64::LD1Twov1d;
2827       Offset = false;
2828     } else if (AArch64::XSeqPairsClassRegClass.hasSubClassEq(RC)) {
2829       loadRegPairFromStackSlot(getRegisterInfo(), MBB, MBBI,
2830                                get(AArch64::LDPXi), DestReg, AArch64::sube64,
2831                                AArch64::subo64, FI, MMO);
2832       return;
2833     }
2834     break;
2835   case 24:
2836     if (AArch64::DDDRegClass.hasSubClassEq(RC)) {
2837       assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
2838       Opc = AArch64::LD1Threev1d;
2839       Offset = false;
2840     }
2841     break;
2842   case 32:
2843     if (AArch64::DDDDRegClass.hasSubClassEq(RC)) {
2844       assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
2845       Opc = AArch64::LD1Fourv1d;
2846       Offset = false;
2847     } else if (AArch64::QQRegClass.hasSubClassEq(RC)) {
2848       assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
2849       Opc = AArch64::LD1Twov2d;
2850       Offset = false;
2851     }
2852     break;
2853   case 48:
2854     if (AArch64::QQQRegClass.hasSubClassEq(RC)) {
2855       assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
2856       Opc = AArch64::LD1Threev2d;
2857       Offset = false;
2858     }
2859     break;
2860   case 64:
2861     if (AArch64::QQQQRegClass.hasSubClassEq(RC)) {
2862       assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
2863       Opc = AArch64::LD1Fourv2d;
2864       Offset = false;
2865     }
2866     break;
2867   }
2868   assert(Opc && "Unknown register class");
2869 
2870   const MachineInstrBuilder MI = BuildMI(MBB, MBBI, DebugLoc(), get(Opc))
2871                                      .addReg(DestReg, getDefRegState(true))
2872                                      .addFrameIndex(FI);
2873   if (Offset)
2874     MI.addImm(0);
2875   MI.addMemOperand(MMO);
2876 }
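// Illustrative sketch (added; not in the original source): for a GPR64
// fill of frame index 0, the switch above selects LDRXui and, since Offset
// remains true, the builder emits something like
//
//   %x0 = LDRXui %stack.0, 0
//
// whereas a QQ-class fill takes the LD1Twov2d path, which has no immediate
// offset operand at all.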
2877 
2878 void llvm::emitFrameOffset(MachineBasicBlock &MBB,
2879                            MachineBasicBlock::iterator MBBI, const DebugLoc &DL,
2880                            unsigned DestReg, unsigned SrcReg, int Offset,
2881                            const TargetInstrInfo *TII,
2882                            MachineInstr::MIFlag Flag, bool SetNZCV,
2883                            bool NeedsWinCFI) {
2884   if (DestReg == SrcReg && Offset == 0)
2885     return;
2886 
2887   assert((DestReg != AArch64::SP || Offset % 16 == 0) &&
2888          "SP increment/decrement not 16-byte aligned");
2889 
2890   bool isSub = Offset < 0;
2891   if (isSub)
2892     Offset = -Offset;
2893 
2894   // FIXME: If the offset won't fit in 24 bits, compute the offset into a
2895   // scratch register.  If DestReg is a virtual register, use it as the
2896   // scratch register; otherwise, create a new virtual register (to be
2897   // replaced by the scavenger at the end of PEI).  That case can be optimized
2898   // slightly if DestReg is SP, which is always 16-byte aligned, so the scratch
2899   // register can be loaded with offset%8 and the add/sub can use an extending
2900   // instruction with LSL#3.
2901   // Currently the function handles any offsets but generates a poor sequence
2902   // of code.
2903   //  assert(Offset < (1 << 24) && "unimplemented reg plus immediate");
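  // Worked example (added for illustration): with DestReg = SrcReg = SP and
  // Offset = 0x123450, the loop below first emits
  //
  //   ADD sp, sp, #0x123, lsl #12   // adds 0x123000
  //
  // and the trailing BuildMI then adds the remaining #0x450, consuming the
  // value twelve bits at a time, shifted chunk first.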
2904 
2905   unsigned Opc;
2906   if (SetNZCV)
2907     Opc = isSub ? AArch64::SUBSXri : AArch64::ADDSXri;
2908   else
2909     Opc = isSub ? AArch64::SUBXri : AArch64::ADDXri;
2910   const unsigned MaxEncoding = 0xfff;
2911   const unsigned ShiftSize = 12;
2912   const unsigned MaxEncodableValue = MaxEncoding << ShiftSize;
2913   while (((unsigned)Offset) >= (1 << ShiftSize)) {
2914     unsigned ThisVal;
2915     if (((unsigned)Offset) > MaxEncodableValue) {
2916       ThisVal = MaxEncodableValue;
2917     } else {
2918       ThisVal = Offset & MaxEncodableValue;
2919     }
2920     assert((ThisVal >> ShiftSize) <= MaxEncoding &&
2921            "Encoding cannot handle value that big");
2922     BuildMI(MBB, MBBI, DL, TII->get(Opc), DestReg)
2923         .addReg(SrcReg)
2924         .addImm(ThisVal >> ShiftSize)
2925         .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftSize))
2926         .setMIFlag(Flag);
2927 
2928     if (NeedsWinCFI && SrcReg == AArch64::SP && DestReg == AArch64::SP)
2929       BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_StackAlloc))
2930           .addImm(ThisVal)
2931           .setMIFlag(Flag);
2932 
2933     SrcReg = DestReg;
2934     Offset -= ThisVal;
2935     if (Offset == 0)
2936       return;
2937   }
2938   BuildMI(MBB, MBBI, DL, TII->get(Opc), DestReg)
2939       .addReg(SrcReg)
2940       .addImm(Offset)
2941       .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0))
2942       .setMIFlag(Flag);
2943 
2944   if (NeedsWinCFI) {
2945     if ((DestReg == AArch64::FP && SrcReg == AArch64::SP) ||
2946         (SrcReg == AArch64::FP && DestReg == AArch64::SP)) {
2947       if (Offset == 0)
2948         BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_SetFP)).
2949                 setMIFlag(Flag);
2950       else
2951         BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_AddFP)).
2952                 addImm(Offset).setMIFlag(Flag);
2953     } else if (DestReg == AArch64::SP) {
2954       BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_StackAlloc)).
2955               addImm(Offset).setMIFlag(Flag);
2956     }
2957   }
2958 }
2959 
2960 MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl(
2961     MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,
2962     MachineBasicBlock::iterator InsertPt, int FrameIndex,
2963     LiveIntervals *LIS) const {
2964   // This is a bit of a hack. Consider this instruction:
2965   //
2966   //   %0 = COPY %sp; GPR64all:%0
2967   //
2968   // We explicitly chose GPR64all for the virtual register so such a copy might
2969   // be eliminated by RegisterCoalescer. However, that may not be possible, and
2970   // %0 may even spill. We can't spill %sp, and since it is in the GPR64all
2971   // register class, TargetInstrInfo::foldMemoryOperand() is going to try.
2972   //
2973   // To prevent that, we are going to constrain the %0 register class here.
2974   //
2975   // <rdar://problem/11522048>
2976   //
2977   if (MI.isFullCopy()) {
2978     unsigned DstReg = MI.getOperand(0).getReg();
2979     unsigned SrcReg = MI.getOperand(1).getReg();
2980     if (SrcReg == AArch64::SP &&
2981         TargetRegisterInfo::isVirtualRegister(DstReg)) {
2982       MF.getRegInfo().constrainRegClass(DstReg, &AArch64::GPR64RegClass);
2983       return nullptr;
2984     }
2985     if (DstReg == AArch64::SP &&
2986         TargetRegisterInfo::isVirtualRegister(SrcReg)) {
2987       MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR64RegClass);
2988       return nullptr;
2989     }
2990   }
2991 
2992   // Handle the case where a copy is being spilled or filled but the source
2993   // and destination register classes don't match.  For example:
2994   //
2995   //   %0 = COPY %xzr; GPR64common:%0
2996   //
2997   // In this case we can still safely fold away the COPY and generate the
2998   // following spill code:
2999   //
3000   //   STRXui %xzr, %stack.0
3001   //
3002   // This also eliminates spilled cross-register-class COPYs (e.g. between x and
3003   // d regs) of the same size.  For example:
3004   //
3005   //   %0 = COPY %1; GPR64:%0, FPR64:%1
3006   //
3007   // will be filled as
3008   //
3009   //   LDRDui %0, fi<#0>
3010   //
3011   // instead of
3012   //
3013   //   LDRXui %Temp, fi<#0>
3014   //   %0 = FMOV %Temp
3015   //
3016   if (MI.isCopy() && Ops.size() == 1 &&
3017       // Make sure we're only folding the explicit COPY defs/uses.
3018       (Ops[0] == 0 || Ops[0] == 1)) {
3019     bool IsSpill = Ops[0] == 0;
3020     bool IsFill = !IsSpill;
3021     const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
3022     const MachineRegisterInfo &MRI = MF.getRegInfo();
3023     MachineBasicBlock &MBB = *MI.getParent();
3024     const MachineOperand &DstMO = MI.getOperand(0);
3025     const MachineOperand &SrcMO = MI.getOperand(1);
3026     unsigned DstReg = DstMO.getReg();
3027     unsigned SrcReg = SrcMO.getReg();
3028     // This is slightly expensive to compute for physical regs since
3029     // getMinimalPhysRegClass is slow.
3030     auto getRegClass = [&](unsigned Reg) {
3031       return TargetRegisterInfo::isVirtualRegister(Reg)
3032                  ? MRI.getRegClass(Reg)
3033                  : TRI.getMinimalPhysRegClass(Reg);
3034     };
3035 
3036     if (DstMO.getSubReg() == 0 && SrcMO.getSubReg() == 0) {
3037       assert(TRI.getRegSizeInBits(*getRegClass(DstReg)) ==
3038                  TRI.getRegSizeInBits(*getRegClass(SrcReg)) &&
3039              "Mismatched register size in non subreg COPY");
3040       if (IsSpill)
3041         storeRegToStackSlot(MBB, InsertPt, SrcReg, SrcMO.isKill(), FrameIndex,
3042                             getRegClass(SrcReg), &TRI);
3043       else
3044         loadRegFromStackSlot(MBB, InsertPt, DstReg, FrameIndex,
3045                              getRegClass(DstReg), &TRI);
3046       return &*--InsertPt;
3047     }
3048 
3049     // Handle cases like spilling def of:
3050     //
3051     //   %0:sub_32<def,read-undef> = COPY %wzr; GPR64common:%0
3052     //
3053     // where the physical register source can be widened and stored to the full
3054     // virtual reg destination stack slot, in this case producing:
3055     //
3056     //   STRXui %xzr, %stack.0
3057     //
3058     if (IsSpill && DstMO.isUndef() &&
3059         TargetRegisterInfo::isPhysicalRegister(SrcReg)) {
3060       assert(SrcMO.getSubReg() == 0 &&
3061              "Unexpected subreg on physical register");
3062       const TargetRegisterClass *SpillRC;
3063       unsigned SpillSubreg;
3064       switch (DstMO.getSubReg()) {
3065       default:
3066         SpillRC = nullptr;
3067         break;
3068       case AArch64::sub_32:
3069       case AArch64::ssub:
3070         if (AArch64::GPR32RegClass.contains(SrcReg)) {
3071           SpillRC = &AArch64::GPR64RegClass;
3072           SpillSubreg = AArch64::sub_32;
3073         } else if (AArch64::FPR32RegClass.contains(SrcReg)) {
3074           SpillRC = &AArch64::FPR64RegClass;
3075           SpillSubreg = AArch64::ssub;
3076         } else
3077           SpillRC = nullptr;
3078         break;
3079       case AArch64::dsub:
3080         if (AArch64::FPR64RegClass.contains(SrcReg)) {
3081           SpillRC = &AArch64::FPR128RegClass;
3082           SpillSubreg = AArch64::dsub;
3083         } else
3084           SpillRC = nullptr;
3085         break;
3086       }
3087 
3088       if (SpillRC)
3089         if (unsigned WidenedSrcReg =
3090                 TRI.getMatchingSuperReg(SrcReg, SpillSubreg, SpillRC)) {
3091           storeRegToStackSlot(MBB, InsertPt, WidenedSrcReg, SrcMO.isKill(),
3092                               FrameIndex, SpillRC, &TRI);
3093           return &*--InsertPt;
3094         }
3095     }
3096 
3097     // Handle cases like filling use of:
3098     //
3099     //   %0:sub_32<def,read-undef> = COPY %1; GPR64:%0, GPR32:%1
3100     //
3101   // where we can load the full virtual reg source stack slot into the subreg
3102     // destination, in this case producing:
3103     //
3104     //   LDRWui %0:sub_32<def,read-undef>, %stack.0
3105     //
3106     if (IsFill && SrcMO.getSubReg() == 0 && DstMO.isUndef()) {
3107       const TargetRegisterClass *FillRC;
3108       switch (DstMO.getSubReg()) {
3109       default:
3110         FillRC = nullptr;
3111         break;
3112       case AArch64::sub_32:
3113         FillRC = &AArch64::GPR32RegClass;
3114         break;
3115       case AArch64::ssub:
3116         FillRC = &AArch64::FPR32RegClass;
3117         break;
3118       case AArch64::dsub:
3119         FillRC = &AArch64::FPR64RegClass;
3120         break;
3121       }
3122 
3123       if (FillRC) {
3124         assert(TRI.getRegSizeInBits(*getRegClass(SrcReg)) ==
3125                    TRI.getRegSizeInBits(*FillRC) &&
3126                "Mismatched regclass size on folded subreg COPY");
3127         loadRegFromStackSlot(MBB, InsertPt, DstReg, FrameIndex, FillRC, &TRI);
3128         MachineInstr &LoadMI = *--InsertPt;
3129         MachineOperand &LoadDst = LoadMI.getOperand(0);
3130         assert(LoadDst.getSubReg() == 0 && "unexpected subreg on fill load");
3131         LoadDst.setSubReg(DstMO.getSubReg());
3132         LoadDst.setIsUndef();
3133         return &LoadMI;
3134       }
3135     }
3136   }
3137 
3138   // Cannot fold.
3139   return nullptr;
3140 }
3141 
3142 int llvm::isAArch64FrameOffsetLegal(const MachineInstr &MI, int &Offset,
3143                                     bool *OutUseUnscaledOp,
3144                                     unsigned *OutUnscaledOp,
3145                                     int *EmittableOffset) {
3146   int Scale = 1;
3147   bool IsSigned = false;
3148   // ImmIdx is adjusted below for opcodes whose immediate is not operand 2.
3149   unsigned ImmIdx = 2;
3150   unsigned UnscaledOp = 0;
3151   // Set output values in case of early exit.
3152   if (EmittableOffset)
3153     *EmittableOffset = 0;
3154   if (OutUseUnscaledOp)
3155     *OutUseUnscaledOp = false;
3156   if (OutUnscaledOp)
3157     *OutUnscaledOp = 0;
3158   switch (MI.getOpcode()) {
3159   default:
3160     llvm_unreachable("unhandled opcode in isAArch64FrameOffsetLegal");
3161   // Vector spills/fills can't take an immediate offset.
3162   case AArch64::LD1Twov2d:
3163   case AArch64::LD1Threev2d:
3164   case AArch64::LD1Fourv2d:
3165   case AArch64::LD1Twov1d:
3166   case AArch64::LD1Threev1d:
3167   case AArch64::LD1Fourv1d:
3168   case AArch64::ST1Twov2d:
3169   case AArch64::ST1Threev2d:
3170   case AArch64::ST1Fourv2d:
3171   case AArch64::ST1Twov1d:
3172   case AArch64::ST1Threev1d:
3173   case AArch64::ST1Fourv1d:
3174     return AArch64FrameOffsetCannotUpdate;
3175   case AArch64::PRFMui:
3176     Scale = 8;
3177     UnscaledOp = AArch64::PRFUMi;
3178     break;
3179   case AArch64::LDRXui:
3180     Scale = 8;
3181     UnscaledOp = AArch64::LDURXi;
3182     break;
3183   case AArch64::LDRWui:
3184     Scale = 4;
3185     UnscaledOp = AArch64::LDURWi;
3186     break;
3187   case AArch64::LDRBui:
3188     Scale = 1;
3189     UnscaledOp = AArch64::LDURBi;
3190     break;
3191   case AArch64::LDRHui:
3192     Scale = 2;
3193     UnscaledOp = AArch64::LDURHi;
3194     break;
3195   case AArch64::LDRSui:
3196     Scale = 4;
3197     UnscaledOp = AArch64::LDURSi;
3198     break;
3199   case AArch64::LDRDui:
3200     Scale = 8;
3201     UnscaledOp = AArch64::LDURDi;
3202     break;
3203   case AArch64::LDRQui:
3204     Scale = 16;
3205     UnscaledOp = AArch64::LDURQi;
3206     break;
3207   case AArch64::LDRBBui:
3208     Scale = 1;
3209     UnscaledOp = AArch64::LDURBBi;
3210     break;
3211   case AArch64::LDRHHui:
3212     Scale = 2;
3213     UnscaledOp = AArch64::LDURHHi;
3214     break;
3215   case AArch64::LDRSBXui:
3216     Scale = 1;
3217     UnscaledOp = AArch64::LDURSBXi;
3218     break;
3219   case AArch64::LDRSBWui:
3220     Scale = 1;
3221     UnscaledOp = AArch64::LDURSBWi;
3222     break;
3223   case AArch64::LDRSHXui:
3224     Scale = 2;
3225     UnscaledOp = AArch64::LDURSHXi;
3226     break;
3227   case AArch64::LDRSHWui:
3228     Scale = 2;
3229     UnscaledOp = AArch64::LDURSHWi;
3230     break;
3231   case AArch64::LDRSWui:
3232     Scale = 4;
3233     UnscaledOp = AArch64::LDURSWi;
3234     break;
3235 
3236   case AArch64::STRXui:
3237     Scale = 8;
3238     UnscaledOp = AArch64::STURXi;
3239     break;
3240   case AArch64::STRWui:
3241     Scale = 4;
3242     UnscaledOp = AArch64::STURWi;
3243     break;
3244   case AArch64::STRBui:
3245     Scale = 1;
3246     UnscaledOp = AArch64::STURBi;
3247     break;
3248   case AArch64::STRHui:
3249     Scale = 2;
3250     UnscaledOp = AArch64::STURHi;
3251     break;
3252   case AArch64::STRSui:
3253     Scale = 4;
3254     UnscaledOp = AArch64::STURSi;
3255     break;
3256   case AArch64::STRDui:
3257     Scale = 8;
3258     UnscaledOp = AArch64::STURDi;
3259     break;
3260   case AArch64::STRQui:
3261     Scale = 16;
3262     UnscaledOp = AArch64::STURQi;
3263     break;
3264   case AArch64::STRBBui:
3265     Scale = 1;
3266     UnscaledOp = AArch64::STURBBi;
3267     break;
3268   case AArch64::STRHHui:
3269     Scale = 2;
3270     UnscaledOp = AArch64::STURHHi;
3271     break;
3272 
3273   case AArch64::LDPXi:
3274   case AArch64::LDPDi:
3275   case AArch64::STPXi:
3276   case AArch64::STPDi:
3277   case AArch64::LDNPXi:
3278   case AArch64::LDNPDi:
3279   case AArch64::STNPXi:
3280   case AArch64::STNPDi:
3281     ImmIdx = 3;
3282     IsSigned = true;
3283     Scale = 8;
3284     break;
3285   case AArch64::LDPQi:
3286   case AArch64::STPQi:
3287   case AArch64::LDNPQi:
3288   case AArch64::STNPQi:
3289     ImmIdx = 3;
3290     IsSigned = true;
3291     Scale = 16;
3292     break;
3293   case AArch64::LDPWi:
3294   case AArch64::LDPSi:
3295   case AArch64::STPWi:
3296   case AArch64::STPSi:
3297   case AArch64::LDNPWi:
3298   case AArch64::LDNPSi:
3299   case AArch64::STNPWi:
3300   case AArch64::STNPSi:
3301     ImmIdx = 3;
3302     IsSigned = true;
3303     Scale = 4;
3304     break;
3305 
3306   case AArch64::LDURXi:
3307   case AArch64::LDURWi:
3308   case AArch64::LDURBi:
3309   case AArch64::LDURHi:
3310   case AArch64::LDURSi:
3311   case AArch64::LDURDi:
3312   case AArch64::LDURQi:
3313   case AArch64::LDURHHi:
3314   case AArch64::LDURBBi:
3315   case AArch64::LDURSBXi:
3316   case AArch64::LDURSBWi:
3317   case AArch64::LDURSHXi:
3318   case AArch64::LDURSHWi:
3319   case AArch64::LDURSWi:
3320   case AArch64::STURXi:
3321   case AArch64::STURWi:
3322   case AArch64::STURBi:
3323   case AArch64::STURHi:
3324   case AArch64::STURSi:
3325   case AArch64::STURDi:
3326   case AArch64::STURQi:
3327   case AArch64::STURBBi:
3328   case AArch64::STURHHi:
3329     Scale = 1;
3330     break;
3331   }
3332 
3333   Offset += MI.getOperand(ImmIdx).getImm() * Scale;
3334 
3335   bool useUnscaledOp = false;
3336   // If the offset doesn't match the scale, we rewrite the instruction to
3337   // use the unscaled instruction instead. Likewise, if we have a negative
3338   // offset (and have an unscaled op to use).
3339   if ((Offset & (Scale - 1)) != 0 || (Offset < 0 && UnscaledOp != 0))
3340     useUnscaledOp = true;
3341 
3342   // Use an unscaled addressing mode if the instruction has a negative offset
3343   // (or if the instruction is already using an unscaled addressing mode).
3344   unsigned MaskBits;
3345   if (IsSigned) {
3346     // ldp/stp instructions.
3347     MaskBits = 7;
3348     Offset /= Scale;
3349   } else if (UnscaledOp == 0 || useUnscaledOp) {
3350     MaskBits = 9;
3351     IsSigned = true;
3352     Scale = 1;
3353   } else {
3354     MaskBits = 12;
3355     IsSigned = false;
3356     Offset /= Scale;
3357   }
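  // Worked example (added for illustration): for STRXui (Scale = 8, unsigned
  // 12-bit immediate), a byte offset of 40 becomes the scaled immediate 5,
  // well inside [0, 4095]. A byte offset of 12 is not a multiple of 8, so
  // useUnscaledOp switches to STURXi and the offset is checked against the
  // signed 9-bit range [-256, 255] instead.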
3358 
3359   // Attempt to fold address computation.
3360   int MaxOff = (1 << (MaskBits - IsSigned)) - 1;
3361   int MinOff = (IsSigned ? (-MaxOff - 1) : 0);
3362   if (Offset >= MinOff && Offset <= MaxOff) {
3363     if (EmittableOffset)
3364       *EmittableOffset = Offset;
3365     Offset = 0;
3366   } else {
3367     int NewOff = Offset < 0 ? MinOff : MaxOff;
3368     if (EmittableOffset)
3369       *EmittableOffset = NewOff;
3370     Offset = (Offset - NewOff) * Scale;
3371   }
3372   if (OutUseUnscaledOp)
3373     *OutUseUnscaledOp = useUnscaledOp;
3374   if (OutUnscaledOp)
3375     *OutUnscaledOp = UnscaledOp;
3376   return AArch64FrameOffsetCanUpdate |
3377          (Offset == 0 ? AArch64FrameOffsetIsLegal : 0);
3378 }
3379 
3380 bool llvm::rewriteAArch64FrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
3381                                     unsigned FrameReg, int &Offset,
3382                                     const AArch64InstrInfo *TII) {
3383   unsigned Opcode = MI.getOpcode();
3384   unsigned ImmIdx = FrameRegIdx + 1;
3385 
3386   if (Opcode == AArch64::ADDSXri || Opcode == AArch64::ADDXri) {
3387     Offset += MI.getOperand(ImmIdx).getImm();
3388     emitFrameOffset(*MI.getParent(), MI, MI.getDebugLoc(),
3389                     MI.getOperand(0).getReg(), FrameReg, Offset, TII,
3390                     MachineInstr::NoFlags, (Opcode == AArch64::ADDSXri));
3391     MI.eraseFromParent();
3392     Offset = 0;
3393     return true;
3394   }
3395 
3396   int NewOffset;
3397   unsigned UnscaledOp;
3398   bool UseUnscaledOp;
3399   int Status = isAArch64FrameOffsetLegal(MI, Offset, &UseUnscaledOp,
3400                                          &UnscaledOp, &NewOffset);
3401   if (Status & AArch64FrameOffsetCanUpdate) {
3402     if (Status & AArch64FrameOffsetIsLegal)
3403       // Replace the FrameIndex with FrameReg.
3404       MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false);
3405     if (UseUnscaledOp)
3406       MI.setDesc(TII->get(UnscaledOp));
3407 
3408     MI.getOperand(ImmIdx).ChangeToImmediate(NewOffset);
3409     return Offset == 0;
3410   }
3411 
3412   return false;
3413 }
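// Usage sketch (added; assumes the usual caller, frame-index elimination
// during PEI): given "%0 = LDRXui %stack.0, 1" where the object lives at
// [sp, #16], Offset arrives as 16, isAArch64FrameOffsetLegal() folds in the
// existing immediate (16 + 1*8 = 24, scaled down to 3), and the operands are
// rewritten to "%0 = LDRXui %sp, 3"; any residue that cannot be encoded is
// left in Offset for the caller to materialize via emitFrameOffset().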
3414 
3415 void AArch64InstrInfo::getNoop(MCInst &NopInst) const {
3416   NopInst.setOpcode(AArch64::HINT);
3417   NopInst.addOperand(MCOperand::createImm(0));
3418 }
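// Note (added): HINT #0 is the architectural encoding of NOP on AArch64, so
// the MCInst built above prints and encodes as a plain "nop".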
3419 
3420 // AArch64 supports MachineCombiner.
3421 bool AArch64InstrInfo::useMachineCombiner() const { return true; }
3422 
3423 // True when Opc sets the NZCV flags.
3424 static bool isCombineInstrSettingFlag(unsigned Opc) {
3425   switch (Opc) {
3426   case AArch64::ADDSWrr:
3427   case AArch64::ADDSWri:
3428   case AArch64::ADDSXrr:
3429   case AArch64::ADDSXri:
3430   case AArch64::SUBSWrr:
3431   case AArch64::SUBSXrr:
3432   // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi.
3433   case AArch64::SUBSWri:
3434   case AArch64::SUBSXri:
3435     return true;
3436   default:
3437     break;
3438   }
3439   return false;
3440 }
3441 
3442 // 32b Opcodes that can be combined with a MUL
3443 static bool isCombineInstrCandidate32(unsigned Opc) {
3444   switch (Opc) {
3445   case AArch64::ADDWrr:
3446   case AArch64::ADDWri:
3447   case AArch64::SUBWrr:
3448   case AArch64::ADDSWrr:
3449   case AArch64::ADDSWri:
3450   case AArch64::SUBSWrr:
3451   // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi.
3452   case AArch64::SUBWri:
3453   case AArch64::SUBSWri:
3454     return true;
3455   default:
3456     break;
3457   }
3458   return false;
3459 }
3460 
3461 // 64b Opcodes that can be combined with a MUL
3462 static bool isCombineInstrCandidate64(unsigned Opc) {
3463   switch (Opc) {
3464   case AArch64::ADDXrr:
3465   case AArch64::ADDXri:
3466   case AArch64::SUBXrr:
3467   case AArch64::ADDSXrr:
3468   case AArch64::ADDSXri:
3469   case AArch64::SUBSXrr:
3470   // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi.
3471   case AArch64::SUBXri:
3472   case AArch64::SUBSXri:
3473     return true;
3474   default:
3475     break;
3476   }
3477   return false;
3478 }
3479 
3480 // FP Opcodes that can be combined with a FMUL
3481 static bool isCombineInstrCandidateFP(const MachineInstr &Inst) {
3482   switch (Inst.getOpcode()) {
3483   default:
3484     break;
3485   case AArch64::FADDSrr:
3486   case AArch64::FADDDrr:
3487   case AArch64::FADDv2f32:
3488   case AArch64::FADDv2f64:
3489   case AArch64::FADDv4f32:
3490   case AArch64::FSUBSrr:
3491   case AArch64::FSUBDrr:
3492   case AArch64::FSUBv2f32:
3493   case AArch64::FSUBv2f64:
3494   case AArch64::FSUBv4f32:
3495     TargetOptions Options = Inst.getParent()->getParent()->getTarget().Options;
3496     return (Options.UnsafeFPMath ||
3497             Options.AllowFPOpFusion == FPOpFusion::Fast);
3498   }
3499   return false;
3500 }
3501 
3502 // Opcodes that can be combined with a MUL
3503 static bool isCombineInstrCandidate(unsigned Opc) {
3504   return (isCombineInstrCandidate32(Opc) || isCombineInstrCandidate64(Opc));
3505 }
3506 
3507 //
3508 // Utility routine that checks if \p MO is defined by an
3509 // \p CombineOpc instruction in the basic block \p MBB.
3510 static bool canCombine(MachineBasicBlock &MBB, MachineOperand &MO,
3511                        unsigned CombineOpc, unsigned ZeroReg = 0,
3512                        bool CheckZeroReg = false) {
3513   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3514   MachineInstr *MI = nullptr;
3515 
3516   if (MO.isReg() && TargetRegisterInfo::isVirtualRegister(MO.getReg()))
3517     MI = MRI.getUniqueVRegDef(MO.getReg());
3518   // And it needs to be in the trace (otherwise, it won't have a depth).
3519   if (!MI || MI->getParent() != &MBB || (unsigned)MI->getOpcode() != CombineOpc)
3520     return false;
3521   // Must only be used by the instruction we combine with.
3522   if (!MRI.hasOneNonDBGUse(MI->getOperand(0).getReg()))
3523     return false;
3524 
3525   if (CheckZeroReg) {
3526     assert(MI->getNumOperands() >= 4 && MI->getOperand(0).isReg() &&
3527            MI->getOperand(1).isReg() && MI->getOperand(2).isReg() &&
3528            MI->getOperand(3).isReg() && "MAdd/MSub must have at least 4 regs");
3529     // The third input reg must be zero.
3530     if (MI->getOperand(3).getReg() != ZeroReg)
3531       return false;
3532   }
3533 
3534   return true;
3535 }
3536 
3537 //
3538 // Is \p MO defined by an integer multiply, and can it be combined?
3539 static bool canCombineWithMUL(MachineBasicBlock &MBB, MachineOperand &MO,
3540                               unsigned MulOpc, unsigned ZeroReg) {
3541   return canCombine(MBB, MO, MulOpc, ZeroReg, true);
3542 }
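// Note (added): AArch64 has no standalone MUL machine opcode; a multiply is
// MADDWrrr/MADDXrrr with WZR/XZR as the addend. That is why callers pass
// MADD opcodes here and why canCombine() checks operand 3 against ZeroReg.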
3543 
3544 //
3545 // Is \p MO defined by a floating-point multiply, and can it be combined?
3546 static bool canCombineWithFMUL(MachineBasicBlock &MBB, MachineOperand &MO,
3547                                unsigned MulOpc) {
3548   return canCombine(MBB, MO, MulOpc);
3549 }
3550 
3551 // TODO: There are many more machine instruction opcodes to match:
3552 //       1. Other data types (integer, vectors)
3553 //       2. Other math / logic operations (xor, or)
3554 //       3. Other forms of the same operation (intrinsics and other variants)
3555 bool AArch64InstrInfo::isAssociativeAndCommutative(
3556     const MachineInstr &Inst) const {
3557   switch (Inst.getOpcode()) {
3558   case AArch64::FADDDrr:
3559   case AArch64::FADDSrr:
3560   case AArch64::FADDv2f32:
3561   case AArch64::FADDv2f64:
3562   case AArch64::FADDv4f32:
3563   case AArch64::FMULDrr:
3564   case AArch64::FMULSrr:
3565   case AArch64::FMULX32:
3566   case AArch64::FMULX64:
3567   case AArch64::FMULXv2f32:
3568   case AArch64::FMULXv2f64:
3569   case AArch64::FMULXv4f32:
3570   case AArch64::FMULv2f32:
3571   case AArch64::FMULv2f64:
3572   case AArch64::FMULv4f32:
3573     return Inst.getParent()->getParent()->getTarget().Options.UnsafeFPMath;
3574   default:
3575     return false;
3576   }
3577 }
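// Illustrative note (added): flagging these opcodes lets the generic machine
// combiner reassociate chains such as
//
//   ((a + b) + c) + d  ==>  (a + b) + (c + d)
//
// roughly halving the critical-path depth. It is gated on UnsafeFPMath
// because floating-point addition is not truly associative.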
3578 
3579 /// Find instructions that can be turned into madd.
3580 static bool getMaddPatterns(MachineInstr &Root,
3581                             SmallVectorImpl<MachineCombinerPattern> &Patterns) {
3582   unsigned Opc = Root.getOpcode();
3583   MachineBasicBlock &MBB = *Root.getParent();
3584   bool Found = false;
3585 
3586   if (!isCombineInstrCandidate(Opc))
3587     return false;
3588   if (isCombineInstrSettingFlag(Opc)) {
3589     int Cmp_NZCV = Root.findRegisterDefOperandIdx(AArch64::NZCV, true);
3590     // When NZCV is live, bail out.
3591     if (Cmp_NZCV == -1)
3592       return false;
3593     unsigned NewOpc = convertToNonFlagSettingOpc(Root);
3594     // When the opcode can't change, bail out.
3595     // CHECKME: do we miss any cases for opcode conversion?
3596     if (NewOpc == Opc)
3597       return false;
3598     Opc = NewOpc;
3599   }
3600 
3601   switch (Opc) {
3602   default:
3603     break;
3604   case AArch64::ADDWrr:
3605     assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() &&
3606            "ADDWrr does not have register operands");
3607     if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDWrrr,
3608                           AArch64::WZR)) {
3609       Patterns.push_back(MachineCombinerPattern::MULADDW_OP1);
3610       Found = true;
3611     }
3612     if (canCombineWithMUL(MBB, Root.getOperand(2), AArch64::MADDWrrr,
3613                           AArch64::WZR)) {
3614       Patterns.push_back(MachineCombinerPattern::MULADDW_OP2);
3615       Found = true;
3616     }
3617     break;
3618   case AArch64::ADDXrr:
3619     if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDXrrr,
3620                           AArch64::XZR)) {
3621       Patterns.push_back(MachineCombinerPattern::MULADDX_OP1);
3622       Found = true;
3623     }
3624     if (canCombineWithMUL(MBB, Root.getOperand(2), AArch64::MADDXrrr,
3625                           AArch64::XZR)) {
3626       Patterns.push_back(MachineCombinerPattern::MULADDX_OP2);
3627       Found = true;
3628     }
3629     break;
3630   case AArch64::SUBWrr:
3631     if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDWrrr,
3632                           AArch64::WZR)) {
3633       Patterns.push_back(MachineCombinerPattern::MULSUBW_OP1);
3634       Found = true;
3635     }
3636     if (canCombineWithMUL(MBB, Root.getOperand(2), AArch64::MADDWrrr,
3637                           AArch64::WZR)) {
3638       Patterns.push_back(MachineCombinerPattern::MULSUBW_OP2);
3639       Found = true;
3640     }
3641     break;
3642   case AArch64::SUBXrr:
3643     if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDXrrr,
3644                           AArch64::XZR)) {
3645       Patterns.push_back(MachineCombinerPattern::MULSUBX_OP1);
3646       Found = true;
3647     }
3648     if (canCombineWithMUL(MBB, Root.getOperand(2), AArch64::MADDXrrr,
3649                           AArch64::XZR)) {
3650       Patterns.push_back(MachineCombinerPattern::MULSUBX_OP2);
3651       Found = true;
3652     }
3653     break;
3654   case AArch64::ADDWri:
3655     if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDWrrr,
3656                           AArch64::WZR)) {
3657       Patterns.push_back(MachineCombinerPattern::MULADDWI_OP1);
3658       Found = true;
3659     }
3660     break;
3661   case AArch64::ADDXri:
3662     if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDXrrr,
3663                           AArch64::XZR)) {
3664       Patterns.push_back(MachineCombinerPattern::MULADDXI_OP1);
3665       Found = true;
3666     }
3667     break;
3668   case AArch64::SUBWri:
3669     if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDWrrr,
3670                           AArch64::WZR)) {
3671       Patterns.push_back(MachineCombinerPattern::MULSUBWI_OP1);
3672       Found = true;
3673     }
3674     break;
3675   case AArch64::SUBXri:
3676     if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDXrrr,
3677                           AArch64::XZR)) {
3678       Patterns.push_back(MachineCombinerPattern::MULSUBXI_OP1);
3679       Found = true;
3680     }
3681     break;
3682   }
3683   return Found;
3684 }
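// Example (added for illustration): for the MULADDW_OP1 pattern above, a
// sequence such as
//
//   %3:gpr32 = MADDWrrr %1, %2, %wzr   (a plain MUL)
//   %5:gpr32 = ADDWrr %3, %4
//
// is later rewritten by genAlternativeCodeSequence() into
//
//   %5:gpr32 = MADDWrrr %1, %2, %4
//
// replacing two instructions with one.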
3685 /// Floating-Point Support
3686 
3687 /// Find instructions that can be turned into madd.
3688 static bool getFMAPatterns(MachineInstr &Root,
3689                            SmallVectorImpl<MachineCombinerPattern> &Patterns) {
3690 
3691   if (!isCombineInstrCandidateFP(Root))
3692     return false;
3693 
3694   MachineBasicBlock &MBB = *Root.getParent();
3695   bool Found = false;
3696 
3697   switch (Root.getOpcode()) {
3698   default:
3699     assert(false && "Unsupported FP instruction in combiner");
3700     break;
3701   case AArch64::FADDSrr:
3702     assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() &&
3703            "FADDWrr does not have register operands");
3704     if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FMULSrr)) {
3705       Patterns.push_back(MachineCombinerPattern::FMULADDS_OP1);
3706       Found = true;
3707     } else if (canCombineWithFMUL(MBB, Root.getOperand(1),
3708                                   AArch64::FMULv1i32_indexed)) {
3709       Patterns.push_back(MachineCombinerPattern::FMLAv1i32_indexed_OP1);
3710       Found = true;
3711     }
3712     if (canCombineWithFMUL(MBB, Root.getOperand(2), AArch64::FMULSrr)) {
3713       Patterns.push_back(MachineCombinerPattern::FMULADDS_OP2);
3714       Found = true;
3715     } else if (canCombineWithFMUL(MBB, Root.getOperand(2),
3716                                   AArch64::FMULv1i32_indexed)) {
3717       Patterns.push_back(MachineCombinerPattern::FMLAv1i32_indexed_OP2);
3718       Found = true;
3719     }
3720     break;
3721   case AArch64::FADDDrr:
3722     if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FMULDrr)) {
3723       Patterns.push_back(MachineCombinerPattern::FMULADDD_OP1);
3724       Found = true;
3725     } else if (canCombineWithFMUL(MBB, Root.getOperand(1),
3726                                   AArch64::FMULv1i64_indexed)) {
3727       Patterns.push_back(MachineCombinerPattern::FMLAv1i64_indexed_OP1);
3728       Found = true;
3729     }
3730     if (canCombineWithFMUL(MBB, Root.getOperand(2), AArch64::FMULDrr)) {
3731       Patterns.push_back(MachineCombinerPattern::FMULADDD_OP2);
3732       Found = true;
3733     } else if (canCombineWithFMUL(MBB, Root.getOperand(2),
3734                                   AArch64::FMULv1i64_indexed)) {
3735       Patterns.push_back(MachineCombinerPattern::FMLAv1i64_indexed_OP2);
3736       Found = true;
3737     }
3738     break;
3739   case AArch64::FADDv2f32:
3740     if (canCombineWithFMUL(MBB, Root.getOperand(1),
3741                            AArch64::FMULv2i32_indexed)) {
3742       Patterns.push_back(MachineCombinerPattern::FMLAv2i32_indexed_OP1);
3743       Found = true;
3744     } else if (canCombineWithFMUL(MBB, Root.getOperand(1),
3745                                   AArch64::FMULv2f32)) {
3746       Patterns.push_back(MachineCombinerPattern::FMLAv2f32_OP1);
3747       Found = true;
3748     }
3749     if (canCombineWithFMUL(MBB, Root.getOperand(2),
3750                            AArch64::FMULv2i32_indexed)) {
3751       Patterns.push_back(MachineCombinerPattern::FMLAv2i32_indexed_OP2);
3752       Found = true;
3753     } else if (canCombineWithFMUL(MBB, Root.getOperand(2),
3754                                   AArch64::FMULv2f32)) {
3755       Patterns.push_back(MachineCombinerPattern::FMLAv2f32_OP2);
3756       Found = true;
3757     }
3758     break;
3759   case AArch64::FADDv2f64:
3760     if (canCombineWithFMUL(MBB, Root.getOperand(1),
3761                            AArch64::FMULv2i64_indexed)) {
3762       Patterns.push_back(MachineCombinerPattern::FMLAv2i64_indexed_OP1);
3763       Found = true;
3764     } else if (canCombineWithFMUL(MBB, Root.getOperand(1),
3765                                   AArch64::FMULv2f64)) {
3766       Patterns.push_back(MachineCombinerPattern::FMLAv2f64_OP1);
3767       Found = true;
3768     }
3769     if (canCombineWithFMUL(MBB, Root.getOperand(2),
3770                            AArch64::FMULv2i64_indexed)) {
3771       Patterns.push_back(MachineCombinerPattern::FMLAv2i64_indexed_OP2);
3772       Found = true;
3773     } else if (canCombineWithFMUL(MBB, Root.getOperand(2),
3774                                   AArch64::FMULv2f64)) {
3775       Patterns.push_back(MachineCombinerPattern::FMLAv2f64_OP2);
3776       Found = true;
3777     }
3778     break;
3779   case AArch64::FADDv4f32:
3780     if (canCombineWithFMUL(MBB, Root.getOperand(1),
3781                            AArch64::FMULv4i32_indexed)) {
3782       Patterns.push_back(MachineCombinerPattern::FMLAv4i32_indexed_OP1);
3783       Found = true;
3784     } else if (canCombineWithFMUL(MBB, Root.getOperand(1),
3785                                   AArch64::FMULv4f32)) {
3786       Patterns.push_back(MachineCombinerPattern::FMLAv4f32_OP1);
3787       Found = true;
3788     }
3789     if (canCombineWithFMUL(MBB, Root.getOperand(2),
3790                            AArch64::FMULv4i32_indexed)) {
3791       Patterns.push_back(MachineCombinerPattern::FMLAv4i32_indexed_OP2);
3792       Found = true;
3793     } else if (canCombineWithFMUL(MBB, Root.getOperand(2),
3794                                   AArch64::FMULv4f32)) {
3795       Patterns.push_back(MachineCombinerPattern::FMLAv4f32_OP2);
3796       Found = true;
3797     }
3798     break;
3799 
3800   case AArch64::FSUBSrr:
3801     if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FMULSrr)) {
3802       Patterns.push_back(MachineCombinerPattern::FMULSUBS_OP1);
3803       Found = true;
3804     }
3805     if (canCombineWithFMUL(MBB, Root.getOperand(2), AArch64::FMULSrr)) {
3806       Patterns.push_back(MachineCombinerPattern::FMULSUBS_OP2);
3807       Found = true;
3808     } else if (canCombineWithFMUL(MBB, Root.getOperand(2),
3809                                   AArch64::FMULv1i32_indexed)) {
3810       Patterns.push_back(MachineCombinerPattern::FMLSv1i32_indexed_OP2);
3811       Found = true;
3812     }
3813     if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FNMULSrr)) {
3814       Patterns.push_back(MachineCombinerPattern::FNMULSUBS_OP1);
3815       Found = true;
3816     }
3817     break;
3818   case AArch64::FSUBDrr:
3819     if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FMULDrr)) {
3820       Patterns.push_back(MachineCombinerPattern::FMULSUBD_OP1);
3821       Found = true;
3822     }
3823     if (canCombineWithFMUL(MBB, Root.getOperand(2), AArch64::FMULDrr)) {
3824       Patterns.push_back(MachineCombinerPattern::FMULSUBD_OP2);
3825       Found = true;
3826     } else if (canCombineWithFMUL(MBB, Root.getOperand(2),
3827                                   AArch64::FMULv1i64_indexed)) {
3828       Patterns.push_back(MachineCombinerPattern::FMLSv1i64_indexed_OP2);
3829       Found = true;
3830     }
3831     if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FNMULDrr)) {
3832       Patterns.push_back(MachineCombinerPattern::FNMULSUBD_OP1);
3833       Found = true;
3834     }
3835     break;
3836   case AArch64::FSUBv2f32:
3837     if (canCombineWithFMUL(MBB, Root.getOperand(2),
3838                            AArch64::FMULv2i32_indexed)) {
3839       Patterns.push_back(MachineCombinerPattern::FMLSv2i32_indexed_OP2);
3840       Found = true;
3841     } else if (canCombineWithFMUL(MBB, Root.getOperand(2),
3842                                   AArch64::FMULv2f32)) {
3843       Patterns.push_back(MachineCombinerPattern::FMLSv2f32_OP2);
3844       Found = true;
3845     }
3846     if (canCombineWithFMUL(MBB, Root.getOperand(1),
3847                            AArch64::FMULv2i32_indexed)) {
3848       Patterns.push_back(MachineCombinerPattern::FMLSv2i32_indexed_OP1);
3849       Found = true;
3850     } else if (canCombineWithFMUL(MBB, Root.getOperand(1),
3851                                   AArch64::FMULv2f32)) {
3852       Patterns.push_back(MachineCombinerPattern::FMLSv2f32_OP1);
3853       Found = true;
3854     }
3855     break;
3856   case AArch64::FSUBv2f64:
3857     if (canCombineWithFMUL(MBB, Root.getOperand(2),
3858                            AArch64::FMULv2i64_indexed)) {
3859       Patterns.push_back(MachineCombinerPattern::FMLSv2i64_indexed_OP2);
3860       Found = true;
3861     } else if (canCombineWithFMUL(MBB, Root.getOperand(2),
3862                                   AArch64::FMULv2f64)) {
3863       Patterns.push_back(MachineCombinerPattern::FMLSv2f64_OP2);
3864       Found = true;
3865     }
3866     if (canCombineWithFMUL(MBB, Root.getOperand(1),
3867                            AArch64::FMULv2i64_indexed)) {
3868       Patterns.push_back(MachineCombinerPattern::FMLSv2i64_indexed_OP1);
3869       Found = true;
3870     } else if (canCombineWithFMUL(MBB, Root.getOperand(1),
3871                                   AArch64::FMULv2f64)) {
3872       Patterns.push_back(MachineCombinerPattern::FMLSv2f64_OP1);
3873       Found = true;
3874     }
3875     break;
3876   case AArch64::FSUBv4f32:
3877     if (canCombineWithFMUL(MBB, Root.getOperand(2),
3878                            AArch64::FMULv4i32_indexed)) {
3879       Patterns.push_back(MachineCombinerPattern::FMLSv4i32_indexed_OP2);
3880       Found = true;
3881     } else if (canCombineWithFMUL(MBB, Root.getOperand(2),
3882                                   AArch64::FMULv4f32)) {
3883       Patterns.push_back(MachineCombinerPattern::FMLSv4f32_OP2);
3884       Found = true;
3885     }
3886     if (canCombineWithFMUL(MBB, Root.getOperand(1),
3887                            AArch64::FMULv4i32_indexed)) {
3888       Patterns.push_back(MachineCombinerPattern::FMLSv4i32_indexed_OP1);
3889       Found = true;
3890     } else if (canCombineWithFMUL(MBB, Root.getOperand(1),
3891                                   AArch64::FMULv4f32)) {
3892       Patterns.push_back(MachineCombinerPattern::FMLSv4f32_OP1);
3893       Found = true;
3894     }
3895     break;
3896   }
3897   return Found;
3898 }
3899 
3900 /// Return true when a code sequence can improve throughput. It
3901 /// should be called only for instructions in loops.
3902 /// \param Pattern - combiner pattern
3903 bool AArch64InstrInfo::isThroughputPattern(
3904     MachineCombinerPattern Pattern) const {
3905   switch (Pattern) {
3906   default:
3907     break;
3908   case MachineCombinerPattern::FMULADDS_OP1:
3909   case MachineCombinerPattern::FMULADDS_OP2:
3910   case MachineCombinerPattern::FMULSUBS_OP1:
3911   case MachineCombinerPattern::FMULSUBS_OP2:
3912   case MachineCombinerPattern::FMULADDD_OP1:
3913   case MachineCombinerPattern::FMULADDD_OP2:
3914   case MachineCombinerPattern::FMULSUBD_OP1:
3915   case MachineCombinerPattern::FMULSUBD_OP2:
3916   case MachineCombinerPattern::FNMULSUBS_OP1:
3917   case MachineCombinerPattern::FNMULSUBD_OP1:
3918   case MachineCombinerPattern::FMLAv1i32_indexed_OP1:
3919   case MachineCombinerPattern::FMLAv1i32_indexed_OP2:
3920   case MachineCombinerPattern::FMLAv1i64_indexed_OP1:
3921   case MachineCombinerPattern::FMLAv1i64_indexed_OP2:
3922   case MachineCombinerPattern::FMLAv2f32_OP2:
3923   case MachineCombinerPattern::FMLAv2f32_OP1:
3924   case MachineCombinerPattern::FMLAv2f64_OP1:
3925   case MachineCombinerPattern::FMLAv2f64_OP2:
3926   case MachineCombinerPattern::FMLAv2i32_indexed_OP1:
3927   case MachineCombinerPattern::FMLAv2i32_indexed_OP2:
3928   case MachineCombinerPattern::FMLAv2i64_indexed_OP1:
3929   case MachineCombinerPattern::FMLAv2i64_indexed_OP2:
3930   case MachineCombinerPattern::FMLAv4f32_OP1:
3931   case MachineCombinerPattern::FMLAv4f32_OP2:
3932   case MachineCombinerPattern::FMLAv4i32_indexed_OP1:
3933   case MachineCombinerPattern::FMLAv4i32_indexed_OP2:
3934   case MachineCombinerPattern::FMLSv1i32_indexed_OP2:
3935   case MachineCombinerPattern::FMLSv1i64_indexed_OP2:
3936   case MachineCombinerPattern::FMLSv2i32_indexed_OP2:
3937   case MachineCombinerPattern::FMLSv2i64_indexed_OP2:
3938   case MachineCombinerPattern::FMLSv2f32_OP2:
3939   case MachineCombinerPattern::FMLSv2f64_OP2:
3940   case MachineCombinerPattern::FMLSv4i32_indexed_OP2:
3941   case MachineCombinerPattern::FMLSv4f32_OP2:
3942     return true;
3943   } // end switch (Pattern)
3944   return false;
3945 }
3946 /// Return true when there is potentially a faster code sequence for an
3947 /// instruction chain ending in \p Root. All potential patterns are listed in
3948 /// the \p Pattern vector. Pattern should be sorted in priority order since the
3949 /// pattern evaluator stops checking as soon as it finds a faster sequence.
3950 
3951 bool AArch64InstrInfo::getMachineCombinerPatterns(
3952     MachineInstr &Root,
3953     SmallVectorImpl<MachineCombinerPattern> &Patterns) const {
3954   // Integer patterns
3955   if (getMaddPatterns(Root, Patterns))
3956     return true;
3957   // Floating point patterns
3958   if (getFMAPatterns(Root, Patterns))
3959     return true;
3960 
3961   return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns);
3962 }
3963 
3964 enum class FMAInstKind { Default, Indexed, Accumulator };
3965 /// genFusedMultiply - Generate fused multiply instructions.
3966 /// This function supports both integer and floating point instructions.
3967 /// A typical example:
3968 ///  F|MUL I=A,B,0
3969 ///  F|ADD R,I,C
3970 ///  ==> F|MADD R,A,B,C
3971 /// \param MF Containing MachineFunction
3972 /// \param MRI Register information
3973 /// \param TII Target information
3974 /// \param Root is the F|ADD instruction
3975 /// \param [out] InsInstrs is a vector of machine instructions and will
3976 /// contain the generated madd instruction
3977 /// \param IdxMulOpd is index of operand in Root that is the result of
3978 /// the F|MUL. In the example above IdxMulOpd is 1.
3979 /// \param MaddOpc the opcode of the f|madd instruction
3980 /// \param RC Register class of operands
3981 /// \param kind The kind of FMA instruction (addressing mode) to be generated
3982 /// \param ReplacedAddend is the result register from the instruction
3983 /// replacing the non-combined operand, if any.
3984 static MachineInstr *
3985 genFusedMultiply(MachineFunction &MF, MachineRegisterInfo &MRI,
3986                  const TargetInstrInfo *TII, MachineInstr &Root,
3987                  SmallVectorImpl<MachineInstr *> &InsInstrs, unsigned IdxMulOpd,
3988                  unsigned MaddOpc, const TargetRegisterClass *RC,
3989                  FMAInstKind kind = FMAInstKind::Default,
3990                  const unsigned *ReplacedAddend = nullptr) {
3991   assert(IdxMulOpd == 1 || IdxMulOpd == 2);
3992 
3993   unsigned IdxOtherOpd = IdxMulOpd == 1 ? 2 : 1;
3994   MachineInstr *MUL = MRI.getUniqueVRegDef(Root.getOperand(IdxMulOpd).getReg());
3995   unsigned ResultReg = Root.getOperand(0).getReg();
3996   unsigned SrcReg0 = MUL->getOperand(1).getReg();
3997   bool Src0IsKill = MUL->getOperand(1).isKill();
3998   unsigned SrcReg1 = MUL->getOperand(2).getReg();
3999   bool Src1IsKill = MUL->getOperand(2).isKill();
4000 
4001   unsigned SrcReg2;
4002   bool Src2IsKill;
4003   if (ReplacedAddend) {
4004     // If we just generated a new addend, we must be its only use.
4005     SrcReg2 = *ReplacedAddend;
4006     Src2IsKill = true;
4007   } else {
4008     SrcReg2 = Root.getOperand(IdxOtherOpd).getReg();
4009     Src2IsKill = Root.getOperand(IdxOtherOpd).isKill();
4010   }
4011 
4012   if (TargetRegisterInfo::isVirtualRegister(ResultReg))
4013     MRI.constrainRegClass(ResultReg, RC);
4014   if (TargetRegisterInfo::isVirtualRegister(SrcReg0))
4015     MRI.constrainRegClass(SrcReg0, RC);
4016   if (TargetRegisterInfo::isVirtualRegister(SrcReg1))
4017     MRI.constrainRegClass(SrcReg1, RC);
4018   if (TargetRegisterInfo::isVirtualRegister(SrcReg2))
4019     MRI.constrainRegClass(SrcReg2, RC);
4020 
4021   MachineInstrBuilder MIB;
4022   if (kind == FMAInstKind::Default)
4023     MIB = BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc), ResultReg)
4024               .addReg(SrcReg0, getKillRegState(Src0IsKill))
4025               .addReg(SrcReg1, getKillRegState(Src1IsKill))
4026               .addReg(SrcReg2, getKillRegState(Src2IsKill));
4027   else if (kind == FMAInstKind::Indexed)
4028     MIB = BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc), ResultReg)
4029               .addReg(SrcReg2, getKillRegState(Src2IsKill))
4030               .addReg(SrcReg0, getKillRegState(Src0IsKill))
4031               .addReg(SrcReg1, getKillRegState(Src1IsKill))
4032               .addImm(MUL->getOperand(3).getImm());
4033   else if (kind == FMAInstKind::Accumulator)
4034     MIB = BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc), ResultReg)
4035               .addReg(SrcReg2, getKillRegState(Src2IsKill))
4036               .addReg(SrcReg0, getKillRegState(Src0IsKill))
4037               .addReg(SrcReg1, getKillRegState(Src1IsKill));
4038   else
4039     assert(false && "Invalid FMA instruction kind");
4040   // Insert the MADD (MADD, FMA, FMS, FMLA, FMLS).
4041   InsInstrs.push_back(MIB);
4042   return MUL;
4043 }
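// Illustrative sketch (added; not in the original source): for the Indexed
// kind, an FADDv2f32 whose operand 1 is an FMULv2i32_indexed becomes
//
//   %4 = FMLAv2i32_indexed %addend, %a, %b, idx
//
// note that the accumulator comes first and that the lane index of the
// original FMUL (its operand 3) is carried over, matching the operand order
// built above.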
4044 
4045 /// genMaddR - Generate a madd instruction, combining the mul and add using
4046 /// an extra virtual register.
4047 /// Example - an ADD intermediate needs to be stored in a register:
4048 ///   MUL I=A,B,0
4049 ///   ADD R,I,Imm
4050 ///   ==> ORR  V, ZR, Imm
4051 ///   ==> MADD R,A,B,V
4052 /// \param MF Containing MachineFunction
4053 /// \param MRI Register information
4054 /// \param TII Target information
4055 /// \param Root is the ADD instruction
4056 /// \param [out] InsInstrs is a vector of machine instructions and will
4057 /// contain the generated madd instruction
4058 /// \param IdxMulOpd is index of operand in Root that is the result of
4059 /// the MUL. In the example above IdxMulOpd is 1.
4060 /// \param MaddOpc the opcode of the madd instruction
4061 /// \param VR is a virtual register that holds the value of an ADD operand
4062 /// (V in the example above).
4063 /// \param RC Register class of operands
4064 static MachineInstr *genMaddR(MachineFunction &MF, MachineRegisterInfo &MRI,
4065                               const TargetInstrInfo *TII, MachineInstr &Root,
4066                               SmallVectorImpl<MachineInstr *> &InsInstrs,
4067                               unsigned IdxMulOpd, unsigned MaddOpc, unsigned VR,
4068                               const TargetRegisterClass *RC) {
4069   assert(IdxMulOpd == 1 || IdxMulOpd == 2);
4070 
4071   MachineInstr *MUL = MRI.getUniqueVRegDef(Root.getOperand(IdxMulOpd).getReg());
4072   unsigned ResultReg = Root.getOperand(0).getReg();
4073   unsigned SrcReg0 = MUL->getOperand(1).getReg();
4074   bool Src0IsKill = MUL->getOperand(1).isKill();
4075   unsigned SrcReg1 = MUL->getOperand(2).getReg();
4076   bool Src1IsKill = MUL->getOperand(2).isKill();
4077 
4078   if (TargetRegisterInfo::isVirtualRegister(ResultReg))
4079     MRI.constrainRegClass(ResultReg, RC);
4080   if (TargetRegisterInfo::isVirtualRegister(SrcReg0))
4081     MRI.constrainRegClass(SrcReg0, RC);
4082   if (TargetRegisterInfo::isVirtualRegister(SrcReg1))
4083     MRI.constrainRegClass(SrcReg1, RC);
4084   if (TargetRegisterInfo::isVirtualRegister(VR))
4085     MRI.constrainRegClass(VR, RC);
4086 
4087   MachineInstrBuilder MIB =
4088       BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc), ResultReg)
4089           .addReg(SrcReg0, getKillRegState(Src0IsKill))
4090           .addReg(SrcReg1, getKillRegState(Src1IsKill))
4091           .addReg(VR);
4092   // Insert the MADD
4093   InsInstrs.push_back(MIB);
4094   return MUL;
4095 }
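// Note (added): "ORR V, ZR, Imm" in the pattern above is the canonical way
// to materialize a logical immediate on AArch64 (MOV Wd, #imm is an alias of
// ORR Wd, WZR, #imm), which is why the callers first validate the constant
// with processLogicalImmediate() before taking this path.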
4096 
4097 /// When getMachineCombinerPatterns() finds potential patterns,
4098 /// this function generates the instructions that could replace the
4099 /// original code sequence
4100 void AArch64InstrInfo::genAlternativeCodeSequence(
4101     MachineInstr &Root, MachineCombinerPattern Pattern,
4102     SmallVectorImpl<MachineInstr *> &InsInstrs,
4103     SmallVectorImpl<MachineInstr *> &DelInstrs,
4104     DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) const {
4105   MachineBasicBlock &MBB = *Root.getParent();
4106   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
4107   MachineFunction &MF = *MBB.getParent();
4108   const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
4109 
4110   MachineInstr *MUL;
4111   const TargetRegisterClass *RC;
4112   unsigned Opc;
4113   switch (Pattern) {
4114   default:
4115     // Reassociate instructions.
4116     TargetInstrInfo::genAlternativeCodeSequence(Root, Pattern, InsInstrs,
4117                                                 DelInstrs, InstrIdxForVirtReg);
4118     return;
4119   case MachineCombinerPattern::MULADDW_OP1:
4120   case MachineCombinerPattern::MULADDX_OP1:
4121     // MUL I=A,B,0
4122     // ADD R,I,C
4123     // ==> MADD R,A,B,C
4124     // --- Create(MADD);
4125     if (Pattern == MachineCombinerPattern::MULADDW_OP1) {
4126       Opc = AArch64::MADDWrrr;
4127       RC = &AArch64::GPR32RegClass;
4128     } else {
4129       Opc = AArch64::MADDXrrr;
4130       RC = &AArch64::GPR64RegClass;
4131     }
4132     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
4133     break;
4134   case MachineCombinerPattern::MULADDW_OP2:
4135   case MachineCombinerPattern::MULADDX_OP2:
4136     // MUL I=A,B,0
4137     // ADD R,C,I
4138     // ==> MADD R,A,B,C
4139     // --- Create(MADD);
4140     if (Pattern == MachineCombinerPattern::MULADDW_OP2) {
4141       Opc = AArch64::MADDWrrr;
4142       RC = &AArch64::GPR32RegClass;
4143     } else {
4144       Opc = AArch64::MADDXrrr;
4145       RC = &AArch64::GPR64RegClass;
4146     }
4147     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
4148     break;
4149   case MachineCombinerPattern::MULADDWI_OP1:
4150   case MachineCombinerPattern::MULADDXI_OP1: {
4151     // MUL I=A,B,0
4152     // ADD R,I,Imm
4153     // ==> ORR  V, ZR, Imm
4154     // ==> MADD R,A,B,V
4155     // --- Create(MADD);
4156     const TargetRegisterClass *OrrRC;
4157     unsigned BitSize, OrrOpc, ZeroReg;
4158     if (Pattern == MachineCombinerPattern::MULADDWI_OP1) {
4159       OrrOpc = AArch64::ORRWri;
4160       OrrRC = &AArch64::GPR32spRegClass;
4161       BitSize = 32;
4162       ZeroReg = AArch64::WZR;
4163       Opc = AArch64::MADDWrrr;
4164       RC = &AArch64::GPR32RegClass;
4165     } else {
4166       OrrOpc = AArch64::ORRXri;
4167       OrrRC = &AArch64::GPR64spRegClass;
4168       BitSize = 64;
4169       ZeroReg = AArch64::XZR;
4170       Opc = AArch64::MADDXrrr;
4171       RC = &AArch64::GPR64RegClass;
4172     }
4173     unsigned NewVR = MRI.createVirtualRegister(OrrRC);
4174     uint64_t Imm = Root.getOperand(2).getImm();
4175 
4176     if (Root.getOperand(3).isImm()) {
4177       unsigned Val = Root.getOperand(3).getImm();
4178       Imm = Imm << Val;
4179     }
4180     uint64_t UImm = SignExtend64(Imm, BitSize);
4181     uint64_t Encoding;
4182     if (AArch64_AM::processLogicalImmediate(UImm, BitSize, Encoding)) {
4183       MachineInstrBuilder MIB1 =
4184           BuildMI(MF, Root.getDebugLoc(), TII->get(OrrOpc), NewVR)
4185               .addReg(ZeroReg)
4186               .addImm(Encoding);
4187       InsInstrs.push_back(MIB1);
4188       InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
4189       MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC);
4190     }
4191     break;
4192   }
  case MachineCombinerPattern::MULSUBW_OP1:
  case MachineCombinerPattern::MULSUBX_OP1: {
    // MUL I=A,B,0
    // SUB R,I,C
    // ==> SUB  V, 0, C
    // ==> MADD R,A,B,V // = -C + A*B
    // --- Create(MADD);
    const TargetRegisterClass *SubRC;
    unsigned SubOpc, ZeroReg;
    if (Pattern == MachineCombinerPattern::MULSUBW_OP1) {
      SubOpc = AArch64::SUBWrr;
      SubRC = &AArch64::GPR32spRegClass;
      ZeroReg = AArch64::WZR;
      Opc = AArch64::MADDWrrr;
      RC = &AArch64::GPR32RegClass;
    } else {
      SubOpc = AArch64::SUBXrr;
      SubRC = &AArch64::GPR64spRegClass;
      ZeroReg = AArch64::XZR;
      Opc = AArch64::MADDXrrr;
      RC = &AArch64::GPR64RegClass;
    }
    unsigned NewVR = MRI.createVirtualRegister(SubRC);
    // SUB NewVR, 0, C
    MachineInstrBuilder MIB1 =
        BuildMI(MF, Root.getDebugLoc(), TII->get(SubOpc), NewVR)
            .addReg(ZeroReg)
            .add(Root.getOperand(2));
    InsInstrs.push_back(MIB1);
    InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
    MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC);
    break;
  }
  case MachineCombinerPattern::MULSUBW_OP2:
  case MachineCombinerPattern::MULSUBX_OP2:
    // MUL I=A,B,0
    // SUB R,C,I
    // ==> MSUB R,A,B,C (computes C - A*B)
    // --- Create(MSUB);
    if (Pattern == MachineCombinerPattern::MULSUBW_OP2) {
      Opc = AArch64::MSUBWrrr;
      RC = &AArch64::GPR32RegClass;
    } else {
      Opc = AArch64::MSUBXrrr;
      RC = &AArch64::GPR64RegClass;
    }
    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
    break;
  case MachineCombinerPattern::MULSUBWI_OP1:
  case MachineCombinerPattern::MULSUBXI_OP1: {
    // MUL I=A,B,0
    // SUB R,I,Imm
    // ==> ORR  V, ZR, -Imm
    // ==> MADD R,A,B,V // = -Imm + A*B
    // --- Create(MADD);
    const TargetRegisterClass *OrrRC;
    unsigned BitSize, OrrOpc, ZeroReg;
    if (Pattern == MachineCombinerPattern::MULSUBWI_OP1) {
      OrrOpc = AArch64::ORRWri;
      OrrRC = &AArch64::GPR32spRegClass;
      BitSize = 32;
      ZeroReg = AArch64::WZR;
      Opc = AArch64::MADDWrrr;
      RC = &AArch64::GPR32RegClass;
    } else {
      OrrOpc = AArch64::ORRXri;
      OrrRC = &AArch64::GPR64spRegClass;
      BitSize = 64;
      ZeroReg = AArch64::XZR;
      Opc = AArch64::MADDXrrr;
      RC = &AArch64::GPR64RegClass;
    }
    unsigned NewVR = MRI.createVirtualRegister(OrrRC);
    uint64_t Imm = Root.getOperand(2).getImm();
    if (Root.getOperand(3).isImm()) {
      unsigned Val = Root.getOperand(3).getImm();
      Imm = Imm << Val;
    }
    uint64_t UImm = SignExtend64(-Imm, BitSize);
    uint64_t Encoding;
    if (AArch64_AM::processLogicalImmediate(UImm, BitSize, Encoding)) {
      MachineInstrBuilder MIB1 =
          BuildMI(MF, Root.getDebugLoc(), TII->get(OrrOpc), NewVR)
              .addReg(ZeroReg)
              .addImm(Encoding);
      InsInstrs.push_back(MIB1);
      InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
      MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC);
    }
    break;
  }
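  // Similarly for MULSUBWI_OP1 (illustrative), assuming w0 * w1 - 4:
  //   mul  w8, w0, w1
  //   sub  w9, w8, #4
  // becomes
  //   orr  w9, wzr, #0xfffffffc
  //   madd w9, w0, w1, w9
  // which works because -4 is encodable as a logical immediate (a rotated
  // run of 30 ones).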
  // Floating Point Support
  case MachineCombinerPattern::FMULADDS_OP1:
  case MachineCombinerPattern::FMULADDD_OP1:
    // FMUL I=A,B,0
    // FADD R,I,C
    // ==> FMADD R,A,B,C
    // --- Create(FMADD);
    if (Pattern == MachineCombinerPattern::FMULADDS_OP1) {
      Opc = AArch64::FMADDSrrr;
      RC = &AArch64::FPR32RegClass;
    } else {
      Opc = AArch64::FMADDDrrr;
      RC = &AArch64::FPR64RegClass;
    }
    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
    break;
  case MachineCombinerPattern::FMULADDS_OP2:
  case MachineCombinerPattern::FMULADDD_OP2:
    // FMUL I=A,B,0
    // FADD R,C,I
    // ==> FMADD R,A,B,C
    // --- Create(FMADD);
    if (Pattern == MachineCombinerPattern::FMULADDS_OP2) {
      Opc = AArch64::FMADDSrrr;
      RC = &AArch64::FPR32RegClass;
    } else {
      Opc = AArch64::FMADDDrrr;
      RC = &AArch64::FPR64RegClass;
    }
    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
    break;

  case MachineCombinerPattern::FMLAv1i32_indexed_OP1:
    Opc = AArch64::FMLAv1i32_indexed;
    RC = &AArch64::FPR32RegClass;
    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
                           FMAInstKind::Indexed);
    break;
  case MachineCombinerPattern::FMLAv1i32_indexed_OP2:
    Opc = AArch64::FMLAv1i32_indexed;
    RC = &AArch64::FPR32RegClass;
    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
                           FMAInstKind::Indexed);
    break;

  case MachineCombinerPattern::FMLAv1i64_indexed_OP1:
    Opc = AArch64::FMLAv1i64_indexed;
    RC = &AArch64::FPR64RegClass;
    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
                           FMAInstKind::Indexed);
    break;
  case MachineCombinerPattern::FMLAv1i64_indexed_OP2:
    Opc = AArch64::FMLAv1i64_indexed;
    RC = &AArch64::FPR64RegClass;
    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
                           FMAInstKind::Indexed);
    break;

  case MachineCombinerPattern::FMLAv2i32_indexed_OP1:
  case MachineCombinerPattern::FMLAv2f32_OP1:
    RC = &AArch64::FPR64RegClass;
    if (Pattern == MachineCombinerPattern::FMLAv2i32_indexed_OP1) {
      Opc = AArch64::FMLAv2i32_indexed;
      MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
                             FMAInstKind::Indexed);
    } else {
      Opc = AArch64::FMLAv2f32;
      MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
                             FMAInstKind::Accumulator);
    }
    break;
  case MachineCombinerPattern::FMLAv2i32_indexed_OP2:
  case MachineCombinerPattern::FMLAv2f32_OP2:
    RC = &AArch64::FPR64RegClass;
    if (Pattern == MachineCombinerPattern::FMLAv2i32_indexed_OP2) {
      Opc = AArch64::FMLAv2i32_indexed;
      MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
                             FMAInstKind::Indexed);
    } else {
      Opc = AArch64::FMLAv2f32;
      MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
                             FMAInstKind::Accumulator);
    }
    break;

  case MachineCombinerPattern::FMLAv2i64_indexed_OP1:
  case MachineCombinerPattern::FMLAv2f64_OP1:
    RC = &AArch64::FPR128RegClass;
    if (Pattern == MachineCombinerPattern::FMLAv2i64_indexed_OP1) {
      Opc = AArch64::FMLAv2i64_indexed;
      MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
                             FMAInstKind::Indexed);
    } else {
      Opc = AArch64::FMLAv2f64;
      MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
                             FMAInstKind::Accumulator);
    }
    break;
  case MachineCombinerPattern::FMLAv2i64_indexed_OP2:
  case MachineCombinerPattern::FMLAv2f64_OP2:
    RC = &AArch64::FPR128RegClass;
    if (Pattern == MachineCombinerPattern::FMLAv2i64_indexed_OP2) {
      Opc = AArch64::FMLAv2i64_indexed;
      MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
                             FMAInstKind::Indexed);
    } else {
      Opc = AArch64::FMLAv2f64;
      MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
                             FMAInstKind::Accumulator);
    }
    break;

  case MachineCombinerPattern::FMLAv4i32_indexed_OP1:
  case MachineCombinerPattern::FMLAv4f32_OP1:
    RC = &AArch64::FPR128RegClass;
    if (Pattern == MachineCombinerPattern::FMLAv4i32_indexed_OP1) {
      Opc = AArch64::FMLAv4i32_indexed;
      MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
                             FMAInstKind::Indexed);
    } else {
      Opc = AArch64::FMLAv4f32;
      MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
                             FMAInstKind::Accumulator);
    }
    break;

  case MachineCombinerPattern::FMLAv4i32_indexed_OP2:
  case MachineCombinerPattern::FMLAv4f32_OP2:
    RC = &AArch64::FPR128RegClass;
    if (Pattern == MachineCombinerPattern::FMLAv4i32_indexed_OP2) {
      Opc = AArch64::FMLAv4i32_indexed;
      MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
                             FMAInstKind::Indexed);
    } else {
      Opc = AArch64::FMLAv4f32;
      MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
                             FMAInstKind::Accumulator);
    }
    break;

  case MachineCombinerPattern::FMULSUBS_OP1:
  case MachineCombinerPattern::FMULSUBD_OP1: {
    // FMUL I=A,B,0
    // FSUB R,I,C
    // ==> FNMSUB R,A,B,C // = -C + A*B
    // --- Create(FNMSUB);
    if (Pattern == MachineCombinerPattern::FMULSUBS_OP1) {
      Opc = AArch64::FNMSUBSrrr;
      RC = &AArch64::FPR32RegClass;
    } else {
      Opc = AArch64::FNMSUBDrrr;
      RC = &AArch64::FPR64RegClass;
    }
    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
    break;
  }

  case MachineCombinerPattern::FNMULSUBS_OP1:
  case MachineCombinerPattern::FNMULSUBD_OP1: {
    // FNMUL I=A,B,0
    // FSUB R,I,C
    // ==> FNMADD R,A,B,C // = -A*B - C
    // --- Create(FNMADD);
    if (Pattern == MachineCombinerPattern::FNMULSUBS_OP1) {
      Opc = AArch64::FNMADDSrrr;
      RC = &AArch64::FPR32RegClass;
    } else {
      Opc = AArch64::FNMADDDrrr;
      RC = &AArch64::FPR64RegClass;
    }
    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
    break;
  }

  case MachineCombinerPattern::FMULSUBS_OP2:
  case MachineCombinerPattern::FMULSUBD_OP2: {
    // FMUL I=A,B,0
    // FSUB R,C,I
    // ==> FMSUB R,A,B,C (computes C - A*B)
    // --- Create(FMSUB);
    if (Pattern == MachineCombinerPattern::FMULSUBS_OP2) {
      Opc = AArch64::FMSUBSrrr;
      RC = &AArch64::FPR32RegClass;
    } else {
      Opc = AArch64::FMSUBDrrr;
      RC = &AArch64::FPR64RegClass;
    }
    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
    break;
  }

  case MachineCombinerPattern::FMLSv1i32_indexed_OP2:
    Opc = AArch64::FMLSv1i32_indexed;
    RC = &AArch64::FPR32RegClass;
    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
                           FMAInstKind::Indexed);
    break;

  case MachineCombinerPattern::FMLSv1i64_indexed_OP2:
    Opc = AArch64::FMLSv1i64_indexed;
    RC = &AArch64::FPR64RegClass;
    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
                           FMAInstKind::Indexed);
    break;

  case MachineCombinerPattern::FMLSv2f32_OP2:
  case MachineCombinerPattern::FMLSv2i32_indexed_OP2:
    RC = &AArch64::FPR64RegClass;
    if (Pattern == MachineCombinerPattern::FMLSv2i32_indexed_OP2) {
      Opc = AArch64::FMLSv2i32_indexed;
      MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
                             FMAInstKind::Indexed);
    } else {
      Opc = AArch64::FMLSv2f32;
      MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
                             FMAInstKind::Accumulator);
    }
    break;

  case MachineCombinerPattern::FMLSv2f64_OP2:
  case MachineCombinerPattern::FMLSv2i64_indexed_OP2:
    RC = &AArch64::FPR128RegClass;
    if (Pattern == MachineCombinerPattern::FMLSv2i64_indexed_OP2) {
      Opc = AArch64::FMLSv2i64_indexed;
      MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
                             FMAInstKind::Indexed);
    } else {
      Opc = AArch64::FMLSv2f64;
      MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
                             FMAInstKind::Accumulator);
    }
    break;

  case MachineCombinerPattern::FMLSv4f32_OP2:
  case MachineCombinerPattern::FMLSv4i32_indexed_OP2:
    RC = &AArch64::FPR128RegClass;
    if (Pattern == MachineCombinerPattern::FMLSv4i32_indexed_OP2) {
      Opc = AArch64::FMLSv4i32_indexed;
      MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
                             FMAInstKind::Indexed);
    } else {
      Opc = AArch64::FMLSv4f32;
      MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
                             FMAInstKind::Accumulator);
    }
    break;
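  // The *_OP1 FMLS patterns below have no single-instruction equivalent: the
  // multiply feeds the minuend of the FSUB (R = A*B - C), so the accumulator
  // C is first negated into NewVR with an FNEG, and an FMLA then computes
  // A*B + (-C).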
  case MachineCombinerPattern::FMLSv2f32_OP1:
  case MachineCombinerPattern::FMLSv2i32_indexed_OP1: {
    RC = &AArch64::FPR64RegClass;
    unsigned NewVR = MRI.createVirtualRegister(RC);
    MachineInstrBuilder MIB1 =
        BuildMI(MF, Root.getDebugLoc(), TII->get(AArch64::FNEGv2f32), NewVR)
            .add(Root.getOperand(2));
    InsInstrs.push_back(MIB1);
    InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
    if (Pattern == MachineCombinerPattern::FMLSv2i32_indexed_OP1) {
      Opc = AArch64::FMLAv2i32_indexed;
      MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
                             FMAInstKind::Indexed, &NewVR);
    } else {
      Opc = AArch64::FMLAv2f32;
      MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
                             FMAInstKind::Accumulator, &NewVR);
    }
    break;
  }
  case MachineCombinerPattern::FMLSv4f32_OP1:
  case MachineCombinerPattern::FMLSv4i32_indexed_OP1: {
    RC = &AArch64::FPR128RegClass;
    unsigned NewVR = MRI.createVirtualRegister(RC);
    MachineInstrBuilder MIB1 =
        BuildMI(MF, Root.getDebugLoc(), TII->get(AArch64::FNEGv4f32), NewVR)
            .add(Root.getOperand(2));
    InsInstrs.push_back(MIB1);
    InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
    if (Pattern == MachineCombinerPattern::FMLSv4i32_indexed_OP1) {
      Opc = AArch64::FMLAv4i32_indexed;
      MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
                             FMAInstKind::Indexed, &NewVR);
    } else {
      Opc = AArch64::FMLAv4f32;
      MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
                             FMAInstKind::Accumulator, &NewVR);
    }
    break;
  }
  case MachineCombinerPattern::FMLSv2f64_OP1:
  case MachineCombinerPattern::FMLSv2i64_indexed_OP1: {
    RC = &AArch64::FPR128RegClass;
    unsigned NewVR = MRI.createVirtualRegister(RC);
    MachineInstrBuilder MIB1 =
        BuildMI(MF, Root.getDebugLoc(), TII->get(AArch64::FNEGv2f64), NewVR)
            .add(Root.getOperand(2));
    InsInstrs.push_back(MIB1);
    InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
    if (Pattern == MachineCombinerPattern::FMLSv2i64_indexed_OP1) {
      Opc = AArch64::FMLAv2i64_indexed;
      MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
                             FMAInstKind::Indexed, &NewVR);
    } else {
      Opc = AArch64::FMLAv2f64;
      MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
                             FMAInstKind::Accumulator, &NewVR);
    }
    break;
  }
  } // end switch (Pattern)
  // Record MUL and ADD/SUB for deletion
  DelInstrs.push_back(MUL);
  DelInstrs.push_back(&Root);
}

/// Replace a csinc-branch sequence with a simple conditional branch.
///
/// Examples:
/// 1. \code
///   csinc  w9, wzr, wzr, <condition code>
///   tbnz   w9, #0, 0x44
///    \endcode
/// to
///    \code
///   b.<inverted condition code>
///    \endcode
///
/// 2. \code
///   csinc w9, wzr, wzr, <condition code>
///   tbz   w9, #0, 0x44
///    \endcode
/// to
///    \code
///   b.<condition code>
///    \endcode
///
/// Also replace a compare-and-branch sequence with a TBZ/TBNZ instruction
/// when the compare's constant operand is a power of 2.
///
/// Examples:
///    \code
///   and  w8, w8, #0x400
///   cbnz w8, L1
///    \endcode
/// to
///    \code
///   tbnz w8, #10, L1
///    \endcode
///
/// \param  MI Conditional Branch
/// \return True when the simple conditional branch is generated
///
bool AArch64InstrInfo::optimizeCondBranch(MachineInstr &MI) const {
  bool IsNegativeBranch = false;
  bool IsTestAndBranch = false;
  unsigned TargetBBInMI = 0;
  switch (MI.getOpcode()) {
  default:
    llvm_unreachable("Unknown branch instruction?");
  case AArch64::Bcc:
    return false;
  case AArch64::CBZW:
  case AArch64::CBZX:
    TargetBBInMI = 1;
    break;
  case AArch64::CBNZW:
  case AArch64::CBNZX:
    TargetBBInMI = 1;
    IsNegativeBranch = true;
    break;
  case AArch64::TBZW:
  case AArch64::TBZX:
    TargetBBInMI = 2;
    IsTestAndBranch = true;
    break;
  case AArch64::TBNZW:
  case AArch64::TBNZX:
    TargetBBInMI = 2;
    IsNegativeBranch = true;
    IsTestAndBranch = true;
    break;
  }
  // So we increment a zero register and test for bits other
  // than bit 0? Conservatively bail out in case the verifier
  // missed this case.
  if (IsTestAndBranch && MI.getOperand(1).getImm())
    return false;

  // Find Definition.
  assert(MI.getParent() && "Incomplete machine instruction\n");
  MachineBasicBlock *MBB = MI.getParent();
  MachineFunction *MF = MBB->getParent();
  MachineRegisterInfo *MRI = &MF->getRegInfo();
  unsigned VReg = MI.getOperand(0).getReg();
  if (!TargetRegisterInfo::isVirtualRegister(VReg))
    return false;

  MachineInstr *DefMI = MRI->getVRegDef(VReg);

  // Look through COPY instructions to find definition.
  while (DefMI->isCopy()) {
    unsigned CopyVReg = DefMI->getOperand(1).getReg();
    if (!MRI->hasOneNonDBGUse(CopyVReg))
      return false;
    if (!MRI->hasOneDef(CopyVReg))
      return false;
    DefMI = MRI->getVRegDef(CopyVReg);
  }

  switch (DefMI->getOpcode()) {
  default:
    return false;
  // Fold AND into a TBZ/TBNZ if the constant operand is a power of 2.
  case AArch64::ANDWri:
  case AArch64::ANDXri: {
    if (IsTestAndBranch)
      return false;
    if (DefMI->getParent() != MBB)
      return false;
    if (!MRI->hasOneNonDBGUse(VReg))
      return false;

    bool Is32Bit = (DefMI->getOpcode() == AArch64::ANDWri);
    uint64_t Mask = AArch64_AM::decodeLogicalImmediate(
        DefMI->getOperand(2).getImm(), Is32Bit ? 32 : 64);
    if (!isPowerOf2_64(Mask))
      return false;

    MachineOperand &MO = DefMI->getOperand(1);
    unsigned NewReg = MO.getReg();
    if (!TargetRegisterInfo::isVirtualRegister(NewReg))
      return false;

    assert(!MRI->def_empty(NewReg) && "Register must be defined.");

    MachineBasicBlock &RefToMBB = *MBB;
    MachineBasicBlock *TBB = MI.getOperand(1).getMBB();
    DebugLoc DL = MI.getDebugLoc();
    unsigned Imm = Log2_64(Mask);
    unsigned Opc = (Imm < 32)
                       ? (IsNegativeBranch ? AArch64::TBNZW : AArch64::TBZW)
                       : (IsNegativeBranch ? AArch64::TBNZX : AArch64::TBZX);
    MachineInstr *NewMI = BuildMI(RefToMBB, MI, DL, get(Opc))
                              .addReg(NewReg)
                              .addImm(Imm)
                              .addMBB(TBB);
    // Register lives on to the TB(N)Z now.
    MO.setIsKill(false);

    // Bit offsets smaller than 32 must use the 32-bit (W) variant in all
    // cases, since the 64-bit variant cannot encode them. Therefore, if the
    // input register is 64-bit, take its 32-bit sub-register.
    if (!Is32Bit && Imm < 32)
      NewMI->getOperand(0).setSubReg(AArch64::sub_32);
    MI.eraseFromParent();
    return true;
  }
  // Look for CSINC
  case AArch64::CSINCWr:
  case AArch64::CSINCXr: {
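    // A CSINC of WZR/XZR (i.e. a CSET) produces exactly 0 or 1, so a branch
    // on that value is equivalent to a conditional branch on the CSINC's
    // condition code.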
    if (!(DefMI->getOperand(1).getReg() == AArch64::WZR &&
          DefMI->getOperand(2).getReg() == AArch64::WZR) &&
        !(DefMI->getOperand(1).getReg() == AArch64::XZR &&
          DefMI->getOperand(2).getReg() == AArch64::XZR))
      return false;

    if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, true) != -1)
      return false;

    AArch64CC::CondCode CC = (AArch64CC::CondCode)DefMI->getOperand(3).getImm();
    // Convert only when the condition code is not modified between
    // the CSINC and the branch. The CC may be used by other
    // instructions in between.
    if (areCFlagsAccessedBetweenInstrs(DefMI, MI, &getRegisterInfo(), AK_Write))
      return false;
    MachineBasicBlock &RefToMBB = *MBB;
    MachineBasicBlock *TBB = MI.getOperand(TargetBBInMI).getMBB();
    DebugLoc DL = MI.getDebugLoc();
    if (IsNegativeBranch)
      CC = AArch64CC::getInvertedCondCode(CC);
    BuildMI(RefToMBB, MI, DL, get(AArch64::Bcc)).addImm(CC).addMBB(TBB);
    MI.eraseFromParent();
    return true;
  }
  }
}

std::pair<unsigned, unsigned>
AArch64InstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const {
  const unsigned Mask = AArch64II::MO_FRAGMENT;
  return std::make_pair(TF & Mask, TF & ~Mask);
}

ArrayRef<std::pair<unsigned, const char *>>
AArch64InstrInfo::getSerializableDirectMachineOperandTargetFlags() const {
  using namespace AArch64II;

  static const std::pair<unsigned, const char *> TargetFlags[] = {
      {MO_PAGE, "aarch64-page"}, {MO_PAGEOFF, "aarch64-pageoff"},
      {MO_G3, "aarch64-g3"},     {MO_G2, "aarch64-g2"},
      {MO_G1, "aarch64-g1"},     {MO_G0, "aarch64-g0"},
      {MO_HI12, "aarch64-hi12"}};
  return makeArrayRef(TargetFlags);
}

ArrayRef<std::pair<unsigned, const char *>>
AArch64InstrInfo::getSerializableBitmaskMachineOperandTargetFlags() const {
  using namespace AArch64II;

  static const std::pair<unsigned, const char *> TargetFlags[] = {
      {MO_COFFSTUB, "aarch64-coffstub"}, {MO_GOT, "aarch64-got"},
      {MO_NC, "aarch64-nc"},             {MO_S, "aarch64-s"},
      {MO_TLS, "aarch64-tls"},           {MO_DLLIMPORT, "aarch64-dllimport"}};
  return makeArrayRef(TargetFlags);
}

ArrayRef<std::pair<MachineMemOperand::Flags, const char *>>
AArch64InstrInfo::getSerializableMachineMemOperandTargetFlags() const {
  static const std::pair<MachineMemOperand::Flags, const char *> TargetFlags[] =
      {{MOSuppressPair, "aarch64-suppress-pair"},
       {MOStridedAccess, "aarch64-strided-access"}};
  return makeArrayRef(TargetFlags);
}

/// Constants defining how certain sequences should be outlined.
/// This encompasses how an outlined function should be called, and what kind of
/// frame should be emitted for that outlined function.
///
/// \p MachineOutlinerDefault implies that the function should be called with
/// a save and restore of LR to the stack.
///
/// That is,
///
/// I1     Save LR                    OUTLINED_FUNCTION:
/// I2 --> BL OUTLINED_FUNCTION       I1
/// I3     Restore LR                 I2
///                                   I3
///                                   RET
///
/// * Call construction overhead: 3 (save + BL + restore)
/// * Frame construction overhead: 1 (ret)
/// * Requires stack fixups? Yes
///
/// \p MachineOutlinerTailCall implies that the function is being created from
/// a sequence of instructions ending in a return.
///
/// That is,
///
/// I1                             OUTLINED_FUNCTION:
/// I2 --> B OUTLINED_FUNCTION     I1
/// RET                            I2
///                                RET
///
/// * Call construction overhead: 1 (B)
/// * Frame construction overhead: 0 (Return included in sequence)
/// * Requires stack fixups? No
///
/// \p MachineOutlinerNoLRSave implies that the function should be called using
/// a BL instruction, but doesn't require LR to be saved and restored. This
/// happens when LR is known to be dead.
///
/// That is,
///
/// I1                                OUTLINED_FUNCTION:
/// I2 --> BL OUTLINED_FUNCTION       I1
/// I3                                I2
///                                   I3
///                                   RET
///
/// * Call construction overhead: 1 (BL)
/// * Frame construction overhead: 1 (RET)
/// * Requires stack fixups? No
///
/// \p MachineOutlinerThunk implies that the function is being created from
/// a sequence of instructions ending in a call. The outlined function is
/// called with a BL instruction, and the outlined function tail-calls the
/// original call destination.
///
/// That is,
///
/// I1                                OUTLINED_FUNCTION:
/// I2 --> BL OUTLINED_FUNCTION       I1
/// BL f                              I2
///                                   B f
///
/// * Call construction overhead: 1 (BL)
/// * Frame construction overhead: 0
/// * Requires stack fixups? No
///
/// \p MachineOutlinerRegSave implies that the function should be called with a
/// save and restore of LR to an available register. This allows us to avoid
/// stack fixups. Note that this outlining variant is compatible with the
/// NoLRSave case.
///
/// That is,
///
/// I1     Save LR                    OUTLINED_FUNCTION:
/// I2 --> BL OUTLINED_FUNCTION       I1
/// I3     Restore LR                 I2
///                                   I3
///                                   RET
///
/// * Call construction overhead: 3 (save + BL + restore)
/// * Frame construction overhead: 1 (ret)
/// * Requires stack fixups? No
enum MachineOutlinerClass {
  MachineOutlinerDefault,  /// Emit a save, restore, call, and return.
  MachineOutlinerTailCall, /// Only emit a branch.
  MachineOutlinerNoLRSave, /// Emit a call and return.
  MachineOutlinerThunk,    /// Emit a call and tail-call.
  MachineOutlinerRegSave   /// Same as default, but save to a register.
};

enum MachineOutlinerMBBFlags {
  LRUnavailableSomewhere = 0x2,
  HasCalls = 0x4,
  UnsafeRegsDead = 0x8
};

unsigned
AArch64InstrInfo::findRegisterToSaveLRTo(const outliner::Candidate &C) const {
  assert(C.LRUWasSet && "LRU wasn't set?");
  MachineFunction *MF = C.getMF();
  const AArch64RegisterInfo *ARI = static_cast<const AArch64RegisterInfo *>(
      MF->getSubtarget().getRegisterInfo());

  // Check if there is an available register across the sequence that we can
  // use.
  for (unsigned Reg : AArch64::GPR64RegClass) {
    if (!ARI->isReservedReg(*MF, Reg) &&
        Reg != AArch64::LR &&  // LR is not reserved, but don't use it.
        Reg != AArch64::X16 && // X16 is not guaranteed to be preserved.
        Reg != AArch64::X17 && // Ditto for X17.
        C.LRU.available(Reg) && C.UsedInSequence.available(Reg))
      return Reg;
  }

  // No suitable register. Return 0.
  return 0u;
}

outliner::OutlinedFunction
AArch64InstrInfo::getOutliningCandidateInfo(
    std::vector<outliner::Candidate> &RepeatedSequenceLocs) const {
  outliner::Candidate &FirstCand = RepeatedSequenceLocs[0];
  unsigned SequenceSize =
      std::accumulate(FirstCand.front(), std::next(FirstCand.back()), 0,
                      [this](unsigned Sum, const MachineInstr &MI) {
                        return Sum + getInstSizeInBytes(MI);
                      });

  // Properties about candidate MBBs that hold for all of them.
  unsigned FlagsSetInAll = 0xF;

  // Compute the set of flags common to every candidate's MBB (per-candidate
  // liveness is computed lazily below).
  const TargetRegisterInfo &TRI = getRegisterInfo();
  std::for_each(RepeatedSequenceLocs.begin(), RepeatedSequenceLocs.end(),
                [&FlagsSetInAll](outliner::Candidate &C) {
                  FlagsSetInAll &= C.Flags;
                });

  // According to the AArch64 Procedure Call Standard, the following are
  // undefined on entry/exit from a function call:
  //
  // * Registers x16, x17, (and thus w16, w17)
  // * Condition codes (and thus the NZCV register)
  //
  // Because of this, we can't outline any sequence of instructions where one
  // of these registers is live into/across it. Thus, we need to delete those
  // candidates.
  auto CantGuaranteeValueAcrossCall = [&TRI](outliner::Candidate &C) {
    // If the unsafe registers in this block are all dead, then we don't need
    // to compute liveness here.
    if (C.Flags & UnsafeRegsDead)
      return false;
    C.initLRU(TRI);
    LiveRegUnits LRU = C.LRU;
    return (!LRU.available(AArch64::W16) || !LRU.available(AArch64::W17) ||
            !LRU.available(AArch64::NZCV));
  };

  // Are there any candidates where those registers are live?
  if (!(FlagsSetInAll & UnsafeRegsDead)) {
    // Erase every candidate that violates the restrictions above. (It could be
    // true that we have viable candidates, so it's not worth bailing out in
    // the case that, say, 1 out of 20 candidates violate the restrictions.)
    RepeatedSequenceLocs.erase(std::remove_if(RepeatedSequenceLocs.begin(),
                                              RepeatedSequenceLocs.end(),
                                              CantGuaranteeValueAcrossCall),
                               RepeatedSequenceLocs.end());

    // If the sequence doesn't have enough candidates left, then we're done.
    if (RepeatedSequenceLocs.size() < 2)
      return outliner::OutlinedFunction();
  }

  // At this point, we have only "safe" candidates to outline. Figure out
  // frame + call instruction information.

  unsigned LastInstrOpcode = RepeatedSequenceLocs[0].back()->getOpcode();

  // Helper lambda which sets call information for every candidate.
  auto SetCandidateCallInfo =
      [&RepeatedSequenceLocs](unsigned CallID, unsigned NumBytesForCall) {
        for (outliner::Candidate &C : RepeatedSequenceLocs)
          C.setCallInfo(CallID, NumBytesForCall);
      };

  unsigned FrameID = MachineOutlinerDefault;
  unsigned NumBytesToCreateFrame = 4;

  bool HasBTI = any_of(RepeatedSequenceLocs, [](outliner::Candidate &C) {
    return C.getMF()->getFunction().hasFnAttribute("branch-target-enforcement");
  });
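  // Explanatory note (not from the original source): HasBTI is consulted
  // below before a trailing BLR is rewritten into the tail call of a thunk.
  // Under branch target enforcement the rewritten call would become an
  // indirect branch, which the callee's BTI landing pad may not permit.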

  // Returns true if an instruction is safe to fix up, false otherwise.
  auto IsSafeToFixup = [this, &TRI](MachineInstr &MI) {
    if (MI.isCall())
      return true;

    if (!MI.modifiesRegister(AArch64::SP, &TRI) &&
        !MI.readsRegister(AArch64::SP, &TRI))
      return true;

    // Any modification of SP will break our code to save/restore LR.
    // FIXME: We could handle some instructions which add a constant
    // offset to SP, with a bit more work.
    if (MI.modifiesRegister(AArch64::SP, &TRI))
      return false;

    // At this point, we have a stack instruction that we might need to
    // fix up. We'll handle it if it's a load or store.
    if (MI.mayLoadOrStore()) {
      MachineOperand *Base; // Filled with the base operand of MI.
      int64_t Offset;       // Filled with the offset of MI.

      // Does it allow us to offset the base operand and is the base the
      // register SP?
      if (!getMemOperandWithOffset(MI, Base, Offset, &TRI) || !Base->isReg() ||
          Base->getReg() != AArch64::SP)
        return false;

      // Find the minimum/maximum offset for this instruction and check
      // if fixing it up would be in range.
      int64_t MinOffset,
          MaxOffset;  // Unscaled offsets for the instruction.
      unsigned Scale; // The scale to multiply the offsets by.
      unsigned DummyWidth;
      getMemOpInfo(MI.getOpcode(), Scale, DummyWidth, MinOffset, MaxOffset);

      Offset += 16; // Update the offset to what it would be if we outlined.
      if (Offset < MinOffset * Scale || Offset > MaxOffset * Scale)
        return false;

      // It's in range, so we can outline it.
      return true;
    }

    // FIXME: Add handling for instructions like "add x0, sp, #8".

    // We can't fix it up, so don't outline it.
    return false;
  };

  // True if it's possible to fix up each stack instruction in this sequence.
  // Important for frames/call variants that modify the stack.
  bool AllStackInstrsSafe = std::all_of(
      FirstCand.front(), std::next(FirstCand.back()), IsSafeToFixup);

  // If the last instruction in any candidate is a terminator, then we should
  // tail call all of the candidates.
  if (RepeatedSequenceLocs[0].back()->isTerminator()) {
    FrameID = MachineOutlinerTailCall;
    NumBytesToCreateFrame = 0;
    SetCandidateCallInfo(MachineOutlinerTailCall, 4);
  } else if (LastInstrOpcode == AArch64::BL ||
             (LastInstrOpcode == AArch64::BLR && !HasBTI)) {
    // FIXME: Do we need to check if the code after this uses the value of LR?
    FrameID = MachineOutlinerThunk;
    NumBytesToCreateFrame = 0;
    SetCandidateCallInfo(MachineOutlinerThunk, 4);
  } else {
    // We need to decide how to emit calls + frames. We can always emit the same
    // frame if we don't need to save to the stack. If we have to save to the
    // stack, then we need a different frame.
    unsigned NumBytesNoStackCalls = 0;
    std::vector<outliner::Candidate> CandidatesWithoutStackFixups;

    for (outliner::Candidate &C : RepeatedSequenceLocs) {
      C.initLRU(TRI);

      // Is LR available? If so, we don't need a save.
      if (C.LRU.available(AArch64::LR)) {
        NumBytesNoStackCalls += 4;
        C.setCallInfo(MachineOutlinerNoLRSave, 4);
        CandidatesWithoutStackFixups.push_back(C);
      }

      // Is an unused register available? If so, we won't modify the stack, so
      // we can outline with the same frame type as those that don't save LR.
      else if (findRegisterToSaveLRTo(C)) {
        NumBytesNoStackCalls += 12;
        C.setCallInfo(MachineOutlinerRegSave, 12);
        CandidatesWithoutStackFixups.push_back(C);
      }

      // Is SP used in the sequence at all? If not, we don't have to modify
      // the stack, so we are guaranteed to get the same frame.
      else if (C.UsedInSequence.available(AArch64::SP)) {
        NumBytesNoStackCalls += 12;
        C.setCallInfo(MachineOutlinerDefault, 12);
        CandidatesWithoutStackFixups.push_back(C);
      }

      // If we outline this, we need to modify the stack. Pretend we don't
      // outline this by saving all of its bytes.
      else {
        NumBytesNoStackCalls += SequenceSize;
      }
    }
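
    // Cost note (explanatory, not from the original source): 12 bytes is the
    // per-call-site cost of the default variant, i.e. a 4-byte save, a 4-byte
    // BL, and a 4-byte restore. The comparison below keeps the no-stack-fixup
    // variants whenever their total size is no worse than giving every
    // candidate the default call.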

    // If there are no places where we have to save LR, then note that we
    // don't have to update the stack. Otherwise, give every candidate the
    // default call type, as long as it's safe to do so.
    if (!AllStackInstrsSafe ||
        NumBytesNoStackCalls <= RepeatedSequenceLocs.size() * 12) {
      RepeatedSequenceLocs = CandidatesWithoutStackFixups;
      FrameID = MachineOutlinerNoLRSave;
    } else {
      SetCandidateCallInfo(MachineOutlinerDefault, 12);
    }

    // If we dropped all of the candidates, bail out here.
    if (RepeatedSequenceLocs.size() < 2) {
      RepeatedSequenceLocs.clear();
      return outliner::OutlinedFunction();
    }
  }

  // Does every candidate's MBB contain a call? If so, then we might have a call
  // in the range.
  if (FlagsSetInAll & MachineOutlinerMBBFlags::HasCalls) {
    // Check if the range contains a call. These require a save + restore of the
    // link register.
    bool ModStackToSaveLR = false;
    if (std::any_of(FirstCand.front(), FirstCand.back(),
                    [](const MachineInstr &MI) { return MI.isCall(); }))
      ModStackToSaveLR = true;

    // Handle the last instruction separately. If this is a tail call, then the
    // last instruction is a call. We don't want to save + restore in this case.
    // However, it could be possible that the last instruction is a call without
    // it being valid to tail call this sequence. We should consider this as
    // well.
    else if (FrameID != MachineOutlinerThunk &&
             FrameID != MachineOutlinerTailCall && FirstCand.back()->isCall())
      ModStackToSaveLR = true;

    if (ModStackToSaveLR) {
      // We can't fix up the stack. Bail out.
      if (!AllStackInstrsSafe) {
        RepeatedSequenceLocs.clear();
        return outliner::OutlinedFunction();
      }

      // Save + restore LR.
      NumBytesToCreateFrame += 8;
    }
  }

  return outliner::OutlinedFunction(RepeatedSequenceLocs, SequenceSize,
                                    NumBytesToCreateFrame, FrameID);
}

bool AArch64InstrInfo::isFunctionSafeToOutlineFrom(
    MachineFunction &MF, bool OutlineFromLinkOnceODRs) const {
  const Function &F = MF.getFunction();

  // Can F be deduplicated by the linker? If it can, don't outline from it.
  if (!OutlineFromLinkOnceODRs && F.hasLinkOnceODRLinkage())
    return false;

  // Don't outline from functions with section markings; the program could
  // expect that all the code is in the named section.
  // FIXME: Allow outlining from multiple functions with the same section
  // marking.
  if (F.hasSection())
    return false;

  // Outlining from functions with redzones is unsafe since the outliner may
  // modify the stack. Check if hasRedZone is true or unknown; if yes, don't
  // outline from it.
  AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
  if (!AFI || AFI->hasRedZone().getValueOr(true))
    return false;

  // It's safe to outline from MF.
  return true;
}

bool AArch64InstrInfo::isMBBSafeToOutlineFrom(MachineBasicBlock &MBB,
                                              unsigned &Flags) const {
  // Check if LR is available through all of the MBB. If it's not, then set
  // a flag.
  assert(MBB.getParent()->getRegInfo().tracksLiveness() &&
         "Suitable Machine Function for outlining must track liveness");
  LiveRegUnits LRU(getRegisterInfo());

  std::for_each(MBB.rbegin(), MBB.rend(),
                [&LRU](MachineInstr &MI) { LRU.accumulate(MI); });

  // Check if each of the unsafe registers is available...
  bool W16AvailableInBlock = LRU.available(AArch64::W16);
  bool W17AvailableInBlock = LRU.available(AArch64::W17);
  bool NZCVAvailableInBlock = LRU.available(AArch64::NZCV);

  // If all of these are dead (and not live out), we know we don't have to check
  // them later.
  if (W16AvailableInBlock && W17AvailableInBlock && NZCVAvailableInBlock)
    Flags |= MachineOutlinerMBBFlags::UnsafeRegsDead;

  // Now, add the live outs to the set.
  LRU.addLiveOuts(MBB);

  // If any of these registers is available in the MBB, but is also live out of
  // the block, then we know outlining is unsafe.
  if (W16AvailableInBlock && !LRU.available(AArch64::W16))
    return false;
  if (W17AvailableInBlock && !LRU.available(AArch64::W17))
    return false;
  if (NZCVAvailableInBlock && !LRU.available(AArch64::NZCV))
    return false;

  // Check if there's a call inside this MachineBasicBlock. If there is, then
  // set a flag.
  if (any_of(MBB, [](MachineInstr &MI) { return MI.isCall(); }))
    Flags |= MachineOutlinerMBBFlags::HasCalls;

  MachineFunction *MF = MBB.getParent();

  // In the event that we outline, we may have to save LR. If there is an
  // available register in the MBB, then we'll always save LR there. Check if
  // this is true.
  bool CanSaveLR = false;
  const AArch64RegisterInfo *ARI = static_cast<const AArch64RegisterInfo *>(
      MF->getSubtarget().getRegisterInfo());

  // Check if there is an available register across the sequence that we can
  // use.
  for (unsigned Reg : AArch64::GPR64RegClass) {
    if (!ARI->isReservedReg(*MF, Reg) && Reg != AArch64::LR &&
        Reg != AArch64::X16 && Reg != AArch64::X17 && LRU.available(Reg)) {
      CanSaveLR = true;
      break;
    }
  }

  // Check if we have a register we can save LR to, and if LR was used
  // somewhere. If both of those things are true, then we need to evaluate the
  // safety of outlining stack instructions later.
  if (!CanSaveLR && !LRU.available(AArch64::LR))
    Flags |= MachineOutlinerMBBFlags::LRUnavailableSomewhere;

  return true;
}

outliner::InstrType
AArch64InstrInfo::getOutliningType(MachineBasicBlock::iterator &MIT,
                                   unsigned Flags) const {
  MachineInstr &MI = *MIT;
  MachineBasicBlock *MBB = MI.getParent();
  MachineFunction *MF = MBB->getParent();
  AArch64FunctionInfo *FuncInfo = MF->getInfo<AArch64FunctionInfo>();

  // Don't outline LOHs.
  if (FuncInfo->getLOHRelated().count(&MI))
    return outliner::InstrType::Illegal;

  // Don't allow debug values to impact outlining type.
  if (MI.isDebugInstr() || MI.isIndirectDebugValue())
    return outliner::InstrType::Invisible;

  // At this point, KILL instructions don't really tell us much so we can go
  // ahead and skip over them.
  if (MI.isKill())
    return outliner::InstrType::Invisible;

  // Is this a terminator for a basic block?
  if (MI.isTerminator()) {

    // Is this the end of a function?
    if (MI.getParent()->succ_empty())
      return outliner::InstrType::Legal;

    // It's not, so don't outline it.
    return outliner::InstrType::Illegal;
  }

  // Make sure none of the operands are un-outlinable.
  for (const MachineOperand &MOP : MI.operands()) {
    if (MOP.isCPI() || MOP.isJTI() || MOP.isCFIIndex() || MOP.isFI() ||
        MOP.isTargetIndex())
      return outliner::InstrType::Illegal;

    // If it uses LR or W30 explicitly, then don't touch it.
    if (MOP.isReg() && !MOP.isImplicit() &&
        (MOP.getReg() == AArch64::LR || MOP.getReg() == AArch64::W30))
      return outliner::InstrType::Illegal;
  }

  // Special cases for instructions that can always be outlined, but will fail
  // the later tests. E.g. ADRP is PC-relative, but can always be outlined
  // because it doesn't require a *specific* value to be in LR.
  if (MI.getOpcode() == AArch64::ADRP)
    return outliner::InstrType::Legal;

  // If MI is a call we might be able to outline it. We don't want to outline
  // any calls that rely on the position of items on the stack. When we outline
  // something containing a call, we have to emit a save and restore of LR in
  // the outlined function. Currently, this always happens by saving LR to the
  // stack. Thus, if we outline, say, half the parameters for a function call
  // plus the call, then we'll break the callee's expectations for the layout
  // of the stack.
  //
  // FIXME: Allow calls to functions which construct a stack frame, as long
  // as they don't access arguments on the stack.
  // FIXME: Figure out some way to analyze functions defined in other modules.
  // We should be able to compute the memory usage based on the IR calling
  // convention, even if we can't see the definition.
  if (MI.isCall()) {
    // Get the function associated with the call. Look at each operand and find
    // the one that represents the callee and get its name.
    const Function *Callee = nullptr;
    for (const MachineOperand &MOP : MI.operands()) {
      if (MOP.isGlobal()) {
        Callee = dyn_cast<Function>(MOP.getGlobal());
        break;
      }
    }

    // Never outline calls to mcount.  There isn't any rule that would require
    // this, but the Linux kernel's "ftrace" feature depends on it.
    if (Callee && Callee->getName() == "\01_mcount")
      return outliner::InstrType::Illegal;

    // If we don't know anything about the callee, assume it depends on the
    // stack layout of the caller. In that case, it's only legal to outline
    // as a tail-call.  Whitelist the call instructions we know about so we
    // don't get unexpected results with call pseudo-instructions.
    auto UnknownCallOutlineType = outliner::InstrType::Illegal;
    if (MI.getOpcode() == AArch64::BLR || MI.getOpcode() == AArch64::BL)
      UnknownCallOutlineType = outliner::InstrType::LegalTerminator;

    if (!Callee)
      return UnknownCallOutlineType;
    // We have a function we have information about. Check if it's something we
    // can safely outline.
    MachineFunction *CalleeMF = MF->getMMI().getMachineFunction(*Callee);

    // We don't know what's going on with the callee at all. Don't touch it.
    if (!CalleeMF)
      return UnknownCallOutlineType;

    // Check if we know anything about the callee saves on the function. If we
    // don't, then don't touch it, since that implies that we haven't
    // computed anything about its stack frame yet.
    MachineFrameInfo &MFI = CalleeMF->getFrameInfo();
    if (!MFI.isCalleeSavedInfoValid() || MFI.getStackSize() > 0 ||
        MFI.getNumObjects() > 0)
      return UnknownCallOutlineType;

    // At this point, we can say that CalleeMF ought to not pass anything on the
    // stack. Therefore, we can outline it.
    return outliner::InstrType::Legal;
  }

  // Don't outline positions.
  if (MI.isPosition())
    return outliner::InstrType::Illegal;

  // Don't touch the link register or W30.
  if (MI.readsRegister(AArch64::W30, &getRegisterInfo()) ||
      MI.modifiesRegister(AArch64::W30, &getRegisterInfo()))
    return outliner::InstrType::Illegal;

  return outliner::InstrType::Legal;
}

void AArch64InstrInfo::fixupPostOutline(MachineBasicBlock &MBB) const {
  for (MachineInstr &MI : MBB) {
    MachineOperand *Base;
    unsigned Width;
    int64_t Offset;

    // Is this a load or store with an immediate offset with SP as the base?
    if (!MI.mayLoadOrStore() ||
        !getMemOperandWithOffsetWidth(MI, Base, Offset, Width, &RI) ||
        (Base->isReg() && Base->getReg() != AArch64::SP))
      continue;

    // It is, so we have to fix it up.
    unsigned Scale;
    int64_t Dummy1, Dummy2;

    MachineOperand &StackOffsetOperand = getMemOpBaseRegImmOfsOffsetOperand(MI);
    assert(StackOffsetOperand.isImm() && "Stack offset wasn't immediate!");
    getMemOpInfo(MI.getOpcode(), Scale, Width, Dummy1, Dummy2);
    assert(Scale != 0 && "Unexpected opcode!");

    // We've pushed the return address to the stack, so add 16 to the offset.
    // This is safe, since we already checked if it would overflow when we
    // checked if this instruction was legal to outline.
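    // Worked example (illustrative): "ldr x0, [sp, #8]" has a byte Offset of
    // 8 and a Scale of 8, so NewImm = (8 + 16) / 8 = 3, giving
    // "ldr x0, [sp, #24]".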
    int64_t NewImm = (Offset + 16) / Scale;
    StackOffsetOperand.setImm(NewImm);
  }
}

void AArch64InstrInfo::buildOutlinedFrame(
    MachineBasicBlock &MBB, MachineFunction &MF,
    const outliner::OutlinedFunction &OF) const {
  // For thunk outlining, rewrite the last instruction from a call to a
  // tail-call.
  if (OF.FrameConstructionID == MachineOutlinerThunk) {
    MachineInstr *Call = &*--MBB.instr_end();
    unsigned TailOpcode;
    if (Call->getOpcode() == AArch64::BL) {
      TailOpcode = AArch64::TCRETURNdi;
    } else {
      assert(Call->getOpcode() == AArch64::BLR);
      TailOpcode = AArch64::TCRETURNriALL;
    }
    MachineInstr *TC = BuildMI(MF, DebugLoc(), get(TailOpcode))
                            .add(Call->getOperand(0))
                            .addImm(0);
    MBB.insert(MBB.end(), TC);
    Call->eraseFromParent();
  }

  // Is there a call in the outlined range?
  auto IsNonTailCall = [](MachineInstr &MI) {
    return MI.isCall() && !MI.isReturn();
  };
  if (std::any_of(MBB.instr_begin(), MBB.instr_end(), IsNonTailCall)) {
    // Fix up the instructions in the range, since we're going to modify the
    // stack.
    assert(OF.FrameConstructionID != MachineOutlinerDefault &&
           "Can only fix up stack references once");
    fixupPostOutline(MBB);

    // LR has to be a live in so that we can save it.
    MBB.addLiveIn(AArch64::LR);

    MachineBasicBlock::iterator It = MBB.begin();
    MachineBasicBlock::iterator Et = MBB.end();

    if (OF.FrameConstructionID == MachineOutlinerTailCall ||
        OF.FrameConstructionID == MachineOutlinerThunk)
      Et = std::prev(MBB.end());

    // Insert a save before the outlined region
    MachineInstr *STRXpre = BuildMI(MF, DebugLoc(), get(AArch64::STRXpre))
                                .addReg(AArch64::SP, RegState::Define)
                                .addReg(AArch64::LR)
                                .addReg(AArch64::SP)
                                .addImm(-16);
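    // The above emits "str x30, [sp, #-16]!": a pre-indexed store that saves
    // LR and allocates 16 bytes of stack in a single instruction.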
    It = MBB.insert(It, STRXpre);

    const TargetSubtargetInfo &STI = MF.getSubtarget();
    const MCRegisterInfo *MRI = STI.getRegisterInfo();
    unsigned DwarfReg = MRI->getDwarfRegNum(AArch64::LR, true);

    // Add a CFI saying the stack was moved 16 B down.
    int64_t StackPosEntry =
        MF.addFrameInst(MCCFIInstruction::createDefCfaOffset(nullptr, 16));
    BuildMI(MBB, It, DebugLoc(), get(AArch64::CFI_INSTRUCTION))
        .addCFIIndex(StackPosEntry)
        .setMIFlags(MachineInstr::FrameSetup);

    // Add a CFI saying that the LR that we want to find is now 16 B higher than
    // before.
    int64_t LRPosEntry =
        MF.addFrameInst(MCCFIInstruction::createOffset(nullptr, DwarfReg, 16));
    BuildMI(MBB, It, DebugLoc(), get(AArch64::CFI_INSTRUCTION))
        .addCFIIndex(LRPosEntry)
        .setMIFlags(MachineInstr::FrameSetup);

    // Insert a restore before the terminator for the function.
    MachineInstr *LDRXpost = BuildMI(MF, DebugLoc(), get(AArch64::LDRXpost))
                                 .addReg(AArch64::SP, RegState::Define)
                                 .addReg(AArch64::LR, RegState::Define)
                                 .addReg(AArch64::SP)
                                 .addImm(16);
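    // Conversely, this emits "ldr x30, [sp], #16": a post-indexed load that
    // restores LR and releases the 16 bytes.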
    Et = MBB.insert(Et, LDRXpost);
  }

  // If this is a tail call outlined function, then there's already a return.
  if (OF.FrameConstructionID == MachineOutlinerTailCall ||
      OF.FrameConstructionID == MachineOutlinerThunk)
    return;

  // It's not a tail call, so we have to insert the return ourselves.
  MachineInstr *ret = BuildMI(MF, DebugLoc(), get(AArch64::RET))
                          .addReg(AArch64::LR, RegState::Undef);
  MBB.insert(MBB.end(), ret);

  // Did we have to modify the stack by saving the link register?
  if (OF.FrameConstructionID != MachineOutlinerDefault)
    return;

  // We modified the stack.
  // Walk over the basic block and fix up all the stack accesses.
  fixupPostOutline(MBB);
}

MachineBasicBlock::iterator AArch64InstrInfo::insertOutlinedCall(
    Module &M, MachineBasicBlock &MBB, MachineBasicBlock::iterator &It,
    MachineFunction &MF, const outliner::Candidate &C) const {

  // Are we tail calling?
  if (C.CallConstructionID == MachineOutlinerTailCall) {
    // If yes, then we can just branch to the label.
    It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::TCRETURNdi))
                            .addGlobalAddress(M.getNamedValue(MF.getName()))
                            .addImm(0));
    return It;
  }

  // Are we saving the link register?
  if (C.CallConstructionID == MachineOutlinerNoLRSave ||
      C.CallConstructionID == MachineOutlinerThunk) {
    // No, so just insert the call.
    It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::BL))
                            .addGlobalAddress(M.getNamedValue(MF.getName())));
    return It;
  }

  // We want to return the spot where we inserted the call.
  MachineBasicBlock::iterator CallPt;

  // Instructions for saving and restoring LR around the call instruction we're
  // going to insert.
  MachineInstr *Save;
  MachineInstr *Restore;
  // Can we save to a register?
  if (C.CallConstructionID == MachineOutlinerRegSave) {
    // FIXME: This logic should be sunk into a target-specific interface so that
    // we don't have to recompute the register.
    unsigned Reg = findRegisterToSaveLRTo(C);
    assert(Reg != 0 && "No callee-saved register available?");

    // Save and restore LR from that register.
    Save = BuildMI(MF, DebugLoc(), get(AArch64::ORRXrs), Reg)
               .addReg(AArch64::XZR)
               .addReg(AArch64::LR)
               .addImm(0);
    Restore = BuildMI(MF, DebugLoc(), get(AArch64::ORRXrs), AArch64::LR)
                  .addReg(AArch64::XZR)
                  .addReg(Reg)
                  .addImm(0);
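    // ORRXrs with XZR as the first source operand is the canonical AArch64
    // register-to-register move, i.e. "mov Reg, x30" and "mov x30, Reg".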
  } else {
    // We have the default case. Save and restore from SP.
    Save = BuildMI(MF, DebugLoc(), get(AArch64::STRXpre))
               .addReg(AArch64::SP, RegState::Define)
               .addReg(AArch64::LR)
               .addReg(AArch64::SP)
               .addImm(-16);
    Restore = BuildMI(MF, DebugLoc(), get(AArch64::LDRXpost))
                  .addReg(AArch64::SP, RegState::Define)
                  .addReg(AArch64::LR, RegState::Define)
                  .addReg(AArch64::SP)
                  .addImm(16);
  }

  It = MBB.insert(It, Save);
  It++;

  // Insert the call.
  It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::BL))
                          .addGlobalAddress(M.getNamedValue(MF.getName())));
  CallPt = It;
  It++;

  It = MBB.insert(It, Restore);
  return CallPt;
}

bool AArch64InstrInfo::shouldOutlineFromFunctionByDefault(
  MachineFunction &MF) const {
  return MF.getFunction().optForMinSize();
}

#define GET_INSTRINFO_HELPERS
#include "AArch64GenInstrInfo.inc"