//===- AArch64InstrInfo.cpp - AArch64 Instruction Information -------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file contains the AArch64 implementation of the TargetInstrInfo class.
//
//===----------------------------------------------------------------------===//

#include "AArch64InstrInfo.h"
#include "AArch64MachineFunctionInfo.h"
#include "AArch64Subtarget.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "Utils/AArch64BaseInfo.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/StackMaps.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
#include <cassert>
#include <cstdint>
#include <iterator>
#include <utility>

using namespace llvm;

#define GET_INSTRINFO_CTOR_DTOR
#include "AArch64GenInstrInfo.inc"

static cl::opt<unsigned> TBZDisplacementBits(
    "aarch64-tbz-offset-bits", cl::Hidden, cl::init(14),
    cl::desc("Restrict range of TB[N]Z instructions (DEBUG)"));

static cl::opt<unsigned> CBZDisplacementBits(
    "aarch64-cbz-offset-bits", cl::Hidden, cl::init(19),
    cl::desc("Restrict range of CB[N]Z instructions (DEBUG)"));

static cl::opt<unsigned>
    BCCDisplacementBits("aarch64-bcc-offset-bits", cl::Hidden, cl::init(19),
                        cl::desc("Restrict range of Bcc instructions (DEBUG)"));

AArch64InstrInfo::AArch64InstrInfo(const AArch64Subtarget &STI)
    : AArch64GenInstrInfo(AArch64::ADJCALLSTACKDOWN, AArch64::ADJCALLSTACKUP,
                          AArch64::CATCHRET),
      RI(STI.getTargetTriple()), Subtarget(STI) {}

/// GetInstSize - Return the number of bytes of code the specified
/// instruction may be. This returns the maximum number of bytes.
unsigned AArch64InstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
  const MachineBasicBlock &MBB = *MI.getParent();
  const MachineFunction *MF = MBB.getParent();
  const MCAsmInfo *MAI = MF->getTarget().getMCAsmInfo();

  {
    auto Op = MI.getOpcode();
    if (Op == AArch64::INLINEASM || Op == AArch64::INLINEASM_BR)
      return getInlineAsmLength(MI.getOperand(0).getSymbolName(), *MAI);
  }

  // FIXME: We currently only handle pseudoinstructions that don't get expanded
  // before the assembly printer.
  unsigned NumBytes = 0;
  const MCInstrDesc &Desc = MI.getDesc();
  switch (Desc.getOpcode()) {
  default:
    // Anything not explicitly designated otherwise is a normal 4-byte insn.
    NumBytes = 4;
    break;
  case TargetOpcode::DBG_VALUE:
  case TargetOpcode::EH_LABEL:
  case TargetOpcode::IMPLICIT_DEF:
  case TargetOpcode::KILL:
    NumBytes = 0;
    break;
  case TargetOpcode::STACKMAP:
    // The upper bound for a stackmap intrinsic is the full length of its shadow
    NumBytes = StackMapOpers(&MI).getNumPatchBytes();
    assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
    break;
  case TargetOpcode::PATCHPOINT:
    // The size of the patchpoint intrinsic is the number of bytes requested
    NumBytes = PatchPointOpers(&MI).getNumPatchBytes();
    assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
    break;
  case AArch64::TLSDESC_CALLSEQ:
    // This gets lowered to an instruction sequence which takes 16 bytes
    NumBytes = 16;
    break;
  case AArch64::JumpTableDest32:
  case AArch64::JumpTableDest16:
  case AArch64::JumpTableDest8:
    NumBytes = 12;
    break;
  case AArch64::SPACE:
    NumBytes = MI.getOperand(1).getImm();
    break;
  }

  return NumBytes;
}

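// The condition operand list (Cond) built by parseCondBranch and consumed by
// reverseBranchCondition, instantiateCondBranch, and insertSelect uses two
// layouts:
//   - Bcc:            Cond = {CC}
//   - CB[N]Z{W,X}:    Cond = {-1, Opcode, Reg}
//   - TB[N]Z{W,X}:    Cond = {-1, Opcode, Reg, BitNumber}
// The leading -1 immediate distinguishes a folded compare-and-branch from a
// plain Bcc condition code. For example, "tbnz w8, #3, <bb>" is encoded as
// {-1, AArch64::TBNZW, w8, 3}.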
static void parseCondBranch(MachineInstr *LastInst, MachineBasicBlock *&Target,
                            SmallVectorImpl<MachineOperand> &Cond) {
  // Block ends with fall-through condbranch.
  switch (LastInst->getOpcode()) {
  default:
    llvm_unreachable("Unknown branch instruction?");
  case AArch64::Bcc:
    Target = LastInst->getOperand(1).getMBB();
    Cond.push_back(LastInst->getOperand(0));
    break;
  case AArch64::CBZW:
  case AArch64::CBZX:
  case AArch64::CBNZW:
  case AArch64::CBNZX:
    Target = LastInst->getOperand(1).getMBB();
    Cond.push_back(MachineOperand::CreateImm(-1));
    Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode()));
    Cond.push_back(LastInst->getOperand(0));
    break;
  case AArch64::TBZW:
  case AArch64::TBZX:
  case AArch64::TBNZW:
  case AArch64::TBNZX:
    Target = LastInst->getOperand(2).getMBB();
    Cond.push_back(MachineOperand::CreateImm(-1));
    Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode()));
    Cond.push_back(LastInst->getOperand(0));
    Cond.push_back(LastInst->getOperand(1));
  }
}

static unsigned getBranchDisplacementBits(unsigned Opc) {
  switch (Opc) {
  default:
    llvm_unreachable("unexpected opcode!");
  case AArch64::B:
    return 64;
  case AArch64::TBNZW:
  case AArch64::TBZW:
  case AArch64::TBNZX:
  case AArch64::TBZX:
    return TBZDisplacementBits;
  case AArch64::CBNZW:
  case AArch64::CBZW:
  case AArch64::CBNZX:
  case AArch64::CBZX:
    return CBZDisplacementBits;
  case AArch64::Bcc:
    return BCCDisplacementBits;
  }
}

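// Branch offsets are expressed in bytes here but encoded in units of 4-byte
// instructions, hence the division by 4 below. With the default displacement
// widths above, a TB[N]Z (14 bits) reaches roughly +/-32 KiB and a CB[N]Z or
// Bcc (19 bits) roughly +/-1 MiB.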
bool AArch64InstrInfo::isBranchOffsetInRange(unsigned BranchOp,
                                             int64_t BrOffset) const {
  unsigned Bits = getBranchDisplacementBits(BranchOp);
  assert(Bits >= 3 && "max branch displacement must be enough to jump "
                      "over conditional branch expansion");
  return isIntN(Bits, BrOffset / 4);
}

MachineBasicBlock *
AArch64InstrInfo::getBranchDestBlock(const MachineInstr &MI) const {
  switch (MI.getOpcode()) {
  default:
    llvm_unreachable("unexpected opcode!");
  case AArch64::B:
    return MI.getOperand(0).getMBB();
  case AArch64::TBZW:
  case AArch64::TBNZW:
  case AArch64::TBZX:
  case AArch64::TBNZX:
    return MI.getOperand(2).getMBB();
  case AArch64::CBZW:
  case AArch64::CBNZW:
  case AArch64::CBZX:
  case AArch64::CBNZX:
  case AArch64::Bcc:
    return MI.getOperand(1).getMBB();
  }
}

// Branch analysis.
bool AArch64InstrInfo::analyzeBranch(MachineBasicBlock &MBB,
                                     MachineBasicBlock *&TBB,
                                     MachineBasicBlock *&FBB,
                                     SmallVectorImpl<MachineOperand> &Cond,
                                     bool AllowModify) const {
  // If the block has no terminators, it just falls into the block after it.
  MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
  if (I == MBB.end())
    return false;

  if (!isUnpredicatedTerminator(*I))
    return false;

  // Get the last instruction in the block.
  MachineInstr *LastInst = &*I;

  // If there is only one terminator instruction, process it.
  unsigned LastOpc = LastInst->getOpcode();
  if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
    if (isUncondBranchOpcode(LastOpc)) {
      TBB = LastInst->getOperand(0).getMBB();
      return false;
    }
    if (isCondBranchOpcode(LastOpc)) {
      // Block ends with fall-through condbranch.
      parseCondBranch(LastInst, TBB, Cond);
      return false;
    }
    return true; // Can't handle indirect branch.
  }

  // Get the instruction before it if it is a terminator.
  MachineInstr *SecondLastInst = &*I;
  unsigned SecondLastOpc = SecondLastInst->getOpcode();

  // If AllowModify is true and the block ends with two or more unconditional
  // branches, delete all but the first unconditional branch.
  if (AllowModify && isUncondBranchOpcode(LastOpc)) {
    while (isUncondBranchOpcode(SecondLastOpc)) {
      LastInst->eraseFromParent();
      LastInst = SecondLastInst;
      LastOpc = LastInst->getOpcode();
      if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
        // Return now that the only terminator is an unconditional branch.
        TBB = LastInst->getOperand(0).getMBB();
        return false;
      } else {
        SecondLastInst = &*I;
        SecondLastOpc = SecondLastInst->getOpcode();
      }
    }
  }

  // If there are three terminators, we don't know what sort of block this is.
  if (SecondLastInst && I != MBB.begin() && isUnpredicatedTerminator(*--I))
    return true;

  // If the block ends with a B and a Bcc, handle it.
  if (isCondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
    parseCondBranch(SecondLastInst, TBB, Cond);
    FBB = LastInst->getOperand(0).getMBB();
    return false;
  }

  // If the block ends with two unconditional branches, handle it. The second
  // one is not executed, so remove it.
  if (isUncondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
    TBB = SecondLastInst->getOperand(0).getMBB();
    I = LastInst;
    if (AllowModify)
      I->eraseFromParent();
    return false;
  }

  // ...likewise if it ends with an indirect branch followed by an unconditional
  // branch.
  if (isIndirectBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
    I = LastInst;
    if (AllowModify)
      I->eraseFromParent();
    return true;
  }

  // Otherwise, can't handle this.
  return true;
}

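// Invert the branch condition encoded in Cond: a plain Bcc condition code is
// replaced by its inverse (EQ <-> NE, and so on), while a folded
// compare-and-branch swaps its opcode, e.g. {-1, CBZW, w8} becomes
// {-1, CBNZW, w8}.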
bool AArch64InstrInfo::reverseBranchCondition(
    SmallVectorImpl<MachineOperand> &Cond) const {
  if (Cond[0].getImm() != -1) {
    // Regular Bcc
    AArch64CC::CondCode CC = (AArch64CC::CondCode)(int)Cond[0].getImm();
    Cond[0].setImm(AArch64CC::getInvertedCondCode(CC));
  } else {
    // Folded compare-and-branch
    switch (Cond[1].getImm()) {
    default:
      llvm_unreachable("Unknown conditional branch!");
    case AArch64::CBZW:
      Cond[1].setImm(AArch64::CBNZW);
      break;
    case AArch64::CBNZW:
      Cond[1].setImm(AArch64::CBZW);
      break;
    case AArch64::CBZX:
      Cond[1].setImm(AArch64::CBNZX);
      break;
    case AArch64::CBNZX:
      Cond[1].setImm(AArch64::CBZX);
      break;
    case AArch64::TBZW:
      Cond[1].setImm(AArch64::TBNZW);
      break;
    case AArch64::TBNZW:
      Cond[1].setImm(AArch64::TBZW);
      break;
    case AArch64::TBZX:
      Cond[1].setImm(AArch64::TBNZX);
      break;
    case AArch64::TBNZX:
      Cond[1].setImm(AArch64::TBZX);
      break;
    }
  }

  return false;
}

unsigned AArch64InstrInfo::removeBranch(MachineBasicBlock &MBB,
                                        int *BytesRemoved) const {
  MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
  if (I == MBB.end())
    return 0;

  if (!isUncondBranchOpcode(I->getOpcode()) &&
      !isCondBranchOpcode(I->getOpcode()))
    return 0;

  // Remove the branch.
  I->eraseFromParent();

  I = MBB.end();

  if (I == MBB.begin()) {
    if (BytesRemoved)
      *BytesRemoved = 4;
    return 1;
  }
  --I;
  if (!isCondBranchOpcode(I->getOpcode())) {
    if (BytesRemoved)
      *BytesRemoved = 4;
    return 1;
  }

  // Remove the branch.
  I->eraseFromParent();
  if (BytesRemoved)
    *BytesRemoved = 8;

  return 2;
}

void AArch64InstrInfo::instantiateCondBranch(
    MachineBasicBlock &MBB, const DebugLoc &DL, MachineBasicBlock *TBB,
    ArrayRef<MachineOperand> Cond) const {
  if (Cond[0].getImm() != -1) {
    // Regular Bcc
    BuildMI(&MBB, DL, get(AArch64::Bcc)).addImm(Cond[0].getImm()).addMBB(TBB);
  } else {
    // Folded compare-and-branch
    // Note that we use add(MachineOperand) instead of addReg to keep the
    // register flags.
    const MachineInstrBuilder MIB =
        BuildMI(&MBB, DL, get(Cond[1].getImm())).add(Cond[2]);
    if (Cond.size() > 3)
      MIB.addImm(Cond[3].getImm());
    MIB.addMBB(TBB);
  }
}

unsigned AArch64InstrInfo::insertBranch(
    MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB,
    ArrayRef<MachineOperand> Cond, const DebugLoc &DL, int *BytesAdded) const {
  // Shouldn't be a fall through.
  assert(TBB && "insertBranch must not be told to insert a fallthrough");

  if (!FBB) {
    if (Cond.empty()) // Unconditional branch?
      BuildMI(&MBB, DL, get(AArch64::B)).addMBB(TBB);
    else
      instantiateCondBranch(MBB, DL, TBB, Cond);

    if (BytesAdded)
      *BytesAdded = 4;

    return 1;
  }

  // Two-way conditional branch.
  instantiateCondBranch(MBB, DL, TBB, Cond);
  BuildMI(&MBB, DL, get(AArch64::B)).addMBB(FBB);

  if (BytesAdded)
    *BytesAdded = 8;

  return 2;
}

// Find the original register that VReg is copied from.
static unsigned removeCopies(const MachineRegisterInfo &MRI, unsigned VReg) {
  while (TargetRegisterInfo::isVirtualRegister(VReg)) {
    const MachineInstr *DefMI = MRI.getVRegDef(VReg);
    if (!DefMI->isFullCopy())
      return VReg;
    VReg = DefMI->getOperand(1).getReg();
  }
  return VReg;
}

// Determine if VReg is defined by an instruction that can be folded into a
// csel instruction. If so, return the folded opcode, and the replacement
// register.
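// For example, if VReg is defined by "%vreg = ADDWri %src, 1, 0" (i.e.
// "add w, w, #1"), this returns CSINCWr with *NewVReg set to %src, so the
// select can be emitted as "csinc dst, other, src, cc" with no separate add.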
static unsigned canFoldIntoCSel(const MachineRegisterInfo &MRI, unsigned VReg,
                                unsigned *NewVReg = nullptr) {
  VReg = removeCopies(MRI, VReg);
  if (!TargetRegisterInfo::isVirtualRegister(VReg))
    return 0;

  bool Is64Bit = AArch64::GPR64allRegClass.hasSubClassEq(MRI.getRegClass(VReg));
  const MachineInstr *DefMI = MRI.getVRegDef(VReg);
  unsigned Opc = 0;
  unsigned SrcOpNum = 0;
  switch (DefMI->getOpcode()) {
  case AArch64::ADDSXri:
  case AArch64::ADDSWri:
    // if NZCV is used, do not fold.
    if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, true) == -1)
      return 0;
    // fall-through to ADDXri and ADDWri.
    LLVM_FALLTHROUGH;
  case AArch64::ADDXri:
  case AArch64::ADDWri:
    // add x, 1 -> csinc.
    if (!DefMI->getOperand(2).isImm() || DefMI->getOperand(2).getImm() != 1 ||
        DefMI->getOperand(3).getImm() != 0)
      return 0;
    SrcOpNum = 1;
    Opc = Is64Bit ? AArch64::CSINCXr : AArch64::CSINCWr;
    break;

  case AArch64::ORNXrr:
  case AArch64::ORNWrr: {
    // not x -> csinv, represented as orn dst, xzr, src.
    unsigned ZReg = removeCopies(MRI, DefMI->getOperand(1).getReg());
    if (ZReg != AArch64::XZR && ZReg != AArch64::WZR)
      return 0;
    SrcOpNum = 2;
    Opc = Is64Bit ? AArch64::CSINVXr : AArch64::CSINVWr;
    break;
  }

  case AArch64::SUBSXrr:
  case AArch64::SUBSWrr:
    // if NZCV is used, do not fold.
    if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, true) == -1)
      return 0;
    // fall-through to SUBXrr and SUBWrr.
    LLVM_FALLTHROUGH;
  case AArch64::SUBXrr:
  case AArch64::SUBWrr: {
    // neg x -> csneg, represented as sub dst, xzr, src.
    unsigned ZReg = removeCopies(MRI, DefMI->getOperand(1).getReg());
    if (ZReg != AArch64::XZR && ZReg != AArch64::WZR)
      return 0;
    SrcOpNum = 2;
    Opc = Is64Bit ? AArch64::CSNEGXr : AArch64::CSNEGWr;
    break;
  }
  default:
    return 0;
  }
  assert(Opc && SrcOpNum && "Missing parameters");

  if (NewVReg)
    *NewVReg = DefMI->getOperand(SrcOpNum).getReg();
  return Opc;
}

bool AArch64InstrInfo::canInsertSelect(const MachineBasicBlock &MBB,
                                       ArrayRef<MachineOperand> Cond,
                                       unsigned TrueReg, unsigned FalseReg,
                                       int &CondCycles, int &TrueCycles,
                                       int &FalseCycles) const {
  // Check register classes.
  const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
  const TargetRegisterClass *RC =
      RI.getCommonSubClass(MRI.getRegClass(TrueReg), MRI.getRegClass(FalseReg));
  if (!RC)
    return false;

  // Expanding cbz/tbz requires an extra cycle of latency on the condition.
  unsigned ExtraCondLat = Cond.size() != 1;

  // GPRs are handled by csel.
  // FIXME: Fold in x+1, -x, and ~x when applicable.
  if (AArch64::GPR64allRegClass.hasSubClassEq(RC) ||
      AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
    // Single-cycle csel, csinc, csinv, and csneg.
    CondCycles = 1 + ExtraCondLat;
    TrueCycles = FalseCycles = 1;
    if (canFoldIntoCSel(MRI, TrueReg))
      TrueCycles = 0;
    else if (canFoldIntoCSel(MRI, FalseReg))
      FalseCycles = 0;
    return true;
  }

  // Scalar floating point is handled by fcsel.
  // FIXME: Form fabs, fmin, and fmax when applicable.
  if (AArch64::FPR64RegClass.hasSubClassEq(RC) ||
      AArch64::FPR32RegClass.hasSubClassEq(RC)) {
    CondCycles = 5 + ExtraCondLat;
    TrueCycles = FalseCycles = 2;
    return true;
  }

  // Can't do vectors.
  return false;
}

void AArch64InstrInfo::insertSelect(MachineBasicBlock &MBB,
                                    MachineBasicBlock::iterator I,
                                    const DebugLoc &DL, unsigned DstReg,
                                    ArrayRef<MachineOperand> Cond,
                                    unsigned TrueReg, unsigned FalseReg) const {
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();

  // Parse the condition code, see parseCondBranch() above.
  AArch64CC::CondCode CC;
  switch (Cond.size()) {
  default:
    llvm_unreachable("Unknown condition opcode in Cond");
  case 1: // b.cc
    CC = AArch64CC::CondCode(Cond[0].getImm());
    break;
  case 3: { // cbz/cbnz
    // We must insert a compare against 0.
    bool Is64Bit;
    switch (Cond[1].getImm()) {
    default:
      llvm_unreachable("Unknown branch opcode in Cond");
    case AArch64::CBZW:
      Is64Bit = false;
      CC = AArch64CC::EQ;
      break;
    case AArch64::CBZX:
      Is64Bit = true;
      CC = AArch64CC::EQ;
      break;
    case AArch64::CBNZW:
      Is64Bit = false;
      CC = AArch64CC::NE;
      break;
    case AArch64::CBNZX:
      Is64Bit = true;
      CC = AArch64CC::NE;
      break;
    }
    unsigned SrcReg = Cond[2].getReg();
    if (Is64Bit) {
      // cmp reg, #0 is actually subs xzr, reg, #0.
      MRI.constrainRegClass(SrcReg, &AArch64::GPR64spRegClass);
      BuildMI(MBB, I, DL, get(AArch64::SUBSXri), AArch64::XZR)
          .addReg(SrcReg)
          .addImm(0)
          .addImm(0);
    } else {
      MRI.constrainRegClass(SrcReg, &AArch64::GPR32spRegClass);
      BuildMI(MBB, I, DL, get(AArch64::SUBSWri), AArch64::WZR)
          .addReg(SrcReg)
          .addImm(0)
          .addImm(0);
    }
    break;
  }
  case 4: { // tbz/tbnz
    // We must insert a tst instruction.
    switch (Cond[1].getImm()) {
    default:
      llvm_unreachable("Unknown branch opcode in Cond");
    case AArch64::TBZW:
    case AArch64::TBZX:
      CC = AArch64CC::EQ;
      break;
    case AArch64::TBNZW:
    case AArch64::TBNZX:
      CC = AArch64CC::NE;
      break;
    }
    // cmp reg, #foo is actually ands xzr, reg, #1<<foo.
    if (Cond[1].getImm() == AArch64::TBZW || Cond[1].getImm() == AArch64::TBNZW)
      BuildMI(MBB, I, DL, get(AArch64::ANDSWri), AArch64::WZR)
          .addReg(Cond[2].getReg())
          .addImm(
              AArch64_AM::encodeLogicalImmediate(1ull << Cond[3].getImm(), 32));
    else
      BuildMI(MBB, I, DL, get(AArch64::ANDSXri), AArch64::XZR)
          .addReg(Cond[2].getReg())
          .addImm(
              AArch64_AM::encodeLogicalImmediate(1ull << Cond[3].getImm(), 64));
    break;
  }
  }

  unsigned Opc = 0;
  const TargetRegisterClass *RC = nullptr;
  bool TryFold = false;
  if (MRI.constrainRegClass(DstReg, &AArch64::GPR64RegClass)) {
    RC = &AArch64::GPR64RegClass;
    Opc = AArch64::CSELXr;
    TryFold = true;
  } else if (MRI.constrainRegClass(DstReg, &AArch64::GPR32RegClass)) {
    RC = &AArch64::GPR32RegClass;
    Opc = AArch64::CSELWr;
    TryFold = true;
  } else if (MRI.constrainRegClass(DstReg, &AArch64::FPR64RegClass)) {
    RC = &AArch64::FPR64RegClass;
    Opc = AArch64::FCSELDrrr;
  } else if (MRI.constrainRegClass(DstReg, &AArch64::FPR32RegClass)) {
    RC = &AArch64::FPR32RegClass;
    Opc = AArch64::FCSELSrrr;
  }
  assert(RC && "Unsupported regclass");

  // Try folding simple instructions into the csel.
  if (TryFold) {
    unsigned NewVReg = 0;
    unsigned FoldedOpc = canFoldIntoCSel(MRI, TrueReg, &NewVReg);
    if (FoldedOpc) {
      // The folded opcodes csinc, csinv, and csneg apply the operation to
      // FalseReg, so we need to invert the condition.
      CC = AArch64CC::getInvertedCondCode(CC);
      TrueReg = FalseReg;
    } else
      FoldedOpc = canFoldIntoCSel(MRI, FalseReg, &NewVReg);

    // Fold the operation. Leave any dead instructions for DCE to clean up.
    if (FoldedOpc) {
      FalseReg = NewVReg;
      Opc = FoldedOpc;
      // This extends the live range of NewVReg.
      MRI.clearKillFlags(NewVReg);
    }
  }

  // Pull all virtual registers into the appropriate class.
  MRI.constrainRegClass(TrueReg, RC);
  MRI.constrainRegClass(FalseReg, RC);

  // Insert the csel.
  BuildMI(MBB, I, DL, get(Opc), DstReg)
      .addReg(TrueReg)
      .addReg(FalseReg)
      .addImm(CC);
}

/// Returns true if a MOVi32imm or MOVi64imm can be expanded to an ORRxx.
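/// For example, "mov w0, #0xffff0000" can be materialized as a single
/// "orr w0, wzr, #0xffff0000", since 0xffff0000 is a valid logical immediate.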
static bool canBeExpandedToORR(const MachineInstr &MI, unsigned BitSize) {
  uint64_t Imm = MI.getOperand(1).getImm();
  uint64_t UImm = Imm << (64 - BitSize) >> (64 - BitSize);
  uint64_t Encoding;
  return AArch64_AM::processLogicalImmediate(UImm, BitSize, Encoding);
}

// FIXME: this implementation should be micro-architecture dependent, so a
// micro-architecture target hook should be introduced here in future.
bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr &MI) const {
  if (!Subtarget.hasCustomCheapAsMoveHandling())
    return MI.isAsCheapAsAMove();

  const unsigned Opcode = MI.getOpcode();

  // Firstly, check cases gated by features.

  if (Subtarget.hasZeroCycleZeroingFP()) {
    if (Opcode == AArch64::FMOVH0 ||
        Opcode == AArch64::FMOVS0 ||
        Opcode == AArch64::FMOVD0)
      return true;
  }

  if (Subtarget.hasZeroCycleZeroingGP()) {
    if (Opcode == TargetOpcode::COPY &&
        (MI.getOperand(1).getReg() == AArch64::WZR ||
         MI.getOperand(1).getReg() == AArch64::XZR))
      return true;
  }

  // Secondly, check cases specific to sub-targets.

  if (Subtarget.hasExynosCheapAsMoveHandling()) {
    if (isExynosCheapAsMove(MI))
      return true;

    return MI.isAsCheapAsAMove();
  }

  // Finally, check generic cases.

  switch (Opcode) {
  default:
    return false;

  // add/sub on register without shift
  case AArch64::ADDWri:
  case AArch64::ADDXri:
  case AArch64::SUBWri:
  case AArch64::SUBXri:
    return (MI.getOperand(3).getImm() == 0);

  // logical ops on immediate
  case AArch64::ANDWri:
  case AArch64::ANDXri:
  case AArch64::EORWri:
  case AArch64::EORXri:
  case AArch64::ORRWri:
  case AArch64::ORRXri:
    return true;

  // logical ops on register without shift
  case AArch64::ANDWrr:
  case AArch64::ANDXrr:
  case AArch64::BICWrr:
  case AArch64::BICXrr:
  case AArch64::EONWrr:
  case AArch64::EONXrr:
  case AArch64::EORWrr:
  case AArch64::EORXrr:
  case AArch64::ORNWrr:
  case AArch64::ORNXrr:
  case AArch64::ORRWrr:
  case AArch64::ORRXrr:
    return true;

  // If MOVi32imm or MOVi64imm can be expanded into ORRWri or
  // ORRXri, it is as cheap as MOV
  case AArch64::MOVi32imm:
    return canBeExpandedToORR(MI, 32);
  case AArch64::MOVi64imm:
    return canBeExpandedToORR(MI, 64);
  }

  llvm_unreachable("Unknown opcode to check as cheap as a move!");
}

bool AArch64InstrInfo::isFalkorShiftExtFast(const MachineInstr &MI) {
  switch (MI.getOpcode()) {
  default:
    return false;

  case AArch64::ADDWrs:
  case AArch64::ADDXrs:
  case AArch64::ADDSWrs:
  case AArch64::ADDSXrs: {
    unsigned Imm = MI.getOperand(3).getImm();
    unsigned ShiftVal = AArch64_AM::getShiftValue(Imm);
    if (ShiftVal == 0)
      return true;
    return AArch64_AM::getShiftType(Imm) == AArch64_AM::LSL && ShiftVal <= 5;
  }

  case AArch64::ADDWrx:
  case AArch64::ADDXrx:
  case AArch64::ADDXrx64:
  case AArch64::ADDSWrx:
  case AArch64::ADDSXrx:
  case AArch64::ADDSXrx64: {
    unsigned Imm = MI.getOperand(3).getImm();
    switch (AArch64_AM::getArithExtendType(Imm)) {
    default:
      return false;
    case AArch64_AM::UXTB:
    case AArch64_AM::UXTH:
    case AArch64_AM::UXTW:
    case AArch64_AM::UXTX:
      return AArch64_AM::getArithShiftValue(Imm) <= 4;
    }
  }

  case AArch64::SUBWrs:
  case AArch64::SUBSWrs: {
    unsigned Imm = MI.getOperand(3).getImm();
    unsigned ShiftVal = AArch64_AM::getShiftValue(Imm);
    return ShiftVal == 0 ||
           (AArch64_AM::getShiftType(Imm) == AArch64_AM::ASR && ShiftVal == 31);
  }

  case AArch64::SUBXrs:
  case AArch64::SUBSXrs: {
    unsigned Imm = MI.getOperand(3).getImm();
    unsigned ShiftVal = AArch64_AM::getShiftValue(Imm);
    return ShiftVal == 0 ||
           (AArch64_AM::getShiftType(Imm) == AArch64_AM::ASR && ShiftVal == 63);
  }

  case AArch64::SUBWrx:
  case AArch64::SUBXrx:
  case AArch64::SUBXrx64:
  case AArch64::SUBSWrx:
  case AArch64::SUBSXrx:
  case AArch64::SUBSXrx64: {
    unsigned Imm = MI.getOperand(3).getImm();
    switch (AArch64_AM::getArithExtendType(Imm)) {
    default:
      return false;
    case AArch64_AM::UXTB:
    case AArch64_AM::UXTH:
    case AArch64_AM::UXTW:
    case AArch64_AM::UXTX:
      return AArch64_AM::getArithShiftValue(Imm) == 0;
    }
  }

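  // For the register-offset loads and stores below, operand 3 is a flag that
  // is set when the offset register is sign-extended; judging from the
  // IsSigned check in the shared case body, only the zero-extended forms are
  // considered fast here.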
  case AArch64::LDRBBroW:
  case AArch64::LDRBBroX:
  case AArch64::LDRBroW:
  case AArch64::LDRBroX:
  case AArch64::LDRDroW:
  case AArch64::LDRDroX:
  case AArch64::LDRHHroW:
  case AArch64::LDRHHroX:
  case AArch64::LDRHroW:
  case AArch64::LDRHroX:
  case AArch64::LDRQroW:
  case AArch64::LDRQroX:
  case AArch64::LDRSBWroW:
  case AArch64::LDRSBWroX:
  case AArch64::LDRSBXroW:
  case AArch64::LDRSBXroX:
  case AArch64::LDRSHWroW:
  case AArch64::LDRSHWroX:
  case AArch64::LDRSHXroW:
  case AArch64::LDRSHXroX:
  case AArch64::LDRSWroW:
  case AArch64::LDRSWroX:
  case AArch64::LDRSroW:
  case AArch64::LDRSroX:
  case AArch64::LDRWroW:
  case AArch64::LDRWroX:
  case AArch64::LDRXroW:
  case AArch64::LDRXroX:
  case AArch64::PRFMroW:
  case AArch64::PRFMroX:
  case AArch64::STRBBroW:
  case AArch64::STRBBroX:
  case AArch64::STRBroW:
  case AArch64::STRBroX:
  case AArch64::STRDroW:
  case AArch64::STRDroX:
  case AArch64::STRHHroW:
  case AArch64::STRHHroX:
  case AArch64::STRHroW:
  case AArch64::STRHroX:
  case AArch64::STRQroW:
  case AArch64::STRQroX:
  case AArch64::STRSroW:
  case AArch64::STRSroX:
  case AArch64::STRWroW:
  case AArch64::STRWroX:
  case AArch64::STRXroW:
  case AArch64::STRXroX: {
    unsigned IsSigned = MI.getOperand(3).getImm();
    return !IsSigned;
  }
  }
}

bool AArch64InstrInfo::isSEHInstruction(const MachineInstr &MI) {
  unsigned Opc = MI.getOpcode();
  switch (Opc) {
  default:
    return false;
  case AArch64::SEH_StackAlloc:
  case AArch64::SEH_SaveFPLR:
  case AArch64::SEH_SaveFPLR_X:
  case AArch64::SEH_SaveReg:
  case AArch64::SEH_SaveReg_X:
  case AArch64::SEH_SaveRegP:
  case AArch64::SEH_SaveRegP_X:
  case AArch64::SEH_SaveFReg:
  case AArch64::SEH_SaveFReg_X:
  case AArch64::SEH_SaveFRegP:
  case AArch64::SEH_SaveFRegP_X:
  case AArch64::SEH_SetFP:
  case AArch64::SEH_AddFP:
  case AArch64::SEH_Nop:
  case AArch64::SEH_PrologEnd:
  case AArch64::SEH_EpilogStart:
  case AArch64::SEH_EpilogEnd:
    return true;
  }
}

bool AArch64InstrInfo::isCoalescableExtInstr(const MachineInstr &MI,
                                             unsigned &SrcReg, unsigned &DstReg,
                                             unsigned &SubIdx) const {
  switch (MI.getOpcode()) {
  default:
    return false;
  case AArch64::SBFMXri: // aka sxtw
  case AArch64::UBFMXri: // aka uxtw
    // Check for the 32 -> 64 bit extension case, these instructions can do
    // much more.
    if (MI.getOperand(2).getImm() != 0 || MI.getOperand(3).getImm() != 31)
      return false;
    // This is a signed or unsigned 32 -> 64 bit extension.
    SrcReg = MI.getOperand(1).getReg();
    DstReg = MI.getOperand(0).getReg();
    SubIdx = AArch64::sub_32;
    return true;
  }
}

bool AArch64InstrInfo::areMemAccessesTriviallyDisjoint(
    const MachineInstr &MIa, const MachineInstr &MIb, AliasAnalysis *AA) const {
  const TargetRegisterInfo *TRI = &getRegisterInfo();
  const MachineOperand *BaseOpA = nullptr, *BaseOpB = nullptr;
  int64_t OffsetA = 0, OffsetB = 0;
  unsigned WidthA = 0, WidthB = 0;

  assert(MIa.mayLoadOrStore() && "MIa must be a load or store.");
  assert(MIb.mayLoadOrStore() && "MIb must be a load or store.");

  if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects() ||
      MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())
    return false;

  // Retrieve the base register, offset from the base, and width. Width is the
  // size of memory that is being loaded/stored (e.g. 1, 2, 4, 8). If the bases
  // are identical, and the offset of the lower memory access plus its width
  // does not overlap the offset of the higher memory access, then the memory
  // accesses are disjoint.
  if (getMemOperandWithOffsetWidth(MIa, BaseOpA, OffsetA, WidthA, TRI) &&
      getMemOperandWithOffsetWidth(MIb, BaseOpB, OffsetB, WidthB, TRI)) {
    if (BaseOpA->isIdenticalTo(*BaseOpB)) {
      int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
      int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
      int LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
      if (LowOffset + LowWidth <= HighOffset)
        return true;
    }
  }
  return false;
}

bool AArch64InstrInfo::isSchedulingBoundary(const MachineInstr &MI,
                                            const MachineBasicBlock *MBB,
                                            const MachineFunction &MF) const {
  if (TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF))
    return true;
  switch (MI.getOpcode()) {
  case AArch64::HINT:
    // CSDB hints are scheduling barriers.
    if (MI.getOperand(0).getImm() == 0x14)
      return true;
    break;
  case AArch64::DSB:
  case AArch64::ISB:
    // DSB and ISB also are scheduling barriers.
    return true;
  default:;
  }
  return isSEHInstruction(MI);
}

/// analyzeCompare - For a comparison instruction, return the source registers
/// in SrcReg and SrcReg2, and the value it compares against in CmpValue.
/// Return true if the comparison instruction can be analyzed.
bool AArch64InstrInfo::analyzeCompare(const MachineInstr &MI, unsigned &SrcReg,
                                      unsigned &SrcReg2, int &CmpMask,
                                      int &CmpValue) const {
  // The first operand can be a frame index where we'd normally expect a
  // register.
  assert(MI.getNumOperands() >= 2 && "All AArch64 cmps should have 2 operands");
  if (!MI.getOperand(1).isReg())
    return false;

  switch (MI.getOpcode()) {
  default:
    break;
  case AArch64::SUBSWrr:
  case AArch64::SUBSWrs:
  case AArch64::SUBSWrx:
  case AArch64::SUBSXrr:
  case AArch64::SUBSXrs:
  case AArch64::SUBSXrx:
  case AArch64::ADDSWrr:
  case AArch64::ADDSWrs:
  case AArch64::ADDSWrx:
  case AArch64::ADDSXrr:
  case AArch64::ADDSXrs:
  case AArch64::ADDSXrx:
    // Replace SUBSWrr with SUBWrr if NZCV is not used.
    SrcReg = MI.getOperand(1).getReg();
    SrcReg2 = MI.getOperand(2).getReg();
    CmpMask = ~0;
    CmpValue = 0;
    return true;
  case AArch64::SUBSWri:
  case AArch64::ADDSWri:
  case AArch64::SUBSXri:
  case AArch64::ADDSXri:
    SrcReg = MI.getOperand(1).getReg();
    SrcReg2 = 0;
    CmpMask = ~0;
    // FIXME: In order to convert CmpValue to 0 or 1
    CmpValue = MI.getOperand(2).getImm() != 0;
    return true;
  case AArch64::ANDSWri:
  case AArch64::ANDSXri:
    // ANDS does not use the same encoding scheme as the other xxxS
    // instructions.
    SrcReg = MI.getOperand(1).getReg();
    SrcReg2 = 0;
    CmpMask = ~0;
    // FIXME: The return value type of decodeLogicalImmediate is uint64_t,
    // while the type of CmpValue is int. When converting uint64_t to int,
    // the high 32 bits of uint64_t will be lost.
    // In fact it causes a bug in spec2006-483.xalancbmk.
    // CmpValue is only used to compare with zero in OptimizeCompareInstr.
    CmpValue = AArch64_AM::decodeLogicalImmediate(
                   MI.getOperand(2).getImm(),
                   MI.getOpcode() == AArch64::ANDSWri ? 32 : 64) != 0;
    return true;
  }

  return false;
}

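/// Rewrite the operands of \p Instr so that each register operand satisfies
/// the register-class constraint of its (possibly new) opcode. Returns false
/// if some operand cannot be constrained; callers that change an
/// instruction's opcode in place use this to validate the rewrite.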
static bool UpdateOperandRegClass(MachineInstr &Instr) {
  MachineBasicBlock *MBB = Instr.getParent();
  assert(MBB && "Can't get MachineBasicBlock here");
  MachineFunction *MF = MBB->getParent();
  assert(MF && "Can't get MachineFunction here");
  const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
  const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
  MachineRegisterInfo *MRI = &MF->getRegInfo();

  for (unsigned OpIdx = 0, EndIdx = Instr.getNumOperands(); OpIdx < EndIdx;
       ++OpIdx) {
    MachineOperand &MO = Instr.getOperand(OpIdx);
    const TargetRegisterClass *OpRegCstraints =
        Instr.getRegClassConstraint(OpIdx, TII, TRI);

    // If there's no constraint, there's nothing to do.
    if (!OpRegCstraints)
      continue;
    // If the operand is a frame index, there's nothing to do here.
    // A frame index operand will resolve correctly during PEI.
    if (MO.isFI())
      continue;

    assert(MO.isReg() &&
           "Operand has register constraints without being a register!");

    unsigned Reg = MO.getReg();
    if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
      if (!OpRegCstraints->contains(Reg))
        return false;
    } else if (!OpRegCstraints->hasSubClassEq(MRI->getRegClass(Reg)) &&
               !MRI->constrainRegClass(Reg, OpRegCstraints))
      return false;
  }

  return true;
}

/// Return the opcode that does not set flags when possible - otherwise
/// return the original opcode. The caller is responsible to do the actual
/// substitution and legality checking.
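///
/// In several of the non-flag-setting encodings (e.g. the add/sub immediate
/// forms), register number 31 means WSP/SP rather than WZR/XZR, so an
/// instruction such as "ADDSWri wzr, ..." cannot simply be rewritten to
/// ADDWri without redirecting its destination to the stack pointer. The
/// MIDefinesZeroReg check below keeps the flag-setting opcode in that case.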
static unsigned convertToNonFlagSettingOpc(const MachineInstr &MI) {
  // Don't convert all compare instructions, because for some the zero register
  // encoding becomes the sp register.
  bool MIDefinesZeroReg = false;
  if (MI.definesRegister(AArch64::WZR) || MI.definesRegister(AArch64::XZR))
    MIDefinesZeroReg = true;

  switch (MI.getOpcode()) {
  default:
    return MI.getOpcode();
  case AArch64::ADDSWrr:
    return AArch64::ADDWrr;
  case AArch64::ADDSWri:
    return MIDefinesZeroReg ? AArch64::ADDSWri : AArch64::ADDWri;
  case AArch64::ADDSWrs:
    return MIDefinesZeroReg ? AArch64::ADDSWrs : AArch64::ADDWrs;
  case AArch64::ADDSWrx:
    return AArch64::ADDWrx;
  case AArch64::ADDSXrr:
    return AArch64::ADDXrr;
  case AArch64::ADDSXri:
    return MIDefinesZeroReg ? AArch64::ADDSXri : AArch64::ADDXri;
  case AArch64::ADDSXrs:
    return MIDefinesZeroReg ? AArch64::ADDSXrs : AArch64::ADDXrs;
  case AArch64::ADDSXrx:
    return AArch64::ADDXrx;
  case AArch64::SUBSWrr:
    return AArch64::SUBWrr;
  case AArch64::SUBSWri:
    return MIDefinesZeroReg ? AArch64::SUBSWri : AArch64::SUBWri;
  case AArch64::SUBSWrs:
    return MIDefinesZeroReg ? AArch64::SUBSWrs : AArch64::SUBWrs;
  case AArch64::SUBSWrx:
    return AArch64::SUBWrx;
  case AArch64::SUBSXrr:
    return AArch64::SUBXrr;
  case AArch64::SUBSXri:
    return MIDefinesZeroReg ? AArch64::SUBSXri : AArch64::SUBXri;
  case AArch64::SUBSXrs:
    return MIDefinesZeroReg ? AArch64::SUBSXrs : AArch64::SUBXrs;
  case AArch64::SUBSXrx:
    return AArch64::SUBXrx;
  }
}

enum AccessKind { AK_Write = 0x01, AK_Read = 0x10, AK_All = 0x11 };

/// True when condition flags are accessed (either by writing or reading)
/// on the instruction trace starting at From and ending at To.
///
/// Note: If From and To are from different blocks it's assumed CC are accessed
/// on the path.
static bool areCFlagsAccessedBetweenInstrs(
    MachineBasicBlock::iterator From, MachineBasicBlock::iterator To,
    const TargetRegisterInfo *TRI, const AccessKind AccessToCheck = AK_All) {
  // Early exit if To is at the beginning of the BB.
  if (To == To->getParent()->begin())
    return true;

  // Check whether the instructions are in the same basic block.
  // If not, assume the condition flags might get modified somewhere.
  if (To->getParent() != From->getParent())
    return true;

  // From must be above To.
  assert(std::find_if(++To.getReverse(), To->getParent()->rend(),
                      [From](MachineInstr &MI) {
                        return MI.getIterator() == From;
                      }) != To->getParent()->rend());

  // We iterate backward starting at \p To until we hit \p From.
  for (--To; To != From; --To) {
    const MachineInstr &Instr = *To;

    if (((AccessToCheck & AK_Write) &&
         Instr.modifiesRegister(AArch64::NZCV, TRI)) ||
        ((AccessToCheck & AK_Read) && Instr.readsRegister(AArch64::NZCV, TRI)))
      return true;
  }
  return false;
}

/// Try to optimize a compare instruction. A compare instruction is an
/// instruction which produces AArch64::NZCV. It is truly a compare
/// instruction when there are no uses of its destination register.
///
/// The following steps are tried in order:
/// 1. Convert CmpInstr into an unconditional version.
/// 2. Remove CmpInstr if above there is an instruction producing a needed
///    condition code or an instruction which can be converted into such an
///    instruction.
///    Only comparison with zero is supported.
bool AArch64InstrInfo::optimizeCompareInstr(
    MachineInstr &CmpInstr, unsigned SrcReg, unsigned SrcReg2, int CmpMask,
    int CmpValue, const MachineRegisterInfo *MRI) const {
  assert(CmpInstr.getParent());
  assert(MRI);

  // Replace SUBSWrr with SUBWrr if NZCV is not used.
  int DeadNZCVIdx = CmpInstr.findRegisterDefOperandIdx(AArch64::NZCV, true);
  if (DeadNZCVIdx != -1) {
    if (CmpInstr.definesRegister(AArch64::WZR) ||
        CmpInstr.definesRegister(AArch64::XZR)) {
      CmpInstr.eraseFromParent();
      return true;
    }
    unsigned Opc = CmpInstr.getOpcode();
    unsigned NewOpc = convertToNonFlagSettingOpc(CmpInstr);
    if (NewOpc == Opc)
      return false;
    const MCInstrDesc &MCID = get(NewOpc);
    CmpInstr.setDesc(MCID);
    CmpInstr.RemoveOperand(DeadNZCVIdx);
    bool succeeded = UpdateOperandRegClass(CmpInstr);
    (void)succeeded;
    assert(succeeded && "Some operands reg class are incompatible!");
    return true;
  }

  // Continue only if we have a "ri" where immediate is zero.
  // FIXME: CmpValue has already been converted to 0 or 1 in analyzeCompare
  // function.
  assert((CmpValue == 0 || CmpValue == 1) && "CmpValue must be 0 or 1!");
  if (CmpValue != 0 || SrcReg2 != 0)
    return false;

  // CmpInstr is a Compare instruction if destination register is not used.
  if (!MRI->use_nodbg_empty(CmpInstr.getOperand(0).getReg()))
    return false;

  return substituteCmpToZero(CmpInstr, SrcReg, MRI);
}

/// Get the opcode of the S (flag-setting) version of Instr.
/// If Instr is already an S version, its opcode is returned.
/// AArch64::INSTRUCTION_LIST_END is returned if Instr does not have an S
/// version or we are not interested in it.
static unsigned sForm(MachineInstr &Instr) {
  switch (Instr.getOpcode()) {
  default:
    return AArch64::INSTRUCTION_LIST_END;

  case AArch64::ADDSWrr:
  case AArch64::ADDSWri:
  case AArch64::ADDSXrr:
  case AArch64::ADDSXri:
  case AArch64::SUBSWrr:
  case AArch64::SUBSWri:
  case AArch64::SUBSXrr:
  case AArch64::SUBSXri:
    return Instr.getOpcode();

  case AArch64::ADDWrr:
    return AArch64::ADDSWrr;
  case AArch64::ADDWri:
    return AArch64::ADDSWri;
  case AArch64::ADDXrr:
    return AArch64::ADDSXrr;
  case AArch64::ADDXri:
    return AArch64::ADDSXri;
  case AArch64::ADCWr:
    return AArch64::ADCSWr;
  case AArch64::ADCXr:
    return AArch64::ADCSXr;
  case AArch64::SUBWrr:
    return AArch64::SUBSWrr;
  case AArch64::SUBWri:
    return AArch64::SUBSWri;
  case AArch64::SUBXrr:
    return AArch64::SUBSXrr;
  case AArch64::SUBXri:
    return AArch64::SUBSXri;
  case AArch64::SBCWr:
    return AArch64::SBCSWr;
  case AArch64::SBCXr:
    return AArch64::SBCSXr;
  case AArch64::ANDWri:
    return AArch64::ANDSWri;
  case AArch64::ANDXri:
    return AArch64::ANDSXri;
  }
}

/// Check if AArch64::NZCV should be alive in successors of MBB.
static bool areCFlagsAliveInSuccessors(MachineBasicBlock *MBB) {
  for (auto *BB : MBB->successors())
    if (BB->isLiveIn(AArch64::NZCV))
      return true;
  return false;
}

namespace {

struct UsedNZCV {
  bool N = false;
  bool Z = false;
  bool C = false;
  bool V = false;

  UsedNZCV() = default;

  UsedNZCV &operator|=(const UsedNZCV &UsedFlags) {
    this->N |= UsedFlags.N;
    this->Z |= UsedFlags.Z;
    this->C |= UsedFlags.C;
    this->V |= UsedFlags.V;
    return *this;
  }
};

} // end anonymous namespace

/// Find a condition code used by the instruction.
/// Returns AArch64CC::Invalid if either the instruction does not use condition
/// codes or we don't optimize CmpInstr in the presence of such instructions.
static AArch64CC::CondCode findCondCodeUsedByInstr(const MachineInstr &Instr) {
  switch (Instr.getOpcode()) {
  default:
    return AArch64CC::Invalid;

  case AArch64::Bcc: {
    int Idx = Instr.findRegisterUseOperandIdx(AArch64::NZCV);
    assert(Idx >= 2);
    return static_cast<AArch64CC::CondCode>(Instr.getOperand(Idx - 2).getImm());
  }

  case AArch64::CSINVWr:
  case AArch64::CSINVXr:
  case AArch64::CSINCWr:
  case AArch64::CSINCXr:
  case AArch64::CSELWr:
  case AArch64::CSELXr:
  case AArch64::CSNEGWr:
  case AArch64::CSNEGXr:
  case AArch64::FCSELSrrr:
  case AArch64::FCSELDrrr: {
    int Idx = Instr.findRegisterUseOperandIdx(AArch64::NZCV);
    assert(Idx >= 1);
    return static_cast<AArch64CC::CondCode>(Instr.getOperand(Idx - 1).getImm());
  }
  }
}

static UsedNZCV getUsedNZCV(AArch64CC::CondCode CC) {
  assert(CC != AArch64CC::Invalid);
  UsedNZCV UsedFlags;
  switch (CC) {
  default:
    break;

  case AArch64CC::EQ: // Z set
  case AArch64CC::NE: // Z clear
    UsedFlags.Z = true;
    break;

  case AArch64CC::HI: // Z clear and C set
  case AArch64CC::LS: // Z set or C clear
    UsedFlags.Z = true;
    LLVM_FALLTHROUGH;
  case AArch64CC::HS: // C set
  case AArch64CC::LO: // C clear
    UsedFlags.C = true;
    break;

  case AArch64CC::MI: // N set
  case AArch64CC::PL: // N clear
    UsedFlags.N = true;
    break;

  case AArch64CC::VS: // V set
  case AArch64CC::VC: // V clear
    UsedFlags.V = true;
    break;

  case AArch64CC::GT: // Z clear, N and V the same
  case AArch64CC::LE: // Z set, N and V differ
    UsedFlags.Z = true;
    LLVM_FALLTHROUGH;
  case AArch64CC::GE: // N and V the same
  case AArch64CC::LT: // N and V differ
    UsedFlags.N = true;
    UsedFlags.V = true;
    break;
  }
  return UsedFlags;
}

static bool isADDSRegImm(unsigned Opcode) {
  return Opcode == AArch64::ADDSWri || Opcode == AArch64::ADDSXri;
}

static bool isSUBSRegImm(unsigned Opcode) {
  return Opcode == AArch64::SUBSWri || Opcode == AArch64::SUBSXri;
}

/// Check if CmpInstr can be substituted by MI.
///
/// CmpInstr can be substituted:
/// - CmpInstr is either 'ADDS %vreg, 0' or 'SUBS %vreg, 0'
/// - and, MI and CmpInstr are from the same MachineBB
/// - and, condition flags are not alive in successors of the CmpInstr parent
/// - and, if MI opcode is the S form there must be no defs of flags between
///        MI and CmpInstr,
///        or if MI opcode is not the S form there must be neither defs of
///        flags nor uses of flags between MI and CmpInstr
/// - and, C/V flags are not used after CmpInstr
static bool canInstrSubstituteCmpInstr(MachineInstr *MI, MachineInstr *CmpInstr,
                                       const TargetRegisterInfo *TRI) {
  assert(MI);
  assert(sForm(*MI) != AArch64::INSTRUCTION_LIST_END);
  assert(CmpInstr);

  const unsigned CmpOpcode = CmpInstr->getOpcode();
  if (!isADDSRegImm(CmpOpcode) && !isSUBSRegImm(CmpOpcode))
    return false;

  if (MI->getParent() != CmpInstr->getParent())
    return false;

  if (areCFlagsAliveInSuccessors(CmpInstr->getParent()))
    return false;

  AccessKind AccessToCheck = AK_Write;
  if (sForm(*MI) != MI->getOpcode())
    AccessToCheck = AK_All;
  if (areCFlagsAccessedBetweenInstrs(MI, CmpInstr, TRI, AccessToCheck))
    return false;

  UsedNZCV NZCVUsedAfterCmp;
  for (auto I = std::next(CmpInstr->getIterator()),
            E = CmpInstr->getParent()->instr_end();
       I != E; ++I) {
    const MachineInstr &Instr = *I;
    if (Instr.readsRegister(AArch64::NZCV, TRI)) {
      AArch64CC::CondCode CC = findCondCodeUsedByInstr(Instr);
      if (CC == AArch64CC::Invalid) // Unsupported conditional instruction
        return false;
      NZCVUsedAfterCmp |= getUsedNZCV(CC);
    }

    if (Instr.modifiesRegister(AArch64::NZCV, TRI))
      break;
  }

  return !NZCVUsedAfterCmp.C && !NZCVUsedAfterCmp.V;
}

/// Substitute an instruction comparing to zero with another instruction
/// which produces needed condition flags.
///
/// Return true on success.
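///
/// Schematically:
///   %w8 = ADDWrr %w1, %w2
///   SUBSWri %w8, 0, 0        ; cmp w8, #0, with a dead destination
/// becomes
///   %w8 = ADDSWrr %w1, %w2
/// with the compare removed, provided no instruction between the two touches
/// NZCV, the flags are not live into a successor block, and only the N and Z
/// flags are read afterwards (see canInstrSubstituteCmpInstr above).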
bool AArch64InstrInfo::substituteCmpToZero(
    MachineInstr &CmpInstr, unsigned SrcReg,
    const MachineRegisterInfo *MRI) const {
  assert(MRI);
  // Get the unique definition of SrcReg.
  MachineInstr *MI = MRI->getUniqueVRegDef(SrcReg);
  if (!MI)
    return false;

  const TargetRegisterInfo *TRI = &getRegisterInfo();

  unsigned NewOpc = sForm(*MI);
  if (NewOpc == AArch64::INSTRUCTION_LIST_END)
    return false;

  if (!canInstrSubstituteCmpInstr(MI, &CmpInstr, TRI))
    return false;

  // Update the instruction to set NZCV.
  MI->setDesc(get(NewOpc));
  CmpInstr.eraseFromParent();
  bool succeeded = UpdateOperandRegClass(*MI);
  (void)succeeded;
  assert(succeeded && "Some operands reg class are incompatible!");
  MI->addRegisterDefined(AArch64::NZCV, TRI);
  return true;
}

bool AArch64InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
  if (MI.getOpcode() != TargetOpcode::LOAD_STACK_GUARD &&
      MI.getOpcode() != AArch64::CATCHRET)
    return false;

  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();

  if (MI.getOpcode() == AArch64::CATCHRET) {
    // Skip to the first instruction before the epilog.
    const TargetInstrInfo *TII =
        MBB.getParent()->getSubtarget().getInstrInfo();
    MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB();
    auto MBBI = MachineBasicBlock::iterator(MI);
    MachineBasicBlock::iterator FirstEpilogSEH = std::prev(MBBI);
    while (FirstEpilogSEH->getFlag(MachineInstr::FrameDestroy) &&
           FirstEpilogSEH != MBB.begin())
      FirstEpilogSEH = std::prev(FirstEpilogSEH);
    if (FirstEpilogSEH != MBB.begin())
      FirstEpilogSEH = std::next(FirstEpilogSEH);
    BuildMI(MBB, FirstEpilogSEH, DL, TII->get(AArch64::ADRP))
        .addReg(AArch64::X0, RegState::Define)
        .addMBB(TargetMBB);
    BuildMI(MBB, FirstEpilogSEH, DL, TII->get(AArch64::ADDXri))
        .addReg(AArch64::X0, RegState::Define)
        .addReg(AArch64::X0)
        .addMBB(TargetMBB)
        .addImm(0);
    return true;
  }

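  // The remainder expands LOAD_STACK_GUARD: form the address of the
  // stack-guard global in a way appropriate to the code model (GOT load,
  // MOVZ/MOVK literal sequence, ADR, or ADRP + :lo12:) and, where needed,
  // load the guard value through it.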
  unsigned Reg = MI.getOperand(0).getReg();
  const GlobalValue *GV =
      cast<GlobalValue>((*MI.memoperands_begin())->getValue());
  const TargetMachine &TM = MBB.getParent()->getTarget();
  unsigned char OpFlags = Subtarget.ClassifyGlobalReference(GV, TM);
  const unsigned char MO_NC = AArch64II::MO_NC;

  if ((OpFlags & AArch64II::MO_GOT) != 0) {
    BuildMI(MBB, MI, DL, get(AArch64::LOADgot), Reg)
        .addGlobalAddress(GV, 0, OpFlags);
    BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
        .addReg(Reg, RegState::Kill)
        .addImm(0)
        .addMemOperand(*MI.memoperands_begin());
  } else if (TM.getCodeModel() == CodeModel::Large) {
    BuildMI(MBB, MI, DL, get(AArch64::MOVZXi), Reg)
        .addGlobalAddress(GV, 0, AArch64II::MO_G0 | MO_NC)
        .addImm(0);
    BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg)
        .addReg(Reg, RegState::Kill)
        .addGlobalAddress(GV, 0, AArch64II::MO_G1 | MO_NC)
        .addImm(16);
    BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg)
        .addReg(Reg, RegState::Kill)
        .addGlobalAddress(GV, 0, AArch64II::MO_G2 | MO_NC)
        .addImm(32);
    BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg)
        .addReg(Reg, RegState::Kill)
        .addGlobalAddress(GV, 0, AArch64II::MO_G3)
        .addImm(48);
    BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
        .addReg(Reg, RegState::Kill)
        .addImm(0)
        .addMemOperand(*MI.memoperands_begin());
  } else if (TM.getCodeModel() == CodeModel::Tiny) {
    BuildMI(MBB, MI, DL, get(AArch64::ADR), Reg)
        .addGlobalAddress(GV, 0, OpFlags);
  } else {
    BuildMI(MBB, MI, DL, get(AArch64::ADRP), Reg)
        .addGlobalAddress(GV, 0, OpFlags | AArch64II::MO_PAGE);
    unsigned char LoFlags = OpFlags | AArch64II::MO_PAGEOFF | MO_NC;
    BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
        .addReg(Reg, RegState::Kill)
        .addGlobalAddress(GV, 0, LoFlags)
        .addMemOperand(*MI.memoperands_begin());
  }

  MBB.erase(MI);

  return true;
}

// Return true if this instruction simply sets its single destination register
// to zero. This is equivalent to a register rename of the zero-register.
bool AArch64InstrInfo::isGPRZero(const MachineInstr &MI) {
  switch (MI.getOpcode()) {
  default:
    break;
  case AArch64::MOVZWi:
  case AArch64::MOVZXi: // movz Rd, #0 (LSL #0)
    if (MI.getOperand(1).isImm() && MI.getOperand(1).getImm() == 0) {
      assert(MI.getDesc().getNumOperands() == 3 &&
             MI.getOperand(2).getImm() == 0 && "invalid MOVZi operands");
      return true;
    }
    break;
  case AArch64::ANDWri: // and Rd, Rzr, #imm
    return MI.getOperand(1).getReg() == AArch64::WZR;
  case AArch64::ANDXri:
    return MI.getOperand(1).getReg() == AArch64::XZR;
  case TargetOpcode::COPY:
    return MI.getOperand(1).getReg() == AArch64::WZR;
  }
  return false;
}

// Return true if this instruction simply renames a general register without
// modifying bits.
bool AArch64InstrInfo::isGPRCopy(const MachineInstr &MI) {
  switch (MI.getOpcode()) {
  default:
    break;
  case TargetOpcode::COPY: {
    // GPR32 copies will be lowered to ORRXrs
    unsigned DstReg = MI.getOperand(0).getReg();
    return (AArch64::GPR32RegClass.contains(DstReg) ||
            AArch64::GPR64RegClass.contains(DstReg));
  }
  case AArch64::ORRXrs: // orr Xd, Xzr, Xm (LSL #0)
    if (MI.getOperand(1).getReg() == AArch64::XZR) {
      assert(MI.getDesc().getNumOperands() == 4 &&
             MI.getOperand(3).getImm() == 0 && "invalid ORRrs operands");
      return true;
    }
    break;
  case AArch64::ADDXri: // add Xd, Xn, #0 (LSL #0)
    if (MI.getOperand(2).getImm() == 0) {
      assert(MI.getDesc().getNumOperands() == 4 &&
             MI.getOperand(3).getImm() == 0 && "invalid ADDXri operands");
      return true;
    }
    break;
  }
  return false;
}

// Return true if this instruction simply renames a floating-point register
// without modifying bits.
bool AArch64InstrInfo::isFPRCopy(const MachineInstr &MI) {
  switch (MI.getOpcode()) {
  default:
    break;
  case TargetOpcode::COPY: {
    // FPR64 copies will be lowered to ORR.16b
    unsigned DstReg = MI.getOperand(0).getReg();
    return (AArch64::FPR64RegClass.contains(DstReg) ||
            AArch64::FPR128RegClass.contains(DstReg));
  }
  case AArch64::ORRv16i8:
    if (MI.getOperand(1).getReg() == MI.getOperand(2).getReg()) {
      assert(MI.getDesc().getNumOperands() == 3 && MI.getOperand(0).isReg() &&
             "invalid ORRv16i8 operands");
      return true;
    }
    break;
  }
  return false;
}

isLoadFromStackSlot(const MachineInstr & MI,int & FrameIndex) const1630 unsigned AArch64InstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
1631 int &FrameIndex) const {
1632 switch (MI.getOpcode()) {
1633 default:
1634 break;
1635 case AArch64::LDRWui:
1636 case AArch64::LDRXui:
1637 case AArch64::LDRBui:
1638 case AArch64::LDRHui:
1639 case AArch64::LDRSui:
1640 case AArch64::LDRDui:
1641 case AArch64::LDRQui:
1642 if (MI.getOperand(0).getSubReg() == 0 && MI.getOperand(1).isFI() &&
1643 MI.getOperand(2).isImm() && MI.getOperand(2).getImm() == 0) {
1644 FrameIndex = MI.getOperand(1).getIndex();
1645 return MI.getOperand(0).getReg();
1646 }
1647 break;
1648 }
1649
1650 return 0;
1651 }
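// Typical caller sketch (assumed usage, not code from this file):
//   int FI;
//   if (unsigned Reg = TII.isLoadFromStackSlot(MI, FI)) {
//     // MI is a direct reload of Reg from stack slot FI (zero offset,
//     // no subregister), so the access can be tracked or eliminated.
//   }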
1652
1653 unsigned AArch64InstrInfo::isStoreToStackSlot(const MachineInstr &MI,
1654 int &FrameIndex) const {
1655 switch (MI.getOpcode()) {
1656 default:
1657 break;
1658 case AArch64::STRWui:
1659 case AArch64::STRXui:
1660 case AArch64::STRBui:
1661 case AArch64::STRHui:
1662 case AArch64::STRSui:
1663 case AArch64::STRDui:
1664 case AArch64::STRQui:
1665 if (MI.getOperand(0).getSubReg() == 0 && MI.getOperand(1).isFI() &&
1666 MI.getOperand(2).isImm() && MI.getOperand(2).getImm() == 0) {
1667 FrameIndex = MI.getOperand(1).getIndex();
1668 return MI.getOperand(0).getReg();
1669 }
1670 break;
1671 }
1672 return 0;
1673 }
1674
1675 /// Check all MachineMemOperands for a hint to suppress pairing.
1676 bool AArch64InstrInfo::isLdStPairSuppressed(const MachineInstr &MI) {
1677 return llvm::any_of(MI.memoperands(), [](MachineMemOperand *MMO) {
1678 return MMO->getFlags() & MOSuppressPair;
1679 });
1680 }
1681
1682 /// Set a flag on the first MachineMemOperand to suppress pairing.
1683 void AArch64InstrInfo::suppressLdStPair(MachineInstr &MI) {
1684 if (MI.memoperands_empty())
1685 return;
1686 (*MI.memoperands_begin())->setFlags(MOSuppressPair);
1687 }
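// Usage sketch (hedged; the real driver is the AArch64StorePairSuppress
// pass). Assuming MI carries at least one memory operand:
//   AArch64InstrInfo::suppressLdStPair(MI);
//   assert(AArch64InstrInfo::isLdStPairSuppressed(MI));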
1688
1689 /// Check all MachineMemOperands for a hint that the load/store is strided.
1690 bool AArch64InstrInfo::isStridedAccess(const MachineInstr &MI) {
1691 return llvm::any_of(MI.memoperands(), [](MachineMemOperand *MMO) {
1692 return MMO->getFlags() & MOStridedAccess;
1693 });
1694 }
1695
1696 bool AArch64InstrInfo::isUnscaledLdSt(unsigned Opc) {
1697 switch (Opc) {
1698 default:
1699 return false;
1700 case AArch64::STURSi:
1701 case AArch64::STURDi:
1702 case AArch64::STURQi:
1703 case AArch64::STURBBi:
1704 case AArch64::STURHHi:
1705 case AArch64::STURWi:
1706 case AArch64::STURXi:
1707 case AArch64::LDURSi:
1708 case AArch64::LDURDi:
1709 case AArch64::LDURQi:
1710 case AArch64::LDURWi:
1711 case AArch64::LDURXi:
1712 case AArch64::LDURSWi:
1713 case AArch64::LDURHHi:
1714 case AArch64::LDURBBi:
1715 case AArch64::LDURSBWi:
1716 case AArch64::LDURSHWi:
1717 return true;
1718 }
1719 }
1720
1721 Optional<unsigned> AArch64InstrInfo::getUnscaledLdSt(unsigned Opc) {
1722 switch (Opc) {
1723 default: return {};
1724 case AArch64::PRFMui: return AArch64::PRFUMi;
1725 case AArch64::LDRXui: return AArch64::LDURXi;
1726 case AArch64::LDRWui: return AArch64::LDURWi;
1727 case AArch64::LDRBui: return AArch64::LDURBi;
1728 case AArch64::LDRHui: return AArch64::LDURHi;
1729 case AArch64::LDRSui: return AArch64::LDURSi;
1730 case AArch64::LDRDui: return AArch64::LDURDi;
1731 case AArch64::LDRQui: return AArch64::LDURQi;
1732 case AArch64::LDRBBui: return AArch64::LDURBBi;
1733 case AArch64::LDRHHui: return AArch64::LDURHHi;
1734 case AArch64::LDRSBXui: return AArch64::LDURSBXi;
1735 case AArch64::LDRSBWui: return AArch64::LDURSBWi;
1736 case AArch64::LDRSHXui: return AArch64::LDURSHXi;
1737 case AArch64::LDRSHWui: return AArch64::LDURSHWi;
1738 case AArch64::LDRSWui: return AArch64::LDURSWi;
1739 case AArch64::STRXui: return AArch64::STURXi;
1740 case AArch64::STRWui: return AArch64::STURWi;
1741 case AArch64::STRBui: return AArch64::STURBi;
1742 case AArch64::STRHui: return AArch64::STURHi;
1743 case AArch64::STRSui: return AArch64::STURSi;
1744 case AArch64::STRDui: return AArch64::STURDi;
1745 case AArch64::STRQui: return AArch64::STURQi;
1746 case AArch64::STRBBui: return AArch64::STURBBi;
1747 case AArch64::STRHHui: return AArch64::STURHHi;
1748 }
1749 }
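// Example (a minimal sketch) of mapping a scaled opcode to its unscaled twin:
//   Optional<unsigned> U = AArch64InstrInfo::getUnscaledLdSt(AArch64::LDRXui);
//   // *U == AArch64::LDURXi; opcodes without an unscaled form yield None.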
1750
1751 unsigned AArch64InstrInfo::getLoadStoreImmIdx(unsigned Opc) {
1752 switch (Opc) {
1753 default:
1754 return 2;
1755 case AArch64::LDPXi:
1756 case AArch64::LDPDi:
1757 case AArch64::STPXi:
1758 case AArch64::STPDi:
1759 case AArch64::LDNPXi:
1760 case AArch64::LDNPDi:
1761 case AArch64::STNPXi:
1762 case AArch64::STNPDi:
1763 case AArch64::LDPQi:
1764 case AArch64::STPQi:
1765 case AArch64::LDNPQi:
1766 case AArch64::STNPQi:
1767 case AArch64::LDPWi:
1768 case AArch64::LDPSi:
1769 case AArch64::STPWi:
1770 case AArch64::STPSi:
1771 case AArch64::LDNPWi:
1772 case AArch64::LDNPSi:
1773 case AArch64::STNPWi:
1774 case AArch64::STNPSi:
1775 case AArch64::LDG:
1776 case AArch64::STGPi:
1777 return 3;
1778 case AArch64::ADDG:
1779 case AArch64::STGOffset:
1780 return 2;
1781 }
1782 }
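// Example: paired ops carry the immediate as operand 3, e.g. in
//   LDPXi %x0, %x1, %x2, 1     ; ldp x0, x1, [x2, #8]
// getLoadStoreImmIdx(AArch64::LDPXi) == 3 selects the trailing "1".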
1783
1784 bool AArch64InstrInfo::isPairableLdStInst(const MachineInstr &MI) {
1785 switch (MI.getOpcode()) {
1786 default:
1787 return false;
1788 // Scaled instructions.
1789 case AArch64::STRSui:
1790 case AArch64::STRDui:
1791 case AArch64::STRQui:
1792 case AArch64::STRXui:
1793 case AArch64::STRWui:
1794 case AArch64::LDRSui:
1795 case AArch64::LDRDui:
1796 case AArch64::LDRQui:
1797 case AArch64::LDRXui:
1798 case AArch64::LDRWui:
1799 case AArch64::LDRSWui:
1800 // Unscaled instructions.
1801 case AArch64::STURSi:
1802 case AArch64::STURDi:
1803 case AArch64::STURQi:
1804 case AArch64::STURWi:
1805 case AArch64::STURXi:
1806 case AArch64::LDURSi:
1807 case AArch64::LDURDi:
1808 case AArch64::LDURQi:
1809 case AArch64::LDURWi:
1810 case AArch64::LDURXi:
1811 case AArch64::LDURSWi:
1812 return true;
1813 }
1814 }
1815
1816 unsigned AArch64InstrInfo::convertToFlagSettingOpc(unsigned Opc,
1817 bool &Is64Bit) {
1818 switch (Opc) {
1819 default:
1820 llvm_unreachable("Opcode has no flag setting equivalent!");
1821 // 32-bit cases:
1822 case AArch64::ADDWri:
1823 Is64Bit = false;
1824 return AArch64::ADDSWri;
1825 case AArch64::ADDWrr:
1826 Is64Bit = false;
1827 return AArch64::ADDSWrr;
1828 case AArch64::ADDWrs:
1829 Is64Bit = false;
1830 return AArch64::ADDSWrs;
1831 case AArch64::ADDWrx:
1832 Is64Bit = false;
1833 return AArch64::ADDSWrx;
1834 case AArch64::ANDWri:
1835 Is64Bit = false;
1836 return AArch64::ANDSWri;
1837 case AArch64::ANDWrr:
1838 Is64Bit = false;
1839 return AArch64::ANDSWrr;
1840 case AArch64::ANDWrs:
1841 Is64Bit = false;
1842 return AArch64::ANDSWrs;
1843 case AArch64::BICWrr:
1844 Is64Bit = false;
1845 return AArch64::BICSWrr;
1846 case AArch64::BICWrs:
1847 Is64Bit = false;
1848 return AArch64::BICSWrs;
1849 case AArch64::SUBWri:
1850 Is64Bit = false;
1851 return AArch64::SUBSWri;
1852 case AArch64::SUBWrr:
1853 Is64Bit = false;
1854 return AArch64::SUBSWrr;
1855 case AArch64::SUBWrs:
1856 Is64Bit = false;
1857 return AArch64::SUBSWrs;
1858 case AArch64::SUBWrx:
1859 Is64Bit = false;
1860 return AArch64::SUBSWrx;
1861 // 64-bit cases:
1862 case AArch64::ADDXri:
1863 Is64Bit = true;
1864 return AArch64::ADDSXri;
1865 case AArch64::ADDXrr:
1866 Is64Bit = true;
1867 return AArch64::ADDSXrr;
1868 case AArch64::ADDXrs:
1869 Is64Bit = true;
1870 return AArch64::ADDSXrs;
1871 case AArch64::ADDXrx:
1872 Is64Bit = true;
1873 return AArch64::ADDSXrx;
1874 case AArch64::ANDXri:
1875 Is64Bit = true;
1876 return AArch64::ANDSXri;
1877 case AArch64::ANDXrr:
1878 Is64Bit = true;
1879 return AArch64::ANDSXrr;
1880 case AArch64::ANDXrs:
1881 Is64Bit = true;
1882 return AArch64::ANDSXrs;
1883 case AArch64::BICXrr:
1884 Is64Bit = true;
1885 return AArch64::BICSXrr;
1886 case AArch64::BICXrs:
1887 Is64Bit = true;
1888 return AArch64::BICSXrs;
1889 case AArch64::SUBXri:
1890 Is64Bit = true;
1891 return AArch64::SUBSXri;
1892 case AArch64::SUBXrr:
1893 Is64Bit = true;
1894 return AArch64::SUBSXrr;
1895 case AArch64::SUBXrs:
1896 Is64Bit = true;
1897 return AArch64::SUBSXrs;
1898 case AArch64::SUBXrx:
1899 Is64Bit = true;
1900 return AArch64::SUBSXrx;
1901 }
1902 }
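// Usage sketch (assumed caller, e.g. when folding a compare against zero):
//   bool Is64Bit;
//   unsigned NewOpc =
//       AArch64InstrInfo::convertToFlagSettingOpc(AArch64::ADDWri, Is64Bit);
//   // NewOpc == AArch64::ADDSWri, Is64Bit == false; the S-form sets NZCV.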
1903
1904 // Is this a candidate for ld/st merging or pairing? For example, we don't
1905 // touch volatiles or load/stores that have a hint to avoid pair formation.
1906 bool AArch64InstrInfo::isCandidateToMergeOrPair(const MachineInstr &MI) const {
1907 // If this is a volatile load/store, don't mess with it.
1908 if (MI.hasOrderedMemoryRef())
1909 return false;
1910
1911 // Make sure this is a reg/fi+imm (as opposed to an address reloc).
1912 assert((MI.getOperand(1).isReg() || MI.getOperand(1).isFI()) &&
1913 "Expected a reg or frame index operand.");
1914 if (!MI.getOperand(2).isImm())
1915 return false;
1916
1917 // Can't merge/pair if the instruction modifies the base register.
1918 // e.g., ldr x0, [x0]
1919 // This case will never occur with an FI base.
1920 if (MI.getOperand(1).isReg()) {
1921 unsigned BaseReg = MI.getOperand(1).getReg();
1922 const TargetRegisterInfo *TRI = &getRegisterInfo();
1923 if (MI.modifiesRegister(BaseReg, TRI))
1924 return false;
1925 }
1926
1927 // Check if this load/store has a hint to avoid pair formation.
1928 // MachineMemOperands hints are set by the AArch64StorePairSuppress pass.
1929 if (isLdStPairSuppressed(MI))
1930 return false;
1931
1932   // Do not pair any callee-save store/reload instructions in the
1933   // prologue/epilogue if the CFI information encoded the operations as separate
1934   // instructions, as that will cause the actual prologue size to differ from
1935   // the prologue size recorded in the Windows CFI.
1936 const MCAsmInfo *MAI = MI.getMF()->getTarget().getMCAsmInfo();
1937 bool NeedsWinCFI = MAI->usesWindowsCFI() &&
1938 MI.getMF()->getFunction().needsUnwindTableEntry();
1939 if (NeedsWinCFI && (MI.getFlag(MachineInstr::FrameSetup) ||
1940 MI.getFlag(MachineInstr::FrameDestroy)))
1941 return false;
1942
1943 // On some CPUs quad load/store pairs are slower than two single load/stores.
1944 if (Subtarget.isPaired128Slow()) {
1945 switch (MI.getOpcode()) {
1946 default:
1947 break;
1948 case AArch64::LDURQi:
1949 case AArch64::STURQi:
1950 case AArch64::LDRQui:
1951 case AArch64::STRQui:
1952 return false;
1953 }
1954 }
1955
1956 return true;
1957 }
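// Illustrative rejections (hypothetical):
//   ldr x0, [x0]               ; writes its own base register -> rejected
//   a load with a volatile/ordered memory operand             -> rejected
//   ldr q0, [x1] on a CPU where isPaired128Slow() is true     -> rejected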
1958
1959 bool AArch64InstrInfo::getMemOperandWithOffset(const MachineInstr &LdSt,
1960 const MachineOperand *&BaseOp,
1961 int64_t &Offset,
1962 const TargetRegisterInfo *TRI) const {
1963 unsigned Width;
1964 return getMemOperandWithOffsetWidth(LdSt, BaseOp, Offset, Width, TRI);
1965 }
1966
1967 bool AArch64InstrInfo::getMemOperandWithOffsetWidth(
1968 const MachineInstr &LdSt, const MachineOperand *&BaseOp, int64_t &Offset,
1969 unsigned &Width, const TargetRegisterInfo *TRI) const {
1970 assert(LdSt.mayLoadOrStore() && "Expected a memory operation.");
1971 // Handle only loads/stores with base register followed by immediate offset.
1972 if (LdSt.getNumExplicitOperands() == 3) {
1973 // Non-paired instruction (e.g., ldr x1, [x0, #8]).
1974 if ((!LdSt.getOperand(1).isReg() && !LdSt.getOperand(1).isFI()) ||
1975 !LdSt.getOperand(2).isImm())
1976 return false;
1977 } else if (LdSt.getNumExplicitOperands() == 4) {
1978 // Paired instruction (e.g., ldp x1, x2, [x0, #8]).
1979 if (!LdSt.getOperand(1).isReg() ||
1980 (!LdSt.getOperand(2).isReg() && !LdSt.getOperand(2).isFI()) ||
1981 !LdSt.getOperand(3).isImm())
1982 return false;
1983 } else
1984 return false;
1985
1986   // Get the scaling factor for the instruction and set the width of the
1987   // memory access.
1988 unsigned Scale = 0;
1989 int64_t Dummy1, Dummy2;
1990
1991 // If this returns false, then it's an instruction we don't want to handle.
1992 if (!getMemOpInfo(LdSt.getOpcode(), Scale, Width, Dummy1, Dummy2))
1993 return false;
1994
1995 // Compute the offset. Offset is calculated as the immediate operand
1996 // multiplied by the scaling factor. Unscaled instructions have scaling factor
1997 // set to 1.
1998 if (LdSt.getNumExplicitOperands() == 3) {
1999 BaseOp = &LdSt.getOperand(1);
2000 Offset = LdSt.getOperand(2).getImm() * Scale;
2001 } else {
2002 assert(LdSt.getNumExplicitOperands() == 4 && "invalid number of operands");
2003 BaseOp = &LdSt.getOperand(2);
2004 Offset = LdSt.getOperand(3).getImm() * Scale;
2005 }
2006
2007 assert((BaseOp->isReg() || BaseOp->isFI()) &&
2008 "getMemOperandWithOffset only supports base "
2009 "operands of type register or frame index.");
2010
2011 return true;
2012 }
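// Worked example (hypothetical): for "%x1 = LDRXui %x0, 3", i.e.
// ldr x1, [x0, #24], getMemOpInfo reports Scale = 8, so BaseOp points at
// operand 1 (%x0), Offset = 3 * 8 = 24 bytes, and Width = 8.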
2013
2014 MachineOperand &
2015 AArch64InstrInfo::getMemOpBaseRegImmOfsOffsetOperand(MachineInstr &LdSt) const {
2016 assert(LdSt.mayLoadOrStore() && "Expected a memory operation.");
2017 MachineOperand &OfsOp = LdSt.getOperand(LdSt.getNumExplicitOperands() - 1);
2018 assert(OfsOp.isImm() && "Offset operand wasn't immediate.");
2019 return OfsOp;
2020 }
2021
2022 bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, unsigned &Scale,
2023 unsigned &Width, int64_t &MinOffset,
2024 int64_t &MaxOffset) {
2025 switch (Opcode) {
2026 // Not a memory operation or something we want to handle.
2027 default:
2028 Scale = Width = 0;
2029 MinOffset = MaxOffset = 0;
2030 return false;
2031 case AArch64::STRWpost:
2032 case AArch64::LDRWpost:
2033 Width = 32;
2034 Scale = 4;
2035 MinOffset = -256;
2036 MaxOffset = 255;
2037 break;
2038 case AArch64::LDURQi:
2039 case AArch64::STURQi:
2040 Width = 16;
2041 Scale = 1;
2042 MinOffset = -256;
2043 MaxOffset = 255;
2044 break;
2045 case AArch64::PRFUMi:
2046 case AArch64::LDURXi:
2047 case AArch64::LDURDi:
2048 case AArch64::STURXi:
2049 case AArch64::STURDi:
2050 Width = 8;
2051 Scale = 1;
2052 MinOffset = -256;
2053 MaxOffset = 255;
2054 break;
2055 case AArch64::LDURWi:
2056 case AArch64::LDURSi:
2057 case AArch64::LDURSWi:
2058 case AArch64::STURWi:
2059 case AArch64::STURSi:
2060 Width = 4;
2061 Scale = 1;
2062 MinOffset = -256;
2063 MaxOffset = 255;
2064 break;
2065 case AArch64::LDURHi:
2066 case AArch64::LDURHHi:
2067 case AArch64::LDURSHXi:
2068 case AArch64::LDURSHWi:
2069 case AArch64::STURHi:
2070 case AArch64::STURHHi:
2071 Width = 2;
2072 Scale = 1;
2073 MinOffset = -256;
2074 MaxOffset = 255;
2075 break;
2076 case AArch64::LDURBi:
2077 case AArch64::LDURBBi:
2078 case AArch64::LDURSBXi:
2079 case AArch64::LDURSBWi:
2080 case AArch64::STURBi:
2081 case AArch64::STURBBi:
2082 Width = 1;
2083 Scale = 1;
2084 MinOffset = -256;
2085 MaxOffset = 255;
2086 break;
2087 case AArch64::LDPQi:
2088 case AArch64::LDNPQi:
2089 case AArch64::STPQi:
2090 case AArch64::STNPQi:
2091 Scale = 16;
2092 Width = 32;
2093 MinOffset = -64;
2094 MaxOffset = 63;
2095 break;
2096 case AArch64::LDRQui:
2097 case AArch64::STRQui:
2098 Scale = Width = 16;
2099 MinOffset = 0;
2100 MaxOffset = 4095;
2101 break;
2102 case AArch64::LDPXi:
2103 case AArch64::LDPDi:
2104 case AArch64::LDNPXi:
2105 case AArch64::LDNPDi:
2106 case AArch64::STPXi:
2107 case AArch64::STPDi:
2108 case AArch64::STNPXi:
2109 case AArch64::STNPDi:
2110 Scale = 8;
2111 Width = 16;
2112 MinOffset = -64;
2113 MaxOffset = 63;
2114 break;
2115 case AArch64::PRFMui:
2116 case AArch64::LDRXui:
2117 case AArch64::LDRDui:
2118 case AArch64::STRXui:
2119 case AArch64::STRDui:
2120 Scale = Width = 8;
2121 MinOffset = 0;
2122 MaxOffset = 4095;
2123 break;
2124 case AArch64::LDPWi:
2125 case AArch64::LDPSi:
2126 case AArch64::LDNPWi:
2127 case AArch64::LDNPSi:
2128 case AArch64::STPWi:
2129 case AArch64::STPSi:
2130 case AArch64::STNPWi:
2131 case AArch64::STNPSi:
2132 Scale = 4;
2133 Width = 8;
2134 MinOffset = -64;
2135 MaxOffset = 63;
2136 break;
2137 case AArch64::LDRWui:
2138 case AArch64::LDRSui:
2139 case AArch64::LDRSWui:
2140 case AArch64::STRWui:
2141 case AArch64::STRSui:
2142 Scale = Width = 4;
2143 MinOffset = 0;
2144 MaxOffset = 4095;
2145 break;
2146 case AArch64::LDRHui:
2147 case AArch64::LDRHHui:
2148 case AArch64::LDRSHWui:
2149 case AArch64::LDRSHXui:
2150 case AArch64::STRHui:
2151 case AArch64::STRHHui:
2152 Scale = Width = 2;
2153 MinOffset = 0;
2154 MaxOffset = 4095;
2155 break;
2156 case AArch64::LDRBui:
2157 case AArch64::LDRBBui:
2158 case AArch64::LDRSBWui:
2159 case AArch64::LDRSBXui:
2160 case AArch64::STRBui:
2161 case AArch64::STRBBui:
2162 Scale = Width = 1;
2163 MinOffset = 0;
2164 MaxOffset = 4095;
2165 break;
2166 case AArch64::ADDG:
2167 case AArch64::TAGPstack:
2168 Scale = 16;
2169 Width = 0;
2170 MinOffset = 0;
2171 MaxOffset = 63;
2172 break;
2173 case AArch64::LDG:
2174 case AArch64::STGOffset:
2175 case AArch64::STZGOffset:
2176 Scale = Width = 16;
2177 MinOffset = -256;
2178 MaxOffset = 255;
2179 break;
2180 case AArch64::ST2GOffset:
2181 case AArch64::STZ2GOffset:
2182 Scale = 16;
2183 Width = 32;
2184 MinOffset = -256;
2185 MaxOffset = 255;
2186 break;
2187 case AArch64::STGPi:
2188 Scale = Width = 16;
2189 MinOffset = -64;
2190 MaxOffset = 63;
2191 break;
2192 }
2193
2194 return true;
2195 }
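// Reading the table above: LDPXi has Scale = 8 with an element range of
// [-64, 63], i.e. byte offsets -512..504 in steps of 8, while LDRXui has
// Scale = Width = 8 with range [0, 4095], i.e. byte offsets 0..32760.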
2196
2197 static unsigned getOffsetStride(unsigned Opc) {
2198 switch (Opc) {
2199 default:
2200 return 0;
2201 case AArch64::LDURQi:
2202 case AArch64::STURQi:
2203 return 16;
2204 case AArch64::LDURXi:
2205 case AArch64::LDURDi:
2206 case AArch64::STURXi:
2207 case AArch64::STURDi:
2208 return 8;
2209 case AArch64::LDURWi:
2210 case AArch64::LDURSi:
2211 case AArch64::LDURSWi:
2212 case AArch64::STURWi:
2213 case AArch64::STURSi:
2214 return 4;
2215 }
2216 }
2217
2218 // Scale the unscaled offsets. Returns false if the unscaled offset can't be
2219 // scaled.
2220 static bool scaleOffset(unsigned Opc, int64_t &Offset) {
2221 unsigned OffsetStride = getOffsetStride(Opc);
2222 if (OffsetStride == 0)
2223 return false;
2224 // If the byte-offset isn't a multiple of the stride, we can't scale this
2225 // offset.
2226 if (Offset % OffsetStride != 0)
2227 return false;
2228
2229   // Convert the byte offset used by unscaled instructions into the "element"
2230   // offset used by the scaled pair load/store instructions.
2231 Offset /= OffsetStride;
2232 return true;
2233 }
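// Example: STURXi has stride 8, so a byte offset of 16 scales to element
// offset 2, while a byte offset of 12 is rejected (12 % 8 != 0).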
2234
2235 // Unscale the scaled offsets. Returns false if the scaled offset can't be
2236 // unscaled.
2237 static bool unscaleOffset(unsigned Opc, int64_t &Offset) {
2238 unsigned OffsetStride = getOffsetStride(Opc);
2239 if (OffsetStride == 0)
2240 return false;
2241
2242 // Convert the "element" offset used by scaled pair load/store instructions
2243 // into the byte-offset used by unscaled.
2244 Offset *= OffsetStride;
2245 return true;
2246 }
2247
2248 static bool canPairLdStOpc(unsigned FirstOpc, unsigned SecondOpc) {
2249 if (FirstOpc == SecondOpc)
2250 return true;
2251 // We can also pair sign-ext and zero-ext instructions.
2252 switch (FirstOpc) {
2253 default:
2254 return false;
2255 case AArch64::LDRWui:
2256 case AArch64::LDURWi:
2257 return SecondOpc == AArch64::LDRSWui || SecondOpc == AArch64::LDURSWi;
2258 case AArch64::LDRSWui:
2259 case AArch64::LDURSWi:
2260 return SecondOpc == AArch64::LDRWui || SecondOpc == AArch64::LDURWi;
2261 }
2262 // These instructions can't be paired based on their opcodes.
2263 return false;
2264 }
2265
2266 static bool shouldClusterFI(const MachineFrameInfo &MFI, int FI1,
2267 int64_t Offset1, unsigned Opcode1, int FI2,
2268 int64_t Offset2, unsigned Opcode2) {
2269   // Accesses through distinct fixed stack object frame indices can still be
2270   // adjacent in memory; compare the combined object offsets + access offsets.
2271 if (MFI.isFixedObjectIndex(FI1) && MFI.isFixedObjectIndex(FI2)) {
2272 int64_t ObjectOffset1 = MFI.getObjectOffset(FI1);
2273 int64_t ObjectOffset2 = MFI.getObjectOffset(FI2);
2274 assert(ObjectOffset1 <= ObjectOffset2 && "Object offsets are not ordered.");
2275     // Convert the access offsets to byte offsets.
2276 if (!unscaleOffset(Opcode1, Offset1) || !unscaleOffset(Opcode2, Offset2))
2277 return false;
2278 ObjectOffset1 += Offset1;
2279 ObjectOffset2 += Offset2;
2280 // Get the "element" index in the object.
2281 if (!scaleOffset(Opcode1, ObjectOffset1) ||
2282 !scaleOffset(Opcode2, ObjectOffset2))
2283 return false;
2284 return ObjectOffset1 + 1 == ObjectOffset2;
2285 }
2286
2287 return FI1 == FI2;
2288 }
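// Hypothetical example: fixed objects FI1 at object offset 0 and FI2 at
// object offset 8, both accessed by LDURXi with element offset 0, unscale
// to byte offsets 0 and 8; rescaled, they are elements 0 and 1, which are
// adjacent, so the two accesses may be clustered.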
2289
2290 /// Detect opportunities for ldp/stp formation.
2291 ///
2292 /// Only called for LdSt for which getMemOperandWithOffset returns true.
2293 bool AArch64InstrInfo::shouldClusterMemOps(const MachineOperand &BaseOp1,
2294 const MachineOperand &BaseOp2,
2295 unsigned NumLoads) const {
2296 const MachineInstr &FirstLdSt = *BaseOp1.getParent();
2297 const MachineInstr &SecondLdSt = *BaseOp2.getParent();
2298 if (BaseOp1.getType() != BaseOp2.getType())
2299 return false;
2300
2301 assert((BaseOp1.isReg() || BaseOp1.isFI()) &&
2302 "Only base registers and frame indices are supported.");
2303
2304 // Check for both base regs and base FI.
2305 if (BaseOp1.isReg() && BaseOp1.getReg() != BaseOp2.getReg())
2306 return false;
2307
2308 // Only cluster up to a single pair.
2309 if (NumLoads > 1)
2310 return false;
2311
2312 if (!isPairableLdStInst(FirstLdSt) || !isPairableLdStInst(SecondLdSt))
2313 return false;
2314
2315 // Can we pair these instructions based on their opcodes?
2316 unsigned FirstOpc = FirstLdSt.getOpcode();
2317 unsigned SecondOpc = SecondLdSt.getOpcode();
2318 if (!canPairLdStOpc(FirstOpc, SecondOpc))
2319 return false;
2320
2321 // Can't merge volatiles or load/stores that have a hint to avoid pair
2322 // formation, for example.
2323 if (!isCandidateToMergeOrPair(FirstLdSt) ||
2324 !isCandidateToMergeOrPair(SecondLdSt))
2325 return false;
2326
2327 // isCandidateToMergeOrPair guarantees that operand 2 is an immediate.
2328 int64_t Offset1 = FirstLdSt.getOperand(2).getImm();
2329 if (isUnscaledLdSt(FirstOpc) && !scaleOffset(FirstOpc, Offset1))
2330 return false;
2331
2332 int64_t Offset2 = SecondLdSt.getOperand(2).getImm();
2333 if (isUnscaledLdSt(SecondOpc) && !scaleOffset(SecondOpc, Offset2))
2334 return false;
2335
2336 // Pairwise instructions have a 7-bit signed offset field.
2337 if (Offset1 > 63 || Offset1 < -64)
2338 return false;
2339
2340   // The caller should already have ordered FirstLdSt/SecondLdSt by offset;
2341   // the one exception is accesses through non-equal frame index bases.
2342 if (BaseOp1.isFI()) {
2343 assert((!BaseOp1.isIdenticalTo(BaseOp2) || Offset1 >= Offset2) &&
2344 "Caller should have ordered offsets.");
2345
2346 const MachineFrameInfo &MFI =
2347 FirstLdSt.getParent()->getParent()->getFrameInfo();
2348 return shouldClusterFI(MFI, BaseOp1.getIndex(), Offset1, FirstOpc,
2349 BaseOp2.getIndex(), Offset2, SecondOpc);
2350 }
2351
2352 assert((!BaseOp1.isIdenticalTo(BaseOp2) || Offset1 <= Offset2) &&
2353 "Caller should have ordered offsets.");
2354
2355 return Offset1 + 1 == Offset2;
2356 }
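// Illustrative pair that clusters (hypothetical MIR):
//   %x1 = LDRXui %x0, 1        ; ldr x1, [x0, #8]
//   %x2 = LDRXui %x0, 2        ; ldr x2, [x0, #16]
// Same base register and adjacent element offsets (1 and 2), so the pair
// is a candidate for a single ldp.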
2357
2358 static const MachineInstrBuilder &AddSubReg(const MachineInstrBuilder &MIB,
2359 unsigned Reg, unsigned SubIdx,
2360 unsigned State,
2361 const TargetRegisterInfo *TRI) {
2362 if (!SubIdx)
2363 return MIB.addReg(Reg, State);
2364
2365 if (TargetRegisterInfo::isPhysicalRegister(Reg))
2366 return MIB.addReg(TRI->getSubReg(Reg, SubIdx), State);
2367 return MIB.addReg(Reg, State, SubIdx);
2368 }
2369
2370 static bool forwardCopyWillClobberTuple(unsigned DestReg, unsigned SrcReg,
2371 unsigned NumRegs) {
2372 // We really want the positive remainder mod 32 here, that happens to be
2373 // easily obtainable with a mask.
2374 return ((DestReg - SrcReg) & 0x1f) < NumRegs;
2375 }
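// Worked example: copying the tuple at encoding 1 (e.g. Q1_Q2) to the tuple
// at encoding 2 (Q2_Q3): (2 - 1) & 0x1f == 1 < 2, so a forward sub-register
// copy would clobber q2 before reading it; copyPhysRegTuple below iterates
// backwards in that case.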
2376
2377 void AArch64InstrInfo::copyPhysRegTuple(MachineBasicBlock &MBB,
2378 MachineBasicBlock::iterator I,
2379 const DebugLoc &DL, unsigned DestReg,
2380 unsigned SrcReg, bool KillSrc,
2381 unsigned Opcode,
2382 ArrayRef<unsigned> Indices) const {
2383 assert(Subtarget.hasNEON() && "Unexpected register copy without NEON");
2384 const TargetRegisterInfo *TRI = &getRegisterInfo();
2385 uint16_t DestEncoding = TRI->getEncodingValue(DestReg);
2386 uint16_t SrcEncoding = TRI->getEncodingValue(SrcReg);
2387 unsigned NumRegs = Indices.size();
2388
2389 int SubReg = 0, End = NumRegs, Incr = 1;
2390 if (forwardCopyWillClobberTuple(DestEncoding, SrcEncoding, NumRegs)) {
2391 SubReg = NumRegs - 1;
2392 End = -1;
2393 Incr = -1;
2394 }
2395
2396 for (; SubReg != End; SubReg += Incr) {
2397 const MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(Opcode));
2398 AddSubReg(MIB, DestReg, Indices[SubReg], RegState::Define, TRI);
2399 AddSubReg(MIB, SrcReg, Indices[SubReg], 0, TRI);
2400 AddSubReg(MIB, SrcReg, Indices[SubReg], getKillRegState(KillSrc), TRI);
2401 }
2402 }
2403
2404 void AArch64InstrInfo::copyGPRRegTuple(MachineBasicBlock &MBB,
2405 MachineBasicBlock::iterator I,
2406 DebugLoc DL, unsigned DestReg,
2407 unsigned SrcReg, bool KillSrc,
2408 unsigned Opcode, unsigned ZeroReg,
2409 llvm::ArrayRef<unsigned> Indices) const {
2410 const TargetRegisterInfo *TRI = &getRegisterInfo();
2411 unsigned NumRegs = Indices.size();
2412
2413 #ifndef NDEBUG
2414 uint16_t DestEncoding = TRI->getEncodingValue(DestReg);
2415 uint16_t SrcEncoding = TRI->getEncodingValue(SrcReg);
2416 assert(DestEncoding % NumRegs == 0 && SrcEncoding % NumRegs == 0 &&
2417 "GPR reg sequences should not be able to overlap");
2418 #endif
2419
2420 for (unsigned SubReg = 0; SubReg != NumRegs; ++SubReg) {
2421 const MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(Opcode));
2422 AddSubReg(MIB, DestReg, Indices[SubReg], RegState::Define, TRI);
2423 MIB.addReg(ZeroReg);
2424 AddSubReg(MIB, SrcReg, Indices[SubReg], getKillRegState(KillSrc), TRI);
2425 MIB.addImm(0);
2426 }
2427 }
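// For an X sequential pair this emits, per sub-register (illustrative):
//   orr xDstLo, xzr, xSrcLo, lsl #0
//   orr xDstHi, xzr, xSrcHi, lsl #0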
2428
2429 void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
2430 MachineBasicBlock::iterator I,
2431 const DebugLoc &DL, unsigned DestReg,
2432 unsigned SrcReg, bool KillSrc) const {
2433 if (AArch64::GPR32spRegClass.contains(DestReg) &&
2434 (AArch64::GPR32spRegClass.contains(SrcReg) || SrcReg == AArch64::WZR)) {
2435 const TargetRegisterInfo *TRI = &getRegisterInfo();
2436
2437 if (DestReg == AArch64::WSP || SrcReg == AArch64::WSP) {
2438 // If either operand is WSP, expand to ADD #0.
2439 if (Subtarget.hasZeroCycleRegMove()) {
2440 // Cyclone recognizes "ADD Xd, Xn, #0" as a zero-cycle register move.
2441 unsigned DestRegX = TRI->getMatchingSuperReg(DestReg, AArch64::sub_32,
2442 &AArch64::GPR64spRegClass);
2443 unsigned SrcRegX = TRI->getMatchingSuperReg(SrcReg, AArch64::sub_32,
2444 &AArch64::GPR64spRegClass);
2445 // This instruction is reading and writing X registers. This may upset
2446 // the register scavenger and machine verifier, so we need to indicate
2447 // that we are reading an undefined value from SrcRegX, but a proper
2448 // value from SrcReg.
2449 BuildMI(MBB, I, DL, get(AArch64::ADDXri), DestRegX)
2450 .addReg(SrcRegX, RegState::Undef)
2451 .addImm(0)
2452 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0))
2453 .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
2454 } else {
2455 BuildMI(MBB, I, DL, get(AArch64::ADDWri), DestReg)
2456 .addReg(SrcReg, getKillRegState(KillSrc))
2457 .addImm(0)
2458 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
2459 }
2460 } else if (SrcReg == AArch64::WZR && Subtarget.hasZeroCycleZeroingGP()) {
2461 BuildMI(MBB, I, DL, get(AArch64::MOVZWi), DestReg)
2462 .addImm(0)
2463 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
2464 } else {
2465 if (Subtarget.hasZeroCycleRegMove()) {
2466 // Cyclone recognizes "ORR Xd, XZR, Xm" as a zero-cycle register move.
2467 unsigned DestRegX = TRI->getMatchingSuperReg(DestReg, AArch64::sub_32,
2468 &AArch64::GPR64spRegClass);
2469 unsigned SrcRegX = TRI->getMatchingSuperReg(SrcReg, AArch64::sub_32,
2470 &AArch64::GPR64spRegClass);
2471 // This instruction is reading and writing X registers. This may upset
2472 // the register scavenger and machine verifier, so we need to indicate
2473 // that we are reading an undefined value from SrcRegX, but a proper
2474 // value from SrcReg.
2475 BuildMI(MBB, I, DL, get(AArch64::ORRXrr), DestRegX)
2476 .addReg(AArch64::XZR)
2477 .addReg(SrcRegX, RegState::Undef)
2478 .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
2479 } else {
2480 // Otherwise, expand to ORR WZR.
2481 BuildMI(MBB, I, DL, get(AArch64::ORRWrr), DestReg)
2482 .addReg(AArch64::WZR)
2483 .addReg(SrcReg, getKillRegState(KillSrc));
2484 }
2485 }
2486 return;
2487 }
2488
2489 if (AArch64::GPR64spRegClass.contains(DestReg) &&
2490 (AArch64::GPR64spRegClass.contains(SrcReg) || SrcReg == AArch64::XZR)) {
2491 if (DestReg == AArch64::SP || SrcReg == AArch64::SP) {
2492 // If either operand is SP, expand to ADD #0.
2493 BuildMI(MBB, I, DL, get(AArch64::ADDXri), DestReg)
2494 .addReg(SrcReg, getKillRegState(KillSrc))
2495 .addImm(0)
2496 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
2497 } else if (SrcReg == AArch64::XZR && Subtarget.hasZeroCycleZeroingGP()) {
2498 BuildMI(MBB, I, DL, get(AArch64::MOVZXi), DestReg)
2499 .addImm(0)
2500 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
2501 } else {
2502 // Otherwise, expand to ORR XZR.
2503 BuildMI(MBB, I, DL, get(AArch64::ORRXrr), DestReg)
2504 .addReg(AArch64::XZR)
2505 .addReg(SrcReg, getKillRegState(KillSrc));
2506 }
2507 return;
2508 }
2509
2510 // Copy a DDDD register quad by copying the individual sub-registers.
2511 if (AArch64::DDDDRegClass.contains(DestReg) &&
2512 AArch64::DDDDRegClass.contains(SrcReg)) {
2513 static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1,
2514 AArch64::dsub2, AArch64::dsub3};
2515 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8,
2516 Indices);
2517 return;
2518 }
2519
2520 // Copy a DDD register triple by copying the individual sub-registers.
2521 if (AArch64::DDDRegClass.contains(DestReg) &&
2522 AArch64::DDDRegClass.contains(SrcReg)) {
2523 static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1,
2524 AArch64::dsub2};
2525 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8,
2526 Indices);
2527 return;
2528 }
2529
2530 // Copy a DD register pair by copying the individual sub-registers.
2531 if (AArch64::DDRegClass.contains(DestReg) &&
2532 AArch64::DDRegClass.contains(SrcReg)) {
2533 static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1};
2534 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8,
2535 Indices);
2536 return;
2537 }
2538
2539 // Copy a QQQQ register quad by copying the individual sub-registers.
2540 if (AArch64::QQQQRegClass.contains(DestReg) &&
2541 AArch64::QQQQRegClass.contains(SrcReg)) {
2542 static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1,
2543 AArch64::qsub2, AArch64::qsub3};
2544 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8,
2545 Indices);
2546 return;
2547 }
2548
2549 // Copy a QQQ register triple by copying the individual sub-registers.
2550 if (AArch64::QQQRegClass.contains(DestReg) &&
2551 AArch64::QQQRegClass.contains(SrcReg)) {
2552 static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1,
2553 AArch64::qsub2};
2554 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8,
2555 Indices);
2556 return;
2557 }
2558
2559 // Copy a QQ register pair by copying the individual sub-registers.
2560 if (AArch64::QQRegClass.contains(DestReg) &&
2561 AArch64::QQRegClass.contains(SrcReg)) {
2562 static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1};
2563 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8,
2564 Indices);
2565 return;
2566 }
2567
2568 if (AArch64::XSeqPairsClassRegClass.contains(DestReg) &&
2569 AArch64::XSeqPairsClassRegClass.contains(SrcReg)) {
2570 static const unsigned Indices[] = {AArch64::sube64, AArch64::subo64};
2571 copyGPRRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRXrs,
2572 AArch64::XZR, Indices);
2573 return;
2574 }
2575
2576 if (AArch64::WSeqPairsClassRegClass.contains(DestReg) &&
2577 AArch64::WSeqPairsClassRegClass.contains(SrcReg)) {
2578 static const unsigned Indices[] = {AArch64::sube32, AArch64::subo32};
2579 copyGPRRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRWrs,
2580 AArch64::WZR, Indices);
2581 return;
2582 }
2583
2584 if (AArch64::FPR128RegClass.contains(DestReg) &&
2585 AArch64::FPR128RegClass.contains(SrcReg)) {
2586 if (Subtarget.hasNEON()) {
2587 BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg)
2588 .addReg(SrcReg)
2589 .addReg(SrcReg, getKillRegState(KillSrc));
2590 } else {
2591 BuildMI(MBB, I, DL, get(AArch64::STRQpre))
2592 .addReg(AArch64::SP, RegState::Define)
2593 .addReg(SrcReg, getKillRegState(KillSrc))
2594 .addReg(AArch64::SP)
2595 .addImm(-16);
2596 BuildMI(MBB, I, DL, get(AArch64::LDRQpre))
2597 .addReg(AArch64::SP, RegState::Define)
2598 .addReg(DestReg, RegState::Define)
2599 .addReg(AArch64::SP)
2600 .addImm(16);
2601 }
2602 return;
2603 }
2604
2605 if (AArch64::FPR64RegClass.contains(DestReg) &&
2606 AArch64::FPR64RegClass.contains(SrcReg)) {
2607 if (Subtarget.hasNEON()) {
2608 DestReg = RI.getMatchingSuperReg(DestReg, AArch64::dsub,
2609 &AArch64::FPR128RegClass);
2610 SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::dsub,
2611 &AArch64::FPR128RegClass);
2612 BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg)
2613 .addReg(SrcReg)
2614 .addReg(SrcReg, getKillRegState(KillSrc));
2615 } else {
2616 BuildMI(MBB, I, DL, get(AArch64::FMOVDr), DestReg)
2617 .addReg(SrcReg, getKillRegState(KillSrc));
2618 }
2619 return;
2620 }
2621
2622 if (AArch64::FPR32RegClass.contains(DestReg) &&
2623 AArch64::FPR32RegClass.contains(SrcReg)) {
2624 if (Subtarget.hasNEON()) {
2625 DestReg = RI.getMatchingSuperReg(DestReg, AArch64::ssub,
2626 &AArch64::FPR128RegClass);
2627 SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::ssub,
2628 &AArch64::FPR128RegClass);
2629 BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg)
2630 .addReg(SrcReg)
2631 .addReg(SrcReg, getKillRegState(KillSrc));
2632 } else {
2633 BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
2634 .addReg(SrcReg, getKillRegState(KillSrc));
2635 }
2636 return;
2637 }
2638
2639 if (AArch64::FPR16RegClass.contains(DestReg) &&
2640 AArch64::FPR16RegClass.contains(SrcReg)) {
2641 if (Subtarget.hasNEON()) {
2642 DestReg = RI.getMatchingSuperReg(DestReg, AArch64::hsub,
2643 &AArch64::FPR128RegClass);
2644 SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::hsub,
2645 &AArch64::FPR128RegClass);
2646 BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg)
2647 .addReg(SrcReg)
2648 .addReg(SrcReg, getKillRegState(KillSrc));
2649 } else {
2650 DestReg = RI.getMatchingSuperReg(DestReg, AArch64::hsub,
2651 &AArch64::FPR32RegClass);
2652 SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::hsub,
2653 &AArch64::FPR32RegClass);
2654 BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
2655 .addReg(SrcReg, getKillRegState(KillSrc));
2656 }
2657 return;
2658 }
2659
2660 if (AArch64::FPR8RegClass.contains(DestReg) &&
2661 AArch64::FPR8RegClass.contains(SrcReg)) {
2662 if (Subtarget.hasNEON()) {
2663 DestReg = RI.getMatchingSuperReg(DestReg, AArch64::bsub,
2664 &AArch64::FPR128RegClass);
2665 SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::bsub,
2666 &AArch64::FPR128RegClass);
2667 BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg)
2668 .addReg(SrcReg)
2669 .addReg(SrcReg, getKillRegState(KillSrc));
2670 } else {
2671 DestReg = RI.getMatchingSuperReg(DestReg, AArch64::bsub,
2672 &AArch64::FPR32RegClass);
2673 SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::bsub,
2674 &AArch64::FPR32RegClass);
2675 BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
2676 .addReg(SrcReg, getKillRegState(KillSrc));
2677 }
2678 return;
2679 }
2680
2681 // Copies between GPR64 and FPR64.
2682 if (AArch64::FPR64RegClass.contains(DestReg) &&
2683 AArch64::GPR64RegClass.contains(SrcReg)) {
2684 BuildMI(MBB, I, DL, get(AArch64::FMOVXDr), DestReg)
2685 .addReg(SrcReg, getKillRegState(KillSrc));
2686 return;
2687 }
2688 if (AArch64::GPR64RegClass.contains(DestReg) &&
2689 AArch64::FPR64RegClass.contains(SrcReg)) {
2690 BuildMI(MBB, I, DL, get(AArch64::FMOVDXr), DestReg)
2691 .addReg(SrcReg, getKillRegState(KillSrc));
2692 return;
2693 }
2694 // Copies between GPR32 and FPR32.
2695 if (AArch64::FPR32RegClass.contains(DestReg) &&
2696 AArch64::GPR32RegClass.contains(SrcReg)) {
2697 BuildMI(MBB, I, DL, get(AArch64::FMOVWSr), DestReg)
2698 .addReg(SrcReg, getKillRegState(KillSrc));
2699 return;
2700 }
2701 if (AArch64::GPR32RegClass.contains(DestReg) &&
2702 AArch64::FPR32RegClass.contains(SrcReg)) {
2703 BuildMI(MBB, I, DL, get(AArch64::FMOVSWr), DestReg)
2704 .addReg(SrcReg, getKillRegState(KillSrc));
2705 return;
2706 }
2707
2708 if (DestReg == AArch64::NZCV) {
2709 assert(AArch64::GPR64RegClass.contains(SrcReg) && "Invalid NZCV copy");
2710 BuildMI(MBB, I, DL, get(AArch64::MSR))
2711 .addImm(AArch64SysReg::NZCV)
2712 .addReg(SrcReg, getKillRegState(KillSrc))
2713 .addReg(AArch64::NZCV, RegState::Implicit | RegState::Define);
2714 return;
2715 }
2716
2717 if (SrcReg == AArch64::NZCV) {
2718 assert(AArch64::GPR64RegClass.contains(DestReg) && "Invalid NZCV copy");
2719 BuildMI(MBB, I, DL, get(AArch64::MRS), DestReg)
2720 .addImm(AArch64SysReg::NZCV)
2721 .addReg(AArch64::NZCV, RegState::Implicit | getKillRegState(KillSrc));
2722 return;
2723 }
2724
2725 llvm_unreachable("unimplemented reg-to-reg copy");
2726 }
2727
2728 static void storeRegPairToStackSlot(const TargetRegisterInfo &TRI,
2729 MachineBasicBlock &MBB,
2730 MachineBasicBlock::iterator InsertBefore,
2731 const MCInstrDesc &MCID,
2732 unsigned SrcReg, bool IsKill,
2733 unsigned SubIdx0, unsigned SubIdx1, int FI,
2734 MachineMemOperand *MMO) {
2735 unsigned SrcReg0 = SrcReg;
2736 unsigned SrcReg1 = SrcReg;
2737 if (TargetRegisterInfo::isPhysicalRegister(SrcReg)) {
2738 SrcReg0 = TRI.getSubReg(SrcReg, SubIdx0);
2739 SubIdx0 = 0;
2740 SrcReg1 = TRI.getSubReg(SrcReg, SubIdx1);
2741 SubIdx1 = 0;
2742 }
2743 BuildMI(MBB, InsertBefore, DebugLoc(), MCID)
2744 .addReg(SrcReg0, getKillRegState(IsKill), SubIdx0)
2745 .addReg(SrcReg1, getKillRegState(IsKill), SubIdx1)
2746 .addFrameIndex(FI)
2747 .addImm(0)
2748 .addMemOperand(MMO);
2749 }
2750
2751 void AArch64InstrInfo::storeRegToStackSlot(
2752 MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, unsigned SrcReg,
2753 bool isKill, int FI, const TargetRegisterClass *RC,
2754 const TargetRegisterInfo *TRI) const {
2755 MachineFunction &MF = *MBB.getParent();
2756 MachineFrameInfo &MFI = MF.getFrameInfo();
2757 unsigned Align = MFI.getObjectAlignment(FI);
2758
2759 MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
2760 MachineMemOperand *MMO = MF.getMachineMemOperand(
2761 PtrInfo, MachineMemOperand::MOStore, MFI.getObjectSize(FI), Align);
2762 unsigned Opc = 0;
2763 bool Offset = true;
2764 switch (TRI->getSpillSize(*RC)) {
2765 case 1:
2766 if (AArch64::FPR8RegClass.hasSubClassEq(RC))
2767 Opc = AArch64::STRBui;
2768 break;
2769 case 2:
2770 if (AArch64::FPR16RegClass.hasSubClassEq(RC))
2771 Opc = AArch64::STRHui;
2772 break;
2773 case 4:
2774 if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
2775 Opc = AArch64::STRWui;
2776 if (TargetRegisterInfo::isVirtualRegister(SrcReg))
2777 MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR32RegClass);
2778 else
2779 assert(SrcReg != AArch64::WSP);
2780 } else if (AArch64::FPR32RegClass.hasSubClassEq(RC))
2781 Opc = AArch64::STRSui;
2782 break;
2783 case 8:
2784 if (AArch64::GPR64allRegClass.hasSubClassEq(RC)) {
2785 Opc = AArch64::STRXui;
2786 if (TargetRegisterInfo::isVirtualRegister(SrcReg))
2787 MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR64RegClass);
2788 else
2789 assert(SrcReg != AArch64::SP);
2790 } else if (AArch64::FPR64RegClass.hasSubClassEq(RC)) {
2791 Opc = AArch64::STRDui;
2792 } else if (AArch64::WSeqPairsClassRegClass.hasSubClassEq(RC)) {
2793 storeRegPairToStackSlot(getRegisterInfo(), MBB, MBBI,
2794 get(AArch64::STPWi), SrcReg, isKill,
2795 AArch64::sube32, AArch64::subo32, FI, MMO);
2796 return;
2797 }
2798 break;
2799 case 16:
2800 if (AArch64::FPR128RegClass.hasSubClassEq(RC))
2801 Opc = AArch64::STRQui;
2802 else if (AArch64::DDRegClass.hasSubClassEq(RC)) {
2803 assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
2804 Opc = AArch64::ST1Twov1d;
2805 Offset = false;
2806 } else if (AArch64::XSeqPairsClassRegClass.hasSubClassEq(RC)) {
2807 storeRegPairToStackSlot(getRegisterInfo(), MBB, MBBI,
2808 get(AArch64::STPXi), SrcReg, isKill,
2809 AArch64::sube64, AArch64::subo64, FI, MMO);
2810 return;
2811 }
2812 break;
2813 case 24:
2814 if (AArch64::DDDRegClass.hasSubClassEq(RC)) {
2815 assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
2816 Opc = AArch64::ST1Threev1d;
2817 Offset = false;
2818 }
2819 break;
2820 case 32:
2821 if (AArch64::DDDDRegClass.hasSubClassEq(RC)) {
2822 assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
2823 Opc = AArch64::ST1Fourv1d;
2824 Offset = false;
2825 } else if (AArch64::QQRegClass.hasSubClassEq(RC)) {
2826 assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
2827 Opc = AArch64::ST1Twov2d;
2828 Offset = false;
2829 }
2830 break;
2831 case 48:
2832 if (AArch64::QQQRegClass.hasSubClassEq(RC)) {
2833 assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
2834 Opc = AArch64::ST1Threev2d;
2835 Offset = false;
2836 }
2837 break;
2838 case 64:
2839 if (AArch64::QQQQRegClass.hasSubClassEq(RC)) {
2840 assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
2841 Opc = AArch64::ST1Fourv2d;
2842 Offset = false;
2843 }
2844 break;
2845 }
2846 assert(Opc && "Unknown register class");
2847
2848 const MachineInstrBuilder MI = BuildMI(MBB, MBBI, DebugLoc(), get(Opc))
2849 .addReg(SrcReg, getKillRegState(isKill))
2850 .addFrameIndex(FI);
2851
2852 if (Offset)
2853 MI.addImm(0);
2854 MI.addMemOperand(MMO);
2855 }
2856
2857 static void loadRegPairFromStackSlot(const TargetRegisterInfo &TRI,
2858 MachineBasicBlock &MBB,
2859 MachineBasicBlock::iterator InsertBefore,
2860 const MCInstrDesc &MCID,
2861 unsigned DestReg, unsigned SubIdx0,
2862 unsigned SubIdx1, int FI,
2863 MachineMemOperand *MMO) {
2864 unsigned DestReg0 = DestReg;
2865 unsigned DestReg1 = DestReg;
2866 bool IsUndef = true;
2867 if (TargetRegisterInfo::isPhysicalRegister(DestReg)) {
2868 DestReg0 = TRI.getSubReg(DestReg, SubIdx0);
2869 SubIdx0 = 0;
2870 DestReg1 = TRI.getSubReg(DestReg, SubIdx1);
2871 SubIdx1 = 0;
2872 IsUndef = false;
2873 }
2874 BuildMI(MBB, InsertBefore, DebugLoc(), MCID)
2875 .addReg(DestReg0, RegState::Define | getUndefRegState(IsUndef), SubIdx0)
2876 .addReg(DestReg1, RegState::Define | getUndefRegState(IsUndef), SubIdx1)
2877 .addFrameIndex(FI)
2878 .addImm(0)
2879 .addMemOperand(MMO);
2880 }
2881
2882 void AArch64InstrInfo::loadRegFromStackSlot(
2883 MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, unsigned DestReg,
2884 int FI, const TargetRegisterClass *RC,
2885 const TargetRegisterInfo *TRI) const {
2886 MachineFunction &MF = *MBB.getParent();
2887 MachineFrameInfo &MFI = MF.getFrameInfo();
2888 unsigned Align = MFI.getObjectAlignment(FI);
2889 MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
2890 MachineMemOperand *MMO = MF.getMachineMemOperand(
2891 PtrInfo, MachineMemOperand::MOLoad, MFI.getObjectSize(FI), Align);
2892
2893 unsigned Opc = 0;
2894 bool Offset = true;
2895 switch (TRI->getSpillSize(*RC)) {
2896 case 1:
2897 if (AArch64::FPR8RegClass.hasSubClassEq(RC))
2898 Opc = AArch64::LDRBui;
2899 break;
2900 case 2:
2901 if (AArch64::FPR16RegClass.hasSubClassEq(RC))
2902 Opc = AArch64::LDRHui;
2903 break;
2904 case 4:
2905 if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
2906 Opc = AArch64::LDRWui;
2907 if (TargetRegisterInfo::isVirtualRegister(DestReg))
2908 MF.getRegInfo().constrainRegClass(DestReg, &AArch64::GPR32RegClass);
2909 else
2910 assert(DestReg != AArch64::WSP);
2911 } else if (AArch64::FPR32RegClass.hasSubClassEq(RC))
2912 Opc = AArch64::LDRSui;
2913 break;
2914 case 8:
2915 if (AArch64::GPR64allRegClass.hasSubClassEq(RC)) {
2916 Opc = AArch64::LDRXui;
2917 if (TargetRegisterInfo::isVirtualRegister(DestReg))
2918 MF.getRegInfo().constrainRegClass(DestReg, &AArch64::GPR64RegClass);
2919 else
2920 assert(DestReg != AArch64::SP);
2921 } else if (AArch64::FPR64RegClass.hasSubClassEq(RC)) {
2922 Opc = AArch64::LDRDui;
2923 } else if (AArch64::WSeqPairsClassRegClass.hasSubClassEq(RC)) {
2924 loadRegPairFromStackSlot(getRegisterInfo(), MBB, MBBI,
2925 get(AArch64::LDPWi), DestReg, AArch64::sube32,
2926 AArch64::subo32, FI, MMO);
2927 return;
2928 }
2929 break;
2930 case 16:
2931 if (AArch64::FPR128RegClass.hasSubClassEq(RC))
2932 Opc = AArch64::LDRQui;
2933 else if (AArch64::DDRegClass.hasSubClassEq(RC)) {
2934 assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
2935 Opc = AArch64::LD1Twov1d;
2936 Offset = false;
2937 } else if (AArch64::XSeqPairsClassRegClass.hasSubClassEq(RC)) {
2938 loadRegPairFromStackSlot(getRegisterInfo(), MBB, MBBI,
2939 get(AArch64::LDPXi), DestReg, AArch64::sube64,
2940 AArch64::subo64, FI, MMO);
2941 return;
2942 }
2943 break;
2944 case 24:
2945 if (AArch64::DDDRegClass.hasSubClassEq(RC)) {
2946 assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
2947 Opc = AArch64::LD1Threev1d;
2948 Offset = false;
2949 }
2950 break;
2951 case 32:
2952 if (AArch64::DDDDRegClass.hasSubClassEq(RC)) {
2953 assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
2954 Opc = AArch64::LD1Fourv1d;
2955 Offset = false;
2956 } else if (AArch64::QQRegClass.hasSubClassEq(RC)) {
2957 assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
2958 Opc = AArch64::LD1Twov2d;
2959 Offset = false;
2960 }
2961 break;
2962 case 48:
2963 if (AArch64::QQQRegClass.hasSubClassEq(RC)) {
2964 assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
2965 Opc = AArch64::LD1Threev2d;
2966 Offset = false;
2967 }
2968 break;
2969 case 64:
2970 if (AArch64::QQQQRegClass.hasSubClassEq(RC)) {
2971 assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
2972 Opc = AArch64::LD1Fourv2d;
2973 Offset = false;
2974 }
2975 break;
2976 }
2977 assert(Opc && "Unknown register class");
2978
2979 const MachineInstrBuilder MI = BuildMI(MBB, MBBI, DebugLoc(), get(Opc))
2980 .addReg(DestReg, getDefRegState(true))
2981 .addFrameIndex(FI);
2982 if (Offset)
2983 MI.addImm(0);
2984 MI.addMemOperand(MMO);
2985 }
2986
2987 void llvm::emitFrameOffset(MachineBasicBlock &MBB,
2988 MachineBasicBlock::iterator MBBI, const DebugLoc &DL,
2989 unsigned DestReg, unsigned SrcReg, int Offset,
2990 const TargetInstrInfo *TII,
2991 MachineInstr::MIFlag Flag, bool SetNZCV,
2992 bool NeedsWinCFI, bool *HasWinCFI) {
2993 if (DestReg == SrcReg && Offset == 0)
2994 return;
2995
2996 assert((DestReg != AArch64::SP || Offset % 16 == 0) &&
2997 "SP increment/decrement not 16-byte aligned");
2998
2999 bool isSub = Offset < 0;
3000 if (isSub)
3001 Offset = -Offset;
3002
3003 // FIXME: If the offset won't fit in 24-bits, compute the offset into a
3004 // scratch register. If DestReg is a virtual register, use it as the
3005 // scratch register; otherwise, create a new virtual register (to be
3006 // replaced by the scavenger at the end of PEI). That case can be optimized
3007 // slightly if DestReg is SP which is always 16-byte aligned, so the scratch
3008 // register can be loaded with offset%8 and the add/sub can use an extending
3009 // instruction with LSL#3.
3010   // Currently the function handles arbitrary offsets but may generate a poor
3011   // code sequence for large ones.
3012 // assert(Offset < (1 << 24) && "unimplemented reg plus immediate");
3013
3014 unsigned Opc;
3015 if (SetNZCV)
3016 Opc = isSub ? AArch64::SUBSXri : AArch64::ADDSXri;
3017 else
3018 Opc = isSub ? AArch64::SUBXri : AArch64::ADDXri;
3019 const unsigned MaxEncoding = 0xfff;
3020 const unsigned ShiftSize = 12;
3021 const unsigned MaxEncodableValue = MaxEncoding << ShiftSize;
3022 while (((unsigned)Offset) >= (1 << ShiftSize)) {
3023 unsigned ThisVal;
3024 if (((unsigned)Offset) > MaxEncodableValue) {
3025 ThisVal = MaxEncodableValue;
3026 } else {
3027 ThisVal = Offset & MaxEncodableValue;
3028 }
3029 assert((ThisVal >> ShiftSize) <= MaxEncoding &&
3030 "Encoding cannot handle value that big");
3031 BuildMI(MBB, MBBI, DL, TII->get(Opc), DestReg)
3032 .addReg(SrcReg)
3033 .addImm(ThisVal >> ShiftSize)
3034 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftSize))
3035 .setMIFlag(Flag);
3036
3037 if (NeedsWinCFI && SrcReg == AArch64::SP && DestReg == AArch64::SP) {
3038 if (HasWinCFI)
3039 *HasWinCFI = true;
3040 BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_StackAlloc))
3041 .addImm(ThisVal)
3042 .setMIFlag(Flag);
3043 }
3044
3045 SrcReg = DestReg;
3046 Offset -= ThisVal;
3047 if (Offset == 0)
3048 return;
3049 }
3050 BuildMI(MBB, MBBI, DL, TII->get(Opc), DestReg)
3051 .addReg(SrcReg)
3052 .addImm(Offset)
3053 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0))
3054 .setMIFlag(Flag);
3055
3056 if (NeedsWinCFI) {
3057 if ((DestReg == AArch64::FP && SrcReg == AArch64::SP) ||
3058 (SrcReg == AArch64::FP && DestReg == AArch64::SP)) {
3059 if (HasWinCFI)
3060 *HasWinCFI = true;
3061 if (Offset == 0)
3062 BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_SetFP)).
3063 setMIFlag(Flag);
3064 else
3065 BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_AddFP)).
3066 addImm(Offset).setMIFlag(Flag);
3067 } else if (DestReg == AArch64::SP) {
3068 if (HasWinCFI)
3069 *HasWinCFI = true;
3070 BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_StackAlloc)).
3071 addImm(Offset).setMIFlag(Flag);
3072 }
3073 }
3074 }
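// Worked example (hypothetical): DestReg = x0, SrcReg = x1, Offset = 0x201010
// emits
//   add x0, x1, #0x201, lsl #12   ; adds 0x201000
//   add x0, x0, #0x10             ; adds the remaining 0x10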
3075
3076 MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl(
3077 MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,
3078 MachineBasicBlock::iterator InsertPt, int FrameIndex,
3079 LiveIntervals *LIS, VirtRegMap *VRM) const {
3080 // This is a bit of a hack. Consider this instruction:
3081 //
3082 // %0 = COPY %sp; GPR64all:%0
3083 //
3084 // We explicitly chose GPR64all for the virtual register so such a copy might
3085 // be eliminated by RegisterCoalescer. However, that may not be possible, and
3086 // %0 may even spill. We can't spill %sp, and since it is in the GPR64all
3087 // register class, TargetInstrInfo::foldMemoryOperand() is going to try.
3088 //
3089 // To prevent that, we are going to constrain the %0 register class here.
3090 //
3091 // <rdar://problem/11522048>
3092 //
3093 if (MI.isFullCopy()) {
3094 unsigned DstReg = MI.getOperand(0).getReg();
3095 unsigned SrcReg = MI.getOperand(1).getReg();
3096 if (SrcReg == AArch64::SP &&
3097 TargetRegisterInfo::isVirtualRegister(DstReg)) {
3098 MF.getRegInfo().constrainRegClass(DstReg, &AArch64::GPR64RegClass);
3099 return nullptr;
3100 }
3101 if (DstReg == AArch64::SP &&
3102 TargetRegisterInfo::isVirtualRegister(SrcReg)) {
3103 MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR64RegClass);
3104 return nullptr;
3105 }
3106 }
3107
3108 // Handle the case where a copy is being spilled or filled but the source
3109 // and destination register class don't match. For example:
3110 //
3111 // %0 = COPY %xzr; GPR64common:%0
3112 //
3113 // In this case we can still safely fold away the COPY and generate the
3114 // following spill code:
3115 //
3116 // STRXui %xzr, %stack.0
3117 //
3118 // This also eliminates spilled cross register class COPYs (e.g. between x and
3119 // d regs) of the same size. For example:
3120 //
3121 // %0 = COPY %1; GPR64:%0, FPR64:%1
3122 //
3123 // will be filled as
3124 //
3125 // LDRDui %0, fi<#0>
3126 //
3127 // instead of
3128 //
3129 // LDRXui %Temp, fi<#0>
3130 // %0 = FMOV %Temp
3131 //
3132 if (MI.isCopy() && Ops.size() == 1 &&
3133 // Make sure we're only folding the explicit COPY defs/uses.
3134 (Ops[0] == 0 || Ops[0] == 1)) {
3135 bool IsSpill = Ops[0] == 0;
3136 bool IsFill = !IsSpill;
3137 const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
3138 const MachineRegisterInfo &MRI = MF.getRegInfo();
3139 MachineBasicBlock &MBB = *MI.getParent();
3140 const MachineOperand &DstMO = MI.getOperand(0);
3141 const MachineOperand &SrcMO = MI.getOperand(1);
3142 unsigned DstReg = DstMO.getReg();
3143 unsigned SrcReg = SrcMO.getReg();
3144 // This is slightly expensive to compute for physical regs since
3145 // getMinimalPhysRegClass is slow.
3146 auto getRegClass = [&](unsigned Reg) {
3147 return TargetRegisterInfo::isVirtualRegister(Reg)
3148 ? MRI.getRegClass(Reg)
3149 : TRI.getMinimalPhysRegClass(Reg);
3150 };
3151
3152 if (DstMO.getSubReg() == 0 && SrcMO.getSubReg() == 0) {
3153 assert(TRI.getRegSizeInBits(*getRegClass(DstReg)) ==
3154 TRI.getRegSizeInBits(*getRegClass(SrcReg)) &&
3155 "Mismatched register size in non subreg COPY");
3156 if (IsSpill)
3157 storeRegToStackSlot(MBB, InsertPt, SrcReg, SrcMO.isKill(), FrameIndex,
3158 getRegClass(SrcReg), &TRI);
3159 else
3160 loadRegFromStackSlot(MBB, InsertPt, DstReg, FrameIndex,
3161 getRegClass(DstReg), &TRI);
3162 return &*--InsertPt;
3163 }
3164
3165 // Handle cases like spilling def of:
3166 //
3167 // %0:sub_32<def,read-undef> = COPY %wzr; GPR64common:%0
3168 //
3169 // where the physical register source can be widened and stored to the full
3170 // virtual reg destination stack slot, in this case producing:
3171 //
3172 // STRXui %xzr, %stack.0
3173 //
3174 if (IsSpill && DstMO.isUndef() &&
3175 TargetRegisterInfo::isPhysicalRegister(SrcReg)) {
3176 assert(SrcMO.getSubReg() == 0 &&
3177 "Unexpected subreg on physical register");
3178 const TargetRegisterClass *SpillRC;
3179 unsigned SpillSubreg;
3180 switch (DstMO.getSubReg()) {
3181 default:
3182 SpillRC = nullptr;
3183 break;
3184 case AArch64::sub_32:
3185 case AArch64::ssub:
3186 if (AArch64::GPR32RegClass.contains(SrcReg)) {
3187 SpillRC = &AArch64::GPR64RegClass;
3188 SpillSubreg = AArch64::sub_32;
3189 } else if (AArch64::FPR32RegClass.contains(SrcReg)) {
3190 SpillRC = &AArch64::FPR64RegClass;
3191 SpillSubreg = AArch64::ssub;
3192 } else
3193 SpillRC = nullptr;
3194 break;
3195 case AArch64::dsub:
3196 if (AArch64::FPR64RegClass.contains(SrcReg)) {
3197 SpillRC = &AArch64::FPR128RegClass;
3198 SpillSubreg = AArch64::dsub;
3199 } else
3200 SpillRC = nullptr;
3201 break;
3202 }
3203
3204 if (SpillRC)
3205 if (unsigned WidenedSrcReg =
3206 TRI.getMatchingSuperReg(SrcReg, SpillSubreg, SpillRC)) {
3207 storeRegToStackSlot(MBB, InsertPt, WidenedSrcReg, SrcMO.isKill(),
3208 FrameIndex, SpillRC, &TRI);
3209 return &*--InsertPt;
3210 }
3211 }
3212
3213 // Handle cases like filling use of:
3214 //
3215 // %0:sub_32<def,read-undef> = COPY %1; GPR64:%0, GPR32:%1
3216 //
3217   // where we can load the full virtual reg source stack slot into the subreg
3218   // destination, in this case producing:
3219 //
3220 // LDRWui %0:sub_32<def,read-undef>, %stack.0
3221 //
3222 if (IsFill && SrcMO.getSubReg() == 0 && DstMO.isUndef()) {
3223 const TargetRegisterClass *FillRC;
3224 switch (DstMO.getSubReg()) {
3225 default:
3226 FillRC = nullptr;
3227 break;
3228 case AArch64::sub_32:
3229 FillRC = &AArch64::GPR32RegClass;
3230 break;
3231 case AArch64::ssub:
3232 FillRC = &AArch64::FPR32RegClass;
3233 break;
3234 case AArch64::dsub:
3235 FillRC = &AArch64::FPR64RegClass;
3236 break;
3237 }
3238
3239 if (FillRC) {
3240 assert(TRI.getRegSizeInBits(*getRegClass(SrcReg)) ==
3241 TRI.getRegSizeInBits(*FillRC) &&
3242 "Mismatched regclass size on folded subreg COPY");
3243 loadRegFromStackSlot(MBB, InsertPt, DstReg, FrameIndex, FillRC, &TRI);
3244 MachineInstr &LoadMI = *--InsertPt;
3245 MachineOperand &LoadDst = LoadMI.getOperand(0);
3246 assert(LoadDst.getSubReg() == 0 && "unexpected subreg on fill load");
3247 LoadDst.setSubReg(DstMO.getSubReg());
3248 LoadDst.setIsUndef();
3249 return &LoadMI;
3250 }
3251 }
3252 }
3253
3254 // Cannot fold.
3255 return nullptr;
3256 }
3257
3258 int llvm::isAArch64FrameOffsetLegal(const MachineInstr &MI, int &Offset,
3259 bool *OutUseUnscaledOp,
3260 unsigned *OutUnscaledOp,
3261 int *EmittableOffset) {
3262 // Set output values in case of early exit.
3263 if (EmittableOffset)
3264 *EmittableOffset = 0;
3265 if (OutUseUnscaledOp)
3266 *OutUseUnscaledOp = false;
3267 if (OutUnscaledOp)
3268 *OutUnscaledOp = 0;
3269
3270 // Exit early for structured vector spills/fills as they can't take an
3271 // immediate offset.
3272 switch (MI.getOpcode()) {
3273 default:
3274 break;
3275 case AArch64::LD1Twov2d:
3276 case AArch64::LD1Threev2d:
3277 case AArch64::LD1Fourv2d:
3278 case AArch64::LD1Twov1d:
3279 case AArch64::LD1Threev1d:
3280 case AArch64::LD1Fourv1d:
3281 case AArch64::ST1Twov2d:
3282 case AArch64::ST1Threev2d:
3283 case AArch64::ST1Fourv2d:
3284 case AArch64::ST1Twov1d:
3285 case AArch64::ST1Threev1d:
3286 case AArch64::ST1Fourv1d:
3287 case AArch64::IRG:
3288 case AArch64::IRGstack:
3289 return AArch64FrameOffsetCannotUpdate;
3290 }
3291
3292 // Get the min/max offset and the scale.
3293 unsigned Scale, Width;
3294 int64_t MinOff, MaxOff;
3295 if (!AArch64InstrInfo::getMemOpInfo(MI.getOpcode(), Scale, Width, MinOff,
3296 MaxOff))
3297 llvm_unreachable("unhandled opcode in isAArch64FrameOffsetLegal");
3298
3299 // Construct the complete offset.
3300 const MachineOperand &ImmOpnd =
3301 MI.getOperand(AArch64InstrInfo::getLoadStoreImmIdx(MI.getOpcode()));
3302 Offset += ImmOpnd.getImm() * Scale;
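  // For example, an LDRXui (Scale == 8) carrying an immediate of 2
  // contributes 16 bytes to the running offset here.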
3303
3304   // If the offset doesn't match the scale, or if it is negative, rewrite
3305   // the instruction to use the unscaled variant instead, provided an
3306   // unscaled op exists.
3307 Optional<unsigned> UnscaledOp =
3308 AArch64InstrInfo::getUnscaledLdSt(MI.getOpcode());
3309 bool useUnscaledOp = UnscaledOp && (Offset % Scale || Offset < 0);
3310 if (useUnscaledOp &&
3311 !AArch64InstrInfo::getMemOpInfo(*UnscaledOp, Scale, Width, MinOff, MaxOff))
3312 llvm_unreachable("unhandled opcode in isAArch64FrameOffsetLegal");
3313
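  // The unscaled variants (e.g. LDURXi) report Scale == 1, so when
  // useUnscaledOp is set the remainder computed below is always zero.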
3314 int64_t Remainder = Offset % Scale;
3315 assert(!(Remainder && useUnscaledOp) &&
3316 "Cannot have remainder when using unscaled op");
3317
3318 assert(MinOff < MaxOff && "Unexpected Min/Max offsets");
3319 int NewOffset = Offset / Scale;
3320 if (MinOff <= NewOffset && NewOffset <= MaxOff)
3321 Offset = Remainder;
3322 else {
3323 NewOffset = NewOffset < 0 ? MinOff : MaxOff;
3324 Offset = Offset - NewOffset * Scale + Remainder;
3325 }
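  // For example, Offset == 20 with Scale == 8 yields NewOffset == 2 and
  // leaves Offset == 4 for the caller to materialize separately.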
3326
3327 if (EmittableOffset)
3328 *EmittableOffset = NewOffset;
3329 if (OutUseUnscaledOp)
3330 *OutUseUnscaledOp = useUnscaledOp;
3331 if (OutUnscaledOp && UnscaledOp)
3332 *OutUnscaledOp = *UnscaledOp;
3333
3334 return AArch64FrameOffsetCanUpdate |
3335 (Offset == 0 ? AArch64FrameOffsetIsLegal : 0);
3336 }
3337
3338 bool llvm::rewriteAArch64FrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
3339 unsigned FrameReg, int &Offset,
3340 const AArch64InstrInfo *TII) {
3341 unsigned Opcode = MI.getOpcode();
3342 unsigned ImmIdx = FrameRegIdx + 1;
3343
3344 if (Opcode == AArch64::ADDSXri || Opcode == AArch64::ADDXri) {
3345 Offset += MI.getOperand(ImmIdx).getImm();
3346 emitFrameOffset(*MI.getParent(), MI, MI.getDebugLoc(),
3347 MI.getOperand(0).getReg(), FrameReg, Offset, TII,
3348 MachineInstr::NoFlags, (Opcode == AArch64::ADDSXri));
3349 MI.eraseFromParent();
3350 Offset = 0;
3351 return true;
3352 }
3353
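  // For loads and stores, fold as much of the offset into the addressing
  // mode as it can encode; any residue is left in Offset for the caller.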
3354 int NewOffset;
3355 unsigned UnscaledOp;
3356 bool UseUnscaledOp;
3357 int Status = isAArch64FrameOffsetLegal(MI, Offset, &UseUnscaledOp,
3358 &UnscaledOp, &NewOffset);
3359 if (Status & AArch64FrameOffsetCanUpdate) {
3360 if (Status & AArch64FrameOffsetIsLegal)
3361 // Replace the FrameIndex with FrameReg.
3362 MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false);
3363 if (UseUnscaledOp)
3364 MI.setDesc(TII->get(UnscaledOp));
3365
3366 MI.getOperand(ImmIdx).ChangeToImmediate(NewOffset);
3367 return Offset == 0;
3368 }
3369
3370 return false;
3371 }
3372
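// HINT #0 is the canonical AArch64 NOP encoding.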
3373 void AArch64InstrInfo::getNoop(MCInst &NopInst) const {
3374 NopInst.setOpcode(AArch64::HINT);
3375 NopInst.addOperand(MCOperand::createImm(0));
3376 }
3377
3378 // AArch64 supports MachineCombiner.
3379 bool AArch64InstrInfo::useMachineCombiner() const { return true; }
3380
3381 // True when Opc sets the NZCV flags.
3382 static bool isCombineInstrSettingFlag(unsigned Opc) {
3383 switch (Opc) {
3384 case AArch64::ADDSWrr:
3385 case AArch64::ADDSWri:
3386 case AArch64::ADDSXrr:
3387 case AArch64::ADDSXri:
3388 case AArch64::SUBSWrr:
3389 case AArch64::SUBSXrr:
3390 // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi.
3391 case AArch64::SUBSWri:
3392 case AArch64::SUBSXri:
3393 return true;
3394 default:
3395 break;
3396 }
3397 return false;
3398 }
3399
3400 // 32b Opcodes that can be combined with a MUL
3401 static bool isCombineInstrCandidate32(unsigned Opc) {
3402 switch (Opc) {
3403 case AArch64::ADDWrr:
3404 case AArch64::ADDWri:
3405 case AArch64::SUBWrr:
3406 case AArch64::ADDSWrr:
3407 case AArch64::ADDSWri:
3408 case AArch64::SUBSWrr:
3409 // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi.
3410 case AArch64::SUBWri:
3411 case AArch64::SUBSWri:
3412 return true;
3413 default:
3414 break;
3415 }
3416 return false;
3417 }
3418
3419 // 64b Opcodes that can be combined with a MUL
3420 static bool isCombineInstrCandidate64(unsigned Opc) {
3421 switch (Opc) {
3422 case AArch64::ADDXrr:
3423 case AArch64::ADDXri:
3424 case AArch64::SUBXrr:
3425 case AArch64::ADDSXrr:
3426 case AArch64::ADDSXri:
3427 case AArch64::SUBSXrr:
3428 // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi.
3429 case AArch64::SUBXri:
3430 case AArch64::SUBSXri:
3431 return true;
3432 default:
3433 break;
3434 }
3435 return false;
3436 }
3437
3438 // FP Opcodes that can be combined with a FMUL
3439 static bool isCombineInstrCandidateFP(const MachineInstr &Inst) {
3440 switch (Inst.getOpcode()) {
3441 default:
3442 break;
3443 case AArch64::FADDSrr:
3444 case AArch64::FADDDrr:
3445 case AArch64::FADDv2f32:
3446 case AArch64::FADDv2f64:
3447 case AArch64::FADDv4f32:
3448 case AArch64::FSUBSrr:
3449 case AArch64::FSUBDrr:
3450 case AArch64::FSUBv2f32:
3451 case AArch64::FSUBv2f64:
3452 case AArch64::FSUBv4f32:
3453 TargetOptions Options = Inst.getParent()->getParent()->getTarget().Options;
3454 return (Options.UnsafeFPMath ||
3455 Options.AllowFPOpFusion == FPOpFusion::Fast);
3456 }
3457 return false;
3458 }
3459
3460 // Opcodes that can be combined with a MUL
3461 static bool isCombineInstrCandidate(unsigned Opc) {
3462 return (isCombineInstrCandidate32(Opc) || isCombineInstrCandidate64(Opc));
3463 }
3464
3465 //
3466 // Utility routine that checks if \p MO is defined by a \p CombineOpc
3467 // instruction in the basic block \p MBB.
3468 static bool canCombine(MachineBasicBlock &MBB, MachineOperand &MO,
3469 unsigned CombineOpc, unsigned ZeroReg = 0,
3470 bool CheckZeroReg = false) {
3471 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3472 MachineInstr *MI = nullptr;
3473
3474 if (MO.isReg() && TargetRegisterInfo::isVirtualRegister(MO.getReg()))
3475 MI = MRI.getUniqueVRegDef(MO.getReg());
3476 // And it needs to be in the trace (otherwise, it won't have a depth).
3477 if (!MI || MI->getParent() != &MBB || (unsigned)MI->getOpcode() != CombineOpc)
3478 return false;
3479   // It must only be used by the instruction we combine with.
3480 if (!MRI.hasOneNonDBGUse(MI->getOperand(0).getReg()))
3481 return false;
3482
3483 if (CheckZeroReg) {
3484 assert(MI->getNumOperands() >= 4 && MI->getOperand(0).isReg() &&
3485 MI->getOperand(1).isReg() && MI->getOperand(2).isReg() &&
3486            MI->getOperand(3).isReg() && "MAdd/MSub must have at least 4 regs");
3487 // The third input reg must be zero.
3488 if (MI->getOperand(3).getReg() != ZeroReg)
3489 return false;
3490 }
3491
3492 return true;
3493 }
3494
3495 //
3496 // Is \p MO defined by an integer multiply, and can it be combined?
3497 static bool canCombineWithMUL(MachineBasicBlock &MBB, MachineOperand &MO,
3498 unsigned MulOpc, unsigned ZeroReg) {
3499 return canCombine(MBB, MO, MulOpc, ZeroReg, true);
3500 }
3501
3502 //
3503 // Is \p MO defined by a floating-point multiply, and can it be combined?
3504 static bool canCombineWithFMUL(MachineBasicBlock &MBB, MachineOperand &MO,
3505 unsigned MulOpc) {
3506 return canCombine(MBB, MO, MulOpc);
3507 }
3508
3509 // TODO: There are many more machine instruction opcodes to match:
3510 // 1. Other data types (integer, vectors)
3511 // 2. Other math / logic operations (xor, or)
3512 // 3. Other forms of the same operation (intrinsics and other variants)
3513 bool AArch64InstrInfo::isAssociativeAndCommutative(
3514 const MachineInstr &Inst) const {
3515 switch (Inst.getOpcode()) {
3516 case AArch64::FADDDrr:
3517 case AArch64::FADDSrr:
3518 case AArch64::FADDv2f32:
3519 case AArch64::FADDv2f64:
3520 case AArch64::FADDv4f32:
3521 case AArch64::FMULDrr:
3522 case AArch64::FMULSrr:
3523 case AArch64::FMULX32:
3524 case AArch64::FMULX64:
3525 case AArch64::FMULXv2f32:
3526 case AArch64::FMULXv2f64:
3527 case AArch64::FMULXv4f32:
3528 case AArch64::FMULv2f32:
3529 case AArch64::FMULv2f64:
3530 case AArch64::FMULv4f32:
3531 return Inst.getParent()->getParent()->getTarget().Options.UnsafeFPMath;
3532 default:
3533 return false;
3534 }
3535 }
3536
3537 /// Find instructions that can be turned into madd.
3538 static bool getMaddPatterns(MachineInstr &Root,
3539 SmallVectorImpl<MachineCombinerPattern> &Patterns) {
3540 unsigned Opc = Root.getOpcode();
3541 MachineBasicBlock &MBB = *Root.getParent();
3542 bool Found = false;
3543
3544 if (!isCombineInstrCandidate(Opc))
3545 return false;
3546 if (isCombineInstrSettingFlag(Opc)) {
3547 int Cmp_NZCV = Root.findRegisterDefOperandIdx(AArch64::NZCV, true);
3548     // When NZCV is live, bail out.
3549 if (Cmp_NZCV == -1)
3550 return false;
3551 unsigned NewOpc = convertToNonFlagSettingOpc(Root);
3552     // When the opcode can't change, bail out.
3553 // CHECKME: do we miss any cases for opcode conversion?
3554 if (NewOpc == Opc)
3555 return false;
3556 Opc = NewOpc;
3557 }
3558
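  // From here on, Opc is the equivalent non-flag-setting opcode, so the
  // cases below handle both the ADD/SUB and ADDS/SUBS roots.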
3559 switch (Opc) {
3560 default:
3561 break;
3562 case AArch64::ADDWrr:
3563 assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() &&
3564 "ADDWrr does not have register operands");
3565 if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDWrrr,
3566 AArch64::WZR)) {
3567 Patterns.push_back(MachineCombinerPattern::MULADDW_OP1);
3568 Found = true;
3569 }
3570 if (canCombineWithMUL(MBB, Root.getOperand(2), AArch64::MADDWrrr,
3571 AArch64::WZR)) {
3572 Patterns.push_back(MachineCombinerPattern::MULADDW_OP2);
3573 Found = true;
3574 }
3575 break;
3576 case AArch64::ADDXrr:
3577 if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDXrrr,
3578 AArch64::XZR)) {
3579 Patterns.push_back(MachineCombinerPattern::MULADDX_OP1);
3580 Found = true;
3581 }
3582 if (canCombineWithMUL(MBB, Root.getOperand(2), AArch64::MADDXrrr,
3583 AArch64::XZR)) {
3584 Patterns.push_back(MachineCombinerPattern::MULADDX_OP2);
3585 Found = true;
3586 }
3587 break;
3588 case AArch64::SUBWrr:
3589 if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDWrrr,
3590 AArch64::WZR)) {
3591 Patterns.push_back(MachineCombinerPattern::MULSUBW_OP1);
3592 Found = true;
3593 }
3594 if (canCombineWithMUL(MBB, Root.getOperand(2), AArch64::MADDWrrr,
3595 AArch64::WZR)) {
3596 Patterns.push_back(MachineCombinerPattern::MULSUBW_OP2);
3597 Found = true;
3598 }
3599 break;
3600 case AArch64::SUBXrr:
3601 if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDXrrr,
3602 AArch64::XZR)) {
3603 Patterns.push_back(MachineCombinerPattern::MULSUBX_OP1);
3604 Found = true;
3605 }
3606 if (canCombineWithMUL(MBB, Root.getOperand(2), AArch64::MADDXrrr,
3607 AArch64::XZR)) {
3608 Patterns.push_back(MachineCombinerPattern::MULSUBX_OP2);
3609 Found = true;
3610 }
3611 break;
3612 case AArch64::ADDWri:
3613 if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDWrrr,
3614 AArch64::WZR)) {
3615 Patterns.push_back(MachineCombinerPattern::MULADDWI_OP1);
3616 Found = true;
3617 }
3618 break;
3619 case AArch64::ADDXri:
3620 if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDXrrr,
3621 AArch64::XZR)) {
3622 Patterns.push_back(MachineCombinerPattern::MULADDXI_OP1);
3623 Found = true;
3624 }
3625 break;
3626 case AArch64::SUBWri:
3627 if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDWrrr,
3628 AArch64::WZR)) {
3629 Patterns.push_back(MachineCombinerPattern::MULSUBWI_OP1);
3630 Found = true;
3631 }
3632 break;
3633 case AArch64::SUBXri:
3634 if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDXrrr,
3635 AArch64::XZR)) {
3636 Patterns.push_back(MachineCombinerPattern::MULSUBXI_OP1);
3637 Found = true;
3638 }
3639 break;
3640 }
3641 return Found;
3642 }
3643 /// Floating-Point Support
3644
3645 /// Find instructions that can be turned into a fused multiply-add/subtract.
3646 static bool getFMAPatterns(MachineInstr &Root,
3647 SmallVectorImpl<MachineCombinerPattern> &Patterns) {
3648
3649 if (!isCombineInstrCandidateFP(Root))
3650 return false;
3651
3652 MachineBasicBlock &MBB = *Root.getParent();
3653 bool Found = false;
3654
3655 switch (Root.getOpcode()) {
3656 default:
3657 assert(false && "Unsupported FP instruction in combiner\n");
3658 break;
3659 case AArch64::FADDSrr:
3660 assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() &&
3661            "FADDSrr does not have register operands");
3662 if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FMULSrr)) {
3663 Patterns.push_back(MachineCombinerPattern::FMULADDS_OP1);
3664 Found = true;
3665 } else if (canCombineWithFMUL(MBB, Root.getOperand(1),
3666 AArch64::FMULv1i32_indexed)) {
3667 Patterns.push_back(MachineCombinerPattern::FMLAv1i32_indexed_OP1);
3668 Found = true;
3669 }
3670 if (canCombineWithFMUL(MBB, Root.getOperand(2), AArch64::FMULSrr)) {
3671 Patterns.push_back(MachineCombinerPattern::FMULADDS_OP2);
3672 Found = true;
3673 } else if (canCombineWithFMUL(MBB, Root.getOperand(2),
3674 AArch64::FMULv1i32_indexed)) {
3675 Patterns.push_back(MachineCombinerPattern::FMLAv1i32_indexed_OP2);
3676 Found = true;
3677 }
3678 break;
3679 case AArch64::FADDDrr:
3680 if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FMULDrr)) {
3681 Patterns.push_back(MachineCombinerPattern::FMULADDD_OP1);
3682 Found = true;
3683 } else if (canCombineWithFMUL(MBB, Root.getOperand(1),
3684 AArch64::FMULv1i64_indexed)) {
3685 Patterns.push_back(MachineCombinerPattern::FMLAv1i64_indexed_OP1);
3686 Found = true;
3687 }
3688 if (canCombineWithFMUL(MBB, Root.getOperand(2), AArch64::FMULDrr)) {
3689 Patterns.push_back(MachineCombinerPattern::FMULADDD_OP2);
3690 Found = true;
3691 } else if (canCombineWithFMUL(MBB, Root.getOperand(2),
3692 AArch64::FMULv1i64_indexed)) {
3693 Patterns.push_back(MachineCombinerPattern::FMLAv1i64_indexed_OP2);
3694 Found = true;
3695 }
3696 break;
3697 case AArch64::FADDv2f32:
3698 if (canCombineWithFMUL(MBB, Root.getOperand(1),
3699 AArch64::FMULv2i32_indexed)) {
3700 Patterns.push_back(MachineCombinerPattern::FMLAv2i32_indexed_OP1);
3701 Found = true;
3702 } else if (canCombineWithFMUL(MBB, Root.getOperand(1),
3703 AArch64::FMULv2f32)) {
3704 Patterns.push_back(MachineCombinerPattern::FMLAv2f32_OP1);
3705 Found = true;
3706 }
3707 if (canCombineWithFMUL(MBB, Root.getOperand(2),
3708 AArch64::FMULv2i32_indexed)) {
3709 Patterns.push_back(MachineCombinerPattern::FMLAv2i32_indexed_OP2);
3710 Found = true;
3711 } else if (canCombineWithFMUL(MBB, Root.getOperand(2),
3712 AArch64::FMULv2f32)) {
3713 Patterns.push_back(MachineCombinerPattern::FMLAv2f32_OP2);
3714 Found = true;
3715 }
3716 break;
3717 case AArch64::FADDv2f64:
3718 if (canCombineWithFMUL(MBB, Root.getOperand(1),
3719 AArch64::FMULv2i64_indexed)) {
3720 Patterns.push_back(MachineCombinerPattern::FMLAv2i64_indexed_OP1);
3721 Found = true;
3722 } else if (canCombineWithFMUL(MBB, Root.getOperand(1),
3723 AArch64::FMULv2f64)) {
3724 Patterns.push_back(MachineCombinerPattern::FMLAv2f64_OP1);
3725 Found = true;
3726 }
3727 if (canCombineWithFMUL(MBB, Root.getOperand(2),
3728 AArch64::FMULv2i64_indexed)) {
3729 Patterns.push_back(MachineCombinerPattern::FMLAv2i64_indexed_OP2);
3730 Found = true;
3731 } else if (canCombineWithFMUL(MBB, Root.getOperand(2),
3732 AArch64::FMULv2f64)) {
3733 Patterns.push_back(MachineCombinerPattern::FMLAv2f64_OP2);
3734 Found = true;
3735 }
3736 break;
3737 case AArch64::FADDv4f32:
3738 if (canCombineWithFMUL(MBB, Root.getOperand(1),
3739 AArch64::FMULv4i32_indexed)) {
3740 Patterns.push_back(MachineCombinerPattern::FMLAv4i32_indexed_OP1);
3741 Found = true;
3742 } else if (canCombineWithFMUL(MBB, Root.getOperand(1),
3743 AArch64::FMULv4f32)) {
3744 Patterns.push_back(MachineCombinerPattern::FMLAv4f32_OP1);
3745 Found = true;
3746 }
3747 if (canCombineWithFMUL(MBB, Root.getOperand(2),
3748 AArch64::FMULv4i32_indexed)) {
3749 Patterns.push_back(MachineCombinerPattern::FMLAv4i32_indexed_OP2);
3750 Found = true;
3751 } else if (canCombineWithFMUL(MBB, Root.getOperand(2),
3752 AArch64::FMULv4f32)) {
3753 Patterns.push_back(MachineCombinerPattern::FMLAv4f32_OP2);
3754 Found = true;
3755 }
3756 break;
3757
3758 case AArch64::FSUBSrr:
3759 if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FMULSrr)) {
3760 Patterns.push_back(MachineCombinerPattern::FMULSUBS_OP1);
3761 Found = true;
3762 }
3763 if (canCombineWithFMUL(MBB, Root.getOperand(2), AArch64::FMULSrr)) {
3764 Patterns.push_back(MachineCombinerPattern::FMULSUBS_OP2);
3765 Found = true;
3766 } else if (canCombineWithFMUL(MBB, Root.getOperand(2),
3767 AArch64::FMULv1i32_indexed)) {
3768 Patterns.push_back(MachineCombinerPattern::FMLSv1i32_indexed_OP2);
3769 Found = true;
3770 }
3771 if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FNMULSrr)) {
3772 Patterns.push_back(MachineCombinerPattern::FNMULSUBS_OP1);
3773 Found = true;
3774 }
3775 break;
3776 case AArch64::FSUBDrr:
3777 if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FMULDrr)) {
3778 Patterns.push_back(MachineCombinerPattern::FMULSUBD_OP1);
3779 Found = true;
3780 }
3781 if (canCombineWithFMUL(MBB, Root.getOperand(2), AArch64::FMULDrr)) {
3782 Patterns.push_back(MachineCombinerPattern::FMULSUBD_OP2);
3783 Found = true;
3784 } else if (canCombineWithFMUL(MBB, Root.getOperand(2),
3785 AArch64::FMULv1i64_indexed)) {
3786 Patterns.push_back(MachineCombinerPattern::FMLSv1i64_indexed_OP2);
3787 Found = true;
3788 }
3789 if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FNMULDrr)) {
3790 Patterns.push_back(MachineCombinerPattern::FNMULSUBD_OP1);
3791 Found = true;
3792 }
3793 break;
3794 case AArch64::FSUBv2f32:
3795 if (canCombineWithFMUL(MBB, Root.getOperand(2),
3796 AArch64::FMULv2i32_indexed)) {
3797 Patterns.push_back(MachineCombinerPattern::FMLSv2i32_indexed_OP2);
3798 Found = true;
3799 } else if (canCombineWithFMUL(MBB, Root.getOperand(2),
3800 AArch64::FMULv2f32)) {
3801 Patterns.push_back(MachineCombinerPattern::FMLSv2f32_OP2);
3802 Found = true;
3803 }
3804 if (canCombineWithFMUL(MBB, Root.getOperand(1),
3805 AArch64::FMULv2i32_indexed)) {
3806 Patterns.push_back(MachineCombinerPattern::FMLSv2i32_indexed_OP1);
3807 Found = true;
3808 } else if (canCombineWithFMUL(MBB, Root.getOperand(1),
3809 AArch64::FMULv2f32)) {
3810 Patterns.push_back(MachineCombinerPattern::FMLSv2f32_OP1);
3811 Found = true;
3812 }
3813 break;
3814 case AArch64::FSUBv2f64:
3815 if (canCombineWithFMUL(MBB, Root.getOperand(2),
3816 AArch64::FMULv2i64_indexed)) {
3817 Patterns.push_back(MachineCombinerPattern::FMLSv2i64_indexed_OP2);
3818 Found = true;
3819 } else if (canCombineWithFMUL(MBB, Root.getOperand(2),
3820 AArch64::FMULv2f64)) {
3821 Patterns.push_back(MachineCombinerPattern::FMLSv2f64_OP2);
3822 Found = true;
3823 }
3824 if (canCombineWithFMUL(MBB, Root.getOperand(1),
3825 AArch64::FMULv2i64_indexed)) {
3826 Patterns.push_back(MachineCombinerPattern::FMLSv2i64_indexed_OP1);
3827 Found = true;
3828 } else if (canCombineWithFMUL(MBB, Root.getOperand(1),
3829 AArch64::FMULv2f64)) {
3830 Patterns.push_back(MachineCombinerPattern::FMLSv2f64_OP1);
3831 Found = true;
3832 }
3833 break;
3834 case AArch64::FSUBv4f32:
3835 if (canCombineWithFMUL(MBB, Root.getOperand(2),
3836 AArch64::FMULv4i32_indexed)) {
3837 Patterns.push_back(MachineCombinerPattern::FMLSv4i32_indexed_OP2);
3838 Found = true;
3839 } else if (canCombineWithFMUL(MBB, Root.getOperand(2),
3840 AArch64::FMULv4f32)) {
3841 Patterns.push_back(MachineCombinerPattern::FMLSv4f32_OP2);
3842 Found = true;
3843 }
3844 if (canCombineWithFMUL(MBB, Root.getOperand(1),
3845 AArch64::FMULv4i32_indexed)) {
3846 Patterns.push_back(MachineCombinerPattern::FMLSv4i32_indexed_OP1);
3847 Found = true;
3848 } else if (canCombineWithFMUL(MBB, Root.getOperand(1),
3849 AArch64::FMULv4f32)) {
3850 Patterns.push_back(MachineCombinerPattern::FMLSv4f32_OP1);
3851 Found = true;
3852 }
3853 break;
3854 }
3855 return Found;
3856 }
3857
3858 /// Return true when a code sequence can improve throughput. It
3859 /// should be called only for instructions in loops.
3860 /// \param Pattern - combiner pattern
3861 bool AArch64InstrInfo::isThroughputPattern(
3862 MachineCombinerPattern Pattern) const {
3863 switch (Pattern) {
3864 default:
3865 break;
3866 case MachineCombinerPattern::FMULADDS_OP1:
3867 case MachineCombinerPattern::FMULADDS_OP2:
3868 case MachineCombinerPattern::FMULSUBS_OP1:
3869 case MachineCombinerPattern::FMULSUBS_OP2:
3870 case MachineCombinerPattern::FMULADDD_OP1:
3871 case MachineCombinerPattern::FMULADDD_OP2:
3872 case MachineCombinerPattern::FMULSUBD_OP1:
3873 case MachineCombinerPattern::FMULSUBD_OP2:
3874 case MachineCombinerPattern::FNMULSUBS_OP1:
3875 case MachineCombinerPattern::FNMULSUBD_OP1:
3876 case MachineCombinerPattern::FMLAv1i32_indexed_OP1:
3877 case MachineCombinerPattern::FMLAv1i32_indexed_OP2:
3878 case MachineCombinerPattern::FMLAv1i64_indexed_OP1:
3879 case MachineCombinerPattern::FMLAv1i64_indexed_OP2:
3880 case MachineCombinerPattern::FMLAv2f32_OP2:
3881 case MachineCombinerPattern::FMLAv2f32_OP1:
3882 case MachineCombinerPattern::FMLAv2f64_OP1:
3883 case MachineCombinerPattern::FMLAv2f64_OP2:
3884 case MachineCombinerPattern::FMLAv2i32_indexed_OP1:
3885 case MachineCombinerPattern::FMLAv2i32_indexed_OP2:
3886 case MachineCombinerPattern::FMLAv2i64_indexed_OP1:
3887 case MachineCombinerPattern::FMLAv2i64_indexed_OP2:
3888 case MachineCombinerPattern::FMLAv4f32_OP1:
3889 case MachineCombinerPattern::FMLAv4f32_OP2:
3890 case MachineCombinerPattern::FMLAv4i32_indexed_OP1:
3891 case MachineCombinerPattern::FMLAv4i32_indexed_OP2:
3892 case MachineCombinerPattern::FMLSv1i32_indexed_OP2:
3893 case MachineCombinerPattern::FMLSv1i64_indexed_OP2:
3894 case MachineCombinerPattern::FMLSv2i32_indexed_OP2:
3895 case MachineCombinerPattern::FMLSv2i64_indexed_OP2:
3896 case MachineCombinerPattern::FMLSv2f32_OP2:
3897 case MachineCombinerPattern::FMLSv2f64_OP2:
3898 case MachineCombinerPattern::FMLSv4i32_indexed_OP2:
3899 case MachineCombinerPattern::FMLSv4f32_OP2:
3900 return true;
3901 } // end switch (Pattern)
3902 return false;
3903 }
3904 /// Return true when there is potentially a faster code sequence for an
3905 /// instruction chain ending in \p Root. All potential patterns are listed in
3906 /// the \p Patterns vector. Patterns should be sorted in priority order since the
3907 /// pattern evaluator stops checking as soon as it finds a faster sequence.
3908
3909 bool AArch64InstrInfo::getMachineCombinerPatterns(
3910 MachineInstr &Root,
3911 SmallVectorImpl<MachineCombinerPattern> &Patterns) const {
3912 // Integer patterns
3913 if (getMaddPatterns(Root, Patterns))
3914 return true;
3915 // Floating point patterns
3916 if (getFMAPatterns(Root, Patterns))
3917 return true;
3918
3919 return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns);
3920 }
3921
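// How genFusedMultiply should order operands when building the fused op:
// Default places the accumulator last (scalar MADD/FMADD form), while
// Indexed and Accumulator place it first, with Indexed also copying the
// lane immediate from the original multiply.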
3922 enum class FMAInstKind { Default, Indexed, Accumulator };
3923 /// genFusedMultiply - Generate fused multiply instructions.
3924 /// This function supports both integer and floating point instructions.
3925 /// A typical example:
3926 /// F|MUL I=A,B,0
3927 /// F|ADD R,I,C
3928 /// ==> F|MADD R,A,B,C
3929 /// \param MF Containing MachineFunction
3930 /// \param MRI Register information
3931 /// \param TII Target information
3932 /// \param Root is the F|ADD instruction
3933 /// \param [out] InsInstrs is a vector of machine instructions and will
3934 /// contain the generated madd instruction
3935 /// \param IdxMulOpd is index of operand in Root that is the result of
3936 /// the F|MUL. In the example above IdxMulOpd is 1.
3937 /// \param MaddOpc the opcode of the f|madd instruction
3938 /// \param RC Register class of operands
3939 /// \param kind the kind of FMA instruction (addressing mode) to generate
3940 /// \param ReplacedAddend is the result register from the instruction
3941 /// replacing the non-combined operand, if any.
3942 static MachineInstr *
3943 genFusedMultiply(MachineFunction &MF, MachineRegisterInfo &MRI,
3944 const TargetInstrInfo *TII, MachineInstr &Root,
3945 SmallVectorImpl<MachineInstr *> &InsInstrs, unsigned IdxMulOpd,
3946 unsigned MaddOpc, const TargetRegisterClass *RC,
3947 FMAInstKind kind = FMAInstKind::Default,
3948 const unsigned *ReplacedAddend = nullptr) {
3949 assert(IdxMulOpd == 1 || IdxMulOpd == 2);
3950
3951 unsigned IdxOtherOpd = IdxMulOpd == 1 ? 2 : 1;
3952 MachineInstr *MUL = MRI.getUniqueVRegDef(Root.getOperand(IdxMulOpd).getReg());
3953 unsigned ResultReg = Root.getOperand(0).getReg();
3954 unsigned SrcReg0 = MUL->getOperand(1).getReg();
3955 bool Src0IsKill = MUL->getOperand(1).isKill();
3956 unsigned SrcReg1 = MUL->getOperand(2).getReg();
3957 bool Src1IsKill = MUL->getOperand(2).isKill();
3958
3959 unsigned SrcReg2;
3960 bool Src2IsKill;
3961 if (ReplacedAddend) {
3962     // If we just generated a new addend, we must be its only use.
3963 SrcReg2 = *ReplacedAddend;
3964 Src2IsKill = true;
3965 } else {
3966 SrcReg2 = Root.getOperand(IdxOtherOpd).getReg();
3967 Src2IsKill = Root.getOperand(IdxOtherOpd).isKill();
3968 }
3969
3970 if (TargetRegisterInfo::isVirtualRegister(ResultReg))
3971 MRI.constrainRegClass(ResultReg, RC);
3972 if (TargetRegisterInfo::isVirtualRegister(SrcReg0))
3973 MRI.constrainRegClass(SrcReg0, RC);
3974 if (TargetRegisterInfo::isVirtualRegister(SrcReg1))
3975 MRI.constrainRegClass(SrcReg1, RC);
3976 if (TargetRegisterInfo::isVirtualRegister(SrcReg2))
3977 MRI.constrainRegClass(SrcReg2, RC);
3978
3979 MachineInstrBuilder MIB;
3980 if (kind == FMAInstKind::Default)
3981 MIB = BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc), ResultReg)
3982 .addReg(SrcReg0, getKillRegState(Src0IsKill))
3983 .addReg(SrcReg1, getKillRegState(Src1IsKill))
3984 .addReg(SrcReg2, getKillRegState(Src2IsKill));
3985 else if (kind == FMAInstKind::Indexed)
3986 MIB = BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc), ResultReg)
3987 .addReg(SrcReg2, getKillRegState(Src2IsKill))
3988 .addReg(SrcReg0, getKillRegState(Src0IsKill))
3989 .addReg(SrcReg1, getKillRegState(Src1IsKill))
3990 .addImm(MUL->getOperand(3).getImm());
3991 else if (kind == FMAInstKind::Accumulator)
3992 MIB = BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc), ResultReg)
3993 .addReg(SrcReg2, getKillRegState(Src2IsKill))
3994 .addReg(SrcReg0, getKillRegState(Src0IsKill))
3995 .addReg(SrcReg1, getKillRegState(Src1IsKill));
3996 else
3997 assert(false && "Invalid FMA instruction kind \n");
3998   // Insert the MADD (MADD, FMADD, FMSUB, FMLA, FMLS).
3999 InsInstrs.push_back(MIB);
4000 return MUL;
4001 }
4002
4003 /// genMaddR - Generate madd instruction and combine mul and add using
4004 /// an extra virtual register
4005 /// Example - an ADD intermediate needs to be stored in a register:
4006 /// MUL I=A,B,0
4007 /// ADD R,I,Imm
4008 /// ==> ORR V, ZR, Imm
4009 /// ==> MADD R,A,B,V
4010 /// \param MF Containing MachineFunction
4011 /// \param MRI Register information
4012 /// \param TII Target information
4013 /// \param Root is the ADD instruction
4014 /// \param [out] InsInstrs is a vector of machine instructions and will
4015 /// contain the generated madd instruction
4016 /// \param IdxMulOpd is index of operand in Root that is the result of
4017 /// the MUL. In the example above IdxMulOpd is 1.
4018 /// \param MaddOpc the opcode of the madd instruction
4019 /// \param VR is a virtual register that holds the value of an ADD operand
4020 /// (V in the example above).
4021 /// \param RC Register class of operands
4022 static MachineInstr *genMaddR(MachineFunction &MF, MachineRegisterInfo &MRI,
4023 const TargetInstrInfo *TII, MachineInstr &Root,
4024 SmallVectorImpl<MachineInstr *> &InsInstrs,
4025 unsigned IdxMulOpd, unsigned MaddOpc, unsigned VR,
4026 const TargetRegisterClass *RC) {
4027 assert(IdxMulOpd == 1 || IdxMulOpd == 2);
4028
4029 MachineInstr *MUL = MRI.getUniqueVRegDef(Root.getOperand(IdxMulOpd).getReg());
4030 unsigned ResultReg = Root.getOperand(0).getReg();
4031 unsigned SrcReg0 = MUL->getOperand(1).getReg();
4032 bool Src0IsKill = MUL->getOperand(1).isKill();
4033 unsigned SrcReg1 = MUL->getOperand(2).getReg();
4034 bool Src1IsKill = MUL->getOperand(2).isKill();
4035
4036 if (TargetRegisterInfo::isVirtualRegister(ResultReg))
4037 MRI.constrainRegClass(ResultReg, RC);
4038 if (TargetRegisterInfo::isVirtualRegister(SrcReg0))
4039 MRI.constrainRegClass(SrcReg0, RC);
4040 if (TargetRegisterInfo::isVirtualRegister(SrcReg1))
4041 MRI.constrainRegClass(SrcReg1, RC);
4042 if (TargetRegisterInfo::isVirtualRegister(VR))
4043 MRI.constrainRegClass(VR, RC);
4044
4045 MachineInstrBuilder MIB =
4046 BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc), ResultReg)
4047 .addReg(SrcReg0, getKillRegState(Src0IsKill))
4048 .addReg(SrcReg1, getKillRegState(Src1IsKill))
4049 .addReg(VR);
4050 // Insert the MADD
4051 InsInstrs.push_back(MIB);
4052 return MUL;
4053 }
4054
4055 /// When getMachineCombinerPatterns() finds potential patterns,
4056 /// this function generates the instructions that could replace the
4057 /// original code sequence
4058 void AArch64InstrInfo::genAlternativeCodeSequence(
4059 MachineInstr &Root, MachineCombinerPattern Pattern,
4060 SmallVectorImpl<MachineInstr *> &InsInstrs,
4061 SmallVectorImpl<MachineInstr *> &DelInstrs,
4062 DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) const {
4063 MachineBasicBlock &MBB = *Root.getParent();
4064 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
4065 MachineFunction &MF = *MBB.getParent();
4066 const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
4067
4068   MachineInstr *MUL = nullptr;
4069 const TargetRegisterClass *RC;
4070 unsigned Opc;
4071 switch (Pattern) {
4072 default:
4073 // Reassociate instructions.
4074 TargetInstrInfo::genAlternativeCodeSequence(Root, Pattern, InsInstrs,
4075 DelInstrs, InstrIdxForVirtReg);
4076 return;
4077 case MachineCombinerPattern::MULADDW_OP1:
4078 case MachineCombinerPattern::MULADDX_OP1:
4079 // MUL I=A,B,0
4080 // ADD R,I,C
4081 // ==> MADD R,A,B,C
4082 // --- Create(MADD);
4083 if (Pattern == MachineCombinerPattern::MULADDW_OP1) {
4084 Opc = AArch64::MADDWrrr;
4085 RC = &AArch64::GPR32RegClass;
4086 } else {
4087 Opc = AArch64::MADDXrrr;
4088 RC = &AArch64::GPR64RegClass;
4089 }
4090 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
4091 break;
4092 case MachineCombinerPattern::MULADDW_OP2:
4093 case MachineCombinerPattern::MULADDX_OP2:
4094 // MUL I=A,B,0
4095 // ADD R,C,I
4096 // ==> MADD R,A,B,C
4097 // --- Create(MADD);
4098 if (Pattern == MachineCombinerPattern::MULADDW_OP2) {
4099 Opc = AArch64::MADDWrrr;
4100 RC = &AArch64::GPR32RegClass;
4101 } else {
4102 Opc = AArch64::MADDXrrr;
4103 RC = &AArch64::GPR64RegClass;
4104 }
4105 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
4106 break;
4107 case MachineCombinerPattern::MULADDWI_OP1:
4108 case MachineCombinerPattern::MULADDXI_OP1: {
4109 // MUL I=A,B,0
4110 // ADD R,I,Imm
4111 // ==> ORR V, ZR, Imm
4112 // ==> MADD R,A,B,V
4113 // --- Create(MADD);
4114 const TargetRegisterClass *OrrRC;
4115 unsigned BitSize, OrrOpc, ZeroReg;
4116 if (Pattern == MachineCombinerPattern::MULADDWI_OP1) {
4117 OrrOpc = AArch64::ORRWri;
4118 OrrRC = &AArch64::GPR32spRegClass;
4119 BitSize = 32;
4120 ZeroReg = AArch64::WZR;
4121 Opc = AArch64::MADDWrrr;
4122 RC = &AArch64::GPR32RegClass;
4123 } else {
4124 OrrOpc = AArch64::ORRXri;
4125 OrrRC = &AArch64::GPR64spRegClass;
4126 BitSize = 64;
4127 ZeroReg = AArch64::XZR;
4128 Opc = AArch64::MADDXrrr;
4129 RC = &AArch64::GPR64RegClass;
4130 }
4131 unsigned NewVR = MRI.createVirtualRegister(OrrRC);
4132 uint64_t Imm = Root.getOperand(2).getImm();
4133
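    // Operand 3, when present, is the shift applied to the ADD/SUB
    // immediate (LSL #0 or LSL #12); fold it in before encoding.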
4134 if (Root.getOperand(3).isImm()) {
4135 unsigned Val = Root.getOperand(3).getImm();
4136 Imm = Imm << Val;
4137 }
4138 uint64_t UImm = SignExtend64(Imm, BitSize);
4139 uint64_t Encoding;
4140 if (AArch64_AM::processLogicalImmediate(UImm, BitSize, Encoding)) {
4141 MachineInstrBuilder MIB1 =
4142 BuildMI(MF, Root.getDebugLoc(), TII->get(OrrOpc), NewVR)
4143 .addReg(ZeroReg)
4144 .addImm(Encoding);
4145 InsInstrs.push_back(MIB1);
4146 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
4147 MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC);
4148 }
4149 break;
4150 }
4151 case MachineCombinerPattern::MULSUBW_OP1:
4152 case MachineCombinerPattern::MULSUBX_OP1: {
4153 // MUL I=A,B,0
4154 // SUB R,I, C
4155 // ==> SUB V, 0, C
4156 // ==> MADD R,A,B,V // = -C + A*B
4157 // --- Create(MADD);
4158 const TargetRegisterClass *SubRC;
4159 unsigned SubOpc, ZeroReg;
4160 if (Pattern == MachineCombinerPattern::MULSUBW_OP1) {
4161 SubOpc = AArch64::SUBWrr;
4162 SubRC = &AArch64::GPR32spRegClass;
4163 ZeroReg = AArch64::WZR;
4164 Opc = AArch64::MADDWrrr;
4165 RC = &AArch64::GPR32RegClass;
4166 } else {
4167 SubOpc = AArch64::SUBXrr;
4168 SubRC = &AArch64::GPR64spRegClass;
4169 ZeroReg = AArch64::XZR;
4170 Opc = AArch64::MADDXrrr;
4171 RC = &AArch64::GPR64RegClass;
4172 }
4173 unsigned NewVR = MRI.createVirtualRegister(SubRC);
4174 // SUB NewVR, 0, C
4175 MachineInstrBuilder MIB1 =
4176 BuildMI(MF, Root.getDebugLoc(), TII->get(SubOpc), NewVR)
4177 .addReg(ZeroReg)
4178 .add(Root.getOperand(2));
4179 InsInstrs.push_back(MIB1);
4180 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
4181 MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC);
4182 break;
4183 }
4184 case MachineCombinerPattern::MULSUBW_OP2:
4185 case MachineCombinerPattern::MULSUBX_OP2:
4186 // MUL I=A,B,0
4187 // SUB R,C,I
4188 // ==> MSUB R,A,B,C (computes C - A*B)
4189 // --- Create(MSUB);
4190 if (Pattern == MachineCombinerPattern::MULSUBW_OP2) {
4191 Opc = AArch64::MSUBWrrr;
4192 RC = &AArch64::GPR32RegClass;
4193 } else {
4194 Opc = AArch64::MSUBXrrr;
4195 RC = &AArch64::GPR64RegClass;
4196 }
4197 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
4198 break;
4199 case MachineCombinerPattern::MULSUBWI_OP1:
4200 case MachineCombinerPattern::MULSUBXI_OP1: {
4201 // MUL I=A,B,0
4202 // SUB R,I, Imm
4203 // ==> ORR V, ZR, -Imm
4204 // ==> MADD R,A,B,V // = -Imm + A*B
4205 // --- Create(MADD);
4206 const TargetRegisterClass *OrrRC;
4207 unsigned BitSize, OrrOpc, ZeroReg;
4208 if (Pattern == MachineCombinerPattern::MULSUBWI_OP1) {
4209 OrrOpc = AArch64::ORRWri;
4210 OrrRC = &AArch64::GPR32spRegClass;
4211 BitSize = 32;
4212 ZeroReg = AArch64::WZR;
4213 Opc = AArch64::MADDWrrr;
4214 RC = &AArch64::GPR32RegClass;
4215 } else {
4216 OrrOpc = AArch64::ORRXri;
4217 OrrRC = &AArch64::GPR64spRegClass;
4218 BitSize = 64;
4219 ZeroReg = AArch64::XZR;
4220 Opc = AArch64::MADDXrrr;
4221 RC = &AArch64::GPR64RegClass;
4222 }
4223 unsigned NewVR = MRI.createVirtualRegister(OrrRC);
4224 uint64_t Imm = Root.getOperand(2).getImm();
4225 if (Root.getOperand(3).isImm()) {
4226 unsigned Val = Root.getOperand(3).getImm();
4227 Imm = Imm << Val;
4228 }
4229 uint64_t UImm = SignExtend64(-Imm, BitSize);
4230 uint64_t Encoding;
4231 if (AArch64_AM::processLogicalImmediate(UImm, BitSize, Encoding)) {
4232 MachineInstrBuilder MIB1 =
4233 BuildMI(MF, Root.getDebugLoc(), TII->get(OrrOpc), NewVR)
4234 .addReg(ZeroReg)
4235 .addImm(Encoding);
4236 InsInstrs.push_back(MIB1);
4237 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
4238 MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC);
4239 }
4240 break;
4241 }
4242 // Floating Point Support
4243 case MachineCombinerPattern::FMULADDS_OP1:
4244 case MachineCombinerPattern::FMULADDD_OP1:
4245     // FMUL I=A,B,0
4246     // FADD R,I,C
4247     // ==> FMADD R,A,B,C
4248     // --- Create(FMADD);
4249 if (Pattern == MachineCombinerPattern::FMULADDS_OP1) {
4250 Opc = AArch64::FMADDSrrr;
4251 RC = &AArch64::FPR32RegClass;
4252 } else {
4253 Opc = AArch64::FMADDDrrr;
4254 RC = &AArch64::FPR64RegClass;
4255 }
4256 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
4257 break;
4258 case MachineCombinerPattern::FMULADDS_OP2:
4259 case MachineCombinerPattern::FMULADDD_OP2:
4260 // FMUL I=A,B,0
4261 // FADD R,C,I
4262 // ==> FMADD R,A,B,C
4263 // --- Create(FMADD);
4264 if (Pattern == MachineCombinerPattern::FMULADDS_OP2) {
4265 Opc = AArch64::FMADDSrrr;
4266 RC = &AArch64::FPR32RegClass;
4267 } else {
4268 Opc = AArch64::FMADDDrrr;
4269 RC = &AArch64::FPR64RegClass;
4270 }
4271 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
4272 break;
4273
4274 case MachineCombinerPattern::FMLAv1i32_indexed_OP1:
4275 Opc = AArch64::FMLAv1i32_indexed;
4276 RC = &AArch64::FPR32RegClass;
4277 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
4278 FMAInstKind::Indexed);
4279 break;
4280 case MachineCombinerPattern::FMLAv1i32_indexed_OP2:
4281 Opc = AArch64::FMLAv1i32_indexed;
4282 RC = &AArch64::FPR32RegClass;
4283 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
4284 FMAInstKind::Indexed);
4285 break;
4286
4287 case MachineCombinerPattern::FMLAv1i64_indexed_OP1:
4288 Opc = AArch64::FMLAv1i64_indexed;
4289 RC = &AArch64::FPR64RegClass;
4290 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
4291 FMAInstKind::Indexed);
4292 break;
4293 case MachineCombinerPattern::FMLAv1i64_indexed_OP2:
4294 Opc = AArch64::FMLAv1i64_indexed;
4295 RC = &AArch64::FPR64RegClass;
4296 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
4297 FMAInstKind::Indexed);
4298 break;
4299
4300 case MachineCombinerPattern::FMLAv2i32_indexed_OP1:
4301 case MachineCombinerPattern::FMLAv2f32_OP1:
4302 RC = &AArch64::FPR64RegClass;
4303 if (Pattern == MachineCombinerPattern::FMLAv2i32_indexed_OP1) {
4304 Opc = AArch64::FMLAv2i32_indexed;
4305 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
4306 FMAInstKind::Indexed);
4307 } else {
4308 Opc = AArch64::FMLAv2f32;
4309 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
4310 FMAInstKind::Accumulator);
4311 }
4312 break;
4313 case MachineCombinerPattern::FMLAv2i32_indexed_OP2:
4314 case MachineCombinerPattern::FMLAv2f32_OP2:
4315 RC = &AArch64::FPR64RegClass;
4316 if (Pattern == MachineCombinerPattern::FMLAv2i32_indexed_OP2) {
4317 Opc = AArch64::FMLAv2i32_indexed;
4318 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
4319 FMAInstKind::Indexed);
4320 } else {
4321 Opc = AArch64::FMLAv2f32;
4322 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
4323 FMAInstKind::Accumulator);
4324 }
4325 break;
4326
4327 case MachineCombinerPattern::FMLAv2i64_indexed_OP1:
4328 case MachineCombinerPattern::FMLAv2f64_OP1:
4329 RC = &AArch64::FPR128RegClass;
4330 if (Pattern == MachineCombinerPattern::FMLAv2i64_indexed_OP1) {
4331 Opc = AArch64::FMLAv2i64_indexed;
4332 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
4333 FMAInstKind::Indexed);
4334 } else {
4335 Opc = AArch64::FMLAv2f64;
4336 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
4337 FMAInstKind::Accumulator);
4338 }
4339 break;
4340 case MachineCombinerPattern::FMLAv2i64_indexed_OP2:
4341 case MachineCombinerPattern::FMLAv2f64_OP2:
4342 RC = &AArch64::FPR128RegClass;
4343 if (Pattern == MachineCombinerPattern::FMLAv2i64_indexed_OP2) {
4344 Opc = AArch64::FMLAv2i64_indexed;
4345 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
4346 FMAInstKind::Indexed);
4347 } else {
4348 Opc = AArch64::FMLAv2f64;
4349 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
4350 FMAInstKind::Accumulator);
4351 }
4352 break;
4353
4354 case MachineCombinerPattern::FMLAv4i32_indexed_OP1:
4355 case MachineCombinerPattern::FMLAv4f32_OP1:
4356 RC = &AArch64::FPR128RegClass;
4357 if (Pattern == MachineCombinerPattern::FMLAv4i32_indexed_OP1) {
4358 Opc = AArch64::FMLAv4i32_indexed;
4359 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
4360 FMAInstKind::Indexed);
4361 } else {
4362 Opc = AArch64::FMLAv4f32;
4363 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
4364 FMAInstKind::Accumulator);
4365 }
4366 break;
4367
4368 case MachineCombinerPattern::FMLAv4i32_indexed_OP2:
4369 case MachineCombinerPattern::FMLAv4f32_OP2:
4370 RC = &AArch64::FPR128RegClass;
4371 if (Pattern == MachineCombinerPattern::FMLAv4i32_indexed_OP2) {
4372 Opc = AArch64::FMLAv4i32_indexed;
4373 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
4374 FMAInstKind::Indexed);
4375 } else {
4376 Opc = AArch64::FMLAv4f32;
4377 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
4378 FMAInstKind::Accumulator);
4379 }
4380 break;
4381
4382 case MachineCombinerPattern::FMULSUBS_OP1:
4383 case MachineCombinerPattern::FMULSUBD_OP1: {
4384 // FMUL I=A,B,0
4385 // FSUB R,I,C
4386 // ==> FNMSUB R,A,B,C // = -C + A*B
4387 // --- Create(FNMSUB);
4388 if (Pattern == MachineCombinerPattern::FMULSUBS_OP1) {
4389 Opc = AArch64::FNMSUBSrrr;
4390 RC = &AArch64::FPR32RegClass;
4391 } else {
4392 Opc = AArch64::FNMSUBDrrr;
4393 RC = &AArch64::FPR64RegClass;
4394 }
4395 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
4396 break;
4397 }
4398
4399 case MachineCombinerPattern::FNMULSUBS_OP1:
4400 case MachineCombinerPattern::FNMULSUBD_OP1: {
4401 // FNMUL I=A,B,0
4402 // FSUB R,I,C
4403 // ==> FNMADD R,A,B,C // = -A*B - C
4404 // --- Create(FNMADD);
4405 if (Pattern == MachineCombinerPattern::FNMULSUBS_OP1) {
4406 Opc = AArch64::FNMADDSrrr;
4407 RC = &AArch64::FPR32RegClass;
4408 } else {
4409 Opc = AArch64::FNMADDDrrr;
4410 RC = &AArch64::FPR64RegClass;
4411 }
4412 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
4413 break;
4414 }
4415
4416 case MachineCombinerPattern::FMULSUBS_OP2:
4417 case MachineCombinerPattern::FMULSUBD_OP2: {
4418 // FMUL I=A,B,0
4419 // FSUB R,C,I
4420 // ==> FMSUB R,A,B,C (computes C - A*B)
4421 // --- Create(FMSUB);
4422 if (Pattern == MachineCombinerPattern::FMULSUBS_OP2) {
4423 Opc = AArch64::FMSUBSrrr;
4424 RC = &AArch64::FPR32RegClass;
4425 } else {
4426 Opc = AArch64::FMSUBDrrr;
4427 RC = &AArch64::FPR64RegClass;
4428 }
4429 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
4430 break;
4431 }
4432
4433 case MachineCombinerPattern::FMLSv1i32_indexed_OP2:
4434 Opc = AArch64::FMLSv1i32_indexed;
4435 RC = &AArch64::FPR32RegClass;
4436 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
4437 FMAInstKind::Indexed);
4438 break;
4439
4440 case MachineCombinerPattern::FMLSv1i64_indexed_OP2:
4441 Opc = AArch64::FMLSv1i64_indexed;
4442 RC = &AArch64::FPR64RegClass;
4443 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
4444 FMAInstKind::Indexed);
4445 break;
4446
4447 case MachineCombinerPattern::FMLSv2f32_OP2:
4448 case MachineCombinerPattern::FMLSv2i32_indexed_OP2:
4449 RC = &AArch64::FPR64RegClass;
4450 if (Pattern == MachineCombinerPattern::FMLSv2i32_indexed_OP2) {
4451 Opc = AArch64::FMLSv2i32_indexed;
4452 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
4453 FMAInstKind::Indexed);
4454 } else {
4455 Opc = AArch64::FMLSv2f32;
4456 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
4457 FMAInstKind::Accumulator);
4458 }
4459 break;
4460
4461 case MachineCombinerPattern::FMLSv2f64_OP2:
4462 case MachineCombinerPattern::FMLSv2i64_indexed_OP2:
4463 RC = &AArch64::FPR128RegClass;
4464 if (Pattern == MachineCombinerPattern::FMLSv2i64_indexed_OP2) {
4465 Opc = AArch64::FMLSv2i64_indexed;
4466 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
4467 FMAInstKind::Indexed);
4468 } else {
4469 Opc = AArch64::FMLSv2f64;
4470 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
4471 FMAInstKind::Accumulator);
4472 }
4473 break;
4474
4475 case MachineCombinerPattern::FMLSv4f32_OP2:
4476 case MachineCombinerPattern::FMLSv4i32_indexed_OP2:
4477 RC = &AArch64::FPR128RegClass;
4478 if (Pattern == MachineCombinerPattern::FMLSv4i32_indexed_OP2) {
4479 Opc = AArch64::FMLSv4i32_indexed;
4480 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
4481 FMAInstKind::Indexed);
4482 } else {
4483 Opc = AArch64::FMLSv4f32;
4484 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
4485 FMAInstKind::Accumulator);
4486 }
4487 break;
4488 case MachineCombinerPattern::FMLSv2f32_OP1:
4489 case MachineCombinerPattern::FMLSv2i32_indexed_OP1: {
4490 RC = &AArch64::FPR64RegClass;
4491 unsigned NewVR = MRI.createVirtualRegister(RC);
4492 MachineInstrBuilder MIB1 =
4493 BuildMI(MF, Root.getDebugLoc(), TII->get(AArch64::FNEGv2f32), NewVR)
4494 .add(Root.getOperand(2));
4495 InsInstrs.push_back(MIB1);
4496 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
4497 if (Pattern == MachineCombinerPattern::FMLSv2i32_indexed_OP1) {
4498 Opc = AArch64::FMLAv2i32_indexed;
4499 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
4500 FMAInstKind::Indexed, &NewVR);
4501 } else {
4502 Opc = AArch64::FMLAv2f32;
4503 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
4504 FMAInstKind::Accumulator, &NewVR);
4505 }
4506 break;
4507 }
4508 case MachineCombinerPattern::FMLSv4f32_OP1:
4509 case MachineCombinerPattern::FMLSv4i32_indexed_OP1: {
4510 RC = &AArch64::FPR128RegClass;
4511 unsigned NewVR = MRI.createVirtualRegister(RC);
4512 MachineInstrBuilder MIB1 =
4513 BuildMI(MF, Root.getDebugLoc(), TII->get(AArch64::FNEGv4f32), NewVR)
4514 .add(Root.getOperand(2));
4515 InsInstrs.push_back(MIB1);
4516 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
4517 if (Pattern == MachineCombinerPattern::FMLSv4i32_indexed_OP1) {
4518 Opc = AArch64::FMLAv4i32_indexed;
4519 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
4520 FMAInstKind::Indexed, &NewVR);
4521 } else {
4522 Opc = AArch64::FMLAv4f32;
4523 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
4524 FMAInstKind::Accumulator, &NewVR);
4525 }
4526 break;
4527 }
4528 case MachineCombinerPattern::FMLSv2f64_OP1:
4529 case MachineCombinerPattern::FMLSv2i64_indexed_OP1: {
4530 RC = &AArch64::FPR128RegClass;
4531 unsigned NewVR = MRI.createVirtualRegister(RC);
4532 MachineInstrBuilder MIB1 =
4533 BuildMI(MF, Root.getDebugLoc(), TII->get(AArch64::FNEGv2f64), NewVR)
4534 .add(Root.getOperand(2));
4535 InsInstrs.push_back(MIB1);
4536 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
4537 if (Pattern == MachineCombinerPattern::FMLSv2i64_indexed_OP1) {
4538 Opc = AArch64::FMLAv2i64_indexed;
4539 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
4540 FMAInstKind::Indexed, &NewVR);
4541 } else {
4542 Opc = AArch64::FMLAv2f64;
4543 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
4544 FMAInstKind::Accumulator, &NewVR);
4545 }
4546 break;
4547 }
4548 } // end switch (Pattern)
4549   // Record MUL (null if an immediate pattern failed to encode) and ADD/SUB.
4550   if (MUL) DelInstrs.push_back(MUL);
4551   DelInstrs.push_back(&Root);
4552 }
4553
4554 /// Replace csincr-branch sequence by simple conditional branch
4555 ///
4556 /// Examples:
4557 /// 1. \code
4558 /// csinc w9, wzr, wzr, <condition code>
4559 /// tbnz w9, #0, 0x44
4560 /// \endcode
4561 /// to
4562 /// \code
4563 /// b.<inverted condition code>
4564 /// \endcode
4565 ///
4566 /// 2. \code
4567 /// csinc w9, wzr, wzr, <condition code>
4568 /// tbz w9, #0, 0x44
4569 /// \endcode
4570 /// to
4571 /// \code
4572 /// b.<condition code>
4573 /// \endcode
4574 ///
4575 /// Replace compare and branch sequence by TBZ/TBNZ instruction when the
4576 /// compare's constant operand is power of 2.
4577 ///
4578 /// Examples:
4579 /// \code
4580 /// and w8, w8, #0x400
4581 /// cbnz w8, L1
4582 /// \endcode
4583 /// to
4584 /// \code
4585 /// tbnz w8, #10, L1
4586 /// \endcode
4587 ///
4588 /// \param MI Conditional Branch
4589 /// \return True when the simple conditional branch is generated
4590 ///
4591 bool AArch64InstrInfo::optimizeCondBranch(MachineInstr &MI) const {
4592 bool IsNegativeBranch = false;
4593 bool IsTestAndBranch = false;
4594 unsigned TargetBBInMI = 0;
4595 switch (MI.getOpcode()) {
4596 default:
4597 llvm_unreachable("Unknown branch instruction?");
4598 case AArch64::Bcc:
4599 return false;
4600 case AArch64::CBZW:
4601 case AArch64::CBZX:
4602 TargetBBInMI = 1;
4603 break;
4604 case AArch64::CBNZW:
4605 case AArch64::CBNZX:
4606 TargetBBInMI = 1;
4607 IsNegativeBranch = true;
4608 break;
4609 case AArch64::TBZW:
4610 case AArch64::TBZX:
4611 TargetBBInMI = 2;
4612 IsTestAndBranch = true;
4613 break;
4614 case AArch64::TBNZW:
4615 case AArch64::TBNZX:
4616 TargetBBInMI = 2;
4617 IsNegativeBranch = true;
4618 IsTestAndBranch = true;
4619 break;
4620 }
4621 // So we increment a zero register and test for bits other
4622 // than bit 0? Conservatively bail out in case the verifier
4623 // missed this case.
4624 if (IsTestAndBranch && MI.getOperand(1).getImm())
4625 return false;
4626
4627 // Find Definition.
4628   assert(MI.getParent() && "Incomplete machine instruction\n");
4629 MachineBasicBlock *MBB = MI.getParent();
4630 MachineFunction *MF = MBB->getParent();
4631 MachineRegisterInfo *MRI = &MF->getRegInfo();
4632 unsigned VReg = MI.getOperand(0).getReg();
4633 if (!TargetRegisterInfo::isVirtualRegister(VReg))
4634 return false;
4635
4636 MachineInstr *DefMI = MRI->getVRegDef(VReg);
4637
4638 // Look through COPY instructions to find definition.
4639 while (DefMI->isCopy()) {
4640 unsigned CopyVReg = DefMI->getOperand(1).getReg();
4641 if (!MRI->hasOneNonDBGUse(CopyVReg))
4642 return false;
4643 if (!MRI->hasOneDef(CopyVReg))
4644 return false;
4645 DefMI = MRI->getVRegDef(CopyVReg);
4646 }
4647
4648 switch (DefMI->getOpcode()) {
4649 default:
4650 return false;
4651 // Fold AND into a TBZ/TBNZ if constant operand is power of 2.
4652 case AArch64::ANDWri:
4653 case AArch64::ANDXri: {
4654 if (IsTestAndBranch)
4655 return false;
4656 if (DefMI->getParent() != MBB)
4657 return false;
4658 if (!MRI->hasOneNonDBGUse(VReg))
4659 return false;
4660
4661 bool Is32Bit = (DefMI->getOpcode() == AArch64::ANDWri);
4662 uint64_t Mask = AArch64_AM::decodeLogicalImmediate(
4663 DefMI->getOperand(2).getImm(), Is32Bit ? 32 : 64);
4664 if (!isPowerOf2_64(Mask))
4665 return false;
4666
4667 MachineOperand &MO = DefMI->getOperand(1);
4668 unsigned NewReg = MO.getReg();
4669 if (!TargetRegisterInfo::isVirtualRegister(NewReg))
4670 return false;
4671
4672 assert(!MRI->def_empty(NewReg) && "Register must be defined.");
4673
4674 MachineBasicBlock &RefToMBB = *MBB;
4675 MachineBasicBlock *TBB = MI.getOperand(1).getMBB();
4676 DebugLoc DL = MI.getDebugLoc();
4677 unsigned Imm = Log2_64(Mask);
4678 unsigned Opc = (Imm < 32)
4679 ? (IsNegativeBranch ? AArch64::TBNZW : AArch64::TBZW)
4680 : (IsNegativeBranch ? AArch64::TBNZX : AArch64::TBZX);
4681 MachineInstr *NewMI = BuildMI(RefToMBB, MI, DL, get(Opc))
4682 .addReg(NewReg)
4683 .addImm(Imm)
4684 .addMBB(TBB);
4685     // Register lives on to the TB(N)Z now.
4686 MO.setIsKill(false);
4687
4688     // For an immediate smaller than 32, we need to use the 32-bit
4689     // variant (W) in all cases, because the 64-bit variant cannot
4690     // encode it.
4691     // Therefore, if the input register is 64-bit, we need to take its
4692     // 32-bit sub-register.
4693 if (!Is32Bit && Imm < 32)
4694 NewMI->getOperand(0).setSubReg(AArch64::sub_32);
4695 MI.eraseFromParent();
4696 return true;
4697 }
4698 // Look for CSINC
4699 case AArch64::CSINCWr:
4700 case AArch64::CSINCXr: {
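    // Only a CSINC of two zero registers reduces to a 0/1 value determined
    // purely by the condition code, which is what lets us fold the branch.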
4701 if (!(DefMI->getOperand(1).getReg() == AArch64::WZR &&
4702 DefMI->getOperand(2).getReg() == AArch64::WZR) &&
4703 !(DefMI->getOperand(1).getReg() == AArch64::XZR &&
4704 DefMI->getOperand(2).getReg() == AArch64::XZR))
4705 return false;
4706
4707 if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, true) != -1)
4708 return false;
4709
4710 AArch64CC::CondCode CC = (AArch64CC::CondCode)DefMI->getOperand(3).getImm();
4711 // Convert only when the condition code is not modified between
4712 // the CSINC and the branch. The CC may be used by other
4713 // instructions in between.
4714 if (areCFlagsAccessedBetweenInstrs(DefMI, MI, &getRegisterInfo(), AK_Write))
4715 return false;
4716 MachineBasicBlock &RefToMBB = *MBB;
4717 MachineBasicBlock *TBB = MI.getOperand(TargetBBInMI).getMBB();
4718 DebugLoc DL = MI.getDebugLoc();
4719 if (IsNegativeBranch)
4720 CC = AArch64CC::getInvertedCondCode(CC);
4721 BuildMI(RefToMBB, MI, DL, get(AArch64::Bcc)).addImm(CC).addMBB(TBB);
4722 MI.eraseFromParent();
4723 return true;
4724 }
4725 }
4726 }
4727
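// AArch64II::MO_FRAGMENT masks the mutually exclusive address-fragment kinds
// (MO_PAGE, MO_PAGEOFF, MO_G0..MO_G3, MO_HI12); the bits outside the mask are
// independent modifier flags such as MO_GOT, MO_NC and MO_TLS.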
4728 std::pair<unsigned, unsigned>
4729 AArch64InstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const {
4730 const unsigned Mask = AArch64II::MO_FRAGMENT;
4731 return std::make_pair(TF & Mask, TF & ~Mask);
4732 }
4733
4734 ArrayRef<std::pair<unsigned, const char *>>
4735 AArch64InstrInfo::getSerializableDirectMachineOperandTargetFlags() const {
4736 using namespace AArch64II;
4737
4738 static const std::pair<unsigned, const char *> TargetFlags[] = {
4739 {MO_PAGE, "aarch64-page"}, {MO_PAGEOFF, "aarch64-pageoff"},
4740 {MO_G3, "aarch64-g3"}, {MO_G2, "aarch64-g2"},
4741 {MO_G1, "aarch64-g1"}, {MO_G0, "aarch64-g0"},
4742 {MO_HI12, "aarch64-hi12"}};
4743 return makeArrayRef(TargetFlags);
4744 }
4745
4746 ArrayRef<std::pair<unsigned, const char *>>
getSerializableBitmaskMachineOperandTargetFlags() const4747 AArch64InstrInfo::getSerializableBitmaskMachineOperandTargetFlags() const {
4748 using namespace AArch64II;
4749
4750 static const std::pair<unsigned, const char *> TargetFlags[] = {
4751 {MO_COFFSTUB, "aarch64-coffstub"},
4752 {MO_GOT, "aarch64-got"}, {MO_NC, "aarch64-nc"},
4753 {MO_S, "aarch64-s"}, {MO_TLS, "aarch64-tls"},
4754 {MO_DLLIMPORT, "aarch64-dllimport"}};
4755 return makeArrayRef(TargetFlags);
4756 }
4757
4758 ArrayRef<std::pair<MachineMemOperand::Flags, const char *>>
getSerializableMachineMemOperandTargetFlags() const4759 AArch64InstrInfo::getSerializableMachineMemOperandTargetFlags() const {
4760 static const std::pair<MachineMemOperand::Flags, const char *> TargetFlags[] =
4761 {{MOSuppressPair, "aarch64-suppress-pair"},
4762 {MOStridedAccess, "aarch64-strided-access"}};
4763 return makeArrayRef(TargetFlags);
4764 }

/// Constants defining how certain sequences should be outlined.
/// This encompasses how an outlined function should be called, and what kind of
/// frame should be emitted for that outlined function.
///
/// \p MachineOutlinerDefault implies that the function should be called with
/// a save and restore of LR to the stack.
///
/// That is,
///
/// I1     Save LR                    OUTLINED_FUNCTION:
/// I2 --> BL OUTLINED_FUNCTION       I1
/// I3     Restore LR                 I2
///                                   I3
///                                   RET
///
/// * Call construction overhead: 3 (save + BL + restore)
/// * Frame construction overhead: 1 (ret)
/// * Requires stack fixups? Yes
///
/// \p MachineOutlinerTailCall implies that the function is being created from
/// a sequence of instructions ending in a return.
///
/// That is,
///
/// I1                             OUTLINED_FUNCTION:
/// I2 --> B OUTLINED_FUNCTION     I1
/// RET                            I2
///                                RET
///
/// * Call construction overhead: 1 (B)
/// * Frame construction overhead: 0 (Return included in sequence)
/// * Requires stack fixups? No
///
/// \p MachineOutlinerNoLRSave implies that the function should be called using
/// a BL instruction, but doesn't require LR to be saved and restored. This
/// happens when LR is known to be dead.
///
/// That is,
///
/// I1                                OUTLINED_FUNCTION:
/// I2 --> BL OUTLINED_FUNCTION       I1
/// I3                                I2
///                                   I3
///                                   RET
///
/// * Call construction overhead: 1 (BL)
/// * Frame construction overhead: 1 (RET)
/// * Requires stack fixups? No
///
/// \p MachineOutlinerThunk implies that the function is being created from
/// a sequence of instructions ending in a call. The outlined function is
/// called with a BL instruction, and the outlined function tail-calls the
/// original call destination.
///
/// That is,
///
/// I1                                OUTLINED_FUNCTION:
/// I2 --> BL OUTLINED_FUNCTION       I1
/// BL f                              I2
///                                   B f
/// * Call construction overhead: 1 (BL)
/// * Frame construction overhead: 0
/// * Requires stack fixups? No
///
/// \p MachineOutlinerRegSave implies that the function should be called with a
/// save and restore of LR to an available register. This allows us to avoid
/// stack fixups. Note that this outlining variant is compatible with the
/// NoLRSave case.
///
/// That is,
///
/// I1     Save LR                    OUTLINED_FUNCTION:
/// I2 --> BL OUTLINED_FUNCTION       I1
/// I3     Restore LR                 I2
///                                   I3
///                                   RET
///
/// * Call construction overhead: 3 (save + BL + restore)
/// * Frame construction overhead: 1 (ret)
/// * Requires stack fixups? No
enum MachineOutlinerClass {
  MachineOutlinerDefault,  /// Emit a save, restore, call, and return.
  MachineOutlinerTailCall, /// Only emit a branch.
  MachineOutlinerNoLRSave, /// Emit a call and return.
  MachineOutlinerThunk,    /// Emit a call and tail-call.
  MachineOutlinerRegSave   /// Same as default, but save to a register.
};

enum MachineOutlinerMBBFlags {
  LRUnavailableSomewhere = 0x2,
  HasCalls = 0x4,
  UnsafeRegsDead = 0x8
};

unsigned
AArch64InstrInfo::findRegisterToSaveLRTo(const outliner::Candidate &C) const {
  assert(C.LRUWasSet && "LRU wasn't set?");
  MachineFunction *MF = C.getMF();
  const AArch64RegisterInfo *ARI = static_cast<const AArch64RegisterInfo *>(
      MF->getSubtarget().getRegisterInfo());

  // Check if there is an available register across the sequence that we can
  // use.
  for (unsigned Reg : AArch64::GPR64RegClass) {
    if (!ARI->isReservedReg(*MF, Reg) &&
        Reg != AArch64::LR &&  // LR is not reserved, but don't use it.
        Reg != AArch64::X16 && // X16 is not guaranteed to be preserved.
        Reg != AArch64::X17 && // Ditto for X17.
        C.LRU.available(Reg) && C.UsedInSequence.available(Reg))
      return Reg;
  }

  // No suitable register. Return 0.
  return 0u;
}

outliner::OutlinedFunction
AArch64InstrInfo::getOutliningCandidateInfo(
    std::vector<outliner::Candidate> &RepeatedSequenceLocs) const {
  outliner::Candidate &FirstCand = RepeatedSequenceLocs[0];
  unsigned SequenceSize =
      std::accumulate(FirstCand.front(), std::next(FirstCand.back()), 0,
                      [this](unsigned Sum, const MachineInstr &MI) {
                        return Sum + getInstSizeInBytes(MI);
                      });

  // Properties about candidate MBBs that hold for all of them.
  unsigned FlagsSetInAll = 0xF;

  // Compute liveness information for each candidate, and set FlagsSetInAll.
  const TargetRegisterInfo &TRI = getRegisterInfo();
  std::for_each(RepeatedSequenceLocs.begin(), RepeatedSequenceLocs.end(),
                [&FlagsSetInAll](outliner::Candidate &C) {
                  FlagsSetInAll &= C.Flags;
                });
  // According to the AArch64 Procedure Call Standard, the following are
  // undefined on entry/exit from a function call:
  //
  // * Registers x16, x17, (and thus w16, w17)
  // * Condition codes (and thus the NZCV register)
  //
  // Because of this, we can't outline any sequence of instructions where
  // one of these registers is live into/across it. Thus, we need to delete
  // those candidates.
  auto CantGuaranteeValueAcrossCall = [&TRI](outliner::Candidate &C) {
    // If the unsafe registers in this block are all dead, then we don't need
    // to compute liveness here.
    if (C.Flags & UnsafeRegsDead)
      return false;
    C.initLRU(TRI);
    LiveRegUnits LRU = C.LRU;
    return (!LRU.available(AArch64::W16) || !LRU.available(AArch64::W17) ||
            !LRU.available(AArch64::NZCV));
  };

  // Are there any candidates where those registers are live?
  if (!(FlagsSetInAll & UnsafeRegsDead)) {
    // Erase every candidate that violates the restrictions above. (It could be
    // true that we have viable candidates, so it's not worth bailing out in
    // the case that, say, 1 out of 20 candidates violates the restrictions.)
    RepeatedSequenceLocs.erase(std::remove_if(RepeatedSequenceLocs.begin(),
                                              RepeatedSequenceLocs.end(),
                                              CantGuaranteeValueAcrossCall),
                               RepeatedSequenceLocs.end());

    // If the sequence doesn't have enough candidates left, then we're done.
    if (RepeatedSequenceLocs.size() < 2)
      return outliner::OutlinedFunction();
  }

  // At this point, we have only "safe" candidates to outline. Figure out
  // frame + call instruction information.

  unsigned LastInstrOpcode = RepeatedSequenceLocs[0].back()->getOpcode();

  // Helper lambda which sets call information for every candidate.
  auto SetCandidateCallInfo =
      [&RepeatedSequenceLocs](unsigned CallID, unsigned NumBytesForCall) {
        for (outliner::Candidate &C : RepeatedSequenceLocs)
          C.setCallInfo(CallID, NumBytesForCall);
      };

  unsigned FrameID = MachineOutlinerDefault;
  unsigned NumBytesToCreateFrame = 4;

  bool HasBTI = any_of(RepeatedSequenceLocs, [](outliner::Candidate &C) {
    return C.getMF()->getFunction().hasFnAttribute("branch-target-enforcement");
  });
  // Returns true if an instruction is safe to fix up, false otherwise.
  auto IsSafeToFixup = [this, &TRI](MachineInstr &MI) {
    if (MI.isCall())
      return true;

    if (!MI.modifiesRegister(AArch64::SP, &TRI) &&
        !MI.readsRegister(AArch64::SP, &TRI))
      return true;

    // Any modification of SP will break our code to save/restore LR.
    // FIXME: We could handle some instructions which add a constant
    // offset to SP, with a bit more work.
    if (MI.modifiesRegister(AArch64::SP, &TRI))
      return false;

    // At this point, we have a stack instruction that we might need to
    // fix up. We'll handle it if it's a load or store.
    if (MI.mayLoadOrStore()) {
      const MachineOperand *Base; // Filled with the base operand of MI.
      int64_t Offset;             // Filled with the offset of MI.

      // Does it allow us to offset the base operand and is the base the
      // register SP?
      if (!getMemOperandWithOffset(MI, Base, Offset, &TRI) || !Base->isReg() ||
          Base->getReg() != AArch64::SP)
        return false;

      // Find the minimum/maximum offset for this instruction and check
      // if fixing it up would be in range.
      int64_t MinOffset,
          MaxOffset;      // Unscaled offsets for the instruction.
      unsigned Scale;     // The scale to multiply the offsets by.
      unsigned DummyWidth;
      getMemOpInfo(MI.getOpcode(), Scale, DummyWidth, MinOffset, MaxOffset);
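      // For example (values assumed from the AArch64 load/store encodings):
      // an LDRXui has Scale == 8 and an immediate range of [0, 4095], so the
      // fixed-up byte offset must stay within [0, 8 * 4095].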

      Offset += 16; // Update the offset to what it would be if we outlined.
      if (Offset < MinOffset * Scale || Offset > MaxOffset * Scale)
        return false;

      // It's in range, so we can outline it.
      return true;
    }

    // FIXME: Add handling for instructions like "add x0, sp, #8".

    // We can't fix it up, so don't outline it.
    return false;
  };

  // True if it's possible to fix up each stack instruction in this sequence.
  // Important for frames/call variants that modify the stack.
  bool AllStackInstrsSafe = std::all_of(
      FirstCand.front(), std::next(FirstCand.back()), IsSafeToFixup);

  // If the last instruction in any candidate is a terminator, then we should
  // tail call all of the candidates.
  if (RepeatedSequenceLocs[0].back()->isTerminator()) {
    FrameID = MachineOutlinerTailCall;
    NumBytesToCreateFrame = 0;
    SetCandidateCallInfo(MachineOutlinerTailCall, 4);
  }

  else if (LastInstrOpcode == AArch64::BL ||
           (LastInstrOpcode == AArch64::BLR && !HasBTI)) {
    // FIXME: Do we need to check if the code after this uses the value of LR?
    FrameID = MachineOutlinerThunk;
    NumBytesToCreateFrame = 0;
    SetCandidateCallInfo(MachineOutlinerThunk, 4);
  }

  else {
    // We need to decide how to emit calls + frames. We can always emit the same
    // frame if we don't need to save to the stack. If we have to save to the
    // stack, then we need a different frame.
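    // Per-candidate call overhead in bytes, as set below:
    //   MachineOutlinerNoLRSave:  4 (BL)
    //   MachineOutlinerRegSave:  12 (MOV to a free register + BL + MOV back)
    //   MachineOutlinerDefault:  12 (STR LR + BL + LDR LR)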
    unsigned NumBytesNoStackCalls = 0;
    std::vector<outliner::Candidate> CandidatesWithoutStackFixups;

    for (outliner::Candidate &C : RepeatedSequenceLocs) {
      C.initLRU(TRI);

      // Is LR available? If so, we don't need a save.
      if (C.LRU.available(AArch64::LR)) {
        NumBytesNoStackCalls += 4;
        C.setCallInfo(MachineOutlinerNoLRSave, 4);
        CandidatesWithoutStackFixups.push_back(C);
      }

      // Is an unused register available? If so, we won't modify the stack, so
      // we can outline with the same frame type as those that don't save LR.
      else if (findRegisterToSaveLRTo(C)) {
        NumBytesNoStackCalls += 12;
        C.setCallInfo(MachineOutlinerRegSave, 12);
        CandidatesWithoutStackFixups.push_back(C);
      }

      // Is SP used in the sequence at all? If not, we don't have to modify
      // the stack, so we are guaranteed to get the same frame.
      else if (C.UsedInSequence.available(AArch64::SP)) {
        NumBytesNoStackCalls += 12;
        C.setCallInfo(MachineOutlinerDefault, 12);
        CandidatesWithoutStackFixups.push_back(C);
      }

      // If we outline this, we need to modify the stack. Pretend we don't
      // outline this by saving all of its bytes.
      else {
        NumBytesNoStackCalls += SequenceSize;
      }
    }

    // If there are no places where we have to save LR, then note that we
    // don't have to update the stack. Otherwise, give every candidate the
    // default call type, as long as it's safe to do so.
    if (!AllStackInstrsSafe ||
        NumBytesNoStackCalls <= RepeatedSequenceLocs.size() * 12) {
      RepeatedSequenceLocs = CandidatesWithoutStackFixups;
      FrameID = MachineOutlinerNoLRSave;
    } else {
      SetCandidateCallInfo(MachineOutlinerDefault, 12);
    }

    // If we dropped all of the candidates, bail out here.
    if (RepeatedSequenceLocs.size() < 2) {
      RepeatedSequenceLocs.clear();
      return outliner::OutlinedFunction();
    }
  }

  // Does every candidate's MBB contain a call? If so, then we might have a call
  // in the range.
  if (FlagsSetInAll & MachineOutlinerMBBFlags::HasCalls) {
    // Check if the range contains a call. These require a save + restore of the
    // link register.
    bool ModStackToSaveLR = false;
    if (std::any_of(FirstCand.front(), FirstCand.back(),
                    [](const MachineInstr &MI) { return MI.isCall(); }))
      ModStackToSaveLR = true;

    // Handle the last instruction separately. If this is a tail call, then the
    // last instruction is a call. We don't want to save + restore in this case.
    // However, it could be possible that the last instruction is a call without
    // it being valid to tail call this sequence. We should consider this as
    // well.
    else if (FrameID != MachineOutlinerThunk &&
             FrameID != MachineOutlinerTailCall && FirstCand.back()->isCall())
      ModStackToSaveLR = true;

    if (ModStackToSaveLR) {
      // We can't fix up the stack. Bail out.
      if (!AllStackInstrsSafe) {
        RepeatedSequenceLocs.clear();
        return outliner::OutlinedFunction();
      }

      // Save + restore LR.
      NumBytesToCreateFrame += 8;
    }
  }

  return outliner::OutlinedFunction(RepeatedSequenceLocs, SequenceSize,
                                    NumBytesToCreateFrame, FrameID);
}

bool AArch64InstrInfo::isFunctionSafeToOutlineFrom(
    MachineFunction &MF, bool OutlineFromLinkOnceODRs) const {
  const Function &F = MF.getFunction();

  // Can F be deduplicated by the linker? If it can, don't outline from it.
  if (!OutlineFromLinkOnceODRs && F.hasLinkOnceODRLinkage())
    return false;

  // Don't outline from functions with section markings; the program could
  // expect that all the code is in the named section.
  // FIXME: Allow outlining from multiple functions with the same section
  // marking.
  if (F.hasSection())
    return false;

  // Outlining from functions with redzones is unsafe since the outliner may
  // modify the stack. Check if hasRedZone is true or unknown; if yes, don't
  // outline from it.
  AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
  if (!AFI || AFI->hasRedZone().getValueOr(true))
    return false;

  // It's safe to outline from MF.
  return true;
}

bool AArch64InstrInfo::isMBBSafeToOutlineFrom(MachineBasicBlock &MBB,
                                              unsigned &Flags) const {
  // Check if LR is available through all of the MBB. If it's not, then set
  // a flag.
  assert(MBB.getParent()->getRegInfo().tracksLiveness() &&
         "Suitable Machine Function for outlining must track liveness");
  LiveRegUnits LRU(getRegisterInfo());

  std::for_each(MBB.rbegin(), MBB.rend(),
                [&LRU](MachineInstr &MI) { LRU.accumulate(MI); });
  // Check if each of the unsafe registers is available...
  bool W16AvailableInBlock = LRU.available(AArch64::W16);
  bool W17AvailableInBlock = LRU.available(AArch64::W17);
  bool NZCVAvailableInBlock = LRU.available(AArch64::NZCV);

  // If all of these are dead (and not live out), we know we don't have to check
  // them later.
  if (W16AvailableInBlock && W17AvailableInBlock && NZCVAvailableInBlock)
    Flags |= MachineOutlinerMBBFlags::UnsafeRegsDead;

  // Now, add the live outs to the set.
  LRU.addLiveOuts(MBB);

  // If any of these registers is available in the MBB, but also a live out of
  // the block, then we know outlining is unsafe.
  if (W16AvailableInBlock && !LRU.available(AArch64::W16))
    return false;
  if (W17AvailableInBlock && !LRU.available(AArch64::W17))
    return false;
  if (NZCVAvailableInBlock && !LRU.available(AArch64::NZCV))
    return false;

  // Check if there's a call inside this MachineBasicBlock. If there is, then
  // set a flag.
  if (any_of(MBB, [](MachineInstr &MI) { return MI.isCall(); }))
    Flags |= MachineOutlinerMBBFlags::HasCalls;

  MachineFunction *MF = MBB.getParent();

  // In the event that we outline, we may have to save LR. If there is an
  // available register in the MBB, then we'll always save LR there. Check if
  // this is true.
  bool CanSaveLR = false;
  const AArch64RegisterInfo *ARI = static_cast<const AArch64RegisterInfo *>(
      MF->getSubtarget().getRegisterInfo());

  // Check if there is an available register across the sequence that we can
  // use.
  for (unsigned Reg : AArch64::GPR64RegClass) {
    if (!ARI->isReservedReg(*MF, Reg) && Reg != AArch64::LR &&
        Reg != AArch64::X16 && Reg != AArch64::X17 && LRU.available(Reg)) {
      CanSaveLR = true;
      break;
    }
  }

  // Check if we have a register we can save LR to, and if LR was used
  // somewhere. If both of those things are true, then we need to evaluate the
  // safety of outlining stack instructions later.
  if (!CanSaveLR && !LRU.available(AArch64::LR))
    Flags |= MachineOutlinerMBBFlags::LRUnavailableSomewhere;

  return true;
}

outliner::InstrType
AArch64InstrInfo::getOutliningType(MachineBasicBlock::iterator &MIT,
                                   unsigned Flags) const {
  MachineInstr &MI = *MIT;
  MachineBasicBlock *MBB = MI.getParent();
  MachineFunction *MF = MBB->getParent();
  AArch64FunctionInfo *FuncInfo = MF->getInfo<AArch64FunctionInfo>();

  // Don't outline LOHs.
  if (FuncInfo->getLOHRelated().count(&MI))
    return outliner::InstrType::Illegal;

  // Don't allow debug values to impact outlining type.
  if (MI.isDebugInstr() || MI.isIndirectDebugValue())
    return outliner::InstrType::Invisible;

  // At this point, KILL instructions don't really tell us much so we can go
  // ahead and skip over them.
  if (MI.isKill())
    return outliner::InstrType::Invisible;

  // Is this a terminator for a basic block?
  if (MI.isTerminator()) {

    // Is this the end of a function?
    if (MI.getParent()->succ_empty())
      return outliner::InstrType::Legal;

    // It's not, so don't outline it.
    return outliner::InstrType::Illegal;
  }

  // Make sure none of the operands are un-outlinable.
  for (const MachineOperand &MOP : MI.operands()) {
    if (MOP.isCPI() || MOP.isJTI() || MOP.isCFIIndex() || MOP.isFI() ||
        MOP.isTargetIndex())
      return outliner::InstrType::Illegal;

    // If it uses LR or W30 explicitly, then don't touch it.
    if (MOP.isReg() && !MOP.isImplicit() &&
        (MOP.getReg() == AArch64::LR || MOP.getReg() == AArch64::W30))
      return outliner::InstrType::Illegal;
  }

  // Special cases for instructions that can always be outlined, but will fail
  // the later tests. e.g. ADRPs, which are PC-relative and may use LR, can
  // always be outlined because they don't require a *specific* value to be
  // in LR.
  if (MI.getOpcode() == AArch64::ADRP)
    return outliner::InstrType::Legal;

  // If MI is a call we might be able to outline it. We don't want to outline
  // any calls that rely on the position of items on the stack. When we outline
  // something containing a call, we have to emit a save and restore of LR in
  // the outlined function. Currently, this always happens by saving LR to the
  // stack. Thus, if we outline, say, half the parameters for a function call
  // plus the call, then we'll break the callee's expectations for the layout
  // of the stack.
  //
  // FIXME: Allow calls to functions which construct a stack frame, as long
  // as they don't access arguments on the stack.
  // FIXME: Figure out some way to analyze functions defined in other modules.
  // We should be able to compute the memory usage based on the IR calling
  // convention, even if we can't see the definition.
  if (MI.isCall()) {
    // Get the function associated with the call. Look at each operand and find
    // the one that represents the callee and get its name.
    const Function *Callee = nullptr;
    for (const MachineOperand &MOP : MI.operands()) {
      if (MOP.isGlobal()) {
        Callee = dyn_cast<Function>(MOP.getGlobal());
        break;
      }
    }

    // Never outline calls to mcount. There isn't any rule that would require
    // this, but the Linux kernel's "ftrace" feature depends on it.
    if (Callee && Callee->getName() == "\01_mcount")
      return outliner::InstrType::Illegal;

    // If we don't know anything about the callee, assume it depends on the
    // stack layout of the caller. In that case, it's only legal to outline
    // as a tail-call. Whitelist the call instructions we know about so we
    // don't get unexpected results with call pseudo-instructions.
    auto UnknownCallOutlineType = outliner::InstrType::Illegal;
    if (MI.getOpcode() == AArch64::BLR || MI.getOpcode() == AArch64::BL)
      UnknownCallOutlineType = outliner::InstrType::LegalTerminator;

    if (!Callee)
      return UnknownCallOutlineType;

    // We have a function we have information about. Check if it's something
    // we can safely outline.
    MachineFunction *CalleeMF = MF->getMMI().getMachineFunction(*Callee);

    // We don't know what's going on with the callee at all. Don't touch it.
    if (!CalleeMF)
      return UnknownCallOutlineType;

    // Check if we know anything about the callee saves on the function. If we
    // don't, then don't touch it, since that implies that we haven't
    // computed anything about its stack frame yet.
    MachineFrameInfo &MFI = CalleeMF->getFrameInfo();
    if (!MFI.isCalleeSavedInfoValid() || MFI.getStackSize() > 0 ||
        MFI.getNumObjects() > 0)
      return UnknownCallOutlineType;

    // At this point, we can say that CalleeMF ought to not pass anything on the
    // stack. Therefore, we can outline it.
    return outliner::InstrType::Legal;
  }

  // Don't outline positions.
  if (MI.isPosition())
    return outliner::InstrType::Illegal;

  // Don't touch the link register or W30.
  if (MI.readsRegister(AArch64::W30, &getRegisterInfo()) ||
      MI.modifiesRegister(AArch64::W30, &getRegisterInfo()))
    return outliner::InstrType::Illegal;

  // Don't outline BTI instructions, because that will prevent the outlining
  // site from being indirectly callable.
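  // (HINT #32, #34, #36 and #38 are the BTI, BTI c, BTI j and BTI jc
  // landing-pad instructions, respectively.)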
  if (MI.getOpcode() == AArch64::HINT) {
    int64_t Imm = MI.getOperand(0).getImm();
    if (Imm == 32 || Imm == 34 || Imm == 36 || Imm == 38)
      return outliner::InstrType::Illegal;
  }

  return outliner::InstrType::Legal;
}

void AArch64InstrInfo::fixupPostOutline(MachineBasicBlock &MBB) const {
  for (MachineInstr &MI : MBB) {
    const MachineOperand *Base;
    unsigned Width;
    int64_t Offset;

    // Is this a load or store with an immediate offset with SP as the base?
    if (!MI.mayLoadOrStore() ||
        !getMemOperandWithOffsetWidth(MI, Base, Offset, Width, &RI) ||
        (Base->isReg() && Base->getReg() != AArch64::SP))
      continue;

    // It is, so we have to fix it up.
    unsigned Scale;
    int64_t Dummy1, Dummy2;

    MachineOperand &StackOffsetOperand = getMemOpBaseRegImmOfsOffsetOperand(MI);
    assert(StackOffsetOperand.isImm() && "Stack offset wasn't immediate!");
    getMemOpInfo(MI.getOpcode(), Scale, Width, Dummy1, Dummy2);
    assert(Scale != 0 && "Unexpected opcode!");

    // We've pushed the return address to the stack, so add 16 to the offset.
    // This is safe, since we already checked if it would overflow when we
    // checked if this instruction was legal to outline.
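    // For example (illustrative): "ldr x0, [sp, #16]" (LDRXui, immediate 2,
    // Scale 8) becomes "ldr x0, [sp, #32]", i.e. immediate (16 + 16) / 8 == 4.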
    int64_t NewImm = (Offset + 16) / Scale;
    StackOffsetOperand.setImm(NewImm);
  }
}

void AArch64InstrInfo::buildOutlinedFrame(
    MachineBasicBlock &MBB, MachineFunction &MF,
    const outliner::OutlinedFunction &OF) const {
  // For thunk outlining, rewrite the last instruction from a call to a
  // tail-call.
  if (OF.FrameConstructionID == MachineOutlinerThunk) {
    MachineInstr *Call = &*--MBB.instr_end();
    unsigned TailOpcode;
    if (Call->getOpcode() == AArch64::BL) {
      TailOpcode = AArch64::TCRETURNdi;
    } else {
      assert(Call->getOpcode() == AArch64::BLR);
      TailOpcode = AArch64::TCRETURNriALL;
    }
    MachineInstr *TC = BuildMI(MF, DebugLoc(), get(TailOpcode))
                           .add(Call->getOperand(0))
                           .addImm(0);
    MBB.insert(MBB.end(), TC);
    Call->eraseFromParent();
  }

  // Is there a call in the outlined range?
  auto IsNonTailCall = [](MachineInstr &MI) {
    return MI.isCall() && !MI.isReturn();
  };
  if (std::any_of(MBB.instr_begin(), MBB.instr_end(), IsNonTailCall)) {
    // Fix up the instructions in the range, since we're going to modify the
    // stack.
    assert(OF.FrameConstructionID != MachineOutlinerDefault &&
           "Can only fix up stack references once");
    fixupPostOutline(MBB);

    // LR has to be a live in so that we can save it.
    MBB.addLiveIn(AArch64::LR);

    MachineBasicBlock::iterator It = MBB.begin();
    MachineBasicBlock::iterator Et = MBB.end();

    if (OF.FrameConstructionID == MachineOutlinerTailCall ||
        OF.FrameConstructionID == MachineOutlinerThunk)
      Et = std::prev(MBB.end());

    // Insert a save before the outlined region
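    // (STRXpre of LR from SP with immediate -16 is "str x30, [sp, #-16]!".)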
    MachineInstr *STRXpre = BuildMI(MF, DebugLoc(), get(AArch64::STRXpre))
                                .addReg(AArch64::SP, RegState::Define)
                                .addReg(AArch64::LR)
                                .addReg(AArch64::SP)
                                .addImm(-16);
    It = MBB.insert(It, STRXpre);

    const TargetSubtargetInfo &STI = MF.getSubtarget();
    const MCRegisterInfo *MRI = STI.getRegisterInfo();
    unsigned DwarfReg = MRI->getDwarfRegNum(AArch64::LR, true);

    // Add a CFI saying the stack was moved 16 B down.
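    // (This lowers to a ".cfi_def_cfa_offset 16" directive in the emitted
    // assembly.)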
    int64_t StackPosEntry =
        MF.addFrameInst(MCCFIInstruction::createDefCfaOffset(nullptr, 16));
    BuildMI(MBB, It, DebugLoc(), get(AArch64::CFI_INSTRUCTION))
        .addCFIIndex(StackPosEntry)
        .setMIFlags(MachineInstr::FrameSetup);

    // Add a CFI saying that the LR that we want to find is now 16 B higher than
    // before.
    int64_t LRPosEntry =
        MF.addFrameInst(MCCFIInstruction::createOffset(nullptr, DwarfReg, 16));
    BuildMI(MBB, It, DebugLoc(), get(AArch64::CFI_INSTRUCTION))
        .addCFIIndex(LRPosEntry)
        .setMIFlags(MachineInstr::FrameSetup);

    // Insert a restore before the terminator for the function.
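    // (LDRXpost of LR from SP with immediate 16 is "ldr x30, [sp], #16".)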
    MachineInstr *LDRXpost = BuildMI(MF, DebugLoc(), get(AArch64::LDRXpost))
                                 .addReg(AArch64::SP, RegState::Define)
                                 .addReg(AArch64::LR, RegState::Define)
                                 .addReg(AArch64::SP)
                                 .addImm(16);
    Et = MBB.insert(Et, LDRXpost);
  }

  // If this is a tail call outlined function, then there's already a return.
  if (OF.FrameConstructionID == MachineOutlinerTailCall ||
      OF.FrameConstructionID == MachineOutlinerThunk)
    return;

  // It's not a tail call, so we have to insert the return ourselves.
  MachineInstr *ret = BuildMI(MF, DebugLoc(), get(AArch64::RET))
                          .addReg(AArch64::LR, RegState::Undef);
  MBB.insert(MBB.end(), ret);

  // Did we have to modify the stack by saving the link register?
  if (OF.FrameConstructionID != MachineOutlinerDefault)
    return;

  // We modified the stack.
  // Walk over the basic block and fix up all the stack accesses.
  fixupPostOutline(MBB);
}

MachineBasicBlock::iterator AArch64InstrInfo::insertOutlinedCall(
    Module &M, MachineBasicBlock &MBB, MachineBasicBlock::iterator &It,
    MachineFunction &MF, const outliner::Candidate &C) const {

  // Are we tail calling?
  if (C.CallConstructionID == MachineOutlinerTailCall) {
    // If yes, then we can just branch to the label.
    It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::TCRETURNdi))
                            .addGlobalAddress(M.getNamedValue(MF.getName()))
                            .addImm(0));
    return It;
  }

  // Are we saving the link register?
  if (C.CallConstructionID == MachineOutlinerNoLRSave ||
      C.CallConstructionID == MachineOutlinerThunk) {
    // No, so just insert the call.
    It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::BL))
                            .addGlobalAddress(M.getNamedValue(MF.getName())));
    return It;
  }

  // We want to return the spot where we inserted the call.
  MachineBasicBlock::iterator CallPt;

  // Instructions for saving and restoring LR around the call instruction we're
  // going to insert.
  MachineInstr *Save;
  MachineInstr *Restore;
  // Can we save to a register?
  if (C.CallConstructionID == MachineOutlinerRegSave) {
    // FIXME: This logic should be sunk into a target-specific interface so that
    // we don't have to recompute the register.
    unsigned Reg = findRegisterToSaveLRTo(C);
    assert(Reg != 0 && "No callee-saved register available?");

    // Save and restore LR from that register.
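    // (ORRXrs with XZR as the first source is the canonical encoding of the
    // MOV alias, so these are "mov Reg, x30" and "mov x30, Reg".)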
    Save = BuildMI(MF, DebugLoc(), get(AArch64::ORRXrs), Reg)
               .addReg(AArch64::XZR)
               .addReg(AArch64::LR)
               .addImm(0);
    Restore = BuildMI(MF, DebugLoc(), get(AArch64::ORRXrs), AArch64::LR)
                  .addReg(AArch64::XZR)
                  .addReg(Reg)
                  .addImm(0);
  } else {
    // We have the default case. Save and restore from SP.
    Save = BuildMI(MF, DebugLoc(), get(AArch64::STRXpre))
               .addReg(AArch64::SP, RegState::Define)
               .addReg(AArch64::LR)
               .addReg(AArch64::SP)
               .addImm(-16);
    Restore = BuildMI(MF, DebugLoc(), get(AArch64::LDRXpost))
                  .addReg(AArch64::SP, RegState::Define)
                  .addReg(AArch64::LR, RegState::Define)
                  .addReg(AArch64::SP)
                  .addImm(16);
  }

  It = MBB.insert(It, Save);
  It++;

  // Insert the call.
  It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::BL))
                          .addGlobalAddress(M.getNamedValue(MF.getName())));
  CallPt = It;
  It++;

  It = MBB.insert(It, Restore);
  return CallPt;
}

bool AArch64InstrInfo::shouldOutlineFromFunctionByDefault(
    MachineFunction &MF) const {
  return MF.getFunction().hasMinSize();
}

bool AArch64InstrInfo::isCopyInstrImpl(
    const MachineInstr &MI, const MachineOperand *&Source,
    const MachineOperand *&Destination) const {

  // AArch64::ORRWrs and AArch64::ORRXrs with WZR/XZR as the first source
  // register and a zero shift immediate are used as aliases for the mov
  // instruction.
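  // For example, "$w0 = ORRWrs $wzr, $w1, 0" is the expansion of "mov w0, w1";
  // operand 2 is the copy source and operand 0 the destination.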
  if (MI.getOpcode() == AArch64::ORRWrs &&
      MI.getOperand(1).getReg() == AArch64::WZR &&
      MI.getOperand(3).getImm() == 0x0) {
    Destination = &MI.getOperand(0);
    Source = &MI.getOperand(2);
    return true;
  }

  if (MI.getOpcode() == AArch64::ORRXrs &&
      MI.getOperand(1).getReg() == AArch64::XZR &&
      MI.getOperand(3).getImm() == 0x0) {
    Destination = &MI.getOperand(0);
    Source = &MI.getOperand(2);
    return true;
  }

  return false;
}

#define GET_INSTRINFO_HELPERS
#include "AArch64GenInstrInfo.inc"