1 //===- AArch64InstrInfo.cpp - AArch64 Instruction Information -------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file contains the AArch64 implementation of the TargetInstrInfo class.
10 //
11 //===----------------------------------------------------------------------===//
12
13 #include "AArch64InstrInfo.h"
14 #include "AArch64MachineFunctionInfo.h"
15 #include "AArch64Subtarget.h"
16 #include "MCTargetDesc/AArch64AddressingModes.h"
17 #include "Utils/AArch64BaseInfo.h"
18 #include "llvm/ADT/ArrayRef.h"
19 #include "llvm/ADT/STLExtras.h"
20 #include "llvm/ADT/SmallVector.h"
21 #include "llvm/CodeGen/MachineBasicBlock.h"
22 #include "llvm/CodeGen/MachineFrameInfo.h"
23 #include "llvm/CodeGen/MachineFunction.h"
24 #include "llvm/CodeGen/MachineInstr.h"
25 #include "llvm/CodeGen/MachineInstrBuilder.h"
26 #include "llvm/CodeGen/MachineMemOperand.h"
27 #include "llvm/CodeGen/MachineModuleInfo.h"
28 #include "llvm/CodeGen/MachineOperand.h"
29 #include "llvm/CodeGen/MachineRegisterInfo.h"
30 #include "llvm/CodeGen/StackMaps.h"
31 #include "llvm/CodeGen/TargetRegisterInfo.h"
32 #include "llvm/CodeGen/TargetSubtargetInfo.h"
33 #include "llvm/IR/DebugInfoMetadata.h"
34 #include "llvm/IR/DebugLoc.h"
35 #include "llvm/IR/GlobalValue.h"
36 #include "llvm/MC/MCAsmInfo.h"
37 #include "llvm/MC/MCInst.h"
38 #include "llvm/MC/MCInstrDesc.h"
39 #include "llvm/Support/Casting.h"
40 #include "llvm/Support/CodeGen.h"
41 #include "llvm/Support/CommandLine.h"
42 #include "llvm/Support/Compiler.h"
43 #include "llvm/Support/ErrorHandling.h"
44 #include "llvm/Support/MathExtras.h"
45 #include "llvm/Target/TargetMachine.h"
46 #include "llvm/Target/TargetOptions.h"
47 #include <cassert>
48 #include <cstdint>
49 #include <iterator>
50 #include <utility>
51
52 using namespace llvm;
53
54 #define GET_INSTRINFO_CTOR_DTOR
55 #include "AArch64GenInstrInfo.inc"
56
57 static cl::opt<unsigned> TBZDisplacementBits(
58 "aarch64-tbz-offset-bits", cl::Hidden, cl::init(14),
59 cl::desc("Restrict range of TB[N]Z instructions (DEBUG)"));
60
61 static cl::opt<unsigned> CBZDisplacementBits(
62 "aarch64-cbz-offset-bits", cl::Hidden, cl::init(19),
63 cl::desc("Restrict range of CB[N]Z instructions (DEBUG)"));
64
65 static cl::opt<unsigned>
66 BCCDisplacementBits("aarch64-bcc-offset-bits", cl::Hidden, cl::init(19),
67 cl::desc("Restrict range of Bcc instructions (DEBUG)"));
68
69 AArch64InstrInfo::AArch64InstrInfo(const AArch64Subtarget &STI)
70 : AArch64GenInstrInfo(AArch64::ADJCALLSTACKDOWN, AArch64::ADJCALLSTACKUP,
71 AArch64::CATCHRET),
72 RI(STI.getTargetTriple()), Subtarget(STI) {}
73
74 /// GetInstSize - Return the number of bytes of code the specified
75 /// instruction may be. This returns the maximum number of bytes.
76 unsigned AArch64InstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
77 const MachineBasicBlock &MBB = *MI.getParent();
78 const MachineFunction *MF = MBB.getParent();
79 const MCAsmInfo *MAI = MF->getTarget().getMCAsmInfo();
80
81 {
82 auto Op = MI.getOpcode();
83 if (Op == AArch64::INLINEASM || Op == AArch64::INLINEASM_BR)
84 return getInlineAsmLength(MI.getOperand(0).getSymbolName(), *MAI);
85 }
86
87 // Meta-instructions emit no code.
88 if (MI.isMetaInstruction())
89 return 0;
90
91 // FIXME: We currently only handle pseudoinstructions that don't get expanded
92 // before the assembly printer.
93 unsigned NumBytes = 0;
94 const MCInstrDesc &Desc = MI.getDesc();
95 switch (Desc.getOpcode()) {
96 default:
97 // Anything not explicitly designated otherwise is a normal 4-byte insn.
98 NumBytes = 4;
99 break;
100 case TargetOpcode::STACKMAP:
101 // The upper bound for a stackmap intrinsic is the full length of its shadow
102 NumBytes = StackMapOpers(&MI).getNumPatchBytes();
103 assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
104 break;
105 case TargetOpcode::PATCHPOINT:
106 // The size of the patchpoint intrinsic is the number of bytes requested
107 NumBytes = PatchPointOpers(&MI).getNumPatchBytes();
108 assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
109 break;
110 case AArch64::TLSDESC_CALLSEQ:
111 // This gets lowered to an instruction sequence which takes 16 bytes
112 NumBytes = 16;
113 break;
114 case AArch64::SpeculationBarrierISBDSBEndBB:
115 // This gets lowered to 2 4-byte instructions.
116 NumBytes = 8;
117 break;
118 case AArch64::SpeculationBarrierSBEndBB:
119 // This gets lowered to a single 4-byte instruction.
120 NumBytes = 4;
121 break;
122 case AArch64::JumpTableDest32:
123 case AArch64::JumpTableDest16:
124 case AArch64::JumpTableDest8:
125 NumBytes = 12;
126 break;
127 case AArch64::SPACE:
128 NumBytes = MI.getOperand(1).getImm();
129 break;
130 case TargetOpcode::BUNDLE:
131 NumBytes = getInstBundleLength(MI);
132 break;
133 }
134
135 return NumBytes;
136 }
137
138 unsigned AArch64InstrInfo::getInstBundleLength(const MachineInstr &MI) const {
139 unsigned Size = 0;
140 MachineBasicBlock::const_instr_iterator I = MI.getIterator();
141 MachineBasicBlock::const_instr_iterator E = MI.getParent()->instr_end();
142 while (++I != E && I->isInsideBundle()) {
143 assert(!I->isBundle() && "No nested bundle!");
144 Size += getInstSizeInBytes(*I);
145 }
146 return Size;
147 }
148
149 static void parseCondBranch(MachineInstr *LastInst, MachineBasicBlock *&Target,
150 SmallVectorImpl<MachineOperand> &Cond) {
151 // Block ends with fall-through condbranch.
152 switch (LastInst->getOpcode()) {
153 default:
154 llvm_unreachable("Unknown branch instruction?");
155 case AArch64::Bcc:
156 Target = LastInst->getOperand(1).getMBB();
157 Cond.push_back(LastInst->getOperand(0));
158 break;
159 case AArch64::CBZW:
160 case AArch64::CBZX:
161 case AArch64::CBNZW:
162 case AArch64::CBNZX:
163 Target = LastInst->getOperand(1).getMBB();
164 Cond.push_back(MachineOperand::CreateImm(-1));
165 Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode()));
166 Cond.push_back(LastInst->getOperand(0));
167 break;
168 case AArch64::TBZW:
169 case AArch64::TBZX:
170 case AArch64::TBNZW:
171 case AArch64::TBNZX:
172 Target = LastInst->getOperand(2).getMBB();
173 Cond.push_back(MachineOperand::CreateImm(-1));
174 Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode()));
175 Cond.push_back(LastInst->getOperand(0));
176 Cond.push_back(LastInst->getOperand(1));
177 }
178 }
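// Illustrative note (not part of the original source): the Cond vector built
// above uses a small ad-hoc encoding that the other hooks in this file rely
// on:
//   Bcc:        Cond = { <condition code> }
//   CBZ/CBNZ:   Cond = { -1, <opcode>, <register> }
//   TBZ/TBNZ:   Cond = { -1, <opcode>, <register>, <bit number> }
// The leading -1 marks a folded compare-and-branch so that
// reverseBranchCondition(), instantiateCondBranch() and insertSelect() can
// tell it apart from a plain Bcc condition code.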
179
180 static unsigned getBranchDisplacementBits(unsigned Opc) {
181 switch (Opc) {
182 default:
183 llvm_unreachable("unexpected opcode!");
184 case AArch64::B:
185 return 64;
186 case AArch64::TBNZW:
187 case AArch64::TBZW:
188 case AArch64::TBNZX:
189 case AArch64::TBZX:
190 return TBZDisplacementBits;
191 case AArch64::CBNZW:
192 case AArch64::CBZW:
193 case AArch64::CBNZX:
194 case AArch64::CBZX:
195 return CBZDisplacementBits;
196 case AArch64::Bcc:
197 return BCCDisplacementBits;
198 }
199 }
200
201 bool AArch64InstrInfo::isBranchOffsetInRange(unsigned BranchOp,
202 int64_t BrOffset) const {
203 unsigned Bits = getBranchDisplacementBits(BranchOp);
204 assert(Bits >= 3 && "max branch displacement must be enough to jump "
205 "over conditional branch expansion");
206 return isIntN(Bits, BrOffset / 4);
207 }
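// Example (a sketch based on the option defaults above): TB[N]Z encodes a
// 14-bit signed word offset, so with TBZDisplacementBits == 14 the reachable
// range is roughly +/- 2^13 * 4 bytes = +/- 32 KiB; CB[N]Z and Bcc use 19
// bits, i.e. roughly +/- 2^18 * 4 bytes = +/- 1 MiB. isBranchOffsetInRange()
// checks exactly this: isIntN(Bits, BrOffset / 4).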
208
209 MachineBasicBlock *
210 AArch64InstrInfo::getBranchDestBlock(const MachineInstr &MI) const {
211 switch (MI.getOpcode()) {
212 default:
213 llvm_unreachable("unexpected opcode!");
214 case AArch64::B:
215 return MI.getOperand(0).getMBB();
216 case AArch64::TBZW:
217 case AArch64::TBNZW:
218 case AArch64::TBZX:
219 case AArch64::TBNZX:
220 return MI.getOperand(2).getMBB();
221 case AArch64::CBZW:
222 case AArch64::CBNZW:
223 case AArch64::CBZX:
224 case AArch64::CBNZX:
225 case AArch64::Bcc:
226 return MI.getOperand(1).getMBB();
227 }
228 }
229
230 // Branch analysis.
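// Hypothetical example of the contract implemented below: for a block ending
// in
//     CBZW %w0, %bb.2
//     B %bb.3
// analyzeBranch() returns false (analyzable) with TBB = %bb.2, FBB = %bb.3
// and Cond = { -1, AArch64::CBZW, %w0 } (see parseCondBranch above). A block
// ending in an indirect branch makes it return true ("cannot analyze").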
231 bool AArch64InstrInfo::analyzeBranch(MachineBasicBlock &MBB,
232 MachineBasicBlock *&TBB,
233 MachineBasicBlock *&FBB,
234 SmallVectorImpl<MachineOperand> &Cond,
235 bool AllowModify) const {
236 // If the block has no terminators, it just falls into the block after it.
237 MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
238 if (I == MBB.end())
239 return false;
240
241 // Skip over SpeculationBarrierEndBB terminators
242 if (I->getOpcode() == AArch64::SpeculationBarrierISBDSBEndBB ||
243 I->getOpcode() == AArch64::SpeculationBarrierSBEndBB) {
244 --I;
245 }
246
247 if (!isUnpredicatedTerminator(*I))
248 return false;
249
250 // Get the last instruction in the block.
251 MachineInstr *LastInst = &*I;
252
253 // If there is only one terminator instruction, process it.
254 unsigned LastOpc = LastInst->getOpcode();
255 if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
256 if (isUncondBranchOpcode(LastOpc)) {
257 TBB = LastInst->getOperand(0).getMBB();
258 return false;
259 }
260 if (isCondBranchOpcode(LastOpc)) {
261 // Block ends with fall-through condbranch.
262 parseCondBranch(LastInst, TBB, Cond);
263 return false;
264 }
265 return true; // Can't handle indirect branch.
266 }
267
268 // Get the instruction before it if it is a terminator.
269 MachineInstr *SecondLastInst = &*I;
270 unsigned SecondLastOpc = SecondLastInst->getOpcode();
271
272 // If AllowModify is true and the block ends with two or more unconditional
273 // branches, delete all but the first unconditional branch.
274 if (AllowModify && isUncondBranchOpcode(LastOpc)) {
275 while (isUncondBranchOpcode(SecondLastOpc)) {
276 LastInst->eraseFromParent();
277 LastInst = SecondLastInst;
278 LastOpc = LastInst->getOpcode();
279 if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
280 // Return now that the only terminator is an unconditional branch.
281 TBB = LastInst->getOperand(0).getMBB();
282 return false;
283 } else {
284 SecondLastInst = &*I;
285 SecondLastOpc = SecondLastInst->getOpcode();
286 }
287 }
288 }
289
290 // If there are three terminators, we don't know what sort of block this is.
291 if (SecondLastInst && I != MBB.begin() && isUnpredicatedTerminator(*--I))
292 return true;
293
294 // If the block ends with a B and a Bcc, handle it.
295 if (isCondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
296 parseCondBranch(SecondLastInst, TBB, Cond);
297 FBB = LastInst->getOperand(0).getMBB();
298 return false;
299 }
300
301 // If the block ends with two unconditional branches, handle it. The second
302 // one is not executed, so remove it.
303 if (isUncondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
304 TBB = SecondLastInst->getOperand(0).getMBB();
305 I = LastInst;
306 if (AllowModify)
307 I->eraseFromParent();
308 return false;
309 }
310
311 // ...likewise if it ends with an indirect branch followed by an unconditional
312 // branch.
313 if (isIndirectBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
314 I = LastInst;
315 if (AllowModify)
316 I->eraseFromParent();
317 return true;
318 }
319
320 // Otherwise, can't handle this.
321 return true;
322 }
323
324 bool AArch64InstrInfo::reverseBranchCondition(
325 SmallVectorImpl<MachineOperand> &Cond) const {
326 if (Cond[0].getImm() != -1) {
327 // Regular Bcc
328 AArch64CC::CondCode CC = (AArch64CC::CondCode)(int)Cond[0].getImm();
329 Cond[0].setImm(AArch64CC::getInvertedCondCode(CC));
330 } else {
331 // Folded compare-and-branch
332 switch (Cond[1].getImm()) {
333 default:
334 llvm_unreachable("Unknown conditional branch!");
335 case AArch64::CBZW:
336 Cond[1].setImm(AArch64::CBNZW);
337 break;
338 case AArch64::CBNZW:
339 Cond[1].setImm(AArch64::CBZW);
340 break;
341 case AArch64::CBZX:
342 Cond[1].setImm(AArch64::CBNZX);
343 break;
344 case AArch64::CBNZX:
345 Cond[1].setImm(AArch64::CBZX);
346 break;
347 case AArch64::TBZW:
348 Cond[1].setImm(AArch64::TBNZW);
349 break;
350 case AArch64::TBNZW:
351 Cond[1].setImm(AArch64::TBZW);
352 break;
353 case AArch64::TBZX:
354 Cond[1].setImm(AArch64::TBNZX);
355 break;
356 case AArch64::TBNZX:
357 Cond[1].setImm(AArch64::TBZX);
358 break;
359 }
360 }
361
362 return false;
363 }
364
365 unsigned AArch64InstrInfo::removeBranch(MachineBasicBlock &MBB,
366 int *BytesRemoved) const {
367 MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
368 if (I == MBB.end())
369 return 0;
370
371 if (!isUncondBranchOpcode(I->getOpcode()) &&
372 !isCondBranchOpcode(I->getOpcode()))
373 return 0;
374
375 // Remove the branch.
376 I->eraseFromParent();
377
378 I = MBB.end();
379
380 if (I == MBB.begin()) {
381 if (BytesRemoved)
382 *BytesRemoved = 4;
383 return 1;
384 }
385 --I;
386 if (!isCondBranchOpcode(I->getOpcode())) {
387 if (BytesRemoved)
388 *BytesRemoved = 4;
389 return 1;
390 }
391
392 // Remove the branch.
393 I->eraseFromParent();
394 if (BytesRemoved)
395 *BytesRemoved = 8;
396
397 return 2;
398 }
399
400 void AArch64InstrInfo::instantiateCondBranch(
401 MachineBasicBlock &MBB, const DebugLoc &DL, MachineBasicBlock *TBB,
402 ArrayRef<MachineOperand> Cond) const {
403 if (Cond[0].getImm() != -1) {
404 // Regular Bcc
405 BuildMI(&MBB, DL, get(AArch64::Bcc)).addImm(Cond[0].getImm()).addMBB(TBB);
406 } else {
407 // Folded compare-and-branch
408 // Note that we use addOperand instead of addReg to keep the flags.
409 const MachineInstrBuilder MIB =
410 BuildMI(&MBB, DL, get(Cond[1].getImm())).add(Cond[2]);
411 if (Cond.size() > 3)
412 MIB.addImm(Cond[3].getImm());
413 MIB.addMBB(TBB);
414 }
415 }
416
417 unsigned AArch64InstrInfo::insertBranch(
418 MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB,
419 ArrayRef<MachineOperand> Cond, const DebugLoc &DL, int *BytesAdded) const {
420 // Shouldn't be a fall through.
421 assert(TBB && "insertBranch must not be told to insert a fallthrough");
422
423 if (!FBB) {
424 if (Cond.empty()) // Unconditional branch?
425 BuildMI(&MBB, DL, get(AArch64::B)).addMBB(TBB);
426 else
427 instantiateCondBranch(MBB, DL, TBB, Cond);
428
429 if (BytesAdded)
430 *BytesAdded = 4;
431
432 return 1;
433 }
434
435 // Two-way conditional branch.
436 instantiateCondBranch(MBB, DL, TBB, Cond);
437 BuildMI(&MBB, DL, get(AArch64::B)).addMBB(FBB);
438
439 if (BytesAdded)
440 *BytesAdded = 8;
441
442 return 2;
443 }
444
445 // Find the original register that VReg is copied from.
446 static unsigned removeCopies(const MachineRegisterInfo &MRI, unsigned VReg) {
447 while (Register::isVirtualRegister(VReg)) {
448 const MachineInstr *DefMI = MRI.getVRegDef(VReg);
449 if (!DefMI->isFullCopy())
450 return VReg;
451 VReg = DefMI->getOperand(1).getReg();
452 }
453 return VReg;
454 }
455
456 // Determine if VReg is defined by an instruction that can be folded into a
457 // csel instruction. If so, return the folded opcode, and the replacement
458 // register.
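// Illustrative example (my own, not from the original comments): if TrueReg
// is defined by "%t = ADDWri %a, 1, 0", insertSelect() below can replace
//     %dst = CSELWr %t, %f, cc
// with
//     %dst = CSINCWr %f, %a, InvertedCC(cc)
// because csinc selects its first operand when the condition holds and
// otherwise returns the second operand plus one.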
459 static unsigned canFoldIntoCSel(const MachineRegisterInfo &MRI, unsigned VReg,
460 unsigned *NewVReg = nullptr) {
461 VReg = removeCopies(MRI, VReg);
462 if (!Register::isVirtualRegister(VReg))
463 return 0;
464
465 bool Is64Bit = AArch64::GPR64allRegClass.hasSubClassEq(MRI.getRegClass(VReg));
466 const MachineInstr *DefMI = MRI.getVRegDef(VReg);
467 unsigned Opc = 0;
468 unsigned SrcOpNum = 0;
469 switch (DefMI->getOpcode()) {
470 case AArch64::ADDSXri:
471 case AArch64::ADDSWri:
472 // if NZCV is used, do not fold.
473 if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, true) == -1)
474 return 0;
475 // fall-through to ADDXri and ADDWri.
476 LLVM_FALLTHROUGH;
477 case AArch64::ADDXri:
478 case AArch64::ADDWri:
479 // add x, 1 -> csinc.
480 if (!DefMI->getOperand(2).isImm() || DefMI->getOperand(2).getImm() != 1 ||
481 DefMI->getOperand(3).getImm() != 0)
482 return 0;
483 SrcOpNum = 1;
484 Opc = Is64Bit ? AArch64::CSINCXr : AArch64::CSINCWr;
485 break;
486
487 case AArch64::ORNXrr:
488 case AArch64::ORNWrr: {
489 // not x -> csinv, represented as orn dst, xzr, src.
490 unsigned ZReg = removeCopies(MRI, DefMI->getOperand(1).getReg());
491 if (ZReg != AArch64::XZR && ZReg != AArch64::WZR)
492 return 0;
493 SrcOpNum = 2;
494 Opc = Is64Bit ? AArch64::CSINVXr : AArch64::CSINVWr;
495 break;
496 }
497
498 case AArch64::SUBSXrr:
499 case AArch64::SUBSWrr:
500 // if NZCV is used, do not fold.
501 if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, true) == -1)
502 return 0;
503 // fall-through to SUBXrr and SUBWrr.
504 LLVM_FALLTHROUGH;
505 case AArch64::SUBXrr:
506 case AArch64::SUBWrr: {
507 // neg x -> csneg, represented as sub dst, xzr, src.
508 unsigned ZReg = removeCopies(MRI, DefMI->getOperand(1).getReg());
509 if (ZReg != AArch64::XZR && ZReg != AArch64::WZR)
510 return 0;
511 SrcOpNum = 2;
512 Opc = Is64Bit ? AArch64::CSNEGXr : AArch64::CSNEGWr;
513 break;
514 }
515 default:
516 return 0;
517 }
518 assert(Opc && SrcOpNum && "Missing parameters");
519
520 if (NewVReg)
521 *NewVReg = DefMI->getOperand(SrcOpNum).getReg();
522 return Opc;
523 }
524
525 bool AArch64InstrInfo::canInsertSelect(const MachineBasicBlock &MBB,
526 ArrayRef<MachineOperand> Cond,
527 Register DstReg, Register TrueReg,
528 Register FalseReg, int &CondCycles,
529 int &TrueCycles,
530 int &FalseCycles) const {
531 // Check register classes.
532 const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
533 const TargetRegisterClass *RC =
534 RI.getCommonSubClass(MRI.getRegClass(TrueReg), MRI.getRegClass(FalseReg));
535 if (!RC)
536 return false;
537
538 // Also need to check the dest regclass, in case we're trying to optimize
539 // something like:
540 // %1(gpr) = PHI %2(fpr), bb1, %(fpr), bb2
541 if (!RI.getCommonSubClass(RC, MRI.getRegClass(DstReg)))
542 return false;
543
544 // Expanding cbz/tbz requires an extra cycle of latency on the condition.
545 unsigned ExtraCondLat = Cond.size() != 1;
546
547 // GPRs are handled by csel.
548 // FIXME: Fold in x+1, -x, and ~x when applicable.
549 if (AArch64::GPR64allRegClass.hasSubClassEq(RC) ||
550 AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
551 // Single-cycle csel, csinc, csinv, and csneg.
552 CondCycles = 1 + ExtraCondLat;
553 TrueCycles = FalseCycles = 1;
554 if (canFoldIntoCSel(MRI, TrueReg))
555 TrueCycles = 0;
556 else if (canFoldIntoCSel(MRI, FalseReg))
557 FalseCycles = 0;
558 return true;
559 }
560
561 // Scalar floating point is handled by fcsel.
562 // FIXME: Form fabs, fmin, and fmax when applicable.
563 if (AArch64::FPR64RegClass.hasSubClassEq(RC) ||
564 AArch64::FPR32RegClass.hasSubClassEq(RC)) {
565 CondCycles = 5 + ExtraCondLat;
566 TrueCycles = FalseCycles = 2;
567 return true;
568 }
569
570 // Can't do vectors.
571 return false;
572 }
573
574 void AArch64InstrInfo::insertSelect(MachineBasicBlock &MBB,
575 MachineBasicBlock::iterator I,
576 const DebugLoc &DL, Register DstReg,
577 ArrayRef<MachineOperand> Cond,
578 Register TrueReg, Register FalseReg) const {
579 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
580
581 // Parse the condition code, see parseCondBranch() above.
582 AArch64CC::CondCode CC;
583 switch (Cond.size()) {
584 default:
585 llvm_unreachable("Unknown condition opcode in Cond");
586 case 1: // b.cc
587 CC = AArch64CC::CondCode(Cond[0].getImm());
588 break;
589 case 3: { // cbz/cbnz
590 // We must insert a compare against 0.
591 bool Is64Bit;
592 switch (Cond[1].getImm()) {
593 default:
594 llvm_unreachable("Unknown branch opcode in Cond");
595 case AArch64::CBZW:
596 Is64Bit = false;
597 CC = AArch64CC::EQ;
598 break;
599 case AArch64::CBZX:
600 Is64Bit = true;
601 CC = AArch64CC::EQ;
602 break;
603 case AArch64::CBNZW:
604 Is64Bit = false;
605 CC = AArch64CC::NE;
606 break;
607 case AArch64::CBNZX:
608 Is64Bit = true;
609 CC = AArch64CC::NE;
610 break;
611 }
612 Register SrcReg = Cond[2].getReg();
613 if (Is64Bit) {
614 // cmp reg, #0 is actually subs xzr, reg, #0.
615 MRI.constrainRegClass(SrcReg, &AArch64::GPR64spRegClass);
616 BuildMI(MBB, I, DL, get(AArch64::SUBSXri), AArch64::XZR)
617 .addReg(SrcReg)
618 .addImm(0)
619 .addImm(0);
620 } else {
621 MRI.constrainRegClass(SrcReg, &AArch64::GPR32spRegClass);
622 BuildMI(MBB, I, DL, get(AArch64::SUBSWri), AArch64::WZR)
623 .addReg(SrcReg)
624 .addImm(0)
625 .addImm(0);
626 }
627 break;
628 }
629 case 4: { // tbz/tbnz
630 // We must insert a tst instruction.
631 switch (Cond[1].getImm()) {
632 default:
633 llvm_unreachable("Unknown branch opcode in Cond");
634 case AArch64::TBZW:
635 case AArch64::TBZX:
636 CC = AArch64CC::EQ;
637 break;
638 case AArch64::TBNZW:
639 case AArch64::TBNZX:
640 CC = AArch64CC::NE;
641 break;
642 }
643 // cmp reg, #foo is actually ands xzr, reg, #1<<foo.
644 if (Cond[1].getImm() == AArch64::TBZW || Cond[1].getImm() == AArch64::TBNZW)
645 BuildMI(MBB, I, DL, get(AArch64::ANDSWri), AArch64::WZR)
646 .addReg(Cond[2].getReg())
647 .addImm(
648 AArch64_AM::encodeLogicalImmediate(1ull << Cond[3].getImm(), 32));
649 else
650 BuildMI(MBB, I, DL, get(AArch64::ANDSXri), AArch64::XZR)
651 .addReg(Cond[2].getReg())
652 .addImm(
653 AArch64_AM::encodeLogicalImmediate(1ull << Cond[3].getImm(), 64));
654 break;
655 }
656 }
657
658 unsigned Opc = 0;
659 const TargetRegisterClass *RC = nullptr;
660 bool TryFold = false;
661 if (MRI.constrainRegClass(DstReg, &AArch64::GPR64RegClass)) {
662 RC = &AArch64::GPR64RegClass;
663 Opc = AArch64::CSELXr;
664 TryFold = true;
665 } else if (MRI.constrainRegClass(DstReg, &AArch64::GPR32RegClass)) {
666 RC = &AArch64::GPR32RegClass;
667 Opc = AArch64::CSELWr;
668 TryFold = true;
669 } else if (MRI.constrainRegClass(DstReg, &AArch64::FPR64RegClass)) {
670 RC = &AArch64::FPR64RegClass;
671 Opc = AArch64::FCSELDrrr;
672 } else if (MRI.constrainRegClass(DstReg, &AArch64::FPR32RegClass)) {
673 RC = &AArch64::FPR32RegClass;
674 Opc = AArch64::FCSELSrrr;
675 }
676 assert(RC && "Unsupported regclass");
677
678 // Try folding simple instructions into the csel.
679 if (TryFold) {
680 unsigned NewVReg = 0;
681 unsigned FoldedOpc = canFoldIntoCSel(MRI, TrueReg, &NewVReg);
682 if (FoldedOpc) {
683 // The folded opcodes csinc, csinv and csneg apply the operation to
684 // FalseReg, so we need to invert the condition.
685 CC = AArch64CC::getInvertedCondCode(CC);
686 TrueReg = FalseReg;
687 } else
688 FoldedOpc = canFoldIntoCSel(MRI, FalseReg, &NewVReg);
689
690 // Fold the operation. Leave any dead instructions for DCE to clean up.
691 if (FoldedOpc) {
692 FalseReg = NewVReg;
693 Opc = FoldedOpc;
694 // This extends the live range of NewVReg.
695 MRI.clearKillFlags(NewVReg);
696 }
697 }
698
699 // Pull all virtual registers into the appropriate class.
700 MRI.constrainRegClass(TrueReg, RC);
701 MRI.constrainRegClass(FalseReg, RC);
702
703 // Insert the csel.
704 BuildMI(MBB, I, DL, get(Opc), DstReg)
705 .addReg(TrueReg)
706 .addReg(FalseReg)
707 .addImm(CC);
708 }
709
710 /// Returns true if a MOVi32imm or MOVi64imm can be expanded to an ORRxx.
711 static bool canBeExpandedToORR(const MachineInstr &MI, unsigned BitSize) {
712 uint64_t Imm = MI.getOperand(1).getImm();
713 uint64_t UImm = Imm << (64 - BitSize) >> (64 - BitSize);
714 uint64_t Encoding;
715 return AArch64_AM::processLogicalImmediate(UImm, BitSize, Encoding);
716 }
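// Example (illustrative): MOVi32imm #0x00ff00ff can be expanded to
// "ORRWri Rd, WZR, #0x00ff00ff" because the value is a valid logical
// immediate (a replicated, rotated run of ones), whereas MOVi32imm
// #0x12345678 cannot and is instead materialized with a MOVZ/MOVK sequence.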
717
718 // FIXME: this implementation should be micro-architecture dependent, so a
719 // micro-architecture target hook should be introduced here in future.
720 bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr &MI) const {
721 if (!Subtarget.hasCustomCheapAsMoveHandling())
722 return MI.isAsCheapAsAMove();
723
724 const unsigned Opcode = MI.getOpcode();
725
726 // Firstly, check cases gated by features.
727
728 if (Subtarget.hasZeroCycleZeroingFP()) {
729 if (Opcode == AArch64::FMOVH0 ||
730 Opcode == AArch64::FMOVS0 ||
731 Opcode == AArch64::FMOVD0)
732 return true;
733 }
734
735 if (Subtarget.hasZeroCycleZeroingGP()) {
736 if (Opcode == TargetOpcode::COPY &&
737 (MI.getOperand(1).getReg() == AArch64::WZR ||
738 MI.getOperand(1).getReg() == AArch64::XZR))
739 return true;
740 }
741
742 // Secondly, check cases specific to sub-targets.
743
744 if (Subtarget.hasExynosCheapAsMoveHandling()) {
745 if (isExynosCheapAsMove(MI))
746 return true;
747
748 return MI.isAsCheapAsAMove();
749 }
750
751 // Finally, check generic cases.
752
753 switch (Opcode) {
754 default:
755 return false;
756
757 // add/sub on register without shift
758 case AArch64::ADDWri:
759 case AArch64::ADDXri:
760 case AArch64::SUBWri:
761 case AArch64::SUBXri:
762 return (MI.getOperand(3).getImm() == 0);
763
764 // logical ops on immediate
765 case AArch64::ANDWri:
766 case AArch64::ANDXri:
767 case AArch64::EORWri:
768 case AArch64::EORXri:
769 case AArch64::ORRWri:
770 case AArch64::ORRXri:
771 return true;
772
773 // logical ops on register without shift
774 case AArch64::ANDWrr:
775 case AArch64::ANDXrr:
776 case AArch64::BICWrr:
777 case AArch64::BICXrr:
778 case AArch64::EONWrr:
779 case AArch64::EONXrr:
780 case AArch64::EORWrr:
781 case AArch64::EORXrr:
782 case AArch64::ORNWrr:
783 case AArch64::ORNXrr:
784 case AArch64::ORRWrr:
785 case AArch64::ORRXrr:
786 return true;
787
788 // If MOVi32imm or MOVi64imm can be expanded into ORRWri or
789 // ORRXri, it is as cheap as MOV
790 case AArch64::MOVi32imm:
791 return canBeExpandedToORR(MI, 32);
792 case AArch64::MOVi64imm:
793 return canBeExpandedToORR(MI, 64);
794 }
795
796 llvm_unreachable("Unknown opcode to check as cheap as a move!");
797 }
798
799 bool AArch64InstrInfo::isFalkorShiftExtFast(const MachineInstr &MI) {
800 switch (MI.getOpcode()) {
801 default:
802 return false;
803
804 case AArch64::ADDWrs:
805 case AArch64::ADDXrs:
806 case AArch64::ADDSWrs:
807 case AArch64::ADDSXrs: {
808 unsigned Imm = MI.getOperand(3).getImm();
809 unsigned ShiftVal = AArch64_AM::getShiftValue(Imm);
810 if (ShiftVal == 0)
811 return true;
812 return AArch64_AM::getShiftType(Imm) == AArch64_AM::LSL && ShiftVal <= 5;
813 }
814
815 case AArch64::ADDWrx:
816 case AArch64::ADDXrx:
817 case AArch64::ADDXrx64:
818 case AArch64::ADDSWrx:
819 case AArch64::ADDSXrx:
820 case AArch64::ADDSXrx64: {
821 unsigned Imm = MI.getOperand(3).getImm();
822 switch (AArch64_AM::getArithExtendType(Imm)) {
823 default:
824 return false;
825 case AArch64_AM::UXTB:
826 case AArch64_AM::UXTH:
827 case AArch64_AM::UXTW:
828 case AArch64_AM::UXTX:
829 return AArch64_AM::getArithShiftValue(Imm) <= 4;
830 }
831 }
832
833 case AArch64::SUBWrs:
834 case AArch64::SUBSWrs: {
835 unsigned Imm = MI.getOperand(3).getImm();
836 unsigned ShiftVal = AArch64_AM::getShiftValue(Imm);
837 return ShiftVal == 0 ||
838 (AArch64_AM::getShiftType(Imm) == AArch64_AM::ASR && ShiftVal == 31);
839 }
840
841 case AArch64::SUBXrs:
842 case AArch64::SUBSXrs: {
843 unsigned Imm = MI.getOperand(3).getImm();
844 unsigned ShiftVal = AArch64_AM::getShiftValue(Imm);
845 return ShiftVal == 0 ||
846 (AArch64_AM::getShiftType(Imm) == AArch64_AM::ASR && ShiftVal == 63);
847 }
848
849 case AArch64::SUBWrx:
850 case AArch64::SUBXrx:
851 case AArch64::SUBXrx64:
852 case AArch64::SUBSWrx:
853 case AArch64::SUBSXrx:
854 case AArch64::SUBSXrx64: {
855 unsigned Imm = MI.getOperand(3).getImm();
856 switch (AArch64_AM::getArithExtendType(Imm)) {
857 default:
858 return false;
859 case AArch64_AM::UXTB:
860 case AArch64_AM::UXTH:
861 case AArch64_AM::UXTW:
862 case AArch64_AM::UXTX:
863 return AArch64_AM::getArithShiftValue(Imm) == 0;
864 }
865 }
866
867 case AArch64::LDRBBroW:
868 case AArch64::LDRBBroX:
869 case AArch64::LDRBroW:
870 case AArch64::LDRBroX:
871 case AArch64::LDRDroW:
872 case AArch64::LDRDroX:
873 case AArch64::LDRHHroW:
874 case AArch64::LDRHHroX:
875 case AArch64::LDRHroW:
876 case AArch64::LDRHroX:
877 case AArch64::LDRQroW:
878 case AArch64::LDRQroX:
879 case AArch64::LDRSBWroW:
880 case AArch64::LDRSBWroX:
881 case AArch64::LDRSBXroW:
882 case AArch64::LDRSBXroX:
883 case AArch64::LDRSHWroW:
884 case AArch64::LDRSHWroX:
885 case AArch64::LDRSHXroW:
886 case AArch64::LDRSHXroX:
887 case AArch64::LDRSWroW:
888 case AArch64::LDRSWroX:
889 case AArch64::LDRSroW:
890 case AArch64::LDRSroX:
891 case AArch64::LDRWroW:
892 case AArch64::LDRWroX:
893 case AArch64::LDRXroW:
894 case AArch64::LDRXroX:
895 case AArch64::PRFMroW:
896 case AArch64::PRFMroX:
897 case AArch64::STRBBroW:
898 case AArch64::STRBBroX:
899 case AArch64::STRBroW:
900 case AArch64::STRBroX:
901 case AArch64::STRDroW:
902 case AArch64::STRDroX:
903 case AArch64::STRHHroW:
904 case AArch64::STRHHroX:
905 case AArch64::STRHroW:
906 case AArch64::STRHroX:
907 case AArch64::STRQroW:
908 case AArch64::STRQroX:
909 case AArch64::STRSroW:
910 case AArch64::STRSroX:
911 case AArch64::STRWroW:
912 case AArch64::STRWroX:
913 case AArch64::STRXroW:
914 case AArch64::STRXroX: {
915 unsigned IsSigned = MI.getOperand(3).getImm();
916 return !IsSigned;
917 }
918 }
919 }
920
921 bool AArch64InstrInfo::isSEHInstruction(const MachineInstr &MI) {
922 unsigned Opc = MI.getOpcode();
923 switch (Opc) {
924 default:
925 return false;
926 case AArch64::SEH_StackAlloc:
927 case AArch64::SEH_SaveFPLR:
928 case AArch64::SEH_SaveFPLR_X:
929 case AArch64::SEH_SaveReg:
930 case AArch64::SEH_SaveReg_X:
931 case AArch64::SEH_SaveRegP:
932 case AArch64::SEH_SaveRegP_X:
933 case AArch64::SEH_SaveFReg:
934 case AArch64::SEH_SaveFReg_X:
935 case AArch64::SEH_SaveFRegP:
936 case AArch64::SEH_SaveFRegP_X:
937 case AArch64::SEH_SetFP:
938 case AArch64::SEH_AddFP:
939 case AArch64::SEH_Nop:
940 case AArch64::SEH_PrologEnd:
941 case AArch64::SEH_EpilogStart:
942 case AArch64::SEH_EpilogEnd:
943 return true;
944 }
945 }
946
947 bool AArch64InstrInfo::isCoalescableExtInstr(const MachineInstr &MI,
948 Register &SrcReg, Register &DstReg,
949 unsigned &SubIdx) const {
950 switch (MI.getOpcode()) {
951 default:
952 return false;
953 case AArch64::SBFMXri: // aka sxtw
954 case AArch64::UBFMXri: // aka uxtw
955 // Check for the 32 -> 64 bit extension case, these instructions can do
956 // much more.
957 if (MI.getOperand(2).getImm() != 0 || MI.getOperand(3).getImm() != 31)
958 return false;
959 // This is a signed or unsigned 32 -> 64 bit extension.
960 SrcReg = MI.getOperand(1).getReg();
961 DstReg = MI.getOperand(0).getReg();
962 SubIdx = AArch64::sub_32;
963 return true;
964 }
965 }
966
967 bool AArch64InstrInfo::areMemAccessesTriviallyDisjoint(
968 const MachineInstr &MIa, const MachineInstr &MIb) const {
969 const TargetRegisterInfo *TRI = &getRegisterInfo();
970 const MachineOperand *BaseOpA = nullptr, *BaseOpB = nullptr;
971 int64_t OffsetA = 0, OffsetB = 0;
972 unsigned WidthA = 0, WidthB = 0;
973 bool OffsetAIsScalable = false, OffsetBIsScalable = false;
974
975 assert(MIa.mayLoadOrStore() && "MIa must be a load or store.");
976 assert(MIb.mayLoadOrStore() && "MIb must be a load or store.");
977
978 if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects() ||
979 MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())
980 return false;
981
982 // Retrieve the base, the offset from the base, and the width. The width
983 // is the size of memory that is being loaded/stored (e.g. 1, 2, 4, 8). If
984 // the bases are identical, and the offset of the lower memory access plus
985 // its width doesn't overlap the offset of the higher memory access,
986 // then the memory accesses are disjoint.
987 // If OffsetAIsScalable and OffsetBIsScalable are both true, they
988 // are assumed to have the same scale (vscale).
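// For example (illustrative): an 8-byte load at byte offset 8 and an 8-byte
// store at byte offset 16 from the same frame-index base satisfy
// 8 + 8 <= 16, so the two accesses are reported as trivially disjoint.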
989 if (getMemOperandWithOffsetWidth(MIa, BaseOpA, OffsetA, OffsetAIsScalable,
990 WidthA, TRI) &&
991 getMemOperandWithOffsetWidth(MIb, BaseOpB, OffsetB, OffsetBIsScalable,
992 WidthB, TRI)) {
993 if (BaseOpA->isIdenticalTo(*BaseOpB) &&
994 OffsetAIsScalable == OffsetBIsScalable) {
995 int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
996 int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
997 int LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
998 if (LowOffset + LowWidth <= HighOffset)
999 return true;
1000 }
1001 }
1002 return false;
1003 }
1004
1005 bool AArch64InstrInfo::isSchedulingBoundary(const MachineInstr &MI,
1006 const MachineBasicBlock *MBB,
1007 const MachineFunction &MF) const {
1008 if (TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF))
1009 return true;
1010 switch (MI.getOpcode()) {
1011 case AArch64::HINT:
1012 // CSDB hints are scheduling barriers.
1013 if (MI.getOperand(0).getImm() == 0x14)
1014 return true;
1015 break;
1016 case AArch64::DSB:
1017 case AArch64::ISB:
1018 // DSB and ISB also are scheduling barriers.
1019 return true;
1020 default:;
1021 }
1022 return isSEHInstruction(MI);
1023 }
1024
1025 /// analyzeCompare - For a comparison instruction, return the source registers
1026 /// in SrcReg and SrcReg2, and the value it compares against in CmpValue.
1027 /// Return true if the comparison instruction can be analyzed.
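// Hypothetical example: for "SUBSWri %cmp, %w1, 42, 0" (i.e. "cmp w1, #42")
// the hook below returns SrcReg = %w1, SrcReg2 = 0, CmpMask = ~0 and
// CmpValue = 1 (any non-zero immediate is flattened to 1; see the FIXME in
// the immediate cases).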
1028 bool AArch64InstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg,
1029 Register &SrcReg2, int &CmpMask,
1030 int &CmpValue) const {
1031 // The first operand can be a frame index where we'd normally expect a
1032 // register.
1033 assert(MI.getNumOperands() >= 2 && "All AArch64 cmps should have 2 operands");
1034 if (!MI.getOperand(1).isReg())
1035 return false;
1036
1037 switch (MI.getOpcode()) {
1038 default:
1039 break;
1040 case AArch64::SUBSWrr:
1041 case AArch64::SUBSWrs:
1042 case AArch64::SUBSWrx:
1043 case AArch64::SUBSXrr:
1044 case AArch64::SUBSXrs:
1045 case AArch64::SUBSXrx:
1046 case AArch64::ADDSWrr:
1047 case AArch64::ADDSWrs:
1048 case AArch64::ADDSWrx:
1049 case AArch64::ADDSXrr:
1050 case AArch64::ADDSXrs:
1051 case AArch64::ADDSXrx:
1052 // Replace SUBSWrr with SUBWrr if NZCV is not used.
1053 SrcReg = MI.getOperand(1).getReg();
1054 SrcReg2 = MI.getOperand(2).getReg();
1055 CmpMask = ~0;
1056 CmpValue = 0;
1057 return true;
1058 case AArch64::SUBSWri:
1059 case AArch64::ADDSWri:
1060 case AArch64::SUBSXri:
1061 case AArch64::ADDSXri:
1062 SrcReg = MI.getOperand(1).getReg();
1063 SrcReg2 = 0;
1064 CmpMask = ~0;
1065 // FIXME: In order to convert CmpValue to 0 or 1
1066 CmpValue = MI.getOperand(2).getImm() != 0;
1067 return true;
1068 case AArch64::ANDSWri:
1069 case AArch64::ANDSXri:
1070 // ANDS does not use the same encoding scheme as the other xxxS
1071 // instructions.
1072 SrcReg = MI.getOperand(1).getReg();
1073 SrcReg2 = 0;
1074 CmpMask = ~0;
1075 // FIXME: The return value type of decodeLogicalImmediate is uint64_t,
1076 // while the type of CmpValue is int. When converting uint64_t to int,
1077 // the high 32 bits of uint64_t will be lost.
1078 // In fact, it causes a bug in spec2006-483.xalancbmk.
1079 // CmpValue is only used to compare with zero in OptimizeCompareInstr.
1080 CmpValue = AArch64_AM::decodeLogicalImmediate(
1081 MI.getOperand(2).getImm(),
1082 MI.getOpcode() == AArch64::ANDSWri ? 32 : 64) != 0;
1083 return true;
1084 }
1085
1086 return false;
1087 }
1088
1089 static bool UpdateOperandRegClass(MachineInstr &Instr) {
1090 MachineBasicBlock *MBB = Instr.getParent();
1091 assert(MBB && "Can't get MachineBasicBlock here");
1092 MachineFunction *MF = MBB->getParent();
1093 assert(MF && "Can't get MachineFunction here");
1094 const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
1095 const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
1096 MachineRegisterInfo *MRI = &MF->getRegInfo();
1097
1098 for (unsigned OpIdx = 0, EndIdx = Instr.getNumOperands(); OpIdx < EndIdx;
1099 ++OpIdx) {
1100 MachineOperand &MO = Instr.getOperand(OpIdx);
1101 const TargetRegisterClass *OpRegCstraints =
1102 Instr.getRegClassConstraint(OpIdx, TII, TRI);
1103
1104 // If there's no constraint, there's nothing to do.
1105 if (!OpRegCstraints)
1106 continue;
1107 // If the operand is a frame index, there's nothing to do here.
1108 // A frame index operand will resolve correctly during PEI.
1109 if (MO.isFI())
1110 continue;
1111
1112 assert(MO.isReg() &&
1113 "Operand has register constraints without being a register!");
1114
1115 Register Reg = MO.getReg();
1116 if (Register::isPhysicalRegister(Reg)) {
1117 if (!OpRegCstraints->contains(Reg))
1118 return false;
1119 } else if (!OpRegCstraints->hasSubClassEq(MRI->getRegClass(Reg)) &&
1120 !MRI->constrainRegClass(Reg, OpRegCstraints))
1121 return false;
1122 }
1123
1124 return true;
1125 }
1126
1127 /// Return the opcode that does not set flags when possible - otherwise
1128 /// return the original opcode. The caller is responsible for doing the actual
1129 /// substitution and legality checking.
1130 static unsigned convertToNonFlagSettingOpc(const MachineInstr &MI) {
1131 // Don't convert all compare instructions, because for some the zero register
1132 // encoding becomes the sp register.
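// For example (illustrative): in "SUBSWri $wzr, $w1, 0, 0" (a plain
// "cmp w1, #0") the destination register field would be re-encoded as WSP
// if the S were dropped, so the flag-setting form is kept in that case.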
1133 bool MIDefinesZeroReg = false;
1134 if (MI.definesRegister(AArch64::WZR) || MI.definesRegister(AArch64::XZR))
1135 MIDefinesZeroReg = true;
1136
1137 switch (MI.getOpcode()) {
1138 default:
1139 return MI.getOpcode();
1140 case AArch64::ADDSWrr:
1141 return AArch64::ADDWrr;
1142 case AArch64::ADDSWri:
1143 return MIDefinesZeroReg ? AArch64::ADDSWri : AArch64::ADDWri;
1144 case AArch64::ADDSWrs:
1145 return MIDefinesZeroReg ? AArch64::ADDSWrs : AArch64::ADDWrs;
1146 case AArch64::ADDSWrx:
1147 return AArch64::ADDWrx;
1148 case AArch64::ADDSXrr:
1149 return AArch64::ADDXrr;
1150 case AArch64::ADDSXri:
1151 return MIDefinesZeroReg ? AArch64::ADDSXri : AArch64::ADDXri;
1152 case AArch64::ADDSXrs:
1153 return MIDefinesZeroReg ? AArch64::ADDSXrs : AArch64::ADDXrs;
1154 case AArch64::ADDSXrx:
1155 return AArch64::ADDXrx;
1156 case AArch64::SUBSWrr:
1157 return AArch64::SUBWrr;
1158 case AArch64::SUBSWri:
1159 return MIDefinesZeroReg ? AArch64::SUBSWri : AArch64::SUBWri;
1160 case AArch64::SUBSWrs:
1161 return MIDefinesZeroReg ? AArch64::SUBSWrs : AArch64::SUBWrs;
1162 case AArch64::SUBSWrx:
1163 return AArch64::SUBWrx;
1164 case AArch64::SUBSXrr:
1165 return AArch64::SUBXrr;
1166 case AArch64::SUBSXri:
1167 return MIDefinesZeroReg ? AArch64::SUBSXri : AArch64::SUBXri;
1168 case AArch64::SUBSXrs:
1169 return MIDefinesZeroReg ? AArch64::SUBSXrs : AArch64::SUBXrs;
1170 case AArch64::SUBSXrx:
1171 return AArch64::SUBXrx;
1172 }
1173 }
1174
1175 enum AccessKind { AK_Write = 0x01, AK_Read = 0x10, AK_All = 0x11 };
1176
1177 /// True when condition flags are accessed (either by writing or reading)
1178 /// on the instruction trace starting at From and ending at To.
1179 ///
1180 /// Note: If From and To are from different blocks it's assumed the condition
1181 /// flags are accessed on the path.
1182 static bool areCFlagsAccessedBetweenInstrs(
1183 MachineBasicBlock::iterator From, MachineBasicBlock::iterator To,
1184 const TargetRegisterInfo *TRI, const AccessKind AccessToCheck = AK_All) {
1185 // Early exit if To is at the beginning of the BB.
1186 if (To == To->getParent()->begin())
1187 return true;
1188
1189 // Check whether the instructions are in the same basic block
1190 // If not, assume the condition flags might get modified somewhere.
1191 if (To->getParent() != From->getParent())
1192 return true;
1193
1194 // From must be above To.
1195 assert(std::find_if(++To.getReverse(), To->getParent()->rend(),
1196 [From](MachineInstr &MI) {
1197 return MI.getIterator() == From;
1198 }) != To->getParent()->rend());
1199
1200 // We iterate backward starting at \p To until we hit \p From.
1201 for (const MachineInstr &Instr :
1202 instructionsWithoutDebug(++To.getReverse(), From.getReverse())) {
1203 if (((AccessToCheck & AK_Write) &&
1204 Instr.modifiesRegister(AArch64::NZCV, TRI)) ||
1205 ((AccessToCheck & AK_Read) && Instr.readsRegister(AArch64::NZCV, TRI)))
1206 return true;
1207 }
1208 return false;
1209 }
1210
1211 /// Try to optimize a compare instruction. A compare instruction is an
1212 /// instruction which produces AArch64::NZCV. It can be treated as a true
1213 /// compare instruction
1214 /// when there are no uses of its destination register.
1215 ///
1216 /// The following steps are tried in order:
1217 /// 1. Convert CmpInstr into an unconditional version.
1218 /// 2. Remove CmpInstr if above there is an instruction producing a needed
1219 /// condition code or an instruction which can be converted into such an
1220 /// instruction.
1221 /// Only comparison with zero is supported.
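// Sketch of the two cases handled below (illustrative, assembly-like
// notation):
//   1. "subs w8, w0, w1" whose NZCV def is dead becomes "sub w8, w0, w1".
//   2. For "add w8, w0, w1; ...; cmp w8, #0; b.eq ..." the defining ADD is
//      rewritten to ADDS and the now-redundant compare against zero is
//      erased (substituteCmpToZero), provided the flags are not clobbered or
//      consumed in between and only N/Z are read afterwards.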
1222 bool AArch64InstrInfo::optimizeCompareInstr(
1223 MachineInstr &CmpInstr, Register SrcReg, Register SrcReg2, int CmpMask,
1224 int CmpValue, const MachineRegisterInfo *MRI) const {
1225 assert(CmpInstr.getParent());
1226 assert(MRI);
1227
1228 // Replace SUBSWrr with SUBWrr if NZCV is not used.
1229 int DeadNZCVIdx = CmpInstr.findRegisterDefOperandIdx(AArch64::NZCV, true);
1230 if (DeadNZCVIdx != -1) {
1231 if (CmpInstr.definesRegister(AArch64::WZR) ||
1232 CmpInstr.definesRegister(AArch64::XZR)) {
1233 CmpInstr.eraseFromParent();
1234 return true;
1235 }
1236 unsigned Opc = CmpInstr.getOpcode();
1237 unsigned NewOpc = convertToNonFlagSettingOpc(CmpInstr);
1238 if (NewOpc == Opc)
1239 return false;
1240 const MCInstrDesc &MCID = get(NewOpc);
1241 CmpInstr.setDesc(MCID);
1242 CmpInstr.RemoveOperand(DeadNZCVIdx);
1243 bool succeeded = UpdateOperandRegClass(CmpInstr);
1244 (void)succeeded;
1245 assert(succeeded && "Some operands reg class are incompatible!");
1246 return true;
1247 }
1248
1249 // Continue only if we have a "ri" where immediate is zero.
1250 // FIXME: CmpValue has already been converted to 0 or 1 in analyzeCompare
1251 // function.
1252 assert((CmpValue == 0 || CmpValue == 1) && "CmpValue must be 0 or 1!");
1253 if (CmpValue != 0 || SrcReg2 != 0)
1254 return false;
1255
1256 // CmpInstr is a Compare instruction if destination register is not used.
1257 if (!MRI->use_nodbg_empty(CmpInstr.getOperand(0).getReg()))
1258 return false;
1259
1260 return substituteCmpToZero(CmpInstr, SrcReg, MRI);
1261 }
1262
1263 /// Get the opcode of the flag-setting (S) version of Instr.
1264 /// If Instr already is an S version, its own opcode is returned.
1265 /// AArch64::INSTRUCTION_LIST_END is returned if Instr does not have an S
1266 /// version or we are not interested in it.
1267 static unsigned sForm(MachineInstr &Instr) {
1268 switch (Instr.getOpcode()) {
1269 default:
1270 return AArch64::INSTRUCTION_LIST_END;
1271
1272 case AArch64::ADDSWrr:
1273 case AArch64::ADDSWri:
1274 case AArch64::ADDSXrr:
1275 case AArch64::ADDSXri:
1276 case AArch64::SUBSWrr:
1277 case AArch64::SUBSWri:
1278 case AArch64::SUBSXrr:
1279 case AArch64::SUBSXri:
1280 return Instr.getOpcode();
1281
1282 case AArch64::ADDWrr:
1283 return AArch64::ADDSWrr;
1284 case AArch64::ADDWri:
1285 return AArch64::ADDSWri;
1286 case AArch64::ADDXrr:
1287 return AArch64::ADDSXrr;
1288 case AArch64::ADDXri:
1289 return AArch64::ADDSXri;
1290 case AArch64::ADCWr:
1291 return AArch64::ADCSWr;
1292 case AArch64::ADCXr:
1293 return AArch64::ADCSXr;
1294 case AArch64::SUBWrr:
1295 return AArch64::SUBSWrr;
1296 case AArch64::SUBWri:
1297 return AArch64::SUBSWri;
1298 case AArch64::SUBXrr:
1299 return AArch64::SUBSXrr;
1300 case AArch64::SUBXri:
1301 return AArch64::SUBSXri;
1302 case AArch64::SBCWr:
1303 return AArch64::SBCSWr;
1304 case AArch64::SBCXr:
1305 return AArch64::SBCSXr;
1306 case AArch64::ANDWri:
1307 return AArch64::ANDSWri;
1308 case AArch64::ANDXri:
1309 return AArch64::ANDSXri;
1310 }
1311 }
1312
1313 /// Check if AArch64::NZCV should be alive in successors of MBB.
1314 static bool areCFlagsAliveInSuccessors(MachineBasicBlock *MBB) {
1315 for (auto *BB : MBB->successors())
1316 if (BB->isLiveIn(AArch64::NZCV))
1317 return true;
1318 return false;
1319 }
1320
1321 namespace {
1322
1323 struct UsedNZCV {
1324 bool N = false;
1325 bool Z = false;
1326 bool C = false;
1327 bool V = false;
1328
1329 UsedNZCV() = default;
1330
1331 UsedNZCV &operator|=(const UsedNZCV &UsedFlags) {
1332 this->N |= UsedFlags.N;
1333 this->Z |= UsedFlags.Z;
1334 this->C |= UsedFlags.C;
1335 this->V |= UsedFlags.V;
1336 return *this;
1337 }
1338 };
1339
1340 } // end anonymous namespace
1341
1342 /// Find a condition code used by the instruction.
1343 /// Returns AArch64CC::Invalid if either the instruction does not use condition
1344 /// codes or we don't optimize CmpInstr in the presence of such instructions.
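// Note (descriptive, not in the original comments): the "Idx - 2" / "Idx - 1"
// logic below relies on the operand layout of these instructions. Bcc is
// (<cond>, <target MBB>, implicit NZCV use), so the condition sits two slots
// before the NZCV operand; CSEL/CSINC/CSINV/CSNEG/FCSEL are
// (<dst>, <true>, <false>, <cond>, implicit NZCV use), so it sits one slot
// before it.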
1345 static AArch64CC::CondCode findCondCodeUsedByInstr(const MachineInstr &Instr) {
1346 switch (Instr.getOpcode()) {
1347 default:
1348 return AArch64CC::Invalid;
1349
1350 case AArch64::Bcc: {
1351 int Idx = Instr.findRegisterUseOperandIdx(AArch64::NZCV);
1352 assert(Idx >= 2);
1353 return static_cast<AArch64CC::CondCode>(Instr.getOperand(Idx - 2).getImm());
1354 }
1355
1356 case AArch64::CSINVWr:
1357 case AArch64::CSINVXr:
1358 case AArch64::CSINCWr:
1359 case AArch64::CSINCXr:
1360 case AArch64::CSELWr:
1361 case AArch64::CSELXr:
1362 case AArch64::CSNEGWr:
1363 case AArch64::CSNEGXr:
1364 case AArch64::FCSELSrrr:
1365 case AArch64::FCSELDrrr: {
1366 int Idx = Instr.findRegisterUseOperandIdx(AArch64::NZCV);
1367 assert(Idx >= 1);
1368 return static_cast<AArch64CC::CondCode>(Instr.getOperand(Idx - 1).getImm());
1369 }
1370 }
1371 }
1372
1373 static UsedNZCV getUsedNZCV(AArch64CC::CondCode CC) {
1374 assert(CC != AArch64CC::Invalid);
1375 UsedNZCV UsedFlags;
1376 switch (CC) {
1377 default:
1378 break;
1379
1380 case AArch64CC::EQ: // Z set
1381 case AArch64CC::NE: // Z clear
1382 UsedFlags.Z = true;
1383 break;
1384
1385 case AArch64CC::HI: // Z clear and C set
1386 case AArch64CC::LS: // Z set or C clear
1387 UsedFlags.Z = true;
1388 LLVM_FALLTHROUGH;
1389 case AArch64CC::HS: // C set
1390 case AArch64CC::LO: // C clear
1391 UsedFlags.C = true;
1392 break;
1393
1394 case AArch64CC::MI: // N set
1395 case AArch64CC::PL: // N clear
1396 UsedFlags.N = true;
1397 break;
1398
1399 case AArch64CC::VS: // V set
1400 case AArch64CC::VC: // V clear
1401 UsedFlags.V = true;
1402 break;
1403
1404 case AArch64CC::GT: // Z clear, N and V the same
1405 case AArch64CC::LE: // Z set, N and V differ
1406 UsedFlags.Z = true;
1407 LLVM_FALLTHROUGH;
1408 case AArch64CC::GE: // N and V the same
1409 case AArch64CC::LT: // N and V differ
1410 UsedFlags.N = true;
1411 UsedFlags.V = true;
1412 break;
1413 }
1414 return UsedFlags;
1415 }
1416
1417 static bool isADDSRegImm(unsigned Opcode) {
1418 return Opcode == AArch64::ADDSWri || Opcode == AArch64::ADDSXri;
1419 }
1420
1421 static bool isSUBSRegImm(unsigned Opcode) {
1422 return Opcode == AArch64::SUBSWri || Opcode == AArch64::SUBSXri;
1423 }
1424
1425 /// Check if CmpInstr can be substituted by MI.
1426 ///
1427 /// CmpInstr can be substituted:
1428 /// - CmpInstr is either 'ADDS %vreg, 0' or 'SUBS %vreg, 0'
1429 /// - and, MI and CmpInstr are from the same MachineBB
1430 /// - and, condition flags are not alive in successors of the CmpInstr parent
1431 /// - and, if MI opcode is the S form there must be no defs of flags between
1432 /// MI and CmpInstr
1433 /// or if MI opcode is not the S form there must be neither defs of flags
1434 /// nor uses of flags between MI and CmpInstr.
1435 /// - and C/V flags are not used after CmpInstr
1436 static bool canInstrSubstituteCmpInstr(MachineInstr *MI, MachineInstr *CmpInstr,
1437 const TargetRegisterInfo *TRI) {
1438 assert(MI);
1439 assert(sForm(*MI) != AArch64::INSTRUCTION_LIST_END);
1440 assert(CmpInstr);
1441
1442 const unsigned CmpOpcode = CmpInstr->getOpcode();
1443 if (!isADDSRegImm(CmpOpcode) && !isSUBSRegImm(CmpOpcode))
1444 return false;
1445
1446 if (MI->getParent() != CmpInstr->getParent())
1447 return false;
1448
1449 if (areCFlagsAliveInSuccessors(CmpInstr->getParent()))
1450 return false;
1451
1452 AccessKind AccessToCheck = AK_Write;
1453 if (sForm(*MI) != MI->getOpcode())
1454 AccessToCheck = AK_All;
1455 if (areCFlagsAccessedBetweenInstrs(MI, CmpInstr, TRI, AccessToCheck))
1456 return false;
1457
1458 UsedNZCV NZCVUsedAfterCmp;
1459 for (const MachineInstr &Instr :
1460 instructionsWithoutDebug(std::next(CmpInstr->getIterator()),
1461 CmpInstr->getParent()->instr_end())) {
1462 if (Instr.readsRegister(AArch64::NZCV, TRI)) {
1463 AArch64CC::CondCode CC = findCondCodeUsedByInstr(Instr);
1464 if (CC == AArch64CC::Invalid) // Unsupported conditional instruction
1465 return false;
1466 NZCVUsedAfterCmp |= getUsedNZCV(CC);
1467 }
1468
1469 if (Instr.modifiesRegister(AArch64::NZCV, TRI))
1470 break;
1471 }
1472
1473 return !NZCVUsedAfterCmp.C && !NZCVUsedAfterCmp.V;
1474 }
1475
1476 /// Substitute an instruction comparing to zero with another instruction
1477 /// which produces needed condition flags.
1478 ///
1479 /// Return true on success.
1480 bool AArch64InstrInfo::substituteCmpToZero(
1481 MachineInstr &CmpInstr, unsigned SrcReg,
1482 const MachineRegisterInfo *MRI) const {
1483 assert(MRI);
1484 // Get the unique definition of SrcReg.
1485 MachineInstr *MI = MRI->getUniqueVRegDef(SrcReg);
1486 if (!MI)
1487 return false;
1488
1489 const TargetRegisterInfo *TRI = &getRegisterInfo();
1490
1491 unsigned NewOpc = sForm(*MI);
1492 if (NewOpc == AArch64::INSTRUCTION_LIST_END)
1493 return false;
1494
1495 if (!canInstrSubstituteCmpInstr(MI, &CmpInstr, TRI))
1496 return false;
1497
1498 // Update the instruction to set NZCV.
1499 MI->setDesc(get(NewOpc));
1500 CmpInstr.eraseFromParent();
1501 bool succeeded = UpdateOperandRegClass(*MI);
1502 (void)succeeded;
1503 assert(succeeded && "Some operands reg class are incompatible!");
1504 MI->addRegisterDefined(AArch64::NZCV, TRI);
1505 return true;
1506 }
1507
1508 bool AArch64InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
1509 if (MI.getOpcode() != TargetOpcode::LOAD_STACK_GUARD &&
1510 MI.getOpcode() != AArch64::CATCHRET)
1511 return false;
1512
1513 MachineBasicBlock &MBB = *MI.getParent();
1514 auto &Subtarget = MBB.getParent()->getSubtarget<AArch64Subtarget>();
1515 auto TRI = Subtarget.getRegisterInfo();
1516 DebugLoc DL = MI.getDebugLoc();
1517
1518 if (MI.getOpcode() == AArch64::CATCHRET) {
1519 // Skip to the first instruction before the epilog.
1520 const TargetInstrInfo *TII =
1521 MBB.getParent()->getSubtarget().getInstrInfo();
1522 MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB();
1523 auto MBBI = MachineBasicBlock::iterator(MI);
1524 MachineBasicBlock::iterator FirstEpilogSEH = std::prev(MBBI);
1525 while (FirstEpilogSEH->getFlag(MachineInstr::FrameDestroy) &&
1526 FirstEpilogSEH != MBB.begin())
1527 FirstEpilogSEH = std::prev(FirstEpilogSEH);
1528 if (FirstEpilogSEH != MBB.begin())
1529 FirstEpilogSEH = std::next(FirstEpilogSEH);
1530 BuildMI(MBB, FirstEpilogSEH, DL, TII->get(AArch64::ADRP))
1531 .addReg(AArch64::X0, RegState::Define)
1532 .addMBB(TargetMBB);
1533 BuildMI(MBB, FirstEpilogSEH, DL, TII->get(AArch64::ADDXri))
1534 .addReg(AArch64::X0, RegState::Define)
1535 .addReg(AArch64::X0)
1536 .addMBB(TargetMBB)
1537 .addImm(0);
1538 return true;
1539 }
1540
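// The remainder expands LOAD_STACK_GUARD. Illustrative summary of the cases
// below: the guard global's address is materialized as a LOADgot pseudo plus
// a load for GOT-relative access, as a MOVZ/MOVK sequence plus a load for
// the large code model, as a bare ADR for the tiny code model, or as the
// usual
//   adrp xN, guard
//   ldr  xN, [xN, :lo12:guard]
// pair otherwise, with 32-bit (LDRWui) loads used on ILP32 targets.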
1541 Register Reg = MI.getOperand(0).getReg();
1542 const GlobalValue *GV =
1543 cast<GlobalValue>((*MI.memoperands_begin())->getValue());
1544 const TargetMachine &TM = MBB.getParent()->getTarget();
1545 unsigned OpFlags = Subtarget.ClassifyGlobalReference(GV, TM);
1546 const unsigned char MO_NC = AArch64II::MO_NC;
1547
1548 if ((OpFlags & AArch64II::MO_GOT) != 0) {
1549 BuildMI(MBB, MI, DL, get(AArch64::LOADgot), Reg)
1550 .addGlobalAddress(GV, 0, OpFlags);
1551 if (Subtarget.isTargetILP32()) {
1552 unsigned Reg32 = TRI->getSubReg(Reg, AArch64::sub_32);
1553 BuildMI(MBB, MI, DL, get(AArch64::LDRWui))
1554 .addDef(Reg32, RegState::Dead)
1555 .addUse(Reg, RegState::Kill)
1556 .addImm(0)
1557 .addMemOperand(*MI.memoperands_begin())
1558 .addDef(Reg, RegState::Implicit);
1559 } else {
1560 BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
1561 .addReg(Reg, RegState::Kill)
1562 .addImm(0)
1563 .addMemOperand(*MI.memoperands_begin());
1564 }
1565 } else if (TM.getCodeModel() == CodeModel::Large) {
1566 assert(!Subtarget.isTargetILP32() && "how can large exist in ILP32?");
1567 BuildMI(MBB, MI, DL, get(AArch64::MOVZXi), Reg)
1568 .addGlobalAddress(GV, 0, AArch64II::MO_G0 | MO_NC)
1569 .addImm(0);
1570 BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg)
1571 .addReg(Reg, RegState::Kill)
1572 .addGlobalAddress(GV, 0, AArch64II::MO_G1 | MO_NC)
1573 .addImm(16);
1574 BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg)
1575 .addReg(Reg, RegState::Kill)
1576 .addGlobalAddress(GV, 0, AArch64II::MO_G2 | MO_NC)
1577 .addImm(32);
1578 BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg)
1579 .addReg(Reg, RegState::Kill)
1580 .addGlobalAddress(GV, 0, AArch64II::MO_G3)
1581 .addImm(48);
1582 BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
1583 .addReg(Reg, RegState::Kill)
1584 .addImm(0)
1585 .addMemOperand(*MI.memoperands_begin());
1586 } else if (TM.getCodeModel() == CodeModel::Tiny) {
1587 BuildMI(MBB, MI, DL, get(AArch64::ADR), Reg)
1588 .addGlobalAddress(GV, 0, OpFlags);
1589 } else {
1590 BuildMI(MBB, MI, DL, get(AArch64::ADRP), Reg)
1591 .addGlobalAddress(GV, 0, OpFlags | AArch64II::MO_PAGE);
1592 unsigned char LoFlags = OpFlags | AArch64II::MO_PAGEOFF | MO_NC;
1593 if (Subtarget.isTargetILP32()) {
1594 unsigned Reg32 = TRI->getSubReg(Reg, AArch64::sub_32);
1595 BuildMI(MBB, MI, DL, get(AArch64::LDRWui))
1596 .addDef(Reg32, RegState::Dead)
1597 .addUse(Reg, RegState::Kill)
1598 .addGlobalAddress(GV, 0, LoFlags)
1599 .addMemOperand(*MI.memoperands_begin())
1600 .addDef(Reg, RegState::Implicit);
1601 } else {
1602 BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
1603 .addReg(Reg, RegState::Kill)
1604 .addGlobalAddress(GV, 0, LoFlags)
1605 .addMemOperand(*MI.memoperands_begin());
1606 }
1607 }
1608
1609 MBB.erase(MI);
1610
1611 return true;
1612 }
1613
1614 // Return true if this instruction simply sets its single destination register
1615 // to zero. This is equivalent to a register rename of the zero-register.
1616 bool AArch64InstrInfo::isGPRZero(const MachineInstr &MI) {
1617 switch (MI.getOpcode()) {
1618 default:
1619 break;
1620 case AArch64::MOVZWi:
1621 case AArch64::MOVZXi: // movz Rd, #0 (LSL #0)
1622 if (MI.getOperand(1).isImm() && MI.getOperand(1).getImm() == 0) {
1623 assert(MI.getDesc().getNumOperands() == 3 &&
1624 MI.getOperand(2).getImm() == 0 && "invalid MOVZi operands");
1625 return true;
1626 }
1627 break;
1628 case AArch64::ANDWri: // and Rd, Rzr, #imm
1629 return MI.getOperand(1).getReg() == AArch64::WZR;
1630 case AArch64::ANDXri:
1631 return MI.getOperand(1).getReg() == AArch64::XZR;
1632 case TargetOpcode::COPY:
1633 return MI.getOperand(1).getReg() == AArch64::WZR;
1634 }
1635 return false;
1636 }
1637
1638 // Return true if this instruction simply renames a general register without
1639 // modifying bits.
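// Illustrative examples (a sketch): "orr x0, xzr, x1" (ORRXrs with an XZR
// source and LSL #0) and "add x0, x1, #0" (ADDXri with a zero immediate and
// LSL #0) both qualify, as do plain COPYs of GPR32/GPR64 registers.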
1640 bool AArch64InstrInfo::isGPRCopy(const MachineInstr &MI) {
1641 switch (MI.getOpcode()) {
1642 default:
1643 break;
1644 case TargetOpcode::COPY: {
1645 // GPR32 copies will be lowered to ORRXrs
1646 Register DstReg = MI.getOperand(0).getReg();
1647 return (AArch64::GPR32RegClass.contains(DstReg) ||
1648 AArch64::GPR64RegClass.contains(DstReg));
1649 }
1650 case AArch64::ORRXrs: // orr Xd, Xzr, Xm (LSL #0)
1651 if (MI.getOperand(1).getReg() == AArch64::XZR) {
1652 assert(MI.getDesc().getNumOperands() == 4 &&
1653 MI.getOperand(3).getImm() == 0 && "invalid ORRrs operands");
1654 return true;
1655 }
1656 break;
1657 case AArch64::ADDXri: // add Xd, Xn, #0 (LSL #0)
1658 if (MI.getOperand(2).getImm() == 0) {
1659 assert(MI.getDesc().getNumOperands() == 4 &&
1660 MI.getOperand(3).getImm() == 0 && "invalid ADDXri operands");
1661 return true;
1662 }
1663 break;
1664 }
1665 return false;
1666 }
1667
1668 // Return true if this instruction simply renames an FP register without
1669 // modifying bits.
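// Illustrative example (a sketch): "orr v0.16b, v1.16b, v1.16b" (ORRv16i8
// with identical source operands) qualifies, as do plain COPYs of
// FPR64/FPR128 registers.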
1670 bool AArch64InstrInfo::isFPRCopy(const MachineInstr &MI) {
1671 switch (MI.getOpcode()) {
1672 default:
1673 break;
1674 case TargetOpcode::COPY: {
1675 // FPR64 copies will be lowered to ORR.16b
1676 Register DstReg = MI.getOperand(0).getReg();
1677 return (AArch64::FPR64RegClass.contains(DstReg) ||
1678 AArch64::FPR128RegClass.contains(DstReg));
1679 }
1680 case AArch64::ORRv16i8:
1681 if (MI.getOperand(1).getReg() == MI.getOperand(2).getReg()) {
1682 assert(MI.getDesc().getNumOperands() == 3 && MI.getOperand(0).isReg() &&
1683 "invalid ORRv16i8 operands");
1684 return true;
1685 }
1686 break;
1687 }
1688 return false;
1689 }
1690
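// If MI is a plain load of a full register from a stack slot (a frame-index
// base with a zero immediate offset), return the loaded register and set
// FrameIndex; otherwise return 0. A minimal sketch of a matching MIR
// instruction: "$x0 = LDRXui %stack.0, 0".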
1691 unsigned AArch64InstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
1692 int &FrameIndex) const {
1693 switch (MI.getOpcode()) {
1694 default:
1695 break;
1696 case AArch64::LDRWui:
1697 case AArch64::LDRXui:
1698 case AArch64::LDRBui:
1699 case AArch64::LDRHui:
1700 case AArch64::LDRSui:
1701 case AArch64::LDRDui:
1702 case AArch64::LDRQui:
1703 if (MI.getOperand(0).getSubReg() == 0 && MI.getOperand(1).isFI() &&
1704 MI.getOperand(2).isImm() && MI.getOperand(2).getImm() == 0) {
1705 FrameIndex = MI.getOperand(1).getIndex();
1706 return MI.getOperand(0).getReg();
1707 }
1708 break;
1709 }
1710
1711 return 0;
1712 }
1713
1714 unsigned AArch64InstrInfo::isStoreToStackSlot(const MachineInstr &MI,
1715 int &FrameIndex) const {
1716 switch (MI.getOpcode()) {
1717 default:
1718 break;
1719 case AArch64::STRWui:
1720 case AArch64::STRXui:
1721 case AArch64::STRBui:
1722 case AArch64::STRHui:
1723 case AArch64::STRSui:
1724 case AArch64::STRDui:
1725 case AArch64::STRQui:
1726 case AArch64::LDR_PXI:
1727 case AArch64::STR_PXI:
1728 if (MI.getOperand(0).getSubReg() == 0 && MI.getOperand(1).isFI() &&
1729 MI.getOperand(2).isImm() && MI.getOperand(2).getImm() == 0) {
1730 FrameIndex = MI.getOperand(1).getIndex();
1731 return MI.getOperand(0).getReg();
1732 }
1733 break;
1734 }
1735 return 0;
1736 }
1737
1738 /// Check all MachineMemOperands for a hint to suppress pairing.
1739 bool AArch64InstrInfo::isLdStPairSuppressed(const MachineInstr &MI) {
1740 return llvm::any_of(MI.memoperands(), [](MachineMemOperand *MMO) {
1741 return MMO->getFlags() & MOSuppressPair;
1742 });
1743 }
1744
1745 /// Set a flag on the first MachineMemOperand to suppress pairing.
1746 void AArch64InstrInfo::suppressLdStPair(MachineInstr &MI) {
1747 if (MI.memoperands_empty())
1748 return;
1749 (*MI.memoperands_begin())->setFlags(MOSuppressPair);
1750 }
1751
1752 /// Check all MachineMemOperands for a hint that the load/store is strided.
1753 bool AArch64InstrInfo::isStridedAccess(const MachineInstr &MI) {
1754 return llvm::any_of(MI.memoperands(), [](MachineMemOperand *MMO) {
1755 return MMO->getFlags() & MOStridedAccess;
1756 });
1757 }
1758
1759 bool AArch64InstrInfo::isUnscaledLdSt(unsigned Opc) {
1760 switch (Opc) {
1761 default:
1762 return false;
1763 case AArch64::STURSi:
1764 case AArch64::STURDi:
1765 case AArch64::STURQi:
1766 case AArch64::STURBBi:
1767 case AArch64::STURHHi:
1768 case AArch64::STURWi:
1769 case AArch64::STURXi:
1770 case AArch64::LDURSi:
1771 case AArch64::LDURDi:
1772 case AArch64::LDURQi:
1773 case AArch64::LDURWi:
1774 case AArch64::LDURXi:
1775 case AArch64::LDURSWi:
1776 case AArch64::LDURHHi:
1777 case AArch64::LDURBBi:
1778 case AArch64::LDURSBWi:
1779 case AArch64::LDURSHWi:
1780 return true;
1781 }
1782 }
1783
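// Map a scaled load/store opcode to its unscaled (LDUR*/STUR*) equivalent,
// if one exists. For example, LDRXui maps to LDURXi; opcodes with no
// unscaled form return None.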
1784 Optional<unsigned> AArch64InstrInfo::getUnscaledLdSt(unsigned Opc) {
1785 switch (Opc) {
1786 default: return {};
1787 case AArch64::PRFMui: return AArch64::PRFUMi;
1788 case AArch64::LDRXui: return AArch64::LDURXi;
1789 case AArch64::LDRWui: return AArch64::LDURWi;
1790 case AArch64::LDRBui: return AArch64::LDURBi;
1791 case AArch64::LDRHui: return AArch64::LDURHi;
1792 case AArch64::LDRSui: return AArch64::LDURSi;
1793 case AArch64::LDRDui: return AArch64::LDURDi;
1794 case AArch64::LDRQui: return AArch64::LDURQi;
1795 case AArch64::LDRBBui: return AArch64::LDURBBi;
1796 case AArch64::LDRHHui: return AArch64::LDURHHi;
1797 case AArch64::LDRSBXui: return AArch64::LDURSBXi;
1798 case AArch64::LDRSBWui: return AArch64::LDURSBWi;
1799 case AArch64::LDRSHXui: return AArch64::LDURSHXi;
1800 case AArch64::LDRSHWui: return AArch64::LDURSHWi;
1801 case AArch64::LDRSWui: return AArch64::LDURSWi;
1802 case AArch64::STRXui: return AArch64::STURXi;
1803 case AArch64::STRWui: return AArch64::STURWi;
1804 case AArch64::STRBui: return AArch64::STURBi;
1805 case AArch64::STRHui: return AArch64::STURHi;
1806 case AArch64::STRSui: return AArch64::STURSi;
1807 case AArch64::STRDui: return AArch64::STURDi;
1808 case AArch64::STRQui: return AArch64::STURQi;
1809 case AArch64::STRBBui: return AArch64::STURBBi;
1810 case AArch64::STRHHui: return AArch64::STURHHi;
1811 }
1812 }
1813
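// Return the operand index of the immediate offset for a load/store opcode:
// the paired and SVE structured forms listed below use index 3, while
// everything else, including ADDG and the SVE predicate fill/spill opcodes,
// uses index 2.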
1814 unsigned AArch64InstrInfo::getLoadStoreImmIdx(unsigned Opc) {
1815 switch (Opc) {
1816 default:
1817 return 2;
1818 case AArch64::LDPXi:
1819 case AArch64::LDPDi:
1820 case AArch64::STPXi:
1821 case AArch64::STPDi:
1822 case AArch64::LDNPXi:
1823 case AArch64::LDNPDi:
1824 case AArch64::STNPXi:
1825 case AArch64::STNPDi:
1826 case AArch64::LDPQi:
1827 case AArch64::STPQi:
1828 case AArch64::LDNPQi:
1829 case AArch64::STNPQi:
1830 case AArch64::LDPWi:
1831 case AArch64::LDPSi:
1832 case AArch64::STPWi:
1833 case AArch64::STPSi:
1834 case AArch64::LDNPWi:
1835 case AArch64::LDNPSi:
1836 case AArch64::STNPWi:
1837 case AArch64::STNPSi:
1838 case AArch64::LDG:
1839 case AArch64::STGPi:
1840 case AArch64::LD1B_IMM:
1841 case AArch64::LD1H_IMM:
1842 case AArch64::LD1W_IMM:
1843 case AArch64::LD1D_IMM:
1844 case AArch64::ST1B_IMM:
1845 case AArch64::ST1H_IMM:
1846 case AArch64::ST1W_IMM:
1847 case AArch64::ST1D_IMM:
1848 case AArch64::LD1B_H_IMM:
1849 case AArch64::LD1SB_H_IMM:
1850 case AArch64::LD1H_S_IMM:
1851 case AArch64::LD1SH_S_IMM:
1852 case AArch64::LD1W_D_IMM:
1853 case AArch64::LD1SW_D_IMM:
1854 case AArch64::ST1B_H_IMM:
1855 case AArch64::ST1H_S_IMM:
1856 case AArch64::ST1W_D_IMM:
1857 case AArch64::LD1B_S_IMM:
1858 case AArch64::LD1SB_S_IMM:
1859 case AArch64::LD1H_D_IMM:
1860 case AArch64::LD1SH_D_IMM:
1861 case AArch64::ST1B_S_IMM:
1862 case AArch64::ST1H_D_IMM:
1863 case AArch64::LD1B_D_IMM:
1864 case AArch64::LD1SB_D_IMM:
1865 case AArch64::ST1B_D_IMM:
1866 return 3;
1867 case AArch64::ADDG:
1868 case AArch64::STGOffset:
1869 case AArch64::LDR_PXI:
1870 case AArch64::STR_PXI:
1871 return 2;
1872 }
1873 }
1874
1875 bool AArch64InstrInfo::isPairableLdStInst(const MachineInstr &MI) {
1876 switch (MI.getOpcode()) {
1877 default:
1878 return false;
1879 // Scaled instructions.
1880 case AArch64::STRSui:
1881 case AArch64::STRDui:
1882 case AArch64::STRQui:
1883 case AArch64::STRXui:
1884 case AArch64::STRWui:
1885 case AArch64::LDRSui:
1886 case AArch64::LDRDui:
1887 case AArch64::LDRQui:
1888 case AArch64::LDRXui:
1889 case AArch64::LDRWui:
1890 case AArch64::LDRSWui:
1891 // Unscaled instructions.
1892 case AArch64::STURSi:
1893 case AArch64::STURDi:
1894 case AArch64::STURQi:
1895 case AArch64::STURWi:
1896 case AArch64::STURXi:
1897 case AArch64::LDURSi:
1898 case AArch64::LDURDi:
1899 case AArch64::LDURQi:
1900 case AArch64::LDURWi:
1901 case AArch64::LDURXi:
1902 case AArch64::LDURSWi:
1903 return true;
1904 }
1905 }
1906
1907 unsigned AArch64InstrInfo::convertToFlagSettingOpc(unsigned Opc,
1908 bool &Is64Bit) {
1909 switch (Opc) {
1910 default:
1911 llvm_unreachable("Opcode has no flag setting equivalent!");
1912 // 32-bit cases:
1913 case AArch64::ADDWri:
1914 Is64Bit = false;
1915 return AArch64::ADDSWri;
1916 case AArch64::ADDWrr:
1917 Is64Bit = false;
1918 return AArch64::ADDSWrr;
1919 case AArch64::ADDWrs:
1920 Is64Bit = false;
1921 return AArch64::ADDSWrs;
1922 case AArch64::ADDWrx:
1923 Is64Bit = false;
1924 return AArch64::ADDSWrx;
1925 case AArch64::ANDWri:
1926 Is64Bit = false;
1927 return AArch64::ANDSWri;
1928 case AArch64::ANDWrr:
1929 Is64Bit = false;
1930 return AArch64::ANDSWrr;
1931 case AArch64::ANDWrs:
1932 Is64Bit = false;
1933 return AArch64::ANDSWrs;
1934 case AArch64::BICWrr:
1935 Is64Bit = false;
1936 return AArch64::BICSWrr;
1937 case AArch64::BICWrs:
1938 Is64Bit = false;
1939 return AArch64::BICSWrs;
1940 case AArch64::SUBWri:
1941 Is64Bit = false;
1942 return AArch64::SUBSWri;
1943 case AArch64::SUBWrr:
1944 Is64Bit = false;
1945 return AArch64::SUBSWrr;
1946 case AArch64::SUBWrs:
1947 Is64Bit = false;
1948 return AArch64::SUBSWrs;
1949 case AArch64::SUBWrx:
1950 Is64Bit = false;
1951 return AArch64::SUBSWrx;
1952 // 64-bit cases:
1953 case AArch64::ADDXri:
1954 Is64Bit = true;
1955 return AArch64::ADDSXri;
1956 case AArch64::ADDXrr:
1957 Is64Bit = true;
1958 return AArch64::ADDSXrr;
1959 case AArch64::ADDXrs:
1960 Is64Bit = true;
1961 return AArch64::ADDSXrs;
1962 case AArch64::ADDXrx:
1963 Is64Bit = true;
1964 return AArch64::ADDSXrx;
1965 case AArch64::ANDXri:
1966 Is64Bit = true;
1967 return AArch64::ANDSXri;
1968 case AArch64::ANDXrr:
1969 Is64Bit = true;
1970 return AArch64::ANDSXrr;
1971 case AArch64::ANDXrs:
1972 Is64Bit = true;
1973 return AArch64::ANDSXrs;
1974 case AArch64::BICXrr:
1975 Is64Bit = true;
1976 return AArch64::BICSXrr;
1977 case AArch64::BICXrs:
1978 Is64Bit = true;
1979 return AArch64::BICSXrs;
1980 case AArch64::SUBXri:
1981 Is64Bit = true;
1982 return AArch64::SUBSXri;
1983 case AArch64::SUBXrr:
1984 Is64Bit = true;
1985 return AArch64::SUBSXrr;
1986 case AArch64::SUBXrs:
1987 Is64Bit = true;
1988 return AArch64::SUBSXrs;
1989 case AArch64::SUBXrx:
1990 Is64Bit = true;
1991 return AArch64::SUBSXrx;
1992 }
1993 }
1994
1995 // Is this a candidate for ld/st merging or pairing? For example, we don't
1996 // touch volatiles or load/stores that have a hint to avoid pair formation.
1997 bool AArch64InstrInfo::isCandidateToMergeOrPair(const MachineInstr &MI) const {
1998 // If this is a volatile load/store, don't mess with it.
1999 if (MI.hasOrderedMemoryRef())
2000 return false;
2001
2002 // Make sure this is a reg/fi+imm (as opposed to an address reloc).
2003 assert((MI.getOperand(1).isReg() || MI.getOperand(1).isFI()) &&
2004 "Expected a reg or frame index operand.");
2005 if (!MI.getOperand(2).isImm())
2006 return false;
2007
2008 // Can't merge/pair if the instruction modifies the base register.
2009 // e.g., ldr x0, [x0]
2010 // This case will never occur with an FI base.
2011 if (MI.getOperand(1).isReg()) {
2012 Register BaseReg = MI.getOperand(1).getReg();
2013 const TargetRegisterInfo *TRI = &getRegisterInfo();
2014 if (MI.modifiesRegister(BaseReg, TRI))
2015 return false;
2016 }
2017
2018 // Check if this load/store has a hint to avoid pair formation.
2019 // MachineMemOperands hints are set by the AArch64StorePairSuppress pass.
2020 if (isLdStPairSuppressed(MI))
2021 return false;
2022
2023 // Do not pair any callee-save store/reload instructions in the
2024 // prologue/epilogue if the CFI information encoded the operations as separate
2025 // instructions, as that will cause the size of the actual prologue to mismatch
2026 // with the prologue size recorded in the Windows CFI.
2027 const MCAsmInfo *MAI = MI.getMF()->getTarget().getMCAsmInfo();
2028 bool NeedsWinCFI = MAI->usesWindowsCFI() &&
2029 MI.getMF()->getFunction().needsUnwindTableEntry();
2030 if (NeedsWinCFI && (MI.getFlag(MachineInstr::FrameSetup) ||
2031 MI.getFlag(MachineInstr::FrameDestroy)))
2032 return false;
2033
2034 // On some CPUs quad load/store pairs are slower than two single load/stores.
2035 if (Subtarget.isPaired128Slow()) {
2036 switch (MI.getOpcode()) {
2037 default:
2038 break;
2039 case AArch64::LDURQi:
2040 case AArch64::STURQi:
2041 case AArch64::LDRQui:
2042 case AArch64::STRQui:
2043 return false;
2044 }
2045 }
2046
2047 return true;
2048 }
2049
2050 bool AArch64InstrInfo::getMemOperandsWithOffsetWidth(
2051 const MachineInstr &LdSt, SmallVectorImpl<const MachineOperand *> &BaseOps,
2052 int64_t &Offset, bool &OffsetIsScalable, unsigned &Width,
2053 const TargetRegisterInfo *TRI) const {
2054 if (!LdSt.mayLoadOrStore())
2055 return false;
2056
2057 const MachineOperand *BaseOp;
2058 if (!getMemOperandWithOffsetWidth(LdSt, BaseOp, Offset, OffsetIsScalable,
2059 Width, TRI))
2060 return false;
2061 BaseOps.push_back(BaseOp);
2062 return true;
2063 }
2064
2065 bool AArch64InstrInfo::getMemOperandWithOffsetWidth(
2066 const MachineInstr &LdSt, const MachineOperand *&BaseOp, int64_t &Offset,
2067 bool &OffsetIsScalable, unsigned &Width,
2068 const TargetRegisterInfo *TRI) const {
2069 assert(LdSt.mayLoadOrStore() && "Expected a memory operation.");
2070 // Handle only loads/stores with base register followed by immediate offset.
2071 if (LdSt.getNumExplicitOperands() == 3) {
2072 // Non-paired instruction (e.g., ldr x1, [x0, #8]).
2073 if ((!LdSt.getOperand(1).isReg() && !LdSt.getOperand(1).isFI()) ||
2074 !LdSt.getOperand(2).isImm())
2075 return false;
2076 } else if (LdSt.getNumExplicitOperands() == 4) {
2077 // Paired instruction (e.g., ldp x1, x2, [x0, #8]).
2078 if (!LdSt.getOperand(1).isReg() ||
2079 (!LdSt.getOperand(2).isReg() && !LdSt.getOperand(2).isFI()) ||
2080 !LdSt.getOperand(3).isImm())
2081 return false;
2082 } else
2083 return false;
2084
2085 // Get the scaling factor for the instruction and set the width for the
2086 // instruction.
2087 TypeSize Scale(0U, false);
2088 int64_t Dummy1, Dummy2;
2089
2090 // If this returns false, then it's an instruction we don't want to handle.
2091 if (!getMemOpInfo(LdSt.getOpcode(), Scale, Width, Dummy1, Dummy2))
2092 return false;
2093
2094 // Compute the offset. Offset is calculated as the immediate operand
2095 // multiplied by the scaling factor. Unscaled instructions have scaling factor
2096 // set to 1.
2097 if (LdSt.getNumExplicitOperands() == 3) {
2098 BaseOp = &LdSt.getOperand(1);
2099 Offset = LdSt.getOperand(2).getImm() * Scale.getKnownMinSize();
2100 } else {
2101 assert(LdSt.getNumExplicitOperands() == 4 && "invalid number of operands");
2102 BaseOp = &LdSt.getOperand(2);
2103 Offset = LdSt.getOperand(3).getImm() * Scale.getKnownMinSize();
2104 }
2105 OffsetIsScalable = Scale.isScalable();
2106
2107 if (!BaseOp->isReg() && !BaseOp->isFI())
2108 return false;
2109
2110 return true;
2111 }
2112
2113 MachineOperand &
2114 AArch64InstrInfo::getMemOpBaseRegImmOfsOffsetOperand(MachineInstr &LdSt) const {
2115 assert(LdSt.mayLoadOrStore() && "Expected a memory operation.");
2116 MachineOperand &OfsOp = LdSt.getOperand(LdSt.getNumExplicitOperands() - 1);
2117 assert(OfsOp.isImm() && "Offset operand wasn't immediate.");
2118 return OfsOp;
2119 }
2120
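// Populate Scale, Width, MinOffset and MaxOffset for a load/store opcode.
// As a worked example from the table below: LDRXui has Scale = 8 and an
// immediate range of [0, 4095], so it addresses byte offsets 0..32760 in
// steps of 8, while the unscaled LDURXi has Scale = 1 and a signed immediate
// range of [-256, 255].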
2121 bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, TypeSize &Scale,
2122 unsigned &Width, int64_t &MinOffset,
2123 int64_t &MaxOffset) {
2124 const unsigned SVEMaxBytesPerVector = AArch64::SVEMaxBitsPerVector / 8;
2125 switch (Opcode) {
2126 // Not a memory operation or something we want to handle.
2127 default:
2128 Scale = TypeSize::Fixed(0);
2129 Width = 0;
2130 MinOffset = MaxOffset = 0;
2131 return false;
2132 case AArch64::STRWpost:
2133 case AArch64::LDRWpost:
2134 Width = 32;
2135 Scale = TypeSize::Fixed(4);
2136 MinOffset = -256;
2137 MaxOffset = 255;
2138 break;
2139 case AArch64::LDURQi:
2140 case AArch64::STURQi:
2141 Width = 16;
2142 Scale = TypeSize::Fixed(1);
2143 MinOffset = -256;
2144 MaxOffset = 255;
2145 break;
2146 case AArch64::PRFUMi:
2147 case AArch64::LDURXi:
2148 case AArch64::LDURDi:
2149 case AArch64::STURXi:
2150 case AArch64::STURDi:
2151 Width = 8;
2152 Scale = TypeSize::Fixed(1);
2153 MinOffset = -256;
2154 MaxOffset = 255;
2155 break;
2156 case AArch64::LDURWi:
2157 case AArch64::LDURSi:
2158 case AArch64::LDURSWi:
2159 case AArch64::STURWi:
2160 case AArch64::STURSi:
2161 Width = 4;
2162 Scale = TypeSize::Fixed(1);
2163 MinOffset = -256;
2164 MaxOffset = 255;
2165 break;
2166 case AArch64::LDURHi:
2167 case AArch64::LDURHHi:
2168 case AArch64::LDURSHXi:
2169 case AArch64::LDURSHWi:
2170 case AArch64::STURHi:
2171 case AArch64::STURHHi:
2172 Width = 2;
2173 Scale = TypeSize::Fixed(1);
2174 MinOffset = -256;
2175 MaxOffset = 255;
2176 break;
2177 case AArch64::LDURBi:
2178 case AArch64::LDURBBi:
2179 case AArch64::LDURSBXi:
2180 case AArch64::LDURSBWi:
2181 case AArch64::STURBi:
2182 case AArch64::STURBBi:
2183 Width = 1;
2184 Scale = TypeSize::Fixed(1);
2185 MinOffset = -256;
2186 MaxOffset = 255;
2187 break;
2188 case AArch64::LDPQi:
2189 case AArch64::LDNPQi:
2190 case AArch64::STPQi:
2191 case AArch64::STNPQi:
2192 Scale = TypeSize::Fixed(16);
2193 Width = 32;
2194 MinOffset = -64;
2195 MaxOffset = 63;
2196 break;
2197 case AArch64::LDRQui:
2198 case AArch64::STRQui:
2199 Scale = TypeSize::Fixed(16);
2200 Width = 16;
2201 MinOffset = 0;
2202 MaxOffset = 4095;
2203 break;
2204 case AArch64::LDPXi:
2205 case AArch64::LDPDi:
2206 case AArch64::LDNPXi:
2207 case AArch64::LDNPDi:
2208 case AArch64::STPXi:
2209 case AArch64::STPDi:
2210 case AArch64::STNPXi:
2211 case AArch64::STNPDi:
2212 Scale = TypeSize::Fixed(8);
2213 Width = 16;
2214 MinOffset = -64;
2215 MaxOffset = 63;
2216 break;
2217 case AArch64::PRFMui:
2218 case AArch64::LDRXui:
2219 case AArch64::LDRDui:
2220 case AArch64::STRXui:
2221 case AArch64::STRDui:
2222 Scale = TypeSize::Fixed(8);
2223 Width = 8;
2224 MinOffset = 0;
2225 MaxOffset = 4095;
2226 break;
2227 case AArch64::LDPWi:
2228 case AArch64::LDPSi:
2229 case AArch64::LDNPWi:
2230 case AArch64::LDNPSi:
2231 case AArch64::STPWi:
2232 case AArch64::STPSi:
2233 case AArch64::STNPWi:
2234 case AArch64::STNPSi:
2235 Scale = TypeSize::Fixed(4);
2236 Width = 8;
2237 MinOffset = -64;
2238 MaxOffset = 63;
2239 break;
2240 case AArch64::LDRWui:
2241 case AArch64::LDRSui:
2242 case AArch64::LDRSWui:
2243 case AArch64::STRWui:
2244 case AArch64::STRSui:
2245 Scale = TypeSize::Fixed(4);
2246 Width = 4;
2247 MinOffset = 0;
2248 MaxOffset = 4095;
2249 break;
2250 case AArch64::LDRHui:
2251 case AArch64::LDRHHui:
2252 case AArch64::LDRSHWui:
2253 case AArch64::LDRSHXui:
2254 case AArch64::STRHui:
2255 case AArch64::STRHHui:
2256 Scale = TypeSize::Fixed(2);
2257 Width = 2;
2258 MinOffset = 0;
2259 MaxOffset = 4095;
2260 break;
2261 case AArch64::LDRBui:
2262 case AArch64::LDRBBui:
2263 case AArch64::LDRSBWui:
2264 case AArch64::LDRSBXui:
2265 case AArch64::STRBui:
2266 case AArch64::STRBBui:
2267 Scale = TypeSize::Fixed(1);
2268 Width = 1;
2269 MinOffset = 0;
2270 MaxOffset = 4095;
2271 break;
2272 case AArch64::ADDG:
2273 Scale = TypeSize::Fixed(16);
2274 Width = 0;
2275 MinOffset = 0;
2276 MaxOffset = 63;
2277 break;
2278 case AArch64::TAGPstack:
2279 Scale = TypeSize::Fixed(16);
2280 Width = 0;
2281 // TAGP with a negative offset turns into SUBP, which has a maximum offset
2282 // of 63 (not 64!).
2283 MinOffset = -63;
2284 MaxOffset = 63;
2285 break;
2286 case AArch64::LDG:
2287 case AArch64::STGOffset:
2288 case AArch64::STZGOffset:
2289 Scale = TypeSize::Fixed(16);
2290 Width = 16;
2291 MinOffset = -256;
2292 MaxOffset = 255;
2293 break;
2294 case AArch64::STR_ZZZZXI:
2295 case AArch64::LDR_ZZZZXI:
2296 Scale = TypeSize::Scalable(16);
2297 Width = SVEMaxBytesPerVector * 4;
2298 MinOffset = -256;
2299 MaxOffset = 252;
2300 break;
2301 case AArch64::STR_ZZZXI:
2302 case AArch64::LDR_ZZZXI:
2303 Scale = TypeSize::Scalable(16);
2304 Width = SVEMaxBytesPerVector * 3;
2305 MinOffset = -256;
2306 MaxOffset = 253;
2307 break;
2308 case AArch64::STR_ZZXI:
2309 case AArch64::LDR_ZZXI:
2310 Scale = TypeSize::Scalable(16);
2311 Width = SVEMaxBytesPerVector * 2;
2312 MinOffset = -256;
2313 MaxOffset = 254;
2314 break;
2315 case AArch64::LDR_PXI:
2316 case AArch64::STR_PXI:
2317 Scale = TypeSize::Scalable(2);
2318 Width = SVEMaxBytesPerVector / 8;
2319 MinOffset = -256;
2320 MaxOffset = 255;
2321 break;
2322 case AArch64::LDR_ZXI:
2323 case AArch64::STR_ZXI:
2324 Scale = TypeSize::Scalable(16);
2325 Width = SVEMaxBytesPerVector;
2326 MinOffset = -256;
2327 MaxOffset = 255;
2328 break;
2329 case AArch64::LD1B_IMM:
2330 case AArch64::LD1H_IMM:
2331 case AArch64::LD1W_IMM:
2332 case AArch64::LD1D_IMM:
2333 case AArch64::ST1B_IMM:
2334 case AArch64::ST1H_IMM:
2335 case AArch64::ST1W_IMM:
2336 case AArch64::ST1D_IMM:
2337 // A full vector's worth of data
2338 // Width = mbytes * elements
2339 Scale = TypeSize::Scalable(16);
2340 Width = SVEMaxBytesPerVector;
2341 MinOffset = -8;
2342 MaxOffset = 7;
2343 break;
2344 case AArch64::LD1B_H_IMM:
2345 case AArch64::LD1SB_H_IMM:
2346 case AArch64::LD1H_S_IMM:
2347 case AArch64::LD1SH_S_IMM:
2348 case AArch64::LD1W_D_IMM:
2349 case AArch64::LD1SW_D_IMM:
2350 case AArch64::ST1B_H_IMM:
2351 case AArch64::ST1H_S_IMM:
2352 case AArch64::ST1W_D_IMM:
2353 // A half vector's worth of data
2354 // Width = mbytes * elements
2355 Scale = TypeSize::Scalable(8);
2356 Width = SVEMaxBytesPerVector / 2;
2357 MinOffset = -8;
2358 MaxOffset = 7;
2359 break;
2360 case AArch64::LD1B_S_IMM:
2361 case AArch64::LD1SB_S_IMM:
2362 case AArch64::LD1H_D_IMM:
2363 case AArch64::LD1SH_D_IMM:
2364 case AArch64::ST1B_S_IMM:
2365 case AArch64::ST1H_D_IMM:
2366 // A quarter vector's worth of data
2367 // Width = mbytes * elements
2368 Scale = TypeSize::Scalable(4);
2369 Width = SVEMaxBytesPerVector / 4;
2370 MinOffset = -8;
2371 MaxOffset = 7;
2372 break;
2373 case AArch64::LD1B_D_IMM:
2374 case AArch64::LD1SB_D_IMM:
2375 case AArch64::ST1B_D_IMM:
2376 // An eighth vector's worth of data
2377 // Width = mbytes * elements
2378 Scale = TypeSize::Scalable(2);
2379 Width = SVEMaxBytesPerVector / 8;
2380 MinOffset = -8;
2381 MaxOffset = 7;
2382 break;
2383 case AArch64::ST2GOffset:
2384 case AArch64::STZ2GOffset:
2385 Scale = TypeSize::Fixed(16);
2386 Width = 32;
2387 MinOffset = -256;
2388 MaxOffset = 255;
2389 break;
2390 case AArch64::STGPi:
2391 Scale = TypeSize::Fixed(16);
2392 Width = 16;
2393 MinOffset = -64;
2394 MaxOffset = 63;
2395 break;
2396 }
2397
2398 return true;
2399 }
2400
2401 // Scaling factor for (scaled or unscaled) load or store.
2402 int AArch64InstrInfo::getMemScale(unsigned Opc) {
2403 switch (Opc) {
2404 default:
2405 llvm_unreachable("Opcode has unknown scale!");
2406 case AArch64::LDRBBui:
2407 case AArch64::LDURBBi:
2408 case AArch64::LDRSBWui:
2409 case AArch64::LDURSBWi:
2410 case AArch64::STRBBui:
2411 case AArch64::STURBBi:
2412 return 1;
2413 case AArch64::LDRHHui:
2414 case AArch64::LDURHHi:
2415 case AArch64::LDRSHWui:
2416 case AArch64::LDURSHWi:
2417 case AArch64::STRHHui:
2418 case AArch64::STURHHi:
2419 return 2;
2420 case AArch64::LDRSui:
2421 case AArch64::LDURSi:
2422 case AArch64::LDRSWui:
2423 case AArch64::LDURSWi:
2424 case AArch64::LDRWui:
2425 case AArch64::LDURWi:
2426 case AArch64::STRSui:
2427 case AArch64::STURSi:
2428 case AArch64::STRWui:
2429 case AArch64::STURWi:
2430 case AArch64::LDPSi:
2431 case AArch64::LDPSWi:
2432 case AArch64::LDPWi:
2433 case AArch64::STPSi:
2434 case AArch64::STPWi:
2435 return 4;
2436 case AArch64::LDRDui:
2437 case AArch64::LDURDi:
2438 case AArch64::LDRXui:
2439 case AArch64::LDURXi:
2440 case AArch64::STRDui:
2441 case AArch64::STURDi:
2442 case AArch64::STRXui:
2443 case AArch64::STURXi:
2444 case AArch64::LDPDi:
2445 case AArch64::LDPXi:
2446 case AArch64::STPDi:
2447 case AArch64::STPXi:
2448 return 8;
2449 case AArch64::LDRQui:
2450 case AArch64::LDURQi:
2451 case AArch64::STRQui:
2452 case AArch64::STURQi:
2453 case AArch64::LDPQi:
2454 case AArch64::STPQi:
2455 case AArch64::STGOffset:
2456 case AArch64::STZGOffset:
2457 case AArch64::ST2GOffset:
2458 case AArch64::STZ2GOffset:
2459 case AArch64::STGPi:
2460 return 16;
2461 }
2462 }
2463
2464 // Scale the unscaled offset. Returns false if the unscaled offset can't be
2465 // scaled.
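// For example (a sketch), an STURXi access at byte offset 16 scales to an
// element offset of 2 (16 / 8), while a byte offset of 12 is rejected
// because 12 is not a multiple of the 8-byte access size.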
2466 static bool scaleOffset(unsigned Opc, int64_t &Offset) {
2467 int Scale = AArch64InstrInfo::getMemScale(Opc);
2468
2469 // If the byte-offset isn't a multiple of the stride, we can't scale this
2470 // offset.
2471 if (Offset % Scale != 0)
2472 return false;
2473
2474 // Convert the byte-offset used by unscaled into an "element" offset used
2475 // by the scaled pair load/store instructions.
2476 Offset /= Scale;
2477 return true;
2478 }
2479
2480 static bool canPairLdStOpc(unsigned FirstOpc, unsigned SecondOpc) {
2481 if (FirstOpc == SecondOpc)
2482 return true;
2483 // We can also pair sign-ext and zero-ext instructions.
2484 switch (FirstOpc) {
2485 default:
2486 return false;
2487 case AArch64::LDRWui:
2488 case AArch64::LDURWi:
2489 return SecondOpc == AArch64::LDRSWui || SecondOpc == AArch64::LDURSWi;
2490 case AArch64::LDRSWui:
2491 case AArch64::LDURSWi:
2492 return SecondOpc == AArch64::LDRWui || SecondOpc == AArch64::LDURWi;
2493 }
2494 // These instructions can't be paired based on their opcodes.
2495 return false;
2496 }
2497
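// Decide whether two frame-index based accesses may be clustered. As an
// illustrative sketch: two fixed stack objects at byte offsets 0 and 8, each
// accessed with an 8-byte load at a zero instruction offset, give scaled
// object offsets 0 and 1 and are therefore considered adjacent.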
2498 static bool shouldClusterFI(const MachineFrameInfo &MFI, int FI1,
2499 int64_t Offset1, unsigned Opcode1, int FI2,
2500 int64_t Offset2, unsigned Opcode2) {
2501 // Accesses through fixed stack object frame indices may access a different
2502 // fixed stack slot. Check that the object offsets + offsets match.
2503 if (MFI.isFixedObjectIndex(FI1) && MFI.isFixedObjectIndex(FI2)) {
2504 int64_t ObjectOffset1 = MFI.getObjectOffset(FI1);
2505 int64_t ObjectOffset2 = MFI.getObjectOffset(FI2);
2506 assert(ObjectOffset1 <= ObjectOffset2 && "Object offsets are not ordered.");
2507 // Convert to scaled object offsets.
2508 int Scale1 = AArch64InstrInfo::getMemScale(Opcode1);
2509 if (ObjectOffset1 % Scale1 != 0)
2510 return false;
2511 ObjectOffset1 /= Scale1;
2512 int Scale2 = AArch64InstrInfo::getMemScale(Opcode2);
2513 if (ObjectOffset2 % Scale2 != 0)
2514 return false;
2515 ObjectOffset2 /= Scale2;
2516 ObjectOffset1 += Offset1;
2517 ObjectOffset2 += Offset2;
2518 return ObjectOffset1 + 1 == ObjectOffset2;
2519 }
2520
2521 return FI1 == FI2;
2522 }
2523
2524 /// Detect opportunities for ldp/stp formation.
2525 ///
2526 /// Only called for LdSt for which getMemOperandWithOffset returns true.
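/// For example (a sketch), "ldr x1, [x0]" followed by "ldr x2, [x0, #8]"
/// has scaled offsets 0 and 1 and may later be rewritten as
/// "ldp x1, x2, [x0]", subject to the checks below.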
2527 bool AArch64InstrInfo::shouldClusterMemOps(
2528 ArrayRef<const MachineOperand *> BaseOps1,
2529 ArrayRef<const MachineOperand *> BaseOps2, unsigned NumLoads,
2530 unsigned NumBytes) const {
2531 assert(BaseOps1.size() == 1 && BaseOps2.size() == 1);
2532 const MachineOperand &BaseOp1 = *BaseOps1.front();
2533 const MachineOperand &BaseOp2 = *BaseOps2.front();
2534 const MachineInstr &FirstLdSt = *BaseOp1.getParent();
2535 const MachineInstr &SecondLdSt = *BaseOp2.getParent();
2536 if (BaseOp1.getType() != BaseOp2.getType())
2537 return false;
2538
2539 assert((BaseOp1.isReg() || BaseOp1.isFI()) &&
2540 "Only base registers and frame indices are supported.");
2541
2542 // Check for both base regs and base FI.
2543 if (BaseOp1.isReg() && BaseOp1.getReg() != BaseOp2.getReg())
2544 return false;
2545
2546 // Only cluster up to a single pair.
2547 if (NumLoads > 2)
2548 return false;
2549
2550 if (!isPairableLdStInst(FirstLdSt) || !isPairableLdStInst(SecondLdSt))
2551 return false;
2552
2553 // Can we pair these instructions based on their opcodes?
2554 unsigned FirstOpc = FirstLdSt.getOpcode();
2555 unsigned SecondOpc = SecondLdSt.getOpcode();
2556 if (!canPairLdStOpc(FirstOpc, SecondOpc))
2557 return false;
2558
2559 // Can't merge volatiles or load/stores that have a hint to avoid pair
2560 // formation, for example.
2561 if (!isCandidateToMergeOrPair(FirstLdSt) ||
2562 !isCandidateToMergeOrPair(SecondLdSt))
2563 return false;
2564
2565 // isCandidateToMergeOrPair guarantees that operand 2 is an immediate.
2566 int64_t Offset1 = FirstLdSt.getOperand(2).getImm();
2567 if (isUnscaledLdSt(FirstOpc) && !scaleOffset(FirstOpc, Offset1))
2568 return false;
2569
2570 int64_t Offset2 = SecondLdSt.getOperand(2).getImm();
2571 if (isUnscaledLdSt(SecondOpc) && !scaleOffset(SecondOpc, Offset2))
2572 return false;
2573
2574 // Pairwise instructions have a 7-bit signed offset field.
2575 if (Offset1 > 63 || Offset1 < -64)
2576 return false;
2577
2578 // The caller should already have ordered First/SecondLdSt by offset.
2579 // Note: except for non-equal frame index bases
2580 if (BaseOp1.isFI()) {
2581 assert((!BaseOp1.isIdenticalTo(BaseOp2) || Offset1 <= Offset2) &&
2582 "Caller should have ordered offsets.");
2583
2584 const MachineFrameInfo &MFI =
2585 FirstLdSt.getParent()->getParent()->getFrameInfo();
2586 return shouldClusterFI(MFI, BaseOp1.getIndex(), Offset1, FirstOpc,
2587 BaseOp2.getIndex(), Offset2, SecondOpc);
2588 }
2589
2590 assert(Offset1 <= Offset2 && "Caller should have ordered offsets.");
2591
2592 return Offset1 + 1 == Offset2;
2593 }
2594
2595 static const MachineInstrBuilder &AddSubReg(const MachineInstrBuilder &MIB,
2596 unsigned Reg, unsigned SubIdx,
2597 unsigned State,
2598 const TargetRegisterInfo *TRI) {
2599 if (!SubIdx)
2600 return MIB.addReg(Reg, State);
2601
2602 if (Register::isPhysicalRegister(Reg))
2603 return MIB.addReg(TRI->getSubReg(Reg, SubIdx), State);
2604 return MIB.addReg(Reg, State, SubIdx);
2605 }
2606
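// Return true if copying a register tuple sub-register by sub-register in
// the forward (ascending) direction would overwrite a source register that
// has not been read yet. Illustrative sketch: copying D1_D2 into D2_D3 with
// NumRegs = 2 gives (2 - 1) & 0x1f = 1 < 2, so the tuple copy is emitted in
// reverse order instead. (Register names here are illustrative.)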
2607 static bool forwardCopyWillClobberTuple(unsigned DestReg, unsigned SrcReg,
2608 unsigned NumRegs) {
2609 // We really want the positive remainder mod 32 here; that happens to be
2610 // easily obtainable with a mask.
2611 return ((DestReg - SrcReg) & 0x1f) < NumRegs;
2612 }
2613
2614 void AArch64InstrInfo::copyPhysRegTuple(MachineBasicBlock &MBB,
2615 MachineBasicBlock::iterator I,
2616 const DebugLoc &DL, MCRegister DestReg,
2617 MCRegister SrcReg, bool KillSrc,
2618 unsigned Opcode,
2619 ArrayRef<unsigned> Indices) const {
2620 assert(Subtarget.hasNEON() && "Unexpected register copy without NEON");
2621 const TargetRegisterInfo *TRI = &getRegisterInfo();
2622 uint16_t DestEncoding = TRI->getEncodingValue(DestReg);
2623 uint16_t SrcEncoding = TRI->getEncodingValue(SrcReg);
2624 unsigned NumRegs = Indices.size();
2625
2626 int SubReg = 0, End = NumRegs, Incr = 1;
2627 if (forwardCopyWillClobberTuple(DestEncoding, SrcEncoding, NumRegs)) {
2628 SubReg = NumRegs - 1;
2629 End = -1;
2630 Incr = -1;
2631 }
2632
2633 for (; SubReg != End; SubReg += Incr) {
2634 const MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(Opcode));
2635 AddSubReg(MIB, DestReg, Indices[SubReg], RegState::Define, TRI);
2636 AddSubReg(MIB, SrcReg, Indices[SubReg], 0, TRI);
2637 AddSubReg(MIB, SrcReg, Indices[SubReg], getKillRegState(KillSrc), TRI);
2638 }
2639 }
2640
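// Copy a GPR register tuple (e.g. an X-register pair such as those used by
// CASP) one sub-register at a time by emitting
// "orr <dst>, <zero-reg>, <src>, lsl #0" for each element. Register names in
// this description are illustrative.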
2641 void AArch64InstrInfo::copyGPRRegTuple(MachineBasicBlock &MBB,
2642 MachineBasicBlock::iterator I,
2643 DebugLoc DL, unsigned DestReg,
2644 unsigned SrcReg, bool KillSrc,
2645 unsigned Opcode, unsigned ZeroReg,
2646 llvm::ArrayRef<unsigned> Indices) const {
2647 const TargetRegisterInfo *TRI = &getRegisterInfo();
2648 unsigned NumRegs = Indices.size();
2649
2650 #ifndef NDEBUG
2651 uint16_t DestEncoding = TRI->getEncodingValue(DestReg);
2652 uint16_t SrcEncoding = TRI->getEncodingValue(SrcReg);
2653 assert(DestEncoding % NumRegs == 0 && SrcEncoding % NumRegs == 0 &&
2654 "GPR reg sequences should not be able to overlap");
2655 #endif
2656
2657 for (unsigned SubReg = 0; SubReg != NumRegs; ++SubReg) {
2658 const MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(Opcode));
2659 AddSubReg(MIB, DestReg, Indices[SubReg], RegState::Define, TRI);
2660 MIB.addReg(ZeroReg);
2661 AddSubReg(MIB, SrcReg, Indices[SubReg], getKillRegState(KillSrc), TRI);
2662 MIB.addImm(0);
2663 }
2664 }
2665
2666 void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
2667 MachineBasicBlock::iterator I,
2668 const DebugLoc &DL, MCRegister DestReg,
2669 MCRegister SrcReg, bool KillSrc) const {
2670 if (AArch64::GPR32spRegClass.contains(DestReg) &&
2671 (AArch64::GPR32spRegClass.contains(SrcReg) || SrcReg == AArch64::WZR)) {
2672 const TargetRegisterInfo *TRI = &getRegisterInfo();
2673
2674 if (DestReg == AArch64::WSP || SrcReg == AArch64::WSP) {
2675 // If either operand is WSP, expand to ADD #0.
2676 if (Subtarget.hasZeroCycleRegMove()) {
2677 // Cyclone recognizes "ADD Xd, Xn, #0" as a zero-cycle register move.
2678 MCRegister DestRegX = TRI->getMatchingSuperReg(
2679 DestReg, AArch64::sub_32, &AArch64::GPR64spRegClass);
2680 MCRegister SrcRegX = TRI->getMatchingSuperReg(
2681 SrcReg, AArch64::sub_32, &AArch64::GPR64spRegClass);
2682 // This instruction is reading and writing X registers. This may upset
2683 // the register scavenger and machine verifier, so we need to indicate
2684 // that we are reading an undefined value from SrcRegX, but a proper
2685 // value from SrcReg.
2686 BuildMI(MBB, I, DL, get(AArch64::ADDXri), DestRegX)
2687 .addReg(SrcRegX, RegState::Undef)
2688 .addImm(0)
2689 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0))
2690 .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
2691 } else {
2692 BuildMI(MBB, I, DL, get(AArch64::ADDWri), DestReg)
2693 .addReg(SrcReg, getKillRegState(KillSrc))
2694 .addImm(0)
2695 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
2696 }
2697 } else if (SrcReg == AArch64::WZR && Subtarget.hasZeroCycleZeroingGP()) {
2698 BuildMI(MBB, I, DL, get(AArch64::MOVZWi), DestReg)
2699 .addImm(0)
2700 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
2701 } else {
2702 if (Subtarget.hasZeroCycleRegMove()) {
2703 // Cyclone recognizes "ORR Xd, XZR, Xm" as a zero-cycle register move.
2704 MCRegister DestRegX = TRI->getMatchingSuperReg(
2705 DestReg, AArch64::sub_32, &AArch64::GPR64spRegClass);
2706 MCRegister SrcRegX = TRI->getMatchingSuperReg(
2707 SrcReg, AArch64::sub_32, &AArch64::GPR64spRegClass);
2708 // This instruction is reading and writing X registers. This may upset
2709 // the register scavenger and machine verifier, so we need to indicate
2710 // that we are reading an undefined value from SrcRegX, but a proper
2711 // value from SrcReg.
2712 BuildMI(MBB, I, DL, get(AArch64::ORRXrr), DestRegX)
2713 .addReg(AArch64::XZR)
2714 .addReg(SrcRegX, RegState::Undef)
2715 .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
2716 } else {
2717 // Otherwise, expand to ORR WZR.
2718 BuildMI(MBB, I, DL, get(AArch64::ORRWrr), DestReg)
2719 .addReg(AArch64::WZR)
2720 .addReg(SrcReg, getKillRegState(KillSrc));
2721 }
2722 }
2723 return;
2724 }
2725
2726 // Copy a Predicate register by ORRing with itself.
2727 if (AArch64::PPRRegClass.contains(DestReg) &&
2728 AArch64::PPRRegClass.contains(SrcReg)) {
2729 assert(Subtarget.hasSVE() && "Unexpected SVE register.");
2730 BuildMI(MBB, I, DL, get(AArch64::ORR_PPzPP), DestReg)
2731 .addReg(SrcReg) // Pg
2732 .addReg(SrcReg)
2733 .addReg(SrcReg, getKillRegState(KillSrc));
2734 return;
2735 }
2736
2737 // Copy a Z register by ORRing with itself.
2738 if (AArch64::ZPRRegClass.contains(DestReg) &&
2739 AArch64::ZPRRegClass.contains(SrcReg)) {
2740 assert(Subtarget.hasSVE() && "Unexpected SVE register.");
2741 BuildMI(MBB, I, DL, get(AArch64::ORR_ZZZ), DestReg)
2742 .addReg(SrcReg)
2743 .addReg(SrcReg, getKillRegState(KillSrc));
2744 return;
2745 }
2746
2747 if (AArch64::GPR64spRegClass.contains(DestReg) &&
2748 (AArch64::GPR64spRegClass.contains(SrcReg) || SrcReg == AArch64::XZR)) {
2749 if (DestReg == AArch64::SP || SrcReg == AArch64::SP) {
2750 // If either operand is SP, expand to ADD #0.
2751 BuildMI(MBB, I, DL, get(AArch64::ADDXri), DestReg)
2752 .addReg(SrcReg, getKillRegState(KillSrc))
2753 .addImm(0)
2754 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
2755 } else if (SrcReg == AArch64::XZR && Subtarget.hasZeroCycleZeroingGP()) {
2756 BuildMI(MBB, I, DL, get(AArch64::MOVZXi), DestReg)
2757 .addImm(0)
2758 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
2759 } else {
2760 // Otherwise, expand to ORR XZR.
2761 BuildMI(MBB, I, DL, get(AArch64::ORRXrr), DestReg)
2762 .addReg(AArch64::XZR)
2763 .addReg(SrcReg, getKillRegState(KillSrc));
2764 }
2765 return;
2766 }
2767
2768 // Copy a DDDD register quad by copying the individual sub-registers.
2769 if (AArch64::DDDDRegClass.contains(DestReg) &&
2770 AArch64::DDDDRegClass.contains(SrcReg)) {
2771 static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1,
2772 AArch64::dsub2, AArch64::dsub3};
2773 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8,
2774 Indices);
2775 return;
2776 }
2777
2778 // Copy a DDD register triple by copying the individual sub-registers.
2779 if (AArch64::DDDRegClass.contains(DestReg) &&
2780 AArch64::DDDRegClass.contains(SrcReg)) {
2781 static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1,
2782 AArch64::dsub2};
2783 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8,
2784 Indices);
2785 return;
2786 }
2787
2788 // Copy a DD register pair by copying the individual sub-registers.
2789 if (AArch64::DDRegClass.contains(DestReg) &&
2790 AArch64::DDRegClass.contains(SrcReg)) {
2791 static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1};
2792 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8,
2793 Indices);
2794 return;
2795 }
2796
2797 // Copy a QQQQ register quad by copying the individual sub-registers.
2798 if (AArch64::QQQQRegClass.contains(DestReg) &&
2799 AArch64::QQQQRegClass.contains(SrcReg)) {
2800 static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1,
2801 AArch64::qsub2, AArch64::qsub3};
2802 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8,
2803 Indices);
2804 return;
2805 }
2806
2807 // Copy a QQQ register triple by copying the individual sub-registers.
2808 if (AArch64::QQQRegClass.contains(DestReg) &&
2809 AArch64::QQQRegClass.contains(SrcReg)) {
2810 static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1,
2811 AArch64::qsub2};
2812 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8,
2813 Indices);
2814 return;
2815 }
2816
2817 // Copy a QQ register pair by copying the individual sub-registers.
2818 if (AArch64::QQRegClass.contains(DestReg) &&
2819 AArch64::QQRegClass.contains(SrcReg)) {
2820 static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1};
2821 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8,
2822 Indices);
2823 return;
2824 }
2825
2826 if (AArch64::XSeqPairsClassRegClass.contains(DestReg) &&
2827 AArch64::XSeqPairsClassRegClass.contains(SrcReg)) {
2828 static const unsigned Indices[] = {AArch64::sube64, AArch64::subo64};
2829 copyGPRRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRXrs,
2830 AArch64::XZR, Indices);
2831 return;
2832 }
2833
2834 if (AArch64::WSeqPairsClassRegClass.contains(DestReg) &&
2835 AArch64::WSeqPairsClassRegClass.contains(SrcReg)) {
2836 static const unsigned Indices[] = {AArch64::sube32, AArch64::subo32};
2837 copyGPRRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRWrs,
2838 AArch64::WZR, Indices);
2839 return;
2840 }
2841
2842 if (AArch64::FPR128RegClass.contains(DestReg) &&
2843 AArch64::FPR128RegClass.contains(SrcReg)) {
2844 if (Subtarget.hasNEON()) {
2845 BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg)
2846 .addReg(SrcReg)
2847 .addReg(SrcReg, getKillRegState(KillSrc));
2848 } else {
2849 BuildMI(MBB, I, DL, get(AArch64::STRQpre))
2850 .addReg(AArch64::SP, RegState::Define)
2851 .addReg(SrcReg, getKillRegState(KillSrc))
2852 .addReg(AArch64::SP)
2853 .addImm(-16);
2854 BuildMI(MBB, I, DL, get(AArch64::LDRQpre))
2855 .addReg(AArch64::SP, RegState::Define)
2856 .addReg(DestReg, RegState::Define)
2857 .addReg(AArch64::SP)
2858 .addImm(16);
2859 }
2860 return;
2861 }
2862
2863 if (AArch64::FPR64RegClass.contains(DestReg) &&
2864 AArch64::FPR64RegClass.contains(SrcReg)) {
2865 if (Subtarget.hasNEON()) {
2866 DestReg = RI.getMatchingSuperReg(DestReg, AArch64::dsub,
2867 &AArch64::FPR128RegClass);
2868 SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::dsub,
2869 &AArch64::FPR128RegClass);
2870 BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg)
2871 .addReg(SrcReg)
2872 .addReg(SrcReg, getKillRegState(KillSrc));
2873 } else {
2874 BuildMI(MBB, I, DL, get(AArch64::FMOVDr), DestReg)
2875 .addReg(SrcReg, getKillRegState(KillSrc));
2876 }
2877 return;
2878 }
2879
2880 if (AArch64::FPR32RegClass.contains(DestReg) &&
2881 AArch64::FPR32RegClass.contains(SrcReg)) {
2882 if (Subtarget.hasNEON()) {
2883 DestReg = RI.getMatchingSuperReg(DestReg, AArch64::ssub,
2884 &AArch64::FPR128RegClass);
2885 SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::ssub,
2886 &AArch64::FPR128RegClass);
2887 BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg)
2888 .addReg(SrcReg)
2889 .addReg(SrcReg, getKillRegState(KillSrc));
2890 } else {
2891 BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
2892 .addReg(SrcReg, getKillRegState(KillSrc));
2893 }
2894 return;
2895 }
2896
2897 if (AArch64::FPR16RegClass.contains(DestReg) &&
2898 AArch64::FPR16RegClass.contains(SrcReg)) {
2899 if (Subtarget.hasNEON()) {
2900 DestReg = RI.getMatchingSuperReg(DestReg, AArch64::hsub,
2901 &AArch64::FPR128RegClass);
2902 SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::hsub,
2903 &AArch64::FPR128RegClass);
2904 BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg)
2905 .addReg(SrcReg)
2906 .addReg(SrcReg, getKillRegState(KillSrc));
2907 } else {
2908 DestReg = RI.getMatchingSuperReg(DestReg, AArch64::hsub,
2909 &AArch64::FPR32RegClass);
2910 SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::hsub,
2911 &AArch64::FPR32RegClass);
2912 BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
2913 .addReg(SrcReg, getKillRegState(KillSrc));
2914 }
2915 return;
2916 }
2917
2918 if (AArch64::FPR8RegClass.contains(DestReg) &&
2919 AArch64::FPR8RegClass.contains(SrcReg)) {
2920 if (Subtarget.hasNEON()) {
2921 DestReg = RI.getMatchingSuperReg(DestReg, AArch64::bsub,
2922 &AArch64::FPR128RegClass);
2923 SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::bsub,
2924 &AArch64::FPR128RegClass);
2925 BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg)
2926 .addReg(SrcReg)
2927 .addReg(SrcReg, getKillRegState(KillSrc));
2928 } else {
2929 DestReg = RI.getMatchingSuperReg(DestReg, AArch64::bsub,
2930 &AArch64::FPR32RegClass);
2931 SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::bsub,
2932 &AArch64::FPR32RegClass);
2933 BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
2934 .addReg(SrcReg, getKillRegState(KillSrc));
2935 }
2936 return;
2937 }
2938
2939 // Copies between GPR64 and FPR64.
2940 if (AArch64::FPR64RegClass.contains(DestReg) &&
2941 AArch64::GPR64RegClass.contains(SrcReg)) {
2942 BuildMI(MBB, I, DL, get(AArch64::FMOVXDr), DestReg)
2943 .addReg(SrcReg, getKillRegState(KillSrc));
2944 return;
2945 }
2946 if (AArch64::GPR64RegClass.contains(DestReg) &&
2947 AArch64::FPR64RegClass.contains(SrcReg)) {
2948 BuildMI(MBB, I, DL, get(AArch64::FMOVDXr), DestReg)
2949 .addReg(SrcReg, getKillRegState(KillSrc));
2950 return;
2951 }
2952 // Copies between GPR32 and FPR32.
2953 if (AArch64::FPR32RegClass.contains(DestReg) &&
2954 AArch64::GPR32RegClass.contains(SrcReg)) {
2955 BuildMI(MBB, I, DL, get(AArch64::FMOVWSr), DestReg)
2956 .addReg(SrcReg, getKillRegState(KillSrc));
2957 return;
2958 }
2959 if (AArch64::GPR32RegClass.contains(DestReg) &&
2960 AArch64::FPR32RegClass.contains(SrcReg)) {
2961 BuildMI(MBB, I, DL, get(AArch64::FMOVSWr), DestReg)
2962 .addReg(SrcReg, getKillRegState(KillSrc));
2963 return;
2964 }
2965
2966 if (DestReg == AArch64::NZCV) {
2967 assert(AArch64::GPR64RegClass.contains(SrcReg) && "Invalid NZCV copy");
2968 BuildMI(MBB, I, DL, get(AArch64::MSR))
2969 .addImm(AArch64SysReg::NZCV)
2970 .addReg(SrcReg, getKillRegState(KillSrc))
2971 .addReg(AArch64::NZCV, RegState::Implicit | RegState::Define);
2972 return;
2973 }
2974
2975 if (SrcReg == AArch64::NZCV) {
2976 assert(AArch64::GPR64RegClass.contains(DestReg) && "Invalid NZCV copy");
2977 BuildMI(MBB, I, DL, get(AArch64::MRS), DestReg)
2978 .addImm(AArch64SysReg::NZCV)
2979 .addReg(AArch64::NZCV, RegState::Implicit | getKillRegState(KillSrc));
2980 return;
2981 }
2982
2983 llvm_unreachable("unimplemented reg-to-reg copy");
2984 }
2985
2986 static void storeRegPairToStackSlot(const TargetRegisterInfo &TRI,
2987 MachineBasicBlock &MBB,
2988 MachineBasicBlock::iterator InsertBefore,
2989 const MCInstrDesc &MCID,
2990 Register SrcReg, bool IsKill,
2991 unsigned SubIdx0, unsigned SubIdx1, int FI,
2992 MachineMemOperand *MMO) {
2993 Register SrcReg0 = SrcReg;
2994 Register SrcReg1 = SrcReg;
2995 if (Register::isPhysicalRegister(SrcReg)) {
2996 SrcReg0 = TRI.getSubReg(SrcReg, SubIdx0);
2997 SubIdx0 = 0;
2998 SrcReg1 = TRI.getSubReg(SrcReg, SubIdx1);
2999 SubIdx1 = 0;
3000 }
3001 BuildMI(MBB, InsertBefore, DebugLoc(), MCID)
3002 .addReg(SrcReg0, getKillRegState(IsKill), SubIdx0)
3003 .addReg(SrcReg1, getKillRegState(IsKill), SubIdx1)
3004 .addFrameIndex(FI)
3005 .addImm(0)
3006 .addMemOperand(MMO);
3007 }
3008
3009 void AArch64InstrInfo::storeRegToStackSlot(
3010 MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register SrcReg,
3011 bool isKill, int FI, const TargetRegisterClass *RC,
3012 const TargetRegisterInfo *TRI) const {
3013 MachineFunction &MF = *MBB.getParent();
3014 MachineFrameInfo &MFI = MF.getFrameInfo();
3015
3016 MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
3017 MachineMemOperand *MMO =
3018 MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore,
3019 MFI.getObjectSize(FI), MFI.getObjectAlign(FI));
3020 unsigned Opc = 0;
3021 bool Offset = true;
3022 unsigned StackID = TargetStackID::Default;
3023 switch (TRI->getSpillSize(*RC)) {
3024 case 1:
3025 if (AArch64::FPR8RegClass.hasSubClassEq(RC))
3026 Opc = AArch64::STRBui;
3027 break;
3028 case 2:
3029 if (AArch64::FPR16RegClass.hasSubClassEq(RC))
3030 Opc = AArch64::STRHui;
3031 else if (AArch64::PPRRegClass.hasSubClassEq(RC)) {
3032 assert(Subtarget.hasSVE() && "Unexpected register store without SVE");
3033 Opc = AArch64::STR_PXI;
3034 StackID = TargetStackID::SVEVector;
3035 }
3036 break;
3037 case 4:
3038 if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
3039 Opc = AArch64::STRWui;
3040 if (Register::isVirtualRegister(SrcReg))
3041 MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR32RegClass);
3042 else
3043 assert(SrcReg != AArch64::WSP);
3044 } else if (AArch64::FPR32RegClass.hasSubClassEq(RC))
3045 Opc = AArch64::STRSui;
3046 break;
3047 case 8:
3048 if (AArch64::GPR64allRegClass.hasSubClassEq(RC)) {
3049 Opc = AArch64::STRXui;
3050 if (Register::isVirtualRegister(SrcReg))
3051 MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR64RegClass);
3052 else
3053 assert(SrcReg != AArch64::SP);
3054 } else if (AArch64::FPR64RegClass.hasSubClassEq(RC)) {
3055 Opc = AArch64::STRDui;
3056 } else if (AArch64::WSeqPairsClassRegClass.hasSubClassEq(RC)) {
3057 storeRegPairToStackSlot(getRegisterInfo(), MBB, MBBI,
3058 get(AArch64::STPWi), SrcReg, isKill,
3059 AArch64::sube32, AArch64::subo32, FI, MMO);
3060 return;
3061 }
3062 break;
3063 case 16:
3064 if (AArch64::FPR128RegClass.hasSubClassEq(RC))
3065 Opc = AArch64::STRQui;
3066 else if (AArch64::DDRegClass.hasSubClassEq(RC)) {
3067 assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
3068 Opc = AArch64::ST1Twov1d;
3069 Offset = false;
3070 } else if (AArch64::XSeqPairsClassRegClass.hasSubClassEq(RC)) {
3071 storeRegPairToStackSlot(getRegisterInfo(), MBB, MBBI,
3072 get(AArch64::STPXi), SrcReg, isKill,
3073 AArch64::sube64, AArch64::subo64, FI, MMO);
3074 return;
3075 } else if (AArch64::ZPRRegClass.hasSubClassEq(RC)) {
3076 assert(Subtarget.hasSVE() && "Unexpected register store without SVE");
3077 Opc = AArch64::STR_ZXI;
3078 StackID = TargetStackID::SVEVector;
3079 }
3080 break;
3081 case 24:
3082 if (AArch64::DDDRegClass.hasSubClassEq(RC)) {
3083 assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
3084 Opc = AArch64::ST1Threev1d;
3085 Offset = false;
3086 }
3087 break;
3088 case 32:
3089 if (AArch64::DDDDRegClass.hasSubClassEq(RC)) {
3090 assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
3091 Opc = AArch64::ST1Fourv1d;
3092 Offset = false;
3093 } else if (AArch64::QQRegClass.hasSubClassEq(RC)) {
3094 assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
3095 Opc = AArch64::ST1Twov2d;
3096 Offset = false;
3097 } else if (AArch64::ZPR2RegClass.hasSubClassEq(RC)) {
3098 assert(Subtarget.hasSVE() && "Unexpected register store without SVE");
3099 Opc = AArch64::STR_ZZXI;
3100 StackID = TargetStackID::SVEVector;
3101 }
3102 break;
3103 case 48:
3104 if (AArch64::QQQRegClass.hasSubClassEq(RC)) {
3105 assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
3106 Opc = AArch64::ST1Threev2d;
3107 Offset = false;
3108 } else if (AArch64::ZPR3RegClass.hasSubClassEq(RC)) {
3109 assert(Subtarget.hasSVE() && "Unexpected register store without SVE");
3110 Opc = AArch64::STR_ZZZXI;
3111 StackID = TargetStackID::SVEVector;
3112 }
3113 break;
3114 case 64:
3115 if (AArch64::QQQQRegClass.hasSubClassEq(RC)) {
3116 assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
3117 Opc = AArch64::ST1Fourv2d;
3118 Offset = false;
3119 } else if (AArch64::ZPR4RegClass.hasSubClassEq(RC)) {
3120 assert(Subtarget.hasSVE() && "Unexpected register store without SVE");
3121 Opc = AArch64::STR_ZZZZXI;
3122 StackID = TargetStackID::SVEVector;
3123 }
3124 break;
3125 }
3126 assert(Opc && "Unknown register class");
3127 MFI.setStackID(FI, StackID);
3128
3129 const MachineInstrBuilder MI = BuildMI(MBB, MBBI, DebugLoc(), get(Opc))
3130 .addReg(SrcReg, getKillRegState(isKill))
3131 .addFrameIndex(FI);
3132
3133 if (Offset)
3134 MI.addImm(0);
3135 MI.addMemOperand(MMO);
3136 }
3137
3138 static void loadRegPairFromStackSlot(const TargetRegisterInfo &TRI,
3139 MachineBasicBlock &MBB,
3140 MachineBasicBlock::iterator InsertBefore,
3141 const MCInstrDesc &MCID,
3142 Register DestReg, unsigned SubIdx0,
3143 unsigned SubIdx1, int FI,
3144 MachineMemOperand *MMO) {
3145 Register DestReg0 = DestReg;
3146 Register DestReg1 = DestReg;
3147 bool IsUndef = true;
3148 if (Register::isPhysicalRegister(DestReg)) {
3149 DestReg0 = TRI.getSubReg(DestReg, SubIdx0);
3150 SubIdx0 = 0;
3151 DestReg1 = TRI.getSubReg(DestReg, SubIdx1);
3152 SubIdx1 = 0;
3153 IsUndef = false;
3154 }
3155 BuildMI(MBB, InsertBefore, DebugLoc(), MCID)
3156 .addReg(DestReg0, RegState::Define | getUndefRegState(IsUndef), SubIdx0)
3157 .addReg(DestReg1, RegState::Define | getUndefRegState(IsUndef), SubIdx1)
3158 .addFrameIndex(FI)
3159 .addImm(0)
3160 .addMemOperand(MMO);
3161 }
3162
3163 void AArch64InstrInfo::loadRegFromStackSlot(
3164 MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register DestReg,
3165 int FI, const TargetRegisterClass *RC,
3166 const TargetRegisterInfo *TRI) const {
3167 MachineFunction &MF = *MBB.getParent();
3168 MachineFrameInfo &MFI = MF.getFrameInfo();
3169 MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
3170 MachineMemOperand *MMO =
3171 MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOLoad,
3172 MFI.getObjectSize(FI), MFI.getObjectAlign(FI));
3173
3174 unsigned Opc = 0;
3175 bool Offset = true;
3176 unsigned StackID = TargetStackID::Default;
3177 switch (TRI->getSpillSize(*RC)) {
3178 case 1:
3179 if (AArch64::FPR8RegClass.hasSubClassEq(RC))
3180 Opc = AArch64::LDRBui;
3181 break;
3182 case 2:
3183 if (AArch64::FPR16RegClass.hasSubClassEq(RC))
3184 Opc = AArch64::LDRHui;
3185 else if (AArch64::PPRRegClass.hasSubClassEq(RC)) {
3186 assert(Subtarget.hasSVE() && "Unexpected register load without SVE");
3187 Opc = AArch64::LDR_PXI;
3188 StackID = TargetStackID::SVEVector;
3189 }
3190 break;
3191 case 4:
3192 if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
3193 Opc = AArch64::LDRWui;
3194 if (Register::isVirtualRegister(DestReg))
3195 MF.getRegInfo().constrainRegClass(DestReg, &AArch64::GPR32RegClass);
3196 else
3197 assert(DestReg != AArch64::WSP);
3198 } else if (AArch64::FPR32RegClass.hasSubClassEq(RC))
3199 Opc = AArch64::LDRSui;
3200 break;
3201 case 8:
3202 if (AArch64::GPR64allRegClass.hasSubClassEq(RC)) {
3203 Opc = AArch64::LDRXui;
3204 if (Register::isVirtualRegister(DestReg))
3205 MF.getRegInfo().constrainRegClass(DestReg, &AArch64::GPR64RegClass);
3206 else
3207 assert(DestReg != AArch64::SP);
3208 } else if (AArch64::FPR64RegClass.hasSubClassEq(RC)) {
3209 Opc = AArch64::LDRDui;
3210 } else if (AArch64::WSeqPairsClassRegClass.hasSubClassEq(RC)) {
3211 loadRegPairFromStackSlot(getRegisterInfo(), MBB, MBBI,
3212 get(AArch64::LDPWi), DestReg, AArch64::sube32,
3213 AArch64::subo32, FI, MMO);
3214 return;
3215 }
3216 break;
3217 case 16:
3218 if (AArch64::FPR128RegClass.hasSubClassEq(RC))
3219 Opc = AArch64::LDRQui;
3220 else if (AArch64::DDRegClass.hasSubClassEq(RC)) {
3221 assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
3222 Opc = AArch64::LD1Twov1d;
3223 Offset = false;
3224 } else if (AArch64::XSeqPairsClassRegClass.hasSubClassEq(RC)) {
3225 loadRegPairFromStackSlot(getRegisterInfo(), MBB, MBBI,
3226 get(AArch64::LDPXi), DestReg, AArch64::sube64,
3227 AArch64::subo64, FI, MMO);
3228 return;
3229 } else if (AArch64::ZPRRegClass.hasSubClassEq(RC)) {
3230 assert(Subtarget.hasSVE() && "Unexpected register load without SVE");
3231 Opc = AArch64::LDR_ZXI;
3232 StackID = TargetStackID::SVEVector;
3233 }
3234 break;
3235 case 24:
3236 if (AArch64::DDDRegClass.hasSubClassEq(RC)) {
3237 assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
3238 Opc = AArch64::LD1Threev1d;
3239 Offset = false;
3240 }
3241 break;
3242 case 32:
3243 if (AArch64::DDDDRegClass.hasSubClassEq(RC)) {
3244 assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
3245 Opc = AArch64::LD1Fourv1d;
3246 Offset = false;
3247 } else if (AArch64::QQRegClass.hasSubClassEq(RC)) {
3248 assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
3249 Opc = AArch64::LD1Twov2d;
3250 Offset = false;
3251 } else if (AArch64::ZPR2RegClass.hasSubClassEq(RC)) {
3252 assert(Subtarget.hasSVE() && "Unexpected register load without SVE");
3253 Opc = AArch64::LDR_ZZXI;
3254 StackID = TargetStackID::SVEVector;
3255 }
3256 break;
3257 case 48:
3258 if (AArch64::QQQRegClass.hasSubClassEq(RC)) {
3259 assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
3260 Opc = AArch64::LD1Threev2d;
3261 Offset = false;
3262 } else if (AArch64::ZPR3RegClass.hasSubClassEq(RC)) {
3263 assert(Subtarget.hasSVE() && "Unexpected register load without SVE");
3264 Opc = AArch64::LDR_ZZZXI;
3265 StackID = TargetStackID::SVEVector;
3266 }
3267 break;
3268 case 64:
3269 if (AArch64::QQQQRegClass.hasSubClassEq(RC)) {
3270 assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
3271 Opc = AArch64::LD1Fourv2d;
3272 Offset = false;
3273 } else if (AArch64::ZPR4RegClass.hasSubClassEq(RC)) {
3274 assert(Subtarget.hasSVE() && "Unexpected register load without SVE");
3275 Opc = AArch64::LDR_ZZZZXI;
3276 StackID = TargetStackID::SVEVector;
3277 }
3278 break;
3279 }
3280
3281 assert(Opc && "Unknown register class");
3282 MFI.setStackID(FI, StackID);
3283
3284 const MachineInstrBuilder MI = BuildMI(MBB, MBBI, DebugLoc(), get(Opc))
3285 .addReg(DestReg, getDefRegState(true))
3286 .addFrameIndex(FI);
3287 if (Offset)
3288 MI.addImm(0);
3289 MI.addMemOperand(MMO);
3290 }
3291
3292 bool llvm::isNZCVTouchedInInstructionRange(const MachineInstr &DefMI,
3293 const MachineInstr &UseMI,
3294 const TargetRegisterInfo *TRI) {
3295 return any_of(instructionsWithoutDebug(std::next(DefMI.getIterator()),
3296 UseMI.getIterator()),
3297 [TRI](const MachineInstr &I) {
3298 return I.modifiesRegister(AArch64::NZCV, TRI) ||
3299 I.readsRegister(AArch64::NZCV, TRI);
3300 });
3301 }
3302
3303 // Helper function to emit a frame offset adjustment from a given
3304 // pointer (SrcReg) into DestReg. The function is explicit in that the
3305 // caller must supply the opcode to use for the adjustment.
3306 static void emitFrameOffsetAdj(MachineBasicBlock &MBB,
3307 MachineBasicBlock::iterator MBBI,
3308 const DebugLoc &DL, unsigned DestReg,
3309 unsigned SrcReg, int64_t Offset, unsigned Opc,
3310 const TargetInstrInfo *TII,
3311 MachineInstr::MIFlag Flag, bool NeedsWinCFI,
3312 bool *HasWinCFI) {
3313 int Sign = 1;
3314 unsigned MaxEncoding, ShiftSize;
3315 switch (Opc) {
3316 case AArch64::ADDXri:
3317 case AArch64::ADDSXri:
3318 case AArch64::SUBXri:
3319 case AArch64::SUBSXri:
3320 MaxEncoding = 0xfff;
3321 ShiftSize = 12;
3322 break;
3323 case AArch64::ADDVL_XXI:
3324 case AArch64::ADDPL_XXI:
3325 MaxEncoding = 31;
3326 ShiftSize = 0;
3327 if (Offset < 0) {
3328 MaxEncoding = 32;
3329 Sign = -1;
3330 Offset = -Offset;
3331 }
3332 break;
3333 default:
3334 llvm_unreachable("Unsupported opcode");
3335 }
3336
3337 // FIXME: If the offset won't fit in 24-bits, compute the offset into a
3338 // scratch register. If DestReg is a virtual register, use it as the
3339 // scratch register; otherwise, create a new virtual register (to be
3340 // replaced by the scavenger at the end of PEI). That case can be optimized
3341 // slightly if DestReg is SP which is always 16-byte aligned, so the scratch
3342 // register can be loaded with offset%8 and the add/sub can use an extending
3343 // instruction with LSL#3.
3344 // Currently the function handles any offsets but generates a poor sequence
3345 // of code.
3346 // assert(Offset < (1 << 24) && "unimplemented reg plus immediate");
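  // As an illustrative sketch (registers and the offset value are assumed,
  // not taken from real output): adjusting by 0xFFF123 bytes with ADDXri is
  // split into two chunks by the loop below:
  //   add xTmp, xSrc, #0xfff, lsl #12   ; covers 0xfff000
  //   add xDst, xTmp, #0x123            ; covers the remaining 0x123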
3347
3348 const unsigned MaxEncodableValue = MaxEncoding << ShiftSize;
3349 Register TmpReg = DestReg;
3350 if (TmpReg == AArch64::XZR)
3351 TmpReg = MBB.getParent()->getRegInfo().createVirtualRegister(
3352 &AArch64::GPR64RegClass);
3353 do {
3354 uint64_t ThisVal = std::min<uint64_t>(Offset, MaxEncodableValue);
3355 unsigned LocalShiftSize = 0;
3356 if (ThisVal > MaxEncoding) {
3357 ThisVal = ThisVal >> ShiftSize;
3358 LocalShiftSize = ShiftSize;
3359 }
3360 assert((ThisVal >> ShiftSize) <= MaxEncoding &&
3361 "Encoding cannot handle value that big");
3362
3363 Offset -= ThisVal << LocalShiftSize;
3364 if (Offset == 0)
3365 TmpReg = DestReg;
3366 auto MBI = BuildMI(MBB, MBBI, DL, TII->get(Opc), TmpReg)
3367 .addReg(SrcReg)
3368 .addImm(Sign * (int)ThisVal);
3369 if (ShiftSize)
3370 MBI = MBI.addImm(
3371 AArch64_AM::getShifterImm(AArch64_AM::LSL, LocalShiftSize));
3372 MBI = MBI.setMIFlag(Flag);
3373
3374 if (NeedsWinCFI) {
3375 assert(Sign == 1 && "SEH directives should always have a positive sign");
3376 int Imm = (int)(ThisVal << LocalShiftSize);
3377 if ((DestReg == AArch64::FP && SrcReg == AArch64::SP) ||
3378 (SrcReg == AArch64::FP && DestReg == AArch64::SP)) {
3379 if (HasWinCFI)
3380 *HasWinCFI = true;
3381 if (Imm == 0)
3382 BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_SetFP)).setMIFlag(Flag);
3383 else
3384 BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_AddFP))
3385 .addImm(Imm)
3386 .setMIFlag(Flag);
3387 assert(Offset == 0 && "Expected remaining offset to be zero to "
3388 "emit a single SEH directive");
3389 } else if (DestReg == AArch64::SP) {
3390 if (HasWinCFI)
3391 *HasWinCFI = true;
3392 assert(SrcReg == AArch64::SP && "Unexpected SrcReg for SEH_StackAlloc");
3393 BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_StackAlloc))
3394 .addImm(Imm)
3395 .setMIFlag(Flag);
3396 }
3397 if (HasWinCFI)
3398 *HasWinCFI = true;
3399 }
3400
3401 SrcReg = TmpReg;
3402 } while (Offset);
3403 }
3404
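// emitFrameOffset - Emit instructions that add a StackOffset (a mix of a
// fixed byte part and scalable SVE parts) to SrcReg, placing the result in
// DestReg. A sketch of a possible expansion (values are illustrative):
// adding "16 bytes + 1 SVE data vector" to SP would emit
//   add   sp, sp, #16
//   addvl sp, sp, #1
// while predicate-sized parts are emitted with ADDPL.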
3405 void llvm::emitFrameOffset(MachineBasicBlock &MBB,
3406 MachineBasicBlock::iterator MBBI, const DebugLoc &DL,
3407 unsigned DestReg, unsigned SrcReg,
3408 StackOffset Offset, const TargetInstrInfo *TII,
3409 MachineInstr::MIFlag Flag, bool SetNZCV,
3410 bool NeedsWinCFI, bool *HasWinCFI) {
3411 int64_t Bytes, NumPredicateVectors, NumDataVectors;
3412 Offset.getForFrameOffset(Bytes, NumPredicateVectors, NumDataVectors);
3413
3414 // First emit non-scalable frame offsets, or a simple 'mov'.
3415 if (Bytes || (!Offset && SrcReg != DestReg)) {
3416 assert((DestReg != AArch64::SP || Bytes % 16 == 0) &&
3417 "SP increment/decrement not 16-byte aligned");
3418 unsigned Opc = SetNZCV ? AArch64::ADDSXri : AArch64::ADDXri;
3419 if (Bytes < 0) {
3420 Bytes = -Bytes;
3421 Opc = SetNZCV ? AArch64::SUBSXri : AArch64::SUBXri;
3422 }
3423 emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, Bytes, Opc, TII, Flag,
3424 NeedsWinCFI, HasWinCFI);
3425 SrcReg = DestReg;
3426 }
3427
3428 assert(!(SetNZCV && (NumPredicateVectors || NumDataVectors)) &&
3429 "SetNZCV not supported with SVE vectors");
3430 assert(!(NeedsWinCFI && (NumPredicateVectors || NumDataVectors)) &&
3431 "WinCFI not supported with SVE vectors");
3432
3433 if (NumDataVectors) {
3434 emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, NumDataVectors,
3435 AArch64::ADDVL_XXI, TII, Flag, NeedsWinCFI, nullptr);
3436 SrcReg = DestReg;
3437 }
3438
3439 if (NumPredicateVectors) {
3440 assert(DestReg != AArch64::SP && "Unaligned access to SP");
3441 emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, NumPredicateVectors,
3442 AArch64::ADDPL_XXI, TII, Flag, NeedsWinCFI, nullptr);
3443 }
3444 }
3445
3446 MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl(
3447 MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,
3448 MachineBasicBlock::iterator InsertPt, int FrameIndex,
3449 LiveIntervals *LIS, VirtRegMap *VRM) const {
3450 // This is a bit of a hack. Consider this instruction:
3451 //
3452 // %0 = COPY %sp; GPR64all:%0
3453 //
3454 // We explicitly chose GPR64all for the virtual register so such a copy might
3455 // be eliminated by RegisterCoalescer. However, that may not be possible, and
3456 // %0 may even spill. We can't spill %sp, and since it is in the GPR64all
3457 // register class, TargetInstrInfo::foldMemoryOperand() is going to try.
3458 //
3459 // To prevent that, we are going to constrain the %0 register class here.
3460 //
3461 // <rdar://problem/11522048>
3462 //
3463 if (MI.isFullCopy()) {
3464 Register DstReg = MI.getOperand(0).getReg();
3465 Register SrcReg = MI.getOperand(1).getReg();
3466 if (SrcReg == AArch64::SP && Register::isVirtualRegister(DstReg)) {
3467 MF.getRegInfo().constrainRegClass(DstReg, &AArch64::GPR64RegClass);
3468 return nullptr;
3469 }
3470 if (DstReg == AArch64::SP && Register::isVirtualRegister(SrcReg)) {
3471 MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR64RegClass);
3472 return nullptr;
3473 }
3474 }
3475
3476 // Handle the case where a copy is being spilled or filled but the source
3477 // and destination register class don't match. For example:
3478 //
3479 // %0 = COPY %xzr; GPR64common:%0
3480 //
3481 // In this case we can still safely fold away the COPY and generate the
3482 // following spill code:
3483 //
3484 // STRXui %xzr, %stack.0
3485 //
3486 // This also eliminates spilled cross register class COPYs (e.g. between x and
3487 // d regs) of the same size. For example:
3488 //
3489 // %0 = COPY %1; GPR64:%0, FPR64:%1
3490 //
3491 // will be filled as
3492 //
3493 // LDRDui %0, fi<#0>
3494 //
3495 // instead of
3496 //
3497 // LDRXui %Temp, fi<#0>
3498 // %0 = FMOV %Temp
3499 //
3500 if (MI.isCopy() && Ops.size() == 1 &&
3501 // Make sure we're only folding the explicit COPY defs/uses.
3502 (Ops[0] == 0 || Ops[0] == 1)) {
3503 bool IsSpill = Ops[0] == 0;
3504 bool IsFill = !IsSpill;
3505 const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
3506 const MachineRegisterInfo &MRI = MF.getRegInfo();
3507 MachineBasicBlock &MBB = *MI.getParent();
3508 const MachineOperand &DstMO = MI.getOperand(0);
3509 const MachineOperand &SrcMO = MI.getOperand(1);
3510 Register DstReg = DstMO.getReg();
3511 Register SrcReg = SrcMO.getReg();
3512 // This is slightly expensive to compute for physical regs since
3513 // getMinimalPhysRegClass is slow.
3514 auto getRegClass = [&](unsigned Reg) {
3515 return Register::isVirtualRegister(Reg) ? MRI.getRegClass(Reg)
3516 : TRI.getMinimalPhysRegClass(Reg);
3517 };
3518
3519 if (DstMO.getSubReg() == 0 && SrcMO.getSubReg() == 0) {
3520 assert(TRI.getRegSizeInBits(*getRegClass(DstReg)) ==
3521 TRI.getRegSizeInBits(*getRegClass(SrcReg)) &&
3522 "Mismatched register size in non subreg COPY");
3523 if (IsSpill)
3524 storeRegToStackSlot(MBB, InsertPt, SrcReg, SrcMO.isKill(), FrameIndex,
3525 getRegClass(SrcReg), &TRI);
3526 else
3527 loadRegFromStackSlot(MBB, InsertPt, DstReg, FrameIndex,
3528 getRegClass(DstReg), &TRI);
3529 return &*--InsertPt;
3530 }
3531
3532 // Handle cases like spilling def of:
3533 //
3534 // %0:sub_32<def,read-undef> = COPY %wzr; GPR64common:%0
3535 //
3536 // where the physical register source can be widened and stored to the full
3537 // virtual reg destination stack slot, in this case producing:
3538 //
3539 // STRXui %xzr, %stack.0
3540 //
3541 if (IsSpill && DstMO.isUndef() && Register::isPhysicalRegister(SrcReg)) {
3542 assert(SrcMO.getSubReg() == 0 &&
3543 "Unexpected subreg on physical register");
3544 const TargetRegisterClass *SpillRC;
3545 unsigned SpillSubreg;
3546 switch (DstMO.getSubReg()) {
3547 default:
3548 SpillRC = nullptr;
3549 break;
3550 case AArch64::sub_32:
3551 case AArch64::ssub:
3552 if (AArch64::GPR32RegClass.contains(SrcReg)) {
3553 SpillRC = &AArch64::GPR64RegClass;
3554 SpillSubreg = AArch64::sub_32;
3555 } else if (AArch64::FPR32RegClass.contains(SrcReg)) {
3556 SpillRC = &AArch64::FPR64RegClass;
3557 SpillSubreg = AArch64::ssub;
3558 } else
3559 SpillRC = nullptr;
3560 break;
3561 case AArch64::dsub:
3562 if (AArch64::FPR64RegClass.contains(SrcReg)) {
3563 SpillRC = &AArch64::FPR128RegClass;
3564 SpillSubreg = AArch64::dsub;
3565 } else
3566 SpillRC = nullptr;
3567 break;
3568 }
3569
3570 if (SpillRC)
3571 if (unsigned WidenedSrcReg =
3572 TRI.getMatchingSuperReg(SrcReg, SpillSubreg, SpillRC)) {
3573 storeRegToStackSlot(MBB, InsertPt, WidenedSrcReg, SrcMO.isKill(),
3574 FrameIndex, SpillRC, &TRI);
3575 return &*--InsertPt;
3576 }
3577 }
3578
3579 // Handle cases like filling use of:
3580 //
3581 // %0:sub_32<def,read-undef> = COPY %1; GPR64:%0, GPR32:%1
3582 //
3583 // where we can load the full virtual reg source stack slot, into the subreg
3584 // destination, in this case producing:
3585 //
3586 // LDRWui %0:sub_32<def,read-undef>, %stack.0
3587 //
3588 if (IsFill && SrcMO.getSubReg() == 0 && DstMO.isUndef()) {
3589 const TargetRegisterClass *FillRC;
3590 switch (DstMO.getSubReg()) {
3591 default:
3592 FillRC = nullptr;
3593 break;
3594 case AArch64::sub_32:
3595 FillRC = &AArch64::GPR32RegClass;
3596 break;
3597 case AArch64::ssub:
3598 FillRC = &AArch64::FPR32RegClass;
3599 break;
3600 case AArch64::dsub:
3601 FillRC = &AArch64::FPR64RegClass;
3602 break;
3603 }
3604
3605 if (FillRC) {
3606 assert(TRI.getRegSizeInBits(*getRegClass(SrcReg)) ==
3607 TRI.getRegSizeInBits(*FillRC) &&
3608 "Mismatched regclass size on folded subreg COPY");
3609 loadRegFromStackSlot(MBB, InsertPt, DstReg, FrameIndex, FillRC, &TRI);
3610 MachineInstr &LoadMI = *--InsertPt;
3611 MachineOperand &LoadDst = LoadMI.getOperand(0);
3612 assert(LoadDst.getSubReg() == 0 && "unexpected subreg on fill load");
3613 LoadDst.setSubReg(DstMO.getSubReg());
3614 LoadDst.setIsUndef();
3615 return &LoadMI;
3616 }
3617 }
3618 }
3619
3620 // Cannot fold.
3621 return nullptr;
3622 }
3623
3624 int llvm::isAArch64FrameOffsetLegal(const MachineInstr &MI,
3625 StackOffset &SOffset,
3626 bool *OutUseUnscaledOp,
3627 unsigned *OutUnscaledOp,
3628 int64_t *EmittableOffset) {
3629 // Set output values in case of early exit.
3630 if (EmittableOffset)
3631 *EmittableOffset = 0;
3632 if (OutUseUnscaledOp)
3633 *OutUseUnscaledOp = false;
3634 if (OutUnscaledOp)
3635 *OutUnscaledOp = 0;
3636
3637 // Exit early for structured vector spills/fills as they can't take an
3638 // immediate offset.
3639 switch (MI.getOpcode()) {
3640 default:
3641 break;
3642 case AArch64::LD1Twov2d:
3643 case AArch64::LD1Threev2d:
3644 case AArch64::LD1Fourv2d:
3645 case AArch64::LD1Twov1d:
3646 case AArch64::LD1Threev1d:
3647 case AArch64::LD1Fourv1d:
3648 case AArch64::ST1Twov2d:
3649 case AArch64::ST1Threev2d:
3650 case AArch64::ST1Fourv2d:
3651 case AArch64::ST1Twov1d:
3652 case AArch64::ST1Threev1d:
3653 case AArch64::ST1Fourv1d:
3654 case AArch64::IRG:
3655 case AArch64::IRGstack:
3656 case AArch64::STGloop:
3657 case AArch64::STZGloop:
3658 return AArch64FrameOffsetCannotUpdate;
3659 }
3660
3661 // Get the min/max offset and the scale.
3662 TypeSize ScaleValue(0U, false);
3663 unsigned Width;
3664 int64_t MinOff, MaxOff;
3665 if (!AArch64InstrInfo::getMemOpInfo(MI.getOpcode(), ScaleValue, Width, MinOff,
3666 MaxOff))
3667 llvm_unreachable("unhandled opcode in isAArch64FrameOffsetLegal");
3668
3669 // Construct the complete offset.
3670 bool IsMulVL = ScaleValue.isScalable();
3671 unsigned Scale = ScaleValue.getKnownMinSize();
3672 int64_t Offset = IsMulVL ? SOffset.getScalableBytes() : SOffset.getBytes();
3673
3674 const MachineOperand &ImmOpnd =
3675 MI.getOperand(AArch64InstrInfo::getLoadStoreImmIdx(MI.getOpcode()));
3676 Offset += ImmOpnd.getImm() * Scale;
3677
3678 // If the offset doesn't match the scale, we rewrite the instruction to
3679 // use the unscaled instruction instead. Likewise, if we have a negative
3680 // offset and there is an unscaled op to use.
3681 Optional<unsigned> UnscaledOp =
3682 AArch64InstrInfo::getUnscaledLdSt(MI.getOpcode());
3683 bool useUnscaledOp = UnscaledOp && (Offset % Scale || Offset < 0);
3684 if (useUnscaledOp &&
3685 !AArch64InstrInfo::getMemOpInfo(*UnscaledOp, ScaleValue, Width, MinOff,
3686 MaxOff))
3687 llvm_unreachable("unhandled opcode in isAArch64FrameOffsetLegal");
3688
3689 Scale = ScaleValue.getKnownMinSize();
3690 assert(IsMulVL == ScaleValue.isScalable() &&
3691 "Unscaled opcode has different value for scalable");
3692
3693 int64_t Remainder = Offset % Scale;
3694 assert(!(Remainder && useUnscaledOp) &&
3695 "Cannot have remainder when using unscaled op");
3696
3697 assert(MinOff < MaxOff && "Unexpected Min/Max offsets");
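  // Worked example (opcode and values assumed for illustration): for a
  // 64-bit scaled load such as LDRXui, Scale is 8 and the immediate range is
  // [0, 4095]. A byte offset of 40000 gives NewOffset = 5000, which is
  // clamped to 4095; the residual 40000 - 4095 * 8 = 7240 bytes are left in
  // SOffset for the caller to materialize separately.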
3698 int64_t NewOffset = Offset / Scale;
3699 if (MinOff <= NewOffset && NewOffset <= MaxOff)
3700 Offset = Remainder;
3701 else {
3702 NewOffset = NewOffset < 0 ? MinOff : MaxOff;
3703 Offset = Offset - NewOffset * Scale + Remainder;
3704 }
3705
3706 if (EmittableOffset)
3707 *EmittableOffset = NewOffset;
3708 if (OutUseUnscaledOp)
3709 *OutUseUnscaledOp = useUnscaledOp;
3710 if (OutUnscaledOp && UnscaledOp)
3711 *OutUnscaledOp = *UnscaledOp;
3712
3713 if (IsMulVL)
3714 SOffset = StackOffset(Offset, MVT::nxv1i8) +
3715 StackOffset(SOffset.getBytes(), MVT::i8);
3716 else
3717 SOffset = StackOffset(Offset, MVT::i8) +
3718 StackOffset(SOffset.getScalableBytes(), MVT::nxv1i8);
3719 return AArch64FrameOffsetCanUpdate |
3720 (SOffset ? 0 : AArch64FrameOffsetIsLegal);
3721 }
3722
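// rewriteAArch64FrameIndex - Rewrite the frame-index operand of MI (at index
// FrameRegIdx) to use FrameReg plus as much of Offset as the instruction can
// encode. A sketch of the contract: returns true when the entire offset has
// been folded; any residual is left in Offset for the caller to materialize.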
3723 bool llvm::rewriteAArch64FrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
3724 unsigned FrameReg, StackOffset &Offset,
3725 const AArch64InstrInfo *TII) {
3726 unsigned Opcode = MI.getOpcode();
3727 unsigned ImmIdx = FrameRegIdx + 1;
3728
3729 if (Opcode == AArch64::ADDSXri || Opcode == AArch64::ADDXri) {
3730 Offset += StackOffset(MI.getOperand(ImmIdx).getImm(), MVT::i8);
3731 emitFrameOffset(*MI.getParent(), MI, MI.getDebugLoc(),
3732 MI.getOperand(0).getReg(), FrameReg, Offset, TII,
3733 MachineInstr::NoFlags, (Opcode == AArch64::ADDSXri));
3734 MI.eraseFromParent();
3735 Offset = StackOffset();
3736 return true;
3737 }
3738
3739 int64_t NewOffset;
3740 unsigned UnscaledOp;
3741 bool UseUnscaledOp;
3742 int Status = isAArch64FrameOffsetLegal(MI, Offset, &UseUnscaledOp,
3743 &UnscaledOp, &NewOffset);
3744 if (Status & AArch64FrameOffsetCanUpdate) {
3745 if (Status & AArch64FrameOffsetIsLegal)
3746 // Replace the FrameIndex with FrameReg.
3747 MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false);
3748 if (UseUnscaledOp)
3749 MI.setDesc(TII->get(UnscaledOp));
3750
3751 MI.getOperand(ImmIdx).ChangeToImmediate(NewOffset);
3752 return !Offset;
3753 }
3754
3755 return false;
3756 }
3757
3758 void AArch64InstrInfo::getNoop(MCInst &NopInst) const {
3759 NopInst.setOpcode(AArch64::HINT);
3760 NopInst.addOperand(MCOperand::createImm(0));
3761 }
3762
3763 // AArch64 supports MachineCombiner.
3764 bool AArch64InstrInfo::useMachineCombiner() const { return true; }
3765
3766 // True when Opc sets the condition flags (NZCV).
3767 static bool isCombineInstrSettingFlag(unsigned Opc) {
3768 switch (Opc) {
3769 case AArch64::ADDSWrr:
3770 case AArch64::ADDSWri:
3771 case AArch64::ADDSXrr:
3772 case AArch64::ADDSXri:
3773 case AArch64::SUBSWrr:
3774 case AArch64::SUBSXrr:
3775 // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi.
3776 case AArch64::SUBSWri:
3777 case AArch64::SUBSXri:
3778 return true;
3779 default:
3780 break;
3781 }
3782 return false;
3783 }
3784
3785 // 32b Opcodes that can be combined with a MUL
3786 static bool isCombineInstrCandidate32(unsigned Opc) {
3787 switch (Opc) {
3788 case AArch64::ADDWrr:
3789 case AArch64::ADDWri:
3790 case AArch64::SUBWrr:
3791 case AArch64::ADDSWrr:
3792 case AArch64::ADDSWri:
3793 case AArch64::SUBSWrr:
3794 // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi.
3795 case AArch64::SUBWri:
3796 case AArch64::SUBSWri:
3797 return true;
3798 default:
3799 break;
3800 }
3801 return false;
3802 }
3803
3804 // 64b Opcodes that can be combined with a MUL
3805 static bool isCombineInstrCandidate64(unsigned Opc) {
3806 switch (Opc) {
3807 case AArch64::ADDXrr:
3808 case AArch64::ADDXri:
3809 case AArch64::SUBXrr:
3810 case AArch64::ADDSXrr:
3811 case AArch64::ADDSXri:
3812 case AArch64::SUBSXrr:
3813 // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi.
3814 case AArch64::SUBXri:
3815 case AArch64::SUBSXri:
3816 case AArch64::ADDv8i8:
3817 case AArch64::ADDv16i8:
3818 case AArch64::ADDv4i16:
3819 case AArch64::ADDv8i16:
3820 case AArch64::ADDv2i32:
3821 case AArch64::ADDv4i32:
3822 case AArch64::SUBv8i8:
3823 case AArch64::SUBv16i8:
3824 case AArch64::SUBv4i16:
3825 case AArch64::SUBv8i16:
3826 case AArch64::SUBv2i32:
3827 case AArch64::SUBv4i32:
3828 return true;
3829 default:
3830 break;
3831 }
3832 return false;
3833 }
3834
3835 // FP Opcodes that can be combined with a FMUL
3836 static bool isCombineInstrCandidateFP(const MachineInstr &Inst) {
3837 switch (Inst.getOpcode()) {
3838 default:
3839 break;
3840 case AArch64::FADDHrr:
3841 case AArch64::FADDSrr:
3842 case AArch64::FADDDrr:
3843 case AArch64::FADDv4f16:
3844 case AArch64::FADDv8f16:
3845 case AArch64::FADDv2f32:
3846 case AArch64::FADDv2f64:
3847 case AArch64::FADDv4f32:
3848 case AArch64::FSUBHrr:
3849 case AArch64::FSUBSrr:
3850 case AArch64::FSUBDrr:
3851 case AArch64::FSUBv4f16:
3852 case AArch64::FSUBv8f16:
3853 case AArch64::FSUBv2f32:
3854 case AArch64::FSUBv2f64:
3855 case AArch64::FSUBv4f32:
3856 TargetOptions Options = Inst.getParent()->getParent()->getTarget().Options;
3857 return (Options.UnsafeFPMath ||
3858 Options.AllowFPOpFusion == FPOpFusion::Fast);
3859 }
3860 return false;
3861 }
3862
3863 // Opcodes that can be combined with a MUL
3864 static bool isCombineInstrCandidate(unsigned Opc) {
3865 return (isCombineInstrCandidate32(Opc) || isCombineInstrCandidate64(Opc));
3866 }
3867
3868 //
3869 // Utility routine that checks whether \p MO is defined by an instruction
3870 // with opcode \p CombineOpc in the basic block \p MBB.
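// When CheckZeroReg is set, the candidate must also have the zero register
// as its addend operand, i.e. it must be a plain MUL (an alias of
// MADD Wd, Wn, Wm, WZR / XZR) rather than a genuine multiply-add.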
3871 static bool canCombine(MachineBasicBlock &MBB, MachineOperand &MO,
3872 unsigned CombineOpc, unsigned ZeroReg = 0,
3873 bool CheckZeroReg = false) {
3874 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3875 MachineInstr *MI = nullptr;
3876
3877 if (MO.isReg() && Register::isVirtualRegister(MO.getReg()))
3878 MI = MRI.getUniqueVRegDef(MO.getReg());
3879 // And it needs to be in the trace (otherwise, it won't have a depth).
3880 if (!MI || MI->getParent() != &MBB || (unsigned)MI->getOpcode() != CombineOpc)
3881 return false;
3882 // It must only be used by the instruction we combine with.
3883 if (!MRI.hasOneNonDBGUse(MI->getOperand(0).getReg()))
3884 return false;
3885
3886 if (CheckZeroReg) {
3887 assert(MI->getNumOperands() >= 4 && MI->getOperand(0).isReg() &&
3888 MI->getOperand(1).isReg() && MI->getOperand(2).isReg() &&
3889 MI->getOperand(3).isReg() && "MAdd/MSub must have at least 4 regs");
3890 // The third input reg must be zero.
3891 if (MI->getOperand(3).getReg() != ZeroReg)
3892 return false;
3893 }
3894
3895 return true;
3896 }
3897
3898 //
3899 // Is \p MO defined by an integer multiply, and can it be combined?
3900 static bool canCombineWithMUL(MachineBasicBlock &MBB, MachineOperand &MO,
3901 unsigned MulOpc, unsigned ZeroReg) {
3902 return canCombine(MBB, MO, MulOpc, ZeroReg, true);
3903 }
3904
3905 //
3906 // Is \p MO defined by a floating-point multiply, and can it be combined?
3907 static bool canCombineWithFMUL(MachineBasicBlock &MBB, MachineOperand &MO,
3908 unsigned MulOpc) {
3909 return canCombine(MBB, MO, MulOpc);
3910 }
3911
3912 // TODO: There are many more machine instruction opcodes to match:
3913 // 1. Other data types (integer, vectors)
3914 // 2. Other math / logic operations (xor, or)
3915 // 3. Other forms of the same operation (intrinsics and other variants)
3916 bool AArch64InstrInfo::isAssociativeAndCommutative(
3917 const MachineInstr &Inst) const {
3918 switch (Inst.getOpcode()) {
3919 case AArch64::FADDDrr:
3920 case AArch64::FADDSrr:
3921 case AArch64::FADDv2f32:
3922 case AArch64::FADDv2f64:
3923 case AArch64::FADDv4f32:
3924 case AArch64::FMULDrr:
3925 case AArch64::FMULSrr:
3926 case AArch64::FMULX32:
3927 case AArch64::FMULX64:
3928 case AArch64::FMULXv2f32:
3929 case AArch64::FMULXv2f64:
3930 case AArch64::FMULXv4f32:
3931 case AArch64::FMULv2f32:
3932 case AArch64::FMULv2f64:
3933 case AArch64::FMULv4f32:
3934 return Inst.getParent()->getParent()->getTarget().Options.UnsafeFPMath;
3935 default:
3936 return false;
3937 }
3938 }
3939
3940 /// Find instructions that can be turned into madd.
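/// For example (registers chosen purely for illustration):
///   mul  w8, w0, w1
///   add  w0, w8, w2
/// can become
///   madd w0, w0, w1, w2
/// provided the intermediate multiply result has no other users.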
3941 static bool getMaddPatterns(MachineInstr &Root,
3942 SmallVectorImpl<MachineCombinerPattern> &Patterns) {
3943 unsigned Opc = Root.getOpcode();
3944 MachineBasicBlock &MBB = *Root.getParent();
3945 bool Found = false;
3946
3947 if (!isCombineInstrCandidate(Opc))
3948 return false;
3949 if (isCombineInstrSettingFlag(Opc)) {
3950 int Cmp_NZCV = Root.findRegisterDefOperandIdx(AArch64::NZCV, true);
3951 // When NZCV is live, bail out.
3952 if (Cmp_NZCV == -1)
3953 return false;
3954 unsigned NewOpc = convertToNonFlagSettingOpc(Root);
3955 // When the opcode can't change, bail out.
3956 // CHECKME: do we miss any cases for opcode conversion?
3957 if (NewOpc == Opc)
3958 return false;
3959 Opc = NewOpc;
3960 }
3961
3962 auto setFound = [&](int Opcode, int Operand, unsigned ZeroReg,
3963 MachineCombinerPattern Pattern) {
3964 if (canCombineWithMUL(MBB, Root.getOperand(Operand), Opcode, ZeroReg)) {
3965 Patterns.push_back(Pattern);
3966 Found = true;
3967 }
3968 };
3969
3970 auto setVFound = [&](int Opcode, int Operand, MachineCombinerPattern Pattern) {
3971 if (canCombine(MBB, Root.getOperand(Operand), Opcode)) {
3972 Patterns.push_back(Pattern);
3973 Found = true;
3974 }
3975 };
3976
3977 typedef MachineCombinerPattern MCP;
3978
3979 switch (Opc) {
3980 default:
3981 break;
3982 case AArch64::ADDWrr:
3983 assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() &&
3984 "ADDWrr does not have register operands");
3985 setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULADDW_OP1);
3986 setFound(AArch64::MADDWrrr, 2, AArch64::WZR, MCP::MULADDW_OP2);
3987 break;
3988 case AArch64::ADDXrr:
3989 setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULADDX_OP1);
3990 setFound(AArch64::MADDXrrr, 2, AArch64::XZR, MCP::MULADDX_OP2);
3991 break;
3992 case AArch64::SUBWrr:
3993 setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULSUBW_OP1);
3994 setFound(AArch64::MADDWrrr, 2, AArch64::WZR, MCP::MULSUBW_OP2);
3995 break;
3996 case AArch64::SUBXrr:
3997 setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULSUBX_OP1);
3998 setFound(AArch64::MADDXrrr, 2, AArch64::XZR, MCP::MULSUBX_OP2);
3999 break;
4000 case AArch64::ADDWri:
4001 setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULADDWI_OP1);
4002 break;
4003 case AArch64::ADDXri:
4004 setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULADDXI_OP1);
4005 break;
4006 case AArch64::SUBWri:
4007 setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULSUBWI_OP1);
4008 break;
4009 case AArch64::SUBXri:
4010 setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULSUBXI_OP1);
4011 break;
4012 case AArch64::ADDv8i8:
4013 setVFound(AArch64::MULv8i8, 1, MCP::MULADDv8i8_OP1);
4014 setVFound(AArch64::MULv8i8, 2, MCP::MULADDv8i8_OP2);
4015 break;
4016 case AArch64::ADDv16i8:
4017 setVFound(AArch64::MULv16i8, 1, MCP::MULADDv16i8_OP1);
4018 setVFound(AArch64::MULv16i8, 2, MCP::MULADDv16i8_OP2);
4019 break;
4020 case AArch64::ADDv4i16:
4021 setVFound(AArch64::MULv4i16, 1, MCP::MULADDv4i16_OP1);
4022 setVFound(AArch64::MULv4i16, 2, MCP::MULADDv4i16_OP2);
4023 setVFound(AArch64::MULv4i16_indexed, 1, MCP::MULADDv4i16_indexed_OP1);
4024 setVFound(AArch64::MULv4i16_indexed, 2, MCP::MULADDv4i16_indexed_OP2);
4025 break;
4026 case AArch64::ADDv8i16:
4027 setVFound(AArch64::MULv8i16, 1, MCP::MULADDv8i16_OP1);
4028 setVFound(AArch64::MULv8i16, 2, MCP::MULADDv8i16_OP2);
4029 setVFound(AArch64::MULv8i16_indexed, 1, MCP::MULADDv8i16_indexed_OP1);
4030 setVFound(AArch64::MULv8i16_indexed, 2, MCP::MULADDv8i16_indexed_OP2);
4031 break;
4032 case AArch64::ADDv2i32:
4033 setVFound(AArch64::MULv2i32, 1, MCP::MULADDv2i32_OP1);
4034 setVFound(AArch64::MULv2i32, 2, MCP::MULADDv2i32_OP2);
4035 setVFound(AArch64::MULv2i32_indexed, 1, MCP::MULADDv2i32_indexed_OP1);
4036 setVFound(AArch64::MULv2i32_indexed, 2, MCP::MULADDv2i32_indexed_OP2);
4037 break;
4038 case AArch64::ADDv4i32:
4039 setVFound(AArch64::MULv4i32, 1, MCP::MULADDv4i32_OP1);
4040 setVFound(AArch64::MULv4i32, 2, MCP::MULADDv4i32_OP2);
4041 setVFound(AArch64::MULv4i32_indexed, 1, MCP::MULADDv4i32_indexed_OP1);
4042 setVFound(AArch64::MULv4i32_indexed, 2, MCP::MULADDv4i32_indexed_OP2);
4043 break;
4044 case AArch64::SUBv8i8:
4045 setVFound(AArch64::MULv8i8, 1, MCP::MULSUBv8i8_OP1);
4046 setVFound(AArch64::MULv8i8, 2, MCP::MULSUBv8i8_OP2);
4047 break;
4048 case AArch64::SUBv16i8:
4049 setVFound(AArch64::MULv16i8, 1, MCP::MULSUBv16i8_OP1);
4050 setVFound(AArch64::MULv16i8, 2, MCP::MULSUBv16i8_OP2);
4051 break;
4052 case AArch64::SUBv4i16:
4053 setVFound(AArch64::MULv4i16, 1, MCP::MULSUBv4i16_OP1);
4054 setVFound(AArch64::MULv4i16, 2, MCP::MULSUBv4i16_OP2);
4055 setVFound(AArch64::MULv4i16_indexed, 1, MCP::MULSUBv4i16_indexed_OP1);
4056 setVFound(AArch64::MULv4i16_indexed, 2, MCP::MULSUBv4i16_indexed_OP2);
4057 break;
4058 case AArch64::SUBv8i16:
4059 setVFound(AArch64::MULv8i16, 1, MCP::MULSUBv8i16_OP1);
4060 setVFound(AArch64::MULv8i16, 2, MCP::MULSUBv8i16_OP2);
4061 setVFound(AArch64::MULv8i16_indexed, 1, MCP::MULSUBv8i16_indexed_OP1);
4062 setVFound(AArch64::MULv8i16_indexed, 2, MCP::MULSUBv8i16_indexed_OP2);
4063 break;
4064 case AArch64::SUBv2i32:
4065 setVFound(AArch64::MULv2i32, 1, MCP::MULSUBv2i32_OP1);
4066 setVFound(AArch64::MULv2i32, 2, MCP::MULSUBv2i32_OP2);
4067 setVFound(AArch64::MULv2i32_indexed, 1, MCP::MULSUBv2i32_indexed_OP1);
4068 setVFound(AArch64::MULv2i32_indexed, 2, MCP::MULSUBv2i32_indexed_OP2);
4069 break;
4070 case AArch64::SUBv4i32:
4071 setVFound(AArch64::MULv4i32, 1, MCP::MULSUBv4i32_OP1);
4072 setVFound(AArch64::MULv4i32, 2, MCP::MULSUBv4i32_OP2);
4073 setVFound(AArch64::MULv4i32_indexed, 1, MCP::MULSUBv4i32_indexed_OP1);
4074 setVFound(AArch64::MULv4i32_indexed, 2, MCP::MULSUBv4i32_indexed_OP2);
4075 break;
4076 }
4077 return Found;
4078 }
4079 /// Floating-Point Support
4080
4081 /// Find FP instructions that can be turned into a fused multiply-add.
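/// For example (vector registers chosen for illustration, and only when
/// unsafe FP math or fast FP-op fusion is enabled):
///   fmul v3.4s, v1.4s, v2.4s
///   fadd v0.4s, v0.4s, v3.4s
/// can become
///   fmla v0.4s, v1.4s, v2.4s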
4082 static bool getFMAPatterns(MachineInstr &Root,
4083 SmallVectorImpl<MachineCombinerPattern> &Patterns) {
4084
4085 if (!isCombineInstrCandidateFP(Root))
4086 return false;
4087
4088 MachineBasicBlock &MBB = *Root.getParent();
4089 bool Found = false;
4090
4091 auto Match = [&](int Opcode, int Operand,
4092 MachineCombinerPattern Pattern) -> bool {
4093 if (canCombineWithFMUL(MBB, Root.getOperand(Operand), Opcode)) {
4094 Patterns.push_back(Pattern);
4095 return true;
4096 }
4097 return false;
4098 };
4099
4100 typedef MachineCombinerPattern MCP;
4101
4102 switch (Root.getOpcode()) {
4103 default:
4104 assert(false && "Unsupported FP instruction in combiner\n");
4105 break;
4106 case AArch64::FADDHrr:
4107 assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() &&
4108 "FADDHrr does not have register operands");
4109
4110 Found = Match(AArch64::FMULHrr, 1, MCP::FMULADDH_OP1);
4111 Found |= Match(AArch64::FMULHrr, 2, MCP::FMULADDH_OP2);
4112 break;
4113 case AArch64::FADDSrr:
4114 assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() &&
4115 "FADDSrr does not have register operands");
4116
4117 Found |= Match(AArch64::FMULSrr, 1, MCP::FMULADDS_OP1) ||
4118 Match(AArch64::FMULv1i32_indexed, 1, MCP::FMLAv1i32_indexed_OP1);
4119
4120 Found |= Match(AArch64::FMULSrr, 2, MCP::FMULADDS_OP2) ||
4121 Match(AArch64::FMULv1i32_indexed, 2, MCP::FMLAv1i32_indexed_OP2);
4122 break;
4123 case AArch64::FADDDrr:
4124 Found |= Match(AArch64::FMULDrr, 1, MCP::FMULADDD_OP1) ||
4125 Match(AArch64::FMULv1i64_indexed, 1, MCP::FMLAv1i64_indexed_OP1);
4126
4127 Found |= Match(AArch64::FMULDrr, 2, MCP::FMULADDD_OP2) ||
4128 Match(AArch64::FMULv1i64_indexed, 2, MCP::FMLAv1i64_indexed_OP2);
4129 break;
4130 case AArch64::FADDv4f16:
4131 Found |= Match(AArch64::FMULv4i16_indexed, 1, MCP::FMLAv4i16_indexed_OP1) ||
4132 Match(AArch64::FMULv4f16, 1, MCP::FMLAv4f16_OP1);
4133
4134 Found |= Match(AArch64::FMULv4i16_indexed, 2, MCP::FMLAv4i16_indexed_OP2) ||
4135 Match(AArch64::FMULv4f16, 2, MCP::FMLAv4f16_OP2);
4136 break;
4137 case AArch64::FADDv8f16:
4138 Found |= Match(AArch64::FMULv8i16_indexed, 1, MCP::FMLAv8i16_indexed_OP1) ||
4139 Match(AArch64::FMULv8f16, 1, MCP::FMLAv8f16_OP1);
4140
4141 Found |= Match(AArch64::FMULv8i16_indexed, 2, MCP::FMLAv8i16_indexed_OP2) ||
4142 Match(AArch64::FMULv8f16, 2, MCP::FMLAv8f16_OP2);
4143 break;
4144 case AArch64::FADDv2f32:
4145 Found |= Match(AArch64::FMULv2i32_indexed, 1, MCP::FMLAv2i32_indexed_OP1) ||
4146 Match(AArch64::FMULv2f32, 1, MCP::FMLAv2f32_OP1);
4147
4148 Found |= Match(AArch64::FMULv2i32_indexed, 2, MCP::FMLAv2i32_indexed_OP2) ||
4149 Match(AArch64::FMULv2f32, 2, MCP::FMLAv2f32_OP2);
4150 break;
4151 case AArch64::FADDv2f64:
4152 Found |= Match(AArch64::FMULv2i64_indexed, 1, MCP::FMLAv2i64_indexed_OP1) ||
4153 Match(AArch64::FMULv2f64, 1, MCP::FMLAv2f64_OP1);
4154
4155 Found |= Match(AArch64::FMULv2i64_indexed, 2, MCP::FMLAv2i64_indexed_OP2) ||
4156 Match(AArch64::FMULv2f64, 2, MCP::FMLAv2f64_OP2);
4157 break;
4158 case AArch64::FADDv4f32:
4159 Found |= Match(AArch64::FMULv4i32_indexed, 1, MCP::FMLAv4i32_indexed_OP1) ||
4160 Match(AArch64::FMULv4f32, 1, MCP::FMLAv4f32_OP1);
4161
4162 Found |= Match(AArch64::FMULv4i32_indexed, 2, MCP::FMLAv4i32_indexed_OP2) ||
4163 Match(AArch64::FMULv4f32, 2, MCP::FMLAv4f32_OP2);
4164 break;
4165 case AArch64::FSUBHrr:
4166 Found = Match(AArch64::FMULHrr, 1, MCP::FMULSUBH_OP1);
4167 Found |= Match(AArch64::FMULHrr, 2, MCP::FMULSUBH_OP2);
4168 Found |= Match(AArch64::FNMULHrr, 1, MCP::FNMULSUBH_OP1);
4169 break;
4170 case AArch64::FSUBSrr:
4171 Found = Match(AArch64::FMULSrr, 1, MCP::FMULSUBS_OP1);
4172
4173 Found |= Match(AArch64::FMULSrr, 2, MCP::FMULSUBS_OP2) ||
4174 Match(AArch64::FMULv1i32_indexed, 2, MCP::FMLSv1i32_indexed_OP2);
4175
4176 Found |= Match(AArch64::FNMULSrr, 1, MCP::FNMULSUBS_OP1);
4177 break;
4178 case AArch64::FSUBDrr:
4179 Found = Match(AArch64::FMULDrr, 1, MCP::FMULSUBD_OP1);
4180
4181 Found |= Match(AArch64::FMULDrr, 2, MCP::FMULSUBD_OP2) ||
4182 Match(AArch64::FMULv1i64_indexed, 2, MCP::FMLSv1i64_indexed_OP2);
4183
4184 Found |= Match(AArch64::FNMULDrr, 1, MCP::FNMULSUBD_OP1);
4185 break;
4186 case AArch64::FSUBv4f16:
4187 Found |= Match(AArch64::FMULv4i16_indexed, 2, MCP::FMLSv4i16_indexed_OP2) ||
4188 Match(AArch64::FMULv4f16, 2, MCP::FMLSv4f16_OP2);
4189
4190 Found |= Match(AArch64::FMULv4i16_indexed, 1, MCP::FMLSv4i16_indexed_OP1) ||
4191 Match(AArch64::FMULv4f16, 1, MCP::FMLSv4f16_OP1);
4192 break;
4193 case AArch64::FSUBv8f16:
4194 Found |= Match(AArch64::FMULv8i16_indexed, 2, MCP::FMLSv8i16_indexed_OP2) ||
4195 Match(AArch64::FMULv8f16, 2, MCP::FMLSv8f16_OP2);
4196
4197 Found |= Match(AArch64::FMULv8i16_indexed, 1, MCP::FMLSv8i16_indexed_OP1) ||
4198 Match(AArch64::FMULv8f16, 1, MCP::FMLSv8f16_OP1);
4199 break;
4200 case AArch64::FSUBv2f32:
4201 Found |= Match(AArch64::FMULv2i32_indexed, 2, MCP::FMLSv2i32_indexed_OP2) ||
4202 Match(AArch64::FMULv2f32, 2, MCP::FMLSv2f32_OP2);
4203
4204 Found |= Match(AArch64::FMULv2i32_indexed, 1, MCP::FMLSv2i32_indexed_OP1) ||
4205 Match(AArch64::FMULv2f32, 1, MCP::FMLSv2f32_OP1);
4206 break;
4207 case AArch64::FSUBv2f64:
4208 Found |= Match(AArch64::FMULv2i64_indexed, 2, MCP::FMLSv2i64_indexed_OP2) ||
4209 Match(AArch64::FMULv2f64, 2, MCP::FMLSv2f64_OP2);
4210
4211 Found |= Match(AArch64::FMULv2i64_indexed, 1, MCP::FMLSv2i64_indexed_OP1) ||
4212 Match(AArch64::FMULv2f64, 1, MCP::FMLSv2f64_OP1);
4213 break;
4214 case AArch64::FSUBv4f32:
4215 Found |= Match(AArch64::FMULv4i32_indexed, 2, MCP::FMLSv4i32_indexed_OP2) ||
4216 Match(AArch64::FMULv4f32, 2, MCP::FMLSv4f32_OP2);
4217
4218 Found |= Match(AArch64::FMULv4i32_indexed, 1, MCP::FMLSv4i32_indexed_OP1) ||
4219 Match(AArch64::FMULv4f32, 1, MCP::FMLSv4f32_OP1);
4220 break;
4221 }
4222 return Found;
4223 }
4224
4225 /// Return true when a code sequence can improve throughput. It
4226 /// should be called only for instructions in loops.
4227 /// \param Pattern - combiner pattern
4228 bool AArch64InstrInfo::isThroughputPattern(
4229 MachineCombinerPattern Pattern) const {
4230 switch (Pattern) {
4231 default:
4232 break;
4233 case MachineCombinerPattern::FMULADDH_OP1:
4234 case MachineCombinerPattern::FMULADDH_OP2:
4235 case MachineCombinerPattern::FMULSUBH_OP1:
4236 case MachineCombinerPattern::FMULSUBH_OP2:
4237 case MachineCombinerPattern::FMULADDS_OP1:
4238 case MachineCombinerPattern::FMULADDS_OP2:
4239 case MachineCombinerPattern::FMULSUBS_OP1:
4240 case MachineCombinerPattern::FMULSUBS_OP2:
4241 case MachineCombinerPattern::FMULADDD_OP1:
4242 case MachineCombinerPattern::FMULADDD_OP2:
4243 case MachineCombinerPattern::FMULSUBD_OP1:
4244 case MachineCombinerPattern::FMULSUBD_OP2:
4245 case MachineCombinerPattern::FNMULSUBH_OP1:
4246 case MachineCombinerPattern::FNMULSUBS_OP1:
4247 case MachineCombinerPattern::FNMULSUBD_OP1:
4248 case MachineCombinerPattern::FMLAv4i16_indexed_OP1:
4249 case MachineCombinerPattern::FMLAv4i16_indexed_OP2:
4250 case MachineCombinerPattern::FMLAv8i16_indexed_OP1:
4251 case MachineCombinerPattern::FMLAv8i16_indexed_OP2:
4252 case MachineCombinerPattern::FMLAv1i32_indexed_OP1:
4253 case MachineCombinerPattern::FMLAv1i32_indexed_OP2:
4254 case MachineCombinerPattern::FMLAv1i64_indexed_OP1:
4255 case MachineCombinerPattern::FMLAv1i64_indexed_OP2:
4256 case MachineCombinerPattern::FMLAv4f16_OP2:
4257 case MachineCombinerPattern::FMLAv4f16_OP1:
4258 case MachineCombinerPattern::FMLAv8f16_OP1:
4259 case MachineCombinerPattern::FMLAv8f16_OP2:
4260 case MachineCombinerPattern::FMLAv2f32_OP2:
4261 case MachineCombinerPattern::FMLAv2f32_OP1:
4262 case MachineCombinerPattern::FMLAv2f64_OP1:
4263 case MachineCombinerPattern::FMLAv2f64_OP2:
4264 case MachineCombinerPattern::FMLAv2i32_indexed_OP1:
4265 case MachineCombinerPattern::FMLAv2i32_indexed_OP2:
4266 case MachineCombinerPattern::FMLAv2i64_indexed_OP1:
4267 case MachineCombinerPattern::FMLAv2i64_indexed_OP2:
4268 case MachineCombinerPattern::FMLAv4f32_OP1:
4269 case MachineCombinerPattern::FMLAv4f32_OP2:
4270 case MachineCombinerPattern::FMLAv4i32_indexed_OP1:
4271 case MachineCombinerPattern::FMLAv4i32_indexed_OP2:
4272 case MachineCombinerPattern::FMLSv4i16_indexed_OP1:
4273 case MachineCombinerPattern::FMLSv4i16_indexed_OP2:
4274 case MachineCombinerPattern::FMLSv8i16_indexed_OP1:
4275 case MachineCombinerPattern::FMLSv8i16_indexed_OP2:
4276 case MachineCombinerPattern::FMLSv1i32_indexed_OP2:
4277 case MachineCombinerPattern::FMLSv1i64_indexed_OP2:
4278 case MachineCombinerPattern::FMLSv2i32_indexed_OP2:
4279 case MachineCombinerPattern::FMLSv2i64_indexed_OP2:
4280 case MachineCombinerPattern::FMLSv4f16_OP1:
4281 case MachineCombinerPattern::FMLSv4f16_OP2:
4282 case MachineCombinerPattern::FMLSv8f16_OP1:
4283 case MachineCombinerPattern::FMLSv8f16_OP2:
4284 case MachineCombinerPattern::FMLSv2f32_OP2:
4285 case MachineCombinerPattern::FMLSv2f64_OP2:
4286 case MachineCombinerPattern::FMLSv4i32_indexed_OP2:
4287 case MachineCombinerPattern::FMLSv4f32_OP2:
4288 case MachineCombinerPattern::MULADDv8i8_OP1:
4289 case MachineCombinerPattern::MULADDv8i8_OP2:
4290 case MachineCombinerPattern::MULADDv16i8_OP1:
4291 case MachineCombinerPattern::MULADDv16i8_OP2:
4292 case MachineCombinerPattern::MULADDv4i16_OP1:
4293 case MachineCombinerPattern::MULADDv4i16_OP2:
4294 case MachineCombinerPattern::MULADDv8i16_OP1:
4295 case MachineCombinerPattern::MULADDv8i16_OP2:
4296 case MachineCombinerPattern::MULADDv2i32_OP1:
4297 case MachineCombinerPattern::MULADDv2i32_OP2:
4298 case MachineCombinerPattern::MULADDv4i32_OP1:
4299 case MachineCombinerPattern::MULADDv4i32_OP2:
4300 case MachineCombinerPattern::MULSUBv8i8_OP1:
4301 case MachineCombinerPattern::MULSUBv8i8_OP2:
4302 case MachineCombinerPattern::MULSUBv16i8_OP1:
4303 case MachineCombinerPattern::MULSUBv16i8_OP2:
4304 case MachineCombinerPattern::MULSUBv4i16_OP1:
4305 case MachineCombinerPattern::MULSUBv4i16_OP2:
4306 case MachineCombinerPattern::MULSUBv8i16_OP1:
4307 case MachineCombinerPattern::MULSUBv8i16_OP2:
4308 case MachineCombinerPattern::MULSUBv2i32_OP1:
4309 case MachineCombinerPattern::MULSUBv2i32_OP2:
4310 case MachineCombinerPattern::MULSUBv4i32_OP1:
4311 case MachineCombinerPattern::MULSUBv4i32_OP2:
4312 case MachineCombinerPattern::MULADDv4i16_indexed_OP1:
4313 case MachineCombinerPattern::MULADDv4i16_indexed_OP2:
4314 case MachineCombinerPattern::MULADDv8i16_indexed_OP1:
4315 case MachineCombinerPattern::MULADDv8i16_indexed_OP2:
4316 case MachineCombinerPattern::MULADDv2i32_indexed_OP1:
4317 case MachineCombinerPattern::MULADDv2i32_indexed_OP2:
4318 case MachineCombinerPattern::MULADDv4i32_indexed_OP1:
4319 case MachineCombinerPattern::MULADDv4i32_indexed_OP2:
4320 case MachineCombinerPattern::MULSUBv4i16_indexed_OP1:
4321 case MachineCombinerPattern::MULSUBv4i16_indexed_OP2:
4322 case MachineCombinerPattern::MULSUBv8i16_indexed_OP1:
4323 case MachineCombinerPattern::MULSUBv8i16_indexed_OP2:
4324 case MachineCombinerPattern::MULSUBv2i32_indexed_OP1:
4325 case MachineCombinerPattern::MULSUBv2i32_indexed_OP2:
4326 case MachineCombinerPattern::MULSUBv4i32_indexed_OP1:
4327 case MachineCombinerPattern::MULSUBv4i32_indexed_OP2:
4328 return true;
4329 } // end switch (Pattern)
4330 return false;
4331 }
4332 /// Return true when there is potentially a faster code sequence for an
4333 /// instruction chain ending in \p Root. All potential patterns are listed in
4334 /// the \p Patterns vector. Patterns should be sorted in priority order since the
4335 /// pattern evaluator stops checking as soon as it finds a faster sequence.
4336
4337 bool AArch64InstrInfo::getMachineCombinerPatterns(
4338 MachineInstr &Root,
4339 SmallVectorImpl<MachineCombinerPattern> &Patterns) const {
4340 // Integer patterns
4341 if (getMaddPatterns(Root, Patterns))
4342 return true;
4343 // Floating point patterns
4344 if (getFMAPatterns(Root, Patterns))
4345 return true;
4346
4347 return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns);
4348 }
4349
4350 enum class FMAInstKind { Default, Indexed, Accumulator };
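// How the three kinds map onto operand order when building the fused
// instruction (see genFusedMultiply below): Default places the two multiply
// operands first and the addend last (MADD/FMADD style); Accumulator places
// the addend first as the tied accumulator (FMLA/FMLS vector style); Indexed
// does the same as Accumulator but also copies the lane immediate from the
// original multiply.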
4351 /// genFusedMultiply - Generate fused multiply instructions.
4352 /// This function supports both integer and floating point instructions.
4353 /// A typical example:
4354 /// F|MUL I=A,B,0
4355 /// F|ADD R,I,C
4356 /// ==> F|MADD R,A,B,C
4357 /// \param MF Containing MachineFunction
4358 /// \param MRI Register information
4359 /// \param TII Target information
4360 /// \param Root is the F|ADD instruction
4361 /// \param [out] InsInstrs is a vector of machine instructions and will
4362 /// contain the generated madd instruction
4363 /// \param IdxMulOpd is index of operand in Root that is the result of
4364 /// the F|MUL. In the example above IdxMulOpd is 1.
4365 /// \param MaddOpc the opcode of the f|madd instruction
4366 /// \param RC Register class of operands
4367 /// \param kind the kind of FMA instruction (addressing mode) to be generated
4368 /// \param ReplacedAddend is the result register from the instruction
4369 /// replacing the non-combined operand, if any.
4370 static MachineInstr *
4371 genFusedMultiply(MachineFunction &MF, MachineRegisterInfo &MRI,
4372 const TargetInstrInfo *TII, MachineInstr &Root,
4373 SmallVectorImpl<MachineInstr *> &InsInstrs, unsigned IdxMulOpd,
4374 unsigned MaddOpc, const TargetRegisterClass *RC,
4375 FMAInstKind kind = FMAInstKind::Default,
4376 const Register *ReplacedAddend = nullptr) {
4377 assert(IdxMulOpd == 1 || IdxMulOpd == 2);
4378
4379 unsigned IdxOtherOpd = IdxMulOpd == 1 ? 2 : 1;
4380 MachineInstr *MUL = MRI.getUniqueVRegDef(Root.getOperand(IdxMulOpd).getReg());
4381 Register ResultReg = Root.getOperand(0).getReg();
4382 Register SrcReg0 = MUL->getOperand(1).getReg();
4383 bool Src0IsKill = MUL->getOperand(1).isKill();
4384 Register SrcReg1 = MUL->getOperand(2).getReg();
4385 bool Src1IsKill = MUL->getOperand(2).isKill();
4386
4387 unsigned SrcReg2;
4388 bool Src2IsKill;
4389 if (ReplacedAddend) {
4390 // If we just generated a new addend, we must be its only use.
4391 SrcReg2 = *ReplacedAddend;
4392 Src2IsKill = true;
4393 } else {
4394 SrcReg2 = Root.getOperand(IdxOtherOpd).getReg();
4395 Src2IsKill = Root.getOperand(IdxOtherOpd).isKill();
4396 }
4397
4398 if (Register::isVirtualRegister(ResultReg))
4399 MRI.constrainRegClass(ResultReg, RC);
4400 if (Register::isVirtualRegister(SrcReg0))
4401 MRI.constrainRegClass(SrcReg0, RC);
4402 if (Register::isVirtualRegister(SrcReg1))
4403 MRI.constrainRegClass(SrcReg1, RC);
4404 if (Register::isVirtualRegister(SrcReg2))
4405 MRI.constrainRegClass(SrcReg2, RC);
4406
4407 MachineInstrBuilder MIB;
4408 if (kind == FMAInstKind::Default)
4409 MIB = BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc), ResultReg)
4410 .addReg(SrcReg0, getKillRegState(Src0IsKill))
4411 .addReg(SrcReg1, getKillRegState(Src1IsKill))
4412 .addReg(SrcReg2, getKillRegState(Src2IsKill));
4413 else if (kind == FMAInstKind::Indexed)
4414 MIB = BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc), ResultReg)
4415 .addReg(SrcReg2, getKillRegState(Src2IsKill))
4416 .addReg(SrcReg0, getKillRegState(Src0IsKill))
4417 .addReg(SrcReg1, getKillRegState(Src1IsKill))
4418 .addImm(MUL->getOperand(3).getImm());
4419 else if (kind == FMAInstKind::Accumulator)
4420 MIB = BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc), ResultReg)
4421 .addReg(SrcReg2, getKillRegState(Src2IsKill))
4422 .addReg(SrcReg0, getKillRegState(Src0IsKill))
4423 .addReg(SrcReg1, getKillRegState(Src1IsKill));
4424 else
4425 assert(false && "Invalid FMA instruction kind \n");
4426 // Insert the MADD (MADD, FMA, FMS, FMLA, FMLS)
4427 InsInstrs.push_back(MIB);
4428 return MUL;
4429 }
4430
4431 /// genFusedMultiplyAcc - Helper to generate fused multiply accumulate
4432 /// instructions.
4433 ///
4434 /// \see genFusedMultiply
4435 static MachineInstr *genFusedMultiplyAcc(
4436 MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII,
4437 MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs,
4438 unsigned IdxMulOpd, unsigned MaddOpc, const TargetRegisterClass *RC) {
4439 return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
4440 FMAInstKind::Accumulator);
4441 }
4442
4443 /// genNeg - Helper to generate an intermediate negation of the second operand
4444 /// of Root
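/// The new virtual register is recorded in InstrIdxForVirtReg so the
/// MachineCombiner can attribute the inserted negation to the correct
/// position in the new instruction sequence (a note on intent, not
/// spelled out in the original comment).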
4445 static Register genNeg(MachineFunction &MF, MachineRegisterInfo &MRI,
4446 const TargetInstrInfo *TII, MachineInstr &Root,
4447 SmallVectorImpl<MachineInstr *> &InsInstrs,
4448 DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
4449 unsigned MnegOpc, const TargetRegisterClass *RC) {
4450 Register NewVR = MRI.createVirtualRegister(RC);
4451 MachineInstrBuilder MIB =
4452 BuildMI(MF, Root.getDebugLoc(), TII->get(MnegOpc), NewVR)
4453 .add(Root.getOperand(2));
4454 InsInstrs.push_back(MIB);
4455
4456 assert(InstrIdxForVirtReg.empty());
4457 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
4458
4459 return NewVR;
4460 }
4461
4462 /// genFusedMultiplyAccNeg - Helper to generate fused multiply accumulate
4463 /// instructions with an additional negation of the accumulator
4464 static MachineInstr *genFusedMultiplyAccNeg(
4465 MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII,
4466 MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs,
4467 DenseMap<unsigned, unsigned> &InstrIdxForVirtReg, unsigned IdxMulOpd,
4468 unsigned MaddOpc, unsigned MnegOpc, const TargetRegisterClass *RC) {
4469 assert(IdxMulOpd == 1);
4470
4471 Register NewVR =
4472 genNeg(MF, MRI, TII, Root, InsInstrs, InstrIdxForVirtReg, MnegOpc, RC);
4473 return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
4474 FMAInstKind::Accumulator, &NewVR);
4475 }
4476
4477 /// genFusedMultiplyIdx - Helper to generate fused multiply accumulate
4478 /// instructions.
4479 ///
4480 /// \see genFusedMultiply
4481 static MachineInstr *genFusedMultiplyIdx(
4482 MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII,
4483 MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs,
4484 unsigned IdxMulOpd, unsigned MaddOpc, const TargetRegisterClass *RC) {
4485 return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
4486 FMAInstKind::Indexed);
4487 }
4488
4489 /// genFusedMultiplyIdxNeg - Helper to generate fused multiply indexed
4490 /// instructions with an additional negation of the accumulator
4491 static MachineInstr *genFusedMultiplyIdxNeg(
4492 MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII,
4493 MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs,
4494 DenseMap<unsigned, unsigned> &InstrIdxForVirtReg, unsigned IdxMulOpd,
4495 unsigned MaddOpc, unsigned MnegOpc, const TargetRegisterClass *RC) {
4496 assert(IdxMulOpd == 1);
4497
4498 Register NewVR =
4499 genNeg(MF, MRI, TII, Root, InsInstrs, InstrIdxForVirtReg, MnegOpc, RC);
4500
4501 return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
4502 FMAInstKind::Indexed, &NewVR);
4503 }
4504
4505 /// genMaddR - Generate madd instruction and combine mul and add using
4506 /// an extra virtual register
4507 /// Example - an ADD intermediate needs to be stored in a register:
4508 /// MUL I=A,B,0
4509 /// ADD R,I,Imm
4510 /// ==> ORR V, ZR, Imm
4511 /// ==> MADD R,A,B,V
4512 /// \param MF Containing MachineFunction
4513 /// \param MRI Register information
4514 /// \param TII Target information
4515 /// \param Root is the ADD instruction
4516 /// \param [out] InsInstrs is a vector of machine instructions and will
4517 /// contain the generated madd instruction
4518 /// \param IdxMulOpd is index of operand in Root that is the result of
4519 /// the MUL. In the example above IdxMulOpd is 1.
4520 /// \param MaddOpc the opcode of the madd instruction
4521 /// \param VR is a virtual register that holds the value of an ADD operand
4522 /// (V in the example above).
4523 /// \param RC Register class of operands
4524 static MachineInstr *genMaddR(MachineFunction &MF, MachineRegisterInfo &MRI,
4525 const TargetInstrInfo *TII, MachineInstr &Root,
4526 SmallVectorImpl<MachineInstr *> &InsInstrs,
4527 unsigned IdxMulOpd, unsigned MaddOpc, unsigned VR,
4528 const TargetRegisterClass *RC) {
4529 assert(IdxMulOpd == 1 || IdxMulOpd == 2);
4530
4531 MachineInstr *MUL = MRI.getUniqueVRegDef(Root.getOperand(IdxMulOpd).getReg());
4532 Register ResultReg = Root.getOperand(0).getReg();
4533 Register SrcReg0 = MUL->getOperand(1).getReg();
4534 bool Src0IsKill = MUL->getOperand(1).isKill();
4535 Register SrcReg1 = MUL->getOperand(2).getReg();
4536 bool Src1IsKill = MUL->getOperand(2).isKill();
4537
4538 if (Register::isVirtualRegister(ResultReg))
4539 MRI.constrainRegClass(ResultReg, RC);
4540 if (Register::isVirtualRegister(SrcReg0))
4541 MRI.constrainRegClass(SrcReg0, RC);
4542 if (Register::isVirtualRegister(SrcReg1))
4543 MRI.constrainRegClass(SrcReg1, RC);
4544 if (Register::isVirtualRegister(VR))
4545 MRI.constrainRegClass(VR, RC);
4546
4547 MachineInstrBuilder MIB =
4548 BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc), ResultReg)
4549 .addReg(SrcReg0, getKillRegState(Src0IsKill))
4550 .addReg(SrcReg1, getKillRegState(Src1IsKill))
4551 .addReg(VR);
4552 // Insert the MADD
4553 InsInstrs.push_back(MIB);
4554 return MUL;
4555 }
4556
4557 /// When getMachineCombinerPatterns() finds potential patterns,
4558 /// this function generates the instructions that could replace the
4559 /// original code sequence.
4560 void AArch64InstrInfo::genAlternativeCodeSequence(
4561 MachineInstr &Root, MachineCombinerPattern Pattern,
4562 SmallVectorImpl<MachineInstr *> &InsInstrs,
4563 SmallVectorImpl<MachineInstr *> &DelInstrs,
4564 DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) const {
4565 MachineBasicBlock &MBB = *Root.getParent();
4566 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
4567 MachineFunction &MF = *MBB.getParent();
4568 const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
4569
4570 MachineInstr *MUL;
4571 const TargetRegisterClass *RC;
4572 unsigned Opc;
4573 switch (Pattern) {
4574 default:
4575 // Reassociate instructions.
4576 TargetInstrInfo::genAlternativeCodeSequence(Root, Pattern, InsInstrs,
4577 DelInstrs, InstrIdxForVirtReg);
4578 return;
4579 case MachineCombinerPattern::MULADDW_OP1:
4580 case MachineCombinerPattern::MULADDX_OP1:
4581 // MUL I=A,B,0
4582 // ADD R,I,C
4583 // ==> MADD R,A,B,C
4584 // --- Create(MADD);
4585 if (Pattern == MachineCombinerPattern::MULADDW_OP1) {
4586 Opc = AArch64::MADDWrrr;
4587 RC = &AArch64::GPR32RegClass;
4588 } else {
4589 Opc = AArch64::MADDXrrr;
4590 RC = &AArch64::GPR64RegClass;
4591 }
4592 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
4593 break;
4594 case MachineCombinerPattern::MULADDW_OP2:
4595 case MachineCombinerPattern::MULADDX_OP2:
4596 // MUL I=A,B,0
4597 // ADD R,C,I
4598 // ==> MADD R,A,B,C
4599 // --- Create(MADD);
4600 if (Pattern == MachineCombinerPattern::MULADDW_OP2) {
4601 Opc = AArch64::MADDWrrr;
4602 RC = &AArch64::GPR32RegClass;
4603 } else {
4604 Opc = AArch64::MADDXrrr;
4605 RC = &AArch64::GPR64RegClass;
4606 }
4607 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
4608 break;
4609 case MachineCombinerPattern::MULADDWI_OP1:
4610 case MachineCombinerPattern::MULADDXI_OP1: {
4611 // MUL I=A,B,0
4612 // ADD R,I,Imm
4613 // ==> ORR V, ZR, Imm
4614 // ==> MADD R,A,B,V
4615 // --- Create(MADD);
4616 const TargetRegisterClass *OrrRC;
4617 unsigned BitSize, OrrOpc, ZeroReg;
4618 if (Pattern == MachineCombinerPattern::MULADDWI_OP1) {
4619 OrrOpc = AArch64::ORRWri;
4620 OrrRC = &AArch64::GPR32spRegClass;
4621 BitSize = 32;
4622 ZeroReg = AArch64::WZR;
4623 Opc = AArch64::MADDWrrr;
4624 RC = &AArch64::GPR32RegClass;
4625 } else {
4626 OrrOpc = AArch64::ORRXri;
4627 OrrRC = &AArch64::GPR64spRegClass;
4628 BitSize = 64;
4629 ZeroReg = AArch64::XZR;
4630 Opc = AArch64::MADDXrrr;
4631 RC = &AArch64::GPR64RegClass;
4632 }
4633 Register NewVR = MRI.createVirtualRegister(OrrRC);
4634 uint64_t Imm = Root.getOperand(2).getImm();
4635
4636 if (Root.getOperand(3).isImm()) {
4637 unsigned Val = Root.getOperand(3).getImm();
4638 Imm = Imm << Val;
4639 }
4640 uint64_t UImm = SignExtend64(Imm, BitSize);
4641 uint64_t Encoding;
4642 if (AArch64_AM::processLogicalImmediate(UImm, BitSize, Encoding)) {
4643 MachineInstrBuilder MIB1 =
4644 BuildMI(MF, Root.getDebugLoc(), TII->get(OrrOpc), NewVR)
4645 .addReg(ZeroReg)
4646 .addImm(Encoding);
4647 InsInstrs.push_back(MIB1);
4648 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
4649 MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC);
4650 }
4651 break;
4652 }
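// Illustrative sketch for the immediate case above: MADD needs the addend in
// a register, so for
//   mul w8, w0, w1
//   add w2, w8, #0xff0, lsl #12
// the folded immediate is 0xff0000. Only if processLogicalImmediate() can
// encode that value do we materialise it with an ORR from WZR and emit the
// MADD; otherwise nothing is added to InsInstrs and the pattern is skipped.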
4653 case MachineCombinerPattern::MULSUBW_OP1:
4654 case MachineCombinerPattern::MULSUBX_OP1: {
4655 // MUL I=A,B,0
4656 // SUB R,I, C
4657 // ==> SUB V, 0, C
4658 // ==> MADD R,A,B,V // = -C + A*B
4659 // --- Create(MADD);
4660 const TargetRegisterClass *SubRC;
4661 unsigned SubOpc, ZeroReg;
4662 if (Pattern == MachineCombinerPattern::MULSUBW_OP1) {
4663 SubOpc = AArch64::SUBWrr;
4664 SubRC = &AArch64::GPR32spRegClass;
4665 ZeroReg = AArch64::WZR;
4666 Opc = AArch64::MADDWrrr;
4667 RC = &AArch64::GPR32RegClass;
4668 } else {
4669 SubOpc = AArch64::SUBXrr;
4670 SubRC = &AArch64::GPR64spRegClass;
4671 ZeroReg = AArch64::XZR;
4672 Opc = AArch64::MADDXrrr;
4673 RC = &AArch64::GPR64RegClass;
4674 }
4675 Register NewVR = MRI.createVirtualRegister(SubRC);
4676 // SUB NewVR, 0, C
4677 MachineInstrBuilder MIB1 =
4678 BuildMI(MF, Root.getDebugLoc(), TII->get(SubOpc), NewVR)
4679 .addReg(ZeroReg)
4680 .add(Root.getOperand(2));
4681 InsInstrs.push_back(MIB1);
4682 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
4683 MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC);
4684 break;
4685 }
4686 case MachineCombinerPattern::MULSUBW_OP2:
4687 case MachineCombinerPattern::MULSUBX_OP2:
4688 // MUL I=A,B,0
4689 // SUB R,C,I
4690 // ==> MSUB R,A,B,C (computes C - A*B)
4691 // --- Create(MSUB);
4692 if (Pattern == MachineCombinerPattern::MULSUBW_OP2) {
4693 Opc = AArch64::MSUBWrrr;
4694 RC = &AArch64::GPR32RegClass;
4695 } else {
4696 Opc = AArch64::MSUBXrrr;
4697 RC = &AArch64::GPR64RegClass;
4698 }
4699 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
4700 break;
4701 case MachineCombinerPattern::MULSUBWI_OP1:
4702 case MachineCombinerPattern::MULSUBXI_OP1: {
4703 // MUL I=A,B,0
4704 // SUB R,I, Imm
4705 // ==> ORR V, ZR, -Imm
4706 // ==> MADD R,A,B,V // = -Imm + A*B
4707 // --- Create(MADD);
4708 const TargetRegisterClass *OrrRC;
4709 unsigned BitSize, OrrOpc, ZeroReg;
4710 if (Pattern == MachineCombinerPattern::MULSUBWI_OP1) {
4711 OrrOpc = AArch64::ORRWri;
4712 OrrRC = &AArch64::GPR32spRegClass;
4713 BitSize = 32;
4714 ZeroReg = AArch64::WZR;
4715 Opc = AArch64::MADDWrrr;
4716 RC = &AArch64::GPR32RegClass;
4717 } else {
4718 OrrOpc = AArch64::ORRXri;
4719 OrrRC = &AArch64::GPR64spRegClass;
4720 BitSize = 64;
4721 ZeroReg = AArch64::XZR;
4722 Opc = AArch64::MADDXrrr;
4723 RC = &AArch64::GPR64RegClass;
4724 }
4725 Register NewVR = MRI.createVirtualRegister(OrrRC);
4726 uint64_t Imm = Root.getOperand(2).getImm();
4727 if (Root.getOperand(3).isImm()) {
4728 unsigned Val = Root.getOperand(3).getImm();
4729 Imm = Imm << Val;
4730 }
4731 uint64_t UImm = SignExtend64(-Imm, BitSize);
4732 uint64_t Encoding;
4733 if (AArch64_AM::processLogicalImmediate(UImm, BitSize, Encoding)) {
4734 MachineInstrBuilder MIB1 =
4735 BuildMI(MF, Root.getDebugLoc(), TII->get(OrrOpc), NewVR)
4736 .addReg(ZeroReg)
4737 .addImm(Encoding);
4738 InsInstrs.push_back(MIB1);
4739 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
4740 MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC);
4741 }
4742 break;
4743 }
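// Illustrative sketch for the case above: for "mul x8, x0, x1" followed by
// "sub x2, x8, #2", the negated constant -2 (0xFFFFFFFFFFFFFFFE) is
// materialised with an ORR from XZR and fed to the MADD, computing
// R = A*B + (-2). As with the MULADD-immediate case, this only fires when
// the negated value is itself encodable as a logical immediate.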
4744
4745 case MachineCombinerPattern::MULADDv8i8_OP1:
4746 Opc = AArch64::MLAv8i8;
4747 RC = &AArch64::FPR64RegClass;
4748 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
4749 break;
4750 case MachineCombinerPattern::MULADDv8i8_OP2:
4751 Opc = AArch64::MLAv8i8;
4752 RC = &AArch64::FPR64RegClass;
4753 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
4754 break;
4755 case MachineCombinerPattern::MULADDv16i8_OP1:
4756 Opc = AArch64::MLAv16i8;
4757 RC = &AArch64::FPR128RegClass;
4758 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
4759 break;
4760 case MachineCombinerPattern::MULADDv16i8_OP2:
4761 Opc = AArch64::MLAv16i8;
4762 RC = &AArch64::FPR128RegClass;
4763 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
4764 break;
4765 case MachineCombinerPattern::MULADDv4i16_OP1:
4766 Opc = AArch64::MLAv4i16;
4767 RC = &AArch64::FPR64RegClass;
4768 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
4769 break;
4770 case MachineCombinerPattern::MULADDv4i16_OP2:
4771 Opc = AArch64::MLAv4i16;
4772 RC = &AArch64::FPR64RegClass;
4773 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
4774 break;
4775 case MachineCombinerPattern::MULADDv8i16_OP1:
4776 Opc = AArch64::MLAv8i16;
4777 RC = &AArch64::FPR128RegClass;
4778 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
4779 break;
4780 case MachineCombinerPattern::MULADDv8i16_OP2:
4781 Opc = AArch64::MLAv8i16;
4782 RC = &AArch64::FPR128RegClass;
4783 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
4784 break;
4785 case MachineCombinerPattern::MULADDv2i32_OP1:
4786 Opc = AArch64::MLAv2i32;
4787 RC = &AArch64::FPR64RegClass;
4788 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
4789 break;
4790 case MachineCombinerPattern::MULADDv2i32_OP2:
4791 Opc = AArch64::MLAv2i32;
4792 RC = &AArch64::FPR64RegClass;
4793 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
4794 break;
4795 case MachineCombinerPattern::MULADDv4i32_OP1:
4796 Opc = AArch64::MLAv4i32;
4797 RC = &AArch64::FPR128RegClass;
4798 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
4799 break;
4800 case MachineCombinerPattern::MULADDv4i32_OP2:
4801 Opc = AArch64::MLAv4i32;
4802 RC = &AArch64::FPR128RegClass;
4803 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
4804 break;
4805
4806 case MachineCombinerPattern::MULSUBv8i8_OP1:
4807 Opc = AArch64::MLAv8i8;
4808 RC = &AArch64::FPR64RegClass;
4809 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
4810 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv8i8,
4811 RC);
4812 break;
4813 case MachineCombinerPattern::MULSUBv8i8_OP2:
4814 Opc = AArch64::MLSv8i8;
4815 RC = &AArch64::FPR64RegClass;
4816 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
4817 break;
4818 case MachineCombinerPattern::MULSUBv16i8_OP1:
4819 Opc = AArch64::MLAv16i8;
4820 RC = &AArch64::FPR128RegClass;
4821 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
4822 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv16i8,
4823 RC);
4824 break;
4825 case MachineCombinerPattern::MULSUBv16i8_OP2:
4826 Opc = AArch64::MLSv16i8;
4827 RC = &AArch64::FPR128RegClass;
4828 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
4829 break;
4830 case MachineCombinerPattern::MULSUBv4i16_OP1:
4831 Opc = AArch64::MLAv4i16;
4832 RC = &AArch64::FPR64RegClass;
4833 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
4834 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i16,
4835 RC);
4836 break;
4837 case MachineCombinerPattern::MULSUBv4i16_OP2:
4838 Opc = AArch64::MLSv4i16;
4839 RC = &AArch64::FPR64RegClass;
4840 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
4841 break;
4842 case MachineCombinerPattern::MULSUBv8i16_OP1:
4843 Opc = AArch64::MLAv8i16;
4844 RC = &AArch64::FPR128RegClass;
4845 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
4846 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv8i16,
4847 RC);
4848 break;
4849 case MachineCombinerPattern::MULSUBv8i16_OP2:
4850 Opc = AArch64::MLSv8i16;
4851 RC = &AArch64::FPR128RegClass;
4852 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
4853 break;
4854 case MachineCombinerPattern::MULSUBv2i32_OP1:
4855 Opc = AArch64::MLAv2i32;
4856 RC = &AArch64::FPR64RegClass;
4857 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
4858 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv2i32,
4859 RC);
4860 break;
4861 case MachineCombinerPattern::MULSUBv2i32_OP2:
4862 Opc = AArch64::MLSv2i32;
4863 RC = &AArch64::FPR64RegClass;
4864 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
4865 break;
4866 case MachineCombinerPattern::MULSUBv4i32_OP1:
4867 Opc = AArch64::MLAv4i32;
4868 RC = &AArch64::FPR128RegClass;
4869 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
4870 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i32,
4871 RC);
4872 break;
4873 case MachineCombinerPattern::MULSUBv4i32_OP2:
4874 Opc = AArch64::MLSv4i32;
4875 RC = &AArch64::FPR128RegClass;
4876 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
4877 break;
4878
4879 case MachineCombinerPattern::MULADDv4i16_indexed_OP1:
4880 Opc = AArch64::MLAv4i16_indexed;
4881 RC = &AArch64::FPR64RegClass;
4882 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
4883 break;
4884 case MachineCombinerPattern::MULADDv4i16_indexed_OP2:
4885 Opc = AArch64::MLAv4i16_indexed;
4886 RC = &AArch64::FPR64RegClass;
4887 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
4888 break;
4889 case MachineCombinerPattern::MULADDv8i16_indexed_OP1:
4890 Opc = AArch64::MLAv8i16_indexed;
4891 RC = &AArch64::FPR128RegClass;
4892 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
4893 break;
4894 case MachineCombinerPattern::MULADDv8i16_indexed_OP2:
4895 Opc = AArch64::MLAv8i16_indexed;
4896 RC = &AArch64::FPR128RegClass;
4897 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
4898 break;
4899 case MachineCombinerPattern::MULADDv2i32_indexed_OP1:
4900 Opc = AArch64::MLAv2i32_indexed;
4901 RC = &AArch64::FPR64RegClass;
4902 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
4903 break;
4904 case MachineCombinerPattern::MULADDv2i32_indexed_OP2:
4905 Opc = AArch64::MLAv2i32_indexed;
4906 RC = &AArch64::FPR64RegClass;
4907 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
4908 break;
4909 case MachineCombinerPattern::MULADDv4i32_indexed_OP1:
4910 Opc = AArch64::MLAv4i32_indexed;
4911 RC = &AArch64::FPR128RegClass;
4912 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
4913 break;
4914 case MachineCombinerPattern::MULADDv4i32_indexed_OP2:
4915 Opc = AArch64::MLAv4i32_indexed;
4916 RC = &AArch64::FPR128RegClass;
4917 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
4918 break;
4919
4920 case MachineCombinerPattern::MULSUBv4i16_indexed_OP1:
4921 Opc = AArch64::MLAv4i16_indexed;
4922 RC = &AArch64::FPR64RegClass;
4923 MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs,
4924 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i16,
4925 RC);
4926 break;
4927 case MachineCombinerPattern::MULSUBv4i16_indexed_OP2:
4928 Opc = AArch64::MLSv4i16_indexed;
4929 RC = &AArch64::FPR64RegClass;
4930 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
4931 break;
4932 case MachineCombinerPattern::MULSUBv8i16_indexed_OP1:
4933 Opc = AArch64::MLAv8i16_indexed;
4934 RC = &AArch64::FPR128RegClass;
4935 MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs,
4936 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv8i16,
4937 RC);
4938 break;
4939 case MachineCombinerPattern::MULSUBv8i16_indexed_OP2:
4940 Opc = AArch64::MLSv8i16_indexed;
4941 RC = &AArch64::FPR128RegClass;
4942 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
4943 break;
4944 case MachineCombinerPattern::MULSUBv2i32_indexed_OP1:
4945 Opc = AArch64::MLAv2i32_indexed;
4946 RC = &AArch64::FPR64RegClass;
4947 MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs,
4948 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv2i32,
4949 RC);
4950 break;
4951 case MachineCombinerPattern::MULSUBv2i32_indexed_OP2:
4952 Opc = AArch64::MLSv2i32_indexed;
4953 RC = &AArch64::FPR64RegClass;
4954 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
4955 break;
4956 case MachineCombinerPattern::MULSUBv4i32_indexed_OP1:
4957 Opc = AArch64::MLAv4i32_indexed;
4958 RC = &AArch64::FPR128RegClass;
4959 MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs,
4960 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i32,
4961 RC);
4962 break;
4963 case MachineCombinerPattern::MULSUBv4i32_indexed_OP2:
4964 Opc = AArch64::MLSv4i32_indexed;
4965 RC = &AArch64::FPR128RegClass;
4966 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
4967 break;
4968
4969 // Floating Point Support
4970 case MachineCombinerPattern::FMULADDH_OP1:
4971 Opc = AArch64::FMADDHrrr;
4972 RC = &AArch64::FPR16RegClass;
4973 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
4974 break;
4975 case MachineCombinerPattern::FMULADDS_OP1:
4976 Opc = AArch64::FMADDSrrr;
4977 RC = &AArch64::FPR32RegClass;
4978 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
4979 break;
4980 case MachineCombinerPattern::FMULADDD_OP1:
4981 Opc = AArch64::FMADDDrrr;
4982 RC = &AArch64::FPR64RegClass;
4983 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
4984 break;
4985
4986 case MachineCombinerPattern::FMULADDH_OP2:
4987 Opc = AArch64::FMADDHrrr;
4988 RC = &AArch64::FPR16RegClass;
4989 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
4990 break;
4991 case MachineCombinerPattern::FMULADDS_OP2:
4992 Opc = AArch64::FMADDSrrr;
4993 RC = &AArch64::FPR32RegClass;
4994 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
4995 break;
4996 case MachineCombinerPattern::FMULADDD_OP2:
4997 Opc = AArch64::FMADDDrrr;
4998 RC = &AArch64::FPR64RegClass;
4999 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
5000 break;
5001
5002 case MachineCombinerPattern::FMLAv1i32_indexed_OP1:
5003 Opc = AArch64::FMLAv1i32_indexed;
5004 RC = &AArch64::FPR32RegClass;
5005 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
5006 FMAInstKind::Indexed);
5007 break;
5008 case MachineCombinerPattern::FMLAv1i32_indexed_OP2:
5009 Opc = AArch64::FMLAv1i32_indexed;
5010 RC = &AArch64::FPR32RegClass;
5011 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
5012 FMAInstKind::Indexed);
5013 break;
5014
5015 case MachineCombinerPattern::FMLAv1i64_indexed_OP1:
5016 Opc = AArch64::FMLAv1i64_indexed;
5017 RC = &AArch64::FPR64RegClass;
5018 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
5019 FMAInstKind::Indexed);
5020 break;
5021 case MachineCombinerPattern::FMLAv1i64_indexed_OP2:
5022 Opc = AArch64::FMLAv1i64_indexed;
5023 RC = &AArch64::FPR64RegClass;
5024 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
5025 FMAInstKind::Indexed);
5026 break;
5027
5028 case MachineCombinerPattern::FMLAv4i16_indexed_OP1:
5029 RC = &AArch64::FPR64RegClass;
5030 Opc = AArch64::FMLAv4i16_indexed;
5031 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
5032 FMAInstKind::Indexed);
5033 break;
5034 case MachineCombinerPattern::FMLAv4f16_OP1:
5035 RC = &AArch64::FPR64RegClass;
5036 Opc = AArch64::FMLAv4f16;
5037 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
5038 FMAInstKind::Accumulator);
5039 break;
5040 case MachineCombinerPattern::FMLAv4i16_indexed_OP2:
5041 RC = &AArch64::FPR64RegClass;
5042 Opc = AArch64::FMLAv4i16_indexed;
5043 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
5044 FMAInstKind::Indexed);
5045 break;
5046 case MachineCombinerPattern::FMLAv4f16_OP2:
5047 RC = &AArch64::FPR64RegClass;
5048 Opc = AArch64::FMLAv4f16;
5049 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
5050 FMAInstKind::Accumulator);
5051 break;
5052
5053 case MachineCombinerPattern::FMLAv2i32_indexed_OP1:
5054 case MachineCombinerPattern::FMLAv2f32_OP1:
5055 RC = &AArch64::FPR64RegClass;
5056 if (Pattern == MachineCombinerPattern::FMLAv2i32_indexed_OP1) {
5057 Opc = AArch64::FMLAv2i32_indexed;
5058 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
5059 FMAInstKind::Indexed);
5060 } else {
5061 Opc = AArch64::FMLAv2f32;
5062 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
5063 FMAInstKind::Accumulator);
5064 }
5065 break;
5066 case MachineCombinerPattern::FMLAv2i32_indexed_OP2:
5067 case MachineCombinerPattern::FMLAv2f32_OP2:
5068 RC = &AArch64::FPR64RegClass;
5069 if (Pattern == MachineCombinerPattern::FMLAv2i32_indexed_OP2) {
5070 Opc = AArch64::FMLAv2i32_indexed;
5071 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
5072 FMAInstKind::Indexed);
5073 } else {
5074 Opc = AArch64::FMLAv2f32;
5075 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
5076 FMAInstKind::Accumulator);
5077 }
5078 break;
5079
5080 case MachineCombinerPattern::FMLAv8i16_indexed_OP1:
5081 RC = &AArch64::FPR128RegClass;
5082 Opc = AArch64::FMLAv8i16_indexed;
5083 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
5084 FMAInstKind::Indexed);
5085 break;
5086 case MachineCombinerPattern::FMLAv8f16_OP1:
5087 RC = &AArch64::FPR128RegClass;
5088 Opc = AArch64::FMLAv8f16;
5089 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
5090 FMAInstKind::Accumulator);
5091 break;
5092 case MachineCombinerPattern::FMLAv8i16_indexed_OP2:
5093 RC = &AArch64::FPR128RegClass;
5094 Opc = AArch64::FMLAv8i16_indexed;
5095 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
5096 FMAInstKind::Indexed);
5097 break;
5098 case MachineCombinerPattern::FMLAv8f16_OP2:
5099 RC = &AArch64::FPR128RegClass;
5100 Opc = AArch64::FMLAv8f16;
5101 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
5102 FMAInstKind::Accumulator);
5103 break;
5104
5105 case MachineCombinerPattern::FMLAv2i64_indexed_OP1:
5106 case MachineCombinerPattern::FMLAv2f64_OP1:
5107 RC = &AArch64::FPR128RegClass;
5108 if (Pattern == MachineCombinerPattern::FMLAv2i64_indexed_OP1) {
5109 Opc = AArch64::FMLAv2i64_indexed;
5110 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
5111 FMAInstKind::Indexed);
5112 } else {
5113 Opc = AArch64::FMLAv2f64;
5114 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
5115 FMAInstKind::Accumulator);
5116 }
5117 break;
5118 case MachineCombinerPattern::FMLAv2i64_indexed_OP2:
5119 case MachineCombinerPattern::FMLAv2f64_OP2:
5120 RC = &AArch64::FPR128RegClass;
5121 if (Pattern == MachineCombinerPattern::FMLAv2i64_indexed_OP2) {
5122 Opc = AArch64::FMLAv2i64_indexed;
5123 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
5124 FMAInstKind::Indexed);
5125 } else {
5126 Opc = AArch64::FMLAv2f64;
5127 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
5128 FMAInstKind::Accumulator);
5129 }
5130 break;
5131
5132 case MachineCombinerPattern::FMLAv4i32_indexed_OP1:
5133 case MachineCombinerPattern::FMLAv4f32_OP1:
5134 RC = &AArch64::FPR128RegClass;
5135 if (Pattern == MachineCombinerPattern::FMLAv4i32_indexed_OP1) {
5136 Opc = AArch64::FMLAv4i32_indexed;
5137 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
5138 FMAInstKind::Indexed);
5139 } else {
5140 Opc = AArch64::FMLAv4f32;
5141 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
5142 FMAInstKind::Accumulator);
5143 }
5144 break;
5145
5146 case MachineCombinerPattern::FMLAv4i32_indexed_OP2:
5147 case MachineCombinerPattern::FMLAv4f32_OP2:
5148 RC = &AArch64::FPR128RegClass;
5149 if (Pattern == MachineCombinerPattern::FMLAv4i32_indexed_OP2) {
5150 Opc = AArch64::FMLAv4i32_indexed;
5151 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
5152 FMAInstKind::Indexed);
5153 } else {
5154 Opc = AArch64::FMLAv4f32;
5155 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
5156 FMAInstKind::Accumulator);
5157 }
5158 break;
5159
5160 case MachineCombinerPattern::FMULSUBH_OP1:
5161 Opc = AArch64::FNMSUBHrrr;
5162 RC = &AArch64::FPR16RegClass;
5163 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
5164 break;
5165 case MachineCombinerPattern::FMULSUBS_OP1:
5166 Opc = AArch64::FNMSUBSrrr;
5167 RC = &AArch64::FPR32RegClass;
5168 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
5169 break;
5170 case MachineCombinerPattern::FMULSUBD_OP1:
5171 Opc = AArch64::FNMSUBDrrr;
5172 RC = &AArch64::FPR64RegClass;
5173 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
5174 break;
5175
5176 case MachineCombinerPattern::FNMULSUBH_OP1:
5177 Opc = AArch64::FNMADDHrrr;
5178 RC = &AArch64::FPR16RegClass;
5179 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
5180 break;
5181 case MachineCombinerPattern::FNMULSUBS_OP1:
5182 Opc = AArch64::FNMADDSrrr;
5183 RC = &AArch64::FPR32RegClass;
5184 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
5185 break;
5186 case MachineCombinerPattern::FNMULSUBD_OP1:
5187 Opc = AArch64::FNMADDDrrr;
5188 RC = &AArch64::FPR64RegClass;
5189 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
5190 break;
5191
5192 case MachineCombinerPattern::FMULSUBH_OP2:
5193 Opc = AArch64::FMSUBHrrr;
5194 RC = &AArch64::FPR16RegClass;
5195 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
5196 break;
5197 case MachineCombinerPattern::FMULSUBS_OP2:
5198 Opc = AArch64::FMSUBSrrr;
5199 RC = &AArch64::FPR32RegClass;
5200 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
5201 break;
5202 case MachineCombinerPattern::FMULSUBD_OP2:
5203 Opc = AArch64::FMSUBDrrr;
5204 RC = &AArch64::FPR64RegClass;
5205 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
5206 break;
5207
5208 case MachineCombinerPattern::FMLSv1i32_indexed_OP2:
5209 Opc = AArch64::FMLSv1i32_indexed;
5210 RC = &AArch64::FPR32RegClass;
5211 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
5212 FMAInstKind::Indexed);
5213 break;
5214
5215 case MachineCombinerPattern::FMLSv1i64_indexed_OP2:
5216 Opc = AArch64::FMLSv1i64_indexed;
5217 RC = &AArch64::FPR64RegClass;
5218 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
5219 FMAInstKind::Indexed);
5220 break;
5221
5222 case MachineCombinerPattern::FMLSv4f16_OP1:
5223 case MachineCombinerPattern::FMLSv4i16_indexed_OP1: {
5224 RC = &AArch64::FPR64RegClass;
5225 Register NewVR = MRI.createVirtualRegister(RC);
5226 MachineInstrBuilder MIB1 =
5227 BuildMI(MF, Root.getDebugLoc(), TII->get(AArch64::FNEGv4f16), NewVR)
5228 .add(Root.getOperand(2));
5229 InsInstrs.push_back(MIB1);
5230 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
5231 if (Pattern == MachineCombinerPattern::FMLSv4f16_OP1) {
5232 Opc = AArch64::FMLAv4f16;
5233 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
5234 FMAInstKind::Accumulator, &NewVR);
5235 } else {
5236 Opc = AArch64::FMLAv4i16_indexed;
5237 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
5238 FMAInstKind::Indexed, &NewVR);
5239 }
5240 break;
5241 }
5242 case MachineCombinerPattern::FMLSv4f16_OP2:
5243 RC = &AArch64::FPR64RegClass;
5244 Opc = AArch64::FMLSv4f16;
5245 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
5246 FMAInstKind::Accumulator);
5247 break;
5248 case MachineCombinerPattern::FMLSv4i16_indexed_OP2:
5249 RC = &AArch64::FPR64RegClass;
5250 Opc = AArch64::FMLSv4i16_indexed;
5251 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
5252 FMAInstKind::Indexed);
5253 break;
5254
5255 case MachineCombinerPattern::FMLSv2f32_OP2:
5256 case MachineCombinerPattern::FMLSv2i32_indexed_OP2:
5257 RC = &AArch64::FPR64RegClass;
5258 if (Pattern == MachineCombinerPattern::FMLSv2i32_indexed_OP2) {
5259 Opc = AArch64::FMLSv2i32_indexed;
5260 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
5261 FMAInstKind::Indexed);
5262 } else {
5263 Opc = AArch64::FMLSv2f32;
5264 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
5265 FMAInstKind::Accumulator);
5266 }
5267 break;
5268
5269 case MachineCombinerPattern::FMLSv8f16_OP1:
5270 case MachineCombinerPattern::FMLSv8i16_indexed_OP1: {
5271 RC = &AArch64::FPR128RegClass;
5272 Register NewVR = MRI.createVirtualRegister(RC);
5273 MachineInstrBuilder MIB1 =
5274 BuildMI(MF, Root.getDebugLoc(), TII->get(AArch64::FNEGv8f16), NewVR)
5275 .add(Root.getOperand(2));
5276 InsInstrs.push_back(MIB1);
5277 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
5278 if (Pattern == MachineCombinerPattern::FMLSv8f16_OP1) {
5279 Opc = AArch64::FMLAv8f16;
5280 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
5281 FMAInstKind::Accumulator, &NewVR);
5282 } else {
5283 Opc = AArch64::FMLAv8i16_indexed;
5284 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
5285 FMAInstKind::Indexed, &NewVR);
5286 }
5287 break;
5288 }
5289 case MachineCombinerPattern::FMLSv8f16_OP2:
5290 RC = &AArch64::FPR128RegClass;
5291 Opc = AArch64::FMLSv8f16;
5292 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
5293 FMAInstKind::Accumulator);
5294 break;
5295 case MachineCombinerPattern::FMLSv8i16_indexed_OP2:
5296 RC = &AArch64::FPR128RegClass;
5297 Opc = AArch64::FMLSv8i16_indexed;
5298 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
5299 FMAInstKind::Indexed);
5300 break;
5301
5302 case MachineCombinerPattern::FMLSv2f64_OP2:
5303 case MachineCombinerPattern::FMLSv2i64_indexed_OP2:
5304 RC = &AArch64::FPR128RegClass;
5305 if (Pattern == MachineCombinerPattern::FMLSv2i64_indexed_OP2) {
5306 Opc = AArch64::FMLSv2i64_indexed;
5307 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
5308 FMAInstKind::Indexed);
5309 } else {
5310 Opc = AArch64::FMLSv2f64;
5311 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
5312 FMAInstKind::Accumulator);
5313 }
5314 break;
5315
5316 case MachineCombinerPattern::FMLSv4f32_OP2:
5317 case MachineCombinerPattern::FMLSv4i32_indexed_OP2:
5318 RC = &AArch64::FPR128RegClass;
5319 if (Pattern == MachineCombinerPattern::FMLSv4i32_indexed_OP2) {
5320 Opc = AArch64::FMLSv4i32_indexed;
5321 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
5322 FMAInstKind::Indexed);
5323 } else {
5324 Opc = AArch64::FMLSv4f32;
5325 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
5326 FMAInstKind::Accumulator);
5327 }
5328 break;
5329 case MachineCombinerPattern::FMLSv2f32_OP1:
5330 case MachineCombinerPattern::FMLSv2i32_indexed_OP1: {
5331 RC = &AArch64::FPR64RegClass;
5332 Register NewVR = MRI.createVirtualRegister(RC);
5333 MachineInstrBuilder MIB1 =
5334 BuildMI(MF, Root.getDebugLoc(), TII->get(AArch64::FNEGv2f32), NewVR)
5335 .add(Root.getOperand(2));
5336 InsInstrs.push_back(MIB1);
5337 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
5338 if (Pattern == MachineCombinerPattern::FMLSv2i32_indexed_OP1) {
5339 Opc = AArch64::FMLAv2i32_indexed;
5340 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
5341 FMAInstKind::Indexed, &NewVR);
5342 } else {
5343 Opc = AArch64::FMLAv2f32;
5344 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
5345 FMAInstKind::Accumulator, &NewVR);
5346 }
5347 break;
5348 }
5349 case MachineCombinerPattern::FMLSv4f32_OP1:
5350 case MachineCombinerPattern::FMLSv4i32_indexed_OP1: {
5351 RC = &AArch64::FPR128RegClass;
5352 Register NewVR = MRI.createVirtualRegister(RC);
5353 MachineInstrBuilder MIB1 =
5354 BuildMI(MF, Root.getDebugLoc(), TII->get(AArch64::FNEGv4f32), NewVR)
5355 .add(Root.getOperand(2));
5356 InsInstrs.push_back(MIB1);
5357 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
5358 if (Pattern == MachineCombinerPattern::FMLSv4i32_indexed_OP1) {
5359 Opc = AArch64::FMLAv4i32_indexed;
5360 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
5361 FMAInstKind::Indexed, &NewVR);
5362 } else {
5363 Opc = AArch64::FMLAv4f32;
5364 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
5365 FMAInstKind::Accumulator, &NewVR);
5366 }
5367 break;
5368 }
5369 case MachineCombinerPattern::FMLSv2f64_OP1:
5370 case MachineCombinerPattern::FMLSv2i64_indexed_OP1: {
5371 RC = &AArch64::FPR128RegClass;
5372 Register NewVR = MRI.createVirtualRegister(RC);
5373 MachineInstrBuilder MIB1 =
5374 BuildMI(MF, Root.getDebugLoc(), TII->get(AArch64::FNEGv2f64), NewVR)
5375 .add(Root.getOperand(2));
5376 InsInstrs.push_back(MIB1);
5377 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
5378 if (Pattern == MachineCombinerPattern::FMLSv2i64_indexed_OP1) {
5379 Opc = AArch64::FMLAv2i64_indexed;
5380 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
5381 FMAInstKind::Indexed, &NewVR);
5382 } else {
5383 Opc = AArch64::FMLAv2f64;
5384 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
5385 FMAInstKind::Accumulator, &NewVR);
5386 }
5387 break;
5388 }
5389 } // end switch (Pattern)
5390 // Record MUL and ADD/SUB for deletion
5391 DelInstrs.push_back(MUL);
5392 DelInstrs.push_back(&Root);
5393 }
5394
5395 /// Replace csincr-branch sequence by simple conditional branch
5396 ///
5397 /// Examples:
5398 /// 1. \code
5399 /// csinc w9, wzr, wzr, <condition code>
5400 /// tbnz w9, #0, 0x44
5401 /// \endcode
5402 /// to
5403 /// \code
5404 /// b.<inverted condition code>
5405 /// \endcode
5406 ///
5407 /// 2. \code
5408 /// csinc w9, wzr, wzr, <condition code>
5409 /// tbz w9, #0, 0x44
5410 /// \endcode
5411 /// to
5412 /// \code
5413 /// b.<condition code>
5414 /// \endcode
5415 ///
5416 /// Replace compare and branch sequence by TBZ/TBNZ instruction when the
5417 /// compare's constant operand is power of 2.
5418 ///
5419 /// Examples:
5420 /// \code
5421 /// and w8, w8, #0x400
5422 /// cbnz w8, L1
5423 /// \endcode
5424 /// to
5425 /// \code
5426 /// tbnz w8, #10, L1
5427 /// \endcode
5428 ///
5429 /// \param MI Conditional Branch
5430 /// \return True when the simple conditional branch is generated
5431 ///
5432 bool AArch64InstrInfo::optimizeCondBranch(MachineInstr &MI) const {
5433 bool IsNegativeBranch = false;
5434 bool IsTestAndBranch = false;
5435 unsigned TargetBBInMI = 0;
5436 switch (MI.getOpcode()) {
5437 default:
5438 llvm_unreachable("Unknown branch instruction?");
5439 case AArch64::Bcc:
5440 return false;
5441 case AArch64::CBZW:
5442 case AArch64::CBZX:
5443 TargetBBInMI = 1;
5444 break;
5445 case AArch64::CBNZW:
5446 case AArch64::CBNZX:
5447 TargetBBInMI = 1;
5448 IsNegativeBranch = true;
5449 break;
5450 case AArch64::TBZW:
5451 case AArch64::TBZX:
5452 TargetBBInMI = 2;
5453 IsTestAndBranch = true;
5454 break;
5455 case AArch64::TBNZW:
5456 case AArch64::TBNZX:
5457 TargetBBInMI = 2;
5458 IsNegativeBranch = true;
5459 IsTestAndBranch = true;
5460 break;
5461 }
5462 // So we increment a zero register and test for bits other
5463 // than bit 0? Conservatively bail out in case the verifier
5464 // missed this case.
5465 if (IsTestAndBranch && MI.getOperand(1).getImm())
5466 return false;
5467
5468 // Find Definition.
5469 assert(MI.getParent() && "Incomplete machine instruction\n");
5470 MachineBasicBlock *MBB = MI.getParent();
5471 MachineFunction *MF = MBB->getParent();
5472 MachineRegisterInfo *MRI = &MF->getRegInfo();
5473 Register VReg = MI.getOperand(0).getReg();
5474 if (!Register::isVirtualRegister(VReg))
5475 return false;
5476
5477 MachineInstr *DefMI = MRI->getVRegDef(VReg);
5478
5479 // Look through COPY instructions to find definition.
5480 while (DefMI->isCopy()) {
5481 Register CopyVReg = DefMI->getOperand(1).getReg();
5482 if (!MRI->hasOneNonDBGUse(CopyVReg))
5483 return false;
5484 if (!MRI->hasOneDef(CopyVReg))
5485 return false;
5486 DefMI = MRI->getVRegDef(CopyVReg);
5487 }
5488
5489 switch (DefMI->getOpcode()) {
5490 default:
5491 return false;
5492 // Fold AND into a TBZ/TBNZ if constant operand is power of 2.
5493 case AArch64::ANDWri:
5494 case AArch64::ANDXri: {
5495 if (IsTestAndBranch)
5496 return false;
5497 if (DefMI->getParent() != MBB)
5498 return false;
5499 if (!MRI->hasOneNonDBGUse(VReg))
5500 return false;
5501
5502 bool Is32Bit = (DefMI->getOpcode() == AArch64::ANDWri);
5503 uint64_t Mask = AArch64_AM::decodeLogicalImmediate(
5504 DefMI->getOperand(2).getImm(), Is32Bit ? 32 : 64);
5505 if (!isPowerOf2_64(Mask))
5506 return false;
5507
5508 MachineOperand &MO = DefMI->getOperand(1);
5509 Register NewReg = MO.getReg();
5510 if (!Register::isVirtualRegister(NewReg))
5511 return false;
5512
5513 assert(!MRI->def_empty(NewReg) && "Register must be defined.");
5514
5515 MachineBasicBlock &RefToMBB = *MBB;
5516 MachineBasicBlock *TBB = MI.getOperand(1).getMBB();
5517 DebugLoc DL = MI.getDebugLoc();
5518 unsigned Imm = Log2_64(Mask);
5519 unsigned Opc = (Imm < 32)
5520 ? (IsNegativeBranch ? AArch64::TBNZW : AArch64::TBZW)
5521 : (IsNegativeBranch ? AArch64::TBNZX : AArch64::TBZX);
5522 MachineInstr *NewMI = BuildMI(RefToMBB, MI, DL, get(Opc))
5523 .addReg(NewReg)
5524 .addImm(Imm)
5525 .addMBB(TBB);
5526 // Register lives on to the TB(N)Z now.
5527 MO.setIsKill(false);
5528
5529 // For immediates smaller than 32, we need to use the 32-bit
5530 // variant (W) in all cases. Indeed the 64-bit variant cannot
5531 // encode them.
5532 // Therefore, if the input register is 64-bit, we need to take the
5533 // 32-bit sub-part.
5534 if (!Is32Bit && Imm < 32)
5535 NewMI->getOperand(0).setSubReg(AArch64::sub_32);
5536 MI.eraseFromParent();
5537 return true;
5538 }
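// A concrete sketch of the rewrite above: "and x8, x9, #0x400" followed by
// "cbnz x8, L1" becomes "tbnz w9, #10, L1". Because the tested bit (10) is
// below 32, the W-form TB(N)Z is used and the 64-bit source register is
// accessed through its sub_32 sub-register.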
5539 // Look for CSINC
5540 case AArch64::CSINCWr:
5541 case AArch64::CSINCXr: {
5542 if (!(DefMI->getOperand(1).getReg() == AArch64::WZR &&
5543 DefMI->getOperand(2).getReg() == AArch64::WZR) &&
5544 !(DefMI->getOperand(1).getReg() == AArch64::XZR &&
5545 DefMI->getOperand(2).getReg() == AArch64::XZR))
5546 return false;
5547
5548 if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, true) != -1)
5549 return false;
5550
5551 AArch64CC::CondCode CC = (AArch64CC::CondCode)DefMI->getOperand(3).getImm();
5552 // Convert only when the condition code is not modified between
5553 // the CSINC and the branch. The CC may be used by other
5554 // instructions in between.
5555 if (areCFlagsAccessedBetweenInstrs(DefMI, MI, &getRegisterInfo(), AK_Write))
5556 return false;
5557 MachineBasicBlock &RefToMBB = *MBB;
5558 MachineBasicBlock *TBB = MI.getOperand(TargetBBInMI).getMBB();
5559 DebugLoc DL = MI.getDebugLoc();
5560 if (IsNegativeBranch)
5561 CC = AArch64CC::getInvertedCondCode(CC);
5562 BuildMI(RefToMBB, MI, DL, get(AArch64::Bcc)).addImm(CC).addMBB(TBB);
5563 MI.eraseFromParent();
5564 return true;
5565 }
5566 }
5567 }
5568
5569 std::pair<unsigned, unsigned>
5570 AArch64InstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const {
5571 const unsigned Mask = AArch64II::MO_FRAGMENT;
5572 return std::make_pair(TF & Mask, TF & ~Mask);
5573 }
5574
5575 ArrayRef<std::pair<unsigned, const char *>>
5576 AArch64InstrInfo::getSerializableDirectMachineOperandTargetFlags() const {
5577 using namespace AArch64II;
5578
5579 static const std::pair<unsigned, const char *> TargetFlags[] = {
5580 {MO_PAGE, "aarch64-page"}, {MO_PAGEOFF, "aarch64-pageoff"},
5581 {MO_G3, "aarch64-g3"}, {MO_G2, "aarch64-g2"},
5582 {MO_G1, "aarch64-g1"}, {MO_G0, "aarch64-g0"},
5583 {MO_HI12, "aarch64-hi12"}};
5584 return makeArrayRef(TargetFlags);
5585 }
5586
5587 ArrayRef<std::pair<unsigned, const char *>>
5588 AArch64InstrInfo::getSerializableBitmaskMachineOperandTargetFlags() const {
5589 using namespace AArch64II;
5590
5591 static const std::pair<unsigned, const char *> TargetFlags[] = {
5592 {MO_COFFSTUB, "aarch64-coffstub"},
5593 {MO_GOT, "aarch64-got"},
5594 {MO_NC, "aarch64-nc"},
5595 {MO_S, "aarch64-s"},
5596 {MO_TLS, "aarch64-tls"},
5597 {MO_DLLIMPORT, "aarch64-dllimport"},
5598 {MO_PREL, "aarch64-prel"},
5599 {MO_TAGGED, "aarch64-tagged"}};
5600 return makeArrayRef(TargetFlags);
5601 }
5602
5603 ArrayRef<std::pair<MachineMemOperand::Flags, const char *>>
5604 AArch64InstrInfo::getSerializableMachineMemOperandTargetFlags() const {
5605 static const std::pair<MachineMemOperand::Flags, const char *> TargetFlags[] =
5606 {{MOSuppressPair, "aarch64-suppress-pair"},
5607 {MOStridedAccess, "aarch64-strided-access"}};
5608 return makeArrayRef(TargetFlags);
5609 }
5610
5611 /// Constants defining how certain sequences should be outlined.
5612 /// This encompasses how an outlined function should be called, and what kind of
5613 /// frame should be emitted for that outlined function.
5614 ///
5615 /// \p MachineOutlinerDefault implies that the function should be called with
5616 /// a save and restore of LR to the stack.
5617 ///
5618 /// That is,
5619 ///
5620 /// I1 Save LR OUTLINED_FUNCTION:
5621 /// I2 --> BL OUTLINED_FUNCTION I1
5622 /// I3 Restore LR I2
5623 /// I3
5624 /// RET
5625 ///
5626 /// * Call construction overhead: 3 (save + BL + restore)
5627 /// * Frame construction overhead: 1 (ret)
5628 /// * Requires stack fixups? Yes
5629 ///
5630 /// \p MachineOutlinerTailCall implies that the function is being created from
5631 /// a sequence of instructions ending in a return.
5632 ///
5633 /// That is,
5634 ///
5635 /// I1 OUTLINED_FUNCTION:
5636 /// I2 --> B OUTLINED_FUNCTION I1
5637 /// RET I2
5638 /// RET
5639 ///
5640 /// * Call construction overhead: 1 (B)
5641 /// * Frame construction overhead: 0 (Return included in sequence)
5642 /// * Requires stack fixups? No
5643 ///
5644 /// \p MachineOutlinerNoLRSave implies that the function should be called using
5645 /// a BL instruction, but doesn't require LR to be saved and restored. This
5646 /// happens when LR is known to be dead.
5647 ///
5648 /// That is,
5649 ///
5650 /// I1 OUTLINED_FUNCTION:
5651 /// I2 --> BL OUTLINED_FUNCTION I1
5652 /// I3 I2
5653 /// I3
5654 /// RET
5655 ///
5656 /// * Call construction overhead: 1 (BL)
5657 /// * Frame construction overhead: 1 (RET)
5658 /// * Requires stack fixups? No
5659 ///
5660 /// \p MachineOutlinerThunk implies that the function is being created from
5661 /// a sequence of instructions ending in a call. The outlined function is
5662 /// called with a BL instruction, and the outlined function tail-calls the
5663 /// original call destination.
5664 ///
5665 /// That is,
5666 ///
5667 /// I1 OUTLINED_FUNCTION:
5668 /// I2 --> BL OUTLINED_FUNCTION I1
5669 /// BL f I2
5670 /// B f
5671 /// * Call construction overhead: 1 (BL)
5672 /// * Frame construction overhead: 0
5673 /// * Requires stack fixups? No
5674 ///
5675 /// \p MachineOutlinerRegSave implies that the function should be called with a
5676 /// save and restore of LR to an available register. This allows us to avoid
5677 /// stack fixups. Note that this outlining variant is compatible with the
5678 /// NoLRSave case.
5679 ///
5680 /// That is,
5681 ///
5682 /// I1 Save LR OUTLINED_FUNCTION:
5683 /// I2 --> BL OUTLINED_FUNCTION I1
5684 /// I3 Restore LR I2
5685 /// I3
5686 /// RET
5687 ///
5688 /// * Call construction overhead: 3 (save + BL + restore)
5689 /// * Frame construction overhead: 1 (ret)
5690 /// * Requires stack fixups? No
5691 enum MachineOutlinerClass {
5692 MachineOutlinerDefault, /// Emit a save, restore, call, and return.
5693 MachineOutlinerTailCall, /// Only emit a branch.
5694 MachineOutlinerNoLRSave, /// Emit a call and return.
5695 MachineOutlinerThunk, /// Emit a call and tail-call.
5696 MachineOutlinerRegSave /// Same as default, but save to a register.
5697 };
5698
5699 enum MachineOutlinerMBBFlags {
5700 LRUnavailableSomewhere = 0x2,
5701 HasCalls = 0x4,
5702 UnsafeRegsDead = 0x8
5703 };
5704
5705 unsigned
5706 AArch64InstrInfo::findRegisterToSaveLRTo(const outliner::Candidate &C) const {
5707 assert(C.LRUWasSet && "LRU wasn't set?");
5708 MachineFunction *MF = C.getMF();
5709 const AArch64RegisterInfo *ARI = static_cast<const AArch64RegisterInfo *>(
5710 MF->getSubtarget().getRegisterInfo());
5711
5712 // Check if there is an available register across the sequence that we can
5713 // use.
5714 for (unsigned Reg : AArch64::GPR64RegClass) {
5715 if (!ARI->isReservedReg(*MF, Reg) &&
5716 Reg != AArch64::LR && // LR is not reserved, but don't use it.
5717 Reg != AArch64::X16 && // X16 is not guaranteed to be preserved.
5718 Reg != AArch64::X17 && // Ditto for X17.
5719 C.LRU.available(Reg) && C.UsedInSequence.available(Reg))
5720 return Reg;
5721 }
5722
5723 // No suitable register. Return 0.
5724 return 0u;
5725 }
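// A register returned here is later used (roughly) as
//   mov xN, lr ; bl OUTLINED_FUNCTION ; mov lr, xN
// around the call site, which is why the RegSave call variant is costed at
// 12 bytes in getOutliningCandidateInfo() below.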
5726
5727 static bool
5728 outliningCandidatesSigningScopeConsensus(const outliner::Candidate &a,
5729 const outliner::Candidate &b) {
5730 const Function &Fa = a.getMF()->getFunction();
5731 const Function &Fb = b.getMF()->getFunction();
5732
5733 // If none of the functions have the "sign-return-address" attribute their
5734 // signing behaviour is equal
5735 if (!Fa.hasFnAttribute("sign-return-address") &&
5736 !Fb.hasFnAttribute("sign-return-address")) {
5737 return true;
5738 }
5739
5740 // If both functions have the "sign-return-address" attribute, their signing
5741 // behaviour is equal if the values of the attributes are equal.
5742 if (Fa.hasFnAttribute("sign-return-address") &&
5743 Fb.hasFnAttribute("sign-return-address")) {
5744 StringRef ScopeA =
5745 Fa.getFnAttribute("sign-return-address").getValueAsString();
5746 StringRef ScopeB =
5747 Fb.getFnAttribute("sign-return-address").getValueAsString();
5748 return ScopeA.equals(ScopeB);
5749 }
5750
5751 // If function B doesn't have the "sign-return-address" attribute but A does,
5752 // the functions' signing behaviour is equal if A's value for
5753 // "sign-return-address" is "none" and vice versa.
5754 if (Fa.hasFnAttribute("sign-return-address")) {
5755 StringRef ScopeA =
5756 Fa.getFnAttribute("sign-return-address").getValueAsString();
5757 return ScopeA.equals("none");
5758 }
5759
5760 if (Fb.hasFnAttribute("sign-return-address")) {
5761 StringRef ScopeB =
5762 Fb.getFnAttribute("sign-return-address").getValueAsString();
5763 return ScopeB.equals("none");
5764 }
5765
5766 llvm_unreachable("Unknown combination of sign-return-address attributes");
5767 }
5768
5769 static bool
5770 outliningCandidatesSigningKeyConsensus(const outliner::Candidate &a,
5771 const outliner::Candidate &b) {
5772 const Function &Fa = a.getMF()->getFunction();
5773 const Function &Fb = b.getMF()->getFunction();
5774
5775 // If none of the functions have the "sign-return-address-key" attribute
5776 // their keys are equal
5777 if (!Fa.hasFnAttribute("sign-return-address-key") &&
5778 !Fb.hasFnAttribute("sign-return-address-key")) {
5779 return true;
5780 }
5781
5782 // If both functions have the "sign-return-address-key" attribute their
5783 // keys are equal if the values of "sign-return-address-key" are equal
5784 if (Fa.hasFnAttribute("sign-return-address-key") &&
5785 Fb.hasFnAttribute("sign-return-address-key")) {
5786 StringRef KeyA =
5787 Fa.getFnAttribute("sign-return-address-key").getValueAsString();
5788 StringRef KeyB =
5789 Fb.getFnAttribute("sign-return-address-key").getValueAsString();
5790 return KeyA.equals(KeyB);
5791 }
5792
5793 // If B doesn't have the "sign-return-address-key" attribute, both keys are
5794 // equal if function a has the default key (a_key).
5795 if (Fa.hasFnAttribute("sign-return-address-key")) {
5796 StringRef KeyA =
5797 Fa.getFnAttribute("sign-return-address-key").getValueAsString();
5798 return KeyA.equals_lower("a_key");
5799 }
5800
5801 if (Fb.hasFnAttribute("sign-return-address-key")) {
5802 StringRef KeyB =
5803 Fb.getFnAttribute("sign-return-address-key").getValueAsString();
5804 return KeyB.equals_lower("a_key");
5805 }
5806
5807 llvm_unreachable("Unknown combination of sign-return-address-key attributes");
5808 }
5809
5810 static bool outliningCandidatesV8_3OpsConsensus(const outliner::Candidate &a,
5811 const outliner::Candidate &b) {
5812 const AArch64Subtarget &SubtargetA =
5813 a.getMF()->getSubtarget<AArch64Subtarget>();
5814 const AArch64Subtarget &SubtargetB =
5815 b.getMF()->getSubtarget<AArch64Subtarget>();
5816 return SubtargetA.hasV8_3aOps() == SubtargetB.hasV8_3aOps();
5817 }
5818
5819 outliner::OutlinedFunction AArch64InstrInfo::getOutliningCandidateInfo(
5820 std::vector<outliner::Candidate> &RepeatedSequenceLocs) const {
5821 outliner::Candidate &FirstCand = RepeatedSequenceLocs[0];
5822 unsigned SequenceSize =
5823 std::accumulate(FirstCand.front(), std::next(FirstCand.back()), 0,
5824 [this](unsigned Sum, const MachineInstr &MI) {
5825 return Sum + getInstSizeInBytes(MI);
5826 });
5827 unsigned NumBytesToCreateFrame = 0;
5828
5829 // We only allow outlining for functions having exactly matching return
5830 // address signing attributes, i.e., all share the same value for the
5831 // attribute "sign-return-address" and all share the same type of key they
5832 // are signed with.
5833 // Additionally we require all functions to simultaneously either support
5834 // v8.3a features or not. Otherwise an outlined function could get signed
5835 // using dedicated v8.3 instructions and a call from a function that doesn't
5836 // support v8.3 instructions would therefore be invalid.
5837 if (std::adjacent_find(
5838 RepeatedSequenceLocs.begin(), RepeatedSequenceLocs.end(),
5839 [](const outliner::Candidate &a, const outliner::Candidate &b) {
5840 // Return true if a and b are non-equal w.r.t. return address
5841 // signing or support of v8.3a features
5842 if (outliningCandidatesSigningScopeConsensus(a, b) &&
5843 outliningCandidatesSigningKeyConsensus(a, b) &&
5844 outliningCandidatesV8_3OpsConsensus(a, b)) {
5845 return false;
5846 }
5847 return true;
5848 }) != RepeatedSequenceLocs.end()) {
5849 return outliner::OutlinedFunction();
5850 }
5851
5852 // Since at this point all candidates agree on their return address signing,
5853 // picking just one is fine. If the candidate functions potentially sign their
5854 // return addresses, the outlined function should do the same. Note that in
5855 // the case of "sign-return-address"="non-leaf" this is an assumption: it is
5856 // not certain that the outlined function will have to sign its return
5857 // address, but that decision is made later, when the decision to outline
5858 // has already been made.
5859 // The same holds for the number of additional instructions we need: On
5860 // v8.3a RET can be replaced by RETAA/RETAB and no AUT instruction is
5861 // necessary. However, at this point we don't know if the outlined function
5862 // will have a RET instruction so we assume the worst.
5863 const Function &FCF = FirstCand.getMF()->getFunction();
5864 const TargetRegisterInfo &TRI = getRegisterInfo();
5865 if (FCF.hasFnAttribute("sign-return-address")) {
5866 // One PAC and one AUT instructions
5867 NumBytesToCreateFrame += 8;
5868
5869 // We have to check if sp modifying instructions would get outlined.
5870 // If so we only allow outlining if sp is unchanged overall, so matching
5871 // sub and add instructions are okay to outline, all other sp modifications
5872 // are not
5873 auto hasIllegalSPModification = [&TRI](outliner::Candidate &C) {
5874 int SPValue = 0;
5875 MachineBasicBlock::iterator MBBI = C.front();
5876 for (;;) {
5877 if (MBBI->modifiesRegister(AArch64::SP, &TRI)) {
5878 switch (MBBI->getOpcode()) {
5879 case AArch64::ADDXri:
5880 case AArch64::ADDWri:
5881 assert(MBBI->getNumOperands() == 4 && "Wrong number of operands");
5882 assert(MBBI->getOperand(2).isImm() &&
5883 "Expected operand to be immediate");
5884 assert(MBBI->getOperand(1).isReg() &&
5885 "Expected operand to be a register");
5886 // Check if the add just increments sp. If so, we search for
5887 // matching sub instructions that decrement sp. If not, the
5888 // modification is illegal
5889 if (MBBI->getOperand(1).getReg() == AArch64::SP)
5890 SPValue += MBBI->getOperand(2).getImm();
5891 else
5892 return true;
5893 break;
5894 case AArch64::SUBXri:
5895 case AArch64::SUBWri:
5896 assert(MBBI->getNumOperands() == 4 && "Wrong number of operands");
5897 assert(MBBI->getOperand(2).isImm() &&
5898 "Expected operand to be immediate");
5899 assert(MBBI->getOperand(1).isReg() &&
5900 "Expected operand to be a register");
5901 // Check if the sub just decrements sp. If so, we search for
5902 // matching add instructions that increment sp. If not, the
5903 // modification is illegal
5904 if (MBBI->getOperand(1).getReg() == AArch64::SP)
5905 SPValue -= MBBI->getOperand(2).getImm();
5906 else
5907 return true;
5908 break;
5909 default:
5910 return true;
5911 }
5912 }
5913 if (MBBI == C.back())
5914 break;
5915 ++MBBI;
5916 }
5917 if (SPValue)
5918 return true;
5919 return false;
5920 };
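// For example, a candidate containing the balanced pair
//   sub sp, sp, #16 ... add sp, sp, #16
// leaves SPValue at zero and remains outlinable, whereas an unmatched
// "sub sp, sp, #32" (or any other kind of SP write) rejects the candidate.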
5921 // Remove candidates with illegal stack modifying instructions
5922 RepeatedSequenceLocs.erase(std::remove_if(RepeatedSequenceLocs.begin(),
5923 RepeatedSequenceLocs.end(),
5924 hasIllegalSPModification),
5925 RepeatedSequenceLocs.end());
5926
5927 // If the sequence doesn't have enough candidates left, then we're done.
5928 if (RepeatedSequenceLocs.size() < 2)
5929 return outliner::OutlinedFunction();
5930 }
5931
5932 // Properties about candidate MBBs that hold for all of them.
5933 unsigned FlagsSetInAll = 0xF;
5934
5935 // Compute liveness information for each candidate, and set FlagsSetInAll.
5936 std::for_each(RepeatedSequenceLocs.begin(), RepeatedSequenceLocs.end(),
5937 [&FlagsSetInAll](outliner::Candidate &C) {
5938 FlagsSetInAll &= C.Flags;
5939 });
5940
5941 // According to the AArch64 Procedure Call Standard, the following are
5942 // undefined on entry/exit from a function call:
5943 //
5944 // * Registers x16, x17, (and thus w16, w17)
5945 // * Condition codes (and thus the NZCV register)
5946 //
5947 // Because of this, we can't outline any sequence of instructions where one
5948 // of these registers is live into/across it. Thus, we need to delete those
5949 // candidates.
5952 auto CantGuaranteeValueAcrossCall = [&TRI](outliner::Candidate &C) {
5953 // If the unsafe registers in this block are all dead, then we don't need
5954 // to compute liveness here.
5955 if (C.Flags & UnsafeRegsDead)
5956 return false;
5957 C.initLRU(TRI);
5958 LiveRegUnits LRU = C.LRU;
5959 return (!LRU.available(AArch64::W16) || !LRU.available(AArch64::W17) ||
5960 !LRU.available(AArch64::NZCV));
5961 };
5962
5963 // Are there any candidates where those registers are live?
5964 if (!(FlagsSetInAll & UnsafeRegsDead)) {
5965 // Erase every candidate that violates the restrictions above. (It could be
5966 // true that we have viable candidates, so it's not worth bailing out in
5967 // the case that, say, 1 out of 20 candidates violates the restrictions.)
5968 RepeatedSequenceLocs.erase(std::remove_if(RepeatedSequenceLocs.begin(),
5969 RepeatedSequenceLocs.end(),
5970 CantGuaranteeValueAcrossCall),
5971 RepeatedSequenceLocs.end());
5972
5973 // If the sequence doesn't have enough candidates left, then we're done.
5974 if (RepeatedSequenceLocs.size() < 2)
5975 return outliner::OutlinedFunction();
5976 }
5977
5978 // At this point, we have only "safe" candidates to outline. Figure out
5979 // frame + call instruction information.
5980
5981 unsigned LastInstrOpcode = RepeatedSequenceLocs[0].back()->getOpcode();
5982
5983 // Helper lambda which sets call information for every candidate.
5984 auto SetCandidateCallInfo =
5985 [&RepeatedSequenceLocs](unsigned CallID, unsigned NumBytesForCall) {
5986 for (outliner::Candidate &C : RepeatedSequenceLocs)
5987 C.setCallInfo(CallID, NumBytesForCall);
5988 };
5989
5990 unsigned FrameID = MachineOutlinerDefault;
5991 NumBytesToCreateFrame += 4;
5992
5993 bool HasBTI = any_of(RepeatedSequenceLocs, [](outliner::Candidate &C) {
5994 return C.getMF()->getFunction().hasFnAttribute("branch-target-enforcement");
5995 });
5996
5997 // We check to see if CFI Instructions are present, and if they are
5998 // we find the number of CFI Instructions in the candidates.
5999 unsigned CFICount = 0;
6000 MachineBasicBlock::iterator MBBI = RepeatedSequenceLocs[0].front();
6001 for (unsigned Loc = RepeatedSequenceLocs[0].getStartIdx();
6002 Loc < RepeatedSequenceLocs[0].getEndIdx() + 1; Loc++) {
6003 const std::vector<MCCFIInstruction> &CFIInstructions =
6004 RepeatedSequenceLocs[0].getMF()->getFrameInstructions();
6005 if (MBBI->isCFIInstruction()) {
6006 unsigned CFIIndex = MBBI->getOperand(0).getCFIIndex();
6007 MCCFIInstruction CFI = CFIInstructions[CFIIndex];
6008 CFICount++;
6009 }
6010 MBBI++;
6011 }
6012
6013 // We compare the number of found CFI Instructions to the number of CFI
6014 // instructions in the parent function for each candidate. We must check this
6015 // since if we outline one of the CFI instructions in a function, we have to
6016 // outline them all for correctness. If we do not, the address offsets will be
6017 // incorrect between the two sections of the program.
6018 for (outliner::Candidate &C : RepeatedSequenceLocs) {
6019 std::vector<MCCFIInstruction> CFIInstructions =
6020 C.getMF()->getFrameInstructions();
6021
6022 if (CFICount > 0 && CFICount != CFIInstructions.size())
6023 return outliner::OutlinedFunction();
6024 }
6025
6026 // Returns true if an instruction is safe to fix up, false otherwise.
6027 auto IsSafeToFixup = [this, &TRI](MachineInstr &MI) {
6028 if (MI.isCall())
6029 return true;
6030
6031 if (!MI.modifiesRegister(AArch64::SP, &TRI) &&
6032 !MI.readsRegister(AArch64::SP, &TRI))
6033 return true;
6034
6035 // Any modification of SP will break our code to save/restore LR.
6036 // FIXME: We could handle some instructions which add a constant
6037 // offset to SP, with a bit more work.
6038 if (MI.modifiesRegister(AArch64::SP, &TRI))
6039 return false;
6040
6041 // At this point, we have a stack instruction that we might need to
6042 // fix up. We'll handle it if it's a load or store.
6043 if (MI.mayLoadOrStore()) {
6044 const MachineOperand *Base; // Filled with the base operand of MI.
6045 int64_t Offset; // Filled with the offset of MI.
6046 bool OffsetIsScalable;
6047
6048 // Does it allow us to offset the base operand and is the base the
6049 // register SP?
6050 if (!getMemOperandWithOffset(MI, Base, Offset, OffsetIsScalable, &TRI) ||
6051 !Base->isReg() || Base->getReg() != AArch64::SP)
6052 return false;
6053
6054 // Fix-up code below assumes bytes.
6055 if (OffsetIsScalable)
6056 return false;
6057
6058 // Find the minimum/maximum offset for this instruction and check
6059 // if fixing it up would be in range.
6060 int64_t MinOffset,
6061 MaxOffset; // Unscaled offsets for the instruction.
6062 TypeSize Scale(0U, false); // The scale to multiply the offsets by.
6063 unsigned DummyWidth;
6064 getMemOpInfo(MI.getOpcode(), Scale, DummyWidth, MinOffset, MaxOffset);
6065
6066 Offset += 16; // Update the offset to what it would be if we outlined.
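// (This 16 is the stack space the default outlining frame is expected to use
//  to spill LR; e.g. a candidate "ldr x0, [sp, #8]" would have to be
//  re-emitted as "ldr x0, [sp, #24]" in the outlined body, assuming that
//  frame layout.)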
6067 if (Offset < MinOffset * (int64_t)Scale.getFixedSize() ||
6068 Offset > MaxOffset * (int64_t)Scale.getFixedSize())
6069 return false;
6070
6071 // It's in range, so we can outline it.
6072 return true;
6073 }
6074
6075 // FIXME: Add handling for instructions like "add x0, sp, #8".
6076
6077 // We can't fix it up, so don't outline it.
6078 return false;
6079 };
6080
6081 // True if it's possible to fix up each stack instruction in this sequence.
6082 // Important for frames/call variants that modify the stack.
6083 bool AllStackInstrsSafe = std::all_of(
6084 FirstCand.front(), std::next(FirstCand.back()), IsSafeToFixup);
6085
6086 // If the last instruction in any candidate is a terminator, then we should
6087 // tail call all of the candidates.
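// A tail call is a single branch to the outlined function, so each call site
// costs 4 bytes and no extra frame instructions are needed.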
6088 if (RepeatedSequenceLocs[0].back()->isTerminator()) {
6089 FrameID = MachineOutlinerTailCall;
6090 NumBytesToCreateFrame = 0;
6091 SetCandidateCallInfo(MachineOutlinerTailCall, 4);
6092 }
6093
6094 else if (LastInstrOpcode == AArch64::BL ||
6095 ((LastInstrOpcode == AArch64::BLR ||
6096 LastInstrOpcode == AArch64::BLRNoIP) &&
6097 !HasBTI)) {
6098 // FIXME: Do we need to check if the code after this uses the value of LR?
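// A thunk keeps the final call as a tail call inside the outlined function,
// so each call site is a single 4-byte BL and the frame needs no extra bytes.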
6099 FrameID = MachineOutlinerThunk;
6100 NumBytesToCreateFrame = 0;
6101 SetCandidateCallInfo(MachineOutlinerThunk, 4);
6102 }
6103
6104 else {
6105 // We need to decide how to emit calls + frames. We can always emit the same
6106 // frame if we don't need to save to the stack. If we have to save to the
6107 // stack, then we need a different frame.
6108 unsigned NumBytesNoStackCalls = 0;
6109 std::vector<outliner::Candidate> CandidatesWithoutStackFixups;
6110
6111 // Check if we have to save LR.
6112 for (outliner::Candidate &C : RepeatedSequenceLocs) {
6113 C.initLRU(TRI);
6114
6115 // If we have a noreturn caller, then we're going to be conservative and
6116 // say that we have to save LR. If we don't have a ret at the end of the
6117 // block, then we can't reason about liveness accurately.
6118 //
6119 // FIXME: We can probably do better than always disabling this in
6120 // noreturn functions by fixing up the liveness info.
6121 bool IsNoReturn =
6122 C.getMF()->getFunction().hasFnAttribute(Attribute::NoReturn);
6123
6124 // Is LR available? If so, we don't need a save.
6125 if (C.LRU.available(AArch64::LR) && !IsNoReturn) {
6126 NumBytesNoStackCalls += 4;
6127 C.setCallInfo(MachineOutlinerNoLRSave, 4);
6128 CandidatesWithoutStackFixups.push_back(C);
6129 }
6130
6131 // Is an unused register available? If so, we won't modify the stack, so
6132 // we can outline with the same frame type as those that don't save LR.
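// Saving LR to a spare register costs two register moves around the call,
// i.e. 12 bytes per call site.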
6133 else if (findRegisterToSaveLRTo(C)) {
6134 NumBytesNoStackCalls += 12;
6135 C.setCallInfo(MachineOutlinerRegSave, 12);
6136 CandidatesWithoutStackFixups.push_back(C);
6137 }
6138
6139 // Is SP used in the sequence at all? If not, we don't have to modify
6140 // the stack, so we are guaranteed to get the same frame.
6141 else if (C.UsedInSequence.available(AArch64::SP)) {
6142 NumBytesNoStackCalls += 12;
6143 C.setCallInfo(MachineOutlinerDefault, 12);
6144 CandidatesWithoutStackFixups.push_back(C);
6145 }
6146
6147 // If we outline this, we need to modify the stack. Pretend we don't
6148 // outline this candidate by charging its full sequence size in bytes.
6149 else {
6150 NumBytesNoStackCalls += SequenceSize;
6151 }
6152 }
6153
6154 // If there are no places where we have to save LR, then note that we
6155 // don't have to update the stack. Otherwise, give every candidate the
6156 // default call type, as long as it's safe to do so.
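// A default call costs 12 bytes per candidate (save LR to the stack, BL,
// restore LR), so the fixup-free variants win whenever their total size is
// no larger.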
6157 if (!AllStackInstrsSafe ||
6158 NumBytesNoStackCalls <= RepeatedSequenceLocs.size() * 12) {
6159 RepeatedSequenceLocs = CandidatesWithoutStackFixups;
6160 FrameID = MachineOutlinerNoLRSave;
6161 } else {
6162 SetCandidateCallInfo(MachineOutlinerDefault, 12);
6163 }
6164
6165 // If we dropped all of the candidates, bail out here.
6166 if (RepeatedSequenceLocs.size() < 2) {
6167 RepeatedSequenceLocs.clear();
6168 return outliner::OutlinedFunction();
6169 }
6170 }
6171
6172 // Does every candidate's MBB contain a call? If so, then we might have a call
6173 // in the range.
6174 if (FlagsSetInAll & MachineOutlinerMBBFlags::HasCalls) {
6175 // Check if the range contains a call. These require a save + restore of the
6176 // link register.
6177 bool ModStackToSaveLR = false;
6178 if (std::any_of(FirstCand.front(), FirstCand.back(),
6179 [](const MachineInstr &MI) { return MI.isCall(); }))
6180 ModStackToSaveLR = true;
6181
6182 // Handle the last instruction separately. If this is a tail call, then the
6183 // last instruction is a call. We don't want to save + restore in this case.
6184 // However, it could be possible that the last instruction is a call without
6185 // it being valid to tail call this sequence. We should consider this as
6186 // well.
6187 else if (FrameID != MachineOutlinerThunk &&
6188 FrameID != MachineOutlinerTailCall && FirstCand.back()->isCall())
6189 ModStackToSaveLR = true;
6190
6191 if (ModStackToSaveLR) {
6192 // We can't fix up the stack. Bail out.
6193 if (!AllStackInstrsSafe) {
6194 RepeatedSequenceLocs.clear();
6195 return outliner::OutlinedFunction();
6196 }
6197
6198 // Save + restore LR.
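// That is one 4-byte store and one 4-byte load of LR in the outlined frame.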
6199 NumBytesToCreateFrame += 8;
6200 }
6201 }
6202
6203 // If we have CFI instructions, we can only outline if the outlined section
6204 // can be a tail call.
6205 if (FrameID != MachineOutlinerTailCall && CFICount > 0)
6206 return outliner::OutlinedFunction();
6207
6208 return outliner::OutlinedFunction(RepeatedSequenceLocs, SequenceSize,
6209 NumBytesToCreateFrame, FrameID);
6210 }
6211
6212 bool AArch64InstrInfo::isFunctionSafeToOutlineFrom(
6213 MachineFunction &MF, bool OutlineFromLinkOnceODRs) const {
6214 const Function &F = MF.getFunction();
6215
6216 // Can F be deduplicated by the linker? If it can, don't outline from it.
6217 if (!OutlineFromLinkOnceODRs && F.hasLinkOnceODRLinkage())
6218 return false;
6219
6220 // Don't outline from functions with section markings; the program could
6221 // expect that all the code is in the named section.
6222 // FIXME: Allow outlining from multiple functions with the same section
6223 // marking.
6224 if (F.hasSection())
6225 return false;
6226
6227 // Outlining from functions with redzones is unsafe since the outliner may
6228 // modify the stack. Check if hasRedZone is true or unknown; if yes, don't
6229 // outline from it.
6230 AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
6231 if (!AFI || AFI->hasRedZone().getValueOr(true))
6232 return false;
6233
6234 // FIXME: Teach the outliner to generate/handle Windows unwind info.
6235 if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI())
6236 return false;
6237
6238 // It's safe to outline from MF.
6239 return true;
6240 }
6241
6242 bool AArch64InstrInfo::isMBBSafeToOutlineFrom(MachineBasicBlock &MBB,
6243 unsigned &Flags) const {
6244 // Check if LR is available through all of the MBB. If it's not, then set
6245 // a flag.
6246 assert(MBB.getParent()->getRegInfo().tracksLiveness() &&
6247 "Suitable Machine Function for outlining must track liveness");
6248 LiveRegUnits LRU(getRegisterInfo());
6249
6250 std::for_each(MBB.rbegin(), MBB.rend(),
6251 [&LRU](MachineInstr &MI) { LRU.accumulate(MI); });
6252
6253 // Check if each of the unsafe registers is available...
6254 bool W16AvailableInBlock = LRU.available(AArch64::W16);
6255 bool W17AvailableInBlock = LRU.available(AArch64::W17);
6256 bool NZCVAvailableInBlock = LRU.available(AArch64::NZCV);
6257
6258 // If all of these are dead (and not live out), we know we don't have to check
6259 // them later.
6260 if (W16AvailableInBlock && W17AvailableInBlock && NZCVAvailableInBlock)
6261 Flags |= MachineOutlinerMBBFlags::UnsafeRegsDead;
6262
6263 // Now, add the live outs to the set.
6264 LRU.addLiveOuts(MBB);
6265
6266 // If any of these registers is available in the MBB, but also a live out of
6267 // the block, then we know outlining is unsafe.
6268 if (W16AvailableInBlock && !LRU.available(AArch64::W16))
6269 return false;
6270 if (W17AvailableInBlock && !LRU.available(AArch64::W17))
6271 return false;
6272 if (NZCVAvailableInBlock && !LRU.available(AArch64::NZCV))
6273 return false;
6274
6275 // Check if there's a call inside this MachineBasicBlock. If there is, then
6276 // set a flag.
6277 if (any_of(MBB, [](MachineInstr &MI) { return MI.isCall(); }))
6278 Flags |= MachineOutlinerMBBFlags::HasCalls;
6279
6280 MachineFunction *MF = MBB.getParent();
6281
6282 // In the event that we outline, we may have to save LR. If there is an
6283 // available register in the MBB, then we'll always save LR there. Check if
6284 // this is true.
6285 bool CanSaveLR = false;
6286 const AArch64RegisterInfo *ARI = static_cast<const AArch64RegisterInfo *>(
6287 MF->getSubtarget().getRegisterInfo());
6288
6289 // Check if there is an available register across the sequence that we can
6290 // use.
6291 for (unsigned Reg : AArch64::GPR64RegClass) {
6292 if (!ARI->isReservedReg(*MF, Reg) && Reg != AArch64::LR &&
6293 Reg != AArch64::X16 && Reg != AArch64::X17 && LRU.available(Reg)) {
6294 CanSaveLR = true;
6295 break;
6296 }
6297 }
6298
6299 // Check if we have a register we can save LR to, and if LR was used
6300 // somewhere. If both of those things are true, then we need to evaluate the
6301 // safety of outlining stack instructions later.
6302 if (!CanSaveLR && !LRU.available(AArch64::LR))
6303 Flags |= MachineOutlinerMBBFlags::LRUnavailableSomewhere;
6304
6305 return true;
6306 }
6307
6308 outliner::InstrType
6309 AArch64InstrInfo::getOutliningType(MachineBasicBlock::iterator &MIT,
6310 unsigned Flags) const {
6311 MachineInstr &MI = *MIT;
6312 MachineBasicBlock *MBB = MI.getParent();
6313 MachineFunction *MF = MBB->getParent();
6314 AArch64FunctionInfo *FuncInfo = MF->getInfo<AArch64FunctionInfo>();
6315
6316 // Don't outline anything used for return address signing. The outlined
6317 // function will get signed later if needed
6318 switch (MI.getOpcode()) {
6319 case AArch64::PACIASP:
6320 case AArch64::PACIBSP:
6321 case AArch64::AUTIASP:
6322 case AArch64::AUTIBSP:
6323 case AArch64::RETAA:
6324 case AArch64::RETAB:
6325 case AArch64::EMITBKEY:
6326 return outliner::InstrType::Illegal;
6327 }
6328
6329 // Don't outline LOHs.
6330 if (FuncInfo->getLOHRelated().count(&MI))
6331 return outliner::InstrType::Illegal;
6332
6333 // We can only outline these if we will tail call the outlined function, or
6334 // fix up the CFI offsets. Currently, CFI instructions are outlined only if
6335 // they end up in a tail call.
6336 //
6337 // FIXME: If the proper fixups for the offset are implemented, this should be
6338 // possible.
6339 if (MI.isCFIInstruction())
6340 return outliner::InstrType::Legal;
6341
6342 // Don't allow debug values to impact outlining type.
6343 if (MI.isDebugInstr() || MI.isIndirectDebugValue())
6344 return outliner::InstrType::Invisible;
6345
6346 // At this point, KILL instructions don't really tell us much so we can go
6347 // ahead and skip over them.
6348 if (MI.isKill())
6349 return outliner::InstrType::Invisible;
6350
6351 // Is this a terminator for a basic block?
6352 if (MI.isTerminator()) {
6353
6354 // Is this the end of a function?
6355 if (MI.getParent()->succ_empty())
6356 return outliner::InstrType::Legal;
6357
6358 // It's not, so don't outline it.
6359 return outliner::InstrType::Illegal;
6360 }
6361
6362 // Make sure none of the operands are un-outlinable.
6363 for (const MachineOperand &MOP : MI.operands()) {
6364 if (MOP.isCPI() || MOP.isJTI() || MOP.isCFIIndex() || MOP.isFI() ||
6365 MOP.isTargetIndex())
6366 return outliner::InstrType::Illegal;
6367
6368 // If it uses LR or W30 explicitly, then don't touch it.
6369 if (MOP.isReg() && !MOP.isImplicit() &&
6370 (MOP.getReg() == AArch64::LR || MOP.getReg() == AArch64::W30))
6371 return outliner::InstrType::Illegal;
6372 }
6373
6374 // Special cases for instructions that can always be outlined, but will fail
6375 // the later tests, e.g. ADRPs: they are PC-relative, but can always
6376 // be outlined because they don't require a *specific* value to be in LR.
6377 if (MI.getOpcode() == AArch64::ADRP)
6378 return outliner::InstrType::Legal;
6379
6380 // If MI is a call we might be able to outline it. We don't want to outline
6381 // any calls that rely on the position of items on the stack. When we outline
6382 // something containing a call, we have to emit a save and restore of LR in
6383 // the outlined function. Currently, this always happens by saving LR to the
6384 // stack. Thus, if we outline, say, half the parameters for a function call
6385 // plus the call, then we'll break the callee's expectations for the layout
6386 // of the stack.
6387 //
6388 // FIXME: Allow calls to functions which construct a stack frame, as long
6389 // as they don't access arguments on the stack.
6390 // FIXME: Figure out some way to analyze functions defined in other modules.
6391 // We should be able to compute the memory usage based on the IR calling
6392 // convention, even if we can't see the definition.
6393 if (MI.isCall()) {
6394 // Get the function associated with the call. Look at each operand and find
6395 // the one that represents the callee and get its name.
6396 const Function *Callee = nullptr;
6397 for (const MachineOperand &MOP : MI.operands()) {
6398 if (MOP.isGlobal()) {
6399 Callee = dyn_cast<Function>(MOP.getGlobal());
6400 break;
6401 }
6402 }
6403
6404 // Never outline calls to mcount. There isn't any rule that would require
6405 // this, but the Linux kernel's "ftrace" feature depends on it.
6406 if (Callee && Callee->getName() == "\01_mcount")
6407 return outliner::InstrType::Illegal;
6408
6409 // If we don't know anything about the callee, assume it depends on the
6410 // stack layout of the caller. In that case, it's only legal to outline
6411 // as a tail-call. Explicitly list the call instructions we know about so we
6412 // don't get unexpected results with call pseudo-instructions.
6413 auto UnknownCallOutlineType = outliner::InstrType::Illegal;
6414 if (MI.getOpcode() == AArch64::BLR ||
6415 MI.getOpcode() == AArch64::BLRNoIP || MI.getOpcode() == AArch64::BL)
6416 UnknownCallOutlineType = outliner::InstrType::LegalTerminator;
6417
6418 if (!Callee)
6419 return UnknownCallOutlineType;
6420
6421 // We have a function we have information about. Check if it's something we
6422 // can safely outline.
6423 MachineFunction *CalleeMF = MF->getMMI().getMachineFunction(*Callee);
6424
6425 // We don't know what's going on with the callee at all. Don't touch it.
6426 if (!CalleeMF)
6427 return UnknownCallOutlineType;
6428
6429 // Check if we know anything about the callee saves on the function. If we
6430 // don't, then don't touch it, since that implies that we haven't
6431 // computed anything about its stack frame yet.
6432 MachineFrameInfo &MFI = CalleeMF->getFrameInfo();
6433 if (!MFI.isCalleeSavedInfoValid() || MFI.getStackSize() > 0 ||
6434 MFI.getNumObjects() > 0)
6435 return UnknownCallOutlineType;
6436
6437 // At this point, we can say that CalleeMF ought to not pass anything on the
6438 // stack. Therefore, we can outline it.
6439 return outliner::InstrType::Legal;
6440 }
6441
6442 // Don't outline positions.
6443 if (MI.isPosition())
6444 return outliner::InstrType::Illegal;
6445
6446 // Don't touch the link register or W30.
6447 if (MI.readsRegister(AArch64::W30, &getRegisterInfo()) ||
6448 MI.modifiesRegister(AArch64::W30, &getRegisterInfo()))
6449 return outliner::InstrType::Illegal;
6450
6451 // Don't outline BTI instructions, because that will prevent the outlining
6452 // site from being indirectly callable.
6453 if (MI.getOpcode() == AArch64::HINT) {
6454 int64_t Imm = MI.getOperand(0).getImm();
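// HINT #32, #34, #36 and #38 are BTI, BTI C, BTI J and BTI JC respectively.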
6455 if (Imm == 32 || Imm == 34 || Imm == 36 || Imm == 38)
6456 return outliner::InstrType::Illegal;
6457 }
6458
6459 return outliner::InstrType::Legal;
6460 }
6461
6462 void AArch64InstrInfo::fixupPostOutline(MachineBasicBlock &MBB) const {
6463 for (MachineInstr &MI : MBB) {
6464 const MachineOperand *Base;
6465 unsigned Width;
6466 int64_t Offset;
6467 bool OffsetIsScalable;
6468
6469 // Is this a load or store with an immediate offset with SP as the base?
6470 if (!MI.mayLoadOrStore() ||
6471 !getMemOperandWithOffsetWidth(MI, Base, Offset, OffsetIsScalable, Width,
6472 &RI) ||
6473 (Base->isReg() && Base->getReg() != AArch64::SP))
6474 continue;
6475
6476 // It is, so we have to fix it up.
6477 TypeSize Scale(0U, false);
6478 int64_t Dummy1, Dummy2;
6479
6480 MachineOperand &StackOffsetOperand = getMemOpBaseRegImmOfsOffsetOperand(MI);
6481 assert(StackOffsetOperand.isImm() && "Stack offset wasn't immediate!");
6482 getMemOpInfo(MI.getOpcode(), Scale, Width, Dummy1, Dummy2);
6483 assert(Scale != 0 && "Unexpected opcode!");
6484 assert(!OffsetIsScalable && "Expected offset to be a byte offset");
6485
6486 // We've pushed the return address to the stack, so add 16 to the offset.
6487 // This is safe, since we already checked if it would overflow when we
6488 // checked if this instruction was legal to outline.
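// The immediate operand is stored in units of Scale, so convert the adjusted
// byte offset back into scaled units before writing it.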
6489 int64_t NewImm = (Offset + 16) / (int64_t)Scale.getFixedSize();
6490 StackOffsetOperand.setImm(NewImm);
6491 }
6492 }
6493
6494 static void signOutlinedFunction(MachineFunction &MF, MachineBasicBlock &MBB,
6495 bool ShouldSignReturnAddr,
6496 bool ShouldSignReturnAddrWithAKey) {
6497 if (ShouldSignReturnAddr) {
6498 MachineBasicBlock::iterator MBBPAC = MBB.begin();
6499 MachineBasicBlock::iterator MBBAUT = MBB.getFirstTerminator();
6500 const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
6501 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
6502 DebugLoc DL;
6503
6504 if (MBBAUT != MBB.end())
6505 DL = MBBAUT->getDebugLoc();
6506
6507 // At the very beginning of the basic block we insert the following
6508 // depending on the key type
6509 //
6510 // a_key: b_key:
6511 // PACIASP EMITBKEY
6512 // CFI_INSTRUCTION PACIBSP
6513 // CFI_INSTRUCTION
6514 if (ShouldSignReturnAddrWithAKey) {
6515 BuildMI(MBB, MBBPAC, DebugLoc(), TII->get(AArch64::PACIASP))
6516 .setMIFlag(MachineInstr::FrameSetup);
6517 } else {
6518 BuildMI(MBB, MBBPAC, DebugLoc(), TII->get(AArch64::EMITBKEY))
6519 .setMIFlag(MachineInstr::FrameSetup);
6520 BuildMI(MBB, MBBPAC, DebugLoc(), TII->get(AArch64::PACIBSP))
6521 .setMIFlag(MachineInstr::FrameSetup);
6522 }
6523 unsigned CFIIndex =
6524 MF.addFrameInst(MCCFIInstruction::createNegateRAState(nullptr));
6525 BuildMI(MBB, MBBPAC, DebugLoc(), TII->get(AArch64::CFI_INSTRUCTION))
6526 .addCFIIndex(CFIIndex)
6527 .setMIFlags(MachineInstr::FrameSetup);
6528
6529 // If v8.3a features are available we can replace a RET instruction with
6530 // RETAA or RETAB and omit the AUT instructions.
6531 if (Subtarget.hasV8_3aOps() && MBBAUT != MBB.end() &&
6532 MBBAUT->getOpcode() == AArch64::RET) {
6533 BuildMI(MBB, MBBAUT, DL,
6534 TII->get(ShouldSignReturnAddrWithAKey ? AArch64::RETAA
6535 : AArch64::RETAB))
6536 .copyImplicitOps(*MBBAUT);
6537 MBB.erase(MBBAUT);
6538 } else {
6539 BuildMI(MBB, MBBAUT, DL,
6540 TII->get(ShouldSignReturnAddrWithAKey ? AArch64::AUTIASP
6541 : AArch64::AUTIBSP))
6542 .setMIFlag(MachineInstr::FrameDestroy);
6543 }
6544 }
6545 }
6546
6547 void AArch64InstrInfo::buildOutlinedFrame(
6548 MachineBasicBlock &MBB, MachineFunction &MF,
6549 const outliner::OutlinedFunction &OF) const {
6550
6551 AArch64FunctionInfo *FI = MF.getInfo<AArch64FunctionInfo>();
6552
6553 if (OF.FrameConstructionID == MachineOutlinerTailCall)
6554 FI->setOutliningStyle("Tail Call");
6555 else if (OF.FrameConstructionID == MachineOutlinerThunk) {
6556 // For thunk outlining, rewrite the last instruction from a call to a
6557 // tail-call.
6558 MachineInstr *Call = &*--MBB.instr_end();
6559 unsigned TailOpcode;
6560 if (Call->getOpcode() == AArch64::BL) {
6561 TailOpcode = AArch64::TCRETURNdi;
6562 } else {
6563 assert(Call->getOpcode() == AArch64::BLR ||
6564 Call->getOpcode() == AArch64::BLRNoIP);
6565 TailOpcode = AArch64::TCRETURNriALL;
6566 }
6567 MachineInstr *TC = BuildMI(MF, DebugLoc(), get(TailOpcode))
6568 .add(Call->getOperand(0))
6569 .addImm(0);
6570 MBB.insert(MBB.end(), TC);
6571 Call->eraseFromParent();
6572
6573 FI->setOutliningStyle("Thunk");
6574 }
6575
6576 bool IsLeafFunction = true;
6577
6578 // Is there a call in the outlined range?
6579 auto IsNonTailCall = [](const MachineInstr &MI) {
6580 return MI.isCall() && !MI.isReturn();
6581 };
6582
6583 if (std::any_of(MBB.instr_begin(), MBB.instr_end(), IsNonTailCall)) {
6584 // Fix up the instructions in the range, since we're going to modify the
6585 // stack.
6586 assert(OF.FrameConstructionID != MachineOutlinerDefault &&
6587 "Can only fix up stack references once");
6588 fixupPostOutline(MBB);
6589
6590 IsLeafFunction = false;
6591
6592 // LR has to be a live in so that we can save it.
6593 if (!MBB.isLiveIn(AArch64::LR))
6594 MBB.addLiveIn(AArch64::LR);
6595
6596 MachineBasicBlock::iterator It = MBB.begin();
6597 MachineBasicBlock::iterator Et = MBB.end();
6598
6599 if (OF.FrameConstructionID == MachineOutlinerTailCall ||
6600 OF.FrameConstructionID == MachineOutlinerThunk)
6601 Et = std::prev(MBB.end());
6602
6603 // Insert a save before the outlined region
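// This is the pre-indexed store "str lr, [sp, #-16]!", which also creates the
// 16-byte slot described by the CFI instructions below.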
6604 MachineInstr *STRXpre = BuildMI(MF, DebugLoc(), get(AArch64::STRXpre))
6605 .addReg(AArch64::SP, RegState::Define)
6606 .addReg(AArch64::LR)
6607 .addReg(AArch64::SP)
6608 .addImm(-16);
6609 It = MBB.insert(It, STRXpre);
6610
6611 const TargetSubtargetInfo &STI = MF.getSubtarget();
6612 const MCRegisterInfo *MRI = STI.getRegisterInfo();
6613 unsigned DwarfReg = MRI->getDwarfRegNum(AArch64::LR, true);
6614
6615 // Add a CFI saying the stack was moved 16 B down.
6616 int64_t StackPosEntry =
6617 MF.addFrameInst(MCCFIInstruction::cfiDefCfaOffset(nullptr, 16));
6618 BuildMI(MBB, It, DebugLoc(), get(AArch64::CFI_INSTRUCTION))
6619 .addCFIIndex(StackPosEntry)
6620 .setMIFlags(MachineInstr::FrameSetup);
6621
6622 // Add a CFI saying that the LR that we want to find is now 16 B higher than
6623 // before.
6624 int64_t LRPosEntry =
6625 MF.addFrameInst(MCCFIInstruction::createOffset(nullptr, DwarfReg, -16));
6626 BuildMI(MBB, It, DebugLoc(), get(AArch64::CFI_INSTRUCTION))
6627 .addCFIIndex(LRPosEntry)
6628 .setMIFlags(MachineInstr::FrameSetup);
6629
6630 // Insert a restore before the terminator for the function.
6631 MachineInstr *LDRXpost = BuildMI(MF, DebugLoc(), get(AArch64::LDRXpost))
6632 .addReg(AArch64::SP, RegState::Define)
6633 .addReg(AArch64::LR, RegState::Define)
6634 .addReg(AArch64::SP)
6635 .addImm(16);
6636 Et = MBB.insert(Et, LDRXpost);
6637 }
6638
6639 // If multiple candidates reach this point they must agree on their return
6640 // address signing. It is therefore enough to just consider the signing
6641 // behaviour of one of them.
6642 const Function &CF = OF.Candidates.front().getMF()->getFunction();
6643 bool ShouldSignReturnAddr = false;
6644 if (CF.hasFnAttribute("sign-return-address")) {
6645 StringRef Scope =
6646 CF.getFnAttribute("sign-return-address").getValueAsString();
6647 if (Scope.equals("all"))
6648 ShouldSignReturnAddr = true;
6649 else if (Scope.equals("non-leaf") && !IsLeafFunction)
6650 ShouldSignReturnAddr = true;
6651 }
6652
6653 // a_key is the default
6654 bool ShouldSignReturnAddrWithAKey = true;
6655 if (CF.hasFnAttribute("sign-return-address-key")) {
6656 const StringRef Key =
6657 CF.getFnAttribute("sign-return-address-key").getValueAsString();
6658 // Key can either be a_key or b_key
6659 assert((Key.equals_lower("a_key") || Key.equals_lower("b_key")) &&
6660 "Return address signing key must be either a_key or b_key");
6661 ShouldSignReturnAddrWithAKey = Key.equals_lower("a_key");
6662 }
6663
6664 // If this is a tail call outlined function, then there's already a return.
6665 if (OF.FrameConstructionID == MachineOutlinerTailCall ||
6666 OF.FrameConstructionID == MachineOutlinerThunk) {
6667 signOutlinedFunction(MF, MBB, ShouldSignReturnAddr,
6668 ShouldSignReturnAddrWithAKey);
6669 return;
6670 }
6671
6672 // It's not a tail call, so we have to insert the return ourselves.
6673
6674 // LR has to be a live in so that we can return to it.
6675 if (!MBB.isLiveIn(AArch64::LR))
6676 MBB.addLiveIn(AArch64::LR);
6677
6678 MachineInstr *ret = BuildMI(MF, DebugLoc(), get(AArch64::RET))
6679 .addReg(AArch64::LR);
6680 MBB.insert(MBB.end(), ret);
6681
6682 signOutlinedFunction(MF, MBB, ShouldSignReturnAddr,
6683 ShouldSignReturnAddrWithAKey);
6684
6685 FI->setOutliningStyle("Function");
6686
6687 // Did we have to modify the stack by saving the link register?
6688 if (OF.FrameConstructionID != MachineOutlinerDefault)
6689 return;
6690
6691 // We modified the stack.
6692 // Walk over the basic block and fix up all the stack accesses.
6693 fixupPostOutline(MBB);
6694 }
6695
6696 MachineBasicBlock::iterator AArch64InstrInfo::insertOutlinedCall(
6697 Module &M, MachineBasicBlock &MBB, MachineBasicBlock::iterator &It,
6698 MachineFunction &MF, const outliner::Candidate &C) const {
6699
6700 // Are we tail calling?
6701 if (C.CallConstructionID == MachineOutlinerTailCall) {
6702 // If yes, then we can just branch to the label.
6703 It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::TCRETURNdi))
6704 .addGlobalAddress(M.getNamedValue(MF.getName()))
6705 .addImm(0));
6706 return It;
6707 }
6708
6709 // Are we saving the link register?
6710 if (C.CallConstructionID == MachineOutlinerNoLRSave ||
6711 C.CallConstructionID == MachineOutlinerThunk) {
6712 // No, so just insert the call.
6713 It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::BL))
6714 .addGlobalAddress(M.getNamedValue(MF.getName())));
6715 return It;
6716 }
6717
6718 // We want to return the spot where we inserted the call.
6719 MachineBasicBlock::iterator CallPt;
6720
6721 // Instructions for saving and restoring LR around the call instruction we're
6722 // going to insert.
6723 MachineInstr *Save;
6724 MachineInstr *Restore;
6725 // Can we save to a register?
6726 if (C.CallConstructionID == MachineOutlinerRegSave) {
6727 // FIXME: This logic should be sunk into a target-specific interface so that
6728 // we don't have to recompute the register.
6729 unsigned Reg = findRegisterToSaveLRTo(C);
6730 assert(Reg != 0 && "No callee-saved register available?");
6731
6732 // Save and restore LR from that register.
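// ORRXrs with XZR and a zero shift is the canonical MOV (register) alias, so
// this emits "mov x<reg>, lr" before the call and "mov lr, x<reg>" after it.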
6733 Save = BuildMI(MF, DebugLoc(), get(AArch64::ORRXrs), Reg)
6734 .addReg(AArch64::XZR)
6735 .addReg(AArch64::LR)
6736 .addImm(0);
6737 Restore = BuildMI(MF, DebugLoc(), get(AArch64::ORRXrs), AArch64::LR)
6738 .addReg(AArch64::XZR)
6739 .addReg(Reg)
6740 .addImm(0);
6741 } else {
6742 // We have the default case. Save and restore from SP.
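// That is "str lr, [sp, #-16]!" before the call and "ldr lr, [sp], #16"
// after it.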
6743 Save = BuildMI(MF, DebugLoc(), get(AArch64::STRXpre))
6744 .addReg(AArch64::SP, RegState::Define)
6745 .addReg(AArch64::LR)
6746 .addReg(AArch64::SP)
6747 .addImm(-16);
6748 Restore = BuildMI(MF, DebugLoc(), get(AArch64::LDRXpost))
6749 .addReg(AArch64::SP, RegState::Define)
6750 .addReg(AArch64::LR, RegState::Define)
6751 .addReg(AArch64::SP)
6752 .addImm(16);
6753 }
6754
6755 It = MBB.insert(It, Save);
6756 It++;
6757
6758 // Insert the call.
6759 It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::BL))
6760 .addGlobalAddress(M.getNamedValue(MF.getName())));
6761 CallPt = It;
6762 It++;
6763
6764 It = MBB.insert(It, Restore);
6765 return CallPt;
6766 }
6767
6768 bool AArch64InstrInfo::shouldOutlineFromFunctionByDefault(
6769 MachineFunction &MF) const {
6770 return MF.getFunction().hasMinSize();
6771 }
6772
6773 Optional<DestSourcePair>
6774 AArch64InstrInfo::isCopyInstrImpl(const MachineInstr &MI) const {
6775
6776 // AArch64::ORRWrs and AArch64::ORRXrs with a WZR/XZR source register
6777 // and a zero immediate operand are used as an alias for the mov instruction.
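// For example, "orr w0, wzr, w1" is "mov w0, w1": operand 0 is the
// destination and operand 2 is the real source.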
6778 if (MI.getOpcode() == AArch64::ORRWrs &&
6779 MI.getOperand(1).getReg() == AArch64::WZR &&
6780 MI.getOperand(3).getImm() == 0x0) {
6781 return DestSourcePair{MI.getOperand(0), MI.getOperand(2)};
6782 }
6783
6784 if (MI.getOpcode() == AArch64::ORRXrs &&
6785 MI.getOperand(1).getReg() == AArch64::XZR &&
6786 MI.getOperand(3).getImm() == 0x0) {
6787 return DestSourcePair{MI.getOperand(0), MI.getOperand(2)};
6788 }
6789
6790 return None;
6791 }
6792
6793 Optional<RegImmPair> AArch64InstrInfo::isAddImmediate(const MachineInstr &MI,
6794 Register Reg) const {
6795 int Sign = 1;
6796 int64_t Offset = 0;
6797
6798 // TODO: Handle cases where Reg is a super- or sub-register of the
6799 // destination register.
6800 const MachineOperand &Op0 = MI.getOperand(0);
6801 if (!Op0.isReg() || Reg != Op0.getReg())
6802 return None;
6803
6804 switch (MI.getOpcode()) {
6805 default:
6806 return None;
6807 case AArch64::SUBWri:
6808 case AArch64::SUBXri:
6809 case AArch64::SUBSWri:
6810 case AArch64::SUBSXri:
6811 Sign *= -1;
6812 LLVM_FALLTHROUGH;
6813 case AArch64::ADDSWri:
6814 case AArch64::ADDSXri:
6815 case AArch64::ADDWri:
6816 case AArch64::ADDXri: {
6817 // TODO: Third operand can be global address (usually some string).
6818 if (!MI.getOperand(0).isReg() || !MI.getOperand(1).isReg() ||
6819 !MI.getOperand(2).isImm())
6820 return None;
6821 Offset = MI.getOperand(2).getImm() * Sign;
6822 int Shift = MI.getOperand(3).getImm();
6823 assert((Shift == 0 || Shift == 12) && "Shift can be either 0 or 12");
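// ADD/SUB (immediate) may shift its 12-bit immediate left by 12, so apply the
// shift to get the byte offset.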
6824 Offset = Offset << Shift;
6825 }
6826 }
6827 return RegImmPair{MI.getOperand(1).getReg(), Offset};
6828 }
6829
6830 /// If the given ORR instruction is a copy, and \p DescribedReg overlaps with
6831 /// the destination register then, if possible, describe the value in terms of
6832 /// the source register.
6833 static Optional<ParamLoadedValue>
6834 describeORRLoadedValue(const MachineInstr &MI, Register DescribedReg,
6835 const TargetInstrInfo *TII,
6836 const TargetRegisterInfo *TRI) {
6837 auto DestSrc = TII->isCopyInstr(MI);
6838 if (!DestSrc)
6839 return None;
6840
6841 Register DestReg = DestSrc->Destination->getReg();
6842 Register SrcReg = DestSrc->Source->getReg();
6843
6844 auto Expr = DIExpression::get(MI.getMF()->getFunction().getContext(), {});
6845
6846 // If the described register is the destination, just return the source.
6847 if (DestReg == DescribedReg)
6848 return ParamLoadedValue(MachineOperand::CreateReg(SrcReg, false), Expr);
6849
6850 // ORRWrs zero-extends to 64-bits, so we need to consider such cases.
6851 if (MI.getOpcode() == AArch64::ORRWrs &&
6852 TRI->isSuperRegister(DestReg, DescribedReg))
6853 return ParamLoadedValue(MachineOperand::CreateReg(SrcReg, false), Expr);
6854
6855 // We may need to describe the lower part of a ORRXrs move.
6856 if (MI.getOpcode() == AArch64::ORRXrs &&
6857 TRI->isSubRegister(DestReg, DescribedReg)) {
6858 Register SrcSubReg = TRI->getSubReg(SrcReg, AArch64::sub_32);
6859 return ParamLoadedValue(MachineOperand::CreateReg(SrcSubReg, false), Expr);
6860 }
6861
6862 assert(!TRI->isSuperOrSubRegisterEq(DestReg, DescribedReg) &&
6863 "Unhandled ORR[XW]rs copy case");
6864
6865 return None;
6866 }
6867
6868 Optional<ParamLoadedValue>
6869 AArch64InstrInfo::describeLoadedValue(const MachineInstr &MI,
6870 Register Reg) const {
6871 const MachineFunction *MF = MI.getMF();
6872 const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
6873 switch (MI.getOpcode()) {
6874 case AArch64::MOVZWi:
6875 case AArch64::MOVZXi: {
6876 // MOVZWi may be used for producing zero-extended 32-bit immediates in
6877 // 64-bit parameters, so we need to consider super-registers.
6878 if (!TRI->isSuperRegisterEq(MI.getOperand(0).getReg(), Reg))
6879 return None;
6880
6881 if (!MI.getOperand(1).isImm())
6882 return None;
6883 int64_t Immediate = MI.getOperand(1).getImm();
6884 int Shift = MI.getOperand(2).getImm();
6885 return ParamLoadedValue(MachineOperand::CreateImm(Immediate << Shift),
6886 nullptr);
6887 }
6888 case AArch64::ORRWrs:
6889 case AArch64::ORRXrs:
6890 return describeORRLoadedValue(MI, Reg, this, TRI);
6891 }
6892
6893 return TargetInstrInfo::describeLoadedValue(MI, Reg);
6894 }
6895
6896 uint64_t AArch64InstrInfo::getElementSizeForOpcode(unsigned Opc) const {
6897 return get(Opc).TSFlags & AArch64::ElementSizeMask;
6898 }
6899
6900 unsigned llvm::getBLRCallOpcode(const MachineFunction &MF) {
6901 if (MF.getSubtarget<AArch64Subtarget>().hardenSlsBlr())
6902 return AArch64::BLRNoIP;
6903 else
6904 return AArch64::BLR;
6905 }
6906
6907 #define GET_INSTRINFO_HELPERS
6908 #define GET_INSTRMAP_INFO
6909 #include "AArch64GenInstrInfo.inc"
6910