//===-- SIFoldOperands.cpp - Fold operands ---------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
/// \file
//===----------------------------------------------------------------------===//
//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/CodeGen/MachineFunctionPass.h"

#define DEBUG_TYPE "si-fold-operands"
using namespace llvm;

namespace {

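// A pending fold: the value of one operand (an immediate, frame index, global
// address, or register) to be substituted into operand UseOpNo of UseMI once
// all candidates for a def have been collected.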
struct FoldCandidate {
  MachineInstr *UseMI;
  union {
    MachineOperand *OpToFold;
    uint64_t ImmToFold;
    int FrameIndexToFold;
  };
  int ShrinkOpcode;
  unsigned UseOpNo;
  MachineOperand::MachineOperandType Kind;
  bool Commuted;

  FoldCandidate(MachineInstr *MI, unsigned OpNo, MachineOperand *FoldOp,
                bool Commuted_ = false, int ShrinkOp = -1)
      : UseMI(MI), OpToFold(nullptr), ShrinkOpcode(ShrinkOp), UseOpNo(OpNo),
        Kind(FoldOp->getType()), Commuted(Commuted_) {
    if (FoldOp->isImm()) {
      ImmToFold = FoldOp->getImm();
    } else if (FoldOp->isFI()) {
      FrameIndexToFold = FoldOp->getIndex();
    } else {
      assert(FoldOp->isReg() || FoldOp->isGlobal());
      OpToFold = FoldOp;
    }
  }

  bool isFI() const {
    return Kind == MachineOperand::MO_FrameIndex;
  }

  bool isImm() const {
    return Kind == MachineOperand::MO_Immediate;
  }

  bool isReg() const {
    return Kind == MachineOperand::MO_Register;
  }

  bool isGlobal() const { return Kind == MachineOperand::MO_GlobalAddress; }

  bool isCommuted() const {
    return Commuted;
  }

  bool needsShrink() const {
    return ShrinkOpcode != -1;
  }

  int getShrinkOpcode() const {
    return ShrinkOpcode;
  }
};

class SIFoldOperands : public MachineFunctionPass {
public:
  static char ID;
  MachineRegisterInfo *MRI;
  const SIInstrInfo *TII;
  const SIRegisterInfo *TRI;
  const GCNSubtarget *ST;
  const SIMachineFunctionInfo *MFI;

  void foldOperand(MachineOperand &OpToFold,
                   MachineInstr *UseMI,
                   int UseOpIdx,
                   SmallVectorImpl<FoldCandidate> &FoldList,
                   SmallVectorImpl<MachineInstr *> &CopiesToReplace) const;

  bool tryFoldCndMask(MachineInstr &MI) const;
  bool tryFoldZeroHighBits(MachineInstr &MI) const;
  void foldInstOperand(MachineInstr &MI, MachineOperand &OpToFold) const;

  const MachineOperand *isClamp(const MachineInstr &MI) const;
  bool tryFoldClamp(MachineInstr &MI);

  std::pair<const MachineOperand *, int> isOMod(const MachineInstr &MI) const;
  bool tryFoldOMod(MachineInstr &MI);
  bool tryFoldRegSequence(MachineInstr &MI);
  bool tryFoldLCSSAPhi(MachineInstr &MI);
  bool tryFoldLoad(MachineInstr &MI);

public:
  SIFoldOperands() : MachineFunctionPass(ID) {
    initializeSIFoldOperandsPass(*PassRegistry::getPassRegistry());
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "SI Fold Operands"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

} // End anonymous namespace.

INITIALIZE_PASS(SIFoldOperands, DEBUG_TYPE,
                "SI Fold Operands", false, false)

char SIFoldOperands::ID = 0;

char &llvm::SIFoldOperandsID = SIFoldOperands::ID;

// Map multiply-accumulate opcode to corresponding multiply-add opcode if any.
static unsigned macToMad(unsigned Opc) {
  switch (Opc) {
  case AMDGPU::V_MAC_F32_e64:
    return AMDGPU::V_MAD_F32_e64;
  case AMDGPU::V_MAC_F16_e64:
    return AMDGPU::V_MAD_F16_e64;
  case AMDGPU::V_FMAC_F32_e64:
    return AMDGPU::V_FMA_F32_e64;
  case AMDGPU::V_FMAC_F16_e64:
    return AMDGPU::V_FMA_F16_gfx9_e64;
  case AMDGPU::V_FMAC_LEGACY_F32_e64:
    return AMDGPU::V_FMA_LEGACY_F32_e64;
  case AMDGPU::V_FMAC_F64_e64:
    return AMDGPU::V_FMA_F64_e64;
  }
  return AMDGPU::INSTRUCTION_LIST_END;
}

// Wrapper around isInlineConstant that understands special cases when
// instruction types are replaced during operand folding.
static bool isInlineConstantIfFolded(const SIInstrInfo *TII,
                                     const MachineInstr &UseMI,
                                     unsigned OpNo,
                                     const MachineOperand &OpToFold) {
  if (TII->isInlineConstant(UseMI, OpNo, OpToFold))
    return true;

  unsigned Opc = UseMI.getOpcode();
  unsigned NewOpc = macToMad(Opc);
  if (NewOpc != AMDGPU::INSTRUCTION_LIST_END) {
    // Special case for mac. Since this is replaced with mad when folded into
    // src2, we need to check the legality for the final instruction.
    int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
    if (static_cast<int>(OpNo) == Src2Idx) {
      const MCInstrDesc &MadDesc = TII->get(NewOpc);
      return TII->isInlineConstant(OpToFold, MadDesc.OpInfo[OpNo].OperandType);
    }
  }

  return false;
}

// TODO: Add heuristic that the frame index might not fit in the addressing mode
// immediate offset to avoid materializing in loops.
static bool frameIndexMayFold(const SIInstrInfo *TII,
                              const MachineInstr &UseMI,
                              int OpNo,
                              const MachineOperand &OpToFold) {
  if (!OpToFold.isFI())
    return false;

  if (TII->isMUBUF(UseMI))
    return OpNo == AMDGPU::getNamedOperandIdx(UseMI.getOpcode(),
                                              AMDGPU::OpName::vaddr);
  if (!TII->isFLATScratch(UseMI))
    return false;

  int SIdx = AMDGPU::getNamedOperandIdx(UseMI.getOpcode(),
                                        AMDGPU::OpName::saddr);
  if (OpNo == SIdx)
    return true;

  int VIdx = AMDGPU::getNamedOperandIdx(UseMI.getOpcode(),
                                        AMDGPU::OpName::vaddr);
  return OpNo == VIdx && SIdx == -1;
}

FunctionPass *llvm::createSIFoldOperandsPass() {
  return new SIFoldOperands();
}

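// Apply the fold described by \p Fold: rewrite the use operand in place with
// the immediate, frame index, global address or register, or, for shrink
// candidates, rebuild the instruction as its 32-bit encoding. Returns true if
// the fold was applied.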
static bool updateOperand(FoldCandidate &Fold,
                          const SIInstrInfo &TII,
                          const TargetRegisterInfo &TRI,
                          const GCNSubtarget &ST) {
  MachineInstr *MI = Fold.UseMI;
  MachineOperand &Old = MI->getOperand(Fold.UseOpNo);
  assert(Old.isReg());

  if (Fold.isImm()) {
    if (MI->getDesc().TSFlags & SIInstrFlags::IsPacked &&
        !(MI->getDesc().TSFlags & SIInstrFlags::IsMAI) &&
        AMDGPU::isFoldableLiteralV216(Fold.ImmToFold,
                                      ST.hasInv2PiInlineImm())) {
      // Set op_sel/op_sel_hi on this operand or bail out if op_sel is
      // already set.
      unsigned Opcode = MI->getOpcode();
      int OpNo = MI->getOperandNo(&Old);
      int ModIdx = -1;
      if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0))
        ModIdx = AMDGPU::OpName::src0_modifiers;
      else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1))
        ModIdx = AMDGPU::OpName::src1_modifiers;
      else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2))
        ModIdx = AMDGPU::OpName::src2_modifiers;
      assert(ModIdx != -1);
      ModIdx = AMDGPU::getNamedOperandIdx(Opcode, ModIdx);
      MachineOperand &Mod = MI->getOperand(ModIdx);
      unsigned Val = Mod.getImm();
      if (!(Val & SISrcMods::OP_SEL_0) && (Val & SISrcMods::OP_SEL_1)) {
        // Only apply the following transformation if that operand requires
        // a packed immediate.
        switch (TII.get(Opcode).OpInfo[OpNo].OperandType) {
        case AMDGPU::OPERAND_REG_IMM_V2FP16:
        case AMDGPU::OPERAND_REG_IMM_V2INT16:
        case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
        case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
          // If upper part is all zero we do not need op_sel_hi.
          if (!isUInt<16>(Fold.ImmToFold)) {
            if (!(Fold.ImmToFold & 0xffff)) {
              Mod.setImm(Mod.getImm() | SISrcMods::OP_SEL_0);
              Mod.setImm(Mod.getImm() & ~SISrcMods::OP_SEL_1);
              Old.ChangeToImmediate((Fold.ImmToFold >> 16) & 0xffff);
              return true;
            }
            Mod.setImm(Mod.getImm() & ~SISrcMods::OP_SEL_1);
            Old.ChangeToImmediate(Fold.ImmToFold & 0xffff);
            return true;
          }
          break;
        default:
          break;
        }
      }
    }
  }

  if ((Fold.isImm() || Fold.isFI() || Fold.isGlobal()) && Fold.needsShrink()) {
    MachineBasicBlock *MBB = MI->getParent();
    auto Liveness = MBB->computeRegisterLiveness(&TRI, AMDGPU::VCC, MI, 16);
    if (Liveness != MachineBasicBlock::LQR_Dead) {
      LLVM_DEBUG(dbgs() << "Not shrinking " << MI << " due to vcc liveness\n");
      return false;
    }

    MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
    int Op32 = Fold.getShrinkOpcode();
    MachineOperand &Dst0 = MI->getOperand(0);
    MachineOperand &Dst1 = MI->getOperand(1);
    assert(Dst0.isDef() && Dst1.isDef());

    bool HaveNonDbgCarryUse = !MRI.use_nodbg_empty(Dst1.getReg());

    const TargetRegisterClass *Dst0RC = MRI.getRegClass(Dst0.getReg());
    Register NewReg0 = MRI.createVirtualRegister(Dst0RC);

    MachineInstr *Inst32 = TII.buildShrunkInst(*MI, Op32);

    if (HaveNonDbgCarryUse) {
      BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), Dst1.getReg())
        .addReg(AMDGPU::VCC, RegState::Kill);
    }

    // Keep the old instruction around to avoid breaking iterators, but
    // replace it with a dummy instruction to remove uses.
    //
    // FIXME: We should not invert how this pass looks at operands to avoid
    // this. Should track set of foldable movs instead of looking for uses
    // when looking at a use.
    Dst0.setReg(NewReg0);
    for (unsigned I = MI->getNumOperands() - 1; I > 0; --I)
      MI->RemoveOperand(I);
    MI->setDesc(TII.get(AMDGPU::IMPLICIT_DEF));

    if (Fold.isCommuted())
      TII.commuteInstruction(*Inst32, false);
    return true;
  }

  assert(!Fold.needsShrink() && "not handled");

  if (Fold.isImm()) {
    Old.ChangeToImmediate(Fold.ImmToFold);
    return true;
  }

  if (Fold.isGlobal()) {
    Old.ChangeToGA(Fold.OpToFold->getGlobal(), Fold.OpToFold->getOffset(),
                   Fold.OpToFold->getTargetFlags());
    return true;
  }

  if (Fold.isFI()) {
    Old.ChangeToFrameIndex(Fold.FrameIndexToFold);
    return true;
  }

  MachineOperand *New = Fold.OpToFold;
  Old.substVirtReg(New->getReg(), New->getSubReg(), TRI);
  Old.setIsUndef(New->isUndef());
  return true;
}

static bool isUseMIInFoldList(ArrayRef<FoldCandidate> FoldList,
                              const MachineInstr *MI) {
  for (auto Candidate : FoldList) {
    if (Candidate.UseMI == MI)
      return true;
  }
  return false;
}

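// Record a fold of \p FoldOp into operand \p OpNo of \p MI, unless a fold into
// that operand is already pending.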
static void appendFoldCandidate(SmallVectorImpl<FoldCandidate> &FoldList,
                                MachineInstr *MI, unsigned OpNo,
                                MachineOperand *FoldOp, bool Commuted = false,
                                int ShrinkOp = -1) {
  // Skip additional folding on the same operand.
  for (FoldCandidate &Fold : FoldList)
    if (Fold.UseMI == MI && Fold.UseOpNo == OpNo)
      return;
  LLVM_DEBUG(dbgs() << "Append " << (Commuted ? "commuted" : "normal")
                    << " operand " << OpNo << "\n " << *MI);
  FoldList.emplace_back(MI, OpNo, FoldOp, Commuted, ShrinkOp);
}

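// Try to append a fold of \p OpToFold into operand \p OpNo of \p MI, rewriting
// the opcode (mac -> mad, s_setreg -> s_setreg_imm32) or commuting operands
// when that is what it takes to make the fold legal. Returns true if a
// candidate was added.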
static bool tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList,
                             MachineInstr *MI, unsigned OpNo,
                             MachineOperand *OpToFold,
                             const SIInstrInfo *TII) {
  if (!TII->isOperandLegal(*MI, OpNo, OpToFold)) {
    // Special case for v_mac_{f16, f32}_e64 if we are trying to fold into src2
    unsigned Opc = MI->getOpcode();
    unsigned NewOpc = macToMad(Opc);
    if (NewOpc != AMDGPU::INSTRUCTION_LIST_END) {
      // Check if changing this to a v_mad_{f16, f32} instruction will allow us
      // to fold the operand.
      MI->setDesc(TII->get(NewOpc));
      bool FoldAsMAD = tryAddToFoldList(FoldList, MI, OpNo, OpToFold, TII);
      if (FoldAsMAD) {
        MI->untieRegOperand(OpNo);
        return true;
      }
      MI->setDesc(TII->get(Opc));
    }

    // Special case for s_setreg_b32
    if (OpToFold->isImm()) {
      unsigned ImmOpc = 0;
      if (Opc == AMDGPU::S_SETREG_B32)
        ImmOpc = AMDGPU::S_SETREG_IMM32_B32;
      else if (Opc == AMDGPU::S_SETREG_B32_mode)
        ImmOpc = AMDGPU::S_SETREG_IMM32_B32_mode;
      if (ImmOpc) {
        MI->setDesc(TII->get(ImmOpc));
        appendFoldCandidate(FoldList, MI, OpNo, OpToFold);
        return true;
      }
    }

    // If we are already folding into another operand of MI, then
    // we can't commute the instruction, otherwise we risk making the
    // other fold illegal.
    if (isUseMIInFoldList(FoldList, MI))
      return false;

    unsigned CommuteOpNo = OpNo;

    // Operand is not legal, so try to commute the instruction to
    // see if this makes it possible to fold.
    unsigned CommuteIdx0 = TargetInstrInfo::CommuteAnyOperandIndex;
    unsigned CommuteIdx1 = TargetInstrInfo::CommuteAnyOperandIndex;
    bool CanCommute = TII->findCommutedOpIndices(*MI, CommuteIdx0, CommuteIdx1);

    if (CanCommute) {
      if (CommuteIdx0 == OpNo)
        CommuteOpNo = CommuteIdx1;
      else if (CommuteIdx1 == OpNo)
        CommuteOpNo = CommuteIdx0;
    }

    // One of operands might be an Imm operand, and OpNo may refer to it after
    // the call of commuteInstruction() below. Such situations are avoided
    // here explicitly as OpNo must be a register operand to be a candidate
    // for memory folding.
    if (CanCommute && (!MI->getOperand(CommuteIdx0).isReg() ||
                       !MI->getOperand(CommuteIdx1).isReg()))
      return false;

    if (!CanCommute ||
        !TII->commuteInstruction(*MI, false, CommuteIdx0, CommuteIdx1))
      return false;

    if (!TII->isOperandLegal(*MI, CommuteOpNo, OpToFold)) {
      if ((Opc == AMDGPU::V_ADD_CO_U32_e64 ||
           Opc == AMDGPU::V_SUB_CO_U32_e64 ||
           Opc == AMDGPU::V_SUBREV_CO_U32_e64) && // FIXME
          (OpToFold->isImm() || OpToFold->isFI() || OpToFold->isGlobal())) {
        MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();

        // Verify the other operand is a VGPR, otherwise we would violate the
        // constant bus restriction.
        unsigned OtherIdx = CommuteOpNo == CommuteIdx0 ? CommuteIdx1
                                                       : CommuteIdx0;
        MachineOperand &OtherOp = MI->getOperand(OtherIdx);
        if (!OtherOp.isReg() ||
            !TII->getRegisterInfo().isVGPR(MRI, OtherOp.getReg()))
          return false;

        assert(MI->getOperand(1).isDef());

        // Make sure to get the 32-bit version of the commuted opcode.
        unsigned MaybeCommutedOpc = MI->getOpcode();
        int Op32 = AMDGPU::getVOPe32(MaybeCommutedOpc);

        appendFoldCandidate(FoldList, MI, CommuteOpNo, OpToFold, true, Op32);
        return true;
      }

      TII->commuteInstruction(*MI, false, CommuteIdx0, CommuteIdx1);
      return false;
    }

    appendFoldCandidate(FoldList, MI, CommuteOpNo, OpToFold, true);
    return true;
  }

  // Check the case where we might introduce a second constant operand to a
  // scalar instruction
  if (TII->isSALU(MI->getOpcode())) {
    const MCInstrDesc &InstDesc = MI->getDesc();
    const MCOperandInfo &OpInfo = InstDesc.OpInfo[OpNo];
    const SIRegisterInfo &SRI = TII->getRegisterInfo();

    // Fine if the operand can be encoded as an inline constant
    if (OpToFold->isImm()) {
      if (!SRI.opCanUseInlineConstant(OpInfo.OperandType) ||
          !TII->isInlineConstant(*OpToFold, OpInfo)) {
        // Otherwise check for another constant
        for (unsigned i = 0, e = InstDesc.getNumOperands(); i != e; ++i) {
          auto &Op = MI->getOperand(i);
          if (OpNo != i &&
              TII->isLiteralConstantLike(Op, OpInfo)) {
            return false;
          }
        }
      }
    }
  }

  appendFoldCandidate(FoldList, MI, OpNo, OpToFold);
  return true;
}

// If the use operand doesn't care about the value, this may be an operand only
// used for register indexing, in which case it is unsafe to fold.
static bool isUseSafeToFold(const SIInstrInfo *TII,
                            const MachineInstr &MI,
                            const MachineOperand &UseMO) {
  if (UseMO.isUndef() || TII->isSDWA(MI))
    return false;

  switch (MI.getOpcode()) {
  case AMDGPU::V_MOV_B32_e32:
  case AMDGPU::V_MOV_B32_e64:
  case AMDGPU::V_MOV_B64_PSEUDO:
    // Do not fold into an indirect mov.
    return !MI.hasRegisterImplicitUseOperand(AMDGPU::M0);
  }

  return true;
  //return !MI.hasRegisterImplicitUseOperand(UseMO.getReg());
}

// Find a def of the UseReg, check if it is a reg_sequence and find initializers
// for each subreg, tracking it to foldable inline immediate if possible.
// Returns true on success.
static bool getRegSeqInit(
    SmallVectorImpl<std::pair<MachineOperand *, unsigned>> &Defs,
    Register UseReg, uint8_t OpTy,
    const SIInstrInfo *TII, const MachineRegisterInfo &MRI) {
  MachineInstr *Def = MRI.getVRegDef(UseReg);
  if (!Def || !Def->isRegSequence())
    return false;

  for (unsigned I = 1, E = Def->getNumExplicitOperands(); I < E; I += 2) {
    MachineOperand *Sub = &Def->getOperand(I);
    assert(Sub->isReg());

    for (MachineInstr *SubDef = MRI.getVRegDef(Sub->getReg());
         SubDef && Sub->isReg() && Sub->getReg().isVirtual() &&
         !Sub->getSubReg() && TII->isFoldableCopy(*SubDef);
         SubDef = MRI.getVRegDef(Sub->getReg())) {
      MachineOperand *Op = &SubDef->getOperand(1);
      if (Op->isImm()) {
        if (TII->isInlineConstant(*Op, OpTy))
          Sub = Op;
        break;
      }
      if (!Op->isReg() || Op->getReg().isPhysical())
        break;
      Sub = Op;
    }

    Defs.emplace_back(Sub, Def->getOperand(I + 1).getImm());
  }

  return true;
}

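// Try to fold an immediate (either \p OpToFold directly, a foldable copy of
// one, or a reg_sequence splat of a single foldable immediate) into operand
// \p UseOpIdx of \p UseMI, which must accept an inline constant operand type.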
static bool tryToFoldACImm(const SIInstrInfo *TII,
                           const MachineOperand &OpToFold,
                           MachineInstr *UseMI,
                           unsigned UseOpIdx,
                           SmallVectorImpl<FoldCandidate> &FoldList) {
  const MCInstrDesc &Desc = UseMI->getDesc();
  const MCOperandInfo *OpInfo = Desc.OpInfo;
  if (!OpInfo || UseOpIdx >= Desc.getNumOperands())
    return false;

  uint8_t OpTy = OpInfo[UseOpIdx].OperandType;
  if ((OpTy < AMDGPU::OPERAND_REG_INLINE_AC_FIRST ||
       OpTy > AMDGPU::OPERAND_REG_INLINE_AC_LAST) &&
      (OpTy < AMDGPU::OPERAND_REG_INLINE_C_FIRST ||
       OpTy > AMDGPU::OPERAND_REG_INLINE_C_LAST))
    return false;

  if (OpToFold.isImm() && TII->isInlineConstant(OpToFold, OpTy) &&
      TII->isOperandLegal(*UseMI, UseOpIdx, &OpToFold)) {
    UseMI->getOperand(UseOpIdx).ChangeToImmediate(OpToFold.getImm());
    return true;
  }

  if (!OpToFold.isReg())
    return false;

  Register UseReg = OpToFold.getReg();
  if (!UseReg.isVirtual())
    return false;

  if (isUseMIInFoldList(FoldList, UseMI))
    return false;

  MachineRegisterInfo &MRI = UseMI->getParent()->getParent()->getRegInfo();

  // Maybe it is just a COPY of an immediate itself.
  MachineInstr *Def = MRI.getVRegDef(UseReg);
  MachineOperand &UseOp = UseMI->getOperand(UseOpIdx);
  if (!UseOp.getSubReg() && Def && TII->isFoldableCopy(*Def)) {
    MachineOperand &DefOp = Def->getOperand(1);
    if (DefOp.isImm() && TII->isInlineConstant(DefOp, OpTy) &&
        TII->isOperandLegal(*UseMI, UseOpIdx, &DefOp)) {
      UseMI->getOperand(UseOpIdx).ChangeToImmediate(DefOp.getImm());
      return true;
    }
  }

  SmallVector<std::pair<MachineOperand *, unsigned>, 32> Defs;
  if (!getRegSeqInit(Defs, UseReg, OpTy, TII, MRI))
    return false;

  int32_t Imm;
  for (unsigned I = 0, E = Defs.size(); I != E; ++I) {
    const MachineOperand *Op = Defs[I].first;
    if (!Op->isImm())
      return false;

    auto SubImm = Op->getImm();
    if (!I) {
      Imm = SubImm;
      if (!TII->isInlineConstant(*Op, OpTy) ||
          !TII->isOperandLegal(*UseMI, UseOpIdx, Op))
        return false;

      continue;
    }
    if (Imm != SubImm)
      return false; // Can only fold splat constants
  }

  appendFoldCandidate(FoldList, UseMI, UseOpIdx, Defs[0].first);
  return true;
}

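// Try to fold \p OpToFold into the use at operand index \p UseOpIdx of
// \p UseMI. Some folds are applied immediately (frame indexes, immediates
// into copies or readfirstlane); the rest are appended to \p FoldList for a
// later updateOperand. Copies converted into movs are recorded in
// \p CopiesToReplace so their implicit operands can be added afterwards.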
void SIFoldOperands::foldOperand(
  MachineOperand &OpToFold,
  MachineInstr *UseMI,
  int UseOpIdx,
  SmallVectorImpl<FoldCandidate> &FoldList,
  SmallVectorImpl<MachineInstr *> &CopiesToReplace) const {
  const MachineOperand &UseOp = UseMI->getOperand(UseOpIdx);

  if (!isUseSafeToFold(TII, *UseMI, UseOp))
    return;

  // FIXME: Fold operands with subregs.
  if (UseOp.isReg() && OpToFold.isReg()) {
    if (UseOp.isImplicit() || UseOp.getSubReg() != AMDGPU::NoSubRegister)
      return;
  }

  // Special case for REG_SEQUENCE: We can't fold literals into
  // REG_SEQUENCE instructions, so we have to fold them into the
  // uses of REG_SEQUENCE.
  if (UseMI->isRegSequence()) {
    Register RegSeqDstReg = UseMI->getOperand(0).getReg();
    unsigned RegSeqDstSubReg = UseMI->getOperand(UseOpIdx + 1).getImm();

    for (auto &RSUse :
         make_early_inc_range(MRI->use_nodbg_operands(RegSeqDstReg))) {
      MachineInstr *RSUseMI = RSUse.getParent();

      if (tryToFoldACImm(TII, UseMI->getOperand(0), RSUseMI,
                         RSUseMI->getOperandNo(&RSUse), FoldList))
        continue;

      if (RSUse.getSubReg() != RegSeqDstSubReg)
        continue;

      foldOperand(OpToFold, RSUseMI, RSUseMI->getOperandNo(&RSUse), FoldList,
                  CopiesToReplace);
    }

    return;
  }

  if (tryToFoldACImm(TII, OpToFold, UseMI, UseOpIdx, FoldList))
    return;

  if (frameIndexMayFold(TII, *UseMI, UseOpIdx, OpToFold)) {
    // Sanity check that this is a stack access.
    // FIXME: Should probably use stack pseudos before frame lowering.

    if (TII->isMUBUF(*UseMI)) {
      if (TII->getNamedOperand(*UseMI, AMDGPU::OpName::srsrc)->getReg() !=
          MFI->getScratchRSrcReg())
        return;

      // Ensure this is either relative to the current frame or the current
      // wave.
      MachineOperand &SOff =
          *TII->getNamedOperand(*UseMI, AMDGPU::OpName::soffset);
      if (!SOff.isImm() || SOff.getImm() != 0)
        return;
    }

    // A frame index will resolve to a positive constant, so it should always
    // be safe to fold the addressing mode, even pre-GFX9.
    UseMI->getOperand(UseOpIdx).ChangeToFrameIndex(OpToFold.getIndex());

    if (TII->isFLATScratch(*UseMI) &&
        AMDGPU::getNamedOperandIdx(UseMI->getOpcode(),
                                   AMDGPU::OpName::vaddr) != -1) {
      unsigned NewOpc = AMDGPU::getFlatScratchInstSSfromSV(UseMI->getOpcode());
      UseMI->setDesc(TII->get(NewOpc));
    }

    return;
  }

  bool FoldingImmLike =
      OpToFold.isImm() || OpToFold.isFI() || OpToFold.isGlobal();

  if (FoldingImmLike && UseMI->isCopy()) {
    Register DestReg = UseMI->getOperand(0).getReg();
    Register SrcReg = UseMI->getOperand(1).getReg();
    assert(SrcReg.isVirtual());

    const TargetRegisterClass *SrcRC = MRI->getRegClass(SrcReg);

    // Don't fold into a copy to a physical register with the same class. Doing
    // so would interfere with the register coalescer's logic which would avoid
    // redundant initializations.
    if (DestReg.isPhysical() && SrcRC->contains(DestReg))
      return;

    const TargetRegisterClass *DestRC = TRI->getRegClassForReg(*MRI, DestReg);
    if (!DestReg.isPhysical()) {
      if (TRI->isSGPRClass(SrcRC) && TRI->hasVectorRegisters(DestRC)) {
        SmallVector<FoldCandidate, 4> CopyUses;
        for (auto &Use : MRI->use_nodbg_operands(DestReg)) {
          // There's no point trying to fold into an implicit operand.
          if (Use.isImplicit())
            continue;

          CopyUses.emplace_back(Use.getParent(),
                                Use.getParent()->getOperandNo(&Use),
                                &UseMI->getOperand(1));
        }
        for (auto &F : CopyUses) {
          foldOperand(*F.OpToFold, F.UseMI, F.UseOpNo, FoldList,
                      CopiesToReplace);
        }
      }

      if (DestRC == &AMDGPU::AGPR_32RegClass &&
          TII->isInlineConstant(OpToFold, AMDGPU::OPERAND_REG_INLINE_C_INT32)) {
        UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64));
        UseMI->getOperand(1).ChangeToImmediate(OpToFold.getImm());
        CopiesToReplace.push_back(UseMI);
        return;
      }
    }

    // In order to fold immediates into copies, we need to change the
    // copy to a MOV.

    unsigned MovOp = TII->getMovOpcode(DestRC);
    if (MovOp == AMDGPU::COPY)
      return;

    UseMI->setDesc(TII->get(MovOp));
    MachineInstr::mop_iterator ImpOpI = UseMI->implicit_operands().begin();
    MachineInstr::mop_iterator ImpOpE = UseMI->implicit_operands().end();
    while (ImpOpI != ImpOpE) {
      MachineInstr::mop_iterator Tmp = ImpOpI;
      ImpOpI++;
      UseMI->RemoveOperand(UseMI->getOperandNo(Tmp));
    }
    CopiesToReplace.push_back(UseMI);
  } else {
    if (UseMI->isCopy() && OpToFold.isReg() &&
        UseMI->getOperand(0).getReg().isVirtual() &&
        !UseMI->getOperand(1).getSubReg()) {
      LLVM_DEBUG(dbgs() << "Folding " << OpToFold << "\n into " << *UseMI);
      unsigned Size = TII->getOpSize(*UseMI, 1);
      Register UseReg = OpToFold.getReg();
      UseMI->getOperand(1).setReg(UseReg);
      UseMI->getOperand(1).setSubReg(OpToFold.getSubReg());
      UseMI->getOperand(1).setIsKill(false);
      CopiesToReplace.push_back(UseMI);
      OpToFold.setIsKill(false);

      // It is very tricky to store a value into an AGPR. v_accvgpr_write_b32
      // can only accept VGPR or inline immediate. Recreate a reg_sequence with
      // its initializers right here, so we will rematerialize immediates and
      // avoid copies via different reg classes.
      SmallVector<std::pair<MachineOperand *, unsigned>, 32> Defs;
      if (Size > 4 && TRI->isAGPR(*MRI, UseMI->getOperand(0).getReg()) &&
          getRegSeqInit(Defs, UseReg, AMDGPU::OPERAND_REG_INLINE_C_INT32, TII,
                        *MRI)) {
        const DebugLoc &DL = UseMI->getDebugLoc();
        MachineBasicBlock &MBB = *UseMI->getParent();

        UseMI->setDesc(TII->get(AMDGPU::REG_SEQUENCE));
        for (unsigned I = UseMI->getNumOperands() - 1; I > 0; --I)
          UseMI->RemoveOperand(I);

        MachineInstrBuilder B(*MBB.getParent(), UseMI);
        DenseMap<TargetInstrInfo::RegSubRegPair, Register> VGPRCopies;
        SmallSetVector<TargetInstrInfo::RegSubRegPair, 32> SeenAGPRs;
        for (unsigned I = 0; I < Size / 4; ++I) {
          MachineOperand *Def = Defs[I].first;
          TargetInstrInfo::RegSubRegPair CopyToVGPR;
          if (Def->isImm() &&
              TII->isInlineConstant(*Def, AMDGPU::OPERAND_REG_INLINE_C_INT32)) {
            int64_t Imm = Def->getImm();

            auto Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
            BuildMI(MBB, UseMI, DL,
                    TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), Tmp).addImm(Imm);
            B.addReg(Tmp);
          } else if (Def->isReg() && TRI->isAGPR(*MRI, Def->getReg())) {
            auto Src = getRegSubRegPair(*Def);
            Def->setIsKill(false);
            if (!SeenAGPRs.insert(Src)) {
              // We cannot build a reg_sequence out of the same registers, they
              // must be copied. Better do it here before copyPhysReg() created
              // several reads to do the AGPR->VGPR->AGPR copy.
              CopyToVGPR = Src;
            } else {
              B.addReg(Src.Reg, Def->isUndef() ? RegState::Undef : 0,
                       Src.SubReg);
            }
          } else {
            assert(Def->isReg());
            Def->setIsKill(false);
            auto Src = getRegSubRegPair(*Def);

            // Direct copy from SGPR to AGPR is not possible. To avoid creation
            // of exploded copies SGPR->VGPR->AGPR in the copyPhysReg() later,
            // create a copy here and track if we already have such a copy.
            if (TRI->isSGPRReg(*MRI, Src.Reg)) {
              CopyToVGPR = Src;
            } else {
              auto Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
              BuildMI(MBB, UseMI, DL, TII->get(AMDGPU::COPY), Tmp).add(*Def);
              B.addReg(Tmp);
            }
          }

          if (CopyToVGPR.Reg) {
            Register Vgpr;
            if (VGPRCopies.count(CopyToVGPR)) {
              Vgpr = VGPRCopies[CopyToVGPR];
            } else {
              Vgpr = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
              BuildMI(MBB, UseMI, DL, TII->get(AMDGPU::COPY), Vgpr).add(*Def);
              VGPRCopies[CopyToVGPR] = Vgpr;
            }
            auto Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
            BuildMI(MBB, UseMI, DL,
                    TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), Tmp).addReg(Vgpr);
            B.addReg(Tmp);
          }

          B.addImm(Defs[I].second);
        }
        LLVM_DEBUG(dbgs() << "Folded " << *UseMI);
        return;
      }

      if (Size != 4)
        return;
      if (TRI->isAGPR(*MRI, UseMI->getOperand(0).getReg()) &&
          TRI->isVGPR(*MRI, UseMI->getOperand(1).getReg()))
        UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64));
      else if (TRI->isVGPR(*MRI, UseMI->getOperand(0).getReg()) &&
               TRI->isAGPR(*MRI, UseMI->getOperand(1).getReg()))
        UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_READ_B32_e64));
      else if (ST->hasGFX90AInsts() &&
               TRI->isAGPR(*MRI, UseMI->getOperand(0).getReg()) &&
               TRI->isAGPR(*MRI, UseMI->getOperand(1).getReg()))
        UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_MOV_B32));
      return;
    }

    unsigned UseOpc = UseMI->getOpcode();
    if (UseOpc == AMDGPU::V_READFIRSTLANE_B32 ||
        (UseOpc == AMDGPU::V_READLANE_B32 &&
         (int)UseOpIdx ==
             AMDGPU::getNamedOperandIdx(UseOpc, AMDGPU::OpName::src0))) {
      // %vgpr = V_MOV_B32 imm
      // %sgpr = V_READFIRSTLANE_B32 %vgpr
      // =>
      // %sgpr = S_MOV_B32 imm
      if (FoldingImmLike) {
        if (execMayBeModifiedBeforeUse(*MRI,
                                       UseMI->getOperand(UseOpIdx).getReg(),
                                       *OpToFold.getParent(),
                                       *UseMI))
          return;

        UseMI->setDesc(TII->get(AMDGPU::S_MOV_B32));

        if (OpToFold.isImm())
          UseMI->getOperand(1).ChangeToImmediate(OpToFold.getImm());
        else
          UseMI->getOperand(1).ChangeToFrameIndex(OpToFold.getIndex());
        UseMI->RemoveOperand(2); // Remove exec read (or src1 for readlane)
        return;
      }

      if (OpToFold.isReg() && TRI->isSGPRReg(*MRI, OpToFold.getReg())) {
        if (execMayBeModifiedBeforeUse(*MRI,
                                       UseMI->getOperand(UseOpIdx).getReg(),
                                       *OpToFold.getParent(),
                                       *UseMI))
          return;

        // %vgpr = COPY %sgpr0
        // %sgpr1 = V_READFIRSTLANE_B32 %vgpr
        // =>
        // %sgpr1 = COPY %sgpr0
        UseMI->setDesc(TII->get(AMDGPU::COPY));
        UseMI->getOperand(1).setReg(OpToFold.getReg());
        UseMI->getOperand(1).setSubReg(OpToFold.getSubReg());
        UseMI->getOperand(1).setIsKill(false);
        UseMI->RemoveOperand(2); // Remove exec read (or src1 for readlane)
        return;
      }
    }

    const MCInstrDesc &UseDesc = UseMI->getDesc();

    // Don't fold into target independent nodes. Target independent opcodes
    // don't have defined register classes.
    if (UseDesc.isVariadic() ||
        UseOp.isImplicit() ||
        UseDesc.OpInfo[UseOpIdx].RegClass == -1)
      return;
  }

  if (!FoldingImmLike) {
    tryAddToFoldList(FoldList, UseMI, UseOpIdx, &OpToFold, TII);

    // FIXME: We could try to change the instruction from 64-bit to 32-bit
    // to enable more folding opportunities. The shrink operands pass
    // already does this.
    return;
  }

  const MCInstrDesc &FoldDesc = OpToFold.getParent()->getDesc();
  const TargetRegisterClass *FoldRC =
      TRI->getRegClass(FoldDesc.OpInfo[0].RegClass);

  // Split 64-bit constants into 32-bits for folding.
  if (UseOp.getSubReg() && AMDGPU::getRegBitWidth(FoldRC->getID()) == 64) {
    Register UseReg = UseOp.getReg();
    const TargetRegisterClass *UseRC = MRI->getRegClass(UseReg);

    if (AMDGPU::getRegBitWidth(UseRC->getID()) != 64)
      return;

    APInt Imm(64, OpToFold.getImm());
    if (UseOp.getSubReg() == AMDGPU::sub0) {
      Imm = Imm.getLoBits(32);
    } else {
      assert(UseOp.getSubReg() == AMDGPU::sub1);
      Imm = Imm.getHiBits(32);
    }

    MachineOperand ImmOp = MachineOperand::CreateImm(Imm.getSExtValue());
    tryAddToFoldList(FoldList, UseMI, UseOpIdx, &ImmOp, TII);
    return;
  }

  tryAddToFoldList(FoldList, UseMI, UseOpIdx, &OpToFold, TII);
}

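// Constant-fold a 32-bit binary ALU opcode. Writes the folded value to
// \p Result and returns true if the opcode is handled.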
static bool evalBinaryInstruction(unsigned Opcode, int32_t &Result,
                                  uint32_t LHS, uint32_t RHS) {
  switch (Opcode) {
  case AMDGPU::V_AND_B32_e64:
  case AMDGPU::V_AND_B32_e32:
  case AMDGPU::S_AND_B32:
    Result = LHS & RHS;
    return true;
  case AMDGPU::V_OR_B32_e64:
  case AMDGPU::V_OR_B32_e32:
  case AMDGPU::S_OR_B32:
    Result = LHS | RHS;
    return true;
  case AMDGPU::V_XOR_B32_e64:
  case AMDGPU::V_XOR_B32_e32:
  case AMDGPU::S_XOR_B32:
    Result = LHS ^ RHS;
    return true;
  case AMDGPU::S_XNOR_B32:
    Result = ~(LHS ^ RHS);
    return true;
  case AMDGPU::S_NAND_B32:
    Result = ~(LHS & RHS);
    return true;
  case AMDGPU::S_NOR_B32:
    Result = ~(LHS | RHS);
    return true;
  case AMDGPU::S_ANDN2_B32:
    Result = LHS & ~RHS;
    return true;
  case AMDGPU::S_ORN2_B32:
    Result = LHS | ~RHS;
    return true;
  case AMDGPU::V_LSHL_B32_e64:
  case AMDGPU::V_LSHL_B32_e32:
  case AMDGPU::S_LSHL_B32:
    // The instruction ignores the high bits for out of bounds shifts.
    Result = LHS << (RHS & 31);
    return true;
  case AMDGPU::V_LSHLREV_B32_e64:
  case AMDGPU::V_LSHLREV_B32_e32:
    Result = RHS << (LHS & 31);
    return true;
  case AMDGPU::V_LSHR_B32_e64:
  case AMDGPU::V_LSHR_B32_e32:
  case AMDGPU::S_LSHR_B32:
    Result = LHS >> (RHS & 31);
    return true;
  case AMDGPU::V_LSHRREV_B32_e64:
  case AMDGPU::V_LSHRREV_B32_e32:
    Result = RHS >> (LHS & 31);
    return true;
  case AMDGPU::V_ASHR_I32_e64:
  case AMDGPU::V_ASHR_I32_e32:
  case AMDGPU::S_ASHR_I32:
    Result = static_cast<int32_t>(LHS) >> (RHS & 31);
    return true;
  case AMDGPU::V_ASHRREV_I32_e64:
  case AMDGPU::V_ASHRREV_I32_e32:
    Result = static_cast<int32_t>(RHS) >> (LHS & 31);
    return true;
  default:
    return false;
  }
}

static unsigned getMovOpc(bool IsScalar) {
  return IsScalar ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
}

/// Remove any leftover implicit operands from mutating the instruction. e.g.
/// if we replace an s_and_b32 with a copy, we don't need the implicit scc def
/// anymore.
static void stripExtraCopyOperands(MachineInstr &MI) {
  const MCInstrDesc &Desc = MI.getDesc();
  unsigned NumOps = Desc.getNumOperands() +
                    Desc.getNumImplicitUses() +
                    Desc.getNumImplicitDefs();

  for (unsigned I = MI.getNumOperands() - 1; I >= NumOps; --I)
    MI.RemoveOperand(I);
}

static void mutateCopyOp(MachineInstr &MI, const MCInstrDesc &NewDesc) {
  MI.setDesc(NewDesc);
  stripExtraCopyOperands(MI);
}

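// If \p Op is a virtual register defined by a move-immediate, return the
// immediate source operand of that def; otherwise return \p Op itself.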
static MachineOperand *getImmOrMaterializedImm(MachineRegisterInfo &MRI,
                                               MachineOperand &Op) {
  if (Op.isReg()) {
    // If this has a subregister, it obviously is a register source.
    if (Op.getSubReg() != AMDGPU::NoSubRegister || !Op.getReg().isVirtual())
      return &Op;

    MachineInstr *Def = MRI.getVRegDef(Op.getReg());
    if (Def && Def->isMoveImmediate()) {
      MachineOperand &ImmSrc = Def->getOperand(1);
      if (ImmSrc.isImm())
        return &ImmSrc;
    }
  }

  return &Op;
}

// Try to simplify operations with a constant that may appear after instruction
// selection.
// TODO: See if a frame index with a fixed offset can fold.
static bool tryConstantFoldOp(MachineRegisterInfo &MRI, const SIInstrInfo *TII,
                              MachineInstr *MI) {
  unsigned Opc = MI->getOpcode();

  int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
  if (Src0Idx == -1)
    return false;
  MachineOperand *Src0 = getImmOrMaterializedImm(MRI, MI->getOperand(Src0Idx));

  if ((Opc == AMDGPU::V_NOT_B32_e64 || Opc == AMDGPU::V_NOT_B32_e32 ||
       Opc == AMDGPU::S_NOT_B32) &&
      Src0->isImm()) {
    MI->getOperand(1).ChangeToImmediate(~Src0->getImm());
    mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_NOT_B32)));
    return true;
  }

  int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
  if (Src1Idx == -1)
    return false;
  MachineOperand *Src1 = getImmOrMaterializedImm(MRI, MI->getOperand(Src1Idx));

  if (!Src0->isImm() && !Src1->isImm())
    return false;

  // and k0, k1 -> v_mov_b32 (k0 & k1)
  // or k0, k1 -> v_mov_b32 (k0 | k1)
  // xor k0, k1 -> v_mov_b32 (k0 ^ k1)
  if (Src0->isImm() && Src1->isImm()) {
    int32_t NewImm;
    if (!evalBinaryInstruction(Opc, NewImm, Src0->getImm(), Src1->getImm()))
      return false;

    const SIRegisterInfo &TRI = TII->getRegisterInfo();
    bool IsSGPR = TRI.isSGPRReg(MRI, MI->getOperand(0).getReg());

    // Be careful to change the right operand; src0 may belong to a different
    // instruction.
    MI->getOperand(Src0Idx).ChangeToImmediate(NewImm);
    MI->RemoveOperand(Src1Idx);
    mutateCopyOp(*MI, TII->get(getMovOpc(IsSGPR)));
    return true;
  }

  if (!MI->isCommutable())
    return false;

  if (Src0->isImm() && !Src1->isImm()) {
    std::swap(Src0, Src1);
    std::swap(Src0Idx, Src1Idx);
  }

  int32_t Src1Val = static_cast<int32_t>(Src1->getImm());
  if (Opc == AMDGPU::V_OR_B32_e64 ||
      Opc == AMDGPU::V_OR_B32_e32 ||
      Opc == AMDGPU::S_OR_B32) {
    if (Src1Val == 0) {
      // y = or x, 0 => y = copy x
      MI->RemoveOperand(Src1Idx);
      mutateCopyOp(*MI, TII->get(AMDGPU::COPY));
    } else if (Src1Val == -1) {
      // y = or x, -1 => y = v_mov_b32 -1
      MI->RemoveOperand(Src1Idx);
      mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_OR_B32)));
    } else
      return false;

    return true;
  }

  if (MI->getOpcode() == AMDGPU::V_AND_B32_e64 ||
      MI->getOpcode() == AMDGPU::V_AND_B32_e32 ||
      MI->getOpcode() == AMDGPU::S_AND_B32) {
    if (Src1Val == 0) {
      // y = and x, 0 => y = v_mov_b32 0
      MI->RemoveOperand(Src0Idx);
      mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_AND_B32)));
    } else if (Src1Val == -1) {
      // y = and x, -1 => y = copy x
      MI->RemoveOperand(Src1Idx);
      mutateCopyOp(*MI, TII->get(AMDGPU::COPY));
      stripExtraCopyOperands(*MI);
    } else
      return false;

    return true;
  }

  if (MI->getOpcode() == AMDGPU::V_XOR_B32_e64 ||
      MI->getOpcode() == AMDGPU::V_XOR_B32_e32 ||
      MI->getOpcode() == AMDGPU::S_XOR_B32) {
    if (Src1Val == 0) {
      // y = xor x, 0 => y = copy x
      MI->RemoveOperand(Src1Idx);
      mutateCopyOp(*MI, TII->get(AMDGPU::COPY));
      return true;
    }
  }

  return false;
}

// Try to fold an instruction into a simpler one
bool SIFoldOperands::tryFoldCndMask(MachineInstr &MI) const {
  unsigned Opc = MI.getOpcode();
  if (Opc != AMDGPU::V_CNDMASK_B32_e32 && Opc != AMDGPU::V_CNDMASK_B32_e64 &&
      Opc != AMDGPU::V_CNDMASK_B64_PSEUDO)
    return false;

  MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
  MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
  if (!Src1->isIdenticalTo(*Src0)) {
    auto *Src0Imm = getImmOrMaterializedImm(*MRI, *Src0);
    auto *Src1Imm = getImmOrMaterializedImm(*MRI, *Src1);
    if (!Src1Imm->isIdenticalTo(*Src0Imm))
      return false;
  }

  int Src1ModIdx =
      AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1_modifiers);
  int Src0ModIdx =
      AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0_modifiers);
  if ((Src1ModIdx != -1 && MI.getOperand(Src1ModIdx).getImm() != 0) ||
      (Src0ModIdx != -1 && MI.getOperand(Src0ModIdx).getImm() != 0))
    return false;

  LLVM_DEBUG(dbgs() << "Folded " << MI << " into ");
  auto &NewDesc =
      TII->get(Src0->isReg() ? (unsigned)AMDGPU::COPY : getMovOpc(false));
  int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
  if (Src2Idx != -1)
    MI.RemoveOperand(Src2Idx);
  MI.RemoveOperand(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1));
  if (Src1ModIdx != -1)
    MI.RemoveOperand(Src1ModIdx);
  if (Src0ModIdx != -1)
    MI.RemoveOperand(Src0ModIdx);
  mutateCopyOp(MI, NewDesc);
  LLVM_DEBUG(dbgs() << MI);
  return true;
}

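// Fold away a v_and_b32 with an 0xffff mask when the other operand is defined
// by an instruction already known to zero the high 16 bits of its result
// (per GCNSubtarget::zeroesHigh16BitsOfDest), e.g.:
//
//   %1 = <op known to zero the high 16 bits> ...
//   %2 = V_AND_B32 0xffff, %1
// =>
//   all uses of %2 replaced with %1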
bool SIFoldOperands::tryFoldZeroHighBits(MachineInstr &MI) const {
  if (MI.getOpcode() != AMDGPU::V_AND_B32_e64 &&
      MI.getOpcode() != AMDGPU::V_AND_B32_e32)
    return false;

  MachineOperand *Src0 = getImmOrMaterializedImm(*MRI, MI.getOperand(1));
  if (!Src0->isImm() || Src0->getImm() != 0xffff)
    return false;

  Register Src1 = MI.getOperand(2).getReg();
  MachineInstr *SrcDef = MRI->getVRegDef(Src1);
  if (ST->zeroesHigh16BitsOfDest(SrcDef->getOpcode())) {
    Register Dst = MI.getOperand(0).getReg();
    MRI->replaceRegWith(Dst, SrcDef->getOperand(0).getReg());
    MI.eraseFromParent();
    return true;
  }

  return false;
}

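// Fold \p OpToFold, the source of foldable instruction \p MI, into every
// non-debug use of the register \p MI defines, constant-folding users where
// the fold exposes the opportunity.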
void SIFoldOperands::foldInstOperand(MachineInstr &MI,
                                     MachineOperand &OpToFold) const {
  // We need to mutate the operands of new mov instructions to add implicit
  // uses of EXEC, but adding them invalidates the use_iterator, so defer
  // this.
  SmallVector<MachineInstr *, 4> CopiesToReplace;
  SmallVector<FoldCandidate, 4> FoldList;
  MachineOperand &Dst = MI.getOperand(0);

  if (OpToFold.isImm()) {
    for (auto &UseMI :
         make_early_inc_range(MRI->use_nodbg_instructions(Dst.getReg()))) {
      // Folding the immediate may reveal operations that can be constant
      // folded or replaced with a copy. This can happen for example after
      // frame indices are lowered to constants or from splitting 64-bit
      // constants.
      //
      // We may also encounter cases where one or both operands are
      // immediates materialized into a register, which would ordinarily not
      // be folded due to multiple uses or operand constraints.
      if (tryConstantFoldOp(*MRI, TII, &UseMI))
        LLVM_DEBUG(dbgs() << "Constant folded " << UseMI);
    }
  }

  bool FoldingImm = OpToFold.isImm() || OpToFold.isFI() || OpToFold.isGlobal();
  if (FoldingImm) {
    unsigned NumLiteralUses = 0;
    MachineOperand *NonInlineUse = nullptr;
    int NonInlineUseOpNo = -1;

    for (auto &Use :
         make_early_inc_range(MRI->use_nodbg_operands(Dst.getReg()))) {
      MachineInstr *UseMI = Use.getParent();
      unsigned OpNo = UseMI->getOperandNo(&Use);

      // Try to fold any inline immediate uses, and then only fold other
      // constants if they have one use.
      //
      // The legality of the inline immediate must be checked based on the use
      // operand, not the defining instruction, because 32-bit instructions
      // with 32-bit inline immediate sources may be used to materialize
      // constants used in 16-bit operands.
      //
      // e.g. it is unsafe to fold:
      //  s_mov_b32 s0, 1.0    // materializes 0x3f800000
      //  v_add_f16 v0, v1, s0 // 1.0 f16 inline immediate sees 0x00003c00

      // Folding immediates with more than one use will increase program size.
      // FIXME: This will also reduce register usage, which may be better
      // in some cases. A better heuristic is needed.
      if (isInlineConstantIfFolded(TII, *UseMI, OpNo, OpToFold)) {
        foldOperand(OpToFold, UseMI, OpNo, FoldList, CopiesToReplace);
      } else if (frameIndexMayFold(TII, *UseMI, OpNo, OpToFold)) {
        foldOperand(OpToFold, UseMI, OpNo, FoldList, CopiesToReplace);
      } else {
        if (++NumLiteralUses == 1) {
          NonInlineUse = &Use;
          NonInlineUseOpNo = OpNo;
        }
      }
    }

    if (NumLiteralUses == 1) {
      MachineInstr *UseMI = NonInlineUse->getParent();
      foldOperand(OpToFold, UseMI, NonInlineUseOpNo, FoldList,
                  CopiesToReplace);
    }
  } else {
    // Folding register.
    SmallVector<MachineOperand *, 4> UsesToProcess;
    for (auto &Use : MRI->use_nodbg_operands(Dst.getReg()))
      UsesToProcess.push_back(&Use);
    for (auto U : UsesToProcess) {
      MachineInstr *UseMI = U->getParent();

      foldOperand(OpToFold, UseMI, UseMI->getOperandNo(U),
                  FoldList, CopiesToReplace);
    }
  }

  MachineFunction *MF = MI.getParent()->getParent();
  // Make sure we add EXEC uses to any new v_mov instructions created.
  for (MachineInstr *Copy : CopiesToReplace)
    Copy->addImplicitDefUseOperands(*MF);

  for (FoldCandidate &Fold : FoldList) {
    assert(!Fold.isReg() || Fold.OpToFold);
    if (Fold.isReg() && Fold.OpToFold->getReg().isVirtual()) {
      Register Reg = Fold.OpToFold->getReg();
      MachineInstr *DefMI = Fold.OpToFold->getParent();
      if (DefMI->readsRegister(AMDGPU::EXEC, TRI) &&
          execMayBeModifiedBeforeUse(*MRI, Reg, *DefMI, *Fold.UseMI))
        continue;
    }
    if (updateOperand(Fold, *TII, *TRI, *ST)) {
      // Clear kill flags.
      if (Fold.isReg()) {
        assert(Fold.OpToFold && Fold.OpToFold->isReg());
        // FIXME: Probably shouldn't bother trying to fold if not an
        // SGPR. PeepholeOptimizer can eliminate redundant VGPR->VGPR
        // copies.
        MRI->clearKillFlags(Fold.OpToFold->getReg());
      }
      LLVM_DEBUG(dbgs() << "Folded source from " << MI << " into OpNo "
                        << static_cast<int>(Fold.UseOpNo) << " of "
                        << *Fold.UseMI);
    } else if (Fold.isCommuted()) {
      // Restoring instruction's original operand order if fold has failed.
      TII->commuteInstruction(*Fold.UseMI, false);
    }
  }
}

// Clamp patterns are canonically selected to v_max_* instructions, so only
// handle them.
const MachineOperand *SIFoldOperands::isClamp(const MachineInstr &MI) const {
  unsigned Op = MI.getOpcode();
  switch (Op) {
  case AMDGPU::V_MAX_F32_e64:
  case AMDGPU::V_MAX_F16_e64:
  case AMDGPU::V_MAX_F64_e64:
  case AMDGPU::V_PK_MAX_F16: {
    if (!TII->getNamedOperand(MI, AMDGPU::OpName::clamp)->getImm())
      return nullptr;

    // Make sure sources are identical.
    const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    if (!Src0->isReg() || !Src1->isReg() ||
        Src0->getReg() != Src1->getReg() ||
        Src0->getSubReg() != Src1->getSubReg() ||
        Src0->getSubReg() != AMDGPU::NoSubRegister)
      return nullptr;

    // Can't fold up if we have modifiers.
    if (TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
      return nullptr;

    unsigned Src0Mods
      = TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm();
    unsigned Src1Mods
      = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers)->getImm();

    // Having a 0 op_sel_hi would require swizzling the output in the source
    // instruction, which we can't do.
    unsigned UnsetMods = (Op == AMDGPU::V_PK_MAX_F16) ? SISrcMods::OP_SEL_1
                                                      : 0u;
    if (Src0Mods != UnsetMods && Src1Mods != UnsetMods)
      return nullptr;
    return Src0;
  }
  default:
    return nullptr;
  }
}

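// If \p MI is a clamp of its own source (per isClamp), fold the clamp bit
// into the instruction defining the source and erase \p MI.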
// FIXME: Clamp for v_mad_mixhi_f16 handled during isel.
bool SIFoldOperands::tryFoldClamp(MachineInstr &MI) {
  const MachineOperand *ClampSrc = isClamp(MI);
  if (!ClampSrc || !MRI->hasOneNonDBGUser(ClampSrc->getReg()))
    return false;

  MachineInstr *Def = MRI->getVRegDef(ClampSrc->getReg());

  // The type of clamp must be compatible.
  if (TII->getClampMask(*Def) != TII->getClampMask(MI))
    return false;

  MachineOperand *DefClamp = TII->getNamedOperand(*Def, AMDGPU::OpName::clamp);
  if (!DefClamp)
    return false;

  LLVM_DEBUG(dbgs() << "Folding clamp " << *DefClamp << " into " << *Def);

  // Clamp is applied after omod, so it is OK if omod is set.
  DefClamp->setImm(1);
  MRI->replaceRegWith(MI.getOperand(0).getReg(), Def->getOperand(0).getReg());
  MI.eraseFromParent();
  return true;
}

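// Map a multiplier immediate (the raw bit pattern of 0.5, 2.0 or 4.0 in the
// multiply opcode's type) to the corresponding output-modifier encoding, or
// SIOutMods::NONE if the value is not one an omod can express.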
static int getOModValue(unsigned Opc, int64_t Val) {
  switch (Opc) {
  case AMDGPU::V_MUL_F64_e64: {
    switch (Val) {
    case 0x3fe0000000000000: // 0.5
      return SIOutMods::DIV2;
    case 0x4000000000000000: // 2.0
      return SIOutMods::MUL2;
    case 0x4010000000000000: // 4.0
      return SIOutMods::MUL4;
    default:
      return SIOutMods::NONE;
    }
  }
  case AMDGPU::V_MUL_F32_e64: {
    switch (static_cast<uint32_t>(Val)) {
    case 0x3f000000: // 0.5
      return SIOutMods::DIV2;
    case 0x40000000: // 2.0
      return SIOutMods::MUL2;
    case 0x40800000: // 4.0
      return SIOutMods::MUL4;
    default:
      return SIOutMods::NONE;
    }
  }
  case AMDGPU::V_MUL_F16_e64: {
    switch (static_cast<uint16_t>(Val)) {
    case 0x3800: // 0.5
      return SIOutMods::DIV2;
    case 0x4000: // 2.0
      return SIOutMods::MUL2;
    case 0x4400: // 4.0
      return SIOutMods::MUL4;
    default:
      return SIOutMods::NONE;
    }
  }
  default:
    llvm_unreachable("invalid mul opcode");
  }
}

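// Match an instruction that implements an output modifier on its source: a
// multiply by 0.5/2.0/4.0, or an add of an operand to itself (x + x == 2 * x).
// Returns the source operand and the SIOutMods encoding, or
// {nullptr, SIOutMods::NONE} if no fold is possible.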
// FIXME: Does this really not support denormals with f16?
// FIXME: Does this need to check IEEE mode bit? SNaNs are generally not
// handled, so will anything other than that break?
std::pair<const MachineOperand *, int>
SIFoldOperands::isOMod(const MachineInstr &MI) const {
  unsigned Op = MI.getOpcode();
  switch (Op) {
  case AMDGPU::V_MUL_F64_e64:
  case AMDGPU::V_MUL_F32_e64:
  case AMDGPU::V_MUL_F16_e64: {
    // If output denormals are enabled, omod is ignored.
    if ((Op == AMDGPU::V_MUL_F32_e64 && MFI->getMode().FP32OutputDenormals) ||
        ((Op == AMDGPU::V_MUL_F64_e64 || Op == AMDGPU::V_MUL_F16_e64) &&
         MFI->getMode().FP64FP16OutputDenormals))
      return std::make_pair(nullptr, SIOutMods::NONE);

    const MachineOperand *RegOp = nullptr;
    const MachineOperand *ImmOp = nullptr;
    const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    if (Src0->isImm()) {
      ImmOp = Src0;
      RegOp = Src1;
    } else if (Src1->isImm()) {
      ImmOp = Src1;
      RegOp = Src0;
    } else
      return std::make_pair(nullptr, SIOutMods::NONE);

    int OMod = getOModValue(Op, ImmOp->getImm());
    if (OMod == SIOutMods::NONE ||
        TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) ||
        TII->hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) ||
        TII->hasModifiersSet(MI, AMDGPU::OpName::omod) ||
        TII->hasModifiersSet(MI, AMDGPU::OpName::clamp))
      return std::make_pair(nullptr, SIOutMods::NONE);

    return std::make_pair(RegOp, OMod);
  }
  case AMDGPU::V_ADD_F64_e64:
  case AMDGPU::V_ADD_F32_e64:
  case AMDGPU::V_ADD_F16_e64: {
    // If output denormals are enabled, omod is ignored.
    if ((Op == AMDGPU::V_ADD_F32_e64 && MFI->getMode().FP32OutputDenormals) ||
        ((Op == AMDGPU::V_ADD_F64_e64 || Op == AMDGPU::V_ADD_F16_e64) &&
         MFI->getMode().FP64FP16OutputDenormals))
      return std::make_pair(nullptr, SIOutMods::NONE);

    // Look through the DAGCombiner canonicalization fmul x, 2 -> fadd x, x
    const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);

    if (Src0->isReg() && Src1->isReg() && Src0->getReg() == Src1->getReg() &&
        Src0->getSubReg() == Src1->getSubReg() &&
        !TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) &&
        !TII->hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) &&
        !TII->hasModifiersSet(MI, AMDGPU::OpName::clamp) &&
        !TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
      return std::make_pair(Src0, SIOutMods::MUL2);

    return std::make_pair(nullptr, SIOutMods::NONE);
  }
  default:
    return std::make_pair(nullptr, SIOutMods::NONE);
  }
}

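// If \p MI implements an output modifier of its single-use source (per
// isOMod), fold the modifier into the defining instruction and erase \p MI.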
// FIXME: Does this need to check IEEE bit on function?
bool SIFoldOperands::tryFoldOMod(MachineInstr &MI) {
  const MachineOperand *RegOp;
  int OMod;
  std::tie(RegOp, OMod) = isOMod(MI);
  if (OMod == SIOutMods::NONE || !RegOp->isReg() ||
      RegOp->getSubReg() != AMDGPU::NoSubRegister ||
      !MRI->hasOneNonDBGUser(RegOp->getReg()))
    return false;

  MachineInstr *Def = MRI->getVRegDef(RegOp->getReg());
  MachineOperand *DefOMod = TII->getNamedOperand(*Def, AMDGPU::OpName::omod);
  if (!DefOMod || DefOMod->getImm() != SIOutMods::NONE)
    return false;

  // Clamp is applied after omod. If the source already has clamp set, don't
  // fold it.
  if (TII->hasModifiersSet(*Def, AMDGPU::OpName::clamp))
    return false;

  LLVM_DEBUG(dbgs() << "Folding omod " << MI << " into " << *Def);

  DefOMod->setImm(OMod);
  MRI->replaceRegWith(MI.getOperand(0).getReg(), Def->getOperand(0).getReg());
  MI.eraseFromParent();
  return true;
}

1532 // Try to fold a reg_sequence with vgpr output and agpr inputs into an
1533 // instruction which can take an agpr. So far that means a store.
bool SIFoldOperands::tryFoldRegSequence(MachineInstr &MI) {
  assert(MI.isRegSequence());
  auto Reg = MI.getOperand(0).getReg();

  if (!ST->hasGFX90AInsts() || !TRI->isVGPR(*MRI, Reg) ||
      !MRI->hasOneNonDBGUse(Reg))
    return false;

  SmallVector<std::pair<MachineOperand*, unsigned>, 32> Defs;
  if (!getRegSeqInit(Defs, Reg, MCOI::OPERAND_REGISTER, TII, *MRI))
    return false;

  for (auto &Def : Defs) {
    const auto *Op = Def.first;
    if (!Op->isReg())
      return false;
    if (TRI->isAGPR(*MRI, Op->getReg()))
      continue;
    // Maybe this is a COPY from an AGPR.
    const MachineInstr *SubDef = MRI->getVRegDef(Op->getReg());
    if (!SubDef || !SubDef->isCopy() || SubDef->getOperand(1).getSubReg())
      return false;
    if (!TRI->isAGPR(*MRI, SubDef->getOperand(1).getReg()))
      return false;
  }

  // Walk through a chain of single-use copies to find the real user.
  MachineOperand *Op = &*MRI->use_nodbg_begin(Reg);
  MachineInstr *UseMI = Op->getParent();
  while (UseMI->isCopy() && !Op->getSubReg()) {
    Reg = UseMI->getOperand(0).getReg();
    if (!TRI->isVGPR(*MRI, Reg) || !MRI->hasOneNonDBGUse(Reg))
      return false;
    Op = &*MRI->use_nodbg_begin(Reg);
    UseMI = Op->getParent();
  }

  if (Op->getSubReg())
    return false;

  // The use must accept an AGPR in this operand.
  unsigned OpIdx = Op - &UseMI->getOperand(0);
  const MCInstrDesc &InstDesc = UseMI->getDesc();
  const MCOperandInfo &OpInfo = InstDesc.OpInfo[OpIdx];
  switch (OpInfo.RegClass) {
  case AMDGPU::AV_32RegClassID:
  case AMDGPU::AV_64RegClassID:
  case AMDGPU::AV_96RegClassID:
  case AMDGPU::AV_128RegClassID:
  case AMDGPU::AV_160RegClassID:
    break;
  default:
    return false;
  }

  const auto *NewDstRC = TRI->getEquivalentAGPRClass(MRI->getRegClass(Reg));
  auto Dst = MRI->createVirtualRegister(NewDstRC);
  auto RS = BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
                    TII->get(AMDGPU::REG_SEQUENCE), Dst);

  for (unsigned I = 0; I < Defs.size(); ++I) {
    MachineOperand *Def = Defs[I].first;
    Def->setIsKill(false);
    if (TRI->isAGPR(*MRI, Def->getReg())) {
      RS.add(*Def);
    } else { // This is a copy from an AGPR.
      MachineInstr *SubDef = MRI->getVRegDef(Def->getReg());
      SubDef->getOperand(1).setIsKill(false);
      RS.addReg(SubDef->getOperand(1).getReg(), 0, Def->getSubReg());
    }
    RS.addImm(Defs[I].second);
  }

  Op->setReg(Dst);
  if (!TII->isOperandLegal(*UseMI, OpIdx, Op)) {
    Op->setReg(Reg);
    RS->eraseFromParent();
    return false;
  }

  LLVM_DEBUG(dbgs() << "Folded " << *RS << " into " << *UseMI);

  // Erase the REG_SEQUENCE eagerly, unless we followed a chain of COPY users,
  // in which case we can erase them all later in runOnMachineFunction.
  if (MRI->use_nodbg_empty(MI.getOperand(0).getReg()))
    MI.eraseFromParentAndMarkDBGValuesForRemoval();
  return true;
}

// Try to hoist an AGPR to VGPR copy out of the loop across a LCSSA PHI.
// This should allow folding of an AGPR into a consumer which may support it.
// I.e.:
//
// loop:                             // loop:
//   %1:vreg = COPY %0:areg          // exit:
// exit:                          => //   %1:areg = PHI %0:areg, %loop
//   %2:vreg = PHI %1:vreg, %loop    //   %2:vreg = COPY %1:areg
bool SIFoldOperands::tryFoldLCSSAPhi(MachineInstr &PHI) {
  assert(PHI.isPHI());

  if (PHI.getNumExplicitOperands() != 3) // Single input LCSSA PHI
    return false;

  Register PhiIn = PHI.getOperand(1).getReg();
  Register PhiOut = PHI.getOperand(0).getReg();
  if (PHI.getOperand(1).getSubReg() ||
      !TRI->isVGPR(*MRI, PhiIn) || !TRI->isVGPR(*MRI, PhiOut))
    return false;

  // A single use should not matter for correctness, but if the register has
  // another use inside the loop we may end up performing the copy twice in
  // the worst case.
  if (!MRI->hasOneNonDBGUse(PhiIn))
    return false;

  MachineInstr *Copy = MRI->getVRegDef(PhiIn);
  if (!Copy || !Copy->isCopy())
    return false;

  Register CopyIn = Copy->getOperand(1).getReg();
  if (!TRI->isAGPR(*MRI, CopyIn) || Copy->getOperand(1).getSubReg())
    return false;

  // Rewrite the PHI to produce an AGPR and copy to the VGPR after the PHIs.
  const TargetRegisterClass *ARC = MRI->getRegClass(CopyIn);
  Register NewReg = MRI->createVirtualRegister(ARC);
  PHI.getOperand(1).setReg(CopyIn);
  PHI.getOperand(0).setReg(NewReg);

  MachineBasicBlock *MBB = PHI.getParent();
  BuildMI(*MBB, MBB->getFirstNonPHI(), Copy->getDebugLoc(),
          TII->get(AMDGPU::COPY), PhiOut)
    .addReg(NewReg, RegState::Kill);
  Copy->eraseFromParent(); // We know this copy had a single use.

  LLVM_DEBUG(dbgs() << "Folded " << PHI);

  return true;
}

// Attempt to convert a VGPR load to an AGPR load.
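//
// For example (illustrative pseudo-MIR; addressing operands elided):
//
//   %0:vgpr_32 = GLOBAL_LOAD_DWORD %ptr
//   %1:agpr_32 = COPY %0
// =>
//   %0:agpr_32 = GLOBAL_LOAD_DWORD %ptr
//   %1:agpr_32 = COPY %0
//
// Only the register classes are changed here; the now trivial agpr-to-agpr
// copies are left for later passes to clean up.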
bool SIFoldOperands::tryFoldLoad(MachineInstr &MI) {
  assert(MI.mayLoad());
  if (!ST->hasGFX90AInsts() || MI.getNumExplicitDefs() != 1)
    return false;

  MachineOperand &Def = MI.getOperand(0);
  if (!Def.isDef())
    return false;

  Register DefReg = Def.getReg();

  if (DefReg.isPhysical() || !TRI->isVGPR(*MRI, DefReg))
    return false;

  SmallVector<const MachineInstr*, 8> Users;
  SmallVector<Register, 8> MoveRegs;
  for (const MachineInstr &I : MRI->use_nodbg_instructions(DefReg)) {
    Users.push_back(&I);
  }
  if (Users.empty())
    return false;

  // Check that every use is a copy to an agpr or a reg_sequence producing
  // an agpr.
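  // E.g. a chain load -> reg_sequence -> copy -> agpr destination is
  // accepted; any other kind of user rejects the conversion.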
  while (!Users.empty()) {
    const MachineInstr *I = Users.pop_back_val();
    if (!I->isCopy() && !I->isRegSequence())
      return false;
    Register DstReg = I->getOperand(0).getReg();
    if (TRI->isAGPR(*MRI, DstReg))
      continue;
    MoveRegs.push_back(DstReg);
    for (const MachineInstr &U : MRI->use_nodbg_instructions(DstReg)) {
      Users.push_back(&U);
    }
  }

  const TargetRegisterClass *RC = MRI->getRegClass(DefReg);
  MRI->setRegClass(DefReg, TRI->getEquivalentAGPRClass(RC));
  if (!TII->isOperandLegal(MI, 0, &Def)) {
    MRI->setRegClass(DefReg, RC);
    return false;
  }

  // Move all intermediate copy and reg_sequence results over to AGPR classes.
  while (!MoveRegs.empty()) {
    Register Reg = MoveRegs.pop_back_val();
    MRI->setRegClass(Reg, TRI->getEquivalentAGPRClass(MRI->getRegClass(Reg)));
  }

  LLVM_DEBUG(dbgs() << "Folded " << MI);

  return true;
}

bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;

  MRI = &MF.getRegInfo();
  ST = &MF.getSubtarget<GCNSubtarget>();
  TII = ST->getInstrInfo();
  TRI = &TII->getRegisterInfo();
  MFI = MF.getInfo<SIMachineFunctionInfo>();

  // omod is ignored by the hardware if the IEEE bit is enabled. omod also
  // does not correctly handle signed zeros.
  //
  // FIXME: Also need to check strictfp
  bool IsIEEEMode = MFI->getMode().IEEE;
  bool HasNSZ = MFI->hasNoSignedZerosFPMath();

  for (MachineBasicBlock *MBB : depth_first(&MF)) {
    MachineOperand *CurrentKnownM0Val = nullptr;
    for (auto &MI : make_early_inc_range(*MBB)) {
      tryFoldCndMask(MI);

      if (tryFoldZeroHighBits(MI))
        continue;

      if (MI.isRegSequence() && tryFoldRegSequence(MI))
        continue;

      if (MI.isPHI() && tryFoldLCSSAPhi(MI))
        continue;

      if (MI.mayLoad() && tryFoldLoad(MI))
        continue;

      if (!TII->isFoldableCopy(MI)) {
        // Saw an unknown clobber of m0, so we no longer know what it is.
        if (CurrentKnownM0Val && MI.modifiesRegister(AMDGPU::M0, TRI))
          CurrentKnownM0Val = nullptr;

        // TODO: Omod might be OK if there is NSZ only on the source
        // instruction, and not the omod multiply.
        if (IsIEEEMode || (!HasNSZ && !MI.getFlag(MachineInstr::FmNsz)) ||
            !tryFoldOMod(MI))
          tryFoldClamp(MI);

        continue;
      }

      // Specially track simple redefs of m0 to the same value in a block, so
      // we can erase the later ones.
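      // E.g. the second of two "$m0 = S_MOV_B32 %x" defs with identical
      // sources and no intervening clobber of m0 is erased.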
      if (MI.getOperand(0).getReg() == AMDGPU::M0) {
        MachineOperand &NewM0Val = MI.getOperand(1);
        if (CurrentKnownM0Val && CurrentKnownM0Val->isIdenticalTo(NewM0Val)) {
          MI.eraseFromParent();
          continue;
        }

        // We aren't tracking other physical registers
        CurrentKnownM0Val = (NewM0Val.isReg() && NewM0Val.getReg().isPhysical()) ?
          nullptr : &NewM0Val;
        continue;
      }

      MachineOperand &OpToFold = MI.getOperand(1);
      bool FoldingImm =
          OpToFold.isImm() || OpToFold.isFI() || OpToFold.isGlobal();

      // FIXME: We could also be folding things like TargetIndexes.
      if (!FoldingImm && !OpToFold.isReg())
        continue;

      if (OpToFold.isReg() && !OpToFold.getReg().isVirtual())
        continue;

      // Prevent folding operands backwards in the function. For example,
      // the COPY opcode must not be replaced by 1 in this example:
      //
      //    %3 = COPY %vgpr0; VGPR_32:%3
      //    ...
      //    %vgpr0 = V_MOV_B32_e32 1, implicit %exec
      if (!MI.getOperand(0).getReg().isVirtual())
        continue;

      foldInstOperand(MI, OpToFold);

      // If we managed to fold all uses of this copy then we might as well
      // delete it now.
      // The only reason we need to follow chains of copies here is that
      // tryFoldRegSequence looks forward through copies before folding a
      // REG_SEQUENCE into its eventual users.
      auto *InstToErase = &MI;
      while (MRI->use_nodbg_empty(InstToErase->getOperand(0).getReg())) {
        auto &SrcOp = InstToErase->getOperand(1);
        auto SrcReg = SrcOp.isReg() ? SrcOp.getReg() : Register();
        InstToErase->eraseFromParentAndMarkDBGValuesForRemoval();
        InstToErase = nullptr;
        if (!SrcReg || SrcReg.isPhysical())
          break;
        InstToErase = MRI->getVRegDef(SrcReg);
        if (!InstToErase || !TII->isFoldableCopy(*InstToErase))
          break;
      }
      if (InstToErase && InstToErase->isRegSequence() &&
          MRI->use_nodbg_empty(InstToErase->getOperand(0).getReg()))
        InstToErase->eraseFromParentAndMarkDBGValuesForRemoval();
    }
  }
  return true;
}