//===-- SIFoldOperands.cpp - Fold operands -------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
/// \file
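/// This pass folds operands (immediates, frame indexes, global addresses, and
/// copies of registers) into the instructions that use them, and erases moves
/// and copies that become dead as a result.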
//===----------------------------------------------------------------------===//
//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineOperand.h"

#define DEBUG_TYPE "si-fold-operands"
using namespace llvm;

namespace {

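/// A pending fold of an operand into operand \p UseOpNo of \p UseMI. Exactly
/// one of the union members is active, as indicated by \p Kind.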
struct FoldCandidate {
  MachineInstr *UseMI;
  union {
    MachineOperand *OpToFold;
    uint64_t ImmToFold;
    int FrameIndexToFold;
  };
  int ShrinkOpcode;
  unsigned UseOpNo;
  MachineOperand::MachineOperandType Kind;
  bool Commuted;

  FoldCandidate(MachineInstr *MI, unsigned OpNo, MachineOperand *FoldOp,
                bool Commuted_ = false, int ShrinkOp = -1)
      : UseMI(MI), OpToFold(nullptr), ShrinkOpcode(ShrinkOp), UseOpNo(OpNo),
        Kind(FoldOp->getType()), Commuted(Commuted_) {
    if (FoldOp->isImm()) {
      ImmToFold = FoldOp->getImm();
    } else if (FoldOp->isFI()) {
      FrameIndexToFold = FoldOp->getIndex();
    } else {
      assert(FoldOp->isReg() || FoldOp->isGlobal());
      OpToFold = FoldOp;
    }
  }

  bool isFI() const {
    return Kind == MachineOperand::MO_FrameIndex;
  }

  bool isImm() const {
    return Kind == MachineOperand::MO_Immediate;
  }

  bool isReg() const {
    return Kind == MachineOperand::MO_Register;
  }

  bool isGlobal() const { return Kind == MachineOperand::MO_GlobalAddress; }

  bool needsShrink() const { return ShrinkOpcode != -1; }
};

class SIFoldOperands : public MachineFunctionPass {
public:
  static char ID;
  MachineRegisterInfo *MRI;
  const SIInstrInfo *TII;
  const SIRegisterInfo *TRI;
  const GCNSubtarget *ST;
  const SIMachineFunctionInfo *MFI;

  bool frameIndexMayFold(const MachineInstr &UseMI, int OpNo,
                         const MachineOperand &OpToFold) const;

  bool updateOperand(FoldCandidate &Fold) const;

  bool canUseImmWithOpSel(FoldCandidate &Fold) const;

  bool tryFoldImmWithOpSel(FoldCandidate &Fold) const;

  bool tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList,
                        MachineInstr *MI, unsigned OpNo,
                        MachineOperand *OpToFold) const;
  bool isUseSafeToFold(const MachineInstr &MI,
                       const MachineOperand &UseMO) const;
  bool
  getRegSeqInit(SmallVectorImpl<std::pair<MachineOperand *, unsigned>> &Defs,
                Register UseReg, uint8_t OpTy) const;
  bool tryToFoldACImm(const MachineOperand &OpToFold, MachineInstr *UseMI,
                      unsigned UseOpIdx,
                      SmallVectorImpl<FoldCandidate> &FoldList) const;
  void foldOperand(MachineOperand &OpToFold,
                   MachineInstr *UseMI,
                   int UseOpIdx,
                   SmallVectorImpl<FoldCandidate> &FoldList,
                   SmallVectorImpl<MachineInstr *> &CopiesToReplace) const;

  MachineOperand *getImmOrMaterializedImm(MachineOperand &Op) const;
  bool tryConstantFoldOp(MachineInstr *MI) const;
  bool tryFoldCndMask(MachineInstr &MI) const;
  bool tryFoldZeroHighBits(MachineInstr &MI) const;
  bool foldInstOperand(MachineInstr &MI, MachineOperand &OpToFold) const;
  bool tryFoldFoldableCopy(MachineInstr &MI,
                           MachineOperand *&CurrentKnownM0Val) const;

  const MachineOperand *isClamp(const MachineInstr &MI) const;
  bool tryFoldClamp(MachineInstr &MI);

  std::pair<const MachineOperand *, int> isOMod(const MachineInstr &MI) const;
  bool tryFoldOMod(MachineInstr &MI);
  bool tryFoldRegSequence(MachineInstr &MI);
  bool tryFoldPhiAGPR(MachineInstr &MI);
  bool tryFoldLoad(MachineInstr &MI);

  bool tryOptimizeAGPRPhis(MachineBasicBlock &MBB);

public:
  SIFoldOperands() : MachineFunctionPass(ID) {
    initializeSIFoldOperandsPass(*PassRegistry::getPassRegistry());
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "SI Fold Operands"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

} // End anonymous namespace.

INITIALIZE_PASS(SIFoldOperands, DEBUG_TYPE,
                "SI Fold Operands", false, false)

char SIFoldOperands::ID = 0;

char &llvm::SIFoldOperandsID = SIFoldOperands::ID;

static const TargetRegisterClass *getRegOpRC(const MachineRegisterInfo &MRI,
                                             const TargetRegisterInfo &TRI,
                                             const MachineOperand &MO) {
  const TargetRegisterClass *RC = MRI.getRegClass(MO.getReg());
  if (const TargetRegisterClass *SubRC =
          TRI.getSubRegisterClass(RC, MO.getSubReg()))
    RC = SubRC;
  return RC;
}

// Map multiply-accumulate opcode to corresponding multiply-add opcode if any.
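// MAC/FMAC instructions tie src2 to the destination, which blocks folding into
// src2; the corresponding MAD/FMA forms have no tied use, so converting first
// can make a fold legal.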
static unsigned macToMad(unsigned Opc) {
  switch (Opc) {
  case AMDGPU::V_MAC_F32_e64:
    return AMDGPU::V_MAD_F32_e64;
  case AMDGPU::V_MAC_F16_e64:
    return AMDGPU::V_MAD_F16_e64;
  case AMDGPU::V_FMAC_F32_e64:
    return AMDGPU::V_FMA_F32_e64;
  case AMDGPU::V_FMAC_F16_e64:
    return AMDGPU::V_FMA_F16_gfx9_e64;
  case AMDGPU::V_FMAC_F16_t16_e64:
    return AMDGPU::V_FMA_F16_gfx9_e64;
  case AMDGPU::V_FMAC_LEGACY_F32_e64:
    return AMDGPU::V_FMA_LEGACY_F32_e64;
  case AMDGPU::V_FMAC_F64_e64:
    return AMDGPU::V_FMA_F64_e64;
  }
  return AMDGPU::INSTRUCTION_LIST_END;
}

// TODO: Add heuristic that the frame index might not fit in the addressing
// mode immediate offset to avoid materializing in loops.
bool SIFoldOperands::frameIndexMayFold(const MachineInstr &UseMI, int OpNo,
                                       const MachineOperand &OpToFold) const {
  if (!OpToFold.isFI())
    return false;

  const unsigned Opc = UseMI.getOpcode();
  if (TII->isMUBUF(UseMI))
    return OpNo == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
  if (!TII->isFLATScratch(UseMI))
    return false;

  int SIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr);
  if (OpNo == SIdx)
    return true;

  int VIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
  return OpNo == VIdx && SIdx == -1;
}

FunctionPass *llvm::createSIFoldOperandsPass() {
  return new SIFoldOperands();
}

bool SIFoldOperands::canUseImmWithOpSel(FoldCandidate &Fold) const {
  MachineInstr *MI = Fold.UseMI;
  MachineOperand &Old = MI->getOperand(Fold.UseOpNo);
  const uint64_t TSFlags = MI->getDesc().TSFlags;

  assert(Old.isReg() && Fold.isImm());

  if (!(TSFlags & SIInstrFlags::IsPacked) || (TSFlags & SIInstrFlags::IsMAI) ||
      (TSFlags & SIInstrFlags::IsWMMA) || (TSFlags & SIInstrFlags::IsSWMMAC) ||
      (ST->hasDOTOpSelHazard() && (TSFlags & SIInstrFlags::IsDOT)))
    return false;

  unsigned Opcode = MI->getOpcode();
  int OpNo = MI->getOperandNo(&Old);
  uint8_t OpType = TII->get(Opcode).operands()[OpNo].OperandType;
  switch (OpType) {
  default:
    return false;
  case AMDGPU::OPERAND_REG_IMM_V2FP16:
  case AMDGPU::OPERAND_REG_IMM_V2INT16:
  case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
  case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
    break;
  }

  return true;
}

bool SIFoldOperands::tryFoldImmWithOpSel(FoldCandidate &Fold) const {
  MachineInstr *MI = Fold.UseMI;
  MachineOperand &Old = MI->getOperand(Fold.UseOpNo);
  unsigned Opcode = MI->getOpcode();
  int OpNo = MI->getOperandNo(&Old);
  uint8_t OpType = TII->get(Opcode).operands()[OpNo].OperandType;

  // If the literal can be inlined as-is, apply it and short-circuit the
  // tests below. The main motivation for this is to avoid unintuitive
  // uses of opsel.
  if (AMDGPU::isInlinableLiteralV216(Fold.ImmToFold, OpType)) {
    Old.ChangeToImmediate(Fold.ImmToFold);
    return true;
  }

  // Refer to op_sel/op_sel_hi and check if we can change the immediate and
  // op_sel in a way that allows an inline constant.
  int ModIdx = -1;
  unsigned SrcIdx = ~0;
  if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0)) {
    ModIdx = AMDGPU::OpName::src0_modifiers;
    SrcIdx = 0;
  } else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1)) {
    ModIdx = AMDGPU::OpName::src1_modifiers;
    SrcIdx = 1;
  } else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2)) {
    ModIdx = AMDGPU::OpName::src2_modifiers;
    SrcIdx = 2;
  }
  assert(ModIdx != -1);
  ModIdx = AMDGPU::getNamedOperandIdx(Opcode, ModIdx);
  MachineOperand &Mod = MI->getOperand(ModIdx);
  unsigned ModVal = Mod.getImm();

  uint16_t ImmLo = static_cast<uint16_t>(
      Fold.ImmToFold >> (ModVal & SISrcMods::OP_SEL_0 ? 16 : 0));
  uint16_t ImmHi = static_cast<uint16_t>(
      Fold.ImmToFold >> (ModVal & SISrcMods::OP_SEL_1 ? 16 : 0));
  uint32_t Imm = (static_cast<uint32_t>(ImmHi) << 16) | ImmLo;
  unsigned NewModVal = ModVal & ~(SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1);

  // Helper function that attempts to inline the given value with a newly
  // chosen opsel pattern.
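  // On success the lambda rewrites both the op_sel modifier bits and the
  // operand itself; on failure the instruction is left untouched.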
  auto tryFoldToInline = [&](uint32_t Imm) -> bool {
    if (AMDGPU::isInlinableLiteralV216(Imm, OpType)) {
      Mod.setImm(NewModVal | SISrcMods::OP_SEL_1);
      Old.ChangeToImmediate(Imm);
      return true;
    }

    // Try to shuffle the halves around and leverage opsel to get an inline
    // constant.
    uint16_t Lo = static_cast<uint16_t>(Imm);
    uint16_t Hi = static_cast<uint16_t>(Imm >> 16);
    if (Lo == Hi) {
      if (AMDGPU::isInlinableLiteralV216(Lo, OpType)) {
        Mod.setImm(NewModVal);
        Old.ChangeToImmediate(Lo);
        return true;
      }

      if (static_cast<int16_t>(Lo) < 0) {
        int32_t SExt = static_cast<int16_t>(Lo);
        if (AMDGPU::isInlinableLiteralV216(SExt, OpType)) {
          Mod.setImm(NewModVal);
          Old.ChangeToImmediate(SExt);
          return true;
        }
      }

      // This check is only useful for integer instructions.
      if (OpType == AMDGPU::OPERAND_REG_IMM_V2INT16 ||
          OpType == AMDGPU::OPERAND_REG_INLINE_AC_V2INT16) {
        if (AMDGPU::isInlinableLiteralV216(Lo << 16, OpType)) {
          Mod.setImm(NewModVal | SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1);
          Old.ChangeToImmediate(static_cast<uint32_t>(Lo) << 16);
          return true;
        }
      }
    } else {
      uint32_t Swapped = (static_cast<uint32_t>(Lo) << 16) | Hi;
      if (AMDGPU::isInlinableLiteralV216(Swapped, OpType)) {
        Mod.setImm(NewModVal | SISrcMods::OP_SEL_0);
        Old.ChangeToImmediate(Swapped);
        return true;
      }
    }

    return false;
  };

  if (tryFoldToInline(Imm))
    return true;

  // Replace integer addition by subtraction and vice versa if it allows
  // folding the immediate to an inline constant.
  //
  // We should only ever get here for SrcIdx == 1 due to canonicalization
  // earlier in the pipeline, but we double-check here to be safe / fully
  // general.
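  // For example, if the literal only becomes inlinable after negation,
  // v_pk_add_u16 with that literal can be rewritten as v_pk_sub_u16 with the
  // negated literal (and vice versa); without clamping the results agree
  // modulo 2^16.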
  bool IsUAdd = Opcode == AMDGPU::V_PK_ADD_U16;
  bool IsUSub = Opcode == AMDGPU::V_PK_SUB_U16;
  if (SrcIdx == 1 && (IsUAdd || IsUSub)) {
    unsigned ClampIdx =
        AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::clamp);
    bool Clamp = MI->getOperand(ClampIdx).getImm() != 0;

    if (!Clamp) {
      uint16_t NegLo = -static_cast<uint16_t>(Imm);
      uint16_t NegHi = -static_cast<uint16_t>(Imm >> 16);
      uint32_t NegImm = (static_cast<uint32_t>(NegHi) << 16) | NegLo;

      if (tryFoldToInline(NegImm)) {
        unsigned NegOpcode =
            IsUAdd ? AMDGPU::V_PK_SUB_U16 : AMDGPU::V_PK_ADD_U16;
        MI->setDesc(TII->get(NegOpcode));
        return true;
      }
    }
  }

  return false;
}

bool SIFoldOperands::updateOperand(FoldCandidate &Fold) const {
  MachineInstr *MI = Fold.UseMI;
  MachineOperand &Old = MI->getOperand(Fold.UseOpNo);
  assert(Old.isReg());

  if (Fold.isImm() && canUseImmWithOpSel(Fold)) {
    if (tryFoldImmWithOpSel(Fold))
      return true;

    // We can't represent the candidate as an inline constant. Try as a literal
    // with the original opsel, checking constant bus limitations.
    MachineOperand New = MachineOperand::CreateImm(Fold.ImmToFold);
    int OpNo = MI->getOperandNo(&Old);
    if (!TII->isOperandLegal(*MI, OpNo, &New))
      return false;
    Old.ChangeToImmediate(Fold.ImmToFold);
    return true;
  }

  if ((Fold.isImm() || Fold.isFI() || Fold.isGlobal()) && Fold.needsShrink()) {
    MachineBasicBlock *MBB = MI->getParent();
    auto Liveness = MBB->computeRegisterLiveness(TRI, AMDGPU::VCC, MI, 16);
    if (Liveness != MachineBasicBlock::LQR_Dead) {
      LLVM_DEBUG(dbgs() << "Not shrinking " << MI << " due to vcc liveness\n");
      return false;
    }

    int Op32 = Fold.ShrinkOpcode;
    MachineOperand &Dst0 = MI->getOperand(0);
    MachineOperand &Dst1 = MI->getOperand(1);
    assert(Dst0.isDef() && Dst1.isDef());

    bool HaveNonDbgCarryUse = !MRI->use_nodbg_empty(Dst1.getReg());

    const TargetRegisterClass *Dst0RC = MRI->getRegClass(Dst0.getReg());
    Register NewReg0 = MRI->createVirtualRegister(Dst0RC);

    MachineInstr *Inst32 = TII->buildShrunkInst(*MI, Op32);

    if (HaveNonDbgCarryUse) {
      BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(AMDGPU::COPY),
              Dst1.getReg())
          .addReg(AMDGPU::VCC, RegState::Kill);
    }

    // Keep the old instruction around to avoid breaking iterators, but
    // replace it with a dummy instruction to remove uses.
    //
    // FIXME: We should not invert how this pass looks at operands to avoid
    // this. Should track set of foldable movs instead of looking for uses
    // when looking at a use.
    Dst0.setReg(NewReg0);
    for (unsigned I = MI->getNumOperands() - 1; I > 0; --I)
      MI->removeOperand(I);
    MI->setDesc(TII->get(AMDGPU::IMPLICIT_DEF));

    if (Fold.Commuted)
      TII->commuteInstruction(*Inst32, false);
    return true;
  }

  assert(!Fold.needsShrink() && "not handled");

  if (Fold.isImm()) {
    if (Old.isTied()) {
      int NewMFMAOpc = AMDGPU::getMFMAEarlyClobberOp(MI->getOpcode());
      if (NewMFMAOpc == -1)
        return false;
      MI->setDesc(TII->get(NewMFMAOpc));
      MI->untieRegOperand(0);
    }
    Old.ChangeToImmediate(Fold.ImmToFold);
    return true;
  }

  if (Fold.isGlobal()) {
    Old.ChangeToGA(Fold.OpToFold->getGlobal(), Fold.OpToFold->getOffset(),
                   Fold.OpToFold->getTargetFlags());
    return true;
  }

  if (Fold.isFI()) {
    Old.ChangeToFrameIndex(Fold.FrameIndexToFold);
    return true;
  }

  MachineOperand *New = Fold.OpToFold;
  Old.substVirtReg(New->getReg(), New->getSubReg(), *TRI);
  Old.setIsUndef(New->isUndef());
  return true;
}

static bool isUseMIInFoldList(ArrayRef<FoldCandidate> FoldList,
                              const MachineInstr *MI) {
  return any_of(FoldList, [&](const auto &C) { return C.UseMI == MI; });
}

static void appendFoldCandidate(SmallVectorImpl<FoldCandidate> &FoldList,
                                MachineInstr *MI, unsigned OpNo,
                                MachineOperand *FoldOp, bool Commuted = false,
                                int ShrinkOp = -1) {
  // Skip additional folding on the same operand.
  for (FoldCandidate &Fold : FoldList)
    if (Fold.UseMI == MI && Fold.UseOpNo == OpNo)
      return;
  LLVM_DEBUG(dbgs() << "Append " << (Commuted ? "commuted" : "normal")
                    << " operand " << OpNo << "\n  " << *MI);
  FoldList.emplace_back(MI, OpNo, FoldOp, Commuted, ShrinkOp);
}

bool SIFoldOperands::tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList,
                                      MachineInstr *MI, unsigned OpNo,
                                      MachineOperand *OpToFold) const {
  const unsigned Opc = MI->getOpcode();

  auto tryToFoldAsFMAAKorMK = [&]() {
    if (!OpToFold->isImm())
      return false;

    const bool TryAK = OpNo == 3;
    const unsigned NewOpc = TryAK ? AMDGPU::S_FMAAK_F32 : AMDGPU::S_FMAMK_F32;
    MI->setDesc(TII->get(NewOpc));

    // We have to fold into operand which would be Imm not into OpNo.
    bool FoldAsFMAAKorMK =
        tryAddToFoldList(FoldList, MI, TryAK ? 3 : 2, OpToFold);
    if (FoldAsFMAAKorMK) {
      // Untie Src2 of fmac.
      MI->untieRegOperand(3);
      // For fmamk swap operands 1 and 2 if OpToFold was meant for operand 1.
      if (OpNo == 1) {
        MachineOperand &Op1 = MI->getOperand(1);
        MachineOperand &Op2 = MI->getOperand(2);
        Register OldReg = Op1.getReg();
        // Operand 2 might be an inlinable constant
        if (Op2.isImm()) {
          Op1.ChangeToImmediate(Op2.getImm());
          Op2.ChangeToRegister(OldReg, false);
        } else {
          Op1.setReg(Op2.getReg());
          Op2.setReg(OldReg);
        }
      }
      return true;
    }
    MI->setDesc(TII->get(Opc));
    return false;
  };

  bool IsLegal = TII->isOperandLegal(*MI, OpNo, OpToFold);
  if (!IsLegal && OpToFold->isImm()) {
    FoldCandidate Fold(MI, OpNo, OpToFold);
    IsLegal = canUseImmWithOpSel(Fold);
  }

  if (!IsLegal) {
    // Special case for v_mac_{f16, f32}_e64 if we are trying to fold into src2
    unsigned NewOpc = macToMad(Opc);
    if (NewOpc != AMDGPU::INSTRUCTION_LIST_END) {
      // Check if changing this to a v_mad_{f16, f32} instruction will allow us
      // to fold the operand.
      MI->setDesc(TII->get(NewOpc));
      bool AddOpSel = !AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::op_sel) &&
                      AMDGPU::hasNamedOperand(NewOpc, AMDGPU::OpName::op_sel);
      if (AddOpSel)
        MI->addOperand(MachineOperand::CreateImm(0));
      bool FoldAsMAD = tryAddToFoldList(FoldList, MI, OpNo, OpToFold);
      if (FoldAsMAD) {
        MI->untieRegOperand(OpNo);
        return true;
      }
      if (AddOpSel)
        MI->removeOperand(MI->getNumExplicitOperands() - 1);
      MI->setDesc(TII->get(Opc));
    }

    // Special case for s_fmac_f32 if we are trying to fold into Src2.
    // By transforming into fmaak we can untie Src2 and make folding legal.
    if (Opc == AMDGPU::S_FMAC_F32 && OpNo == 3) {
      if (tryToFoldAsFMAAKorMK())
        return true;
    }

    // Special case for s_setreg_b32
    if (OpToFold->isImm()) {
      unsigned ImmOpc = 0;
      if (Opc == AMDGPU::S_SETREG_B32)
        ImmOpc = AMDGPU::S_SETREG_IMM32_B32;
      else if (Opc == AMDGPU::S_SETREG_B32_mode)
        ImmOpc = AMDGPU::S_SETREG_IMM32_B32_mode;
      if (ImmOpc) {
        MI->setDesc(TII->get(ImmOpc));
        appendFoldCandidate(FoldList, MI, OpNo, OpToFold);
        return true;
      }
    }

    // If we are already folding into another operand of MI, then
    // we can't commute the instruction, otherwise we risk making the
    // other fold illegal.
    if (isUseMIInFoldList(FoldList, MI))
      return false;

    // Operand is not legal, so try to commute the instruction to
    // see if this makes it possible to fold.
    unsigned CommuteOpNo = TargetInstrInfo::CommuteAnyOperandIndex;
    bool CanCommute = TII->findCommutedOpIndices(*MI, OpNo, CommuteOpNo);
    if (!CanCommute)
      return false;

    // One of operands might be an Imm operand, and OpNo may refer to it after
    // the call of commuteInstruction() below. Such situations are avoided
    // here explicitly as OpNo must be a register operand to be a candidate
    // for memory folding.
    if (!MI->getOperand(OpNo).isReg() || !MI->getOperand(CommuteOpNo).isReg())
      return false;

    if (!TII->commuteInstruction(*MI, false, OpNo, CommuteOpNo))
      return false;

    int Op32 = -1;
    if (!TII->isOperandLegal(*MI, CommuteOpNo, OpToFold)) {
      if ((Opc != AMDGPU::V_ADD_CO_U32_e64 && Opc != AMDGPU::V_SUB_CO_U32_e64 &&
           Opc != AMDGPU::V_SUBREV_CO_U32_e64) || // FIXME
          (!OpToFold->isImm() && !OpToFold->isFI() && !OpToFold->isGlobal())) {
        TII->commuteInstruction(*MI, false, OpNo, CommuteOpNo);
        return false;
      }

      // Verify the other operand is a VGPR, otherwise we would violate the
      // constant bus restriction.
      MachineOperand &OtherOp = MI->getOperand(OpNo);
      if (!OtherOp.isReg() ||
          !TII->getRegisterInfo().isVGPR(*MRI, OtherOp.getReg()))
        return false;

      assert(MI->getOperand(1).isDef());

      // Make sure to get the 32-bit version of the commuted opcode.
      unsigned MaybeCommutedOpc = MI->getOpcode();
      Op32 = AMDGPU::getVOPe32(MaybeCommutedOpc);
    }

    appendFoldCandidate(FoldList, MI, CommuteOpNo, OpToFold, true, Op32);
    return true;
  }

  // Inlineable constant might have been folded into Imm operand of fmaak or
  // fmamk and we are trying to fold a non-inlinable constant.
  if ((Opc == AMDGPU::S_FMAAK_F32 || Opc == AMDGPU::S_FMAMK_F32) &&
      !OpToFold->isReg() && !TII->isInlineConstant(*OpToFold)) {
    unsigned ImmIdx = Opc == AMDGPU::S_FMAAK_F32 ? 3 : 2;
    MachineOperand &OpImm = MI->getOperand(ImmIdx);
    if (!OpImm.isReg() &&
        TII->isInlineConstant(*MI, MI->getOperand(OpNo), OpImm))
      return tryToFoldAsFMAAKorMK();
  }

  // Special case for s_fmac_f32 if we are trying to fold into Src0 or Src1.
  // By changing into fmamk we can untie Src2.
  // If folding for Src0 happens first and it is identical operand to Src1 we
  // should avoid transforming into fmamk which requires commuting as it would
  // cause folding into Src1 to fail later on due to wrong OpNo used.
  if (Opc == AMDGPU::S_FMAC_F32 &&
      (OpNo != 1 || !MI->getOperand(1).isIdenticalTo(MI->getOperand(2)))) {
    if (tryToFoldAsFMAAKorMK())
      return true;
  }

  // Check the case where we might introduce a second constant operand to a
  // scalar instruction
  if (TII->isSALU(MI->getOpcode())) {
    const MCInstrDesc &InstDesc = MI->getDesc();
    const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo];

    // Fine if the operand can be encoded as an inline constant
    if (!OpToFold->isReg() && !TII->isInlineConstant(*OpToFold, OpInfo)) {
      // Otherwise check for another constant
      for (unsigned i = 0, e = InstDesc.getNumOperands(); i != e; ++i) {
        auto &Op = MI->getOperand(i);
        if (OpNo != i && !Op.isReg() &&
            !TII->isInlineConstant(Op, InstDesc.operands()[i]))
          return false;
      }
    }
  }

  appendFoldCandidate(FoldList, MI, OpNo, OpToFold);
  return true;
}

bool SIFoldOperands::isUseSafeToFold(const MachineInstr &MI,
                                     const MachineOperand &UseMO) const {
  // Operands of SDWA instructions must be registers.
  return !TII->isSDWA(MI);
}

// Find a def of the UseReg, check if it is a reg_sequence, and find the
// initializer for each subreg, tracing each one back to a foldable inline
// immediate if possible. Returns true on success.
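// For example, given
//   %rs:vreg_64 = REG_SEQUENCE %a, %subreg.sub0, %b, %subreg.sub1
// Defs receives one (initializer, subreg index) pair per source, where the
// initializer is either the inline immediate feeding a chain of foldable
// copies or the deepest register that could be traced.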
bool SIFoldOperands::getRegSeqInit(
    SmallVectorImpl<std::pair<MachineOperand *, unsigned>> &Defs,
    Register UseReg, uint8_t OpTy) const {
  MachineInstr *Def = MRI->getVRegDef(UseReg);
  if (!Def || !Def->isRegSequence())
    return false;

  for (unsigned I = 1, E = Def->getNumExplicitOperands(); I < E; I += 2) {
    MachineOperand *Sub = &Def->getOperand(I);
    assert(Sub->isReg());

    for (MachineInstr *SubDef = MRI->getVRegDef(Sub->getReg());
         SubDef && Sub->isReg() && Sub->getReg().isVirtual() &&
         !Sub->getSubReg() && TII->isFoldableCopy(*SubDef);
         SubDef = MRI->getVRegDef(Sub->getReg())) {
      MachineOperand *Op = &SubDef->getOperand(1);
      if (Op->isImm()) {
        if (TII->isInlineConstant(*Op, OpTy))
          Sub = Op;
        break;
      }
      if (!Op->isReg() || Op->getReg().isPhysical())
        break;
      Sub = Op;
    }

    Defs.emplace_back(Sub, Def->getOperand(I + 1).getImm());
  }

  return true;
}

bool SIFoldOperands::tryToFoldACImm(
    const MachineOperand &OpToFold, MachineInstr *UseMI, unsigned UseOpIdx,
    SmallVectorImpl<FoldCandidate> &FoldList) const {
  const MCInstrDesc &Desc = UseMI->getDesc();
  if (UseOpIdx >= Desc.getNumOperands())
    return false;

  if (!AMDGPU::isSISrcInlinableOperand(Desc, UseOpIdx))
    return false;

  uint8_t OpTy = Desc.operands()[UseOpIdx].OperandType;
  if (OpToFold.isImm() && TII->isInlineConstant(OpToFold, OpTy) &&
      TII->isOperandLegal(*UseMI, UseOpIdx, &OpToFold)) {
    UseMI->getOperand(UseOpIdx).ChangeToImmediate(OpToFold.getImm());
    return true;
  }

  if (!OpToFold.isReg())
    return false;

  Register UseReg = OpToFold.getReg();
  if (!UseReg.isVirtual())
    return false;

  if (isUseMIInFoldList(FoldList, UseMI))
    return false;

  // Maybe it is just a COPY of an immediate itself.
  MachineInstr *Def = MRI->getVRegDef(UseReg);
  MachineOperand &UseOp = UseMI->getOperand(UseOpIdx);
  if (!UseOp.getSubReg() && Def && TII->isFoldableCopy(*Def)) {
    MachineOperand &DefOp = Def->getOperand(1);
    if (DefOp.isImm() && TII->isInlineConstant(DefOp, OpTy) &&
        TII->isOperandLegal(*UseMI, UseOpIdx, &DefOp)) {
      UseMI->getOperand(UseOpIdx).ChangeToImmediate(DefOp.getImm());
      return true;
    }
  }

  SmallVector<std::pair<MachineOperand *, unsigned>, 32> Defs;
  if (!getRegSeqInit(Defs, UseReg, OpTy))
    return false;

  int32_t Imm;
  for (unsigned I = 0, E = Defs.size(); I != E; ++I) {
    const MachineOperand *Op = Defs[I].first;
    if (!Op->isImm())
      return false;

    auto SubImm = Op->getImm();
    if (!I) {
      Imm = SubImm;
      if (!TII->isInlineConstant(*Op, OpTy) ||
          !TII->isOperandLegal(*UseMI, UseOpIdx, Op))
        return false;

      continue;
    }
    if (Imm != SubImm)
      return false; // Can only fold splat constants
  }

  appendFoldCandidate(FoldList, UseMI, UseOpIdx, Defs[0].first);
  return true;
}

void SIFoldOperands::foldOperand(
    MachineOperand &OpToFold,
    MachineInstr *UseMI,
    int UseOpIdx,
    SmallVectorImpl<FoldCandidate> &FoldList,
    SmallVectorImpl<MachineInstr *> &CopiesToReplace) const {
  const MachineOperand &UseOp = UseMI->getOperand(UseOpIdx);

  if (!isUseSafeToFold(*UseMI, UseOp))
    return;

  // FIXME: Fold operands with subregs.
  if (UseOp.isReg() && OpToFold.isReg() &&
      (UseOp.isImplicit() || UseOp.getSubReg() != AMDGPU::NoSubRegister))
    return;

  // Special case for REG_SEQUENCE: We can't fold literals into
  // REG_SEQUENCE instructions, so we have to fold them into the
  // uses of REG_SEQUENCE.
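  // For example, a literal that initializes one half of a 64-bit
  // REG_SEQUENCE is folded into the users of the sequence that read the
  // corresponding subregister.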
  if (UseMI->isRegSequence()) {
    Register RegSeqDstReg = UseMI->getOperand(0).getReg();
    unsigned RegSeqDstSubReg = UseMI->getOperand(UseOpIdx + 1).getImm();

    for (auto &RSUse :
         make_early_inc_range(MRI->use_nodbg_operands(RegSeqDstReg))) {
      MachineInstr *RSUseMI = RSUse.getParent();

      if (tryToFoldACImm(UseMI->getOperand(0), RSUseMI,
                         RSUseMI->getOperandNo(&RSUse), FoldList))
        continue;

      if (RSUse.getSubReg() != RegSeqDstSubReg)
        continue;

      foldOperand(OpToFold, RSUseMI, RSUseMI->getOperandNo(&RSUse), FoldList,
                  CopiesToReplace);
    }

    return;
  }

  if (tryToFoldACImm(OpToFold, UseMI, UseOpIdx, FoldList))
    return;

  if (frameIndexMayFold(*UseMI, UseOpIdx, OpToFold)) {
    // Verify that this is a stack access.
    // FIXME: Should probably use stack pseudos before frame lowering.

    if (TII->isMUBUF(*UseMI)) {
      if (TII->getNamedOperand(*UseMI, AMDGPU::OpName::srsrc)->getReg() !=
          MFI->getScratchRSrcReg())
        return;

      // Ensure this is either relative to the current frame or the current
      // wave.
      MachineOperand &SOff =
          *TII->getNamedOperand(*UseMI, AMDGPU::OpName::soffset);
      if (!SOff.isImm() || SOff.getImm() != 0)
        return;
    }

    // A frame index will resolve to a positive constant, so it should always
    // be safe to fold the addressing mode, even pre-GFX9.
    UseMI->getOperand(UseOpIdx).ChangeToFrameIndex(OpToFold.getIndex());

    const unsigned Opc = UseMI->getOpcode();
    if (TII->isFLATScratch(*UseMI) &&
        AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr) &&
        !AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::saddr)) {
      unsigned NewOpc = AMDGPU::getFlatScratchInstSSfromSV(Opc);
      UseMI->setDesc(TII->get(NewOpc));
    }

    return;
  }

  bool FoldingImmLike =
      OpToFold.isImm() || OpToFold.isFI() || OpToFold.isGlobal();

  if (FoldingImmLike && UseMI->isCopy()) {
    Register DestReg = UseMI->getOperand(0).getReg();
    Register SrcReg = UseMI->getOperand(1).getReg();
    assert(SrcReg.isVirtual());

    const TargetRegisterClass *SrcRC = MRI->getRegClass(SrcReg);

    // Don't fold into a copy to a physical register with the same class. Doing
    // so would interfere with the register coalescer's logic which would avoid
    // redundant initializations.
    if (DestReg.isPhysical() && SrcRC->contains(DestReg))
      return;

    const TargetRegisterClass *DestRC = TRI->getRegClassForReg(*MRI, DestReg);
    if (!DestReg.isPhysical()) {
      if (DestRC == &AMDGPU::AGPR_32RegClass &&
          TII->isInlineConstant(OpToFold, AMDGPU::OPERAND_REG_INLINE_C_INT32)) {
        UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64));
        UseMI->getOperand(1).ChangeToImmediate(OpToFold.getImm());
        CopiesToReplace.push_back(UseMI);
        return;
      }
    }

    // In order to fold immediates into copies, we need to change the
    // copy to a MOV.

    unsigned MovOp = TII->getMovOpcode(DestRC);
    if (MovOp == AMDGPU::COPY)
      return;

    UseMI->setDesc(TII->get(MovOp));
    MachineInstr::mop_iterator ImpOpI = UseMI->implicit_operands().begin();
    MachineInstr::mop_iterator ImpOpE = UseMI->implicit_operands().end();
    while (ImpOpI != ImpOpE) {
      MachineInstr::mop_iterator Tmp = ImpOpI;
      ImpOpI++;
      UseMI->removeOperand(UseMI->getOperandNo(Tmp));
    }
    CopiesToReplace.push_back(UseMI);
  } else {
    if (UseMI->isCopy() && OpToFold.isReg() &&
        UseMI->getOperand(0).getReg().isVirtual() &&
        !UseMI->getOperand(1).getSubReg()) {
      LLVM_DEBUG(dbgs() << "Folding " << OpToFold << "\n into " << *UseMI);
      unsigned Size = TII->getOpSize(*UseMI, 1);
      Register UseReg = OpToFold.getReg();
      UseMI->getOperand(1).setReg(UseReg);
      UseMI->getOperand(1).setSubReg(OpToFold.getSubReg());
      UseMI->getOperand(1).setIsKill(false);
      CopiesToReplace.push_back(UseMI);
      OpToFold.setIsKill(false);

      // Remove kill flags as kills may now be out of order with uses.
      MRI->clearKillFlags(OpToFold.getReg());

      // Storing a value into an AGPR is tricky: v_accvgpr_write_b32 can only
      // accept a VGPR or an inline immediate. Recreate a reg_sequence with
      // its initializers right here, so we will rematerialize immediates and
      // avoid copies via different reg classes.
      SmallVector<std::pair<MachineOperand *, unsigned>, 32> Defs;
      if (Size > 4 && TRI->isAGPR(*MRI, UseMI->getOperand(0).getReg()) &&
          getRegSeqInit(Defs, UseReg, AMDGPU::OPERAND_REG_INLINE_C_INT32)) {
        const DebugLoc &DL = UseMI->getDebugLoc();
        MachineBasicBlock &MBB = *UseMI->getParent();

        UseMI->setDesc(TII->get(AMDGPU::REG_SEQUENCE));
        for (unsigned I = UseMI->getNumOperands() - 1; I > 0; --I)
          UseMI->removeOperand(I);

        MachineInstrBuilder B(*MBB.getParent(), UseMI);
        DenseMap<TargetInstrInfo::RegSubRegPair, Register> VGPRCopies;
        SmallSetVector<TargetInstrInfo::RegSubRegPair, 32> SeenAGPRs;
        for (unsigned I = 0; I < Size / 4; ++I) {
          MachineOperand *Def = Defs[I].first;
          TargetInstrInfo::RegSubRegPair CopyToVGPR;
          if (Def->isImm() &&
              TII->isInlineConstant(*Def, AMDGPU::OPERAND_REG_INLINE_C_INT32)) {
            int64_t Imm = Def->getImm();

            auto Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
            BuildMI(MBB, UseMI, DL,
                    TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), Tmp).addImm(Imm);
            B.addReg(Tmp);
          } else if (Def->isReg() && TRI->isAGPR(*MRI, Def->getReg())) {
            auto Src = getRegSubRegPair(*Def);
            Def->setIsKill(false);
            if (!SeenAGPRs.insert(Src)) {
              // We cannot build a reg_sequence out of the same registers, they
              // must be copied. Better do it here before copyPhysReg() created
              // several reads to do the AGPR->VGPR->AGPR copy.
              CopyToVGPR = Src;
            } else {
              B.addReg(Src.Reg, Def->isUndef() ? RegState::Undef : 0,
                       Src.SubReg);
            }
          } else {
            assert(Def->isReg());
            Def->setIsKill(false);
            auto Src = getRegSubRegPair(*Def);

            // Direct copy from SGPR to AGPR is not possible. To avoid creation
            // of exploded copies SGPR->VGPR->AGPR in the copyPhysReg() later,
            // create a copy here and track if we already have such a copy.
            if (TRI->isSGPRReg(*MRI, Src.Reg)) {
              CopyToVGPR = Src;
            } else {
              auto Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
              BuildMI(MBB, UseMI, DL, TII->get(AMDGPU::COPY), Tmp).add(*Def);
              B.addReg(Tmp);
            }
          }

          if (CopyToVGPR.Reg) {
            Register Vgpr;
            if (VGPRCopies.count(CopyToVGPR)) {
              Vgpr = VGPRCopies[CopyToVGPR];
            } else {
              Vgpr = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
              BuildMI(MBB, UseMI, DL, TII->get(AMDGPU::COPY), Vgpr).add(*Def);
              VGPRCopies[CopyToVGPR] = Vgpr;
            }
            auto Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
            BuildMI(MBB, UseMI, DL,
                    TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), Tmp)
                .addReg(Vgpr);
            B.addReg(Tmp);
          }

          B.addImm(Defs[I].second);
        }
        LLVM_DEBUG(dbgs() << "Folded " << *UseMI);
        return;
      }

      if (Size != 4)
        return;

      Register Reg0 = UseMI->getOperand(0).getReg();
      Register Reg1 = UseMI->getOperand(1).getReg();
      if (TRI->isAGPR(*MRI, Reg0) && TRI->isVGPR(*MRI, Reg1))
        UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64));
      else if (TRI->isVGPR(*MRI, Reg0) && TRI->isAGPR(*MRI, Reg1))
        UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_READ_B32_e64));
      else if (ST->hasGFX90AInsts() && TRI->isAGPR(*MRI, Reg0) &&
               TRI->isAGPR(*MRI, Reg1))
        UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_MOV_B32));
      return;
    }

    unsigned UseOpc = UseMI->getOpcode();
    if (UseOpc == AMDGPU::V_READFIRSTLANE_B32 ||
        (UseOpc == AMDGPU::V_READLANE_B32 &&
         (int)UseOpIdx ==
             AMDGPU::getNamedOperandIdx(UseOpc, AMDGPU::OpName::src0))) {
      // %vgpr = V_MOV_B32 imm
      // %sgpr = V_READFIRSTLANE_B32 %vgpr
      // =>
      // %sgpr = S_MOV_B32 imm
      if (FoldingImmLike) {
        if (execMayBeModifiedBeforeUse(*MRI,
                                       UseMI->getOperand(UseOpIdx).getReg(),
                                       *OpToFold.getParent(), *UseMI))
          return;

        UseMI->setDesc(TII->get(AMDGPU::S_MOV_B32));

        if (OpToFold.isImm())
          UseMI->getOperand(1).ChangeToImmediate(OpToFold.getImm());
        else
          UseMI->getOperand(1).ChangeToFrameIndex(OpToFold.getIndex());
        UseMI->removeOperand(2); // Remove exec read (or src1 for readlane)
        return;
      }

      if (OpToFold.isReg() && TRI->isSGPRReg(*MRI, OpToFold.getReg())) {
        if (execMayBeModifiedBeforeUse(*MRI,
                                       UseMI->getOperand(UseOpIdx).getReg(),
                                       *OpToFold.getParent(), *UseMI))
          return;

        // %vgpr = COPY %sgpr0
        // %sgpr1 = V_READFIRSTLANE_B32 %vgpr
        // =>
        // %sgpr1 = COPY %sgpr0
        UseMI->setDesc(TII->get(AMDGPU::COPY));
        UseMI->getOperand(1).setReg(OpToFold.getReg());
        UseMI->getOperand(1).setSubReg(OpToFold.getSubReg());
        UseMI->getOperand(1).setIsKill(false);
        UseMI->removeOperand(2); // Remove exec read (or src1 for readlane)
        return;
      }
    }

    const MCInstrDesc &UseDesc = UseMI->getDesc();

    // Don't fold into target independent nodes. Target independent opcodes
    // don't have defined register classes.
    if (UseDesc.isVariadic() || UseOp.isImplicit() ||
        UseDesc.operands()[UseOpIdx].RegClass == -1)
      return;
  }

  if (!FoldingImmLike) {
    if (OpToFold.isReg() && ST->needsAlignedVGPRs()) {
      // Don't fold if OpToFold doesn't hold an aligned register.
      const TargetRegisterClass *RC =
          TRI->getRegClassForReg(*MRI, OpToFold.getReg());
      if (TRI->hasVectorRegisters(RC) && OpToFold.getSubReg()) {
        unsigned SubReg = OpToFold.getSubReg();
        if (const TargetRegisterClass *SubRC =
                TRI->getSubRegisterClass(RC, SubReg))
          RC = SubRC;
      }

      if (!RC || !TRI->isProperlyAlignedRC(*RC))
        return;
    }

    tryAddToFoldList(FoldList, UseMI, UseOpIdx, &OpToFold);

    // FIXME: We could try to change the instruction from 64-bit to 32-bit
    // to enable more folding opportunities. The shrink operands pass
    // already does this.
    return;
  }

  const MCInstrDesc &FoldDesc = OpToFold.getParent()->getDesc();
  const TargetRegisterClass *FoldRC =
      TRI->getRegClass(FoldDesc.operands()[0].RegClass);

  // Split 64-bit constants into 32 bits for folding.
  if (UseOp.getSubReg() && AMDGPU::getRegBitWidth(*FoldRC) == 64) {
    Register UseReg = UseOp.getReg();
    const TargetRegisterClass *UseRC = MRI->getRegClass(UseReg);
    if (AMDGPU::getRegBitWidth(*UseRC) != 64)
      return;

    APInt Imm(64, OpToFold.getImm());
    if (UseOp.getSubReg() == AMDGPU::sub0) {
      Imm = Imm.getLoBits(32);
    } else {
      assert(UseOp.getSubReg() == AMDGPU::sub1);
      Imm = Imm.getHiBits(32);
    }

    MachineOperand ImmOp = MachineOperand::CreateImm(Imm.getSExtValue());
    tryAddToFoldList(FoldList, UseMI, UseOpIdx, &ImmOp);
    return;
  }

  tryAddToFoldList(FoldList, UseMI, UseOpIdx, &OpToFold);
}

static bool evalBinaryInstruction(unsigned Opcode, int32_t &Result,
                                  uint32_t LHS, uint32_t RHS) {
  switch (Opcode) {
  case AMDGPU::V_AND_B32_e64:
  case AMDGPU::V_AND_B32_e32:
  case AMDGPU::S_AND_B32:
    Result = LHS & RHS;
    return true;
  case AMDGPU::V_OR_B32_e64:
  case AMDGPU::V_OR_B32_e32:
  case AMDGPU::S_OR_B32:
    Result = LHS | RHS;
    return true;
  case AMDGPU::V_XOR_B32_e64:
  case AMDGPU::V_XOR_B32_e32:
  case AMDGPU::S_XOR_B32:
    Result = LHS ^ RHS;
    return true;
  case AMDGPU::S_XNOR_B32:
    Result = ~(LHS ^ RHS);
    return true;
  case AMDGPU::S_NAND_B32:
    Result = ~(LHS & RHS);
    return true;
  case AMDGPU::S_NOR_B32:
    Result = ~(LHS | RHS);
    return true;
  case AMDGPU::S_ANDN2_B32:
    Result = LHS & ~RHS;
    return true;
  case AMDGPU::S_ORN2_B32:
    Result = LHS | ~RHS;
    return true;
  case AMDGPU::V_LSHL_B32_e64:
  case AMDGPU::V_LSHL_B32_e32:
  case AMDGPU::S_LSHL_B32:
    // The instruction ignores the high bits for out of bounds shifts.
    Result = LHS << (RHS & 31);
    return true;
  case AMDGPU::V_LSHLREV_B32_e64:
  case AMDGPU::V_LSHLREV_B32_e32:
    Result = RHS << (LHS & 31);
    return true;
  case AMDGPU::V_LSHR_B32_e64:
  case AMDGPU::V_LSHR_B32_e32:
  case AMDGPU::S_LSHR_B32:
    Result = LHS >> (RHS & 31);
    return true;
  case AMDGPU::V_LSHRREV_B32_e64:
  case AMDGPU::V_LSHRREV_B32_e32:
    Result = RHS >> (LHS & 31);
    return true;
  case AMDGPU::V_ASHR_I32_e64:
  case AMDGPU::V_ASHR_I32_e32:
  case AMDGPU::S_ASHR_I32:
    Result = static_cast<int32_t>(LHS) >> (RHS & 31);
    return true;
  case AMDGPU::V_ASHRREV_I32_e64:
  case AMDGPU::V_ASHRREV_I32_e32:
    Result = static_cast<int32_t>(RHS) >> (LHS & 31);
    return true;
  default:
    return false;
  }
}

static unsigned getMovOpc(bool IsScalar) {
  return IsScalar ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
}

static void mutateCopyOp(MachineInstr &MI, const MCInstrDesc &NewDesc) {
  MI.setDesc(NewDesc);

  // Remove any leftover implicit operands from mutating the instruction. e.g.
  // if we replace an s_and_b32 with a copy, we don't need the implicit scc def
  // anymore.
  const MCInstrDesc &Desc = MI.getDesc();
  unsigned NumOps = Desc.getNumOperands() + Desc.implicit_uses().size() +
                    Desc.implicit_defs().size();

  for (unsigned I = MI.getNumOperands() - 1; I >= NumOps; --I)
    MI.removeOperand(I);
}

MachineOperand *
SIFoldOperands::getImmOrMaterializedImm(MachineOperand &Op) const {
  // If this has a subregister, it obviously is a register source.
  if (!Op.isReg() || Op.getSubReg() != AMDGPU::NoSubRegister ||
      !Op.getReg().isVirtual())
    return &Op;

  MachineInstr *Def = MRI->getVRegDef(Op.getReg());
  if (Def && Def->isMoveImmediate()) {
    MachineOperand &ImmSrc = Def->getOperand(1);
    if (ImmSrc.isImm())
      return &ImmSrc;
  }

  return &Op;
}

// Try to simplify operations with a constant that may appear after instruction
// selection.
// TODO: See if a frame index with a fixed offset can fold.
bool SIFoldOperands::tryConstantFoldOp(MachineInstr *MI) const {
  if (!MI->allImplicitDefsAreDead())
    return false;

  unsigned Opc = MI->getOpcode();

  int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
  if (Src0Idx == -1)
    return false;
  MachineOperand *Src0 = getImmOrMaterializedImm(MI->getOperand(Src0Idx));

  if ((Opc == AMDGPU::V_NOT_B32_e64 || Opc == AMDGPU::V_NOT_B32_e32 ||
       Opc == AMDGPU::S_NOT_B32) &&
      Src0->isImm()) {
    MI->getOperand(1).ChangeToImmediate(~Src0->getImm());
    mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_NOT_B32)));
    return true;
  }

  int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
  if (Src1Idx == -1)
    return false;
  MachineOperand *Src1 = getImmOrMaterializedImm(MI->getOperand(Src1Idx));

  if (!Src0->isImm() && !Src1->isImm())
    return false;

  // and k0, k1 -> v_mov_b32 (k0 & k1)
  // or k0, k1 -> v_mov_b32 (k0 | k1)
  // xor k0, k1 -> v_mov_b32 (k0 ^ k1)
  if (Src0->isImm() && Src1->isImm()) {
    int32_t NewImm;
    if (!evalBinaryInstruction(Opc, NewImm, Src0->getImm(), Src1->getImm()))
      return false;

    bool IsSGPR = TRI->isSGPRReg(*MRI, MI->getOperand(0).getReg());

    // Be careful to change the right operand, src0 may belong to a different
    // instruction.
    MI->getOperand(Src0Idx).ChangeToImmediate(NewImm);
    MI->removeOperand(Src1Idx);
    mutateCopyOp(*MI, TII->get(getMovOpc(IsSGPR)));
    return true;
  }

  if (!MI->isCommutable())
    return false;

  if (Src0->isImm() && !Src1->isImm()) {
    std::swap(Src0, Src1);
    std::swap(Src0Idx, Src1Idx);
  }

  int32_t Src1Val = static_cast<int32_t>(Src1->getImm());
  if (Opc == AMDGPU::V_OR_B32_e64 ||
      Opc == AMDGPU::V_OR_B32_e32 ||
      Opc == AMDGPU::S_OR_B32) {
    if (Src1Val == 0) {
      // y = or x, 0 => y = copy x
      MI->removeOperand(Src1Idx);
      mutateCopyOp(*MI, TII->get(AMDGPU::COPY));
    } else if (Src1Val == -1) {
      // y = or x, -1 => y = v_mov_b32 -1
      MI->removeOperand(Src1Idx);
      mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_OR_B32)));
    } else
      return false;

    return true;
  }

  if (Opc == AMDGPU::V_AND_B32_e64 || Opc == AMDGPU::V_AND_B32_e32 ||
      Opc == AMDGPU::S_AND_B32) {
    if (Src1Val == 0) {
      // y = and x, 0 => y = v_mov_b32 0
      MI->removeOperand(Src0Idx);
      mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_AND_B32)));
    } else if (Src1Val == -1) {
      // y = and x, -1 => y = copy x
      MI->removeOperand(Src1Idx);
      mutateCopyOp(*MI, TII->get(AMDGPU::COPY));
    } else
      return false;

    return true;
  }

  if (Opc == AMDGPU::V_XOR_B32_e64 || Opc == AMDGPU::V_XOR_B32_e32 ||
      Opc == AMDGPU::S_XOR_B32) {
    if (Src1Val == 0) {
      // y = xor x, 0 => y = copy x
      MI->removeOperand(Src1Idx);
      mutateCopyOp(*MI, TII->get(AMDGPU::COPY));
      return true;
    }
  }

  return false;
}

// Try to fold an instruction into a simpler one.
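// e.g. a V_CNDMASK_B32 whose two source operands are identical (and whose
// source modifiers are clear) produces the same value on every lane, so it
// reduces to a copy or mov of that source.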
bool SIFoldOperands::tryFoldCndMask(MachineInstr &MI) const {
  unsigned Opc = MI.getOpcode();
  if (Opc != AMDGPU::V_CNDMASK_B32_e32 && Opc != AMDGPU::V_CNDMASK_B32_e64 &&
      Opc != AMDGPU::V_CNDMASK_B64_PSEUDO)
    return false;

  MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
  MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
  if (!Src1->isIdenticalTo(*Src0)) {
    auto *Src0Imm = getImmOrMaterializedImm(*Src0);
    auto *Src1Imm = getImmOrMaterializedImm(*Src1);
    if (!Src1Imm->isIdenticalTo(*Src0Imm))
      return false;
  }

  int Src1ModIdx =
      AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1_modifiers);
  int Src0ModIdx =
      AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0_modifiers);
  if ((Src1ModIdx != -1 && MI.getOperand(Src1ModIdx).getImm() != 0) ||
      (Src0ModIdx != -1 && MI.getOperand(Src0ModIdx).getImm() != 0))
    return false;

  LLVM_DEBUG(dbgs() << "Folded " << MI << " into ");
  auto &NewDesc =
      TII->get(Src0->isReg() ? (unsigned)AMDGPU::COPY : getMovOpc(false));
  int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
  if (Src2Idx != -1)
    MI.removeOperand(Src2Idx);
  MI.removeOperand(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1));
  if (Src1ModIdx != -1)
    MI.removeOperand(Src1ModIdx);
  if (Src0ModIdx != -1)
    MI.removeOperand(Src0ModIdx);
  mutateCopyOp(MI, NewDesc);
  LLVM_DEBUG(dbgs() << MI);
  return true;
}

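// Fold away a mask of the low 16 bits (and x, 0xffff) when the instruction
// defining x is already known to zero the high 16 bits of its result.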
bool SIFoldOperands::tryFoldZeroHighBits(MachineInstr &MI) const {
  if (MI.getOpcode() != AMDGPU::V_AND_B32_e64 &&
      MI.getOpcode() != AMDGPU::V_AND_B32_e32)
    return false;

  MachineOperand *Src0 = getImmOrMaterializedImm(MI.getOperand(1));
  if (!Src0->isImm() || Src0->getImm() != 0xffff)
    return false;

  Register Src1 = MI.getOperand(2).getReg();
  MachineInstr *SrcDef = MRI->getVRegDef(Src1);
  if (!ST->zeroesHigh16BitsOfDest(SrcDef->getOpcode()))
    return false;

  Register Dst = MI.getOperand(0).getReg();
  MRI->replaceRegWith(Dst, SrcDef->getOperand(0).getReg());
  MI.eraseFromParent();
  return true;
}

bool SIFoldOperands::foldInstOperand(MachineInstr &MI,
                                     MachineOperand &OpToFold) const {
  // We need to mutate the operands of new mov instructions to add implicit
  // uses of EXEC, but adding them invalidates the use_iterator, so defer
  // this.
  SmallVector<MachineInstr *, 4> CopiesToReplace;
  SmallVector<FoldCandidate, 4> FoldList;
  MachineOperand &Dst = MI.getOperand(0);
  bool Changed = false;

  if (OpToFold.isImm()) {
    for (auto &UseMI :
         make_early_inc_range(MRI->use_nodbg_instructions(Dst.getReg()))) {
      // Folding the immediate may reveal operations that can be constant
      // folded or replaced with a copy. This can happen for example after
      // frame indices are lowered to constants or from splitting 64-bit
      // constants.
      //
      // We may also encounter cases where one or both operands are
      // immediates materialized into a register, which would ordinarily not
      // be folded due to multiple uses or operand constraints.
      if (tryConstantFoldOp(&UseMI)) {
        LLVM_DEBUG(dbgs() << "Constant folded " << UseMI);
        Changed = true;
      }
    }
  }

  SmallVector<MachineOperand *, 4> UsesToProcess;
  for (auto &Use : MRI->use_nodbg_operands(Dst.getReg()))
    UsesToProcess.push_back(&Use);
  for (auto *U : UsesToProcess) {
    MachineInstr *UseMI = U->getParent();
    foldOperand(OpToFold, UseMI, UseMI->getOperandNo(U), FoldList,
                CopiesToReplace);
  }

  if (CopiesToReplace.empty() && FoldList.empty())
    return Changed;

  MachineFunction *MF = MI.getParent()->getParent();
  // Make sure we add EXEC uses to any new v_mov instructions created.
  for (MachineInstr *Copy : CopiesToReplace)
    Copy->addImplicitDefUseOperands(*MF);

  for (FoldCandidate &Fold : FoldList) {
    assert(!Fold.isReg() || Fold.OpToFold);
    if (Fold.isReg() && Fold.OpToFold->getReg().isVirtual()) {
      Register Reg = Fold.OpToFold->getReg();
      MachineInstr *DefMI = Fold.OpToFold->getParent();
      if (DefMI->readsRegister(AMDGPU::EXEC, TRI) &&
          execMayBeModifiedBeforeUse(*MRI, Reg, *DefMI, *Fold.UseMI))
        continue;
    }
    if (updateOperand(Fold)) {
      // Clear kill flags.
      if (Fold.isReg()) {
        assert(Fold.OpToFold && Fold.OpToFold->isReg());
        // FIXME: Probably shouldn't bother trying to fold if not an
        // SGPR. PeepholeOptimizer can eliminate redundant VGPR->VGPR
        // copies.
        MRI->clearKillFlags(Fold.OpToFold->getReg());
      }
      LLVM_DEBUG(dbgs() << "Folded source from " << MI << " into OpNo "
                        << static_cast<int>(Fold.UseOpNo) << " of "
                        << *Fold.UseMI);
    } else if (Fold.Commuted) {
      // Restoring instruction's original operand order if fold has failed.
      TII->commuteInstruction(*Fold.UseMI, false);
    }
  }
  return true;
}

bool SIFoldOperands::tryFoldFoldableCopy(
    MachineInstr &MI, MachineOperand *&CurrentKnownM0Val) const {
  // Specially track simple redefs of m0 to the same value in a block, so we
  // can erase the later ones.
  if (MI.getOperand(0).getReg() == AMDGPU::M0) {
    MachineOperand &NewM0Val = MI.getOperand(1);
    if (CurrentKnownM0Val && CurrentKnownM0Val->isIdenticalTo(NewM0Val)) {
      MI.eraseFromParent();
      return true;
    }

    // We aren't tracking other physical registers.
    CurrentKnownM0Val = (NewM0Val.isReg() && NewM0Val.getReg().isPhysical())
                            ? nullptr
                            : &NewM0Val;
    return false;
  }

  MachineOperand &OpToFold = MI.getOperand(1);
  bool FoldingImm = OpToFold.isImm() || OpToFold.isFI() || OpToFold.isGlobal();

  // FIXME: We could also be folding things like TargetIndexes.
  if (!FoldingImm && !OpToFold.isReg())
    return false;

  if (OpToFold.isReg() && !OpToFold.getReg().isVirtual())
    return false;

  // Prevent folding operands backwards in the function. For example,
  // the COPY opcode must not be replaced by 1 in this example:
  //
  //    %3 = COPY %vgpr0; VGPR_32:%3
  //    ...
  //    %vgpr0 = V_MOV_B32_e32 1, implicit %exec
  if (!MI.getOperand(0).getReg().isVirtual())
    return false;

  bool Changed = foldInstOperand(MI, OpToFold);

  // If we managed to fold all uses of this copy then we might as well
  // delete it now.
  // The only reason we need to follow chains of copies here is that
  // tryFoldRegSequence looks forward through copies before folding a
  // REG_SEQUENCE into its eventual users.
  auto *InstToErase = &MI;
  while (MRI->use_nodbg_empty(InstToErase->getOperand(0).getReg())) {
    auto &SrcOp = InstToErase->getOperand(1);
    auto SrcReg = SrcOp.isReg() ? SrcOp.getReg() : Register();
    InstToErase->eraseFromParent();
    Changed = true;
    InstToErase = nullptr;
    if (!SrcReg || SrcReg.isPhysical())
      break;
    InstToErase = MRI->getVRegDef(SrcReg);
    if (!InstToErase || !TII->isFoldableCopy(*InstToErase))
      break;
  }

  if (InstToErase && InstToErase->isRegSequence() &&
      MRI->use_nodbg_empty(InstToErase->getOperand(0).getReg())) {
    InstToErase->eraseFromParent();
    Changed = true;
  }

  return Changed;
}

// Clamp patterns are canonically selected to v_max_* instructions, so only
// handle them.
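// e.g. clamp(%x) is selected as V_MAX_F32 %x, %x with the clamp bit set, and
// isClamp returns the common source operand of such an instruction.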
const MachineOperand *SIFoldOperands::isClamp(const MachineInstr &MI) const {
  unsigned Op = MI.getOpcode();
  switch (Op) {
  case AMDGPU::V_MAX_F32_e64:
  case AMDGPU::V_MAX_F16_e64:
  case AMDGPU::V_MAX_F16_t16_e64:
  case AMDGPU::V_MAX_F16_fake16_e64:
  case AMDGPU::V_MAX_F64_e64:
  case AMDGPU::V_MAX_NUM_F64_e64:
  case AMDGPU::V_PK_MAX_F16: {
    if (!TII->getNamedOperand(MI, AMDGPU::OpName::clamp)->getImm())
      return nullptr;

    // Make sure sources are identical.
    const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    if (!Src0->isReg() || !Src1->isReg() ||
        Src0->getReg() != Src1->getReg() ||
        Src0->getSubReg() != Src1->getSubReg() ||
        Src0->getSubReg() != AMDGPU::NoSubRegister)
      return nullptr;

    // Can't fold up if we have modifiers.
    if (TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
      return nullptr;

    unsigned Src0Mods
      = TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm();
    unsigned Src1Mods
      = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers)->getImm();

    // Having a 0 op_sel_hi would require swizzling the output in the source
    // instruction, which we can't do.
    unsigned UnsetMods = (Op == AMDGPU::V_PK_MAX_F16) ? SISrcMods::OP_SEL_1
                                                      : 0u;
    if (Src0Mods != UnsetMods && Src1Mods != UnsetMods)
      return nullptr;
    return Src0;
  }
  default:
    return nullptr;
  }
}

// FIXME: Clamp for v_mad_mixhi_f16 handled during isel.
bool SIFoldOperands::tryFoldClamp(MachineInstr &MI) {
  const MachineOperand *ClampSrc = isClamp(MI);
  if (!ClampSrc || !MRI->hasOneNonDBGUser(ClampSrc->getReg()))
    return false;

  MachineInstr *Def = MRI->getVRegDef(ClampSrc->getReg());

  // The type of clamp must be compatible.
  if (TII->getClampMask(*Def) != TII->getClampMask(MI))
    return false;

  MachineOperand *DefClamp = TII->getNamedOperand(*Def, AMDGPU::OpName::clamp);
  if (!DefClamp)
    return false;

  LLVM_DEBUG(dbgs() << "Folding clamp " << *DefClamp << " into " << *Def);

  // Clamp is applied after omod, so it is OK if omod is set.
  DefClamp->setImm(1);
  MRI->replaceRegWith(MI.getOperand(0).getReg(), Def->getOperand(0).getReg());
  MI.eraseFromParent();

  // Use of output modifiers forces VOP3 encoding for a VOP2 mac/fmac
  // instruction, so we might as well convert it to the more flexible VOP3-only
  // mad/fma form.
  if (TII->convertToThreeAddress(*Def, nullptr, nullptr))
    Def->eraseFromParent();

  return true;
}

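// Map a multiplier constant to the matching output-modifier (omod) encoding:
// 0.5 -> DIV2, 2.0 -> MUL2, 4.0 -> MUL4; anything else has no omod form.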
static int getOModValue(unsigned Opc, int64_t Val) {
  switch (Opc) {
  case AMDGPU::V_MUL_F64_e64:
  case AMDGPU::V_MUL_F64_pseudo_e64: {
    switch (Val) {
    case 0x3fe0000000000000: // 0.5
      return SIOutMods::DIV2;
    case 0x4000000000000000: // 2.0
      return SIOutMods::MUL2;
    case 0x4010000000000000: // 4.0
      return SIOutMods::MUL4;
    default:
      return SIOutMods::NONE;
    }
  }
  case AMDGPU::V_MUL_F32_e64: {
    switch (static_cast<uint32_t>(Val)) {
    case 0x3f000000: // 0.5
      return SIOutMods::DIV2;
    case 0x40000000: // 2.0
      return SIOutMods::MUL2;
    case 0x40800000: // 4.0
      return SIOutMods::MUL4;
    default:
      return SIOutMods::NONE;
    }
  }
  case AMDGPU::V_MUL_F16_e64:
  case AMDGPU::V_MUL_F16_t16_e64:
  case AMDGPU::V_MUL_F16_fake16_e64: {
    switch (static_cast<uint16_t>(Val)) {
    case 0x3800: // 0.5
      return SIOutMods::DIV2;
    case 0x4000: // 2.0
      return SIOutMods::MUL2;
    case 0x4400: // 4.0
      return SIOutMods::MUL4;
    default:
      return SIOutMods::NONE;
    }
  }
  default:
    llvm_unreachable("invalid mul opcode");
  }
}

// FIXME: Does this really not support denormals with f16?
// FIXME: Does this need to check IEEE mode bit? SNaNs are generally not
// handled, so will anything other than that break?
std::pair<const MachineOperand *, int>
SIFoldOperands::isOMod(const MachineInstr &MI) const {
  unsigned Op = MI.getOpcode();
  switch (Op) {
  case AMDGPU::V_MUL_F64_e64:
  case AMDGPU::V_MUL_F64_pseudo_e64:
  case AMDGPU::V_MUL_F32_e64:
  case AMDGPU::V_MUL_F16_t16_e64:
  case AMDGPU::V_MUL_F16_fake16_e64:
  case AMDGPU::V_MUL_F16_e64: {
    // If output denormals are enabled, omod is ignored.
    if ((Op == AMDGPU::V_MUL_F32_e64 &&
         MFI->getMode().FP32Denormals.Output != DenormalMode::PreserveSign) ||
        ((Op == AMDGPU::V_MUL_F64_e64 || Op == AMDGPU::V_MUL_F64_pseudo_e64 ||
          Op == AMDGPU::V_MUL_F16_e64 || Op == AMDGPU::V_MUL_F16_t16_e64 ||
          Op == AMDGPU::V_MUL_F16_fake16_e64) &&
         MFI->getMode().FP64FP16Denormals.Output != DenormalMode::PreserveSign))
      return std::pair(nullptr, SIOutMods::NONE);

    const MachineOperand *RegOp = nullptr;
    const MachineOperand *ImmOp = nullptr;
    const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    if (Src0->isImm()) {
      ImmOp = Src0;
      RegOp = Src1;
    } else if (Src1->isImm()) {
      ImmOp = Src1;
      RegOp = Src0;
    } else
      return std::pair(nullptr, SIOutMods::NONE);

    int OMod = getOModValue(Op, ImmOp->getImm());
    if (OMod == SIOutMods::NONE ||
        TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) ||
        TII->hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) ||
        TII->hasModifiersSet(MI, AMDGPU::OpName::omod) ||
        TII->hasModifiersSet(MI, AMDGPU::OpName::clamp))
      return std::pair(nullptr, SIOutMods::NONE);

    return std::pair(RegOp, OMod);
  }
  case AMDGPU::V_ADD_F64_e64:
  case AMDGPU::V_ADD_F64_pseudo_e64:
  case AMDGPU::V_ADD_F32_e64:
  case AMDGPU::V_ADD_F16_e64:
  case AMDGPU::V_ADD_F16_t16_e64:
  case AMDGPU::V_ADD_F16_fake16_e64: {
    // If output denormals are enabled, omod is ignored.
    if ((Op == AMDGPU::V_ADD_F32_e64 &&
         MFI->getMode().FP32Denormals.Output != DenormalMode::PreserveSign) ||
        ((Op == AMDGPU::V_ADD_F64_e64 || Op == AMDGPU::V_ADD_F64_pseudo_e64 ||
          Op == AMDGPU::V_ADD_F16_e64 || Op == AMDGPU::V_ADD_F16_t16_e64 ||
          Op == AMDGPU::V_ADD_F16_fake16_e64) &&
         MFI->getMode().FP64FP16Denormals.Output != DenormalMode::PreserveSign))
      return std::pair(nullptr, SIOutMods::NONE);

    // Look through the DAGCombiner canonicalization fmul x, 2 -> fadd x, x
    const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);

    if (Src0->isReg() && Src1->isReg() && Src0->getReg() == Src1->getReg() &&
        Src0->getSubReg() == Src1->getSubReg() &&
        !TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) &&
        !TII->hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) &&
        !TII->hasModifiersSet(MI, AMDGPU::OpName::clamp) &&
        !TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
      return std::pair(Src0, SIOutMods::MUL2);

    return std::pair(nullptr, SIOutMods::NONE);
  }
  default:
    return std::pair(nullptr, SIOutMods::NONE);
  }
}

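// A sketch of the rewrite tryFoldOMod performs, with the VOP3 operand lists
// abbreviated to source modifiers, sources, clamp and omod:
//   %1:vgpr_32 = V_ADD_F32_e64 0, %a, 0, %b, /*clamp*/ 0, /*omod*/ 0
//   %2:vgpr_32 = V_MUL_F32_e64 0, 2.0, 0, %1, /*clamp*/ 0, /*omod*/ 0
// =>
//   %1:vgpr_32 = V_ADD_F32_e64 0, %a, 0, %b, /*clamp*/ 0, /*omod*/ MUL2
// Uses of %2 are rewritten to use %1, and the V_MUL is erased.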
// FIXME: Does this need to check IEEE bit on function?
bool SIFoldOperands::tryFoldOMod(MachineInstr &MI) {
  const MachineOperand *RegOp;
  int OMod;
  std::tie(RegOp, OMod) = isOMod(MI);
  if (OMod == SIOutMods::NONE || !RegOp->isReg() ||
      RegOp->getSubReg() != AMDGPU::NoSubRegister ||
      !MRI->hasOneNonDBGUser(RegOp->getReg()))
    return false;

  MachineInstr *Def = MRI->getVRegDef(RegOp->getReg());
  MachineOperand *DefOMod = TII->getNamedOperand(*Def, AMDGPU::OpName::omod);
  if (!DefOMod || DefOMod->getImm() != SIOutMods::NONE)
    return false;

  // Clamp is applied after omod. If the source already has clamp set, don't
  // fold it.
  if (TII->hasModifiersSet(*Def, AMDGPU::OpName::clamp))
    return false;

  LLVM_DEBUG(dbgs() << "Folding omod " << MI << " into " << *Def);

  DefOMod->setImm(OMod);
  MRI->replaceRegWith(MI.getOperand(0).getReg(), Def->getOperand(0).getReg());
  MI.eraseFromParent();

  // Use of output modifiers forces VOP3 encoding for a VOP2 mac/fmac
  // instruction, so we might as well convert it to the more flexible VOP3-only
  // mad/fma form.
  if (TII->convertToThreeAddress(*Def, nullptr, nullptr))
    Def->eraseFromParent();

  return true;
}

// Try to fold a reg_sequence with vgpr output and agpr inputs into an
// instruction which can take an agpr. So far that means a store.
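//
// A sketch of the intended rewrite, assuming %a and %b are AGPRs and the
// store accepts an AGPR data operand (trailing store operands elided):
//   %0:vreg_64 = REG_SEQUENCE %a:agpr_32, %subreg.sub0, %b:agpr_32, %subreg.sub1
//   GLOBAL_STORE_DWORDX2 %ptr, %0, ...
// =>
//   %1:areg_64 = REG_SEQUENCE %a, %subreg.sub0, %b, %subreg.sub1
//   GLOBAL_STORE_DWORDX2 %ptr, %1, ...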
bool SIFoldOperands::tryFoldRegSequence(MachineInstr &MI) {
  assert(MI.isRegSequence());
  auto Reg = MI.getOperand(0).getReg();

  if (!ST->hasGFX90AInsts() || !TRI->isVGPR(*MRI, Reg) ||
      !MRI->hasOneNonDBGUse(Reg))
    return false;

  SmallVector<std::pair<MachineOperand*, unsigned>, 32> Defs;
  if (!getRegSeqInit(Defs, Reg, MCOI::OPERAND_REGISTER))
    return false;

  for (auto &Def : Defs) {
    const auto *Op = Def.first;
    if (!Op->isReg())
      return false;
    if (TRI->isAGPR(*MRI, Op->getReg()))
      continue;
    // Maybe this is a COPY from AREG
    const MachineInstr *SubDef = MRI->getVRegDef(Op->getReg());
    if (!SubDef || !SubDef->isCopy() || SubDef->getOperand(1).getSubReg())
      return false;
    if (!TRI->isAGPR(*MRI, SubDef->getOperand(1).getReg()))
      return false;
  }

  MachineOperand *Op = &*MRI->use_nodbg_begin(Reg);
  MachineInstr *UseMI = Op->getParent();
  while (UseMI->isCopy() && !Op->getSubReg()) {
    Reg = UseMI->getOperand(0).getReg();
    if (!TRI->isVGPR(*MRI, Reg) || !MRI->hasOneNonDBGUse(Reg))
      return false;
    Op = &*MRI->use_nodbg_begin(Reg);
    UseMI = Op->getParent();
  }

  if (Op->getSubReg())
    return false;

  unsigned OpIdx = Op - &UseMI->getOperand(0);
  const MCInstrDesc &InstDesc = UseMI->getDesc();
  const TargetRegisterClass *OpRC =
      TII->getRegClass(InstDesc, OpIdx, TRI, *MI.getMF());
  if (!OpRC || !TRI->isVectorSuperClass(OpRC))
    return false;

  const auto *NewDstRC = TRI->getEquivalentAGPRClass(MRI->getRegClass(Reg));
  auto Dst = MRI->createVirtualRegister(NewDstRC);
  auto RS = BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
                    TII->get(AMDGPU::REG_SEQUENCE), Dst);

  for (unsigned I = 0; I < Defs.size(); ++I) {
    MachineOperand *Def = Defs[I].first;
    Def->setIsKill(false);
    if (TRI->isAGPR(*MRI, Def->getReg())) {
      RS.add(*Def);
    } else { // This is a copy
      MachineInstr *SubDef = MRI->getVRegDef(Def->getReg());
      SubDef->getOperand(1).setIsKill(false);
      RS.addReg(SubDef->getOperand(1).getReg(), 0, Def->getSubReg());
    }
    RS.addImm(Defs[I].second);
  }

  Op->setReg(Dst);
  if (!TII->isOperandLegal(*UseMI, OpIdx, Op)) {
    Op->setReg(Reg);
    RS->eraseFromParent();
    return false;
  }

  LLVM_DEBUG(dbgs() << "Folded " << *RS << " into " << *UseMI);

  // Erase the REG_SEQUENCE eagerly, unless we followed a chain of COPY users,
  // in which case we can erase them all later in runOnMachineFunction.
  if (MRI->use_nodbg_empty(MI.getOperand(0).getReg()))
    MI.eraseFromParent();
  return true;
}

/// Checks whether \p Copy is an AGPR -> VGPR copy. Returns `true` on success
/// and stores the AGPR register in \p OutReg and the subreg in \p OutSubReg.
static bool isAGPRCopy(const SIRegisterInfo &TRI,
                       const MachineRegisterInfo &MRI, const MachineInstr &Copy,
                       Register &OutReg, unsigned &OutSubReg) {
  assert(Copy.isCopy());

  const MachineOperand &CopySrc = Copy.getOperand(1);
  Register CopySrcReg = CopySrc.getReg();
  if (!CopySrcReg.isVirtual())
    return false;

  // Common case: copy from AGPR directly, e.g.
  //  %1:vgpr_32 = COPY %0:agpr_32
  if (TRI.isAGPR(MRI, CopySrcReg)) {
    OutReg = CopySrcReg;
    OutSubReg = CopySrc.getSubReg();
    return true;
  }

  // Sometimes it can also involve two copies, e.g.
  //  %1:vgpr_256 = COPY %0:agpr_256
  //  %2:vgpr_32 = COPY %1:vgpr_256.sub0
  const MachineInstr *CopySrcDef = MRI.getVRegDef(CopySrcReg);
  if (!CopySrcDef || !CopySrcDef->isCopy())
    return false;

  const MachineOperand &OtherCopySrc = CopySrcDef->getOperand(1);
  Register OtherCopySrcReg = OtherCopySrc.getReg();
  if (!OtherCopySrcReg.isVirtual() ||
      CopySrcDef->getOperand(0).getSubReg() != AMDGPU::NoSubRegister ||
      OtherCopySrc.getSubReg() != AMDGPU::NoSubRegister ||
      !TRI.isAGPR(MRI, OtherCopySrcReg))
    return false;

  OutReg = OtherCopySrcReg;
  OutSubReg = CopySrc.getSubReg();
  return true;
}

// Try to hoist an AGPR to VGPR copy across a PHI.
// This should allow folding of an AGPR into a consumer which may support it.
//
// Example 1: LCSSA PHI
//  loop:
//    %1:vreg = COPY %0:areg
//  exit:
//    %2:vreg = PHI %1:vreg, %loop
// =>
//  loop:
//  exit:
//    %1:areg = PHI %0:areg, %loop
//    %2:vreg = COPY %1:areg
//
// Example 2: PHI with multiple incoming values:
//  entry:
//    %1:vreg = GLOBAL_LOAD(..)
//  loop:
//    %2:vreg = PHI %1:vreg, %entry, %5:vreg, %loop
//    %3:areg = COPY %2:vreg
//    %4:areg = (instr using %3:areg)
//    %5:vreg = COPY %4:areg
// =>
//  entry:
//    %1:vreg = GLOBAL_LOAD(..)
//    %2:areg = COPY %1:vreg
//  loop:
//    %3:areg = PHI %2:areg, %entry, %4:areg, %loop
//    %4:areg = (instr using %3:areg)
bool SIFoldOperands::tryFoldPhiAGPR(MachineInstr &PHI) {
  assert(PHI.isPHI());

  Register PhiOut = PHI.getOperand(0).getReg();
  if (!TRI->isVGPR(*MRI, PhiOut))
    return false;

  // Iterate once over all incoming values of the PHI to check if this PHI is
  // eligible, and determine the exact AGPR RC we'll target.
  const TargetRegisterClass *ARC = nullptr;
  for (unsigned K = 1; K < PHI.getNumExplicitOperands(); K += 2) {
    MachineOperand &MO = PHI.getOperand(K);
    MachineInstr *Copy = MRI->getVRegDef(MO.getReg());
    if (!Copy || !Copy->isCopy())
      continue;

    Register AGPRSrc;
    unsigned AGPRRegMask = AMDGPU::NoSubRegister;
    if (!isAGPRCopy(*TRI, *MRI, *Copy, AGPRSrc, AGPRRegMask))
      continue;

    const TargetRegisterClass *CopyInRC = MRI->getRegClass(AGPRSrc);
    if (const auto *SubRC = TRI->getSubRegisterClass(CopyInRC, AGPRRegMask))
      CopyInRC = SubRC;

    if (ARC && !ARC->hasSubClassEq(CopyInRC))
      return false;
    ARC = CopyInRC;
  }

  if (!ARC)
    return false;

  bool IsAGPR32 = (ARC == &AMDGPU::AGPR_32RegClass);

  // Rewrite the PHI's incoming values to ARC.
  LLVM_DEBUG(dbgs() << "Folding AGPR copies into: " << PHI);
  for (unsigned K = 1; K < PHI.getNumExplicitOperands(); K += 2) {
    MachineOperand &MO = PHI.getOperand(K);
    Register Reg = MO.getReg();

    MachineBasicBlock::iterator InsertPt;
    MachineBasicBlock *InsertMBB = nullptr;

    // Look at the def of Reg, ignoring all copies.
    unsigned CopyOpc = AMDGPU::COPY;
    if (MachineInstr *Def = MRI->getVRegDef(Reg)) {

      // Look at pre-existing COPY instructions from ARC: Steal the operand. If
      // the copy was single-use, it will be removed by DCE later.
      if (Def->isCopy()) {
        Register AGPRSrc;
        unsigned AGPRSubReg = AMDGPU::NoSubRegister;
        if (isAGPRCopy(*TRI, *MRI, *Def, AGPRSrc, AGPRSubReg)) {
          MO.setReg(AGPRSrc);
          MO.setSubReg(AGPRSubReg);
          continue;
        }

        // If this is a multi-use SGPR -> VGPR copy, use V_ACCVGPR_WRITE on
        // GFX908 directly instead of a COPY. Otherwise, SIFoldOperands may try
        // to fold the sgpr -> vgpr -> agpr copy into a sgpr -> agpr copy which
        // is unlikely to be profitable.
        //
        // Note that V_ACCVGPR_WRITE is only used for AGPR_32.
        MachineOperand &CopyIn = Def->getOperand(1);
        if (IsAGPR32 && !ST->hasGFX90AInsts() && !MRI->hasOneNonDBGUse(Reg) &&
            TRI->isSGPRReg(*MRI, CopyIn.getReg()))
          CopyOpc = AMDGPU::V_ACCVGPR_WRITE_B32_e64;
      }

      InsertMBB = Def->getParent();
      InsertPt = InsertMBB->SkipPHIsLabelsAndDebug(++Def->getIterator());
    } else {
      InsertMBB = PHI.getOperand(MO.getOperandNo() + 1).getMBB();
      InsertPt = InsertMBB->getFirstTerminator();
    }

    Register NewReg = MRI->createVirtualRegister(ARC);
    MachineInstr *MI = BuildMI(*InsertMBB, InsertPt, PHI.getDebugLoc(),
                               TII->get(CopyOpc), NewReg)
                           .addReg(Reg);
    MO.setReg(NewReg);

    (void)MI;
    LLVM_DEBUG(dbgs() << "  Created COPY: " << *MI);
  }

  // Replace the PHI's result with a new register.
  Register NewReg = MRI->createVirtualRegister(ARC);
  PHI.getOperand(0).setReg(NewReg);

  // COPY that new register back to the original PhiOut register. This COPY
  // will usually be folded out later.
  MachineBasicBlock *MBB = PHI.getParent();
  BuildMI(*MBB, MBB->getFirstNonPHI(), PHI.getDebugLoc(),
          TII->get(AMDGPU::COPY), PhiOut)
      .addReg(NewReg);

  LLVM_DEBUG(dbgs() << "  Done: Folded " << PHI);
  return true;
}

// Attempt to convert VGPR load to an AGPR load.
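//
// A sketch of the rewrite, assuming the load opcode accepts an AGPR vdst;
// the def (and any intermediate copy/reg_sequence results) are simply
// re-classed, leaving a trivial AGPR-to-AGPR copy behind:
//   %0:vgpr_32 = GLOBAL_LOAD_DWORD %ptr, ...
//   %1:agpr_32 = COPY %0
// =>
//   %0:agpr_32 = GLOBAL_LOAD_DWORD %ptr, ...
//   %1:agpr_32 = COPY %0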
bool SIFoldOperands::tryFoldLoad(MachineInstr &MI) {
  assert(MI.mayLoad());
  if (!ST->hasGFX90AInsts() || MI.getNumExplicitDefs() != 1)
    return false;

  MachineOperand &Def = MI.getOperand(0);
  if (!Def.isDef())
    return false;

  Register DefReg = Def.getReg();

  if (DefReg.isPhysical() || !TRI->isVGPR(*MRI, DefReg))
    return false;

  SmallVector<const MachineInstr*, 8> Users;
  SmallVector<Register, 8> MoveRegs;
  for (const MachineInstr &I : MRI->use_nodbg_instructions(DefReg))
    Users.push_back(&I);

  if (Users.empty())
    return false;

  // Check that all users are a copy to an AGPR or a reg_sequence producing an
  // AGPR.
  while (!Users.empty()) {
    const MachineInstr *I = Users.pop_back_val();
    if (!I->isCopy() && !I->isRegSequence())
      return false;
    Register DstReg = I->getOperand(0).getReg();
    // Physical registers may have more than one defining instruction.
    if (DstReg.isPhysical())
      return false;
    if (TRI->isAGPR(*MRI, DstReg))
      continue;
    MoveRegs.push_back(DstReg);
    for (const MachineInstr &U : MRI->use_nodbg_instructions(DstReg))
      Users.push_back(&U);
  }

  const TargetRegisterClass *RC = MRI->getRegClass(DefReg);
  MRI->setRegClass(DefReg, TRI->getEquivalentAGPRClass(RC));
  if (!TII->isOperandLegal(MI, 0, &Def)) {
    MRI->setRegClass(DefReg, RC);
    return false;
  }

  while (!MoveRegs.empty()) {
    Register Reg = MoveRegs.pop_back_val();
    MRI->setRegClass(Reg, TRI->getEquivalentAGPRClass(MRI->getRegClass(Reg)));
  }

  LLVM_DEBUG(dbgs() << "Folded " << MI);

  return true;
}

// tryFoldPhiAGPR will aggressively try to create AGPR PHIs.
// For GFX90A and later, this is pretty much always a good thing, but for
// GFX908 there are cases where it can create a lot more AGPR-AGPR copies,
// which are expensive on this architecture due to the lack of V_ACCVGPR_MOV.
//
// This function looks at all AGPR PHIs in a basic block and collects their
// operands. Then, it checks for registers that are used more than once across
// all PHIs and caches them in a VGPR. This prevents ExpandPostRAPseudo from
// having to create one VGPR temporary per use, which can get very messy if
// these PHIs come from a broken-up large PHI (e.g. 32 AGPR phis, one per
// vector element).
//
// Example
//  a:
//    %in:agpr_256 = COPY %foo:vgpr_256
//  c:
//    %x:agpr_32 = ..
//  b:
//    %0:areg = PHI %in.sub0:agpr_32, %a, %x, %c
//    %1:areg = PHI %in.sub0:agpr_32, %a, %y, %c
//    %2:areg = PHI %in.sub0:agpr_32, %a, %z, %c
// =>
//  a:
//    %in:agpr_256 = COPY %foo:vgpr_256
//    %tmp:vgpr_32 = V_ACCVGPR_READ_B32_e64 %in.sub0:agpr_32
//    %tmp_agpr:agpr_32 = COPY %tmp
//  c:
//    %x:agpr_32 = ..
//  b:
//    %0:areg = PHI %tmp_agpr, %a, %x, %c
//    %1:areg = PHI %tmp_agpr, %a, %y, %c
//    %2:areg = PHI %tmp_agpr, %a, %z, %c
bool SIFoldOperands::tryOptimizeAGPRPhis(MachineBasicBlock &MBB) {
  // This is only really needed on GFX908 where AGPR-AGPR copies are
  // unreasonably difficult.
  if (ST->hasGFX90AInsts())
    return false;

  // Look at all AGPR Phis and collect the register + subregister used.
  DenseMap<std::pair<Register, unsigned>, std::vector<MachineOperand *>>
      RegToMO;

  for (auto &MI : MBB) {
    if (!MI.isPHI())
      break;

    if (!TRI->isAGPR(*MRI, MI.getOperand(0).getReg()))
      continue;

    for (unsigned K = 1; K < MI.getNumOperands(); K += 2) {
      MachineOperand &PhiMO = MI.getOperand(K);
      RegToMO[{PhiMO.getReg(), PhiMO.getSubReg()}].push_back(&PhiMO);
    }
  }

  // For all (Reg, SubReg) pairs that are used more than once, cache the value
  // in a VGPR.
  bool Changed = false;
  for (const auto &[Entry, MOs] : RegToMO) {
    if (MOs.size() == 1)
      continue;

    const auto [Reg, SubReg] = Entry;
    MachineInstr *Def = MRI->getVRegDef(Reg);
    MachineBasicBlock *DefMBB = Def->getParent();

    // Create a copy in a VGPR using V_ACCVGPR_READ_B32_e64 so it's not folded
    // out.
    const TargetRegisterClass *ARC = getRegOpRC(*MRI, *TRI, *MOs.front());
    Register TempVGPR =
        MRI->createVirtualRegister(TRI->getEquivalentVGPRClass(ARC));
    MachineInstr *VGPRCopy =
        BuildMI(*DefMBB, ++Def->getIterator(), Def->getDebugLoc(),
                TII->get(AMDGPU::V_ACCVGPR_READ_B32_e64), TempVGPR)
            .addReg(Reg, /* flags */ 0, SubReg);

    // Copy back to an AGPR and use that instead of the AGPR subreg in all MOs.
    Register TempAGPR = MRI->createVirtualRegister(ARC);
    BuildMI(*DefMBB, ++VGPRCopy->getIterator(), Def->getDebugLoc(),
            TII->get(AMDGPU::COPY), TempAGPR)
        .addReg(TempVGPR);

    LLVM_DEBUG(dbgs() << "Caching AGPR into VGPR: " << *VGPRCopy);
    for (MachineOperand *MO : MOs) {
      MO->setReg(TempAGPR);
      MO->setSubReg(AMDGPU::NoSubRegister);
      LLVM_DEBUG(dbgs() << "  Changed PHI Operand: " << *MO << "\n");
    }

    Changed = true;
  }

  return Changed;
}

bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;

  MRI = &MF.getRegInfo();
  ST = &MF.getSubtarget<GCNSubtarget>();
  TII = ST->getInstrInfo();
  TRI = &TII->getRegisterInfo();
  MFI = MF.getInfo<SIMachineFunctionInfo>();

  // omod is ignored by hardware if IEEE bit is enabled. omod also does not
  // correctly handle signed zeros.
  //
  // FIXME: Also need to check strictfp
  bool IsIEEEMode = MFI->getMode().IEEE;
  bool HasNSZ = MFI->hasNoSignedZerosFPMath();

  bool Changed = false;
  for (MachineBasicBlock *MBB : depth_first(&MF)) {
    MachineOperand *CurrentKnownM0Val = nullptr;
    for (auto &MI : make_early_inc_range(*MBB)) {
      Changed |= tryFoldCndMask(MI);

      if (tryFoldZeroHighBits(MI)) {
        Changed = true;
        continue;
      }

      if (MI.isRegSequence() && tryFoldRegSequence(MI)) {
        Changed = true;
        continue;
      }

      if (MI.isPHI() && tryFoldPhiAGPR(MI)) {
        Changed = true;
        continue;
      }

      if (MI.mayLoad() && tryFoldLoad(MI)) {
        Changed = true;
        continue;
      }

      if (TII->isFoldableCopy(MI)) {
        Changed |= tryFoldFoldableCopy(MI, CurrentKnownM0Val);
        continue;
      }

      // Saw an unknown clobber of m0, so we no longer know what it is.
      if (CurrentKnownM0Val && MI.modifiesRegister(AMDGPU::M0, TRI))
        CurrentKnownM0Val = nullptr;

      // TODO: Omod might be OK if there is NSZ only on the source
      // instruction, and not the omod multiply.
      if (IsIEEEMode || (!HasNSZ && !MI.getFlag(MachineInstr::FmNsz)) ||
          !tryFoldOMod(MI))
        Changed |= tryFoldClamp(MI);
    }

    Changed |= tryOptimizeAGPRPhis(*MBB);
  }

  return Changed;
}