//=== lib/CodeGen/GlobalISel/AMDGPUCombinerHelper.cpp ---------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "AMDGPUCombinerHelper.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/Target/TargetMachine.h"

using namespace llvm;
using namespace MIPatternMatch;

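/// Return true if an fneg of the result of \p MI can be folded into \p MI
/// itself, either by negating its source operands or by switching to the
/// opposite min/max opcode.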
LLVM_READNONE
static bool fnegFoldsIntoMI(const MachineInstr &MI) {
  switch (MI.getOpcode()) {
  case AMDGPU::G_FADD:
  case AMDGPU::G_FSUB:
  case AMDGPU::G_FMUL:
  case AMDGPU::G_FMA:
  case AMDGPU::G_FMAD:
  case AMDGPU::G_FMINNUM:
  case AMDGPU::G_FMAXNUM:
  case AMDGPU::G_FMINNUM_IEEE:
  case AMDGPU::G_FMAXNUM_IEEE:
  case AMDGPU::G_FSIN:
  case AMDGPU::G_FPEXT:
  case AMDGPU::G_INTRINSIC_TRUNC:
  case AMDGPU::G_FPTRUNC:
  case AMDGPU::G_FRINT:
  case AMDGPU::G_FNEARBYINT:
  case AMDGPU::G_INTRINSIC_ROUND:
  case AMDGPU::G_INTRINSIC_ROUNDEVEN:
  case AMDGPU::G_FCANONICALIZE:
  case AMDGPU::G_AMDGPU_RCP_IFLAG:
  case AMDGPU::G_AMDGPU_FMIN_LEGACY:
  case AMDGPU::G_AMDGPU_FMAX_LEGACY:
    return true;
  case AMDGPU::G_INTRINSIC: {
    unsigned IntrinsicID = MI.getIntrinsicID();
    switch (IntrinsicID) {
    case Intrinsic::amdgcn_rcp:
    case Intrinsic::amdgcn_rcp_legacy:
    case Intrinsic::amdgcn_sin:
    case Intrinsic::amdgcn_fmul_legacy:
    case Intrinsic::amdgcn_fmed3:
    case Intrinsic::amdgcn_fma_legacy:
      return true;
    default:
      return false;
    }
  }
  default:
    return false;
  }
}

/// \returns true if the operation will definitely need to use a 64-bit
/// encoding, and thus will use a VOP3 encoding regardless of the source
/// modifiers.
LLVM_READONLY
static bool opMustUseVOP3Encoding(const MachineInstr &MI,
                                  const MachineRegisterInfo &MRI) {
  return MI.getNumOperands() >
             (MI.getOpcode() == AMDGPU::G_INTRINSIC ? 4u : 3u) ||
         MRI.getType(MI.getOperand(0).getReg()).getScalarSizeInBits() == 64;
}

// Most FP instructions support source modifiers.
LLVM_READONLY
static bool hasSourceMods(const MachineInstr &MI) {
  if (!MI.memoperands().empty())
    return false;

  switch (MI.getOpcode()) {
  case AMDGPU::COPY:
  case AMDGPU::G_SELECT:
  case AMDGPU::G_FDIV:
  case AMDGPU::G_FREM:
  case TargetOpcode::INLINEASM:
  case TargetOpcode::INLINEASM_BR:
  case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS:
  case AMDGPU::G_BITCAST:
  case AMDGPU::G_ANYEXT:
  case AMDGPU::G_BUILD_VECTOR:
  case AMDGPU::G_BUILD_VECTOR_TRUNC:
  case AMDGPU::G_PHI:
    return false;
  case AMDGPU::G_INTRINSIC: {
    unsigned IntrinsicID = MI.getIntrinsicID();
    switch (IntrinsicID) {
    case Intrinsic::amdgcn_interp_p1:
    case Intrinsic::amdgcn_interp_p2:
    case Intrinsic::amdgcn_interp_mov:
    case Intrinsic::amdgcn_interp_p1_f16:
    case Intrinsic::amdgcn_interp_p2_f16:
    case Intrinsic::amdgcn_div_scale:
      return false;
    default:
      return true;
    }
  }
  default:
    return true;
  }
}

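/// Return true if every non-debug user of \p MI's result accepts source
/// modifiers, and at most \p CostThreshold of those users would have to grow
/// to a VOP3 encoding in order to take one.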
static bool allUsesHaveSourceMods(MachineInstr &MI, MachineRegisterInfo &MRI,
                                  unsigned CostThreshold = 4) {
  // Some users (such as 3-operand FMA/MAD) must use a VOP3 encoding, and thus
  // it is truly free to use a source modifier in all cases. If there are
  // multiple users and applying the modifier would force each of them into a
  // VOP3 encoding, there will be a code size increase. Try to avoid increasing
  // code size unless we know it will save on the instruction count.
  unsigned NumMayIncreaseSize = 0;
  Register Dst = MI.getOperand(0).getReg();
  for (const MachineInstr &Use : MRI.use_nodbg_instructions(Dst)) {
    if (!hasSourceMods(Use))
      return false;

    if (!opMustUseVOP3Encoding(Use, MRI)) {
      if (++NumMayIncreaseSize > CostThreshold)
        return false;
    }
  }
  return true;
}

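/// Return true if \p MI is allowed to ignore the sign of a floating-point
/// zero, either because of the nsz flag or the global NoSignedZerosFPMath
/// option.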
static bool mayIgnoreSignedZero(MachineInstr &MI) {
  const TargetOptions &Options = MI.getMF()->getTarget().Options;
  return Options.NoSignedZerosFPMath || MI.getFlag(MachineInstr::MIFlag::FmNsz);
}

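/// Return true if \p APF bitwise-matches 1.0 / (2.0 * pi) in half, single or
/// double precision.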
static bool isInv2Pi(const APFloat &APF) {
  static const APFloat KF16(APFloat::IEEEhalf(), APInt(16, 0x3118));
  static const APFloat KF32(APFloat::IEEEsingle(), APInt(32, 0x3e22f983));
  static const APFloat KF64(APFloat::IEEEdouble(),
                            APInt(64, 0x3fc45f306dc9c882));

  return APF.bitwiseIsEqual(KF16) || APF.bitwiseIsEqual(KF32) ||
         APF.bitwiseIsEqual(KF64);
}

// The negated forms of +0.0 and 1.0 / (2.0 * pi) do not have inline
// immediates, so there is an additional cost to negate them.
static bool isConstantCostlierToNegate(MachineInstr &MI, Register Reg,
                                       MachineRegisterInfo &MRI) {
  std::optional<FPValueAndVReg> FPValReg;
  if (mi_match(Reg, MRI, m_GFCstOrSplat(FPValReg))) {
    if (FPValReg->Value.isZero() && !FPValReg->Value.isNegative())
      return true;

    const GCNSubtarget &ST = MI.getMF()->getSubtarget<GCNSubtarget>();
    if (ST.hasInv2PiInlineImm() && isInv2Pi(FPValReg->Value))
      return true;
  }
  return false;
}

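/// Return the min/max opcode that produces the negated result when both of
/// its operands are negated, e.g. -max(x, y) == min(-x, -y).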
static unsigned inverseMinMax(unsigned Opc) {
  switch (Opc) {
  case AMDGPU::G_FMAXNUM:
    return AMDGPU::G_FMINNUM;
  case AMDGPU::G_FMINNUM:
    return AMDGPU::G_FMAXNUM;
  case AMDGPU::G_FMAXNUM_IEEE:
    return AMDGPU::G_FMINNUM_IEEE;
  case AMDGPU::G_FMINNUM_IEEE:
    return AMDGPU::G_FMAXNUM_IEEE;
  case AMDGPU::G_AMDGPU_FMAX_LEGACY:
    return AMDGPU::G_AMDGPU_FMIN_LEGACY;
  case AMDGPU::G_AMDGPU_FMIN_LEGACY:
    return AMDGPU::G_AMDGPU_FMAX_LEGACY;
  default:
    llvm_unreachable("invalid min/max opcode");
  }
}

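/// Match an fneg \p MI whose source instruction can absorb the negation;
/// \p MatchInfo is set to that source instruction.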
bool AMDGPUCombinerHelper::matchFoldableFneg(MachineInstr &MI,
                                             MachineInstr *&MatchInfo) {
  Register Src = MI.getOperand(1).getReg();
  MatchInfo = MRI.getVRegDef(Src);

  // If the input has multiple uses and we can either fold the negate down, or
  // the other uses cannot, give up. This both prevents unprofitable
  // transformations and infinite loops: we won't repeatedly try to fold around
  // a negate that has no 'good' form.
  if (MRI.hasOneNonDBGUse(Src)) {
    if (allUsesHaveSourceMods(MI, MRI, 0))
      return false;
  } else {
    if (fnegFoldsIntoMI(*MatchInfo) &&
        (allUsesHaveSourceMods(MI, MRI) ||
         !allUsesHaveSourceMods(*MatchInfo, MRI)))
      return false;
  }

  switch (MatchInfo->getOpcode()) {
  case AMDGPU::G_FMINNUM:
  case AMDGPU::G_FMAXNUM:
  case AMDGPU::G_FMINNUM_IEEE:
  case AMDGPU::G_FMAXNUM_IEEE:
  case AMDGPU::G_AMDGPU_FMIN_LEGACY:
  case AMDGPU::G_AMDGPU_FMAX_LEGACY:
    // 0 doesn't have a negated inline immediate.
    return !isConstantCostlierToNegate(*MatchInfo,
                                       MatchInfo->getOperand(2).getReg(), MRI);
  case AMDGPU::G_FADD:
  case AMDGPU::G_FSUB:
  case AMDGPU::G_FMA:
  case AMDGPU::G_FMAD:
    return mayIgnoreSignedZero(*MatchInfo);
  case AMDGPU::G_FMUL:
  case AMDGPU::G_FPEXT:
  case AMDGPU::G_INTRINSIC_TRUNC:
  case AMDGPU::G_FPTRUNC:
  case AMDGPU::G_FRINT:
  case AMDGPU::G_FNEARBYINT:
  case AMDGPU::G_INTRINSIC_ROUND:
  case AMDGPU::G_INTRINSIC_ROUNDEVEN:
  case AMDGPU::G_FSIN:
  case AMDGPU::G_FCANONICALIZE:
  case AMDGPU::G_AMDGPU_RCP_IFLAG:
    return true;
  case AMDGPU::G_INTRINSIC: {
    unsigned IntrinsicID = MatchInfo->getIntrinsicID();
    switch (IntrinsicID) {
    case Intrinsic::amdgcn_rcp:
    case Intrinsic::amdgcn_rcp_legacy:
    case Intrinsic::amdgcn_sin:
    case Intrinsic::amdgcn_fmul_legacy:
    case Intrinsic::amdgcn_fmed3:
      return true;
    case Intrinsic::amdgcn_fma_legacy:
      return mayIgnoreSignedZero(*MatchInfo);
    default:
      return false;
    }
  }
  default:
    return false;
  }
}

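/// Rewrite \p MatchInfo so that it produces the negated value directly and
/// make the fneg \p MI's users use it; if \p MatchInfo had other users, an
/// fneg of the new value is created to recompute the original one.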
void AMDGPUCombinerHelper::applyFoldableFneg(MachineInstr &MI,
                                             MachineInstr *&MatchInfo) {
  // Transform:
  // %A = inst %Op1, ...
  // %B = fneg %A
  //
  // into:
  //
  // (if %A has one use, specifically fneg above)
  // %B = inst (maybe fneg %Op1), ...
  //
  // (if %A has multiple uses)
  // %B = inst (maybe fneg %Op1), ...
  // %A = fneg %B

  // Make the operand refer to the negated value: if it is already defined by
  // an fneg, use that fneg's source; otherwise insert a new fneg.
  auto NegateOperand = [&](MachineOperand &Op) {
    Register Reg = Op.getReg();
    if (!mi_match(Reg, MRI, m_GFNeg(m_Reg(Reg))))
      Reg = Builder.buildFNeg(MRI.getType(Reg), Reg).getReg(0);
    replaceRegOpWith(MRI, Op, Reg);
  };

  // Negate exactly one of the two operands (enough to negate a product): strip
  // an existing fneg from either one if present, otherwise insert a new fneg
  // on the second operand.
  auto NegateEitherOperand = [&](MachineOperand &X, MachineOperand &Y) {
    Register XReg = X.getReg();
    Register YReg = Y.getReg();
    if (mi_match(XReg, MRI, m_GFNeg(m_Reg(XReg))))
      replaceRegOpWith(MRI, X, XReg);
    else if (mi_match(YReg, MRI, m_GFNeg(m_Reg(YReg))))
      replaceRegOpWith(MRI, Y, YReg);
    else {
      YReg = Builder.buildFNeg(MRI.getType(YReg), YReg).getReg(0);
      replaceRegOpWith(MRI, Y, YReg);
    }
  };

  Builder.setInstrAndDebugLoc(*MatchInfo);

  // Negate appropriate operands so that the resulting value of MatchInfo is
  // negated.
  switch (MatchInfo->getOpcode()) {
  case AMDGPU::G_FADD:
  case AMDGPU::G_FSUB:
    NegateOperand(MatchInfo->getOperand(1));
    NegateOperand(MatchInfo->getOperand(2));
    break;
  case AMDGPU::G_FMUL:
    NegateEitherOperand(MatchInfo->getOperand(1), MatchInfo->getOperand(2));
    break;
  case AMDGPU::G_FMINNUM:
  case AMDGPU::G_FMAXNUM:
  case AMDGPU::G_FMINNUM_IEEE:
  case AMDGPU::G_FMAXNUM_IEEE:
  case AMDGPU::G_AMDGPU_FMIN_LEGACY:
  case AMDGPU::G_AMDGPU_FMAX_LEGACY: {
    NegateOperand(MatchInfo->getOperand(1));
    NegateOperand(MatchInfo->getOperand(2));
    unsigned Opposite = inverseMinMax(MatchInfo->getOpcode());
    replaceOpcodeWith(*MatchInfo, Opposite);
    break;
  }
  case AMDGPU::G_FMA:
  case AMDGPU::G_FMAD:
    NegateEitherOperand(MatchInfo->getOperand(1), MatchInfo->getOperand(2));
    NegateOperand(MatchInfo->getOperand(3));
    break;
  case AMDGPU::G_FPEXT:
  case AMDGPU::G_INTRINSIC_TRUNC:
  case AMDGPU::G_FRINT:
  case AMDGPU::G_FNEARBYINT:
  case AMDGPU::G_INTRINSIC_ROUND:
  case AMDGPU::G_INTRINSIC_ROUNDEVEN:
  case AMDGPU::G_FSIN:
  case AMDGPU::G_FCANONICALIZE:
  case AMDGPU::G_AMDGPU_RCP_IFLAG:
  case AMDGPU::G_FPTRUNC:
    NegateOperand(MatchInfo->getOperand(1));
    break;
  case AMDGPU::G_INTRINSIC: {
    unsigned IntrinsicID = MatchInfo->getIntrinsicID();
    switch (IntrinsicID) {
    case Intrinsic::amdgcn_rcp:
    case Intrinsic::amdgcn_rcp_legacy:
    case Intrinsic::amdgcn_sin:
      NegateOperand(MatchInfo->getOperand(2));
      break;
    case Intrinsic::amdgcn_fmul_legacy:
      NegateEitherOperand(MatchInfo->getOperand(2), MatchInfo->getOperand(3));
      break;
    case Intrinsic::amdgcn_fmed3:
      NegateOperand(MatchInfo->getOperand(2));
      NegateOperand(MatchInfo->getOperand(3));
      NegateOperand(MatchInfo->getOperand(4));
      break;
    case Intrinsic::amdgcn_fma_legacy:
      NegateEitherOperand(MatchInfo->getOperand(2), MatchInfo->getOperand(3));
      NegateOperand(MatchInfo->getOperand(4));
      break;
    default:
      llvm_unreachable("folding fneg not supported for this intrinsic");
    }
    break;
  }
  default:
    llvm_unreachable("folding fneg not supported for this instruction");
  }

  Register Dst = MI.getOperand(0).getReg();
  Register MatchInfoDst = MatchInfo->getOperand(0).getReg();

  if (MRI.hasOneNonDBGUse(MatchInfoDst)) {
    // MatchInfo now has negated value so use that instead of old Dst.
    replaceRegWith(MRI, Dst, MatchInfoDst);
  } else {
    // We want to swap all uses of Dst with uses of MatchInfoDst and vice versa
    // but replaceRegWith will replace defs as well. It is easier to replace one
    // def with a new register.
    LLT Type = MRI.getType(Dst);
    Register NegatedMatchInfo = MRI.createGenericVirtualRegister(Type);
    replaceRegOpWith(MRI, MatchInfo->getOperand(0), NegatedMatchInfo);

    // MatchInfo now has negated value so use that instead of old Dst.
    replaceRegWith(MRI, Dst, NegatedMatchInfo);

    // Recreate the non-negated value for the other uses of the old
    // MatchInfoDst.
    auto NextInst = ++MatchInfo->getIterator();
    Builder.setInstrAndDebugLoc(*NextInst);
    Builder.buildFNeg(MatchInfoDst, NegatedMatchInfo, MI.getFlags());
  }

  MI.eraseFromParent();
}