//===- AMDGPUInstCombineIntrinsic.cpp - AMDGPU specific InstCombine pass --===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
// \file
// This file implements the AMDGPU-specific InstCombine hooks of the
// TargetTransformInfo implementation. It uses the target's detailed
// information to fold and canonicalize AMDGPU intrinsic calls, while letting
// the target independent and default TTI implementations handle the rest.
14 //
15 //===----------------------------------------------------------------------===//
16 
17 #include "AMDGPUInstrInfo.h"
18 #include "AMDGPUTargetTransformInfo.h"
19 #include "GCNSubtarget.h"
20 #include "R600Subtarget.h"
21 #include "llvm/IR/IntrinsicsAMDGPU.h"
22 #include "llvm/Transforms/InstCombine/InstCombiner.h"
23 
24 using namespace llvm;
25 
26 #define DEBUG_TYPE "AMDGPUtti"
27 
namespace {

// Entry type for the image dmask intrinsic table that is pulled in from
// InstCombineTables.inc below.
// NOTE(review): Intr presumably holds an Intrinsic::ID value -- confirm
// against the generated table.
struct AMDGPUImageDMaskIntrinsic {
  unsigned Intr;
};

// Select the AMDGPUImageDMaskIntrinsicTable implementation section of the
// included file (presumably TableGen-generated -- TODO confirm).
#define GET_AMDGPUImageDMaskIntrinsicTable_IMPL
#include "InstCombineTables.inc"

} // end anonymous namespace
38 
39 // Constant fold llvm.amdgcn.fmed3 intrinsics for standard inputs.
40 //
41 // A single NaN input is folded to minnum, so we rely on that folding for
42 // handling NaNs.
43 static APFloat fmed3AMDGCN(const APFloat &Src0, const APFloat &Src1,
44                            const APFloat &Src2) {
45   APFloat Max3 = maxnum(maxnum(Src0, Src1), Src2);
46 
47   APFloat::cmpResult Cmp0 = Max3.compare(Src0);
48   assert(Cmp0 != APFloat::cmpUnordered && "nans handled separately");
49   if (Cmp0 == APFloat::cmpEqual)
50     return maxnum(Src1, Src2);
51 
52   APFloat::cmpResult Cmp1 = Max3.compare(Src1);
53   assert(Cmp1 != APFloat::cmpUnordered && "nans handled separately");
54   if (Cmp1 == APFloat::cmpEqual)
55     return maxnum(Src0, Src2);
56 
57   return maxnum(Src0, Src1);
58 }
59 
60 // Check if a value can be converted to a 16-bit value without losing
61 // precision.
62 static bool canSafelyConvertTo16Bit(Value &V) {
63   Type *VTy = V.getType();
64   if (VTy->isHalfTy() || VTy->isIntegerTy(16)) {
65     // The value is already 16-bit, so we don't want to convert to 16-bit again!
66     return false;
67   }
68   if (ConstantFP *ConstFloat = dyn_cast<ConstantFP>(&V)) {
69     // We need to check that if we cast the index down to a half, we do not lose
70     // precision.
71     APFloat FloatValue(ConstFloat->getValueAPF());
72     bool LosesInfo = true;
73     FloatValue.convert(APFloat::IEEEhalf(), APFloat::rmTowardZero, &LosesInfo);
74     return !LosesInfo;
75   }
76   Value *CastSrc;
77   if (match(&V, m_FPExt(PatternMatch::m_Value(CastSrc))) ||
78       match(&V, m_SExt(PatternMatch::m_Value(CastSrc))) ||
79       match(&V, m_ZExt(PatternMatch::m_Value(CastSrc)))) {
80     Type *CastSrcTy = CastSrc->getType();
81     if (CastSrcTy->isHalfTy() || CastSrcTy->isIntegerTy(16))
82       return true;
83   }
84 
85   return false;
86 }
87 
88 // Convert a value to 16-bit.
89 static Value *convertTo16Bit(Value &V, InstCombiner::BuilderTy &Builder) {
90   Type *VTy = V.getType();
91   if (isa<FPExtInst>(&V) || isa<SExtInst>(&V) || isa<ZExtInst>(&V))
92     return cast<Instruction>(&V)->getOperand(0);
93   if (VTy->isIntegerTy())
94     return Builder.CreateIntCast(&V, Type::getInt16Ty(V.getContext()), false);
95   if (VTy->isFloatingPointTy())
96     return Builder.CreateFPCast(&V, Type::getHalfTy(V.getContext()));
97 
98   llvm_unreachable("Should never be called!");
99 }
100 
101 static Optional<Instruction *>
102 simplifyAMDGCNImageIntrinsic(const GCNSubtarget *ST,
103                              const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr,
104                              IntrinsicInst &II, InstCombiner &IC) {
105   if (!ST->hasA16() && !ST->hasG16())
106     return None;
107 
108   bool FloatCoord = false;
109   // true means derivatives can be converted to 16 bit, coordinates not
110   bool OnlyDerivatives = false;
111 
112   for (unsigned OperandIndex = ImageDimIntr->GradientStart;
113        OperandIndex < ImageDimIntr->VAddrEnd; OperandIndex++) {
114     Value *Coord = II.getOperand(OperandIndex);
115     // If the values are not derived from 16-bit values, we cannot optimize.
116     if (!canSafelyConvertTo16Bit(*Coord)) {
117       if (OperandIndex < ImageDimIntr->CoordStart ||
118           ImageDimIntr->GradientStart == ImageDimIntr->CoordStart) {
119         return None;
120       }
121       // All gradients can be converted, so convert only them
122       OnlyDerivatives = true;
123       break;
124     }
125 
126     assert(OperandIndex == ImageDimIntr->GradientStart ||
127            FloatCoord == Coord->getType()->isFloatingPointTy());
128     FloatCoord = Coord->getType()->isFloatingPointTy();
129   }
130 
131   if (OnlyDerivatives) {
132     if (!ST->hasG16())
133       return None;
134   } else {
135     if (!ST->hasA16())
136       OnlyDerivatives = true; // Only supports G16
137   }
138 
139   Type *CoordType = FloatCoord ? Type::getHalfTy(II.getContext())
140                                : Type::getInt16Ty(II.getContext());
141 
142   SmallVector<Type *, 4> ArgTys;
143   if (!Intrinsic::getIntrinsicSignature(II.getCalledFunction(), ArgTys))
144     return None;
145 
146   ArgTys[ImageDimIntr->GradientTyArg] = CoordType;
147   if (!OnlyDerivatives)
148     ArgTys[ImageDimIntr->CoordTyArg] = CoordType;
149   Function *I =
150       Intrinsic::getDeclaration(II.getModule(), II.getIntrinsicID(), ArgTys);
151 
152   SmallVector<Value *, 8> Args(II.arg_operands());
153 
154   unsigned EndIndex =
155       OnlyDerivatives ? ImageDimIntr->CoordStart : ImageDimIntr->VAddrEnd;
156   for (unsigned OperandIndex = ImageDimIntr->GradientStart;
157        OperandIndex < EndIndex; OperandIndex++) {
158     Args[OperandIndex] =
159         convertTo16Bit(*II.getOperand(OperandIndex), IC.Builder);
160   }
161 
162   CallInst *NewCall = IC.Builder.CreateCall(I, Args);
163   NewCall->takeName(&II);
164   NewCall->copyMetadata(II);
165   if (isa<FPMathOperator>(NewCall))
166     NewCall->copyFastMathFlags(&II);
167   return IC.replaceInstUsesWith(II, NewCall);
168 }
169 
170 bool GCNTTIImpl::canSimplifyLegacyMulToMul(const Value *Op0, const Value *Op1,
171                                            InstCombiner &IC) const {
172   // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
173   // infinity, gives +0.0. If we can prove we don't have one of the special
174   // cases then we can use a normal multiply instead.
175   // TODO: Create and use isKnownFiniteNonZero instead of just matching
176   // constants here.
177   if (match(Op0, PatternMatch::m_FiniteNonZero()) ||
178       match(Op1, PatternMatch::m_FiniteNonZero())) {
179     // One operand is not zero or infinity or NaN.
180     return true;
181   }
182   auto *TLI = &IC.getTargetLibraryInfo();
183   if (isKnownNeverInfinity(Op0, TLI) && isKnownNeverNaN(Op0, TLI) &&
184       isKnownNeverInfinity(Op1, TLI) && isKnownNeverNaN(Op1, TLI)) {
185     // Neither operand is infinity or NaN.
186     return true;
187   }
188   return false;
189 }
190 
191 Optional<Instruction *>
192 GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
193   Intrinsic::ID IID = II.getIntrinsicID();
194   switch (IID) {
195   case Intrinsic::amdgcn_rcp: {
196     Value *Src = II.getArgOperand(0);
197 
198     // TODO: Move to ConstantFolding/InstSimplify?
199     if (isa<UndefValue>(Src)) {
200       Type *Ty = II.getType();
201       auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
202       return IC.replaceInstUsesWith(II, QNaN);
203     }
204 
205     if (II.isStrictFP())
206       break;
207 
208     if (const ConstantFP *C = dyn_cast<ConstantFP>(Src)) {
209       const APFloat &ArgVal = C->getValueAPF();
210       APFloat Val(ArgVal.getSemantics(), 1);
211       Val.divide(ArgVal, APFloat::rmNearestTiesToEven);
212 
213       // This is more precise than the instruction may give.
214       //
215       // TODO: The instruction always flushes denormal results (except for f16),
216       // should this also?
217       return IC.replaceInstUsesWith(II, ConstantFP::get(II.getContext(), Val));
218     }
219 
220     break;
221   }
222   case Intrinsic::amdgcn_rsq: {
223     Value *Src = II.getArgOperand(0);
224 
225     // TODO: Move to ConstantFolding/InstSimplify?
226     if (isa<UndefValue>(Src)) {
227       Type *Ty = II.getType();
228       auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
229       return IC.replaceInstUsesWith(II, QNaN);
230     }
231 
232     break;
233   }
234   case Intrinsic::amdgcn_frexp_mant:
235   case Intrinsic::amdgcn_frexp_exp: {
236     Value *Src = II.getArgOperand(0);
237     if (const ConstantFP *C = dyn_cast<ConstantFP>(Src)) {
238       int Exp;
239       APFloat Significand =
240           frexp(C->getValueAPF(), Exp, APFloat::rmNearestTiesToEven);
241 
242       if (IID == Intrinsic::amdgcn_frexp_mant) {
243         return IC.replaceInstUsesWith(
244             II, ConstantFP::get(II.getContext(), Significand));
245       }
246 
247       // Match instruction special case behavior.
248       if (Exp == APFloat::IEK_NaN || Exp == APFloat::IEK_Inf)
249         Exp = 0;
250 
251       return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), Exp));
252     }
253 
254     if (isa<UndefValue>(Src)) {
255       return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
256     }
257 
258     break;
259   }
260   case Intrinsic::amdgcn_class: {
261     enum {
262       S_NAN = 1 << 0,       // Signaling NaN
263       Q_NAN = 1 << 1,       // Quiet NaN
264       N_INFINITY = 1 << 2,  // Negative infinity
265       N_NORMAL = 1 << 3,    // Negative normal
266       N_SUBNORMAL = 1 << 4, // Negative subnormal
267       N_ZERO = 1 << 5,      // Negative zero
268       P_ZERO = 1 << 6,      // Positive zero
269       P_SUBNORMAL = 1 << 7, // Positive subnormal
270       P_NORMAL = 1 << 8,    // Positive normal
271       P_INFINITY = 1 << 9   // Positive infinity
272     };
273 
274     const uint32_t FullMask = S_NAN | Q_NAN | N_INFINITY | N_NORMAL |
275                               N_SUBNORMAL | N_ZERO | P_ZERO | P_SUBNORMAL |
276                               P_NORMAL | P_INFINITY;
277 
278     Value *Src0 = II.getArgOperand(0);
279     Value *Src1 = II.getArgOperand(1);
280     const ConstantInt *CMask = dyn_cast<ConstantInt>(Src1);
281     if (!CMask) {
282       if (isa<UndefValue>(Src0)) {
283         return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
284       }
285 
286       if (isa<UndefValue>(Src1)) {
287         return IC.replaceInstUsesWith(II,
288                                       ConstantInt::get(II.getType(), false));
289       }
290       break;
291     }
292 
293     uint32_t Mask = CMask->getZExtValue();
294 
295     // If all tests are made, it doesn't matter what the value is.
296     if ((Mask & FullMask) == FullMask) {
297       return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), true));
298     }
299 
300     if ((Mask & FullMask) == 0) {
301       return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), false));
302     }
303 
304     if (Mask == (S_NAN | Q_NAN)) {
305       // Equivalent of isnan. Replace with standard fcmp.
306       Value *FCmp = IC.Builder.CreateFCmpUNO(Src0, Src0);
307       FCmp->takeName(&II);
308       return IC.replaceInstUsesWith(II, FCmp);
309     }
310 
311     if (Mask == (N_ZERO | P_ZERO)) {
312       // Equivalent of == 0.
313       Value *FCmp =
314           IC.Builder.CreateFCmpOEQ(Src0, ConstantFP::get(Src0->getType(), 0.0));
315 
316       FCmp->takeName(&II);
317       return IC.replaceInstUsesWith(II, FCmp);
318     }
319 
320     // fp_class (nnan x), qnan|snan|other -> fp_class (nnan x), other
321     if (((Mask & S_NAN) || (Mask & Q_NAN)) &&
322         isKnownNeverNaN(Src0, &IC.getTargetLibraryInfo())) {
323       return IC.replaceOperand(
324           II, 1, ConstantInt::get(Src1->getType(), Mask & ~(S_NAN | Q_NAN)));
325     }
326 
327     const ConstantFP *CVal = dyn_cast<ConstantFP>(Src0);
328     if (!CVal) {
329       if (isa<UndefValue>(Src0)) {
330         return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
331       }
332 
333       // Clamp mask to used bits
334       if ((Mask & FullMask) != Mask) {
335         CallInst *NewCall = IC.Builder.CreateCall(
336             II.getCalledFunction(),
337             {Src0, ConstantInt::get(Src1->getType(), Mask & FullMask)});
338 
339         NewCall->takeName(&II);
340         return IC.replaceInstUsesWith(II, NewCall);
341       }
342 
343       break;
344     }
345 
346     const APFloat &Val = CVal->getValueAPF();
347 
348     bool Result =
349         ((Mask & S_NAN) && Val.isNaN() && Val.isSignaling()) ||
350         ((Mask & Q_NAN) && Val.isNaN() && !Val.isSignaling()) ||
351         ((Mask & N_INFINITY) && Val.isInfinity() && Val.isNegative()) ||
352         ((Mask & N_NORMAL) && Val.isNormal() && Val.isNegative()) ||
353         ((Mask & N_SUBNORMAL) && Val.isDenormal() && Val.isNegative()) ||
354         ((Mask & N_ZERO) && Val.isZero() && Val.isNegative()) ||
355         ((Mask & P_ZERO) && Val.isZero() && !Val.isNegative()) ||
356         ((Mask & P_SUBNORMAL) && Val.isDenormal() && !Val.isNegative()) ||
357         ((Mask & P_NORMAL) && Val.isNormal() && !Val.isNegative()) ||
358         ((Mask & P_INFINITY) && Val.isInfinity() && !Val.isNegative());
359 
360     return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), Result));
361   }
362   case Intrinsic::amdgcn_cvt_pkrtz: {
363     Value *Src0 = II.getArgOperand(0);
364     Value *Src1 = II.getArgOperand(1);
365     if (const ConstantFP *C0 = dyn_cast<ConstantFP>(Src0)) {
366       if (const ConstantFP *C1 = dyn_cast<ConstantFP>(Src1)) {
367         const fltSemantics &HalfSem =
368             II.getType()->getScalarType()->getFltSemantics();
369         bool LosesInfo;
370         APFloat Val0 = C0->getValueAPF();
371         APFloat Val1 = C1->getValueAPF();
372         Val0.convert(HalfSem, APFloat::rmTowardZero, &LosesInfo);
373         Val1.convert(HalfSem, APFloat::rmTowardZero, &LosesInfo);
374 
375         Constant *Folded =
376             ConstantVector::get({ConstantFP::get(II.getContext(), Val0),
377                                  ConstantFP::get(II.getContext(), Val1)});
378         return IC.replaceInstUsesWith(II, Folded);
379       }
380     }
381 
382     if (isa<UndefValue>(Src0) && isa<UndefValue>(Src1)) {
383       return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
384     }
385 
386     break;
387   }
388   case Intrinsic::amdgcn_cvt_pknorm_i16:
389   case Intrinsic::amdgcn_cvt_pknorm_u16:
390   case Intrinsic::amdgcn_cvt_pk_i16:
391   case Intrinsic::amdgcn_cvt_pk_u16: {
392     Value *Src0 = II.getArgOperand(0);
393     Value *Src1 = II.getArgOperand(1);
394 
395     if (isa<UndefValue>(Src0) && isa<UndefValue>(Src1)) {
396       return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
397     }
398 
399     break;
400   }
401   case Intrinsic::amdgcn_ubfe:
402   case Intrinsic::amdgcn_sbfe: {
403     // Decompose simple cases into standard shifts.
404     Value *Src = II.getArgOperand(0);
405     if (isa<UndefValue>(Src)) {
406       return IC.replaceInstUsesWith(II, Src);
407     }
408 
409     unsigned Width;
410     Type *Ty = II.getType();
411     unsigned IntSize = Ty->getIntegerBitWidth();
412 
413     ConstantInt *CWidth = dyn_cast<ConstantInt>(II.getArgOperand(2));
414     if (CWidth) {
415       Width = CWidth->getZExtValue();
416       if ((Width & (IntSize - 1)) == 0) {
417         return IC.replaceInstUsesWith(II, ConstantInt::getNullValue(Ty));
418       }
419 
420       // Hardware ignores high bits, so remove those.
421       if (Width >= IntSize) {
422         return IC.replaceOperand(
423             II, 2, ConstantInt::get(CWidth->getType(), Width & (IntSize - 1)));
424       }
425     }
426 
427     unsigned Offset;
428     ConstantInt *COffset = dyn_cast<ConstantInt>(II.getArgOperand(1));
429     if (COffset) {
430       Offset = COffset->getZExtValue();
431       if (Offset >= IntSize) {
432         return IC.replaceOperand(
433             II, 1,
434             ConstantInt::get(COffset->getType(), Offset & (IntSize - 1)));
435       }
436     }
437 
438     bool Signed = IID == Intrinsic::amdgcn_sbfe;
439 
440     if (!CWidth || !COffset)
441       break;
442 
    // The case of Width == 0 is handled above, which makes this transformation
444     // safe.  If Width == 0, then the ashr and lshr instructions become poison
445     // value since the shift amount would be equal to the bit size.
446     assert(Width != 0);
447 
448     // TODO: This allows folding to undef when the hardware has specific
449     // behavior?
450     if (Offset + Width < IntSize) {
451       Value *Shl = IC.Builder.CreateShl(Src, IntSize - Offset - Width);
452       Value *RightShift = Signed ? IC.Builder.CreateAShr(Shl, IntSize - Width)
453                                  : IC.Builder.CreateLShr(Shl, IntSize - Width);
454       RightShift->takeName(&II);
455       return IC.replaceInstUsesWith(II, RightShift);
456     }
457 
458     Value *RightShift = Signed ? IC.Builder.CreateAShr(Src, Offset)
459                                : IC.Builder.CreateLShr(Src, Offset);
460 
461     RightShift->takeName(&II);
462     return IC.replaceInstUsesWith(II, RightShift);
463   }
464   case Intrinsic::amdgcn_exp:
465   case Intrinsic::amdgcn_exp_compr: {
466     ConstantInt *En = cast<ConstantInt>(II.getArgOperand(1));
467     unsigned EnBits = En->getZExtValue();
468     if (EnBits == 0xf)
469       break; // All inputs enabled.
470 
471     bool IsCompr = IID == Intrinsic::amdgcn_exp_compr;
472     bool Changed = false;
473     for (int I = 0; I < (IsCompr ? 2 : 4); ++I) {
474       if ((!IsCompr && (EnBits & (1 << I)) == 0) ||
475           (IsCompr && ((EnBits & (0x3 << (2 * I))) == 0))) {
476         Value *Src = II.getArgOperand(I + 2);
477         if (!isa<UndefValue>(Src)) {
478           IC.replaceOperand(II, I + 2, UndefValue::get(Src->getType()));
479           Changed = true;
480         }
481       }
482     }
483 
484     if (Changed) {
485       return &II;
486     }
487 
488     break;
489   }
490   case Intrinsic::amdgcn_fmed3: {
491     // Note this does not preserve proper sNaN behavior if IEEE-mode is enabled
492     // for the shader.
493 
494     Value *Src0 = II.getArgOperand(0);
495     Value *Src1 = II.getArgOperand(1);
496     Value *Src2 = II.getArgOperand(2);
497 
498     // Checking for NaN before canonicalization provides better fidelity when
499     // mapping other operations onto fmed3 since the order of operands is
500     // unchanged.
501     CallInst *NewCall = nullptr;
502     if (match(Src0, PatternMatch::m_NaN()) || isa<UndefValue>(Src0)) {
503       NewCall = IC.Builder.CreateMinNum(Src1, Src2);
504     } else if (match(Src1, PatternMatch::m_NaN()) || isa<UndefValue>(Src1)) {
505       NewCall = IC.Builder.CreateMinNum(Src0, Src2);
506     } else if (match(Src2, PatternMatch::m_NaN()) || isa<UndefValue>(Src2)) {
507       NewCall = IC.Builder.CreateMaxNum(Src0, Src1);
508     }
509 
510     if (NewCall) {
511       NewCall->copyFastMathFlags(&II);
512       NewCall->takeName(&II);
513       return IC.replaceInstUsesWith(II, NewCall);
514     }
515 
516     bool Swap = false;
517     // Canonicalize constants to RHS operands.
518     //
519     // fmed3(c0, x, c1) -> fmed3(x, c0, c1)
520     if (isa<Constant>(Src0) && !isa<Constant>(Src1)) {
521       std::swap(Src0, Src1);
522       Swap = true;
523     }
524 
525     if (isa<Constant>(Src1) && !isa<Constant>(Src2)) {
526       std::swap(Src1, Src2);
527       Swap = true;
528     }
529 
530     if (isa<Constant>(Src0) && !isa<Constant>(Src1)) {
531       std::swap(Src0, Src1);
532       Swap = true;
533     }
534 
535     if (Swap) {
536       II.setArgOperand(0, Src0);
537       II.setArgOperand(1, Src1);
538       II.setArgOperand(2, Src2);
539       return &II;
540     }
541 
542     if (const ConstantFP *C0 = dyn_cast<ConstantFP>(Src0)) {
543       if (const ConstantFP *C1 = dyn_cast<ConstantFP>(Src1)) {
544         if (const ConstantFP *C2 = dyn_cast<ConstantFP>(Src2)) {
545           APFloat Result = fmed3AMDGCN(C0->getValueAPF(), C1->getValueAPF(),
546                                        C2->getValueAPF());
547           return IC.replaceInstUsesWith(
548               II, ConstantFP::get(IC.Builder.getContext(), Result));
549         }
550       }
551     }
552 
553     break;
554   }
555   case Intrinsic::amdgcn_icmp:
556   case Intrinsic::amdgcn_fcmp: {
557     const ConstantInt *CC = cast<ConstantInt>(II.getArgOperand(2));
558     // Guard against invalid arguments.
559     int64_t CCVal = CC->getZExtValue();
560     bool IsInteger = IID == Intrinsic::amdgcn_icmp;
561     if ((IsInteger && (CCVal < CmpInst::FIRST_ICMP_PREDICATE ||
562                        CCVal > CmpInst::LAST_ICMP_PREDICATE)) ||
563         (!IsInteger && (CCVal < CmpInst::FIRST_FCMP_PREDICATE ||
564                         CCVal > CmpInst::LAST_FCMP_PREDICATE)))
565       break;
566 
567     Value *Src0 = II.getArgOperand(0);
568     Value *Src1 = II.getArgOperand(1);
569 
570     if (auto *CSrc0 = dyn_cast<Constant>(Src0)) {
571       if (auto *CSrc1 = dyn_cast<Constant>(Src1)) {
572         Constant *CCmp = ConstantExpr::getCompare(CCVal, CSrc0, CSrc1);
573         if (CCmp->isNullValue()) {
574           return IC.replaceInstUsesWith(
575               II, ConstantExpr::getSExt(CCmp, II.getType()));
576         }
577 
578         // The result of V_ICMP/V_FCMP assembly instructions (which this
579         // intrinsic exposes) is one bit per thread, masked with the EXEC
580         // register (which contains the bitmask of live threads). So a
581         // comparison that always returns true is the same as a read of the
582         // EXEC register.
583         Function *NewF = Intrinsic::getDeclaration(
584             II.getModule(), Intrinsic::read_register, II.getType());
585         Metadata *MDArgs[] = {MDString::get(II.getContext(), "exec")};
586         MDNode *MD = MDNode::get(II.getContext(), MDArgs);
587         Value *Args[] = {MetadataAsValue::get(II.getContext(), MD)};
588         CallInst *NewCall = IC.Builder.CreateCall(NewF, Args);
589         NewCall->addAttribute(AttributeList::FunctionIndex,
590                               Attribute::Convergent);
591         NewCall->takeName(&II);
592         return IC.replaceInstUsesWith(II, NewCall);
593       }
594 
595       // Canonicalize constants to RHS.
596       CmpInst::Predicate SwapPred =
597           CmpInst::getSwappedPredicate(static_cast<CmpInst::Predicate>(CCVal));
598       II.setArgOperand(0, Src1);
599       II.setArgOperand(1, Src0);
600       II.setArgOperand(
601           2, ConstantInt::get(CC->getType(), static_cast<int>(SwapPred)));
602       return &II;
603     }
604 
605     if (CCVal != CmpInst::ICMP_EQ && CCVal != CmpInst::ICMP_NE)
606       break;
607 
608     // Canonicalize compare eq with true value to compare != 0
609     // llvm.amdgcn.icmp(zext (i1 x), 1, eq)
610     //   -> llvm.amdgcn.icmp(zext (i1 x), 0, ne)
611     // llvm.amdgcn.icmp(sext (i1 x), -1, eq)
612     //   -> llvm.amdgcn.icmp(sext (i1 x), 0, ne)
613     Value *ExtSrc;
614     if (CCVal == CmpInst::ICMP_EQ &&
615         ((match(Src1, PatternMatch::m_One()) &&
616           match(Src0, m_ZExt(PatternMatch::m_Value(ExtSrc)))) ||
617          (match(Src1, PatternMatch::m_AllOnes()) &&
618           match(Src0, m_SExt(PatternMatch::m_Value(ExtSrc))))) &&
619         ExtSrc->getType()->isIntegerTy(1)) {
620       IC.replaceOperand(II, 1, ConstantInt::getNullValue(Src1->getType()));
621       IC.replaceOperand(II, 2,
622                         ConstantInt::get(CC->getType(), CmpInst::ICMP_NE));
623       return &II;
624     }
625 
626     CmpInst::Predicate SrcPred;
627     Value *SrcLHS;
628     Value *SrcRHS;
629 
630     // Fold compare eq/ne with 0 from a compare result as the predicate to the
631     // intrinsic. The typical use is a wave vote function in the library, which
632     // will be fed from a user code condition compared with 0. Fold in the
633     // redundant compare.
634 
635     // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, ne)
636     //   -> llvm.amdgcn.[if]cmp(a, b, pred)
637     //
638     // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, eq)
639     //   -> llvm.amdgcn.[if]cmp(a, b, inv pred)
640     if (match(Src1, PatternMatch::m_Zero()) &&
641         match(Src0, PatternMatch::m_ZExtOrSExt(
642                         m_Cmp(SrcPred, PatternMatch::m_Value(SrcLHS),
643                               PatternMatch::m_Value(SrcRHS))))) {
644       if (CCVal == CmpInst::ICMP_EQ)
645         SrcPred = CmpInst::getInversePredicate(SrcPred);
646 
647       Intrinsic::ID NewIID = CmpInst::isFPPredicate(SrcPred)
648                                  ? Intrinsic::amdgcn_fcmp
649                                  : Intrinsic::amdgcn_icmp;
650 
651       Type *Ty = SrcLHS->getType();
652       if (auto *CmpType = dyn_cast<IntegerType>(Ty)) {
653         // Promote to next legal integer type.
654         unsigned Width = CmpType->getBitWidth();
655         unsigned NewWidth = Width;
656 
657         // Don't do anything for i1 comparisons.
658         if (Width == 1)
659           break;
660 
661         if (Width <= 16)
662           NewWidth = 16;
663         else if (Width <= 32)
664           NewWidth = 32;
665         else if (Width <= 64)
666           NewWidth = 64;
667         else if (Width > 64)
668           break; // Can't handle this.
669 
670         if (Width != NewWidth) {
671           IntegerType *CmpTy = IC.Builder.getIntNTy(NewWidth);
672           if (CmpInst::isSigned(SrcPred)) {
673             SrcLHS = IC.Builder.CreateSExt(SrcLHS, CmpTy);
674             SrcRHS = IC.Builder.CreateSExt(SrcRHS, CmpTy);
675           } else {
676             SrcLHS = IC.Builder.CreateZExt(SrcLHS, CmpTy);
677             SrcRHS = IC.Builder.CreateZExt(SrcRHS, CmpTy);
678           }
679         }
680       } else if (!Ty->isFloatTy() && !Ty->isDoubleTy() && !Ty->isHalfTy())
681         break;
682 
683       Function *NewF = Intrinsic::getDeclaration(
684           II.getModule(), NewIID, {II.getType(), SrcLHS->getType()});
685       Value *Args[] = {SrcLHS, SrcRHS,
686                        ConstantInt::get(CC->getType(), SrcPred)};
687       CallInst *NewCall = IC.Builder.CreateCall(NewF, Args);
688       NewCall->takeName(&II);
689       return IC.replaceInstUsesWith(II, NewCall);
690     }
691 
692     break;
693   }
694   case Intrinsic::amdgcn_ballot: {
695     if (auto *Src = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
696       if (Src->isZero()) {
697         // amdgcn.ballot(i1 0) is zero.
698         return IC.replaceInstUsesWith(II, Constant::getNullValue(II.getType()));
699       }
700 
701       if (Src->isOne()) {
702         // amdgcn.ballot(i1 1) is exec.
703         const char *RegName = "exec";
704         if (II.getType()->isIntegerTy(32))
705           RegName = "exec_lo";
706         else if (!II.getType()->isIntegerTy(64))
707           break;
708 
709         Function *NewF = Intrinsic::getDeclaration(
710             II.getModule(), Intrinsic::read_register, II.getType());
711         Metadata *MDArgs[] = {MDString::get(II.getContext(), RegName)};
712         MDNode *MD = MDNode::get(II.getContext(), MDArgs);
713         Value *Args[] = {MetadataAsValue::get(II.getContext(), MD)};
714         CallInst *NewCall = IC.Builder.CreateCall(NewF, Args);
715         NewCall->addAttribute(AttributeList::FunctionIndex,
716                               Attribute::Convergent);
717         NewCall->takeName(&II);
718         return IC.replaceInstUsesWith(II, NewCall);
719       }
720     }
721     break;
722   }
723   case Intrinsic::amdgcn_wqm_vote: {
724     // wqm_vote is identity when the argument is constant.
725     if (!isa<Constant>(II.getArgOperand(0)))
726       break;
727 
728     return IC.replaceInstUsesWith(II, II.getArgOperand(0));
729   }
730   case Intrinsic::amdgcn_kill: {
731     const ConstantInt *C = dyn_cast<ConstantInt>(II.getArgOperand(0));
732     if (!C || !C->getZExtValue())
733       break;
734 
735     // amdgcn.kill(i1 1) is a no-op
736     return IC.eraseInstFromFunction(II);
737   }
738   case Intrinsic::amdgcn_update_dpp: {
739     Value *Old = II.getArgOperand(0);
740 
741     auto *BC = cast<ConstantInt>(II.getArgOperand(5));
742     auto *RM = cast<ConstantInt>(II.getArgOperand(3));
743     auto *BM = cast<ConstantInt>(II.getArgOperand(4));
744     if (BC->isZeroValue() || RM->getZExtValue() != 0xF ||
745         BM->getZExtValue() != 0xF || isa<UndefValue>(Old))
746       break;
747 
748     // If bound_ctrl = 1, row mask = bank mask = 0xf we can omit old value.
749     return IC.replaceOperand(II, 0, UndefValue::get(Old->getType()));
750   }
751   case Intrinsic::amdgcn_permlane16:
752   case Intrinsic::amdgcn_permlanex16: {
753     // Discard vdst_in if it's not going to be read.
754     Value *VDstIn = II.getArgOperand(0);
755     if (isa<UndefValue>(VDstIn))
756       break;
757 
758     ConstantInt *FetchInvalid = cast<ConstantInt>(II.getArgOperand(4));
759     ConstantInt *BoundCtrl = cast<ConstantInt>(II.getArgOperand(5));
760     if (!FetchInvalid->getZExtValue() && !BoundCtrl->getZExtValue())
761       break;
762 
763     return IC.replaceOperand(II, 0, UndefValue::get(VDstIn->getType()));
764   }
765   case Intrinsic::amdgcn_readfirstlane:
766   case Intrinsic::amdgcn_readlane: {
767     // A constant value is trivially uniform.
768     if (Constant *C = dyn_cast<Constant>(II.getArgOperand(0))) {
769       return IC.replaceInstUsesWith(II, C);
770     }
771 
772     // The rest of these may not be safe if the exec may not be the same between
773     // the def and use.
774     Value *Src = II.getArgOperand(0);
775     Instruction *SrcInst = dyn_cast<Instruction>(Src);
776     if (SrcInst && SrcInst->getParent() != II.getParent())
777       break;
778 
779     // readfirstlane (readfirstlane x) -> readfirstlane x
780     // readlane (readfirstlane x), y -> readfirstlane x
781     if (match(Src,
782               PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readfirstlane>())) {
783       return IC.replaceInstUsesWith(II, Src);
784     }
785 
786     if (IID == Intrinsic::amdgcn_readfirstlane) {
787       // readfirstlane (readlane x, y) -> readlane x, y
788       if (match(Src, PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readlane>())) {
789         return IC.replaceInstUsesWith(II, Src);
790       }
791     } else {
792       // readlane (readlane x, y), y -> readlane x, y
793       if (match(Src, PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readlane>(
794                          PatternMatch::m_Value(),
795                          PatternMatch::m_Specific(II.getArgOperand(1))))) {
796         return IC.replaceInstUsesWith(II, Src);
797       }
798     }
799 
800     break;
801   }
802   case Intrinsic::amdgcn_ldexp: {
803     // FIXME: This doesn't introduce new instructions and belongs in
804     // InstructionSimplify.
805     Type *Ty = II.getType();
806     Value *Op0 = II.getArgOperand(0);
807     Value *Op1 = II.getArgOperand(1);
808 
809     // Folding undef to qnan is safe regardless of the FP mode.
810     if (isa<UndefValue>(Op0)) {
811       auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
812       return IC.replaceInstUsesWith(II, QNaN);
813     }
814 
815     const APFloat *C = nullptr;
816     match(Op0, PatternMatch::m_APFloat(C));
817 
818     // FIXME: Should flush denorms depending on FP mode, but that's ignored
819     // everywhere else.
820     //
821     // These cases should be safe, even with strictfp.
822     // ldexp(0.0, x) -> 0.0
823     // ldexp(-0.0, x) -> -0.0
824     // ldexp(inf, x) -> inf
825     // ldexp(-inf, x) -> -inf
826     if (C && (C->isZero() || C->isInfinity())) {
827       return IC.replaceInstUsesWith(II, Op0);
828     }
829 
830     // With strictfp, be more careful about possibly needing to flush denormals
831     // or not, and snan behavior depends on ieee_mode.
832     if (II.isStrictFP())
833       break;
834 
835     if (C && C->isNaN()) {
836       // FIXME: We just need to make the nan quiet here, but that's unavailable
837       // on APFloat, only IEEEfloat
838       auto *Quieted =
839           ConstantFP::get(Ty, scalbn(*C, 0, APFloat::rmNearestTiesToEven));
840       return IC.replaceInstUsesWith(II, Quieted);
841     }
842 
843     // ldexp(x, 0) -> x
844     // ldexp(x, undef) -> x
845     if (isa<UndefValue>(Op1) || match(Op1, PatternMatch::m_ZeroInt())) {
846       return IC.replaceInstUsesWith(II, Op0);
847     }
848 
849     break;
850   }
851   case Intrinsic::amdgcn_fmul_legacy: {
852     Value *Op0 = II.getArgOperand(0);
853     Value *Op1 = II.getArgOperand(1);
854 
855     // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
856     // infinity, gives +0.0.
857     // TODO: Move to InstSimplify?
858     if (match(Op0, PatternMatch::m_AnyZeroFP()) ||
859         match(Op1, PatternMatch::m_AnyZeroFP()))
860       return IC.replaceInstUsesWith(II, ConstantFP::getNullValue(II.getType()));
861 
862     // If we can prove we don't have one of the special cases then we can use a
863     // normal fmul instruction instead.
864     if (canSimplifyLegacyMulToMul(Op0, Op1, IC)) {
865       auto *FMul = IC.Builder.CreateFMulFMF(Op0, Op1, &II);
866       FMul->takeName(&II);
867       return IC.replaceInstUsesWith(II, FMul);
868     }
869     break;
870   }
871   case Intrinsic::amdgcn_fma_legacy: {
872     Value *Op0 = II.getArgOperand(0);
873     Value *Op1 = II.getArgOperand(1);
874     Value *Op2 = II.getArgOperand(2);
875 
876     // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
877     // infinity, gives +0.0.
878     // TODO: Move to InstSimplify?
879     if (match(Op0, PatternMatch::m_AnyZeroFP()) ||
880         match(Op1, PatternMatch::m_AnyZeroFP())) {
881       // It's tempting to just return Op2 here, but that would give the wrong
882       // result if Op2 was -0.0.
883       auto *Zero = ConstantFP::getNullValue(II.getType());
884       auto *FAdd = IC.Builder.CreateFAddFMF(Zero, Op2, &II);
885       FAdd->takeName(&II);
886       return IC.replaceInstUsesWith(II, FAdd);
887     }
888 
889     // If we can prove we don't have one of the special cases then we can use a
890     // normal fma instead.
891     if (canSimplifyLegacyMulToMul(Op0, Op1, IC)) {
892       II.setCalledOperand(Intrinsic::getDeclaration(
893           II.getModule(), Intrinsic::fma, II.getType()));
894       return &II;
895     }
896     break;
897   }
898   default: {
899     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
900             AMDGPU::getImageDimIntrinsicInfo(II.getIntrinsicID())) {
901       return simplifyAMDGCNImageIntrinsic(ST, ImageDimIntr, II, IC);
902     }
903   }
904   }
905   return None;
906 }
907 
908 /// Implement SimplifyDemandedVectorElts for amdgcn buffer and image intrinsics.
909 ///
910 /// Note: This only supports non-TFE/LWE image intrinsic calls; those have
911 ///       struct returns.
912 static Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC,
913                                                     IntrinsicInst &II,
914                                                     APInt DemandedElts,
915                                                     int DMaskIdx = -1) {
916 
917   auto *IIVTy = cast<FixedVectorType>(II.getType());
918   unsigned VWidth = IIVTy->getNumElements();
919   if (VWidth == 1)
920     return nullptr;
921 
922   IRBuilderBase::InsertPointGuard Guard(IC.Builder);
923   IC.Builder.SetInsertPoint(&II);
924 
925   // Assume the arguments are unchanged and later override them, if needed.
926   SmallVector<Value *, 16> Args(II.args());
927 
928   if (DMaskIdx < 0) {
929     // Buffer case.
930 
931     const unsigned ActiveBits = DemandedElts.getActiveBits();
932     const unsigned UnusedComponentsAtFront = DemandedElts.countTrailingZeros();
933 
934     // Start assuming the prefix of elements is demanded, but possibly clear
935     // some other bits if there are trailing zeros (unused components at front)
936     // and update offset.
937     DemandedElts = (1 << ActiveBits) - 1;
938 
939     if (UnusedComponentsAtFront > 0) {
940       static const unsigned InvalidOffsetIdx = 0xf;
941 
942       unsigned OffsetIdx;
943       switch (II.getIntrinsicID()) {
944       case Intrinsic::amdgcn_raw_buffer_load:
945         OffsetIdx = 1;
946         break;
947       case Intrinsic::amdgcn_s_buffer_load:
948         // If resulting type is vec3, there is no point in trimming the
949         // load with updated offset, as the vec3 would most likely be widened to
950         // vec4 anyway during lowering.
951         if (ActiveBits == 4 && UnusedComponentsAtFront == 1)
952           OffsetIdx = InvalidOffsetIdx;
953         else
954           OffsetIdx = 1;
955         break;
956       case Intrinsic::amdgcn_struct_buffer_load:
957         OffsetIdx = 2;
958         break;
959       default:
960         // TODO: handle tbuffer* intrinsics.
961         OffsetIdx = InvalidOffsetIdx;
962         break;
963       }
964 
965       if (OffsetIdx != InvalidOffsetIdx) {
966         // Clear demanded bits and update the offset.
967         DemandedElts &= ~((1 << UnusedComponentsAtFront) - 1);
968         auto *Offset = II.getArgOperand(OffsetIdx);
969         unsigned SingleComponentSizeInBits =
970             IC.getDataLayout().getTypeSizeInBits(II.getType()->getScalarType());
971         unsigned OffsetAdd =
972             UnusedComponentsAtFront * SingleComponentSizeInBits / 8;
973         auto *OffsetAddVal = ConstantInt::get(Offset->getType(), OffsetAdd);
974         Args[OffsetIdx] = IC.Builder.CreateAdd(Offset, OffsetAddVal);
975       }
976     }
977   } else {
978     // Image case.
979 
980     ConstantInt *DMask = cast<ConstantInt>(II.getArgOperand(DMaskIdx));
981     unsigned DMaskVal = DMask->getZExtValue() & 0xf;
982 
983     // Mask off values that are undefined because the dmask doesn't cover them
984     DemandedElts &= (1 << countPopulation(DMaskVal)) - 1;
985 
986     unsigned NewDMaskVal = 0;
987     unsigned OrigLoadIdx = 0;
988     for (unsigned SrcIdx = 0; SrcIdx < 4; ++SrcIdx) {
989       const unsigned Bit = 1 << SrcIdx;
990       if (!!(DMaskVal & Bit)) {
991         if (!!DemandedElts[OrigLoadIdx])
992           NewDMaskVal |= Bit;
993         OrigLoadIdx++;
994       }
995     }
996 
997     if (DMaskVal != NewDMaskVal)
998       Args[DMaskIdx] = ConstantInt::get(DMask->getType(), NewDMaskVal);
999   }
1000 
1001   unsigned NewNumElts = DemandedElts.countPopulation();
1002   if (!NewNumElts)
1003     return UndefValue::get(II.getType());
1004 
1005   if (NewNumElts >= VWidth && DemandedElts.isMask()) {
1006     if (DMaskIdx >= 0)
1007       II.setArgOperand(DMaskIdx, Args[DMaskIdx]);
1008     return nullptr;
1009   }
1010 
1011   // Validate function argument and return types, extracting overloaded types
1012   // along the way.
1013   SmallVector<Type *, 6> OverloadTys;
1014   if (!Intrinsic::getIntrinsicSignature(II.getCalledFunction(), OverloadTys))
1015     return nullptr;
1016 
1017   Module *M = II.getParent()->getParent()->getParent();
1018   Type *EltTy = IIVTy->getElementType();
1019   Type *NewTy =
1020       (NewNumElts == 1) ? EltTy : FixedVectorType::get(EltTy, NewNumElts);
1021 
1022   OverloadTys[0] = NewTy;
1023   Function *NewIntrin =
1024       Intrinsic::getDeclaration(M, II.getIntrinsicID(), OverloadTys);
1025 
1026   CallInst *NewCall = IC.Builder.CreateCall(NewIntrin, Args);
1027   NewCall->takeName(&II);
1028   NewCall->copyMetadata(II);
1029 
1030   if (NewNumElts == 1) {
1031     return IC.Builder.CreateInsertElement(UndefValue::get(II.getType()),
1032                                           NewCall,
1033                                           DemandedElts.countTrailingZeros());
1034   }
1035 
1036   SmallVector<int, 8> EltMask;
1037   unsigned NewLoadIdx = 0;
1038   for (unsigned OrigLoadIdx = 0; OrigLoadIdx < VWidth; ++OrigLoadIdx) {
1039     if (!!DemandedElts[OrigLoadIdx])
1040       EltMask.push_back(NewLoadIdx++);
1041     else
1042       EltMask.push_back(NewNumElts);
1043   }
1044 
1045   Value *Shuffle = IC.Builder.CreateShuffleVector(NewCall, EltMask);
1046 
1047   return Shuffle;
1048 }
1049 
1050 Optional<Value *> GCNTTIImpl::simplifyDemandedVectorEltsIntrinsic(
1051     InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
1052     APInt &UndefElts2, APInt &UndefElts3,
1053     std::function<void(Instruction *, unsigned, APInt, APInt &)>
1054         SimplifyAndSetOp) const {
1055   switch (II.getIntrinsicID()) {
1056   case Intrinsic::amdgcn_buffer_load:
1057   case Intrinsic::amdgcn_buffer_load_format:
1058   case Intrinsic::amdgcn_raw_buffer_load:
1059   case Intrinsic::amdgcn_raw_buffer_load_format:
1060   case Intrinsic::amdgcn_raw_tbuffer_load:
1061   case Intrinsic::amdgcn_s_buffer_load:
1062   case Intrinsic::amdgcn_struct_buffer_load:
1063   case Intrinsic::amdgcn_struct_buffer_load_format:
1064   case Intrinsic::amdgcn_struct_tbuffer_load:
1065   case Intrinsic::amdgcn_tbuffer_load:
1066     return simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts);
1067   default: {
1068     if (getAMDGPUImageDMaskIntrinsic(II.getIntrinsicID())) {
1069       return simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts, 0);
1070     }
1071     break;
1072   }
1073   }
1074   return None;
1075 }
1076