//===- AMDGPUInstCombineIntrinsic.cpp - AMDGPU specific InstCombine pass --===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // \file
// This file implements the AMDGPU-specific InstCombine hooks of the target's
// TargetTransformInfo implementation. It uses the target's detailed
// information to simplify calls to AMDGPU intrinsics, while letting the
// target-independent InstCombine handle the rest.
14 //
15 //===----------------------------------------------------------------------===//
16 
17 #include "AMDGPUInstrInfo.h"
18 #include "AMDGPUTargetTransformInfo.h"
19 #include "GCNSubtarget.h"
20 #include "llvm/IR/IntrinsicsAMDGPU.h"
21 #include "llvm/Transforms/InstCombine/InstCombiner.h"
22 
23 using namespace llvm;
24 
25 #define DEBUG_TYPE "AMDGPUtti"
26 
namespace {

// Row type of the table whose implementation is pulled in below by defining
// GET_AMDGPUImageDMaskIntrinsicTable_IMPL before including
// InstCombineTables.inc. The layout must stay in sync with that generated
// file, so fields must not be added, removed or reordered here.
struct AMDGPUImageDMaskIntrinsic {
  // NOTE(review): presumably an Intrinsic::ID value identifying an image
  // intrinsic that takes a dmask operand -- confirm against the generated
  // table.
  unsigned Intr;
};

// Request the table definition (not just its declaration) from the generated
// include. It is kept inside the anonymous namespace so it stays local to
// this translation unit.
#define GET_AMDGPUImageDMaskIntrinsicTable_IMPL
#include "InstCombineTables.inc"

} // end anonymous namespace
37 
38 // Constant fold llvm.amdgcn.fmed3 intrinsics for standard inputs.
39 //
40 // A single NaN input is folded to minnum, so we rely on that folding for
41 // handling NaNs.
42 static APFloat fmed3AMDGCN(const APFloat &Src0, const APFloat &Src1,
43                            const APFloat &Src2) {
44   APFloat Max3 = maxnum(maxnum(Src0, Src1), Src2);
45 
46   APFloat::cmpResult Cmp0 = Max3.compare(Src0);
47   assert(Cmp0 != APFloat::cmpUnordered && "nans handled separately");
48   if (Cmp0 == APFloat::cmpEqual)
49     return maxnum(Src1, Src2);
50 
51   APFloat::cmpResult Cmp1 = Max3.compare(Src1);
52   assert(Cmp1 != APFloat::cmpUnordered && "nans handled separately");
53   if (Cmp1 == APFloat::cmpEqual)
54     return maxnum(Src0, Src2);
55 
56   return maxnum(Src0, Src1);
57 }
58 
59 // Check if a value can be converted to a 16-bit value without losing
60 // precision.
61 // The value is expected to be either a float (IsFloat = true) or an unsigned
62 // integer (IsFloat = false).
63 static bool canSafelyConvertTo16Bit(Value &V, bool IsFloat) {
64   Type *VTy = V.getType();
65   if (VTy->isHalfTy() || VTy->isIntegerTy(16)) {
66     // The value is already 16-bit, so we don't want to convert to 16-bit again!
67     return false;
68   }
69   if (IsFloat) {
70     if (ConstantFP *ConstFloat = dyn_cast<ConstantFP>(&V)) {
71       // We need to check that if we cast the index down to a half, we do not
72       // lose precision.
73       APFloat FloatValue(ConstFloat->getValueAPF());
74       bool LosesInfo = true;
75       FloatValue.convert(APFloat::IEEEhalf(), APFloat::rmTowardZero,
76                          &LosesInfo);
77       return !LosesInfo;
78     }
79   } else {
80     if (ConstantInt *ConstInt = dyn_cast<ConstantInt>(&V)) {
81       // We need to check that if we cast the index down to an i16, we do not
82       // lose precision.
83       APInt IntValue(ConstInt->getValue());
84       return IntValue.getActiveBits() <= 16;
85     }
86   }
87 
88   Value *CastSrc;
89   bool IsExt = IsFloat ? match(&V, m_FPExt(PatternMatch::m_Value(CastSrc)))
90                        : match(&V, m_ZExt(PatternMatch::m_Value(CastSrc)));
91   if (IsExt) {
92     Type *CastSrcTy = CastSrc->getType();
93     if (CastSrcTy->isHalfTy() || CastSrcTy->isIntegerTy(16))
94       return true;
95   }
96 
97   return false;
98 }
99 
100 // Convert a value to 16-bit.
101 static Value *convertTo16Bit(Value &V, InstCombiner::BuilderTy &Builder) {
102   Type *VTy = V.getType();
103   if (isa<FPExtInst>(&V) || isa<SExtInst>(&V) || isa<ZExtInst>(&V))
104     return cast<Instruction>(&V)->getOperand(0);
105   if (VTy->isIntegerTy())
106     return Builder.CreateIntCast(&V, Type::getInt16Ty(V.getContext()), false);
107   if (VTy->isFloatingPointTy())
108     return Builder.CreateFPCast(&V, Type::getHalfTy(V.getContext()));
109 
110   llvm_unreachable("Should never be called!");
111 }
112 
113 /// Applies Func(OldIntr.Args, OldIntr.ArgTys), creates intrinsic call with
114 /// modified arguments (based on OldIntr) and replaces InstToReplace with
115 /// this newly created intrinsic call.
116 static Optional<Instruction *> modifyIntrinsicCall(
117     IntrinsicInst &OldIntr, Instruction &InstToReplace, unsigned NewIntr,
118     InstCombiner &IC,
119     std::function<void(SmallVectorImpl<Value *> &, SmallVectorImpl<Type *> &)>
120         Func) {
121   SmallVector<Type *, 4> ArgTys;
122   if (!Intrinsic::getIntrinsicSignature(OldIntr.getCalledFunction(), ArgTys))
123     return None;
124 
125   SmallVector<Value *, 8> Args(OldIntr.args());
126 
127   // Modify arguments and types
128   Func(Args, ArgTys);
129 
130   Function *I = Intrinsic::getDeclaration(OldIntr.getModule(), NewIntr, ArgTys);
131 
132   CallInst *NewCall = IC.Builder.CreateCall(I, Args);
133   NewCall->takeName(&OldIntr);
134   NewCall->copyMetadata(OldIntr);
135   if (isa<FPMathOperator>(NewCall))
136     NewCall->copyFastMathFlags(&OldIntr);
137 
138   // Erase and replace uses
139   if (!InstToReplace.getType()->isVoidTy())
140     IC.replaceInstUsesWith(InstToReplace, NewCall);
141 
142   bool RemoveOldIntr = &OldIntr != &InstToReplace;
143 
144   auto RetValue = IC.eraseInstFromFunction(InstToReplace);
145   if (RemoveOldIntr)
146     IC.eraseInstFromFunction(OldIntr);
147 
148   return RetValue;
149 }
150 
151 static Optional<Instruction *>
152 simplifyAMDGCNImageIntrinsic(const GCNSubtarget *ST,
153                              const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr,
154                              IntrinsicInst &II, InstCombiner &IC) {
155   // Optimize _L to _LZ when _L is zero
156   if (const auto *LZMappingInfo =
157           AMDGPU::getMIMGLZMappingInfo(ImageDimIntr->BaseOpcode)) {
158     if (auto *ConstantLod =
159             dyn_cast<ConstantFP>(II.getOperand(ImageDimIntr->LodIndex))) {
160       if (ConstantLod->isZero() || ConstantLod->isNegative()) {
161         const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
162             AMDGPU::getImageDimIntrinsicByBaseOpcode(LZMappingInfo->LZ,
163                                                      ImageDimIntr->Dim);
164         return modifyIntrinsicCall(
165             II, II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
166               Args.erase(Args.begin() + ImageDimIntr->LodIndex);
167             });
168       }
169     }
170   }
171 
172   // Optimize _mip away, when 'lod' is zero
173   if (const auto *MIPMappingInfo =
174           AMDGPU::getMIMGMIPMappingInfo(ImageDimIntr->BaseOpcode)) {
175     if (auto *ConstantMip =
176             dyn_cast<ConstantInt>(II.getOperand(ImageDimIntr->MipIndex))) {
177       if (ConstantMip->isZero()) {
178         const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
179             AMDGPU::getImageDimIntrinsicByBaseOpcode(MIPMappingInfo->NONMIP,
180                                                      ImageDimIntr->Dim);
181         return modifyIntrinsicCall(
182             II, II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
183               Args.erase(Args.begin() + ImageDimIntr->MipIndex);
184             });
185       }
186     }
187   }
188 
189   // Optimize _bias away when 'bias' is zero
190   if (const auto *BiasMappingInfo =
191           AMDGPU::getMIMGBiasMappingInfo(ImageDimIntr->BaseOpcode)) {
192     if (auto *ConstantBias =
193             dyn_cast<ConstantFP>(II.getOperand(ImageDimIntr->BiasIndex))) {
194       if (ConstantBias->isZero()) {
195         const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
196             AMDGPU::getImageDimIntrinsicByBaseOpcode(BiasMappingInfo->NoBias,
197                                                      ImageDimIntr->Dim);
198         return modifyIntrinsicCall(
199             II, II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
200               Args.erase(Args.begin() + ImageDimIntr->BiasIndex);
201               ArgTys.erase(ArgTys.begin() + ImageDimIntr->BiasTyArg);
202             });
203       }
204     }
205   }
206 
207   // Optimize _offset away when 'offset' is zero
208   if (const auto *OffsetMappingInfo =
209           AMDGPU::getMIMGOffsetMappingInfo(ImageDimIntr->BaseOpcode)) {
210     if (auto *ConstantOffset =
211             dyn_cast<ConstantInt>(II.getOperand(ImageDimIntr->OffsetIndex))) {
212       if (ConstantOffset->isZero()) {
213         const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
214             AMDGPU::getImageDimIntrinsicByBaseOpcode(
215                 OffsetMappingInfo->NoOffset, ImageDimIntr->Dim);
216         return modifyIntrinsicCall(
217             II, II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
218               Args.erase(Args.begin() + ImageDimIntr->OffsetIndex);
219             });
220       }
221     }
222   }
223 
224   // Try to use D16
225   if (ST->hasD16Images()) {
226 
227     const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
228         AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode);
229 
230     if (BaseOpcode->HasD16) {
231 
232       // If the only use of image intrinsic is a fptrunc (with conversion to
233       // half) then both fptrunc and image intrinsic will be replaced with image
234       // intrinsic with D16 flag.
235       if (II.hasOneUse()) {
236         Instruction *User = II.user_back();
237 
238         if (User->getOpcode() == Instruction::FPTrunc &&
239             User->getType()->getScalarType()->isHalfTy()) {
240 
241           return modifyIntrinsicCall(II, *User, ImageDimIntr->Intr, IC,
242                                      [&](auto &Args, auto &ArgTys) {
243                                        // Change return type of image intrinsic.
244                                        // Set it to return type of fptrunc.
245                                        ArgTys[0] = User->getType();
246                                      });
247         }
248       }
249     }
250   }
251 
252   // Try to use A16 or G16
253   if (!ST->hasA16() && !ST->hasG16())
254     return None;
255 
256   // Address is interpreted as float if the instruction has a sampler or as
257   // unsigned int if there is no sampler.
258   bool HasSampler =
259       AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode)->Sampler;
260   bool FloatCoord = false;
261   // true means derivatives can be converted to 16 bit, coordinates not
262   bool OnlyDerivatives = false;
263 
264   for (unsigned OperandIndex = ImageDimIntr->GradientStart;
265        OperandIndex < ImageDimIntr->VAddrEnd; OperandIndex++) {
266     Value *Coord = II.getOperand(OperandIndex);
267     // If the values are not derived from 16-bit values, we cannot optimize.
268     if (!canSafelyConvertTo16Bit(*Coord, HasSampler)) {
269       if (OperandIndex < ImageDimIntr->CoordStart ||
270           ImageDimIntr->GradientStart == ImageDimIntr->CoordStart) {
271         return None;
272       }
273       // All gradients can be converted, so convert only them
274       OnlyDerivatives = true;
275       break;
276     }
277 
278     assert(OperandIndex == ImageDimIntr->GradientStart ||
279            FloatCoord == Coord->getType()->isFloatingPointTy());
280     FloatCoord = Coord->getType()->isFloatingPointTy();
281   }
282 
283   if (!OnlyDerivatives && !ST->hasA16())
284     OnlyDerivatives = true; // Only supports G16
285 
286   // Check if there is a bias parameter and if it can be converted to f16
287   if (!OnlyDerivatives && ImageDimIntr->NumBiasArgs != 0) {
288     Value *Bias = II.getOperand(ImageDimIntr->BiasIndex);
289     assert(HasSampler &&
290            "Only image instructions with a sampler can have a bias");
291     if (!canSafelyConvertTo16Bit(*Bias, HasSampler))
292       OnlyDerivatives = true;
293   }
294 
295   if (OnlyDerivatives && (!ST->hasG16() || ImageDimIntr->GradientStart ==
296                                                ImageDimIntr->CoordStart))
297     return None;
298 
299   Type *CoordType = FloatCoord ? Type::getHalfTy(II.getContext())
300                                : Type::getInt16Ty(II.getContext());
301 
302   return modifyIntrinsicCall(
303       II, II, II.getIntrinsicID(), IC, [&](auto &Args, auto &ArgTys) {
304         ArgTys[ImageDimIntr->GradientTyArg] = CoordType;
305         if (!OnlyDerivatives) {
306           ArgTys[ImageDimIntr->CoordTyArg] = CoordType;
307 
308           // Change the bias type
309           if (ImageDimIntr->NumBiasArgs != 0)
310             ArgTys[ImageDimIntr->BiasTyArg] = Type::getHalfTy(II.getContext());
311         }
312 
313         unsigned EndIndex =
314             OnlyDerivatives ? ImageDimIntr->CoordStart : ImageDimIntr->VAddrEnd;
315         for (unsigned OperandIndex = ImageDimIntr->GradientStart;
316              OperandIndex < EndIndex; OperandIndex++) {
317           Args[OperandIndex] =
318               convertTo16Bit(*II.getOperand(OperandIndex), IC.Builder);
319         }
320 
321         // Convert the bias
322         if (!OnlyDerivatives && ImageDimIntr->NumBiasArgs != 0) {
323           Value *Bias = II.getOperand(ImageDimIntr->BiasIndex);
324           Args[ImageDimIntr->BiasIndex] = convertTo16Bit(*Bias, IC.Builder);
325         }
326       });
327 }
328 
329 bool GCNTTIImpl::canSimplifyLegacyMulToMul(const Value *Op0, const Value *Op1,
330                                            InstCombiner &IC) const {
331   // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
332   // infinity, gives +0.0. If we can prove we don't have one of the special
333   // cases then we can use a normal multiply instead.
334   // TODO: Create and use isKnownFiniteNonZero instead of just matching
335   // constants here.
336   if (match(Op0, PatternMatch::m_FiniteNonZero()) ||
337       match(Op1, PatternMatch::m_FiniteNonZero())) {
338     // One operand is not zero or infinity or NaN.
339     return true;
340   }
341   auto *TLI = &IC.getTargetLibraryInfo();
342   if (isKnownNeverInfinity(Op0, TLI) && isKnownNeverNaN(Op0, TLI) &&
343       isKnownNeverInfinity(Op1, TLI) && isKnownNeverNaN(Op1, TLI)) {
344     // Neither operand is infinity or NaN.
345     return true;
346   }
347   return false;
348 }
349 
350 Optional<Instruction *>
351 GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
352   Intrinsic::ID IID = II.getIntrinsicID();
353   switch (IID) {
354   case Intrinsic::amdgcn_rcp: {
355     Value *Src = II.getArgOperand(0);
356 
357     // TODO: Move to ConstantFolding/InstSimplify?
358     if (isa<UndefValue>(Src)) {
359       Type *Ty = II.getType();
360       auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
361       return IC.replaceInstUsesWith(II, QNaN);
362     }
363 
364     if (II.isStrictFP())
365       break;
366 
367     if (const ConstantFP *C = dyn_cast<ConstantFP>(Src)) {
368       const APFloat &ArgVal = C->getValueAPF();
369       APFloat Val(ArgVal.getSemantics(), 1);
370       Val.divide(ArgVal, APFloat::rmNearestTiesToEven);
371 
372       // This is more precise than the instruction may give.
373       //
374       // TODO: The instruction always flushes denormal results (except for f16),
375       // should this also?
376       return IC.replaceInstUsesWith(II, ConstantFP::get(II.getContext(), Val));
377     }
378 
379     break;
380   }
381   case Intrinsic::amdgcn_rsq: {
382     Value *Src = II.getArgOperand(0);
383 
384     // TODO: Move to ConstantFolding/InstSimplify?
385     if (isa<UndefValue>(Src)) {
386       Type *Ty = II.getType();
387       auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
388       return IC.replaceInstUsesWith(II, QNaN);
389     }
390 
391     break;
392   }
393   case Intrinsic::amdgcn_frexp_mant:
394   case Intrinsic::amdgcn_frexp_exp: {
395     Value *Src = II.getArgOperand(0);
396     if (const ConstantFP *C = dyn_cast<ConstantFP>(Src)) {
397       int Exp;
398       APFloat Significand =
399           frexp(C->getValueAPF(), Exp, APFloat::rmNearestTiesToEven);
400 
401       if (IID == Intrinsic::amdgcn_frexp_mant) {
402         return IC.replaceInstUsesWith(
403             II, ConstantFP::get(II.getContext(), Significand));
404       }
405 
406       // Match instruction special case behavior.
407       if (Exp == APFloat::IEK_NaN || Exp == APFloat::IEK_Inf)
408         Exp = 0;
409 
410       return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), Exp));
411     }
412 
413     if (isa<UndefValue>(Src)) {
414       return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
415     }
416 
417     break;
418   }
419   case Intrinsic::amdgcn_class: {
420     enum {
421       S_NAN = 1 << 0,       // Signaling NaN
422       Q_NAN = 1 << 1,       // Quiet NaN
423       N_INFINITY = 1 << 2,  // Negative infinity
424       N_NORMAL = 1 << 3,    // Negative normal
425       N_SUBNORMAL = 1 << 4, // Negative subnormal
426       N_ZERO = 1 << 5,      // Negative zero
427       P_ZERO = 1 << 6,      // Positive zero
428       P_SUBNORMAL = 1 << 7, // Positive subnormal
429       P_NORMAL = 1 << 8,    // Positive normal
430       P_INFINITY = 1 << 9   // Positive infinity
431     };
432 
433     const uint32_t FullMask = S_NAN | Q_NAN | N_INFINITY | N_NORMAL |
434                               N_SUBNORMAL | N_ZERO | P_ZERO | P_SUBNORMAL |
435                               P_NORMAL | P_INFINITY;
436 
437     Value *Src0 = II.getArgOperand(0);
438     Value *Src1 = II.getArgOperand(1);
439     const ConstantInt *CMask = dyn_cast<ConstantInt>(Src1);
440     if (!CMask) {
441       if (isa<UndefValue>(Src0)) {
442         return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
443       }
444 
445       if (isa<UndefValue>(Src1)) {
446         return IC.replaceInstUsesWith(II,
447                                       ConstantInt::get(II.getType(), false));
448       }
449       break;
450     }
451 
452     uint32_t Mask = CMask->getZExtValue();
453 
454     // If all tests are made, it doesn't matter what the value is.
455     if ((Mask & FullMask) == FullMask) {
456       return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), true));
457     }
458 
459     if ((Mask & FullMask) == 0) {
460       return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), false));
461     }
462 
463     if (Mask == (S_NAN | Q_NAN)) {
464       // Equivalent of isnan. Replace with standard fcmp.
465       Value *FCmp = IC.Builder.CreateFCmpUNO(Src0, Src0);
466       FCmp->takeName(&II);
467       return IC.replaceInstUsesWith(II, FCmp);
468     }
469 
470     if (Mask == (N_ZERO | P_ZERO)) {
471       // Equivalent of == 0.
472       Value *FCmp =
473           IC.Builder.CreateFCmpOEQ(Src0, ConstantFP::get(Src0->getType(), 0.0));
474 
475       FCmp->takeName(&II);
476       return IC.replaceInstUsesWith(II, FCmp);
477     }
478 
479     // fp_class (nnan x), qnan|snan|other -> fp_class (nnan x), other
480     if (((Mask & S_NAN) || (Mask & Q_NAN)) &&
481         isKnownNeverNaN(Src0, &IC.getTargetLibraryInfo())) {
482       return IC.replaceOperand(
483           II, 1, ConstantInt::get(Src1->getType(), Mask & ~(S_NAN | Q_NAN)));
484     }
485 
486     const ConstantFP *CVal = dyn_cast<ConstantFP>(Src0);
487     if (!CVal) {
488       if (isa<UndefValue>(Src0)) {
489         return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
490       }
491 
492       // Clamp mask to used bits
493       if ((Mask & FullMask) != Mask) {
494         CallInst *NewCall = IC.Builder.CreateCall(
495             II.getCalledFunction(),
496             {Src0, ConstantInt::get(Src1->getType(), Mask & FullMask)});
497 
498         NewCall->takeName(&II);
499         return IC.replaceInstUsesWith(II, NewCall);
500       }
501 
502       break;
503     }
504 
505     const APFloat &Val = CVal->getValueAPF();
506 
507     bool Result =
508         ((Mask & S_NAN) && Val.isNaN() && Val.isSignaling()) ||
509         ((Mask & Q_NAN) && Val.isNaN() && !Val.isSignaling()) ||
510         ((Mask & N_INFINITY) && Val.isInfinity() && Val.isNegative()) ||
511         ((Mask & N_NORMAL) && Val.isNormal() && Val.isNegative()) ||
512         ((Mask & N_SUBNORMAL) && Val.isDenormal() && Val.isNegative()) ||
513         ((Mask & N_ZERO) && Val.isZero() && Val.isNegative()) ||
514         ((Mask & P_ZERO) && Val.isZero() && !Val.isNegative()) ||
515         ((Mask & P_SUBNORMAL) && Val.isDenormal() && !Val.isNegative()) ||
516         ((Mask & P_NORMAL) && Val.isNormal() && !Val.isNegative()) ||
517         ((Mask & P_INFINITY) && Val.isInfinity() && !Val.isNegative());
518 
519     return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), Result));
520   }
521   case Intrinsic::amdgcn_cvt_pkrtz: {
522     Value *Src0 = II.getArgOperand(0);
523     Value *Src1 = II.getArgOperand(1);
524     if (const ConstantFP *C0 = dyn_cast<ConstantFP>(Src0)) {
525       if (const ConstantFP *C1 = dyn_cast<ConstantFP>(Src1)) {
526         const fltSemantics &HalfSem =
527             II.getType()->getScalarType()->getFltSemantics();
528         bool LosesInfo;
529         APFloat Val0 = C0->getValueAPF();
530         APFloat Val1 = C1->getValueAPF();
531         Val0.convert(HalfSem, APFloat::rmTowardZero, &LosesInfo);
532         Val1.convert(HalfSem, APFloat::rmTowardZero, &LosesInfo);
533 
534         Constant *Folded =
535             ConstantVector::get({ConstantFP::get(II.getContext(), Val0),
536                                  ConstantFP::get(II.getContext(), Val1)});
537         return IC.replaceInstUsesWith(II, Folded);
538       }
539     }
540 
541     if (isa<UndefValue>(Src0) && isa<UndefValue>(Src1)) {
542       return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
543     }
544 
545     break;
546   }
547   case Intrinsic::amdgcn_cvt_pknorm_i16:
548   case Intrinsic::amdgcn_cvt_pknorm_u16:
549   case Intrinsic::amdgcn_cvt_pk_i16:
550   case Intrinsic::amdgcn_cvt_pk_u16: {
551     Value *Src0 = II.getArgOperand(0);
552     Value *Src1 = II.getArgOperand(1);
553 
554     if (isa<UndefValue>(Src0) && isa<UndefValue>(Src1)) {
555       return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
556     }
557 
558     break;
559   }
560   case Intrinsic::amdgcn_ubfe:
561   case Intrinsic::amdgcn_sbfe: {
562     // Decompose simple cases into standard shifts.
563     Value *Src = II.getArgOperand(0);
564     if (isa<UndefValue>(Src)) {
565       return IC.replaceInstUsesWith(II, Src);
566     }
567 
568     unsigned Width;
569     Type *Ty = II.getType();
570     unsigned IntSize = Ty->getIntegerBitWidth();
571 
572     ConstantInt *CWidth = dyn_cast<ConstantInt>(II.getArgOperand(2));
573     if (CWidth) {
574       Width = CWidth->getZExtValue();
575       if ((Width & (IntSize - 1)) == 0) {
576         return IC.replaceInstUsesWith(II, ConstantInt::getNullValue(Ty));
577       }
578 
579       // Hardware ignores high bits, so remove those.
580       if (Width >= IntSize) {
581         return IC.replaceOperand(
582             II, 2, ConstantInt::get(CWidth->getType(), Width & (IntSize - 1)));
583       }
584     }
585 
586     unsigned Offset;
587     ConstantInt *COffset = dyn_cast<ConstantInt>(II.getArgOperand(1));
588     if (COffset) {
589       Offset = COffset->getZExtValue();
590       if (Offset >= IntSize) {
591         return IC.replaceOperand(
592             II, 1,
593             ConstantInt::get(COffset->getType(), Offset & (IntSize - 1)));
594       }
595     }
596 
597     bool Signed = IID == Intrinsic::amdgcn_sbfe;
598 
599     if (!CWidth || !COffset)
600       break;
601 
602     // The case of Width == 0 is handled above, which makes this transformation
603     // safe.  If Width == 0, then the ashr and lshr instructions become poison
604     // value since the shift amount would be equal to the bit size.
605     assert(Width != 0);
606 
607     // TODO: This allows folding to undef when the hardware has specific
608     // behavior?
609     if (Offset + Width < IntSize) {
610       Value *Shl = IC.Builder.CreateShl(Src, IntSize - Offset - Width);
611       Value *RightShift = Signed ? IC.Builder.CreateAShr(Shl, IntSize - Width)
612                                  : IC.Builder.CreateLShr(Shl, IntSize - Width);
613       RightShift->takeName(&II);
614       return IC.replaceInstUsesWith(II, RightShift);
615     }
616 
617     Value *RightShift = Signed ? IC.Builder.CreateAShr(Src, Offset)
618                                : IC.Builder.CreateLShr(Src, Offset);
619 
620     RightShift->takeName(&II);
621     return IC.replaceInstUsesWith(II, RightShift);
622   }
623   case Intrinsic::amdgcn_exp:
624   case Intrinsic::amdgcn_exp_row:
625   case Intrinsic::amdgcn_exp_compr: {
626     ConstantInt *En = cast<ConstantInt>(II.getArgOperand(1));
627     unsigned EnBits = En->getZExtValue();
628     if (EnBits == 0xf)
629       break; // All inputs enabled.
630 
631     bool IsCompr = IID == Intrinsic::amdgcn_exp_compr;
632     bool Changed = false;
633     for (int I = 0; I < (IsCompr ? 2 : 4); ++I) {
634       if ((!IsCompr && (EnBits & (1 << I)) == 0) ||
635           (IsCompr && ((EnBits & (0x3 << (2 * I))) == 0))) {
636         Value *Src = II.getArgOperand(I + 2);
637         if (!isa<UndefValue>(Src)) {
638           IC.replaceOperand(II, I + 2, UndefValue::get(Src->getType()));
639           Changed = true;
640         }
641       }
642     }
643 
644     if (Changed) {
645       return &II;
646     }
647 
648     break;
649   }
650   case Intrinsic::amdgcn_fmed3: {
651     // Note this does not preserve proper sNaN behavior if IEEE-mode is enabled
652     // for the shader.
653 
654     Value *Src0 = II.getArgOperand(0);
655     Value *Src1 = II.getArgOperand(1);
656     Value *Src2 = II.getArgOperand(2);
657 
658     // Checking for NaN before canonicalization provides better fidelity when
659     // mapping other operations onto fmed3 since the order of operands is
660     // unchanged.
661     CallInst *NewCall = nullptr;
662     if (match(Src0, PatternMatch::m_NaN()) || isa<UndefValue>(Src0)) {
663       NewCall = IC.Builder.CreateMinNum(Src1, Src2);
664     } else if (match(Src1, PatternMatch::m_NaN()) || isa<UndefValue>(Src1)) {
665       NewCall = IC.Builder.CreateMinNum(Src0, Src2);
666     } else if (match(Src2, PatternMatch::m_NaN()) || isa<UndefValue>(Src2)) {
667       NewCall = IC.Builder.CreateMaxNum(Src0, Src1);
668     }
669 
670     if (NewCall) {
671       NewCall->copyFastMathFlags(&II);
672       NewCall->takeName(&II);
673       return IC.replaceInstUsesWith(II, NewCall);
674     }
675 
676     bool Swap = false;
677     // Canonicalize constants to RHS operands.
678     //
679     // fmed3(c0, x, c1) -> fmed3(x, c0, c1)
680     if (isa<Constant>(Src0) && !isa<Constant>(Src1)) {
681       std::swap(Src0, Src1);
682       Swap = true;
683     }
684 
685     if (isa<Constant>(Src1) && !isa<Constant>(Src2)) {
686       std::swap(Src1, Src2);
687       Swap = true;
688     }
689 
690     if (isa<Constant>(Src0) && !isa<Constant>(Src1)) {
691       std::swap(Src0, Src1);
692       Swap = true;
693     }
694 
695     if (Swap) {
696       II.setArgOperand(0, Src0);
697       II.setArgOperand(1, Src1);
698       II.setArgOperand(2, Src2);
699       return &II;
700     }
701 
702     if (const ConstantFP *C0 = dyn_cast<ConstantFP>(Src0)) {
703       if (const ConstantFP *C1 = dyn_cast<ConstantFP>(Src1)) {
704         if (const ConstantFP *C2 = dyn_cast<ConstantFP>(Src2)) {
705           APFloat Result = fmed3AMDGCN(C0->getValueAPF(), C1->getValueAPF(),
706                                        C2->getValueAPF());
707           return IC.replaceInstUsesWith(
708               II, ConstantFP::get(IC.Builder.getContext(), Result));
709         }
710       }
711     }
712 
713     break;
714   }
715   case Intrinsic::amdgcn_icmp:
716   case Intrinsic::amdgcn_fcmp: {
717     const ConstantInt *CC = cast<ConstantInt>(II.getArgOperand(2));
718     // Guard against invalid arguments.
719     int64_t CCVal = CC->getZExtValue();
720     bool IsInteger = IID == Intrinsic::amdgcn_icmp;
721     if ((IsInteger && (CCVal < CmpInst::FIRST_ICMP_PREDICATE ||
722                        CCVal > CmpInst::LAST_ICMP_PREDICATE)) ||
723         (!IsInteger && (CCVal < CmpInst::FIRST_FCMP_PREDICATE ||
724                         CCVal > CmpInst::LAST_FCMP_PREDICATE)))
725       break;
726 
727     Value *Src0 = II.getArgOperand(0);
728     Value *Src1 = II.getArgOperand(1);
729 
730     if (auto *CSrc0 = dyn_cast<Constant>(Src0)) {
731       if (auto *CSrc1 = dyn_cast<Constant>(Src1)) {
732         Constant *CCmp = ConstantExpr::getCompare(CCVal, CSrc0, CSrc1);
733         if (CCmp->isNullValue()) {
734           return IC.replaceInstUsesWith(
735               II, ConstantExpr::getSExt(CCmp, II.getType()));
736         }
737 
738         // The result of V_ICMP/V_FCMP assembly instructions (which this
739         // intrinsic exposes) is one bit per thread, masked with the EXEC
740         // register (which contains the bitmask of live threads). So a
741         // comparison that always returns true is the same as a read of the
742         // EXEC register.
743         Function *NewF = Intrinsic::getDeclaration(
744             II.getModule(), Intrinsic::read_register, II.getType());
745         Metadata *MDArgs[] = {MDString::get(II.getContext(), "exec")};
746         MDNode *MD = MDNode::get(II.getContext(), MDArgs);
747         Value *Args[] = {MetadataAsValue::get(II.getContext(), MD)};
748         CallInst *NewCall = IC.Builder.CreateCall(NewF, Args);
749         NewCall->addFnAttr(Attribute::Convergent);
750         NewCall->takeName(&II);
751         return IC.replaceInstUsesWith(II, NewCall);
752       }
753 
754       // Canonicalize constants to RHS.
755       CmpInst::Predicate SwapPred =
756           CmpInst::getSwappedPredicate(static_cast<CmpInst::Predicate>(CCVal));
757       II.setArgOperand(0, Src1);
758       II.setArgOperand(1, Src0);
759       II.setArgOperand(
760           2, ConstantInt::get(CC->getType(), static_cast<int>(SwapPred)));
761       return &II;
762     }
763 
764     if (CCVal != CmpInst::ICMP_EQ && CCVal != CmpInst::ICMP_NE)
765       break;
766 
767     // Canonicalize compare eq with true value to compare != 0
768     // llvm.amdgcn.icmp(zext (i1 x), 1, eq)
769     //   -> llvm.amdgcn.icmp(zext (i1 x), 0, ne)
770     // llvm.amdgcn.icmp(sext (i1 x), -1, eq)
771     //   -> llvm.amdgcn.icmp(sext (i1 x), 0, ne)
772     Value *ExtSrc;
773     if (CCVal == CmpInst::ICMP_EQ &&
774         ((match(Src1, PatternMatch::m_One()) &&
775           match(Src0, m_ZExt(PatternMatch::m_Value(ExtSrc)))) ||
776          (match(Src1, PatternMatch::m_AllOnes()) &&
777           match(Src0, m_SExt(PatternMatch::m_Value(ExtSrc))))) &&
778         ExtSrc->getType()->isIntegerTy(1)) {
779       IC.replaceOperand(II, 1, ConstantInt::getNullValue(Src1->getType()));
780       IC.replaceOperand(II, 2,
781                         ConstantInt::get(CC->getType(), CmpInst::ICMP_NE));
782       return &II;
783     }
784 
785     CmpInst::Predicate SrcPred;
786     Value *SrcLHS;
787     Value *SrcRHS;
788 
789     // Fold compare eq/ne with 0 from a compare result as the predicate to the
790     // intrinsic. The typical use is a wave vote function in the library, which
791     // will be fed from a user code condition compared with 0. Fold in the
792     // redundant compare.
793 
794     // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, ne)
795     //   -> llvm.amdgcn.[if]cmp(a, b, pred)
796     //
797     // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, eq)
798     //   -> llvm.amdgcn.[if]cmp(a, b, inv pred)
799     if (match(Src1, PatternMatch::m_Zero()) &&
800         match(Src0, PatternMatch::m_ZExtOrSExt(
801                         m_Cmp(SrcPred, PatternMatch::m_Value(SrcLHS),
802                               PatternMatch::m_Value(SrcRHS))))) {
803       if (CCVal == CmpInst::ICMP_EQ)
804         SrcPred = CmpInst::getInversePredicate(SrcPred);
805 
806       Intrinsic::ID NewIID = CmpInst::isFPPredicate(SrcPred)
807                                  ? Intrinsic::amdgcn_fcmp
808                                  : Intrinsic::amdgcn_icmp;
809 
810       Type *Ty = SrcLHS->getType();
811       if (auto *CmpType = dyn_cast<IntegerType>(Ty)) {
812         // Promote to next legal integer type.
813         unsigned Width = CmpType->getBitWidth();
814         unsigned NewWidth = Width;
815 
816         // Don't do anything for i1 comparisons.
817         if (Width == 1)
818           break;
819 
820         if (Width <= 16)
821           NewWidth = 16;
822         else if (Width <= 32)
823           NewWidth = 32;
824         else if (Width <= 64)
825           NewWidth = 64;
826         else if (Width > 64)
827           break; // Can't handle this.
828 
829         if (Width != NewWidth) {
830           IntegerType *CmpTy = IC.Builder.getIntNTy(NewWidth);
831           if (CmpInst::isSigned(SrcPred)) {
832             SrcLHS = IC.Builder.CreateSExt(SrcLHS, CmpTy);
833             SrcRHS = IC.Builder.CreateSExt(SrcRHS, CmpTy);
834           } else {
835             SrcLHS = IC.Builder.CreateZExt(SrcLHS, CmpTy);
836             SrcRHS = IC.Builder.CreateZExt(SrcRHS, CmpTy);
837           }
838         }
839       } else if (!Ty->isFloatTy() && !Ty->isDoubleTy() && !Ty->isHalfTy())
840         break;
841 
842       Function *NewF = Intrinsic::getDeclaration(
843           II.getModule(), NewIID, {II.getType(), SrcLHS->getType()});
844       Value *Args[] = {SrcLHS, SrcRHS,
845                        ConstantInt::get(CC->getType(), SrcPred)};
846       CallInst *NewCall = IC.Builder.CreateCall(NewF, Args);
847       NewCall->takeName(&II);
848       return IC.replaceInstUsesWith(II, NewCall);
849     }
850 
851     break;
852   }
853   case Intrinsic::amdgcn_ballot: {
854     if (auto *Src = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
855       if (Src->isZero()) {
856         // amdgcn.ballot(i1 0) is zero.
857         return IC.replaceInstUsesWith(II, Constant::getNullValue(II.getType()));
858       }
859 
860       if (Src->isOne()) {
861         // amdgcn.ballot(i1 1) is exec.
862         const char *RegName = "exec";
863         if (II.getType()->isIntegerTy(32))
864           RegName = "exec_lo";
865         else if (!II.getType()->isIntegerTy(64))
866           break;
867 
868         Function *NewF = Intrinsic::getDeclaration(
869             II.getModule(), Intrinsic::read_register, II.getType());
870         Metadata *MDArgs[] = {MDString::get(II.getContext(), RegName)};
871         MDNode *MD = MDNode::get(II.getContext(), MDArgs);
872         Value *Args[] = {MetadataAsValue::get(II.getContext(), MD)};
873         CallInst *NewCall = IC.Builder.CreateCall(NewF, Args);
874         NewCall->addFnAttr(Attribute::Convergent);
875         NewCall->takeName(&II);
876         return IC.replaceInstUsesWith(II, NewCall);
877       }
878     }
879     break;
880   }
881   case Intrinsic::amdgcn_wqm_vote: {
882     // wqm_vote is identity when the argument is constant.
883     if (!isa<Constant>(II.getArgOperand(0)))
884       break;
885 
886     return IC.replaceInstUsesWith(II, II.getArgOperand(0));
887   }
888   case Intrinsic::amdgcn_kill: {
889     const ConstantInt *C = dyn_cast<ConstantInt>(II.getArgOperand(0));
890     if (!C || !C->getZExtValue())
891       break;
892 
893     // amdgcn.kill(i1 1) is a no-op
894     return IC.eraseInstFromFunction(II);
895   }
896   case Intrinsic::amdgcn_update_dpp: {
897     Value *Old = II.getArgOperand(0);
898 
899     auto *BC = cast<ConstantInt>(II.getArgOperand(5));
900     auto *RM = cast<ConstantInt>(II.getArgOperand(3));
901     auto *BM = cast<ConstantInt>(II.getArgOperand(4));
902     if (BC->isZeroValue() || RM->getZExtValue() != 0xF ||
903         BM->getZExtValue() != 0xF || isa<UndefValue>(Old))
904       break;
905 
906     // If bound_ctrl = 1, row mask = bank mask = 0xf we can omit old value.
907     return IC.replaceOperand(II, 0, UndefValue::get(Old->getType()));
908   }
909   case Intrinsic::amdgcn_permlane16:
910   case Intrinsic::amdgcn_permlanex16: {
911     // Discard vdst_in if it's not going to be read.
912     Value *VDstIn = II.getArgOperand(0);
913     if (isa<UndefValue>(VDstIn))
914       break;
915 
916     ConstantInt *FetchInvalid = cast<ConstantInt>(II.getArgOperand(4));
917     ConstantInt *BoundCtrl = cast<ConstantInt>(II.getArgOperand(5));
918     if (!FetchInvalid->getZExtValue() && !BoundCtrl->getZExtValue())
919       break;
920 
921     return IC.replaceOperand(II, 0, UndefValue::get(VDstIn->getType()));
922   }
923   case Intrinsic::amdgcn_permlane64:
924     // A constant value is trivially uniform.
925     if (Constant *C = dyn_cast<Constant>(II.getArgOperand(0))) {
926       return IC.replaceInstUsesWith(II, C);
927     }
928     break;
929   case Intrinsic::amdgcn_readfirstlane:
930   case Intrinsic::amdgcn_readlane: {
931     // A constant value is trivially uniform.
932     if (Constant *C = dyn_cast<Constant>(II.getArgOperand(0))) {
933       return IC.replaceInstUsesWith(II, C);
934     }
935 
936     // The rest of these may not be safe if the exec may not be the same between
937     // the def and use.
938     Value *Src = II.getArgOperand(0);
939     Instruction *SrcInst = dyn_cast<Instruction>(Src);
940     if (SrcInst && SrcInst->getParent() != II.getParent())
941       break;
942 
943     // readfirstlane (readfirstlane x) -> readfirstlane x
944     // readlane (readfirstlane x), y -> readfirstlane x
945     if (match(Src,
946               PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readfirstlane>())) {
947       return IC.replaceInstUsesWith(II, Src);
948     }
949 
950     if (IID == Intrinsic::amdgcn_readfirstlane) {
951       // readfirstlane (readlane x, y) -> readlane x, y
952       if (match(Src, PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readlane>())) {
953         return IC.replaceInstUsesWith(II, Src);
954       }
955     } else {
956       // readlane (readlane x, y), y -> readlane x, y
957       if (match(Src, PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readlane>(
958                          PatternMatch::m_Value(),
959                          PatternMatch::m_Specific(II.getArgOperand(1))))) {
960         return IC.replaceInstUsesWith(II, Src);
961       }
962     }
963 
964     break;
965   }
966   case Intrinsic::amdgcn_ldexp: {
967     // FIXME: This doesn't introduce new instructions and belongs in
968     // InstructionSimplify.
969     Type *Ty = II.getType();
970     Value *Op0 = II.getArgOperand(0);
971     Value *Op1 = II.getArgOperand(1);
972 
973     // Folding undef to qnan is safe regardless of the FP mode.
974     if (isa<UndefValue>(Op0)) {
975       auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
976       return IC.replaceInstUsesWith(II, QNaN);
977     }
978 
979     const APFloat *C = nullptr;
980     match(Op0, PatternMatch::m_APFloat(C));
981 
982     // FIXME: Should flush denorms depending on FP mode, but that's ignored
983     // everywhere else.
984     //
985     // These cases should be safe, even with strictfp.
986     // ldexp(0.0, x) -> 0.0
987     // ldexp(-0.0, x) -> -0.0
988     // ldexp(inf, x) -> inf
989     // ldexp(-inf, x) -> -inf
990     if (C && (C->isZero() || C->isInfinity())) {
991       return IC.replaceInstUsesWith(II, Op0);
992     }
993 
994     // With strictfp, be more careful about possibly needing to flush denormals
995     // or not, and snan behavior depends on ieee_mode.
996     if (II.isStrictFP())
997       break;
998 
999     if (C && C->isNaN()) {
1000       // FIXME: We just need to make the nan quiet here, but that's unavailable
1001       // on APFloat, only IEEEfloat
1002       auto *Quieted =
1003           ConstantFP::get(Ty, scalbn(*C, 0, APFloat::rmNearestTiesToEven));
1004       return IC.replaceInstUsesWith(II, Quieted);
1005     }
1006 
1007     // ldexp(x, 0) -> x
1008     // ldexp(x, undef) -> x
1009     if (isa<UndefValue>(Op1) || match(Op1, PatternMatch::m_ZeroInt())) {
1010       return IC.replaceInstUsesWith(II, Op0);
1011     }
1012 
1013     break;
1014   }
1015   case Intrinsic::amdgcn_fmul_legacy: {
1016     Value *Op0 = II.getArgOperand(0);
1017     Value *Op1 = II.getArgOperand(1);
1018 
1019     // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
1020     // infinity, gives +0.0.
1021     // TODO: Move to InstSimplify?
1022     if (match(Op0, PatternMatch::m_AnyZeroFP()) ||
1023         match(Op1, PatternMatch::m_AnyZeroFP()))
1024       return IC.replaceInstUsesWith(II, ConstantFP::getNullValue(II.getType()));
1025 
1026     // If we can prove we don't have one of the special cases then we can use a
1027     // normal fmul instruction instead.
1028     if (canSimplifyLegacyMulToMul(Op0, Op1, IC)) {
1029       auto *FMul = IC.Builder.CreateFMulFMF(Op0, Op1, &II);
1030       FMul->takeName(&II);
1031       return IC.replaceInstUsesWith(II, FMul);
1032     }
1033     break;
1034   }
1035   case Intrinsic::amdgcn_fma_legacy: {
1036     Value *Op0 = II.getArgOperand(0);
1037     Value *Op1 = II.getArgOperand(1);
1038     Value *Op2 = II.getArgOperand(2);
1039 
1040     // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
1041     // infinity, gives +0.0.
1042     // TODO: Move to InstSimplify?
1043     if (match(Op0, PatternMatch::m_AnyZeroFP()) ||
1044         match(Op1, PatternMatch::m_AnyZeroFP())) {
1045       // It's tempting to just return Op2 here, but that would give the wrong
1046       // result if Op2 was -0.0.
1047       auto *Zero = ConstantFP::getNullValue(II.getType());
1048       auto *FAdd = IC.Builder.CreateFAddFMF(Zero, Op2, &II);
1049       FAdd->takeName(&II);
1050       return IC.replaceInstUsesWith(II, FAdd);
1051     }
1052 
1053     // If we can prove we don't have one of the special cases then we can use a
1054     // normal fma instead.
1055     if (canSimplifyLegacyMulToMul(Op0, Op1, IC)) {
1056       II.setCalledOperand(Intrinsic::getDeclaration(
1057           II.getModule(), Intrinsic::fma, II.getType()));
1058       return &II;
1059     }
1060     break;
1061   }
1062   case Intrinsic::amdgcn_is_shared:
1063   case Intrinsic::amdgcn_is_private: {
1064     if (isa<UndefValue>(II.getArgOperand(0)))
1065       return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
1066 
1067     if (isa<ConstantPointerNull>(II.getArgOperand(0)))
1068       return IC.replaceInstUsesWith(II, ConstantInt::getFalse(II.getType()));
1069     break;
1070   }
1071   default: {
1072     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
1073             AMDGPU::getImageDimIntrinsicInfo(II.getIntrinsicID())) {
1074       return simplifyAMDGCNImageIntrinsic(ST, ImageDimIntr, II, IC);
1075     }
1076   }
1077   }
1078   return None;
1079 }
1080 
1081 /// Implement SimplifyDemandedVectorElts for amdgcn buffer and image intrinsics.
1082 ///
1083 /// Note: This only supports non-TFE/LWE image intrinsic calls; those have
1084 ///       struct returns.
1085 static Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC,
1086                                                     IntrinsicInst &II,
1087                                                     APInt DemandedElts,
1088                                                     int DMaskIdx = -1) {
1089 
1090   auto *IIVTy = cast<FixedVectorType>(II.getType());
1091   unsigned VWidth = IIVTy->getNumElements();
1092   if (VWidth == 1)
1093     return nullptr;
1094 
1095   IRBuilderBase::InsertPointGuard Guard(IC.Builder);
1096   IC.Builder.SetInsertPoint(&II);
1097 
1098   // Assume the arguments are unchanged and later override them, if needed.
1099   SmallVector<Value *, 16> Args(II.args());
1100 
1101   if (DMaskIdx < 0) {
1102     // Buffer case.
1103 
1104     const unsigned ActiveBits = DemandedElts.getActiveBits();
1105     const unsigned UnusedComponentsAtFront = DemandedElts.countTrailingZeros();
1106 
1107     // Start assuming the prefix of elements is demanded, but possibly clear
1108     // some other bits if there are trailing zeros (unused components at front)
1109     // and update offset.
1110     DemandedElts = (1 << ActiveBits) - 1;
1111 
1112     if (UnusedComponentsAtFront > 0) {
1113       static const unsigned InvalidOffsetIdx = 0xf;
1114 
1115       unsigned OffsetIdx;
1116       switch (II.getIntrinsicID()) {
1117       case Intrinsic::amdgcn_raw_buffer_load:
1118         OffsetIdx = 1;
1119         break;
1120       case Intrinsic::amdgcn_s_buffer_load:
1121         // If resulting type is vec3, there is no point in trimming the
1122         // load with updated offset, as the vec3 would most likely be widened to
1123         // vec4 anyway during lowering.
1124         if (ActiveBits == 4 && UnusedComponentsAtFront == 1)
1125           OffsetIdx = InvalidOffsetIdx;
1126         else
1127           OffsetIdx = 1;
1128         break;
1129       case Intrinsic::amdgcn_struct_buffer_load:
1130         OffsetIdx = 2;
1131         break;
1132       default:
1133         // TODO: handle tbuffer* intrinsics.
1134         OffsetIdx = InvalidOffsetIdx;
1135         break;
1136       }
1137 
1138       if (OffsetIdx != InvalidOffsetIdx) {
1139         // Clear demanded bits and update the offset.
1140         DemandedElts &= ~((1 << UnusedComponentsAtFront) - 1);
1141         auto *Offset = II.getArgOperand(OffsetIdx);
1142         unsigned SingleComponentSizeInBits =
1143             IC.getDataLayout().getTypeSizeInBits(II.getType()->getScalarType());
1144         unsigned OffsetAdd =
1145             UnusedComponentsAtFront * SingleComponentSizeInBits / 8;
1146         auto *OffsetAddVal = ConstantInt::get(Offset->getType(), OffsetAdd);
1147         Args[OffsetIdx] = IC.Builder.CreateAdd(Offset, OffsetAddVal);
1148       }
1149     }
1150   } else {
1151     // Image case.
1152 
1153     ConstantInt *DMask = cast<ConstantInt>(II.getArgOperand(DMaskIdx));
1154     unsigned DMaskVal = DMask->getZExtValue() & 0xf;
1155 
1156     // Mask off values that are undefined because the dmask doesn't cover them
1157     DemandedElts &= (1 << countPopulation(DMaskVal)) - 1;
1158 
1159     unsigned NewDMaskVal = 0;
1160     unsigned OrigLoadIdx = 0;
1161     for (unsigned SrcIdx = 0; SrcIdx < 4; ++SrcIdx) {
1162       const unsigned Bit = 1 << SrcIdx;
1163       if (!!(DMaskVal & Bit)) {
1164         if (!!DemandedElts[OrigLoadIdx])
1165           NewDMaskVal |= Bit;
1166         OrigLoadIdx++;
1167       }
1168     }
1169 
1170     if (DMaskVal != NewDMaskVal)
1171       Args[DMaskIdx] = ConstantInt::get(DMask->getType(), NewDMaskVal);
1172   }
1173 
1174   unsigned NewNumElts = DemandedElts.countPopulation();
1175   if (!NewNumElts)
1176     return UndefValue::get(II.getType());
1177 
1178   if (NewNumElts >= VWidth && DemandedElts.isMask()) {
1179     if (DMaskIdx >= 0)
1180       II.setArgOperand(DMaskIdx, Args[DMaskIdx]);
1181     return nullptr;
1182   }
1183 
1184   // Validate function argument and return types, extracting overloaded types
1185   // along the way.
1186   SmallVector<Type *, 6> OverloadTys;
1187   if (!Intrinsic::getIntrinsicSignature(II.getCalledFunction(), OverloadTys))
1188     return nullptr;
1189 
1190   Module *M = II.getParent()->getParent()->getParent();
1191   Type *EltTy = IIVTy->getElementType();
1192   Type *NewTy =
1193       (NewNumElts == 1) ? EltTy : FixedVectorType::get(EltTy, NewNumElts);
1194 
1195   OverloadTys[0] = NewTy;
1196   Function *NewIntrin =
1197       Intrinsic::getDeclaration(M, II.getIntrinsicID(), OverloadTys);
1198 
1199   CallInst *NewCall = IC.Builder.CreateCall(NewIntrin, Args);
1200   NewCall->takeName(&II);
1201   NewCall->copyMetadata(II);
1202 
1203   if (NewNumElts == 1) {
1204     return IC.Builder.CreateInsertElement(UndefValue::get(II.getType()),
1205                                           NewCall,
1206                                           DemandedElts.countTrailingZeros());
1207   }
1208 
1209   SmallVector<int, 8> EltMask;
1210   unsigned NewLoadIdx = 0;
1211   for (unsigned OrigLoadIdx = 0; OrigLoadIdx < VWidth; ++OrigLoadIdx) {
1212     if (!!DemandedElts[OrigLoadIdx])
1213       EltMask.push_back(NewLoadIdx++);
1214     else
1215       EltMask.push_back(NewNumElts);
1216   }
1217 
1218   Value *Shuffle = IC.Builder.CreateShuffleVector(NewCall, EltMask);
1219 
1220   return Shuffle;
1221 }
1222 
1223 Optional<Value *> GCNTTIImpl::simplifyDemandedVectorEltsIntrinsic(
1224     InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
1225     APInt &UndefElts2, APInt &UndefElts3,
1226     std::function<void(Instruction *, unsigned, APInt, APInt &)>
1227         SimplifyAndSetOp) const {
1228   switch (II.getIntrinsicID()) {
1229   case Intrinsic::amdgcn_buffer_load:
1230   case Intrinsic::amdgcn_buffer_load_format:
1231   case Intrinsic::amdgcn_raw_buffer_load:
1232   case Intrinsic::amdgcn_raw_buffer_load_format:
1233   case Intrinsic::amdgcn_raw_tbuffer_load:
1234   case Intrinsic::amdgcn_s_buffer_load:
1235   case Intrinsic::amdgcn_struct_buffer_load:
1236   case Intrinsic::amdgcn_struct_buffer_load_format:
1237   case Intrinsic::amdgcn_struct_tbuffer_load:
1238   case Intrinsic::amdgcn_tbuffer_load:
1239     return simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts);
1240   default: {
1241     if (getAMDGPUImageDMaskIntrinsic(II.getIntrinsicID())) {
1242       return simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts, 0);
1243     }
1244     break;
1245   }
1246   }
1247   return None;
1248 }
1249