1 //===-- X86InstCombineIntrinsic.cpp - X86 specific InstCombine pass -------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements a TargetTransformInfo analysis pass specific to the
10 /// X86 target machine. It uses the target's detailed information to provide
11 /// more precise answers to certain TTI queries, while letting the target
12 /// independent and default TTI implementations handle the rest.
13 ///
14 //===----------------------------------------------------------------------===//
15 
16 #include "X86TargetTransformInfo.h"
17 #include "llvm/IR/IntrinsicInst.h"
18 #include "llvm/IR/IntrinsicsX86.h"
19 #include "llvm/Support/KnownBits.h"
20 #include "llvm/Transforms/InstCombine/InstCombiner.h"
21 
22 using namespace llvm;
23 
24 #define DEBUG_TYPE "x86tti"
25 
26 /// Return a constant boolean vector that has true elements in all positions
27 /// where the input constant data vector has an element with the sign bit set.
getNegativeIsTrueBoolVec(Constant * V)28 static Constant *getNegativeIsTrueBoolVec(Constant *V) {
29   VectorType *IntTy = VectorType::getInteger(cast<VectorType>(V->getType()));
30   V = ConstantExpr::getBitCast(V, IntTy);
31   V = ConstantExpr::getICmp(CmpInst::ICMP_SGT, Constant::getNullValue(IntTy),
32                             V);
33   return V;
34 }
35 
36 /// Convert the x86 XMM integer vector mask to a vector of bools based on
37 /// each element's most significant bit (the sign bit).
getBoolVecFromMask(Value * Mask)38 static Value *getBoolVecFromMask(Value *Mask) {
39   // Fold Constant Mask.
40   if (auto *ConstantMask = dyn_cast<ConstantDataVector>(Mask))
41     return getNegativeIsTrueBoolVec(ConstantMask);
42 
43   // Mask was extended from a boolean vector.
44   Value *ExtMask;
45   if (PatternMatch::match(
46           Mask, PatternMatch::m_SExt(PatternMatch::m_Value(ExtMask))) &&
47       ExtMask->getType()->isIntOrIntVectorTy(1))
48     return ExtMask;
49 
50   return nullptr;
51 }
52 
53 // TODO: If the x86 backend knew how to convert a bool vector mask back to an
54 // XMM register mask efficiently, we could transform all x86 masked intrinsics
55 // to LLVM masked intrinsics and remove the x86 masked intrinsic defs.
simplifyX86MaskedLoad(IntrinsicInst & II,InstCombiner & IC)56 static Instruction *simplifyX86MaskedLoad(IntrinsicInst &II, InstCombiner &IC) {
57   Value *Ptr = II.getOperand(0);
58   Value *Mask = II.getOperand(1);
59   Constant *ZeroVec = Constant::getNullValue(II.getType());
60 
61   // Zero Mask - masked load instruction creates a zero vector.
62   if (isa<ConstantAggregateZero>(Mask))
63     return IC.replaceInstUsesWith(II, ZeroVec);
64 
65   // The mask is constant or extended from a bool vector. Convert this x86
66   // intrinsic to the LLVM intrinsic to allow target-independent optimizations.
67   if (Value *BoolMask = getBoolVecFromMask(Mask)) {
68     // First, cast the x86 intrinsic scalar pointer to a vector pointer to match
69     // the LLVM intrinsic definition for the pointer argument.
70     unsigned AddrSpace = cast<PointerType>(Ptr->getType())->getAddressSpace();
71     PointerType *VecPtrTy = PointerType::get(II.getType(), AddrSpace);
72     Value *PtrCast = IC.Builder.CreateBitCast(Ptr, VecPtrTy, "castvec");
73 
74     // The pass-through vector for an x86 masked load is a zero vector.
75     CallInst *NewMaskedLoad =
76         IC.Builder.CreateMaskedLoad(PtrCast, Align(1), BoolMask, ZeroVec);
77     return IC.replaceInstUsesWith(II, NewMaskedLoad);
78   }
79 
80   return nullptr;
81 }
82 
83 // TODO: If the x86 backend knew how to convert a bool vector mask back to an
84 // XMM register mask efficiently, we could transform all x86 masked intrinsics
85 // to LLVM masked intrinsics and remove the x86 masked intrinsic defs.
simplifyX86MaskedStore(IntrinsicInst & II,InstCombiner & IC)86 static bool simplifyX86MaskedStore(IntrinsicInst &II, InstCombiner &IC) {
87   Value *Ptr = II.getOperand(0);
88   Value *Mask = II.getOperand(1);
89   Value *Vec = II.getOperand(2);
90 
91   // Zero Mask - this masked store instruction does nothing.
92   if (isa<ConstantAggregateZero>(Mask)) {
93     IC.eraseInstFromFunction(II);
94     return true;
95   }
96 
97   // The SSE2 version is too weird (eg, unaligned but non-temporal) to do
98   // anything else at this level.
99   if (II.getIntrinsicID() == Intrinsic::x86_sse2_maskmov_dqu)
100     return false;
101 
102   // The mask is constant or extended from a bool vector. Convert this x86
103   // intrinsic to the LLVM intrinsic to allow target-independent optimizations.
104   if (Value *BoolMask = getBoolVecFromMask(Mask)) {
105     unsigned AddrSpace = cast<PointerType>(Ptr->getType())->getAddressSpace();
106     PointerType *VecPtrTy = PointerType::get(Vec->getType(), AddrSpace);
107     Value *PtrCast = IC.Builder.CreateBitCast(Ptr, VecPtrTy, "castvec");
108 
109     IC.Builder.CreateMaskedStore(Vec, PtrCast, Align(1), BoolMask);
110 
111     // 'Replace uses' doesn't work for stores. Erase the original masked store.
112     IC.eraseInstFromFunction(II);
113     return true;
114   }
115 
116   return false;
117 }
118 
simplifyX86immShift(const IntrinsicInst & II,InstCombiner::BuilderTy & Builder)119 static Value *simplifyX86immShift(const IntrinsicInst &II,
120                                   InstCombiner::BuilderTy &Builder) {
121   bool LogicalShift = false;
122   bool ShiftLeft = false;
123   bool IsImm = false;
124 
125   switch (II.getIntrinsicID()) {
126   default:
127     llvm_unreachable("Unexpected intrinsic!");
128   case Intrinsic::x86_sse2_psrai_d:
129   case Intrinsic::x86_sse2_psrai_w:
130   case Intrinsic::x86_avx2_psrai_d:
131   case Intrinsic::x86_avx2_psrai_w:
132   case Intrinsic::x86_avx512_psrai_q_128:
133   case Intrinsic::x86_avx512_psrai_q_256:
134   case Intrinsic::x86_avx512_psrai_d_512:
135   case Intrinsic::x86_avx512_psrai_q_512:
136   case Intrinsic::x86_avx512_psrai_w_512:
137     IsImm = true;
138     LLVM_FALLTHROUGH;
139   case Intrinsic::x86_sse2_psra_d:
140   case Intrinsic::x86_sse2_psra_w:
141   case Intrinsic::x86_avx2_psra_d:
142   case Intrinsic::x86_avx2_psra_w:
143   case Intrinsic::x86_avx512_psra_q_128:
144   case Intrinsic::x86_avx512_psra_q_256:
145   case Intrinsic::x86_avx512_psra_d_512:
146   case Intrinsic::x86_avx512_psra_q_512:
147   case Intrinsic::x86_avx512_psra_w_512:
148     LogicalShift = false;
149     ShiftLeft = false;
150     break;
151   case Intrinsic::x86_sse2_psrli_d:
152   case Intrinsic::x86_sse2_psrli_q:
153   case Intrinsic::x86_sse2_psrli_w:
154   case Intrinsic::x86_avx2_psrli_d:
155   case Intrinsic::x86_avx2_psrli_q:
156   case Intrinsic::x86_avx2_psrli_w:
157   case Intrinsic::x86_avx512_psrli_d_512:
158   case Intrinsic::x86_avx512_psrli_q_512:
159   case Intrinsic::x86_avx512_psrli_w_512:
160     IsImm = true;
161     LLVM_FALLTHROUGH;
162   case Intrinsic::x86_sse2_psrl_d:
163   case Intrinsic::x86_sse2_psrl_q:
164   case Intrinsic::x86_sse2_psrl_w:
165   case Intrinsic::x86_avx2_psrl_d:
166   case Intrinsic::x86_avx2_psrl_q:
167   case Intrinsic::x86_avx2_psrl_w:
168   case Intrinsic::x86_avx512_psrl_d_512:
169   case Intrinsic::x86_avx512_psrl_q_512:
170   case Intrinsic::x86_avx512_psrl_w_512:
171     LogicalShift = true;
172     ShiftLeft = false;
173     break;
174   case Intrinsic::x86_sse2_pslli_d:
175   case Intrinsic::x86_sse2_pslli_q:
176   case Intrinsic::x86_sse2_pslli_w:
177   case Intrinsic::x86_avx2_pslli_d:
178   case Intrinsic::x86_avx2_pslli_q:
179   case Intrinsic::x86_avx2_pslli_w:
180   case Intrinsic::x86_avx512_pslli_d_512:
181   case Intrinsic::x86_avx512_pslli_q_512:
182   case Intrinsic::x86_avx512_pslli_w_512:
183     IsImm = true;
184     LLVM_FALLTHROUGH;
185   case Intrinsic::x86_sse2_psll_d:
186   case Intrinsic::x86_sse2_psll_q:
187   case Intrinsic::x86_sse2_psll_w:
188   case Intrinsic::x86_avx2_psll_d:
189   case Intrinsic::x86_avx2_psll_q:
190   case Intrinsic::x86_avx2_psll_w:
191   case Intrinsic::x86_avx512_psll_d_512:
192   case Intrinsic::x86_avx512_psll_q_512:
193   case Intrinsic::x86_avx512_psll_w_512:
194     LogicalShift = true;
195     ShiftLeft = true;
196     break;
197   }
198   assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left");
199 
200   auto Vec = II.getArgOperand(0);
201   auto Amt = II.getArgOperand(1);
202   auto VT = cast<FixedVectorType>(Vec->getType());
203   auto SVT = VT->getElementType();
204   auto AmtVT = Amt->getType();
205   unsigned VWidth = VT->getNumElements();
206   unsigned BitWidth = SVT->getPrimitiveSizeInBits();
207 
208   // If the shift amount is guaranteed to be in-range we can replace it with a
209   // generic shift. If its guaranteed to be out of range, logical shifts combine
210   // to zero and arithmetic shifts are clamped to (BitWidth - 1).
211   if (IsImm) {
212     assert(AmtVT->isIntegerTy(32) && "Unexpected shift-by-immediate type");
213     KnownBits KnownAmtBits =
214         llvm::computeKnownBits(Amt, II.getModule()->getDataLayout());
215     if (KnownAmtBits.getMaxValue().ult(BitWidth)) {
216       Amt = Builder.CreateZExtOrTrunc(Amt, SVT);
217       Amt = Builder.CreateVectorSplat(VWidth, Amt);
218       return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt)
219                                         : Builder.CreateLShr(Vec, Amt))
220                            : Builder.CreateAShr(Vec, Amt));
221     }
222     if (KnownAmtBits.getMinValue().uge(BitWidth)) {
223       if (LogicalShift)
224         return ConstantAggregateZero::get(VT);
225       Amt = ConstantInt::get(SVT, BitWidth - 1);
226       return Builder.CreateAShr(Vec, Builder.CreateVectorSplat(VWidth, Amt));
227     }
228   } else {
229     // Ensure the first element has an in-range value and the rest of the
230     // elements in the bottom 64 bits are zero.
231     assert(AmtVT->isVectorTy() && AmtVT->getPrimitiveSizeInBits() == 128 &&
232            cast<VectorType>(AmtVT)->getElementType() == SVT &&
233            "Unexpected shift-by-scalar type");
234     unsigned NumAmtElts = cast<FixedVectorType>(AmtVT)->getNumElements();
235     APInt DemandedLower = APInt::getOneBitSet(NumAmtElts, 0);
236     APInt DemandedUpper = APInt::getBitsSet(NumAmtElts, 1, NumAmtElts / 2);
237     KnownBits KnownLowerBits = llvm::computeKnownBits(
238         Amt, DemandedLower, II.getModule()->getDataLayout());
239     KnownBits KnownUpperBits = llvm::computeKnownBits(
240         Amt, DemandedUpper, II.getModule()->getDataLayout());
241     if (KnownLowerBits.getMaxValue().ult(BitWidth) &&
242         (DemandedUpper.isNullValue() || KnownUpperBits.isZero())) {
243       SmallVector<int, 16> ZeroSplat(VWidth, 0);
244       Amt = Builder.CreateShuffleVector(Amt, Amt, ZeroSplat);
245       return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt)
246                                         : Builder.CreateLShr(Vec, Amt))
247                            : Builder.CreateAShr(Vec, Amt));
248     }
249   }
250 
251   // Simplify if count is constant vector.
252   auto CDV = dyn_cast<ConstantDataVector>(Amt);
253   if (!CDV)
254     return nullptr;
255 
256   // SSE2/AVX2 uses all the first 64-bits of the 128-bit vector
257   // operand to compute the shift amount.
258   assert(AmtVT->isVectorTy() && AmtVT->getPrimitiveSizeInBits() == 128 &&
259          cast<VectorType>(AmtVT)->getElementType() == SVT &&
260          "Unexpected shift-by-scalar type");
261 
262   // Concatenate the sub-elements to create the 64-bit value.
263   APInt Count(64, 0);
264   for (unsigned i = 0, NumSubElts = 64 / BitWidth; i != NumSubElts; ++i) {
265     unsigned SubEltIdx = (NumSubElts - 1) - i;
266     auto SubElt = cast<ConstantInt>(CDV->getElementAsConstant(SubEltIdx));
267     Count <<= BitWidth;
268     Count |= SubElt->getValue().zextOrTrunc(64);
269   }
270 
271   // If shift-by-zero then just return the original value.
272   if (Count.isNullValue())
273     return Vec;
274 
275   // Handle cases when Shift >= BitWidth.
276   if (Count.uge(BitWidth)) {
277     // If LogicalShift - just return zero.
278     if (LogicalShift)
279       return ConstantAggregateZero::get(VT);
280 
281     // If ArithmeticShift - clamp Shift to (BitWidth - 1).
282     Count = APInt(64, BitWidth - 1);
283   }
284 
285   // Get a constant vector of the same type as the first operand.
286   auto ShiftAmt = ConstantInt::get(SVT, Count.zextOrTrunc(BitWidth));
287   auto ShiftVec = Builder.CreateVectorSplat(VWidth, ShiftAmt);
288 
289   if (ShiftLeft)
290     return Builder.CreateShl(Vec, ShiftVec);
291 
292   if (LogicalShift)
293     return Builder.CreateLShr(Vec, ShiftVec);
294 
295   return Builder.CreateAShr(Vec, ShiftVec);
296 }
297 
298 // Attempt to simplify AVX2 per-element shift intrinsics to a generic IR shift.
299 // Unlike the generic IR shifts, the intrinsics have defined behaviour for out
300 // of range shift amounts (logical - set to zero, arithmetic - splat sign bit).
simplifyX86varShift(const IntrinsicInst & II,InstCombiner::BuilderTy & Builder)301 static Value *simplifyX86varShift(const IntrinsicInst &II,
302                                   InstCombiner::BuilderTy &Builder) {
303   bool LogicalShift = false;
304   bool ShiftLeft = false;
305 
306   switch (II.getIntrinsicID()) {
307   default:
308     llvm_unreachable("Unexpected intrinsic!");
309   case Intrinsic::x86_avx2_psrav_d:
310   case Intrinsic::x86_avx2_psrav_d_256:
311   case Intrinsic::x86_avx512_psrav_q_128:
312   case Intrinsic::x86_avx512_psrav_q_256:
313   case Intrinsic::x86_avx512_psrav_d_512:
314   case Intrinsic::x86_avx512_psrav_q_512:
315   case Intrinsic::x86_avx512_psrav_w_128:
316   case Intrinsic::x86_avx512_psrav_w_256:
317   case Intrinsic::x86_avx512_psrav_w_512:
318     LogicalShift = false;
319     ShiftLeft = false;
320     break;
321   case Intrinsic::x86_avx2_psrlv_d:
322   case Intrinsic::x86_avx2_psrlv_d_256:
323   case Intrinsic::x86_avx2_psrlv_q:
324   case Intrinsic::x86_avx2_psrlv_q_256:
325   case Intrinsic::x86_avx512_psrlv_d_512:
326   case Intrinsic::x86_avx512_psrlv_q_512:
327   case Intrinsic::x86_avx512_psrlv_w_128:
328   case Intrinsic::x86_avx512_psrlv_w_256:
329   case Intrinsic::x86_avx512_psrlv_w_512:
330     LogicalShift = true;
331     ShiftLeft = false;
332     break;
333   case Intrinsic::x86_avx2_psllv_d:
334   case Intrinsic::x86_avx2_psllv_d_256:
335   case Intrinsic::x86_avx2_psllv_q:
336   case Intrinsic::x86_avx2_psllv_q_256:
337   case Intrinsic::x86_avx512_psllv_d_512:
338   case Intrinsic::x86_avx512_psllv_q_512:
339   case Intrinsic::x86_avx512_psllv_w_128:
340   case Intrinsic::x86_avx512_psllv_w_256:
341   case Intrinsic::x86_avx512_psllv_w_512:
342     LogicalShift = true;
343     ShiftLeft = true;
344     break;
345   }
346   assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left");
347 
348   auto Vec = II.getArgOperand(0);
349   auto Amt = II.getArgOperand(1);
350   auto VT = cast<FixedVectorType>(II.getType());
351   auto SVT = VT->getElementType();
352   int NumElts = VT->getNumElements();
353   int BitWidth = SVT->getIntegerBitWidth();
354 
355   // If the shift amount is guaranteed to be in-range we can replace it with a
356   // generic shift.
357   APInt UpperBits =
358       APInt::getHighBitsSet(BitWidth, BitWidth - Log2_32(BitWidth));
359   if (llvm::MaskedValueIsZero(Amt, UpperBits,
360                               II.getModule()->getDataLayout())) {
361     return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt)
362                                       : Builder.CreateLShr(Vec, Amt))
363                          : Builder.CreateAShr(Vec, Amt));
364   }
365 
366   // Simplify if all shift amounts are constant/undef.
367   auto *CShift = dyn_cast<Constant>(Amt);
368   if (!CShift)
369     return nullptr;
370 
371   // Collect each element's shift amount.
372   // We also collect special cases: UNDEF = -1, OUT-OF-RANGE = BitWidth.
373   bool AnyOutOfRange = false;
374   SmallVector<int, 8> ShiftAmts;
375   for (int I = 0; I < NumElts; ++I) {
376     auto *CElt = CShift->getAggregateElement(I);
377     if (isa_and_nonnull<UndefValue>(CElt)) {
378       ShiftAmts.push_back(-1);
379       continue;
380     }
381 
382     auto *COp = dyn_cast_or_null<ConstantInt>(CElt);
383     if (!COp)
384       return nullptr;
385 
386     // Handle out of range shifts.
387     // If LogicalShift - set to BitWidth (special case).
388     // If ArithmeticShift - set to (BitWidth - 1) (sign splat).
389     APInt ShiftVal = COp->getValue();
390     if (ShiftVal.uge(BitWidth)) {
391       AnyOutOfRange = LogicalShift;
392       ShiftAmts.push_back(LogicalShift ? BitWidth : BitWidth - 1);
393       continue;
394     }
395 
396     ShiftAmts.push_back((int)ShiftVal.getZExtValue());
397   }
398 
399   // If all elements out of range or UNDEF, return vector of zeros/undefs.
400   // ArithmeticShift should only hit this if they are all UNDEF.
401   auto OutOfRange = [&](int Idx) { return (Idx < 0) || (BitWidth <= Idx); };
402   if (llvm::all_of(ShiftAmts, OutOfRange)) {
403     SmallVector<Constant *, 8> ConstantVec;
404     for (int Idx : ShiftAmts) {
405       if (Idx < 0) {
406         ConstantVec.push_back(UndefValue::get(SVT));
407       } else {
408         assert(LogicalShift && "Logical shift expected");
409         ConstantVec.push_back(ConstantInt::getNullValue(SVT));
410       }
411     }
412     return ConstantVector::get(ConstantVec);
413   }
414 
415   // We can't handle only some out of range values with generic logical shifts.
416   if (AnyOutOfRange)
417     return nullptr;
418 
419   // Build the shift amount constant vector.
420   SmallVector<Constant *, 8> ShiftVecAmts;
421   for (int Idx : ShiftAmts) {
422     if (Idx < 0)
423       ShiftVecAmts.push_back(UndefValue::get(SVT));
424     else
425       ShiftVecAmts.push_back(ConstantInt::get(SVT, Idx));
426   }
427   auto ShiftVec = ConstantVector::get(ShiftVecAmts);
428 
429   if (ShiftLeft)
430     return Builder.CreateShl(Vec, ShiftVec);
431 
432   if (LogicalShift)
433     return Builder.CreateLShr(Vec, ShiftVec);
434 
435   return Builder.CreateAShr(Vec, ShiftVec);
436 }
437 
simplifyX86pack(IntrinsicInst & II,InstCombiner::BuilderTy & Builder,bool IsSigned)438 static Value *simplifyX86pack(IntrinsicInst &II,
439                               InstCombiner::BuilderTy &Builder, bool IsSigned) {
440   Value *Arg0 = II.getArgOperand(0);
441   Value *Arg1 = II.getArgOperand(1);
442   Type *ResTy = II.getType();
443 
444   // Fast all undef handling.
445   if (isa<UndefValue>(Arg0) && isa<UndefValue>(Arg1))
446     return UndefValue::get(ResTy);
447 
448   auto *ArgTy = cast<FixedVectorType>(Arg0->getType());
449   unsigned NumLanes = ResTy->getPrimitiveSizeInBits() / 128;
450   unsigned NumSrcElts = ArgTy->getNumElements();
451   assert(cast<FixedVectorType>(ResTy)->getNumElements() == (2 * NumSrcElts) &&
452          "Unexpected packing types");
453 
454   unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes;
455   unsigned DstScalarSizeInBits = ResTy->getScalarSizeInBits();
456   unsigned SrcScalarSizeInBits = ArgTy->getScalarSizeInBits();
457   assert(SrcScalarSizeInBits == (2 * DstScalarSizeInBits) &&
458          "Unexpected packing types");
459 
460   // Constant folding.
461   if (!isa<Constant>(Arg0) || !isa<Constant>(Arg1))
462     return nullptr;
463 
464   // Clamp Values - signed/unsigned both use signed clamp values, but they
465   // differ on the min/max values.
466   APInt MinValue, MaxValue;
467   if (IsSigned) {
468     // PACKSS: Truncate signed value with signed saturation.
469     // Source values less than dst minint are saturated to minint.
470     // Source values greater than dst maxint are saturated to maxint.
471     MinValue =
472         APInt::getSignedMinValue(DstScalarSizeInBits).sext(SrcScalarSizeInBits);
473     MaxValue =
474         APInt::getSignedMaxValue(DstScalarSizeInBits).sext(SrcScalarSizeInBits);
475   } else {
476     // PACKUS: Truncate signed value with unsigned saturation.
477     // Source values less than zero are saturated to zero.
478     // Source values greater than dst maxuint are saturated to maxuint.
479     MinValue = APInt::getNullValue(SrcScalarSizeInBits);
480     MaxValue = APInt::getLowBitsSet(SrcScalarSizeInBits, DstScalarSizeInBits);
481   }
482 
483   auto *MinC = Constant::getIntegerValue(ArgTy, MinValue);
484   auto *MaxC = Constant::getIntegerValue(ArgTy, MaxValue);
485   Arg0 = Builder.CreateSelect(Builder.CreateICmpSLT(Arg0, MinC), MinC, Arg0);
486   Arg1 = Builder.CreateSelect(Builder.CreateICmpSLT(Arg1, MinC), MinC, Arg1);
487   Arg0 = Builder.CreateSelect(Builder.CreateICmpSGT(Arg0, MaxC), MaxC, Arg0);
488   Arg1 = Builder.CreateSelect(Builder.CreateICmpSGT(Arg1, MaxC), MaxC, Arg1);
489 
490   // Shuffle clamped args together at the lane level.
491   SmallVector<int, 32> PackMask;
492   for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
493     for (unsigned Elt = 0; Elt != NumSrcEltsPerLane; ++Elt)
494       PackMask.push_back(Elt + (Lane * NumSrcEltsPerLane));
495     for (unsigned Elt = 0; Elt != NumSrcEltsPerLane; ++Elt)
496       PackMask.push_back(Elt + (Lane * NumSrcEltsPerLane) + NumSrcElts);
497   }
498   auto *Shuffle = Builder.CreateShuffleVector(Arg0, Arg1, PackMask);
499 
500   // Truncate to dst size.
501   return Builder.CreateTrunc(Shuffle, ResTy);
502 }
503 
simplifyX86movmsk(const IntrinsicInst & II,InstCombiner::BuilderTy & Builder)504 static Value *simplifyX86movmsk(const IntrinsicInst &II,
505                                 InstCombiner::BuilderTy &Builder) {
506   Value *Arg = II.getArgOperand(0);
507   Type *ResTy = II.getType();
508 
509   // movmsk(undef) -> zero as we must ensure the upper bits are zero.
510   if (isa<UndefValue>(Arg))
511     return Constant::getNullValue(ResTy);
512 
513   auto *ArgTy = dyn_cast<FixedVectorType>(Arg->getType());
514   // We can't easily peek through x86_mmx types.
515   if (!ArgTy)
516     return nullptr;
517 
518   // Expand MOVMSK to compare/bitcast/zext:
519   // e.g. PMOVMSKB(v16i8 x):
520   // %cmp = icmp slt <16 x i8> %x, zeroinitializer
521   // %int = bitcast <16 x i1> %cmp to i16
522   // %res = zext i16 %int to i32
523   unsigned NumElts = ArgTy->getNumElements();
524   Type *IntegerVecTy = VectorType::getInteger(ArgTy);
525   Type *IntegerTy = Builder.getIntNTy(NumElts);
526 
527   Value *Res = Builder.CreateBitCast(Arg, IntegerVecTy);
528   Res = Builder.CreateICmpSLT(Res, Constant::getNullValue(IntegerVecTy));
529   Res = Builder.CreateBitCast(Res, IntegerTy);
530   Res = Builder.CreateZExtOrTrunc(Res, ResTy);
531   return Res;
532 }
533 
simplifyX86addcarry(const IntrinsicInst & II,InstCombiner::BuilderTy & Builder)534 static Value *simplifyX86addcarry(const IntrinsicInst &II,
535                                   InstCombiner::BuilderTy &Builder) {
536   Value *CarryIn = II.getArgOperand(0);
537   Value *Op1 = II.getArgOperand(1);
538   Value *Op2 = II.getArgOperand(2);
539   Type *RetTy = II.getType();
540   Type *OpTy = Op1->getType();
541   assert(RetTy->getStructElementType(0)->isIntegerTy(8) &&
542          RetTy->getStructElementType(1) == OpTy && OpTy == Op2->getType() &&
543          "Unexpected types for x86 addcarry");
544 
545   // If carry-in is zero, this is just an unsigned add with overflow.
546   if (match(CarryIn, PatternMatch::m_ZeroInt())) {
547     Value *UAdd = Builder.CreateIntrinsic(Intrinsic::uadd_with_overflow, OpTy,
548                                           {Op1, Op2});
549     // The types have to be adjusted to match the x86 call types.
550     Value *UAddResult = Builder.CreateExtractValue(UAdd, 0);
551     Value *UAddOV = Builder.CreateZExt(Builder.CreateExtractValue(UAdd, 1),
552                                        Builder.getInt8Ty());
553     Value *Res = UndefValue::get(RetTy);
554     Res = Builder.CreateInsertValue(Res, UAddOV, 0);
555     return Builder.CreateInsertValue(Res, UAddResult, 1);
556   }
557 
558   return nullptr;
559 }
560 
simplifyX86insertps(const IntrinsicInst & II,InstCombiner::BuilderTy & Builder)561 static Value *simplifyX86insertps(const IntrinsicInst &II,
562                                   InstCombiner::BuilderTy &Builder) {
563   auto *CInt = dyn_cast<ConstantInt>(II.getArgOperand(2));
564   if (!CInt)
565     return nullptr;
566 
567   auto *VecTy = cast<FixedVectorType>(II.getType());
568   assert(VecTy->getNumElements() == 4 && "insertps with wrong vector type");
569 
570   // The immediate permute control byte looks like this:
571   //    [3:0] - zero mask for each 32-bit lane
572   //    [5:4] - select one 32-bit destination lane
573   //    [7:6] - select one 32-bit source lane
574 
575   uint8_t Imm = CInt->getZExtValue();
576   uint8_t ZMask = Imm & 0xf;
577   uint8_t DestLane = (Imm >> 4) & 0x3;
578   uint8_t SourceLane = (Imm >> 6) & 0x3;
579 
580   ConstantAggregateZero *ZeroVector = ConstantAggregateZero::get(VecTy);
581 
582   // If all zero mask bits are set, this was just a weird way to
583   // generate a zero vector.
584   if (ZMask == 0xf)
585     return ZeroVector;
586 
587   // Initialize by passing all of the first source bits through.
588   int ShuffleMask[4] = {0, 1, 2, 3};
589 
590   // We may replace the second operand with the zero vector.
591   Value *V1 = II.getArgOperand(1);
592 
593   if (ZMask) {
594     // If the zero mask is being used with a single input or the zero mask
595     // overrides the destination lane, this is a shuffle with the zero vector.
596     if ((II.getArgOperand(0) == II.getArgOperand(1)) ||
597         (ZMask & (1 << DestLane))) {
598       V1 = ZeroVector;
599       // We may still move 32-bits of the first source vector from one lane
600       // to another.
601       ShuffleMask[DestLane] = SourceLane;
602       // The zero mask may override the previous insert operation.
603       for (unsigned i = 0; i < 4; ++i)
604         if ((ZMask >> i) & 0x1)
605           ShuffleMask[i] = i + 4;
606     } else {
607       // TODO: Model this case as 2 shuffles or a 'logical and' plus shuffle?
608       return nullptr;
609     }
610   } else {
611     // Replace the selected destination lane with the selected source lane.
612     ShuffleMask[DestLane] = SourceLane + 4;
613   }
614 
615   return Builder.CreateShuffleVector(II.getArgOperand(0), V1, ShuffleMask);
616 }
617 
618 /// Attempt to simplify SSE4A EXTRQ/EXTRQI instructions using constant folding
619 /// or conversion to a shuffle vector.
simplifyX86extrq(IntrinsicInst & II,Value * Op0,ConstantInt * CILength,ConstantInt * CIIndex,InstCombiner::BuilderTy & Builder)620 static Value *simplifyX86extrq(IntrinsicInst &II, Value *Op0,
621                                ConstantInt *CILength, ConstantInt *CIIndex,
622                                InstCombiner::BuilderTy &Builder) {
623   auto LowConstantHighUndef = [&](uint64_t Val) {
624     Type *IntTy64 = Type::getInt64Ty(II.getContext());
625     Constant *Args[] = {ConstantInt::get(IntTy64, Val),
626                         UndefValue::get(IntTy64)};
627     return ConstantVector::get(Args);
628   };
629 
630   // See if we're dealing with constant values.
631   Constant *C0 = dyn_cast<Constant>(Op0);
632   ConstantInt *CI0 =
633       C0 ? dyn_cast_or_null<ConstantInt>(C0->getAggregateElement((unsigned)0))
634          : nullptr;
635 
636   // Attempt to constant fold.
637   if (CILength && CIIndex) {
638     // From AMD documentation: "The bit index and field length are each six
639     // bits in length other bits of the field are ignored."
640     APInt APIndex = CIIndex->getValue().zextOrTrunc(6);
641     APInt APLength = CILength->getValue().zextOrTrunc(6);
642 
643     unsigned Index = APIndex.getZExtValue();
644 
645     // From AMD documentation: "a value of zero in the field length is
646     // defined as length of 64".
647     unsigned Length = APLength == 0 ? 64 : APLength.getZExtValue();
648 
649     // From AMD documentation: "If the sum of the bit index + length field
650     // is greater than 64, the results are undefined".
651     unsigned End = Index + Length;
652 
653     // Note that both field index and field length are 8-bit quantities.
654     // Since variables 'Index' and 'Length' are unsigned values
655     // obtained from zero-extending field index and field length
656     // respectively, their sum should never wrap around.
657     if (End > 64)
658       return UndefValue::get(II.getType());
659 
660     // If we are inserting whole bytes, we can convert this to a shuffle.
661     // Lowering can recognize EXTRQI shuffle masks.
662     if ((Length % 8) == 0 && (Index % 8) == 0) {
663       // Convert bit indices to byte indices.
664       Length /= 8;
665       Index /= 8;
666 
667       Type *IntTy8 = Type::getInt8Ty(II.getContext());
668       auto *ShufTy = FixedVectorType::get(IntTy8, 16);
669 
670       SmallVector<int, 16> ShuffleMask;
671       for (int i = 0; i != (int)Length; ++i)
672         ShuffleMask.push_back(i + Index);
673       for (int i = Length; i != 8; ++i)
674         ShuffleMask.push_back(i + 16);
675       for (int i = 8; i != 16; ++i)
676         ShuffleMask.push_back(-1);
677 
678       Value *SV = Builder.CreateShuffleVector(
679           Builder.CreateBitCast(Op0, ShufTy),
680           ConstantAggregateZero::get(ShufTy), ShuffleMask);
681       return Builder.CreateBitCast(SV, II.getType());
682     }
683 
684     // Constant Fold - shift Index'th bit to lowest position and mask off
685     // Length bits.
686     if (CI0) {
687       APInt Elt = CI0->getValue();
688       Elt.lshrInPlace(Index);
689       Elt = Elt.zextOrTrunc(Length);
690       return LowConstantHighUndef(Elt.getZExtValue());
691     }
692 
693     // If we were an EXTRQ call, we'll save registers if we convert to EXTRQI.
694     if (II.getIntrinsicID() == Intrinsic::x86_sse4a_extrq) {
695       Value *Args[] = {Op0, CILength, CIIndex};
696       Module *M = II.getModule();
697       Function *F = Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_extrqi);
698       return Builder.CreateCall(F, Args);
699     }
700   }
701 
702   // Constant Fold - extraction from zero is always {zero, undef}.
703   if (CI0 && CI0->isZero())
704     return LowConstantHighUndef(0);
705 
706   return nullptr;
707 }
708 
709 /// Attempt to simplify SSE4A INSERTQ/INSERTQI instructions using constant
710 /// folding or conversion to a shuffle vector.
simplifyX86insertq(IntrinsicInst & II,Value * Op0,Value * Op1,APInt APLength,APInt APIndex,InstCombiner::BuilderTy & Builder)711 static Value *simplifyX86insertq(IntrinsicInst &II, Value *Op0, Value *Op1,
712                                  APInt APLength, APInt APIndex,
713                                  InstCombiner::BuilderTy &Builder) {
714   // From AMD documentation: "The bit index and field length are each six bits
715   // in length other bits of the field are ignored."
716   APIndex = APIndex.zextOrTrunc(6);
717   APLength = APLength.zextOrTrunc(6);
718 
719   // Attempt to constant fold.
720   unsigned Index = APIndex.getZExtValue();
721 
722   // From AMD documentation: "a value of zero in the field length is
723   // defined as length of 64".
724   unsigned Length = APLength == 0 ? 64 : APLength.getZExtValue();
725 
726   // From AMD documentation: "If the sum of the bit index + length field
727   // is greater than 64, the results are undefined".
728   unsigned End = Index + Length;
729 
730   // Note that both field index and field length are 8-bit quantities.
731   // Since variables 'Index' and 'Length' are unsigned values
732   // obtained from zero-extending field index and field length
733   // respectively, their sum should never wrap around.
734   if (End > 64)
735     return UndefValue::get(II.getType());
736 
737   // If we are inserting whole bytes, we can convert this to a shuffle.
738   // Lowering can recognize INSERTQI shuffle masks.
739   if ((Length % 8) == 0 && (Index % 8) == 0) {
740     // Convert bit indices to byte indices.
741     Length /= 8;
742     Index /= 8;
743 
744     Type *IntTy8 = Type::getInt8Ty(II.getContext());
745     auto *ShufTy = FixedVectorType::get(IntTy8, 16);
746 
747     SmallVector<int, 16> ShuffleMask;
748     for (int i = 0; i != (int)Index; ++i)
749       ShuffleMask.push_back(i);
750     for (int i = 0; i != (int)Length; ++i)
751       ShuffleMask.push_back(i + 16);
752     for (int i = Index + Length; i != 8; ++i)
753       ShuffleMask.push_back(i);
754     for (int i = 8; i != 16; ++i)
755       ShuffleMask.push_back(-1);
756 
757     Value *SV = Builder.CreateShuffleVector(Builder.CreateBitCast(Op0, ShufTy),
758                                             Builder.CreateBitCast(Op1, ShufTy),
759                                             ShuffleMask);
760     return Builder.CreateBitCast(SV, II.getType());
761   }
762 
763   // See if we're dealing with constant values.
764   Constant *C0 = dyn_cast<Constant>(Op0);
765   Constant *C1 = dyn_cast<Constant>(Op1);
766   ConstantInt *CI00 =
767       C0 ? dyn_cast_or_null<ConstantInt>(C0->getAggregateElement((unsigned)0))
768          : nullptr;
769   ConstantInt *CI10 =
770       C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)0))
771          : nullptr;
772 
773   // Constant Fold - insert bottom Length bits starting at the Index'th bit.
774   if (CI00 && CI10) {
775     APInt V00 = CI00->getValue();
776     APInt V10 = CI10->getValue();
777     APInt Mask = APInt::getLowBitsSet(64, Length).shl(Index);
778     V00 = V00 & ~Mask;
779     V10 = V10.zextOrTrunc(Length).zextOrTrunc(64).shl(Index);
780     APInt Val = V00 | V10;
781     Type *IntTy64 = Type::getInt64Ty(II.getContext());
782     Constant *Args[] = {ConstantInt::get(IntTy64, Val.getZExtValue()),
783                         UndefValue::get(IntTy64)};
784     return ConstantVector::get(Args);
785   }
786 
787   // If we were an INSERTQ call, we'll save demanded elements if we convert to
788   // INSERTQI.
789   if (II.getIntrinsicID() == Intrinsic::x86_sse4a_insertq) {
790     Type *IntTy8 = Type::getInt8Ty(II.getContext());
791     Constant *CILength = ConstantInt::get(IntTy8, Length, false);
792     Constant *CIIndex = ConstantInt::get(IntTy8, Index, false);
793 
794     Value *Args[] = {Op0, Op1, CILength, CIIndex};
795     Module *M = II.getModule();
796     Function *F = Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_insertqi);
797     return Builder.CreateCall(F, Args);
798   }
799 
800   return nullptr;
801 }
802 
803 /// Attempt to convert pshufb* to shufflevector if the mask is constant.
simplifyX86pshufb(const IntrinsicInst & II,InstCombiner::BuilderTy & Builder)804 static Value *simplifyX86pshufb(const IntrinsicInst &II,
805                                 InstCombiner::BuilderTy &Builder) {
806   Constant *V = dyn_cast<Constant>(II.getArgOperand(1));
807   if (!V)
808     return nullptr;
809 
810   auto *VecTy = cast<FixedVectorType>(II.getType());
811   unsigned NumElts = VecTy->getNumElements();
812   assert((NumElts == 16 || NumElts == 32 || NumElts == 64) &&
813          "Unexpected number of elements in shuffle mask!");
814 
815   // Construct a shuffle mask from constant integers or UNDEFs.
816   int Indexes[64];
817 
818   // Each byte in the shuffle control mask forms an index to permute the
819   // corresponding byte in the destination operand.
820   for (unsigned I = 0; I < NumElts; ++I) {
821     Constant *COp = V->getAggregateElement(I);
822     if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
823       return nullptr;
824 
825     if (isa<UndefValue>(COp)) {
826       Indexes[I] = -1;
827       continue;
828     }
829 
830     int8_t Index = cast<ConstantInt>(COp)->getValue().getZExtValue();
831 
832     // If the most significant bit (bit[7]) of each byte of the shuffle
833     // control mask is set, then zero is written in the result byte.
834     // The zero vector is in the right-hand side of the resulting
835     // shufflevector.
836 
837     // The value of each index for the high 128-bit lane is the least
838     // significant 4 bits of the respective shuffle control byte.
839     Index = ((Index < 0) ? NumElts : Index & 0x0F) + (I & 0xF0);
840     Indexes[I] = Index;
841   }
842 
843   auto V1 = II.getArgOperand(0);
844   auto V2 = Constant::getNullValue(VecTy);
845   return Builder.CreateShuffleVector(V1, V2, makeArrayRef(Indexes, NumElts));
846 }
847 
848 /// Attempt to convert vpermilvar* to shufflevector if the mask is constant.
simplifyX86vpermilvar(const IntrinsicInst & II,InstCombiner::BuilderTy & Builder)849 static Value *simplifyX86vpermilvar(const IntrinsicInst &II,
850                                     InstCombiner::BuilderTy &Builder) {
851   Constant *V = dyn_cast<Constant>(II.getArgOperand(1));
852   if (!V)
853     return nullptr;
854 
855   auto *VecTy = cast<FixedVectorType>(II.getType());
856   unsigned NumElts = VecTy->getNumElements();
857   bool IsPD = VecTy->getScalarType()->isDoubleTy();
858   unsigned NumLaneElts = IsPD ? 2 : 4;
859   assert(NumElts == 16 || NumElts == 8 || NumElts == 4 || NumElts == 2);
860 
861   // Construct a shuffle mask from constant integers or UNDEFs.
862   int Indexes[16];
863 
864   // The intrinsics only read one or two bits, clear the rest.
865   for (unsigned I = 0; I < NumElts; ++I) {
866     Constant *COp = V->getAggregateElement(I);
867     if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
868       return nullptr;
869 
870     if (isa<UndefValue>(COp)) {
871       Indexes[I] = -1;
872       continue;
873     }
874 
875     APInt Index = cast<ConstantInt>(COp)->getValue();
876     Index = Index.zextOrTrunc(32).getLoBits(2);
877 
878     // The PD variants uses bit 1 to select per-lane element index, so
879     // shift down to convert to generic shuffle mask index.
880     if (IsPD)
881       Index.lshrInPlace(1);
882 
883     // The _256 variants are a bit trickier since the mask bits always index
884     // into the corresponding 128 half. In order to convert to a generic
885     // shuffle, we have to make that explicit.
886     Index += APInt(32, (I / NumLaneElts) * NumLaneElts);
887 
888     Indexes[I] = Index.getZExtValue();
889   }
890 
891   auto V1 = II.getArgOperand(0);
892   auto V2 = UndefValue::get(V1->getType());
893   return Builder.CreateShuffleVector(V1, V2, makeArrayRef(Indexes, NumElts));
894 }
895 
896 /// Attempt to convert vpermd/vpermps to shufflevector if the mask is constant.
simplifyX86vpermv(const IntrinsicInst & II,InstCombiner::BuilderTy & Builder)897 static Value *simplifyX86vpermv(const IntrinsicInst &II,
898                                 InstCombiner::BuilderTy &Builder) {
899   auto *V = dyn_cast<Constant>(II.getArgOperand(1));
900   if (!V)
901     return nullptr;
902 
903   auto *VecTy = cast<FixedVectorType>(II.getType());
904   unsigned Size = VecTy->getNumElements();
905   assert((Size == 4 || Size == 8 || Size == 16 || Size == 32 || Size == 64) &&
906          "Unexpected shuffle mask size");
907 
908   // Construct a shuffle mask from constant integers or UNDEFs.
909   int Indexes[64];
910 
911   for (unsigned I = 0; I < Size; ++I) {
912     Constant *COp = V->getAggregateElement(I);
913     if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
914       return nullptr;
915 
916     if (isa<UndefValue>(COp)) {
917       Indexes[I] = -1;
918       continue;
919     }
920 
921     uint32_t Index = cast<ConstantInt>(COp)->getZExtValue();
922     Index &= Size - 1;
923     Indexes[I] = Index;
924   }
925 
926   auto V1 = II.getArgOperand(0);
927   auto V2 = UndefValue::get(VecTy);
928   return Builder.CreateShuffleVector(V1, V2, makeArrayRef(Indexes, Size));
929 }
930 
931 Optional<Instruction *>
instCombineIntrinsic(InstCombiner & IC,IntrinsicInst & II) const932 X86TTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
933   auto SimplifyDemandedVectorEltsLow = [&IC](Value *Op, unsigned Width,
934                                              unsigned DemandedWidth) {
935     APInt UndefElts(Width, 0);
936     APInt DemandedElts = APInt::getLowBitsSet(Width, DemandedWidth);
937     return IC.SimplifyDemandedVectorElts(Op, DemandedElts, UndefElts);
938   };
939 
940   Intrinsic::ID IID = II.getIntrinsicID();
941   switch (IID) {
942   case Intrinsic::x86_bmi_bextr_32:
943   case Intrinsic::x86_bmi_bextr_64:
944   case Intrinsic::x86_tbm_bextri_u32:
945   case Intrinsic::x86_tbm_bextri_u64:
946     // If the RHS is a constant we can try some simplifications.
947     if (auto *C = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
948       uint64_t Shift = C->getZExtValue();
949       uint64_t Length = (Shift >> 8) & 0xff;
950       Shift &= 0xff;
951       unsigned BitWidth = II.getType()->getIntegerBitWidth();
952       // If the length is 0 or the shift is out of range, replace with zero.
953       if (Length == 0 || Shift >= BitWidth) {
954         return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
955       }
956       // If the LHS is also a constant, we can completely constant fold this.
957       if (auto *InC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
958         uint64_t Result = InC->getZExtValue() >> Shift;
959         if (Length > BitWidth)
960           Length = BitWidth;
961         Result &= maskTrailingOnes<uint64_t>(Length);
962         return IC.replaceInstUsesWith(II,
963                                       ConstantInt::get(II.getType(), Result));
964       }
965       // TODO should we turn this into 'and' if shift is 0? Or 'shl' if we
966       // are only masking bits that a shift already cleared?
967     }
968     break;
969 
970   case Intrinsic::x86_bmi_bzhi_32:
971   case Intrinsic::x86_bmi_bzhi_64:
972     // If the RHS is a constant we can try some simplifications.
973     if (auto *C = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
974       uint64_t Index = C->getZExtValue() & 0xff;
975       unsigned BitWidth = II.getType()->getIntegerBitWidth();
976       if (Index >= BitWidth) {
977         return IC.replaceInstUsesWith(II, II.getArgOperand(0));
978       }
979       if (Index == 0) {
980         return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
981       }
982       // If the LHS is also a constant, we can completely constant fold this.
983       if (auto *InC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
984         uint64_t Result = InC->getZExtValue();
985         Result &= maskTrailingOnes<uint64_t>(Index);
986         return IC.replaceInstUsesWith(II,
987                                       ConstantInt::get(II.getType(), Result));
988       }
989       // TODO should we convert this to an AND if the RHS is constant?
990     }
991     break;
992   case Intrinsic::x86_bmi_pext_32:
993   case Intrinsic::x86_bmi_pext_64:
994     if (auto *MaskC = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
995       if (MaskC->isNullValue()) {
996         return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
997       }
998       if (MaskC->isAllOnesValue()) {
999         return IC.replaceInstUsesWith(II, II.getArgOperand(0));
1000       }
1001 
1002       if (MaskC->getValue().isShiftedMask()) {
1003         // any single contingous sequence of 1s anywhere in the mask simply
1004         // describes a subset of the input bits shifted to the appropriate
1005         // position.  Replace with the straight forward IR.
1006         unsigned ShiftAmount = MaskC->getValue().countTrailingZeros();
1007         Value *Input = II.getArgOperand(0);
1008         Value *Masked = IC.Builder.CreateAnd(Input, II.getArgOperand(1));
1009         Value *Shifted = IC.Builder.CreateLShr(Masked,
1010                                                ConstantInt::get(II.getType(),
1011                                                                 ShiftAmount));
1012         return IC.replaceInstUsesWith(II, Shifted);
1013       }
1014 
1015 
1016       if (auto *SrcC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
1017         uint64_t Src = SrcC->getZExtValue();
1018         uint64_t Mask = MaskC->getZExtValue();
1019         uint64_t Result = 0;
1020         uint64_t BitToSet = 1;
1021 
1022         while (Mask) {
1023           // Isolate lowest set bit.
1024           uint64_t BitToTest = Mask & -Mask;
1025           if (BitToTest & Src)
1026             Result |= BitToSet;
1027 
1028           BitToSet <<= 1;
1029           // Clear lowest set bit.
1030           Mask &= Mask - 1;
1031         }
1032 
1033         return IC.replaceInstUsesWith(II,
1034                                       ConstantInt::get(II.getType(), Result));
1035       }
1036     }
1037     break;
1038   case Intrinsic::x86_bmi_pdep_32:
1039   case Intrinsic::x86_bmi_pdep_64:
1040     if (auto *MaskC = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
1041       if (MaskC->isNullValue()) {
1042         return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
1043       }
1044       if (MaskC->isAllOnesValue()) {
1045         return IC.replaceInstUsesWith(II, II.getArgOperand(0));
1046       }
1047       if (MaskC->getValue().isShiftedMask()) {
1048         // any single contingous sequence of 1s anywhere in the mask simply
1049         // describes a subset of the input bits shifted to the appropriate
1050         // position.  Replace with the straight forward IR.
1051         unsigned ShiftAmount = MaskC->getValue().countTrailingZeros();
1052         Value *Input = II.getArgOperand(0);
1053         Value *Shifted = IC.Builder.CreateShl(Input,
1054                                               ConstantInt::get(II.getType(),
1055                                                                ShiftAmount));
1056         Value *Masked = IC.Builder.CreateAnd(Shifted, II.getArgOperand(1));
1057         return IC.replaceInstUsesWith(II, Masked);
1058       }
1059 
1060       if (auto *SrcC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
1061         uint64_t Src = SrcC->getZExtValue();
1062         uint64_t Mask = MaskC->getZExtValue();
1063         uint64_t Result = 0;
1064         uint64_t BitToTest = 1;
1065 
1066         while (Mask) {
1067           // Isolate lowest set bit.
1068           uint64_t BitToSet = Mask & -Mask;
1069           if (BitToTest & Src)
1070             Result |= BitToSet;
1071 
1072           BitToTest <<= 1;
1073           // Clear lowest set bit;
1074           Mask &= Mask - 1;
1075         }
1076 
1077         return IC.replaceInstUsesWith(II,
1078                                       ConstantInt::get(II.getType(), Result));
1079       }
1080     }
1081     break;
1082 
1083   case Intrinsic::x86_sse_cvtss2si:
1084   case Intrinsic::x86_sse_cvtss2si64:
1085   case Intrinsic::x86_sse_cvttss2si:
1086   case Intrinsic::x86_sse_cvttss2si64:
1087   case Intrinsic::x86_sse2_cvtsd2si:
1088   case Intrinsic::x86_sse2_cvtsd2si64:
1089   case Intrinsic::x86_sse2_cvttsd2si:
1090   case Intrinsic::x86_sse2_cvttsd2si64:
1091   case Intrinsic::x86_avx512_vcvtss2si32:
1092   case Intrinsic::x86_avx512_vcvtss2si64:
1093   case Intrinsic::x86_avx512_vcvtss2usi32:
1094   case Intrinsic::x86_avx512_vcvtss2usi64:
1095   case Intrinsic::x86_avx512_vcvtsd2si32:
1096   case Intrinsic::x86_avx512_vcvtsd2si64:
1097   case Intrinsic::x86_avx512_vcvtsd2usi32:
1098   case Intrinsic::x86_avx512_vcvtsd2usi64:
1099   case Intrinsic::x86_avx512_cvttss2si:
1100   case Intrinsic::x86_avx512_cvttss2si64:
1101   case Intrinsic::x86_avx512_cvttss2usi:
1102   case Intrinsic::x86_avx512_cvttss2usi64:
1103   case Intrinsic::x86_avx512_cvttsd2si:
1104   case Intrinsic::x86_avx512_cvttsd2si64:
1105   case Intrinsic::x86_avx512_cvttsd2usi:
1106   case Intrinsic::x86_avx512_cvttsd2usi64: {
1107     // These intrinsics only demand the 0th element of their input vectors. If
1108     // we can simplify the input based on that, do so now.
1109     Value *Arg = II.getArgOperand(0);
1110     unsigned VWidth = cast<FixedVectorType>(Arg->getType())->getNumElements();
1111     if (Value *V = SimplifyDemandedVectorEltsLow(Arg, VWidth, 1)) {
1112       return IC.replaceOperand(II, 0, V);
1113     }
1114     break;
1115   }
1116 
1117   case Intrinsic::x86_mmx_pmovmskb:
1118   case Intrinsic::x86_sse_movmsk_ps:
1119   case Intrinsic::x86_sse2_movmsk_pd:
1120   case Intrinsic::x86_sse2_pmovmskb_128:
1121   case Intrinsic::x86_avx_movmsk_pd_256:
1122   case Intrinsic::x86_avx_movmsk_ps_256:
1123   case Intrinsic::x86_avx2_pmovmskb:
1124     if (Value *V = simplifyX86movmsk(II, IC.Builder)) {
1125       return IC.replaceInstUsesWith(II, V);
1126     }
1127     break;
1128 
1129   case Intrinsic::x86_sse_comieq_ss:
1130   case Intrinsic::x86_sse_comige_ss:
1131   case Intrinsic::x86_sse_comigt_ss:
1132   case Intrinsic::x86_sse_comile_ss:
1133   case Intrinsic::x86_sse_comilt_ss:
1134   case Intrinsic::x86_sse_comineq_ss:
1135   case Intrinsic::x86_sse_ucomieq_ss:
1136   case Intrinsic::x86_sse_ucomige_ss:
1137   case Intrinsic::x86_sse_ucomigt_ss:
1138   case Intrinsic::x86_sse_ucomile_ss:
1139   case Intrinsic::x86_sse_ucomilt_ss:
1140   case Intrinsic::x86_sse_ucomineq_ss:
1141   case Intrinsic::x86_sse2_comieq_sd:
1142   case Intrinsic::x86_sse2_comige_sd:
1143   case Intrinsic::x86_sse2_comigt_sd:
1144   case Intrinsic::x86_sse2_comile_sd:
1145   case Intrinsic::x86_sse2_comilt_sd:
1146   case Intrinsic::x86_sse2_comineq_sd:
1147   case Intrinsic::x86_sse2_ucomieq_sd:
1148   case Intrinsic::x86_sse2_ucomige_sd:
1149   case Intrinsic::x86_sse2_ucomigt_sd:
1150   case Intrinsic::x86_sse2_ucomile_sd:
1151   case Intrinsic::x86_sse2_ucomilt_sd:
1152   case Intrinsic::x86_sse2_ucomineq_sd:
1153   case Intrinsic::x86_avx512_vcomi_ss:
1154   case Intrinsic::x86_avx512_vcomi_sd:
1155   case Intrinsic::x86_avx512_mask_cmp_ss:
1156   case Intrinsic::x86_avx512_mask_cmp_sd: {
1157     // These intrinsics only demand the 0th element of their input vectors. If
1158     // we can simplify the input based on that, do so now.
1159     bool MadeChange = false;
1160     Value *Arg0 = II.getArgOperand(0);
1161     Value *Arg1 = II.getArgOperand(1);
1162     unsigned VWidth = cast<FixedVectorType>(Arg0->getType())->getNumElements();
1163     if (Value *V = SimplifyDemandedVectorEltsLow(Arg0, VWidth, 1)) {
1164       IC.replaceOperand(II, 0, V);
1165       MadeChange = true;
1166     }
1167     if (Value *V = SimplifyDemandedVectorEltsLow(Arg1, VWidth, 1)) {
1168       IC.replaceOperand(II, 1, V);
1169       MadeChange = true;
1170     }
1171     if (MadeChange) {
1172       return &II;
1173     }
1174     break;
1175   }
1176 
1177   case Intrinsic::x86_avx512_add_ps_512:
1178   case Intrinsic::x86_avx512_div_ps_512:
1179   case Intrinsic::x86_avx512_mul_ps_512:
1180   case Intrinsic::x86_avx512_sub_ps_512:
1181   case Intrinsic::x86_avx512_add_pd_512:
1182   case Intrinsic::x86_avx512_div_pd_512:
1183   case Intrinsic::x86_avx512_mul_pd_512:
1184   case Intrinsic::x86_avx512_sub_pd_512:
1185     // If the rounding mode is CUR_DIRECTION(4) we can turn these into regular
1186     // IR operations.
1187     if (auto *R = dyn_cast<ConstantInt>(II.getArgOperand(2))) {
1188       if (R->getValue() == 4) {
1189         Value *Arg0 = II.getArgOperand(0);
1190         Value *Arg1 = II.getArgOperand(1);
1191 
1192         Value *V;
1193         switch (IID) {
1194         default:
1195           llvm_unreachable("Case stmts out of sync!");
1196         case Intrinsic::x86_avx512_add_ps_512:
1197         case Intrinsic::x86_avx512_add_pd_512:
1198           V = IC.Builder.CreateFAdd(Arg0, Arg1);
1199           break;
1200         case Intrinsic::x86_avx512_sub_ps_512:
1201         case Intrinsic::x86_avx512_sub_pd_512:
1202           V = IC.Builder.CreateFSub(Arg0, Arg1);
1203           break;
1204         case Intrinsic::x86_avx512_mul_ps_512:
1205         case Intrinsic::x86_avx512_mul_pd_512:
1206           V = IC.Builder.CreateFMul(Arg0, Arg1);
1207           break;
1208         case Intrinsic::x86_avx512_div_ps_512:
1209         case Intrinsic::x86_avx512_div_pd_512:
1210           V = IC.Builder.CreateFDiv(Arg0, Arg1);
1211           break;
1212         }
1213 
1214         return IC.replaceInstUsesWith(II, V);
1215       }
1216     }
1217     break;
1218 
1219   case Intrinsic::x86_avx512_mask_add_ss_round:
1220   case Intrinsic::x86_avx512_mask_div_ss_round:
1221   case Intrinsic::x86_avx512_mask_mul_ss_round:
1222   case Intrinsic::x86_avx512_mask_sub_ss_round:
1223   case Intrinsic::x86_avx512_mask_add_sd_round:
1224   case Intrinsic::x86_avx512_mask_div_sd_round:
1225   case Intrinsic::x86_avx512_mask_mul_sd_round:
1226   case Intrinsic::x86_avx512_mask_sub_sd_round:
1227     // If the rounding mode is CUR_DIRECTION(4) we can turn these into regular
1228     // IR operations.
1229     if (auto *R = dyn_cast<ConstantInt>(II.getArgOperand(4))) {
1230       if (R->getValue() == 4) {
1231         // Extract the element as scalars.
1232         Value *Arg0 = II.getArgOperand(0);
1233         Value *Arg1 = II.getArgOperand(1);
1234         Value *LHS = IC.Builder.CreateExtractElement(Arg0, (uint64_t)0);
1235         Value *RHS = IC.Builder.CreateExtractElement(Arg1, (uint64_t)0);
1236 
1237         Value *V;
1238         switch (IID) {
1239         default:
1240           llvm_unreachable("Case stmts out of sync!");
1241         case Intrinsic::x86_avx512_mask_add_ss_round:
1242         case Intrinsic::x86_avx512_mask_add_sd_round:
1243           V = IC.Builder.CreateFAdd(LHS, RHS);
1244           break;
1245         case Intrinsic::x86_avx512_mask_sub_ss_round:
1246         case Intrinsic::x86_avx512_mask_sub_sd_round:
1247           V = IC.Builder.CreateFSub(LHS, RHS);
1248           break;
1249         case Intrinsic::x86_avx512_mask_mul_ss_round:
1250         case Intrinsic::x86_avx512_mask_mul_sd_round:
1251           V = IC.Builder.CreateFMul(LHS, RHS);
1252           break;
1253         case Intrinsic::x86_avx512_mask_div_ss_round:
1254         case Intrinsic::x86_avx512_mask_div_sd_round:
1255           V = IC.Builder.CreateFDiv(LHS, RHS);
1256           break;
1257         }
1258 
1259         // Handle the masking aspect of the intrinsic.
1260         Value *Mask = II.getArgOperand(3);
1261         auto *C = dyn_cast<ConstantInt>(Mask);
1262         // We don't need a select if we know the mask bit is a 1.
1263         if (!C || !C->getValue()[0]) {
1264           // Cast the mask to an i1 vector and then extract the lowest element.
1265           auto *MaskTy = FixedVectorType::get(
1266               IC.Builder.getInt1Ty(),
1267               cast<IntegerType>(Mask->getType())->getBitWidth());
1268           Mask = IC.Builder.CreateBitCast(Mask, MaskTy);
1269           Mask = IC.Builder.CreateExtractElement(Mask, (uint64_t)0);
1270           // Extract the lowest element from the passthru operand.
1271           Value *Passthru =
1272               IC.Builder.CreateExtractElement(II.getArgOperand(2), (uint64_t)0);
1273           V = IC.Builder.CreateSelect(Mask, V, Passthru);
1274         }
1275 
1276         // Insert the result back into the original argument 0.
1277         V = IC.Builder.CreateInsertElement(Arg0, V, (uint64_t)0);
1278 
1279         return IC.replaceInstUsesWith(II, V);
1280       }
1281     }
1282     break;
1283 
1284   // Constant fold ashr( <A x Bi>, Ci ).
1285   // Constant fold lshr( <A x Bi>, Ci ).
1286   // Constant fold shl( <A x Bi>, Ci ).
1287   case Intrinsic::x86_sse2_psrai_d:
1288   case Intrinsic::x86_sse2_psrai_w:
1289   case Intrinsic::x86_avx2_psrai_d:
1290   case Intrinsic::x86_avx2_psrai_w:
1291   case Intrinsic::x86_avx512_psrai_q_128:
1292   case Intrinsic::x86_avx512_psrai_q_256:
1293   case Intrinsic::x86_avx512_psrai_d_512:
1294   case Intrinsic::x86_avx512_psrai_q_512:
1295   case Intrinsic::x86_avx512_psrai_w_512:
1296   case Intrinsic::x86_sse2_psrli_d:
1297   case Intrinsic::x86_sse2_psrli_q:
1298   case Intrinsic::x86_sse2_psrli_w:
1299   case Intrinsic::x86_avx2_psrli_d:
1300   case Intrinsic::x86_avx2_psrli_q:
1301   case Intrinsic::x86_avx2_psrli_w:
1302   case Intrinsic::x86_avx512_psrli_d_512:
1303   case Intrinsic::x86_avx512_psrli_q_512:
1304   case Intrinsic::x86_avx512_psrli_w_512:
1305   case Intrinsic::x86_sse2_pslli_d:
1306   case Intrinsic::x86_sse2_pslli_q:
1307   case Intrinsic::x86_sse2_pslli_w:
1308   case Intrinsic::x86_avx2_pslli_d:
1309   case Intrinsic::x86_avx2_pslli_q:
1310   case Intrinsic::x86_avx2_pslli_w:
1311   case Intrinsic::x86_avx512_pslli_d_512:
1312   case Intrinsic::x86_avx512_pslli_q_512:
1313   case Intrinsic::x86_avx512_pslli_w_512:
1314     if (Value *V = simplifyX86immShift(II, IC.Builder)) {
1315       return IC.replaceInstUsesWith(II, V);
1316     }
1317     break;
1318 
1319   case Intrinsic::x86_sse2_psra_d:
1320   case Intrinsic::x86_sse2_psra_w:
1321   case Intrinsic::x86_avx2_psra_d:
1322   case Intrinsic::x86_avx2_psra_w:
1323   case Intrinsic::x86_avx512_psra_q_128:
1324   case Intrinsic::x86_avx512_psra_q_256:
1325   case Intrinsic::x86_avx512_psra_d_512:
1326   case Intrinsic::x86_avx512_psra_q_512:
1327   case Intrinsic::x86_avx512_psra_w_512:
1328   case Intrinsic::x86_sse2_psrl_d:
1329   case Intrinsic::x86_sse2_psrl_q:
1330   case Intrinsic::x86_sse2_psrl_w:
1331   case Intrinsic::x86_avx2_psrl_d:
1332   case Intrinsic::x86_avx2_psrl_q:
1333   case Intrinsic::x86_avx2_psrl_w:
1334   case Intrinsic::x86_avx512_psrl_d_512:
1335   case Intrinsic::x86_avx512_psrl_q_512:
1336   case Intrinsic::x86_avx512_psrl_w_512:
1337   case Intrinsic::x86_sse2_psll_d:
1338   case Intrinsic::x86_sse2_psll_q:
1339   case Intrinsic::x86_sse2_psll_w:
1340   case Intrinsic::x86_avx2_psll_d:
1341   case Intrinsic::x86_avx2_psll_q:
1342   case Intrinsic::x86_avx2_psll_w:
1343   case Intrinsic::x86_avx512_psll_d_512:
1344   case Intrinsic::x86_avx512_psll_q_512:
1345   case Intrinsic::x86_avx512_psll_w_512: {
1346     if (Value *V = simplifyX86immShift(II, IC.Builder)) {
1347       return IC.replaceInstUsesWith(II, V);
1348     }
1349 
1350     // SSE2/AVX2 uses only the first 64-bits of the 128-bit vector
1351     // operand to compute the shift amount.
1352     Value *Arg1 = II.getArgOperand(1);
1353     assert(Arg1->getType()->getPrimitiveSizeInBits() == 128 &&
1354            "Unexpected packed shift size");
1355     unsigned VWidth = cast<FixedVectorType>(Arg1->getType())->getNumElements();
1356 
1357     if (Value *V = SimplifyDemandedVectorEltsLow(Arg1, VWidth, VWidth / 2)) {
1358       return IC.replaceOperand(II, 1, V);
1359     }
1360     break;
1361   }
1362 
1363   case Intrinsic::x86_avx2_psllv_d:
1364   case Intrinsic::x86_avx2_psllv_d_256:
1365   case Intrinsic::x86_avx2_psllv_q:
1366   case Intrinsic::x86_avx2_psllv_q_256:
1367   case Intrinsic::x86_avx512_psllv_d_512:
1368   case Intrinsic::x86_avx512_psllv_q_512:
1369   case Intrinsic::x86_avx512_psllv_w_128:
1370   case Intrinsic::x86_avx512_psllv_w_256:
1371   case Intrinsic::x86_avx512_psllv_w_512:
1372   case Intrinsic::x86_avx2_psrav_d:
1373   case Intrinsic::x86_avx2_psrav_d_256:
1374   case Intrinsic::x86_avx512_psrav_q_128:
1375   case Intrinsic::x86_avx512_psrav_q_256:
1376   case Intrinsic::x86_avx512_psrav_d_512:
1377   case Intrinsic::x86_avx512_psrav_q_512:
1378   case Intrinsic::x86_avx512_psrav_w_128:
1379   case Intrinsic::x86_avx512_psrav_w_256:
1380   case Intrinsic::x86_avx512_psrav_w_512:
1381   case Intrinsic::x86_avx2_psrlv_d:
1382   case Intrinsic::x86_avx2_psrlv_d_256:
1383   case Intrinsic::x86_avx2_psrlv_q:
1384   case Intrinsic::x86_avx2_psrlv_q_256:
1385   case Intrinsic::x86_avx512_psrlv_d_512:
1386   case Intrinsic::x86_avx512_psrlv_q_512:
1387   case Intrinsic::x86_avx512_psrlv_w_128:
1388   case Intrinsic::x86_avx512_psrlv_w_256:
1389   case Intrinsic::x86_avx512_psrlv_w_512:
1390     if (Value *V = simplifyX86varShift(II, IC.Builder)) {
1391       return IC.replaceInstUsesWith(II, V);
1392     }
1393     break;
1394 
1395   case Intrinsic::x86_sse2_packssdw_128:
1396   case Intrinsic::x86_sse2_packsswb_128:
1397   case Intrinsic::x86_avx2_packssdw:
1398   case Intrinsic::x86_avx2_packsswb:
1399   case Intrinsic::x86_avx512_packssdw_512:
1400   case Intrinsic::x86_avx512_packsswb_512:
1401     if (Value *V = simplifyX86pack(II, IC.Builder, true)) {
1402       return IC.replaceInstUsesWith(II, V);
1403     }
1404     break;
1405 
1406   case Intrinsic::x86_sse2_packuswb_128:
1407   case Intrinsic::x86_sse41_packusdw:
1408   case Intrinsic::x86_avx2_packusdw:
1409   case Intrinsic::x86_avx2_packuswb:
1410   case Intrinsic::x86_avx512_packusdw_512:
1411   case Intrinsic::x86_avx512_packuswb_512:
1412     if (Value *V = simplifyX86pack(II, IC.Builder, false)) {
1413       return IC.replaceInstUsesWith(II, V);
1414     }
1415     break;
1416 
1417   case Intrinsic::x86_pclmulqdq:
1418   case Intrinsic::x86_pclmulqdq_256:
1419   case Intrinsic::x86_pclmulqdq_512: {
1420     if (auto *C = dyn_cast<ConstantInt>(II.getArgOperand(2))) {
1421       unsigned Imm = C->getZExtValue();
1422 
1423       bool MadeChange = false;
1424       Value *Arg0 = II.getArgOperand(0);
1425       Value *Arg1 = II.getArgOperand(1);
1426       unsigned VWidth =
1427           cast<FixedVectorType>(Arg0->getType())->getNumElements();
1428 
1429       APInt UndefElts1(VWidth, 0);
1430       APInt DemandedElts1 =
1431           APInt::getSplat(VWidth, APInt(2, (Imm & 0x01) ? 2 : 1));
1432       if (Value *V =
1433               IC.SimplifyDemandedVectorElts(Arg0, DemandedElts1, UndefElts1)) {
1434         IC.replaceOperand(II, 0, V);
1435         MadeChange = true;
1436       }
1437 
1438       APInt UndefElts2(VWidth, 0);
1439       APInt DemandedElts2 =
1440           APInt::getSplat(VWidth, APInt(2, (Imm & 0x10) ? 2 : 1));
1441       if (Value *V =
1442               IC.SimplifyDemandedVectorElts(Arg1, DemandedElts2, UndefElts2)) {
1443         IC.replaceOperand(II, 1, V);
1444         MadeChange = true;
1445       }
1446 
1447       // If either input elements are undef, the result is zero.
1448       if (DemandedElts1.isSubsetOf(UndefElts1) ||
1449           DemandedElts2.isSubsetOf(UndefElts2)) {
1450         return IC.replaceInstUsesWith(II,
1451                                       ConstantAggregateZero::get(II.getType()));
1452       }
1453 
1454       if (MadeChange) {
1455         return &II;
1456       }
1457     }
1458     break;
1459   }
1460 
1461   case Intrinsic::x86_sse41_insertps:
1462     if (Value *V = simplifyX86insertps(II, IC.Builder)) {
1463       return IC.replaceInstUsesWith(II, V);
1464     }
1465     break;
1466 
1467   case Intrinsic::x86_sse4a_extrq: {
1468     Value *Op0 = II.getArgOperand(0);
1469     Value *Op1 = II.getArgOperand(1);
1470     unsigned VWidth0 = cast<FixedVectorType>(Op0->getType())->getNumElements();
1471     unsigned VWidth1 = cast<FixedVectorType>(Op1->getType())->getNumElements();
1472     assert(Op0->getType()->getPrimitiveSizeInBits() == 128 &&
1473            Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth0 == 2 &&
1474            VWidth1 == 16 && "Unexpected operand sizes");
1475 
1476     // See if we're dealing with constant values.
1477     Constant *C1 = dyn_cast<Constant>(Op1);
1478     ConstantInt *CILength =
1479         C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)0))
1480            : nullptr;
1481     ConstantInt *CIIndex =
1482         C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)1))
1483            : nullptr;
1484 
1485     // Attempt to simplify to a constant, shuffle vector or EXTRQI call.
1486     if (Value *V = simplifyX86extrq(II, Op0, CILength, CIIndex, IC.Builder)) {
1487       return IC.replaceInstUsesWith(II, V);
1488     }
1489 
1490     // EXTRQ only uses the lowest 64-bits of the first 128-bit vector
1491     // operands and the lowest 16-bits of the second.
1492     bool MadeChange = false;
1493     if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) {
1494       IC.replaceOperand(II, 0, V);
1495       MadeChange = true;
1496     }
1497     if (Value *V = SimplifyDemandedVectorEltsLow(Op1, VWidth1, 2)) {
1498       IC.replaceOperand(II, 1, V);
1499       MadeChange = true;
1500     }
1501     if (MadeChange) {
1502       return &II;
1503     }
1504     break;
1505   }
1506 
1507   case Intrinsic::x86_sse4a_extrqi: {
1508     // EXTRQI: Extract Length bits starting from Index. Zero pad the remaining
1509     // bits of the lower 64-bits. The upper 64-bits are undefined.
1510     Value *Op0 = II.getArgOperand(0);
1511     unsigned VWidth = cast<FixedVectorType>(Op0->getType())->getNumElements();
1512     assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && VWidth == 2 &&
1513            "Unexpected operand size");
1514 
1515     // See if we're dealing with constant values.
1516     ConstantInt *CILength = dyn_cast<ConstantInt>(II.getArgOperand(1));
1517     ConstantInt *CIIndex = dyn_cast<ConstantInt>(II.getArgOperand(2));
1518 
1519     // Attempt to simplify to a constant or shuffle vector.
1520     if (Value *V = simplifyX86extrq(II, Op0, CILength, CIIndex, IC.Builder)) {
1521       return IC.replaceInstUsesWith(II, V);
1522     }
1523 
1524     // EXTRQI only uses the lowest 64-bits of the first 128-bit vector
1525     // operand.
1526     if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth, 1)) {
1527       return IC.replaceOperand(II, 0, V);
1528     }
1529     break;
1530   }
1531 
1532   case Intrinsic::x86_sse4a_insertq: {
1533     Value *Op0 = II.getArgOperand(0);
1534     Value *Op1 = II.getArgOperand(1);
1535     unsigned VWidth = cast<FixedVectorType>(Op0->getType())->getNumElements();
1536     assert(Op0->getType()->getPrimitiveSizeInBits() == 128 &&
1537            Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth == 2 &&
1538            cast<FixedVectorType>(Op1->getType())->getNumElements() == 2 &&
1539            "Unexpected operand size");
1540 
1541     // See if we're dealing with constant values.
1542     Constant *C1 = dyn_cast<Constant>(Op1);
1543     ConstantInt *CI11 =
1544         C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)1))
1545            : nullptr;
1546 
1547     // Attempt to simplify to a constant, shuffle vector or INSERTQI call.
1548     if (CI11) {
1549       const APInt &V11 = CI11->getValue();
1550       APInt Len = V11.zextOrTrunc(6);
1551       APInt Idx = V11.lshr(8).zextOrTrunc(6);
1552       if (Value *V = simplifyX86insertq(II, Op0, Op1, Len, Idx, IC.Builder)) {
1553         return IC.replaceInstUsesWith(II, V);
1554       }
1555     }
1556 
1557     // INSERTQ only uses the lowest 64-bits of the first 128-bit vector
1558     // operand.
1559     if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth, 1)) {
1560       return IC.replaceOperand(II, 0, V);
1561     }
1562     break;
1563   }
1564 
1565   case Intrinsic::x86_sse4a_insertqi: {
1566     // INSERTQI: Extract lowest Length bits from lower half of second source and
1567     // insert over first source starting at Index bit. The upper 64-bits are
1568     // undefined.
1569     Value *Op0 = II.getArgOperand(0);
1570     Value *Op1 = II.getArgOperand(1);
1571     unsigned VWidth0 = cast<FixedVectorType>(Op0->getType())->getNumElements();
1572     unsigned VWidth1 = cast<FixedVectorType>(Op1->getType())->getNumElements();
1573     assert(Op0->getType()->getPrimitiveSizeInBits() == 128 &&
1574            Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth0 == 2 &&
1575            VWidth1 == 2 && "Unexpected operand sizes");
1576 
1577     // See if we're dealing with constant values.
1578     ConstantInt *CILength = dyn_cast<ConstantInt>(II.getArgOperand(2));
1579     ConstantInt *CIIndex = dyn_cast<ConstantInt>(II.getArgOperand(3));
1580 
1581     // Attempt to simplify to a constant or shuffle vector.
1582     if (CILength && CIIndex) {
1583       APInt Len = CILength->getValue().zextOrTrunc(6);
1584       APInt Idx = CIIndex->getValue().zextOrTrunc(6);
1585       if (Value *V = simplifyX86insertq(II, Op0, Op1, Len, Idx, IC.Builder)) {
1586         return IC.replaceInstUsesWith(II, V);
1587       }
1588     }
1589 
1590     // INSERTQI only uses the lowest 64-bits of the first two 128-bit vector
1591     // operands.
1592     bool MadeChange = false;
1593     if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) {
1594       IC.replaceOperand(II, 0, V);
1595       MadeChange = true;
1596     }
1597     if (Value *V = SimplifyDemandedVectorEltsLow(Op1, VWidth1, 1)) {
1598       IC.replaceOperand(II, 1, V);
1599       MadeChange = true;
1600     }
1601     if (MadeChange) {
1602       return &II;
1603     }
1604     break;
1605   }
1606 
1607   case Intrinsic::x86_sse41_pblendvb:
1608   case Intrinsic::x86_sse41_blendvps:
1609   case Intrinsic::x86_sse41_blendvpd:
1610   case Intrinsic::x86_avx_blendv_ps_256:
1611   case Intrinsic::x86_avx_blendv_pd_256:
1612   case Intrinsic::x86_avx2_pblendvb: {
1613     // fold (blend A, A, Mask) -> A
1614     Value *Op0 = II.getArgOperand(0);
1615     Value *Op1 = II.getArgOperand(1);
1616     Value *Mask = II.getArgOperand(2);
1617     if (Op0 == Op1) {
1618       return IC.replaceInstUsesWith(II, Op0);
1619     }
1620 
1621     // Zero Mask - select 1st argument.
1622     if (isa<ConstantAggregateZero>(Mask)) {
1623       return IC.replaceInstUsesWith(II, Op0);
1624     }
1625 
1626     // Constant Mask - select 1st/2nd argument lane based on top bit of mask.
1627     if (auto *ConstantMask = dyn_cast<ConstantDataVector>(Mask)) {
1628       Constant *NewSelector = getNegativeIsTrueBoolVec(ConstantMask);
1629       return SelectInst::Create(NewSelector, Op1, Op0, "blendv");
1630     }
1631 
1632     // Convert to a vector select if we can bypass casts and find a boolean
1633     // vector condition value.
1634     Value *BoolVec;
1635     Mask = InstCombiner::peekThroughBitcast(Mask);
1636     if (match(Mask, PatternMatch::m_SExt(PatternMatch::m_Value(BoolVec))) &&
1637         BoolVec->getType()->isVectorTy() &&
1638         BoolVec->getType()->getScalarSizeInBits() == 1) {
1639       assert(Mask->getType()->getPrimitiveSizeInBits() ==
1640                  II.getType()->getPrimitiveSizeInBits() &&
1641              "Not expecting mask and operands with different sizes");
1642 
1643       unsigned NumMaskElts =
1644           cast<FixedVectorType>(Mask->getType())->getNumElements();
1645       unsigned NumOperandElts =
1646           cast<FixedVectorType>(II.getType())->getNumElements();
1647       if (NumMaskElts == NumOperandElts) {
1648         return SelectInst::Create(BoolVec, Op1, Op0);
1649       }
1650 
1651       // If the mask has less elements than the operands, each mask bit maps to
1652       // multiple elements of the operands. Bitcast back and forth.
1653       if (NumMaskElts < NumOperandElts) {
1654         Value *CastOp0 = IC.Builder.CreateBitCast(Op0, Mask->getType());
1655         Value *CastOp1 = IC.Builder.CreateBitCast(Op1, Mask->getType());
1656         Value *Sel = IC.Builder.CreateSelect(BoolVec, CastOp1, CastOp0);
1657         return new BitCastInst(Sel, II.getType());
1658       }
1659     }
1660 
1661     break;
1662   }
1663 
1664   case Intrinsic::x86_ssse3_pshuf_b_128:
1665   case Intrinsic::x86_avx2_pshuf_b:
1666   case Intrinsic::x86_avx512_pshuf_b_512:
1667     if (Value *V = simplifyX86pshufb(II, IC.Builder)) {
1668       return IC.replaceInstUsesWith(II, V);
1669     }
1670     break;
1671 
1672   case Intrinsic::x86_avx_vpermilvar_ps:
1673   case Intrinsic::x86_avx_vpermilvar_ps_256:
1674   case Intrinsic::x86_avx512_vpermilvar_ps_512:
1675   case Intrinsic::x86_avx_vpermilvar_pd:
1676   case Intrinsic::x86_avx_vpermilvar_pd_256:
1677   case Intrinsic::x86_avx512_vpermilvar_pd_512:
1678     if (Value *V = simplifyX86vpermilvar(II, IC.Builder)) {
1679       return IC.replaceInstUsesWith(II, V);
1680     }
1681     break;
1682 
1683   case Intrinsic::x86_avx2_permd:
1684   case Intrinsic::x86_avx2_permps:
1685   case Intrinsic::x86_avx512_permvar_df_256:
1686   case Intrinsic::x86_avx512_permvar_df_512:
1687   case Intrinsic::x86_avx512_permvar_di_256:
1688   case Intrinsic::x86_avx512_permvar_di_512:
1689   case Intrinsic::x86_avx512_permvar_hi_128:
1690   case Intrinsic::x86_avx512_permvar_hi_256:
1691   case Intrinsic::x86_avx512_permvar_hi_512:
1692   case Intrinsic::x86_avx512_permvar_qi_128:
1693   case Intrinsic::x86_avx512_permvar_qi_256:
1694   case Intrinsic::x86_avx512_permvar_qi_512:
1695   case Intrinsic::x86_avx512_permvar_sf_512:
1696   case Intrinsic::x86_avx512_permvar_si_512:
1697     if (Value *V = simplifyX86vpermv(II, IC.Builder)) {
1698       return IC.replaceInstUsesWith(II, V);
1699     }
1700     break;
1701 
1702   case Intrinsic::x86_avx_maskload_ps:
1703   case Intrinsic::x86_avx_maskload_pd:
1704   case Intrinsic::x86_avx_maskload_ps_256:
1705   case Intrinsic::x86_avx_maskload_pd_256:
1706   case Intrinsic::x86_avx2_maskload_d:
1707   case Intrinsic::x86_avx2_maskload_q:
1708   case Intrinsic::x86_avx2_maskload_d_256:
1709   case Intrinsic::x86_avx2_maskload_q_256:
1710     if (Instruction *I = simplifyX86MaskedLoad(II, IC)) {
1711       return I;
1712     }
1713     break;
1714 
1715   case Intrinsic::x86_sse2_maskmov_dqu:
1716   case Intrinsic::x86_avx_maskstore_ps:
1717   case Intrinsic::x86_avx_maskstore_pd:
1718   case Intrinsic::x86_avx_maskstore_ps_256:
1719   case Intrinsic::x86_avx_maskstore_pd_256:
1720   case Intrinsic::x86_avx2_maskstore_d:
1721   case Intrinsic::x86_avx2_maskstore_q:
1722   case Intrinsic::x86_avx2_maskstore_d_256:
1723   case Intrinsic::x86_avx2_maskstore_q_256:
1724     if (simplifyX86MaskedStore(II, IC)) {
1725       return nullptr;
1726     }
1727     break;
1728 
1729   case Intrinsic::x86_addcarry_32:
1730   case Intrinsic::x86_addcarry_64:
1731     if (Value *V = simplifyX86addcarry(II, IC.Builder)) {
1732       return IC.replaceInstUsesWith(II, V);
1733     }
1734     break;
1735 
1736   default:
1737     break;
1738   }
1739   return None;
1740 }
1741 
simplifyDemandedUseBitsIntrinsic(InstCombiner & IC,IntrinsicInst & II,APInt DemandedMask,KnownBits & Known,bool & KnownBitsComputed) const1742 Optional<Value *> X86TTIImpl::simplifyDemandedUseBitsIntrinsic(
1743     InstCombiner &IC, IntrinsicInst &II, APInt DemandedMask, KnownBits &Known,
1744     bool &KnownBitsComputed) const {
1745   switch (II.getIntrinsicID()) {
1746   default:
1747     break;
1748   case Intrinsic::x86_mmx_pmovmskb:
1749   case Intrinsic::x86_sse_movmsk_ps:
1750   case Intrinsic::x86_sse2_movmsk_pd:
1751   case Intrinsic::x86_sse2_pmovmskb_128:
1752   case Intrinsic::x86_avx_movmsk_ps_256:
1753   case Intrinsic::x86_avx_movmsk_pd_256:
1754   case Intrinsic::x86_avx2_pmovmskb: {
1755     // MOVMSK copies the vector elements' sign bits to the low bits
1756     // and zeros the high bits.
1757     unsigned ArgWidth;
1758     if (II.getIntrinsicID() == Intrinsic::x86_mmx_pmovmskb) {
1759       ArgWidth = 8; // Arg is x86_mmx, but treated as <8 x i8>.
1760     } else {
1761       auto Arg = II.getArgOperand(0);
1762       auto ArgType = cast<FixedVectorType>(Arg->getType());
1763       ArgWidth = ArgType->getNumElements();
1764     }
1765 
1766     // If we don't need any of low bits then return zero,
1767     // we know that DemandedMask is non-zero already.
1768     APInt DemandedElts = DemandedMask.zextOrTrunc(ArgWidth);
1769     Type *VTy = II.getType();
1770     if (DemandedElts.isNullValue()) {
1771       return ConstantInt::getNullValue(VTy);
1772     }
1773 
1774     // We know that the upper bits are set to zero.
1775     Known.Zero.setBitsFrom(ArgWidth);
1776     KnownBitsComputed = true;
1777     break;
1778   }
1779   }
1780   return None;
1781 }
1782 
simplifyDemandedVectorEltsIntrinsic(InstCombiner & IC,IntrinsicInst & II,APInt DemandedElts,APInt & UndefElts,APInt & UndefElts2,APInt & UndefElts3,std::function<void (Instruction *,unsigned,APInt,APInt &)> simplifyAndSetOp) const1783 Optional<Value *> X86TTIImpl::simplifyDemandedVectorEltsIntrinsic(
1784     InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
1785     APInt &UndefElts2, APInt &UndefElts3,
1786     std::function<void(Instruction *, unsigned, APInt, APInt &)>
1787         simplifyAndSetOp) const {
1788   unsigned VWidth = cast<FixedVectorType>(II.getType())->getNumElements();
1789   switch (II.getIntrinsicID()) {
1790   default:
1791     break;
1792   case Intrinsic::x86_xop_vfrcz_ss:
1793   case Intrinsic::x86_xop_vfrcz_sd:
1794     // The instructions for these intrinsics are speced to zero upper bits not
1795     // pass them through like other scalar intrinsics. So we shouldn't just
1796     // use Arg0 if DemandedElts[0] is clear like we do for other intrinsics.
1797     // Instead we should return a zero vector.
1798     if (!DemandedElts[0]) {
1799       IC.addToWorklist(&II);
1800       return ConstantAggregateZero::get(II.getType());
1801     }
1802 
1803     // Only the lower element is used.
1804     DemandedElts = 1;
1805     simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
1806 
1807     // Only the lower element is undefined. The high elements are zero.
1808     UndefElts = UndefElts[0];
1809     break;
1810 
1811   // Unary scalar-as-vector operations that work column-wise.
1812   case Intrinsic::x86_sse_rcp_ss:
1813   case Intrinsic::x86_sse_rsqrt_ss:
1814     simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
1815 
1816     // If lowest element of a scalar op isn't used then use Arg0.
1817     if (!DemandedElts[0]) {
1818       IC.addToWorklist(&II);
1819       return II.getArgOperand(0);
1820     }
1821     // TODO: If only low elt lower SQRT to FSQRT (with rounding/exceptions
1822     // checks).
1823     break;
1824 
1825   // Binary scalar-as-vector operations that work column-wise. The high
1826   // elements come from operand 0. The low element is a function of both
1827   // operands.
1828   case Intrinsic::x86_sse_min_ss:
1829   case Intrinsic::x86_sse_max_ss:
1830   case Intrinsic::x86_sse_cmp_ss:
1831   case Intrinsic::x86_sse2_min_sd:
1832   case Intrinsic::x86_sse2_max_sd:
1833   case Intrinsic::x86_sse2_cmp_sd: {
1834     simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
1835 
1836     // If lowest element of a scalar op isn't used then use Arg0.
1837     if (!DemandedElts[0]) {
1838       IC.addToWorklist(&II);
1839       return II.getArgOperand(0);
1840     }
1841 
1842     // Only lower element is used for operand 1.
1843     DemandedElts = 1;
1844     simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
1845 
1846     // Lower element is undefined if both lower elements are undefined.
1847     // Consider things like undef&0.  The result is known zero, not undef.
1848     if (!UndefElts2[0])
1849       UndefElts.clearBit(0);
1850 
1851     break;
1852   }
1853 
1854   // Binary scalar-as-vector operations that work column-wise. The high
1855   // elements come from operand 0 and the low element comes from operand 1.
1856   case Intrinsic::x86_sse41_round_ss:
1857   case Intrinsic::x86_sse41_round_sd: {
1858     // Don't use the low element of operand 0.
1859     APInt DemandedElts2 = DemandedElts;
1860     DemandedElts2.clearBit(0);
1861     simplifyAndSetOp(&II, 0, DemandedElts2, UndefElts);
1862 
1863     // If lowest element of a scalar op isn't used then use Arg0.
1864     if (!DemandedElts[0]) {
1865       IC.addToWorklist(&II);
1866       return II.getArgOperand(0);
1867     }
1868 
1869     // Only lower element is used for operand 1.
1870     DemandedElts = 1;
1871     simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
1872 
1873     // Take the high undef elements from operand 0 and take the lower element
1874     // from operand 1.
1875     UndefElts.clearBit(0);
1876     UndefElts |= UndefElts2[0];
1877     break;
1878   }
1879 
1880   // Three input scalar-as-vector operations that work column-wise. The high
1881   // elements come from operand 0 and the low element is a function of all
1882   // three inputs.
1883   case Intrinsic::x86_avx512_mask_add_ss_round:
1884   case Intrinsic::x86_avx512_mask_div_ss_round:
1885   case Intrinsic::x86_avx512_mask_mul_ss_round:
1886   case Intrinsic::x86_avx512_mask_sub_ss_round:
1887   case Intrinsic::x86_avx512_mask_max_ss_round:
1888   case Intrinsic::x86_avx512_mask_min_ss_round:
1889   case Intrinsic::x86_avx512_mask_add_sd_round:
1890   case Intrinsic::x86_avx512_mask_div_sd_round:
1891   case Intrinsic::x86_avx512_mask_mul_sd_round:
1892   case Intrinsic::x86_avx512_mask_sub_sd_round:
1893   case Intrinsic::x86_avx512_mask_max_sd_round:
1894   case Intrinsic::x86_avx512_mask_min_sd_round:
1895     simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
1896 
1897     // If lowest element of a scalar op isn't used then use Arg0.
1898     if (!DemandedElts[0]) {
1899       IC.addToWorklist(&II);
1900       return II.getArgOperand(0);
1901     }
1902 
1903     // Only lower element is used for operand 1 and 2.
1904     DemandedElts = 1;
1905     simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
1906     simplifyAndSetOp(&II, 2, DemandedElts, UndefElts3);
1907 
1908     // Lower element is undefined if all three lower elements are undefined.
1909     // Consider things like undef&0.  The result is known zero, not undef.
1910     if (!UndefElts2[0] || !UndefElts3[0])
1911       UndefElts.clearBit(0);
1912 
1913     break;
1914 
1915   case Intrinsic::x86_sse2_packssdw_128:
1916   case Intrinsic::x86_sse2_packsswb_128:
1917   case Intrinsic::x86_sse2_packuswb_128:
1918   case Intrinsic::x86_sse41_packusdw:
1919   case Intrinsic::x86_avx2_packssdw:
1920   case Intrinsic::x86_avx2_packsswb:
1921   case Intrinsic::x86_avx2_packusdw:
1922   case Intrinsic::x86_avx2_packuswb:
1923   case Intrinsic::x86_avx512_packssdw_512:
1924   case Intrinsic::x86_avx512_packsswb_512:
1925   case Intrinsic::x86_avx512_packusdw_512:
1926   case Intrinsic::x86_avx512_packuswb_512: {
1927     auto *Ty0 = II.getArgOperand(0)->getType();
1928     unsigned InnerVWidth = cast<FixedVectorType>(Ty0)->getNumElements();
1929     assert(VWidth == (InnerVWidth * 2) && "Unexpected input size");
1930 
1931     unsigned NumLanes = Ty0->getPrimitiveSizeInBits() / 128;
1932     unsigned VWidthPerLane = VWidth / NumLanes;
1933     unsigned InnerVWidthPerLane = InnerVWidth / NumLanes;
1934 
1935     // Per lane, pack the elements of the first input and then the second.
1936     // e.g.
1937     // v8i16 PACK(v4i32 X, v4i32 Y) - (X[0..3],Y[0..3])
1938     // v32i8 PACK(v16i16 X, v16i16 Y) - (X[0..7],Y[0..7]),(X[8..15],Y[8..15])
1939     for (int OpNum = 0; OpNum != 2; ++OpNum) {
1940       APInt OpDemandedElts(InnerVWidth, 0);
1941       for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
1942         unsigned LaneIdx = Lane * VWidthPerLane;
1943         for (unsigned Elt = 0; Elt != InnerVWidthPerLane; ++Elt) {
1944           unsigned Idx = LaneIdx + Elt + InnerVWidthPerLane * OpNum;
1945           if (DemandedElts[Idx])
1946             OpDemandedElts.setBit((Lane * InnerVWidthPerLane) + Elt);
1947         }
1948       }
1949 
1950       // Demand elements from the operand.
1951       APInt OpUndefElts(InnerVWidth, 0);
1952       simplifyAndSetOp(&II, OpNum, OpDemandedElts, OpUndefElts);
1953 
1954       // Pack the operand's UNDEF elements, one lane at a time.
1955       OpUndefElts = OpUndefElts.zext(VWidth);
1956       for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
1957         APInt LaneElts = OpUndefElts.lshr(InnerVWidthPerLane * Lane);
1958         LaneElts = LaneElts.getLoBits(InnerVWidthPerLane);
1959         LaneElts <<= InnerVWidthPerLane * (2 * Lane + OpNum);
1960         UndefElts |= LaneElts;
1961       }
1962     }
1963     break;
1964   }
1965 
1966   // PSHUFB
1967   case Intrinsic::x86_ssse3_pshuf_b_128:
1968   case Intrinsic::x86_avx2_pshuf_b:
1969   case Intrinsic::x86_avx512_pshuf_b_512:
1970   // PERMILVAR
1971   case Intrinsic::x86_avx_vpermilvar_ps:
1972   case Intrinsic::x86_avx_vpermilvar_ps_256:
1973   case Intrinsic::x86_avx512_vpermilvar_ps_512:
1974   case Intrinsic::x86_avx_vpermilvar_pd:
1975   case Intrinsic::x86_avx_vpermilvar_pd_256:
1976   case Intrinsic::x86_avx512_vpermilvar_pd_512:
1977   // PERMV
1978   case Intrinsic::x86_avx2_permd:
1979   case Intrinsic::x86_avx2_permps: {
1980     simplifyAndSetOp(&II, 1, DemandedElts, UndefElts);
1981     break;
1982   }
1983 
1984   // SSE4A instructions leave the upper 64-bits of the 128-bit result
1985   // in an undefined state.
1986   case Intrinsic::x86_sse4a_extrq:
1987   case Intrinsic::x86_sse4a_extrqi:
1988   case Intrinsic::x86_sse4a_insertq:
1989   case Intrinsic::x86_sse4a_insertqi:
1990     UndefElts.setHighBits(VWidth / 2);
1991     break;
1992   }
1993   return None;
1994 }
1995