1 //===-- X86InstCombineIntrinsic.cpp - X86 specific InstCombine pass -------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements a TargetTransformInfo analysis pass specific to the
10 /// X86 target machine. It uses the target's detailed information to provide
11 /// more precise answers to certain TTI queries, while letting the target
12 /// independent and default TTI implementations handle the rest.
13 ///
14 //===----------------------------------------------------------------------===//
15
16 #include "X86TargetTransformInfo.h"
17 #include "llvm/IR/IntrinsicInst.h"
18 #include "llvm/IR/IntrinsicsX86.h"
19 #include "llvm/Support/KnownBits.h"
20 #include "llvm/Transforms/InstCombine/InstCombiner.h"
21 #include <optional>
22
23 using namespace llvm;
24
25 #define DEBUG_TYPE "x86tti"
26
27 /// Return a constant boolean vector that has true elements in all positions
28 /// where the input constant data vector has an element with the sign bit set.
static Constant *getNegativeIsTrueBoolVec(Constant *V) {
30 VectorType *IntTy = VectorType::getInteger(cast<VectorType>(V->getType()));
31 V = ConstantExpr::getBitCast(V, IntTy);
32 V = ConstantExpr::getICmp(CmpInst::ICMP_SGT, Constant::getNullValue(IntTy),
33 V);
34 return V;
35 }
36
37 /// Convert the x86 XMM integer vector mask to a vector of bools based on
38 /// each element's most significant bit (the sign bit).
static Value *getBoolVecFromMask(Value *Mask) {
40 // Fold Constant Mask.
41 if (auto *ConstantMask = dyn_cast<ConstantDataVector>(Mask))
42 return getNegativeIsTrueBoolVec(ConstantMask);
43
44 // Mask was extended from a boolean vector.
45 Value *ExtMask;
46 if (PatternMatch::match(
47 Mask, PatternMatch::m_SExt(PatternMatch::m_Value(ExtMask))) &&
48 ExtMask->getType()->isIntOrIntVectorTy(1))
49 return ExtMask;
50
51 return nullptr;
52 }
53
54 // TODO: If the x86 backend knew how to convert a bool vector mask back to an
55 // XMM register mask efficiently, we could transform all x86 masked intrinsics
56 // to LLVM masked intrinsics and remove the x86 masked intrinsic defs.
static Instruction *simplifyX86MaskedLoad(IntrinsicInst &II, InstCombiner &IC) {
58 Value *Ptr = II.getOperand(0);
59 Value *Mask = II.getOperand(1);
60 Constant *ZeroVec = Constant::getNullValue(II.getType());
61
62 // Zero Mask - masked load instruction creates a zero vector.
63 if (isa<ConstantAggregateZero>(Mask))
64 return IC.replaceInstUsesWith(II, ZeroVec);
65
66 // The mask is constant or extended from a bool vector. Convert this x86
67 // intrinsic to the LLVM intrinsic to allow target-independent optimizations.
68 if (Value *BoolMask = getBoolVecFromMask(Mask)) {
69 // First, cast the x86 intrinsic scalar pointer to a vector pointer to match
70 // the LLVM intrinsic definition for the pointer argument.
71 unsigned AddrSpace = cast<PointerType>(Ptr->getType())->getAddressSpace();
72 PointerType *VecPtrTy = PointerType::get(II.getType(), AddrSpace);
73 Value *PtrCast = IC.Builder.CreateBitCast(Ptr, VecPtrTy, "castvec");
74
75 // The pass-through vector for an x86 masked load is a zero vector.
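    // The x86 masked load intrinsics have no alignment requirement, so the
    // most conservative alignment of 1 is used for the generic intrinsic.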
76 CallInst *NewMaskedLoad = IC.Builder.CreateMaskedLoad(
77 II.getType(), PtrCast, Align(1), BoolMask, ZeroVec);
78 return IC.replaceInstUsesWith(II, NewMaskedLoad);
79 }
80
81 return nullptr;
82 }
83
84 // TODO: If the x86 backend knew how to convert a bool vector mask back to an
85 // XMM register mask efficiently, we could transform all x86 masked intrinsics
86 // to LLVM masked intrinsics and remove the x86 masked intrinsic defs.
static bool simplifyX86MaskedStore(IntrinsicInst &II, InstCombiner &IC) {
88 Value *Ptr = II.getOperand(0);
89 Value *Mask = II.getOperand(1);
90 Value *Vec = II.getOperand(2);
91
92 // Zero Mask - this masked store instruction does nothing.
93 if (isa<ConstantAggregateZero>(Mask)) {
94 IC.eraseInstFromFunction(II);
95 return true;
96 }
97
  // The SSE2 version is too weird (e.g., unaligned but non-temporal) to do
  // anything else at this level.
100 if (II.getIntrinsicID() == Intrinsic::x86_sse2_maskmov_dqu)
101 return false;
102
103 // The mask is constant or extended from a bool vector. Convert this x86
104 // intrinsic to the LLVM intrinsic to allow target-independent optimizations.
105 if (Value *BoolMask = getBoolVecFromMask(Mask)) {
106 unsigned AddrSpace = cast<PointerType>(Ptr->getType())->getAddressSpace();
107 PointerType *VecPtrTy = PointerType::get(Vec->getType(), AddrSpace);
108 Value *PtrCast = IC.Builder.CreateBitCast(Ptr, VecPtrTy, "castvec");
109
110 IC.Builder.CreateMaskedStore(Vec, PtrCast, Align(1), BoolMask);
111
112 // 'Replace uses' doesn't work for stores. Erase the original masked store.
113 IC.eraseInstFromFunction(II);
114 return true;
115 }
116
117 return false;
118 }
119
static Value *simplifyX86immShift(const IntrinsicInst &II,
                                  InstCombiner::BuilderTy &Builder) {
122 bool LogicalShift = false;
123 bool ShiftLeft = false;
124 bool IsImm = false;
125
126 switch (II.getIntrinsicID()) {
127 default:
128 llvm_unreachable("Unexpected intrinsic!");
129 case Intrinsic::x86_sse2_psrai_d:
130 case Intrinsic::x86_sse2_psrai_w:
131 case Intrinsic::x86_avx2_psrai_d:
132 case Intrinsic::x86_avx2_psrai_w:
133 case Intrinsic::x86_avx512_psrai_q_128:
134 case Intrinsic::x86_avx512_psrai_q_256:
135 case Intrinsic::x86_avx512_psrai_d_512:
136 case Intrinsic::x86_avx512_psrai_q_512:
137 case Intrinsic::x86_avx512_psrai_w_512:
138 IsImm = true;
139 [[fallthrough]];
140 case Intrinsic::x86_sse2_psra_d:
141 case Intrinsic::x86_sse2_psra_w:
142 case Intrinsic::x86_avx2_psra_d:
143 case Intrinsic::x86_avx2_psra_w:
144 case Intrinsic::x86_avx512_psra_q_128:
145 case Intrinsic::x86_avx512_psra_q_256:
146 case Intrinsic::x86_avx512_psra_d_512:
147 case Intrinsic::x86_avx512_psra_q_512:
148 case Intrinsic::x86_avx512_psra_w_512:
149 LogicalShift = false;
150 ShiftLeft = false;
151 break;
152 case Intrinsic::x86_sse2_psrli_d:
153 case Intrinsic::x86_sse2_psrli_q:
154 case Intrinsic::x86_sse2_psrli_w:
155 case Intrinsic::x86_avx2_psrli_d:
156 case Intrinsic::x86_avx2_psrli_q:
157 case Intrinsic::x86_avx2_psrli_w:
158 case Intrinsic::x86_avx512_psrli_d_512:
159 case Intrinsic::x86_avx512_psrli_q_512:
160 case Intrinsic::x86_avx512_psrli_w_512:
161 IsImm = true;
162 [[fallthrough]];
163 case Intrinsic::x86_sse2_psrl_d:
164 case Intrinsic::x86_sse2_psrl_q:
165 case Intrinsic::x86_sse2_psrl_w:
166 case Intrinsic::x86_avx2_psrl_d:
167 case Intrinsic::x86_avx2_psrl_q:
168 case Intrinsic::x86_avx2_psrl_w:
169 case Intrinsic::x86_avx512_psrl_d_512:
170 case Intrinsic::x86_avx512_psrl_q_512:
171 case Intrinsic::x86_avx512_psrl_w_512:
172 LogicalShift = true;
173 ShiftLeft = false;
174 break;
175 case Intrinsic::x86_sse2_pslli_d:
176 case Intrinsic::x86_sse2_pslli_q:
177 case Intrinsic::x86_sse2_pslli_w:
178 case Intrinsic::x86_avx2_pslli_d:
179 case Intrinsic::x86_avx2_pslli_q:
180 case Intrinsic::x86_avx2_pslli_w:
181 case Intrinsic::x86_avx512_pslli_d_512:
182 case Intrinsic::x86_avx512_pslli_q_512:
183 case Intrinsic::x86_avx512_pslli_w_512:
184 IsImm = true;
185 [[fallthrough]];
186 case Intrinsic::x86_sse2_psll_d:
187 case Intrinsic::x86_sse2_psll_q:
188 case Intrinsic::x86_sse2_psll_w:
189 case Intrinsic::x86_avx2_psll_d:
190 case Intrinsic::x86_avx2_psll_q:
191 case Intrinsic::x86_avx2_psll_w:
192 case Intrinsic::x86_avx512_psll_d_512:
193 case Intrinsic::x86_avx512_psll_q_512:
194 case Intrinsic::x86_avx512_psll_w_512:
195 LogicalShift = true;
196 ShiftLeft = true;
197 break;
198 }
199 assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left");
200
201 Value *Vec = II.getArgOperand(0);
202 Value *Amt = II.getArgOperand(1);
203 auto *VT = cast<FixedVectorType>(Vec->getType());
204 Type *SVT = VT->getElementType();
205 Type *AmtVT = Amt->getType();
206 unsigned VWidth = VT->getNumElements();
207 unsigned BitWidth = SVT->getPrimitiveSizeInBits();
208
  // If the shift amount is guaranteed to be in range, we can replace it with a
  // generic shift. If it's guaranteed to be out of range, logical shifts
  // combine to zero and arithmetic shifts are clamped to (BitWidth - 1).
212 if (IsImm) {
213 assert(AmtVT->isIntegerTy(32) && "Unexpected shift-by-immediate type");
214 KnownBits KnownAmtBits =
215 llvm::computeKnownBits(Amt, II.getModule()->getDataLayout());
216 if (KnownAmtBits.getMaxValue().ult(BitWidth)) {
217 Amt = Builder.CreateZExtOrTrunc(Amt, SVT);
218 Amt = Builder.CreateVectorSplat(VWidth, Amt);
219 return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt)
220 : Builder.CreateLShr(Vec, Amt))
221 : Builder.CreateAShr(Vec, Amt));
222 }
223 if (KnownAmtBits.getMinValue().uge(BitWidth)) {
224 if (LogicalShift)
225 return ConstantAggregateZero::get(VT);
226 Amt = ConstantInt::get(SVT, BitWidth - 1);
227 return Builder.CreateAShr(Vec, Builder.CreateVectorSplat(VWidth, Amt));
228 }
229 } else {
230 // Ensure the first element has an in-range value and the rest of the
231 // elements in the bottom 64 bits are zero.
232 assert(AmtVT->isVectorTy() && AmtVT->getPrimitiveSizeInBits() == 128 &&
233 cast<VectorType>(AmtVT)->getElementType() == SVT &&
234 "Unexpected shift-by-scalar type");
235 unsigned NumAmtElts = cast<FixedVectorType>(AmtVT)->getNumElements();
236 APInt DemandedLower = APInt::getOneBitSet(NumAmtElts, 0);
237 APInt DemandedUpper = APInt::getBitsSet(NumAmtElts, 1, NumAmtElts / 2);
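    // DemandedUpper covers the remaining sub-elements of the low 64 bits of
    // the shift-amount vector (it is empty when the elements are already
    // 64 bits wide); those must be known zero for the splat below to be valid.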
238 KnownBits KnownLowerBits = llvm::computeKnownBits(
239 Amt, DemandedLower, II.getModule()->getDataLayout());
240 KnownBits KnownUpperBits = llvm::computeKnownBits(
241 Amt, DemandedUpper, II.getModule()->getDataLayout());
242 if (KnownLowerBits.getMaxValue().ult(BitWidth) &&
243 (DemandedUpper.isZero() || KnownUpperBits.isZero())) {
244 SmallVector<int, 16> ZeroSplat(VWidth, 0);
245 Amt = Builder.CreateShuffleVector(Amt, ZeroSplat);
246 return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt)
247 : Builder.CreateLShr(Vec, Amt))
248 : Builder.CreateAShr(Vec, Amt));
249 }
250 }
251
252 // Simplify if count is constant vector.
253 auto *CDV = dyn_cast<ConstantDataVector>(Amt);
254 if (!CDV)
255 return nullptr;
256
  // SSE2/AVX2 uses only the first 64 bits of the 128-bit vector
  // operand to compute the shift amount.
259 assert(AmtVT->isVectorTy() && AmtVT->getPrimitiveSizeInBits() == 128 &&
260 cast<VectorType>(AmtVT)->getElementType() == SVT &&
261 "Unexpected shift-by-scalar type");
262
263 // Concatenate the sub-elements to create the 64-bit value.
264 APInt Count(64, 0);
265 for (unsigned i = 0, NumSubElts = 64 / BitWidth; i != NumSubElts; ++i) {
266 unsigned SubEltIdx = (NumSubElts - 1) - i;
267 auto *SubElt = cast<ConstantInt>(CDV->getElementAsConstant(SubEltIdx));
268 Count <<= BitWidth;
269 Count |= SubElt->getValue().zextOrTrunc(64);
270 }
271
272 // If shift-by-zero then just return the original value.
273 if (Count.isZero())
274 return Vec;
275
276 // Handle cases when Shift >= BitWidth.
277 if (Count.uge(BitWidth)) {
278 // If LogicalShift - just return zero.
279 if (LogicalShift)
280 return ConstantAggregateZero::get(VT);
281
282 // If ArithmeticShift - clamp Shift to (BitWidth - 1).
283 Count = APInt(64, BitWidth - 1);
284 }
285
286 // Get a constant vector of the same type as the first operand.
287 auto ShiftAmt = ConstantInt::get(SVT, Count.zextOrTrunc(BitWidth));
288 auto ShiftVec = Builder.CreateVectorSplat(VWidth, ShiftAmt);
289
290 if (ShiftLeft)
291 return Builder.CreateShl(Vec, ShiftVec);
292
293 if (LogicalShift)
294 return Builder.CreateLShr(Vec, ShiftVec);
295
296 return Builder.CreateAShr(Vec, ShiftVec);
297 }
298
299 // Attempt to simplify AVX2 per-element shift intrinsics to a generic IR shift.
300 // Unlike the generic IR shifts, the intrinsics have defined behaviour for out
301 // of range shift amounts (logical - set to zero, arithmetic - splat sign bit).
static Value *simplifyX86varShift(const IntrinsicInst &II,
                                  InstCombiner::BuilderTy &Builder) {
304 bool LogicalShift = false;
305 bool ShiftLeft = false;
306
307 switch (II.getIntrinsicID()) {
308 default:
309 llvm_unreachable("Unexpected intrinsic!");
310 case Intrinsic::x86_avx2_psrav_d:
311 case Intrinsic::x86_avx2_psrav_d_256:
312 case Intrinsic::x86_avx512_psrav_q_128:
313 case Intrinsic::x86_avx512_psrav_q_256:
314 case Intrinsic::x86_avx512_psrav_d_512:
315 case Intrinsic::x86_avx512_psrav_q_512:
316 case Intrinsic::x86_avx512_psrav_w_128:
317 case Intrinsic::x86_avx512_psrav_w_256:
318 case Intrinsic::x86_avx512_psrav_w_512:
319 LogicalShift = false;
320 ShiftLeft = false;
321 break;
322 case Intrinsic::x86_avx2_psrlv_d:
323 case Intrinsic::x86_avx2_psrlv_d_256:
324 case Intrinsic::x86_avx2_psrlv_q:
325 case Intrinsic::x86_avx2_psrlv_q_256:
326 case Intrinsic::x86_avx512_psrlv_d_512:
327 case Intrinsic::x86_avx512_psrlv_q_512:
328 case Intrinsic::x86_avx512_psrlv_w_128:
329 case Intrinsic::x86_avx512_psrlv_w_256:
330 case Intrinsic::x86_avx512_psrlv_w_512:
331 LogicalShift = true;
332 ShiftLeft = false;
333 break;
334 case Intrinsic::x86_avx2_psllv_d:
335 case Intrinsic::x86_avx2_psllv_d_256:
336 case Intrinsic::x86_avx2_psllv_q:
337 case Intrinsic::x86_avx2_psllv_q_256:
338 case Intrinsic::x86_avx512_psllv_d_512:
339 case Intrinsic::x86_avx512_psllv_q_512:
340 case Intrinsic::x86_avx512_psllv_w_128:
341 case Intrinsic::x86_avx512_psllv_w_256:
342 case Intrinsic::x86_avx512_psllv_w_512:
343 LogicalShift = true;
344 ShiftLeft = true;
345 break;
346 }
347 assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left");
348
349 Value *Vec = II.getArgOperand(0);
350 Value *Amt = II.getArgOperand(1);
351 auto *VT = cast<FixedVectorType>(II.getType());
352 Type *SVT = VT->getElementType();
353 int NumElts = VT->getNumElements();
354 int BitWidth = SVT->getIntegerBitWidth();
355
356 // If the shift amount is guaranteed to be in-range we can replace it with a
357 // generic shift.
358 KnownBits KnownAmt =
359 llvm::computeKnownBits(Amt, II.getModule()->getDataLayout());
360 if (KnownAmt.getMaxValue().ult(BitWidth)) {
361 return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt)
362 : Builder.CreateLShr(Vec, Amt))
363 : Builder.CreateAShr(Vec, Amt));
364 }
365
366 // Simplify if all shift amounts are constant/undef.
367 auto *CShift = dyn_cast<Constant>(Amt);
368 if (!CShift)
369 return nullptr;
370
371 // Collect each element's shift amount.
372 // We also collect special cases: UNDEF = -1, OUT-OF-RANGE = BitWidth.
373 bool AnyOutOfRange = false;
374 SmallVector<int, 8> ShiftAmts;
375 for (int I = 0; I < NumElts; ++I) {
376 auto *CElt = CShift->getAggregateElement(I);
377 if (isa_and_nonnull<UndefValue>(CElt)) {
378 ShiftAmts.push_back(-1);
379 continue;
380 }
381
382 auto *COp = dyn_cast_or_null<ConstantInt>(CElt);
383 if (!COp)
384 return nullptr;
385
386 // Handle out of range shifts.
387 // If LogicalShift - set to BitWidth (special case).
388 // If ArithmeticShift - set to (BitWidth - 1) (sign splat).
389 APInt ShiftVal = COp->getValue();
390 if (ShiftVal.uge(BitWidth)) {
391 AnyOutOfRange = LogicalShift;
392 ShiftAmts.push_back(LogicalShift ? BitWidth : BitWidth - 1);
393 continue;
394 }
395
396 ShiftAmts.push_back((int)ShiftVal.getZExtValue());
397 }
398
399 // If all elements out of range or UNDEF, return vector of zeros/undefs.
400 // ArithmeticShift should only hit this if they are all UNDEF.
401 auto OutOfRange = [&](int Idx) { return (Idx < 0) || (BitWidth <= Idx); };
402 if (llvm::all_of(ShiftAmts, OutOfRange)) {
403 SmallVector<Constant *, 8> ConstantVec;
404 for (int Idx : ShiftAmts) {
405 if (Idx < 0) {
406 ConstantVec.push_back(UndefValue::get(SVT));
407 } else {
408 assert(LogicalShift && "Logical shift expected");
409 ConstantVec.push_back(ConstantInt::getNullValue(SVT));
410 }
411 }
412 return ConstantVector::get(ConstantVec);
413 }
414
415 // We can't handle only some out of range values with generic logical shifts.
416 if (AnyOutOfRange)
417 return nullptr;
418
419 // Build the shift amount constant vector.
420 SmallVector<Constant *, 8> ShiftVecAmts;
421 for (int Idx : ShiftAmts) {
422 if (Idx < 0)
423 ShiftVecAmts.push_back(UndefValue::get(SVT));
424 else
425 ShiftVecAmts.push_back(ConstantInt::get(SVT, Idx));
426 }
427 auto ShiftVec = ConstantVector::get(ShiftVecAmts);
428
429 if (ShiftLeft)
430 return Builder.CreateShl(Vec, ShiftVec);
431
432 if (LogicalShift)
433 return Builder.CreateLShr(Vec, ShiftVec);
434
435 return Builder.CreateAShr(Vec, ShiftVec);
436 }
437
static Value *simplifyX86pack(IntrinsicInst &II,
                              InstCombiner::BuilderTy &Builder, bool IsSigned) {
440 Value *Arg0 = II.getArgOperand(0);
441 Value *Arg1 = II.getArgOperand(1);
442 Type *ResTy = II.getType();
443
444 // Fast all undef handling.
445 if (isa<UndefValue>(Arg0) && isa<UndefValue>(Arg1))
446 return UndefValue::get(ResTy);
447
448 auto *ArgTy = cast<FixedVectorType>(Arg0->getType());
449 unsigned NumLanes = ResTy->getPrimitiveSizeInBits() / 128;
450 unsigned NumSrcElts = ArgTy->getNumElements();
451 assert(cast<FixedVectorType>(ResTy)->getNumElements() == (2 * NumSrcElts) &&
452 "Unexpected packing types");
453
454 unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes;
455 unsigned DstScalarSizeInBits = ResTy->getScalarSizeInBits();
456 unsigned SrcScalarSizeInBits = ArgTy->getScalarSizeInBits();
457 assert(SrcScalarSizeInBits == (2 * DstScalarSizeInBits) &&
458 "Unexpected packing types");
459
460 // Constant folding.
461 if (!isa<Constant>(Arg0) || !isa<Constant>(Arg1))
462 return nullptr;
463
464 // Clamp Values - signed/unsigned both use signed clamp values, but they
465 // differ on the min/max values.
466 APInt MinValue, MaxValue;
467 if (IsSigned) {
468 // PACKSS: Truncate signed value with signed saturation.
469 // Source values less than dst minint are saturated to minint.
470 // Source values greater than dst maxint are saturated to maxint.
471 MinValue =
472 APInt::getSignedMinValue(DstScalarSizeInBits).sext(SrcScalarSizeInBits);
473 MaxValue =
474 APInt::getSignedMaxValue(DstScalarSizeInBits).sext(SrcScalarSizeInBits);
475 } else {
476 // PACKUS: Truncate signed value with unsigned saturation.
477 // Source values less than zero are saturated to zero.
478 // Source values greater than dst maxuint are saturated to maxuint.
479 MinValue = APInt::getZero(SrcScalarSizeInBits);
480 MaxValue = APInt::getLowBitsSet(SrcScalarSizeInBits, DstScalarSizeInBits);
481 }
482
483 auto *MinC = Constant::getIntegerValue(ArgTy, MinValue);
484 auto *MaxC = Constant::getIntegerValue(ArgTy, MaxValue);
485 Arg0 = Builder.CreateSelect(Builder.CreateICmpSLT(Arg0, MinC), MinC, Arg0);
486 Arg1 = Builder.CreateSelect(Builder.CreateICmpSLT(Arg1, MinC), MinC, Arg1);
487 Arg0 = Builder.CreateSelect(Builder.CreateICmpSGT(Arg0, MaxC), MaxC, Arg0);
488 Arg1 = Builder.CreateSelect(Builder.CreateICmpSGT(Arg1, MaxC), MaxC, Arg1);
489
490 // Shuffle clamped args together at the lane level.
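  // e.g. for the AVX2 v8i32 -> v16i16 pack this produces the mask
  // <0,1,2,3, 8,9,10,11, 4,5,6,7, 12,13,14,15>, interleaving the two sources
  // one 128-bit lane at a time.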
491 SmallVector<int, 32> PackMask;
492 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
493 for (unsigned Elt = 0; Elt != NumSrcEltsPerLane; ++Elt)
494 PackMask.push_back(Elt + (Lane * NumSrcEltsPerLane));
495 for (unsigned Elt = 0; Elt != NumSrcEltsPerLane; ++Elt)
496 PackMask.push_back(Elt + (Lane * NumSrcEltsPerLane) + NumSrcElts);
497 }
498 auto *Shuffle = Builder.CreateShuffleVector(Arg0, Arg1, PackMask);
499
500 // Truncate to dst size.
501 return Builder.CreateTrunc(Shuffle, ResTy);
502 }
503
static Value *simplifyX86movmsk(const IntrinsicInst &II,
                                InstCombiner::BuilderTy &Builder) {
506 Value *Arg = II.getArgOperand(0);
507 Type *ResTy = II.getType();
508
509 // movmsk(undef) -> zero as we must ensure the upper bits are zero.
510 if (isa<UndefValue>(Arg))
511 return Constant::getNullValue(ResTy);
512
513 auto *ArgTy = dyn_cast<FixedVectorType>(Arg->getType());
514 // We can't easily peek through x86_mmx types.
515 if (!ArgTy)
516 return nullptr;
517
518 // Expand MOVMSK to compare/bitcast/zext:
519 // e.g. PMOVMSKB(v16i8 x):
520 // %cmp = icmp slt <16 x i8> %x, zeroinitializer
521 // %int = bitcast <16 x i1> %cmp to i16
522 // %res = zext i16 %int to i32
523 unsigned NumElts = ArgTy->getNumElements();
524 Type *IntegerTy = Builder.getIntNTy(NumElts);
525
526 Value *Res = Builder.CreateBitCast(Arg, VectorType::getInteger(ArgTy));
527 Res = Builder.CreateIsNeg(Res);
528 Res = Builder.CreateBitCast(Res, IntegerTy);
529 Res = Builder.CreateZExtOrTrunc(Res, ResTy);
530 return Res;
531 }
532
static Value *simplifyX86addcarry(const IntrinsicInst &II,
                                  InstCombiner::BuilderTy &Builder) {
535 Value *CarryIn = II.getArgOperand(0);
536 Value *Op1 = II.getArgOperand(1);
537 Value *Op2 = II.getArgOperand(2);
538 Type *RetTy = II.getType();
539 Type *OpTy = Op1->getType();
540 assert(RetTy->getStructElementType(0)->isIntegerTy(8) &&
541 RetTy->getStructElementType(1) == OpTy && OpTy == Op2->getType() &&
542 "Unexpected types for x86 addcarry");
543
544 // If carry-in is zero, this is just an unsigned add with overflow.
545 if (match(CarryIn, PatternMatch::m_ZeroInt())) {
546 Value *UAdd = Builder.CreateIntrinsic(Intrinsic::uadd_with_overflow, OpTy,
547 {Op1, Op2});
548 // The types have to be adjusted to match the x86 call types.
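    // llvm.uadd.with.overflow returns { result, i1 overflow } while the x86
    // intrinsic returns { i8 carry-out, result }, so swap the order and
    // zero-extend the carry flag.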
549 Value *UAddResult = Builder.CreateExtractValue(UAdd, 0);
550 Value *UAddOV = Builder.CreateZExt(Builder.CreateExtractValue(UAdd, 1),
551 Builder.getInt8Ty());
552 Value *Res = PoisonValue::get(RetTy);
553 Res = Builder.CreateInsertValue(Res, UAddOV, 0);
554 return Builder.CreateInsertValue(Res, UAddResult, 1);
555 }
556
557 return nullptr;
558 }
559
static Value *simplifyX86insertps(const IntrinsicInst &II,
                                  InstCombiner::BuilderTy &Builder) {
562 auto *CInt = dyn_cast<ConstantInt>(II.getArgOperand(2));
563 if (!CInt)
564 return nullptr;
565
566 auto *VecTy = cast<FixedVectorType>(II.getType());
567 assert(VecTy->getNumElements() == 4 && "insertps with wrong vector type");
568
569 // The immediate permute control byte looks like this:
570 // [3:0] - zero mask for each 32-bit lane
571 // [5:4] - select one 32-bit destination lane
572 // [7:6] - select one 32-bit source lane
573
574 uint8_t Imm = CInt->getZExtValue();
575 uint8_t ZMask = Imm & 0xf;
576 uint8_t DestLane = (Imm >> 4) & 0x3;
577 uint8_t SourceLane = (Imm >> 6) & 0x3;
578
579 ConstantAggregateZero *ZeroVector = ConstantAggregateZero::get(VecTy);
580
581 // If all zero mask bits are set, this was just a weird way to
582 // generate a zero vector.
583 if (ZMask == 0xf)
584 return ZeroVector;
585
586 // Initialize by passing all of the first source bits through.
587 int ShuffleMask[4] = {0, 1, 2, 3};
588
589 // We may replace the second operand with the zero vector.
590 Value *V1 = II.getArgOperand(1);
591
592 if (ZMask) {
593 // If the zero mask is being used with a single input or the zero mask
594 // overrides the destination lane, this is a shuffle with the zero vector.
595 if ((II.getArgOperand(0) == II.getArgOperand(1)) ||
596 (ZMask & (1 << DestLane))) {
597 V1 = ZeroVector;
598 // We may still move 32-bits of the first source vector from one lane
599 // to another.
600 ShuffleMask[DestLane] = SourceLane;
601 // The zero mask may override the previous insert operation.
602 for (unsigned i = 0; i < 4; ++i)
603 if ((ZMask >> i) & 0x1)
604 ShuffleMask[i] = i + 4;
605 } else {
606 // TODO: Model this case as 2 shuffles or a 'logical and' plus shuffle?
607 return nullptr;
608 }
609 } else {
610 // Replace the selected destination lane with the selected source lane.
611 ShuffleMask[DestLane] = SourceLane + 4;
612 }
613
614 return Builder.CreateShuffleVector(II.getArgOperand(0), V1, ShuffleMask);
615 }
616
617 /// Attempt to simplify SSE4A EXTRQ/EXTRQI instructions using constant folding
618 /// or conversion to a shuffle vector.
static Value *simplifyX86extrq(IntrinsicInst &II, Value *Op0,
                               ConstantInt *CILength, ConstantInt *CIIndex,
                               InstCombiner::BuilderTy &Builder) {
622 auto LowConstantHighUndef = [&](uint64_t Val) {
623 Type *IntTy64 = Type::getInt64Ty(II.getContext());
624 Constant *Args[] = {ConstantInt::get(IntTy64, Val),
625 UndefValue::get(IntTy64)};
626 return ConstantVector::get(Args);
627 };
628
629 // See if we're dealing with constant values.
630 auto *C0 = dyn_cast<Constant>(Op0);
631 auto *CI0 =
632 C0 ? dyn_cast_or_null<ConstantInt>(C0->getAggregateElement((unsigned)0))
633 : nullptr;
634
635 // Attempt to constant fold.
636 if (CILength && CIIndex) {
637 // From AMD documentation: "The bit index and field length are each six
638 // bits in length other bits of the field are ignored."
639 APInt APIndex = CIIndex->getValue().zextOrTrunc(6);
640 APInt APLength = CILength->getValue().zextOrTrunc(6);
641
642 unsigned Index = APIndex.getZExtValue();
643
644 // From AMD documentation: "a value of zero in the field length is
645 // defined as length of 64".
646 unsigned Length = APLength == 0 ? 64 : APLength.getZExtValue();
647
648 // From AMD documentation: "If the sum of the bit index + length field
649 // is greater than 64, the results are undefined".
650 unsigned End = Index + Length;
651
652 // Note that both field index and field length are 8-bit quantities.
653 // Since variables 'Index' and 'Length' are unsigned values
654 // obtained from zero-extending field index and field length
655 // respectively, their sum should never wrap around.
656 if (End > 64)
657 return UndefValue::get(II.getType());
658
659 // If we are inserting whole bytes, we can convert this to a shuffle.
660 // Lowering can recognize EXTRQI shuffle masks.
661 if ((Length % 8) == 0 && (Index % 8) == 0) {
662 // Convert bit indices to byte indices.
663 Length /= 8;
664 Index /= 8;
665
666 Type *IntTy8 = Type::getInt8Ty(II.getContext());
667 auto *ShufTy = FixedVectorType::get(IntTy8, 16);
668
669 SmallVector<int, 16> ShuffleMask;
670 for (int i = 0; i != (int)Length; ++i)
671 ShuffleMask.push_back(i + Index);
672 for (int i = Length; i != 8; ++i)
673 ShuffleMask.push_back(i + 16);
674 for (int i = 8; i != 16; ++i)
675 ShuffleMask.push_back(-1);
676
677 Value *SV = Builder.CreateShuffleVector(
678 Builder.CreateBitCast(Op0, ShufTy),
679 ConstantAggregateZero::get(ShufTy), ShuffleMask);
680 return Builder.CreateBitCast(SV, II.getType());
681 }
682
683 // Constant Fold - shift Index'th bit to lowest position and mask off
684 // Length bits.
685 if (CI0) {
686 APInt Elt = CI0->getValue();
687 Elt.lshrInPlace(Index);
688 Elt = Elt.zextOrTrunc(Length);
689 return LowConstantHighUndef(Elt.getZExtValue());
690 }
691
692 // If we were an EXTRQ call, we'll save registers if we convert to EXTRQI.
693 if (II.getIntrinsicID() == Intrinsic::x86_sse4a_extrq) {
694 Value *Args[] = {Op0, CILength, CIIndex};
695 Module *M = II.getModule();
696 Function *F = Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_extrqi);
697 return Builder.CreateCall(F, Args);
698 }
699 }
700
701 // Constant Fold - extraction from zero is always {zero, undef}.
702 if (CI0 && CI0->isZero())
703 return LowConstantHighUndef(0);
704
705 return nullptr;
706 }
707
708 /// Attempt to simplify SSE4A INSERTQ/INSERTQI instructions using constant
709 /// folding or conversion to a shuffle vector.
static Value *simplifyX86insertq(IntrinsicInst &II, Value *Op0, Value *Op1,
                                 APInt APLength, APInt APIndex,
                                 InstCombiner::BuilderTy &Builder) {
713 // From AMD documentation: "The bit index and field length are each six bits
714 // in length other bits of the field are ignored."
715 APIndex = APIndex.zextOrTrunc(6);
716 APLength = APLength.zextOrTrunc(6);
717
718 // Attempt to constant fold.
719 unsigned Index = APIndex.getZExtValue();
720
721 // From AMD documentation: "a value of zero in the field length is
722 // defined as length of 64".
723 unsigned Length = APLength == 0 ? 64 : APLength.getZExtValue();
724
725 // From AMD documentation: "If the sum of the bit index + length field
726 // is greater than 64, the results are undefined".
727 unsigned End = Index + Length;
728
729 // Note that both field index and field length are 8-bit quantities.
730 // Since variables 'Index' and 'Length' are unsigned values
731 // obtained from zero-extending field index and field length
732 // respectively, their sum should never wrap around.
733 if (End > 64)
734 return UndefValue::get(II.getType());
735
736 // If we are inserting whole bytes, we can convert this to a shuffle.
737 // Lowering can recognize INSERTQI shuffle masks.
738 if ((Length % 8) == 0 && (Index % 8) == 0) {
739 // Convert bit indices to byte indices.
740 Length /= 8;
741 Index /= 8;
742
743 Type *IntTy8 = Type::getInt8Ty(II.getContext());
744 auto *ShufTy = FixedVectorType::get(IntTy8, 16);
745
746 SmallVector<int, 16> ShuffleMask;
747 for (int i = 0; i != (int)Index; ++i)
748 ShuffleMask.push_back(i);
749 for (int i = 0; i != (int)Length; ++i)
750 ShuffleMask.push_back(i + 16);
751 for (int i = Index + Length; i != 8; ++i)
752 ShuffleMask.push_back(i);
753 for (int i = 8; i != 16; ++i)
754 ShuffleMask.push_back(-1);
755
756 Value *SV = Builder.CreateShuffleVector(Builder.CreateBitCast(Op0, ShufTy),
757 Builder.CreateBitCast(Op1, ShufTy),
758 ShuffleMask);
759 return Builder.CreateBitCast(SV, II.getType());
760 }
761
762 // See if we're dealing with constant values.
763 auto *C0 = dyn_cast<Constant>(Op0);
764 auto *C1 = dyn_cast<Constant>(Op1);
765 auto *CI00 =
766 C0 ? dyn_cast_or_null<ConstantInt>(C0->getAggregateElement((unsigned)0))
767 : nullptr;
768 auto *CI10 =
769 C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)0))
770 : nullptr;
771
772 // Constant Fold - insert bottom Length bits starting at the Index'th bit.
773 if (CI00 && CI10) {
774 APInt V00 = CI00->getValue();
775 APInt V10 = CI10->getValue();
776 APInt Mask = APInt::getLowBitsSet(64, Length).shl(Index);
777 V00 = V00 & ~Mask;
778 V10 = V10.zextOrTrunc(Length).zextOrTrunc(64).shl(Index);
779 APInt Val = V00 | V10;
780 Type *IntTy64 = Type::getInt64Ty(II.getContext());
781 Constant *Args[] = {ConstantInt::get(IntTy64, Val.getZExtValue()),
782 UndefValue::get(IntTy64)};
783 return ConstantVector::get(Args);
784 }
785
786 // If we were an INSERTQ call, we'll save demanded elements if we convert to
787 // INSERTQI.
788 if (II.getIntrinsicID() == Intrinsic::x86_sse4a_insertq) {
789 Type *IntTy8 = Type::getInt8Ty(II.getContext());
790 Constant *CILength = ConstantInt::get(IntTy8, Length, false);
791 Constant *CIIndex = ConstantInt::get(IntTy8, Index, false);
792
793 Value *Args[] = {Op0, Op1, CILength, CIIndex};
794 Module *M = II.getModule();
795 Function *F = Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_insertqi);
796 return Builder.CreateCall(F, Args);
797 }
798
799 return nullptr;
800 }
801
802 /// Attempt to convert pshufb* to shufflevector if the mask is constant.
static Value *simplifyX86pshufb(const IntrinsicInst &II,
                                InstCombiner::BuilderTy &Builder) {
805 auto *V = dyn_cast<Constant>(II.getArgOperand(1));
806 if (!V)
807 return nullptr;
808
809 auto *VecTy = cast<FixedVectorType>(II.getType());
810 unsigned NumElts = VecTy->getNumElements();
811 assert((NumElts == 16 || NumElts == 32 || NumElts == 64) &&
812 "Unexpected number of elements in shuffle mask!");
813
814 // Construct a shuffle mask from constant integers or UNDEFs.
815 int Indexes[64];
816
817 // Each byte in the shuffle control mask forms an index to permute the
818 // corresponding byte in the destination operand.
819 for (unsigned I = 0; I < NumElts; ++I) {
820 Constant *COp = V->getAggregateElement(I);
821 if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
822 return nullptr;
823
824 if (isa<UndefValue>(COp)) {
825 Indexes[I] = -1;
826 continue;
827 }
828
829 int8_t Index = cast<ConstantInt>(COp)->getValue().getZExtValue();
830
831 // If the most significant bit (bit[7]) of each byte of the shuffle
832 // control mask is set, then zero is written in the result byte.
833 // The zero vector is in the right-hand side of the resulting
834 // shufflevector.
835
836 // The value of each index for the high 128-bit lane is the least
837 // significant 4 bits of the respective shuffle control byte.
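    // Adding (I & 0xF0) keeps each index inside its own 128-bit lane; zeroed
    // bytes (MSB set) are redirected into the zero-vector second operand,
    // which occupies indices [NumElts, 2*NumElts).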
838 Index = ((Index < 0) ? NumElts : Index & 0x0F) + (I & 0xF0);
839 Indexes[I] = Index;
840 }
841
842 auto V1 = II.getArgOperand(0);
843 auto V2 = Constant::getNullValue(VecTy);
844 return Builder.CreateShuffleVector(V1, V2, ArrayRef(Indexes, NumElts));
845 }
846
847 /// Attempt to convert vpermilvar* to shufflevector if the mask is constant.
static Value *simplifyX86vpermilvar(const IntrinsicInst &II,
                                    InstCombiner::BuilderTy &Builder) {
850 auto *V = dyn_cast<Constant>(II.getArgOperand(1));
851 if (!V)
852 return nullptr;
853
854 auto *VecTy = cast<FixedVectorType>(II.getType());
855 unsigned NumElts = VecTy->getNumElements();
856 bool IsPD = VecTy->getScalarType()->isDoubleTy();
857 unsigned NumLaneElts = IsPD ? 2 : 4;
858 assert(NumElts == 16 || NumElts == 8 || NumElts == 4 || NumElts == 2);
859
860 // Construct a shuffle mask from constant integers or UNDEFs.
861 int Indexes[16];
862
863 // The intrinsics only read one or two bits, clear the rest.
864 for (unsigned I = 0; I < NumElts; ++I) {
865 Constant *COp = V->getAggregateElement(I);
866 if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
867 return nullptr;
868
869 if (isa<UndefValue>(COp)) {
870 Indexes[I] = -1;
871 continue;
872 }
873
874 APInt Index = cast<ConstantInt>(COp)->getValue();
875 Index = Index.zextOrTrunc(32).getLoBits(2);
876
    // The PD variants use bit 1 to select the per-lane element index, so
    // shift down to convert to a generic shuffle mask index.
879 if (IsPD)
880 Index.lshrInPlace(1);
881
    // The _256 variants are a bit trickier since the mask bits always index
    // into the corresponding 128-bit half. In order to convert to a generic
    // shuffle, we have to make that explicit.
885 Index += APInt(32, (I / NumLaneElts) * NumLaneElts);
886
887 Indexes[I] = Index.getZExtValue();
888 }
889
890 auto V1 = II.getArgOperand(0);
891 return Builder.CreateShuffleVector(V1, ArrayRef(Indexes, NumElts));
892 }
893
894 /// Attempt to convert vpermd/vpermps to shufflevector if the mask is constant.
static Value *simplifyX86vpermv(const IntrinsicInst &II,
                                InstCombiner::BuilderTy &Builder) {
897 auto *V = dyn_cast<Constant>(II.getArgOperand(1));
898 if (!V)
899 return nullptr;
900
901 auto *VecTy = cast<FixedVectorType>(II.getType());
902 unsigned Size = VecTy->getNumElements();
903 assert((Size == 4 || Size == 8 || Size == 16 || Size == 32 || Size == 64) &&
904 "Unexpected shuffle mask size");
905
906 // Construct a shuffle mask from constant integers or UNDEFs.
907 int Indexes[64];
908
909 for (unsigned I = 0; I < Size; ++I) {
910 Constant *COp = V->getAggregateElement(I);
911 if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
912 return nullptr;
913
914 if (isa<UndefValue>(COp)) {
915 Indexes[I] = -1;
916 continue;
917 }
918
919 uint32_t Index = cast<ConstantInt>(COp)->getZExtValue();
920 Index &= Size - 1;
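    // The intrinsics ignore any index bits above those needed to address Size
    // elements, so masking with (Size - 1) yields the effective index.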
921 Indexes[I] = Index;
922 }
923
924 auto V1 = II.getArgOperand(0);
925 return Builder.CreateShuffleVector(V1, ArrayRef(Indexes, Size));
926 }
927
std::optional<Instruction *>
X86TTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
930 auto SimplifyDemandedVectorEltsLow = [&IC](Value *Op, unsigned Width,
931 unsigned DemandedWidth) {
932 APInt UndefElts(Width, 0);
933 APInt DemandedElts = APInt::getLowBitsSet(Width, DemandedWidth);
934 return IC.SimplifyDemandedVectorElts(Op, DemandedElts, UndefElts);
935 };
936
937 Intrinsic::ID IID = II.getIntrinsicID();
938 switch (IID) {
939 case Intrinsic::x86_bmi_bextr_32:
940 case Intrinsic::x86_bmi_bextr_64:
941 case Intrinsic::x86_tbm_bextri_u32:
942 case Intrinsic::x86_tbm_bextri_u64:
943 // If the RHS is a constant we can try some simplifications.
944 if (auto *C = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
945 uint64_t Shift = C->getZExtValue();
946 uint64_t Length = (Shift >> 8) & 0xff;
947 Shift &= 0xff;
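      // The control operand packs the field length in bits [15:8] and the
      // start position in bits [7:0]; e.g. a control of 0x0808 extracts 8 bits
      // starting at bit 8, so bextr(0x12345678, 0x0808) == 0x56.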
948 unsigned BitWidth = II.getType()->getIntegerBitWidth();
949 // If the length is 0 or the shift is out of range, replace with zero.
950 if (Length == 0 || Shift >= BitWidth) {
951 return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
952 }
953 // If the LHS is also a constant, we can completely constant fold this.
954 if (auto *InC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
955 uint64_t Result = InC->getZExtValue() >> Shift;
956 if (Length > BitWidth)
957 Length = BitWidth;
958 Result &= maskTrailingOnes<uint64_t>(Length);
959 return IC.replaceInstUsesWith(II,
960 ConstantInt::get(II.getType(), Result));
961 }
962 // TODO should we turn this into 'and' if shift is 0? Or 'shl' if we
963 // are only masking bits that a shift already cleared?
964 }
965 break;
966
967 case Intrinsic::x86_bmi_bzhi_32:
968 case Intrinsic::x86_bmi_bzhi_64:
969 // If the RHS is a constant we can try some simplifications.
970 if (auto *C = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
971 uint64_t Index = C->getZExtValue() & 0xff;
972 unsigned BitWidth = II.getType()->getIntegerBitWidth();
973 if (Index >= BitWidth) {
974 return IC.replaceInstUsesWith(II, II.getArgOperand(0));
975 }
976 if (Index == 0) {
977 return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
978 }
979 // If the LHS is also a constant, we can completely constant fold this.
980 if (auto *InC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
981 uint64_t Result = InC->getZExtValue();
982 Result &= maskTrailingOnes<uint64_t>(Index);
983 return IC.replaceInstUsesWith(II,
984 ConstantInt::get(II.getType(), Result));
985 }
986 // TODO should we convert this to an AND if the RHS is constant?
987 }
988 break;
989 case Intrinsic::x86_bmi_pext_32:
990 case Intrinsic::x86_bmi_pext_64:
991 if (auto *MaskC = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
992 if (MaskC->isNullValue()) {
993 return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
994 }
995 if (MaskC->isAllOnesValue()) {
996 return IC.replaceInstUsesWith(II, II.getArgOperand(0));
997 }
998
999 unsigned MaskIdx, MaskLen;
1000 if (MaskC->getValue().isShiftedMask(MaskIdx, MaskLen)) {
        // Any single contiguous sequence of 1s anywhere in the mask simply
        // describes a subset of the input bits shifted to the appropriate
        // position. Replace with the straightforward IR.
1004 Value *Input = II.getArgOperand(0);
1005 Value *Masked = IC.Builder.CreateAnd(Input, II.getArgOperand(1));
1006 Value *ShiftAmt = ConstantInt::get(II.getType(), MaskIdx);
1007 Value *Shifted = IC.Builder.CreateLShr(Masked, ShiftAmt);
1008 return IC.replaceInstUsesWith(II, Shifted);
1009 }
1010
1011 if (auto *SrcC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
1012 uint64_t Src = SrcC->getZExtValue();
1013 uint64_t Mask = MaskC->getZExtValue();
1014 uint64_t Result = 0;
1015 uint64_t BitToSet = 1;
1016
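        // e.g. pext(0b101010, 0b001110) gathers bits 1..3 of the source
        // (1, 0, 1) into the low bits of the result, giving 0b101.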
1017 while (Mask) {
1018 // Isolate lowest set bit.
1019 uint64_t BitToTest = Mask & -Mask;
1020 if (BitToTest & Src)
1021 Result |= BitToSet;
1022
1023 BitToSet <<= 1;
1024 // Clear lowest set bit.
1025 Mask &= Mask - 1;
1026 }
1027
1028 return IC.replaceInstUsesWith(II,
1029 ConstantInt::get(II.getType(), Result));
1030 }
1031 }
1032 break;
1033 case Intrinsic::x86_bmi_pdep_32:
1034 case Intrinsic::x86_bmi_pdep_64:
1035 if (auto *MaskC = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
1036 if (MaskC->isNullValue()) {
1037 return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
1038 }
1039 if (MaskC->isAllOnesValue()) {
1040 return IC.replaceInstUsesWith(II, II.getArgOperand(0));
1041 }
1042
1043 unsigned MaskIdx, MaskLen;
1044 if (MaskC->getValue().isShiftedMask(MaskIdx, MaskLen)) {
        // Any single contiguous sequence of 1s anywhere in the mask simply
        // describes a subset of the input bits shifted to the appropriate
        // position. Replace with the straightforward IR.
1048 Value *Input = II.getArgOperand(0);
1049 Value *ShiftAmt = ConstantInt::get(II.getType(), MaskIdx);
1050 Value *Shifted = IC.Builder.CreateShl(Input, ShiftAmt);
1051 Value *Masked = IC.Builder.CreateAnd(Shifted, II.getArgOperand(1));
1052 return IC.replaceInstUsesWith(II, Masked);
1053 }
1054
1055 if (auto *SrcC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
1056 uint64_t Src = SrcC->getZExtValue();
1057 uint64_t Mask = MaskC->getZExtValue();
1058 uint64_t Result = 0;
1059 uint64_t BitToTest = 1;
1060
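        // e.g. pdep(0b101, 0b001110) scatters the low source bits into mask
        // positions 1..3, giving 0b001010.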
1061 while (Mask) {
1062 // Isolate lowest set bit.
1063 uint64_t BitToSet = Mask & -Mask;
1064 if (BitToTest & Src)
1065 Result |= BitToSet;
1066
1067 BitToTest <<= 1;
          // Clear lowest set bit.
1069 Mask &= Mask - 1;
1070 }
1071
1072 return IC.replaceInstUsesWith(II,
1073 ConstantInt::get(II.getType(), Result));
1074 }
1075 }
1076 break;
1077
1078 case Intrinsic::x86_sse_cvtss2si:
1079 case Intrinsic::x86_sse_cvtss2si64:
1080 case Intrinsic::x86_sse_cvttss2si:
1081 case Intrinsic::x86_sse_cvttss2si64:
1082 case Intrinsic::x86_sse2_cvtsd2si:
1083 case Intrinsic::x86_sse2_cvtsd2si64:
1084 case Intrinsic::x86_sse2_cvttsd2si:
1085 case Intrinsic::x86_sse2_cvttsd2si64:
1086 case Intrinsic::x86_avx512_vcvtss2si32:
1087 case Intrinsic::x86_avx512_vcvtss2si64:
1088 case Intrinsic::x86_avx512_vcvtss2usi32:
1089 case Intrinsic::x86_avx512_vcvtss2usi64:
1090 case Intrinsic::x86_avx512_vcvtsd2si32:
1091 case Intrinsic::x86_avx512_vcvtsd2si64:
1092 case Intrinsic::x86_avx512_vcvtsd2usi32:
1093 case Intrinsic::x86_avx512_vcvtsd2usi64:
1094 case Intrinsic::x86_avx512_cvttss2si:
1095 case Intrinsic::x86_avx512_cvttss2si64:
1096 case Intrinsic::x86_avx512_cvttss2usi:
1097 case Intrinsic::x86_avx512_cvttss2usi64:
1098 case Intrinsic::x86_avx512_cvttsd2si:
1099 case Intrinsic::x86_avx512_cvttsd2si64:
1100 case Intrinsic::x86_avx512_cvttsd2usi:
1101 case Intrinsic::x86_avx512_cvttsd2usi64: {
1102 // These intrinsics only demand the 0th element of their input vectors. If
1103 // we can simplify the input based on that, do so now.
1104 Value *Arg = II.getArgOperand(0);
1105 unsigned VWidth = cast<FixedVectorType>(Arg->getType())->getNumElements();
1106 if (Value *V = SimplifyDemandedVectorEltsLow(Arg, VWidth, 1)) {
1107 return IC.replaceOperand(II, 0, V);
1108 }
1109 break;
1110 }
1111
1112 case Intrinsic::x86_mmx_pmovmskb:
1113 case Intrinsic::x86_sse_movmsk_ps:
1114 case Intrinsic::x86_sse2_movmsk_pd:
1115 case Intrinsic::x86_sse2_pmovmskb_128:
1116 case Intrinsic::x86_avx_movmsk_pd_256:
1117 case Intrinsic::x86_avx_movmsk_ps_256:
1118 case Intrinsic::x86_avx2_pmovmskb:
1119 if (Value *V = simplifyX86movmsk(II, IC.Builder)) {
1120 return IC.replaceInstUsesWith(II, V);
1121 }
1122 break;
1123
1124 case Intrinsic::x86_sse_comieq_ss:
1125 case Intrinsic::x86_sse_comige_ss:
1126 case Intrinsic::x86_sse_comigt_ss:
1127 case Intrinsic::x86_sse_comile_ss:
1128 case Intrinsic::x86_sse_comilt_ss:
1129 case Intrinsic::x86_sse_comineq_ss:
1130 case Intrinsic::x86_sse_ucomieq_ss:
1131 case Intrinsic::x86_sse_ucomige_ss:
1132 case Intrinsic::x86_sse_ucomigt_ss:
1133 case Intrinsic::x86_sse_ucomile_ss:
1134 case Intrinsic::x86_sse_ucomilt_ss:
1135 case Intrinsic::x86_sse_ucomineq_ss:
1136 case Intrinsic::x86_sse2_comieq_sd:
1137 case Intrinsic::x86_sse2_comige_sd:
1138 case Intrinsic::x86_sse2_comigt_sd:
1139 case Intrinsic::x86_sse2_comile_sd:
1140 case Intrinsic::x86_sse2_comilt_sd:
1141 case Intrinsic::x86_sse2_comineq_sd:
1142 case Intrinsic::x86_sse2_ucomieq_sd:
1143 case Intrinsic::x86_sse2_ucomige_sd:
1144 case Intrinsic::x86_sse2_ucomigt_sd:
1145 case Intrinsic::x86_sse2_ucomile_sd:
1146 case Intrinsic::x86_sse2_ucomilt_sd:
1147 case Intrinsic::x86_sse2_ucomineq_sd:
1148 case Intrinsic::x86_avx512_vcomi_ss:
1149 case Intrinsic::x86_avx512_vcomi_sd:
1150 case Intrinsic::x86_avx512_mask_cmp_ss:
1151 case Intrinsic::x86_avx512_mask_cmp_sd: {
1152 // These intrinsics only demand the 0th element of their input vectors. If
1153 // we can simplify the input based on that, do so now.
1154 bool MadeChange = false;
1155 Value *Arg0 = II.getArgOperand(0);
1156 Value *Arg1 = II.getArgOperand(1);
1157 unsigned VWidth = cast<FixedVectorType>(Arg0->getType())->getNumElements();
1158 if (Value *V = SimplifyDemandedVectorEltsLow(Arg0, VWidth, 1)) {
1159 IC.replaceOperand(II, 0, V);
1160 MadeChange = true;
1161 }
1162 if (Value *V = SimplifyDemandedVectorEltsLow(Arg1, VWidth, 1)) {
1163 IC.replaceOperand(II, 1, V);
1164 MadeChange = true;
1165 }
1166 if (MadeChange) {
1167 return &II;
1168 }
1169 break;
1170 }
1171
1172 case Intrinsic::x86_avx512_add_ps_512:
1173 case Intrinsic::x86_avx512_div_ps_512:
1174 case Intrinsic::x86_avx512_mul_ps_512:
1175 case Intrinsic::x86_avx512_sub_ps_512:
1176 case Intrinsic::x86_avx512_add_pd_512:
1177 case Intrinsic::x86_avx512_div_pd_512:
1178 case Intrinsic::x86_avx512_mul_pd_512:
1179 case Intrinsic::x86_avx512_sub_pd_512:
1180 // If the rounding mode is CUR_DIRECTION(4) we can turn these into regular
1181 // IR operations.
1182 if (auto *R = dyn_cast<ConstantInt>(II.getArgOperand(2))) {
1183 if (R->getValue() == 4) {
1184 Value *Arg0 = II.getArgOperand(0);
1185 Value *Arg1 = II.getArgOperand(1);
1186
1187 Value *V;
1188 switch (IID) {
1189 default:
1190 llvm_unreachable("Case stmts out of sync!");
1191 case Intrinsic::x86_avx512_add_ps_512:
1192 case Intrinsic::x86_avx512_add_pd_512:
1193 V = IC.Builder.CreateFAdd(Arg0, Arg1);
1194 break;
1195 case Intrinsic::x86_avx512_sub_ps_512:
1196 case Intrinsic::x86_avx512_sub_pd_512:
1197 V = IC.Builder.CreateFSub(Arg0, Arg1);
1198 break;
1199 case Intrinsic::x86_avx512_mul_ps_512:
1200 case Intrinsic::x86_avx512_mul_pd_512:
1201 V = IC.Builder.CreateFMul(Arg0, Arg1);
1202 break;
1203 case Intrinsic::x86_avx512_div_ps_512:
1204 case Intrinsic::x86_avx512_div_pd_512:
1205 V = IC.Builder.CreateFDiv(Arg0, Arg1);
1206 break;
1207 }
1208
1209 return IC.replaceInstUsesWith(II, V);
1210 }
1211 }
1212 break;
1213
1214 case Intrinsic::x86_avx512_mask_add_ss_round:
1215 case Intrinsic::x86_avx512_mask_div_ss_round:
1216 case Intrinsic::x86_avx512_mask_mul_ss_round:
1217 case Intrinsic::x86_avx512_mask_sub_ss_round:
1218 case Intrinsic::x86_avx512_mask_add_sd_round:
1219 case Intrinsic::x86_avx512_mask_div_sd_round:
1220 case Intrinsic::x86_avx512_mask_mul_sd_round:
1221 case Intrinsic::x86_avx512_mask_sub_sd_round:
1222 // If the rounding mode is CUR_DIRECTION(4) we can turn these into regular
1223 // IR operations.
1224 if (auto *R = dyn_cast<ConstantInt>(II.getArgOperand(4))) {
1225 if (R->getValue() == 4) {
1226 // Extract the element as scalars.
1227 Value *Arg0 = II.getArgOperand(0);
1228 Value *Arg1 = II.getArgOperand(1);
1229 Value *LHS = IC.Builder.CreateExtractElement(Arg0, (uint64_t)0);
1230 Value *RHS = IC.Builder.CreateExtractElement(Arg1, (uint64_t)0);
1231
1232 Value *V;
1233 switch (IID) {
1234 default:
1235 llvm_unreachable("Case stmts out of sync!");
1236 case Intrinsic::x86_avx512_mask_add_ss_round:
1237 case Intrinsic::x86_avx512_mask_add_sd_round:
1238 V = IC.Builder.CreateFAdd(LHS, RHS);
1239 break;
1240 case Intrinsic::x86_avx512_mask_sub_ss_round:
1241 case Intrinsic::x86_avx512_mask_sub_sd_round:
1242 V = IC.Builder.CreateFSub(LHS, RHS);
1243 break;
1244 case Intrinsic::x86_avx512_mask_mul_ss_round:
1245 case Intrinsic::x86_avx512_mask_mul_sd_round:
1246 V = IC.Builder.CreateFMul(LHS, RHS);
1247 break;
1248 case Intrinsic::x86_avx512_mask_div_ss_round:
1249 case Intrinsic::x86_avx512_mask_div_sd_round:
1250 V = IC.Builder.CreateFDiv(LHS, RHS);
1251 break;
1252 }
1253
1254 // Handle the masking aspect of the intrinsic.
1255 Value *Mask = II.getArgOperand(3);
1256 auto *C = dyn_cast<ConstantInt>(Mask);
1257 // We don't need a select if we know the mask bit is a 1.
1258 if (!C || !C->getValue()[0]) {
1259 // Cast the mask to an i1 vector and then extract the lowest element.
1260 auto *MaskTy = FixedVectorType::get(
1261 IC.Builder.getInt1Ty(),
1262 cast<IntegerType>(Mask->getType())->getBitWidth());
1263 Mask = IC.Builder.CreateBitCast(Mask, MaskTy);
1264 Mask = IC.Builder.CreateExtractElement(Mask, (uint64_t)0);
1265 // Extract the lowest element from the passthru operand.
1266 Value *Passthru =
1267 IC.Builder.CreateExtractElement(II.getArgOperand(2), (uint64_t)0);
1268 V = IC.Builder.CreateSelect(Mask, V, Passthru);
1269 }
1270
1271 // Insert the result back into the original argument 0.
1272 V = IC.Builder.CreateInsertElement(Arg0, V, (uint64_t)0);
1273
1274 return IC.replaceInstUsesWith(II, V);
1275 }
1276 }
1277 break;
1278
1279 // Constant fold ashr( <A x Bi>, Ci ).
1280 // Constant fold lshr( <A x Bi>, Ci ).
1281 // Constant fold shl( <A x Bi>, Ci ).
1282 case Intrinsic::x86_sse2_psrai_d:
1283 case Intrinsic::x86_sse2_psrai_w:
1284 case Intrinsic::x86_avx2_psrai_d:
1285 case Intrinsic::x86_avx2_psrai_w:
1286 case Intrinsic::x86_avx512_psrai_q_128:
1287 case Intrinsic::x86_avx512_psrai_q_256:
1288 case Intrinsic::x86_avx512_psrai_d_512:
1289 case Intrinsic::x86_avx512_psrai_q_512:
1290 case Intrinsic::x86_avx512_psrai_w_512:
1291 case Intrinsic::x86_sse2_psrli_d:
1292 case Intrinsic::x86_sse2_psrli_q:
1293 case Intrinsic::x86_sse2_psrli_w:
1294 case Intrinsic::x86_avx2_psrli_d:
1295 case Intrinsic::x86_avx2_psrli_q:
1296 case Intrinsic::x86_avx2_psrli_w:
1297 case Intrinsic::x86_avx512_psrli_d_512:
1298 case Intrinsic::x86_avx512_psrli_q_512:
1299 case Intrinsic::x86_avx512_psrli_w_512:
1300 case Intrinsic::x86_sse2_pslli_d:
1301 case Intrinsic::x86_sse2_pslli_q:
1302 case Intrinsic::x86_sse2_pslli_w:
1303 case Intrinsic::x86_avx2_pslli_d:
1304 case Intrinsic::x86_avx2_pslli_q:
1305 case Intrinsic::x86_avx2_pslli_w:
1306 case Intrinsic::x86_avx512_pslli_d_512:
1307 case Intrinsic::x86_avx512_pslli_q_512:
1308 case Intrinsic::x86_avx512_pslli_w_512:
1309 if (Value *V = simplifyX86immShift(II, IC.Builder)) {
1310 return IC.replaceInstUsesWith(II, V);
1311 }
1312 break;
1313
1314 case Intrinsic::x86_sse2_psra_d:
1315 case Intrinsic::x86_sse2_psra_w:
1316 case Intrinsic::x86_avx2_psra_d:
1317 case Intrinsic::x86_avx2_psra_w:
1318 case Intrinsic::x86_avx512_psra_q_128:
1319 case Intrinsic::x86_avx512_psra_q_256:
1320 case Intrinsic::x86_avx512_psra_d_512:
1321 case Intrinsic::x86_avx512_psra_q_512:
1322 case Intrinsic::x86_avx512_psra_w_512:
1323 case Intrinsic::x86_sse2_psrl_d:
1324 case Intrinsic::x86_sse2_psrl_q:
1325 case Intrinsic::x86_sse2_psrl_w:
1326 case Intrinsic::x86_avx2_psrl_d:
1327 case Intrinsic::x86_avx2_psrl_q:
1328 case Intrinsic::x86_avx2_psrl_w:
1329 case Intrinsic::x86_avx512_psrl_d_512:
1330 case Intrinsic::x86_avx512_psrl_q_512:
1331 case Intrinsic::x86_avx512_psrl_w_512:
1332 case Intrinsic::x86_sse2_psll_d:
1333 case Intrinsic::x86_sse2_psll_q:
1334 case Intrinsic::x86_sse2_psll_w:
1335 case Intrinsic::x86_avx2_psll_d:
1336 case Intrinsic::x86_avx2_psll_q:
1337 case Intrinsic::x86_avx2_psll_w:
1338 case Intrinsic::x86_avx512_psll_d_512:
1339 case Intrinsic::x86_avx512_psll_q_512:
1340 case Intrinsic::x86_avx512_psll_w_512: {
1341 if (Value *V = simplifyX86immShift(II, IC.Builder)) {
1342 return IC.replaceInstUsesWith(II, V);
1343 }
1344
1345 // SSE2/AVX2 uses only the first 64-bits of the 128-bit vector
1346 // operand to compute the shift amount.
1347 Value *Arg1 = II.getArgOperand(1);
1348 assert(Arg1->getType()->getPrimitiveSizeInBits() == 128 &&
1349 "Unexpected packed shift size");
1350 unsigned VWidth = cast<FixedVectorType>(Arg1->getType())->getNumElements();
1351
1352 if (Value *V = SimplifyDemandedVectorEltsLow(Arg1, VWidth, VWidth / 2)) {
1353 return IC.replaceOperand(II, 1, V);
1354 }
1355 break;
1356 }
1357
1358 case Intrinsic::x86_avx2_psllv_d:
1359 case Intrinsic::x86_avx2_psllv_d_256:
1360 case Intrinsic::x86_avx2_psllv_q:
1361 case Intrinsic::x86_avx2_psllv_q_256:
1362 case Intrinsic::x86_avx512_psllv_d_512:
1363 case Intrinsic::x86_avx512_psllv_q_512:
1364 case Intrinsic::x86_avx512_psllv_w_128:
1365 case Intrinsic::x86_avx512_psllv_w_256:
1366 case Intrinsic::x86_avx512_psllv_w_512:
1367 case Intrinsic::x86_avx2_psrav_d:
1368 case Intrinsic::x86_avx2_psrav_d_256:
1369 case Intrinsic::x86_avx512_psrav_q_128:
1370 case Intrinsic::x86_avx512_psrav_q_256:
1371 case Intrinsic::x86_avx512_psrav_d_512:
1372 case Intrinsic::x86_avx512_psrav_q_512:
1373 case Intrinsic::x86_avx512_psrav_w_128:
1374 case Intrinsic::x86_avx512_psrav_w_256:
1375 case Intrinsic::x86_avx512_psrav_w_512:
1376 case Intrinsic::x86_avx2_psrlv_d:
1377 case Intrinsic::x86_avx2_psrlv_d_256:
1378 case Intrinsic::x86_avx2_psrlv_q:
1379 case Intrinsic::x86_avx2_psrlv_q_256:
1380 case Intrinsic::x86_avx512_psrlv_d_512:
1381 case Intrinsic::x86_avx512_psrlv_q_512:
1382 case Intrinsic::x86_avx512_psrlv_w_128:
1383 case Intrinsic::x86_avx512_psrlv_w_256:
1384 case Intrinsic::x86_avx512_psrlv_w_512:
1385 if (Value *V = simplifyX86varShift(II, IC.Builder)) {
1386 return IC.replaceInstUsesWith(II, V);
1387 }
1388 break;
1389
1390 case Intrinsic::x86_sse2_packssdw_128:
1391 case Intrinsic::x86_sse2_packsswb_128:
1392 case Intrinsic::x86_avx2_packssdw:
1393 case Intrinsic::x86_avx2_packsswb:
1394 case Intrinsic::x86_avx512_packssdw_512:
1395 case Intrinsic::x86_avx512_packsswb_512:
1396 if (Value *V = simplifyX86pack(II, IC.Builder, true)) {
1397 return IC.replaceInstUsesWith(II, V);
1398 }
1399 break;
1400
1401 case Intrinsic::x86_sse2_packuswb_128:
1402 case Intrinsic::x86_sse41_packusdw:
1403 case Intrinsic::x86_avx2_packusdw:
1404 case Intrinsic::x86_avx2_packuswb:
1405 case Intrinsic::x86_avx512_packusdw_512:
1406 case Intrinsic::x86_avx512_packuswb_512:
1407 if (Value *V = simplifyX86pack(II, IC.Builder, false)) {
1408 return IC.replaceInstUsesWith(II, V);
1409 }
1410 break;
1411
1412 case Intrinsic::x86_pclmulqdq:
1413 case Intrinsic::x86_pclmulqdq_256:
1414 case Intrinsic::x86_pclmulqdq_512: {
1415 if (auto *C = dyn_cast<ConstantInt>(II.getArgOperand(2))) {
1416 unsigned Imm = C->getZExtValue();
1417
1418 bool MadeChange = false;
1419 Value *Arg0 = II.getArgOperand(0);
1420 Value *Arg1 = II.getArgOperand(1);
1421 unsigned VWidth =
1422 cast<FixedVectorType>(Arg0->getType())->getNumElements();
1423
1424 APInt UndefElts1(VWidth, 0);
1425 APInt DemandedElts1 =
1426 APInt::getSplat(VWidth, APInt(2, (Imm & 0x01) ? 2 : 1));
1427 if (Value *V =
1428 IC.SimplifyDemandedVectorElts(Arg0, DemandedElts1, UndefElts1)) {
1429 IC.replaceOperand(II, 0, V);
1430 MadeChange = true;
1431 }
1432
1433 APInt UndefElts2(VWidth, 0);
1434 APInt DemandedElts2 =
1435 APInt::getSplat(VWidth, APInt(2, (Imm & 0x10) ? 2 : 1));
1436 if (Value *V =
1437 IC.SimplifyDemandedVectorElts(Arg1, DemandedElts2, UndefElts2)) {
1438 IC.replaceOperand(II, 1, V);
1439 MadeChange = true;
1440 }
1441
      // If all demanded elements of either input are undef, the result is
      // zero.
1443 if (DemandedElts1.isSubsetOf(UndefElts1) ||
1444 DemandedElts2.isSubsetOf(UndefElts2)) {
1445 return IC.replaceInstUsesWith(II,
1446 ConstantAggregateZero::get(II.getType()));
1447 }
1448
1449 if (MadeChange) {
1450 return &II;
1451 }
1452 }
1453 break;
1454 }
1455
1456 case Intrinsic::x86_sse41_insertps:
1457 if (Value *V = simplifyX86insertps(II, IC.Builder)) {
1458 return IC.replaceInstUsesWith(II, V);
1459 }
1460 break;
1461
1462 case Intrinsic::x86_sse4a_extrq: {
1463 Value *Op0 = II.getArgOperand(0);
1464 Value *Op1 = II.getArgOperand(1);
1465 unsigned VWidth0 = cast<FixedVectorType>(Op0->getType())->getNumElements();
1466 unsigned VWidth1 = cast<FixedVectorType>(Op1->getType())->getNumElements();
1467 assert(Op0->getType()->getPrimitiveSizeInBits() == 128 &&
1468 Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth0 == 2 &&
1469 VWidth1 == 16 && "Unexpected operand sizes");
1470
1471 // See if we're dealing with constant values.
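    // The second source is a v16i8 control vector: byte 0 holds the 6-bit
    // field length and byte 1 the 6-bit bit index.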
1472 auto *C1 = dyn_cast<Constant>(Op1);
1473 auto *CILength =
1474 C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)0))
1475 : nullptr;
1476 auto *CIIndex =
1477 C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)1))
1478 : nullptr;
1479
1480 // Attempt to simplify to a constant, shuffle vector or EXTRQI call.
1481 if (Value *V = simplifyX86extrq(II, Op0, CILength, CIIndex, IC.Builder)) {
1482 return IC.replaceInstUsesWith(II, V);
1483 }
1484
1485     // EXTRQ only uses the lowest 64 bits of the first 128-bit vector operand
1486     // and the lowest 16 bits of the second.
1487 bool MadeChange = false;
1488 if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) {
1489 IC.replaceOperand(II, 0, V);
1490 MadeChange = true;
1491 }
1492 if (Value *V = SimplifyDemandedVectorEltsLow(Op1, VWidth1, 2)) {
1493 IC.replaceOperand(II, 1, V);
1494 MadeChange = true;
1495 }
1496 if (MadeChange) {
1497 return &II;
1498 }
1499 break;
1500 }
1501
1502 case Intrinsic::x86_sse4a_extrqi: {
1503 // EXTRQI: Extract Length bits starting from Index. Zero pad the remaining
1504 // bits of the lower 64-bits. The upper 64-bits are undefined.
1505 Value *Op0 = II.getArgOperand(0);
1506 unsigned VWidth = cast<FixedVectorType>(Op0->getType())->getNumElements();
1507 assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && VWidth == 2 &&
1508 "Unexpected operand size");
1509
1510 // See if we're dealing with constant values.
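    // Only the low 6 bits of the length (arg 1) and index (arg 2) immediates
    // are meaningful.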
1511 auto *CILength = dyn_cast<ConstantInt>(II.getArgOperand(1));
1512 auto *CIIndex = dyn_cast<ConstantInt>(II.getArgOperand(2));
1513
1514 // Attempt to simplify to a constant or shuffle vector.
1515 if (Value *V = simplifyX86extrq(II, Op0, CILength, CIIndex, IC.Builder)) {
1516 return IC.replaceInstUsesWith(II, V);
1517 }
1518
1519 // EXTRQI only uses the lowest 64-bits of the first 128-bit vector
1520 // operand.
1521 if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth, 1)) {
1522 return IC.replaceOperand(II, 0, V);
1523 }
1524 break;
1525 }
1526
1527 case Intrinsic::x86_sse4a_insertq: {
1528 Value *Op0 = II.getArgOperand(0);
1529 Value *Op1 = II.getArgOperand(1);
1530 unsigned VWidth = cast<FixedVectorType>(Op0->getType())->getNumElements();
1531 assert(Op0->getType()->getPrimitiveSizeInBits() == 128 &&
1532 Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth == 2 &&
1533 cast<FixedVectorType>(Op1->getType())->getNumElements() == 2 &&
1534 "Unexpected operand size");
1535
1536 // See if we're dealing with constant values.
1537 auto *C1 = dyn_cast<Constant>(Op1);
1538 auto *CI11 =
1539 C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)1))
1540 : nullptr;
1541
1542 // Attempt to simplify to a constant, shuffle vector or INSERTQI call.
1543 if (CI11) {
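      // Element 1 (the upper quadword) of Op1 packs the 6-bit field length in
      // bits [5:0] and the 6-bit bit index in bits [13:8].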
1544 const APInt &V11 = CI11->getValue();
1545 APInt Len = V11.zextOrTrunc(6);
1546 APInt Idx = V11.lshr(8).zextOrTrunc(6);
1547 if (Value *V = simplifyX86insertq(II, Op0, Op1, Len, Idx, IC.Builder)) {
1548 return IC.replaceInstUsesWith(II, V);
1549 }
1550 }
1551
1552 // INSERTQ only uses the lowest 64-bits of the first 128-bit vector
1553 // operand.
1554 if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth, 1)) {
1555 return IC.replaceOperand(II, 0, V);
1556 }
1557 break;
1558 }
1559
1560 case Intrinsic::x86_sse4a_insertqi: {
1561 // INSERTQI: Extract lowest Length bits from lower half of second source and
1562 // insert over first source starting at Index bit. The upper 64-bits are
1563 // undefined.
1564 Value *Op0 = II.getArgOperand(0);
1565 Value *Op1 = II.getArgOperand(1);
1566 unsigned VWidth0 = cast<FixedVectorType>(Op0->getType())->getNumElements();
1567 unsigned VWidth1 = cast<FixedVectorType>(Op1->getType())->getNumElements();
1568 assert(Op0->getType()->getPrimitiveSizeInBits() == 128 &&
1569 Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth0 == 2 &&
1570 VWidth1 == 2 && "Unexpected operand sizes");
1571
1572 // See if we're dealing with constant values.
1573 auto *CILength = dyn_cast<ConstantInt>(II.getArgOperand(2));
1574 auto *CIIndex = dyn_cast<ConstantInt>(II.getArgOperand(3));
1575
1576 // Attempt to simplify to a constant or shuffle vector.
1577 if (CILength && CIIndex) {
1578 APInt Len = CILength->getValue().zextOrTrunc(6);
1579 APInt Idx = CIIndex->getValue().zextOrTrunc(6);
1580 if (Value *V = simplifyX86insertq(II, Op0, Op1, Len, Idx, IC.Builder)) {
1581 return IC.replaceInstUsesWith(II, V);
1582 }
1583 }
1584
1585 // INSERTQI only uses the lowest 64-bits of the first two 128-bit vector
1586 // operands.
1587 bool MadeChange = false;
1588 if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) {
1589 IC.replaceOperand(II, 0, V);
1590 MadeChange = true;
1591 }
1592 if (Value *V = SimplifyDemandedVectorEltsLow(Op1, VWidth1, 1)) {
1593 IC.replaceOperand(II, 1, V);
1594 MadeChange = true;
1595 }
1596 if (MadeChange) {
1597 return &II;
1598 }
1599 break;
1600 }
1601
1602 case Intrinsic::x86_sse41_pblendvb:
1603 case Intrinsic::x86_sse41_blendvps:
1604 case Intrinsic::x86_sse41_blendvpd:
1605 case Intrinsic::x86_avx_blendv_ps_256:
1606 case Intrinsic::x86_avx_blendv_pd_256:
1607 case Intrinsic::x86_avx2_pblendvb: {
1608 // fold (blend A, A, Mask) -> A
1609 Value *Op0 = II.getArgOperand(0);
1610 Value *Op1 = II.getArgOperand(1);
1611 Value *Mask = II.getArgOperand(2);
1612 if (Op0 == Op1) {
1613 return IC.replaceInstUsesWith(II, Op0);
1614 }
1615
1616 // Zero Mask - select 1st argument.
1617 if (isa<ConstantAggregateZero>(Mask)) {
1618 return IC.replaceInstUsesWith(II, Op0);
1619 }
1620
1621 // Constant Mask - select 1st/2nd argument lane based on top bit of mask.
1622 if (auto *ConstantMask = dyn_cast<ConstantDataVector>(Mask)) {
1623 Constant *NewSelector = getNegativeIsTrueBoolVec(ConstantMask);
1624 return SelectInst::Create(NewSelector, Op1, Op0, "blendv");
1625 }
1626
1627 // Convert to a vector select if we can bypass casts and find a boolean
1628 // vector condition value.
1629 Value *BoolVec;
1630 Mask = InstCombiner::peekThroughBitcast(Mask);
1631 if (match(Mask, PatternMatch::m_SExt(PatternMatch::m_Value(BoolVec))) &&
1632 BoolVec->getType()->isVectorTy() &&
1633 BoolVec->getType()->getScalarSizeInBits() == 1) {
1634 assert(Mask->getType()->getPrimitiveSizeInBits() ==
1635 II.getType()->getPrimitiveSizeInBits() &&
1636 "Not expecting mask and operands with different sizes");
1637
1638 unsigned NumMaskElts =
1639 cast<FixedVectorType>(Mask->getType())->getNumElements();
1640 unsigned NumOperandElts =
1641 cast<FixedVectorType>(II.getType())->getNumElements();
1642 if (NumMaskElts == NumOperandElts) {
1643 return SelectInst::Create(BoolVec, Op1, Op0);
1644 }
1645
1646       // If the mask has fewer elements than the operands, each mask bit maps to
1647       // multiple elements of the operands. Bitcast back and forth.
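      // e.g. a <2 x i1> condition sign-extended to <2 x i64> can drive a
      // <16 x i8> pblendvb: select on the <2 x i64> view of the operands and
      // bitcast the result back to the original type.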
1648 if (NumMaskElts < NumOperandElts) {
1649 Value *CastOp0 = IC.Builder.CreateBitCast(Op0, Mask->getType());
1650 Value *CastOp1 = IC.Builder.CreateBitCast(Op1, Mask->getType());
1651 Value *Sel = IC.Builder.CreateSelect(BoolVec, CastOp1, CastOp0);
1652 return new BitCastInst(Sel, II.getType());
1653 }
1654 }
1655
1656 break;
1657 }
1658
1659 case Intrinsic::x86_ssse3_pshuf_b_128:
1660 case Intrinsic::x86_avx2_pshuf_b:
1661 case Intrinsic::x86_avx512_pshuf_b_512:
1662 if (Value *V = simplifyX86pshufb(II, IC.Builder)) {
1663 return IC.replaceInstUsesWith(II, V);
1664 }
1665 break;
1666
1667 case Intrinsic::x86_avx_vpermilvar_ps:
1668 case Intrinsic::x86_avx_vpermilvar_ps_256:
1669 case Intrinsic::x86_avx512_vpermilvar_ps_512:
1670 case Intrinsic::x86_avx_vpermilvar_pd:
1671 case Intrinsic::x86_avx_vpermilvar_pd_256:
1672 case Intrinsic::x86_avx512_vpermilvar_pd_512:
1673 if (Value *V = simplifyX86vpermilvar(II, IC.Builder)) {
1674 return IC.replaceInstUsesWith(II, V);
1675 }
1676 break;
1677
1678 case Intrinsic::x86_avx2_permd:
1679 case Intrinsic::x86_avx2_permps:
1680 case Intrinsic::x86_avx512_permvar_df_256:
1681 case Intrinsic::x86_avx512_permvar_df_512:
1682 case Intrinsic::x86_avx512_permvar_di_256:
1683 case Intrinsic::x86_avx512_permvar_di_512:
1684 case Intrinsic::x86_avx512_permvar_hi_128:
1685 case Intrinsic::x86_avx512_permvar_hi_256:
1686 case Intrinsic::x86_avx512_permvar_hi_512:
1687 case Intrinsic::x86_avx512_permvar_qi_128:
1688 case Intrinsic::x86_avx512_permvar_qi_256:
1689 case Intrinsic::x86_avx512_permvar_qi_512:
1690 case Intrinsic::x86_avx512_permvar_sf_512:
1691 case Intrinsic::x86_avx512_permvar_si_512:
1692 if (Value *V = simplifyX86vpermv(II, IC.Builder)) {
1693 return IC.replaceInstUsesWith(II, V);
1694 }
1695 break;
1696
1697 case Intrinsic::x86_avx_maskload_ps:
1698 case Intrinsic::x86_avx_maskload_pd:
1699 case Intrinsic::x86_avx_maskload_ps_256:
1700 case Intrinsic::x86_avx_maskload_pd_256:
1701 case Intrinsic::x86_avx2_maskload_d:
1702 case Intrinsic::x86_avx2_maskload_q:
1703 case Intrinsic::x86_avx2_maskload_d_256:
1704 case Intrinsic::x86_avx2_maskload_q_256:
1705 if (Instruction *I = simplifyX86MaskedLoad(II, IC)) {
1706 return I;
1707 }
1708 break;
1709
1710 case Intrinsic::x86_sse2_maskmov_dqu:
1711 case Intrinsic::x86_avx_maskstore_ps:
1712 case Intrinsic::x86_avx_maskstore_pd:
1713 case Intrinsic::x86_avx_maskstore_ps_256:
1714 case Intrinsic::x86_avx_maskstore_pd_256:
1715 case Intrinsic::x86_avx2_maskstore_d:
1716 case Intrinsic::x86_avx2_maskstore_q:
1717 case Intrinsic::x86_avx2_maskstore_d_256:
1718 case Intrinsic::x86_avx2_maskstore_q_256:
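    // On success the helper has already emitted a generic masked store and
    // erased the original intrinsic, so there is nothing left to replace.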
1719 if (simplifyX86MaskedStore(II, IC)) {
1720 return nullptr;
1721 }
1722 break;
1723
1724 case Intrinsic::x86_addcarry_32:
1725 case Intrinsic::x86_addcarry_64:
1726 if (Value *V = simplifyX86addcarry(II, IC.Builder)) {
1727 return IC.replaceInstUsesWith(II, V);
1728 }
1729 break;
1730
1731 default:
1732 break;
1733 }
1734 return std::nullopt;
1735 }
1736
1737 std::optional<Value *> X86TTIImpl::simplifyDemandedUseBitsIntrinsic(
1738 InstCombiner &IC, IntrinsicInst &II, APInt DemandedMask, KnownBits &Known,
1739 bool &KnownBitsComputed) const {
1740 switch (II.getIntrinsicID()) {
1741 default:
1742 break;
1743 case Intrinsic::x86_mmx_pmovmskb:
1744 case Intrinsic::x86_sse_movmsk_ps:
1745 case Intrinsic::x86_sse2_movmsk_pd:
1746 case Intrinsic::x86_sse2_pmovmskb_128:
1747 case Intrinsic::x86_avx_movmsk_ps_256:
1748 case Intrinsic::x86_avx_movmsk_pd_256:
1749 case Intrinsic::x86_avx2_pmovmskb: {
1750 // MOVMSK copies the vector elements' sign bits to the low bits
1751 // and zeros the high bits.
1752 unsigned ArgWidth;
1753 if (II.getIntrinsicID() == Intrinsic::x86_mmx_pmovmskb) {
1754 ArgWidth = 8; // Arg is x86_mmx, but treated as <8 x i8>.
1755 } else {
1756 auto *ArgType = cast<FixedVectorType>(II.getArgOperand(0)->getType());
1757 ArgWidth = ArgType->getNumElements();
1758 }
1759
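    // Bit N of the MOVMSK result is the sign bit of vector element N, so the
    // low ArgWidth bits of DemandedMask map one-to-one onto demanded elements.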
1760     // If we don't need any of the low bits then return zero; we already know
1761     // that DemandedMask is non-zero.
1762 APInt DemandedElts = DemandedMask.zextOrTrunc(ArgWidth);
1763 Type *VTy = II.getType();
1764 if (DemandedElts.isZero()) {
1765 return ConstantInt::getNullValue(VTy);
1766 }
1767
1768 // We know that the upper bits are set to zero.
1769 Known.Zero.setBitsFrom(ArgWidth);
1770 KnownBitsComputed = true;
1771 break;
1772 }
1773 }
1774 return std::nullopt;
1775 }
1776
1777 std::optional<Value *> X86TTIImpl::simplifyDemandedVectorEltsIntrinsic(
1778 InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
1779 APInt &UndefElts2, APInt &UndefElts3,
1780 std::function<void(Instruction *, unsigned, APInt, APInt &)>
1781 simplifyAndSetOp) const {
1782 unsigned VWidth = cast<FixedVectorType>(II.getType())->getNumElements();
1783 switch (II.getIntrinsicID()) {
1784 default:
1785 break;
1786 case Intrinsic::x86_xop_vfrcz_ss:
1787 case Intrinsic::x86_xop_vfrcz_sd:
1788     // The instructions for these intrinsics are specified to zero the upper
1789     // bits rather than pass them through like other scalar intrinsics. So we
1790     // shouldn't just use Arg0 if DemandedElts[0] is clear like we do for other
1791     // intrinsics. Instead we should return a zero vector.
1792 if (!DemandedElts[0]) {
1793 IC.addToWorklist(&II);
1794 return ConstantAggregateZero::get(II.getType());
1795 }
1796
1797 // Only the lower element is used.
1798 DemandedElts = 1;
1799 simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
1800
1801 // Only the lower element is undefined. The high elements are zero.
1802 UndefElts = UndefElts[0];
1803 break;
1804
1805 // Unary scalar-as-vector operations that work column-wise.
1806 case Intrinsic::x86_sse_rcp_ss:
1807 case Intrinsic::x86_sse_rsqrt_ss:
1808 simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
1809
1810 // If lowest element of a scalar op isn't used then use Arg0.
1811 if (!DemandedElts[0]) {
1812 IC.addToWorklist(&II);
1813 return II.getArgOperand(0);
1814 }
1815     // TODO: If only the low element is demanded, lower SQRT to FSQRT (with
1816     // rounding/exception checks).
1817 break;
1818
1819 // Binary scalar-as-vector operations that work column-wise. The high
1820 // elements come from operand 0. The low element is a function of both
1821 // operands.
1822 case Intrinsic::x86_sse_min_ss:
1823 case Intrinsic::x86_sse_max_ss:
1824 case Intrinsic::x86_sse_cmp_ss:
1825 case Intrinsic::x86_sse2_min_sd:
1826 case Intrinsic::x86_sse2_max_sd:
1827 case Intrinsic::x86_sse2_cmp_sd: {
1828 simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
1829
1830 // If lowest element of a scalar op isn't used then use Arg0.
1831 if (!DemandedElts[0]) {
1832 IC.addToWorklist(&II);
1833 return II.getArgOperand(0);
1834 }
1835
1836 // Only lower element is used for operand 1.
1837 DemandedElts = 1;
1838 simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
1839
1840 // Lower element is undefined if both lower elements are undefined.
1841 // Consider things like undef&0. The result is known zero, not undef.
1842 if (!UndefElts2[0])
1843 UndefElts.clearBit(0);
1844
1845 break;
1846 }
1847
1848 // Binary scalar-as-vector operations that work column-wise. The high
1849 // elements come from operand 0 and the low element comes from operand 1.
1850 case Intrinsic::x86_sse41_round_ss:
1851 case Intrinsic::x86_sse41_round_sd: {
1852 // Don't use the low element of operand 0.
1853 APInt DemandedElts2 = DemandedElts;
1854 DemandedElts2.clearBit(0);
1855 simplifyAndSetOp(&II, 0, DemandedElts2, UndefElts);
1856
1857 // If lowest element of a scalar op isn't used then use Arg0.
1858 if (!DemandedElts[0]) {
1859 IC.addToWorklist(&II);
1860 return II.getArgOperand(0);
1861 }
1862
1863 // Only lower element is used for operand 1.
1864 DemandedElts = 1;
1865 simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
1866
1867 // Take the high undef elements from operand 0 and take the lower element
1868 // from operand 1.
1869 UndefElts.clearBit(0);
1870 UndefElts |= UndefElts2[0];
1871 break;
1872 }
1873
1874 // Three input scalar-as-vector operations that work column-wise. The high
1875 // elements come from operand 0 and the low element is a function of all
1876 // three inputs.
1877 case Intrinsic::x86_avx512_mask_add_ss_round:
1878 case Intrinsic::x86_avx512_mask_div_ss_round:
1879 case Intrinsic::x86_avx512_mask_mul_ss_round:
1880 case Intrinsic::x86_avx512_mask_sub_ss_round:
1881 case Intrinsic::x86_avx512_mask_max_ss_round:
1882 case Intrinsic::x86_avx512_mask_min_ss_round:
1883 case Intrinsic::x86_avx512_mask_add_sd_round:
1884 case Intrinsic::x86_avx512_mask_div_sd_round:
1885 case Intrinsic::x86_avx512_mask_mul_sd_round:
1886 case Intrinsic::x86_avx512_mask_sub_sd_round:
1887 case Intrinsic::x86_avx512_mask_max_sd_round:
1888 case Intrinsic::x86_avx512_mask_min_sd_round:
1889 simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
1890
1891 // If lowest element of a scalar op isn't used then use Arg0.
1892 if (!DemandedElts[0]) {
1893 IC.addToWorklist(&II);
1894 return II.getArgOperand(0);
1895 }
1896
1897     // Only the lower element is used for operands 1 and 2.
1898 DemandedElts = 1;
1899 simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
1900 simplifyAndSetOp(&II, 2, DemandedElts, UndefElts3);
1901
1902 // Lower element is undefined if all three lower elements are undefined.
1903 // Consider things like undef&0. The result is known zero, not undef.
1904 if (!UndefElts2[0] || !UndefElts3[0])
1905 UndefElts.clearBit(0);
1906 break;
1907
1908 // TODO: Add fmaddsub support?
1909 case Intrinsic::x86_sse3_addsub_pd:
1910 case Intrinsic::x86_sse3_addsub_ps:
1911 case Intrinsic::x86_avx_addsub_pd_256:
1912 case Intrinsic::x86_avx_addsub_ps_256: {
1913 // If none of the even or none of the odd lanes are required, turn this
1914 // into a generic FP math instruction.
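    // ADDSUB subtracts in the even lanes and adds in the odd lanes, e.g. for
    // v4f32 the result is {a0-b0, a1+b1, a2-b2, a3+b3}, so SubMask covers the
    // even lanes and AddMask the odd ones.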
1915 APInt SubMask = APInt::getSplat(VWidth, APInt(2, 0x1));
1916 APInt AddMask = APInt::getSplat(VWidth, APInt(2, 0x2));
1917 bool IsSubOnly = DemandedElts.isSubsetOf(SubMask);
1918 bool IsAddOnly = DemandedElts.isSubsetOf(AddMask);
1919 if (IsSubOnly || IsAddOnly) {
1920 assert((IsSubOnly ^ IsAddOnly) && "Can't be both add-only and sub-only");
1921 IRBuilderBase::InsertPointGuard Guard(IC.Builder);
1922 IC.Builder.SetInsertPoint(&II);
1923 Value *Arg0 = II.getArgOperand(0), *Arg1 = II.getArgOperand(1);
1924 return IC.Builder.CreateBinOp(
1925 IsSubOnly ? Instruction::FSub : Instruction::FAdd, Arg0, Arg1);
1926 }
1927
1928 simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
1929 simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
1930 UndefElts &= UndefElts2;
1931 break;
1932 }
1933
1934 // General per-element vector operations.
1935 case Intrinsic::x86_avx2_psllv_d:
1936 case Intrinsic::x86_avx2_psllv_d_256:
1937 case Intrinsic::x86_avx2_psllv_q:
1938 case Intrinsic::x86_avx2_psllv_q_256:
1939 case Intrinsic::x86_avx2_psrlv_d:
1940 case Intrinsic::x86_avx2_psrlv_d_256:
1941 case Intrinsic::x86_avx2_psrlv_q:
1942 case Intrinsic::x86_avx2_psrlv_q_256:
1943 case Intrinsic::x86_avx2_psrav_d:
1944 case Intrinsic::x86_avx2_psrav_d_256: {
1945 simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
1946 simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
1947 UndefElts &= UndefElts2;
1948 break;
1949 }
1950
1951 case Intrinsic::x86_sse2_packssdw_128:
1952 case Intrinsic::x86_sse2_packsswb_128:
1953 case Intrinsic::x86_sse2_packuswb_128:
1954 case Intrinsic::x86_sse41_packusdw:
1955 case Intrinsic::x86_avx2_packssdw:
1956 case Intrinsic::x86_avx2_packsswb:
1957 case Intrinsic::x86_avx2_packusdw:
1958 case Intrinsic::x86_avx2_packuswb:
1959 case Intrinsic::x86_avx512_packssdw_512:
1960 case Intrinsic::x86_avx512_packsswb_512:
1961 case Intrinsic::x86_avx512_packusdw_512:
1962 case Intrinsic::x86_avx512_packuswb_512: {
1963 auto *Ty0 = II.getArgOperand(0)->getType();
1964 unsigned InnerVWidth = cast<FixedVectorType>(Ty0)->getNumElements();
1965 assert(VWidth == (InnerVWidth * 2) && "Unexpected input size");
1966
1967 unsigned NumLanes = Ty0->getPrimitiveSizeInBits() / 128;
1968 unsigned VWidthPerLane = VWidth / NumLanes;
1969 unsigned InnerVWidthPerLane = InnerVWidth / NumLanes;
1970
1971 // Per lane, pack the elements of the first input and then the second.
1972 // e.g.
1973 // v8i16 PACK(v4i32 X, v4i32 Y) - (X[0..3],Y[0..3])
1974 // v32i8 PACK(v16i16 X, v16i16 Y) - (X[0..7],Y[0..7]),(X[8..15],Y[8..15])
1975 for (int OpNum = 0; OpNum != 2; ++OpNum) {
1976 APInt OpDemandedElts(InnerVWidth, 0);
1977 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
1978 unsigned LaneIdx = Lane * VWidthPerLane;
1979 for (unsigned Elt = 0; Elt != InnerVWidthPerLane; ++Elt) {
1980 unsigned Idx = LaneIdx + Elt + InnerVWidthPerLane * OpNum;
1981 if (DemandedElts[Idx])
1982 OpDemandedElts.setBit((Lane * InnerVWidthPerLane) + Elt);
1983 }
1984 }
1985
1986 // Demand elements from the operand.
1987 APInt OpUndefElts(InnerVWidth, 0);
1988 simplifyAndSetOp(&II, OpNum, OpDemandedElts, OpUndefElts);
1989
1990 // Pack the operand's UNDEF elements, one lane at a time.
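      // Within output lane L, operand 0's elements occupy the first
      // InnerVWidthPerLane slots and operand 1's the second, hence the
      // (2 * Lane + OpNum) scaling of the shift below.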
1991 OpUndefElts = OpUndefElts.zext(VWidth);
1992 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
1993 APInt LaneElts = OpUndefElts.lshr(InnerVWidthPerLane * Lane);
1994 LaneElts = LaneElts.getLoBits(InnerVWidthPerLane);
1995 LaneElts <<= InnerVWidthPerLane * (2 * Lane + OpNum);
1996 UndefElts |= LaneElts;
1997 }
1998 }
1999 break;
2000 }
2001
2002 // PSHUFB
2003 case Intrinsic::x86_ssse3_pshuf_b_128:
2004 case Intrinsic::x86_avx2_pshuf_b:
2005 case Intrinsic::x86_avx512_pshuf_b_512:
2006 // PERMILVAR
2007 case Intrinsic::x86_avx_vpermilvar_ps:
2008 case Intrinsic::x86_avx_vpermilvar_ps_256:
2009 case Intrinsic::x86_avx512_vpermilvar_ps_512:
2010 case Intrinsic::x86_avx_vpermilvar_pd:
2011 case Intrinsic::x86_avx_vpermilvar_pd_256:
2012 case Intrinsic::x86_avx512_vpermilvar_pd_512:
2013 // PERMV
2014 case Intrinsic::x86_avx2_permd:
2015 case Intrinsic::x86_avx2_permps: {
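    // Result element i is selected by control element i (operand 1) but may
    // read any element of the data operand, so only narrow the demanded
    // elements of the shuffle control and leave the data operand untouched.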
2016 simplifyAndSetOp(&II, 1, DemandedElts, UndefElts);
2017 break;
2018 }
2019
2020 // SSE4A instructions leave the upper 64-bits of the 128-bit result
2021 // in an undefined state.
2022 case Intrinsic::x86_sse4a_extrq:
2023 case Intrinsic::x86_sse4a_extrqi:
2024 case Intrinsic::x86_sse4a_insertq:
2025 case Intrinsic::x86_sse4a_insertqi:
2026 UndefElts.setHighBits(VWidth / 2);
2027 break;
2028 }
2029 return std::nullopt;
2030 }
2031